1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
|
/*
LibRCC - ISO8859-1/UTF-8 detection engine
Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>
This program is free software; you can redistribute it and/or modify it
under the terms of version 2 of the GNU General Public License as published
by the Free Software Foundation.
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/
#include <stdio.h>
#include <string.h>
#include <librcc.h>
/* Mask with only bit number i set (bit 0 is the least significant bit). */
#define bit(i) (1<<(i))
/*
* Latin Unicode subset:
* 0x100 - 0x17E (Latin Extended-A)
* 0x180 - 0x24F (Latin Extended-B)
* 0x1E00 - 0x1EFF (Latin Extended Additional)
*/
/*
 * Score a byte buffer to decide between UTF-8 and ISO8859-1.
 *
 * ctx  - engine context; not used by this engine.
 * sbuf - text to examine.
 * len  - number of bytes to examine; 0 means "use strlen(sbuf)".
 *
 * Only bytes >= 0x80 are examined; ASCII bytes are skipped without touching
 * the decoder state. The score goes up for every continuation byte consumed
 * while rflag is set, and goes down for every broken sequence, unexpected
 * continuation byte and invalid lead byte.
 *
 * Returns 0 when the final score is positive and 1 otherwise (presumably
 * indexes into the charset list of western_engine, i.e. "UTF-8" and
 * "ISO8859-1" -- confirm against the librcc engine API).
 */
static rcc_autocharset_id AutoengineWestern(rcc_engine_context ctx, const char *sbuf, int len) {
    const unsigned char *buf = (const unsigned char*)sbuf;
    int i, j;
    /* > 0: continuation bytes still expected by the current sequence;
     * < 0: continuation bytes that will be silently skipped after a
     *      sequence was broken. */
    int bytes = 0;
    /* Set when a 0xC2 / 0xC3 lead byte is seen; cleared only when a sequence
     * is broken, so it stays set across later well-formed sequences. */
    int rflag = 0;
    int res = 0;

    (void)ctx;

    if (!len) len = (int)strlen(sbuf);

    for (i = 0; i < len; i++) {
        if (buf[i] < 128) continue;

        if (bytes > 0) {
            /* Inside a multi-byte sequence: expect 10xxxxxx. */
            if ((buf[i] & 0xC0) == 0x80) {
                if (rflag) res++;
                bytes--;
            } else {
                res--;
                bytes = 1 - bytes;
                rflag = 0;
            }
            continue;
        }

        /* Find the highest clear bit below bit 7; 6 - j is then the number
         * of continuation bytes announced by this lead byte. */
        for (j = 6; j >= 0; j--)
            if ((buf[i] & bit(j)) == 0) break;

        if (j == 6) {
            /* Continuation byte outside of a sequence. */
            if (bytes < 0) bytes++;
            else res--;
            continue;
        }
        if (j <= 0) {
            /* 0xFE (j == 0) and 0xFF (j == -1, no clear bit found) can never
             * start a UTF-8 sequence. 0xFF used to slip through and open a
             * bogus 7-byte sequence. */
            res--;
            continue;
        }

        bytes = 6 - j;
        if (bytes == 1) {
            // Western Languages (C2-C3)
            if (buf[i] == 0xC2) rflag = 1;
            else if (buf[i] == 0xC3) rflag = 2;
        }
    }

    if (res > 0) return (rcc_autocharset_id)0;
    return (rcc_autocharset_id)1;
}
/*
 * Engine descriptor handed out by rccGetInfo().
 * Positional members: the engine title, two NULL members (presumably
 * optional init/free hooks -- confirm against rcc_engine in librcc.h),
 * the detection callback, and a NULL-terminated list of charset names.
 */
static rcc_engine western_engine = {
"Western", NULL, NULL, &AutoengineWestern, {"UTF-8","ISO8859-1", NULL}
};
/*
 * Return the Western engine descriptor for any non-NULL language name,
 * or NULL when no language name is supplied. The content of lang is not
 * inspected.
 */
rcc_engine *rccGetInfo(const char *lang) {
    return (lang != NULL) ? &western_engine : NULL;
}
|