summaryrefslogtreecommitdiffstats
path: root/engines/western.c
blob: ac924762b15217646641fad2c072095d968769fe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
/*
  LibRCC - ISO8859-1/UTF-8 detection engine

  Copyright (C) 2005-2008 Suren A. Chilingaryan <csa@dside.dyndns.org>

  This program is free software; you can redistribute it and/or modify it
  under the terms of version 2 of the GNU General Public License as published
  by the Free Software Foundation.

  This program is distributed in the hope that it will be useful, but WITHOUT
  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  more details.

  You should have received a copy of the GNU General Public License along
  with this program; if not, write to the Free Software Foundation, Inc.,
  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
*/

#include <stdio.h>
#include <string.h>

#include <librcc.h>

/* Mask with only bit number i set; the argument is parenthesized so that
 * compound expressions such as bit(a|b) shift by the whole expression. */
#define bit(i) (1<<(i))

/*
 * Latin Unicode subset:
 * 0x100 - 0x17E    (Latin Extended-A)
 * 0x180 - 0x24F    (Latin Extended-B)
 * 0x1E00 - 0x1EFF  (Latin Extended Additional)
 */

/*
 * Guess whether a byte string is UTF-8 or ISO8859-1.
 *
 * ctx  - engine context; not used by this detector.
 * sbuf - text to examine.
 * len  - number of bytes in sbuf; 0 means sbuf is NUL-terminated and its
 *        length is taken with strlen().
 *
 * Every byte >= 0x80 is run through a small UTF-8 state machine that keeps
 * a score:
 *   +1 for each continuation byte (10xxxxxx) seen while rflag is set; rflag
 *      is raised by a C2 or C3 lead byte, i.e. the UTF-8 lead bytes of
 *      U+0080 - U+00FF;
 *   -1 for each byte that cannot appear at that position in UTF-8.
 * ASCII bytes are skipped and do not interrupt a pending sequence.
 *
 * Returns 0 when the score is positive and 1 otherwise.
 * NOTE(review): 0/1 presumably index the engine's charset list
 * ("UTF-8", "ISO8859-1") - confirm against librcc.h.
 */
static rcc_autocharset_id AutoengineWestern(rcc_engine_context ctx, const char *sbuf, int len) {
    const unsigned char *buf = (const unsigned char*)sbuf;
    long i,j;
    /* bytes > 0: continuation bytes still expected for the current sequence.
     * bytes < 0: a sequence was broken; that many stray continuation bytes
     *            are still tolerated without a penalty. */
    int bytes=0;
    /* Non-zero after a C2 (1) or C3 (2) lead byte; only its truth value is
     * tested. It is cleared only when a sequence is broken, so once set it
     * also applies to the sequences that follow.
     * NOTE(review): confirm this carry-over is intended. */
    int rflag=0;
    int res=0;

    (void)ctx;

    if (!len) len = (int)strlen(sbuf);
    for (i=0;i<len;i++) {
        if (buf[i]<128) continue;

        if (bytes>0) {
            /* Inside a multi-byte sequence. */
            if ((buf[i]&0xC0)==0x80) {
                if (rflag) res++;
                bytes--;
            } else {
                /* Sequence cut short: penalize and remember how many of its
                 * remaining slots may still show up as stray continuations. */
                res--;
                bytes=1-bytes;
                rflag=0;
            }
            continue;
        }

        /* Find the highest clear bit below bit 7; it tells the sequence
         * length. j is -1 when all bits are set (0xFF). */
        for (j=6;j>=0;j--)
            if ((buf[i]&bit(j))==0) break;

        if ((j<=0)||(j==6)) {
            /* j==6: continuation byte with no sequence open - forgiven only
             * while leftovers of a broken sequence are being skipped.
             * j<=0: 0xFE or 0xFF, which never occur in UTF-8. */
            if ((j==6)&&(bytes<0)) bytes++;
            else res--;
            continue;
        }

        /* Lead byte: 6-j continuation bytes must follow. */
        bytes=6-j;
        if (bytes==1) {
            // Western Languages (C2-C3)
            if (buf[i]==0xC2) rflag=1;
            else if (buf[i]==0xC3) rflag=2;
        }
    }

    if (res > 0) return (rcc_autocharset_id)0;
    return (rcc_autocharset_id)1;
}

/*
 * Descriptor of the Western detection engine, returned by rccGetInfo().
 * Fields are given positionally; the field meanings noted below are
 * assumptions to be confirmed against the rcc_engine declaration in
 * librcc.h.
 */
static rcc_engine western_engine = {
    "Western",              /* engine title */
    NULL,                   /* unused hook - presumably init; TODO confirm */
    NULL,                   /* unused hook - presumably free; TODO confirm */
    AutoengineWestern,      /* detection function */
    /* NULL-terminated charset list; AutoengineWestern returns 0 or 1,
     * presumably an index into this list - TODO confirm. */
    { "UTF-8", "ISO8859-1", NULL }
};

/*
 * Return the descriptor of the Western engine.
 *
 * lang - language name; only tested for being non-NULL, its contents are
 *        not inspected.
 *
 * Returns a pointer to the file's static engine descriptor, or NULL when
 * lang is NULL.
 */
rcc_engine *rccGetInfo(const char *lang) {
    return lang ? &western_engine : NULL;
}