diff options
author | Suren A. Chilingaryan <csa@dside.dyndns.org> | 2007-06-27 09:17:03 +0000 |
---|---|---|
committer | Suren A. Chilingaryan <csa@dside.dyndns.org> | 2007-06-27 09:17:03 +0000 |
commit | b68103e7018957e6fd25610da1d65deedd825497 (patch) | |
tree | f4815d103363a343ab0f7d5a21a6c47c5c318e87 /src | |
parent | 1b2aa527c43c89acc48fb146b2d61ed15eef5b61 (diff) | |
download | librcd-b68103e7018957e6fd25610da1d65deedd825497.tar.gz librcd-b68103e7018957e6fd25610da1d65deedd825497.tar.bz2 librcd-b68103e7018957e6fd25610da1d65deedd825497.tar.xz librcd-b68103e7018957e6fd25610da1d65deedd825497.zip |
ISO-8859-1 support
- Try to detect genuine ISO-8859-1 encoding
Diffstat (limited to 'src')
-rw-r--r-- | src/librcd.c | 35 | ||||
-rw-r--r-- | src/librcd.h | 3 |
2 files changed, 37 insertions, 1 deletions
/*
 * Largest tolerated quotient (integer division) of high-bit bytes to ASCII
 * letters inside one word before the word stops looking like Latin text.
 */
enum { RCD_LATIN_MAX_HIGH_PER_ASCII = 4 };

/*
 * Decide whether one finished word is compatible with Latin (ISO-8859-1) text.
 *
 * high  - number of bytes >= 128 seen in the word
 * ascii - number of ASCII letters (a-z, A-Z) seen in the word
 *
 * Returns 1 when the word has no high-bit bytes at all, or when it mixes
 * high-bit bytes with enough ASCII letters; returns 0 otherwise.
 */
static int rcd_word_looks_latin(int high, int ascii)
{
    if (high <= 0) {
        return 1;               /* nothing suspicious in this word */
    }
    if (ascii == 0) {
        return 0;               /* word made only of high-bit bytes */
    }
    return (high / ascii) <= RCD_LATIN_MAX_HIGH_PER_ASCII;
}

/*
 * Heuristic check for Latin (ISO-8859-1 style) text.
 *
 * Rationale: in Russian text stored in a single-byte Cyrillic encoding whole
 * words consist of bytes >= 128, whereas in Latin-script languages every word
 * is expected to contain, besides accented letters, at least one plain ASCII
 * letter.
 *
 * Every ASCII byte that is not a letter is treated as a word separator.
 * The word still pending when the buffer ends is evaluated as well, so input
 * that finishes without a trailing separator is not accepted blindly.
 *
 * buf - bytes to inspect
 * len - number of bytes in buf; the loop does not run for len <= 0
 *
 * Returns 1 if no word contradicts the Latin hypothesis (this includes empty
 * input and input without any byte >= 128), 0 otherwise.
 *
 * Used by rcdGetRussianCharset(): when DETECT_LATIN is defined it calls this
 * function after the UTF-8 check and reports RUSSIAN_CHARSET_LATIN on a
 * non-zero result.
 */
static int check_latin(const unsigned char *buf, int len)
{
    int i;
    int high = 0;   /* bytes >= 128 in the current word */
    int ascii = 0;  /* ASCII letters in the current word */

    for (i = 0; i < len; i++) {
        unsigned char c = buf[i];

        if (c >= 128) {
            /* Could belong to a Cyrillic word. */
            high++;
        } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
            /* Plain Latin letter inside the word. */
            ascii++;
        } else {
            /* Any other ASCII byte ends the current word. */
            if (!rcd_word_looks_latin(high, ascii)) {
                return 0;
            }
            high = 0;
            ascii = 0;
        }
    }

    /* The last word has no separator after it; judge it here. */
    return rcd_word_looks_latin(high, ascii);
}