ISO-8859-1 support

- Try to detect genuine ISO-8859-1 encoding
author: Suren A. Chilingaryan <csa@dside.dyndns.org> 2007-06-27 09:17:03 +0000
committer: Suren A. Chilingaryan <csa@dside.dyndns.org> 2007-06-27 09:17:03 +0000
commit: b68103e7018957e6fd25610da1d65deedd825497 (patch)
tree: f4815d103363a343ab0f7d5a21a6c47c5c318e87 /src
parent: 1b2aa527c43c89acc48fb146b2d61ed15eef5b61 (diff)
2 files changed, 37 insertions, 1 deletions
diff --git a/src/librcd.c b/src/librcd.c
index 55eefc0..36986cc 100644
--- a/src/librcd.c
+++ b/src/librcd.c
@@ -1,5 +1,7 @@
 #include <stdio.h>
 
+#include "../config.h"
+
 #define _LIBRCD_C
 #include "librcd.h"
 
@@ -252,6 +254,36 @@ static int check_utf8(const unsigned char *buf, int len) {
     return res;
 }
 
+/* Heuristic: in Russian text a whole word consists of characters > 127, while
+in Latin-script languages every word should contain, besides umlauts, at least
+one standard Latin character (code < 127). Returns 0 if a checked word fails, else 1. */
+static int check_latin(const unsigned char *buf, int len) {
+    long i;
+    int word = 0;
+    int latin = 0;
+    
+    for (i=0;i<len;i++) {
+	if (buf[i]<128) {
+	    if (((buf[i]>='a')&&(buf[i]<='z'))||((buf[i]>='A')&&(buf[i]<='Z'))) {
+		    // ASCII letter: counted as evidence that the current word is not Cyrillic
+		latin++;
+	    } else {
+		    // Any other ASCII byte is a word separator; words are validated only here, so a word reaching the end of buf is never checked. Fail if the word has no ASCII letter or word/latin exceeds 4.
+		if (word > 0) {
+		    if (!latin) return 0;
+		    if ((word/latin)>4) return 0;
+		}
+
+		word = 0;
+		latin = 0;
+	    }
+	} else {
+		// Byte >= 128: could be part of a Cyrillic word; count it (word is never negative, so the test below always passes)
+	    if (word>=0) word++;
+	}
+    }
+    return 1;
+}
 
 
 rcd_russian_charset rcdGetRussianCharset(const char *buf,int len) {
@@ -259,6 +291,9 @@ rcd_russian_charset rcdGetRussianCharset(const char *buf,int len) {
 
     l = len?len:strlen(buf);
     if (check_utf8(buf,l)>1) return RUSSIAN_CHARSET_UTF8;
+#ifdef DETECT_LATIN
+    if (check_latin(buf,l)) return RUSSIAN_CHARSET_LATIN;
+#endif /* DETECT_LATIN */
     return is_win_charset2(buf,l);
 }
 
diff --git a/src/librcd.h b/src/librcd.h
index 56db6c8..6fc3281 100644
--- a/src/librcd.h
+++ b/src/librcd.h
@@ -9,7 +9,8 @@ enum rcd_russian_charset_t {
     RUSSIAN_CHARSET_WIN = 0,
     RUSSIAN_CHARSET_KOI,
     RUSSIAN_CHARSET_UTF8,
-    RUSSIAN_CHARSET_ALT
+    RUSSIAN_CHARSET_ALT,
+    RUSSIAN_CHARSET_LATIN
 };
 typedef enum rcd_russian_charset_t rcd_russian_charset;
author	Suren A. Chilingaryan <csa@dside.dyndns.org>	2007-06-27 09:17:03 +0000
committer	Suren A. Chilingaryan <csa@dside.dyndns.org>	2007-06-27 09:17:03 +0000
commit	b68103e7018957e6fd25610da1d65deedd825497 (patch)
tree	f4815d103363a343ab0f7d5a21a6c47c5c318e87 /src
parent	1b2aa527c43c89acc48fb146b2d61ed15eef5b61 (diff)