summaryrefslogtreecommitdiffstats
path: root/src/recode.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/recode.c')
-rw-r--r--src/recode.c162
1 files changed, 116 insertions, 46 deletions
diff --git a/src/recode.c b/src/recode.c
index 7e12343..d337164 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -15,21 +15,34 @@
#include "rccspell.h"
#define isSpace(ch) ((ch<0x7F)&&((ch<'A')||(ch>'z')||((ch>'Z')&&(ch<'a'))))
-#define RCC_REQUIRED_PROBABILITY 0.66
+#define RCC_PROBABILITY_STEP 0.10
+#define RCC_REQUIRED_PROBABILITY 0.33
+#define RCC_REQUIRED_LENGTH 5
+#define RCC_ACCEPTABLE_PROBABILITY 0
+#define RCC_ACCEPTABLE_LENGTH 3
rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) {
- rcc_speller speller;
+ rcc_speller speller = NULL, english_speller = NULL;
unsigned long i, nlanguages;
rcc_language_config config, config0 = NULL;
rcc_string recoded;
unsigned char *utf8;
size_t j, mode;
- unsigned long words, english, result;
+ unsigned long spres, words, english, result;
+ size_t longest;
unsigned char english_mode, english_word = 1;
+ char *english_string = NULL;
rcc_language_id english_lang = (rcc_language_id)-1;
+ size_t english_longest = 0;
+ unsigned char is_english_string = 1;
double res, english_res = 0;
rcc_option_value usedb4;
-
+ rcc_language_id bestlang = (rcc_language_id)-1;
+ unsigned long bestlongest = RCC_ACCEPTABLE_LENGTH;
+ double bestres = RCC_ACCEPTABLE_PROBABILITY;
+ char *best_string = NULL;
+
+ unsigned long accepted_nonenglish_langs = 0;
usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
@@ -50,6 +63,15 @@ rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id
nlanguages = ctx->n_languages;
+ english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn);
+ if (english_lang != (rcc_language_id)-1) {
+ config = rccGetUsableConfig(ctx, english_lang);
+ if (config) {
+ english_speller = rccConfigGetSpeller(config);
+ if (rccSpellerGetError(english_speller)) english_speller = NULL;
+ }
+ }
+
for (i=0;i<nlanguages;i++) {
config = rccGetUsableConfig(ctx, (rcc_language_id)i);
if (!config) continue;
@@ -68,11 +90,20 @@ rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id
else english_mode = 0;
utf8 = (char*)rccStringGetString(recoded);
- for (result=0,english=0,words=0,mode=0,j=0;utf8[j];j++) {
+ printf("%s\n", config->language->sn);
+
+ for (result=0,english=0,words=0,longest=0,mode=0,j=0;utf8[j];j++) {
if (isSpace(utf8[j])) {
if (mode) {
- if ((!english_mode)&&(english_word)) english++;
- result+=rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0;
+ if ((!english_mode)&&(english_word)&&(rccSpellerSized(english_speller, utf8 + mode -1, j - mode + 1)))
+ english++;
+ else {
+ if ((english_mode)&&(!english_word)) is_english_string = 0;
+ spres = rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0;
+ printf("%.*s %s\n",j-mode+1,utf8+mode-1, spres?"<======":"");
+ if ((spres)&&((j - mode + 1)>longest)) longest = j - mode + 1;
+ result+=spres;
+ }
words++;
mode = 0;
} else continue;
@@ -85,40 +116,89 @@ rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id
}
}
}
+
if (mode) {
- result+=rccSpeller(speller, utf8 + mode - 1)?1:0;
+ if ((!english_mode)&&(english_word)&&(rccSpeller(english_speller, utf8 + mode -1)))
+ english++;
+ else {
+ if ((english_mode)&&(!english_word)) is_english_string = 0;
+ spres = rccSpeller(speller, utf8 + mode - 1)?1:0;
+ if ((spres)&&((j-mode+1)>longest)) longest = j - mode + 1;
+ result += spres;
+ }
words++;
}
if (english_mode) {
+ if (english_string) free(english_string);
+ printf("%u %u\n", result, words);
+
english_res = 1.*result/words;
- english_lang = (rcc_language_id)i;
- } else if (words) {
- res = 1.*result/words;
- if (res > RCC_REQUIRED_PROBABILITY) {
+ english_lang = (rcc_language_id)i;
+ english_longest = longest;
+ english_string = recoded;
+ } else if (words>english) {
+ res = 1.*result/(words - english);
+ printf("%u %u %u\n", result, words, english);
+ if ((res > RCC_REQUIRED_PROBABILITY)&&(longest > RCC_REQUIRED_LENGTH)) {
+ if (best_string) free(best_string);
+ if (english_string) free(english_string);
+
if (retstring) *retstring = recoded;
else free(recoded);
return (rcc_language_id)i;
- }
- if (words > english) {
- res = 1.*(result - english)/(words - english);
- if (res > RCC_REQUIRED_PROBABILITY) {
- if (retstring) *retstring = recoded;
- else free(recoded);
- return (rcc_language_id)i;
- }
- }
- }
-
- free(recoded);
+ } else if ((res > bestres + RCC_PROBABILITY_STEP)||
+ ((res > bestres - RCC_PROBABILITY_STEP)&&(longest > bestlongest))||
+ ((res > bestres)&&(longest == bestlongest))) {
+
+ if (best_string) free(best_string);
+
+ bestres = res;
+ bestlang = (rcc_language_id)i;
+ bestlongest = longest;
+ best_string = recoded;
+ } else if (!accepted_nonenglish_langs) {
+ bestlang = (rcc_language_id)i;
+ best_string = recoded;
+ } else free(recoded);
+
+ accepted_nonenglish_langs++;
+ } else free(recoded);
}
- if (english_res > RCC_REQUIRED_PROBABILITY) {
- if (retstring) {
- *retstring = rccCreateString(english_lang, buf, len);
- }
+ if ((is_english_string)&&(english_res > RCC_REQUIRED_PROBABILITY)&&(english_longest > RCC_REQUIRED_LENGTH)) {
+ if (best_string) free(best_string);
+ if (retstring) *retstring = english_string;
+ else if (english_string) free(english_string);
return english_lang;
}
+
+ if ((bestres > RCC_ACCEPTABLE_PROBABILITY)&&(bestlongest > RCC_ACCEPTABLE_LENGTH)) {
+ if (english_string) free(english_string);
+ if (retstring) *retstring = best_string;
+ else if (best_string) free(best_string);
+ return bestlang;
+ }
+
+ if ((is_english_string)&&(english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) {
+ if (best_string) free(best_string);
+ if (retstring) *retstring = english_string;
+ else if (english_string) free(english_string);
+ return english_lang;
+ }
+
+ if (best_string) {
+ if (english_string) free(english_string);
+ if (retstring) *retstring = best_string;
+ else if (best_string) free(best_string);
+ return bestlang;
+ } else if (best_string) free(best_string);
+
+ if ((english_res > RCC_ACCEPTABLE_PROBABILITY)&&(english_longest > RCC_ACCEPTABLE_LENGTH)) {
+ if (retstring) *retstring = english_string;
+ else if (english_string) free(english_string);
+ return english_lang;
+ } else if (english_string) free(english_string);
return (rcc_language_id)-1;
}
@@ -206,9 +286,12 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf,
*/
detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result);
- if (detected_language_id != (rcc_language_id)-1) return result;
+ if (detected_language_id != (rcc_language_id)-1) {
+ printf("Language %i: %s\n", rccStringGetLanguage(result), result);
+ return result;
+ }
+
-
err = rccConfigure(ctx);
if (err) return NULL;
@@ -316,7 +399,6 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s
}
if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) {
- puts("entrans");
if (!config->entrans) {
config->entrans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rcc_english_language_sn);
}
@@ -384,7 +466,6 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const
const char *from_charset, *to_charset;
rcc_charset_id from_charset_id, to_charset_id;
rcc_class_type class_type;
- rcc_option_value usedb4;
if (!ctx) {
if (rcc_default_ctx) ctx = rcc_default_ctx;
@@ -394,20 +475,9 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const
class_type = rccGetClassType(ctx, to);
if ((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_NAMES))) goto recoding;
- if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)&RCC_OPTION_LEARNING_FLAG_LEARN) goto recoding;
-
- usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE);
- if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) {
- stmp = rccDb4GetKey(ctx->db4ctx, buf, len);
- if (stmp) {
- if (rccStringFixID(stmp, ctx)) free(stmp);
- else {
- result = rccSizedTo(ctx, to, stmp, rlen);
- free(stmp);
- return result;
- }
- }
- }
+ if (rccGetOption(ctx, RCC_OPTION_LEARNING_MODE)) goto recoding;
+ if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) goto recoding;
+ if (rccGetOption(ctx, RCC_OPTION_TRANSLATE)) goto recoding;
err = rccConfigure(ctx);
if (err) return NULL;