From 94ca629ceec7b0dc9f6f724b2e15923d3ec1d5b3 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Fri, 5 Aug 2005 03:06:50 +0000 Subject: Language AutoDetection Improvements - Fix: Loading/Saving range options. - Fix: Language AutoDetection. Using locale language instead of selected one. - Support for range options in GTK UI. - Option to control recoding timeout is provided. - LibRCC.h is updated (Translate, Spell, IConv). - Documentation is updated. - Add 'rcc-config' alias to 'rcc-gtk2-config' in spec. - Implemented concept of parent languages (spelled 'parrent' in identifiers) + The concept is used in language autodetection. A string in the language under consideration is permitted to contain words from all of its parent languages. + English is assumed to be the parent of all other languages by default. + Russian is the parent language of Ukrainian and Belarusian. - No translation to English if translation between related languages (one of the languages is a parent of the other) fails. --- src/internal.h | 26 +++++---- src/librcc.c | 70 ++++++++++++++++++++++-- src/librcc.h | 96 +++++++++++++++++++++++++++++++-- src/lngconfig.c | 41 +++++++++++++-- src/opt.c | 4 +- src/rccconfig.c | 37 +++++++++++-- src/rccconfig.h | 5 ++ src/rccdb4.h | 4 +- src/rccspell.c | 53 +++++++++++++++++-- src/rccspell.h | 11 ++-- src/rccstring.c | 23 ++++++++ src/rccstring.h | 1 + src/rccxml.c | 59 ++++++++++++++++++--- src/recode.c | 161 ++++++++++++++++++++++++++++++++++++++++---------------- 14 files changed, 501 insertions(+), 90 deletions(-) (limited to 'src') diff --git a/src/internal.h b/src/internal.h index fcaa4c6..d5797fc 100644 --- a/src/internal.h +++ b/src/internal.h @@ -5,15 +5,8 @@ # define LIBRCC_DATA_DIR "/usr/lib/rcc" #endif /* LIBRCC_DATA_DIR */ -#include "librcc.h" -#include "recode.h" -#include "engine.h" -#include "lngconfig.h" -#include "rccstring.h" -#include "rccdb4.h" -#include "rcciconv.h" -#include "rccstring.h" -#include "rccmutex.h" +#define RCC_MAX_LANGUAGE_PARRENTS 4 +#define 
RCC_MAX_RELATIONS RCC_MAX_LANGUAGES #ifdef HAVE_STRNLEN #define STRNLEN(str,n) (n?strnlen(str,n):strlen(str)) @@ -26,6 +19,20 @@ #define RCC_MAX_PREFIX_CHARS 32 #define RCC_MIN_DB4_CHARS 3 +#include "librcc.h" +#include "recode.h" +#include "engine.h" +#include "lngconfig.h" +#include "rccstring.h" +#include "rccdb4.h" +#include "rcciconv.h" +#include "rccstring.h" +#include "rccmutex.h" + + + +typedef rcc_language_id rcc_language_parrent_list[RCC_MAX_LANGUAGE_PARRENTS]; + struct rcc_context_t { char locale_variable[RCC_MAX_VARIABLE_CHARS+1]; @@ -37,6 +44,7 @@ struct rcc_context_t { unsigned int max_languages; unsigned int n_languages; rcc_language_ptr *languages; + rcc_language_parrent_list *language_parrents; rcc_language_config configs; unsigned int max_classes; diff --git a/src/librcc.c b/src/librcc.c index 757b71b..208fcb3 100644 --- a/src/librcc.c +++ b/src/librcc.c @@ -58,6 +58,7 @@ rcc_compiled_configuration rccGetCompiledConfiguration() { int rccInit() { int err; char *tmp; + unsigned long i, rpos; #ifdef HAVE_PWD_H struct passwd *pw; @@ -78,12 +79,26 @@ int rccInit() { if (!rcc_home_dir) rcc_home_dir = strdup("/"); memcpy(rcc_default_languages, rcc_default_languages_embeded, (RCC_MAX_LANGUAGES + 1)*sizeof(rcc_language)); + memcpy(rcc_default_aliases, rcc_default_aliases_embeded, (RCC_MAX_ALIASES + 1)*sizeof(rcc_language_alias)); + memcpy(rcc_default_relations, rcc_default_relations_embeded, (RCC_MAX_RELATIONS + 1)*sizeof(rcc_language_relation)); memcpy(rcc_option_descriptions, rcc_option_descriptions_embeded, (RCC_MAX_OPTIONS + 1)*sizeof(rcc_option_description)); #ifdef HAVE_LIBTRANSLATE rccExternalInit(); #endif /* HAVE_LIBTRANSLATE */ + for (rpos=0;rcc_default_relations[rpos].lang;rpos++); + for (i=0;rcc_default_languages[i].sn;i++) { + if (!strcasecmp(rcc_default_languages[i].sn, rcc_default_language_sn)) continue; + if (!strcasecmp(rcc_default_languages[i].sn, rcc_disabled_language_sn)) continue; + if (!strcasecmp(rcc_default_languages[i].sn, 
rcc_english_language_sn)) continue; + + rcc_default_relations[rpos].lang = rcc_default_languages[i].sn; + rcc_default_relations[rpos++].parrent = rcc_english_language_sn; + } + rcc_default_relations[rpos].lang = NULL; + rcc_default_relations[rpos].parrent = NULL; + err = rccPluginInit(); if (!err) err = rccTranslateInit(); if (!err) err = rccXmlInit(1); @@ -125,6 +140,7 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu rcc_context ctx; rcc_language_ptr *languages; + rcc_language_parrent_list *language_parrents; rcc_class_ptr *classes; rcc_language_config configs; rcc_iconv *from; @@ -151,16 +167,18 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu languages = (rcc_language_ptr*)malloc((max_languages+1)*sizeof(rcc_language_ptr)); classes = (rcc_class_ptr*)malloc((max_classes+1)*sizeof(rcc_class_ptr)); from = (rcc_iconv*)malloc((max_classes)*sizeof(rcc_iconv)); + language_parrents = (rcc_language_parrent_list*)malloc((max_languages+1)*sizeof(rcc_language_parrent_list)); mutex = rccMutexCreate(); configs = (rcc_language_config)malloc((max_languages)*sizeof(struct rcc_language_config_t)); - if ((!ctx)||(!languages)||(!classes)||(!mutex)) { + if ((!ctx)||(!languages)||(!classes)||(!mutex)||(!language_parrents)) { if (mutex) rccMutexFree(mutex); if (from) free(from); if (configs) free(configs); if (classes) free(classes); if (languages) free(languages); + if (language_parrents) free(language_parrents); if (ctx) free(ctx); return NULL; } @@ -174,7 +192,10 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu ctx->aliases[0] = NULL; for (i=0;rcc_default_aliases[i].alias;i++) rccRegisterLanguageAlias(ctx, rcc_default_aliases + i); - + + ctx->language_parrents = language_parrents; + for (i=0;ilanguages = languages; ctx->max_languages = max_languages; ctx->n_languages = 0; @@ -216,12 +237,15 @@ rcc_context rccCreateContext(const char *locale_variable, unsigned int max_langu } 
else { for (i=0;rcc_default_languages[i].sn;i++) rccRegisterLanguage(ctx, rcc_default_languages+i); - + if (max_languages < i) { rccFree(ctx); return NULL; } + for (i=0;rcc_default_relations[i].lang;i++) + rccRegisterLanguageRelation(ctx, rcc_default_relations+i); + ctx->current_config = rccGetCurrentConfig(ctx); } @@ -282,6 +306,7 @@ void rccFreeContext(rcc_context ctx) { free(ctx->configs); } if (ctx->classes) free(ctx->classes); + if (ctx->language_parrents) free(ctx->language_parrents); if (ctx->languages) free(ctx->languages); if (ctx->mutex) rccMutexFree(ctx->mutex); free(ctx); @@ -397,6 +422,45 @@ rcc_alias_id rccRegisterLanguageAlias(rcc_context ctx, rcc_language_alias *alias return i-1; } +rcc_relation_id rccRegisterLanguageRelation(rcc_context ctx, rcc_language_relation *relation) { + unsigned int i; + rcc_language_id language_id; + const char *lang; + const char *parrent; + rcc_language_id *list; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return (rcc_alias_id)-1; + } + if (!relation) return (rcc_relation_id)-1; + + lang = relation->lang; + parrent = relation->parrent; + if ((!lang)||(!parrent)||(!strcasecmp(lang,parrent))) return (rcc_relation_id)-1; + + language_id = rccGetLanguageByName(ctx, lang); + if (language_id == (rcc_language_id)-1) return (rcc_relation_id)-1; + + + list = ctx->language_parrents[language_id]; + + language_id = rccGetLanguageByName(ctx, parrent); + if (language_id == (rcc_language_id)-1) return (rcc_relation_id)0; + + for (i=0;list[i]!=(rcc_language_id)-1;i++) + if (list[i] == language_id) return (rcc_relation_id)0; + + if (imutex); - if (!config->speller) config->speller = rccSpellerCreate(config->language->sn); + if (!config->speller) { + config->speller = rccSpellerCreate(config->language->sn); + + if (config->speller) language_id = rccConfigGetLanguage(config); + else language_id = (rcc_language_id)-1; + if (language_id != (rcc_language_id)-1) parrents = config->ctx->language_parrents[language_id]; + 
else parrents = NULL; + + if (parrents) { + for (i = 0; parrents[i]!=(rcc_language_id)-1; i++) { + pconfig = rccGetConfig(config->ctx, parrents[i]); + if (pconfig) { + speller = rccConfigGetSpeller(pconfig); + rccSpellerAddParrent(config->speller, speller); + } + } + } + } rccMutexUnLock(config->mutex); return config->speller; } rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id to) { + rcc_option_value timeout; + if (!config) return NULL; rccMutexLock(config->mutex); @@ -360,7 +384,11 @@ rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id if (!config->trans) { config->trans = rccTranslateOpen(config->language->sn, rccGetLanguageName(config->ctx, to)); - config->translang = to; + if (config->trans) { + config->translang = to; + timeout = rccGetOption(config->ctx, RCC_OPTION_TIMEOUT); + if (timeout) rccTranslateSetTimeout(config->trans, timeout); + } } rccMutexUnLock(config->mutex); @@ -368,11 +396,18 @@ rcc_translate rccConfigGetTranslator(rcc_language_config config, rcc_language_id } rcc_translate rccConfigGetEnglishTranslator(rcc_language_config config) { + rcc_option_value timeout; + if (!config) return NULL; rccMutexLock(config->mutex); - if (!config->entrans) + if (!config->entrans) { config->entrans = rccTranslateOpen(config->language->sn, rcc_english_language_sn); + if (config->entrans) { + timeout = rccGetOption(config->ctx, RCC_OPTION_TIMEOUT); + if (timeout) rccTranslateSetTimeout(config->entrans, timeout); + } + } rccMutexUnLock(config->mutex); return config->entrans; diff --git a/src/opt.c b/src/opt.c index e6f8486..9e9f00d 100644 --- a/src/opt.c +++ b/src/opt.c @@ -112,7 +112,7 @@ rcc_option rccOptionDescriptionGetOption(rcc_option_description *desc) { const char *rccOptionDescriptionGetValueName(rcc_option_description *desc, rcc_option_value value) { unsigned int i; - if (desc) { + if ((desc)&&(desc->vsn)) { for (i=0;desc->vsn[i];i++) { if (i == value) return desc->vsn[i]; } @@ -123,7 +123,7 
@@ const char *rccOptionDescriptionGetValueName(rcc_option_description *desc, rcc_o rcc_option_value rccOptionDescriptionGetValueByName(rcc_option_description *desc, const char *name) { unsigned int i; - if ((desc)&&(name)) { + if ((desc)&&(desc->vsn)&&(name)) { for (i=0;desc->vsn[i];i++) { if (!strcasecmp(desc->vsn[i], name)) return (rcc_option_value)i; } diff --git a/src/rccconfig.c b/src/rccconfig.c index f820606..a54b778 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -6,10 +6,20 @@ #include "engine.h" #include "opt.h" -rcc_language_alias rcc_default_aliases[] = { +#define RCC_DEFAULT_RECODING_TIMEOUT 500000 + +rcc_language_alias rcc_default_aliases[RCC_MAX_ALIASES + 1]; +rcc_language_alias rcc_default_aliases_embeded[RCC_MAX_ALIASES + 1] = { { "cs_SK", "sk" }, { "ru_UA", "uk" }, - { NULL, NULL} + { NULL, NULL } +}; + +rcc_language_relation rcc_default_relations[RCC_MAX_RELATIONS + 1]; +rcc_language_relation rcc_default_relations_embeded[RCC_MAX_RELATIONS + 1] = { + { "uk", "ru" }, + { "be", "ru" }, + { NULL, NULL } }; const char rcc_default_language_sn[] = "default"; @@ -140,6 +150,11 @@ rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = { {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_INVISIBLE, "TRANSLATE", rcc_sn_translate }, #endif /* HAVE_LIBTRANSLATE */ {RCC_OPTION_AUTOENGINE_SET_CURRENT, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTOENGINE_SET_CURRENT", rcc_sn_boolean }, +#ifdef HAVE_LIBTRANSLATE + {RCC_OPTION_TIMEOUT, RCC_DEFAULT_RECODING_TIMEOUT, { RCC_OPTION_RANGE_TYPE_RANGE, 0, 5000000, 50000}, RCC_OPTION_TYPE_STANDARD, "TIMEOUT", NULL }, +#else + {RCC_OPTION_TIMEOUT, RCC_DEFAULT_RECODING_TIMEOUT, { RCC_OPTION_RANGE_TYPE_RANGE, 0, 5000000, 50000}, RCC_OPTION_TYPE_INVISIBLE, "TIMEOUT", NULL }, +#endif /* HAVE_LIBTRANSLATE */ {RCC_MAX_OPTIONS} }; @@ -149,7 +164,8 @@ rcc_option_description *rccGetOptionDescription(rcc_option option) { if 
((option<0)||(option>=RCC_MAX_OPTIONS)) return NULL; for (i=0;rcc_option_descriptions[i].option!=RCC_MAX_OPTIONS;i++) - if (rcc_option_descriptions[i].option == option) return rcc_option_descriptions+i; + if (rcc_option_descriptions[i].option == option) + return rcc_option_descriptions+i; return NULL; } @@ -180,3 +196,18 @@ int rccIsUTF8(const char *name) { if ((!name)||(strcasecmp(name, "UTF-8")&&strcasecmp(name, "UTF8"))) return 0; return 1; } + +unsigned int rccDefaultDropLanguageRelations(const char *lang) { + unsigned long i, j; + for (i=0,j=0;rcc_default_relations[i].lang;i++) { + if (strcasecmp(lang, rcc_default_relations[i].lang)) { + if (j #endif /* HAVE_DB_H */ diff --git a/src/rccspell.c b/src/rccspell.c index c54e267..da5e4d1 100644 --- a/src/rccspell.c +++ b/src/rccspell.c @@ -1,6 +1,7 @@ #include #include +#include "internal.h" #include "rccspell.h" rcc_speller rccSpellerCreate(const char *lang) { @@ -28,6 +29,7 @@ rcc_speller rccSpellerCreate(const char *lang) { } rccspeller->speller = speller; + rccspeller->parrents[0] = NULL; return rccspeller; #else return NULL; @@ -47,17 +49,58 @@ int rccSpellerGetError(rcc_speller rccspeller) { return 0; } -int rccSpellerSized(rcc_speller speller, const char *word, size_t len) { +int rccSpellerAddParrent(rcc_speller speller, rcc_speller parrent) { + unsigned int i; + if ((!speller)||(!parrent)) return -1; + + for (i=0;speller->parrents[i];i++); + if (i >= RCC_MAX_LANGUAGE_PARRENTS) return -1; + speller->parrents[i++] = parrent; + speller->parrents[i] = NULL; + + return 0; +} + +rcc_speller_result rccSpellerSized(rcc_speller speller, const char *word, size_t len, int recursion) { #ifdef HAVE_ASPELL + rcc_speller_result result, saved_result = (rcc_speller_result)0; + unsigned int i; int res; + + if (rccSpellerGetError(speller)) return (rcc_speller_result)RCC_SPELLER_INCORRECT; + + if (recursion) { + for (i=0; speller->parrents[i]; i++) { + result = rccSpellerSized(speller->parrents[i], word, len, 0); + if ((result 
== RCC_SPELLER_CORRECT)||(result == RCC_SPELLER_PARRENT)) return RCC_SPELLER_PARRENT; + if ((result == RCC_SPELLER_ALMOST_CORRECT)||(result == RCC_SPELLER_ALMOST_PARRENT)) saved_result = RCC_SPELLER_ALMOST_PARRENT; + } + } - if (rccSpellerGetError(speller)) return 0; + if (saved_result) return saved_result; + res = aspell_speller_check(speller->speller, word, len?len:-1); - return res<0?0:res; + return res<=0?RCC_SPELLER_INCORRECT:RCC_SPELLER_CORRECT; #endif /* HAVE_ASPELL */ return 0; } -int rccSpeller(rcc_speller speller, const char *word) { - return rccSpellerSized(speller, word, 0); +rcc_speller_result rccSpeller(rcc_speller speller, const char *word) { + return rccSpellerSized(speller, word, 0, 1); +} + +int rccSpellerResultIsOwn(rcc_speller_result res) { + if ((res == RCC_SPELLER_ALMOST_CORRECT)||(res == RCC_SPELLER_CORRECT)) return 1; + return 0; +} + +int rccSpellerResultIsPrecise(rcc_speller_result res) { + if ((res == RCC_SPELLER_PARRENT)||(res == RCC_SPELLER_CORRECT)) return 1; + return 0; +} + +int rccSpellerResultIsCorrect(rcc_speller_result res) { + if ((res == RCC_SPELLER_ALMOST_CORRECT)||(res == RCC_SPELLER_CORRECT)) return 1; + if ((res == RCC_SPELLER_ALMOST_PARRENT)||(res == RCC_SPELLER_PARRENT)) return 1; + return 0; } diff --git a/src/rccspell.h b/src/rccspell.h index 49e39f4..49d5c99 100644 --- a/src/rccspell.h +++ b/src/rccspell.h @@ -7,23 +7,22 @@ #include #endif /* HAVE_ASPELL */ +#include "internal.h" + struct rcc_speller_t { #ifdef HAVE_ASPELL struct AspellSpeller *speller; #else void *speller; #endif /* HAVE_ASPELL */ + rcc_speller parrents[RCC_MAX_LANGUAGE_PARRENTS+1]; }; -typedef struct rcc_speller_t *rcc_speller; typedef struct rcc_speller_t rcc_speller_s; -rcc_speller rccSpellerCreate(const char *lang); -void rccSpellerFree(rcc_speller speller); - int rccSpellerGetError(rcc_speller rccspeller); -int rccSpellerSized(rcc_speller speller, const char *word, size_t len); -int rccSpeller(rcc_speller speller, const char *word); + 
+rcc_speller_result rccSpellerSized(rcc_speller speller, const char *word, size_t len, int recursion); #endif /* _RCC_SPELL_H */ diff --git a/src/rccstring.c b/src/rccstring.c index aa92407..0f46c90 100644 --- a/src/rccstring.c +++ b/src/rccstring.c @@ -175,3 +175,26 @@ int rccIsASCII(const char *str) { if ((unsigned char)str[i]>0x7F) return 0; return 1; } + +size_t rccStringSizedGetChars(const char *str, size_t size) { + size_t i, skip = 0, chars = 0; + const unsigned char *tmp; + + tmp = rccGetString(str); + + for (i=0;(size?(size-i):tmp[i]);i++) { + if (skip) { + skip--; + continue; + } + + if (tmp[i]<0x80) skip = 0; + else if ((tmp[i]>0xBF)&&(tmp[i]<0xE0)) skip = 1; + else if ((tmp[i]>0xDF)&&(tmp[i]<0xF0)) skip = 2; + else if ((tmp[i]>0xEF)&&(tmp[i]<0xF8)) skip = 3; + else skip = 4; + chars++; + } + + return chars; +} diff --git a/src/rccstring.h b/src/rccstring.h index e9e9734..96f8b2d 100644 --- a/src/rccstring.h +++ b/src/rccstring.h @@ -26,5 +26,6 @@ int strnlen(const char *str, size_t size); int rccStrnlen(const char *str, size_t size); #endif /* HAVE_STRNLEN */ int rccIsASCII(const char *str); +size_t rccStringSizedGetChars(const char *str, size_t size); #endif /* _RCC_STRING_H */ diff --git a/src/rccxml.c b/src/rccxml.c index 143f930..b40d4fc 100644 --- a/src/rccxml.c +++ b/src/rccxml.c @@ -50,7 +50,7 @@ int rccXmlInit(int LoadConfiguration) { FILE *f; char config[MAX_HOME_CHARS + 32]; - xmlXPathContextPtr xpathctx; + xmlXPathContextPtr xpathctx = NULL; xmlXPathObjectPtr obj = NULL; xmlNodeSetPtr node_set; unsigned long i, nnodes; @@ -58,6 +58,8 @@ int rccXmlInit(int LoadConfiguration) { xmlAttrPtr attr; const char *lang, *engine_name; unsigned int pos, lpos, epos, cpos; + const char *alias, *parrent; + unsigned int j, apos, rpos; rcc_engine *engine; @@ -82,6 +84,8 @@ int rccXmlInit(int LoadConfiguration) { } else config[0] = 0; + for (apos=0;rcc_default_aliases[apos].alias;apos++); + // Load Extra Languages if (config[0]) { xmlctx = xmlReadFile(config, 
NULL, 0); @@ -108,7 +112,17 @@ int rccXmlInit(int LoadConfiguration) { pos = rccDefaultGetLanguageByName(lang); if (!pos) continue; - if (pos == (rcc_language_id)-1) pos = lpos; + if (pos == (rcc_language_id)-1) { + for (rpos=0;rcc_default_relations[rpos].lang;rpos++); + if (rpos < RCC_MAX_RELATIONS) { + rcc_default_relations[rpos].parrent = rcc_english_language_sn; + rcc_default_relations[rpos++].lang = lang; + rcc_default_relations[rpos].parrent = NULL; + rcc_default_relations[rpos].lang = NULL; + } + + pos = lpos; + } else if (pos == RCC_MAX_LANGUAGES) continue; for (epos = 1, cpos = 1,node=pnode->children;node;node=node->next) { @@ -121,10 +135,10 @@ int rccXmlInit(int LoadConfiguration) { } } } - if (!xmlStrcmp(node->name, "Engines")) { + else if (!xmlStrcmp(node->name, "Engines")) { for (enode=node->children;enode;enode=enode->next) { if (enode->type != XML_ELEMENT_NODE) continue; - if ((!xmlStrcmp(enode->name, "Engine"))&&(rccXmlGetText(enode))&&(eposname, "Engine"))&&(eposname, "Aliases")) { + for (enode=node->children;enode;enode=enode->next) { + if (enode->type != XML_ELEMENT_NODE) continue; + if ((!xmlStrcmp(enode->name, "Alias"))&&(aposname, "Relations")) { + rpos = rccDefaultDropLanguageRelations(lang); + for (enode=node->children;enode;enode=enode->next) { + if (enode->type != XML_ELEMENT_NODE) continue; + if ((!xmlStrcmp(enode->name, "Parrent"))&&(rpos 1)||(epos > 1)) { @@ -161,6 +208,7 @@ clear: } } } + return 0; } @@ -507,8 +555,7 @@ int rccLoad(rcc_context ctx, const char *name) { ovalue = rccOptionDescriptionGetValueByName(odesc, tmp); if (ovalue == (rcc_option_value)-1) ovalue = (rcc_option_value)atoi(tmp); err = rccSetOption(ctx, (rcc_option)i, ovalue); - } - else err = -1; + } else err = -1; } else err = -1; if (err) rccOptionSetDefault(ctx, (rcc_option)i); } diff --git a/src/recode.c b/src/recode.c index 48ce2d6..27dff92 100644 --- a/src/recode.c +++ b/src/recode.c @@ -22,25 +22,32 @@ #define RCC_ACCEPTABLE_LENGTH 3 static rcc_language_id 
rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) { - rcc_speller speller = NULL, english_speller = NULL; + rcc_speller speller = NULL; unsigned long i, nlanguages; rcc_language_config config, config0 = NULL; rcc_string recoded; unsigned char *utf8; size_t j, mode; - unsigned long spres, words, english, result; - size_t longest; + rcc_speller_result spres; + unsigned long words, result, own; + size_t longest, ownlongest; unsigned char english_mode, english_word = 1; char *english_string = NULL; rcc_language_id english_lang = (rcc_language_id)-1; size_t english_longest = 0; unsigned char is_english_string = 1; - double res, english_res = 0; + double res, ownres, english_res = 0; rcc_option_value usedb4; rcc_language_id bestlang = (rcc_language_id)-1; - unsigned long bestlongest = RCC_ACCEPTABLE_LENGTH; + size_t bestlongest = RCC_ACCEPTABLE_LENGTH; + size_t bestownlongest = RCC_ACCEPTABLE_LENGTH; + unsigned long bestown = 0; double bestres = RCC_ACCEPTABLE_PROBABILITY; char *best_string = NULL; + rcc_language_id bestfixlang = (rcc_language_id)-1; + unsigned long k; + rcc_language_id *parrents; + size_t chars = 0; unsigned long accepted_nonenglish_langs = 0; @@ -64,22 +71,24 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c nlanguages = ctx->n_languages; english_lang = rccGetLanguageByName(ctx, rcc_english_language_sn); - if (english_lang != (rcc_language_id)-1) { - config = rccGetUsableConfig(ctx, english_lang); - if (config) { - english_speller = rccConfigGetSpeller(config); - if (rccSpellerGetError(english_speller)) english_speller = NULL; - } - } for (i=0;ilanguage_parrents[i]; + for (k = 0;parrents[k] != (rcc_language_id)-1;k++) + if (parrents[k] == bestfixlang) break; + + if (parrents[k] != bestfixlang) continue; + } + speller = rccConfigGetSpeller(config); if (rccSpellerGetError(speller)) continue; @@ -91,17 +100,24 @@ static rcc_language_id 
rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c utf8 = (char*)rccStringGetString(recoded); - for (result=0,english=0,words=0,longest=0,mode=0,j=0;utf8[j];j++) { + for (result=0,own=0,words=0,ownlongest=0,longest=0,mode=0,j=0;utf8[j];j++) { if (isSpace(utf8[j])) { if (mode) { - if ((!english_mode)&&(english_word)&&(rccSpellerSized(english_speller, utf8 + mode -1, j - mode + 1))) - english++; - else { - if ((english_mode)&&(!english_word)) is_english_string = 0; - spres = rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0; - if ((spres)&&((j - mode + 1)>longest)) longest = j - mode + 1; - result+=spres; + if ((english_mode)&&(!english_word)) is_english_string = 0; + + spres = rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1, 1); + if (rccSpellerResultIsCorrect(spres)) { + result++; + chars = rccStringSizedGetChars(utf8 + mode - 1, j - mode + 1); + if (chars > longest) longest = chars; } + if (rccSpellerResultIsOwn(spres)) { + own++; + if (chars > ownlongest) ownlongest = chars; + } +#if RCC_DEBUG_LANGDETECT > 1 + printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1); +#endif /* RCC_DEBUG_LANGDETECT */ words++; mode = 0; } else continue; @@ -116,14 +132,22 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c } if (mode) { - if ((!english_mode)&&(english_word)&&(rccSpeller(english_speller, utf8 + mode -1))) - english++; - else { - if ((english_mode)&&(!english_word)) is_english_string = 0; - spres = rccSpeller(speller, utf8 + mode - 1)?1:0; - if ((spres)&&((j-mode+1)>longest)) longest = j - mode + 1; - result += spres; + if ((english_mode)&&(!english_word)) is_english_string = 0; + + spres = rccSpeller(speller, utf8 + mode - 1); + if (rccSpellerResultIsCorrect(spres)) { + result++; + chars = rccStringSizedGetChars(utf8 + mode - 1, 0); + if (chars > longest) longest = chars; } + if (rccSpellerResultIsOwn(spres)) { + own++; + if (chars > ownlongest) ownlongest = chars; + } +#if 
RCC_DEBUG_LANGDETECT > 1 + printf("%s: %u (%.*s)\n", config->language->sn, spres, j - mode + 1, utf8 + mode -1); +#endif /* RCC_DEBUG_LANGDETECT */ + words++; } @@ -134,25 +158,27 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c english_lang = (rcc_language_id)i; english_longest = longest; english_string = recoded; - } else if (words>english) { - res = 1.*result/(words - english); - if ((res > RCC_REQUIRED_PROBABILITY)&&(longest > RCC_REQUIRED_LENGTH)) { - if (best_string) free(best_string); - if (english_string) free(english_string); - - if (retstring) *retstring = recoded; - else free(recoded); - return (rcc_language_id)i; - } else if ((res > bestres + RCC_PROBABILITY_STEP)|| + } else if (words>0) { + res = 1.*result/words; + ownres = 1.*own/words; + + if ((res > bestres + RCC_PROBABILITY_STEP)|| ((res > bestres - RCC_PROBABILITY_STEP)&&(longest > bestlongest))|| - ((res > bestres)&&(longest == bestlongest))) { - + ((res > bestres + 1E-10)&&(longest == bestlongest))|| + (((res-bestres)<1E-10)&&((bestres-res)<1E-10)&&(longest == bestlongest)&&(own > 0))) { + if (best_string) free(best_string); bestres = res; - bestlang = (rcc_language_id)i; + bestlang = rccGetRealLanguage(ctx, (rcc_language_id)i); bestlongest = longest; best_string = recoded; + bestown = own; + bestownlongest = ownlongest; + + if ((ownres > RCC_REQUIRED_PROBABILITY)&&(ownlongest > RCC_REQUIRED_LENGTH)) { + bestfixlang = bestlang; + } } else if (!accepted_nonenglish_langs) { bestlang = (rcc_language_id)i; best_string = recoded; @@ -162,6 +188,13 @@ static rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id c } else free(recoded); } + if ((bestres > RCC_REQUIRED_PROBABILITY)&&(bestlongest > RCC_REQUIRED_LENGTH)&&(bestown>0)) { + if (english_string) free(english_string); + if (retstring) *retstring = best_string; + else if (best_string) free(best_string); + return bestlang; + } + if ((is_english_string)&&(english_res > 
RCC_REQUIRED_PROBABILITY)&&(english_longest > RCC_REQUIRED_LENGTH)) { if (best_string) free(best_string); if (retstring) *retstring = english_string; @@ -242,6 +275,25 @@ rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_ return rccConfigDetectCharsetInternal(config, class_id, buf, len); } +static int rccAreLanguagesRelated(rcc_context ctx, rcc_language_id l1, rcc_language_id l2, rcc_language_id skip) { + unsigned int i; + rcc_language_id *list; + + if ((l1 == skip)||(l2 == skip)) return 0; + + if (l1 == l2) return 1; + + list = ctx->language_parrents[l1]; + for (i=0;list[i] != (rcc_language_id)-1;i++) + if (list[i] == l2) return 1; + + list = ctx->language_parrents[l2]; + for (i=0;list[i] != (rcc_language_id)-1;i++) + if (list[i] == l1) return 1; + + return 0; +} + rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { int err; size_t ret; @@ -286,7 +338,9 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result); if (detected_language_id != (rcc_language_id)-1) { - /*printf("Language %i: %s\n", rccStringGetLanguage(result), result);*/ +#ifdef RCC_DEBUG_LANGDETECT + printf("Language %i(%s): %s\n", rccStringGetLanguage(result), rccStringGetLanguage(result)?rccGetLanguageName(ctx, rccStringGetLanguage(result)):"", result); +#endif /* RCC_DEBUG_LANGDETECT */ return result; } @@ -332,6 +386,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s rcc_language_config config; rcc_language_id language_id; rcc_language_id current_language_id; + rcc_language_id english_language_id; rcc_class_type class_type; rcc_option_value translate; rcc_translate trans, entrans; @@ -366,6 +421,8 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s else english_source = 1; if ((class_type != 
RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) { + english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn); + rccMutexLock(ctx->mutex); current_language_id = rccGetCurrentLanguage(ctx); @@ -374,6 +431,18 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s trans = rccConfigGetTranslator(config, current_language_id); if (trans) { translated = rccTranslate(trans, utfstring); + if (translated) { + if ((current_language_id != english_language_id)&&(rccIsASCII(translated))) { + /* Ffrench to german (no umlauts) => not related + english to german (no umlauts) => skiping english relations + DS: Problem if we have relation between french and german */ + if (rccAreLanguagesRelated(ctx, language_id, current_language_id, english_language_id)) { + free(translated); + translated = NULL; + translate = 0; + } + } + } if (translated) { language_id = current_language_id; @@ -394,11 +463,11 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s } } - if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) { + if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((translate)&&(!translated)&&(!english_language_id == current_language_id)&&(!rccAreLanguagesRelated(ctx, language_id, current_language_id, (rcc_language_id)-1)))) { entrans = rccConfigGetEnglishTranslator(config); if (entrans) { translated = rccTranslate(config->entrans, utfstring); - +/* config = rccGetConfig(ctx, language_id); if (!config) { rccMutexUnLock(ctx->mutex); @@ -409,7 +478,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s if (err) { rccMutexUnLock(ctx->mutex); return translated; - } + }*/ } } } -- cgit v1.2.3