diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile.am | 5 | ||||
-rw-r--r-- | src/librcc.h | 119 | ||||
-rw-r--r-- | src/lng.c | 45 | ||||
-rw-r--r-- | src/lng.h | 2 | ||||
-rw-r--r-- | src/lngconfig.c | 229 | ||||
-rw-r--r-- | src/lngconfig.h | 7 | ||||
-rw-r--r-- | src/rccconfig.c | 27 | ||||
-rw-r--r-- | src/rccconfig.h | 5 | ||||
-rw-r--r-- | src/rccexternal.c | 2 | ||||
-rw-r--r-- | src/rcciconv.c | 5 | ||||
-rw-r--r-- | src/rcciconv.h | 2 | ||||
-rw-r--r-- | src/rccspell.c | 63 | ||||
-rw-r--r-- | src/rccspell.h | 29 | ||||
-rw-r--r-- | src/rccstring.c | 8 | ||||
-rw-r--r-- | src/rccstring.h | 1 | ||||
-rw-r--r-- | src/rcctranslate.c | 10 | ||||
-rw-r--r-- | src/recode.c | 233 |
17 files changed, 723 insertions, 69 deletions
diff --git a/src/Makefile.am b/src/Makefile.am index baa08a4..4ba3c35 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -12,6 +12,7 @@ librcc_la_SOURCES = librcc.c \ fake_enca.h fake_rcd.h \ rccenca.c rccenca.h \ rccdb4.c rccdb4.h \ + rccspell.c rccspell.h \ engine.c engine.h \ rccstring.c rccstring.h \ rccxml.c rccxml.h \ @@ -22,7 +23,7 @@ librcc_la_SOURCES = librcc.c \ internal.h include_HEADERS = librcc.h -AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ -librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ +AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ @ASPELL_CFLAGS@ +librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ librcc_la_LDFLAGS = -version-info @LIBRCC_VERSION_INFO@ diff --git a/src/librcc.h b/src/librcc.h index 52e6be4..d08937e 100644 --- a/src/librcc.h +++ b/src/librcc.h @@ -364,6 +364,23 @@ typedef int rcc_option_value; #define RCC_OPTION_LEARNING_FLAG_LEARN 2 /** + * Switch translation off. + */ +#define RCC_OPTION_TRANSLATE_OFF 0 +/** + * Translate data to english language (Current language don't matter). + */ +#define RCC_OPTION_TRANSLATE_TO_ENGLISH 1 +/** + * Skip translation of the english text. + */ +#define RCC_OPTION_TRANSLATE_SKIP_ENGLISH 2 +/** + * Translate whole data to the current language. 
+ */ +#define RCC_OPTION_TRANSLATE_FULL 3 + +/** * List of options available */ typedef enum rcc_option_t { @@ -371,8 +388,9 @@ typedef enum rcc_option_t { RCC_OPTION_AUTODETECT_FS_TITLES, /**< Detect titles of #RCC_CLASS_FS classes */ RCC_OPTION_AUTODETECT_FS_NAMES, /**< Try to find encoding of #RCC_CLASS_FS by accessing fs */ RCC_OPTION_CONFIGURED_LANGUAGES_ONLY, /**< Use only configured languages or languages with auto-engines */ - RCC_OPTION_TRANSLATE, /**< Translate #rcc_string if it's language differs from current one */ RCC_OPTION_AUTOENGINE_SET_CURRENT, /**< If enabled autodetection engine will set current charset */ + RCC_OPTION_AUTODETECT_LANGUAGE, /**< Enables language detection */ + RCC_OPTION_TRANSLATE, /**< Translate #rcc_string if it's language differs from current one */ RCC_MAX_OPTIONS } rcc_option; @@ -970,6 +988,26 @@ int rccTranslateSetTimeout(rcc_translate translate, unsigned long us); char *rccTranslate(rcc_translate translate, const char *buf); /* recode.c */ + +/** + * Tries to detect language of string + * @param ctx is working context ( or default one if NULL supplied ) + * @param class_id is encoding class + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. + * @result is language_id or -1 if autodetection is failed + */ +rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len); +/** + * Tries to detect charset of string + * @param ctx is working context ( or default one if NULL supplied ) + * @param class_id is encoding class + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. 
+ * @result is auto_charset_id or -1 if autodetection is failed + */ +int rccDetectCharset(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len); + /** * Recode string from specified encoding class to #rcc_string. Encoding detection engines and * recoding cache are used (if possible) to detect original 'buf' encoding. Otherwise the @@ -1079,7 +1117,7 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char * @param rlen in rlen the size of recoded string will be returned. * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. */ -char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen); +rcc_string rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen); /** * Recode string between specified encodings. * @@ -1094,6 +1132,77 @@ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const ch char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to, const char *buf, size_t len, size_t *rlen); +/** + * Tries to detect charset of string + * @param config is language configuration + * @param class_id is encoding class + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. + * @result is auto_charset_id or -1 if autodetection is failed + */ +rcc_autocharset_id rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len); + +/** + * Recode string from specified encoding class to #rcc_string. Encoding detection engines and + * recoding cache are used (if possible) to detect original 'buf' encoding. Otherwise the + * preconfigured encoding of class is assumed. 
+ * + * @param config is language configuration + * @param class_id is encoding class + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. + * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. + */ +rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len); +/** + * Recode string from #rcc_string to specified encoding class. If encoding class is of + * 'File System' type, the autoprobing for file names can be performed. In the other cases + * the rcc_string will be recoded in preconfigured class encoding. + * + * @param config is language configuration + * @param class_id is encoding class + * @param buf is original zero terminated string + * @param rlen in rlen the size of recoded string will be returned. + * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. + */ +char *rccConfigSizedTo(rcc_language_config config, rcc_class_id class_id, rcc_const_string buf, size_t *rlen); +/** + * Recode string between different encoding classes. The conversion relies on the rccConfigSizedFrom + * and rccConfigSizedTo functions. + * @see rccConfigSizedFrom + * @see rccConfigSizedTo + * + * @param config is language configuration + * @param from is source encoding class + * @param to is destination encoding class + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. + * @param rlen in rlen the size of recoded string will be returned. + * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. 
+ */ +char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen); +/** + * Recode string from specified encoding to #rcc_string. + * + * @param config is language configuration + * @param charset is source encoding + * @param buf is original string (perhaps not zero terminated) + * @param len is exact size of string or 0. In the last case the size is determined using 'strlen' function. + * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. + */ +rcc_string rccConfigSizedRecodeFromCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen); +/** + * Recode string from #rcc_string to specified encoding. + * + * @param config is language configuration + * @param charset is destination encoding + * @param buf is original zero terminated string + * @param rlen in rlen the size of recoded string will be returned. + * @result is recoded string or NULL if recoding is not required or failed. It is up to the caller to free memory. 
+ */ +char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen); + + #define rccFrom(ctx, class_id, buf) rccSizedFrom(ctx, class_id, buf, 0) #define rccTo(ctx, class_id, buf) rccSizedTo(ctx, class_id, buf, NULL) #define rccRecode(ctx, from, to, buf) rccSizedRecode(ctx, from, to, buf, 0, NULL) @@ -1104,6 +1213,12 @@ char *rccSizedRecodeCharsets(rcc_context ctx, const char *from, const char *to, #define rccRecodeFromCharset(ctx, class_id, charset, buf) rccSizedRecodeFromCharset(ctx, class_id, charset, buf, 0, NULL) #define rccRecodeCharsets(ctx, from, to, buf) rccSizedRecodeCharsets(ctx, from, to, buf, 0, NULL) +#define rccConfigFrom(ctx, class_id, buf) rccConfigSizedFrom(ctx, class_id, buf, 0) +#define rccConfigTo(ctx, class_id, buf) rccConfigSizedTo(ctx, class_id, buf, NULL) +#define rccConfigRecode(ctx, from, to, buf) rccConfigSizedRecode(ctx, from, to, buf, 0, NULL) +#define rccConfigRecodeToCharset(ctx, class_id, charset, buf) rccConfigSizedRecodeToCharset(ctx, class_id, charset, buf, 0, NULL) +#define rccConfigRecodeFromCharset(ctx, class_id, charset, buf) rccConfigSizedRecodeFromCharset(ctx, class_id, charset, buf, 0, NULL) + /******************************************************************************* ******************************** Options *************************************** *******************************************************************************/ @@ -36,11 +36,39 @@ rcc_language_id rccGetLanguageByName(rcc_context ctx, const char *name) { return (rcc_language_id)-1; } -static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) { - unsigned int i; +int rccCheckLanguageUsability(rcc_context ctx, rcc_language_id language_id) { + rcc_language_config config; rcc_option_value clo; rcc_engine_ptr *engines; - rcc_language_config config; + rcc_charset *charsets; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return 0; + } + if 
(language_id>=ctx->n_languages) return 0; + + language_id = rccGetRealLanguage(ctx, language_id); + + clo = rccGetOption(ctx, RCC_OPTION_CONFIGURED_LANGUAGES_ONLY); + if (clo) { + config = rccCheckConfig(ctx, (rcc_language_id)language_id); + if ((!config)||(!config->configured)) { + charsets = ctx->languages[language_id]->charsets; + if ((charsets[0])&&(charsets[1])&&(charsets[2])) { + if (clo == 1) { + engines = ctx->languages[language_id]->engines; + if ((!engines[0])||(!engines[1])) return 0; + } else return 0; + } + } + } + return 1; +} + + +static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) { + unsigned int i; char stmp[RCC_MAX_LANGUAGE_CHARS+1]; if (ctx->default_language) return ctx->default_language; @@ -48,16 +76,7 @@ static rcc_language_id rccGetDefaultLanguage(rcc_context ctx) { if (!rccLocaleGetLanguage(stmp, ctx->locale_variable, RCC_MAX_LANGUAGE_CHARS)) { for (i=0;ctx->languages[i];i++) { if (!strcmp(ctx->languages[i]->sn, stmp)) { - clo = rccGetOption(ctx, RCC_OPTION_CONFIGURED_LANGUAGES_ONLY); - if (clo) { - config = rccCheckConfig(ctx, (rcc_language_id)i); - if ((!config)||(!config->configured)) { - if (clo == 1) { - engines = ctx->languages[i]->engines; - if ((!engines[0])||(!engines[1])) break; - } else break; - } - } + if (!rccCheckLanguageUsability(ctx, (rcc_language_id)i)) break; ctx->default_language = (rcc_language_id)i; return (rcc_language_id)i; } @@ -4,6 +4,8 @@ #include "internal.h" #include "lngconfig.h" + +int rccCheckLanguageUsability(rcc_context ctx, rcc_language_id language_id); rcc_language_ptr rccGetLanguagePointer(rcc_context ctx, rcc_language_id language_id); #define rccGetCurrentEnginePointer(ctx) rccConfigGetCurrentEnginePointer(ctx->current_config) diff --git a/src/lngconfig.c b/src/lngconfig.c index c50ee74..26d0779 100644 --- a/src/lngconfig.c +++ b/src/lngconfig.c @@ -2,9 +2,12 @@ #include <stdlib.h> #include <string.h> +#include "../config.h" + #include "internal.h" #include "rccconfig.h" #include "rcclocale.h" 
+#include "lng.h" rcc_engine_ptr rccConfigGetEnginePointer(rcc_language_config config, rcc_engine_id engine_id) { unsigned int i; @@ -165,6 +168,7 @@ int rccConfigInit(rcc_language_config config, rcc_context ctx) { config->fsiconv = NULL; config->trans = NULL; + config->entrans = NULL; config->ctx = ctx; config->language = NULL; @@ -172,6 +176,7 @@ int rccConfigInit(rcc_language_config config, rcc_context ctx) { config->engine = -1; config->default_charset = dcharsets; config->configured = 0; + config->speller = NULL; config->iconv_to = iconv_to; config->configure = 1; @@ -204,6 +209,10 @@ void rccConfigClear(rcc_language_config config) { rccTranslateClose(config->trans); config->trans = NULL; } + if (config->entrans) { + rccTranslateClose(config->entrans); + config->entrans = NULL; + } if (config->iconv_to) { free(config->iconv_to); config->iconv_to = NULL; @@ -216,31 +225,55 @@ void rccConfigClear(rcc_language_config config) { free(config->default_charset); config->default_charset = NULL; } + if (config->speller) { + rccSpellerFree(config->speller); + config->speller = NULL; + } } } -rcc_language_config rccCheckConfig(rcc_context ctx, rcc_language_id language_id) { - rcc_language_id new_language_id; - - new_language_id = rccGetRealLanguage(ctx, language_id); - if ((new_language_id == (rcc_language_id)-1)||(new_language_id != language_id)) return NULL; - if (!ctx->configs[language_id].charset) return NULL; - if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL; +static rcc_language_config rccGetConfigPointer(rcc_context ctx, rcc_language_id language_id, rcc_language_id *r_language_id) { + + language_id = rccGetRealLanguage(ctx, language_id); + if ((language_id == (rcc_language_id)-1)||(!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn))) return NULL; /* (rcc_language_id)-1 must be rejected before it is used as an index, as the replaced code did */ + if (r_language_id) *r_language_id = language_id; return ctx->configs + language_id; } 
+rcc_language_config rccCheckConfig(rcc_context ctx, rcc_language_id language_id) { + rcc_language_config config; + + config = 
rccGetConfigPointer(ctx, language_id, NULL); + if ((config)&&(!config->charset)) return NULL; + + return config; +} + + +rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language_id) { + rcc_language_config config; + + config = rccGetConfigPointer(ctx, language_id, &language_id); + if (config) { + if (!rccCheckLanguageUsability(ctx, language_id)) return NULL; + if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL; + config->language = ctx->languages[language_id]; + } + + return config; +} + rcc_language_config rccGetConfig(rcc_context ctx, rcc_language_id language_id) { - language_id = rccGetRealLanguage(ctx, language_id); - if (language_id == (rcc_language_id)-1) return NULL; - if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL; - - if (!ctx->configs[language_id].charset) { - if (rccConfigInit(ctx->configs+language_id, ctx)) return NULL; - } + rcc_language_config config; - ctx->configs[language_id].language = ctx->languages[language_id]; - return ctx->configs + language_id; + config = rccGetConfigPointer(ctx, language_id, &language_id); + if (config) { + if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL; + config->language = ctx->languages[language_id]; + } + + return config; } rcc_language_config rccGetConfigByName(rcc_context ctx, const char *name) { @@ -261,6 +294,15 @@ rcc_language_config rccGetCurrentConfig(rcc_context ctx) { return rccGetConfig(ctx, language_id); } +rcc_speller rccConfigGetSpeller(rcc_language_config config) { + if (!config) return NULL; + + if (config->speller) return config->speller; + + config->speller = rccSpellerCreate(config->language->sn); + return config->speller; +} + rcc_engine_id rccConfigGetSelectedEngine(rcc_language_config config) { if (!config) return (rcc_engine_id)-1; @@ -532,6 +574,161 @@ int rccConfigConfigure(rcc_language_config config) { return 0; } + +rcc_string rccConfigSizedFrom(rcc_language_config config, rcc_class_id class_id, const char *buf, 
size_t len) { + rcc_context ctx; + rcc_string result; + rcc_option_value usedb4; + rcc_autocharset_id charset_id; + const char *charset; + + + if (!config) return NULL; + ctx = config->ctx; + + if (rccStringSizedCheck(buf, len)) return NULL; + + usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); + + if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { + result = rccDb4GetKey(ctx->db4ctx, buf, len); + if (result) { + if (rccStringFixID(result, ctx)) free(result); + else return result; + } + } + + charset_id = rccConfigDetectCharset(config, class_id, buf, len); + if (charset_id != (rcc_autocharset_id)-1) + charset = rccConfigGetAutoCharsetName(config, charset_id); + else + charset = rccConfigGetCurrentCharsetName(config, class_id); + + if (charset) { + result = rccSizedFromCharset(ctx, charset, buf, len); + if (result) rccStringChangeID(result, rccGetLanguageByName(ctx, config->language->sn)); + return result; + } + + return NULL; +} + +char *rccConfigSizedTo(rcc_language_config config, rcc_class_id class_id, rcc_const_string buf, size_t *rlen) { + rcc_context ctx; + const char *charset; + + if (!config) return NULL; + ctx = config->ctx; + + charset = rccConfigGetCurrentCharsetName(config, class_id); + + if (charset) + return rccSizedToCharset(ctx, charset, buf, rlen); + + return NULL; +} + + +char *rccConfigSizedRecode(rcc_language_config config, rcc_class_id from, rcc_class_id to, const char *buf, size_t len, size_t *rlen) { + rcc_context ctx; + rcc_string result; + rcc_option_value usedb4; + rcc_autocharset_id charset_id; + rcc_string stmp; + const char *tocharset, *fromcharset; + + + if (!config) return NULL; + ctx = config->ctx; + + if (rccStringSizedCheck(buf, len)) return NULL; + + usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); + + if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { + stmp = rccDb4GetKey(ctx->db4ctx, buf, len); + if (stmp) { + if (rccStringFixID(stmp, ctx)) free(stmp); + else { + result = rccConfigSizedTo(config, to, stmp, rlen); + free(stmp); + return 
result; + } + } + } + + charset_id = rccConfigDetectCharset(config, from, buf, len); + if (charset_id != (rcc_autocharset_id)-1) + fromcharset = rccConfigGetAutoCharsetName(config, charset_id); + else + fromcharset = rccConfigGetCurrentCharsetName(config, from); + + tocharset = rccConfigGetCurrentCharsetName(config, to); + + if ((fromcharset)&&(tocharset)) + return rccSizedRecodeCharsets(ctx, fromcharset, tocharset, buf, len, rlen); + + return NULL; + +} + + +char *rccConfigSizedRecodeToCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, rcc_const_string buf, size_t len, size_t *rlen) { + rcc_context ctx; + rcc_string result; + rcc_option_value usedb4; + rcc_autocharset_id charset_id; + rcc_string stmp; + const char *ocharset; + + + if (!config) return NULL; + ctx = config->ctx; + + if (rccStringSizedCheck(buf, len)) return NULL; + + usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); + + if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { + stmp = rccDb4GetKey(ctx->db4ctx, buf, len); + if (stmp) { + if (rccStringFixID(stmp, ctx)) free(stmp); + else { + result = rccSizedToCharset(ctx, charset, stmp, rlen); + free(stmp); + return result; + } + } + } + + charset_id = rccConfigDetectCharset(config, class_id, buf, len); + if (charset_id != (rcc_autocharset_id)-1) + ocharset = rccConfigGetAutoCharsetName(config, charset_id); + else + ocharset = rccConfigGetCurrentCharsetName(config, class_id); + + if (ocharset) + return rccSizedRecodeCharsets(ctx, ocharset, charset, buf, len, rlen); + + return NULL; +} + +char *rccConfigSizedRecodeFromCharset(rcc_language_config config, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) { + rcc_context ctx; + const char *ocharset; + + if (!config) return NULL; + ctx = config->ctx; + + ocharset = rccConfigGetCurrentCharsetName(config, class_id); + + if (ocharset) + return rccSizedRecodeCharsets(ctx, charset, ocharset, buf, len, rlen); + + return NULL; +} + + /* rcc_option_value 
options[RCC_MAX_OPTIONS]; diff --git a/src/lngconfig.h b/src/lngconfig.h index 92cc050..9d23139 100644 --- a/src/lngconfig.h +++ b/src/lngconfig.h @@ -3,6 +3,7 @@ #include "rcciconv.h" #include "rcctranslate.h" +#include "rccspell.h" struct rcc_language_config_t { rcc_context ctx; @@ -17,8 +18,10 @@ struct rcc_language_config_t { unsigned char configured; + rcc_speller speller; rcc_translate trans; rcc_language_id translang; + rcc_translate entrans; rcc_iconv fsiconv; }; @@ -30,9 +33,13 @@ rcc_engine_ptr rccConfigCheckEnginePointer(rcc_language_config config, rcc_engin rcc_engine_ptr rccConfigGetCurrentEnginePointer(rcc_language_config config); rcc_engine_ptr rccConfigCheckCurrentEnginePointer(rcc_language_config config); +rcc_speller rccConfigGetSpeller(rcc_language_config config); + int rccConfigInit(rcc_language_config config, rcc_context ctx); void rccConfigClear(rcc_language_config config); +rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language_id); + int rccConfigConfigure(rcc_language_config config); rcc_charset_id rccConfigGetLocaleUnicodeCharset(rcc_language_config config, const char *locale_variable); diff --git a/src/rccconfig.c b/src/rccconfig.c index ed6d30a..f820606 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -12,13 +12,18 @@ rcc_language_alias rcc_default_aliases[] = { { NULL, NULL} }; +const char rcc_default_language_sn[] = "default"; +const char rcc_disabled_language_sn[] = "Off"; +const char rcc_english_language_sn[] = "en"; +const char rcc_disabled_engine_sn[] = "Off"; const char rcc_default_charset[] = "Default"; + const char rcc_utf8_charset[] = "UTF-8"; const char rcc_engine_nonconfigured[] = "Default"; const char rcc_option_nonconfigured[] = "DEFAULT"; rcc_engine rcc_default_engine = { - "Off", NULL, NULL, NULL, {NULL} + rcc_disabled_engine_sn, NULL, NULL, NULL, {NULL} }; rcc_engine rcc_russian_engine = { @@ -32,11 +37,11 @@ rcc_engine rcc_ukrainian_engine = { rcc_language 
rcc_default_languages[RCC_MAX_LANGUAGES + 1]; rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { -{"default", {rcc_default_charset, NULL}, { +{rcc_default_language_sn, {rcc_default_charset, NULL}, { &rcc_default_engine, NULL }}, -{"off", {rcc_default_charset, NULL}, { +{rcc_disabled_language_sn, {rcc_default_charset, NULL}, { &rcc_default_engine, NULL }}, @@ -112,14 +117,28 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL }; rcc_option_value_name rcc_sn_learning[] = { "OFF", "ON", "RELEARN", "LEARN", NULL }; rcc_option_value_name rcc_sn_clo[] = { "ALL", "CONFIGURED_AND_AUTO", "CONFIGURED_ONLY", NULL }; +rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_ENGLISH", "FULL", NULL }; rcc_option_description rcc_option_descriptions[RCC_MAX_OPTIONS+1]; rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = { +#ifdef HAVE_DB_H {RCC_OPTION_LEARNING_MODE, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1 }, RCC_OPTION_TYPE_STANDARD, "LEARNING_MODE", rcc_sn_learning }, +#else + {RCC_OPTION_LEARNING_MODE, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1 }, RCC_OPTION_TYPE_INVISIBLE, "LEARNING_MODE", rcc_sn_learning }, +#endif /* HAVE_DB_H */ {RCC_OPTION_AUTODETECT_FS_NAMES, 1, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTODETECT_FS_NAMES", rcc_sn_boolean}, {RCC_OPTION_AUTODETECT_FS_TITLES, 1, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_INVISIBLE, "AUTODETECT_FS_TITLES", rcc_sn_boolean}, {RCC_OPTION_CONFIGURED_LANGUAGES_ONLY, 1, { RCC_OPTION_RANGE_TYPE_MENU, 0, 2, 1}, RCC_OPTION_TYPE_INVISIBLE, "CONFIGURED_LANGUAGES_ONLY", rcc_sn_clo}, - {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "TRANSLATE", rcc_sn_boolean }, +#ifdef HAVE_ASPELL + {RCC_OPTION_AUTODETECT_LANGUAGE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTODETECT_LANGUAGE", 
rcc_sn_boolean}, +#else + {RCC_OPTION_AUTODETECT_LANGUAGE, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_INVISIBLE, "AUTODETECT_LANGUAGE", rcc_sn_boolean}, +#endif +#ifdef HAVE_LIBTRANSLATE + {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_STANDARD, "TRANSLATE", rcc_sn_translate }, +#else + {RCC_OPTION_TRANSLATE, 0, { RCC_OPTION_RANGE_TYPE_MENU, 0, 3, 1}, RCC_OPTION_TYPE_INVISIBLE, "TRANSLATE", rcc_sn_translate }, +#endif /* HAVE_LIBTRANSLATE */ {RCC_OPTION_AUTOENGINE_SET_CURRENT, 0, { RCC_OPTION_RANGE_TYPE_BOOLEAN, 0, 0, 0}, RCC_OPTION_TYPE_STANDARD, "AUTOENGINE_SET_CURRENT", rcc_sn_boolean }, {RCC_MAX_OPTIONS} }; diff --git a/src/rccconfig.h b/src/rccconfig.h index b94a39b..8e794ba 100644 --- a/src/rccconfig.h +++ b/src/rccconfig.h @@ -6,6 +6,11 @@ #undef RCC_DEBUG #define RCC_LOCALE_VARIABLE "LC_CTYPE" +extern const char rcc_default_language_sn[]; +extern const char rcc_english_language_sn[]; +extern const char rcc_disabled_language_sn[]; +extern const char rcc_disabled_engine_sn[]; + extern rcc_language_alias rcc_default_aliases[]; extern const char rcc_default_charset[]; extern const char rcc_utf8_charset[]; diff --git a/src/rccexternal.c b/src/rccexternal.c index 16b3667..4a09948 100644 --- a/src/rccexternal.c +++ b/src/rccexternal.c @@ -153,7 +153,7 @@ int rccExternalConnect(unsigned char module) { fd_set fdcon; if (pid == (pid_t)-1) return -1; - + sock = socket(PF_UNIX, SOCK_STREAM, 0); if (sock<=0) return -1; diff --git a/src/rcciconv.c b/src/rcciconv.c index d9903de..93278a7 100644 --- a/src/rcciconv.c +++ b/src/rcciconv.c @@ -48,6 +48,11 @@ void rccIConvClose(rcc_iconv icnv) { } } +int rccIConvGetError(rcc_iconv icnv) { + if ((!icnv)||(icnv->icnv == (iconv_t)-1)) return -1; + return 0; +} + size_t rccIConvRecode(rcc_iconv icnv, char *outbuf, size_t outsize, const char *buf, size_t size) { char *in_buf, *out_buf, err; int in_left, out_left; diff --git a/src/rcciconv.h b/src/rcciconv.h index 
0070696..1520534 100644 --- a/src/rcciconv.h +++ b/src/rcciconv.h @@ -8,6 +8,8 @@ struct rcc_iconv_t { }; typedef struct rcc_iconv_t rcc_iconv_s; +int rccIConvGetError(rcc_iconv icnv); + size_t rccIConvInternal(rcc_context ctx, rcc_iconv icnv, const char *buf, size_t len); /** diff --git a/src/rccspell.c b/src/rccspell.c new file mode 100644 index 0000000..c54e267 --- /dev/null +++ b/src/rccspell.c @@ -0,0 +1,63 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "rccspell.h" + +rcc_speller rccSpellerCreate(const char *lang) { +#ifdef HAVE_ASPELL + rcc_speller rccspeller; + AspellSpeller *speller = NULL; + AspellConfig *config; + AspellCanHaveError *possible_err; + + if (!lang) return NULL; + + rccspeller = (rcc_speller)malloc(sizeof(rcc_speller_s)); + if (!rccspeller) return rccspeller; + + config = new_aspell_config(); + + if (config) { + if (aspell_config_replace(config, "encoding", "utf-8")&&aspell_config_replace(config, "master", lang)) { + possible_err = new_aspell_speller(config); + if (aspell_error_number(possible_err) == 0) { + speller = to_aspell_speller(possible_err); + } else delete_aspell_can_have_error(possible_err); /* release the error object when no speller was created, otherwise it leaks */ + } + delete_aspell_config(config); + } + + rccspeller->speller = speller; + return rccspeller; +#else + return NULL; +#endif /* HAVE_ASPELL */ +} + +void rccSpellerFree(rcc_speller rccspeller) { +#ifdef HAVE_ASPELL + if ((rccspeller)&&(rccspeller->speller)) + delete_aspell_speller(rccspeller->speller); + free(rccspeller); +#endif /* HAVE_ASPELL */ +} + +int rccSpellerGetError(rcc_speller rccspeller) { + if ((!rccspeller)||(!rccspeller->speller)) return -1; + return 0; +} + +int rccSpellerSized(rcc_speller speller, const char *word, size_t len) { +#ifdef HAVE_ASPELL + int res; + + if (rccSpellerGetError(speller)) return 0; + res = aspell_speller_check(speller->speller, word, len?len:-1); + return res<0?0:res; +#endif /* HAVE_ASPELL */ + return 0; +} + +int 
rccSpeller(rcc_speller speller, const char *word) { + return rccSpellerSized(speller, word, 0); +} diff --git a/src/rccspell.h 
b/src/rccspell.h new file mode 100644 index 0000000..49e39f4 --- /dev/null +++ b/src/rccspell.h @@ -0,0 +1,29 @@ +#ifndef _RCC_SPELL_H +#define _RCC_SPELL_H + +#include "../config.h" + +#ifdef HAVE_ASPELL +#include <aspell.h> +#endif /* HAVE_ASPELL */ + +struct rcc_speller_t { +#ifdef HAVE_ASPELL + struct AspellSpeller *speller; +#else + void *speller; +#endif /* HAVE_ASPELL */ +}; + +typedef struct rcc_speller_t *rcc_speller; +typedef struct rcc_speller_t rcc_speller_s; + +rcc_speller rccSpellerCreate(const char *lang); +void rccSpellerFree(rcc_speller speller); + +int rccSpellerGetError(rcc_speller rccspeller); + +int rccSpellerSized(rcc_speller speller, const char *word, size_t len); +int rccSpeller(rcc_speller speller, const char *word); + +#endif /* _RCC_SPELL_H */ diff --git a/src/rccstring.c b/src/rccstring.c index d6c6805..9c4c19f 100644 --- a/src/rccstring.c +++ b/src/rccstring.c @@ -58,6 +58,14 @@ int rccStringFixID(rcc_string string, rcc_context ctx) { return 0; } +int rccStringChangeID(rcc_string string, rcc_language_id language_id) { + if ((!string)||(language_id == (rcc_language_id)-1)) return -1; /* reject a NULL string or an invalid language id before touching the header */ + + ((rcc_string_header*)string)->language_id = language_id; + return 0; +} + + void rccStringFree(rcc_string str) { if (str) free(str); } diff --git a/src/rccstring.h b/src/rccstring.h index 3c5d8d7..e9e9734 100644 --- a/src/rccstring.h +++ b/src/rccstring.h @@ -16,6 +16,7 @@ void rccStringFree(rcc_string str); int rccStringSetLang(rcc_string string, const char *sn); int rccStringFixID(rcc_string string, rcc_context ctx); +int rccStringChangeID(rcc_string string, rcc_language_id language_id); #ifdef HAVE_STRNLEN # ifndef strnlen diff --git a/src/rcctranslate.c b/src/rcctranslate.c index 3bbd916..d7bb4e4 100644 --- a/src/rcctranslate.c +++ b/src/rcctranslate.c @@ -66,18 +66,22 @@ int rccTranslateSetTimeout(rcc_translate translate, unsigned long us) { char 
*rccTranslate(rcc_translate translate, const char *buf) { #ifdef HAVE_LIBTRANSLATE - size_t i; 
rcc_external_command_s resp; size_t err, len; char *buffer; - - if ((!translate)||(!buf)) return NULL; +/* + size_t i; +*/ + if ((!translate)||(!buf)) return NULL; + +/* if (!strcmp(translate->prefix.to, "en")) { for (i=0;buf[i];i++) if ((unsigned char)buf[i]>0x7F) break; if (!buf[i]) return NULL; } +*/ if (translate->sock == -1) { translate->sock = rccExternalConnect(RCC_EXTERNAL_MODULE_LIBRTRANSLATE); diff --git a/src/recode.c b/src/recode.c index c44095c..7e12343 100644 --- a/src/recode.c +++ b/src/recode.c @@ -2,6 +2,8 @@ #include <stdlib.h> #include <string.h> +#include "../config.h" + #include "internal.h" #include "rcciconv.h" #include "fs.h" @@ -10,19 +12,140 @@ #include "rccconfig.h" #include "rccdb4.h" #include "rcctranslate.h" +#include "rccspell.h" + +#define isSpace(ch) (((ch)<0x7F)&&(((ch)<'A')||((ch)>'z')||(((ch)>'Z')&&((ch)<'a')))) +#define RCC_REQUIRED_PROBABILITY 0.66 + +rcc_language_id rccDetectLanguageInternal(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len, rcc_string *retstring) { /* Guesses the language of buf. A hit in the learning database (DB4) is returned first; otherwise, if RCC_OPTION_AUTODETECT_LANGUAGE is set, buf is recoded with every usable language configuration and its words are spell-checked. Returns the language id, or (rcc_language_id)-1 when nothing matches. On success the recoded string is stored in *retstring when retstring is non-NULL and freed here otherwise. */ + rcc_speller speller; + unsigned long i, nlanguages; + rcc_language_config config, config0 = NULL; + rcc_string recoded; + unsigned char *utf8; + size_t j, mode; + unsigned long words, english, result; + unsigned char english_mode, english_word = 1; + rcc_language_id english_lang = (rcc_language_id)-1; + double res, english_res = 0; + rcc_option_value usedb4; + + + usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); + + if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { + recoded = rccDb4GetKey(ctx->db4ctx, buf, len); + if (recoded) { + if (rccStringFixID(recoded, ctx)) free(recoded); + else { + english_lang = rccStringGetLanguage(recoded); + if (retstring) *retstring = recoded; + else free(recoded); + return english_lang; + } + } + } + + if (!rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) return (rcc_language_id)-1; + + nlanguages = ctx->n_languages; + + for (i=0;i<nlanguages;i++) { + config = rccGetUsableConfig(ctx, (rcc_language_id)i); + if (!config) 
continue; + + if (i) { + if (config==config0) continue; + } else config0=config; + + speller = rccConfigGetSpeller(config); + if (rccSpellerGetError(speller)) continue; + + recoded = rccConfigSizedFrom(config, class_id, buf, len); + if (!recoded) continue; + + if (!strcasecmp(config->language->sn, rcc_english_language_sn)) english_mode = 1; + else english_mode = 0; + + utf8 = (char*)rccStringGetString(recoded); + for (result=0,english=0,words=0,mode=0,j=0;utf8[j];j++) { /* mode is 0 between words, otherwise the 1-based offset of the current word */ + if (isSpace(utf8[j])) { + if (mode) { + if ((!english_mode)&&(english_word)) english++; + result+=rccSpellerSized(speller, utf8 + mode - 1, j - mode + 1)?1:0; + words++; + mode = 0; + } else continue; + } else { + if (mode) { + if (utf8[j]>0x7F) english_word = 0; + } else { + mode = j + 1; + english_word = 1; + } + } + } + if (mode) { /* NOTE(review): the trailing word is never added to english, even when it is ASCII-only - confirm this is intended */ + result+=rccSpeller(speller, utf8 + mode - 1)?1:0; + words++; + } + + if (english_mode) { + english_res = words?1.*result/words:0; /* no words at all: keep 0 rather than computing 0/0 */ + english_lang = (rcc_language_id)i; + } else if (words) { + res = 1.*result/words; + if (res > RCC_REQUIRED_PROBABILITY) { + if (retstring) *retstring = recoded; + else free(recoded); + return (rcc_language_id)i; + } + /* result and english are unsigned: without the result >= english test the subtraction wraps around and the ratio falsely exceeds the threshold */ if ((words > english)&&(result >= english)) { + res = 1.*(result - english)/(words - english); + if (res > RCC_REQUIRED_PROBABILITY) { + if (retstring) *retstring = recoded; + else free(recoded); + return (rcc_language_id)i; + } + } + } + + free(recoded); + } + + if (english_res > RCC_REQUIRED_PROBABILITY) { + if (retstring) { + *retstring = rccCreateString(english_lang, buf, len); + } + return english_lang; + } + + return (rcc_language_id)-1; +} +rcc_language_id rccDetectLanguage(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { /* Public entry point: uses rcc_default_ctx when ctx is NULL and discards the recoded string. */ + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return (rcc_language_id)-1; + } + + return rccDetectLanguageInternal(ctx, class_id, buf, len, NULL); +} -static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, const char *buf, int len) { +rcc_autocharset_id 
rccConfigDetectCharset(rcc_language_config config, rcc_class_id class_id, const char *buf, size_t len) { /* Runs the configuration's current autodetection engine on buf. Returns the detected autocharset id, or (rcc_autocharset_id)-1 when buf or config is NULL, when no engine function is available, or when the class is RCC_CLASS_FS and RCC_OPTION_AUTODETECT_FS_TITLES is off. */ + rcc_context ctx; rcc_class_type class_type; rcc_engine_ptr engine; - if (!buf) return (rcc_autocharset_id)-1; + if ((!buf)||(!config)) return (rcc_autocharset_id)-1; + + ctx = config->ctx; class_type = rccGetClassType(ctx, class_id); if ((class_type != RCC_CLASS_FS)||((class_type == RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_AUTODETECT_FS_TITLES)))) { - engine = rccGetCurrentEnginePointer(ctx); + engine = rccConfigGetCurrentEnginePointer(config); if ((!engine)||(!engine->func)) return (rcc_autocharset_id)-1; return engine->func(&ctx->engine_ctx, buf, len); } @@ -30,16 +153,26 @@ static rcc_autocharset_id rccIConvAuto(rcc_context ctx, rcc_class_id class_id, c return (rcc_autocharset_id)-1; } +int rccDetectCharset(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { /* Context-level wrapper: falls back to rcc_default_ctx when ctx is NULL (returning -1 if there is none) and detects with ctx->current_config. */ + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return -1; + } + + return rccConfigDetectCharset(ctx->current_config, class_id, buf, len); +} + + rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, size_t len) { int err; size_t ret; - rcc_language_id language_id; + rcc_language_id language_id, detected_language_id; rcc_autocharset_id charset_id; rcc_iconv icnv = NULL; rcc_string result; rcc_option_value usedb4; const char *charset; - + if (!ctx) { if (rcc_default_ctx) ctx = rcc_default_ctx; else return NULL; @@ -52,10 +185,11 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, language_id = rccGetCurrentLanguage(ctx); if (language_id == (rcc_language_id)-1) return NULL; - if (!strcasecmp(ctx->languages[language_id]->sn, "off")) return NULL; + if (!strcasecmp(ctx->languages[language_id]->sn, rcc_disabled_language_sn)) return NULL; - usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); + usedb4 = rccGetOption(ctx, RCC_OPTION_LEARNING_MODE); +/* if (usedb4&RCC_OPTION_LEARNING_FLAG_USE) { result = 
rccDb4GetKey(ctx->db4ctx, buf, len); if (result) { @@ -63,11 +197,22 @@ rcc_string rccSizedFrom(rcc_context ctx, rcc_class_id class_id, const char *buf, else return result; } } + + if (rccGetOption(ctx, RCC_OPTION_AUTODETECT_LANGUAGE)) { + detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len); + if (detected_language_id != (rcc_language_id)-1) + language_id = detected_language_id; + } +*/ + + detected_language_id = rccDetectLanguageInternal(ctx, class_id, buf, len, &result); + if (detected_language_id != (rcc_language_id)-1) return result; + err = rccConfigure(ctx); if (err) return NULL; - charset_id = rccIConvAuto(ctx, class_id, buf, len); + charset_id = rccDetectCharset(ctx, class_id, buf, len); if (charset_id != (rcc_autocharset_id)-1) { icnv = ctx->iconv_auto[charset_id]; if (rccGetOption(ctx, RCC_OPTION_AUTOENGINE_SET_CURRENT)) { @@ -105,6 +250,9 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s rcc_language_id language_id; rcc_language_id current_language_id; rcc_class_type class_type; + rcc_option_value translate; + const char *langname; + unsigned char english_source; rcc_iconv icnv; if (!ctx) { @@ -127,33 +275,60 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s if (err) return NULL; class_type = rccGetClassType(ctx, class_id); - if ((class_type != RCC_CLASS_FS)&&(rccGetOption(ctx, RCC_OPTION_TRANSLATE))) { + translate = rccGetOption(ctx, RCC_OPTION_TRANSLATE); + + langname = rccGetLanguageName(ctx, language_id); + if (strcasecmp(langname, rcc_english_language_sn)) english_source = 0; + else english_source = 1; + + if ((class_type != RCC_CLASS_FS)&&((translate==RCC_OPTION_TRANSLATE_FULL)||((translate)&&(!english_source)))) { current_language_id = rccGetCurrentLanguage(ctx); if (current_language_id != language_id) { if ((config->trans)&&(config->translang != current_language_id)) { rccTranslateClose(config->trans); config->trans = NULL; } - if (!config->trans) { - 
config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id)); - config->translang = current_language_id; + + if (translate != RCC_OPTION_TRANSLATE_TO_ENGLISH) { + if (!config->trans) { + config->trans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rccGetLanguageName(ctx, current_language_id)); + config->translang = current_language_id; + } + + if (config->trans) { + translated = rccTranslate(config->trans, utfstring); + if (translated) { + language_id = current_language_id; + + config = rccGetConfig(ctx, language_id); + if (!config) { + free(translated); + return NULL; + } + + err = rccConfigConfigure(config); + if (err) { + free(translated); + return NULL; + } + } + } } - if (config->trans) { - translated = rccTranslate(config->trans, utfstring); - if (translated) { - language_id = current_language_id; + + if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||((config->trans)&&(!translated))) { + /* Fall back to an English translation when it is requested explicitly or the direct translation produced nothing. */ + if (!config->entrans) { + config->entrans = rccTranslateOpen(rccGetLanguageName(ctx, language_id), rcc_english_language_sn); + } + if (config->entrans) { + translated = rccTranslate(config->entrans, utfstring); + config = rccGetConfig(ctx, language_id); - if (!config) { - free(translated); - return NULL; - } + if (!config) return translated; err = rccConfigConfigure(config); - if (err) { - free(translated); - return NULL; - } + if (err) return translated; } } } @@ -183,7 +358,7 @@ char *rccSizedTo(rcc_context ctx, rcc_class_id class_id, rcc_const_string buf, s icnv = config->iconv_to[class_id]; if (icnv) { - newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, newlen); + newlen = rccIConvInternal(ctx, icnv, translated?translated:utfstring, translated?0:newlen); if (translated) free(translated); if (newlen == (size_t)-1) return NULL; @@ -237,7 +412,7 @@ char *rccSizedRecode(rcc_context ctx, rcc_class_id from, rcc_class_id to, const err = rccConfigure(ctx); if (err) return 
NULL; - from_charset_id = rccIConvAuto(ctx, from, buf, len); + from_charset_id = rccDetectCharset(ctx, from, buf, len); if (from_charset_id != (rcc_charset_id)-1) { from_charset = rccGetAutoCharsetName(ctx, from_charset_id); to_charset = rccGetCurrentCharsetName(ctx, to); @@ -385,13 +560,15 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char return extracted; } -/* Convert to class_id from Charset */ +/* Convert to class_id from Charset. +The caller is expected to know both the charset and the language of the +incoming string, so no language detection (DB4, Aspell) is performed +for it. */ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const char *charset, const char *buf, size_t len, size_t *rlen) { size_t res; rcc_iconv icnv; rcc_string str; char *extracted; - if (!charset) return NULL; |