From 35381569403e90b8d34b223f524519521bc81598 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Wed, 27 Jun 2007 09:28:22 +0000 Subject: Engines rework - LibGuess support for far east language autodetection - Support for LibRCD 0.1.9 supporting ISO-8859-1 strings - Fixing wrong encodings names returned by Enca - Engine plugins naming scheme is altered - New API functions: rccEngineGetInfo, rccEngineGetAutoCharsetByName - Most of languages are no more hardcoded, but moved to the configuration - RCD engine is added to Belarusian language (I hope it should work) - Some encoding names are fixed in configuration - Support for external libiconv - Support for libcharset - Find UI interface language from LC_MESSAGES locale - Simple compilation fix (Thanx to D. Panov) --- src/Makefile.am | 2 +- src/engine.c | 27 +++++++++++++++++++++++++++ src/engine.h | 1 + src/librcc.h | 3 +++ src/lngconfig.c | 2 +- src/plugin.c | 8 ++++---- src/rccconfig.c | 25 ++++++++++++++----------- src/rccenca.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-- src/rccenca.h | 8 ++++++++ src/rccexternal.h | 3 +++ src/rcclocale.c | 9 ++++++++- src/recode.c | 22 +++++++++++++++++++++- 12 files changed, 137 insertions(+), 21 deletions(-) (limited to 'src') diff --git a/src/Makefile.am b/src/Makefile.am index 79976c6..42c5966 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -28,6 +28,6 @@ librcc_la_SOURCES = librcc.c \ include_HEADERS = librcc.h AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ @ASPELL_CFLAGS@ @PTHREAD_CFLAGS@ -librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@ +librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@ @EXTRA_LIBS@ librcc_la_LDFLAGS = -version-info @LIBRCC_VERSION_INFO@ diff --git a/src/engine.c b/src/engine.c index f9c2284..3d3e023 100644 --- a/src/engine.c +++ b/src/engine.c @@ -125,6 +125,7 @@ int rccEngineConfigure(rcc_engine_context ctx) { engine = ctx->config->language->engines[engine_id]; + ctx->id = engine_id; ctx->free_func = engine->free_func; ctx->func = engine->func; @@ -134,6 +135,30 @@ int rccEngineConfigure(rcc_engine_context ctx) { return 0; } + +rcc_engine *rccEngineGetInfo(rcc_engine_context ctx) { + if (!ctx) return NULL; + return ctx->config->language->engines[ctx->id]; +} + +rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name) { + unsigned int i; + rcc_engine *info; + rcc_charset *charsets; + + if ((!ctx)||(!name)) return (rcc_autocharset_id)-1; + + info = rccEngineGetInfo(ctx); + if (info) { + charsets = info->charsets; + + for (i=0;charsets[i];i++) + if (!strcasecmp(charsets[i],name)) return (rcc_autocharset_id)i; + } + + return (rcc_autocharset_id)-1; +} + rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx) { if (!ctx) return NULL; @@ -186,6 +211,8 @@ static int CheckWestern(const unsigned char *buf, int len) { rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) { rcc_autocharset_id utf; + /* DS: This should be done directly in autoengines, otherwise we will + fail to detect 7bit encodings */ if (CheckWestern(buf, len)) { utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8"); if (utf != (rcc_autocharset_id)-1) return utf; diff --git a/src/engine.h b/src/engine.h index 96e6db6..3213f2b 100644 --- a/src/engine.h +++ b/src/engine.h @@ -26,6 +26,7 @@ struct rcc_engine_context_t { rcc_engine_function func; rcc_engine_free_function free_func; + rcc_engine_id id; rcc_engine_internal internal; }; typedef struct rcc_engine_context_t rcc_engine_context_s; diff --git a/src/librcc.h b/src/librcc.h index e5749cd..88cc802 100644 --- a/src/librcc.h +++ b/src/librcc.h @@ -1481,6 +1481,9 @@ typedef rcc_engine *(*rcc_plugin_engine_info_function)(const char *lang); rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx); rcc_language *rccEngineGetLanguage(rcc_engine_context ctx); rcc_context rccEngineGetRccContext(rcc_engine_context ctx); +rcc_engine *rccEngineGetInfo(rcc_engine_context ctx); +rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name); + /******************************************************************************* **************************** Configuration ************************************* diff --git a/src/lngconfig.c b/src/lngconfig.c index 67e05c6..670d97f 100644 --- a/src/lngconfig.c +++ b/src/lngconfig.c @@ -405,7 +405,7 @@ rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language rcc_language_config rccGetConfig(rcc_context ctx, rcc_language_id language_id) { rcc_language_config config; - + config = rccGetConfigPointer(ctx, language_id, &language_id); if (config) { if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL; diff --git a/src/plugin.c b/src/plugin.c index 38337fb..c53726f 100644 --- a/src/plugin.c +++ b/src/plugin.c @@ -121,13 +121,13 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) { switch (type) { case RCC_PLUGIN_TYPE_ENGINE: - pluginfn = (char*)malloc((32 + strlen(rcc_home_dir) + strlen(name))*sizeof(char)); + pluginfn = (char*)malloc((48 + strlen(rcc_home_dir) + strlen(name))*sizeof(char)); if (!pluginfn) return NULL; - sprintf(pluginfn, "%s/.rcc/engines/lib%s.so", rcc_home_dir, name); + sprintf(pluginfn, "%s/.rcc/engines/%s_engine.so", rcc_home_dir, name); res = rccLibraryOpen(pluginfn); if (!res) { - sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/lib%s.so", name); + sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/%s_engine.so", name); res = rccLibraryOpen(pluginfn); } free(pluginfn); @@ -156,7 +156,7 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) { rcc_engine *rccPluginEngineGetInfo(const char *name, const char *language) { rcc_plugin_handle handle; rcc_plugin_engine_info_function infofunc; - + handle = rccPluginLoad(RCC_PLUGIN_TYPE_ENGINE, name); if (!handle) return NULL; diff --git a/src/rccconfig.c b/src/rccconfig.c index 6723825..d5546c7 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -10,8 +10,8 @@ rcc_language_alias rcc_default_aliases[RCC_MAX_ALIASES + 1]; rcc_language_alias rcc_default_aliases_embeded[RCC_MAX_ALIASES + 1] = { - { "cs_SK", "sk" }, - { "ru_UA", "uk" }, +/* { "cs_SK", "sk" }, + { "ru_UA", "uk" },*/ { NULL, NULL } }; @@ -45,11 +45,15 @@ rcc_engine rcc_default_engine = { }; rcc_engine rcc_russian_engine = { - "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", NULL} + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", "ISO8859-1", NULL} }; rcc_engine rcc_ukrainian_engine = { - "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", NULL} + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", "ISO8859-1", NULL} +}; + +rcc_engine rcc_belarussian_engine = { + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","ISO-IR-111","UTF-8","IBM865", "ISO8859-1", NULL} }; rcc_language rcc_default_languages[RCC_MAX_LANGUAGES + 1]; @@ -81,11 +85,14 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { #endif /* RCC_RCD_SUPPORT */ NULL }}, -{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "KOI8-UNI", "maccyr" "IBM855", NULL},{ +{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "ISO-IR-111", "ISO-IR-111", "MACCYRILLIC" "IBM855", NULL},{ &rcc_default_engine, +#ifdef RCC_RCD_SUPPORT + &rcc_ukrainian_engine, +#endif /* RCC_RCD_SUPPORT */ NULL }}, -{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{ +/*{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{ &rcc_default_engine, NULL }}, @@ -124,11 +131,7 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { {"sl", {rcc_default_charset, rcc_utf8_charset, "ISO-8859-2", "CP1250", "IBM852", "macce", "CORK", NULL},{ &rcc_default_engine, NULL -}}, -{"zh", {rcc_default_charset, rcc_utf8_charset, "GB2312", "GBK", "GB18030", "BIG5", NULL},{ - &rcc_default_engine, - NULL -}}, +}},*/ {NULL} }; rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL }; diff --git a/src/rccenca.c b/src/rccenca.c index 28d3ccf..e46847e 100644 --- a/src/rccenca.c +++ b/src/rccenca.c @@ -20,6 +20,41 @@ static rcc_library_handle enca_handle = NULL; #endif /* RCC_ENCA_DYNAMIC */ static rcc_engine *enca_engines = NULL; + +/* CORK, KEYBCS2 is missing */ +rcc_enca_corrections rcc_enca_missing_corrections[] = { + { "be", "KOI8-UNI", "ISO-IR-111" }, + { NULL, "macce", "MACCENTRALEUROPE" }, + { "zh", "HZ", "HZ" }, + { "sk", "KOI-8_CS_2", "CSKOI8R" }, + { NULL, NULL, NULL } +}; + +rcc_enca_corrections rcc_enca_error_corrections[] = { + { NULL, "ECMA-cyrillic", "ISO-IR-111" }, + { NULL, NULL, NULL } +}; + + +static const char *rccEncaGetCorrection(const char *lang, const char *charset) { + int i; + for (i=0;rcc_enca_error_corrections[i].enca_charset;i++) { + if (((!rcc_enca_error_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_error_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_error_corrections[i].enca_charset))) + return rcc_enca_error_corrections[i].iconv_charset; + } + return charset; +} + +static const char *rccEncaGetMissing(const char *lang, const char *charset) { + int i; + for (i=0;rcc_enca_missing_corrections[i].enca_charset;i++) { + if (((!rcc_enca_missing_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_missing_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_missing_corrections[i].enca_charset))) + return rcc_enca_missing_corrections[i].iconv_charset; + } + return charset; +} + + rcc_engine_internal rccEncaInitContext(rcc_engine_context ctx) { #ifdef RCC_ENCA_SUPPORT EncaAnalyser enca; @@ -65,7 +100,12 @@ rcc_autocharset_id rccEnca(rcc_engine_context ctx, const char *buf, int len) { if (ee.charset<0) return (rcc_charset_id)-1; charset = enca_charset_name(ee.charset, ENCA_NAME_STYLE_ICONV); - return rccGetAutoCharsetByName(ctx->config->ctx, charset); + if (charset) { + charset = rccEncaGetCorrection(rccEngineGetLanguage(ctx)->sn, charset); + } else { + charset = rccEncaGetMissing(rccEngineGetLanguage(ctx)->sn, enca_charset_name(ee.charset, ENCA_NAME_STYLE_ENCA)); + } + return rccEngineGetAutoCharsetByName(ctx, charset); #else /* RCC_ENCA_SUPPORT */ return (rcc_charset_id)-1; #endif /* RCC_ENCA_SUPPORT */ @@ -160,7 +200,11 @@ int rccEncaInit() { for (l=0;l +#endif /* HAVE_SYS_TYPES_H */ typedef enum rcc_external_module_t { RCC_EXTERNAL_MODULE_CONTROL = 0, diff --git a/src/rcclocale.c b/src/rcclocale.c index 99d2b8f..9869a72 100644 --- a/src/rcclocale.c +++ b/src/rcclocale.c @@ -5,10 +5,14 @@ #include "../config.h" +#ifdef HAVE_LIBCHARSET +# include +#endif /* HAVE_LIBCHARSET */ #ifdef HAVE_CODESET # include #endif + #include "rccconfig.h" int rccLocaleGetClassByName(const char *locale) { @@ -80,9 +84,12 @@ int rccLocaleGetCharset(char *result, const char *lv, unsigned int n) { if (locale_class == LC_CTYPE) { l = getenv("CHARSET"); +#ifdef HAVE_LIBCHARSET + if (!l) l = locale_charset(); +#endif /* HAVE_LIBCHARSET */ #ifdef HAVE_CODESET if (!l) l = nl_langinfo(CODESET); -#endif +#endif /* HAVE_CODESET */ if (l) { if (strlen(l)>=n) return -1; strcpy(result, l); diff --git a/src/recode.c b/src/recode.c index e1e8e81..1d98306 100644 --- a/src/recode.c +++ b/src/recode.c @@ -742,7 +742,12 @@ rcc_string rccSizedFromCharset(rcc_context ctx, const char *charset, const char rcc_string ret; if ((!buf)||(!charset)) return NULL; - + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } + config = rccGetCurrentConfig(ctx); if (!config) return NULL; @@ -768,6 +773,11 @@ char *rccSizedToCharset(rcc_context ctx, const char *charset, rcc_const_string b if ((!buf)||(!charset)) return NULL; + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } + res = rccStringCheck(buf); if (!res) return NULL; @@ -799,6 +809,11 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char char *utf8, *extracted; if (!charset) return NULL; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } utf8 = rccSizedFrom(ctx, class_id, buf, len); if (!utf8) return utf8; @@ -839,6 +854,11 @@ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const ch char *extracted; if (!charset) return NULL; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } icnv = rccIConvOpen("UTF-8", charset); if (icnv) { -- cgit v1.2.3