summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@dside.dyndns.org>2007-06-27 09:28:22 +0000
committerSuren A. Chilingaryan <csa@dside.dyndns.org>2007-06-27 09:28:22 +0000
commit35381569403e90b8d34b223f524519521bc81598 (patch)
tree924527ab503a59400cfd96859e101ce8234eabd0 /src
parentfd502754926131e3562a2210ff81af111ccaf867 (diff)
downloadlibrcc-35381569403e90b8d34b223f524519521bc81598.tar.gz
librcc-35381569403e90b8d34b223f524519521bc81598.tar.bz2
librcc-35381569403e90b8d34b223f524519521bc81598.tar.xz
librcc-35381569403e90b8d34b223f524519521bc81598.zip
Engines rework
- LibGuess support for Far East language autodetection
- Support for LibRCD 0.1.9, which supports ISO-8859-1 strings
- Fix wrong encoding names returned by Enca
- Engine plugin naming scheme is altered
- New API functions: rccEngineGetInfo, rccEngineGetAutoCharsetByName
- Most languages are no longer hardcoded, but moved to the configuration
- RCD engine is added for the Belarusian language (I hope it works)
- Some encoding names are fixed in the configuration
- Support for external libiconv
- Support for libcharset
- Find the UI interface language from the LC_MESSAGES locale
- Simple compilation fix (thanks to D. Panov)
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am2
-rw-r--r--src/engine.c27
-rw-r--r--src/engine.h1
-rw-r--r--src/librcc.h3
-rw-r--r--src/lngconfig.c2
-rw-r--r--src/plugin.c8
-rw-r--r--src/rccconfig.c25
-rw-r--r--src/rccenca.c48
-rw-r--r--src/rccenca.h8
-rw-r--r--src/rccexternal.h3
-rw-r--r--src/rcclocale.c9
-rw-r--r--src/recode.c22
12 files changed, 137 insertions, 21 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 79976c6..42c5966 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -28,6 +28,6 @@ librcc_la_SOURCES = librcc.c \
include_HEADERS = librcc.h
AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ @ASPELL_CFLAGS@ @PTHREAD_CFLAGS@
-librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@
+librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@ @EXTRA_LIBS@
librcc_la_LDFLAGS = -version-info @LIBRCC_VERSION_INFO@
diff --git a/src/engine.c b/src/engine.c
index f9c2284..3d3e023 100644
--- a/src/engine.c
+++ b/src/engine.c
@@ -125,6 +125,7 @@ int rccEngineConfigure(rcc_engine_context ctx) {
engine = ctx->config->language->engines[engine_id];
+ ctx->id = engine_id;
ctx->free_func = engine->free_func;
ctx->func = engine->func;
@@ -134,6 +135,30 @@ int rccEngineConfigure(rcc_engine_context ctx) {
return 0;
}
+/*
+ * Returns the engine descriptor selected for the given engine context:
+ * the entry at index ctx->id in the engines table of the context's
+ * language (ctx->id is stored by rccEngineConfigure).
+ * Returns NULL when ctx is NULL.
+ */
+rcc_engine *rccEngineGetInfo(rcc_engine_context ctx) {
+ if (!ctx) return NULL;
+ return ctx->config->language->engines[ctx->id]; /* NOTE(review): ctx->config and its language are dereferenced without a NULL check */
+}
+/*
+ * Looks up a charset name in the charset list of the engine bound to ctx.
+ * The comparison is case-insensitive (strcasecmp) and the list is scanned
+ * until its first empty (NULL) entry.
+ * Returns the index of the matching entry as an rcc_autocharset_id, or
+ * (rcc_autocharset_id)-1 when ctx or name is NULL, when no engine
+ * descriptor is available, or when the name is not in the list.
+ */
+rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name) {
+ unsigned int i;
+ rcc_engine *info;
+ rcc_charset *charsets;
+
+ if ((!ctx)||(!name)) return (rcc_autocharset_id)-1;
+
+ info = rccEngineGetInfo(ctx);
+ if (info) {
+ charsets = info->charsets;
+
+ for (i=0;charsets[i];i++) /* stops at the terminating NULL entry */
+ if (!strcasecmp(charsets[i],name)) return (rcc_autocharset_id)i;
+ }
+
+ return (rcc_autocharset_id)-1; /* not found */
+}
+
rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx) {
if (!ctx) return NULL;
@@ -186,6 +211,8 @@ static int CheckWestern(const unsigned char *buf, int len) {
rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) {
rcc_autocharset_id utf;
+ /* DS: This should be done directly in autoengines, otherwise we will
+ fail to detect 7bit encodings */
if (CheckWestern(buf, len)) {
utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8");
if (utf != (rcc_autocharset_id)-1) return utf;
diff --git a/src/engine.h b/src/engine.h
index 96e6db6..3213f2b 100644
--- a/src/engine.h
+++ b/src/engine.h
@@ -26,6 +26,7 @@ struct rcc_engine_context_t {
rcc_engine_function func;
rcc_engine_free_function free_func;
+ rcc_engine_id id;
rcc_engine_internal internal;
};
typedef struct rcc_engine_context_t rcc_engine_context_s;
diff --git a/src/librcc.h b/src/librcc.h
index e5749cd..88cc802 100644
--- a/src/librcc.h
+++ b/src/librcc.h
@@ -1481,6 +1481,9 @@ typedef rcc_engine *(*rcc_plugin_engine_info_function)(const char *lang);
rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx);
rcc_language *rccEngineGetLanguage(rcc_engine_context ctx);
rcc_context rccEngineGetRccContext(rcc_engine_context ctx);
+rcc_engine *rccEngineGetInfo(rcc_engine_context ctx);
+rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name);
+
/*******************************************************************************
**************************** Configuration *************************************
diff --git a/src/lngconfig.c b/src/lngconfig.c
index 67e05c6..670d97f 100644
--- a/src/lngconfig.c
+++ b/src/lngconfig.c
@@ -405,7 +405,7 @@ rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language
rcc_language_config rccGetConfig(rcc_context ctx, rcc_language_id language_id) {
rcc_language_config config;
-
+
config = rccGetConfigPointer(ctx, language_id, &language_id);
if (config) {
if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL;
diff --git a/src/plugin.c b/src/plugin.c
index 38337fb..c53726f 100644
--- a/src/plugin.c
+++ b/src/plugin.c
@@ -121,13 +121,13 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) {
switch (type) {
case RCC_PLUGIN_TYPE_ENGINE:
- pluginfn = (char*)malloc((32 + strlen(rcc_home_dir) + strlen(name))*sizeof(char));
+ pluginfn = (char*)malloc((48 + strlen(rcc_home_dir) + strlen(name))*sizeof(char));
if (!pluginfn) return NULL;
- sprintf(pluginfn, "%s/.rcc/engines/lib%s.so", rcc_home_dir, name);
+ sprintf(pluginfn, "%s/.rcc/engines/%s_engine.so", rcc_home_dir, name);
res = rccLibraryOpen(pluginfn);
if (!res) {
- sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/lib%s.so", name);
+ sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/%s_engine.so", name);
res = rccLibraryOpen(pluginfn);
}
free(pluginfn);
@@ -156,7 +156,7 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) {
rcc_engine *rccPluginEngineGetInfo(const char *name, const char *language) {
rcc_plugin_handle handle;
rcc_plugin_engine_info_function infofunc;
-
+
handle = rccPluginLoad(RCC_PLUGIN_TYPE_ENGINE, name);
if (!handle) return NULL;
diff --git a/src/rccconfig.c b/src/rccconfig.c
index 6723825..d5546c7 100644
--- a/src/rccconfig.c
+++ b/src/rccconfig.c
@@ -10,8 +10,8 @@
rcc_language_alias rcc_default_aliases[RCC_MAX_ALIASES + 1];
rcc_language_alias rcc_default_aliases_embeded[RCC_MAX_ALIASES + 1] = {
- { "cs_SK", "sk" },
- { "ru_UA", "uk" },
+/* { "cs_SK", "sk" },
+ { "ru_UA", "uk" },*/
{ NULL, NULL }
};
@@ -45,11 +45,15 @@ rcc_engine rcc_default_engine = {
};
rcc_engine rcc_russian_engine = {
- "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", NULL}
+ "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", "ISO8859-1", NULL}
};
rcc_engine rcc_ukrainian_engine = {
- "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", NULL}
+ "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", "ISO8859-1", NULL}
+};
+
+rcc_engine rcc_belarussian_engine = {
+ "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","ISO-IR-111","UTF-8","IBM865", "ISO8859-1", NULL}
};
rcc_language rcc_default_languages[RCC_MAX_LANGUAGES + 1];
@@ -81,11 +85,14 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
#endif /* RCC_RCD_SUPPORT */
NULL
}},
-{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "KOI8-UNI", "maccyr" "IBM855", NULL},{
+{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "ISO-IR-111", "ISO-IR-111", "MACCYRILLIC" "IBM855", NULL},{
&rcc_default_engine,
+#ifdef RCC_RCD_SUPPORT
+ &rcc_ukrainian_engine,
+#endif /* RCC_RCD_SUPPORT */
NULL
}},
-{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{
+/*{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{
&rcc_default_engine,
NULL
}},
@@ -124,11 +131,7 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
{"sl", {rcc_default_charset, rcc_utf8_charset, "ISO-8859-2", "CP1250", "IBM852", "macce", "CORK", NULL},{
&rcc_default_engine,
NULL
-}},
-{"zh", {rcc_default_charset, rcc_utf8_charset, "GB2312", "GBK", "GB18030", "BIG5", NULL},{
- &rcc_default_engine,
- NULL
-}},
+}},*/
{NULL}
};
rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL };
diff --git a/src/rccenca.c b/src/rccenca.c
index 28d3ccf..e46847e 100644
--- a/src/rccenca.c
+++ b/src/rccenca.c
@@ -20,6 +20,41 @@ static rcc_library_handle enca_handle = NULL;
#endif /* RCC_ENCA_DYNAMIC */
static rcc_engine *enca_engines = NULL;
+/*
+ * Replacement names for charsets that Enca reports only under its own
+ * (ENCA_NAME_STYLE_ENCA) name, i.e. when the ICONV-style name is NULL.
+ * Each entry is { lang, enca_charset, iconv_charset }; a NULL lang matches
+ * any language, and the entry with a NULL enca_charset terminates the table.
+ * Read by rccEncaGetMissing().
+ */
+/* CORK, KEYBCS2 is missing */
+rcc_enca_corrections rcc_enca_missing_corrections[] = {
+ { "be", "KOI8-UNI", "ISO-IR-111" },
+ { NULL, "macce", "MACCENTRALEUROPE" },
+ { "zh", "HZ", "HZ" },
+ { "sk", "KOI-8_CS_2", "CSKOI8R" },
+ { NULL, NULL, NULL }
+};
+/*
+ * Replacement names for ICONV-style charset names returned by Enca that
+ * have to be substituted. Same entry layout and terminator as the table
+ * above. Read by rccEncaGetCorrection().
+ */
+rcc_enca_corrections rcc_enca_error_corrections[] = {
+ { NULL, "ECMA-cyrillic", "ISO-IR-111" },
+ { NULL, NULL, NULL }
+};
+
+/*
+ * Maps a charset name through rcc_enca_error_corrections.
+ * An entry matches when its lang is NULL or equals the given lang
+ * (a NULL lang argument only matches entries whose lang is NULL) and its
+ * enca_charset equals charset; both comparisons use case-sensitive strcmp.
+ * Returns the iconv_charset of the first matching entry, otherwise the
+ * charset argument unchanged.
+ * NOTE(review): charset is passed to strcmp unchecked, so it must not be NULL.
+ */
+static const char *rccEncaGetCorrection(const char *lang, const char *charset) {
+ int i;
+ for (i=0;rcc_enca_error_corrections[i].enca_charset;i++) { /* table ends at a NULL enca_charset */
+ if (((!rcc_enca_error_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_error_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_error_corrections[i].enca_charset)))
+ return rcc_enca_error_corrections[i].iconv_charset;
+ }
+ return charset; /* no correction registered */
+}
+/*
+ * Maps a charset name through rcc_enca_missing_corrections.
+ * An entry matches when its lang is NULL or equals the given lang
+ * (a NULL lang argument only matches entries whose lang is NULL) and its
+ * enca_charset equals charset; both comparisons use case-sensitive strcmp.
+ * Returns the iconv_charset of the first matching entry, otherwise the
+ * charset argument unchanged.
+ * NOTE(review): charset is passed to strcmp unchecked, so it must not be NULL.
+ */
+static const char *rccEncaGetMissing(const char *lang, const char *charset) {
+ int i;
+ for (i=0;rcc_enca_missing_corrections[i].enca_charset;i++) { /* table ends at a NULL enca_charset */
+ if (((!rcc_enca_missing_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_missing_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_missing_corrections[i].enca_charset)))
+ return rcc_enca_missing_corrections[i].iconv_charset;
+ }
+ return charset; /* no replacement registered */
+}
+
+
rcc_engine_internal rccEncaInitContext(rcc_engine_context ctx) {
#ifdef RCC_ENCA_SUPPORT
EncaAnalyser enca;
@@ -65,7 +100,12 @@ rcc_autocharset_id rccEnca(rcc_engine_context ctx, const char *buf, int len) {
if (ee.charset<0) return (rcc_charset_id)-1;
charset = enca_charset_name(ee.charset, ENCA_NAME_STYLE_ICONV);
- return rccGetAutoCharsetByName(ctx->config->ctx, charset);
+ if (charset) {
+ charset = rccEncaGetCorrection(rccEngineGetLanguage(ctx)->sn, charset);
+ } else {
+ charset = rccEncaGetMissing(rccEngineGetLanguage(ctx)->sn, enca_charset_name(ee.charset, ENCA_NAME_STYLE_ENCA));
+ }
+ return rccEngineGetAutoCharsetByName(ctx, charset);
#else /* RCC_ENCA_SUPPORT */
return (rcc_charset_id)-1;
#endif /* RCC_ENCA_SUPPORT */
@@ -160,7 +200,11 @@ int rccEncaInit() {
for (l=0;l<n_charsets;l++) {
// Enca bug, STYLE_ICONV return's a lot of NULL's
charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ICONV);
- if (!charset) charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ENCA);
+ if (charset) {
+ charset = rccEncaGetCorrection(rcc_default_languages[i].sn, charset);
+ } else {
+ charset = rccEncaGetMissing(rcc_default_languages[i].sn, enca_charset_name(charsets[l], ENCA_NAME_STYLE_ENCA));
+ }
enca_engines[i].charsets[k++] = charset;
}
enca_engines[j].charsets[k] = NULL;
diff --git a/src/rccenca.h b/src/rccenca.h
index 2f2c487..308b8fb 100644
--- a/src/rccenca.h
+++ b/src/rccenca.h
@@ -20,6 +20,14 @@
# define RCC_ENCA_SUPPORT
#endif
+struct rcc_enca_corrections_t { /* one rule replacing a charset name reported by Enca */
+ char *lang; /* language the rule applies to (compared with the language's sn); NULL applies to every language */
+ const char *enca_charset; /* charset name as obtained from enca_charset_name(); NULL marks the end of a table */
+ const char *iconv_charset; /* name returned in its place by the lookup helpers in rccenca.c */
+};
+typedef struct rcc_enca_corrections_t rcc_enca_corrections;
+
+
int rccEncaInit();
void rccEncaFree();
diff --git a/src/rccexternal.h b/src/rccexternal.h
index 181a6ec..fe7052f 100644
--- a/src/rccexternal.h
+++ b/src/rccexternal.h
@@ -2,6 +2,9 @@
#define _RCC_EXTERNAL_H
#include "../config.h"
+#ifdef HAVE_SYS_TYPES_H
+# include <sys/types.h>
+#endif /* HAVE_SYS_TYPES_H */
typedef enum rcc_external_module_t {
RCC_EXTERNAL_MODULE_CONTROL = 0,
diff --git a/src/rcclocale.c b/src/rcclocale.c
index 99d2b8f..9869a72 100644
--- a/src/rcclocale.c
+++ b/src/rcclocale.c
@@ -5,10 +5,14 @@
#include "../config.h"
+#ifdef HAVE_LIBCHARSET
+# include <libcharset.h>
+#endif /* HAVE_LIBCHARSET */
#ifdef HAVE_CODESET
# include <langinfo.h>
#endif
+
#include "rccconfig.h"
int rccLocaleGetClassByName(const char *locale) {
@@ -80,9 +84,12 @@ int rccLocaleGetCharset(char *result, const char *lv, unsigned int n) {
if (locale_class == LC_CTYPE) {
l = getenv("CHARSET");
+#ifdef HAVE_LIBCHARSET
+ if (!l) l = locale_charset();
+#endif /* HAVE_LIBCHARSET */
#ifdef HAVE_CODESET
if (!l) l = nl_langinfo(CODESET);
-#endif
+#endif /* HAVE_CODESET */
if (l) {
if (strlen(l)>=n) return -1;
strcpy(result, l);
diff --git a/src/recode.c b/src/recode.c
index e1e8e81..1d98306 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -742,7 +742,12 @@ rcc_string rccSizedFromCharset(rcc_context ctx, const char *charset, const char
rcc_string ret;
if ((!buf)||(!charset)) return NULL;
-
+
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return NULL;
+ }
+
config = rccGetCurrentConfig(ctx);
if (!config) return NULL;
@@ -768,6 +773,11 @@ char *rccSizedToCharset(rcc_context ctx, const char *charset, rcc_const_string b
if ((!buf)||(!charset)) return NULL;
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return NULL;
+ }
+
res = rccStringCheck(buf);
if (!res) return NULL;
@@ -799,6 +809,11 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char
char *utf8, *extracted;
if (!charset) return NULL;
+
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return NULL;
+ }
utf8 = rccSizedFrom(ctx, class_id, buf, len);
if (!utf8) return utf8;
@@ -839,6 +854,11 @@ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const ch
char *extracted;
if (!charset) return NULL;
+
+ if (!ctx) {
+ if (rcc_default_ctx) ctx = rcc_default_ctx;
+ else return NULL;
+ }
icnv = rccIConvOpen("UTF-8", charset);
if (icnv) {