diff options
-rw-r--r-- | configure.in | 51 | ||||
-rw-r--r-- | docs/encodings.txt | 2 | ||||
-rw-r--r-- | engines/Makefile.am | 22 | ||||
-rw-r--r-- | engines/libguess.c | 80 | ||||
-rw-r--r-- | engines/librcd.c | 9 | ||||
-rw-r--r-- | examples/rcc.xml | 296 | ||||
-rw-r--r-- | examples/rcc.xml.chinese | 28 | ||||
-rw-r--r-- | examples/rcc.xml.eastern | 107 | ||||
-rw-r--r-- | examples/rcc.xml.western | 25 | ||||
-rw-r--r-- | src/Makefile.am | 2 | ||||
-rw-r--r-- | src/engine.c | 27 | ||||
-rw-r--r-- | src/engine.h | 1 | ||||
-rw-r--r-- | src/librcc.h | 3 | ||||
-rw-r--r-- | src/lngconfig.c | 2 | ||||
-rw-r--r-- | src/plugin.c | 8 | ||||
-rw-r--r-- | src/rccconfig.c | 25 | ||||
-rw-r--r-- | src/rccenca.c | 48 | ||||
-rw-r--r-- | src/rccenca.h | 8 | ||||
-rw-r--r-- | src/rccexternal.h | 3 | ||||
-rw-r--r-- | src/rcclocale.c | 9 | ||||
-rw-r--r-- | src/recode.c | 22 | ||||
-rw-r--r-- | ui/librccui.c | 2 | ||||
-rw-r--r-- | ui/rccnames.c | 2 |
23 files changed, 746 insertions, 36 deletions
diff --git a/configure.in b/configure.in index 7f85408..ce87ce8 100644 --- a/configure.in +++ b/configure.in @@ -41,6 +41,10 @@ AC_ARG_ENABLE( bdb, [ --disable-bdb disable usage of berkeleydb for recodings caching],, disable_bdb="yes") +AC_ARG_ENABLE( force-system-iconv, + [ --enable-force-system-iconv force usage of iconv library from glibc],, + enable_force_system_iconv="no") + AC_PROG_CC AM_PROG_CC_C_O AC_PROG_INSTALL @@ -118,13 +122,35 @@ AC_CHECK_HEADER(dlfcn.h, [AC_CHECK_LIB(dl, dlopen, [ ]) ])]) + +EXTRA_LIBS="" +EXTERNAL_ICONV=no +HAVE_LIBCHARSET=no + +if test "x$enable_force_system_iconv" != "xyes"; then +AC_CHECK_LIB(iconv, iconv_open, [ + EXTERNAL_ICONV=yes + EXTRA_LIBS+=" -liconv" +]) +AC_CHECK_HEADER(libcharset.h, [AC_CHECK_LIB(charset, locale_charset, [ + AC_DEFINE(HAVE_LIBCHARSET,1,[Defines if libRCD is available]) + HAVE_LIBCHARSET=yes + EXTRA_LIBS+=" -lcharset" +])]) +fi +AM_CONDITIONAL(HAVE_LIBCHARSET, [ test "x$HAVE_LIBCHARSET" = "xyes" ]) +AC_SUBST(EXTRA_LIBS) + + RCD_LIBS="" RCD_INCLUDES="" HAVE_RCD=no ENCA_LIBS="" ENCA_INCLUDES="" HAVE_ENCA=no - +LIBGUESS_LIBS="" +LIBGUESS_INCLUDES="" +HAVE_LIBGUESS=no if test "x$enable_force_dynamic_engines" != "xyes"; then AC_CHECK_HEADER(librcd.h, [AC_CHECK_LIB(rcd, rcdGetRussianCharset, [ @@ -141,13 +167,23 @@ AC_CHECK_HEADER(enca.h, [AC_CHECK_LIB(enca, enca_analyse, [ ])]) fi +AC_CHECK_HEADER(libguess.h, [AC_CHECK_LIB(guess, guess_jp, [ + AC_DEFINE(HAVE_LIBGUESS,1,[Defines if libguess is available]) + LIBGUESS_LIBS="-lguess" + LIBGUESS_INCLUDES="" + HAVE_LIBGUESS=yes +])]) + AM_CONDITIONAL(HAVE_RCD, [ test "x$HAVE_RCD" = "xyes" ]) AM_CONDITIONAL(HAVE_ENCA, [ test "x$HAVE_ENCA" = "xyes" ]) +AM_CONDITIONAL(HAVE_LIBGUESS, [ test "x$HAVE_LIBGUESS" = "xyes" ]) AC_SUBST(RCD_LIBS) AC_SUBST(RCD_INCLUDES) AC_SUBST(ENCA_LIBS) AC_SUBST(ENCA_INCLUDES) +AC_SUBST(LIBGUESS_LIBS) +AC_SUBST(LIBGUESS_INCLUDES) USE_DLOPEN=no if test "x$HAVE_DLOPEN" = "xyes"; then @@ -272,13 +308,23 @@ AC_CHECK_FUNCS(strcasecmp strncasecmp 
strdup strnlen) AC_OUTPUT(src/Makefile engines/Makefile external/Makefile ui/Makefile examples/Makefile Makefile librcc.spec) +rccdir=${pkgdatadir} +while expr ${rccdir:0:1} == '$' &>/dev/null; do + rccdir=`eval echo $rccdir` +done + + echo "" echo "Configuration:" echo " POSIX Threading Support: $HAVE_PTHREAD" echo "" +echo " External IConv Library: $EXTERNAL_ICONV" +echo " LibCharset Library: $HAVE_LIBCHARSET" +echo "" echo " Dynamic Engine Loading Support: $HAVE_DLOPEN" echo " Enca Charset Detection Support: $HAVE_ENCA" echo " LibRCD Charset Detection Support: $HAVE_RCD" +echo " LibGUESS Charset Detection Support: $HAVE_LIBGUESS" echo "" echo " Multilanguage support with DB4: $HAVE_BDB" echo " Language autodetection using aspell: $HAVE_ASPELL" @@ -289,5 +335,8 @@ echo "User Interfaces:" echo " GTK User Interface: $HAVE_GTK" echo " GTK2 User Interface: $HAVE_GTK2" echo "" +echo "Directories:" +echo " RCC Data Directory: ${rccdir}" +echo "" echo "" echo "" diff --git a/docs/encodings.txt b/docs/encodings.txt new file mode 100644 index 0000000..aa2355d --- /dev/null +++ b/docs/encodings.txt @@ -0,0 +1,2 @@ +Enca supports the HZ Chinese encoding, which is not supported by the iconv shipped +with glibc. The portable iconv library seems to support it. 
diff --git a/engines/Makefile.am b/engines/Makefile.am index 2b7bb26..93e490a 100644 --- a/engines/Makefile.am +++ b/engines/Makefile.am @@ -1,14 +1,22 @@ -lib_LTLIBRARIES = libwestern.la +lib_LTLIBRARIES = western_engine.la libdir = $(pkgdatadir)/engines if HAVE_RCD -lib_LTLIBRARIES += librcd.la -librcd_la_SOURCES = librcd.c -librcd_la_LDFLAGS = -module -avoid-version -export-symbols-regex "rccGetInfo" +lib_LTLIBRARIES += librcd_engine.la +librcd_engine_la_SOURCES = librcd.c +librcd_engine_la_LDFLAGS = -module -avoid-version -export-symbols-regex "rccGetInfo" +librcd_engine_la_LIBADD = @RCD_LIBS@ endif -libwestern_la_SOURCES = western.c -libwestern_la_LDFLAGS = -module -avoid-version -export-symbols-regex "rccGetInfo" +if HAVE_LIBGUESS +lib_LTLIBRARIES += libguess_engine.la +libguess_engine_la_SOURCES = libguess.c +libguess_engine_la_LDFLAGS = -module -avoid-version -export-symbols-regex "rccGetInfo" +libguess_engine_la_LIBADD = @LIBGUESS_LIBS@ +endif + +western_engine_la_SOURCES = western.c +western_engine_la_LDFLAGS = -module -avoid-version -export-symbols-regex "rccGetInfo" -AM_CPPFLAGS = -I../src @RCD_INCLUDES@ +AM_CPPFLAGS = -I../src @RCD_INCLUDES@ @LIBGUESS_INCLUDES@ diff --git a/engines/libguess.c b/engines/libguess.c new file mode 100644 index 0000000..7f13b50 --- /dev/null +++ b/engines/libguess.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <string.h> + +#include <librcc.h> +#include <libguess.h> + +#define UTF8_ID 0 +#define UTF16_ID 1 + +typedef const char *(*guess_function)(const char *buf, int buflen); + +struct rcc_guess_engine_t { + struct rcc_engine_t engine; + guess_function func; +}; +typedef struct rcc_guess_engine_t rcc_guess_engine; + +rcc_autocharset_id guessDetect(rcc_engine_context ctx, const char *buf, int len) { + const char *res; + rcc_guess_engine *info; + + if (!buf) return (rcc_autocharset_id)-1; + + info = (rcc_guess_engine*)rccEngineGetInfo(ctx); + if (info) { + if (info->func) res = info->func(buf, len?len:strlen(buf)); + else { 
+ if (!len) len = strlen(buf); + res = guess_cn(buf, len); + if (!res) res = guess_tw(buf, len); + printf("%s\n",res?res:"null"); + } + } else + res = NULL; + + if (!res) return (rcc_autocharset_id)-1; + return rccEngineGetAutoCharsetByName(ctx, res); +} + + +struct rcc_guess_engine_t guessJPEngine = { + { + "LibGUESS", + NULL, /* Constructor */ + NULL, /* Destructor */ + &guessDetect, + {"UTF-8", "UTF-16", "ISO-2022-JP", "EUC-JP", "SJIS", NULL} + }, + &guess_jp +}; + +struct rcc_guess_engine_t guessCNEngine = { + { + "LibGUESS", + NULL, /* Constructor */ + NULL, /* Destructor */ + &guessDetect, + {"UTF-8", "UTF-16", "ISO-2022-CN", "GB2312", "GB18030", "BIG5", NULL} + }, + NULL +}; + +struct rcc_guess_engine_t guessKREngine = { + { + "LibGUESS", + NULL, /* Constructor */ + NULL, /* Destructor */ + &guessDetect, + {"UTF-8", "UTF-16", "ISO-2022-KR", "EUC-KR", "JOHAB", NULL} + }, + &guess_kr +}; + + +rcc_engine *rccGetInfo(const char *lang) { + if (!strcmp(lang, "zh")) return (rcc_engine*)&guessCNEngine; + if (!strcmp(lang, "ja")) return (rcc_engine*)&guessJPEngine; + if (!strcmp(lang, "ko")) return (rcc_engine*)&guessKREngine; + return NULL; +} diff --git a/engines/librcd.c b/engines/librcd.c index c24d244..bfb14b0 100644 --- a/engines/librcd.c +++ b/engines/librcd.c @@ -9,11 +9,15 @@ static rcc_autocharset_id AutoengineRussian(rcc_engine_context ctx, const char * } static rcc_engine russian_engine = { - "LibRCD", NULL, NULL, &AutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", NULL} + "LibRCD", NULL, NULL, &AutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", "ISO8859-1", NULL} }; static rcc_engine ukrainian_engine = { - "LibRCD", NULL, NULL, &AutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", NULL} + "LibRCD", NULL, NULL, &AutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", "ISO8859-1", NULL} +}; + +static rcc_engine belarussian_engine = { + "LibRCD", NULL, NULL, &AutoengineRussian, {"CP1251","ISO-IR-111","UTF-8","IBM866", "ISO8859-1", NULL} }; 
rcc_engine *rccGetInfo(const char *lang) { @@ -21,6 +25,7 @@ rcc_engine *rccGetInfo(const char *lang) { if (!strcmp(lang, "ru")) return &russian_engine; if (!strcmp(lang, "uk")) return &ukrainian_engine; + if (!strcmp(lang, "be")) return &belarussian_engine; return NULL; } diff --git a/examples/rcc.xml b/examples/rcc.xml index 12f667b..eda97f1 100644 --- a/examples/rcc.xml +++ b/examples/rcc.xml @@ -1,8 +1,189 @@ <?xml version='1.0' encoding='UTF-8' ?> <LibRCC> <Languages> + <Language name="default"> + <FullName locale="ru">Автоопределение</FullName> + </Language> + <Language name="off"> + <FullName locale="ru">Отключить</FullName> + </Language> + <Language name="ru"> + <FullName locale="ru">Русский</FullName> + </Language> + <Language name="uk"> + <FullName locale="ru">Украинский</FullName> + </Language> + <Language name="be"> + <FullName locale="ru">Беларуский</FullName> + </Language> + <Language name="en"> + <FullName locale="ru">Английский</FullName> + <Charsets> + <Charset>ISO8859-1</Charset> + <Charset>UTF-8</Charset> + </Charsets> + <Engines> + <Engine>western</Engine> + </Engines> + </Language> + <Language name="bg"> + <FullName locale="ru">Болгарский</FullName> + <Charsets> + <Charset>CP1251</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-1</Charset> + <Charset>IBM855</Charset> + <Charset>MACCYRILLIC</Charset> + <Charset>ISO-IR-111</Charset> + </Charsets> + </Language> + <Language name="cz"> + <FullName locale="ru">Чешский</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>CSKOI8R</Charset> + </Charsets> + </Language> + <Language name="es"> + <FullName locale="ru">Эстонский</FullName> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + 
</Charsets> + </Language> + <Language name="hr"> + <FullName locale="ru">Хорватский</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + <Language name="hu"> + <FullName locale="ru">Венгерский</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + <Language name="lt"> + <FullName locale="ru">Латвийский</FullName> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="lv"> + <FullName locale="ru">Литовский</FullName> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="pl"> + <FullName locale="ru">Польский</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>ISO-8859-16</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="sk"> + <FullName locale="ru">Словацкий</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>CSKOI8R</Charset> + </Charsets> + </Language> + <Language name="sl"> + <FullName locale="ru">Cловенский</FullName> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + 
<Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + <Language name="ja"> + <FullName locale="ru">Японский</FullName> + <Charsets> + <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> + <Charset>ISO-2022-JP</Charset> + <Charset>EUC-JP</Charset> + <Charset>SJIS</Charset> + </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> + </Language> + <Language name="ko"> + <FullName locale="ru">Корейский</FullName> + <Charsets> + <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> + <Charset>ISO-2022-KR</Charset> + <Charset>EUC-KR</Charset> + <Charset>JOHAB</Charset> + </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> + </Language> + <Language name="zh"> + <FullName locale="ru">Китайский</FullName> + <Charsets> + <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> + <Charset>ISO-2022-CN</Charset> + <Charset>GB2312</Charset> + <Charset>GB18030</Charset> + <Charset>GBK</Charset> + <Charset>BIG5</Charset> + <Charset>EUC-CN</Charset> + <Charset>HZ</Charset> + </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> + </Language> <Language name="de"> <FullName>German</FullName> + <FullName locale="ru">Немецкий</FullName> <Charsets> <Charset>ISO8859-1</Charset> <Charset>UTF-8</Charset> @@ -13,6 +194,7 @@ </Language> <Language name="fr"> <FullName>French</FullName> + <FullName locale="ru">Французский</FullName> <Charsets> <Charset>ISO8859-1</Charset> <Charset>UTF-8</Charset> @@ -22,4 +204,118 @@ </Engines> </Language> </Languages> + <Options> + <Option name="LEARNING_MODE"> + <FullName locale="ru">Кэширование</FullName> + <Value name="OFF"> + <FullName locale="ru">Выключено</FullName> + </Value> + <Value name="ON"> + <FullName locale="ru">Включено</FullName> + </Value> + <Value name="RELEARN"> + <FullName locale="ru">Обновлять данные кэша</FullName> + </Value> + <Value name="LEARN"> + <FullName locale="ru">Добовлять данные в кэш</FullName> + </Value> + </Option> + <Option 
name="AUTODETECT_FS_NAMES"> + <FullName locale="ru">Поиск файлов</FullName> + </Option> + <Option name="AUTOENGINE_SET_CURRENT"> + <FullName locale="ru">Автоматически устанавливать текущую кодировку</FullName> + </Option> + <Option name="AUTODETECT_LANGUAGE"> + <FullName locale="ru">Автоопределение языка</FullName> + </Option> + <Option name="TRANSLATE"> + <FullName locale="ru">Перевод</FullName> + <Value name="OFF"> + <FullName locale="ru">Отключено</FullName> + </Value> + <Value name="TRANSLITERATE"> + <FullName locale="ru">Транслитерация</FullName> + </Value> + <Value name="TO_ENGLISH"> + <FullName locale="ru">На английский</FullName> + </Value> + <Value name="SKIP_RELATED"> + <FullName locale="ru">Не переводить родственные языки</FullName> + </Value> + <Value name="SKIP_PARENT"> + <FullName locale="ru">Не переводить из родительского языка</FullName> + </Value> + <Value name="FULL"> + <FullName locale="ru">Переводить все</FullName> + </Value> + </Option> + <Option name="TIMEOUT"> + <FullName locale="ru">Разрешенное время перевода (us)</FullName> + </Option> + </Options> + <Classes> + <Class name="id3"> + <FullName locale="ru">Кодировка ID3 Тэгов</FullName> + </Class> + <Class name="pl"> + <FullName locale="ru">Кодировка Списка Песен</FullName> + </Class> + <Class name="plfs"> + <FullName locale="ru">Кодировка Файлов в Списке</FullName> + </Class> + <Class name="fs"> + <FullName locale="ru">Кодировка Файловой Системы</FullName> + </Class> + <Class name="out"> + <FullName locale="ru">Отображение</FullName> + </Class> + </Classes> + <Charsets> + <Charset name="Default"> + <FullName locale="ru">Авто-определение</FullName> + </Charset> + </Charsets> + <Engines> + <Engine name="off"> + <FullName locale="ru">Отключить</FullName> + </Engine> + <Engine name="Russian"> + <FullName locale="ru">Библиотека LibRCD</FullName> + </Engine> + <Engine name="Enca Library"> + <FullName locale="ru">Библиотека Enca</FullName> + </Engine> + <Engine name="LibGUESS"> + <FullName 
locale="ru">Библиотека LibGUESS</FullName> + </Engine> + </Engines> + <Pages> + <Page name="RusXMMS"> + <FullName locale="ru">РусXMMS</FullName> + <Frames> + <Frame name="Language"> + <FullName locale="ru">Язык</FullName> + <Boxes> + <Box name="Language"> + <FullName locale="ru">Текущий Язык</FullName> + </Box> + </Boxes> + </Frame> + <Frame name="Engine"> + <FullName locale="ru">Авто-определение</FullName> + <Boxes> + <Box name="Engine"> + <FullName locale="ru">Движок</FullName> + </Box> + <Box name=""> + </Box> + </Boxes> + </Frame> + <Frame name="Charset"> + <FullName locale="ru">Кодировки</FullName> + </Frame> + </Frames> + </Page> + </Pages> </LibRCC>
\ No newline at end of file diff --git a/examples/rcc.xml.chinese b/examples/rcc.xml.chinese index 0cc914c..ea916c6 100644 --- a/examples/rcc.xml.chinese +++ b/examples/rcc.xml.chinese @@ -5,24 +5,44 @@ <FullName>Japanese</FullName> <Charsets> <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> <Charset>ISO-2022-JP</Charset> <Charset>EUC-JP</Charset> - <Charset>SHIFT-JIS</Charset> + <Charset>SJIS</Charset> </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> + </Language> + <Language name="ko"> + <FullName>Korean</FullName> + <Charsets> + <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> + <Charset>ISO-2022-KR</Charset> + <Charset>EUC-KR</Charset> + <Charset>JOHAB</Charset> + </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> </Language> <Language name="zh"> <FullName>Chinese</FullName> <Charsets> <Charset>UTF-8</Charset> + <Charset>UTF-16</Charset> + <Charset>ISO-2022-CN</Charset> <Charset>GB2312</Charset> <Charset>GB18030</Charset> <Charset>GBK</Charset> - <Charset>ISO-2022-CN</Charset> <Charset>BIG5</Charset> - <Charset>BIG5-HKSCS</Charset> <Charset>EUC-CN</Charset> - <Charset>EUC-TW</Charset> + <Charset>HZ</Charset> </Charsets> + <Engines> + <Engine>libguess</Engine> + </Engines> </Language> </Languages> </LibRCC> diff --git a/examples/rcc.xml.eastern b/examples/rcc.xml.eastern new file mode 100644 index 0000000..52fbfcb --- /dev/null +++ b/examples/rcc.xml.eastern @@ -0,0 +1,107 @@ +<?xml version='1.0' encoding='UTF-8' ?> +<LibRCC> + <Languages> + <Language name="bg"> + <Charsets> + <Charset>CP1251</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-1</Charset> + <Charset>IBM855</Charset> + <Charset>MACCYRILLIC</Charset> + <Charset>ISO-IR-111</Charset> + </Charsets> + </Language> + <Language name="cz"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>CSKOI8R</Charset> + </Charsets> + </Language> + 
<Language name="es"> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="hr"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + <Language name="hu"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + <Language name="lt"> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="lv"> + <Charsets> + <Charset>CP1257</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-4</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>IBM755</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="pl"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>ISO-8859-13</Charset> + <Charset>ISO-8859-16</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>BALTIC</Charset> + </Charsets> + </Language> + <Language name="sk"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + <Charset>MACCENTRALEUROPE</Charset> + <Charset>CSKOI8R</Charset> + </Charsets> + </Language> + <Language name="sl"> + <Charsets> + <Charset>CP1250</Charset> + <Charset>UTF-8</Charset> + <Charset>ISO-8859-2</Charset> + <Charset>IBM852</Charset> + 
<Charset>MACCENTRALEUROPE</Charset> + </Charsets> + </Language> + </Languages> +</LibRCC>
\ No newline at end of file diff --git a/examples/rcc.xml.western b/examples/rcc.xml.western new file mode 100644 index 0000000..12f667b --- /dev/null +++ b/examples/rcc.xml.western @@ -0,0 +1,25 @@ +<?xml version='1.0' encoding='UTF-8' ?> +<LibRCC> + <Languages> + <Language name="de"> + <FullName>German</FullName> + <Charsets> + <Charset>ISO8859-1</Charset> + <Charset>UTF-8</Charset> + </Charsets> + <Engines> + <Engine>western</Engine> + </Engines> + </Language> + <Language name="fr"> + <FullName>French</FullName> + <Charsets> + <Charset>ISO8859-1</Charset> + <Charset>UTF-8</Charset> + </Charsets> + <Engines> + <Engine>western</Engine> + </Engines> + </Language> + </Languages> +</LibRCC>
\ No newline at end of file diff --git a/src/Makefile.am b/src/Makefile.am index 79976c6..42c5966 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -28,6 +28,6 @@ librcc_la_SOURCES = librcc.c \ include_HEADERS = librcc.h AM_CPPFLAGS = -I../src -DLIBRCC_DATA_DIR=\"${pkgdatadir}\" @XML_INCLUDES@ @DLOPEN_INCLUDES@ @RCD_INCLUDES@ @ENCA_INCLUDES@ @BDB_INCLUDES@ @ASPELL_CFLAGS@ @PTHREAD_CFLAGS@ -librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@ +librcc_la_LIBADD = @XML_LIBS@ @DLOPEN_LIBS@ @RCD_LIBS@ @ENCA_LIBS@ @BDB_LIBS@ @ASPELL_LIBS@ @PTHREAD_LIBS@ @EXTRA_LIBS@ librcc_la_LDFLAGS = -version-info @LIBRCC_VERSION_INFO@ diff --git a/src/engine.c b/src/engine.c index f9c2284..3d3e023 100644 --- a/src/engine.c +++ b/src/engine.c @@ -125,6 +125,7 @@ int rccEngineConfigure(rcc_engine_context ctx) { engine = ctx->config->language->engines[engine_id]; + ctx->id = engine_id; ctx->free_func = engine->free_func; ctx->func = engine->func; @@ -134,6 +135,30 @@ int rccEngineConfigure(rcc_engine_context ctx) { return 0; } + +rcc_engine *rccEngineGetInfo(rcc_engine_context ctx) { + if (!ctx) return NULL; + return ctx->config->language->engines[ctx->id]; +} + +rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name) { + unsigned int i; + rcc_engine *info; + rcc_charset *charsets; + + if ((!ctx)||(!name)) return (rcc_autocharset_id)-1; + + info = rccEngineGetInfo(ctx); + if (info) { + charsets = info->charsets; + + for (i=0;charsets[i];i++) + if (!strcasecmp(charsets[i],name)) return (rcc_autocharset_id)i; + } + + return (rcc_autocharset_id)-1; +} + rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx) { if (!ctx) return NULL; @@ -186,6 +211,8 @@ static int CheckWestern(const unsigned char *buf, int len) { rcc_autocharset_id rccEngineDetectCharset(rcc_engine_context ctx, const char *buf, size_t len) { rcc_autocharset_id utf; + /* DS: This should be done directly in autoengines, 
otherwise we will + fail to detect 7bit encodings */ if (CheckWestern(buf, len)) { utf=rccConfigGetAutoCharsetByName(ctx->config, "UTF-8"); if (utf != (rcc_autocharset_id)-1) return utf; diff --git a/src/engine.h b/src/engine.h index 96e6db6..3213f2b 100644 --- a/src/engine.h +++ b/src/engine.h @@ -26,6 +26,7 @@ struct rcc_engine_context_t { rcc_engine_function func; rcc_engine_free_function free_func; + rcc_engine_id id; rcc_engine_internal internal; }; typedef struct rcc_engine_context_t rcc_engine_context_s; diff --git a/src/librcc.h b/src/librcc.h index e5749cd..88cc802 100644 --- a/src/librcc.h +++ b/src/librcc.h @@ -1481,6 +1481,9 @@ typedef rcc_engine *(*rcc_plugin_engine_info_function)(const char *lang); rcc_engine_internal rccEngineGetInternal(rcc_engine_context ctx); rcc_language *rccEngineGetLanguage(rcc_engine_context ctx); rcc_context rccEngineGetRccContext(rcc_engine_context ctx); +rcc_engine *rccEngineGetInfo(rcc_engine_context ctx); +rcc_autocharset_id rccEngineGetAutoCharsetByName(rcc_engine_context ctx, const char *name); + /******************************************************************************* **************************** Configuration ************************************* diff --git a/src/lngconfig.c b/src/lngconfig.c index 67e05c6..670d97f 100644 --- a/src/lngconfig.c +++ b/src/lngconfig.c @@ -405,7 +405,7 @@ rcc_language_config rccGetUsableConfig(rcc_context ctx, rcc_language_id language rcc_language_config rccGetConfig(rcc_context ctx, rcc_language_id language_id) { rcc_language_config config; - + config = rccGetConfigPointer(ctx, language_id, &language_id); if (config) { if ((!config->charset)&&(rccConfigInit(config, ctx))) return NULL; diff --git a/src/plugin.c b/src/plugin.c index 38337fb..c53726f 100644 --- a/src/plugin.c +++ b/src/plugin.c @@ -121,13 +121,13 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) { switch (type) { case RCC_PLUGIN_TYPE_ENGINE: - pluginfn = (char*)malloc((32 + 
strlen(rcc_home_dir) + strlen(name))*sizeof(char)); + pluginfn = (char*)malloc((48 + strlen(rcc_home_dir) + strlen(name))*sizeof(char)); if (!pluginfn) return NULL; - sprintf(pluginfn, "%s/.rcc/engines/lib%s.so", rcc_home_dir, name); + sprintf(pluginfn, "%s/.rcc/engines/%s_engine.so", rcc_home_dir, name); res = rccLibraryOpen(pluginfn); if (!res) { - sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/lib%s.so", name); + sprintf(pluginfn, LIBRCC_DATA_DIR "/engines/%s_engine.so", name); res = rccLibraryOpen(pluginfn); } free(pluginfn); @@ -156,7 +156,7 @@ rcc_plugin_handle rccPluginLoad(rcc_plugin_type type, const char *name) { rcc_engine *rccPluginEngineGetInfo(const char *name, const char *language) { rcc_plugin_handle handle; rcc_plugin_engine_info_function infofunc; - + handle = rccPluginLoad(RCC_PLUGIN_TYPE_ENGINE, name); if (!handle) return NULL; diff --git a/src/rccconfig.c b/src/rccconfig.c index 6723825..d5546c7 100644 --- a/src/rccconfig.c +++ b/src/rccconfig.c @@ -10,8 +10,8 @@ rcc_language_alias rcc_default_aliases[RCC_MAX_ALIASES + 1]; rcc_language_alias rcc_default_aliases_embeded[RCC_MAX_ALIASES + 1] = { - { "cs_SK", "sk" }, - { "ru_UA", "uk" }, +/* { "cs_SK", "sk" }, + { "ru_UA", "uk" },*/ { NULL, NULL } }; @@ -45,11 +45,15 @@ rcc_engine rcc_default_engine = { }; rcc_engine rcc_russian_engine = { - "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", NULL} + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-R","UTF-8","IBM866", "ISO8859-1", NULL} }; rcc_engine rcc_ukrainian_engine = { - "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", NULL} + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","KOI8-U","UTF-8","IBM865", "ISO8859-1", NULL} +}; + +rcc_engine rcc_belarussian_engine = { + "LibRCD", NULL, NULL, &rccAutoengineRussian, {"CP1251","ISO-IR-111","UTF-8","IBM865", "ISO8859-1", NULL} }; rcc_language rcc_default_languages[RCC_MAX_LANGUAGES + 1]; @@ -81,11 +85,14 @@ rcc_language 
rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { #endif /* RCC_RCD_SUPPORT */ NULL }}, -{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "KOI8-UNI", "maccyr" "IBM855", NULL},{ +{"be", {rcc_default_charset, rcc_utf8_charset, "CP1251", "IBM866", "ISO-8859-5", "ISO-IR-111", "ISO-IR-111", "MACCYRILLIC" "IBM855", NULL},{ &rcc_default_engine, +#ifdef RCC_RCD_SUPPORT + &rcc_ukrainian_engine, +#endif /* RCC_RCD_SUPPORT */ NULL }}, -{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{ +/*{"bg", {rcc_default_charset, rcc_utf8_charset, "CP1251", "ISO-8859-5", "IBM855", "maccyr", "ECMA-113", NULL},{ &rcc_default_engine, NULL }}, @@ -124,11 +131,7 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = { {"sl", {rcc_default_charset, rcc_utf8_charset, "ISO-8859-2", "CP1250", "IBM852", "macce", "CORK", NULL},{ &rcc_default_engine, NULL -}}, -{"zh", {rcc_default_charset, rcc_utf8_charset, "GB2312", "GBK", "GB18030", "BIG5", NULL},{ - &rcc_default_engine, - NULL -}}, +}},*/ {NULL} }; rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL }; diff --git a/src/rccenca.c b/src/rccenca.c index 28d3ccf..e46847e 100644 --- a/src/rccenca.c +++ b/src/rccenca.c @@ -20,6 +20,41 @@ static rcc_library_handle enca_handle = NULL; #endif /* RCC_ENCA_DYNAMIC */ static rcc_engine *enca_engines = NULL; + +/* CORK, KEYBCS2 is missing */ +rcc_enca_corrections rcc_enca_missing_corrections[] = { + { "be", "KOI8-UNI", "ISO-IR-111" }, + { NULL, "macce", "MACCENTRALEUROPE" }, + { "zh", "HZ", "HZ" }, + { "sk", "KOI-8_CS_2", "CSKOI8R" }, + { NULL, NULL, NULL } +}; + +rcc_enca_corrections rcc_enca_error_corrections[] = { + { NULL, "ECMA-cyrillic", "ISO-IR-111" }, + { NULL, NULL, NULL } +}; + + +static const char *rccEncaGetCorrection(const char *lang, const char *charset) { + int i; + for (i=0;rcc_enca_error_corrections[i].enca_charset;i++) { + if 
(((!rcc_enca_error_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_error_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_error_corrections[i].enca_charset))) + return rcc_enca_error_corrections[i].iconv_charset; + } + return charset; +} + +static const char *rccEncaGetMissing(const char *lang, const char *charset) { + int i; + for (i=0;rcc_enca_missing_corrections[i].enca_charset;i++) { + if (((!rcc_enca_missing_corrections[i].lang)||((lang)&&(!strcmp(lang, rcc_enca_missing_corrections[i].lang))))&&(!strcmp(charset, rcc_enca_missing_corrections[i].enca_charset))) + return rcc_enca_missing_corrections[i].iconv_charset; + } + return charset; +} + + rcc_engine_internal rccEncaInitContext(rcc_engine_context ctx) { #ifdef RCC_ENCA_SUPPORT EncaAnalyser enca; @@ -65,7 +100,12 @@ rcc_autocharset_id rccEnca(rcc_engine_context ctx, const char *buf, int len) { if (ee.charset<0) return (rcc_charset_id)-1; charset = enca_charset_name(ee.charset, ENCA_NAME_STYLE_ICONV); - return rccGetAutoCharsetByName(ctx->config->ctx, charset); + if (charset) { + charset = rccEncaGetCorrection(rccEngineGetLanguage(ctx)->sn, charset); + } else { + charset = rccEncaGetMissing(rccEngineGetLanguage(ctx)->sn, enca_charset_name(ee.charset, ENCA_NAME_STYLE_ENCA)); + } + return rccEngineGetAutoCharsetByName(ctx, charset); #else /* RCC_ENCA_SUPPORT */ return (rcc_charset_id)-1; #endif /* RCC_ENCA_SUPPORT */ @@ -160,7 +200,11 @@ int rccEncaInit() { for (l=0;l<n_charsets;l++) { // Enca bug, STYLE_ICONV return's a lot of NULL's charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ICONV); - if (!charset) charset = enca_charset_name(charsets[l], ENCA_NAME_STYLE_ENCA); + if (charset) { + charset = rccEncaGetCorrection(rcc_default_languages[i].sn, charset); + } else { + charset = rccEncaGetMissing(rcc_default_languages[i].sn, enca_charset_name(charsets[l], ENCA_NAME_STYLE_ENCA)); + } enca_engines[i].charsets[k++] = charset; } enca_engines[j].charsets[k] = NULL; diff --git a/src/rccenca.h 
b/src/rccenca.h index 2f2c487..308b8fb 100644 --- a/src/rccenca.h +++ b/src/rccenca.h @@ -20,6 +20,14 @@ # define RCC_ENCA_SUPPORT #endif +struct rcc_enca_corrections_t { + char *lang; + const char *enca_charset; + const char *iconv_charset; +}; +typedef struct rcc_enca_corrections_t rcc_enca_corrections; + + int rccEncaInit(); void rccEncaFree(); diff --git a/src/rccexternal.h b/src/rccexternal.h index 181a6ec..fe7052f 100644 --- a/src/rccexternal.h +++ b/src/rccexternal.h @@ -2,6 +2,9 @@ #define _RCC_EXTERNAL_H #include "../config.h" +#ifdef HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif /* HAVE_SYS_TYPES_H */ typedef enum rcc_external_module_t { RCC_EXTERNAL_MODULE_CONTROL = 0, diff --git a/src/rcclocale.c b/src/rcclocale.c index 99d2b8f..9869a72 100644 --- a/src/rcclocale.c +++ b/src/rcclocale.c @@ -5,10 +5,14 @@ #include "../config.h" +#ifdef HAVE_LIBCHARSET +# include <libcharset.h> +#endif /* HAVE_LIBCHARSET */ #ifdef HAVE_CODESET # include <langinfo.h> #endif + #include "rccconfig.h" int rccLocaleGetClassByName(const char *locale) { @@ -80,9 +84,12 @@ int rccLocaleGetCharset(char *result, const char *lv, unsigned int n) { if (locale_class == LC_CTYPE) { l = getenv("CHARSET"); +#ifdef HAVE_LIBCHARSET + if (!l) l = locale_charset(); +#endif /* HAVE_LIBCHARSET */ #ifdef HAVE_CODESET if (!l) l = nl_langinfo(CODESET); -#endif +#endif /* HAVE_CODESET */ if (l) { if (strlen(l)>=n) return -1; strcpy(result, l); diff --git a/src/recode.c b/src/recode.c index e1e8e81..1d98306 100644 --- a/src/recode.c +++ b/src/recode.c @@ -742,7 +742,12 @@ rcc_string rccSizedFromCharset(rcc_context ctx, const char *charset, const char rcc_string ret; if ((!buf)||(!charset)) return NULL; - + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } + config = rccGetCurrentConfig(ctx); if (!config) return NULL; @@ -768,6 +773,11 @@ char *rccSizedToCharset(rcc_context ctx, const char *charset, rcc_const_string b if ((!buf)||(!charset)) return NULL; + if 
(!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } + res = rccStringCheck(buf); if (!res) return NULL; @@ -799,6 +809,11 @@ char *rccSizedRecodeToCharset(rcc_context ctx, rcc_class_id class_id, const char char *utf8, *extracted; if (!charset) return NULL; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } utf8 = rccSizedFrom(ctx, class_id, buf, len); if (!utf8) return utf8; @@ -839,6 +854,11 @@ char *rccSizedRecodeFromCharset(rcc_context ctx, rcc_class_id class_id, const ch char *extracted; if (!charset) return NULL; + + if (!ctx) { + if (rcc_default_ctx) ctx = rcc_default_ctx; + else return NULL; + } icnv = rccIConvOpen("UTF-8", charset); if (icnv) { diff --git a/ui/librccui.c b/ui/librccui.c index 17e7281..1979899 100644 --- a/ui/librccui.c +++ b/ui/librccui.c @@ -219,7 +219,7 @@ int rccUiInit() { else icnv = rccIConvOpen(ctype_charset, "UTF-8"); } - if (!rccLocaleGetLanguage(locale, "LANGUAGE", 32)) { + if (!rccLocaleGetLanguage(locale, "LC_MESSAGES", 32)) { search[0] = strdup(locale); if (!search[0]) goto clean; lpos = strrchr(search[0], '@'); diff --git a/ui/rccnames.c b/ui/rccnames.c index 7f4f912..3a8ade1 100644 --- a/ui/rccnames.c +++ b/ui/rccnames.c @@ -26,6 +26,8 @@ rcc_name rcc_default_language_names_embeded[RCC_MAX_LANGUAGES+1] = { {"sk","Slovak"}, {"sl","Slovenian"}, {"zh","Chinese"}, +{"ko","Korean"}, +{"ja","Japanese"}, {NULL, NULL} }; |