summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-11 01:06:56 +0000
committerSuren A. Chilingaryan <csa@dside.dyndns.org>2005-08-11 01:06:56 +0000
commit3736c5f3635863e54ab2cc47860628d26855c749 (patch)
tree3c1dadec1b75557463fcc740429cceb6e948f998 /src
parent63bf2a90a6d6fb0859e4c9dd9fcac85de9adc0f1 (diff)
downloadlibrcc-3736c5f3635863e54ab2cc47860628d26855c749.tar.gz
librcc-3736c5f3635863e54ab2cc47860628d26855c749.tar.bz2
librcc-3736c5f3635863e54ab2cc47860628d26855c749.tar.xz
librcc-3736c5f3635863e54ab2cc47860628d26855c749.zip
Transliteration and Documentation Update
- Fix: Autodetection of disabled charsets. - Fix: Cleanly terminate external process if parent thread terminated. - Transliteration for Russian, Ukrainian and using IConv. - Documentation Update.
Diffstat (limited to 'src')
-rw-r--r--src/librcc.h10
-rw-r--r--src/lngconfig.c59
-rw-r--r--src/rccconfig.c11
-rw-r--r--src/rccconfig.h3
-rw-r--r--src/recode.c47
5 files changed, 106 insertions, 24 deletions
diff --git a/src/librcc.h b/src/librcc.h
index 9b064d1..98ca1a6 100644
--- a/src/librcc.h
+++ b/src/librcc.h
@@ -427,6 +427,7 @@ typedef int rcc_option_value;
typedef enum rcc_option_translate_t {
RCC_OPTION_TRANSLATE_OFF = 0, /**< Switch translation off. */
+ RCC_OPTION_TRANSLATE_TRANSLITERATE, /**< Transliterate data. */
RCC_OPTION_TRANSLATE_TO_ENGLISH, /**< Translate data to english language (Current language don't matter). */
RCC_OPTION_TRANSLATE_SKIP_RELATED, /**< Skip translation of the text's between related languages. */
RCC_OPTION_TRANSLATE_SKIP_PARRENT, /**< Skip translation of the text's from parrent languages (from english). */
@@ -821,7 +822,7 @@ rcc_charset_id rccConfigGetClassCharsetByName(rcc_language_config config, rcc_cl
* Checks if charset is disabled for the specified class.
* @param config is language configuration
* @param class_id is class id.
- * @param charset is charset name.
+ * @param charset_id is charset id.
* @return 1 if charset is disabled, 0 if charset is enabled, -1 in the case of error.
*/
int rccConfigIsDisabledCharset(rcc_language_config config, rcc_class_id class_id, rcc_charset_id charset_id);
@@ -885,10 +886,13 @@ const char *rccConfigGetSelectedCharsetName(rcc_language_config config, rcc_clas
/**
* Return current encoding_id. The default value will be resolved to paticular encoding id.
* The following procedure is used to detect default encoding:
+ * - If a Unicode encoding is selected for the same class of the English language, return this encoding.
* - If the parrent class is defined in #defcharset, - return current encoding of parrent class.
- * - If the locale variable is defined in #defcharset and config language coincide with locale language, use locale encoding.
+ * - If the locale variable is defined in #defcharset and either the config language coincides with the locale language or a Unicode encoding is defined, use the locale encoding.
* - If the default value for config language is defined in #defvalue return that default value.
- * - Return language with id 0. Normally this should be dummy language which indicates that RCC library is not used.
+ * - If the default value for all languages is defined in #defvalue return that default value.
+ * - If either the config language coincides with the locale language or a Unicode locale is used, return the locale encoding.
+ * - Return the first non-disabled encoding in the list.
*
* @param config is language configuration
* @param class_id is encoding class
diff --git a/src/lngconfig.c b/src/lngconfig.c
index 20aff63..631abd1 100644
--- a/src/lngconfig.c
+++ b/src/lngconfig.c
@@ -567,9 +567,11 @@ const char *rccConfigGetSelectedCharsetName(rcc_language_config config, rcc_clas
}
rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_id class_id) {
+ rcc_language_config enconfig;
unsigned int i, max;
rcc_charset_id charset_id;
rcc_charset_id all_charset_id = (rcc_language_id)-1;
+ const char *charset;
rcc_class_default_charset *defcharset;
const char *lang;
@@ -582,10 +584,19 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_
const char *defvalue;
if ((!config)||(!config->ctx)||(class_id<0)||(class_id>=config->ctx->n_classes)) return -1;
-
+
charset_id = config->charset[class_id];
if (charset_id) return charset_id;
+ enconfig = rccGetConfigByName(config->ctx, rcc_english_language_sn);
+ if ((enconfig)&&(enconfig!=config)) {
+ charset_id = enconfig->charset[class_id];
+ if (charset_id) {
+ charset = rccConfigGetClassCharsetName(enconfig, class_id, charset_id);
+ if ((charset)&&(rccIsUnicode(charset))) return charset_id;
+ }
+ }
+
if (!config->language) return (rcc_charset_id)-1;
else language = config->language;
@@ -598,23 +609,27 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_
if (!strcmp(classes[i]->name, defvalue))
return rccConfigGetCurrentCharset(config, i);
}
- } else defvalue = config->ctx->locale_variable;
+ }
if (config->default_charset[class_id]) return config->default_charset[class_id];
if (cl->defvalue) {
charset_id = rccConfigGetLocaleClassCharset(config, class_id, defvalue);
if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) {
- config->default_charset[class_id] = charset_id;
- return charset_id;
+ if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) {
+ config->default_charset[class_id] = charset_id;
+ return charset_id;
+ }
}
}
if (cl->defvalue) {
charset_id = rccConfigGetClassCharsetByName(config, class_id, defvalue);
if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) {
- config->default_charset[class_id] = charset_id;
- return charset_id;
+ if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) {
+ config->default_charset[class_id] = charset_id;
+ return charset_id;
+ }
}
}
@@ -626,9 +641,17 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_
if (!strcasecmp(lang, defcharset[i].lang)) {
charset_id = rccConfigGetClassCharsetByName(config, class_id, defcharset[i].charset);
if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) {
- config->default_charset[class_id] = charset_id;
- return charset_id;
- } else break;
+ if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) {
+ config->default_charset[class_id] = charset_id;
+ return charset_id;
+ } else {
+ all_charset_id = (rcc_charset_id)-1;
+ break;
+ }
+ } else {
+ all_charset_id = (rcc_charset_id)-1;
+ break;
+ }
} else if (!strcasecmp(rcc_default_all, defcharset[i].lang)) {
charset_id = rccConfigGetClassCharsetByName(config, class_id, defcharset[i].charset);
if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) {
@@ -638,20 +661,26 @@ rcc_charset_id rccConfigGetCurrentCharset(rcc_language_config config, rcc_class_
}
if (all_charset_id != (rcc_language_id)-1) {
- config->default_charset[class_id] = all_charset_id;
- return all_charset_id;
+ if (!rccConfigIsDisabledCharset(config, class_id, all_charset_id)) {
+ config->default_charset[class_id] = all_charset_id;
+ return all_charset_id;
+ }
}
}
- charset_id = rccConfigGetLocaleClassCharset(config, class_id, defvalue);
+ charset_id = rccConfigGetLocaleClassCharset(config, class_id, config->ctx->locale_variable);
if ((charset_id != 0)&&(charset_id != (rcc_charset_id)-1)) {
- config->default_charset[class_id] = charset_id;
- return charset_id;
+ if (!rccConfigIsDisabledCharset(config, class_id, charset_id)) {
+ config->default_charset[class_id] = charset_id;
+ return charset_id;
+ }
}
max = rccConfigGetClassCharsetNumber(config, class_id);
for (i = 1; i< max; i++)
- if (!rccConfigIsDisabledCharset(config, class_id, (rcc_charset_id)i)) return (rcc_charset_id)i;
+ if (!rccConfigIsDisabledCharset(config, class_id, (rcc_charset_id)i)) {
+ return (rcc_charset_id)i;
+ }
return (rcc_charset_id)-1;
}
diff --git a/src/rccconfig.c b/src/rccconfig.c
index 0752ee3..ae47a63 100644
--- a/src/rccconfig.c
+++ b/src/rccconfig.c
@@ -31,6 +31,8 @@ const char rcc_default_all[] = "all";
const char rcc_default_language_sn[] = "default";
const char rcc_disabled_language_sn[] = "Off";
const char rcc_english_language_sn[] = "en";
+const char rcc_russian_language_sn[] = "ru";
+const char rcc_ukrainian_language_sn[] = "uk";
const char rcc_disabled_engine_sn[] = "Off";
const char rcc_default_charset[] = "Default";
@@ -61,18 +63,18 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
&rcc_default_engine,
NULL
}},
-{"en", {rcc_default_charset, rcc_utf8_charset, NULL}, {
+{rcc_english_language_sn, {rcc_default_charset, rcc_utf8_charset, "ISO8859-1", NULL}, {
&rcc_default_engine,
NULL
}},
-{"ru", {rcc_default_charset,"KOI8-R","CP1251",rcc_utf8_charset,"IBM866","MACCYRILLIC","ISO8859-5", NULL}, {
+{rcc_russian_language_sn, {rcc_default_charset,"KOI8-R","CP1251",rcc_utf8_charset,"IBM866","MACCYRILLIC","ISO8859-5", NULL}, {
&rcc_default_engine,
#ifdef RCC_RCD_SUPPORT
&rcc_russian_engine,
#endif /* RCC_RCD_SUPPORT */
NULL
}},
-{"uk", {rcc_default_charset,"KOI8-U","CP1251",rcc_utf8_charset,"IBM855","MACCYRILLIC","ISO8859-5","CP1125", NULL}, {
+{rcc_ukrainian_language_sn, {rcc_default_charset,"KOI8-U","CP1251",rcc_utf8_charset,"IBM855","MACCYRILLIC","ISO8859-5","CP1125", NULL}, {
&rcc_default_engine,
#ifdef RCC_RCD_SUPPORT
&rcc_ukrainian_engine,
@@ -129,11 +131,10 @@ rcc_language rcc_default_languages_embeded[RCC_MAX_LANGUAGES + 1] = {
}},
{NULL}
};
-
rcc_option_value_name rcc_sn_boolean[] = { "OFF", "ON", NULL };
rcc_option_value_name rcc_sn_learning[] = { "OFF", "ON", "RELEARN", "LEARN", NULL };
rcc_option_value_name rcc_sn_clo[] = { "ALL", "CONFIGURED_AND_AUTO", "CONFIGURED_ONLY", NULL };
-rcc_option_value_name rcc_sn_translate[] = { "OFF", "TO_ENGLISH", "SKIP_RELATED", "SKIP_PARRENT", "FULL", NULL };
+rcc_option_value_name rcc_sn_translate[] = { "OFF", "TRANSLITERATE", "TO_ENGLISH", "SKIP_RELATED", "SKIP_PARRENT", "FULL", NULL };
rcc_option_description rcc_option_descriptions[RCC_MAX_OPTIONS+1];
rcc_option_description rcc_option_descriptions_embeded[RCC_MAX_OPTIONS+1] = {
diff --git a/src/rccconfig.h b/src/rccconfig.h
index f7f70dd..8b5ac0d 100644
--- a/src/rccconfig.h
+++ b/src/rccconfig.h
@@ -10,6 +10,9 @@
extern const char rcc_default_all[];
extern const char rcc_default_language_sn[];
extern const char rcc_english_language_sn[];
+extern const char rcc_russian_language_sn[];
+extern const char rcc_ukrainian_language_sn[];
+
extern const char rcc_disabled_language_sn[];
extern const char rcc_disabled_engine_sn[];
diff --git a/src/recode.c b/src/recode.c
index a528481..9e19078 100644
--- a/src/recode.c
+++ b/src/recode.c
@@ -322,7 +322,9 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_
rcc_translate trans, entrans;
+ unsigned int i;
char *translated;
+ unsigned char change_case;
ctx = (*config)->ctx;
@@ -336,7 +338,7 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_
english_language_id = rccGetLanguageByName(ctx, rcc_english_language_sn);
- if (translate == RCC_OPTION_TRANSLATE_TO_ENGLISH) {
+ if ((translate == RCC_OPTION_TRANSLATE_TO_ENGLISH)||(translate == RCC_OPTION_TRANSLATE_TRANSLITERATE)) {
current_language_id = english_language_id ;
} else {
if (ctype == RCC_CLASS_TRANSLATE_LOCALE) {
@@ -356,6 +358,49 @@ static char *rccRecodeTranslate(rcc_language_config *config, rcc_class_id class_
if (rccConfigConfigure(curconfig)) return NULL;
+ if (translate == RCC_OPTION_TRANSLATE_TRANSLITERATE) {
+ if (!strcasecmp((*config)->language->sn, rcc_russian_language_sn)) {
+ translated = rccSizedRecodeCharsets(ctx, "UTF-8", "KOI8-R", utfstring, 0, NULL);
+ if (!translated) return NULL;
+ for (i=0;translated[i];i++) {
+ if (translated[i]&0x80) change_case = 1;
+ else change_case = 0;
+
+ translated[i]=translated[i]&0x7F;
+ if (change_case) {
+ if ((translated[i]<'Z')&&(translated[i]>'A'))
+ translated[i]=translated[i]-'A'+'a';
+ else if ((translated[i]<'z')&&(translated[i]>'a'))
+ translated[i]=translated[i]-'a'+'A';
+ }
+ }
+ *config = curconfig;
+ return translated;
+ }
+ if (!strcasecmp((*config)->language->sn, rcc_ukrainian_language_sn)) {
+ translated = rccSizedRecodeCharsets(ctx, "UTF-8", "KOI8-U", utfstring, 0, NULL);
+ if (!translated) return NULL;
+ for (i=0;translated[i];i++) {
+ if (translated[i]&0x80) change_case = 1;
+ else change_case = 0;
+
+ translated[i]=translated[i]&0x7F;
+ if (change_case) {
+ if ((translated[i]<'Z')&&(translated[i]>'A'))
+ translated[i]=translated[i]-'A'+'a';
+ else if ((translated[i]<'z')&&(translated[i]>'a'))
+ translated[i]=translated[i]-'a'+'A';
+ }
+ }
+ *config = curconfig;
+ return translated;
+ }
+
+ translated = rccSizedRecodeCharsets(ctx, "UTF-8", "US-ASCII//TRANSLIT", utfstring, 0, NULL);
+ if (translated) *config = curconfig;
+ return translated;
+ }
+
if (translate == RCC_OPTION_TRANSLATE_SKIP_RELATED) {
if (rccAreRelatedLanguages(curconfig, *config)) return NULL;
}