diff options
author | Suren A. Chilingaryan <csa@dside.dyndns.org> | 2005-06-16 23:19:27 +0000 |
---|---|---|
committer | Suren A. Chilingaryan <csa@dside.dyndns.org> | 2005-06-16 23:19:27 +0000 |
commit | 70fbe7822024d0acc68df3607ff25bf8d7a71751 (patch) | |
tree | 553cd2ef8cfc936fc890113596db2c4478fe5163 /statgen | |
download | librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.gz librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.bz2 librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.tar.xz librcd-70fbe7822024d0acc68df3607ff25bf8d7a71751.zip |
initial import
(automatically generated log message)
Diffstat (limited to 'statgen')
-rw-r--r-- | statgen/Makefile | 10 | ||||
-rw-r--r-- | statgen/ascii.c | 11 | ||||
-rw-r--r-- | statgen/debug.c | 33 | ||||
-rwxr-xr-x | statgen/doit | 42 | ||||
-rw-r--r-- | statgen/generate.c | 258 | ||||
-rw-r--r-- | statgen/test.c | 84 | ||||
-rwxr-xr-x | statgen/traslations | 12 | ||||
-rw-r--r-- | statgen/upper.c | 75 |
8 files changed, 525 insertions, 0 deletions
diff --git a/statgen/Makefile b/statgen/Makefile
new file mode 100644
index 0000000..76251bd
--- /dev/null
+++ b/statgen/Makefile
@@ -0,0 +1,16 @@
+# Helper tools for generating and checking the Russian charset statistics tables.
+.PHONY: all
+all: generate ascii test upper debug
+
+# Libraries must follow the sources that use them: the linker resolves symbols
+# left to right, so a leading -lm can leave log10() unresolved.
+generate: generate.c
+	gcc -o generate generate.c -lm
+ascii: ascii.c
+	gcc -o ascii ascii.c
+test: test.c charset_auto_russian.h russian_table.h
+	gcc -o test test.c
+debug: debug.c charset_auto_russian.h russian_table.h
+	gcc -o debug debug.c
+upper: upper.c
+	gcc -o upper upper.c
\ No newline at end of file
diff --git a/statgen/ascii.c b/statgen/ascii.c
new file mode 100644
index 0000000..73dd6e4
--- /dev/null
+++ b/statgen/ascii.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+/*
+ * Prints a table of the character codes 32..255: decimal code, hexadecimal
+ * code and the raw character itself, eight entries per row.
+ */
+int main(void) {
+    int i;
+
+    for (i=32;i<256;i++) {
+        if ((i%8)==0) printf("\n");
+        /* %u and %x take unsigned arguments */
+        printf("%3.u %2.x %c ",(unsigned)i,(unsigned)i,i);
+    }
+    printf("\n\n");
+    return 0;
+}
diff --git a/statgen/debug.c b/statgen/debug.c
new file mode 100644
index 0000000..85b950b
--- /dev/null
+++ b/statgen/debug.c
@@ -0,0 +1,46 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#define _AUTO_DEBUG
+#include "charset_auto_russian.h"
+
+/* Returns non-zero for punctuation that is stripped from the end of a word. */
+static int is_trailing_punct(char ch) {
+    return ch==','||ch=='.'||ch=='!'||ch=='?'||ch==';'||ch=='-'||ch==':'||ch=='"'||ch=='\''||ch==')';
+}
+
+/*
+ * Reads whitespace-separated words from the file named in argv[1], strips
+ * leading quotes/brackets and trailing punctuation, and passes every word
+ * whose last kept character is at index 5 or later to
+ * autocharset_russian_uc(), which is compiled with _AUTO_DEBUG defined.
+ */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    int len,st;
+    char word[256];
+
+    if (argc!=2) {
+        printf("Usage: %s <file name>\n",argv[0]);
+        exit(0);
+    }
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+        printf("Failed to open specified file. Check permissions!\n");
+        exit(1);
+    }
+
+    /* bounded read; stopping on a failed read avoids handling the last word twice */
+    while (fscanf(f,"%255s",word)==1) {
+        for (st=0;word[st]=='"'||word[st]=='\''||word[st]=='(';st++);
+        /* len>=0 keeps an all-punctuation word from indexing before the buffer */
+        for (len=(int)strlen(word)-1;len>=0&&is_trailing_punct(word[len]);len--);
+        if (len<5||st>len) continue;
+        word[len+1]=0;
+
+        autocharset_russian_uc(word+st,len+1-st);
+    }
+    fclose(f);
+    return 0;
+}
diff --git a/statgen/doit b/statgen/doit
new file mode 100755
index 0000000..c38fe39
--- /dev/null
+++ b/statgen/doit
@@ -0,0 +1,44 @@
+#! /bin/bash
+# Builds russian_table.h from a CP866 text. The text is recoded to KOI8-R,
+# ./generate is run once per target encoding (koi, win, alt), and the
+# table-size header is appended when all three runs report the same sizes.
+
+if [ -z "$1" ]; then
+    echo "Usage: doit <file name>"
+    exit
+fi
+
+# In some CP866 texts the "Yo" and "N" symbols from the CP1251 encoding are used. This fixes it.
+fixup="s/¸/ñ/g;s/¹/N/g;s/°/ø/g"
+
+dos2unix -U "$1"
+
+sed -e "$fixup" "$1" | iconv -f CP866 -t KOI8-R > "$1.koi"
+sed -e "$fixup" "$1" | iconv -f CP866 -t CP1251 > "$1.win"
+sed -e "$fixup" "$1" > "$1.alt"
+
+#sed -e "s/¸/ñ/g;s/¹/ü/g;s/°/ø/g" "$1" | iconv -f CP866 -t UTF-8 > "$1.utf"
+
+# generate reads the KOI8-R text and recodes it from the locale's charset itself.
+export LC_CTYPE="ru_RU.KOI8-R"
+./generate "$1.koi" koi > russian_table.h 2> header1.tmp
+./generate "$1.koi" win >> russian_table.h 2> header2.tmp
+./generate "$1.koi" alt >> russian_table.h 2> header3.tmp
+#./generate "$1.win" win >> russian_table.h 2> header2.tmp
+#./generate "$1" alt >> russian_table.h 2> header3.tmp
+
+if ! cmp header1.tmp header2.tmp; then
+    echo "Different number items in win & koi tables. Strange..."
+    rm -f russian_table.h
+elif ! cmp header1.tmp header3.tmp; then
+    echo "Different number items in alt & koi tables. Strange..."
+    rm -f russian_table.h
+else
+    cat header1.tmp >> russian_table.h
+fi
+
+rm -f header?.tmp
+rm -f "$1.koi"
+#rm -f "$1.win"
+#rm -f "$1.alt"
+#rm -f "$1.utf"
diff --git a/statgen/generate.c b/statgen/generate.c
new file mode 100644
index 0000000..838c0a7
--- /dev/null
+++ b/statgen/generate.c
@@ -0,0 +1,304 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <locale.h>
+#include <math.h>
+#include <iconv.h>
+#include <langinfo.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/* Byte values that get a row and a column in the statistics table. */
+#define first_char 128
+#define last_char 255
+
+/* Input byte values (in the locale's charset) that are counted. */
+#define original_first_char 192
+#define original_last_char 255
+
+#define chars_number (last_char-first_char+1)
+#define array_size (chars_number*chars_number)
+
+/*
+ * Table indexes of one character pair in its four case combinations
+ * (first letter lower/upper, second letter lower/upper). -1 marks a
+ * combination that is unavailable because a letter has no distinct
+ * upper-case form.
+ */
+struct array_pos {
+    int ll;
+    int uu;
+    int lu;
+    int ul;
+};
+
+/* Counters of one pair: p - inside a word, s - at its start, e - at its end. */
+struct pstat {
+    unsigned long p;
+    unsigned long s;
+    unsigned long e;
+};
+
+/* Converter from the locale's charset to the target one; (iconv_t)-1 when none is open. */
+iconv_t icnv=(iconv_t)-1;
+
+/* Returns 1 if ch terminates a word (whitespace, NUL or closing punctuation), 0 otherwise. */
+int end_symbol(char ch) {
+    switch (ch) {
+    case '\r': case '\n': case 0: case ' ': case '\t':
+    case ',': case '.': case '!': case '?': case ';':
+    case '-': case ':': case '"': case '\'': case ')':
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+/* Returns 1 if ch may directly precede a word (whitespace, quote or opening bracket), 0 otherwise. */
+int start_symbol(char ch) {
+    switch (ch) {
+    case '\t': case '\r': case '\n': case ' ':
+    case '(': case '"': case '\'':
+        return 1;
+    default:
+        return 0;
+    }
+}
+
+/*
+ * Converts one byte from the locale's charset to the target charset.
+ * Returns c unchanged when no converter is open and terminates the
+ * program when the byte cannot be converted.
+ */
+unsigned char convert_char(unsigned char c) {
+    char in=(char)c, out=0;
+    char *pin=&in, *pout=&out;
+    size_t inleft=1, outleft=1;
+
+    if (icnv == (iconv_t)-1) return c;
+    /* iconv() returns a size_t, so failure is (size_t)-1 and never "below zero" */
+    if (iconv(icnv,&pin,&inleft,&pout,&outleft)==(size_t)-1) {
+        printf("Error converting characters!\n");
+        exit(1);
+    }
+    return (unsigned char)out;
+}
+
+/*
+ * Computes the table indexes of the pair (a,b) for its case combinations.
+ * Both bytes must lie in original_first_char..original_last_char; case
+ * variants outside of that range are replaced by the byte itself.
+ * Returns 0 on success and -1 when the pair is not counted.
+ */
+int get_array_pos(struct array_pos *pos, int a, int b) {
+    int la,ua,lb,ub;
+
+    if ((a<original_first_char)||(a>original_last_char)) return -1;
+    if ((b<original_first_char)||(b>original_last_char)) return -1;
+
+    la=tolower(a);
+    ua=toupper(a);
+    lb=tolower(b);
+    ub=toupper(b);
+
+    if ((la<original_first_char)||(la>original_last_char)) la=a;
+    if ((lb<original_first_char)||(lb>original_last_char)) lb=b;
+    if ((ua<original_first_char)||(ua>original_last_char)) ua=a;
+    if ((ub<original_first_char)||(ub>original_last_char)) ub=b;
+
+    la=convert_char(la);
+    ua=convert_char(ua);
+    lb=convert_char(lb);
+    ub=convert_char(ub);
+
+    /* a converted byte below first_char has no cell in the table */
+    if ((la<first_char)||(ua<first_char)||(lb<first_char)||(ub<first_char)) return -1;
+
+    pos->ll=(la-first_char)*chars_number+(lb-first_char);
+    pos->ul=(la!=ua)?(ua-first_char)*chars_number+(lb-first_char):-1;
+    pos->lu=(lb!=ub)?(la-first_char)*chars_number+(ub-first_char):-1;
+    pos->uu=((lb!=ub)&&(la!=ua))?(ua-first_char)*chars_number+(ub-first_char):-1;
+    return 0;
+}
+
+/* Adds one occurrence to the counter of table cell idx that matches the pair's place in its word. */
+static void count_pair(struct pstat *a, int idx, int at_start, int at_end) {
+    if (idx<0) return;
+    if (at_start) a[idx].s++;
+    else if (at_end) a[idx].e++;
+    else a[idx].p++;
+}
+
+/*
+ * Counts all adjacent byte pairs of text[0..length-1].
+ * Returns a malloc()ed table of array_size counters that the caller
+ * must free(), or NULL when the allocation fails.
+ */
+struct pstat *analyze(const unsigned char *text, unsigned long length) {
+    struct pstat *a;
+    unsigned long i;
+    struct array_pos pos;
+    int at_start,at_end;
+
+    a=calloc(array_size,sizeof *a);
+    if (!a) return NULL;
+
+    for (i=1;i<length;i++) {
+        if (get_array_pos(&pos,text[i-1],text[i])<0) continue;
+
+        at_start=(i==1)||start_symbol(text[i-2]);
+        /* i+2>=length also covers the final pair, which has no following byte to inspect */
+        at_end=(i+2>=length)||end_symbol(text[i+1]);
+
+        count_pair(a,pos.ll,at_start,at_end);
+        count_pair(a,pos.ul,at_start,at_end);
+        /* pos.lu (lower-case first, upper-case second letter) is not counted */
+        count_pair(a,pos.uu,at_start,at_end);
+    }
+    return a;
+}
+
+/*
+ * Prints every pair with a non-zero counter as "{'a','b',p,s,e}", where
+ * p, s and e are log10 of the counters (-2 for a zero counter), eight
+ * entries per line. Returns the number of entries printed.
+ */
+int print(struct pstat *a) {
+    int i,j,k,n;
+
+    for (i=first_char,k=0,n=0;i<=last_char;i++)
+        for (j=first_char;j<=last_char;j++,k++) {
+            if (!a[k].p&&!a[k].s&&!a[k].e) continue;
+
+            if ((n)&&(n%8==0)) printf(",\n");
+            else if (n) printf(", ");
+            printf("{'%c','%c',%lf,%lf,%lf}",i,j,
+                   a[k].p?log10(a[k].p):-2,
+                   a[k].s?log10(a[k].s):-2,
+                   a[k].e?log10(a[k].e):-2);
+            n++;
+        }
+    if ((n%8)!=1) printf("\n");
+    return n;
+}
+
+/* Returns the smallest power of two that is greater than n (at least 2). */
+unsigned long npow(unsigned long n) {
+    unsigned long res=2;
+    while (res<=n) res*=2;
+    return res;
+}
+
+/* Closes the charset converter if one was opened. */
+static void release_iconv(void) {
+    if (icnv!=(iconv_t)-1) {
+        iconv_close(icnv);
+        icnv=(iconv_t)-1;
+    }
+}
+
+/*
+ * Usage: generate <file name> <encoding>
+ * Writes the pair statistics of the file to stdout as a C table named
+ * enc_<encoding> and the table-size definitions to stderr.
+ */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    struct stat st;
+    unsigned char *text;
+    size_t size;
+    struct pstat *a;
+    int num;
+    char locale[32];
+    const char *codeset;
+
+    if (argc!=3) {
+        printf("Usage: %s <file name> <encoding>\n",argv[0]);
+        exit(0);
+    }
+
+    /* keeps the name well within locale[] */
+    if (strlen(argv[2])>12) {
+        printf("Invalid encoding(%s) specified!\n",argv[2]);
+        exit(1);
+    }
+
+    if ((!strcasecmp(argv[2],"koi"))||(!strcasecmp(argv[2],"koi8"))||(!strcasecmp(argv[2],"koi-8"))||(!strcasecmp(argv[2],"koi8-r")))
+        snprintf(locale,sizeof locale,"%s","KOI8-R");
+    else if ((!strcasecmp(argv[2],"win"))||(!strcasecmp(argv[2],"cp1251"))||(!strcasecmp(argv[2],"cp-1251"))||(!strcasecmp(argv[2],"win1251"))||(!strcasecmp(argv[2],"win-1251")))
+        snprintf(locale,sizeof locale,"%s","CP1251");
+    else if ((!strcasecmp(argv[2],"alt"))||(!strcasecmp(argv[2],"cp866"))||(!strcasecmp(argv[2],"cp-866"))||(!strcasecmp(argv[2],"ibm866"))||(!strcasecmp(argv[2],"ibm-866")))
+        snprintf(locale,sizeof locale,"%s","IBM866");
+    else
+        snprintf(locale,sizeof locale,"%s",argv[2]);
+
+    if (!setlocale(LC_CTYPE,"")) {
+        printf("Can't set locale!\n");
+        exit(1);
+    }
+
+    /* a converter is needed only when the target differs from the locale's charset */
+    codeset=nl_langinfo(CODESET);
+    if (strcmp(locale,codeset)) {
+        icnv=iconv_open(locale,codeset);
+        /* iconv_open() signals failure with (iconv_t)-1 */
+        if (icnv==(iconv_t)-1) {
+            printf("Can't initialize iconv!\n");
+            exit(1);
+        }
+    }
+
+    if (stat(argv[1],&st)) {
+        printf("Specified file can't be stated!\n");
+        release_iconv();
+        exit(1);
+    }
+
+    if (!S_ISREG(st.st_mode)) {
+        printf("Specified file isn't regular file!\n");
+        release_iconv();
+        exit(1);
+    }
+
+    size=(size_t)st.st_size;
+    text=malloc(size);
+    if (!text) {
+        printf("Can't allocate %lu bytes of memory!\n",(unsigned long)size);
+        release_iconv();
+        exit(1);
+    }
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+        printf("Failed to open specified file. Check permissions!\n");
+        free(text);
+        release_iconv();
+        exit(1);
+    }
+    if (fread(text,1,size,f)!=size) {
+        printf("Problem reading specified file!\n");
+        free(text);
+        fclose(f);
+        release_iconv();
+        exit(1);
+    }
+    fclose(f);
+
+    a=analyze(text,size);
+    if (a) {
+        printf("static const lng_stat2 enc_%s[]={\n",argv[2]);
+        num=print(a);
+        printf("};\n\n");
+        free(a);
+        /* num is an int, so it is printed with %d */
+        fprintf(stderr,"static unsigned int indexes2=%d;\n",num);
+        fprintf(stderr,"static unsigned int npow2=%lu;\n",npow(num));
+    } else printf("Failed to allocate %lu bytes of memory!\n",(unsigned long)(array_size*sizeof(struct pstat)));
+
+    free(text);
+    release_iconv();
+    return 0;
+}
diff --git a/statgen/test.c b/statgen/test.c
new file mode 100644
index 0000000..936b491
--- /dev/null
+++ b/statgen/test.c
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "charset_auto_russian.h"
+
+/* Returns non-zero for punctuation that is stripped from the end of a word. */
+static int is_trailing_punct(char ch) {
+    return ch==','||ch=='.'||ch=='!'||ch=='?'||ch==';'||ch=='-'||ch==':'||ch=='"'||ch=='\''||ch==')';
+}
+
+/*
+ * Reads up to max_words words from f, strips leading quotes/brackets and
+ * trailing punctuation from each of them and joins them with single
+ * spaces in phrase, which has room for size bytes.
+ * Returns 0 when no further word could be read and 1 otherwise.
+ */
+static int read_phrase(FILE *f, int max_words, char *phrase, size_t size) {
+    char word[256];
+    size_t used=0,n;
+    int i,st,len,got=0;
+
+    phrase[0]=0;
+    for (i=0;i<max_words;i++) {
+        /* bounded read; a failed read ends the phrase instead of reusing the previous word */
+        if (fscanf(f,"%255s",word)!=1) break;
+        got=1;
+        if (i&&(used+1<size)) {
+            phrase[used++]=' ';
+            phrase[used]=0;
+        }
+        for (st=0;word[st]=='"'||word[st]=='\''||word[st]=='(';st++);
+        /* len>=0 keeps an all-punctuation word from indexing before the buffer */
+        for (len=(int)strlen(word)-1;len>=0&&is_trailing_punct(word[len]);len--);
+        if (st>len) continue;
+        n=(size_t)(len+1-st);
+        /* stop before the phrase buffer would overflow */
+        if (used+n>=size) break;
+        memcpy(phrase+used,word+st,n);
+        used+=n;
+        phrase[used]=0;
+    }
+    return got;
+}
+
+/*
+ * Usage: test <file name> [<max words>]
+ * Pass 1 classifies every phrase of at least 5 bytes with
+ * autocharset_russian_uc() and counts the results (0 - Win, 1 - Koi,
+ * 2 - UTF, 3 - Alt). Pass 2 prints the phrases whose result differs
+ * from the most frequent one.
+ */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    char phrase[8192];
+    unsigned long a[4]={0,0,0,0};
+    int i,max,mw;
+
+    if ((argc!=2)&&(argc!=3)) {
+        printf("Usage: %s <file name> [<max words>]\n",argv[0]);
+        exit(0);
+    }
+
+    if (argc==3) mw=atoi(argv[2]);
+    else mw=1;
+    /* a word count below 1 would never consume any input */
+    if (mw<1) mw=1;
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+        printf("Failed to open specified file. Check permissions!\n");
+        exit(1);
+    }
+
+    while (read_phrase(f,mw,phrase,sizeof phrase)) {
+        if (strlen(phrase)<5) continue;
+
+        i=autocharset_russian_uc(phrase,strlen(phrase));
+        /* NOTE(review): results are assumed to be 0..3 - confirm against charset_auto_russian.h */
+        if ((i>=0)&&(i<4)) a[i]++;
+    }
+
+    printf("Win: %lu, Koi: %lu, Alt: %lu, UTF: %lu\n",a[0],a[1],a[3],a[2]);
+    if (a[0]>a[1]) {
+        if (a[0]>a[2]) max=0;
+        else max=2;
+    } else {
+        if (a[1]>a[2]) max=1;
+        else max=2;
+    }
+    /* compare the counters; max itself is only an index */
+    if (a[3]>a[max]) max=3;
+
+    rewind(f);
+    while (read_phrase(f,mw,phrase,sizeof phrase)) {
+        if (strlen(phrase)<5) continue;
+
+        i=autocharset_russian_uc(phrase,strlen(phrase));
+        if (i!=max) {
+            if (i==0) printf("Win: %s\n",phrase);
+            else if (i==1) printf("Koi: %s\n",phrase);
+            else if (i==2) printf("UTF: %s\n",phrase);
+            else if (i==3) printf("ALT: %s\n",phrase);
+        }
+    }
+    fclose(f);
+    return 0;
+}
diff --git a/statgen/traslations b/statgen/traslations
new file mode 100755
index 0000000..630735a
--- /dev/null
+++ b/statgen/traslations
@@ -0,0 +1,14 @@
+#! /bin/bash
+# Converts a CP866 text into KOI8-R, CP1251 and UTF-8 copies (<file>.koi, <file>.win, <file>.utf).
+
+if [ -z "$1" ]; then
+    echo "Usage: $(basename "$0") <file name>"
+    exit
+fi
+
+# In some CP866 texts the "Yo" and "N" symbols from the CP1251 encoding are used. This fixes it.
+# Only the KOI8-R copy replaces the second one with a plain N.
+dos2unix -U "$1"
+sed -e "s/¸/ñ/g;s/¹/N/g;s/°/ø/g" "$1" | iconv -f CP866 -t KOI8-R > "$1.koi"
+sed -e "s/¸/ñ/g;s/¹/ü/g;s/°/ø/g" "$1" | iconv -f CP866 -t CP1251 > "$1.win"
+sed -e "s/¸/ñ/g;s/¹/ü/g;s/°/ø/g" "$1" | iconv -f CP866 -t UTF-8 > "$1.utf"
diff --git a/statgen/upper.c b/statgen/upper.c
new file mode 100644
index 0000000..be1a01c
--- /dev/null
+++ b/statgen/upper.c
@@ -0,0 +1,96 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <locale.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/*
+ * Usage: upper <file name> <encoding>
+ * Upper-cases every byte of the file with toupper() under the locale
+ * ru_RU.<encoding> and writes the result to UPPED.OUT in the current
+ * directory.
+ */
+int main(int argc, char *argv[]) {
+    FILE *f;
+    struct stat st;
+    unsigned char *text;
+    size_t size,i;
+    char locale[32];
+
+    if (argc!=3) {
+        printf("Usage: %s <file name> <encoding>\n",argv[0]);
+        exit(0);
+    }
+
+    /* keeps "ru_RU." plus the name well within locale[] */
+    if (strlen(argv[2])>12) {
+        printf("Invalid encoding(%s) specified!\n",argv[2]);
+        exit(1);
+    }
+
+    if ((!strcasecmp(argv[2],"koi"))||(!strcasecmp(argv[2],"koi8"))||(!strcasecmp(argv[2],"koi-8"))||(!strcasecmp(argv[2],"koi8-r")))
+        snprintf(locale,sizeof locale,"ru_RU.%s","KOI8-R");
+    else if ((!strcasecmp(argv[2],"win"))||(!strcasecmp(argv[2],"cp1251"))||(!strcasecmp(argv[2],"cp-1251"))||(!strcasecmp(argv[2],"win1251"))||(!strcasecmp(argv[2],"win-1251")))
+        snprintf(locale,sizeof locale,"ru_RU.%s","CP1251");
+    else
+        snprintf(locale,sizeof locale,"ru_RU.%s",argv[2]);
+    if (!setlocale(LC_CTYPE,locale)) {
+        printf("Can't set locale %s!\n",argv[2]);
+        exit(1);
+    }
+
+    if (stat(argv[1],&st)) {
+        printf("Specified file can't be stated!\n");
+        exit(1);
+    }
+
+    if (!S_ISREG(st.st_mode)) {
+        printf("Specified file isn't regular file!\n");
+        exit(1);
+    }
+
+    size=(size_t)st.st_size;
+    text=malloc(size);
+    if (!text) {
+        printf("Can't allocate %lu bytes of memory!\n",(unsigned long)size);
+        exit(1);
+    }
+
+    f=fopen(argv[1],"r");
+    if (!f) {
+        printf("Failed to open specified file. Check permissions!\n");
+        free(text);
+        exit(1);
+    }
+    if (fread(text,1,size,f)!=size) {
+        printf("Problem reading specified file!\n");
+        free(text);
+        fclose(f);
+        exit(1);
+    }
+    fclose(f);
+
+    for (i=0;i<size;i++)
+        text[i]=toupper(text[i]);
+
+    f=fopen("UPPED.OUT","w");
+    /* the output file may be impossible to create, e.g. in a read-only directory */
+    if (!f) {
+        printf("Failed to create UPPED.OUT!\n");
+        free(text);
+        exit(1);
+    }
+    if (fwrite(text,1,size,f)!=size) {
+        printf("Problem writing UPPED.OUT!\n");
+        free(text);
+        fclose(f);
+        exit(1);
+    }
+    fclose(f);
+    free(text);
+    return 0;
+}
|