/*
 * Generator and self-test for packed wide-character class tables.
 * Going by the comments and usage string in the command dispatch at the
 * end of the file: a and p export the iswalpha and iswpunct table, A and P
 * export the matching dictionary, and the t variants run verify_table.
 *
 * NOTE(review): this copy has lost its line breaks and some of its text.
 * The five include directives carry no header names, and several loop
 * conditions break off in the middle of an expression. It looks as if
 * everything between a less-than sign and the following greater-than sign
 * was stripped in those places. Restore the file from its upstream source
 * before trying to build it; the comments added here describe only what
 * is still visible.
 *
 * Layout constants: PAGE_MAX is 1 shifted left by PAGE_SH (256 codepoints
 * per page), PAGES is 0x20000 divided by PAGE_MAX, and PAGEH adds 0x1000/8
 * to PAGES.
 */
#include #include #include #include #include #define PAGE_SH 8 #define PAGE_MAX (1u << PAGE_SH) #define PAGES (0x20000 / PAGE_MAX) #define PAGEH (0x1000/8 + PAGES) /* Direct map 0x1000 codepts (or less) */ static int tabnr, dictnr, matchnr, directwc; static int vflag; /* Table max PAGEH+254 words and dictionary max 128, but both may be smaller */ static union { unsigned char b[PAGEH + 254*4]; unsigned w[PAGEH/4 + 254]; } table; static unsigned short dict[128]; /* Only include headers to test against old style data, otherwise zero init */ unsigned char src_table[8192] = { #if WCTYPE_USE_OLD == 1 #include "alpha.h" #elif WCTYPE_USE_OLD == 2 #include "punct.h" #else 0 #endif }; /* getwcprop: read one bit from a two-level bitmap. src[wc/256] selects a 32-byte block, and that block holds one bit for each of the 256 codepoints of the page. Every codepoint at or above 0x20000 reports 1. */ static int getwcprop(wint_t wc, const unsigned char src[]) { if (wc<0x20000) { return (src[src[wc>>8]*32 + ((wc&255)>>3)]>>(wc&7)) & 1; } return 1; } /* read_src: build a one-byte-per-codepoint class array from the two Unicode data files, apply the fix-ups below, then zero every entry whose class differs from wanted_set. The dispatch code passes 1 for the iswalpha export and 2 for the iswpunct export. How the result is packed into src is in the part of this function that is missing from this copy. */ static void read_src(unsigned wanted_set, unsigned char src[8192]) { char *set = calloc(0x110000,1); /* NOTE(review): the calloc result is used below without a null check */ char buf[128], dummy; unsigned a, b; FILE *f1, *f2; f1 = fopen("data/UnicodeData.txt", "rb"); f2 = fopen("data/DerivedCoreProperties.txt", "rb"); if (!f1 || !f2 || wanted_set<1 || wanted_set>2) { printf("error: no Unicode data files or unknown set %d\n", wanted_set); /* NOTE(review): this return neither frees set nor closes a file that did open */ return; } /* First file: a line whose third semicolon-separated field starts with Nd marks its codepoint as class 1; any other line that parses marks it as class 2. NOTE(review): a comes straight from the input and is not checked against the 0x110000-byte size of set. */ while (fgets(buf, sizeof buf, f1)) { if (sscanf(buf, "%x;%*[^;];Nd%c", &a, &dummy)==2) set[a] = 1; else if (sscanf(buf, "%x;%*[^;];%c", &a, &dummy)==2) set[a] = 2; } fclose(f1); /* Second file: single codepoints and ranges tagged Alphabetic become class 1, overriding the class assigned above. */ while (fgets(buf, sizeof buf, f2)) { if (sscanf(buf, "%x..%x ; Alphabetic%c", &a, &b, &dummy)==3) for (; a<=b; a++) set[a]=1; else if (sscanf(buf, "%x ; Alphabetic%c", &a, &dummy)==2) set[a] = 1; } fclose(f2); /* Fix misclassified Thai characters */ set[0xe2f] = set[0xe46] = 2; /* Clear digits */ for (a=0x30; a<=0x39; a++) set[a]=0; /* Clear spaces */ set[0x0020] = 0; for (a=0x2000; a<=0x2006; a++) set[a]=0; for (a=0x2008; a<=0x200a; a++) set[a]=0; set[0x205f] = 0; set[0x3000] = 0; /* Clear controls */ for (a=0x00; a<=0x1f; a++) set[a]=0; for (a=0x7f; a<=0x9f; a++) set[a]=0; for (a=0x2028; a<=0x2029; 
a++) set[a]=0; for (a=0xfff9; a<=0xfffb; a++) set[a]=0; for (a=0xd800; a<=0xdfff; a++) set[a]=0; /* Fill in elided CJK ranges */ for (a=0x3400; a<=0x4db5; a++) set[a]=1; for (a=0x4e00; a<=0x9fcc; a++) set[a]=1; for (a=0xac00; a<=0xd7a3; a++) set[a]=1; for (a=0x20000; a<=0x2a6d6; a++) set[a]=1; for (a=0x2a700; a<=0x2b734; a++) set[a]=1; for (a=0x2b740; a<=0x2b81d; a++) set[a]=1; /* Keep only the requested class; everything else becomes 0. */ for (a=0; a<=0x10ffff; a++) if (set[a]!=wanted_set) set[a]=0; /* NOTE(review): the text is damaged from here on. The inner loop header below breaks off after its init clause, and the code that follows uses names (err, word, quad, ext, pat_low, exts, wordnr) that are never declared in the visible text and returns integer codes, so it presumably belongs to encode_table, which the dispatch code calls with a pattern byte and src_table. The end of read_src and the start of that function are missing. */ int i=0, p=0, step=256; for (a=0; a<0x20000; a+=step, i++) { for (b=0; b=128) err=1, printf("error: data doesn't fit constraints\n"); if (word<=0x3fff || word>=0xc000) { unsigned upper=word & 0xff80; if (upper==0 || upper==0xff80) quad=2, ext=word; else if ((word&0xff)==pat_low) quad=(pat_low&1), ext=(word>>7) & 0xfe; } if (quad==(~pat_low&1)) { for (j=0; j=2 ? 3:2) << (i*2 % (align*8)); exts[e++]=(ext&0xff) | (quad&1); if (vflag) printf("0x%04x [%02x] ", word, exts[e-1]); } wordnr[p]=e; if (vflag) printf("\ngenerated U+%05x: extnr %2d dictnr %3d total %d\n", wc, e, dictnr, 4+e+(dictnr-d)*2); } /* NOTE(review): the loop header below is also cut short. */ if (!err) for (p=start, tabnr=2*align; pPAGES) threshold++, q=start-1; if (threshold>align) { printf("/* success: all odd pages matched to even < 0x%03x */\n", p); gap=(align-wordnr[p] % align) % align; matchnr = matchnr ? matchnr:p; q=-1; break; } } /* Write data into table */ if (q>=0 && (wordnr[p]+wordnr[q]+gap) % align || tabnr % align) { printf("error: data alignment corrupted at %d:%d %d:%d %d [%d]\n", p, q, wordnr[p], q>=0 ? wordnr[q]:0, gap, tabnr); return 2; } table.b[p]=tabnr/align; table.w[h+tabnr/align]=words[p][0]; tabnr += 4; memcpy(&table.w[h + tabnr/align], words[p]+1, wordnr[p]); tabnr += wordnr[p]+gap; if (q>=0) { for (j=0; j=254*align || h*align+tabnr>=sizeof table.b) { printf("error: data too big or resize table to fit data\n"); return 3; } if (vflag) printf("inserted U+%05x: tabnr %d wordnr %d:%d, %d:%d gap %d\n", p * PAGE_MAX, tabnr, p, q, wordnr[p], q>=0 ? 
wordnr[q]:0, gap); } return err; } /* verify_table: for every codepoint below 0x20000, compare getwcprop on src with the C library iswalpha (wanted_set at most 1) or iswpunct (otherwise). Beginning at the first mismatch, up to 20 consecutive codepoints are printed together with their page, bitmap and lane coordinates. A summary line with the number of mismatches is printed at the end. */ void verify_table(int wanted_set, const unsigned char src[]) { unsigned wc; unsigned log=0, fails=0; for (wc=0x0; wc<0x20000; wc++) { int exp=getwcprop(wc, src); int got=wanted_set<=1 ? iswalpha(wc):iswpunct(wc); /* log flips to 1 on the first mismatch and then counts printed lines */ if (exp!=!!got) log+=!log, fails++; if (log && log++<=20) { unsigned direct, page, bmap, shfr, lane; unsigned target; /* NOTE(review): the label compares exp with the raw library result, while the mismatch count above normalises it with a double negation; a nonzero result other than 1 would be labelled FAIL without being counted. Confirm which is intended. */ char *msg = (exp == got) ? "PASS":"FAIL"; direct = wc < directwc; page = (wc >> PAGE_SH); bmap = (wc >> 3) + PAGES & -direct; target = wc & (PAGE_MAX-1); shfr = (target & 15); lane = (target >> 4); printf("/* %s U+%05X: exp %d got %d page %03x bmap %03x " "target 0x%03x shfr %2d lane %2d */\n", msg, wc, exp, got, page, bmap, target, shfr, lane); } } printf("/* wctype: %u codepoints verified, %u errors */\n", wc, fails); } /* export_table: prints a banner with the table dimensions and then, presumably, the table contents. NOTE(review): only the banner is intact. The loop header after it is cut short, and the text then jumps into the command dispatch of what is presumably main. export_dict is called there but no definition of it is visible. */ void export_table() { unsigned align=4; unsigned start=directwc/PAGE_MAX; unsigned h=PAGEH/align-2, b=h*align; int p, q, j, beg, end; printf("/* wctype: table %d x %d codepoints */\n", PAGES, PAGE_MAX); for (p=0; p=PAGES) end=tabnr/align + 1; /* +1 is for guard word at end */ end -= (p='a' && *cmd=='t') read_src(1 + (opt=='p'), src_table); verify_table(1 + (opt=='p'||opt=='P'), src_table); } else if (cmd && (*cmd | 32)=='a') { /* Export table (a) or dictionary (A) for iswalpha */ directwc=0x1000; read_src(1, src_table); encode_table(0xff, src_table); *cmd == 'a' ? export_table():export_dict(); } else if (cmd && (*cmd | 32)=='p') { /* Export table (p) or dictionary (P) for iswpunct */ directwc=0x800; read_src(2, src_table); encode_table(0x00, src_table); *cmd == 'p' ? export_table():export_dict(); } else { printf("usage: %s [-v] a|p|A|P|t|ta|tp|Ta|Tp\n", arg0); } return 0; }