/* ******************************************************************************* * * Copyright (C) 2000, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: rptp2ucm.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2001feb16 * created by: Markus W. Scherer * * This tool reads two CDRA conversion table files (RPMAP & TPMAP or RXMAP and TXMAP) and * generates a canonicalized ICU .ucm file from them. * If the RPMAP/RXMAP file does not contain a comment line with the substitution character, * then this tool also attempts to read the header of the corresponding UPMAP/UXMAP file * to extract subchar and subchar1. * * R*MAP: Unicode->codepage * T*MAP: codepage->Unicode * * To compile, just call a C compiler/linker with this source file. * On Windows: cl rptp2ucm.c */ #include #include #include typedef struct UCMSubchar { const char *name; unsigned long subchar, subchar1; } UCMSubchar; static const UCMSubchar knownSubchars[]={ "274_P100", 0x3f, 0, "850_P100", 0x7f, 0, "913_P100", 0x1a, 0, "1047_P100", 0x3f, 0 }; typedef struct CCSIDStateTable { unsigned int ccsid; const char *table; } CCSIDStateTable; static const CCSIDStateTable knownStateTables[]={ 367, " 0-7f\n", 941, " 0-80:2, 81-fe:1, ff:2\n" " 40-7e, 80-fc\n" "\n", 942, " 0-80, 81-9f:1, a0-df, e0-fc:1, fd-ff\n" " 40-7e, 80-fc\n", 943, " 0-7f, 81-9f:1, a0-df, e0-fc:1\n" " 40-7e, 80-fc\n", 944, " 0-80, 81-bf:1, c0-ff\n" " 40-7e, 80-fe\n", 949, " 0-84, 8f-fe:1\n" " 40-7e, 80-fe\n", 950, " 0-7f, 81-fe:1\n" " 40-7e, 81-fe\n", 964, " 0-8d, 8e:2, 90-9f, a1-fe:1, aa-c1:5, c3:5, fe:5\n" " a1-fe\n" " a1-b0:3, a1:4, a2:8, a3-ab:4, ac:7, ad:6, ae-b0:4\n" " a1-fe:1\n" " a1-fe:5\n" " a1-fe.u\n" " a1-a4:1, a5-fe:5\n" " a1-e2:1, e3-fe:5\n" " a1-f2:1, f3-fe:5\n", 970, " 0-9f, a1-fe:1\n" " a1-fe\n", 1363, " 0-7f, 81-fe:1\n" " 40-7e, 80-fe\n", 1370, " 0-80, 81-fe:1\n" " 40-7e, 81-fe\n", 1383, " 0-9f, a1-fe:1\n" " a1-fe\n", 1386, " 0-7f, 81-fe:1\n" " 40-7e, 80-fe\n", 5050, " 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n" " a1-fe\n" " a1-e4\n" " a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n" " a1-fe.u\n", 21427, " 0-80:2, 81-fe:1, ff:2\n" " 40-7e, 80-fe\n" "\n", 33722, " 0-8d, 8e:2, 8f:3, 90-9f, a1-fe:1\n" " a1-fe\n" " a1-e4\n" " a1-fe:1, a1:4, a3-af:4, b6:4, d6:4, da-db:4, ed-f2:4\n" " a1-fe.u\n" }; typedef struct Mapping { /* * u bits: * 31..24 fallback indicator * 0 roundtrip * 1 Unicode->codepage * 3 codepage->Unicode * 23.. 0 Unicode code point * * b: codepage bytes with leading zeroes */ unsigned long u, b; } Mapping; #define MAX_MAPPINGS_COUNT 200000 static Mapping fromUMappings[MAX_MAPPINGS_COUNT], toUMappings[MAX_MAPPINGS_COUNT]; static long fromUMappingsTop, toUMappingsTop; static unsigned long subchar, subchar1; static unsigned int ccsid; enum { ASCII, EBCDIC, UNKNOWN }; static char minCharLength, maxCharLength, charsetFamily, usesPUA, variantLF, variantASCII, variantControls, variantSUB; static void init() { fromUMappingsTop=toUMappingsTop=0; subchar=subchar1=0; ccsid=0; minCharLength=4; maxCharLength=0; charsetFamily=UNKNOWN; usesPUA=0; variantLF=0; variantASCII=0; variantControls=0; variantSUB=0; } /* lexically compare Mappings for sorting */ static int compareMappings(const void *left, const void *right) { const Mapping *l=(const Mapping *)left, *r=(const Mapping *)right; long result; /* the code points use fewer than 32 bits, just cast them to signed values and subtract */ result=(long)(l->u&0xffffff)-(long)(r->u&0xffffff); if(result!=0) { /* shift right 16 with sign-extend to take care of int possibly being 16 bits wide */ return (int)(result>>16)|1; } /* the b fields may use all 32 bits as unsigned long, so result=(long)(l->b-r->b) would not work (try l->b=0x80000000 and r->b=1) */ if(l->bb) { return -1; } else if(l->b>r->b) { return 1; } return (int)(l->u>>24)-(int)(r->u>>24); } static const char * skipWhitespace(const char *s) { while(*s==' ' || *s=='\t') { ++s; } return s; } static long parseMappings(FILE *f, Mapping *mappings) { char line[200]; Mapping *oldMappings; char *s, *end; long mappingsTop=0; oldMappings=mappings; while(fgets(line, sizeof(line), f)!=NULL) { s=(char *)skipWhitespace(line); /* skip empty lines */ if(*s==0 || *s=='\n' || *s=='\r') { continue; } /* explicit end of table */ if(memcmp(s, "END CHARMAP", 11)==0) { break; } /* comment lines, parse substitution characters, otherwise skip them */ if(*s=='#' || *s=='*') { /* get subchar1 */ s=strstr(line, "for U+00xx"); if(s!=NULL) { s=strstr(line, "x'"); if(s!=NULL) { s+=2; subchar1=strtoul(s, &end, 16); if(end!=s+2 || *end!='\'') { fprintf(stderr, "error parsing subchar1 from \"%s\"\n", line); exit(2); } continue; } else { fprintf(stderr, "error finding subchar1 on \"%s\"\n", line); exit(2); } } /* get subchar */ s=strstr(line, "for U+xxxx"); if(s!=NULL) { s=strstr(line, "x'"); if(s!=NULL) { s+=2; subchar=strtoul(s, &end, 16); if(endb=strtoul(s, &end, 16); if(s==end || (*end!=' ' && *end!='\t')) { if((s+1)==end && *end=='-' && (mappings->b<=3)) { /* this is a special EUC format where the code set number prepends the bytes */ unsigned long prefix; switch(mappings->b) { case 0: prefix=0; break; case 1: prefix=0; break; case 2: prefix=0x8e; break; case 3: prefix=0x8f; break; default: /* never occurs because of above check */ break; } s+=2; mappings->b=strtoul(s, &end, 16); if(s==end || ((end-s)&1) || (*end!=' ' && *end!='\t')) { fprintf(stderr, "error parsing EUC codepage bytes on \"%s\"\n", line); exit(2); } mappings->b|=prefix<<(4*(end-s)); } else { fprintf(stderr, "error parsing codepage bytes on \"%s\"\n", line); exit(2); } } s=(char *)skipWhitespace(end); mappings->u=strtoul(s, &end, 16); if(s==end || (*end!=' ' && *end!='\t' && *end!='\n' && *end!='\r' && *end!=0)) { if(strncmp(s, "????", 4)==0 || strstr(s, "UNASSIGNED")!=NULL) { /* this is a non-entry, do not add it to the mapping table */ continue; } fprintf(stderr, "error parsing Unicode code point on \"%s\"\n", line); exit(2); } ++mappings; if(++mappingsTop>=MAX_MAPPINGS_COUNT) { fprintf(stderr, "error: too many mappings at \"%s\"\n", line); exit(2); } } /* sort the mappings */ qsort(oldMappings, mappingsTop, sizeof(Mapping), compareMappings); return mappingsTop; } /* merge the mappings into fromUMappings and add fallback indicator values to Mapping.u bits 31..24 */ static void mergeMappings() { long fromUIndex, toUIndex, newFromUMappingsTop=fromUMappingsTop; int cmp; fromUIndex=toUIndex=0; while(fromUIndexcodepage */ if(fromUMappings[fromUIndex].b!=subchar && fromUMappings[fromUIndex].b!=subchar1) { fromUMappings[fromUIndex++].u|=0x1000000; } else { fromUMappings[fromUIndex++].u|=0x2000000; } } else { /* * the toU mapping does not have a fromU counterpart: * (reverse) fallback codepage->Unicode, copy it to the fromU table */ fromUMappings[newFromUMappingsTop].u=toUMappings[toUIndex].u|=0x3000000; fromUMappings[newFromUMappingsTop++].b=toUMappings[toUIndex++].b; } } /* either one or both tables are exhausted */ while(fromUIndex>24; u=fromUMappings[i].u&0xffffff; b=fromUMappings[i].b; /* character length? */ if(b<=0xff) { length=1; } else if(b<=0xffff) { length=2; if(bmaxTwoByte) { maxTwoByte=b; } } else if(b<=0xffffff) { length=3; } else { length=4; } if(lengthmaxCharLength) { maxCharLength=length; } /* PUA used? */ if((unsigned long)(u-0xe000)<0x1900 || (unsigned long)(u-0xf0000)<0x20000) { usesPUA=1; } /* only consider roundtrip mappings for the rest */ if(f!=0) { continue; } /* ASCII or EBCDIC? */ if(u==0x41) { if(b==0x41) { charsetFamily=ASCII; } else if(b==0xc1) { charsetFamily=EBCDIC; } } else if(u==0xa) { if(b==0xa) { charsetFamily=ASCII; } else if(b==0x25) { charsetFamily=EBCDIC; variantLF=0; } else if(b==0x15) { charsetFamily=EBCDIC; variantLF=1; } } /* US-ASCII? */ if((unsigned long)(u-0x21)<94) { if(u==b) { ++countASCII; } else { variantASCII=1; } } else if(u<0x20 || u==0x7f) { /* non-ISO C0 controls? */ if(u!=b) { if(u==0x1a && b==0x7f || u==0x1c && b==0x1a || u==0x7f && b==0x1c) { charsetFamily=ASCII; variantSUB=1; } else { variantControls=1; } } } } if(charsetFamily==UNKNOWN) { if(minCharLength==2 && maxCharLength==2) { /* guess the charset family for DBCS according to typical byte distributions */ if( ((0x2020<=minTwoByte || minTwoByte<=0x217e) && maxTwoByte<=0x7e7e) || ((0xa0a0<=minTwoByte || minTwoByte<=0xa1fe) && maxTwoByte<=0xfefe) || ((0x8140<=minTwoByte || minTwoByte<=0x81fe) && maxTwoByte<=0xfefe) ) { charsetFamily=ASCII; } else if((minTwoByte==0x4040 || (0x4141<=minTwoByte && minTwoByte<=0x41fe)) && maxTwoByte<=0xfefe) { charsetFamily=EBCDIC; } } if(charsetFamily==UNKNOWN) { fprintf(stderr, "error: unable to determine the charset family\n"); exit(3); } } /* reset variant indicators if they do not apply */ if(charsetFamily!=ASCII || minCharLength!=1) { variantASCII=variantSUB=variantControls=0; } else if(countASCII!=94) { /* if there are not 94 mappings for ASCII graphic characters, then set variantASCII */ variantASCII=1; } if(charsetFamily!=EBCDIC || minCharLength!=1) { variantLF=0; } } static int getSubchar(const char *name) { int i; for(i=0; i", 9)==0) { s=(char *)skipWhitespace(s+9); p=&subchar; } else if(memcmp(s, "", 10)==0) { s=(char *)skipWhitespace(s+10); p=&subchar1; } else if(memcmp(s, "#", 11)==0) { s=(char *)skipWhitespace(s+11); p=&subchar1; } else { continue; } /* get the value and store it in *p */ bytes=0; while(s[0]=='\\' && s[1]=='x') { value=strtoul(s+2, &end, 16); s+=4; if(end!=s) { fprintf(stderr, "error parsing UPMAP subchar from \"%s\"\n", line); exit(2); } bytes=(bytes<<8)|value; } *p=bytes; } } static const char * getStateTable() { int i; for(i=0; i>8, b&0xff); } else if(b<=0xffffff) { sprintf(s, "\\x%02lX\\x%02lX\\x%02lX", b>>16, (b>>8)&0xff, b&0xff); } else { sprintf(s, "\\x%02lX\\x%02lX\\x%02lX\\x%02lX", b>>24, (b>>16)&0xff, (b>>8)&0xff, b&0xff); } } static void writeUCM(FILE *f, const char *ucmname, const char *rpname, const char *tpname) { char buffer[100]; long i; /* write the header */ fprintf(f, "# *******************************************************************************\n" "# *\n" "# * Copyright (C) 1995-2001, International Business Machines\n" "# * Corporation and others. All Rights Reserved.\n" "# *\n" "# *******************************************************************************\n" "#\n" "# File created by rptp2ucm (compiled on %s)\n" "# from source files %s and %s\n" "#\n", __DATE__, rpname, tpname); /* ucmname does not have a path or .ucm */ fprintf(f, " \"%s\"\n", ucmname); fputs(" \"AXXXX\"\n", f); fprintf(f, " %u\n", maxCharLength); fprintf(f, " %u\n", minCharLength); if(maxCharLength==1) { fputs(" \"SBCS\"\n", f); } else if(maxCharLength==2) { if(minCharLength==1) { if(charsetFamily==EBCDIC) { fputs(" \"EBCDIC_STATEFUL\"\n", f); } else { fputs(" \"MBCS\"\n", f); } } else if(minCharLength==2) { fputs(" \"DBCS\"\n", f); } else { fputs(" \"MBCS\"\n", f); } } else { fputs(" \"MBCS\"\n", f); } if(subchar!=0) { writeBytes(buffer, subchar); fprintf(f, " %s\n", buffer); } if(subchar1!=0) { fprintf(f, " \\x%02X\n", subchar1); } /* write charset family */ if(charsetFamily==ASCII) { fputs(" \"ASCII\"\n", f); } else { fputs(" \"EBCDIC\"\n", f); } /* write alias describing the codepage */ sprintf(buffer, " \"ibm-%u", ccsid); if(!usesPUA && !variantLF && !variantASCII && !variantControls && !variantSUB) { strcat(buffer, "_STD\"\n\n"); } else { /* add variant indicators in alphabetic order */ if(variantASCII) { strcat(buffer, "_VASCII"); } if(variantControls) { strcat(buffer, "_VGCTRL"); } if(variantLF) { strcat(buffer, "_VLF"); } if(variantSUB) { strcat(buffer, "_VSUB"); } if(usesPUA) { strcat(buffer, "_VPUA"); } strcat(buffer, "\"\n\n"); } fputs(buffer, f); /* write the state table - */ fputs(getStateTable(), f); /* write the mappings */ fputs("\nCHARMAP\n", f); for(i=0; i %s |%lu\n", fromUMappings[i].u&0xffffff, buffer, fromUMappings[i].u>>24); } fputs("END CHARMAP\n", f); } static void processTable(const char *arg) { char filename[1024], tpname[32]; const char *basename, *s; FILE *rpmap, *tpmap, *ucm; unsigned long value, unicode; int length; init(); /* separate path and basename */ basename=strrchr(arg, '/'); if(basename==NULL) { basename=strrchr(arg, '\\'); if(basename==NULL) { basename=arg; } else { ++basename; } } else { ++basename; s=strrchr(arg, '\\'); if(s!=NULL && ++s>basename) { basename=s; } } /* is this a standard RPMAP filename? */ value=strtoul(basename, (char **)&s, 16); if( strlen(basename)!=17 || (memcmp(basename+9, "RPMAP", 5)!=0 && memcmp(basename+9, "rpmap", 5)!=0 && memcmp(basename+9, "RXMAP", 5)!=0 && memcmp(basename+9, "rxmap", 5)!=0) || (s-basename)!=8 || *s!='.' ) { fprintf(stderr, "error: \"%s\" is not a standard RPMAP filename\n", basename); exit(1); } /* is this really a Unicode conversion table? - get the CCSID */ unicode=value&0xffff; if(unicode==13488 || unicode==17584) { ccsid=(unsigned int)(value>>16); } else { unicode=value>>16; if(unicode==13488 || unicode==17584) { ccsid=(unsigned int)(value&0xffff); } else { fprintf(stderr, "error: \"%s\" is not a Unicode conversion table\n", basename); exit(1); } } /* try to open the RPMAP file */ rpmap=fopen(arg, "r"); if(rpmap==NULL) { fprintf(stderr, "error: unable to open \"%s\"\n", arg); exit(1); } /* try to open the TPMAP file */ strcpy(filename, arg); length=strlen(filename); /* guess the TPMAP filename; note that above we have checked the format of the basename */ /* replace the R in RPMAP by T, keep upper- or lowercase */ if(filename[length-8]=='R') { filename[length-8]='T'; } else { filename[length-8]='t'; } /* reverse the CCSIDs */ memcpy(filename+length-17, basename+4, 4); memcpy(filename+length-13, basename, 4); /* first, keep the same suffix */ tpmap=fopen(filename, "r"); if(tpmap==NULL) { /* next, try reducing the second to last digit by 1 */ --filename[length-2]; tpmap=fopen(filename, "r"); if(tpmap==NULL) { /* there is no TPMAP */ fprintf(stderr, "error: unable to find the TPMAP file for \"%s\"\n", arg); exit(1); } } strcpy(tpname, filename+length-17); /* parse both files */ fromUMappingsTop=parseMappings(rpmap, fromUMappings); toUMappingsTop=parseMappings(tpmap, toUMappings); fclose(tpmap); fclose(rpmap); /* if there is no subchar, then try to get it from the corresponding UPMAP */ if(subchar==0) { FILE *f; /* restore the RPMAP filename and just replace the R by U */ strcpy(filename+length-17, basename); if(filename[length-8]=='R') { filename[length-8]='U'; } else { filename[length-8]='u'; } f=fopen(filename, "r"); if(f==NULL) { /* try reversing the CCSIDs */ memcpy(filename+length-17, basename+4, 4); memcpy(filename+length-13, basename, 4); f=fopen(filename, "r"); } if(f!=NULL) { getSubcharFromUPMAP(f); fclose(f); } } /* generate the .ucm filename - necessary before getSubchar() */ length=sprintf(filename, "ibm-%u_", ccsid); /* uppercase and append the suffix */ filename[length++]=toupper(basename[10]); /* P or X */ filename[length++]=toupper(basename[14]); /* last 3 suffix characters */ filename[length++]=toupper(basename[15]); filename[length++]=toupper(basename[16]); filename[length]=0; /* find the subchar if still necessary - necessary before merging for correct |2 */ if(subchar==0 && !getSubchar(filename+4)) { fprintf(stderr, "warning: missing subchar in \"%s\" (CCSID=0x%04X)\n", filename, ccsid); } /* merge the mappings */ mergeMappings(); /* analyze the conversion table */ analyzeTable(); /* open the .ucm file */ strcat(filename, ".ucm"); ucm=fopen(filename, "w"); if(ucm==NULL) { fprintf(stderr, "error: unable to open output file \"%s\"\n", filename); exit(4); } /* remove the .ucm from the filename for the following processing */ filename[strlen(filename)-4]=0; /* write the .ucm file */ writeUCM(ucm, filename, basename, tpname); fclose(ucm); } extern int main(int argc, const char *argv[]) { if(argc<2) { fprintf(stderr, "usage: %s { rpmap/rxmap-filename }+\n", argv[0]); exit(1); } while(--argc>0) { processTable(*++argv); } return 0; }