/* ******************************************************************************* * * Copyright (C) 1999, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: gennames.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999sep30 * created by: Markus W. Scherer * * This program reads the Unicode character database text file, * parses it, and extracts the character code, * the "modern" character name, and optionally the * Unicode 1.0 character name. * It then tokenizes and compresses the names and builds * compact binary tables for random-access lookup * in a u_charName() API function. */ #include #include #include "unicode/utypes.h" #include "cmemory.h" #include "cstring.h" #include "filestrm.h" #include "unicode/udata.h" #include "unewdata.h" #define STRING_STORE_SIZE 1000000 #define GROUP_STORE_SIZE 5000 #define GROUP_SHIFT 5 #define LINES_PER_GROUP (1UL<=length) { fprintf(stderr, "gennames: too few fields at code 0x%lx\n", code); exit(U_PARSE_ERROR); } limit=getField(line, name1Start, length); /* do not store pseudo-names in <> brackets */ if(line[name1Start]!='<') { name1Length=limit-name1Start; } else { name1Length=0; } if(store10Names) { /* skip 8 fields and get the following one */ for(i=0; i<9; ++i) { name2Start=limit+1; if(name2Start>=length) { fprintf(stderr, "gennames: too few fields at code 0x%lx\n", code); exit(U_PARSE_ERROR); } limit=getField(line, name2Start, length); } /* get the second character name, the one from Unicode 1.0 */ /* do not store pseudo-names in <> brackets */ if(line[name2Start]!='<') { name2Length=limit-name2Start; } else { name2Length=0; } } if(name1Length>0 || name2Length>0) { /* printf("%lx:%.*s(%.*s)\n", code, name1Length, line+name1Start, name2Length, line+name2Start); */ parseName(line+name1Start, name1Length); parseName(line+name2Start, name2Length); addLine(code, 
/* Remaining arguments of the addLine() call that is opened at the end of the preceding line. */
line+name1Start, name1Length, line+name2Start, name2Length);
        }
    }

    /* report the running totals kept in lineTop, lineCount and wordCount */
    printf("size of all names in the database: %lu\n", lineTop);
    printf("number of named Unicode characters: %lu\n", lineCount);
    printf("number of words in the dictionary from these names: %lu\n", wordCount);
}

/*
 * parseName() registers the words of one character name in the word list:
 * a word that findWord() does not know yet is created with addWord(),
 * and every occurrence is counted with countWord().
 *
 * name   - start of the name text
 * length - length of the name text
 *
 * NOTE(review): part of this function appears to be missing from this copy.
 * The text jumps from "while(start" straight to "1) {", so the loop
 * condition, the code that sets limit and wordLength, and the test that
 * guards the dictionary lookup are not visible. Restore them from a
 * pristine copy of the file before building.
 */
static void
parseName(char *name, int16_t length) {
    int16_t start=0, limit, wordLength/*, prevStart=-1*/;
    Word *word;

    while(start1) {
            /* look the word up; create it on first sight; count this use */
            word=findWord(name+start, wordLength);
            if(word==NULL) {
                word=addWord(name+start, wordLength);
            }
            countWord(word);
        }

#if 0
        /*
         * if there was a word before this
         * (with no noise in between), then add the pair of words, too
         */
        if(prevStart!=-1) {
            wordLength=limit-prevStart;
            word=findWord(name+prevStart, wordLength);
            if(word==NULL) {
                word=addWord(name+prevStart, wordLength);
            }
            countWord(word);
        }
#endif

        /*prevStart=start;*/
        /* continue after the piece that was just handled */
        start=limit;
    }
}

/*
 * getField(): only the signature and the first token of its loop survive.
 *
 * NOTE(review): everything between "while(start" and "0 && words[...]"
 * appears to be missing. That gap swallows the rest of getField() and the
 * header and local declarations of the function that the statements below
 * belong to. They work on words[], tokens[], letterCount, leadByteCount and
 * tokenCount, so they presumably form the routine that assigns tokens to
 * words - confirm against a pristine copy.
 */
static int16_t
getField(char *line, int16_t start, int16_t limit) {
    /* visible part of the condition: drop trailing words whose weight is below 1 */
    while(start0 && words[wordCount-1].weight<1) {
        --wordCount;
    }

    /* count the letters in the token range */
    /* tokens[i]==-1 marks a byte value that occurs in a stored name (set by addLine()) */
    letterCount=0;
    for(i=LEADBYTE_LIMIT; i<256; ++i) {
        if(tokens[i]==-1) {
            ++letterCount;
        }
    }
    printf("number of letters used in the names: %d\n", letterCount);

    /* do we need double-byte tokens? */
    if(wordCount+letterCount<=256) {
        /* no, single-byte tokens are enough */
        leadByteCount=0;
        /* hand out word numbers in order to every tokens[] slot that is not marked -1 */
        for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
            if(tokens[i]!=-1) {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            i, words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }
        /* i is now one past the last slot that received a word */
        tokenCount=i;
    } else {
        /*
         * The tokens that need two token bytes
         * get their weight reduced by their count
         * because they save less.
         */
        tokenCount=256-letterCount;
        /*
         * NOTE(review): the text jumps from "for(i=tokenCount; i" to
         * "0 && words[...]"; the loop that adjusts the weights and whatever
         * follows it up to the trimming loop below appear to be missing.
         */
        for(i=tokenCount; i0 && words[wordCount-1].weight<1) {
            --wordCount;
        }

        /* how many tokens and lead bytes do we have now? */
        tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
        /* one lead byte per full 256 tokens */
        leadByteCount=(int16_t)(tokenCount>>8);
        /*
         * NOTE(review): the text jumps from "if(leadByteCount" to "code;".
         * The rest of the function above and the header, declarations and
         * loop head of the following line-grouping function appear to be
         * missing. In the statements below, inLine is presumably the code
         * of the current input line - confirm.
         */
        if(leadByteCountcode;

        /* segment the lines to groups of 32 */
        if(inLine>>GROUP_SHIFT!=groupMSB) {
            /* finish the current group with empty lines */
            while((++outLine&GROUP_MASK)!=0) {
                appendLineLength(0);
            }

            /* store the group like a line */
            if(groupTop>0) {
                if(groupTop>GROUP_STORE_SIZE) {
                    fprintf(stderr, "gennames: group store overflow\n");
                    exit(U_BUFFER_OVERFLOW_ERROR);
                }
                addGroup(groupMSB, groupStore, groupTop);
                /* the groups written so far must not reach the string of the current input line */
                if(lineTop>(uint32_t)(line->s-stringStore)) {
                    fprintf(stderr, "gennames: group store runs into string store\n");
                    exit(U_INTERNAL_PROGRAM_ERROR);
                }
            }

            /* start the new group */
            lineLengthsTop=0;
            groupTop=0;
            groupMSB=inLine>>GROUP_SHIFT;
            /* one before the first code of the group; the loops pre-increment outLine */
            outLine=(inLine&~GROUP_MASK)-1;
        }

        /* write empty lines between the previous line in the group and this one */
        /*
         * NOTE(review): the text jumps from "while(++outLine" to
         * "s, line->length, &groupTop));"; the loop condition and body and
         * the start of the statement that uses line->s appear to be missing.
         */
        while(++outLines, line->length, &groupTop));
    }

    printf("number of groups: %lu\n", lineCount);
}

/*
 * compressLine() writes one line into groupStore[], starting at *pGroupTop.
 *
 * s, length - the bytes of the line and their number
 * pGroupTop - position in groupStore[]; read on entry
 *
 * NOTE(review): most of the body appears to be missing. The text jumps from
 * "while(start" to "0xff) {" and later from "while(start" to
 * "weight-((Word *)word1)->weight;". The second gap swallows the end of this
 * function and the head of the next one, whose surviving tail looks like the
 * end of a comparison that subtracts word1's weight - confirm.
 */
static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
    int16_t start, limit, token, groupTop=*pGroupTop;

    start=0;
    do {
        /* write any "noise" characters */
        limit=skipNoise((char *)s, start, length);
        while(start0xff) {
                /* high byte of the token; the guarding condition is damaged, see above */
                groupStore[groupTop++]=(uint8_t)(token>>8);
            }
            /* low byte of the token */
            groupStore[groupTop++]=(uint8_t)token;
            start=limit;
        } else {
            while(startweight-((Word *)word1)->weight;
}

/* generate output data ----------------------------------------------------- */

/*
 * generateData() creates the output data memory with udata_create().
 * The statement that begins below is continued on the following line.
 */
static void
generateData() {
    UNewDataMemory *pData;
    UErrorCode errorCode=U_ZERO_ERROR;
    uint16_t groupWords[3];
    uint32_t i, groupTop=lineTop, offset, size,
             tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
    long dataLength;
    int16_t token;

    pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
                       haveCopyright ?
/* Last arguments of the udata_create() call that is opened at the end of the preceding line. */
U_COPYRIGHT_STRING : NULL, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode);
        exit(errorCode);
    }

    /* first, see how much space we need, and prepare the token strings */
    /*
     * NOTE(review): the text jumps from "for(i=0; i" to ">16);". The size
     * calculation, the writing of the header and token data, and the head of
     * the loop that fills groupWords[] appear to be missing from this copy.
     */
    for(i=0; i>16);
        /* low 16 bits of offset; the three 16-bit words are written as 6 bytes */
        groupWords[2]=(uint16_t)(offset);
        udata_writeBlock(pData, groupWords, 6);
    }

    /* group strings */
    udata_writeBlock(pData, stringStore, groupTop);

    /* 4-align the algorithmic names data */
    udata_writePadding(pData, algNamesOffset-(groupStringOffset+groupTop));
    generateAlgorithmicData(pData);

    /* finish up */
    dataLength=udata_finish(pData, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
        exit(errorCode);
    }

    /* consistency check: the length reported by udata_finish() must match size */
    if(dataLength!=(long)size) {
        fprintf(stderr, "gennames: data length %ld != calculated size %lu\n", dataLength, size);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}

/* the structure for algorithmic names needs to be 4-aligned */
/*
 * Header of one algorithmic range:
 *   rangeStart, rangeEnd - first and last code of the range
 *   algorithmType, algorithmVariant - select how names are produced
 *   rangeSize - the initializers below set it to sizeof(AlgorithmicRange)
 *               plus the size of the data that follows the header
 */
typedef struct AlgorithmicRange {
    uint32_t rangeStart, rangeEnd;
    uint8_t algorithmType, algorithmVariant;
    uint16_t rangeSize;
} AlgorithmicRange;

/*
 * generateAlgorithmicData() handles the data for the algorithmic ranges
 * described by the static tables below (CJK extension A, CJK, Hangul).
 * pData is compared with NULL in the part of the body that follows the
 * declarations.
 */
static uint32_t
generateAlgorithmicData(UNewDataMemory *pData) {
    static char *prefix="CJK UNIFIED IDEOGRAPH-";
    /* 23 = the 22 characters of prefix plus its terminating NUL; 24 = 23 rounded up to a multiple of 4 */
# define PREFIX_LENGTH 23
# define PREFIX_LENGTH_4 24
    /* initializer order: rangeStart, rangeEnd, algorithmType, algorithmVariant, rangeSize */
    static AlgorithmicRange cjkExtA={
        0x3400, 0x4db5,
        0, 4,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };
    static AlgorithmicRange cjk={
        0x4e00, 0x9fa5,
        0, 4,
        sizeof(AlgorithmicRange)+PREFIX_LENGTH_4
    };

    /*
     * NUL-separated strings: the prefix "HANGUL SYLLABLE ", then three
     * groups of 19, 21 and 28 short strings (an empty string counts as one
     * entry). The group sizes equal the entries of hangulFactors[] below.
     */
    static char jamo[]=
        "HANGUL SYLLABLE \0"

        "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
        "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"

        "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
        "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
        "YU\0EU\0YI\0I\0"

        "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
        "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
        "S\0SS\0NG\0J\0C\0K\0T\0P\0H"
    ;

    /*
     * The range 0xac00..0xd7a3 holds 11172 codes, which equals 19*21*28.
     * The 6 in rangeSize is presumably the size of hangulFactors[]
     * (3 * uint16_t) - confirm.
     */
    static AlgorithmicRange hangul={
        0xac00, 0xd7a3,
        1, 3,
        sizeof(AlgorithmicRange)+6+sizeof(jamo)
    };

    /* modulo factors, maximum 8 */
    /* 3 factors: 19, 21, 28, most-to-least-significant */
    static uint16_t hangulFactors[3]={
        19, 21, 28
    };

    /* this declaration is completed at the start of the following line */
    uint32_t
/* Completes the "uint32_t" declaration that ends the preceding line. */
size;

    size=0;

    /* number of ranges of algorithmic names */
    /* with a NULL pData nothing is written; the byte count is added to size instead */
    if(pData!=NULL) {
        udata_write32(pData, 3);
    } else {
        size+=4;
    }

    /*
     * each range:
     * uint32_t rangeStart
     * uint32_t rangeEnd
     * uint8_t algorithmType
     * uint8_t algorithmVariant
     * uint16_t size of range data
     * uint8_t[size] data
     */

    /* range 0: cjk extension a */
    if(pData!=NULL) {
        udata_writeBlock(pData, &cjkExtA, sizeof(AlgorithmicRange));
        udata_writeString(pData, prefix, PREFIX_LENGTH);
        /*
         * NOTE(review): the text jumps from "if(PREFIX_LENGTH" to
         * "weight=-(length+1+2);". The rest of this function, any definitions
         * that followed it, and the beginning of the function whose tail
         * appears below are missing from this copy. That tail initializes a
         * Word and returns it, so it is presumably the end of addWord() -
         * confirm.
         */
        if(PREFIX_LENGTHweight=-(length+1+2);
    /* a new word starts with a negative weight of -(length+3) and no occurrences */
    word->count=0;
    word->length=length;
    word->s=stringStart;

    ++wordCount;

    return word;
}

/*
 * countWord() records one more occurrence of word: count goes up by one
 * and weight by length-1.
 */
static void
countWord(Word *word) {
    /* add to the weight the savings: the length of the word minus 1 byte for the token */
    word->weight+=word->length-1;
    ++word->count;
}

/*
 * addLine() stores the name(s) of one code as a new entry in lines[].
 *
 * code               - stored in the entry's code field
 * name1, name1Length - first name; copied only if name1Length>0
 * name2, name2Length - second name; if name2Length>0 it is stored after
 *                      name1, preceded by NAME_SEPARATOR_CHAR
 *
 * Prints a message and exits with U_BUFFER_OVERFLOW_ERROR when lines[]
 * already holds MAX_LINE_COUNT entries. The string memory comes from
 * allocLine().
 */
static void
addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length) {
    uint8_t *stringStart;
    Line *line;
    int16_t length;

    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many lines\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* total stored length: name1, plus separator and name2 if there is a second name */
    length=name1Length;
    if(name2Length>0) {
        length+=1+name2Length;
    }

    stringStart=allocLine(length);
    if(name1Length>0) {
        uprv_memcpy(stringStart, name1, name1Length);
    }
    if(name2Length>0) {
        stringStart[name1Length]=NAME_SEPARATOR_CHAR;
        uprv_memcpy(stringStart+name1Length+1, name2, name2Length);
    }

    line=lines+lineCount;

    line->code=code;
    line->length=length;
    line->s=stringStart;

    ++lineCount;

    /* prevent a character value that is actually in a name from becoming a token */
    /* walks the stored bytes from last to first; length is used up as the loop counter */
    while(length>0) {
        tokens[stringStart[--length]]=-1;
    }
}

/*
 * addGroup() stores one group as a new entry in lines[]: first the packed
 * line lengths from lineLengths[], then length bytes from strings.
 *
 * groupMSB - stored in the entry's code field
 * strings  - the bytes of the group
 * length   - number of bytes in strings
 *
 * Prints a message and exits with U_BUFFER_OVERFLOW_ERROR when lines[]
 * already holds MAX_LINE_COUNT entries.
 * The body is continued on the following line.
 */
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
    uint8_t *stringStart;
    Line *line;

    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many groups\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* store the line lengths first, then the strings */
    /* lineLengthsTop counts nibbles (see appendLineLengthNibble()); convert to bytes, rounding up */
    lineLengthsTop=(lineLengthsTop+1)/2;
    stringStart=allocLine(lineLengthsTop+length);
    uprv_memcpy(stringStart, lineLengths, lineLengthsTop);
    uprv_memcpy(stringStart+lineLengthsTop, strings, length);
line=lines+lineCount; line->code=groupMSB; line->length=length; line->s=stringStart; ++lineCount; } static uint32_t addToken(uint8_t *s, int16_t length) { uint8_t *stringStart; stringStart=allocLine(length+1); uprv_memcpy(stringStart, s, length); stringStart[length]=0; return stringStart-stringStore; } static void appendLineLength(int16_t length) { if(length>=76) { fprintf(stderr, "gennames: compressed line too long\n"); exit(U_BUFFER_OVERFLOW_ERROR); } if(length>=12) { length-=12; appendLineLengthNibble((uint8_t)((length>>4)|12)); } appendLineLengthNibble((uint8_t)length); } static void appendLineLengthNibble(uint8_t nibble) { if((lineLengthsTop&1)==0) { lineLengths[lineLengthsTop/2]=nibble<<4; } else { lineLengths[lineLengthsTop/2]|=nibble&0xf; } ++lineLengthsTop; } static uint8_t * allocLine(uint32_t length) { uint32_t top=lineTop+length; uint8_t *p; if(top>wordBottom) { fprintf(stderr, "gennames: out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } p=stringStore+lineTop; lineTop=top; return p; } static uint8_t * allocWord(uint32_t length) { uint32_t bottom=wordBottom-length; if(lineTop>bottom) { fprintf(stderr, "gennames: out of memory\n"); exit(U_MEMORY_ALLOCATION_ERROR); } wordBottom=bottom; return stringStore+bottom; }