From 23504ad5746ebc960e6fae1ec927c3a3ea9657f6 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Sat, 20 Nov 1999 01:14:07 +0000 Subject: [PATCH] ICU-158 generate unames.dat with the unicode character names X-SVN-Rev: 200 --- icu4c/source/tools/gennames/gennames.c | 1040 ++++++++++++++++++++++ icu4c/source/tools/gennames/gennames.dsp | 102 +++ 2 files changed, 1142 insertions(+) create mode 100644 icu4c/source/tools/gennames/gennames.c create mode 100644 icu4c/source/tools/gennames/gennames.dsp diff --git a/icu4c/source/tools/gennames/gennames.c b/icu4c/source/tools/gennames/gennames.c new file mode 100644 index 0000000000..a55f26a6e1 --- /dev/null +++ b/icu4c/source/tools/gennames/gennames.c @@ -0,0 +1,1040 @@ +/* +******************************************************************************* +* * +* COPYRIGHT: * +* (C) Copyright International Business Machines Corporation, 1999 * +* Licensed Material - Program-Property of IBM - All Rights Reserved. * +* US Government Users Restricted Rights - Use, duplication, or disclosure * +* restricted by GSA ADP Schedule Contract with IBM Corp. * +* * +******************************************************************************* +* file name: gennames.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 1999sep30 +* created by: Markus W. Scherer +* +* This program reads the Unicode character database text file, +* parses it, and extracts the character code, +* the "modern" character name, and optionally the +* Unicode 1.0 character name. +* It then tokenizes and compresses the names and builds +* compact binary tables for random-access lookup +* in a u_charName() API function. +*/ + +#include +#include +#include "utypes.h" +#include "cmemory.h" +#include "cstring.h" +#include "filestrm.h" +#include "udata.h" +#include "../toolutil/unewdata.h" + +#define STRING_STORE_SIZE 1000000 +#define GROUP_STORE_SIZE 5000 + +#define GROUP_SHIFT 5 +#define LINES_PER_GROUP (1UL<=length) { + fprintf(stderr, "gennames: too few fields at code 0x%lx\n", code); + exit(U_PARSE_ERROR); + } + limit=getField(line, name1Start, length); + + /* do not store pseudo-names in <> brackets */ + if(line[name1Start]!='<') { + name1Length=limit-name1Start; + } else { + name1Length=0; + } + + if(store10Names) { + /* skip 8 fields and get the following one */ + for(i=0; i<9; ++i) { + name2Start=limit+1; + if(name2Start>=length) { + fprintf(stderr, "gennames: too few fields at code 0x%lx\n", code); + exit(U_PARSE_ERROR); + } + limit=getField(line, name2Start, length); + } + + /* get the second character name, the one from Unicode 1.0 */ + /* do not store pseudo-names in <> brackets */ + if(line[name2Start]!='<') { + name2Length=limit-name2Start; + } else { + name2Length=0; + } + } + + if(name1Length>0 || name2Length>0) { + /* printf("%lx:%.*s(%.*s)\n", code, name1Length, line+name1Start, name2Length, line+name2Start); */ + + parseName(line+name1Start, name1Length); + parseName(line+name2Start, name2Length); + + addLine(code, line+name1Start, name1Length, line+name2Start, name2Length); + } + } + + printf("size of all names in the database: %lu\n", lineTop); + printf("number of named Unicode characters: %lu\n", lineCount); + printf("number of words in the dictionary from these names: %lu\n", wordCount); +} + +static void +parseName(char *name, int16_t length) { + int16_t start=0, limit, wordLength/*, prevStart=-1*/; + Word *word; + + while(start1) { + word=findWord(name+start, wordLength); + if(word==NULL) { + word=addWord(name+start, wordLength); + } + countWord(word); + } + +#if 0 + /* + * if there was a word before this + * (with no noise in between), then add the pair of words, too + */ + if(prevStart!=-1) { + wordLength=limit-prevStart; + word=findWord(name+prevStart, wordLength); + if(word==NULL) { + word=addWord(name+prevStart, wordLength); + } + countWord(word); + } +#endif + + /*prevStart=start;*/ + start=limit; + } +} + +static int16_t +getField(char *line, int16_t start, int16_t limit) { + while(start0 && words[wordCount-1].weight<1) { + --wordCount; + } + + /* count the letters in the token range */ + letterCount=0; + for(i=LEADBYTE_LIMIT; i<256; ++i) { + if(tokens[i]==-1) { + ++letterCount; + } + } + printf("number of letters used in the names: %d\n", letterCount); + + /* do we need double-byte tokens? */ + if(wordCount+letterCount<=256) { + /* no, single-byte tokens are enough */ + leadByteCount=0; + for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { + if(tokens[i]!=-1) { + tokens[i]=wordNumber; + if(beVerbose) { + printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", + i, words[wordNumber].weight, + words[wordNumber].length, words[wordNumber].s); + } + ++wordNumber; + } + } + tokenCount=i; + } else { + /* + * The tokens that need two token bytes + * get their weight reduced by their count + * because they save less. + */ + tokenCount=256-letterCount; + for(i=tokenCount; i0 && words[wordCount-1].weight<1) { + --wordCount; + } + + /* how many tokens and lead bytes do we have now? */ + tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1); + leadByteCount=(int16_t)(tokenCount>>8); + if(leadByteCountcode; + + /* segment the lines to groups of 32 */ + if(inLine>>GROUP_SHIFT!=groupMSB) { + /* finish the current group with empty lines */ + while((++outLine&GROUP_MASK)!=0) { + appendLineLength(0); + } + + /* store the group like a line */ + if(groupTop>0) { + if(groupTop>GROUP_STORE_SIZE) { + fprintf(stderr, "gennames: group store overflow\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + addGroup(groupMSB, groupStore, groupTop); + if(lineTop>(uint32_t)(line->s-stringStore)) { + fprintf(stderr, "gennames: group store runs into string store\n"); + exit(U_INTERNAL_PROGRAM_ERROR); + } + } + + /* start the new group */ + lineLengthsTop=0; + groupTop=0; + groupMSB=inLine>>GROUP_SHIFT; + outLine=(inLine&~GROUP_MASK)-1; + } + + /* write empty lines between the previous line in the group and this one */ + while(++outLines, line->length, &groupTop)); + } + + printf("number of groups: %lu\n", lineCount); +} + +static int16_t +compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) { + int16_t start, limit, token, groupTop=*pGroupTop; + + start=0; + do { + /* write any "noise" characters */ + limit=skipNoise((char *)s, start, length); + while(start0xff) { + groupStore[groupTop++]=(uint8_t)(token>>8); + } + groupStore[groupTop++]=(uint8_t)token; + start=limit; + } else { + while(startweight-((Word *)word1)->weight; +} + +/* generate output data ----------------------------------------------------- */ + +static void +generateData() { + UNewDataMemory *pData; + UErrorCode errorCode=U_ZERO_ERROR; + uint16_t groupWords[3]; + uint32_t i, groupTop=lineTop, offset, size, + tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; + long dataLength; + int16_t token; + + pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo, + haveCopyright ? DATA_COPYRIGHT : NULL, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "gennames: unable to create data memory, error %d\n", errorCode); + exit(errorCode); + } + + /* first, see how much space we need, and prepare the token strings */ + for(i=0; i>16); + groupWords[2]=(uint16_t)(offset); + udata_writeBlock(pData, groupWords, 6); + } + + /* group strings */ + udata_writeBlock(pData, stringStore, groupTop); + + /* 4-align the algorithmic names data */ + offset=groupStringOffset+groupTop; + while(offsetweight=-(length+1+2); + word->count=0; + word->length=length; + word->s=stringStart; + + ++wordCount; + + return word; +} + +static void +countWord(Word *word) { + /* add to the weight the savings: the length of the word minus 1 byte for the token */ + word->weight+=word->length-1; + ++word->count; +} + +static void +addLine(uint32_t code, char *name1, int16_t name1Length, char *name2, int16_t name2Length) { + uint8_t *stringStart; + Line *line; + int16_t length; + + if(lineCount==MAX_LINE_COUNT) { + fprintf(stderr, "gennames: too many lines\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + length=name1Length; + if(name2Length>0) { + length+=1+name2Length; + } + + stringStart=allocLine(length); + if(name1Length>0) { + icu_memcpy(stringStart, name1, name1Length); + } + if(name2Length>0) { + stringStart[name1Length]=NAME_SEPARATOR_CHAR; + icu_memcpy(stringStart+name1Length+1, name2, name2Length); + } + + line=lines+lineCount; + + line->code=code; + line->length=length; + line->s=stringStart; + + ++lineCount; + + /* prevent a character value that is actually in a name from becoming a token */ + while(length>0) { + tokens[stringStart[--length]]=-1; + } +} + +static void +addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) { + uint8_t *stringStart; + Line *line; + + if(lineCount==MAX_LINE_COUNT) { + fprintf(stderr, "gennames: too many groups\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + + /* store the line lengths first, then the strings */ + lineLengthsTop=(lineLengthsTop+1)/2; + stringStart=allocLine(lineLengthsTop+length); + icu_memcpy(stringStart, lineLengths, lineLengthsTop); + icu_memcpy(stringStart+lineLengthsTop, strings, length); + + line=lines+lineCount; + + line->code=groupMSB; + line->length=length; + line->s=stringStart; + + ++lineCount; +} + +static uint32_t +addToken(uint8_t *s, int16_t length) { + uint8_t *stringStart; + + stringStart=allocLine(length+1); + icu_memcpy(stringStart, s, length); + stringStart[length]=0; + + return stringStart-stringStore; +} + +static void +appendLineLength(int16_t length) { + if(length>=76) { + fprintf(stderr, "gennames: compressed line too long\n"); + exit(U_BUFFER_OVERFLOW_ERROR); + } + if(length>=12) { + length-=12; + appendLineLengthNibble((uint8_t)((length>>4)|12)); + } + appendLineLengthNibble((uint8_t)length); +} + +static void +appendLineLengthNibble(uint8_t nibble) { + if((lineLengthsTop&1)==0) { + lineLengths[lineLengthsTop/2]=nibble<<4; + } else { + lineLengths[lineLengthsTop/2]|=nibble&0xf; + } + ++lineLengthsTop; +} + +static uint8_t * +allocLine(uint32_t length) { + uint32_t top=lineTop+length; + uint8_t *p; + + if(top>wordBottom) { + fprintf(stderr, "gennames: out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + p=stringStore+lineTop; + lineTop=top; + return p; +} + +static uint8_t * +allocWord(uint32_t length) { + uint32_t bottom=wordBottom-length; + + if(lineTop>bottom) { + fprintf(stderr, "gennames: out of memory\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + wordBottom=bottom; + return stringStore+bottom; +} diff --git a/icu4c/source/tools/gennames/gennames.dsp b/icu4c/source/tools/gennames/gennames.dsp new file mode 100644 index 0000000000..0b9dea8318 --- /dev/null +++ b/icu4c/source/tools/gennames/gennames.dsp @@ -0,0 +1,102 @@ +# Microsoft Developer Studio Project File - Name="gennames" - Package Owner=<4> +# Microsoft Developer Studio Generated Build File, Format Version 6.00 +# ** DO NOT EDIT ** + +# TARGTYPE "Win32 (x86) Console Application" 0x0103 + +CFG=gennames - Win32 Debug +!MESSAGE This is not a valid makefile. To build this project using NMAKE, +!MESSAGE use the Export Makefile command and run +!MESSAGE +!MESSAGE NMAKE /f "gennames.mak". +!MESSAGE +!MESSAGE You can specify a configuration when running NMAKE +!MESSAGE by defining the macro CFG on the command line. For example: +!MESSAGE +!MESSAGE NMAKE /f "gennames.mak" CFG="gennames - Win32 Debug" +!MESSAGE +!MESSAGE Possible choices for configuration are: +!MESSAGE +!MESSAGE "gennames - Win32 Release" (based on "Win32 (x86) Console Application") +!MESSAGE "gennames - Win32 Debug" (based on "Win32 (x86) Console Application") +!MESSAGE + +# Begin Project +# PROP AllowPerConfigDependencies 0 +# PROP Scc_ProjName "" +# PROP Scc_LocalPath "" +CPP=cl.exe +RSC=rc.exe + +!IF "$(CFG)" == "gennames - Win32 Release" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 0 +# PROP BASE Output_Dir "Release" +# PROP BASE Intermediate_Dir "Release" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 0 +# PROP Output_Dir "Release" +# PROP Intermediate_Dir "Release" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD CPP /nologo /Za /W3 /GX /O2 /I "..\..\common" /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c +# ADD BASE RSC /l 0x409 /d "NDEBUG" +# ADD RSC /l 0x409 /d "NDEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 +# ADD LINK32 toolutil.lib icuuc.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386 /libpath:"..\toolutil\Release" /libpath:"..\..\..\lib\Release" + +!ELSEIF "$(CFG)" == "gennames - Win32 Debug" + +# PROP BASE Use_MFC 0 +# PROP BASE Use_Debug_Libraries 1 +# PROP BASE Output_Dir "Debug" +# PROP BASE Intermediate_Dir "Debug" +# PROP BASE Target_Dir "" +# PROP Use_MFC 0 +# PROP Use_Debug_Libraries 1 +# PROP Output_Dir "Debug" +# PROP Intermediate_Dir "Debug" +# PROP Ignore_Export_Lib 0 +# PROP Target_Dir "" +# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD CPP /nologo /Za /W3 /Gm /GX /ZI /Od /I "..\..\common" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c +# ADD BASE RSC /l 0x409 /d "_DEBUG" +# ADD RSC /l 0x409 /d "_DEBUG" +BSC32=bscmake.exe +# ADD BASE BSC32 /nologo +# ADD BSC32 /nologo +LINK32=link.exe +# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept +# ADD LINK32 toolutil.lib icuuc.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\toolutil\Debug" /libpath:"..\..\..\lib\Debug" + +!ENDIF + +# Begin Target + +# Name "gennames - Win32 Release" +# Name "gennames - Win32 Debug" +# Begin Group "Source Files" + +# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" +# Begin Source File + +SOURCE=.\gennames.c +# End Source File +# End Group +# Begin Group "Header Files" + +# PROP Default_Filter "h;hpp;hxx;hm;inl" +# End Group +# Begin Group "Resource Files" + +# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe" +# End Group +# End Target +# End Project