scuffed-code/tools/unicode/c/genprops/namespropsbuilder.cpp
Markus Scherer f712c8bc00 ICU-8972 merge gencase into genprops
X-SVN-Rev: 31189
2012-01-10 07:23:44 +00:00

1138 lines
34 KiB
C++

/*
*******************************************************************************
*
* Copyright (C) 1999-2012, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: namespropsbuilder.cpp (was gennames/gennames.c)
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 1999sep30
* created by: Markus W. Scherer
*
* This builder reads Unicode character names and aliases,
* tokenizes and compresses them, and builds
* compact binary tables for random-access lookup
* in a u_charName() API function.
*
* unames.icu file format (after UDataInfo header etc. - see udata.c)
* (all data is static const)
*
* UDataInfo fields:
* dataFormat "unam"
* formatVersion 1.0
* dataVersion = Unicode version from -u or --unicode command line option, defaults to 3.0.0
*
* -- data-based names
* uint32_t tokenStringOffset,
* groupsOffset,
* groupStringOffset,
* algNamesOffset;
*
* uint16_t tokenCount;
* uint16_t tokenTable[tokenCount];
*
* char tokenStrings[]; -- padded to even count
*
* -- strings (groupStrings) are tokenized as follows:
* for each character c
* if(c>=tokenCount) write that character c directly
* else
* token=tokenTable[c];
* if(token==0xfffe) -- lead byte of double-byte token
* token=tokenTable[c<<8|next character];
* if(token==-1)
* write c directly
* else
* tokenString=tokenStrings+token; (tokenStrings=start of names data + tokenStringOffset;)
* append zero-terminated tokenString;
*
* Different strings for a code point - normal name, 1.0 name, and ISO comment -
* are separated by ';'.
*
* uint16_t groupCount;
* struct {
* uint16_t groupMSB; -- for a group of 32 character names stored, this is code point>>5
* uint16_t offsetHigh; -- group strings are at start of names data + groupStringOffset + this 32-bit offset
* uint16_t offsetLow;
* } groupTable[groupCount];
*
* char groupStrings[]; -- padded to 4-count
*
* -- The actual, tokenized group strings are not zero-terminated because
* that would take up too much space.
* Instead, they are preceded by their length, written in a variable-length sequence:
* For each of the 32 group strings, one or two nibbles are stored for its length.
* Nibbles (4-bit values, half-bytes) are read MSB first.
* A nibble with a value of 0..11 directly indicates the length of the name string.
* A nibble n with a value of 12..15 is a lead nibble and forms a value with the following nibble m
* by (((n-12)<<4)|m)+12, reaching values of 12..75.
* These lengths are sequentially for each tokenized string, not for the de-tokenized result.
* For the de-tokenizing, see token description above; the strings immediately follow the
* 32 lengths.
*
* -- algorithmic names
*
* typedef struct AlgorithmicRange {
* uint32_t rangeStart, rangeEnd;
* uint8_t algorithmType, algorithmVariant;
* uint16_t rangeSize;
* } AlgorithmicRange;
*
* uint32_t algRangesCount; -- number of data blocks for ranges of
* algorithmic names (Unicode 3.0.0: 3, hardcoded in gennames)
*
* struct {
* AlgorithmicRange algRange;
* uint8_t algRangeData[]; -- padded to 4-count except in last range
* } algRanges[algRangesCount];
* -- not a real array because each part has a different size
* of algRange.rangeSize (including AlgorithmicRange)
*
* -- algorithmic range types:
*
* 0 Names are formed from a string prefix that is stored in
* the algRangeData (zero-terminated), followed by the Unicode code point
* of the character in hexadecimal digits;
* algRange.algorithmVariant digits are written
*
* 1 Names are formed by calculating modulo-factors of the code point value as follows:
* algRange.algorithmVariant is the count of modulo factors
* algRangeData contains
* uint16_t factors[algRange.algorithmVariant];
* char strings[];
* the first zero-terminated string is written as the prefix; then:
*
* The rangeStart is subtracted; with the difference, here "code":
* for(i=algRange.algorithmVariant-1 to 0 step -1)
* index[i]=code%factor[i];
* code/=factor[i];
*
* The strings after the prefix are short pieces that are then appended to the result
* according to index[0..algRange.algorithmVariant-1].
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/udata.h"
#include "charstr.h"
#include "cmemory.h"
#include "cstring.h"
#include "genprops.h"
#include "ppucd.h"
#include "uarrsort.h"
#include "uassert.h"
#include "unewdata.h"
#include "uoptions.h"
/* Capacity, in bytes, of stringStore[] (shared by lines growing up and words growing down). */
#define STRING_STORE_SIZE 1000000
/* Capacity, in bytes, of groupStore[], the scratch buffer for one compressed group. */
#define GROUP_STORE_SIZE 5000
/* A group is identified by (code point >> GROUP_SHIFT), so it covers 32 code points. */
#define GROUP_SHIFT 5
/* Number of code points (lines) per group. */
#define LINES_PER_GROUP (1UL<<GROUP_SHIFT)
/* Mask that extracts the index of a code point within its group. */
#define GROUP_MASK (LINES_PER_GROUP-1)
/* Capacity of lines[]; also limits the number of groups because groups reuse lines[]. */
#define MAX_LINE_COUNT 50000
/* Capacity of words[]. */
#define MAX_WORD_COUNT 20000
/* NOTE(review): not referenced by the code in this file section. */
#define MAX_GROUP_COUNT 5000
/* Separates the different names of one code point within a stored line. */
#define NAME_SEPARATOR_CHAR ';'
/* generator data ----------------------------------------------------------- */
U_NAMESPACE_USE
/*
 * UDataInfo cf. udata.h
 * Header description for unames.icu; it is passed to udata_create() in writeBinaryData().
 * The dataVersion below is only a default: setUnicodeVersion() overwrites it.
 */
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
sizeof(UChar),
0,
{0x75, 0x6e, 0x61, 0x6d}, /* dataFormat="unam" */
{1, 0, 0, 0}, /* formatVersion */
{3, 0, 0, 0} /* dataVersion */
};
/*
 * stringStore[] holds all string data:
 * allocLine() hands out space upward from lineTop (names, groups, token strings),
 * allocWord() hands out space downward from wordBottom (dictionary words).
 * groupStore[] collects the compressed bytes of the group being built.
 * lineLengths[] collects the length nibbles of the group being built.
 */
static uint8_t stringStore[STRING_STORE_SIZE],
groupStore[GROUP_STORE_SIZE],
lineLengths[LINES_PER_GROUP];
/*
 * lineTop: first free byte above the line data in stringStore[].
 * groupBottom: value of lineTop when compressLines() starts, i.e. start of the group data.
 * wordBottom: lowest used byte of the word data in stringStore[].
 * lineLengthsTop: number of nibbles written to lineLengths[] so far.
 */
static uint32_t lineTop=0, groupBottom, wordBottom=STRING_STORE_SIZE, lineLengthsTop;
/*
 * One stored line: the names of one code point joined with NAME_SEPARATOR_CHAR.
 * After compressLines(), the same array is reused for groups (code=groupMSB).
 */
typedef struct {
uint32_t code;
int16_t length;
uint8_t *s;
} Line;
/* One dictionary word found in the names; s points into stringStore[] and is not NUL-terminated. */
typedef struct {
int32_t weight; /* -(cost for token) + (number of occurrences) * (length-1) */
int16_t count;
int16_t length;
uint8_t *s;
} Word;
static Line lines[MAX_LINE_COUNT];
static Word words[MAX_WORD_COUNT];
static uint32_t lineCount=0, wordCount=0;
/* Number of byte values used as lead bytes of double-byte tokens; set by compress(). */
static int16_t leadByteCount;
#define LEADBYTE_LIMIT 16
/*
 * Token table.
 * -1 marks a byte value that occurs literally in a name (see addLine()) or an unusable slot,
 * -2 marks a lead byte of double-byte tokens (see compress()),
 * other values are word indexes until writeBinaryData() replaces them with token string offsets.
 */
static int16_t tokens[LEADBYTE_LIMIT*256];
static uint32_t tokenCount;
/*
 * the structure for algorithmic names needs to be 4-aligned
 *
 * Header of one algorithmic-names range as appended to the output by setAlgNamesRange():
 * start/end are the code point range, type selects the algorithm
 * (0 for "han", 1 for "hangul"), variant is the digit count (type 0) or the
 * number of modulo factors (type 1), and size is the byte length of the
 * whole range record including this header and its padding.
 */
struct AlgorithmicRange {
UChar32 start, end;
uint8_t type, variant;
uint16_t size;
};
/*
 * PropsBuilder that collects character names and writes unames.icu.
 * Most of the working state lives in file-level static variables rather than
 * in this class; only the algorithmic-range records are kept as members.
 */
class NamesPropsBuilder : public PropsBuilder {
public:
NamesPropsBuilder(UErrorCode &errorCode);
virtual ~NamesPropsBuilder();
/* Copies the Unicode version into the dataVersion of the output header. */
virtual void setUnicodeVersion(const UVersionInfo version);
/* Records the name and the "correction" alias of one code point. */
virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
/* Prints statistics and compresses the collected names. */
virtual void build(UErrorCode &errorCode);
/* Writes the compressed names and the algorithmic ranges to unames.icu in path. */
virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
private:
/* Appends one "han" or "hangul" algorithmic range record to algRanges. */
virtual void setAlgNamesRange(UChar32 start, UChar32 end,
const char *type, const char *prefix, UErrorCode &errorCode);
/* Serialized AlgorithmicRange records, each followed by its data and padding. */
CharString algRanges;
/* Number of records appended to algRanges. */
int32_t countAlgRanges;
};
/*
 * Creates a builder with no algorithmic ranges and
 * clears the 256 single-byte entries of the token table.
 * errorCode is accepted for interface symmetry and is not touched here.
 */
NamesPropsBuilder::NamesPropsBuilder(UErrorCode &errorCode)
        : countAlgRanges(0) {
    int16_t *slot=tokens;
    int16_t *const slotLimit=tokens+256;
    while(slot<slotLimit) {
        *slot++=0;
    }
}
/* Destructor: performs no explicit cleanup. */
NamesPropsBuilder::~NamesPropsBuilder() {}
void
NamesPropsBuilder::setUnicodeVersion(const UVersionInfo version) {
uprv_memcpy(dataInfo.dataVersion, version, 4);
}
/* prototypes --------------------------------------------------------------- */
/* Splits a name into words and counts each word longer than one character. */
static void
parseName(const char *name, int16_t length);
/* Returns the index of the first word character at or after start. */
static int16_t
skipNoise(const char *line, int16_t start, int16_t limit);
/* Returns the index after the word at start, including one following space or dash. */
static int16_t
getWord(const char *line, int16_t start, int16_t limit);
/* Selects the words that become tokens, fills tokens[] and compresses all lines. */
static void
compress(UErrorCode &errorCode);
/* Tokenizes all lines and stores them in groups of 32 code points. */
static void
compressLines(void);
/* Writes the tokenized form of one line to groupStore[] and returns its length. */
static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop);
/* uprv_sortArray() comparator: sorts Words by descending weight. */
static int32_t
compareWords(const void *context, const void *word1, const void *word2);
/* Returns the tokens[] index whose word equals s, or -1. */
static int16_t
findToken(uint8_t *s, int16_t length);
/* Returns the dictionary entry equal to s, or NULL. */
static Word *
findWord(const char *s, int16_t length);
/* Appends a new dictionary entry for s. */
static Word *
addWord(const char *s, int16_t length);
/* Counts one more occurrence of the word and increases its weight accordingly. */
static void
countWord(Word *word);
/* Stores the names of one code point as one line, joined with NAME_SEPARATOR_CHAR. */
static void
addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count);
/* Stores one finished group (length nibbles followed by the compressed strings). */
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length);
/* Stores a NUL-terminated token string and returns its offset in stringStore[]. */
static uint32_t
addToken(uint8_t *s, int16_t length);
/* Appends the one- or two-nibble encoding of a compressed line length. */
static void
appendLineLength(int16_t length);
/* Appends one nibble to lineLengths[]. */
static void
appendLineLengthNibble(uint8_t nibble);
/* Allocates length bytes at the low (line) end of stringStore[]. */
static uint8_t *
allocLine(int32_t length);
/* Allocates length bytes at the high (word) end of stringStore[]. */
static uint8_t *
allocWord(uint32_t length);
/* parsing ------------------------------------------------------------------ */
/*
 * Records the names of a single code point.
 *
 * Does nothing unless newValues contains UCHAR_NAME or PPUCD_NAME_ALIAS.
 * Slot 0 of the line receives props.name; slot 3 receives the value of a
 * "correction=" entry of props.nameAlias (up to the next comma, if any).
 * Slots 1 and 2 are left empty here. Each stored name is also fed to
 * parseName() so that its words are counted for the token dictionary.
 */
void
NamesPropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
                            UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(!(newValues.contains(UCHAR_NAME) || newValues.contains(PPUCD_NAME_ALIAS))) {
        return;
    }
    U_ASSERT(props.start==props.end);

    const char *names[4]={ NULL, NULL, NULL, NULL };
    int16_t lengths[4]={ 0, 0, 0, 0 };

    /* slot 0: the character name */
    if(props.name!=NULL) {
        names[0]=props.name;
        lengths[0]=(int16_t)uprv_strlen(props.name);
        parseName(names[0], lengths[0]);
    }

    /* must outlive the addLine() call because names[3] may point into it */
    CharString buffer;
    if(props.nameAlias!=NULL) {
        /*
         * Only use "correction" aliases for now, from Unicode 6.1 NameAliases.txt with 3 fields per line.
         * TODO: Work on ticket #8963 to deal with multiple type:alias pairs per character.
         */
        static const char correctionKey[]="correction=";
        const char *alias=uprv_strstr(props.nameAlias, correctionKey);
        if(alias!=NULL) {
            alias+=sizeof(correctionKey)-1;  // skip "correction="
            const char *aliasLimit=uprv_strchr(alias, ',');
            if(aliasLimit==NULL) {
                /* the correction alias is the last entry */
                names[3]=alias;
                lengths[3]=(int16_t)uprv_strlen(alias);
            } else {
                /* copy just this entry, without the following ones */
                buffer.append(alias, aliasLimit-alias, errorCode);
                names[3]=buffer.data();
                lengths[3]=(int16_t)(aliasLimit-alias);
            }
            parseName(names[3], lengths[3]);
        }
    }
    addLine(props.start, names, lengths, LENGTHOF(names));
}
static void
parseName(const char *name, int16_t length) {
int16_t start=0, limit, wordLength/*, prevStart=-1*/;
Word *word;
while(start<length) {
/* skip any "noise" characters */
limit=skipNoise(name, start, length);
if(start<limit) {
/*prevStart=-1;*/
start=limit;
}
if(start==length) {
break;
}
/* get a word and add it if it is longer than 1 */
limit=getWord(name, start, length);
wordLength=(int16_t)(limit-start);
if(wordLength>1) {
word=findWord(name+start, wordLength);
if(word==NULL) {
word=addWord(name+start, wordLength);
}
countWord(word);
}
#if 0
/*
* if there was a word before this
* (with no noise in between), then add the pair of words, too
*/
if(prevStart!=-1) {
wordLength=limit-prevStart;
word=findWord(name+prevStart, wordLength);
if(word==NULL) {
word=addWord(name+prevStart, wordLength);
}
countWord(word);
}
#endif
/*prevStart=start;*/
start=limit;
}
}
/*
 * Tells whether c can be part of a name word: an uppercase letter A-Z or a digit 0-9.
 * The letters are tested in three sub-ranges so that the check
 * also works where the alphabet is not contiguous (EBCDIC).
 */
static UBool
isWordChar(char c) {
    const bool isDigit= c>='0' && c<='9';
    const bool isLetter=
        (c>='A' && c<='I') ||
        (c>='J' && c<='R') ||
        (c>='S' && c<='Z');
    return isLetter || isDigit;
}
/*
 * Returns the index of the first word character in line[start..limit[,
 * or the first index that is not below limit if there is none.
 */
static int16_t
skipNoise(const char *line, int16_t start, int16_t limit) {
    int16_t pos=start;
    for(; pos<limit; ++pos) {
        if(isWordChar(line[pos])) {
            break;
        }
    }
    return pos;
}
/*
 * Returns the index just after the word that starts at line[start].
 * A word is a run of word characters (A-Z, 0-9) plus, if present,
 * a single space or dash that immediately follows the run.
 */
static int16_t
getWord(const char *line, int16_t start, int16_t limit) {
    int16_t pos=start;
    /* the run of word characters */
    for(; pos<limit && isWordChar(line[pos]); ++pos) {}
    /* include a following space or dash */
    if(pos<limit) {
        const char next=line[pos];
        if(next==' ' || next=='-') {
            ++pos;
        }
    }
    return pos;
}
/*
 * Appends one algorithmic-names range record to algRanges.
 *
 * start, end: the code point range.
 * type: "han" (names are prefix + hex digits of the code point) or
 *       "hangul" (names are built from the Jamo short names via modulo factors).
 * prefix: the name prefix; used (and stored with its terminating NUL) only for "han".
 * errorCode: set to U_ILLEGAL_ARGUMENT_ERROR for an unknown type;
 *            nothing is done if it already indicates a failure.
 *
 * The record consists of an AlgorithmicRange header, the type-specific data,
 * and 0xaa padding bytes up to a multiple of 4 bytes.
 */
void
NamesPropsBuilder::setAlgNamesRange(UChar32 start, UChar32 end,
                                    const char *type,
                                    const char *prefix,  // name prefix, only used for "han"
                                    UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    /* modulo factors, maximum 8 */
    /* 3 factors: 19, 21, 28, most-to-least-significant */
    static const uint16_t hangulFactors[3]={
        19, 21, 28
    };
    static const char jamo[]=
        "HANGUL SYLLABLE \0"

        "G\0GG\0N\0D\0DD\0R\0M\0B\0BB\0"
        "S\0SS\0\0J\0JJ\0C\0K\0T\0P\0H\0"

        "A\0AE\0YA\0YAE\0EO\0E\0YEO\0YE\0O\0"
        "WA\0WAE\0OE\0YO\0U\0WEO\0WE\0WI\0"
        "YU\0EU\0YI\0I\0"

        "\0G\0GG\0GS\0N\0NJ\0NH\0D\0L\0LG\0LM\0"
        "LB\0LS\0LT\0LP\0LH\0M\0B\0BS\0"
        "S\0SS\0NG\0J\0C\0K\0T\0P\0H";

    int32_t prefixLength=0;
    AlgorithmicRange range;
    uprv_memset(&range, 0, sizeof(AlgorithmicRange));
    int32_t rangeSize=(int32_t)sizeof(AlgorithmicRange);
    range.start=start;
    range.end=end;
    if(0==uprv_strcmp(type, "han")) {
        range.type=0;
        /* number of hex digits written for each code point */
        range.variant= end<=0xffff ? 4 : 5;
        /* the prefix is stored including its terminating NUL */
        prefixLength=(int32_t)uprv_strlen(prefix)+1;
        rangeSize+=prefixLength;
    } else if(0==uprv_strcmp(type, "hangul")) {
        range.type=1;
        range.variant=(uint8_t)LENGTHOF(hangulFactors);
        rangeSize+=(int32_t)sizeof(hangulFactors);
        rangeSize+=(int32_t)sizeof(jamo);
    } else {
        /* report the offending type, not the prefix */
        fprintf(stderr, "genprops error: unknown algnamesrange type '%s'\n", type);
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    /* pad the record to a multiple of 4 bytes */
    int32_t paddingLength=rangeSize&3;
    if(paddingLength) {
        paddingLength=4-paddingLength;
        rangeSize+=paddingLength;
    }
    range.size=(uint16_t)rangeSize;

    algRanges.append((char *)&range, (int32_t)sizeof(AlgorithmicRange), errorCode);
    if(range.type==0) {  // han
        algRanges.append(prefix, prefixLength, errorCode);
    } else /* type==1 */ {  // hangul
        algRanges.append((char *)hangulFactors, (int32_t)sizeof(hangulFactors), errorCode);
        algRanges.append(jamo, (int32_t)sizeof(jamo), errorCode);
    }
    while(paddingLength) {
        algRanges.append((char)0xaa, errorCode);
        --paddingLength;
    }
    ++countAlgRanges;
}
/* compressing -------------------------------------------------------------- */
/*
 * Chooses which dictionary words become tokens, assigns them to slots of
 * tokens[], and then compresses all lines via compressLines().
 *
 * Words are sorted by descending weight and words that do not save anything
 * (weight<1) are dropped. Byte values that occur literally in names
 * (tokens[i]==-1, set by addLine()) are never used as tokens.
 * If the remaining words do not fit into single-byte tokens, slots
 * 1..leadByteCount become lead bytes (-2) of double-byte tokens, and
 * NAME_SEPARATOR_CHAR is never used as a second token byte.
 *
 * errorCode: receives a failure from sorting; in that case nothing is
 *            compressed, rather than continuing with unsorted words.
 */
static void
compress(UErrorCode &errorCode) {
    uint32_t i, letterCount;
    int16_t wordNumber;

    /* sort the words in reverse order by weight */
    uprv_sortArray(words, wordCount, sizeof(Word),
                   compareWords, NULL, FALSE, &errorCode);
    if(U_FAILURE(errorCode)) {
        fprintf(stderr, "genprops error: sorting the words failed - %s\n",
                u_errorName(errorCode));
        return;
    }

    /* remove the words that do not save anything */
    while(wordCount>0 && words[wordCount-1].weight<1) {
        --wordCount;
    }

    /* count the letters in the token range */
    letterCount=0;
    for(i=LEADBYTE_LIMIT; i<256; ++i) {
        if(tokens[i]==-1) {
            ++letterCount;
        }
    }
    if(!beQuiet) {
        printf("number of letters used in the names: %d\n", (int)letterCount);
    }

    /* do we need double-byte tokens? */
    if(wordCount+letterCount<=256) {
        /* no, single-byte tokens are enough */
        leadByteCount=0;
        for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) {
            if(tokens[i]!=-1) {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }
        tokenCount=i;
    } else {
        /*
         * The tokens that need two token bytes
         * get their weight reduced by their count
         * because they save less.
         */
        tokenCount=256-letterCount;
        for(i=tokenCount; i<wordCount; ++i) {
            words[i].weight-=words[i].count;
        }

        /* sort these words in reverse order by weight */
        uprv_sortArray(words+tokenCount, wordCount-tokenCount, sizeof(Word),
                       compareWords, NULL, FALSE, &errorCode);
        if(U_FAILURE(errorCode)) {
            fprintf(stderr, "genprops error: sorting the words failed - %s\n",
                    u_errorName(errorCode));
            return;
        }

        /* remove the words that do not save anything */
        while(wordCount>0 && words[wordCount-1].weight<1) {
            --wordCount;
        }

        /* how many tokens and lead bytes do we have now? */
        tokenCount=wordCount+letterCount+(LEADBYTE_LIMIT-1);
        /*
         * adjust upwards to take into account that
         * double-byte tokens must not
         * use NAME_SEPARATOR_CHAR as a second byte
         */
        tokenCount+=(tokenCount-256+254)/255;

        leadByteCount=(int16_t)(tokenCount>>8);
        if(leadByteCount<LEADBYTE_LIMIT) {
            /* adjust for the real number of lead bytes */
            tokenCount-=(LEADBYTE_LIMIT-1)-leadByteCount;
        } else {
            /* limit the number of lead bytes */
            leadByteCount=LEADBYTE_LIMIT-1;
            tokenCount=LEADBYTE_LIMIT*256;
            wordCount=tokenCount-letterCount-(LEADBYTE_LIMIT-1);
            /* adjust again to skip double-byte tokens with ';' */
            wordCount-=(tokenCount-256+254)/255;
        }

        /* set token 0 to word 0 */
        tokens[0]=0;
        if(beVerbose) {
            printf("tokens[0x000]: word%8ld \"%.*s\"\n",
                    (long)words[0].weight,
                    words[0].length, words[0].s);
        }
        wordNumber=1;

        /* set the lead byte tokens */
        for(i=1; (int16_t)i<=leadByteCount; ++i) {
            tokens[i]=-2;
        }

        /* set the tokens */
        for(; i<256; ++i) {
            /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */
            if(tokens[i]!=-1) {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }

        /* continue above 255 where there are no letters */
        for(; (uint32_t)wordNumber<wordCount; ++i) {
            if((i&0xff)==NAME_SEPARATOR_CHAR) {
                tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
            } else {
                tokens[i]=wordNumber;
                if(beVerbose) {
                    printf("tokens[0x%03x]: word%8ld \"%.*s\"\n",
                            (int)i, (long)words[wordNumber].weight,
                            words[wordNumber].length, words[wordNumber].s);
                }
                ++wordNumber;
            }
        }
        tokenCount=i; /* should be already tokenCount={i or i+1} */
    }

    if(!beQuiet) {
        printf("number of lead bytes: %d\n", leadByteCount);
        printf("number of single-byte tokens: %lu\n",
            (unsigned long)256-letterCount-leadByteCount);
        printf("number of tokens: %lu\n", (unsigned long)tokenCount);
    }

    compressLines();
}
/*
 * Tokenizes all stored lines and stores them again as groups.
 *
 * Lines whose code points share the same (code>>GROUP_SHIFT) form one group.
 * For each group, the length nibbles of all LINES_PER_GROUP positions
 * (0 for code points without a line) are collected in lineLengths[] and the
 * tokenized strings in groupStore[]; addGroup() then copies both into
 * stringStore[] above the raw lines and records the group in lines[].
 * lineCount is reset and afterwards counts groups instead of lines.
 */
static void
compressLines() {
Line *line=NULL;
/*
 * outLine is the last code point for which a length has been written;
 * it relies on unsigned wrap-around (0xffffffff+1==0) for the initial state.
 * groupMSB==0xffff means "no group started yet".
 */
uint32_t i=0, inLine, outLine=0xffffffff /* (uint32_t)(-1) */,
groupMSB=0xffff, lineCount2;
int16_t groupTop=0;
/* store the groups like lines, with compressed data after raw strings */
groupBottom=lineTop;
/* lines[] is reused for the groups: remember the number of raw lines and restart the count */
lineCount2=lineCount;
lineCount=0;
/* loop over all lines */
while(i<lineCount2) {
line=lines+i++;
inLine=line->code;
/* segment the lines to groups of 32 */
if(inLine>>GROUP_SHIFT!=groupMSB) {
/* finish the current group with empty lines */
while((++outLine&GROUP_MASK)!=0) {
appendLineLength(0);
}
/* store the group like a line */
if(groupTop>0) {
/*
 * NOTE(review): this test runs after the group bytes were written to groupStore[],
 * so by itself it reports an overrun rather than preventing it.
 */
if(groupTop>GROUP_STORE_SIZE) {
fprintf(stderr, "gennames: group store overflow\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
addGroup(groupMSB, groupStore, groupTop);
}
/* start the new group */
lineLengthsTop=0;
groupTop=0;
groupMSB=inLine>>GROUP_SHIFT;
/* one before the first code point of the group, so that ++outLine reaches it */
outLine=(inLine&~GROUP_MASK)-1;
}
/* write empty lines between the previous line in the group and this one */
while(++outLine<inLine) {
appendLineLength(0);
}
/* write characters and tokens for this line */
appendLineLength(compressLine(line->s, line->length, &groupTop));
}
/* finish and store the last group */
if(line && groupMSB!=0xffff) {
/* finish the current group with empty lines */
while((++outLine&GROUP_MASK)!=0) {
appendLineLength(0);
}
/* store the group like a line */
if(groupTop>0) {
if(groupTop>GROUP_STORE_SIZE) {
fprintf(stderr, "gennames: group store overflow\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
addGroup(groupMSB, groupStore, groupTop);
}
}
if(!beQuiet) {
printf("number of groups: %lu\n", (unsigned long)lineCount);
}
}
static int16_t
compressLine(uint8_t *s, int16_t length, int16_t *pGroupTop) {
int16_t start, limit, token, groupTop=*pGroupTop;
start=0;
do {
/* write any "noise" characters */
limit=skipNoise((char *)s, start, length);
while(start<limit) {
groupStore[groupTop++]=s[start++];
}
if(start==length) {
break;
}
/* write a word, as token or directly */
limit=getWord((char *)s, start, length);
if(limit-start==1) {
groupStore[groupTop++]=s[start++];
} else {
token=findToken(s+start, (int16_t)(limit-start));
if(token!=-1) {
if(token>0xff) {
groupStore[groupTop++]=(uint8_t)(token>>8);
}
groupStore[groupTop++]=(uint8_t)token;
start=limit;
} else {
while(start<limit) {
groupStore[groupTop++]=s[start++];
}
}
}
} while(start<length);
length=(int16_t)(groupTop-*pGroupTop);
*pGroupTop=groupTop;
return length;
}
/*
 * Comparator for uprv_sortArray(): orders Word entries by descending weight.
 * Returns a positive value if word2 is heavier than word1. context is unused.
 */
static int32_t
compareWords(const void *context, const void *word1, const void *word2) {
    const Word *first=(const Word *)word1;
    const Word *second=(const Word *)word2;
    /* reverse sort by word weight */
    return second->weight-first->weight;
}
/*
 * Prints statistics about the collected names (unless beQuiet)
 * and then compresses them. Does nothing if errorCode already indicates a failure.
 */
void
NamesPropsBuilder::build(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }

    if(!beQuiet) {
        const unsigned long namesSize=(unsigned long)lineTop;
        const unsigned long namedChars=(unsigned long)lineCount;
        const unsigned long dictionaryWords=(unsigned long)wordCount;
        puts("* unames.icu stats *");
        printf("size of all names in the database: %lu\n", namesSize);
        printf("number of named Unicode characters: %lu\n", namedChars);
        printf("number of words in the dictionary from these names: %lu\n", dictionaryWords);
    }
    compress(errorCode);
}
/* generate output data ----------------------------------------------------- */
/*
 * Writes unames.icu into the directory path.
 *
 * Stores the token strings in stringStore[] above the group data, replaces the
 * word indexes in tokens[] with token string offsets, pads tokens[] to a
 * multiple of 256 entries, and writes the offsets, token table, token strings,
 * group table, group strings and algorithmic ranges.
 * Exits the program if writing fails or if the number of bytes written
 * differs from the calculated size.
 */
void
NamesPropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
if(U_FAILURE(errorCode)) { return; }
UNewDataMemory *pData=udata_create(path, "icu", "unames", &dataInfo,
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genprops: udata_create(%s, unames.icu) failed - %s\n",
path, u_errorName(errorCode));
return;
}
uint16_t groupWords[3];
/* groupTop: end of the group data == start of the token strings added below */
uint32_t i, groupTop=lineTop, size,
tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset;
long dataLength;
int16_t token;
/* first, see how much space we need, and prepare the token strings */
for(i=0; i<tokenCount; ++i) {
token=tokens[i];
if(token!=-1 && token!=-2) {
/* from here on, tokens[i] is the offset of the token string relative to groupTop */
tokens[i]=(int16_t)(addToken(words[token].s, words[token].length)-groupTop);
}
}
/*
* Required padding for data swapping:
* The token table undergoes a permutation during data swapping when the
* input and output charsets are different.
* The token table cannot grow during swapping, so we need to make sure that
* the table is long enough for successful in-place permutation.
*
* We simply round up tokenCount to the next multiple of 256 to account for
* all possible permutations.
*
* An optimization is possible if we only ever swap between ASCII and EBCDIC:
*
* If tokenCount>256, then a semicolon (NAME_SEPARATOR_CHAR) is used
* and will be swapped between ASCII and EBCDIC between
* positions 0x3b (ASCII semicolon) and 0x5e (EBCDIC semicolon).
* This should be the only -1 entry in tokens[256..511] on which the data
* swapper bases its trail byte permutation map (trailMap[]).
*
* It would be sufficient to increase tokenCount so that its lower 8 bits
* are at least 0x5e+1 to make room for swapping between the two semicolons.
* For values higher than 0x5e, the trail byte permutation map (trailMap[])
* should always be an identity map, where we do not need additional room.
*/
i=tokenCount;
tokenCount=(tokenCount+0xff)&~0xff;
if(!beQuiet && i<tokenCount) {
printf("number of tokens[] padding entries for data swapping: %lu\n", (unsigned long)(tokenCount-i));
}
for(; i<tokenCount; ++i) {
if((i&0xff)==NAME_SEPARATOR_CHAR) {
tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */
} else {
tokens[i]=0; /* unused token for padding */
}
}
/*
* Calculate the total size in bytes of the data including:
* - the offset to the token strings, uint32_t (4)
* - the offset to the group table, uint32_t (4)
* - the offset to the group strings, uint32_t (4)
* - the offset to the algorithmic names, uint32_t (4)
*
* - the number of tokens, uint16_t (2)
* - the token table, uint16_t[tokenCount] (2*tokenCount)
*
* - the token strings, each zero-terminated (tokenSize=(lineTop-groupTop)), 2-padded
*
* - the number of groups, uint16_t (2)
* - the group table, { uint16_t groupMSB, uint16_t offsetHigh, uint16_t offsetLow }[6*groupCount]
*
* - the group strings (groupTop-groupBottom), 4-padded
*
* - the size of the data for the algorithmic names
*/
tokenStringOffset=4+4+4+4+2+2*tokenCount;
groupsOffset=(tokenStringOffset+(lineTop-groupTop)+1)&~1;
/* lineCount is the number of groups at this point */
groupStringOffset=groupsOffset+2+6*lineCount;
algNamesOffset=(groupStringOffset+(groupTop-groupBottom)+3)&~3;
/* the algorithmic names data starts with the 32-bit count of ranges */
size=algNamesOffset+4+algRanges.length();
if(!beQuiet) {
printf("size of the Unicode Names data:\n"
"total data length %lu, token strings %lu, compressed strings %lu, algorithmic names %lu\n",
(unsigned long)size, (unsigned long)(lineTop-groupTop),
(unsigned long)(groupTop-groupBottom), (unsigned long)(4+algRanges.length()));
}
/* write the data to the file */
/* offsets */
udata_write32(pData, tokenStringOffset);
udata_write32(pData, groupsOffset);
udata_write32(pData, groupStringOffset);
udata_write32(pData, algNamesOffset);
/* token table */
udata_write16(pData, (uint16_t)tokenCount);
udata_writeBlock(pData, tokens, 2*tokenCount);
/* token strings */
udata_writeBlock(pData, stringStore+groupTop, lineTop-groupTop);
if((lineTop-groupTop)&1) {
/* 2-padding */
udata_writePadding(pData, 1);
}
/* group table */
udata_write16(pData, (uint16_t)lineCount);
for(i=0; i<lineCount; ++i) {
/* groupMSB */
groupWords[0]=(uint16_t)lines[i].code;
/* offset */
/* offset of this group's data relative to the start of the group strings */
uint32_t offset = (uint32_t)((lines[i].s - stringStore)-groupBottom);
groupWords[1]=(uint16_t)(offset>>16);
groupWords[2]=(uint16_t)(offset);
udata_writeBlock(pData, groupWords, 6);
}
/* group strings */
udata_writeBlock(pData, stringStore+groupBottom, groupTop-groupBottom);
/* 4-align the algorithmic names data */
udata_writePadding(pData, algNamesOffset-(groupStringOffset+(groupTop-groupBottom)));
udata_write32(pData, countAlgRanges);
udata_writeBlock(pData, algRanges.data(), algRanges.length());
/* finish up */
dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennames: error %d writing the output file\n", errorCode);
exit(errorCode);
}
/* sanity check: the bytes written must match the size calculated above */
if(dataLength!=(long)size) {
fprintf(stderr, "gennames: data length %ld != calculated size %lu\n",
dataLength, (unsigned long)size);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}
/* helpers ------------------------------------------------------------------ */
/*
 * Looks for the token that stands for the word s[0..length[.
 * Only non-negative tokens[] entries are considered; they are taken as
 * indexes into words[]. Returns the index in tokens[] or -1 if there is none.
 */
static int16_t
findToken(uint8_t *s, int16_t length) {
    const int16_t tokenLimit=(int16_t)tokenCount;
    for(int16_t index=0; index<tokenLimit; ++index) {
        const int16_t wordIndex=tokens[index];
        if(wordIndex<0) {
            continue;  /* a literal byte value or a lead byte, not a word */
        }
        const Word &candidate=words[wordIndex];
        if(candidate.length==length && uprv_memcmp(s, candidate.s, length)==0) {
            return index;
        }
    }
    return -1;
}
/*
 * Linear search of the dictionary for the word s[0..length[.
 * Returns the matching entry of words[] or NULL if the word is not known yet.
 */
static Word *
findWord(const char *s, int16_t length) {
    Word *const wordsLimit=words+wordCount;
    for(Word *candidate=words; candidate!=wordsLimit; ++candidate) {
        if(candidate->length!=length) {
            continue;
        }
        if(uprv_memcmp(s, candidate->s, length)==0) {
            return candidate;
        }
    }
    return NULL;
}
/*
 * Adds the word s[0..length[ to the dictionary with a count of 0.
 * The word's bytes are copied into the word end of stringStore[].
 * Exits the program if words[] is full. Returns the new entry.
 */
static Word *
addWord(const char *s, int16_t length) {
    if(wordCount==MAX_WORD_COUNT) {
        fprintf(stderr, "gennames: too many words\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    uint8_t *copy=allocWord(length);
    uprv_memcpy(copy, s, length);

    Word &entry=words[wordCount++];
    entry.s=copy;
    entry.length=length;
    entry.count=0;
    /*
     * The initial weight is the negative cost of having this token:
     * a zero-terminated string and a 16-bit offset.
     */
    entry.weight=-(length+1+2);
    return &entry;
}
/*
 * Registers one more occurrence of the word.
 * Each occurrence saves the length of the word minus 1 byte for the token,
 * which is added to the weight.
 */
static void
countWord(Word *word) {
    ++word->count;
    word->weight+=word->length-1;
}
/*
 * Stores the names of one code point as a single line in stringStore[].
 *
 * names[i]/lengths[i] are the count name slots; trailing empty slots are
 * dropped and the remaining ones are joined with NAME_SEPARATOR_CHAR.
 * Nothing is stored if all slots are empty. Every byte value that occurs in
 * the stored line is marked with -1 in tokens[] so that it does not become a token.
 * Exits the program if lines[] is full.
 */
static void
addLine(UChar32 code, const char *names[], int16_t lengths[], int16_t count) {
    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many lines\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* drop trailing empty names */
    while(count>0 && lengths[count-1]==0) {
        --count;
    }
    if(count==0) {
        return;  /* should not occur: caller should not have called */
    }

    /* (count-1) separator characters plus all of the name lengths */
    int16_t total=count-1;
    for(int16_t slot=0; slot<count; ++slot) {
        total+=lengths[slot];
    }

    /* allocate line memory */
    uint8_t *dest=allocLine(total);

    /* copy all strings into the line memory */
    int16_t used=0;  /* number of chars copied so far */
    for(int16_t slot=0; slot<count; ++slot) {
        if(slot>0) {
            dest[used++]=NAME_SEPARATOR_CHAR;
        }
        if(lengths[slot]>0) {
            uprv_memcpy(dest+used, names[slot], lengths[slot]);
            used+=lengths[slot];
        }
    }

    Line &entry=lines[lineCount++];
    entry.code=code;
    entry.length=used;
    entry.s=dest;

    /* prevent a character value that is actually in a name from becoming a token */
    for(int16_t index=used; index>0;) {
        tokens[dest[--index]]=-1;
    }
}
/*
 * Stores one finished group in stringStore[] and records it in lines[].
 *
 * The group data consists of the bytes of lineLengths[] that hold the
 * collected nibbles, followed by the length bytes of strings.
 * lineLengthsTop is converted from a nibble count to a byte count here.
 * The recorded entry has code=groupMSB and length=length (strings only).
 * Exits the program if lines[] is full.
 */
static void
addGroup(uint32_t groupMSB, uint8_t *strings, int16_t length) {
    if(lineCount==MAX_LINE_COUNT) {
        fprintf(stderr, "gennames: too many groups\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }

    /* number of bytes occupied by the nibbles, rounded up */
    lineLengthsTop=(lineLengthsTop+1)/2;

    /* store the line lengths first, then the strings */
    uint8_t *dest=allocLine(lineLengthsTop+length);
    uprv_memcpy(dest, lineLengths, lineLengthsTop);
    uprv_memcpy(dest+lineLengthsTop, strings, length);

    Line &entry=lines[lineCount++];
    entry.code=groupMSB;
    entry.length=length;
    entry.s=dest;
}
/*
 * Copies s[0..length[ into the line end of stringStore[] and terminates it
 * with a zero byte. Returns the offset of the copy from the start of stringStore[].
 */
static uint32_t
addToken(uint8_t *s, int16_t length) {
    uint8_t *dest=allocLine(length+1);
    uprv_memcpy(dest, s, length);
    dest[length]=0;
    return (uint32_t)(dest - stringStore);
}
static void
appendLineLength(int16_t length) {
if(length>=76) {
fprintf(stderr, "gennames: compressed line too long\n");
exit(U_BUFFER_OVERFLOW_ERROR);
}
if(length>=12) {
length-=12;
appendLineLengthNibble((uint8_t)((length>>4)|12));
}
appendLineLengthNibble((uint8_t)length);
}
/*
 * Appends one nibble to lineLengths[]: the first nibble of a byte goes into
 * the upper 4 bits (overwriting the byte), the second into the lower 4 bits.
 */
static void
appendLineLengthNibble(uint8_t nibble) {
    uint8_t &slot=lineLengths[lineLengthsTop/2];
    if(lineLengthsTop&1) {
        slot|=nibble&0xf;
    } else {
        slot=(uint8_t)(nibble<<4);
    }
    ++lineLengthsTop;
}
/*
 * Reserves length bytes at the line end of stringStore[] (growing upward
 * from lineTop) and returns a pointer to them.
 * Exits the program if the reservation would reach into the word data.
 */
static uint8_t *
allocLine(int32_t length) {
    const uint32_t newTop=lineTop+length;
    if(newTop>wordBottom) {
        fprintf(stderr, "gennames: out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    uint8_t *const start=stringStore+lineTop;
    lineTop=newTop;
    return start;
}
/*
 * Reserves length bytes at the word end of stringStore[] (growing downward
 * from wordBottom) and returns a pointer to them.
 * Exits the program if there is not enough room left above the line data.
 */
static uint8_t *
allocWord(uint32_t length) {
    /*
     * Test length against wordBottom first: wordBottom-length is unsigned and
     * would wrap around to a huge value, which would slip past the second test.
     */
    if(length>wordBottom || lineTop>wordBottom-length) {
        fprintf(stderr, "gennames: out of memory\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    wordBottom-=length;
    return stringStore+wordBottom;
}
/*
 * Factory for the names builder.
 * Returns NULL if errorCode already indicates a failure;
 * sets U_MEMORY_ALLOCATION_ERROR if the allocation yields NULL.
 */
PropsBuilder *
createNamesPropsBuilder(UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    NamesPropsBuilder *builder=new NamesPropsBuilder(errorCode);
    if(builder==NULL) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return builder;
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/