/* ******************************************************************************* * * Copyright (C) 2000-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: genuca.cpp * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created at the end of XX century * created by: Vladimir Weinstein, * modified in 2013-2014 by Markus Scherer * * This program reads the Fractional UCA table and generates * internal format for UCA table as well as inverse UCA table. * It then writes the ucadata.icu binary file containing the data. */ #define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1 #include #include "unicode/utypes.h" #include "unicode/errorcode.h" #include "unicode/localpointer.h" #include "charstr.h" #include "cmemory.h" #include "collation.h" #include "collationbasedatabuilder.h" #include "collationdata.h" #include "collationdatabuilder.h" #include "collationdatareader.h" #include "collationdatawriter.h" #include "collationinfo.h" #include "collationrootelements.h" #include "collationruleparser.h" #include "collationtailoring.h" #include "cstring.h" #include "normalizer2impl.h" #include "toolutil.h" #include "unewdata.h" #include "uoptions.h" #include "uparse.h" #include "writesrc.h" #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) #if UCONFIG_NO_COLLATION extern "C" int main(int argc, char* argv[]) { (void)argc; (void)argv; return 1; } #else U_NAMESPACE_USE static UBool beVerbose=FALSE, withCopyright=TRUE; static UVersionInfo UCAVersion={ 0, 0, 0, 0 }; static UDataInfo ucaDataInfo={ sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, U_SIZEOF_UCHAR, 0, { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" { 4, 0, 0, 0 }, // formatVersion { 6, 3, 0, 0 } // dataVersion }; static char *skipWhiteSpace(char *s) { while(*s == ' ' || *s == '\t') { ++s; } return s; } static int32_t hex2num(char hex) { if(hex>='0' && hex <='9') { return hex-'0'; } else if(hex>='a' && hex<='f') { return hex-'a'+10; } else if(hex>='A' && hex<='F') { return hex-'A'+10; } else { return -1; } } static uint32_t parseWeight(char *&s, const char *separators, int32_t maxBytes, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return 0; } uint32_t weight = 0; int32_t numBytes = 0; for(;;) { // Check one character after another, so that we don't just run over a 00. int32_t nibble1, nibble2; if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) { // Stop when we find something other than a pair of hex digits. break; } if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) { // Too many bytes, or a 00 or 01 byte which is illegal inside a weight. errorCode = U_INVALID_FORMAT_ERROR; return 0; } weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2; ++numBytes; s += 2; if(*s != ' ') { break; } ++s; } char c = *s; if(c == 0 || strchr(separators, c) == NULL) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } // numBytes==0 is ok, for example in [,,] or [, 82, 05] // Left-align the weight. while(numBytes < 4) { weight <<= 8; ++numBytes; } return weight; } /** * Parse a CE like [0A 86, 05, 17] or [U+4E00, 10]. * Stop with an error, or else with the pointer s after the closing bracket. */ static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return 0; } ++s; // skip over the '[' if(s[0] == 'U' && s[1] == '+') { // Read a code point and look up its CE. // We use this especially for implicit primary weights, // so that we can use different algorithms in the FractionalUCA.txt // generator and the parser. // The generator may not even need to compute any implicit primaries at all. s += 2; char *end; unsigned long longCp = uprv_strtoul(s, &end, 16); if(end == s || longCp > 0x10ffff) { errorCode = U_INVALID_FORMAT_ERROR; return 0; } UChar32 c = (UChar32)longCp; int64_t ce = builder.getSingleCE(c, errorCode); if(U_FAILURE(errorCode)) { return 0; } s = end; if(*s == ']') { // [U+4E00] ++s; return ce; } if(*s != ',') { errorCode = U_INVALID_FORMAT_ERROR; return 0; } // Parse the following, secondary or tertiary weight. s = skipWhiteSpace(s + 1); uint32_t w = parseWeight(s, ",]", 2, errorCode); if(U_FAILURE(errorCode)) { return 0; } if(*s == ']') { // [U+4E00, 10] ++s; // Set the tertiary weight to w. return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16); } // Set the secondary weight to w: [U+9F9C, 70, 20] ce = (ce & INT64_C(0xffffffff00000000)) | w; // Parse and set the tertiary weight. s = skipWhiteSpace(s + 1); w = parseWeight(s, "]", 2, errorCode); ++s; return ce | (w >> 16); } else { uint32_t p = parseWeight(s, ",", 4, errorCode); if(U_FAILURE(errorCode)) { return 0; } int64_t ce = (int64_t)p << 32; s = skipWhiteSpace(s + 1); uint32_t w = parseWeight(s, ",", 2, errorCode); if(U_FAILURE(errorCode)) { return 0; } ce |= w; s = skipWhiteSpace(s + 1); w = parseWeight(s, "]", 2, errorCode); ++s; return ce | (w >> 16); } } static const struct { const char *name; int32_t code; } specialReorderTokens[] = { { "TERMINATOR", -2 }, // -2 means "ignore" { "LEVEL-SEPARATOR", -2 }, { "FIELD-SEPARATOR", -2 }, { "COMPRESS", -3 }, // The standard name is "PUNCT" but FractionalUCA.txt uses the long form. { "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION }, { "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte. { "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits). { "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes. }; int32_t getReorderCode(const char* name) { int32_t code = CollationRuleParser::getReorderCode(name); if (code >= 0) { return code; } for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) { if (0 == strcmp(name, specialReorderTokens[i].name)) { return specialReorderTokens[i].code; } } return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE. } enum ActionType { READCE, READPRIMARY, READBYTE, READUNIFIEDIDEOGRAPH, READUCAVERSION, READLEADBYTETOSCRIPTS, IGNORE }; static struct { const char *const name; int64_t value; const ActionType what_to_do; } vt[] = { {"[first tertiary ignorable", 0, IGNORE}, {"[last tertiary ignorable", 0, IGNORE}, {"[first secondary ignorable", 0, READCE}, {"[last secondary ignorable", 0, READCE}, {"[first primary ignorable", 0, READCE}, {"[last primary ignorable", 0, READCE}, {"[first variable", 0, READCE}, {"[last variable", 0, READCE}, {"[first regular", 0, READCE}, {"[last regular", 0, READCE}, {"[first implicit", 0, READCE}, {"[last implicit", 0, READCE}, {"[first trailing", 0, READCE}, {"[last trailing", 0, READCE}, {"[Unified_Ideograph", 0, READUNIFIEDIDEOGRAPH}, {"[fixed first implicit byte", 0, IGNORE}, {"[fixed last implicit byte", 0, IGNORE}, {"[fixed first trail byte", 0, IGNORE}, {"[fixed last trail byte", 0, IGNORE}, {"[fixed first special byte", 0, IGNORE}, {"[fixed last special byte", 0, IGNORE}, {"[fixed secondary common byte", 0, READBYTE}, {"[fixed last secondary common byte", 0, READBYTE}, {"[fixed first ignorable secondary byte", 0, READBYTE}, {"[fixed tertiary common byte", 0, READBYTE}, {"[fixed first ignorable tertiary byte", 0, READBYTE}, {"[variable top = ", 0, IGNORE}, {"[UCA version = ", 0, READUCAVERSION}, {"[top_byte", 0, READLEADBYTETOSCRIPTS}, {"[reorderingTokens", 0, IGNORE}, {"[categories", 0, IGNORE}, {"[first tertiary in secondary non-ignorable", 0, IGNORE}, {"[last tertiary in secondary non-ignorable", 0, IGNORE}, {"[first secondary in primary non-ignorable", 0, IGNORE}, {"[last secondary in primary non-ignorable", 0, IGNORE}, }; static int64_t getOptionValue(const char *name) { for (int32_t i = 0; i < LENGTHOF(vt); ++i) { if(uprv_strcmp(name, vt[i].name) == 0) { return vt[i].value; } } return 0; } static UnicodeString *leadByteScripts = NULL; static void readAnOption( CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) { for (int32_t cnt = 0; cnt> 24; if(U_FAILURE(*status)) { fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer); return; } } else if(what_to_do == READUNIFIEDIDEOGRAPH) { UVector32 unihan(*status); if(U_FAILURE(*status)) { return; } for(;;) { if(*pointer == ']') { break; } if(*pointer == 0) { // Missing ] after ranges. *status = U_INVALID_FORMAT_ERROR; return; } char *s = pointer; while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; } char c = *s; *s = 0; uint32_t start, end; u_parseCodePointRange(pointer, &start, &end, status); *s = c; if(U_FAILURE(*status)) { fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer); *status = U_INVALID_FORMAT_ERROR; return; } unihan.addElement((UChar32)start, *status); unihan.addElement((UChar32)end, *status); pointer = skipWhiteSpace(s); } builder.initHanRanges(unihan.getBuffer(), unihan.size(), *status); } else if (what_to_do == READUCAVERSION) { u_versionFromString(UCAVersion, pointer); if(beVerbose) { char uca[U_MAX_VERSION_STRING_LENGTH]; u_versionToString(UCAVersion, uca); printf("UCA version %s\n", uca); } UVersionInfo UCDVersion; u_getUnicodeVersion(UCDVersion); if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) { char uca[U_MAX_VERSION_STRING_LENGTH]; char ucd[U_MAX_VERSION_STRING_LENGTH]; u_versionToString(UCAVersion, uca); u_versionToString(UCDVersion, ucd); // Warning, not error, to permit bootstrapping during a version upgrade. fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd); } } else if (what_to_do == READLEADBYTETOSCRIPTS) { uint16_t leadByte = (hex2num(*pointer++) * 16); leadByte += hex2num(*pointer++); if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) { // Extend the Hani range to the end of what this implementation uses. // FractionalUCA.txt assumes a different algorithm for implicit primary weights, // and different high-lead byte ranges. leadByteScripts[leadByte] = leadByteScripts[0xdf]; return; } UnicodeString scripts; for(;;) { pointer = skipWhiteSpace(pointer); if (*pointer == ']') { break; } const char *scriptName = pointer; char c; while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; } if(c == 0) { fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer); *status = U_INVALID_FORMAT_ERROR; return; } *pointer = 0; int32_t reorderCode = getReorderCode(scriptName); *pointer = c; if (reorderCode == -3) { // COMPRESS builder.setCompressibleLeadByte(leadByte); continue; } if (reorderCode == -2) { continue; // Ignore "TERMINATOR" etc. } if (reorderCode < 0 || 0xffff < reorderCode) { fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName); *status = U_INVALID_FORMAT_ERROR; return; } scripts.append((UChar)reorderCode); } if(!scripts.isEmpty()) { if(leadByteScripts == NULL) { leadByteScripts = new UnicodeString[256]; } leadByteScripts[leadByte] = scripts; } } return; } } fprintf(stderr, "Warning: unrecognized option: %s\n", buffer); } static UBool readAnElement(FILE *data, CollationBaseDataBuilder &builder, UnicodeString &prefix, UnicodeString &s, int64_t ces[32], int32_t &cesLength, UErrorCode *status) { if(U_FAILURE(*status)) { return FALSE; } char buffer[2048]; char *result = fgets(buffer, 2048, data); if(result == NULL) { if(feof(data)) { return FALSE; } else { fprintf(stderr, "empty line but no EOF!\n"); *status = U_INVALID_FORMAT_ERROR; return FALSE; } } int32_t buflen = (int32_t)uprv_strlen(buffer); while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) { buffer[--buflen] = 0; } if(buffer[0] == 0 || buffer[0] == '#') { return FALSE; // just a comment, skip whole line } // Directives. if(buffer[0] == '[') { readAnOption(builder, buffer, status); return FALSE; } char *startCodePoint = buffer; char *endCodePoint = strchr(startCodePoint, ';'); if(endCodePoint == NULL) { fprintf(stderr, "error - line with no code point!\n"); *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */ return FALSE; } else { *endCodePoint = 0; } char *pipePointer = strchr(buffer, '|'); if (pipePointer != NULL) { // Read the prefix string which precedes the actual string. *pipePointer = 0; UChar *prefixChars = prefix.getBuffer(32); int32_t prefixSize = u_parseString(startCodePoint, prefixChars, prefix.getCapacity(), NULL, status); if(U_FAILURE(*status)) { prefix.releaseBuffer(0); fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n", startCodePoint, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; return FALSE; } prefix.releaseBuffer(prefixSize); startCodePoint = pipePointer + 1; } // Read the string which gets the CE(s) assigned. UChar *uchars = s.getBuffer(32); int32_t cSize = u_parseString(startCodePoint, uchars, s.getCapacity(), NULL, status); if(U_FAILURE(*status)) { s.releaseBuffer(0); fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n", startCodePoint, u_errorName(*status)); *status = U_INVALID_FORMAT_ERROR; return FALSE; } s.releaseBuffer(cSize); char *pointer = endCodePoint + 1; char *commentStart = strchr(pointer, '#'); if(commentStart == NULL) { commentStart = strchr(pointer, 0); } cesLength = 0; for(;;) { pointer = skipWhiteSpace(pointer); if(pointer == commentStart) { break; } if(cesLength >= 31) { fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer); *status = U_INVALID_FORMAT_ERROR; return FALSE; } ces[cesLength++] = parseCE(builder, pointer, *status); if(U_FAILURE(*status)) { fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n", buffer, u_errorName(*status)); return FALSE; } } if(s.length() == 1 && s[0] == 0xfffe) { // UCA 6.0 gives U+FFFE a special minimum weight using the // byte 02 which is the merge-sort-key separator and illegal for any // other characters. } else { // Rudimentary check for valid bytes in CE weights. // For a more comprehensive check see CollationTest::TestRootElements(), // intltest collate/CollationTest/TestRootElements for (int32_t i = 0; i < cesLength; ++i) { int64_t ce = ces[i]; UBool isCompressible = FALSE; for (int j = 7; j >= 0; --j) { uint8_t b = (uint8_t)(ce >> (j * 8)); if(j <= 1) { b &= 0x3f; } // tertiary bytes use 6 bits if (b == 1) { fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer); return FALSE; } if ((j == 7 || j == 3 || j == 1) && b == 2) { fprintf(stderr, "Warning: invalid UCA weight lead byte 02 for %s\n", buffer); return FALSE; } if (j == 7) { isCompressible = builder.isCompressibleLeadByte(b); } else if (j == 6) { // Primary second bytes 03 and FF are compression terminators. // 02, 03 and FF are usable when the lead byte is not compressible. // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible. if (isCompressible && (b <= 3 || b == 0xff)) { fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n", b, buffer); return FALSE; } } } } } return TRUE; } static void parseFractionalUCA(const char *filename, CollationBaseDataBuilder &builder, UErrorCode *status) { if(U_FAILURE(*status)) { return; } FILE *data = fopen(filename, "r"); if(data == NULL) { fprintf(stderr, "Couldn't open file: %s\n", filename); *status = U_FILE_ACCESS_ERROR; return; } uint32_t line = 0; UChar32 maxCodePoint = 0; while(!feof(data)) { if(U_FAILURE(*status)) { fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n", *status, u_errorName(*status), (int)line, filename); exit(*status); } line++; UnicodeString prefix; UnicodeString s; int64_t ces[32]; int32_t cesLength = 0; if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) { // we have read the line, now do something sensible with the read data! uint32_t p = (uint32_t)(ces[0] >> 32); if(s.length() > 1 && s[0] == 0xFDD0) { // FractionalUCA.txt contractions starting with U+FDD0 // are only entered into the inverse table, // not into the normal collation data. builder.addRootElements(ces, cesLength, *status); if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) { // Lead byte for numeric sorting. builder.setNumericPrimary(p); } } else { UChar32 c = s.char32At(0); if(c > maxCodePoint) { maxCodePoint = c; } // We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary. // CollationBaseDataBuilder::init() maps them to special CEs. // Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt. if(0xfffd <= c && c <= 0xffff) { continue; } if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) { continue; } if(0xe0000000 <= p && p < 0xf0000000) { fprintf(stderr, "Error: Unexpected mapping to an implicit or trailing primary" " on line %u of %s.\n", (int)line, filename); exit(U_INVALID_FORMAT_ERROR); } builder.add(prefix, s, ces, cesLength, *status); } } } int32_t numRanges = 0; int32_t numRangeCodePoints = 0; UChar32 rangeFirst = U_SENTINEL; UChar32 rangeLast = U_SENTINEL; uint32_t rangeFirstPrimary = 0; uint32_t rangeLastPrimary = 0; int32_t rangeStep = -1; // Detect ranges of characters in primary code point order, // with 3-byte primaries and // with consistent "step" differences between adjacent primaries. // This relies on the FractionalUCA generator using the same primary-weight incrementation. // Start at U+0180: No ranges for common Latin characters. // Go one beyond maxCodePoint in case a range ends there. for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) { UBool action; uint32_t p = builder.getLongPrimaryIfSingleCE(c); if(p != 0) { // p is a "long" (three-byte) primary. if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) { // Find the offset between the two primaries. int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries( rangeLastPrimary, p, builder.isCompressiblePrimary(p)); if(rangeFirst == rangeLast && step >= 2) { // c == rangeFirst + 1, store the "step" between range primaries. rangeStep = step; rangeLast = c; rangeLastPrimary = p; action = 0; // continue range } else if(rangeStep == step) { // Continue the range with the same "step" difference. rangeLast = c; rangeLastPrimary = p; action = 0; // continue range } else { action = 1; // maybe finish range, start a new one } } else { action = 1; // maybe finish range, start a new one } } else { action = -1; // maybe finish range, do not start a new one } if(action != 0 && rangeFirst >= 0) { // Finish a range. // Set offset CE32s for a long range, leave single CEs for a short range. UBool didSetRange = builder.maybeSetPrimaryRange( rangeFirst, rangeLast, rangeFirstPrimary, rangeStep, *status); if(U_FAILURE(*status)) { fprintf(stderr, "failure setting code point order range U+%04lx..U+%04lx " "%08lx..%08lx step %d - %s\n", (long)rangeFirst, (long)rangeLast, (long)rangeFirstPrimary, (long)rangeLastPrimary, (int)rangeStep, u_errorName(*status)); } else if(didSetRange) { int32_t rangeLength = rangeLast - rangeFirst + 1; if(beVerbose) { printf("* set code point order range U+%04lx..U+%04lx [%d] " "%08lx..%08lx step %d\n", (long)rangeFirst, (long)rangeLast, (int)rangeLength, (long)rangeFirstPrimary, (long)rangeLastPrimary, (int)rangeStep); } ++numRanges; numRangeCodePoints += rangeLength; } rangeFirst = U_SENTINEL; rangeStep = -1; } if(action > 0) { // Start a new range. rangeFirst = rangeLast = c; rangeFirstPrimary = rangeLastPrimary = p; } } printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints); // Idea: Probably best to work in two passes. // Pass 1 for reading all data, setting isCompressible flags (and reordering groups) // and finding ranges. // Then set the ranges in a newly initialized builder // for optimal compression (makes sure that adjacent blocks can overlap easily). // Then set all mappings outside the ranges. // // In the first pass, we could store mappings in a simple list, // with single-character/single-long-primary-CE mappings in a UTrie2; // or store the mappings in a temporary builder; // or we could just parse the input file again in the second pass. // // Ideally set/copy U+0000..U+017F before setting anything else, // then set default Han/Hangul, then set the ranges, then copy non-range mappings. // It should be easy to copy mappings from an un-built builder to a new one. // Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions. if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) { fprintf(stderr, "UCA version not specified. Cannot create data file!\n"); fclose(data); return; } if (beVerbose) { printf("\nLines read: %u\n", (int)line); } fclose(data); return; } static void buildAndWriteBaseData(CollationBaseDataBuilder &builder, const char *path, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) { fprintf(stderr, "error: unexpected [fixed secondary common byte]"); errorCode = U_INVALID_FORMAT_ERROR; return; } if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) { fprintf(stderr, "error: unexpected [fixed tertiary common byte]"); errorCode = U_INVALID_FORMAT_ERROR; return; } if(leadByteScripts != NULL) { uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1; do { // Find the range of lead bytes with this set of scripts. const UnicodeString &firstScripts = leadByteScripts[firstLead]; if(firstScripts.isEmpty()) { fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead); errorCode = U_INVALID_FORMAT_ERROR; return; } uint32_t lead = firstLead; for(;;) { ++lead; const UnicodeString &scripts = leadByteScripts[lead]; // The scripts should either be the same or disjoint. // We do not test if all reordering groups have disjoint sets of scripts. if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; } if(scripts != firstScripts) { fprintf(stderr, "[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] " "but not all scripts match\n", (int)firstLead, scripts[0], (int)lead); errorCode = U_INVALID_FORMAT_ERROR; return; } } // lead is one greater than the last lead byte with the same set of scripts as firstLead. builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode); if(U_FAILURE(errorCode)) { return; } firstLead = lead; } while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE); delete[] leadByteScripts; } CollationData data(*Normalizer2Factory::getNFCImpl(errorCode)); builder.enableFastLatin(); builder.build(data, errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "builder.build() failed: %s\n", u_errorName(errorCode)); return; } // The CollationSettings constructor gives us the properly encoded // default options, so that we need not duplicate them here. CollationSettings settings; UVector32 rootElements(errorCode); for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) { rootElements.addElement(0, errorCode); } builder.buildRootElementsTable(rootElements, errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n", u_errorName(errorCode)); return; } int32_t index = CollationRootElements::IX_COUNT; rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX); while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; } rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX); while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) { ++index; } rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX); rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE, CollationRootElements::IX_COMMON_SEC_AND_TER_CE); int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24; secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16; secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte"); rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES); LocalMemory buffer; int32_t capacity = 1000000; uint8_t *dest = buffer.allocateInsteadAndCopy(capacity); if(dest == NULL) { fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n", (long)capacity); errorCode = U_MEMORY_ALLOCATION_ERROR; return; } int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; int32_t totalSize = CollationDataWriter::writeBase( data, settings, rootElements.getBuffer(), rootElements.size(), indexes, dest, capacity, errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n", (long)capacity, u_errorName(errorCode)); return; } printf("*** CLDR root collation part sizes ***\n"); CollationInfo::printSizes(totalSize, indexes); printf("*** CLDR root collation size: %6ld (with file header but no copyright string)\n", (long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion); UNewDataMemory *pData=udata_create(path, "icu", "ucadata", &ucaDataInfo, withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n", path, u_errorName(errorCode)); return; } udata_writeBlock(pData, dest, totalSize); long dataLength = udata_finish(pData, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode)); return; } if(dataLength != (long)totalSize) { fprintf(stderr, "udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n", dataLength, (long)totalSize); errorCode=U_INTERNAL_PROGRAM_ERROR; } } /** * Adds each lead surrogate to the bmp set if any of the 1024 * associated supplementary code points is in the supp set. * These can be one and the same set. */ static void setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) { UChar32 c = 0x10000; for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { if(supp.containsSome(c, c + 0x3ff)) { bmp.add(lead); } } } static int32_t makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256], UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return 0; } bits[0] = 0; // no bits set bits[1] = 0xffffffff; // all bits set int32_t bitsLength = 2; int32_t i = 0; for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) { if(set.containsNone(c, c + 0x1f)) { index[i] = 0; } else if(set.contains(c, c + 0x1f)) { index[i] = 1; } else { uint32_t b = 0; for(int32_t j = 0; j <= 0x1f; ++j) { if(set.contains(c + j)) { b |= (uint32_t)1 << j; } } int32_t k; for(k = 2;; ++k) { if(k == bitsLength) { // new bit combination if(bitsLength == 256) { errorCode = U_BUFFER_OVERFLOW_ERROR; return 0; } bits[bitsLength++] = b; break; } if(bits[k] == b) { // duplicate bit combination break; } } index[i] = k; } } return bitsLength; } // TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values, // use that rather than properties APIs. // Then consider moving related logic for the unsafeBwdSet back from the loader into this builder. /** * Builds data for the FCD check fast path. * For details see the CollationFCD class comments. */ static void buildAndWriteFCDData(const char *path, UErrorCode &errorCode) { UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode); UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode); if(U_FAILURE(errorCode)) { return; } setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet); // The following supp(lccc)->lead(tccc) should be unnecessary // after the previous supp(tccc)->lead(tccc) // because there should not be any characters with lccc!=0 and tccc=0. // It is safe and harmless. setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet); setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet); uint8_t lcccIndex[0x800], tcccIndex[0x800]; uint32_t lcccBits[256], tcccBits[256]; int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode); int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode); printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4); printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4); if(U_FAILURE(errorCode)) { return; } FILE *f=usrc_create(path, "collationfcd.cpp", "icu/tools/unicode/c/genuca/genuca.cpp"); if(f==NULL) { errorCode=U_FILE_ACCESS_ERROR; return; } fputs("#include \"unicode/utypes.h\"\n\n", f); fputs("#if !UCONFIG_NO_COLLATION\n\n", f); fputs("#include \"collationfcd.h\"\n\n", f); fputs("U_NAMESPACE_BEGIN\n\n", f); usrc_writeArray(f, "const uint8_t CollationFCD::lcccIndex[%ld]={\n", lcccIndex, 8, 0x800, "\n};\n\n"); usrc_writeArray(f, "const uint32_t CollationFCD::lcccBits[%ld]={\n", lcccBits, 32, lcccBitsLength, "\n};\n\n"); usrc_writeArray(f, "const uint8_t CollationFCD::tcccIndex[%ld]={\n", tcccIndex, 8, 0x800, "\n};\n\n"); usrc_writeArray(f, "const uint32_t CollationFCD::tcccBits[%ld]={\n", tcccBits, 32, tcccBitsLength, "\n};\n\n"); fputs("U_NAMESPACE_END\n\n", f); fputs("#endif // !UCONFIG_NO_COLLATION\n", f); fclose(f); } static void parseAndWriteCollationRootData( const char *fracUCAPath, const char *binaryDataPath, const char *sourceCodePath, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } CollationBaseDataBuilder builder(errorCode); builder.init(errorCode); parseFractionalUCA(fracUCAPath, builder, &errorCode); buildAndWriteBaseData(builder, binaryDataPath, errorCode); buildAndWriteFCDData(sourceCodePath, errorCode); } // ------------------------------------------------------------------------- *** enum { HELP_H, HELP_QUESTION_MARK, VERBOSE, COPYRIGHT }; static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_VERBOSE, UOPTION_COPYRIGHT }; extern "C" int main(int argc, char* argv[]) { U_MAIN_INIT_ARGS(argc, argv); argc=u_parseArgs(argc, argv, LENGTHOF(options), options); /* error handling, printing usage message */ if(argc<0) { fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); } if( argc<2 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur ) { /* * Broken into chunks because the C89 standard says the minimum * required supported string length is 509 bytes. */ fprintf(stderr, "Usage: %s [-options] path/to/ICU/src/root\n" "\n" "Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n" "writes source and binary data files with the collation root data.\n" "\n", argv[0]); fprintf(stderr, "Options:\n" "\t-h or -? or --help this usage text\n" "\t-v or --verbose verbose output\n" "\t-c or --copyright include a copyright notice\n"); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } beVerbose=options[VERBOSE].doesOccur; withCopyright=options[COPYRIGHT].doesOccur; IcuToolErrorCode errorCode("genuca"); CharString icuSrcRoot(argv[1], errorCode); CharString icuSource(icuSrcRoot, errorCode); icuSource.appendPathPart("source", errorCode); CharString icuSourceData(icuSource, errorCode); icuSourceData.appendPathPart("data", errorCode); CharString fracUCAPath(icuSourceData, errorCode); fracUCAPath.appendPathPart("unidata", errorCode); fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode); CharString sourceDataInColl(icuSourceData, errorCode); sourceDataInColl.appendPathPart("in", errorCode); sourceDataInColl.appendPathPart("coll", errorCode); CharString sourceI18n(icuSource, errorCode); sourceI18n.appendPathPart("i18n", errorCode); errorCode.assertSuccess(); parseAndWriteCollationRootData( fracUCAPath.data(), sourceDataInColl.data(), sourceI18n.data(), errorCode); return errorCode; } #endif // UCONFIG_NO_COLLATION