/* ********************************************************************** * Copyright (C) 2002-2011, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 10/11/02 aliu Creation. * 2010nov19 Markus Scherer Rewrite for formatVersion 2. * 2011dec18 Markus Scherer Moved genpname/genpname.cpp to genprops/pnamesbuilder.cpp. ********************************************************************** */ #include "unicode/utypes.h" #include "unicode/bytestrie.h" #include "unicode/bytestriebuilder.h" #include "unicode/putil.h" #include "unicode/uclean.h" #include "charstr.h" #include "cstring.h" #include "denseranges.h" #include "genprops.h" #include "propname.h" #include "toolutil.h" #include "uhash.h" #include "uinvchar.h" #include "unewdata.h" #include "uvectr32.h" #include "writesrc.h" #include // We test for ASCII delimiters and White_Space, and build ASCII string BytesTries. #if U_CHARSET_FAMILY!=U_ASCII_FAMILY # error This builder requires U_CHARSET_FAMILY==U_ASCII_FAMILY. #endif #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) U_NAMESPACE_USE //---------------------------------------------------------------------- // BEGIN DATA // // This is the raw data to be output. We define the data structure, // then include a machine-generated header that contains the actual // data. #include "unicode/uchar.h" #include "unicode/unorm2.h" #include "unicode/uscript.h" // Dilemma: We want to use MAX_ALIASES to define fields in the Value class. // However, we need to define the class before including the data header // and we can use MAX_ALIASES only after including it. // So we define a second constant and at runtime check that it's >=MAX_ALIASES. static const int32_t VALUE_MAX_ALIASES=4; static const int32_t JOINED_ALIASES_CAPACITY=100; class Value { public: Value(int32_t enumValue, const char *joinedAliases) : enumValue(enumValue), joinedAliases(joinedAliases), count(0) { if(uprv_strlen(joinedAliases)>=JOINED_ALIASES_CAPACITY) { fprintf(stderr, "genprops error: pnamesbuilder.cpp Value::Value(%ld, \"%s\"): " "joined aliases too long: " "increase JOINED_ALIASES_CAPACITY, to at least %ld\n", (long)enumValue, joinedAliases, uprv_strlen(joinedAliases)+1); exit(U_BUFFER_OVERFLOW_ERROR); } // Copy the space-separated aliases into NUL-separated ones and count them. // Write a normalized version of each one. const char *j=joinedAliases; char *a=aliasesBuffer; char *n=normalizedBuffer; char c; do { aliases[count]=a; normalized[count++]=n; while((c=*j++)!=' ' && c!=0) { *a++=c; // Ignore delimiters '-' and '_'. if(!(c=='-' || c=='_')) { *n++=uprv_tolower(c); } } *a++=0; *n++=0; } while(c!=0); } /** * Writes at most MAX_ALIASES pointers for unique normalized aliases * (no empty strings) to dest and returns how many there are. */ int32_t getUniqueNormalizedAliases(const char *dest[]) const { int32_t numUnique=0; for(int32_t i=0; i0) { // writeValueAliases(PROPERTIES[0], ...) already done setPropertyInt(PROPERTIES[propIndex].enumValue, 0, writeValueAliases(PROPERTIES[propIndex], errorCode)); } int32_t valueCount=PROPERTIES[propIndex].valueCount; if(valueCount>0) { int32_t valueMapOffset; const Value *values=PROPERTIES[propIndex].values; if(values==VALUES_binprop) { valueMapOffset=binPropsValueMapOffset; } else if(values==VALUES_ccc || values==VALUES_lccc || values==VALUES_tccc) { valueMapOffset=cccValueMapOffset; } else { valueMapOffset=valueMaps.size(); bytesTrieOffset=buildValuesBytesTrie(values, valueCount, errorCode); valueMaps.addElement(bytesTrieOffset, errorCode); buildValueMap(values, valueCount, errorCode); } setPropertyInt(PROPERTIES[propIndex].enumValue, 1, valueMapOffset); } } // Write the indexes. int32_t offset=(int32_t)sizeof(indexes); indexes[PropNameData::IX_VALUE_MAPS_OFFSET]=offset; offset+=valueMaps.size()*4; indexes[PropNameData::IX_BYTE_TRIES_OFFSET]=offset; offset+=bytesTries.length(); indexes[PropNameData::IX_NAME_GROUPS_OFFSET]=offset; offset+=nameGroups.length(); for(i=PropNameData::IX_RESERVED3_OFFSET; i<=PropNameData::IX_TOTAL_SIZE; ++i) { indexes[i]=offset; } indexes[PropNameData::IX_MAX_NAME_LENGTH]=maxNameLength; for(i=PropNameData::IX_RESERVED7; i=0x20) { fprintf(stderr, "Error: Too many aliases in \"%s\"\n", value.joinedAliases); exit(U_INDEX_OUTOFBOUNDS_ERROR); } nameGroups.append((char)count, errorCode); // There is at least a short name (sometimes empty) and a long name. (count>=2) // Note: Sometimes the short and long names are the same. // In such a case, we could set a flag and omit the duplicate, // but that would save only about 1.35% of total data size (Unicode 6.0/ICU 4.6) // which is not worth the trouble. // Note: In Unicode 6.1, there are more duplicates due to newly added // short names for blocks and other properties. // It might now be worth changing the data structure. for(int32_t i=0; imaxNameLength) { maxNameLength=sLength; } nameGroups.append(s, sLength, errorCode); // including NUL } return nameOffset; } void buildValueMap(const Value values[], int32_t length, UErrorCode &errorCode) { UVector32 sortedValues(errorCode); UVector32 nameOffsets(errorCode); // Parallel to values[]. int32_t i; for(i=0; i0) { valueMaps.addElement(numRanges, errorCode); for(i=0; i=0 ? nameOffsets.elementAti(valueIndex) : 0; valueMaps.addElement(nameOffset, errorCode); } } } else { // No dense ranges. valueMaps.addElement(0x10+length, errorCode); for(i=0; i(bytesTries.data())); return &pnames; } private: int32_t indexes[PropNameData::IX_COUNT]; UVector32 valueMaps; BytesTrieBuilder btb; CharString bytesTries; CharString nameGroups; int32_t maxNameLength; PNamesPropertyNames pnames; UHashtable *nameGroupToOffset; }; /* UDataInfo cf. udata.h */ static const UDataInfo dataInfo = { sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, sizeof(UChar), 0, { PNAME_SIG_0, PNAME_SIG_1, PNAME_SIG_2, PNAME_SIG_3 }, { 2, 0, 0, 0 }, /* formatVersion */ UNICODE_VERSION }; void PNamesBuilderImpl::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } UNewDataMemory *pdata=udata_create(path, PNAME_DATA_TYPE, PNAME_DATA_NAME, &dataInfo, withCopyright ? U_COPYRIGHT_STRING : 0, &errorCode); if(U_FAILURE(errorCode)) { fprintf(stderr, "genprops: udata_create(%s, pnames.icu) failed - %s\n", path, u_errorName(errorCode)); return; } udata_writeBlock(pdata, indexes, PropNameData::IX_COUNT*4); udata_writeBlock(pdata, valueMaps.getBuffer(), valueMaps.size()*4); udata_writeBlock(pdata, bytesTries.data(), bytesTries.length()); udata_writeBlock(pdata, nameGroups.data(), nameGroups.length()); int32_t dataLength=(int32_t)udata_finish(pdata, &errorCode); if(dataLength!=indexes[PropNameData::IX_TOTAL_SIZE]) { fprintf(stderr, "udata_finish(pnames.icu) reports %ld bytes written but should be %ld\n", (long)dataLength, (long)indexes[PropNameData::IX_TOTAL_SIZE]); errorCode=U_INTERNAL_PROGRAM_ERROR; } } void PNamesBuilderImpl::writeCSourceFile(const char *path, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } FILE *f=usrc_createFromGenerator(path, "propname_data.h", "icu/tools/unicode/c/genprops/pnamesbuilder.cpp"); if(f==NULL) { errorCode=U_FILE_ACCESS_ERROR; return; // usrc_create() reported an error. } fputs("#ifndef INCLUDED_FROM_PROPNAME_CPP\n" "# error This file must be #included from propname.cpp only.\n" "#endif\n\n", f); fputs("U_NAMESPACE_BEGIN\n\n", f); usrc_writeArray(f, "const int32_t PropNameData::indexes[%ld]={", indexes, 32, PropNameData::IX_COUNT, "};\n\n"); usrc_writeArray(f, "const int32_t PropNameData::valueMaps[%ld]={\n", valueMaps.getBuffer(), 32, valueMaps.size(), "\n};\n\n"); usrc_writeArray(f, "const uint8_t PropNameData::bytesTries[%ld]={\n", bytesTries.data(), 8, bytesTries.length(), "\n};\n\n"); usrc_writeArrayOfMostlyInvChars( f, "const char PropNameData::nameGroups[%ld]={\n", nameGroups.data(), nameGroups.length(), "\n};\n\n"); fputs("U_NAMESPACE_END\n", f); fclose(f); } PNamesBuilder * createPNamesBuilder(UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return NULL; } PNamesBuilder *pw=new PNamesBuilderImpl(errorCode); if(pw==NULL) { errorCode=U_MEMORY_ALLOCATION_ERROR; } return pw; } // Note: The following is a partial copy of runtime propname.cpp code. // Consider changing that into a semi-public API to avoid duplication. int32_t PNamesPropertyNames::findProperty(int32_t property) const { int32_t i=1; // valueMaps index, initially after numRanges for(int32_t numRanges=valueMaps[0]; numRanges>0; --numRanges) { // Read and skip the start and limit of this range. int32_t start=valueMaps[i]; int32_t limit=valueMaps[i+1]; i+=2; if(property