/* ****************************************************************************** * * Copyright (C) 1999-2001, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: unames.c * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 1999oct04 * created by: Markus W. Scherer */ /* set import/export definitions */ #ifndef U_COMMON_IMPLEMENTATION # define U_COMMON_IMPLEMENTATION #endif #include "unicode/utypes.h" #include "unicode/uchar.h" #include "unicode/udata.h" #include "unicode/uset.h" #include "ustr_imp.h" #include "umutex.h" #include "cmemory.h" #include "cstring.h" #include "ucln_cmn.h" #include "uprops.h" /* prototypes ------------------------------------------------------------- */ #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) static const char DATA_NAME[] = "unames"; static const char DATA_TYPE[] = "icu"; #define GROUP_SHIFT 5 #define LINES_PER_GROUP (1UL<=U_CHAR_NAME_CHOICE_COUNT || bufferLength<0 || (bufferLength>0 && buffer==NULL) ) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { return u_terminateChars(buffer, bufferLength, 0, pErrorCode); } length=0; /* try algorithmic names first */ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); i=*p; algRange=(AlgorithmicRange *)(p+1); while(i>0) { if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); break; } algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); --i; } if(i==0) { if (nameChoice == U_EXTENDED_CHAR_NAME) { length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); if (!length) { /* extended character name */ length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); } } else { /* normal 
character name */ length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); } } return u_terminateChars(buffer, bufferLength, length, pErrorCode); } #define _U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT U_CAPI int32_t U_EXPORT2 u_getISOComment(UChar32 c, char *dest, int32_t destCapacity, UErrorCode *pErrorCode) { int32_t length; /* check the argument values */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return 0; } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return 0; } if((uint32_t)c>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { return u_terminateChars(dest, destCapacity, 0, pErrorCode); } /* the ISO comment is stored like a normal character name */ length=getName(uCharNames, (uint32_t)c, _U_ISO_COMMENT, dest, (uint16_t)destCapacity); return u_terminateChars(dest, destCapacity, length, pErrorCode); } U_CAPI UChar32 U_EXPORT2 u_charFromName(UCharNameChoice nameChoice, const char *name, UErrorCode *pErrorCode) { char upper[120], lower[120]; FindName findName; AlgorithmicRange *algRange; uint32_t *p; uint32_t i; UChar32 cp = 0; char c0; UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return error; } if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return error; } if(!isDataLoaded(pErrorCode)) { return error; } /* construct the uppercase and lowercase of the name first */ for(i=0; i') { for (--i; lower[i] && lower[i] != '-'; --i); if (lower[i] == '-') { /* We've got a category. */ uint32_t cIdx; lower[i] = 0; for (++i; lower[i] != '>'; ++i) { if (lower[i] >= '0' && lower[i] <= '9') { cp = (cp << 4) + lower[i] - '0'; } else if (lower[i] >= 'a' && lower[i] <= 'f') { cp = (cp << 4) + lower[i] - 'a' + 10; } else { *pErrorCode = U_ILLEGAL_CHAR_FOUND; return error; } } /* Now validate the category name. 
We could use a binary search, or a trie, if we really wanted to. */ for (lower[i] = 0, cIdx = 0; cIdx < LENGTHOF(charCatNames); ++cIdx) { if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { if (getCharCat(cp) == cIdx) { return cp; } break; } } } } } *pErrorCode = U_ILLEGAL_CHAR_FOUND; return error; } /* try algorithmic names now */ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); i=*p; algRange=(AlgorithmicRange *)(p+1); while(i>0) { if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { return cp; } algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); --i; } /* normal character name */ findName.otherName=upper; findName.code=error; enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); if (findName.code == error) { *pErrorCode = U_ILLEGAL_CHAR_FOUND; } return findName.code; } U_CAPI void U_EXPORT2 u_enumCharNames(UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice, UErrorCode *pErrorCode) { AlgorithmicRange *algRange; uint32_t *p; uint32_t i; if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { return; } if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; return; } if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { limit = UCHAR_MAX_VALUE + 1; } if((uint32_t)start>=(uint32_t)limit) { return; } if(!isDataLoaded(pErrorCode)) { return; } /* interleave the data-driven ones with the algorithmic ones */ /* iterate over all algorithmic ranges; assume that they are in ascending order */ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); i=*p; algRange=(AlgorithmicRange *)(p+1); while(i>0) { /* enumerate the character names before the current algorithmic range */ /* here: startstart) { if((uint32_t)limit<=algRange->start) { enumNames(uCharNames, start, limit, fn, context, nameChoice); return; } if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { return; } start=(UChar32)algRange->start; } 
/* enumerate the character names in the current algorithmic range */ /* here: algRange->start<=startend) { if((uint32_t)limit<=(algRange->end+1)) { enumAlgNames(algRange, start, limit, fn, context, nameChoice); return; } if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { return; } start=(UChar32)algRange->end+1; } /* continue to the next algorithmic range (here: startsize); --i; } /* enumerate the character names after the last algorithmic range */ enumNames(uCharNames, start, limit, fn, context, nameChoice); } /* implementation ----------------------------------------------------------- */ UBool unames_cleanup() { if(uCharNamesData) { udata_close(uCharNamesData); uCharNamesData = NULL; } if(uCharNames) { uCharNames = NULL; } gMaxNameLength=0; return TRUE; } static UBool isDataLoaded(UErrorCode *pErrorCode) { /* load UCharNames from file if necessary */ if(uCharNames==NULL) { UCharNames *names; UDataMemory *data; /* check error code from previous attempt */ if(U_FAILURE(gLoadErrorCode)) { *pErrorCode=gLoadErrorCode; return FALSE; } /* open the data outside the mutex block */ data=udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, pErrorCode); if(U_FAILURE(*pErrorCode)) { gLoadErrorCode=*pErrorCode; return FALSE; } names=(UCharNames *)udata_getMemory(data); /* in the mutex block, set the data for this process */ { umtx_lock(NULL); if(uCharNames==NULL) { uCharNames=names; uCharNamesData=data; data=NULL; names=NULL; } umtx_unlock(NULL); } /* if a different thread set it first, then close the extra data */ if(data!=NULL) { udata_close(data); /* NULL if it was set correctly */ } } return TRUE; } static UBool U_CALLCONV isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo) { return (UBool)( pInfo->size>=20 && pInfo->isBigEndian==U_IS_BIG_ENDIAN && pInfo->charsetFamily==U_CHARSET_FAMILY && pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ pInfo->dataFormat[1]==0x6e && 
pInfo->dataFormat[2]==0x61 && pInfo->dataFormat[3]==0x6d && pInfo->formatVersion[0]==1); } /* * getGroup() does a binary search for the group that contains the * Unicode code point "code". * The return value is always a valid Group* that may contain "code" * or else is the highest group before "code". * If the lowest group is after "code", then that one is returned. */ static Group * getGroup(UCharNames *names, uint32_t code) { uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), start=0, limit=*(uint16_t *)((char *)names+names->groupsOffset), number; Group *groups=(Group *)((char *)names+names->groupsOffset+2); /* binary search for the group of names that contains the one for code */ while(start>GROUP_SHIFT)==group->groupMSB) { return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, buffer, bufferLength); } else { /* group not found */ /* zero-terminate */ if(bufferLength>0) { *buffer=0; } return 0; } } /* * expandGroupLengths() reads a block of compressed lengths of 32 strings and * expands them into offsets and lengths for each string. * Lengths are stored with a variable-width encoding in consecutive nibbles: * If a nibble<0xc, then it is the length itself (0=empty string). * If a nibble>=0xc, then it forms a length value with the following nibble. * Calculation see below. * The offsets and lengths arrays must be at least 33 (one more) long because * there is no check here at the end if the last nibble is still used. 
*/ static const uint8_t * expandGroupLengths(const uint8_t *s, uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { /* read the lengths of the 32 strings in this group and get each string's offset */ uint16_t i=0, offset=0, length=0; uint8_t lengthByte; /* all 32 lengths must be read to get the offset of the first group string */ while(i=12) { /* double-nibble length spread across two bytes */ length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); lengthByte&=0xf; } else if((lengthByte /* &0xf0 */)>=0xc0) { /* double-nibble length spread across this one byte */ length=(uint16_t)((lengthByte&0x3f)+12); } else { /* single-nibble length in MSBs */ length=(uint16_t)(lengthByte>>4); lengthByte&=0xf; } *offsets++=offset; *lengths++=length; offset+=length; ++i; /* read odd nibble - LSBs of lengthByte */ if((lengthByte&0xf0)==0) { /* this nibble was not consumed for a double-nibble length above */ length=lengthByte; if(length<12) { /* single-nibble length in LSBs */ *offsets++=offset; *lengths++=length; offset+=length; ++i; } } else { length=0; /* prevent double-nibble detection in the next iteration */ } } /* now, s is at the first group string */ return s; } static uint16_t expandGroupName(UCharNames *names, Group *group, uint16_t lineNumber, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; const uint8_t *s=(uint8_t *)names+names->groupStringOffset+ (group->offsetHigh<<16|group->offsetLow); s=expandGroupLengths(s, offsets, lengths); return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, buffer, bufferLength); } #define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \ if((bufferLength)>0) { \ *(buffer)++=c; \ --(bufferLength); \ } \ ++(bufferPos); \ } /* * Important: expandName() and compareName() are almost the same - * apply fixes to both. 
*
 * UnicodeData.txt uses ';' as a field separator, so no
 * field can contain ';' as part of its contents.
 * In unames.dat, it is marked as token[';']==-1 only if the
 * semicolon is used in the data file - which is iff we
 * have Unicode 1.0 names or ISO comments.
 * So, it will be token[';']==-1 if we store U1.0 names/ISO comments
 * although we know that it will never be part of a name.
 *
 * expandName() decodes one stored line (name, nameLength bytes) into buffer.
 *
 * The token table is read from names at (uint16_t *)names+8: the first
 * uint16_t is the token count, followed by one entry per token number.
 * For each input byte c:
 *   c>=tokenCount        c is written as a literal character
 *   tokens[c]==-2        c is the lead byte of a double-byte token; the
 *                        next input byte is consumed to form the index
 *   tokens[...]==-1      c is written as a literal character
 *   otherwise            the entry is an offset into the token strings at
 *                        names+tokenStringOffset; that NUL-terminated word
 *                        is written
 * A ';' ends the current field.
 *
 * For U_UNICODE_10_CHAR_NAME one leading field is skipped, for
 * _U_ISO_COMMENT two, provided ';' is stored as a character; otherwise
 * nothing is written.
 *
 * The output is zero-terminated if bufferLength is still >0 at the end.
 * Returns bufferPos, which WRITE_CHAR increments for every output character
 * whether or not it was stored.
 */
static uint16_t
expandName(UCharNames *names,
           const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
           char *buffer, uint16_t bufferLength) {
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++, bufferPos=0;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;

    if(nameChoice==U_UNICODE_10_CHAR_NAME || nameChoice==_U_ISO_COMMENT) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            while(nameLength>0) {
                --nameLength;
                if(*name++==';') {
                    break;
                }
            }
            if(nameChoice==_U_ISO_COMMENT) {
                /* skip the Unicode 1.0 name as well to get the ISO comment */
                while(nameLength>0) {
                    --nameLength;
                    if(*name++==';') {
                        break;
                    }
                }
            }
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested Unicode 1.0 name here
             */
            nameLength=0;
        }
    }

    /* write each letter directly, and write a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            if(c!=';') {
                /* implicit letter */
                WRITE_CHAR(buffer, bufferLength, bufferPos, c);
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /*
                 * NOTE(review): the trail byte is consumed without checking
                 * nameLength>0 first; this presumably relies on well-formed
                 * data - confirm. With nameLength==0 the decrement would
                 * wrap the unsigned counter.
                 */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* write token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    WRITE_CHAR(buffer, bufferLength, bufferPos, c);
                }
            }
        }
    }

    /* zero-terminate */
    if(bufferLength>0) {
        *buffer=0;
    }

    return bufferPos;
}

/*
 * compareName() is almost the same as expandName() except that it compares
 * the currently expanded name to an input name.
 * It returns the match/no match result as soon as possible.
 *
 * Differences from expandName() visible here: only U_UNICODE_10_CHAR_NAME
 * triggers skipping of the leading field (there is no ISO comment case),
 * and "nothing emitted yet" is detected by otherName==origOtherName rather
 * than by an output counter.
 *
 * Returns TRUE only if every expanded character equals the corresponding
 * character of otherName and otherName ends exactly where the expanded
 * name ends.
 */
static UBool
compareName(UCharNames *names,
            const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice,
            const char *otherName) {
    uint16_t *tokens=(uint16_t *)names+8;
    uint16_t token, tokenCount=*tokens++;
    uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset;
    uint8_t c;
    const char *origOtherName = otherName;

    if(nameChoice==U_UNICODE_10_CHAR_NAME) {
        /*
         * skip the modern name if it is not requested _and_
         * if the semicolon byte value is a character, not a token number
         */
        if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
            while(nameLength>0) {
                --nameLength;
                if(*name++==';') {
                    break;
                }
            }
        } else {
            /*
             * the semicolon byte value is a token number, therefore
             * only modern names are stored in unames.dat and there is no
             * such requested Unicode 1.0 name here
             */
            nameLength=0;
        }
    }

    /* compare each letter directly, and compare a token word per token */
    while(nameLength>0) {
        --nameLength;
        c=*name++;

        if(c>=tokenCount) {
            if(c!=';') {
                /* implicit letter */
                if((char)c!=*otherName++) {
                    return FALSE;
                }
            } else {
                /* finished */
                break;
            }
        } else {
            token=tokens[c];
            if(token==(uint16_t)(-2)) {
                /* this is a lead byte for a double-byte token */
                /* NOTE(review): same unchecked trail-byte read as in expandName() - confirm data guarantees */
                token=tokens[c<<8|*name++];
                --nameLength;
            }
            if(token==(uint16_t)(-1)) {
                if(c!=';') {
                    /* explicit letter */
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                } else {
                    /* stop, but skip the semicolon if we are seeking
                       extended names and there was no 2.0 name but there
                       is a 1.0 name. */
                    if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) {
                        if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) {
                            continue;
                        }
                    }
                    /* finished */
                    break;
                }
            } else {
                /* compare token word */
                uint8_t *tokenString=tokenStrings+token;
                while((c=*tokenString++)!=0) {
                    if((char)c!=*otherName++) {
                        return FALSE;
                    }
                }
            }
        }
    }

    /* complete match? */
    return (UBool)(*otherName==0);
}

/*
 * enumGroupNames() enumerates all the names in a 32-group
 * and either calls the enumerator function or finds a given input name.
 *
 * start..end is an inclusive code point range; only the low bits
 * (start&GROUP_MASK) select the line within the group.
 *
 * If fn is a real callback (fn!=DO_FIND_NAME), each non-empty name is
 * expanded into a local buffer and passed to fn; a FALSE return from fn
 * stops the enumeration and is propagated.
 * If fn==DO_FIND_NAME, context is treated as a FindName: each line is
 * compared with its otherName, and on the first match the code point is
 * stored in its code field and FALSE is returned to stop the search.
 *
 * Returns TRUE if the whole range was processed without being stopped.
 */
static UBool
enumGroupNames(UCharNames *names, Group *group,
               UChar32 start, UChar32 end,
               UEnumCharNamesFn *fn, void *context,
               UCharNameChoice nameChoice) {
    uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2];
    const uint8_t *s=(uint8_t *)names+names->groupStringOffset+
                                      (group->offsetHigh<<16|group->offsetLow);

    s=expandGroupLengths(s, offsets, lengths);
    if(fn!=DO_FIND_NAME) {
        char buffer[200];
        uint16_t length;

        while(start<=end) {
            length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer));
            /* no stored name: fall back to the generated extended name */
            if (!length && nameChoice == U_EXTENDED_CHAR_NAME) {
                buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0;
            }
            /* here, we assume that the buffer is large enough */
            /*
             * NOTE(review): expandName() returns the full name length even
             * when the name was truncated to sizeof(buffer), so a name of
             * 200 or more characters would hand fn a length larger than
             * what buffer holds - confirm that no such name exists.
             */
            if(length>0) {
                if(!fn(context, start, nameChoice, buffer, length)) {
                    return FALSE;
                }
            }
            ++start;
        }
    } else {
        const char *otherName=((FindName *)context)->otherName;
        while(start<=end) {
            if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) {
                ((FindName *)context)->code=start;
                return FALSE;
            }
            ++start;
        }
    }
    return TRUE;
}

/*
 * enumExtNames enumerate extended names.
 * It only needs to do it if it is called with a real function and not
 * with the dummy DO_FIND_NAME, because u_charFromName() does a check
 * for extended names by itself.
*/ static UBool enumExtNames(UChar32 start, UChar32 end, UEnumCharNamesFn *fn, void *context) { if(fn!=DO_FIND_NAME) { char buffer[200]; uint16_t length; while(start<=end) { buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; /* here, we assume that the buffer is large enough */ if(length>0) { if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { return FALSE; } } ++start; } } return TRUE; } static UBool enumNames(UCharNames *names, UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice) { uint16_t startGroupMSB, endGroupMSB, groupCount; Group *group, *groupLimit; startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); /* find the group that contains start, or the highest before it */ group=getGroup(names, start); if(startGroupMSB==endGroupMSB) { if(startGroupMSB==group->groupMSB) { /* if start and limit-1 are in the same group, then enumerate only in that one */ return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); } } else { groupCount=*(uint16_t *)((char *)names+names->groupsOffset); groupLimit=(Group *)((char *)names+names->groupsOffset+2)+groupCount; if(startGroupMSB==group->groupMSB) { /* enumerate characters in the partial start group */ if((start&GROUP_MASK)!=0) { if(!enumGroupNames(names, group, start, ((UChar32)startGroupMSB<group->groupMSB) { /* make sure that we start enumerating with the first group after start */ if (group + 1 < groupLimit && (group + 1)->groupMSB > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT; if (end > limit) { end = limit; } if (!enumExtNames(start, end - 1, fn, context)) { return FALSE; } } ++group; } /* enumerate entire groups between the start- and end-groups */ while(groupgroupMSBgroupMSB<groupMSB > group->groupMSB + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { UChar32 end = (group + 1)->groupMSB << GROUP_SHIFT; if (end > limit) { end = limit; } if 
(!enumExtNames((group->groupMSB + 1) << GROUP_SHIFT, end - 1, fn, context)) { return FALSE; } } ++group; } /* enumerate within the end group (group->groupMSB==endGroupMSB) */ if(groupgroupMSB==endGroupMSB) { return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { UChar32 next = ((group - 1)->groupMSB + 1) << GROUP_SHIFT; if (next > start) { start = next; } } else { return TRUE; } } /* we have not found a group, which means everything is made of extended names. */ if (nameChoice == U_EXTENDED_CHAR_NAME) { if (limit > UCHAR_MAX_VALUE + 1) { limit = UCHAR_MAX_VALUE + 1; } return enumExtNames(start, limit - 1, fn, context); } return TRUE; } /* * Important: * Parts of findAlgName() are almost the same as some of getAlgName(). * Fixes must be applied to both. */ static uint16_t getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, char *buffer, uint16_t bufferLength) { uint16_t bufferPos=0; /* * Do not write algorithmic Unicode 1.0 names because * Unihan names are the same as the modern ones, * extension A was only introduced with Unicode 3.0, and * the Hangul syllable block was moved and changed around Unicode 1.1.5. 
*/ if(nameChoice==U_UNICODE_10_CHAR_NAME) { /* zero-terminate */ if(bufferLength>0) { *buffer=0; } return 0; } switch(range->type) { case 0: { /* name = prefix hex-digits */ const char *s=(const char *)(range+1); char c; uint16_t i, count; /* copy prefix */ while((c=*s++)!=0) { WRITE_CHAR(buffer, bufferLength, bufferPos, c); } /* write hexadecimal code point value */ count=range->variant; /* zero-terminate */ if(count0;) { if(--i>=4; } bufferPos+=count; break; } case 1: { /* name = prefix factorized-elements */ uint16_t indexes[8]; const uint16_t *factors=(const uint16_t *)(range+1); uint16_t count=range->variant; const char *s=(const char *)(factors+count); char c; /* copy prefix */ while((c=*s++)!=0) { WRITE_CHAR(buffer, bufferLength, bufferPos, c); } bufferPos+=writeFactorSuffix(factors, count, s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); break; } default: /* undefined type */ /* zero-terminate */ if(bufferLength>0) { *buffer=0; } break; } return bufferPos; } static uint16_t writeFactorSuffix(const uint16_t *factors, uint16_t count, const char *s, /* suffix elements */ uint32_t code, uint16_t indexes[8], /* output fields from here */ const char *elementBases[8], const char *elements[8], char *buffer, uint16_t bufferLength) { uint16_t i, factor, bufferPos=0; char c; /* write elements according to the factors */ /* * the factorized elements are determined by modulo arithmetic * with the factors of this algorithm * * note that for fewer operations, count is decremented here */ --count; for(i=count; i>0; --i) { factor=factors[i]; indexes[i]=(uint16_t)(code%factor); code/=factor; } /* * we don't need to calculate the last modulus because start<=code<=end * guarantees here that code<=factors[0] */ indexes[0]=(uint16_t)code; /* write each element */ for(;;) { if(elementBases!=NULL) { *elementBases++=s; } /* skip indexes[i] strings */ factor=indexes[i]; while(factor>0) { while(*s++!=0) {} --factor; } if(elements!=NULL) { *elements++=s; } /* write 
element */ while((c=*s++)!=0) { WRITE_CHAR(buffer, bufferLength, bufferPos, c); } /* we do not need to perform the rest of this loop for i==count - break here */ if(i>=count) { break; } /* skip the rest of the strings for this factors[i] */ factor=(uint16_t)(factors[i]-indexes[i]-1); while(factor>0) { while(*s++!=0) {} --factor; } ++i; } /* zero-terminate */ if(bufferLength>0) { *buffer=0; } return bufferPos; } /* * Important: enumAlgNames() and findAlgName() are almost the same. * Any fix must be applied to both. */ static UBool enumAlgNames(AlgorithmicRange *range, UChar32 start, UChar32 limit, UEnumCharNamesFn *fn, void *context, UCharNameChoice nameChoice) { char buffer[200]; uint16_t length; if(nameChoice==U_UNICODE_10_CHAR_NAME) { return TRUE; } switch(range->type) { case 0: { char *s, *end; char c; /* get the full name of the start character */ length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); if(length<=0) { return TRUE; } /* call the enumerator function with this first character */ if(!fn(context, start, nameChoice, buffer, length)) { return FALSE; } /* go to the end of the name; all these names have the same length */ end=buffer; while(*end!=0) { ++end; } /* enumerate the rest of the names */ while(++startvariant; const char *s=(const char *)(factors+count); char *suffix, *t; uint16_t prefixLength, i, index; char c; /* name = prefix factorized-elements */ /* copy prefix */ suffix=buffer; prefixLength=0; while((c=*s++)!=0) { *suffix++=c; ++prefixLength; } /* append the suffix of the start character */ length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, s, (uint32_t)start-range->start, indexes, elementBases, elements, suffix, (uint16_t)(sizeof(buffer)-prefixLength))); /* call the enumerator function with this first character */ if(!fn(context, start, nameChoice, buffer, length)) { return FALSE; } /* enumerate the rest of the names */ while(++starttype) { case 0: { /* name = prefix hex-digits */ const char *s=(const 
char *)(range+1); char c; uint16_t i, count; /* compare prefix */ while((c=*s++)!=0) { if((char)c!=*otherName++) { return 0xffff; } } /* read hexadecimal code point value */ count=range->variant; code=0; for(i=0; istart<=(uint32_t)code && (uint32_t)code<=range->end) { return code; } break; } case 1: { char buffer[64]; uint16_t indexes[8]; const char *elementBases[8], *elements[8]; const uint16_t *factors=(const uint16_t *)(range+1); uint16_t count=range->variant; const char *s=(const char *)(factors+count), *t; UChar32 start, limit; uint16_t i, index; char c; /* name = prefix factorized-elements */ /* compare prefix */ while((c=*s++)!=0) { if((char)c!=*otherName++) { return 0xffff; } } start=(UChar32)range->start; limit=(UChar32)(range->end+1); /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ writeFactorSuffix(factors, count, s, 0, indexes, elementBases, elements, buffer, sizeof(buffer)); /* compare the first suffix */ if(0==uprv_strcmp(otherName, buffer)) { return start; } /* enumerate and compare the rest of the suffixes */ while(++start= LENGTHOF(charCatNames)) { return "unknown"; } else { return charCatNames[cat]; } } static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { const char *catname = getCharCatName(code); uint16_t length = 0; UChar32 cp; int ndigits, i; WRITE_CHAR(buffer, bufferLength, length, '<'); while (catname[length - 1]) { WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); } WRITE_CHAR(buffer, bufferLength, length, '-'); for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) ; if (ndigits < 4) ndigits = 4; for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { uint8_t v = (uint8_t)(cp & 0xf); buffer[--i] = (v < 10 ? 
'0' + v : 'A' + v - 10); } buffer += ndigits; length += ndigits; WRITE_CHAR(buffer, bufferLength, length, '>'); return length; } /* sets of name characters, maximum name lengths ---------------------------- */ #define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) #define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) static int32_t calcStringSetLength(uint32_t set[8], const char *s) { int32_t length=0; char c; while((c=*s++)!=0) { SET_ADD(set, c); ++length; } return length; } static int32_t calcAlgNameSetsLengths(int32_t maxNameLength) { AlgorithmicRange *range; uint32_t *p; uint32_t rangeCount; int32_t length; /* enumerate algorithmic ranges */ p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); rangeCount=*p; range=(AlgorithmicRange *)(p+1); while(rangeCount>0) { switch(range->type) { case 0: /* name = prefix + (range->variant times) hex-digits */ /* prefix */ length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; if(length>maxNameLength) { maxNameLength=length; } break; case 1: { /* name = prefix factorized-elements */ const uint16_t *factors=(const uint16_t *)(range+1); const char *s; int32_t i, count=range->variant, factor, factorLength, maxFactorLength; /* prefix length */ s=(const char *)(factors+count); length=calcStringSetLength(gNameSet, s); s+=length+1; /* start of factor suffixes */ /* get the set and maximum factor suffix length for each factor */ for(i=0; i0; --factor) { factorLength=calcStringSetLength(gNameSet, s); s+=factorLength+1; if(factorLength>maxFactorLength) { maxFactorLength=factorLength; } } length+=maxFactorLength; } if(length>maxNameLength) { maxNameLength=length; } break; } default: /* unknown type */ break; } range=(AlgorithmicRange *)((uint8_t *)range+range->size); --rangeCount; } return maxNameLength; } static int32_t calcExtNameSetsLengths(int32_t maxNameLength) { int32_t i, length; for(i=0; i * 1 for - * 6 for most hex digits per code 
point */ length=9+calcStringSetLength(gNameSet, charCatNames[i]); if(length>maxNameLength) { maxNameLength=length; } } return maxNameLength; } static int32_t calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, uint32_t set[8], const uint8_t **pLine, const uint8_t *lineLimit) { const uint8_t *line=*pLine; int32_t length=0, tokenLength; uint16_t c, token; while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { if(c>=tokenCount) { /* implicit letter */ SET_ADD(set, c); ++length; } else { token=tokens[c]; if(token==(uint16_t)(-2)) { /* this is a lead byte for a double-byte token */ c=c<<8|*line++; token=tokens[c]; } if(token==(uint16_t)(-1)) { /* explicit letter */ SET_ADD(set, c); ++length; } else { /* count token word */ if(tokenLengths!=NULL) { /* use cached token length */ tokenLength=tokenLengths[c]; if(tokenLength==0) { tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); tokenLengths[c]=(int8_t)tokenLength; } } else { tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); } length+=tokenLength; } } } *pLine=line; return length; } static void calcGroupNameSetsLengths(int32_t maxNameLength) { uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; uint16_t *tokens=(uint16_t *)uCharNames+8; uint16_t tokenCount=*tokens++; uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; int8_t *tokenLengths; uint16_t *groups; Group *group; const uint8_t *s, *line, *lineLimit; int32_t maxISOCommentLength=0; int32_t groupCount, lineNumber, length; tokenLengths=(int8_t *)uprv_malloc(tokenCount); if(tokenLengths!=NULL) { uprv_memset(tokenLengths, 0, tokenCount); } groups=(uint16_t *)((char *)uCharNames+uCharNames->groupsOffset); groupCount=*groups++; group=(Group *)groups; /* enumerate all groups */ while(groupCount>0) { s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+ ((int32_t)group->offsetHigh<<16|group->offsetLow); s=expandGroupLengths(s, offsets, 
lengths); /* enumerate all lines in each group */ for(lineNumber=0; lineNumbermaxNameLength) { maxNameLength=length; } if(line==lineLimit) { continue; } /* read Unicode 1.0 name */ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); if(length>maxNameLength) { maxNameLength=length; } if(line==lineLimit) { continue; } /* read ISO comment */ length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit); if(length>maxISOCommentLength) { maxISOCommentLength=length; } } ++group; --groupCount; } if(tokenLengths!=NULL) { uprv_free(tokenLengths); } /* set gMax... - name length last for threading */ gMaxISOCommentLength=maxISOCommentLength; gMaxNameLength=maxNameLength; } static UBool calcNameSetsLengths(UErrorCode *pErrorCode) { static const char extChars[]="0123456789ABCDEF<>-"; int32_t i, maxNameLength; if(gMaxNameLength!=0) { return TRUE; } if(!isDataLoaded(pErrorCode)) { return FALSE; } /* set hex digits, used in various names, and <>-, used in extended names */ for(i=0; i