From 295dc24d64b5e92ed7d277e406f16739ce127d74 Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Thu, 25 Oct 2007 17:05:36 +0000 Subject: [PATCH] ICU-5987 merge small-conversion-file feature into trunk, from svn merge -r 22780:22805 .../branches/markus/smallcnv X-SVN-Rev: 22852 --- icu4c/source/common/ucnv_bld.c | 48 ++- icu4c/source/common/ucnv_cnv.h | 13 + icu4c/source/common/ucnvmbcs.c | 472 ++++++++++++++++++++--- icu4c/source/common/ucnvmbcs.h | 108 ++++-- icu4c/source/test/testdata/Makefile.in | 2 +- icu4c/source/test/testdata/testdata.mak | 10 +- icu4c/source/tools/makeconv/gencnvex.c | 4 +- icu4c/source/tools/makeconv/genmbcs.c | 212 +++++++--- icu4c/source/tools/makeconv/genmbcs.h | 13 +- icu4c/source/tools/makeconv/makeconv.c | 59 ++- icu4c/source/tools/makeconv/makeconv.h | 3 +- icu4c/source/tools/toolutil/pkgitems.cpp | 16 +- 12 files changed, 784 insertions(+), 176 deletions(-) diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index f627d6b823..943a1ca507 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -1261,6 +1261,9 @@ ucnv_swap(const UDataSwapper *ds, const _MBCSHeader *inMBCSHeader; _MBCSHeader *outMBCSHeader; _MBCSHeader mbcsHeader; + uint32_t mbcsHeaderLength; + UBool noFromU=FALSE; + uint8_t outputType; int32_t maxFastUChar, mbcsIndexLength; @@ -1350,7 +1353,15 @@ ucnv_swap(const UDataSwapper *ds, *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return 0; } - if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) { + if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) { + mbcsHeaderLength=MBCS_HEADER_V4_LENGTH; + } else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 && + ((mbcsHeader.options=ds->readUInt32(inMBCSHeader->options))& + MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0 + ) { + mbcsHeaderLength=mbcsHeader.options&MBCS_OPT_LENGTH_MASK; + noFromU=(UBool)((mbcsHeader.options&MBCS_OPT_NO_FROM_U)!=0); + } else { udata_printError(ds, "ucnv_swap(): unsupported 
_MBCSHeader.version %d.%d\n", inMBCSHeader->version[0], inMBCSHeader->version[1]); *pErrorCode=U_UNSUPPORTED_ERROR; @@ -1365,9 +1376,15 @@ ucnv_swap(const UDataSwapper *ds, mbcsHeader.offsetFromUBytes= ds->readUInt32(inMBCSHeader->offsetFromUBytes); mbcsHeader.flags= ds->readUInt32(inMBCSHeader->flags); mbcsHeader.fromUBytesLength= ds->readUInt32(inMBCSHeader->fromUBytesLength); + /* mbcsHeader.options have been read above */ extOffset=(int32_t)(mbcsHeader.flags>>8); outputType=(uint8_t)mbcsHeader.flags; + if(noFromU && outputType==MBCS_OUTPUT_1) { + udata_printError(ds, "ucnv_swap(): unsupported combination of makeconv --small with SBCS\n"); + *pErrorCode=U_UNSUPPORTED_ERROR; + return 0; + } /* make sure that the output type is known */ switch(outputType) { @@ -1406,7 +1423,10 @@ ucnv_swap(const UDataSwapper *ds, } if(extOffset==0) { - size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsHeader.fromUBytesLength+mbcsIndexLength); + size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsIndexLength); + if(!noFromU) { + size+=(int32_t)mbcsHeader.fromUBytesLength; + } /* avoid compiler warnings - not otherwise necessary, and the value does not matter */ inExtIndexes=NULL; @@ -1436,8 +1456,9 @@ ucnv_swap(const UDataSwapper *ds, uprv_memcpy(outBytes, inBytes, size); } - /* swap the MBCSHeader */ - ds->swapArray32(ds, &inMBCSHeader->countStates, 7*4, + /* swap the MBCSHeader, except for the version field */ + count=mbcsHeaderLength*4; + ds->swapArray32(ds, &inMBCSHeader->countStates, count-4, &outMBCSHeader->countStates, pErrorCode); if(outputType==MBCS_OUTPUT_EXT_ONLY) { @@ -1447,18 +1468,23 @@ ucnv_swap(const UDataSwapper *ds, */ /* swap the base name, between the header and the extension data */ - ds->swapInvChars(ds, inMBCSHeader+1, (int32_t)uprv_strlen((const char *)(inMBCSHeader+1)), - outMBCSHeader+1, pErrorCode); + const char *inBaseName=(const char *)inBytes+count; + char *outBaseName=(char *)outBytes+count; + ds->swapInvChars(ds, inBaseName, (int32_t)uprv_strlen(inBaseName), 
+ outBaseName, pErrorCode); } else { /* normal file with base table data */ /* swap the state table, 1kB per state */ - ds->swapArray32(ds, inMBCSHeader+1, (int32_t)(mbcsHeader.countStates*1024), - outMBCSHeader+1, pErrorCode); + offset=count; + count=mbcsHeader.countStates*1024; + ds->swapArray32(ds, inBytes+offset, (int32_t)count, + outBytes+offset, pErrorCode); /* swap the toUFallbacks[] */ - offset=sizeof(_MBCSHeader)+mbcsHeader.countStates*1024; - ds->swapArray32(ds, inBytes+offset, (int32_t)(mbcsHeader.countToUFallbacks*8), + offset+=count; + count=mbcsHeader.countToUFallbacks*8; + ds->swapArray32(ds, inBytes+offset, (int32_t)count, outBytes+offset, pErrorCode); /* swap the unicodeCodeUnits[] */ @@ -1495,7 +1521,7 @@ ucnv_swap(const UDataSwapper *ds, /* stage 3/result bytes: sometimes uint16_t[] or uint32_t[] */ offset=mbcsHeader.offsetFromUBytes; - count=mbcsHeader.fromUBytesLength; + count= noFromU ? 0 : mbcsHeader.fromUBytesLength; switch(outputType) { case MBCS_OUTPUT_2: case MBCS_OUTPUT_3_EUC: diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index cf612a754d..a51faaf26a 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -175,6 +175,19 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv, int32_t *pBufferSize, UErrorCode *status); +/** + * Filters for some ucnv_getUnicodeSet() implementation code. + */ +typedef enum UConverterSetFilter { + UCNV_SET_FILTER_NONE, + UCNV_SET_FILTER_DBCS_ONLY, + UCNV_SET_FILTER_2022_CN, + UCNV_SET_FILTER_SJIS, + UCNV_SET_FILTER_GR94DBCS, + UCNV_SET_FILTER_HZ, + UCNV_SET_FILTER_COUNT +} UConverterSetFilter; + /** * Fills the set of Unicode code points that can be converted by an ICU converter. 
* The API function ucnv_getUnicodeSet() clears the USet before calling diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 10dbe74daa..06f2644a1c 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -61,9 +61,47 @@ #define MBCS_UNROLL_SINGLE_FROM_BMP 0 /* - * _MBCSHeader version 4.3 + * _MBCSHeader versions 5.3 & 4.3 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) * + * This version is optional. Version 5 is used for incompatible data format changes. + * makeconv will continue to generate version 4 files if possible. + * + * Changes from version 4: + * + * The main difference is an additional _MBCSHeader field with + * - the length (number of uint32_t) of the _MBCSHeader + * - flags for further incompatible data format changes + * - flags for further, backward compatible data format changes + * + * The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from + * the file and needs to be reconstituted at load time. + * This requires a utf8Friendly format with an additional mbcsIndex table for fast + * (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar. + * (For details about these structures see below, and see ucnvmbcs.h.) + * + * utf8Friendly also implies that the fromUnicode mappings are stored in ascending order + * of the Unicode code points. (This requires that the .ucm file has the |0 etc. + * precision markers for all mappings.) + * + * All fallbacks have been moved to the extension table, leaving only roundtrips in the + * omitted data that can be reconstituted from the toUnicode data. + * + * Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted. 
+ * With only roundtrip mappings in the base fromUnicode data, this part is fully + * redundant with the mbcsIndex and will be reconstituted from that (also using the + * stage 1 table which contains the information about how stage 2 was compacted). + * + * The rest of the stage 2 table, the part for code points above maxFastUChar, + * is stored in the file and will be appended to the reconstituted part. + * + * The entire fromUBytes array is omitted from the file and will be reconstituted. + * This is done by enumerating all toUnicode roundtrip mappings, performing + * each mapping (using the stage 1 and reconstituted stage 2 tables) and + * writing instead of reading the byte values. + * + * _MBCSHeader version 4.3 + * * Change from version 4.2: * - Optional utf8Friendly data structures, with 64-entry stage 3 block * allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS @@ -362,101 +400,240 @@ gb18030Ranges[13][4]={ /* Miscellaneous ------------------------------------------------------------ */ -#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ +/** + * Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from + * consecutive sequences of bytes, starting from the one encoded in value, + * to Unicode code points. (Multiple mappings to reduce per-function call overhead.) + * Does not currently support m:n mappings or reverse fallbacks. + * This function will not be called for sequences of bytes with leading zeros. 
+ * + * @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode() + * @param value contains 1..4 bytes of the first byte sequence, right-aligned + * @param codePoints resulting Unicode code points, or negative if a byte sequence does + * not map to anything + * @return TRUE to continue enumeration, FALSE to stop + */ +typedef UBool U_CALLCONV +UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]); /* similar to ucnv_MBCSGetNextUChar() but recursive */ -static void -_getUnicodeSetForBytes(const UConverterSharedData *sharedData, - const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits, - const USetAdder *sa, - UConverterUnicodeSet which, - uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte, - - UErrorCode *pErrorCode) { - int32_t b, entry; +static UBool +enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[], + int32_t state, uint32_t offset, + uint32_t value, + UConverterEnumToUCallback *callback, const void *context, + UErrorCode *pErrorCode) { + UChar32 codePoints[32]; + const int32_t *row; + const uint16_t *unicodeCodeUnits; + UChar32 anyCodePoints; + int32_t b, limit; - for(b=lowByte; b<=highByte; ++b) { - entry=stateTable[state][b]; + row=mbcsTable->stateTable[state]; + unicodeCodeUnits=mbcsTable->unicodeCodeUnits; + + value<<=8; + anyCodePoints=-1; /* becomes non-negative if there is a mapping */ + + b=(stateProps[state]&0x38)<<2; + if(b==0 && stateProps[state]>=0x40) { + /* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */ + codePoints[0]=U_SENTINEL; + b=1; + } + limit=((stateProps[state]&7)+1)<<5; + while(b=0) { + /* recurse to a state with non-ignorable actions */ + if(!enumToU( + mbcsTable, stateProps, nextState, + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), + value|(uint32_t)b, + callback, context, + pErrorCode)) { + return FALSE; + } + } + codePoints[b&0x1f]=U_SENTINEL; } else { UChar32 c; - int32_t rowOffset=offset; - uint8_t 
action; - - c=U_SENTINEL; + int32_t action; /* * An if-else-if chain provides more reliable performance for * the most common cases compared to a switch. */ - action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); + action=MBCS_ENTRY_FINAL_ACTION(entry); if(action==MBCS_STATE_VALID_DIRECT_16) { /* output BMP code point */ c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); } else if(action==MBCS_STATE_VALID_16) { - offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); - c=unicodeCodeUnits[offset]; + int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + c=unicodeCodeUnits[finalOffset]; if(c<0xfffe) { /* output BMP code point */ } else { c=U_SENTINEL; } } else if(action==MBCS_STATE_VALID_16_PAIR) { - offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); - c=unicodeCodeUnits[offset++]; + int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + c=unicodeCodeUnits[finalOffset++]; if(c<0xd800) { /* output BMP code point below 0xd800 */ } else if(c<=0xdbff) { /* output roundtrip or fallback supplementary code point */ - c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); + c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); } else if(c==0xe000) { /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ - c=unicodeCodeUnits[offset]; + c=unicodeCodeUnits[finalOffset]; } else { c=U_SENTINEL; } } else if(action==MBCS_STATE_VALID_DIRECT_20) { /* output supplementary code point */ c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); + } else { + c=U_SENTINEL; } - if(c>=0) { - sa->add(sa->set, c); + codePoints[b&0x1f]=c; + anyCodePoints&=c; + } + if(((++b)&0x1f)==0) { + if(anyCodePoints>=0) { + if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) { + return FALSE; + } + anyCodePoints=-1; } - offset=rowOffset; } } + return TRUE; } /* - * Internal function returning a UnicodeSet for toUnicode() conversion. - * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. 
- * In the future, if we add support for reverse-fallback sets, this function - * needs to be updated, and called for each initial state. - * Does not currently handle extensions. - * Does not empty the set first. + * Only called if stateProps[state]==-1. + * A recursive call may do stateProps[state]|=0x40 if this state is the target of an + * MBCS_STATE_CHANGE_ONLY. */ -U_CFUNC void -ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - uint8_t state, int32_t lowByte, int32_t highByte, - UErrorCode *pErrorCode) { - _getUnicodeSetForBytes( - sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits, - sa, which, - state, 0, lowByte, highByte, - pErrorCode); +static int8_t +getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) { + const int32_t *row; + int32_t min, max, entry, nextState; + + row=stateTable[state]; + stateProps[state]=0; + + /* find first non-ignorable state */ + for(min=0;; ++min) { + entry=row[min]; + nextState=MBCS_ENTRY_STATE(entry); + if(stateProps[nextState]==-1) { + getStateProp(stateTable, stateProps, nextState); + } + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(stateProps[nextState]>=0) { + break; + } + } else if(MBCS_ENTRY_FINAL_ACTION(entry)>5)<<3); + + /* find last non-ignorable state */ + for(max=0xff; min=0) { + break; + } + } else if(MBCS_ENTRY_FINAL_ACTION(entry)>5); + + /* recurse further and collect direct-state information */ + while(min<=max) { + entry=row[min]; + nextState=MBCS_ENTRY_STATE(entry); + if(stateProps[nextState]==-1) { + getStateProp(stateTable, stateProps, nextState); + } + if(MBCS_ENTRY_IS_FINAL(entry)) { + stateProps[nextState]|=0x40; + if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) { + stateProps[state]|=0x40; + } + } + ++min; + } + return stateProps[state]; } -#endif +/* + * Internal function enumerating the toUnicode data of an MBCS converter. 
+ * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U + * table, but could also be used for a future ucnv_getUnicodeSet() option + * that includes reverse fallbacks (after updating this function's implementation). + * Currently only handles roundtrip mappings. + * Does not currently handle extensions. + */ +static void +ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable, + UConverterEnumToUCallback *callback, const void *context, + UErrorCode *pErrorCode) { + /* + * Properties for each state, to speed up the enumeration. + * Ignorable actions are unassigned/illegal/state-change-only: + * They do not lead to mappings. + * + * Bits 7..6: + * 1 direct/initial state (stateful converters have multiple) + * 0 non-initial state with transitions or with non-ignorable result actions + * -1 final state with only ignorable actions + * + * Bits 5..3: + * The lowest byte value with non-ignorable actions is + * value<<5 (rounded down). + * + * Bits 2..0: + * The highest byte value with non-ignorable actions is + * (value<<5)|0x1f (rounded up). 
+ */ + int8_t stateProps[MBCS_MAX_STATE_COUNT]; + int32_t state; + + uprv_memset(stateProps, -1, sizeof(stateProps)); + + /* recurse from state 0 and set all stateProps */ + getStateProp(mbcsTable->stateTable, stateProps, 0); + + for(state=0; statecountStates; ++state) { + /*if(stateProps[state]==-1) { + printf("unused/unreachable %d\n", state); + }*/ + if(stateProps[state]>=0x40) { + /* start from each direct state */ + enumToU( + mbcsTable, stateProps, state, 0, 0, + callback, context, + pErrorCode); + } + } +} U_CFUNC void ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData, @@ -1006,6 +1183,156 @@ _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) { return TRUE; } +/* reconstitute omitted fromUnicode data ------------------------------------ */ + +/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */ +static UBool U_CALLCONV +writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) { + UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context; + const uint16_t *table; + uint32_t *stage2; + uint8_t *bytes, *p; + UChar32 c; + int32_t i, st3; + + table=mbcsTable->fromUnicodeTable; + bytes=(uint8_t *)mbcsTable->fromUnicodeBytes; + + /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3_EUC: + if(value<=0xffff) { + /* short sequences are stored directly */ + /* code set 0 or 1 */ + } else if(value<=0x8effff) { + /* code set 2 */ + value&=0x7fff; + } else /* first byte is 0x8f */ { + /* code set 3 */ + value&=0xff7f; + } + break; + case MBCS_OUTPUT_4_EUC: + if(value<=0xffffff) { + /* short sequences are stored directly */ + /* code set 0 or 1 */ + } else if(value<=0x8effffff) { + /* code set 2 */ + value&=0x7fffff; + } else /* first byte is 0x8f */ { + /* code set 3 */ + value&=0xff7fff; + } + break; + default: + break; + } + + for(i=0; i<=0x1f; ++value, ++i) { + c=codePoints[i]; + 
if(c<0) { + continue; + } + + /* locate the stage 2 & 3 data */ + stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f); + p=bytes; + st3=(int32_t)(uint16_t)*stage2*16+(c&0xf); + + /* write the codepage bytes into stage 3 */ + switch(mbcsTable->outputType) { + case MBCS_OUTPUT_3: + case MBCS_OUTPUT_4_EUC: + p+=st3*3; + p[0]=(uint8_t)(value>>16); + p[1]=(uint8_t)(value>>8); + p[2]=(uint8_t)value; + break; + case MBCS_OUTPUT_4: + ((uint32_t *)p)[st3]=value; + break; + default: + /* 2 bytes per character */ + ((uint16_t *)p)[st3]=(uint16_t)value; + break; + } + + /* set the roundtrip flag */ + *stage2|=(1UL<<(16+(c&0xf))); + } + return TRUE; + } + +static void +reconstituteData(UConverterMBCSTable *mbcsTable, + uint32_t stage1Length, uint32_t stage2Length, + uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */ + UErrorCode *pErrorCode) { + uint16_t *stage1; + uint32_t *stage2; + uint8_t *bytes; + uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength; + mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength); + if(mbcsTable->reconstitutedData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + return; + } + uprv_memset(mbcsTable->reconstitutedData, 0, dataLength); + + /* copy existing data and reroute the pointers */ + stage1=(uint16_t *)mbcsTable->reconstitutedData; + uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2); + + stage2=(uint32_t *)(stage1+stage1Length); + uprv_memcpy(stage2+(fullStage2Length-stage2Length), + mbcsTable->fromUnicodeTable+stage1Length, + stage2Length*4); + + mbcsTable->fromUnicodeTable=stage1; + mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length); + + /* indexes into stage 2 count from the bottom of the fromUnicodeTable */ + stage2=(uint32_t *)stage1; + + /* reconstitute the initial part of stage 2 from the mbcsIndex */ + { + int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6; + int32_t stageUTF8Index=0; + int32_t st1, st2, st3, i; + + 
for(st1=0; stageUTF8IndexmbcsIndex[stageUTF8Index++]; + if(st3!=0) { + /* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ + st3>>=4; + /* + * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are + * allocated together as a single 64-block for access from the mbcsIndex + */ + stage2[st2++]=st3++; + stage2[st2++]=st3++; + stage2[st2++]=st3++; + stage2[st2++]=st3; + } else { + /* no stage 3 block, skip */ + st2+=4; + } + } + } else { + /* no stage 2 block, skip */ + stageUTF8Index+=16; + } + } + } + + /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ + ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode); +} + /* MBCS setup functions ----------------------------------------------------- */ static void @@ -1017,13 +1344,25 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData, UConverterMBCSTable *mbcsTable=&sharedData->mbcs; _MBCSHeader *header=(_MBCSHeader *)raw; uint32_t offset; + uint32_t headerLength; + UBool noFromU=FALSE; - if(header->version[0]!=4) { + if(header->version[0]==4) { + headerLength=MBCS_HEADER_V4_LENGTH; + } else if(header->version[0]==5 && header->version[1]>=3 && + (header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) { + headerLength=header->options&MBCS_OPT_LENGTH_MASK; + noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0); + } else { *pErrorCode=U_INVALID_TABLE_FORMAT; return; } mbcsTable->outputType=(uint8_t)header->flags; + if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + return; + } /* extension data, header version 4.2 and higher */ offset=header->flags>>8; @@ -1051,7 +1390,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData, } /* load the base table */ - baseName=(const char *)(header+1); + baseName=(const char *)header+headerLength*4; if(0==uprv_strcmp(baseName, sharedData->staticData->name)) { /* forbid loading this same extension-only file */ *pErrorCode=U_INVALID_TABLE_FORMAT; @@ -1095,6 +1434,12 
@@ ucnv_MBCSLoad(UConverterSharedData *sharedData, mbcsTable->swapLFNLFromUnicodeBytes=NULL; mbcsTable->swapLFNLName=NULL; + /* + * The reconstitutedData must be deleted only when the base converter + * is unloaded. + */ + mbcsTable->reconstitutedData=NULL; + /* * Set a special, runtime-only outputType if the extension converter * is a DBCS version of a base converter that also maps single bytes. @@ -1187,7 +1532,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData, mbcsTable->countStates=(uint8_t)header->countStates; mbcsTable->countToUFallbacks=header->countToUFallbacks; - mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader)); + mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4); mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates); mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits); @@ -1244,7 +1589,9 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData, * The .cnv file is prebuilt with an additional stage table with indexes * to each block. */ - mbcsTable->mbcsIndex=(const uint16_t *)(mbcsTable->fromUnicodeBytes+mbcsTable->fromUBytesLength); + mbcsTable->mbcsIndex=(const uint16_t *) + (mbcsTable->fromUnicodeBytes+ + (noFromU ? 0 : mbcsTable->fromUBytesLength)); mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff; } } @@ -1261,6 +1608,16 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData, } mbcsTable->asciiRoundtrips=asciiRoundtrips; } + + if(noFromU) { + uint32_t stage1Length= + mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ? + 0x440 : 0x40; + uint32_t stage2Length= + (header->offsetFromUBytes-header->offsetFromUTable)/4- + stage1Length/2; + reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode); + } } /* Set the impl pointer here so that it is set for both extension-only and base tables. 
*/ @@ -1296,6 +1653,9 @@ ucnv_MBCSUnload(UConverterSharedData *sharedData) { if(mbcsTable->baseSharedData!=NULL) { ucnv_unload(mbcsTable->baseSharedData); } + if(mbcsTable->reconstitutedData!=NULL) { + uprv_free(mbcsTable->reconstitutedData); + } } static void diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h index 42f64ee353..9e4f295703 100644 --- a/icu4c/source/common/ucnvmbcs.h +++ b/icu4c/source/common/ucnvmbcs.h @@ -23,6 +23,7 @@ #include "unicode/ucnv.h" #include "ucnv_cnv.h" +#include "ucnv_ext.h" /** * ICU conversion (.cnv) data file structure, following the usual UDataInfo @@ -41,6 +42,24 @@ * the same toUnicode structures, while the fromUnicode structures for SBCS * differ from those for other MBCS-style converters. * + * _MBCSHeader.version 5 is optional and not backward-compatible + * (as usual for changes in the major version field). + * + * Versions 5.m work like versions 4.m except: + * - The _MBCSHeader has variable length (and is always longer than in version 4). + * See the struct _MBCSHeader further description below. + * - There is a set of flags which indicate further incompatible changes. + * (Reader code must reject the file if it does not recognize them all.) + * - In particular, one of these flags indicates that most of the fromUnicode + * data is missing and must be reconstituted from the toUnicode data + * and from the utf8Friendly mbcsIndex at load time. + * (This only works with a utf8Friendly table.) + * In this case, makeconv may increase maxFastUChar automatically to U+FFFF. + * + * The first of these versions is 5.3, which is like 4.3 except for the differences above. + * + * When possible, makeconv continues to generate version 4.m files. + * * _MBCSHeader.version 4.3 optionally modifies the fromUnicode data structures * slightly and optionally adds a table for conversion to MBCS (non-SBCS) * charsets. 
@@ -127,6 +146,26 @@ * 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher * counts bytes in fromUBytes[] * + * New and required in version 5: + * 8 uint32_t options, bits: + * 31..16 reserved for flags that can be added without breaking + * backward compatibility + * 15.. 6 reserved for flags whose addition will break + * backward compatibility + * 6 MBCS_OPT_NO_FROM_U -- if set, + * then most of the fromUnicode data is omitted; + * fullStage2Length is present and the missing + * bottom part of stage 2 must be reconstituted from + * the toUnicode data; + * stage 3 is missing completely as well; + * not used for SBCS tables + * 5.. 0 length of the _MBCSHeader (number of uint32_t) + * + * New and optional in version 5: + * 9 uint32_t fullStage2Length: used if MBCS_OPT_NO_FROM_U is set + * specifies the full length of stage 2 + * including the omitted part + * * if(outputType==MBCS_OUTPUT_EXT_ONLY) { * -- base table name for extension-only table * char baseTableName[variable]; -- with NUL plus padding for 4-alignment @@ -153,7 +192,7 @@ * -- BMP-only tables have a smaller stage 1 table * uint16_t fromUTable[0x40]; (32-bit-aligned) * } - * + * * -- stage 2 tables * length determined by top of stage 1 and bottom of stage 3 tables * if(outputType==MBCS_OUTPUT_1) { @@ -162,17 +201,24 @@ * } else { * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes * uint32_t stage 2 flags and indexes[?]; + * if(options&MBCS_OPT_NO_FROM_U) { + * stage 2 really has length fullStage2Length + * and the omitted lower part must be reconstituted from + * the toUnicode data + * } * } - * + * * -- stage 3 tables with byte results * if(outputType==MBCS_OUTPUT_1) { * -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c * uint16_t fromUBytes[fromUBytesLength/2]; - * } else { + * } else if(!(options&MBCS_OPT_NO_FROM_U)) { * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 
2/3/4 bytes result, see ucnvmbcs.c * uint8_t fromUBytes[fromUBytesLength]; or * uint16_t fromUBytes[fromUBytesLength/2]; or * uint32_t fromUBytes[fromUBytesLength/4]; + * } else { + * fromUBytes[] must be reconstituted from the toUnicode data * } * * -- optional utf8Friendly mbcsIndex -- _MBCSHeader.version 4.3 (ICU 3.8) and higher @@ -340,6 +386,9 @@ typedef struct UConverterMBCSTable { /* roundtrips */ uint32_t asciiRoundtrips; + /* reconstituted data that was omitted from the .cnv file */ + uint8_t *reconstitutedData; + /* converter name for swaplfnl */ char *swapLFNLName; @@ -348,6 +397,26 @@ typedef struct UConverterMBCSTable { const int32_t *extIndexes; } UConverterMBCSTable; +enum { + MBCS_OPT_LENGTH_MASK=0x3f, + MBCS_OPT_NO_FROM_U=0x40, + /* + * If any of the following options bits are set, + * then the file must be rejected. + */ + MBCS_OPT_INCOMPATIBLE_MASK=0xffc0, + /* + * Remove bits from this mask as more options are recognized + * by all implementations that use this constant. + */ + MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80 +}; + +enum { + MBCS_HEADER_V4_LENGTH=8, + MBCS_HEADER_V5_MIN_LENGTH=9 +}; + /** * MBCS data header. See data format description above. */ @@ -360,6 +429,12 @@ typedef struct { offsetFromUBytes, flags, fromUBytesLength; + + /* new and required in version 5 */ + uint32_t options; + + /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */ + uint32_t fullStage2Length; /* number of 32-bit units */ } _MBCSHeader; /* @@ -456,23 +531,6 @@ U_CFUNC void ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode); -#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */ -/* - * Internal function returning a UnicodeSet for toUnicode() conversion. - * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. 
- * In the future, if we add support for reverse-fallback sets, this function - * needs to be updated, and called for each initial state. - * Does not currently handle extensions. - * Does not empty the set first. - */ -U_CFUNC void -ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData, - const USetAdder *sa, - UConverterUnicodeSet which, - uint8_t state, int32_t lowByte, int32_t highByte, - UErrorCode *pErrorCode); -#endif - /* * Internal function returning a UnicodeSet for toUnicode() conversion. * Currently only used for ISO-2022-CN, and only handles roundtrip mappings. @@ -487,16 +545,6 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData, UConverterUnicodeSet which, UErrorCode *pErrorCode); -typedef enum UConverterSetFilter { - UCNV_SET_FILTER_NONE, - UCNV_SET_FILTER_DBCS_ONLY, - UCNV_SET_FILTER_2022_CN, - UCNV_SET_FILTER_SJIS, - UCNV_SET_FILTER_GR94DBCS, - UCNV_SET_FILTER_HZ, - UCNV_SET_FILTER_COUNT -} UConverterSetFilter; - /* * Same as ucnv_MBCSGetUnicodeSetForUnicode() but * the set can be filtered by encoding scheme. 
diff --git a/icu4c/source/test/testdata/Makefile.in b/icu4c/source/test/testdata/Makefile.in index 657c770c97..a99f0f14de 100644 --- a/icu4c/source/test/testdata/Makefile.in +++ b/icu4c/source/test/testdata/Makefile.in @@ -186,7 +186,7 @@ $(TESTBUILDDIR)/nfsmxp.spp: $(BINDIR)/gensprep$(EXEEXT) $(TESTSRCDATADIR)/nfs4_m $(INVOKE) $(BINDIR)/gensprep -s $(TESTSRCDATADIR) $(ICU_DATA_OPT) -d $(TESTBUILDDIR) -b nfsmxp -k -n $(UNICODEDATADIR) -u 3.2.0 nfs4_mixed_prep_p.txt $(TESTBUILDDIR)/%.cnv: $(TESTSRCDATADIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT) - $(INVOKE) $(BINDIR)/makeconv -c -d $(TESTBUILDDIR) $(TESTSRCDATADIR)/$(ucm->baseName[length++]=0; } - headerSize=sizeof(header)+length; + headerSize=MBCS_HEADER_V4_LENGTH*4+length; /* fill the header */ header.version[0]=4; @@ -138,7 +138,7 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); /* write the header and the base table name */ - udata_writeBlock(pData, &header, sizeof(header)); + udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4); udata_writeBlock(pData, extData->ucm->baseName, length); } diff --git a/icu4c/source/tools/makeconv/genmbcs.c b/icu4c/source/tools/makeconv/genmbcs.c index 139ab0109b..6757b7781a 100644 --- a/icu4c/source/tools/makeconv/genmbcs.c +++ b/icu4c/source/tools/makeconv/genmbcs.c @@ -30,7 +30,7 @@ * Reduce tests for maxCharLength. */ -typedef struct MBCSData { +struct MBCSData { NewConverter newConverter; UCMFile *ucm; @@ -48,10 +48,18 @@ typedef struct MBCSData { uint32_t stage2Top, stage3Top; /* fromUTF8 */ - uint16_t stageUTF8[MBCS_UTF8_STAGE_SIZE]; + uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT]; /* allow for utf8Max=0xffff */ + + /* + * Maximum UTF-8-friendly code point. + * 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100. + * If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff. 
+ */ + uint16_t utf8Max; UBool utf8Friendly; -} MBCSData; + UBool omitFromU; +}; /* prototypes */ static void @@ -115,6 +123,29 @@ printBytes(char *buffer, const uint8_t *bytes, int32_t length) { /* implementation ----------------------------------------------------------- */ +static MBCSData gDummy; + +U_CFUNC const MBCSData * +MBCSGetDummy() { + uprv_memset(&gDummy, 0, sizeof(MBCSData)); + + /* + * Set "pessimistic" values which may sometimes move too many + * mappings to the extension table (but never too few). + * These values cause MBCSOkForBaseFromUnicode() to return FALSE for the + * largest set of mappings. + * Assume maxCharLength>1. + */ + gDummy.utf8Friendly=TRUE; + if(SMALL) { + gDummy.utf8Max=0xffff; + gDummy.omitFromU=TRUE; + } else { + gDummy.utf8Max=MBCS_UTF8_MAX; + } + return &gDummy; +} + static void MBCSInit(MBCSData *mbcsData, UCMFile *ucm) { uprv_memset(mbcsData, 0, sizeof(MBCSData)); @@ -680,7 +711,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData, /* inspect stage 1 */ index=c>>MBCS_STAGE_1_SHIFT; - if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) { + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1); } else { nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK; @@ -716,7 +747,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData, /* inspect stage 2 */ index=mbcsData->stage1[index]+nextOffset; - if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) { + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { /* allocate 64-entry blocks for UTF-8-friendly lookup */ blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength; nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK; @@ -761,12 +792,12 @@ MBCSAddFromUnicode(MBCSData *mbcsData, stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[index]; /* Build an alternate, UTF-8-friendly stage table as well. 
*/ - if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) { + if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) { /* Overflow for uint16_t entries in stageUTF8? */ if(stage3Index>0xffff) { /* * This can occur only if the mapping table is nearly perfectly filled and if - * MBCS_UTF8_MAX==0xffff. + * utf8Max==0xffff. * (There is no known charset like this. GB 18030 does not map * surrogate code points and LMBCS does not map 256 PUA code points.) * @@ -776,20 +807,20 @@ MBCSAddFromUnicode(MBCSData *mbcsData, * mappings with 0<=c0x%s\n", - (int)c, printBytes(buffer, bytes, length)); - return FALSE; + mbcsData->utf8Max=0xfeff; + } else { + /* + * The stage 3 block has been assigned for the regular trie. + * Just copy its index into stageUTF8[], without the granularity. + */ + mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index; } - - /* - * The stage 3 block has been assigned for the regular trie. - * Just copy its index into stageUTF8[], without the granularity. - */ - mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index; } /* write the codepage bytes into stage 3 and get the previous bytes */ @@ -856,7 +887,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData, } U_CFUNC UBool -MBCSOkForBaseFromUnicode(UBool utf8Friendly, +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, const uint8_t *bytes, int32_t length, UChar32 c, int8_t flag) { /* @@ -883,7 +914,16 @@ MBCSOkForBaseFromUnicode(UBool utf8Friendly, * - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry) * - any |1 fallback (no roundtrip flags in the optimized table) */ - if(utf8Friendly && flag<=1 && c<=MBCS_UTF8_MAX && (bytes[0]==0 || flag==1)) { + if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) { + return FALSE; + } + + /* + * If we omit the fromUnicode data, we can only store roundtrips there + * because only they are recoverable from the toUnicode data. + * Fallbacks must go into the extension table. 
+ */ + if(mbcsData->omitFromU && flag!=0) { return FALSE; } @@ -918,6 +958,18 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati * indicators are used. */ mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0); + if(utf8Friendly) { + mbcsData->utf8Max=MBCS_UTF8_MAX; + if(SMALL && maxCharLength>1) { + mbcsData->omitFromU=TRUE; + } + } else { + mbcsData->utf8Max=0; + if(SMALL && maxCharLength>1) { + fprintf(stderr, + "makeconv warning: --small not available for .ucm files without |0 etc.\n"); + } + } if(!MBCSStartMappings(mbcsData)) { return FALSE; @@ -933,6 +985,28 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati c=m->u; f=m->f; + /* + * Small optimization for --small .cnv files: + * + * If there are fromUnicode mappings above MBCS_UTF8_MAX, + * then the file size will be smaller if we make utf8Max larger + * because the size increase in stageUTF8 will be more than balanced by + * how much less of stage2 needs to be stored. + * + * There is no point in doing this incrementally because stageUTF8 + * uses so much less space per block than stage2, + * so we immediately increase utf8Max to 0xffff. + * + * Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode() + * sets it to that value when stageUTF8 overflows. 
+ */ + if( mbcsData->omitFromU && f<=1 && + mbcsData->utf8Maxutf8Max<0xfeff + ) { + mbcsData->utf8Max=0xffff; + } + switch(f) { case -1: /* there was no precision/fallback indicator */ @@ -943,7 +1017,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati if(maxCharLength==1) { isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); - } else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) { + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); } else { m->f|=MBCS_FROM_U_EXT_FLAG; @@ -955,7 +1029,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati if(maxCharLength==1) { staticData->hasFromUnicodeFallback=TRUE; isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); - } else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) { + } else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) { staticData->hasFromUnicodeFallback=TRUE; isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f); } else { @@ -965,7 +1039,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati break; case 2: /* ignore |2 SUB mappings, except to move mappings to the extension table */ - if(maxCharLength>1 && !MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) { + if(maxCharLength>1 && m->bLen==1) { m->f|=MBCS_FROM_U_EXT_FLAG; m->moveFlag=UCM_MOVE_TO_EXT; } @@ -1329,24 +1403,56 @@ static uint32_t MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData, int32_t tableType) { MBCSData *mbcsData=(MBCSData *)cnvData; + uint32_t stage2Start, stage2Length; uint32_t top, stageUTF8Length=0; int32_t i, stage1Top; + uint32_t headerLength; _MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 }; + stage2Length=mbcsData->stage2Top; + if(mbcsData->omitFromU) { + /* find how much of stage2 can 
be omitted */ + int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1; + uint32_t st2; + + i=utf8Limit>>MBCS_STAGE_1_SHIFT; + if((utf8Limit&((1<stage1[i])!=0) { + /* utf8Limit is in the middle of an existing stage 2 block */ + stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK); + } else { + /* find the last stage2 block with mappings before utf8Limit */ + while(i>0 && (st2=mbcsData->stage1[--i])==0) {} + /* stage2 up to the end of this block corresponds to stageUTF8 */ + stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE; + } + header.options|=MBCS_OPT_NO_FROM_U; + header.fullStage2Length=stage2Length; + stage2Length-=stage2Start; + if(VERBOSE) { + printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n", + stage2Start, mbcsData->stage2Top, mbcsData->stage3Top); + printf("+ total size savings: %lu bytes\n", stage2Start*4+mbcsData->stage3Top); + } + } else { + stage2Start=0; + } + + if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ + } else { + stage1Top=0x40; /* 0x40==64 */ + } + /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */ if(mbcsData->ucm->states.maxCharLength==1) { - if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { - stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ - } else { - stage1Top=0x40; /* 0x40==64 */ - } for(i=0; istage1[i]+=(uint16_t)stage1Top; } - /* stage2Top has counted 16-bit results, now we need to count bytes */ - mbcsData->stage2Top*=2; + /* stage2Top/Length have counted 16-bit results, now we need to count bytes */ + /* also round up to a multiple of 4 bytes */ + stage2Length=(stage2Length*2+1)&~1; /* stage3Top has counted 16-bit results, now we need to count bytes */ mbcsData->stage3Top*=2; @@ -1355,40 +1461,47 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */ } } else { - if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { - 
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ - } else { - stage1Top=0x40; /* 0x40==64 */ - } for(i=0; istage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */ } - /* stage2Top has counted 32-bit results, now we need to count bytes */ - mbcsData->stage2Top*=4; + /* stage2Top/Length have counted 32-bit results, now we need to count bytes */ + stage2Length*=4; + /* leave stage2Start counting 32-bit units */ if(mbcsData->utf8Friendly) { - stageUTF8Length=MBCS_UTF8_STAGE_SIZE; - header.version[2]=(uint8_t)(MBCS_UTF8_MAX>>8); /* store 0xd7 for max==0xd7ff */ + stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT; + header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */ } /* stage3Top has already counted bytes */ } - /* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */ - mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3; + /* round up stage3Top so that the sizes of all data blocks are multiples of 4 */ mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3; /* fill the header */ - header.version[0]=4; + if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) { + header.version[0]=5; + if(header.options&MBCS_OPT_NO_FROM_U) { + headerLength=10; /* include fullStage2Length */ + } else { + headerLength=MBCS_HEADER_V5_MIN_LENGTH; /* 9 */ + } + } else { + header.version[0]=4; + headerLength=MBCS_HEADER_V4_LENGTH; /* 8 */ + } header.version[1]=3; /* header.version[2] set above for utf8Friendly data */ + header.options|=(uint32_t)headerLength; + header.countStates=mbcsData->ucm->states.countStates; header.countToUFallbacks=mbcsData->countToUFallbacks; header.offsetToUCodeUnits= - sizeof(_MBCSHeader)+ + headerLength*4+ mbcsData->ucm->states.countStates*1024+ mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback); header.offsetFromUTable= @@ -1397,10 +1510,13 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, header.offsetFromUBytes= header.offsetFromUTable+ 
stage1Top*2+ - mbcsData->stage2Top; + stage2Length; header.fromUBytesLength=mbcsData->stage3Top; - top=header.offsetFromUBytes+header.fromUBytesLength+stageUTF8Length*2; + top=header.offsetFromUBytes+stageUTF8Length*2; + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + top+=header.fromUBytesLength; + } header.flags=(uint8_t)(mbcsData->ucm->states.outputType); @@ -1414,17 +1530,19 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, } /* write the MBCS data */ - udata_writeBlock(pData, &header, sizeof(_MBCSHeader)); + udata_writeBlock(pData, &header, headerLength*4); udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024); udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback)); udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2); udata_writeBlock(pData, mbcsData->stage1, stage1Top*2); if(mbcsData->ucm->states.maxCharLength==1) { - udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top); + udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length); } else { - udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top); + udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length); + } + if(!(header.options&MBCS_OPT_NO_FROM_U)) { + udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); } - udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); if(stageUTF8Length>0) { udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2); diff --git a/icu4c/source/tools/makeconv/genmbcs.h b/icu4c/source/tools/makeconv/genmbcs.h index 60f52e3ddb..cb0cc5e6eb 100644 --- a/icu4c/source/tools/makeconv/genmbcs.h +++ b/icu4c/source/tools/makeconv/genmbcs.h @@ -101,9 +101,20 @@ enum { U_CFUNC NewConverter * MBCSOpen(UCMFile *ucm); +struct MBCSData; +typedef struct MBCSData MBCSData; + +/* + * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode() + * for creating an extension-only 
file. + * Assume maxCharLength>1. + */ +U_CFUNC const MBCSData * +MBCSGetDummy(); + /* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */ U_CFUNC UBool -MBCSOkForBaseFromUnicode(UBool utf8Friendly, +MBCSOkForBaseFromUnicode(const MBCSData *mbcsData, const uint8_t *bytes, int32_t length, UChar32 c, int8_t flag); diff --git a/icu4c/source/tools/makeconv/makeconv.c b/icu4c/source/tools/makeconv/makeconv.c index d5aeafadc6..7e62c86809 100644 --- a/icu4c/source/tools/makeconv/makeconv.c +++ b/icu4c/source/tools/makeconv/makeconv.c @@ -34,6 +34,8 @@ #include "makeconv.h" #include "genmbcs.h" +#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) + #define DEBUG 0 typedef struct ConvData { @@ -76,6 +78,7 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP * Global - verbosity */ UBool VERBOSE = FALSE; +UBool SMALL = FALSE; static void createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); @@ -163,13 +166,25 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr } } +enum { + OPT_HELP_H, + OPT_HELP_QUESTION_MARK, + OPT_COPYRIGHT, + OPT_VERSION, + OPT_DESTDIR, + OPT_VERBOSE, + OPT_SMALL, + OPT_COUNT +}; + static UOption options[]={ - UOPTION_HELP_H, /* 0 Numbers for those who*/ - UOPTION_HELP_QUESTION_MARK, /* 1 can't count. 
*/ - UOPTION_COPYRIGHT, /* 2 */ - UOPTION_VERSION, /* 3 */ - UOPTION_DESTDIR, /* 4 */ - UOPTION_VERBOSE, /* 5 */ + UOPTION_HELP_H, + UOPTION_HELP_QUESTION_MARK, + UOPTION_COPYRIGHT, + UOPTION_VERSION, + UOPTION_DESTDIR, + UOPTION_VERBOSE, + { "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 } }; int main(int argc, char* argv[]) @@ -194,8 +209,8 @@ int main(int argc, char* argv[]) uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo)); /* preset then read command line options */ - options[4].value=u_getDataDirectory(); - argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); + options[OPT_DESTDIR].value=u_getDataDirectory(); + argc=u_parseArgs(argc, argv, LENGTHOF(options), options); /* error handling, printing usage message */ if(argc<0) { @@ -205,8 +220,9 @@ int main(int argc, char* argv[]) } else if(argc<2) { argc=-1; } - if(argc<0 || options[0].doesOccur || options[1].doesOccur) { - fprintf(stderr, + if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) { + FILE *stdfile=argc<0 ? stderr : stdout; + fprintf(stdfile, "usage: %s [-options] files...\n" "\tread .ucm codepage mapping files and write .cnv files\n" "options:\n" @@ -216,20 +232,26 @@ int main(int argc, char* argv[]) "\t-d or --destdir destination directory, followed by the path\n" "\t-v or --verbose Turn on verbose output\n", argv[0]); + fprintf(stdfile, + "\t --small Generate smaller .cnv files. They will be\n" + "\t significantly smaller but may not be compatible with\n" + "\t older versions of ICU and will require heap memory\n" + "\t allocation when loaded.\n"); return argc<0 ? 
U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } - if(options[3].doesOccur) { - fprintf(stderr,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n", - dataInfo.formatVersion[0], dataInfo.formatVersion[1]); - fprintf(stderr, U_COPYRIGHT_STRING "\n"); + if(options[OPT_VERSION].doesOccur) { + printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n", + dataInfo.formatVersion[0], dataInfo.formatVersion[1]); + printf("%s\n", U_COPYRIGHT_STRING); exit(0); } /* get the options values */ - haveCopyright = options[2].doesOccur; - destdir = options[4].value; - VERBOSE = options[5].doesOccur; + haveCopyright = options[OPT_COPYRIGHT].doesOccur; + destdir = options[OPT_DESTDIR].value; + VERBOSE = options[OPT_VERBOSE].doesOccur; + SMALL = options[OPT_SMALL].doesOccur; if (destdir != NULL && *destdir != 0) { uprv_strcpy(outFileName, destdir); @@ -766,12 +788,13 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod * * Do this after ucm_checkBaseExt(). */ + const MBCSData *mbcsData=MBCSGetDummy(); int32_t needsMove=0; for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength; mb.bytes, m->bLen, m->u, m->f)) { + if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) { m->f|=MBCS_FROM_U_EXT_FLAG; m->moveFlag=UCM_MOVE_TO_EXT; ++needsMove; diff --git a/icu4c/source/tools/makeconv/makeconv.h b/icu4c/source/tools/makeconv/makeconv.h index 0fa0fb2d1d..a3c2d375a1 100644 --- a/icu4c/source/tools/makeconv/makeconv.h +++ b/icu4c/source/tools/makeconv/makeconv.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2006, International Business Machines +* Copyright (C) 2000-2007, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -24,6 +24,7 @@ /* exports from makeconv.c */ U_CFUNC UBool VERBOSE; +U_CFUNC UBool SMALL; /* converter table type for writing */ enum { diff --git a/icu4c/source/tools/toolutil/pkgitems.cpp b/icu4c/source/tools/toolutil/pkgitems.cpp index 6a93769c0d..2a8f01289a 100644 --- a/icu4c/source/tools/toolutil/pkgitems.cpp +++ b/icu4c/source/tools/toolutil/pkgitems.cpp @@ -497,7 +497,7 @@ ucnv_enumDependencies(const UDataSwapper *ds, /* check for supported conversionType values */ if(inStaticData->conversionType==UCNV_MBCS) { /* MBCS data */ - uint32_t mbcsHeaderFlags; + uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions; int32_t extOffset; inMBCSHeader=(const _MBCSHeader *)inBytes; @@ -508,7 +508,14 @@ ucnv_enumDependencies(const UDataSwapper *ds, *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; return; } - if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) { + if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) { + mbcsHeaderLength=MBCS_HEADER_V4_LENGTH; + } else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 && + ((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))& + MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0 + ) { + mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK; + } else { udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n", inMBCSHeader->version[0], inMBCSHeader->version[1]); *pErrorCode=U_UNSUPPORTED_ERROR; @@ -536,14 +543,15 @@ ucnv_enumDependencies(const UDataSwapper *ds, } /* swap the base name, between the header and the extension data */ - baseNameLength=(int32_t)strlen((const char *)(inMBCSHeader+1)); + const char *inBaseName=(const char *)inBytes+mbcsHeaderLength*4; + baseNameLength=(int32_t)strlen(inBaseName); if(baseNameLength>=(int32_t)sizeof(baseName)) { udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n", itemName, baseNameLength); 
*pErrorCode=U_UNSUPPORTED_ERROR; return; } - ds->swapInvChars(ds, inMBCSHeader+1, baseNameLength+1, baseName, pErrorCode); + ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode); checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode); }