ICU-5987 merge small-conversion-file feature into trunk, from svn merge -r 22780:22805 .../branches/markus/smallcnv

X-SVN-Rev: 22852
This commit is contained in:
Markus Scherer 2007-10-25 17:05:36 +00:00
parent b69ac49696
commit 295dc24d64
12 changed files with 784 additions and 176 deletions

View File

@ -1261,6 +1261,9 @@ ucnv_swap(const UDataSwapper *ds,
const _MBCSHeader *inMBCSHeader;
_MBCSHeader *outMBCSHeader;
_MBCSHeader mbcsHeader;
uint32_t mbcsHeaderLength;
UBool noFromU=FALSE;
uint8_t outputType;
int32_t maxFastUChar, mbcsIndexLength;
@ -1350,7 +1353,15 @@ ucnv_swap(const UDataSwapper *ds,
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0;
}
if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) {
if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) {
mbcsHeaderLength=MBCS_HEADER_V4_LENGTH;
} else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 &&
((mbcsHeader.options=ds->readUInt32(inMBCSHeader->options))&
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0
) {
mbcsHeaderLength=mbcsHeader.options&MBCS_OPT_LENGTH_MASK;
noFromU=(UBool)((mbcsHeader.options&MBCS_OPT_NO_FROM_U)!=0);
} else {
udata_printError(ds, "ucnv_swap(): unsupported _MBCSHeader.version %d.%d\n",
inMBCSHeader->version[0], inMBCSHeader->version[1]);
*pErrorCode=U_UNSUPPORTED_ERROR;
@ -1365,9 +1376,15 @@ ucnv_swap(const UDataSwapper *ds,
mbcsHeader.offsetFromUBytes= ds->readUInt32(inMBCSHeader->offsetFromUBytes);
mbcsHeader.flags= ds->readUInt32(inMBCSHeader->flags);
mbcsHeader.fromUBytesLength= ds->readUInt32(inMBCSHeader->fromUBytesLength);
/* mbcsHeader.options have been read above */
extOffset=(int32_t)(mbcsHeader.flags>>8);
outputType=(uint8_t)mbcsHeader.flags;
if(noFromU && outputType==MBCS_OUTPUT_1) {
udata_printError(ds, "ucnv_swap(): unsupported combination of makeconv --small with SBCS\n");
*pErrorCode=U_UNSUPPORTED_ERROR;
return 0;
}
/* make sure that the output type is known */
switch(outputType) {
@ -1406,7 +1423,10 @@ ucnv_swap(const UDataSwapper *ds,
}
if(extOffset==0) {
size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsHeader.fromUBytesLength+mbcsIndexLength);
size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsIndexLength);
if(!noFromU) {
size+=(int32_t)mbcsHeader.fromUBytesLength;
}
/* avoid compiler warnings - not otherwise necessary, and the value does not matter */
inExtIndexes=NULL;
@ -1436,8 +1456,9 @@ ucnv_swap(const UDataSwapper *ds,
uprv_memcpy(outBytes, inBytes, size);
}
/* swap the MBCSHeader */
ds->swapArray32(ds, &inMBCSHeader->countStates, 7*4,
/* swap the MBCSHeader, except for the version field */
count=mbcsHeaderLength*4;
ds->swapArray32(ds, &inMBCSHeader->countStates, count-4,
&outMBCSHeader->countStates, pErrorCode);
if(outputType==MBCS_OUTPUT_EXT_ONLY) {
@ -1447,18 +1468,23 @@ ucnv_swap(const UDataSwapper *ds,
*/
/* swap the base name, between the header and the extension data */
ds->swapInvChars(ds, inMBCSHeader+1, (int32_t)uprv_strlen((const char *)(inMBCSHeader+1)),
outMBCSHeader+1, pErrorCode);
const char *inBaseName=(const char *)inBytes+count;
char *outBaseName=(char *)outBytes+count;
ds->swapInvChars(ds, inBaseName, (int32_t)uprv_strlen(inBaseName),
outBaseName, pErrorCode);
} else {
/* normal file with base table data */
/* swap the state table, 1kB per state */
ds->swapArray32(ds, inMBCSHeader+1, (int32_t)(mbcsHeader.countStates*1024),
outMBCSHeader+1, pErrorCode);
offset=count;
count=mbcsHeader.countStates*1024;
ds->swapArray32(ds, inBytes+offset, (int32_t)count,
outBytes+offset, pErrorCode);
/* swap the toUFallbacks[] */
offset=sizeof(_MBCSHeader)+mbcsHeader.countStates*1024;
ds->swapArray32(ds, inBytes+offset, (int32_t)(mbcsHeader.countToUFallbacks*8),
offset+=count;
count=mbcsHeader.countToUFallbacks*8;
ds->swapArray32(ds, inBytes+offset, (int32_t)count,
outBytes+offset, pErrorCode);
/* swap the unicodeCodeUnits[] */
@ -1495,7 +1521,7 @@ ucnv_swap(const UDataSwapper *ds,
/* stage 3/result bytes: sometimes uint16_t[] or uint32_t[] */
offset=mbcsHeader.offsetFromUBytes;
count=mbcsHeader.fromUBytesLength;
count= noFromU ? 0 : mbcsHeader.fromUBytesLength;
switch(outputType) {
case MBCS_OUTPUT_2:
case MBCS_OUTPUT_3_EUC:

View File

@ -175,6 +175,19 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
int32_t *pBufferSize,
UErrorCode *status);
/**
* Filters for some ucnv_getUnicodeSet() implementation code.
*/
typedef enum UConverterSetFilter {
/* The enumerators take the default consecutive values starting at 0. */
UCNV_SET_FILTER_NONE,
UCNV_SET_FILTER_DBCS_ONLY,
UCNV_SET_FILTER_2022_CN,
UCNV_SET_FILTER_SJIS,
UCNV_SET_FILTER_GR94DBCS,
UCNV_SET_FILTER_HZ,
/* Not a filter: as the last enumerator, its value equals the number of filters above. */
UCNV_SET_FILTER_COUNT
} UConverterSetFilter;
/**
* Fills the set of Unicode code points that can be converted by an ICU converter.
* The API function ucnv_getUnicodeSet() clears the USet before calling

View File

@ -61,9 +61,47 @@
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
/*
* _MBCSHeader version 4.3
* _MBCSHeader versions 5.3 & 4.3
* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
*
* This version is optional. Version 5 is used for incompatible data format changes.
* makeconv will continue to generate version 4 files if possible.
*
* Changes from version 4:
*
* The main difference is an additional _MBCSHeader field with
* - the length (number of uint32_t) of the _MBCSHeader
* - flags for further incompatible data format changes
* - flags for further, backward compatible data format changes
*
* The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
* the file and needs to be reconstituted at load time.
* This requires a utf8Friendly format with an additional mbcsIndex table for fast
* (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
* (For details about these structures see below, and see ucnvmbcs.h.)
*
* utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
* of the Unicode code points. (This requires that the .ucm file has the |0 etc.
* precision markers for all mappings.)
*
* All fallbacks have been moved to the extension table, leaving only roundtrips in the
* omitted data that can be reconstituted from the toUnicode data.
*
* Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
* With only roundtrip mappings in the base fromUnicode data, this part is fully
* redundant with the mbcsIndex and will be reconstituted from that (also using the
* stage 1 table which contains the information about how stage 2 was compacted).
*
* The rest of the stage 2 table, the part for code points above maxFastUChar,
* is stored in the file and will be appended to the reconstituted part.
*
* The entire fromUBytes array is omitted from the file and will be reconstituted.
* This is done by enumerating all toUnicode roundtrip mappings, performing
* each mapping (using the stage 1 and reconstituted stage 2 tables) and
* writing instead of reading the byte values.
*
* _MBCSHeader version 4.3
*
* Change from version 4.2:
* - Optional utf8Friendly data structures, with 64-entry stage 3 block
* allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
@ -362,101 +400,240 @@ gb18030Ranges[13][4]={
/* Miscellaneous ------------------------------------------------------------ */
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
/**
* Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
* consecutive sequences of bytes, starting from the one encoded in value,
* to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
* Does not currently support m:n mappings or reverse fallbacks.
* This function will not be called for sequences of bytes with leading zeros.
*
* @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
* @param value contains 1..4 bytes of the first byte sequence, right-aligned
* @param codePoints resulting Unicode code points, or negative if a byte sequence does
* not map to anything
* @return TRUE to continue enumeration, FALSE to stop
*/
typedef UBool U_CALLCONV
UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
/* similar to ucnv_MBCSGetNextUChar() but recursive */
static void
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode) {
int32_t b, entry;
static UBool
enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
int32_t state, uint32_t offset,
uint32_t value,
UConverterEnumToUCallback *callback, const void *context,
UErrorCode *pErrorCode) {
UChar32 codePoints[32];
const int32_t *row;
const uint16_t *unicodeCodeUnits;
UChar32 anyCodePoints;
int32_t b, limit;
for(b=lowByte; b<=highByte; ++b) {
entry=stateTable[state][b];
row=mbcsTable->stateTable[state];
unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
value<<=8;
anyCodePoints=-1; /* becomes non-negative if there is a mapping */
b=(stateProps[state]&0x38)<<2;
if(b==0 && stateProps[state]>=0x40) {
/* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
codePoints[0]=U_SENTINEL;
b=1;
}
limit=((stateProps[state]&7)+1)<<5;
while(b<limit) {
int32_t entry=row[b];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
_getUnicodeSetForBytes(
sharedData, stateTable, unicodeCodeUnits,
sa, which,
(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
0, 0xff,
pErrorCode);
int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
if(stateProps[nextState]>=0) {
/* recurse to a state with non-ignorable actions */
if(!enumToU(
mbcsTable, stateProps, nextState,
offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
value|(uint32_t)b,
callback, context,
pErrorCode)) {
return FALSE;
}
}
codePoints[b&0x1f]=U_SENTINEL;
} else {
UChar32 c;
int32_t rowOffset=offset;
uint8_t action;
c=U_SENTINEL;
int32_t action;
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
action=MBCS_ENTRY_FINAL_ACTION(entry);
if(action==MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
} else if(action==MBCS_STATE_VALID_16) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset];
int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[finalOffset];
if(c<0xfffe) {
/* output BMP code point */
} else {
c=U_SENTINEL;
}
} else if(action==MBCS_STATE_VALID_16_PAIR) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset++];
int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[finalOffset++];
if(c<0xd800) {
/* output BMP code point below 0xd800 */
} else if(c<=0xdbff) {
/* output roundtrip or fallback supplementary code point */
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
} else if(c==0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
c=unicodeCodeUnits[offset];
c=unicodeCodeUnits[finalOffset];
} else {
c=U_SENTINEL;
}
} else if(action==MBCS_STATE_VALID_DIRECT_20) {
/* output supplementary code point */
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
} else {
c=U_SENTINEL;
}
if(c>=0) {
sa->add(sa->set, c);
codePoints[b&0x1f]=c;
anyCodePoints&=c;
}
if(((++b)&0x1f)==0) {
if(anyCodePoints>=0) {
if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
return FALSE;
}
anyCodePoints=-1;
}
offset=rowOffset;
}
}
return TRUE;
}
/*
* Internal function returning a UnicodeSet for toUnicode() conversion.
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
* In the future, if we add support for reverse-fallback sets, this function
* needs to be updated, and called for each initial state.
* Does not currently handle extensions.
* Does not empty the set first.
* Only called if stateProps[state]==-1.
* A recursive call may do stateProps[state]|=0x40 if this state is the target of an
* MBCS_STATE_CHANGE_ONLY.
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode) {
_getUnicodeSetForBytes(
sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits,
sa, which,
state, 0, lowByte, highByte,
pErrorCode);
/*
 * Computes and stores stateProps[state] by scanning the 256 entries of the
 * state's row, and returns the stored value.
 * Recurses into every state referenced by a scanned entry whose property is still -1.
 *
 * Resulting layout of stateProps[state], as written by this function:
 * - -0x40 if no byte in the row has a non-ignorable entry
 * - otherwise bits 5..3 = (lowest non-ignorable byte)>>5,
 *   bits 2..0 = (highest non-ignorable byte)>>5,
 *   and 0x40 is OR-ed in for states that are marked as direct states below
 *
 * An entry counts as non-ignorable if it is a transition to a state whose
 * property is >=0, or a final entry whose action is less than MBCS_STATE_UNASSIGNED.
 */
static int8_t
getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
const int32_t *row;
int32_t min, max, entry, nextState;
row=stateTable[state];
/* no longer -1: the recursion checks below will not enter this state again */
stateProps[state]=0;
/* find first non-ignorable state */
for(min=0;; ++min) {
entry=row[min];
nextState=MBCS_ENTRY_STATE(entry);
if(stateProps[nextState]==-1) {
getStateProp(stateTable, stateProps, nextState);
}
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
if(stateProps[nextState]>=0) {
break;
}
} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
break;
}
if(min==0xff) {
/* all 256 entries are ignorable */
stateProps[state]=-0x40; /* (int8_t)0xc0 */
return stateProps[state];
}
}
/* bits 5..3: index of the 32-byte group that contains the first non-ignorable byte */
stateProps[state]|=(int8_t)((min>>5)<<3);
/* find last non-ignorable state */
for(max=0xff; min<max; --max) {
entry=row[max];
nextState=MBCS_ENTRY_STATE(entry);
if(stateProps[nextState]==-1) {
getStateProp(stateTable, stateProps, nextState);
}
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
if(stateProps[nextState]>=0) {
break;
}
} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
break;
}
}
/* bits 2..0: index of the 32-byte group that contains the last non-ignorable byte */
stateProps[state]|=(int8_t)(max>>5);
/* recurse further and collect direct-state information */
while(min<=max) {
entry=row[min];
nextState=MBCS_ENTRY_STATE(entry);
if(stateProps[nextState]==-1) {
getStateProp(stateTable, stateProps, nextState);
}
if(MBCS_ENTRY_IS_FINAL(entry)) {
/* the state named by a final entry is flagged with 0x40 (direct state) */
stateProps[nextState]|=0x40;
/* a final entry with a direct action up to FALLBACK_DIRECT_20 flags this state too */
if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
stateProps[state]|=0x40;
}
}
++min;
}
return stateProps[state];
}
#endif
/*
 * Internal function enumerating the toUnicode data of an MBCS converter.
 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
 * table, but could also be used for a future ucnv_getUnicodeSet() option
 * that includes reverse fallbacks (after updating this function's implementation).
 * Currently only handles roundtrip mappings.
 * Does not currently handle extensions.
 *
 * The enumeration starts once from every direct state and ends early,
 * without visiting further direct states, as soon as the callback returns FALSE.
 *
 * @param mbcsTable  table whose stateTable and countStates are read here
 * @param callback   receives the mappings, 32 consecutive byte sequences at a time
 * @param context    opaque pointer that is passed through to the callback
 * @param pErrorCode passed through to enumToU(); not otherwise used here
 */
static void
ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
                       UConverterEnumToUCallback *callback, const void *context,
                       UErrorCode *pErrorCode) {
    /*
     * Properties for each state, to speed up the enumeration.
     * Ignorable actions are unassigned/illegal/state-change-only:
     * They do not lead to mappings.
     *
     * Bits 7..6:
     * 1 direct/initial state (stateful converters have multiple)
     * 0 non-initial state with transitions or with non-ignorable result actions
     * -1 final state with only ignorable actions
     *
     * Bits 5..3:
     * The lowest byte value with non-ignorable actions is
     * value<<5 (rounded down).
     *
     * Bits 2..0:
     * The highest byte value with non-ignorable actions is
     * (value<<5)|0x1f (rounded up).
     */
    int8_t stateProps[MBCS_MAX_STATE_COUNT];
    int32_t state;

    /* -1 in every byte marks each state as "not yet examined" */
    uprv_memset(stateProps, -1, sizeof(stateProps));

    /* recurse from state 0 and set all stateProps */
    getStateProp(mbcsTable->stateTable, stateProps, 0);

    for(state=0; state<mbcsTable->countStates; ++state) {
        /*if(stateProps[state]==-1) {
            printf("unused/unreachable <icu:state> %d\n", state);
        }*/
        if(stateProps[state]>=0x40) {
            /* start from each direct state */
            if(!enumToU(
                    mbcsTable, stateProps, state, 0, 0,
                    callback, context,
                    pErrorCode)) {
                /* the callback returned FALSE: stop the whole enumeration */
                return;
            }
        }
    }
}
U_CFUNC void
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
@ -1006,6 +1183,156 @@ _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
return TRUE;
}
/* reconstitute omitted fromUnicode data ------------------------------------ */
/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
/*
 * Enumeration callback that stores roundtrip mappings into the fromUnicode tables.
 *
 * For each of the 32 consecutive byte sequences value..value+31 whose entry in
 * codePoints[] is non-negative, the codepage bytes are written into the stage 3
 * slot for that code point and the matching roundtrip flag is set in stage 2.
 * For the EUC output types, value is first reduced to its stored form.
 *
 * @param context    the UConverterMBCSTable to write into
 * @param value      first byte sequence of the group, right-aligned
 * @param codePoints code points for the 32 byte sequences; negative means no mapping
 * @return always TRUE (continue the enumeration)
 */
static UBool U_CALLCONV
writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
    UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
    const uint16_t *stage1=mbcsTable->fromUnicodeTable;
    uint8_t *stage3=(uint8_t *)mbcsTable->fromUnicodeBytes;
    int32_t slot;

    /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
    if(mbcsTable->outputType==MBCS_OUTPUT_3_EUC) {
        /* sequences up to 0xffff (code set 0 or 1) are stored directly */
        if(value>0xffff) {
            if(value<=0x8effff) {
                value&=0x7fff;      /* code set 2 */
            } else {
                value&=0xff7f;      /* code set 3, first byte is 0x8f */
            }
        }
    } else if(mbcsTable->outputType==MBCS_OUTPUT_4_EUC) {
        /* sequences up to 0xffffff (code set 0 or 1) are stored directly */
        if(value>0xffffff) {
            if(value<=0x8effffff) {
                value&=0x7fffff;    /* code set 2 */
            } else {
                value&=0xff7fff;    /* code set 3, first byte is 0x8f */
            }
        }
    }

    /* value advances together with the slot, also for slots without a mapping */
    for(slot=0; slot<32; ++slot, ++value) {
        UChar32 c=codePoints[slot];
        uint32_t *stage2Entry;
        int32_t stage3Index, lowNibble;

        if(c>=0) {
            lowNibble=c&0xf;

            /* locate the stage 2 & 3 data */
            stage2Entry=((uint32_t *)stage1)+stage1[c>>10]+((c>>4)&0x3f);
            stage3Index=(int32_t)(uint16_t)*stage2Entry*16+lowNibble;

            /* write the codepage bytes into stage 3 */
            if( mbcsTable->outputType==MBCS_OUTPUT_3 ||
                mbcsTable->outputType==MBCS_OUTPUT_4_EUC
            ) {
                /* 3 bytes per character */
                uint8_t *dest=stage3+stage3Index*3;
                dest[0]=(uint8_t)(value>>16);
                dest[1]=(uint8_t)(value>>8);
                dest[2]=(uint8_t)value;
            } else if(mbcsTable->outputType==MBCS_OUTPUT_4) {
                /* 4 bytes per character */
                ((uint32_t *)stage3)[stage3Index]=value;
            } else {
                /* 2 bytes per character */
                ((uint16_t *)stage3)[stage3Index]=(uint16_t)value;
            }

            /* set the roundtrip flag */
            *stage2Entry|=(1UL<<(16+lowNibble));
        }
    }
    return TRUE;
}
/*
 * Rebuilds the fromUnicode data that is not stored in the file:
 * allocates one buffer for stage 1, the full stage 2 and the stage 3 bytes,
 * copies the stored stage 1 and the stored (upper) part of stage 2 into it,
 * fills the lower part of stage 2 from the mbcsIndex, and finally writes the
 * stage 3 bytes and roundtrip flags by enumerating the toUnicode mappings.
 *
 * On success mbcsTable->fromUnicodeTable and mbcsTable->fromUnicodeBytes point
 * into mbcsTable->reconstitutedData, which the caller's unload code must free.
 *
 * @param mbcsTable        table to update
 * @param stage1Length     number of 16-bit units in stage 1
 * @param stage2Length     number of 32-bit units of stage 2 that are stored
 * @param fullStage2Length number of 32-bit units of the complete stage 2
 * @param pErrorCode       set to U_INVALID_TABLE_FORMAT if the stored part of
 *                         stage 2 is longer than the full stage 2, or to
 *                         U_MEMORY_ALLOCATION_ERROR if the allocation fails
 */
static void
reconstituteData(UConverterMBCSTable *mbcsTable,
                 uint32_t stage1Length, uint32_t stage2Length,
                 uint32_t fullStage2Length,  /* lengths are numbers of units, not bytes */
                 UErrorCode *pErrorCode) {
    uint16_t *stage1;
    uint32_t *stage2;
    uint32_t dataLength;

    if(fullStage2Length<stage2Length) {
        /*
         * The copy below places the stored part at offset
         * fullStage2Length-stage2Length; that must not wrap around.
         */
        mbcsTable->reconstitutedData=NULL;
        *pErrorCode=U_INVALID_TABLE_FORMAT;
        return;
    }

    dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
    mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
    if(mbcsTable->reconstitutedData==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);

    /* copy existing data and reroute the pointers */
    stage1=(uint16_t *)mbcsTable->reconstitutedData;
    uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);

    /* the stored part of stage 2 goes to the top end of the full stage 2 */
    stage2=(uint32_t *)(stage1+stage1Length);
    uprv_memcpy(stage2+(fullStage2Length-stage2Length),
                mbcsTable->fromUnicodeTable+stage1Length,
                stage2Length*4);

    mbcsTable->fromUnicodeTable=stage1;
    mbcsTable->fromUnicodeBytes=(uint8_t *)(stage2+fullStage2Length);

    /* indexes into stage 2 count from the bottom of the fromUnicodeTable */
    stage2=(uint32_t *)stage1;

    /* reconstitute the initial part of stage 2 from the mbcsIndex */
    {
        int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
        int32_t stageUTF8Index=0;
        int32_t st1, st2, st3, i;

        for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
            st2=stage1[st1];
            /* stage1Length/2 is the stage 2 index directly after stage 1 */
            if(st2!=stage1Length/2) {
                /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
                for(i=0; i<16; ++i) {
                    st3=mbcsTable->mbcsIndex[stageUTF8Index++];
                    if(st3!=0) {
                        /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
                        st3>>=4;
                        /*
                         * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
                         * allocated together as a single 64-block for access from the mbcsIndex
                         */
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3++;
                        stage2[st2++]=st3;
                    } else {
                        /* no stage 3 block, skip */
                        st2+=4;
                    }
                }
            } else {
                /* no stage 2 block, skip */
                stageUTF8Index+=16;
            }
        }
    }

    /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
    ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
}
/* MBCS setup functions ----------------------------------------------------- */
static void
@ -1017,13 +1344,25 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
_MBCSHeader *header=(_MBCSHeader *)raw;
uint32_t offset;
uint32_t headerLength;
UBool noFromU=FALSE;
if(header->version[0]!=4) {
if(header->version[0]==4) {
headerLength=MBCS_HEADER_V4_LENGTH;
} else if(header->version[0]==5 && header->version[1]>=3 &&
(header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
headerLength=header->options&MBCS_OPT_LENGTH_MASK;
noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
} else {
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
mbcsTable->outputType=(uint8_t)header->flags;
if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
/* extension data, header version 4.2 and higher */
offset=header->flags>>8;
@ -1051,7 +1390,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
}
/* load the base table */
baseName=(const char *)(header+1);
baseName=(const char *)header+headerLength*4;
if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
/* forbid loading this same extension-only file */
*pErrorCode=U_INVALID_TABLE_FORMAT;
@ -1095,6 +1434,12 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
mbcsTable->swapLFNLFromUnicodeBytes=NULL;
mbcsTable->swapLFNLName=NULL;
/*
* The reconstitutedData must be deleted only when the base converter
* is unloaded.
*/
mbcsTable->reconstitutedData=NULL;
/*
* Set a special, runtime-only outputType if the extension converter
* is a DBCS version of a base converter that also maps single bytes.
@ -1187,7 +1532,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
mbcsTable->countStates=(uint8_t)header->countStates;
mbcsTable->countToUFallbacks=header->countToUFallbacks;
mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
@ -1244,7 +1589,9 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
* The .cnv file is prebuilt with an additional stage table with indexes
* to each block.
*/
mbcsTable->mbcsIndex=(const uint16_t *)(mbcsTable->fromUnicodeBytes+mbcsTable->fromUBytesLength);
mbcsTable->mbcsIndex=(const uint16_t *)
(mbcsTable->fromUnicodeBytes+
(noFromU ? 0 : mbcsTable->fromUBytesLength));
mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
}
}
@ -1261,6 +1608,16 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
}
mbcsTable->asciiRoundtrips=asciiRoundtrips;
}
if(noFromU) {
uint32_t stage1Length=
mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
0x440 : 0x40;
uint32_t stage2Length=
(header->offsetFromUBytes-header->offsetFromUTable)/4-
stage1Length/2;
reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
}
}
/* Set the impl pointer here so that it is set for both extension-only and base tables. */
@ -1296,6 +1653,9 @@ ucnv_MBCSUnload(UConverterSharedData *sharedData) {
if(mbcsTable->baseSharedData!=NULL) {
ucnv_unload(mbcsTable->baseSharedData);
}
if(mbcsTable->reconstitutedData!=NULL) {
uprv_free(mbcsTable->reconstitutedData);
}
}
static void

View File

@ -23,6 +23,7 @@
#include "unicode/ucnv.h"
#include "ucnv_cnv.h"
#include "ucnv_ext.h"
/**
* ICU conversion (.cnv) data file structure, following the usual UDataInfo
@ -41,6 +42,24 @@
* the same toUnicode structures, while the fromUnicode structures for SBCS
* differ from those for other MBCS-style converters.
*
* _MBCSHeader.version 5 is optional and not backward-compatible
* (as usual for changes in the major version field).
*
* Versions 5.m work like versions 4.m except:
* - The _MBCSHeader has variable length (and is always longer than in version 4).
* See the further description of struct _MBCSHeader below.
* - There is a set of flags which indicate further incompatible changes.
* (Reader code must reject the file if it does not recognize them all.)
* - In particular, one of these flags indicates that most of the fromUnicode
* data is missing and must be reconstituted from the toUnicode data
* and from the utf8Friendly mbcsIndex at load time.
* (This only works with a utf8Friendly table.)
* In this case, makeconv may increase maxFastUChar automatically to U+FFFF.
*
* The first of these versions is 5.3, which is like 4.3 except for the differences above.
*
* When possible, makeconv continues to generate version 4.m files.
*
* _MBCSHeader.version 4.3 optionally modifies the fromUnicode data structures
* slightly and optionally adds a table for conversion to MBCS (non-SBCS)
* charsets.
@ -127,6 +146,26 @@
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
* counts bytes in fromUBytes[]
*
* New and required in version 5:
* 8 uint32_t options, bits:
* 31..16 reserved for flags that can be added without breaking
* backward compatibility
* 15.. 7 reserved for flags whose addition will break
* backward compatibility
* 6 MBCS_OPT_NO_FROM_U -- if set,
* then most of the fromUnicode data is omitted;
* fullStage2Length is present and the missing
* bottom part of stage 2 must be reconstituted from
* the toUnicode data;
* stage 3 is missing completely as well;
* not used for SBCS tables
* 5.. 0 length of the _MBCSHeader (number of uint32_t)
*
* New and optional in version 5:
* 9 uint32_t fullStage2Length: used if MBCS_OPT_NO_FROM_U is set
* specifies the full length of stage 2
* including the omitted part
*
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
* -- base table name for extension-only table
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
@ -153,7 +192,7 @@
* -- BMP-only tables have a smaller stage 1 table
* uint16_t fromUTable[0x40]; (32-bit-aligned)
* }
*
*
* -- stage 2 tables
* length determined by top of stage 1 and bottom of stage 3 tables
* if(outputType==MBCS_OUTPUT_1) {
@ -162,17 +201,24 @@
* } else {
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
* uint32_t stage 2 flags and indexes[?];
* if(options&MBCS_OPT_NO_FROM_U) {
* stage 2 really has length fullStage2Length
* and the omitted lower part must be reconstituted from
* the toUnicode data
* }
* }
*
*
* -- stage 3 tables with byte results
* if(outputType==MBCS_OUTPUT_1) {
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
* uint16_t fromUBytes[fromUBytesLength/2];
* } else {
* } else if(!(options&MBCS_OPT_NO_FROM_U)) {
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
* uint8_t fromUBytes[fromUBytesLength]; or
* uint16_t fromUBytes[fromUBytesLength/2]; or
* uint32_t fromUBytes[fromUBytesLength/4];
* } else {
* fromUBytes[] must be reconstituted from the toUnicode data
* }
*
* -- optional utf8Friendly mbcsIndex -- _MBCSHeader.version 4.3 (ICU 3.8) and higher
@ -340,6 +386,9 @@ typedef struct UConverterMBCSTable {
/* roundtrips */
uint32_t asciiRoundtrips;
/* reconstituted data that was omitted from the .cnv file */
uint8_t *reconstitutedData;
/* converter name for swaplfnl */
char *swapLFNLName;
@ -348,6 +397,26 @@ typedef struct UConverterMBCSTable {
const int32_t *extIndexes;
} UConverterMBCSTable;
/* Bit masks for the _MBCSHeader.options field (header version 5). */
enum {
/* bits 5..0: length of the _MBCSHeader, counted in uint32_t units */
MBCS_OPT_LENGTH_MASK=0x3f,
/* bit 6: most of the fromUnicode data is omitted from the file */
MBCS_OPT_NO_FROM_U=0x40,
/*
* If any of the following options bits are set,
* then the file must be rejected.
*/
/* bits 15..6: all flags whose presence is a backward-incompatible change */
MBCS_OPT_INCOMPATIBLE_MASK=0xffc0,
/*
* Remove bits from this mask as more options are recognized
* by all implementations that use this constant.
*/
/* bits 15..7: MBCS_OPT_INCOMPATIBLE_MASK without the recognized MBCS_OPT_NO_FROM_U */
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80
};
/* _MBCSHeader lengths, counted in uint32_t units (multiply by 4 for bytes). */
enum {
/* fixed header length for header version 4 */
MBCS_HEADER_V4_LENGTH=8,
/* smallest version 5 header: the version 4 fields plus the options field */
MBCS_HEADER_V5_MIN_LENGTH=9
};
/**
* MBCS data header. See data format description above.
*/
@ -360,6 +429,12 @@ typedef struct {
offsetFromUBytes,
flags,
fromUBytesLength;
/* new and required in version 5 */
uint32_t options;
/* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */
uint32_t fullStage2Length; /* number of 32-bit units */
} _MBCSHeader;
/*
@ -456,23 +531,6 @@ U_CFUNC void
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode);
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
/*
* Internal function returning a UnicodeSet for toUnicode() conversion.
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
* In the future, if we add support for reverse-fallback sets, this function
* needs to be updated, and called for each initial state.
* Does not currently handle extensions.
* Does not empty the set first.
*/
U_CFUNC void
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
const USetAdder *sa,
UConverterUnicodeSet which,
uint8_t state, int32_t lowByte, int32_t highByte,
UErrorCode *pErrorCode);
#endif
/*
* Internal function returning a UnicodeSet for toUnicode() conversion.
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
@ -487,16 +545,6 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
UConverterUnicodeSet which,
UErrorCode *pErrorCode);
typedef enum UConverterSetFilter {
UCNV_SET_FILTER_NONE,
UCNV_SET_FILTER_DBCS_ONLY,
UCNV_SET_FILTER_2022_CN,
UCNV_SET_FILTER_SJIS,
UCNV_SET_FILTER_GR94DBCS,
UCNV_SET_FILTER_HZ,
UCNV_SET_FILTER_COUNT
} UConverterSetFilter;
/*
* Same as ucnv_MBCSGetUnicodeSetForUnicode() but
* the set can be filtered by encoding scheme.

View File

@ -186,7 +186,7 @@ $(TESTBUILDDIR)/nfsmxp.spp: $(BINDIR)/gensprep$(EXEEXT) $(TESTSRCDATADIR)/nfs4_m
$(INVOKE) $(BINDIR)/gensprep -s $(TESTSRCDATADIR) $(ICU_DATA_OPT) -d $(TESTBUILDDIR) -b nfsmxp -k -n $(UNICODEDATADIR) -u 3.2.0 nfs4_mixed_prep_p.txt
$(TESTBUILDDIR)/%.cnv: $(TESTSRCDATADIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)
$(INVOKE) $(BINDIR)/makeconv -c -d $(TESTBUILDDIR) $(TESTSRCDATADIR)/$(<F)
$(INVOKE) $(BINDIR)/makeconv --small -c -d $(TESTBUILDDIR) $(TESTSRCDATADIR)/$(<F)
$(TESTBUILDDIR)/%.res: $(TESTSRCDATADIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -q -s $(TESTSRCDATADIR) $(ICU_DATA_OPT) -d $(TESTBUILDDIR) $(<F)

View File

@ -124,21 +124,21 @@ $(TEST_RES_FILES:.res =.res
# Targets for test converter data
"$(TESTDATABLD)\test1.cnv": "$(TESTDATA)\test1.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\test4.cnv": "$(TESTDATA)\test4.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\test4x.cnv": "$(TESTDATA)\test4x.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
"$(TESTDATABLD)\ibm9027.cnv": "$(TESTDATA)\ibm9027.ucm"
@echo Building $@
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**

View File

@ -130,7 +130,7 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
extData->ucm->baseName[length++]=0;
}
headerSize=sizeof(header)+length;
headerSize=MBCS_HEADER_V4_LENGTH*4+length;
/* fill the header */
header.version[0]=4;
@ -138,7 +138,7 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);
/* write the header and the base table name */
udata_writeBlock(pData, &header, sizeof(header));
udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4);
udata_writeBlock(pData, extData->ucm->baseName, length);
}

View File

@ -30,7 +30,7 @@
* Reduce tests for maxCharLength.
*/
typedef struct MBCSData {
struct MBCSData {
NewConverter newConverter;
UCMFile *ucm;
@ -48,10 +48,18 @@ typedef struct MBCSData {
uint32_t stage2Top, stage3Top;
/* fromUTF8 */
uint16_t stageUTF8[MBCS_UTF8_STAGE_SIZE];
uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT]; /* allow for utf8Max=0xffff */
/*
* Maximum UTF-8-friendly code point.
* 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
* If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
*/
uint16_t utf8Max;
UBool utf8Friendly;
} MBCSData;
UBool omitFromU;
};
/* prototypes */
static void
@ -115,6 +123,29 @@ printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
/* implementation ----------------------------------------------------------- */
static MBCSData gDummy;
/*
 * Reset the file-static gDummy and return it configured for use with
 * MBCSOkForBaseFromUnicode().
 *
 * The returned pointer refers to shared static storage which is
 * re-initialized on every call; callers must not keep it across calls
 * if they rely on the earlier contents.
 *
 * @return pointer to the (read-only for callers) dummy MBCSData
 */
U_CFUNC const MBCSData *
MBCSGetDummy(void) {
    uprv_memset(&gDummy, 0, sizeof(gDummy));
    /*
     * Set "pessimistic" values which may sometimes move too many
     * mappings to the extension table (but never too few).
     * These values cause MBCSOkForBaseFromUnicode() to return FALSE for the
     * largest set of mappings.
     * Assume maxCharLength>1.
     */
    gDummy.utf8Friendly=TRUE;
    if(SMALL) {
        /* --small: widest UTF-8-friendly range and no fromUnicode data in the base table */
        gDummy.utf8Max=0xffff;
        gDummy.omitFromU=TRUE;
    } else {
        gDummy.utf8Max=MBCS_UTF8_MAX;
    }
    return &gDummy;
}
static void
MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
uprv_memset(mbcsData, 0, sizeof(MBCSData));
@ -680,7 +711,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
/* inspect stage 1 */
index=c>>MBCS_STAGE_1_SHIFT;
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
} else {
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
@ -716,7 +747,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
/* inspect stage 2 */
index=mbcsData->stage1[index]+nextOffset;
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
/* allocate 64-entry blocks for UTF-8-friendly lookup */
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
@ -761,12 +792,12 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[index];
/* Build an alternate, UTF-8-friendly stage table as well. */
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
/* Overflow for uint16_t entries in stageUTF8? */
if(stage3Index>0xffff) {
/*
* This can occur only if the mapping table is nearly perfectly filled and if
* MBCS_UTF8_MAX==0xffff.
* utf8Max==0xffff.
* (There is no known charset like this. GB 18030 does not map
* surrogate code points and LMBCS does not map 256 PUA code points.)
*
@ -776,20 +807,20 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
* mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
* the initial all-unassigned block in stage3.
*
* Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
*
* (See svn revision 20866 of the markus/ucnvutf8 feature branch for
* code that causes MBCSAddTable() to rebuild the table not utf8Friendly
* in case of overflow. That code was not tested.)
*/
fprintf(stderr, "too many stage 3 entries for UTF-8-friendly format, processing U+%04x<->0x%s\n",
(int)c, printBytes(buffer, bytes, length));
return FALSE;
mbcsData->utf8Max=0xfeff;
} else {
/*
* The stage 3 block has been assigned for the regular trie.
* Just copy its index into stageUTF8[], without the granularity.
*/
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
}
/*
* The stage 3 block has been assigned for the regular trie.
* Just copy its index into stageUTF8[], without the granularity.
*/
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
}
/* write the codepage bytes into stage 3 and get the previous bytes */
@ -856,7 +887,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
}
U_CFUNC UBool
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
UChar32 c, int8_t flag) {
/*
@ -883,7 +914,16 @@ MBCSOkForBaseFromUnicode(UBool utf8Friendly,
* - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
* - any |1 fallback (no roundtrip flags in the optimized table)
*/
if(utf8Friendly && flag<=1 && c<=MBCS_UTF8_MAX && (bytes[0]==0 || flag==1)) {
if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
return FALSE;
}
/*
* If we omit the fromUnicode data, we can only store roundtrips there
* because only they are recoverable from the toUnicode data.
* Fallbacks must go into the extension table.
*/
if(mbcsData->omitFromU && flag!=0) {
return FALSE;
}
@ -918,6 +958,18 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
* indicators are used.
*/
mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
if(utf8Friendly) {
mbcsData->utf8Max=MBCS_UTF8_MAX;
if(SMALL && maxCharLength>1) {
mbcsData->omitFromU=TRUE;
}
} else {
mbcsData->utf8Max=0;
if(SMALL && maxCharLength>1) {
fprintf(stderr,
"makeconv warning: --small not available for .ucm files without |0 etc.\n");
}
}
if(!MBCSStartMappings(mbcsData)) {
return FALSE;
@ -933,6 +985,28 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
c=m->u;
f=m->f;
/*
* Small optimization for --small .cnv files:
*
* If there are fromUnicode mappings above MBCS_UTF8_MAX,
* then the file size will be smaller if we make utf8Max larger
* because the size increase in stageUTF8 will be more than balanced by
* how much less of stage2 needs to be stored.
*
* There is no point in doing this incrementally because stageUTF8
* uses so much less space per block than stage2,
* so we immediately increase utf8Max to 0xffff.
*
* Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
* sets it to that value when stageUTF8 overflows.
*/
if( mbcsData->omitFromU && f<=1 &&
mbcsData->utf8Max<c && c<=0xffff &&
mbcsData->utf8Max<0xfeff
) {
mbcsData->utf8Max=0xffff;
}
switch(f) {
case -1:
/* there was no precision/fallback indicator */
@ -943,7 +1017,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
if(maxCharLength==1) {
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
} else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else {
m->f|=MBCS_FROM_U_EXT_FLAG;
@ -955,7 +1029,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
if(maxCharLength==1) {
staticData->hasFromUnicodeFallback=TRUE;
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
} else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
staticData->hasFromUnicodeFallback=TRUE;
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
} else {
@ -965,7 +1039,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
break;
case 2:
/* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
if(maxCharLength>1 && !MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
if(maxCharLength>1 && m->bLen==1) {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
}
@ -1329,24 +1403,56 @@ static uint32_t
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
UNewDataMemory *pData, int32_t tableType) {
MBCSData *mbcsData=(MBCSData *)cnvData;
uint32_t stage2Start, stage2Length;
uint32_t top, stageUTF8Length=0;
int32_t i, stage1Top;
uint32_t headerLength;
_MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
stage2Length=mbcsData->stage2Top;
if(mbcsData->omitFromU) {
/* find how much of stage2 can be omitted */
int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
uint32_t st2;
i=utf8Limit>>MBCS_STAGE_1_SHIFT;
if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
/* utf8Limit is in the middle of an existing stage 2 block */
stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
} else {
/* find the last stage2 block with mappings before utf8Limit */
while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
/* stage2 up to the end of this block corresponds to stageUTF8 */
stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
}
header.options|=MBCS_OPT_NO_FROM_U;
header.fullStage2Length=stage2Length;
stage2Length-=stage2Start;
if(VERBOSE) {
    /*
     * stage2Start, stage2Top and stage3Top are uint32_t.
     * Cast them so that the arguments match the %lu conversions on
     * platforms where unsigned long is wider than 32 bits.
     */
    printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
           (unsigned long)stage2Start,
           (unsigned long)mbcsData->stage2Top,
           (unsigned long)mbcsData->stage3Top);
    /* omitted stage2 entries are 32-bit units (4 bytes each); stage3Top counts bytes here */
    printf("+ total size savings: %lu bytes\n",
           (unsigned long)stage2Start*4+(unsigned long)mbcsData->stage3Top);
}
} else {
stage2Start=0;
}
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
} else {
stage1Top=0x40; /* 0x40==64 */
}
/* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
if(mbcsData->ucm->states.maxCharLength==1) {
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
} else {
stage1Top=0x40; /* 0x40==64 */
}
for(i=0; i<stage1Top; ++i) {
mbcsData->stage1[i]+=(uint16_t)stage1Top;
}
/* stage2Top has counted 16-bit results, now we need to count bytes */
mbcsData->stage2Top*=2;
/* stage2Top/Length have counted 16-bit results, now we need to count bytes */
/* also round up to a multiple of 4 bytes */
stage2Length=(stage2Length*2+1)&~1;
/* stage3Top has counted 16-bit results, now we need to count bytes */
mbcsData->stage3Top*=2;
@ -1355,40 +1461,47 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
}
} else {
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
} else {
stage1Top=0x40; /* 0x40==64 */
}
for(i=0; i<stage1Top; ++i) {
mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
}
/* stage2Top has counted 32-bit results, now we need to count bytes */
mbcsData->stage2Top*=4;
/* stage2Top/Length have counted 32-bit results, now we need to count bytes */
stage2Length*=4;
/* leave stage2Start counting 32-bit units */
if(mbcsData->utf8Friendly) {
stageUTF8Length=MBCS_UTF8_STAGE_SIZE;
header.version[2]=(uint8_t)(MBCS_UTF8_MAX>>8); /* store 0xd7 for max==0xd7ff */
stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
}
/* stage3Top has already counted bytes */
}
/* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3;
/* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
/* fill the header */
header.version[0]=4;
if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
header.version[0]=5;
if(header.options&MBCS_OPT_NO_FROM_U) {
headerLength=10; /* include fullStage2Length */
} else {
headerLength=MBCS_HEADER_V5_MIN_LENGTH; /* 9 */
}
} else {
header.version[0]=4;
headerLength=MBCS_HEADER_V4_LENGTH; /* 8 */
}
header.version[1]=3;
/* header.version[2] set above for utf8Friendly data */
header.options|=(uint32_t)headerLength;
header.countStates=mbcsData->ucm->states.countStates;
header.countToUFallbacks=mbcsData->countToUFallbacks;
header.offsetToUCodeUnits=
sizeof(_MBCSHeader)+
headerLength*4+
mbcsData->ucm->states.countStates*1024+
mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
header.offsetFromUTable=
@ -1397,10 +1510,13 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
header.offsetFromUBytes=
header.offsetFromUTable+
stage1Top*2+
mbcsData->stage2Top;
stage2Length;
header.fromUBytesLength=mbcsData->stage3Top;
top=header.offsetFromUBytes+header.fromUBytesLength+stageUTF8Length*2;
top=header.offsetFromUBytes+stageUTF8Length*2;
if(!(header.options&MBCS_OPT_NO_FROM_U)) {
top+=header.fromUBytesLength;
}
header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
@ -1414,17 +1530,19 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
}
/* write the MBCS data */
udata_writeBlock(pData, &header, sizeof(_MBCSHeader));
udata_writeBlock(pData, &header, headerLength*4);
udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
if(mbcsData->ucm->states.maxCharLength==1) {
udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
} else {
udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
}
if(!(header.options&MBCS_OPT_NO_FROM_U)) {
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
}
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
if(stageUTF8Length>0) {
udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);

View File

@ -101,9 +101,20 @@ enum {
U_CFUNC NewConverter *
MBCSOpen(UCMFile *ucm);
/* Forward declaration: only the incomplete type is visible through this header. */
struct MBCSData;
typedef struct MBCSData MBCSData;
/*
 * Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode()
 * for creating an extension-only file.
 * Assume maxCharLength>1.
 *
 * Takes no arguments; declared with (void) so that C callers get a real prototype.
 */
U_CFUNC const MBCSData *
MBCSGetDummy(void);
/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
U_CFUNC UBool
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
UChar32 c, int8_t flag);

View File

@ -34,6 +34,8 @@
#include "makeconv.h"
#include "genmbcs.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
#define DEBUG 0
typedef struct ConvData {
@ -76,6 +78,7 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP
* Global - verbosity
*/
UBool VERBOSE = FALSE;
UBool SMALL = FALSE;
static void
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
@ -163,13 +166,25 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
}
}
/*
 * Indexes into options[].
 * The order of these constants must match the order of the
 * initializers in options[].
 */
enum {
    OPT_HELP_H=0,               /* -h */
    OPT_HELP_QUESTION_MARK,     /* -? */
    OPT_COPYRIGHT,
    OPT_VERSION,
    OPT_DESTDIR,
    OPT_VERBOSE,
    OPT_SMALL,                  /* --small */
    OPT_COUNT                   /* number of options, not an option itself */
};
static UOption options[]={
UOPTION_HELP_H, /* 0 Numbers for those who*/
UOPTION_HELP_QUESTION_MARK, /* 1 can't count. */
UOPTION_COPYRIGHT, /* 2 */
UOPTION_VERSION, /* 3 */
UOPTION_DESTDIR, /* 4 */
UOPTION_VERBOSE, /* 5 */
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_COPYRIGHT,
UOPTION_VERSION,
UOPTION_DESTDIR,
UOPTION_VERBOSE,
{ "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
};
int main(int argc, char* argv[])
@ -194,8 +209,8 @@ int main(int argc, char* argv[])
uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
/* preset then read command line options */
options[4].value=u_getDataDirectory();
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
options[OPT_DESTDIR].value=u_getDataDirectory();
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
/* error handling, printing usage message */
if(argc<0) {
@ -205,8 +220,9 @@ int main(int argc, char* argv[])
} else if(argc<2) {
argc=-1;
}
if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
fprintf(stderr,
if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
FILE *stdfile=argc<0 ? stderr : stdout;
fprintf(stdfile,
"usage: %s [-options] files...\n"
"\tread .ucm codepage mapping files and write .cnv files\n"
"options:\n"
@ -216,20 +232,26 @@ int main(int argc, char* argv[])
"\t-d or --destdir destination directory, followed by the path\n"
"\t-v or --verbose Turn on verbose output\n",
argv[0]);
fprintf(stdfile,
"\t --small Generate smaller .cnv files. They will be\n"
"\t significantly smaller but may not be compatible with\n"
"\t older versions of ICU and will require heap memory\n"
"\t allocation when loaded.\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
if(options[3].doesOccur) {
fprintf(stderr,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
fprintf(stderr, U_COPYRIGHT_STRING "\n");
if(options[OPT_VERSION].doesOccur) {
printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
printf("%s\n", U_COPYRIGHT_STRING);
exit(0);
}
/* get the options values */
haveCopyright = options[2].doesOccur;
destdir = options[4].value;
VERBOSE = options[5].doesOccur;
haveCopyright = options[OPT_COPYRIGHT].doesOccur;
destdir = options[OPT_DESTDIR].value;
VERBOSE = options[OPT_VERBOSE].doesOccur;
SMALL = options[OPT_SMALL].doesOccur;
if (destdir != NULL && *destdir != 0) {
uprv_strcpy(outFileName, destdir);
@ -766,12 +788,13 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
*
* Do this after ucm_checkBaseExt().
*/
const MBCSData *mbcsData=MBCSGetDummy();
int32_t needsMove=0;
for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
m<mLimit;
++m
) {
if(!MBCSOkForBaseFromUnicode(TRUE, m->b.bytes, m->bLen, m->u, m->f)) {
if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
++needsMove;

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2006, International Business Machines
* Copyright (C) 2000-2007, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -24,6 +24,7 @@
/* exports from makeconv.c */
U_CFUNC UBool VERBOSE;
U_CFUNC UBool SMALL;
/* converter table type for writing */
enum {

View File

@ -497,7 +497,7 @@ ucnv_enumDependencies(const UDataSwapper *ds,
/* check for supported conversionType values */
if(inStaticData->conversionType==UCNV_MBCS) {
/* MBCS data */
uint32_t mbcsHeaderFlags;
uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions;
int32_t extOffset;
inMBCSHeader=(const _MBCSHeader *)inBytes;
@ -508,7 +508,14 @@ ucnv_enumDependencies(const UDataSwapper *ds,
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return;
}
if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) {
if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) {
mbcsHeaderLength=MBCS_HEADER_V4_LENGTH;
} else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 &&
((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))&
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0
) {
mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK;
} else {
udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n",
inMBCSHeader->version[0], inMBCSHeader->version[1]);
*pErrorCode=U_UNSUPPORTED_ERROR;
@ -536,14 +543,15 @@ ucnv_enumDependencies(const UDataSwapper *ds,
}
/* swap the base name, between the header and the extension data */
baseNameLength=(int32_t)strlen((const char *)(inMBCSHeader+1));
const char *inBaseName=(const char *)inBytes+mbcsHeaderLength*4;
baseNameLength=(int32_t)strlen(inBaseName);
if(baseNameLength>=(int32_t)sizeof(baseName)) {
udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n",
itemName, baseNameLength);
*pErrorCode=U_UNSUPPORTED_ERROR;
return;
}
ds->swapInvChars(ds, inMBCSHeader+1, baseNameLength+1, baseName, pErrorCode);
ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode);
checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode);
}