ICU-5987 merge small-conversion-file feature into trunk, from svn merge -r 22780:22805 .../branches/markus/smallcnv
X-SVN-Rev: 22852
This commit is contained in:
parent
b69ac49696
commit
295dc24d64
@ -1261,6 +1261,9 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
const _MBCSHeader *inMBCSHeader;
|
||||
_MBCSHeader *outMBCSHeader;
|
||||
_MBCSHeader mbcsHeader;
|
||||
uint32_t mbcsHeaderLength;
|
||||
UBool noFromU=FALSE;
|
||||
|
||||
uint8_t outputType;
|
||||
|
||||
int32_t maxFastUChar, mbcsIndexLength;
|
||||
@ -1350,7 +1353,15 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) {
|
||||
if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) {
|
||||
mbcsHeaderLength=MBCS_HEADER_V4_LENGTH;
|
||||
} else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 &&
|
||||
((mbcsHeader.options=ds->readUInt32(inMBCSHeader->options))&
|
||||
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0
|
||||
) {
|
||||
mbcsHeaderLength=mbcsHeader.options&MBCS_OPT_LENGTH_MASK;
|
||||
noFromU=(UBool)((mbcsHeader.options&MBCS_OPT_NO_FROM_U)!=0);
|
||||
} else {
|
||||
udata_printError(ds, "ucnv_swap(): unsupported _MBCSHeader.version %d.%d\n",
|
||||
inMBCSHeader->version[0], inMBCSHeader->version[1]);
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
@ -1365,9 +1376,15 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
mbcsHeader.offsetFromUBytes= ds->readUInt32(inMBCSHeader->offsetFromUBytes);
|
||||
mbcsHeader.flags= ds->readUInt32(inMBCSHeader->flags);
|
||||
mbcsHeader.fromUBytesLength= ds->readUInt32(inMBCSHeader->fromUBytesLength);
|
||||
/* mbcsHeader.options have been read above */
|
||||
|
||||
extOffset=(int32_t)(mbcsHeader.flags>>8);
|
||||
outputType=(uint8_t)mbcsHeader.flags;
|
||||
if(noFromU && outputType==MBCS_OUTPUT_1) {
|
||||
udata_printError(ds, "ucnv_swap(): unsupported combination of makeconv --small with SBCS\n");
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* make sure that the output type is known */
|
||||
switch(outputType) {
|
||||
@ -1406,7 +1423,10 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
}
|
||||
|
||||
if(extOffset==0) {
|
||||
size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsHeader.fromUBytesLength+mbcsIndexLength);
|
||||
size=(int32_t)(mbcsHeader.offsetFromUBytes+mbcsIndexLength);
|
||||
if(!noFromU) {
|
||||
size+=(int32_t)mbcsHeader.fromUBytesLength;
|
||||
}
|
||||
|
||||
/* avoid compiler warnings - not otherwise necessary, and the value does not matter */
|
||||
inExtIndexes=NULL;
|
||||
@ -1436,8 +1456,9 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
uprv_memcpy(outBytes, inBytes, size);
|
||||
}
|
||||
|
||||
/* swap the MBCSHeader */
|
||||
ds->swapArray32(ds, &inMBCSHeader->countStates, 7*4,
|
||||
/* swap the MBCSHeader, except for the version field */
|
||||
count=mbcsHeaderLength*4;
|
||||
ds->swapArray32(ds, &inMBCSHeader->countStates, count-4,
|
||||
&outMBCSHeader->countStates, pErrorCode);
|
||||
|
||||
if(outputType==MBCS_OUTPUT_EXT_ONLY) {
|
||||
@ -1447,18 +1468,23 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
*/
|
||||
|
||||
/* swap the base name, between the header and the extension data */
|
||||
ds->swapInvChars(ds, inMBCSHeader+1, (int32_t)uprv_strlen((const char *)(inMBCSHeader+1)),
|
||||
outMBCSHeader+1, pErrorCode);
|
||||
const char *inBaseName=(const char *)inBytes+count;
|
||||
char *outBaseName=(char *)outBytes+count;
|
||||
ds->swapInvChars(ds, inBaseName, (int32_t)uprv_strlen(inBaseName),
|
||||
outBaseName, pErrorCode);
|
||||
} else {
|
||||
/* normal file with base table data */
|
||||
|
||||
/* swap the state table, 1kB per state */
|
||||
ds->swapArray32(ds, inMBCSHeader+1, (int32_t)(mbcsHeader.countStates*1024),
|
||||
outMBCSHeader+1, pErrorCode);
|
||||
offset=count;
|
||||
count=mbcsHeader.countStates*1024;
|
||||
ds->swapArray32(ds, inBytes+offset, (int32_t)count,
|
||||
outBytes+offset, pErrorCode);
|
||||
|
||||
/* swap the toUFallbacks[] */
|
||||
offset=sizeof(_MBCSHeader)+mbcsHeader.countStates*1024;
|
||||
ds->swapArray32(ds, inBytes+offset, (int32_t)(mbcsHeader.countToUFallbacks*8),
|
||||
offset+=count;
|
||||
count=mbcsHeader.countToUFallbacks*8;
|
||||
ds->swapArray32(ds, inBytes+offset, (int32_t)count,
|
||||
outBytes+offset, pErrorCode);
|
||||
|
||||
/* swap the unicodeCodeUnits[] */
|
||||
@ -1495,7 +1521,7 @@ ucnv_swap(const UDataSwapper *ds,
|
||||
|
||||
/* stage 3/result bytes: sometimes uint16_t[] or uint32_t[] */
|
||||
offset=mbcsHeader.offsetFromUBytes;
|
||||
count=mbcsHeader.fromUBytesLength;
|
||||
count= noFromU ? 0 : mbcsHeader.fromUBytesLength;
|
||||
switch(outputType) {
|
||||
case MBCS_OUTPUT_2:
|
||||
case MBCS_OUTPUT_3_EUC:
|
||||
|
@ -175,6 +175,19 @@ typedef UConverter * (*UConverterSafeClone) (const UConverter *cnv,
|
||||
int32_t *pBufferSize,
|
||||
UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Filters for some ucnv_getUnicodeSet() implementation code.
|
||||
*/
|
||||
typedef enum UConverterSetFilter {
|
||||
UCNV_SET_FILTER_NONE,
|
||||
UCNV_SET_FILTER_DBCS_ONLY,
|
||||
UCNV_SET_FILTER_2022_CN,
|
||||
UCNV_SET_FILTER_SJIS,
|
||||
UCNV_SET_FILTER_GR94DBCS,
|
||||
UCNV_SET_FILTER_HZ,
|
||||
UCNV_SET_FILTER_COUNT
|
||||
} UConverterSetFilter;
|
||||
|
||||
/**
|
||||
* Fills the set of Unicode code points that can be converted by an ICU converter.
|
||||
* The API function ucnv_getUnicodeSet() clears the USet before calling
|
||||
|
@ -61,9 +61,47 @@
|
||||
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
|
||||
|
||||
/*
|
||||
* _MBCSHeader version 4.3
|
||||
* _MBCSHeader versions 5.3 & 4.3
|
||||
* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
|
||||
*
|
||||
* This version is optional. Version 5 is used for incompatible data format changes.
|
||||
* makeconv will continue to generate version 4 files if possible.
|
||||
*
|
||||
* Changes from version 4:
|
||||
*
|
||||
* The main difference is an additional _MBCSHeader field with
|
||||
* - the length (number of uint32_t) of the _MBCSHeader
|
||||
* - flags for further incompatible data format changes
|
||||
* - flags for further, backward compatible data format changes
|
||||
*
|
||||
* The MBCS_OPT_NO_FROM_U flag indicates that most of the fromUnicode data is omitted from
|
||||
* the file and needs to be reconstituted at load time.
|
||||
* This requires a utf8Friendly format with an additional mbcsIndex table for fast
|
||||
* (and UTF-8-friendly) fromUnicode conversion for Unicode code points up to maxFastUChar.
|
||||
* (For details about these structures see below, and see ucnvmbcs.h.)
|
||||
*
|
||||
* utf8Friendly also implies that the fromUnicode mappings are stored in ascending order
|
||||
* of the Unicode code points. (This requires that the .ucm file has the |0 etc.
|
||||
* precision markers for all mappings.)
|
||||
*
|
||||
* All fallbacks have been moved to the extension table, leaving only roundtrips in the
|
||||
* omitted data that can be reconstituted from the toUnicode data.
|
||||
*
|
||||
* Of the stage 2 table, the part corresponding to maxFastUChar and below is omitted.
|
||||
* With only roundtrip mappings in the base fromUnicode data, this part is fully
|
||||
* redundant with the mbcsIndex and will be reconstituted from that (also using the
|
||||
* stage 1 table which contains the information about how stage 2 was compacted).
|
||||
*
|
||||
* The rest of the stage 2 table, the part for code points above maxFastUChar,
|
||||
* is stored in the file and will be appended to the reconstituted part.
|
||||
*
|
||||
* The entire fromUBytes array is omitted from the file and will be reconstituted.
|
||||
* This is done by enumerating all toUnicode roundtrip mappings, performing
|
||||
* each mapping (using the stage 1 and reconstituted stage 2 tables) and
|
||||
* writing instead of reading the byte values.
|
||||
*
|
||||
* _MBCSHeader version 4.3
|
||||
*
|
||||
* Change from version 4.2:
|
||||
* - Optional utf8Friendly data structures, with 64-entry stage 3 block
|
||||
* allocation for parts of the BMP, and an additional mbcsIndex in non-SBCS
|
||||
@ -362,101 +400,240 @@ gb18030Ranges[13][4]={
|
||||
|
||||
/* Miscellaneous ------------------------------------------------------------ */
|
||||
|
||||
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
||||
/**
|
||||
* Callback from ucnv_MBCSEnumToUnicode(), takes 32 mappings from
|
||||
* consecutive sequences of bytes, starting from the one encoded in value,
|
||||
* to Unicode code points. (Multiple mappings to reduce per-function call overhead.)
|
||||
* Does not currently support m:n mappings or reverse fallbacks.
|
||||
* This function will not be called for sequences of bytes with leading zeros.
|
||||
*
|
||||
* @param context an opaque pointer, as passed into ucnv_MBCSEnumToUnicode()
|
||||
* @param value contains 1..4 bytes of the first byte sequence, right-aligned
|
||||
* @param codePoints resulting Unicode code points, or negative if a byte sequence does
|
||||
* not map to anything
|
||||
* @return TRUE to continue enumeration, FALSE to stop
|
||||
*/
|
||||
typedef UBool U_CALLCONV
|
||||
UConverterEnumToUCallback(const void *context, uint32_t value, UChar32 codePoints[32]);
|
||||
|
||||
/* similar to ucnv_MBCSGetNextUChar() but recursive */
|
||||
static void
|
||||
_getUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
const int32_t (*stateTable)[256], const uint16_t *unicodeCodeUnits,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, uint32_t offset, int32_t lowByte, int32_t highByte,
|
||||
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t b, entry;
|
||||
static UBool
|
||||
enumToU(UConverterMBCSTable *mbcsTable, int8_t stateProps[],
|
||||
int32_t state, uint32_t offset,
|
||||
uint32_t value,
|
||||
UConverterEnumToUCallback *callback, const void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar32 codePoints[32];
|
||||
const int32_t *row;
|
||||
const uint16_t *unicodeCodeUnits;
|
||||
UChar32 anyCodePoints;
|
||||
int32_t b, limit;
|
||||
|
||||
for(b=lowByte; b<=highByte; ++b) {
|
||||
entry=stateTable[state][b];
|
||||
row=mbcsTable->stateTable[state];
|
||||
unicodeCodeUnits=mbcsTable->unicodeCodeUnits;
|
||||
|
||||
value<<=8;
|
||||
anyCodePoints=-1; /* becomes non-negative if there is a mapping */
|
||||
|
||||
b=(stateProps[state]&0x38)<<2;
|
||||
if(b==0 && stateProps[state]>=0x40) {
|
||||
/* skip byte sequences with leading zeros because they are not stored in the fromUnicode table */
|
||||
codePoints[0]=U_SENTINEL;
|
||||
b=1;
|
||||
}
|
||||
limit=((stateProps[state]&7)+1)<<5;
|
||||
while(b<limit) {
|
||||
int32_t entry=row[b];
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||
_getUnicodeSetForBytes(
|
||||
sharedData, stateTable, unicodeCodeUnits,
|
||||
sa, which,
|
||||
(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry),
|
||||
offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
|
||||
0, 0xff,
|
||||
pErrorCode);
|
||||
int32_t nextState=MBCS_ENTRY_TRANSITION_STATE(entry);
|
||||
if(stateProps[nextState]>=0) {
|
||||
/* recurse to a state with non-ignorable actions */
|
||||
if(!enumToU(
|
||||
mbcsTable, stateProps, nextState,
|
||||
offset+MBCS_ENTRY_TRANSITION_OFFSET(entry),
|
||||
value|(uint32_t)b,
|
||||
callback, context,
|
||||
pErrorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
codePoints[b&0x1f]=U_SENTINEL;
|
||||
} else {
|
||||
UChar32 c;
|
||||
int32_t rowOffset=offset;
|
||||
uint8_t action;
|
||||
|
||||
c=U_SENTINEL;
|
||||
int32_t action;
|
||||
|
||||
/*
|
||||
* An if-else-if chain provides more reliable performance for
|
||||
* the most common cases compared to a switch.
|
||||
*/
|
||||
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
|
||||
action=MBCS_ENTRY_FINAL_ACTION(entry);
|
||||
if(action==MBCS_STATE_VALID_DIRECT_16) {
|
||||
/* output BMP code point */
|
||||
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
} else if(action==MBCS_STATE_VALID_16) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset];
|
||||
int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[finalOffset];
|
||||
if(c<0xfffe) {
|
||||
/* output BMP code point */
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_16_PAIR) {
|
||||
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[offset++];
|
||||
int32_t finalOffset=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
|
||||
c=unicodeCodeUnits[finalOffset++];
|
||||
if(c<0xd800) {
|
||||
/* output BMP code point below 0xd800 */
|
||||
} else if(c<=0xdbff) {
|
||||
/* output roundtrip or fallback supplementary code point */
|
||||
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
|
||||
c=((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00);
|
||||
} else if(c==0xe000) {
|
||||
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
|
||||
c=unicodeCodeUnits[offset];
|
||||
c=unicodeCodeUnits[finalOffset];
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
} else if(action==MBCS_STATE_VALID_DIRECT_20) {
|
||||
/* output supplementary code point */
|
||||
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
|
||||
} else {
|
||||
c=U_SENTINEL;
|
||||
}
|
||||
|
||||
if(c>=0) {
|
||||
sa->add(sa->set, c);
|
||||
codePoints[b&0x1f]=c;
|
||||
anyCodePoints&=c;
|
||||
}
|
||||
if(((++b)&0x1f)==0) {
|
||||
if(anyCodePoints>=0) {
|
||||
if(!callback(context, value|(uint32_t)(b-0x20), codePoints)) {
|
||||
return FALSE;
|
||||
}
|
||||
anyCodePoints=-1;
|
||||
}
|
||||
offset=rowOffset;
|
||||
}
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||
* In the future, if we add support for reverse-fallback sets, this function
|
||||
* needs to be updated, and called for each initial state.
|
||||
* Does not currently handle extensions.
|
||||
* Does not empty the set first.
|
||||
* Only called if stateProps[state]==-1.
|
||||
* A recursive call may do stateProps[state]|=0x40 if this state is the target of an
|
||||
* MBCS_STATE_CHANGE_ONLY.
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||
UErrorCode *pErrorCode) {
|
||||
_getUnicodeSetForBytes(
|
||||
sharedData, sharedData->mbcs.stateTable, sharedData->mbcs.unicodeCodeUnits,
|
||||
sa, which,
|
||||
state, 0, lowByte, highByte,
|
||||
pErrorCode);
|
||||
static int8_t
|
||||
getStateProp(const int32_t (*stateTable)[256], int8_t stateProps[], int state) {
|
||||
const int32_t *row;
|
||||
int32_t min, max, entry, nextState;
|
||||
|
||||
row=stateTable[state];
|
||||
stateProps[state]=0;
|
||||
|
||||
/* find first non-ignorable state */
|
||||
for(min=0;; ++min) {
|
||||
entry=row[min];
|
||||
nextState=MBCS_ENTRY_STATE(entry);
|
||||
if(stateProps[nextState]==-1) {
|
||||
getStateProp(stateTable, stateProps, nextState);
|
||||
}
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||
if(stateProps[nextState]>=0) {
|
||||
break;
|
||||
}
|
||||
} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
|
||||
break;
|
||||
}
|
||||
if(min==0xff) {
|
||||
stateProps[state]=-0x40; /* (int8_t)0xc0 */
|
||||
return stateProps[state];
|
||||
}
|
||||
}
|
||||
stateProps[state]|=(int8_t)((min>>5)<<3);
|
||||
|
||||
/* find last non-ignorable state */
|
||||
for(max=0xff; min<max; --max) {
|
||||
entry=row[max];
|
||||
nextState=MBCS_ENTRY_STATE(entry);
|
||||
if(stateProps[nextState]==-1) {
|
||||
getStateProp(stateTable, stateProps, nextState);
|
||||
}
|
||||
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
|
||||
if(stateProps[nextState]>=0) {
|
||||
break;
|
||||
}
|
||||
} else if(MBCS_ENTRY_FINAL_ACTION(entry)<MBCS_STATE_UNASSIGNED) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
stateProps[state]|=(int8_t)(max>>5);
|
||||
|
||||
/* recurse further and collect direct-state information */
|
||||
while(min<=max) {
|
||||
entry=row[min];
|
||||
nextState=MBCS_ENTRY_STATE(entry);
|
||||
if(stateProps[nextState]==-1) {
|
||||
getStateProp(stateTable, stateProps, nextState);
|
||||
}
|
||||
if(MBCS_ENTRY_IS_FINAL(entry)) {
|
||||
stateProps[nextState]|=0x40;
|
||||
if(MBCS_ENTRY_FINAL_ACTION(entry)<=MBCS_STATE_FALLBACK_DIRECT_20) {
|
||||
stateProps[state]|=0x40;
|
||||
}
|
||||
}
|
||||
++min;
|
||||
}
|
||||
return stateProps[state];
|
||||
}
|
||||
|
||||
#endif
|
||||
/*
|
||||
* Internal function enumerating the toUnicode data of an MBCS converter.
|
||||
* Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U
|
||||
* table, but could also be used for a future ucnv_getUnicodeSet() option
|
||||
* that includes reverse fallbacks (after updating this function's implementation).
|
||||
* Currently only handles roundtrip mappings.
|
||||
* Does not currently handle extensions.
|
||||
*/
|
||||
static void
|
||||
ucnv_MBCSEnumToUnicode(UConverterMBCSTable *mbcsTable,
|
||||
UConverterEnumToUCallback *callback, const void *context,
|
||||
UErrorCode *pErrorCode) {
|
||||
/*
|
||||
* Properties for each state, to speed up the enumeration.
|
||||
* Ignorable actions are unassigned/illegal/state-change-only:
|
||||
* They do not lead to mappings.
|
||||
*
|
||||
* Bits 7..6:
|
||||
* 1 direct/initial state (stateful converters have multiple)
|
||||
* 0 non-initial state with transitions or with non-ignorable result actions
|
||||
* -1 final state with only ignorable actions
|
||||
*
|
||||
* Bits 5..3:
|
||||
* The lowest byte value with non-ignorable actions is
|
||||
* value<<5 (rounded down).
|
||||
*
|
||||
* Bits 2..0:
|
||||
* The highest byte value with non-ignorable actions is
|
||||
* (value<<5)&0x1f (rounded up).
|
||||
*/
|
||||
int8_t stateProps[MBCS_MAX_STATE_COUNT];
|
||||
int32_t state;
|
||||
|
||||
uprv_memset(stateProps, -1, sizeof(stateProps));
|
||||
|
||||
/* recurse from state 0 and set all stateProps */
|
||||
getStateProp(mbcsTable->stateTable, stateProps, 0);
|
||||
|
||||
for(state=0; state<mbcsTable->countStates; ++state) {
|
||||
/*if(stateProps[state]==-1) {
|
||||
printf("unused/unreachable <icu:state> %d\n", state);
|
||||
}*/
|
||||
if(stateProps[state]>=0x40) {
|
||||
/* start from each direct state */
|
||||
enumToU(
|
||||
mbcsTable, stateProps, state, 0, 0,
|
||||
callback, context,
|
||||
pErrorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetFilteredUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
@ -1006,6 +1183,156 @@ _EBCDICSwapLFNL(UConverterSharedData *sharedData, UErrorCode *pErrorCode) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/* reconstitute omitted fromUnicode data ------------------------------------ */
|
||||
|
||||
/* for details, compare with genmbcs.c MBCSAddFromUnicode() and transformEUC() */
|
||||
static UBool U_CALLCONV
|
||||
writeStage3Roundtrip(const void *context, uint32_t value, UChar32 codePoints[32]) {
|
||||
UConverterMBCSTable *mbcsTable=(UConverterMBCSTable *)context;
|
||||
const uint16_t *table;
|
||||
uint32_t *stage2;
|
||||
uint8_t *bytes, *p;
|
||||
UChar32 c;
|
||||
int32_t i, st3;
|
||||
|
||||
table=mbcsTable->fromUnicodeTable;
|
||||
bytes=(uint8_t *)mbcsTable->fromUnicodeBytes;
|
||||
|
||||
/* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */
|
||||
switch(mbcsTable->outputType) {
|
||||
case MBCS_OUTPUT_3_EUC:
|
||||
if(value<=0xffff) {
|
||||
/* short sequences are stored directly */
|
||||
/* code set 0 or 1 */
|
||||
} else if(value<=0x8effff) {
|
||||
/* code set 2 */
|
||||
value&=0x7fff;
|
||||
} else /* first byte is 0x8f */ {
|
||||
/* code set 3 */
|
||||
value&=0xff7f;
|
||||
}
|
||||
break;
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
if(value<=0xffffff) {
|
||||
/* short sequences are stored directly */
|
||||
/* code set 0 or 1 */
|
||||
} else if(value<=0x8effffff) {
|
||||
/* code set 2 */
|
||||
value&=0x7fffff;
|
||||
} else /* first byte is 0x8f */ {
|
||||
/* code set 3 */
|
||||
value&=0xff7fff;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
for(i=0; i<=0x1f; ++value, ++i) {
|
||||
c=codePoints[i];
|
||||
if(c<0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
/* locate the stage 2 & 3 data */
|
||||
stage2=((uint32_t *)table)+table[c>>10]+((c>>4)&0x3f);
|
||||
p=bytes;
|
||||
st3=(int32_t)(uint16_t)*stage2*16+(c&0xf);
|
||||
|
||||
/* write the codepage bytes into stage 3 */
|
||||
switch(mbcsTable->outputType) {
|
||||
case MBCS_OUTPUT_3:
|
||||
case MBCS_OUTPUT_4_EUC:
|
||||
p+=st3*3;
|
||||
p[0]=(uint8_t)(value>>16);
|
||||
p[1]=(uint8_t)(value>>8);
|
||||
p[2]=(uint8_t)value;
|
||||
break;
|
||||
case MBCS_OUTPUT_4:
|
||||
((uint32_t *)p)[st3]=value;
|
||||
break;
|
||||
default:
|
||||
/* 2 bytes per character */
|
||||
((uint16_t *)p)[st3]=(uint16_t)value;
|
||||
break;
|
||||
}
|
||||
|
||||
/* set the roundtrip flag */
|
||||
*stage2|=(1UL<<(16+(c&0xf)));
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
static void
|
||||
reconstituteData(UConverterMBCSTable *mbcsTable,
|
||||
uint32_t stage1Length, uint32_t stage2Length,
|
||||
uint32_t fullStage2Length, /* lengths are numbers of units, not bytes */
|
||||
UErrorCode *pErrorCode) {
|
||||
uint16_t *stage1;
|
||||
uint32_t *stage2;
|
||||
uint8_t *bytes;
|
||||
uint32_t dataLength=stage1Length*2+fullStage2Length*4+mbcsTable->fromUBytesLength;
|
||||
mbcsTable->reconstitutedData=(uint8_t *)uprv_malloc(dataLength);
|
||||
if(mbcsTable->reconstitutedData==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
uprv_memset(mbcsTable->reconstitutedData, 0, dataLength);
|
||||
|
||||
/* copy existing data and reroute the pointers */
|
||||
stage1=(uint16_t *)mbcsTable->reconstitutedData;
|
||||
uprv_memcpy(stage1, mbcsTable->fromUnicodeTable, stage1Length*2);
|
||||
|
||||
stage2=(uint32_t *)(stage1+stage1Length);
|
||||
uprv_memcpy(stage2+(fullStage2Length-stage2Length),
|
||||
mbcsTable->fromUnicodeTable+stage1Length,
|
||||
stage2Length*4);
|
||||
|
||||
mbcsTable->fromUnicodeTable=stage1;
|
||||
mbcsTable->fromUnicodeBytes=bytes=(uint8_t *)(stage2+fullStage2Length);
|
||||
|
||||
/* indexes into stage 2 count from the bottom of the fromUnicodeTable */
|
||||
stage2=(uint32_t *)stage1;
|
||||
|
||||
/* reconstitute the initial part of stage 2 from the mbcsIndex */
|
||||
{
|
||||
int32_t stageUTF8Length=((int32_t)mbcsTable->maxFastUChar+1)>>6;
|
||||
int32_t stageUTF8Index=0;
|
||||
int32_t st1, st2, st3, i;
|
||||
|
||||
for(st1=0; stageUTF8Index<stageUTF8Length; ++st1) {
|
||||
st2=stage1[st1];
|
||||
if(st2!=stage1Length/2) {
|
||||
/* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */
|
||||
for(i=0; i<16; ++i) {
|
||||
st3=mbcsTable->mbcsIndex[stageUTF8Index++];
|
||||
if(st3!=0) {
|
||||
/* an stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */
|
||||
st3>>=4;
|
||||
/*
|
||||
* 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are
|
||||
* allocated together as a single 64-block for access from the mbcsIndex
|
||||
*/
|
||||
stage2[st2++]=st3++;
|
||||
stage2[st2++]=st3++;
|
||||
stage2[st2++]=st3++;
|
||||
stage2[st2++]=st3;
|
||||
} else {
|
||||
/* no stage 3 block, skip */
|
||||
st2+=4;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/* no stage 2 block, skip */
|
||||
stageUTF8Index+=16;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */
|
||||
ucnv_MBCSEnumToUnicode(mbcsTable, writeStage3Roundtrip, mbcsTable, pErrorCode);
|
||||
}
|
||||
|
||||
/* MBCS setup functions ----------------------------------------------------- */
|
||||
|
||||
static void
|
||||
@ -1017,13 +1344,25 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
UConverterMBCSTable *mbcsTable=&sharedData->mbcs;
|
||||
_MBCSHeader *header=(_MBCSHeader *)raw;
|
||||
uint32_t offset;
|
||||
uint32_t headerLength;
|
||||
UBool noFromU=FALSE;
|
||||
|
||||
if(header->version[0]!=4) {
|
||||
if(header->version[0]==4) {
|
||||
headerLength=MBCS_HEADER_V4_LENGTH;
|
||||
} else if(header->version[0]==5 && header->version[1]>=3 &&
|
||||
(header->options&MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0) {
|
||||
headerLength=header->options&MBCS_OPT_LENGTH_MASK;
|
||||
noFromU=(UBool)((header->options&MBCS_OPT_NO_FROM_U)!=0);
|
||||
} else {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
return;
|
||||
}
|
||||
|
||||
mbcsTable->outputType=(uint8_t)header->flags;
|
||||
if(noFromU && mbcsTable->outputType==MBCS_OUTPUT_1) {
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
return;
|
||||
}
|
||||
|
||||
/* extension data, header version 4.2 and higher */
|
||||
offset=header->flags>>8;
|
||||
@ -1051,7 +1390,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
}
|
||||
|
||||
/* load the base table */
|
||||
baseName=(const char *)(header+1);
|
||||
baseName=(const char *)header+headerLength*4;
|
||||
if(0==uprv_strcmp(baseName, sharedData->staticData->name)) {
|
||||
/* forbid loading this same extension-only file */
|
||||
*pErrorCode=U_INVALID_TABLE_FORMAT;
|
||||
@ -1095,6 +1434,12 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
mbcsTable->swapLFNLFromUnicodeBytes=NULL;
|
||||
mbcsTable->swapLFNLName=NULL;
|
||||
|
||||
/*
|
||||
* The reconstitutedData must be deleted only when the base converter
|
||||
* is unloaded.
|
||||
*/
|
||||
mbcsTable->reconstitutedData=NULL;
|
||||
|
||||
/*
|
||||
* Set a special, runtime-only outputType if the extension converter
|
||||
* is a DBCS version of a base converter that also maps single bytes.
|
||||
@ -1187,7 +1532,7 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
|
||||
mbcsTable->countStates=(uint8_t)header->countStates;
|
||||
mbcsTable->countToUFallbacks=header->countToUFallbacks;
|
||||
mbcsTable->stateTable=(const int32_t (*)[256])(raw+sizeof(_MBCSHeader));
|
||||
mbcsTable->stateTable=(const int32_t (*)[256])(raw+headerLength*4);
|
||||
mbcsTable->toUFallbacks=(const _MBCSToUFallback *)(mbcsTable->stateTable+header->countStates);
|
||||
mbcsTable->unicodeCodeUnits=(const uint16_t *)(raw+header->offsetToUCodeUnits);
|
||||
|
||||
@ -1244,7 +1589,9 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
* The .cnv file is prebuilt with an additional stage table with indexes
|
||||
* to each block.
|
||||
*/
|
||||
mbcsTable->mbcsIndex=(const uint16_t *)(mbcsTable->fromUnicodeBytes+mbcsTable->fromUBytesLength);
|
||||
mbcsTable->mbcsIndex=(const uint16_t *)
|
||||
(mbcsTable->fromUnicodeBytes+
|
||||
(noFromU ? 0 : mbcsTable->fromUBytesLength));
|
||||
mbcsTable->maxFastUChar=(((UChar)header->version[2])<<8)|0xff;
|
||||
}
|
||||
}
|
||||
@ -1261,6 +1608,16 @@ ucnv_MBCSLoad(UConverterSharedData *sharedData,
|
||||
}
|
||||
mbcsTable->asciiRoundtrips=asciiRoundtrips;
|
||||
}
|
||||
|
||||
if(noFromU) {
|
||||
uint32_t stage1Length=
|
||||
mbcsTable->unicodeMask&UCNV_HAS_SUPPLEMENTARY ?
|
||||
0x440 : 0x40;
|
||||
uint32_t stage2Length=
|
||||
(header->offsetFromUBytes-header->offsetFromUTable)/4-
|
||||
stage1Length/2;
|
||||
reconstituteData(mbcsTable, stage1Length, stage2Length, header->fullStage2Length, pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/* Set the impl pointer here so that it is set for both extension-only and base tables. */
|
||||
@ -1296,6 +1653,9 @@ ucnv_MBCSUnload(UConverterSharedData *sharedData) {
|
||||
if(mbcsTable->baseSharedData!=NULL) {
|
||||
ucnv_unload(mbcsTable->baseSharedData);
|
||||
}
|
||||
if(mbcsTable->reconstitutedData!=NULL) {
|
||||
uprv_free(mbcsTable->reconstitutedData);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@ -23,6 +23,7 @@
|
||||
|
||||
#include "unicode/ucnv.h"
|
||||
#include "ucnv_cnv.h"
|
||||
#include "ucnv_ext.h"
|
||||
|
||||
/**
|
||||
* ICU conversion (.cnv) data file structure, following the usual UDataInfo
|
||||
@ -41,6 +42,24 @@
|
||||
* the same toUnicode structures, while the fromUnicode structures for SBCS
|
||||
* differ from those for other MBCS-style converters.
|
||||
*
|
||||
* _MBCSHeader.version 5 is optional and not backward-compatible
|
||||
* (as usual for changes in the major version field).
|
||||
*
|
||||
* Versions 5.m work like versions 4.m except:
|
||||
* - The _MBCSHeader has variable length (and is always longer than in version 4).
|
||||
* See the struct _MBCSHeader further description below.
|
||||
* - There is a set of flags which indicate further incompatible changes.
|
||||
* (Reader code must reject the file if it does not recognize them all.)
|
||||
* - In particular, one of these flags indicates that most of the fromUnicode
|
||||
* data is missing and must be reconstituted from the toUnicode data
|
||||
* and from the utf8Friendly mbcsIndex at load time.
|
||||
* (This only works with a utf8Friendly table.)
|
||||
* In this case, makeconv may increase maxFastUChar automatically to U+FFFF.
|
||||
*
|
||||
* The first of these versions is 5.3, which is like 4.3 except for the differences above.
|
||||
*
|
||||
* When possible, makeconv continues to generate version 4.m files.
|
||||
*
|
||||
* _MBCSHeader.version 4.3 optionally modifies the fromUnicode data structures
|
||||
* slightly and optionally adds a table for conversion to MBCS (non-SBCS)
|
||||
* charsets.
|
||||
@ -127,6 +146,26 @@
|
||||
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
|
||||
* counts bytes in fromUBytes[]
|
||||
*
|
||||
* New and required in version 5:
|
||||
* 8 uint32_t options, bits:
|
||||
* 31..16 reserved for flags that can be added without breaking
|
||||
* backward compatibility
|
||||
* 15.. 7 reserved for flags whose addition will break
|
||||
* backward compatibility
|
||||
* 6 MBCS_OPT_NO_FROM_U -- if set,
|
||||
* then most of the fromUnicode data is omitted;
|
||||
* fullStage2Length is present and the missing
|
||||
* bottom part of stage 2 must be reconstituted from
|
||||
* the toUnicode data;
|
||||
* stage 3 is missing completely as well;
|
||||
* not used for SBCS tables
|
||||
* 5.. 0 length of the _MBCSHeader (number of uint32_t)
|
||||
*
|
||||
* New and optional in version 5:
|
||||
* 9 uint32_t fullStage2Length: used if MBCS_OPT_NO_FROM_U is set
|
||||
* specifies the full length of stage 2
|
||||
* including the omitted part
|
||||
*
|
||||
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
|
||||
* -- base table name for extension-only table
|
||||
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
|
||||
@ -153,7 +192,7 @@
|
||||
* -- BMP-only tables have a smaller stage 1 table
|
||||
* uint16_t fromUTable[0x40]; (32-bit-aligned)
|
||||
* }
|
||||
*
|
||||
*
|
||||
* -- stage 2 tables
|
||||
* length determined by top of stage 1 and bottom of stage 3 tables
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
@ -162,17 +201,24 @@
|
||||
* } else {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
|
||||
* uint32_t stage 2 flags and indexes[?];
|
||||
* if(options&MBCS_OPT_NO_FROM_U) {
|
||||
* stage 2 really has length fullStage2Length
|
||||
* and the omitted lower part must be reconstituted from
|
||||
* the toUnicode data
|
||||
* }
|
||||
* }
|
||||
*
|
||||
*
|
||||
* -- stage 3 tables with byte results
|
||||
* if(outputType==MBCS_OUTPUT_1) {
|
||||
* -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
|
||||
* uint16_t fromUBytes[fromUBytesLength/2];
|
||||
* } else {
|
||||
* } else if(!(options&MBCS_OPT_NO_FROM_U)) {
|
||||
* -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
|
||||
* uint8_t fromUBytes[fromUBytesLength]; or
|
||||
* uint16_t fromUBytes[fromUBytesLength/2]; or
|
||||
* uint32_t fromUBytes[fromUBytesLength/4];
|
||||
* } else {
|
||||
* fromUBytes[] must be reconstituted from the toUnicode data
|
||||
* }
|
||||
*
|
||||
* -- optional utf8Friendly mbcsIndex -- _MBCSHeader.version 4.3 (ICU 3.8) and higher
|
||||
@ -340,6 +386,9 @@ typedef struct UConverterMBCSTable {
|
||||
/* roundtrips */
|
||||
uint32_t asciiRoundtrips;
|
||||
|
||||
/* reconstituted data that was omitted from the .cnv file */
|
||||
uint8_t *reconstitutedData;
|
||||
|
||||
/* converter name for swaplfnl */
|
||||
char *swapLFNLName;
|
||||
|
||||
@ -348,6 +397,26 @@ typedef struct UConverterMBCSTable {
|
||||
const int32_t *extIndexes;
|
||||
} UConverterMBCSTable;
|
||||
|
||||
enum {
|
||||
MBCS_OPT_LENGTH_MASK=0x3f,
|
||||
MBCS_OPT_NO_FROM_U=0x40,
|
||||
/*
|
||||
* If any of the following options bits are set,
|
||||
* then the file must be rejected.
|
||||
*/
|
||||
MBCS_OPT_INCOMPATIBLE_MASK=0xffc0,
|
||||
/*
|
||||
* Remove bits from this mask as more options are recognized
|
||||
* by all implementations that use this constant.
|
||||
*/
|
||||
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80
|
||||
};
|
||||
|
||||
enum {
|
||||
MBCS_HEADER_V4_LENGTH=8,
|
||||
MBCS_HEADER_V5_MIN_LENGTH=9
|
||||
};
|
||||
|
||||
/**
|
||||
* MBCS data header. See data format description above.
|
||||
*/
|
||||
@ -360,6 +429,12 @@ typedef struct {
|
||||
offsetFromUBytes,
|
||||
flags,
|
||||
fromUBytesLength;
|
||||
|
||||
/* new and required in version 5 */
|
||||
uint32_t options;
|
||||
|
||||
/* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */
|
||||
uint32_t fullStage2Length; /* number of 32-bit units */
|
||||
} _MBCSHeader;
|
||||
|
||||
/*
|
||||
@ -456,23 +531,6 @@ U_CFUNC void
|
||||
ucnv_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#if 0 /* Replaced by ucnv_MBCSGetFilteredUnicodeSetForUnicode() until we implement ucnv_getUnicodeSet() with reverse fallbacks. */
|
||||
/*
|
||||
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||
* In the future, if we add support for reverse-fallback sets, this function
|
||||
* needs to be updated, and called for each initial state.
|
||||
* Does not currently handle extensions.
|
||||
* Does not empty the set first.
|
||||
*/
|
||||
U_CFUNC void
|
||||
ucnv_MBCSGetUnicodeSetForBytes(const UConverterSharedData *sharedData,
|
||||
const USetAdder *sa,
|
||||
UConverterUnicodeSet which,
|
||||
uint8_t state, int32_t lowByte, int32_t highByte,
|
||||
UErrorCode *pErrorCode);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Internal function returning a UnicodeSet for toUnicode() conversion.
|
||||
* Currently only used for ISO-2022-CN, and only handles roundtrip mappings.
|
||||
@ -487,16 +545,6 @@ ucnv_MBCSGetUnicodeSetForUnicode(const UConverterSharedData *sharedData,
|
||||
UConverterUnicodeSet which,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
typedef enum UConverterSetFilter {
|
||||
UCNV_SET_FILTER_NONE,
|
||||
UCNV_SET_FILTER_DBCS_ONLY,
|
||||
UCNV_SET_FILTER_2022_CN,
|
||||
UCNV_SET_FILTER_SJIS,
|
||||
UCNV_SET_FILTER_GR94DBCS,
|
||||
UCNV_SET_FILTER_HZ,
|
||||
UCNV_SET_FILTER_COUNT
|
||||
} UConverterSetFilter;
|
||||
|
||||
/*
|
||||
* Same as ucnv_MBCSGetUnicodeSetForUnicode() but
|
||||
* the set can be filtered by encoding scheme.
|
||||
|
2
icu4c/source/test/testdata/Makefile.in
vendored
2
icu4c/source/test/testdata/Makefile.in
vendored
@ -186,7 +186,7 @@ $(TESTBUILDDIR)/nfsmxp.spp: $(BINDIR)/gensprep$(EXEEXT) $(TESTSRCDATADIR)/nfs4_m
|
||||
$(INVOKE) $(BINDIR)/gensprep -s $(TESTSRCDATADIR) $(ICU_DATA_OPT) -d $(TESTBUILDDIR) -b nfsmxp -k -n $(UNICODEDATADIR) -u 3.2.0 nfs4_mixed_prep_p.txt
|
||||
|
||||
$(TESTBUILDDIR)/%.cnv: $(TESTSRCDATADIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)
|
||||
$(INVOKE) $(BINDIR)/makeconv -c -d $(TESTBUILDDIR) $(TESTSRCDATADIR)/$(<F)
|
||||
$(INVOKE) $(BINDIR)/makeconv --small -c -d $(TESTBUILDDIR) $(TESTSRCDATADIR)/$(<F)
|
||||
|
||||
$(TESTBUILDDIR)/%.res: $(TESTSRCDATADIR)/%.txt $(BINDIR)/genrb$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genrb $(GENRBOPTS) -q -s $(TESTSRCDATADIR) $(ICU_DATA_OPT) -d $(TESTBUILDDIR) $(<F)
|
||||
|
10
icu4c/source/test/testdata/testdata.mak
vendored
10
icu4c/source/test/testdata/testdata.mak
vendored
@ -124,21 +124,21 @@ $(TEST_RES_FILES:.res =.res
|
||||
# Targets for test converter data
|
||||
"$(TESTDATABLD)\test1.cnv": "$(TESTDATA)\test1.ucm"
|
||||
@echo Building $@
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
||||
|
||||
"$(TESTDATABLD)\test3.cnv": "$(TESTDATA)\test3.ucm"
|
||||
@echo Building $@
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
||||
|
||||
"$(TESTDATABLD)\test4.cnv": "$(TESTDATA)\test4.ucm"
|
||||
@echo Building $@
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
||||
|
||||
"$(TESTDATABLD)\test4x.cnv": "$(TESTDATA)\test4x.ucm"
|
||||
@echo Building $@
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
||||
|
||||
"$(TESTDATABLD)\ibm9027.cnv": "$(TESTDATA)\ibm9027.ucm"
|
||||
@echo Building $@
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" -d"$(TESTDATABLD)" $**
|
||||
@"$(ICUTOOLS)\makeconv\$(CFG)\makeconv" --small -d"$(TESTDATABLD)" $**
|
||||
|
||||
|
@ -130,7 +130,7 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
extData->ucm->baseName[length++]=0;
|
||||
}
|
||||
|
||||
headerSize=sizeof(header)+length;
|
||||
headerSize=MBCS_HEADER_V4_LENGTH*4+length;
|
||||
|
||||
/* fill the header */
|
||||
header.version[0]=4;
|
||||
@ -138,7 +138,7 @@ CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);
|
||||
|
||||
/* write the header and the base table name */
|
||||
udata_writeBlock(pData, &header, sizeof(header));
|
||||
udata_writeBlock(pData, &header, MBCS_HEADER_V4_LENGTH*4);
|
||||
udata_writeBlock(pData, extData->ucm->baseName, length);
|
||||
}
|
||||
|
||||
|
@ -30,7 +30,7 @@
|
||||
* Reduce tests for maxCharLength.
|
||||
*/
|
||||
|
||||
typedef struct MBCSData {
|
||||
struct MBCSData {
|
||||
NewConverter newConverter;
|
||||
|
||||
UCMFile *ucm;
|
||||
@ -48,10 +48,18 @@ typedef struct MBCSData {
|
||||
uint32_t stage2Top, stage3Top;
|
||||
|
||||
/* fromUTF8 */
|
||||
uint16_t stageUTF8[MBCS_UTF8_STAGE_SIZE];
|
||||
uint16_t stageUTF8[0x10000>>MBCS_UTF8_STAGE_SHIFT]; /* allow for utf8Max=0xffff */
|
||||
|
||||
/*
|
||||
* Maximum UTF-8-friendly code point.
|
||||
* 0 if !utf8Friendly, otherwise 0x01ff..0xffff in steps of 0x100.
|
||||
* If utf8Friendly, utf8Max is normally either MBCS_UTF8_MAX or 0xffff.
|
||||
*/
|
||||
uint16_t utf8Max;
|
||||
|
||||
UBool utf8Friendly;
|
||||
} MBCSData;
|
||||
UBool omitFromU;
|
||||
};
|
||||
|
||||
/* prototypes */
|
||||
static void
|
||||
@ -115,6 +123,29 @@ printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
|
||||
|
||||
/* implementation ----------------------------------------------------------- */
|
||||
|
||||
static MBCSData gDummy;
|
||||
|
||||
U_CFUNC const MBCSData *
|
||||
MBCSGetDummy() {
|
||||
uprv_memset(&gDummy, 0, sizeof(MBCSData));
|
||||
|
||||
/*
|
||||
* Set "pessimistic" values which may sometimes move too many
|
||||
* mappings to the extension table (but never too few).
|
||||
* These values cause MBCSOkForBaseFromUnicode() to return FALSE for the
|
||||
* largest set of mappings.
|
||||
* Assume maxCharLength>1.
|
||||
*/
|
||||
gDummy.utf8Friendly=TRUE;
|
||||
if(SMALL) {
|
||||
gDummy.utf8Max=0xffff;
|
||||
gDummy.omitFromU=TRUE;
|
||||
} else {
|
||||
gDummy.utf8Max=MBCS_UTF8_MAX;
|
||||
}
|
||||
return &gDummy;
|
||||
}
|
||||
|
||||
static void
|
||||
MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
|
||||
uprv_memset(mbcsData, 0, sizeof(MBCSData));
|
||||
@ -680,7 +711,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
||||
|
||||
/* inspect stage 1 */
|
||||
index=c>>MBCS_STAGE_1_SHIFT;
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK&~(MBCS_UTF8_STAGE_3_BLOCKS-1);
|
||||
} else {
|
||||
nextOffset=(c>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK;
|
||||
@ -716,7 +747,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
||||
|
||||
/* inspect stage 2 */
|
||||
index=mbcsData->stage1[index]+nextOffset;
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
|
||||
/* allocate 64-entry blocks for UTF-8-friendly lookup */
|
||||
blockSize=MBCS_UTF8_STAGE_3_BLOCK_SIZE*maxCharLength;
|
||||
nextOffset=c&MBCS_UTF8_STAGE_3_BLOCK_MASK;
|
||||
@ -761,12 +792,12 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
||||
stage3Index=MBCS_STAGE_3_GRANULARITY*(uint32_t)(uint16_t)mbcsData->stage2[index];
|
||||
|
||||
/* Build an alternate, UTF-8-friendly stage table as well. */
|
||||
if(mbcsData->utf8Friendly && c<=MBCS_UTF8_MAX) {
|
||||
if(mbcsData->utf8Friendly && c<=mbcsData->utf8Max) {
|
||||
/* Overflow for uint16_t entries in stageUTF8? */
|
||||
if(stage3Index>0xffff) {
|
||||
/*
|
||||
* This can occur only if the mapping table is nearly perfectly filled and if
|
||||
* MBCS_UTF8_MAX==0xffff.
|
||||
* utf8Max==0xffff.
|
||||
* (There is no known charset like this. GB 18030 does not map
|
||||
* surrogate code points and LMBCS does not map 256 PUA code points.)
|
||||
*
|
||||
@ -776,20 +807,20 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
||||
* mappings with 0<=c<MBCS_UTF8_LIMIT, and there is only also
|
||||
* the initial all-unassigned block in stage3.
|
||||
*
|
||||
* Solution for the overflow: Reduce utf8Max to the next lower value, 0xfeff.
|
||||
*
|
||||
* (See svn revision 20866 of the markus/ucnvutf8 feature branch for
|
||||
* code that causes MBCSAddTable() to rebuild the table not utf8Friendly
|
||||
* in case of overflow. That code was not tested.)
|
||||
*/
|
||||
fprintf(stderr, "too many stage 3 entries for UTF-8-friendly format, processing U+%04x<->0x%s\n",
|
||||
(int)c, printBytes(buffer, bytes, length));
|
||||
return FALSE;
|
||||
mbcsData->utf8Max=0xfeff;
|
||||
} else {
|
||||
/*
|
||||
* The stage 3 block has been assigned for the regular trie.
|
||||
* Just copy its index into stageUTF8[], without the granularity.
|
||||
*/
|
||||
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
|
||||
}
|
||||
|
||||
/*
|
||||
* The stage 3 block has been assigned for the regular trie.
|
||||
* Just copy its index into stageUTF8[], without the granularity.
|
||||
*/
|
||||
mbcsData->stageUTF8[c>>MBCS_UTF8_STAGE_SHIFT]=(uint16_t)stage3Index;
|
||||
}
|
||||
|
||||
/* write the codepage bytes into stage 3 and get the previous bytes */
|
||||
@ -856,7 +887,7 @@ MBCSAddFromUnicode(MBCSData *mbcsData,
|
||||
}
|
||||
|
||||
U_CFUNC UBool
|
||||
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
|
||||
MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, int8_t flag) {
|
||||
/*
|
||||
@ -883,7 +914,16 @@ MBCSOkForBaseFromUnicode(UBool utf8Friendly,
|
||||
* - any mapping to 0x00 (result value 0, indistinguishable from unmappable entry)
|
||||
* - any |1 fallback (no roundtrip flags in the optimized table)
|
||||
*/
|
||||
if(utf8Friendly && flag<=1 && c<=MBCS_UTF8_MAX && (bytes[0]==0 || flag==1)) {
|
||||
if(mbcsData->utf8Friendly && flag<=1 && c<=mbcsData->utf8Max && (bytes[0]==0 || flag==1)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we omit the fromUnicode data, we can only store roundtrips there
|
||||
* because only they are recoverable from the toUnicode data.
|
||||
* Fallbacks must go into the extension table.
|
||||
*/
|
||||
if(mbcsData->omitFromU && flag!=0) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
@ -918,6 +958,18 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
* indicators are used.
|
||||
*/
|
||||
mbcsData->utf8Friendly=utf8Friendly=(UBool)((table->flagsType&UCM_FLAGS_EXPLICIT)!=0);
|
||||
if(utf8Friendly) {
|
||||
mbcsData->utf8Max=MBCS_UTF8_MAX;
|
||||
if(SMALL && maxCharLength>1) {
|
||||
mbcsData->omitFromU=TRUE;
|
||||
}
|
||||
} else {
|
||||
mbcsData->utf8Max=0;
|
||||
if(SMALL && maxCharLength>1) {
|
||||
fprintf(stderr,
|
||||
"makeconv warning: --small not available for .ucm files without |0 etc.\n");
|
||||
}
|
||||
}
|
||||
|
||||
if(!MBCSStartMappings(mbcsData)) {
|
||||
return FALSE;
|
||||
@ -933,6 +985,28 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
c=m->u;
|
||||
f=m->f;
|
||||
|
||||
/*
|
||||
* Small optimization for --small .cnv files:
|
||||
*
|
||||
* If there are fromUnicode mappings above MBCS_UTF8_MAX,
|
||||
* then the file size will be smaller if we make utf8Max larger
|
||||
* because the size increase in stageUTF8 will be more than balanced by
|
||||
* how much less of stage2 needs to be stored.
|
||||
*
|
||||
* There is no point in doing this incrementally because stageUTF8
|
||||
* uses so much less space per block than stage2,
|
||||
* so we immediately increase utf8Max to 0xffff.
|
||||
*
|
||||
* Do not increase utf8Max if it is already at 0xfeff because MBCSAddFromUnicode()
|
||||
* sets it to that value when stageUTF8 overflows.
|
||||
*/
|
||||
if( mbcsData->omitFromU && f<=1 &&
|
||||
mbcsData->utf8Max<c && c<=0xffff &&
|
||||
mbcsData->utf8Max<0xfeff
|
||||
) {
|
||||
mbcsData->utf8Max=0xffff;
|
||||
}
|
||||
|
||||
switch(f) {
|
||||
case -1:
|
||||
/* there was no precision/fallback indicator */
|
||||
@ -943,7 +1017,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
|
||||
if(maxCharLength==1) {
|
||||
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
} else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
|
||||
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
@ -955,7 +1029,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
if(maxCharLength==1) {
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
isOK&=MBCSSingleAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else if(MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
} else if(MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f)) {
|
||||
staticData->hasFromUnicodeFallback=TRUE;
|
||||
isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
} else {
|
||||
@ -965,7 +1039,7 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
break;
|
||||
case 2:
|
||||
/* ignore |2 SUB mappings, except to move <subchar1> mappings to the extension table */
|
||||
if(maxCharLength>1 && !MBCSOkForBaseFromUnicode(utf8Friendly, m->b.bytes, m->bLen, c, f)) {
|
||||
if(maxCharLength>1 && m->bLen==1) {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
}
|
||||
@ -1329,24 +1403,56 @@ static uint32_t
|
||||
MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
UNewDataMemory *pData, int32_t tableType) {
|
||||
MBCSData *mbcsData=(MBCSData *)cnvData;
|
||||
uint32_t stage2Start, stage2Length;
|
||||
uint32_t top, stageUTF8Length=0;
|
||||
int32_t i, stage1Top;
|
||||
uint32_t headerLength;
|
||||
|
||||
_MBCSHeader header={ { 0, 0, 0, 0 }, 0, 0, 0, 0, 0, 0, 0 };
|
||||
|
||||
stage2Length=mbcsData->stage2Top;
|
||||
if(mbcsData->omitFromU) {
|
||||
/* find how much of stage2 can be omitted */
|
||||
int32_t utf8Limit=(int32_t)mbcsData->utf8Max+1;
|
||||
uint32_t st2;
|
||||
|
||||
i=utf8Limit>>MBCS_STAGE_1_SHIFT;
|
||||
if((utf8Limit&((1<<MBCS_STAGE_1_SHIFT)-1))!=0 && (st2=mbcsData->stage1[i])!=0) {
|
||||
/* utf8Limit is in the middle of an existing stage 2 block */
|
||||
stage2Start=st2+((utf8Limit>>MBCS_STAGE_2_SHIFT)&MBCS_STAGE_2_BLOCK_MASK);
|
||||
} else {
|
||||
/* find the last stage2 block with mappings before utf8Limit */
|
||||
while(i>0 && (st2=mbcsData->stage1[--i])==0) {}
|
||||
/* stage2 up to the end of this block corresponds to stageUTF8 */
|
||||
stage2Start=st2+MBCS_STAGE_2_BLOCK_SIZE;
|
||||
}
|
||||
header.options|=MBCS_OPT_NO_FROM_U;
|
||||
header.fullStage2Length=stage2Length;
|
||||
stage2Length-=stage2Start;
|
||||
if(VERBOSE) {
|
||||
printf("+ omitting %lu out of %lu stage2 entries and %lu fromUBytes\n",
|
||||
stage2Start, mbcsData->stage2Top, mbcsData->stage3Top);
|
||||
printf("+ total size savings: %lu bytes\n", stage2Start*4+mbcsData->stage3Top);
|
||||
}
|
||||
} else {
|
||||
stage2Start=0;
|
||||
}
|
||||
|
||||
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
|
||||
} else {
|
||||
stage1Top=0x40; /* 0x40==64 */
|
||||
}
|
||||
|
||||
/* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
|
||||
if(mbcsData->ucm->states.maxCharLength==1) {
|
||||
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
|
||||
} else {
|
||||
stage1Top=0x40; /* 0x40==64 */
|
||||
}
|
||||
for(i=0; i<stage1Top; ++i) {
|
||||
mbcsData->stage1[i]+=(uint16_t)stage1Top;
|
||||
}
|
||||
|
||||
/* stage2Top has counted 16-bit results, now we need to count bytes */
|
||||
mbcsData->stage2Top*=2;
|
||||
/* stage2Top/Length have counted 16-bit results, now we need to count bytes */
|
||||
/* also round up to a multiple of 4 bytes */
|
||||
stage2Length=(stage2Length*2+1)&~1;
|
||||
|
||||
/* stage3Top has counted 16-bit results, now we need to count bytes */
|
||||
mbcsData->stage3Top*=2;
|
||||
@ -1355,40 +1461,47 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
header.version[2]=(uint8_t)(SBCS_UTF8_MAX>>8); /* store 0x1f for max==0x1fff */
|
||||
}
|
||||
} else {
|
||||
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
|
||||
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
|
||||
} else {
|
||||
stage1Top=0x40; /* 0x40==64 */
|
||||
}
|
||||
for(i=0; i<stage1Top; ++i) {
|
||||
mbcsData->stage1[i]+=(uint16_t)stage1Top/2; /* stage 2 contains 32-bit entries, stage 1 16-bit entries */
|
||||
}
|
||||
|
||||
/* stage2Top has counted 32-bit results, now we need to count bytes */
|
||||
mbcsData->stage2Top*=4;
|
||||
/* stage2Top/Length have counted 32-bit results, now we need to count bytes */
|
||||
stage2Length*=4;
|
||||
/* leave stage2Start counting 32-bit units */
|
||||
|
||||
if(mbcsData->utf8Friendly) {
|
||||
stageUTF8Length=MBCS_UTF8_STAGE_SIZE;
|
||||
header.version[2]=(uint8_t)(MBCS_UTF8_MAX>>8); /* store 0xd7 for max==0xd7ff */
|
||||
stageUTF8Length=(mbcsData->utf8Max+1)>>MBCS_UTF8_STAGE_SHIFT;
|
||||
header.version[2]=(uint8_t)(mbcsData->utf8Max>>8); /* store 0xd7 for max==0xd7ff */
|
||||
}
|
||||
|
||||
/* stage3Top has already counted bytes */
|
||||
}
|
||||
|
||||
/* round up stage2Top and stage3Top so that the sizes of all data blocks are multiples of 4 */
|
||||
mbcsData->stage2Top=(mbcsData->stage2Top+3)&~3;
|
||||
/* round up stage3Top so that the sizes of all data blocks are multiples of 4 */
|
||||
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
|
||||
|
||||
/* fill the header */
|
||||
header.version[0]=4;
|
||||
if(header.options&MBCS_OPT_INCOMPATIBLE_MASK) {
|
||||
header.version[0]=5;
|
||||
if(header.options&MBCS_OPT_NO_FROM_U) {
|
||||
headerLength=10; /* include fullStage2Length */
|
||||
} else {
|
||||
headerLength=MBCS_HEADER_V5_MIN_LENGTH; /* 9 */
|
||||
}
|
||||
} else {
|
||||
header.version[0]=4;
|
||||
headerLength=MBCS_HEADER_V4_LENGTH; /* 8 */
|
||||
}
|
||||
header.version[1]=3;
|
||||
/* header.version[2] set above for utf8Friendly data */
|
||||
|
||||
header.options|=(uint32_t)headerLength;
|
||||
|
||||
header.countStates=mbcsData->ucm->states.countStates;
|
||||
header.countToUFallbacks=mbcsData->countToUFallbacks;
|
||||
|
||||
header.offsetToUCodeUnits=
|
||||
sizeof(_MBCSHeader)+
|
||||
headerLength*4+
|
||||
mbcsData->ucm->states.countStates*1024+
|
||||
mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
|
||||
header.offsetFromUTable=
|
||||
@ -1397,10 +1510,13 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
header.offsetFromUBytes=
|
||||
header.offsetFromUTable+
|
||||
stage1Top*2+
|
||||
mbcsData->stage2Top;
|
||||
stage2Length;
|
||||
header.fromUBytesLength=mbcsData->stage3Top;
|
||||
|
||||
top=header.offsetFromUBytes+header.fromUBytesLength+stageUTF8Length*2;
|
||||
top=header.offsetFromUBytes+stageUTF8Length*2;
|
||||
if(!(header.options&MBCS_OPT_NO_FROM_U)) {
|
||||
top+=header.fromUBytesLength;
|
||||
}
|
||||
|
||||
header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
|
||||
|
||||
@ -1414,17 +1530,19 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
|
||||
}
|
||||
|
||||
/* write the MBCS data */
|
||||
udata_writeBlock(pData, &header, sizeof(_MBCSHeader));
|
||||
udata_writeBlock(pData, &header, headerLength*4);
|
||||
udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
|
||||
udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
|
||||
udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
|
||||
udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
|
||||
if(mbcsData->ucm->states.maxCharLength==1) {
|
||||
udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
|
||||
udata_writeBlock(pData, mbcsData->stage2Single+stage2Start, stage2Length);
|
||||
} else {
|
||||
udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
|
||||
udata_writeBlock(pData, mbcsData->stage2+stage2Start, stage2Length);
|
||||
}
|
||||
if(!(header.options&MBCS_OPT_NO_FROM_U)) {
|
||||
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
|
||||
}
|
||||
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
|
||||
|
||||
if(stageUTF8Length>0) {
|
||||
udata_writeBlock(pData, mbcsData->stageUTF8, stageUTF8Length*2);
|
||||
|
@ -101,9 +101,20 @@ enum {
|
||||
U_CFUNC NewConverter *
|
||||
MBCSOpen(UCMFile *ucm);
|
||||
|
||||
struct MBCSData;
|
||||
typedef struct MBCSData MBCSData;
|
||||
|
||||
/*
|
||||
* Get a dummy MBCSData for use with MBCSOkForBaseFromUnicode()
|
||||
* for creating an extension-only file.
|
||||
* Assume maxCharLength>1.
|
||||
*/
|
||||
U_CFUNC const MBCSData *
|
||||
MBCSGetDummy();
|
||||
|
||||
/* Test if a 1:1 mapping fits into the MBCS base table's fromUnicode structure. */
|
||||
U_CFUNC UBool
|
||||
MBCSOkForBaseFromUnicode(UBool utf8Friendly,
|
||||
MBCSOkForBaseFromUnicode(const MBCSData *mbcsData,
|
||||
const uint8_t *bytes, int32_t length,
|
||||
UChar32 c, int8_t flag);
|
||||
|
||||
|
@ -34,6 +34,8 @@
|
||||
#include "makeconv.h"
|
||||
#include "genmbcs.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
#define DEBUG 0
|
||||
|
||||
typedef struct ConvData {
|
||||
@ -76,6 +78,7 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP
|
||||
* Global - verbosity
|
||||
*/
|
||||
UBool VERBOSE = FALSE;
|
||||
UBool SMALL = FALSE;
|
||||
|
||||
static void
|
||||
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
|
||||
@ -163,13 +166,25 @@ writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErr
|
||||
}
|
||||
}
|
||||
|
||||
enum {
|
||||
OPT_HELP_H,
|
||||
OPT_HELP_QUESTION_MARK,
|
||||
OPT_COPYRIGHT,
|
||||
OPT_VERSION,
|
||||
OPT_DESTDIR,
|
||||
OPT_VERBOSE,
|
||||
OPT_SMALL,
|
||||
OPT_COUNT
|
||||
};
|
||||
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H, /* 0 Numbers for those who*/
|
||||
UOPTION_HELP_QUESTION_MARK, /* 1 can't count. */
|
||||
UOPTION_COPYRIGHT, /* 2 */
|
||||
UOPTION_VERSION, /* 3 */
|
||||
UOPTION_DESTDIR, /* 4 */
|
||||
UOPTION_VERBOSE, /* 5 */
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_COPYRIGHT,
|
||||
UOPTION_VERSION,
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_VERBOSE,
|
||||
{ "small", NULL, NULL, NULL, '\1', UOPT_NO_ARG, 0 }
|
||||
};
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
@ -194,8 +209,8 @@ int main(int argc, char* argv[])
|
||||
uprv_memcpy(&dataInfo.dataVersion, &icuVersion, sizeof(UVersionInfo));
|
||||
|
||||
/* preset then read command line options */
|
||||
options[4].value=u_getDataDirectory();
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
options[OPT_DESTDIR].value=u_getDataDirectory();
|
||||
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
@ -205,8 +220,9 @@ int main(int argc, char* argv[])
|
||||
} else if(argc<2) {
|
||||
argc=-1;
|
||||
}
|
||||
if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
|
||||
fprintf(stderr,
|
||||
if(argc<0 || options[OPT_HELP_H].doesOccur || options[OPT_HELP_QUESTION_MARK].doesOccur) {
|
||||
FILE *stdfile=argc<0 ? stderr : stdout;
|
||||
fprintf(stdfile,
|
||||
"usage: %s [-options] files...\n"
|
||||
"\tread .ucm codepage mapping files and write .cnv files\n"
|
||||
"options:\n"
|
||||
@ -216,20 +232,26 @@ int main(int argc, char* argv[])
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t-v or --verbose Turn on verbose output\n",
|
||||
argv[0]);
|
||||
fprintf(stdfile,
|
||||
"\t --small Generate smaller .cnv files. They will be\n"
|
||||
"\t significantly smaller but may not be compatible with\n"
|
||||
"\t older versions of ICU and will require heap memory\n"
|
||||
"\t allocation when loaded.\n");
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
if(options[3].doesOccur) {
|
||||
fprintf(stderr,"makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
|
||||
dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
|
||||
fprintf(stderr, U_COPYRIGHT_STRING "\n");
|
||||
if(options[OPT_VERSION].doesOccur) {
|
||||
printf("makeconv version %hu.%hu, ICU tool to read .ucm codepage mapping files and write .cnv files\n",
|
||||
dataInfo.formatVersion[0], dataInfo.formatVersion[1]);
|
||||
printf("%s\n", U_COPYRIGHT_STRING);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
haveCopyright = options[2].doesOccur;
|
||||
destdir = options[4].value;
|
||||
VERBOSE = options[5].doesOccur;
|
||||
haveCopyright = options[OPT_COPYRIGHT].doesOccur;
|
||||
destdir = options[OPT_DESTDIR].value;
|
||||
VERBOSE = options[OPT_VERBOSE].doesOccur;
|
||||
SMALL = options[OPT_SMALL].doesOccur;
|
||||
|
||||
if (destdir != NULL && *destdir != 0) {
|
||||
uprv_strcpy(outFileName, destdir);
|
||||
@ -766,12 +788,13 @@ createConverter(ConvData *data, const char *converterName, UErrorCode *pErrorCod
|
||||
*
|
||||
* Do this after ucm_checkBaseExt().
|
||||
*/
|
||||
const MBCSData *mbcsData=MBCSGetDummy();
|
||||
int32_t needsMove=0;
|
||||
for(m=baseData.ucm->base->mappings, mLimit=m+baseData.ucm->base->mappingsLength;
|
||||
m<mLimit;
|
||||
++m
|
||||
) {
|
||||
if(!MBCSOkForBaseFromUnicode(TRUE, m->b.bytes, m->bLen, m->u, m->f)) {
|
||||
if(!MBCSOkForBaseFromUnicode(mbcsData, m->b.bytes, m->bLen, m->u, m->f)) {
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
++needsMove;
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2006, International Business Machines
|
||||
* Copyright (C) 2000-2007, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -24,6 +24,7 @@
|
||||
|
||||
/* exports from makeconv.c */
|
||||
U_CFUNC UBool VERBOSE;
|
||||
U_CFUNC UBool SMALL;
|
||||
|
||||
/* converter table type for writing */
|
||||
enum {
|
||||
|
@ -497,7 +497,7 @@ ucnv_enumDependencies(const UDataSwapper *ds,
|
||||
/* check for supported conversionType values */
|
||||
if(inStaticData->conversionType==UCNV_MBCS) {
|
||||
/* MBCS data */
|
||||
uint32_t mbcsHeaderFlags;
|
||||
uint32_t mbcsHeaderLength, mbcsHeaderFlags, mbcsHeaderOptions;
|
||||
int32_t extOffset;
|
||||
|
||||
inMBCSHeader=(const _MBCSHeader *)inBytes;
|
||||
@ -508,7 +508,14 @@ ucnv_enumDependencies(const UDataSwapper *ds,
|
||||
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
if(!(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1)) {
|
||||
if(inMBCSHeader->version[0]==4 && inMBCSHeader->version[1]>=1) {
|
||||
mbcsHeaderLength=MBCS_HEADER_V4_LENGTH;
|
||||
} else if(inMBCSHeader->version[0]==5 && inMBCSHeader->version[1]>=3 &&
|
||||
((mbcsHeaderOptions=ds->readUInt32(inMBCSHeader->options))&
|
||||
MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK)==0
|
||||
) {
|
||||
mbcsHeaderLength=mbcsHeaderOptions&MBCS_OPT_LENGTH_MASK;
|
||||
} else {
|
||||
udata_printError(ds, "icupkg/ucnv_enumDependencies(): unsupported _MBCSHeader.version %d.%d\n",
|
||||
inMBCSHeader->version[0], inMBCSHeader->version[1]);
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
@ -536,14 +543,15 @@ ucnv_enumDependencies(const UDataSwapper *ds,
|
||||
}
|
||||
|
||||
/* swap the base name, between the header and the extension data */
|
||||
baseNameLength=(int32_t)strlen((const char *)(inMBCSHeader+1));
|
||||
const char *inBaseName=(const char *)inBytes+mbcsHeaderLength*4;
|
||||
baseNameLength=(int32_t)strlen(inBaseName);
|
||||
if(baseNameLength>=(int32_t)sizeof(baseName)) {
|
||||
udata_printError(ds, "icupkg/ucnv_enumDependencies(%s): base name length %ld too long\n",
|
||||
itemName, baseNameLength);
|
||||
*pErrorCode=U_UNSUPPORTED_ERROR;
|
||||
return;
|
||||
}
|
||||
ds->swapInvChars(ds, inMBCSHeader+1, baseNameLength+1, baseName, pErrorCode);
|
||||
ds->swapInvChars(ds, inBaseName, baseNameLength+1, baseName, pErrorCode);
|
||||
|
||||
checkIDSuffix(itemName, baseName, -1, ".cnv", check, context, pErrorCode);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user