ICU-2404 first code for m:n conversion extensions

X-SVN-Rev: 13490
This commit is contained in:
Markus Scherer 2003-10-25 00:29:13 +00:00
parent ff0a9c0244
commit cea34629f2
35 changed files with 6193 additions and 2125 deletions

View File

@ -61,7 +61,7 @@ OBJECTS = putil.o uobject.o cmemory.o umutex.o \
udata.o ucmndata.o udatamem.o udataswp.o umapfile.o ucol_swp.o \
uresbund.o uresdata.o resbund.o ucat.o locmap.o uloc.o locid.o \
uhash.o uhash_us.o \
ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_io.o ucnvlat1.o \
ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_ext.o ucnv_io.o ucnvlat1.o \
ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o \
unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \

View File

@ -1347,6 +1347,14 @@ InputPath=.\unicode\ucnv_err.h
# End Source File
# Begin Source File
SOURCE=.\ucnv_ext.c
# End Source File
# Begin Source File
SOURCE=.\ucnv_ext.h
# End Source File
# Begin Source File
SOURCE=.\ucnv_imp.h
# End Source File
# Begin Source File

View File

@ -730,6 +730,12 @@
Outputs="..\..\include\unicode\$(InputName).h"/>
</FileConfiguration>
</File>
<File
RelativePath=".\ucnv_ext.c">
</File>
<File
RelativePath=".\ucnv_ext.h">
</File>
<File
RelativePath=".\ucnv_imp.h">
</File>

View File

@ -608,11 +608,14 @@ static void _reset(UConverter *converter, UConverterResetChoice choice,
converter->mode = 0;
converter->toULength = 0;
converter->invalidCharLength = converter->UCharErrorBufferLength = 0;
converter->preToULength = 0;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
converter->fromUnicodeStatus = 0;
converter->fromUChar32 = 0;
converter->invalidUCharLength = converter->charErrorBufferLength = 0;
converter->preFromUFirstCP = U_SENTINEL;
converter->preFromULength = 0;
}
if (converter->sharedData->impl->reset != NULL) {
@ -811,6 +814,28 @@ _updateOffsets(int32_t *offsets, int32_t length,
/* ucnv_fromUnicode --------------------------------------------------------- */
/*
* Implementation note for m:n conversions
*
* While collecting source units to find the longest match for m:n conversion,
* some source units may need to be stored for a partial match.
* When a second buffer does not yield a match on all of the previously stored
* source units, then they must be "replayed", i.e., fed back into the converter.
*
* The code relies on the fact that replaying will not nest -
* converting a replay buffer will not result in a replay.
* This is because a replay is necessary only after the _continuation_ of a
* partial match failed, but a replay buffer is converted as a whole.
* It may result in some of its units being stored again for a partial match,
* but there will not be a continuation _during_ the replay which could fail.
*
* It is conceivable that a callback function could call the converter
* recursively in a way that causes another replay to be stored, but that
* would be an error in the callback function.
* Such violations will cause assertion failures in a debug build,
* and wrong output, but they will not cause a crash.
*/
static void
_fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
UConverterFromUnicode fromUnicode;
@ -822,6 +847,12 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
int32_t errorInputLength;
UBool converterSawEndOfInput, calledCallback;
/* variables for m:n conversion */
UChar replay[UCNV_EXT_MAX_UCHARS];
const UChar *realSource, *realSourceLimit;
int32_t realSourceIndex;
UBool realFlush;
cnv=pArgs->converter;
s=pArgs->source;
t=pArgs->target;
@ -841,6 +872,29 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
}
}
if(cnv->preFromULength>=0) {
/* normal mode */
realSource=NULL;
} else {
/*
* Previous m:n conversion stored source units from a partial match
* and failed to consume all of them.
* We need to "replay" them from a temporary buffer and convert them first.
*/
realSource=pArgs->source;
realSourceLimit=pArgs->sourceLimit;
realFlush=pArgs->flush;
realSourceIndex=sourceIndex;
uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
pArgs->source=replay;
pArgs->sourceLimit=replay-cnv->preFromULength;
pArgs->flush=FALSE;
sourceIndex=-1;
cnv->preFromULength=0;
}
/*
* loop for conversion and error handling
*
@ -897,8 +951,37 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
pArgs->offsets=offsets+=length;
}
if(sourceIndex>=0) {
sourceIndex+=(int32_t)(pArgs->source-s);
}
}
if(cnv->preFromULength<0) {
/*
* switch the source to new replay units (cannot occur while replaying)
* after offset handling and before end-of-input and callback handling
*/
if(realSource==NULL) {
realSource=pArgs->source;
realSourceLimit=pArgs->sourceLimit;
realFlush=pArgs->flush;
realSourceIndex=sourceIndex;
uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
pArgs->source=replay;
pArgs->sourceLimit=replay-cnv->preFromULength;
pArgs->flush=FALSE;
if((sourceIndex+=cnv->preFromULength)<0) {
sourceIndex=-1;
}
cnv->preFromULength=0;
} else {
/* see implementation note before _fromUnicodeWithCallback() */
U_ASSERT(realSource==NULL);
*err=U_INTERNAL_PROGRAM_ERROR;
}
}
/* update pointers */
s=pArgs->source;
@ -911,6 +994,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* (continue converting by breaking out of only the inner loop)
*/
break;
} else if(realSource!=NULL) {
/* switch back from replaying to the real source and continue */
pArgs->source=realSource;
pArgs->sourceLimit=realSourceLimit;
pArgs->flush=realFlush;
sourceIndex=realSourceIndex;
realSource=NULL;
break;
} else if(pArgs->flush && cnv->fromUChar32!=0) {
/*
* the entire input stream is consumed
@ -960,7 +1052,27 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* the check for buffer overflow is redundant but it is
* a high-runner case and hopefully documents the intent
* well
*
* if we were replaying, then the replay buffer must be
* copied back into the UConverter
* and the real arguments must be restored
*/
if(realSource!=NULL) {
int32_t length;
U_ASSERT(cnv->preFromULength==0);
length=(int32_t)(pArgs->sourceLimit-pArgs->source);
if(length>0) {
uprv_memcpy(cnv->preFromU, pArgs->source, length*U_SIZEOF_UCHAR);
cnv->preFromULength=(int8_t)-length;
}
pArgs->source=realSource;
pArgs->sourceLimit=realSourceLimit;
pArgs->flush=realFlush;
}
return;
}
}
@ -1079,7 +1191,7 @@ ucnv_fromUnicode(UConverter *cnv,
cnv->charErrorBufferLength=0;
}
if(!flush && s==sourceLimit) {
if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;
@ -1122,6 +1234,12 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
int32_t errorInputLength;
UBool converterSawEndOfInput, calledCallback;
/* variables for m:n conversion */
char replay[UCNV_EXT_MAX_BYTES];
const char *realSource, *realSourceLimit;
int32_t realSourceIndex;
UBool realFlush;
cnv=pArgs->converter;
s=pArgs->source;
t=pArgs->target;
@ -1141,6 +1259,29 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
}
}
if(cnv->preToULength>=0) {
/* normal mode */
realSource=NULL;
} else {
/*
* Previous m:n conversion stored source units from a partial match
* and failed to consume all of them.
* We need to "replay" them from a temporary buffer and convert them first.
*/
realSource=pArgs->source;
realSourceLimit=pArgs->sourceLimit;
realFlush=pArgs->flush;
realSourceIndex=sourceIndex;
uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
pArgs->source=replay;
pArgs->sourceLimit=replay-cnv->preToULength;
pArgs->flush=FALSE;
sourceIndex=-1;
cnv->preToULength=0;
}
/*
* loop for conversion and error handling
*
@ -1202,8 +1343,37 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
pArgs->offsets=offsets+=length;
}
if(sourceIndex>=0) {
sourceIndex+=(int32_t)(pArgs->source-s);
}
}
if(cnv->preToULength<0) {
/*
* switch the source to new replay units (cannot occur while replaying)
* after offset handling and before end-of-input and callback handling
*/
if(realSource==NULL) {
realSource=pArgs->source;
realSourceLimit=pArgs->sourceLimit;
realFlush=pArgs->flush;
realSourceIndex=sourceIndex;
uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
pArgs->source=replay;
pArgs->sourceLimit=replay-cnv->preToULength;
pArgs->flush=FALSE;
if((sourceIndex+=cnv->preToULength)<0) {
sourceIndex=-1;
}
cnv->preToULength=0;
} else {
/* see implementation note before _fromUnicodeWithCallback() */
U_ASSERT(realSource==NULL);
*err=U_INTERNAL_PROGRAM_ERROR;
}
}
/* update pointers */
s=pArgs->source;
@ -1216,6 +1386,15 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
* (continue converting by breaking out of only the inner loop)
*/
break;
} else if(realSource!=NULL) {
/* switch back from replaying to the real source and continue */
pArgs->source=realSource;
pArgs->sourceLimit=realSourceLimit;
pArgs->flush=realFlush;
sourceIndex=realSourceIndex;
realSource=NULL;
break;
} else if(pArgs->flush && cnv->toULength>0) {
/*
* the entire input stream is consumed
@ -1265,7 +1444,27 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
* the check for buffer overflow is redundant but it is
* a high-runner case and hopefully documents the intent
* well
*
* if we were replaying, then the replay buffer must be
* copied back into the UConverter
* and the real arguments must be restored
*/
if(realSource!=NULL) {
int32_t length;
U_ASSERT(cnv->preToULength==0);
length=(int32_t)(pArgs->sourceLimit-pArgs->source);
if(length>0) {
uprv_memcpy(cnv->preToU, pArgs->source, length);
cnv->preToULength=(int8_t)-length;
}
pArgs->source=realSource;
pArgs->sourceLimit=realSourceLimit;
pArgs->flush=realFlush;
}
return;
}
}
@ -1379,7 +1578,7 @@ ucnv_toUnicode(UConverter *cnv,
cnv->UCharErrorBufferLength=0;
}
if(!flush && s==sourceLimit) {
if(!flush && s==sourceLimit && cnv->preToULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;

View File

@ -776,6 +776,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1;
myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen;
uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, myUConverter->subCharLen);
myUConverter->preFromUFirstCP = U_SENTINEL;
if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) {
myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err);

View File

@ -20,6 +20,7 @@
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
#include "ucnv_ext.h"
#include "udataswp.h"
/* size of the overflow buffers in UConverter, enough for escaping callbacks */
@ -168,12 +169,22 @@ struct UConverter {
int8_t UCharErrorBufferLength; /* number of valid UChars in charErrorBuffer */
uint8_t subChar1; /* single-byte substitution character if different from subChar */
UBool useSubChar1;
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */
char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */
uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */
UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */
UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */
/* fields for conversion extension */
/* store previous UChars/chars to continue partial matches */
UChar32 preFromUFirstCP; /* >=0: partial match */
UChar preFromU[UCNV_EXT_MAX_UCHARS];
char preToU[UCNV_EXT_MAX_BYTES];
int8_t preFromULength, preToULength; /* negative: replay */
int8_t preToUFirstLength; /* length of first character */
};
U_CDECL_END /* end of UConverter */

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2000-2001, International Business Machines
* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv_cb.c:
@ -35,50 +35,16 @@ ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
int32_t togo;
int8_t toerr;
int32_t i;
if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
{
uprv_memcpy(args->target, source, length);
args->target += length;
if(args->offsets) /* set all the offsets to the same # */
{
for(i=0;i<length;i++)
{
*(args->offsets++) = offsetIndex;
}
}
}
else
{
togo = (int32_t)(args->targetLimit - args->target);
uprv_memcpy(args->target, source, togo);
args->target += togo;
if(args->offsets)
{
for(i=0;i<togo;i++)
{
*(args->offsets++) = offsetIndex;
}
if(U_FAILURE(*err)) {
return;
}
/* Now, copy the remainder into the errbuff */
source += togo;
toerr = (int8_t)(length - togo);
uprv_memcpy(args->converter->charErrorBuffer +
args->converter->charErrorBufferLength,
source,
toerr * sizeof(source[0]));
args->converter->charErrorBufferLength += toerr;
*err = U_BUFFER_OVERFLOW_ERROR;
}
ucnv_fromUWriteBytes(
args->converter,
source, length,
&args->target, args->targetLimit,
&args->offsets, offsetIndex,
err);
}
U_CAPI void U_EXPORT2
@ -232,55 +198,16 @@ ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
int32_t togo;
int8_t toerr;
int32_t i;
if(U_FAILURE(*err))
{
if(U_FAILURE(*err)) {
return;
}
if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
{
uprv_memcpy(args->target, source, length * sizeof(args->target[0]) );
args->target += length;
if(args->offsets) /* set all the offsets to the same # */
{
for(i=0;i<length;i++)
{
*(args->offsets++) = offsetIndex;
}
}
}
else
{
togo = (int32_t)(args->targetLimit - args->target);
uprv_memcpy(args->target, source, togo * sizeof(args->target[0]) );
args->target += togo;
if(args->offsets)
{
for(i=0;i<togo;i++)
{
*(args->offsets++) = offsetIndex;
}
}
/* Now, copy the remainder into the errbuff */
source += togo;
toerr = (int8_t)(length - togo);
uprv_memcpy(args->converter->UCharErrorBuffer +
args->converter->UCharErrorBufferLength,
source,
toerr * sizeof(source[0]));
args->converter->UCharErrorBufferLength += toerr;
*err = U_BUFFER_OVERFLOW_ERROR;
}
ucnv_toUWriteUChars(
args->converter,
source, length,
&args->target, args->targetLimit,
&args->offsets, offsetIndex,
err);
}
U_CAPI void U_EXPORT2

View File

@ -79,6 +79,46 @@ ucnv_fromUWriteBytes(UConverter *cnv,
}
}
/*
 * Write a UChar string to the target buffer, optionally recording one
 * source index per UChar written.
 * UChars that do not fit are stored in cnv->UCharErrorBuffer (if cnv!=NULL)
 * and *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR.
 *
 * @param cnv         converter whose overflow buffer receives the excess; may be NULL
 * @param uchars      UChars to write
 * @param length      number of UChars in uchars
 * @param target      [in/out] current write position in the target buffer
 * @param targetLimit end of the target buffer
 * @param offsets     [in/out] may be NULL or point to NULL: no offsets are written
 * @param sourceIndex value stored in the offsets array for each UChar written
 * @param pErrorCode  [out] set only on overflow
 */
U_CFUNC void
ucnv_toUWriteUChars(UConverter *cnv,
                    const UChar *uchars, int32_t length,
                    UChar **target, const UChar *targetLimit,
                    int32_t **offsets,
                    int32_t sourceIndex,
                    UErrorCode *pErrorCode) {
    UChar *out=*target;
    int32_t *offsetOut=(offsets!=NULL) ? *offsets : NULL;

    /* copy as much as fits, with or without offsets */
    for(; length>0 && out<targetLimit; --length) {
        *out++=*uchars++;
        if(offsetOut!=NULL) {
            *offsetOut++=sourceIndex;
        }
    }

    /* publish the advanced positions */
    if(offsetOut!=NULL) {
        *offsets=offsetOut;
    }
    *target=out;

    if(length<=0) {
        return; /* everything fit */
    }

    /* the remainder goes into the converter's overflow buffer */
    if(cnv!=NULL) {
        UChar *overflow=cnv->UCharErrorBuffer;

        cnv->UCharErrorBufferLength=(int8_t)length;
        for(; length>0; --length) {
            *overflow++=*uchars++;
        }
    }
    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
}
U_CFUNC void
ucnv_toUWriteCodePoint(UConverter *cnv,
UChar32 c,

View File

@ -251,6 +251,13 @@ ucnv_fromUWriteBytes(UConverter *cnv,
int32_t **offsets,
int32_t sourceIndex,
UErrorCode *pErrorCode);
U_CFUNC void
ucnv_toUWriteUChars(UConverter *cnv,
const UChar *uchars, int32_t length,
UChar **target, const UChar *targetLimit,
int32_t **offsets,
int32_t sourceIndex,
UErrorCode *pErrorCode);
U_CFUNC void
ucnv_toUWriteCodePoint(UConverter *cnv,

View File

@ -0,0 +1,921 @@
/*
******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: ucnv_ext.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jun13
* created by: Markus W. Scherer
*
* Conversion extensions
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_LEGACY_CONVERSION
#include "ucnv_bld.h"
#include "ucnv_cnv.h"
#include "ucnv_ext.h"
#include "cmemory.h"
/*
* ### TODO
*
* implement getUnicodeSet for the extension table
* implement data swapping for it
*/
/*
* ### TODO: probably need pointer to baseTableSharedData
* and also copy the base table's pointers for the base table arrays etc.
* into this sharedData
*/
/* to Unicode --------------------------------------------------------------- */
/*
 * Look up one input byte in a section of the to-Unicode extension table.
 *
 * Each section word carries the input byte in its top bits and the lookup
 * value in its lower 24 bits; the words must be sorted by byte for the
 * search below to work.
 *
 * @param toUSection pointer to the first byte/value word of the section
 * @param length     number of words in the section
 * @param byte       input byte to look up
 * @return lookup value for the byte, if found; else 0
 */
static U_INLINE uint32_t
ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
    uint32_t word0, word;
    int32_t i, start, limit;

    if(length<=0) {
        return 0; /* empty section: nothing to find, and no toUSection[length-1] */
    }

    /* check the input byte against the lowest and highest section bytes */
    start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
    limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
    if(byte<start || limit<byte) {
        return 0; /* the byte is out of range */
    }

    if(length==((limit-start)+1)) {
        /* direct access on a linear array */
        return UCNV_EXT_TO_U_GET_VALUE(toUSection[byte-start]); /* could be 0 */
    }

    /*
     * Shift byte once instead of each section word.
     * word0 (bb000000) is the smallest and word (bbffffff) the largest
     * possible section word for this byte, so
     *   bb<=ss  if and only if  word0<=ssvvvvvv
     *   bb<ss   if and only if  word<ssvvvvvv
     * for all v=0..f, and we need not mask off the lower 24 bits
     * of each section word.
     */
    word0=UCNV_EXT_TO_U_MAKE_WORD(byte, 0);
    word=UCNV_EXT_TO_U_MAKE_WORD(byte, UCNV_EXT_TO_U_VALUE_MASK);

    /* binary search */
    start=0;
    limit=length;
    for(;;) {
        i=limit-start;
        if(i<=1) {
            break; /* done */
        }
        /* start<limit-1 */

        if(i<=4) {
            /*
             * linear search for the last part:
             * stop at the first word whose byte is >= the input byte
             */
            if(word0<=toUSection[start]) {
                break;
            }
            if(++start<limit && word0<=toUSection[start]) {
                break;
            }
            if(++start<limit && word0<=toUSection[start]) {
                break;
            }
            /* nothing smaller matched: the last candidate is checked below */
            ++start;
            break;
        }

        i=(start+limit)/2;
        if(word<toUSection[i]) {
            limit=i;
        } else {
            start=i;
        }
    }

    /* did we really find it? */
    if(start<limit && byte==UCNV_EXT_TO_U_GET_BYTE(word=toUSection[start])) {
        return UCNV_EXT_TO_U_GET_VALUE(word); /* never 0 */
    } else {
        return 0; /* not found */
    }
}
/*
 * Find the longest to-Unicode extension match for the bytes in pre[]
 * followed by the bytes in src[].
 *
 * This works like ucnv_extMatchFromU() except
 * - the first character is in pre
 * - no trie is used
 * - the returned matchLength is not offset by 2
 *
 * @param cx            pointer to extension data; if NULL, returns 0
 * @param pre           bytes that are matched first
 * @param preLength     length of pre, >=0
 * @param src           bytes that can be used to complete a match
 * @param srcLength     length of src, >=0
 * @param pResult       [out] pointer to the result UChars; set only when the
 *                      match maps to a string
 * @param pResultLength [out] result length; negative if the value itself
 *                      encodes a single code point
 * @param useFallback   "use fallback" flag
 * @param flush         TRUE if the end of the input stream is reached
 * @return >0: number of input bytes matched
 *          0: no match
 *         <0: partial match, negative number of bytes consumed so far
 */
static int32_t
ucnv_extMatchToU(const int32_t *cx,
                 const char *pre, int32_t preLength,
                 const char *src, int32_t srcLength,
                 const UChar **pResult, int32_t *pResultLength,
                 UBool useFallback, UBool flush) {
    const uint32_t *table, *section;
    uint32_t head, value, bestValue;
    int32_t preIndex, srcIndex, sectionIndex, sectionLength, consumed, bestLength;
    uint8_t unit;

    if(cx==NULL) {
        return 0; /* no extension data, no match */
    }

    table=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_INDEX, uint32_t);
    sectionIndex=0;
    bestValue=0;
    bestLength=0;
    preIndex=0;
    srcIndex=0;

    /*
     * Walk from section to section, one input byte per step, until there is
     * a full match, a mismatch, or the input runs out.
     * Fallback values are never remembered when fallbacks are not in use.
     */
    for(;;) {
        section=table+sectionIndex;

        /* the first word of a section holds the section length and a value */
        head=*section++;
        sectionLength=UCNV_EXT_TO_U_GET_BYTE(head);
        value=UCNV_EXT_TO_U_GET_VALUE(head);
        if( value!=0 &&
            (UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
             TO_U_USE_FALLBACK(useFallback))
        ) {
            /* the input consumed so far maps to something: longest match yet */
            bestValue=value;
            bestLength=preIndex+srcIndex;
        }

        /* fetch the next input byte: pre[] first, then src[] */
        if(preIndex<preLength) {
            unit=(uint8_t)pre[preIndex++];
        } else if(srcIndex<srcLength) {
            unit=(uint8_t)src[srcIndex++];
        } else {
            /* all input consumed while still inside the table */
            consumed=preIndex+srcIndex;
            if(!flush && consumed<=UCNV_EXT_MAX_BYTES) {
                /* partial match: continue with more input next time */
                return -consumed;
            }
            /*
             * end of the entire input stream, or the partial match would not
             * fit into the state buffers: settle for the longest match so far
             */
            break;
        }

        /* search for the current byte */
        value=ucnv_extFindToU(section, sectionLength, unit);
        if(value==0) {
            break; /* no match here, stop with the longest match so far */
        }
        if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
            /* partial match, continue in the next section */
            sectionIndex=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
            continue;
        }

        /* full match; a fallback that is not taken leaves the previous best */
        if( UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
            TO_U_USE_FALLBACK(useFallback)
        ) {
            bestValue=value;
            bestLength=preIndex+srcIndex;
        }
        break;
    }

    if(bestLength==0) {
        return 0; /* no match at all */
    }

    /* hand the result back to the caller */
    bestValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(bestValue);
    if(UCNV_EXT_TO_U_IS_CODE_POINT(bestValue)) {
        *pResultLength=-(int32_t)bestValue;
    } else {
        *pResultLength=UCNV_EXT_TO_U_GET_LENGTH(bestValue);
        *pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+UCNV_EXT_TO_U_GET_INDEX(bestValue);
    }
    return bestLength;
}
/*
 * Write a to-Unicode match result to the target buffer.
 *
 * @param result       result UChars; used only if resultLength>=0
 * @param resultLength >=0: number of UChars in result;
 *                     <0: the negated value encodes a single code point
 */
static U_INLINE void
ucnv_extWriteToU(UConverter *cnv,
                 const UChar *result, int32_t resultLength,
                 UChar **target, const UChar *targetLimit,
                 int32_t **offsets, int32_t srcIndex,
                 UErrorCode *pErrorCode) {
    if(resultLength>=0) {
        /* a string result - with correct data we have resultLength>0 */
        ucnv_toUWriteUChars(
            cnv,
            result, resultLength,
            target, targetLimit,
            offsets, srcIndex,
            pErrorCode);
        return;
    }

    /* a single code point, encoded in the negated length */
    ucnv_toUWriteCodePoint(
        cnv, UCNV_EXT_TO_U_GET_CODE_POINT(-resultLength),
        target, targetLimit,
        offsets, srcIndex,
        pErrorCode);
}
/*
 * Try to match the first codepage character (in cnv->toUBytes, firstLength
 * bytes long) plus following source bytes against the extension table.
 *
 * Precondition: target<targetLimit; an error code is set for overflow.
 *
 * @return TRUE if there was a full match (output written, *src advanced)
 *         or a partial match (input stored in cnv->preToU[] and
 *         cnv->preToULength set);
 *         FALSE if there was no match
 */
U_CFUNC UBool
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
                        int32_t firstLength,
                        const char **src, const char *srcLimit,
                        UChar **target, const UChar *targetLimit,
                        int32_t **offsets, int32_t srcIndex,
                        UBool flush,
                        UErrorCode *pErrorCode) {
    const UChar *result;
    const char *in;
    int32_t resultLength, match, stored;

    match=ucnv_extMatchToU(cx,
                           (const char *)cnv->toUBytes, firstLength,
                           *src, (int32_t)(srcLimit-*src),
                           &result, &resultLength,
                           cnv->useFallback, flush);
    if(match==0) {
        return FALSE; /* no match */
    }

    if(match>0) {
        /* full match: skip the source bytes consumed beyond the first character */
        *src+=match-firstLength;

        ucnv_extWriteToU(cnv,
                         result, resultLength,
                         target, targetLimit,
                         offsets, srcIndex,
                         pErrorCode);
        return TRUE;
    }

    /* match<0: partial match, save the input consumed so far in preToU[] */
    match=-match;
    stored=0;

    /* first the bytes of the first codepage character ... */
    cnv->preToUFirstLength=(int8_t)firstLength;
    in=(const char *)cnv->toUBytes;
    while(stored<firstLength) {
        cnv->preToU[stored++]=*in++;
    }

    /* ... then the newly consumed source bytes */
    in=*src;
    while(stored<match) {
        cnv->preToU[stored++]=*in++;
    }
    *src=in; /* same as *src=srcLimit; because we reached the end of input */

    cnv->preToULength=(int8_t)match;
    return TRUE;
}
#if 0
/*
 * Disabled draft of a "simple" (single-character) to-Unicode lookup.
 *
 * NOTE(review): as written this cannot be enabled:
 * - the call below passes a code point plus two NULL/0 pairs (10 arguments),
 *   but ucnv_extMatchToU() takes (cx, pre, preLength, src, srcLength,
 *   pResult, pResultLength, useFallback, flush)
 * - result is a const uint8_t * where ucnv_extMatchToU() expects the
 *   address of a const UChar *
 * - the match>=2 test and the <subchar1> remarks follow the return
 *   convention of ucnv_extMatchFromU(), not of ucnv_extMatchToU()
 * It looks like it was adapted from a from-Unicode counterpart - confirm
 * before reviving it.
 */
/* ### TODO */
U_CFUNC int32_t
ucnv_extSimpleMatchToU(const int32_t *cx,
UChar32 cp, uint32_t *pValue,
UBool useFallback,
UErrorCode *pErrorCode) {
const uint8_t *result;
int32_t resultLength, match;
/* try to match */
match=ucnv_extMatchToU(cx,
cp,
NULL, 0,
NULL, 0,
&result, &resultLength,
useFallback, TRUE);
if(match>=2) {
/* write result for simple, single-character conversion */
if(resultLength<0) {
/* the negated length variable itself carries the data and the length */
resultLength=-resultLength;
*pValue=(uint32_t)UCNV_EXT_TO_U_GET_DATA(resultLength);
return UCNV_EXT_TO_U_GET_LENGTH(resultLength);
} else if(resultLength==4) {
/* de-serialize a 4-byte result */
*pValue=
((uint32_t)result[0]<<24)|
((uint32_t)result[1]<<16)|
((uint32_t)result[2]<<8)|
result[3];
return 4;
}
}
/*
* return no match because
* - match>1 && resultLength>4: result too long for simple conversion
* - match==1: no match found, <subchar1> preferred
* - match==0: no match found in the first place
* - match<0: partial match, not supported for simple conversion (and flush==TRUE)
*/
return 0;
}
#endif
/*
 * Continue a partial to-Unicode match with new input.
 * The bytes stored in cnv->preToU[] are matched first, then pArgs->source.
 * Never called for simple, single-character conversion.
 *
 * On return, cnv->preToULength is
 *   0  if the stored bytes were fully consumed by a match,
 *   >0 if there still is a partial match (new input was appended),
 *   <0 if stored bytes remain that must be replayed by the caller.
 */
U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
                         UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
                         UErrorCode *pErrorCode) {
    /* number of stored bytes on entry; the matcher does not change it */
    const int32_t preLength=cnv->preToULength;
    const UChar *result;
    const char *in;
    int32_t resultLength, match, rest, stored;

    match=ucnv_extMatchToU(cnv->sharedData->table->mbcs.extIndexes,
                           cnv->preToU, preLength,
                           pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                           &result, &resultLength,
                           cnv->useFallback, pArgs->flush);

    if(match==0) {
        /*
         * no match
         *
         * We need to split the previous input into two parts:
         *
         * 1. The first codepage character is unmappable - that's how we got into
         *    trying the extension data in the first place.
         *    We need to move it from the preToU buffer
         *    to the error buffer, set an error code,
         *    and prepare the rest of the previous input for 2.
         *
         * 2. The rest of the previous input must be converted once we
         *    come back from the callback for the first character.
         *    At that time, we have to try again from scratch to convert
         *    these input characters.
         *    The replay will be handled by the ucnv.c conversion code.
         */

        /* move the first codepage character to the error field */
        uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
        cnv->toULength=cnv->preToUFirstLength;

        /* move the rest up inside the buffer and mark it for replay */
        rest=preLength-cnv->preToUFirstLength;
        if(rest>0) {
            uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, rest);
        }
        cnv->preToULength=(int8_t)-rest;

        /* set the error code for unassigned */
        *pErrorCode=U_INVALID_CHAR_FOUND;
        return;
    }

    if(match<0) {
        /* still a partial match: just _append_ the newly consumed input to preToU[] */
        match=-match;
        in=pArgs->source;
        for(stored=preLength; stored<match; ++stored) {
            cnv->preToU[stored]=*in++;
        }
        pArgs->source=in; /* same as *src=srcLimit; because we reached the end of input */
        cnv->preToULength=(int8_t)match;
        return;
    }

    /* match>0: full match */
    if(match>=preLength) {
        /* all stored bytes were used; advance past the consumed new input */
        pArgs->source+=match-preLength;
        cnv->preToULength=0;
    } else {
        /* the match did not use all of preToU[] - keep the rest for replay */
        rest=preLength-match;
        uprv_memmove(cnv->preToU, cnv->preToU+match, rest);
        cnv->preToULength=(int8_t)-rest;
    }

    /* write result */
    ucnv_extWriteToU(cnv,
                     result, resultLength,
                     &pArgs->target, pArgs->targetLimit,
                     &pArgs->offsets, srcIndex,
                     pErrorCode);
}
/* from Unicode ------------------------------------------------------------- */
/*
 * Find a UChar in a sorted section of UChars.
 *
 * @param fromUSection pointer to the section's UChars, sorted ascending
 * @param length       number of UChars in the section
 * @param u            UChar to look up
 * @return index of the UChar, if found; else <0
 */
static U_INLINE int32_t
ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
    int32_t i, start, limit;

    /* binary search */
    start=0;
    limit=length;
    for(;;) {
        i=limit-start;
        if(i<=1) {
            break; /* done */
        }
        /* start<limit-1 */

        if(i<=4) {
            /*
             * linear search for the last part:
             * stop at the first unit that is >= u
             * (testing u>=unit here would always stop at the first slot,
             * because the bisection keeps fromUSection[start]<=u)
             */
            if(u<=fromUSection[start]) {
                break;
            }
            if(++start<limit && u<=fromUSection[start]) {
                break;
            }
            if(++start<limit && u<=fromUSection[start]) {
                break;
            }
            /* nothing smaller matched: the last candidate is checked below */
            ++start;
            break;
        }

        i=(start+limit)/2;
        if(u<fromUSection[i]) {
            limit=i;
        } else {
            start=i;
        }
    }

    /* did we really find it? */
    if(start<limit && u==fromUSection[start]) {
        return start;
    } else {
        return -1; /* not found */
    }
}
/*
 * Find the longest from-Unicode extension match for firstCP followed by
 * the UChars in pre[] and then src[].
 *
 * @param cx pointer to extension data; if NULL, returns 0
 * @param firstCP the first code point before all the other UChars
 * @param pre UChars that must match; !initialMatch: partial match with them
 * @param preLength length of pre, >=0
 * @param src UChars that can be used to complete a match
 * @param srcLength length of src, >=0
 * @param pResult [out] address of pointer to result bytes
 *                      set only in case of a match
 * @param pResultLength [out] address of result length variable;
 *                            gets a negative value if the length variable
 *                            itself contains the length and bytes, encoded in
 *                            the format of fromUTableValues[] and then inverted
 * @param useFallback "use fallback" flag, usually from cnv->useFallback
 * @param flush TRUE if the end of the input stream is reached
 * @return >1: matched, return value=total match length (number of input units matched)
 *          1: matched, no mapping but request for <subchar1>
 *             (only for the first code point)
 *          0: no match
 *         <0: partial match, return value=negative total match length
 *             (partial matches are never returned for flush==TRUE)
 *             (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
 *         the matchLength is 2 if only firstCP matched, and >2 if firstCP and
 *         further code units matched
 */
static int32_t
ucnv_extMatchFromU(const int32_t *cx,
                   UChar32 firstCP,
                   const UChar *pre, int32_t preLength,
                   const UChar *src, int32_t srcLength,
                   const uint8_t **pResult, int32_t *pResultLength,
                   UBool useFallback, UBool flush) {
    const uint16_t *stage12, *stage3;
    const uint32_t *stage3b;

    const UChar *fromUTableUChars, *fromUSectionUChars;
    const uint32_t *fromUTableValues, *fromUSectionValues;

    uint32_t value, matchValue;
    int32_t i, j, idx, length, matchLength;
    UChar c;

    if(cx==NULL) {
        return 0; /* no extension data, no match */
    }

    /* trie lookup of firstCP */
    idx=firstCP>>10; /* stage 1 index */
    if(idx>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
        return 0; /* the first code point is outside the trie */
    }

    stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
    stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
    idx=UCNV_EXT_FROM_U(stage12, stage3, idx, firstCP);

    stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
    value=stage3b[idx];
    if(value==0) {
        return 0;
    }

    /*
     * The stage 3b value is tested with the from-Unicode "partial" macro,
     * the same one that is applied to the section values in the loop below
     * (not with the to-Unicode macro).
     */
    if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
        /* partial match, enter the loop below */
        idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);

        /* initialize */
        fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
        fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);

        matchValue=0;
        i=j=matchLength=0;

        /* we must not remember fallback matches when not using fallbacks */

        /* match input units until there is a full match or the input is consumed */
        for(;;) {
            /* go to the next section */
            fromUSectionUChars=fromUTableUChars+idx;
            fromUSectionValues=fromUTableValues+idx;

            /* read first pair of the section: section length and a value */
            length=*fromUSectionUChars++;
            value=*fromUSectionValues++;
            if( value!=0 &&
                (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
                 FROM_U_USE_FALLBACK(useFallback, firstCP))
            ) {
                /* remember longest match so far */
                matchValue=value;
                matchLength=2+i+j;
            }

            /* match pre[] then src[] */
            if(i<preLength) {
                c=pre[i++];
            } else if(j<srcLength) {
                c=src[j++];
            } else {
                /* all input consumed, partial match */
                if(flush || (length=(i+j))>UCNV_EXT_MAX_UCHARS) {
                    /*
                     * end of the entire input stream, stop with the longest match so far
                     * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
                     * because it must fit into state buffers
                     */
                    break;
                } else {
                    /* continue with more input next time */
                    return -(2+length);
                }
            }

            /* search for the current UChar */
            idx=ucnv_extFindFromU(fromUSectionUChars, length, c);
            if(idx<0) {
                /* no match here, stop with the longest match so far */
                break;
            } else {
                value=fromUSectionValues[idx];
                if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
                    /* partial match, continue */
                    idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
                } else {
                    if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
                        FROM_U_USE_FALLBACK(useFallback, firstCP)
                    ) {
                        /* full match, stop with result */
                        matchValue=value;
                        matchLength=2+i+j;
                    } else {
                        /* full match on fallback not taken, stop with the longest match so far */
                    }
                    break;
                }
            }
        }

        if(matchLength==0) {
            /* no match at all */
            return 0;
        }
    } else /* result from firstCP trie lookup */ {
        if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
            FROM_U_USE_FALLBACK(useFallback, firstCP)
        ) {
            /* full match, stop with result */
            matchValue=value;
            matchLength=2;
        } else {
            /* fallback not taken */
            return 0;
        }
    }

    if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
        /* do not interpret values with reserved bits used, for forward compatibility */
        return 0;
    }

    /* return result */
    if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
        return 1;
    }

    matchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
    length=(int32_t)UCNV_EXT_FROM_U_GET_LENGTH(matchValue);
    if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
        /* short results are carried inside the (negated) length variable */
        *pResultLength=-(int32_t)matchValue;
    } else {
        *pResultLength=length;
        *pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+UCNV_EXT_FROM_U_GET_DATA(matchValue);
    }

    return matchLength;
}
/*
 * Emit the bytes of one from-Unicode extension match.
 *
 * resultLength>0: result points to resultLength bytes which are passed on as is.
 * resultLength<0: -resultLength is a packed value word; its length bits give
 *                 the byte count (expected 1..UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH)
 *                 and its data bits hold the bytes, most significant first.
 *                 The incoming result pointer is not read in this case.
 *
 * The actual writing, including offsets and the error code, is delegated to
 * ucnv_fromUWriteBytes() so that it is done in exactly one place.
 */
static U_INLINE void
ucnv_extWriteFromU(UConverter *cnv,
                   const uint8_t *result, int32_t resultLength,
                   char **target, const char *targetLimit,
                   int32_t **offsets, int32_t srcIndex,
                   UErrorCode *pErrorCode) {
    uint8_t unpacked[4];

    if(resultLength<0) {
        /* unpack the directly stored bytes into a small array first */
        const int32_t word=-resultLength;
        const uint32_t data=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(word);
        uint8_t *out=unpacked;
        int32_t shift;

        resultLength=UCNV_EXT_FROM_U_GET_LENGTH(word);

        /* other lengths never occur with correct data; nothing is unpacked for them */
        if(1<=resultLength && resultLength<=3) {
            for(shift=8*(resultLength-1); shift>=0; shift-=8) {
                *out++=(uint8_t)(data>>shift);
            }
        }

        result=unpacked;
    }

    /* correct data always yields resultLength>0 here */
    ucnv_fromUWriteBytes(cnv, (const char *)result, resultLength,
                         target, targetLimit,
                         offsets, srcIndex,
                         pErrorCode);
}
/*
 * First attempt to map the code point cp, optionally followed by more input
 * from *src, through the extension table cx.
 *
 * Precondition (from the original contract): target<targetLimit;
 * an error code is set for target overflow.
 *
 * @return TRUE  if the input was handled: either a full match was written to
 *               the target, or a partial match was stored in cnv
 *               (preFromUFirstCP/preFromU[]/preFromULength) to be continued
 *               with the next buffer
 *         FALSE if there is no mapping; cnv->useSubChar1 is set when the
 *               table asks for <subchar1>
 */
U_CFUNC UBool
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
                          UChar32 cp,
                          const UChar **src, const UChar *srcLimit,
                          char **target, const char *targetLimit,
                          int32_t **offsets, int32_t srcIndex,
                          UBool flush,
                          UErrorCode *pErrorCode) {
    const uint8_t *bytes;
    int32_t bytesLength, matched;

    /* no prefix buffer yet: only cp and the current source are matched */
    matched=ucnv_extMatchFromU(cx, cp,
                               NULL, 0,
                               *src, (int32_t)(srcLimit-*src),
                               &bytes, &bytesLength,
                               cnv->useFallback, flush);

    if(matched>=2) {
        /* full match: skip the consumed UChars (the count includes 2 for cp) */
        *src+=matched-2;

        ucnv_extWriteFromU(cnv,
                           bytes, bytesLength,
                           target, targetLimit,
                           offsets, srcIndex,
                           pErrorCode);
        return TRUE;
    }

    if(matched<0) {
        /* partial match: remember cp and everything consumed after it */
        const UChar *in=*src;
        int32_t count=-matched-2; /* the count includes 2 for cp */
        int32_t k=0;

        cnv->preFromUFirstCP=cp;
        while(k<count) {
            cnv->preFromU[k++]=*in++;
        }
        *src=in; /* equals srcLimit because the end of the input was reached */
        cnv->preFromULength=(int8_t)count;
        return TRUE;
    }

    /* matched is 0 (nothing found) or 1 (nothing found, <subchar1> requested) */
    if(matched==1) {
        cnv->useSubChar1=TRUE;
    }
    return FALSE;
}
/*
 * Single-code point lookup for simple conversion: matches cp alone
 * (no further input, flush==TRUE) and hands the result bytes back
 * packed into *pValue.
 *
 * @return the number of result bytes (directly stored results, or exactly 4),
 *         or 0 if there is no usable result; *pValue is only set when
 *         the return value is not 0
 *
 * pErrorCode is not used by this implementation.
 */
U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
                         UChar32 cp, uint32_t *pValue,
                         UBool useFallback,
                         UErrorCode *pErrorCode) {
    const uint8_t *bytes;
    int32_t bytesLength, matched;

    matched=ucnv_extMatchFromU(cx,
                               cp,
                               NULL, 0,
                               NULL, 0,
                               &bytes, &bytesLength,
                               useFallback, TRUE);

    /*
     * Anything below 2 is not a mapping:
     *   1   no mapping, <subchar1> preferred
     *   0   no match in the first place
     *   <0  partial match, which simple conversion does not support
     */
    if(matched<2) {
        return 0;
    }

    if(bytesLength<0) {
        /* directly stored bytes: the negated length is the packed value word */
        int32_t word=-bytesLength;

        *pValue=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(word);
        return UCNV_EXT_FROM_U_GET_LENGTH(word);
    }

    if(bytesLength==4) {
        /* indexed 4-byte result: assemble it big-endian */
        uint32_t packed=0;
        int32_t k;

        for(k=0; k<4; ++k) {
            packed=(packed<<8)|bytes[k];
        }
        *pValue=packed;
        return 4;
    }

    /* the result is too long for simple conversion */
    return 0;
}
/*
 * Resume a partial from-Unicode match with the new input in pArgs.
 * Requires cnv->preFromUFirstCP>=0, that is, a stored partial match.
 * Never called for simple, single-character conversion.
 *
 * Outcomes:
 * - still partial: the newly consumed UChars are appended to cnv->preFromU[]
 * - full match:    the result is written; stored UChars that were not part
 *                  of the match are kept and marked for replay
 *                  (negative cnv->preFromULength)
 * - no match:      the first code point goes to cnv->fromUChar32,
 *                  *pErrorCode is set to U_INVALID_CHAR_FOUND, and all of
 *                  preFromU[] is marked for replay
 */
U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,
                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
                           UErrorCode *pErrorCode) {
    const uint8_t *bytes;
    int32_t bytesLength, matched;

    matched=ucnv_extMatchFromU(cnv->sharedData->table->mbcs.extIndexes,
                               cnv->preFromUFirstCP,
                               cnv->preFromU, cnv->preFromULength,
                               pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
                               &bytes, &bytesLength,
                               cnv->useFallback, pArgs->flush);

    if(matched<0) {
        /*
         * Still a partial match: preFromU[] already holds the earlier UChars,
         * so only the tail that came from the current source is added.
         */
        const UChar *in=pArgs->source;
        int32_t total=-matched-2; /* the count includes 2 for the first code point */
        int32_t k=cnv->preFromULength;

        while(k<total) {
            cnv->preFromU[k++]=*in++;
        }
        pArgs->source=in; /* equals sourceLimit because the end of the input was reached */
        cnv->preFromULength=(int8_t)total;
        return;
    }

    if(matched<2) {
        /*
         * No match (0), or no mapping with a request for <subchar1> (1).
         *
         * The stored input is split in two:
         * - The first code point is unmappable, which is why the extension
         *   data was consulted at all. It becomes the error code point.
         * - The UChars after it have to be converted from scratch once the
         *   callback for the first code point has returned. They stay in
         *   preFromU[] with a negated length as the replay marker;
         *   the replay itself is handled by the ucnv.c conversion code.
         */
        if(matched==1) {
            cnv->useSubChar1=TRUE;
        }

        cnv->fromUChar32=cnv->preFromUFirstCP;
        cnv->preFromUFirstCP=U_SENTINEL;

        cnv->preFromULength=-cnv->preFromULength;

        *pErrorCode=U_INVALID_CHAR_FOUND;
        return;
    }

    /* full match */
    {
        /* number of stored UChars that the match did not use (<=0: all were used) */
        int32_t unused;

        matched-=2; /* the count includes 2 for the first code point */
        unused=cnv->preFromULength-matched;

        if(unused<=0) {
            /* everything stored was used, plus -unused UChars of new input */
            pArgs->source-=unused;
            cnv->preFromULength=0;
        } else {
            /* move the unused stored UChars to the front and mark them for replay */
            uprv_memmove(cnv->preFromU, cnv->preFromU+matched, unused*U_SIZEOF_UCHAR);
            cnv->preFromULength=(int8_t)-unused;
        }
    }

    /* the partial match is finished */
    cnv->preFromUFirstCP=U_SENTINEL;

    ucnv_extWriteFromU(cnv,
                       bytes, bytesLength,
                       &pArgs->target, pArgs->targetLimit,
                       &pArgs->offsets, srcIndex,
                       pErrorCode);
}
/*
* ### TODO
*
* - test toU() functions
*
* - EBCDIC_STATEFUL: support extensions, but the charset string must be
* either one single-byte character or a sequence of double-byte ones,
* to avoid state transitions inside the mapping and to avoid having to
* store character boundaries.
* The extension functions will need an additional EBCDIC state in/out
* parameter and will have to be able to insert an SI or SO before writing
* the mapping result.
* - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
* - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
* - make dbcsonly work with extensions
*
* - test |2 to <subchar1> for regular code point, prefix code point,
* multiple code points
* - test fallback from non-zero to 00
* - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
*/
#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */

View File

@ -0,0 +1,417 @@
/*
******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: ucnv_ext.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jun13
* created by: Markus W. Scherer
*
* Conversion extensions
*/
#ifndef __UCNV_EXT_H__
#define __UCNV_EXT_H__
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
/*
* See icuhtml/design/conversion/conversion_extensions.html
*
* Conversion extensions serve two purposes:
* 1. They support m:n mappings.
* 2. They support extension-only conversion files that are used together
* with the regular conversion data in base files.
*
* A base file may contain an extension table (explicitly requested or
* implicitly generated for m:n mappings), but its extension table is not
* used when an extension-only file is used.
*
* It is an error if a base file contains any regular (not extension) mapping
* from the same sequence as a mapping in the extension file
* because the base mapping would hide the extension mapping.
*
*
* Data for conversion extensions:
*
* One set of data structures per conversion direction (to/from Unicode).
* The data structures are sorted by input units to allow for binary search.
* Input sequences of more than one unit are handled like contraction tables
* in collation:
* The lookup value of a unit points to another table that is to be searched
* for the next unit, recursively.
*
* For conversion from Unicode, the initial code point is looked up in
* a 3-stage trie for speed,
* with an additional table of unique results to save space.
*
* Long output strings are stored in separate arrays, with length and index
* in the lookup tables.
* Output results also include a flag distinguishing roundtrip from
* (reverse) fallback mappings.
*
* Input Unicode strings must not begin or end with unpaired surrogates
* to avoid problems with matches on parts of surrogate pairs.
*
* Mappings from multiple characters (code points or codepage state
* table sequences) must be searched preferring the longest match.
* For this to work and be efficient, the variable-width table must contain
* all mappings that contain prefixes of the multiple characters.
* If an extension table is built on top of a base table in another file
* and a base table entry is a prefix of a multi-character mapping, then
* this is an error.
*
*
* Implementation note:
*
* Currently, the parser and several checks in the code limit the number
* of UChars or bytes in a mapping to
* UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
* which are output value limits in the data structure.
*
* For input, this is not strictly necessary - it is a hard limit only for the
* buffers in UConverter that are used to store partial matches.
*
* Input sequences could otherwise be arbitrarily long if partial matches
* need not be stored (i.e., if a sequence does not span several buffers with too
* many units before the last buffer), although then results would differ
* depending on whether partial matches exceed the limits or not,
* which depends on the pattern of buffer sizes.
*
*
* Data structure:
*
* int32_t indexes[>=32];
*
* Array of indexes and lengths etc. The length of the array is at least 32.
* The actual length is stored in indexes[0] to be forward compatible.
*
* Each index to another array is the number of bytes from indexes[].
* Each length of an array is the number of array base units in that array.
*
* Some of the structures may not be present, in which case their indexes
* and lengths are 0.
*
* Usage of indexes[i]:
* [0] length of indexes[]
*
* // to Unicode table
* [1] index of toUTable[] (array of uint32_t)
* [2] length of toUTable[]
* [3] index of toUUChars[] (array of UChar)
* [4] length of toUUChars[]
*
* // from Unicode table, not for the initial code point
* [5] index of fromUTableUChars[] (array of UChar)
* [6] index of fromUTableValues[] (array of uint32_t)
* [7] length of fromUTableUChars[] and fromUTableValues[]
* [8] index of fromUBytes[] (array of char)
* [9] length of fromUBytes[]
*
* // from Unicode trie for initial-code point lookup
* [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
* [11] length of stage 1 portion of fromUStage12[]
* [12] length of fromUStage12[]
* [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
* [14] length of fromUStage3[]
* [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
* [16] length of fromUStage3b[]
*
* [17]..[30] reserved
* [31] number of bytes for the entire extension structure
* [>31] reserved; there are indexes[0] indexes
*
*
* uint32_t toUTable[];
*
* Array of byte/value pairs for lookups for toUnicode conversion.
* The array is partitioned into sections like collation contraction tables.
* Each section contains one word with the number of following words and
* a default value for when the lookup in this section yields no match.
*
* A section is sorted in ascending order of input bytes,
* allowing for fast linear or binary searches.
* The builder may store entries for a contiguous range of byte values
* (compare difference between the first and last one with count),
* which then allows for direct array access.
* The builder should always do this for the initial table section.
*
* Entries may have 0 values, see below.
* No two entries in a section have the same byte values.
*
* Each uint32_t contains an input byte value in bits 31..24 and the
* corresponding lookup value in bits 23..0.
* Interpret the value as follows:
* if(value==0) {
* no match, see below
* } else if(value<0x1f0000) {
* partial match - use value as index to the next toUTable section
* and match the next unit; (value indexes toUTable[value])
* } else {
* if(bit 23 set) {
* roundtrip;
* } else {
* fallback;
* }
* unset value bit 23;
* if(value<=0x2fffff) {
* (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
* } else {
* bits 17..0 (value&0x3ffff) is an index to
* the result UChars in toUUChars[]; (0 indexes toUUChars[0])
* length of the result=((value>>18)-12); (length=0..19)
* }
* }
*
* The first word in a section contains the number of following words in the
* input byte position (bits 31..24, number=1..0xff).
* The value of the initial word is used when the current byte is not found
* in this section.
* If the value is not 0, then it represents a result as above.
* If the value is 0, then the search has to return a shorter match with an
* earlier default value as the result, or result in "unmappable" even for the
* initial bytes.
* If the value is 0 for the initial toUTable entry, then the initial byte
* does not start any mapping input.
*
*
* UChar toUUChars[];
*
* Contains toUnicode mapping results, stored as sequences of UChars.
* Indexes and lengths stored in the toUTable[].
*
*
* UChar fromUTableUChars[];
* uint32_t fromUTableValues[];
*
* The fromUTable is split into two arrays, but works otherwise much like
* the toUTable. The array is partitioned into sections like collation
* contraction tables and toUTable.
* A row in the table consists of same-index entries in fromUTableUChars[]
* and fromUTableValues[].
*
* Interpret a value as follows:
* if(value==0) {
* no match, see below
* } else if(value<=0xffffff) { (bits 31..24 are 0)
* partial match - use value as index to the next fromUTable section
* and match the next unit; (value indexes fromUTable[value])
* } else {
* if(value==0x80000001) {
* return no mapping, but request for <subchar1>;
* }
* if(bit 31 set) {
* roundtrip;
* } else {
* fallback;
* }
* // bits 30..29 reserved, 0
* length=(value>>24)&0x1f; (bits 28..24)
* if(length==1..3) {
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
* } else {
* bits 23..0 (value&0xffffff) is an index to
* the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
* }
* }
*
* The first pair in a section contains the number of following pairs in the
* UChar position (16 bits, number=1..0xffff).
* The value of the initial pair is used when the current UChar is not found
* in this section.
* If the value is not 0, then it represents a result as above.
* If the value is 0, then the search has to return a shorter match with an
* earlier default value as the result, or result in "unmappable" even for the
* initial UChars.
*
* If the from Unicode trie is present, then the from Unicode search tables
* are not used for initial code points.
* In this case, the first entries (index 0) in the tables are not used
* (reserved, set to 0) because a value of 0 is used in trie results
* to indicate no mapping.
*
*
* uint16_t fromUStage12[];
*
* Stages 1 & 2 of a trie that maps an initial code point.
* Indexes in stage 1 are all offset by the length of stage 1 so that the
* same array pointer can be used for both stages.
* If (c>>10)>=(length of stage 1) then c does not start any mapping.
* Same bit distribution as for regular conversion tries.
*
*
* uint16_t fromUStage3[];
* uint32_t fromUStage3b[];
*
* Stage 3 of the trie. The first array simply contains indexes to the second,
* which contains words in the same format as fromUTableValues[].
* Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
* and 16-bit entries in stage 3 allow for 64k stage 3b entries.
* The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
*
* Two arrays are used because it is expected that more than half of the stage 3
* entries will be zero. The 16-bit index stage 3 array saves space even
* considering storing a total of 6 bytes per non-zero entry in both arrays
* together.
* Using a stage 3 granularity of >1 diminishes the compactability in that stage
* but provides a larger effective addressing space in stage 2.
* All but the final result stage use 16-bit entries to save space.
*
* fromUStage3b[] contains a zero for "no mapping" at its index 0,
* and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
* (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
* and all other items are unique non-zero results.
*
*
* char fromUBytes[];
*
* Contains fromUnicode mapping results, stored as sequences of chars.
* Indexes and lengths stored in the fromUTableValues[].
*/
/*
 * Slot numbers in the extension indexes[] array.
 * The documented slots are part of the data format and therefore carry
 * explicit values; only the first reserved slot is left to follow the
 * last assigned one.
 */
enum {
    UCNV_EXT_INDEXES_LENGTH=0,          /* length of indexes[] */

    /* to Unicode table */
    UCNV_EXT_TO_U_INDEX=1,
    UCNV_EXT_TO_U_LENGTH=2,
    UCNV_EXT_TO_U_UCHARS_INDEX=3,
    UCNV_EXT_TO_U_UCHARS_LENGTH=4,

    /* from Unicode table, not for the initial code point */
    UCNV_EXT_FROM_U_UCHARS_INDEX=5,
    UCNV_EXT_FROM_U_VALUES_INDEX=6,
    UCNV_EXT_FROM_U_LENGTH=7,
    UCNV_EXT_FROM_U_BYTES_INDEX=8,
    UCNV_EXT_FROM_U_BYTES_LENGTH=9,

    /* from Unicode trie for the initial code point */
    UCNV_EXT_FROM_U_STAGE_12_INDEX=10,
    UCNV_EXT_FROM_U_STAGE_1_LENGTH=11,
    UCNV_EXT_FROM_U_STAGE_12_LENGTH=12,
    UCNV_EXT_FROM_U_STAGE_3_INDEX=13,
    UCNV_EXT_FROM_U_STAGE_3_LENGTH=14,
    UCNV_EXT_FROM_U_STAGE_3B_INDEX=15,
    UCNV_EXT_FROM_U_STAGE_3B_LENGTH=16,

    UCNV_EXT_RESERVED_INDEX,            /* 17, moves with additional indexes */

    UCNV_EXT_SIZE=31,                   /* number of bytes for the entire extension structure */
    UCNV_EXT_INDEXES_MIN_LENGTH=32      /* indexes[] has at least this many entries */
};
/*
 * Get the pointer to an extension array from indexes[index].
 *
 * indexes[index] is a byte offset counted from the start of indexes[] itself,
 * which is why the base pointer is cast to (const char *) before the addition.
 * The result is a (const itemType *).
 * Note that the indexes argument is evaluated twice.
 */
#define UCNV_EXT_ARRAY(indexes, index, itemType) \
((const itemType *)((const char *)(indexes)+(indexes)[index]))
/* internal API ------------------------------------------------------------- */
/*
 * toUnicode: first attempt to match input bytes against the extension table cx.
 * The MBCS caller treats a TRUE result as "an extension mapping handled the input".
 * NOTE(review): presumably mirrors ucnv_extInitialMatchFromU() for the
 * byte-to-Unicode direction, with firstLength bytes already consumed by the
 * caller - confirm against the implementation in ucnv_ext.c.
 */
U_CFUNC UBool
ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
int32_t firstLength,
const char **src, const char *srcLimit,
UChar **target, const UChar *targetLimit,
int32_t **offsets, int32_t srcIndex,
UBool flush,
UErrorCode *pErrorCode);
/*
 * toUnicode: continue a match that was started with an earlier buffer.
 * The MBCS caller invokes this while cnv->preToULength>0 and stops converting
 * if an error was set or cnv->preToULength is negative afterwards.
 */
U_CFUNC void
ucnv_extContinueMatchToU(UConverter *cnv,
UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
UErrorCode *pErrorCode);
/*
 * fromUnicode: first attempt to match cp, plus following UChars from *src,
 * against the extension table cx.
 * Returns TRUE if a full match was written to the target or a partial match
 * was stored in cnv for continuation with the next buffer.
 * Returns FALSE if there is no mapping; cnv->useSubChar1 is set if the table
 * requests <subchar1>.
 */
U_CFUNC UBool
ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
UChar32 cp,
const UChar **src, const UChar *srcLimit,
char **target, const char *targetLimit,
int32_t **offsets, int32_t srcIndex,
UBool flush,
UErrorCode *pErrorCode);
/*
 * fromUnicode: look up the single code point cp for simple conversion.
 * Returns the number of result bytes with the bytes packed into *pValue,
 * or 0 if there is no mapping or the result is longer than 4 bytes.
 */
U_CFUNC int32_t
ucnv_extSimpleMatchFromU(const int32_t *cx,
UChar32 cp, uint32_t *pValue,
UBool useFallback,
UErrorCode *pErrorCode);
/*
 * fromUnicode: continue a partial match stored in cnv with the input in pArgs.
 * Requires cnv->preFromUFirstCP>=0.
 * Sets U_INVALID_CHAR_FOUND and moves the first code point to cnv->fromUChar32
 * if the continued input does not lead to a match.
 */
U_CFUNC void
ucnv_extContinueMatchFromU(UConverter *cnv,
UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
UErrorCode *pErrorCode);
/* toUnicode helpers -------------------------------------------------------- */
/* a toUTable word holds the input byte in bits 31..24 and the lookup value in bits 23..0 */
#define UCNV_EXT_TO_U_BYTE_SHIFT 24
#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
/* values below MIN_CODE_POINT are indexes of the next toUTable section (partial match) */
#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
/* with the roundtrip flag removed, values up to MAX_CODE_POINT encode (code point+MIN_CODE_POINT) */
#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
/* bit 23 of the value: set for a roundtrip mapping, clear for a fallback */
#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
/* larger values: bits 17..0 index toUUChars[], the bits above hold (length+LENGTH_OFFSET) */
#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
#define UCNV_EXT_TO_U_LENGTH_SHIFT 18
#define UCNV_EXT_TO_U_LENGTH_OFFSET 12
/* maximum number of indexed UChars */
#define UCNV_EXT_MAX_UCHARS 19
/* combine an input byte and a lookup value into one toUTable word, and take a word apart again */
#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))
#define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)
/* the following macros work on a lookup value (bits 23..0), not on a whole word */
#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
#define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)
#define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
#define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)
/* use after masking off the roundtrip flag */
#define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
#define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)
/* for values that are not code points: index into toUUChars[] and number of result UChars */
#define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
#define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)
/* fromUnicode helpers ------------------------------------------------------ */
/* most trie constants are shared with ucnvmbcs.h */
/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
/* stage 2 entries are left-shifted by this amount to form stage 3 indexes (stage 3 granularity of 4) */
#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
#define UCNV_EXT_STAGE_3_GRANULARITY 4
/* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
/*
 * The stage 1 entry selects a stage 2 block, bits 9..4 of c select the entry
 * in that block, and bits 3..0 of c select the stage 3 item.
 * The caller must have checked that s1Index is within the stage 1 length.
 */
#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
(stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]
/* a result value holds the roundtrip flag in bit 31, reserved bits 30..29, the length in bits 28..24 and data in bits 23..0 */
#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
/* bits 30..29; values with any of these bits set are not interpreted, for forward compatibility */
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001
/* at most 3 bytes in the lower part of the value */
#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3
/* maximum number of indexed bytes */
#define UCNV_EXT_MAX_BYTES 0x1f
/* a value with bits 31..24 all zero is the index of the next fromUTable section (partial match) */
#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)
#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
/* use after masking off the roundtrip flag */
/* UCNV_EXT_MAX_BYTES doubles as the 5-bit mask for the length field */
#define UCNV_EXT_FROM_U_GET_LENGTH(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
/* get bytes or bytes index */
/* lengths up to UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH: the bytes themselves; longer: index into fromUBytes[] */
#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)
#endif

View File

@ -46,6 +46,7 @@
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "ucnv_cnv.h"
#include "umutex.h"
#include "cmemory.h"
@ -56,9 +57,18 @@
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
/*
* _MBCSHeader versions 4.1
* _MBCSHeader versions 4.2
* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
*
* Change from version 4.1:
* - Added an optional extension table structure at the end of the .cnv file.
* It is present if the upper bits of the header flags field contain a non-zero
* byte offset to it.
* Files that contain only an extension table and no base table
* use the special outputType MBCS_OUTPUT_EXT_ONLY.
* These contain the base table name between the MBCS header and the extension
* data.
*
* Change from version 4.0:
* - Replace header.reserved with header.fromUBytesLength so that all
* fields in the data have length.
@ -524,11 +534,6 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
* code. The framework will then call the callback function.
*/
/*
* TODO when implementing real extensions, review whether the useFallback parameter
* should get cnv->useFallback or the full resolution considering cp as well
*/
/*
* @return if(U_FAILURE) return the code point for cnv->fromUChar32
* else return 0 after output has been written to the target
@ -539,10 +544,26 @@ _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
const UChar **source, const UChar *sourceLimit,
char **target, const char *targetLimit,
int32_t **offsets, int32_t sourceIndex,
UBool useFallback, UBool flush,
UBool flush,
UErrorCode *pErrorCode) {
const int32_t *cx;
cnv->useSubChar1=FALSE;
if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
ucnv_extInitialMatchFromU(
cnv, cx,
cp, source, sourceLimit,
target, targetLimit,
offsets, sourceIndex,
flush,
pErrorCode)
) {
return 0; /* an extension mapping handled the input */
}
/* GB 18030 */
if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
const uint32_t *range;
int32_t i;
@ -590,10 +611,24 @@ _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
const char **source, const char *sourceLimit,
UChar **target, const UChar *targetLimit,
int32_t **offsets, int32_t sourceIndex,
UBool useFallback, UBool flush,
UBool flush,
UErrorCode *pErrorCode) {
const int32_t *cx;
if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
ucnv_extInitialMatchToU(
cnv, cx,
length, source, sourceLimit,
target, targetLimit,
offsets, sourceIndex,
flush,
pErrorCode)
) {
return 0; /* an extension mapping handled the input */
}
/* GB 18030 */
if(length==4 && cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
const uint32_t *range;
uint32_t linear;
int32_t i;
@ -789,6 +824,7 @@ _MBCSLoad(UConverterSharedData *sharedData,
UDataInfo info;
UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
_MBCSHeader *header=(_MBCSHeader *)raw;
uint32_t offset;
if(header->version[0]!=4) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
@ -806,6 +842,12 @@ _MBCSLoad(UConverterSharedData *sharedData,
mbcsTable->fromUBytesLength=header->fromUBytesLength;
mbcsTable->outputType=(uint8_t)header->flags;
/* extension data, header version 4.2 and higher */
offset=header->flags>>8;
if(offset!=0) {
mbcsTable->extIndexes=(const int32_t *)(raw+offset);
}
/* make sure that the output type is known */
switch(mbcsTable->outputType) {
case MBCS_OUTPUT_1:
@ -817,6 +859,8 @@ _MBCSLoad(UConverterSharedData *sharedData,
case MBCS_OUTPUT_2_SISO:
/* OK */
break;
case MBCS_OUTPUT_EXT_ONLY:
/* ### TODO */
default:
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
@ -1062,7 +1106,7 @@ _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1, (const char **)&source, (const char *)sourceLimit,
&target, targetLimit,
&offsets, sourceIndex,
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
pArgs->flush,
pErrorCode);
sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
@ -1263,7 +1307,7 @@ unrolled:
1, (const char **)&source, (const char *)sourceLimit,
&target, target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
pArgs->flush,
pErrorCode);
sourceIndex+=1+(int32_t)(source-lastSource);
@ -1299,266 +1343,6 @@ unrolled:
pArgs->offsets=offsets;
}
/*
 * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
 * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
 *
 * @param pArgs      conversion arguments; pArgs->source is advanced past
 *                   the consumed byte, or left on the byte when deferring
 * @param pErrorCode set to U_ILLEGAL_CHAR_FOUND for an illegal byte, or to
 *                   U_INDEX_OUTOFBOUNDS_ERROR if there is no input to convert
 * @return the code point for a mapped byte;
 *         UCNV_GET_NEXT_UCHAR_USE_TO_U if the byte is unassigned (or only has
 *         a fallback that is not enabled) and the generic implementation
 *         must handle it;
 *         0xffff together with a failure code otherwise
 *
 * For an illegal byte, the byte is stored in cnv->toUBytes[] with
 * cnv->toULength==1 and the U_ILLEGAL_CHAR_FOUND code is returned unchanged,
 * the same way _MBCSGetNextUChar() reports illegal sequences.
 * (Previously the code fell out of the loop and the error was overwritten
 * with U_INDEX_OUTOFBOUNDS_ERROR, which reads as "no more input".)
 */
static UChar32
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
                        UErrorCode *pErrorCode) {
    UConverter *cnv;
    const int32_t (*stateTable)[256];
    const uint8_t *source, *sourceLimit;

    int32_t entry;
    uint8_t action;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=(const uint8_t *)pArgs->source;
    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
        stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
    } else {
        stateTable=cnv->sharedData->table->mbcs.stateTable;
    }

    /* conversion loop */
    while(source<sourceLimit) {
        entry=stateTable[0][*source++];
        /* MBCS_ENTRY_IS_FINAL(entry) */

        /* write back the updated pointer early so that we can return directly */
        pArgs->source=(const char *)source;

        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
            /* output BMP code point */
            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
        }

        /*
         * An if-else-if chain provides more reliable performance for
         * the most common cases compared to a switch.
         */
        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
        if( action==MBCS_STATE_VALID_DIRECT_20 ||
            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
        ) {
            /* output supplementary code point */
            return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
                /* output BMP code point */
                return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
            }
            /* fallback not enabled: treated like an unassigned byte below */
        } else if(action==MBCS_STATE_UNASSIGNED) {
            /* just fall through */
        } else if(action==MBCS_STATE_ILLEGAL) {
            /*
             * callback(illegal):
             * record the offending byte and return right away so that the
             * error code is not replaced by the end-of-input code below
             */
            cnv->toUBytes[0]=*(source-1);
            cnv->toULength=1;
            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
            return 0xffff;
        } else {
            /* reserved, must never occur */
            continue;
        }

        if(U_FAILURE(*pErrorCode)) {
            /* only reachable if the error code was already set by the caller */
            break;
        } else /* unassigned sequence */ {
            /* defer to the generic implementation */
            pArgs->source=(const char *)source-1;
            return UCNV_GET_NEXT_UCHAR_USE_TO_U;
        }
    }

    /* no output because of empty input or only state changes */
    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
    return 0xffff;
}
/*
 * Convert the next complete byte sequence from pArgs->source to one code point.
 *
 * Returns UCNV_GET_NEXT_UCHAR_USE_TO_U, without converting here, if
 * - the table is flagged with UCNV_HAS_SURROGATES, or
 * - the byte sequence turns out to be unassigned (pArgs->source is then set
 *   back to the start of that sequence).
 * Single-state tables are handed to _MBCSSingleGetNextUChar().
 *
 * On an illegal or truncated sequence, the consumed bytes are copied to
 * cnv->toUBytes[], cnv->toULength is set, *pErrorCode carries the failure,
 * and 0xffff is returned.
 * If no character starts in the input at all, *pErrorCode is set to
 * U_INDEX_OUTOFBOUNDS_ERROR and 0xffff is returned.
 */
static UChar32
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const uint8_t *source, *sourceLimit, *lastSource;
const int32_t (*stateTable)[256];
const uint16_t *unicodeCodeUnits;
uint32_t offset;
uint8_t state;
int32_t entry;
UChar32 c;
uint8_t action;
/* use optimized function if possible */
cnv=pArgs->converter;
if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
/*
* Using the generic ucnv_getNextUChar() code lets us deal correctly
* with the rare case of a codepage that maps single surrogates
* without adding the complexity to this already complicated function here.
*/
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
} else if(cnv->sharedData->table->mbcs.countStates==1) {
return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
}
/* set up the local pointers */
/* lastSource marks the first byte of the sequence that is currently being read */
source=lastSource=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
} else {
stateTable=cnv->sharedData->table->mbcs.stateTable;
}
unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
/* get the converter state from UConverter */
offset=cnv->toUnicodeStatus;
state=(uint8_t)(cnv->mode);
/* conversion loop */
/* c stays negative until a code point has been produced; this is tested after the loop */
c=U_SENTINEL;
while(source<sourceLimit) {
entry=stateTable[state][*source++];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
/* optimization for 1/2-byte input and BMP output */
/* the next byte is only peeked at here; it is consumed (++source) only if it completes a BMP result */
if( source<sourceLimit &&
MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
) {
++source;
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/* output BMP code point */
break;
}
} else {
/* set the next state early so that we can reuse the entry variable */
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if(action==MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
} else if(action==MBCS_STATE_VALID_16) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset];
if(c<0xfffe) {
/* output BMP code point */
break;
} else if(c==0xfffe) {
/* 0xfffe: no roundtrip result here; try the fallback lookup if fallbacks are enabled */
if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
break;
}
} else {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
} else if(action==MBCS_STATE_VALID_16_PAIR) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset++];
if(c<0xd800) {
/* output BMP code point below 0xd800 */
break;
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
/* output roundtrip or fallback supplementary code point */
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
break;
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
c=unicodeCodeUnits[offset];
break;
} else if(c==0xffff) {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
} else if(action==MBCS_STATE_VALID_DIRECT_20 ||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
) {
/* output supplementary code point */
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
break;
} else if(action==MBCS_STATE_CHANGE_ONLY) {
/*
* This serves as a state change without any output.
* It is useful for reading simple stateful encodings,
* for example using just Shift-In/Shift-Out codes.
* The 21 unused bits may later be used for more sophisticated
* state transitions.
*/
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
/* output BMP code point */
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
}
} else if(action==MBCS_STATE_UNASSIGNED) {
/* just fall through */
} else if(action==MBCS_STATE_ILLEGAL) {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
} else {
/* reserved (must never occur), or only state change */
offset=0;
lastSource=source;
continue;
}
/* end of action codes: prepare for a new character */
offset=0;
if(U_FAILURE(*pErrorCode)) {
/* callback(illegal) */
break;
} else /* unassigned sequence */ {
/* defer to the generic implementation */
/* the state is stored and the source is rewound to the start of the sequence before deferring */
cnv->toUnicodeStatus=0;
cnv->mode=state;
pArgs->source=(const char *)lastSource;
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
}
}
if(c<0) {
/* input ended inside a sequence: report it as truncated */
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
if(U_FAILURE(*pErrorCode)) {
/* incomplete character byte sequence */
uint8_t *bytes=cnv->toUBytes;
cnv->toULength=(int8_t)(source-lastSource);
do {
*bytes++=*lastSource++;
} while(lastSource<source);
} else {
/* no output because of empty input or only state changes */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
}
c=0xffff;
}
/* set the converter state back into UConverter, ready for a new character */
cnv->toUnicodeStatus=0;
cnv->mode=state;
/* write back the updated pointer */
pArgs->source=(const char *)source;
return c;
}
U_CFUNC void
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@ -1584,6 +1368,19 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
/* use optimized function if possible */
cnv=pArgs->converter;
if(cnv->preToULength>0) {
/*
* pass sourceIndex=-1 because we continue from an earlier buffer
* in the future, this may change with continuous offsets
*/
ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
return;
}
}
if(cnv->sharedData->table->mbcs.countStates==1) {
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
@ -1890,7 +1687,7 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
byteIndex, (const char **)&source, (const char *)sourceLimit,
&target, targetLimit,
&offsets, sourceIndex,
(UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
pArgs->flush,
pErrorCode);
sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
@ -1912,6 +1709,328 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
pArgs->offsets=offsets;
}
/*
* This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
* We still need a conversion loop in case we find reserved action codes, which are to be ignored.
*
* Each input byte is looked up in row 0 of the state table (or of the
* swapLFNL variant of the table when UCNV_OPTION_SWAP_LFNL is set).
*
* pArgs supplies the converter and the source/sourceLimit byte range;
* pArgs->source is advanced past every byte that is consumed.
* pErrorCode receives U_INDEX_OUTOFBOUNDS_ERROR when the function
* returns 0xffff.
*
* Returns
* - a BMP or supplementary code point for a mapped byte,
* - UCNV_GET_NEXT_UCHAR_USE_TO_U, with pArgs->source moved back onto the byte,
*   for an unassigned byte or a fallback that is not enabled,
* - 0xffff when the loop ends without producing a code point.
*/
static UChar32
_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const int32_t (*stateTable)[256];
const uint8_t *source, *sourceLimit;
int32_t entry;
uint8_t action;
/* set up the local pointers */
cnv=pArgs->converter;
source=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
/* pick the state table variant that matches the converter's swaplfnl option */
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
} else {
stateTable=cnv->sharedData->table->mbcs.stateTable;
}
/* conversion loop */
while(source<sourceLimit) {
entry=stateTable[0][*source++];
/* MBCS_ENTRY_IS_FINAL(entry) */
/* write back the updated pointer early so that we can return directly */
pArgs->source=(const char *)source;
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
}
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if( action==MBCS_STATE_VALID_DIRECT_20 ||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
) {
/* output supplementary code point */
return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
}
/* fallbacks are disabled: handled below like an unassigned byte */
} else if(action==MBCS_STATE_UNASSIGNED) {
/* just fall through */
} else if(action==MBCS_STATE_ILLEGAL) {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
} else {
/* reserved, must never occur */
continue;
}
if(U_FAILURE(*pErrorCode)) {
/* callback(illegal) */
/*
* NOTE(review): after this break, the statement following the loop
* replaces U_ILLEGAL_CHAR_FOUND with U_INDEX_OUTOFBOUNDS_ERROR, and the
* illegal byte is not stored in the converter here (the multi-byte
* _MBCSGetNextUChar() copies error bytes to cnv->toUBytes).
* Confirm against the caller that this is intended.
*/
break;
} else /* unassigned sequence */ {
/* defer to the generic implementation */
/* un-read the byte so that the generic code sees it again */
pArgs->source=(const char *)source-1;
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
}
/* no output because of empty input or only state changes */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
return 0xffff;
}
/*
* Reads the next code point from the MBCS byte stream in pArgs.
*
* The function returns UCNV_GET_NEXT_UCHAR_USE_TO_U, leaving the work to the
* generic implementation, when
* - the table has extension indexes (temporary, see the TODO below),
* - the table's unicodeMask has UCNV_HAS_SURROGATES set, or
* - the loop reaches the common "unassigned sequence" code; in that case
*   pArgs->source is reset to the start of the current byte sequence.
* Single-state tables are handed to _MBCSSingleGetNextUChar().
*
* Otherwise the state table is walked byte by byte, starting from the
* state/offset saved in the converter (cnv->mode, cnv->toUnicodeStatus).
*
* Returns the code point, or 0xffff with *pErrorCode set:
* - U_ILLEGAL_CHAR_FOUND or U_TRUNCATED_CHAR_FOUND, with the offending bytes
*   copied to cnv->toUBytes and their count stored in cnv->toULength,
* - U_INDEX_OUTOFBOUNDS_ERROR when no bytes are pending and nothing was output.
*/
static UChar32
_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
UConverter *cnv;
const uint8_t *source, *sourceLimit, *lastSource;
const int32_t (*stateTable)[256];
const uint16_t *unicodeCodeUnits;
uint32_t offset;
uint8_t state;
int32_t entry;
UChar32 c;
uint8_t action;
/* use optimized function if possible */
cnv=pArgs->converter;
/* ### TODO extension */
if(cnv->sharedData->table->mbcs.extIndexes!=NULL) {
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
/* ### TODO end cheap-trick extension */
if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
/*
* Using the generic ucnv_getNextUChar() code lets us deal correctly
* with the rare case of a codepage that maps single surrogates
* without adding the complexity to this already complicated function here.
*/
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
} else if(cnv->sharedData->table->mbcs.countStates==1) {
return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
}
/* set up the local pointers */
/* lastSource marks the first byte of the sequence currently being read */
source=lastSource=(const uint8_t *)pArgs->source;
sourceLimit=(const uint8_t *)pArgs->sourceLimit;
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
} else {
stateTable=cnv->sharedData->table->mbcs.stateTable;
}
unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
/* get the converter state from UConverter */
offset=cnv->toUnicodeStatus;
state=(uint8_t)(cnv->mode);
/* conversion loop */
/* c stays negative (U_SENTINEL) until a code point has been found */
c=U_SENTINEL;
while(source<sourceLimit) {
entry=stateTable[state][*source++];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
/* optimization for 1/2-byte input and BMP output */
/*
* The next byte is only peeked at here; it is consumed (++source) only
* if the shortcut succeeds. Otherwise the next loop iteration reads
* the same byte again and handles it in the general code below.
*/
if( source<sourceLimit &&
MBCS_ENTRY_IS_FINAL(entry=stateTable[state][*source]) &&
MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16 &&
(c=unicodeCodeUnits[offset+MBCS_ENTRY_FINAL_VALUE_16(entry)])<0xfffe
) {
++source;
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/* output BMP code point */
break;
}
} else {
/* set the next state early so that we can reuse the entry variable */
state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if(action==MBCS_STATE_VALID_DIRECT_16) {
/* output BMP code point */
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
} else if(action==MBCS_STATE_VALID_16) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset];
if(c<0xfffe) {
/* output BMP code point */
break;
} else if(c==0xfffe) {
/* 0xfffe: no roundtrip mapping; use a fallback if enabled and one is found */
if(UCNV_TO_U_USE_FALLBACK(cnv) && (c=_MBCSGetFallback(&cnv->sharedData->table->mbcs, offset))!=0xfffe) {
break;
}
} else {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
} else if(action==MBCS_STATE_VALID_16_PAIR) {
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
c=unicodeCodeUnits[offset++];
if(c<0xd800) {
/* output BMP code point below 0xd800 */
break;
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
/* output roundtrip or fallback supplementary code point */
c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
break;
} else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
/* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
c=unicodeCodeUnits[offset];
break;
} else if(c==0xffff) {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
}
} else if(action==MBCS_STATE_VALID_DIRECT_20 ||
(action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
) {
/* output supplementary code point */
c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
break;
} else if(action==MBCS_STATE_CHANGE_ONLY) {
/*
* This serves as a state change without any output.
* It is useful for reading simple stateful encodings,
* for example using just Shift-In/Shift-Out codes.
* The 21 unused bits may later be used for more sophisticated
* state transitions.
*/
/* nothing is done here: control continues at "end of action codes" below */
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(UCNV_TO_U_USE_FALLBACK(cnv)) {
/* output BMP code point */
c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
break;
}
} else if(action==MBCS_STATE_UNASSIGNED) {
/* just fall through */
} else if(action==MBCS_STATE_ILLEGAL) {
/* callback(illegal) */
*pErrorCode=U_ILLEGAL_CHAR_FOUND;
} else {
/* reserved (must never occur), or only state change */
/* skip the bytes read so far and start a new sequence after them */
offset=0;
lastSource=source;
continue;
}
/* end of action codes: prepare for a new character */
offset=0;
if(U_FAILURE(*pErrorCode)) {
/* callback(illegal) */
break;
} else /* unassigned sequence */ {
/* defer to the generic implementation */
/* save the state and rewind to the first byte of this sequence */
cnv->toUnicodeStatus=0;
cnv->mode=state;
pArgs->source=(const char *)lastSource;
return UCNV_GET_NEXT_UCHAR_USE_TO_U;
}
}
}
/* c<0: the loop ended without a code point (error, truncated input, or no input) */
if(c<0) {
if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSource<source) {
/* the input ended in the middle of a byte sequence */
*pErrorCode=U_TRUNCATED_CHAR_FOUND;
}
if(U_FAILURE(*pErrorCode)) {
/* incomplete character byte sequence */
/* hand the bytes from lastSource up to source to the converter's error buffer */
uint8_t *bytes=cnv->toUBytes;
cnv->toULength=(int8_t)(source-lastSource);
do {
*bytes++=*lastSource++;
} while(lastSource<source);
} else {
/* no output because of empty input or only state changes */
*pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
}
c=0xffff;
}
/* set the converter state back into UConverter, ready for a new character */
cnv->toUnicodeStatus=0;
cnv->mode=state;
/* write back the updated pointer */
pArgs->source=(const char *)source;
return c;
}
#if 0
/*
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
* Removal improves code coverage.
*/
/**
* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It does not handle conversion extensions (_extToU()).
*
* Looks up the single byte b in row 0 of the shared data's state table.
* useFallback is passed to TO_U_USE_FALLBACK() to decide whether fallback
* mappings may be returned.
*
* Returns the mapped code point, 0xfffe for an unassigned byte or a fallback
* that is not used, and 0xffff for an illegal or reserved entry.
*/
U_CFUNC UChar32
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
uint8_t b, UBool useFallback) {
int32_t entry;
uint8_t action;
entry=sharedData->table->mbcs.stateTable[0][b];
/* MBCS_ENTRY_IS_FINAL(entry) */
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
}
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if(action==MBCS_STATE_VALID_DIRECT_20) {
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(!TO_U_USE_FALLBACK(useFallback)) {
/* fallback not wanted: report as unassigned */
return 0xfffe;
}
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
if(!TO_U_USE_FALLBACK(useFallback)) {
/* fallback not wanted: report as unassigned */
return 0xfffe;
}
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
} else if(action==MBCS_STATE_UNASSIGNED) {
return 0xfffe;
} else if(action==MBCS_STATE_ILLEGAL) {
return 0xffff;
} else {
/* reserved, must never occur */
return 0xffff;
}
}
#endif
/*
* This is a simple version of getNextUChar() that is used
* by other converter implementations.
@ -1945,6 +2064,8 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
return 0xffff;
}
/* ### TODO extension */
#if 0
/*
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
@ -2054,61 +2175,6 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
return 0xffff;
}
#if 0
/*
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
* Removal improves code coverage.
*/
/**
* This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
* It does not handle the EBCDIC swaplfnl option (set in UConverter).
* It does not handle conversion extensions (_extToU()).
*
* Looks up the single byte b in row 0 of the shared data's state table.
* useFallback is passed to TO_U_USE_FALLBACK() to decide whether fallback
* mappings may be returned.
*
* Returns the mapped code point, 0xfffe for an unassigned byte or a fallback
* that is not used, and 0xffff for an illegal or reserved entry.
*/
U_CFUNC UChar32
_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
uint8_t b, UBool useFallback) {
int32_t entry;
uint8_t action;
entry=sharedData->table->mbcs.stateTable[0][b];
/* MBCS_ENTRY_IS_FINAL(entry) */
if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
}
/*
* An if-else-if chain provides more reliable performance for
* the most common cases compared to a switch.
*/
action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
if(action==MBCS_STATE_VALID_DIRECT_20) {
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
} else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
if(!TO_U_USE_FALLBACK(useFallback)) {
/* fallback not wanted: report as unassigned */
return 0xfffe;
}
/* output BMP code point */
return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
} else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
if(!TO_U_USE_FALLBACK(useFallback)) {
/* fallback not wanted: report as unassigned */
return 0xfffe;
}
/* output supplementary code point */
return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
} else if(action==MBCS_STATE_UNASSIGNED) {
return 0xfffe;
} else if(action==MBCS_STATE_ILLEGAL) {
return 0xffff;
} else {
/* reserved, must never occur */
return 0xffff;
}
}
#endif
/* MBCS-from-Unicode conversion functions ----------------------------------- */
/* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
@ -2251,7 +2317,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
@ -2454,7 +2520,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
@ -2681,7 +2747,7 @@ getTrail:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pArgs->flush,
pErrorCode);
sourceIndex+=length+(int32_t)(source-lastSource);
lastSource=source;
@ -2744,8 +2810,21 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t length, prevLength;
uint8_t unicodeMask;
/* use optimized function if possible */
cnv=pArgs->converter;
if(cnv->preFromUFirstCP>=0) {
/*
* pass sourceIndex=-1 because we continue from an earlier buffer
* in the future, this may change with continuous offsets
*/
ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
return;
}
}
/* use optimized function if possible */
outputType=cnv->sharedData->table->mbcs.outputType;
unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
@ -2768,6 +2847,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
offsets=pArgs->offsets;
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
} else {
@ -3025,7 +3105,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
(UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
@ -3222,6 +3302,8 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
uint32_t value;
int32_t length;
/* ### TODO extension mapping */
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
return 0;
@ -3404,7 +3486,11 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
int32_t length;
/* first, select between subChar and subChar1 */
if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) {
if( cnv->subChar1!=0 &&
(cnv->sharedData->table->mbcs.extIndexes!=NULL ?
cnv->useSubChar1 :
(cnv->invalidUCharBuffer[0]<=0xff))
) {
/* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
subchar=(char *)&cnv->subChar1;
length=1;
@ -3414,6 +3500,9 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
length=cnv->subCharLen;
}
/* reset the selector for the next code point */
cnv->useSubChar1=FALSE;
switch(cnv->sharedData->table->mbcs.outputType) {
case MBCS_OUTPUT_2_SISO:
p=buffer;

View File

@ -38,6 +38,10 @@
* the same toUnicode structures, while the fromUnicode structures for SBCS
* differ from those for other MBCS-style converters.
*
* _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
* If it is present, then an ICU version reading header versions 4.0 or 4.1
* will be able to use the base table and ignore the extension.
*
* MBCS-style data structure following the static data.
* Offsets are counted in bytes from the beginning of the MBCS header structure.
* Details about usage in comments in ucnvmbcs.c.
@ -45,18 +49,28 @@
* struct _MBCSHeader (see the definition in this header file below)
* contains 32-bit fields as follows:
* 8 values:
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.1.0.0)
* 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
* 1 uint32_t countStates
* 2 uint32_t countToUFallbacks
* 3 uint32_t offsetToUCodeUnits
* 4 uint32_t offsetFromUTable
* 5 uint32_t offsetFromUBytes
* 6 uint32_t flags, bits:
* 31.. 8 reserved
* 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
* 0 for older versions and if
* there is no extension structure
* 7.. 0 outputType
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
* counts bytes in fromUBytes[]
*
* if(outputType==MBCS_OUTPUT_EXT_ONLY) {
* -- base table name for extension-only table
* char baseTableName[variable]; -- with NUL plus padding for 4-alignment
*
* -- all _MBCSHeader fields except for version and flags are 0
* } else {
* -- normal base table with optional extension
*
* int32_t stateTable[countStates][256];
*
* struct _MBCSToUFallback { (fallbacks are sorted by offset)
@ -96,10 +110,18 @@
* uint16_t fromUBytes[fromUBytesLength/2]; or
* uint32_t fromUBytes[fromUBytesLength/4];
* }
* }
*
* -- extension table, details see ucnv_ext.h
* int32_t indexes[>=32]; ...
*/
/* MBCS converter data and state -------------------------------------------- */
enum {
MBCS_MAX_STATE_COUNT=128
};
/**
* MBCS action codes for conversions to Unicode.
* These values are in bits 23..20 of the state table entries.
@ -175,7 +197,11 @@ enum {
MBCS_OUTPUT_4_EUC, /* 9 */
MBCS_OUTPUT_2_SISO=12, /* c */
MBCS_OUTPUT_2_HZ /* d */
MBCS_OUTPUT_2_HZ, /* d */
MBCS_OUTPUT_EXT_ONLY, /* e */
MBCS_OUTPUT_COUNT
};
/**
@ -210,6 +236,9 @@ typedef struct UConverterMBCSTable {
/* converter name for swaplfnl */
char *swapLFNLName;
/* extension data */
const int32_t *extIndexes;
} UConverterMBCSTable;
/**

View File

@ -455,7 +455,7 @@ ucnv_safeClone(const UConverter *cnv,
UErrorCode *status);
/** @stable ICU 2.0 */
#define U_CNV_SAFECLONE_BUFFERSIZE 3072
#define U_CNV_SAFECLONE_BUFFERSIZE 4096
/**
* Deletes the unicode converter and releases resources associated

View File

@ -43,6 +43,16 @@ conversion {
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
// extensions
{
"*test3",
:bin{ 00050601020b0701020a01020c },
"\u20ac\x05\x06\x0b\U00101234\U00023456\ufffd",
:intvector{ 0, 1, 2, 3, 6, 6, 7, 7, 10 },
:int{1}, :int{0}, "", "?", :bin{""}
}
// normal conversions
{
"UTF-16LE",
:bin{ 310000d801dc00d902dc320000d8330001dc3400 },
@ -110,6 +120,24 @@ conversion {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
// extensions
{
"*test3",
"\xc4\xc4\xc4\U00101234\xc4\xc4\U00101234\x05",
:bin{ ffffff070501020c },
:intvector{ 0, 1, 2, 3, 5, 5, 5, 5 },
:int{1}, :int{0}, "", "?", ""
}
{
"*test3",
"\U00101234\U00101234\U00050005\U00101234\U00050005\U00060006",
:bin{ 07070001020e05070001020f09 },
:intvector{ 0, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6 },
:int{1}, :int{0}, "", "?", ""
}
// normal conversions
{
"UTF-16LE",
"1\U00010001\U000500022\ud8003\udc014",

View File

@ -1,11 +1,12 @@
# *******************************************************************************
# * Copyright (C) 2001, International Business Machines
# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test1.ucm
#
# Test file for MBCS conversion with single-byte codepage data.
# Also contains extension mappings (m:n).
<code_set_name> "test1"
<mb_cur_max> 1

View File

@ -1,11 +1,12 @@
# *******************************************************************************
# * Copyright (C) 2001, International Business Machines
# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test3.ucm
#
# Test file for MBCS conversion with three-byte codepage data.
# Also contains extension mappings (m:n).
<code_set_name> "test3"
<mb_cur_max> 3
@ -24,6 +25,11 @@ CHARMAP
# nothing special
<U0005> \x05 |0
# extensions
<U00c0> \x05+\x01\x02\x0d |0
<U00c0> \x05+\x01\x02\x0e |3
<U00c0> \x05+\xff |3
# toUnicode result is fallback direct
<U0006> \x06 |3
@ -31,8 +37,18 @@ CHARMAP
<U101234> \x07 |0
<Ufebcd> \x08 |3
# extensions
<U101234>+<U50005>+<U60006> \x07+\x00+\x01\x02\x0f+\x09 |0
<U101234>+<U50005> \x07+\x00+\x01\x02\x0e+\x05 |0
<U101234>+<U60006> \x07+\x00+\x01\x02\x0f+\x06 |0
<U101234>+<U70007> \x07+\x00+\x01\x02\x0f |1
#unassigned \x09
# extensions where the first code point is unassigned, for replay testing
#<U00c4><U0300> \x09+\x09 |0
<U00c4><U00c4><U101234><U0005> \x05+\x01\x02\x0c |0
# toUnicode result is surrogate pair: test real pair, single unit, unassigned
<U23456> \x01\x02\x0a |0
<U000b> \x01\x02\x0b |0

View File

@ -1,11 +1,11 @@
# *******************************************************************************
# * Copyright (C) 2001, International Business Machines
# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test4.ucm
#
# Test file for MBCS conversion with three-byte codepage data.
# Test file for MBCS conversion with four-byte codepage data.
<code_set_name> "test4"
<mb_cur_max> 4

View File

@ -26,6 +26,7 @@
#include "unicode/udata.h"
#include "utrie.h"
#include "unicode/uset.h"
#include "toolutil.h"
#include "unewdata.h"
#include "unormimp.h"
#include "gennorm.h"
@ -86,87 +87,6 @@ setUnicodeVersion(const char *v) {
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
/* tool memory helper ------------------------------------------------------- */
/*
* UToolMemory is used for generic, custom memory management.
* It is allocated with enough space for count*size bytes starting
* at array.
* The array is declared with a union of large data types so
* that its base address is aligned for any types.
* If size is a multiple of a data type size, then such items
* can be safely allocated inside the array, at offsets that
* are themselves multiples of size.
*/
typedef struct UToolMemory {
/* label used in this pool's error messages */
char name[64];
/*
* count: number of items the pool has room for
* size:  number of bytes per item
* index: number of items handed out so far
*/
uint32_t count, size, index;
/* start of the item storage; the allocation extends beyond this one element */
union {
uint32_t u;
double d;
void *p;
} array[1];
} UToolMemory;
/*
 * Creates a memory pool with room for count items of size bytes each.
 *
 * name is copied into the pool for use in error messages; it is truncated
 * if it does not fit into the fixed-size name field.
 * The function does not return on failure: it prints an error message and
 * exits with U_MEMORY_ALLOCATION_ERROR if the memory cannot be allocated
 * or if the requested byte size cannot be represented.
 *
 * Returns the new pool with no items handed out yet (index 0).
 * Release it with utm_close().
 */
static UToolMemory *
utm_open(const char *name, uint32_t count, uint32_t size) {
    UToolMemory *mem=NULL;
    size_t i;

    /*
     * Compute the byte size in size_t and refuse requests that would wrap
     * around, rather than silently allocating an array that is too small.
     */
    if(size==0 || count<=((size_t)-1-sizeof(UToolMemory))/size) {
        mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+(size_t)count*size);
    }
    if(mem==NULL) {
        fprintf(stderr, "error: %s - out of memory\n", name);
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* bounded copy: never write past the end of mem->name */
    for(i=0; i<sizeof(mem->name)-1 && name[i]!=0; ++i) {
        mem->name[i]=name[i];
    }
    mem->name[i]=0;

    mem->count=count;
    mem->size=size;
    mem->index=0;
    return mem;
}
/* Releases a pool created by utm_open(); a NULL pool is ignored. */
static void
utm_close(UToolMemory *mem) {
    if(mem==NULL) {
        return;
    }
    uprv_free(mem);
}
/* Returns the address of the first item slot of the pool. */
static void *
utm_getStart(UToolMemory *mem) {
    char *start=(char *)mem->array;
    return start;
}
/* Returns the number of items that have been handed out from the pool. */
static int32_t
utm_countItems(UToolMemory *mem) {
    int32_t used=(int32_t)mem->index;
    return used;
}
/*
 * Hands out the next item of the pool, cleared to zero.
 * Prints an error and exits with U_MEMORY_ALLOCATION_ERROR if the pool
 * is already used up.
 */
static void *
utm_alloc(UToolMemory *mem) {
    /* remember the slot number, then count the item as used */
    uint32_t slot=mem->index++;
    char *item;

    if(mem->index>mem->count) {
        fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
                mem->name, (long)mem->count);
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    item=(char *)mem->array+slot*mem->size;
    uprv_memset(item, 0, mem->size);
    return item;
}
/*
 * Hands out n consecutive items of the pool, cleared to zero, and returns
 * the address of the first one.
 *
 * Prints an error and exits with U_MEMORY_ALLOCATION_ERROR if fewer than n
 * items are left. A negative n is rejected the same way: converted to
 * unsigned it would wrap the item counter and could slip past the capacity
 * check.
 */
static void *
utm_allocN(UToolMemory *mem, int32_t n) {
    char *p=(char *)mem->array+mem->index*mem->size;

    /* compare against the remaining room so that index+n cannot wrap around */
    if(n<0 || mem->index>mem->count || (uint32_t)n>mem->count-mem->index) {
        fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
                mem->name, (long)mem->count);
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    mem->index+=(uint32_t)n;
    uprv_memset(p, 0, (uint32_t)n*mem->size);
    return p;
}
/* builder data ------------------------------------------------------------- */
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
@ -244,23 +164,23 @@ init() {
}
/* allocate Norm structures and reset the first one */
normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm));
normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
norms=utm_alloc(normMem);
/* allocate UTF-32 string memory */
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4);
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
/* reset all "have seen" flags */
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
/* allocate extra data memory for UTF-16 decomposition strings and other values */
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2);
extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
/* initialize the extraMem counter for the top of FNC strings */
p16=(uint16_t *)utm_alloc(extraMem);
*p16=1;
/* allocate temporary memory for combining triples */
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple));
combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
/* set the minimum code points for no/maybe quick check values to the end of the BMP */
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
@ -508,7 +428,7 @@ processCombining() {
triples=utm_getStart(combiningTriplesMem);
/* add lead and trail indexes to the triples for sorting */
count=(uint16_t)combiningTriplesMem->index;
count=(uint16_t)utm_countItems(combiningTriplesMem);
for(i=0; i<count; ++i) {
/* findCombiningCP() must always find the code point */
triples[i].leadIndex=findCombiningCP(triples[i].lead, TRUE);
@ -1265,7 +1185,7 @@ makeAll32() {
uint32_t n;
int32_t i, normLength, count;
count=(int32_t)normMem->index;
count=(int32_t)utm_countItems(normMem);
for(i=0; i<count; ++i) {
norms[i].value32=make32BitNorm(norms+i);
}
@ -1292,7 +1212,7 @@ makeFCD() {
int32_t i, count, fcdLength;
uint16_t bothCCs;
count=(int32_t)normMem->index;
count=utm_countItems(normMem);
for(i=0; i<count; ++i) {
bothCCs=norms[i].canonBothCCs;
if(bothCCs==0) {
@ -1400,7 +1320,7 @@ combine(uint32_t lead, uint32_t trail) {
/* search for all triples with c as lead code point */
triples=utm_getStart(combiningTriplesMem);
count=combiningTriplesMem->index;
count=utm_countItems(combiningTriplesMem);
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
for(i=0; i<count && lead!=triples[i].lead; ++i) {}
@ -1512,7 +1432,7 @@ canChangeWithFollowing(const uint32_t *s, int32_t length, uint8_t trailCC) {
/* search for all triples with c as lead code point */
triples=utm_getStart(combiningTriplesMem);
count=combiningTriplesMem->index;
count=utm_countItems(combiningTriplesMem);
c=s[0];
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
@ -1838,7 +1758,7 @@ generateData(const char *dataDir) {
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
/* make sure that the FCD trie is 4-aligned */
if((extraMem->index+combiningTableTop)&1) {
if((utm_countItems(extraMem)+combiningTableTop)&1) {
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
}
@ -1850,7 +1770,7 @@ generateData(const char *dataDir) {
size=
_NORM_INDEX_TOP*4+
normTrieSize+
extraMem->index*2+
utm_countItems(extraMem)*2+
combiningTableTop*2+
fcdTrieSize+
auxTrieSize+
@ -1858,7 +1778,7 @@ generateData(const char *dataDir) {
if(beVerbose) {
printf("size of normalization trie %5u bytes\n", normTrieSize);
printf("size of 16-bit extra memory %5u UChars/uint16_t\n", extraMem->index);
printf("size of 16-bit extra memory %5u UChars/uint16_t\n", utm_countItems(extraMem));
printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
printf("size of combining table %5u uint16_t\n", combiningTableTop);
printf("size of FCD trie %5u bytes\n", fcdTrieSize);
@ -1873,7 +1793,7 @@ generateData(const char *dataDir) {
}
indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;
indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
@ -1900,7 +1820,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, normTrieBlock, normTrieSize);
udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2);
udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
@ -1928,7 +1848,7 @@ extern void
cleanUpData(void) {
int32_t i, count;
count=(int32_t)normMem->index;
count=utm_countItems(normMem);
for(i=0; i<count; ++i) {
uset_close(norms[i].canonStart);
}

View File

@ -34,7 +34,7 @@ TARGET = makeconv$(EXEEXT)
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = makeconv.o ucnvstat.o genmbcs.o
OBJECTS = makeconv.o ucnvstat.o genmbcs.o gencnvex.o
DEPS = $(OBJECTS:.o=.d)

View File

@ -0,0 +1,996 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: gencnvex.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003oct12
* created by: Markus W. Scherer
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "cmemory.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
#include "toolutil.h"
#include "unewdata.h"
#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
static void
CnvExtClose(NewConverter *cnvData);
static UBool
CnvExtIsValid(NewConverter *cnvData,
const uint8_t *bytes, int32_t length);
static UBool
CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
static uint32_t
CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
UNewDataMemory *pData, int32_t tableType);
/*
* Builder state for the conversion extension table.
* CnvExtClose() casts a NewConverter pointer back to CnvExtData,
* which relies on newConverter being the first member.
*/
typedef struct CnvExtData {
/* function table; filled in by CnvExtOpen() */
NewConverter newConverter;
/* aliased, not owned (see CnvExtOpen()) */
UCMFile *ucm;
/* toUnicode (state table in ucm->states) */
UToolMemory *toUTable, *toUUChars;
/* fromUnicode */
UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes;
uint16_t stage1[MBCS_STAGE_1_SIZE];
uint16_t stage2[MBCS_STAGE_2_SIZE];
uint16_t stage3[0x10000<<UCNV_EXT_STAGE_2_LEFT_SHIFT]; /* 0x10000 because of 16-bit stage 2/3 indexes */
uint32_t stage3b[0x10000];
/* presumably the number of used entries in each stage array - TODO confirm */
int32_t stage1Top, stage2Top, stage3Top, stage3bTop;
/* for stage3 compaction of <subchar1> |2 mappings */
uint16_t stage3Sub1Block;
} CnvExtData;
/*
 * Creates the builder object for a conversion extension table.
 *
 * ucm is the parsed .ucm file; it is aliased, not owned, and must stay
 * valid for as long as the returned object is used.
 *
 * Returns the object's NewConverter interface, or NULL if memory could not
 * be allocated. Release the object through its close function.
 */
NewConverter *
CnvExtOpen(UCMFile *ucm) {
    CnvExtData *extData;

    extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData));
    if(extData==NULL) {
        /* do not form a member address from a NULL pointer */
        return NULL;
    }

    uprv_memset(extData, 0, sizeof(CnvExtData));

    extData->ucm=ucm; /* aliased, not owned */

    extData->newConverter.close=CnvExtClose;
    extData->newConverter.isValid=CnvExtIsValid;
    extData->newConverter.addTable=CnvExtAddTable;
    extData->newConverter.write=CnvExtWrite;

    return &extData->newConverter;
}
/*
 * Releases an object created by CnvExtOpen(): closes all of its memory
 * pools and then frees the object itself. A NULL pointer is ignored.
 * The UCMFile is aliased, not owned, and is therefore left alone.
 */
static void
CnvExtClose(NewConverter *cnvData) {
    CnvExtData *extData=(CnvExtData *)cnvData;
    if(extData!=NULL) {
        utm_close(extData->toUTable);
        utm_close(extData->toUUChars);
        utm_close(extData->fromUTableUChars);
        utm_close(extData->fromUTableValues);
        utm_close(extData->fromUBytes);

        /* the object was allocated with uprv_malloc() in CnvExtOpen() */
        uprv_free(extData);
    }
}
/*
 * Placeholder for the NewConverter isValid slot.
 * This is not expected to be called for extension data; it ignores its
 * arguments and always reports FALSE.
 */
static UBool
CnvExtIsValid(NewConverter *cnvData,
              const uint8_t *bytes, int32_t length) {
    (void)cnvData;
    (void)bytes;
    (void)length;
    return FALSE;
}
/*
 * NewConverter.write callback: serialize the extension table to pData.
 *
 * For an extension-only file (TABLE_BASE not set in tableType) an _MBCSHeader
 * and the 4-byte-padded base table name are written first.
 * Then indexes[] is written, followed by the toUnicode arrays, the
 * fromUnicode arrays and the trie stages, in the same order in which their
 * byte offsets were recorded in indexes[].
 *
 * staticData is not used by this function.
 * Returns the number of bytes that should have been written:
 * the header size plus indexes[UCNV_EXT_SIZE].
 */
static uint32_t
CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
            UNewDataMemory *pData, int32_t tableType) {
    CnvExtData *extData=(CnvExtData *)cnvData;
    int32_t length, top, headerSize;

    int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 };

    if(tableType&TABLE_BASE) {
        headerSize=0;
    } else {
        _MBCSHeader header={ 0 };

        /* write the header and base table name for an extension-only table */
        length=(int32_t)uprv_strlen(extData->ucm->baseName)+1;
        while(length&3) {
            /* add padding */
            /* NOTE(review): assumes baseName has room for up to 3 padding bytes - confirm against UCMFile */
            extData->ucm->baseName[length++]=0;
        }

        headerSize=sizeof(header)+length;

        /* fill the header */
        header.version[0]=4;
        header.version[1]=2;
        header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);

        /* write the header and the base table name */
        udata_writeBlock(pData, &header, sizeof(header));
        udata_writeBlock(pData, extData->ucm->baseName, length);
    }

    /* fill indexes[] - offsets/indexes are in units of the target array */
    /* top is the running byte offset from the start of indexes[] */
    top=0;

    indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH;
    top+=length*4;

    indexes[UCNV_EXT_TO_U_INDEX]=top;
    indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable);
    top+=length*4;

    indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top;
    indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars);
    top+=length*2;

    indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top;
    length=utm_countItems(extData->fromUTableUChars);
    top+=length*2;

    if(top&3) {
        /* add padding */
        /* the UChars and values arrays are parallel, so both grow by one item */
        *((UChar *)utm_alloc(extData->fromUTableUChars))=0;
        *((uint32_t *)utm_alloc(extData->fromUTableValues))=0;
        ++length;
        top+=2;
    }

    indexes[UCNV_EXT_FROM_U_LENGTH]=length;

    indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top;
    top+=length*4;

    indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top;
    length=utm_countItems(extData->fromUBytes);
    top+=length;

    if(top&1) {
        /* add padding */
        *((uint8_t *)utm_alloc(extData->fromUBytes))=0;
        ++length;
        ++top;
    }

    indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length;

    /* stage 1 and stage 2 are written back to back and indexed as one array */
    indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top;
    indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top;
    indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top;
    top+=length*2;

    indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top;
    length=extData->stage3Top;
    top+=length*2;

    if(top&3) {
        /* add padding */
        extData->stage3[extData->stage3Top++]=0;
        ++length;
        top+=2;
    }

    indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length;

    indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top;
    indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop;
    top+=length*4;

    indexes[UCNV_EXT_SIZE]=top;

    /* write the extension data */
    udata_writeBlock(pData, indexes, sizeof(indexes));
    udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
    udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2);

    udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2);
    udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4);
    udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]);

    udata_writeBlock(pData, extData->stage1, extData->stage1Top*2);
    udata_writeBlock(pData, extData->stage2, extData->stage2Top*2);
    udata_writeBlock(pData, extData->stage3, extData->stage3Top*2);
    udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4);

    /*
     * dump of the non-default trie entries to stdout
     * NOTE(review): this runs unconditionally, unlike the VERBOSE message below - confirm whether that is intended
     */
    {
        int32_t i, j;

        length=extData->stage1Top;
        printf("\nstage1[%x]:\n", length);

        for(i=0; i<length; ++i) {
            if(extData->stage1[i]!=length) {
                printf("stage1[%04x]=%04x\n", i, extData->stage1[i]);
            }
        }

        j=length;
        length=extData->stage2Top;
        printf("\nstage2[%x]:\n", length);

        for(i=0; i<length; ++j, ++i) {
            if(extData->stage2[i]!=0) {
                printf("stage12[%04x]=%04x\n", j, extData->stage2[i]);
            }
        }

        length=extData->stage3Top;
        printf("\nstage3[%x]:\n", length);

        for(i=0; i<length; ++i) {
            if(extData->stage3[i]!=0) {
                printf("stage3[%04x]=%04x\n", i, extData->stage3[i]);
            }
        }

        length=extData->stage3bTop;
        printf("\nstage3b[%x]:\n", length);

        for(i=0; i<length; ++i) {
            if(extData->stage3b[i]!=0) {
                printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]);
            }
        }
    }

    if(VERBOSE) {
        /* top is an int32_t: convert explicitly to match the %ld conversion */
        printf("size of extension data: %ld\n", (long)top);
    }

    /* return the number of bytes that should have been written */
    return (uint32_t)(headerSize+top);
}
/* to Unicode --------------------------------------------------------------- */
/*
 * Compact table->reverseMap in place so that it only refers to mappings
 * with precision flag 0 or 3; fromUnicode fallbacks and SUB mappings are
 * irrelevant for the toUnicode table and are dropped.
 * The relative order of the kept entries is preserved, so a sorted map
 * stays sorted. The previous contents of the reverseMap are destroyed.
 * Returns the number of entries kept.
 */
static int32_t
reduceToUMappings(UCMTable *table) {
    UCMapping *mappings=table->mappings;
    int32_t *map=table->reverseMap;
    int32_t total=table->mappingsLength;
    int32_t src, dst;

    /*
     * single compaction pass: while nothing has been dropped yet,
     * src==dst and the store leaves the entry as it is
     */
    dst=0;
    for(src=0; src<total; ++src) {
        int8_t precision=mappings[map[src]].f;
        if(precision==0 || precision==3) {
            map[dst++]=map[src];
        }
    }

    return dst;
}
/*
 * Compute the toUTable result value for one mapping.
 *
 * A single result code point is stored directly, offset by
 * UCNV_EXT_TO_U_MIN_CODE_POINT.
 * A longer result is converted to UTF-16 and appended to extData->toUUChars;
 * the value then holds the index of that string and its length in 16-bit
 * units. The length must be counted in UTF-16 units, not code points,
 * because that is what is actually stored in toUUChars (the two differ as
 * soon as the result contains a supplementary code point).
 * Roundtrip mappings (precision flag 0) additionally get
 * UCNV_EXT_TO_U_ROUNDTRIP_FLAG.
 *
 * Exits the tool if u_strFromUTF32() fails with anything other than
 * U_BUFFER_OVERFLOW_ERROR.
 */
static uint32_t
getToUnicodeValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    UChar32 *u32;
    UChar *u;
    uint32_t value;
    int32_t u16Length;
    UErrorCode errorCode;

    /* write the Unicode result code point or string index */
    if(m->uLen==1) {
        value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
    } else {
        /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
        /* NOTE(review): that limit is on code points; confirm that u16Length also always fits the length field */

        /* get the result code point string and its 16-bit string length */
        u32=UCM_GET_CODE_POINTS(table, m);
        errorCode=U_ZERO_ERROR;
        /* preflight: with a NULL destination only the required length is computed */
        u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode);
        if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
            exit(errorCode);
        }

        /* allocate it and put its 16-bit length and index into the value */
        value=
            (((uint32_t)u16Length+UCNV_EXT_TO_U_LENGTH_OFFSET)<<UCNV_EXT_TO_U_LENGTH_SHIFT)|
            ((uint32_t)utm_countItems(extData->toUUChars));
        u=utm_allocN(extData->toUUChars, u16Length);

        /* write the result 16-bit string */
        errorCode=U_ZERO_ERROR;
        u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode);
        if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
            exit(errorCode);
        }
    }
    if(m->f==0) {
        value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
    }

    return value;
}
/*
 * Recursive toUTable generator core function.
 * Preconditions:
 * - start<limit (There is at least one mapping.)
 * - The mappings are sorted lexically. (Access is through the reverseMap.)
 * - All mappings between start and limit have input sequences that share
 * the same prefix of unitIndex length, and therefore all of these sequences
 * are at least unitIndex+1 long.
 * - There are only relevant mappings available through the reverseMap,
 * see reduceToUMappings().
 *
 * One function invocation generates one section table.
 *
 * Steps:
 * 1. Count the number of unique unit values and get the low/high unit values
 * that occur at unitIndex.
 * 2. Allocate the section table with possible optimization for linear access.
 * 3. Write temporary version of the section table with start indexes of
 * subsections, each corresponding to one unit value at unitIndex.
 * 4. Iterate through the table once more, and depending on the subsection length:
 * 0: write 0 as a result value (unused byte in linear-access section table)
 * >0: if there is one mapping with an input unit sequence of unitIndex+1
 * then defaultValue=compute the mapping result for this whole sequence
 * else defaultValue=0
 *
 * recurse into the subsection
 *
 * Parameters:
 * - start, limit: range [start, limit) of reverseMap indexes for this section
 * - unitIndex: index of the input byte that this section branches on
 * - defaultValue: value stored in the section header; inside step 4 the
 * parameter is reused for the default value of each subsection
 *
 * Returns FALSE (after printing both mappings to stderr) if two mappings
 * have the same input byte sequence, otherwise TRUE.
 */
static UBool
generateToUTable(CnvExtData *extData, UCMTable *table,
int32_t start, int32_t limit, int32_t unitIndex,
uint32_t defaultValue) {
UCMapping *mappings, *m;
int32_t *map;
int32_t i, j, uniqueCount, count, subStart, subLimit;
uint8_t *bytes;
int32_t low, high, prev;
uint32_t *section;
mappings=table->mappings;
map=table->reverseMap;
/* step 1: examine the input units; set low, high, uniqueCount */
m=mappings+map[start];
bytes=UCM_GET_BYTES(table, m);
low=bytes[unitIndex];
uniqueCount=1;
prev=high=low;
/* the mappings are sorted, so high ends up as the unit of the last mapping */
for(i=start+1; i<limit; ++i) {
m=mappings+map[i];
bytes=UCM_GET_BYTES(table, m);
high=bytes[unitIndex];
if(high!=prev) {
prev=high;
++uniqueCount;
}
}
/* step 2: allocate the section; set count, section */
count=(high-low)+1;
if(unitIndex==0 || uniqueCount>=(3*count)/4) {
/*
 * for the root table and for fairly full tables:
 * allocate for direct, linear array access
 * by keeping count, to write an entry for each unit value
 * from low to high
 */
} else {
count=uniqueCount;
}
/* allocate the section: 1 entry for the header + count for the items */
section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);
/* write the section header */
/* section is advanced past the header, so section[0] is the first item from here on */
*section++=((uint32_t)count<<UCNV_EXT_TO_U_BYTE_SHIFT)|defaultValue;
/* step 3: write temporary section table with subsection starts */
prev=low-1; /* just before low to prevent empty subsections before low */
j=0; /* section table index */
for(i=start; i<limit; ++i) {
m=mappings+map[i];
bytes=UCM_GET_BYTES(table, m);
high=bytes[unitIndex];
if(high!=prev) {
/* start of a new subsection for unit high */
if(count>uniqueCount) {
/* write empty subsections for unused units in a linear table */
/* each gets start index i, the same as the following subsection, which makes it empty */
while(++prev<high) {
section[j++]=((uint32_t)prev<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
}
} else {
prev=high;
}
/* write the entry with the subsection start */
section[j++]=((uint32_t)high<<UCNV_EXT_TO_U_BYTE_SHIFT)|(uint32_t)i;
}
}
/* assert(j==count) */
/* step 4: recurse and write results */
/* the next subsection start is read before the current entry's temporary value is cleared */
subLimit=UCNV_EXT_TO_U_GET_VALUE(section[0]);
for(j=0; j<count; ++j) {
subStart=subLimit;
subLimit= (j+1)<count ? UCNV_EXT_TO_U_GET_VALUE(section[j+1]) : limit;
/* remove the subStart temporary value */
section[j]&=~UCNV_EXT_TO_U_VALUE_MASK;
if(subStart==subLimit) {
/* leave the value zero: empty subsection for unused unit in a linear table */
continue;
}
/* see if there is exactly one input unit sequence of length unitIndex+1 */
defaultValue=0;
m=mappings+map[subStart];
if(m->bLen==unitIndex+1) {
/* do not include this in generateToUTable() */
++subStart;
if(subStart<subLimit && mappings[map[subStart]].bLen==unitIndex+1) {
/* print error for multiple same-input-sequence mappings */
fprintf(stderr, "error: multiple mappings from same bytes\n");
ucm_printMapping(table, m, stderr);
ucm_printMapping(table, mappings+map[subStart], stderr);
return FALSE;
}
defaultValue=getToUnicodeValue(extData, table, m);
}
if(subStart==subLimit) {
/* write the result for the input sequence ending here */
section[j]|=defaultValue;
} else {
/* write the index to the subsection table */
/* the subsection is allocated next, so its index is the current item count of toUTable */
section[j]|=(uint32_t)utm_countItems(extData->toUTable);
/* recurse */
if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
return FALSE;
}
}
}
return TRUE;
}
/*
 * Build extData->toUTable and extData->toUUChars from the input table.
 * The table must already be sorted and carry only precision flags 0..3.
 * The table's reverseMap is modified (see reduceToUMappings()).
 * Returns the result of generateToUTable() for the root section.
 * NOTE(review): generateToUTable() documents start<limit; confirm what
 * happens when no toUnicode-relevant mapping remains.
 */
static UBool
makeToUTable(CnvExtData *extData, UCMTable *table) {
    /* filter the reverseMap first; the generator only sees what is left */
    int32_t relevantCount=reduceToUMappings(table);

    extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4);
    extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2);

    /* root section: all relevant mappings, branching on byte 0, no default value */
    return generateToUTable(extData, table, 0, relevantCount, 0, 0);
}
/* from Unicode ------------------------------------------------------------- */
/*
 * preprocessing:
 * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode
 * change each Unicode string to encode all but the first code point in 16-bit form
 *
 * generation:
 * for each unique code point
 * write an entry in the 3-stage trie
 * check that there is only one single-code point sequence
 * start recursion for following 16-bit input units
 */
/*
 * Remove toUnicode fallbacks and non-<subchar1> SUB mappings
 * which are irrelevant for the fromUnicode extension table.
 * Overwrite the reverseMap with an index array to the relevant mappings.
 * Modify the code point sequences to a generator-friendly format where
 * the first code point remains unchanged but the following ones are recoded
 * into 16-bit Unicode string form.
 * The table must be sorted.
 * Destroys previous data in the reverseMap.
 *
 * Kept are mappings with precision flag 0 or 1, and flag 2 only when the
 * byte sequence has length 1.
 * For a kept multi-code point mapping, m->uLen is changed to the number of
 * 16-bit units of the recoded sequence, with the first code point counted
 * as 2 units.
 * Returns the number of entries written to the reverseMap.
 */
static int32_t
prepareFromUMappings(UCMTable *table) {
UCMapping *mappings, *m;
int32_t *map;
int32_t i, j, count;
int8_t flag;
mappings=table->mappings;
map=table->reverseMap;
count=table->mappingsLength;
/*
 * we do not go through the map on input because the mappings are
 * sorted lexically
 */
m=mappings;
for(i=j=0; i<count; ++m, ++i) {
flag=m->f;
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
map[j++]=i;
if(m->uLen>1) {
/* recode all but the first code point to 16-bit Unicode */
UChar32 *u32;
UChar *u;
UChar32 c;
int32_t q, r;
u32=UCM_GET_CODE_POINTS(table, m);
u=(UChar *)u32; /* destructive in-place recoding */
/*
 * q indexes 32-bit code points, r indexes 16-bit units of the same memory;
 * each code point is read into c before the 16-bit units are stored
 */
for(r=2, q=1; q<m->uLen; ++q) {
c=u32[q];
U16_APPEND_UNSAFE(u, r, c);
}
/* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */
m->uLen=(int8_t)r;
}
}
}
return j;
}
/*
 * Compute the fromUnicode result value for one mapping.
 *
 * - precision flag 2: the fixed marker UCNV_EXT_FROM_U_SUBCHAR1
 * - 1..3 result bytes: the bytes themselves, first byte most significant
 * - any other length: the bytes are appended to extData->fromUBytes and
 *   the value holds their start index
 * The byte length is stored at UCNV_EXT_FROM_U_LENGTH_SHIFT, and roundtrip
 * mappings (flag 0) get UCNV_EXT_FROM_U_ROUNDTRIP_FLAG.
 */
static uint32_t
getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
    uint8_t *source, *stored;
    uint32_t result;
    int32_t k;

    if(m->f==2) {
        return UCNV_EXT_FROM_U_SUBCHAR1; /* <subchar1> SUB mapping */
    }

    source=UCM_GET_BYTES(table, m);

    if(m->bLen>=1 && m->bLen<=3) {
        /* short results are packed directly into the value word */
        result=0;
        for(k=0; k<m->bLen; ++k) {
            result=(result<<8)|source[k];
        }
    } else {
        /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
        /* longer results go into fromUBytes[]; the value word gets their index */
        result=(uint32_t)utm_countItems(extData->fromUBytes);
        stored=utm_allocN(extData->fromUBytes, m->bLen);
        uprv_memcpy(stored, source, m->bLen);
    }

    result|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
    if(m->f==0) {
        result|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    }

    return result;
}
/*
 * works like generateToUTable(), except that the
 * output section consists of two arrays, one for input UChars and one
 * for result values
 *
 * also, fromUTable sections are always stored in a compact form for
 * access via binary search
 *
 * Parameters:
 * - start, limit: range [start, limit) of reverseMap indexes for this section
 * - unitIndex: 16-bit index, within a mapping's recoded Unicode sequence
 * (see prepareFromUMappings()), of the unit that this section branches on
 * - defaultValue: value stored in the section header; inside step 4 the
 * parameter is reused for the default value of each subsection
 *
 * Returns FALSE (after printing both mappings to stderr) if two mappings
 * have the same Unicode input sequence, otherwise TRUE.
 */
static UBool
generateFromUTable(CnvExtData *extData, UCMTable *table,
int32_t start, int32_t limit, int32_t unitIndex,
uint32_t defaultValue) {
UCMapping *mappings, *m;
int32_t *map;
int32_t i, j, uniqueCount, count, subStart, subLimit;
UChar *uchars;
UChar32 low, high, prev;
UChar *sectionUChars;
uint32_t *sectionValues;
mappings=table->mappings;
map=table->reverseMap;
/* step 1: examine the input units; set low, high, uniqueCount */
m=mappings+map[start];
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
low=uchars[unitIndex];
uniqueCount=1;
prev=high=low;
for(i=start+1; i<limit; ++i) {
m=mappings+map[i];
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
high=uchars[unitIndex];
if(high!=prev) {
prev=high;
++uniqueCount;
}
}
/* step 2: allocate the section; set count, section */
/* the fromUTable always stores for access via binary search */
count=uniqueCount;
/* allocate the section: 1 entry for the header + count for the items */
/* the two arrays are parallel: the same number of items is allocated in each */
sectionUChars=(UChar *)utm_allocN(extData->fromUTableUChars, 1+count);
sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count);
/* write the section header */
/* both pointers are advanced past the header, so index 0 is the first item from here on */
*sectionUChars++=(UChar)count;
*sectionValues++=defaultValue;
/* step 3: write temporary section table with subsection starts */
prev=low-1; /* just before low to prevent empty subsections before low */
j=0; /* section table index */
for(i=start; i<limit; ++i) {
m=mappings+map[i];
uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
high=uchars[unitIndex];
if(high!=prev) {
/* start of a new subsection for unit high */
prev=high;
/* write the entry with the subsection start */
sectionUChars[j]=(UChar)high;
sectionValues[j]=(uint32_t)i;
++j;
}
}
/* assert(j==count) */
/* step 4: recurse and write results */
/* the next subsection start is read before the current entry is overwritten with its final value */
subLimit=(int32_t)(sectionValues[0]);
for(j=0; j<count; ++j) {
subStart=subLimit;
subLimit= (j+1)<count ? (int32_t)(sectionValues[j+1]) : limit;
/* see if there is exactly one input unit sequence of length unitIndex+1 */
defaultValue=0;
m=mappings+map[subStart];
if(m->uLen==unitIndex+1) {
/* do not include this in generateFromUTable() */
++subStart;
if(subStart<subLimit && mappings[map[subStart]].uLen==unitIndex+1) {
/* print error for multiple same-input-sequence mappings */
fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
ucm_printMapping(table, m, stderr);
ucm_printMapping(table, mappings+map[subStart], stderr);
return FALSE;
}
defaultValue=getFromUBytesValue(extData, table, m);
}
if(subStart==subLimit) {
/* write the result for the input sequence ending here */
sectionValues[j]=defaultValue;
} else {
/* write the index to the subsection table */
/* the subsection is allocated next, so its index is the current item count of fromUTableValues */
sectionValues[j]=(uint32_t)utm_countItems(extData->fromUTableValues);
/* recurse */
if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
return FALSE;
}
}
}
return TRUE;
}
/*
 * add entries to the fromUnicode trie,
 * assume to be called with code points in ascending order
 * and use that to build the trie in precompacted form
 *
 * c is split into a stage 1 index (c>>10), a 6-bit stage 2 offset
 * ((c>>4)&0x3f) and a 4-bit stage 3 offset (c&0xf).
 * A value of 0 means "no mapping" and is not stored at all.
 * UCNV_EXT_FROM_U_SUBCHAR1 is stored as stage 3 entry 1; any other value
 * gets a new stage3b slot whose index is stored in stage 3.
 * Exits the tool with U_MEMORY_ALLOCATION_ERROR if a stage array overflows.
 */
static void
addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) {
int32_t i1, i2, i3, i3b, nextOffset, min, newBlock;
if(value==0) {
return;
}
/*
 * compute the index for each stage,
 * allocate a stage block if necessary,
 * and write the stage value
 */
i1=c>>10;
if(i1>=extData->stage1Top) {
extData->stage1Top=i1+1;
}
nextOffset=(c>>4)&0x3f;
if(extData->stage1[i1]==0) {
/* allocate another block in stage 2; overlap with the previous block */
/* the new block may start inside trailing zero entries of the previous one, at most nextOffset entries back */
newBlock=extData->stage2Top;
min=newBlock-nextOffset; /* minimum block start with overlap */
while(min<newBlock && extData->stage2[newBlock-1]==0) {
--newBlock;
}
extData->stage1[i1]=(uint16_t)newBlock;
extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
if(extData->stage2Top>LENGTHOF(extData->stage2)) {
fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", c);
exit(U_MEMORY_ALLOCATION_ERROR);
}
}
i2=extData->stage1[i1]+nextOffset;
nextOffset=c&0xf;
if(extData->stage2[i2]==0) {
/* allocate another block in stage 3; overlap with the previous block */
newBlock=extData->stage3Top;
min=newBlock-nextOffset; /* minimum block start with overlap */
while(min<newBlock && extData->stage3[newBlock-1]==0) {
--newBlock;
}
/* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */
newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1);
/* stage 2 stores the block start shifted right; it is shifted back when i3 is computed */
extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT);
extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE;
if(extData->stage3Top>LENGTHOF(extData->stage3)) {
fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", c);
exit(U_MEMORY_ALLOCATION_ERROR);
}
}
i3=((int32_t)extData->stage2[i2]<<UCNV_EXT_STAGE_2_LEFT_SHIFT)+nextOffset;
/*
 * assume extData->stage3[i3]==0 because we get
 * code points in strictly ascending order
 */
if(value==UCNV_EXT_FROM_U_SUBCHAR1) {
/* <subchar1> SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */
extData->stage3[i3]=1;
/*
 * precompaction is not optimal for <subchar1> |2 mappings because
 * stage3 values for them are all the same, unlike for other mappings
 * which all have unique values;
 * use a simple compaction of reusing a whole block filled with these
 * mappings
 */
/* is the entire block filled with <subchar1> |2 mappings? */
/* only checked when the last entry of a block has just been written */
if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) {
for(min=i3-nextOffset;
min<i3 && extData->stage3[min]==1;
++min) {}
if(min==i3) {
/* the entire block is filled with these mappings */
if(extData->stage3Sub1Block!=0) {
/* point to the previous such block and remove this block from stage3 */
/* NOTE(review): assumes this block is the last one allocated in stage3 - confirm */
extData->stage2[i2]=extData->stage3Sub1Block;
extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE;
uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2);
} else {
/* remember this block's stage2 entry */
extData->stage3Sub1Block=extData->stage2[i2];
}
}
}
} else {
if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) {
fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", c);
exit(U_MEMORY_ALLOCATION_ERROR);
}
/* roundtrip or fallback mapping */
extData->stage3[i3]=(uint16_t)i3b;
extData->stage3b[i3b]=value;
}
}
/*
 * Walk the prepared, sorted fromUnicode mappings group by group, where a
 * group is a run of mappings that start with the same code point.
 * For each group the initial code point is entered into the trie with
 * addFromUTrieEntry(); if the group has mappings with further input units,
 * the trie value is the index of a fromUTable section that
 * generateFromUTable() builds for them.
 *
 * mapLength is the number of usable reverseMap entries.
 * Returns FALSE (after printing both mappings to stderr) if two mappings
 * have the same Unicode input sequence, otherwise TRUE.
 */
static UBool
generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) {
    UCMapping *mappings, *entry;
    int32_t *map;
    uint32_t leadValue;
    int32_t groupStart, groupLimit;
    UChar32 *codePoints;
    UChar32 lead, upcoming;

    if(mapLength==0) {
        return TRUE;
    }

    mappings=table->mappings;
    map=table->reverseMap;

    /* prime the scan with the initial code point of the first mapping */
    entry=mappings+map[0];
    codePoints=UCM_GET_CODE_POINTS(table, entry);
    upcoming=codePoints[0];

    for(groupLimit=0; groupLimit<mapLength;) {
        /* collect the run of mappings that start with the same code point */
        groupStart=groupLimit;
        lead=upcoming;
        do {
            if(++groupLimit>=mapLength) {
                break;
            }
            entry=mappings+map[groupLimit];
            codePoints=UCM_GET_CODE_POINTS(table, entry);
            upcoming=codePoints[0];
        } while(upcoming==lead);

        /*
         * a mapping from the lead code point alone sorts first in its group;
         * it supplies the value for the code point and is taken out of the
         * range that is passed on to generateFromUTable()
         */
        leadValue=0;
        entry=mappings+map[groupStart];
        if(entry->uLen==1) {
            ++groupStart;
            if(groupStart<groupLimit && mappings[map[groupStart]].uLen==1) {
                /* two mappings from the very same single code point */
                fprintf(stderr, "error: multiple mappings from same Unicode code points\n");
                ucm_printMapping(table, entry, stderr);
                ucm_printMapping(table, mappings+map[groupStart], stderr);
                return FALSE;
            }
            leadValue=getFromUBytesValue(extData, table, entry);
        }

        if(groupStart==groupLimit) {
            /* nothing longer follows: the trie holds the result itself */
            addFromUTrieEntry(extData, lead, leadValue);
            continue;
        }

        /* the trie holds the index of the section that is generated next */
        addFromUTrieEntry(extData, lead, (uint32_t)utm_countItems(extData->fromUTableValues));

        /* 16-bit-unit index 2 is the first unit after the lead code point */
        if(!generateFromUTable(extData, table, groupStart, groupLimit, 2, leadValue)) {
            return FALSE;
        }
    }

    return TRUE;
}
/*
 * Build all fromUnicode data structures (fromUTable arrays, fromUBytes and
 * the trie stages) from the input table.
 * The table must already be sorted and carry only precision flags 0..3.
 * The table's reverseMap is rewritten (see prepareFromUMappings()).
 * Returns FALSE if generateFromUTrie() fails, otherwise TRUE.
 */
static UBool
makeFromUTable(CnvExtData *extData, UCMTable *table) {
    uint16_t *entry, *entryLimit;
    int32_t stage1Length, relevantCount;

    relevantCount=prepareFromUMappings(table);

    extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2);
    extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4);
    extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1);

    /* the first block of stage 2 and of stage 3 stays all-unassigned */
    extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED;
    extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED;

    /*
     * stage 3b holds only unique values, with two reserved slots:
     * index 0: 0 for "no mapping"
     * index 1: "no mapping" with preference for <subchar1> rather than <subchar>
     */
    extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1;
    extData->stage3bTop=2;

    /* reserve item 0 of the fromUTable arrays because index 0 means "no result" */
    utm_alloc(extData->fromUTableUChars);
    utm_alloc(extData->fromUTableValues);

    if(!generateFromUTrie(extData, table, relevantCount)) {
        return FALSE;
    }

    /*
     * stage 1 and stage 2 will be stored in a single array with stage 1
     * first, so every stage 1 entry is shifted by the stage 1 length
     */
    stage1Length=extData->stage1Top;
    entryLimit=extData->stage1+stage1Length;
    for(entry=extData->stage1; entry<entryLimit; ++entry) {
        *entry=(uint16_t)(*entry+stage1Length);
    }

    return TRUE;
}
/* -------------------------------------------------------------------------- */
/*
 * NewConverter.addTable callback: copy the table's unicodeMask into
 * staticData, reject tables with surrogate code point mappings, mark the
 * converter as UCNV_MBCS and build the toUnicode and fromUnicode data.
 * The table is assumed to be sorted.
 * Returns FALSE on the surrogate error or if either build step fails.
 */
static UBool
CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
    CnvExtData *extData=(CnvExtData *)cnvData;

    staticData->unicodeMask=table->unicodeMask;
    if((staticData->unicodeMask&UCNV_HAS_SURROGATES)!=0) {
        fprintf(stderr, "error: contains mappings for surrogate code points\n");
        return FALSE;
    }

    staticData->conversionType=UCNV_MBCS;

    /*
     * the order matters:
     * makeToUTable() modifies the original reverseMap,
     * makeFromUTable() writes a whole new mapping into reverseMap
     */
    if(!makeToUTable(extData, table)) {
        return FALSE;
    }
    return makeFromUTable(extData, table);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000, International Business Machines
* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -19,10 +19,27 @@
#include "makeconv.h"
U_CFUNC NewConverter *
MBCSOpen(uint8_t maxCharLength);
enum {
MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
U_CFUNC UBool
MBCSAddState(NewConverter *cnvData, const char *s);
MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
MBCS_STAGE_3_BLOCK_SIZE=16, /* 16; 16=1<<4 for 4 bits in stage 3 */
MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */
MBCS_MAX_FALLBACK_COUNT=8192
};
U_CFUNC NewConverter *
MBCSOpen(UCMFile *ucm);
U_CFUNC NewConverter *
CnvExtOpen(UCMFile *ucm);
#endif

View File

@ -30,11 +30,43 @@
#include "unicode/udata.h"
#include "unewdata.h"
#include "ucmpwrit.h"
#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
#define DEBUG 0
/*
 * Everything makeconv keeps for one converter that is being built.
 * Initialized by initConvData() and released by cleanupConvData().
 */
typedef struct ConvData {
/* closed with ucm_close() in cleanupConvData() */
UCMFile *ucm;
/* base table builder and extension table builder; either may be NULL */
NewConverter *cnvData, *extData;
UConverterSharedData sharedData;
/* sharedData.staticData is pointed at this member by initConvData() */
UConverterStaticData staticData;
} ConvData;
/*
 * Zero a ConvData, record the struct sizes of its embedded shared and static
 * data, and link sharedData to the embedded staticData.
 */
static void
initConvData(ConvData *data) {
    uprv_memset(data, 0, sizeof(*data));

    data->staticData.structSize=sizeof(data->staticData);

    data->sharedData.structSize=sizeof(data->sharedData);
    data->sharedData.staticData=&data->staticData;
}
/*
 * Close whichever table builders exist and the UCMFile, and reset the
 * corresponding pointers to NULL. A NULL data pointer is ignored.
 */
static void
cleanupConvData(ConvData *data) {
    NewConverter *builder;

    if(data==NULL) {
        return;
    }

    builder=data->cnvData;
    if(builder!=NULL) {
        builder->close(builder);
        data->cnvData=NULL;
    }

    builder=data->extData;
    if(builder!=NULL) {
        builder->close(builder);
        data->extData=NULL;
    }

    ucm_close(data->ucm);
    data->ucm=NULL;
}
/*
* from ucnvstat.c - static prototypes of data-based converters
*/
@ -46,137 +78,14 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP
UBool VERBOSE = FALSE;
UBool TOUCHFILE = FALSE;
/*Reads the header of the table file and fills in basic knowledge about the converter
*in "converter"
*/
static void readHeaderFromFile(UConverterSharedData* myConverter, FileStream* convFile, const char* converterName, UErrorCode* err);
/*Reads the rest of the file, and fills up the shared objects if necessary
Returns the UConverterTable. */
static void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err);
/* creates a UConverterSharedData from a mapping file.
* Fills in: *staticData, *table. Converter is NOT otherwise useful.
*/
static UConverterSharedData* createConverterFromTableFile(const char* realName, UErrorCode* err);
static void
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
/*
* Set up the UNewData and write the converter..
*/
void writeConverterData(UConverterSharedData *mySharedData, const char *cnvName, const char *cnvDir, UErrorCode *status);
static const char NLTC_SEPARATORS[9] = { '\r', '\n', '\t', ' ', '<', '>' ,'"' , 'U', '\0' };
static const char FALLBACK_SEPARATOR = '|';
static const char CODEPOINT_SEPARATORS[8] = { '\r', '>', '\\', 'x', '\n', ' ', '\t', '\0' };
static const char UNICODE_CODEPOINT_SEPARATORS[6] = { '<', '>', 'U', ' ', '\t', '\0' };
/*
 * Return a pointer to the first character of s that is neither a space nor
 * a tab; this is the terminating NUL if s contains nothing else.
 */
static const char *
skipWhitespace(const char *s) {
    for(; *s==' ' || *s=='\t'; ++s) {
        /* nothing to do: the loop header advances past the blank */
    }
    return s;
}
/*
 * Parse a sequence of one to four "\xhh" byte escapes at s.
 * On success the bytes are combined into *pBytes with the first byte most
 * significant, *pEnd (if pEnd is not NULL) is set just behind the last
 * escape, and the number of bytes is returned.
 * Returns -1 if there is no escape at s, if there are more than four, or
 * if the number parsed after "\x" does not end exactly two characters later.
 */
static int32_t
parseCodepageBytes(const char *s, uint32_t *pBytes, const char **pEnd) {
    uint32_t accumulated=0;
    int32_t count=0;
    char *stop;

    for(;;) {
        uint32_t byteValue;

        if(s[0]!='\\' || s[1]!='x') {
            break;
        }
        if(count==4) {
            /* a fifth escape does not fit into 32 bits */
            return -1;
        }

        byteValue=uprv_strtoul(s+2, &stop, 16);
        s+=4;
        if(stop!=s) {
            return -1;
        }

        accumulated=(accumulated<<8)|byteValue;
        ++count;
    }

    if(count==0) {
        return -1;
    }

    if(pEnd!=NULL) {
        *pEnd=s;
    }
    *pBytes=accumulated;
    return count;
}
/* Strip the comment from a line and return the line without its leading
 * spaces and tabs. Normally the line is cut off at the first '#'.
 * The exception: if the '#' is not the first non-blank character and a
 * fallback sign '|' follows it, only the text from '#' up to just before
 * the '|' is blanked out, because IBM official .ucm files carry the
 * fallback information inside comments.
 */
static char *
removeComments (char *line)
{
  char *hash;
  char *bar;

  line = (char*)skipWhitespace(line);

  hash = uprv_strchr (line, '#');
  if (hash == NULL)
    {
      return line;
    }

  /* a comment that starts the line never carries fallback information */
  bar = (hash == line) ? 0 : uprv_strchr(hash + 1, '|');
  if (bar == NULL)
    {
      *hash = '\0';
    }
  else
    {
      uprv_memset(hash, ' ', bar-hash);
    }

  return line;
}
/* Returns TRUE if c is one of the characters of the NUL-terminated
 * string 'setOfChars', FALSE otherwise.
 * The set is walked with a pointer so that sets of any length are handled;
 * an 8-bit index would wrap around after 255 characters.
 */
static UBool
isInSet (char c, const char *setOfChars)
{
  const char *p;

  for (p = setOfChars; *p != '\0'; ++p)
    {
      if (c == *p)
        return TRUE;
    }

  return FALSE;
}
/* Returns the offset in 'line' of the first character that is not one of
 * 'separators'; this is the offset of the terminating NUL if the line
 * contains only separators.
 */
static int32_t
nextTokenOffset (const char *line, const char *separators)
{
  const char *cursor = line;

  while (*cursor != '\0' && isInSet(*cursor, separators))
    ++cursor;

  return (int32_t)(cursor - line);
}
/* Copies the next token of 'line' into 'token' (NUL-terminated): leading
 * separators are skipped, then characters are copied up to the next
 * separator or the end of the line.
 * Returns a pointer into 'line' just behind the token.
 * The output index is a 32-bit integer; an 8-bit index would overflow for
 * tokens longer than 127 characters.
 * No length is passed for 'token', so the caller must supply a buffer that
 * is large enough for the longest possible token plus the NUL.
 */
static char *
getToken (char *token, char *line, const char *separators)
{
  int32_t i = nextTokenOffset (line, separators);
  int32_t j = 0;

  while (line[i] && (!isInSet(line[i], separators)))
    token[j++] = line[i++];
  token[j] = '\0';

  return line + i;
}
static void
writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
UBool haveCopyright=TRUE;
@ -194,20 +103,27 @@ static UDataInfo dataInfo={
{0, 0, 0, 0} /* dataVersion (calculated at runtime) */
};
void writeConverterData(UConverterSharedData *mySharedData,
const char *cnvName,
const char *cnvDir,
UErrorCode *status)
static void
writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
{
UNewDataMemory *mem = NULL;
uint32_t sz2;
uint32_t size = 0;
int32_t tableType;
if(U_FAILURE(*status))
{
return;
}
tableType=TABLE_NONE;
if(data->cnvData!=NULL) {
tableType|=TABLE_BASE;
}
if(data->extData!=NULL) {
tableType|=TABLE_EXT;
}
mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
if(U_FAILURE(*status))
@ -224,11 +140,17 @@ void writeConverterData(UConverterSharedData *mySharedData,
fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
}
/* all read only, clean, platform independent data. Mmmm. :) */
udata_writeBlock(mem, mySharedData->staticData, sizeof(UConverterStaticData));
udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
/* Now, write the table */
size += ((NewConverter *)mySharedData->table)->write((NewConverter *)mySharedData->table, mySharedData->staticData, mem);
if(tableType&TABLE_BASE) {
size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
}
if(tableType&TABLE_EXT) {
size += data->extData->write(data->extData, &data->staticData, mem, tableType);
}
sz2 = udata_finish(mem, status);
if(size != sz2)
@ -255,7 +177,7 @@ static UOption options[]={
int main(int argc, char* argv[])
{
UConverterSharedData* mySharedData = NULL;
ConvData data;
UErrorCode err = U_ZERO_ERROR, localError;
char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
char touchFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
@ -420,7 +342,7 @@ int main(int argc, char* argv[])
if(pkgName != NULL)
{
/* changes both baename and filename */
/* changes both basename and filename */
uprv_strcpy(outBasename, pkgName);
uprv_strcat(outBasename, "_");
uprv_strcat(outBasename, cnvName);
@ -435,9 +357,10 @@ int main(int argc, char* argv[])
fflush(stdout);
#endif
localError = U_ZERO_ERROR;
mySharedData = createConverterFromTableFile(arg, &localError);
initConvData(&data);
createConverter(&data, arg, &localError);
if (U_FAILURE(localError) || (mySharedData == NULL))
if (U_FAILURE(localError))
{
/* if an error is found, print out an error msg and keep going */
fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
@ -449,21 +372,21 @@ int main(int argc, char* argv[])
else
{
/* Make the static data name equal to the file name */
if( /*VERBOSE && */ uprv_stricmp(cnvName,mySharedData->staticData->name))
if( /*VERBOSE && */ uprv_stricmp(cnvName,data.staticData.name))
{
fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
cnvName,
CONVERTER_FILE_EXTENSION,
mySharedData->staticData->name);
data.staticData.name);
}
uprv_strcpy((char*)mySharedData->staticData->name, cnvName);
uprv_strcpy((char*)data.staticData.name, cnvName);
if(!uprv_isInvariantString((char*)mySharedData->staticData->name, -1)) {
if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
fprintf(stderr,
"Error: A converter name must contain only invariant characters.\n"
"%s is not a valid converter name.\n",
mySharedData->staticData->name);
data.staticData.name);
if(U_SUCCESS(err)) {
err = U_INVALID_TABLE_FORMAT;
}
@ -481,8 +404,7 @@ int main(int argc, char* argv[])
}
localError = U_ZERO_ERROR;
writeConverterData(mySharedData, cnvNameWithPkg, destdir, &localError);
((NewConverter *)mySharedData->table)->close((NewConverter *)mySharedData->table);
writeConverterData(&data, cnvNameWithPkg, destdir, &localError);
if(TOUCHFILE)
{
FileStream *q;
@ -505,10 +427,6 @@ int main(int argc, char* argv[])
}
}
/* write the information data */
uprv_free((UConverterStaticData *)mySharedData->staticData);
uprv_free(mySharedData);
if(U_FAILURE(localError))
{
/* if an error is found, print out an error msg and keep going*/
@ -525,6 +443,8 @@ int main(int argc, char* argv[])
}
fflush(stdout);
fflush(stderr);
cleanupConvData(&data);
}
return err;
@ -548,102 +468,87 @@ getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID
}
}
/*Reads the header of the table file and fills in basic knowledge about the converter in "converter"*/
void readHeaderFromFile(UConverterSharedData* mySharedData,
static void
readHeader(ConvData *data,
FileStream* convFile,
const char* converterName,
UErrorCode *pErrorCode)
{
UErrorCode *pErrorCode) {
char line[200];
char *s, *end, *key, *value;
char *s, *key, *value;
const UConverterStaticData *prototype;
UConverterStaticData *staticData;
char c;
if(U_FAILURE(*pErrorCode)) {
return;
}
staticData=(UConverterStaticData *)mySharedData->staticData;
staticData->conversionType=UCNV_UNSUPPORTED_CONVERTER;
staticData=&data->staticData;
staticData->platform=UCNV_IBM;
staticData->subCharLen=0;
while(T_FileStream_readLine(convFile, line, sizeof(line))) {
/* remove comments and trailing CR and LF and remove whitespace from the end */
for(end=line; (c=*end)!=0; ++end) {
if(c=='#' || c=='\r' || c=='\n') {
break;
}
}
while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
--end;
}
*end=0;
/* skip leading white space and ignore empty lines */
s=(char *)skipWhitespace(line);
if(*s==0) {
/* basic parsing and handling of state-related items */
if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
continue;
}
/* stop at the beginning of the mapping section */
if(uprv_memcmp(s, "CHARMAP", 7)==0) {
if(uprv_strcmp(line, "CHARMAP")==0) {
break;
}
/* get the key name, bracketed in <> */
if(*s!='<') {
fprintf(stderr, "error: no header field <key> in line \"%s\"\n", line);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
key=++s;
while(*s!='>') {
if(*s==0) {
fprintf(stderr, "error: incomplete header field <key> in line \"%s\"\n", line);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
++s;
}
*s=0;
/* get the value string, possibly quoted */
s=(char *)skipWhitespace(s+1);
if(*s!='"') {
value=s;
} else {
/* remove the quotes */
value=s+1;
if(end>value && *(end-1)=='"') {
*--end=0;
}
}
/* collect the information from the header field, ignore unknown keys */
if(uprv_strcmp(key, "code_set_name")==0) {
if(*value!=0) {
uprv_strcpy((char *)staticData->name, value);
getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
}
} else if(uprv_strcmp(key, "uconv_class")==0) {
const UConverterStaticData *prototype;
} else if(uprv_strcmp(key, "subchar")==0) {
uint8_t bytes[UCNV_EXT_MAX_BYTES];
int8_t length;
if(uprv_strcmp(value, "DBCS")==0) {
staticData->conversionType=UCNV_DBCS;
} else if(uprv_strcmp(value, "SBCS")==0) {
staticData->conversionType = UCNV_SBCS;
} else if(uprv_strcmp(value, "MBCS")==0) {
staticData->conversionType = UCNV_MBCS;
} else if(uprv_strcmp(value, "EBCDIC_STATEFUL")==0) {
staticData->conversionType = UCNV_EBCDIC_STATEFUL;
s=value;
length=ucm_parseBytes(bytes, line, &s);
if(1<=length && length<=4 && *s==0) {
staticData->subCharLen=length;
uprv_memcpy(staticData->subChar, bytes, length);
} else {
fprintf(stderr, "error: unknown <uconv_class> %s\n", value);
fprintf(stderr, "error: illegal <subchar> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "subchar1")==0) {
uint8_t bytes[UCNV_EXT_MAX_BYTES];
s=value;
if(1==ucm_parseBytes(bytes, line, &s) && *s==0) {
staticData->subChar1=bytes[0];
} else {
fprintf(stderr, "error: illegal <subchar1> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
}
}
/* copy values from the UCMFile to the static data */
staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
staticData->conversionType=data->ucm->states.conversionType;
/* ### TODO use UCNV_UNSUPPORTED_CONVERTER to indicate an extension-only file? */
if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
/* Now that we know the type, copy any 'default' values from the table. */
/*
* Now that we know the type, copy any 'default' values from the table.
* We need not check the type any further because the parser only
* recognizes what we have prototypes for.
*/
prototype=ucnv_converterStaticData[staticData->conversionType];
if(prototype!=NULL) {
if(staticData->name[0]==0) {
@ -673,392 +578,202 @@ void readHeaderFromFile(UConverterSharedData* mySharedData,
}
}
}
} else if(uprv_strcmp(key, "mb_cur_max")==0) {
if('1'<=*value && *value<='4' && value[1]==0) {
staticData->maxBytesPerChar=(int8_t)(*value-'0');
} else {
fprintf(stderr, "error: illegal <mb_cur_max> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "mb_cur_min")==0) {
if('1'<=*value && *value<='4' && value[1]==0) {
staticData->minBytesPerChar=(int8_t)(*value-'0');
} else {
fprintf(stderr, "error: illegal <mb_cur_min> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "subchar")==0) {
uint32_t bytes;
int32_t length;
length=parseCodepageBytes(value, &bytes, (const char **)&end);
if(length>0 && *end==0) {
staticData->subCharLen=(int8_t)length;
do {
staticData->subChar[--length]=(uint8_t)bytes;
bytes>>=8;
} while(length>0);
} else {
fprintf(stderr, "error: illegal <subchar> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "subchar1")==0) {
uint32_t bytes;
if(1==parseCodepageBytes(value, &bytes, (const char **)&end) && *end==0) {
staticData->subChar1=(uint8_t)bytes;
} else {
fprintf(stderr, "error: illegal <subchar1> %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "icu:state")==0) {
/* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
switch(staticData->conversionType) {
case UCNV_SBCS:
case UCNV_DBCS:
case UCNV_EBCDIC_STATEFUL:
staticData->conversionType = UCNV_MBCS;
break;
case UCNV_MBCS:
break;
default:
fprintf(stderr, "error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
if(data->ucm->states.outputType<0) {
data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength;
}
if(staticData->maxBytesPerChar==0) {
fprintf(stderr, "error: <icu:state> before the <mb_cur_max> line\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
if(mySharedData->table==NULL) {
mySharedData->table=(UConverterTable *)MBCSOpen(staticData->maxBytesPerChar);
if(mySharedData->table==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return;
}
}
if(!MBCSAddState((NewConverter *)mySharedData->table, value)) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
}
}
if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else if(staticData->conversionType==UCNV_MBCS && mySharedData->table==NULL) {
fprintf(stderr, "error: missing state table information (<icu:state>) for MBCS\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else if(staticData->subChar1!=0 &&
!staticData->conversionType==UCNV_MBCS &&
!staticData->conversionType==UCNV_EBCDIC_STATEFUL
if( staticData->subChar1!=0 &&
(staticData->minBytesPerChar>1 ||
(staticData->conversionType!=UCNV_MBCS &&
staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
) {
fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err)
{
char storageLine[200];
char* line = NULL;
UConverterStaticData *staticData=(UConverterStaticData *)sharedData->staticData;
NewConverter *cnvData = (NewConverter *)sharedData->table;
UChar32 unicodeValue, codepageValue;
uint8_t mbcsBytes[8];
int32_t mbcsLength;
char codepointBytes[20];
UBool isOK = TRUE;
uint8_t precisionMask = 0, unicodeMask = 0;
char endOfLine;
static void
readTable(ConvData *data, FileStream* convFile,
UBool forBase, UCMStates *baseStates,
UErrorCode *pErrorCode) {
char line[200];
char *end;
UBool isOK;
if(cnvData->startMappings!=NULL)
{
if(!cnvData->startMappings(cnvData)) {
*err = U_INVALID_TABLE_FORMAT;
if(U_FAILURE(*pErrorCode)) {
return;
}
}
if(cnvData->isValid!=NULL)
{
const uint8_t *p = staticData->subChar;
codepageValue = 0;
switch(staticData->subCharLen) {
case 4: codepageValue = (codepageValue << 8) | *p++;
case 3: codepageValue = (codepageValue << 8) | *p++;
case 2: codepageValue = (codepageValue << 8) | *p++;
case 1: codepageValue = (codepageValue << 8) | *p;
default: break; /* must never occur */
}
if(!cnvData->isValid(cnvData, staticData->subChar, staticData->subCharLen, codepageValue)) {
fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
*err = U_INVALID_TABLE_FORMAT;
isOK=TRUE;
for(;;) {
/* read the next line */
if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
fprintf(stderr, "incomplete charmap section\n");
isOK=FALSE;
}
}
staticData->hasFromUnicodeFallback = staticData->hasToUnicodeFallback = FALSE;
while (T_FileStream_readLine(convFile, storageLine, sizeof(storageLine)))
{
removeComments(storageLine);
line = storageLine;
if (line[nextTokenOffset(line, NLTC_SEPARATORS)] != '\0')
{
/* get the Unicode code point */
line = getToken(codepointBytes, line, UNICODE_CODEPOINT_SEPARATORS);
if (uprv_strcmp(codepointBytes, "END") == 0)
{
break;
}
unicodeValue = (UChar32)T_CString_stringToInteger(codepointBytes, 16);
/* get the codepage bytes */
codepageValue = 0;
mbcsLength = 0;
do
{
line = getToken(codepointBytes, line, CODEPOINT_SEPARATORS);
mbcsBytes[mbcsLength] = (uint8_t)T_CString_stringToInteger(codepointBytes, 16);
codepageValue = codepageValue << 8 | mbcsBytes[mbcsLength++];
/* remove CR LF */
end=uprv_strchr(line, 0);
while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
--end;
}
*end=0;
/* End of line could be \0 or | (if fallback) */
endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
} while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
if(unicodeValue>=0x10000) {
unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
} else if(UTF_IS_SURROGATE(unicodeValue)) {
unicodeMask|=UCNV_HAS_SURROGATES; /* there are single surrogates */
/* ignore empty and comment lines */
if(line[0]==0 || line[0]=='#') {
continue;
}
if((uint32_t)unicodeValue > 0x10ffff)
{
fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
isOK = FALSE;
}
else if(endOfLine == FALLBACK_SEPARATOR)
{
/* we know that there is a fallback separator */
precisionMask |= 1;
line = uprv_strchr(line, FALLBACK_SEPARATOR) + 1;
switch(*line)
{
case '0':
/* set roundtrip mappings */
isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0) &&
cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0);
break;
case '1':
/* set only a fallback mapping from Unicode to codepage */
staticData->hasFromUnicodeFallback = TRUE;
isOK &= cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
break;
case '2':
/* skip subchar mappings */
break;
case '3':
/* set only a fallback mapping from codepage to Unicode */
staticData->hasToUnicodeFallback = TRUE;
isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
break;
default:
fprintf(stderr, "error: illegal fallback indicator '%s' in '%s'\n", line - 1, storageLine);
*err = U_INVALID_TABLE_FORMAT;
/* stop at the end of the mapping table */
if(0==uprv_strcmp(line, "END CHARMAP")) {
break;
}
isOK&=ucm_addMappingFromLine(data->ucm, line, forBase, baseStates);
}
else
{
precisionMask |= 2;
/* set the mappings */
isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1) &&
cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1);
}
if(!isOK) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
if(unicodeMask == 3)
{
fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
}
staticData->unicodeMask = unicodeMask;
/* return TRUE if a base table was read, FALSE for an extension table */
static UBool
readFile(ConvData *data, const char* converterName,
UErrorCode *pErrorCode) {
char line[200];
char *end;
FileStream *convFile;
UBool dataIsBase;
if(cnvData->finishMappings!=NULL)
{
cnvData->finishMappings(cnvData, staticData);
if(U_FAILURE(*pErrorCode)) {
return FALSE;
}
if(!isOK)
{
*err = U_INVALID_TABLE_FORMAT;
}
else if(precisionMask == 3)
{
fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
*err = U_INVALID_TABLE_FORMAT;
}
}
/*creates a UConverterStaticData, fills in necessary links to it the appropriate function pointers*/
UConverterSharedData* createConverterFromTableFile(const char* converterName, UErrorCode* err)
{
FileStream* convFile = NULL;
UConverterSharedData* mySharedData = NULL;
UConverterStaticData* myStaticData = NULL;
if (U_FAILURE(*err)) return NULL;
data->ucm=ucm_open();
convFile=T_FileStream_open(converterName, "r");
if (convFile == NULL)
{
*err = U_FILE_ACCESS_ERROR;
return NULL;
if(convFile==NULL) {
*pErrorCode=U_FILE_ACCESS_ERROR;
return FALSE;
}
mySharedData = (UConverterSharedData*) uprv_malloc(sizeof(UConverterSharedData));
if (mySharedData == NULL)
{
*err = U_MEMORY_ALLOCATION_ERROR;
T_FileStream_close(convFile);
return NULL;
readHeader(data, convFile, converterName, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return FALSE;
}
uprv_memset(mySharedData, 0, sizeof(UConverterSharedData));
if(data->ucm->baseName[0]==0) {
dataIsBase=TRUE;
ucm_processStates(&data->ucm->states);
mySharedData->structSize = sizeof(UConverterSharedData);
myStaticData = (UConverterStaticData*) uprv_malloc(sizeof(UConverterStaticData));
if (myStaticData == NULL)
{
*err = U_MEMORY_ALLOCATION_ERROR;
T_FileStream_close(convFile);
return NULL;
/* read the base table */
readTable(data, convFile, TRUE, &data->ucm->states, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return FALSE;
}
uprv_memset(myStaticData, 0, sizeof(UConverterStaticData));
mySharedData->staticData = myStaticData;
myStaticData->structSize = sizeof(UConverterStaticData);
/* mySharedData->staticDataOwned = FALSE; */ /* not owned if in udata */
mySharedData->sharedDataCached = FALSE;
mySharedData->dataMemory = NULL; /* for init */
/* read an extension table if there is one */
while(T_FileStream_readLine(convFile, line, sizeof(line))) {
end=uprv_strchr(line, 0);
while(line<end &&
(*(end-1)=='\n' || *(end-1)=='\r' || *(end-1)==' ' || *(end-1)=='\t')) {
--end;
}
readHeaderFromFile(mySharedData, convFile, converterName, err);
if (U_FAILURE(*err)) return NULL;
switch (myStaticData->conversionType)
{
case UCNV_SBCS:
{
/* SBCS: use MBCS data structure with a default state table */
if(mySharedData->staticData->maxBytesPerChar!=1) {
fprintf(stderr, "error: SBCS codepage with max bytes/char!=1\n");
*err = U_INVALID_TABLE_FORMAT;
if(0==uprv_strcmp(line, "CHARMAP")) {
/* read the extension table */
readTable(data, convFile, FALSE, &data->ucm->states, pErrorCode);
break;
}
myStaticData->conversionType = UCNV_MBCS;
if(mySharedData->table == NULL) {
NewConverter *sharedDataTable = MBCSOpen(1);
if(sharedDataTable != NULL) {
if(!MBCSAddState(sharedDataTable, "0-ff")) {
*err = U_INVALID_TABLE_FORMAT;
sharedDataTable->close(sharedDataTable);
} else {
mySharedData->table = (UConverterTable *)sharedDataTable;
}
} else {
*err = U_MEMORY_ALLOCATION_ERROR;
/* read only the extension table */
dataIsBase=FALSE;
readTable(data, convFile, FALSE, NULL, pErrorCode);
}
T_FileStream_close(convFile);
if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
break;
return dataIsBase;
}
case UCNV_MBCS:
{
/* MBCSOpen() was called by readHeaderFromFile() */
break;
static void
createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode) {
ConvData baseData;
UBool dataIsBase;
if(U_FAILURE(*pErrorCode)) {
return;
}
case UCNV_EBCDIC_STATEFUL:
{
/* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
if(mySharedData->staticData->maxBytesPerChar!=2) {
fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
*err = U_INVALID_TABLE_FORMAT;
break;
initConvData(data);
/* ### TODO if there is an extension table:
1. the base table must use precision flags
2. check base vs. extension for mappings overlap
*/
dataIsBase=readFile(data, converterName, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return;
}
myStaticData->conversionType = UCNV_MBCS;
if(mySharedData->table == NULL) {
NewConverter *sharedDataTable = MBCSOpen(2);
if(sharedDataTable != NULL) {
if( !MBCSAddState(sharedDataTable, "0-ff, e:1.s, f:0.s") ||
!MBCSAddState(sharedDataTable, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4") ||
!MBCSAddState(sharedDataTable, "0-40:1.i, 41-fe:1., ff:1.i") ||
!MBCSAddState(sharedDataTable, "0-ff:1.i, 40:1.") ||
!MBCSAddState(sharedDataTable, "0-ff:1.i")
initConvData(&baseData);
if(dataIsBase) {
data->cnvData=MBCSOpen(data->ucm);
if(data->cnvData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
} else if(!data->cnvData->isValid(data->cnvData,
data->staticData.subChar, data->staticData.subCharLen)
) {
*err = U_INVALID_TABLE_FORMAT;
sharedDataTable->close(sharedDataTable);
} else {
mySharedData->table = (UConverterTable *)sharedDataTable;
}
} else {
*err = U_MEMORY_ALLOCATION_ERROR;
}
}
break;
}
case UCNV_DBCS:
{
/* DBCS: use MBCS data structure with a default state table */
if(mySharedData->staticData->maxBytesPerChar!=2) {
fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
*err = U_INVALID_TABLE_FORMAT;
break;
}
myStaticData->conversionType = UCNV_MBCS;
if(mySharedData->table == NULL) {
NewConverter *sharedDataTable = MBCSOpen(2);
if(sharedDataTable != NULL) {
if( !MBCSAddState(sharedDataTable, "0-3f:3, 40:2, 41-fe:1, ff:3") ||
!MBCSAddState(sharedDataTable, "41-fe") ||
!MBCSAddState(sharedDataTable, "40") ||
!MBCSAddState(sharedDataTable, "")
fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
} else if(data->ucm->ext->mappingsLength>0) {
/* prepare the extension table, if there is one */
data->extData=CnvExtOpen(data->ucm);
if(data->extData==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
} else if(
!ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, TRUE) ||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
) {
*err = U_INVALID_TABLE_FORMAT;
sharedDataTable->close(sharedDataTable);
} else {
mySharedData->table = (UConverterTable *)sharedDataTable;
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
/* add the base table after ucm_checkBaseExt()! */
if( U_SUCCESS(*pErrorCode) &&
!data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
} else {
*err = U_MEMORY_ALLOCATION_ERROR;
/* ### TODO assemble a path/filename for data->ucm->states.baseName */
/* must be TRUE */readFile(&baseData, ""/*extConverterName*/, pErrorCode);
/* ### TODO read extension table */
/* ### TODO - actually write the mappings into genmbcs or into ext */
if( !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) ||
!ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, FALSE) ||
!data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
break;
}
default :
fprintf(stderr, "error: <uconv_class> omitted\n");
*err = U_INVALID_TABLE_FORMAT;
mySharedData->table = NULL;
break;
};
if(U_SUCCESS(*err) && mySharedData->table != NULL)
{
loadTableFromFile(convFile, mySharedData, err);
}
T_FileStream_close(convFile);
return mySharedData;
cleanupConvData(&baseData);
}
/*

View File

@ -183,6 +183,10 @@ SOURCE="$(InputPath)"
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
# Begin Source File
SOURCE=.\gencnvex.c
# End Source File
# Begin Source File
SOURCE=.\genmbcs.c
# End Source File
# Begin Source File

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2001, International Business Machines
* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -20,10 +20,19 @@
#include "unicode/utypes.h"
#include "ucnv_bld.h"
#include "unewdata.h"
#include "ucm.h"
/* exports from makeconv.c */
U_CFUNC UBool VERBOSE;
/*
 * Converter table type for writing.
 *
 * The enumerators rely on their implicit values 0, 1, 2, 3:
 * TABLE_BASE (1) and TABLE_EXT (2) are combined with '|' and tested with '&'
 * by writeConverterData(), so TABLE_BASE_AND_EXT (3) equals
 * TABLE_BASE|TABLE_EXT. Do not reorder or insert enumerators.
 */
enum {
TABLE_NONE,
TABLE_BASE,
TABLE_EXT,
TABLE_BASE_AND_EXT
};
/* abstract converter generator struct, C++ - style */
struct NewConverter;
typedef struct NewConverter NewConverter;
@ -32,32 +41,17 @@ struct NewConverter {
void
(*close)(NewConverter *cnvData);
UBool
(*startMappings)(NewConverter *cnvData);
/** is this byte sequence valid? */
UBool
(*isValid)(NewConverter *cnvData,
const uint8_t *bytes, int32_t length,
uint32_t b);
const uint8_t *bytes, int32_t length);
UBool
(*addToUnicode)(NewConverter *cnvData,
const uint8_t *bytes, int32_t length,
UChar32 c, uint32_t b,
int8_t isFallback);
UBool
(*addFromUnicode)(NewConverter *cnvData,
const uint8_t *bytes, int32_t length,
UChar32 c, uint32_t b,
int8_t isFallback);
void
(*finishMappings)(NewConverter *cnvData, const UConverterStaticData *staticData);
(*addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
uint32_t
(*write)(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
(*write)(NewConverter *cnvData, const UConverterStaticData *staticData,
UNewDataMemory *pData, int32_t tableType);
};
#endif

View File

@ -132,6 +132,9 @@
<Filter
Name="Source Files"
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat">
<File
RelativePath=".\gencnvex.c">
</File>
<File
RelativePath=".\genmbcs.c">
</File>

View File

@ -38,7 +38,7 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw $(LIBCPPFLAGS)
LIBS = $(LIBICUUC) $(DEFAULT_LIBS)
OBJECTS = toolutil.o unewdata.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
OBJECTS = toolutil.o unewdata.o ucm.o ucmstate.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))

View File

@ -26,6 +26,7 @@
# define NOMCX
# include <windows.h>
#endif
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "cmemory.h"
@ -73,3 +74,117 @@ findBasename(const char *filename) {
return filename;
}
}
/* tool memory helper ------------------------------------------------------- */
/*
 * Growable array of fixed-size items used by the tools.
 * utm_open() allocates the struct together with the initial item storage
 * in a single block; the storage moves to a separate heap block once the
 * array has to grow.
 */
typedef struct UToolMemory {
/* label that is printed in this object's error messages */
char name[64];
/*
 * capacity: number of items that the current storage can hold
 * maxCapacity: upper limit for capacity; exceeding it terminates the tool
 * size: number of bytes per item
 * index: number of items handed out so far (see utm_countItems())
 */
int32_t capacity, maxCapacity, size, index;
/* current item storage: either staticArray or a separately allocated block */
void *array;
/* start of the inline storage that utm_open() allocates right behind the struct */
UAlignedMemory staticArray[1];
} UToolMemory;
/*
 * Create a UToolMemory object for items of `size` bytes each.
 *
 * The object starts with room for initialCapacity items, stored inline right
 * behind the struct, and may later grow up to maxCapacity items.
 * A maxCapacity smaller than initialCapacity is raised to initialCapacity.
 *
 * `name` is only used in error messages; it is truncated if it does not fit
 * into the fixed-size name field.
 *
 * Never returns NULL: prints an error and exits the process if the
 * allocation fails.
 */
U_CAPI UToolMemory * U_EXPORT2
utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) {
    UToolMemory *mem;
    int32_t i;

    if(maxCapacity<initialCapacity) {
        maxCapacity=initialCapacity;
    }

    mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+initialCapacity*size);
    if(mem==NULL) {
        fprintf(stderr, "error: %s - out of memory\n", name);
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    mem->array=mem->staticArray;

    /* bounded copy: an overlong name must not write past the end of mem->name */
    for(i=0; i<(int32_t)sizeof(mem->name)-1 && name[i]!=0; ++i) {
        mem->name[i]=name[i];
    }
    mem->name[i]=0;

    mem->capacity=initialCapacity;
    mem->maxCapacity=maxCapacity;
    mem->size=size;
    mem->index=0;
    return mem;
}
/*
 * Release a UToolMemory object and its item storage.
 * Passing NULL is allowed and does nothing.
 */
U_CAPI void U_EXPORT2
utm_close(UToolMemory *mem) {
    if(mem==NULL) {
        return;
    }

    /* the inline storage is part of the object itself; only a grown array is a separate block */
    if(mem->array!=mem->staticArray) {
        uprv_free(mem->array);
    }
    uprv_free(mem);
}
/*
 * Return the address of the first item in the array.
 * The array may be moved when more items are allocated, so the result
 * should not be kept across calls to utm_alloc()/utm_allocN().
 */
U_CAPI void * U_EXPORT2
utm_getStart(UToolMemory *mem) {
    char *start=(char *)mem->array;
    return start;
}
/*
 * Return the number of items that have been allocated so far.
 */
U_CAPI int32_t U_EXPORT2
utm_countItems(UToolMemory *mem) {
    int32_t count=mem->index;
    return count;
}
/*
 * Make sure that the array has room for at least `capacity` items,
 * growing the storage if necessary.
 *
 * Growth policy: jump directly to `capacity` if that at least doubles the
 * current capacity; otherwise double while that stays well below
 * maxCapacity; otherwise go straight to maxCapacity.
 *
 * Growing may move the array, which invalidates pointers into it.
 *
 * Always returns TRUE; prints an error and exits the process if `capacity`
 * exceeds maxCapacity or if memory cannot be allocated.
 */
static UBool
utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
    if(mem->capacity<capacity) {
        int32_t newCapacity;

        if(mem->maxCapacity<capacity) {
            fprintf(stderr, "error: %s - trying to use more than maxCapacity=%ld units\n",
                    mem->name, (long)mem->maxCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        /* try to allocate a larger array */
        if(capacity>=2*mem->capacity) {
            newCapacity=capacity;
        } else if(mem->capacity<=mem->maxCapacity/3) {
            newCapacity=2*mem->capacity;
        } else {
            newCapacity=mem->maxCapacity;
        }

        if(mem->array==mem->staticArray) {
            /* first growth: move the items out of the inline storage */
            mem->array=uprv_malloc(newCapacity*mem->size);
            if(mem->array!=NULL) {
                uprv_memcpy(mem->array, mem->staticArray, mem->index*mem->size);
            }
        } else {
            /* a failed realloc loses the old pointer, but the process exits below anyway */
            mem->array=uprv_realloc(mem->array, newCapacity*mem->size);
        }

        if(mem->array==NULL) {
            fprintf(stderr, "error: %s - out of memory\n", mem->name);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        /*
         * Record the new capacity. Without this, every later allocation
         * would reallocate again and the growth policy would keep working
         * from the stale initial capacity.
         */
        mem->capacity=newCapacity;
    }

    return TRUE;
}
/*
 * Allocate one more item and return a pointer to it.
 * The new item is zero-filled.
 *
 * The array may be moved to make room, so pointers returned by earlier
 * calls (and by utm_getStart()) may become invalid.
 */
U_CAPI void * U_EXPORT2
utm_alloc(UToolMemory *mem) {
    char *p=NULL;
    int32_t oldIndex=mem->index;
    int32_t newIndex=oldIndex+1;

    if(utm_hasCapacity(mem, newIndex)) {
        /*
         * Compute the item address only after the capacity check:
         * utm_hasCapacity() may have moved mem->array.
         */
        p=(char *)mem->array+oldIndex*mem->size;
        mem->index=newIndex;
        uprv_memset(p, 0, mem->size);
    }
    return p;
}
/*
 * Allocate n consecutive items and return a pointer to the first one.
 * The new items are zero-filled.
 *
 * The array may be moved to make room, so pointers returned by earlier
 * calls (and by utm_getStart()) may become invalid.
 */
U_CAPI void * U_EXPORT2
utm_allocN(UToolMemory *mem, int32_t n) {
    char *p=NULL;
    int32_t oldIndex=mem->index;
    int32_t newIndex=oldIndex+n;

    if(utm_hasCapacity(mem, newIndex)) {
        /*
         * Compute the item address only after the capacity check:
         * utm_hasCapacity() may have moved mem->array.
         */
        p=(char *)mem->array+oldIndex*mem->size;
        mem->index=newIndex;
        uprv_memset(p, 0, n*mem->size);
    }
    return p;
}

View File

@ -163,10 +163,18 @@ SOURCE=.\ucbuf.c
# End Source File
# Begin Source File
SOURCE=.\ucm.c
# End Source File
# Begin Source File
SOURCE=.\ucmpwrit.c
# End Source File
# Begin Source File
SOURCE=.\ucmstate.c
# End Source File
# Begin Source File
SOURCE=.\unewdata.c
# End Source File
# Begin Source File
@ -195,6 +203,10 @@ SOURCE=.\ucbuf.h
# End Source File
# Begin Source File
SOURCE=.\ucm.h
# End Source File
# Begin Source File
SOURCE=.\ucmpwrit.h
# End Source File
# Begin Source File

View File

@ -20,8 +20,7 @@
#define __TOOLUTIL_H__
#include "unicode/utypes.h"
#include "cmemory.h"
/*
* For Windows, a path/filename may be the short (8.3) version
@ -51,4 +50,55 @@ getLongPathname(const char *pathname);
U_CAPI const char * U_EXPORT2
findBasename(const char *filename);
/*
* UToolMemory is used for generic, custom memory management.
* It is allocated with enough space for count*size bytes starting
* at array.
* The array is declared with a union of large data types so
* that its base address is aligned for any types.
* If size is a multiple of a data type size, then such items
* can be safely allocated inside the array, at offsets that
* are themselves multiples of size.
*/
struct UToolMemory;
typedef struct UToolMemory UToolMemory;
/**
* Open a UToolMemory object for allocation of initialCapacity to maxCapacity
* items with size bytes each.
*/
U_CAPI UToolMemory * U_EXPORT2
utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size);
/**
* Close a UToolMemory object.
*/
U_CAPI void U_EXPORT2
utm_close(UToolMemory *mem);
/**
* Get the pointer to the beginning of the array of items.
* The pointer becomes invalid after allocation of new items.
*/
U_CAPI void * U_EXPORT2
utm_getStart(UToolMemory *mem);
/**
* Get the current number of items.
*/
U_CAPI int32_t U_EXPORT2
utm_countItems(UToolMemory *mem);
/**
* Allocate one more item and return the pointer to its start in the array.
*/
U_CAPI void * U_EXPORT2
utm_alloc(UToolMemory *mem);
/**
* Allocate n items and return the pointer to the start of the first one in the array.
*/
U_CAPI void * U_EXPORT2
utm_allocN(UToolMemory *mem, int32_t n);
#endif

View File

@ -136,9 +136,15 @@
<File
RelativePath=".\ucbuf.c">
</File>
<File
RelativePath=".\ucm.c">
</File>
<File
RelativePath=".\ucmpwrit.c">
</File>
<File
RelativePath=".\ucmstate.c">
</File>
<File
RelativePath=".\unewdata.c">
</File>
@ -161,6 +167,9 @@
<File
RelativePath=".\ucbuf.h">
</File>
<File
RelativePath=".\ucm.h">
</File>
<File
RelativePath=".\ucmpwrit.h">
</File>

View File

@ -0,0 +1,910 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucm.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jun20
* created by: Markus W. Scherer
*
* This file reads a .ucm file, stores its mappings and sorts them.
* It implements handling of Unicode conversion mappings from .ucm files
* for makeconv, canonucm, rptp2ucm, etc.
*
* Unicode code point sequences with a length of more than 1,
* as well as byte sequences with more than 4 bytes or more than one complete
* character sequence are handled to support m:n mappings.
*/
#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "cmemory.h"
#include "uarrsort.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include "uparse.h"
#include "ucm.h"
#include <stdio.h>
/* -------------------------------------------------------------------------- */
/*
### TODO
allow file without fallback indicators for backward compatibility
only for makeconv
must not sort such mappings
disallow when using extension tables because that requires sorting
rptp2ucm has its own mapping parser and sets all-|1 and |3 mappings; normalization function generates |0 and |2
*/
/*
 * Print one mapping in .ucm syntax:
 *   <Uxxxx>... \xhh... |f
 * The precision flag "|f" is printed only if m->f is not negative.
 *
 * m          supplies the lengths (uLen, bLen) and the flag (f)
 * codePoints the m->uLen code points of the mapping
 * bytes      the m->bLen bytes of the mapping
 * f          output stream
 */
static void
printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
    int32_t j;

    for(j=0; j<m->uLen; ++j) {
        /* %lX requires an unsigned long argument; UChar32 need not be that wide */
        fprintf(f, "<U%04lX>", (unsigned long)codePoints[j]);
    }

    fputc(' ', f);

    for(j=0; j<m->bLen; ++j) {
        fprintf(f, "\\x%02X", (unsigned int)bytes[j]);
    }

    if(m->f>=0) {
        /* same for %lu; the value is known to be non-negative here */
        fprintf(f, " |%lu\n", (unsigned long)m->f);
    } else {
        fputs("\n", f);
    }
}
/*
 * Print the mapping m, which belongs to table, to the stream f.
 * The code points and bytes of the mapping are looked up via the table.
 */
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
    UChar32 *codePoints=UCM_GET_CODE_POINTS(table, m);
    uint8_t *bytes=UCM_GET_BYTES(table, m);

    printMapping(m, codePoints, bytes, f);
}
/*
 * Print all mappings of the table to the stream f.
 *
 * byUnicode!=0: print the mappings in the order in which they are stored.
 * byUnicode==0: print them in the order given by table->reverseMap,
 *               which holds indexes into the mappings array.
 */
U_CAPI void U_EXPORT2
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
    UCMapping *base=table->mappings;
    int32_t count=table->mappingsLength;
    int32_t k;

    if(!byUnicode) {
        const int32_t *order=table->reverseMap;

        for(k=0; k<count; ++k) {
            ucm_printMapping(table, base+order[k], f);
        }
        return;
    }

    for(k=0; k<count; ++k) {
        ucm_printMapping(table, base+k, f);
    }
}
/* mapping comparisons ------------------------------------------------------ */
/*
 * Compare the Unicode sides of two mappings (which may live in different
 * tables): code point by code point, and by length if one sequence is
 * a prefix of the other.
 * Returns <0, 0 or >0.
 */
static int32_t
compareUnicode(UCMTable *lTable, const UCMapping *l,
               UCMTable *rTable, const UCMapping *r) {
    const UChar32 *left, *right;
    int32_t common, k, diff;

    /* fast path: both sides are single code points stored directly in u */
    if(l->uLen==1 && r->uLen==1) {
        return l->u-r->u;
    }

    left=UCM_GET_CODE_POINTS(lTable, l);
    right=UCM_GET_CODE_POINTS(rTable, r);

    /* only the common prefix can be compared unit by unit */
    common= l->uLen<=r->uLen ? l->uLen : r->uLen;

    for(k=0; k<common; ++k) {
        diff=left[k]-right[k];
        if(diff!=0) {
            return diff;
        }
    }

    /* equal prefixes: the shorter sequence sorts first */
    return l->uLen-r->uLen;
}
/*
 * Compare the byte sides of two mappings (which may live in different tables).
 *
 * lexical=TRUE: compare byte by byte, then by length. This is used for
 * sorting in the builder, to allow an efficient search for a byte sequence
 * that could be a prefix of a previously entered byte sequence.
 *
 * lexical=FALSE: compare by length first, then byte by byte, for
 * compatibility with old .ucm tools like canonucm and rptp2ucm.
 *
 * Returns <0, 0 or >0.
 */
static int32_t
compareBytes(UCMTable *lTable, const UCMapping *l,
             UCMTable *rTable, const UCMapping *r,
             UBool lexical) {
    const uint8_t *left, *right;
    int32_t lengthDiff, limit, k, diff;

    lengthDiff=l->bLen-r->bLen;
    if(!lexical && lengthDiff!=0) {
        /* different lengths decide immediately */
        return lengthDiff;
    }

    /* the number of bytes that both sequences have */
    limit= lengthDiff<=0 ? l->bLen : r->bLen;

    left=UCM_GET_BYTES(lTable, l);
    right=UCM_GET_BYTES(rTable, r);

    for(k=0; k<limit; ++k) {
        diff=left[k]-right[k];
        if(diff!=0) {
            return diff;
        }
    }

    /* equal prefixes: the shorter sequence sorts first */
    return lengthDiff;
}
/*
 * Full ordering of two mappings of the same table, for sorting.
 * uFirst=TRUE: Unicode, then bytes (by length first, like canonucm).
 * uFirst=FALSE: bytes (lexically, for the builder), then Unicode.
 * Mappings with equal Unicode and bytes are ordered by their flags.
 */
static int32_t
compareMappings(UCMTable *table, const void *left, const void *right, UBool uFirst) {
    const UCMapping *lm=(const UCMapping *)left;
    const UCMapping *rm=(const UCMapping *)right;
    int32_t diff;

    if(uFirst) {
        diff=compareUnicode(table, lm, table, rm);
        if(diff!=0) {
            return diff;
        }
        diff=compareBytes(table, lm, table, rm, FALSE);
    } else {
        diff=compareBytes(table, lm, table, rm, TRUE);
        if(diff!=0) {
            return diff;
        }
        diff=compareUnicode(table, lm, table, rm);
    }

    /* last resort: the precision flags */
    return diff!=0 ? diff : lm->f-rm->f;
}
/* sorting by Unicode first sorts mappings directly */
static int32_t
compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
return compareMappings((UCMTable *)context, left, right, TRUE);
}
/* sorting by bytes first sorts the reverseMap; use indirection to mappings */
static int32_t
compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
UCMTable *table=(UCMTable *)context;
int32_t l=*(const int32_t *)left, r=*(const int32_t *)right;
return compareMappings(table, table->mappings+l, table->mappings+r, FALSE);
}
/*
 * Sort a table in both directions:
 * the mappings array itself is sorted by Unicode first,
 * and reverseMap is (re)built as an index array sorted by bytes first.
 * Exits the process if memory cannot be allocated or sorting fails.
 */
U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t) {
    UErrorCode status=U_ZERO_ERROR;
    int32_t count, k;

    /* step 1: order the mappings themselves, Unicode first */
    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
                   compareMappingsUnicodeFirst, t,
                   FALSE, &status);

    if(t->reverseMap==NULL) {
        /*
         * Size the reverseMap by mappingsCapacity rather than mappingsLength
         * so that it does not need to be reallocated every time
         * a mapping is added (see moveMappings() and ucm_addMapping()).
         */
        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
        if(t->reverseMap==NULL) {
            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    /* start with the identity permutation */
    count=t->mappingsLength;
    for(k=0; k<count; ++k) {
        t->reverseMap[k]=k;
    }

    /* step 2: order the indexes by the mappings they refer to, bytes first */
    uprv_sortArray(t->reverseMap, count, sizeof(int32_t),
                   compareMappingsBytesFirst, t,
                   FALSE, &status);

    if(U_FAILURE(status)) {
        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
                u_errorName(status));
        exit(status);
    }
}
/*
### TODO normalization function for a table (in or for rptp2ucm)
sort table
if there are mappings with the same code points and bytes but |1 and |3, merge them into one |0 (or make |2 where necessary)
if mappings were merged, sort again
-> for rptp2ucm
*/
/* lookups ------------------------------------------------------------------ */
/*
### TODO lookups?
binary search for first mapping with some code point or byte sequence
check if a code point is the first of any mapping (RT or FB)
check if a byte sequence is a prefix of any mapping (RT or RFB)
check if there is a mapping with the same source units; return whether the target is same or different
*/
/*
 * Temporary marker bits that the base/extension checks OR into UCMapping.f;
 * moveMappings() acts on them and clears them again.
 * They lie above the precision flag values 0..3.
 */
enum {
/* set on a base mapping that is to be moved into the extension table */
MOVE_TO_EXT=0x10,
/* set on an extension mapping that duplicates a base mapping and is to be dropped */
REMOVE_MAPPING=0x20,
/* mask of both marker bits */
MOVE_ANY=0x30
};
/*
 * Remove every mapping from the base table that has one of the MOVE_ANY bits
 * ored into its flag. Mappings marked with MOVE_TO_EXT are first copied
 * into the extension table, if one is given (ext may be NULL).
 * If anything was removed, the affected tables are sorted again.
 */
static void
moveMappings(UCMTable *base, UCMTable *ext) {
    UCMapping *cur, *limit;
    int8_t oldFlag;
    UBool changed=FALSE;

    cur=base->mappings;
    limit=cur+base->mappingsLength;

    while(cur<limit) {
        oldFlag=cur->f;
        if((oldFlag&MOVE_ANY)==0) {
            /* keep this mapping */
            ++cur;
            continue;
        }

        /* strip the marker bits so that the original flag value is restored */
        cur->f=oldFlag&~MOVE_ANY;
        changed=TRUE;

        if((oldFlag&MOVE_TO_EXT) && ext!=NULL) {
            /* copy the mapping into the extension table before it is overwritten */
            ucm_addMapping(ext, cur, UCM_GET_CODE_POINTS(base, cur), UCM_GET_BYTES(base, cur));
        }

        /*
         * Shrink the table by one and fill the hole with the former last
         * mapping; cur is not advanced so that the moved-down mapping
         * is examined next.
         */
        --limit;
        --base->mappingsLength;
        if(cur<limit) {
            uprv_memcpy(cur, limit, sizeof(UCMapping));
        }
    }

    if(!changed) {
        return;
    }

    /* filling holes from the end destroyed the sort order */
    ucm_sortTable(base);
    ucm_printTable(base, stdout, TRUE); puts(""); /* ### TODO */
    if(ext!=NULL) {
        ucm_sortTable(ext);
        ucm_printTable(ext, stdout, TRUE); puts(""); /* ### TODO */
    }
}
/* result bits returned by checkBaseExtUnicode() and checkBaseExtBytes() */
enum {
/* at least one mapping was marked with MOVE_TO_EXT or REMOVE_MAPPING */
NEEDS_MOVE=1,
/* at least one conflict was reported to stderr */
HAS_ERRORS=2
};
/*
 * Check the base table against the extension table in the
 * from-Unicode direction. Both tables must be sorted (Unicode first).
 *
 * Only mappings with flags |0..|2 are considered.
 * The two sorted arrays are walked in parallel:
 * - A base mapping whose code points are a proper prefix of an extension
 *   mapping's code points is marked MOVE_TO_EXT if moveToExt,
 *   otherwise it is reported as an error.
 * - An extension mapping with the same code points as a base mapping is
 *   marked REMOVE_MAPPING if flag and bytes are identical too,
 *   otherwise the pair is reported as an error.
 *
 * @return a combination of NEEDS_MOVE and HAS_ERRORS bits, or 0
 */
static uint8_t
checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
    UCMapping *mb, *me, *mbLimit, *meLimit;
    int32_t cmp;
    uint8_t result;

    mb=base->mappings;
    mbLimit=mb+base->mappingsLength;

    me=ext->mappings;
    meLimit=me+ext->mappingsLength;

    result=0;

    for(;;) {
        /* skip irrelevant mappings on both sides */
        for(;;) {
            if(mb==mbLimit) {
                return result;
            }

            if(0<=mb->f && mb->f<=2) {
                break;
            }

            ++mb;
        }

        for(;;) {
            if(me==meLimit) {
                return result;
            }

            if(0<=me->f && me->f<=2) {
                break;
            }

            ++me;
        }

        /* compare the base and extension mappings */
        cmp=compareUnicode(base, mb, ext, me);
        if(cmp<0) {
            /* does mb map from an input sequence that is a prefix of me's? */
            if( mb->uLen<me->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
            ) {
                if(moveToExt) {
                    /* mark this mapping to be moved to the extension table */
                    mb->f|=MOVE_TO_EXT;
                    result|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, mb, stderr);
                    ucm_printMapping(ext, me, stderr);
                    /*
                     * Nothing was marked for moving, and the conflict cannot
                     * be repaired: report an error, same as checkBaseExtBytes().
                     */
                    result|=HAS_ERRORS;
                }
            }

            ++mb;
        } else if(cmp==0) {
            /*
             * same output: remove the extension mapping,
             * otherwise treat as an error
             */
            if( mb->f==me->f && mb->bLen==me->bLen &&
                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
            ) {
                me->f|=REMOVE_MAPPING;
                result|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, mb, stderr);
                ucm_printMapping(ext, me, stderr);
                result|=HAS_ERRORS;
            }

            ++mb;
        } else /* cmp>0 */ {
            ++me;
        }
    }
}
/*
 * Check the base table against the extension table in the
 * to-Unicode direction. Both tables must be sorted so that their
 * reverseMaps are available.
 *
 * Only mappings with flags |0 and |3 are considered.
 * The two tables are walked in parallel in reverseMap (bytes-first) order.
 *
 * @return a combination of NEEDS_MOVE and HAS_ERRORS bits, or 0
 */
static uint8_t
checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
    const int32_t *baseOrder=base->reverseMap;
    const int32_t *extOrder=ext->reverseMap;
    UCMapping *bm, *em;
    int32_t bi=0, ei=0;
    int32_t baseCount=base->mappingsLength;
    int32_t extCount=ext->mappingsLength;
    int32_t order;
    uint8_t status=0;
    UBool siso=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);

    for(;;) {
        /* find the next relevant base mapping */
        for(;; ++bi) {
            if(bi==baseCount) {
                return status;
            }
            bm=base->mappings+baseOrder[bi];
            if(bm->f==0 || bm->f==3) {
                break;
            }
        }

        /* find the next relevant extension mapping */
        for(;; ++ei) {
            if(ei==extCount) {
                return status;
            }
            em=ext->mappings+extOrder[ei];
            if(em->f==0 || em->f==3) {
                break;
            }
        }

        order=compareBytes(base, bm, ext, em, TRUE);
        if(order>0) {
            /* the extension mapping sorts first: try the next one */
            ++ei;
            continue;
        }

        if(order<0) {
            /*
             * Are the base bytes a proper prefix of the extension bytes?
             * In an SI/SO table, a single byte never counts as a prefix
             * because it occurs in a separate single-byte state.
             */
            if( bm->bLen<em->bLen &&
                !(siso && bm->bLen<=1) &&
                0==uprv_memcmp(UCM_GET_BYTES(base, bm), UCM_GET_BYTES(ext, em), bm->bLen)
            ) {
                if(moveToExt) {
                    /* the base mapping has to go into the extension table */
                    bm->f|=MOVE_TO_EXT;
                    status|=NEEDS_MOVE;
                } else {
                    fprintf(stderr,
                            "ucm error: the base table contains a mapping whose input sequence\n"
                            "           is a prefix of the input sequence of an extension mapping\n");
                    ucm_printMapping(base, bm, stderr);
                    ucm_printMapping(ext, em, stderr);
                    status|=HAS_ERRORS;
                }
            }
        } else {
            /*
             * Same bytes on both sides:
             * an identical mapping is redundant in the extension table,
             * a different one is a conflict.
             */
            if( bm->f==em->f && bm->uLen==em->uLen &&
                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, bm), UCM_GET_CODE_POINTS(ext, em), 4*bm->uLen)
            ) {
                em->f|=REMOVE_MAPPING;
                status|=NEEDS_MOVE;
            } else {
                fprintf(stderr,
                        "ucm error: the base table contains a mapping whose input sequence\n"
                        "           is the same as the input sequence of an extension mapping\n"
                        "           but it maps differently\n");
                ucm_printMapping(base, bm, stderr);
                ucm_printMapping(ext, em, stderr);
                status|=HAS_ERRORS;
            }
        }

        /* order<=0: done with this base mapping */
        ++bi;
    }
}
/*
 * Run the bytes of every mapping in the table through ucm_countChars()
 * with the given states. Each mapping for which the count is less than 1
 * is printed to stderr.
 *
 * @return TRUE if no mapping was rejected
 */
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
    UCMapping *m;
    int32_t k, total=table->mappingsLength;
    UBool valid=TRUE;

    for(k=0; k<total; ++k) {
        m=table->mappings+k;
        if(ucm_countChars(baseStates, UCM_GET_BYTES(table, m), m->bLen)<1) {
            ucm_printMapping(table, m, stderr);
            valid=FALSE;
        }
    }

    return valid;
}
/*
 * Check a base table against an extension table in both mapping directions
 * and resolve the conflicts that can be resolved:
 * redundant extension mappings are removed, and, if moveToExt,
 * base mappings whose input is a prefix of an extension mapping's input
 * are moved into the extension table.
 *
 * Both tables must consist only of mappings with explicit precision flags.
 * Both tables are sorted as a side effect.
 *
 * @return FALSE if the tables lack precision flags or if
 *         an irreparable conflict was found (details are printed to stderr)
 */
U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
    uint8_t result;

    /* if we have an extension table, we must always use precision flags */
    if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) {
        fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n");
        return FALSE;
    }

    /* checking requires both tables to be sorted */
    ucm_sortTable(base);
    ucm_sortTable(ext);

    /*
     * Run the two checks in a defined order, Unicode first.
     * Each check ORs marker bits into the mappings' flags, and each check
     * skips mappings based on their flag values, so the outcome depends on
     * which one runs first. As operands of a single | expression their
     * evaluation order would be unspecified.
     */
    result=checkBaseExtUnicode(base, ext, moveToExt);
    result|=checkBaseExtBytes(baseStates, base, ext, moveToExt);

    if(result&HAS_ERRORS) {
        return FALSE;
    }

    if(result&NEEDS_MOVE) {
        /* first drop redundant extension mappings, then move base mappings over */
        moveMappings(ext, NULL);
        moveMappings(base, ext);
    }

    return TRUE;
}
/* ucm parser --------------------------------------------------------------- */
/* return the value 0..15 of a hexadecimal digit character, or -1 for any other character */
static int32_t
hexDigitValue(char c) {
    if('0'<=c && c<='9') {
        return c-'0';
    } else if('a'<=c && c<='f') {
        return c-'a'+10;
    } else if('A'<=c && c<='F') {
        return c-'A'+10;
    } else {
        return -1;
    }
}

/*
 * Parse a sequence of bytes in .ucm syntax: \xXX[[+]\xXX]...
 * Each byte must be written with exactly two hexadecimal digits.
 * Parsing stops at the first character that does not start another byte.
 *
 * @param bytes output array for up to UCNV_EXT_MAX_BYTES bytes
 * @param line  the whole input line, used only for error messages
 * @param ps    in/out: *ps is the parse position; it is updated only on success
 * @return the number of bytes parsed (may be 0), or -1 on a syntax error
 *         (a message has been printed to stderr)
 */
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
    const char *s=*ps;
    int32_t hi, lo;
    int8_t bLen;

    bLen=0;
    for(;;) {
        /* skip an optional plus sign */
        if(bLen>0 && *s=='+') {
            ++s;
        }
        if(*s!='\\') {
            break;
        }

        if(bLen==UCNV_EXT_MAX_BYTES) {
            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
            return -1;
        }

        /*
         * Check the digits directly rather than with strtoul(), which would
         * also accept leading white space or a sign in place of a digit.
         * The || chain never reads beyond the terminating NUL.
         * A third hex digit is rejected as before.
         */
        if( s[1]!='x' ||
            (hi=hexDigitValue(s[2]))<0 ||
            (lo=hexDigitValue(s[3]))<0 ||
            hexDigitValue(s[4])>=0
        ) {
            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
            return -1;
        }

        bytes[bLen]=(uint8_t)((hi<<4)|lo);
        ++bLen;
        s+=4;
    }

    *ps=s;
    return bLen;
}
/*
 * Parse one mapping line of a .ucm file; the line must not be empty.
 * Syntax: <UXXXX>[[+]<UXXXX>]... \xXX[[+]\xXX]... [|f]
 *
 * On success, codePoints[] and bytes[] contain the parsed sequences and
 * m->uLen, m->bLen and m->f are set (f=-1 if there is no |f indicator).
 * m->u is set only for a single code point, and m->b.bytes only for
 * up to 4 bytes; ucm_addMapping() stores longer sequences.
 *
 * @return FALSE on a syntax error (a message has been printed to stderr)
 */
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
                     const char *line) {
    const char *p=line;
    char *stop;
    int32_t utf16Length;
    int8_t cpCount=0, byteCount=0, flag;

    /* Unicode side */
    for(;;) {
        /* a plus sign may separate code points */
        if(cpCount>0 && *p=='+') {
            ++p;
        }
        if(*p!='<') {
            break;
        }

        if(cpCount==UCNV_EXT_MAX_UCHARS) {
            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
            return FALSE;
        }

        stop=NULL;
        if(p[1]=='U') {
            codePoints[cpCount]=(UChar32)uprv_strtoul(p+2, &stop, 16);
        }
        if(p[1]!='U' || stop==p+2 || *stop!='>') {
            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
            return FALSE;
        }

        if(U_IS_SURROGATE(codePoints[cpCount]) || (uint32_t)codePoints[cpCount]>0x10ffff) {
            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
            return FALSE;
        }

        ++cpCount;
        p=stop+1; /* continue after the '>' */
    }

    switch(cpCount) {
    case 0:
        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
        return FALSE;
    case 1:
        /* a single code point is stored directly in the mapping */
        m->u=codePoints[0];
        break;
    default:
        {
            /* preflight the UTF-16 length of the sequence */
            UErrorCode errorCode=U_ZERO_ERROR;

            u_strFromUTF32(NULL, 0, &utf16Length, codePoints, cpCount, &errorCode);
            if( utf16Length>UCNV_EXT_MAX_UCHARS ||
                (errorCode!=U_BUFFER_OVERFLOW_ERROR && U_FAILURE(errorCode))
            ) {
                fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
                return FALSE;
            }
        }
        break;
    }

    p=u_skipWhitespace(p);

    /* charset side */
    byteCount=ucm_parseBytes(bytes, line, &p);
    if(byteCount<0) {
        return FALSE;
    }
    if(byteCount==0) {
        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
        return FALSE;
    }
    if(byteCount<=4) {
        /* up to 4 bytes are stored directly in the mapping */
        uprv_memcpy(m->b.bytes, bytes, byteCount);
    }

    /* ignore everything before the fallback indicator, even the start of a comment */
    while(*p!=0 && *p!='|') {
        ++p;
    }
    if(*p==0) {
        flag=-1; /* no fallback indicator */
    } else {
        flag=(int8_t)(p[1]-'0');
        if((uint8_t)flag>3) {
            fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
            return FALSE;
        }
    }

    m->uLen=cpCount;
    m->bLen=byteCount;
    m->f=flag;
    return TRUE;
}
/* general APIs ------------------------------------------------------------- */
/*
 * Allocate a new, empty (all-zero) mapping table.
 * Exits the process if the allocation fails; never returns NULL.
 */
U_CAPI UCMTable * U_EXPORT2
ucm_openTable() {
    UCMTable *t;

    t=(UCMTable *)uprv_malloc(sizeof(UCMTable));
    if(t==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }

    /* zero lengths, capacities and pointers: everything is allocated on demand */
    memset(t, 0, sizeof(UCMTable));
    return t;
}
/* Free a mapping table together with all of the arrays that it owns; NULL is ignored. */
U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable *table) {
    if(table==NULL) {
        return;
    }

    /* the arrays first, then the table structure itself */
    uprv_free(table->mappings);
    uprv_free(table->codePoints);
    uprv_free(table->bytes);
    uprv_free(table->reverseMap);
    uprv_free(table);
}
/*
 * Append a copy of the mapping *m to the table.
 *
 * Sequences of more than one code point and of more than 4 bytes are copied
 * from codePoints[]/bytes[] into the table's side arrays, and m->u/m->b.index
 * are changed to their start indexes there (*m itself is modified).
 * The table's unicodeMask and flagsType are updated.
 * Exits the process if memory cannot be allocated or a side array is full.
 */
U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable *table,
               UCMapping *m,
               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
    UChar32 cp;
    int32_t start, k;

    /* grow the mappings array if it is full */
    if(table->mappingsLength>=table->mappingsCapacity) {
        table->mappingsCapacity=
            table->mappingsCapacity==0 ? 1000 : table->mappingsCapacity*10;

        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
                                                  table->mappingsCapacity*sizeof(UCMapping));
        if(table->mappings==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
                    table->mappingsCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        /* a reverseMap of the old capacity is too small: have the next sort allocate a new one */
        if(table->reverseMap!=NULL) {
            uprv_free(table->reverseMap);
            table->reverseMap=NULL;
        }
    }

    /* allocate the side arrays when they are needed for the first time */
    if(table->codePointsCapacity==0 && m->uLen>1) {
        table->codePointsCapacity=10000;
        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4);
        if(table->codePoints==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
                    table->codePointsCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    if(table->bytesCapacity==0 && m->bLen>4) {
        table->bytesCapacity=10000;
        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
        if(table->bytes==NULL) {
            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
                    table->bytesCapacity);
            exit(U_MEMORY_ALLOCATION_ERROR);
        }
    }

    /* store multiple code points in the side array; u becomes their index */
    if(m->uLen>1) {
        start=table->codePointsLength;
        table->codePointsLength=start+m->uLen;
        if(table->codePointsLength>table->codePointsCapacity) {
            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        uprv_memcpy(table->codePoints+start, codePoints, m->uLen*4);
        m->u=start;
    }

    /* store more than 4 bytes in the side array; b.index becomes their index */
    if(m->bLen>4) {
        start=table->bytesLength;
        table->bytesLength=start+m->bLen;
        if(table->bytesLength>table->bytesCapacity) {
            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
            exit(U_MEMORY_ALLOCATION_ERROR);
        }

        uprv_memcpy(table->bytes+start, bytes, m->bLen);
        m->b.index=start;
    }

    /* record which kinds of code points occur in the table */
    for(k=0; k<m->uLen; ++k) {
        cp=codePoints[k];
        if(cp>=0x10000) {
            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
        } else if(U_IS_SURROGATE(cp)) {
            table->unicodeMask|=UCNV_HAS_SURROGATES;    /* there are surrogate code points */
        }
    }

    /* record whether mappings come with or without precision flags */
    table->flagsType|= m->f<0 ? UCM_FLAGS_IMPLICIT : UCM_FLAGS_EXPLICIT;

    /* finally, append the mapping itself */
    uprv_memcpy(table->mappings+table->mappingsLength, m, sizeof(UCMapping));
    ++table->mappingsLength;
}
/*
 * Allocate a UCMFile with empty base and extension tables and
 * with initial values for its state table data.
 * Exits the process if an allocation fails; never returns NULL.
 */
U_CAPI UCMFile * U_EXPORT2
ucm_open() {
    UCMFile *file;
    UCMStates *states;

    file=(UCMFile *)uprv_malloc(sizeof(UCMFile));
    if(file==NULL) {
        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
        exit(U_MEMORY_ALLOCATION_ERROR);
    }
    memset(file, 0, sizeof(UCMFile));

    file->base=ucm_openTable();
    file->ext=ucm_openTable();

    /* non-zero defaults for the states */
    states=&file->states;
    states->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
    states->conversionType=UCNV_UNSUPPORTED_CONVERTER;
    states->outputType=-1;
    states->minCharLength=states->maxCharLength=1;

    return file;
}
/*
 * Free a UCMFile together with its base and extension tables; NULL is ignored.
 */
U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm) {
    if(ucm!=NULL) {
        /*
         * Use ucm_closeTable() rather than uprv_free() so that the tables'
         * mappings, codePoints, bytes and reverseMap arrays are released too.
         */
        ucm_closeTable(ucm->base);
        ucm_closeTable(ucm->ext);
        uprv_free(ucm);
    }
}
/*
 * Parse one mapping line and add the mapping to the base or
 * extension table of the UCMFile.
 *
 * If baseStates is given, the bytes are checked against it and must form
 * at least one character; otherwise the mapping is printed to stderr
 * and rejected.
 *
 * @return FALSE if the line could not be parsed or the bytes are invalid
 */
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
    UCMapping m={ 0 };
    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
    uint8_t bytes[UCNV_EXT_MAX_BYTES];
    UCMTable *target;
    int32_t count;
    UBool isSubchar1Sub, isZeroByteFallback;

    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
        return FALSE;
    }

    if(baseStates==NULL) {
        /* not used - adding a mapping for an extension-only table before its base table is read */
        count=0;
    } else {
        /* check validity of the bytes and count the characters in them */
        count=ucm_countChars(baseStates, bytes, m.bLen);
        if(count<1) {
            /* illegal byte sequence */
            printMapping(&m, codePoints, bytes, stderr);
            return FALSE;
        }
    }

    /*
     * The mapping goes into the base table only if this is requested
     * and it is a 1:1 mapping; everything else goes into the extension table.
     *
     * Two kinds of 1:1 mappings go into the extension table as well:
     * |2 SUB mappings for <subchar1>, and
     * |1 fallbacks from something other than U+0000 to 0x00.
     */
    target=ucm->ext;
    if(forBase && m.uLen==1 && count==1) {
        isSubchar1Sub=(UBool)(m.f==2 && m.bLen==1 && ucm->states.maxCharLength>1);
        isZeroByteFallback=(UBool)(
            m.f==1 && m.bLen==1 && bytes[0]==0 && !(m.uLen==1 && codePoints[0]==0));

        if(!isSubchar1Sub && !isZeroByteFallback) {
            target=ucm->base;
        }
    }

    ucm_addMapping(target, &m, codePoints, bytes);
    return TRUE;
}

View File

@ -0,0 +1,217 @@
/*
*******************************************************************************
*
* Copyright (C) 2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: ucm.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003jun20
* created by: Markus W. Scherer
*
* Definitions for the .ucm file parser and handler module ucm.c.
*/
#ifndef __UCM_H__
#define __UCM_H__
#include "unicode/utypes.h"
#include "ucnvmbcs.h"
#include "ucnv_ext.h"
#include <stdio.h>
U_CDECL_BEGIN
/*
* Per-mapping data structure
*
* u if uLen==1: Unicode code point
* else index to uLen code points
* b if bLen<=4: up to 4 bytes
* else index to bLen bytes
* uLen number of code points
* bLen number of charset bytes
* bIsMultipleChars indicates that the bytes contain more than one sequence
* according to the state table
* (note: the struct below does not declare such a field)
* f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
* same values as in the source file after |
*/
/* One mapping between a Unicode code point sequence and a charset byte sequence. */
typedef struct UCMapping {
/* if uLen==1: the code point itself; else: start index into UCMTable.codePoints */
UChar32 u;
/* if bLen<=4: the bytes themselves; else: start index into UCMTable.bytes */
union {
uint32_t index;
uint8_t bytes[4];
} b;
/*
 * uLen: number of code points; bLen: number of bytes;
 * f: precision flag 0..3 as after | in the .ucm file, or -1 if there was none
 */
int8_t uLen, bLen, f;
} UCMapping;
/*
 * Values for UCMTable.flagsType.
 * ucm_addMapping() ORs UCM_FLAGS_EXPLICIT or UCM_FLAGS_IMPLICIT into flagsType,
 * so the numeric values (0, 1, 2, 3) are significant:
 * UCM_FLAGS_EXPLICIT|UCM_FLAGS_IMPLICIT==UCM_FLAGS_MIXED.
 */
enum {
UCM_FLAGS_INITIAL, /* no mappings parsed yet */
UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
UCM_FLAGS_MIXED /* both implicit and explicit */
};
/*
 * A growable list of mappings, with side arrays for the code point and
 * byte sequences that do not fit into a UCMapping itself.
 * Created with ucm_openTable() and freed with ucm_closeTable().
 */
typedef struct UCMTable {
/* the mappings; ucm_sortTable() sorts this array by Unicode first */
UCMapping *mappings;
int32_t mappingsCapacity, mappingsLength;
/* code point sequences of mappings with uLen>1, indexed by UCMapping.u */
UChar32 *codePoints;
int32_t codePointsCapacity, codePointsLength;
/* byte sequences of mappings with bLen>4, indexed by UCMapping.b.index */
uint8_t *bytes;
int32_t bytesCapacity, bytesLength;
/* index map for mapping by bytes first */
/* built by ucm_sortTable(); NULL until then and after the mappings array was reallocated */
int32_t *reverseMap;
/* UCNV_HAS_SUPPLEMENTARY and UCNV_HAS_SURROGATES bits, set by ucm_addMapping() */
uint8_t unicodeMask;
int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
} UCMTable;
/*
 * Values for the elements of UCMStates.stateFlags[].
 * ucm_open() initializes stateFlags[0] to MBCS_STATE_FLAG_DIRECT.
 * NOTE(review): the precise meaning of each flag is defined by the state
 * table code, which is not part of this header - confirm there.
 */
enum {
MBCS_STATE_FLAG_DIRECT=1,
MBCS_STATE_FLAG_SURROGATES,
MBCS_STATE_FLAG_READY=16
};
/*
 * State table data of a .ucm file, used (among other things) by
 * ucm_countChars() to check byte sequences.
 */
typedef struct UCMStates {
/* one row of 256 entries (one per byte value) for each state */
int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
/* per-state values; stateFlags[] holds MBCS_STATE_FLAG_ constants */
uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
stateOffsetSum[MBCS_MAX_STATE_COUNT];
/* ucm_open() initializes minCharLength and maxCharLength to 1 */
int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
/* ucm_open() initializes these to UCNV_UNSUPPORTED_CONVERTER and -1 */
int8_t conversionType, outputType;
} UCMStates;
/*
 * The contents of one .ucm file: base and extension mapping tables
 * plus the state table data.
 * Created with ucm_open() and freed with ucm_close().
 */
typedef struct UCMFile {
/* both tables are allocated by ucm_open() */
UCMTable *base, *ext;
UCMStates states;
/* presumably the name of the base table for an extension-only file - TODO confirm */
char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
} UCMFile;
/* simple accesses ---------------------------------------------------------- */
/*
 * Get a pointer to the code points of mapping m in table t:
 * a single code point is stored in m->u itself,
 * a longer sequence in t->codePoints starting at index m->u.
 * Both macros evaluate m more than once.
 */
#define UCM_GET_CODE_POINTS(t, m) \
(((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
/*
 * Get a pointer to the bytes of mapping m in table t:
 * up to 4 bytes are stored in m->b.bytes itself,
 * a longer sequence in t->bytes starting at index m->b.index.
 */
#define UCM_GET_BYTES(t, m) \
(((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.index)
/* APIs --------------------------------------------------------------------- */
U_CAPI UCMFile * U_EXPORT2
ucm_open(void);
U_CAPI void U_EXPORT2
ucm_close(UCMFile *ucm);
U_CAPI UBool U_EXPORT2
ucm_parseHeaderLine(UCMFile *ucm,
char *line, char **pKey, char **pValue);
U_CAPI UBool U_EXPORT2
ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
U_CAPI UCMTable * U_EXPORT2
ucm_openTable(void);
U_CAPI void U_EXPORT2
ucm_closeTable(UCMTable *table);
U_CAPI void U_EXPORT2
ucm_sortTable(UCMTable *t);
/**
* Check the validity of mappings against a base table's states;
* necessary for extension-only tables that were read before their base tables.
*/
U_CAPI UBool U_EXPORT2
ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
/**
* Check a base table against an extension table.
* Set moveToExt=TRUE for where base and extension tables are parsed
* from a single file,
* and moveToExt=FALSE for where the extension table is in a separate file.
*
* For both tables in the same file, the extension table is automatically
* built.
* For separate files, the extension file can use a complete mapping table,
* so that common mappings need not be stripped out manually.
*
*
* Sort both tables, and then for each mapping direction:
*
* If the base table contains a mapping for which the input sequence is
* the same as the extension input, then
* - if the output is the same: remove the extension mapping
* - else: error
*
* If the base table contains a mapping for which the input sequence is
* a prefix of the extension input, then
* - if moveToExt: move the base mapping to the extension table
* - else: error
*
* @return FALSE in case of an irreparable error
*/
U_CAPI UBool U_EXPORT2
ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt);
U_CAPI void U_EXPORT2
ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
U_CAPI void U_EXPORT2
ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
U_CAPI void U_EXPORT2
ucm_addState(UCMStates *states, const char *s);
U_CAPI void U_EXPORT2
ucm_processStates(UCMStates *states);
U_CAPI int32_t U_EXPORT2
ucm_countChars(UCMStates *states,
const uint8_t *bytes, int32_t length);
U_CAPI int8_t U_EXPORT2
ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
U_CAPI UBool U_EXPORT2
ucm_parseMappingLine(UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES],
const char *line);
U_CAPI void U_EXPORT2
ucm_addMapping(UCMTable *table,
UCMapping *m,
UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
uint8_t bytes[UCNV_EXT_MAX_BYTES]);
/* very makeconv-specific functions ----------------------------------------- */
/* finalize and optimize states after the toUnicode mappings are processed */
U_CAPI void U_EXPORT2
ucm_optimizeStates(UCMStates *states,
uint16_t **pUnicodeCodeUnits,
_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
UBool verbose);
/* moved here because it is used inside ucmstate.c */
U_CAPI int32_t U_EXPORT2
ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
uint32_t offset);
U_CDECL_END
#endif

File diff suppressed because it is too large Load Diff