diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in index 61196bd839..48c40088ad 100644 --- a/icu4c/source/common/Makefile.in +++ b/icu4c/source/common/Makefile.in @@ -61,7 +61,7 @@ OBJECTS = putil.o uobject.o cmemory.o umutex.o \ udata.o ucmndata.o udatamem.o udataswp.o umapfile.o ucol_swp.o \ uresbund.o uresdata.o resbund.o ucat.o locmap.o uloc.o locid.o \ uhash.o uhash_us.o \ -ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_io.o ucnvlat1.o \ +ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_ext.o ucnv_io.o ucnvlat1.o \ ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \ ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o \ unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \ diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp index 7b66828194..3539840bfa 100644 --- a/icu4c/source/common/common.dsp +++ b/icu4c/source/common/common.dsp @@ -1347,6 +1347,14 @@ InputPath=.\unicode\ucnv_err.h # End Source File # Begin Source File +SOURCE=.\ucnv_ext.c +# End Source File +# Begin Source File + +SOURCE=.\ucnv_ext.h +# End Source File +# Begin Source File + SOURCE=.\ucnv_imp.h # End Source File # Begin Source File diff --git a/icu4c/source/common/common.vcproj b/icu4c/source/common/common.vcproj index 7e67cccac0..c72a6d0a73 100644 --- a/icu4c/source/common/common.vcproj +++ b/icu4c/source/common/common.vcproj @@ -730,6 +730,12 @@ Outputs="..\..\include\unicode\$(InputName).h"/> + + + + diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c index 9ec013109c..48639b2729 100644 --- a/icu4c/source/common/ucnv.c +++ b/icu4c/source/common/ucnv.c @@ -608,11 +608,14 @@ static void _reset(UConverter *converter, UConverterResetChoice choice, converter->mode = 0; converter->toULength = 0; converter->invalidCharLength = converter->UCharErrorBufferLength = 0; + converter->preToULength = 0; } if(choice!=UCNV_RESET_TO_UNICODE) { converter->fromUnicodeStatus = 0; 
converter->fromUChar32 = 0; converter->invalidUCharLength = converter->charErrorBufferLength = 0; + converter->preFromUFirstCP = U_SENTINEL; + converter->preFromULength = 0; } if (converter->sharedData->impl->reset != NULL) { @@ -811,6 +814,28 @@ _updateOffsets(int32_t *offsets, int32_t length, /* ucnv_fromUnicode --------------------------------------------------------- */ +/* + * Implementation note for m:n conversions + * + * While collecting source units to find the longest match for m:n conversion, + * some source units may need to be stored for a partial match. + * When a second buffer does not yield a match on all of the previously stored + * source units, then they must be "replayed", i.e., fed back into the converter. + * + * The code relies on the fact that replaying will not nest - + * converting a replay buffer will not result in a replay. + * This is because a replay is necessary only after the _continuation_ of a + * partial match failed, but a replay buffer is converted as a whole. + * It may result in some of its units being stored again for a partial match, + * but there will not be a continuation _during_ the replay which could fail. + * + * It is conceivable that a callback function could call the converter + * recursively in a way that causes another replay to be stored, but that + * would be an error in the callback function. + * Such violations will cause assertion failures in a debug build, + * and wrong output, but they will not cause a crash. 
+ */ + static void _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { UConverterFromUnicode fromUnicode; @@ -822,6 +847,12 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { int32_t errorInputLength; UBool converterSawEndOfInput, calledCallback; + /* variables for m:n conversion */ + UChar replay[UCNV_EXT_MAX_UCHARS]; + const UChar *realSource, *realSourceLimit; + int32_t realSourceIndex; + UBool realFlush; + cnv=pArgs->converter; s=pArgs->source; t=pArgs->target; @@ -841,6 +872,29 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { } } + if(cnv->preFromULength>=0) { + /* normal mode */ + realSource=NULL; + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. + */ + realSource=pArgs->source; + realSourceLimit=pArgs->sourceLimit; + realFlush=pArgs->flush; + realSourceIndex=sourceIndex; + + uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR); + pArgs->source=replay; + pArgs->sourceLimit=replay-cnv->preFromULength; + pArgs->flush=FALSE; + sourceIndex=-1; + + cnv->preFromULength=0; + } + /* * loop for conversion and error handling * @@ -897,7 +951,36 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { pArgs->offsets=offsets+=length; } - sourceIndex+=(int32_t)(pArgs->source-s); + if(sourceIndex>=0) { + sourceIndex+=(int32_t)(pArgs->source-s); + } + } + + if(cnv->preFromULength<0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if(realSource==NULL) { + realSource=pArgs->source; + realSourceLimit=pArgs->sourceLimit; + realFlush=pArgs->flush; + realSourceIndex=sourceIndex; + + uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR); + pArgs->source=replay; + 
pArgs->sourceLimit=replay-cnv->preFromULength; + pArgs->flush=FALSE; + if((sourceIndex+=cnv->preFromULength)<0) { + sourceIndex=-1; + } + + cnv->preFromULength=0; + } else { + /* see implementation note before _fromUnicodeWithCallback() */ + U_ASSERT(realSource==NULL); + *err=U_INTERNAL_PROGRAM_ERROR; + } } /* update pointers */ @@ -911,6 +994,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { * (continue converting by breaking out of only the inner loop) */ break; + } else if(realSource!=NULL) { + /* switch back from replaying to the real source and continue */ + pArgs->source=realSource; + pArgs->sourceLimit=realSourceLimit; + pArgs->flush=realFlush; + sourceIndex=realSourceIndex; + + realSource=NULL; + break; } else if(pArgs->flush && cnv->fromUChar32!=0) { /* * the entire input stream is consumed @@ -960,7 +1052,27 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) { * the check for buffer overflow is redundant but it is * a high-runner case and hopefully documents the intent * well + * + * if we were replaying, then the replay buffer must be + * copied back into the UConverter + * and the real arguments must be restored */ + if(realSource!=NULL) { + int32_t length; + + U_ASSERT(cnv->preFromULength==0); + + length=(int32_t)(pArgs->sourceLimit-pArgs->source); + if(length>0) { + uprv_memcpy(cnv->preFromU, pArgs->source, length*U_SIZEOF_UCHAR); + cnv->preFromULength=(int8_t)-length; + } + + pArgs->source=realSource; + pArgs->sourceLimit=realSourceLimit; + pArgs->flush=realFlush; + } + return; } } @@ -1079,7 +1191,7 @@ ucnv_fromUnicode(UConverter *cnv, cnv->charErrorBufferLength=0; } - if(!flush && s==sourceLimit) { + if(!flush && s==sourceLimit && cnv->preFromULength>=0) { /* the overflow buffer is emptied and there is no new input: we are done */ *target=t; return; @@ -1122,6 +1234,12 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { int32_t errorInputLength; UBool 
converterSawEndOfInput, calledCallback; + /* variables for m:n conversion */ + char replay[UCNV_EXT_MAX_BYTES]; + const char *realSource, *realSourceLimit; + int32_t realSourceIndex; + UBool realFlush; + cnv=pArgs->converter; s=pArgs->source; t=pArgs->target; @@ -1141,6 +1259,29 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { } } + if(cnv->preToULength>=0) { + /* normal mode */ + realSource=NULL; + } else { + /* + * Previous m:n conversion stored source units from a partial match + * and failed to consume all of them. + * We need to "replay" them from a temporary buffer and convert them first. + */ + realSource=pArgs->source; + realSourceLimit=pArgs->sourceLimit; + realFlush=pArgs->flush; + realSourceIndex=sourceIndex; + + uprv_memcpy(replay, cnv->preToU, -cnv->preToULength); + pArgs->source=replay; + pArgs->sourceLimit=replay-cnv->preToULength; + pArgs->flush=FALSE; + sourceIndex=-1; + + cnv->preToULength=0; + } + /* * loop for conversion and error handling * @@ -1202,7 +1343,36 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { pArgs->offsets=offsets+=length; } - sourceIndex+=(int32_t)(pArgs->source-s); + if(sourceIndex>=0) { + sourceIndex+=(int32_t)(pArgs->source-s); + } + } + + if(cnv->preToULength<0) { + /* + * switch the source to new replay units (cannot occur while replaying) + * after offset handling and before end-of-input and callback handling + */ + if(realSource==NULL) { + realSource=pArgs->source; + realSourceLimit=pArgs->sourceLimit; + realFlush=pArgs->flush; + realSourceIndex=sourceIndex; + + uprv_memcpy(replay, cnv->preToU, -cnv->preToULength); + pArgs->source=replay; + pArgs->sourceLimit=replay-cnv->preToULength; + pArgs->flush=FALSE; + if((sourceIndex+=cnv->preToULength)<0) { + sourceIndex=-1; + } + + cnv->preToULength=0; + } else { + /* see implementation note before _fromUnicodeWithCallback() */ + U_ASSERT(realSource==NULL); + *err=U_INTERNAL_PROGRAM_ERROR; + } } /* update pointers */ @@ 
-1216,6 +1386,15 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { * (continue converting by breaking out of only the inner loop) */ break; + } else if(realSource!=NULL) { + /* switch back from replaying to the real source and continue */ + pArgs->source=realSource; + pArgs->sourceLimit=realSourceLimit; + pArgs->flush=realFlush; + sourceIndex=realSourceIndex; + + realSource=NULL; + break; } else if(pArgs->flush && cnv->toULength>0) { /* * the entire input stream is consumed @@ -1265,7 +1444,27 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { * the check for buffer overflow is redundant but it is * a high-runner case and hopefully documents the intent * well + * + * if we were replaying, then the replay buffer must be + * copied back into the UConverter + * and the real arguments must be restored */ + if(realSource!=NULL) { + int32_t length; + + U_ASSERT(cnv->preToULength==0); + + length=(int32_t)(pArgs->sourceLimit-pArgs->source); + if(length>0) { + uprv_memcpy(cnv->preToU, pArgs->source, length); + cnv->preToULength=(int8_t)-length; + } + + pArgs->source=realSource; + pArgs->sourceLimit=realSourceLimit; + pArgs->flush=realFlush; + } + return; } } @@ -1379,7 +1578,7 @@ ucnv_toUnicode(UConverter *cnv, cnv->UCharErrorBufferLength=0; } - if(!flush && s==sourceLimit) { + if(!flush && s==sourceLimit && cnv->preToULength>=0) { /* the overflow buffer is emptied and there is no new input: we are done */ *target=t; return; diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c index a8999c01fe..b9290f6ff4 100644 --- a/icu4c/source/common/ucnv_bld.c +++ b/icu4c/source/common/ucnv_bld.c @@ -776,6 +776,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter, myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1; myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen; uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, 
myUConverter->subCharLen); + myUConverter->preFromUFirstCP = U_SENTINEL; if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) { myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err); diff --git a/icu4c/source/common/ucnv_bld.h b/icu4c/source/common/ucnv_bld.h index f52a5ef9ee..4d68e54850 100644 --- a/icu4c/source/common/ucnv_bld.h +++ b/icu4c/source/common/ucnv_bld.h @@ -20,6 +20,7 @@ #include "unicode/utypes.h" #include "unicode/ucnv.h" #include "unicode/ucnv_err.h" +#include "ucnv_ext.h" #include "udataswp.h" /* size of the overflow buffers in UConverter, enough for escaping callbacks */ @@ -168,12 +169,22 @@ struct UConverter { int8_t UCharErrorBufferLength; /* number of valid UChars in charErrorBuffer */ uint8_t subChar1; /* single-byte substitution character if different from subChar */ + UBool useSubChar1; uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */ char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */ uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */ UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */ UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */ + + /* fields for conversion extension */ + + /* store previous UChars/chars to continue partial matches */ + UChar32 preFromUFirstCP; /* >=0: partial match */ + UChar preFromU[UCNV_EXT_MAX_UCHARS]; + char preToU[UCNV_EXT_MAX_BYTES]; + int8_t preFromULength, preToULength; /* negative: replay */ + int8_t preToUFirstLength; /* length of first character */ }; U_CDECL_END /* end of UConverter */ diff --git a/icu4c/source/common/ucnv_cb.c b/icu4c/source/common/ucnv_cb.c index 9cbf25e607..bfa3eb1b8c 100644 --- a/icu4c/source/common/ucnv_cb.c +++ b/icu4c/source/common/ucnv_cb.c @@ -1,6 +1,6 @@ /* ********************************************************************** -* 
Copyright (C) 2000-2001, International Business Machines +* Copyright (C) 2000-2003, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * ucnv_cb.c: @@ -35,50 +35,16 @@ ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err) { - int32_t togo; - int8_t toerr; - int32_t i; - - if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */ - { - uprv_memcpy(args->target, source, length); - args->target += length; - if(args->offsets) /* set all the offsets to the same # */ - { - for(i=0;ioffsets++) = offsetIndex; - } - } + if(U_FAILURE(*err)) { + return; } - else - { - togo = (int32_t)(args->targetLimit - args->target); - uprv_memcpy(args->target, source, togo); - args->target += togo; - - if(args->offsets) - { - for(i=0;ioffsets++) = offsetIndex; - } - } - - /* Now, copy the remainder into the errbuff */ - source += togo; - toerr = (int8_t)(length - togo); - - uprv_memcpy(args->converter->charErrorBuffer + - args->converter->charErrorBufferLength, - source, - toerr * sizeof(source[0])); - args->converter->charErrorBufferLength += toerr; - - *err = U_BUFFER_OVERFLOW_ERROR; - - } + ucnv_fromUWriteBytes( + args->converter, + source, length, + &args->target, args->targetLimit, + &args->offsets, offsetIndex, + err); } U_CAPI void U_EXPORT2 @@ -232,55 +198,16 @@ ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args, int32_t offsetIndex, UErrorCode * err) { - int32_t togo; - int8_t toerr; - int32_t i; - - if(U_FAILURE(*err)) - { + if(U_FAILURE(*err)) { return; } - - if((args->targetLimit - args->target) >= length) /* If the buffer fits.. 
*/ - { - uprv_memcpy(args->target, source, length * sizeof(args->target[0]) ); - args->target += length; - if(args->offsets) /* set all the offsets to the same # */ - { - for(i=0;ioffsets++) = offsetIndex; - } - } - } - else - { - togo = (int32_t)(args->targetLimit - args->target); - - uprv_memcpy(args->target, source, togo * sizeof(args->target[0]) ); - args->target += togo; - - if(args->offsets) - { - for(i=0;ioffsets++) = offsetIndex; - } - } - - /* Now, copy the remainder into the errbuff */ - source += togo; - toerr = (int8_t)(length - togo); - - uprv_memcpy(args->converter->UCharErrorBuffer + - args->converter->UCharErrorBufferLength, - source, - toerr * sizeof(source[0])); - args->converter->UCharErrorBufferLength += toerr; - - *err = U_BUFFER_OVERFLOW_ERROR; - } + ucnv_toUWriteUChars( + args->converter, + source, length, + &args->target, args->targetLimit, + &args->offsets, offsetIndex, + err); } U_CAPI void U_EXPORT2 diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c index f53c336ca2..875ea1cdbe 100644 --- a/icu4c/source/common/ucnv_cnv.c +++ b/icu4c/source/common/ucnv_cnv.c @@ -79,6 +79,46 @@ ucnv_fromUWriteBytes(UConverter *cnv, } } +U_CFUNC void +ucnv_toUWriteUChars(UConverter *cnv, + const UChar *uchars, int32_t length, + UChar **target, const UChar *targetLimit, + int32_t **offsets, + int32_t sourceIndex, + UErrorCode *pErrorCode) { + UChar *t=*target; + int32_t *o; + + /* write UChars */ + if(offsets==NULL || (o=*offsets)==NULL) { + while(length>0 && t0 && t0) { + if(cnv!=NULL) { + t=cnv->UCharErrorBuffer; + cnv->UCharErrorBufferLength=(int8_t)length; + do { + *t++=*uchars++; + } while(--length>0); + } + *pErrorCode=U_BUFFER_OVERFLOW_ERROR; + } +} + U_CFUNC void ucnv_toUWriteCodePoint(UConverter *cnv, UChar32 c, diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h index 5b948f4fbb..3de6b49bce 100644 --- a/icu4c/source/common/ucnv_cnv.h +++ b/icu4c/source/common/ucnv_cnv.h @@ -251,6 +251,13 @@ 
ucnv_fromUWriteBytes(UConverter *cnv, int32_t **offsets, int32_t sourceIndex, UErrorCode *pErrorCode); +U_CFUNC void +ucnv_toUWriteUChars(UConverter *cnv, + const UChar *uchars, int32_t length, + UChar **target, const UChar *targetLimit, + int32_t **offsets, + int32_t sourceIndex, + UErrorCode *pErrorCode); U_CFUNC void ucnv_toUWriteCodePoint(UConverter *cnv, diff --git a/icu4c/source/common/ucnv_ext.c b/icu4c/source/common/ucnv_ext.c new file mode 100644 index 0000000000..e706e20945 --- /dev/null +++ b/icu4c/source/common/ucnv_ext.c @@ -0,0 +1,921 @@ +/* +****************************************************************************** +* +* Copyright (C) 2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: ucnv_ext.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun13 +* created by: Markus W. Scherer +* +* Conversion extensions +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_LEGACY_CONVERSION + +#include "ucnv_bld.h" +#include "ucnv_cnv.h" +#include "ucnv_ext.h" +#include "cmemory.h" + +/* + * ### TODO + * + * implement getUnicodeSet for the extension table + * implement data swapping for it + */ + +/* + * ### TODO: probably need pointer to baseTableSharedData + * and also copy the base table's pointers for the base table arrays etc. 
+ * into this sharedData + */ + +/* to Unicode --------------------------------------------------------------- */ + +/* + * @return lookup value for the byte, if found; else 0 + */ +static U_INLINE uint32_t +ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) { + uint32_t word; + int32_t i, start, limit; + + /* check the input byte against the lowest and highest section bytes */ + start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]); + limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]); + if(byte=toUSection[start]) { + break; + } + if(++start=toUSection[start]) { + break; + } + if(++start=toUSection[start]) { + break; + } + /* always break at start==limit-1 */ + ++start; + break; + } + + i=(start+limit)/2; + if(wordUCNV_EXT_MAX_BYTES) { + /* + * end of the entire input stream, stop with the longest match so far + * or: partial match must not be longer than UCNV_EXT_MAX_BYTES + * because it must fit into state buffers + */ + break; + } else { + /* continue with more input next time */ + return -length; + } + } + + /* search for the current UChar */ + value=ucnv_extFindToU(toUSection, length, b); + if(value==0) { + /* no match here, stop with the longest match so far */ + break; + } else { + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { + /* partial match, continue */ + index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value); + } else { + if( UCNV_EXT_TO_U_IS_ROUNDTRIP(value) || + TO_U_USE_FALLBACK(useFallback) + ) { + /* full match, stop with result */ + matchValue=value; + matchLength=i+j; + } else { + /* full match on fallback not taken, stop with the longest match so far */ + } + break; + } + } + } + + if(matchLength==0) { + /* no match at all */ + return 0; + } + + /* return result */ + matchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue); + if(UCNV_EXT_TO_U_IS_CODE_POINT(matchValue)) { + *pResultLength=-(int32_t)matchValue; + } else { + *pResultLength=UCNV_EXT_TO_U_GET_LENGTH(matchValue); + *pResult=UCNV_EXT_ARRAY(cx, 
UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+UCNV_EXT_TO_U_GET_INDEX(matchValue); + } + + return matchLength; +} + +static U_INLINE void +ucnv_extWriteToU(UConverter *cnv, + const UChar *result, int32_t resultLength, + UChar **target, const UChar *targetLimit, + int32_t **offsets, int32_t srcIndex, + UErrorCode *pErrorCode) { + /* output the result */ + if(resultLength<0) { + /* output a single code point */ + ucnv_toUWriteCodePoint( + cnv, UCNV_EXT_TO_U_GET_CODE_POINT(-resultLength), + target, targetLimit, + offsets, srcIndex, + pErrorCode); + } else { + /* output a string - with correct data we have resultLength>0 */ + ucnv_toUWriteUChars( + cnv, + result, resultLength, + target, targetLimit, + offsets, srcIndex, + pErrorCode); + } +} + +/* + * targettoUBytes, firstLength, + *src, (int32_t)(srcLimit-*src), + &result, &resultLength, + cnv->useFallback, flush); + if(match>0) { + /* advance src pointer for the consumed input */ + *src+=match-firstLength; + + /* write result to target */ + ucnv_extWriteToU(cnv, + result, resultLength, + target, targetLimit, + offsets, srcIndex, + pErrorCode); + return TRUE; + } else if(match<0) { + /* save state for partial match */ + const char *s; + int32_t j; + + /* copy the first code point */ + s=(const char *)cnv->toUBytes; + cnv->preToUFirstLength=(int8_t)firstLength; + for(j=0; jpreToU[j]=*s++; + } + + /* now copy the newly consumed input */ + s=*src; + match=-match; + for(; jpreToU[j]=*s++; + } + *src=s; /* same as *src=srcLimit; because we reached the end of input */ + cnv->preToULength=(int8_t)match; + return TRUE; + } else /* match==0 no match */ { + return FALSE; + } +} + +#if 0 +/* ### TODO */ + +U_CFUNC int32_t +ucnv_extSimpleMatchToU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, + UBool useFallback, + UErrorCode *pErrorCode) { + const uint8_t *result; + int32_t resultLength, match; + + /* try to match */ + match=ucnv_extMatchToU(cx, + cp, + NULL, 0, + NULL, 0, + &result, &resultLength, + useFallback, TRUE); + if(match>=2) 
{ + /* write result for simple, single-character conversion */ + if(resultLength<0) { + resultLength=-resultLength; + *pValue=(uint32_t)UCNV_EXT_TO_U_GET_DATA(resultLength); + return UCNV_EXT_TO_U_GET_LENGTH(resultLength); + } else if(resultLength==4) { + /* de-serialize a 4-byte result */ + *pValue= + ((uint32_t)result[0]<<24)| + ((uint32_t)result[1]<<16)| + ((uint32_t)result[2]<<8)| + result[3]; + return 4; + } + } + + /* + * return no match because + * - match>1 && resultLength>4: result too long for simple conversion + * - match==1: no match found, preferred + * - match==0: no match found in the first place + * - match<0: partial match, not supported for simple conversion (and flush==TRUE) + */ + return 0; +} + +#endif + +/* + * continue partial match with new input + * never called for simple, single-character conversion + */ +U_CFUNC void +ucnv_extContinueMatchToU(UConverter *cnv, + UConverterToUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode) { + const UChar *result; + int32_t resultLength, match, length; + + match=ucnv_extMatchToU(cnv->sharedData->table->mbcs.extIndexes, + cnv->preToU, cnv->preToULength, + pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), + &result, &resultLength, + cnv->useFallback, pArgs->flush); + if(match>0) { + if(match>=cnv->preToULength) { + /* advance src pointer for the consumed input */ + pArgs->source+=match-cnv->preToULength; + cnv->preToULength=0; + } else { + /* the match did not use all of preToU[] - keep the rest for replay */ + int32_t length=cnv->preToULength-match; + uprv_memmove(cnv->preToU, cnv->preToU+match, length); + cnv->preToULength=(int8_t)-length; + } + + /* write result */ + ucnv_extWriteToU(cnv, + result, resultLength, + &pArgs->target, pArgs->targetLimit, + &pArgs->offsets, srcIndex, + pErrorCode); + } else if(match<0) { + /* save state for partial match */ + const char *s; + int32_t j; + + /* just _append_ the newly consumed input to preToU[] */ + s=pArgs->source; + match=-match; + 
for(j=cnv->preToULength; jpreToU[j]=*s++; + } + pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ + cnv->preToULength=(int8_t)match; + } else /* match==0 */ { + /* + * no match + * + * We need to split the previous input into two parts: + * + * 1. The first codepage character is unmappable - that's how we got into + * trying the extension data in the first place. + * We need to move it from the preToU buffer + * to the error buffer, set an error code, + * and prepare the rest of the previous input for 2. + * + * 2. The rest of the previous input must be converted once we + * come back from the callback for the first character. + * At that time, we have to try again from scratch to convert + * these input characters. + * The replay will be handled by the ucnv.c conversion code. + */ + + /* move the first codepage character to the error field */ + uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength); + cnv->toULength=cnv->preToUFirstLength; + + /* move the rest up inside the buffer */ + length=cnv->preToULength-cnv->preToUFirstLength; + if(length>0) { + uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length); + } + + /* mark preToU for replay */ + cnv->preToULength=(int8_t)-length; + + /* set the error code for unassigned */ + *pErrorCode=U_INVALID_CHAR_FOUND; + } +} + +/* from Unicode ------------------------------------------------------------- */ + +/* + * @return index of the UChar, if found; else <0 + */ +static U_INLINE int32_t +ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) { + int32_t i, start, limit; + + /* binary search */ + start=0; + limit=length; + for(;;) { + i=limit-start; + if(i<=1) { + break; /* done */ + } + /* start=fromUSection[start]) { + break; + } + if(++start=fromUSection[start]) { + break; + } + if(++start=fromUSection[start]) { + break; + } + /* always break at start==limit-1 */ + ++start; + break; + } + + i=(start+limit)/2; + if(u=0 + * @param src UChars that 
can be used to complete a match + * @param srcLength length of src, >=0 + * @param pResult [out] address of pointer to result bytes + * set only in case of a match + * @param pResultLength [out] address of result length variable; + * gets a negative value if the length variable + * itself contains the length and bytes, encoded in + * the format of fromUTableValues[] and then inverted + * @param useFallback "use fallback" flag, usually from cnv->useFallback + * @param flush TRUE if the end of the input stream is reached + * @return >1: matched, return value=total match length (number of input units matched) + * 1: matched, no mapping but request for + * (only for the first code point) + * 0: no match + * <0: partial match, return value=negative total match length + * (partial matches are never returned for flush==TRUE) + * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS) + * the matchLength is 2 if only firstCP matched, and >2 if firstCP and + * further code units matched + */ +static int32_t +ucnv_extMatchFromU(const int32_t *cx, + UChar32 firstCP, + const UChar *pre, int32_t preLength, + const UChar *src, int32_t srcLength, + const uint8_t **pResult, int32_t *pResultLength, + UBool useFallback, UBool flush) { + const uint16_t *stage12, *stage3; + const uint32_t *stage3b; + + const UChar *fromUTableUChars, *fromUSectionUChars; + const uint32_t *fromUTableValues, *fromUSectionValues; + + uint32_t value, matchValue; + int32_t i, j, index, length, matchLength; + UChar c; + + if(cx==NULL) { + return 0; /* no extension data, no match */ + } + + /* trie lookup of firstCP */ + index=firstCP>>10; /* stage 1 index */ + if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) { + return 0; /* the first code point is outside the trie */ + } + + stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t); + stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t); + index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP); + + 
stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t); + value=stage3b[index]; + if(value==0) { + return 0; + } + + if(UCNV_EXT_TO_U_IS_PARTIAL(value)) { + /* partial match, enter the loop below */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); + + /* initialize */ + fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar); + fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t); + + matchValue=0; + i=j=matchLength=0; + + /* we must not remember fallback matches when not using fallbacks */ + + /* match input units until there is a full match or the input is consumed */ + for(;;) { + /* go to the next section */ + fromUSectionUChars=fromUTableUChars+index; + fromUSectionValues=fromUTableValues+index; + + /* read first pair of the section */ + length=*fromUSectionUChars++; + value=*fromUSectionValues++; + if( value!=0 && + (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || + FROM_U_USE_FALLBACK(useFallback, firstCP)) + ) { + /* remember longest match so far */ + matchValue=value; + matchLength=2+i+j; + } + + /* match pre[] then src[] */ + if(iUCNV_EXT_MAX_UCHARS) { + /* + * end of the entire input stream, stop with the longest match so far + * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS + * because it must fit into state buffers + */ + break; + } else { + /* continue with more input next time */ + return -(2+length); + } + } + + /* search for the current UChar */ + index=ucnv_extFindFromU(fromUSectionUChars, length, c); + if(index<0) { + /* no match here, stop with the longest match so far */ + break; + } else { + value=fromUSectionValues[index]; + if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) { + /* partial match, continue */ + index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value); + } else { + if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || + FROM_U_USE_FALLBACK(useFallback, firstCP) + ) { + /* full match, stop with result */ + matchValue=value; + matchLength=2+i+j; + } else { + /* full match on fallback 
not taken, stop with the longest match so far */ + } + break; + } + } + } + + if(matchLength==0) { + /* no match at all */ + return 0; + } + } else /* result from firstCP trie lookup */ { + if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || + FROM_U_USE_FALLBACK(useFallback, firstCP) + ) { + /* full match, stop with result */ + matchValue=value; + matchLength=2; + } else { + /* fallback not taken */ + return 0; + } + } + + if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) { + /* do not interpret values with reserved bits used, for forward compatibility */ + return 0; + } + + /* return result */ + if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) { + return 1; + } + + matchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue); + length=(int32_t)UCNV_EXT_FROM_U_GET_LENGTH(matchValue); + if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) { + *pResultLength=-(int32_t)matchValue; + } else { + *pResultLength=length; + *pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+UCNV_EXT_FROM_U_GET_DATA(matchValue); + } + + return matchLength; +} + +static U_INLINE void +ucnv_extWriteFromU(UConverter *cnv, + const uint8_t *result, int32_t resultLength, + char **target, const char *targetLimit, + int32_t **offsets, int32_t srcIndex, + UErrorCode *pErrorCode) { + uint8_t buffer[4]; + + /* output the result */ + if(resultLength<0) { + /* + * Generate a byte array and then write it below. + * This is not the fastest possible way, but it should be ok for + * extension mappings, and it is much simpler. + * Offset and overflow handling are only done once this way. 
+ */ + uint8_t *p; + uint32_t value; + + resultLength=-resultLength; + value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(resultLength); + resultLength=UCNV_EXT_FROM_U_GET_LENGTH(resultLength); + /* resultLength<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH==3 */ + + p=buffer; + switch(resultLength) { + case 3: + *p++=(uint8_t)(value>>16); + case 2: + *p++=(uint8_t)(value>>8); + case 1: + *p++=(uint8_t)value; + default: + break; /* will never occur */ + } + result=buffer; + } + + /* with correct data we have resultLength>0 */ + ucnv_fromUWriteBytes(cnv, (const char *)result, resultLength, + target, targetLimit, + offsets, srcIndex, + pErrorCode); +} + +/* + * targetuseFallback, flush); + if(match>=2) { + /* advance src pointer for the consumed input */ + *src+=match-2; /* remove 2 for the initial code point */ + + /* write result to target */ + ucnv_extWriteFromU(cnv, + result, resultLength, + target, targetLimit, + offsets, srcIndex, + pErrorCode); + return TRUE; + } else if(match<0) { + /* save state for partial match */ + const UChar *s; + int32_t j; + + /* copy the first code point */ + cnv->preFromUFirstCP=cp; + + /* now copy the newly consumed input */ + s=*src; + match=-match-2; /* remove 2 for the initial code point */ + for(j=0; jpreFromU[j]=*s++; + } + *src=s; /* same as *src=srcLimit; because we reached the end of input */ + cnv->preFromULength=(int8_t)match; + return TRUE; + } else if(match==1) { + /* matched, no mapping but request for */ + cnv->useSubChar1=TRUE; + return FALSE; + } else /* match==0 no match */ { + return FALSE; + } +} + +U_CFUNC int32_t +ucnv_extSimpleMatchFromU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, + UBool useFallback, + UErrorCode *pErrorCode) { + const uint8_t *result; + int32_t resultLength, match; + + /* try to match */ + match=ucnv_extMatchFromU(cx, + cp, + NULL, 0, + NULL, 0, + &result, &resultLength, + useFallback, TRUE); + if(match>=2) { + /* write result for simple, single-character conversion */ + if(resultLength<0) { + 
resultLength=-resultLength; + *pValue=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(resultLength); + return UCNV_EXT_FROM_U_GET_LENGTH(resultLength); + } else if(resultLength==4) { + /* de-serialize a 4-byte result */ + *pValue= + ((uint32_t)result[0]<<24)| + ((uint32_t)result[1]<<16)| + ((uint32_t)result[2]<<8)| + result[3]; + return 4; + } + } + + /* + * return no match because + * - match>1 && resultLength>4: result too long for simple conversion + * - match==1: no match found, preferred + * - match==0: no match found in the first place + * - match<0: partial match, not supported for simple conversion (and flush==TRUE) + */ + return 0; +} + +/* + * continue partial match with new input, requires cnv->preFromUFirstCP>=0 + * never called for simple, single-character conversion + */ +U_CFUNC void +ucnv_extContinueMatchFromU(UConverter *cnv, + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode) { + const uint8_t *result; + int32_t resultLength, match; + + match=ucnv_extMatchFromU(cnv->sharedData->table->mbcs.extIndexes, + cnv->preFromUFirstCP, + cnv->preFromU, cnv->preFromULength, + pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source), + &result, &resultLength, + cnv->useFallback, pArgs->flush); + if(match>=2) { + match-=2; /* remove 2 for the initial code point */ + + if(match>=cnv->preFromULength) { + /* advance src pointer for the consumed input */ + pArgs->source+=match-cnv->preFromULength; + cnv->preFromULength=0; + } else { + /* the match did not use all of preFromU[] - keep the rest for replay */ + int32_t length=cnv->preFromULength-match; + uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR); + cnv->preFromULength=(int8_t)-length; + } + + /* finish the partial match */ + cnv->preFromUFirstCP=U_SENTINEL; + + /* write result */ + ucnv_extWriteFromU(cnv, + result, resultLength, + &pArgs->target, pArgs->targetLimit, + &pArgs->offsets, srcIndex, + pErrorCode); + } else if(match<0) { + /* save state for partial match */ + 
const UChar *s; + int32_t j; + + /* just _append_ the newly consumed input to preFromU[] */ + s=pArgs->source; + match=-match-2; /* remove 2 for the initial code point */ + for(j=cnv->preFromULength; jpreFromU[j]=*s++; + } + pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */ + cnv->preFromULength=(int8_t)match; + } else /* match==0 or 1 */ { + /* + * no match + * + * We need to split the previous input into two parts: + * + * 1. The first code point is unmappable - that's how we got into + * trying the extension data in the first place. + * We need to move it from the preFromU buffer + * to the error buffer, set an error code, + * and prepare the rest of the previous input for 2. + * + * 2. The rest of the previous input must be converted once we + * come back from the callback for the first code point. + * At that time, we have to try again from scratch to convert + * these input characters. + * The replay will be handled by the ucnv.c conversion code. + */ + + if(match==1) { + /* matched, no mapping but request for */ + cnv->useSubChar1=TRUE; + } + + /* move the first code point to the error field */ + cnv->fromUChar32=cnv->preFromUFirstCP; + cnv->preFromUFirstCP=U_SENTINEL; + + /* mark preFromU for replay */ + cnv->preFromULength=-cnv->preFromULength; + + /* set the error code for unassigned */ + *pErrorCode=U_INVALID_CHAR_FOUND; + } +} + +/* + * ### TODO + * + * - test toU() functions + * + * - EBCDIC_STATEFUL: support extensions, but the charset string must be + * either one single-byte character or a sequence of double-byte ones, + * to avoid state transitions inside the mapping and to avoid having to + * store character boundaries. + * The extension functions will need an additional EBCDIC state in/out + * parameter and will have to be able to insert an SI or SO before writing + * the mapping result. 
+ * - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB + * - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate + * - make dbcsonly work with extensions + * + * - test |2 to for regular code point, prefix code point, + * multiple code points + * - test fallback from non-zero to 00 + * - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone() + */ + +#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h new file mode 100644 index 0000000000..29a683263f --- /dev/null +++ b/icu4c/source/common/ucnv_ext.h @@ -0,0 +1,417 @@ +/* +****************************************************************************** +* +* Copyright (C) 2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: ucnv_ext.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun13 +* created by: Markus W. Scherer +* +* Conversion extensions +*/ + +#ifndef __UCNV_EXT_H__ +#define __UCNV_EXT_H__ + +#include "unicode/utypes.h" +#include "unicode/ucnv.h" + +/* + * See icuhtml/design/conversion/conversion_extensions.html + * + * Conversion extensions serve two purposes: + * 1. They support m:n mappings. + * 2. They support extension-only conversion files that are used together + * with the regular conversion data in base files. + * + * A base file may contain an extension table (explicitly requested or + * implicitly generated for m:n mappings), but its extension table is not + * used when an extension-only file is used. + * + * It is an error if a base file contains any regular (not extension) mapping + * from the same sequence as a mapping in the extension file + * because the base mapping would hide the extension mapping. 
+ * + * + * Data for conversion extensions: + * + * One set of data structures per conversion direction (to/from Unicode). + * The data structures are sorted by input units to allow for binary search. + * Input sequences of more than one unit are handled like contraction tables + * in collation: + * The lookup value of a unit points to another table that is to be searched + * for the next unit, recursively. + * + * For conversion from Unicode, the initial code point is looked up in + * a 3-stage trie for speed, + * with an additional table of unique results to save space. + * + * Long output strings are stored in separate arrays, with length and index + * in the lookup tables. + * Output results also include a flag distinguishing roundtrip from + * (reverse) fallback mappings. + * + * Input Unicode strings must not begin or end with unpaired surrogates + * to avoid problems with matches on parts of surrogate pairs. + * + * Mappings from multiple characters (code points or codepage state + * table sequences) must be searched preferring the longest match. + * For this to work and be efficient, the variable-width table must contain + * all mappings that contain prefixes of the multiple characters. + * If an extension table is built on top of a base table in another file + * and a base table entry is a prefix of a multi-character mapping, then + * this is an error. + * + * + * Implementation note: + * + * Currently, the parser and several checks in the code limit the number + * of UChars or bytes in a mapping to + * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively, + * which are output value limits in the data structure. + * + * For input, this is not strictly necessary - it is a hard limit only for the + * buffers in UConverter that are used to store partial matches. 
+ * + * Input sequences could otherwise be arbitrarily long if partial matches + * need not be stored (i.e., if a sequence does not span several buffers with too + * many units before the last buffer), although then results would differ + * depending on whether partial matches exceed the limits or not, + * which depends on the pattern of buffer sizes. + * + * + * Data structure: + * + * int32_t indexes[>=32]; + * + * Array of indexes and lengths etc. The length of the array is at least 32. + * The actual length is stored in indexes[0] to be forward compatible. + * + * Each index to another array is the number of bytes from indexes[]. + * Each length of an array is the number of array base units in that array. + * + * Some of the structures may not be present, in which case their indexes + * and lengths are 0. + * + * Usage of indexes[i]: + * [0] length of indexes[] + * + * // to Unicode table + * [1] index of toUTable[] (array of uint32_t) + * [2] length of toUTable[] + * [3] index of toUUChars[] (array of UChar) + * [4] length of toUUChars[] + * + * // from Unicode table, not for the initial code point + * [5] index of fromUTableUChars[] (array of UChar) + * [6] index of fromUTableValues[] (array of uint32_t) + * [7] length of fromUTableUChars[] and fromUTableValues[] + * [8] index of fromUBytes[] (array of char) + * [9] length of fromUBytes[] + * + * // from Unicode trie for initial-code point lookup + * [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2) + * [11] length of stage 1 portion of fromUStage12[] + * [12] length of fromUStage12[] + * [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[]) + * [14] length of fromUStage3[] + * [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[]) + * [16] length of fromUStage3b[] + * + * [17]..[30] reserved + * [31] number of bytes for the entire extension structure + * [>31] reserved; there are indexes[0] indexes + * + * + * uint32_t toUTable[]; + * + * Array 
of byte/value pairs for lookups for toUnicode conversion. + * The array is partitioned into sections like collation contraction tables. + * Each section contains one word with the number of following words and + * a default value for when the lookup in this section yields no match. + * + * A section is sorted in ascending order of input bytes, + * allowing for fast linear or binary searches. + * The builder may store entries for a contiguous range of byte values + * (compare difference between the first and last one with count), + * which then allows for direct array access. + * The builder should always do this for the initial table section. + * + * Entries may have 0 values, see below. + * No two entries in a section have the same byte values. + * + * Each uint32_t contains an input byte value in bits 31..24 and the + * corresponding lookup value in bits 23..0. + * Interpret the value as follows: + * if(value==0) { + * no match, see below + * } else if(value<0x1f0000) { + * partial match - use value as index to the next toUTable section + * and match the next unit; (value indexes toUTable[value]) + * } else { + * if(bit 23 set) { + * roundtrip; + * } else { + * fallback; + * } + * unset value bit 23; + * if(value<=0x2fffff) { + * (value-0x1f0000) is a code point; (BMP: value<=0x1fffff) + * } else { + * bits 17..0 (value&0x3ffff) is an index to + * the result UChars in toUUChars[]; (0 indexes toUUChars[0]) + * length of the result=((value>>18)-12); (length=0..19) + * } + * } + * + * The first word in a section contains the number of following words in the + * input byte position (bits 31..24, number=1..0xff). + * The value of the initial word is used when the current byte is not found + * in this section. + * If the value is not 0, then it represents a result as above. + * If the value is 0, then the search has to return a shorter match with an + * earlier default value as the result, or result in "unmappable" even for the + * initial bytes. 
+ * If the value is 0 for the initial toUTable entry, then the initial byte + * does not start any mapping input. + * + * + * UChar toUUChars[]; + * + * Contains toUnicode mapping results, stored as sequences of UChars. + * Indexes and lengths stored in the toUTable[]. + * + * + * UChar fromUTableUChars[]; + * uint32_t fromUTableValues[]; + * + * The fromUTable is split into two arrays, but works otherwise much like + * the toUTable. The array is partitioned into sections like collation + * contraction tables and toUTable. + * A row in the table consists of same-index entries in fromUTableUChars[] + * and fromUTableValues[]. + * + * Interpret a value as follows: + * if(value==0) { + * no match, see below + * } else if(value<=0xffffff) { (bits 31..24 are 0) + * partial match - use value as index to the next fromUTable section + * and match the next unit; (value indexes fromUTable[value]) + * } else { + * if(value==0x80000001) { + * return no mapping, but request for <subchar1>; + * } + * if(bit 31 set) { + * roundtrip; + * } else { + * fallback; + * } + * // bits 30..29 reserved, 0 + * length=(value>>24)&0x1f; (bits 28..24) + * if(length==1..3) { + * bits 23..0 contain 1..3 bytes, padded with 00s on the left; + * } else { + * bits 23..0 (value&0xffffff) is an index to + * the result bytes in fromUBytes[]; (0 indexes fromUBytes[0]) + * } + * } + * + * The first pair in a section contains the number of following pairs in the + * UChar position (16 bits, number=1..0xffff). + * The value of the initial pair is used when the current UChar is not found + * in this section. + * If the value is not 0, then it represents a result as above. + * If the value is 0, then the search has to return a shorter match with an + * earlier default value as the result, or result in "unmappable" even for the + * initial UChars. + * + * If the from Unicode trie is present, then the from Unicode search tables + * are not used for initial code points.
+ * In this case, the first entries (index 0) in the tables are not used + * (reserved, set to 0) because a value of 0 is used in trie results + * to indicate no mapping. + * + * + * uint16_t fromUStage12[]; + * + * Stages 1 & 2 of a trie that maps an initial code point. + * Indexes in stage 1 are all offset by the length of stage 1 so that the + * same array pointer can be used for both stages. + * If (c>>10)>=(length of stage 1) then c does not start any mapping. + * Same bit distribution as for regular conversion tries. + * + * + * uint16_t fromUStage3[]; + * uint32_t fromUStage3b[]; + * + * Stage 3 of the trie. The first array simply contains indexes to the second, + * which contains words in the same format as fromUTableValues[]. + * Use a stage 3 granularity of 4, which allows for 256k stage 3 entries, + * and 16-bit entries in stage 3 allow for 64k stage 3b entries. + * The stage 3 granularity means that the stage 2 entry needs to be left-shifted. + * + * Two arrays are used because it is expected that more than half of the stage 3 + * entries will be zero. The 16-bit index stage 3 array saves space even + * considering storing a total of 6 bytes per non-zero entry in both arrays + * together. + * Using a stage 3 granularity of >1 diminishes the compactability in that stage + * but provides a larger effective addressing space in stage 2. + * All but the final result stage use 16-bit entries to save space. + * + * fromUStage3b[] contains a zero for "no mapping" at its index 0, + * and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping" + * (i.e., "no mapping" with preference for <subchar1> rather than <subchar>), + * and all other items are unique non-zero results. + * + * + * char fromUBytes[]; + * + * Contains fromUnicode mapping results, stored as sequences of chars. + * Indexes and lengths stored in the fromUTableValues[].
+ */ +enum { + UCNV_EXT_INDEXES_LENGTH, /* 0 */ + + UCNV_EXT_TO_U_INDEX, /* 1 */ + UCNV_EXT_TO_U_LENGTH, + UCNV_EXT_TO_U_UCHARS_INDEX, + UCNV_EXT_TO_U_UCHARS_LENGTH, + + UCNV_EXT_FROM_U_UCHARS_INDEX, /* 5 */ + UCNV_EXT_FROM_U_VALUES_INDEX, + UCNV_EXT_FROM_U_LENGTH, + UCNV_EXT_FROM_U_BYTES_INDEX, + UCNV_EXT_FROM_U_BYTES_LENGTH, + + UCNV_EXT_FROM_U_STAGE_12_INDEX, /* 10 */ + UCNV_EXT_FROM_U_STAGE_1_LENGTH, + UCNV_EXT_FROM_U_STAGE_12_LENGTH, + UCNV_EXT_FROM_U_STAGE_3_INDEX, + UCNV_EXT_FROM_U_STAGE_3_LENGTH, + UCNV_EXT_FROM_U_STAGE_3B_INDEX, + UCNV_EXT_FROM_U_STAGE_3B_LENGTH, + + UCNV_EXT_RESERVED_INDEX, /* 17, moves with additional indexes */ + + UCNV_EXT_SIZE=31, + UCNV_EXT_INDEXES_MIN_LENGTH=32 +}; + +/* get the pointer to an extension array from indexes[index] */ +#define UCNV_EXT_ARRAY(indexes, index, itemType) \ + ((const itemType *)((const char *)(indexes)+(indexes)[index])) + +/* internal API ------------------------------------------------------------- */ + +U_CFUNC UBool +ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx, + int32_t firstLength, + const char **src, const char *srcLimit, + UChar **target, const UChar *targetLimit, + int32_t **offsets, int32_t srcIndex, + UBool flush, + UErrorCode *pErrorCode); + +U_CFUNC void +ucnv_extContinueMatchToU(UConverter *cnv, + UConverterToUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode); + + +U_CFUNC UBool +ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx, + UChar32 cp, + const UChar **src, const UChar *srcLimit, + char **target, const char *targetLimit, + int32_t **offsets, int32_t srcIndex, + UBool flush, + UErrorCode *pErrorCode); + +U_CFUNC int32_t +ucnv_extSimpleMatchFromU(const int32_t *cx, + UChar32 cp, uint32_t *pValue, + UBool useFallback, + UErrorCode *pErrorCode); + +U_CFUNC void +ucnv_extContinueMatchFromU(UConverter *cnv, + UConverterFromUnicodeArgs *pArgs, int32_t srcIndex, + UErrorCode *pErrorCode); + +/* toUnicode helpers 
-------------------------------------------------------- */ + +#define UCNV_EXT_TO_U_BYTE_SHIFT 24 +#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff +#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000 +#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff +#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23) +#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff +#define UCNV_EXT_TO_U_LENGTH_SHIFT 18 +#define UCNV_EXT_TO_U_LENGTH_OFFSET 12 + +/* maximum number of indexed UChars */ +#define UCNV_EXT_MAX_UCHARS 19 + +#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<>UCNV_EXT_TO_U_BYTE_SHIFT) +#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK) + +#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET) + +/* fromUnicode helpers ------------------------------------------------------ */ + +/* most trie constants are shared with ucnvmbcs.h */ + +/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */ +#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2 +#define UCNV_EXT_STAGE_3_GRANULARITY 4 + +/* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ +#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \ + (stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]< (impossible roundtrip to 0 bytes, value 01) */ +#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001 + +/* at most 3 bytes in the lower part of the value */ +#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3 + +/* maximum number of indexed bytes */ +#define UCNV_EXT_MAX_BYTES 0x1f + +#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0) +#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value) + +#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0) +#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG) + +/* use after masking off the roundtrip flag */ +#define UCNV_EXT_FROM_U_GET_LENGTH(value) 
(((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES) + +/* get bytes or bytes index */ +#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK) + +#endif diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c index 0e2cee3f3f..c71bbe99fd 100644 --- a/icu4c/source/common/ucnvmbcs.c +++ b/icu4c/source/common/ucnvmbcs.c @@ -46,6 +46,7 @@ #include "unicode/uset.h" #include "ucnv_bld.h" #include "ucnvmbcs.h" +#include "ucnv_ext.h" #include "ucnv_cnv.h" #include "umutex.h" #include "cmemory.h" @@ -56,9 +57,18 @@ #define MBCS_UNROLL_SINGLE_FROM_BMP 0 /* - * _MBCSHeader versions 4.1 + * _MBCSHeader versions 4.2 * (Note that the _MBCSHeader version is in addition to the converter formatVersion.) * + * Change from version 4.1: + * - Added an optional extension table structure at the end of the .cnv file. + * It is present if the upper bits of the header flags field contains a non-zero + * byte offset to it. + * Files that contain only a conversion table and no base table + * use the special outputType MBCS_OUTPUT_EXT_ONLY. + * These contain the base table name between the MBCS header and the extension + * data. + * * Change from version 4.0: * - Replace header.reserved with header.fromUBytesLength so that all * fields in the data have length. @@ -524,11 +534,6 @@ _MBCSGetUnicodeSet(const UConverter *cnv, * code. The framework will then call the callback function. 
*/ -/* - * TODO when implementing real extensions, review whether the useFallback parameter - * should get cnv->useFallback or the full resolution considering cp as well - */ - /* * @return if(U_FAILURE) return the code point for cnv->fromUChar32 * else return 0 after output has been written to the target @@ -539,10 +544,26 @@ _extFromU(UConverter *cnv, const UConverterSharedData *sharedData, const UChar **source, const UChar *sourceLimit, char **target, const char *targetLimit, int32_t **offsets, int32_t sourceIndex, - UBool useFallback, UBool flush, + UBool flush, UErrorCode *pErrorCode) { + const int32_t *cx; + + cnv->useSubChar1=FALSE; + + if( (cx=sharedData->table->mbcs.extIndexes)!=NULL && + ucnv_extInitialMatchFromU( + cnv, cx, + cp, source, sourceLimit, + target, targetLimit, + offsets, sourceIndex, + flush, + pErrorCode) + ) { + return 0; /* an extension mapping handled the input */ + } + /* GB 18030 */ - if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) { + if((cnv->options&_MBCS_OPTION_GB18030)!=0) { const uint32_t *range; int32_t i; @@ -590,10 +611,24 @@ _extToU(UConverter *cnv, const UConverterSharedData *sharedData, const char **source, const char *sourceLimit, UChar **target, const UChar *targetLimit, int32_t **offsets, int32_t sourceIndex, - UBool useFallback, UBool flush, + UBool flush, UErrorCode *pErrorCode) { + const int32_t *cx; + + if( (cx=sharedData->table->mbcs.extIndexes)!=NULL && + ucnv_extInitialMatchToU( + cnv, cx, + length, source, sourceLimit, + target, targetLimit, + offsets, sourceIndex, + flush, + pErrorCode) + ) { + return 0; /* an extension mapping handled the input */ + } + /* GB 18030 */ - if(length==4 && cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) { + if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) { const uint32_t *range; uint32_t linear; int32_t i; @@ -789,6 +824,7 @@ _MBCSLoad(UConverterSharedData *sharedData, UDataInfo info; UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs; _MBCSHeader 
*header=(_MBCSHeader *)raw; + uint32_t offset; if(header->version[0]!=4) { *pErrorCode=U_INVALID_TABLE_FORMAT; @@ -806,6 +842,12 @@ _MBCSLoad(UConverterSharedData *sharedData, mbcsTable->fromUBytesLength=header->fromUBytesLength; mbcsTable->outputType=(uint8_t)header->flags; + /* extension data, header version 4.2 and higher */ + offset=header->flags>>8; + if(offset!=0) { + mbcsTable->extIndexes=(const int32_t *)(raw+offset); + } + /* make sure that the output type is known */ switch(mbcsTable->outputType) { case MBCS_OUTPUT_1: @@ -817,6 +859,8 @@ _MBCSLoad(UConverterSharedData *sharedData, case MBCS_OUTPUT_2_SISO: /* OK */ break; + case MBCS_OUTPUT_EXT_ONLY: + /* ### TODO */ default: *pErrorCode=U_INVALID_TABLE_FORMAT; return; @@ -1062,7 +1106,7 @@ _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1, (const char **)&source, (const char *)sourceLimit, &target, targetLimit, &offsets, sourceIndex, - (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush, + pArgs->flush, pErrorCode); sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source); @@ -1263,7 +1307,7 @@ unrolled: 1, (const char **)&source, (const char *)sourceLimit, &target, target+targetCapacity, &offsets, sourceIndex, - (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush, + pArgs->flush, pErrorCode); sourceIndex+=1+(int32_t)(source-lastSource); @@ -1299,266 +1343,6 @@ unrolled: pArgs->offsets=offsets; } -/* - * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages. - * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
- */ -static UChar32 -_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const int32_t (*stateTable)[256]; - const uint8_t *source, *sourceLimit; - - int32_t entry; - uint8_t action; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { - stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable; - } else { - stateTable=cnv->sharedData->table->mbcs.stateTable; - } - - /* conversion loop */ - while(sourcesource=(const char *)source; - - if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { - /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); - } - - /* - * An if-else-if chain provides more reliable performance for - * the most common cases compared to a switch. - */ - action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); - if( action==MBCS_STATE_VALID_DIRECT_20 || - (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) - ) { - /* output supplementary code point */ - return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); - } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { - if(UCNV_TO_U_USE_FALLBACK(cnv)) { - /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); - } - } else if(action==MBCS_STATE_UNASSIGNED) { - /* just fall through */ - } else if(action==MBCS_STATE_ILLEGAL) { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } else { - /* reserved, must never occur */ - continue; - } - - if(U_FAILURE(*pErrorCode)) { - /* callback(illegal) */ - break; - } else /* unassigned sequence */ { - /* defer to the generic implementation */ - pArgs->source=(const char *)source-1; - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } - } - - /* no output because of empty input or only state changes */ - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0xffff; -} - -static UChar32 
-_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source, *sourceLimit, *lastSource; - - const int32_t (*stateTable)[256]; - const uint16_t *unicodeCodeUnits; - - uint32_t offset; - uint8_t state; - - int32_t entry; - UChar32 c; - uint8_t action; - - /* use optimized function if possible */ - cnv=pArgs->converter; - if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { - /* - * Using the generic ucnv_getNextUChar() code lets us deal correctly - * with the rare case of a codepage that maps single surrogates - * without adding the complexity to this already complicated function here. - */ - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } else if(cnv->sharedData->table->mbcs.countStates==1) { - return _MBCSSingleGetNextUChar(pArgs, pErrorCode); - } - - /* set up the local pointers */ - source=lastSource=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - - if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { - stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable; - } else { - stateTable=cnv->sharedData->table->mbcs.stateTable; - } - unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits; - - /* get the converter state from UConverter */ - offset=cnv->toUnicodeStatus; - state=(uint8_t)(cnv->mode); - - /* conversion loop */ - c=U_SENTINEL; - while(sourcesharedData->table->mbcs, offset))!=0xfffe) { - break; - } - } else { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else if(action==MBCS_STATE_VALID_16_PAIR) { - offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); - c=unicodeCodeUnits[offset++]; - if(c<0xd800) { - /* output BMP code point below 0xd800 */ - break; - } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) { - /* output roundtrip or fallback supplementary code point */ - c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); - break; - } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
(c&0xfffe)==0xe000 : c==0xe000) { - /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ - c=unicodeCodeUnits[offset]; - break; - } else if(c==0xffff) { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } - } else if(action==MBCS_STATE_VALID_DIRECT_20 || - (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) - ) { - /* output supplementary code point */ - c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); - break; - } else if(action==MBCS_STATE_CHANGE_ONLY) { - /* - * This serves as a state change without any output. - * It is useful for reading simple stateful encodings, - * for example using just Shift-In/Shift-Out codes. - * The 21 unused bits may later be used for more sophisticated - * state transitions. - */ - } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { - if(UCNV_TO_U_USE_FALLBACK(cnv)) { - /* output BMP code point */ - c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); - break; - } - } else if(action==MBCS_STATE_UNASSIGNED) { - /* just fall through */ - } else if(action==MBCS_STATE_ILLEGAL) { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - } else { - /* reserved (must never occur), or only state change */ - offset=0; - lastSource=source; - continue; - } - - /* end of action codes: prepare for a new character */ - offset=0; - - if(U_FAILURE(*pErrorCode)) { - /* callback(illegal) */ - break; - } else /* unassigned sequence */ { - /* defer to the generic implementation */ - cnv->toUnicodeStatus=0; - cnv->mode=state; - pArgs->source=(const char *)lastSource; - return UCNV_GET_NEXT_UCHAR_USE_TO_U; - } - } - } - - if(c<0) { - if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes; - cnv->toULength=(int8_t)(source-lastSource); - do { - *bytes++=*lastSource++; - } while(lastSourcetoUnicodeStatus=0; - cnv->mode=state; - - /* write back the updated pointer */ - pArgs->source=(const char *)source; - return c; -} - U_CFUNC void _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs 
*pArgs, UErrorCode *pErrorCode) { @@ -1584,6 +1368,19 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, /* use optimized function if possible */ cnv=pArgs->converter; + + if(cnv->preToULength>0) { + /* + * pass sourceIndex=-1 because we continue from an earlier buffer + * in the future, this may change with continuous offsets + */ + ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode); + + if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) { + return; + } + } + if(cnv->sharedData->table->mbcs.countStates==1) { if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { _MBCSSingleToBMPWithOffsets(pArgs, pErrorCode); @@ -1890,7 +1687,7 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, byteIndex, (const char **)&source, (const char *)sourceLimit, &target, targetLimit, &offsets, sourceIndex, - (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush, + pArgs->flush, pErrorCode); sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source); @@ -1912,6 +1709,328 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, pArgs->offsets=offsets; } +/* + * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages. + * We still need a conversion loop in case we find reserved action codes, which are to be ignored. 
+ */ +static UChar32 +_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const int32_t (*stateTable)[256]; + const uint8_t *source, *sourceLimit; + + int32_t entry; + uint8_t action; + + /* set up the local pointers */ + cnv=pArgs->converter; + source=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } + + /* conversion loop */ + while(sourcesource=(const char *)source; + + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { + /* output BMP code point */ + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + } + + /* + * An if-else-if chain provides more reliable performance for + * the most common cases compared to a switch. + */ + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); + if( action==MBCS_STATE_VALID_DIRECT_20 || + (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) + ) { + /* output supplementary code point */ + return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(UCNV_TO_U_USE_FALLBACK(cnv)) { + /* output BMP code point */ + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + } + } else if(action==MBCS_STATE_UNASSIGNED) { + /* just fall through */ + } else if(action==MBCS_STATE_ILLEGAL) { + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } else { + /* reserved, must never occur */ + continue; + } + + if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ + break; + } else /* unassigned sequence */ { + /* defer to the generic implementation */ + pArgs->source=(const char *)source-1; + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } + } + + /* no output because of empty input or only state changes */ + *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0xffff; +} + +static UChar32 
+_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs, + UErrorCode *pErrorCode) { + UConverter *cnv; + const uint8_t *source, *sourceLimit, *lastSource; + + const int32_t (*stateTable)[256]; + const uint16_t *unicodeCodeUnits; + + uint32_t offset; + uint8_t state; + + int32_t entry; + UChar32 c; + uint8_t action; + + /* use optimized function if possible */ + cnv=pArgs->converter; + + /* ### TODO extension */ + if(cnv->sharedData->table->mbcs.extIndexes!=NULL) { + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } + /* ### TODO end cheap-trick extension */ + + if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) { + /* + * Using the generic ucnv_getNextUChar() code lets us deal correctly + * with the rare case of a codepage that maps single surrogates + * without adding the complexity to this already complicated function here. + */ + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } else if(cnv->sharedData->table->mbcs.countStates==1) { + return _MBCSSingleGetNextUChar(pArgs, pErrorCode); + } + + /* set up the local pointers */ + source=lastSource=(const uint8_t *)pArgs->source; + sourceLimit=(const uint8_t *)pArgs->sourceLimit; + + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { + stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable; + } else { + stateTable=cnv->sharedData->table->mbcs.stateTable; + } + unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits; + + /* get the converter state from UConverter */ + offset=cnv->toUnicodeStatus; + state=(uint8_t)(cnv->mode); + + /* conversion loop */ + c=U_SENTINEL; + while(sourcesharedData->table->mbcs, offset))!=0xfffe) { + break; + } + } else { + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else if(action==MBCS_STATE_VALID_16_PAIR) { + offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); + c=unicodeCodeUnits[offset++]; + if(c<0xd800) { + /* output BMP code point below 0xd800 */ + break; + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? 
c<=0xdfff : c<=0xdbff) { + /* output roundtrip or fallback supplementary code point */ + c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00); + break; + } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) { + /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ + c=unicodeCodeUnits[offset]; + break; + } else if(c==0xffff) { + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } + } else if(action==MBCS_STATE_VALID_DIRECT_20 || + (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv)) + ) { + /* output supplementary code point */ + c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000); + break; + } else if(action==MBCS_STATE_CHANGE_ONLY) { + /* + * This serves as a state change without any output. + * It is useful for reading simple stateful encodings, + * for example using just Shift-In/Shift-Out codes. + * The 21 unused bits may later be used for more sophisticated + * state transitions. + */ + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(UCNV_TO_U_USE_FALLBACK(cnv)) { + /* output BMP code point */ + c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + break; + } + } else if(action==MBCS_STATE_UNASSIGNED) { + /* just fall through */ + } else if(action==MBCS_STATE_ILLEGAL) { + /* callback(illegal) */ + *pErrorCode=U_ILLEGAL_CHAR_FOUND; + } else { + /* reserved (must never occur), or only state change */ + offset=0; + lastSource=source; + continue; + } + + /* end of action codes: prepare for a new character */ + offset=0; + + if(U_FAILURE(*pErrorCode)) { + /* callback(illegal) */ + break; + } else /* unassigned sequence */ { + /* defer to the generic implementation */ + cnv->toUnicodeStatus=0; + cnv->mode=state; + pArgs->source=(const char *)lastSource; + return UCNV_GET_NEXT_UCHAR_USE_TO_U; + } + } + } + + if(c<0) { + if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes; + cnv->toULength=(int8_t)(source-lastSource); + do { + *bytes++=*lastSource++; + } 
while(lastSourcetoUnicodeStatus=0; + cnv->mode=state; + + /* write back the updated pointer */ + pArgs->source=(const char *)source; + return c; +} + +#if 0 +/* + * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus + * Removal improves code coverage. + */ +/** + * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. + * It does not handle the EBCDIC swaplfnl option (set in UConverter). + * It does not handle conversion extensions (_extToU()). + */ +U_CFUNC UChar32 +_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, + uint8_t b, UBool useFallback) { + int32_t entry; + uint8_t action; + + entry=sharedData->table->mbcs.stateTable[0][b]; + /* MBCS_ENTRY_IS_FINAL(entry) */ + + if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { + /* output BMP code point */ + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + } + + /* + * An if-else-if chain provides more reliable performance for + * the most common cases compared to a switch. + */ + action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); + if(action==MBCS_STATE_VALID_DIRECT_20) { + /* output supplementary code point */ + return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { + if(!TO_U_USE_FALLBACK(useFallback)) { + return 0xfffe; + } + /* output BMP code point */ + return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); + } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { + if(!TO_U_USE_FALLBACK(useFallback)) { + return 0xfffe; + } + /* output supplementary code point */ + return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); + } else if(action==MBCS_STATE_UNASSIGNED) { + return 0xfffe; + } else if(action==MBCS_STATE_ILLEGAL) { + return 0xffff; + } else { + /* reserved, must never occur */ + return 0xffff; + } +} +#endif + /* * This is a simple version of getNextUChar() that is used * by other converter implementations. 
@@ -1945,6 +2064,8 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, return 0xffff; } + /* ### TODO extension */ + #if 0 /* * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus @@ -2054,61 +2175,6 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData, return 0xffff; } -#if 0 -/* - * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus - * Removal improves code coverage. - */ -/** - * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages. - * It does not handle the EBCDIC swaplfnl option (set in UConverter). - * It does not handle conversion extensions (_extToU()). - */ -U_CFUNC UChar32 -_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData, - uint8_t b, UBool useFallback) { - int32_t entry; - uint8_t action; - - entry=sharedData->table->mbcs.stateTable[0][b]; - /* MBCS_ENTRY_IS_FINAL(entry) */ - - if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { - /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); - } - - /* - * An if-else-if chain provides more reliable performance for - * the most common cases compared to a switch. 
- */ - action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry)); - if(action==MBCS_STATE_VALID_DIRECT_20) { - /* output supplementary code point */ - return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); - } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) { - if(!TO_U_USE_FALLBACK(useFallback)) { - return 0xfffe; - } - /* output BMP code point */ - return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry); - } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) { - if(!TO_U_USE_FALLBACK(useFallback)) { - return 0xfffe; - } - /* output supplementary code point */ - return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry); - } else if(action==MBCS_STATE_UNASSIGNED) { - return 0xfffe; - } else if(action==MBCS_STATE_ILLEGAL) { - return 0xffff; - } else { - /* reserved, must never occur */ - return 0xffff; - } -} -#endif - /* MBCS-from-Unicode conversion functions ----------------------------------- */ /* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */ @@ -2251,7 +2317,7 @@ unassigned: c, &source, sourceLimit, (char **)&target, (char *)target+targetCapacity, &offsets, sourceIndex, - (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pArgs->flush, pErrorCode); nextSourceIndex+=(int32_t)(source-pArgs->source); @@ -2454,7 +2520,7 @@ unassigned: c, &source, sourceLimit, (char **)&target, (char *)target+targetCapacity, &offsets, sourceIndex, - (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pArgs->flush, pErrorCode); nextSourceIndex+=(int32_t)(source-pArgs->source); @@ -2681,7 +2747,7 @@ getTrail: c, &source, sourceLimit, (char **)&target, (char *)target+targetCapacity, &offsets, sourceIndex, - (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pArgs->flush, pErrorCode); sourceIndex+=length+(int32_t)(source-lastSource); lastSource=source; @@ -2744,8 +2810,21 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, int32_t length, prevLength; uint8_t unicodeMask; - /* use optimized function if possible */ cnv=pArgs->converter; + + 
if(cnv->preFromUFirstCP>=0) { + /* + * pass sourceIndex=-1 because we continue from an earlier buffer + * in the future, this may change with continuous offsets + */ + ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode); + + if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) { + return; + } + } + + /* use optimized function if possible */ outputType=cnv->sharedData->table->mbcs.outputType; unicodeMask=cnv->sharedData->table->mbcs.unicodeMask; if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) { @@ -2768,6 +2847,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, offsets=pArgs->offsets; table=cnv->sharedData->table->mbcs.fromUnicodeTable; + if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) { bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes; } else { @@ -3025,7 +3105,7 @@ unassigned: c, &source, sourceLimit, (char **)&target, (char *)target+targetCapacity, &offsets, sourceIndex, - (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush, + pArgs->flush, pErrorCode); nextSourceIndex+=(int32_t)(source-pArgs->source); prevLength=cnv->fromUnicodeStatus; /* restore SISO state */ @@ -3222,6 +3302,8 @@ _MBCSFromUChar32(UConverterSharedData *sharedData, uint32_t value; int32_t length; + /* ### TODO extension mapping */ + /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) { return 0; @@ -3404,7 +3486,11 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, int32_t length; /* first, select between subChar and subChar1 */ - if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) { + if( cnv->subChar1!=0 && + (cnv->sharedData->table->mbcs.extIndexes!=NULL ? 
+ cnv->useSubChar1 : + (cnv->invalidUCharBuffer[0]<=0xff)) + ) { /* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */ subchar=(char *)&cnv->subChar1; length=1; @@ -3414,6 +3500,9 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs, length=cnv->subCharLen; } + /* reset the selector for the next code point */ + cnv->useSubChar1=FALSE; + switch(cnv->sharedData->table->mbcs.outputType) { case MBCS_OUTPUT_2_SISO: p=buffer; diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h index 3df87dcc57..5940af28ba 100644 --- a/icu4c/source/common/ucnvmbcs.h +++ b/icu4c/source/common/ucnvmbcs.h @@ -37,7 +37,11 @@ * At the moment, there are only variations of MBCS converters. They all have * the same toUnicode structures, while the fromUnicode structures for SBCS * differ from those for other MBCS-style converters. - * + * + * _MBCSHeader.version 4.2 adds an optional conversion extension data structure. + * If it is present, then an ICU version reading header versions 4.0 or 4.1 + * will be able to use the base table and ignore the extension. + * * MBCS-style data structure following the static data. * Offsets are counted in bytes from the beginning of the MBCS header structure. * Details about usage in comments in ucnvmbcs.c. @@ -45,61 +49,79 @@ * struct _MBCSHeader (see the definition in this header file below) * contains 32-bit fields as follows: * 8 values: - * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.1.0.0) + * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0) * 1 uint32_t countStates * 2 uint32_t countToUFallbacks * 3 uint32_t offsetToUCodeUnits * 4 uint32_t offsetFromUTable * 5 uint32_t offsetFromUBytes * 6 uint32_t flags, bits: - * 31.. 8 reserved + * 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher + * 0 for older versions and if + * there is not extension structure * 7.. 
0 outputType * 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher * counts bytes in fromUBytes[] * - * int32_t stateTable[countStates][256]; + * if(outputType==MBCS_OUTPUT_EXT_ONLY) { + * -- base table name for extension-only table + * char baseTableName[variable]; -- with NUL plus padding for 4-alignment * - * struct _MBCSToUFallback { (fallbacks are sorted by offset) - * uint32_t offset; - * UChar32 codePoint; - * } toUFallbacks[countToUFallbacks]; - * - * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2]; - * (padded to an even number of units) - * - * -- stage 1 tables - * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) { - * -- stage 1 table for all of Unicode - * uint16_t fromUTable[0x440]; (32-bit-aligned) + * -- all _MBCSHeader fields except for version and flags are 0 * } else { - * -- BMP-only tables have a smaller stage 1 table - * uint16_t fromUTable[0x40]; (32-bit-aligned) + * -- normal base table with optional extension + * + * int32_t stateTable[countStates][256]; + * + * struct _MBCSToUFallback { (fallbacks are sorted by offset) + * uint32_t offset; + * UChar32 codePoint; + * } toUFallbacks[countToUFallbacks]; + * + * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2]; + * (padded to an even number of units) + * + * -- stage 1 tables + * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) { + * -- stage 1 table for all of Unicode + * uint16_t fromUTable[0x440]; (32-bit-aligned) + * } else { + * -- BMP-only tables have a smaller stage 1 table + * uint16_t fromUTable[0x40]; (32-bit-aligned) + * } + * + * -- stage 2 tables + * length determined by top of stage 1 and bottom of stage 3 tables + * if(outputType==MBCS_OUTPUT_1) { + * -- SBCS: pure indexes + * uint16_t stage 2 indexes[?]; + * } else { + * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes + * uint32_t stage 2 flags and indexes[?]; + * } + * + * -- stage 3 tables with byte results + * if(outputType==MBCS_OUTPUT_1) { + * 
-- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c + * uint16_t fromUBytes[fromUBytesLength/2]; + * } else { + * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c + * uint8_t fromUBytes[fromUBytesLength]; or + * uint16_t fromUBytes[fromUBytesLength/2]; or + * uint32_t fromUBytes[fromUBytesLength/4]; + * } * } * - * -- stage 2 tables - * length determined by top of stage 1 and bottom of stage 3 tables - * if(outputType==MBCS_OUTPUT_1) { - * -- SBCS: pure indexes - * uint16_t stage 2 indexes[?]; - * } else { - * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes - * uint32_t stage 2 flags and indexes[?]; - * } - * - * -- stage 3 tables with byte results - * if(outputType==MBCS_OUTPUT_1) { - * -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c - * uint16_t fromUBytes[fromUBytesLength/2]; - * } else { - * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c - * uint8_t fromUBytes[fromUBytesLength]; or - * uint16_t fromUBytes[fromUBytesLength/2]; or - * uint32_t fromUBytes[fromUBytesLength/4]; - * } + * -- extension table, details see ucnv_ext.h + * int32_t indexes[>=32]; ... */ /* MBCS converter data and state -------------------------------------------- */ +enum { + MBCS_MAX_STATE_COUNT=128 +}; + /** * MBCS action codes for conversions to Unicode. * These values are in bits 23..20 of the state table entries. 
@@ -175,7 +197,11 @@ enum { MBCS_OUTPUT_4_EUC, /* 9 */ MBCS_OUTPUT_2_SISO=12, /* c */ - MBCS_OUTPUT_2_HZ /* d */ + MBCS_OUTPUT_2_HZ, /* d */ + + MBCS_OUTPUT_EXT_ONLY, /* e */ + + MBCS_OUTPUT_COUNT }; /** @@ -210,6 +236,9 @@ typedef struct UConverterMBCSTable { /* converter name for swaplfnl */ char *swapLFNLName; + + /* extension data */ + const int32_t *extIndexes; } UConverterMBCSTable; /** diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h index af2b70a909..eb5f948707 100644 --- a/icu4c/source/common/unicode/ucnv.h +++ b/icu4c/source/common/unicode/ucnv.h @@ -455,7 +455,7 @@ ucnv_safeClone(const UConverter *cnv, UErrorCode *status); /** @stable ICU 2.0 */ -#define U_CNV_SAFECLONE_BUFFERSIZE 3072 +#define U_CNV_SAFECLONE_BUFFERSIZE 4096 /** * Deletes the unicode converter and releases resources associated diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt index 036d1996a3..77d37a374d 100644 --- a/icu4c/source/test/testdata/conversion.txt +++ b/icu4c/source/test/testdata/conversion.txt @@ -43,6 +43,16 @@ conversion { toUnicode { Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } Cases { + // extensions + { + "*test3", + :bin{ 00050601020b0701020a01020c }, + "\u20ac\x05\x06\x0b\U00101234\U00023456\ufffd", + :intvector{ 0, 1, 2, 3, 6, 6, 7, 7, 10 }, + :int{1}, :int{0}, "", "?", :bin{""} + } + + // normal conversions { "UTF-16LE", :bin{ 310000d801dc00d902dc320000d8330001dc3400 }, @@ -110,6 +120,24 @@ conversion { fromUnicode { Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } Cases { + // extensions + { + "*test3", + "\xc4\xc4\xc4\U00101234\xc4\xc4\U00101234\x05", + :bin{ ffffff070501020c }, + :intvector{ 0, 1, 2, 3, 5, 5, 5, 5 }, + :int{1}, :int{0}, "", "?", "" + } + + { + "*test3", + "\U00101234\U00101234\U00050005\U00101234\U00050005\U00060006", + :bin{ 
07070001020e05070001020f09 }, + :intvector{ 0, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6 }, + :int{1}, :int{0}, "", "?", "" + } + + // normal conversions { "UTF-16LE", "1\U00010001\U000500022\ud8003\udc014", diff --git a/icu4c/source/test/testdata/test1.ucm b/icu4c/source/test/testdata/test1.ucm index 9fe7d0fe33..219704b83c 100644 --- a/icu4c/source/test/testdata/test1.ucm +++ b/icu4c/source/test/testdata/test1.ucm @@ -1,18 +1,19 @@ # ******************************************************************************* -# * Copyright (C) 2001, International Business Machines +# * Copyright (C) 2001-2003, International Business Machines # * Corporation and others. All Rights Reserved. # ******************************************************************************* # # test1.ucm # # Test file for MBCS conversion with single-byte codepage data. +# Also contains extension mappings (m:n). "test1" 1 1 "MBCS" - \xff - 0, 5-9, ff + \xff + 0, 5-9, ff CHARMAP diff --git a/icu4c/source/test/testdata/test3.ucm b/icu4c/source/test/testdata/test3.ucm index f9e6ea85c0..aee69c2898 100644 --- a/icu4c/source/test/testdata/test3.ucm +++ b/icu4c/source/test/testdata/test3.ucm @@ -1,20 +1,21 @@ # ******************************************************************************* -# * Copyright (C) 2001, International Business Machines +# * Copyright (C) 2001-2003, International Business Machines # * Corporation and others. All Rights Reserved. # ******************************************************************************* # # test3.ucm # # Test file for MBCS conversion with three-byte codepage data. +# Also contains extension mappings (m:n). 
"test3" 3 1 "MBCS" - \xff - 0, 1:1, 5-9, ff - 2:2 - a-f.p + \xff + 0, 1:1, 5-9, ff + 2:2 + a-f.p CHARMAP @@ -24,6 +25,11 @@ CHARMAP # nothing special \x05 |0 +# extensions + \x05+\x01\x02\x0d |0 + \x05+\x01\x02\x0e |3 + \x05+\xff |3 + # toUnicode result is fallback direct \x06 |3 @@ -31,8 +37,18 @@ CHARMAP \x07 |0 \x08 |3 +# extensions +++ \x07+\x00+\x01\x02\x0f+\x09 |0 ++ \x07+\x00+\x01\x02\x0e+\x05 |0 ++ \x07+\x00+\x01\x02\x0f+\x06 |0 ++ \x07+\x00+\x01\x02\x0f |1 + #unassigned \x09 +# extensions where the first code point is unassigned, for replay testing +# \x09+\x09 |0 + \x05+\x01\x02\x0c |0 + # toUnicode result is surrogate pair: test real pair, single unit, unassigned \x01\x02\x0a |0 \x01\x02\x0b |0 diff --git a/icu4c/source/test/testdata/test4.ucm b/icu4c/source/test/testdata/test4.ucm index 9738964a3f..9313257c95 100644 --- a/icu4c/source/test/testdata/test4.ucm +++ b/icu4c/source/test/testdata/test4.ucm @@ -1,21 +1,21 @@ # ******************************************************************************* -# * Copyright (C) 2001, International Business Machines +# * Copyright (C) 2001-2003, International Business Machines # * Corporation and others. All Rights Reserved. # ******************************************************************************* # # test4.ucm # -# Test file for MBCS conversion with three-byte codepage data. +# Test file for MBCS conversion with four-byte codepage data. 
"test4" 4 1 "MBCS" - \xff - 0, 1:1, 5-9, ff - 2:2 - 3:3 - a-f.p + \xff + 0, 1:1, 5-9, ff + 2:2 + 3:3 + a-f.p CHARMAP diff --git a/icu4c/source/tools/gennorm/store.c b/icu4c/source/tools/gennorm/store.c index c78a3193df..49375bd64d 100644 --- a/icu4c/source/tools/gennorm/store.c +++ b/icu4c/source/tools/gennorm/store.c @@ -26,6 +26,7 @@ #include "unicode/udata.h" #include "utrie.h" #include "unicode/uset.h" +#include "toolutil.h" #include "unewdata.h" #include "unormimp.h" #include "gennorm.h" @@ -86,87 +87,6 @@ setUnicodeVersion(const char *v) { static int32_t indexes[_NORM_INDEX_TOP]={ 0 }; -/* tool memory helper ------------------------------------------------------- */ - -/* - * UToolMemory is used for generic, custom memory management. - * It is allocated with enough space for count*size bytes starting - * at array. - * The array is declared with a union of large data types so - * that its base address is aligned for any types. - * If size is a multiple of a data type size, then such items - * can be safely allocated inside the array, at offsets that - * are themselves multiples of size. 
- */ -typedef struct UToolMemory { - char name[64]; - uint32_t count, size, index; - union { - uint32_t u; - double d; - void *p; - } array[1]; -} UToolMemory; - -static UToolMemory * -utm_open(const char *name, uint32_t count, uint32_t size) { - UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size); - if(mem==NULL) { - fprintf(stderr, "error: %s - out of memory\n", name); - exit(U_MEMORY_ALLOCATION_ERROR); - } - uprv_strcpy(mem->name, name); - mem->count=count; - mem->size=size; - mem->index=0; - return mem; -} - -static void -utm_close(UToolMemory *mem) { - if(mem!=NULL) { - uprv_free(mem); - } -} - - - -static void * -utm_getStart(UToolMemory *mem) { - return (char *)mem->array; -} - -static int32_t -utm_countItems(UToolMemory *mem) { - return mem->index; -} - -static void * -utm_alloc(UToolMemory *mem) { - char *p=(char *)mem->array+mem->index*mem->size; - if(++mem->index<=mem->count) { - uprv_memset(p, 0, mem->size); - return p; - } else { - fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", - mem->name, (long)mem->count); - exit(U_MEMORY_ALLOCATION_ERROR); - } -} - -static void * -utm_allocN(UToolMemory *mem, int32_t n) { - char *p=(char *)mem->array+mem->index*mem->size; - if((mem->index+=(uint32_t)n)<=mem->count) { - uprv_memset(p, 0, n*mem->size); - return p; - } else { - fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n", - mem->name, (long)mem->count); - exit(U_MEMORY_ALLOCATION_ERROR); - } -} - /* builder data ------------------------------------------------------------- */ typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm); @@ -244,23 +164,23 @@ init() { } /* allocate Norm structures and reset the first one */ - normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm)); + normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm)); norms=utm_alloc(normMem); /* allocate UTF-32 string memory */ - utf32Mem=utm_open("gennorm UTF-32 
strings", 30000, 4); + utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4); /* reset all "have seen" flags */ uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags)); /* allocate extra data memory for UTF-16 decomposition strings and other values */ - extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2); + extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2); /* initialize the extraMem counter for the top of FNC strings */ p16=(uint16_t *)utm_alloc(extraMem); *p16=1; /* allocate temporary memory for combining triples */ - combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple)); + combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple)); /* set the minimum code points for no/maybe quick check values to the end of the BMP */ indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff; @@ -508,7 +428,7 @@ processCombining() { triples=utm_getStart(combiningTriplesMem); /* add lead and trail indexes to the triples for sorting */ - count=(uint16_t)combiningTriplesMem->index; + count=(uint16_t)utm_countItems(combiningTriplesMem); for(i=0; iindex; + count=(int32_t)utm_countItems(normMem); for(i=0; iindex; + count=utm_countItems(normMem); for(i=0; iindex; + count=utm_countItems(combiningTriplesMem); /* triples are not sorted by code point but for each lead CP there is one contiguous block */ for(i=0; iindex; + count=utm_countItems(combiningTriplesMem); c=s[0]; /* triples are not sorted by code point but for each lead CP there is one contiguous block */ @@ -1838,7 +1758,7 @@ generateData(const char *dataDir) { canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]; /* make sure that the FCD trie is 4-aligned */ - if((extraMem->index+combiningTableTop)&1) { + if((utm_countItems(extraMem)+combiningTableTop)&1) { combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */ } @@ -1850,7 +1770,7 @@ 
generateData(const char *dataDir) { size= _NORM_INDEX_TOP*4+ normTrieSize+ - extraMem->index*2+ + utm_countItems(extraMem)*2+ combiningTableTop*2+ fcdTrieSize+ auxTrieSize+ @@ -1858,7 +1778,7 @@ generateData(const char *dataDir) { if(beVerbose) { printf("size of normalization trie %5u bytes\n", normTrieSize); - printf("size of 16-bit extra memory %5u UChars/uint16_t\n", extraMem->index); + printf("size of 16-bit extra memory %5u UChars/uint16_t\n", utm_countItems(extraMem)); printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]); printf("size of combining table %5u uint16_t\n", combiningTableTop); printf("size of FCD trie %5u bytes\n", fcdTrieSize); @@ -1873,7 +1793,7 @@ generateData(const char *dataDir) { } indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize; - indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index; + indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem); indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop; indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop; @@ -1900,7 +1820,7 @@ generateData(const char *dataDir) { udata_writeBlock(pData, indexes, sizeof(indexes)); udata_writeBlock(pData, normTrieBlock, normTrieSize); - udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2); + udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2); udata_writeBlock(pData, combiningTable, combiningTableTop*2); udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize); udata_writeBlock(pData, auxTrieBlock, auxTrieSize); @@ -1928,7 +1848,7 @@ extern void cleanUpData(void) { int32_t i, count; - count=(int32_t)normMem->index; + count=utm_countItems(normMem); for(i=0; i +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "ucnv_cnv.h" +#include "ucnvmbcs.h" +#include "toolutil.h" +#include "unewdata.h" +#include "ucm.h" +#include "makeconv.h" +#include "genmbcs.h" + +#define LENGTHOF(array) 
(int32_t)(sizeof(array)/sizeof((array)[0])) + +static void +CnvExtClose(NewConverter *cnvData); + +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length); + +static UBool +CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); + +typedef struct CnvExtData { + NewConverter newConverter; + + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ + UToolMemory *toUTable, *toUUChars; + + /* fromUnicode */ + UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes; + + uint16_t stage1[MBCS_STAGE_1_SIZE]; + uint16_t stage2[MBCS_STAGE_2_SIZE]; + uint16_t stage3[0x10000< |2 mappings */ + uint16_t stage3Sub1Block; +} CnvExtData; + +NewConverter * +CnvExtOpen(UCMFile *ucm) { + CnvExtData *extData; + + extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData)); + if(extData!=NULL) { + uprv_memset(extData, 0, sizeof(CnvExtData)); + + extData->ucm=ucm; /* aliased, not owned */ + + extData->newConverter.close=CnvExtClose; + extData->newConverter.isValid=CnvExtIsValid; + extData->newConverter.addTable=CnvExtAddTable; + extData->newConverter.write=CnvExtWrite; + } + return &extData->newConverter; +} + +static void +CnvExtClose(NewConverter *cnvData) { + CnvExtData *extData=(CnvExtData *)cnvData; + if(extData!=NULL) { + utm_close(extData->toUTable); + utm_close(extData->toUUChars); + utm_close(extData->fromUTableUChars); + utm_close(extData->fromUTableValues); + utm_close(extData->fromUBytes); + } +} + +/* we do not expect this to be called */ +static UBool +CnvExtIsValid(NewConverter *cnvData, + const uint8_t *bytes, int32_t length) { + return FALSE; +} + +static uint32_t +CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { + CnvExtData *extData=(CnvExtData *)cnvData; + int32_t length, top, 
headerSize; + + int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 }; + + if(tableType&TABLE_BASE) { + headerSize=0; + } else { + _MBCSHeader header={ 0 }; + + /* write the header and base table name for an extension-only table */ + length=uprv_strlen(extData->ucm->baseName)+1; + while(length&3) { + /* add padding */ + extData->ucm->baseName[length++]=0; + } + + headerSize=sizeof(header)+length; + + /* fill the header */ + header.version[0]=4; + header.version[1]=2; + header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY); + + /* write the header and the base table name */ + udata_writeBlock(pData, &header, sizeof(header)); + udata_writeBlock(pData, extData->ucm->baseName, length); + } + + /* fill indexes[] - offsets/indexes are in units of the target array */ + top=0; + + indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH; + top+=length*4; + + indexes[UCNV_EXT_TO_U_INDEX]=top; + indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable); + top+=length*4; + + indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top; + indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars); + top+=length*2; + + indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top; + length=utm_countItems(extData->fromUTableUChars); + top+=length*2; + + if(top&3) { + /* add padding */ + *((UChar *)utm_alloc(extData->fromUTableUChars))=0; + *((uint32_t *)utm_alloc(extData->fromUTableValues))=0; + ++length; + top+=2; + } + indexes[UCNV_EXT_FROM_U_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top; + top+=length*4; + + indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top; + length=utm_countItems(extData->fromUBytes); + top+=length; + + if(top&1) { + /* add padding */ + *((uint8_t *)utm_alloc(extData->fromUBytes))=0; + ++length; + ++top; + } + indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top; + indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top; + 
top+=length*2; + + indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top; + length=extData->stage3Top; + top+=length*2; + + if(top&3) { + /* add padding */ + extData->stage3[extData->stage3Top++]=0; + ++length; + top+=2; + } + indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length; + + indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top; + indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop; + top+=length*4; + + indexes[UCNV_EXT_SIZE]=top; + + /* write the extension data */ + udata_writeBlock(pData, indexes, sizeof(indexes)); + udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2); + + udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2); + udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4); + udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]); + + udata_writeBlock(pData, extData->stage1, extData->stage1Top*2); + udata_writeBlock(pData, extData->stage2, extData->stage2Top*2); + udata_writeBlock(pData, extData->stage3, extData->stage3Top*2); + udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4); + + { + int32_t i, j; + + length=extData->stage1Top; + printf("\nstage1[%x]:\n", length); + + for(i=0; istage1[i]!=length) { + printf("stage1[%04x]=%04x\n", i, extData->stage1[i]); + } + } + + j=length; + length=extData->stage2Top; + printf("\nstage2[%x]:\n", length); + + for(i=0; istage2[i]!=0) { + printf("stage12[%04x]=%04x\n", j, extData->stage2[i]); + } + } + + length=extData->stage3Top; + printf("\nstage3[%x]:\n", length); + + for(i=0; istage3[i]!=0) { + printf("stage3[%04x]=%04x\n", i, extData->stage3[i]); + } + } + + length=extData->stage3bTop; + printf("\nstage3b[%x]:\n", length); + + for(i=0; istage3b[i]!=0) { + printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]); + } + } + } + + 
if(VERBOSE) { + printf("size of extension data: %ld\n", (long)top); /* top is int32_t; %ld requires a long argument */ + } + + /* return the number of bytes that should have been written */ + return (uint32_t)(headerSize+top); +} + +/* to Unicode --------------------------------------------------------------- */ + +/* + * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for + * the toUnicode table. + * The table must be sorted. + * Destroys previous data in the reverseMap. + */ +static int32_t +reduceToUMappings(UCMTable *table) { + UCMapping *mappings; + int32_t *map; + int32_t i, j, count; + int8_t flag; + + mappings=table->mappings; + map=table->reverseMap; + count=table->mappingsLength; + + /* leave the map alone for the initial mappings with desired flags */ + for(i=j=0; iuLen==1) { + value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u); + } else { + /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */ + + /* get the result code point string and its 16-bit string length */ + u32=UCM_GET_CODE_POINTS(table, m); + errorCode=U_ZERO_ERROR; + u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + + /* allocate it and put its length and index into the value */ + value= + (((uint32_t)m->uLen+UCNV_EXT_TO_U_LENGTH_OFFSET)<toUUChars)); + u=utm_allocN(extData->toUUChars, u16Length); + + /* write the result 16-bit string */ + errorCode=U_ZERO_ERROR; + u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode); + if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) { + exit(errorCode); + } + } + if(m->f==0) { + value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG; + } + return value; +} + +/* + * Recursive toUTable generator core function. 
+ * Preconditions: + * - start0: if there is one mapping with an input unit sequence of unitIndex+1 + * then defaultValue=compute the mapping result for this whole sequence + * else defaultValue=0 + * + * recurse into the subsection + */ +static UBool +generateToUTable(CnvExtData *extData, UCMTable *table, + int32_t start, int32_t limit, int32_t unitIndex, + uint32_t defaultValue) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, uniqueCount, count, subStart, subLimit; + + uint8_t *bytes; + int32_t low, high, prev; + + uint32_t *section; + + mappings=table->mappings; + map=table->reverseMap; + + /* step 1: examine the input units; set low, high, uniqueCount */ + m=mappings+map[start]; + bytes=UCM_GET_BYTES(table, m); + low=bytes[unitIndex]; + uniqueCount=1; + + prev=high=low; + for(i=start+1; i=(3*count)/4) { + /* + * for the root table and for fairly full tables: + * allocate for direct, linear array access + * by keeping count, to write an entry for each unit value + * from low to high + */ + } else { + count=uniqueCount; + } + + /* allocate the section: 1 entry for the header + count for the items */ + section=(uint32_t *)utm_allocN(extData->toUTable, 1+count); + + /* write the section header */ + *section++=((uint32_t)count<uniqueCount) { + /* write empty subsections for unused units in a linear table */ + while(++prevbLen==unitIndex+1) { + /* do not include this in generateToUTable() */ + ++subStart; + + if(subStarttoUTable); + + /* recurse */ + if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * Generate the toUTable and toUUChars from the input table. + * The input table must be sorted, and all precision flags must be 0..3. + * This function will modify the table's reverseMap. 
+ */ +static UBool +makeToUTable(CnvExtData *extData, UCMTable *table) { + /* keep only the mappings that matter for toUnicode; this rewrites table->reverseMap */ + const int32_t relevantCount=reduceToUMappings(table); + + /* storage for the toUnicode sections and for the 16-bit result strings */ + extData->toUTable=utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4); + extData->toUUChars=utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2); + + /* generate the root section over all remaining mappings: first byte, no default value */ + return generateToUTable(extData, table, 0, relevantCount, 0, 0); +} + +/* from Unicode ------------------------------------------------------------- */ + +/* + * preprocessing: + * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode + * change each Unicode string to encode all but the first code point in 16-bit form + * + * generation: + * for each unique code point + * write an entry in the 3-stage trie + * check that there is only one single-code point sequence + * start recursion for following 16-bit input units + */ + +/* + * Remove toUnicode fallbacks and non-<subchar1> SUB mappings + * which are irrelevant for the fromUnicode extension table. + * Overwrite the reverseMap with an index array to the relevant mappings. + * Modify the code point sequences to a generator-friendly format where + * the first code point remains unchanged but the following ones are recoded + * into 16-bit Unicode string form. + * The table must be sorted. + * Destroys previous data in the reverseMap. 
+ */ +static int32_t +prepareFromUMappings(UCMTable *table) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, count; + int8_t flag; + + mappings=table->mappings; + map=table->reverseMap; + count=table->mappingsLength; + + /* + * we do not go through the map on input because the mappings are + * sorted lexically + */ + m=mappings; + + for(i=j=0; if; + if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) { + map[j++]=i; + + if(m->uLen>1) { + /* recode all but the first code point to 16-bit Unicode */ + UChar32 *u32; + UChar *u; + UChar32 c; + int32_t q, r; + + u32=UCM_GET_CODE_POINTS(table, m); + u=(UChar *)u32; /* destructive in-place recoding */ + for(r=2, q=1; quLen; ++q) { + c=u32[q]; + U16_APPEND_UNSAFE(u, r, c); + } + + /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */ + m->uLen=(int8_t)r; + } + } + } + + return j; +} + +static uint32_t +getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) { + uint8_t *bytes, *resultBytes; + uint32_t value; + + if(m->f==2) { + return UCNV_EXT_FROM_U_SUBCHAR1; /* SUB mapping */ + } + + bytes=UCM_GET_BYTES(table, m); + value=0; + switch(m->bLen) { + /* 1..3: store the bytes in the value word */ + case 3: + value=((uint32_t)*bytes++)<<16; + case 2: + value|=((uint32_t)*bytes++)<<8; + case 1: + value|=*bytes; + break; + default: + /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */ + /* store the bytes in fromUBytes[] and the index in the value word */ + value=(uint32_t)utm_countItems(extData->fromUBytes); + resultBytes=utm_allocN(extData->fromUBytes, m->bLen); + uprv_memcpy(resultBytes, bytes, m->bLen); + break; + } + value|=(uint32_t)m->bLen<f==0) { + value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG; + } + return value; +} + +/* + * works like generateToUTable(), except that the + * output section consists of two arrays, one for input UChars and one + * for result values + * + * also, fromUTable sections are always stored in a compact form for + * access via binary search 
+ */ +static UBool +generateFromUTable(CnvExtData *extData, UCMTable *table, + int32_t start, int32_t limit, int32_t unitIndex, + uint32_t defaultValue) { + UCMapping *mappings, *m; + int32_t *map; + int32_t i, j, uniqueCount, count, subStart, subLimit; + + UChar *uchars; + UChar32 low, high, prev; + + UChar *sectionUChars; + uint32_t *sectionValues; + + mappings=table->mappings; + map=table->reverseMap; + + /* step 1: examine the input units; set low, high, uniqueCount */ + m=mappings+map[start]; + uchars=(UChar *)UCM_GET_CODE_POINTS(table, m); + low=uchars[unitIndex]; + uniqueCount=1; + + prev=high=low; + for(i=start+1; ifromUTableUChars, 1+count); + sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count); + + /* write the section header */ + *sectionUChars++=(UChar)count; + *sectionValues++=defaultValue; + + /* step 3: write temporary section table with subsection starts */ + prev=low-1; /* just before low to prevent empty subsections before low */ + j=0; /* section table index */ + for(i=start; iuLen==unitIndex+1) { + /* do not include this in generateToUTable() */ + ++subStart; + + if(subStartfromUTableValues); + + /* recurse */ + if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * add entries to the fromUnicode trie, + * assume to be called with code points in ascending order + * and use that to build the trie in precompacted form + */ +static void +addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) { + int32_t i1, i2, i3, i3b, nextOffset, min, newBlock; + + if(value==0) { + return; + } + + /* + * compute the index for each stage, + * allocate a stage block if necessary, + * and write the stage value + */ + i1=c>>10; + if(i1>=extData->stage1Top) { + extData->stage1Top=i1+1; + } + + nextOffset=(c>>4)&0x3f; + + if(extData->stage1[i1]==0) { + /* allocate another block in stage 2; overlap with the previous block */ + 
newBlock=extData->stage2Top; + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage2[newBlock-1]==0) { + --newBlock; + } + + extData->stage1[i1]=(uint16_t)newBlock; + extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE; + if(extData->stage2Top>LENGTHOF(extData->stage2)) { + fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + i2=extData->stage1[i1]+nextOffset; + nextOffset=c&0xf; + + if(extData->stage2[i2]==0) { + /* allocate another block in stage 3; overlap with the previous block */ + newBlock=extData->stage3Top; + min=newBlock-nextOffset; /* minimum block start with overlap */ + while(minstage3[newBlock-1]==0) { + --newBlock; + } + + /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */ + newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1); + extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT); + + extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE; + if(extData->stage3Top>LENGTHOF(extData->stage3)) { + fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + i3=((int32_t)extData->stage2[i2]<stage3[i3]==0 because we get + * code points in strictly ascending order + */ + + if(value==UCNV_EXT_FROM_U_SUBCHAR1) { + /* SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */ + extData->stage3[i3]=1; + + /* + * precompaction is not optimal for |2 mappings because + * stage3 values for them are all the same, unlike for other mappings + * which all have unique values; + * use a simple compaction of reusing a whole block filled with these + * mappings + */ + + /* is the entire block filled with |2 mappings? 
*/ + if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) { + for(min=i3-nextOffset; + minstage3[min]==1; + ++min) {} + + if(min==i3) { + /* the entire block is filled with these mappings */ + if(extData->stage3Sub1Block!=0) { + /* point to the previous such block and remove this block from stage3 */ + extData->stage2[i2]=extData->stage3Sub1Block; + extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE; + uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2); + } else { + /* remember this block's stage2 entry */ + extData->stage3Sub1Block=extData->stage2[i2]; + } + } + } + } else { + if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) { + fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", c); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* roundtrip or fallback mapping */ + extData->stage3[i3]=(uint16_t)i3b; + extData->stage3b[i3b]=value; + } +} + +static UBool +generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) { + UCMapping *mappings, *m; + int32_t *map; + uint32_t value; + int32_t subStart, subLimit; + + UChar32 *codePoints; + UChar32 c, next; + + if(mapLength==0) { + return TRUE; + } + + mappings=table->mappings; + map=table->reverseMap; + + /* + * iterate over same-initial-code point mappings, + * enter the initial code point into the trie, + * and start a recursion on the corresponding mappings section + * with generateFromUTable() + */ + m=mappings+map[0]; + codePoints=UCM_GET_CODE_POINTS(table, m); + next=codePoints[0]; + subLimit=0; + while(subLimituLen==1) { + /* do not include this in generateFromUTable() */ + ++subStart; + + if(subStartfromUTableValues)); + + /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */ + if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) { + return FALSE; + } + } + } + return TRUE; +} + +/* + * Generate the fromU data structures from the input table. + * The input table must be sorted, and all precision flags must be 0..3. 
+ * This function will modify the table's reverseMap. + */ +static UBool +makeFromUTable(CnvExtData *extData, UCMTable *table) { + uint16_t *stage1; + int32_t i, stage1Top, fromUCount; + + fromUCount=prepareFromUMappings(table); + + extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2); + extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4); + extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1); + + /* allocate all-unassigned stage blocks */ + extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; + extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED; + + /* + * stage 3b stores only unique values, and in + * index 0: 0 for "no mapping" + * index 1: "no mapping" with preference for rather than + */ + extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1; + extData->stage3bTop=2; + + /* allocate the first entry in the fromUTable because index 0 means "no result" */ + utm_alloc(extData->fromUTableUChars); + utm_alloc(extData->fromUTableValues); + + if(!generateFromUTrie(extData, table, fromUCount)) { + return FALSE; + } + + /* + * offset the stage 1 trie entries by stage1Top because they will + * be stored in a single array + */ + stage1=extData->stage1; + stage1Top=extData->stage1Top; + for(i=0; iunicodeMask=table->unicodeMask; + if(staticData->unicodeMask&UCNV_HAS_SURROGATES) { + fprintf(stderr, "error: contains mappings for surrogate code points\n"); + return FALSE; + } + + staticData->conversionType=UCNV_MBCS; + + extData=(CnvExtData *)cnvData; + + /* + * assume that the table is sorted + * + * call the functions in this order because + * makeToUTable() modifies the original reverseMap, + * makeFromUTable() writes a whole new mapping into reverseMap + */ + return + makeToUTable(extData, table) && + makeFromUTable(extData, table); +} diff --git a/icu4c/source/tools/makeconv/genmbcs.c b/icu4c/source/tools/makeconv/genmbcs.c index 
dfd0174aaf..fbe9922952 100644 --- a/icu4c/source/tools/makeconv/genmbcs.c +++ b/icu4c/source/tools/makeconv/genmbcs.c @@ -21,49 +21,26 @@ #include "unewdata.h" #include "ucnv_cnv.h" #include "ucnvmbcs.h" +#include "ucm.h" #include "makeconv.h" #include "genmbcs.h" -enum { - MBCS_STATE_FLAG_DIRECT=1, - MBCS_STATE_FLAG_SURROGATES, - - MBCS_STATE_FLAG_READY=16 -}; - -enum { - MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */ - MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ - MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */ - MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */ - MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, - MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, - - MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ - MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ - - MBCS_MAX_STATE_COUNT=128, - MBCS_MAX_FALLBACK_COUNT=8192 -}; - typedef struct MBCSData { NewConverter newConverter; - /* toUnicode */ - int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; - uint32_t stateFlags[MBCS_MAX_STATE_COUNT], - stateOffsetSum[MBCS_MAX_STATE_COUNT]; + UCMFile *ucm; + + /* toUnicode (state table in ucm->states) */ _MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT]; + int32_t countToUFallbacks; uint16_t *unicodeCodeUnits; - _MBCSHeader header; - int32_t countToUCodeUnits; /* fromUnicode */ uint16_t stage1[MBCS_STAGE_1_SIZE]; uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */ uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */ uint8_t *fromUBytes; - uint32_t stage2Top, stage3Top, maxCharLength; + uint32_t stage2Top, stage3Top; } MBCSData; /* prototypes */ @@ -71,64 +48,80 @@ static void MBCSClose(NewConverter *cnvData); static UBool -MBCSProcessStates(NewConverter *cnvData); +MBCSStartMappings(MBCSData *mbcsData); 
static UBool -MBCSAddToUnicode(NewConverter *cnvData, +MBCSAddToUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback); + UChar32 c, + int8_t flag); static UBool MBCSIsValid(NewConverter *cnvData, - const uint8_t *bytes, int32_t length, - uint32_t b); + const uint8_t *bytes, int32_t length); static UBool -MBCSSingleAddFromUnicode(NewConverter *cnvData, +MBCSSingleAddFromUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback); + UChar32 c, + int8_t flag); static UBool -MBCSAddFromUnicode(NewConverter *cnvData, +MBCSAddFromUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback); + UChar32 c, + int8_t flag); static void -MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData); +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData); + +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); static uint32_t -MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData); +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); + +/* helper ------------------------------------------------------------------- */ + +static U_INLINE char +hexDigit(uint8_t digit) { + return digit<=9 ? 
(char)('0'+digit) : (char)('a'-10+digit); +} + +static U_INLINE char * +printBytes(char *buffer, const uint8_t *bytes, int32_t length) { + char *s=buffer; + while(length>0) { + *s++=hexDigit((uint8_t)(*bytes>>4)); + *s++=hexDigit((uint8_t)(*bytes&0xf)); + ++bytes; + --length; + } + + *s=0; + return buffer; +} /* implementation ----------------------------------------------------------- */ static void -MBCSInit(MBCSData *mbcsData, uint8_t maxCharLength) { - int i; +MBCSInit(MBCSData *mbcsData, UCMFile *ucm) { + int32_t i, maxCharLength; uprv_memset(mbcsData, 0, sizeof(MBCSData)); + maxCharLength=ucm->states.maxCharLength; + + mbcsData->ucm=ucm; /* aliased, not owned */ + mbcsData->newConverter.close=MBCSClose; - mbcsData->newConverter.startMappings=MBCSProcessStates; mbcsData->newConverter.isValid=MBCSIsValid; - mbcsData->newConverter.addToUnicode=MBCSAddToUnicode; - if(maxCharLength==1) { - mbcsData->newConverter.addFromUnicode=MBCSSingleAddFromUnicode; - } else { - mbcsData->newConverter.addFromUnicode=MBCSAddFromUnicode; - } - mbcsData->newConverter.finishMappings=MBCSPostprocess; + mbcsData->newConverter.addTable=MBCSAddTable; mbcsData->newConverter.write=MBCSWrite; - mbcsData->header.version[0]=4; - mbcsData->header.version[1]=1; - mbcsData->stateFlags[0]=MBCS_STATE_FLAG_DIRECT; mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */ mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */ - mbcsData->maxCharLength=maxCharLength; - mbcsData->header.flags=maxCharLength-1; /* outputType */ /* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */ for(i=0; inewConverter; } @@ -149,366 +142,22 @@ static void MBCSClose(NewConverter *cnvData) { MBCSData *mbcsData=(MBCSData *)cnvData; if(mbcsData!=NULL) { - if(mbcsData->unicodeCodeUnits!=NULL) { - uprv_free(mbcsData->unicodeCodeUnits); - } - if(mbcsData->fromUBytes!=NULL) { - uprv_free(mbcsData->fromUBytes); - } + 
uprv_free(mbcsData->unicodeCodeUnits); + uprv_free(mbcsData->fromUBytes); uprv_free(mbcsData); } } -static const char * -skipWhitespace(const char *s) { - while(*s==' ' || *s=='\t') { - ++s; - } - return s; -} - -/* - * state table row grammar (ebnf-style): - * (whitespace is allowed between all tokens) - * - * row=[[firstentry ','] entry (',' entry)*] - * firstentry="initial" | "surrogates" - * (initial state (default for state 0), output is all surrogate pairs) - * entry=range [':' nextstate] ['.' action] - * range=number ['-' number] - * nextstate=number - * (0..7f) - * action='u' | 's' | 'p' | 'i' - * (unassigned, state change only, surrogate pair, illegal) - * number=(1- or 2-digit hexadecimal number) - */ -static const char * -parseState(const char *s, int32_t state[256], uint32_t *pFlags) { - const char *t; - uint32_t start, end, i; - int32_t entry; - - /* initialize the state: all illegal with U+ffff */ - for(i=0; i<256; ++i) { - state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); - } - - /* skip leading white space */ - s=skipWhitespace(s); - - /* is there an "initial" or "surrogates" directive? 
*/ - if(uprv_strncmp("initial", s, 7)==0) { - *pFlags=MBCS_STATE_FLAG_DIRECT; - s=skipWhitespace(s+7); - if(*s++!=',') { - return s-1; - } - } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { - *pFlags=MBCS_STATE_FLAG_SURROGATES; - s=skipWhitespace(s+10); - if(*s++!=',') { - return s-1; - } - } else if(*s==0) { - /* empty state row: all-illegal */ - return NULL; - } - - for(;;) { - /* read an entry, the start of the range first */ - s=skipWhitespace(s); - start=uprv_strtoul(s, (char **)&t, 16); - if(s==t || 0xffheader.countStates==MBCS_MAX_STATE_COUNT) { - fprintf(stderr, "error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); - return FALSE; - } - - error=parseState(s, mbcsData->stateTable[mbcsData->header.countStates], - &mbcsData->stateFlags[mbcsData->header.countStates]); - if(error!=NULL) { - fprintf(stderr, "parse error in state definition at '%s'\n", error); - return FALSE; - } - - ++mbcsData->header.countStates; - return TRUE; -} - -static int32_t -sumUpStates(MBCSData *mbcsData) { - int32_t entry, sum; - int state, cell, count; - UBool allStatesReady; - - /* - * Sum up the offsets for all states. - * In each final state (where there are only final entries), - * the offsets add up directly. - * In all other state table rows, for each transition entry to another state, - * the offsets sum of that state needs to be added. - * This is achieved in at most countStates iterations. 
- */ - allStatesReady=FALSE; - for(count=mbcsData->header.countStates; !allStatesReady && count>=0; --count) { - allStatesReady=TRUE; - for(state=mbcsData->header.countStates-1; state>=0; --state) { - if(!(mbcsData->stateFlags[state]&MBCS_STATE_FLAG_READY)) { - allStatesReady=FALSE; - sum=0; - - /* at first, add up only the final delta offsets to keep them <512 */ - for(cell=0; cell<256; ++cell) { - entry=mbcsData->stateTable[state][cell]; - if(MBCS_ENTRY_IS_FINAL(entry)) { - switch(MBCS_ENTRY_FINAL_ACTION(entry)) { - case MBCS_STATE_VALID_16: - mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); - sum+=1; - break; - case MBCS_STATE_VALID_16_PAIR: - mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); - sum+=2; - break; - default: - /* no addition */ - break; - } - } - } - - /* now, add up the delta offsets for the transitional entries */ - for(cell=0; cell<256; ++cell) { - entry=mbcsData->stateTable[state][cell]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - if(mbcsData->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { - mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); - sum+=mbcsData->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; - } else { - /* that next state does not have a sum yet, we cannot finish the one for this state */ - sum=-1; - break; - } - } - } - - if(sum!=-1) { - mbcsData->stateOffsetSum[state]=sum; - mbcsData->stateFlags[state]|=MBCS_STATE_FLAG_READY; - } - } - } - } - - if(!allStatesReady) { - fprintf(stderr, "error: the state table contains loops\n"); - return -1; - } - - /* - * For all "direct" (i.e., initial) states>0, - * the offsets need to be increased by the sum of - * the previous initial states. 
- */ - sum=mbcsData->stateOffsetSum[0]; - for(state=1; state<(int)mbcsData->header.countStates; ++state) { - if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { - int32_t sum2=sum; - sum+=mbcsData->stateOffsetSum[state]; - for(cell=0; cell<256; ++cell) { - entry=mbcsData->stateTable[state][cell]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); - } - } - } - } - if(VERBOSE) { - printf("the total number of offsets is 0x%lx=%ld\n", - (unsigned long)sum, (long)sum); - } - - /* round up to the next even number to have the following data 32-bit-aligned */ - sum=(sum+1)&~1; - return mbcsData->countToUCodeUnits=sum; -} - static UBool -MBCSProcessStates(NewConverter *cnvData) { - MBCSData *mbcsData=(MBCSData *)cnvData; - int32_t i, entry, sum; - int state, cell; - - /* - * first make sure that all "next state" values are within limits - * and that all next states after final ones have the "direct" - * flag of initial states - */ - for(state=mbcsData->header.countStates-1; state>=0; --state) { - for(cell=0; cell<256; ++cell) { - entry=mbcsData->stateTable[state][cell]; - if((uint8_t)MBCS_ENTRY_STATE(entry)>=mbcsData->header.countStates) { - fprintf(stderr, "error: state table entry [%x][%x] has a next state of %x that is too high\n", - state, cell, MBCS_ENTRY_STATE(entry)); - return FALSE; - } - if(MBCS_ENTRY_IS_FINAL(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { - fprintf(stderr, "error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", - state, cell, MBCS_ENTRY_STATE(entry)); - return FALSE; - } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { - fprintf(stderr, "error: state table entry [%x][%x] is not final but has an initial next state of %x\n", - state, cell, MBCS_ENTRY_STATE(entry)); - return FALSE; - } - } - } - - /* is this an SI/SO (like 
EBCDIC-stateful) state table? */ - if(mbcsData->header.countStates>=2 && (mbcsData->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { - if(mbcsData->maxCharLength!=2) { - fprintf(stderr, "error: SI/SO codepages must have max 2 bytes/char (not %x)\n", mbcsData->maxCharLength); - return FALSE; - } - if(mbcsData->header.countStates<3) { - fprintf(stderr, "error: SI/SO codepages must have at least 3 states (not %x)\n", mbcsData->header.countStates); - return FALSE; - } - /* are the SI/SO all in the right places? */ - if( mbcsData->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && - mbcsData->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && - mbcsData->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && - mbcsData->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) - ) { - mbcsData->header.flags=MBCS_OUTPUT_2_SISO; - } else { - fprintf(stderr, "error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); - return FALSE; - } - state=2; - } else { - state=1; - } - - /* check that no unexpected state is a "direct" one */ - while(state<(int)mbcsData->header.countStates) { - if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { - fprintf(stderr, "error: state %d is 'initial' - not supported except for SI/SO codepages\n", state); - return FALSE; - } - ++state; - } - - sum=sumUpStates(mbcsData); - if(sum<0) { - return FALSE; - } +MBCSStartMappings(MBCSData *mbcsData) { + int32_t i, sum; /* allocate the code unit array and prefill it with "unassigned" values */ + sum=mbcsData->ucm->states.countToUCodeUnits; + if(VERBOSE) { + printf("the total number of offsets is 0x%lx=%ld\n", (unsigned long)sum, (long)sum); /* sum is int32_t; cast to match the long format specifiers */ + } + if(sum>0) { mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); if(mbcsData->unicodeCodeUnits==NULL) { @@ -522,17 +171,16 @@ MBCSProcessStates(NewConverter *cnvData) { } /* allocate the codepage mappings and preset the first 16 characters to 0 */ - 
if(mbcsData->maxCharLength==1) { + if(mbcsData->ucm->states.maxCharLength==1) { /* allocate 64k 16-bit results for single-byte codepages */ sum=0x20000; } else { /* allocate 1M * maxCharLength bytes for at most 1M mappings */ - sum=0x100000*mbcsData->maxCharLength; + sum=0x100000*mbcsData->ucm->states.maxCharLength; } mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum); if(mbcsData->fromUBytes==NULL) { - fprintf(stderr, "error: out of memory allocating %ldMB for target mappings\n", - (long)sum); + fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", sum); return FALSE; } /* initialize the all-unassigned first stage 3 block */ @@ -541,46 +189,24 @@ MBCSProcessStates(NewConverter *cnvData) { return TRUE; } -/* find a fallback for this offset; return the index or -1 if not found */ -static int32_t -findFallback(MBCSData *mbcsData, uint32_t offset) { - _MBCSToUFallback *toUFallbacks; - int32_t i, limit; - - limit=mbcsData->header.countToUFallbacks; - if(limit==0) { - /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ - return -1; - } - - /* do a linear search for the fallback mapping (the table is not yet sorted) */ - toUFallbacks=mbcsData->toUFallbacks; - for(i=0; itoUFallbacks, mbcsData->countToUFallbacks, offset); if(i>=0) { /* if there is already a fallback for this offset, then overwrite it */ mbcsData->toUFallbacks[i].codePoint=c; return TRUE; } else { /* if there is no fallback for this offset, then add one */ - i=mbcsData->header.countToUFallbacks; + i=mbcsData->countToUFallbacks; if(i>=MBCS_MAX_FALLBACK_COUNT) { fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", c); return FALSE; } else { mbcsData->toUFallbacks[i].offset=offset; mbcsData->toUFallbacks[i].codePoint=c; - mbcsData->header.countToUFallbacks=i+1; + mbcsData->countToUFallbacks=i+1; return TRUE; } } @@ -589,19 +215,19 @@ setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) { /* remove fallback if there is one with this 
offset; return the code point if there was such a fallback, otherwise -1 */ static int32_t removeFallback(MBCSData *mbcsData, uint32_t offset) { - int32_t i=findFallback(mbcsData, offset); + int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset); if(i>=0) { _MBCSToUFallback *toUFallbacks; int32_t limit, old; toUFallbacks=mbcsData->toUFallbacks; - limit=mbcsData->header.countToUFallbacks; + limit=mbcsData->countToUFallbacks; old=(int32_t)toUFallbacks[i].codePoint; /* copy the last fallback entry here to keep the list contiguous */ toUFallbacks[i].offset=toUFallbacks[limit-1].offset; toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint; - mbcsData->header.countToUFallbacks=limit-1; + mbcsData->countToUFallbacks=limit-1; return old; } else { return -1; @@ -615,22 +241,22 @@ removeFallback(MBCSData *mbcsData, uint32_t offset) { * -1 the precision of this mapping is not specified */ static UBool -MBCSAddToUnicode(NewConverter *cnvData, +MBCSAddToUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback) { - MBCSData *mbcsData=(MBCSData *)cnvData; + UChar32 c, + int8_t flag) { + char buffer[10]; uint32_t offset=0; int32_t i=0, entry, old; uint8_t state=0; - if(mbcsData->header.countStates==0) { + if(mbcsData->ucm->states.countStates==0) { fprintf(stderr, "error: there is no state information!\n"); return FALSE; } /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ - if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) { + if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) { state=1; } @@ -640,33 +266,33 @@ MBCSAddToUnicode(NewConverter *cnvData, * We assume that c<=0x10ffff. 
*/ for(i=0;;) { - entry=mbcsData->stateTable[state][bytes[i++]]; + entry=mbcsData->ucm->states.stateTable[state][bytes[i++]]; if(MBCS_ENTRY_IS_TRANSITION(entry)) { if(i==length) { - fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx (U+%x)\n", - state, (unsigned long)b, c); + fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n", + state, printBytes(buffer, bytes, length), c); return FALSE; } state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); } else { if(i0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; case MBCS_STATE_CHANGE_ONLY: - fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; case MBCS_STATE_UNASSIGNED: - fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; case MBCS_STATE_FALLBACK_DIRECT_16: case MBCS_STATE_VALID_DIRECT_16: @@ -679,13 +305,13 @@ MBCSAddToUnicode(NewConverter *cnvData, } else { old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry); } - if(isFallback>=0) { - fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)old); + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)old); return FALSE; } else if(VERBOSE) { - fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)old); + fprintf(stderr, 
"duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)old); } /* * Continue after the above warning @@ -693,7 +319,7 @@ MBCSAddToUnicode(NewConverter *cnvData, */ } /* reassign the correct action code */ - entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(isFallback>0 ? 2 : 0)+(c>=0x10000 ? 1 : 0))); + entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0))); /* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */ if(c<=0xffff) { @@ -701,7 +327,7 @@ MBCSAddToUnicode(NewConverter *cnvData, } else { entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000); } - mbcsData->stateTable[state][bytes[i-1]]=entry; + mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry; break; case MBCS_STATE_VALID_16: /* bits 26..16 are not used, 0 */ @@ -709,21 +335,21 @@ MBCSAddToUnicode(NewConverter *cnvData, offset+=MBCS_ENTRY_FINAL_VALUE_16(entry); /* check that this byte sequence is still unassigned */ if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) { - if(isFallback>=0) { - fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)old); + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)old); return FALSE; } else if(VERBOSE) { - fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)old); + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)old); } } if(c>=0x10000) { - fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n", + 
c, printBytes(buffer, bytes, length)); return FALSE; } - if(isFallback>0) { + if(flag>0) { /* assign only if there is no precise mapping */ if(mbcsData->unicodeCodeUnits[offset]==0xfffe) { return setFallback(mbcsData, offset, c); @@ -747,16 +373,16 @@ MBCSAddToUnicode(NewConverter *cnvData, } else /* old<=0xe001 */ { real=mbcsData->unicodeCodeUnits[offset+1]; } - if(isFallback>=0) { - fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)real); + if(flag>=0) { + fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)real); return FALSE; } else if(VERBOSE) { - fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n", - c, (unsigned long)b, (long)real); + fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n", + c, printBytes(buffer, bytes, length), (long)real); } } - if(isFallback>0) { + if(flag>0) { /* assign only if there is no precise mapping */ if(old<=0xdbff || old==0xe000) { /* do nothing */ @@ -786,8 +412,8 @@ MBCSAddToUnicode(NewConverter *cnvData, break; default: /* reserved, must never occur */ - fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx (U+%x)\n", - (unsigned long)entry, (unsigned long)b, c); + fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02lx: 0x%s (U+%x)\n", + (unsigned long)entry, printBytes(buffer, bytes, length), c); return FALSE; } @@ -799,83 +425,26 @@ MBCSAddToUnicode(NewConverter *cnvData, /* is this byte sequence valid? 
(this is almost the same as MBCSAddToUnicode()) */ static UBool MBCSIsValid(NewConverter *cnvData, - const uint8_t *bytes, int32_t length, - uint32_t b) { + const uint8_t *bytes, int32_t length) { MBCSData *mbcsData=(MBCSData *)cnvData; - uint32_t offset=0; - int32_t i=0, entry; - uint8_t state=0; - if(mbcsData->header.countStates==0) { - fprintf(stderr, "error: there is no state information!\n"); - return FALSE; - } - - /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ - if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) { - state=1; - } - - /* - * Walk down the state table like in conversion, - * much like getNextUChar(). - * We assume that c<=0x10ffff. - */ - for(i=0;;) { - entry=mbcsData->stateTable[state][bytes[i++]]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - if(i==length) { - fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx\n", - state, (unsigned long)b); - return FALSE; - } - state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); - offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); - } else { - if(iucm->states, bytes, length)); } static UBool -MBCSSingleAddFromUnicode(NewConverter *cnvData, +MBCSSingleAddFromUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback) { - MBCSData *mbcsData=(MBCSData *)cnvData; + UChar32 c, + int8_t flag) { uint16_t *p; uint32_t index; uint16_t old; + uint8_t b; + + /* ignore |2 SUB mappings */ + if(flag==2) { + return TRUE; + } /* * Walk down the triple-stage compact array ("trie") and @@ -883,14 +452,14 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData, * Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings. * We assume that length<=maxCharLength and that c<=0x10ffff. 
*/ + b=*bytes; /* inspect stage 1 */ index=c>>10; if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { /* allocate another block in stage 2 */ if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) { - fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", c, b); return FALSE; } @@ -907,8 +476,7 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData, if(mbcsData->stage2Single[index]==0) { /* allocate another block in stage 3 */ if(mbcsData->stage3Top>=0x10000) { - fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", c, b); return FALSE; } /* each block has 16 uint16_t entries */ @@ -920,7 +488,7 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData, /* write the codepage entry into stage 3 and get the previous entry */ p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf); old=*p; - if(isFallback<=0) { + if(flag<=0) { *p=(uint16_t)(0xf00|b); } else if(IS_PRIVATE_USE(c)) { *p=(uint16_t)(0xc00|b); @@ -930,13 +498,13 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData, /* check that this Unicode code point was still unassigned */ if(old>=0x100) { - if(isFallback>=0) { - fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n", - c, (unsigned long)b, old&0xff); + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + c, b, old&0xff); return FALSE; } else if(VERBOSE) { - fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n", - c, (unsigned long)b, old&0xff); + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n", + c, b, old&0xff); } /* continue after the above warning if the precision of the mapping is unspecified */ } @@ -945,21 +513,41 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData, } static UBool 
-MBCSAddFromUnicode(NewConverter *cnvData, +MBCSAddFromUnicode(MBCSData *mbcsData, const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback) { - MBCSData *mbcsData=(MBCSData *)cnvData; + UChar32 c, + int8_t flag) { + char buffer[10]; + const uint8_t *pb; uint8_t *p; - uint32_t index, old; + uint32_t index, b, old; + int32_t maxCharLength; - if( (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO && + /* ignore |2 SUB mappings */ + if(flag==2) { + return TRUE; + } + + maxCharLength=mbcsData->ucm->states.maxCharLength; + + if(maxCharLength==1) { + return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag); + } + + if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO && (*bytes==0xe || *bytes==0xf) ) { - fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; } + + if(flag==1 && length==1 && *bytes==0 && c!=*bytes) { + fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n", + c, *bytes); + return FALSE; + } + /* * Walk down the triple-stage compact array ("trie") and * allocate parts as necessary. 
@@ -973,8 +561,8 @@ MBCSAddFromUnicode(NewConverter *cnvData, if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) { /* allocate another block in stage 2 */ if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) { - fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; } @@ -990,21 +578,38 @@ MBCSAddFromUnicode(NewConverter *cnvData, index=mbcsData->stage1[index]+((c>>4)&0x3f); if(mbcsData->stage2[index]==0) { /* allocate another block in stage 3 */ - if(mbcsData->stage3Top>=0x100000*mbcsData->maxCharLength) { - fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n", - c, (unsigned long)b); + if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) { + fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n", + c, printBytes(buffer, bytes, length)); return FALSE; } /* each block has 16*maxCharLength bytes */ - mbcsData->stage2[index]=(mbcsData->stage3Top/16)/mbcsData->maxCharLength; - uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*mbcsData->maxCharLength); - mbcsData->stage3Top+=16*mbcsData->maxCharLength; + mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength; + uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength); + mbcsData->stage3Top+=16*maxCharLength; } /* write the codepage bytes into stage 3 and get the previous bytes */ + + /* assemble the bytes into a single integer */ + pb=bytes; + b=0; + switch(length) { + case 4: + b=*pb++; + case 3: + b=(b<<8)|*pb++; + case 2: + b=(b<<8)|*pb++; + case 1: + default: + b=(b<<8)|*pb++; + break; + } + old=0; - p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*mbcsData->maxCharLength; - switch(mbcsData->maxCharLength) { + p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength; + switch(maxCharLength) { case 2: 
old=*(uint16_t *)p; *(uint16_t *)p=(uint16_t)b; @@ -1028,338 +633,101 @@ MBCSAddFromUnicode(NewConverter *cnvData, /* check that this Unicode code point was still unassigned */ if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) { - if(isFallback>=0) { - fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n", - c, (unsigned long)b, (unsigned long)old); + if(flag>=0) { + fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02lx\n", + c, printBytes(buffer, bytes, length), (unsigned long)old); return FALSE; } else if(VERBOSE) { - fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n", - c, (unsigned long)b, (unsigned long)old); + fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02lx\n", + c, printBytes(buffer, bytes, length), (unsigned long)old); } /* continue after the above warning if the precision of the mapping is unspecified */ } - if(isFallback<=0) { - /* set the "assigned" flag */ + if(flag<=0) { + /* set the roundtrip flag */ mbcsData->stage2[index]|=(1UL<<(16+(c&0xf))); } return TRUE; } -static int -compareFallbacks(const void *fb1, const void *fb2) { - return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; -} +/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */ +static UBool +MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) { + MBCSData *mbcsData; + UCMapping *m; + UChar32 c; + int32_t i; + UBool isOK; -/* - * This function tries to compact toUnicode tables for 2-byte codepages - * by finding lead bytes with all-unassigned trail bytes and adding another state - * for them. 
- */ -static void -compactToUnicode2(MBCSData *mbcsData) { - int32_t (*oldStateTable)[256]; - uint16_t count[256]; - uint16_t *oldUnicodeCodeUnits; - int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; - int32_t i, j, leadState, trailState, newState, fallback; - uint16_t unit; - - /* find the lead state */ - if((mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) { - /* use the DBCS lead state for SI/SO codepages */ - leadState=1; - } else { - leadState=0; + staticData->unicodeMask=table->unicodeMask; + if(staticData->unicodeMask==3) { + fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n"); + return FALSE; } - /* find the main trail state: the most used target state */ - uprv_memset(count, 0, sizeof(count)); - for(i=0; i<256; ++i) { - entry=mbcsData->stateTable[leadState][i]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; - } - } - trailState=0; - for(i=1; i<(int)mbcsData->header.countStates; ++i) { - if(count[i]>count[trailState]) { - trailState=i; - } + staticData->conversionType=UCNV_MBCS; + + mbcsData=(MBCSData *)cnvData; + + if(!MBCSStartMappings(mbcsData)) { + return FALSE; } - /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ - uprv_memset(count, 0, sizeof(count)); - savings=0; - /* for each lead byte */ - for(i=0; i<256; ++i) { - entry=mbcsData->stateTable[leadState][i]; - if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { - /* the offset is different for each lead byte */ - offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); - /* for each trail byte for this lead byte */ - for(j=0; j<256; ++j) { - entry=mbcsData->stateTable[trailState][j]; - switch(MBCS_ENTRY_FINAL_ACTION(entry)) { - case MBCS_STATE_VALID_16: - entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); - if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) { - ++count[i]; - } else { - j=999; /* do 
not count for this lead byte because there are assignments */ - } - break; - case MBCS_STATE_VALID_16_PAIR: - entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); - if(mbcsData->unicodeCodeUnits[entry]==0xfffe) { - count[i]+=2; - } else { - j=999; /* do not count for this lead byte because there are assignments */ - } - break; - default: - break; - } - } - if(j==256) { - /* all trail bytes for this lead byte are unassigned */ - savings+=count[i]; - } else { - count[i]=0; - } - } - } - /* subtract from the possible savings the cost of an additional state */ - savings=savings*2-1024; /* count bytes, not 16-bit words */ - if(savings<=0) { - return; - } - if(VERBOSE) { - printf("compacting toUnicode data saves %ld bytes\n", (long)savings); - } - if(mbcsData->header.countStates>=MBCS_MAX_STATE_COUNT) { - fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); - return; - } + isOK=TRUE; - /* make a copy of the state table */ - oldStateTable=(int32_t (*)[256])uprv_malloc(mbcsData->header.countStates*1024); - if(oldStateTable==NULL) { - fprintf(stderr, "cannot compact toUnicode: out of memory\n"); - return; - } - uprv_memcpy(oldStateTable, mbcsData->stateTable, mbcsData->header.countStates*1024); + m=table->mappings; + for(i=0; imappingsLength; ++m, ++i) { + c=m->u; - /* add the new state */ - /* - * this function does not catch the degenerate case where all lead bytes - * have all-unassigned trail bytes and the lead state could be removed - */ - newState=mbcsData->header.countStates++; - mbcsData->stateFlags[newState]=0; - /* copy the old trail state, turning all assigned states into unassigned ones */ - for(i=0; i<256; ++i) { - entry=mbcsData->stateTable[trailState][i]; - switch(MBCS_ENTRY_FINAL_ACTION(entry)) { - case MBCS_STATE_VALID_16: - case MBCS_STATE_VALID_16_PAIR: - mbcsData->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + switch(m->f) { + case -1: + /* there was no 
precision/fallback indicator */ + /* fall through to set the mappings */ + case 0: + /* set roundtrip mappings */ + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) && + MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); + break; + case 1: + /* set only a fallback mapping from Unicode to codepage */ + staticData->hasFromUnicodeFallback=TRUE; + isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); + break; + case 2: + /* ignore |2 SUB mappings */ + break; + case 3: + /* set only a fallback mapping from codepage to Unicode */ + staticData->hasToUnicodeFallback=TRUE; + isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f); break; default: - mbcsData->stateTable[newState][i]=entry; - break; + /* will not occur because the parser checked it already */ + fprintf(stderr, "error: illegal fallback indicator %d\n", m->f); + return FALSE; } } - /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ - for(i=0; i<256; ++i) { - if(count[i]>0) { - mbcsData->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(mbcsData->stateTable[leadState][i], newState); - } - } + MBCSPostprocess(mbcsData, staticData); - /* sum up the new state table */ - for(i=0; i<(int)mbcsData->header.countStates; ++i) { - mbcsData->stateFlags[i]&=~MBCS_STATE_FLAG_READY; - } - sum=sumUpStates(mbcsData); - - /* allocate a new, smaller code units array */ - oldUnicodeCodeUnits=mbcsData->unicodeCodeUnits; - if(sum==0) { - mbcsData->unicodeCodeUnits=NULL; - if(oldUnicodeCodeUnits!=NULL) { - uprv_free(oldUnicodeCodeUnits); - } - uprv_free(oldStateTable); - return; - } - mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); - if(mbcsData->unicodeCodeUnits==NULL) { - fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", - (long)sum); - /* revert to the old state table */ - mbcsData->unicodeCodeUnits=oldUnicodeCodeUnits; - --mbcsData->header.countStates; - 
uprv_memcpy(mbcsData->stateTable, oldStateTable, mbcsData->header.countStates*1024); - uprv_free(oldStateTable); - return; - } - for(i=0; iunicodeCodeUnits[i]=0xfffe; - } - - /* copy the code units for all assigned characters */ - /* - * The old state table has the same lead _and_ trail states for assigned characters! - * The differences are in the offsets, and in the trail states for some unassigned characters. - * For each character with an assigned state in the new table, it was assigned in the old one. - * Only still-assigned characters are copied. - * Note that fallback mappings need to get their offset values adjusted. - */ - - /* for each initial state */ - for(leadState=0; leadState<(int)mbcsData->header.countStates; ++leadState) { - if((mbcsData->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { - /* for each lead byte from there */ - for(i=0; i<256; ++i) { - entry=mbcsData->stateTable[leadState][i]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); - /* the new state does not have assigned states */ - if(trailState!=newState) { - trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); - oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); - /* for each trail byte */ - for(j=0; j<256; ++j) { - entry=mbcsData->stateTable[trailState][j]; - /* copy assigned-character code units and adjust fallback offsets */ - switch(MBCS_ENTRY_FINAL_ACTION(entry)) { - case MBCS_STATE_VALID_16: - offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); - /* find the old offset according to the old state table */ - oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); - unit=mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset]; - if(unit==0xfffe && (fallback=findFallback(mbcsData, oldOffset))>=0) { - mbcsData->toUFallbacks[fallback].offset=0x80000000|offset; - } - break; - case MBCS_STATE_VALID_16_PAIR: - offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); - /* find the old 
offset according to the old state table */ - oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); - mbcsData->unicodeCodeUnits[offset++]=oldUnicodeCodeUnits[oldOffset++]; - mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset]; - break; - default: - break; - } - } - } - } - } - } - } - - /* remove temporary flags from fallback offsets that protected them from being modified twice */ - sum=mbcsData->header.countToUFallbacks; - for(i=0; itoUFallbacks[i].offset&=0x7fffffff; - } - - /* free temporary memory */ - uprv_free(oldUnicodeCodeUnits); - uprv_free(oldStateTable); -} - -/* - * recursive sub-function of compactToUnicodeHelper() - * returns: - * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved, - * if all sequences from this state are unassigned, returns the - * <0 there are assignments in unicodeCodeUnits[] - * 0 no use of unicodeCodeUnits[] - */ -static int32_t -findUnassigned(MBCSData *mbcsData, int32_t state, int32_t offset, uint32_t b) { - int32_t i, entry, savings, localSavings, belowSavings; - UBool haveAssigned; - - localSavings=belowSavings=0; - haveAssigned=FALSE; - for(i=0; i<256; ++i) { - entry=mbcsData->stateTable[state][i]; - if(MBCS_ENTRY_IS_TRANSITION(entry)) { - savings=findUnassigned(mbcsData, MBCS_ENTRY_TRANSITION_STATE(entry), offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), (b<<8)|(uint32_t)i); - if(savings<0) { - haveAssigned=TRUE; - } else if(savings>0) { - printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", - (unsigned long)((b<<8)|i), (long)state, (long)savings); - belowSavings+=savings; - } - } else if(!haveAssigned) { - switch(MBCS_ENTRY_FINAL_ACTION(entry)) { - case MBCS_STATE_VALID_16: - entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); - if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) { - localSavings+=2; - } else { - haveAssigned=TRUE; - } - break; - case MBCS_STATE_VALID_16_PAIR: - 
entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); - if(mbcsData->unicodeCodeUnits[entry]==0xfffe) { - localSavings+=4; - } else { - haveAssigned=TRUE; - } - break; - default: - break; - } - } - } - if(haveAssigned) { - return -1; - } else { - return localSavings+belowSavings; - } -} - -/* helper function for finding compaction opportunities */ -static void -compactToUnicodeHelper(MBCSData *mbcsData) { - int32_t state, savings; - - if(!VERBOSE) { - return; - } - - /* for each initial state */ - for(state=0; state<(int)mbcsData->header.countStates; ++state) { - if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { - savings=findUnassigned(mbcsData, state, 0, 0); - if(savings>0) { - printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", - (long)state, (long)savings); - } - } - } + return isOK; } static UBool transformEUC(MBCSData *mbcsData) { uint8_t *p8; - uint32_t i, value, oldLength=mbcsData->maxCharLength, old3Top=mbcsData->stage3Top, new3Top; + uint32_t i, value, oldLength, old3Top, new3Top; uint8_t b; + oldLength=mbcsData->ucm->states.maxCharLength; if(oldLength<3) { return FALSE; } + old3Top=mbcsData->stage3Top; + /* careful: 2-byte and 4-byte codes are stored in platform endianness! 
*/ /* test if all first bytes are in {0, 0x8e, 0x8f} */ @@ -1382,7 +750,7 @@ transformEUC(MBCSData *mbcsData) { p8=mbcsData->fromUBytes; /* modify outputType and adjust stage3Top */ - mbcsData->header.flags=MBCS_OUTPUT_3_EUC+oldLength-3; + mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3); mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength; /* @@ -1608,54 +976,28 @@ compactStage2(MBCSData *mbcsData) { } static void -MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) { - MBCSData *mbcsData=(MBCSData *)cnvData; - int32_t entry; - int state, cell; +MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) { + UCMStates *states; + int32_t maxCharLength; + + states=&mbcsData->ucm->states; + maxCharLength=states->maxCharLength; /* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */ if(VERBOSE) { printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n", - (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength, - (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength); + (unsigned long)mbcsData->stage3Top/maxCharLength, + (unsigned long)mbcsData->stage3Top/maxCharLength); } - /* test each state table entry */ - for(state=0; state<(int)mbcsData->header.countStates; ++state) { - for(cell=0; cell<256; ++cell) { - entry=mbcsData->stateTable[state][cell]; - /* - * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code - * and the code point is "unassigned" (0xfffe), then change it to - * the "unassigned" action code with bits 26..23 set to zero and U+fffe. 
- */ - if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { - mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); - } - } - } - - /* try to compact the toUnicode tables */ - if(mbcsData->maxCharLength==2) { - compactToUnicode2(mbcsData); - } else if(mbcsData->maxCharLength>2) { - compactToUnicodeHelper(mbcsData); - } - - /* sort toUFallbacks */ - /* - * It should be safe to sort them before compactToUnicode2() is called, - * because it should not change the relative order of the offset values - * that it adjusts, but they need to be sorted at some point, and - * it is safest here. - */ - if(mbcsData->header.countToUFallbacks>0) { - qsort(mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks, sizeof(_MBCSToUFallback), compareFallbacks); - } + ucm_optimizeStates(states, + &mbcsData->unicodeCodeUnits, + mbcsData->toUFallbacks, mbcsData->countToUFallbacks, + VERBOSE); /* try to compact the fromUnicode tables */ transformEUC(mbcsData); - if(mbcsData->maxCharLength==1) { + if(maxCharLength==1) { singleCompactStage3(mbcsData); singleCompactStage2(mbcsData); } else { @@ -1664,12 +1006,16 @@ MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) { } static uint32_t -MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData) { +MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType) { MBCSData *mbcsData=(MBCSData *)cnvData; + uint32_t top; int32_t i, stage1Top; + _MBCSHeader header={ 0 }; + /* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */ - if(mbcsData->maxCharLength==1) { + if(mbcsData->ucm->states.maxCharLength==1) { if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) { stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */ } else { @@ -1705,26 +1051,44 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDat 
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3; /* fill the header */ - mbcsData->header.offsetToUCodeUnits= + header.version[0]=4; + header.version[1]=2; + header.countStates=mbcsData->ucm->states.countStates; + header.countToUFallbacks=mbcsData->countToUFallbacks; + + header.offsetToUCodeUnits= sizeof(_MBCSHeader)+ - mbcsData->header.countStates*1024+ - mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback); - mbcsData->header.offsetFromUTable= - mbcsData->header.offsetToUCodeUnits+ - mbcsData->countToUCodeUnits*2; - mbcsData->header.offsetFromUBytes= - mbcsData->header.offsetFromUTable+ + mbcsData->ucm->states.countStates*1024+ + mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback); + header.offsetFromUTable= + header.offsetToUCodeUnits+ + mbcsData->ucm->states.countToUCodeUnits*2; + header.offsetFromUBytes= + header.offsetFromUTable+ stage1Top*2+ mbcsData->stage2Top; - mbcsData->header.fromUBytesLength=mbcsData->stage3Top; + header.fromUBytesLength=mbcsData->stage3Top; + + top=header.offsetFromUBytes+header.fromUBytesLength; + + header.flags=(uint8_t)(mbcsData->ucm->states.outputType); + + if(tableType&TABLE_EXT) { + if(top>0xffffff) { + fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", top); + return 0; + } + + header.flags|=top<<8; + } /* write the MBCS data */ - udata_writeBlock(pData, &mbcsData->header, sizeof(_MBCSHeader)); - udata_writeBlock(pData, mbcsData->stateTable, mbcsData->header.countStates*1024); - udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback)); - udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->countToUCodeUnits*2); + udata_writeBlock(pData, &header, sizeof(_MBCSHeader)); + udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024); + udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback)); + udata_writeBlock(pData, mbcsData->unicodeCodeUnits, 
mbcsData->ucm->states.countToUCodeUnits*2); udata_writeBlock(pData, mbcsData->stage1, stage1Top*2); - if(mbcsData->maxCharLength==1) { + if(mbcsData->ucm->states.maxCharLength==1) { udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top); } else { udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top); @@ -1732,5 +1096,5 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDat udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top); /* return the number of bytes that should have been written */ - return mbcsData->header.offsetFromUBytes+mbcsData->header.fromUBytesLength; + return header.offsetFromUBytes+header.fromUBytesLength; } diff --git a/icu4c/source/tools/makeconv/genmbcs.h b/icu4c/source/tools/makeconv/genmbcs.h index c2ab199492..9313202649 100644 --- a/icu4c/source/tools/makeconv/genmbcs.h +++ b/icu4c/source/tools/makeconv/genmbcs.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000, International Business Machines +* Copyright (C) 2000-2003, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -19,10 +19,27 @@ #include "makeconv.h" -U_CFUNC NewConverter * -MBCSOpen(uint8_t maxCharLength); +enum { + MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */ + MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */ + MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */ + MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */ + MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE, + MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT, -U_CFUNC UBool -MBCSAddState(NewConverter *cnvData, const char *s); + MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */ + MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */ + + MBCS_STAGE_3_BLOCK_SIZE=16, /* 16; 16=1<<4 for 4 bits in stage 3 */ + MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */ + + MBCS_MAX_FALLBACK_COUNT=8192 +}; + +U_CFUNC NewConverter * +MBCSOpen(UCMFile *ucm); + +U_CFUNC NewConverter * +CnvExtOpen(UCMFile *ucm); #endif diff --git a/icu4c/source/tools/makeconv/makeconv.c b/icu4c/source/tools/makeconv/makeconv.c index b5c5d6111a..574327fb86 100644 --- a/icu4c/source/tools/makeconv/makeconv.c +++ b/icu4c/source/tools/makeconv/makeconv.c @@ -30,11 +30,43 @@ #include "unicode/udata.h" #include "unewdata.h" #include "ucmpwrit.h" +#include "ucm.h" #include "makeconv.h" #include "genmbcs.h" #define DEBUG 0 +typedef struct ConvData { + UCMFile *ucm; + NewConverter *cnvData, *extData; + UConverterSharedData sharedData; + UConverterStaticData staticData; +} ConvData; + +static void +initConvData(ConvData *data) { + uprv_memset(data, 0, sizeof(ConvData)); + data->sharedData.structSize=sizeof(UConverterSharedData); + data->staticData.structSize=sizeof(UConverterStaticData); + 
data->sharedData.staticData=&data->staticData; +} + +static void +cleanupConvData(ConvData *data) { + if(data!=NULL) { + if(data->cnvData!=NULL) { + data->cnvData->close(data->cnvData); + data->cnvData=NULL; + } + if(data->extData!=NULL) { + data->extData->close(data->extData); + data->extData=NULL; + } + ucm_close(data->ucm); + data->ucm=NULL; + } +} + /* * from ucnvstat.c - static prototypes of data-based converters */ @@ -46,137 +78,14 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP UBool VERBOSE = FALSE; UBool TOUCHFILE = FALSE; -/*Reads the header of the table file and fills in basic knowledge about the converter - *in "converter" - */ -static void readHeaderFromFile(UConverterSharedData* myConverter, FileStream* convFile, const char* converterName, UErrorCode* err); - -/*Reads the rest of the file, and fills up the shared objects if necessary -Returns the UConverterTable. */ -static void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err); - -/* creates a UConverterSharedData from a mapping file. - * Fills in: *staticData, *table. Converter is NOT otherwise useful. - */ -static UConverterSharedData* createConverterFromTableFile(const char* realName, UErrorCode* err); +static void +createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode); /* * Set up the UNewData and write the converter.. 
*/ -void writeConverterData(UConverterSharedData *mySharedData, const char *cnvName, const char *cnvDir, UErrorCode *status); - -static const char NLTC_SEPARATORS[9] = { '\r', '\n', '\t', ' ', '<', '>' ,'"' , 'U', '\0' }; -static const char FALLBACK_SEPARATOR = '|'; -static const char CODEPOINT_SEPARATORS[8] = { '\r', '>', '\\', 'x', '\n', ' ', '\t', '\0' }; -static const char UNICODE_CODEPOINT_SEPARATORS[6] = { '<', '>', 'U', ' ', '\t', '\0' }; - -static const char * -skipWhitespace(const char *s) { - while(*s==' ' || *s=='\t') { - ++s; - } - return s; -} - -static int32_t -parseCodepageBytes(const char *s, uint32_t *pBytes, const char **pEnd) { - char *end; - int32_t length=0; - uint32_t bytes=0, value; - - while(s[0]=='\\' && s[1]=='x') { - if(length==4) { - return -1; - } - value=uprv_strtoul(s+2, &end, 16); - s+=4; - if(end!=s) { - return -1; - } - bytes=(bytes<<8)|value; - ++length; - } - if(length==0) { - return -1; - } - if(pEnd!=NULL) { - *pEnd=s; - } - *pBytes=bytes; - return length; -} - -/* Remove all characters followed by '#'. There is an exception if there - * is a fallback sign '|' after the comment and the comment does not - * start in column 0. In this case, we just blank from '#' to just - * before the '|' in order to support the fact that IBM official .ucm - * files have the fallback information in comments! - */ -static char * - removeComments (char *line) -{ - char *pound; - - line = (char*)skipWhitespace(line); - pound = uprv_strchr (line, '#'); - if (pound != NULL) - { - char *fallback = pound == line ? 
0 : uprv_strchr(pound + 1, '|'); - if (fallback != NULL) - { - uprv_memset(pound, ' ', fallback-pound); - } - else - { - *pound = '\0'; - } - } - return line; -} - -/* Returns true in c is a in set 'setOfChars', false otherwise - */ -static UBool - isInSet (char c, const char *setOfChars) -{ - uint8_t i = 0; - - while (setOfChars[i] != '\0') - { - if (c == setOfChars[i++]) - return TRUE; - } - - return FALSE; -} - -/* Returns pointer to the next non-whitespace (or non-separator) - */ -static int32_t - nextTokenOffset (const char *line, const char *separators) -{ - int32_t i = 0; - - while (line[i] && isInSet(line[i], separators)) - i++; - - return i; -} - -/* Returns pointer to the next token based on the set of separators - */ -static char * - getToken (char *token, char *line, const char *separators) -{ - int32_t i = nextTokenOffset (line, separators); - int8_t j = 0; - - while (line[i] && (!isInSet(line[i], separators))) - token[j++] = line[i++]; - token[j] = '\0'; - - return line + i; -} +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status); UBool haveCopyright=TRUE; @@ -194,20 +103,27 @@ static UDataInfo dataInfo={ {0, 0, 0, 0} /* dataVersion (calculated at runtime) */ }; -void writeConverterData(UConverterSharedData *mySharedData, - const char *cnvName, - const char *cnvDir, - UErrorCode *status) +static void +writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status) { UNewDataMemory *mem = NULL; uint32_t sz2; uint32_t size = 0; + int32_t tableType; if(U_FAILURE(*status)) { return; } + tableType=TABLE_NONE; + if(data->cnvData!=NULL) { + tableType|=TABLE_BASE; + } + if(data->extData!=NULL) { + tableType|=TABLE_EXT; + } + mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? 
U_COPYRIGHT_STRING : NULL, status); if(U_FAILURE(*status)) @@ -224,11 +140,17 @@ void writeConverterData(UConverterSharedData *mySharedData, fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv"); } + /* all read only, clean, platform independent data. Mmmm. :) */ - udata_writeBlock(mem, mySharedData->staticData, sizeof(UConverterStaticData)); + udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData)); size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */ /* Now, write the table */ - size += ((NewConverter *)mySharedData->table)->write((NewConverter *)mySharedData->table, mySharedData->staticData, mem); + if(tableType&TABLE_BASE) { + size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType); + } + if(tableType&TABLE_EXT) { + size += data->extData->write(data->extData, &data->staticData, mem, tableType); + } sz2 = udata_finish(mem, status); if(size != sz2) @@ -255,7 +177,7 @@ static UOption options[]={ int main(int argc, char* argv[]) { - UConverterSharedData* mySharedData = NULL; + ConvData data; UErrorCode err = U_ZERO_ERROR, localError; char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; char touchFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH]; @@ -420,7 +342,7 @@ int main(int argc, char* argv[]) if(pkgName != NULL) { - /* changes both baename and filename */ + /* changes both basename and filename */ uprv_strcpy(outBasename, pkgName); uprv_strcat(outBasename, "_"); uprv_strcat(outBasename, cnvName); @@ -435,9 +357,10 @@ int main(int argc, char* argv[]) fflush(stdout); #endif localError = U_ZERO_ERROR; - mySharedData = createConverterFromTableFile(arg, &localError); + initConvData(&data); + createConverter(&data, arg, &localError); - if (U_FAILURE(localError) || (mySharedData == NULL)) + if (U_FAILURE(localError)) { /* if an error is found, print out an error msg and keep going */ fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg, @@ -449,21 +372,21 @@ int main(int argc, char* 
argv[]) else { /* Make the static data name equal to the file name */ - if( /*VERBOSE && */ uprv_stricmp(cnvName,mySharedData->staticData->name)) + if( /*VERBOSE && */ uprv_stricmp(cnvName,data.staticData.name)) { fprintf(stderr, "Warning: %s%s claims to be '%s'\n", cnvName, CONVERTER_FILE_EXTENSION, - mySharedData->staticData->name); + data.staticData.name); } - uprv_strcpy((char*)mySharedData->staticData->name, cnvName); + uprv_strcpy((char*)data.staticData.name, cnvName); - if(!uprv_isInvariantString((char*)mySharedData->staticData->name, -1)) { + if(!uprv_isInvariantString((char*)data.staticData.name, -1)) { fprintf(stderr, "Error: A converter name must contain only invariant characters.\n" "%s is not a valid converter name.\n", - mySharedData->staticData->name); + data.staticData.name); if(U_SUCCESS(err)) { err = U_INVALID_TABLE_FORMAT; } @@ -481,8 +404,7 @@ int main(int argc, char* argv[]) } localError = U_ZERO_ERROR; - writeConverterData(mySharedData, cnvNameWithPkg, destdir, &localError); - ((NewConverter *)mySharedData->table)->close((NewConverter *)mySharedData->table); + writeConverterData(&data, cnvNameWithPkg, destdir, &localError); if(TOUCHFILE) { FileStream *q; @@ -505,10 +427,6 @@ int main(int argc, char* argv[]) } } - /* write the information data */ - uprv_free((UConverterStaticData *)mySharedData->staticData); - uprv_free(mySharedData); - if(U_FAILURE(localError)) { /* if an error is found, print out an error msg and keep going*/ @@ -525,6 +443,8 @@ int main(int argc, char* argv[]) } fflush(stdout); fflush(stderr); + + cleanupConvData(&data); } return err; @@ -548,517 +468,312 @@ getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID } } -/*Reads the header of the table file and fills in basic knowledge about the converter in "converter"*/ -void readHeaderFromFile(UConverterSharedData* mySharedData, - FileStream* convFile, - const char* converterName, - UErrorCode *pErrorCode) -{ +static void +readHeader(ConvData *data, 
+ FileStream* convFile, + const char* converterName, + UErrorCode *pErrorCode) { char line[200]; - char *s, *end, *key, *value; + char *s, *key, *value; + const UConverterStaticData *prototype; UConverterStaticData *staticData; - char c; if(U_FAILURE(*pErrorCode)) { return; } - staticData=(UConverterStaticData *)mySharedData->staticData; - staticData->conversionType=UCNV_UNSUPPORTED_CONVERTER; + staticData=&data->staticData; staticData->platform=UCNV_IBM; staticData->subCharLen=0; while(T_FileStream_readLine(convFile, line, sizeof(line))) { - /* remove comments and trailing CR and LF and remove whitespace from the end */ - for(end=line; (c=*end)!=0; ++end) { - if(c=='#' || c=='\r' || c=='\n') { - break; - } - } - while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { - --end; - } - *end=0; - - /* skip leading white space and ignore empty lines */ - s=(char *)skipWhitespace(line); - if(*s==0) { + /* basic parsing and handling of state-related items */ + if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) { continue; } /* stop at the beginning of the mapping section */ - if(uprv_memcmp(s, "CHARMAP", 7)==0) { + if(uprv_strcmp(line, "CHARMAP")==0) { break; } - /* get the key name, bracketed in <> */ - if(*s!='<') { - fprintf(stderr, "error: no header field in line \"%s\"\n", line); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - key=++s; - while(*s!='>') { - if(*s==0) { - fprintf(stderr, "error: incomplete header field in line \"%s\"\n", line); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - ++s; - } - *s=0; - - /* get the value string, possibly quoted */ - s=(char *)skipWhitespace(s+1); - if(*s!='"') { - value=s; - } else { - /* remove the quotes */ - value=s+1; - if(end>value && *(end-1)=='"') { - *--end=0; - } - } - /* collect the information from the header field, ignore unknown keys */ if(uprv_strcmp(key, "code_set_name")==0) { if(*value!=0) { - uprv_strcpy((char*)staticData->name, value); + uprv_strcpy((char *)staticData->name, value); 
getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage); } - } else if(uprv_strcmp(key, "uconv_class")==0) { - const UConverterStaticData *prototype; - - if(uprv_strcmp(value, "DBCS")==0) { - staticData->conversionType=UCNV_DBCS; - } else if(uprv_strcmp(value, "SBCS")==0) { - staticData->conversionType = UCNV_SBCS; - } else if(uprv_strcmp(value, "MBCS")==0) { - staticData->conversionType = UCNV_MBCS; - } else if(uprv_strcmp(value, "EBCDIC_STATEFUL")==0) { - staticData->conversionType = UCNV_EBCDIC_STATEFUL; - } else { - fprintf(stderr, "error: unknown %s\n", value); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - - /* Now that we know the type, copy any 'default' values from the table. */ - prototype=ucnv_converterStaticData[staticData->conversionType]; - if(prototype!=NULL) { - if(staticData->name[0]==0) { - uprv_strcpy((char*)staticData->name, prototype->name); - } - - if(staticData->codepage==0) { - staticData->codepage = prototype->codepage; - } - - if(staticData->platform==0) { - staticData->platform = prototype->platform; - } - - if(staticData->minBytesPerChar==0) { - staticData->minBytesPerChar = prototype->minBytesPerChar; - } - - if(staticData->maxBytesPerChar==0) { - staticData->maxBytesPerChar = prototype->maxBytesPerChar; - } - - if(staticData->subCharLen==0) { - staticData->subCharLen=prototype->subCharLen; - if(prototype->subCharLen>0) { - uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); - } - } - } - } else if(uprv_strcmp(key, "mb_cur_max")==0) { - if('1'<=*value && *value<='4' && value[1]==0) { - staticData->maxBytesPerChar=(int8_t)(*value-'0'); - } else { - fprintf(stderr, "error: illegal %s\n", value); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - } else if(uprv_strcmp(key, "mb_cur_min")==0) { - if('1'<=*value && *value<='4' && value[1]==0) { - staticData->minBytesPerChar=(int8_t)(*value-'0'); - } else { - fprintf(stderr, "error: illegal %s\n", value); - 
*pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } } else if(uprv_strcmp(key, "subchar")==0) { - uint32_t bytes; - int32_t length; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + int8_t length; - length=parseCodepageBytes(value, &bytes, (const char **)&end); - if(length>0 && *end==0) { - staticData->subCharLen=(int8_t)length; - do { - staticData->subChar[--length]=(uint8_t)bytes; - bytes>>=8; - } while(length>0); + s=value; + length=ucm_parseBytes(bytes, line, &s); + if(1<=length && length<=4 && *s==0) { + staticData->subCharLen=length; + uprv_memcpy(staticData->subChar, bytes, length); } else { fprintf(stderr, "error: illegal %s\n", value); *pErrorCode=U_INVALID_TABLE_FORMAT; return; } } else if(uprv_strcmp(key, "subchar1")==0) { - uint32_t bytes; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; - if(1==parseCodepageBytes(value, &bytes, (const char **)&end) && *end==0) { - staticData->subChar1=(uint8_t)bytes; + s=value; + if(1==ucm_parseBytes(bytes, line, &s) && *s==0) { + staticData->subChar1=bytes[0]; } else { fprintf(stderr, "error: illegal %s\n", value); *pErrorCode=U_INVALID_TABLE_FORMAT; return; } - } else if(uprv_strcmp(key, "icu:state")==0) { - /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ - switch(staticData->conversionType) { - case UCNV_SBCS: - case UCNV_DBCS: - case UCNV_EBCDIC_STATEFUL: - staticData->conversionType = UCNV_MBCS; - break; - case UCNV_MBCS: - break; - default: - fprintf(stderr, "error: entry for non-MBCS table or before the line\n"); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - - if(staticData->maxBytesPerChar==0) { - fprintf(stderr, "error: before the line\n"); - *pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } - if(mySharedData->table==NULL) { - mySharedData->table=(UConverterTable *)MBCSOpen(staticData->maxBytesPerChar); - if(mySharedData->table==NULL) { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return; - } - } - if(!MBCSAddState((NewConverter *)mySharedData->table, value)) { - 
*pErrorCode=U_INVALID_TABLE_FORMAT; - return; - } } } + /* copy values from the UCMFile to the static data */ + staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength; + staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength; + staticData->conversionType=data->ucm->states.conversionType; + + /* ### TODO use UCNV_UNSUPPORTED_CONVERTER to indicate an extension-only file? */ + if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type ()\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; - } else if(staticData->conversionType==UCNV_MBCS && mySharedData->table==NULL) { - fprintf(stderr, "error: missing state table information () for MBCS\n"); - *pErrorCode=U_INVALID_TABLE_FORMAT; - } else if(staticData->subChar1!=0 && - !staticData->conversionType==UCNV_MBCS && - !staticData->conversionType==UCNV_EBCDIC_STATEFUL + return; + } + + /* + * Now that we know the type, copy any 'default' values from the table. + * We need not check the type any further because the parser only + * recognizes what we have prototypes for. 
+ */ + prototype=ucnv_converterStaticData[staticData->conversionType]; + if(prototype!=NULL) { + if(staticData->name[0]==0) { + uprv_strcpy((char *)staticData->name, prototype->name); + } + + if(staticData->codepage==0) { + staticData->codepage=prototype->codepage; + } + + if(staticData->platform==0) { + staticData->platform=prototype->platform; + } + + if(staticData->minBytesPerChar==0) { + staticData->minBytesPerChar=prototype->minBytesPerChar; + } + + if(staticData->maxBytesPerChar==0) { + staticData->maxBytesPerChar=prototype->maxBytesPerChar; + } + + if(staticData->subCharLen==0) { + staticData->subCharLen=prototype->subCharLen; + if(prototype->subCharLen>0) { + uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen); + } + } + } + + if(data->ucm->states.outputType<0) { + data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength; + } + + if( staticData->subChar1!=0 && + (staticData->minBytesPerChar>1 || + (staticData->conversionType!=UCNV_MBCS && + staticData->conversionType!=UCNV_EBCDIC_STATEFUL)) ) { fprintf(stderr, "error: defined for a type other than MBCS or EBCDIC_STATEFUL\n"); *pErrorCode=U_INVALID_TABLE_FORMAT; } } -void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err) -{ - char storageLine[200]; - char* line = NULL; - UConverterStaticData *staticData=(UConverterStaticData *)sharedData->staticData; - NewConverter *cnvData = (NewConverter *)sharedData->table; - UChar32 unicodeValue, codepageValue; - uint8_t mbcsBytes[8]; - int32_t mbcsLength; - char codepointBytes[20]; - UBool isOK = TRUE; - uint8_t precisionMask = 0, unicodeMask = 0; - char endOfLine; +static void +readTable(ConvData *data, FileStream* convFile, + UBool forBase, UCMStates *baseStates, + UErrorCode *pErrorCode) { + char line[200]; + char *end; + UBool isOK; + + if(U_FAILURE(*pErrorCode)) { + return; + } - if(cnvData->startMappings!=NULL) - { - if(!cnvData->startMappings(cnvData)) { - *err = U_INVALID_TABLE_FORMAT; 
- return; + isOK=TRUE; + + for(;;) { + /* read the next line */ + if(!T_FileStream_readLine(convFile, line, sizeof(line))) { + fprintf(stderr, "incomplete charmap section\n"); + isOK=FALSE; + break; } - } - if(cnvData->isValid!=NULL) - { - const uint8_t *p = staticData->subChar; - codepageValue = 0; - switch(staticData->subCharLen) { - case 4: codepageValue = (codepageValue << 8) | *p++; - case 3: codepageValue = (codepageValue << 8) | *p++; - case 2: codepageValue = (codepageValue << 8) | *p++; - case 1: codepageValue = (codepageValue << 8) | *p; - default: break; /* must never occur */ + /* remove CR LF */ + end=uprv_strchr(line, 0); + while(lineisValid(cnvData, staticData->subChar, staticData->subCharLen, codepageValue)) { - fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); - *err = U_INVALID_TABLE_FORMAT; - isOK = FALSE; + *end=0; + + /* ignore empty and comment lines */ + if(line[0]==0 || line[0]=='#') { + continue; } - } - staticData->hasFromUnicodeFallback = staticData->hasToUnicodeFallback = FALSE; - - while (T_FileStream_readLine(convFile, storageLine, sizeof(storageLine))) - { - removeComments(storageLine); - line = storageLine; - if (line[nextTokenOffset(line, NLTC_SEPARATORS)] != '\0') - { - /* get the Unicode code point */ - line = getToken(codepointBytes, line, UNICODE_CODEPOINT_SEPARATORS); - if (uprv_strcmp(codepointBytes, "END") == 0) - { - break; - } - unicodeValue = (UChar32)T_CString_stringToInteger(codepointBytes, 16); - - /* get the codepage bytes */ - codepageValue = 0; - mbcsLength = 0; - do - { - line = getToken(codepointBytes, line, CODEPOINT_SEPARATORS); - mbcsBytes[mbcsLength] = (uint8_t)T_CString_stringToInteger(codepointBytes, 16); - codepageValue = codepageValue << 8 | mbcsBytes[mbcsLength++]; - - /* End of line could be \0 or | (if fallback) */ - endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)]; - } while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR)); - - 
if(unicodeValue>=0x10000) { - unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ - } else if(UTF_IS_SURROGATE(unicodeValue)) { - unicodeMask|=UCNV_HAS_SURROGATES; /* there are single surrogates */ - } - - if((uint32_t)unicodeValue > 0x10ffff) - { - fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine); - isOK = FALSE; - } - else if(endOfLine == FALLBACK_SEPARATOR) - { - /* we know that there is a fallback separator */ - precisionMask |= 1; - line = uprv_strchr(line, FALLBACK_SEPARATOR) + 1; - switch(*line) - { - case '0': - /* set roundtrip mappings */ - isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0) && - cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0); - break; - case '1': - /* set only a fallback mapping from Unicode to codepage */ - staticData->hasFromUnicodeFallback = TRUE; - isOK &= cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1); - break; - case '2': - /* skip subchar mappings */ - break; - case '3': - /* set only a fallback mapping from codepage to Unicode */ - staticData->hasToUnicodeFallback = TRUE; - isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1); - break; - default: - fprintf(stderr, "error: illegal fallback indicator '%s' in '%s'\n", line - 1, storageLine); - *err = U_INVALID_TABLE_FORMAT; - break; - } - } - else - { - precisionMask |= 2; - /* set the mappings */ - isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1) && - cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1); - } + /* stop at the end of the mapping table */ + if(0==uprv_strcmp(line, "END CHARMAP")) { + break; } + + isOK&=ucm_addMappingFromLine(data->ucm, line, forBase, baseStates); } - if(unicodeMask == 3) - { - fprintf(stderr, "warning: contains mappings to both supplementary code points 
and single surrogates\n"); - } - staticData->unicodeMask = unicodeMask; - - if(cnvData->finishMappings!=NULL) - { - cnvData->finishMappings(cnvData, staticData); - } - - if(!isOK) - { - *err = U_INVALID_TABLE_FORMAT; - } - else if(precisionMask == 3) - { - fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); - *err = U_INVALID_TABLE_FORMAT; + if(!isOK) { + *pErrorCode=U_INVALID_TABLE_FORMAT; } } -/*creates a UConverterStaticData, fills in necessary links to it the appropriate function pointers*/ -UConverterSharedData* createConverterFromTableFile(const char* converterName, UErrorCode* err) -{ - FileStream* convFile = NULL; - UConverterSharedData* mySharedData = NULL; - UConverterStaticData* myStaticData = NULL; +/* return TRUE if a base table was read, FALSE for an extension table */ +static UBool +readFile(ConvData *data, const char* converterName, + UErrorCode *pErrorCode) { + char line[200]; + char *end; + FileStream *convFile; + UBool dataIsBase; - if (U_FAILURE(*err)) return NULL; - - convFile = T_FileStream_open(converterName, "r"); - if (convFile == NULL) - { - *err = U_FILE_ACCESS_ERROR; - return NULL; + if(U_FAILURE(*pErrorCode)) { + return FALSE; } + data->ucm=ucm_open(); - mySharedData = (UConverterSharedData*) uprv_malloc(sizeof(UConverterSharedData)); - if (mySharedData == NULL) - { - *err = U_MEMORY_ALLOCATION_ERROR; - T_FileStream_close(convFile); - return NULL; + convFile=T_FileStream_open(converterName, "r"); + if(convFile==NULL) { + *pErrorCode=U_FILE_ACCESS_ERROR; + return FALSE; } - uprv_memset(mySharedData, 0, sizeof(UConverterSharedData)); - - mySharedData->structSize = sizeof(UConverterSharedData); - - myStaticData = (UConverterStaticData*) uprv_malloc(sizeof(UConverterStaticData)); - if (myStaticData == NULL) - { - *err = U_MEMORY_ALLOCATION_ERROR; - T_FileStream_close(convFile); - return NULL; + readHeader(data, convFile, converterName, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return FALSE; } - 
uprv_memset(myStaticData, 0, sizeof(UConverterStaticData)); - mySharedData->staticData = myStaticData; - myStaticData->structSize = sizeof(UConverterStaticData); - /* mySharedData->staticDataOwned = FALSE; */ /* not owned if in udata */ - mySharedData->sharedDataCached = FALSE; - mySharedData->dataMemory = NULL; /* for init */ + if(data->ucm->baseName[0]==0) { + dataIsBase=TRUE; + ucm_processStates(&data->ucm->states); - readHeaderFromFile(mySharedData, convFile, converterName, err); - - if (U_FAILURE(*err)) return NULL; - - switch (myStaticData->conversionType) - { - case UCNV_SBCS: - { - /* SBCS: use MBCS data structure with a default state table */ - if(mySharedData->staticData->maxBytesPerChar!=1) { - fprintf(stderr, "error: SBCS codepage with max bytes/char!=1\n"); - *err = U_INVALID_TABLE_FORMAT; - break; + /* read the base table */ + readTable(data, convFile, TRUE, &data->ucm->states, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return FALSE; } - myStaticData->conversionType = UCNV_MBCS; - if(mySharedData->table == NULL) { - NewConverter *sharedDataTable = MBCSOpen(1); - if(sharedDataTable != NULL) { - if(!MBCSAddState(sharedDataTable, "0-ff")) { - *err = U_INVALID_TABLE_FORMAT; - sharedDataTable->close(sharedDataTable); - } else { - mySharedData->table = (UConverterTable *)sharedDataTable; - } - } else { - *err = U_MEMORY_ALLOCATION_ERROR; + + /* read an extension table if there is one */ + while(T_FileStream_readLine(convFile, line, sizeof(line))) { + end=uprv_strchr(line, 0); + while(lineucm->states, pErrorCode); + break; } } - break; - } - case UCNV_MBCS: - { - /* MBCSOpen() was called by readHeaderFromFile() */ - break; - } - case UCNV_EBCDIC_STATEFUL: - { - /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ - if(mySharedData->staticData->maxBytesPerChar!=2) { - fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n"); - *err = U_INVALID_TABLE_FORMAT; - break; - } - myStaticData->conversionType = UCNV_MBCS; - 
if(mySharedData->table == NULL) { - NewConverter *sharedDataTable = MBCSOpen(2); - if(sharedDataTable != NULL) { - if( !MBCSAddState(sharedDataTable, "0-ff, e:1.s, f:0.s") || - !MBCSAddState(sharedDataTable, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4") || - !MBCSAddState(sharedDataTable, "0-40:1.i, 41-fe:1., ff:1.i") || - !MBCSAddState(sharedDataTable, "0-ff:1.i, 40:1.") || - !MBCSAddState(sharedDataTable, "0-ff:1.i") - ) { - *err = U_INVALID_TABLE_FORMAT; - sharedDataTable->close(sharedDataTable); - } else { - mySharedData->table = (UConverterTable *)sharedDataTable; - } - } else { - *err = U_MEMORY_ALLOCATION_ERROR; - } - } - break; - } - case UCNV_DBCS: - { - /* DBCS: use MBCS data structure with a default state table */ - if(mySharedData->staticData->maxBytesPerChar!=2) { - fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n"); - *err = U_INVALID_TABLE_FORMAT; - break; - } - myStaticData->conversionType = UCNV_MBCS; - if(mySharedData->table == NULL) { - NewConverter *sharedDataTable = MBCSOpen(2); - if(sharedDataTable != NULL) { - if( !MBCSAddState(sharedDataTable, "0-3f:3, 40:2, 41-fe:1, ff:3") || - !MBCSAddState(sharedDataTable, "41-fe") || - !MBCSAddState(sharedDataTable, "40") || - !MBCSAddState(sharedDataTable, "") - ) { - *err = U_INVALID_TABLE_FORMAT; - sharedDataTable->close(sharedDataTable); - } else { - mySharedData->table = (UConverterTable *)sharedDataTable; - } - } else { - *err = U_MEMORY_ALLOCATION_ERROR; - } - } - break; - } - - default : - fprintf(stderr, "error: omitted\n"); - *err = U_INVALID_TABLE_FORMAT; - mySharedData->table = NULL; - break; - }; - - if(U_SUCCESS(*err) && mySharedData->table != NULL) - { - loadTableFromFile(convFile, mySharedData, err); + } else { + /* read only the extension table */ + dataIsBase=FALSE; + readTable(data, convFile, FALSE, NULL, pErrorCode); } T_FileStream_close(convFile); - return mySharedData; + if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || 
data->ucm->ext->flagsType==UCM_FLAGS_MIXED) { + fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + + return dataIsBase; +} + +static void +createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode) { + ConvData baseData; + UBool dataIsBase; + + if(U_FAILURE(*pErrorCode)) { + return; + } + + initConvData(data); + + /* ### TODO if there is an extension table: + 1. the base table must use precision flags + 2. check base vs. extension for mappings overlap + */ + dataIsBase=readFile(data, converterName, pErrorCode); + if(U_FAILURE(*pErrorCode)) { + return; + } + + initConvData(&baseData); + + if(dataIsBase) { + data->cnvData=MBCSOpen(data->ucm); + if(data->cnvData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + + } else if(!data->cnvData->isValid(data->cnvData, + data->staticData.subChar, data->staticData.subCharLen) + ) { + fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n"); + *pErrorCode=U_INVALID_TABLE_FORMAT; + + } else if(data->ucm->ext->mappingsLength>0) { + /* prepare the extension table, if there is one */ + data->extData=CnvExtOpen(data->ucm); + if(data->extData==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + + } else if( + !ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, TRUE) || + !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + + /* add the base table after ucm_checkBaseExt()! 
*/ + if( U_SUCCESS(*pErrorCode) && + !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } else { + /* ### TODO assemble a path/filename for data->ucm->states.baseName */ + /* must be TRUE */readFile(&baseData, ""/*extConverterName*/, pErrorCode); + /* ### TODO read extension table */ + /* ### TODO - actually write the mappings into genmbcs or into ext */ + + if( !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) || + !ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, FALSE) || + !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData) + ) { + *pErrorCode=U_INVALID_TABLE_FORMAT; + } + } + + cleanupConvData(&baseData); } /* diff --git a/icu4c/source/tools/makeconv/makeconv.dsp b/icu4c/source/tools/makeconv/makeconv.dsp index c1506d20a9..058d4a14f1 100644 --- a/icu4c/source/tools/makeconv/makeconv.dsp +++ b/icu4c/source/tools/makeconv/makeconv.dsp @@ -183,6 +183,10 @@ SOURCE="$(InputPath)" # PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat" # Begin Source File +SOURCE=.\gencnvex.c +# End Source File +# Begin Source File + SOURCE=.\genmbcs.c # End Source File # Begin Source File diff --git a/icu4c/source/tools/makeconv/makeconv.h b/icu4c/source/tools/makeconv/makeconv.h index cb4825b324..bb6c500432 100644 --- a/icu4c/source/tools/makeconv/makeconv.h +++ b/icu4c/source/tools/makeconv/makeconv.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2000-2001, International Business Machines +* Copyright (C) 2000-2003, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -20,10 +20,19 @@ #include "unicode/utypes.h" #include "ucnv_bld.h" #include "unewdata.h" +#include "ucm.h" /* exports from makeconv.c */ U_CFUNC UBool VERBOSE; +/* converter table type for writing */ +enum { + TABLE_NONE, + TABLE_BASE, + TABLE_EXT, + TABLE_BASE_AND_EXT +}; + /* abstract converter generator struct, C++ - style */ struct NewConverter; typedef struct NewConverter NewConverter; @@ -32,32 +41,17 @@ struct NewConverter { void (*close)(NewConverter *cnvData); - UBool - (*startMappings)(NewConverter *cnvData); - /** is this byte sequence valid? */ UBool (*isValid)(NewConverter *cnvData, - const uint8_t *bytes, int32_t length, - uint32_t b); + const uint8_t *bytes, int32_t length); UBool - (*addToUnicode)(NewConverter *cnvData, - const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback); - - UBool - (*addFromUnicode)(NewConverter *cnvData, - const uint8_t *bytes, int32_t length, - UChar32 c, uint32_t b, - int8_t isFallback); - - void - (*finishMappings)(NewConverter *cnvData, const UConverterStaticData *staticData); + (*addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData); uint32_t - (*write)(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData); + (*write)(NewConverter *cnvData, const UConverterStaticData *staticData, + UNewDataMemory *pData, int32_t tableType); }; #endif diff --git a/icu4c/source/tools/makeconv/makeconv.vcproj b/icu4c/source/tools/makeconv/makeconv.vcproj index 393cf36175..cb9dadb272 100644 --- a/icu4c/source/tools/makeconv/makeconv.vcproj +++ b/icu4c/source/tools/makeconv/makeconv.vcproj @@ -132,6 +132,9 @@ + + diff --git a/icu4c/source/tools/toolutil/Makefile.in b/icu4c/source/tools/toolutil/Makefile.in index b41a9e9582..b47038d5cd 100644 --- a/icu4c/source/tools/toolutil/Makefile.in +++ b/icu4c/source/tools/toolutil/Makefile.in @@ -38,7 +38,7 @@ DYNAMICCXXFLAGS = 
$(SHAREDLIBCXXFLAGS) CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw $(LIBCPPFLAGS) LIBS = $(LIBICUUC) $(DEFAULT_LIBS) -OBJECTS = toolutil.o unewdata.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o +OBJECTS = toolutil.o unewdata.o ucm.o ucmstate.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O)) diff --git a/icu4c/source/tools/toolutil/toolutil.c b/icu4c/source/tools/toolutil/toolutil.c index 900a43a0d0..5118c7876b 100644 --- a/icu4c/source/tools/toolutil/toolutil.c +++ b/icu4c/source/tools/toolutil/toolutil.c @@ -26,6 +26,7 @@ # define NOMCX # include #endif +#include #include "unicode/utypes.h" #include "unicode/putil.h" #include "cmemory.h" @@ -73,3 +74,117 @@ findBasename(const char *filename) { return filename; } } + +/* tool memory helper ------------------------------------------------------- */ + +typedef struct UToolMemory { + char name[64]; + int32_t capacity, maxCapacity, size, index; + void *array; + UAlignedMemory staticArray[1]; +} UToolMemory; + +U_CAPI UToolMemory * U_EXPORT2 +utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) { + UToolMemory *mem; + + if(maxCapacityarray=mem->staticArray; + + uprv_strcpy(mem->name, name); + mem->capacity=initialCapacity; + mem->maxCapacity=maxCapacity; + mem->size=size; + mem->index=0; + return mem; +} + +U_CAPI void U_EXPORT2 +utm_close(UToolMemory *mem) { + if(mem!=NULL) { + if(mem->array!=mem->staticArray) { + uprv_free(mem->array); + } + uprv_free(mem); + } +} + + +U_CAPI void * U_EXPORT2 +utm_getStart(UToolMemory *mem) { + return (char *)mem->array; +} + +U_CAPI int32_t U_EXPORT2 +utm_countItems(UToolMemory *mem) { + return mem->index; +} + + +static UBool +utm_hasCapacity(UToolMemory *mem, int32_t capacity) { + if(mem->capacitymaxCapacityname, (long)mem->maxCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + /* try to allocate a larger array */ + 
if(capacity>=2*mem->capacity) { + newCapacity=capacity; + } else if(mem->capacity<=mem->maxCapacity/3) { + newCapacity=2*mem->capacity; + } else { + newCapacity=mem->maxCapacity; + } + + if(mem->array==mem->staticArray) { + mem->array=uprv_malloc(newCapacity*mem->size); + if(mem->array!=NULL) { + uprv_memcpy(mem->array, mem->staticArray, mem->index*mem->size); + } + } else { + mem->array=uprv_realloc(mem->array, newCapacity*mem->size); + } + + if(mem->array==NULL) { + fprintf(stderr, "error: %s - out of memory\n", mem->name); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + return TRUE; +} + +U_CAPI void * U_EXPORT2 +utm_alloc(UToolMemory *mem) { + char *p=(char *)mem->array+mem->index*mem->size; + int32_t newIndex=mem->index+1; + if(utm_hasCapacity(mem, newIndex)) { + mem->index=newIndex; + uprv_memset(p, 0, mem->size); + } + return p; +} + +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n) { + char *p=(char *)mem->array+mem->index*mem->size; + int32_t newIndex=mem->index+n; + if(utm_hasCapacity(mem, newIndex)) { + mem->index=newIndex; + uprv_memset(p, 0, n*mem->size); + } + return p; +} diff --git a/icu4c/source/tools/toolutil/toolutil.dsp b/icu4c/source/tools/toolutil/toolutil.dsp index 6cc54d8846..78aefa5c4a 100644 --- a/icu4c/source/tools/toolutil/toolutil.dsp +++ b/icu4c/source/tools/toolutil/toolutil.dsp @@ -163,10 +163,18 @@ SOURCE=.\ucbuf.c # End Source File # Begin Source File +SOURCE=.\ucm.c +# End Source File +# Begin Source File + SOURCE=.\ucmpwrit.c # End Source File # Begin Source File +SOURCE=.\ucmstate.c +# End Source File +# Begin Source File + SOURCE=.\unewdata.c # End Source File # Begin Source File @@ -195,6 +203,10 @@ SOURCE=.\ucbuf.h # End Source File # Begin Source File +SOURCE=.\ucm.h +# End Source File +# Begin Source File + SOURCE=.\ucmpwrit.h # End Source File # Begin Source File diff --git a/icu4c/source/tools/toolutil/toolutil.h b/icu4c/source/tools/toolutil/toolutil.h index 2326339668..7cde28e620 100644 --- 
a/icu4c/source/tools/toolutil/toolutil.h +++ b/icu4c/source/tools/toolutil/toolutil.h @@ -20,8 +20,7 @@ #define __TOOLUTIL_H__ #include "unicode/utypes.h" - - +#include "cmemory.h" /* * For Windows, a path/filename may be the short (8.3) version @@ -51,4 +50,55 @@ getLongPathname(const char *pathname); U_CAPI const char * U_EXPORT2 findBasename(const char *filename); +/* + * UToolMemory is used for generic, custom memory management. + * It is allocated with enough space for count*size bytes starting + * at array. + * The array is declared with a union of large data types so + * that its base address is aligned for any types. + * If size is a multiple of a data type size, then such items + * can be safely allocated inside the array, at offsets that + * are themselves multiples of size. + */ +struct UToolMemory; +typedef struct UToolMemory UToolMemory; + +/** + * Open a UToolMemory object for allocation of initialCapacity to maxCapacity + * items with size bytes each. + */ +U_CAPI UToolMemory * U_EXPORT2 +utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size); + +/** + * Close a UToolMemory object. + */ +U_CAPI void U_EXPORT2 +utm_close(UToolMemory *mem); + +/** + * Get the pointer to the beginning of the array of items. + * The pointer becomes invalid after allocation of new items. + */ +U_CAPI void * U_EXPORT2 +utm_getStart(UToolMemory *mem); + +/** + * Get the current number of items. + */ +U_CAPI int32_t U_EXPORT2 +utm_countItems(UToolMemory *mem); + +/** + * Allocate one more item and return the pointer to its start in the array. + */ +U_CAPI void * U_EXPORT2 +utm_alloc(UToolMemory *mem); + +/** + * Allocate n items and return the pointer to the start of the first one in the array. 
+ */ +U_CAPI void * U_EXPORT2 +utm_allocN(UToolMemory *mem, int32_t n); + #endif diff --git a/icu4c/source/tools/toolutil/toolutil.vcproj b/icu4c/source/tools/toolutil/toolutil.vcproj index 2e7ab8a315..f626978006 100644 --- a/icu4c/source/tools/toolutil/toolutil.vcproj +++ b/icu4c/source/tools/toolutil/toolutil.vcproj @@ -136,9 +136,15 @@ + + + + @@ -161,6 +167,9 @@ + + diff --git a/icu4c/source/tools/toolutil/ucm.c b/icu4c/source/tools/toolutil/ucm.c new file mode 100644 index 0000000000..f4acb60d2e --- /dev/null +++ b/icu4c/source/tools/toolutil/ucm.c @@ -0,0 +1,910 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucm.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun20 +* created by: Markus W. Scherer +* +* This file reads a .ucm file, stores its mappings and sorts them. +* It implements handling of Unicode conversion mappings from .ucm files +* for makeconv, canonucm, rptp2ucm, etc. +* +* Unicode code point sequences with a length of more than 1, +* as well as byte sequences with more than 4 bytes or more than one complete +* character sequence are handled to support m:n mappings. 
+*/ + +#include "unicode/utypes.h" +#include "unicode/ustring.h" +#include "cstring.h" +#include "cmemory.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include + +/* -------------------------------------------------------------------------- */ + +/* +### TODO +allow file without fallback indicators for backward compatibility +only for makeconv +must not sort such mappings +disallow when using extension tables because that requires sorting + +rptp2ucm has its own mapping parser and sets all-|1 and |3 mappings; normalization function generates |0 and |2 + +*/ + +static void +printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) { + int32_t j; + + for(j=0; juLen; ++j) { + fprintf(f, "", codePoints[j]); + } + + fputc(' ', f); + + for(j=0; jbLen; ++j) { + fprintf(f, "\\x%02X", bytes[j]); + } + + if(m->f>=0) { + fprintf(f, " |%lu\n", m->f); + } else { + fputs("\n", f); + } +} + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) { + printMapping(m, UCM_GET_CODE_POINTS(table, m), UCM_GET_BYTES(table, m), f); +} + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) { + UCMapping *m; + int32_t i, length; + + m=table->mappings; + length=table->mappingsLength; + if(byUnicode) { + for(i=0; ireverseMap; + for(i=0; iuLen==1 && r->uLen==1) { + /* compare two single code points */ + return l->u-r->u; + } + + /* get pointers to the code point sequences */ + lu=UCM_GET_CODE_POINTS(lTable, l); + ru=UCM_GET_CODE_POINTS(rTable, r); + + /* get the minimum length */ + if(l->uLen<=r->uLen) { + length=l->uLen; + } else { + length=r->uLen; + } + + /* compare the code points */ + for(i=0; iuLen-r->uLen; +} + +static int32_t +compareBytes(UCMTable *lTable, const UCMapping *l, + UCMTable *rTable, const UCMapping *r, + UBool lexical) { + const uint8_t *lb, *rb; + int32_t result, i, length; + + /* + * A lexical comparison is used for sorting in the 
builder, to allow + * an efficient search for a byte sequence that could be a prefix + * of a previously entered byte sequence. + * + * Comparing by lengths first is for compatibility with old .ucm tools + * like canonucm and rptp2ucm. + */ + if(lexical) { + /* get the minimum length and continue */ + if(l->bLen<=r->bLen) { + length=l->bLen; + } else { + length=r->bLen; + } + } else { + /* compare lengths first */ + result=l->bLen-r->bLen; + if(result!=0) { + return result; + } else { + length=l->bLen; + } + } + + /* get pointers to the byte sequences */ + lb=UCM_GET_BYTES(lTable, l); + rb=UCM_GET_BYTES(rTable, r); + + /* compare the bytes */ + for(i=0; ibLen-r->bLen; +} + +/* compare UCMappings for sorting */ +static int32_t +compareMappings(UCMTable *table, const void *left, const void *right, UBool uFirst) { + const UCMapping *l=(const UCMapping *)left, *r=(const UCMapping *)right; + int32_t result; + + /* choose which side to compare first */ + if(uFirst) { + /* Unicode then bytes */ + result=compareUnicode(table, l, table, r); + if(result==0) { + result=compareBytes(table, l, table, r, FALSE); /* not lexically, like canonucm */ + } + } else { + /* bytes then Unicode */ + result=compareBytes(table, l, table, r, TRUE); /* lexically, for builder */ + if(result==0) { + result=compareUnicode(table, l, table, r); + } + } + + if(result!=0) { + return result; + } + + /* compare the flags */ + return l->f-r->f; +} + +/* sorting by Unicode first sorts mappings directly */ +static int32_t +compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) { + return compareMappings((UCMTable *)context, left, right, TRUE); +} + +/* sorting by bytes first sorts the reverseMap; use indirection to mappings */ +static int32_t +compareMappingsBytesFirst(const void *context, const void *left, const void *right) { + UCMTable *table=(UCMTable *)context; + int32_t l=*(const int32_t *)left, r=*(const int32_t *)right; + return compareMappings(table, 
table->mappings+l, table->mappings+r, FALSE); +} + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t) { + UErrorCode errorCode; + int32_t i; + + errorCode=U_ZERO_ERROR; + + /* 1. sort by Unicode first */ + uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping), + compareMappingsUnicodeFirst, t, + FALSE, &errorCode); + + /* build the reverseMap */ + if(t->reverseMap==NULL) { + /* + * allocate mappingsCapacity instead of mappingsLength so that + * if mappings are added, the reverseMap need not be + * reallocated each time + * (see moveMappings() and ucm_addMapping()) + */ + t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t)); + if(t->reverseMap==NULL) { + fprintf(stderr, "ucm error: unable to allocate reverseMap\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + for(i=0; imappingsLength; ++i) { + t->reverseMap[i]=i; + } + + /* 2. sort reverseMap by mappings bytes first */ + uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t), + compareMappingsBytesFirst, t, + FALSE, &errorCode); + + if(U_FAILURE(errorCode)) { + fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n", + u_errorName(errorCode)); + exit(errorCode); + } +} + +/* + +### TODO normalization function for a table (in or for rptp2ucm) +sort table +if there are mappings with the same code points and bytes but |1 and |3, merge them into one |0 (or make |2 where necessary) +if mappings were merged, sort again +-> for rptp2ucm + +*/ + +/* lookups ------------------------------------------------------------------ */ + +/* +### TODO lookups? 
+ +binary search for first mapping with some code point or byte sequence +check if a code point is the first of any mapping (RT or FB) +check if a byte sequence is a prefix of any mapping (RT or RFB) +check if there is a mapping with the same source units; return whether the target is same or different + +*/ + +enum { + MOVE_TO_EXT=0x10, + REMOVE_MAPPING=0x20, + MOVE_ANY=0x30 +}; + +/* + * move mappings with MOVE_ANY ored into their flags from the base table + * to the extension table + */ +static void +moveMappings(UCMTable *base, UCMTable *ext) { + UCMapping *mb, *mbLimit; + int8_t flag; + UBool didMove; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + didMove=FALSE; + + while(mbf; + if(flag&MOVE_ANY) { + /* restore the original flag value */ + mb->f=flag&~MOVE_ANY; + didMove=TRUE; + + if(ext!=NULL && (flag&MOVE_TO_EXT)) { + /* add the mapping to the extension table */ + ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb)); + } + + /* move the last base mapping down and overwrite the current one */ + if(mb<(mbLimit-1)) { + uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping)); + } + --mbLimit; + --base->mappingsLength; + } else { + ++mb; + } + } + + if(didMove) { + ucm_sortTable(base); + ucm_printTable(base, stdout, TRUE); puts(""); /* ### TODO */ + if(ext!=NULL) { + ucm_sortTable(ext); + ucm_printTable(ext, stdout, TRUE); puts(""); /* ### TODO */ + } + } +} + +enum { + NEEDS_MOVE=1, + HAS_ERRORS=2 +}; + +static uint8_t +checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) { + UCMapping *mb, *me, *mbLimit, *meLimit; + int32_t cmp; + uint8_t result; + + mb=base->mappings; + mbLimit=mb+base->mappingsLength; + + me=ext->mappings; + meLimit=me+ext->mappingsLength; + + result=0; + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;;) { + if(mb==mbLimit) { + return result; + } + + if(0<=mb->f && mb->f<=2) { + break; + } + + ++mb; + } + + for(;;) { + if(me==meLimit) { + return result; + } + + if(0<=me->f && 
me->f<=2) { + break; + } + + ++me; + } + + /* compare the base and extension mappings */ + cmp=compareUnicode(base, mb, ext, me); + if(cmp<0) { + /* does mb map from an input sequence that is a prefix of me's? */ + if( mb->uLenuLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->f|=MOVE_TO_EXT; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + } + result|=NEEDS_MOVE; + } + + ++mb; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->bLen==me->bLen && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + me->f|=REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++mb; + } else /* cmp>0 */ { + ++me; + } + } +} + +static uint8_t +checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) { + UCMapping *mb, *me; + int32_t *baseMap, *extMap; + int32_t b, e, bLimit, eLimit, cmp; + uint8_t result; + UBool isSISO; + + baseMap=base->reverseMap; + extMap=ext->reverseMap; + + b=e=0; + bLimit=base->mappingsLength; + eLimit=ext->mappingsLength; + + result=0; + + isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO); + + for(;;) { + /* skip irrelevant mappings on both sides */ + for(;;) { + if(b==bLimit) { + return result; + } + mb=base->mappings+baseMap[b]; + + if(mb->f==0 || mb->f==3) { + break; + } + + ++b; + } + + 
for(;;) { + if(e==eLimit) { + return result; + } + me=ext->mappings+extMap[e]; + + if(me->f==0 || me->f==3) { + break; + } + + ++e; + } + + /* compare the base and extension mappings */ + cmp=compareBytes(base, mb, ext, me, TRUE); + if(cmp<0) { + /* + * does mb map from an input sequence that is a prefix of me's? + * for SI/SO tables, a single byte is never a prefix because it + * occurs in a separate single-byte state + */ + if( mb->bLenbLen && + (!isSISO || mb->bLen>1) && + 0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen) + ) { + if(moveToExt) { + /* mark this mapping to be moved to the extension table */ + mb->f|=MOVE_TO_EXT; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is a prefix of the input sequence of an extension mapping\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + } + + ++b; + } else if(cmp==0) { + /* + * same output: remove the extension mapping, + * otherwise treat as an error + */ + if( mb->f==me->f && mb->uLen==me->uLen && + 0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen) + ) { + me->f|=REMOVE_MAPPING; + result|=NEEDS_MOVE; + } else { + fprintf(stderr, + "ucm error: the base table contains a mapping whose input sequence\n" + " is the same as the input sequence of an extension mapping\n" + " but it maps differently\n"); + ucm_printMapping(base, mb, stderr); + ucm_printMapping(ext, me, stderr); + result|=HAS_ERRORS; + } + + ++b; + } else /* cmp>0 */ { + ++e; + } + } +} + +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *table, UCMStates *baseStates) { + UCMapping *m, *mLimit; + int32_t count; + UBool isOK; + + m=table->mappings; + mLimit=m+table->mappingsLength; + isOK=TRUE; + + while(mbLen); + if(count<1) { + ucm_printMapping(table, m, stderr); + isOK=FALSE; + } + ++m; + } + + return isOK; +} + +U_CAPI UBool U_EXPORT2 
+ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) { + uint8_t result; + + /* if we have an extension table, we must always use precision flags */ + if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) { + fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n"); + return FALSE; + } + + /* checking requires both tables to be sorted */ + ucm_sortTable(base); + ucm_sortTable(ext); + + /* check */ + result= + checkBaseExtUnicode(base, ext, moveToExt)| + checkBaseExtBytes(baseStates, base, ext, moveToExt); + + if(result&HAS_ERRORS) { + return FALSE; + } + + if(result&NEEDS_MOVE) { + moveMappings(ext, NULL); + moveMappings(base, ext); + } + + return TRUE; +} + +/* ucm parser --------------------------------------------------------------- */ + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) { + const char *s=*ps; + char *end; + int8_t bLen; + + bLen=0; + for(;;) { + /* skip an optional plus sign */ + if(bLen>0 && *s=='+') { + ++s; + } + if(*s!='\\') { + break; + } + + if(bLen==UCNV_EXT_MAX_BYTES) { + fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line); + return -1; + } + if( s[1]!='x' || + (bytes[bLen]=(uint8_t)uprv_strtoul(s+2, &end, 16), end)!=s+4 + ) { + fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line); + return -1; + } + ++bLen; + s=end; + } + + *ps=s; + return bLen; +} + +/* parse a mapping line; must not be empty */ +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line) { + const char *s; + char *end; + int32_t u16Length; + int8_t uLen, bLen, f; + + s=line; + uLen=bLen=0; + + /* parse code points */ + for(;;) { + /* skip an optional plus sign */ + if(uLen>0 && *s=='+') { + ++s; + } + if(*s!='<') { + break; + } + + 
if(uLen==UCNV_EXT_MAX_UCHARS) { + fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line); + return FALSE; + } + if( s[1]!='U' || + (codePoints[uLen]=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 || + *end!='>' + ) { + fprintf(stderr, "ucm error: Unicode code point must be formatted as (1..6 hex digits) - \"%s\"\n", line); + return FALSE; + } + if((uint32_t)codePoints[uLen]>0x10ffff || U_IS_SURROGATE(codePoints[uLen])) { + fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line); + return FALSE; + } + ++uLen; + s=end+1; + } + + if(uLen==0) { + fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line); + return FALSE; + } else if(uLen==1) { + m->u=codePoints[0]; + } else { + UErrorCode errorCode=U_ZERO_ERROR; + u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode); + if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) || + u16Length>UCNV_EXT_MAX_UCHARS + ) { + fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line); + return FALSE; + } + } + + s=u_skipWhitespace(s); + + /* parse bytes */ + bLen=ucm_parseBytes(bytes, line, &s); + + if(bLen<0) { + return FALSE; + } else if(bLen==0) { + fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line); + return FALSE; + } else if(bLen<=4) { + uprv_memcpy(m->b.bytes, bytes, bLen); + } + + /* skip everything until the fallback indicator, even the start of a comment */ + for(;;) { + if(*s==0) { + f=-1; /* no fallback indicator */ + break; + } else if(*s=='|') { + f=(int8_t)(s[1]-'0'); + if((uint8_t)f>3) { + fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line); + return FALSE; + } + break; + } + ++s; + } + + m->uLen=uLen; + m->bLen=bLen; + m->f=f; + return TRUE; +} + +/* general APIs ------------------------------------------------------------- */ + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable() { + UCMTable *table=(UCMTable *)uprv_malloc(sizeof(UCMTable)); + if(table==NULL) { + fprintf(stderr, "ucm 
error: unable to allocate a UCMTable\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(table, 0, sizeof(UCMTable)); + return table; +} + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table) { + if(table!=NULL) { + uprv_free(table->mappings); + uprv_free(table->codePoints); + uprv_free(table->bytes); + uprv_free(table->reverseMap); + uprv_free(table); + } +} + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]) { + UCMapping *tm; + UChar32 c; + int32_t index; + + if(table->mappingsLength>=table->mappingsCapacity) { + /* make the mappings array larger */ + if(table->mappingsCapacity==0) { + table->mappingsCapacity=1000; + } else { + table->mappingsCapacity*=10; + } + table->mappings=(UCMapping *)uprv_realloc(table->mappings, + table->mappingsCapacity*sizeof(UCMapping)); + if(table->mappings==NULL) { + fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n", + table->mappingsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + if(table->reverseMap!=NULL) { + /* the reverseMap must be reallocated in a new sort */ + uprv_free(table->reverseMap); + table->reverseMap=NULL; + } + } + + if(m->uLen>1 && table->codePointsCapacity==0) { + table->codePointsCapacity=10000; + table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*4); + if(table->codePoints==NULL) { + fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n", + table->codePointsCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->bLen>4 && table->bytesCapacity==0) { + table->bytesCapacity=10000; + table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity); + if(table->bytes==NULL) { + fprintf(stderr, "ucm error: unable to allocate %d bytes\n", + table->bytesCapacity); + exit(U_MEMORY_ALLOCATION_ERROR); + } + } + + if(m->uLen>1) { + index=table->codePointsLength; + table->codePointsLength+=m->uLen; + if(table->codePointsLength>table->codePointsCapacity) { + fprintf(stderr, 
"ucm error: too many code points in multiple-code point mappings\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->codePoints+index, codePoints, m->uLen*4); + m->u=index; + } + + if(m->bLen>4) { + index=table->bytesLength; + table->bytesLength+=m->bLen; + if(table->bytesLength>table->bytesCapacity) { + fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + uprv_memcpy(table->bytes+index, bytes, m->bLen); + m->b.index=index; + } + + /* set unicodeMask */ + for(index=0; indexuLen; ++index) { + c=codePoints[index]; + if(c>=0x10000) { + table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */ + } else if(U_IS_SURROGATE(c)) { + table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */ + } + } + + /* set flagsType */ + if(m->f<0) { + table->flagsType|=UCM_FLAGS_IMPLICIT; + } else { + table->flagsType|=UCM_FLAGS_EXPLICIT; + } + + tm=table->mappings+table->mappingsLength++; + uprv_memcpy(tm, m, sizeof(UCMapping)); +} + +U_CAPI UCMFile * U_EXPORT2 +ucm_open() { + UCMFile *ucm=(UCMFile *)uprv_malloc(sizeof(UCMFile)); + if(ucm==NULL) { + fprintf(stderr, "ucm error: unable to allocate a UCMFile\n"); + exit(U_MEMORY_ALLOCATION_ERROR); + } + + memset(ucm, 0, sizeof(UCMFile)); + + ucm->base=ucm_openTable(); + ucm->ext=ucm_openTable(); + + ucm->states.stateFlags[0]=MBCS_STATE_FLAG_DIRECT; + ucm->states.conversionType=UCNV_UNSUPPORTED_CONVERTER; + ucm->states.outputType=-1; + ucm->states.minCharLength=ucm->states.maxCharLength=1; + + return ucm; +} + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm) { + if(ucm!=NULL) { + uprv_free(ucm->base); + uprv_free(ucm->ext); + uprv_free(ucm); + } +} + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) { + UCMapping m={ 0 }; + UChar32 codePoints[UCNV_EXT_MAX_UCHARS]; + uint8_t bytes[UCNV_EXT_MAX_BYTES]; + int32_t count; + + 
if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) { + return FALSE; + } + + if(baseStates!=NULL) { + /* check validity of the bytes and count the characters in them */ + count=ucm_countChars(baseStates, bytes, m.bLen); + if(count<1) { + /* illegal byte sequence */ + printMapping(&m, codePoints, bytes, stderr); + return FALSE; + } + } else { + /* not used - adding a mapping for an extension-only table before its base table is read */ + count=0; + } + + /* + * Add the mapping to the base table if this is requested + * and it is a 1:1 mapping. + * Otherwise, add it to the extension table. + * + * Also add |2 SUB mappings for + * and |1 fallbacks from something other than U+0000 to 0x00 + * to the extension table. + */ + if( forBase && m.uLen==1 && count==1 && + !((m.f==2 && m.bLen==1 && ucm->states.maxCharLength>1) || + (m.f==1 && m.bLen==1 && bytes[0]==0 && !(m.uLen==1 && codePoints[0]==0))) + ) { + ucm_addMapping(ucm->base, &m, codePoints, bytes); + return TRUE; + } + + ucm_addMapping(ucm->ext, &m, codePoints, bytes); + return TRUE; +} diff --git a/icu4c/source/tools/toolutil/ucm.h b/icu4c/source/tools/toolutil/ucm.h new file mode 100644 index 0000000000..b58eb343c0 --- /dev/null +++ b/icu4c/source/tools/toolutil/ucm.h @@ -0,0 +1,217 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucm.h +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003jun20 +* created by: Markus W. Scherer +* +* Definitions for the .ucm file parser and handler module ucm.c. 
/*
 * Per-mapping data structure
 *
 * u    if uLen==1: the single Unicode code point
 *      else: index into the owning table's codePoints[] where the
 *            uLen code points of this mapping are stored
 * b    if bLen<=4: the bytes themselves, in b.bytes[]
 *      else: b.index is the index into the owning table's bytes[] where the
 *            bLen bytes of this mapping are stored
 * uLen number of code points
 * bLen number of bytes
 * f    flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3);
 *      same values as in the source file after |
 *      a negative value marks a mapping that had no explicit | flag
 *
 * Use UCM_GET_CODE_POINTS() and UCM_GET_BYTES() to read u and b without
 * having to distinguish the inline and the indexed forms.
 */
typedef struct UCMapping {
    UChar32 u;
    union {
        uint32_t index;
        uint8_t bytes[4];
    } b;
    int8_t uLen, bLen, f;
} UCMapping;
*/ +} UCMTable; + +enum { + MBCS_STATE_FLAG_DIRECT=1, + MBCS_STATE_FLAG_SURROGATES, + + MBCS_STATE_FLAG_READY=16 +}; + +typedef struct UCMStates { + int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; + uint32_t stateFlags[MBCS_MAX_STATE_COUNT], + stateOffsetSum[MBCS_MAX_STATE_COUNT]; + + int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; + int8_t conversionType, outputType; +} UCMStates; + +typedef struct UCMFile { + UCMTable *base, *ext; + UCMStates states; + + char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; +} UCMFile; + +/* simple accesses ---------------------------------------------------------- */ + +#define UCM_GET_CODE_POINTS(t, m) \ + (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) + +#define UCM_GET_BYTES(t, m) \ + (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.index) + +/* APIs --------------------------------------------------------------------- */ + +U_CAPI UCMFile * U_EXPORT2 +ucm_open(void); + +U_CAPI void U_EXPORT2 +ucm_close(UCMFile *ucm); + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue); + +U_CAPI UBool U_EXPORT2 +ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); + + +U_CAPI UCMTable * U_EXPORT2 +ucm_openTable(void); + +U_CAPI void U_EXPORT2 +ucm_closeTable(UCMTable *table); + +U_CAPI void U_EXPORT2 +ucm_sortTable(UCMTable *t); + +/** + * Check the validity of mappings against a base table's states; + * necessary for extension-only tables that were read before their base tables. + */ +U_CAPI UBool U_EXPORT2 +ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); + +/** + * Check a base table against an extension table. + * Set moveToExt=TRUE for where base and extension tables are parsed + * from a single file, + * and moveToExt=FALSE for where the extension table is in a separate file. + * + * For both tables in the same file, the extension table is automatically + * built. 
+ * For separate files, the extension file can use a complete mapping table, + * so that common mappings need not be stripped out manually. + * + * + * Sort both tables, and then for each mapping direction: + * + * If the base table contains a mapping for which the input sequence is + * the same as the extension input, then + * - if the output is the same: remove the extension mapping + * - else: error + * + * If the base table contains a mapping for which the input sequence is + * a prefix of the extension input, then + * - if moveToExt: move the base mapping to the extension table + * - else: error + * + * @return FALSE in case of an irreparable error + */ +U_CAPI UBool U_EXPORT2 +ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt); + +U_CAPI void U_EXPORT2 +ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); + +U_CAPI void U_EXPORT2 +ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); + + +U_CAPI void U_EXPORT2 +ucm_addState(UCMStates *states, const char *s); + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states); + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length); + + +U_CAPI int8_t U_EXPORT2 +ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); + +U_CAPI UBool U_EXPORT2 +ucm_parseMappingLine(UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES], + const char *line); + +U_CAPI void U_EXPORT2 +ucm_addMapping(UCMTable *table, + UCMapping *m, + UChar32 codePoints[UCNV_EXT_MAX_UCHARS], + uint8_t bytes[UCNV_EXT_MAX_BYTES]); + +/* very makeconv-specific functions ----------------------------------------- */ + +/* finalize and optimize states after the toUnicode mappings are processed */ +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose); + +/* moved here because it is 
used inside ucmstate.c */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + uint32_t offset); + +U_CDECL_END + +#endif diff --git a/icu4c/source/tools/toolutil/ucmstate.c b/icu4c/source/tools/toolutil/ucmstate.c new file mode 100644 index 0000000000..ccc43a6f8b --- /dev/null +++ b/icu4c/source/tools/toolutil/ucmstate.c @@ -0,0 +1,1042 @@ +/* +******************************************************************************* +* +* Copyright (C) 2003, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: ucmstate.c +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2003oct09 +* created by: Markus W. Scherer +* +* This file handles ICU .ucm file state information as part of the ucm module. +* Most of this code used to be in makeconv.c. +*/ + +#include "unicode/utypes.h" +#include "cstring.h" +#include "cmemory.h" +#include "uarrsort.h" +#include "ucnvmbcs.h" +#include "ucnv_ext.h" +#include "uparse.h" +#include "ucm.h" +#include + +/* MBCS state handling ------------------------------------------------------ */ + +/* + * state table row grammar (ebnf-style): + * (whitespace is allowed between all tokens) + * + * row=[[firstentry ','] entry (',' entry)*] + * firstentry="initial" | "surrogates" + * (initial state (default for state 0), output is all surrogate pairs) + * entry=range [':' nextstate] ['.' 
action] + * range=number ['-' number] + * nextstate=number + * (0..7f) + * action='u' | 's' | 'p' | 'i' + * (unassigned, state change only, surrogate pair, illegal) + * number=(1- or 2-digit hexadecimal number) + */ +static const char * +parseState(const char *s, int32_t state[256], uint32_t *pFlags) { + const char *t; + uint32_t start, end, i; + int32_t entry; + + /* initialize the state: all illegal with U+ffff */ + for(i=0; i<256; ++i) { + state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); + } + + /* skip leading white space */ + s=u_skipWhitespace(s); + + /* is there an "initial" or "surrogates" directive? */ + if(uprv_strncmp("initial", s, 7)==0) { + *pFlags=MBCS_STATE_FLAG_DIRECT; + s=u_skipWhitespace(s+7); + if(*s++!=',') { + return s-1; + } + } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { + *pFlags=MBCS_STATE_FLAG_SURROGATES; + s=u_skipWhitespace(s+10); + if(*s++!=',') { + return s-1; + } + } else if(*s==0) { + /* empty state row: all-illegal */ + return NULL; + } + + for(;;) { + /* read an entry, the start of the range first */ + s=u_skipWhitespace(s); + start=uprv_strtoul(s, (char **)&t, 16); + if(s==t || 0xffcountStates==MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); + exit(U_INVALID_TABLE_FORMAT); + } + + error=parseState(s, states->stateTable[states->countStates], + &states->stateFlags[states->countStates]); + if(error!=NULL) { + fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); + exit(U_INVALID_TABLE_FORMAT); + } + + ++states->countStates; +} + +U_CAPI UBool U_EXPORT2 +ucm_parseHeaderLine(UCMFile *ucm, + char *line, char **pKey, char **pValue) { + UCMStates *states; + char *s, *end; + char c; + + states=&ucm->states; + + /* remove comments and trailing CR and LF and remove whitespace from the end */ + for(end=line; (c=*end)!=0; ++end) { + if(c=='#' || c=='\r' || c=='\n') { + break; + } + } + while(end>line && (*(end-1)==' ' || 
*(end-1)=='\t')) { + --end; + } + *end=0; + + /* skip leading white space and ignore empty lines */ + s=(char *)u_skipWhitespace(line); + if(*s==0) { + return TRUE; + } + + /* stop at the beginning of the mapping section */ + if(uprv_memcmp(s, "CHARMAP", 7)==0) { + return FALSE; + } + + /* get the key name, bracketed in <> */ + if(*s!='<') { + fprintf(stderr, "ucm error: no header field in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + *pKey=++s; + while(*s!='>') { + if(*s==0) { + fprintf(stderr, "ucm error: incomplete header field in line \"%s\"\n", line); + exit(U_INVALID_TABLE_FORMAT); + } + ++s; + } + *s=0; + + /* get the value string, possibly quoted */ + s=(char *)u_skipWhitespace(s+1); + if(*s!='"') { + *pValue=s; + } else { + /* remove the quotes */ + *pValue=s+1; + if(end>*pValue && *(end-1)=='"') { + *--end=0; + } + } + + /* collect the information from the header field, ignore unknown keys */ + if(uprv_strcmp(*pKey, "uconv_class")==0) { + if(uprv_strcmp(*pValue, "DBCS")==0) { + states->conversionType=UCNV_DBCS; + } else if(uprv_strcmp(*pValue, "SBCS")==0) { + states->conversionType = UCNV_SBCS; + } else if(uprv_strcmp(*pValue, "MBCS")==0) { + states->conversionType = UCNV_MBCS; + } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { + states->conversionType = UCNV_EBCDIC_STATEFUL; + } else { + fprintf(stderr, "ucm error: unknown %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return TRUE; + } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->maxCharLength=(int8_t)(c-'0'); + states->outputType=states->maxCharLength-1; + } else { + fprintf(stderr, "ucm error: illegal %s\n", *pValue); + exit(U_INVALID_TABLE_FORMAT); + } + return TRUE; + } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { + c=**pValue; + if('1'<=c && c<='4' && (*pValue)[1]==0) { + states->minCharLength=(int8_t)(c-'0'); + } else { + fprintf(stderr, "ucm error: illegal %s\n", *pValue); + 
exit(U_INVALID_TABLE_FORMAT); + } + return TRUE; + } else if(uprv_strcmp(*pKey, "icu:state")==0) { + /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ + switch(states->conversionType) { + case UCNV_SBCS: + case UCNV_DBCS: + case UCNV_EBCDIC_STATEFUL: + states->conversionType=UCNV_MBCS; + break; + case UCNV_MBCS: + break; + default: + fprintf(stderr, "ucm error: entry for non-MBCS table or before the line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->maxCharLength==0) { + fprintf(stderr, "ucm error: before the line\n"); + exit(U_INVALID_TABLE_FORMAT); + } + ucm_addState(states, *pValue); + return TRUE; + } else if(uprv_strcmp(*pKey, "icu:base")==0) { + if(**pValue==0) { + fprintf(stderr, "ucm error: without a base table name\n"); + exit(U_INVALID_TABLE_FORMAT); + } + uprv_strcpy(ucm->baseName, *pValue); + return TRUE; + } + + return FALSE; +} + +/* post-processing ---------------------------------------------------------- */ + +static int32_t +sumUpStates(UCMStates *states) { + int32_t entry, sum, state, cell, count; + UBool allStatesReady; + + /* + * Sum up the offsets for all states. + * In each final state (where there are only final entries), + * the offsets add up directly. + * In all other state table rows, for each transition entry to another state, + * the offsets sum of that state needs to be added. + * This is achieved in at most countStates iterations. 
+ */ + allStatesReady=FALSE; + for(count=states->countStates; !allStatesReady && count>=0; --count) { + allStatesReady=TRUE; + for(state=states->countStates-1; state>=0; --state) { + if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { + allStatesReady=FALSE; + sum=0; + + /* at first, add up only the final delta offsets to keep them <512 */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_FINAL(entry)) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=1; + break; + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); + sum+=2; + break; + default: + /* no addition */ + break; + } + } + } + + /* now, add up the delta offsets for the transitional entries */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); + sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } else { + /* that next state does not have a sum yet, we cannot finish the one for this state */ + sum=-1; + break; + } + } + } + + if(sum!=-1) { + states->stateOffsetSum[state]=sum; + states->stateFlags[state]|=MBCS_STATE_FLAG_READY; + } + } + } + } + + if(!allStatesReady) { + fprintf(stderr, "ucm error: the state table contains loops\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* + * For all "direct" (i.e., initial) states>0, + * the offsets need to be increased by the sum of + * the previous initial states. 
+ */ + sum=states->stateOffsetSum[0]; + for(state=1; statecountStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + int32_t sum2=sum; + sum+=states->stateOffsetSum[state]; + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); + } + } + } + } + + /* round up to the next even number to have the following data 32-bit-aligned */ + return states->countToUCodeUnits=(sum+1)&~1; +} + +U_CAPI void U_EXPORT2 +ucm_processStates(UCMStates *states) { + int32_t entry, state, cell, count; + + if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { + fprintf(stderr, "ucm error: missing conversion type ()\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->countStates==0) { + switch(states->conversionType) { + case UCNV_SBCS: + /* SBCS: use MBCS data structure with a default state table */ + if(states->maxCharLength!=1) { + fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff"); + break; + case UCNV_MBCS: + fprintf(stderr, "ucm error: missing state table information () for MBCS\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + case UCNV_EBCDIC_STATEFUL: + /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ + if(states->minCharLength!=1 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType=UCNV_MBCS; + ucm_addState(states, "0-ff, e:1.s, f:0.s"); + ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); + ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); + ucm_addState(states, "0-ff:1.i, 40:1."); + ucm_addState(states, "0-ff:1.i"); + break; + case UCNV_DBCS: + /* DBCS: use MBCS data structure with a default state table */ + 
if(states->minCharLength!=2 || states->maxCharLength!=2) { + fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); + exit(U_INVALID_TABLE_FORMAT); + } + states->conversionType = UCNV_MBCS; + ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); + ucm_addState(states, "41-fe"); + ucm_addState(states, "40"); + ucm_addState(states, ""); + break; + default: + fprintf(stderr, "ucm error: unknown charset structure\n"); + exit(U_INVALID_TABLE_FORMAT); + break; + } + } + + /* + * check that the min/max character lengths are reasonable; + * to do this right, all paths through the state table would have to be + * recursively walked while keeping track of the sequence lengths, + * but these simple checks cover most state tables in practice + */ + if(states->maxCharLengthminCharLength) { + fprintf(stderr, "ucm error: max B/char < min B/char\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + /* count non-direct states and compare with max B/char */ + count=0; + for(state=0; statecountStates; ++state) { + if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + ++count; + } + } + if(states->maxCharLength>count+1) { + fprintf(stderr, "ucm error: max B/char too large\n"); + exit(U_INVALID_TABLE_FORMAT); + } + + if(states->minCharLength==1) { + int32_t action; + + /* + * if there are single-byte characters, + * then the initial state must have direct result states + */ + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[0][cell]; + if( MBCS_ENTRY_IS_FINAL(entry) && + ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || + action==MBCS_STATE_UNASSIGNED) + ) { + break; + } + } + + if(cell==256) { + fprintf(stderr, "ucm warning: min B/char too small\n"); + } + } + + /* + * make sure that all "next state" values are within limits + * and that all next states after final ones have the "direct" + * flag of initial states + */ + for(state=states->countStates-1; state>=0; --state) { + for(cell=0; cell<256; ++cell) { + 
entry=states->stateTable[state][cell]; + if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { + fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", + state, cell, MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", + state, cell, MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", + state, cell, MBCS_ENTRY_STATE(entry)); + exit(U_INVALID_TABLE_FORMAT); + } + } + } + + /* is this an SI/SO (like EBCDIC-stateful) state table? */ + if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { + if(states->maxCharLength!=2) { + fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", states->maxCharLength); + exit(U_INVALID_TABLE_FORMAT); + } + if(states->countStates<3) { + fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", states->countStates); + exit(U_INVALID_TABLE_FORMAT); + } + /* are the SI/SO all in the right places? 
*/ + if( states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && + states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) + ) { + states->outputType=MBCS_OUTPUT_2_SISO; + } else { + fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); + exit(U_INVALID_TABLE_FORMAT); + } + state=2; + } else { + state=1; + } + + /* check that no unexpected state is a "direct" one */ + while(statecountStates) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", state); + exit(U_INVALID_TABLE_FORMAT); + } + ++state; + } + + sumUpStates(states); +} + +/* find a fallback for this offset; return the index or -1 if not found */ +U_CAPI int32_t U_EXPORT2 +ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + uint32_t offset) { + int32_t i; + + if(countToUFallbacks==0) { + /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ + return -1; + } + + /* do a linear search for the fallback mapping (the table is not yet sorted) */ + for(i=0; ioutputType==MBCS_OUTPUT_2_SISO) { + /* use the DBCS lead state for SI/SO codepages */ + leadState=1; + } else { + leadState=0; + } + + /* find the main trail state: the most used target state */ + uprv_memset(count, 0, sizeof(count)); + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; + } + } + trailState=0; + for(i=1; icountStates; ++i) { + if(count[i]>count[trailState]) { + trailState=i; + } + } + + /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ + uprv_memset(count, 0, sizeof(count)); + savings=0; + /* for 
each lead byte */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { + /* the offset is different for each lead byte */ + offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + /* for each trail byte for this lead byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + ++count[i]; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if((*pUnicodeCodeUnits)[entry]==0xfffe) { + count[i]+=2; + } else { + j=999; /* do not count for this lead byte because there are assignments */ + } + break; + default: + break; + } + } + if(j==256) { + /* all trail bytes for this lead byte are unassigned */ + savings+=count[i]; + } else { + count[i]=0; + } + } + } + /* subtract from the possible savings the cost of an additional state */ + savings=savings*2-1024; /* count bytes, not 16-bit words */ + if(savings<=0) { + return; + } + if(verbose) { + printf("compacting toUnicode data saves %ld bytes\n", (long)savings); + } + if(states->countStates>=MBCS_MAX_STATE_COUNT) { + fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); + return; + } + + /* make a copy of the state table */ + oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); + if(oldStateTable==NULL) { + fprintf(stderr, "cannot compact toUnicode: out of memory\n"); + return; + } + uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); + + /* add the new state */ + /* + * this function does not catch the degenerate case where all lead bytes + * have all-unassigned trail bytes and 
the lead state could be removed + */ + newState=states->countStates++; + states->stateFlags[newState]=0; + /* copy the old trail state, turning all assigned states into unassigned ones */ + for(i=0; i<256; ++i) { + entry=states->stateTable[trailState][i]; + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); + break; + default: + states->stateTable[newState][i]=entry; + break; + } + } + + /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ + for(i=0; i<256; ++i) { + if(count[i]>0) { + states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); + } + } + + /* sum up the new state table */ + for(i=0; icountStates; ++i) { + states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; + } + sum=sumUpStates(states); + + /* allocate a new, smaller code units array */ + oldUnicodeCodeUnits=*pUnicodeCodeUnits; + if(sum==0) { + *pUnicodeCodeUnits=NULL; + if(oldUnicodeCodeUnits!=NULL) { + uprv_free(oldUnicodeCodeUnits); + } + uprv_free(oldStateTable); + return; + } + *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); + if(*pUnicodeCodeUnits==NULL) { + fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", + (long)sum); + /* revert to the old state table */ + *pUnicodeCodeUnits=oldUnicodeCodeUnits; + --states->countStates; + uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); + uprv_free(oldStateTable); + return; + } + for(i=0; icountStates; ++leadState) { + if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { + /* for each lead byte from there */ + for(i=0; i<256; ++i) { + entry=states->stateTable[leadState][i]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + /* the new state does not have assigned states */ + 
if(trailState!=newState) { + trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); + oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); + /* for each trail byte */ + for(j=0; j<256; ++j) { + entry=states->stateTable[trailState][j]; + /* copy assigned-character code units and adjust fallback offsets */ + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { + toUFallbacks[fallback].offset=0x80000000|offset; + } + break; + case MBCS_STATE_VALID_16_PAIR: + offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); + /* find the old offset according to the old state table */ + oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); + (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; + (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; + break; + default: + break; + } + } + } + } + } + } + } + + /* remove temporary flags from fallback offsets that protected them from being modified twice */ + for(i=0; i0 number of bytes that are used in unicodeCodeUnits[] that could be saved, + * if all sequences from this state are unassigned, returns the + * <0 there are assignments in unicodeCodeUnits[] + * 0 no use of unicodeCodeUnits[] + */ +static int32_t +findUnassigned(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + int32_t state, int32_t offset, uint32_t b) { + int32_t i, entry, savings, localSavings, belowSavings; + UBool haveAssigned; + + localSavings=belowSavings=0; + haveAssigned=FALSE; + for(i=0; i<256; ++i) { + entry=states->stateTable[state][i]; + 
if(MBCS_ENTRY_IS_TRANSITION(entry)) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + MBCS_ENTRY_TRANSITION_STATE(entry), + offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), + (b<<8)|(uint32_t)i); + if(savings<0) { + haveAssigned=TRUE; + } else if(savings>0) { + printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", + (unsigned long)((b<<8)|i), (long)state, (long)savings); + belowSavings+=savings; + } + } else if(!haveAssigned) { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_VALID_16: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { + localSavings+=2; + } else { + haveAssigned=TRUE; + } + break; + case MBCS_STATE_VALID_16_PAIR: + entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); + if(unicodeCodeUnits[entry]==0xfffe) { + localSavings+=4; + } else { + haveAssigned=TRUE; + } + break; + default: + break; + } + } + } + if(haveAssigned) { + return -1; + } else { + return localSavings+belowSavings; + } +} + +/* helper function for finding compaction opportunities */ +static void +compactToUnicodeHelper(UCMStates *states, + uint16_t *unicodeCodeUnits, + _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { + int32_t state, savings; + + /* for each initial state */ + for(state=0; statecountStates; ++state) { + if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { + savings=findUnassigned(states, + unicodeCodeUnits, + toUFallbacks, countToUFallbacks, + state, 0, 0); + if(savings>0) { + printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", + (long)state, (long)savings); + } + } + } +} + +static int32_t +compareFallbacks(const void *context, const void *fb1, const void *fb2) { + return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; +} + +U_CAPI void U_EXPORT2 +ucm_optimizeStates(UCMStates *states, + uint16_t **pUnicodeCodeUnits, + 
_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, + UBool verbose) { + UErrorCode errorCode; + int32_t state, cell, entry; + + /* test each state table entry */ + for(state=0; statecountStates; ++state) { + for(cell=0; cell<256; ++cell) { + entry=states->stateTable[state][cell]; + /* + * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code + * and the code point is "unassigned" (0xfffe), then change it to + * the "unassigned" action code with bits 26..23 set to zero and U+fffe. + */ + if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { + states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); + } + } + } + + /* try to compact the toUnicode tables */ + if(states->maxCharLength==2) { + compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); + } else if(states->maxCharLength>2) { + if(verbose) { + compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); + } + } + + /* sort toUFallbacks */ + /* + * It should be safe to sort them before compactToUnicode2() is called, + * because it should not change the relative order of the offset values + * that it adjusts, but they need to be sorted at some point, and + * it is safest here. + */ + if(countToUFallbacks>0) { + errorCode=U_ZERO_ERROR; /* nothing bad will happen... 
*/ + uprv_sortArray(toUFallbacks, countToUFallbacks, + sizeof(_MBCSToUFallback), + compareFallbacks, NULL, FALSE, &errorCode); + } +} + +/* use a complete state table ----------------------------------------------- */ + +U_CAPI int32_t U_EXPORT2 +ucm_countChars(UCMStates *states, + const uint8_t *bytes, int32_t length) { + uint32_t offset; + int32_t i, entry, count; + uint8_t state; + + offset=0; + i=count=0; + state=0; + + if(states->countStates==0) { + fprintf(stderr, "ucm error: there is no state information!\n"); + return -1; + } + + /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ + if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { + state=1; + } + + /* + * Walk down the state table like in conversion, + * much like getNextUChar(). + * We assume that c<=0x10ffff. + */ + for(i=0; istateTable[state][bytes[i]]; + if(MBCS_ENTRY_IS_TRANSITION(entry)) { + state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); + offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); + } else { + switch(MBCS_ENTRY_FINAL_ACTION(entry)) { + case MBCS_STATE_ILLEGAL: + fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); + return -1; + case MBCS_STATE_CHANGE_ONLY: + fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); + return -1; + case MBCS_STATE_UNASSIGNED: + case MBCS_STATE_FALLBACK_DIRECT_16: + case MBCS_STATE_VALID_DIRECT_16: + case MBCS_STATE_FALLBACK_DIRECT_20: + case MBCS_STATE_VALID_DIRECT_20: + case MBCS_STATE_VALID_16: + case MBCS_STATE_VALID_16_PAIR: + /* count a complete character and prepare for a new one */ + ++count; + state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); + offset=0; + break; + default: + /* reserved, must never occur */ + fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", entry); + return -1; + } + } + } + + if(offset!=0) { + fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %hu: 0x%02lx\n", state); + return -1; + } + + /* + * for SI/SO 
(like EBCDIC-stateful), multiple-character results + * must consist of only double-byte sequences + */ + if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { + fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", count); + return -1; + } + + return count; +}