diff --git a/icu4c/source/common/Makefile.in b/icu4c/source/common/Makefile.in
index 61196bd839..48c40088ad 100644
--- a/icu4c/source/common/Makefile.in
+++ b/icu4c/source/common/Makefile.in
@@ -61,7 +61,7 @@ OBJECTS = putil.o uobject.o cmemory.o umutex.o \
udata.o ucmndata.o udatamem.o udataswp.o umapfile.o ucol_swp.o \
uresbund.o uresdata.o resbund.o ucat.o locmap.o uloc.o locid.o \
uhash.o uhash_us.o \
-ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_io.o ucnvlat1.o \
+ucnv.o ucnv_bld.o ucnv_cb.o ucnv_cnv.o ucnv_err.o ucnv_ext.o ucnv_io.o ucnvlat1.o \
ucnv_u7.o ucnv_u8.o ucnv_u16.o ucnv_u32.o ucnvscsu.o ucnvbocu.o \
ucnvmbcs.o ucnv2022.o ucnvhz.o ucnv_lmb.o ucnvisci.o \
unistr.o utf_impl.o ustring.o ustrcase.o cstring.o ustrfmt.o ustrtrns.o \
diff --git a/icu4c/source/common/common.dsp b/icu4c/source/common/common.dsp
index 7b66828194..3539840bfa 100644
--- a/icu4c/source/common/common.dsp
+++ b/icu4c/source/common/common.dsp
@@ -1347,6 +1347,14 @@ InputPath=.\unicode\ucnv_err.h
# End Source File
# Begin Source File
+SOURCE=.\ucnv_ext.c
+# End Source File
+# Begin Source File
+
+SOURCE=.\ucnv_ext.h
+# End Source File
+# Begin Source File
+
SOURCE=.\ucnv_imp.h
# End Source File
# Begin Source File
diff --git a/icu4c/source/common/common.vcproj b/icu4c/source/common/common.vcproj
index 7e67cccac0..c72a6d0a73 100644
--- a/icu4c/source/common/common.vcproj
+++ b/icu4c/source/common/common.vcproj
@@ -730,6 +730,12 @@
Outputs="..\..\include\unicode\$(InputName).h"/>
+
+
+
+
diff --git a/icu4c/source/common/ucnv.c b/icu4c/source/common/ucnv.c
index 9ec013109c..48639b2729 100644
--- a/icu4c/source/common/ucnv.c
+++ b/icu4c/source/common/ucnv.c
@@ -608,11 +608,14 @@ static void _reset(UConverter *converter, UConverterResetChoice choice,
converter->mode = 0;
converter->toULength = 0;
converter->invalidCharLength = converter->UCharErrorBufferLength = 0;
+ converter->preToULength = 0;
}
if(choice!=UCNV_RESET_TO_UNICODE) {
converter->fromUnicodeStatus = 0;
converter->fromUChar32 = 0;
converter->invalidUCharLength = converter->charErrorBufferLength = 0;
+ converter->preFromUFirstCP = U_SENTINEL;
+ converter->preFromULength = 0;
}
if (converter->sharedData->impl->reset != NULL) {
@@ -811,6 +814,28 @@ _updateOffsets(int32_t *offsets, int32_t length,
/* ucnv_fromUnicode --------------------------------------------------------- */
+/*
+ * Implementation note for m:n conversions
+ *
+ * While collecting source units to find the longest match for m:n conversion,
+ * some source units may need to be stored for a partial match.
+ * When a second buffer does not yield a match on all of the previously stored
+ * source units, then they must be "replayed", i.e., fed back into the converter.
+ *
+ * The code relies on the fact that replaying will not nest -
+ * converting a replay buffer will not result in a replay.
+ * This is because a replay is necessary only after the _continuation_ of a
+ * partial match failed, but a replay buffer is converted as a whole.
+ * It may result in some of its units being stored again for a partial match,
+ * but there will not be a continuation _during_ the replay which could fail.
+ *
+ * It is conceivable that a callback function could call the converter
+ * recursively in a way that causes another replay to be stored, but that
+ * would be an error in the callback function.
+ * Such violations will cause assertion failures in a debug build,
+ * and wrong output, but they will not cause a crash.
+ */
+
static void
_fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
UConverterFromUnicode fromUnicode;
@@ -822,6 +847,12 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
int32_t errorInputLength;
UBool converterSawEndOfInput, calledCallback;
+ /* variables for m:n conversion */
+ UChar replay[UCNV_EXT_MAX_UCHARS];
+ const UChar *realSource, *realSourceLimit;
+ int32_t realSourceIndex;
+ UBool realFlush;
+
cnv=pArgs->converter;
s=pArgs->source;
t=pArgs->target;
@@ -841,6 +872,29 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
}
}
+ if(cnv->preFromULength>=0) {
+ /* normal mode */
+ realSource=NULL;
+ } else {
+ /*
+ * Previous m:n conversion stored source units from a partial match
+ * and failed to consume all of them.
+ * We need to "replay" them from a temporary buffer and convert them first.
+ */
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preFromULength;
+ pArgs->flush=FALSE;
+ sourceIndex=-1;
+
+ cnv->preFromULength=0;
+ }
+
/*
* loop for conversion and error handling
*
@@ -897,7 +951,36 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
pArgs->offsets=offsets+=length;
}
- sourceIndex+=(int32_t)(pArgs->source-s);
+ if(sourceIndex>=0) {
+ sourceIndex+=(int32_t)(pArgs->source-s);
+ }
+ }
+
+ if(cnv->preFromULength<0) {
+ /*
+ * switch the source to new replay units (cannot occur while replaying)
+ * after offset handling and before end-of-input and callback handling
+ */
+ if(realSource==NULL) {
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preFromU, -cnv->preFromULength*U_SIZEOF_UCHAR);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preFromULength;
+ pArgs->flush=FALSE;
+ if((sourceIndex+=cnv->preFromULength)<0) {
+ sourceIndex=-1;
+ }
+
+ cnv->preFromULength=0;
+ } else {
+ /* see implementation note before _fromUnicodeWithCallback() */
+ U_ASSERT(realSource==NULL);
+ *err=U_INTERNAL_PROGRAM_ERROR;
+ }
}
/* update pointers */
@@ -911,6 +994,15 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* (continue converting by breaking out of only the inner loop)
*/
break;
+ } else if(realSource!=NULL) {
+ /* switch back from replaying to the real source and continue */
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ sourceIndex=realSourceIndex;
+
+ realSource=NULL;
+ break;
} else if(pArgs->flush && cnv->fromUChar32!=0) {
/*
* the entire input stream is consumed
@@ -960,7 +1052,27 @@ _fromUnicodeWithCallback(UConverterFromUnicodeArgs *pArgs, UErrorCode *err) {
* the check for buffer overflow is redundant but it is
* a high-runner case and hopefully documents the intent
* well
+ *
+ * if we were replaying, then the replay buffer must be
+ * copied back into the UConverter
+ * and the real arguments must be restored
*/
+ if(realSource!=NULL) {
+ int32_t length;
+
+ U_ASSERT(cnv->preFromULength==0);
+
+ length=(int32_t)(pArgs->sourceLimit-pArgs->source);
+ if(length>0) {
+ uprv_memcpy(cnv->preFromU, pArgs->source, length*U_SIZEOF_UCHAR);
+ cnv->preFromULength=(int8_t)-length;
+ }
+
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ }
+
return;
}
}
@@ -1079,7 +1191,7 @@ ucnv_fromUnicode(UConverter *cnv,
cnv->charErrorBufferLength=0;
}
- if(!flush && s==sourceLimit) {
+ if(!flush && s==sourceLimit && cnv->preFromULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;
@@ -1122,6 +1234,12 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
int32_t errorInputLength;
UBool converterSawEndOfInput, calledCallback;
+ /* variables for m:n conversion */
+ char replay[UCNV_EXT_MAX_BYTES];
+ const char *realSource, *realSourceLimit;
+ int32_t realSourceIndex;
+ UBool realFlush;
+
cnv=pArgs->converter;
s=pArgs->source;
t=pArgs->target;
@@ -1141,6 +1259,29 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
}
}
+ if(cnv->preToULength>=0) {
+ /* normal mode */
+ realSource=NULL;
+ } else {
+ /*
+ * Previous m:n conversion stored source units from a partial match
+ * and failed to consume all of them.
+ * We need to "replay" them from a temporary buffer and convert them first.
+ */
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preToULength;
+ pArgs->flush=FALSE;
+ sourceIndex=-1;
+
+ cnv->preToULength=0;
+ }
+
/*
* loop for conversion and error handling
*
@@ -1202,7 +1343,36 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
pArgs->offsets=offsets+=length;
}
- sourceIndex+=(int32_t)(pArgs->source-s);
+ if(sourceIndex>=0) {
+ sourceIndex+=(int32_t)(pArgs->source-s);
+ }
+ }
+
+ if(cnv->preToULength<0) {
+ /*
+ * switch the source to new replay units (cannot occur while replaying)
+ * after offset handling and before end-of-input and callback handling
+ */
+ if(realSource==NULL) {
+ realSource=pArgs->source;
+ realSourceLimit=pArgs->sourceLimit;
+ realFlush=pArgs->flush;
+ realSourceIndex=sourceIndex;
+
+ uprv_memcpy(replay, cnv->preToU, -cnv->preToULength);
+ pArgs->source=replay;
+ pArgs->sourceLimit=replay-cnv->preToULength;
+ pArgs->flush=FALSE;
+ if((sourceIndex+=cnv->preToULength)<0) {
+ sourceIndex=-1;
+ }
+
+ cnv->preToULength=0;
+ } else {
+ /* see implementation note before _fromUnicodeWithCallback() */
+ U_ASSERT(realSource==NULL);
+ *err=U_INTERNAL_PROGRAM_ERROR;
+ }
}
/* update pointers */
@@ -1216,6 +1386,15 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
* (continue converting by breaking out of only the inner loop)
*/
break;
+ } else if(realSource!=NULL) {
+ /* switch back from replaying to the real source and continue */
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ sourceIndex=realSourceIndex;
+
+ realSource=NULL;
+ break;
} else if(pArgs->flush && cnv->toULength>0) {
/*
* the entire input stream is consumed
@@ -1265,7 +1444,27 @@ _toUnicodeWithCallback(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
* the check for buffer overflow is redundant but it is
* a high-runner case and hopefully documents the intent
* well
+ *
+ * if we were replaying, then the replay buffer must be
+ * copied back into the UConverter
+ * and the real arguments must be restored
*/
+ if(realSource!=NULL) {
+ int32_t length;
+
+ U_ASSERT(cnv->preToULength==0);
+
+ length=(int32_t)(pArgs->sourceLimit-pArgs->source);
+ if(length>0) {
+ uprv_memcpy(cnv->preToU, pArgs->source, length);
+ cnv->preToULength=(int8_t)-length;
+ }
+
+ pArgs->source=realSource;
+ pArgs->sourceLimit=realSourceLimit;
+ pArgs->flush=realFlush;
+ }
+
return;
}
}
@@ -1379,7 +1578,7 @@ ucnv_toUnicode(UConverter *cnv,
cnv->UCharErrorBufferLength=0;
}
- if(!flush && s==sourceLimit) {
+ if(!flush && s==sourceLimit && cnv->preToULength>=0) {
/* the overflow buffer is emptied and there is no new input: we are done */
*target=t;
return;
diff --git a/icu4c/source/common/ucnv_bld.c b/icu4c/source/common/ucnv_bld.c
index a8999c01fe..b9290f6ff4 100644
--- a/icu4c/source/common/ucnv_bld.c
+++ b/icu4c/source/common/ucnv_bld.c
@@ -776,6 +776,7 @@ ucnv_createConverterFromSharedData(UConverter *myUConverter,
myUConverter->subChar1 = myUConverter->sharedData->staticData->subChar1;
myUConverter->subCharLen = myUConverter->sharedData->staticData->subCharLen;
uprv_memcpy (myUConverter->subChar, myUConverter->sharedData->staticData->subChar, myUConverter->subCharLen);
+ myUConverter->preFromUFirstCP = U_SENTINEL;
if(myUConverter != NULL && myUConverter->sharedData->impl->open != NULL) {
myUConverter->sharedData->impl->open(myUConverter, realName, locale,options, err);
diff --git a/icu4c/source/common/ucnv_bld.h b/icu4c/source/common/ucnv_bld.h
index f52a5ef9ee..4d68e54850 100644
--- a/icu4c/source/common/ucnv_bld.h
+++ b/icu4c/source/common/ucnv_bld.h
@@ -20,6 +20,7 @@
#include "unicode/utypes.h"
#include "unicode/ucnv.h"
#include "unicode/ucnv_err.h"
+#include "ucnv_ext.h"
#include "udataswp.h"
/* size of the overflow buffers in UConverter, enough for escaping callbacks */
@@ -168,12 +169,22 @@ struct UConverter {
int8_t UCharErrorBufferLength; /* number of valid UChars in charErrorBuffer */
uint8_t subChar1; /* single-byte substitution character if different from subChar */
+ UBool useSubChar1;
uint8_t subChar[UCNV_MAX_SUBCHAR_LEN]; /* codepage specific character sequence */
char invalidCharBuffer[UCNV_MAX_CHAR_LEN]; /* bytes from last error/callback situation */
uint8_t charErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* codepage output from Error functions */
UChar invalidUCharBuffer[U16_MAX_LENGTH]; /* UChars from last error/callback situation */
UChar UCharErrorBuffer[UCNV_ERROR_BUFFER_LENGTH]; /* unicode output from Error functions */
+
+ /* fields for conversion extension */
+
+ /* store previous UChars/chars to continue partial matches */
+ UChar32 preFromUFirstCP; /* >=0: partial match */
+ UChar preFromU[UCNV_EXT_MAX_UCHARS];
+ char preToU[UCNV_EXT_MAX_BYTES];
+ int8_t preFromULength, preToULength; /* negative: replay */
+ int8_t preToUFirstLength; /* length of first character */
};
U_CDECL_END /* end of UConverter */
diff --git a/icu4c/source/common/ucnv_cb.c b/icu4c/source/common/ucnv_cb.c
index 9cbf25e607..bfa3eb1b8c 100644
--- a/icu4c/source/common/ucnv_cb.c
+++ b/icu4c/source/common/ucnv_cb.c
@@ -1,6 +1,6 @@
/*
**********************************************************************
-* Copyright (C) 2000-2001, International Business Machines
+* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* ucnv_cb.c:
@@ -35,50 +35,16 @@ ucnv_cbFromUWriteBytes (UConverterFromUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
- int32_t togo;
- int8_t toerr;
- int32_t i;
-
- if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
- {
- uprv_memcpy(args->target, source, length);
- args->target += length;
- if(args->offsets) /* set all the offsets to the same # */
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
+ if(U_FAILURE(*err)) {
+ return;
}
- else
- {
- togo = (int32_t)(args->targetLimit - args->target);
- uprv_memcpy(args->target, source, togo);
- args->target += togo;
-
- if(args->offsets)
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
-
- /* Now, copy the remainder into the errbuff */
- source += togo;
- toerr = (int8_t)(length - togo);
-
- uprv_memcpy(args->converter->charErrorBuffer +
- args->converter->charErrorBufferLength,
- source,
- toerr * sizeof(source[0]));
- args->converter->charErrorBufferLength += toerr;
-
- *err = U_BUFFER_OVERFLOW_ERROR;
-
- }
+ ucnv_fromUWriteBytes(
+ args->converter,
+ source, length,
+ &args->target, args->targetLimit,
+ &args->offsets, offsetIndex,
+ err);
}
U_CAPI void U_EXPORT2
@@ -232,55 +198,16 @@ ucnv_cbToUWriteUChars (UConverterToUnicodeArgs *args,
int32_t offsetIndex,
UErrorCode * err)
{
- int32_t togo;
- int8_t toerr;
- int32_t i;
-
- if(U_FAILURE(*err))
- {
+ if(U_FAILURE(*err)) {
return;
}
-
- if((args->targetLimit - args->target) >= length) /* If the buffer fits.. */
- {
- uprv_memcpy(args->target, source, length * sizeof(args->target[0]) );
- args->target += length;
- if(args->offsets) /* set all the offsets to the same # */
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
- }
- else
- {
- togo = (int32_t)(args->targetLimit - args->target);
-
- uprv_memcpy(args->target, source, togo * sizeof(args->target[0]) );
- args->target += togo;
-
- if(args->offsets)
- {
- for(i=0;ioffsets++) = offsetIndex;
- }
- }
-
- /* Now, copy the remainder into the errbuff */
- source += togo;
- toerr = (int8_t)(length - togo);
-
- uprv_memcpy(args->converter->UCharErrorBuffer +
- args->converter->UCharErrorBufferLength,
- source,
- toerr * sizeof(source[0]));
- args->converter->UCharErrorBufferLength += toerr;
-
- *err = U_BUFFER_OVERFLOW_ERROR;
- }
+ ucnv_toUWriteUChars(
+ args->converter,
+ source, length,
+ &args->target, args->targetLimit,
+ &args->offsets, offsetIndex,
+ err);
}
U_CAPI void U_EXPORT2
diff --git a/icu4c/source/common/ucnv_cnv.c b/icu4c/source/common/ucnv_cnv.c
index f53c336ca2..875ea1cdbe 100644
--- a/icu4c/source/common/ucnv_cnv.c
+++ b/icu4c/source/common/ucnv_cnv.c
@@ -79,6 +79,46 @@ ucnv_fromUWriteBytes(UConverter *cnv,
}
}
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+ const UChar *uchars, int32_t length,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode) {
+ UChar *t=*target;
+ int32_t *o;
+
+ /* write UChars; this first branch is taken when offsets or *offsets is NULL */
+ if(offsets==NULL || (o=*offsets)==NULL) {
+ while(length>0 && t0 && t0) {
+ if(cnv!=NULL) {
+ t=cnv->UCharErrorBuffer; /* the remaining length UChars are copied into the converter's UChar error buffer */
+ cnv->UCharErrorBufferLength=(int8_t)length;
+ do {
+ *t++=*uchars++;
+ } while(--length>0);
+ }
+ *pErrorCode=U_BUFFER_OVERFLOW_ERROR; /* set even when cnv==NULL, i.e. when the remainder was not stored anywhere */
+ }
+}
+
U_CFUNC void
ucnv_toUWriteCodePoint(UConverter *cnv,
UChar32 c,
diff --git a/icu4c/source/common/ucnv_cnv.h b/icu4c/source/common/ucnv_cnv.h
index 5b948f4fbb..3de6b49bce 100644
--- a/icu4c/source/common/ucnv_cnv.h
+++ b/icu4c/source/common/ucnv_cnv.h
@@ -251,6 +251,13 @@ ucnv_fromUWriteBytes(UConverter *cnv,
int32_t **offsets,
int32_t sourceIndex,
UErrorCode *pErrorCode);
+U_CFUNC void
+ucnv_toUWriteUChars(UConverter *cnv,
+ const UChar *uchars, int32_t length,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets,
+ int32_t sourceIndex,
+ UErrorCode *pErrorCode);
U_CFUNC void
ucnv_toUWriteCodePoint(UConverter *cnv,
diff --git a/icu4c/source/common/ucnv_ext.c b/icu4c/source/common/ucnv_ext.c
new file mode 100644
index 0000000000..e706e20945
--- /dev/null
+++ b/icu4c/source/common/ucnv_ext.c
@@ -0,0 +1,921 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: ucnv_ext.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun13
+* created by: Markus W. Scherer
+*
+* Conversion extensions
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_LEGACY_CONVERSION
+
+#include "ucnv_bld.h"
+#include "ucnv_cnv.h"
+#include "ucnv_ext.h"
+#include "cmemory.h"
+
+/*
+ * ### TODO
+ *
+ * implement getUnicodeSet for the extension table
+ * implement data swapping for it
+ */
+
+/*
+ * ### TODO: probably need pointer to baseTableSharedData
+ * and also copy the base table's pointers for the base table arrays etc.
+ * into this sharedData
+ */
+
+/* to Unicode --------------------------------------------------------------- */
+
+/*
+ * @return lookup value for the byte, if found; else 0
+ */
+static U_INLINE uint32_t
+ucnv_extFindToU(const uint32_t *toUSection, int32_t length, uint8_t byte) {
+ uint32_t word;
+ int32_t i, start, limit;
+
+ /* check the input byte against the lowest and highest section bytes */
+ start=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[0]);
+ limit=(int32_t)UCNV_EXT_TO_U_GET_BYTE(toUSection[length-1]);
+ if(byte=toUSection[start]) {
+ break;
+ }
+ if(++start=toUSection[start]) {
+ break;
+ }
+ if(++start=toUSection[start]) {
+ break;
+ }
+ /* always break at start==limit-1 */
+ ++start;
+ break;
+ }
+
+ i=(start+limit)/2;
+ if(wordUCNV_EXT_MAX_BYTES) {
+ /*
+ * end of the entire input stream, stop with the longest match so far
+ * or: partial match must not be longer than UCNV_EXT_MAX_BYTES
+ * because it must fit into state buffers
+ */
+ break;
+ } else {
+ /* continue with more input next time */
+ return -length;
+ }
+ }
+
+ /* search for the current UChar */
+ value=ucnv_extFindToU(toUSection, length, b);
+ if(value==0) {
+ /* no match here, stop with the longest match so far */
+ break;
+ } else {
+ if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
+ /* partial match, continue */
+ index=(int32_t)UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value);
+ } else {
+ if( UCNV_EXT_TO_U_IS_ROUNDTRIP(value) ||
+ TO_U_USE_FALLBACK(useFallback)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=i+j;
+ } else {
+ /* full match on fallback not taken, stop with the longest match so far */
+ }
+ break;
+ }
+ }
+ }
+
+ if(matchLength==0) {
+ /* no match at all */
+ return 0;
+ }
+
+ /* return result */
+ matchValue=UCNV_EXT_TO_U_MASK_ROUNDTRIP(matchValue);
+ if(UCNV_EXT_TO_U_IS_CODE_POINT(matchValue)) {
+ *pResultLength=-(int32_t)matchValue;
+ } else {
+ *pResultLength=UCNV_EXT_TO_U_GET_LENGTH(matchValue);
+ *pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_TO_U_UCHARS_INDEX, UChar)+UCNV_EXT_TO_U_GET_INDEX(matchValue);
+ }
+
+ return matchLength;
+}
+
+static U_INLINE void
+ucnv_extWriteToU(UConverter *cnv,
+ const UChar *result, int32_t resultLength,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ /* output the result; the sign of resultLength selects between the two result encodings */
+ if(resultLength<0) {
+ /* negative: the code point is derived from -resultLength, result itself is not read */
+ ucnv_toUWriteCodePoint(
+ cnv, UCNV_EXT_TO_U_GET_CODE_POINT(-resultLength),
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ } else {
+ /* output resultLength UChars starting at result - with correct data we have resultLength>0 */
+ ucnv_toUWriteUChars(
+ cnv,
+ result, resultLength,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ }
+}
+
+/*
+ * targettoUBytes, firstLength,
+ *src, (int32_t)(srcLimit-*src),
+ &result, &resultLength,
+ cnv->useFallback, flush);
+ if(match>0) {
+ /* advance src pointer for the consumed input */
+ *src+=match-firstLength;
+
+ /* write result to target */
+ ucnv_extWriteToU(cnv,
+ result, resultLength,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+ return TRUE;
+ } else if(match<0) {
+ /* save state for partial match */
+ const char *s;
+ int32_t j;
+
+ /* copy the first code point */
+ s=(const char *)cnv->toUBytes;
+ cnv->preToUFirstLength=(int8_t)firstLength;
+ for(j=0; jpreToU[j]=*s++;
+ }
+
+ /* now copy the newly consumed input */
+ s=*src;
+ match=-match;
+ for(; jpreToU[j]=*s++;
+ }
+ *src=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preToULength=(int8_t)match;
+ return TRUE;
+ } else /* match==0 no match */ {
+ return FALSE;
+ }
+}
+
+#if 0
+/* ### TODO: disabled draft - NOTE(review): it passes a code point, ten arguments and a uint8_t result pointer to ucnv_extMatchToU(), unlike the nine-argument call with a UChar result in ucnv_extContinueMatchToU(); confirm the intended signature before enabling */
+
+U_CFUNC int32_t
+ucnv_extSimpleMatchToU(const int32_t *cx,
+ UChar32 cp, uint32_t *pValue,
+ UBool useFallback,
+ UErrorCode *pErrorCode) {
+ const uint8_t *result;
+ int32_t resultLength, match;
+
+ /* try to match; flush is passed as TRUE and no pre/src buffers are supplied */
+ match=ucnv_extMatchToU(cx,
+ cp,
+ NULL, 0,
+ NULL, 0,
+ &result, &resultLength,
+ useFallback, TRUE);
+ if(match>=2) {
+ /* write result for simple, single-character conversion */
+ if(resultLength<0) {
+ resultLength=-resultLength;
+ *pValue=(uint32_t)UCNV_EXT_TO_U_GET_DATA(resultLength);
+ return UCNV_EXT_TO_U_GET_LENGTH(resultLength);
+ } else if(resultLength==4) {
+ /* de-serialize a 4-byte result, most significant byte first */
+ *pValue=
+ ((uint32_t)result[0]<<24)|
+ ((uint32_t)result[1]<<16)|
+ ((uint32_t)result[2]<<8)|
+ result[3];
+ return 4;
+ }
+ }
+
+ /*
+ * return no match because
+ * - match>1 && resultLength>4: result too long for simple conversion
+ * - match==1: no match found, subchar1 preferred (presumably, as in the fromU direction)
+ * - match==0: no match found in the first place
+ * - match<0: partial match, not supported for simple conversion (and flush==TRUE)
+ */
+ return 0;
+}
+
+#endif
+
+/*
+ * continue partial match with new input
+ * never called for simple, single-character conversion
+ */
+U_CFUNC void
+ucnv_extContinueMatchToU(UConverter *cnv,
+ UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ const UChar *result;
+ int32_t resultLength, match, length; /* length: number of stored bytes left over for replay in the no-match case */
+
+ match=ucnv_extMatchToU(cnv->sharedData->table->mbcs.extIndexes,
+ cnv->preToU, cnv->preToULength,
+ pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
+ &result, &resultLength,
+ cnv->useFallback, pArgs->flush); /* match: >0 full match, <0 negated partial match length, 0 no match - see the three branches below */
+ if(match>0) {
+ if(match>=cnv->preToULength) {
+ /* advance src pointer for the consumed input (only the part of the match beyond the stored bytes) */
+ pArgs->source+=match-cnv->preToULength;
+ cnv->preToULength=0;
+ } else {
+ /* the match did not use all of preToU[] - keep the rest for replay (this block-local length shadows the function-level one) */
+ int32_t length=cnv->preToULength-match;
+ uprv_memmove(cnv->preToU, cnv->preToU+match, length);
+ cnv->preToULength=(int8_t)-length; /* a negative length marks preToU[] for replay */
+ }
+
+ /* write result */
+ ucnv_extWriteToU(cnv,
+ result, resultLength,
+ &pArgs->target, pArgs->targetLimit,
+ &pArgs->offsets, srcIndex,
+ pErrorCode);
+ } else if(match<0) {
+ /* save state for partial match */
+ const char *s;
+ int32_t j;
+
+ /* just _append_ the newly consumed input to preToU[] */
+ s=pArgs->source;
+ match=-match;
+ for(j=cnv->preToULength; jpreToU[j]=*s++;
+ }
+ pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
+ cnv->preToULength=(int8_t)match;
+ } else /* match==0 */ {
+ /*
+ * no match
+ *
+ * We need to split the previous input into two parts:
+ *
+ * 1. The first codepage character is unmappable - that's how we got into
+ * trying the extension data in the first place.
+ * We need to move it from the preToU buffer
+ * to the error buffer, set an error code,
+ * and prepare the rest of the previous input for 2.
+ *
+ * 2. The rest of the previous input must be converted once we
+ * come back from the callback for the first character.
+ * At that time, we have to try again from scratch to convert
+ * these input characters.
+ * The replay will be handled by the ucnv.c conversion code.
+ */
+
+ /* move the first codepage character to the error field */
+ uprv_memcpy(cnv->toUBytes, cnv->preToU, cnv->preToUFirstLength);
+ cnv->toULength=cnv->preToUFirstLength;
+
+ /* move the rest up inside the buffer */
+ length=cnv->preToULength-cnv->preToUFirstLength;
+ if(length>0) {
+ uprv_memmove(cnv->preToU, cnv->preToU+cnv->preToUFirstLength, length);
+ }
+
+ /* mark preToU for replay */
+ cnv->preToULength=(int8_t)-length;
+
+ /* set the error code for unassigned */
+ *pErrorCode=U_INVALID_CHAR_FOUND;
+ }
+}
+
+/* from Unicode ------------------------------------------------------------- */
+
+/*
+ * @return index of the UChar, if found; else <0
+ */
+static U_INLINE int32_t
+ucnv_extFindFromU(const UChar *fromUSection, int32_t length, UChar u) {
+ int32_t i, start, limit;
+
+ /* binary search */
+ start=0;
+ limit=length;
+ for(;;) {
+ i=limit-start;
+ if(i<=1) {
+ break; /* done */
+ }
+ /* start=fromUSection[start]) {
+ break;
+ }
+ if(++start=fromUSection[start]) {
+ break;
+ }
+ if(++start=fromUSection[start]) {
+ break;
+ }
+ /* always break at start==limit-1 */
+ ++start;
+ break;
+ }
+
+ i=(start+limit)/2;
+ if(u=0
+ * @param src UChars that can be used to complete a match
+ * @param srcLength length of src, >=0
+ * @param pResult [out] address of pointer to result bytes
+ * set only in case of a match
+ * @param pResultLength [out] address of result length variable;
+ * gets a negative value if the length variable
+ * itself contains the length and bytes, encoded in
+ * the format of fromUTableValues[] and then inverted
+ * @param useFallback "use fallback" flag, usually from cnv->useFallback
+ * @param flush TRUE if the end of the input stream is reached
+ * @return >1: matched, return value=total match length (number of input units matched)
+ * 1: matched, no mapping but request for subchar1
+ * (only for the first code point)
+ * 0: no match
+ * <0: partial match, return value=negative total match length
+ * (partial matches are never returned for flush==TRUE)
+ * (partial matches are never returned as being longer than UCNV_EXT_MAX_UCHARS)
+ * the matchLength is 2 if only firstCP matched, and >2 if firstCP and
+ * further code units matched
+ */
+static int32_t
+ucnv_extMatchFromU(const int32_t *cx,
+ UChar32 firstCP,
+ const UChar *pre, int32_t preLength,
+ const UChar *src, int32_t srcLength,
+ const uint8_t **pResult, int32_t *pResultLength,
+ UBool useFallback, UBool flush) {
+ const uint16_t *stage12, *stage3;
+ const uint32_t *stage3b;
+
+ const UChar *fromUTableUChars, *fromUSectionUChars;
+ const uint32_t *fromUTableValues, *fromUSectionValues;
+
+ uint32_t value, matchValue;
+ int32_t i, j, index, length, matchLength;
+ UChar c;
+
+ if(cx==NULL) {
+ return 0; /* no extension data, no match */
+ }
+
+ /* trie lookup of firstCP */
+ index=firstCP>>10; /* stage 1 index */
+ if(index>=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH]) {
+ return 0; /* the first code point is outside the trie */
+ }
+
+ stage12=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_12_INDEX, uint16_t);
+ stage3=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3_INDEX, uint16_t);
+ index=UCNV_EXT_FROM_U(stage12, stage3, index, firstCP);
+
+ stage3b=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_STAGE_3B_INDEX, uint32_t);
+ value=stage3b[index];
+ if(value==0) {
+ return 0;
+ }
+
+ if(UCNV_EXT_TO_U_IS_PARTIAL(value)) {
+ /* partial match, enter the loop below - NOTE(review): this is the TO_U partial test applied to a fromU value, while the loop below uses UCNV_EXT_FROM_U_IS_PARTIAL(); confirm the two macros agree for stage 3b values */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+
+ /* initialize */
+ fromUTableUChars=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_UCHARS_INDEX, UChar);
+ fromUTableValues=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_VALUES_INDEX, uint32_t);
+
+ matchValue=0;
+ i=j=matchLength=0;
+
+ /* we must not remember fallback matches when not using fallbacks */
+
+ /* match input units until there is a full match or the input is consumed */
+ for(;;) {
+ /* go to the next section */
+ fromUSectionUChars=fromUTableUChars+index;
+ fromUSectionValues=fromUTableValues+index;
+
+ /* read first pair of the section */
+ length=*fromUSectionUChars++;
+ value=*fromUSectionValues++;
+ if( value!=0 &&
+ (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP))
+ ) {
+ /* remember longest match so far */
+ matchValue=value;
+ matchLength=2+i+j;
+ }
+
+ /* match pre[] then src[] */
+ if(iUCNV_EXT_MAX_UCHARS) {
+ /*
+ * end of the entire input stream, stop with the longest match so far
+ * or: partial match must not be longer than UCNV_EXT_MAX_UCHARS
+ * because it must fit into state buffers
+ */
+ break;
+ } else {
+ /* continue with more input next time */
+ return -(2+length);
+ }
+ }
+
+ /* search for the current UChar */
+ index=ucnv_extFindFromU(fromUSectionUChars, length, c);
+ if(index<0) {
+ /* no match here, stop with the longest match so far */
+ break;
+ } else {
+ value=fromUSectionValues[index];
+ if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
+ /* partial match, continue */
+ index=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
+ } else {
+ if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=2+i+j;
+ } else {
+ /* full match on fallback not taken, stop with the longest match so far */
+ }
+ break;
+ }
+ }
+ }
+
+ if(matchLength==0) {
+ /* no match at all */
+ return 0;
+ }
+ } else /* result from firstCP trie lookup */ {
+ if( UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
+ FROM_U_USE_FALLBACK(useFallback, firstCP)
+ ) {
+ /* full match, stop with result */
+ matchValue=value;
+ matchLength=2;
+ } else {
+ /* fallback not taken */
+ return 0;
+ }
+ }
+
+ if(matchValue&UCNV_EXT_FROM_U_RESERVED_MASK) {
+ /* do not interpret values with reserved bits used, for forward compatibility */
+ return 0;
+ }
+
+ /* return result */
+ if(matchValue==UCNV_EXT_FROM_U_SUBCHAR1) {
+ return 1; /* special value, *pResult and *pResultLength are not set; a caller further below in this file sets cnv->useSubChar1 for it */
+ }
+
+ matchValue=UCNV_EXT_FROM_U_MASK_ROUNDTRIP(matchValue);
+ length=(int32_t)UCNV_EXT_FROM_U_GET_LENGTH(matchValue);
+ if(length<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH) {
+ *pResultLength=-(int32_t)matchValue; /* short result: the negated value itself carries length and bytes, *pResult is not set */
+ } else {
+ *pResultLength=length;
+ *pResult=UCNV_EXT_ARRAY(cx, UCNV_EXT_FROM_U_BYTES_INDEX, uint8_t)+UCNV_EXT_FROM_U_GET_DATA(matchValue);
+ }
+
+ return matchLength;
+}
+
+static U_INLINE void
+ucnv_extWriteFromU(UConverter *cnv,
+ const uint8_t *result, int32_t resultLength,
+ char **target, const char *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UErrorCode *pErrorCode) {
+ uint8_t buffer[4]; /* scratch space for directly encoded result bytes (at most 3 are written below) */
+
+ /* output the result; a negative resultLength carries the bytes itself, otherwise result points to them */
+ if(resultLength<0) {
+ /*
+ * Generate a byte array and then write it below.
+ * This is not the fastest possible way, but it should be ok for
+ * extension mappings, and it is much simpler.
+ * Offset and overflow handling are only done once this way.
+ */
+ uint8_t *p;
+ uint32_t value;
+
+ resultLength=-resultLength;
+ value=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(resultLength);
+ resultLength=UCNV_EXT_FROM_U_GET_LENGTH(resultLength);
+ /* resultLength<=UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH==3 */
+
+ p=buffer;
+ switch(resultLength) {
+ case 3:
+ *p++=(uint8_t)(value>>16); /* fall through: the lower bytes follow */
+ case 2:
+ *p++=(uint8_t)(value>>8); /* fall through */
+ case 1:
+ *p++=(uint8_t)value; /* fall through to the break */
+ default:
+ break; /* will never occur */
+ }
+ result=buffer;
+ }
+
+ /* with correct data we have resultLength>0 */
+ ucnv_fromUWriteBytes(cnv, (const char *)result, resultLength,
+ target, targetLimit,
+ offsets, srcIndex,
+ pErrorCode);
+}
+
+/*
+ * Start matching a fromUnicode extension mapping for the code point cp
+ * followed by the source text at *src.
+ *
+ * target<targetLimit; set error code for overflow
+ *
+ * Outcomes, by the return value of ucnv_extMatchFromU():
+ * - >=2 (full match): *src is advanced past the matched input, the result is
+ *   written with ucnv_extWriteFromU(), and TRUE is returned
+ * - <0 (partial match): cp is saved in cnv->preFromUFirstCP, the consumed
+ *   input is copied to cnv->preFromU[] with its length in
+ *   cnv->preFromULength, *src is advanced, and TRUE is returned
+ * - 1: cnv->useSubChar1 is set and FALSE is returned
+ * - 0: no match, FALSE is returned
+ */
+U_CFUNC UBool
+ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
+                          UChar32 cp,
+                          const UChar **src, const UChar *srcLimit,
+                          char **target, const char *targetLimit,
+                          int32_t **offsets, int32_t srcIndex,
+                          UBool flush,
+                          UErrorCode *pErrorCode) {
+    const uint8_t *result;
+    int32_t resultLength, match;
+
+    /* try to match */
+    match=ucnv_extMatchFromU(cx, cp,
+                             NULL, 0,
+                             *src, (int32_t)(srcLimit-*src),
+                             &result, &resultLength,
+                             cnv->useFallback, flush);
+    if(match>=2) {
+        /* advance src pointer for the consumed input */
+        *src+=match-2; /* remove 2 for the initial code point */
+
+        /* write result to target */
+        ucnv_extWriteFromU(cnv,
+                           result, resultLength,
+                           target, targetLimit,
+                           offsets, srcIndex,
+                           pErrorCode);
+        return TRUE;
+    } else if(match<0) {
+        /* save state for partial match */
+        const UChar *s;
+        int32_t j;
+
+        /* copy the first code point */
+        cnv->preFromUFirstCP=cp;
+
+        /* now copy the newly consumed input */
+        s=*src;
+        match=-match-2; /* remove 2 for the initial code point */
+        for(j=0; j<match; ++j) {
+            cnv->preFromU[j]=*s++;
+        }
+        *src=s; /* same as *src=srcLimit; because we reached the end of input */
+        cnv->preFromULength=(int8_t)match;
+        return TRUE;
+    } else if(match==1) {
+        /* matched, no mapping but request for <subchar1> */
+        cnv->useSubChar1=TRUE;
+        return FALSE;
+    } else /* match==0 no match */ {
+        return FALSE;
+    }
+}
+
+/*
+ * Simple, single-character fromUnicode extension lookup for cp.
+ *
+ * On success, stores the result bytes in *pValue as one integer
+ * (first byte most significant) and returns their number.
+ * Returns 0 when there is no usable mapping.
+ */
+U_CFUNC int32_t
+ucnv_extSimpleMatchFromU(const int32_t *cx,
+                         UChar32 cp, uint32_t *pValue,
+                         UBool useFallback,
+                         UErrorCode *pErrorCode) {
+    const uint8_t *bytes;
+    int32_t length, matched;
+
+    /* match cp alone: no stored input, no further source, flush==TRUE */
+    matched=ucnv_extMatchFromU(cx,
+                               cp,
+                               NULL, 0,
+                               NULL, 0,
+                               &bytes, &length,
+                               useFallback, TRUE);
+
+    /*
+     * matched==1: no match found, <subchar1> preferred
+     * matched==0: no match found in the first place
+     * matched<0:  partial match, not supported for simple conversion
+     *             (and flush==TRUE)
+     */
+    if(matched<2) {
+        return 0;
+    }
+
+    if(length<0) {
+        /* direct result: the bytes are stored in the value word itself */
+        int32_t packed=-length;
+
+        *pValue=(uint32_t)UCNV_EXT_FROM_U_GET_DATA(packed);
+        return UCNV_EXT_FROM_U_GET_LENGTH(packed);
+    }
+
+    if(length!=4) {
+        /* result too long for simple conversion */
+        return 0;
+    }
+
+    /* de-serialize a 4-byte result */
+    *pValue=
+        ((uint32_t)bytes[0]<<24)|
+        ((uint32_t)bytes[1]<<16)|
+        ((uint32_t)bytes[2]<<8)|
+        bytes[3];
+    return 4;
+}
+
+/*
+ * continue partial match with new input, requires cnv->preFromUFirstCP>=0
+ * never called for simple, single-character conversion
+ *
+ * Matches cnv->preFromUFirstCP, the stored input in cnv->preFromU[] and the
+ * new input at pArgs->source against the extension data of the converter.
+ *
+ * Outcomes, by the return value of ucnv_extMatchFromU():
+ * - >=2 (full match): the result is written to pArgs->target and
+ *   cnv->preFromUFirstCP is reset to U_SENTINEL; stored input that the match
+ *   did not use stays in preFromU[] with a negative preFromULength (replay)
+ * - <0 (partial match): the newly consumed input is appended to preFromU[]
+ * - 0 or 1 (no match): the first code point moves to cnv->fromUChar32,
+ *   preFromULength is negated to mark the stored input for replay, and
+ *   *pErrorCode is set to U_INVALID_CHAR_FOUND
+ */
+U_CFUNC void
+ucnv_extContinueMatchFromU(UConverter *cnv,
+                           UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+                           UErrorCode *pErrorCode) {
+    const uint8_t *result;
+    int32_t resultLength, match;
+
+    match=ucnv_extMatchFromU(cnv->sharedData->table->mbcs.extIndexes,
+                             cnv->preFromUFirstCP,
+                             cnv->preFromU, cnv->preFromULength,
+                             pArgs->source, (int32_t)(pArgs->sourceLimit-pArgs->source),
+                             &result, &resultLength,
+                             cnv->useFallback, pArgs->flush);
+    if(match>=2) {
+        match-=2; /* remove 2 for the initial code point */
+
+        if(match>=cnv->preFromULength) {
+            /* advance src pointer for the consumed input */
+            pArgs->source+=match-cnv->preFromULength;
+            cnv->preFromULength=0;
+        } else {
+            /* the match did not use all of preFromU[] - keep the rest for replay */
+            int32_t length=cnv->preFromULength-match;
+            uprv_memmove(cnv->preFromU, cnv->preFromU+match, length*U_SIZEOF_UCHAR);
+            cnv->preFromULength=(int8_t)-length;
+        }
+
+        /* finish the partial match */
+        cnv->preFromUFirstCP=U_SENTINEL;
+
+        /* write result */
+        ucnv_extWriteFromU(cnv,
+                           result, resultLength,
+                           &pArgs->target, pArgs->targetLimit,
+                           &pArgs->offsets, srcIndex,
+                           pErrorCode);
+    } else if(match<0) {
+        /* save state for partial match */
+        const UChar *s;
+        int32_t j;
+
+        /* just _append_ the newly consumed input to preFromU[] */
+        s=pArgs->source;
+        match=-match-2; /* remove 2 for the initial code point */
+        for(j=cnv->preFromULength; j<match; ++j) {
+            cnv->preFromU[j]=*s++;
+        }
+        pArgs->source=s; /* same as *src=srcLimit; because we reached the end of input */
+        cnv->preFromULength=(int8_t)match;
+    } else /* match==0 or 1 */ {
+        /*
+         * no match
+         *
+         * We need to split the previous input into two parts:
+         *
+         * 1. The first code point is unmappable - that's how we got into
+         *    trying the extension data in the first place.
+         *    We need to move it from the preFromU buffer
+         *    to the error buffer, set an error code,
+         *    and prepare the rest of the previous input for 2.
+         *
+         * 2. The rest of the previous input must be converted once we
+         *    come back from the callback for the first code point.
+         *    At that time, we have to try again from scratch to convert
+         *    these input characters.
+         *    The replay will be handled by the ucnv.c conversion code.
+         */
+
+        if(match==1) {
+            /* matched, no mapping but request for <subchar1> */
+            cnv->useSubChar1=TRUE;
+        }
+
+        /* move the first code point to the error field */
+        cnv->fromUChar32=cnv->preFromUFirstCP;
+        cnv->preFromUFirstCP=U_SENTINEL;
+
+        /* mark preFromU for replay */
+        cnv->preFromULength=-cnv->preFromULength;
+
+        /* set the error code for unassigned */
+        *pErrorCode=U_INVALID_CHAR_FOUND;
+    }
+}
+
+/*
+ * ### TODO
+ *
+ * - test toU() functions
+ *
+ * - EBCDIC_STATEFUL: support extensions, but the charset string must be
+ * either one single-byte character or a sequence of double-byte ones,
+ * to avoid state transitions inside the mapping and to avoid having to
+ * store character boundaries.
+ * The extension functions will need an additional EBCDIC state in/out
+ * parameter and will have to be able to insert an SI or SO before writing
+ * the mapping result.
+ * - EBCDIC_STATEFUL: toU() may need to check if in DB mode, do nothing if in SB
+ * - EBCDIC_STATEFUL: fix prefix checking to keep SBCS & DBCS separate
+ * - make dbcsonly work with extensions
+ *
+ * - test |2 to <subchar1> for regular code point, prefix code point,
+ * multiple code points
+ * - test fallback from non-zero to 00
+ * - try a smaller U_CNV_SAFECLONE_BUFFERSIZE and try ccapitst/TestConvertSafeClone()
+ */
+
+#endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */
diff --git a/icu4c/source/common/ucnv_ext.h b/icu4c/source/common/ucnv_ext.h
new file mode 100644
index 0000000000..29a683263f
--- /dev/null
+++ b/icu4c/source/common/ucnv_ext.h
@@ -0,0 +1,417 @@
+/*
+******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+******************************************************************************
+* file name: ucnv_ext.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun13
+* created by: Markus W. Scherer
+*
+* Conversion extensions
+*/
+
+#ifndef __UCNV_EXT_H__
+#define __UCNV_EXT_H__
+
+#include "unicode/utypes.h"
+#include "unicode/ucnv.h"
+
+/*
+ * See icuhtml/design/conversion/conversion_extensions.html
+ *
+ * Conversion extensions serve two purposes:
+ * 1. They support m:n mappings.
+ * 2. They support extension-only conversion files that are used together
+ * with the regular conversion data in base files.
+ *
+ * A base file may contain an extension table (explicitly requested or
+ * implicitly generated for m:n mappings), but its extension table is not
+ * used when an extension-only file is used.
+ *
+ * It is an error if a base file contains any regular (not extension) mapping
+ * from the same sequence as a mapping in the extension file
+ * because the base mapping would hide the extension mapping.
+ *
+ *
+ * Data for conversion extensions:
+ *
+ * One set of data structures per conversion direction (to/from Unicode).
+ * The data structures are sorted by input units to allow for binary search.
+ * Input sequences of more than one unit are handled like contraction tables
+ * in collation:
+ * The lookup value of a unit points to another table that is to be searched
+ * for the next unit, recursively.
+ *
+ * For conversion from Unicode, the initial code point is looked up in
+ * a 3-stage trie for speed,
+ * with an additional table of unique results to save space.
+ *
+ * Long output strings are stored in separate arrays, with length and index
+ * in the lookup tables.
+ * Output results also include a flag distinguishing roundtrip from
+ * (reverse) fallback mappings.
+ *
+ * Input Unicode strings must not begin or end with unpaired surrogates
+ * to avoid problems with matches on parts of surrogate pairs.
+ *
+ * Mappings from multiple characters (code points or codepage state
+ * table sequences) must be searched preferring the longest match.
+ * For this to work and be efficient, the variable-width table must contain
+ * all mappings that contain prefixes of the multiple characters.
+ * If an extension table is built on top of a base table in another file
+ * and a base table entry is a prefix of a multi-character mapping, then
+ * this is an error.
+ *
+ *
+ * Implementation note:
+ *
+ * Currently, the parser and several checks in the code limit the number
+ * of UChars or bytes in a mapping to
+ * UCNV_EXT_MAX_UCHARS and UCNV_EXT_MAX_BYTES, respectively,
+ * which are output value limits in the data structure.
+ *
+ * For input, this is not strictly necessary - it is a hard limit only for the
+ * buffers in UConverter that are used to store partial matches.
+ *
+ * Input sequences could otherwise be arbitrarily long if partial matches
+ * need not be stored (i.e., if a sequence does not span several buffers with too
+ * many units before the last buffer), although then results would differ
+ * depending on whether partial matches exceed the limits or not,
+ * which depends on the pattern of buffer sizes.
+ *
+ *
+ * Data structure:
+ *
+ * int32_t indexes[>=32];
+ *
+ * Array of indexes and lengths etc. The length of the array is at least 32.
+ * The actual length is stored in indexes[0] to be forward compatible.
+ *
+ * Each index to another array is the number of bytes from indexes[].
+ * Each length of an array is the number of array base units in that array.
+ *
+ * Some of the structures may not be present, in which case their indexes
+ * and lengths are 0.
+ *
+ * Usage of indexes[i]:
+ * [0] length of indexes[]
+ *
+ * // to Unicode table
+ * [1] index of toUTable[] (array of uint32_t)
+ * [2] length of toUTable[]
+ * [3] index of toUUChars[] (array of UChar)
+ * [4] length of toUUChars[]
+ *
+ * // from Unicode table, not for the initial code point
+ * [5] index of fromUTableUChars[] (array of UChar)
+ * [6] index of fromUTableValues[] (array of uint32_t)
+ * [7] length of fromUTableUChars[] and fromUTableValues[]
+ * [8] index of fromUBytes[] (array of char)
+ * [9] length of fromUBytes[]
+ *
+ * // from Unicode trie for initial-code point lookup
+ * [10] index of fromUStage12[] (combined array of uint16_t for stages 1 & 2)
+ * [11] length of stage 1 portion of fromUStage12[]
+ * [12] length of fromUStage12[]
+ * [13] index of fromUStage3[] (array of uint16_t indexes into fromUStage3b[])
+ * [14] length of fromUStage3[]
+ * [15] index of fromUStage3b[] (array of uint32_t like fromUTableValues[])
+ * [16] length of fromUStage3b[]
+ *
+ * [17]..[30] reserved
+ * [31] number of bytes for the entire extension structure
+ * [>31] reserved; there are indexes[0] indexes
+ *
+ *
+ * uint32_t toUTable[];
+ *
+ * Array of byte/value pairs for lookups for toUnicode conversion.
+ * The array is partitioned into sections like collation contraction tables.
+ * Each section contains one word with the number of following words and
+ * a default value for when the lookup in this section yields no match.
+ *
+ * A section is sorted in ascending order of input bytes,
+ * allowing for fast linear or binary searches.
+ * The builder may store entries for a contiguous range of byte values
+ * (compare difference between the first and last one with count),
+ * which then allows for direct array access.
+ * The builder should always do this for the initial table section.
+ *
+ * Entries may have 0 values, see below.
+ * No two entries in a section have the same byte values.
+ *
+ * Each uint32_t contains an input byte value in bits 31..24 and the
+ * corresponding lookup value in bits 23..0.
+ * Interpret the value as follows:
+ * if(value==0) {
+ * no match, see below
+ * } else if(value<0x1f0000) {
+ * partial match - use value as index to the next toUTable section
+ * and match the next unit; (value indexes toUTable[value])
+ * } else {
+ * if(bit 23 set) {
+ * roundtrip;
+ * } else {
+ * fallback;
+ * }
+ * unset value bit 23;
+ * if(value<=0x2fffff) {
+ * (value-0x1f0000) is a code point; (BMP: value<=0x1fffff)
+ * } else {
+ * bits 17..0 (value&0x3ffff) is an index to
+ * the result UChars in toUUChars[]; (0 indexes toUUChars[0])
+ * length of the result=((value>>18)-12); (length=0..19)
+ * }
+ * }
+ *
+ * The first word in a section contains the number of following words in the
+ * input byte position (bits 31..24, number=1..0xff).
+ * The value of the initial word is used when the current byte is not found
+ * in this section.
+ * If the value is not 0, then it represents a result as above.
+ * If the value is 0, then the search has to return a shorter match with an
+ * earlier default value as the result, or result in "unmappable" even for the
+ * initial bytes.
+ * If the value is 0 for the initial toUTable entry, then the initial byte
+ * does not start any mapping input.
+ *
+ *
+ * UChar toUUChars[];
+ *
+ * Contains toUnicode mapping results, stored as sequences of UChars.
+ * Indexes and lengths stored in the toUTable[].
+ *
+ *
+ * UChar fromUTableUChars[];
+ * uint32_t fromUTableValues[];
+ *
+ * The fromUTable is split into two arrays, but works otherwise much like
+ * the toUTable. The array is partitioned into sections like collation
+ * contraction tables and toUTable.
+ * A row in the table consists of same-index entries in fromUTableUChars[]
+ * and fromUTableValues[].
+ *
+ * Interpret a value as follows:
+ * if(value==0) {
+ * no match, see below
+ * } else if(value<=0xffffff) { (bits 31..24 are 0)
+ * partial match - use value as index to the next fromUTable section
+ * and match the next unit; (value indexes fromUTable[value])
+ * } else {
+ * if(value==0x80000001) {
+ * return no mapping, but request for <subchar1>;
+ * }
+ * if(bit 31 set) {
+ * roundtrip;
+ * } else {
+ * fallback;
+ * }
+ * // bits 30..29 reserved, 0
+ * length=(value>>24)&0x1f; (bits 28..24)
+ * if(length==1..3) {
+ * bits 23..0 contain 1..3 bytes, padded with 00s on the left;
+ * } else {
+ * bits 23..0 (value&0xffffff) is an index to
+ * the result bytes in fromUBytes[]; (0 indexes fromUBytes[0])
+ * }
+ * }
+ *
+ * The first pair in a section contains the number of following pairs in the
+ * UChar position (16 bits, number=1..0xffff).
+ * The value of the initial pair is used when the current UChar is not found
+ * in this section.
+ * If the value is not 0, then it represents a result as above.
+ * If the value is 0, then the search has to return a shorter match with an
+ * earlier default value as the result, or result in "unmappable" even for the
+ * initial UChars.
+ *
+ * If the from Unicode trie is present, then the from Unicode search tables
+ * are not used for initial code points.
+ * In this case, the first entries (index 0) in the tables are not used
+ * (reserved, set to 0) because a value of 0 is used in trie results
+ * to indicate no mapping.
+ *
+ *
+ * uint16_t fromUStage12[];
+ *
+ * Stages 1 & 2 of a trie that maps an initial code point.
+ * Indexes in stage 1 are all offset by the length of stage 1 so that the
+ * same array pointer can be used for both stages.
+ * If (c>>10)>=(length of stage 1) then c does not start any mapping.
+ * Same bit distribution as for regular conversion tries.
+ *
+ *
+ * uint16_t fromUStage3[];
+ * uint32_t fromUStage3b[];
+ *
+ * Stage 3 of the trie. The first array simply contains indexes to the second,
+ * which contains words in the same format as fromUTableValues[].
+ * Use a stage 3 granularity of 4, which allows for 256k stage 3 entries,
+ * and 16-bit entries in stage 3 allow for 64k stage 3b entries.
+ * The stage 3 granularity means that the stage 2 entry needs to be left-shifted.
+ *
+ * Two arrays are used because it is expected that more than half of the stage 3
+ * entries will be zero. The 16-bit index stage 3 array saves space even
+ * considering storing a total of 6 bytes per non-zero entry in both arrays
+ * together.
+ * Using a stage 3 granularity of >1 diminishes the compactability in that stage
+ * but provides a larger effective addressing space in stage 2.
+ * All but the final result stage use 16-bit entries to save space.
+ *
+ * fromUStage3b[] contains a zero for "no mapping" at its index 0,
+ * and may contain UCNV_EXT_FROM_U_SUBCHAR1 at index 1 for "<subchar1> SUB mapping"
+ * (i.e., "no mapping" with preference for <subchar1> rather than <subchar>),
+ * and all other items are unique non-zero results.
+ *
+ *
+ * char fromUBytes[];
+ *
+ * Contains fromUnicode mapping results, stored as sequences of chars.
+ * Indexes and lengths stored in the fromUTableValues[].
+ */
+enum { /* positions in the extension indexes[] array, see "Usage of indexes[i]" above */
+ UCNV_EXT_INDEXES_LENGTH, /* 0: length of indexes[] */
+
+ UCNV_EXT_TO_U_INDEX, /* 1: index of toUTable[] */
+ UCNV_EXT_TO_U_LENGTH, /* 2: length of toUTable[] */
+ UCNV_EXT_TO_U_UCHARS_INDEX, /* 3: index of toUUChars[] */
+ UCNV_EXT_TO_U_UCHARS_LENGTH, /* 4: length of toUUChars[] */
+
+ UCNV_EXT_FROM_U_UCHARS_INDEX, /* 5: index of fromUTableUChars[] */
+ UCNV_EXT_FROM_U_VALUES_INDEX, /* 6: index of fromUTableValues[] */
+ UCNV_EXT_FROM_U_LENGTH, /* 7: length of fromUTableUChars[] and fromUTableValues[] */
+ UCNV_EXT_FROM_U_BYTES_INDEX, /* 8: index of fromUBytes[] */
+ UCNV_EXT_FROM_U_BYTES_LENGTH, /* 9: length of fromUBytes[] */
+
+ UCNV_EXT_FROM_U_STAGE_12_INDEX, /* 10: index of fromUStage12[] */
+ UCNV_EXT_FROM_U_STAGE_1_LENGTH, /* 11: length of stage 1 portion of fromUStage12[] */
+ UCNV_EXT_FROM_U_STAGE_12_LENGTH, /* 12: length of fromUStage12[] */
+ UCNV_EXT_FROM_U_STAGE_3_INDEX, /* 13: index of fromUStage3[] */
+ UCNV_EXT_FROM_U_STAGE_3_LENGTH, /* 14: length of fromUStage3[] */
+ UCNV_EXT_FROM_U_STAGE_3B_INDEX, /* 15: index of fromUStage3b[] */
+ UCNV_EXT_FROM_U_STAGE_3B_LENGTH, /* 16: length of fromUStage3b[] */
+
+ UCNV_EXT_RESERVED_INDEX, /* 17, moves with additional indexes */
+
+ UCNV_EXT_SIZE=31, /* 31: number of bytes for the entire extension structure */
+ UCNV_EXT_INDEXES_MIN_LENGTH=32 /* indexes[] has at least 32 entries */
+};
+
+/* get the pointer to an extension array from indexes[index]; indexes[index] is added as a byte offset to the start of indexes[] and the result is cast to (const itemType *) */
+#define UCNV_EXT_ARRAY(indexes, index, itemType) \
+ ((const itemType *)((const char *)(indexes)+(indexes)[index]))
+
+/* internal API ------------------------------------------------------------- */
+
+U_CFUNC UBool
+ucnv_extInitialMatchToU(UConverter *cnv, const int32_t *cx,
+ int32_t firstLength,
+ const char **src, const char *srcLimit,
+ UChar **target, const UChar *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UBool flush,
+ UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_extContinueMatchToU(UConverter *cnv,
+ UConverterToUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
+
+U_CFUNC UBool
+ucnv_extInitialMatchFromU(UConverter *cnv, const int32_t *cx,
+ UChar32 cp,
+ const UChar **src, const UChar *srcLimit,
+ char **target, const char *targetLimit,
+ int32_t **offsets, int32_t srcIndex,
+ UBool flush,
+ UErrorCode *pErrorCode);
+
+U_CFUNC int32_t
+ucnv_extSimpleMatchFromU(const int32_t *cx,
+ UChar32 cp, uint32_t *pValue,
+ UBool useFallback,
+ UErrorCode *pErrorCode);
+
+U_CFUNC void
+ucnv_extContinueMatchFromU(UConverter *cnv,
+ UConverterFromUnicodeArgs *pArgs, int32_t srcIndex,
+ UErrorCode *pErrorCode);
+
+/* toUnicode helpers -------------------------------------------------------- */
+
+/*
+ * A toUTable[] word has the input byte in bits 31..24
+ * and the lookup value in bits 23..0.
+ */
+#define UCNV_EXT_TO_U_BYTE_SHIFT 24
+#define UCNV_EXT_TO_U_VALUE_MASK 0xffffff
+#define UCNV_EXT_TO_U_MIN_CODE_POINT 0x1f0000
+#define UCNV_EXT_TO_U_MAX_CODE_POINT 0x2fffff
+#define UCNV_EXT_TO_U_ROUNDTRIP_FLAG ((uint32_t)1<<23)
+#define UCNV_EXT_TO_U_INDEX_MASK 0x3ffff
+#define UCNV_EXT_TO_U_LENGTH_SHIFT 18
+#define UCNV_EXT_TO_U_LENGTH_OFFSET 12
+
+/* maximum number of indexed UChars */
+#define UCNV_EXT_MAX_UCHARS 19
+
+/* combine an input byte and a 24-bit lookup value into one toUTable[] word */
+#define UCNV_EXT_TO_U_MAKE_WORD(byte, value) (((uint32_t)(byte)<<UCNV_EXT_TO_U_BYTE_SHIFT)|(value))
+
+#define UCNV_EXT_TO_U_GET_BYTE(word) ((word)>>UCNV_EXT_TO_U_BYTE_SHIFT)
+#define UCNV_EXT_TO_U_GET_VALUE(word) ((word)&UCNV_EXT_TO_U_VALUE_MASK)
+
+/* value<0x1f0000: partial match, the value is the index of the next toUTable[] section */
+#define UCNV_EXT_TO_U_IS_PARTIAL(value) ((value)<UCNV_EXT_TO_U_MIN_CODE_POINT)
+#define UCNV_EXT_TO_U_GET_PARTIAL_INDEX(value) (value)
+
+/* bit 23 set: roundtrip; not set: fallback */
+#define UCNV_EXT_TO_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_TO_U_ROUNDTRIP_FLAG)!=0)
+#define UCNV_EXT_TO_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_TO_U_ROUNDTRIP_FLAG)
+
+/* use after masking off the roundtrip flag */
+#define UCNV_EXT_TO_U_IS_CODE_POINT(value) ((value)<=UCNV_EXT_TO_U_MAX_CODE_POINT)
+#define UCNV_EXT_TO_U_GET_CODE_POINT(value) ((value)-UCNV_EXT_TO_U_MIN_CODE_POINT)
+
+/* otherwise bits 17..0 index toUUChars[] and the length is (value>>18)-12 */
+#define UCNV_EXT_TO_U_GET_INDEX(value) ((value)&UCNV_EXT_TO_U_INDEX_MASK)
+#define UCNV_EXT_TO_U_GET_LENGTH(value) (((value)>>UCNV_EXT_TO_U_LENGTH_SHIFT)-UCNV_EXT_TO_U_LENGTH_OFFSET)
+
+/* fromUnicode helpers ------------------------------------------------------ */
+
+/* most trie constants are shared with ucnvmbcs.h */
+
+/* see similar utrie.h UTRIE_INDEX_SHIFT and UTRIE_DATA_GRANULARITY */
+#define UCNV_EXT_STAGE_2_LEFT_SHIFT 2
+#define UCNV_EXT_STAGE_3_GRANULARITY 4
+
+/* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */
+#define UCNV_EXT_FROM_U(stage12, stage3, s1Index, c) \
+    (stage3)[ ((int32_t)(stage12)[ (stage12)[s1Index] +(((c)>>4)&0x3f) ]<<UCNV_EXT_STAGE_2_LEFT_SHIFT) +((c)&0xf) ]
+
+/*
+ * Layout of a fromUnicode value word:
+ * bit 31 roundtrip flag, bits 30..29 reserved,
+ * bits 28..24 length, bits 23..0 bytes or index into fromUBytes[].
+ */
+#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
+#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
+#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
+#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
+
+/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
+#define UCNV_EXT_FROM_U_SUBCHAR1 0x80000001
+
+/* at most 3 bytes in the lower part of the value */
+#define UCNV_EXT_FROM_U_MAX_DIRECT_LENGTH 3
+
+/* maximum number of indexed bytes */
+#define UCNV_EXT_MAX_BYTES 0x1f
+
+/* bits 31..24 all 0: partial match, the value is the index of the next fromUTable section */
+#define UCNV_EXT_FROM_U_IS_PARTIAL(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)==0)
+#define UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value) (value)
+
+#define UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) (((value)&UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)!=0)
+#define UCNV_EXT_FROM_U_MASK_ROUNDTRIP(value) ((value)&~UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)
+
+/* use after masking off the roundtrip flag */
+#define UCNV_EXT_FROM_U_GET_LENGTH(value) (((value)>>UCNV_EXT_FROM_U_LENGTH_SHIFT)&UCNV_EXT_MAX_BYTES)
+
+/* get bytes or bytes index */
+#define UCNV_EXT_FROM_U_GET_DATA(value) ((value)&UCNV_EXT_FROM_U_DATA_MASK)
+
+#endif
diff --git a/icu4c/source/common/ucnvmbcs.c b/icu4c/source/common/ucnvmbcs.c
index 0e2cee3f3f..c71bbe99fd 100644
--- a/icu4c/source/common/ucnvmbcs.c
+++ b/icu4c/source/common/ucnvmbcs.c
@@ -46,6 +46,7 @@
#include "unicode/uset.h"
#include "ucnv_bld.h"
#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
#include "ucnv_cnv.h"
#include "umutex.h"
#include "cmemory.h"
@@ -56,9 +57,18 @@
#define MBCS_UNROLL_SINGLE_FROM_BMP 0
/*
- * _MBCSHeader versions 4.1
+ * _MBCSHeader versions 4.2
* (Note that the _MBCSHeader version is in addition to the converter formatVersion.)
*
+ * Change from version 4.1:
+ * - Added an optional extension table structure at the end of the .cnv file.
+ * It is present if the upper bits of the header flags field contain a non-zero
+ * byte offset to it.
+ * Files that contain only a conversion table and no base table
+ * use the special outputType MBCS_OUTPUT_EXT_ONLY.
+ * These contain the base table name between the MBCS header and the extension
+ * data.
+ *
* Change from version 4.0:
* - Replace header.reserved with header.fromUBytesLength so that all
* fields in the data have length.
@@ -524,11 +534,6 @@ _MBCSGetUnicodeSet(const UConverter *cnv,
* code. The framework will then call the callback function.
*/
-/*
- * TODO when implementing real extensions, review whether the useFallback parameter
- * should get cnv->useFallback or the full resolution considering cp as well
- */
-
/*
* @return if(U_FAILURE) return the code point for cnv->fromUChar32
* else return 0 after output has been written to the target
@@ -539,10 +544,26 @@ _extFromU(UConverter *cnv, const UConverterSharedData *sharedData,
const UChar **source, const UChar *sourceLimit,
char **target, const char *targetLimit,
int32_t **offsets, int32_t sourceIndex,
- UBool useFallback, UBool flush,
+ UBool flush,
UErrorCode *pErrorCode) {
+ const int32_t *cx;
+
+ cnv->useSubChar1=FALSE;
+
+ if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
+ ucnv_extInitialMatchFromU(
+ cnv, cx,
+ cp, source, sourceLimit,
+ target, targetLimit,
+ offsets, sourceIndex,
+ flush,
+ pErrorCode)
+ ) {
+ return 0; /* an extension mapping handled the input */
+ }
+
/* GB 18030 */
- if(cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
+ if((cnv->options&_MBCS_OPTION_GB18030)!=0) {
const uint32_t *range;
int32_t i;
@@ -590,10 +611,24 @@ _extToU(UConverter *cnv, const UConverterSharedData *sharedData,
const char **source, const char *sourceLimit,
UChar **target, const UChar *targetLimit,
int32_t **offsets, int32_t sourceIndex,
- UBool useFallback, UBool flush,
+ UBool flush,
UErrorCode *pErrorCode) {
+ const int32_t *cx;
+
+ if( (cx=sharedData->table->mbcs.extIndexes)!=NULL &&
+ ucnv_extInitialMatchToU(
+ cnv, cx,
+ length, source, sourceLimit,
+ target, targetLimit,
+ offsets, sourceIndex,
+ flush,
+ pErrorCode)
+ ) {
+ return 0; /* an extension mapping handled the input */
+ }
+
/* GB 18030 */
- if(length==4 && cnv!=NULL && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
+ if(length==4 && (cnv->options&_MBCS_OPTION_GB18030)!=0) {
const uint32_t *range;
uint32_t linear;
int32_t i;
@@ -789,6 +824,7 @@ _MBCSLoad(UConverterSharedData *sharedData,
UDataInfo info;
UConverterMBCSTable *mbcsTable=&sharedData->table->mbcs;
_MBCSHeader *header=(_MBCSHeader *)raw;
+ uint32_t offset;
if(header->version[0]!=4) {
*pErrorCode=U_INVALID_TABLE_FORMAT;
@@ -806,6 +842,12 @@ _MBCSLoad(UConverterSharedData *sharedData,
mbcsTable->fromUBytesLength=header->fromUBytesLength;
mbcsTable->outputType=(uint8_t)header->flags;
+ /* extension data, header version 4.2 and higher */
+ offset=header->flags>>8;
+ if(offset!=0) {
+ mbcsTable->extIndexes=(const int32_t *)(raw+offset);
+ }
+
/* make sure that the output type is known */
switch(mbcsTable->outputType) {
case MBCS_OUTPUT_1:
@@ -817,6 +859,8 @@ _MBCSLoad(UConverterSharedData *sharedData,
case MBCS_OUTPUT_2_SISO:
/* OK */
break;
+ case MBCS_OUTPUT_EXT_ONLY:
+ /* ### TODO */
default:
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
@@ -1062,7 +1106,7 @@ _MBCSSingleToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
1, (const char **)&source, (const char *)sourceLimit,
&target, targetLimit,
&offsets, sourceIndex,
- (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
+ pArgs->flush,
pErrorCode);
sourceIndex+=1+(int32_t)(source-(const uint8_t *)pArgs->source);
@@ -1263,7 +1307,7 @@ unrolled:
1, (const char **)&source, (const char *)sourceLimit,
&target, target+targetCapacity,
&offsets, sourceIndex,
- (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
+ pArgs->flush,
pErrorCode);
sourceIndex+=1+(int32_t)(source-lastSource);
@@ -1299,266 +1343,6 @@ unrolled:
pArgs->offsets=offsets;
}
-/*
- * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
- * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
- */
-static UChar32
-_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const int32_t (*stateTable)[256];
- const uint8_t *source, *sourceLimit;
-
- int32_t entry;
- uint8_t action;
-
- /* set up the local pointers */
- cnv=pArgs->converter;
- source=(const uint8_t *)pArgs->source;
- sourceLimit=(const uint8_t *)pArgs->sourceLimit;
- if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
- stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
- } else {
- stateTable=cnv->sharedData->table->mbcs.stateTable;
- }
-
- /* conversion loop */
- while(sourcesource=(const char *)source;
-
- if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
- /* output BMP code point */
- return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
- }
-
- /*
- * An if-else-if chain provides more reliable performance for
- * the most common cases compared to a switch.
- */
- action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
- if( action==MBCS_STATE_VALID_DIRECT_20 ||
- (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
- ) {
- /* output supplementary code point */
- return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
- } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
- if(UCNV_TO_U_USE_FALLBACK(cnv)) {
- /* output BMP code point */
- return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
- }
- } else if(action==MBCS_STATE_UNASSIGNED) {
- /* just fall through */
- } else if(action==MBCS_STATE_ILLEGAL) {
- /* callback(illegal) */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- } else {
- /* reserved, must never occur */
- continue;
- }
-
- if(U_FAILURE(*pErrorCode)) {
- /* callback(illegal) */
- break;
- } else /* unassigned sequence */ {
- /* defer to the generic implementation */
- pArgs->source=(const char *)source-1;
- return UCNV_GET_NEXT_UCHAR_USE_TO_U;
- }
- }
-
- /* no output because of empty input or only state changes */
- *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
- return 0xffff;
-}
-
-static UChar32
-_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
- UErrorCode *pErrorCode) {
- UConverter *cnv;
- const uint8_t *source, *sourceLimit, *lastSource;
-
- const int32_t (*stateTable)[256];
- const uint16_t *unicodeCodeUnits;
-
- uint32_t offset;
- uint8_t state;
-
- int32_t entry;
- UChar32 c;
- uint8_t action;
-
- /* use optimized function if possible */
- cnv=pArgs->converter;
- if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
- /*
- * Using the generic ucnv_getNextUChar() code lets us deal correctly
- * with the rare case of a codepage that maps single surrogates
- * without adding the complexity to this already complicated function here.
- */
- return UCNV_GET_NEXT_UCHAR_USE_TO_U;
- } else if(cnv->sharedData->table->mbcs.countStates==1) {
- return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
- }
-
- /* set up the local pointers */
- source=lastSource=(const uint8_t *)pArgs->source;
- sourceLimit=(const uint8_t *)pArgs->sourceLimit;
-
- if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
- stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
- } else {
- stateTable=cnv->sharedData->table->mbcs.stateTable;
- }
- unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
-
- /* get the converter state from UConverter */
- offset=cnv->toUnicodeStatus;
- state=(uint8_t)(cnv->mode);
-
- /* conversion loop */
- c=U_SENTINEL;
- while(sourcesharedData->table->mbcs, offset))!=0xfffe) {
- break;
- }
- } else {
- /* callback(illegal) */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- }
- } else if(action==MBCS_STATE_VALID_16_PAIR) {
- offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
- c=unicodeCodeUnits[offset++];
- if(c<0xd800) {
- /* output BMP code point below 0xd800 */
- break;
- } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
- /* output roundtrip or fallback supplementary code point */
- c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
- break;
- } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
- /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
- c=unicodeCodeUnits[offset];
- break;
- } else if(c==0xffff) {
- /* callback(illegal) */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- }
- } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
- (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
- ) {
- /* output supplementary code point */
- c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
- break;
- } else if(action==MBCS_STATE_CHANGE_ONLY) {
- /*
- * This serves as a state change without any output.
- * It is useful for reading simple stateful encodings,
- * for example using just Shift-In/Shift-Out codes.
- * The 21 unused bits may later be used for more sophisticated
- * state transitions.
- */
- } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
- if(UCNV_TO_U_USE_FALLBACK(cnv)) {
- /* output BMP code point */
- c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
- break;
- }
- } else if(action==MBCS_STATE_UNASSIGNED) {
- /* just fall through */
- } else if(action==MBCS_STATE_ILLEGAL) {
- /* callback(illegal) */
- *pErrorCode=U_ILLEGAL_CHAR_FOUND;
- } else {
- /* reserved (must never occur), or only state change */
- offset=0;
- lastSource=source;
- continue;
- }
-
- /* end of action codes: prepare for a new character */
- offset=0;
-
- if(U_FAILURE(*pErrorCode)) {
- /* callback(illegal) */
- break;
- } else /* unassigned sequence */ {
- /* defer to the generic implementation */
- cnv->toUnicodeStatus=0;
- cnv->mode=state;
- pArgs->source=(const char *)lastSource;
- return UCNV_GET_NEXT_UCHAR_USE_TO_U;
- }
- }
- }
-
- if(c<0) {
- if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes;
- cnv->toULength=(int8_t)(source-lastSource);
- do {
- *bytes++=*lastSource++;
- } while(lastSourcetoUnicodeStatus=0;
- cnv->mode=state;
-
- /* write back the updated pointer */
- pArgs->source=(const char *)source;
- return c;
-}
-
U_CFUNC void
_MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
UErrorCode *pErrorCode) {
@@ -1584,6 +1368,19 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
/* use optimized function if possible */
cnv=pArgs->converter;
+
+ if(cnv->preToULength>0) {
+ /*
+ * pass sourceIndex=-1 because we continue from an earlier buffer;
+ * in the future, this may change with continuous offsets
+ */
+ ucnv_extContinueMatchToU(cnv, pArgs, -1, pErrorCode);
+
+ if(U_FAILURE(*pErrorCode) || cnv->preToULength<0) {
+ return;
+ }
+ }
+
if(cnv->sharedData->table->mbcs.countStates==1) {
if(!(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
_MBCSSingleToBMPWithOffsets(pArgs, pErrorCode);
@@ -1890,7 +1687,7 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
byteIndex, (const char **)&source, (const char *)sourceLimit,
&target, targetLimit,
&offsets, sourceIndex,
- (UBool)UCNV_TO_U_USE_FALLBACK(cnv), pArgs->flush,
+ pArgs->flush,
pErrorCode);
sourceIndex=nextSourceIndex+(int32_t)(source-(const uint8_t *)pArgs->source);
@@ -1912,6 +1709,328 @@ _MBCSToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
pArgs->offsets=offsets;
}
+/*
+ * This version of _MBCSGetNextUChar() is optimized for single-byte, single-state codepages.
+ * We still need a conversion loop in case we find reserved action codes, which are to be ignored.
+ *
+ * Each input byte is looked up in state 0 of the state table
+ * (the swapLFNL variant if UCNV_OPTION_SWAP_LFNL is set), and
+ * pArgs->source is advanced past every byte that is consumed.
+ *
+ * Returns the code point for the first byte with a direct mapping
+ * (fallback mappings only if UCNV_TO_U_USE_FALLBACK(cnv)).
+ * For an unassigned byte, pArgs->source is set back to that byte and
+ * UCNV_GET_NEXT_UCHAR_USE_TO_U is returned to defer to the generic implementation.
+ * If the loop ends without output, *pErrorCode is set to
+ * U_INDEX_OUTOFBOUNDS_ERROR and 0xffff is returned.
+ */
+static UChar32
+_MBCSSingleGetNextUChar(UConverterToUnicodeArgs *pArgs,
+                        UErrorCode *pErrorCode) {
+    UConverter *cnv;
+    const int32_t (*stateTable)[256];
+    const uint8_t *source, *sourceLimit;
+
+    int32_t entry;
+    uint8_t action;
+
+    /* set up the local pointers */
+    cnv=pArgs->converter;
+    source=(const uint8_t *)pArgs->source;
+    sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+    if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+        stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
+    } else {
+        stateTable=cnv->sharedData->table->mbcs.stateTable;
+    }
+
+    /* conversion loop */
+    while(source<sourceLimit) {
+        /* single-state codepage: every byte is looked up in state 0 */
+        entry=stateTable[0][*source++];
+        /* MBCS_ENTRY_IS_FINAL(entry) */
+
+        /* write back the updated pointer early so that we can return directly */
+        pArgs->source=(const char *)source;
+
+        if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
+            /* output BMP code point */
+            return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+        }
+
+        /*
+         * An if-else-if chain provides more reliable performance for
+         * the most common cases compared to a switch.
+         */
+        action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+        if( action==MBCS_STATE_VALID_DIRECT_20 ||
+            (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
+        ) {
+            /* output supplementary code point */
+            return (UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
+        } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
+            if(UCNV_TO_U_USE_FALLBACK(cnv)) {
+                /* output BMP code point */
+                return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+            }
+            /* fallbacks are not used: treat like unassigned below */
+        } else if(action==MBCS_STATE_UNASSIGNED) {
+            /* just fall through */
+        } else if(action==MBCS_STATE_ILLEGAL) {
+            /* callback(illegal) */
+            *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+        } else {
+            /* reserved, must never occur */
+            continue;
+        }
+
+        if(U_FAILURE(*pErrorCode)) {
+            /*
+             * callback(illegal)
+             * NOTE(review): after this break, the code below the loop replaces
+             * U_ILLEGAL_CHAR_FOUND with U_INDEX_OUTOFBOUNDS_ERROR -- confirm that this is intended.
+             */
+            break;
+        } else /* unassigned sequence */ {
+            /* defer to the generic implementation, starting again at the unassigned byte */
+            pArgs->source=(const char *)source-1;
+            return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+        }
+    }
+
+    /* no output because of empty input or only state changes */
+    *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR;
+    return 0xffff;
+}
+
+static UChar32
+_MBCSGetNextUChar(UConverterToUnicodeArgs *pArgs,
+ UErrorCode *pErrorCode) {
+ UConverter *cnv;
+ const uint8_t *source, *sourceLimit, *lastSource;
+
+ const int32_t (*stateTable)[256];
+ const uint16_t *unicodeCodeUnits;
+
+ uint32_t offset;
+ uint8_t state;
+
+ int32_t entry;
+ UChar32 c;
+ uint8_t action;
+
+ /* use optimized function if possible */
+ cnv=pArgs->converter;
+
+ /* ### TODO extension */
+ if(cnv->sharedData->table->mbcs.extIndexes!=NULL) {
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+ }
+ /* ### TODO end cheap-trick extension */
+
+ if(cnv->sharedData->table->mbcs.unicodeMask&UCNV_HAS_SURROGATES) {
+ /*
+ * Using the generic ucnv_getNextUChar() code lets us deal correctly
+ * with the rare case of a codepage that maps single surrogates
+ * without adding the complexity to this already complicated function here.
+ */
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+ } else if(cnv->sharedData->table->mbcs.countStates==1) {
+ return _MBCSSingleGetNextUChar(pArgs, pErrorCode);
+ }
+
+ /* set up the local pointers */
+ source=lastSource=(const uint8_t *)pArgs->source;
+ sourceLimit=(const uint8_t *)pArgs->sourceLimit;
+
+ if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
+ stateTable=(const int32_t (*)[256])cnv->sharedData->table->mbcs.swapLFNLStateTable;
+ } else {
+ stateTable=cnv->sharedData->table->mbcs.stateTable;
+ }
+ unicodeCodeUnits=cnv->sharedData->table->mbcs.unicodeCodeUnits;
+
+ /* get the converter state from UConverter */
+ offset=cnv->toUnicodeStatus;
+ state=(uint8_t)(cnv->mode);
+
+ /* conversion loop */
+ c=U_SENTINEL;
+ while(sourcesharedData->table->mbcs, offset))!=0xfffe) {
+ break;
+ }
+ } else {
+ /* callback(illegal) */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else if(action==MBCS_STATE_VALID_16_PAIR) {
+ offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
+ c=unicodeCodeUnits[offset++];
+ if(c<0xd800) {
+ /* output BMP code point below 0xd800 */
+ break;
+ } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? c<=0xdfff : c<=0xdbff) {
+ /* output roundtrip or fallback supplementary code point */
+ c=((c&0x3ff)<<10)+unicodeCodeUnits[offset]+(0x10000-0xdc00);
+ break;
+ } else if(UCNV_TO_U_USE_FALLBACK(cnv) ? (c&0xfffe)==0xe000 : c==0xe000) {
+ /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */
+ c=unicodeCodeUnits[offset];
+ break;
+ } else if(c==0xffff) {
+ /* callback(illegal) */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ }
+ } else if(action==MBCS_STATE_VALID_DIRECT_20 ||
+ (action==MBCS_STATE_FALLBACK_DIRECT_20 && UCNV_TO_U_USE_FALLBACK(cnv))
+ ) {
+ /* output supplementary code point */
+ c=(UChar32)(MBCS_ENTRY_FINAL_VALUE(entry)+0x10000);
+ break;
+ } else if(action==MBCS_STATE_CHANGE_ONLY) {
+ /*
+ * This serves as a state change without any output.
+ * It is useful for reading simple stateful encodings,
+ * for example using just Shift-In/Shift-Out codes.
+ * The 21 unused bits may later be used for more sophisticated
+ * state transitions.
+ */
+ } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
+ if(UCNV_TO_U_USE_FALLBACK(cnv)) {
+ /* output BMP code point */
+ c=(UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+ break;
+ }
+ } else if(action==MBCS_STATE_UNASSIGNED) {
+ /* just fall through */
+ } else if(action==MBCS_STATE_ILLEGAL) {
+ /* callback(illegal) */
+ *pErrorCode=U_ILLEGAL_CHAR_FOUND;
+ } else {
+ /* reserved (must never occur), or only state change */
+ offset=0;
+ lastSource=source;
+ continue;
+ }
+
+ /* end of action codes: prepare for a new character */
+ offset=0;
+
+ if(U_FAILURE(*pErrorCode)) {
+ /* callback(illegal) */
+ break;
+ } else /* unassigned sequence */ {
+ /* defer to the generic implementation */
+ cnv->toUnicodeStatus=0;
+ cnv->mode=state;
+ pArgs->source=(const char *)lastSource;
+ return UCNV_GET_NEXT_UCHAR_USE_TO_U;
+ }
+ }
+ }
+
+ if(c<0) {
+ if(U_SUCCESS(*pErrorCode) && source==sourceLimit && lastSourcetoUBytes;
+ cnv->toULength=(int8_t)(source-lastSource);
+ do {
+ *bytes++=*lastSource++;
+ } while(lastSourcetoUnicodeStatus=0;
+ cnv->mode=state;
+
+ /* write back the updated pointer */
+ pArgs->source=(const char *)source;
+ return c;
+}
+
+#if 0
+/*
+ * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
+ * Removal improves code coverage.
+ */
+/**
+ * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
+ * It handles neither the EBCDIC swaplfnl option (set in UConverter) nor conversion extensions (_extToU()).
+ * Returns the code point for b; 0xfffe if unassigned or a fallback rejected by TO_U_USE_FALLBACK(useFallback); 0xffff if illegal or reserved.
+ */
+U_CFUNC UChar32
+_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
+ uint8_t b, UBool useFallback) {
+ int32_t entry;
+ uint8_t action;
+
+ entry=sharedData->table->mbcs.stateTable[0][b];
+ /* MBCS_ENTRY_IS_FINAL(entry) */
+
+ if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
+ /* output BMP code point */
+ return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+ }
+
+ /*
+ * An if-else-if chain provides more reliable performance for
+ * the most common cases compared to a switch.
+ */
+ action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
+ if(action==MBCS_STATE_VALID_DIRECT_20) {
+ /* output supplementary code point */
+ return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
+ } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
+ if(!TO_U_USE_FALLBACK(useFallback)) {
+ return 0xfffe;
+ }
+ /* output BMP code point */
+ return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
+ } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
+ if(!TO_U_USE_FALLBACK(useFallback)) {
+ return 0xfffe;
+ }
+ /* output supplementary code point */
+ return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
+ } else if(action==MBCS_STATE_UNASSIGNED) {
+ return 0xfffe;
+ } else if(action==MBCS_STATE_ILLEGAL) {
+ return 0xffff;
+ } else {
+ /* reserved, must never occur */
+ return 0xffff;
+ }
+}
+#endif
+
/*
* This is a simple version of getNextUChar() that is used
* by other converter implementations.
@@ -1945,6 +2064,8 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
return 0xffff;
}
+ /* ### TODO extension */
+
#if 0
/*
* Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
@@ -2054,61 +2175,6 @@ _MBCSSimpleGetNextUChar(UConverterSharedData *sharedData,
return 0xffff;
}
-#if 0
-/*
- * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus
- * Removal improves code coverage.
- */
-/**
- * This version of _MBCSSimpleGetNextUChar() is optimized for single-byte, single-state codepages.
- * It does not handle the EBCDIC swaplfnl option (set in UConverter).
- * It does not handle conversion extensions (_extToU()).
- */
-U_CFUNC UChar32
-_MBCSSingleSimpleGetNextUChar(UConverterSharedData *sharedData,
- uint8_t b, UBool useFallback) {
- int32_t entry;
- uint8_t action;
-
- entry=sharedData->table->mbcs.stateTable[0][b];
- /* MBCS_ENTRY_IS_FINAL(entry) */
-
- if(MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
- /* output BMP code point */
- return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
- }
-
- /*
- * An if-else-if chain provides more reliable performance for
- * the most common cases compared to a switch.
- */
- action=(uint8_t)(MBCS_ENTRY_FINAL_ACTION(entry));
- if(action==MBCS_STATE_VALID_DIRECT_20) {
- /* output supplementary code point */
- return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
- } else if(action==MBCS_STATE_FALLBACK_DIRECT_16) {
- if(!TO_U_USE_FALLBACK(useFallback)) {
- return 0xfffe;
- }
- /* output BMP code point */
- return (UChar)MBCS_ENTRY_FINAL_VALUE_16(entry);
- } else if(action==MBCS_STATE_FALLBACK_DIRECT_20) {
- if(!TO_U_USE_FALLBACK(useFallback)) {
- return 0xfffe;
- }
- /* output supplementary code point */
- return 0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
- } else if(action==MBCS_STATE_UNASSIGNED) {
- return 0xfffe;
- } else if(action==MBCS_STATE_ILLEGAL) {
- return 0xffff;
- } else {
- /* reserved, must never occur */
- return 0xffff;
- }
-}
-#endif
-
/* MBCS-from-Unicode conversion functions ----------------------------------- */
/* This version of _MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages. */
@@ -2251,7 +2317,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
- (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
+ pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
@@ -2454,7 +2520,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
- (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
+ pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
@@ -2681,7 +2747,7 @@ getTrail:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
- (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
+ pArgs->flush,
pErrorCode);
sourceIndex+=length+(int32_t)(source-lastSource);
lastSource=source;
@@ -2744,8 +2810,21 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
int32_t length, prevLength;
uint8_t unicodeMask;
- /* use optimized function if possible */
cnv=pArgs->converter;
+
+ if(cnv->preFromUFirstCP>=0) {
+ /*
+ * pass sourceIndex=-1 because we continue from an earlier buffer;
+ * in the future, this may change with continuous offsets
+ */
+ ucnv_extContinueMatchFromU(cnv, pArgs, -1, pErrorCode);
+
+ if(U_FAILURE(*pErrorCode) || cnv->preFromULength<0) {
+ return;
+ }
+ }
+
+ /* use optimized function if possible */
outputType=cnv->sharedData->table->mbcs.outputType;
unicodeMask=cnv->sharedData->table->mbcs.unicodeMask;
if(outputType==MBCS_OUTPUT_1 && !(unicodeMask&UCNV_HAS_SURROGATES)) {
@@ -2768,6 +2847,7 @@ _MBCSFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
offsets=pArgs->offsets;
table=cnv->sharedData->table->mbcs.fromUnicodeTable;
+
if((cnv->options&UCNV_OPTION_SWAP_LFNL)!=0) {
bytes=cnv->sharedData->table->mbcs.swapLFNLFromUnicodeBytes;
} else {
@@ -3025,7 +3105,7 @@ unassigned:
c, &source, sourceLimit,
(char **)&target, (char *)target+targetCapacity,
&offsets, sourceIndex,
- (UBool)UCNV_FROM_U_USE_FALLBACK(cnv, c), pArgs->flush,
+ pArgs->flush,
pErrorCode);
nextSourceIndex+=(int32_t)(source-pArgs->source);
prevLength=cnv->fromUnicodeStatus; /* restore SISO state */
@@ -3222,6 +3302,8 @@ _MBCSFromUChar32(UConverterSharedData *sharedData,
uint32_t value;
int32_t length;
+ /* ### TODO extension mapping */
+
/* BMP-only codepages are stored without stage 1 entries for supplementary code points */
if(c>=0x10000 && !(sharedData->table->mbcs.unicodeMask&UCNV_HAS_SUPPLEMENTARY)) {
return 0;
@@ -3404,7 +3486,11 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
int32_t length;
/* first, select between subChar and subChar1 */
- if(cnv->subChar1!=0 && cnv->invalidUCharBuffer[0]<=0xff) {
+ if( cnv->subChar1!=0 &&
+ (cnv->sharedData->table->mbcs.extIndexes!=NULL ?
+ cnv->useSubChar1 :
+ (cnv->invalidUCharBuffer[0]<=0xff))
+ ) {
/* select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS behavior) */
subchar=(char *)&cnv->subChar1;
length=1;
@@ -3414,6 +3500,9 @@ _MBCSWriteSub(UConverterFromUnicodeArgs *pArgs,
length=cnv->subCharLen;
}
+ /* reset the selector for the next code point */
+ cnv->useSubChar1=FALSE;
+
switch(cnv->sharedData->table->mbcs.outputType) {
case MBCS_OUTPUT_2_SISO:
p=buffer;
diff --git a/icu4c/source/common/ucnvmbcs.h b/icu4c/source/common/ucnvmbcs.h
index 3df87dcc57..5940af28ba 100644
--- a/icu4c/source/common/ucnvmbcs.h
+++ b/icu4c/source/common/ucnvmbcs.h
@@ -37,7 +37,11 @@
* At the moment, there are only variations of MBCS converters. They all have
* the same toUnicode structures, while the fromUnicode structures for SBCS
* differ from those for other MBCS-style converters.
- *
+ *
+ * _MBCSHeader.version 4.2 adds an optional conversion extension data structure.
+ * If it is present, then an ICU version reading header versions 4.0 or 4.1
+ * will be able to use the base table and ignore the extension.
+ *
* MBCS-style data structure following the static data.
* Offsets are counted in bytes from the beginning of the MBCS header structure.
* Details about usage in comments in ucnvmbcs.c.
@@ -45,61 +49,79 @@
* struct _MBCSHeader (see the definition in this header file below)
* contains 32-bit fields as follows:
* 8 values:
- * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.1.0.0)
+ * 0 uint8_t[4] MBCS version in UVersionInfo format (currently 4.2.0.0)
* 1 uint32_t countStates
* 2 uint32_t countToUFallbacks
* 3 uint32_t offsetToUCodeUnits
* 4 uint32_t offsetFromUTable
* 5 uint32_t offsetFromUBytes
* 6 uint32_t flags, bits:
- * 31.. 8 reserved
+ * 31.. 8 offsetExtension -- _MBCSHeader.version 4.2 (ICU 2.8) and higher
+ * 0 for older versions and if
+ * there is no extension structure
* 7.. 0 outputType
* 7 uint32_t fromUBytesLength -- _MBCSHeader.version 4.1 (ICU 2.4) and higher
* counts bytes in fromUBytes[]
*
- * int32_t stateTable[countStates][256];
+ * if(outputType==MBCS_OUTPUT_EXT_ONLY) {
+ * -- base table name for extension-only table
+ * char baseTableName[variable]; -- with NUL plus padding for 4-alignment
*
- * struct _MBCSToUFallback { (fallbacks are sorted by offset)
- * uint32_t offset;
- * UChar32 codePoint;
- * } toUFallbacks[countToUFallbacks];
- *
- * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
- * (padded to an even number of units)
- *
- * -- stage 1 tables
- * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
- * -- stage 1 table for all of Unicode
- * uint16_t fromUTable[0x440]; (32-bit-aligned)
+ * -- all _MBCSHeader fields except for version and flags are 0
* } else {
- * -- BMP-only tables have a smaller stage 1 table
- * uint16_t fromUTable[0x40]; (32-bit-aligned)
+ * -- normal base table with optional extension
+ *
+ * int32_t stateTable[countStates][256];
+ *
+ * struct _MBCSToUFallback { (fallbacks are sorted by offset)
+ * uint32_t offset;
+ * UChar32 codePoint;
+ * } toUFallbacks[countToUFallbacks];
+ *
+ * uint16_t unicodeCodeUnits[(offsetFromUTable-offsetToUCodeUnits)/2];
+ * (padded to an even number of units)
+ *
+ * -- stage 1 tables
+ * if(staticData.unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
+ * -- stage 1 table for all of Unicode
+ * uint16_t fromUTable[0x440]; (32-bit-aligned)
+ * } else {
+ * -- BMP-only tables have a smaller stage 1 table
+ * uint16_t fromUTable[0x40]; (32-bit-aligned)
+ * }
+ *
+ * -- stage 2 tables
+ * length determined by top of stage 1 and bottom of stage 3 tables
+ * if(outputType==MBCS_OUTPUT_1) {
+ * -- SBCS: pure indexes
+ * uint16_t stage 2 indexes[?];
+ * } else {
+ * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
+ * uint32_t stage 2 flags and indexes[?];
+ * }
+ *
+ * -- stage 3 tables with byte results
+ * if(outputType==MBCS_OUTPUT_1) {
+ * -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
+ * uint16_t fromUBytes[fromUBytesLength/2];
+ * } else {
+ * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
+ * uint8_t fromUBytes[fromUBytesLength]; or
+ * uint16_t fromUBytes[fromUBytesLength/2]; or
+ * uint32_t fromUBytes[fromUBytesLength/4];
+ * }
* }
*
- * -- stage 2 tables
- * length determined by top of stage 1 and bottom of stage 3 tables
- * if(outputType==MBCS_OUTPUT_1) {
- * -- SBCS: pure indexes
- * uint16_t stage 2 indexes[?];
- * } else {
- * -- DBCS, MBCS, EBCDIC_STATEFUL, ...: roundtrip flags and indexes
- * uint32_t stage 2 flags and indexes[?];
- * }
- *
- * -- stage 3 tables with byte results
- * if(outputType==MBCS_OUTPUT_1) {
- * -- SBCS: each 16-bit result contains flags and the result byte, see ucnvmbcs.c
- * uint16_t fromUBytes[fromUBytesLength/2];
- * } else {
- * -- DBCS, MBCS, EBCDIC_STATEFUL, ... 2/3/4 bytes result, see ucnvmbcs.c
- * uint8_t fromUBytes[fromUBytesLength]; or
- * uint16_t fromUBytes[fromUBytesLength/2]; or
- * uint32_t fromUBytes[fromUBytesLength/4];
- * }
+ * -- extension table, details see ucnv_ext.h
+ * int32_t indexes[>=32]; ...
*/
/* MBCS converter data and state -------------------------------------------- */
+enum {
+ MBCS_MAX_STATE_COUNT=128 /* presumably the upper bound for countStates in a state table -- TODO confirm */
+};
+
/**
* MBCS action codes for conversions to Unicode.
* These values are in bits 23..20 of the state table entries.
@@ -175,7 +197,11 @@ enum {
MBCS_OUTPUT_4_EUC, /* 9 */
MBCS_OUTPUT_2_SISO=12, /* c */
- MBCS_OUTPUT_2_HZ /* d */
+ MBCS_OUTPUT_2_HZ, /* d */
+
+ MBCS_OUTPUT_EXT_ONLY, /* e */
+
+ MBCS_OUTPUT_COUNT
};
/**
@@ -210,6 +236,9 @@ typedef struct UConverterMBCSTable {
/* converter name for swaplfnl */
char *swapLFNLName;
+
+ /* extension data */
+ const int32_t *extIndexes;
} UConverterMBCSTable;
/**
diff --git a/icu4c/source/common/unicode/ucnv.h b/icu4c/source/common/unicode/ucnv.h
index af2b70a909..eb5f948707 100644
--- a/icu4c/source/common/unicode/ucnv.h
+++ b/icu4c/source/common/unicode/ucnv.h
@@ -455,7 +455,7 @@ ucnv_safeClone(const UConverter *cnv,
UErrorCode *status);
/** @stable ICU 2.0 */
-#define U_CNV_SAFECLONE_BUFFERSIZE 3072
+#define U_CNV_SAFECLONE_BUFFERSIZE 4096
/**
* Deletes the unicode converter and releases resources associated
diff --git a/icu4c/source/test/testdata/conversion.txt b/icu4c/source/test/testdata/conversion.txt
index 036d1996a3..77d37a374d 100644
--- a/icu4c/source/test/testdata/conversion.txt
+++ b/icu4c/source/test/testdata/conversion.txt
@@ -43,6 +43,16 @@ conversion {
toUnicode {
Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" }
Cases {
+ // extensions
+ {
+ "*test3",
+ :bin{ 00050601020b0701020a01020c },
+ "\u20ac\x05\x06\x0b\U00101234\U00023456\ufffd",
+ :intvector{ 0, 1, 2, 3, 6, 6, 7, 7, 10 },
+ :int{1}, :int{0}, "", "?", :bin{""}
+ }
+
+ // normal conversions
{
"UTF-16LE",
:bin{ 310000d801dc00d902dc320000d8330001dc3400 },
@@ -110,6 +120,24 @@ conversion {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
+ // extensions
+ {
+ "*test3",
+ "\xc4\xc4\xc4\U00101234\xc4\xc4\U00101234\x05",
+ :bin{ ffffff070501020c },
+ :intvector{ 0, 1, 2, 3, 5, 5, 5, 5 },
+ :int{1}, :int{0}, "", "?", ""
+ }
+
+ {
+ "*test3",
+ "\U00101234\U00101234\U00050005\U00101234\U00050005\U00060006",
+ :bin{ 07070001020e05070001020f09 },
+ :intvector{ 0, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6 },
+ :int{1}, :int{0}, "", "?", ""
+ }
+
+ // normal conversions
{
"UTF-16LE",
"1\U00010001\U000500022\ud8003\udc014",
diff --git a/icu4c/source/test/testdata/test1.ucm b/icu4c/source/test/testdata/test1.ucm
index 9fe7d0fe33..219704b83c 100644
--- a/icu4c/source/test/testdata/test1.ucm
+++ b/icu4c/source/test/testdata/test1.ucm
@@ -1,18 +1,19 @@
# *******************************************************************************
-# * Copyright (C) 2001, International Business Machines
+# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test1.ucm
#
# Test file for MBCS conversion with single-byte codepage data.
+# Also contains extension mappings (m:n).
"test1"
1
1
"MBCS"
- \xff
- 0, 5-9, ff
+ \xff
+ 0, 5-9, ff
CHARMAP
diff --git a/icu4c/source/test/testdata/test3.ucm b/icu4c/source/test/testdata/test3.ucm
index f9e6ea85c0..aee69c2898 100644
--- a/icu4c/source/test/testdata/test3.ucm
+++ b/icu4c/source/test/testdata/test3.ucm
@@ -1,20 +1,21 @@
# *******************************************************************************
-# * Copyright (C) 2001, International Business Machines
+# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test3.ucm
#
# Test file for MBCS conversion with three-byte codepage data.
+# Also contains extension mappings (m:n).
"test3"
3
1
"MBCS"
- \xff
- 0, 1:1, 5-9, ff
- 2:2
- a-f.p
+ \xff
+ 0, 1:1, 5-9, ff
+ 2:2
+ a-f.p
CHARMAP
@@ -24,6 +25,11 @@ CHARMAP
# nothing special
\x05 |0
+# extensions
+ \x05+\x01\x02\x0d |0
+ \x05+\x01\x02\x0e |3
+ \x05+\xff |3
+
# toUnicode result is fallback direct
\x06 |3
@@ -31,8 +37,18 @@ CHARMAP
\x07 |0
\x08 |3
+# extensions
+++ \x07+\x00+\x01\x02\x0f+\x09 |0
++ \x07+\x00+\x01\x02\x0e+\x05 |0
++ \x07+\x00+\x01\x02\x0f+\x06 |0
++ \x07+\x00+\x01\x02\x0f |1
+
#unassigned \x09
+# extensions where the first code point is unassigned, for replay testing
+# \x09+\x09 |0
+ \x05+\x01\x02\x0c |0
+
# toUnicode result is surrogate pair: test real pair, single unit, unassigned
\x01\x02\x0a |0
\x01\x02\x0b |0
diff --git a/icu4c/source/test/testdata/test4.ucm b/icu4c/source/test/testdata/test4.ucm
index 9738964a3f..9313257c95 100644
--- a/icu4c/source/test/testdata/test4.ucm
+++ b/icu4c/source/test/testdata/test4.ucm
@@ -1,21 +1,21 @@
# *******************************************************************************
-# * Copyright (C) 2001, International Business Machines
+# * Copyright (C) 2001-2003, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
# test4.ucm
#
-# Test file for MBCS conversion with three-byte codepage data.
+# Test file for MBCS conversion with four-byte codepage data.
"test4"
4
1
"MBCS"
- \xff
- 0, 1:1, 5-9, ff
- 2:2
- 3:3
- a-f.p
+ \xff
+ 0, 1:1, 5-9, ff
+ 2:2
+ 3:3
+ a-f.p
CHARMAP
diff --git a/icu4c/source/tools/gennorm/store.c b/icu4c/source/tools/gennorm/store.c
index c78a3193df..49375bd64d 100644
--- a/icu4c/source/tools/gennorm/store.c
+++ b/icu4c/source/tools/gennorm/store.c
@@ -26,6 +26,7 @@
#include "unicode/udata.h"
#include "utrie.h"
#include "unicode/uset.h"
+#include "toolutil.h"
#include "unewdata.h"
#include "unormimp.h"
#include "gennorm.h"
@@ -86,87 +87,6 @@ setUnicodeVersion(const char *v) {
static int32_t indexes[_NORM_INDEX_TOP]={ 0 };
-/* tool memory helper ------------------------------------------------------- */
-
-/*
- * UToolMemory is used for generic, custom memory management.
- * It is allocated with enough space for count*size bytes starting
- * at array.
- * The array is declared with a union of large data types so
- * that its base address is aligned for any types.
- * If size is a multiple of a data type size, then such items
- * can be safely allocated inside the array, at offsets that
- * are themselves multiples of size.
- */
-typedef struct UToolMemory {
- char name[64];
- uint32_t count, size, index;
- union {
- uint32_t u;
- double d;
- void *p;
- } array[1];
-} UToolMemory;
-
-static UToolMemory *
-utm_open(const char *name, uint32_t count, uint32_t size) {
- UToolMemory *mem=(UToolMemory *)uprv_malloc(sizeof(UToolMemory)+count*size);
- if(mem==NULL) {
- fprintf(stderr, "error: %s - out of memory\n", name);
- exit(U_MEMORY_ALLOCATION_ERROR);
- }
- uprv_strcpy(mem->name, name);
- mem->count=count;
- mem->size=size;
- mem->index=0;
- return mem;
-}
-
-static void
-utm_close(UToolMemory *mem) {
- if(mem!=NULL) {
- uprv_free(mem);
- }
-}
-
-
-
-static void *
-utm_getStart(UToolMemory *mem) {
- return (char *)mem->array;
-}
-
-static int32_t
-utm_countItems(UToolMemory *mem) {
- return mem->index;
-}
-
-static void *
-utm_alloc(UToolMemory *mem) {
- char *p=(char *)mem->array+mem->index*mem->size;
- if(++mem->index<=mem->count) {
- uprv_memset(p, 0, mem->size);
- return p;
- } else {
- fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
- mem->name, (long)mem->count);
- exit(U_MEMORY_ALLOCATION_ERROR);
- }
-}
-
-static void *
-utm_allocN(UToolMemory *mem, int32_t n) {
- char *p=(char *)mem->array+mem->index*mem->size;
- if((mem->index+=(uint32_t)n)<=mem->count) {
- uprv_memset(p, 0, n*mem->size);
- return p;
- } else {
- fprintf(stderr, "error: %s - trying to use more than %ld preallocated units\n",
- mem->name, (long)mem->count);
- exit(U_MEMORY_ALLOCATION_ERROR);
- }
-}
-
/* builder data ------------------------------------------------------------- */
typedef void EnumTrieFn(void *context, uint32_t code, Norm *norm);
@@ -244,23 +164,23 @@ init() {
}
/* allocate Norm structures and reset the first one */
- normMem=utm_open("gennorm normalization structs", 20000, sizeof(Norm));
+ normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
norms=utm_alloc(normMem);
/* allocate UTF-32 string memory */
- utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 4);
+ utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
/* reset all "have seen" flags */
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
/* allocate extra data memory for UTF-16 decomposition strings and other values */
- extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, 2);
+ extraMem=utm_open("gennorm extra 16-bit memory", _NORM_EXTRA_INDEX_TOP, _NORM_EXTRA_INDEX_TOP, 2);
/* initialize the extraMem counter for the top of FNC strings */
p16=(uint16_t *)utm_alloc(extraMem);
*p16=1;
/* allocate temporary memory for combining triples */
- combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, sizeof(CombiningTriple));
+ combiningTriplesMem=utm_open("gennorm combining triples", 0x4000, 0x4000, sizeof(CombiningTriple));
/* set the minimum code points for no/maybe quick check values to the end of the BMP */
indexes[_NORM_INDEX_MIN_NFC_NO_MAYBE]=0xffff;
@@ -508,7 +428,7 @@ processCombining() {
triples=utm_getStart(combiningTriplesMem);
/* add lead and trail indexes to the triples for sorting */
- count=(uint16_t)combiningTriplesMem->index;
+ count=(uint16_t)utm_countItems(combiningTriplesMem);
for(i=0; iindex;
+ count=(int32_t)utm_countItems(normMem);
for(i=0; iindex;
+ count=utm_countItems(normMem);
for(i=0; iindex;
+ count=utm_countItems(combiningTriplesMem);
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
for(i=0; iindex;
+ count=utm_countItems(combiningTriplesMem);
c=s[0];
/* triples are not sorted by code point but for each lead CP there is one contiguous block */
@@ -1838,7 +1758,7 @@ generateData(const char *dataDir) {
canonStartSetsTop+=canonStartSets[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH];
/* make sure that the FCD trie is 4-aligned */
- if((extraMem->index+combiningTableTop)&1) {
+ if((utm_countItems(extraMem)+combiningTableTop)&1) {
combiningTable[combiningTableTop++]=0x1234; /* add one 16-bit word for an even number */
}
@@ -1850,7 +1770,7 @@ generateData(const char *dataDir) {
size=
_NORM_INDEX_TOP*4+
normTrieSize+
- extraMem->index*2+
+ utm_countItems(extraMem)*2+
combiningTableTop*2+
fcdTrieSize+
auxTrieSize+
@@ -1858,7 +1778,7 @@ generateData(const char *dataDir) {
if(beVerbose) {
printf("size of normalization trie %5u bytes\n", normTrieSize);
- printf("size of 16-bit extra memory %5u UChars/uint16_t\n", extraMem->index);
+ printf("size of 16-bit extra memory %5u UChars/uint16_t\n", utm_countItems(extraMem));
printf(" of that: FC_NFKC_Closure size %5u UChars/uint16_t\n", ((uint16_t *)utm_getStart(extraMem))[0]);
printf("size of combining table %5u uint16_t\n", combiningTableTop);
printf("size of FCD trie %5u bytes\n", fcdTrieSize);
@@ -1873,7 +1793,7 @@ generateData(const char *dataDir) {
}
indexes[_NORM_INDEX_TRIE_SIZE]=normTrieSize;
- indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)extraMem->index;
+ indexes[_NORM_INDEX_UCHAR_COUNT]=(uint16_t)utm_countItems(extraMem);
indexes[_NORM_INDEX_COMBINE_DATA_COUNT]=combiningTableTop;
indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop;
@@ -1900,7 +1820,7 @@ generateData(const char *dataDir) {
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, normTrieBlock, normTrieSize);
- udata_writeBlock(pData, utm_getStart(extraMem), extraMem->index*2);
+ udata_writeBlock(pData, utm_getStart(extraMem), utm_countItems(extraMem)*2);
udata_writeBlock(pData, combiningTable, combiningTableTop*2);
udata_writeBlock(pData, fcdTrieBlock, fcdTrieSize);
udata_writeBlock(pData, auxTrieBlock, auxTrieSize);
@@ -1928,7 +1848,7 @@ extern void
cleanUpData(void) {
int32_t i, count;
- count=(int32_t)normMem->index;
+ count=utm_countItems(normMem);
for(i=0; i
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "ucnv_cnv.h"
+#include "ucnvmbcs.h"
+#include "toolutil.h"
+#include "unewdata.h"
+#include "ucm.h"
+#include "makeconv.h"
+#include "genmbcs.h"
+
+#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
+
+static void
+CnvExtClose(NewConverter *cnvData);
+
+static UBool
+CnvExtIsValid(NewConverter *cnvData,
+ const uint8_t *bytes, int32_t length);
+
+static UBool
+CnvExtAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
+
+static uint32_t
+CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType);
+
+typedef struct CnvExtData {
+ NewConverter newConverter;
+
+ UCMFile *ucm;
+
+ /* toUnicode (state table in ucm->states) */
+ UToolMemory *toUTable, *toUUChars;
+
+ /* fromUnicode */
+ UToolMemory *fromUTableUChars, *fromUTableValues, *fromUBytes;
+
+ uint16_t stage1[MBCS_STAGE_1_SIZE];
+ uint16_t stage2[MBCS_STAGE_2_SIZE];
+ uint16_t stage3[0x10000< |2 mappings */
+ uint16_t stage3Sub1Block;
+} CnvExtData;
+
+/*
+ * Create the builder object for a conversion extension table.
+ *
+ * ucm is stored as an alias only; this object does not take ownership of it.
+ * The returned NewConverter is the first member of a zero-initialized
+ * CnvExtData whose close/isValid/addTable/write callbacks are set to the
+ * CnvExt* functions of this file.
+ *
+ * Returns NULL if the CnvExtData cannot be allocated.
+ */
+NewConverter *
+CnvExtOpen(UCMFile *ucm) {
+    CnvExtData *extData;
+
+    extData=(CnvExtData *)uprv_malloc(sizeof(CnvExtData));
+    if(extData==NULL) {
+        /*
+         * report the failure explicitly instead of forming a member address
+         * from a NULL pointer
+         */
+        return NULL;
+    }
+
+    uprv_memset(extData, 0, sizeof(CnvExtData));
+
+    extData->ucm=ucm; /* aliased, not owned */
+
+    extData->newConverter.close=CnvExtClose;
+    extData->newConverter.isValid=CnvExtIsValid;
+    extData->newConverter.addTable=CnvExtAddTable;
+    extData->newConverter.write=CnvExtWrite;
+
+    return &extData->newConverter;
+}
+
+/*
+ * close callback: release everything owned by a CnvExtData.
+ *
+ * cnvData is the NewConverter embedded as the first member of the CnvExtData
+ * that CnvExtOpen() allocated, so it is cast back to the containing struct.
+ * The five tool-memory pools are closed and then the struct itself is freed.
+ * The UCMFile in extData->ucm is only aliased and is therefore left alone.
+ * A NULL cnvData is ignored.
+ */
+static void
+CnvExtClose(NewConverter *cnvData) {
+    CnvExtData *extData=(CnvExtData *)cnvData;
+    if(extData!=NULL) {
+        utm_close(extData->toUTable);
+        utm_close(extData->toUUChars);
+        utm_close(extData->fromUTableUChars);
+        utm_close(extData->fromUTableValues);
+        utm_close(extData->fromUBytes);
+
+        /* the struct came from uprv_malloc() in CnvExtOpen(); do not leak it */
+        uprv_free(extData);
+    }
+}
+
+/*
+ * isValid callback for the extension-table builder.
+ * It is not expected to be called; it inspects none of its arguments and
+ * always answers FALSE.
+ */
+static UBool
+CnvExtIsValid(NewConverter *cnvData,
+        const uint8_t *bytes, int32_t length) {
+    /* the arguments are intentionally unused */
+    (void)cnvData;
+    (void)bytes;
+    (void)length;
+
+    return FALSE;
+}
+
+static uint32_t
+CnvExtWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType) {
+ CnvExtData *extData=(CnvExtData *)cnvData;
+ int32_t length, top, headerSize;
+
+ int32_t indexes[UCNV_EXT_INDEXES_MIN_LENGTH]={ 0 };
+
+ if(tableType&TABLE_BASE) {
+ headerSize=0;
+ } else {
+ _MBCSHeader header={ 0 };
+
+ /* write the header and base table name for an extension-only table */
+ length=uprv_strlen(extData->ucm->baseName)+1;
+ while(length&3) {
+ /* add padding */
+ extData->ucm->baseName[length++]=0;
+ }
+
+ headerSize=sizeof(header)+length;
+
+ /* fill the header */
+ header.version[0]=4;
+ header.version[1]=2;
+ header.flags=(uint32_t)((headerSize<<8)|MBCS_OUTPUT_EXT_ONLY);
+
+ /* write the header and the base table name */
+ udata_writeBlock(pData, &header, sizeof(header));
+ udata_writeBlock(pData, extData->ucm->baseName, length);
+ }
+
+ /* fill indexes[] - offsets/indexes are in units of the target array */
+ top=0;
+
+ indexes[UCNV_EXT_INDEXES_LENGTH]=length=UCNV_EXT_INDEXES_MIN_LENGTH;
+ top+=length*4;
+
+ indexes[UCNV_EXT_TO_U_INDEX]=top;
+ indexes[UCNV_EXT_TO_U_LENGTH]=length=utm_countItems(extData->toUTable);
+ top+=length*4;
+
+ indexes[UCNV_EXT_TO_U_UCHARS_INDEX]=top;
+ indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]=length=utm_countItems(extData->toUUChars);
+ top+=length*2;
+
+ indexes[UCNV_EXT_FROM_U_UCHARS_INDEX]=top;
+ length=utm_countItems(extData->fromUTableUChars);
+ top+=length*2;
+
+ if(top&3) {
+ /* add padding */
+ *((UChar *)utm_alloc(extData->fromUTableUChars))=0;
+ *((uint32_t *)utm_alloc(extData->fromUTableValues))=0;
+ ++length;
+ top+=2;
+ }
+ indexes[UCNV_EXT_FROM_U_LENGTH]=length;
+
+ indexes[UCNV_EXT_FROM_U_VALUES_INDEX]=top;
+ top+=length*4;
+
+ indexes[UCNV_EXT_FROM_U_BYTES_INDEX]=top;
+ length=utm_countItems(extData->fromUBytes);
+ top+=length;
+
+ if(top&1) {
+ /* add padding */
+ *((uint8_t *)utm_alloc(extData->fromUBytes))=0;
+ ++length;
+ ++top;
+ }
+ indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]=length;
+
+ indexes[UCNV_EXT_FROM_U_STAGE_12_INDEX]=top;
+ indexes[UCNV_EXT_FROM_U_STAGE_1_LENGTH]=length=extData->stage1Top;
+ indexes[UCNV_EXT_FROM_U_STAGE_12_LENGTH]=length+=extData->stage2Top;
+ top+=length*2;
+
+ indexes[UCNV_EXT_FROM_U_STAGE_3_INDEX]=top;
+ length=extData->stage3Top;
+ top+=length*2;
+
+ if(top&3) {
+ /* add padding */
+ extData->stage3[extData->stage3Top++]=0;
+ ++length;
+ top+=2;
+ }
+ indexes[UCNV_EXT_FROM_U_STAGE_3_LENGTH]=length;
+
+ indexes[UCNV_EXT_FROM_U_STAGE_3B_INDEX]=top;
+ indexes[UCNV_EXT_FROM_U_STAGE_3B_LENGTH]=length=extData->stage3bTop;
+ top+=length*4;
+
+ indexes[UCNV_EXT_SIZE]=top;
+
+ /* write the extension data */
+ udata_writeBlock(pData, indexes, sizeof(indexes));
+ udata_writeBlock(pData, utm_getStart(extData->toUTable), indexes[UCNV_EXT_TO_U_LENGTH]*4);
+ udata_writeBlock(pData, utm_getStart(extData->toUUChars), indexes[UCNV_EXT_TO_U_UCHARS_LENGTH]*2);
+
+ udata_writeBlock(pData, utm_getStart(extData->fromUTableUChars), indexes[UCNV_EXT_FROM_U_LENGTH]*2);
+ udata_writeBlock(pData, utm_getStart(extData->fromUTableValues), indexes[UCNV_EXT_FROM_U_LENGTH]*4);
+ udata_writeBlock(pData, utm_getStart(extData->fromUBytes), indexes[UCNV_EXT_FROM_U_BYTES_LENGTH]);
+
+ udata_writeBlock(pData, extData->stage1, extData->stage1Top*2);
+ udata_writeBlock(pData, extData->stage2, extData->stage2Top*2);
+ udata_writeBlock(pData, extData->stage3, extData->stage3Top*2);
+ udata_writeBlock(pData, extData->stage3b, extData->stage3bTop*4);
+
+ {
+ int32_t i, j;
+
+ length=extData->stage1Top;
+ printf("\nstage1[%x]:\n", length);
+
+ for(i=0; istage1[i]!=length) {
+ printf("stage1[%04x]=%04x\n", i, extData->stage1[i]);
+ }
+ }
+
+ j=length;
+ length=extData->stage2Top;
+ printf("\nstage2[%x]:\n", length);
+
+ for(i=0; istage2[i]!=0) {
+ printf("stage12[%04x]=%04x\n", j, extData->stage2[i]);
+ }
+ }
+
+ length=extData->stage3Top;
+ printf("\nstage3[%x]:\n", length);
+
+ for(i=0; istage3[i]!=0) {
+ printf("stage3[%04x]=%04x\n", i, extData->stage3[i]);
+ }
+ }
+
+ length=extData->stage3bTop;
+ printf("\nstage3b[%x]:\n", length);
+
+ for(i=0; istage3b[i]!=0) {
+ printf("stage3b[%04x]=%08x\n", i, extData->stage3b[i]);
+ }
+ }
+ }
+
+ if(VERBOSE) {
+ printf("size of extension data: %ld\n", top);
+ }
+
+ /* return the number of bytes that should have been written */
+ return (uint32_t)(headerSize+top);
+}
+
+/* to Unicode --------------------------------------------------------------- */
+
+/*
+ * Remove fromUnicode fallbacks and SUB mappings which are irrelevant for
+ * the toUnicode table.
+ * The table must be sorted.
+ * Destroys previous data in the reverseMap.
+ */
+static int32_t
+reduceToUMappings(UCMTable *table) {
+ UCMapping *mappings;
+ int32_t *map;
+ int32_t i, j, count;
+ int8_t flag;
+
+ mappings=table->mappings;
+ map=table->reverseMap;
+ count=table->mappingsLength;
+
+ /* leave the map alone for the initial mappings with desired flags */
+ for(i=j=0; iuLen==1) {
+ value=(uint32_t)(UCNV_EXT_TO_U_MIN_CODE_POINT+m->u);
+ } else {
+ /* the parser enforces m->uLen<=UCNV_EXT_MAX_UCHARS */
+
+ /* get the result code point string and its 16-bit string length */
+ u32=UCM_GET_CODE_POINTS(table, m);
+ errorCode=U_ZERO_ERROR;
+ u_strFromUTF32(NULL, 0, &u16Length, u32, m->uLen, &errorCode);
+ if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
+ exit(errorCode);
+ }
+
+ /* allocate it and put its length and index into the value */
+ value=
+ (((uint32_t)m->uLen+UCNV_EXT_TO_U_LENGTH_OFFSET)<toUUChars));
+ u=utm_allocN(extData->toUUChars, u16Length);
+
+ /* write the result 16-bit string */
+ errorCode=U_ZERO_ERROR;
+ u_strFromUTF32(u, u16Length, NULL, u32, m->uLen, &errorCode);
+ if(U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) {
+ exit(errorCode);
+ }
+ }
+ if(m->f==0) {
+ value|=UCNV_EXT_TO_U_ROUNDTRIP_FLAG;
+ }
+ return value;
+}
+
+/*
+ * Recursive toUTable generator core function.
+ * Preconditions:
+ * - start0: if there is one mapping with an input unit sequence of unitIndex+1
+ * then defaultValue=compute the mapping result for this whole sequence
+ * else defaultValue=0
+ *
+ * recurse into the subsection
+ */
+static UBool
+generateToUTable(CnvExtData *extData, UCMTable *table,
+ int32_t start, int32_t limit, int32_t unitIndex,
+ uint32_t defaultValue) {
+ UCMapping *mappings, *m;
+ int32_t *map;
+ int32_t i, j, uniqueCount, count, subStart, subLimit;
+
+ uint8_t *bytes;
+ int32_t low, high, prev;
+
+ uint32_t *section;
+
+ mappings=table->mappings;
+ map=table->reverseMap;
+
+ /* step 1: examine the input units; set low, high, uniqueCount */
+ m=mappings+map[start];
+ bytes=UCM_GET_BYTES(table, m);
+ low=bytes[unitIndex];
+ uniqueCount=1;
+
+ prev=high=low;
+ for(i=start+1; i=(3*count)/4) {
+ /*
+ * for the root table and for fairly full tables:
+ * allocate for direct, linear array access
+ * by keeping count, to write an entry for each unit value
+ * from low to high
+ */
+ } else {
+ count=uniqueCount;
+ }
+
+ /* allocate the section: 1 entry for the header + count for the items */
+ section=(uint32_t *)utm_allocN(extData->toUTable, 1+count);
+
+ /* write the section header */
+ *section++=((uint32_t)count<uniqueCount) {
+ /* write empty subsections for unused units in a linear table */
+ while(++prevbLen==unitIndex+1) {
+ /* do not include this in generateToUTable() */
+ ++subStart;
+
+ if(subStarttoUTable);
+
+ /* recurse */
+ if(!generateToUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
+ return FALSE;
+ }
+ }
+ }
+ return TRUE;
+}
+
+/*
+ * Build the toUnicode extension data (toUTable and toUUChars) in extData
+ * from the mappings of the input table.
+ * Expects a sorted table in which every precision flag is in 0..3.
+ * The table's reverseMap gets modified in the process.
+ * Returns whatever generateToUTable() reports for the whole reduced range.
+ */
+static UBool
+makeToUTable(CnvExtData *extData, UCMTable *table) {
+    /* first narrow the table down to the mappings relevant for toUnicode */
+    const int32_t relevantCount=reduceToUMappings(table);
+
+    /* pool of 32-bit toUTable words */
+    extData->toUTable=
+        utm_open("cnv extension toUTable", 0x10000, UCNV_EXT_TO_U_MIN_CODE_POINT, 4);
+
+    /* pool of 16-bit units for multi-code point results */
+    extData->toUUChars=
+        utm_open("cnv extension toUUChars", 0x10000, UCNV_EXT_TO_U_INDEX_MASK+1, 2);
+
+    /* root section: all reduced mappings, input unit index 0, default value 0 */
+    return generateToUTable(extData, table, 0, relevantCount, 0, 0);
+}
+
+/* from Unicode ------------------------------------------------------------- */
+
+/*
+ * preprocessing:
+ * rebuild reverseMap with mapping indexes for mappings relevant for from Unicode
+ * change each Unicode string to encode all but the first code point in 16-bit form
+ *
+ * generation:
+ * for each unique code point
+ * write an entry in the 3-stage trie
+ * check that there is only one single-code point sequence
+ * start recursion for following 16-bit input units
+ */
+
+/*
+ * Remove toUnicode fallbacks and non-<subchar1> SUB mappings
+ * which are irrelevant for the fromUnicode extension table.
+ * Overwrite the reverseMap with an index array to the relevant mappings.
+ * Modify the code point sequences to a generator-friendly format where
+ * the first code points remains unchanged but the following are recoded
+ * into 16-bit Unicode string form.
+ * The table must be sorted.
+ * Destroys previous data in the reverseMap.
+ */
+static int32_t
+prepareFromUMappings(UCMTable *table) {
+ UCMapping *mappings, *m;
+ int32_t *map;
+ int32_t i, j, count;
+ int8_t flag;
+
+ mappings=table->mappings;
+ map=table->reverseMap;
+ count=table->mappingsLength;
+
+ /*
+ * we do not go through the map on input because the mappings are
+ * sorted lexically
+ */
+ m=mappings;
+
+ for(i=j=0; if;
+ if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
+ map[j++]=i;
+
+ if(m->uLen>1) {
+ /* recode all but the first code point to 16-bit Unicode */
+ UChar32 *u32;
+ UChar *u;
+ UChar32 c;
+ int32_t q, r;
+
+ u32=UCM_GET_CODE_POINTS(table, m);
+ u=(UChar *)u32; /* destructive in-place recoding */
+ for(r=2, q=1; quLen; ++q) {
+ c=u32[q];
+ U16_APPEND_UNSAFE(u, r, c);
+ }
+
+ /* counts the first code point always at 2 - the first 16-bit unit is at 16-bit index 2 */
+ m->uLen=(int8_t)r;
+ }
+ }
+ }
+
+ return j;
+}
+
+static uint32_t
+getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
+ uint8_t *bytes, *resultBytes;
+ uint32_t value;
+
+ if(m->f==2) {
+ return UCNV_EXT_FROM_U_SUBCHAR1; /* SUB mapping */
+ }
+
+ bytes=UCM_GET_BYTES(table, m);
+ value=0;
+ switch(m->bLen) {
+ /* 1..3: store the bytes in the value word */
+ case 3:
+ value=((uint32_t)*bytes++)<<16;
+ case 2:
+ value|=((uint32_t)*bytes++)<<8;
+ case 1:
+ value|=*bytes;
+ break;
+ default:
+ /* the parser enforces m->bLen<=UCNV_EXT_MAX_BYTES */
+ /* store the bytes in fromUBytes[] and the index in the value word */
+ value=(uint32_t)utm_countItems(extData->fromUBytes);
+ resultBytes=utm_allocN(extData->fromUBytes, m->bLen);
+ uprv_memcpy(resultBytes, bytes, m->bLen);
+ break;
+ }
+ value|=(uint32_t)m->bLen<f==0) {
+ value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
+ }
+ return value;
+}
+
+/*
+ * works like generateToUTable(), except that the
+ * output section consists of two arrays, one for input UChars and one
+ * for result values
+ *
+ * also, fromUTable sections are always stored in a compact form for
+ * access via binary search
+ */
+static UBool
+generateFromUTable(CnvExtData *extData, UCMTable *table,
+ int32_t start, int32_t limit, int32_t unitIndex,
+ uint32_t defaultValue) {
+ UCMapping *mappings, *m;
+ int32_t *map;
+ int32_t i, j, uniqueCount, count, subStart, subLimit;
+
+ UChar *uchars;
+ UChar32 low, high, prev;
+
+ UChar *sectionUChars;
+ uint32_t *sectionValues;
+
+ mappings=table->mappings;
+ map=table->reverseMap;
+
+ /* step 1: examine the input units; set low, high, uniqueCount */
+ m=mappings+map[start];
+ uchars=(UChar *)UCM_GET_CODE_POINTS(table, m);
+ low=uchars[unitIndex];
+ uniqueCount=1;
+
+ prev=high=low;
+ for(i=start+1; ifromUTableUChars, 1+count);
+ sectionValues=(uint32_t *)utm_allocN(extData->fromUTableValues, 1+count);
+
+ /* write the section header */
+ *sectionUChars++=(UChar)count;
+ *sectionValues++=defaultValue;
+
+ /* step 3: write temporary section table with subsection starts */
+ prev=low-1; /* just before low to prevent empty subsections before low */
+ j=0; /* section table index */
+ for(i=start; iuLen==unitIndex+1) {
+ /* do not include this in generateToUTable() */
+ ++subStart;
+
+ if(subStartfromUTableValues);
+
+ /* recurse */
+ if(!generateFromUTable(extData, table, subStart, subLimit, unitIndex+1, defaultValue)) {
+ return FALSE;
+ }
+ }
+ }
+ return TRUE;
+}
+
+/*
+ * add entries to the fromUnicode trie,
+ * assume to be called with code points in ascending order
+ * and use that to build the trie in precompacted form
+ */
+static void
+addFromUTrieEntry(CnvExtData *extData, UChar32 c, uint32_t value) {
+ int32_t i1, i2, i3, i3b, nextOffset, min, newBlock;
+
+ if(value==0) {
+ return;
+ }
+
+ /*
+ * compute the index for each stage,
+ * allocate a stage block if necessary,
+ * and write the stage value
+ */
+ i1=c>>10;
+ if(i1>=extData->stage1Top) {
+ extData->stage1Top=i1+1;
+ }
+
+ nextOffset=(c>>4)&0x3f;
+
+ if(extData->stage1[i1]==0) {
+ /* allocate another block in stage 2; overlap with the previous block */
+ newBlock=extData->stage2Top;
+ min=newBlock-nextOffset; /* minimum block start with overlap */
+ while(minstage2[newBlock-1]==0) {
+ --newBlock;
+ }
+
+ extData->stage1[i1]=(uint16_t)newBlock;
+ extData->stage2Top=newBlock+MBCS_STAGE_2_BLOCK_SIZE;
+ if(extData->stage2Top>LENGTHOF(extData->stage2)) {
+ fprintf(stderr, "error: too many stage 2 entries at U+%04x\n", c);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+ }
+
+ i2=extData->stage1[i1]+nextOffset;
+ nextOffset=c&0xf;
+
+ if(extData->stage2[i2]==0) {
+ /* allocate another block in stage 3; overlap with the previous block */
+ newBlock=extData->stage3Top;
+ min=newBlock-nextOffset; /* minimum block start with overlap */
+ while(minstage3[newBlock-1]==0) {
+ --newBlock;
+ }
+
+ /* round up to a multiple of stage 3 granularity >1 (similar to utrie.c) */
+ newBlock=(newBlock+(UCNV_EXT_STAGE_3_GRANULARITY-1))&~(UCNV_EXT_STAGE_3_GRANULARITY-1);
+ extData->stage2[i2]=(uint16_t)(newBlock>>UCNV_EXT_STAGE_2_LEFT_SHIFT);
+
+ extData->stage3Top=newBlock+MBCS_STAGE_3_BLOCK_SIZE;
+ if(extData->stage3Top>LENGTHOF(extData->stage3)) {
+ fprintf(stderr, "error: too many stage 3 entries at U+%04x\n", c);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+ }
+
+ i3=((int32_t)extData->stage2[i2]<stage3[i3]==0 because we get
+ * code points in strictly ascending order
+ */
+
+ if(value==UCNV_EXT_FROM_U_SUBCHAR1) {
+ /* SUB mapping, see getFromUBytesValue() and prepareFromUMappings() */
+ extData->stage3[i3]=1;
+
+ /*
+ * precompaction is not optimal for <subchar1> |2 mappings because
+ * stage3 values for them are all the same, unlike for other mappings
+ * which all have unique values;
+ * use a simple compaction of reusing a whole block filled with these
+ * mappings
+ */
+
+ /* is the entire block filled with <subchar1> |2 mappings? */
+ if(nextOffset==MBCS_STAGE_3_BLOCK_SIZE-1) {
+ for(min=i3-nextOffset;
+ minstage3[min]==1;
+ ++min) {}
+
+ if(min==i3) {
+ /* the entire block is filled with these mappings */
+ if(extData->stage3Sub1Block!=0) {
+ /* point to the previous such block and remove this block from stage3 */
+ extData->stage2[i2]=extData->stage3Sub1Block;
+ extData->stage3Top-=MBCS_STAGE_3_BLOCK_SIZE;
+ uprv_memset(extData->stage3+extData->stage3Top, 0, MBCS_STAGE_3_BLOCK_SIZE*2);
+ } else {
+ /* remember this block's stage2 entry */
+ extData->stage3Sub1Block=extData->stage2[i2];
+ }
+ }
+ }
+ } else {
+ if((i3b=extData->stage3bTop++)>=LENGTHOF(extData->stage3b)) {
+ fprintf(stderr, "error: too many stage 3b entries at U+%04x\n", c);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ /* roundtrip or fallback mapping */
+ extData->stage3[i3]=(uint16_t)i3b;
+ extData->stage3b[i3b]=value;
+ }
+}
+
+static UBool
+generateFromUTrie(CnvExtData *extData, UCMTable *table, int32_t mapLength) {
+ UCMapping *mappings, *m;
+ int32_t *map;
+ uint32_t value;
+ int32_t subStart, subLimit;
+
+ UChar32 *codePoints;
+ UChar32 c, next;
+
+ if(mapLength==0) {
+ return TRUE;
+ }
+
+ mappings=table->mappings;
+ map=table->reverseMap;
+
+ /*
+ * iterate over same-initial-code point mappings,
+ * enter the initial code point into the trie,
+ * and start a recursion on the corresponding mappings section
+ * with generateFromUTable()
+ */
+ m=mappings+map[0];
+ codePoints=UCM_GET_CODE_POINTS(table, m);
+ next=codePoints[0];
+ subLimit=0;
+ while(subLimituLen==1) {
+ /* do not include this in generateFromUTable() */
+ ++subStart;
+
+ if(subStartfromUTableValues));
+
+ /* recurse, starting from 16-bit-unit index 2, the first 16-bit unit after c */
+ if(!generateFromUTable(extData, table, subStart, subLimit, 2, value)) {
+ return FALSE;
+ }
+ }
+ }
+ return TRUE;
+}
+
+/*
+ * Generate the fromU data structures from the input table.
+ * The input table must be sorted, and all precision flags must be 0..3.
+ * This function will modify the table's reverseMap.
+ */
+static UBool
+makeFromUTable(CnvExtData *extData, UCMTable *table) {
+ uint16_t *stage1;
+ int32_t i, stage1Top, fromUCount;
+
+ fromUCount=prepareFromUMappings(table);
+
+ extData->fromUTableUChars=utm_open("cnv extension fromUTableUChars", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 2);
+ extData->fromUTableValues=utm_open("cnv extension fromUTableValues", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 4);
+ extData->fromUBytes=utm_open("cnv extension fromUBytes", 0x10000, UCNV_EXT_FROM_U_DATA_MASK+1, 1);
+
+ /* allocate all-unassigned stage blocks */
+ extData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED;
+ extData->stage3Top=MBCS_STAGE_3_FIRST_ASSIGNED;
+
+ /*
+ * stage 3b stores only unique values, and in
+ * index 0: 0 for "no mapping"
+ * index 1: "no mapping" with preference for <subchar1> rather than <subchar>
+ */
+ extData->stage3b[1]=UCNV_EXT_FROM_U_SUBCHAR1;
+ extData->stage3bTop=2;
+
+ /* allocate the first entry in the fromUTable because index 0 means "no result" */
+ utm_alloc(extData->fromUTableUChars);
+ utm_alloc(extData->fromUTableValues);
+
+ if(!generateFromUTrie(extData, table, fromUCount)) {
+ return FALSE;
+ }
+
+ /*
+ * offset the stage 1 trie entries by stage1Top because they will
+ * be stored in a single array
+ */
+ stage1=extData->stage1;
+ stage1Top=extData->stage1Top;
+ for(i=0; iunicodeMask=table->unicodeMask;
+ if(staticData->unicodeMask&UCNV_HAS_SURROGATES) {
+ fprintf(stderr, "error: contains mappings for surrogate code points\n");
+ return FALSE;
+ }
+
+ staticData->conversionType=UCNV_MBCS;
+
+ extData=(CnvExtData *)cnvData;
+
+ /*
+ * assume that the table is sorted
+ *
+ * call the functions in this order because
+ * makeToUTable() modifies the original reverseMap,
+ * makeFromUTable() writes a whole new mapping into reverseMap
+ */
+ return
+ makeToUTable(extData, table) &&
+ makeFromUTable(extData, table);
+}
diff --git a/icu4c/source/tools/makeconv/genmbcs.c b/icu4c/source/tools/makeconv/genmbcs.c
index dfd0174aaf..fbe9922952 100644
--- a/icu4c/source/tools/makeconv/genmbcs.c
+++ b/icu4c/source/tools/makeconv/genmbcs.c
@@ -21,49 +21,26 @@
#include "unewdata.h"
#include "ucnv_cnv.h"
#include "ucnvmbcs.h"
+#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
-enum {
- MBCS_STATE_FLAG_DIRECT=1,
- MBCS_STATE_FLAG_SURROGATES,
-
- MBCS_STATE_FLAG_READY=16
-};
-
-enum {
- MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
- MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
- MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
- MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
- MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
- MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
-
- MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
- MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
-
- MBCS_MAX_STATE_COUNT=128,
- MBCS_MAX_FALLBACK_COUNT=8192
-};
-
typedef struct MBCSData {
NewConverter newConverter;
- /* toUnicode */
- int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
- uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
- stateOffsetSum[MBCS_MAX_STATE_COUNT];
+ UCMFile *ucm;
+
+ /* toUnicode (state table in ucm->states) */
_MBCSToUFallback toUFallbacks[MBCS_MAX_FALLBACK_COUNT];
+ int32_t countToUFallbacks;
uint16_t *unicodeCodeUnits;
- _MBCSHeader header;
- int32_t countToUCodeUnits;
/* fromUnicode */
uint16_t stage1[MBCS_STAGE_1_SIZE];
uint16_t stage2Single[MBCS_STAGE_2_SIZE]; /* stage 2 for single-byte codepages */
uint32_t stage2[MBCS_STAGE_2_SIZE]; /* stage 2 for MBCS */
uint8_t *fromUBytes;
- uint32_t stage2Top, stage3Top, maxCharLength;
+ uint32_t stage2Top, stage3Top;
} MBCSData;
/* prototypes */
@@ -71,64 +48,80 @@ static void
MBCSClose(NewConverter *cnvData);
static UBool
-MBCSProcessStates(NewConverter *cnvData);
+MBCSStartMappings(MBCSData *mbcsData);
static UBool
-MBCSAddToUnicode(NewConverter *cnvData,
+MBCSAddToUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static UBool
MBCSIsValid(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- uint32_t b);
+ const uint8_t *bytes, int32_t length);
static UBool
-MBCSSingleAddFromUnicode(NewConverter *cnvData,
+MBCSSingleAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static UBool
-MBCSAddFromUnicode(NewConverter *cnvData,
+MBCSAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
+ UChar32 c,
+ int8_t flag);
static void
-MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData);
+MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData);
+
+static UBool
+MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
static uint32_t
-MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
+MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType);
+
+/* helper ------------------------------------------------------------------- */
+
+/*
+ * Map a value to one lowercase hexadecimal digit character:
+ * 0..9 yield '0'..'9', larger values are offset from 'a' (10 yields 'a').
+ * Callers in this file pass 4-bit values.
+ */
+static U_INLINE char
+hexDigit(uint8_t digit) {
+    if(digit<=9) {
+        return (char)('0'+digit);
+    }
+    return (char)('a'+(digit-10));
+}
+
+/*
+ * Write length bytes as lowercase hex, two digits per byte and without
+ * separators, into buffer and NUL-terminate the result.
+ * buffer must have room for 2*length+1 chars; no bounds check is done here.
+ * A length of 0 or less produces an empty string.
+ * Returns buffer.
+ */
+static U_INLINE char *
+printBytes(char *buffer, const uint8_t *bytes, int32_t length) {
+    char *out=buffer;
+    int32_t i;
+
+    for(i=0; i<length; ++i) {
+        const uint8_t b=bytes[i];
+
+        *out++=hexDigit((uint8_t)(b>>4));   /* high nibble first */
+        *out++=hexDigit((uint8_t)(b&0xf));  /* then the low nibble */
+    }
+
+    *out=0;
+    return buffer;
+}
/* implementation ----------------------------------------------------------- */
static void
-MBCSInit(MBCSData *mbcsData, uint8_t maxCharLength) {
- int i;
+MBCSInit(MBCSData *mbcsData, UCMFile *ucm) {
+ int32_t i, maxCharLength;
uprv_memset(mbcsData, 0, sizeof(MBCSData));
+ maxCharLength=ucm->states.maxCharLength;
+
+ mbcsData->ucm=ucm; /* aliased, not owned */
+
mbcsData->newConverter.close=MBCSClose;
- mbcsData->newConverter.startMappings=MBCSProcessStates;
mbcsData->newConverter.isValid=MBCSIsValid;
- mbcsData->newConverter.addToUnicode=MBCSAddToUnicode;
- if(maxCharLength==1) {
- mbcsData->newConverter.addFromUnicode=MBCSSingleAddFromUnicode;
- } else {
- mbcsData->newConverter.addFromUnicode=MBCSAddFromUnicode;
- }
- mbcsData->newConverter.finishMappings=MBCSPostprocess;
+ mbcsData->newConverter.addTable=MBCSAddTable;
mbcsData->newConverter.write=MBCSWrite;
- mbcsData->header.version[0]=4;
- mbcsData->header.version[1]=1;
- mbcsData->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
mbcsData->stage2Top=MBCS_STAGE_2_FIRST_ASSIGNED; /* after stage 1 and one all-unassigned stage 2 block */
mbcsData->stage3Top=16*maxCharLength; /* after one all-unassigned stage 3 block */
- mbcsData->maxCharLength=maxCharLength;
- mbcsData->header.flags=maxCharLength-1; /* outputType */
/* point all entries in stage 1 to the "all-unassigned" first block in stage 2 */
for(i=0; inewConverter;
}
@@ -149,366 +142,22 @@ static void
MBCSClose(NewConverter *cnvData) {
MBCSData *mbcsData=(MBCSData *)cnvData;
if(mbcsData!=NULL) {
- if(mbcsData->unicodeCodeUnits!=NULL) {
- uprv_free(mbcsData->unicodeCodeUnits);
- }
- if(mbcsData->fromUBytes!=NULL) {
- uprv_free(mbcsData->fromUBytes);
- }
+ uprv_free(mbcsData->unicodeCodeUnits);
+ uprv_free(mbcsData->fromUBytes);
uprv_free(mbcsData);
}
}
-static const char *
-skipWhitespace(const char *s) {
- while(*s==' ' || *s=='\t') {
- ++s;
- }
- return s;
-}
-
-/*
- * state table row grammar (ebnf-style):
- * (whitespace is allowed between all tokens)
- *
- * row=[[firstentry ','] entry (',' entry)*]
- * firstentry="initial" | "surrogates"
- * (initial state (default for state 0), output is all surrogate pairs)
- * entry=range [':' nextstate] ['.' action]
- * range=number ['-' number]
- * nextstate=number
- * (0..7f)
- * action='u' | 's' | 'p' | 'i'
- * (unassigned, state change only, surrogate pair, illegal)
- * number=(1- or 2-digit hexadecimal number)
- */
-static const char *
-parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
- const char *t;
- uint32_t start, end, i;
- int32_t entry;
-
- /* initialize the state: all illegal with U+ffff */
- for(i=0; i<256; ++i) {
- state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
- }
-
- /* skip leading white space */
- s=skipWhitespace(s);
-
- /* is there an "initial" or "surrogates" directive? */
- if(uprv_strncmp("initial", s, 7)==0) {
- *pFlags=MBCS_STATE_FLAG_DIRECT;
- s=skipWhitespace(s+7);
- if(*s++!=',') {
- return s-1;
- }
- } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
- *pFlags=MBCS_STATE_FLAG_SURROGATES;
- s=skipWhitespace(s+10);
- if(*s++!=',') {
- return s-1;
- }
- } else if(*s==0) {
- /* empty state row: all-illegal */
- return NULL;
- }
-
- for(;;) {
- /* read an entry, the start of the range first */
- s=skipWhitespace(s);
- start=uprv_strtoul(s, (char **)&t, 16);
- if(s==t || 0xffheader.countStates==MBCS_MAX_STATE_COUNT) {
- fprintf(stderr, "error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
- return FALSE;
- }
-
- error=parseState(s, mbcsData->stateTable[mbcsData->header.countStates],
- &mbcsData->stateFlags[mbcsData->header.countStates]);
- if(error!=NULL) {
- fprintf(stderr, "parse error in state definition at '%s'\n", error);
- return FALSE;
- }
-
- ++mbcsData->header.countStates;
- return TRUE;
-}
-
-static int32_t
-sumUpStates(MBCSData *mbcsData) {
- int32_t entry, sum;
- int state, cell, count;
- UBool allStatesReady;
-
- /*
- * Sum up the offsets for all states.
- * In each final state (where there are only final entries),
- * the offsets add up directly.
- * In all other state table rows, for each transition entry to another state,
- * the offsets sum of that state needs to be added.
- * This is achieved in at most countStates iterations.
- */
- allStatesReady=FALSE;
- for(count=mbcsData->header.countStates; !allStatesReady && count>=0; --count) {
- allStatesReady=TRUE;
- for(state=mbcsData->header.countStates-1; state>=0; --state) {
- if(!(mbcsData->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
- allStatesReady=FALSE;
- sum=0;
-
- /* at first, add up only the final delta offsets to keep them <512 */
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_FINAL(entry)) {
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
- sum+=1;
- break;
- case MBCS_STATE_VALID_16_PAIR:
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
- sum+=2;
- break;
- default:
- /* no addition */
- break;
- }
- }
- }
-
- /* now, add up the delta offsets for the transitional entries */
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- if(mbcsData->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
- sum+=mbcsData->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
- } else {
- /* that next state does not have a sum yet, we cannot finish the one for this state */
- sum=-1;
- break;
- }
- }
- }
-
- if(sum!=-1) {
- mbcsData->stateOffsetSum[state]=sum;
- mbcsData->stateFlags[state]|=MBCS_STATE_FLAG_READY;
- }
- }
- }
- }
-
- if(!allStatesReady) {
- fprintf(stderr, "error: the state table contains loops\n");
- return -1;
- }
-
- /*
- * For all "direct" (i.e., initial) states>0,
- * the offsets need to be increased by the sum of
- * the previous initial states.
- */
- sum=mbcsData->stateOffsetSum[0];
- for(state=1; state<(int)mbcsData->header.countStates; ++state) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- int32_t sum2=sum;
- sum+=mbcsData->stateOffsetSum[state];
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
- }
- }
- }
- }
- if(VERBOSE) {
- printf("the total number of offsets is 0x%lx=%ld\n",
- (unsigned long)sum, (long)sum);
- }
-
- /* round up to the next even number to have the following data 32-bit-aligned */
- sum=(sum+1)&~1;
- return mbcsData->countToUCodeUnits=sum;
-}
-
static UBool
-MBCSProcessStates(NewConverter *cnvData) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
- int32_t i, entry, sum;
- int state, cell;
-
- /*
- * first make sure that all "next state" values are within limits
- * and that all next states after final ones have the "direct"
- * flag of initial states
- */
- for(state=mbcsData->header.countStates-1; state>=0; --state) {
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- if((uint8_t)MBCS_ENTRY_STATE(entry)>=mbcsData->header.countStates) {
- fprintf(stderr, "error: state table entry [%x][%x] has a next state of %x that is too high\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- }
- if(MBCS_ENTRY_IS_FINAL(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (mbcsData->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
- state, cell, MBCS_ENTRY_STATE(entry));
- return FALSE;
- }
- }
- }
-
- /* is this an SI/SO (like EBCDIC-stateful) state table? */
- if(mbcsData->header.countStates>=2 && (mbcsData->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- if(mbcsData->maxCharLength!=2) {
- fprintf(stderr, "error: SI/SO codepages must have max 2 bytes/char (not %x)\n", mbcsData->maxCharLength);
- return FALSE;
- }
- if(mbcsData->header.countStates<3) {
- fprintf(stderr, "error: SI/SO codepages must have at least 3 states (not %x)\n", mbcsData->header.countStates);
- return FALSE;
- }
- /* are the SI/SO all in the right places? */
- if( mbcsData->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
- mbcsData->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)
- ) {
- mbcsData->header.flags=MBCS_OUTPUT_2_SISO;
- } else {
- fprintf(stderr, "error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
- return FALSE;
- }
- state=2;
- } else {
- state=1;
- }
-
- /* check that no unexpected state is a "direct" one */
- while(state<(int)mbcsData->header.countStates) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- fprintf(stderr, "error: state %d is 'initial' - not supported except for SI/SO codepages\n", state);
- return FALSE;
- }
- ++state;
- }
-
- sum=sumUpStates(mbcsData);
- if(sum<0) {
- return FALSE;
- }
+MBCSStartMappings(MBCSData *mbcsData) {
+ int32_t i, sum;
/* allocate the code unit array and prefill it with "unassigned" values */
+ sum=mbcsData->ucm->states.countToUCodeUnits;
+ if(VERBOSE) {
+ printf("the total number of offsets is 0x%lx=%ld\n", (unsigned long)sum, (long)sum); /* sum is int32_t: cast to match %lx/%ld */
+ }
+
if(sum>0) {
mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
if(mbcsData->unicodeCodeUnits==NULL) {
@@ -522,17 +171,16 @@ MBCSProcessStates(NewConverter *cnvData) {
}
/* allocate the codepage mappings and preset the first 16 characters to 0 */
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
/* allocate 64k 16-bit results for single-byte codepages */
sum=0x20000;
} else {
/* allocate 1M * maxCharLength bytes for at most 1M mappings */
- sum=0x100000*mbcsData->maxCharLength;
+ sum=0x100000*mbcsData->ucm->states.maxCharLength;
}
mbcsData->fromUBytes=(uint8_t *)uprv_malloc(sum);
if(mbcsData->fromUBytes==NULL) {
- fprintf(stderr, "error: out of memory allocating %ldMB for target mappings\n",
- (long)sum);
+ fprintf(stderr, "error: out of memory allocating %ld B for target mappings\n", (long)sum); /* sum is int32_t: cast to match %ld */
return FALSE;
}
/* initialize the all-unassigned first stage 3 block */
@@ -541,46 +189,24 @@ MBCSProcessStates(NewConverter *cnvData) {
return TRUE;
}
-/* find a fallback for this offset; return the index or -1 if not found */
-static int32_t
-findFallback(MBCSData *mbcsData, uint32_t offset) {
- _MBCSToUFallback *toUFallbacks;
- int32_t i, limit;
-
- limit=mbcsData->header.countToUFallbacks;
- if(limit==0) {
- /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
- return -1;
- }
-
- /* do a linear search for the fallback mapping (the table is not yet sorted) */
- toUFallbacks=mbcsData->toUFallbacks;
- for(i=0; itoUFallbacks, mbcsData->countToUFallbacks, offset);
if(i>=0) {
/* if there is already a fallback for this offset, then overwrite it */
mbcsData->toUFallbacks[i].codePoint=c;
return TRUE;
} else {
/* if there is no fallback for this offset, then add one */
- i=mbcsData->header.countToUFallbacks;
+ i=mbcsData->countToUFallbacks;
if(i>=MBCS_MAX_FALLBACK_COUNT) {
fprintf(stderr, "error: too many toUnicode fallbacks, currently at: U+%x\n", c);
return FALSE;
} else {
mbcsData->toUFallbacks[i].offset=offset;
mbcsData->toUFallbacks[i].codePoint=c;
- mbcsData->header.countToUFallbacks=i+1;
+ mbcsData->countToUFallbacks=i+1;
return TRUE;
}
}
@@ -589,19 +215,19 @@ setFallback(MBCSData *mbcsData, uint32_t offset, UChar32 c) {
/* remove fallback if there is one with this offset; return the code point if there was such a fallback, otherwise -1 */
static int32_t
removeFallback(MBCSData *mbcsData, uint32_t offset) {
- int32_t i=findFallback(mbcsData, offset);
+ int32_t i=ucm_findFallback(mbcsData->toUFallbacks, mbcsData->countToUFallbacks, offset);
if(i>=0) {
_MBCSToUFallback *toUFallbacks;
int32_t limit, old;
toUFallbacks=mbcsData->toUFallbacks;
- limit=mbcsData->header.countToUFallbacks;
+ limit=mbcsData->countToUFallbacks;
old=(int32_t)toUFallbacks[i].codePoint;
/* copy the last fallback entry here to keep the list contiguous */
toUFallbacks[i].offset=toUFallbacks[limit-1].offset;
toUFallbacks[i].codePoint=toUFallbacks[limit-1].codePoint;
- mbcsData->header.countToUFallbacks=limit-1;
+ mbcsData->countToUFallbacks=limit-1;
return old;
} else {
return -1;
@@ -615,22 +241,22 @@ removeFallback(MBCSData *mbcsData, uint32_t offset) {
* -1 the precision of this mapping is not specified
*/
static UBool
-MBCSAddToUnicode(NewConverter *cnvData,
+MBCSAddToUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
+ char buffer[10];
uint32_t offset=0;
int32_t i=0, entry, old;
uint8_t state=0;
- if(mbcsData->header.countStates==0) {
+ if(mbcsData->ucm->states.countStates==0) {
fprintf(stderr, "error: there is no state information!\n");
return FALSE;
}
/* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
- if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
+ if(length==2 && mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO) {
state=1;
}
@@ -640,33 +266,33 @@ MBCSAddToUnicode(NewConverter *cnvData,
* We assume that c<=0x10ffff.
*/
for(i=0;;) {
- entry=mbcsData->stateTable[state][bytes[i++]];
+ entry=mbcsData->ucm->states.stateTable[state][bytes[i++]];
if(MBCS_ENTRY_IS_TRANSITION(entry)) {
if(i==length) {
- fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx (U+%x)\n",
- state, (unsigned long)b, c);
+ fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%s (U+%x)\n",
+ state, printBytes(buffer, bytes, length), c);
return FALSE;
}
state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
} else {
if(i0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in illegal state at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_CHANGE_ONLY:
- fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in state-change-only at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_UNASSIGNED:
- fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: byte sequence ends in unassigned state at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
case MBCS_STATE_FALLBACK_DIRECT_16:
case MBCS_STATE_VALID_DIRECT_16:
@@ -679,13 +305,13 @@ MBCSAddToUnicode(NewConverter *cnvData,
} else {
old=0x10000+MBCS_ENTRY_FINAL_VALUE(entry);
}
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)old);
}
/*
* Continue after the above warning
@@ -693,7 +319,7 @@ MBCSAddToUnicode(NewConverter *cnvData,
*/
}
/* reassign the correct action code */
- entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(isFallback>0 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
+ entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, (MBCS_STATE_VALID_DIRECT_16+(flag==3 ? 2 : 0)+(c>=0x10000 ? 1 : 0)));
/* put the code point into bits 22..7 for BMP, c-0x10000 into 26..7 for others */
if(c<=0xffff) {
@@ -701,7 +327,7 @@ MBCSAddToUnicode(NewConverter *cnvData,
} else {
entry=MBCS_ENTRY_FINAL_SET_VALUE(entry, c-0x10000);
}
- mbcsData->stateTable[state][bytes[i-1]]=entry;
+ mbcsData->ucm->states.stateTable[state][bytes[i-1]]=entry;
break;
case MBCS_STATE_VALID_16:
/* bits 26..16 are not used, 0 */
@@ -709,21 +335,21 @@ MBCSAddToUnicode(NewConverter *cnvData,
offset+=MBCS_ENTRY_FINAL_VALUE_16(entry);
/* check that this byte sequence is still unassigned */
if((old=mbcsData->unicodeCodeUnits[offset])!=0xfffe || (old=removeFallback(mbcsData, offset))!=-1) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)old);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)old);
}
}
if(c>=0x10000) {
- fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: code point does not fit into valid-16-bit state at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
}
- if(isFallback>0) {
+ if(flag>0) {
/* assign only if there is no precise mapping */
if(mbcsData->unicodeCodeUnits[offset]==0xfffe) {
return setFallback(mbcsData, offset, c);
@@ -747,16 +373,16 @@ MBCSAddToUnicode(NewConverter *cnvData,
} else /* old<=0xe001 */ {
real=mbcsData->unicodeCodeUnits[offset+1];
}
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)real);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)real);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%02lx see U+%04lx\n",
- c, (unsigned long)b, (long)real);
+ fprintf(stderr, "duplicate codepage byte sequence at U+%04x<->0x%s see U+%04lx\n",
+ c, printBytes(buffer, bytes, length), (long)real);
}
}
- if(isFallback>0) {
+ if(flag>0) {
/* assign only if there is no precise mapping */
if(old<=0xdbff || old==0xe000) {
/* do nothing */
@@ -786,8 +412,8 @@ MBCSAddToUnicode(NewConverter *cnvData,
break;
default:
/* reserved, must never occur */
- fprintf(stderr, "internal error: byte sequence reached reserved action code, entry0x%02lx: 0x%02lx (U+%x)\n",
- (unsigned long)entry, (unsigned long)b, c);
+ fprintf(stderr, "internal error: byte sequence reached reserved action code, entry 0x%02lx: 0x%s (U+%x)\n",
+ (unsigned long)entry, printBytes(buffer, bytes, length), c);
return FALSE;
}
@@ -799,83 +425,26 @@ MBCSAddToUnicode(NewConverter *cnvData,
/* is this byte sequence valid? (this is almost the same as MBCSAddToUnicode()) */
static UBool
MBCSIsValid(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- uint32_t b) {
+ const uint8_t *bytes, int32_t length) {
MBCSData *mbcsData=(MBCSData *)cnvData;
- uint32_t offset=0;
- int32_t i=0, entry;
- uint8_t state=0;
- if(mbcsData->header.countStates==0) {
- fprintf(stderr, "error: there is no state information!\n");
- return FALSE;
- }
-
- /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
- if(length==2 && (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
- state=1;
- }
-
- /*
- * Walk down the state table like in conversion,
- * much like getNextUChar().
- * We assume that c<=0x10ffff.
- */
- for(i=0;;) {
- entry=mbcsData->stateTable[state][bytes[i++]];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- if(i==length) {
- fprintf(stderr, "error: byte sequence too short, ends in non-final state %hu: 0x%02lx\n",
- state, (unsigned long)b);
- return FALSE;
- }
- state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
- offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- } else {
- if(iucm->states, bytes, length));
}
static UBool
-MBCSSingleAddFromUnicode(NewConverter *cnvData,
+MBCSSingleAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
uint16_t *p;
uint32_t index;
uint16_t old;
+ uint8_t b;
+
+ /* ignore |2 SUB mappings */
+ if(flag==2) {
+ return TRUE;
+ }
/*
* Walk down the triple-stage compact array ("trie") and
@@ -883,14 +452,14 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData,
* Note that the first stage 2 and 3 blocks are reserved for all-unassigned mappings.
* We assume that length<=maxCharLength and that c<=0x10ffff.
*/
+ b=*bytes;
/* inspect stage 1 */
index=c>>10;
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
- fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02x\n", c, b);
return FALSE;
}
@@ -907,8 +476,7 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData,
if(mbcsData->stage2Single[index]==0) {
/* allocate another block in stage 3 */
if(mbcsData->stage3Top>=0x10000) {
- fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many code points at U+%04x<->0x%02x\n", c, b);
return FALSE;
}
/* each block has 16 uint16_t entries */
@@ -920,7 +488,7 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData,
/* write the codepage entry into stage 3 and get the previous entry */
p=(uint16_t *)mbcsData->fromUBytes+mbcsData->stage2Single[index]+(c&0xf);
old=*p;
- if(isFallback<=0) {
+ if(flag<=0) {
*p=(uint16_t)(0xf00|b);
} else if(IS_PRIVATE_USE(c)) {
*p=(uint16_t)(0xc00|b);
@@ -930,13 +498,13 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData,
/* check that this Unicode code point was still unassigned */
if(old>=0x100) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n",
- c, (unsigned long)b, old&0xff);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
+ c, b, old&0xff);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02x\n",
- c, (unsigned long)b, old&0xff);
+ fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02x see 0x%02x\n",
+ c, b, old&0xff);
}
/* continue after the above warning if the precision of the mapping is unspecified */
}
@@ -945,21 +513,41 @@ MBCSSingleAddFromUnicode(NewConverter *cnvData,
}
static UBool
-MBCSAddFromUnicode(NewConverter *cnvData,
+MBCSAddFromUnicode(MBCSData *mbcsData,
const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
+ UChar32 c,
+ int8_t flag) {
+ char buffer[10];
+ const uint8_t *pb;
uint8_t *p;
- uint32_t index, old;
+ uint32_t index, b, old;
+ int32_t maxCharLength;
- if( (mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO &&
+ /* ignore |2 SUB mappings */
+ if(flag==2) {
+ return TRUE;
+ }
+
+ maxCharLength=mbcsData->ucm->states.maxCharLength;
+
+ if(maxCharLength==1) {
+ return MBCSSingleAddFromUnicode(mbcsData, bytes, length, c, flag);
+ }
+
+ if( mbcsData->ucm->states.outputType==MBCS_OUTPUT_2_SISO &&
(*bytes==0xe || *bytes==0xf)
) {
- fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: illegal mapping to SI or SO for SI/SO codepage: U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
}
+
+ if(flag==1 && length==1 && *bytes==0 && c!=*bytes) {
+ fprintf(stderr, "error: unable to encode a |1 fallback from U+%04x to 0x%02x\n",
+ c, *bytes);
+ return FALSE;
+ }
+
/*
* Walk down the triple-stage compact array ("trie") and
* allocate parts as necessary.
@@ -973,8 +561,8 @@ MBCSAddFromUnicode(NewConverter *cnvData,
if(mbcsData->stage1[index]==MBCS_STAGE_2_ALL_UNASSIGNED_INDEX) {
/* allocate another block in stage 2 */
if(mbcsData->stage2Top>=MBCS_MAX_STAGE_2_TOP) {
- fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ fprintf(stderr, "error: too many stage 2 entries at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
}
@@ -990,21 +578,38 @@ MBCSAddFromUnicode(NewConverter *cnvData,
index=mbcsData->stage1[index]+((c>>4)&0x3f);
if(mbcsData->stage2[index]==0) {
/* allocate another block in stage 3 */
- if(mbcsData->stage3Top>=0x100000*mbcsData->maxCharLength) {
- fprintf(stderr, "error: too many code points at U+%04x<->0x%02lx\n",
- c, (unsigned long)b);
+ if(mbcsData->stage3Top>=0x100000*(uint32_t)maxCharLength) {
+ fprintf(stderr, "error: too many code points at U+%04x<->0x%s\n",
+ c, printBytes(buffer, bytes, length));
return FALSE;
}
/* each block has 16*maxCharLength bytes */
- mbcsData->stage2[index]=(mbcsData->stage3Top/16)/mbcsData->maxCharLength;
- uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*mbcsData->maxCharLength);
- mbcsData->stage3Top+=16*mbcsData->maxCharLength;
+ mbcsData->stage2[index]=(mbcsData->stage3Top/16)/maxCharLength;
+ uprv_memset(mbcsData->fromUBytes+mbcsData->stage3Top, 0, 16*maxCharLength);
+ mbcsData->stage3Top+=16*maxCharLength;
}
/* write the codepage bytes into stage 3 and get the previous bytes */
+
+ /* assemble the bytes into a single integer */
+ pb=bytes;
+ b=0;
+ switch(length) {
+ case 4:
+ b=*pb++;
+ case 3:
+ b=(b<<8)|*pb++;
+ case 2:
+ b=(b<<8)|*pb++;
+ case 1:
+ default:
+ b=(b<<8)|*pb++;
+ break;
+ }
+
old=0;
- p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*mbcsData->maxCharLength;
- switch(mbcsData->maxCharLength) {
+ p=mbcsData->fromUBytes+(16*(uint32_t)(uint16_t)mbcsData->stage2[index]+(c&0xf))*maxCharLength;
+ switch(maxCharLength) {
case 2:
old=*(uint16_t *)p;
*(uint16_t *)p=(uint16_t)b;
@@ -1028,338 +633,101 @@ MBCSAddFromUnicode(NewConverter *cnvData,
/* check that this Unicode code point was still unassigned */
if((mbcsData->stage2[index]&(1UL<<(16+(c&0xf))))!=0 || old!=0) {
- if(isFallback>=0) {
- fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n",
- c, (unsigned long)b, (unsigned long)old);
+ if(flag>=0) {
+ fprintf(stderr, "error: duplicate Unicode code point at U+%04x<->0x%s see 0x%02lx\n",
+ c, printBytes(buffer, bytes, length), (unsigned long)old);
return FALSE;
} else if(VERBOSE) {
- fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%02lx see 0x%02lx\n",
- c, (unsigned long)b, (unsigned long)old);
+ fprintf(stderr, "duplicate Unicode code point at U+%04x<->0x%s see 0x%02lx\n",
+ c, printBytes(buffer, bytes, length), (unsigned long)old);
}
/* continue after the above warning if the precision of the mapping is
unspecified */
}
- if(isFallback<=0) {
- /* set the "assigned" flag */
+ if(flag<=0) {
+ /* set the roundtrip flag */
mbcsData->stage2[index]|=(1UL<<(16+(c&0xf)));
}
return TRUE;
}
-static int
-compareFallbacks(const void *fb1, const void *fb2) {
- return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset;
-}
+/* we can assume that the table only contains 1:1 mappings with <=4 bytes each */
+static UBool
+MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData) {
+ MBCSData *mbcsData;
+ UCMapping *m;
+ UChar32 c;
+ int32_t i;
+ UBool isOK;
-/*
- * This function tries to compact toUnicode tables for 2-byte codepages
- * by finding lead bytes with all-unassigned trail bytes and adding another state
- * for them.
- */
-static void
-compactToUnicode2(MBCSData *mbcsData) {
- int32_t (*oldStateTable)[256];
- uint16_t count[256];
- uint16_t *oldUnicodeCodeUnits;
- int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum;
- int32_t i, j, leadState, trailState, newState, fallback;
- uint16_t unit;
-
- /* find the lead state */
- if((mbcsData->header.flags&0xff)==MBCS_OUTPUT_2_SISO) {
- /* use the DBCS lead state for SI/SO codepages */
- leadState=1;
- } else {
- leadState=0;
+ staticData->unicodeMask=table->unicodeMask;
+ if(staticData->unicodeMask==3) {
+ fprintf(stderr, "error: contains mappings for both supplementary and surrogate code points\n");
+ return FALSE;
}
- /* find the main trail state: the most used target state */
- uprv_memset(count, 0, sizeof(count));
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
- }
- }
- trailState=0;
- for(i=1; i<(int)mbcsData->header.countStates; ++i) {
- if(count[i]>count[trailState]) {
- trailState=i;
- }
+ staticData->conversionType=UCNV_MBCS;
+
+ mbcsData=(MBCSData *)cnvData;
+
+ if(!MBCSStartMappings(mbcsData)) {
+ return FALSE;
}
- /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
- uprv_memset(count, 0, sizeof(count));
- savings=0;
- /* for each lead byte */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
- /* the offset is different for each lead byte */
- offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- /* for each trail byte for this lead byte */
- for(j=0; j<256; ++j) {
- entry=mbcsData->stateTable[trailState][j];
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
- ++count[i];
- } else {
- j=999; /* do not count for this lead byte because there are assignments */
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
- count[i]+=2;
- } else {
- j=999; /* do not count for this lead byte because there are assignments */
- }
- break;
- default:
- break;
- }
- }
- if(j==256) {
- /* all trail bytes for this lead byte are unassigned */
- savings+=count[i];
- } else {
- count[i]=0;
- }
- }
- }
- /* subtract from the possible savings the cost of an additional state */
- savings=savings*2-1024; /* count bytes, not 16-bit words */
- if(savings<=0) {
- return;
- }
- if(VERBOSE) {
- printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
- }
- if(mbcsData->header.countStates>=MBCS_MAX_STATE_COUNT) {
- fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
- return;
- }
+ isOK=TRUE;
- /* make a copy of the state table */
- oldStateTable=(int32_t (*)[256])uprv_malloc(mbcsData->header.countStates*1024);
- if(oldStateTable==NULL) {
- fprintf(stderr, "cannot compact toUnicode: out of memory\n");
- return;
- }
- uprv_memcpy(oldStateTable, mbcsData->stateTable, mbcsData->header.countStates*1024);
+ m=table->mappings;
+ for(i=0; i<table->mappingsLength; ++m, ++i) {
+ c=m->u;
- /* add the new state */
- /*
- * this function does not catch the degenerate case where all lead bytes
- * have all-unassigned trail bytes and the lead state could be removed
- */
- newState=mbcsData->header.countStates++;
- mbcsData->stateFlags[newState]=0;
- /* copy the old trail state, turning all assigned states into unassigned ones */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[trailState][i];
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- case MBCS_STATE_VALID_16_PAIR:
- mbcsData->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
+ switch(m->f) {
+ case -1:
+ /* there was no precision/fallback indicator */
+ /* fall through to set the mappings */
+ case 0:
+ /* set roundtrip mappings */
+ isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f) &&
+ MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
+ break;
+ case 1:
+ /* set only a fallback mapping from Unicode to codepage */
+ staticData->hasFromUnicodeFallback=TRUE;
+ isOK&=MBCSAddFromUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
+ break;
+ case 2:
+ /* ignore |2 SUB mappings */
+ break;
+ case 3:
+ /* set only a fallback mapping from codepage to Unicode */
+ staticData->hasToUnicodeFallback=TRUE;
+ isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, m->f);
break;
default:
- mbcsData->stateTable[newState][i]=entry;
- break;
+ /* will not occur because the parser checked it already */
+ fprintf(stderr, "error: illegal fallback indicator %d\n", m->f);
+ return FALSE;
}
}
- /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
- for(i=0; i<256; ++i) {
- if(count[i]>0) {
- mbcsData->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(mbcsData->stateTable[leadState][i], newState);
- }
- }
+ MBCSPostprocess(mbcsData, staticData);
- /* sum up the new state table */
- for(i=0; i<(int)mbcsData->header.countStates; ++i) {
- mbcsData->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
- }
- sum=sumUpStates(mbcsData);
-
- /* allocate a new, smaller code units array */
- oldUnicodeCodeUnits=mbcsData->unicodeCodeUnits;
- if(sum==0) {
- mbcsData->unicodeCodeUnits=NULL;
- if(oldUnicodeCodeUnits!=NULL) {
- uprv_free(oldUnicodeCodeUnits);
- }
- uprv_free(oldStateTable);
- return;
- }
- mbcsData->unicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
- if(mbcsData->unicodeCodeUnits==NULL) {
- fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
- (long)sum);
- /* revert to the old state table */
- mbcsData->unicodeCodeUnits=oldUnicodeCodeUnits;
- --mbcsData->header.countStates;
- uprv_memcpy(mbcsData->stateTable, oldStateTable, mbcsData->header.countStates*1024);
- uprv_free(oldStateTable);
- return;
- }
- for(i=0; i<sum; ++i) {
- mbcsData->unicodeCodeUnits[i]=0xfffe;
- }
-
- /* copy the code units for all assigned characters */
- /*
- * The old state table has the same lead _and_ trail states for assigned characters!
- * The differences are in the offsets, and in the trail states for some unassigned characters.
- * For each character with an assigned state in the new table, it was assigned in the old one.
- * Only still-assigned characters are copied.
- * Note that fallback mappings need to get their offset values adjusted.
- */
-
- /* for each initial state */
- for(leadState=0; leadState<(int)mbcsData->header.countStates; ++leadState) {
- if((mbcsData->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- /* for each lead byte from there */
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[leadState][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
- /* the new state does not have assigned states */
- if(trailState!=newState) {
- trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
- oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
- /* for each trail byte */
- for(j=0; j<256; ++j) {
- entry=mbcsData->stateTable[trailState][j];
- /* copy assigned-character code units and adjust fallback offsets */
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- /* find the old offset according to the old state table */
- oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
- unit=mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
- if(unit==0xfffe && (fallback=findFallback(mbcsData, oldOffset))>=0) {
- mbcsData->toUFallbacks[fallback].offset=0x80000000|offset;
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- /* find the old offset according to the old state table */
- oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
- mbcsData->unicodeCodeUnits[offset++]=oldUnicodeCodeUnits[oldOffset++];
- mbcsData->unicodeCodeUnits[offset]=oldUnicodeCodeUnits[oldOffset];
- break;
- default:
- break;
- }
- }
- }
- }
- }
- }
- }
-
- /* remove temporary flags from fallback offsets that protected them from being modified twice */
- sum=mbcsData->header.countToUFallbacks;
- for(i=0; i<sum; ++i) {
- mbcsData->toUFallbacks[i].offset&=0x7fffffff;
- }
-
- /* free temporary memory */
- uprv_free(oldUnicodeCodeUnits);
- uprv_free(oldStateTable);
-}
-
-/*
- * recursive sub-function of compactToUnicodeHelper()
- * returns:
- * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
- * if all sequences from this state are unassigned, returns the
- * <0 there are assignments in unicodeCodeUnits[]
- * 0 no use of unicodeCodeUnits[]
- */
-static int32_t
-findUnassigned(MBCSData *mbcsData, int32_t state, int32_t offset, uint32_t b) {
- int32_t i, entry, savings, localSavings, belowSavings;
- UBool haveAssigned;
-
- localSavings=belowSavings=0;
- haveAssigned=FALSE;
- for(i=0; i<256; ++i) {
- entry=mbcsData->stateTable[state][i];
- if(MBCS_ENTRY_IS_TRANSITION(entry)) {
- savings=findUnassigned(mbcsData, MBCS_ENTRY_TRANSITION_STATE(entry), offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), (b<<8)|(uint32_t)i);
- if(savings<0) {
- haveAssigned=TRUE;
- } else if(savings>0) {
- printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
- (unsigned long)((b<<8)|i), (long)state, (long)savings);
- belowSavings+=savings;
- }
- } else if(!haveAssigned) {
- switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
- case MBCS_STATE_VALID_16:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe && findFallback(mbcsData, entry)<0) {
- localSavings+=2;
- } else {
- haveAssigned=TRUE;
- }
- break;
- case MBCS_STATE_VALID_16_PAIR:
- entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
- if(mbcsData->unicodeCodeUnits[entry]==0xfffe) {
- localSavings+=4;
- } else {
- haveAssigned=TRUE;
- }
- break;
- default:
- break;
- }
- }
- }
- if(haveAssigned) {
- return -1;
- } else {
- return localSavings+belowSavings;
- }
-}
-
-/* helper function for finding compaction opportunities */
-static void
-compactToUnicodeHelper(MBCSData *mbcsData) {
- int32_t state, savings;
-
- if(!VERBOSE) {
- return;
- }
-
- /* for each initial state */
- for(state=0; state<(int)mbcsData->header.countStates; ++state) {
- if((mbcsData->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
- savings=findUnassigned(mbcsData, state, 0, 0);
- if(savings>0) {
- printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
- (long)state, (long)savings);
- }
- }
- }
+ return isOK;
}
static UBool
transformEUC(MBCSData *mbcsData) {
uint8_t *p8;
- uint32_t i, value, oldLength=mbcsData->maxCharLength, old3Top=mbcsData->stage3Top, new3Top;
+ uint32_t i, value, oldLength, old3Top, new3Top;
uint8_t b;
+ oldLength=mbcsData->ucm->states.maxCharLength;
if(oldLength<3) {
return FALSE;
}
+ old3Top=mbcsData->stage3Top;
+
/* careful: 2-byte and 4-byte codes are stored in platform endianness! */
/* test if all first bytes are in {0, 0x8e, 0x8f} */
@@ -1382,7 +750,7 @@ transformEUC(MBCSData *mbcsData) {
p8=mbcsData->fromUBytes;
/* modify outputType and adjust stage3Top */
- mbcsData->header.flags=MBCS_OUTPUT_3_EUC+oldLength-3;
+ mbcsData->ucm->states.outputType=(int8_t)(MBCS_OUTPUT_3_EUC+oldLength-3);
mbcsData->stage3Top=new3Top=(old3Top*(oldLength-1))/oldLength;
/*
@@ -1608,54 +976,28 @@ compactStage2(MBCSData *mbcsData) {
}
static void
-MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) {
- MBCSData *mbcsData=(MBCSData *)cnvData;
- int32_t entry;
- int state, cell;
+MBCSPostprocess(MBCSData *mbcsData, const UConverterStaticData *staticData) {
+ UCMStates *states;
+ int32_t maxCharLength;
+
+ states=&mbcsData->ucm->states;
+ maxCharLength=states->maxCharLength;
/* this needs to be printed before the EUC transformation because later maxCharLength might not be correct */
if(VERBOSE) {
printf("number of codepage characters in 16-blocks: 0x%lx=%lu\n",
- (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength,
- (unsigned long)mbcsData->stage3Top/mbcsData->maxCharLength);
+ (unsigned long)mbcsData->stage3Top/maxCharLength,
+ (unsigned long)mbcsData->stage3Top/maxCharLength);
}
- /* test each state table entry */
- for(state=0; state<(int)mbcsData->header.countStates; ++state) {
- for(cell=0; cell<256; ++cell) {
- entry=mbcsData->stateTable[state][cell];
- /*
- * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
- * and the code point is "unassigned" (0xfffe), then change it to
- * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
- */
- if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
- mbcsData->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
- }
- }
- }
-
- /* try to compact the toUnicode tables */
- if(mbcsData->maxCharLength==2) {
- compactToUnicode2(mbcsData);
- } else if(mbcsData->maxCharLength>2) {
- compactToUnicodeHelper(mbcsData);
- }
-
- /* sort toUFallbacks */
- /*
- * It should be safe to sort them before compactToUnicode2() is called,
- * because it should not change the relative order of the offset values
- * that it adjusts, but they need to be sorted at some point, and
- * it is safest here.
- */
- if(mbcsData->header.countToUFallbacks>0) {
- qsort(mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks, sizeof(_MBCSToUFallback), compareFallbacks);
- }
+ ucm_optimizeStates(states,
+ &mbcsData->unicodeCodeUnits,
+ mbcsData->toUFallbacks, mbcsData->countToUFallbacks,
+ VERBOSE);
/* try to compact the fromUnicode tables */
transformEUC(mbcsData);
- if(mbcsData->maxCharLength==1) {
+ if(maxCharLength==1) {
singleCompactStage3(mbcsData);
singleCompactStage2(mbcsData);
} else {
@@ -1664,12 +1006,16 @@ MBCSPostprocess(NewConverter *cnvData, const UConverterStaticData *staticData) {
}
static uint32_t
-MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData) {
+MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType) {
MBCSData *mbcsData=(MBCSData *)cnvData;
+ uint32_t top;
int32_t i, stage1Top;
+ _MBCSHeader header={ 0 };
+
/* adjust stage 1 entries to include the size of stage 1 in the offsets to stage 2 */
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
if(staticData->unicodeMask&UCNV_HAS_SUPPLEMENTARY) {
stage1Top=MBCS_STAGE_1_SIZE; /* 0x440==1088 */
} else {
@@ -1705,26 +1051,44 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDat
mbcsData->stage3Top=(mbcsData->stage3Top+3)&~3;
/* fill the header */
- mbcsData->header.offsetToUCodeUnits=
+ header.version[0]=4;
+ header.version[1]=2;
+ header.countStates=mbcsData->ucm->states.countStates;
+ header.countToUFallbacks=mbcsData->countToUFallbacks;
+
+ header.offsetToUCodeUnits=
sizeof(_MBCSHeader)+
- mbcsData->header.countStates*1024+
- mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback);
- mbcsData->header.offsetFromUTable=
- mbcsData->header.offsetToUCodeUnits+
- mbcsData->countToUCodeUnits*2;
- mbcsData->header.offsetFromUBytes=
- mbcsData->header.offsetFromUTable+
+ mbcsData->ucm->states.countStates*1024+
+ mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback);
+ header.offsetFromUTable=
+ header.offsetToUCodeUnits+
+ mbcsData->ucm->states.countToUCodeUnits*2;
+ header.offsetFromUBytes=
+ header.offsetFromUTable+
stage1Top*2+
mbcsData->stage2Top;
- mbcsData->header.fromUBytesLength=mbcsData->stage3Top;
+ header.fromUBytesLength=mbcsData->stage3Top;
+
+ top=header.offsetFromUBytes+header.fromUBytesLength;
+
+ header.flags=(uint8_t)(mbcsData->ucm->states.outputType);
+
+ if(tableType&TABLE_EXT) {
+ if(top>0xffffff) {
+ fprintf(stderr, "error: offset 0x%lx to extension table exceeds 0xffffff\n", top);
+ return 0;
+ }
+
+ header.flags|=top<<8;
+ }
/* write the MBCS data */
- udata_writeBlock(pData, &mbcsData->header, sizeof(_MBCSHeader));
- udata_writeBlock(pData, mbcsData->stateTable, mbcsData->header.countStates*1024);
- udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->header.countToUFallbacks*sizeof(_MBCSToUFallback));
- udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->countToUCodeUnits*2);
+ udata_writeBlock(pData, &header, sizeof(_MBCSHeader));
+ udata_writeBlock(pData, mbcsData->ucm->states.stateTable, header.countStates*1024);
+ udata_writeBlock(pData, mbcsData->toUFallbacks, mbcsData->countToUFallbacks*sizeof(_MBCSToUFallback));
+ udata_writeBlock(pData, mbcsData->unicodeCodeUnits, mbcsData->ucm->states.countToUCodeUnits*2);
udata_writeBlock(pData, mbcsData->stage1, stage1Top*2);
- if(mbcsData->maxCharLength==1) {
+ if(mbcsData->ucm->states.maxCharLength==1) {
udata_writeBlock(pData, mbcsData->stage2Single, mbcsData->stage2Top);
} else {
udata_writeBlock(pData, mbcsData->stage2, mbcsData->stage2Top);
@@ -1732,5 +1096,5 @@ MBCSWrite(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDat
udata_writeBlock(pData, mbcsData->fromUBytes, mbcsData->stage3Top);
/* return the number of bytes that should have been written */
- return mbcsData->header.offsetFromUBytes+mbcsData->header.fromUBytesLength;
+ return header.offsetFromUBytes+header.fromUBytesLength;
}
diff --git a/icu4c/source/tools/makeconv/genmbcs.h b/icu4c/source/tools/makeconv/genmbcs.h
index c2ab199492..9313202649 100644
--- a/icu4c/source/tools/makeconv/genmbcs.h
+++ b/icu4c/source/tools/makeconv/genmbcs.h
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 2000, International Business Machines
+* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -19,10 +19,27 @@
#include "makeconv.h"
-U_CFUNC NewConverter *
-MBCSOpen(uint8_t maxCharLength);
+enum {
+ MBCS_STAGE_2_BLOCK_SIZE=0x40, /* 64; 64=1<<6 for 6 bits in stage 2 */
+ MBCS_STAGE_2_BLOCK_SIZE_SHIFT=6, /* log2(MBCS_STAGE_2_BLOCK_SIZE) */
+ MBCS_STAGE_1_SIZE=0x440, /* 0x110000>>10, or 17*64 for one entry per 1k code points */
+ MBCS_STAGE_2_SIZE=0xfbc0, /* 0x10000-MBCS_STAGE_1_SIZE */
+ MBCS_MAX_STAGE_2_TOP=MBCS_STAGE_2_SIZE,
+ MBCS_STAGE_2_MAX_BLOCKS=MBCS_STAGE_2_SIZE>>MBCS_STAGE_2_BLOCK_SIZE_SHIFT,
-U_CFUNC UBool
-MBCSAddState(NewConverter *cnvData, const char *s);
+ MBCS_STAGE_2_ALL_UNASSIGNED_INDEX=0, /* stage 1 entry for the all-unassigned stage 2 block */
+ MBCS_STAGE_2_FIRST_ASSIGNED=MBCS_STAGE_2_BLOCK_SIZE, /* start of the first stage 2 block after the all-unassigned one */
+
+ MBCS_STAGE_3_BLOCK_SIZE=16, /* 16; 16=1<<4 for 4 bits in stage 3 */
+ MBCS_STAGE_3_FIRST_ASSIGNED=MBCS_STAGE_3_BLOCK_SIZE, /* start of the first stage 3 block after the all-unassigned one */
+
+ MBCS_MAX_FALLBACK_COUNT=8192
+};
+
+U_CFUNC NewConverter *
+MBCSOpen(UCMFile *ucm);
+
+U_CFUNC NewConverter *
+CnvExtOpen(UCMFile *ucm);
#endif
diff --git a/icu4c/source/tools/makeconv/makeconv.c b/icu4c/source/tools/makeconv/makeconv.c
index b5c5d6111a..574327fb86 100644
--- a/icu4c/source/tools/makeconv/makeconv.c
+++ b/icu4c/source/tools/makeconv/makeconv.c
@@ -30,11 +30,43 @@
#include "unicode/udata.h"
#include "unewdata.h"
#include "ucmpwrit.h"
+#include "ucm.h"
#include "makeconv.h"
#include "genmbcs.h"
#define DEBUG 0
+typedef struct ConvData {
+ UCMFile *ucm;
+ NewConverter *cnvData, *extData;
+ UConverterSharedData sharedData;
+ UConverterStaticData staticData;
+} ConvData;
+
+static void
+initConvData(ConvData *data) {
+ uprv_memset(data, 0, sizeof(ConvData));
+ data->sharedData.structSize=sizeof(UConverterSharedData);
+ data->staticData.structSize=sizeof(UConverterStaticData);
+ data->sharedData.staticData=&data->staticData;
+}
+
+static void
+cleanupConvData(ConvData *data) {
+ if(data!=NULL) {
+ if(data->cnvData!=NULL) {
+ data->cnvData->close(data->cnvData);
+ data->cnvData=NULL;
+ }
+ if(data->extData!=NULL) {
+ data->extData->close(data->extData);
+ data->extData=NULL;
+ }
+ ucm_close(data->ucm);
+ data->ucm=NULL;
+ }
+}
+
/*
* from ucnvstat.c - static prototypes of data-based converters
*/
@@ -46,137 +78,14 @@ extern const UConverterStaticData * ucnv_converterStaticData[UCNV_NUMBER_OF_SUPP
UBool VERBOSE = FALSE;
UBool TOUCHFILE = FALSE;
-/*Reads the header of the table file and fills in basic knowledge about the converter
- *in "converter"
- */
-static void readHeaderFromFile(UConverterSharedData* myConverter, FileStream* convFile, const char* converterName, UErrorCode* err);
-
-/*Reads the rest of the file, and fills up the shared objects if necessary
-Returns the UConverterTable. */
-static void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err);
-
-/* creates a UConverterSharedData from a mapping file.
- * Fills in: *staticData, *table. Converter is NOT otherwise useful.
- */
-static UConverterSharedData* createConverterFromTableFile(const char* realName, UErrorCode* err);
+static void
+createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode);
/*
* Set up the UNewData and write the converter..
*/
-void writeConverterData(UConverterSharedData *mySharedData, const char *cnvName, const char *cnvDir, UErrorCode *status);
-
-static const char NLTC_SEPARATORS[9] = { '\r', '\n', '\t', ' ', '<', '>' ,'"' , 'U', '\0' };
-static const char FALLBACK_SEPARATOR = '|';
-static const char CODEPOINT_SEPARATORS[8] = { '\r', '>', '\\', 'x', '\n', ' ', '\t', '\0' };
-static const char UNICODE_CODEPOINT_SEPARATORS[6] = { '<', '>', 'U', ' ', '\t', '\0' };
-
-static const char *
-skipWhitespace(const char *s) {
- while(*s==' ' || *s=='\t') {
- ++s;
- }
- return s;
-}
-
-static int32_t
-parseCodepageBytes(const char *s, uint32_t *pBytes, const char **pEnd) {
- char *end;
- int32_t length=0;
- uint32_t bytes=0, value;
-
- while(s[0]=='\\' && s[1]=='x') {
- if(length==4) {
- return -1;
- }
- value=uprv_strtoul(s+2, &end, 16);
- s+=4;
- if(end!=s) {
- return -1;
- }
- bytes=(bytes<<8)|value;
- ++length;
- }
- if(length==0) {
- return -1;
- }
- if(pEnd!=NULL) {
- *pEnd=s;
- }
- *pBytes=bytes;
- return length;
-}
-
-/* Remove all characters followed by '#'. There is an exception if there
- * is a fallback sign '|' after the comment and the comment does not
- * start in column 0. In this case, we just blank from '#' to just
- * before the '|' in order to support the fact that IBM official .ucm
- * files have the fallback information in comments!
- */
-static char *
- removeComments (char *line)
-{
- char *pound;
-
- line = (char*)skipWhitespace(line);
- pound = uprv_strchr (line, '#');
- if (pound != NULL)
- {
- char *fallback = pound == line ? 0 : uprv_strchr(pound + 1, '|');
- if (fallback != NULL)
- {
- uprv_memset(pound, ' ', fallback-pound);
- }
- else
- {
- *pound = '\0';
- }
- }
- return line;
-}
-
-/* Returns true in c is a in set 'setOfChars', false otherwise
- */
-static UBool
- isInSet (char c, const char *setOfChars)
-{
- uint8_t i = 0;
-
- while (setOfChars[i] != '\0')
- {
- if (c == setOfChars[i++])
- return TRUE;
- }
-
- return FALSE;
-}
-
-/* Returns pointer to the next non-whitespace (or non-separator)
- */
-static int32_t
- nextTokenOffset (const char *line, const char *separators)
-{
- int32_t i = 0;
-
- while (line[i] && isInSet(line[i], separators))
- i++;
-
- return i;
-}
-
-/* Returns pointer to the next token based on the set of separators
- */
-static char *
- getToken (char *token, char *line, const char *separators)
-{
- int32_t i = nextTokenOffset (line, separators);
- int8_t j = 0;
-
- while (line[i] && (!isInSet(line[i], separators)))
- token[j++] = line[i++];
- token[j] = '\0';
-
- return line + i;
-}
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status);
UBool haveCopyright=TRUE;
@@ -194,20 +103,27 @@ static UDataInfo dataInfo={
{0, 0, 0, 0} /* dataVersion (calculated at runtime) */
};
-void writeConverterData(UConverterSharedData *mySharedData,
- const char *cnvName,
- const char *cnvDir,
- UErrorCode *status)
+static void
+writeConverterData(ConvData *data, const char *cnvName, const char *cnvDir, UErrorCode *status)
{
UNewDataMemory *mem = NULL;
uint32_t sz2;
uint32_t size = 0;
+ int32_t tableType;
if(U_FAILURE(*status))
{
return;
}
+ tableType=TABLE_NONE;
+ if(data->cnvData!=NULL) {
+ tableType|=TABLE_BASE;
+ }
+ if(data->extData!=NULL) {
+ tableType|=TABLE_EXT;
+ }
+
mem = udata_create(cnvDir, "cnv", cnvName, &dataInfo, haveCopyright ? U_COPYRIGHT_STRING : NULL, status);
if(U_FAILURE(*status))
@@ -224,11 +140,17 @@ void writeConverterData(UConverterSharedData *mySharedData,
fprintf(stderr, "- Opened udata %s.%s\n", cnvName, "cnv");
}
+
/* all read only, clean, platform independent data. Mmmm. :) */
- udata_writeBlock(mem, mySharedData->staticData, sizeof(UConverterStaticData));
+ udata_writeBlock(mem, &data->staticData, sizeof(UConverterStaticData));
size += sizeof(UConverterStaticData); /* Is 4-aligned - by size */
/* Now, write the table */
- size += ((NewConverter *)mySharedData->table)->write((NewConverter *)mySharedData->table, mySharedData->staticData, mem);
+ if(tableType&TABLE_BASE) {
+ size += data->cnvData->write(data->cnvData, &data->staticData, mem, tableType);
+ }
+ if(tableType&TABLE_EXT) {
+ size += data->extData->write(data->extData, &data->staticData, mem, tableType);
+ }
sz2 = udata_finish(mem, status);
if(size != sz2)
@@ -255,7 +177,7 @@ static UOption options[]={
int main(int argc, char* argv[])
{
- UConverterSharedData* mySharedData = NULL;
+ ConvData data;
UErrorCode err = U_ZERO_ERROR, localError;
char outFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
char touchFileName[UCNV_MAX_FULL_FILE_NAME_LENGTH];
@@ -420,7 +342,7 @@ int main(int argc, char* argv[])
if(pkgName != NULL)
{
- /* changes both baename and filename */
+ /* changes both basename and filename */
uprv_strcpy(outBasename, pkgName);
uprv_strcat(outBasename, "_");
uprv_strcat(outBasename, cnvName);
@@ -435,9 +357,10 @@ int main(int argc, char* argv[])
fflush(stdout);
#endif
localError = U_ZERO_ERROR;
- mySharedData = createConverterFromTableFile(arg, &localError);
+ initConvData(&data);
+ createConverter(&data, arg, &localError);
- if (U_FAILURE(localError) || (mySharedData == NULL))
+ if (U_FAILURE(localError))
{
/* if an error is found, print out an error msg and keep going */
fprintf(stderr, "Error creating converter for \"%s\" file for \"%s\" (%s)\n", outFileName, arg,
@@ -449,21 +372,21 @@ int main(int argc, char* argv[])
else
{
/* Make the static data name equal to the file name */
- if( /*VERBOSE && */ uprv_stricmp(cnvName,mySharedData->staticData->name))
+ if( /*VERBOSE && */ uprv_stricmp(cnvName,data.staticData.name))
{
fprintf(stderr, "Warning: %s%s claims to be '%s'\n",
cnvName,
CONVERTER_FILE_EXTENSION,
- mySharedData->staticData->name);
+ data.staticData.name);
}
- uprv_strcpy((char*)mySharedData->staticData->name, cnvName);
+ uprv_strcpy((char*)data.staticData.name, cnvName);
- if(!uprv_isInvariantString((char*)mySharedData->staticData->name, -1)) {
+ if(!uprv_isInvariantString((char*)data.staticData.name, -1)) {
fprintf(stderr,
"Error: A converter name must contain only invariant characters.\n"
"%s is not a valid converter name.\n",
- mySharedData->staticData->name);
+ data.staticData.name);
if(U_SUCCESS(err)) {
err = U_INVALID_TABLE_FORMAT;
}
@@ -481,8 +404,7 @@ int main(int argc, char* argv[])
}
localError = U_ZERO_ERROR;
- writeConverterData(mySharedData, cnvNameWithPkg, destdir, &localError);
- ((NewConverter *)mySharedData->table)->close((NewConverter *)mySharedData->table);
+ writeConverterData(&data, cnvNameWithPkg, destdir, &localError);
if(TOUCHFILE)
{
FileStream *q;
@@ -505,10 +427,6 @@ int main(int argc, char* argv[])
}
}
- /* write the information data */
- uprv_free((UConverterStaticData *)mySharedData->staticData);
- uprv_free(mySharedData);
-
if(U_FAILURE(localError))
{
/* if an error is found, print out an error msg and keep going*/
@@ -525,6 +443,8 @@ int main(int argc, char* argv[])
}
fflush(stdout);
fflush(stderr);
+
+ cleanupConvData(&data);
}
return err;
@@ -548,517 +468,312 @@ getPlatformAndCCSIDFromName(const char *name, int8_t *pPlatform, int32_t *pCCSID
}
}
-/*Reads the header of the table file and fills in basic knowledge about the converter in "converter"*/
-void readHeaderFromFile(UConverterSharedData* mySharedData,
- FileStream* convFile,
- const char* converterName,
- UErrorCode *pErrorCode)
-{
+static void
+readHeader(ConvData *data,
+ FileStream* convFile,
+ const char* converterName,
+ UErrorCode *pErrorCode) {
char line[200];
- char *s, *end, *key, *value;
+ char *s, *key, *value;
+ const UConverterStaticData *prototype;
UConverterStaticData *staticData;
- char c;
if(U_FAILURE(*pErrorCode)) {
return;
}
- staticData=(UConverterStaticData *)mySharedData->staticData;
- staticData->conversionType=UCNV_UNSUPPORTED_CONVERTER;
+ staticData=&data->staticData;
staticData->platform=UCNV_IBM;
staticData->subCharLen=0;
while(T_FileStream_readLine(convFile, line, sizeof(line))) {
- /* remove comments and trailing CR and LF and remove whitespace from the end */
- for(end=line; (c=*end)!=0; ++end) {
- if(c=='#' || c=='\r' || c=='\n') {
- break;
- }
- }
- while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
- --end;
- }
- *end=0;
-
- /* skip leading white space and ignore empty lines */
- s=(char *)skipWhitespace(line);
- if(*s==0) {
+ /* basic parsing and handling of state-related items */
+ if(ucm_parseHeaderLine(data->ucm, line, &key, &value)) {
continue;
}
/* stop at the beginning of the mapping section */
- if(uprv_memcmp(s, "CHARMAP", 7)==0) {
+ if(uprv_strcmp(line, "CHARMAP")==0) {
break;
}
- /* get the key name, bracketed in <> */
- if(*s!='<') {
- fprintf(stderr, "error: no header field in line \"%s\"\n", line);
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
- key=++s;
- while(*s!='>') {
- if(*s==0) {
- fprintf(stderr, "error: incomplete header field in line \"%s\"\n", line);
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
- ++s;
- }
- *s=0;
-
- /* get the value string, possibly quoted */
- s=(char *)skipWhitespace(s+1);
- if(*s!='"') {
- value=s;
- } else {
- /* remove the quotes */
- value=s+1;
- if(end>value && *(end-1)=='"') {
- *--end=0;
- }
- }
-
/* collect the information from the header field, ignore unknown keys */
if(uprv_strcmp(key, "code_set_name")==0) {
if(*value!=0) {
- uprv_strcpy((char*)staticData->name, value);
+ uprv_strcpy((char *)staticData->name, value);
getPlatformAndCCSIDFromName(value, &staticData->platform, &staticData->codepage);
}
- } else if(uprv_strcmp(key, "uconv_class")==0) {
- const UConverterStaticData *prototype;
-
- if(uprv_strcmp(value, "DBCS")==0) {
- staticData->conversionType=UCNV_DBCS;
- } else if(uprv_strcmp(value, "SBCS")==0) {
- staticData->conversionType = UCNV_SBCS;
- } else if(uprv_strcmp(value, "MBCS")==0) {
- staticData->conversionType = UCNV_MBCS;
- } else if(uprv_strcmp(value, "EBCDIC_STATEFUL")==0) {
- staticData->conversionType = UCNV_EBCDIC_STATEFUL;
- } else {
- fprintf(stderr, "error: unknown %s\n", value);
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
-
- /* Now that we know the type, copy any 'default' values from the table. */
- prototype=ucnv_converterStaticData[staticData->conversionType];
- if(prototype!=NULL) {
- if(staticData->name[0]==0) {
- uprv_strcpy((char*)staticData->name, prototype->name);
- }
-
- if(staticData->codepage==0) {
- staticData->codepage = prototype->codepage;
- }
-
- if(staticData->platform==0) {
- staticData->platform = prototype->platform;
- }
-
- if(staticData->minBytesPerChar==0) {
- staticData->minBytesPerChar = prototype->minBytesPerChar;
- }
-
- if(staticData->maxBytesPerChar==0) {
- staticData->maxBytesPerChar = prototype->maxBytesPerChar;
- }
-
- if(staticData->subCharLen==0) {
- staticData->subCharLen=prototype->subCharLen;
- if(prototype->subCharLen>0) {
- uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
- }
- }
- }
- } else if(uprv_strcmp(key, "mb_cur_max")==0) {
- if('1'<=*value && *value<='4' && value[1]==0) {
- staticData->maxBytesPerChar=(int8_t)(*value-'0');
- } else {
- fprintf(stderr, "error: illegal %s\n", value);
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
- } else if(uprv_strcmp(key, "mb_cur_min")==0) {
- if('1'<=*value && *value<='4' && value[1]==0) {
- staticData->minBytesPerChar=(int8_t)(*value-'0');
- } else {
- fprintf(stderr, "error: illegal %s\n", value);
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
} else if(uprv_strcmp(key, "subchar")==0) {
- uint32_t bytes;
- int32_t length;
+ uint8_t bytes[UCNV_EXT_MAX_BYTES];
+ int8_t length;
- length=parseCodepageBytes(value, &bytes, (const char **)&end);
- if(length>0 && *end==0) {
- staticData->subCharLen=(int8_t)length;
- do {
- staticData->subChar[--length]=(uint8_t)bytes;
- bytes>>=8;
- } while(length>0);
+ s=value;
+ length=ucm_parseBytes(bytes, line, &s);
+ if(1<=length && length<=4 && *s==0) {
+ staticData->subCharLen=length;
+ uprv_memcpy(staticData->subChar, bytes, length);
} else {
fprintf(stderr, "error: illegal %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
} else if(uprv_strcmp(key, "subchar1")==0) {
- uint32_t bytes;
+ uint8_t bytes[UCNV_EXT_MAX_BYTES];
- if(1==parseCodepageBytes(value, &bytes, (const char **)&end) && *end==0) {
- staticData->subChar1=(uint8_t)bytes;
+ s=value;
+ if(1==ucm_parseBytes(bytes, line, &s) && *s==0) {
+ staticData->subChar1=bytes[0];
} else {
fprintf(stderr, "error: illegal %s\n", value);
*pErrorCode=U_INVALID_TABLE_FORMAT;
return;
}
- } else if(uprv_strcmp(key, "icu:state")==0) {
- /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
- switch(staticData->conversionType) {
- case UCNV_SBCS:
- case UCNV_DBCS:
- case UCNV_EBCDIC_STATEFUL:
- staticData->conversionType = UCNV_MBCS;
- break;
- case UCNV_MBCS:
- break;
- default:
- fprintf(stderr, "error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n");
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
-
- if(staticData->maxBytesPerChar==0) {
- fprintf(stderr, "error: <icu:state> before the <mb_cur_max> line\n");
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
- if(mySharedData->table==NULL) {
- mySharedData->table=(UConverterTable *)MBCSOpen(staticData->maxBytesPerChar);
- if(mySharedData->table==NULL) {
- *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
- return;
- }
- }
- if(!MBCSAddState((NewConverter *)mySharedData->table, value)) {
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- return;
- }
}
}
+ /* copy values from the UCMFile to the static data */
+ staticData->maxBytesPerChar=(int8_t)data->ucm->states.maxCharLength;
+ staticData->minBytesPerChar=(int8_t)data->ucm->states.minCharLength;
+ staticData->conversionType=data->ucm->states.conversionType;
+
+ /* ### TODO use UCNV_UNSUPPORTED_CONVERTER to indicate an extension-only file? */
+
if(staticData->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
+ fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
- } else if(staticData->conversionType==UCNV_MBCS && mySharedData->table==NULL) {
- fprintf(stderr, "error: missing state table information (<icu:state>) for MBCS\n");
- *pErrorCode=U_INVALID_TABLE_FORMAT;
- } else if(staticData->subChar1!=0 &&
- !staticData->conversionType==UCNV_MBCS &&
- !staticData->conversionType==UCNV_EBCDIC_STATEFUL
+ return;
+ }
+
+ /*
+ * Now that we know the type, copy any 'default' values from the table.
+ * We need not check the type any further because the parser only
+ * recognizes what we have prototypes for.
+ */
+ prototype=ucnv_converterStaticData[staticData->conversionType];
+ if(prototype!=NULL) {
+ if(staticData->name[0]==0) {
+ uprv_strcpy((char *)staticData->name, prototype->name);
+ }
+
+ if(staticData->codepage==0) {
+ staticData->codepage=prototype->codepage;
+ }
+
+ if(staticData->platform==0) {
+ staticData->platform=prototype->platform;
+ }
+
+ if(staticData->minBytesPerChar==0) {
+ staticData->minBytesPerChar=prototype->minBytesPerChar;
+ }
+
+ if(staticData->maxBytesPerChar==0) {
+ staticData->maxBytesPerChar=prototype->maxBytesPerChar;
+ }
+
+ if(staticData->subCharLen==0) {
+ staticData->subCharLen=prototype->subCharLen;
+ if(prototype->subCharLen>0) {
+ uprv_memcpy(staticData->subChar, prototype->subChar, prototype->subCharLen);
+ }
+ }
+ }
+
+ if(data->ucm->states.outputType<0) {
+ data->ucm->states.outputType=(int8_t)data->ucm->states.maxCharLength;
+ }
+
+ if( staticData->subChar1!=0 &&
+ (staticData->minBytesPerChar>1 ||
+ (staticData->conversionType!=UCNV_MBCS &&
+ staticData->conversionType!=UCNV_EBCDIC_STATEFUL))
) {
fprintf(stderr, "error: <subchar1> defined for a type other than MBCS or EBCDIC_STATEFUL\n");
*pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
-void loadTableFromFile(FileStream* convFile, UConverterSharedData* sharedData, UErrorCode* err)
-{
- char storageLine[200];
- char* line = NULL;
- UConverterStaticData *staticData=(UConverterStaticData *)sharedData->staticData;
- NewConverter *cnvData = (NewConverter *)sharedData->table;
- UChar32 unicodeValue, codepageValue;
- uint8_t mbcsBytes[8];
- int32_t mbcsLength;
- char codepointBytes[20];
- UBool isOK = TRUE;
- uint8_t precisionMask = 0, unicodeMask = 0;
- char endOfLine;
+static void
+readTable(ConvData *data, FileStream* convFile,
+ UBool forBase, UCMStates *baseStates,
+ UErrorCode *pErrorCode) {
+ char line[200];
+ char *end;
+ UBool isOK;
+
+ if(U_FAILURE(*pErrorCode)) {
+ return;
+ }
- if(cnvData->startMappings!=NULL)
- {
- if(!cnvData->startMappings(cnvData)) {
- *err = U_INVALID_TABLE_FORMAT;
- return;
+ isOK=TRUE;
+
+ for(;;) {
+ /* read the next line */
+ if(!T_FileStream_readLine(convFile, line, sizeof(line))) {
+ fprintf(stderr, "incomplete charmap section\n");
+ isOK=FALSE;
+ break;
}
- }
- if(cnvData->isValid!=NULL)
- {
- const uint8_t *p = staticData->subChar;
- codepageValue = 0;
- switch(staticData->subCharLen) {
- case 4: codepageValue = (codepageValue << 8) | *p++;
- case 3: codepageValue = (codepageValue << 8) | *p++;
- case 2: codepageValue = (codepageValue << 8) | *p++;
- case 1: codepageValue = (codepageValue << 8) | *p;
- default: break; /* must never occur */
+ /* remove CR LF */
+ end=uprv_strchr(line, 0);
+ while(line<end && (*(end-1)=='\r' || *(end-1)=='\n')) {
+ --end;
}
- if(!cnvData->isValid(cnvData, staticData->subChar, staticData->subCharLen, codepageValue)) {
- fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
- *err = U_INVALID_TABLE_FORMAT;
- isOK = FALSE;
+ *end=0;
+
+ /* ignore empty and comment lines */
+ if(line[0]==0 || line[0]=='#') {
+ continue;
}
- }
- staticData->hasFromUnicodeFallback = staticData->hasToUnicodeFallback = FALSE;
-
- while (T_FileStream_readLine(convFile, storageLine, sizeof(storageLine)))
- {
- removeComments(storageLine);
- line = storageLine;
- if (line[nextTokenOffset(line, NLTC_SEPARATORS)] != '\0')
- {
- /* get the Unicode code point */
- line = getToken(codepointBytes, line, UNICODE_CODEPOINT_SEPARATORS);
- if (uprv_strcmp(codepointBytes, "END") == 0)
- {
- break;
- }
- unicodeValue = (UChar32)T_CString_stringToInteger(codepointBytes, 16);
-
- /* get the codepage bytes */
- codepageValue = 0;
- mbcsLength = 0;
- do
- {
- line = getToken(codepointBytes, line, CODEPOINT_SEPARATORS);
- mbcsBytes[mbcsLength] = (uint8_t)T_CString_stringToInteger(codepointBytes, 16);
- codepageValue = codepageValue << 8 | mbcsBytes[mbcsLength++];
-
- /* End of line could be \0 or | (if fallback) */
- endOfLine= line[nextTokenOffset(line, CODEPOINT_SEPARATORS)];
- } while((endOfLine != '\0') && (endOfLine != FALLBACK_SEPARATOR));
-
- if(unicodeValue>=0x10000) {
- unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
- } else if(UTF_IS_SURROGATE(unicodeValue)) {
- unicodeMask|=UCNV_HAS_SURROGATES; /* there are single surrogates */
- }
-
- if((uint32_t)unicodeValue > 0x10ffff)
- {
- fprintf(stderr, "error: Unicode code point > U+10ffff in '%s'\n", storageLine);
- isOK = FALSE;
- }
- else if(endOfLine == FALLBACK_SEPARATOR)
- {
- /* we know that there is a fallback separator */
- precisionMask |= 1;
- line = uprv_strchr(line, FALLBACK_SEPARATOR) + 1;
- switch(*line)
- {
- case '0':
- /* set roundtrip mappings */
- isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0) &&
- cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 0);
- break;
- case '1':
- /* set only a fallback mapping from Unicode to codepage */
- staticData->hasFromUnicodeFallback = TRUE;
- isOK &= cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
- break;
- case '2':
- /* skip subchar mappings */
- break;
- case '3':
- /* set only a fallback mapping from codepage to Unicode */
- staticData->hasToUnicodeFallback = TRUE;
- isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, 1);
- break;
- default:
- fprintf(stderr, "error: illegal fallback indicator '%s' in '%s'\n", line - 1, storageLine);
- *err = U_INVALID_TABLE_FORMAT;
- break;
- }
- }
- else
- {
- precisionMask |= 2;
- /* set the mappings */
- isOK &= cnvData->addToUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1) &&
- cnvData->addFromUnicode(cnvData, mbcsBytes, mbcsLength, unicodeValue, codepageValue, -1);
- }
+ /* stop at the end of the mapping table */
+ if(0==uprv_strcmp(line, "END CHARMAP")) {
+ break;
}
+
+ isOK&=ucm_addMappingFromLine(data->ucm, line, forBase, baseStates);
}
- if(unicodeMask == 3)
- {
- fprintf(stderr, "warning: contains mappings to both supplementary code points and single surrogates\n");
- }
- staticData->unicodeMask = unicodeMask;
-
- if(cnvData->finishMappings!=NULL)
- {
- cnvData->finishMappings(cnvData, staticData);
- }
-
- if(!isOK)
- {
- *err = U_INVALID_TABLE_FORMAT;
- }
- else if(precisionMask == 3)
- {
- fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
- *err = U_INVALID_TABLE_FORMAT;
+ if(!isOK) {
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
}
}
-/*creates a UConverterStaticData, fills in necessary links to it the appropriate function pointers*/
-UConverterSharedData* createConverterFromTableFile(const char* converterName, UErrorCode* err)
-{
- FileStream* convFile = NULL;
- UConverterSharedData* mySharedData = NULL;
- UConverterStaticData* myStaticData = NULL;
+/* return TRUE if a base table was read, FALSE for an extension table */
+static UBool
+readFile(ConvData *data, const char* converterName,
+ UErrorCode *pErrorCode) {
+ char line[200];
+ char *end;
+ FileStream *convFile;
+ UBool dataIsBase;
- if (U_FAILURE(*err)) return NULL;
-
- convFile = T_FileStream_open(converterName, "r");
- if (convFile == NULL)
- {
- *err = U_FILE_ACCESS_ERROR;
- return NULL;
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
}
+ data->ucm=ucm_open();
- mySharedData = (UConverterSharedData*) uprv_malloc(sizeof(UConverterSharedData));
- if (mySharedData == NULL)
- {
- *err = U_MEMORY_ALLOCATION_ERROR;
- T_FileStream_close(convFile);
- return NULL;
+ convFile=T_FileStream_open(converterName, "r");
+ if(convFile==NULL) {
+ *pErrorCode=U_FILE_ACCESS_ERROR;
+ return FALSE;
}
- uprv_memset(mySharedData, 0, sizeof(UConverterSharedData));
-
- mySharedData->structSize = sizeof(UConverterSharedData);
-
- myStaticData = (UConverterStaticData*) uprv_malloc(sizeof(UConverterStaticData));
- if (myStaticData == NULL)
- {
- *err = U_MEMORY_ALLOCATION_ERROR;
- T_FileStream_close(convFile);
- return NULL;
+ readHeader(data, convFile, converterName, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
}
- uprv_memset(myStaticData, 0, sizeof(UConverterStaticData));
- mySharedData->staticData = myStaticData;
- myStaticData->structSize = sizeof(UConverterStaticData);
- /* mySharedData->staticDataOwned = FALSE; */ /* not owned if in udata */
- mySharedData->sharedDataCached = FALSE;
- mySharedData->dataMemory = NULL; /* for init */
+ if(data->ucm->baseName[0]==0) {
+ dataIsBase=TRUE;
+ ucm_processStates(&data->ucm->states);
- readHeaderFromFile(mySharedData, convFile, converterName, err);
-
- if (U_FAILURE(*err)) return NULL;
-
- switch (myStaticData->conversionType)
- {
- case UCNV_SBCS:
- {
- /* SBCS: use MBCS data structure with a default state table */
- if(mySharedData->staticData->maxBytesPerChar!=1) {
- fprintf(stderr, "error: SBCS codepage with max bytes/char!=1\n");
- *err = U_INVALID_TABLE_FORMAT;
- break;
+ /* read the base table */
+ readTable(data, convFile, TRUE, &data->ucm->states, pErrorCode);
+ if(U_FAILURE(*pErrorCode)) {
+ return FALSE;
}
- myStaticData->conversionType = UCNV_MBCS;
- if(mySharedData->table == NULL) {
- NewConverter *sharedDataTable = MBCSOpen(1);
- if(sharedDataTable != NULL) {
- if(!MBCSAddState(sharedDataTable, "0-ff")) {
- *err = U_INVALID_TABLE_FORMAT;
- sharedDataTable->close(sharedDataTable);
- } else {
- mySharedData->table = (UConverterTable *)sharedDataTable;
- }
- } else {
- *err = U_MEMORY_ALLOCATION_ERROR;
+
+ /* read an extension table if there is one */
+ while(T_FileStream_readLine(convFile, line, sizeof(line))) {
+ end=uprv_strchr(line, 0);
+ while(lineucm->states, pErrorCode);
+ break;
}
}
- break;
- }
- case UCNV_MBCS:
- {
- /* MBCSOpen() was called by readHeaderFromFile() */
- break;
- }
- case UCNV_EBCDIC_STATEFUL:
- {
- /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
- if(mySharedData->staticData->maxBytesPerChar!=2) {
- fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
- *err = U_INVALID_TABLE_FORMAT;
- break;
- }
- myStaticData->conversionType = UCNV_MBCS;
- if(mySharedData->table == NULL) {
- NewConverter *sharedDataTable = MBCSOpen(2);
- if(sharedDataTable != NULL) {
- if( !MBCSAddState(sharedDataTable, "0-ff, e:1.s, f:0.s") ||
- !MBCSAddState(sharedDataTable, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4") ||
- !MBCSAddState(sharedDataTable, "0-40:1.i, 41-fe:1., ff:1.i") ||
- !MBCSAddState(sharedDataTable, "0-ff:1.i, 40:1.") ||
- !MBCSAddState(sharedDataTable, "0-ff:1.i")
- ) {
- *err = U_INVALID_TABLE_FORMAT;
- sharedDataTable->close(sharedDataTable);
- } else {
- mySharedData->table = (UConverterTable *)sharedDataTable;
- }
- } else {
- *err = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- break;
- }
- case UCNV_DBCS:
- {
- /* DBCS: use MBCS data structure with a default state table */
- if(mySharedData->staticData->maxBytesPerChar!=2) {
- fprintf(stderr, "error: DBCS codepage with max bytes/char!=2\n");
- *err = U_INVALID_TABLE_FORMAT;
- break;
- }
- myStaticData->conversionType = UCNV_MBCS;
- if(mySharedData->table == NULL) {
- NewConverter *sharedDataTable = MBCSOpen(2);
- if(sharedDataTable != NULL) {
- if( !MBCSAddState(sharedDataTable, "0-3f:3, 40:2, 41-fe:1, ff:3") ||
- !MBCSAddState(sharedDataTable, "41-fe") ||
- !MBCSAddState(sharedDataTable, "40") ||
- !MBCSAddState(sharedDataTable, "")
- ) {
- *err = U_INVALID_TABLE_FORMAT;
- sharedDataTable->close(sharedDataTable);
- } else {
- mySharedData->table = (UConverterTable *)sharedDataTable;
- }
- } else {
- *err = U_MEMORY_ALLOCATION_ERROR;
- }
- }
- break;
- }
-
- default :
- fprintf(stderr, "error: omitted\n");
- *err = U_INVALID_TABLE_FORMAT;
- mySharedData->table = NULL;
- break;
- };
-
- if(U_SUCCESS(*err) && mySharedData->table != NULL)
- {
- loadTableFromFile(convFile, mySharedData, err);
+ } else {
+ /* read only the extension table */
+ dataIsBase=FALSE;
+ readTable(data, convFile, FALSE, NULL, pErrorCode);
}
T_FileStream_close(convFile);
- return mySharedData;
+ if(data->ucm->base->flagsType==UCM_FLAGS_MIXED || data->ucm->ext->flagsType==UCM_FLAGS_MIXED) {
+ fprintf(stderr, "error: some entries have the mapping precision (with '|'), some do not\n");
+ *pErrorCode=U_INVALID_TABLE_FORMAT;
+ }
+
+ return dataIsBase;
+}
+
+/*
+ * Read the .ucm file named by converterName into *data and build the
+ * converter tables from it.
+ *
+ * If the file contains a base table, an MBCS builder is opened for it and,
+ * when extension mappings are present, an extension builder as well.
+ * If the file contains only an extension table, the base file is read into
+ * a temporary ConvData so that the extension mappings can be checked against it.
+ *
+ * On failure *pErrorCode is set; the caller remains responsible for
+ * cleaning up *data.
+ */
+static void
+createConverter(ConvData *data, const char* converterName, UErrorCode *pErrorCode) {
+    ConvData baseData;
+    UBool dataIsBase;
+
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    initConvData(data);
+
+    /* ### TODO if there is an extension table:
+        1. the base table must use precision flags
+        2. check base vs. extension for mappings overlap
+    */
+    dataIsBase=readFile(data, converterName, pErrorCode);
+    if(U_FAILURE(*pErrorCode)) {
+        return;
+    }
+
+    initConvData(&baseData);
+
+    if(dataIsBase) {
+        data->cnvData=MBCSOpen(data->ucm);
+        if(data->cnvData==NULL) {
+            *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+
+        } else if(!data->cnvData->isValid(data->cnvData,
+                        data->staticData.subChar, data->staticData.subCharLen)
+        ) {
+            fprintf(stderr, " the substitution character byte sequence is illegal in this codepage structure!\n");
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
+
+        } else if(data->ucm->ext->mappingsLength>0) {
+            /* prepare the extension table, if there is one */
+            data->extData=CnvExtOpen(data->ucm);
+            if(data->extData==NULL) {
+                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+
+            } else if(
+                !ucm_checkBaseExt(&data->ucm->states, data->ucm->base, data->ucm->ext, TRUE) ||
+                !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+            ) {
+                *pErrorCode=U_INVALID_TABLE_FORMAT;
+            }
+        }
+
+        /* add the base table after ucm_checkBaseExt()! */
+        if( U_SUCCESS(*pErrorCode) &&
+            !data->cnvData->addTable(data->cnvData, data->ucm->base, &data->staticData)
+        ) {
+            *pErrorCode=U_INVALID_TABLE_FORMAT;
+        }
+    } else {
+        /* ### TODO assemble a path/filename for data->ucm->states.baseName */
+        /* must be TRUE */readFile(&baseData, ""/*extConverterName*/, pErrorCode);
+        /* ### TODO read extension table */
+        /* ### TODO - actually write the mappings into genmbcs or into ext */
+
+        /* do not check against a base table that could not be read */
+        if(U_SUCCESS(*pErrorCode)) {
+            /* the extension builder is not opened anywhere else on this path */
+            if(data->extData==NULL) {
+                data->extData=CnvExtOpen(data->ucm);
+            }
+
+            if(data->extData==NULL) {
+                *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+            } else if(
+                !ucm_checkValidity(data->ucm->ext, &baseData.ucm->states) ||
+                !ucm_checkBaseExt(&baseData.ucm->states, baseData.ucm->base, data->ucm->ext, FALSE) ||
+                !data->extData->addTable(data->extData, data->ucm->ext, &data->staticData)
+            ) {
+                *pErrorCode=U_INVALID_TABLE_FORMAT;
+            }
+        }
+    }
+
+    cleanupConvData(&baseData);
 }
/*
diff --git a/icu4c/source/tools/makeconv/makeconv.dsp b/icu4c/source/tools/makeconv/makeconv.dsp
index c1506d20a9..058d4a14f1 100644
--- a/icu4c/source/tools/makeconv/makeconv.dsp
+++ b/icu4c/source/tools/makeconv/makeconv.dsp
@@ -183,6 +183,10 @@ SOURCE="$(InputPath)"
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
# Begin Source File
+SOURCE=.\gencnvex.c
+# End Source File
+# Begin Source File
+
SOURCE=.\genmbcs.c
# End Source File
# Begin Source File
diff --git a/icu4c/source/tools/makeconv/makeconv.h b/icu4c/source/tools/makeconv/makeconv.h
index cb4825b324..bb6c500432 100644
--- a/icu4c/source/tools/makeconv/makeconv.h
+++ b/icu4c/source/tools/makeconv/makeconv.h
@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
-* Copyright (C) 2000-2001, International Business Machines
+* Copyright (C) 2000-2003, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -20,10 +20,19 @@
#include "unicode/utypes.h"
#include "ucnv_bld.h"
#include "unewdata.h"
+#include "ucm.h"
/* exports from makeconv.c */
U_CFUNC UBool VERBOSE;
+/* converter table type for writing */
+/* NOTE(review): presumably the values passed as the tableType argument of NewConverter.write() - confirm against callers */
+enum {
+ TABLE_NONE,
+ TABLE_BASE,
+ TABLE_EXT,
+ TABLE_BASE_AND_EXT
+};
+
/* abstract converter generator struct, C++ - style */
struct NewConverter;
typedef struct NewConverter NewConverter;
@@ -32,32 +41,17 @@ struct NewConverter {
void
(*close)(NewConverter *cnvData);
- UBool
- (*startMappings)(NewConverter *cnvData);
-
/** is this byte sequence valid? */
UBool
(*isValid)(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- uint32_t b);
+ const uint8_t *bytes, int32_t length);
UBool
- (*addToUnicode)(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
-
- UBool
- (*addFromUnicode)(NewConverter *cnvData,
- const uint8_t *bytes, int32_t length,
- UChar32 c, uint32_t b,
- int8_t isFallback);
-
- void
- (*finishMappings)(NewConverter *cnvData, const UConverterStaticData *staticData);
+ (*addTable)(NewConverter *cnvData, UCMTable *table, UConverterStaticData *staticData);
uint32_t
- (*write)(NewConverter *cnvData, const UConverterStaticData *staticData, UNewDataMemory *pData);
+ (*write)(NewConverter *cnvData, const UConverterStaticData *staticData,
+ UNewDataMemory *pData, int32_t tableType);
};
#endif
diff --git a/icu4c/source/tools/makeconv/makeconv.vcproj b/icu4c/source/tools/makeconv/makeconv.vcproj
index 393cf36175..cb9dadb272 100644
--- a/icu4c/source/tools/makeconv/makeconv.vcproj
+++ b/icu4c/source/tools/makeconv/makeconv.vcproj
@@ -132,6 +132,9 @@
+
+
diff --git a/icu4c/source/tools/toolutil/Makefile.in b/icu4c/source/tools/toolutil/Makefile.in
index b41a9e9582..b47038d5cd 100644
--- a/icu4c/source/tools/toolutil/Makefile.in
+++ b/icu4c/source/tools/toolutil/Makefile.in
@@ -38,7 +38,7 @@ DYNAMICCXXFLAGS = $(SHAREDLIBCXXFLAGS)
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(top_srcdir)/tools/ctestfw $(LIBCPPFLAGS)
LIBS = $(LIBICUUC) $(DEFAULT_LIBS)
-OBJECTS = toolutil.o unewdata.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
+OBJECTS = toolutil.o unewdata.o ucm.o ucmstate.o ucmpwrit.o uoptions.o uparse.o ucbuf.o uperf.o
STATIC_OBJECTS = $(OBJECTS:.o=.$(STATIC_O))
diff --git a/icu4c/source/tools/toolutil/toolutil.c b/icu4c/source/tools/toolutil/toolutil.c
index 900a43a0d0..5118c7876b 100644
--- a/icu4c/source/tools/toolutil/toolutil.c
+++ b/icu4c/source/tools/toolutil/toolutil.c
@@ -26,6 +26,7 @@
# define NOMCX
# include
#endif
+#include
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "cmemory.h"
@@ -73,3 +74,117 @@ findBasename(const char *filename) {
return filename;
}
}
+
+/* tool memory helper ------------------------------------------------------- */
+
+/*
+ * Growable array of fixed-size items used by the tools.
+ * Capacities and index count items; size is the number of bytes per item.
+ */
+typedef struct UToolMemory {
+ /* label used in error messages; copied with uprv_strcpy() in utm_open() */
+ char name[64];
+ /* capacity: items that fit in array now; maxCapacity: hard limit; size: bytes per item; index: items handed out so far */
+ int32_t capacity, maxCapacity, size, index;
+ /* current storage: either staticArray or a heap block obtained when growing */
+ void *array;
+ /* inline storage that is part of this struct; utm_close() never frees it separately */
+ UAlignedMemory staticArray[1];
+} UToolMemory;
+
+U_CAPI UToolMemory * U_EXPORT2
+utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size) {
+ UToolMemory *mem;
+
+ if(maxCapacityarray=mem->staticArray;
+
+ uprv_strcpy(mem->name, name);
+ mem->capacity=initialCapacity;
+ mem->maxCapacity=maxCapacity;
+ mem->size=size;
+ mem->index=0;
+ return mem;
+}
+
+/* Release a UToolMemory object together with its heap array, if it has one; NULL is ignored. */
+U_CAPI void U_EXPORT2
+utm_close(UToolMemory *mem) {
+    if(mem==NULL) {
+        return;
+    }
+
+    /* staticArray lives inside *mem, so only a separately allocated array is freed here */
+    if(mem->array!=mem->staticArray) {
+        uprv_free(mem->array);
+    }
+    uprv_free(mem);
+}
+
+
+/*
+ * Return the address of the first item in the array.
+ * The array can be replaced when it grows, so the pointer must be
+ * fetched again after further allocations.
+ */
+U_CAPI void * U_EXPORT2
+utm_getStart(UToolMemory *mem) {
+ return (char *)mem->array;
+}
+
+/* Return the number of items allocated so far (the index of the next free item). */
+U_CAPI int32_t U_EXPORT2
+utm_countItems(UToolMemory *mem) {
+ return mem->index;
+}
+
+
+static UBool
+utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
+ if(mem->capacitymaxCapacityname, (long)mem->maxCapacity);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+
+ /* try to allocate a larger array */
+ if(capacity>=2*mem->capacity) {
+ newCapacity=capacity;
+ } else if(mem->capacity<=mem->maxCapacity/3) {
+ newCapacity=2*mem->capacity;
+ } else {
+ newCapacity=mem->maxCapacity;
+ }
+
+ if(mem->array==mem->staticArray) {
+ mem->array=uprv_malloc(newCapacity*mem->size);
+ if(mem->array!=NULL) {
+ uprv_memcpy(mem->array, mem->staticArray, mem->index*mem->size);
+ }
+ } else {
+ mem->array=uprv_realloc(mem->array, newCapacity*mem->size);
+ }
+
+ if(mem->array==NULL) {
+ fprintf(stderr, "error: %s - out of memory\n", mem->name);
+ exit(U_MEMORY_ALLOCATION_ERROR);
+ }
+ }
+
+ return TRUE;
+}
+
+/*
+ * Allocate one more item, zero it, and return a pointer to it.
+ *
+ * The item address is computed only after utm_hasCapacity() has run:
+ * growing the array may move it, so an address taken beforehand
+ * could point into the old, released storage.
+ */
+U_CAPI void * U_EXPORT2
+utm_alloc(UToolMemory *mem) {
+    char *p=NULL;
+    int32_t oldIndex=mem->index;
+    int32_t newIndex=oldIndex+1;
+    if(utm_hasCapacity(mem, newIndex)) {
+        p=(char *)mem->array+oldIndex*mem->size;
+        mem->index=newIndex;
+        uprv_memset(p, 0, mem->size);
+    }
+    return p;
+}
+
+/*
+ * Allocate n consecutive items, zero them, and return a pointer to the first one.
+ *
+ * The item address is computed only after utm_hasCapacity() has run:
+ * growing the array may move it, so an address taken beforehand
+ * could point into the old, released storage.
+ */
+U_CAPI void * U_EXPORT2
+utm_allocN(UToolMemory *mem, int32_t n) {
+    char *p=NULL;
+    int32_t oldIndex=mem->index;
+    int32_t newIndex=oldIndex+n;
+    if(utm_hasCapacity(mem, newIndex)) {
+        p=(char *)mem->array+oldIndex*mem->size;
+        mem->index=newIndex;
+        uprv_memset(p, 0, n*mem->size);
+    }
+    return p;
+}
diff --git a/icu4c/source/tools/toolutil/toolutil.dsp b/icu4c/source/tools/toolutil/toolutil.dsp
index 6cc54d8846..78aefa5c4a 100644
--- a/icu4c/source/tools/toolutil/toolutil.dsp
+++ b/icu4c/source/tools/toolutil/toolutil.dsp
@@ -163,10 +163,18 @@ SOURCE=.\ucbuf.c
# End Source File
# Begin Source File
+SOURCE=.\ucm.c
+# End Source File
+# Begin Source File
+
SOURCE=.\ucmpwrit.c
# End Source File
# Begin Source File
+SOURCE=.\ucmstate.c
+# End Source File
+# Begin Source File
+
SOURCE=.\unewdata.c
# End Source File
# Begin Source File
@@ -195,6 +203,10 @@ SOURCE=.\ucbuf.h
# End Source File
# Begin Source File
+SOURCE=.\ucm.h
+# End Source File
+# Begin Source File
+
SOURCE=.\ucmpwrit.h
# End Source File
# Begin Source File
diff --git a/icu4c/source/tools/toolutil/toolutil.h b/icu4c/source/tools/toolutil/toolutil.h
index 2326339668..7cde28e620 100644
--- a/icu4c/source/tools/toolutil/toolutil.h
+++ b/icu4c/source/tools/toolutil/toolutil.h
@@ -20,8 +20,7 @@
#define __TOOLUTIL_H__
#include "unicode/utypes.h"
-
-
+#include "cmemory.h"
/*
* For Windows, a path/filename may be the short (8.3) version
@@ -51,4 +50,55 @@ getLongPathname(const char *pathname);
U_CAPI const char * U_EXPORT2
findBasename(const char *filename);
+/*
+ * UToolMemory is used for generic, custom memory management.
+ * It is allocated with enough space for count*size bytes starting
+ * at array.
+ * The array is declared with a union of large data types so
+ * that its base address is aligned for any types.
+ * If size is a multiple of a data type size, then such items
+ * can be safely allocated inside the array, at offsets that
+ * are themselves multiples of size.
+ */
+struct UToolMemory;
+typedef struct UToolMemory UToolMemory;
+
+/**
+ * Open a UToolMemory object for allocation of initialCapacity to maxCapacity
+ * items with size bytes each.
+ */
+U_CAPI UToolMemory * U_EXPORT2
+utm_open(const char *name, int32_t initialCapacity, int32_t maxCapacity, int32_t size);
+
+/**
+ * Close a UToolMemory object.
+ */
+U_CAPI void U_EXPORT2
+utm_close(UToolMemory *mem);
+
+/**
+ * Get the pointer to the beginning of the array of items.
+ * The pointer becomes invalid after allocation of new items.
+ */
+U_CAPI void * U_EXPORT2
+utm_getStart(UToolMemory *mem);
+
+/**
+ * Get the current number of items.
+ */
+U_CAPI int32_t U_EXPORT2
+utm_countItems(UToolMemory *mem);
+
+/**
+ * Allocate one more item and return the pointer to its start in the array.
+ */
+U_CAPI void * U_EXPORT2
+utm_alloc(UToolMemory *mem);
+
+/**
+ * Allocate n items and return the pointer to the start of the first one in the array.
+ */
+U_CAPI void * U_EXPORT2
+utm_allocN(UToolMemory *mem, int32_t n);
+
#endif
diff --git a/icu4c/source/tools/toolutil/toolutil.vcproj b/icu4c/source/tools/toolutil/toolutil.vcproj
index 2e7ab8a315..f626978006 100644
--- a/icu4c/source/tools/toolutil/toolutil.vcproj
+++ b/icu4c/source/tools/toolutil/toolutil.vcproj
@@ -136,9 +136,15 @@
+
+
+
+
@@ -161,6 +167,9 @@
+
+
diff --git a/icu4c/source/tools/toolutil/ucm.c b/icu4c/source/tools/toolutil/ucm.c
new file mode 100644
index 0000000000..f4acb60d2e
--- /dev/null
+++ b/icu4c/source/tools/toolutil/ucm.c
@@ -0,0 +1,910 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucm.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun20
+* created by: Markus W. Scherer
+*
+* This file reads a .ucm file, stores its mappings and sorts them.
+* It implements handling of Unicode conversion mappings from .ucm files
+* for makeconv, canonucm, rptp2ucm, etc.
+*
+* Unicode code point sequences with a length of more than 1,
+* as well as byte sequences with more than 4 bytes or more than one complete
+* character sequence are handled to support m:n mappings.
+*/
+
+#include "unicode/utypes.h"
+#include "unicode/ustring.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "uarrsort.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "uparse.h"
+#include "ucm.h"
+#include
+
+/* -------------------------------------------------------------------------- */
+
+/*
+### TODO
+allow file without fallback indicators for backward compatibility
+only for makeconv
+must not sort such mappings
+disallow when using extension tables because that requires sorting
+
+rptp2ucm has its own mapping parser and sets all-|1 and |3 mappings; normalization function generates |0 and |2
+
+*/
+
+/*
+ * Print one mapping to f in .ucm syntax:
+ * the code points as <UXXXX>, a space, the bytes as \xXX,
+ * and, if the mapping has a precision flag (f>=0), " |f".
+ *
+ * codePoints and bytes are passed separately because they may be stored
+ * outside of the UCMapping itself.
+ */
+static void
+printMapping(UCMapping *m, UChar32 *codePoints, uint8_t *bytes, FILE *f) {
+    int32_t j;
+
+    for(j=0; j<m->uLen; ++j) {
+        /* cast so that the argument matches the %lX conversion */
+        fprintf(f, "<U%04lX>", (unsigned long)codePoints[j]);
+    }
+
+    fputc(' ', f);
+
+    for(j=0; j<m->bLen; ++j) {
+        fprintf(f, "\\x%02X", bytes[j]);
+    }
+
+    if(m->f>=0) {
+        /* f is a small signed integer; print it as an int */
+        fprintf(f, " |%d\n", (int)m->f);
+    } else {
+        fputs("\n", f);
+    }
+}
+
+/* Print mapping m, which belongs to table, to f in .ucm syntax. */
+U_CAPI void U_EXPORT2
+ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f) {
+    UChar32 *codePoints=UCM_GET_CODE_POINTS(table, m);
+    uint8_t *bytes=UCM_GET_BYTES(table, m);
+
+    printMapping(m, codePoints, bytes, f);
+}
+
+U_CAPI void U_EXPORT2
+ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode) {
+ UCMapping *m;
+ int32_t i, length;
+
+ m=table->mappings;
+ length=table->mappingsLength;
+ if(byUnicode) {
+ for(i=0; ireverseMap;
+ for(i=0; iuLen==1 && r->uLen==1) {
+ /* compare two single code points */
+ return l->u-r->u;
+ }
+
+ /* get pointers to the code point sequences */
+ lu=UCM_GET_CODE_POINTS(lTable, l);
+ ru=UCM_GET_CODE_POINTS(rTable, r);
+
+ /* get the minimum length */
+ if(l->uLen<=r->uLen) {
+ length=l->uLen;
+ } else {
+ length=r->uLen;
+ }
+
+ /* compare the code points */
+ for(i=0; iuLen-r->uLen;
+}
+
+static int32_t
+compareBytes(UCMTable *lTable, const UCMapping *l,
+ UCMTable *rTable, const UCMapping *r,
+ UBool lexical) {
+ const uint8_t *lb, *rb;
+ int32_t result, i, length;
+
+ /*
+ * A lexical comparison is used for sorting in the builder, to allow
+ * an efficient search for a byte sequence that could be a prefix
+ * of a previously entered byte sequence.
+ *
+ * Comparing by lengths first is for compatibility with old .ucm tools
+ * like canonucm and rptp2ucm.
+ */
+ if(lexical) {
+ /* get the minimum length and continue */
+ if(l->bLen<=r->bLen) {
+ length=l->bLen;
+ } else {
+ length=r->bLen;
+ }
+ } else {
+ /* compare lengths first */
+ result=l->bLen-r->bLen;
+ if(result!=0) {
+ return result;
+ } else {
+ length=l->bLen;
+ }
+ }
+
+ /* get pointers to the byte sequences */
+ lb=UCM_GET_BYTES(lTable, l);
+ rb=UCM_GET_BYTES(rTable, r);
+
+ /* compare the bytes */
+ for(i=0; ibLen-r->bLen;
+}
+
+/*
+ * Three-way comparison of two UCMappings of the same table, for sorting.
+ * uFirst selects the primary key: Unicode (then bytes compared by length
+ * first) or bytes compared lexically (then Unicode).
+ * Mappings that are equal on both keys are ordered by their flags.
+ */
+static int32_t
+compareMappings(UCMTable *table, const void *left, const void *right, UBool uFirst) {
+    const UCMapping *lm=(const UCMapping *)left;
+    const UCMapping *rm=(const UCMapping *)right;
+    int32_t cmp;
+
+    if(!uFirst) {
+        /* bytes then Unicode; lexically, for builder */
+        cmp=compareBytes(table, lm, table, rm, TRUE);
+        if(cmp!=0) {
+            return cmp;
+        }
+        cmp=compareUnicode(table, lm, table, rm);
+    } else {
+        /* Unicode then bytes; not lexically, like canonucm */
+        cmp=compareUnicode(table, lm, table, rm);
+        if(cmp!=0) {
+            return cmp;
+        }
+        cmp=compareBytes(table, lm, table, rm, FALSE);
+    }
+
+    if(cmp!=0) {
+        return cmp;
+    }
+
+    /* same code points and bytes: order by the flags */
+    return lm->f-rm->f;
+}
+
+/* sorting by Unicode first sorts mappings directly */
+/* uprv_sortArray() comparator: context is the UCMTable, left/right point to UCMappings */
+static int32_t
+compareMappingsUnicodeFirst(const void *context, const void *left, const void *right) {
+ return compareMappings((UCMTable *)context, left, right, TRUE);
+}
+
+/*
+ * uprv_sortArray() comparator for the reverseMap:
+ * left and right point to int32_t indexes into table->mappings,
+ * and the mappings they select are compared bytes first.
+ */
+static int32_t
+compareMappingsBytesFirst(const void *context, const void *left, const void *right) {
+    UCMTable *table=(UCMTable *)context;
+    const int32_t *leftIndex=(const int32_t *)left;
+    const int32_t *rightIndex=(const int32_t *)right;
+
+    return compareMappings(table,
+                           table->mappings+*leftIndex,
+                           table->mappings+*rightIndex,
+                           FALSE);
+}
+
+/*
+ * Sort a table in two ways:
+ * 1. the mappings array itself is sorted by Unicode first
+ * 2. reverseMap becomes an array of indexes into mappings,
+ *    sorted by bytes first
+ *
+ * Exits the program if the reverseMap cannot be allocated or sorting fails.
+ */
+U_CAPI void U_EXPORT2
+ucm_sortTable(UCMTable *t) {
+    UErrorCode errorCode;
+    int32_t i;
+
+    errorCode=U_ZERO_ERROR;
+
+    /* 1. sort by Unicode first */
+    uprv_sortArray(t->mappings, t->mappingsLength, sizeof(UCMapping),
+                   compareMappingsUnicodeFirst, t,
+                   FALSE, &errorCode);
+
+    /* build the reverseMap */
+    if(t->reverseMap==NULL) {
+        /*
+         * allocate mappingsCapacity instead of mappingsLength so that
+         * if mappings are added, the reverseMap need not be
+         * reallocated each time
+         * (see moveMappings() and ucm_addMapping())
+         */
+        t->reverseMap=(int32_t *)uprv_malloc(t->mappingsCapacity*sizeof(int32_t));
+        if(t->reverseMap==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate reverseMap\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+    /* start with the identity permutation */
+    for(i=0; i<t->mappingsLength; ++i) {
+        t->reverseMap[i]=i;
+    }
+
+    /* 2. sort reverseMap by mappings bytes first */
+    uprv_sortArray(t->reverseMap, t->mappingsLength, sizeof(int32_t),
+                   compareMappingsBytesFirst, t,
+                   FALSE, &errorCode);
+
+    /* one check covers both sorts: errorCode is shared between the two calls */
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "ucm error: sortTable()/uprv_sortArray() fails - %s\n",
+                u_errorName(errorCode));
+        exit(errorCode);
+    }
+}
+
+/*
+
+### TODO normalization function for a table (in or for rptp2ucm)
+sort table
+if there are mappings with the same code points and bytes but |1 and |3, merge them into one |0 (or make |2 where necessary)
+if mappings were merged, sort again
+-> for rptp2ucm
+
+*/
+
+/* lookups ------------------------------------------------------------------ */
+
+/*
+### TODO lookups?
+
+binary search for first mapping with some code point or byte sequence
+check if a code point is the first of any mapping (RT or FB)
+check if a byte sequence is a prefix of any mapping (RT or RFB)
+check if there is a mapping with the same source units; return whether the target is same or different
+
+*/
+
+/*
+ * Temporary marker bits that the base/extension checks OR into UCMapping.f;
+ * moveMappings() acts on them and restores the original flag value.
+ */
+enum {
+ /* the mapping is to be added to the extension table */
+ MOVE_TO_EXT=0x10,
+ /* the mapping is to be dropped from its table */
+ REMOVE_MAPPING=0x20,
+ /* mask of both marker bits */
+ MOVE_ANY=0x30
+};
+
+/*
+ * move mappings with MOVE_ANY ored into their flags from the base table
+ * to the extension table
+ *
+ * A marked mapping is always removed from base. It is added to ext only
+ * if ext is not NULL and the mapping carries MOVE_TO_EXT.
+ * Removal overwrites the mapping with the last one in the array,
+ * so the affected tables are sorted again afterwards.
+ */
+static void
+moveMappings(UCMTable *base, UCMTable *ext) {
+    UCMapping *mb, *mbLimit;
+    int8_t flag;
+    UBool didMove;
+
+    mb=base->mappings;
+    mbLimit=mb+base->mappingsLength;
+    didMove=FALSE;
+
+    while(mb<mbLimit) {
+        flag=mb->f;
+        if(flag&MOVE_ANY) {
+            /* restore the original flag value */
+            mb->f=flag&~MOVE_ANY;
+            didMove=TRUE;
+
+            if(ext!=NULL && (flag&MOVE_TO_EXT)) {
+                /* add the mapping to the extension table */
+                ucm_addMapping(ext, mb, UCM_GET_CODE_POINTS(base, mb), UCM_GET_BYTES(base, mb));
+            }
+
+            /* move the last base mapping down and overwrite the current one */
+            if(mb<(mbLimit-1)) {
+                uprv_memcpy(mb, mbLimit-1, sizeof(UCMapping));
+            }
+            --mbLimit;
+            --base->mappingsLength;
+            /* do not advance mb: the mapping that was moved down is examined next */
+        } else {
+            ++mb;
+        }
+    }
+
+    if(didMove) {
+        ucm_sortTable(base);
+        ucm_printTable(base, stdout, TRUE); puts(""); /* ### TODO */
+        if(ext!=NULL) {
+            ucm_sortTable(ext);
+            ucm_printTable(ext, stdout, TRUE); puts(""); /* ### TODO */
+        }
+    }
+}
+
+/* result bits of checkBaseExtUnicode() and checkBaseExtBytes(), tested in ucm_checkBaseExt() */
+enum {
+ /* at least one mapping was marked with MOVE_TO_EXT or REMOVE_MAPPING */
+ NEEDS_MOVE=1,
+ /* an error was reported; ucm_checkBaseExt() returns FALSE */
+ HAS_ERRORS=2
+};
+
+/*
+ * Compare the from-Unicode side of a base table with an extension table.
+ * Both tables must be sorted by Unicode (see ucm_sortTable()).
+ * Only mappings with flags |0..|2 are considered.
+ *
+ * - A base mapping whose code point sequence is a proper prefix of an
+ *   extension mapping's is marked MOVE_TO_EXT if moveToExt is TRUE,
+ *   otherwise it is reported as an error.
+ * - An extension mapping with the same input and the same output and flag
+ *   as a base mapping is marked REMOVE_MAPPING; the same input with a
+ *   different output is an error.
+ *
+ * Returns a combination of NEEDS_MOVE and HAS_ERRORS.
+ */
+static uint8_t
+checkBaseExtUnicode(UCMTable *base, UCMTable *ext, UBool moveToExt) {
+    UCMapping *mb, *me, *mbLimit, *meLimit;
+    int32_t cmp;
+    uint8_t result;
+
+    mb=base->mappings;
+    mbLimit=mb+base->mappingsLength;
+
+    me=ext->mappings;
+    meLimit=me+ext->mappingsLength;
+
+    result=0;
+
+    for(;;) {
+        /* skip irrelevant mappings on both sides */
+        for(;;) {
+            if(mb==mbLimit) {
+                return result;
+            }
+
+            if(0<=mb->f && mb->f<=2) {
+                break;
+            }
+
+            ++mb;
+        }
+
+        for(;;) {
+            if(me==meLimit) {
+                return result;
+            }
+
+            if(0<=me->f && me->f<=2) {
+                break;
+            }
+
+            ++me;
+        }
+
+        /* compare the base and extension mappings */
+        cmp=compareUnicode(base, mb, ext, me);
+        if(cmp<0) {
+            /* does mb map from an input sequence that is a prefix of me's? */
+            if( mb->uLen<me->uLen &&
+                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
+            ) {
+                if(moveToExt) {
+                    /* mark this mapping to be moved to the extension table */
+                    mb->f|=MOVE_TO_EXT;
+                    result|=NEEDS_MOVE;
+                } else {
+                    fprintf(stderr,
+                            "ucm error: the base table contains a mapping whose input sequence\n"
+                            " is a prefix of the input sequence of an extension mapping\n");
+                    ucm_printMapping(base, mb, stderr);
+                    ucm_printMapping(ext, me, stderr);
+                    /* nothing was marked, so this is an error, as in checkBaseExtBytes() */
+                    result|=HAS_ERRORS;
+                }
+            }
+
+            ++mb;
+        } else if(cmp==0) {
+            /*
+             * same output: remove the extension mapping,
+             * otherwise treat as an error
+             */
+            if( mb->f==me->f && mb->bLen==me->bLen &&
+                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+            ) {
+                me->f|=REMOVE_MAPPING;
+                result|=NEEDS_MOVE;
+            } else {
+                fprintf(stderr,
+                        "ucm error: the base table contains a mapping whose input sequence\n"
+                        " is the same as the input sequence of an extension mapping\n"
+                        " but it maps differently\n");
+                ucm_printMapping(base, mb, stderr);
+                ucm_printMapping(ext, me, stderr);
+                result|=HAS_ERRORS;
+            }
+
+            ++mb;
+        } else /* cmp>0 */ {
+            ++me;
+        }
+    }
+}
+
+/*
+ * Compare the to-Unicode side of a base table with an extension table.
+ * Both tables must have a reverseMap sorted by bytes (see ucm_sortTable()).
+ * Only mappings with flags |0 and |3 are considered.
+ *
+ * - A base mapping whose byte sequence is a proper prefix of an
+ *   extension mapping's is marked MOVE_TO_EXT if moveToExt is TRUE,
+ *   otherwise it is reported as an error.
+ * - An extension mapping with the same input and the same output and flag
+ *   as a base mapping is marked REMOVE_MAPPING; the same input with a
+ *   different output is an error.
+ *
+ * Returns a combination of NEEDS_MOVE and HAS_ERRORS.
+ */
+static uint8_t
+checkBaseExtBytes(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
+    UCMapping *mb, *me;
+    int32_t *baseMap, *extMap;
+    int32_t b, e, bLimit, eLimit, cmp;
+    uint8_t result;
+    UBool isSISO;
+
+    baseMap=base->reverseMap;
+    extMap=ext->reverseMap;
+
+    b=e=0;
+    bLimit=base->mappingsLength;
+    eLimit=ext->mappingsLength;
+
+    result=0;
+
+    isSISO=(UBool)(baseStates->outputType==MBCS_OUTPUT_2_SISO);
+
+    for(;;) {
+        /* skip irrelevant mappings on both sides */
+        for(;;) {
+            if(b==bLimit) {
+                return result;
+            }
+            mb=base->mappings+baseMap[b];
+
+            if(mb->f==0 || mb->f==3) {
+                break;
+            }
+
+            ++b;
+        }
+
+        for(;;) {
+            if(e==eLimit) {
+                return result;
+            }
+            me=ext->mappings+extMap[e];
+
+            if(me->f==0 || me->f==3) {
+                break;
+            }
+
+            ++e;
+        }
+
+        /* compare the base and extension mappings */
+        cmp=compareBytes(base, mb, ext, me, TRUE);
+        if(cmp<0) {
+            /*
+             * does mb map from an input sequence that is a prefix of me's?
+             * for SI/SO tables, a single byte is never a prefix because it
+             * occurs in a separate single-byte state
+             */
+            if( mb->bLen<me->bLen &&
+                (!isSISO || mb->bLen>1) &&
+                0==uprv_memcmp(UCM_GET_BYTES(base, mb), UCM_GET_BYTES(ext, me), mb->bLen)
+            ) {
+                if(moveToExt) {
+                    /* mark this mapping to be moved to the extension table */
+                    mb->f|=MOVE_TO_EXT;
+                    result|=NEEDS_MOVE;
+                } else {
+                    fprintf(stderr,
+                            "ucm error: the base table contains a mapping whose input sequence\n"
+                            " is a prefix of the input sequence of an extension mapping\n");
+                    ucm_printMapping(base, mb, stderr);
+                    ucm_printMapping(ext, me, stderr);
+                    result|=HAS_ERRORS;
+                }
+            }
+
+            ++b;
+        } else if(cmp==0) {
+            /*
+             * same output: remove the extension mapping,
+             * otherwise treat as an error
+             */
+            if( mb->f==me->f && mb->uLen==me->uLen &&
+                0==uprv_memcmp(UCM_GET_CODE_POINTS(base, mb), UCM_GET_CODE_POINTS(ext, me), 4*mb->uLen)
+            ) {
+                me->f|=REMOVE_MAPPING;
+                result|=NEEDS_MOVE;
+            } else {
+                fprintf(stderr,
+                        "ucm error: the base table contains a mapping whose input sequence\n"
+                        " is the same as the input sequence of an extension mapping\n"
+                        " but it maps differently\n");
+                ucm_printMapping(base, mb, stderr);
+                ucm_printMapping(ext, me, stderr);
+                result|=HAS_ERRORS;
+            }
+
+            ++b;
+        } else /* cmp>0 */ {
+            ++e;
+        }
+    }
+}
+
+U_CAPI UBool U_EXPORT2
+ucm_checkValidity(UCMTable *table, UCMStates *baseStates) {
+ UCMapping *m, *mLimit;
+ int32_t count;
+ UBool isOK;
+
+ m=table->mappings;
+ mLimit=m+table->mappingsLength;
+ isOK=TRUE;
+
+ while(mbLen);
+ if(count<1) {
+ ucm_printMapping(table, m, stderr);
+ isOK=FALSE;
+ }
+ ++m;
+ }
+
+ return isOK;
+}
+
+/*
+ * Check a base table against an extension table and reconcile them.
+ *
+ * Both tables must use explicit precision flags. They are sorted, then
+ * checked from the Unicode side and from the bytes side. Mappings that the
+ * checks marked are removed from ext or moved from base to ext.
+ *
+ * moveToExt: if TRUE, base mappings that conflict as prefixes are moved to
+ * the extension table; if FALSE, they are reported as errors.
+ *
+ * Returns FALSE if flags are missing or either check reported an error.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt) {
+    uint8_t result;
+
+    /* if we have an extension table, we must always use precision flags */
+    if(base->flagsType!=UCM_FLAGS_EXPLICIT || ext->flagsType!=UCM_FLAGS_EXPLICIT) {
+        fprintf(stderr, "ucm error: the base or extension table contains mappings without precision flags\n");
+        return FALSE;
+    }
+
+    /* checking requires both tables to be sorted */
+    ucm_sortTable(base);
+    ucm_sortTable(ext);
+
+    /*
+     * check
+     *
+     * The two checks are separate statements so that they always run in
+     * this order: each ORs marker bits into UCMapping.f, and each selects
+     * the mappings it looks at by the value of f, so the outcome of the
+     * second check depends on what the first one marked.
+     */
+    result=checkBaseExtUnicode(base, ext, moveToExt);
+    result=(uint8_t)(result|checkBaseExtBytes(baseStates, base, ext, moveToExt));
+
+    if(result&HAS_ERRORS) {
+        return FALSE;
+    }
+
+    if(result&NEEDS_MOVE) {
+        /* first drop the marked extension mappings, then move base mappings over */
+        moveMappings(ext, NULL);
+        moveMappings(base, ext);
+    }
+
+    return TRUE;
+}
+
+/* ucm parser --------------------------------------------------------------- */
+
+/*
+ * Parse a sequence of \xXX bytes, optionally separated by '+', starting at *ps.
+ *
+ * bytes receives up to UCNV_EXT_MAX_BYTES values.
+ * line is the whole input line and is used only for error messages.
+ * On success *ps is advanced past the last byte and the number of bytes
+ * (possibly 0) is returned. On a format error or too many bytes,
+ * a message is printed, *ps is left unchanged and -1 is returned.
+ */
+U_CAPI int8_t U_EXPORT2
+ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps) {
+    const char *cursor=*ps;
+    char *numberEnd;
+    int8_t count=0;
+    UBool isHexByte;
+
+    for(;;) {
+        /* skip an optional plus sign */
+        if(count>0 && *cursor=='+') {
+            ++cursor;
+        }
+
+        /* anything but a backslash ends the byte sequence */
+        if(*cursor!='\\') {
+            *ps=cursor;
+            return count;
+        }
+
+        if(count==UCNV_EXT_MAX_BYTES) {
+            fprintf(stderr, "ucm error: too many bytes on \"%s\"\n", line);
+            return -1;
+        }
+
+        /* require 'x' and a number that ends exactly two characters after it */
+        isHexByte=FALSE;
+        if(cursor[1]=='x') {
+            bytes[count]=(uint8_t)uprv_strtoul(cursor+2, &numberEnd, 16);
+            isHexByte=(UBool)(numberEnd==cursor+4);
+        }
+        if(!isHexByte) {
+            fprintf(stderr, "ucm error: byte must be formatted as \\xXX (2 hex digits) - \"%s\"\n", line);
+            return -1;
+        }
+
+        ++count;
+        cursor=numberEnd;
+    }
+}
+
+/*
+ * parse a mapping line; must not be empty
+ *
+ * Expected form: one or more <UXXXX> code points (optionally joined by '+'),
+ * whitespace, one or more \xXX bytes, and an optional |0..|3 precision flag.
+ *
+ * On success, m->uLen, m->bLen and m->f are set (f is -1 without a flag),
+ * m->u is set for a single code point, and m->b.bytes is filled when there
+ * are at most 4 bytes. The full sequences are returned in codePoints[] and bytes[].
+ * On a syntax error a message is printed to stderr and FALSE is returned.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseMappingLine(UCMapping *m,
+                     UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+                     uint8_t bytes[UCNV_EXT_MAX_BYTES],
+                     const char *line) {
+    const char *s;
+    char *end;
+    int32_t u16Length;
+    int8_t uLen, bLen, f;
+
+    s=line;
+    uLen=bLen=0;
+
+    /* parse code points */
+    for(;;) {
+        /* skip an optional plus sign */
+        if(uLen>0 && *s=='+') {
+            ++s;
+        }
+        if(*s!='<') {
+            break;
+        }
+
+        if(uLen==UCNV_EXT_MAX_UCHARS) {
+            fprintf(stderr, "ucm error: too many code points on \"%s\"\n", line);
+            return FALSE;
+        }
+        if( s[1]!='U' ||
+            (codePoints[uLen]=(UChar32)uprv_strtoul(s+2, &end, 16), end)==s+2 ||
+            *end!='>'
+        ) {
+            fprintf(stderr, "ucm error: Unicode code point must be formatted as <UXXXX> (1..6 hex digits) - \"%s\"\n", line);
+            return FALSE;
+        }
+        if((uint32_t)codePoints[uLen]>0x10ffff || U_IS_SURROGATE(codePoints[uLen])) {
+            fprintf(stderr, "ucm error: Unicode code point must be 0..d7ff or e000..10ffff - \"%s\"\n", line);
+            return FALSE;
+        }
+        ++uLen;
+        s=end+1;
+    }
+
+    if(uLen==0) {
+        fprintf(stderr, "ucm error: no Unicode code points on \"%s\"\n", line);
+        return FALSE;
+    } else if(uLen==1) {
+        m->u=codePoints[0];
+    } else {
+        /* preflight the UTF-16 length; a buffer overflow status is expected here */
+        UErrorCode errorCode=U_ZERO_ERROR;
+        u_strFromUTF32(NULL, 0, &u16Length, codePoints, uLen, &errorCode);
+        if( (U_FAILURE(errorCode) && errorCode!=U_BUFFER_OVERFLOW_ERROR) ||
+            u16Length>UCNV_EXT_MAX_UCHARS
+        ) {
+            fprintf(stderr, "ucm error: too many UChars on \"%s\"\n", line);
+            return FALSE;
+        }
+    }
+
+    s=u_skipWhitespace(s);
+
+    /* parse bytes */
+    bLen=ucm_parseBytes(bytes, line, &s);
+
+    if(bLen<0) {
+        return FALSE;
+    } else if(bLen==0) {
+        fprintf(stderr, "ucm error: no bytes on \"%s\"\n", line);
+        return FALSE;
+    } else if(bLen<=4) {
+        /* short byte sequences are stored in the mapping itself */
+        uprv_memcpy(m->b.bytes, bytes, bLen);
+    }
+
+    /* skip everything until the fallback indicator, even the start of a comment */
+    for(;;) {
+        if(*s==0) {
+            f=-1; /* no fallback indicator */
+            break;
+        } else if(*s=='|') {
+            f=(int8_t)(s[1]-'0');
+            /* the unsigned comparison also rejects characters below '0' */
+            if((uint8_t)f>3) {
+                fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
+                return FALSE;
+            }
+            break;
+        }
+        ++s;
+    }
+
+    m->uLen=uLen;
+    m->bLen=bLen;
+    m->f=f;
+    return TRUE;
+}
+
+/* general APIs ------------------------------------------------------------- */
+
+/*
+ * Allocate a new, empty UCMTable with all members zeroed.
+ * Prints a message and exits the program if the allocation fails,
+ * so the result is never NULL.
+ */
+U_CAPI UCMTable * U_EXPORT2
+ucm_openTable() {
+    UCMTable *newTable;
+
+    newTable=(UCMTable *)uprv_malloc(sizeof(*newTable));
+    if(newTable==NULL) {
+        fprintf(stderr, "ucm error: unable to allocate a UCMTable\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+
+    /* memset() returns its first argument */
+    return (UCMTable *)memset(newTable, 0, sizeof(*newTable));
+}
+
+/*
+ * Free a UCMTable together with the arrays that it owns
+ * (mappings, codePoints, bytes and reverseMap).
+ * A NULL table is ignored.
+ */
+U_CAPI void U_EXPORT2
+ucm_closeTable(UCMTable *table) {
+    if(table==NULL) {
+        return;
+    }
+
+    uprv_free(table->mappings);
+    uprv_free(table->codePoints);
+    uprv_free(table->bytes);
+    uprv_free(table->reverseMap);
+    uprv_free(table);
+}
+
+/*
+ * Append a copy of the mapping *m to the table.
+ *
+ * - The mappings array grows as needed (1000 entries at first, then 10x);
+ *   growing it invalidates and frees the table's reverseMap.
+ * - If m->uLen>1, the code points are copied into table->codePoints
+ *   and m->u is overwritten with their start index there.
+ * - If m->bLen>4, the bytes are copied into table->bytes
+ *   and m->b.index is overwritten with their start index there.
+ * - table->unicodeMask and table->flagsType are updated from the mapping.
+ *
+ * The codePoints and bytes side arrays have a fixed capacity of 10000 each.
+ * Prints a message and exits the program if memory cannot be allocated
+ * or if one of the side arrays overflows.
+ */
+U_CAPI void U_EXPORT2
+ucm_addMapping(UCMTable *table,
+               UCMapping *m,
+               UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+               uint8_t bytes[UCNV_EXT_MAX_BYTES]) {
+    UCMapping *tm;
+    UChar32 c;
+    int32_t index;
+
+    if(table->mappingsLength>=table->mappingsCapacity) {
+        /* make the mappings array larger */
+        if(table->mappingsCapacity==0) {
+            table->mappingsCapacity=1000;
+        } else {
+            table->mappingsCapacity*=10;
+        }
+        /* the program exits on failure, so the old pointer need not be preserved */
+        table->mappings=(UCMapping *)uprv_realloc(table->mappings,
+                                                  table->mappingsCapacity*sizeof(UCMapping));
+        if(table->mappings==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d UCMappings\n",
+                            table->mappingsCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        if(table->reverseMap!=NULL) {
+            /* the reverseMap must be reallocated in a new sort */
+            uprv_free(table->reverseMap);
+            table->reverseMap=NULL;
+        }
+    }
+
+    /* allocate the side array for multi-code point sequences on first use */
+    if(m->uLen>1 && table->codePointsCapacity==0) {
+        table->codePointsCapacity=10000;
+        table->codePoints=(UChar32 *)uprv_malloc(table->codePointsCapacity*sizeof(UChar32));
+        if(table->codePoints==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d UChar32s\n",
+                            table->codePointsCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    /* allocate the side array for byte sequences longer than 4 on first use */
+    if(m->bLen>4 && table->bytesCapacity==0) {
+        table->bytesCapacity=10000;
+        table->bytes=(uint8_t *)uprv_malloc(table->bytesCapacity);
+        if(table->bytes==NULL) {
+            fprintf(stderr, "ucm error: unable to allocate %d bytes\n",
+                            table->bytesCapacity);
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+    }
+
+    if(m->uLen>1) {
+        index=table->codePointsLength;
+        table->codePointsLength+=m->uLen;
+        if(table->codePointsLength>table->codePointsCapacity) {
+            fprintf(stderr, "ucm error: too many code points in multiple-code point mappings\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        uprv_memcpy(table->codePoints+index, codePoints, m->uLen*sizeof(UChar32));
+        m->u=index;
+    }
+
+    if(m->bLen>4) {
+        index=table->bytesLength;
+        table->bytesLength+=m->bLen;
+        if(table->bytesLength>table->bytesCapacity) {
+            fprintf(stderr, "ucm error: too many bytes in mappings with >4 charset bytes\n");
+            exit(U_MEMORY_ALLOCATION_ERROR);
+        }
+
+        uprv_memcpy(table->bytes+index, bytes, m->bLen);
+        m->b.index=index;
+    }
+
+    /* set unicodeMask */
+    for(index=0; index<m->uLen; ++index) {
+        c=codePoints[index];
+        if(c>=0x10000) {
+            table->unicodeMask|=UCNV_HAS_SUPPLEMENTARY; /* there are supplementary code points */
+        } else if(U_IS_SURROGATE(c)) {
+            table->unicodeMask|=UCNV_HAS_SURROGATES; /* there are surrogate code points */
+        }
+    }
+
+    /* set flagsType */
+    if(m->f<0) {
+        table->flagsType|=UCM_FLAGS_IMPLICIT;
+    } else {
+        table->flagsType|=UCM_FLAGS_EXPLICIT;
+    }
+
+    /* store the mapping itself, after u and b have been turned into indexes as needed */
+    tm=table->mappings+table->mappingsLength++;
+    uprv_memcpy(tm, m, sizeof(UCMapping));
+}
+
+/*
+ * Allocate a new UCMFile with empty base and extension tables.
+ * The state information starts out with state 0 flagged as a direct (initial)
+ * state, no conversion type, no output type,
+ * and minimum and maximum character lengths of 1.
+ * Prints a message and exits the program if the allocation fails.
+ */
+U_CAPI UCMFile * U_EXPORT2
+ucm_open() {
+    UCMFile *file;
+    UCMStates *st;
+
+    file=(UCMFile *)uprv_malloc(sizeof(*file));
+    if(file==NULL) {
+        fprintf(stderr, "ucm error: unable to allocate a UCMFile\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    memset(file, 0, sizeof(*file));
+
+    file->base=ucm_openTable();
+    file->ext=ucm_openTable();
+
+    st=&file->states;
+    st->stateFlags[0]=MBCS_STATE_FLAG_DIRECT;
+    st->conversionType=UCNV_UNSUPPORTED_CONVERTER;
+    st->outputType=-1;
+    st->maxCharLength=1;
+    st->minCharLength=1;
+
+    return file;
+}
+
+/*
+ * Free a UCMFile that was created with ucm_open(),
+ * including its base and extension tables.
+ * The tables are released with ucm_closeTable() so that the arrays
+ * that they own (mappings, codePoints, bytes, reverseMap) are freed as well.
+ * A NULL ucm is ignored.
+ */
+U_CAPI void U_EXPORT2
+ucm_close(UCMFile *ucm) {
+    if(ucm!=NULL) {
+        ucm_closeTable(ucm->base);
+        ucm_closeTable(ucm->ext);
+        uprv_free(ucm);
+    }
+}
+
+/*
+ * Parse one mapping line and add the mapping to either the base table
+ * or the extension table of the UCMFile.
+ *
+ * If baseStates!=NULL, the bytes are checked against these states first;
+ * an illegal byte sequence gets the mapping printed to stderr.
+ *
+ * @return FALSE if the line cannot be parsed or the bytes are illegal
+ */
+U_CAPI UBool U_EXPORT2
+ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates) {
+    UCMapping m={ 0 };
+    UChar32 codePoints[UCNV_EXT_MAX_UCHARS];
+    uint8_t bytes[UCNV_EXT_MAX_BYTES];
+    UCMTable *target;
+    int32_t count;
+
+    if(!ucm_parseMappingLine(&m, codePoints, bytes, line)) {
+        return FALSE;
+    }
+
+    if(baseStates==NULL) {
+        /* not used - adding a mapping for an extension-only table before its base table is read */
+        count=0;
+    } else {
+        /* check validity of the bytes and count the characters in them */
+        count=ucm_countChars(baseStates, bytes, m.bLen);
+        if(count<1) {
+            /* illegal byte sequence */
+            printMapping(&m, codePoints, bytes, stderr);
+            return FALSE;
+        }
+    }
+
+    /*
+     * The extension table is the default destination.
+     * The base table is used only if this is requested
+     * and it is a 1:1 mapping (one code point, one character in the bytes),
+     * except for
+     * - |2 SUB mappings with a single byte in a multi-byte charset
+     * - |1 fallbacks from something other than U+0000 to 0x00
+     * which always go to the extension table.
+     */
+    target=ucm->ext;
+    if(forBase && m.uLen==1 && count==1) {
+        UBool isSingleByteSub, isFallbackToNul;
+
+        isSingleByteSub=(UBool)(m.f==2 && m.bLen==1 && ucm->states.maxCharLength>1);
+        isFallbackToNul=(UBool)(m.f==1 && m.bLen==1 && bytes[0]==0 && !(m.uLen==1 && codePoints[0]==0));
+        if(!isSingleByteSub && !isFallbackToNul) {
+            target=ucm->base;
+        }
+    }
+
+    ucm_addMapping(target, &m, codePoints, bytes);
+    return TRUE;
+}
diff --git a/icu4c/source/tools/toolutil/ucm.h b/icu4c/source/tools/toolutil/ucm.h
new file mode 100644
index 0000000000..b58eb343c0
--- /dev/null
+++ b/icu4c/source/tools/toolutil/ucm.h
@@ -0,0 +1,217 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucm.h
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003jun20
+* created by: Markus W. Scherer
+*
+* Definitions for the .ucm file parser and handler module ucm.c.
+*/
+
+#ifndef __UCM_H__
+#define __UCM_H__
+
+#include "unicode/utypes.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include
+
+U_CDECL_BEGIN
+
+/*
+ * Per-mapping data structure
+ *
+ * u if uLen==1: Unicode code point
+ * else index to uLen code points (in UCMTable.codePoints)
+ * b if bLen<=4: up to 4 bytes
+ * else index to bLen bytes (in UCMTable.bytes)
+ * uLen number of code points
+ * bLen number of charset bytes
+ * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
+ * same values as in the source file after |
+ * -1 if the source line has no | fallback indicator
+ *
+ * Use UCM_GET_CODE_POINTS() and UCM_GET_BYTES() to access the sequences
+ * regardless of whether they are stored inline or via an index.
+ */
+typedef struct UCMapping {
+ UChar32 u;
+ union {
+ uint32_t index;
+ uint8_t bytes[4];
+ } b;
+ int8_t uLen, bLen, f;
+} UCMapping;
+
+/*
+ * Values for UCMTable.flagsType.
+ * ucm_addMapping() combines UCM_FLAGS_EXPLICIT and UCM_FLAGS_IMPLICIT with |=,
+ * so that a table with both kinds of mappings ends up with UCM_FLAGS_MIXED;
+ * the order of these constants must therefore not be changed.
+ */
+enum {
+ UCM_FLAGS_INITIAL, /* no mappings parsed yet */
+ UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */
+ UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */
+ UCM_FLAGS_MIXED /* both implicit and explicit */
+};
+
+/*
+ * A table of mappings, filled by ucm_addMapping().
+ * All arrays are owned by the table and freed by ucm_closeTable().
+ */
+typedef struct UCMTable {
+ /* growable array of mappings */
+ UCMapping *mappings;
+ int32_t mappingsCapacity, mappingsLength;
+
+ /* code point sequences of mappings with uLen>1, indexed by UCMapping.u */
+ UChar32 *codePoints;
+ int32_t codePointsCapacity, codePointsLength;
+
+ /* byte sequences of mappings with bLen>4, indexed by UCMapping.b.index */
+ uint8_t *bytes;
+ int32_t bytesCapacity, bytesLength;
+
+ /* index map for mapping by bytes first */
+ /* ucm_addMapping() frees it and sets it to NULL when the mappings array is reallocated */
+ int32_t *reverseMap;
+
+ /* UCNV_HAS_SUPPLEMENTARY and UCNV_HAS_SURROGATES bits, set by ucm_addMapping() */
+ uint8_t unicodeMask;
+ int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */
+} UCMTable;
+
+/*
+ * Values for UCMStates.stateFlags[].
+ * The state type is stored in the low 4 bits (tested with &0xf):
+ * MBCS_STATE_FLAG_DIRECT is set by the "initial" directive of a state row,
+ * MBCS_STATE_FLAG_SURROGATES by the "surrogates" directive.
+ * MBCS_STATE_FLAG_READY is a separate bit that marks a state
+ * whose stateOffsetSum[] has been computed.
+ */
+enum {
+ MBCS_STATE_FLAG_DIRECT=1,
+ MBCS_STATE_FLAG_SURROGATES,
+
+ MBCS_STATE_FLAG_READY=16
+};
+
+/*
+ * State table information for a charset, collected from the .ucm header
+ * and completed by ucm_processStates().
+ */
+typedef struct UCMStates {
+ /* one row of 256 entries (one per byte value) for each state */
+ int32_t stateTable[MBCS_MAX_STATE_COUNT][256];
+ /* per-state MBCS_STATE_FLAG_ values and per-state sums of code unit offsets */
+ uint32_t stateFlags[MBCS_MAX_STATE_COUNT],
+ stateOffsetSum[MBCS_MAX_STATE_COUNT];
+
+ /* countToUCodeUnits is the total of the offsets, rounded up to an even number */
+ int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits;
+ /* conversionType is UCNV_UNSUPPORTED_CONVERTER and outputType is -1 until set from the header */
+ int8_t conversionType, outputType;
+} UCMStates;
+
+/*
+ * Contents of a .ucm file: base and extension mapping tables,
+ * the state table information, and the name of the base table
+ * as given in the icu:base header field (empty if there is none).
+ * Create with ucm_open() and release with ucm_close().
+ */
+typedef struct UCMFile {
+ UCMTable *base, *ext;
+ UCMStates states;
+
+ char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH];
+} UCMFile;
+
+/* simple accesses ---------------------------------------------------------- */
+
+/*
+ * Get a pointer to the code points of mapping m in table t:
+ * either the single code point stored in the mapping itself,
+ * or the sequence in the table's codePoints array.
+ * Both macro arguments are evaluated more than once.
+ */
+#define UCM_GET_CODE_POINTS(t, m) \
+ (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u)
+
+/*
+ * Get a pointer to the bytes of mapping m in table t:
+ * either the up to 4 bytes stored in the mapping itself,
+ * or the sequence in the table's bytes array.
+ * Both macro arguments are evaluated more than once.
+ */
+#define UCM_GET_BYTES(t, m) \
+ (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.index)
+
+/* APIs --------------------------------------------------------------------- */
+
+U_CAPI UCMFile * U_EXPORT2
+ucm_open(void);
+
+U_CAPI void U_EXPORT2
+ucm_close(UCMFile *ucm);
+
+U_CAPI UBool U_EXPORT2
+ucm_parseHeaderLine(UCMFile *ucm,
+ char *line, char **pKey, char **pValue);
+
+U_CAPI UBool U_EXPORT2
+ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates);
+
+
+U_CAPI UCMTable * U_EXPORT2
+ucm_openTable(void);
+
+U_CAPI void U_EXPORT2
+ucm_closeTable(UCMTable *table);
+
+U_CAPI void U_EXPORT2
+ucm_sortTable(UCMTable *t);
+
+/**
+ * Check the validity of mappings against a base table's states;
+ * necessary for extension-only tables that were read before their base tables.
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkValidity(UCMTable *ext, UCMStates *baseStates);
+
+/**
+ * Check a base table against an extension table.
+ * Set moveToExt=TRUE for where base and extension tables are parsed
+ * from a single file,
+ * and moveToExt=FALSE for where the extension table is in a separate file.
+ *
+ * For both tables in the same file, the extension table is automatically
+ * built.
+ * For separate files, the extension file can use a complete mapping table,
+ * so that common mappings need not be stripped out manually.
+ *
+ *
+ * Sort both tables, and then for each mapping direction:
+ *
+ * If the base table contains a mapping for which the input sequence is
+ * the same as the extension input, then
+ * - if the output is the same: remove the extension mapping
+ * - else: error
+ *
+ * If the base table contains a mapping for which the input sequence is
+ * a prefix of the extension input, then
+ * - if moveToExt: move the base mapping to the extension table
+ * - else: error
+ *
+ * @return FALSE in case of an irreparable error
+ */
+U_CAPI UBool U_EXPORT2
+ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, UBool moveToExt);
+
+U_CAPI void U_EXPORT2
+ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode);
+
+U_CAPI void U_EXPORT2
+ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f);
+
+
+U_CAPI void U_EXPORT2
+ucm_addState(UCMStates *states, const char *s);
+
+U_CAPI void U_EXPORT2
+ucm_processStates(UCMStates *states);
+
+U_CAPI int32_t U_EXPORT2
+ucm_countChars(UCMStates *states,
+ const uint8_t *bytes, int32_t length);
+
+
+U_CAPI int8_t U_EXPORT2
+ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps);
+
+U_CAPI UBool U_EXPORT2
+ucm_parseMappingLine(UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES],
+ const char *line);
+
+U_CAPI void U_EXPORT2
+ucm_addMapping(UCMTable *table,
+ UCMapping *m,
+ UChar32 codePoints[UCNV_EXT_MAX_UCHARS],
+ uint8_t bytes[UCNV_EXT_MAX_BYTES]);
+
+/* very makeconv-specific functions ----------------------------------------- */
+
+/* finalize and optimize states after the toUnicode mappings are processed */
+U_CAPI void U_EXPORT2
+ucm_optimizeStates(UCMStates *states,
+ uint16_t **pUnicodeCodeUnits,
+ _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ UBool verbose);
+
+/* moved here because it is used inside ucmstate.c */
+U_CAPI int32_t U_EXPORT2
+ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ uint32_t offset);
+
+U_CDECL_END
+
+#endif
diff --git a/icu4c/source/tools/toolutil/ucmstate.c b/icu4c/source/tools/toolutil/ucmstate.c
new file mode 100644
index 0000000000..ccc43a6f8b
--- /dev/null
+++ b/icu4c/source/tools/toolutil/ucmstate.c
@@ -0,0 +1,1042 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2003, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: ucmstate.c
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2003oct09
+* created by: Markus W. Scherer
+*
+* This file handles ICU .ucm file state information as part of the ucm module.
+* Most of this code used to be in makeconv.c.
+*/
+
+#include "unicode/utypes.h"
+#include "cstring.h"
+#include "cmemory.h"
+#include "uarrsort.h"
+#include "ucnvmbcs.h"
+#include "ucnv_ext.h"
+#include "uparse.h"
+#include "ucm.h"
+#include
+
+/* MBCS state handling ------------------------------------------------------ */
+
+/*
+ * state table row grammar (ebnf-style):
+ * (whitespace is allowed between all tokens)
+ *
+ * row=[[firstentry ','] entry (',' entry)*]
+ * firstentry="initial" | "surrogates"
+ * (initial state (default for state 0), output is all surrogate pairs)
+ * entry=range [':' nextstate] ['.' action]
+ * range=number ['-' number]
+ * nextstate=number
+ * (0..7f)
+ * action='u' | 's' | 'p' | 'i'
+ * (unassigned, state change only, surrogate pair, illegal)
+ * number=(1- or 2-digit hexadecimal number)
+ */
+static const char *
+parseState(const char *s, int32_t state[256], uint32_t *pFlags) {
+ const char *t;
+ uint32_t start, end, i;
+ int32_t entry;
+
+ /* initialize the state: all illegal with U+ffff */
+ for(i=0; i<256; ++i) {
+ state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff);
+ }
+
+ /* skip leading white space */
+ s=u_skipWhitespace(s);
+
+ /* is there an "initial" or "surrogates" directive? */
+ if(uprv_strncmp("initial", s, 7)==0) {
+ *pFlags=MBCS_STATE_FLAG_DIRECT;
+ s=u_skipWhitespace(s+7);
+ if(*s++!=',') {
+ return s-1;
+ }
+ } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) {
+ *pFlags=MBCS_STATE_FLAG_SURROGATES;
+ s=u_skipWhitespace(s+10);
+ if(*s++!=',') {
+ return s-1;
+ }
+ } else if(*s==0) {
+ /* empty state row: all-illegal */
+ return NULL;
+ }
+
+ for(;;) {
+ /* read an entry, the start of the range first */
+ s=u_skipWhitespace(s);
+ start=uprv_strtoul(s, (char **)&t, 16);
+ if(s==t || 0xffcountStates==MBCS_MAX_STATE_COUNT) {
+ fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT);
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ error=parseState(s, states->stateTable[states->countStates],
+ &states->stateFlags[states->countStates]);
+ if(error!=NULL) {
+ fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error);
+ exit(U_INVALID_TABLE_FORMAT);
+ }
+
+ ++states->countStates;
+}
+
+/*
+ * Parse one line from the header section of a .ucm file.
+ *
+ * The line is modified in place: comments, line terminators and trailing
+ * white space are cut off, and the key and value are NUL-terminated.
+ * Known keys (uconv_class, mb_cur_max, mb_cur_min, icu:state, icu:base)
+ * are stored into the UCMFile.
+ * Prints a message and exits the program on a malformed line or value.
+ *
+ * @param pKey   receives a pointer to the key name inside line
+ * @param pValue receives a pointer to the value inside line, quotes removed
+ * @return TRUE if the line was empty or its key was handled here;
+ *         FALSE at the beginning of the mapping section (CHARMAP,
+ *         *pKey and *pValue are not set in this case)
+ *         or if the key is unknown so that the caller can handle it
+ */
+U_CAPI UBool U_EXPORT2
+ucm_parseHeaderLine(UCMFile *ucm,
+                    char *line, char **pKey, char **pValue) {
+    UCMStates *states;
+    char *s, *end;
+    char c;
+
+    states=&ucm->states;
+
+    /* remove comments and trailing CR and LF and remove whitespace from the end */
+    for(end=line; (c=*end)!=0; ++end) {
+        if(c=='#' || c=='\r' || c=='\n') {
+            break;
+        }
+    }
+    while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) {
+        --end;
+    }
+    *end=0;
+
+    /* skip leading white space and ignore empty lines */
+    s=(char *)u_skipWhitespace(line);
+    if(*s==0) {
+        return TRUE;
+    }
+
+    /*
+     * stop at the beginning of the mapping section;
+     * use a string comparison so that a line shorter than "CHARMAP"
+     * is not read beyond its terminating NUL
+     */
+    if(uprv_strncmp(s, "CHARMAP", 7)==0) {
+        return FALSE;
+    }
+
+    /* get the key name, bracketed in <> */
+    if(*s!='<') {
+        fprintf(stderr, "ucm error: no header field in line \"%s\"\n", line);
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+    *pKey=++s;
+    while(*s!='>') {
+        if(*s==0) {
+            fprintf(stderr, "ucm error: incomplete header field in line \"%s\"\n", line);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        ++s;
+    }
+    *s=0;
+
+    /* get the value string, possibly quoted */
+    s=(char *)u_skipWhitespace(s+1);
+    if(*s!='"') {
+        *pValue=s;
+    } else {
+        /* remove the quotes */
+        *pValue=s+1;
+        if(end>*pValue && *(end-1)=='"') {
+            *--end=0;
+        }
+    }
+
+    /* collect the information from the header field, ignore unknown keys */
+    if(uprv_strcmp(*pKey, "uconv_class")==0) {
+        if(uprv_strcmp(*pValue, "DBCS")==0) {
+            states->conversionType=UCNV_DBCS;
+        } else if(uprv_strcmp(*pValue, "SBCS")==0) {
+            states->conversionType = UCNV_SBCS;
+        } else if(uprv_strcmp(*pValue, "MBCS")==0) {
+            states->conversionType = UCNV_MBCS;
+        } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) {
+            states->conversionType = UCNV_EBCDIC_STATEFUL;
+        } else {
+            fprintf(stderr, "ucm error: unknown %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return TRUE;
+    } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) {
+        /* exactly one digit 1..4; this also determines the preliminary output type */
+        c=**pValue;
+        if('1'<=c && c<='4' && (*pValue)[1]==0) {
+            states->maxCharLength=(int8_t)(c-'0');
+            states->outputType=states->maxCharLength-1;
+        } else {
+            fprintf(stderr, "ucm error: illegal %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return TRUE;
+    } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) {
+        /* exactly one digit 1..4 */
+        c=**pValue;
+        if('1'<=c && c<='4' && (*pValue)[1]==0) {
+            states->minCharLength=(int8_t)(c-'0');
+        } else {
+            fprintf(stderr, "ucm error: illegal %s\n", *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        return TRUE;
+    } else if(uprv_strcmp(*pKey, "icu:state")==0) {
+        /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */
+        switch(states->conversionType) {
+        case UCNV_SBCS:
+        case UCNV_DBCS:
+        case UCNV_EBCDIC_STATEFUL:
+            states->conversionType=UCNV_MBCS;
+            break;
+        case UCNV_MBCS:
+            break;
+        default:
+            fprintf(stderr, "ucm error: entry for non-MBCS table or before the line\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+
+        if(states->maxCharLength==0) {
+            fprintf(stderr, "ucm error: before the line\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        ucm_addState(states, *pValue);
+        return TRUE;
+    } else if(uprv_strcmp(*pKey, "icu:base")==0) {
+        if(**pValue==0) {
+            fprintf(stderr, "ucm error: without a base table name\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        /* the name is copied into a fixed-size buffer: reject names that do not fit */
+        if(uprv_strlen(*pValue)>=sizeof(ucm->baseName)) {
+            fprintf(stderr, "ucm error: base table name is too long (maximum %d characters) - \"%s\"\n",
+                            (int)(sizeof(ucm->baseName)-1), *pValue);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        uprv_strcpy(ucm->baseName, *pValue);
+        return TRUE;
+    }
+
+    return FALSE;
+}
+
+/* post-processing ---------------------------------------------------------- */
+
+/*
+ * Compute the code unit offsets for all entries of the state table.
+ *
+ * For every state that is not yet flagged MBCS_STATE_FLAG_READY,
+ * the final entries get consecutive offsets (1 unit for VALID_16,
+ * 2 units for VALID_16_PAIR), and each transition entry gets the running
+ * offset and adds the offset sum of its target state.
+ * Finally, the transition offsets of initial states>0 are increased by
+ * the sums of the preceding initial states.
+ *
+ * Prints a message and exits the program if the state table contains loops.
+ *
+ * @return the total number of code units, rounded up to an even number;
+ *         also stored in states->countToUCodeUnits
+ */
+static int32_t
+sumUpStates(UCMStates *states) {
+    int32_t entry, sum, state, cell, count;
+    UBool allStatesReady;
+
+    /*
+     * Sum up the offsets for all states.
+     * In each final state (where there are only final entries),
+     * the offsets add up directly.
+     * In all other state table rows, for each transition entry to another state,
+     * the offsets sum of that state needs to be added.
+     * This is achieved in at most countStates iterations.
+     */
+    allStatesReady=FALSE;
+    for(count=states->countStates; !allStatesReady && count>=0; --count) {
+        allStatesReady=TRUE;
+        for(state=states->countStates-1; state>=0; --state) {
+            if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) {
+                allStatesReady=FALSE;
+                sum=0;
+
+                /* at first, add up only the final delta offsets to keep them <512 */
+                for(cell=0; cell<256; ++cell) {
+                    entry=states->stateTable[state][cell];
+                    if(MBCS_ENTRY_IS_FINAL(entry)) {
+                        switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+                        case MBCS_STATE_VALID_16:
+                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
+                            sum+=1;
+                            break;
+                        case MBCS_STATE_VALID_16_PAIR:
+                            states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum);
+                            sum+=2;
+                            break;
+                        default:
+                            /* no addition */
+                            break;
+                        }
+                    }
+                }
+
+                /* now, add up the delta offsets for the transitional entries */
+                for(cell=0; cell<256; ++cell) {
+                    entry=states->stateTable[state][cell];
+                    if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+                        if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) {
+                            states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum);
+                            sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)];
+                        } else {
+                            /* that next state does not have a sum yet, we cannot finish the one for this state */
+                            sum=-1;
+                            break;
+                        }
+                    }
+                }
+
+                /* sum==-1 means: try this state again in the next iteration */
+                if(sum!=-1) {
+                    states->stateOffsetSum[state]=sum;
+                    states->stateFlags[state]|=MBCS_STATE_FLAG_READY;
+                }
+            }
+        }
+    }
+
+    if(!allStatesReady) {
+        fprintf(stderr, "ucm error: the state table contains loops\n");
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    /*
+     * For all "direct" (i.e., initial) states>0,
+     * the offsets need to be increased by the sum of
+     * the previous initial states.
+     */
+    sum=states->stateOffsetSum[0];
+    for(state=1; state<states->countStates; ++state) {
+        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+            /* sum2 is the total of the initial states before this one */
+            int32_t sum2=sum;
+            sum+=states->stateOffsetSum[state];
+            for(cell=0; cell<256; ++cell) {
+                entry=states->stateTable[state][cell];
+                if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+                    states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2);
+                }
+            }
+        }
+    }
+
+    /* round up to the next even number to have the following data 32-bit-aligned */
+    return states->countToUCodeUnits=(sum+1)&~1;
+}
+
+/*
+ * Complete and verify the state table information after the header
+ * of a .ucm file has been read.
+ *
+ * - If no state rows were specified, default state tables are added for
+ *   SBCS, EBCDIC_STATEFUL and DBCS, and the conversion type becomes MBCS.
+ * - The min/max character lengths and all next-state values are checked.
+ * - SI/SO (EBCDIC-stateful-like) state tables are recognized by state 1
+ *   being an initial state; they get the output type MBCS_OUTPUT_2_SISO.
+ * - The offsets are summed up with sumUpStates().
+ *
+ * Prints a message and exits the program if the information is
+ * missing or inconsistent.
+ */
+U_CAPI void U_EXPORT2
+ucm_processStates(UCMStates *states) {
+    int32_t entry, state, cell, count;
+
+    if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) {
+        fprintf(stderr, "ucm error: missing conversion type ()\n");
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    if(states->countStates==0) {
+        switch(states->conversionType) {
+        case UCNV_SBCS:
+            /* SBCS: use MBCS data structure with a default state table */
+            if(states->maxCharLength!=1) {
+                fprintf(stderr, "error: SBCS codepage with max B/char!=1\n");
+                exit(U_INVALID_TABLE_FORMAT);
+            }
+            states->conversionType=UCNV_MBCS;
+            ucm_addState(states, "0-ff");
+            break;
+        case UCNV_MBCS:
+            fprintf(stderr, "ucm error: missing state table information () for MBCS\n");
+            exit(U_INVALID_TABLE_FORMAT);
+            break;
+        case UCNV_EBCDIC_STATEFUL:
+            /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */
+            if(states->minCharLength!=1 || states->maxCharLength!=2) {
+                fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n");
+                exit(U_INVALID_TABLE_FORMAT);
+            }
+            states->conversionType=UCNV_MBCS;
+            ucm_addState(states, "0-ff, e:1.s, f:0.s");
+            ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4");
+            ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i");
+            ucm_addState(states, "0-ff:1.i, 40:1.");
+            ucm_addState(states, "0-ff:1.i");
+            break;
+        case UCNV_DBCS:
+            /* DBCS: use MBCS data structure with a default state table */
+            if(states->minCharLength!=2 || states->maxCharLength!=2) {
+                fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n");
+                exit(U_INVALID_TABLE_FORMAT);
+            }
+            states->conversionType = UCNV_MBCS;
+            ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3");
+            ucm_addState(states, "41-fe");
+            ucm_addState(states, "40");
+            ucm_addState(states, "");
+            break;
+        default:
+            fprintf(stderr, "ucm error: unknown charset structure\n");
+            exit(U_INVALID_TABLE_FORMAT);
+            break;
+        }
+    }
+
+    /*
+     * check that the min/max character lengths are reasonable;
+     * to do this right, all paths through the state table would have to be
+     * recursively walked while keeping track of the sequence lengths,
+     * but these simple checks cover most state tables in practice
+     */
+    if(states->maxCharLength<states->minCharLength) {
+        fprintf(stderr, "ucm error: max B/char < min B/char\n");
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    /* count non-direct states and compare with max B/char */
+    count=0;
+    for(state=0; state<states->countStates; ++state) {
+        if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
+            ++count;
+        }
+    }
+    if(states->maxCharLength>count+1) {
+        fprintf(stderr, "ucm error: max B/char too large\n");
+        exit(U_INVALID_TABLE_FORMAT);
+    }
+
+    if(states->minCharLength==1) {
+        int32_t action;
+
+        /*
+         * if there are single-byte characters,
+         * then the initial state must have direct result states
+         */
+        for(cell=0; cell<256; ++cell) {
+            entry=states->stateTable[0][cell];
+            if( MBCS_ENTRY_IS_FINAL(entry) &&
+                ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 ||
+                 action==MBCS_STATE_UNASSIGNED)
+            ) {
+                break;
+            }
+        }
+
+        /* this is only a warning: processing continues */
+        if(cell==256) {
+            fprintf(stderr, "ucm warning: min B/char too small\n");
+        }
+    }
+
+    /*
+     * make sure that all "next state" values are within limits
+     * and that all next states after final ones have the "direct"
+     * flag of initial states
+     */
+    for(state=states->countStates-1; state>=0; --state) {
+        for(cell=0; cell<256; ++cell) {
+            entry=states->stateTable[state][cell];
+            if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) {
+                fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n",
+                    state, cell, MBCS_ENTRY_STATE(entry));
+                exit(U_INVALID_TABLE_FORMAT);
+            }
+            if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) {
+                fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n",
+                    state, cell, MBCS_ENTRY_STATE(entry));
+                exit(U_INVALID_TABLE_FORMAT);
+            } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+                fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n",
+                    state, cell, MBCS_ENTRY_STATE(entry));
+                exit(U_INVALID_TABLE_FORMAT);
+            }
+        }
+    }
+
+    /* is this an SI/SO (like EBCDIC-stateful) state table? */
+    if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+        if(states->maxCharLength!=2) {
+            fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", states->maxCharLength);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        if(states->countStates<3) {
+            fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", states->countStates);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        /* are the SI/SO all in the right places? */
+        if( states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
+            states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) &&
+            states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) &&
+            states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)
+        ) {
+            states->outputType=MBCS_OUTPUT_2_SISO;
+        } else {
+            fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n");
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        /* states 0 and 1 are the expected initial states */
+        state=2;
+    } else {
+        /* only state 0 is expected to be an initial state */
+        state=1;
+    }
+
+    /* check that no unexpected state is a "direct" one */
+    while(state<states->countStates) {
+        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+            fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", state);
+            exit(U_INVALID_TABLE_FORMAT);
+        }
+        ++state;
+    }
+
+    sumUpStates(states);
+}
+
+/* find a fallback for this offset; return the index or -1 if not found */
+U_CAPI int32_t U_EXPORT2
+ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+ uint32_t offset) {
+ int32_t i;
+
+ if(countToUFallbacks==0) {
+ /* shortcut: most codepages do not have fallbacks from codepage to Unicode */
+ return -1;
+ }
+
+ /* do a linear search for the fallback mapping (the table is not yet sorted) */
+ for(i=0; ioutputType==MBCS_OUTPUT_2_SISO) {
+ /* use the DBCS lead state for SI/SO codepages */
+ leadState=1;
+ } else {
+ leadState=0;
+ }
+
+ /* find the main trail state: the most used target state */
+ uprv_memset(count, 0, sizeof(count));
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ ++count[MBCS_ENTRY_TRANSITION_STATE(entry)];
+ }
+ }
+ trailState=0;
+ for(i=1; icountStates; ++i) {
+ if(count[i]>count[trailState]) {
+ trailState=i;
+ }
+ }
+
+ /* count possible savings from lead bytes with all-unassigned results in all trail bytes */
+ uprv_memset(count, 0, sizeof(count));
+ savings=0;
+ /* for each lead byte */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) {
+ /* the offset is different for each lead byte */
+ offset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
+ /* for each trail byte for this lead byte */
+ for(j=0; j<256; ++j) {
+ entry=states->stateTable[trailState][j];
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) {
+ ++count[i];
+ } else {
+ j=999; /* do not count for this lead byte because there are assignments */
+ }
+ break;
+ case MBCS_STATE_VALID_16_PAIR:
+ entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ if((*pUnicodeCodeUnits)[entry]==0xfffe) {
+ count[i]+=2;
+ } else {
+ j=999; /* do not count for this lead byte because there are assignments */
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ if(j==256) {
+ /* all trail bytes for this lead byte are unassigned */
+ savings+=count[i];
+ } else {
+ count[i]=0;
+ }
+ }
+ }
+ /* subtract from the possible savings the cost of an additional state */
+ savings=savings*2-1024; /* count bytes, not 16-bit words */
+ if(savings<=0) {
+ return;
+ }
+ if(verbose) {
+ printf("compacting toUnicode data saves %ld bytes\n", (long)savings);
+ }
+ if(states->countStates>=MBCS_MAX_STATE_COUNT) {
+ fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n");
+ return;
+ }
+
+ /* make a copy of the state table */
+ oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024);
+ if(oldStateTable==NULL) {
+ fprintf(stderr, "cannot compact toUnicode: out of memory\n");
+ return;
+ }
+ uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024);
+
+ /* add the new state */
+ /*
+ * this function does not catch the degenerate case where all lead bytes
+ * have all-unassigned trail bytes and the lead state could be removed
+ */
+ newState=states->countStates++;
+ states->stateFlags[newState]=0;
+ /* copy the old trail state, turning all assigned states into unassigned ones */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[trailState][i];
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ case MBCS_STATE_VALID_16_PAIR:
+ states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe);
+ break;
+ default:
+ states->stateTable[newState][i]=entry;
+ break;
+ }
+ }
+
+ /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */
+ for(i=0; i<256; ++i) {
+ if(count[i]>0) {
+ states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState);
+ }
+ }
+
+ /* sum up the new state table */
+ for(i=0; icountStates; ++i) {
+ states->stateFlags[i]&=~MBCS_STATE_FLAG_READY;
+ }
+ sum=sumUpStates(states);
+
+ /* allocate a new, smaller code units array */
+ oldUnicodeCodeUnits=*pUnicodeCodeUnits;
+ if(sum==0) {
+ *pUnicodeCodeUnits=NULL;
+ if(oldUnicodeCodeUnits!=NULL) {
+ uprv_free(oldUnicodeCodeUnits);
+ }
+ uprv_free(oldStateTable);
+ return;
+ }
+ *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t));
+ if(*pUnicodeCodeUnits==NULL) {
+ fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n",
+ (long)sum);
+ /* revert to the old state table */
+ *pUnicodeCodeUnits=oldUnicodeCodeUnits;
+ --states->countStates;
+ uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024);
+ uprv_free(oldStateTable);
+ return;
+ }
+ for(i=0; icountStates; ++leadState) {
+ if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+ /* for each lead byte from there */
+ for(i=0; i<256; ++i) {
+ entry=states->stateTable[leadState][i];
+ if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+ trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
+ /* the new state does not have assigned states */
+ if(trailState!=newState) {
+ trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry);
+ oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]);
+ /* for each trail byte */
+ for(j=0; j<256; ++j) {
+ entry=states->stateTable[trailState][j];
+ /* copy assigned-character code units and adjust fallback offsets */
+ switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+ case MBCS_STATE_VALID_16:
+ offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ /* find the old offset according to the old state table */
+ oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
+ unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
+ if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) {
+ toUFallbacks[fallback].offset=0x80000000|offset;
+ }
+ break;
+ case MBCS_STATE_VALID_16_PAIR:
+ offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry);
+ /* find the old offset according to the old state table */
+ oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]);
+ (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++];
+ (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset];
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ /* remove temporary flags from fallback offsets that protected them from being modified twice */
+ for(i=0; i0 number of bytes that are used in unicodeCodeUnits[] that could be saved,
+ * if all sequences from this state are unassigned, returns the
+ * <0 there are assignments in unicodeCodeUnits[]
+ * 0 no use of unicodeCodeUnits[]
+ */
+/*
+ * Recursively walks the state table starting at one state and adds up how many
+ * bytes of unicodeCodeUnits[] are occupied by unassigned (0xfffe) results.
+ *
+ * state   the state whose 256 entries are examined
+ * offset  sum of the transition offsets collected on the way to this state
+ * b       the bytes consumed so far, 8 bits per byte (only used in the output)
+ *
+ * Returns
+ *   <0  some sequence through this state is assigned (or has a toUnicode fallback)
+ *   >0  number of bytes of unicodeCodeUnits[] used by all-unassigned sequences
+ *    0  nothing reachable from this state uses unicodeCodeUnits[]
+ *
+ * Prints one line for every transition whose whole subtree is unassigned.
+ */
+static int32_t
+findUnassigned(UCMStates *states,
+               uint16_t *unicodeCodeUnits,
+               _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+               int32_t state, int32_t offset, uint32_t b) {
+    int32_t ownBytes=0, nestedBytes=0;
+    int32_t byteValue;
+    UBool foundAssigned=FALSE;
+
+    for(byteValue=0; byteValue<256; ++byteValue) {
+        int32_t cell=states->stateTable[state][byteValue];
+
+        if(MBCS_ENTRY_IS_TRANSITION(cell)) {
+            /* always descend, even after an assignment was seen, so that every subtree is reported */
+            uint32_t prefix=(b<<8)|(uint32_t)byteValue;
+            int32_t nested=findUnassigned(states,
+                                          unicodeCodeUnits,
+                                          toUFallbacks, countToUFallbacks,
+                                          MBCS_ENTRY_TRANSITION_STATE(cell),
+                                          offset+MBCS_ENTRY_TRANSITION_OFFSET(cell),
+                                          prefix);
+
+            if(nested<0) {
+                foundAssigned=TRUE;
+            } else if(nested>0) {
+                printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n",
+                       (unsigned long)prefix, (long)state, (long)nested);
+                nestedBytes+=nested;
+            }
+            continue;
+        }
+
+        if(foundAssigned) {
+            /* the result is already known to be negative; final entries need no further checks */
+            continue;
+        }
+
+        switch(MBCS_ENTRY_FINAL_ACTION(cell)) {
+        case MBCS_STATE_VALID_16: {
+            /* one code unit: unassigned only if it is 0xfffe and no fallback is registered for it */
+            int32_t unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(cell);
+
+            if(unicodeCodeUnits[unitIndex]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, unitIndex)<0) {
+                ownBytes+=2;
+            } else {
+                foundAssigned=TRUE;
+            }
+            break;
+        }
+        case MBCS_STATE_VALID_16_PAIR: {
+            /* two code units: the first one being 0xfffe marks the pair as unassigned */
+            int32_t unitIndex=offset+MBCS_ENTRY_FINAL_VALUE_16(cell);
+
+            if(unicodeCodeUnits[unitIndex]==0xfffe) {
+                ownBytes+=4;
+            } else {
+                foundAssigned=TRUE;
+            }
+            break;
+        }
+        default:
+            /* other final actions do not use unicodeCodeUnits[] */
+            break;
+        }
+    }
+
+    return foundAssigned ? -1 : ownBytes+nestedBytes;
+}
+
+/*
+ * Helper function for finding compaction opportunities.
+ *
+ * Calls findUnassigned() for every state that is flagged MBCS_STATE_FLAG_DIRECT
+ * (an initial state) and prints the number of bytes reported whenever that
+ * number is positive, i.e. when all sequences from that state are unassigned.
+ */
+static void
+compactToUnicodeHelper(UCMStates *states,
+                       uint16_t *unicodeCodeUnits,
+                       _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) {
+    int32_t state, savings;
+
+    /* for each initial state */
+    for(state=0; state<states->countStates; ++state) {
+        if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) {
+            /* start at the root: no offset and no prefix bytes collected yet */
+            savings=findUnassigned(states,
+                                   unicodeCodeUnits,
+                                   toUFallbacks, countToUFallbacks,
+                                   state, 0, 0);
+            if(savings>0) {
+                printf(" all-unassigned sequences from initial state %ld use %ld bytes\n",
+                       (long)state, (long)savings);
+            }
+        }
+    }
+}
+
+/*
+ * Comparison function for sorting _MBCSToUFallback items by their offset.
+ *
+ * context is not used.
+ * Returns -1, 0 or 1 if the offset of fb1 is less than, equal to or greater
+ * than the offset of fb2.
+ *
+ * The offsets are compared explicitly instead of being subtracted:
+ * a difference narrowed to int32_t gets the wrong sign when the two values
+ * are far apart (for example, if one offset still has its top bit set).
+ */
+static int32_t
+compareFallbacks(const void *context, const void *fb1, const void *fb2) {
+    const _MBCSToUFallback *left=(const _MBCSToUFallback *)fb1;
+    const _MBCSToUFallback *right=(const _MBCSToUFallback *)fb2;
+
+    (void)context;
+
+    if(left->offset<right->offset) {
+        return -1;
+    } else if(left->offset>right->offset) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+/*
+ * Optimizes a complete state table and its toUnicode data:
+ * 1. rewrites final entries that map directly to U+fffe into "unassigned" entries,
+ * 2. tries to compact the toUnicode code units
+ *    (maxCharLength==2: compactToUnicode2(), which may replace *pUnicodeCodeUnits;
+ *     maxCharLength>2: only reports opportunities, and only if verbose),
+ * 3. sorts toUFallbacks[] by offset.
+ *
+ * states             the state table, modified in place
+ * pUnicodeCodeUnits  pointer to the code units array pointer
+ * toUFallbacks       array of countToUFallbacks toUnicode fallbacks, sorted in place
+ * verbose            enables diagnostic output
+ */
+U_CAPI void U_EXPORT2
+ucm_optimizeStates(UCMStates *states,
+                   uint16_t **pUnicodeCodeUnits,
+                   _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks,
+                   UBool verbose) {
+    UErrorCode errorCode;
+    int32_t state, cell, entry;
+
+    /* test each state table entry */
+    for(state=0; state<states->countStates; ++state) {
+        for(cell=0; cell<256; ++cell) {
+            entry=states->stateTable[state][cell];
+            /*
+             * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code
+             * and the code point is "unassigned" (0xfffe), then change it to
+             * the "unassigned" action code with bits 26..23 set to zero and U+fffe.
+             */
+            if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) {
+                states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED);
+            }
+        }
+    }
+
+    /* try to compact the toUnicode tables */
+    if(states->maxCharLength==2) {
+        compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose);
+    } else if(states->maxCharLength>2) {
+        if(verbose) {
+            compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks);
+        }
+    }
+
+    /* sort toUFallbacks */
+    /*
+     * It should be safe to sort them before compactToUnicode2() is called,
+     * because it should not change the relative order of the offset values
+     * that it adjusts, but they need to be sorted at some point, and
+     * it is safest here.
+     */
+    if(countToUFallbacks>0) {
+        errorCode=U_ZERO_ERROR;
+        uprv_sortArray(toUFallbacks, countToUFallbacks,
+                       sizeof(_MBCSToUFallback),
+                       compareFallbacks, NULL, FALSE, &errorCode);
+        if(U_FAILURE(errorCode)) {
+            /* not expected to happen, but do not hide it if it does */
+            fprintf(stderr, "ucm error: unable to sort the toUnicode fallbacks (error code %d)\n",
+                    (int)errorCode);
+        }
+    }
+}
+
+/* use a complete state table ----------------------------------------------- */
+
+/*
+ * Counts the characters in a byte sequence by walking the state table.
+ *
+ * states  the state table; must contain at least one state
+ * bytes   the byte sequence to examine
+ * length  number of bytes in bytes[]
+ *
+ * Returns the number of complete characters in the sequence.
+ * Returns -1, after printing a message to stderr, if
+ * - there is no state information,
+ * - the sequence reaches an illegal, state-change-only or reserved entry,
+ * - the sequence ends in the middle of a character, or
+ * - an SI/SO result with several characters is not made of double-byte
+ *   sequences only.
+ */
+U_CAPI int32_t U_EXPORT2
+ucm_countChars(UCMStates *states,
+               const uint8_t *bytes, int32_t length) {
+    uint32_t offset;
+    int32_t i, entry, count;
+    uint8_t state;
+
+    offset=0;
+    i=count=0;
+    state=0;
+
+    if(states->countStates==0) {
+        fprintf(stderr, "ucm error: there is no state information!\n");
+        return -1;
+    }
+
+    /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */
+    if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) {
+        state=1;
+    }
+
+    /*
+     * Walk down the state table like in conversion,
+     * much like getNextUChar().
+     * We assume that c<=0x10ffff.
+     */
+    for(i=0; i<length; ++i) {
+        entry=states->stateTable[state][bytes[i]];
+        if(MBCS_ENTRY_IS_TRANSITION(entry)) {
+            state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry);
+            offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry);
+        } else {
+            switch(MBCS_ENTRY_FINAL_ACTION(entry)) {
+            case MBCS_STATE_ILLEGAL:
+                fprintf(stderr, "ucm error: byte sequence ends in illegal state\n");
+                return -1;
+            case MBCS_STATE_CHANGE_ONLY:
+                fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n");
+                return -1;
+            case MBCS_STATE_UNASSIGNED:
+            case MBCS_STATE_FALLBACK_DIRECT_16:
+            case MBCS_STATE_VALID_DIRECT_16:
+            case MBCS_STATE_FALLBACK_DIRECT_20:
+            case MBCS_STATE_VALID_DIRECT_20:
+            case MBCS_STATE_VALID_16:
+            case MBCS_STATE_VALID_16_PAIR:
+                /* count a complete character and prepare for a new one */
+                ++count;
+                state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry);
+                offset=0;
+                break;
+            default:
+                /* reserved, must never occur */
+                /* %lx needs an unsigned long; go through uint32_t so that the value is not sign-extended */
+                fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n",
+                        (unsigned long)(uint32_t)entry);
+                return -1;
+            }
+        }
+    }
+
+    /*
+     * A non-zero offset means that transitions were taken after the last final entry.
+     * NOTE(review): a truncated sequence whose transition offsets add up to 0
+     * is not caught by this test - confirm whether such tables can occur.
+     */
+    if(offset!=0) {
+        fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n",
+                (unsigned int)state);
+        return -1;
+    }
+
+    /*
+     * for SI/SO (like EBCDIC-stateful), multiple-character results
+     * must consist of only double-byte sequences
+     */
+    if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) {
+        fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n",
+                (int)count);
+        return -1;
+    }
+
+    return count;
+}