ICU-9602 conversion: add good one-way mapping type |4
X-SVN-Rev: 33173
This commit is contained in:
parent
da324defa4
commit
2e71f86e17
@ -1,7 +1,7 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2011, International Business Machines
|
||||
* Copyright (C) 2003-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
@ -446,6 +446,15 @@ ucnv_extContinueMatchToU(UConverter *cnv,
|
||||
|
||||
/* from Unicode ------------------------------------------------------------- */
|
||||
|
||||
// Use roundtrips, "good one-way" mappings, and some normal fallbacks.
|
||||
static inline UBool
|
||||
extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
|
||||
return
|
||||
((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0 ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0;
|
||||
}
|
||||
|
||||
/*
|
||||
* @return index of the UChar, if found; else <0
|
||||
*/
|
||||
@ -580,11 +589,7 @@ ucnv_extMatchFromU(const int32_t *cx,
|
||||
/* read first pair of the section */
|
||||
length=*fromUSectionUChars++;
|
||||
value=*fromUSectionValues++;
|
||||
if( value!=0 &&
|
||||
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
|
||||
) {
|
||||
if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
|
||||
/* remember longest match so far */
|
||||
matchValue=value;
|
||||
matchLength=2+i+j;
|
||||
@ -621,10 +626,7 @@ ucnv_extMatchFromU(const int32_t *cx,
|
||||
/* partial match, continue */
|
||||
idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
|
||||
} else {
|
||||
if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
|
||||
) {
|
||||
if(extFromUUseMapping(useFallback, value, firstCP)) {
|
||||
/* full match, stop with result */
|
||||
matchValue=value;
|
||||
matchLength=2+i+j;
|
||||
@ -641,10 +643,7 @@ ucnv_extMatchFromU(const int32_t *cx,
|
||||
return 0;
|
||||
}
|
||||
} else /* result from firstCP trie lookup */ {
|
||||
if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
|
||||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
|
||||
) {
|
||||
if(extFromUUseMapping(useFallback, value, firstCP)) {
|
||||
/* full match, stop with result */
|
||||
matchValue=value;
|
||||
matchLength=2;
|
||||
@ -944,13 +943,38 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Decide whether a from-Unicode extension value contributes to the
 * Unicode set requested via ucnv_getUnicodeSet().
 *
 * UCNV_ROUNDTRIP_SET:
 *   Only values with the roundtrip flag set and no reserved bits qualify.
 *   Fallbacks are not added, even those that ucnv_fromUnicode() would use
 *   (fallbacks from PUA); by analogy, "good one-way" mappings are not
 *   added either. See the API docs for ucnv_getUnicodeSet().
 * Otherwise (UCNV_ROUNDTRIP_AND_FALLBACK_SET):
 *   Any value without reserved bits qualifies.
 *
 * In both cases the output length must be at least minLength, which
 * excludes <subchar1> entries and other (future?) pseudo-entries with an
 * output length of 0.
 *
 * @param which     which kind of set is being built
 * @param minLength minimum output length that a value must have
 * @param value     the from-Unicode value word to test
 * @return TRUE if the mapping should be added to the set, else FALSE
 */
static UBool
extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
    uint32_t testedBits;    /* bits of value that are inspected */
    uint32_t requiredBits;  /* what those bits must equal */

    if(which==UCNV_ROUNDTRIP_SET) {
        testedBits=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK;
        requiredBits=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
        testedBits=UCNV_EXT_FROM_U_RESERVED_MASK;
        requiredBits=0;
    }

    if((value&testedBits)!=requiredBits) {
        return FALSE;
    }
    return UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength;
}
|
||||
|
||||
static void
|
||||
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
const int32_t *cx,
|
||||
const USetAdder *sa,
|
||||
UBool useFallback,
|
||||
UConverterUnicodeSet which,
|
||||
int32_t minLength,
|
||||
UChar32 c,
|
||||
UChar32 firstCP,
|
||||
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
|
||||
int32_t sectionIndex,
|
||||
UErrorCode *pErrorCode) {
|
||||
@ -967,13 +991,10 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
count=*fromUSectionUChars++;
|
||||
value=*fromUSectionValues++;
|
||||
|
||||
if( value!=0 &&
|
||||
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
if(c>=0) {
|
||||
if(extSetUseMapping(which, minLength, value)) {
|
||||
if(length==U16_LENGTH(firstCP)) {
|
||||
/* add the initial code point */
|
||||
sa->add(sa->set, c);
|
||||
sa->add(sa->set, firstCP);
|
||||
} else {
|
||||
/* add the string so far */
|
||||
sa->addString(sa->set, s, length);
|
||||
@ -989,16 +1010,11 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
|
||||
/* no mapping, do nothing */
|
||||
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
ucnv_extGetUnicodeSetString(
|
||||
sharedData, cx, sa, useFallback, minLength,
|
||||
U_SENTINEL, s, length+1,
|
||||
sharedData, cx, sa, which, minLength,
|
||||
firstCP, s, length+1,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if((useFallback ?
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
||||
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
} else if(extSetUseMapping(which, minLength, value)) {
|
||||
sa->addString(sa->set, s, length+1);
|
||||
}
|
||||
}
|
||||
@ -1016,7 +1032,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
|
||||
uint32_t value;
|
||||
int32_t st1, stage1Length, st2, st3, minLength;
|
||||
UBool useFallback;
|
||||
|
||||
UChar s[UCNV_EXT_MAX_UCHARS];
|
||||
UChar32 c;
|
||||
@ -1033,8 +1048,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
|
||||
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
|
||||
|
||||
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
|
||||
|
||||
/* enumerate the from-Unicode trie table */
|
||||
c=0; /* keep track of the current code point while enumerating */
|
||||
|
||||
@ -1062,30 +1075,20 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
/* read the stage 3 block */
|
||||
ps3=stage3+st3;
|
||||
|
||||
/*
|
||||
* Add code points for which the roundtrip flag is set.
|
||||
* Do not add <subchar1> entries or other (future?) pseudo-entries
|
||||
* with an output length of 0, or entries with reserved bits set.
|
||||
* Recurse for partial results.
|
||||
*/
|
||||
do {
|
||||
value=stage3b[*ps3++];
|
||||
if(value==0) {
|
||||
/* no mapping, do nothing */
|
||||
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
|
||||
// Recurse for partial results.
|
||||
length=0;
|
||||
U16_APPEND_UNSAFE(s, length, c);
|
||||
ucnv_extGetUnicodeSetString(
|
||||
sharedData, cx, sa, useFallback, minLength,
|
||||
sharedData, cx, sa, which, minLength,
|
||||
c, s, length,
|
||||
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
|
||||
pErrorCode);
|
||||
} else if((useFallback ?
|
||||
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
|
||||
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
|
||||
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
|
||||
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
|
||||
) {
|
||||
} else if(extSetUseMapping(which, minLength, value)) {
|
||||
switch(filter) {
|
||||
case UCNV_SET_FILTER_2022_CN:
|
||||
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2007, International Business Machines
|
||||
* Copyright (C) 2003-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
@ -29,10 +29,12 @@
|
||||
/*
|
||||
* See icuhtml/design/conversion/conversion_extensions.html
|
||||
*
|
||||
* Conversion extensions serve two purposes:
|
||||
* Conversion extensions serve three purposes:
|
||||
* 1. They support m:n mappings.
|
||||
* 2. They support extension-only conversion files that are used together
|
||||
* with the regular conversion data in base files.
|
||||
* 3. They support mappings with more complicated meta data,
|
||||
* for example "good one-way" mappings (|4).
|
||||
*
|
||||
* A base file may contain an extension table (explicitly requested or
|
||||
* implicitly generated for m:n mappings), but its extension table is not
|
||||
@ -229,11 +231,13 @@
|
||||
* return no mapping, but request for <subchar1>;
|
||||
* }
|
||||
* if(bit 31 set) {
|
||||
* roundtrip;
|
||||
* roundtrip (|0);
|
||||
* } else if(bit 30 set) {
|
||||
* "good one-way" mapping (|4);
|
||||
* } else {
|
||||
* fallback;
|
||||
* normal fallback (|1);
|
||||
* }
|
||||
* // bits 30..29 reserved, 0
|
||||
* // bit 29 reserved, 0
|
||||
* length=(value>>24)&0x1f; (bits 28..24)
|
||||
* if(length==1..3) {
|
||||
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
|
||||
@ -444,7 +448,9 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
|
||||
|
||||
#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
|
||||
#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
|
||||
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
|
||||
#define UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG 0x40000000
|
||||
#define UCNV_EXT_FROM_U_STATUS_MASK 0xc0000000
|
||||
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x20000000
|
||||
#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
|
||||
|
||||
/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */
|
||||
|
28
icu4c/source/test/testdata/conversion.txt
vendored
28
icu4c/source/test/testdata/conversion.txt
vendored
@ -1004,6 +1004,20 @@ conversion:table(nofallback) {
|
||||
fromUnicode {
|
||||
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
|
||||
Cases {
|
||||
// Test ticket 9602: Add "good one-way" mapping type (|4).
|
||||
// Such mappings are used regardless of the fallback flag.
|
||||
{
|
||||
"*test3", "##\uFE0E#\uFE0F",
|
||||
:bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 },
|
||||
:int{1}, :int{0}, // no fallbacks
|
||||
"", "?", ""
|
||||
}
|
||||
{
|
||||
"*test3", "##\uFE0E#\uFE0F",
|
||||
:bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 },
|
||||
:int{1}, :int{1}, // with fallbacks
|
||||
"", "?", ""
|
||||
}
|
||||
// Test ticket 6789: implement Java-compatible Unicode, UnicodeBig and UnicodeLittle converters
|
||||
// For details about these encodings see convrtrs.txt.
|
||||
// Standard UTF-16BE
|
||||
@ -1833,6 +1847,20 @@ conversion:table(nofallback) {
|
||||
// which - numeric UConverterUnicodeSet value
|
||||
Headers { "charset", "map", "mapnot", "which" }
|
||||
Cases {
|
||||
// Test ticket 9602: Add "good one-way" mapping type (|4).
|
||||
// Excluded from roundtrip set, included in the set with fallbacks.
|
||||
{
|
||||
"*test3",
|
||||
"[{#\uFE0F}]",
|
||||
"[#{#\uFE0E}]",
|
||||
:int{0}
|
||||
}
|
||||
{
|
||||
"*test3",
|
||||
"[#{#\uFE0E}{#\uFE0F}]",
|
||||
"[]",
|
||||
:int{1}
|
||||
}
|
||||
// Unicode charsets that do not map surrogate code points
|
||||
{
|
||||
"UTF-8",
|
||||
|
9
icu4c/source/test/testdata/test3.ucm
vendored
9
icu4c/source/test/testdata/test3.ucm
vendored
@ -1,5 +1,5 @@
|
||||
# *******************************************************************************
|
||||
# * Copyright (C) 2001-2003, International Business Machines
|
||||
# * Copyright (C) 2001-2013, International Business Machines
|
||||
# * Corporation and others. All Rights Reserved.
|
||||
# *******************************************************************************
|
||||
#
|
||||
@ -15,7 +15,7 @@
|
||||
<subchar> \xff
|
||||
<icu:state> 0, 1:1, 5-9, ff
|
||||
<icu:state> 2:2
|
||||
<icu:state> a-f.p
|
||||
<icu:state> 4, a-f.p
|
||||
|
||||
CHARMAP
|
||||
|
||||
@ -57,4 +57,9 @@ CHARMAP
|
||||
<U000e> \x01\x02\x0e |3
|
||||
#unassigned \x01\x02\x0f
|
||||
|
||||
# "good one-way" mappings
|
||||
<U0023> \x01\x02\x04 |4
|
||||
<U0023>+<UFE0E> \x01\x02\x04 |4
|
||||
<U0023>+<UFE0F> \x01\x02\x04 |0
|
||||
|
||||
END CHARMAP
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2012, International Business Machines
|
||||
* Copyright (C) 2003-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -606,7 +606,7 @@ prepareFromUMappings(UCMTable *table) {
|
||||
flag&=MBCS_FROM_U_EXT_MASK;
|
||||
m->f=flag;
|
||||
}
|
||||
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
|
||||
if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) {
|
||||
map[j++]=i;
|
||||
|
||||
if(m->uLen>1) {
|
||||
@ -672,6 +672,8 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
|
||||
value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
|
||||
if(m->f==0) {
|
||||
value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
|
||||
} else if(m->f==4) {
|
||||
value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG;
|
||||
}
|
||||
|
||||
/* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2012, International Business Machines
|
||||
* Copyright (C) 2000-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -1049,6 +1049,11 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
|
||||
staticData->hasToUnicodeFallback=TRUE;
|
||||
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
|
||||
break;
|
||||
case 4:
|
||||
/* move "good one-way" mappings to the extension table */
|
||||
m->f|=MBCS_FROM_U_EXT_FLAG;
|
||||
m->moveFlag=UCM_MOVE_TO_EXT;
|
||||
break;
|
||||
default:
|
||||
/* will not occur because the parser checked it already */
|
||||
fprintf(stderr, "error: illegal fallback indicator %d\n", f);
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2012, International Business Machines
|
||||
* Copyright (C) 2003-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -327,7 +327,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
||||
return result;
|
||||
}
|
||||
|
||||
if(0<=mb->f && mb->f<=2) {
|
||||
if((0<=mb->f && mb->f<=2) || mb->f==4) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -339,7 +339,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
|
||||
return result;
|
||||
}
|
||||
|
||||
if(0<=me->f && me->f<=2) {
|
||||
if((0<=me->f && me->f<=2) || me->f==4) {
|
||||
break;
|
||||
}
|
||||
|
||||
@ -857,8 +857,8 @@ ucm_parseMappingLine(UCMapping *m,
|
||||
break;
|
||||
} else if(*s=='|') {
|
||||
f=(int8_t)(s[1]-'0');
|
||||
if((uint8_t)f>3) {
|
||||
fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
|
||||
if((uint8_t)f>4) {
|
||||
fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
|
||||
return FALSE;
|
||||
}
|
||||
break;
|
||||
@ -1051,6 +1051,7 @@ ucm_mappingType(UCMStates *baseStates,
|
||||
/*
|
||||
* Suitable for an ICU conversion base table means:
|
||||
* - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
|
||||
* - precision flag 0..3
|
||||
* - SBCS: any 1:1 mapping
|
||||
* (the table stores additional bits to distinguish mapping types)
|
||||
* - MBCS: not a |2 SUB mapping for <subchar1>
|
||||
@ -1070,7 +1071,7 @@ ucm_mappingType(UCMStates *baseStates,
|
||||
* makeconv uses a hack for moving mappings only for the fromUnicode table
|
||||
* that only works with non-negative values of f.
|
||||
*/
|
||||
if( m->uLen==1 && count==1 &&
|
||||
if( m->uLen==1 && count==1 && m->f<=3 &&
|
||||
(baseStates->maxCharLength==1 ||
|
||||
!((m->f==2 && m->bLen==1) ||
|
||||
(m->f==1 && bytes[0]==0) ||
|
||||
@ -1146,7 +1147,7 @@ ucm_readTable(UCMFile *ucm, FileStream* convFile,
|
||||
char line[500];
|
||||
char *end;
|
||||
UBool isOK;
|
||||
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2003-2010, International Business Machines
|
||||
* Copyright (C) 2003-2013, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: ucm.h
|
||||
@ -45,7 +45,8 @@ enum {
|
||||
* bIsMultipleChars indicates that the bytes contain more than one sequence
|
||||
* according to the state table
|
||||
* f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
|
||||
* same values as in the source file after |
|
||||
* or "good one-way" mapping (4).
|
||||
* Same values as in the source file after |
|
||||
*/
|
||||
typedef struct UCMapping {
|
||||
UChar32 u;
|
||||
|
Loading…
Reference in New Issue
Block a user