ICU-9602 conversion: add good one-way mapping type |4

X-SVN-Rev: 33173
This commit is contained in:
Markus Scherer 2013-02-11 20:48:11 +00:00
parent da324defa4
commit 2e71f86e17
8 changed files with 117 additions and 66 deletions

View File

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2003-2011, International Business Machines
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -446,6 +446,15 @@ ucnv_extContinueMatchToU(UConverter *cnv,
/* from Unicode ------------------------------------------------------------- */
// Decide whether a from-Unicode extension table value may be used for output.
//
// A value is usable when none of its reserved bits are set and either
// - one of its status bits is set (UCNV_EXT_FROM_U_STATUS_MASK covers the
//   roundtrip flag and the "good one-way" flag), or
// - FROM_U_USE_FALLBACK() permits a normal fallback for firstCP given
//   the caller's useFallback setting.
//
// The caller is responsible for filtering out value==0 ("no mapping")
// where that matters.
static inline UBool
extFromUUseMapping(UBool useFallback, uint32_t value, UChar32 firstCP) {
    const UBool noReservedBits=(UBool)((value&UCNV_EXT_FROM_U_RESERVED_MASK)==0);
    const UBool hasStatusFlag=(UBool)((value&UCNV_EXT_FROM_U_STATUS_MASK)!=0);
    return (UBool)(
        noReservedBits &&
        (hasStatusFlag || FROM_U_USE_FALLBACK(useFallback, firstCP)));
}
/*
* @return index of the UChar, if found; else <0
*/
@ -580,11 +589,7 @@ ucnv_extMatchFromU(const int32_t *cx,
/* read first pair of the section */
length=*fromUSectionUChars++;
value=*fromUSectionValues++;
if( value!=0 &&
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
) {
if(value!=0 && extFromUUseMapping(useFallback, value, firstCP)) {
/* remember longest match so far */
matchValue=value;
matchLength=2+i+j;
@ -621,10 +626,7 @@ ucnv_extMatchFromU(const int32_t *cx,
/* partial match, continue */
idx=(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value);
} else {
if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
) {
if(extFromUUseMapping(useFallback, value, firstCP)) {
/* full match, stop with result */
matchValue=value;
matchLength=2+i+j;
@ -641,10 +643,7 @@ ucnv_extMatchFromU(const int32_t *cx,
return 0;
}
} else /* result from firstCP trie lookup */ {
if( (UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) ||
FROM_U_USE_FALLBACK(useFallback, firstCP)) &&
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0
) {
if(extFromUUseMapping(useFallback, value, firstCP)) {
/* full match, stop with result */
matchValue=value;
matchLength=2;
@ -944,13 +943,38 @@ ucnv_extContinueMatchFromU(UConverter *cnv,
}
}
// Decide whether a from-Unicode extension table value contributes to the
// Unicode set selected by "which".
//
// UCNV_ROUNDTRIP_SET:
//   Only values with the roundtrip flag set and no reserved bits set qualify.
//   Fallbacks are not added, even if ucnv_fromUnicode() would use them
//   (fallbacks from PUA); see the API docs for ucnv_getUnicodeSet().
//   By analogy, "good one-way" mappings are not added either
//   (they do not carry the roundtrip flag).
// Otherwise (UCNV_ROUNDTRIP_AND_FALLBACK_SET):
//   Any value qualifies as long as no reserved bits are set.
//
// In both cases the output length stored in the value must be at least
// minLength, which excludes <subchar1> entries and other (future?)
// pseudo-entries with an output length of 0.
static UBool
extSetUseMapping(UConverterUnicodeSet which, int32_t minLength, uint32_t value) {
    uint32_t checkedBits;   // bits of value that are inspected
    uint32_t requiredBits;  // what those bits must be equal to

    if(which==UCNV_ROUNDTRIP_SET) {
        checkedBits=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK;
        requiredBits=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
    } else /* UCNV_ROUNDTRIP_AND_FALLBACK_SET */ {
        checkedBits=UCNV_EXT_FROM_U_RESERVED_MASK;
        requiredBits=0;
    }

    return (UBool)(
        (value&checkedBits)==requiredBits &&
        UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength);
}
static void
ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
const int32_t *cx,
const USetAdder *sa,
UBool useFallback,
UConverterUnicodeSet which,
int32_t minLength,
UChar32 c,
UChar32 firstCP,
UChar s[UCNV_EXT_MAX_UCHARS], int32_t length,
int32_t sectionIndex,
UErrorCode *pErrorCode) {
@ -967,13 +991,10 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
count=*fromUSectionUChars++;
value=*fromUSectionValues++;
if( value!=0 &&
(UCNV_EXT_FROM_U_IS_ROUNDTRIP(value) || useFallback) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
if(c>=0) {
if(extSetUseMapping(which, minLength, value)) {
if(length==U16_LENGTH(firstCP)) {
/* add the initial code point */
sa->add(sa->set, c);
sa->add(sa->set, firstCP);
} else {
/* add the string so far */
sa->addString(sa->set, s, length);
@ -989,16 +1010,11 @@ ucnv_extGetUnicodeSetString(const UConverterSharedData *sharedData,
/* no mapping, do nothing */
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
ucnv_extGetUnicodeSetString(
sharedData, cx, sa, useFallback, minLength,
U_SENTINEL, s, length+1,
sharedData, cx, sa, which, minLength,
firstCP, s, length+1,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
} else if((useFallback ?
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
} else if(extSetUseMapping(which, minLength, value)) {
sa->addString(sa->set, s, length+1);
}
}
@ -1016,7 +1032,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
uint32_t value;
int32_t st1, stage1Length, st2, st3, minLength;
UBool useFallback;
UChar s[UCNV_EXT_MAX_UCHARS];
UChar32 c;
@ -1033,8 +1048,6 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
stage1Length=cx[UCNV_EXT_FROM_U_STAGE_1_LENGTH];
useFallback=(UBool)(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET);
/* enumerate the from-Unicode trie table */
c=0; /* keep track of the current code point while enumerating */
@ -1062,30 +1075,20 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
/* read the stage 3 block */
ps3=stage3+st3;
/*
* Add code points for which the roundtrip flag is set.
* Do not add <subchar1> entries or other (future?) pseudo-entries
* with an output length of 0, or entries with reserved bits set.
* Recurse for partial results.
*/
do {
value=stage3b[*ps3++];
if(value==0) {
/* no mapping, do nothing */
} else if(UCNV_EXT_FROM_U_IS_PARTIAL(value)) {
// Recurse for partial results.
length=0;
U16_APPEND_UNSAFE(s, length, c);
ucnv_extGetUnicodeSetString(
sharedData, cx, sa, useFallback, minLength,
sharedData, cx, sa, which, minLength,
c, s, length,
(int32_t)UCNV_EXT_FROM_U_GET_PARTIAL_INDEX(value),
pErrorCode);
} else if((useFallback ?
(value&UCNV_EXT_FROM_U_RESERVED_MASK)==0 :
((value&(UCNV_EXT_FROM_U_ROUNDTRIP_FLAG|UCNV_EXT_FROM_U_RESERVED_MASK))==
UCNV_EXT_FROM_U_ROUNDTRIP_FLAG)) &&
UCNV_EXT_FROM_U_GET_LENGTH(value)>=minLength
) {
} else if(extSetUseMapping(which, minLength, value)) {
switch(filter) {
case UCNV_SET_FILTER_2022_CN:
if(!(UCNV_EXT_FROM_U_GET_LENGTH(value)==3 && UCNV_EXT_FROM_U_GET_DATA(value)<=0x82ffff)) {

View File

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2003-2007, International Business Machines
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -29,10 +29,12 @@
/*
* See icuhtml/design/conversion/conversion_extensions.html
*
* Conversion extensions serve two purposes:
* Conversion extensions serve three purposes:
* 1. They support m:n mappings.
* 2. They support extension-only conversion files that are used together
* with the regular conversion data in base files.
* 3. They support mappings with more complicated metadata,
* for example "good one-way" mappings (|4).
*
* A base file may contain an extension table (explicitly requested or
* implicitly generated for m:n mappings), but its extension table is not
@ -229,11 +231,13 @@
* return no mapping, but request for <subchar1>;
* }
* if(bit 31 set) {
* roundtrip;
* roundtrip (|0);
* } else if(bit 30 set) {
* "good one-way" mapping (|4);
* } else {
* fallback;
* normal fallback (|1);
* }
* // bits 30..29 reserved, 0
* // bit 29 reserved, 0
* length=(value>>24)&0x1f; (bits 28..24)
* if(length==1..3) {
* bits 23..0 contain 1..3 bytes, padded with 00s on the left;
@ -444,7 +448,9 @@ ucnv_extGetUnicodeSet(const UConverterSharedData *sharedData,
#define UCNV_EXT_FROM_U_LENGTH_SHIFT 24
#define UCNV_EXT_FROM_U_ROUNDTRIP_FLAG ((uint32_t)1<<31)
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x60000000
#define UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG 0x40000000
#define UCNV_EXT_FROM_U_STATUS_MASK 0xc0000000
#define UCNV_EXT_FROM_U_RESERVED_MASK 0x20000000
#define UCNV_EXT_FROM_U_DATA_MASK 0xffffff
/* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */

View File

@ -1004,6 +1004,20 @@ conversion:table(nofallback) {
fromUnicode {
Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" }
Cases {
// Test ticket 9602: Add "good one-way" mapping type (|4).
// Such mappings are used regardless of the fallback flag.
{
"*test3", "##\uFE0E#\uFE0F",
:bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 },
:int{1}, :int{0}, // no fallbacks
"", "?", ""
}
{
"*test3", "##\uFE0E#\uFE0F",
:bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 },
:int{1}, :int{1}, // with fallbacks
"", "?", ""
}
// Test ticket 6789: implement Java-compatible Unicode, UnicodeBig and UnicodeLittle converters
// For details about these encodings see convrtrs.txt.
// Standard UTF-16BE
@ -1833,6 +1847,20 @@ conversion:table(nofallback) {
// which - numeric UConverterUnicodeSet value
Headers { "charset", "map", "mapnot", "which" }
Cases {
// Test ticket 9602: Add "good one-way" mapping type (|4).
// Excluded from roundtrip set, included in the set with fallbacks.
{
"*test3",
"[{#\uFE0F}]",
"[#{#\uFE0E}]",
:int{0}
}
{
"*test3",
"[#{#\uFE0E}{#\uFE0F}]",
"[]",
:int{1}
}
// Unicode charsets that do not map surrogate code points
{
"UTF-8",

View File

@ -1,5 +1,5 @@
# *******************************************************************************
# * Copyright (C) 2001-2003, International Business Machines
# * Copyright (C) 2001-2013, International Business Machines
# * Corporation and others. All Rights Reserved.
# *******************************************************************************
#
@ -15,7 +15,7 @@
<subchar> \xff
<icu:state> 0, 1:1, 5-9, ff
<icu:state> 2:2
<icu:state> a-f.p
<icu:state> 4, a-f.p
CHARMAP
@ -57,4 +57,9 @@ CHARMAP
<U000e> \x01\x02\x0e |3
#unassigned \x01\x02\x0f
# "good one-way" mappings
<U0023> \x01\x02\x04 |4
<U0023>+<UFE0E> \x01\x02\x04 |4
<U0023>+<UFE0F> \x01\x02\x04 |0
END CHARMAP

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2012, International Business Machines
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -606,7 +606,7 @@ prepareFromUMappings(UCMTable *table) {
flag&=MBCS_FROM_U_EXT_MASK;
m->f=flag;
}
if(flag==0 || flag==1 || (flag==2 && m->bLen==1)) {
if(flag==0 || flag==1 || (flag==2 && m->bLen==1) || flag==4) {
map[j++]=i;
if(m->uLen>1) {
@ -672,6 +672,8 @@ getFromUBytesValue(CnvExtData *extData, UCMTable *table, UCMapping *m) {
value|=(uint32_t)m->bLen<<UCNV_EXT_FROM_U_LENGTH_SHIFT;
if(m->f==0) {
value|=UCNV_EXT_FROM_U_ROUNDTRIP_FLAG;
} else if(m->f==4) {
value|=UCNV_EXT_FROM_U_GOOD_ONE_WAY_FLAG;
}
/* calculate the real UTF-16 length (see recoding in prepareFromUMappings()) */

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2012, International Business Machines
* Copyright (C) 2000-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -1049,6 +1049,11 @@ MBCSAddTable(NewConverter *cnvData, UCMTable *table, UConverterStaticData *stati
staticData->hasToUnicodeFallback=TRUE;
isOK&=MBCSAddToUnicode(mbcsData, m->b.bytes, m->bLen, c, f);
break;
case 4:
/* move "good one-way" mappings to the extension table */
m->f|=MBCS_FROM_U_EXT_FLAG;
m->moveFlag=UCM_MOVE_TO_EXT;
break;
default:
/* will not occur because the parser checked it already */
fprintf(stderr, "error: illegal fallback indicator %d\n", f);

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2012, International Business Machines
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -327,7 +327,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
return result;
}
if(0<=mb->f && mb->f<=2) {
if((0<=mb->f && mb->f<=2) || mb->f==4) {
break;
}
@ -339,7 +339,7 @@ checkBaseExtUnicode(UCMStates *baseStates, UCMTable *base, UCMTable *ext,
return result;
}
if(0<=me->f && me->f<=2) {
if((0<=me->f && me->f<=2) || me->f==4) {
break;
}
@ -857,8 +857,8 @@ ucm_parseMappingLine(UCMapping *m,
break;
} else if(*s=='|') {
f=(int8_t)(s[1]-'0');
if((uint8_t)f>3) {
fprintf(stderr, "ucm error: fallback indicator must be |0..|3 - \"%s\"\n", line);
if((uint8_t)f>4) {
fprintf(stderr, "ucm error: fallback indicator must be |0..|4 - \"%s\"\n", line);
return FALSE;
}
break;
@ -1051,6 +1051,7 @@ ucm_mappingType(UCMStates *baseStates,
/*
* Suitable for an ICU conversion base table means:
* - a 1:1 mapping (1 Unicode code point : 1 byte sequence)
* - precision flag 0..3
* - SBCS: any 1:1 mapping
* (the table stores additional bits to distinguish mapping types)
* - MBCS: not a |2 SUB mapping for <subchar1>
@ -1070,7 +1071,7 @@ ucm_mappingType(UCMStates *baseStates,
* makeconv uses a hack for moving mappings only for the fromUnicode table
* that only works with non-negative values of f.
*/
if( m->uLen==1 && count==1 &&
if( m->uLen==1 && count==1 && m->f<=3 &&
(baseStates->maxCharLength==1 ||
!((m->f==2 && m->bLen==1) ||
(m->f==1 && bytes[0]==0) ||
@ -1146,7 +1147,7 @@ ucm_readTable(UCMFile *ucm, FileStream* convFile,
char line[500];
char *end;
UBool isOK;
if(U_FAILURE(*pErrorCode)) {
return;
}

View File

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 2003-2010, International Business Machines
* Copyright (C) 2003-2013, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: ucm.h
@ -45,7 +45,8 @@ enum {
* bIsMultipleChars indicates that the bytes contain more than one sequence
* according to the state table
* f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3)
* same values as in the source file after |
* or "good one-way" mapping (4).
* Same values as in the source file after |
*/
typedef struct UCMapping {
UChar32 u;