ICU-8804 Normalizer2::composePair(a, b) with separation of minYesNo extraData into combines-forward vs. not

X-SVN-Rev: 30982
This commit is contained in:
Markus Scherer 2011-11-27 20:29:38 +00:00
parent c80f9c5856
commit bed105857f
13 changed files with 201 additions and 20 deletions

View File

@ -160,6 +160,11 @@ FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition
return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
}
// Pairwise composition restricted by the filter set:
// a & b are handed to the wrapped normalizer only if the set contains both of them;
// otherwise there is no composite and U_SENTINEL is returned.
UChar32
FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    if(!set.contains(a) || !set.contains(b)) {
        // At least one input is outside the filter set.
        return U_SENTINEL;
    }
    return norm2.composePair(a, b);
}
uint8_t
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
return set.contains(c) ? norm2.getCombiningClass(c) : 0;

View File

@ -40,6 +40,11 @@ Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
return FALSE;
}
// Default implementation of pairwise composition:
// ignores both code points and always returns U_SENTINEL, that is, "no composite".
// Subclasses override this to provide real composition behavior.
UChar32
Normalizer2::composePair(UChar32, UChar32) const {
return U_SENTINEL;
}
uint8_t
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
return 0;
@ -223,6 +228,10 @@ public:
}
return TRUE;
}
// Pairwise composition of a & b: forwards directly to impl.composePair()
// and returns its result unchanged.
virtual UChar32
composePair(UChar32 a, UChar32 b) const {
return impl.composePair(a, b);
}
virtual uint8_t
getCombiningClass(UChar32 c) const {
@ -853,6 +862,11 @@ unorm2_getRawDecomposition(const UNormalizer2 *norm2,
}
}
// C API entry point for pairwise composition.
// Reinterprets the opaque UNormalizer2 pointer as a C++ Normalizer2
// and forwards to its virtual composePair(a, b).
// Note: norm2 is dereferenced without a NULL check.
U_DRAFT UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
return reinterpret_cast<const Normalizer2 *>(norm2)->composePair(a, b);
}
U_DRAFT uint8_t U_EXPORT2
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);

View File

@ -301,6 +301,7 @@ Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &err
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
minYesNo=inIndexes[IX_MIN_YES_NO];
minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
minNoNo=inIndexes[IX_MIN_NO_NO];
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
@ -967,6 +968,50 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
buffer.setReorderingLimit(limit);
}
// Performs pairwise composition of a & b.
// Returns the non-negative composite code point if there is one, otherwise U_SENTINEL.
//
// Depending on the norm16 value of 'a':
// - inert: nothing composes with it
// - Jamo L: composes algorithmically with a Jamo V into a Hangul LV syllable
// - Hangul: an LV syllable composes algorithmically with a Jamo T (other than the filler)
// - other values below minYesNoMappingsOnly: 'a' has a compositions list in extraData
// - values in [minMaybeYes, MIN_NORMAL_MAYBE_YES[: compositions list in maybeYesCompositions
// - everything else: 'a' does not combine forward
//
// 'b' may be any int32_t value; it is range-checked before it is used in arithmetic
// or passed to combine().
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        if(isJamoL(norm16)) {
            // Compare b against the Jamo V range *before* subtracting the base,
            // so that an arbitrary (e.g., very negative) b cannot cause signed overflow.
            if(Hangul::JAMO_V_BASE<=b && b<(Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT)) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+(b-Hangul::JAMO_V_BASE))*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangul(norm16)) {
            // Exclusive lower bound: b==JAMO_T_BASE (offset 0) does not compose.
            // As above, the range check precedes the subtraction to avoid signed overflow.
            if(Hangul::isHangulWithoutJamoT(a) &&
                    Hangul::JAMO_T_BASE<b && b<(Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT)) {
                return a+(b-Hangul::JAMO_T_BASE);
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=extraData+norm16;
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=maybeYesCompositions+norm16-minMaybeYes;
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // The composite is carried in the combine() result above its lowest bit.
    // combine() signals "no composite" with a negative value (this function's
    // negative-return contract relies on that). Right-shifting a negative signed
    // integer is implementation-defined before C++20, so test the sign first
    // rather than relying on an arithmetic shift to keep the result negative.
    int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
}
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
// doCompose: normalize
// !doCompose: isNormalized (buffer must be empty and initialized)

View File

@ -297,6 +297,8 @@ public:
*/
const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
UChar32 composePair(UChar32 a, UChar32 b) const;
UBool isCanonSegmentStarter(UChar32 c) const;
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
@ -328,12 +330,13 @@ public:
IX_MIN_COMP_NO_MAYBE_CP,
// Norm16 value thresholds for quick check combinations and types of extra data.
IX_MIN_YES_NO,
IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
IX_RESERVED14,
IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[.
IX_RESERVED15,
IX_COUNT
};
@ -407,7 +410,7 @@ private:
// Predicates on the 16-bit trie value (norm16) of a character.
// True if norm16 lies in the inclusive range [minMaybeYes, JAMO_VT].
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
// True if norm16 is at or above the minMaybeYes threshold.
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
// True if norm16==0, the value for normalization-inert characters.
static UBool isInert(uint16_t norm16) { return norm16==0; }
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
// True if norm16==1, the value reserved for Jamo L.
static UBool isJamoL(uint16_t norm16) { return norm16==1; }
// True if norm16 equals the JAMO_VT constant.
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
// True if norm16 equals minYesNo, the value used for Hangul syllables.
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
// True if norm16 is below the minNoNo threshold.
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
@ -518,6 +521,7 @@ private:
// Norm16 value thresholds for quick check combinations and types of extra data.
uint16_t minYesNo;
uint16_t minYesNoMappingsOnly;
uint16_t minNoNo;
uint16_t limitNoNo;
uint16_t minMaybeYes;
@ -718,12 +722,13 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
*
* The next four indexes are thresholds of 16-bit trie values for ranges of
* The next five indexes are thresholds of 16-bit trie values for ranges of
* values indicating multiple normalization properties.
* minYesNo=indexes[IX_MIN_YES_NO];
* minNoNo=indexes[IX_MIN_NO_NO];
* limitNoNo=indexes[IX_LIMIT_NO_NO];
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* See the normTrie description below and the design doc for details.
*
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
@ -735,7 +740,7 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
* which means it has a two-way (round-trip) decomposition mapping.
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
* pointing to mappings, composition lists, or both.
* pointing to mappings, compositions lists, or both.
* Value norm16==0 means that the character is normalization-inert, that is,
* it does not have a mapping, does not participate in composition, has a zero
* canonical combining class, and forms a boundary where text before it and after it
@ -760,7 +765,7 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
* There is only one byte offset for the end of these two arrays.
* The split between them is given by the constant and variable mentioned above.
*
* The maybeYesCompositions array contains composition lists for characters that
* The maybeYesCompositions array contains compositions lists for characters that
* combine both forward (as starters in composition pairs)
* and backward (as trailing characters in composition pairs).
* Such characters do not occur in Unicode 5.2 but are allowed by
@ -770,13 +775,13 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
* If there are such characters, then minMaybeYes is subtracted from their norm16 values
* to get the index into this array.
*
* The extraData array contains composition lists for "YesYes" characters,
* followed by mappings and optional composition lists for "YesNo" characters,
* The extraData array contains compositions lists for "YesYes" characters,
* followed by mappings and optional compositions lists for "YesNo" characters,
* followed by only mappings for "NoNo" characters.
* (Referring to pairs of NFC/NFD quick check values.)
* The norm16 values of those characters are directly indexes into the extraData array.
*
* The data structures for composition lists and mappings are described in the design doc.
* The data structures for compositions lists and mappings are described in the design doc.
*
* uint8_t smallFCD[0x100]; -- new in format version 2
*
@ -799,6 +804,11 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
* + For details see the design doc.
* - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
* distinct ranges (combines-forward vs. not)
* so that a range check can be used to find out if there is a compositions list.
* This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
* - Addition of the smallFCD[] bit set.
*/

View File

@ -223,6 +223,24 @@ public:
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
*
* Returns a composite code point c only if c has a two-way mapping to a+b.
* In standard Unicode normalization, this means that
* c has a canonical decomposition to a+b
* and c does not have the Full_Composition_Exclusion property.
*
* This function is independent of the mode of the Normalizer2.
* The default implementation returns a negative value.
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @draft ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const;
/**
* Gets the combining class of c.
* The default implementation returns 0
@ -459,6 +477,19 @@ public:
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
* For details see the base class documentation.
*
* This function is independent of the mode of the Normalizer2.
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @draft ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const;
/**
* Gets the combining class of c.
* The default implementation returns 0

View File

@ -318,6 +318,24 @@ unorm2_getRawDecomposition(const UNormalizer2 *norm2,
UChar32 c, UChar *decomposition, int32_t capacity,
UErrorCode *pErrorCode);
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
*
* Returns a composite code point c only if c has a two-way mapping to a+b.
* In standard Unicode normalization, this means that
* c has a canonical decomposition to a+b
* and c does not have the Full_Composition_Exclusion property.
*
* This function is independent of the mode of the UNormalizer2.
* @param norm2 UNormalizer2 instance
* @param a A (normalization starter) code point.
* @param b Another code point.
* @return The non-negative composite code point if there is one; otherwise a negative value.
* @draft ICU 49
*/
U_DRAFT UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b);
/**
* Gets the combining class of c.
* The default implementation returns 0

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1084,6 +1084,23 @@ unicodeDataLineFn(void *context,
c, length, dmLength, u_errorName(*pErrorCode));
return;
}
/* recompose */
if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
UChar32 a, b, composite;
i=0;
U16_NEXT(dm, i, dmLength, a);
U16_NEXT(dm, i, dmLength, b);
/* i==dmLength */
composite=unorm2_composePair(nfc, a, b);
if(composite!=c) {
log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
(long)c, (long)a, (long)b, (long)composite);
}
/*
* Note: NFKC has fewer round-trip mappings than NFC,
* so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
*/
}
#endif
/* get ISO Comment, field 11 */

View File

@ -1175,6 +1175,15 @@ BasicNormalizerTest::TestCompare() {
errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
}
// test composePair() for some pairs of characters that do not compose
if( nfcNorm2->composePair(0x20, 0x301)>=0 ||
nfcNorm2->composePair(0x61, 0x305)>=0 ||
nfcNorm2->composePair(0x1100, 0x1160)>=0 ||
nfcNorm2->composePair(0xac00, 0x11a7)>=0
) {
errln("NFC.composePair() incorrectly composes some pairs of characters");
}
// test FilteredNormalizer2::getDecomposition()
UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode);
FilteredNormalizer2 fn2(*nfcNorm2, filter);
@ -1190,6 +1199,13 @@ BasicNormalizerTest::TestCompare() {
) {
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
}
// test FilteredNormalizer2::composePair()
if( 0x100!=fn2.composePair(0x41, 0x304) ||
fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
) {
errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
}
}
// verify that case-folding does not un-FCD strings

View File

@ -132,8 +132,18 @@ struct Norm {
UBool hasNoCompBoundaryAfter;
enum OffsetType {
OFFSET_NONE, OFFSET_MAYBE_YES,
OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO,
OFFSET_NONE,
// Composition for back-combining character. Allowed, but not normally used.
OFFSET_MAYBE_YES,
// Composition for a starter that does not have a decomposition mapping.
OFFSET_YES_YES,
// Round-trip mapping & composition for a starter.
OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
// Round-trip mapping for a starter that itself does not combine-forward.
OFFSET_YES_NO_MAPPING_ONLY,
// One-way mapping.
OFFSET_NO_NO,
// Delta for an algorithmic one-way mapping.
OFFSET_DELTA
};
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
@ -784,7 +794,7 @@ public:
ExtraDataWriter(Normalizer2DataBuilder &b) :
Normalizer2DBEnumerator(b),
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
yesNoData(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data
yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data
virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
if(value!=0) {
if(start!=end) {
@ -800,7 +810,8 @@ public:
}
UnicodeString maybeYesCompositions;
UnicodeString yesYesCompositions;
UnicodeString yesNoData;
UnicodeString yesNoMappingsAndCompositions;
UnicodeString yesNoMappingsOnly;
UnicodeString noNoMappings;
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
};
@ -844,10 +855,15 @@ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
writeCompositions(c, p, writer.yesYesCompositions);
}
} else if(p->mappingType==Norm::ROUND_TRIP) {
int32_t offset=writer.yesNoData.length()+writeMapping(c, p, writer.yesNoData);
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO;
if(p->compositions!=NULL) {
writeCompositions(c, p, writer.yesNoData);
int32_t offset=writer.yesNoMappingsAndCompositions.length()+
writeMapping(c, p, writer.yesNoMappingsAndCompositions);
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
} else {
int32_t offset=writer.yesNoMappingsOnly.length()+
writeMapping(c, p, writer.yesNoMappingsOnly);
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
}
} else /* one-way */ {
if(p->compositions!=NULL) {
@ -929,10 +945,14 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t va
case Norm::OFFSET_YES_YES:
norm16=offset;
break;
case Norm::OFFSET_YES_NO:
case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
isDecompNo=TRUE;
break;
case Norm::OFFSET_YES_NO_MAPPING_ONLY:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
isDecompNo=TRUE;
break;
case Norm::OFFSET_NO_NO:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
isDecompNo=isCompNoMaybe=TRUE;
@ -1042,7 +1062,8 @@ void Normalizer2DataBuilder::processData() {
extraData=extraDataWriter.maybeYesCompositions;
extraData.append(extraDataWriter.yesYesCompositions).
append(extraDataWriter.yesNoData).
append(extraDataWriter.yesNoMappingsAndCompositions).
append(extraDataWriter.yesNoMappingsOnly).
append(extraDataWriter.noNoMappings);
// Pad to even length for 4-byte alignment of following data.
if(extraData.length()&1) {
@ -1051,9 +1072,12 @@ void Normalizer2DataBuilder::processData() {
indexes[Normalizer2Impl::IX_MIN_YES_NO]=
extraDataWriter.yesYesCompositions.length();
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
indexes[Normalizer2Impl::IX_MIN_YES_NO]+
extraDataWriter.yesNoData.length();
extraDataWriter.yesNoMappingsAndCompositions.length();
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
extraDataWriter.yesNoMappingsOnly.length();
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
indexes[Normalizer2Impl::IX_MIN_NO_NO]+
extraDataWriter.noNoMappings.length();
@ -1147,6 +1171,7 @@ void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);