ICU-8804 Normalizer2::composePair(a, b) with separation of minYesNo extraData into combines-forward vs. not
X-SVN-Rev: 30982
This commit is contained in:
parent
c80f9c5856
commit
bed105857f
@ -160,6 +160,11 @@ FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition
|
|||||||
return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
|
return set.contains(c) && norm2.getRawDecomposition(c, decomposition);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Pairwise composition through the filter: the wrapped normalizer is consulted
// only when both code points are contained in `set`; otherwise there is no composite.
UChar32
FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
    if(!set.contains(a) || !set.contains(b)) {
        // At least one of the two code points is filtered out.
        return U_SENTINEL;
    }
    return norm2.composePair(a, b);
}
|
||||||
|
|
||||||
uint8_t
|
uint8_t
|
||||||
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
|
FilteredNormalizer2::getCombiningClass(UChar32 c) const {
|
||||||
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
return set.contains(c) ? norm2.getCombiningClass(c) : 0;
|
||||||
|
@ -40,6 +40,11 @@ Normalizer2::getRawDecomposition(UChar32, UnicodeString &) const {
|
|||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Default implementation: reports "no composite" (a negative value) for every pair.
// Subclasses that support pairwise composition override this.
UChar32
Normalizer2::composePair(UChar32 /*a*/, UChar32 /*b*/) const {
    return U_SENTINEL;
}
|
||||||
|
|
||||||
uint8_t
|
uint8_t
|
||||||
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
|
Normalizer2::getCombiningClass(UChar32 /*c*/) const {
|
||||||
return 0;
|
return 0;
|
||||||
@ -223,6 +228,10 @@ public:
|
|||||||
}
|
}
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
// Forwards pairwise composition of a+b to the underlying impl object
// and hands its result back unchanged.
virtual UChar32
composePair(UChar32 a, UChar32 b) const {
    const UChar32 composite=impl.composePair(a, b);
    return composite;
}
|
||||||
|
|
||||||
virtual uint8_t
|
virtual uint8_t
|
||||||
getCombiningClass(UChar32 c) const {
|
getCombiningClass(UChar32 c) const {
|
||||||
@ -853,6 +862,11 @@ unorm2_getRawDecomposition(const UNormalizer2 *norm2,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// C API entry point: views the opaque UNormalizer2 handle as the C++ Normalizer2
// it wraps and forwards the pairwise composition request to it.
U_DRAFT UChar32 U_EXPORT2
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b) {
    const Normalizer2 *normalizer=reinterpret_cast<const Normalizer2 *>(norm2);
    return normalizer->composePair(a, b);
}
|
||||||
|
|
||||||
U_DRAFT uint8_t U_EXPORT2
|
U_DRAFT uint8_t U_EXPORT2
|
||||||
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
|
unorm2_getCombiningClass(const UNormalizer2 *norm2, UChar32 c) {
|
||||||
return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
|
return reinterpret_cast<const Normalizer2 *>(norm2)->getCombiningClass(c);
|
||||||
|
@ -301,6 +301,7 @@ Normalizer2Impl::load(const char *packageName, const char *name, UErrorCode &err
|
|||||||
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
|
minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
|
||||||
|
|
||||||
minYesNo=inIndexes[IX_MIN_YES_NO];
|
minYesNo=inIndexes[IX_MIN_YES_NO];
|
||||||
|
minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||||
minNoNo=inIndexes[IX_MIN_NO_NO];
|
minNoNo=inIndexes[IX_MIN_NO_NO];
|
||||||
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
|
limitNoNo=inIndexes[IX_LIMIT_NO_NO];
|
||||||
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
|
minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
|
||||||
@ -967,6 +968,50 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
|
|||||||
buffer.setReorderingLimit(limit);
|
buffer.setReorderingLimit(limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Performs pairwise composition of a & b.
 *
 * The norm16 value of 'a' selects how the pair is handled:
 * - inert 'a': never composes;
 * - Jamo L + Jamo V and Hangul LV + Jamo T: composed algorithmically;
 * - other norm16 below minYesNoMappingsOnly: 'a' has a compositions list in extraData
 *   (after its mapping, if it also has one);
 * - norm16 in [minMaybeYes..MIN_NORMAL_MAYBE_YES[: compositions list in maybeYesCompositions;
 * - everything else: 'a' does not combine forward.
 *
 * @param a The first (leading) code point.
 * @param b The second (trailing) code point.
 * @return The non-negative composite code point if there is one; otherwise U_SENTINEL.
 */
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
    uint16_t norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
    const uint16_t *list;
    if(isInert(norm16)) {
        return U_SENTINEL;
    } else if(norm16<minYesNoMappingsOnly) {
        if(isJamoL(norm16)) {
            // Jamo L + Jamo V -> Hangul LV syllable
            b-=Hangul::JAMO_V_BASE;
            if(0<=b && b<Hangul::JAMO_V_COUNT) {
                return
                    (Hangul::HANGUL_BASE+
                     ((a-Hangul::JAMO_L_BASE)*Hangul::JAMO_V_COUNT+b)*
                     Hangul::JAMO_T_COUNT);
            } else {
                return U_SENTINEL;
            }
        } else if(isHangul(norm16)) {
            // Hangul LV syllable + Jamo T -> Hangul LVT syllable
            b-=Hangul::JAMO_T_BASE;
            if(Hangul::isHangulWithoutJamoT(a) && 0<b && b<Hangul::JAMO_T_COUNT) {  // not b==0!
                return a+b;
            } else {
                return U_SENTINEL;
            }
        } else {
            // 'a' has a compositions list in extraData
            list=extraData+norm16;
            if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                list+=  // mapping pointer
                    1+  // +1 to skip the first unit with the mapping length
                    (*list&MAPPING_LENGTH_MASK);  // + mapping length
            }
        }
    } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
        return U_SENTINEL;
    } else {
        list=maybeYesCompositions+norm16-minMaybeYes;
    }
    if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
        return U_SENTINEL;
    }
    // NOTE(review): combine() is declared elsewhere; it is assumed to return a negative
    // value when the list has no entry for b, and otherwise the composite shifted left
    // by one with a flag in the low bit -- confirm against its declaration.
    // Right-shifting a negative signed value is implementation-defined before C++20,
    // so test the sign explicitly instead of relying on an arithmetic shift.
    const int32_t compositeAndFwd=combine(list, b);
    return compositeAndFwd>=0 ? compositeAndFwd>>1 : U_SENTINEL;
}
|
||||||
|
|
||||||
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
|
// Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
|
||||||
// doCompose: normalize
|
// doCompose: normalize
|
||||||
// !doCompose: isNormalized (buffer must be empty and initialized)
|
// !doCompose: isNormalized (buffer must be empty and initialized)
|
||||||
|
@ -297,6 +297,8 @@ public:
|
|||||||
*/
|
*/
|
||||||
const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
|
const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const;
|
||||||
|
|
||||||
|
UChar32 composePair(UChar32 a, UChar32 b) const;
|
||||||
|
|
||||||
UBool isCanonSegmentStarter(UChar32 c) const;
|
UBool isCanonSegmentStarter(UChar32 c) const;
|
||||||
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
|
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
|
||||||
|
|
||||||
@ -328,12 +330,13 @@ public:
|
|||||||
IX_MIN_COMP_NO_MAYBE_CP,
|
IX_MIN_COMP_NO_MAYBE_CP,
|
||||||
|
|
||||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||||
IX_MIN_YES_NO,
|
IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||||
IX_MIN_NO_NO,
|
IX_MIN_NO_NO,
|
||||||
IX_LIMIT_NO_NO,
|
IX_LIMIT_NO_NO,
|
||||||
IX_MIN_MAYBE_YES,
|
IX_MIN_MAYBE_YES,
|
||||||
|
|
||||||
IX_RESERVED14,
|
IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||||
|
|
||||||
IX_RESERVED15,
|
IX_RESERVED15,
|
||||||
IX_COUNT
|
IX_COUNT
|
||||||
};
|
};
|
||||||
@ -407,7 +410,7 @@ private:
|
|||||||
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
|
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
|
||||||
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
|
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
|
||||||
static UBool isInert(uint16_t norm16) { return norm16==0; }
|
static UBool isInert(uint16_t norm16) { return norm16==0; }
|
||||||
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
|
static UBool isJamoL(uint16_t norm16) { return norm16==1; }
|
||||||
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
|
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
|
||||||
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
|
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
|
||||||
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
|
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
|
||||||
@ -518,6 +521,7 @@ private:
|
|||||||
|
|
||||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||||
uint16_t minYesNo;
|
uint16_t minYesNo;
|
||||||
|
uint16_t minYesNoMappingsOnly;
|
||||||
uint16_t minNoNo;
|
uint16_t minNoNo;
|
||||||
uint16_t limitNoNo;
|
uint16_t limitNoNo;
|
||||||
uint16_t minMaybeYes;
|
uint16_t minMaybeYes;
|
||||||
@ -718,12 +722,13 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||||||
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
|
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
|
||||||
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
|
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
|
||||||
*
|
*
|
||||||
* The next four indexes are thresholds of 16-bit trie values for ranges of
|
* The next five indexes are thresholds of 16-bit trie values for ranges of
|
||||||
* values indicating multiple normalization properties.
|
* values indicating multiple normalization properties.
|
||||||
* minYesNo=indexes[IX_MIN_YES_NO];
|
* minYesNo=indexes[IX_MIN_YES_NO];
|
||||||
* minNoNo=indexes[IX_MIN_NO_NO];
|
* minNoNo=indexes[IX_MIN_NO_NO];
|
||||||
* limitNoNo=indexes[IX_LIMIT_NO_NO];
|
* limitNoNo=indexes[IX_LIMIT_NO_NO];
|
||||||
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
|
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
|
||||||
|
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||||
* See the normTrie description below and the design doc for details.
|
* See the normTrie description below and the design doc for details.
|
||||||
*
|
*
|
||||||
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
|
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
|
||||||
@ -735,7 +740,7 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||||||
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
|
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
|
||||||
* which means it has a two-way (round-trip) decomposition mapping.
|
* which means it has a two-way (round-trip) decomposition mapping.
|
||||||
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
|
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
|
||||||
* pointing to mappings, composition lists, or both.
|
* pointing to mappings, compositions lists, or both.
|
||||||
* Value norm16==0 means that the character is normalization-inert, that is,
|
* Value norm16==0 means that the character is normalization-inert, that is,
|
||||||
* it does not have a mapping, does not participate in composition, has a zero
|
* it does not have a mapping, does not participate in composition, has a zero
|
||||||
* canonical combining class, and forms a boundary where text before it and after it
|
* canonical combining class, and forms a boundary where text before it and after it
|
||||||
@ -760,7 +765,7 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||||||
* There is only one byte offset for the end of these two arrays.
|
* There is only one byte offset for the end of these two arrays.
|
||||||
* The split between them is given by the constant and variable mentioned above.
|
* The split between them is given by the constant and variable mentioned above.
|
||||||
*
|
*
|
||||||
* The maybeYesCompositions array contains composition lists for characters that
|
* The maybeYesCompositions array contains compositions lists for characters that
|
||||||
* combine both forward (as starters in composition pairs)
|
* combine both forward (as starters in composition pairs)
|
||||||
* and backward (as trailing characters in composition pairs).
|
* and backward (as trailing characters in composition pairs).
|
||||||
* Such characters do not occur in Unicode 5.2 but are allowed by
|
* Such characters do not occur in Unicode 5.2 but are allowed by
|
||||||
@ -770,13 +775,13 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||||||
* If there are such characters, then minMaybeYes is subtracted from their norm16 values
|
* If there are such characters, then minMaybeYes is subtracted from their norm16 values
|
||||||
* to get the index into this array.
|
* to get the index into this array.
|
||||||
*
|
*
|
||||||
* The extraData array contains composition lists for "YesYes" characters,
|
* The extraData array contains compositions lists for "YesYes" characters,
|
||||||
* followed by mappings and optional composition lists for "YesNo" characters,
|
* followed by mappings and optional compositions lists for "YesNo" characters,
|
||||||
* followed by only mappings for "NoNo" characters.
|
* followed by only mappings for "NoNo" characters.
|
||||||
* (Referring to pairs of NFC/NFD quick check values.)
|
* (Referring to pairs of NFC/NFD quick check values.)
|
||||||
* The norm16 values of those characters are directly indexes into the extraData array.
|
* The norm16 values of those characters are directly indexes into the extraData array.
|
||||||
*
|
*
|
||||||
* The data structures for composition lists and mappings are described in the design doc.
|
* The data structures for compositions lists and mappings are described in the design doc.
|
||||||
*
|
*
|
||||||
* uint8_t smallFCD[0x100]; -- new in format version 2
|
* uint8_t smallFCD[0x100]; -- new in format version 2
|
||||||
*
|
*
|
||||||
@ -799,6 +804,11 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||||||
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
|
* This subsumes the one actual use of the MAPPING_PLUS_COMPOSITION_LIST bit which
|
||||||
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
|
* is then repurposed for the MAPPING_HAS_RAW_MAPPING bit.
|
||||||
* + For details see the design doc.
|
* + For details see the design doc.
|
||||||
|
* - Addition of indexes[IX_MIN_YES_NO_MAPPINGS_ONLY] and separation of the yesNo extraData into
|
||||||
|
* distinct ranges (combines-forward vs. not)
|
||||||
|
* so that a range check can be used to find out if there is a compositions list.
|
||||||
|
* This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
|
||||||
|
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
|
||||||
* - Addition of the smallFCD[] bit set.
|
* - Addition of the smallFCD[] bit set.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
@ -223,6 +223,24 @@ public:
|
|||||||
virtual UBool
|
virtual UBool
|
||||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs pairwise composition of a & b and returns the composite if there is one.
|
||||||
|
*
|
||||||
|
* Returns a composite code point c only if c has a two-way mapping to a+b.
|
||||||
|
* In standard Unicode normalization, this means that
|
||||||
|
* c has a canonical decomposition to a+b
|
||||||
|
* and c does not have the Full_Composition_Exclusion property.
|
||||||
|
*
|
||||||
|
* This function is independent of the mode of the Normalizer2.
|
||||||
|
* The default implementation returns a negative value.
|
||||||
|
* @param a A (normalization starter) code point.
|
||||||
|
* @param b Another code point.
|
||||||
|
* @return The non-negative composite code point if there is one; otherwise a negative value.
|
||||||
|
* @draft ICU 49
|
||||||
|
*/
|
||||||
|
virtual UChar32
|
||||||
|
composePair(UChar32 a, UChar32 b) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the combining class of c.
|
* Gets the combining class of c.
|
||||||
* The default implementation returns 0
|
* The default implementation returns 0
|
||||||
@ -459,6 +477,19 @@ public:
|
|||||||
virtual UBool
|
virtual UBool
|
||||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs pairwise composition of a & b and returns the composite if there is one.
|
||||||
|
* For details see the base class documentation.
|
||||||
|
*
|
||||||
|
* This function is independent of the mode of the Normalizer2.
|
||||||
|
* @param a A (normalization starter) code point.
|
||||||
|
* @param b Another code point.
|
||||||
|
* @return The non-negative composite code point if there is one; otherwise a negative value.
|
||||||
|
* @draft ICU 49
|
||||||
|
*/
|
||||||
|
virtual UChar32
|
||||||
|
composePair(UChar32 a, UChar32 b) const;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the combining class of c.
|
* Gets the combining class of c.
|
||||||
* The default implementation returns 0
|
* The default implementation returns 0
|
||||||
|
@ -318,6 +318,24 @@ unorm2_getRawDecomposition(const UNormalizer2 *norm2,
|
|||||||
UChar32 c, UChar *decomposition, int32_t capacity,
|
UChar32 c, UChar *decomposition, int32_t capacity,
|
||||||
UErrorCode *pErrorCode);
|
UErrorCode *pErrorCode);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs pairwise composition of a & b and returns the composite if there is one.
|
||||||
|
*
|
||||||
|
* Returns a composite code point c only if c has a two-way mapping to a+b.
|
||||||
|
* In standard Unicode normalization, this means that
|
||||||
|
* c has a canonical decomposition to a+b
|
||||||
|
* and c does not have the Full_Composition_Exclusion property.
|
||||||
|
*
|
||||||
|
* This function is independent of the mode of the UNormalizer2.
|
||||||
|
* @param norm2 UNormalizer2 instance
|
||||||
|
* @param a A (normalization starter) code point.
|
||||||
|
* @param b Another code point.
|
||||||
|
* @return The non-negative composite code point if there is one; otherwise a negative value.
|
||||||
|
* @draft ICU 49
|
||||||
|
*/
|
||||||
|
U_DRAFT UChar32 U_EXPORT2
|
||||||
|
unorm2_composePair(const UNormalizer2 *norm2, UChar32 a, UChar32 b);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Gets the combining class of c.
|
* Gets the combining class of c.
|
||||||
* The default implementation returns 0
|
* The default implementation returns 0
|
||||||
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1084,6 +1084,23 @@ unicodeDataLineFn(void *context,
|
|||||||
c, length, dmLength, u_errorName(*pErrorCode));
|
c, length, dmLength, u_errorName(*pErrorCode));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
/* recompose */
|
||||||
|
if(dt==U_DT_CANONICAL && !u_hasBinaryProperty(c, UCHAR_FULL_COMPOSITION_EXCLUSION)) {
|
||||||
|
UChar32 a, b, composite;
|
||||||
|
i=0;
|
||||||
|
U16_NEXT(dm, i, dmLength, a);
|
||||||
|
U16_NEXT(dm, i, dmLength, b);
|
||||||
|
/* i==dmLength */
|
||||||
|
composite=unorm2_composePair(nfc, a, b);
|
||||||
|
if(composite!=c) {
|
||||||
|
log_err("error: nfc U+%04lX decomposes to U+%04lX+U+%04lX but does not compose back (instead U+%04lX)\n",
|
||||||
|
(long)c, (long)a, (long)b, (long)composite);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Note: NFKC has fewer round-trip mappings than NFC,
|
||||||
|
* so we can't just test unorm2_composePair(nfkc, a, b) here without further data.
|
||||||
|
*/
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* get ISO Comment, field 11 */
|
/* get ISO Comment, field 11 */
|
||||||
|
@ -1175,6 +1175,15 @@ BasicNormalizerTest::TestCompare() {
|
|||||||
errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
|
errln("NFC.getRawDecomposition() returns TRUE for characters which do not have decompositions");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test composePair() for some pairs of characters that do not compose
|
||||||
|
if( nfcNorm2->composePair(0x20, 0x301)>=0 ||
|
||||||
|
nfcNorm2->composePair(0x61, 0x305)>=0 ||
|
||||||
|
nfcNorm2->composePair(0x1100, 0x1160)>=0 ||
|
||||||
|
nfcNorm2->composePair(0xac00, 0x11a7)>=0
|
||||||
|
) {
|
||||||
|
errln("NFC.composePair() incorrectly composes some pairs of characters");
|
||||||
|
}
|
||||||
|
|
||||||
// test FilteredNormalizer2::getDecomposition()
|
// test FilteredNormalizer2::getDecomposition()
|
||||||
UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode);
|
UnicodeSet filter(UNICODE_STRING_SIMPLE("[^\\u00a0-\\u00ff]"), errorCode);
|
||||||
FilteredNormalizer2 fn2(*nfcNorm2, filter);
|
FilteredNormalizer2 fn2(*nfcNorm2, filter);
|
||||||
@ -1190,6 +1199,13 @@ BasicNormalizerTest::TestCompare() {
|
|||||||
) {
|
) {
|
||||||
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
|
errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test FilteredNormalizer2::composePair()
|
||||||
|
if( 0x100!=fn2.composePair(0x41, 0x304) ||
|
||||||
|
fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
|
||||||
|
) {
|
||||||
|
errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// verify that case-folding does not un-FCD strings
|
// verify that case-folding does not un-FCD strings
|
||||||
|
@ -132,8 +132,18 @@ struct Norm {
|
|||||||
UBool hasNoCompBoundaryAfter;
|
UBool hasNoCompBoundaryAfter;
|
||||||
|
|
||||||
enum OffsetType {
|
enum OffsetType {
|
||||||
OFFSET_NONE, OFFSET_MAYBE_YES,
|
OFFSET_NONE,
|
||||||
OFFSET_YES_YES, OFFSET_YES_NO, OFFSET_NO_NO,
|
// Composition for back-combining character. Allowed, but not normally used.
|
||||||
|
OFFSET_MAYBE_YES,
|
||||||
|
// Composition for a starter that does not have a decomposition mapping.
|
||||||
|
OFFSET_YES_YES,
|
||||||
|
// Round-trip mapping & composition for a starter.
|
||||||
|
OFFSET_YES_NO_MAPPING_AND_COMPOSITION,
|
||||||
|
// Round-trip mapping for a starter that itself does not combine-forward.
|
||||||
|
OFFSET_YES_NO_MAPPING_ONLY,
|
||||||
|
// One-way mapping.
|
||||||
|
OFFSET_NO_NO,
|
||||||
|
// Delta for an algorithmic one-way mapping.
|
||||||
OFFSET_DELTA
|
OFFSET_DELTA
|
||||||
};
|
};
|
||||||
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
|
enum { OFFSET_SHIFT=4, OFFSET_MASK=(1<<OFFSET_SHIFT)-1 };
|
||||||
@ -784,7 +794,7 @@ public:
|
|||||||
ExtraDataWriter(Normalizer2DataBuilder &b) :
|
ExtraDataWriter(Normalizer2DataBuilder &b) :
|
||||||
Normalizer2DBEnumerator(b),
|
Normalizer2DBEnumerator(b),
|
||||||
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
|
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
|
||||||
yesNoData(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data
|
yesNoMappingsAndCompositions(1000, (UChar32)0, 1) {} // 0=Hangul, 1=start of normal data
|
||||||
virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
|
virtual UBool rangeHandler(UChar32 start, UChar32 end, uint32_t value) {
|
||||||
if(value!=0) {
|
if(value!=0) {
|
||||||
if(start!=end) {
|
if(start!=end) {
|
||||||
@ -800,7 +810,8 @@ public:
|
|||||||
}
|
}
|
||||||
UnicodeString maybeYesCompositions;
|
UnicodeString maybeYesCompositions;
|
||||||
UnicodeString yesYesCompositions;
|
UnicodeString yesYesCompositions;
|
||||||
UnicodeString yesNoData;
|
UnicodeString yesNoMappingsAndCompositions;
|
||||||
|
UnicodeString yesNoMappingsOnly;
|
||||||
UnicodeString noNoMappings;
|
UnicodeString noNoMappings;
|
||||||
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
|
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
|
||||||
};
|
};
|
||||||
@ -844,10 +855,15 @@ void Normalizer2DataBuilder::writeExtraData(UChar32 c, uint32_t value, ExtraData
|
|||||||
writeCompositions(c, p, writer.yesYesCompositions);
|
writeCompositions(c, p, writer.yesYesCompositions);
|
||||||
}
|
}
|
||||||
} else if(p->mappingType==Norm::ROUND_TRIP) {
|
} else if(p->mappingType==Norm::ROUND_TRIP) {
|
||||||
int32_t offset=writer.yesNoData.length()+writeMapping(c, p, writer.yesNoData);
|
|
||||||
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO;
|
|
||||||
if(p->compositions!=NULL) {
|
if(p->compositions!=NULL) {
|
||||||
writeCompositions(c, p, writer.yesNoData);
|
int32_t offset=writer.yesNoMappingsAndCompositions.length()+
|
||||||
|
writeMapping(c, p, writer.yesNoMappingsAndCompositions);
|
||||||
|
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION;
|
||||||
|
writeCompositions(c, p, writer.yesNoMappingsAndCompositions);
|
||||||
|
} else {
|
||||||
|
int32_t offset=writer.yesNoMappingsOnly.length()+
|
||||||
|
writeMapping(c, p, writer.yesNoMappingsOnly);
|
||||||
|
p->offset=(offset<<Norm::OFFSET_SHIFT)|Norm::OFFSET_YES_NO_MAPPING_ONLY;
|
||||||
}
|
}
|
||||||
} else /* one-way */ {
|
} else /* one-way */ {
|
||||||
if(p->compositions!=NULL) {
|
if(p->compositions!=NULL) {
|
||||||
@ -929,10 +945,14 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, uint32_t va
|
|||||||
case Norm::OFFSET_YES_YES:
|
case Norm::OFFSET_YES_YES:
|
||||||
norm16=offset;
|
norm16=offset;
|
||||||
break;
|
break;
|
||||||
case Norm::OFFSET_YES_NO:
|
case Norm::OFFSET_YES_NO_MAPPING_AND_COMPOSITION:
|
||||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
|
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+offset;
|
||||||
isDecompNo=TRUE;
|
isDecompNo=TRUE;
|
||||||
break;
|
break;
|
||||||
|
case Norm::OFFSET_YES_NO_MAPPING_ONLY:
|
||||||
|
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+offset;
|
||||||
|
isDecompNo=TRUE;
|
||||||
|
break;
|
||||||
case Norm::OFFSET_NO_NO:
|
case Norm::OFFSET_NO_NO:
|
||||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
|
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+offset;
|
||||||
isDecompNo=isCompNoMaybe=TRUE;
|
isDecompNo=isCompNoMaybe=TRUE;
|
||||||
@ -1042,7 +1062,8 @@ void Normalizer2DataBuilder::processData() {
|
|||||||
|
|
||||||
extraData=extraDataWriter.maybeYesCompositions;
|
extraData=extraDataWriter.maybeYesCompositions;
|
||||||
extraData.append(extraDataWriter.yesYesCompositions).
|
extraData.append(extraDataWriter.yesYesCompositions).
|
||||||
append(extraDataWriter.yesNoData).
|
append(extraDataWriter.yesNoMappingsAndCompositions).
|
||||||
|
append(extraDataWriter.yesNoMappingsOnly).
|
||||||
append(extraDataWriter.noNoMappings);
|
append(extraDataWriter.noNoMappings);
|
||||||
// Pad to even length for 4-byte alignment of following data.
|
// Pad to even length for 4-byte alignment of following data.
|
||||||
if(extraData.length()&1) {
|
if(extraData.length()&1) {
|
||||||
@ -1051,9 +1072,12 @@ void Normalizer2DataBuilder::processData() {
|
|||||||
|
|
||||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]=
|
indexes[Normalizer2Impl::IX_MIN_YES_NO]=
|
||||||
extraDataWriter.yesYesCompositions.length();
|
extraDataWriter.yesYesCompositions.length();
|
||||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
|
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=
|
||||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]+
|
indexes[Normalizer2Impl::IX_MIN_YES_NO]+
|
||||||
extraDataWriter.yesNoData.length();
|
extraDataWriter.yesNoMappingsAndCompositions.length();
|
||||||
|
indexes[Normalizer2Impl::IX_MIN_NO_NO]=
|
||||||
|
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+
|
||||||
|
extraDataWriter.yesNoMappingsOnly.length();
|
||||||
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
|
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=
|
||||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]+
|
indexes[Normalizer2Impl::IX_MIN_NO_NO]+
|
||||||
extraDataWriter.noNoMappings.length();
|
extraDataWriter.noNoMappings.length();
|
||||||
@ -1147,6 +1171,7 @@ void Normalizer2DataBuilder::writeBinaryFile(const char *filename) {
|
|||||||
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
|
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
|
||||||
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
|
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
|
||||||
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
|
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
|
||||||
|
printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
|
||||||
printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
|
printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
|
||||||
printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
|
printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
|
||||||
printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
|
printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
|
||||||
|
Loading…
Reference in New Issue
Block a user