ICU-10524 normalization one-way mapping with trailing ccc>1 has no compose-boundary-after

X-SVN-Rev: 40355
This commit is contained in:
Markus Scherer 2017-08-25 22:46:12 +00:00
parent 3e9530c048
commit 2f87cf4c46
11 changed files with 68 additions and 26 deletions

View File

@ -300,21 +300,21 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,1,0x864,0x198d,1,1,1,1,1,1,0x868,0x1993,1,0x86c,
0x1999,1,1,1,1,1,1,1,0xfc0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,0xffcc,0xffb8,0xffcc,
0xffcc,1,1,1,0x29dd,0x29e3,0x29e9,0x29ef,0x29f5,0x29fb,0x2a01,0x2a07,1,1,1,1,
0xffcc,1,1,1,0x29dc,0x29e2,0x29e8,0x29ee,0x29f4,0x29fa,0x2a00,0x2a06,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,0xfe0e,1,0xfc00,1,1,1,1,1,
1,1,1,0x870,1,1,1,0x199f,0x19a5,0xfe12,1,1,1,1,1,1,
1,1,1,0xfc00,1,1,1,1,0x2a0d,0x2a13,1,0x2a19,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1f,
1,1,0x2a25,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,0xfc00,1,1,1,1,0x2a0c,0x2a12,1,0x2a18,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x2a1e,
1,1,0x2a24,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
1,1,1,1,1,0x2a2b,0x2a31,0x2a37,1,1,0x2a3d,1,1,1,1,1,
1,1,1,1,1,0x2a2a,0x2a30,0x2a36,1,1,0x2a3c,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,0xfe0e,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0xfe12,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0x878,
0x19ab,1,1,0x19b1,0x19b7,0xfe12,1,1,1,1,1,1,1,1,0xfc00,0xfc00,
1,1,1,1,0x2a43,0x2a49,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,0x2a42,0x2a48,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,0x884,1,0x19bd,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,0xfc00,1,
@ -342,7 +342,7 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,0x2a4f,1,1,1,1,1,1,1,1,1,0x2a55,1,1,
1,1,0x2a5b,1,1,1,1,0x2a61,1,1,1,1,0x2a67,1,1,1,
1,1,1,1,1,1,1,1,1,0x2a6d,1,1,1,1,1,1,
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a73,1,0x2a79,1,0xff04,0xff04,0xff04,0xff04,1,1,
1,0xff02,0xff04,0x3c40,0xff08,0x3c48,0x2a72,1,0x2a78,1,0xff04,0xff04,0xff04,0xff04,1,1,
0xff04,0x3c50,0xffcc,0xffcc,0xfe12,1,0xffcc,0xffcc,1,1,1,1,1,1,1,1,
1,1,1,0x2a7f,1,1,1,1,1,1,1,1,1,0x2a85,1,1,
1,1,0x2a8b,1,1,1,1,0x2a91,1,1,1,1,0x2a97,1,1,1,
@ -406,15 +406,15 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
0x21ef,0x21f9,0x2203,0x220d,0x10d8,0x10e6,0x2217,0x2221,0x222b,0x2235,1,1,0x10f4,0x1102,0x223f,0x2249,
0x2253,0x225d,1,1,0x1110,0x1122,0x2267,0x2271,0x227b,0x2285,0x228f,0x2299,1,0x1134,1,0x22a3,
1,0x22ad,1,0x22b7,0x1146,0x115c,0x1174,0x1182,0x1190,0x119e,0x11ac,0x11ba,0x11c6,0x11dc,0x11f4,0x1202,
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b97,0x1250,0x3b9e,0x22c5,0x3ba7,0x22cb,0x3baf,0x22d1,0x3bb7,
0x1210,0x121e,0x122c,0x123a,0x1246,0x3b8e,0x22bf,0x3b96,0x1250,0x3b9e,0x22c5,0x3ba6,0x22cb,0x3bae,0x22d1,0x3bb6,
0x125a,0x3bbe,1,1,0x22d8,0x22e2,0x22f1,0x2301,0x2311,0x2321,0x2331,0x2341,0x234c,0x2356,0x2365,0x2375,
0x2385,0x2395,0x23a5,0x23b5,0x23c0,0x23ca,0x23d9,0x23e9,0x23f9,0x2409,0x2419,0x2429,0x2434,0x243e,0x244d,0x245d,
0x246d,0x247d,0x248d,0x249d,0x24a8,0x24b2,0x24c1,0x24d1,0x24e1,0x24f1,0x2501,0x2511,0x251c,0x2526,0x2535,0x2545,
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc7,
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bcf,0x2607,0x3bd7,
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be1,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3beb,
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf5,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bff,
0x26b3,0x26b9,0x3c07,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0f,0x26e9,0x3c17,
0x2555,0x2565,0x2575,0x2585,0x258f,0x2595,0x259d,0x25a4,0x25ad,1,0x1264,0x25b7,0x25bf,0x25c5,0x25cb,0x3bc6,
0x25d0,1,0x2aa2,0x8f0,1,0x25d7,0x25df,0x25e6,0x25ef,1,0x126e,0x25f9,0x2601,0x3bce,0x2607,0x3bd6,
0x260c,0x2613,0x2619,0x261f,0x2625,0x262b,0x2633,0x3be0,1,1,0x263b,0x2643,0x264b,0x2651,0x2657,0x3bea,
1,0x265d,0x2663,0x2669,0x266f,0x2675,0x267d,0x3bf4,0x2685,0x268b,0x2691,0x2699,0x26a1,0x26a7,0x26ad,0x3bfe,
0x26b3,0x26b9,0x3c06,0x2aa7,1,1,0x26c1,0x26c8,0x26d1,1,0x1278,0x26db,0x26e3,0x3c0e,0x26e9,0x3c16,
0x26ee,0x2aab,0x8fc,1,0xfa09,0xfa09,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,0xffcc,0xffcc,0xfe02,0xfe02,0xffcc,0xffcc,0xffcc,0xffcc,0xfe02,0xfe02,0xfe02,0xffcc,
@ -512,10 +512,10 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
0x311b,0x3009,0x311f,0x3123,0x3127,0x312b,0x312f,0x3011,0x2f09,0x3133,0x3015,0x3137,0x3019,0x313b,0x2ae1,0x313f,
0x3145,0x314b,0x3151,0x3155,0x3159,0x315d,0x3163,0x3169,0x316f,0x3173,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0x3177,0xfe34,0x317d,1,1,1,1,
1,1,1,1,1,1,0x3183,0x3189,0x3191,0x319b,0x31a3,0x31a9,0x31af,0x31b5,0x31bb,0x31c1,
0x31c7,0x31cd,0x31d3,1,0x31d9,0x31df,0x31e5,0x31eb,0x31f1,1,0x31f7,1,0x31fd,0x3203,1,0x3209,
0x320f,1,0x3215,0x321b,0x3221,0x3227,0x322d,0x3233,0x3239,0x323f,0x3245,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,0x3176,0xfe34,0x317c,1,1,1,1,
1,1,1,1,1,1,0x3182,0x3188,0x3190,0x319a,0x31a2,0x31a8,0x31ae,0x31b4,0x31ba,0x31c0,
0x31c6,0x31cc,0x31d2,1,0x31d8,0x31de,0x31e4,0x31ea,0x31f0,1,0x31f6,1,0x31fc,0x3202,1,0x3208,
0x320e,1,0x3214,0x321a,0x3220,0x3226,0x322c,0x3232,0x3238,0x323e,0x3244,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,
0xffcc,0xffcc,0xffcc,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffcc,0xffcc,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@ -560,13 +560,13 @@ static const uint16_t norm2_nfc_data_trieIndex[9776]={
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,0xfe02,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,0x324b,0x3255,0x3269,0x3281,0x3299,0x32b1,0x32c9,0xffb0,0xffb0,0xfe02,
1,1,1,1,1,1,0x324a,0x3254,0x3268,0x3280,0x3298,0x32b0,0x32c8,0xffb0,0xffb0,0xfe02,
0xfe02,0xfe02,1,1,1,0xffc4,0xffb0,0xffb0,0xffb0,0xffb0,0xffb0,1,1,1,1,1,
1,1,1,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,0xffb8,1,1,0xffcc,0xffcc,0xffcc,
0xffcc,0xffcc,0xffb8,0xffb8,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,0xffcc,0xffcc,0xffcc,0xffcc,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x32d7,0x32e1,0x32f5,0x330d,0x3325,
0x333d,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,0x32d6,0x32e0,0x32f4,0x330c,0x3324,
0x333c,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,0xffcc,0xffcc,0xffcc,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -59,6 +59,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
TESTCASE_AUTO(TestNormalizeIllFormedText);
TESTCASE_AUTO(TestComposeJamoTBase);
TESTCASE_AUTO(TestComposeBoundaryAfter);
TESTCASE_AUTO_END;
}
@ -1754,4 +1755,21 @@ BasicNormalizerTest::TestComposeJamoTBase() {
assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
}
void
BasicNormalizerTest::TestComposeBoundaryAfter() {
IcuTestErrorCode errorCode(*this, "TestComposeBoundaryAfter");
const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
return;
}
// U+02DA and U+FB2C do not have compose-boundaries-after.
UnicodeString s(u"\u02DA\u0339 \uFB2C\u05B6");
UnicodeString expected(u" \u0339\u030A \u05E9\u05B6\u05BC\u05C1");
UnicodeString result = nfkc->normalize(s, errorCode);
assertSuccess("nfkc", errorCode.get());
assertEquals("nfkc", expected, result);
assertFalse("U+02DA boundary-after", nfkc->hasBoundaryAfter(0x2DA));
assertFalse("U+FB2C boundary-after", nfkc->hasBoundaryAfter(0xFB2C));
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View File

@ -52,6 +52,7 @@ public:
void TestLowMappingToEmpty_FCD();
void TestNormalizeIllFormedText();
void TestComposeJamoTBase();
void TestComposeBoundaryAfter();
private:
UnicodeString canonTests[24][3];

View File

@ -209,7 +209,8 @@ void Normalizer2DataBuilder::removeMapping(UChar32 c) {
norms.mappingSet.add(c);
}
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
Norm::MappingType mappingType) const {
if(buffer.isEmpty()) {
return FALSE; // Maps-to-empty-string is no boundary of any kind.
}
@ -217,6 +218,15 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin
if(lastStarterIndex<0) {
return FALSE; // no starter
}
const int32_t lastIndex=buffer.length()-1;
if(mappingType==Norm::ONE_WAY && lastStarterIndex<lastIndex && buffer.ccAt(lastIndex)>1) {
// One-way mapping where after the last starter is at least one combining mark
// with a combining class greater than 1,
// which means that another combining mark can reorder before it.
// By contrast, in a round-trip mapping this does not prevent a boundary as long as
// the starter or composite does not combine-forward with a following combining mark.
return FALSE;
}
UChar32 starter=buffer.charAt(lastStarterIndex);
if(lastStarterIndex==0 && norms.combinesBack(starter)) {
// The last starter is at the beginning of the mapping and combines backward.
@ -227,7 +237,7 @@ UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderin
0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
// otherwise it is blocked.
return lastStarterIndex!=buffer.length()-1;
return lastStarterIndex!=lastIndex;
}
// Note: There can be no Hangul syllable in the fully decomposed mapping.
@ -344,7 +354,7 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
norm.hasCompBoundaryBefore=
!buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
norm.hasCompBoundaryAfter=
norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer);
norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer, norm.mappingType);
if(norm.combinesBack) {
norm.error="combines-back and decomposes, not possible in Unicode normalization";

View File

@ -84,7 +84,8 @@ private:
* or its mapping contains no starter,
* or the last starter combines-forward.
*/
UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const;
UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer,
Norm::MappingType mappingType) const;
/** Returns TRUE if the mapping by itself recomposes, that is, it is not comp-normalized. */
UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const;
void postProcess(Norm &norm);

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:193787da8cd2caebf1901892beccad07f8e7f3c714ef482681784bc583be5c60
size 12226288
oid sha256:36b9c089da215705724fac836c41e0db1922f8ea85bf734c601f70fc623187ad
size 12226253

View File

@ -2867,6 +2867,18 @@ public class BasicTest extends TestFmwk {
assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));
}
/**
 * Regression test for compose-boundary-after handling in NFKC.
 *
 * U+02DA and U+FB2C each map to a sequence that ends with combining marks.
 * A combining mark that follows such a character in the input
 * (U+0339 after U+02DA, U+05B6 after U+FB2C) must end up before the
 * trailing mark(s) of the mapping, as spelled out in the expected string.
 * Both characters must therefore report no boundary after themselves.
 */
@Test
public void TestComposeBoundaryAfter() {
    // Input: U+02DA U+0339, a space, then U+FB2C U+05B6.
    final String input = "\u02DA\u0339 \uFB2C\u05B6";
    // Output: the marks that followed each character now sit inside its mapping.
    final String expected = " \u0339\u030A \u05E9\u05B6\u05BC\u05C1";

    final Normalizer2 nfkc = Normalizer2.getNFKCInstance();
    final String actual = nfkc.normalize(input);
    assertEquals("nfkc", expected, actual);

    // Neither character may report a boundary after itself.
    final boolean boundaryAfter02DA = nfkc.hasBoundaryAfter(0x2DA);
    assertFalse("U+02DA boundary-after", boundaryAfter02DA);
    final boolean boundaryAfterFB2C = nfkc.hasBoundaryAfter(0xFB2C);
    assertFalse("U+FB2C boundary-after", boundaryAfterFB2C);
}
@Test
public void TestNFC() {
// Coverage tests.