Unicode: fix the grapheme clustering algorithm

An oversight in the code kept the algorithm in the GB11 state, even if
the codepoint that is being processed wouldn't allow for that (for
instance a sequence of ExtPic, Ext and Any).

Refactor the code of GB11/GB12/GB13 to deal with code points that break
the sequences (falling back to "normal" handling).

Add some manual tests; interestingly enough, the failing cases are not
covered by Unicode's tests, as we now pass the entire test suite.

Amends a794c5e287.

Fixes: QTBUG-94951
Pick-to: 6.1 5.15
Change-Id: If987d5ccf7c6b13de36d049b1b3d88a3c4b6dd00
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Giuseppe D'Angelo 2021-08-22 19:36:49 +02:00
parent ca604964f6
commit d48058f197
2 changed files with 142 additions and 29 deletions

View File

@ -165,9 +165,50 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib
QUnicodeTables::GraphemeBreakClass cls = (QUnicodeTables::GraphemeBreakClass) prop->graphemeBreakClass;
bool shouldBreak = GB::shouldBreakBetweenClasses(lcls, cls);
bool handled = false;
switch (state) {
case GB::State::Normal:
break; // will deal with it below
case GB::State::GB11_ExtPicExt:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
// keep going in the current state
Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
handled = true;
} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
state = GB::State::GB11_ExtPicExtZWJ;
Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
handled = true;
} else {
state = GB::State::Normal;
}
break;
case GB::State::GB11_ExtPicExtZWJ:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) {
shouldBreak = false;
handled = true;
}
state = GB::State::Normal;
break;
case GB::State::GB12_13_RI:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) {
shouldBreak = false;
handled = true;
}
state = GB::State::Normal;
break;
}
if (!handled) {
Q_ASSERT(state == GB::State::Normal);
if (lcls == QUnicodeTables::GraphemeBreak_Extended_Pictographic) { // GB11
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
state = GB::State::GB11_ExtPicExt;
@ -179,35 +220,6 @@ static void getGraphemeBreaks(const char16_t *string, qsizetype len, QCharAttrib
} else if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator) { // GB12, GB13
state = GB::State::GB12_13_RI;
}
break;
case GB::State::GB11_ExtPicExt:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_Extend);
if (cls == QUnicodeTables::GraphemeBreak_Extend) {
// keep going in the current state
Q_ASSERT(!shouldBreak); // GB9, do not break before Extend
} else if (cls == QUnicodeTables::GraphemeBreak_ZWJ) {
state = GB::State::GB11_ExtPicExtZWJ;
Q_ASSERT(!shouldBreak); // GB9, do not break before ZWJ
}
break;
case GB::State::GB11_ExtPicExtZWJ:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_ZWJ);
if (cls == QUnicodeTables::GraphemeBreak_Extended_Pictographic)
shouldBreak = false;
state = GB::State::Normal;
break;
case GB::State::GB12_13_RI:
Q_ASSERT(lcls == QUnicodeTables::GraphemeBreak_RegionalIndicator);
if (cls == QUnicodeTables::GraphemeBreak_RegionalIndicator)
shouldBreak = false;
state = GB::State::Normal;
break;
}
if (shouldBreak)

View File

@ -51,6 +51,9 @@ private slots:
void lineBoundariesDefault();
#endif
void graphemeBoundaries_manual_data();
void graphemeBoundaries_manual();
void wordBoundaries_manual_data();
void wordBoundaries_manual();
void sentenceBoundaries_manual_data();
@ -286,6 +289,104 @@ void tst_QTextBoundaryFinder::lineBoundariesDefault()
}
#endif // QT_BUILD_INTERNAL
void tst_QTextBoundaryFinder::graphemeBoundaries_manual_data()
{
QTest::addColumn<QString>("testString");
QTest::addColumn<QList<int>>("expectedBreakPositions");
{
// QTBUG-94951
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT
QChar(0xD83D), QChar(0xDCE9), // U+1F4E9 ENVELOPE WITH DOWNWARDS ARROW ABOVE
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 2, 4, 6};
QTest::newRow("+EXTPICxEXT+EXTPIC+EXTPIC+") << testString << expectedBreakPositions;
}
{
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 2, 4};
QTest::newRow("+EXTPICxEXT+EXTPICxEXT+") << testString << expectedBreakPositions;
}
{
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 4, 7};
QTest::newRow("+EXTPICxEXTxEXTxEXT+EXTPICxEXTxEXT+") << testString << expectedBreakPositions;
}
{
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0x200D), // U+200D ZERO WIDTH JOINER
QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 7};
QTest::newRow("+EXTPICxEXTxEXTxZWJxEXTPICxEXTxEXT+") << testString << expectedBreakPositions;
}
{
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0x200D), // U+200D ZERO WIDTH JOINER
QChar(0x0041), // U+0041 LATIN CAPITAL LETTER A
QChar(0xD83D), QChar(0xDCF2), // U+1F4F2 MOBILE PHONE WITH RIGHTWARDS ARROW AT LEFT
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 4, 5, 7};
QTest::newRow("+EXTPICxEXTxEXTxZWJ+Any+EXTPIC+") << testString << expectedBreakPositions;
}
{
QChar s[] = { QChar(0x2764), // U+2764 HEAVY BLACK HEART
QChar(0xFE0F), // U+FE0F VARIATION SELECTOR-16
QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E
QChar(0xD83C), QChar(0xDDFA), // U+1F1FA REGIONAL INDICATOR SYMBOL LETTER U
QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E
QChar(0xD83C), QChar(0xDDFA), // U+1F1FA REGIONAL INDICATOR SYMBOL LETTER U
QChar(0xD83C), QChar(0xDDEA), // U+1F1EA REGIONAL INDICATOR SYMBOL LETTER E
QChar(0x0041), // U+0041 LATIN CAPITAL LETTER A
};
QString testString(s, sizeof(s)/sizeof(s[0]));
QList<int> expectedBreakPositions{0, 2, 6, 10, 12, 13};
QTest::newRow("+EXTPICxEXT+RIxRI+RIxRI+RI+ANY+") << testString << expectedBreakPositions;
}
}
void tst_QTextBoundaryFinder::graphemeBoundaries_manual()
{
QFETCH(QString, testString);
QFETCH(QList<int>, expectedBreakPositions);
doTestData(testString, expectedBreakPositions, QTextBoundaryFinder::Grapheme);
}
void tst_QTextBoundaryFinder::wordBoundaries_manual_data()
{
QTest::addColumn<QString>("testString");