QTBF: Fix issue with no splitting the words at "." (FULL STOP)

As of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
for better URL and abbreviations handling which caused "hi.there" to be treated
like if it were just a single word;
until we have the Unicode Text Segmentation tailoring mechanism, retain
the old behavior by remapping (some of) those characters back to their old values.

Change-Id: I49dea6064f2ea40a82fc0b1bc3c4f0b4e803919f
Reviewed-by: David Faure <david.faure@kdab.com>
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
Konstantin Ritt 2012-11-22 03:25:05 +02:00 committed by The Qt Project
parent 44b1c5dde2
commit 2fbb69a093
4 changed files with 64 additions and 4 deletions

View File

@ -4796,7 +4796,7 @@ static const Properties uc_properties[] = {
{ 26, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 9, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 11, 8, 0 },
{ 20, 3, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 16, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 8, 10, 8, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 10, 8, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 7, 0 },
{ 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
@ -4808,7 +4808,7 @@ static const Properties uc_properties[] = {
{ 3, 2, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 3, 2, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 11, 9, 11, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 9, 11, 8, 0 },
{ 25, 6, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 11, 8, 0 },
{ 25, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 10, 0, 8, 0 },
{ 26, 10, 0, 0, -1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },
{ 26, 10, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 12, 0 },

View File

@ -156,6 +156,18 @@ static void getWordBreaks(const ushort *string, quint32 len, QCharAttributes *at
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreakClass ncls = (QUnicodeTables::WordBreakClass) prop->wordBreakClass;
#ifdef QT_BUILD_INTERNAL
if (qt_initcharattributes_default_algorithm_only) {
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
// which caused "hi.there" to be treated like if it were just a single word;
// by remapping those characters in the Unicode tables generator.
// this code is needed to pass the coverage tests; remove once the issue is fixed.
if (ucs4 == 0x002E) // FULL STOP
ncls = QUnicodeTables::WordBreak_MidNumLet;
else if (ucs4 == 0x003A) // COLON
ncls = QUnicodeTables::WordBreak_MidLetter;
}
#endif
uchar action = WB::breakTable[cls][ncls];
if (Q_UNLIKELY(action == WB::Lookup)) {

View File

@ -330,8 +330,40 @@ void tst_QTextBoundaryFinder::wordBoundaries_manual_data()
expectedStartPositions << 0 << 4 << 8 << 14 << 18 << 22;
expectedEndPositions << 3 << 7 << 11 << 17 << 21 << 25;
QTest::newRow("data1") << testString << expectedBreakPositions
<< expectedStartPositions << expectedEndPositions;
QTest::newRow("words1") << testString << expectedBreakPositions
<< expectedStartPositions << expectedEndPositions;
}
{
QString testString(QString::fromUtf8("Hello (sad) world !"));
QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
expectedBreakPositions << 0 << 5 << 6 << 7 << 10 << 11 << 12 << 17 << 18 << 19;
expectedStartPositions << 0 << 7 << 12;
expectedEndPositions << 5 << 10 << 17;
QTest::newRow("words2") << testString << expectedBreakPositions
<< expectedStartPositions << expectedEndPositions;
}
{
QString testString(QString::fromUtf8("mr.Hamster"));
QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
expectedBreakPositions << 0 << 2 << 3 << 10;
expectedStartPositions << 0 << 3;
expectedEndPositions << 2 << 10;
QTest::newRow("words3") << testString << expectedBreakPositions
<< expectedStartPositions << expectedEndPositions;
}
{
QString testString(QString::fromUtf8("This is a sample buffer.Please test me . He's don't Le'Clerk."));
QList<int> expectedBreakPositions, expectedStartPositions, expectedEndPositions;
expectedBreakPositions << 0 << 4 << 5 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 20 << 21 << 27
<< 28 << 34 << 35 << 39 << 40 << 42 << 43 << 44 << 45 << 46 << 47 << 48
<< 49 << 53 << 54 << 59 << 60 << 68 << 69;
expectedStartPositions << 0 << 5 << 12 << 14 << 21 << 28 << 35 << 40 << 49 << 54 << 60;
expectedEndPositions << 4 << 7 << 13 << 20 << 27 << 34 << 39 << 42 << 53 << 59 << 68;
QTest::newRow("words4") << testString << expectedBreakPositions
<< expectedStartPositions << expectedEndPositions;
}
{
// text with trailing space
@ -512,6 +544,13 @@ void tst_QTextBoundaryFinder::sentenceBoundaries_manual_data()
QTest::newRow("data2") << testString << expectedBreakPositions;
}
{
QString testString(QString::fromUtf8("mr.Hamster"));
QList<int> expectedBreakPositions;
expectedBreakPositions << 0 << 3 << 10;
QTest::newRow("data3") << testString << expectedBreakPositions;
}
}
void tst_QTextBoundaryFinder::sentenceBoundaries_manual()

View File

@ -1572,6 +1572,15 @@ static void readWordBreak()
qFatal("unassigned word break class: %s", l[1].constData());
for (int codepoint = from; codepoint <= to; ++codepoint) {
// ### [
// as of Unicode 5.1, some punctuation marks were mapped to MidLetter and MidNumLet
// which caused "hi.there" to be treated like if it were just a single word;
// until we have a tailoring mechanism, retain the old behavior by remapping those characters here.
if (codepoint == 0x002E) // FULL STOP
brk = WordBreak_MidNum;
else if (codepoint == 0x003A) // COLON
brk = WordBreak_Other;
// ] ###
UnicodeData &ud = UnicodeData::valueRef(codepoint);
ud.p.wordBreakClass = brk;
}