From 3df159ba174c1775a0e77d2305a639eeab1ea71d Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Sat, 30 Jan 2016 10:18:25 +0400 Subject: [PATCH] Improve the script itemization algorithm to match Unicode 8.0 Override preceding Common-s with a subsequent non-Inherited, non-Common script. This produces longer script runs, which automagically improves the shaping quality (as we don't lose the context anymore), the shaping performance (as we typically shape fewer runs), and the fallback font selection (when the font supports more than just a single language/script). Task-number: QTBUG-29930 Change-Id: I1c55af30bd397871d7f1f6e062605517f5a7e5a1 Reviewed-by: Lars Knoll Reviewed-by: Eskil Abrahamsen Blomfeldt --- src/corelib/tools/qunicodetools.cpp | 71 +++++++------------ .../tst_qtextscriptengine.cpp | 22 ++---- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index 52e7b5a53f..fad4267edc 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -685,10 +685,10 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts) { int sor = 0; - int eor = -1; + int eor = 0; uchar script = QChar::Script_Common; - for (int i = 0; i < length; ++i) { - eor = i; + + for (int i = 0; i < length; ++i, eor = i) { uint ucs4 = string[i]; if (QChar::isHighSurrogate(ucs4) && i + 1 < length) { ushort low = string[i + 1]; @@ -700,60 +700,37 @@ Q_CORE_EXPORT void initScripts(const ushort *string, int length, uchar *scripts) const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4); - if (Q_LIKELY(prop->script == script || prop->script <= QChar::Script_Inherited)) + uchar nscript = prop->script; + + if (Q_LIKELY(nscript == script || nscript <= QChar::Script_Common)) continue; + // inherit preceding Common-s + if (Q_UNLIKELY(script <= QChar::Script_Common)) 
{ + // also covers a case where the base character of Common script followed + // by one or more combining marks of non-Inherited, non-Common script + script = nscript; + continue; + } + // Never break between a combining mark (gc= Mc, Mn or Me) and its base character. // Thus, a combining mark — whatever its script property value is — should inherit // the script property value of its base character. static const int test = (FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining) | FLAG(QChar::Mark_Enclosing)); - if (Q_UNLIKELY(FLAG(prop->category) & test)) { - // In cases where the base character itself has the Common script property value, - // and it is followed by one or more combining marks with a specific script property value, - // it may be even better for processing to let the base acquire the script property value - // from the first mark. This approach can be generalized by treating all the characters - // of a combining character sequence as having the script property value - // of the first non-Inherited, non-Common character in the sequence if there is one, - // and otherwise treating all the characters as having the Common script property value. - if (Q_LIKELY(script > QChar::Script_Common || prop->script <= QChar::Script_Common)) - continue; + if (Q_UNLIKELY(FLAG(prop->category) & test)) + continue; - script = QChar::Script(prop->script); - } + Q_ASSERT(script > QChar::Script_Common); + Q_ASSERT(sor < eor); + ::memset(scripts + sor, script, (eor - sor) * sizeof(uchar)); + sor = eor; -#if 0 // ### Disabled due to regressions. The font selection algorithm is not prepared for this change. 
- if (Q_LIKELY(script != QChar::Script_Common)) { - // override preceding Common-s - while (sor > 0 && scripts[sor - 1] == QChar::Script_Common) - --sor; - } else { - // see if we are inheriting preceding run - if (sor > 0) - script = scripts[sor - 1]; - } -#endif - - while (sor < eor) - scripts[sor++] = script; - - script = prop->script; + script = nscript; } - eor = length; -#if 0 // ### Disabled due to regressions. The font selection algorithm is not prepared for this change. - if (Q_LIKELY(script != QChar::Script_Common)) { - // override preceding Common-s - while (sor > 0 && scripts[sor - 1] == QChar::Script_Common) - --sor; - } else { - // see if we are inheriting preceding run - if (sor > 0) - script = scripts[sor - 1]; - } -#endif - - while (sor < eor) - scripts[sor++] = script; + Q_ASSERT(script >= QChar::Script_Common); + Q_ASSERT(eor == length); + ::memset(scripts + sor, script, (eor - sor) * sizeof(uchar)); } } // namespace QUnicodeTools diff --git a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp index 18012fd347..e9e243f7ed 100644 --- a/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp +++ b/tests/auto/gui/text/qtextscriptengine/tst_qtextscriptengine.cpp @@ -1225,29 +1225,21 @@ void tst_QTextScriptEngine::thaiWithZWJ() QTextLayout layout(s, font); QTextEngine *e = layout.engine(); e->itemize(); - QCOMPARE(e->layoutData->items.size(), 11); + QCOMPARE(e->layoutData->items.size(), 3); for (int item = 0; item < e->layoutData->items.size(); ++item) e->shape(item); - QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(7)); // Thai: The ZWJ and ZWNJ characters are inherited, so should be part of the thai script - QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1)); // Common: The smart quotes cannot be handled by thai, so should be a separate item - QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(1)); // Thai: Thai character - 
QCOMPARE(e->layoutData->items[3].num_glyphs, ushort(1)); // Common: Ellipsis - QCOMPARE(e->layoutData->items[4].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[5].num_glyphs, ushort(1)); // Common: Smart quote - QCOMPARE(e->layoutData->items[6].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[7].num_glyphs, ushort(1)); // Common: \xA0 = non-breaking space. Could be useful to have in thai, but not currently implemented - QCOMPARE(e->layoutData->items[8].num_glyphs, ushort(1)); // Thai: Thai character - QCOMPARE(e->layoutData->items[9].num_glyphs, ushort(1)); // Japanese: Kanji for tree - QCOMPARE(e->layoutData->items[10].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type + QCOMPARE(e->layoutData->items[0].num_glyphs, ushort(15)); // Thai, Inherited and Common + QCOMPARE(e->layoutData->items[1].num_glyphs, ushort(1)); // Japanese: Kanji for tree + QCOMPARE(e->layoutData->items[2].num_glyphs, ushort(2)); // Thai: Thai character followed by superscript "a" which is of inherited type //A quick sanity check - check all the characters are individual clusters unsigned short *logClusters = e->layoutData->logClustersPtr; - for (int i = 0; i < 7; i++) + for (int i = 0; i < 15; i++) QCOMPARE(logClusters[i], ushort(i)); - for (int i = 0; i < 10; i++) - QCOMPARE(logClusters[i+7], ushort(0)); + for (int i = 0; i < 3; i++) + QCOMPARE(logClusters[i+15], ushort(0)); // A thai implementation could either remove the ZWJ and ZWNJ characters, or hide them. // The current implementation hides them, so we test for that.