Update the Unicode Text Breaking Algorithm implementation

to make it conformant to the Unicode 6.1 specifications #14 and #29. The most important changes are: * The implementation has been reworked from scratch to fix all known bugs; * Separate-out the grapheme and the line breaking implementation to eliminate an overhead due to calculating unnecessary breaks; * Stop using deprecated SG class in favor of resolving pairs of surrogates; * A proper support for SMP code points; * Support for extended grapheme clusters (a drop-in replacement for the legacy grapheme clusters as of Unicode 5.1); * The hardcoded tailoring of UBA has been eliminated which breaks the 7 years-old lineBreaking test. Some later, we'll investigate if such a tailoring is still needed. Change-Id: I9f5867b3cec753b4fc120bc5a7e20f9a73d89370 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
2012-06-08 01:30:04 +03:00 · 2012-06-08 01:30:04 +03:00 · d64cb5f707
commit d64cb5f707
parent f32a7f1e21
3 changed files with 360 additions and 312 deletions
--- a/src/corelib/tools/qtextboundaryfinder.cpp
+++ b/src/corelib/tools/qtextboundaryfinder.cpp
@ -116,8 +116,9 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
    \reentrant
    QTextBoundaryFinder allows to find Unicode text boundaries in a
-    string, similar to the Unicode text boundary specification (see
+    string, accordingly to the Unicode text boundary specification (see
-    http://www.unicode.org/reports/tr29/tr29-11.html).
+    \l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
    \l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
    QTextBoundaryFinder can operate on a QString in four possible
    modes depending on the value of \a BoundaryType.
@ -127,14 +128,18 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
    Grapheme clusters. The two unicode characters 'A' + diaeresis do
    for example form one grapheme cluster as the user thinks of them
    as one character, yet it is in this case represented by two
-    unicode code points.
+    unicode code points
    (see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
    Word boundaries are there to locate the start and end of what a
-    language considers to be a word.
+    language considers to be a word
    (see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}).
    Line break boundaries give possible places where a line break
    might happen and sentence boundaries will show the beginning and
-    end of whole sentences.
+    end of whole sentences
    (see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
    \l{http://www.unicode.org/reports/tr14/}).
    The first position in a string is always a valid boundary and
    refers to the position before the first character. The last
--- a/src/corelib/tools/qunicodetools.cpp
+++ b/src/corelib/tools/qunicodetools.cpp
@ -43,6 +43,8 @@
 #include "qunicodetables_p.h"
 #define FLAG(x) (1 << (x))
 QT_BEGIN_NAMESPACE
 Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
@ -51,330 +53,374 @@ namespace QUnicodeTools {
 // -----------------------------------------------------------------------------------------------------
 //
-// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-19.html
+// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-19.html
 //
 // -----------------------------------------------------------------------------------------------------
 //
 // The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-11.html
 //
 // -----------------------------------------------------------------------------------------------------
-/* The Unicode algorithm does in our opinion allow line breaks at some
+namespace GB {
   places they shouldn't be allowed. The following changes were thus
   made in comparison to the Unicode reference:
-   EX->AL from DB to IB
+static const uchar breakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
-   SY->AL from DB to IB
+//    Other   CR     LF  Control Extend Prepend S-Mark   L     V      T      LV    LVT
-   SY->PO from DB to IB
+    { true , true , true , true , false, true , false, true , true , true , true , true  }, // Other
-   SY->PR from DB to IB
+    { true , true , false, true , true , true , true , true , true , true , true , true  }, // CR
-   SY->OP from DB to IB
+    { true , true , true , true , true , true , true , true , true , true , true , true  }, // LF
-   AL->PR from DB to IB
+    { true , true , true , true , true , true , true , true , true , true , true , true  }, // Control
-   AL->PO from DB to IB
+    { true , true , true , true , false, true , false, true , true , true , true , true  }, // Extend
-   PR->PR from DB to IB
+    { false, true , true , true , false, false, false, false, false, false, false, false }, // Prepend
-   PO->PO from DB to IB
+    { true , true , true , true , false, true , false, true , true , true , true , true  }, // SpacingMark
-   PR->PO from DB to IB
+    { true , true , true , true , false, true , false, false, false, true , false, false }, // L
-   PO->PR from DB to IB
+    { true , true , true , true , false, true , false, true , false, false, true , true  }, // V
-   HY->PO from DB to IB
+    { true , true , true , true , false, true , false, true , true , false, true , true  }, // T
-   HY->PR from DB to IB
+    { true , true , true , true , false, true , false, true , false, false, true , true  }, // LV
-   HY->OP from DB to IB
+    { true , true , true , true , false, true , false, true , true , false, true , true  }, // LVT
   NU->EX from PB to IB
   EX->PO from DB to IB
 */
 // The following line break classes are not treated by the table:
 //  AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
 enum LineBreakRule {
    ProhibitedBreak,            // PB in table
    DirectBreak,                // DB in table
    IndirectBreak,              // IB in table
    CombiningIndirectBreak,     // CI in table
    CombiningProhibitedBreak    // CP in table
 };
 #define DB DirectBreak
 #define IB IndirectBreak
 #define CI CombiningIndirectBreak
 #define CP CombiningProhibitedBreak
 #define PB ProhibitedBreak
 static const uchar lineBreakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
 /*         OP  CL  QU  GL  NS  EX  SY  IS  PR  PO  NU  AL  ID  IN  HY  BA  BB  B2  ZW  CM  WJ  H2  H3  JL  JV  JT */
 /* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
 /* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
 /* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
 /* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
 /* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
 /* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
 /* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
 };
 #undef DB
 #undef IB
 #undef CI
 #undef CP
 #undef PB
 static const uchar graphemeBreakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
 //    Other, CR,    LF,  Control, Extend, L,     V,     T,     LV,    LVT
    { true , true , true , true , true , true , true , true , true , true  }, // Other,
    { true , true , true , true , true , true , true , true , true , true  }, // CR,
    { true , false, true , true , true , true , true , true , true , true  }, // LF,
    { true , true , true , true , true , true , true , true , true , true  }, // Control,
    { false, true , true , true , false, false, false, false, false, false }, // Extend,
    { true , true , true , true , true , false, true , true , true , true  }, // L,
    { true , true , true , true , true , false, false, true , false, true  }, // V,
    { true , true , true , true , true , true , false, false, false, false }, // T,
    { true , true , true , true , true , false, true , true , true , true  }, // LV,
    { true , true , true , true , true , false, true , true , true , true  }, // LVT
 };
-static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
+} // namespace GB
 static void getGraphemeBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
-    // ##### can this fail if the first char is a surrogate?
+    QUnicodeTables::GraphemeBreak lcls = QUnicodeTables::GraphemeBreakLF; // to meet GB1
-    const QUnicodeTables::Properties *prop = QUnicodeTables::properties(string[0]);
+    for (quint32 i = 0; i != len; ++i) {
-    QUnicodeTables::GraphemeBreak grapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
+        quint32 pos = i;
    QUnicodeTables::LineBreakClass cls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
    // handle case where input starts with an LF
    if (cls == QUnicodeTables::LineBreak_LF)
        cls = QUnicodeTables::LineBreak_BK;
    attributes[0].charStop = true;
    int lcls = cls;
    for (quint32 i = 1; i < len; ++i) {
        attributes[i].charStop = true;
        uint ucs4 = string[i];
-        prop = QUnicodeTables::properties(ucs4);
+        if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
-        QUnicodeTables::GraphemeBreak ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
+            ushort low = string[i + 1];
-        QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
+            if (QChar::isLowSurrogate(low)) {
-        attributes[i].charStop = graphemeBreakTable[ngrapheme][grapheme];
+                ucs4 = QChar::surrogateToUcs4(ucs4, low);
-        // handle surrogates
+                ++i;
        if (ncls == QUnicodeTables::LineBreak_SG) {
            if (QChar::isHighSurrogate(string[i]) && i < len - 1 && QChar::isLowSurrogate(string[i+1])) {
                continue;
            } else if (QChar::isLowSurrogate(string[i]) && QChar::isHighSurrogate(string[i-1])) {
                ucs4 = QChar::surrogateToUcs4(string[i-1], string[i]);
                prop = QUnicodeTables::properties(ucs4);
                ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
                ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
                attributes[i].charStop = false;
            } else {
                ncls = QUnicodeTables::LineBreak_AL;
            }
        }
        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
        QUnicodeTables::GraphemeBreak cls = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
        attributes[pos].charStop = GB::breakTable[lcls][cls];
        lcls = cls;
    }
 }
 namespace WB {
 enum Action {
    NoBreak = 0,
    Break = 1,
    Lookup = 2
 };
 static const uchar breakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
 //    Other      CR       LF    Newline   Format  Katakana ALetter MidNumLet MidLetter MidNum  Numeric  ExtendNumLet
    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Other
    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // CR
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // LF
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Newline
    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Format
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, Break  , Break  , Break  , Break  , Break  , NoBreak }, // Katakana
    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, Lookup , Lookup , Break  , NoBreak, NoBreak }, // ALetter
    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNumLet
    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
    { Break  , Break  , Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
    { Break  , Break  , Break  , Break  , NoBreak, Break  , NoBreak, Lookup , Break  , Lookup , NoBreak, NoBreak }, // Numeric
    { Break  , Break  , Break  , Break  , NoBreak, NoBreak, NoBreak, Break  , Break  , Break  , NoBreak, NoBreak }, // ExtendNumLet
 };
 } // namespace WB
 static void getWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    QUnicodeTables::WordBreak cls = QUnicodeTables::WordBreakLF; // to meet WB1
    for (quint32 i = 0; i != len; ++i) {
        quint32 pos = i;
        uint ucs4 = string[i];
        if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
            ushort low = string[i + 1];
            if (QChar::isLowSurrogate(low)) {
                ucs4 = QChar::surrogateToUcs4(ucs4, low);
                ++i;
            }
        }
        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
        QUnicodeTables::WordBreak ncls = (QUnicodeTables::WordBreak) prop->wordBreak;
        uchar action = WB::breakTable[cls][ncls];
        if (ncls == QUnicodeTables::WordBreakFormat) {
            // WB4: X(Extend|Format)* -> X
            if (action != WB::Break)
                continue;
        } else {
            if (action == WB::Lookup) {
                action = WB::Break;
                for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
                    ucs4 = string[lookahead];
                    if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
                        ushort low = string[lookahead + 1];
                        if (QChar::isLowSurrogate(low)) {
                            ucs4 = QChar::surrogateToUcs4(ucs4, low);
                            ++lookahead;
                        }
                    }
                    prop = QUnicodeTables::properties(ucs4);
                    QUnicodeTables::WordBreak tcls = (QUnicodeTables::WordBreak) prop->wordBreak;
                    if (tcls == QUnicodeTables::WordBreakFormat)
                        continue;
                    if (tcls == cls) {
                        i = lookahead;
                        ncls = tcls;
                        action = WB::NoBreak;
                    }
                    break;
                }
            }
        }
        cls = ncls;
        if (action == WB::Break)
            attributes[pos].wordBoundary = true;
    }
 }
 namespace SB {
 enum State {
    Initial,
    Upper,
    UpATerm,
    ATerm,
    ATermC,
    ACS,
    STerm,
    STermC,
    SCS,
    BAfterC,
    BAfter,
    Break,
    Lookup
 };
 static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreakClose + 1] = {
 //     Other     CR       LF      Sep     Format     Sp      Lower   Upper    OLetter  Numeric  ATerm   SContinue STerm     Close
    { Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper  , Initial, Initial, ATerm  , Initial, STerm  , Initial }, // Initial
    { Initial, BAfterC, BAfter , BAfter , Upper  , Initial, Initial, Upper  , Initial, Initial, UpATerm, STerm  , STerm  , Initial }, // Upper
    { Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS    , Initial, Upper  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // UpATerm
    { Lookup , BAfterC, BAfter , BAfter , ATerm  , ACS    , Initial, Break  , Break  , Initial, ATerm  , STerm  , STerm  , ATermC  }, // ATerm
    { Lookup , BAfterC, BAfter , BAfter , ATermC , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , ATermC  }, // ATermC
    { Lookup , BAfterC, BAfter , BAfter , ACS    , ACS    , Initial, Break  , Break  , Lookup , ATerm  , STerm  , STerm  , Lookup  }, // ACS
    { Break  , BAfterC, BAfter , BAfter , STerm  , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STerm,
    { Break  , BAfterC, BAfter , BAfter , STermC , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , STermC  }, // STermC
    { Break  , BAfterC, BAfter , BAfter , SCS    , SCS    , Break  , Break  , Break  , Break  , ATerm  , STerm  , STerm  , Break   }, // SCS
    { Break  , Break  , BAfter , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfterC
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // BAfter
 };
 } // namespace SB
 static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    uchar state = SB::BAfter; // to meet SB1
    for (quint32 i = 0; i != len; ++i) {
        quint32 pos = i;
        uint ucs4 = string[i];
        if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
            ushort low = string[i + 1];
            if (QChar::isLowSurrogate(low)) {
                ucs4 = QChar::surrogateToUcs4(ucs4, low);
                ++i;
            }
        }
        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
        QUnicodeTables::SentenceBreak ncls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
        Q_ASSERT(state <= SB::BAfter);
        state = SB::breakTable[state][ncls];
        if (state == SB::Lookup) { // SB8
            state = SB::Break;
            for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
                ucs4 = string[lookahead];
                if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
                    ushort low = string[lookahead + 1];
                    if (QChar::isLowSurrogate(low)) {
                        ucs4 = QChar::surrogateToUcs4(ucs4, low);
                        ++lookahead;
                    }
                }
                prop = QUnicodeTables::properties(ucs4);
                QUnicodeTables::SentenceBreak tcls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
                switch (tcls) {
                case QUnicodeTables::SentenceBreakOther:
                case QUnicodeTables::SentenceBreakFormat:
                case QUnicodeTables::SentenceBreakSp:
                case QUnicodeTables::SentenceBreakNumeric:
                case QUnicodeTables::SentenceBreakSContinue:
                case QUnicodeTables::SentenceBreakClose:
                    continue;
                case QUnicodeTables::SentenceBreakLower:
                    i = lookahead;
                    state = SB::Initial;
                    break;
                default:
                    break;
                }
                break;
            }
        }
        if (state == SB::Break) {
            attributes[pos].sentenceBoundary = true;
            state = SB::breakTable[SB::Initial][ncls];
        }
    }
 }
 // -----------------------------------------------------------------------------------------------------
 //
 // The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-28.html
 //
 // -----------------------------------------------------------------------------------------------------
 namespace LB {
 // The following line break classes are not treated by the pair table
 // and must be resolved outside:
 //  AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
 enum Action {
    ProhibitedBreak, PB = ProhibitedBreak,
    DirectBreak, DB = DirectBreak,
    IndirectBreak, IB = IndirectBreak,
    CombiningIndirectBreak, CI = CombiningIndirectBreak,
    CombiningProhibitedBreak, CP = CombiningProhibitedBreak
 };
 static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
 /*         OP  CL  CP  QU  GL  NS  EX  SY  IS  PR  PO  NU  AL  HL  ID  IN  HY  BA  BB  B2  ZW  CM  WJ  H2  H3  JL  JV  JT */
 /* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
 /* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
 /* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
 /* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
 /* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
 /* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
 /* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
 /* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
 /* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
 };
 } // namespace LB
 static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    uint lucs4 = 0;
    QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
    QUnicodeTables::LineBreakClass cls = lcls;
    for (quint32 i = 0; i != len; ++i) {
        quint32 pos = i;
        uint ucs4 = string[i];
        if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
            ushort low = string[i + 1];
            if (QChar::isLowSurrogate(low)) {
                ucs4 = QChar::surrogateToUcs4(ucs4, low);
                ++i;
            }
        }
        const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
        QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
        if (ncls == QUnicodeTables::LineBreak_SA) {
            // LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
            static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
            if (FLAG(prop->category) & test)
                ncls = QUnicodeTables::LineBreak_CM;
        }
        if (ncls == QUnicodeTables::LineBreak_CM) {
            // LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
            if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
                ncls = QUnicodeTables::LineBreak_AL;
        }
        HB_LineBreakType lineBreakType = HB_NoBreak;
-        if (cls >= QUnicodeTables::LineBreak_CR) {
+        if (lcls >= QUnicodeTables::LineBreak_CR) {
-            if (cls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
+            // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
            if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
                lineBreakType = HB_ForcedBreak;
            goto next;
        }
-        if (ncls == QUnicodeTables::LineBreak_SP)
+        if (ncls >= QUnicodeTables::LineBreak_SP) {
-            goto next_no_cls_update;
+            if (ncls > QUnicodeTables::LineBreak_SP)
-        if (ncls >= QUnicodeTables::LineBreak_CR)
+                goto next; // LB6: x(BK|CR|LF|NL)
-            goto next;
+            goto next_no_cls_update; // LB7: xSP
        {
            int tcls = ncls;
            // for south east asian chars that require a complex (dictionary analysis), the unicode
            // standard recommends to treat them as AL. thai_attributes and other attribute methods that
            // do dictionary analysis can override
            if (tcls >= QUnicodeTables::LineBreak_SA)
                tcls = QUnicodeTables::LineBreak_AL;
            if (cls >= QUnicodeTables::LineBreak_SA)
                cls = QUnicodeTables::LineBreak_AL;
            int brk = lineBreakTable[cls][tcls];
            switch (brk) {
            case DirectBreak:
                lineBreakType = HB_Break;
                if (string[i-1] == 0xad) // soft hyphen
                    lineBreakType = HB_SoftHyphen;
                break;
            case IndirectBreak:
                lineBreakType = (lcls == QUnicodeTables::LineBreak_SP) ? HB_Break : HB_NoBreak;
                break;
            case CombiningIndirectBreak:
                lineBreakType = HB_NoBreak;
                if (lcls == QUnicodeTables::LineBreak_SP){
                    if (i > 1)
                        attributes[i-2].lineBreakType = HB_Break;
                } else {
                    goto next_no_cls_update;
                }
                break;
            case CombiningProhibitedBreak:
                lineBreakType = HB_NoBreak;
                if (lcls != QUnicodeTables::LineBreak_SP)
                    goto next_no_cls_update;
            case ProhibitedBreak:
            default:
                break;
            }
        }
        // for South East Asian chars that require a complex analysis, the Unicode
        // standard recommends to treat them as AL. tailoring that do dictionary analysis can override
        if (cls >= QUnicodeTables::LineBreak_SA)
            cls = QUnicodeTables::LineBreak_AL;
        switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
        case LB::DirectBreak:
            lineBreakType = HB_Break;
            if (lucs4 == 0x00ad) // soft hyphen
                lineBreakType = HB_SoftHyphen;
            break;
        case LB::IndirectBreak:
            if (lcls == QUnicodeTables::LineBreak_SP)
                lineBreakType = HB_Break;
            break;
        case LB::CombiningIndirectBreak:
            if (lcls != QUnicodeTables::LineBreak_SP)
                goto next_no_cls_update;
            lineBreakType = HB_Break;
            break;
        case LB::CombiningProhibitedBreak:
            if (lcls != QUnicodeTables::LineBreak_SP)
                goto next_no_cls_update;
            break;
        case LB::ProhibitedBreak:
            // nothing to do
        default:
            break;
        }
    next:
        cls = ncls;
        lucs4 = ucs4;
    next_no_cls_update:
        lcls = ncls;
-        grapheme = ngrapheme;
+        if (lineBreakType != HB_NoBreak)
-        attributes[i-1].lineBreakType = lineBreakType;
+            attributes[pos].lineBreakType = lineBreakType;
    }
    for (quint32 i = len - 1; i > 0; --i)
        attributes[i].lineBreakType = attributes[i - 1].lineBreakType;
    attributes[0].lineBreakType = HB_NoBreak; // LB2
 }
 enum WordBreakRule { NoBreak = 0, Break = 1, Middle = 2 };
 static const uchar wordBreakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
 //    Other    Format   Katakana ALetter  MidLetter MidNum  Numeric  ExtendNumLet
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Other
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // Format
    { Break  , Break  , NoBreak, Break  , Break  , Break  , Break  , NoBreak }, // Katakana
    { Break  , Break  , Break  , NoBreak, Middle , Break  , NoBreak, NoBreak }, // ALetter
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidLetter
    { Break  , Break  , Break  , Break  , Break  , Break  , Break  , Break   }, // MidNum
    { Break  , Break  , Break  , NoBreak, Break  , Middle , NoBreak, NoBreak }, // Numeric
    { Break  , Break  , NoBreak, NoBreak, Break  , Break  , NoBreak, NoBreak }, // ExtendNumLet
 };
 static void calcWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    quint32 brk = QUnicodeTables::wordBreakClass(string[0]);
    attributes[0].wordBoundary = true;
    for (quint32 i = 1; i < len; ++i) {
        if (!attributes[i].charStop) {
            attributes[i].wordBoundary = false;
            continue;
        }
        quint32 nbrk = QUnicodeTables::wordBreakClass(string[i]);
        if (nbrk == QUnicodeTables::WordBreakFormat) {
            attributes[i].wordBoundary = (QUnicodeTables::sentenceBreakClass(string[i-1]) == QUnicodeTables::SentenceBreakSep);
            continue;
        }
        WordBreakRule rule = (WordBreakRule)wordBreakTable[brk][nbrk];
        if (rule == Middle) {
            rule = Break;
            quint32 lookahead = i + 1;
            while (lookahead < len) {
                quint32 testbrk = QUnicodeTables::wordBreakClass(string[lookahead]);
                if (testbrk == QUnicodeTables::WordBreakFormat
                    && QUnicodeTables::sentenceBreakClass(string[lookahead]) != QUnicodeTables::SentenceBreakSep) {
                    ++lookahead;
                    continue;
                }
                if (testbrk == brk) {
                    rule = NoBreak;
                    while (i < lookahead)
                        attributes[i++].wordBoundary = false;
                    nbrk = testbrk;
                }
                break;
            }
        }
        attributes[i].wordBoundary = (rule == Break);
        brk = nbrk;
    }
 }
 enum SentenceBreakState {
    SB_Initial,
    SB_Upper,
    SB_UpATerm,
    SB_ATerm,
    SB_ATermC,
    SB_ACS,
    SB_STerm,
    SB_STermC,
    SB_SCS,
    SB_BAfter,
    SB_Break,
    SB_Lookup
 };
 static const uchar sentenceBreakTable[SB_Lookup + 1][QUnicodeTables::SentenceBreakClose + 1] = {
 //      Other       Sep         Format      Sp          Lower       Upper       OLetter     Numeric     ATerm       STerm       Close
    { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_ATerm  , SB_STerm  , SB_Initial }, // SB_Initial,
    { SB_Initial, SB_BAfter , SB_Upper  , SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm  , SB_Initial }, // SB_Upper
    { SB_Lookup , SB_BAfter , SB_UpATerm, SB_ACS    , SB_Initial, SB_Upper  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_UpATerm
    { SB_Lookup , SB_BAfter , SB_ATerm  , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATerm
    { SB_Lookup , SB_BAfter , SB_ATermC , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Lookup , SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATermC,
    { SB_Lookup , SB_BAfter , SB_ACS    , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Lookup , SB_ATerm  , SB_STerm  , SB_Lookup  }, // SB_ACS,
    { SB_Break  , SB_BAfter , SB_STerm  , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STerm,
    { SB_Break  , SB_BAfter , SB_STermC , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STermC,
    { SB_Break  , SB_BAfter , SB_SCS    , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_Break   }, // SB_SCS,
    { SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break   }, // SB_BAfter,
 };
 static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    quint32 brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[0])];
    attributes[0].sentenceBoundary = true;
    for (quint32 i = 1; i < len; ++i) {
        if (!attributes[i].charStop) {
            attributes[i].sentenceBoundary = false;
            continue;
        }
        brk = sentenceBreakTable[brk][QUnicodeTables::sentenceBreakClass(string[i])];
        if (brk == SB_Lookup) {
            brk = SB_Break;
            quint32 lookahead = i + 1;
            while (lookahead < len) {
                quint32 sbrk = QUnicodeTables::sentenceBreakClass(string[lookahead]);
                if (sbrk != QUnicodeTables::SentenceBreakOther
                    && sbrk != QUnicodeTables::SentenceBreakNumeric
                    && sbrk != QUnicodeTables::SentenceBreakClose) {
                    break;
                } else if (sbrk == QUnicodeTables::SentenceBreakLower) {
                    brk = SB_Initial;
                    break;
                }
                ++lookahead;
            }
            if (brk == SB_Initial) {
                while (i < lookahead)
                    attributes[i++].sentenceBoundary = false;
            }
        }
        if (brk == SB_Break) {
            attributes[i].sentenceBoundary = true;
            brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[i])];
        } else {
            attributes[i].sentenceBoundary = false;
        }
    }
 }
 static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes)
 {
    for (quint32 i = 0; i != len; ++i) {
@ -400,18 +446,17 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
    if (length <= 0)
        return;
-    if (!(options & DontClearAttributes)) {
+    if (!(options & DontClearAttributes))
        ::memset(attributes, 0, length * sizeof(HB_CharAttributes));
        if (options & (WordBreaks | SentenceBreaks))
            options |= GraphemeBreaks;
    }
-    if (options & (GraphemeBreaks | LineBreaks))
+    if (options & GraphemeBreaks)
-        calcGraphemeAndLineBreaks(string, length, attributes);
+        getGraphemeBreaks(string, length, attributes);
    if (options & WordBreaks)
-        calcWordBreaks(string, length, attributes);
+        getWordBreaks(string, length, attributes);
    if (options & SentenceBreaks)
-        calcSentenceBreaks(string, length, attributes);
+        getSentenceBreaks(string, length, attributes);
    if (options & LineBreaks)
        getLineBreaks(string, length, attributes);
    if (options & WhiteSpaces)
        getWhiteSpaces(string, length, attributes);
--- a/tests/auto/gui/text/qtextlayout/tst_qtextlayout.cpp
+++ b/tests/auto/gui/text/qtextlayout/tst_qtextlayout.cpp
@ -209,7 +209,7 @@ void tst_QTextLayout::cleanup()
 void tst_QTextLayout::lineBreaking()
 {
-#if defined(Q_WS_X11)
+#if 0
    struct Breaks {
 	const char *utf8;
 	uchar breaks[32];
@ -286,8 +286,6 @@ void tst_QTextLayout::lineBreaking()
        QCOMPARE(b->breaks[i], (uchar)0xff);
        ++b;
    }
 #else
    QSKIP("This test can not be run on non-X11 platforms");
 #endif
 }