Update the Unicode Text Breaking Algorithm implementation

to make it conformant to the Unicode 6.1 specifications #14 and #29.

The most important changes are:
* The implementation has been reworked from scratch to fix all known bugs;
* Separate-out the grapheme and the line breaking implementation to eliminate
  an overhead due to calculating unnecessary breaks;
* Stop using deprecated SG class in favor of resolving pairs of surrogates;
* A proper support for SMP code points;
* Support for extended grapheme clusters (a drop-in replacement for the legacy
  grapheme clusters as of Unicode 5.1);
* The hardcoded tailoring of UBA has been eliminated which breaks the 7 years-old
  lineBreaking test. Some later, we'll investigate if such a tailoring is still needed.

Change-Id: I9f5867b3cec753b4fc120bc5a7e20f9a73d89370
Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
This commit is contained in:
Konstantin Ritt 2012-06-08 01:30:04 +03:00 committed by Qt by Nokia
parent f32a7f1e21
commit d64cb5f707
3 changed files with 360 additions and 312 deletions

View File

@ -116,8 +116,9 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
\reentrant \reentrant
QTextBoundaryFinder allows to find Unicode text boundaries in a QTextBoundaryFinder allows to find Unicode text boundaries in a
string, similar to the Unicode text boundary specification (see string, accordingly to the Unicode text boundary specification (see
http://www.unicode.org/reports/tr29/tr29-11.html). \l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
\l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
QTextBoundaryFinder can operate on a QString in four possible QTextBoundaryFinder can operate on a QString in four possible
modes depending on the value of \a BoundaryType. modes depending on the value of \a BoundaryType.
@ -127,14 +128,18 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
Grapheme clusters. The two unicode characters 'A' + diaeresis do Grapheme clusters. The two unicode characters 'A' + diaeresis do
for example form one grapheme cluster as the user thinks of them for example form one grapheme cluster as the user thinks of them
as one character, yet it is in this case represented by two as one character, yet it is in this case represented by two
unicode code points. unicode code points
(see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
Word boundaries are there to locate the start and end of what a Word boundaries are there to locate the start and end of what a
language considers to be a word. language considers to be a word
(see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}).
Line break boundaries give possible places where a line break Line break boundaries give possible places where a line break
might happen and sentence boundaries will show the beginning and might happen and sentence boundaries will show the beginning and
end of whole sentences. end of whole sentences
(see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
\l{http://www.unicode.org/reports/tr14/}).
The first position in a string is always a valid boundary and The first position in a string is always a valid boundary and
refers to the position before the first character. The last refers to the position before the first character. The last

View File

@ -43,6 +43,8 @@
#include "qunicodetables_p.h" #include "qunicodetables_p.h"
#define FLAG(x) (1 << (x))
QT_BEGIN_NAMESPACE QT_BEGIN_NAMESPACE
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0; Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
@ -51,330 +53,374 @@ namespace QUnicodeTools {
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
// //
// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-19.html // The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-19.html
//
// -----------------------------------------------------------------------------------------------------
//
// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-11.html
// //
// ----------------------------------------------------------------------------------------------------- // -----------------------------------------------------------------------------------------------------
/* The Unicode algorithm does in our opinion allow line breaks at some namespace GB {
places they shouldn't be allowed. The following changes were thus
made in comparison to the Unicode reference:
EX->AL from DB to IB static const uchar breakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
SY->AL from DB to IB // Other CR LF Control Extend Prepend S-Mark L V T LV LVT
SY->PO from DB to IB { true , true , true , true , false, true , false, true , true , true , true , true }, // Other
SY->PR from DB to IB { true , true , false, true , true , true , true , true , true , true , true , true }, // CR
SY->OP from DB to IB { true , true , true , true , true , true , true , true , true , true , true , true }, // LF
AL->PR from DB to IB { true , true , true , true , true , true , true , true , true , true , true , true }, // Control
AL->PO from DB to IB { true , true , true , true , false, true , false, true , true , true , true , true }, // Extend
PR->PR from DB to IB { false, true , true , true , false, false, false, false, false, false, false, false }, // Prepend
PO->PO from DB to IB { true , true , true , true , false, true , false, true , true , true , true , true }, // SpacingMark
PR->PO from DB to IB { true , true , true , true , false, true , false, false, false, true , false, false }, // L
PO->PR from DB to IB { true , true , true , true , false, true , false, true , false, false, true , true }, // V
HY->PO from DB to IB { true , true , true , true , false, true , false, true , true , false, true , true }, // T
HY->PR from DB to IB { true , true , true , true , false, true , false, true , false, false, true , true }, // LV
HY->OP from DB to IB { true , true , true , true , false, true , false, true , true , false, true , true }, // LVT
NU->EX from PB to IB
EX->PO from DB to IB
*/
// The following line break classes are not treated by the table:
// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
enum LineBreakRule {
ProhibitedBreak, // PB in table
DirectBreak, // DB in table
IndirectBreak, // IB in table
CombiningIndirectBreak, // CI in table
CombiningProhibitedBreak // CP in table
};
#define DB DirectBreak
#define IB IndirectBreak
#define CI CombiningIndirectBreak
#define CP CombiningProhibitedBreak
#define PB ProhibitedBreak
static const uchar lineBreakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
};
#undef DB
#undef IB
#undef CI
#undef CP
#undef PB
static const uchar graphemeBreakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
// Other, CR, LF, Control, Extend, L, V, T, LV, LVT
{ true , true , true , true , true , true , true , true , true , true }, // Other,
{ true , true , true , true , true , true , true , true , true , true }, // CR,
{ true , false, true , true , true , true , true , true , true , true }, // LF,
{ true , true , true , true , true , true , true , true , true , true }, // Control,
{ false, true , true , true , false, false, false, false, false, false }, // Extend,
{ true , true , true , true , true , false, true , true , true , true }, // L,
{ true , true , true , true , true , false, false, true , false, true }, // V,
{ true , true , true , true , true , true , false, false, false, false }, // T,
{ true , true , true , true , true , false, true , true , true , true }, // LV,
{ true , true , true , true , true , false, true , true , true , true }, // LVT
}; };
static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) } // namespace GB
static void getGraphemeBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{ {
// ##### can this fail if the first char is a surrogate? QUnicodeTables::GraphemeBreak lcls = QUnicodeTables::GraphemeBreakLF; // to meet GB1
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(string[0]); for (quint32 i = 0; i != len; ++i) {
QUnicodeTables::GraphemeBreak grapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; quint32 pos = i;
QUnicodeTables::LineBreakClass cls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
// handle case where input starts with an LF
if (cls == QUnicodeTables::LineBreak_LF)
cls = QUnicodeTables::LineBreak_BK;
attributes[0].charStop = true;
int lcls = cls;
for (quint32 i = 1; i < len; ++i) {
attributes[i].charStop = true;
uint ucs4 = string[i]; uint ucs4 = string[i];
prop = QUnicodeTables::properties(ucs4); if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
QUnicodeTables::GraphemeBreak ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak; ushort low = string[i + 1];
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class; if (QChar::isLowSurrogate(low)) {
attributes[i].charStop = graphemeBreakTable[ngrapheme][grapheme]; ucs4 = QChar::surrogateToUcs4(ucs4, low);
// handle surrogates ++i;
if (ncls == QUnicodeTables::LineBreak_SG) {
if (QChar::isHighSurrogate(string[i]) && i < len - 1 && QChar::isLowSurrogate(string[i+1])) {
continue;
} else if (QChar::isLowSurrogate(string[i]) && QChar::isHighSurrogate(string[i-1])) {
ucs4 = QChar::surrogateToUcs4(string[i-1], string[i]);
prop = QUnicodeTables::properties(ucs4);
ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
attributes[i].charStop = false;
} else {
ncls = QUnicodeTables::LineBreak_AL;
} }
} }
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::GraphemeBreak cls = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
attributes[pos].charStop = GB::breakTable[lcls][cls];
lcls = cls;
}
}
namespace WB {
enum Action {
NoBreak = 0,
Break = 1,
Lookup = 2
};
static const uchar breakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
// Other CR LF Newline Format Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Other
{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Format
{ Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidLetter
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNum
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
};
} // namespace WB
static void getWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
QUnicodeTables::WordBreak cls = QUnicodeTables::WordBreakLF; // to meet WB1
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
uint ucs4 = string[i];
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
ushort low = string[i + 1];
if (QChar::isLowSurrogate(low)) {
ucs4 = QChar::surrogateToUcs4(ucs4, low);
++i;
}
}
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreak ncls = (QUnicodeTables::WordBreak) prop->wordBreak;
uchar action = WB::breakTable[cls][ncls];
if (ncls == QUnicodeTables::WordBreakFormat) {
// WB4: X(Extend|Format)* -> X
if (action != WB::Break)
continue;
} else {
if (action == WB::Lookup) {
action = WB::Break;
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
ucs4 = string[lookahead];
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
ushort low = string[lookahead + 1];
if (QChar::isLowSurrogate(low)) {
ucs4 = QChar::surrogateToUcs4(ucs4, low);
++lookahead;
}
}
prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::WordBreak tcls = (QUnicodeTables::WordBreak) prop->wordBreak;
if (tcls == QUnicodeTables::WordBreakFormat)
continue;
if (tcls == cls) {
i = lookahead;
ncls = tcls;
action = WB::NoBreak;
}
break;
}
}
}
cls = ncls;
if (action == WB::Break)
attributes[pos].wordBoundary = true;
}
}
namespace SB {
enum State {
Initial,
Upper,
UpATerm,
ATerm,
ATermC,
ACS,
STerm,
STermC,
SCS,
BAfterC,
BAfter,
Break,
Lookup
};
static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreakClose + 1] = {
// Other CR LF Sep Format Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, UpATerm, STerm , STerm , Initial }, // Upper
{ Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // UpATerm
{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
};
} // namespace SB
static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
uchar state = SB::BAfter; // to meet SB1
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
uint ucs4 = string[i];
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
ushort low = string[i + 1];
if (QChar::isLowSurrogate(low)) {
ucs4 = QChar::surrogateToUcs4(ucs4, low);
++i;
}
}
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::SentenceBreak ncls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
Q_ASSERT(state <= SB::BAfter);
state = SB::breakTable[state][ncls];
if (state == SB::Lookup) { // SB8
state = SB::Break;
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
ucs4 = string[lookahead];
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
ushort low = string[lookahead + 1];
if (QChar::isLowSurrogate(low)) {
ucs4 = QChar::surrogateToUcs4(ucs4, low);
++lookahead;
}
}
prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::SentenceBreak tcls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
switch (tcls) {
case QUnicodeTables::SentenceBreakOther:
case QUnicodeTables::SentenceBreakFormat:
case QUnicodeTables::SentenceBreakSp:
case QUnicodeTables::SentenceBreakNumeric:
case QUnicodeTables::SentenceBreakSContinue:
case QUnicodeTables::SentenceBreakClose:
continue;
case QUnicodeTables::SentenceBreakLower:
i = lookahead;
state = SB::Initial;
break;
default:
break;
}
break;
}
}
if (state == SB::Break) {
attributes[pos].sentenceBoundary = true;
state = SB::breakTable[SB::Initial][ncls];
}
}
}
// -----------------------------------------------------------------------------------------------------
//
// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-28.html
//
// -----------------------------------------------------------------------------------------------------
namespace LB {
// The following line break classes are not treated by the pair table
// and must be resolved outside:
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
enum Action {
ProhibitedBreak, PB = ProhibitedBreak,
DirectBreak, DB = DirectBreak,
IndirectBreak, IB = IndirectBreak,
CombiningIndirectBreak, CI = CombiningIndirectBreak,
CombiningProhibitedBreak, CP = CombiningProhibitedBreak
};
static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
/* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
};
} // namespace LB
static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
uint lucs4 = 0;
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
QUnicodeTables::LineBreakClass cls = lcls;
for (quint32 i = 0; i != len; ++i) {
quint32 pos = i;
uint ucs4 = string[i];
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
ushort low = string[i + 1];
if (QChar::isLowSurrogate(low)) {
ucs4 = QChar::surrogateToUcs4(ucs4, low);
++i;
}
}
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
if (ncls == QUnicodeTables::LineBreak_SA) {
// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
if (FLAG(prop->category) & test)
ncls = QUnicodeTables::LineBreak_CM;
}
if (ncls == QUnicodeTables::LineBreak_CM) {
// LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
ncls = QUnicodeTables::LineBreak_AL;
}
HB_LineBreakType lineBreakType = HB_NoBreak; HB_LineBreakType lineBreakType = HB_NoBreak;
if (cls >= QUnicodeTables::LineBreak_CR) { if (lcls >= QUnicodeTables::LineBreak_CR) {
if (cls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF) // LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
lineBreakType = HB_ForcedBreak; lineBreakType = HB_ForcedBreak;
goto next; goto next;
} }
if (ncls == QUnicodeTables::LineBreak_SP) if (ncls >= QUnicodeTables::LineBreak_SP) {
goto next_no_cls_update; if (ncls > QUnicodeTables::LineBreak_SP)
if (ncls >= QUnicodeTables::LineBreak_CR) goto next; // LB6: x(BK|CR|LF|NL)
goto next; goto next_no_cls_update; // LB7: xSP
{
int tcls = ncls;
// for south east asian chars that require a complex (dictionary analysis), the unicode
// standard recommends to treat them as AL. thai_attributes and other attribute methods that
// do dictionary analysis can override
if (tcls >= QUnicodeTables::LineBreak_SA)
tcls = QUnicodeTables::LineBreak_AL;
if (cls >= QUnicodeTables::LineBreak_SA)
cls = QUnicodeTables::LineBreak_AL;
int brk = lineBreakTable[cls][tcls];
switch (brk) {
case DirectBreak:
lineBreakType = HB_Break;
if (string[i-1] == 0xad) // soft hyphen
lineBreakType = HB_SoftHyphen;
break;
case IndirectBreak:
lineBreakType = (lcls == QUnicodeTables::LineBreak_SP) ? HB_Break : HB_NoBreak;
break;
case CombiningIndirectBreak:
lineBreakType = HB_NoBreak;
if (lcls == QUnicodeTables::LineBreak_SP){
if (i > 1)
attributes[i-2].lineBreakType = HB_Break;
} else {
goto next_no_cls_update;
}
break;
case CombiningProhibitedBreak:
lineBreakType = HB_NoBreak;
if (lcls != QUnicodeTables::LineBreak_SP)
goto next_no_cls_update;
case ProhibitedBreak:
default:
break;
}
} }
// for South East Asian chars that require a complex analysis, the Unicode
// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
if (cls >= QUnicodeTables::LineBreak_SA)
cls = QUnicodeTables::LineBreak_AL;
switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
case LB::DirectBreak:
lineBreakType = HB_Break;
if (lucs4 == 0x00ad) // soft hyphen
lineBreakType = HB_SoftHyphen;
break;
case LB::IndirectBreak:
if (lcls == QUnicodeTables::LineBreak_SP)
lineBreakType = HB_Break;
break;
case LB::CombiningIndirectBreak:
if (lcls != QUnicodeTables::LineBreak_SP)
goto next_no_cls_update;
lineBreakType = HB_Break;
break;
case LB::CombiningProhibitedBreak:
if (lcls != QUnicodeTables::LineBreak_SP)
goto next_no_cls_update;
break;
case LB::ProhibitedBreak:
// nothing to do
default:
break;
}
next: next:
cls = ncls; cls = ncls;
lucs4 = ucs4;
next_no_cls_update: next_no_cls_update:
lcls = ncls; lcls = ncls;
grapheme = ngrapheme; if (lineBreakType != HB_NoBreak)
attributes[i-1].lineBreakType = lineBreakType; attributes[pos].lineBreakType = lineBreakType;
} }
for (quint32 i = len - 1; i > 0; --i)
attributes[i].lineBreakType = attributes[i - 1].lineBreakType;
attributes[0].lineBreakType = HB_NoBreak; // LB2 attributes[0].lineBreakType = HB_NoBreak; // LB2
} }
enum WordBreakRule { NoBreak = 0, Break = 1, Middle = 2 };
static const uchar wordBreakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet
{ Break , Break , Break , Break , Break , Break , Break , Break }, // Other
{ Break , Break , Break , Break , Break , Break , Break , Break }, // Format
{ Break , Break , NoBreak, Break , Break , Break , Break , NoBreak }, // Katakana
{ Break , Break , Break , NoBreak, Middle , Break , NoBreak, NoBreak }, // ALetter
{ Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
{ Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
{ Break , Break , Break , NoBreak, Break , Middle , NoBreak, NoBreak }, // Numeric
{ Break , Break , NoBreak, NoBreak, Break , Break , NoBreak, NoBreak }, // ExtendNumLet
};
static void calcWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
quint32 brk = QUnicodeTables::wordBreakClass(string[0]);
attributes[0].wordBoundary = true;
for (quint32 i = 1; i < len; ++i) {
if (!attributes[i].charStop) {
attributes[i].wordBoundary = false;
continue;
}
quint32 nbrk = QUnicodeTables::wordBreakClass(string[i]);
if (nbrk == QUnicodeTables::WordBreakFormat) {
attributes[i].wordBoundary = (QUnicodeTables::sentenceBreakClass(string[i-1]) == QUnicodeTables::SentenceBreakSep);
continue;
}
WordBreakRule rule = (WordBreakRule)wordBreakTable[brk][nbrk];
if (rule == Middle) {
rule = Break;
quint32 lookahead = i + 1;
while (lookahead < len) {
quint32 testbrk = QUnicodeTables::wordBreakClass(string[lookahead]);
if (testbrk == QUnicodeTables::WordBreakFormat
&& QUnicodeTables::sentenceBreakClass(string[lookahead]) != QUnicodeTables::SentenceBreakSep) {
++lookahead;
continue;
}
if (testbrk == brk) {
rule = NoBreak;
while (i < lookahead)
attributes[i++].wordBoundary = false;
nbrk = testbrk;
}
break;
}
}
attributes[i].wordBoundary = (rule == Break);
brk = nbrk;
}
}
enum SentenceBreakState {
SB_Initial,
SB_Upper,
SB_UpATerm,
SB_ATerm,
SB_ATermC,
SB_ACS,
SB_STerm,
SB_STermC,
SB_SCS,
SB_BAfter,
SB_Break,
SB_Lookup
};
static const uchar sentenceBreakTable[SB_Lookup + 1][QUnicodeTables::SentenceBreakClose + 1] = {
// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close
{ SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial,
{ SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper
{ SB_Lookup , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm
{ SB_Lookup , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm
{ SB_Lookup , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC,
{ SB_Lookup , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_Lookup }, // SB_ACS,
{ SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm,
{ SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC,
{ SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS,
{ SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter,
};
static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{
quint32 brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[0])];
attributes[0].sentenceBoundary = true;
for (quint32 i = 1; i < len; ++i) {
if (!attributes[i].charStop) {
attributes[i].sentenceBoundary = false;
continue;
}
brk = sentenceBreakTable[brk][QUnicodeTables::sentenceBreakClass(string[i])];
if (brk == SB_Lookup) {
brk = SB_Break;
quint32 lookahead = i + 1;
while (lookahead < len) {
quint32 sbrk = QUnicodeTables::sentenceBreakClass(string[lookahead]);
if (sbrk != QUnicodeTables::SentenceBreakOther
&& sbrk != QUnicodeTables::SentenceBreakNumeric
&& sbrk != QUnicodeTables::SentenceBreakClose) {
break;
} else if (sbrk == QUnicodeTables::SentenceBreakLower) {
brk = SB_Initial;
break;
}
++lookahead;
}
if (brk == SB_Initial) {
while (i < lookahead)
attributes[i++].sentenceBoundary = false;
}
}
if (brk == SB_Break) {
attributes[i].sentenceBoundary = true;
brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[i])];
} else {
attributes[i].sentenceBoundary = false;
}
}
}
static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes) static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{ {
for (quint32 i = 0; i != len; ++i) { for (quint32 i = 0; i != len; ++i) {
@ -400,18 +446,17 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
if (length <= 0) if (length <= 0)
return; return;
if (!(options & DontClearAttributes)) { if (!(options & DontClearAttributes))
::memset(attributes, 0, length * sizeof(HB_CharAttributes)); ::memset(attributes, 0, length * sizeof(HB_CharAttributes));
if (options & (WordBreaks | SentenceBreaks))
options |= GraphemeBreaks;
}
if (options & (GraphemeBreaks | LineBreaks)) if (options & GraphemeBreaks)
calcGraphemeAndLineBreaks(string, length, attributes); getGraphemeBreaks(string, length, attributes);
if (options & WordBreaks) if (options & WordBreaks)
calcWordBreaks(string, length, attributes); getWordBreaks(string, length, attributes);
if (options & SentenceBreaks) if (options & SentenceBreaks)
calcSentenceBreaks(string, length, attributes); getSentenceBreaks(string, length, attributes);
if (options & LineBreaks)
getLineBreaks(string, length, attributes);
if (options & WhiteSpaces) if (options & WhiteSpaces)
getWhiteSpaces(string, length, attributes); getWhiteSpaces(string, length, attributes);

View File

@ -209,7 +209,7 @@ void tst_QTextLayout::cleanup()
void tst_QTextLayout::lineBreaking() void tst_QTextLayout::lineBreaking()
{ {
#if defined(Q_WS_X11) #if 0
struct Breaks { struct Breaks {
const char *utf8; const char *utf8;
uchar breaks[32]; uchar breaks[32];
@ -286,8 +286,6 @@ void tst_QTextLayout::lineBreaking()
QCOMPARE(b->breaks[i], (uchar)0xff); QCOMPARE(b->breaks[i], (uchar)0xff);
++b; ++b;
} }
#else
QSKIP("This test can not be run on non-X11 platforms");
#endif #endif
} }