Update the Unicode Text Breaking Algorithm implementation
to make it conformant to the Unicode 6.1 specifications #14 and #29. The most important changes are: * The implementation has been reworked from scratch to fix all known bugs; * The grapheme and line breaking implementations have been separated to eliminate the overhead of calculating unnecessary breaks; * The deprecated SG class is no longer used; pairs of surrogates are resolved instead; * Proper support for SMP code points; * Support for extended grapheme clusters (a drop-in replacement for the legacy grapheme clusters as of Unicode 5.1); * The hardcoded tailoring of UBA has been eliminated, which breaks the 7-year-old lineBreaking test. Later, we'll investigate whether such tailoring is still needed. Change-Id: I9f5867b3cec753b4fc120bc5a7e20f9a73d89370 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
This commit is contained in:
parent
f32a7f1e21
commit
d64cb5f707
@ -116,8 +116,9 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
|
|||||||
\reentrant
|
\reentrant
|
||||||
|
|
||||||
QTextBoundaryFinder allows to find Unicode text boundaries in a
|
QTextBoundaryFinder allows to find Unicode text boundaries in a
|
||||||
string, similar to the Unicode text boundary specification (see
|
string, according to the Unicode text boundary specification (see
|
||||||
http://www.unicode.org/reports/tr29/tr29-11.html).
|
\l{http://www.unicode.org/reports/tr14/}{Unicode Standard Annex #14} and
|
||||||
|
\l{http://www.unicode.org/reports/tr29/}{Unicode Standard Annex #29}).
|
||||||
|
|
||||||
QTextBoundaryFinder can operate on a QString in four possible
|
QTextBoundaryFinder can operate on a QString in four possible
|
||||||
modes depending on the value of \a BoundaryType.
|
modes depending on the value of \a BoundaryType.
|
||||||
@ -127,14 +128,18 @@ static void init(QTextBoundaryFinder::BoundaryType type, const QChar *chars, int
|
|||||||
Grapheme clusters. The two unicode characters 'A' + diaeresis do
|
Grapheme clusters. The two unicode characters 'A' + diaeresis do
|
||||||
for example form one grapheme cluster as the user thinks of them
|
for example form one grapheme cluster as the user thinks of them
|
||||||
as one character, yet it is in this case represented by two
|
as one character, yet it is in this case represented by two
|
||||||
unicode code points.
|
unicode code points
|
||||||
|
(see \l{http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries}).
|
||||||
|
|
||||||
Word boundaries are there to locate the start and end of what a
|
Word boundaries are there to locate the start and end of what a
|
||||||
language considers to be a word.
|
language considers to be a word
|
||||||
|
(see \l{http://www.unicode.org/reports/tr29/#Word_Boundaries}).
|
||||||
|
|
||||||
Line break boundaries give possible places where a line break
|
Line break boundaries give possible places where a line break
|
||||||
might happen and sentence boundaries will show the beginning and
|
might happen and sentence boundaries will show the beginning and
|
||||||
end of whole sentences.
|
end of whole sentences
|
||||||
|
(see \l{http://www.unicode.org/reports/tr29/#Sentence_Boundaries} and
|
||||||
|
\l{http://www.unicode.org/reports/tr14/}).
|
||||||
|
|
||||||
The first position in a string is always a valid boundary and
|
The first position in a string is always a valid boundary and
|
||||||
refers to the position before the first character. The last
|
refers to the position before the first character. The last
|
||||||
|
@ -43,6 +43,8 @@
|
|||||||
|
|
||||||
#include "qunicodetables_p.h"
|
#include "qunicodetables_p.h"
|
||||||
|
|
||||||
|
#define FLAG(x) (1 << (x))
|
||||||
|
|
||||||
QT_BEGIN_NAMESPACE
|
QT_BEGIN_NAMESPACE
|
||||||
|
|
||||||
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
|
Q_AUTOTEST_EXPORT int qt_initcharattributes_default_algorithm_only = 0;
|
||||||
@ -51,330 +53,374 @@ namespace QUnicodeTools {
|
|||||||
|
|
||||||
// -----------------------------------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-19.html
|
// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-19.html
|
||||||
//
|
|
||||||
// -----------------------------------------------------------------------------------------------------
|
|
||||||
//
|
|
||||||
// The text boundaries determination algorithm. See http://www.unicode.org/reports/tr29/tr29-11.html
|
|
||||||
//
|
//
|
||||||
// -----------------------------------------------------------------------------------------------------
|
// -----------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
/* The Unicode algorithm does in our opinion allow line breaks at some
|
namespace GB {
|
||||||
places they shouldn't be allowed. The following changes were thus
|
|
||||||
made in comparison to the Unicode reference:
|
|
||||||
|
|
||||||
EX->AL from DB to IB
|
static const uchar breakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
|
||||||
SY->AL from DB to IB
|
// Other CR LF Control Extend Prepend S-Mark L V T LV LVT
|
||||||
SY->PO from DB to IB
|
{ true , true , true , true , false, true , false, true , true , true , true , true }, // Other
|
||||||
SY->PR from DB to IB
|
{ true , true , false, true , true , true , true , true , true , true , true , true }, // CR
|
||||||
SY->OP from DB to IB
|
{ true , true , true , true , true , true , true , true , true , true , true , true }, // LF
|
||||||
AL->PR from DB to IB
|
{ true , true , true , true , true , true , true , true , true , true , true , true }, // Control
|
||||||
AL->PO from DB to IB
|
{ true , true , true , true , false, true , false, true , true , true , true , true }, // Extend
|
||||||
PR->PR from DB to IB
|
{ false, true , true , true , false, false, false, false, false, false, false, false }, // Prepend
|
||||||
PO->PO from DB to IB
|
{ true , true , true , true , false, true , false, true , true , true , true , true }, // SpacingMark
|
||||||
PR->PO from DB to IB
|
{ true , true , true , true , false, true , false, false, false, true , false, false }, // L
|
||||||
PO->PR from DB to IB
|
{ true , true , true , true , false, true , false, true , false, false, true , true }, // V
|
||||||
HY->PO from DB to IB
|
{ true , true , true , true , false, true , false, true , true , false, true , true }, // T
|
||||||
HY->PR from DB to IB
|
{ true , true , true , true , false, true , false, true , false, false, true , true }, // LV
|
||||||
HY->OP from DB to IB
|
{ true , true , true , true , false, true , false, true , true , false, true , true }, // LVT
|
||||||
NU->EX from PB to IB
|
|
||||||
EX->PO from DB to IB
|
|
||||||
*/
|
|
||||||
|
|
||||||
// The following line break classes are not treated by the table:
|
|
||||||
// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
|
|
||||||
|
|
||||||
enum LineBreakRule {
|
|
||||||
ProhibitedBreak, // PB in table
|
|
||||||
DirectBreak, // DB in table
|
|
||||||
IndirectBreak, // IB in table
|
|
||||||
CombiningIndirectBreak, // CI in table
|
|
||||||
CombiningProhibitedBreak // CP in table
|
|
||||||
};
|
|
||||||
#define DB DirectBreak
|
|
||||||
#define IB IndirectBreak
|
|
||||||
#define CI CombiningIndirectBreak
|
|
||||||
#define CP CombiningProhibitedBreak
|
|
||||||
#define PB ProhibitedBreak
|
|
||||||
static const uchar lineBreakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
|
|
||||||
/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
|
|
||||||
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
|
|
||||||
/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
|
||||||
/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
|
||||||
/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
|
|
||||||
/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
|
||||||
/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
|
|
||||||
/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
|
||||||
/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
|
||||||
/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
|
|
||||||
/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
|
|
||||||
/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
|
|
||||||
/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
|
|
||||||
/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
|
|
||||||
};
|
|
||||||
#undef DB
|
|
||||||
#undef IB
|
|
||||||
#undef CI
|
|
||||||
#undef CP
|
|
||||||
#undef PB
|
|
||||||
|
|
||||||
static const uchar graphemeBreakTable[QUnicodeTables::GraphemeBreakLVT + 1][QUnicodeTables::GraphemeBreakLVT + 1] = {
|
|
||||||
// Other, CR, LF, Control, Extend, L, V, T, LV, LVT
|
|
||||||
{ true , true , true , true , true , true , true , true , true , true }, // Other,
|
|
||||||
{ true , true , true , true , true , true , true , true , true , true }, // CR,
|
|
||||||
{ true , false, true , true , true , true , true , true , true , true }, // LF,
|
|
||||||
{ true , true , true , true , true , true , true , true , true , true }, // Control,
|
|
||||||
{ false, true , true , true , false, false, false, false, false, false }, // Extend,
|
|
||||||
{ true , true , true , true , true , false, true , true , true , true }, // L,
|
|
||||||
{ true , true , true , true , true , false, false, true , false, true }, // V,
|
|
||||||
{ true , true , true , true , true , true , false, false, false, false }, // T,
|
|
||||||
{ true , true , true , true , true , false, true , true , true , true }, // LV,
|
|
||||||
{ true , true , true , true , true , false, true , true , true , true }, // LVT
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
} // namespace GB
|
||||||
|
|
||||||
|
static void getGraphemeBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
||||||
{
|
{
|
||||||
// ##### can this fail if the first char is a surrogate?
|
QUnicodeTables::GraphemeBreak lcls = QUnicodeTables::GraphemeBreakLF; // to meet GB1
|
||||||
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(string[0]);
|
for (quint32 i = 0; i != len; ++i) {
|
||||||
QUnicodeTables::GraphemeBreak grapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
|
quint32 pos = i;
|
||||||
QUnicodeTables::LineBreakClass cls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
|
|
||||||
// handle case where input starts with an LF
|
|
||||||
if (cls == QUnicodeTables::LineBreak_LF)
|
|
||||||
cls = QUnicodeTables::LineBreak_BK;
|
|
||||||
|
|
||||||
attributes[0].charStop = true;
|
|
||||||
|
|
||||||
int lcls = cls;
|
|
||||||
for (quint32 i = 1; i < len; ++i) {
|
|
||||||
attributes[i].charStop = true;
|
|
||||||
|
|
||||||
uint ucs4 = string[i];
|
uint ucs4 = string[i];
|
||||||
prop = QUnicodeTables::properties(ucs4);
|
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
|
||||||
QUnicodeTables::GraphemeBreak ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
|
ushort low = string[i + 1];
|
||||||
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
|
if (QChar::isLowSurrogate(low)) {
|
||||||
attributes[i].charStop = graphemeBreakTable[ngrapheme][grapheme];
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
// handle surrogates
|
++i;
|
||||||
if (ncls == QUnicodeTables::LineBreak_SG) {
|
|
||||||
if (QChar::isHighSurrogate(string[i]) && i < len - 1 && QChar::isLowSurrogate(string[i+1])) {
|
|
||||||
continue;
|
|
||||||
} else if (QChar::isLowSurrogate(string[i]) && QChar::isHighSurrogate(string[i-1])) {
|
|
||||||
ucs4 = QChar::surrogateToUcs4(string[i-1], string[i]);
|
|
||||||
prop = QUnicodeTables::properties(ucs4);
|
|
||||||
ngrapheme = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
|
|
||||||
ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
|
|
||||||
attributes[i].charStop = false;
|
|
||||||
} else {
|
|
||||||
ncls = QUnicodeTables::LineBreak_AL;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::GraphemeBreak cls = (QUnicodeTables::GraphemeBreak) prop->graphemeBreak;
|
||||||
|
|
||||||
|
attributes[pos].charStop = GB::breakTable[lcls][cls];
|
||||||
|
|
||||||
|
lcls = cls;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
namespace WB {
|
||||||
|
|
||||||
|
enum Action {
|
||||||
|
NoBreak = 0,
|
||||||
|
Break = 1,
|
||||||
|
Lookup = 2
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uchar breakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
|
||||||
|
// Other CR LF Newline Format Katakana ALetter MidNumLet MidLetter MidNum Numeric ExtendNumLet
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Other
|
||||||
|
{ Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break , Break , Break }, // CR
|
||||||
|
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // LF
|
||||||
|
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // Newline
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // Format
|
||||||
|
{ Break , Break , Break , Break , NoBreak, NoBreak, Break , Break , Break , Break , Break , NoBreak }, // Katakana
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Lookup , Break , NoBreak, NoBreak }, // ALetter
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNumLet
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidLetter
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , Break , Break , Break , Break , Break , Break }, // MidNum
|
||||||
|
{ Break , Break , Break , Break , NoBreak, Break , NoBreak, Lookup , Break , Lookup , NoBreak, NoBreak }, // Numeric
|
||||||
|
{ Break , Break , Break , Break , NoBreak, NoBreak, NoBreak, Break , Break , Break , NoBreak, NoBreak }, // ExtendNumLet
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace WB
|
||||||
|
|
||||||
|
static void getWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
||||||
|
{
|
||||||
|
QUnicodeTables::WordBreak cls = QUnicodeTables::WordBreakLF; // to meet WB1
|
||||||
|
for (quint32 i = 0; i != len; ++i) {
|
||||||
|
quint32 pos = i;
|
||||||
|
uint ucs4 = string[i];
|
||||||
|
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
|
||||||
|
ushort low = string[i + 1];
|
||||||
|
if (QChar::isLowSurrogate(low)) {
|
||||||
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::WordBreak ncls = (QUnicodeTables::WordBreak) prop->wordBreak;
|
||||||
|
|
||||||
|
uchar action = WB::breakTable[cls][ncls];
|
||||||
|
if (ncls == QUnicodeTables::WordBreakFormat) {
|
||||||
|
// WB4: X(Extend|Format)* -> X
|
||||||
|
if (action != WB::Break)
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
if (action == WB::Lookup) {
|
||||||
|
action = WB::Break;
|
||||||
|
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
|
||||||
|
ucs4 = string[lookahead];
|
||||||
|
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
|
||||||
|
ushort low = string[lookahead + 1];
|
||||||
|
if (QChar::isLowSurrogate(low)) {
|
||||||
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
|
++lookahead;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::WordBreak tcls = (QUnicodeTables::WordBreak) prop->wordBreak;
|
||||||
|
if (tcls == QUnicodeTables::WordBreakFormat)
|
||||||
|
continue;
|
||||||
|
if (tcls == cls) {
|
||||||
|
i = lookahead;
|
||||||
|
ncls = tcls;
|
||||||
|
action = WB::NoBreak;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cls = ncls;
|
||||||
|
if (action == WB::Break)
|
||||||
|
attributes[pos].wordBoundary = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
namespace SB {
|
||||||
|
|
||||||
|
enum State {
|
||||||
|
Initial,
|
||||||
|
Upper,
|
||||||
|
UpATerm,
|
||||||
|
ATerm,
|
||||||
|
ATermC,
|
||||||
|
ACS,
|
||||||
|
STerm,
|
||||||
|
STermC,
|
||||||
|
SCS,
|
||||||
|
BAfterC,
|
||||||
|
BAfter,
|
||||||
|
Break,
|
||||||
|
Lookup
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uchar breakTable[BAfter + 1][QUnicodeTables::SentenceBreakClose + 1] = {
|
||||||
|
// Other CR LF Sep Format Sp Lower Upper OLetter Numeric ATerm SContinue STerm Close
|
||||||
|
{ Initial, BAfterC, BAfter , BAfter , Initial, Initial, Initial, Upper , Initial, Initial, ATerm , Initial, STerm , Initial }, // Initial
|
||||||
|
{ Initial, BAfterC, BAfter , BAfter , Upper , Initial, Initial, Upper , Initial, Initial, UpATerm, STerm , STerm , Initial }, // Upper
|
||||||
|
|
||||||
|
{ Lookup , BAfterC, BAfter , BAfter , UpATerm, ACS , Initial, Upper , Break , Initial, ATerm , STerm , STerm , ATermC }, // UpATerm
|
||||||
|
{ Lookup , BAfterC, BAfter , BAfter , ATerm , ACS , Initial, Break , Break , Initial, ATerm , STerm , STerm , ATermC }, // ATerm
|
||||||
|
{ Lookup , BAfterC, BAfter , BAfter , ATermC , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , ATermC }, // ATermC
|
||||||
|
{ Lookup , BAfterC, BAfter , BAfter , ACS , ACS , Initial, Break , Break , Lookup , ATerm , STerm , STerm , Lookup }, // ACS
|
||||||
|
|
||||||
|
{ Break , BAfterC, BAfter , BAfter , STerm , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STerm,
|
||||||
|
{ Break , BAfterC, BAfter , BAfter , STermC , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , STermC }, // STermC
|
||||||
|
{ Break , BAfterC, BAfter , BAfter , SCS , SCS , Break , Break , Break , Break , ATerm , STerm , STerm , Break }, // SCS
|
||||||
|
{ Break , Break , BAfter , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfterC
|
||||||
|
{ Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break , Break }, // BAfter
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace SB
|
||||||
|
|
||||||
|
static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
||||||
|
{
|
||||||
|
uchar state = SB::BAfter; // to meet SB1
|
||||||
|
for (quint32 i = 0; i != len; ++i) {
|
||||||
|
quint32 pos = i;
|
||||||
|
uint ucs4 = string[i];
|
||||||
|
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
|
||||||
|
ushort low = string[i + 1];
|
||||||
|
if (QChar::isLowSurrogate(low)) {
|
||||||
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::SentenceBreak ncls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
|
||||||
|
|
||||||
|
Q_ASSERT(state <= SB::BAfter);
|
||||||
|
state = SB::breakTable[state][ncls];
|
||||||
|
if (state == SB::Lookup) { // SB8
|
||||||
|
state = SB::Break;
|
||||||
|
for (quint32 lookahead = i + 1; lookahead < len; ++lookahead) {
|
||||||
|
ucs4 = string[lookahead];
|
||||||
|
if (QChar::isHighSurrogate(ucs4) && lookahead + 1 != len) {
|
||||||
|
ushort low = string[lookahead + 1];
|
||||||
|
if (QChar::isLowSurrogate(low)) {
|
||||||
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
|
++lookahead;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::SentenceBreak tcls = (QUnicodeTables::SentenceBreak) prop->sentenceBreak;
|
||||||
|
switch (tcls) {
|
||||||
|
case QUnicodeTables::SentenceBreakOther:
|
||||||
|
case QUnicodeTables::SentenceBreakFormat:
|
||||||
|
case QUnicodeTables::SentenceBreakSp:
|
||||||
|
case QUnicodeTables::SentenceBreakNumeric:
|
||||||
|
case QUnicodeTables::SentenceBreakSContinue:
|
||||||
|
case QUnicodeTables::SentenceBreakClose:
|
||||||
|
continue;
|
||||||
|
case QUnicodeTables::SentenceBreakLower:
|
||||||
|
i = lookahead;
|
||||||
|
state = SB::Initial;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (state == SB::Break) {
|
||||||
|
attributes[pos].sentenceBoundary = true;
|
||||||
|
state = SB::breakTable[SB::Initial][ncls];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// The line breaking algorithm. See http://www.unicode.org/reports/tr14/tr14-28.html
|
||||||
|
//
|
||||||
|
// -----------------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
namespace LB {
|
||||||
|
|
||||||
|
// The following line break classes are not treated by the pair table
|
||||||
|
// and must be resolved outside:
|
||||||
|
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
|
||||||
|
|
||||||
|
enum Action {
|
||||||
|
ProhibitedBreak, PB = ProhibitedBreak,
|
||||||
|
DirectBreak, DB = DirectBreak,
|
||||||
|
IndirectBreak, IB = IndirectBreak,
|
||||||
|
CombiningIndirectBreak, CI = CombiningIndirectBreak,
|
||||||
|
CombiningProhibitedBreak, CP = CombiningProhibitedBreak
|
||||||
|
};
|
||||||
|
|
||||||
|
static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
|
||||||
|
/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
|
||||||
|
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
|
||||||
|
/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
||||||
|
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
||||||
|
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
|
||||||
|
/* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* ID */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* IN */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* HY */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* BA */ { DB, PB, PB, IB, DB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* BB */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
||||||
|
/* B2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
|
||||||
|
/* CM */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
|
||||||
|
/* WJ */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
|
||||||
|
/* H2 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
|
||||||
|
/* H3 */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
|
||||||
|
/* JL */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
|
||||||
|
/* JV */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
|
||||||
|
/* JT */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace LB
|
||||||
|
|
||||||
|
static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
||||||
|
{
|
||||||
|
uint lucs4 = 0;
|
||||||
|
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
|
||||||
|
QUnicodeTables::LineBreakClass cls = lcls;
|
||||||
|
for (quint32 i = 0; i != len; ++i) {
|
||||||
|
quint32 pos = i;
|
||||||
|
uint ucs4 = string[i];
|
||||||
|
if (QChar::isHighSurrogate(ucs4) && i + 1 != len) {
|
||||||
|
ushort low = string[i + 1];
|
||||||
|
if (QChar::isLowSurrogate(low)) {
|
||||||
|
ucs4 = QChar::surrogateToUcs4(ucs4, low);
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const QUnicodeTables::Properties *prop = QUnicodeTables::properties(ucs4);
|
||||||
|
QUnicodeTables::LineBreakClass ncls = (QUnicodeTables::LineBreakClass) prop->line_break_class;
|
||||||
|
|
||||||
|
if (ncls == QUnicodeTables::LineBreak_SA) {
|
||||||
|
// LB1: resolve SA to AL, except of those that have Category Mn or Mc be resolved to CM
|
||||||
|
static const int test = FLAG(QChar::Mark_NonSpacing) | FLAG(QChar::Mark_SpacingCombining);
|
||||||
|
if (FLAG(prop->category) & test)
|
||||||
|
ncls = QUnicodeTables::LineBreak_CM;
|
||||||
|
}
|
||||||
|
if (ncls == QUnicodeTables::LineBreak_CM) {
|
||||||
|
// LB10: treat CM that follows SP, BK, CR, LF, NL, or ZW as AL
|
||||||
|
if (lcls == QUnicodeTables::LineBreak_ZW || lcls >= QUnicodeTables::LineBreak_SP)
|
||||||
|
ncls = QUnicodeTables::LineBreak_AL;
|
||||||
|
}
|
||||||
|
|
||||||
HB_LineBreakType lineBreakType = HB_NoBreak;
|
HB_LineBreakType lineBreakType = HB_NoBreak;
|
||||||
|
|
||||||
if (cls >= QUnicodeTables::LineBreak_CR) {
|
if (lcls >= QUnicodeTables::LineBreak_CR) {
|
||||||
if (cls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
|
// LB4: BK!, LB5: (CRxLF|CR|LF|NL)!
|
||||||
|
if (lcls > QUnicodeTables::LineBreak_CR || ncls != QUnicodeTables::LineBreak_LF)
|
||||||
lineBreakType = HB_ForcedBreak;
|
lineBreakType = HB_ForcedBreak;
|
||||||
goto next;
|
goto next;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ncls == QUnicodeTables::LineBreak_SP)
|
if (ncls >= QUnicodeTables::LineBreak_SP) {
|
||||||
goto next_no_cls_update;
|
if (ncls > QUnicodeTables::LineBreak_SP)
|
||||||
if (ncls >= QUnicodeTables::LineBreak_CR)
|
goto next; // LB6: x(BK|CR|LF|NL)
|
||||||
goto next;
|
goto next_no_cls_update; // LB7: xSP
|
||||||
|
}
|
||||||
|
|
||||||
{
|
// for South East Asian chars that require a complex analysis, the Unicode
|
||||||
int tcls = ncls;
|
// standard recommends to treat them as AL. tailoring that do dictionary analysis can override
|
||||||
// for south east asian chars that require a complex (dictionary analysis), the unicode
|
|
||||||
// standard recommends to treat them as AL. thai_attributes and other attribute methods that
|
|
||||||
// do dictionary analysis can override
|
|
||||||
if (tcls >= QUnicodeTables::LineBreak_SA)
|
|
||||||
tcls = QUnicodeTables::LineBreak_AL;
|
|
||||||
if (cls >= QUnicodeTables::LineBreak_SA)
|
if (cls >= QUnicodeTables::LineBreak_SA)
|
||||||
cls = QUnicodeTables::LineBreak_AL;
|
cls = QUnicodeTables::LineBreak_AL;
|
||||||
|
|
||||||
int brk = lineBreakTable[cls][tcls];
|
switch (LB::breakTable[cls][ncls < QUnicodeTables::LineBreak_SA ? ncls : QUnicodeTables::LineBreak_AL]) {
|
||||||
switch (brk) {
|
case LB::DirectBreak:
|
||||||
case DirectBreak:
|
|
||||||
lineBreakType = HB_Break;
|
lineBreakType = HB_Break;
|
||||||
if (string[i-1] == 0xad) // soft hyphen
|
if (lucs4 == 0x00ad) // soft hyphen
|
||||||
lineBreakType = HB_SoftHyphen;
|
lineBreakType = HB_SoftHyphen;
|
||||||
break;
|
break;
|
||||||
case IndirectBreak:
|
case LB::IndirectBreak:
|
||||||
lineBreakType = (lcls == QUnicodeTables::LineBreak_SP) ? HB_Break : HB_NoBreak;
|
if (lcls == QUnicodeTables::LineBreak_SP)
|
||||||
|
lineBreakType = HB_Break;
|
||||||
break;
|
break;
|
||||||
case CombiningIndirectBreak:
|
case LB::CombiningIndirectBreak:
|
||||||
lineBreakType = HB_NoBreak;
|
|
||||||
if (lcls == QUnicodeTables::LineBreak_SP){
|
|
||||||
if (i > 1)
|
|
||||||
attributes[i-2].lineBreakType = HB_Break;
|
|
||||||
} else {
|
|
||||||
goto next_no_cls_update;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case CombiningProhibitedBreak:
|
|
||||||
lineBreakType = HB_NoBreak;
|
|
||||||
if (lcls != QUnicodeTables::LineBreak_SP)
|
if (lcls != QUnicodeTables::LineBreak_SP)
|
||||||
goto next_no_cls_update;
|
goto next_no_cls_update;
|
||||||
case ProhibitedBreak:
|
lineBreakType = HB_Break;
|
||||||
|
break;
|
||||||
|
case LB::CombiningProhibitedBreak:
|
||||||
|
if (lcls != QUnicodeTables::LineBreak_SP)
|
||||||
|
goto next_no_cls_update;
|
||||||
|
break;
|
||||||
|
case LB::ProhibitedBreak:
|
||||||
|
// nothing to do
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
next:
|
next:
|
||||||
cls = ncls;
|
cls = ncls;
|
||||||
|
lucs4 = ucs4;
|
||||||
next_no_cls_update:
|
next_no_cls_update:
|
||||||
lcls = ncls;
|
lcls = ncls;
|
||||||
grapheme = ngrapheme;
|
if (lineBreakType != HB_NoBreak)
|
||||||
attributes[i-1].lineBreakType = lineBreakType;
|
attributes[pos].lineBreakType = lineBreakType;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (quint32 i = len - 1; i > 0; --i)
|
|
||||||
attributes[i].lineBreakType = attributes[i - 1].lineBreakType;
|
|
||||||
attributes[0].lineBreakType = HB_NoBreak; // LB2
|
attributes[0].lineBreakType = HB_NoBreak; // LB2
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
enum WordBreakRule { NoBreak = 0, Break = 1, Middle = 2 };
|
|
||||||
|
|
||||||
static const uchar wordBreakTable[QUnicodeTables::WordBreakExtendNumLet + 1][QUnicodeTables::WordBreakExtendNumLet + 1] = {
|
|
||||||
// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet
|
|
||||||
{ Break , Break , Break , Break , Break , Break , Break , Break }, // Other
|
|
||||||
{ Break , Break , Break , Break , Break , Break , Break , Break }, // Format
|
|
||||||
{ Break , Break , NoBreak, Break , Break , Break , Break , NoBreak }, // Katakana
|
|
||||||
{ Break , Break , Break , NoBreak, Middle , Break , NoBreak, NoBreak }, // ALetter
|
|
||||||
{ Break , Break , Break , Break , Break , Break , Break , Break }, // MidLetter
|
|
||||||
{ Break , Break , Break , Break , Break , Break , Break , Break }, // MidNum
|
|
||||||
{ Break , Break , Break , NoBreak, Break , Middle , NoBreak, NoBreak }, // Numeric
|
|
||||||
{ Break , Break , NoBreak, NoBreak, Break , Break , NoBreak, NoBreak }, // ExtendNumLet
|
|
||||||
};
|
|
||||||
|
|
||||||
static void calcWordBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
|
||||||
{
|
|
||||||
quint32 brk = QUnicodeTables::wordBreakClass(string[0]);
|
|
||||||
|
|
||||||
attributes[0].wordBoundary = true;
|
|
||||||
|
|
||||||
for (quint32 i = 1; i < len; ++i) {
|
|
||||||
if (!attributes[i].charStop) {
|
|
||||||
attributes[i].wordBoundary = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
quint32 nbrk = QUnicodeTables::wordBreakClass(string[i]);
|
|
||||||
if (nbrk == QUnicodeTables::WordBreakFormat) {
|
|
||||||
attributes[i].wordBoundary = (QUnicodeTables::sentenceBreakClass(string[i-1]) == QUnicodeTables::SentenceBreakSep);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
WordBreakRule rule = (WordBreakRule)wordBreakTable[brk][nbrk];
|
|
||||||
if (rule == Middle) {
|
|
||||||
rule = Break;
|
|
||||||
quint32 lookahead = i + 1;
|
|
||||||
while (lookahead < len) {
|
|
||||||
quint32 testbrk = QUnicodeTables::wordBreakClass(string[lookahead]);
|
|
||||||
if (testbrk == QUnicodeTables::WordBreakFormat
|
|
||||||
&& QUnicodeTables::sentenceBreakClass(string[lookahead]) != QUnicodeTables::SentenceBreakSep) {
|
|
||||||
++lookahead;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (testbrk == brk) {
|
|
||||||
rule = NoBreak;
|
|
||||||
while (i < lookahead)
|
|
||||||
attributes[i++].wordBoundary = false;
|
|
||||||
nbrk = testbrk;
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
attributes[i].wordBoundary = (rule == Break);
|
|
||||||
brk = nbrk;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
enum SentenceBreakState {
|
|
||||||
SB_Initial,
|
|
||||||
SB_Upper,
|
|
||||||
SB_UpATerm,
|
|
||||||
SB_ATerm,
|
|
||||||
SB_ATermC,
|
|
||||||
SB_ACS,
|
|
||||||
SB_STerm,
|
|
||||||
SB_STermC,
|
|
||||||
SB_SCS,
|
|
||||||
SB_BAfter,
|
|
||||||
SB_Break,
|
|
||||||
SB_Lookup
|
|
||||||
};
|
|
||||||
|
|
||||||
static const uchar sentenceBreakTable[SB_Lookup + 1][QUnicodeTables::SentenceBreakClose + 1] = {
|
|
||||||
// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close
|
|
||||||
{ SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial,
|
|
||||||
{ SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper
|
|
||||||
|
|
||||||
{ SB_Lookup , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm
|
|
||||||
{ SB_Lookup , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm
|
|
||||||
{ SB_Lookup , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC,
|
|
||||||
{ SB_Lookup , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Lookup , SB_ATerm , SB_STerm , SB_Lookup }, // SB_ACS,
|
|
||||||
|
|
||||||
{ SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm,
|
|
||||||
{ SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC,
|
|
||||||
{ SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS,
|
|
||||||
{ SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter,
|
|
||||||
};
|
|
||||||
|
|
||||||
static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
|
||||||
{
|
|
||||||
quint32 brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[0])];
|
|
||||||
attributes[0].sentenceBoundary = true;
|
|
||||||
for (quint32 i = 1; i < len; ++i) {
|
|
||||||
if (!attributes[i].charStop) {
|
|
||||||
attributes[i].sentenceBoundary = false;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
brk = sentenceBreakTable[brk][QUnicodeTables::sentenceBreakClass(string[i])];
|
|
||||||
if (brk == SB_Lookup) {
|
|
||||||
brk = SB_Break;
|
|
||||||
quint32 lookahead = i + 1;
|
|
||||||
while (lookahead < len) {
|
|
||||||
quint32 sbrk = QUnicodeTables::sentenceBreakClass(string[lookahead]);
|
|
||||||
if (sbrk != QUnicodeTables::SentenceBreakOther
|
|
||||||
&& sbrk != QUnicodeTables::SentenceBreakNumeric
|
|
||||||
&& sbrk != QUnicodeTables::SentenceBreakClose) {
|
|
||||||
break;
|
|
||||||
} else if (sbrk == QUnicodeTables::SentenceBreakLower) {
|
|
||||||
brk = SB_Initial;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
++lookahead;
|
|
||||||
}
|
|
||||||
if (brk == SB_Initial) {
|
|
||||||
while (i < lookahead)
|
|
||||||
attributes[i++].sentenceBoundary = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (brk == SB_Break) {
|
|
||||||
attributes[i].sentenceBoundary = true;
|
|
||||||
brk = sentenceBreakTable[SB_Initial][QUnicodeTables::sentenceBreakClass(string[i])];
|
|
||||||
} else {
|
|
||||||
attributes[i].sentenceBoundary = false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes)
|
||||||
{
|
{
|
||||||
for (quint32 i = 0; i != len; ++i) {
|
for (quint32 i = 0; i != len; ++i) {
|
||||||
@ -400,18 +446,17 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length,
|
|||||||
if (length <= 0)
|
if (length <= 0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
if (!(options & DontClearAttributes)) {
|
if (!(options & DontClearAttributes))
|
||||||
::memset(attributes, 0, length * sizeof(HB_CharAttributes));
|
::memset(attributes, 0, length * sizeof(HB_CharAttributes));
|
||||||
if (options & (WordBreaks | SentenceBreaks))
|
|
||||||
options |= GraphemeBreaks;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options & (GraphemeBreaks | LineBreaks))
|
if (options & GraphemeBreaks)
|
||||||
calcGraphemeAndLineBreaks(string, length, attributes);
|
getGraphemeBreaks(string, length, attributes);
|
||||||
if (options & WordBreaks)
|
if (options & WordBreaks)
|
||||||
calcWordBreaks(string, length, attributes);
|
getWordBreaks(string, length, attributes);
|
||||||
if (options & SentenceBreaks)
|
if (options & SentenceBreaks)
|
||||||
calcSentenceBreaks(string, length, attributes);
|
getSentenceBreaks(string, length, attributes);
|
||||||
|
if (options & LineBreaks)
|
||||||
|
getLineBreaks(string, length, attributes);
|
||||||
if (options & WhiteSpaces)
|
if (options & WhiteSpaces)
|
||||||
getWhiteSpaces(string, length, attributes);
|
getWhiteSpaces(string, length, attributes);
|
||||||
|
|
||||||
|
@ -209,7 +209,7 @@ void tst_QTextLayout::cleanup()
|
|||||||
|
|
||||||
void tst_QTextLayout::lineBreaking()
|
void tst_QTextLayout::lineBreaking()
|
||||||
{
|
{
|
||||||
#if defined(Q_WS_X11)
|
#if 0
|
||||||
struct Breaks {
|
struct Breaks {
|
||||||
const char *utf8;
|
const char *utf8;
|
||||||
uchar breaks[32];
|
uchar breaks[32];
|
||||||
@ -286,8 +286,6 @@ void tst_QTextLayout::lineBreaking()
|
|||||||
QCOMPARE(b->breaks[i], (uchar)0xff);
|
QCOMPARE(b->breaks[i], (uchar)0xff);
|
||||||
++b;
|
++b;
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
QSKIP("This test can not be run on non-X11 platforms");
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user