Line Breaking Algorithm: don't break inside numeric expressions

Change-Id: I8362663454e4c6604ecb6289ae8009d47c78aeb1
Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
This commit is contained in:
Konstantin Ritt 2012-05-31 13:03:51 +03:00 committed by Qt by Nokia
parent 48d99e74fc
commit c8dd0de1a6
2 changed files with 107 additions and 31 deletions

View File

@ -286,6 +286,77 @@ static void getSentenceBreaks(const ushort *string, quint32 len, HB_CharAttribut
namespace LB { namespace LB {
namespace NS { // Number Sequence
// LB25 recommends to not break lines inside numbers of the form
// described by the following regular expression:
// (PR|PO)?(OP|HY)?NU(NU|SY|IS)*(CL|CP)?(PR|PO)?
enum Action {
None,
Start,
Continue,
Break
};
enum Class {
XX,
PRPO,
OPHY,
NU,
SYIS,
CLCP
};
static const uchar actionTable[CLCP + 1][CLCP + 1] = {
// XX PRPO OPHY NU SYIS CLCP
{ None , Start , Start , Start , None , None }, // XX
{ None , Start , Continue, Continue, None , None }, // PRPO
{ None , Start , Start , Continue, None , None }, // OPHY
{ Break , Break , Break , Continue, Continue, Continue }, // NU
{ Break , Break , Break , Continue, Continue, Continue }, // SYIS
{ Break , Continue, Break , Break , Break , Break }, // CLCP
};
inline Class toClass(QUnicodeTables::LineBreakClass lbc, QChar::Category category)
{
switch (lbc) {
case QUnicodeTables::LineBreak_AL:// case QUnicodeTables::LineBreak_AI:
// resolve AI math symbols in numerical context to IS
if (category == QChar::Symbol_Math)
return SYIS;
break;
case QUnicodeTables::LineBreak_PR: case QUnicodeTables::LineBreak_PO:
return PRPO;
case QUnicodeTables::LineBreak_OP: case QUnicodeTables::LineBreak_HY:
return OPHY;
case QUnicodeTables::LineBreak_NU:
return NU;
case QUnicodeTables::LineBreak_SY: case QUnicodeTables::LineBreak_IS:
return SYIS;
case QUnicodeTables::LineBreak_CL: case QUnicodeTables::LineBreak_CP:
return CLCP;
default:
break;
}
return XX;
}
} // namespace NS
/* In order to support the tailored implementation of LB25 properly
the following changes were made in the pair table to allow breaks
where the numeric expression doesn't match the template (i.e. [^NU](IS|SY)NU):
CL->PO from IB to DB
CP->PO from IB to DB
CL->PR from IB to DB
CP->PR from IB to DB
PO->OP from IB to DB
PR->OP from IB to DB
IS->NU from IB to DB
SY->NU from IB to DB
*/
// The following line break classes are not treated by the pair table // The following line break classes are not treated by the pair table
// and must be resolved outside: // and must be resolved outside:
// AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX // AI, BK, CB, CJ, CR, LF, NL, SA, SG, SP, XX
@ -301,16 +372,16 @@ enum Action {
static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = { static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::LineBreak_JT + 1] = {
/* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */ /* OP CL CP QU GL NS EX SY IS PR PO NU AL HL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */
/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB }, /* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
/* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* CL */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* CP */ { DB, PB, PB, IB, IB, PB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, /* QU */ { PB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, /* GL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
/* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* NS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* EX */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* SY */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* IS */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* PR */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB }, /* PR */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
/* PO */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* PO */ { DB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* NU */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* AL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
/* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, /* HL */ { IB, PB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
@ -334,6 +405,9 @@ static const uchar breakTable[QUnicodeTables::LineBreak_JT + 1][QUnicodeTables::
static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes) static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *attributes)
{ {
quint32 nestart = 0;
LB::NS::Class nelast = LB::NS::XX;
uint lucs4 = 0; uint lucs4 = 0;
QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10 QUnicodeTables::LineBreakClass lcls = QUnicodeTables::LineBreak_LF; // to meet LB10
QUnicodeTables::LineBreakClass cls = lcls; QUnicodeTables::LineBreakClass cls = lcls;
@ -363,6 +437,27 @@ static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *
ncls = QUnicodeTables::LineBreak_AL; ncls = QUnicodeTables::LineBreak_AL;
} }
if (ncls != QUnicodeTables::LineBreak_CM) {
// LB25: do not break lines inside numbers
LB::NS::Class necur = LB::NS::toClass(ncls, (QChar::Category)prop->category);
switch (LB::NS::actionTable[nelast][necur]) {
case LB::NS::Break:
// do not change breaks before and after the expression
for (quint32 j = nestart + 1; j < pos; ++j)
attributes[j].lineBreakType = HB_NoBreak;
// fall through
case LB::NS::None:
nelast = LB::NS::XX; // reset state
break;
case LB::NS::Start:
nestart = i;
// fall through
default:
nelast = necur;
break;
}
}
HB_LineBreakType lineBreakType = HB_NoBreak; HB_LineBreakType lineBreakType = HB_NoBreak;
if (lcls >= QUnicodeTables::LineBreak_CR) { if (lcls >= QUnicodeTables::LineBreak_CR) {
@ -417,6 +512,12 @@ static void getLineBreaks(const ushort *string, quint32 len, HB_CharAttributes *
attributes[pos].lineBreakType = lineBreakType; attributes[pos].lineBreakType = lineBreakType;
} }
if (LB::NS::actionTable[nelast][LB::NS::XX] == LB::NS::Break) {
// LB25: do not break lines inside numbers
for (quint32 j = nestart + 1; j < len; ++j)
attributes[j].lineBreakType = HB_NoBreak;
}
attributes[0].lineBreakType = HB_NoBreak; // LB2 attributes[0].lineBreakType = HB_NoBreak; // LB2
} }

View File

@ -288,31 +288,6 @@ void tst_QTextBoundaryFinder::lineBoundariesDefault()
if (testString.contains(QChar::ObjectReplacementCharacter)) if (testString.contains(QChar::ObjectReplacementCharacter))
QSKIP("QTextBoundaryFinder doesn't handle Object Replacement character"); QSKIP("QTextBoundaryFinder doesn't handle Object Replacement character");
{ // The Line Break tests use tailoring of numbers described in Example 7
// of Section 8.2 Examples of Customization
QChar s[][2] = {
{ 0x007D, 0x0025 }, // CL x PO
{ 0x007D, 0x0024 }, // CL x PR
{ 0x0029, 0x0025 }, // CP x PO
{ 0x0029, 0x0024 }, // CP x PR
{ 0x002C, 0x0030 }, // IS x NU
{ 0x002E, 0x0031 }, // IS x NU
{ 0x002E, 0x0032 }, // IS x NU
{ 0x002E, 0x0033 }, // IS x NU
{ 0x002E, 0x0034 }, // IS x NU
{ 0x0025, 0x0028 }, // PO x OP
{ 0x0024, 0x0028 }, // PR x OP
{ 0x005C, 0x0028 }, // PR x OP
{ 0x005C, 0x007B }, // PR x OP
{ 0x002F, 0x0030 }, // SY x NU
};
QChar cm(0x0308);
for (int i = 0; i < int(sizeof(s) / sizeof(QChar)) / 2; ++i) {
if (testString.contains(QString(s[i], 2)) || testString.contains(QString(s[i], 2).insert(1, cm)))
QSKIP("QTextBoundaryFinder doesn't handle numerical expressions");
}
}
expectedBreakPositions.prepend(0); // ### QTBF generates a boundary at start of text expectedBreakPositions.prepend(0); // ### QTBF generates a boundary at start of text
doTestData(testString, expectedBreakPositions, QTextBoundaryFinder::Line, true); doTestData(testString, expectedBreakPositions, QTextBoundaryFinder::Line, true);
} }