[1/2] Implement Unicode Normalization Form Quick Check (NF QC)

Make Unicode tables generator take QuickCheck data from DerivedNormalizationProps.txt
into account and generate NF QC bits.
\sa http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms

Change-Id: Ib73bd42ddb8f99d0be0aff609711943c52dd9c24
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
This commit is contained in:
Konstantin Ritt 2013-08-10 15:41:52 +03:00 committed by The Qt Project
parent f4942c3cc1
commit 339aff06f9

View File

@ -689,8 +689,9 @@ static const char *property_string =
" ushort titleCaseSpecial : 1;\n"
" ushort caseFoldSpecial : 1;\n"
" ushort unicodeVersion : 4;\n"
" ushort graphemeBreakClass : 8; /* 4 used */\n"
" ushort wordBreakClass : 8; /* 4 used */\n"
" ushort nfQuickCheck : 8;\n" // could be narrowed
" ushort graphemeBreakClass : 4; /* 4 used */\n"
" ushort wordBreakClass : 4; /* 4 used */\n"
" ushort sentenceBreakClass : 8; /* 4 used */\n"
" ushort lineBreakClass : 8; /* 6 used */\n"
" ushort script : 8; /* 7 used */\n"
@ -741,6 +742,7 @@ struct PropertyFlags {
&& sentenceBreakClass == o.sentenceBreakClass
&& lineBreakClass == o.lineBreakClass
&& script == o.script
&& nfQuickCheck == o.nfQuickCheck
);
}
// from UnicodeData.txt
@ -768,6 +770,8 @@ struct PropertyFlags {
SentenceBreakClass sentenceBreakClass;
LineBreakClass lineBreakClass;
int script;
// from DerivedNormalizationProps.txt
uchar nfQuickCheck;
};
@ -873,6 +877,7 @@ struct UnicodeData {
p.wordBreakClass = WordBreak_Other;
p.sentenceBreakClass = SentenceBreak_Other;
p.script = QChar::Script_Unknown;
p.nfQuickCheck = 0;
propertyIndex = -1;
excludedComposition = false;
}
@ -1270,9 +1275,12 @@ static void readDerivedNormalizationProps()
Q_ASSERT(l.size() >= 2);
QByteArray propName = l[1].trimmed();
if (propName != "Full_Composition_Exclusion")
if (propName != "Full_Composition_Exclusion" &&
propName != "NFD_QC" && propName != "NFC_QC" &&
propName != "NFKD_QC" && propName != "NFKC_QC") {
// ###
continue;
}
QByteArray codes = l[0].trimmed();
codes.replace("..", ".");
@ -1289,7 +1297,35 @@ static void readDerivedNormalizationProps()
for (int codepoint = from; codepoint <= to; ++codepoint) {
UnicodeData &d = UnicodeData::valueRef(codepoint);
d.excludedComposition = true;
if (propName == "Full_Composition_Exclusion") {
d.excludedComposition = true;
} else {
Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3);
QString::NormalizationForm form;
if (propName == "NFD_QC")
form = QString::NormalizationForm_D;
else if (propName == "NFC_QC")
form = QString::NormalizationForm_C;
else if (propName == "NFKD_QC")
form = QString::NormalizationForm_KD;
else// if (propName == "NFKC_QC")
form = QString::NormalizationForm_KC;
Q_ASSERT(l.size() == 3);
l[2] = l[2].trimmed();
enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 };
uchar ynm = (l[2] == "N" ? NFQC_NO : l[2] == "M" ? NFQC_MAYBE : NFQC_YES);
if (ynm == NFQC_MAYBE) {
// if this changes, we need to revise the normalizationQuickCheckHelper() implementation
Q_ASSERT(form == QString::NormalizationForm_C || form == QString::NormalizationForm_KC);
}
d.p.nfQuickCheck |= (ynm << (form << 1)); // 2 bits per NF
}
}
}
@ -2246,8 +2282,11 @@ static QByteArray createPropertyInfo()
// " ushort unicodeVersion : 4;\n"
out += QByteArray::number( p.age );
out += ", ";
// " ushort graphemeBreakClass : 8; /* 4 used */\n"
// " ushort wordBreakClass : 8; /* 4 used */\n"
// " ushort nfQuickCheck : 8;\n"
out += QByteArray::number( p.nfQuickCheck );
out += ", ";
// " ushort graphemeBreakClass : 4; /* 4 used */\n"
// " ushort wordBreakClass : 4; /* 4 used */\n"
// " ushort sentenceBreakClass : 8; /* 4 used */\n"
// " ushort lineBreakClass : 8; /* 6 used */\n"
out += QByteArray::number( p.graphemeBreakClass );