38be0d1383
This is the beginning of revision history for this module. If you want to look at revision history older than this, please refer to the Qt Git wiki for how to use Git history grafting. At the time of writing, this wiki is located here: http://qt.gitorious.org/qt/pages/GitIntroductionWithQt If you have already performed the grafting and you don't see any history beyond this commit, try running "git log" with the "--follow" argument. Branched from the monolithic repo, Qt master branch, at commit 896db169ea224deb96c59ce8af800d019de63f12
2787 lines
88 KiB
C++
2787 lines
88 KiB
C++
/****************************************************************************
|
|
**
|
|
** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).
|
|
** All rights reserved.
|
|
** Contact: Nokia Corporation (qt-info@nokia.com)
|
|
**
|
|
** This file is part of the utils of the Qt Toolkit.
|
|
**
|
|
** $QT_BEGIN_LICENSE:LGPL$
|
|
** No Commercial Usage
|
|
** This file contains pre-release code and may not be distributed.
|
|
** You may use this file in accordance with the terms and conditions
|
|
** contained in the Technology Preview License Agreement accompanying
|
|
** this package.
|
|
**
|
|
** GNU Lesser General Public License Usage
|
|
** Alternatively, this file may be used under the terms of the GNU Lesser
|
|
** General Public License version 2.1 as published by the Free Software
|
|
** Foundation and appearing in the file LICENSE.LGPL included in the
|
|
** packaging of this file. Please review the following information to
|
|
** ensure the GNU Lesser General Public License version 2.1 requirements
|
|
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
|
**
|
|
** In addition, as a special exception, Nokia gives you certain additional
|
|
** rights. These rights are described in the Nokia Qt LGPL Exception
|
|
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
|
|
**
|
|
** If you have questions regarding the use of this file, please contact
|
|
** Nokia at qt-info@nokia.com.
|
|
**
|
|
**
|
|
**
|
|
**
|
|
**
|
|
**
|
|
**
|
|
**
|
|
** $QT_END_LICENSE$
|
|
**
|
|
****************************************************************************/
|
|
|
|
#include <qlist.h>
|
|
#include <qhash.h>
|
|
#include <qfile.h>
|
|
#include <qbytearray.h>
|
|
#include <qstring.h>
|
|
#include <qchar.h>
|
|
#include <qvector.h>
|
|
#include <qdebug.h>
|
|
#if 0
|
|
#include <private/qunicodetables_p.h>
|
|
#endif
|
|
|
|
// Version of the Unicode data set, once as plain text and once as the
// spelling of the matching QChar::UnicodeVersion enumerator.
#define DATA_VERSION_S "5.0"
#define DATA_VERSION_STR "QChar::Unicode_5_0"

// Upper bound (inclusive) for every code point accepted by the parsers,
// available as a number and as text.
#define LAST_CODEPOINT 0x10ffff
#define LAST_CODEPOINT_STR "0x10ffff"
|
|
|
|
|
|
static QHash<QByteArray, QChar::UnicodeVersion> age_map;
|
|
|
|
static void initAgeMap()
|
|
{
|
|
struct AgeMap {
|
|
const QChar::UnicodeVersion version;
|
|
const char *age;
|
|
} ageMap[] = {
|
|
{ QChar::Unicode_1_1, "1.1" },
|
|
{ QChar::Unicode_2_0, "2.0" },
|
|
{ QChar::Unicode_2_1_2, "2.1" },
|
|
{ QChar::Unicode_3_0, "3.0" },
|
|
{ QChar::Unicode_3_1, "3.1" },
|
|
{ QChar::Unicode_3_2, "3.2" },
|
|
{ QChar::Unicode_4_0, "4.0" },
|
|
{ QChar::Unicode_4_1, "4.1" },
|
|
{ QChar::Unicode_5_0, "5.0" },
|
|
{ QChar::Unicode_Unassigned, 0 }
|
|
};
|
|
AgeMap *d = ageMap;
|
|
while (d->age) {
|
|
age_map.insert(d->age, d->version);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// Joining types used while parsing; initJoiningMap() associates each of them
// (except the sentinel) with a one-letter code.
enum Joining {
    Joining_None,
    Joining_Left,
    Joining_Causing,
    Joining_Dual,
    Joining_Right,
    Joining_Transparent,

    // Returned by lookups for codes that are not in the map.
    Joining_Unassigned
};
|
|
|
|
static QHash<QByteArray, Joining> joining_map;
|
|
|
|
static void initJoiningMap()
|
|
{
|
|
struct JoiningList {
|
|
Joining joining;
|
|
const char *name;
|
|
} joinings[] = {
|
|
{ Joining_None, "U" },
|
|
{ Joining_Left, "L" },
|
|
{ Joining_Causing, "C" },
|
|
{ Joining_Dual, "D" },
|
|
{ Joining_Right, "R" },
|
|
{ Joining_Transparent, "T" },
|
|
{ Joining_Unassigned, 0 }
|
|
};
|
|
JoiningList *d = joinings;
|
|
while (d->name) {
|
|
joining_map.insert(d->name, d->joining);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// C++ source text declaring the GraphemeBreak enum. The enumerator list here
// has to stay identical to the GraphemeBreak enum this tool uses itself.
static const char *grapheme_break_string =
    " enum GraphemeBreak {\n"
    " GraphemeBreakOther,\n"
    " GraphemeBreakCR,\n"
    " GraphemeBreakLF,\n"
    " GraphemeBreakControl,\n"
    " GraphemeBreakExtend,\n"
    " GraphemeBreakL,\n"
    " GraphemeBreakV,\n"
    " GraphemeBreakT,\n"
    " GraphemeBreakLV,\n"
    " GraphemeBreakLVT\n"
    " };\n\n";
|
|
|
|
// Grapheme break classes. The enumerators before the sentinel appear in the
// same order as in the text held by grapheme_break_string.
enum GraphemeBreak {
    GraphemeBreakOther,
    GraphemeBreakCR,
    GraphemeBreakLF,
    GraphemeBreakControl,
    GraphemeBreakExtend,
    GraphemeBreakL,
    GraphemeBreakV,
    GraphemeBreakT,
    GraphemeBreakLV,
    GraphemeBreakLVT,

    // Tool-internal marker for "name not found"; not part of the emitted enum text.
    GraphemeBreak_Unassigned
};
|
|
|
|
static QHash<QByteArray, GraphemeBreak> grapheme_break_map;
|
|
|
|
static void initGraphemeBreak()
|
|
{
|
|
struct GraphemeBreakList {
|
|
GraphemeBreak brk;
|
|
const char *name;
|
|
} breaks[] = {
|
|
{ GraphemeBreakOther, "Other" },
|
|
{ GraphemeBreakCR, "CR" },
|
|
{ GraphemeBreakLF, "LF" },
|
|
{ GraphemeBreakControl, "Control" },
|
|
{ GraphemeBreakExtend, "Extend" },
|
|
{ GraphemeBreakL, "L" },
|
|
{ GraphemeBreakV, "V" },
|
|
{ GraphemeBreakT, "T" },
|
|
{ GraphemeBreakLV, "LV" },
|
|
{ GraphemeBreakLVT, "LVT" },
|
|
{ GraphemeBreak_Unassigned, 0 }
|
|
};
|
|
GraphemeBreakList *d = breaks;
|
|
while (d->name) {
|
|
grapheme_break_map.insert(d->name, d->brk);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// C++ source text declaring the WordBreak enum. The enumerator list here has
// to stay identical to the WordBreak enum this tool uses itself.
static const char *word_break_string =
    " enum WordBreak {\n"
    " WordBreakOther,\n"
    " WordBreakFormat,\n"
    " WordBreakKatakana,\n"
    " WordBreakALetter,\n"
    " WordBreakMidLetter,\n"
    " WordBreakMidNum,\n"
    " WordBreakNumeric,\n"
    " WordBreakExtendNumLet\n"
    " };\n\n";
|
|
|
|
// Word break classes. The enumerators before the sentinel appear in the same
// order as in the text held by word_break_string.
enum WordBreak {
    WordBreakOther,
    WordBreakFormat,
    WordBreakKatakana,
    WordBreakALetter,
    WordBreakMidLetter,
    WordBreakMidNum,
    WordBreakNumeric,
    WordBreakExtendNumLet,

    // Tool-internal marker for "name not found"; not part of the emitted enum text.
    WordBreak_Unassigned
};
|
|
|
|
static QHash<QByteArray, WordBreak> word_break_map;
|
|
|
|
static void initWordBreak()
|
|
{
|
|
struct WordBreakList {
|
|
WordBreak brk;
|
|
const char *name;
|
|
} breaks[] = {
|
|
{ WordBreakFormat, "Format" },
|
|
{ WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt
|
|
{ WordBreakKatakana, "Katakana" },
|
|
{ WordBreakALetter, "ALetter" },
|
|
{ WordBreakMidLetter, "MidLetter" },
|
|
{ WordBreakMidNum, "MidNum" },
|
|
{ WordBreakNumeric, "Numeric" },
|
|
{ WordBreakExtendNumLet, "ExtendNumLet" },
|
|
{ WordBreak_Unassigned, 0 }
|
|
};
|
|
WordBreakList *d = breaks;
|
|
while (d->name) {
|
|
word_break_map.insert(d->name, d->brk);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// C++ source text declaring the SentenceBreak enum. The enumerator list here
// has to stay identical to the SentenceBreak enum this tool uses itself.
static const char *sentence_break_string =
    " enum SentenceBreak {\n"
    " SentenceBreakOther,\n"
    " SentenceBreakSep,\n"
    " SentenceBreakFormat,\n"
    " SentenceBreakSp,\n"
    " SentenceBreakLower,\n"
    " SentenceBreakUpper,\n"
    " SentenceBreakOLetter,\n"
    " SentenceBreakNumeric,\n"
    " SentenceBreakATerm,\n"
    " SentenceBreakSTerm,\n"
    " SentenceBreakClose\n"
    " };\n\n";
|
|
|
|
// Sentence break classes. The enumerators before the sentinel appear in the
// same order as in the text held by sentence_break_string.
enum SentenceBreak {
    SentenceBreakOther,
    SentenceBreakSep,
    SentenceBreakFormat,
    SentenceBreakSp,
    SentenceBreakLower,
    SentenceBreakUpper,
    SentenceBreakOLetter,
    SentenceBreakNumeric,
    SentenceBreakATerm,
    SentenceBreakSTerm,
    SentenceBreakClose,

    // Tool-internal marker for "name not found"; not part of the emitted enum text.
    SentenceBreak_Unassigned
};
|
|
|
|
static QHash<QByteArray, SentenceBreak> sentence_break_map;
|
|
|
|
static void initSentenceBreak()
|
|
{
|
|
struct SentenceBreakList {
|
|
SentenceBreak brk;
|
|
const char *name;
|
|
} breaks[] = {
|
|
{ SentenceBreakOther, "Other" },
|
|
{ SentenceBreakSep, "Sep" },
|
|
{ SentenceBreakFormat, "Format" },
|
|
{ SentenceBreakSp, "Sp" },
|
|
{ SentenceBreakLower, "Lower" },
|
|
{ SentenceBreakUpper, "Upper" },
|
|
{ SentenceBreakOLetter, "OLetter" },
|
|
{ SentenceBreakNumeric, "Numeric" },
|
|
{ SentenceBreakATerm, "ATerm" },
|
|
{ SentenceBreakSTerm, "STerm" },
|
|
{ SentenceBreakClose, "Close" },
|
|
{ SentenceBreak_Unassigned, 0 }
|
|
};
|
|
SentenceBreakList *d = breaks;
|
|
while (d->name) {
|
|
sentence_break_map.insert(d->name, d->brk);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// C++ source text (including its leading comment lines) declaring the
// LineBreakClass enum. The enumerator list here has to stay identical to the
// LineBreakClass enum this tool uses itself.
static const char *lineBreakClass =
    " // see http://www.unicode.org/reports/tr14/tr14-19.html\n"
    " // we don't use the XX, AI and CB properties and map them to AL instead.\n"
    " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n"
    " enum LineBreakClass {\n"
    " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n"
    " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n"
    " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n"
    " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n"
    " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n"
    " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n"
    " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n"
    " };\n\n";
|
|
|
|
// Line break classes. The enumerators before the sentinel appear in the same
// order as in the text held by lineBreakClass.
enum LineBreakClass {
    LineBreak_OP,
    LineBreak_CL,
    LineBreak_QU,
    LineBreak_GL,
    LineBreak_NS,

    LineBreak_EX,
    LineBreak_SY,
    LineBreak_IS,
    LineBreak_PR,
    LineBreak_PO,

    LineBreak_NU,
    LineBreak_AL,
    LineBreak_ID,
    LineBreak_IN,
    LineBreak_HY,

    LineBreak_BA,
    LineBreak_BB,
    LineBreak_B2,
    LineBreak_ZW,
    LineBreak_CM,

    LineBreak_WJ,
    LineBreak_H2,
    LineBreak_H3,
    LineBreak_JL,
    LineBreak_JV,

    LineBreak_JT,
    LineBreak_SA,
    LineBreak_SG,

    LineBreak_SP,
    LineBreak_CR,
    LineBreak_LF,
    LineBreak_BK,

    // Tool-internal marker for "name not found"; not part of the emitted enum text.
    LineBreak_Unassigned
};
|
|
|
|
static QHash<QByteArray, LineBreakClass> line_break_map;
|
|
|
|
static void initLineBreak()
|
|
{
|
|
// ### Classes XX and AI are left out and mapped to AL for now;
|
|
// ### Class NL is ignored and mapped to AL as well.
|
|
struct LineBreakList {
|
|
LineBreakClass brk;
|
|
const char *name;
|
|
} breaks[] = {
|
|
{ LineBreak_BK, "BK" },
|
|
{ LineBreak_CR, "CR" },
|
|
{ LineBreak_LF, "LF" },
|
|
{ LineBreak_CM, "CM" },
|
|
{ LineBreak_AL, "NL" },
|
|
{ LineBreak_SG, "SG" },
|
|
{ LineBreak_WJ, "WJ" },
|
|
{ LineBreak_ZW, "ZW" },
|
|
{ LineBreak_GL, "GL" },
|
|
{ LineBreak_SP, "SP" },
|
|
{ LineBreak_B2, "B2" },
|
|
{ LineBreak_BA, "BA" },
|
|
{ LineBreak_BB, "BB" },
|
|
{ LineBreak_HY, "HY" },
|
|
{ LineBreak_AL, "CB" }, // ###
|
|
{ LineBreak_CL, "CL" },
|
|
{ LineBreak_EX, "EX" },
|
|
{ LineBreak_IN, "IN" },
|
|
{ LineBreak_NS, "NS" },
|
|
{ LineBreak_OP, "OP" },
|
|
{ LineBreak_QU, "QU" },
|
|
{ LineBreak_IS, "IS" },
|
|
{ LineBreak_NU, "NU" },
|
|
{ LineBreak_PO, "PO" },
|
|
{ LineBreak_PR, "PR" },
|
|
{ LineBreak_SY, "SY" },
|
|
{ LineBreak_AL, "AI" },
|
|
{ LineBreak_AL, "AL" },
|
|
{ LineBreak_H2, "H2" },
|
|
{ LineBreak_H3, "H3" },
|
|
{ LineBreak_ID, "ID" },
|
|
{ LineBreak_JL, "JL" },
|
|
{ LineBreak_JV, "JV" },
|
|
{ LineBreak_JT, "JT" },
|
|
{ LineBreak_SA, "SA" },
|
|
{ LineBreak_AL, "XX" },
|
|
{ LineBreak_Unassigned, 0 }
|
|
};
|
|
LineBreakList *d = breaks;
|
|
while (d->name) {
|
|
line_break_map.insert(d->name, d->brk);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
// C++ source text declaring the Properties struct and its two lookup
// functions. Must be kept in sync with the code in createPropertyInfo.
static const char *property_string =
    " struct Properties {\n"
    " ushort category : 8; /* 5 needed */\n"
    " ushort line_break_class : 8; /* 6 needed */\n"
    " ushort direction : 8; /* 5 needed */\n"
    " ushort combiningClass : 8;\n"
    " ushort joining : 2;\n"
    " signed short digitValue : 6; /* 5 needed */\n"
    " ushort unicodeVersion : 4;\n"
    " ushort lowerCaseSpecial : 1;\n"
    " ushort upperCaseSpecial : 1;\n"
    " ushort titleCaseSpecial : 1;\n"
    " ushort caseFoldSpecial : 1; /* currently unused */\n"
    " signed short mirrorDiff : 16;\n"
    " signed short lowerCaseDiff : 16;\n"
    " signed short upperCaseDiff : 16;\n"
    " signed short titleCaseDiff : 16;\n"
    " signed short caseFoldDiff : 16;\n"
    " ushort graphemeBreak : 8; /* 4 needed */\n"
    " ushort wordBreak : 8; /* 4 needed */\n"
    " ushort sentenceBreak : 8; /* 4 needed */\n"
    " };\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n"
    " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n";
|
|
|
|
// C++ source text declaring the lineBreakClass() and script() lookup
// functions together with their inline QChar convenience overloads.
static const char *methods =
    " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n"
    " inline int lineBreakClass(const QChar &ch)\n"
    " { return lineBreakClass(ch.unicode()); }\n"
    "\n"
    " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n"
    " inline int script(const QChar &ch)\n"
    " { return script(ch.unicode()); }\n\n";
|
|
|
|
// Size in bytes of one Properties record as declared in property_string
// (its bit-fields add up to ten 16-bit units).
static const int SizeOfPropertiesStruct = 20;
|
|
|
|
struct PropertyFlags {
|
|
bool operator ==(const PropertyFlags &o) {
|
|
return (combiningClass == o.combiningClass
|
|
&& category == o.category
|
|
&& direction == o.direction
|
|
&& joining == o.joining
|
|
&& age == o.age
|
|
&& digitValue == o.digitValue
|
|
&& line_break_class == o.line_break_class
|
|
&& mirrorDiff == o.mirrorDiff
|
|
&& lowerCaseDiff == o.lowerCaseDiff
|
|
&& upperCaseDiff == o.upperCaseDiff
|
|
&& titleCaseDiff == o.titleCaseDiff
|
|
&& caseFoldDiff == o.caseFoldDiff
|
|
&& lowerCaseSpecial == o.lowerCaseSpecial
|
|
&& upperCaseSpecial == o.upperCaseSpecial
|
|
&& titleCaseSpecial == o.titleCaseSpecial
|
|
&& caseFoldSpecial == o.caseFoldSpecial
|
|
&& graphemeBreak == o.graphemeBreak
|
|
&& wordBreak == o.wordBreak
|
|
&& sentenceBreak == o.sentenceBreak
|
|
);
|
|
}
|
|
// from UnicodeData.txt
|
|
uchar combiningClass : 8;
|
|
QChar::Category category : 5;
|
|
QChar::Direction direction : 5;
|
|
// from ArabicShaping.txt
|
|
QChar::Joining joining : 2;
|
|
// from DerivedAge.txt
|
|
QChar::UnicodeVersion age : 4;
|
|
int digitValue;
|
|
LineBreakClass line_break_class;
|
|
|
|
int mirrorDiff : 16;
|
|
|
|
int lowerCaseDiff;
|
|
int upperCaseDiff;
|
|
int titleCaseDiff;
|
|
int caseFoldDiff;
|
|
bool lowerCaseSpecial;
|
|
bool upperCaseSpecial;
|
|
bool titleCaseSpecial;
|
|
bool caseFoldSpecial;
|
|
GraphemeBreak graphemeBreak;
|
|
WordBreak wordBreak;
|
|
SentenceBreak sentenceBreak;
|
|
};
|
|
|
|
|
|
static QList<int> specialCaseMap;
|
|
static int specialCaseMaxLen = 0;
|
|
|
|
static int appendToSpecialCaseMap(const QList<int> &map)
|
|
{
|
|
QList<int> utf16map;
|
|
for (int i = 0; i < map.size(); ++i) {
|
|
int val = map.at(i);
|
|
if (val >= 0x10000) {
|
|
utf16map << QChar::highSurrogate(val);
|
|
utf16map << QChar::lowSurrogate(val);
|
|
} else {
|
|
utf16map << val;
|
|
}
|
|
}
|
|
specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size());
|
|
utf16map << 0;
|
|
|
|
for (int i = 0; i < specialCaseMap.size() - utf16map.size() + 1; ++i) {
|
|
int j;
|
|
for (j = 0; j < utf16map.size(); ++j) {
|
|
if (specialCaseMap.at(i+j) != utf16map.at(j))
|
|
break;
|
|
}
|
|
if (j == utf16map.size())
|
|
return i;
|
|
}
|
|
|
|
int pos = specialCaseMap.size();
|
|
specialCaseMap << utf16map;
|
|
return pos;
|
|
}
|
|
|
|
struct UnicodeData {
|
|
UnicodeData(int codepoint = 0) {
|
|
p.category = QChar::Other_NotAssigned; // Cn
|
|
p.combiningClass = 0;
|
|
|
|
p.direction = QChar::DirL;
|
|
// DerivedBidiClass.txt
|
|
// DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF
|
|
if ((codepoint >= 0x590 && codepoint <= 0x5ff)
|
|
|| (codepoint >= 0x7c0 && codepoint <= 0x8ff)
|
|
|| (codepoint >= 0xfb1d && codepoint <= 0xfb4f)
|
|
|| (codepoint >= 0x10800 && codepoint <= 0x10fff)) {
|
|
p.direction = QChar::DirR;
|
|
}
|
|
// DirAL for: U+0600..U+07BF, U+FB50..U+FDFF, U+FE70..U+FEFF
|
|
// minus noncharacter code points (intersects with U+FDD0..U+FDEF)
|
|
if ((codepoint >= 0x600 && codepoint <= 0x7bf)
|
|
|| (codepoint >= 0xfb50 && codepoint <= 0xfdcf)
|
|
|| (codepoint >= 0xfdf0 && codepoint <= 0xfdff)
|
|
|| (codepoint >= 0xfe70 && codepoint <= 0xfeff)) {
|
|
p.direction = QChar::DirAL;
|
|
}
|
|
|
|
mirroredChar = 0;
|
|
decompositionType = QChar::NoDecomposition;
|
|
p.joining = QChar::OtherJoining;
|
|
p.age = QChar::Unicode_Unassigned;
|
|
p.mirrorDiff = 0;
|
|
p.digitValue = -1;
|
|
p.line_break_class = LineBreak_AL; // XX -> AL
|
|
p.lowerCaseDiff = 0;
|
|
p.upperCaseDiff = 0;
|
|
p.titleCaseDiff = 0;
|
|
p.caseFoldDiff = 0;
|
|
p.lowerCaseSpecial = 0;
|
|
p.upperCaseSpecial = 0;
|
|
p.titleCaseSpecial = 0;
|
|
p.caseFoldSpecial = 0;
|
|
p.graphemeBreak = GraphemeBreakOther;
|
|
p.wordBreak = WordBreakOther;
|
|
p.sentenceBreak = SentenceBreakOther;
|
|
propertyIndex = -1;
|
|
excludedComposition = false;
|
|
}
|
|
PropertyFlags p;
|
|
|
|
// from UnicodeData.txt
|
|
QChar::Decomposition decompositionType;
|
|
QList<int> decomposition;
|
|
|
|
QList<int> specialFolding;
|
|
|
|
// from BidiMirroring.txt
|
|
int mirroredChar;
|
|
|
|
// DerivedNormalizationProps.txt
|
|
bool excludedComposition;
|
|
|
|
// computed position of unicode property set
|
|
int propertyIndex;
|
|
};
|
|
|
|
// Indices of the ';'-separated fields of a UnicodeData.txt line, in file order.
enum UniDataFields {
    UD_Value,               //  0
    UD_Name,                //  1
    UD_Category,            //  2
    UD_CombiningClass,      //  3
    UD_BidiCategory,        //  4
    UD_Decomposition,       //  5
    UD_DecimalDigitValue,   //  6
    UD_DigitValue,          //  7
    UD_NumericValue,        //  8
    UD_Mirrored,            //  9
    UD_OldName,             // 10
    UD_Comment,             // 11
    UD_UpperCase,           // 12
    UD_LowerCase,           // 13
    UD_TitleCase            // 14
};
|
|
|
|
|
|
static QHash<QByteArray, QChar::Category> categoryMap;
|
|
|
|
static void initCategoryMap()
|
|
{
|
|
struct Cat {
|
|
QChar::Category cat;
|
|
const char *name;
|
|
} categories[] = {
|
|
{ QChar::Mark_NonSpacing, "Mn" },
|
|
{ QChar::Mark_SpacingCombining, "Mc" },
|
|
{ QChar::Mark_Enclosing, "Me" },
|
|
|
|
{ QChar::Number_DecimalDigit, "Nd" },
|
|
{ QChar::Number_Letter, "Nl" },
|
|
{ QChar::Number_Other, "No" },
|
|
|
|
{ QChar::Separator_Space, "Zs" },
|
|
{ QChar::Separator_Line, "Zl" },
|
|
{ QChar::Separator_Paragraph, "Zp" },
|
|
|
|
{ QChar::Other_Control, "Cc" },
|
|
{ QChar::Other_Format, "Cf" },
|
|
{ QChar::Other_Surrogate, "Cs" },
|
|
{ QChar::Other_PrivateUse, "Co" },
|
|
{ QChar::Other_NotAssigned, "Cn" },
|
|
|
|
{ QChar::Letter_Uppercase, "Lu" },
|
|
{ QChar::Letter_Lowercase, "Ll" },
|
|
{ QChar::Letter_Titlecase, "Lt" },
|
|
{ QChar::Letter_Modifier, "Lm" },
|
|
{ QChar::Letter_Other, "Lo" },
|
|
|
|
{ QChar::Punctuation_Connector, "Pc" },
|
|
{ QChar::Punctuation_Dash, "Pd" },
|
|
{ QChar::Punctuation_Open, "Ps" },
|
|
{ QChar::Punctuation_Close, "Pe" },
|
|
{ QChar::Punctuation_InitialQuote, "Pi" },
|
|
{ QChar::Punctuation_FinalQuote, "Pf" },
|
|
{ QChar::Punctuation_Other, "Po" },
|
|
|
|
{ QChar::Symbol_Math, "Sm" },
|
|
{ QChar::Symbol_Currency, "Sc" },
|
|
{ QChar::Symbol_Modifier, "Sk" },
|
|
{ QChar::Symbol_Other, "So" },
|
|
{ QChar::NoCategory, 0 }
|
|
};
|
|
Cat *c = categories;
|
|
while (c->name) {
|
|
categoryMap.insert(c->name, c->cat);
|
|
++c;
|
|
}
|
|
}
|
|
|
|
|
|
static QHash<QByteArray, QChar::Direction> directionMap;
|
|
|
|
static void initDirectionMap()
|
|
{
|
|
struct Dir {
|
|
QChar::Direction dir;
|
|
const char *name;
|
|
} directions[] = {
|
|
{ QChar::DirL, "L" },
|
|
{ QChar::DirR, "R" },
|
|
{ QChar::DirEN, "EN" },
|
|
{ QChar::DirES, "ES" },
|
|
{ QChar::DirET, "ET" },
|
|
{ QChar::DirAN, "AN" },
|
|
{ QChar::DirCS, "CS" },
|
|
{ QChar::DirB, "B" },
|
|
{ QChar::DirS, "S" },
|
|
{ QChar::DirWS, "WS" },
|
|
{ QChar::DirON, "ON" },
|
|
{ QChar::DirLRE, "LRE" },
|
|
{ QChar::DirLRO, "LRO" },
|
|
{ QChar::DirAL, "AL" },
|
|
{ QChar::DirRLE, "RLE" },
|
|
{ QChar::DirRLO, "RLO" },
|
|
{ QChar::DirPDF, "PDF" },
|
|
{ QChar::DirNSM, "NSM" },
|
|
{ QChar::DirBN, "BN" },
|
|
{ QChar::DirL, 0 }
|
|
};
|
|
Dir *d = directions;
|
|
while (d->name) {
|
|
directionMap.insert(d->name, d->dir);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
static QHash<QByteArray, QChar::Decomposition> decompositionMap;
|
|
|
|
static void initDecompositionMap()
|
|
{
|
|
struct Dec {
|
|
QChar::Decomposition dec;
|
|
const char *name;
|
|
} decompositions[] = {
|
|
{ QChar::Canonical, "<canonical>" },
|
|
{ QChar::Font, "<font>" },
|
|
{ QChar::NoBreak, "<noBreak>" },
|
|
{ QChar::Initial, "<initial>" },
|
|
{ QChar::Medial, "<medial>" },
|
|
{ QChar::Final, "<final>" },
|
|
{ QChar::Isolated, "<isolated>" },
|
|
{ QChar::Circle, "<circle>" },
|
|
{ QChar::Super, "<super>" },
|
|
{ QChar::Sub, "<sub>" },
|
|
{ QChar::Vertical, "<vertical>" },
|
|
{ QChar::Wide, "<wide>" },
|
|
{ QChar::Narrow, "<narrow>" },
|
|
{ QChar::Small, "<small>" },
|
|
{ QChar::Square, "<square>" },
|
|
{ QChar::Compat, "<compat>" },
|
|
{ QChar::Fraction, "<fraction>" },
|
|
{ QChar::NoDecomposition, 0 }
|
|
};
|
|
Dec *d = decompositions;
|
|
while (d->name) {
|
|
decompositionMap.insert(d->name, d->dec);
|
|
++d;
|
|
}
|
|
}
|
|
|
|
|
|
static QHash<int, UnicodeData> unicodeData;
|
|
static QList<PropertyFlags> uniqueProperties;
|
|
|
|
|
|
static QHash<int, int> decompositionLength;
|
|
static int highestComposedCharacter = 0;
|
|
static int numLigatures = 0;
|
|
static int highestLigature = 0;
|
|
|
|
struct Ligature {
|
|
ushort u1;
|
|
ushort u2;
|
|
ushort ligature;
|
|
};
|
|
// we need them sorted after the first component for fast lookup
|
|
bool operator < (const Ligature &l1, const Ligature &l2)
|
|
{ return l1.u1 < l2.u1; }
|
|
|
|
// Ligatures grouped by their second component.
static QHash<ushort, QList<Ligature> > ligatureHashes;

// Histogram: combining class -> number of UnicodeData.txt entries using it.
static QHash<int, int> combiningClassUsage;

// Largest absolute distance between a code point and its lower-, upper- and
// title-case mapping seen in UnicodeData.txt.
static int maxLowerCaseDiff = 0;
static int maxUpperCaseDiff = 0;
static int maxTitleCaseDiff = 0;
|
|
|
|
static void readUnicodeData()
|
|
{
|
|
QFile f("data/UnicodeData.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find UnicodeData.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.truncate(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> properties = line.split(';');
|
|
bool ok;
|
|
int codepoint = properties[UD_Value].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
Q_ASSERT(codepoint <= LAST_CODEPOINT);
|
|
int lastCodepoint = codepoint;
|
|
|
|
QByteArray name = properties[UD_Name];
|
|
if (name.startsWith('<') && name.contains("First")) {
|
|
QByteArray nextLine;
|
|
nextLine.resize(1024);
|
|
f.readLine(nextLine.data(), 1024);
|
|
QList<QByteArray> properties = nextLine.split(';');
|
|
Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last"));
|
|
lastCodepoint = properties[UD_Value].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
Q_ASSERT(lastCodepoint <= LAST_CODEPOINT);
|
|
}
|
|
|
|
UnicodeData data(codepoint);
|
|
data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory);
|
|
if (data.p.category == QChar::NoCategory)
|
|
qFatal("unassigned char category: %s", properties[UD_Category].constData());
|
|
|
|
data.p.combiningClass = properties[UD_CombiningClass].toInt();
|
|
if (!combiningClassUsage.contains(data.p.combiningClass))
|
|
combiningClassUsage[data.p.combiningClass] = 1;
|
|
else
|
|
++combiningClassUsage[data.p.combiningClass];
|
|
|
|
data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction);
|
|
|
|
if (!properties[UD_UpperCase].isEmpty()) {
|
|
int upperCase = properties[UD_UpperCase].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int diff = upperCase - codepoint;
|
|
if (qAbs(diff) >= (1<<14))
|
|
qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << ")";
|
|
data.p.upperCaseDiff = diff;
|
|
maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff));
|
|
if (codepoint >= 0x10000 || upperCase >= 0x10000) {
|
|
// if the conditions below doesn't hold anymore we need to modify our upper casing code
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase));
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase));
|
|
}
|
|
}
|
|
if (!properties[UD_LowerCase].isEmpty()) {
|
|
int lowerCase = properties[UD_LowerCase].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int diff = lowerCase - codepoint;
|
|
if (qAbs(diff) >= (1<<14))
|
|
qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << ")";
|
|
data.p.lowerCaseDiff = diff;
|
|
maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff));
|
|
if (codepoint >= 0x10000 || lowerCase >= 0x10000) {
|
|
// if the conditions below doesn't hold anymore we need to modify our lower casing code
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase));
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase));
|
|
}
|
|
}
|
|
// we want toTitleCase to map to ToUpper in case we don't have any titlecase.
|
|
if (properties[UD_TitleCase].isEmpty())
|
|
properties[UD_TitleCase] = properties[UD_UpperCase];
|
|
if (!properties[UD_TitleCase].isEmpty()) {
|
|
int titleCase = properties[UD_TitleCase].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int diff = titleCase - codepoint;
|
|
if (qAbs(diff) >= (1<<14))
|
|
qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << ")";
|
|
data.p.titleCaseDiff = diff;
|
|
maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff));
|
|
if (codepoint >= 0x10000 || titleCase >= 0x10000) {
|
|
// if the conditions below doesn't hold anymore we need to modify our title casing code
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase));
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase));
|
|
}
|
|
}
|
|
|
|
if (!properties[UD_DigitValue].isEmpty())
|
|
data.p.digitValue = properties[UD_DigitValue].toInt();
|
|
|
|
// decompositition
|
|
QByteArray decomposition = properties[UD_Decomposition];
|
|
if (!decomposition.isEmpty()) {
|
|
highestComposedCharacter = qMax(highestComposedCharacter, codepoint);
|
|
QList<QByteArray> d = decomposition.split(' ');
|
|
if (d[0].contains('<')) {
|
|
data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition);
|
|
if (data.decompositionType == QChar::NoDecomposition)
|
|
qFatal("unassigned decomposition type: %s", d[0].constData());
|
|
d.takeFirst();
|
|
} else {
|
|
data.decompositionType = QChar::Canonical;
|
|
}
|
|
for (int i = 0; i < d.size(); ++i) {
|
|
data.decomposition.append(d[i].toInt(&ok, 16));
|
|
Q_ASSERT(ok);
|
|
}
|
|
if (!decompositionLength.contains(data.decomposition.size()))
|
|
decompositionLength[data.decomposition.size()] = 1;
|
|
else
|
|
++decompositionLength[data.decomposition.size()];
|
|
}
|
|
|
|
for (int i = codepoint; i <= lastCodepoint; ++i)
|
|
unicodeData.insert(i, data);
|
|
}
|
|
|
|
}
|
|
|
|
static int maxMirroredDiff = 0;
|
|
|
|
static void readBidiMirroring()
|
|
{
|
|
QFile f("data/BidiMirroring.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find BidiMirroring.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
line = line.replace(" ", "");
|
|
|
|
QList<QByteArray> pair = line.split(';');
|
|
Q_ASSERT(pair.size() == 2);
|
|
|
|
bool ok;
|
|
int codepoint = pair[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int mirror = pair[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
d.mirroredChar = mirror;
|
|
d.p.mirrorDiff = d.mirroredChar - codepoint;
|
|
maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff));
|
|
unicodeData.insert(codepoint, d);
|
|
}
|
|
}
|
|
|
|
static void readArabicShaping()
|
|
{
|
|
QFile f("data/ArabicShaping.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find ArabicShaping.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line = line.trimmed();
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 4);
|
|
|
|
bool ok;
|
|
int codepoint = l[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
|
|
Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned);
|
|
if (joining == Joining_Unassigned)
|
|
qFatal("unassigned or unhandled joining value: %s", l[2].constData());
|
|
|
|
if (joining == Joining_Left) {
|
|
// There are currently no characters of joining type Left_Joining defined in Unicode.
|
|
qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData());
|
|
}
|
|
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
if (joining == Joining_Right)
|
|
d.p.joining = QChar::Right;
|
|
else if (joining == Joining_Dual)
|
|
d.p.joining = QChar::Dual;
|
|
else if (joining == Joining_Causing)
|
|
d.p.joining = QChar::Center;
|
|
else
|
|
d.p.joining = QChar::OtherJoining;
|
|
unicodeData.insert(codepoint, d);
|
|
}
|
|
}
|
|
|
|
static void readDerivedAge()
|
|
{
|
|
QFile f("data/DerivedAge.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find DerivedAge.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
QByteArray codes = l[0];
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned);
|
|
//qDebug() << hex << from << ".." << to << ba << age;
|
|
if (age == QChar::Unicode_Unassigned)
|
|
qFatal("unassigned or unhandled age value: %s", l[1].constData());
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
d.p.age = age;
|
|
unicodeData.insert(codepoint, d);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static void readDerivedNormalizationProps()
|
|
{
|
|
QFile f("data/DerivedNormalizationProps.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find DerivedNormalizationProps.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
if (line.trimmed().isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() >= 2);
|
|
|
|
QByteArray propName = l[1].trimmed();
|
|
if (propName != "Full_Composition_Exclusion")
|
|
// ###
|
|
continue;
|
|
|
|
QByteArray codes = l[0].trimmed();
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
d.excludedComposition = true;
|
|
unicodeData.insert(codepoint, d);
|
|
}
|
|
}
|
|
|
|
for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) {
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
if (!d.excludedComposition
|
|
&& d.decompositionType == QChar::Canonical
|
|
&& d.decomposition.size() > 1) {
|
|
Q_ASSERT(d.decomposition.size() == 2);
|
|
|
|
int part1 = d.decomposition.at(0);
|
|
int part2 = d.decomposition.at(1);
|
|
|
|
// all non-starters are listed in DerivedNormalizationProps.txt
|
|
// and already excluded from composition
|
|
Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0);
|
|
|
|
++numLigatures;
|
|
highestLigature = qMax(highestLigature, part1);
|
|
Ligature l = {(ushort)part1, (ushort)part2, (ushort)codepoint};
|
|
ligatureHashes[part2].append(l);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
struct NormalizationCorrection {
|
|
uint codepoint;
|
|
uint mapped;
|
|
uint version;
|
|
};
|
|
|
|
// Reads data/NormalizationCorrections.txt and returns the C++ source text for
// the NormalizationCorrection struct, the uc_normalization_corrections table
// (one row per record in the file) and the NumNormalizationCorrections enum.
// Aborts via qFatal() if the file is missing, cannot be opened, or names a
// Unicode version other than 3.2.0 or 4.0.0.
static QByteArray createNormalizationCorrections()
{
    QFile f("data/NormalizationCorrections.txt");
    if (!f.exists())
        qFatal("Couldn't find NormalizationCorrections.txt");

    // Without this check an unreadable file would yield an empty table.
    if (!f.open(QFile::ReadOnly))
        qFatal("Couldn't open NormalizationCorrections.txt");

    QByteArray out;

    out += "struct NormalizationCorrection {\n"
           " uint ucs4;\n"
           " uint old_mapping;\n"
           " int version;\n"
           "};\n\n"

           "static const NormalizationCorrection uc_normalization_corrections[] = {\n";

    int numCorrections = 0;
    while (!f.atEnd()) {
        // Read the complete line and strip the line feed only when present.
        QByteArray line = f.readLine();
        if (line.endsWith('\n'))
            line.chop(1);

        int comment = line.indexOf('#');
        if (comment >= 0)
            line = line.left(comment);
        line.replace(" ", "");

        if (line.isEmpty())
            continue;

        // ranges are not expected in this file
        Q_ASSERT(!line.contains(".."));

        QList<QByteArray> fields = line.split(';');
        Q_ASSERT(fields.size() == 4);

        NormalizationCorrection c = { 0, 0, 0 };
        bool ok;
        c.codepoint = fields.at(0).toInt(&ok, 16);
        Q_ASSERT(ok);
        c.mapped = fields.at(1).toInt(&ok, 16);
        Q_ASSERT(ok);
        if (fields.at(3) == "3.2.0")
            c.version = QChar::Unicode_3_2;
        else if (fields.at(3) == "4.0.0")
            c.version = QChar::Unicode_4_0;
        else
            qFatal("unknown unicode version in NormalizationCorrections.txt");

        // Stay within QByteArray; mixing in QString relies on implicit
        // QString -> QByteArray conversion.
        out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16)
             + ", " + QByteArray::number(c.version) + " },\n";
        ++numCorrections;
    }

    out += "};\n\n"

           "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n";

    return out;
}
|
|
|
|
|
|
static void computeUniqueProperties()
|
|
{
|
|
qDebug("computeUniqueProperties:");
|
|
for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) {
|
|
UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
|
|
|
|
int index = uniqueProperties.indexOf(d.p);
|
|
if (index == -1) {
|
|
index = uniqueProperties.size();
|
|
uniqueProperties.append(d.p);
|
|
}
|
|
d.propertyIndex = index;
|
|
unicodeData.insert(uc, d);
|
|
}
|
|
qDebug(" %d unique unicode properties found", uniqueProperties.size());
|
|
}
|
|
|
|
|
|
static void readLineBreak()
|
|
{
|
|
qDebug() << "Reading LineBreak.txt";
|
|
QFile f("data/LineBreak.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find LineBreak.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
QByteArray codes = l[0];
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned);
|
|
if (lb == LineBreak_Unassigned)
|
|
qFatal("unassigned line break class: %s", l[1].constData());
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
d.p.line_break_class = lb;
|
|
unicodeData.insert(codepoint, d);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static void readSpecialCasing()
|
|
{
|
|
qDebug() << "Reading SpecialCasing.txt";
|
|
QFile f("data/SpecialCasing.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find SpecialCasing.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed();
|
|
if (!condition.isEmpty())
|
|
// #####
|
|
continue;
|
|
|
|
bool ok;
|
|
int codepoint = l[0].trimmed().toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
|
|
// if the condition below doesn't hold anymore we need to modify our
|
|
// lower/upper/title casing code and case folding code
|
|
Q_ASSERT(codepoint < 0x10000);
|
|
|
|
// qDebug() << "codepoint" << hex << codepoint;
|
|
// qDebug() << line;
|
|
|
|
QList<QByteArray> lower = l[1].trimmed().split(' ');
|
|
QList<int> lowerMap;
|
|
for (int i = 0; i < lower.size(); ++i) {
|
|
bool ok;
|
|
lowerMap.append(lower.at(i).toInt(&ok, 16));
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
QList<QByteArray> title = l[2].trimmed().split(' ');
|
|
QList<int> titleMap;
|
|
for (int i = 0; i < title.size(); ++i) {
|
|
bool ok;
|
|
titleMap.append(title.at(i).toInt(&ok, 16));
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
QList<QByteArray> upper = l[3].trimmed().split(' ');
|
|
QList<int> upperMap;
|
|
for (int i = 0; i < upper.size(); ++i) {
|
|
bool ok;
|
|
upperMap.append(upper.at(i).toInt(&ok, 16));
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
|
|
UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
|
|
Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff);
|
|
Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff);
|
|
Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff);
|
|
|
|
if (lowerMap.size() > 1) {
|
|
ud.p.lowerCaseSpecial = true;
|
|
ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap);
|
|
}
|
|
if (titleMap.size() > 1) {
|
|
ud.p.titleCaseSpecial = true;
|
|
ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap);
|
|
}
|
|
if (upperMap.size() > 1) {
|
|
ud.p.upperCaseSpecial = true;
|
|
ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);;
|
|
}
|
|
|
|
unicodeData.insert(codepoint, ud);
|
|
}
|
|
}
|
|
|
|
static int maxCaseFoldDiff = 0;
|
|
|
|
static void readCaseFolding()
|
|
{
|
|
qDebug() << "Reading CaseFolding.txt";
|
|
QFile f("data/CaseFolding.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find CaseFolding.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
|
|
bool ok;
|
|
int codepoint = l[0].trimmed().toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
|
|
|
|
l[1] = l[1].trimmed();
|
|
if (l[1] == "F" || l[1] == "T")
|
|
continue;
|
|
|
|
// qDebug() << "codepoint" << hex << codepoint;
|
|
// qDebug() << line;
|
|
QList<QByteArray> fold = l[2].trimmed().split(' ');
|
|
QList<int> foldMap;
|
|
for (int i = 0; i < fold.size(); ++i) {
|
|
bool ok;
|
|
foldMap.append(fold.at(i).toInt(&ok, 16));
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
if (foldMap.size() == 1) {
|
|
int caseFolded = foldMap.at(0);
|
|
int diff = caseFolded - codepoint;
|
|
if (qAbs(diff) >= (1<<14))
|
|
qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << ")";
|
|
ud.p.caseFoldDiff = diff;
|
|
maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff));
|
|
if (codepoint >= 0x10000 || caseFolded >= 0x10000) {
|
|
// if the conditions below doesn't hold anymore we need to modify our case folding code
|
|
Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded));
|
|
Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded));
|
|
}
|
|
if (caseFolded != codepoint + ud.p.lowerCaseDiff)
|
|
qDebug() << hex << codepoint;
|
|
} else {
|
|
qFatal("we currently don't support full case foldings");
|
|
// qDebug() << "special" << hex << foldMap;
|
|
ud.p.caseFoldSpecial = true;
|
|
ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap);
|
|
}
|
|
unicodeData.insert(codepoint, ud);
|
|
}
|
|
}
|
|
|
|
static void readGraphemeBreak()
|
|
{
|
|
qDebug() << "Reading GraphemeBreakProperty.txt";
|
|
QFile f("data/GraphemeBreakProperty.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find GraphemeBreakProperty.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
QByteArray codes = l[0];
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned);
|
|
if (brk == GraphemeBreak_Unassigned)
|
|
qFatal("unassigned grapheme break class: %s", l[1].constData());
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
ud.p.graphemeBreak = brk;
|
|
unicodeData.insert(codepoint, ud);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void readWordBreak()
|
|
{
|
|
qDebug() << "Reading WordBreakProperty.txt";
|
|
QFile f("data/WordBreakProperty.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find WordBreakProperty.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
QByteArray codes = l[0];
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned);
|
|
if (brk == WordBreak_Unassigned)
|
|
qFatal("unassigned word break class: %s", l[1].constData());
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
ud.p.wordBreak = brk;
|
|
unicodeData.insert(codepoint, ud);
|
|
}
|
|
}
|
|
}
|
|
|
|
static void readSentenceBreak()
|
|
{
|
|
qDebug() << "Reading SentenceBreakProperty.txt";
|
|
QFile f("data/SentenceBreakProperty.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find SentenceBreakProperty.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line;
|
|
line.resize(1024);
|
|
int len = f.readLine(line.data(), 1024);
|
|
line.resize(len-1);
|
|
|
|
int comment = line.indexOf('#');
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
QList<QByteArray> l = line.split(';');
|
|
Q_ASSERT(l.size() == 2);
|
|
|
|
QByteArray codes = l[0];
|
|
codes.replace("..", ".");
|
|
QList<QByteArray> cl = codes.split('.');
|
|
|
|
bool ok;
|
|
int from = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int to = from;
|
|
if (cl.size() == 2) {
|
|
to = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned);
|
|
if (brk == SentenceBreak_Unassigned)
|
|
qFatal("unassigned sentence break class: %s", l[1].constData());
|
|
|
|
for (int codepoint = from; codepoint <= to; ++codepoint) {
|
|
UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint));
|
|
ud.p.sentenceBreak = brk;
|
|
unicodeData.insert(codepoint, ud);
|
|
}
|
|
}
|
|
}
|
|
|
|
#if 0
|
|
// this piece of code does full case folding and comparison. We currently
// don't use it, since it causes lots of issues with things such as
// case-insensitive search and replace.
|
|
// Writes the case folded form of ch to out as a zero-terminated sequence of
// ushort values: either ch shifted by its caseFoldDiff, or the zero-terminated
// entry of specialCaseMap that caseFoldDiff points at.
static inline void foldCase(uint ch, ushort *out)
{
    const QUnicodeTables::Properties *prop = qGetProp(ch);
    if (prop->caseFoldSpecial) {
        // copy the special folding up to (not including) its terminator
        for (const ushort *src = specialCaseMap + prop->caseFoldDiff; *src; ++src)
            *out++ = *src;
    } else {
        *out++ = ch + prop->caseFoldDiff;
    }
    *out = 0;
}
|
|
|
|
static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be)
|
|
{
|
|
if (a == b)
|
|
return 0;
|
|
if (a == 0)
|
|
return 1;
|
|
if (b == 0)
|
|
return -1;
|
|
|
|
while (a != ae && b != be) {
|
|
const QUnicodeTables::Properties *pa = qGetProp(*a);
|
|
const QUnicodeTables::Properties *pb = qGetProp(*b);
|
|
if (pa->caseFoldSpecial | pb->caseFoldSpecial)
|
|
goto special;
|
|
int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
|
|
if ((diff))
|
|
return diff;
|
|
++a;
|
|
++b;
|
|
}
|
|
}
|
|
if (a == ae) {
|
|
if (b == be)
|
|
return 0;
|
|
return -1;
|
|
}
|
|
return 1;
|
|
special:
|
|
ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
abuf[0] = bbuf[0] = 0;
|
|
ushort *ap = abuf;
|
|
ushort *bp = bbuf;
|
|
while (1) {
|
|
if (!*ap) {
|
|
if (a == ae) {
|
|
if (!*bp && b == be)
|
|
return 0;
|
|
return -1;
|
|
}
|
|
foldCase(*(a++), abuf);
|
|
ap = abuf;
|
|
}
|
|
if (!*bp) {
|
|
if (b == be)
|
|
return 1;
|
|
foldCase(*(b++), bbuf);
|
|
bp = bbuf;
|
|
}
|
|
if (*ap != *bp)
|
|
return (int)*ap - (int)*bp;
|
|
++ap;
|
|
++bp;
|
|
}
|
|
}
|
|
|
|
|
|
static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b)
|
|
{
|
|
if (a == 0)
|
|
return 1;
|
|
if (b == 0)
|
|
return -1;
|
|
|
|
while (a != ae && *b) {
|
|
const QUnicodeTables::Properties *pa = qGetProp(*a);
|
|
const QUnicodeTables::Properties *pb = qGetProp((ushort)*b);
|
|
if (pa->caseFoldSpecial | pb->caseFoldSpecial)
|
|
goto special;
|
|
int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff);
|
|
if ((diff))
|
|
return diff;
|
|
++a;
|
|
++b;
|
|
}
|
|
if (a == ae) {
|
|
if (!*b)
|
|
return 0;
|
|
return -1;
|
|
}
|
|
return 1;
|
|
|
|
special:
|
|
ushort abuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
ushort bbuf[SPECIAL_CASE_MAX_LEN + 1];
|
|
abuf[0] = bbuf[0] = 0;
|
|
ushort *ap = abuf;
|
|
ushort *bp = bbuf;
|
|
while (1) {
|
|
if (!*ap) {
|
|
if (a == ae) {
|
|
if (!*bp && !*b)
|
|
return 0;
|
|
return -1;
|
|
}
|
|
foldCase(*(a++), abuf);
|
|
ap = abuf;
|
|
}
|
|
if (!*bp) {
|
|
if (!*b)
|
|
return 1;
|
|
foldCase(*(b++), bbuf);
|
|
bp = bbuf;
|
|
}
|
|
if (*ap != *bp)
|
|
return (int)*ap - (int)*bp;
|
|
++ap;
|
|
++bp;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
#if 0
|
|
static QList<QByteArray> blockNames;
|
|
struct BlockInfo
|
|
{
|
|
int blockIndex;
|
|
int firstCodePoint;
|
|
int lastCodePoint;
|
|
};
|
|
static QList<BlockInfo> blockInfoList;
|
|
|
|
static void readBlocks()
|
|
{
|
|
QFile f("data/Blocks.txt");
|
|
if (!f.exists())
|
|
qFatal("Couldn't find Blocks.txt");
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line = f.readLine();
|
|
line.resize(line.size() - 1);
|
|
|
|
int comment = line.indexOf("#");
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
line.replace(" ", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
int semicolon = line.indexOf(';');
|
|
Q_ASSERT(semicolon >= 0);
|
|
QByteArray codePoints = line.left(semicolon);
|
|
QByteArray blockName = line.mid(semicolon + 1);
|
|
|
|
int blockIndex = blockNames.indexOf(blockName);
|
|
if (blockIndex == -1) {
|
|
blockIndex = blockNames.size();
|
|
blockNames.append(blockName);
|
|
}
|
|
|
|
codePoints.replace("..", ".");
|
|
QList<QByteArray> cl = codePoints.split('.');
|
|
|
|
bool ok;
|
|
int first = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int last = first;
|
|
if (cl.size() == 2) {
|
|
last = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
BlockInfo blockInfo = { blockIndex, first, last };
|
|
blockInfoList.append(blockInfo);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
static QList<QByteArray> scriptNames;
|
|
static QHash<int, int> scriptAssignment;
|
|
static QHash<int, int> scriptHash;
|
|
|
|
struct ExtraBlock {
|
|
int block;
|
|
QVector<int> vector;
|
|
};
|
|
|
|
static QList<ExtraBlock> extraBlockList;
|
|
|
|
|
|
static void readScripts()
|
|
{
|
|
scriptNames.append("Common");
|
|
|
|
static const char *files[] = {
|
|
"data/ScriptsInitial.txt",
|
|
"data/Scripts.txt",
|
|
"data/ScriptsCorrections.txt"
|
|
};
|
|
enum { fileCount = sizeof(files) / sizeof(const char *) };
|
|
|
|
for (int i = 0; i < fileCount; ++i) {
|
|
QFile f(files[i]);
|
|
if (!f.exists())
|
|
qFatal("Couldn't find %s", files[i]);
|
|
|
|
f.open(QFile::ReadOnly);
|
|
|
|
while (!f.atEnd()) {
|
|
QByteArray line = f.readLine();
|
|
line.resize(line.size() - 1);
|
|
|
|
int comment = line.indexOf("#");
|
|
if (comment >= 0)
|
|
line = line.left(comment);
|
|
|
|
line.replace(" ", "");
|
|
line.replace("_", "");
|
|
|
|
if (line.isEmpty())
|
|
continue;
|
|
|
|
int semicolon = line.indexOf(';');
|
|
Q_ASSERT(semicolon >= 0);
|
|
QByteArray codePoints = line.left(semicolon);
|
|
QByteArray scriptName = line.mid(semicolon + 1);
|
|
|
|
int scriptIndex = scriptNames.indexOf(scriptName);
|
|
if (scriptIndex == -1) {
|
|
scriptIndex = scriptNames.size();
|
|
scriptNames.append(scriptName);
|
|
}
|
|
|
|
codePoints.replace("..", ".");
|
|
QList<QByteArray> cl = codePoints.split('.');
|
|
|
|
bool ok;
|
|
int first = cl[0].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
int last = first;
|
|
if (cl.size() == 2) {
|
|
last = cl[1].toInt(&ok, 16);
|
|
Q_ASSERT(ok);
|
|
}
|
|
|
|
for (int i = first; i <= last; ++i)
|
|
scriptAssignment[i] = scriptIndex;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
static int scriptSentinel = 0;
|
|
|
|
QByteArray createScriptEnumDeclaration()
|
|
{
|
|
static const char *specialScripts[] = {
|
|
"Common",
|
|
"Arabic",
|
|
"Armenian",
|
|
"Bengali",
|
|
"Cyrillic",
|
|
"Devanagari",
|
|
"Georgian",
|
|
"Greek",
|
|
"Gujarati",
|
|
"Gurmukhi",
|
|
"Hangul",
|
|
"Hebrew",
|
|
"Kannada",
|
|
"Khmer",
|
|
"Lao",
|
|
"Malayalam",
|
|
"Myanmar",
|
|
"Nko",
|
|
"Ogham",
|
|
"Oriya",
|
|
"Runic",
|
|
"Sinhala",
|
|
"Syriac",
|
|
"Tamil",
|
|
"Telugu",
|
|
"Thaana",
|
|
"Thai",
|
|
"Tibetan",
|
|
"Inherited"
|
|
};
|
|
const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *);
|
|
|
|
// generate script enum
|
|
QByteArray declaration;
|
|
|
|
declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n";
|
|
declaration += " enum Script {\n Common";
|
|
|
|
int uniqueScripts = 1; // Common
|
|
|
|
// output the ones with special processing first
|
|
for (int i = 1; i < scriptNames.size(); ++i) {
|
|
QByteArray scriptName = scriptNames.at(i);
|
|
// does the script require special processing?
|
|
bool special = false;
|
|
for (int s = 0; s < specialScriptsCount; ++s) {
|
|
if (scriptName == specialScripts[s]) {
|
|
special = true;
|
|
break;
|
|
}
|
|
}
|
|
if (!special) {
|
|
scriptHash[i] = 0; // alias for 'Common'
|
|
continue;
|
|
} else {
|
|
++uniqueScripts;
|
|
scriptHash[i] = i;
|
|
}
|
|
|
|
if (scriptName != "Inherited") {
|
|
declaration += ",\n ";
|
|
declaration += scriptName;
|
|
}
|
|
}
|
|
declaration += ",\n Inherited";
|
|
declaration += ",\n ScriptCount = Inherited";
|
|
|
|
// output the ones that are an alias for 'Common'
|
|
for (int i = 1; i < scriptNames.size(); ++i) {
|
|
if (scriptHash.value(i) != 0)
|
|
continue;
|
|
declaration += ",\n ";
|
|
declaration += scriptNames.at(i);
|
|
declaration += " = Common";
|
|
}
|
|
|
|
declaration += "\n };\n";
|
|
|
|
scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32
|
|
declaration += " enum { ScriptSentinel = ";
|
|
declaration += QByteArray::number(scriptSentinel);
|
|
declaration += " };\n\n";
|
|
return declaration;
|
|
}
|
|
|
|
// Formats the code point range of a script block as "xxxx-yyyy" (hex, each
// bound right-justified to four characters with '0').
static QByteArray scriptBlockRangeText(int firstCodePoint, int blockSize)
{
    QByteArray text = QByteArray::number(firstCodePoint, 16).rightJustified(4, '0');
    text += '-';
    text += QByteArray::number(firstCodePoint + blockSize - 1, 16).rightJustified(4, '0');
    return text;
}

// Returns the source text of the uc_scripts table and of
// QUnicodeTables::script(). The table starts with one entry per block of 128
// code points: either the script shared by the whole block, or
// scriptSentinel + n referring to the n-th mixed-script block whose
// per-code-point scripts are appended after the first part. Mixed-script
// blocks are collected in extraBlockList.
QByteArray createScriptTableDeclaration()
{
    Q_ASSERT(scriptSentinel > 0);

    const int unicodeBlockCount = 512; // number of unicode blocks
    const int unicodeBlockSize = 128; // size of each block

    QByteArray declaration("enum { UnicodeBlockCount = ");
    declaration += QByteArray::number(unicodeBlockCount);
    declaration += " }; // number of unicode blocks\n";
    declaration += "enum { UnicodeBlockSize = ";
    declaration += QByteArray::number(unicodeBlockSize);
    declaration += " }; // size of each block\n\n";

    // script table
    declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n";
    for (int blockNo = 0; blockNo < unicodeBlockCount; ++blockNo) {
        const int block = (((blockNo << 7) & 0xff00) | ((blockNo & 1) * 0x80));

        // gather the script index of every code point in this block
        int blockAssignment[unicodeBlockSize];
        for (int x = 0; x < unicodeBlockSize; ++x)
            blockAssignment[x] = scriptAssignment.value((blockNo << 7) | x, 0);

        // the block is uniform if all code points resolve to the same script
        const int originalScript = blockAssignment[0];
        const int script = scriptHash.value(originalScript);
        bool uniform = true;
        for (int x = 1; x < unicodeBlockSize; ++x) {
            if (scriptHash.value(blockAssignment[x]) != script) {
                uniform = false;
                break;
            }
        }

        declaration += " ";
        if (uniform) {
            declaration += scriptNames.value(originalScript);
            declaration += ", /* U+";
            declaration += scriptBlockRangeText(block, unicodeBlockSize);
            declaration += " */\n";
            continue;
        }

        const int value = extraBlockList.size() + scriptSentinel;
        const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;

        declaration += QByteArray::number(value);
        declaration += ", /* U+";
        declaration += scriptBlockRangeText(block, unicodeBlockSize);
        declaration += " at offset ";
        declaration += QByteArray::number(offset);
        declaration += " */\n";

        ExtraBlock extraBlock;
        extraBlock.block = block;
        extraBlock.vector.resize(unicodeBlockSize);
        for (int x = 0; x < unicodeBlockSize; ++x)
            extraBlock.vector[x] = blockAssignment[x];

        extraBlockList.append(extraBlock);
    }

    // second part: the per-code-point data of the mixed-script blocks
    const int extraCount = extraBlockList.size();
    for (int e = 0; e < extraCount; ++e) {
        const ExtraBlock &extraBlock = extraBlockList.at(e);
        const int value = e + scriptSentinel;
        const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount;
        const int lastEntry = extraBlock.vector.size() - 1;

        declaration += "\n\n /* U+";
        declaration += scriptBlockRangeText(extraBlock.block, unicodeBlockSize);
        declaration += " at offset ";
        declaration += QByteArray::number(offset);
        declaration += " */\n ";

        for (int x = 0; x <= lastEntry; ++x) {
            declaration += scriptNames.value(extraBlock.vector.at(x));
            // no comma after the very last entry of the table
            if (x < lastEntry || e < extraCount - 1)
                declaration += ',';
            // eight entries per line
            if ((x & 7) == 7 && x < lastEntry)
                declaration += "\n ";
            else
                declaration += ' ';
        }
        if (declaration.endsWith(' '))
            declaration.chop(1);
    }
    declaration += "\n};\n\n} // namespace QUnicodeTables\n\n";

    declaration +=
            "Q_CORE_EXPORT int QT_FASTCALL QUnicodeTables::script(uint ucs4)\n"
            "{\n"
            " if (ucs4 > 0xffff)\n"
            " return Common;\n"
            " int script = uc_scripts[ucs4 >> 7];\n"
            " if (script < ScriptSentinel)\n"
            " return script;\n"
            " script = (((script - ScriptSentinel) * UnicodeBlockSize) + UnicodeBlockCount);\n"
            " script = uc_scripts[script + (ucs4 & 0x7f)];\n"
            " return script;\n"
            "}\n\n";

    qDebug("createScriptTableDeclaration: table size is %d bytes",
           unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize));

    return declaration;
}
|
|
|
|
#if 0
|
|
static void dump(int from, int to)
|
|
{
|
|
for (int i = from; i <= to; ++i) {
|
|
UnicodeData d = unicodeData.value(i, UnicodeData(i));
|
|
qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d",
|
|
i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age);
|
|
if (d.decompositionType != QChar::NoDecomposition) {
|
|
qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(),
|
|
d.decomposition[0]);
|
|
}
|
|
}
|
|
qDebug(" ");
|
|
}
|
|
#endif
|
|
|
|
struct PropertyBlock {
|
|
PropertyBlock() { index = -1; }
|
|
int index;
|
|
QList<int> properties;
|
|
bool operator==(const PropertyBlock &other)
|
|
{ return properties == other.properties; }
|
|
};
|
|
|
|
static QByteArray createPropertyInfo()
|
|
{
|
|
qDebug("createPropertyInfo:");
|
|
|
|
const int BMP_BLOCKSIZE = 32;
|
|
const int BMP_SHIFT = 5;
|
|
const int BMP_END = 0x11000;
|
|
const int SMP_END = 0x110000;
|
|
const int SMP_BLOCKSIZE = 256;
|
|
const int SMP_SHIFT = 8;
|
|
|
|
QList<PropertyBlock> blocks;
|
|
QList<int> blockMap;
|
|
|
|
int used = 0;
|
|
|
|
for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
|
|
PropertyBlock b;
|
|
for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
|
|
int uc = block*BMP_BLOCKSIZE + i;
|
|
UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
|
|
b.properties.append(d.propertyIndex);
|
|
}
|
|
int index = blocks.indexOf(b);
|
|
if (index == -1) {
|
|
index = blocks.size();
|
|
b.index = used;
|
|
used += BMP_BLOCKSIZE;
|
|
blocks.append(b);
|
|
}
|
|
blockMap.append(blocks.at(index).index);
|
|
}
|
|
|
|
int bmp_blocks = blocks.size();
|
|
Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
|
|
|
|
for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
|
|
PropertyBlock b;
|
|
for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
|
|
int uc = block*SMP_BLOCKSIZE + i;
|
|
UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
|
|
b.properties.append(d.propertyIndex);
|
|
}
|
|
int index = blocks.indexOf(b);
|
|
if (index == -1) {
|
|
index = blocks.size();
|
|
b.index = used;
|
|
used += SMP_BLOCKSIZE;
|
|
blocks.append(b);
|
|
}
|
|
blockMap.append(blocks.at(index).index);
|
|
}
|
|
|
|
int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
|
|
int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
|
|
int bmp_mem = bmp_block_data + bmp_trie;
|
|
qDebug(" %d unique blocks in BMP.", blocks.size());
|
|
qDebug(" block data uses: %d bytes", bmp_block_data);
|
|
qDebug(" trie data uses : %d bytes", bmp_trie);
|
|
|
|
int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
|
|
int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
|
|
int smp_mem = smp_block_data + smp_trie;
|
|
qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks);
|
|
qDebug(" block data uses: %d bytes", smp_block_data);
|
|
qDebug(" trie data uses : %d bytes", smp_trie);
|
|
|
|
qDebug("\n properties uses : %d bytes", uniqueProperties.size() * SizeOfPropertiesStruct);
|
|
qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + uniqueProperties.size() * SizeOfPropertiesStruct);
|
|
|
|
QByteArray out;
|
|
out += "static const unsigned short uc_property_trie[] = {\n";
|
|
|
|
// first write the map
|
|
out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
|
|
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
|
|
if (!(i % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
if (!((i*BMP_BLOCKSIZE) % 0x1000))
|
|
out += "\n";
|
|
out += "\n ";
|
|
}
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
out += ", ";
|
|
}
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
|
|
for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
|
|
if (!(i % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
if (!(i % (0x10000/SMP_BLOCKSIZE)))
|
|
out += "\n";
|
|
out += "\n ";
|
|
}
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
out += ", ";
|
|
}
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n";
|
|
// write the data
|
|
for (int i = 0; i < blocks.size(); ++i) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n";
|
|
const PropertyBlock &b = blocks.at(i);
|
|
for (int j = 0; j < b.properties.size(); ++j) {
|
|
if (!(j % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n ";
|
|
}
|
|
out += QByteArray::number(b.properties.at(j));
|
|
out += ", ";
|
|
}
|
|
}
|
|
|
|
// we reserve one bit more than in the assert below for the sign
|
|
Q_ASSERT(maxMirroredDiff < (1<<12));
|
|
Q_ASSERT(maxLowerCaseDiff < (1<<14));
|
|
Q_ASSERT(maxUpperCaseDiff < (1<<14));
|
|
Q_ASSERT(maxTitleCaseDiff < (1<<14));
|
|
Q_ASSERT(maxCaseFoldDiff < (1<<14));
|
|
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n};\n\n"
|
|
|
|
"#define GET_PROP_INDEX(ucs4) \\\n"
|
|
" (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
|
|
" ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
|
|
"] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
|
|
" : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
|
|
")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
|
|
" + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n"
|
|
"#define GET_PROP_INDEX_UCS2(ucs2) \\\n"
|
|
"(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) +
|
|
"] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n"
|
|
|
|
|
|
"static const QUnicodeTables::Properties uc_properties[] = {\n";
|
|
|
|
// keep in sync with the property declaration
|
|
for (int i = 0; i < uniqueProperties.size(); ++i) {
|
|
PropertyFlags p = uniqueProperties.at(i);
|
|
out += " { ";
|
|
// " ushort category : 8;\n"
|
|
out += QByteArray::number( p.category );
|
|
out += ", ";
|
|
// " ushort line_break_class : 8;\n"
|
|
out += QByteArray::number( p.line_break_class );
|
|
out += ", ";
|
|
// " ushort direction : 8;\n"
|
|
out += QByteArray::number( p.direction );
|
|
out += ", ";
|
|
// " ushort combiningClass :8;\n"
|
|
out += QByteArray::number( p.combiningClass );
|
|
out += ", ";
|
|
// " ushort joining : 2;\n"
|
|
out += QByteArray::number( p.joining );
|
|
out += ", ";
|
|
// " signed short digitValue : 6;\n /* 5 needed */"
|
|
out += QByteArray::number( p.digitValue );
|
|
out += ", ";
|
|
// " ushort unicodeVersion : 4;\n"
|
|
out += QByteArray::number( p.age );
|
|
out += ", ";
|
|
// " ushort lowerCaseSpecial : 1;\n"
|
|
// " ushort upperCaseSpecial : 1;\n"
|
|
// " ushort titleCaseSpecial : 1;\n"
|
|
// " ushort caseFoldSpecial : 1;\n"
|
|
out += QByteArray::number( p.lowerCaseSpecial );
|
|
out += ", ";
|
|
out += QByteArray::number( p.upperCaseSpecial );
|
|
out += ", ";
|
|
out += QByteArray::number( p.titleCaseSpecial );
|
|
out += ", ";
|
|
out += QByteArray::number( p.caseFoldSpecial );
|
|
out += ", ";
|
|
// " signed short mirrorDiff : 16;\n"
|
|
// " signed short lowerCaseDiff : 16;\n"
|
|
// " signed short upperCaseDiff : 16;\n"
|
|
// " signed short titleCaseDiff : 16;\n"
|
|
// " signed short caseFoldDiff : 16;\n"
|
|
out += QByteArray::number( p.mirrorDiff );
|
|
out += ", ";
|
|
out += QByteArray::number( p.lowerCaseDiff );
|
|
out += ", ";
|
|
out += QByteArray::number( p.upperCaseDiff );
|
|
out += ", ";
|
|
out += QByteArray::number( p.titleCaseDiff );
|
|
out += ", ";
|
|
out += QByteArray::number( p.caseFoldDiff );
|
|
out += ", ";
|
|
out += QByteArray::number( p.graphemeBreak );
|
|
out += ", ";
|
|
out += QByteArray::number( p.wordBreak );
|
|
out += ", ";
|
|
out += QByteArray::number( p.sentenceBreak );
|
|
out += " },\n";
|
|
}
|
|
out += "};\n\n";
|
|
|
|
out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n"
|
|
"{\n"
|
|
" int index = GET_PROP_INDEX(ucs4);\n"
|
|
" return uc_properties + index;\n"
|
|
"}\n"
|
|
"\n"
|
|
"static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n"
|
|
"{\n"
|
|
" int index = GET_PROP_INDEX_UCS2(ucs2);\n"
|
|
" return uc_properties + index;\n"
|
|
"}\n"
|
|
"\n"
|
|
"Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n"
|
|
"{\n"
|
|
" int index = GET_PROP_INDEX(ucs4);\n"
|
|
" return uc_properties + index;\n"
|
|
"}\n"
|
|
"\n"
|
|
"Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n"
|
|
"{\n"
|
|
" int index = GET_PROP_INDEX_UCS2(ucs2);\n"
|
|
" return uc_properties + index;\n"
|
|
"}\n\n";
|
|
|
|
out += "Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL QUnicodeTables::lineBreakClass(uint ucs4)\n"
|
|
"{\n"
|
|
" return (QUnicodeTables::LineBreakClass)qGetProp(ucs4)->line_break_class;\n"
|
|
"}\n\n";
|
|
|
|
out += "static const ushort specialCaseMap[] = {\n ";
|
|
for (int i = 0; i < specialCaseMap.size(); ++i) {
|
|
out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16);
|
|
if (i < specialCaseMap.size() - 1)
|
|
out += ",";
|
|
if (!specialCaseMap.at(i))
|
|
out += "\n ";
|
|
}
|
|
out += "\n};\n";
|
|
out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n";
|
|
|
|
qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2);
|
|
|
|
return out;
|
|
}
|
|
|
|
|
|
struct DecompositionBlock {
|
|
DecompositionBlock() { index = -1; }
|
|
int index;
|
|
QList<int> decompositionPositions;
|
|
bool operator ==(const DecompositionBlock &other)
|
|
{ return decompositionPositions == other.decompositionPositions; }
|
|
};
|
|
|
|
static QByteArray createCompositionInfo()
|
|
{
|
|
qDebug("createCompositionInfo:");
|
|
|
|
const int BMP_BLOCKSIZE = 16;
|
|
const int BMP_SHIFT = 4;
|
|
const int BMP_END = 0x3400; // start of Han
|
|
const int SMP_END = 0x30000;
|
|
const int SMP_BLOCKSIZE = 256;
|
|
const int SMP_SHIFT = 8;
|
|
|
|
if (SMP_END <= highestComposedCharacter)
|
|
qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter);
|
|
|
|
QList<DecompositionBlock> blocks;
|
|
QList<int> blockMap;
|
|
QList<unsigned short> decompositions;
|
|
|
|
int used = 0;
|
|
int tableIndex = 0;
|
|
|
|
for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
|
|
DecompositionBlock b;
|
|
for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
|
|
int uc = block*BMP_BLOCKSIZE + i;
|
|
UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
|
|
if (!d.decomposition.isEmpty()) {
|
|
int utf16Chars = 0;
|
|
for (int j = 0; j < d.decomposition.size(); ++j)
|
|
utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
|
|
decompositions.append(d.decompositionType + (utf16Chars<<8));
|
|
for (int j = 0; j < d.decomposition.size(); ++j) {
|
|
int code = d.decomposition.at(j);
|
|
if (code >= 0x10000) {
|
|
// save as surrogate pair
|
|
ushort high = QChar::highSurrogate(code);
|
|
ushort low = QChar::lowSurrogate(code);
|
|
decompositions.append(high);
|
|
decompositions.append(low);
|
|
} else {
|
|
decompositions.append(code);
|
|
}
|
|
}
|
|
b.decompositionPositions.append(tableIndex);
|
|
tableIndex += utf16Chars + 1;
|
|
} else {
|
|
b.decompositionPositions.append(0xffff);
|
|
}
|
|
}
|
|
int index = blocks.indexOf(b);
|
|
if (index == -1) {
|
|
index = blocks.size();
|
|
b.index = used;
|
|
used += BMP_BLOCKSIZE;
|
|
blocks.append(b);
|
|
}
|
|
blockMap.append(blocks.at(index).index);
|
|
}
|
|
|
|
int bmp_blocks = blocks.size();
|
|
Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);
|
|
|
|
for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) {
|
|
DecompositionBlock b;
|
|
for (int i = 0; i < SMP_BLOCKSIZE; ++i) {
|
|
int uc = block*SMP_BLOCKSIZE + i;
|
|
UnicodeData d = unicodeData.value(uc, UnicodeData(uc));
|
|
if (!d.decomposition.isEmpty()) {
|
|
int utf16Chars = 0;
|
|
for (int j = 0; j < d.decomposition.size(); ++j)
|
|
utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1;
|
|
decompositions.append(d.decompositionType + (utf16Chars<<8));
|
|
for (int j = 0; j < d.decomposition.size(); ++j) {
|
|
int code = d.decomposition.at(j);
|
|
if (code >= 0x10000) {
|
|
// save as surrogate pair
|
|
ushort high = QChar::highSurrogate(code);
|
|
ushort low = QChar::lowSurrogate(code);
|
|
decompositions.append(high);
|
|
decompositions.append(low);
|
|
} else {
|
|
decompositions.append(code);
|
|
}
|
|
}
|
|
b.decompositionPositions.append(tableIndex);
|
|
tableIndex += utf16Chars + 1;
|
|
} else {
|
|
b.decompositionPositions.append(0xffff);
|
|
}
|
|
}
|
|
int index = blocks.indexOf(b);
|
|
if (index == -1) {
|
|
index = blocks.size();
|
|
b.index = used;
|
|
used += SMP_BLOCKSIZE;
|
|
blocks.append(b);
|
|
}
|
|
blockMap.append(blocks.at(index).index);
|
|
}
|
|
|
|
int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
|
|
int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
|
|
int bmp_mem = bmp_block_data + bmp_trie;
|
|
qDebug(" %d unique blocks in BMP.", blocks.size());
|
|
qDebug(" block data uses: %d bytes", bmp_block_data);
|
|
qDebug(" trie data uses : %d bytes", bmp_trie);
|
|
qDebug(" memory usage: %d bytes", bmp_mem);
|
|
|
|
int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2;
|
|
int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2;
|
|
int smp_mem = smp_block_data + smp_trie;
|
|
qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks);
|
|
qDebug(" block data uses: %d bytes", smp_block_data);
|
|
qDebug(" trie data uses : %d bytes", smp_trie);
|
|
|
|
qDebug("\n decomposition table use : %d bytes", decompositions.size()*2);
|
|
qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2);
|
|
|
|
QByteArray out;
|
|
|
|
out += "static const unsigned short uc_decomposition_trie[] = {\n";
|
|
|
|
// first write the map
|
|
out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
|
|
for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
|
|
if (!(i % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
if (!((i*BMP_BLOCKSIZE) % 0x1000))
|
|
out += "\n";
|
|
out += "\n ";
|
|
}
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
out += ", ";
|
|
}
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";;
|
|
for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) {
|
|
if (!(i % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
if (!(i % (0x10000/SMP_BLOCKSIZE)))
|
|
out += "\n";
|
|
out += "\n ";
|
|
}
|
|
out += QByteArray::number(blockMap.at(i) + blockMap.size());
|
|
out += ", ";
|
|
}
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n";
|
|
// write the data
|
|
for (int i = 0; i < blocks.size(); ++i) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n";
|
|
const DecompositionBlock &b = blocks.at(i);
|
|
for (int j = 0; j < b.decompositionPositions.size(); ++j) {
|
|
if (!(j % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n ";
|
|
}
|
|
out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
|
|
out += ", ";
|
|
}
|
|
}
|
|
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n};\n\n"
|
|
|
|
"#define GET_DECOMPOSITION_INDEX(ucs4) \\\n"
|
|
" (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n"
|
|
" ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) +
|
|
"] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n"
|
|
" : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n"
|
|
" ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) +
|
|
")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]"
|
|
" + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n"
|
|
" : 0xffff))\n\n"
|
|
|
|
"static const unsigned short uc_decomposition_map[] = {\n";
|
|
|
|
for (int i = 0; i < decompositions.size(); ++i) {
|
|
if (!(i % 8)) {
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n ";
|
|
}
|
|
out += "0x" + QByteArray::number(decompositions.at(i), 16);
|
|
out += ", ";
|
|
}
|
|
|
|
if (out.endsWith(' '))
|
|
out.chop(1);
|
|
out += "\n};\n\n";
|
|
|
|
return out;
|
|
}
|
|
|
|
// Generates the source text of the ligature (canonical composition) tables:
//  - uc_ligature_trie: a two-stage table (block map followed by the unique
//    32-entry blocks) covering code points below BMP_END, indexed by the
//    second character of a pair,
//  - the GET_LIGATURE_INDEX macro that performs the lookup,
//  - uc_ligature_map: for each second character, a count followed by that
//    many (first character, ligature) pairs, sorted with qSort().
// Size statistics are reported through qDebug().
static QByteArray createLigatureInfo()
{
    qDebug("createLigatureInfo: numLigatures=%d", numLigatures);

    QList<DecompositionBlock> blocks;
    QList<int> blockMap;
    QList<unsigned short> ligatures;

    const int BMP_BLOCKSIZE = 32;
    const int BMP_SHIFT = 5;
    const int BMP_END = 0x3100;
    Q_ASSERT(highestLigature < BMP_END);

    int used = 0;
    int tableIndex = 0;

    for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) {
        DecompositionBlock b;
        for (int i = 0; i < BMP_BLOCKSIZE; ++i) {
            int uc = block*BMP_BLOCKSIZE + i;
            QList<Ligature> l = ligatureHashes.value(uc);
            if (l.isEmpty()) {
                b.decompositionPositions.append(0xffff);
                continue;
            }

            qSort(l);

            ligatures.append(l.size());
            for (int j = 0; j < l.size(); ++j) {
                Q_ASSERT(l.at(j).u2 == uc);
                ligatures.append(l.at(j).u1);
                ligatures.append(l.at(j).ligature);
            }
            // Positions are emitted as 16-bit values and 0xffff means "no
            // ligature", so a real position must stay below the sentinel.
            Q_ASSERT(tableIndex < 0xffff);
            b.decompositionPositions.append(tableIndex);
            tableIndex += 2*l.size() + 1;
        }

        // Reuse an identical block if one was stored before.
        int index = blocks.indexOf(b);
        if (index == -1) {
            index = blocks.size();
            b.index = used;
            used += BMP_BLOCKSIZE;
            blocks.append(b);
        }
        blockMap.append(blocks.at(index).index);
    }

    int bmp_blocks = blocks.size();
    Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE);

    int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2;
    int bmp_trie = BMP_END/BMP_BLOCKSIZE*2;
    int bmp_mem = bmp_block_data + bmp_trie;
    qDebug(" %d unique blocks in BMP.", blocks.size());
    qDebug(" block data uses: %d bytes", bmp_block_data);
    qDebug(" trie data uses : %d bytes", bmp_trie);
    qDebug("\n ligature data uses : %d bytes", ligatures.size()*2);
    qDebug(" memory usage: %d bytes", bmp_mem + ligatures.size() * 2);

    QByteArray out;

    out += "static const unsigned short uc_ligature_trie[] = {\n";

    // first write the map
    // Block offsets are relative to the block data, which follows the map in
    // the same array, hence the blockMap.size() bias on every entry.
    out += " // 0 - 0x" + QByteArray::number(BMP_END, 16);
    for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            if (!((i*BMP_BLOCKSIZE) % 0x1000))
                out += "\n";
            out += "\n ";
        }
        out += QByteArray::number(blockMap.at(i) + blockMap.size());
        out += ", ";
    }
    if (out.endsWith(' '))
        out.chop(1);
    out += "\n";
    // write the data
    for (int i = 0; i < blocks.size(); ++i) {
        if (out.endsWith(' '))
            out.chop(1);
        out += "\n";
        const DecompositionBlock &b = blocks.at(i);
        for (int j = 0; j < b.decompositionPositions.size(); ++j) {
            if (!(j % 8)) {
                if (out.endsWith(' '))
                    out.chop(1);
                out += "\n ";
            }
            out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16);
            out += ", ";
        }
    }
    if (out.endsWith(' '))
        out.chop(1);
    // The macro body deliberately carries no trailing semicolon so that it
    // can be used inside larger expressions as well as in statements.
    out += "\n};\n\n"

           "#define GET_LIGATURE_INDEX(u2) "
           "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? "
           "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) +
           "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff)\n\n"

           "static const unsigned short uc_ligature_map[] = {\n";

    for (int i = 0; i < ligatures.size(); ++i) {
        if (!(i % 8)) {
            if (out.endsWith(' '))
                out.chop(1);
            out += "\n ";
        }
        out += "0x" + QByteArray::number(ligatures.at(i), 16);
        out += ", ";
    }

    if (out.endsWith(' '))
        out.chop(1);
    out += "\n};\n\n";

    return out;
}
|
|
|
|
// Returns the source text declaring the CasingInfo bit-field struct
// (codePoint: 16 bits, flags: 8 bits, offset: 8 bits).
QByteArray createCasingInfo()
{
    static const char declaration[] =
        "struct CasingInfo {\n"
        " uint codePoint : 16;\n"
        " uint flags : 8;\n"
        " uint offset : 8;\n"
        "};\n\n";

    return QByteArray(declaration);
}
|
|
|
|
|
|
int main(int, char **)
|
|
{
|
|
initAgeMap();
|
|
initCategoryMap();
|
|
initDecompositionMap();
|
|
initDirectionMap();
|
|
initJoiningMap();
|
|
initGraphemeBreak();
|
|
initWordBreak();
|
|
initSentenceBreak();
|
|
initLineBreak();
|
|
|
|
readUnicodeData();
|
|
readBidiMirroring();
|
|
readArabicShaping();
|
|
readDerivedAge();
|
|
readDerivedNormalizationProps();
|
|
readSpecialCasing();
|
|
readCaseFolding();
|
|
// readBlocks();
|
|
readScripts();
|
|
readGraphemeBreak();
|
|
readWordBreak();
|
|
readSentenceBreak();
|
|
readLineBreak();
|
|
|
|
computeUniqueProperties();
|
|
QByteArray properties = createPropertyInfo();
|
|
QByteArray compositions = createCompositionInfo();
|
|
QByteArray ligatures = createLigatureInfo();
|
|
QByteArray normalizationCorrections = createNormalizationCorrections();
|
|
QByteArray scriptEnumDeclaration = createScriptEnumDeclaration();
|
|
QByteArray scriptTableDeclaration = createScriptTableDeclaration();
|
|
|
|
QByteArray header =
|
|
"/****************************************************************************\n"
|
|
"**\n"
|
|
"** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).\n"
|
|
"** All rights reserved.\n"
|
|
"** Contact: Nokia Corporation (qt-info@nokia.com)\n"
|
|
"**\n"
|
|
"** This file is part of the QtCore module of the Qt Toolkit.\n"
|
|
"**\n"
|
|
"** $QT_BEGIN_LICENSE:LGPL$\n"
|
|
"** No Commercial Usage\n"
|
|
"** This file contains pre-release code and may not be distributed.\n"
|
|
"** You may use this file in accordance with the terms and conditions\n"
|
|
"** contained in the Technology Preview License Agreement accompanying\n"
|
|
"** this package.\n"
|
|
"**\n"
|
|
"** GNU Lesser General Public License Usage\n"
|
|
"** Alternatively, this file may be used under the terms of the GNU Lesser\n"
|
|
"** General Public License version 2.1 as published by the Free Software\n"
|
|
"** Foundation and appearing in the file LICENSE.LGPL included in the\n"
|
|
"** packaging of this file. Please review the following information to\n"
|
|
"** ensure the GNU Lesser General Public License version 2.1 requirements\n"
|
|
"** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n"
|
|
"**\n"
|
|
"** In addition, as a special exception, Nokia gives you certain additional\n"
|
|
"** rights. These rights are described in the Nokia Qt LGPL Exception\n"
|
|
"** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n"
|
|
"**\n"
|
|
"** If you have questions regarding the use of this file, please contact\n"
|
|
"** Nokia at qt-info@nokia.com.\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"**\n"
|
|
"** $QT_END_LICENSE$\n"
|
|
"**\n"
|
|
"****************************************************************************/\n\n";
|
|
|
|
QByteArray note =
|
|
"/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n";
|
|
|
|
QByteArray warning =
|
|
"//\n"
|
|
"// W A R N I N G\n"
|
|
"// -------------\n"
|
|
"//\n"
|
|
"// This file is not part of the Qt API. It exists for the convenience\n"
|
|
"// of internal files. This header file may change from version to version\n"
|
|
"// without notice, or even be removed.\n"
|
|
"//\n"
|
|
"// We mean it.\n"
|
|
"//\n\n";
|
|
|
|
QFile f("../../src/corelib/tools/qunicodetables.cpp");
|
|
f.open(QFile::WriteOnly|QFile::Truncate);
|
|
f.write(header);
|
|
f.write(note);
|
|
f.write("QT_BEGIN_NAMESPACE\n\n");
|
|
f.write(properties);
|
|
f.write(compositions);
|
|
f.write(ligatures);
|
|
f.write(normalizationCorrections);
|
|
f.write(scriptTableDeclaration);
|
|
f.write("QT_END_NAMESPACE\n");
|
|
f.close();
|
|
|
|
f.setFileName("../../src/corelib/tools/qunicodetables_p.h");
|
|
f.open(QFile::WriteOnly | QFile::Truncate);
|
|
f.write(header);
|
|
f.write(note);
|
|
f.write(warning);
|
|
f.write("#ifndef QUNICODETABLES_P_H\n"
|
|
"#define QUNICODETABLES_P_H\n\n"
|
|
"#include <QtCore/qchar.h>\n\n"
|
|
"QT_BEGIN_NAMESPACE\n\n");
|
|
f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n");
|
|
f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n");
|
|
f.write("namespace QUnicodeTables {\n\n");
|
|
f.write(property_string);
|
|
f.write("\n");
|
|
f.write(scriptEnumDeclaration);
|
|
f.write("\n");
|
|
f.write(grapheme_break_string);
|
|
f.write("\n");
|
|
f.write(word_break_string);
|
|
f.write("\n");
|
|
f.write(sentence_break_string);
|
|
f.write("\n");
|
|
f.write(lineBreakClass);
|
|
f.write("\n");
|
|
f.write(methods);
|
|
f.write("} // namespace QUnicodeTables\n\n"
|
|
"QT_END_NAMESPACE\n\n"
|
|
"#endif // QUNICODETABLES_P_H\n");
|
|
f.close();
|
|
|
|
qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff;
|
|
qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff;
|
|
qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff;
|
|
qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff;
|
|
qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff;
|
|
#if 0
|
|
// dump(0, 0x7f);
|
|
// dump(0x620, 0x640);
|
|
// dump(0x10000, 0x10020);
|
|
// dump(0x10800, 0x10820);
|
|
|
|
qDebug("decompositionLength used:");
|
|
int totalcompositions = 0;
|
|
int sum = 0;
|
|
for (int i = 1; i < 20; ++i) {
|
|
qDebug(" length %d used %d times", i, decompositionLength.value(i, 0));
|
|
totalcompositions += i*decompositionLength.value(i, 0);
|
|
sum += decompositionLength.value(i, 0);
|
|
}
|
|
qDebug(" len decomposition map %d, average length %f, num composed chars %d",
|
|
totalcompositions, (float)totalcompositions/(float)sum, sum);
|
|
qDebug("highest composed character %x", highestComposedCharacter);
|
|
qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature);
|
|
|
|
qBubbleSort(ligatures);
|
|
for (int i = 0; i < ligatures.size(); ++i)
|
|
qDebug("%s", ligatures.at(i).data());
|
|
|
|
// qDebug("combiningClass usage:");
|
|
// int numClasses = 0;
|
|
// for (int i = 0; i < 255; ++i) {
|
|
// int num = combiningClassUsage.value(i, 0);
|
|
// if (num) {
|
|
// ++numClasses;
|
|
// qDebug(" combiningClass %d used %d times", i, num);
|
|
// }
|
|
// }
|
|
// qDebug("total of %d combining classes used", numClasses);
|
|
|
|
#endif
|
|
}
|