a2f3849b0d
X-SVN-Rev: 32193
861 lines
32 KiB
C++
861 lines
32 KiB
C++
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 1999-2012, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
* file name: corepropsbuilder.cpp (was store.c & props2.cpp)
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 1999dec11
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* Store Unicode character properties efficiently for
|
|
* random access.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/udata.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/unistr.h"
|
|
#include "unicode/usetiter.h"
|
|
#include "unicode/uscript.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
#include "genprops.h"
|
|
#include "propsvec.h"
|
|
#include "uassert.h"
|
|
#include "unewdata.h"
|
|
#include "uprops.h"
|
|
#include "utrie2.h"
|
|
#include "writesrc.h"
|
|
|
|
/* Unicode character properties file format ------------------------------------
|
|
|
|
The file format prepared and written here contains several data
|
|
structures that store indexes or data.
|
|
|
|
Before the data contents described below, there are the headers required by
|
|
the udata API for loading ICU data. Especially, a UDataInfo structure
|
|
precedes the actual data. It contains platform properties values and the
|
|
file format version.
|
|
|
|
The following is a description of format version 7.1 .
|
|
|
|
Data contents:
|
|
|
|
The contents is a parsed, binary form of several Unicode character
|
|
database files, most prominently UnicodeData.txt.
|
|
|
|
Any Unicode code point from 0 to 0x10ffff can be looked up to get
|
|
the properties, if any, for that code point. This means that the input
|
|
to the lookup are 21-bit unsigned integers, with not all of the
|
|
21-bit range used.
|
|
|
|
It is assumed that client code keeps a uint32_t pointer
|
|
to the beginning of the data:
|
|
|
|
const uint32_t *p32;
|
|
|
|
Formally, the file contains the following structures:
|
|
|
|
const int32_t indexes[16] with values i0..i15:
|
|
|
|
i0 indicates the length of the main trie.
|
|
i0..i3 all have the same value in format versions 4.0 and higher;
|
|
the related props32[] and exceptions[] and uchars[] were used in format version 3
|
|
|
|
i0 propsIndex; -- 32-bit unit index to the table of 32-bit properties words
|
|
i1 exceptionsIndex; -- 32-bit unit index to the table of 32-bit exception words
|
|
i2 exceptionsTopIndex; -- 32-bit unit index to the array of UChars for special mappings
|
|
|
|
i3 additionalTrieIndex; -- 32-bit unit index to the additional trie for more properties
|
|
i4 additionalVectorsIndex; -- 32-bit unit index to the table of properties vectors
|
|
i5 additionalVectorsColumns; -- number of 32-bit words per properties vector
|
|
|
|
i6 scriptExtensionsIndex; -- 32-bit unit index to the Script_Extensions data
|
|
i7 reservedIndex7; -- 32-bit unit index to the top of the Script_Extensions data
|
|
i8 reservedIndex8; -- for now: i7, i8 and i9 have the same values
|
|
i9 dataTopIndex; -- size of the data file (number of 32-bit units after the header)
|
|
|
|
i10 maxValues; -- maximum code values for vector word 0, see uprops.h (new in format version 3.1+)
|
|
i11 maxValues2; -- maximum code values for vector word 2, see uprops.h (new in format version 3.2)
|
|
i12..i15 reservedIndexes; -- reserved values; 0 for now
|
|
|
|
PT serialized properties trie, see utrie2.h (byte size: 4*(i0-16))
|
|
|
|
P, E, and U are not used (empty) in format versions 4 and above
|
|
|
|
P const uint32_t props32[i1-i0];
|
|
E const uint32_t exceptions[i2-i1];
|
|
U const UChar uchars[2*(i3-i2)];
|
|
|
|
AT serialized trie for additional properties (byte size: 4*(i4-i3))
|
|
PV const uint32_t propsVectors[(i6-i4)/i5][i5]==uint32_t propsVectors[i6-i4];
|
|
|
|
SCX const uint16_t scriptExtensions[2*(i7-i6)];
|
|
|
|
SCX contains Script_Extensions lists and (Script code, Script_Extensions index) pairs.
|
|
A Script_Extensions list is a sequence of UScriptCode values in ascending order,
|
|
with the last code having bit 15 set for termination.
|
|
A (Script code, Script_Extensions index) pair is the main UScriptCode (Script value)
|
|
followed by the index of the Script_Extensions list.
|
|
If the propsVectors[] column 0 value indicates that there are Script_Extensions,
|
|
then the UPROPS_SCRIPT_MASK bit field is an index to either a list or a pair in SCX,
|
|
rather than the Script itself. The high bits in the UPROPS_SCRIPT_X_MASK fields
|
|
indicate whether the main Script value is Common or Inherited (and the index is to a list)
|
|
vs. another value (and the index is to a pair).
|
|
(See UPROPS_SCRIPT_X_WITH_COMMON etc. in uprops.h.)
|
|
|
|
Trie lookup and properties:
|
|
|
|
In order to condense the data for the 21-bit code space, several properties of
|
|
the Unicode code assignment are exploited:
|
|
- The code space is sparse.
|
|
- There are several 10k of consecutive codes with the same properties.
|
|
- Characters and scripts are allocated in groups of 16 code points.
|
|
- Inside blocks for scripts the properties are often repetitive.
|
|
- The 21-bit space is not fully used for Unicode.
|
|
|
|
The lookup of properties for a given code point is done with a trie lookup,
|
|
using the UTrie implementation.
|
|
The trie lookup result is a 16-bit properties word.
|
|
|
|
With a given Unicode code point
|
|
|
|
UChar32 c;
|
|
|
|
and 0<=c<0x110000, the lookup is done like this:
|
|
|
|
uint16_t props;
|
|
UTRIE_GET16(trie, c, props);
|
|
|
|
Each 16-bit properties word contains:
|
|
|
|
0.. 4 general category
|
|
5 reserved
|
|
6..15 numeric type and value (ntv)
|
|
|
|
Encoding of numeric type and value in the 10-bit ntv field:
|
|
ntv type value
|
|
0 U_NT_NONE 0
|
|
1..10 U_NT_DECIMAL 0..9
|
|
11..20 U_NT_DIGIT 0..9
|
|
21..0x2ff U_NT_NUMERIC see below
|
|
0x300..0x3ff reserved
|
|
|
|
For U_NT_NUMERIC:
|
|
ntv value
|
|
21..0xaf integer 0..154
|
|
0xb0..0x1df fraction ((ntv>>4)-12) / ((ntv&0xf)+1) = -1..17 / 1..16
|
|
0x1e0..0x2ff large int ((ntv>>5)-14) * 10^((ntv&0x1f)+2) = (1..9)*(10^2..10^33)
|
|
(only one significant decimal digit)
|
|
0x300..0x323 base-60 (sexagesimal) integer (new in format version 7.1)
|
|
((ntv>>2)-0xbf) * 60^((ntv&3)+1) = (1..9)*(60^1..60^4)
|
|
|
|
--- Additional properties (new in format version 2.1) ---
|
|
|
|
The second trie for additional properties (AT) is also a UTrie with 16-bit data.
|
|
The data words consist of 32-bit unit indexes (not row indexes!) into the
|
|
table of unique properties vectors (PV).
|
|
Each vector contains a set of properties.
|
|
The width of a vector (number of uint32_t per row) may change
|
|
with the formatVersion, it is stored in i5.
|
|
|
|
Current properties: see icu/source/common/uprops.h
|
|
|
|
--- Changes in format version 3.1 ---
|
|
|
|
See i10 maxValues above, contains only UBLOCK_COUNT and USCRIPT_CODE_LIMIT.
|
|
|
|
--- Changes in format version 3.2 ---
|
|
|
|
- The tries use linear Latin-1 ranges.
|
|
- The additional properties bits store full properties XYZ instead
|
|
of partial Other_XYZ, so that changes in the derivation formulas
|
|
need not be tracked in runtime library code.
|
|
- Joining Type and Line Break are also stored completely, so that uprops.c
|
|
needs no runtime formulas for enumerated properties either.
|
|
- Store the case-sensitive flag in the main properties word.
|
|
- i10 also contains U_LB_COUNT and U_EA_COUNT.
|
|
- i11 contains maxValues2 for vector word 2.
|
|
|
|
--- Changes in format version 4 ---
|
|
|
|
The format changes between version 3 and 4 because the properties related to
|
|
case mappings and bidi/shaping are pulled out into separate files
|
|
for modularization.
|
|
In order to reduce the need for code changes, some of the previous data
|
|
structures are omitted, rather than rearranging everything.
|
|
|
|
(The change to format version 4 is for ICU 3.4. The last CVS revision of
|
|
genprops/store.c for format version 3.2 is 1.48.)
|
|
|
|
The main trie's data is significantly simplified:
|
|
- The trie's 16-bit data word is used directly instead of as an index
|
|
into props32[].
|
|
- The trie uses the default trie folding functions instead of custom ones.
|
|
- Numeric values are stored directly in the trie data word, with special
|
|
encodings.
|
|
- No more exception data (the data that needed it was pulled out, or, in the
|
|
case of numeric values, encoded differently).
|
|
- No more string data (pulled out - was for case mappings).
|
|
|
|
Also, some of the previously used properties vector bits are reserved again.
|
|
|
|
The indexes[] values for the omitted structures are still filled in
|
|
(indicating zero-length arrays) so that the swapper code remains unchanged.
|
|
|
|
--- Changes in format version 5 ---
|
|
|
|
Format version 5 became necessary because the bit field for script codes
|
|
overflowed. The changes are incompatible because
|
|
old code would have seen nonsensically low values for new, higher script codes.
|
|
|
|
Rearranged bit fields in the second trie (AT) and widened three (Script, Block,
|
|
Word_Break) by one bit each.
|
|
|
|
Modified bit fields in icu/source/common/uprops.h
|
|
|
|
--- Changes in format version 6 ---
|
|
|
|
Format version 6 became necessary because Unicode 5.2 adds fractions with
|
|
denominators 9, 10 and 16, and it was easier to redesign the encoding of numeric
|
|
types and values rather than add another variant to the previous format.
|
|
|
|
--- Changes in format version 7 ---
|
|
|
|
Unicode 6.0 adds Script_Extensions. For characters with script extensions data,
|
|
the script code bits are an index into the new Script_Extensions array rather
|
|
than a script code.
|
|
|
|
Change from UTrie to UTrie2.
|
|
|
|
--- Changes in format version 7.1 ---
|
|
|
|
Unicode 6.2 adds sexagesimal (base-60) numeric values:
|
|
cp;12432;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS DISH;nv=216000
|
|
cp;12433;na=CUNEIFORM NUMERIC SIGN SHAR2 TIMES GAL PLUS MIN;nv=432000
|
|
|
|
The encoding of numeric values was extended to handle such values.
|
|
|
|
----------------------------------------------------------------------------- */
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
/* UDataInfo cf. udata.h */
|
|
static UDataInfo dataInfo={
|
|
sizeof(UDataInfo),
|
|
0,
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
U_CHARSET_FAMILY,
|
|
U_SIZEOF_UCHAR,
|
|
0,
|
|
|
|
{ 0x55, 0x50, 0x72, 0x6f }, /* dataFormat="UPro" */
|
|
{ 7, 1, 0, 0 }, /* formatVersion */
|
|
{ 6, 2, 0, 0 } /* dataVersion */
|
|
};
|
|
|
|
class CorePropsBuilder : public PropsBuilder {
|
|
public:
|
|
CorePropsBuilder(UErrorCode &errorCode);
|
|
virtual ~CorePropsBuilder();
|
|
|
|
virtual void setUnicodeVersion(const UVersionInfo version);
|
|
virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
|
|
virtual void build(UErrorCode &errorCode);
|
|
virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
|
|
virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
|
|
|
|
private:
|
|
void setGcAndNumeric(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
|
|
|
|
UTrie2 *pTrie;
|
|
UTrie2 *props2Trie;
|
|
UPropsVectors *pv;
|
|
UnicodeString scriptExtensions;
|
|
};
|
|
|
|
CorePropsBuilder::CorePropsBuilder(UErrorCode &errorCode)
|
|
: pTrie(NULL), props2Trie(NULL), pv(NULL) {
|
|
pTrie=utrie2_open(0, 0, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: corepropsbuilder utrie2_open() failed - %s\n",
|
|
u_errorName(errorCode));
|
|
}
|
|
pv=upvec_open(UPROPS_VECTOR_WORDS, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: corepropsbuilder upvec_open() failed - %s\n",
|
|
u_errorName(errorCode));
|
|
}
|
|
}
|
|
|
|
CorePropsBuilder::~CorePropsBuilder() {
|
|
utrie2_close(pTrie);
|
|
utrie2_close(props2Trie);
|
|
upvec_close(pv);
|
|
}
|
|
|
|
void
|
|
CorePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
|
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
|
}
|
|
|
|
// For nt=U_NT_NUMERIC.
|
|
static int32_t
|
|
encodeNumericValue(UChar32 start, const char *s, UErrorCode &errorCode) {
|
|
const char *original;
|
|
/* get a possible minus sign */
|
|
UBool isNegative;
|
|
if(*s=='-') {
|
|
isNegative=TRUE;
|
|
++s;
|
|
} else {
|
|
isNegative=FALSE;
|
|
}
|
|
|
|
int32_t value=0, den=0, exp=0, ntv=0;
|
|
char *numberLimit;
|
|
/* try large, single-significant-digit numbers, may otherwise overflow strtoul() */
|
|
if('1'<=s[0] && s[0]<='9' && s[1]=='0' && s[2]=='0') {
|
|
value=s[0]-'0';
|
|
numberLimit=const_cast<char *>(s);
|
|
while(*(++numberLimit)=='0') {
|
|
++exp;
|
|
}
|
|
} else {
|
|
/* normal number parsing */
|
|
unsigned long ul=uprv_strtoul(s, &numberLimit, 10);
|
|
if(s==numberLimit || (*numberLimit!=0 && *numberLimit!='/') || ul>0x7fffffff) {
|
|
ntv=-1;
|
|
} else {
|
|
value=(int32_t)ul;
|
|
}
|
|
if(ntv>=0 && *numberLimit=='/') {
|
|
/* fractional value, get the denominator */
|
|
s=numberLimit+1;
|
|
ul=uprv_strtoul(s, &numberLimit, 10);
|
|
if(s==numberLimit || *numberLimit!=0 || ul==0 || ul>0x7fffffff) {
|
|
ntv=-1;
|
|
} else {
|
|
den=(int32_t)ul;
|
|
}
|
|
}
|
|
}
|
|
if(isNegative) {
|
|
value=-(int32_t)value;
|
|
}
|
|
|
|
if(ntv<0) {
|
|
// pass
|
|
} else if(den==0 && value>=0) {
|
|
if(exp==2 && (value*100)<=UPROPS_NTV_MAX_SMALL_INT) {
|
|
/* small integer parsed like a large one */
|
|
ntv=UPROPS_NTV_NUMERIC_START+value*100;
|
|
} else if(exp==0) {
|
|
if(value<=UPROPS_NTV_MAX_SMALL_INT) {
|
|
/* small integer */
|
|
ntv=UPROPS_NTV_NUMERIC_START+value;
|
|
} else {
|
|
/* large integer parsed like a small one */
|
|
/* split the value into mantissa and exponent, base 10 */
|
|
int32_t mant=value;
|
|
while((mant%10)==0) {
|
|
mant/=10;
|
|
++exp;
|
|
}
|
|
// Note: value<=0x7fffffff guarantees exp<=33
|
|
if(mant<=9) {
|
|
ntv=((mant+14)<<5)+(exp-2);
|
|
} else {
|
|
// Try sexagesimal (base 60) numbers.
|
|
mant=value;
|
|
exp=0;
|
|
while((mant%60)==0) {
|
|
mant/=60;
|
|
++exp;
|
|
}
|
|
if(mant<=9 && exp<=4) {
|
|
ntv=((mant+0xbf)<<2)+(exp-1);
|
|
} else {
|
|
ntv=-1;
|
|
}
|
|
}
|
|
}
|
|
} else if(2<=exp && exp<=33 && 1<=value && value<=9) {
|
|
/* large, single-significant-digit integer */
|
|
ntv=((value+14)<<5)+(exp-2);
|
|
} else {
|
|
ntv=-1;
|
|
}
|
|
} else if(exp==0 && -1<=value && value<=17 && 1<=den && den<=16) {
|
|
/* fraction */
|
|
ntv=((value+12)<<4)+(den-1);
|
|
} else if(exp==0 && value==-1 && den==0) {
|
|
/* -1 parsed with den=0, encoded as pseudo-fraction -1/1 */
|
|
ntv=((value+12)<<4);
|
|
} else {
|
|
ntv=-1;
|
|
}
|
|
if(ntv<0 || *numberLimit!=0) {
|
|
fprintf(stderr, "genprops error: unable to encode numeric value nv=%s\n", original);
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
}
|
|
return ntv;
|
|
}
|
|
|
|
void
|
|
CorePropsBuilder::setGcAndNumeric(const UniProps &props, const UnicodeSet &newValues,
|
|
UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
UChar32 start=props.start;
|
|
UChar32 end=props.end;
|
|
|
|
int32_t type=props.getIntProp(UCHAR_NUMERIC_TYPE);
|
|
const char *nvString=props.numericValue;
|
|
if(type!=U_NT_NONE && nvString==NULL && start==end) {
|
|
fprintf(stderr, "genprops error: cp line has Numeric_Type but no Numeric_Value\n");
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
if(!newValues.contains(UCHAR_GENERAL_CATEGORY) && !newValues.contains(UCHAR_NUMERIC_VALUE)) {
|
|
return;
|
|
}
|
|
|
|
int32_t ntv=UPROPS_NTV_NONE; // numeric type & value
|
|
if(nvString!=NULL) {
|
|
int32_t digitValue=props.digitValue;
|
|
if( type<=U_NT_NONE || U_NT_NUMERIC<type ||
|
|
((type==U_NT_DECIMAL || type==U_NT_DIGIT) && digitValue<0)
|
|
) {
|
|
fprintf(stderr, "genprops error: nt=%d but nv=%s\n",
|
|
(int)type, nvString==NULL ? "NULL" : nvString);
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
switch(type) {
|
|
case U_NT_NONE:
|
|
ntv=UPROPS_NTV_NONE;
|
|
break;
|
|
case U_NT_DECIMAL:
|
|
ntv=UPROPS_NTV_DECIMAL_START+digitValue;
|
|
break;
|
|
case U_NT_DIGIT:
|
|
ntv=UPROPS_NTV_DIGIT_START+digitValue;
|
|
break;
|
|
case U_NT_NUMERIC:
|
|
if(digitValue>=0) {
|
|
ntv=UPROPS_NTV_NUMERIC_START+digitValue;
|
|
} else {
|
|
ntv=encodeNumericValue(start, nvString, errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
return;
|
|
}
|
|
}
|
|
default:
|
|
break; // unreachable
|
|
}
|
|
}
|
|
|
|
uint32_t value=
|
|
(uint32_t)props.getIntProp(UCHAR_GENERAL_CATEGORY) |
|
|
(ntv<<UPROPS_NUMERIC_TYPE_VALUE_SHIFT);
|
|
if(start==end) {
|
|
utrie2_set32(pTrie, start, value, &errorCode);
|
|
} else {
|
|
utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
|
|
}
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "error: utrie2_setRange32(properties trie %04lX..%04lX) failed - %s\n",
|
|
(long)start, (long)end, u_errorName(errorCode));
|
|
}
|
|
}
|
|
|
|
struct PropToBinary {
|
|
int32_t prop; // UProperty
|
|
int32_t vecWord, vecShift;
|
|
};
|
|
|
|
static const PropToBinary
|
|
propToBinaries[]={
|
|
{ UCHAR_WHITE_SPACE, 1, UPROPS_WHITE_SPACE },
|
|
{ UCHAR_DASH, 1, UPROPS_DASH },
|
|
// Note: The Hyphen property is stabilized since Unicode 4.0
|
|
// and deprecated since Unicode 6.0.
|
|
{ UCHAR_HYPHEN, 1, UPROPS_HYPHEN },
|
|
{ UCHAR_QUOTATION_MARK, 1, UPROPS_QUOTATION_MARK },
|
|
{ UCHAR_TERMINAL_PUNCTUATION, 1, UPROPS_TERMINAL_PUNCTUATION },
|
|
// Note: The Hex_Digit and ASCII_Hex_Digit properties are probably stable enough
|
|
// so that they could be hardcoded.
|
|
{ UCHAR_HEX_DIGIT, 1, UPROPS_HEX_DIGIT },
|
|
{ UCHAR_ASCII_HEX_DIGIT, 1, UPROPS_ASCII_HEX_DIGIT },
|
|
{ UCHAR_IDEOGRAPHIC, 1, UPROPS_IDEOGRAPHIC },
|
|
{ UCHAR_DIACRITIC, 1, UPROPS_DIACRITIC },
|
|
{ UCHAR_EXTENDER, 1, UPROPS_EXTENDER },
|
|
// Note: The Noncharacter_Code_Point property is probably stable enough
|
|
// so that it could be hardcoded.
|
|
{ UCHAR_NONCHARACTER_CODE_POINT, 1, UPROPS_NONCHARACTER_CODE_POINT },
|
|
// Note: The Grapheme_Link property is deprecated since Unicode 5.0
|
|
// because it is a "Duplication of ccc=9" (UAX #44).
|
|
{ UCHAR_GRAPHEME_LINK, 1, UPROPS_GRAPHEME_LINK },
|
|
{ UCHAR_IDS_BINARY_OPERATOR, 1, UPROPS_IDS_BINARY_OPERATOR },
|
|
{ UCHAR_IDS_TRINARY_OPERATOR, 1, UPROPS_IDS_TRINARY_OPERATOR },
|
|
{ UCHAR_RADICAL, 1, UPROPS_RADICAL },
|
|
{ UCHAR_UNIFIED_IDEOGRAPH, 1, UPROPS_UNIFIED_IDEOGRAPH },
|
|
{ UCHAR_DEPRECATED, 1, UPROPS_DEPRECATED },
|
|
{ UCHAR_LOGICAL_ORDER_EXCEPTION, 1, UPROPS_LOGICAL_ORDER_EXCEPTION },
|
|
{ UCHAR_S_TERM, 1, UPROPS_S_TERM },
|
|
{ UCHAR_VARIATION_SELECTOR, 1, UPROPS_VARIATION_SELECTOR },
|
|
// Note: Pattern_Syntax & Pattern_White_Space are available via
|
|
// the internal PatternProps class and need not be stored here any more.
|
|
{ UCHAR_PATTERN_SYNTAX, 1, UPROPS_PATTERN_SYNTAX },
|
|
{ UCHAR_PATTERN_WHITE_SPACE, 1, UPROPS_PATTERN_WHITE_SPACE },
|
|
{ UCHAR_XID_START, 1, UPROPS_XID_START },
|
|
{ UCHAR_XID_CONTINUE, 1, UPROPS_XID_CONTINUE },
|
|
{ UCHAR_MATH, 1, UPROPS_MATH },
|
|
{ UCHAR_ALPHABETIC, 1, UPROPS_ALPHABETIC },
|
|
{ UCHAR_GRAPHEME_EXTEND, 1, UPROPS_GRAPHEME_EXTEND },
|
|
{ UCHAR_DEFAULT_IGNORABLE_CODE_POINT, 1, UPROPS_DEFAULT_IGNORABLE_CODE_POINT },
|
|
{ UCHAR_ID_START, 1, UPROPS_ID_START },
|
|
{ UCHAR_ID_CONTINUE, 1, UPROPS_ID_CONTINUE },
|
|
{ UCHAR_GRAPHEME_BASE, 1, UPROPS_GRAPHEME_BASE },
|
|
};
|
|
|
|
struct PropToEnum {
|
|
int32_t prop; // UProperty
|
|
int32_t vecWord, vecShift;
|
|
uint32_t vecMask;
|
|
};
|
|
|
|
static const PropToEnum
|
|
propToEnums[]={
|
|
// Use UPROPS_SCRIPT_X_MASK not UPROPS_SCRIPT_MASK:
|
|
// When writing a Script code, remove Script_Extensions bits as well.
|
|
// If needed, they will get written again.
|
|
{ UCHAR_SCRIPT, 0, 0, UPROPS_SCRIPT_X_MASK },
|
|
{ UCHAR_BLOCK, 0, UPROPS_BLOCK_SHIFT, UPROPS_BLOCK_MASK },
|
|
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
|
|
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
|
|
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
|
|
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
|
|
{ UCHAR_SENTENCE_BREAK, 2, UPROPS_SB_SHIFT, UPROPS_SB_MASK },
|
|
{ UCHAR_LINE_BREAK, 2, UPROPS_LB_SHIFT, UPROPS_LB_MASK },
|
|
};
|
|
|
|
void
|
|
CorePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|
UErrorCode &errorCode) {
|
|
setGcAndNumeric(props, newValues, errorCode);
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
UChar32 start=props.start;
|
|
UChar32 end=props.end;
|
|
if(start==0 && end==0x10ffff) {
|
|
// Also set bits for initialValue and errorValue.
|
|
end=UPVEC_MAX_CP;
|
|
}
|
|
if(newValues.containsSome(0, UCHAR_BINARY_LIMIT-1)) {
|
|
for(int32_t i=0; i<LENGTHOF(propToBinaries); ++i) {
|
|
const PropToBinary &p2b=propToBinaries[i];
|
|
U_ASSERT(p2b.vecShift<32);
|
|
if(newValues.contains(p2b.prop)) {
|
|
uint32_t mask=U_MASK(p2b.vecShift);
|
|
uint32_t value= props.binProps[p2b.prop] ? mask : 0;
|
|
upvec_setValue(pv, start, end, p2b.vecWord, value, mask, &errorCode);
|
|
}
|
|
}
|
|
}
|
|
if(newValues.containsSome(UCHAR_INT_START, UCHAR_INT_LIMIT-1)) {
|
|
for(int32_t i=0; i<LENGTHOF(propToEnums); ++i) {
|
|
const PropToEnum &p2e=propToEnums[i];
|
|
U_ASSERT(p2e.vecShift<32);
|
|
if(newValues.contains(p2e.prop)) {
|
|
uint32_t mask=p2e.vecMask;
|
|
uint32_t value=(uint32_t)(props.getIntProp(p2e.prop)<<p2e.vecShift);
|
|
U_ASSERT((value&mask)==value);
|
|
upvec_setValue(pv, start, end, p2e.vecWord, value, mask, &errorCode);
|
|
}
|
|
}
|
|
}
|
|
if(newValues.contains(UCHAR_AGE)) {
|
|
if(props.age[0]>15 || props.age[1]>15 || props.age[2]!=0 || props.age[3]!=0) {
|
|
char buffer[U_MAX_VERSION_STRING_LENGTH];
|
|
u_versionToString(props.age, buffer);
|
|
fprintf(stderr, "genprops error: age %s cannot be encoded\n", buffer);
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
uint32_t version=(props.age[0]<<4)|props.age[1];
|
|
upvec_setValue(pv, start, end,
|
|
0, version<<UPROPS_AGE_SHIFT, UPROPS_AGE_MASK,
|
|
&errorCode);
|
|
}
|
|
// Write a new (Script, Script_Extensions) value if there are Script_Extensions
|
|
// and either Script or Script_Extensions are new on the current line.
|
|
// (If only Script is new, then it just clobbered the relevant bits.)
|
|
if( !props.scx.isEmpty() &&
|
|
(newValues.contains(UCHAR_SCRIPT) || newValues.contains(UCHAR_SCRIPT_EXTENSIONS))
|
|
) {
|
|
UnicodeString codes; // vector of 16-bit UScriptCode values
|
|
UnicodeSetIterator iter(props.scx);
|
|
while(iter.next()) { codes.append((UChar)iter.getCodepoint()); }
|
|
|
|
// Set bit 15 on the last script code, for termination.
|
|
int32_t length=codes.length();
|
|
codes.setCharAt(length-1, (UChar)(codes[length-1]|0x8000));
|
|
// Find this list of codes in the Script_Extensions data so far, or add this list.
|
|
int32_t index=scriptExtensions.indexOf(codes);
|
|
if(index<0) {
|
|
index=scriptExtensions.length();
|
|
scriptExtensions.append(codes);
|
|
}
|
|
|
|
// Encode the (Script, Script_Extensions index) pair.
|
|
int32_t script=props.getIntProp(UCHAR_SCRIPT);
|
|
uint32_t scriptX;
|
|
if(script==USCRIPT_COMMON) {
|
|
scriptX=UPROPS_SCRIPT_X_WITH_COMMON|(uint32_t)index;
|
|
} else if(script==USCRIPT_INHERITED) {
|
|
scriptX=UPROPS_SCRIPT_X_WITH_INHERITED|(uint32_t)index;
|
|
} else {
|
|
// Store an additional pair of 16-bit units for an unusual main Script code
|
|
// together with the Script_Extensions index.
|
|
UnicodeString codeIndexPair;
|
|
codeIndexPair.append((UChar)script).append((UChar)index);
|
|
index=scriptExtensions.indexOf(codeIndexPair);
|
|
if(index<0) {
|
|
index=scriptExtensions.length();
|
|
scriptExtensions.append(codeIndexPair);
|
|
}
|
|
scriptX=UPROPS_SCRIPT_X_WITH_OTHER|(uint32_t)index;
|
|
}
|
|
if(index>UPROPS_SCRIPT_MASK) {
|
|
fprintf(stderr, "genprops: Script_Extensions indexes overflow bit field\n");
|
|
errorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
return;
|
|
}
|
|
upvec_setValue(pv, start, end, 0, scriptX, UPROPS_SCRIPT_X_MASK, &errorCode);
|
|
}
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set props2 values for %04lX..%04lX: %s\n",
|
|
(long)start, (long)end, u_errorName(errorCode));
|
|
}
|
|
}
|
|
|
|
static int32_t indexes[UPROPS_INDEX_COUNT]={
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0
|
|
};
|
|
|
|
static uint8_t trieBlock[40000];
|
|
static int32_t trieSize;
|
|
static uint8_t props2TrieBlock[100000];
|
|
static int32_t props2TrieSize;
|
|
|
|
static int32_t totalSize;
|
|
|
|
void
|
|
CorePropsBuilder::build(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
|
|
trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr,
|
|
"genprops error: utrie2_freeze(main trie)+utrie2_serialize() "
|
|
"failed: %s (length %ld)\n",
|
|
u_errorName(errorCode), (long)trieSize);
|
|
return;
|
|
}
|
|
|
|
props2Trie=upvec_compactToUTrie2WithRowIndexes(pv, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to build trie for additional properties: %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
props2TrieSize=utrie2_serialize(props2Trie,
|
|
props2TrieBlock, (int32_t)sizeof(props2TrieBlock),
|
|
&errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr,
|
|
"genprops error: utrie2_freeze(additional properties)+utrie2_serialize() failed: %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
int32_t pvRows;
|
|
const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
|
|
int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
|
|
|
|
/* round up scriptExtensions to multiple of 4 bytes */
|
|
if(scriptExtensions.length()&1) {
|
|
scriptExtensions.append((UChar)0);
|
|
}
|
|
|
|
/* set indexes */
|
|
int32_t offset=sizeof(indexes)/4; /* uint32_t offset to the properties trie */
|
|
offset+=trieSize>>2;
|
|
indexes[UPROPS_PROPS32_INDEX]= /* set indexes to the same offsets for empty */
|
|
indexes[UPROPS_EXCEPTIONS_INDEX]= /* structures from the old format version 3 */
|
|
indexes[UPROPS_EXCEPTIONS_TOP_INDEX]= /* so that less runtime code has to be changed */
|
|
indexes[UPROPS_ADDITIONAL_TRIE_INDEX]=offset;
|
|
|
|
offset+=props2TrieSize/4;
|
|
indexes[UPROPS_ADDITIONAL_VECTORS_INDEX]=offset;
|
|
indexes[UPROPS_ADDITIONAL_VECTORS_COLUMNS_INDEX]=UPROPS_VECTOR_WORDS;
|
|
offset+=pvCount;
|
|
indexes[UPROPS_SCRIPT_EXTENSIONS_INDEX]=offset;
|
|
offset+=scriptExtensions.length()/2;
|
|
indexes[UPROPS_RESERVED_INDEX_7]=offset;
|
|
indexes[UPROPS_RESERVED_INDEX_8]=offset;
|
|
indexes[UPROPS_DATA_TOP_INDEX]=offset;
|
|
totalSize=4*offset;
|
|
|
|
indexes[UPROPS_MAX_VALUES_INDEX]=
|
|
(((int32_t)U_EA_COUNT-1)<<UPROPS_EA_SHIFT)|
|
|
(((int32_t)UBLOCK_COUNT-1)<<UPROPS_BLOCK_SHIFT)|
|
|
(((int32_t)USCRIPT_CODE_LIMIT-1)&UPROPS_SCRIPT_MASK);
|
|
indexes[UPROPS_MAX_VALUES_2_INDEX]=
|
|
(((int32_t)U_LB_COUNT-1)<<UPROPS_LB_SHIFT)|
|
|
(((int32_t)U_SB_COUNT-1)<<UPROPS_SB_SHIFT)|
|
|
(((int32_t)U_WB_COUNT-1)<<UPROPS_WB_SHIFT)|
|
|
(((int32_t)U_GCB_COUNT-1)<<UPROPS_GCB_SHIFT)|
|
|
((int32_t)U_DT_COUNT-1);
|
|
|
|
if(!beQuiet) {
|
|
puts("* uprops.icu stats *");
|
|
printf("trie size in bytes: %5u\n", (int)trieSize);
|
|
printf("size in bytes of additional props trie:%5u\n", (int)props2TrieSize);
|
|
printf("number of additional props vectors: %5u\n", (int)pvRows);
|
|
printf("number of 32-bit words per vector: %5u\n", UPROPS_VECTOR_WORDS);
|
|
printf("number of 16-bit scriptExtensions: %5u\n", (int)scriptExtensions.length());
|
|
printf("data size: %6ld\n", (long)totalSize);
|
|
}
|
|
}
|
|
|
|
void
|
|
CorePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
int32_t pvRows;
|
|
const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
|
|
int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
|
|
|
|
FILE *f=usrc_create(path, "uchar_props_data.h",
|
|
"icu/tools/unicode/c/genprops/corepropsbuilder.cpp");
|
|
if(f==NULL) {
|
|
errorCode=U_FILE_ACCESS_ERROR;
|
|
return;
|
|
}
|
|
fputs("#ifndef INCLUDED_FROM_UCHAR_C\n"
|
|
"# error This file must be #included from uchar.c only.\n"
|
|
"#endif\n\n", f);
|
|
usrc_writeArray(f,
|
|
"static const UVersionInfo dataVersion={",
|
|
dataInfo.dataVersion, 8, 4,
|
|
"};\n\n");
|
|
usrc_writeUTrie2Arrays(f,
|
|
"static const uint16_t propsTrie_index[%ld]={\n", NULL,
|
|
pTrie,
|
|
"\n};\n\n");
|
|
usrc_writeUTrie2Struct(f,
|
|
"static const UTrie2 propsTrie={\n",
|
|
pTrie, "propsTrie_index", NULL,
|
|
"};\n\n");
|
|
|
|
usrc_writeUTrie2Arrays(f,
|
|
"static const uint16_t propsVectorsTrie_index[%ld]={\n", NULL,
|
|
props2Trie,
|
|
"\n};\n\n");
|
|
usrc_writeUTrie2Struct(f,
|
|
"static const UTrie2 propsVectorsTrie={\n",
|
|
props2Trie, "propsVectorsTrie_index", NULL,
|
|
"};\n\n");
|
|
|
|
usrc_writeArray(f,
|
|
"static const uint32_t propsVectors[%ld]={\n",
|
|
pvArray, 32, pvCount,
|
|
"};\n\n");
|
|
fprintf(f, "static const int32_t countPropsVectors=%ld;\n", (long)pvCount);
|
|
fprintf(f, "static const int32_t propsVectorsColumns=%ld;\n", (long)UPROPS_VECTOR_WORDS);
|
|
|
|
usrc_writeArray(f,
|
|
"static const uint16_t scriptExtensions[%ld]={\n",
|
|
scriptExtensions.getBuffer(), 16, scriptExtensions.length(),
|
|
"};\n\n");
|
|
|
|
usrc_writeArray(f,
|
|
"static const int32_t indexes[UPROPS_INDEX_COUNT]={",
|
|
indexes, 32, UPROPS_INDEX_COUNT,
|
|
"};\n\n");
|
|
fclose(f);
|
|
}
|
|
|
|
void
|
|
CorePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
int32_t pvRows;
|
|
const uint32_t *pvArray=upvec_getArray(pv, &pvRows, NULL);
|
|
int32_t pvCount=pvRows*UPROPS_VECTOR_WORDS;
|
|
|
|
UNewDataMemory *pData=udata_create(path, "icu", "uprops", &dataInfo,
|
|
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops: udata_create(%s, uprops.icu) failed - %s\n",
|
|
path, u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
udata_writeBlock(pData, indexes, sizeof(indexes));
|
|
udata_writeBlock(pData, trieBlock, trieSize);
|
|
udata_writeBlock(pData, props2TrieBlock, props2TrieSize);
|
|
udata_writeBlock(pData, pvArray, pvCount*4);
|
|
udata_writeBlock(pData, scriptExtensions.getBuffer(), scriptExtensions.length()*2);
|
|
|
|
long dataLength=udata_finish(pData, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops: error %s writing the output file\n", u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
if(dataLength!=(long)totalSize) {
|
|
fprintf(stderr,
|
|
"udata_finish(uprops.icu) reports %ld bytes written but should be %ld\n",
|
|
dataLength, (long)totalSize);
|
|
errorCode=U_INTERNAL_PROGRAM_ERROR;
|
|
}
|
|
}
|
|
|
|
PropsBuilder *
|
|
createCorePropsBuilder(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return NULL; }
|
|
PropsBuilder *pb=new CorePropsBuilder(errorCode);
|
|
if(pb==NULL) {
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
return pb;
|
|
}
|
|
|
|
/*
|
|
* Hey, Emacs, please set the following:
|
|
*
|
|
* Local Variables:
|
|
* indent-tabs-mode: nil
|
|
* End:
|
|
*
|
|
*/
|