e70b98d3f6
X-SVN-Rev: 38607
1213 lines
40 KiB
C++
1213 lines
40 KiB
C++
/*
|
|
*******************************************************************************
|
|
*
|
|
* Copyright (C) 2004-2016, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
*******************************************************************************
|
|
* file name: casepropsbuilder.cpp (was gencase/store.c)
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2004aug28
|
|
* created by: Markus W. Scherer
|
|
*
|
|
* Store Unicode case mapping properties efficiently for
|
|
* random access.
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/localpointer.h"
|
|
#include "unicode/uchar.h"
|
|
#include "unicode/udata.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/usetiter.h"
|
|
#include "unicode/ustring.h"
|
|
#include "cmemory.h"
|
|
#include "cstring.h"
|
|
#include "genprops.h"
|
|
#include "ppucd.h"
|
|
#include "uassert.h"
|
|
#include "uarrsort.h"
|
|
#include "ucase.h"
|
|
#include "unewdata.h"
|
|
#include "utrie2.h"
|
|
#include "writesrc.h"
|
|
|
|
/* Unicode case mapping properties file format ---------------------------------
|
|
|
|
The file format prepared and written here contains several data
|
|
structures that store indexes or data.
|
|
|
|
Before the data contents described below, there are the headers required by
|
|
the udata API for loading ICU data. Especially, a UDataInfo structure
|
|
precedes the actual data. It contains platform properties values and the
|
|
file format version.
|
|
|
|
The following is a description of format version 3.0 .
|
|
|
|
Format version 1.1 adds data for case closure.
|
|
|
|
Format version 1.2 adds an exception bit for case-ignorable. Needed because
|
|
the Cased and Case_Ignorable properties are not disjoint.
|
|
|
|
Format version 2.0 changes from UTrie to UTrie2.
|
|
|
|
Format version 3.0 (ICU 49) shuffles the trie bits to simplify some builder and runtime code.
|
|
It moves the Case_Ignorable flag from sometimes-trie-bit 6, sometimes-exception-bit 11
|
|
to always-trie-bit 2 and adjusts the higher trie bits accordingly.
|
|
Exception index reduced from 12 bits to 11, simple case mapping delta reduced from 10 bits to 9.
|
|
|
|
The file contains the following structures:
|
|
|
|
const int32_t indexes[i0] with values i0, i1, ...:
|
|
(see UCASE_IX_... constants for names of indexes)
|
|
|
|
i0 indexLength; -- length of indexes[] (UCASE_IX_TOP)
|
|
i1 dataLength; -- length in bytes of the post-header data (incl. indexes[])
|
|
i2 trieSize; -- size in bytes of the case mapping properties trie
|
|
i3 exceptionsLength; -- length in uint16_t of the exceptions array
|
|
i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1)
|
|
|
|
i5..i14 reservedIndexes; -- reserved values; 0 for now
|
|
|
|
i15 maxFullLength; -- maximum length of a full case mapping/folding string
|
|
|
|
|
|
Serialized trie, see utrie2.h;
|
|
|
|
const uint16_t exceptions[exceptionsLength];
|
|
|
|
const UChar unfold[unfoldLength];
|
|
|
|
|
|
Trie data word:
|
|
Bits
|
|
if(exception) {
|
|
15..5 unsigned exception index
|
|
} else {
|
|
if(not uncased) {
|
|
15..7 signed delta to simple case mapping code point
|
|
(add delta to input code point)
|
|
} else {
|
|
15..7 reserved, 0
|
|
}
|
|
6..5 0 normal character with cc=0
|
|
1 soft-dotted character
|
|
2 cc=230
|
|
3 other cc
|
|
The runtime code relies on these two bits to be adjacent with this encoding.
|
|
}
|
|
4 exception
|
|
3 case-sensitive
|
|
2 case-ignorable
|
|
1..0 0 uncased
|
|
1 lowercase
|
|
2 uppercase
|
|
3 titlecase
|
|
The runtime code relies on the case-ignorable and case type bits 2..0
|
|
to be the lowest bits with this encoding.
|
|
|
|
|
|
Exceptions:
|
|
A sub-array of the exceptions array is indexed by the exception index in a
|
|
trie word.
|
|
The sub-array consists of the following fields:
|
|
uint16_t excWord;
|
|
uint16_t optional values [];
|
|
UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase
|
|
|
|
excWord: (see UCASE_EXC_...)
|
|
Bits
|
|
15 conditional case folding
|
|
14 conditional special casing
|
|
13..12 same as non-exception trie data bits 6..5
|
|
moved here because the exception index needs more bits than the delta
|
|
0 normal character with cc=0
|
|
1 soft-dotted character
|
|
2 cc=230
|
|
3 other cc
|
|
11 reserved
|
|
(was used in formatVersion 1.2..2.0:
|
|
case-ignorable (used when the character is cased or has another exception))
|
|
10.. 9 reserved
|
|
8 if set, then for each optional-value slot there are 2 uint16_t values
|
|
(high and low parts of 32-bit values)
|
|
instead of single ones
|
|
7.. 0 bits for which optional value is present
|
|
|
|
Optional-value slots:
|
|
0 lowercase mapping (code point)
|
|
1 case folding (code point)
|
|
2 uppercase mapping (code point)
|
|
3 titlecase mapping (code point)
|
|
4 reserved
|
|
5 reserved
|
|
6 closure mappings (new in format version 1.1)
|
|
7 there is at least one full (string) case mapping
|
|
the length of each is encoded in a nibble of this optional value,
|
|
and the strings follow this optional value in the same order:
|
|
lower/fold/upper/title
|
|
|
|
The optional closure mappings value is used as follows:
|
|
Bits 0..3 contain the length of a string of code points for case closure.
|
|
The string immediately follows the full case mappings, or the closure value
|
|
slot if there are no full case mappings.
|
|
Bits 4..15 are reserved and could be used in the future to indicate the
|
|
number of strings for case closure.
|
|
Complete case closure for a code point is given by the union of all simple
|
|
and full case mappings and foldings, plus the case closure code points
|
|
(and potentially, in the future, case closure strings).
|
|
|
|
For space saving, some values are not stored. Lookups are as follows:
|
|
- If special casing is conditional, then no full lower/upper/title mapping
|
|
strings are stored.
|
|
- If case folding is conditional, then no simple or full case foldings are
|
|
stored.
|
|
- Fall back in this order:
|
|
full (string) mapping -- if full mappings are used
|
|
simple (code point) mapping of the same type
|
|
simple fold->simple lower
|
|
simple title->simple upper
|
|
finally, the original code point (no mapping)
|
|
|
|
This fallback order is strict:
|
|
In particular, the fallback from full case folding is to simple case folding,
|
|
not to full lowercase mapping.
|
|
|
|
Reverse case folding data ("unfold") array: (new in format version 1.1)
|
|
|
|
This array stores some miscellaneous values followed by a table. The data maps
|
|
back from multi-character strings to their original code points, for use
|
|
in case closure.
|
|
|
|
The table contains two columns of strings.
|
|
The string in the first column is the case folding of each of the code points
|
|
in the second column. The strings are terminated with NUL or by the end of the
|
|
column, whichever comes first.
|
|
|
|
The miscellaneous data takes up one pseudo-row and includes:
|
|
- number of rows
|
|
- number of UChars per row
|
|
- number of UChars in the left (folding string) column
|
|
|
|
The table is sorted by its first column. Values in the first column are unique.
|
|
|
|
----------------------------------------------------------------------------- */
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
/* UDataInfo cf. udata.h */
|
|
static UDataInfo dataInfo={
|
|
sizeof(UDataInfo),
|
|
0,
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
U_CHARSET_FAMILY,
|
|
U_SIZEOF_UCHAR,
|
|
0,
|
|
|
|
/* dataFormat="cAsE" */
|
|
{ UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 },
|
|
{ 3, 0, 0, 0 }, /* formatVersion */
|
|
{ 6, 0, 0, 0 } /* dataVersion */
|
|
};
|
|
|
|
#define UGENCASE_EXC_SHIFT 20
|
|
#define UGENCASE_EXC_MASK 0xfff00000
|
|
|
|
enum {
|
|
MAX_EXC_COUNT=(UGENCASE_EXC_MASK>>UGENCASE_EXC_SHIFT)+1
|
|
};
|
|
|
|
struct ExcProps {
|
|
ExcProps()
|
|
: hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {}
|
|
ExcProps(const UniProps &otherProps)
|
|
: props(otherProps),
|
|
hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {}
|
|
|
|
UniProps props;
|
|
UnicodeSet closure;
|
|
UBool hasConditionalCaseMappings;
|
|
UBool hasTurkicCaseFolding;
|
|
};
|
|
|
|
/*
|
|
* Values for the ucase.icu unfold[] data array.
|
|
* The values are stored in ucase.icu so that the runtime code will work with
|
|
* changing values, but they are hardcoded here for simplicity.
|
|
* They are optimized, that is, provide for minimal table column widths,
|
|
* for the actual Unicode data, so that the table size is minimized.
|
|
* Future versions of Unicode may require increases of some of these values.
|
|
*/
|
|
enum {
|
|
UGENCASE_UNFOLD_STRING_WIDTH=3,
|
|
UGENCASE_UNFOLD_CP_WIDTH=2,
|
|
UGENCASE_UNFOLD_WIDTH=UGENCASE_UNFOLD_STRING_WIDTH+UGENCASE_UNFOLD_CP_WIDTH
|
|
};
|
|
|
|
class CasePropsBuilder : public PropsBuilder {
|
|
public:
|
|
CasePropsBuilder(UErrorCode &errorCode);
|
|
virtual ~CasePropsBuilder();
|
|
|
|
virtual void setUnicodeVersion(const UVersionInfo version);
|
|
virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode);
|
|
virtual void build(UErrorCode &errorCode);
|
|
virtual void writeCSourceFile(const char *path, UErrorCode &errorCode);
|
|
virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode);
|
|
|
|
private:
|
|
uint32_t makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode);
|
|
void addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode);
|
|
void makeUnfoldData(UErrorCode &errorCode);
|
|
void addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode);
|
|
UBool addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value,
|
|
UErrorCode &errorCode);
|
|
void makeCaseClosure(UErrorCode &errorCode);
|
|
int32_t makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorCode &errorCode);
|
|
void makeExceptions(UErrorCode &errorCode);
|
|
|
|
UnicodeSet relevantProps;
|
|
/*
|
|
* Unicode set collecting the case-sensitive characters;
|
|
* see uchar.h UCHAR_CASE_SENSITIVE.
|
|
* Add code points from case mappings/foldings in
|
|
* the root locale and with default options.
|
|
*/
|
|
UnicodeSet caseSensitive;
|
|
/* reverse case folding ("unfold") data */
|
|
UnicodeString unfold;
|
|
UnicodeString exceptions;
|
|
ExcProps **excProps;
|
|
int32_t excPropsCount;
|
|
/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */
|
|
int32_t maxFullLength;
|
|
UTrie2 *pTrie;
|
|
};
|
|
|
|
CasePropsBuilder::CasePropsBuilder(UErrorCode &errorCode)
|
|
: excProps(NULL), excPropsCount(0), maxFullLength(U16_MAX_LENGTH), pTrie(NULL) {
|
|
// This builder encodes the following properties.
|
|
relevantProps.
|
|
add(UCHAR_CANONICAL_COMBINING_CLASS). // 0 vs. 230 vs. other
|
|
add(UCHAR_SOFT_DOTTED).
|
|
add(UCHAR_LOWERCASE).
|
|
add(UCHAR_UPPERCASE).
|
|
add(UCHAR_CASE_IGNORABLE).
|
|
add(UCHAR_SIMPLE_CASE_FOLDING).
|
|
add(UCHAR_SIMPLE_LOWERCASE_MAPPING).
|
|
add(UCHAR_SIMPLE_TITLECASE_MAPPING).
|
|
add(UCHAR_SIMPLE_UPPERCASE_MAPPING).
|
|
add(UCHAR_CASE_FOLDING).
|
|
add(UCHAR_LOWERCASE_MAPPING).
|
|
add(UCHAR_TITLECASE_MAPPING).
|
|
add(UCHAR_UPPERCASE_MAPPING).
|
|
add(PPUCD_CONDITIONAL_CASE_MAPPINGS).
|
|
add(PPUCD_TURKIC_CASE_FOLDING);
|
|
// Write "unfold" meta data into the first row. Must be UGENCASE_UNFOLD_WIDTH UChars.
|
|
unfold.
|
|
append(0).
|
|
append((UChar)UGENCASE_UNFOLD_WIDTH).
|
|
append((UChar)UGENCASE_UNFOLD_STRING_WIDTH).
|
|
append(0).
|
|
append(0);
|
|
U_ASSERT(unfold.length()==UGENCASE_UNFOLD_WIDTH);
|
|
pTrie=utrie2_open(0, 0, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: casepropsbuilder utrie2_open() failed - %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
excProps=new ExcProps *[MAX_EXC_COUNT];
|
|
if(excProps==NULL) {
|
|
fprintf(stderr,
|
|
"genprops error: casepropsbuilder out of memory allocating "
|
|
"the array of exceptions properties\n");
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
}
|
|
|
|
CasePropsBuilder::~CasePropsBuilder() {
|
|
utrie2_close(pTrie);
|
|
for(int32_t i=0; i<excPropsCount; ++i) {
|
|
delete excProps[i];
|
|
}
|
|
delete[] excProps;
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::setUnicodeVersion(const UVersionInfo version) {
|
|
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
|
}
|
|
|
|
/* -------------------------------------------------------------------------- */
|
|
|
|
void
|
|
CasePropsBuilder::addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
int32_t length=s.length();
|
|
if(length>UGENCASE_UNFOLD_STRING_WIDTH) {
|
|
fprintf(stderr, "genprops error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n",
|
|
(long)length, UGENCASE_UNFOLD_STRING_WIDTH);
|
|
errorCode=U_INTERNAL_PROGRAM_ERROR;
|
|
}
|
|
unfold.append(s);
|
|
while(length<UGENCASE_UNFOLD_STRING_WIDTH) {
|
|
unfold.append(0);
|
|
++length;
|
|
}
|
|
|
|
unfold.append(c);
|
|
if(U16_LENGTH(c)<UGENCASE_UNFOLD_CP_WIDTH) {
|
|
unfold.append(0);
|
|
}
|
|
|
|
U_ASSERT((unfold.length()%UGENCASE_UNFOLD_WIDTH)==0);
|
|
}
|
|
|
|
/* store a character's properties ------------------------------------------- */
|
|
|
|
void
|
|
CasePropsBuilder::setProps(const UniProps &props, const UnicodeSet &newValues,
|
|
UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode) || newValues.containsNone(relevantProps)) { return; }
|
|
|
|
UChar32 start=props.start;
|
|
UChar32 end=props.end;
|
|
|
|
/* default: map to self */
|
|
int32_t delta=0;
|
|
|
|
uint32_t type;
|
|
if(props.binProps[UCHAR_LOWERCASE]) {
|
|
type=UCASE_LOWER;
|
|
} else if(props.binProps[UCHAR_UPPERCASE]) {
|
|
type=UCASE_UPPER;
|
|
} else if(props.getIntProp(UCHAR_GENERAL_CATEGORY)==U_TITLECASE_LETTER) {
|
|
type=UCASE_TITLE;
|
|
} else {
|
|
type=UCASE_NONE;
|
|
}
|
|
uint32_t value=type;
|
|
|
|
UBool hasMapping=FALSE;
|
|
if(props.suc>=0) {
|
|
/* uppercase mapping as delta if the character is lowercase */
|
|
hasMapping=TRUE;
|
|
if(type==UCASE_LOWER) {
|
|
delta=props.suc-start;
|
|
} else {
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
}
|
|
if(props.slc>=0) {
|
|
/* lowercase mapping as delta if the character is uppercase or titlecase */
|
|
hasMapping=TRUE;
|
|
if(type>=UCASE_UPPER) {
|
|
delta=props.slc-start;
|
|
} else {
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
}
|
|
if(props.stc>=0) {
|
|
hasMapping=TRUE;
|
|
}
|
|
if(props.suc!=props.stc) {
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() ||
|
|
newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS)
|
|
) {
|
|
hasMapping=TRUE;
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
if( (props.scf>=0 && props.scf!=props.slc) ||
|
|
(!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) ||
|
|
newValues.contains(PPUCD_TURKIC_CASE_FOLDING)
|
|
) {
|
|
hasMapping=TRUE;
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
|
|
// Simple case folding falls back to simple lowercasing.
|
|
// If there is no case folding but there is a lowercase mapping,
|
|
// then add a case folding mapping to the code point.
|
|
// For example: Cherokee uppercase syllables since Unicode 8.
|
|
// (Full case folding falls back to simple case folding,
|
|
// not to full lowercasing, so we need not also handle it specially
|
|
// for such cases.)
|
|
UChar32 scf=props.scf;
|
|
if(scf<0 && props.slc>=0) {
|
|
scf=start;
|
|
hasMapping=TRUE;
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
|
|
if(delta<UCASE_MIN_DELTA || UCASE_MAX_DELTA<delta) {
|
|
value|=UCASE_EXCEPTION;
|
|
}
|
|
|
|
if(props.binProps[UCHAR_SOFT_DOTTED]) {
|
|
value|=UCASE_SOFT_DOTTED;
|
|
}
|
|
int32_t cc=props.getIntProp(UCHAR_CANONICAL_COMBINING_CLASS);
|
|
if(cc!=0) {
|
|
if(props.binProps[UCHAR_SOFT_DOTTED]) {
|
|
fprintf(stderr, "genprops error: a soft-dotted character has ccc!=0\n");
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
if(cc==230) {
|
|
value|=UCASE_ABOVE;
|
|
} else {
|
|
value|=UCASE_OTHER_ACCENT;
|
|
}
|
|
}
|
|
|
|
if(props.binProps[UCHAR_CASE_IGNORABLE]) {
|
|
value|=UCASE_IGNORABLE;
|
|
}
|
|
|
|
if((hasMapping || (value&UCASE_EXCEPTION)) && start!=end) {
|
|
fprintf(stderr,
|
|
"genprops error: range %04lX..%04lX has case mappings "
|
|
"or reasons for data structure exceptions\n",
|
|
(long)start, (long)end);
|
|
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
|
return;
|
|
}
|
|
|
|
/* handle exceptions */
|
|
if(value&UCASE_EXCEPTION) {
|
|
/* simply store exceptions for later processing and encoding */
|
|
if(excPropsCount==MAX_EXC_COUNT) {
|
|
fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
|
|
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return;
|
|
}
|
|
ExcProps *newExcProps=new ExcProps(props);
|
|
if(newExcProps==NULL) {
|
|
fprintf(stderr,
|
|
"genprops error: casepropsbuilder out of memory allocating "
|
|
"exceptions properties\n");
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
return;
|
|
}
|
|
newExcProps->props.scf=scf;
|
|
newExcProps->hasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS);
|
|
newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING);
|
|
value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
|
|
excProps[excPropsCount++]=newExcProps;
|
|
} else {
|
|
/* store the simple case mapping delta */
|
|
value|=((uint32_t)delta<<UCASE_DELTA_SHIFT)&UCASE_DELTA_MASK;
|
|
}
|
|
|
|
utrie2_setRange32(pTrie, start, end, value, TRUE, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set case mapping values: %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
if(hasMapping) {
|
|
/* update the case-sensitive set */
|
|
caseSensitive.add(start);
|
|
if(scf>=0) { caseSensitive.add(scf); }
|
|
if(props.slc>=0) { caseSensitive.add(props.slc); }
|
|
if(props.suc>=0) { caseSensitive.add(props.suc); }
|
|
if(props.stc>=0) { caseSensitive.add(props.stc); }
|
|
caseSensitive.addAll(props.cf);
|
|
caseSensitive.addAll(props.lc);
|
|
caseSensitive.addAll(props.uc);
|
|
caseSensitive.addAll(props.tc);
|
|
|
|
/* update maxFullLength */
|
|
if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); }
|
|
if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); }
|
|
if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); }
|
|
if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); }
|
|
}
|
|
|
|
/* add the multi-character case folding to the "unfold" data */
|
|
if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) {
|
|
addUnfolding(start, props.cf, errorCode);
|
|
}
|
|
}
|
|
|
|
uint32_t
|
|
CasePropsBuilder::makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
if(excPropsCount==MAX_EXC_COUNT) {
|
|
fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n");
|
|
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
|
return 0;
|
|
}
|
|
LocalPointer<ExcProps> newExcProps(new ExcProps);
|
|
if(newExcProps==NULL) {
|
|
fprintf(stderr,
|
|
"genprops error: casepropsbuilder out of memory allocating "
|
|
"exceptions properties\n");
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
return 0;
|
|
}
|
|
|
|
if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
|
|
// Decode the simple case mapping.
|
|
UChar32 next=c+UCASE_GET_DELTA(value);
|
|
if(next!=c) {
|
|
UniProps &p=newExcProps->props;
|
|
if((value&UCASE_TYPE_MASK)==UCASE_LOWER) {
|
|
p.suc=p.stc=next;
|
|
} else {
|
|
p.slc=next;
|
|
}
|
|
}
|
|
}
|
|
|
|
value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); // remove previous simple mapping
|
|
value|=(uint32_t)excPropsCount<<UGENCASE_EXC_SHIFT;
|
|
value|=UCASE_EXCEPTION;
|
|
excProps[excPropsCount++]=newExcProps.orphan();
|
|
return value;
|
|
}
|
|
|
|
/* finalize reverse case folding ("unfold") data ---------------------------- */
|
|
|
|
static int32_t U_CALLCONV
|
|
compareUnfold(const void *context, const void *left, const void *right) {
|
|
return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH);
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::makeUnfoldData(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
UChar *p, *q;
|
|
int32_t i, j, k;
|
|
|
|
/* sort the data */
|
|
int32_t unfoldLength=unfold.length();
|
|
int32_t unfoldRows=unfoldLength/UGENCASE_UNFOLD_WIDTH-1;
|
|
UChar *unfoldBuffer=unfold.getBuffer(-1);
|
|
uprv_sortArray(unfoldBuffer+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2,
|
|
compareUnfold, NULL, FALSE, &errorCode);
|
|
|
|
/* make unique-string rows by merging adjacent ones' code point columns */
|
|
|
|
/* make p point to row i-1 */
|
|
p=unfoldBuffer+UGENCASE_UNFOLD_WIDTH;
|
|
|
|
for(i=1; i<unfoldRows;) {
|
|
if(0==u_memcmp(p, p+UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH)) {
|
|
/* concatenate code point columns */
|
|
q=p+UGENCASE_UNFOLD_STRING_WIDTH;
|
|
for(j=1; j<UGENCASE_UNFOLD_CP_WIDTH && q[j]!=0; ++j) {}
|
|
for(k=0; k<UGENCASE_UNFOLD_CP_WIDTH && q[UGENCASE_UNFOLD_WIDTH+k]!=0; ++j, ++k) {
|
|
q[j]=q[UGENCASE_UNFOLD_WIDTH+k];
|
|
}
|
|
if(j>UGENCASE_UNFOLD_CP_WIDTH) {
|
|
fprintf(stderr, "genprops error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n",
|
|
(long)j, UGENCASE_UNFOLD_CP_WIDTH);
|
|
errorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
return;
|
|
}
|
|
|
|
/* move following rows up one */
|
|
--unfoldRows;
|
|
u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH);
|
|
} else {
|
|
p+=UGENCASE_UNFOLD_WIDTH;
|
|
++i;
|
|
}
|
|
}
|
|
|
|
unfoldBuffer[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows;
|
|
|
|
if(beVerbose) {
|
|
puts("unfold data:");
|
|
|
|
p=unfoldBuffer;
|
|
for(i=0; i<unfoldRows; ++i) {
|
|
p+=UGENCASE_UNFOLD_WIDTH;
|
|
printf("[%2d] %04x %04x %04x <- %04x %04x\n",
|
|
(int)i, p[0], p[1], p[2], p[3], p[4]);
|
|
}
|
|
}
|
|
|
|
unfold.releaseBuffer((unfoldRows+1)*UGENCASE_UNFOLD_WIDTH);
|
|
}
|
|
|
|
/* case closure ------------------------------------------------------------- */
|
|
|
|
void
|
|
CasePropsBuilder::addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
if(beVerbose) {
|
|
printf("add closure mapping U+%04lx->U+%04lx\n",
|
|
(unsigned long)src, (unsigned long)dest);
|
|
}
|
|
|
|
uint32_t value=utrie2_get32(pTrie, src);
|
|
if((value&UCASE_EXCEPTION)==0) {
|
|
/*
|
|
* decode value into p2 (enough for makeException() to work properly),
|
|
* add the closure mapping,
|
|
* and set the new exception for src
|
|
*/
|
|
value=makeExcProps(src, value, errorCode);
|
|
utrie2_set32(pTrie, src, value, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set case mapping values, code: %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
}
|
|
excProps[value>>UGENCASE_EXC_SHIFT]->closure.add(dest);
|
|
}
|
|
|
|
/*
|
|
* Find missing case mapping relationships and add mappings for case closure.
|
|
* This function starts from an "original" code point and recursively
|
|
* finds its case mappings and the case mappings of where it maps to.
|
|
*
|
|
* The recursion depth is capped at 3 nested calls of this function.
|
|
* In each call, the current code point is c, and the function enumerates
|
|
* all of c's simple (single-code point) case mappings.
|
|
* prev is the code point that case-mapped to c.
|
|
* prev2 is the code point that case-mapped to prev.
|
|
*
|
|
* The initial function call has prev2<0, prev<0, and c==orig
|
|
* (marking no code points).
|
|
* It enumerates c's case mappings and recurses without further action.
|
|
*
|
|
* The second-level function call has prev2<0, prev==orig, and c is
|
|
* the destination code point of one of prev's case mappings.
|
|
* The function checks if any of c's case mappings go back to orig
|
|
* and adds a closure mapping if not.
|
|
* In other words, it turns a case mapping relationship of
|
|
* orig->c
|
|
* into
|
|
* orig<->c
|
|
*
|
|
* The third-level function call has prev2==orig, prev>=0, and c is
|
|
* the destination code point of one of prev's case mappings.
|
|
* (And prev is the destination of one of prev2's case mappings.)
|
|
* The function checks if any of c's case mappings go back to orig
|
|
* and adds a closure mapping if not.
|
|
* In other words, it turns case mapping relationships of
|
|
* orig->prev->c or orig->prev<->c
|
|
* into
|
|
* orig->prev->c->orig or orig->prev<->c->orig
|
|
* etc.
|
|
* (Graphically, this closes a triangle.)
|
|
*
|
|
* With repeated application on all code points until no more closure mappings
|
|
* are added, all case equivalence groups get complete mappings.
|
|
* That is, in each group of code points with case relationships
|
|
* each code point will in the end have some mapping to each other
|
|
* code point in the group.
|
|
*
|
|
* @return TRUE if a closure mapping was added
|
|
*/
|
|
UBool
|
|
CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value,
|
|
UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return FALSE; }
|
|
|
|
UChar32 next;
|
|
UBool someMappingsAdded=FALSE;
|
|
|
|
if(c!=orig) {
|
|
/* get the properties for c */
|
|
value=utrie2_get32(pTrie, c);
|
|
}
|
|
/* else if c==orig then c's value was passed in */
|
|
|
|
if(value&UCASE_EXCEPTION) {
|
|
UnicodeSet set;
|
|
|
|
ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT];
|
|
UniProps &p=ep.props;
|
|
|
|
/*
|
|
* marker for whether any of c's mappings goes to orig
|
|
* c==orig: prevent adding a closure mapping when getting orig's own, direct mappings
|
|
*/
|
|
UBool mapsToOrig=(UBool)(c==orig);
|
|
|
|
/* collect c's case mapping destinations in set[] */
|
|
if((next=p.suc)>=0 && next!=c) {
|
|
set.add(next);
|
|
}
|
|
if((next=p.slc)>=0 && next!=c) {
|
|
set.add(next);
|
|
}
|
|
if(p.suc!=(next=p.stc) && next!=c) {
|
|
set.add(next);
|
|
}
|
|
if((next=p.scf)>=0 && next!=c) {
|
|
set.add(next);
|
|
}
|
|
|
|
/* add c's current closure mappings to set */
|
|
set.addAll(ep.closure);
|
|
|
|
/* process all code points to which c case-maps */
|
|
UnicodeSetIterator iter(set);
|
|
while(iter.next()) {
|
|
next=iter.getCodepoint(); /* next!=c */
|
|
|
|
if(next==orig) {
|
|
mapsToOrig=TRUE; /* remember that we map to orig */
|
|
} else if(prev2<0 && next!=prev) {
|
|
/*
|
|
* recurse unless
|
|
* we have reached maximum depth (prev2>=0) or
|
|
* this is a mapping to one of the previous code points (orig, prev, c)
|
|
*/
|
|
someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode);
|
|
}
|
|
}
|
|
|
|
if(!mapsToOrig) {
|
|
addClosureMapping(c, orig, errorCode);
|
|
return TRUE;
|
|
}
|
|
} else {
|
|
if((value&UCASE_TYPE_MASK)>UCASE_NONE) {
|
|
/* one simple case mapping, don't care which one */
|
|
next=c+UCASE_GET_DELTA(value);
|
|
if(next!=c) {
|
|
/*
|
|
* recurse unless
|
|
* we have reached maximum depth (prev2>=0) or
|
|
* this is a mapping to one of the previous code points (orig, prev, c)
|
|
*/
|
|
if(prev2<0 && next!=orig && next!=prev) {
|
|
someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode);
|
|
}
|
|
|
|
if(c!=orig && next!=orig) {
|
|
/* c does not map to orig, add a closure mapping c->orig */
|
|
addClosureMapping(c, orig, errorCode);
|
|
return TRUE;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return someMappingsAdded;
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::makeCaseClosure(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
/*
|
|
* finalize the "unfold" data because we need to use it to add closure mappings
|
|
* for situations like FB05->"st"<-FB06
|
|
* where we would otherwise miss the FB05<->FB06 relationship
|
|
*/
|
|
makeUnfoldData(errorCode);
|
|
|
|
/* use the "unfold" data to add mappings */
|
|
|
|
/* p always points to the code points; this loop ignores the strings completely */
|
|
const UChar *p=unfold.getBuffer()+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH;
|
|
int32_t unfoldRows=unfold.length()/UGENCASE_UNFOLD_WIDTH-1;
|
|
|
|
for(int32_t i=0; i<unfoldRows; p+=UGENCASE_UNFOLD_WIDTH, ++i) {
|
|
int32_t j=0;
|
|
UChar32 c;
|
|
U16_NEXT_UNSAFE(p, j, c);
|
|
while(j<UGENCASE_UNFOLD_CP_WIDTH && p[j]!=0) {
|
|
UChar32 c2;
|
|
U16_NEXT_UNSAFE(p, j, c2);
|
|
addClosure(c, U_SENTINEL, c, c2, 0, errorCode);
|
|
}
|
|
}
|
|
|
|
if(beVerbose) {
|
|
puts("---- ---- ---- ---- (done with closures from unfolding)");
|
|
}
|
|
|
|
/* add further closure mappings from analyzing simple mappings */
|
|
UBool someMappingsAdded;
|
|
do {
|
|
someMappingsAdded=FALSE;
|
|
|
|
for(UChar32 c=0; c<=0x10ffff; ++c) {
|
|
uint32_t value=utrie2_get32(pTrie, c);
|
|
if(value!=0) {
|
|
someMappingsAdded|=addClosure(c, U_SENTINEL, U_SENTINEL, c, value, errorCode);
|
|
}
|
|
}
|
|
|
|
if(beVerbose && someMappingsAdded) {
|
|
puts("---- ---- ---- ----");
|
|
}
|
|
} while(someMappingsAdded);
|
|
}
|
|
|
|
/* exceptions --------------------------------------------------------------- */
|
|
|
|
static UBool
|
|
fullMappingEqualsSimple(const UnicodeString &s, UChar32 simple, UChar32 c) {
|
|
if(simple<=0) {
|
|
simple=c; /* UCD has no simple mapping if it's the same as the code point itself */
|
|
}
|
|
return s.length()==U16_LENGTH(simple) && s.char32At(0)==simple;
|
|
}
|
|
|
|
int32_t
|
|
CasePropsBuilder::makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
/* exceptions.length() might be returned for storing in the trie word */
|
|
if(exceptions.length()>=UCASE_MAX_EXCEPTIONS) {
|
|
fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions words\n");
|
|
errorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
return 0;
|
|
}
|
|
|
|
/* copy and shift the soft-dotted bits */
|
|
UChar excWord=(UChar)((value&UCASE_DOT_MASK)<<UCASE_EXC_DOT_SHIFT);
|
|
|
|
UniProps &p=ep.props;
|
|
|
|
/* set the bits for conditional mappings */
|
|
if(ep.hasConditionalCaseMappings) {
|
|
excWord|=UCASE_EXC_CONDITIONAL_SPECIAL;
|
|
p.lc.remove();
|
|
p.uc.remove();
|
|
p.tc.remove();
|
|
}
|
|
if(ep.hasTurkicCaseFolding) {
|
|
excWord|=UCASE_EXC_CONDITIONAL_FOLD;
|
|
p.cf.remove();
|
|
}
|
|
|
|
/* remove redundant data */
|
|
/* do not store full mappings if they are the same as the simple ones */
|
|
if(fullMappingEqualsSimple(p.lc, p.slc, c)) {
|
|
p.lc.remove();
|
|
}
|
|
if(fullMappingEqualsSimple(p.uc, p.suc, c)) {
|
|
p.uc.remove();
|
|
}
|
|
if(fullMappingEqualsSimple(p.tc, p.stc, c)) {
|
|
p.tc.remove();
|
|
}
|
|
if(fullMappingEqualsSimple(p.cf, p.scf, c)) {
|
|
p.cf.remove();
|
|
}
|
|
|
|
/* write the optional slots */
|
|
uint32_t slots[8];
|
|
uint32_t slotBits=0;
|
|
int32_t count=0;
|
|
|
|
if(p.slc>=0) {
|
|
slots[count]=(uint32_t)p.slc;
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_LOWER);
|
|
}
|
|
if( p.scf>=0 &&
|
|
(p.slc>=0 ?
|
|
p.scf!=p.slc :
|
|
p.scf!=c)) {
|
|
slots[count]=(uint32_t)p.scf;
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_FOLD);
|
|
}
|
|
if(p.suc>=0) {
|
|
slots[count]=(uint32_t)p.suc;
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_UPPER);
|
|
}
|
|
if(p.suc!=p.stc) {
|
|
if(p.stc>=0) {
|
|
slots[count]=(uint32_t)p.stc;
|
|
} else {
|
|
slots[count]=(uint32_t)c;
|
|
}
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_TITLE);
|
|
}
|
|
|
|
/* length of case closure */
|
|
UnicodeString closureString;
|
|
if(!ep.closure.isEmpty()) {
|
|
UnicodeSetIterator iter(ep.closure);
|
|
while(iter.next()) { closureString.append(iter.getCodepoint()); }
|
|
int32_t length=closureString.length();
|
|
if(length>UCASE_CLOSURE_MAX_LENGTH) {
|
|
fprintf(stderr,
|
|
"genprops error: case closure for U+%04lX has length %d "
|
|
"which exceeds UCASE_CLOSURE_MAX_LENGTH=%d\n",
|
|
(long)c, (int)length, (int)UCASE_CLOSURE_MAX_LENGTH);
|
|
errorCode=U_BUFFER_OVERFLOW_ERROR;
|
|
return 0;
|
|
}
|
|
slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_CLOSURE);
|
|
}
|
|
|
|
/* lengths of full case mapping strings, stored in the last slot */
|
|
int32_t fullLengths=
|
|
p.lc.length()|
|
|
(p.cf.length()<<4)|
|
|
(p.uc.length()<<8)|
|
|
(p.tc.length()<<12);
|
|
if(fullLengths!=0) {
|
|
slots[count]=(uint32_t)fullLengths;
|
|
slotBits|=slots[count];
|
|
++count;
|
|
excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS);
|
|
}
|
|
|
|
if(count==0) {
|
|
/* No optional slots: Try to share excWord entries. */
|
|
int32_t excIndex=exceptions.indexOf((UChar)excWord);
|
|
if(excIndex>=0) {
|
|
return excIndex;
|
|
}
|
|
/* not found */
|
|
excIndex=exceptions.length();
|
|
exceptions.append((UChar)excWord);
|
|
return excIndex;
|
|
} else {
|
|
/* write slots */
|
|
int32_t excIndex=exceptions.length();
|
|
exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */
|
|
|
|
if(slotBits<=0xffff) {
|
|
for(int32_t i=0; i<count; ++i) {
|
|
exceptions.append((UChar)slots[i]);
|
|
}
|
|
} else {
|
|
excWord|=UCASE_EXC_DOUBLE_SLOTS;
|
|
for(int32_t i=0; i<count; ++i) {
|
|
exceptions.append((UChar)(slots[i]>>16));
|
|
exceptions.append((UChar)slots[i]);
|
|
}
|
|
}
|
|
|
|
/* write the full case mapping strings */
|
|
exceptions.append(p.lc);
|
|
exceptions.append(p.cf);
|
|
exceptions.append(p.uc);
|
|
exceptions.append(p.tc);
|
|
|
|
/* write the closure data */
|
|
exceptions.append(closureString);
|
|
|
|
/* write the main exceptions word */
|
|
exceptions.setCharAt(excIndex, (UChar)excWord);
|
|
|
|
return excIndex;
|
|
}
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::makeExceptions(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
/*
|
|
* Encode case-ignorable as delta==1 on uncased characters,
|
|
* and with an exception bit on cased characters and characters with another exception.
|
|
*
|
|
* Change from temporary UGENCASE_EXC_SHIFT'ed index into excProps[]
|
|
* to UCASE_EXC_SHIFT'ed index into encoded exceptions[].
|
|
*/
|
|
for(UChar32 c=0; c<=0x10ffff; ++c) {
|
|
uint32_t value=utrie2_get32(pTrie, c);
|
|
if(value&UCASE_EXCEPTION) {
|
|
int32_t excIndex=makeException(c, value, *excProps[value>>UGENCASE_EXC_SHIFT], errorCode);
|
|
value=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|((uint32_t)excIndex<<UCASE_EXC_SHIFT);
|
|
utrie2_set32(pTrie, c, value, &errorCode);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* generate output data ----------------------------------------------------- */
|
|
|
|
static int32_t indexes[UCASE_IX_TOP]={
|
|
UCASE_IX_TOP, 0, 0, 0,
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0,
|
|
0, 0, 0, 0
|
|
};
|
|
|
|
static uint8_t trieBlock[40000];
|
|
static int32_t trieSize;
|
|
|
|
void
|
|
CasePropsBuilder::build(UErrorCode &errorCode) {
|
|
if(!beQuiet) {
|
|
puts("* ucase.icu stats *");
|
|
}
|
|
|
|
makeCaseClosure(errorCode);
|
|
makeExceptions(errorCode);
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
/*
|
|
* Add one complex mapping to caseSensitive that was filtered out above:
|
|
* Greek final Sigma has a conditional mapping but not locale-sensitive,
|
|
* and it is taken when lowercasing just U+03A3 alone.
|
|
* 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
|
*/
|
|
caseSensitive.add(0x3c2);
|
|
|
|
UnicodeSetIterator iter(caseSensitive);
|
|
while(iter.next()) {
|
|
UChar32 c=iter.getCodepoint();
|
|
uint32_t value=utrie2_get32(pTrie, c);
|
|
if((value&UCASE_SENSITIVE)==0) {
|
|
utrie2_set32(pTrie, c, value|UCASE_SENSITIVE, &errorCode);
|
|
}
|
|
}
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: unable to set UCASE_SENSITIVE: %s\n",
|
|
u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
utrie2_freeze(pTrie, UTRIE2_16_VALUE_BITS, &errorCode);
|
|
trieSize=utrie2_serialize(pTrie, trieBlock, sizeof(trieBlock), &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: utrie2_freeze()+utrie2_serialize() failed: %s (length %ld)\n",
|
|
u_errorName(errorCode), (long)trieSize);
|
|
return;
|
|
}
|
|
|
|
indexes[UCASE_IX_EXC_LENGTH]=exceptions.length();
|
|
indexes[UCASE_IX_TRIE_SIZE]=trieSize;
|
|
indexes[UCASE_IX_UNFOLD_LENGTH]=unfold.length();
|
|
indexes[UCASE_IX_LENGTH]=(int32_t)sizeof(indexes)+trieSize+2*exceptions.length()+2*unfold.length();
|
|
|
|
indexes[UCASE_IX_MAX_FULL_LENGTH]=maxFullLength;
|
|
|
|
if(!beQuiet) {
|
|
printf("trie size in bytes: %5d\n", (int)trieSize);
|
|
printf("number of code points with exceptions: %5d\n", excPropsCount);
|
|
printf("size in bytes of exceptions: %5d\n", 2*exceptions.length());
|
|
printf("size in bytes of reverse foldings: %5d\n", 2*unfold.length());
|
|
printf("data size: %5d\n", (int)indexes[UCASE_IX_LENGTH]);
|
|
}
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::writeCSourceFile(const char *path, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
FILE *f=usrc_create(path, "ucase_props_data.h",
|
|
"icu/tools/unicode/c/genprops/casepropsbuilder.cpp");
|
|
if(f==NULL) {
|
|
errorCode=U_FILE_ACCESS_ERROR;
|
|
return;
|
|
}
|
|
fputs("#ifdef INCLUDED_FROM_UCASE_CPP\n\n", f);
|
|
usrc_writeArray(f,
|
|
"static const UVersionInfo ucase_props_dataVersion={",
|
|
dataInfo.dataVersion, 8, 4,
|
|
"};\n\n");
|
|
usrc_writeArray(f,
|
|
"static const int32_t ucase_props_indexes[UCASE_IX_TOP]={",
|
|
indexes, 32, UCASE_IX_TOP,
|
|
"};\n\n");
|
|
usrc_writeUTrie2Arrays(f,
|
|
"static const uint16_t ucase_props_trieIndex[%ld]={\n", NULL,
|
|
pTrie,
|
|
"\n};\n\n");
|
|
usrc_writeArray(f,
|
|
"static const uint16_t ucase_props_exceptions[%ld]={\n",
|
|
exceptions.getBuffer(), 16, exceptions.length(),
|
|
"\n};\n\n");
|
|
usrc_writeArray(f,
|
|
"static const uint16_t ucase_props_unfold[%ld]={\n",
|
|
unfold.getBuffer(), 16, unfold.length(),
|
|
"\n};\n\n");
|
|
fputs(
|
|
"static const UCaseProps ucase_props_singleton={\n"
|
|
" NULL,\n"
|
|
" ucase_props_indexes,\n"
|
|
" ucase_props_exceptions,\n"
|
|
" ucase_props_unfold,\n",
|
|
f);
|
|
usrc_writeUTrie2Struct(f,
|
|
" {\n",
|
|
pTrie, "ucase_props_trieIndex", NULL,
|
|
" },\n");
|
|
usrc_writeArray(f, " { ", dataInfo.formatVersion, 8, 4, " }\n");
|
|
fputs("};\n\n"
|
|
"#endif // INCLUDED_FROM_UCASE_CPP\n", f);
|
|
fclose(f);
|
|
}
|
|
|
|
void
|
|
CasePropsBuilder::writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
UNewDataMemory *pData=udata_create(path, "icu", "ucase", &dataInfo,
|
|
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops: udata_create(%s, ucase.icu) failed - %s\n",
|
|
path, u_errorName(errorCode));
|
|
return;
|
|
}
|
|
|
|
udata_writeBlock(pData, indexes, sizeof(indexes));
|
|
udata_writeBlock(pData, trieBlock, trieSize);
|
|
udata_writeBlock(pData, exceptions.getBuffer(), 2*exceptions.length());
|
|
udata_writeBlock(pData, unfold.getBuffer(), 2*unfold.length());
|
|
|
|
/* finish up */
|
|
long dataLength=udata_finish(pData, &errorCode);
|
|
if(U_FAILURE(errorCode)) {
|
|
fprintf(stderr, "genprops error: casepropsbuilder error %d writing the output file\n", errorCode);
|
|
exit(errorCode);
|
|
}
|
|
|
|
if(dataLength!=indexes[UCASE_IX_LENGTH]) {
|
|
fprintf(stderr,
|
|
"udata_finish(ucase.icu) reports %ld bytes written but should be %ld\n",
|
|
dataLength, (long)indexes[UCASE_IX_LENGTH]);
|
|
errorCode=U_INTERNAL_PROGRAM_ERROR;
|
|
}
|
|
}
|
|
|
|
PropsBuilder *
|
|
createCasePropsBuilder(UErrorCode &errorCode) {
|
|
if(U_FAILURE(errorCode)) { return NULL; }
|
|
PropsBuilder *pb=new CasePropsBuilder(errorCode);
|
|
if(pb==NULL) {
|
|
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
return pb;
|
|
}
|
|
|
|
/*
|
|
* Hey, Emacs, please set the following:
|
|
*
|
|
* Local Variables:
|
|
* indent-tabs-mode: nil
|
|
* End:
|
|
*
|
|
*/
|