2001-02-23 04:57:47 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
|
|
|
*
|
2014-02-24 22:17:04 +00:00
|
|
|
* Copyright (C) 2000-2014, International Business Machines
|
2001-02-23 04:57:47 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
*
|
|
|
|
*******************************************************************************
|
|
|
|
* file name: genuca.cpp
|
|
|
|
* encoding: US-ASCII
|
|
|
|
* tab size: 8 (not used)
|
|
|
|
* indentation:4
|
|
|
|
*
|
2001-02-26 10:28:56 +00:00
|
|
|
* created at the end of XX century
|
2014-02-24 22:17:04 +00:00
|
|
|
* created by: Vladimir Weinstein,
|
|
|
|
* modified in 2013-2014 by Markus Scherer
|
2001-02-26 10:28:56 +00:00
|
|
|
*
|
2014-02-24 22:17:04 +00:00
|
|
|
* This program reads the Fractional UCA table and generates
|
2001-02-23 04:57:47 +00:00
|
|
|
* internal format for UCA table as well as inverse UCA table.
|
2014-02-24 22:17:04 +00:00
|
|
|
* It then writes the ucadata.icu binary file containing the data.
|
2001-02-23 04:57:47 +00:00
|
|
|
*/
|
|
|
|
|
2011-12-07 06:14:56 +00:00
|
|
|
#define U_NO_DEFAULT_INCLUDE_UTF_HEADERS 1
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
#include <stdio.h>
|
2002-10-01 01:26:49 +00:00
|
|
|
#include "unicode/utypes.h"
|
2014-02-24 22:17:04 +00:00
|
|
|
#include "unicode/errorcode.h"
|
|
|
|
#include "unicode/localpointer.h"
|
2012-01-10 19:36:20 +00:00
|
|
|
#include "charstr.h"
|
2014-02-24 22:17:04 +00:00
|
|
|
#include "cmemory.h"
|
|
|
|
#include "collation.h"
|
|
|
|
#include "collationbasedatabuilder.h"
|
|
|
|
#include "collationdata.h"
|
|
|
|
#include "collationdatabuilder.h"
|
|
|
|
#include "collationdatareader.h"
|
|
|
|
#include "collationdatawriter.h"
|
|
|
|
#include "collationinfo.h"
|
|
|
|
#include "collationrootelements.h"
|
|
|
|
#include "collationruleparser.h"
|
|
|
|
#include "collationtailoring.h"
|
|
|
|
#include "cstring.h"
|
|
|
|
#include "normalizer2impl.h"
|
2001-02-23 01:21:38 +00:00
|
|
|
#include "toolutil.h"
|
2002-10-01 17:44:04 +00:00
|
|
|
#include "unewdata.h"
|
2014-02-24 22:17:04 +00:00
|
|
|
#include "uoptions.h"
|
|
|
|
#include "uparse.h"
|
|
|
|
#include "writesrc.h"
|
2004-11-12 00:26:54 +00:00
|
|
|
|
2010-10-07 19:46:41 +00:00
|
|
|
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
|
|
|
|
2002-10-01 17:44:04 +00:00
|
|
|
#if UCONFIG_NO_COLLATION
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
// Stub entry point for builds where UCONFIG_NO_COLLATION is set:
// there is nothing to generate, so the arguments are ignored and
// the tool exits with status 1.
extern "C" int
main(int /*argc*/, char * /*argv*/[]) {
    return 1;
}
|
2002-10-01 17:44:04 +00:00
|
|
|
|
|
|
|
#else
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
U_NAMESPACE_USE
|
2004-08-28 05:50:39 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static UBool beVerbose=FALSE, withCopyright=TRUE;
|
2004-08-28 05:50:39 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static UVersionInfo UCAVersion={ 0, 0, 0, 0 };
|
2004-08-28 05:50:39 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static UDataInfo ucaDataInfo={
|
2004-08-28 05:50:39 +00:00
|
|
|
sizeof(UDataInfo),
|
|
|
|
0,
|
|
|
|
|
|
|
|
U_IS_BIG_ENDIAN,
|
|
|
|
U_CHARSET_FAMILY,
|
2014-02-24 22:17:04 +00:00
|
|
|
U_SIZEOF_UCHAR,
|
2004-08-28 05:50:39 +00:00
|
|
|
0,
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
{ 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol"
|
|
|
|
{ 4, 0, 0, 0 }, // formatVersion
|
|
|
|
{ 6, 3, 0, 0 } // dataVersion
|
2004-08-28 05:50:39 +00:00
|
|
|
};
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
 * Returns a pointer to the first character at or after s
 * that is neither a space nor a tab.
 * Stops at the terminating NUL at the latest.
 */
static char *skipWhiteSpace(char *s) {
    for(;;) {
        const char ch = *s;
        if(ch != ' ' && ch != '\t') {
            return s;
        }
        ++s;
    }
}
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
 * Converts one hexadecimal digit (0-9, a-f, A-F) to its value 0..15.
 * Returns -1 for any other character.
 */
static int32_t hex2num(char hex) {
    if('0' <= hex && hex <= '9') {
        return hex - '0';
    }
    if('a' <= hex && hex <= 'f') {
        return 10 + (hex - 'a');
    }
    if('A' <= hex && hex <= 'F') {
        return 10 + (hex - 'A');
    }
    return -1;
}
|
|
|
|
|
|
|
|
static uint32_t parseWeight(char *&s, const char *separators,
|
|
|
|
int32_t maxBytes, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
uint32_t weight = 0;
|
|
|
|
int32_t numBytes = 0;
|
2011-11-03 23:09:27 +00:00
|
|
|
for(;;) {
|
2014-02-24 22:17:04 +00:00
|
|
|
// Check one character after another, so that we don't just run over a 00.
|
|
|
|
int32_t nibble1, nibble2;
|
|
|
|
if((nibble1 = hex2num(s[0])) < 0 || (nibble2 = hex2num(s[1])) < 0) {
|
|
|
|
// Stop when we find something other than a pair of hex digits.
|
2011-11-03 23:09:27 +00:00
|
|
|
break;
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
if(numBytes == maxBytes || (numBytes != 0 && nibble1 == 0 && nibble2 <= 1)) {
|
|
|
|
// Too many bytes, or a 00 or 01 byte which is illegal inside a weight.
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
2010-10-28 19:05:02 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
weight = (weight << 8) | ((uint32_t)nibble1 << 4) | (uint32_t)nibble2;
|
|
|
|
++numBytes;
|
|
|
|
s += 2;
|
|
|
|
if(*s != ' ') {
|
|
|
|
break;
|
2001-02-22 21:18:29 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
++s;
|
2001-02-22 21:18:29 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
char c = *s;
|
|
|
|
if(c == 0 || strchr(separators, c) == NULL) {
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
2010-10-28 19:05:02 +00:00
|
|
|
return 0;
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
// numBytes==0 is ok, for example in [,,] or [, 82, 05]
|
|
|
|
// Left-align the weight.
|
|
|
|
while(numBytes < 4) {
|
|
|
|
weight <<= 8;
|
|
|
|
++numBytes;
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
return weight;
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
|
|
|
|
* Parse a CE like [0A 86, 05, 17] or [U+4E00, 10].
|
|
|
|
* Stop with an error, or else with the pointer s after the closing bracket.
|
2003-08-23 01:38:31 +00:00
|
|
|
*/
|
2014-02-24 22:17:04 +00:00
|
|
|
static int64_t parseCE(const CollationDataBuilder &builder, char *&s, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
++s; // skip over the '['
|
|
|
|
if(s[0] == 'U' && s[1] == '+') {
|
|
|
|
// Read a code point and look up its CE.
|
|
|
|
// We use this especially for implicit primary weights,
|
|
|
|
// so that we can use different algorithms in the FractionalUCA.txt
|
|
|
|
// generator and the parser.
|
|
|
|
// The generator may not even need to compute any implicit primaries at all.
|
|
|
|
s += 2;
|
|
|
|
char *end;
|
|
|
|
unsigned long longCp = uprv_strtoul(s, &end, 16);
|
|
|
|
if(end == s || longCp > 0x10ffff) {
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
UChar32 c = (UChar32)longCp;
|
|
|
|
int64_t ce = builder.getSingleCE(c, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
s = end;
|
|
|
|
if(*s == ']') { // [U+4E00]
|
|
|
|
++s;
|
|
|
|
return ce;
|
|
|
|
}
|
|
|
|
if(*s != ',') {
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
// Parse the following, secondary or tertiary weight.
|
|
|
|
s = skipWhiteSpace(s + 1);
|
|
|
|
uint32_t w = parseWeight(s, ",]", 2, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
if(*s == ']') { // [U+4E00, 10]
|
|
|
|
++s;
|
|
|
|
// Set the tertiary weight to w.
|
|
|
|
return (ce & INT64_C(0xffffffffffff0000)) | (w >> 16);
|
|
|
|
}
|
|
|
|
// Set the secondary weight to w: [U+9F9C, 70, 20]
|
|
|
|
ce = (ce & INT64_C(0xffffffff00000000)) | w;
|
|
|
|
// Parse and set the tertiary weight.
|
|
|
|
s = skipWhiteSpace(s + 1);
|
|
|
|
w = parseWeight(s, "]", 2, errorCode);
|
|
|
|
++s;
|
|
|
|
return ce | (w >> 16);
|
2001-02-22 21:18:29 +00:00
|
|
|
} else {
|
2014-02-24 22:17:04 +00:00
|
|
|
uint32_t p = parseWeight(s, ",", 4, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
int64_t ce = (int64_t)p << 32;
|
|
|
|
s = skipWhiteSpace(s + 1);
|
|
|
|
uint32_t w = parseWeight(s, ",", 2, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
ce |= w;
|
|
|
|
s = skipWhiteSpace(s + 1);
|
|
|
|
w = parseWeight(s, "]", 2, errorCode);
|
|
|
|
++s;
|
|
|
|
return ce | (w >> 16);
|
2001-02-22 21:18:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2011-11-03 23:09:27 +00:00
|
|
|
static const struct {
|
|
|
|
const char *name;
|
|
|
|
int32_t code;
|
|
|
|
} specialReorderTokens[] = {
|
|
|
|
{ "TERMINATOR", -2 }, // -2 means "ignore"
|
|
|
|
{ "LEVEL-SEPARATOR", -2 },
|
|
|
|
{ "FIELD-SEPARATOR", -2 },
|
2014-02-24 22:17:04 +00:00
|
|
|
{ "COMPRESS", -3 },
|
|
|
|
// The standard name is "PUNCT" but FractionalUCA.txt uses the long form.
|
2011-11-03 23:09:27 +00:00
|
|
|
{ "PUNCTUATION", UCOL_REORDER_CODE_PUNCTUATION },
|
|
|
|
{ "IMPLICIT", USCRIPT_HAN }, // Implicit weights are usually for Han characters. Han & unassigned share a lead byte.
|
|
|
|
{ "TRAILING", -2 }, // We do not reorder trailing weights (those after implicits).
|
|
|
|
{ "SPECIAL", -2 } // We must never reorder internal, special CE lead bytes.
|
|
|
|
};
|
|
|
|
|
|
|
|
int32_t getReorderCode(const char* name) {
|
2014-02-24 22:17:04 +00:00
|
|
|
int32_t code = CollationRuleParser::getReorderCode(name);
|
2011-11-03 23:09:27 +00:00
|
|
|
if (code >= 0) {
|
|
|
|
return code;
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2011-11-03 23:09:27 +00:00
|
|
|
for (int32_t i = 0; i < LENGTHOF(specialReorderTokens); ++i) {
|
|
|
|
if (0 == strcmp(name, specialReorderTokens[i].name)) {
|
|
|
|
return specialReorderTokens[i].code;
|
|
|
|
}
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2011-11-03 23:09:27 +00:00
|
|
|
return -1; // Same as UCHAR_INVALID_CODE or USCRIPT_INVALID_CODE.
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
// What readAnOption() does with the text that follows a directive name.
enum ActionType {
    // Parse a CE with parseCE() and store it as the directive's value.
    READCE,
    // Parse a weight of up to four bytes and store it as the value.
    READPRIMARY,
    // Parse a single weight byte and store it as the value.
    READBYTE,
    // Parse a list of code point ranges and pass them to the builder
    // as the Han ranges.
    READUNIFIEDIDEOGRAPH,
    // Parse the version string into UCAVersion.
    READUCAVERSION,
    // Parse a lead byte followed by its list of script names.
    READLEADBYTETOSCRIPTS,
    // Recognize the directive but do nothing with it.
    IGNORE
};
|
2002-07-19 21:59:26 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static struct {
|
|
|
|
const char *const name;
|
|
|
|
int64_t value;
|
|
|
|
const ActionType what_to_do;
|
|
|
|
} vt[] = {
|
|
|
|
{"[first tertiary ignorable", 0, IGNORE},
|
|
|
|
{"[last tertiary ignorable", 0, IGNORE},
|
|
|
|
{"[first secondary ignorable", 0, READCE},
|
|
|
|
{"[last secondary ignorable", 0, READCE},
|
|
|
|
{"[first primary ignorable", 0, READCE},
|
|
|
|
{"[last primary ignorable", 0, READCE},
|
|
|
|
{"[first variable", 0, READCE},
|
|
|
|
{"[last variable", 0, READCE},
|
|
|
|
{"[first regular", 0, READCE},
|
|
|
|
{"[last regular", 0, READCE},
|
|
|
|
{"[first implicit", 0, READCE},
|
|
|
|
{"[last implicit", 0, READCE},
|
|
|
|
{"[first trailing", 0, READCE},
|
|
|
|
{"[last trailing", 0, READCE},
|
|
|
|
|
|
|
|
{"[Unified_Ideograph", 0, READUNIFIEDIDEOGRAPH},
|
|
|
|
|
|
|
|
{"[fixed first implicit byte", 0, IGNORE},
|
|
|
|
{"[fixed last implicit byte", 0, IGNORE},
|
|
|
|
{"[fixed first trail byte", 0, IGNORE},
|
|
|
|
{"[fixed last trail byte", 0, IGNORE},
|
|
|
|
{"[fixed first special byte", 0, IGNORE},
|
|
|
|
{"[fixed last special byte", 0, IGNORE},
|
|
|
|
{"[fixed secondary common byte", 0, READBYTE},
|
|
|
|
{"[fixed last secondary common byte", 0, READBYTE},
|
|
|
|
{"[fixed first ignorable secondary byte", 0, READBYTE},
|
|
|
|
{"[fixed tertiary common byte", 0, READBYTE},
|
|
|
|
{"[fixed first ignorable tertiary byte", 0, READBYTE},
|
|
|
|
{"[variable top = ", 0, IGNORE},
|
|
|
|
{"[UCA version = ", 0, READUCAVERSION},
|
|
|
|
{"[top_byte", 0, READLEADBYTETOSCRIPTS},
|
|
|
|
{"[reorderingTokens", 0, IGNORE},
|
|
|
|
{"[categories", 0, IGNORE},
|
|
|
|
{"[first tertiary in secondary non-ignorable", 0, IGNORE},
|
|
|
|
{"[last tertiary in secondary non-ignorable", 0, IGNORE},
|
|
|
|
{"[first secondary in primary non-ignorable", 0, IGNORE},
|
|
|
|
{"[last secondary in primary non-ignorable", 0, IGNORE},
|
|
|
|
};
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static int64_t getOptionValue(const char *name) {
|
|
|
|
for (int32_t i = 0; i < LENGTHOF(vt); ++i) {
|
|
|
|
if(uprv_strcmp(name, vt[i].name) == 0) {
|
|
|
|
return vt[i].value;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
// Array of 256 strings indexed by primary lead byte, allocated by readAnOption()
// when the first [top_byte] line with scripts is read; NULL until then.
// Each string holds the reorder codes listed for that lead byte.
static UnicodeString *leadByteScripts = NULL;
|
2002-07-02 22:28:40 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
 * Handles one directive line, that is, a line that starts with '['.
 * The first vt entry whose name is a prefix of the line determines
 * what is done with the rest of the line; see ActionType.
 * A line that matches no entry only produces a warning.
 *
 * @param builder receives Han ranges and compressible lead bytes
 * @param buffer  the NUL-terminated line; it is modified temporarily
 *                while parsing but restored before returning
 * @param status  in/out error code; set on syntax errors and
 *                on memory allocation failure
 */
static void readAnOption(
        CollationBaseDataBuilder &builder, char *buffer, UErrorCode *status) {
    for (int32_t cnt = 0; cnt<LENGTHOF(vt); cnt++) {
        int32_t vtLen = (int32_t)uprv_strlen(vt[cnt].name);
        if(uprv_strncmp(buffer, vt[cnt].name, vtLen) != 0) {
            continue;
        }
        ActionType what_to_do = vt[cnt].what_to_do;
        char *pointer = skipWhiteSpace(buffer + vtLen);
        if (what_to_do == IGNORE) {
            return;
        } else if (what_to_do == READCE) {
            vt[cnt].value = parseCE(builder, pointer, *status);
            // parseCE() stops after the CE's own ']';
            // the directive's closing ']' must follow immediately.
            if(U_SUCCESS(*status) && *pointer != ']') {
                *status = U_INVALID_FORMAT_ERROR;
            }
            if(U_FAILURE(*status)) {
                fprintf(stderr, "Syntax error: unable to parse the CE from line '%s'\n", buffer);
                return;
            }
        } else if(what_to_do == READPRIMARY) {
            vt[cnt].value = parseWeight(pointer, "]", 4, *status);
            if(U_FAILURE(*status)) {
                fprintf(stderr, "Value of \"%s\" is not a primary weight\n", buffer);
                return;
            }
        } else if(what_to_do == READBYTE) {
            // parseWeight() left-aligns the weight; move the byte down.
            vt[cnt].value = parseWeight(pointer, "]", 1, *status) >> 24;
            if(U_FAILURE(*status)) {
                fprintf(stderr, "Value of \"%s\" is not a valid byte\n", buffer);
                return;
            }
        } else if(what_to_do == READUNIFIEDIDEOGRAPH) {
            UVector32 unihan(*status);
            if(U_FAILURE(*status)) { return; }
            for(;;) {
                if(*pointer == ']') { break; }
                if(*pointer == 0) {
                    // Missing ] after ranges.
                    *status = U_INVALID_FORMAT_ERROR;
                    return;
                }
                // Temporarily NUL-terminate the current range token.
                char *s = pointer;
                while(*s != ' ' && *s != '\t' && *s != ']' && *s != '\0') { ++s; }
                char c = *s;
                *s = 0;
                uint32_t start, end;
                u_parseCodePointRange(pointer, &start, &end, status);
                *s = c;
                if(U_FAILURE(*status)) {
                    fprintf(stderr, "Syntax error: unable to parse one of the ranges from line '%s'\n", buffer);
                    *status = U_INVALID_FORMAT_ERROR;
                    return;
                }
                unihan.addElement((UChar32)start, *status);
                unihan.addElement((UChar32)end, *status);
                pointer = skipWhiteSpace(s);
            }
            builder.initHanRanges(unihan.getBuffer(), unihan.size(), *status);
        } else if (what_to_do == READUCAVERSION) {
            u_versionFromString(UCAVersion, pointer);
            if(beVerbose) {
                char uca[U_MAX_VERSION_STRING_LENGTH];
                u_versionToString(UCAVersion, uca);
                printf("UCA version %s\n", uca);
            }
            UVersionInfo UCDVersion;
            u_getUnicodeVersion(UCDVersion);
            if (UCAVersion[0] != UCDVersion[0] || UCAVersion[1] != UCDVersion[1]) {
                char uca[U_MAX_VERSION_STRING_LENGTH];
                char ucd[U_MAX_VERSION_STRING_LENGTH];
                u_versionToString(UCAVersion, uca);
                u_versionToString(UCDVersion, ucd);
                // Warning, not error, to permit bootstrapping during a version upgrade.
                fprintf(stderr, "warning: UCA version %s != UCD version %s\n", uca, ucd);
            }
        } else if (what_to_do == READLEADBYTETOSCRIPTS) {
            // The lead byte must be exactly two hex digits.
            // Reject anything else rather than computing an index
            // outside of the 256-entry leadByteScripts array.
            int32_t highNibble = hex2num(pointer[0]);
            int32_t lowNibble = highNibble >= 0 ? hex2num(pointer[1]) : -1;
            if(lowNibble < 0) {
                fprintf(stderr, "Syntax error: unable to parse the lead byte from line '%s'\n", buffer);
                *status = U_INVALID_FORMAT_ERROR;
                return;
            }
            pointer += 2;
            uint16_t leadByte = (uint16_t)((highNibble << 4) | lowNibble);

            if(0xe0 <= leadByte && leadByte < Collation::UNASSIGNED_IMPLICIT_BYTE) {
                // Extend the Hani range to the end of what this implementation uses.
                // FractionalUCA.txt assumes a different algorithm for implicit primary weights,
                // and different high-lead byte ranges.
                if(leadByteScripts == NULL) {
                    // No earlier [top_byte] line has stored any scripts,
                    // so there is nothing to copy from lead byte DF.
                    fprintf(stderr, "Syntax error: no scripts for lead byte DF before line '%s'\n", buffer);
                    *status = U_INVALID_FORMAT_ERROR;
                    return;
                }
                leadByteScripts[leadByte] = leadByteScripts[0xdf];
                return;
            }

            UnicodeString scripts;
            for(;;) {
                pointer = skipWhiteSpace(pointer);
                if (*pointer == ']') {
                    break;
                }
                const char *scriptName = pointer;
                char c;
                while((c = *pointer) != 0 && c != ' ' && c != '\t' && c != ']') { ++pointer; }
                if(c == 0) {
                    fprintf(stderr, "Syntax error: unterminated list of scripts: '%s'\n", buffer);
                    *status = U_INVALID_FORMAT_ERROR;
                    return;
                }
                // Temporarily NUL-terminate the script name for the lookup.
                *pointer = 0;
                int32_t reorderCode = getReorderCode(scriptName);
                *pointer = c;
                if (reorderCode == -3) {  // COMPRESS
                    builder.setCompressibleLeadByte(leadByte);
                    continue;
                }
                if (reorderCode == -2) {
                    continue;  // Ignore "TERMINATOR" etc.
                }
                if (reorderCode < 0 || 0xffff < reorderCode) {
                    fprintf(stderr, "Syntax error: unable to parse reorder code from '%s'\n", scriptName);
                    *status = U_INVALID_FORMAT_ERROR;
                    return;
                }
                scripts.append((UChar)reorderCode);
            }
            if(!scripts.isEmpty()) {
                if(leadByteScripts == NULL) {
                    leadByteScripts = new UnicodeString[256];
                    if(leadByteScripts == NULL) {
                        *status = U_MEMORY_ALLOCATION_ERROR;
                        return;
                    }
                }
                leadByteScripts[leadByte] = scripts;
            }
        }
        return;
    }
    fprintf(stderr, "Warning: unrecognized option: %s\n", buffer);
}
|
|
|
|
|
|
|
|
/**
 * Reads one line from the input file and, if it is a mapping line of the form
 *     [prefix |] code points ; CE CE ... [# comment]
 * parses it into a prefix, a string and a list of CEs.
 * Empty lines and comment lines are skipped; directive lines
 * (starting with '[') are passed on to readAnOption().
 *
 * @param data      the open input file
 * @param builder   used for code point CE lookups and for directives
 * @param prefix    output: the context prefix; not modified if the line has none
 * @param s         output: the string that gets the CEs assigned
 * @param ces       output: the parsed CEs, at most 31
 * @param cesLength output: the number of CEs stored in ces
 * @param status    in/out error code; set on I/O and syntax errors
 * @return TRUE if a mapping was read and passed the weight byte checks;
 *         FALSE at the end of the file, for lines without a usable mapping,
 *         and on errors (check status)
 */
static UBool
readAnElement(FILE *data,
              CollationBaseDataBuilder &builder,
              UnicodeString &prefix, UnicodeString &s,
              int64_t ces[32], int32_t &cesLength,
              UErrorCode *status) {
    if(U_FAILURE(*status)) {
        return FALSE;
    }
    char buffer[2048];
    char *result = fgets(buffer, (int)sizeof(buffer), data);
    if(result == NULL) {
        if(feof(data)) {
            return FALSE;
        } else {
            fprintf(stderr, "empty line but no EOF!\n");
            *status = U_INVALID_FORMAT_ERROR;
            return FALSE;
        }
    }
    int32_t buflen = (int32_t)uprv_strlen(buffer);
    if(buflen == (int32_t)sizeof(buffer) - 1 && buffer[buflen - 1] != '\n') {
        // fgets() filled the whole buffer without reaching the end of the line.
        // Unless this happens to be the very end of the file, the rest of the
        // line would be misread as a separate line, so report it instead.
        if(fgetc(data) != EOF) {
            fprintf(stderr, "error - line too long for the %d-byte line buffer\n",
                    (int)sizeof(buffer));
            *status = U_INVALID_FORMAT_ERROR;
            return FALSE;
        }
    }
    while(buflen>0 && (buffer[buflen-1] == '\r' || buffer[buflen-1] == '\n')) {
        buffer[--buflen] = 0;
    }

    if(buffer[0] == 0 || buffer[0] == '#') {
        return FALSE; // just a comment, skip whole line
    }

    // Directives.
    if(buffer[0] == '[') {
        readAnOption(builder, buffer, status);
        return FALSE;
    }

    // The code points end at the first semicolon; cut the line there.
    char *startCodePoint = buffer;
    char *endCodePoint = strchr(startCodePoint, ';');
    if(endCodePoint == NULL) {
        fprintf(stderr, "error - line with no code point!\n");
        *status = U_INVALID_FORMAT_ERROR; /* No code point - could be an error, but probably only an empty line */
        return FALSE;
    }
    *endCodePoint = 0;

    char *pipePointer = strchr(buffer, '|');
    if (pipePointer != NULL) {
        // Read the prefix string which precedes the actual string.
        *pipePointer = 0;
        UChar *prefixChars = prefix.getBuffer(32);
        int32_t prefixSize =
            u_parseString(startCodePoint,
                          prefixChars, prefix.getCapacity(),
                          NULL, status);
        if(U_FAILURE(*status)) {
            prefix.releaseBuffer(0);
            fprintf(stderr, "error - parsing of prefix \"%s\" failed: %s\n",
                    startCodePoint, u_errorName(*status));
            *status = U_INVALID_FORMAT_ERROR;
            return FALSE;
        }
        prefix.releaseBuffer(prefixSize);
        startCodePoint = pipePointer + 1;
    }

    // Read the string which gets the CE(s) assigned.
    UChar *uchars = s.getBuffer(32);
    int32_t cSize =
        u_parseString(startCodePoint,
                      uchars, s.getCapacity(),
                      NULL, status);
    if(U_FAILURE(*status)) {
        s.releaseBuffer(0);
        fprintf(stderr, "error - parsing of code point(s) \"%s\" failed: %s\n",
                startCodePoint, u_errorName(*status));
        *status = U_INVALID_FORMAT_ERROR;
        return FALSE;
    }
    s.releaseBuffer(cSize);

    // The CEs follow the semicolon and end at a '#' comment or at the line end.
    char *pointer = endCodePoint + 1;

    char *commentStart = strchr(pointer, '#');
    if(commentStart == NULL) {
        commentStart = strchr(pointer, 0);
    }

    cesLength = 0;
    for(;;) {
        pointer = skipWhiteSpace(pointer);
        if(pointer == commentStart) {
            break;
        }
        if(cesLength >= 31) {
            fprintf(stderr, "Error: Too many CEs on line '%s'\n", buffer);
            *status = U_INVALID_FORMAT_ERROR;
            return FALSE;
        }
        ces[cesLength++] = parseCE(builder, pointer, *status);
        if(U_FAILURE(*status)) {
            fprintf(stderr, "Syntax error parsing CE from line '%s' - %s\n",
                    buffer, u_errorName(*status));
            return FALSE;
        }
    }

    if(s.length() == 1 && s[0] == 0xfffe) {
        // UCA 6.0 gives U+FFFE a special minimum weight using the
        // byte 02 which is the merge-sort-key separator and illegal for any
        // other characters.
    } else {
        // Rudimentary check for valid bytes in CE weights.
        // For a more comprehensive check see CollationTest::TestRootElements(),
        // intltest collate/CollationTest/TestRootElements
        for (int32_t i = 0; i < cesLength; ++i) {
            int64_t ce = ces[i];
            UBool isCompressible = FALSE;
            // Walk the CE from its most significant byte (j==7) down.
            for (int j = 7; j >= 0; --j) {
                uint8_t b = (uint8_t)(ce >> (j * 8));
                if(j <= 1) { b &= 0x3f; }  // tertiary bytes use 6 bits
                if (b == 1) {
                    fprintf(stderr, "Warning: invalid UCA weight byte 01 for %s\n", buffer);
                    return FALSE;
                }
                if ((j == 7 || j == 3 || j == 1) && b == 2) {
                    fprintf(stderr, "Warning: invalid UCA weight lead byte 02 for %s\n", buffer);
                    return FALSE;
                }
                if (j == 7) {
                    isCompressible = builder.isCompressibleLeadByte(b);
                } else if (j == 6) {
                    // Primary second bytes 03 and FF are compression terminators.
                    // 02, 03 and FF are usable when the lead byte is not compressible.
                    // 02 is unusable and 03 is the low compression terminator when the lead byte is compressible.
                    if (isCompressible && (b <= 3 || b == 0xff)) {
                        fprintf(stderr, "Warning: invalid UCA primary second weight byte %02X for %s\n",
                                b, buffer);
                        return FALSE;
                    }
                }
            }
        }
    }

    return TRUE;
}
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static void
|
|
|
|
parseFractionalUCA(const char *filename,
|
|
|
|
CollationBaseDataBuilder &builder,
|
|
|
|
UErrorCode *status)
|
2001-02-23 01:21:38 +00:00
|
|
|
{
|
2014-02-24 22:17:04 +00:00
|
|
|
if(U_FAILURE(*status)) { return; }
|
|
|
|
FILE *data = fopen(filename, "r");
|
|
|
|
if(data == NULL) {
|
|
|
|
fprintf(stderr, "Couldn't open file: %s\n", filename);
|
|
|
|
*status = U_FILE_ACCESS_ERROR;
|
2001-02-22 21:18:29 +00:00
|
|
|
return;
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
uint32_t line = 0;
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
UChar32 maxCodePoint = 0;
|
|
|
|
while(!feof(data)) {
|
|
|
|
if(U_FAILURE(*status)) {
|
|
|
|
fprintf(stderr, "Something returned an error %i (%s) while processing line %u of %s. Exiting...\n",
|
|
|
|
*status, u_errorName(*status), (int)line, filename);
|
|
|
|
exit(*status);
|
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
line++;
|
2003-09-11 18:56:13 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
UnicodeString prefix;
|
|
|
|
UnicodeString s;
|
|
|
|
int64_t ces[32];
|
|
|
|
int32_t cesLength = 0;
|
|
|
|
if(readAnElement(data, builder, prefix, s, ces, cesLength, status)) {
|
|
|
|
// we have read the line, now do something sensible with the read data!
|
|
|
|
uint32_t p = (uint32_t)(ces[0] >> 32);
|
|
|
|
|
|
|
|
if(s.length() > 1 && s[0] == 0xFDD0) {
|
|
|
|
// FractionalUCA.txt contractions starting with U+FDD0
|
|
|
|
// are only entered into the inverse table,
|
|
|
|
// not into the normal collation data.
|
|
|
|
builder.addRootElements(ces, cesLength, *status);
|
|
|
|
if(s.length() == 2 && s[1] == 0x34 && cesLength == 1) {
|
|
|
|
// Lead byte for numeric sorting.
|
|
|
|
builder.setNumericPrimary(p);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
UChar32 c = s.char32At(0);
|
|
|
|
if(c > maxCodePoint) { maxCodePoint = c; }
|
|
|
|
|
|
|
|
// We ignore the CEs for U+FFFD..U+FFFF and for the unassigned first primary.
|
|
|
|
// CollationBaseDataBuilder::init() maps them to special CEs.
|
|
|
|
// Except for U+FFFE, these have higher primaries in v2 than in FractionalUCA.txt.
|
|
|
|
if(0xfffd <= c && c <= 0xffff) { continue; }
|
|
|
|
if(s.length() == 2 && s[0] == 0xFDD1 && s[1] == 0xFDD0) {
|
|
|
|
continue;
|
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
if(0xe0000000 <= p && p < 0xf0000000) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"Error: Unexpected mapping to an implicit or trailing primary"
|
|
|
|
" on line %u of %s.\n",
|
|
|
|
(int)line, filename);
|
|
|
|
exit(U_INVALID_FORMAT_ERROR);
|
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
builder.add(prefix, s, ces, cesLength, *status);
|
|
|
|
}
|
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
}
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
int32_t numRanges = 0;
|
|
|
|
int32_t numRangeCodePoints = 0;
|
|
|
|
UChar32 rangeFirst = U_SENTINEL;
|
|
|
|
UChar32 rangeLast = U_SENTINEL;
|
|
|
|
uint32_t rangeFirstPrimary = 0;
|
|
|
|
uint32_t rangeLastPrimary = 0;
|
|
|
|
int32_t rangeStep = -1;
|
|
|
|
|
|
|
|
// Detect ranges of characters in primary code point order,
|
|
|
|
// with 3-byte primaries and
|
|
|
|
// with consistent "step" differences between adjacent primaries.
|
|
|
|
// This relies on the FractionalUCA generator using the same primary-weight incrementation.
|
|
|
|
// Start at U+0180: No ranges for common Latin characters.
|
|
|
|
// Go one beyond maxCodePoint in case a range ends there.
|
|
|
|
for(UChar32 c = 0x180; c <= (maxCodePoint + 1); ++c) {
|
|
|
|
UBool action;
|
|
|
|
uint32_t p = builder.getLongPrimaryIfSingleCE(c);
|
|
|
|
if(p != 0) {
|
|
|
|
// p is a "long" (three-byte) primary.
|
|
|
|
if(rangeFirst >= 0 && c == (rangeLast + 1) && p > rangeLastPrimary) {
|
|
|
|
// Find the offset between the two primaries.
|
|
|
|
int32_t step = CollationBaseDataBuilder::diffThreeBytePrimaries(
|
|
|
|
rangeLastPrimary, p, builder.isCompressiblePrimary(p));
|
|
|
|
if(rangeFirst == rangeLast && step >= 2) {
|
|
|
|
// c == rangeFirst + 1, store the "step" between range primaries.
|
|
|
|
rangeStep = step;
|
|
|
|
rangeLast = c;
|
|
|
|
rangeLastPrimary = p;
|
|
|
|
action = 0; // continue range
|
|
|
|
} else if(rangeStep == step) {
|
|
|
|
// Continue the range with the same "step" difference.
|
|
|
|
rangeLast = c;
|
|
|
|
rangeLastPrimary = p;
|
|
|
|
action = 0; // continue range
|
|
|
|
} else {
|
|
|
|
action = 1; // maybe finish range, start a new one
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
action = 1; // maybe finish range, start a new one
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
action = -1; // maybe finish range, do not start a new one
|
|
|
|
}
|
|
|
|
if(action != 0 && rangeFirst >= 0) {
|
|
|
|
// Finish a range.
|
|
|
|
// Set offset CE32s for a long range, leave single CEs for a short range.
|
|
|
|
UBool didSetRange = builder.maybeSetPrimaryRange(
|
|
|
|
rangeFirst, rangeLast,
|
|
|
|
rangeFirstPrimary, rangeStep, *status);
|
|
|
|
if(U_FAILURE(*status)) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"failure setting code point order range U+%04lx..U+%04lx "
|
|
|
|
"%08lx..%08lx step %d - %s\n",
|
|
|
|
(long)rangeFirst, (long)rangeLast,
|
|
|
|
(long)rangeFirstPrimary, (long)rangeLastPrimary,
|
|
|
|
(int)rangeStep, u_errorName(*status));
|
|
|
|
} else if(didSetRange) {
|
|
|
|
int32_t rangeLength = rangeLast - rangeFirst + 1;
|
|
|
|
if(beVerbose) {
|
|
|
|
printf("* set code point order range U+%04lx..U+%04lx [%d] "
|
|
|
|
"%08lx..%08lx step %d\n",
|
|
|
|
(long)rangeFirst, (long)rangeLast,
|
|
|
|
(int)rangeLength,
|
|
|
|
(long)rangeFirstPrimary, (long)rangeLastPrimary,
|
|
|
|
(int)rangeStep);
|
|
|
|
}
|
|
|
|
++numRanges;
|
|
|
|
numRangeCodePoints += rangeLength;
|
|
|
|
}
|
|
|
|
rangeFirst = U_SENTINEL;
|
|
|
|
rangeStep = -1;
|
|
|
|
}
|
|
|
|
if(action > 0) {
|
|
|
|
// Start a new range.
|
|
|
|
rangeFirst = rangeLast = c;
|
|
|
|
rangeFirstPrimary = rangeLastPrimary = p;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
printf("** set %d ranges with %d code points\n", (int)numRanges, (int)numRangeCodePoints);
|
|
|
|
|
|
|
|
// Idea: Probably best to work in two passes.
|
|
|
|
// Pass 1 for reading all data, setting isCompressible flags (and reordering groups)
|
|
|
|
// and finding ranges.
|
|
|
|
// Then set the ranges in a newly initialized builder
|
|
|
|
// for optimal compression (makes sure that adjacent blocks can overlap easily).
|
|
|
|
// Then set all mappings outside the ranges.
|
|
|
|
//
|
|
|
|
// In the first pass, we could store mappings in a simple list,
|
|
|
|
// with single-character/single-long-primary-CE mappings in a UTrie2;
|
|
|
|
// or store the mappings in a temporary builder;
|
|
|
|
// or we could just parse the input file again in the second pass.
|
|
|
|
//
|
|
|
|
// Ideally set/copy U+0000..U+017F before setting anything else,
|
|
|
|
// then set default Han/Hangul, then set the ranges, then copy non-range mappings.
|
|
|
|
// It should be easy to copy mappings from an un-built builder to a new one.
|
|
|
|
// Add CollationDataBuilder::copyFrom(builder, code point, errorCode) -- copy contexts & expansions.
|
|
|
|
|
|
|
|
if(UCAVersion[0] == 0 && UCAVersion[1] == 0 && UCAVersion[2] == 0 && UCAVersion[3] == 0) {
|
|
|
|
fprintf(stderr, "UCA version not specified. Cannot create data file!\n");
|
|
|
|
fclose(data);
|
2001-02-22 21:18:29 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2012-01-10 19:36:20 +00:00
|
|
|
if (beVerbose) {
|
2014-02-24 22:17:04 +00:00
|
|
|
printf("\nLines read: %u\n", (int)line);
|
2002-01-04 23:48:46 +00:00
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
fclose(data);
|
2002-06-13 18:24:36 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
buildAndWriteBaseData(CollationBaseDataBuilder &builder,
|
|
|
|
const char *path, UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
|
|
|
|
if(getOptionValue("[fixed secondary common byte") != Collation::COMMON_BYTE) {
|
|
|
|
fprintf(stderr, "error: unexpected [fixed secondary common byte]");
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return;
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
if(getOptionValue("[fixed tertiary common byte") != Collation::COMMON_BYTE) {
|
|
|
|
fprintf(stderr, "error: unexpected [fixed tertiary common byte]");
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return;
|
2001-06-05 22:52:56 +00:00
|
|
|
}
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
if(leadByteScripts != NULL) {
|
|
|
|
uint32_t firstLead = Collation::MERGE_SEPARATOR_BYTE + 1;
|
|
|
|
do {
|
|
|
|
// Find the range of lead bytes with this set of scripts.
|
|
|
|
const UnicodeString &firstScripts = leadByteScripts[firstLead];
|
|
|
|
if(firstScripts.isEmpty()) {
|
|
|
|
fprintf(stderr, "[top_byte 0x%02X] has no reorderable scripts\n", (int)firstLead);
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
uint32_t lead = firstLead;
|
|
|
|
for(;;) {
|
|
|
|
++lead;
|
|
|
|
const UnicodeString &scripts = leadByteScripts[lead];
|
|
|
|
// The scripts should either be the same or disjoint.
|
|
|
|
// We do not test if all reordering groups have disjoint sets of scripts.
|
|
|
|
if(scripts.isEmpty() || firstScripts.indexOf(scripts[0]) < 0) { break; }
|
|
|
|
if(scripts != firstScripts) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"[top_byte 0x%02X] includes script %d from [top_byte 0x%02X] "
|
|
|
|
"but not all scripts match\n",
|
|
|
|
(int)firstLead, scripts[0], (int)lead);
|
|
|
|
errorCode = U_INVALID_FORMAT_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// lead is one greater than the last lead byte with the same set of scripts as firstLead.
|
|
|
|
builder.addReorderingGroup(firstLead, lead - 1, firstScripts, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
firstLead = lead;
|
|
|
|
} while(firstLead < Collation::UNASSIGNED_IMPLICIT_BYTE);
|
|
|
|
delete[] leadByteScripts;
|
|
|
|
}
|
|
|
|
|
|
|
|
CollationData data(*Normalizer2Factory::getNFCImpl(errorCode));
|
|
|
|
builder.enableFastLatin();
|
|
|
|
builder.build(data, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "builder.build() failed: %s\n",
|
|
|
|
u_errorName(errorCode));
|
|
|
|
return;
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
|
|
|
|
// The CollationSettings constructor gives us the properly encoded
|
|
|
|
// default options, so that we need not duplicate them here.
|
|
|
|
CollationSettings settings;
|
|
|
|
|
|
|
|
UVector32 rootElements(errorCode);
|
|
|
|
for(int32_t i = 0; i < CollationRootElements::IX_COUNT; ++i) {
|
|
|
|
rootElements.addElement(0, errorCode);
|
2010-10-28 19:05:02 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
builder.buildRootElementsTable(rootElements, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "builder.buildRootElementsTable() failed: %s\n",
|
|
|
|
u_errorName(errorCode));
|
2001-02-22 21:18:29 +00:00
|
|
|
return;
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
int32_t index = CollationRootElements::IX_COUNT;
|
|
|
|
rootElements.setElementAt(index, CollationRootElements::IX_FIRST_TERTIARY_INDEX);
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
while((rootElements.elementAti(index) & 0xffff0000) == 0) { ++index; }
|
|
|
|
rootElements.setElementAt(index, CollationRootElements::IX_FIRST_SECONDARY_INDEX);
|
2009-11-13 19:25:21 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
while((rootElements.elementAti(index) & CollationRootElements::SEC_TER_DELTA_FLAG) != 0) {
|
|
|
|
++index;
|
2006-03-28 07:40:46 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
rootElements.setElementAt(index, CollationRootElements::IX_FIRST_PRIMARY_INDEX);
|
|
|
|
|
|
|
|
rootElements.setElementAt(Collation::COMMON_SEC_AND_TER_CE,
|
|
|
|
CollationRootElements::IX_COMMON_SEC_AND_TER_CE);
|
|
|
|
|
|
|
|
int32_t secTerBoundaries = (int32_t)getOptionValue("[fixed last secondary common byte") << 24;
|
|
|
|
secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable secondary byte") << 16;
|
|
|
|
secTerBoundaries |= (int32_t)getOptionValue("[fixed first ignorable tertiary byte");
|
|
|
|
rootElements.setElementAt(secTerBoundaries, CollationRootElements::IX_SEC_TER_BOUNDARIES);
|
|
|
|
|
|
|
|
LocalMemory<uint8_t> buffer;
|
|
|
|
int32_t capacity = 1000000;
|
|
|
|
uint8_t *dest = buffer.allocateInsteadAndCopy(capacity);
|
|
|
|
if(dest == NULL) {
|
|
|
|
fprintf(stderr, "memory allocation (%ld bytes) for file contents failed\n",
|
|
|
|
(long)capacity);
|
|
|
|
errorCode = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
2004-11-11 23:34:58 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
|
|
|
|
int32_t totalSize = CollationDataWriter::writeBase(
|
|
|
|
data, settings,
|
|
|
|
rootElements.getBuffer(), rootElements.size(),
|
|
|
|
indexes, dest, capacity,
|
|
|
|
errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "CollationDataWriter::writeBase(capacity = %ld) failed: %s\n",
|
|
|
|
(long)capacity, u_errorName(errorCode));
|
|
|
|
return;
|
2004-11-11 23:34:58 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
printf("*** CLDR root collation part sizes ***\n");
|
|
|
|
CollationInfo::printSizes(totalSize, indexes);
|
|
|
|
printf("*** CLDR root collation size: %6ld (with file header but no copyright string)\n",
|
|
|
|
(long)totalSize + 32); // 32 bytes = DataHeader rounded up to 16-byte boundary
|
|
|
|
|
|
|
|
CollationTailoring::makeBaseVersion(UCAVersion, ucaDataInfo.dataVersion);
|
|
|
|
UNewDataMemory *pData=udata_create(path, "icu", "ucadata", &ucaDataInfo,
|
|
|
|
withCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "genuca: udata_create(%s, ucadata.icu) failed - %s\n",
|
|
|
|
path, u_errorName(errorCode));
|
|
|
|
return;
|
2002-07-17 03:56:50 +00:00
|
|
|
}
|
2001-09-20 20:16:39 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
udata_writeBlock(pData, dest, totalSize);
|
|
|
|
long dataLength = udata_finish(pData, &errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
fprintf(stderr, "genuca: error %s writing the output file\n", u_errorName(errorCode));
|
|
|
|
return;
|
2001-11-06 22:55:29 +00:00
|
|
|
}
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
if(dataLength != (long)totalSize) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"udata_finish(ucadata.icu) reports %ld bytes written but should be %ld\n",
|
|
|
|
dataLength, (long)totalSize);
|
|
|
|
errorCode=U_INTERNAL_PROGRAM_ERROR;
|
|
|
|
}
|
|
|
|
}
|
2001-11-06 22:55:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
|
|
|
|
* Adds each lead surrogate to the bmp set if any of the 1024
|
|
|
|
* associated supplementary code points is in the supp set.
|
|
|
|
* These can be one and the same set.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
setLeadSurrogatesForAssociatedSupplementary(UnicodeSet &bmp, const UnicodeSet &supp) {
|
|
|
|
UChar32 c = 0x10000;
|
|
|
|
for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
|
|
|
|
if(supp.containsSome(c, c + 0x3ff)) {
|
|
|
|
bmp.add(lead);
|
2002-07-02 22:28:40 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
}
|
|
|
|
}
|
2002-06-13 18:24:36 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static int32_t
|
|
|
|
makeBMPFoldedBitSet(const UnicodeSet &set, uint8_t index[0x800], uint32_t bits[256],
|
|
|
|
UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return 0; }
|
|
|
|
bits[0] = 0; // no bits set
|
|
|
|
bits[1] = 0xffffffff; // all bits set
|
|
|
|
int32_t bitsLength = 2;
|
|
|
|
int32_t i = 0;
|
|
|
|
for(UChar32 c = 0; c <= 0xffff; c += 0x20, ++i) {
|
|
|
|
if(set.containsNone(c, c + 0x1f)) {
|
|
|
|
index[i] = 0;
|
|
|
|
} else if(set.contains(c, c + 0x1f)) {
|
|
|
|
index[i] = 1;
|
|
|
|
} else {
|
|
|
|
uint32_t b = 0;
|
|
|
|
for(int32_t j = 0; j <= 0x1f; ++j) {
|
|
|
|
if(set.contains(c + j)) {
|
|
|
|
b |= (uint32_t)1 << j;
|
2001-11-06 22:55:29 +00:00
|
|
|
}
|
2001-06-05 22:52:56 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
int32_t k;
|
|
|
|
for(k = 2;; ++k) {
|
|
|
|
if(k == bitsLength) {
|
|
|
|
// new bit combination
|
|
|
|
if(bitsLength == 256) {
|
|
|
|
errorCode = U_BUFFER_OVERFLOW_ERROR;
|
|
|
|
return 0;
|
2009-11-13 19:25:21 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
bits[bitsLength++] = b;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(bits[k] == b) {
|
|
|
|
// duplicate bit combination
|
|
|
|
break;
|
2008-04-04 22:47:43 +00:00
|
|
|
}
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
index[i] = k;
|
2001-02-22 21:18:29 +00:00
|
|
|
}
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
return bitsLength;
|
|
|
|
}
|
2001-02-22 21:18:29 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
// TODO: Make preparseucd.py write fcd_data.h mapping code point ranges to FCD16 values,
|
|
|
|
// use that rather than properties APIs.
|
|
|
|
// Then consider moving related logic for the unsafeBwdSet back from the loader into this builder.
|
2001-11-10 00:13:03 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
/**
|
|
|
|
* Builds data for the FCD check fast path.
|
|
|
|
* For details see the CollationFCD class comments.
|
|
|
|
*/
|
|
|
|
static void
|
|
|
|
buildAndWriteFCDData(const char *path, UErrorCode &errorCode) {
|
|
|
|
UnicodeSet lcccSet(UNICODE_STRING_SIMPLE("[[:^lccc=0:][\\udc00-\\udfff]]"), errorCode);
|
|
|
|
UnicodeSet tcccSet(UNICODE_STRING_SIMPLE("[:^tccc=0:]"), errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
setLeadSurrogatesForAssociatedSupplementary(tcccSet, tcccSet);
|
|
|
|
// The following supp(lccc)->lead(tccc) should be unnecessary
|
|
|
|
// after the previous supp(tccc)->lead(tccc)
|
|
|
|
// because there should not be any characters with lccc!=0 and tccc=0.
|
|
|
|
// It is safe and harmless.
|
|
|
|
setLeadSurrogatesForAssociatedSupplementary(tcccSet, lcccSet);
|
|
|
|
setLeadSurrogatesForAssociatedSupplementary(lcccSet, lcccSet);
|
|
|
|
uint8_t lcccIndex[0x800], tcccIndex[0x800];
|
|
|
|
uint32_t lcccBits[256], tcccBits[256];
|
|
|
|
int32_t lcccBitsLength = makeBMPFoldedBitSet(lcccSet, lcccIndex, lcccBits, errorCode);
|
|
|
|
int32_t tcccBitsLength = makeBMPFoldedBitSet(tcccSet, tcccIndex, tcccBits, errorCode);
|
|
|
|
printf("@@@ lcccBitsLength=%d -> %d bytes\n", lcccBitsLength, 0x800 + lcccBitsLength * 4);
|
|
|
|
printf("@@@ tcccBitsLength=%d -> %d bytes\n", tcccBitsLength, 0x800 + tcccBitsLength * 4);
|
|
|
|
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
|
|
|
|
FILE *f=usrc_create(path, "collationfcd.cpp",
|
|
|
|
"icu/tools/unicode/c/genuca/genuca.cpp");
|
|
|
|
if(f==NULL) {
|
|
|
|
errorCode=U_FILE_ACCESS_ERROR;
|
|
|
|
return;
|
2004-01-16 07:12:35 +00:00
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
fputs("#include \"unicode/utypes.h\"\n\n", f);
|
|
|
|
fputs("#if !UCONFIG_NO_COLLATION\n\n", f);
|
|
|
|
fputs("#include \"collationfcd.h\"\n\n", f);
|
|
|
|
fputs("U_NAMESPACE_BEGIN\n\n", f);
|
|
|
|
usrc_writeArray(f,
|
|
|
|
"const uint8_t CollationFCD::lcccIndex[%ld]={\n",
|
|
|
|
lcccIndex, 8, 0x800,
|
|
|
|
"\n};\n\n");
|
|
|
|
usrc_writeArray(f,
|
|
|
|
"const uint32_t CollationFCD::lcccBits[%ld]={\n",
|
|
|
|
lcccBits, 32, lcccBitsLength,
|
|
|
|
"\n};\n\n");
|
|
|
|
usrc_writeArray(f,
|
|
|
|
"const uint8_t CollationFCD::tcccIndex[%ld]={\n",
|
|
|
|
tcccIndex, 8, 0x800,
|
|
|
|
"\n};\n\n");
|
|
|
|
usrc_writeArray(f,
|
|
|
|
"const uint32_t CollationFCD::tcccBits[%ld]={\n",
|
|
|
|
tcccBits, 32, tcccBitsLength,
|
|
|
|
"\n};\n\n");
|
|
|
|
fputs("U_NAMESPACE_END\n\n", f);
|
|
|
|
fputs("#endif // !UCONFIG_NO_COLLATION\n", f);
|
|
|
|
fclose(f);
|
|
|
|
}
|
2004-01-16 07:12:35 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
static void
|
|
|
|
parseAndWriteCollationRootData(
|
|
|
|
const char *fracUCAPath,
|
|
|
|
const char *binaryDataPath,
|
|
|
|
const char *sourceCodePath,
|
|
|
|
UErrorCode &errorCode) {
|
|
|
|
if(U_FAILURE(errorCode)) { return; }
|
|
|
|
CollationBaseDataBuilder builder(errorCode);
|
|
|
|
builder.init(errorCode);
|
|
|
|
parseFractionalUCA(fracUCAPath, builder, &errorCode);
|
|
|
|
buildAndWriteBaseData(builder, binaryDataPath, errorCode);
|
|
|
|
buildAndWriteFCDData(sourceCodePath, errorCode);
|
2001-02-22 21:18:29 +00:00
|
|
|
}
|
2001-02-23 01:21:38 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
// ------------------------------------------------------------------------- ***
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2012-01-10 19:36:20 +00:00
|
|
|
enum {
|
|
|
|
HELP_H,
|
|
|
|
HELP_QUESTION_MARK,
|
|
|
|
VERBOSE,
|
2014-02-24 22:17:04 +00:00
|
|
|
COPYRIGHT
|
2012-01-10 19:36:20 +00:00
|
|
|
};
|
|
|
|
|
2001-02-23 01:21:38 +00:00
|
|
|
static UOption options[]={
|
2012-01-10 19:36:20 +00:00
|
|
|
UOPTION_HELP_H,
|
|
|
|
UOPTION_HELP_QUESTION_MARK,
|
|
|
|
UOPTION_VERBOSE,
|
2014-02-24 22:17:04 +00:00
|
|
|
UOPTION_COPYRIGHT
|
2001-02-23 01:21:38 +00:00
|
|
|
};
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
extern "C" int
|
|
|
|
main(int argc, char* argv[]) {
|
2002-03-15 23:41:40 +00:00
|
|
|
U_MAIN_INIT_ARGS(argc, argv);
|
2014-02-24 22:17:04 +00:00
|
|
|
|
2012-01-10 19:36:20 +00:00
|
|
|
argc=u_parseArgs(argc, argv, LENGTHOF(options), options);
|
2001-02-23 01:21:38 +00:00
|
|
|
|
|
|
|
/* error handling, printing usage message */
|
|
|
|
if(argc<0) {
|
|
|
|
fprintf(stderr,
|
|
|
|
"error in command line argument \"%s\"\n",
|
|
|
|
argv[-argc]);
|
|
|
|
}
|
2014-02-24 22:17:04 +00:00
|
|
|
if( argc<2 ||
|
|
|
|
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
|
|
|
|
) {
|
|
|
|
/*
|
|
|
|
* Broken into chunks because the C89 standard says the minimum
|
|
|
|
* required supported string length is 509 bytes.
|
|
|
|
*/
|
|
|
|
fprintf(stderr,
|
|
|
|
"Usage: %s [-options] path/to/ICU/src/root\n"
|
|
|
|
"\n"
|
|
|
|
"Reads path/to/ICU/src/root/source/data/unidata/FractionalUCA.txt and\n"
|
|
|
|
"writes source and binary data files with the collation root data.\n"
|
|
|
|
"\n",
|
|
|
|
argv[0]);
|
2001-02-23 01:21:38 +00:00
|
|
|
fprintf(stderr,
|
2014-02-24 22:17:04 +00:00
|
|
|
"Options:\n"
|
2002-03-15 23:41:40 +00:00
|
|
|
"\t-h or -? or --help this usage text\n"
|
2014-02-24 22:17:04 +00:00
|
|
|
"\t-v or --verbose verbose output\n"
|
|
|
|
"\t-c or --copyright include a copyright notice\n");
|
|
|
|
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
2001-02-23 01:21:38 +00:00
|
|
|
}
|
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
beVerbose=options[VERBOSE].doesOccur;
|
|
|
|
withCopyright=options[COPYRIGHT].doesOccur;
|
2001-02-23 19:10:28 +00:00
|
|
|
|
2012-01-10 19:36:20 +00:00
|
|
|
IcuToolErrorCode errorCode("genuca");
|
2003-08-15 01:26:22 +00:00
|
|
|
|
2012-01-10 19:36:20 +00:00
|
|
|
CharString icuSrcRoot(argv[1], errorCode);
|
2002-03-15 23:41:40 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
CharString icuSource(icuSrcRoot, errorCode);
|
|
|
|
icuSource.appendPathPart("source", errorCode);
|
2001-02-23 01:21:38 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
CharString icuSourceData(icuSource, errorCode);
|
|
|
|
icuSourceData.appendPathPart("data", errorCode);
|
2012-01-10 19:36:20 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
CharString fracUCAPath(icuSourceData, errorCode);
|
|
|
|
fracUCAPath.appendPathPart("unidata", errorCode);
|
|
|
|
fracUCAPath.appendPathPart("FractionalUCA.txt", errorCode);
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
CharString sourceDataInColl(icuSourceData, errorCode);
|
|
|
|
sourceDataInColl.appendPathPart("in", errorCode);
|
|
|
|
sourceDataInColl.appendPathPart("coll", errorCode);
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
CharString sourceI18n(icuSource, errorCode);
|
|
|
|
sourceI18n.appendPathPart("i18n", errorCode);
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
errorCode.assertSuccess();
|
2002-10-01 17:44:04 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
parseAndWriteCollationRootData(
|
|
|
|
fracUCAPath.data(),
|
|
|
|
sourceDataInColl.data(),
|
|
|
|
sourceI18n.data(),
|
|
|
|
errorCode);
|
2001-02-23 01:21:38 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
return errorCode;
|
2002-10-01 17:44:04 +00:00
|
|
|
}
|
2002-10-01 01:26:49 +00:00
|
|
|
|
2014-02-24 22:17:04 +00:00
|
|
|
#endif // UCONFIG_NO_COLLATION
|