ICU-2350 implement CASE_SENSITIVE via uprops.icu data
X-SVN-Rev: 11275
This commit is contained in:
parent
41490cb19a
commit
84adae7885
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Copyright (C) 1999-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -13,8 +13,8 @@
|
||||
* created on: 1999dec08
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This program reads the Unicode character database text file,
|
||||
* parses it, and extracts most of the properties for each character.
|
||||
* This program reads several of the Unicode character database text files,
|
||||
* parses them, and extracts most of the properties for each character.
|
||||
* It then writes a binary file containing the properties
|
||||
* that is designed to be used directly for random-access to
|
||||
* the properties of each Unicode character.
|
||||
@ -24,6 +24,7 @@
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
@ -41,6 +42,14 @@ U_CDECL_END
|
||||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/*
|
||||
* Unicode set collecting the case-sensitive characters;
|
||||
* see uchar.h UCHAR_CASE_SENSITIVE.
|
||||
* Add code points from case mappings/foldings in
|
||||
* the root locale and with default options.
|
||||
*/
|
||||
static USet *caseSensitive;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
@ -79,7 +88,7 @@ main(int argc, char* argv[]) {
|
||||
/* preset then read command line options */
|
||||
options[4].value=u_getDataDirectory();
|
||||
options[5].value="";
|
||||
options[6].value="3.0.0";
|
||||
options[6].value="";
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
@ -127,7 +136,10 @@ main(int argc, char* argv[]) {
|
||||
suffix=NULL;
|
||||
}
|
||||
|
||||
setUnicodeVersion(options[6].value);
|
||||
if(options[6].doesOccur) {
|
||||
setUnicodeVersion(options[6].value);
|
||||
}
|
||||
/* else use the default dataVersion in store.c */
|
||||
|
||||
/* prepare the filename beginning with the source dir */
|
||||
uprv_strcpy(filename, srcDir);
|
||||
@ -138,6 +150,7 @@ main(int argc, char* argv[]) {
|
||||
|
||||
/* initialize */
|
||||
initStore();
|
||||
caseSensitive=uset_open(1, 0); /* empty set (start>end) */
|
||||
|
||||
/* process BidiMirroring.txt */
|
||||
writeUCDFilename(basename, "BidiMirroring", suffix);
|
||||
@ -232,6 +245,18 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
static void
|
||||
_set_addAll(USet *set, const UChar *s, int32_t length) {
|
||||
UChar32 c;
|
||||
int32_t i;
|
||||
|
||||
/* needs length>=0 */
|
||||
for(i=0; i<length; /* U16_NEXT advances i */) {
|
||||
U16_NEXT(s, i, length, c);
|
||||
uset_add(set, c);
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for BidiMirroring.txt --------------------------------------------- */
|
||||
|
||||
#define MAX_MIRROR_COUNT 2000
|
||||
@ -311,7 +336,7 @@ specialCasingLineFn(void *context,
|
||||
}
|
||||
|
||||
/* is this a complex mapping? */
|
||||
if(*u_skipWhitespace(fields[4][0])!=0) {
|
||||
if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') {
|
||||
/* there is some condition text in the fifth field */
|
||||
specialCasings[specialCasingCount].isComplex=TRUE;
|
||||
|
||||
@ -332,6 +357,11 @@ specialCasingLineFn(void *context,
|
||||
fprintf(stderr, "genprops: error parsing special casing at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]);
|
||||
_set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]);
|
||||
}
|
||||
|
||||
if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) {
|
||||
@ -381,6 +411,14 @@ parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) {
|
||||
qsort(specialCasings, specialCasingCount, sizeof(SpecialCasing), compareSpecialCasings);
|
||||
specialCasingCount-=j;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add one complex mapping to caseSensitive that was filtered out above:
|
||||
* Greek final Sigma has a conditional mapping but not locale-sensitive,
|
||||
* and it is taken when lowercasing just U+03A3 alone.
|
||||
* 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||||
*/
|
||||
uset_add(caseSensitive, 0x3c2);
|
||||
}
|
||||
|
||||
/* parser for CaseFolding.txt ----------------------------------------------- */
|
||||
@ -434,6 +472,12 @@ caseFoldingLineFn(void *context,
|
||||
caseFoldings[caseFoldingCount].simple=0;
|
||||
}
|
||||
|
||||
/* update the case-sensitive set */
|
||||
if(status!='T') {
|
||||
uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code);
|
||||
_set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]);
|
||||
}
|
||||
|
||||
/* check the status */
|
||||
if(status=='S') {
|
||||
/* check if there was a full mapping for this code point before */
|
||||
@ -720,7 +764,11 @@ unicodeDataLineFn(void *context,
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
p.upperCase=value;
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.upperCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* get lowercase value, field 13 */
|
||||
value=(uint32_t)uprv_strtoul(fields[13][0], &end, 16);
|
||||
@ -730,7 +778,11 @@ unicodeDataLineFn(void *context,
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
p.lowerCase=value;
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.lowerCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* get titlecase value, field 14 */
|
||||
value=(uint32_t)uprv_strtoul(fields[14][0], &end, 16);
|
||||
@ -740,7 +792,11 @@ unicodeDataLineFn(void *context,
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
p.titleCase=value;
|
||||
if(value!=0 && value!=p.code) {
|
||||
p.titleCase=value;
|
||||
uset_add(caseSensitive, (UChar32)p.code);
|
||||
uset_add(caseSensitive, (UChar32)value);
|
||||
}
|
||||
|
||||
/* set additional properties from previously parsed files */
|
||||
if(mirrorIndex<mirrorCount && p.code==mirrorMappings[mirrorIndex][0]) {
|
||||
@ -899,6 +955,7 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
|
||||
};
|
||||
|
||||
char *fields[15][2];
|
||||
UChar32 start, end;
|
||||
uint32_t prev;
|
||||
int32_t i;
|
||||
|
||||
@ -950,6 +1007,20 @@ parseDB(const char *filename, UErrorCode *pErrorCode) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
for(i=0;
|
||||
0==uset_getItem(caseSensitive, i, &start, &end, NULL, 0, pErrorCode) && U_SUCCESS(*pErrorCode);
|
||||
++i
|
||||
) {
|
||||
addCaseSensitive(start, end);
|
||||
}
|
||||
if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) {
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Copyright (C) 1999-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -89,6 +89,9 @@ repeatProps(uint32_t first, uint32_t last, uint32_t props);
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
getFoldedPropsValue(UNewTrie *trie, UChar32 start, int32_t offset);
|
||||
|
||||
extern void
|
||||
addCaseSensitive(UChar32 first, UChar32 last);
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir);
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Copyright (C) 1999-2003, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -610,6 +610,21 @@ addProps(uint32_t c, uint32_t x) {
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
addCaseSensitive(UChar32 first, UChar32 last) {
|
||||
uint32_t x, cs;
|
||||
|
||||
cs=U_MASK(UPROPS_CASE_SENSITIVE_SHIFT);
|
||||
while(first<=last) {
|
||||
x=utrie_get32(pTrie, first, NULL);
|
||||
if(!utrie_set32(pTrie, first, x|cs)) {
|
||||
fprintf(stderr, "error: too many entries for the properties trie\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
++first;
|
||||
}
|
||||
}
|
||||
|
||||
/* areas of same properties ------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
|
Loading…
Reference in New Issue
Block a user