From f712c8bc001887b6a4b754c19cd98103c63aca6c Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Tue, 10 Jan 2012 07:23:44 +0000 Subject: [PATCH] ICU-8972 merge gencase into genprops X-SVN-Rev: 31189 --- .gitignore | 16 - tools/unicode/c/CMakeLists.txt | 1 - tools/unicode/c/gencase/CMakeLists.txt | 9 - tools/unicode/c/gencase/Makefile.in | 95 -- tools/unicode/c/gencase/gencase.c | 865 ------------ tools/unicode/c/gencase/gencase.h | 132 -- tools/unicode/c/gencase/gencase.vcproj | 422 ------ tools/unicode/c/gencase/store.c | 1201 ---------------- tools/unicode/c/genprops/CMakeLists.txt | 2 +- tools/unicode/c/genprops/bidipropsbuilder.cpp | 13 +- tools/unicode/c/genprops/casepropsbuilder.cpp | 1217 +++++++++++++++++ tools/unicode/c/genprops/corepropsbuilder.cpp | 9 +- tools/unicode/c/genprops/genprops.cpp | 6 + tools/unicode/c/genprops/genprops.h | 1 + .../unicode/c/genprops/namespropsbuilder.cpp | 10 +- tools/unicode/c/genprops/pnamesbuilder.cpp | 8 +- 16 files changed, 1243 insertions(+), 2764 deletions(-) delete mode 100644 tools/unicode/c/gencase/CMakeLists.txt delete mode 100644 tools/unicode/c/gencase/Makefile.in delete mode 100644 tools/unicode/c/gencase/gencase.c delete mode 100644 tools/unicode/c/gencase/gencase.h delete mode 100644 tools/unicode/c/gencase/gencase.vcproj delete mode 100644 tools/unicode/c/gencase/store.c create mode 100644 tools/unicode/c/genprops/casepropsbuilder.cpp diff --git a/.gitignore b/.gitignore index 27144532fc..8244b96320 100644 --- a/.gitignore +++ b/.gitignore @@ -964,22 +964,6 @@ tools/trac/IcuCodeTools/0.11/icucodetools/*.pyc tools/trac/IcuCodeTools/0.12/*.egg-info tools/trac/IcuCodeTools/0.12/build tools/trac/IcuCodeTools/0.12/icucodetools/*.pyc -tools/unicode/c/gencase/*.d -tools/unicode/c/gencase/*.ncb -tools/unicode/c/gencase/*.o -tools/unicode/c/gencase/*.opt -tools/unicode/c/gencase/*.pdb -tools/unicode/c/gencase/*.plg -tools/unicode/c/gencase/Debug -tools/unicode/c/gencase/Makefile -tools/unicode/c/gencase/Release -tools/unicode/c/gencase/debug -tools/unicode/c/gencase/gencase -tools/unicode/c/gencase/gencase.[0-9] -tools/unicode/c/gencase/gencase.vcproj.*.*.user -tools/unicode/c/gencase/release -tools/unicode/c/gencase/x64 -tools/unicode/c/gencase/x86 tools/unicode/c/genprops/*.d tools/unicode/c/genprops/*.ncb tools/unicode/c/genprops/*.o diff --git a/tools/unicode/c/CMakeLists.txt b/tools/unicode/c/CMakeLists.txt index 155d631159..87f692bf82 100644 --- a/tools/unicode/c/CMakeLists.txt +++ b/tools/unicode/c/CMakeLists.txt @@ -17,7 +17,6 @@ include_directories( ${ICU_SRC_DIR}/source/i18n ${ICU_SRC_DIR}/source/tools/toolutil) link_directories(${ICU_INST_DIR}/lib) -add_subdirectory(gencase) add_subdirectory(genprops) add_subdirectory(genuca) add_subdirectory(genuts46) diff --git a/tools/unicode/c/gencase/CMakeLists.txt b/tools/unicode/c/gencase/CMakeLists.txt deleted file mode 100644 index 0f54f65fb4..0000000000 --- a/tools/unicode/c/gencase/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (C) 2010, International Business Machines -# Corporation and others. All Rights Reserved. -# -# created on: 2010jun03 -# created by: Markus W. Scherer -# edited on: 2010jul20 -# edited by: Stuart G. Gill -add_executable(gencase gencase.c store.c) -target_link_libraries(gencase icuuc icutu) diff --git a/tools/unicode/c/gencase/Makefile.in b/tools/unicode/c/gencase/Makefile.in deleted file mode 100644 index 7ab1d619dd..0000000000 --- a/tools/unicode/c/gencase/Makefile.in +++ /dev/null @@ -1,95 +0,0 @@ -## Makefile.in for ICU - tools/gencase -## Copyright (c) 1999-2005, International Business Machines Corporation and -## others. All Rights Reserved. -## Steven R. Loomis - -## Source directory information -srcdir = @srcdir@ -top_srcdir = @top_srcdir@ - -top_builddir = ../.. - -include $(top_builddir)/icudefs.mk - -## Build directory information -subdir = tools/gencase - -TARGET_STUB_NAME = gencase - -SECTION = 8 - -#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION) - - -## Extra files to remove for 'make clean' -CLEANFILES = *~ $(DEPS) $(MAN_FILES) - -## Target information -TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT) - -ifneq ($(top_builddir),$(top_srcdir)) -CPPFLAGS += -I$(top_builddir)/common -endif -CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil -LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M) - -OBJECTS = gencase.o store.o - -DEPS = $(OBJECTS:.o=.d) - -## List of phony targets -.PHONY : all all-local install install-local clean clean-local \ -distclean distclean-local dist dist-local check check-local install-man - -## Clear suffix list -.SUFFIXES : - -## List of standard targets -all: all-local -install: install-local -clean: clean-local -distclean : distclean-local -dist: dist-local -check: all check-local - -all-local: $(TARGET) $(MAN_FILES) - -install-local: all-local install-man - -install-man: $(MAN_FILES) -# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION) -# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION) - -dist-local: - -clean-local: - test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES) - $(RMV) $(TARGET) $(OBJECTS) - -distclean-local: clean-local - $(RMV) Makefile - -check-local: all-local - -Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - -$(TARGET) : $(OBJECTS) - $(LINK.cc) $(OUTOPT)$@ $^ $(LIBS) - $(POST_BUILD_STEP) - - -%.$(SECTION): $(srcdir)/%.$(SECTION).in - cd $(top_builddir) \ - && CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status - - -ifeq (,$(MAKECMDGOALS)) --include $(DEPS) -else -ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),) --include $(DEPS) -endif -endif - diff --git a/tools/unicode/c/gencase/gencase.c b/tools/unicode/c/gencase/gencase.c deleted file mode 100644 index 68960d0808..0000000000 --- a/tools/unicode/c/gencase/gencase.c +++ /dev/null @@ -1,865 +0,0 @@ -/* -******************************************************************************* -* -* Copyright (C) 2004-2011, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: gencase.c -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2004aug28 -* created by: Markus W. Scherer -* -* This program reads several of the Unicode character database text files, -* parses them, and the case mapping properties for each character. -* It then writes a binary file containing the properties -* that is designed to be used directly for random-access to -* the properties of each Unicode character. -*/ - -#include -#include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "unicode/uset.h" -#include "unicode/putil.h" -#include "unicode/uclean.h" -#include "cmemory.h" -#include "cstring.h" -#include "uarrsort.h" -#include "unewdata.h" -#include "uoptions.h" -#include "uparse.h" -#include "uprops.h" -#include "propsvec.h" -#include "gencase.h" - -#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) - -/* data --------------------------------------------------------------------- */ - -UPropsVectors *pv; - -UBool beVerbose=FALSE, haveCopyright=TRUE; - -/* - * Unicode set collecting the case-sensitive characters; - * see uchar.h UCHAR_CASE_SENSITIVE. - * Add code points from case mappings/foldings in - * the root locale and with default options. - */ -static USet *caseSensitive; - -/* prototypes --------------------------------------------------------------- */ - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode); - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode); - -static void -parseDB(const char *filename, UErrorCode *pErrorCode); - -/* parse files with multiple binary properties ------------------------------ */ - -/* TODO: more common code, move functions to uparse.h|c */ - -/* TODO: similar to genprops/props2.c but not the same */ - -struct Binary { - const char *propName; - int32_t vecWord; - uint32_t vecValue, vecMask; -}; -typedef struct Binary Binary; - -struct Binaries { - const char *ucdFile; - const Binary *binaries; - int32_t binariesCount; -}; -typedef struct Binaries Binaries; - -static const Binary -propListNames[]={ - { "Soft_Dotted", 0, UCASE_SOFT_DOTTED, UCASE_DOT_MASK } -}; - -static const Binaries -propListBinaries={ - "PropList", propListNames, LENGTHOF(propListNames) -}; - -static const Binary -derCorePropsNames[]={ - { "Lowercase", 0, UCASE_LOWER, UCASE_TYPE_MASK }, - { "Uppercase", 0, UCASE_UPPER, UCASE_TYPE_MASK }, - /* Unicode 5.2 adds Case_Ignorable as a public property. See comments in store.c. */ - { "Case_Ignorable", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } -}; - -static const Binaries -derCorePropsBinaries={ - "DerivedCoreProperties", derCorePropsNames, LENGTHOF(derCorePropsNames) -}; - -/* - * Treat Word_Break=MidLetter and MidNumLet as a single binary property. - * We need not distinguish between them because both add to case-ignorable. - * We ignore all other Word_Break values. - */ -static const Binary -wordBreakNames[]={ - { "MidLetter", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) }, - { "MidNumLet", 1, U_MASK(UGENCASE_IS_MID_LETTER_SHIFT), U_MASK(UGENCASE_IS_MID_LETTER_SHIFT) } -}; - -static const Binaries -wordBreakBinaries={ - "WordBreakProperty", wordBreakNames, LENGTHOF(wordBreakNames) -}; - -static void U_CALLCONV -binariesLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - const Binaries *bin; - char *s; - uint32_t start, end; - int32_t i; - - bin=(const Binaries *)context; - - u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "gencase: syntax error in %s.txt field 0 at %s\n", bin->ucdFile, fields[0][0]); - exit(*pErrorCode); - } - - /* parse binary property name */ - s=(char *)u_skipWhitespace(fields[1][0]); - for(i=0;; ++i) { - if(i==bin->binariesCount) { - /* ignore unrecognized properties */ - return; - } - if(isToken(bin->binaries[i].propName, s)) { - break; - } - } - - if(bin->binaries[i].vecMask==0) { - fprintf(stderr, "gencase error: mask value %d==0 for %s %s\n", - (int)bin->binaries[i].vecMask, bin->ucdFile, bin->binaries[i].propName); - exit(U_INTERNAL_PROGRAM_ERROR); - } - - upvec_setValue(pv, start, end, bin->binaries[i].vecWord, bin->binaries[i].vecValue, bin->binaries[i].vecMask, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "gencase error: unable to set %s, code: %s\n", - bin->binaries[i].propName, u_errorName(*pErrorCode)); - exit(*pErrorCode); - } -} - -static void -parseBinariesFile(char *filename, char *basename, const char *suffix, - const Binaries *bin, - UErrorCode *pErrorCode) { - char *fields[2][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - writeUCDFilename(basename, bin->ucdFile, suffix); - - u_parseDelimitedFile(filename, ';', fields, 2, binariesLineFn, (void *)bin, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "error parsing %s.txt: %s\n", bin->ucdFile, u_errorName(*pErrorCode)); - } -} - -/* -------------------------------------------------------------------------- */ - -enum -{ - HELP_H, - HELP_QUESTION_MARK, - VERBOSE, - COPYRIGHT, - DESTDIR, - SOURCEDIR, - UNICODE_VERSION, - ICUDATADIR, - CSOURCE -}; - -/* Keep these values in sync with the above enums */ -static UOption options[]={ - UOPTION_HELP_H, - UOPTION_HELP_QUESTION_MARK, - UOPTION_VERBOSE, - UOPTION_COPYRIGHT, - UOPTION_DESTDIR, - UOPTION_SOURCEDIR, - UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG), - UOPTION_ICUDATADIR, - UOPTION_DEF("csource", 'C', UOPT_NO_ARG) -}; - -extern int -main(int argc, char* argv[]) { - char filename[300]; - const char *srcDir=NULL, *destDir=NULL, *suffix=NULL; - char *basename=NULL; - UErrorCode errorCode=U_ZERO_ERROR; - - U_MAIN_INIT_ARGS(argc, argv); - - /* preset then read command line options */ - options[DESTDIR].value=u_getDataDirectory(); - options[SOURCEDIR].value=""; - options[UNICODE_VERSION].value=""; - options[ICUDATADIR].value=u_getDataDirectory(); - argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); - - /* error handling, printing usage message */ - if(argc<0) { - fprintf(stderr, - "error in command line argument \"%s\"\n", - argv[-argc]); - } - if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) { - /* - * Broken into chunks because the C89 standard says the minimum - * required supported string length is 509 bytes. - */ - fprintf(stderr, - "Usage: %s [-options] [suffix]\n" - "\n" - "read the UnicodeData.txt file and other Unicode properties files and\n" - "create a binary file " UCASE_DATA_NAME "." UCASE_DATA_TYPE " with the case mapping properties\n" - "\n", - argv[0]); - fprintf(stderr, - "Options:\n" - "\t-h or -? or --help this usage text\n" - "\t-v or --verbose verbose output\n" - "\t-c or --copyright include a copyright notice\n" - "\t-u or --unicode Unicode version, followed by the version like 3.0.0\n" - "\t-C or --csource generate a .c source file rather than the .icu binary\n"); - fprintf(stderr, - "\t-d or --destdir destination directory, followed by the path\n" - "\t-s or --sourcedir source directory, followed by the path\n" - "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" - "\t followed by path, defaults to %s\n" - "\tsuffix suffix that is to be appended with a '-'\n" - "\t to the source file basenames before opening;\n" - "\t 'gencase new' will read UnicodeData-new.txt etc.\n", - u_getDataDirectory()); - return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; - } - - /* get the options values */ - beVerbose=options[VERBOSE].doesOccur; - haveCopyright=options[COPYRIGHT].doesOccur; - srcDir=options[SOURCEDIR].value; - destDir=options[DESTDIR].value; - - if(argc>=2) { - suffix=argv[1]; - } else { - suffix=NULL; - } - - if(options[UNICODE_VERSION].doesOccur) { - setUnicodeVersion(options[UNICODE_VERSION].value); - } - /* else use the default dataVersion in store.c */ - - if (options[ICUDATADIR].doesOccur) { - u_setDataDirectory(options[ICUDATADIR].value); - } - - /* prepare the filename beginning with the source dir */ - uprv_strcpy(filename, srcDir); - basename=filename+uprv_strlen(filename); - if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) { - *basename++=U_FILE_SEP_CHAR; - } - - /* initialize */ - pv=upvec_open(2, &errorCode); - caseSensitive=uset_open(1, 0); /* empty set (start>end) */ - - /* process SpecialCasing.txt */ - writeUCDFilename(basename, "SpecialCasing", suffix); - parseSpecialCasing(filename, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error parsing SpecialCasing.txt: %s\n", u_errorName(errorCode)); - return errorCode; - } - - /* process CaseFolding.txt */ - writeUCDFilename(basename, "CaseFolding", suffix); - parseCaseFolding(filename, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error parsing CaseFolding.txt: %s\n", u_errorName(errorCode)); - return errorCode; - } - - /* process additional properties files */ - *basename=0; - - parseBinariesFile(filename, basename, suffix, &propListBinaries, &errorCode); - - parseBinariesFile(filename, basename, suffix, &derCorePropsBinaries, &errorCode); - - if(ucdVersion>=UNI_4_1) { - parseBinariesFile(filename, basename, suffix, &wordBreakBinaries, &errorCode); - } - - /* process UnicodeData.txt */ - writeUCDFilename(basename, "UnicodeData", suffix); - parseDB(filename, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "error parsing UnicodeData.txt: %s\n", u_errorName(errorCode)); - return errorCode; - } - - /* process parsed data */ - makeCaseClosure(); - - makeExceptions(); - - if(U_SUCCESS(errorCode)) { - /* write the properties data file */ - generateData(destDir, options[CSOURCE].doesOccur); - } - - u_cleanup(); - return errorCode; -} - -U_CFUNC void -writeUCDFilename(char *basename, const char *filename, const char *suffix) { - int32_t length=(int32_t)uprv_strlen(filename); - uprv_strcpy(basename, filename); - if(suffix!=NULL) { - basename[length++]='-'; - uprv_strcpy(basename+length, suffix); - length+=(int32_t)uprv_strlen(suffix); - } - uprv_strcpy(basename+length, ".txt"); -} - -/* TODO: move to toolutil */ -U_CFUNC UBool -isToken(const char *token, const char *s) { - const char *z; - int32_t j; - - s=u_skipWhitespace(s); - for(j=0;; ++j) { - if(token[j]!=0) { - if(s[j]!=token[j]) { - break; - } - } else { - z=u_skipWhitespace(s+j); - if(*z==';' || *z==0) { - return TRUE; - } else { - break; - } - } - } - - return FALSE; -} - -static int32_t -getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) { - const char *t, *z; - int32_t i, j; - - s=u_skipWhitespace(s); - for(i=0; i=0 */ - for(i=0; i; ; ; - * because maps-to-self is already our default, and this line breaks this parser. - */ - return; - } - specialCasings[specialCasingCount].code=(UChar32)uprv_strtoul(s, &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "gencase: syntax error in SpecialCasing.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* is this a complex mapping? */ - if(*(end=(char *)u_skipWhitespace(fields[4][0]))!=0 && *end!=';' && *end!='#') { - /* there is some condition text in the fifth field */ - specialCasings[specialCasingCount].isComplex=TRUE; - - /* do not store any actual mappings for this */ - specialCasings[specialCasingCount].lowerCase[0]=0; - specialCasings[specialCasingCount].upperCase[0]=0; - specialCasings[specialCasingCount].titleCase[0]=0; - } else { - /* just set the "complex" flag and get the case mappings */ - specialCasings[specialCasingCount].isComplex=FALSE; - specialCasings[specialCasingCount].lowerCase[0]= - (UChar)u_parseString(fields[1][0], specialCasings[specialCasingCount].lowerCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].upperCase[0]= - (UChar)u_parseString(fields[3][0], specialCasings[specialCasingCount].upperCase+1, 31, NULL, pErrorCode); - specialCasings[specialCasingCount].titleCase[0]= - (UChar)u_parseString(fields[2][0], specialCasings[specialCasingCount].titleCase+1, 31, NULL, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "gencase: error parsing special casing at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - uset_add(caseSensitive, (UChar32)specialCasings[specialCasingCount].code); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].lowerCase+1, specialCasings[specialCasingCount].lowerCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].upperCase+1, specialCasings[specialCasingCount].upperCase[0]); - _set_addAll(caseSensitive, specialCasings[specialCasingCount].titleCase+1, specialCasings[specialCasingCount].titleCase[0]); - } - - if(++specialCasingCount==MAX_SPECIAL_CASING_COUNT) { - fprintf(stderr, "gencase: too many special casing mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static int32_t U_CALLCONV -compareSpecialCasings(const void *context, const void *left, const void *right) { - return ((const SpecialCasing *)left)->code-((const SpecialCasing *)right)->code; -} - -static void -parseSpecialCasing(const char *filename, UErrorCode *pErrorCode) { - char *fields[5][2]; - int32_t i, j; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 5, specialCasingLineFn, NULL, pErrorCode); - - /* sort the special casing entries by code point */ - if(specialCasingCount>0) { - uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), - compareSpecialCasings, NULL, FALSE, pErrorCode); - } - if(U_FAILURE(*pErrorCode)) { - return; - } - - /* replace multiple entries for any code point by one "complex" one */ - j=0; - for(i=1; i0) { - uprv_sortArray(specialCasings, specialCasingCount, sizeof(SpecialCasing), - compareSpecialCasings, NULL, FALSE, pErrorCode); - specialCasingCount-=j; - } - if(U_FAILURE(*pErrorCode)) { - return; - } - - /* - * Add one complex mapping to caseSensitive that was filtered out above: - * Greek final Sigma has a conditional mapping but not locale-sensitive, - * and it is taken when lowercasing just U+03A3 alone. - * 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA - */ - uset_add(caseSensitive, 0x3c2); -} - -/* parser for CaseFolding.txt ----------------------------------------------- */ - -#define MAX_CASE_FOLDING_COUNT 2000 - -static CaseFolding caseFoldings[MAX_CASE_FOLDING_COUNT]; -static int32_t caseFoldingCount=0; - -static void U_CALLCONV -caseFoldingLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - char *end; - static UChar32 prevCode=0; - int32_t count; - char status; - - /* get code point */ - const char *s=u_skipWhitespace(fields[0][0]); - if(0==uprv_strncmp(s, "0000..10FFFF", 12)) { - /* - * Ignore the line - * # @missing: 0000..10FFFF; C; - * because maps-to-self is already our default, and this line breaks this parser. - */ - return; - } - caseFoldings[caseFoldingCount].code=(UChar32)uprv_strtoul(s, &end, 16); - end=(char *)u_skipWhitespace(end); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "gencase: syntax error in CaseFolding.txt field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get the status of this mapping */ - caseFoldings[caseFoldingCount].status=status=*u_skipWhitespace(fields[1][0]); - if(status!='L' && status!='E' && status!='C' && status!='S' && status!='F' && status!='I' && status!='T') { - fprintf(stderr, "gencase: unrecognized status field in CaseFolding.txt at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* ignore all case folding mappings that are the same as the UnicodeData.txt lowercase mappings */ - if(status=='L') { - return; - } - - /* get the mapping */ - count=caseFoldings[caseFoldingCount].full[0]= - (UChar)u_parseString(fields[2][0], caseFoldings[caseFoldingCount].full+1, 31, (uint32_t *)&caseFoldings[caseFoldingCount].simple, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - fprintf(stderr, "gencase: error parsing CaseFolding.txt mapping at %s\n", fields[0][0]); - exit(*pErrorCode); - } - - /* there is a simple mapping only if there is exactly one code point (count is in UChars) */ - if(count==0 || count>2 || (count==2 && UTF_IS_SINGLE(caseFoldings[caseFoldingCount].full[1]))) { - caseFoldings[caseFoldingCount].simple=0; - } - - /* update the case-sensitive set */ - if(status!='T') { - uset_add(caseSensitive, (UChar32)caseFoldings[caseFoldingCount].code); - _set_addAll(caseSensitive, caseFoldings[caseFoldingCount].full+1, caseFoldings[caseFoldingCount].full[0]); - } - - /* check the status */ - if(status=='S') { - /* check if there was a full mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='F' - ) { - /* merge the two entries */ - caseFoldings[caseFoldingCount-1].simple=caseFoldings[caseFoldingCount].simple; - return; - } - } else if(status=='F') { - /* check if there was a simple mapping for this code point before */ - if( caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code && - caseFoldings[caseFoldingCount-1].status=='S' - ) { - /* merge the two entries */ - uprv_memcpy(caseFoldings[caseFoldingCount-1].full, caseFoldings[caseFoldingCount].full, 32*U_SIZEOF_UCHAR); - return; - } - } else if(status=='I' || status=='T') { - /* check if there was a default mapping for this code point before (remove it) */ - while(caseFoldingCount>0 && - caseFoldings[caseFoldingCount-1].code==caseFoldings[caseFoldingCount].code - ) { - prevCode=0; - --caseFoldingCount; - } - /* store only a marker for special handling for cases like dotless i */ - caseFoldings[caseFoldingCount].simple=0; - caseFoldings[caseFoldingCount].full[0]=0; - } - - /* check that the code points (caseFoldings[caseFoldingCount].code) are in ascending order */ - if(caseFoldings[caseFoldingCount].code<=prevCode && caseFoldings[caseFoldingCount].code>0) { - fprintf(stderr, "gencase: error - CaseFolding entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)caseFoldings[caseFoldingCount].code, - (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - prevCode=caseFoldings[caseFoldingCount].code; - - if(++caseFoldingCount==MAX_CASE_FOLDING_COUNT) { - fprintf(stderr, "gencase: too many case folding mappings\n"); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - exit(U_INDEX_OUTOFBOUNDS_ERROR); - } -} - -static void -parseCaseFolding(const char *filename, UErrorCode *pErrorCode) { - char *fields[3][2]; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 3, caseFoldingLineFn, NULL, pErrorCode); -} - -/* parser for UnicodeData.txt ----------------------------------------------- */ - -/* general categories */ -const char *const -genCategoryNames[U_CHAR_CATEGORY_COUNT]={ - "Cn", - "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", - "Mc", "Nd", "Nl", "No", - "Zs", "Zl", "Zp", - "Cc", "Cf", "Co", "Cs", - "Pd", "Ps", "Pe", "Pc", "Po", - "Sm", "Sc", "Sk", "So", - "Pi", "Pf" -}; - -static int32_t specialCasingIndex=0, caseFoldingIndex=0; - -static void U_CALLCONV -unicodeDataLineFn(void *context, - char *fields[][2], int32_t fieldCount, - UErrorCode *pErrorCode) { - Props p; - char *end; - static UChar32 prevCode=0; - UChar32 value; - int32_t i; - - /* reset the properties */ - uprv_memset(&p, 0, sizeof(Props)); - - /* get the character code, field 0 */ - p.code=(UChar32)uprv_strtoul(fields[0][0], &end, 16); - if(end<=fields[0][0] || end!=fields[0][1]) { - fprintf(stderr, "gencase: syntax error in field 0 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get general category, field 2 */ - i=getTokenIndex(genCategoryNames, U_CHAR_CATEGORY_COUNT, fields[2][0]); - if(i>=0) { - p.gc=(uint8_t)i; - } else { - fprintf(stderr, "gencase: unknown general category \"%s\" at code 0x%lx\n", - fields[2][0], (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* get canonical combining class, field 3 */ - value=(UChar32)uprv_strtoul(fields[3][0], &end, 10); - if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) { - fprintf(stderr, "gencase: syntax error in field 3 at %s\n", fields[0][0]); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - p.cc=(uint8_t)value; - - /* get uppercase mapping, field 12 */ - value=(UChar32)uprv_strtoul(fields[12][0], &end, 16); - if(end!=fields[12][1]) { - fprintf(stderr, "gencase: syntax error in field 12 at code 0x%lx\n", - (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - if(value!=0 && value!=p.code) { - p.upperCase=value; - uset_add(caseSensitive, p.code); - uset_add(caseSensitive, value); - } - - /* get lowercase value, field 13 */ - value=(UChar32)uprv_strtoul(fields[13][0], &end, 16); - if(end!=fields[13][1]) { - fprintf(stderr, "gencase: syntax error in field 13 at code 0x%lx\n", - (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - if(value!=0 && value!=p.code) { - p.lowerCase=value; - uset_add(caseSensitive, p.code); - uset_add(caseSensitive, value); - } - - /* get titlecase value, field 14 */ - value=(UChar32)uprv_strtoul(fields[14][0], &end, 16); - if(end!=fields[14][1]) { - fprintf(stderr, "gencase: syntax error in field 14 at code 0x%lx\n", - (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - if(value!=0 && value!=p.code) { - p.titleCase=value; - uset_add(caseSensitive, p.code); - uset_add(caseSensitive, value); - } - - /* set additional properties from previously parsed files */ - if(specialCasingIndexstatus=='C' && - p.caseFolding->simple==p.lowerCase - ) { - p.caseFolding=NULL; - } - } else { - p.caseFolding=NULL; - } - - /* check for non-character code points */ - if((p.code&0xfffe)==0xfffe || (uint32_t)(p.code-0xfdd0)<0x20) { - fprintf(stderr, "gencase: error - properties for non-character code point U+%04lx\n", - (unsigned long)p.code); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* check that the code points (p.code) are in ascending order */ - if(p.code<=prevCode && p.code>0) { - fprintf(stderr, "gencase: error - UnicodeData entries out of order, U+%04lx after U+%04lx\n", - (unsigned long)p.code, (unsigned long)prevCode); - *pErrorCode=U_PARSE_ERROR; - exit(U_PARSE_ERROR); - } - - /* properties for a single code point */ - setProps(&p); - - prevCode=p.code; -} - -static void -parseDB(const char *filename, UErrorCode *pErrorCode) { - char *fields[15][2]; - UChar32 start, end; - int32_t i; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode); - - /* are all sub-properties consumed? */ - if(specialCasingIndex - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/tools/unicode/c/gencase/store.c b/tools/unicode/c/gencase/store.c deleted file mode 100644 index a6fe0946c1..0000000000 --- a/tools/unicode/c/gencase/store.c +++ /dev/null @@ -1,1201 +0,0 @@ -/* -******************************************************************************* -* -* Copyright (C) 2004-2011, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: store.c -* encoding: US-ASCII -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2004aug28 -* created by: Markus W. Scherer -* -* Store Unicode case mapping properties efficiently for -* random access. -*/ - -#include -#include -#include "unicode/utypes.h" -#include "unicode/uchar.h" -#include "unicode/ustring.h" -#include "cmemory.h" -#include "cstring.h" -#include "filestrm.h" -#include "utrie2.h" -#include "uarrsort.h" -#include "unicode/udata.h" -#include "unewdata.h" -#include "propsvec.h" -#include "writesrc.h" -#include "gencase.h" - -#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) - -/* Unicode case mapping properties file format --------------------------------- - -The file format prepared and written here contains several data -structures that store indexes or data. - -Before the data contents described below, there are the headers required by -the udata API for loading ICU data. Especially, a UDataInfo structure -precedes the actual data. It contains platform properties values and the -file format version. - -The following is a description of format version 2.0 . - -Format version 1.1 adds data for case closure. - -Format version 1.2 adds an exception bit for case-ignorable. Needed because -the Cased and Case_Ignorable properties are not disjoint. - -Format version 2.0 changes from UTrie to UTrie2. - -The file contains the following structures: - - const int32_t indexes[i0] with values i0, i1, ...: - (see UCASE_IX_... constants for names of indexes) - - i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) - i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) - i2 trieSize; -- size in bytes of the case mapping properties trie - i3 exceptionsLength; -- length in uint16_t of the exceptions array - i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1) - - i5..i14 reservedIndexes; -- reserved values; 0 for now - - i15 maxFullLength; -- maximum length of a full case mapping/folding string - - - Serialized trie, see utrie2.h; - - const uint16_t exceptions[exceptionsLength]; - - const UChar unfold[unfoldLength]; - - -Trie data word: -Bits -if(exception) { - 15..4 unsigned exception index -} else { - if(not uncased) { - 15..6 signed delta to simple case mapping code point - (add delta to input code point) - } else { - 6 the code point is case-ignorable - (U+0307 is also case-ignorable but has an exception) - } - 5..4 0 normal character with cc=0 - 1 soft-dotted character - 2 cc=230 - 3 other cc -} - 3 exception - 2 case sensitive - 1..0 0 uncased - 1 lowercase - 2 uppercase - 3 titlecase - - -Exceptions: -A sub-array of the exceptions array is indexed by the exception index in a -trie word. -The sub-array consists of the following fields: - uint16_t excWord; - uint16_t optional values []; - UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase - -excWord: (see UCASE_EXC_...) -Bits - 15 conditional case folding - 14 conditional special casing -13..12 same as non-exception trie data bits 5..4 - moved here because the exception index needs more bits than the delta - 0 normal character with cc=0 - 1 soft-dotted character - 2 cc=230 - 3 other cc -11 case-ignorable (used when the character is cased or has another exception) - (new in formatVersion 1.2/ICU 4.4) -10.. 9 reserved - 8 if set, then for each optional-value slot there are 2 uint16_t values - (high and low parts of 32-bit values) - instead of single ones - 7.. 0 bits for which optional value is present - -Optional-value slots: -0 lowercase mapping (code point) -1 case folding (code point) -2 uppercase mapping (code point) -3 titlecase mapping (code point) -4 reserved -5 reserved -6 closure mappings (new in format version 1.1) -7 there is at least one full (string) case mapping - the length of each is encoded in a nibble of this optional value, - and the strings follow this optional value in the same order: - lower/fold/upper/title - -The optional closure mappings value is used as follows: -Bits 0..3 contain the length of a string of code points for case closure. -The string immediately follows the full case mappings, or the closure value -slot if there are no full case mappings. -Bits 4..15 are reserved and could be used in the future to indicate the -number of strings for case closure. -Complete case closure for a code point is given by the union of all simple -and full case mappings and foldings, plus the case closure code points -(and potentially, in the future, case closure strings). - -For space saving, some values are not stored. Lookups are as follows: -- If special casing is conditional, then no full lower/upper/title mapping - strings are stored. -- If case folding is conditional, then no simple or full case foldings are - stored. -- Fall back in this order: - full (string) mapping -- if full mappings are used - simple (code point) mapping of the same type - simple fold->simple lower - simple title->simple upper - finally, the original code point (no mapping) - -This fallback order is strict: -In particular, the fallback from full case folding is to simple case folding, -not to full lowercase mapping. - -Reverse case folding data ("unfold") array: (new in format version 1.1) - -This array stores some miscellaneous values followed by a table. The data maps -back from multi-character strings to their original code points, for use -in case closure. - -The table contains two columns of strings. -The string in the first column is the case folding of each of the code points -in the second column. The strings are terminated with NUL or by the end of the -column, whichever comes first. - -The miscellaneous data takes up one pseudo-row and includes: -- number of rows -- number of UChars per row -- number of UChars in the left (folding string) column - -The table is sorted by its first column. Values in the first column are unique. - ------------------------------------------------------------------------------ */ - -/* UDataInfo cf. udata.h */ -static UDataInfo dataInfo={ - sizeof(UDataInfo), - 0, - - U_IS_BIG_ENDIAN, - U_CHARSET_FAMILY, - U_SIZEOF_UCHAR, - 0, - - /* dataFormat="cAsE" */ - { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, - { 2, 0, 0, 0 }, /* formatVersion */ - { 6, 0, 0, 0 } /* dataVersion */ -}; - -enum { - /* maximum number of exceptions expected */ - MAX_EXC_COUNT=1000 -}; - -/* exceptions values */ -static uint16_t exceptions[UCASE_MAX_EXCEPTIONS+100]; -static uint16_t exceptionsTop=0; -static Props excProps[MAX_EXC_COUNT]; -static uint16_t exceptionsCount=0; - -/* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */ -static int32_t maxFullLength=U16_MAX_LENGTH; - -/* reverse case folding ("unfold") data */ -static UChar unfold[UGENCASE_UNFOLD_MAX_ROWS*UGENCASE_UNFOLD_WIDTH]={ - 0, UGENCASE_UNFOLD_WIDTH, UGENCASE_UNFOLD_STRING_WIDTH, 0, 0 -}; -static uint16_t unfoldRows=0; -static uint16_t unfoldTop=UGENCASE_UNFOLD_WIDTH; - -/* Unicode versions --------------------------------------------------------- */ - -static const UVersionInfo -unicodeVersions[]={ - { 1, 0, 0, 0 }, - { 1, 1, 0, 0 }, - { 2, 0, 0, 0 }, - { 3, 0, 0, 0 }, - { 3, 1, 0, 0 }, - { 3, 2, 0, 0 }, - { 4, 0, 0, 0 }, - { 4, 0, 1, 0 }, - { 4, 1, 0, 0 }, - { 5, 1, 0, 0 }, - { 5, 2, 0, 0 }, - { 6, 0, 0, 0 } -}; - -int32_t ucdVersion=UNI_4_1; - -static int32_t -findUnicodeVersion(const UVersionInfo version) { - int32_t i; - - for(i=0; /* while(version>unicodeVersions[i]) {} */ - i0; - ++i) {} - if(0=ucdVersion comparisons */ - } - return i; /* version>=unicodeVersions[i] && versionUGENCASE_UNFOLD_STRING_WIDTH) { - fprintf(stderr, "gencase error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n", - (long)length, UGENCASE_UNFOLD_STRING_WIDTH); - exit(U_INTERNAL_PROGRAM_ERROR); - } - if(unfoldTop >= (LENGTHOF(unfold) - UGENCASE_UNFOLD_STRING_WIDTH)) { - fprintf(stderr, "gencase error: too many multi-character case foldings\n"); - exit(U_BUFFER_OVERFLOW_ERROR); - } - u_memset(unfold+unfoldTop, 0, UGENCASE_UNFOLD_WIDTH); - u_memcpy(unfold+unfoldTop, s, length); - - i=unfoldTop+UGENCASE_UNFOLD_STRING_WIDTH; - U16_APPEND_UNSAFE(unfold, i, c); - - ++unfoldRows; - unfoldTop+=UGENCASE_UNFOLD_WIDTH; -} - -/* store a character's properties ------------------------------------------- */ - -extern void -setProps(Props *p) { - UErrorCode errorCode; - uint32_t value, oldValue; - int32_t delta; - - /* get the non-UnicodeData.txt properties */ - value=oldValue=upvec_getValue(pv, p->code, 0); - - /* default: map to self */ - delta=0; - - if(p->gc==U_TITLECASE_LETTER) { - /* the Titlecase property is read late, from UnicodeData.txt */ - value|=UCASE_TITLE; - } - - if(p->upperCase!=0) { - /* uppercase mapping as delta if the character is lowercase */ - if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { - delta=p->upperCase-p->code; - } else { - value|=UCASE_EXCEPTION; - } - } - if(p->lowerCase!=0) { - /* lowercase mapping as delta if the character is uppercase or titlecase */ - if((value&UCASE_TYPE_MASK)>=UCASE_UPPER) { - delta=p->lowerCase-p->code; - } else { - value|=UCASE_EXCEPTION; - } - } - if(p->upperCase!=p->titleCase) { - value|=UCASE_EXCEPTION; - } - if(p->closure[0]!=0) { - value|=UCASE_EXCEPTION; - } - if(p->specialCasing!=NULL) { - value|=UCASE_EXCEPTION; - } - if(p->caseFolding!=NULL) { - value|=UCASE_EXCEPTION; - } - - if(deltacc!=0) { - if(value&UCASE_DOT_MASK) { - fprintf(stderr, "gencase: a soft-dotted character has cc!=0\n"); - exit(U_INTERNAL_PROGRAM_ERROR); - } - if(p->cc==230) { - value|=UCASE_ABOVE; - } else { - value|=UCASE_OTHER_ACCENT; - } - } - - /* - * Encode case-ignorable as delta==1 on uncased characters, - * and with an exception bit on cased characters and characters with another exception. - */ - if(ucdVersion>=UNI_4_1) { - /* - * Unicode 4.1 & 5.0: (D47a) Word_Break=MidLetter or Mn, Me, Cf, Lm, Sk - * Unicode 5.1: Word_Break=(MidLetter or MidNumLet) or Mn, Me, Cf, Lm, Sk - * The UGENCASE_IS_MID_LETTER_SHIFT bit is set for both WB=MidLetter and WB=MidNumLet. - * Unicode 5.2: The definition (Unicode Standard Definition D121) is unchanged, - * but now Case_Ignorable is a public property - * with its values listed in DerivedCoreProperties.txt. - * gencase.c parses those values as well, just in case the definition changes - * in the future. gencase.c sets the UGENCASE_IS_MID_LETTER_SHIFT bit - * for each Case_Ignorable entry. (It never resets that bit.) - */ - if( - (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || - (upvec_getValue(pv, p->code, 1)&U_MASK(UGENCASE_IS_MID_LETTER_SHIFT))!=0 - ) { - p->isCaseIgnorable=TRUE; - } - } else { - /* before Unicode 4.1: Mn, Me, Cf, Lm, Sk or 0027 or 00AD or 2019 */ - if( - (U_MASK(p->gc)&(U_GC_MN_MASK|U_GC_ME_MASK|U_GC_CF_MASK|U_GC_LM_MASK|U_GC_SK_MASK))!=0 || - p->code==0x27 || p->code==0xad || p->code==0x2019 - ) { - p->isCaseIgnorable=TRUE; - } - } - if(p->isCaseIgnorable) { - if((value&UCASE_TYPE_MASK)==UCASE_NONE) { - /* - * We use one of the delta/exception bits for - * the case-ignorable flag for uncased characters. - * There is no delta for uncased characters (see checks above). - */ - delta=1; - } else { - /* - * If the character is cased or has another exception, - * then we store the case-ignorable flag as an exception bit. - */ - value|=UCASE_EXCEPTION; - } - } - - /* handle exceptions */ - if(value&UCASE_EXCEPTION) { - /* simply store exceptions for later processing and encoding */ - value|=(uint32_t)exceptionsCount<code, p->code, 0, value, 0xffffffff, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gencase error: unable to set case mapping values, code: %s\n", - u_errorName(errorCode)); - exit(errorCode); - } - } - - /* add the multi-character case folding to the "unfold" data */ - if(p->caseFolding!=NULL) { - int32_t length=p->caseFolding->full[0]; - if(length>1 && u_strHasMoreChar32Than(p->caseFolding->full+1, length, 1)) { - addUnfolding(p->code, p->caseFolding->full+1, length); - } - } -} - -extern void -addCaseSensitive(UChar32 first, UChar32 last) { - UErrorCode errorCode=U_ZERO_ERROR; - upvec_setValue(pv, first, last, 0, UCASE_SENSITIVE, UCASE_SENSITIVE, &errorCode); - if(U_FAILURE(errorCode)) { - fprintf(stderr, "gencase error: unable to set UCASE_SENSITIVE, code: %s\n", - u_errorName(errorCode)); - exit(errorCode); - } -} - -/* finalize reverse case folding ("unfold") data ---------------------------- */ - -static int32_t U_CALLCONV -compareUnfold(const void *context, const void *left, const void *right) { - return u_memcmp((const UChar *)left, (const UChar *)right, UGENCASE_UNFOLD_WIDTH); -} - -static void -makeUnfoldData() { - static const UChar - iDot[2]= { 0x69, 0x307 }; - - UChar *p, *q; - int32_t i, j, k; - UErrorCode errorCode; - - /* - * add a case folding that we missed because it's conditional: - * 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE - */ - addUnfolding(0x130, iDot, 2); - - /* sort the data */ - errorCode=U_ZERO_ERROR; - uprv_sortArray(unfold+UGENCASE_UNFOLD_WIDTH, unfoldRows, UGENCASE_UNFOLD_WIDTH*2, - compareUnfold, NULL, FALSE, &errorCode); - - /* make unique-string rows by merging adjacent ones' code point columns */ - - /* make p point to row i-1 */ - p=(UChar *)unfold+UGENCASE_UNFOLD_WIDTH; - - for(i=1; iUGENCASE_UNFOLD_CP_WIDTH) { - fprintf(stderr, "gencase error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", - (long)j, UGENCASE_UNFOLD_CP_WIDTH); - exit(U_BUFFER_OVERFLOW_ERROR); - } - - /* move following rows up one */ - --unfoldRows; - unfoldTop-=UGENCASE_UNFOLD_WIDTH; - u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH); - } else { - p+=UGENCASE_UNFOLD_WIDTH; - ++i; - } - } - - unfold[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows; - - if(beVerbose) { - puts("unfold data:"); - - p=(UChar *)unfold; - for(i=0; iU+%04lx\n", - (unsigned long)src, (unsigned long)dest); - } - - value=upvec_getValue(pv, src, 0); - if(value&UCASE_EXCEPTION) { - Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); - int32_t i; - - /* append dest to src's closure array */ - for(i=0;; ++i) { - if(i==LENGTHOF(p->closure)) { - fprintf(stderr, "closure[] overflow for U+%04lx->U+%04lx\n", - (unsigned long)src, (unsigned long)dest); - exit(U_BUFFER_OVERFLOW_ERROR); - } else if(p->closure[i]==dest) { - break; /* do not store duplicates */ - } else if(p->closure[i]==0) { - p->closure[i]=dest; - break; - } - } - } else { - Props p2={ 0 }; - UChar32 next; - UErrorCode errorCode; - - /* - * decode value into p2 (enough for makeException() to work properly), - * add the closure mapping, - * and set the new exception for src - */ - p2.code=src; - p2.closure[0]=dest; - - if((value&UCASE_TYPE_MASK)>UCASE_NONE) { - /* one simple case mapping, don't care which one */ - next=src+((int16_t)value>>UCASE_DELTA_SHIFT); - if(next!=src) { - if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { - p2.upperCase=p2.titleCase=next; - } else { - p2.lowerCase=next; - } - } - } else if(value&UCASE_DELTA_MASK) { - fprintf(stderr, "gencase error: unable to add case closure exception to case-ignorable U+%04lx\n", - (unsigned long)src); - exit(U_INTERNAL_PROGRAM_ERROR); - } - - value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); /* remove previous simple mapping */ - value|=(uint32_t)exceptionsCount<c - * into - * orig<->c - * - * The third-level function call has prev2==orig, prev>=0, and c is - * the destination code point of one of prev's case mappings. - * (And prev is the destination of one of prev2's case mappings.) - * The function checks if any of c's case mappings go back to orig - * and adds a closure mapping if not. - * In other words, it turns case mapping relationships of - * orig->prev->c or orig->prev<->c - * into - * orig->prev->c->orig or orig->prev<->c->orig - * etc. - * (Graphically, this closes a triangle.) - * - * With repeated application on all code points until no more closure mappings - * are added, all case equivalence groups get complete mappings. - * That is, in each group of code points with case relationships - * each code point will in the end have some mapping to each other - * code point in the group. - * - * @return TRUE if a closure mapping was added - */ -static UBool -addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value) { - UChar32 next; - UBool someMappingsAdded=FALSE; - - if(c!=orig) { - /* get the properties for c */ - value=upvec_getValue(pv, c, 0); - } - /* else if c==orig then c's value was passed in */ - - if(value&UCASE_EXCEPTION) { - UChar32 set[32]; - int32_t i, count=0; - - Props *p=excProps+(value>>UGENCASE_EXC_SHIFT); - - /* - * marker for whether any of c's mappings goes to orig - * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings - */ - UBool mapsToOrig=(UBool)(c==orig); - - /* collect c's case mapping destinations in set[] */ - if((next=p->upperCase)!=0 && next!=c) { - set[count++]=next; - } - if((next=p->lowerCase)!=0 && next!=c) { - set[count++]=next; - } - if(p->upperCase!=(next=p->titleCase) && next!=c) { - set[count++]=next; - } - if(p->caseFolding!=NULL && (next=p->caseFolding->simple)!=0 && next!=c) { - set[count++]=next; - } - - /* append c's current closure mappings to set[] */ - for(i=0; iclosure) && (next=p->closure[i])!=0; ++i) { - set[count++]=next; - } - - /* process all code points to which c case-maps */ - for(i=0; i=0) or - * this is a mapping to one of the previous code points (orig, prev, c) - */ - someMappingsAdded|=addClosure(orig, prev, c, next, 0); - } - } - - if(!mapsToOrig) { - addClosureMapping(c, orig); - return TRUE; - } - } else { - if((value&UCASE_TYPE_MASK)>UCASE_NONE) { - /* one simple case mapping, don't care which one */ - next=c+((int16_t)value>>UCASE_DELTA_SHIFT); - if(next!=c) { - /* - * recurse unless - * we have reached maximum depth (prev2>=0) or - * this is a mapping to one of the previous code points (orig, prev, c) - */ - if(prev2<0 && next!=orig && next!=prev) { - someMappingsAdded|=addClosure(orig, prev, c, next, 0); - } - - if(c!=orig && next!=orig) { - /* c does not map to orig, add a closure mapping c->orig */ - addClosureMapping(c, orig); - return TRUE; - } - } - } - } - - return someMappingsAdded; -} - -extern void -makeCaseClosure() { - UChar *p; - uint32_t *row; - uint32_t value; - UChar32 start, end, c, c2; - int32_t i, j; - UBool someMappingsAdded; - - /* - * finalize the "unfold" data because we need to use it to add closure mappings - * for situations like FB05->"st"<-FB06 - * where we would otherwise miss the FB05<->FB06 relationship - */ - makeUnfoldData(); - - /* use the "unfold" data to add mappings */ - - /* p always points to the code points; this loop ignores the strings completely */ - p=unfold+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH; - - for(i=0; iend) { - ++i; - } - row=NULL; /* signal to continue with outer loop, without further ++i */ - break; - } - ++start; - } - if(row==NULL) { - continue; /* see row=NULL above */ - } - } - ++i; - } - - if(beVerbose && someMappingsAdded) { - puts("---- ---- ---- ----"); - } - } while(someMappingsAdded); -} - -/* exceptions --------------------------------------------------------------- */ - -/* get the string length from zero-terminated code points in a limited-length array */ -static int32_t -getLengthOfCodePoints(const UChar32 *s, int32_t maxLength) { - int32_t i, length; - - for(i=length=0; iU16_MAX_LENGTH) { - return FALSE; - } - i=0; - U16_NEXT(s, i, length, full); - - if(simple==0) { - simple=c; /* UCD has no simple mapping if it's the same as the code point itself */ - } - return (UBool)(i==length && full==simple); -} - -static uint16_t -makeException(uint32_t value, Props *p) { - uint32_t slots[8]; - uint32_t slotBits; - uint16_t excWord, i, count, length, fullLengths; - UBool doubleSlots; - - /* exceptionsTop might be returned for storing in the trie word */ - if(exceptionsTop>=UCASE_MAX_EXCEPTIONS) { - fprintf(stderr, "gencase error: too many exceptions words\n"); - exit(U_BUFFER_OVERFLOW_ERROR); - } - - /* copy and shift the soft-dotted bits */ - excWord=((uint16_t)value&UCASE_DOT_MASK)<isCaseIgnorable) { - excWord|=UCASE_EXC_CASE_IGNORABLE; - } - - /* update maxFullLength */ - if(p->specialCasing!=NULL) { - length=p->specialCasing->lowerCase[0]; - if(length>maxFullLength) { - maxFullLength=length; - } - length=p->specialCasing->upperCase[0]; - if(length>maxFullLength) { - maxFullLength=length; - } - length=p->specialCasing->titleCase[0]; - if(length>maxFullLength) { - maxFullLength=length; - } - } - if(p->caseFolding!=NULL) { - length=p->caseFolding->full[0]; - if(length>maxFullLength) { - maxFullLength=length; - } - } - - /* set the bits for conditional mappings */ - if(p->specialCasing!=NULL && p->specialCasing->isComplex) { - excWord|=UCASE_EXC_CONDITIONAL_SPECIAL; - p->specialCasing=NULL; - } - if(p->caseFolding!=NULL && p->caseFolding->simple==0 && p->caseFolding->full[0]==0) { - excWord|=UCASE_EXC_CONDITIONAL_FOLD; - p->caseFolding=NULL; - } - - /* - * Note: - * UCD stores no simple mappings when they are the same as the code point itself. - * SpecialCasing and CaseFolding do store simple mappings even if they are - * the same as the code point itself. - * Comparisons between simple regular mappings and simple special/folding - * mappings need to compensate for the difference by comparing with the - * original code point if a simple UCD mapping is missing (0). - */ - - /* remove redundant data */ - if(p->specialCasing!=NULL) { - /* do not store full mappings if they are the same as the simple ones */ - if(fullMappingEqualsSimple(p->specialCasing->lowerCase, p->lowerCase, p->code)) { - p->specialCasing->lowerCase[0]=0; - } - if(fullMappingEqualsSimple(p->specialCasing->upperCase, p->upperCase, p->code)) { - p->specialCasing->upperCase[0]=0; - } - if(fullMappingEqualsSimple(p->specialCasing->titleCase, p->titleCase, p->code)) { - p->specialCasing->titleCase[0]=0; - } - } - if( p->caseFolding!=NULL && - fullMappingEqualsSimple(p->caseFolding->full, p->caseFolding->simple, p->code) - ) { - p->caseFolding->full[0]=0; - } - - /* write the optional slots */ - slotBits=0; - count=0; - - if(p->lowerCase!=0) { - slots[count]=(uint32_t)p->lowerCase; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_LOWER); - } - if( p->caseFolding!=NULL && - p->caseFolding->simple!=0 && - (p->lowerCase!=0 ? - p->caseFolding->simple!=p->lowerCase : - p->caseFolding->simple!=p->code) - ) { - slots[count]=(uint32_t)p->caseFolding->simple; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_FOLD); - } - if(p->upperCase!=0) { - slots[count]=(uint32_t)p->upperCase; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_UPPER); - } - if(p->upperCase!=p->titleCase) { - if(p->titleCase!=0) { - slots[count]=(uint32_t)p->titleCase; - } else { - slots[count]=(uint32_t)p->code; - } - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_TITLE); - } - - /* length of case closure */ - if(p->closure[0]!=0) { - length=getLengthOfCodePoints(p->closure, LENGTHOF(p->closure)); - slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */ - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_CLOSURE); - } - - /* lengths of full case mapping strings, stored in the last slot */ - fullLengths=0; - if(p->specialCasing!=NULL) { - fullLengths=p->specialCasing->lowerCase[0]; - fullLengths|=p->specialCasing->upperCase[0]<<8; - fullLengths|=p->specialCasing->titleCase[0]<<12; - } - if(p->caseFolding!=NULL) { - fullLengths|=p->caseFolding->full[0]<<4; - } - if(fullLengths!=0) { - slots[count]=fullLengths; - slotBits|=slots[count]; - ++count; - excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); - } - - if(count==0) { - /* No optional slots: Try to share excWord entries. */ - uint16_t excIndex; - for(excIndex=0; excIndex0xffff); - if(!doubleSlots) { - for(i=0; i>16); - exceptions[excTop++]=(uint16_t)slots[i]; - } - } - - /* write the full case mapping strings */ - if(p->specialCasing!=NULL) { - length=(uint16_t)p->specialCasing->lowerCase[0]; - u_memcpy((UChar *)exceptions+excTop, p->specialCasing->lowerCase+1, length); - excTop+=length; - } - if(p->caseFolding!=NULL) { - length=(uint16_t)p->caseFolding->full[0]; - u_memcpy((UChar *)exceptions+excTop, p->caseFolding->full+1, length); - excTop+=length; - } - if(p->specialCasing!=NULL) { - length=(uint16_t)p->specialCasing->upperCase[0]; - u_memcpy((UChar *)exceptions+excTop, p->specialCasing->upperCase+1, length); - excTop+=length; - - length=(uint16_t)p->specialCasing->titleCase[0]; - u_memcpy((UChar *)exceptions+excTop, p->specialCasing->titleCase+1, length); - excTop+=length; - } - - /* write the closure data */ - if(p->closure[0]!=0) { - UChar32 c; - - for(i=0; iclosure) && (c=p->closure[i])!=0; ++i) { - U16_APPEND_UNSAFE((UChar *)exceptions, excTop, c); - } - } - - exceptionsTop=excTop; - - /* write the main exceptions word */ - exceptions[excIndex]=excWord; - - return excIndex; - } -} - -extern void -makeExceptions() { - uint32_t *row; - uint32_t value; - int32_t i; - uint16_t excIndex; - - i=0; - while((row=upvec_getRow(pv, i, NULL, NULL))!=NULL) { - value=*row; - if(value&UCASE_EXCEPTION) { - excIndex=makeException(value, excProps+(value>>UGENCASE_EXC_SHIFT)); - *row=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|(excIndex< +#include +#include "unicode/utypes.h" +#include "unicode/localpointer.h" +#include "unicode/uchar.h" +#include "unicode/udata.h" +#include "unicode/uniset.h" +#include "unicode/usetiter.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "cstring.h" +#include "genprops.h" +#include "ppucd.h" +#include "uassert.h" +#include "uarrsort.h" +#include "ucase.h" +#include "unewdata.h" +#include "utrie2.h" +#include "writesrc.h" + +/* Unicode case mapping properties file format --------------------------------- + +The file format prepared and written here contains several data +structures that store indexes or data. + +Before the data contents described below, there are the headers required by +the udata API for loading ICU data. Especially, a UDataInfo structure +precedes the actual data. It contains platform properties values and the +file format version. + +The following is a description of format version 2.0 . + +Format version 1.1 adds data for case closure. + +Format version 1.2 adds an exception bit for case-ignorable. Needed because +the Cased and Case_Ignorable properties are not disjoint. + +Format version 2.0 changes from UTrie to UTrie2. + +The file contains the following structures: + + const int32_t indexes[i0] with values i0, i1, ...: + (see UCASE_IX_... constants for names of indexes) + + i0 indexLength; -- length of indexes[] (UCASE_IX_TOP) + i1 dataLength; -- length in bytes of the post-header data (incl. indexes[]) + i2 trieSize; -- size in bytes of the case mapping properties trie + i3 exceptionsLength; -- length in uint16_t of the exceptions array + i4 unfoldLength; -- length in uint16_t of the reverse-folding array (new in format version 1.1) + + i5..i14 reservedIndexes; -- reserved values; 0 for now + + i15 maxFullLength; -- maximum length of a full case mapping/folding string + + + Serialized trie, see utrie2.h; + + const uint16_t exceptions[exceptionsLength]; + + const UChar unfold[unfoldLength]; + + +Trie data word: +Bits +if(exception) { + 15..4 unsigned exception index +} else { + if(not uncased) { + 15..6 signed delta to simple case mapping code point + (add delta to input code point) + } else { + 6 the code point is case-ignorable + (U+0307 is also case-ignorable but has an exception) + } + 5..4 0 normal character with cc=0 + 1 soft-dotted character + 2 cc=230 + 3 other cc +} + 3 exception + 2 case sensitive + 1..0 0 uncased + 1 lowercase + 2 uppercase + 3 titlecase + + +Exceptions: +A sub-array of the exceptions array is indexed by the exception index in a +trie word. +The sub-array consists of the following fields: + uint16_t excWord; + uint16_t optional values []; + UTF-16 strings for full (string) mappings for lowercase, case folding, uppercase, titlecase + +excWord: (see UCASE_EXC_...) +Bits + 15 conditional case folding + 14 conditional special casing +13..12 same as non-exception trie data bits 5..4 + moved here because the exception index needs more bits than the delta + 0 normal character with cc=0 + 1 soft-dotted character + 2 cc=230 + 3 other cc +11 case-ignorable (used when the character is cased or has another exception) + (new in formatVersion 1.2/ICU 4.4) +10.. 9 reserved + 8 if set, then for each optional-value slot there are 2 uint16_t values + (high and low parts of 32-bit values) + instead of single ones + 7.. 0 bits for which optional value is present + +Optional-value slots: +0 lowercase mapping (code point) +1 case folding (code point) +2 uppercase mapping (code point) +3 titlecase mapping (code point) +4 reserved +5 reserved +6 closure mappings (new in format version 1.1) +7 there is at least one full (string) case mapping + the length of each is encoded in a nibble of this optional value, + and the strings follow this optional value in the same order: + lower/fold/upper/title + +The optional closure mappings value is used as follows: +Bits 0..3 contain the length of a string of code points for case closure. +The string immediately follows the full case mappings, or the closure value +slot if there are no full case mappings. +Bits 4..15 are reserved and could be used in the future to indicate the +number of strings for case closure. +Complete case closure for a code point is given by the union of all simple +and full case mappings and foldings, plus the case closure code points +(and potentially, in the future, case closure strings). + +For space saving, some values are not stored. Lookups are as follows: +- If special casing is conditional, then no full lower/upper/title mapping + strings are stored. +- If case folding is conditional, then no simple or full case foldings are + stored. +- Fall back in this order: + full (string) mapping -- if full mappings are used + simple (code point) mapping of the same type + simple fold->simple lower + simple title->simple upper + finally, the original code point (no mapping) + +This fallback order is strict: +In particular, the fallback from full case folding is to simple case folding, +not to full lowercase mapping. + +Reverse case folding data ("unfold") array: (new in format version 1.1) + +This array stores some miscellaneous values followed by a table. The data maps +back from multi-character strings to their original code points, for use +in case closure. + +The table contains two columns of strings. +The string in the first column is the case folding of each of the code points +in the second column. The strings are terminated with NUL or by the end of the +column, whichever comes first. + +The miscellaneous data takes up one pseudo-row and includes: +- number of rows +- number of UChars per row +- number of UChars in the left (folding string) column + +The table is sorted by its first column. Values in the first column are unique. + +----------------------------------------------------------------------------- */ + +U_NAMESPACE_USE + +/* UDataInfo cf. udata.h */ +static UDataInfo dataInfo={ + sizeof(UDataInfo), + 0, + + U_IS_BIG_ENDIAN, + U_CHARSET_FAMILY, + U_SIZEOF_UCHAR, + 0, + + /* dataFormat="cAsE" */ + { UCASE_FMT_0, UCASE_FMT_1, UCASE_FMT_2, UCASE_FMT_3 }, + { 2, 0, 0, 0 }, /* formatVersion */ + { 6, 0, 0, 0 } /* dataVersion */ +}; + +// Temporary Case_Ignorable bit before final encoding. +#define UGENCASE_IGNORABLE 0x00010000 + +#define UGENCASE_EXC_SHIFT 20 +#define UGENCASE_EXC_MASK 0xfff00000 + +enum { + MAX_EXC_COUNT=(UGENCASE_EXC_MASK>>UGENCASE_EXC_SHIFT)+1 +}; + +struct ExcProps { + ExcProps() + : hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {} + ExcProps(const UniProps &otherProps) + : props(otherProps), + hasConditionalCaseMappings(FALSE), hasTurkicCaseFolding(FALSE) {} + + UniProps props; + UnicodeSet closure; + UBool hasConditionalCaseMappings; + UBool hasTurkicCaseFolding; +}; + +/* + * Values for the ucase.icu unfold[] data array. + * The values are stored in ucase.icu so that the runtime code will work with + * changing values, but they are hardcoded here for simplicity. + * They are optimized, that is, provide for minimal table column widths, + * for the actual Unicode data, so that the table size is minimized. + * Future versions of Unicode may require increases of some of these values. + */ +enum { + UGENCASE_UNFOLD_STRING_WIDTH=3, + UGENCASE_UNFOLD_CP_WIDTH=2, + UGENCASE_UNFOLD_WIDTH=UGENCASE_UNFOLD_STRING_WIDTH+UGENCASE_UNFOLD_CP_WIDTH +}; + +class CasePropsBuilder : public PropsBuilder { +public: + CasePropsBuilder(UErrorCode &errorCode); + virtual ~CasePropsBuilder(); + + virtual void setUnicodeVersion(const UVersionInfo version); + virtual void setProps(const UniProps &, const UnicodeSet &newValues, UErrorCode &errorCode); + virtual void build(UErrorCode &errorCode); + virtual void writeCSourceFile(const char *path, UErrorCode &errorCode); + virtual void writeBinaryData(const char *path, UBool withCopyright, UErrorCode &errorCode); + +private: + uint32_t makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode); + void addUnfolding(UChar32 c, const UnicodeString &s, UErrorCode &errorCode); + void makeUnfoldData(UErrorCode &errorCode); + void addClosureMapping(UChar32 src, UChar32 dest, UErrorCode &errorCode); + UBool addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value, + UErrorCode &errorCode); + void makeCaseClosure(UErrorCode &errorCode); + int32_t makeException(UChar32 c, uint32_t value, ExcProps &ep, UErrorCode &errorCode); + void makeExceptions(UErrorCode &errorCode); + + UnicodeSet relevantProps; + /* + * Unicode set collecting the case-sensitive characters; + * see uchar.h UCHAR_CASE_SENSITIVE. + * Add code points from case mappings/foldings in + * the root locale and with default options. + */ + UnicodeSet caseSensitive; + /* reverse case folding ("unfold") data */ + UnicodeString unfold; + UnicodeString exceptions; + ExcProps **excProps; + int32_t excPropsCount; + /* becomes indexes[UCASE_IX_MAX_FULL_LENGTH] */ + int32_t maxFullLength; + UTrie2 *pTrie; +}; + +CasePropsBuilder::CasePropsBuilder(UErrorCode &errorCode) + : excProps(NULL), excPropsCount(0), maxFullLength(U16_MAX_LENGTH), pTrie(NULL) { + // This builder encodes the following properties. + relevantProps. + add(UCHAR_CANONICAL_COMBINING_CLASS). // 0 vs. 230 vs. other + add(UCHAR_SOFT_DOTTED). + add(UCHAR_LOWERCASE). + add(UCHAR_UPPERCASE). + add(UCHAR_CASE_IGNORABLE). + add(UCHAR_SIMPLE_CASE_FOLDING). + add(UCHAR_SIMPLE_LOWERCASE_MAPPING). + add(UCHAR_SIMPLE_TITLECASE_MAPPING). + add(UCHAR_SIMPLE_UPPERCASE_MAPPING). + add(UCHAR_CASE_FOLDING). + add(UCHAR_LOWERCASE_MAPPING). + add(UCHAR_TITLECASE_MAPPING). + add(UCHAR_UPPERCASE_MAPPING). + add(PPUCD_CONDITIONAL_CASE_MAPPINGS). + add(PPUCD_TURKIC_CASE_FOLDING); + // Write "unfold" meta data into the first row. Must be UGENCASE_UNFOLD_WIDTH UChars. + unfold. + append(0). + append((UChar)UGENCASE_UNFOLD_WIDTH). + append((UChar)UGENCASE_UNFOLD_STRING_WIDTH). + append(0). + append(0); + U_ASSERT(unfold.length()==UGENCASE_UNFOLD_WIDTH); + pTrie=utrie2_open(0, 0, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genprops error: casepropsbuilder utrie2_open() failed - %s\n", + u_errorName(errorCode)); + return; + } + excProps=new ExcProps *[MAX_EXC_COUNT]; + if(excProps==NULL) { + fprintf(stderr, + "genprops error: casepropsbuilder out of memory allocating " + "the array of exceptions properties\n"); + errorCode=U_MEMORY_ALLOCATION_ERROR; + } +} + +CasePropsBuilder::~CasePropsBuilder() { + utrie2_close(pTrie); + for(int32_t i=0; iUGENCASE_UNFOLD_STRING_WIDTH) { + fprintf(stderr, "genprops error: case folding too long (length=%ld>%d=UGENCASE_UNFOLD_STRING_WIDTH)\n", + (long)length, UGENCASE_UNFOLD_STRING_WIDTH); + errorCode=U_INTERNAL_PROGRAM_ERROR; + } + unfold.append(s); + while(length=0) { + /* uppercase mapping as delta if the character is lowercase */ + hasMapping=TRUE; + if(type==UCASE_LOWER) { + delta=props.suc-start; + } else { + value|=UCASE_EXCEPTION; + } + } + if(props.slc>=0) { + /* lowercase mapping as delta if the character is uppercase or titlecase */ + hasMapping=TRUE; + if(type>=UCASE_UPPER) { + delta=props.slc-start; + } else { + value|=UCASE_EXCEPTION; + } + } + if(props.stc>=0) { + hasMapping=TRUE; + } + if(props.suc!=props.stc) { + value|=UCASE_EXCEPTION; + } + if(!props.lc.isEmpty() || !props.uc.isEmpty() || !props.tc.isEmpty() || + newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS) + ) { + hasMapping=TRUE; + value|=UCASE_EXCEPTION; + } + if( (props.scf>=0 && props.scf!=props.slc) || + (!props.cf.isEmpty() && props.cf!=UnicodeString(props.scf)) || + newValues.contains(PPUCD_TURKIC_CASE_FOLDING) + ) { + hasMapping=TRUE; + value|=UCASE_EXCEPTION; + } + + if(deltahasConditionalCaseMappings=newValues.contains(PPUCD_CONDITIONAL_CASE_MAPPINGS); + newExcProps->hasTurkicCaseFolding=newValues.contains(PPUCD_TURKIC_CASE_FOLDING); + value|=(uint32_t)excPropsCount<=0) { caseSensitive.add(props.scf); } + if(props.slc>=0) { caseSensitive.add(props.slc); } + if(props.suc>=0) { caseSensitive.add(props.suc); } + if(props.stc>=0) { caseSensitive.add(props.stc); } + caseSensitive.addAll(props.cf); + caseSensitive.addAll(props.lc); + caseSensitive.addAll(props.uc); + caseSensitive.addAll(props.tc); + + /* update maxFullLength */ + if(props.cf.length()>maxFullLength) { maxFullLength=props.cf.length(); } + if(props.lc.length()>maxFullLength) { maxFullLength=props.lc.length(); } + if(props.uc.length()>maxFullLength) { maxFullLength=props.uc.length(); } + if(props.tc.length()>maxFullLength) { maxFullLength=props.tc.length(); } + } + + /* add the multi-character case folding to the "unfold" data */ + if(props.cf.hasMoreChar32Than(0, 0x7fffffff, 1)) { + addUnfolding(start, props.cf, errorCode); + } +} + +uint32_t +CasePropsBuilder::makeExcProps(UChar32 c, uint32_t value, UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return 0; } + if(excPropsCount==MAX_EXC_COUNT) { + fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions\n"); + errorCode=U_INDEX_OUTOFBOUNDS_ERROR; + return 0; + } + LocalPointer newExcProps(new ExcProps); + if(newExcProps==NULL) { + fprintf(stderr, + "genprops error: casepropsbuilder out of memory allocating " + "exceptions properties\n"); + errorCode=U_MEMORY_ALLOCATION_ERROR; + return 0; + } + + if((value&UCASE_TYPE_MASK)>UCASE_NONE) { + // Decode the simple case mapping. + UChar32 next=c+UCASE_GET_DELTA(value); + if(next!=c) { + UniProps &p=newExcProps->props; + if((value&UCASE_TYPE_MASK)==UCASE_LOWER) { + p.suc=p.stc=next; + } else { + p.slc=next; + } + } + } + + value&=~(UGENCASE_EXC_MASK|UCASE_DELTA_MASK); // remove previous simple mapping + value|=(uint32_t)excPropsCount<UGENCASE_UNFOLD_CP_WIDTH) { + fprintf(stderr, "genprops error: too many code points in unfold[]: %ld>%d=UGENCASE_UNFOLD_CP_WIDTH\n", + (long)j, UGENCASE_UNFOLD_CP_WIDTH); + errorCode=U_BUFFER_OVERFLOW_ERROR; + return; + } + + /* move following rows up one */ + --unfoldRows; + u_memmove(p+UGENCASE_UNFOLD_WIDTH, p+UGENCASE_UNFOLD_WIDTH*2, (unfoldRows-i)*UGENCASE_UNFOLD_WIDTH); + } else { + p+=UGENCASE_UNFOLD_WIDTH; + ++i; + } + } + + unfoldBuffer[UCASE_UNFOLD_ROWS]=(UChar)unfoldRows; + + if(beVerbose) { + puts("unfold data:"); + + p=unfoldBuffer; + for(i=0; iU+%04lx\n", + (unsigned long)src, (unsigned long)dest); + } + + uint32_t value=utrie2_get32(pTrie, src); + if((value&UCASE_EXCEPTION)==0) { + /* + * decode value into p2 (enough for makeException() to work properly), + * add the closure mapping, + * and set the new exception for src + */ + value=makeExcProps(src, value, errorCode); + utrie2_set32(pTrie, src, value, &errorCode); + if(U_FAILURE(errorCode)) { + fprintf(stderr, "genprops error: unable to set case mapping values, code: %s\n", + u_errorName(errorCode)); + return; + } + } + excProps[value>>UGENCASE_EXC_SHIFT]->closure.add(dest); +} + +/* + * Find missing case mapping relationships and add mappings for case closure. + * This function starts from an "original" code point and recursively + * finds its case mappings and the case mappings of where it maps to. + * + * The recursion depth is capped at 3 nested calls of this function. + * In each call, the current code point is c, and the function enumerates + * all of c's simple (single-code point) case mappings. + * prev is the code point that case-mapped to c. + * prev2 is the code point that case-mapped to prev. + * + * The initial function call has prev2<0, prev<0, and c==orig + * (marking no code points). + * It enumerates c's case mappings and recurses without further action. + * + * The second-level function call has prev2<0, prev==orig, and c is + * the destination code point of one of prev's case mappings. + * The function checks if any of c's case mappings go back to orig + * and adds a closure mapping if not. + * In other words, it turns a case mapping relationship of + * orig->c + * into + * orig<->c + * + * The third-level function call has prev2==orig, prev>=0, and c is + * the destination code point of one of prev's case mappings. + * (And prev is the destination of one of prev2's case mappings.) + * The function checks if any of c's case mappings go back to orig + * and adds a closure mapping if not. + * In other words, it turns case mapping relationships of + * orig->prev->c or orig->prev<->c + * into + * orig->prev->c->orig or orig->prev<->c->orig + * etc. + * (Graphically, this closes a triangle.) + * + * With repeated application on all code points until no more closure mappings + * are added, all case equivalence groups get complete mappings. + * That is, in each group of code points with case relationships + * each code point will in the end have some mapping to each other + * code point in the group. + * + * @return TRUE if a closure mapping was added + */ +UBool +CasePropsBuilder::addClosure(UChar32 orig, UChar32 prev2, UChar32 prev, UChar32 c, uint32_t value, + UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return FALSE; } + + UChar32 next; + UBool someMappingsAdded=FALSE; + + if(c!=orig) { + /* get the properties for c */ + value=utrie2_get32(pTrie, c); + } + /* else if c==orig then c's value was passed in */ + + if(value&UCASE_EXCEPTION) { + UnicodeSet set; + + ExcProps &ep=*excProps[value>>UGENCASE_EXC_SHIFT]; + UniProps &p=ep.props; + + /* + * marker for whether any of c's mappings goes to orig + * c==orig: prevent adding a closure mapping when getting orig's own, direct mappings + */ + UBool mapsToOrig=(UBool)(c==orig); + + /* collect c's case mapping destinations in set[] */ + if((next=p.suc)>=0 && next!=c) { + set.add(next); + } + if((next=p.slc)>=0 && next!=c) { + set.add(next); + } + if(p.suc!=(next=p.stc) && next!=c) { + set.add(next); + } + if((next=p.scf)>=0 && next!=c) { + set.add(next); + } + + /* add c's current closure mappings to set */ + set.addAll(ep.closure); + + /* process all code points to which c case-maps */ + UnicodeSetIterator iter(set); + while(iter.next()) { + next=iter.getCodepoint(); /* next!=c */ + + if(next==orig) { + mapsToOrig=TRUE; /* remember that we map to orig */ + } else if(prev2<0 && next!=prev) { + /* + * recurse unless + * we have reached maximum depth (prev2>=0) or + * this is a mapping to one of the previous code points (orig, prev, c) + */ + someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode); + } + } + + if(!mapsToOrig) { + addClosureMapping(c, orig, errorCode); + return TRUE; + } + } else { + if((value&UCASE_TYPE_MASK)>UCASE_NONE) { + /* one simple case mapping, don't care which one */ + next=c+UCASE_GET_DELTA(value); + if(next!=c) { + /* + * recurse unless + * we have reached maximum depth (prev2>=0) or + * this is a mapping to one of the previous code points (orig, prev, c) + */ + if(prev2<0 && next!=orig && next!=prev) { + someMappingsAdded|=addClosure(orig, prev, c, next, 0, errorCode); + } + + if(c!=orig && next!=orig) { + /* c does not map to orig, add a closure mapping c->orig */ + addClosureMapping(c, orig, errorCode); + return TRUE; + } + } + } + } + + return someMappingsAdded; +} + +void +CasePropsBuilder::makeCaseClosure(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + + /* + * finalize the "unfold" data because we need to use it to add closure mappings + * for situations like FB05->"st"<-FB06 + * where we would otherwise miss the FB05<->FB06 relationship + */ + makeUnfoldData(errorCode); + + /* use the "unfold" data to add mappings */ + + /* p always points to the code points; this loop ignores the strings completely */ + const UChar *p=unfold.getBuffer()+UGENCASE_UNFOLD_WIDTH+UGENCASE_UNFOLD_STRING_WIDTH; + int32_t unfoldRows=unfold.length()/UGENCASE_UNFOLD_WIDTH-1; + + for(int32_t i=0; i=UCASE_MAX_EXCEPTIONS) { + fprintf(stderr, "genprops error: casepropsbuilder: too many exceptions words\n"); + errorCode=U_BUFFER_OVERFLOW_ERROR; + return 0; + } + + /* copy and shift the soft-dotted bits */ + UChar excWord=(UChar)((value&UCASE_DOT_MASK)<=0) { + slots[count]=(uint32_t)p.slc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_LOWER); + } + if( p.scf>=0 && + (p.slc>=0 ? + p.slc!=p.slc : + p.slc!=c) + ) { + slots[count]=(uint32_t)p.scf; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FOLD); + } + if(p.suc>=0) { + slots[count]=(uint32_t)p.suc; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_UPPER); + } + if(p.suc!=p.stc) { + if(p.stc>=0) { + slots[count]=(uint32_t)p.stc; + } else { + slots[count]=(uint32_t)c; + } + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_TITLE); + } + + /* length of case closure */ + UnicodeString closureString; + if(!ep.closure.isEmpty()) { + UnicodeSetIterator iter(ep.closure); + while(iter.next()) { closureString.append(iter.getCodepoint()); } + int32_t length=closureString.length(); + if(length>UCASE_CLOSURE_MAX_LENGTH) { + fprintf(stderr, + "genprops error: case closure for U+%04lX has length %d " + "which exceeds UCASE_CLOSURE_MAX_LENGTH=%d\n", + (long)c, (int)length, (int)UCASE_CLOSURE_MAX_LENGTH); + errorCode=U_BUFFER_OVERFLOW_ERROR; + return 0; + } + slots[count]=(uint32_t)length; /* must be 1..UCASE_CLOSURE_MAX_LENGTH */ + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_CLOSURE); + } + + /* lengths of full case mapping strings, stored in the last slot */ + int32_t fullLengths= + p.lc.length()| + (p.cf.length()<<4)| + (p.uc.length()<<8)| + (p.tc.length()<<12); + if(fullLengths!=0) { + slots[count]=(uint32_t)fullLengths; + slotBits|=slots[count]; + ++count; + excWord|=U_MASK(UCASE_EXC_FULL_MAPPINGS); + } + + if(count==0) { + /* No optional slots: Try to share excWord entries. */ + int32_t excIndex=exceptions.indexOf((UChar)excWord); + if(excIndex>=0) { + return excIndex; + } + /* not found */ + excIndex=exceptions.length(); + exceptions.append((UChar)excWord); + return excIndex; + } else { + /* write slots */ + int32_t excIndex=exceptions.length(); + exceptions.append((UChar)0); /* placeholder for excWord which will be stored at excIndex */ + + if(slotBits<=0xffff) { + for(int32_t i=0; i>16)); + exceptions.append((UChar)slots[i]); + } + } + + /* write the full case mapping strings */ + exceptions.append(p.lc); + exceptions.append(p.cf); + exceptions.append(p.uc); + exceptions.append(p.tc); + + /* write the closure data */ + exceptions.append(closureString); + + /* write the main exceptions word */ + exceptions.setCharAt(excIndex, (UChar)excWord); + + return excIndex; + } +} + +void +CasePropsBuilder::makeExceptions(UErrorCode &errorCode) { + if(U_FAILURE(errorCode)) { return; } + + /* + * Encode case-ignorable as delta==1 on uncased characters, + * and with an exception bit on cased characters and characters with another exception. + * + * Change from temporary UGENCASE_EXC_SHIFT'ed index into excProps[] + * to UCASE_EXC_SHIFT'ed index into encoded exceptions[]. + */ + for(UChar32 c=0; c<=0x10ffff; ++c) { + uint32_t value=utrie2_get32(pTrie, c); + if(value&(UGENCASE_IGNORABLE|UCASE_EXCEPTION)) { + /* + * If the character is cased or has another exception, + * then we store the case-ignorable flag as an exception bit. + */ + if( (value&UCASE_EXCEPTION)==0 && + (value&UGENCASE_IGNORABLE) && (value&UCASE_TYPE_MASK)!=UCASE_NONE + ) { + // Case_Ignorable and Cased and no exception: + // Create ExcProps for the Case_Ignorable flag. + value=makeExcProps(c, value, errorCode); // sets UCASE_EXCEPTION + } + if(value&UCASE_EXCEPTION) { + int32_t excIndex=makeException(c, value, *excProps[value>>UGENCASE_EXC_SHIFT], errorCode); + value=(value&~(UGENCASE_EXC_MASK|UCASE_EXC_MASK))|((uint32_t)excIndex< pnamesBuilder(createPNamesBuilder(errorCode)); LocalPointer corePropsBuilder(createCorePropsBuilder(errorCode)); LocalPointer bidiPropsBuilder(createBiDiPropsBuilder(errorCode)); + LocalPointer casePropsBuilder(createCasePropsBuilder(errorCode)); LocalPointer namesPropsBuilder(createNamesPropsBuilder(errorCode)); if(errorCode.isFailure()) { fprintf(stderr, "genprops: unable to create PropsBuilders - %s\n", errorCode.errorName()); @@ -144,11 +145,13 @@ main(int argc, char* argv[]) { const UniProps *props=ppucd.getProps(newValues, errorCode); corePropsBuilder->setProps(*props, newValues, errorCode); bidiPropsBuilder->setProps(*props, newValues, errorCode); + casePropsBuilder->setProps(*props, newValues, errorCode); namesPropsBuilder->setProps(*props, newValues, errorCode); } else if(lineType==PreparsedUCD::UNICODE_VERSION_LINE) { const UVersionInfo &version=ppucd.getUnicodeVersion(); corePropsBuilder->setUnicodeVersion(version); bidiPropsBuilder->setUnicodeVersion(version); + casePropsBuilder->setUnicodeVersion(version); namesPropsBuilder->setUnicodeVersion(version); } else if(lineType==PreparsedUCD::ALG_NAMES_RANGE_LINE) { UChar32 start, end; @@ -168,6 +171,7 @@ main(int argc, char* argv[]) { corePropsBuilder->build(errorCode); bidiPropsBuilder->build(errorCode); + casePropsBuilder->build(errorCode); namesPropsBuilder->build(errorCode); if(errorCode.isFailure()) { fprintf(stderr, "genprops error: failure finalizing the data - %s\n", @@ -190,6 +194,8 @@ main(int argc, char* argv[]) { corePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); bidiPropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode); bidiPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); + casePropsBuilder->writeCSourceFile(sourceCommon.data(), errorCode); + casePropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); namesPropsBuilder->writeBinaryData(sourceDataIn.data(), withCopyright, errorCode); return errorCode; diff --git a/tools/unicode/c/genprops/genprops.h b/tools/unicode/c/genprops/genprops.h index 7eede9563d..51ef0c8f20 100644 --- a/tools/unicode/c/genprops/genprops.h +++ b/tools/unicode/c/genprops/genprops.h @@ -46,6 +46,7 @@ public: PNamesBuilder *createPNamesBuilder(UErrorCode &errorCode); PropsBuilder *createCorePropsBuilder(UErrorCode &errorCode); PropsBuilder *createBiDiPropsBuilder(UErrorCode &errorCode); +PropsBuilder *createCasePropsBuilder(UErrorCode &errorCode); PropsBuilder *createNamesPropsBuilder(UErrorCode &errorCode); /* global flags */ diff --git a/tools/unicode/c/genprops/namespropsbuilder.cpp b/tools/unicode/c/genprops/namespropsbuilder.cpp index 60d37b4318..73658efa1c 100644 --- a/tools/unicode/c/genprops/namespropsbuilder.cpp +++ b/tools/unicode/c/genprops/namespropsbuilder.cpp @@ -144,6 +144,8 @@ /* generator data ----------------------------------------------------------- */ +U_NAMESPACE_USE + /* UDataInfo cf. udata.h */ static UDataInfo dataInfo={ sizeof(UDataInfo), @@ -516,13 +518,11 @@ compress(UErrorCode &errorCode) { for(i=0, wordNumber=0; wordNumber<(int16_t)wordCount; ++i) { if(tokens[i]!=-1) { tokens[i]=wordNumber; -#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } -#endif ++wordNumber; } } @@ -572,13 +572,11 @@ compress(UErrorCode &errorCode) { /* set token 0 to word 0 */ tokens[0]=0; -#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x000]: word%8ld \"%.*s\"\n", (long)words[0].weight, words[0].length, words[0].s); } -#endif wordNumber=1; /* set the lead byte tokens */ @@ -591,13 +589,11 @@ compress(UErrorCode &errorCode) { /* if store10Names then the parser set tokens[NAME_SEPARATOR_CHAR]=-1 */ if(tokens[i]!=-1) { tokens[i]=wordNumber; -#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } -#endif ++wordNumber; } } @@ -608,13 +604,11 @@ compress(UErrorCode &errorCode) { tokens[i]=-1; /* do not use NAME_SEPARATOR_CHAR as a second token byte */ } else { tokens[i]=wordNumber; -#ifdef DEBUG_NAMES if(beVerbose) { printf("tokens[0x%03x]: word%8ld \"%.*s\"\n", (int)i, (long)words[wordNumber].weight, words[wordNumber].length, words[wordNumber].s); } -#endif ++wordNumber; } } diff --git a/tools/unicode/c/genprops/pnamesbuilder.cpp b/tools/unicode/c/genprops/pnamesbuilder.cpp index 222ff6afaa..f838087cc3 100644 --- a/tools/unicode/c/genprops/pnamesbuilder.cpp +++ b/tools/unicode/c/genprops/pnamesbuilder.cpp @@ -279,7 +279,7 @@ public: indexes[i]=0; } - if(beVerbose) { + if(!beQuiet) { puts("* pnames.icu stats *"); printf("length of all value maps: %6ld\n", (long)valueMaps.size()); printf("length of all BytesTries: %6ld\n", (long)bytesTries.length()); @@ -487,11 +487,11 @@ PNamesBuilderImpl::writeBinaryData(const char *path, UBool withCopyright, UError void PNamesBuilderImpl::writeCSourceFile(const char *path, UErrorCode &errorCode) { if(U_FAILURE(errorCode)) { return; } - FILE *f=usrc_createFromGenerator(path, "propname_data.h", - "icu/tools/unicode/c/genprops/pnamesbuilder.cpp"); + FILE *f=usrc_create(path, "propname_data.h", + "icu/tools/unicode/c/genprops/pnamesbuilder.cpp"); if(f==NULL) { errorCode=U_FILE_ACCESS_ERROR; - return; // usrc_create() reported an error. + return; } fputs("#ifndef INCLUDED_FROM_PROPNAME_CPP\n"