ICU-8972 replace gennorm with code in preparseucd.py
X-SVN-Rev: 31174
This commit is contained in:
parent
66ead436a8
commit
72559ba1cd
14
.gitignore
vendored
14
.gitignore
vendored
@ -996,20 +996,6 @@ tools/unicode/c/gennames/gennames.vcproj.*.*.user
|
||||
tools/unicode/c/gennames/release
|
||||
tools/unicode/c/gennames/x64
|
||||
tools/unicode/c/gennames/x86
|
||||
tools/unicode/c/gennorm/*.d
|
||||
tools/unicode/c/gennorm/*.o
|
||||
tools/unicode/c/gennorm/*.pdb
|
||||
tools/unicode/c/gennorm/*.plg
|
||||
tools/unicode/c/gennorm/Debug
|
||||
tools/unicode/c/gennorm/Makefile
|
||||
tools/unicode/c/gennorm/Release
|
||||
tools/unicode/c/gennorm/debug
|
||||
tools/unicode/c/gennorm/gennorm
|
||||
tools/unicode/c/gennorm/gennorm.[0-9]
|
||||
tools/unicode/c/gennorm/gennorm.vcproj.*.*.user
|
||||
tools/unicode/c/gennorm/release
|
||||
tools/unicode/c/gennorm/x64
|
||||
tools/unicode/c/gennorm/x86
|
||||
tools/unicode/c/genprops/*.d
|
||||
tools/unicode/c/genprops/*.ncb
|
||||
tools/unicode/c/genprops/*.o
|
||||
|
@ -19,7 +19,6 @@ include_directories(
|
||||
link_directories(${ICU_INST_DIR}/lib)
|
||||
add_subdirectory(gencase)
|
||||
add_subdirectory(gennames)
|
||||
add_subdirectory(gennorm)
|
||||
add_subdirectory(genprops)
|
||||
add_subdirectory(genuca)
|
||||
add_subdirectory(genuts46)
|
||||
|
@ -1,9 +0,0 @@
|
||||
# Copyright (C) 2010, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# created on: 2010jun03
|
||||
# created by: Markus W. Scherer
|
||||
# edited on: 2010jul20
|
||||
# edited by: Stuart G. Gill
|
||||
add_executable(gennorm gennorm.c store.c)
|
||||
target_link_libraries(gennorm icuuc icutu)
|
@ -1,97 +0,0 @@
|
||||
## Makefile.in for ICU - tools/gennorm
|
||||
## Copyright (c) 2001-2005, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
## Steven R. Loomis/Markus W. Scherer
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/gennorm
|
||||
|
||||
TARGET_STUB_NAME = gennorm
|
||||
|
||||
SECTION = 8
|
||||
|
||||
#MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
|
||||
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
|
||||
|
||||
## Target information
|
||||
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
|
||||
|
||||
ifneq ($(top_builddir),$(top_srcdir))
|
||||
CPPFLAGS += -I$(top_builddir)/common
|
||||
endif
|
||||
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = gennorm.o store.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check check-local install-man
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET) $(MAN_FILES)
|
||||
|
||||
install-local: all-local install-man
|
||||
# $(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
|
||||
# $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)
|
||||
|
||||
install-man: $(MAN_FILES)
|
||||
# $(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
# $(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(OBJECTS)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
|
||||
$(POST_BUILD_STEP)
|
||||
|
||||
|
||||
%.$(SECTION): $(srcdir)/%.$(SECTION).in
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
|
@ -1,399 +0,0 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: gennorm.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This program reads the Unicode character database text file,
|
||||
* parses it, and extracts the data for normalization.
|
||||
* It then preprocesses it and writes a binary file for efficient use
|
||||
* in various Unicode text normalization processes.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "unewdata.h"
|
||||
#include "uoptions.h"
|
||||
#include "uparse.h"
|
||||
#include "unormimp.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
#include "gennorm.h"
|
||||
U_CDECL_END
|
||||
|
||||
UBool beVerbose=FALSE;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError);
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
enum {
|
||||
HELP_H,
|
||||
HELP_QUESTION_MARK,
|
||||
VERBOSE,
|
||||
DESTDIR,
|
||||
SOURCEDIR,
|
||||
ICUDATADIR
|
||||
};
|
||||
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_SOURCEDIR,
|
||||
UOPTION_ICUDATADIR
|
||||
};
|
||||
|
||||
extern int
|
||||
main(int argc, char* argv[]) {
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
char filename[300];
|
||||
#endif
|
||||
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
|
||||
char *basename=NULL;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
/* preset then read command line options */
|
||||
options[DESTDIR].value=u_getDataDirectory();
|
||||
options[SOURCEDIR].value="";
|
||||
options[ICUDATADIR].value=u_getDataDirectory();
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
}
|
||||
if(argc<0 || options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur) {
|
||||
/*
|
||||
* Broken into chucks because the C89 standard says the minimum
|
||||
* required supported string length is 509 bytes.
|
||||
*/
|
||||
fprintf(stderr,
|
||||
"Usage: %s [-options] [suffix]\n"
|
||||
"\n"
|
||||
"Read the UnicodeData.txt file and other Unicode properties files and\n"
|
||||
"write nfc.txt and nfkc.txt files for gennorm2\n"
|
||||
"\n",
|
||||
argv[0]);
|
||||
fprintf(stderr,
|
||||
"Options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-v or --verbose verbose output\n");
|
||||
fprintf(stderr,
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t-s or --sourcedir source directory, followed by the path\n"
|
||||
"\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
|
||||
"\t followed by path, defaults to <%s>\n"
|
||||
"\tsuffix suffix that is to be appended with a '-'\n"
|
||||
"\t to the source file basenames before opening;\n"
|
||||
"\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
|
||||
u_getDataDirectory());
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
beVerbose=options[VERBOSE].doesOccur;
|
||||
srcDir=options[SOURCEDIR].value;
|
||||
destDir=options[DESTDIR].value;
|
||||
|
||||
if(argc>=2) {
|
||||
suffix=argv[1];
|
||||
} else {
|
||||
suffix=NULL;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
if (options[ICUDATADIR].doesOccur) {
|
||||
u_setDataDirectory(options[ICUDATADIR].value);
|
||||
}
|
||||
|
||||
/*
|
||||
* Verify that we can work with properties
|
||||
* but don't call u_init() because that needs unorm.icu which we are just
|
||||
* going to build here.
|
||||
*/
|
||||
{
|
||||
U_STRING_DECL(ideo, "[:Ideographic:]", 15);
|
||||
USet *set;
|
||||
|
||||
U_STRING_INIT(ideo, "[:Ideographic:]", 15);
|
||||
set=uset_openPattern(ideo, -1, &errorCode);
|
||||
if(U_FAILURE(errorCode) || !uset_contains(set, 0xf900)) {
|
||||
fprintf(stderr, "gennorm is unable to work with properties (uprops.icu): %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
uset_close(set);
|
||||
}
|
||||
|
||||
/* prepare the filename beginning with the source dir */
|
||||
uprv_strcpy(filename, srcDir);
|
||||
basename=filename+uprv_strlen(filename);
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR && *(basename-1)!=U_FILE_ALT_SEP_CHAR) {
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
init();
|
||||
|
||||
/* process DerivedNormalizationProps.txt (name changed for Unicode 3.2, to <=31 characters) */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProps.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProps");
|
||||
basename[30]='-';
|
||||
uprv_strcpy(basename+31, suffix);
|
||||
uprv_strcat(basename+31, ".txt");
|
||||
}
|
||||
parseDerivedNormalizationProperties(filename, &errorCode, FALSE);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
/* can be only U_FILE_ACCESS_ERROR - try filename from before Unicode 3.2 */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProperties.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "DerivedNormalizationProperties");
|
||||
basename[30]='-';
|
||||
uprv_strcpy(basename+31, suffix);
|
||||
uprv_strcat(basename+31, ".txt");
|
||||
}
|
||||
parseDerivedNormalizationProperties(filename, &errorCode, TRUE);
|
||||
}
|
||||
|
||||
/* process UnicodeData.txt */
|
||||
if(suffix==NULL) {
|
||||
uprv_strcpy(basename, "UnicodeData.txt");
|
||||
} else {
|
||||
uprv_strcpy(basename, "UnicodeData");
|
||||
basename[11]='-';
|
||||
uprv_strcpy(basename+12, suffix);
|
||||
uprv_strcat(basename+12, ".txt");
|
||||
}
|
||||
parseDB(filename, &errorCode);
|
||||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
writeNorm2(destDir);
|
||||
|
||||
cleanUpData();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/* parser for DerivedNormalizationProperties.txt ---------------------------- */
|
||||
|
||||
static void U_CALLCONV
|
||||
derivedNormalizationPropertiesLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
UChar string[32];
|
||||
char *s;
|
||||
uint32_t start, end;
|
||||
int32_t count;
|
||||
|
||||
/* get code point range */
|
||||
count=u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "gennorm: error parsing DerivedNormalizationProperties.txt mapping at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* ignore hangul - handle explicitly */
|
||||
if(start==0xac00) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get property - ignore unrecognized ones */
|
||||
s=(char *)u_skipWhitespace(fields[1][0]);
|
||||
if(0==uprv_memcmp(s, "Comp_Ex", 7) || 0==uprv_memcmp(s, "Full_Composition_Exclusion", 26)) {
|
||||
/* full composition exclusion */
|
||||
while(start<=end) {
|
||||
setCompositionExclusion(start++);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseDerivedNormalizationProperties(const char *filename, UErrorCode *pErrorCode, UBool reportError) {
|
||||
char *fields[2][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 2, derivedNormalizationPropertiesLineFn, NULL, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
|
||||
fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/* parser for UnicodeData.txt ----------------------------------------------- */
|
||||
|
||||
static void U_CALLCONV
|
||||
unicodeDataLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t decomp[40];
|
||||
Norm norm;
|
||||
const char *s;
|
||||
char *end;
|
||||
uint32_t code, value;
|
||||
int32_t length;
|
||||
UBool isCompat, something=FALSE;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* reset the properties */
|
||||
uprv_memset(&norm, 0, sizeof(Norm));
|
||||
|
||||
/* get the character code, field 0 */
|
||||
code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "gennorm: syntax error in field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
/* get canonical combining class, field 3 */
|
||||
value=(uint32_t)uprv_strtoul(fields[3][0], &end, 10);
|
||||
if(end<=fields[3][0] || end!=fields[3][1] || value>0xff) {
|
||||
fprintf(stderr, "gennorm: syntax error in field 3 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
if(value>0) {
|
||||
norm.udataCC=(uint8_t)value;
|
||||
something=TRUE;
|
||||
}
|
||||
|
||||
/* get the decomposition, field 5 */
|
||||
if(fields[5][0]<fields[5][1]) {
|
||||
if(*(s=fields[5][0])=='<') {
|
||||
++s;
|
||||
isCompat=TRUE;
|
||||
|
||||
/* skip and ignore the compatibility type name */
|
||||
do {
|
||||
if(s==fields[5][1]) {
|
||||
/* missing '>' */
|
||||
fprintf(stderr, "gennorm: syntax error in field 5 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
} while(*s++!='>');
|
||||
} else {
|
||||
isCompat=FALSE;
|
||||
}
|
||||
|
||||
/* parse the decomposition string */
|
||||
length=u_parseCodePoints(s, decomp, sizeof(decomp)/4, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "gennorm error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
|
||||
(long)code, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* store the string */
|
||||
if(length>0) {
|
||||
something=TRUE;
|
||||
if(isCompat) {
|
||||
norm.lenNFKD=(uint8_t)length;
|
||||
norm.nfkd=decomp;
|
||||
} else {
|
||||
if(length>2) {
|
||||
fprintf(stderr, "gennorm: error - length of NFD(U+%04lx) = %ld >2 in UnicodeData - illegal\n",
|
||||
(long)code, (long)length);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
norm.lenNFD=(uint8_t)length;
|
||||
norm.nfd=decomp;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* check for non-character code points */
|
||||
if((code&0xfffe)==0xfffe || (uint32_t)(code-0xfdd0)<0x20 || code>0x10ffff) {
|
||||
fprintf(stderr, "gennorm: error - properties for non-character code point U+%04lx\n",
|
||||
(long)code);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
if(something) {
|
||||
/* there are normalization values, so store them */
|
||||
#if 0
|
||||
if(beVerbose) {
|
||||
printf("store values for U+%04lx: cc=%d, lenNFD=%ld, lenNFKD=%ld\n",
|
||||
(long)code, norm.udataCC, (long)norm.lenNFD, (long)norm.lenNFKD);
|
||||
}
|
||||
#endif
|
||||
storeNorm(code, &norm);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseDB(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[15][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 15, unicodeDataLineFn, NULL, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "gennorm error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
@ -1,50 +0,0 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: gennorm.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
/* guard named after this header; it used to reuse the genprops guard name */
#ifndef __GENNORM_H__
#define __GENNORM_H__

#include "unicode/utypes.h"

/*
 * data structure that holds the normalization properties for one or more
 * code point(s) at build time
 */
typedef struct Norm {
    /* combining class from UnicodeData.txt and decomposition lengths in code points */
    uint8_t udataCC, lenNFD, lenNFKD;
    /* UTF-32 decomposition strings; the pointer is meaningful only if the matching length is not 0 */
    uint32_t *nfd, *nfkd;
} Norm;

/* global flags */

/* TRUE if verbose output was requested on the command line */
extern UBool beVerbose;

/* prototypes */

/* set up the builder data structures; call once before any other function below */
extern void
init(void);

/* store a copy of *norm, including its decomposition string, for code point code */
extern void
storeNorm(uint32_t code, Norm *norm);

/* add code to the set of composition exclusions */
extern void
setCompositionExclusion(uint32_t code);

/*
 * write the collected data into the directory dataDir
 * NOTE(review): per the tool's usage text the output is nfc.txt and nfkc.txt
 * for gennorm2 - confirm against the implementation.
 */
extern void
writeNorm2(const char *dataDir);

/* NOTE(review): presumably releases the builder data set up by init() - confirm */
extern void
cleanUpData(void);

#endif
|
@ -1,407 +0,0 @@
|
||||
<?xml version="1.0" encoding="Windows-1252"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="9.00"
|
||||
Name="gennorm"
|
||||
ProjectGUID="{F5213103-6CBE-46E6-B4CC-2570B6837D86}"
|
||||
TargetFrameworkVersion="131072"
|
||||
>
|
||||
<Platforms>
|
||||
<Platform
|
||||
Name="Win32"
|
||||
/>
|
||||
<Platform
|
||||
Name="x64"
|
||||
/>
|
||||
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory=".\x86\Release"
|
||||
IntermediateDirectory=".\x86\Release"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin
"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TypeLibraryName=".\x86\Release/gennorm.tlb"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x86\Release/gennorm.pch"
|
||||
AssemblerListingLocation=".\x86\Release/"
|
||||
ObjectFile=".\x86\Release/"
|
||||
ProgramDataBaseFileName=".\x86\Release/"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Release/gennorm.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
ProgramDatabaseFile=".\x86\Release/gennorm.pdb"
|
||||
SubSystem="1"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory=".\x86\Debug"
|
||||
IntermediateDirectory=".\x86\Debug"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin
"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TypeLibraryName=".\x86\Debug/gennorm.tlb"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x86\Debug/gennorm.pch"
|
||||
AssemblerListingLocation=".\x86\Debug/"
|
||||
ObjectFile=".\x86\Debug/"
|
||||
ProgramDataBaseFileName=".\x86\Debug/"
|
||||
BrowseInformation="1"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
DebugInformationFormat="4"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Debug/gennorm.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\x86\Debug/gennorm.pdb"
|
||||
SubSystem="1"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
UseFAT32Workaround="true"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|x64"
|
||||
OutputDirectory=".\x64\Release"
|
||||
IntermediateDirectory=".\x64\Release"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin64
"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TargetEnvironment="3"
|
||||
TypeLibraryName=".\x64\Release/gennorm.tlb"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x64\Release/gennorm.pch"
|
||||
AssemblerListingLocation=".\x64\Release/"
|
||||
ObjectFile=".\x64\Release/"
|
||||
ProgramDataBaseFileName=".\x64\Release/"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x64\Release/gennorm.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
ProgramDatabaseFile=".\x64\Release/gennorm.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="17"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Debug|x64"
|
||||
OutputDirectory=".\x64\Debug"
|
||||
IntermediateDirectory=".\x64\Debug"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin64
"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TargetEnvironment="3"
|
||||
TypeLibraryName=".\x64\Debug/gennorm.tlb"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x64\Debug/gennorm.pch"
|
||||
AssemblerListingLocation=".\x64\Debug/"
|
||||
ObjectFile=".\x64\Debug/"
|
||||
ProgramDataBaseFileName=".\x64\Debug/"
|
||||
BrowseInformation="1"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
DebugInformationFormat="3"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x64\Debug/gennorm.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\x64\Debug/gennorm.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="17"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
UseFAT32Workaround="true"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<File
|
||||
RelativePath=".\gennorm.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\gennorm.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\store.c"
|
||||
>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
@ -1,289 +0,0 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: store.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2001may25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Store Unicode normalization data in a memory-mappable file.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "cmemory.h"
|
||||
#include "filestrm.h"
|
||||
#include "utrie.h"
|
||||
#include "toolutil.h"
|
||||
#include "writesrc.h"
|
||||
#include "unormimp.h"
|
||||
#include "gennorm.h"
|
||||
|
||||
#define DO_DEBUG_OUT 0
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/* builder data ------------------------------------------------------------- */
|
||||
|
||||
static UNewTrie *normTrie;
|
||||
|
||||
static UToolMemory *normMem, *utf32Mem;
|
||||
|
||||
static Norm *norms;
|
||||
|
||||
static USet *compositionExclusions;
|
||||
|
||||
/* allocate and initialize a Norm unit */
|
||||
static Norm *
|
||||
allocNorm() {
|
||||
/* allocate Norm */
|
||||
Norm *p=(Norm *)utm_alloc(normMem);
|
||||
return p;
|
||||
}
|
||||
|
||||
extern void
|
||||
init() {
|
||||
uint16_t *p16;
|
||||
|
||||
normTrie = (UNewTrie *)uprv_malloc(sizeof(UNewTrie));
|
||||
uprv_memset(normTrie, 0, sizeof(UNewTrie));
|
||||
|
||||
/* initialize the two tries */
|
||||
if(NULL==utrie_open(normTrie, NULL, 30000, 0, 0, FALSE)) {
|
||||
fprintf(stderr, "error: failed to initialize tries\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
|
||||
/* allocate Norm structures and reset the first one */
|
||||
normMem=utm_open("gennorm normalization structs", 20000, 20000, sizeof(Norm));
|
||||
norms=allocNorm();
|
||||
|
||||
/* allocate UTF-32 string memory */
|
||||
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
|
||||
|
||||
compositionExclusions=uset_openEmpty();
|
||||
}
|
||||
|
||||
/*
|
||||
* get or create a Norm unit;
|
||||
* get or create the intermediate trie entries for it as well
|
||||
*/
|
||||
static Norm *
|
||||
createNorm(uint32_t code) {
|
||||
Norm *p;
|
||||
uint32_t i;
|
||||
|
||||
i=utrie_get32(normTrie, (UChar32)code, NULL);
|
||||
if(i!=0) {
|
||||
p=norms+i;
|
||||
} else {
|
||||
/* allocate Norm */
|
||||
p=allocNorm();
|
||||
if(!utrie_set32(normTrie, (UChar32)code, (uint32_t)(p-norms))) {
|
||||
fprintf(stderr, "error: too many normalization entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}
|
||||
return p;
|
||||
}
|
||||
|
||||
/* processing incoming normalization data ----------------------------------- */
|
||||
|
||||
/*
|
||||
* process the data for one code point listed in UnicodeData;
|
||||
* UnicodeData itself never maps a code point to both NFD and NFKD
|
||||
*/
|
||||
extern void
|
||||
storeNorm(uint32_t code, Norm *norm) {
|
||||
Norm *p=createNorm(code);
|
||||
|
||||
/* store the data */
|
||||
uprv_memcpy(p, norm, sizeof(Norm));
|
||||
|
||||
/* store the decomposition string if there is one here */
|
||||
if(norm->lenNFD!=0) {
|
||||
uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFD);
|
||||
uprv_memcpy(s32, norm->nfd, norm->lenNFD*4);
|
||||
p->nfd=s32;
|
||||
} else if(norm->lenNFKD!=0) {
|
||||
uint32_t *s32=utm_allocN(utf32Mem, norm->lenNFKD);
|
||||
uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4);
|
||||
p->nfkd=s32;
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
setCompositionExclusion(uint32_t code) {
|
||||
uset_add(compositionExclusions, (UChar32)code);
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
writeAllCC(FILE *f) {
|
||||
uint32_t i;
|
||||
UChar32 prevCode, code;
|
||||
uint8_t prevCC, cc;
|
||||
UBool isInBlockZero;
|
||||
|
||||
fprintf(f, "# Canonical_Combining_Class (ccc) values\n");
|
||||
prevCode=0;
|
||||
prevCC=0;
|
||||
for(code=0; code<=0x110000;) {
|
||||
if(code==0x110000) {
|
||||
cc=0;
|
||||
} else {
|
||||
i=utrie_get32(normTrie, code, &isInBlockZero);
|
||||
if(i==0 || isInBlockZero) {
|
||||
cc=0;
|
||||
} else {
|
||||
cc=norms[i].udataCC;
|
||||
}
|
||||
}
|
||||
if(prevCC!=cc) {
|
||||
if(prevCC!=0) {
|
||||
uint32_t lastCode=code-1;
|
||||
if(prevCode==lastCode) {
|
||||
fprintf(f, "%04lX:%d\n", (long)lastCode, prevCC);
|
||||
} else {
|
||||
fprintf(f, "%04lX..%04lX:%d\n",
|
||||
(long)prevCode, (long)lastCode, prevCC);
|
||||
}
|
||||
}
|
||||
prevCode=code;
|
||||
prevCC=cc;
|
||||
}
|
||||
if(isInBlockZero) {
|
||||
code+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
++code;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static UBool
|
||||
hasMapping(uint32_t code) {
|
||||
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
||||
return norm->lenNFD!=0 || norm->lenNFKD!=0;
|
||||
}
|
||||
|
||||
static UBool
|
||||
hasOneWayMapping(uint32_t code, UBool withCompat) {
|
||||
for(;;) {
|
||||
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
||||
uint8_t length;
|
||||
if((length=norm->lenNFD)!=0) {
|
||||
/*
|
||||
* The canonical decomposition is a one-way mapping if
|
||||
* - it does not map to exactly two code points
|
||||
* - the code has ccc!=0
|
||||
* - the code has the Composition_Exclusion property
|
||||
* - its starter has a one-way mapping (loop for this)
|
||||
* - its non-starter decomposes
|
||||
*/
|
||||
if( length!=2 ||
|
||||
norm->udataCC!=0 ||
|
||||
uset_contains(compositionExclusions, (UChar32)code) ||
|
||||
hasMapping(norm->nfd[1])
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
code=norm->nfd[0]; /* continue */
|
||||
} else if(withCompat && norm->lenNFKD!=0) {
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
writeAllMappings(FILE *f, UBool withCompat) {
|
||||
uint32_t i, code;
|
||||
UBool isInBlockZero;
|
||||
|
||||
if(withCompat) {
|
||||
fprintf(f, "\n# Canonical and compatibility decomposition mappings\n");
|
||||
} else {
|
||||
fprintf(f, "\n# Canonical decomposition mappings\n");
|
||||
}
|
||||
for(code=0; code<=0x10ffff;) {
|
||||
i=utrie_get32(normTrie, code, &isInBlockZero);
|
||||
if(isInBlockZero) {
|
||||
code+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
if(i!=0) {
|
||||
uint32_t *s32;
|
||||
uint8_t length;
|
||||
char separator;
|
||||
if((length=norms[i].lenNFD)!=0) {
|
||||
s32=norms[i].nfd;
|
||||
separator= hasOneWayMapping(code, withCompat) ? '>' : '=';
|
||||
} else if(withCompat && (length=norms[i].lenNFKD)!=0) {
|
||||
s32=norms[i].nfkd;
|
||||
separator='>';
|
||||
}
|
||||
if(length!=0) {
|
||||
uint8_t j;
|
||||
fprintf(f, "%04lX%c", (long)code, separator);
|
||||
for(j=0; j<length; ++j) {
|
||||
if(j!=0) {
|
||||
fputc(' ', f);
|
||||
}
|
||||
fprintf(f, "%04lX", (long)s32[j]);
|
||||
}
|
||||
fputc('\n', f);
|
||||
}
|
||||
}
|
||||
++code;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
writeNorm2TextFile(const char *path, const char *filename, UBool withCompat) {
|
||||
FILE *f=usrc_createTextData(path, filename);
|
||||
if(f==NULL) {
|
||||
exit(U_FILE_ACCESS_ERROR);
|
||||
}
|
||||
writeAllCC(f);
|
||||
writeAllMappings(f, withCompat);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
extern void
|
||||
writeNorm2(const char *dataDir) {
|
||||
writeNorm2TextFile(dataDir, "nfc.txt", FALSE);
|
||||
writeNorm2TextFile(dataDir, "nfkc.txt", TRUE);
|
||||
}
|
||||
|
||||
extern void
|
||||
cleanUpData(void) {
|
||||
utm_close(normMem);
|
||||
utm_close(utf32Mem);
|
||||
utrie_close(normTrie);
|
||||
uprv_free(normTrie);
|
||||
uset_close(compositionExclusions);
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
@ -80,8 +80,10 @@ _ignored_properties = set((
|
||||
"OMath",
|
||||
"OUpper",
|
||||
# Further properties that just contribute to others.
|
||||
"CE", # Composition_Exclusion just contributes to Full_Composition_Exclusion.
|
||||
"JSN",
|
||||
# These properties just don't seem useful.
|
||||
# They are deprecated since Unicode 6.0.
|
||||
"XO_NFC",
|
||||
"XO_NFD",
|
||||
"XO_NFKC",
|
||||
@ -235,7 +237,7 @@ _alg_names_ranges = []
|
||||
# Starts with one range for all of Unicode without any properties.
|
||||
# Setting values subdivides ranges.
|
||||
_starts = array.array('l', [0, 0x110000]) # array of int32_t
|
||||
_props = [{}] # props for 0 but not 110000
|
||||
_props = [{}, {}] # props for 0 and 110000
|
||||
|
||||
def FindRange(x):
|
||||
""" Binary search for x in the inversion map.
|
||||
@ -243,6 +245,11 @@ def FindRange(x):
|
||||
return bisect.bisect(_starts, x) - 1
|
||||
|
||||
|
||||
def GetProps(c):
  """Returns the properties dict of the range that contains code point c."""
  return _props[FindRange(c)]
|
||||
|
||||
|
||||
def UpdateProps(start, end, update):
|
||||
assert 0 <= start <= end <= 0x10ffff
|
||||
(need_to_update, do_update, u) = (update[0], update[1], update[2])
|
||||
@ -1108,6 +1115,101 @@ def WritePreparsedUCD(out_file):
|
||||
if props:
|
||||
WriteFieldsRangeProps(["cp"], start, end, props, out_file)
|
||||
|
||||
# Write Normalizer2 input files -------------------------------------------- ***
|
||||
# Ported from gennorm/store.c.
|
||||
|
||||
def WriteAllCC(out_file):
  """Writes all non-zero Canonical_Combining_Class values to out_file.

  Walks the _starts/_props inversion map and writes one line per run of
  adjacent ranges with the same non-zero ccc: "XXXX:ccc" for a single
  code point, "XXXX..YYYY:ccc" for a range.
  """
  out_file.write("# Canonical_Combining_Class (ccc) values\n")
  run_start = 0
  run_cc = 0
  for i, start in enumerate(_starts):
    # A missing or empty ccc counts as 0.
    cc = _props[i].get("ccc") or 0
    if cc == run_cc:
      continue
    # The value changed: flush the run that just ended if it was non-zero.
    if run_cc != 0:
      run_end = start - 1
      if run_start == run_end:
        out_file.write("%04X:%d\n" % (run_end, run_cc))
      else:
        out_file.write("%04X..%04X:%d\n" % (run_start, run_end, run_cc))
    run_start = start
    run_cc = cc
|
||||
|
||||
|
||||
def HasMapping(c):
  """Returns True if code point c has a decomposition mapping.

  A code point has a mapping when its "dt" (Decomposition_Type) property
  is set to anything other than "None".
  Always returns a real bool: a bare `dt and ...` would hand back None or ""
  when the property is missing or empty.
  """
  dt = GetProps(c).get("dt")
  return bool(dt) and dt != "None"
|
||||
|
||||
|
||||
def HasOneWayMapping(c, with_compat):
  """Returns True if the decomposition mapping of c is one-way.

  A canonical decomposition is a one-way mapping if
  - it does not map to exactly two code points
  - c has ccc!=0
  - c has the Comp_Ex property set
  - its non-starter (second code point) decomposes
  - its starter (first code point) has a one-way mapping;
    this is checked by looping with the starter as the new c

  A compatibility mapping counts as one-way only if with_compat is true.
  A code point without any mapping yields False.
  """
  while True:
    props = GetProps(c)
    dt = props.get("dt")
    if not dt or dt == "None":
      return False  # no mapping
    if dt != "Can":
      # c has a compatibility mapping.
      return with_compat
    nfd = props["dm"].split()
    if len(nfd) != 2:
      return True
    if props.get("ccc") or props.get("Comp_Ex"):
      return True
    if HasMapping(int(nfd[1], 16)):
      return True
    c = int(nfd[0], 16)  # continue with the starter
|
||||
|
||||
|
||||
def WriteAllMappings(out_file, with_compat):
  """Writes the decomposition mappings of all code points to out_file.

  Each line is the code point, a separator and the "dm" value:
  '=' for a round-trip mapping, '>' for a one-way mapping.
  Values starting with '<' are skipped.
  Non-canonical mappings are written only if with_compat is true.
  """
  if with_compat:
    header = "\n# Canonical and compatibility decomposition mappings\n"
  else:
    header = "\n# Canonical decomposition mappings\n"
  out_file.write(header)
  # The last entry of _starts only terminates the final range.
  for i in range(len(_starts) - 1):
    props = _props[i]
    dm = props.get("dm")
    if not dm or dm[0] == '<':
      continue
    if not with_compat and props["dt"] != "Can":
      continue
    start = _starts[i]
    # A mapping belongs to exactly one code point, never to a longer range.
    assert start == _starts[i + 1] - 1
    one_way = HasOneWayMapping(start, with_compat)
    out_file.write("%04X%s%s\n" % (start, '>' if one_way else '=', dm))
|
||||
|
||||
|
||||
def WriteNorm2TextFile(path, filename, with_compat):
  """Writes one Normalizer2 input text file.

  Creates path/filename with a copyright header for the current year,
  a line naming the normalization form (NFKC if with_compat, else NFC)
  and the Unicode version, then the ccc values and the mappings.
  """
  norm_type = "NFKC" if with_compat else "NFC"
  year = datetime.date.today().strftime("%Y")
  out_path = os.path.join(path, filename)
  with open(out_path, "w") as out_file:
    out_file.write(
        """# Copyright (C) 1999-""" + year +
        """, International Business Machines
# Corporation and others. All Rights Reserved.
#
# file name: """ + filename + """
#
# machine-generated by ICU preparseucd.py
#
""")
    out_file.write(
        "# Complete data for Unicode " + norm_type + " normalization.\n")
    out_file.write("# Unicode " + _ucd_version + "\n\n")
    WriteAllCC(out_file)
    WriteAllMappings(out_file, with_compat)
|
||||
|
||||
|
||||
def WriteNorm2(path):
  """Writes nfc.txt (canonical only) and nfkc.txt (with compatibility)."""
  for filename, with_compat in (("nfc.txt", False), ("nfkc.txt", True)):
    WriteNorm2TextFile(path, filename, with_compat)
|
||||
|
||||
# Preprocessing ------------------------------------------------------------ ***
|
||||
|
||||
_strip_re = re.compile("([0-9a-fA-F]+.+?) *#.*")
|
||||
@ -1778,11 +1880,15 @@ def main():
|
||||
if pnv:
|
||||
raise Exception("no default values (@missing lines) for " +
|
||||
"some Catalog or Enumerated properties: %s " % pnv)
|
||||
# Write Normalizer2 input text files.
|
||||
# Do this before compacting the data so that we need not handle fallbacks.
|
||||
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
|
||||
norm2_path = os.path.join(unidata_path, "norm2")
|
||||
if not os.path.exists(norm2_path): os.makedirs(norm2_path)
|
||||
WriteNorm2(norm2_path)
|
||||
# Optimize block vs. cp properties.
|
||||
CompactBlocks()
|
||||
# Write the ppucd.txt output file.
|
||||
unidata_path = os.path.join(icu_src_root, "source", "data", "unidata")
|
||||
if not os.path.exists(unidata_path): os.makedirs(unidata_path)
|
||||
out_path = os.path.join(unidata_path, "ppucd.txt")
|
||||
with open(out_path, "w") as out_file:
|
||||
WritePreparsedUCD(out_file)
|
||||
|
Loading…
Reference in New Issue
Block a user