ICU-2194 IDNA data trie generator
X-SVN-Rev: 11193
This commit is contained in:
parent
41d36b7650
commit
78f36c9a5a
101
icu4c/source/tools/genidna/Makefile.in
Normal file
101
icu4c/source/tools/genidna/Makefile.in
Normal file
@ -0,0 +1,101 @@
|
||||
## Makefile.in for ICU - tools/genidna
|
||||
## Copyright (c) 1999-2001, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
##
|
||||
|
||||
SECTION = 8
|
||||
|
||||
MAN_FILES = $(TARGET:$(EXEEXT)=).$(SECTION)
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/genidna
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(MAN_FILES) $(DEPS)
|
||||
|
||||
## Target information
|
||||
TARGET = genidna$(EXEEXT)
|
||||
|
||||
CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = genidna.o store.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check \
|
||||
check-local install-man
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET) $(MAN_FILES)
|
||||
|
||||
install-local: all-local install-man
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
|
||||
$(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)/$(TARGET)
|
||||
|
||||
# Nothing to do for 'make dist' in this directory; the empty rule only
# satisfies the 'dist: dist-local' prerequisite declared above.
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(OBJECTS)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
|
||||
|
||||
|
||||
# the 'mv' will always fail if you are building in the source dir
|
||||
|
||||
# man page
|
||||
install-man: $(MAN_FILES)
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
$(INSTALL_DATA) $< $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
|
||||
%.$(SECTION): $(srcdir)/%.$(SECTION).in
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
# build postscript and pdf formats
|
||||
#$(TARGET).ps: $(TARGET).$(SECTION)
|
||||
# groff -man < $< > $@
|
||||
|
||||
#$(TARGET).pdf: $(TARGET).ps
|
||||
# ps2pdf $< $@
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
172
icu4c/source/tools/genidna/filterRFC3454.pl
Executable file
172
icu4c/source/tools/genidna/filterRFC3454.pl
Executable file
@ -0,0 +1,172 @@
|
||||
#!/usr/bin/perl
|
||||
|
||||
####################################################################################
|
||||
# filterRFC3454.pl:
|
||||
# This tool filters the RFC-3454 txt file for String prep tables
|
||||
# Author: Ram Viswanadha
|
||||
#
|
||||
####################################################################################
|
||||
|
||||
use File::Find;
|
||||
use File::Basename;
|
||||
use IO::File;
|
||||
use Cwd;
|
||||
use File::Copy;
|
||||
use Getopt::Long;
|
||||
use File::Path;
|
||||
use File::Copy;
|
||||
|
||||
$warning = "###################\n# WARNING: This table is generated by filterRFC3454.pl tool. DO NOT EDIT \n#################\n\n";
|
||||
#run the program
|
||||
main();
|
||||
|
||||
#---------------------------------------------------------------------
|
||||
# The main program
|
||||
|
||||
# Entry point: parses the command line, opens the RFC text and hands every
# recognised "Start Table" section to the matching table writer.
#   --sourcedir=<dir>   directory that contains the RFC text
#   --destdir=<dir>     directory passed to the table writers
#   --filename=<name>   name of the RFC text file inside --sourcedir
# Calls usage() when the command line cannot be parsed or an option is
# missing; dies when the RFC text cannot be opened.
sub main {
    my ($sourceDir, $destDir, $fileName);

    GetOptions(
        "--sourcedir=s" => \$sourceDir,
        "--destdir=s"   => \$destDir,
        "--filename=s"  => \$fileName,
    ) or usage();

    usage() unless defined($sourceDir) && defined($destDir) && defined($fileName);

    my $infile = $sourceDir."/".$fileName;
    my $inFH = IO::File->new($infile,"r")
        or die "could not open the file $infile for reading: $! \n";

    while(defined (my $line = <$inFH>)){
        # Only table headers are interesting here; capture the table name
        # (A.1, B.2, C.1.1, ...) instead of pattern-matching the whole line
        # with unescaped dots.
        next unless $line =~ /Start\sTable\s+([A-Z](?:\.\d+)+)/;
        my $table = $1;

        # Each writer consumes the input up to the table's "End Table" line.
        if($table eq "A.1"){
            createUnassignedTable($inFH,$destDir);
        }elsif($table eq "B.1"){
            createCaseMapNoNorm($inFH,$destDir);
        }elsif($table eq "B.2"){
            createCaseMap($inFH,$destDir);
        }elsif($table =~ /^C\./){
            # all C.* tables are collected into one file
            createProhibitedTable($inFH,$destDir,$line);
        }
    }
    close($inFH);
}
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# Copies one table from $inFH to $outFH, starting at the current read
# position and stopping after the table's "End Table" line.
#   $inFH     input handle
#   $outFH    output handle
#   $comment  text written (followed by a newline) before the table data
#   $print    optional; when 1, every data line is also echoed to STDOUT
# Page headings, form feeds and blank lines are dropped. A leading
# "XXXX-YYYY" range is rewritten as "XXXX..YYYY", and a ';' is appended to
# lines that have none. When "End Table" is reached a "# Total code points"
# line is written; if the input ends first, no total is written.
sub readPrint{
    my ($inFH, $outFH, $comment, $print) = @_;
    my $count = 0;

    print {$outFH} $comment."\n";

    while(defined (my $line = <$inFH>)){
        next if $line =~ /Hoffman\s\&\sBlanchet/; # ignore heading
        next if $line =~ /RFC\s3454/;             # ignore heading
        next if $line =~ /\f/;                    # ignore form feed
        next if $line !~ /\S/;                    # ignore blank lines

        # stop once "End Table" is found
        if($line =~ /End\sTable/){
            print {$outFH} "\n# Total code points $count\n\n";
            return;
        }
        if(defined($print) && $print == 1){
            print $line;
        }

        $line =~ s/^\s+//;
        # Rewrite the separator only inside a leading code point range, so a
        # hyphen in the trailing description (e.g. "NO-BREAK SPACE") is kept.
        $line =~ s/^([0-9A-Fa-f]+)-([0-9A-Fa-f]+)/$1..$2/;
        # make sure the code point field is terminated by ';'
        $line =~ s/$/;/ unless $line =~ /;/;

        if($line =~ /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)/){
            my ($start, $end) = (hex($1), hex($2));
            $count += $end - $start + 1 if $end >= $start;
        }else{
            $count++;
        }
        print {$outFH} $line;
    }
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Converts a string of hexadecimal digits (for example "00AD") to its
# numeric value. Whitespace is ignored; an undefined or empty argument
# yields 0.
sub atoi {
    my ($digits) = @_;
    return 0 unless defined $digits;
    $digits =~ s/\s+//g;
    return 0 if $digits eq "";
    # hex() understands A-F/a-f; multiplying the characters out by hand
    # treats those letters as 0.
    return hex($digits);
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Writes Table A.1 to <destDir>/rfc3454_A_1.txt.
#   $inFH     input handle, read from its current position by readPrint()
#   $destDir  directory in which the output file is created
# Dies when the output file cannot be opened or closed.
sub createUnassignedTable{
    my ($inFH,$destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_A_1.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table A.1 from RFC 3454\n";
    readPrint($inFH,$outFH, $comment);
    # buffered write errors only surface at close time
    close($outFH)
        or die "could not close the file $outfile: $! \n";
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Writes Table B.1 to <destDir>/rfc3454_B_1.txt.
#   $inFH     input handle, read from its current position by readPrint()
#   $destDir  directory in which the output file is created
# Dies when the output file cannot be opened or closed.
sub createCaseMapNoNorm{
    my ($inFH,$destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_B_1.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table B.1 from RFC 3454\n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface at close time
    close($outFH)
        or die "could not close the file $outfile: $! \n";
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Writes Table B.2 to <destDir>/rfc3454_B_2.txt.
#   $inFH     input handle, read from its current position by readPrint()
#   $destDir  directory in which the output file is created
# Dies when the output file cannot be opened or closed.
sub createCaseMap{
    my ($inFH,$destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_B_2.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table B.2 from RFC 3454\n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface at close time
    close($outFH)
        or die "could not close the file $outfile: $! \n";
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Appends one C.* table to <destDir>/rfc3454_C_X.txt.
#   $inFH     input handle, read from its current position by readPrint()
#   $destDir  directory in which the output file lives
#   $line     the "Start Table" header line of the table being copied
# An existing output file is deleted when the header names table C.1.1, so
# the tables appended afterwards start from an empty file.
# Dies when the file cannot be deleted, opened or closed.
sub createProhibitedTable{
    my ($inFH,$destDir,$line) = @_;
    my $outfile = $destDir."/"."rfc3454_C_X.txt";

    if($line =~ /C\.1\.1/ && -e $outfile){
        unlink($outfile)
            or die "could not delete the file $outfile : $! \n";
    }

    # turn the header into the comment text: drop "Start" and the dashes
    $line =~ s/Start//;
    $line =~ s/-//g;
    my $comment = $warning."# code points from $line";

    my $outFH = IO::File->new($outfile, "a")
        or die "could not open the file $outfile for writing: $! \n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface at close time
    close($outFH)
        or die "could not close the file $outfile: $! \n";
}
|
||||
#-----------------------------------------------------------------------
|
||||
# Prints the command line help and terminates the script.
# It is only called when the command line is unusable, so the exit status
# is non-zero to let calling scripts and makefiles notice the failure.
sub usage {
    print <<"END";
Usage:
filterRFC3454.pl
Options:
--sourcedir=<directory>
--destdir=<directory>
--filename=<name of RFC file>

e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --filename=rfc3454.txt

filterRFC3454.pl filters the RFC file and creates String prep table files.
The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt

END
    exit(1);
}
|
||||
|
||||
|
555
icu4c/source/tools/genidna/genidna.c
Normal file
555
icu4c/source/tools/genidna/genidna.c
Normal file
@ -0,0 +1,555 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: genidna.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003-02-06
|
||||
* created by: Ram Viswanadha
|
||||
*
|
||||
* This program reads the rfc3454_*.txt files,
|
||||
* parses them, and extracts the data for Nameprep conformance.
|
||||
* It then preprocesses it and writes a binary file for efficient use
|
||||
* in various IDNA conversion processes.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "uoptions.h"
|
||||
#include "uparse.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "uprops.h"
|
||||
|
||||
U_CDECL_BEGIN
|
||||
#include "genidna.h"
|
||||
U_CDECL_END
|
||||
|
||||
#ifdef WIN32
|
||||
# pragma warning(disable: 4100)
|
||||
#endif
|
||||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE, printRules = FALSE;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static void
|
||||
parseMappings(const char *filename, UBool withNorm, UBool reportError, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseTable(const char *filename, UBool isUnassigned, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
|
||||
|
||||
static void
|
||||
setLDHValues(UErrorCode* pErrorCode);
|
||||
|
||||
static void
|
||||
setLabelSeperators(UErrorCode* pErrorCode);
|
||||
|
||||
static void
|
||||
printMapping(UChar32 cp,UChar32* mapping, int32_t mappingLength);
|
||||
|
||||
static const char* fileNames[] = {
|
||||
"rfc3454_A_1.txt", /* contains unassigned code points */
|
||||
"rfc3454_C_X.txt", /* contains code points that are prohibited */
|
||||
"rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */
|
||||
"rfc3454_B_2.txt", /* contains case mappings when normalization it turned on */
|
||||
"NormalizationCorrections.txt",/* normalization corrections */
|
||||
};
|
||||
static const char *UNIDATA_DIR = "unidata";
|
||||
static const char *MISC_DIR = "misc";
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
static UOption options[]={
|
||||
UOPTION_HELP_H,
|
||||
UOPTION_HELP_QUESTION_MARK,
|
||||
UOPTION_VERBOSE,
|
||||
UOPTION_COPYRIGHT,
|
||||
UOPTION_DESTDIR,
|
||||
UOPTION_SOURCEDIR,
|
||||
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
|
||||
{ "generate-rules", NULL, NULL, NULL, 'g', UOPT_NO_ARG, 0 }
|
||||
};
|
||||
|
||||
extern int
|
||||
main(int argc, char* argv[]) {
|
||||
char filename[300];
|
||||
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
|
||||
char *basename=NULL;
|
||||
char *saveBasename = NULL;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
/* preset then read command line options */
|
||||
options[4].value=u_getDataDirectory();
|
||||
options[5].value="";
|
||||
options[6].value="3.0.0";
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
}
|
||||
if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
|
||||
/*
|
||||
* Broken into chucks because the C89 standard says the minimum
|
||||
* required supported string length is 509 bytes.
|
||||
*/
|
||||
fprintf(stderr,
|
||||
"Usage: %s [-options] [suffix]\n"
|
||||
"\n"
|
||||
"Read the rfc3454_*.txt files and\n"
|
||||
"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
|
||||
"\n",
|
||||
argv[0]);
|
||||
fprintf(stderr,
|
||||
"Options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-v or --verbose verbose output\n"
|
||||
"\t-c or --copyright include a copyright notice\n");
|
||||
fprintf(stderr,
|
||||
"\t-d or --destdir destination directory, followed by the path\n"
|
||||
"\t-s or --sourcedir source directory of ICU data, followed by the path\n"
|
||||
"\t-g or --generate-rules generate IDN rules for testing. Will print out rules to STDOUT\n"
|
||||
);
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
/* get the options values */
|
||||
beVerbose=options[2].doesOccur;
|
||||
haveCopyright=options[3].doesOccur;
|
||||
srcDir=options[5].value;
|
||||
destDir=options[4].value;
|
||||
printRules = options[7].doesOccur;
|
||||
|
||||
if(argc>=2) {
|
||||
suffix=argv[1];
|
||||
} else {
|
||||
suffix=NULL;
|
||||
}
|
||||
|
||||
setUnicodeVersion(options[6].value);
|
||||
|
||||
/* prepare the filename beginning with the source dir */
|
||||
if(srcDir[0] == U_FILE_SEP_CHAR){
|
||||
filename[0]= 0x2E;
|
||||
uprv_strcat(filename+1,srcDir);
|
||||
}else if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
|
||||
filename[0] = 0x2E;
|
||||
filename[1] = U_FILE_SEP_CHAR;
|
||||
uprv_strcpy(filename+2,srcDir);
|
||||
}else{
|
||||
uprv_strcpy(filename, srcDir);
|
||||
}
|
||||
basename=filename+uprv_strlen(filename);
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
init();
|
||||
if(printRules){
|
||||
printf("// Copyright (C) 2003, International Business Machines\n\n");
|
||||
printf("// WARNING: This file is machine generated by %s tool. Please DO NOT edit.\n\n",argv[0]);
|
||||
|
||||
printf("idn_rules{\n");
|
||||
}
|
||||
|
||||
/* first copy misc directory */
|
||||
saveBasename = basename;
|
||||
uprv_strcpy(basename,MISC_DIR);
|
||||
basename = basename + uprv_strlen(MISC_DIR);
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
|
||||
/* process unassigned */
|
||||
uprv_strcpy(basename,fileNames[0]);
|
||||
parseTable(filename,TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
/* process prohibited */
|
||||
uprv_strcpy(basename,fileNames[1]);
|
||||
parseTable(filename,FALSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
/* setLDHValues(&errorCode); */
|
||||
setLabelSeperators(&errorCode);
|
||||
|
||||
/* process mappings */
|
||||
if(printRules){
|
||||
printf("\n\tMapNoNormalization{\n");
|
||||
}
|
||||
uprv_strcpy(basename,fileNames[2]);
|
||||
parseMappings(filename, FALSE, FALSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
if(printRules){
|
||||
printf("\n\t}\n");
|
||||
}
|
||||
|
||||
if(printRules){
|
||||
printf("\n\tMapNFKC{\n");
|
||||
}
|
||||
uprv_strcpy(basename,fileNames[3]);
|
||||
parseMappings(filename, TRUE, FALSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
/* set up directory for NormalizationCorrections.txt */
|
||||
basename = saveBasename;
|
||||
uprv_strcpy(basename,UNIDATA_DIR);
|
||||
basename = basename + uprv_strlen(UNIDATA_DIR);
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
uprv_strcpy(basename,fileNames[4]);
|
||||
|
||||
parseNormalizationCorrections(filename,&errorCode);
|
||||
if(U_FAILURE(errorCode)){
|
||||
fprintf(stderr,"Could not open file %s for reading \n", filename);
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
/* write the data file */
|
||||
generateData(destDir);
|
||||
|
||||
cleanUpData();
|
||||
}
|
||||
if(printRules){
|
||||
printf("\t\t\"::[:AGE=3.2:]NFKC;\"\n\t}\n}");
|
||||
}
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
normalizationCorrectionsLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t mapping[40];
|
||||
char *end, *s;
|
||||
uint32_t code;
|
||||
int32_t length;
|
||||
UVersionInfo version;
|
||||
UVersionInfo thisVersion;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the character code, field 0 */
|
||||
code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genidn: error parsing FCNFKC_3_2_0.txt mapping at %s\n", fields[0][0]);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
/* Original (erroneous) decomposition */
|
||||
s = fields[1][0];
|
||||
|
||||
/* parse the mapping string */
|
||||
length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
|
||||
|
||||
/* ignore corrected decomposition */
|
||||
|
||||
u_versionFromString(version,fields[3][0] );
|
||||
u_versionFromString(thisVersion, "3.2.0");
|
||||
|
||||
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genidn error parsing NormalizationCorrection of U+%04lx - %s\n",
|
||||
(long)code, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* store the mapping */
|
||||
if( version[0] > thisVersion[0] ||
|
||||
((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
|
||||
){
|
||||
storeMapping(code,mapping, length, TRUE, pErrorCode);
|
||||
if(printRules){
|
||||
printMapping(code,mapping,length);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
|
||||
char *fields[4][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);
|
||||
|
||||
/* fprintf(stdout,"Number of code points that have NormalizationCorrections mapping with length >1 : %i\n",len); */
|
||||
|
||||
if(U_FAILURE(*pErrorCode) && ( *pErrorCode!=U_FILE_ACCESS_ERROR)) {
|
||||
fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
static void U_CALLCONV
|
||||
caseMapLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
uint32_t mapping[40];
|
||||
char *end, *s;
|
||||
uint32_t code;
|
||||
int32_t length;
|
||||
UBool* mapWithNorm = (UBool*) context;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
/* get the character code, field 0 */
|
||||
code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
|
||||
if(end<=fields[0][0] || end!=fields[0][1]) {
|
||||
fprintf(stderr, "genidn: syntax error in field 0 at %s\n", fields[0][0]);
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
|
||||
s = fields[1][0];
|
||||
/* parse the mapping string */
|
||||
length=u_parseCodePoints(s, mapping, sizeof(mapping)/4, pErrorCode);
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genidn error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
|
||||
(long)code, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
/* store the mapping */
|
||||
|
||||
storeMapping(code,mapping, length, *mapWithNorm, pErrorCode);
|
||||
if(printRules){
|
||||
printMapping(code,mapping,length);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseMappings(const char *filename,UBool withNorm, UBool reportError, UErrorCode *pErrorCode) {
|
||||
char *fields[3][2];
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
u_parseDelimitedFile(filename, ';', fields, 3, caseMapLineFn, &withNorm, pErrorCode);
|
||||
|
||||
/*fprintf(stdout,"Number of code points that have mappings with length >1 : %i\n",len);*/
|
||||
|
||||
if(U_FAILURE(*pErrorCode) && (reportError || *pErrorCode!=U_FILE_ACCESS_ERROR)) {
|
||||
fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
|
||||
/* rule printing helpers and parser for the RFC 3454 tables ------------------ */
|
||||
static int32_t printedCharCount = 0;
|
||||
|
||||
/*
 * Prints one code point to stdout in the form used by the generated rules
 * and adds the number of characters written to printedCharCount:
 *   above U+FFFF                      -> \\UXXXXXXXX
 *   '.'                               -> \\.
 *   other characters in U+0021..007E
 *   that are not rule white space     -> the character itself
 *   everything else                   -> \\uXXXX
 */
static void printEscaped(UChar32 ch){
    if(ch > 0xFFFF){
        printf("\\\\U%08X",ch);
        printedCharCount+=11;
        return;
    }

    /* rule white space is always double escaped, even when it is printable */
    if(!uprv_isRuleWhiteSpace(ch) && 0x20< ch && ch <0x7f){
        if(ch == 0x2E){
            /* double escape dot */
            printf("\\\\%c",(char)ch);
            printedCharCount+=3;
        }else{
            printf("%c",(char)ch);
            printedCharCount++;
        }
        return;
    }

    printf("\\\\u%04X",ch);
    printedCharCount+=7;
}
|
||||
/*
 * Prints a single code point, or "start-end" when the two differ, followed
 * by a space. Once more than 70 characters have been counted in
 * printedCharCount, the current string literal is closed, a new one is
 * opened on the next line and the counter is reset.
 */
static void printEscapedRange(UChar32 rangeStart, UChar32 rangeEnd){
    printEscaped(rangeStart);
    if(rangeStart != rangeEnd){
        printf("-");
        printedCharCount++;
        printEscaped(rangeEnd);
    }
    printf(" ");

    if(printedCharCount > 70){
        printf("\"\n\t\t\t\"");
        printedCharCount =0 ;
    }
}
|
||||
/*
 * Prints one mapping rule as a quoted line:  "<cp> > <mapping...>;"
 * The code points are written with printEscaped(); printedCharCount is
 * reset afterwards.
 */
static void printMapping( UChar32 cp, UChar32* mapping, int32_t mappingLength){
    int32_t pos = 0;

    printf("\t\t\"");
    printEscaped(cp);
    printf(" > ");
    while(pos < mappingLength){
        printEscaped(mapping[pos]);
        ++pos;
    }
    printf(";\"\n");

    printedCharCount=0;
}
|
||||
static void U_CALLCONV
|
||||
unicodeDataLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode) {
|
||||
int32_t length;
|
||||
uint32_t rangeStart=0,rangeEnd =0;
|
||||
UBool* isUnassigned = (UBool*) context;
|
||||
|
||||
/* ignore First and Last entries for ranges */
|
||||
if( *fields[1][0]=='<' &&
|
||||
(length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
|
||||
(0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode);
|
||||
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
if(*isUnassigned == TRUE){
|
||||
storeRange(rangeStart,rangeEnd,UIDNA_UNASSIGNED, pErrorCode);
|
||||
}else{
|
||||
storeRange(rangeStart,rangeEnd,UIDNA_PROHIBITED, pErrorCode);
|
||||
}
|
||||
/*TODO: comment out the printer */
|
||||
if(printRules){
|
||||
printEscapedRange(rangeStart,rangeEnd);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
parseTable(const char *filename,UBool isUnassigned, UErrorCode *pErrorCode) {
|
||||
char *fields[1][2];
|
||||
int32_t len=0;
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
/*TODO: comment out the printer */
|
||||
if(printRules){
|
||||
printedCharCount = 0;
|
||||
if(isUnassigned){
|
||||
printf("\n\tUnassignedSet{\"[ ");
|
||||
}else{
|
||||
printf("\n\tProhibitedSet{\"[ ");
|
||||
}
|
||||
}
|
||||
u_parseDelimitedFile(filename, ';', fields, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);
|
||||
|
||||
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
if(printRules){
|
||||
printf("]\"}\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
static void
|
||||
setLDHValues(UErrorCode* pErrorCode){
|
||||
USet* set = uset_openPattern(LDH_PATTERN, LDH_PATTERN_LEN, pErrorCode);
|
||||
int32_t itemCount;
|
||||
int32_t index = 0;
|
||||
UChar32 start,end;
|
||||
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
fprintf(stderr,"Could not open USet. Error :%s \n",u_errorName(*pErrorCode));
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
|
||||
itemCount = uset_getItemCount(set);
|
||||
|
||||
for(;index < itemCount; index++){
|
||||
uset_getItem(set,index, &start, &end, NULL, 0, pErrorCode);
|
||||
storeRange(start,end,UIDNA_LDH_OR_MAP_NFKC, pErrorCode);
|
||||
}
|
||||
if(printRules){
|
||||
printf(PAT);
|
||||
}
|
||||
|
||||
}
|
||||
*/
|
||||
static void
|
||||
setLabelSeperators(UErrorCode *pErrorCode){
|
||||
/* U+002E, U+3002, U+FF0E, U+FF61 */
|
||||
storeRange(0x002E, 0x002E, UIDNA_LABEL_SEPARATOR, pErrorCode);
|
||||
storeRange(0x3002, 0x3002, UIDNA_LABEL_SEPARATOR, pErrorCode);
|
||||
storeRange(0xFF0E, 0xFF0E, UIDNA_LABEL_SEPARATOR, pErrorCode);
|
||||
storeRange(0xFF61, 0xFF61, UIDNA_LABEL_SEPARATOR, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)){
|
||||
fprintf(stderr, "Could not store values for label separators\n");
|
||||
}
|
||||
if(printRules){
|
||||
printf("\tLabelSeparatorSet{\"[ ");
|
||||
printEscaped(0x002E);
|
||||
printEscaped(0x3002);
|
||||
printEscaped(0xFF0E);
|
||||
printEscaped(0xFF61);
|
||||
printf(" ]\"}\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
121
icu4c/source/tools/genidna/genidna.dsp
Normal file
121
icu4c/source/tools/genidna/genidna.dsp
Normal file
@ -0,0 +1,121 @@
|
||||
# Microsoft Developer Studio Project File - Name="genidna" - Package Owner=<4>
|
||||
# Microsoft Developer Studio Generated Build File, Format Version 6.00
|
||||
# ** DO NOT EDIT **
|
||||
|
||||
# TARGTYPE "Win32 (x86) Console Application" 0x0103
|
||||
|
||||
CFG=genidna - Win32 Debug
|
||||
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
|
||||
!MESSAGE use the Export Makefile command and run
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "genidna.mak".
|
||||
!MESSAGE
|
||||
!MESSAGE You can specify a configuration when running NMAKE
|
||||
!MESSAGE by defining the macro CFG on the command line. For example:
|
||||
!MESSAGE
|
||||
!MESSAGE NMAKE /f "genidna.mak" CFG="genidna - Win32 Debug"
|
||||
!MESSAGE
|
||||
!MESSAGE Possible choices for configuration are:
|
||||
!MESSAGE
|
||||
!MESSAGE "genidna - Win32 Release" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE "genidna - Win32 Debug" (based on "Win32 (x86) Console Application")
|
||||
!MESSAGE
|
||||
|
||||
# Begin Project
|
||||
# PROP AllowPerConfigDependencies 0
|
||||
# PROP Scc_ProjName ""
|
||||
# PROP Scc_LocalPath ""
|
||||
CPP=cl.exe
|
||||
RSC=rc.exe
|
||||
|
||||
!IF "$(CFG)" == "genidna - Win32 Release"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 0
|
||||
# PROP BASE Output_Dir "Release"
|
||||
# PROP BASE Intermediate_Dir "Release"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 0
|
||||
# PROP Output_Dir "Release"
|
||||
# PROP Intermediate_Dir "Release"
|
||||
# PROP Target_Dir ""
|
||||
MTL=midl.exe
|
||||
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
|
||||
# ADD BASE RSC /l 0x409 /d "NDEBUG"
|
||||
# ADD RSC /l 0x409 /d "NDEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
||||
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
|
||||
|
||||
!ELSEIF "$(CFG)" == "genidna - Win32 Debug"
|
||||
|
||||
# PROP BASE Use_MFC 0
|
||||
# PROP BASE Use_Debug_Libraries 1
|
||||
# PROP BASE Output_Dir "Debug"
|
||||
# PROP BASE Intermediate_Dir "Debug"
|
||||
# PROP BASE Target_Dir ""
|
||||
# PROP Use_MFC 0
|
||||
# PROP Use_Debug_Libraries 1
|
||||
# PROP Output_Dir "Debug"
|
||||
# PROP Intermediate_Dir "Debug"
|
||||
# PROP Ignore_Export_Lib 0
|
||||
# PROP Target_Dir ""
|
||||
MTL=midl.exe
|
||||
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
|
||||
# ADD BASE RSC /l 0x409 /d "_DEBUG"
|
||||
# ADD RSC /l 0x409 /d "_DEBUG"
|
||||
BSC32=bscmake.exe
|
||||
# ADD BASE BSC32 /nologo
|
||||
# ADD BSC32 /nologo
|
||||
LINK32=link.exe
|
||||
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
|
||||
# ADD LINK32 icutud.lib icuucd.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\..\..\lib"
|
||||
# Begin Custom Build
|
||||
TargetPath=.\Debug\genidna.exe
|
||||
InputPath=.\Debug\genidna.exe
|
||||
InputName=genidna
|
||||
SOURCE="$(InputPath)"
|
||||
|
||||
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
|
||||
copy $(TargetPath) ..\..\..\bin
|
||||
|
||||
# End Custom Build
|
||||
|
||||
!ENDIF
|
||||
|
||||
# Begin Target
|
||||
|
||||
# Name "genidna - Win32 Release"
|
||||
# Name "genidna - Win32 Debug"
|
||||
# Begin Group "Source Files"
|
||||
|
||||
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\genidna.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\store.c
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\genidna.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
# End Group
|
||||
# End Target
|
||||
# End Project
|
76
icu4c/source/tools/genidna/genidna.h
Normal file
76
icu4c/source/tools/genidna/genidna.h
Normal file
@ -0,0 +1,76 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: genidna.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003-02-06
|
||||
* created by: Ram Viswanadha
|
||||
*/
|
||||
|
||||
#ifndef __GENIDN_H__
|
||||
#define __GENIDN_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "sprpimpl.h"
|
||||
|
||||
/* file definitions */
|
||||
#define DATA_NAME "uidna"
|
||||
#define DATA_TYPE "icu"
|
||||
|
||||
/*
|
||||
* data structure that holds the IDN properties for one or more
|
||||
* code point(s) at build time
|
||||
*/
|
||||
|
||||
|
||||
/* global flags */
|
||||
extern UBool beVerbose, haveCopyright;
|
||||
|
||||
/* prototypes */
|
||||
|
||||
extern void
|
||||
setUnicodeVersion(const char *v);
|
||||
|
||||
extern void
|
||||
init(void);
|
||||
|
||||
extern void
|
||||
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UBool withNorm, UErrorCode* status);
|
||||
extern void
|
||||
storeRange(uint32_t start, uint32_t end, UBool isUnassigned,UErrorCode* status);
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir);
|
||||
|
||||
extern void
|
||||
cleanUpData(void);
|
||||
|
||||
/*
|
||||
extern void
|
||||
storeIDN(uint32_t code, IDN *idn);
|
||||
|
||||
extern void
|
||||
processData(void);
|
||||
|
||||
|
||||
*/
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
||||
|
||||
|
332
icu4c/source/tools/genidna/store.c
Normal file
332
icu4c/source/tools/genidna/store.c
Normal file
@ -0,0 +1,332 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2002, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: store.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2003-02-06
|
||||
* created by: Ram Viswanadha
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "utrie.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unewdata.h"
|
||||
#include "genidna.h"
|
||||
|
||||
#ifdef WIN32
|
||||
# pragma warning(disable: 4100)
|
||||
#endif
|
||||
|
||||
#define DO_DEBUG_OUT 0
|
||||
|
||||
|
||||
/**
|
||||
This is a simple Trie with the following structure
|
||||
|
||||
16-bit IDN sets:
|
||||
|
||||
Each 16-bit IDN word contains:
|
||||
|
||||
0..2 Category flags
|
||||
Contains the enum values IDNStates
|
||||
|
||||
3..4 Contains the length of the mapping
|
||||
If length of the mapping is <= 2 the length is stored
|
||||
If length of the mapping is > 2 then _IDNA_LENGTH_IN_MAPPING_TABLE
|
||||
enum is stored and the length of mapping is stored in the first index
|
||||
in the data array
|
||||
|
||||
5..15 Contains the index into the data array that contains the mapping
|
||||
If it contains _IDNA_MAP_TO_NOTHING, then the codepoint is stripped from
|
||||
the input
|
||||
|
||||
*/
|
||||
|
||||
/* file data ---------------------------------------------------------------- */
|
||||
/* indexes[] value names */
|
||||
|
||||
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
|
||||
|
||||
static uint16_t mappingData[_IDNA_MAPPING_DATA_SIZE]={0};
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
static UDataInfo dataInfo={
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0x49, 0x44, 0x4e, 0x41 }, /* dataFormat="IDNA" */
|
||||
{ 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
|
||||
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
|
||||
};
|
||||
void
|
||||
setUnicodeVersion(const char *v) {
|
||||
UVersionInfo version;
|
||||
u_versionFromString(version, v);
|
||||
uprv_memcpy(dataInfo.dataVersion, version, 4);
|
||||
}
|
||||
|
||||
|
||||
/* build-time trie; opened by init() and closed by cleanUpData() */
static UNewTrie idnTrie={ {0},0,0,0,0,0,0,0,0,{0} };

/* next slot of mappingData[] that store() writes to; store() also puts this value into the trie word */
static int32_t currentIndex = 1;

/* largest mapping length passed to store() so far */
static int32_t maxLength = 0;

/* maxDataLength argument passed to utrie_open() */
#define MAX_DATA_LENGTH 11500
|
||||
|
||||
extern void
|
||||
init() {
|
||||
|
||||
/* initialize the two tries */
|
||||
if(NULL==utrie_open(&idnTrie, NULL, MAX_DATA_LENGTH, 0, FALSE)) {
|
||||
fprintf(stderr, "error: failed to initialize tries\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
store(uint32_t codepoint, uint32_t* mapping, int32_t length, uint32_t flags, UErrorCode* status){
|
||||
|
||||
uint32_t trieWord = 0;
|
||||
int32_t i =0;
|
||||
if(flags == _IDNA_MAP_TO_NOTHING){
|
||||
trieWord = flags << 5;
|
||||
}else{
|
||||
if(length==0){
|
||||
trieWord = flags;
|
||||
}else{
|
||||
/*
|
||||
int32_t delta;
|
||||
|
||||
if(length==1 && (delta=(int32_t)codepoint-(int32_t)mapping[0])>=-4096 && delta<=4095) {
|
||||
printf("mapping of U+%04lx to U+%04lx could fit into a 13-bit delta (0x%lx)\n", codepoint, mapping[0], delta);
|
||||
}
|
||||
*/
|
||||
/* set the 0..2 bits the flags */
|
||||
trieWord = flags;
|
||||
/* set the 3..4 bits the length */
|
||||
|
||||
if(length > 2){
|
||||
trieWord += _IDNA_LENGTH_IN_MAPPING_TABLE << 3;
|
||||
}else{
|
||||
trieWord += (uint32_t)((length)<<3);
|
||||
}
|
||||
if(length > maxLength)
|
||||
maxLength = length;
|
||||
|
||||
/* get the current index in the data array
|
||||
* and store in 5..15 bits
|
||||
*/
|
||||
trieWord += currentIndex << 5;
|
||||
|
||||
|
||||
/* load mapping into the data array */
|
||||
i = 0;
|
||||
|
||||
if(trieWord > 0xFFFF){
|
||||
fprintf(stderr,"size of trie word is greater than 0xFFFF.\n");
|
||||
}
|
||||
/* set the length in mapping table */
|
||||
if(length > 2){
|
||||
mappingData[currentIndex++] = (uint16_t)length;
|
||||
}
|
||||
while(i<length){
|
||||
if(currentIndex < _IDNA_MAPPING_DATA_SIZE){
|
||||
if(mappingData[currentIndex]==0){
|
||||
if(mapping[i] <= 0xFFFF){
|
||||
mappingData[currentIndex++] = (uint16_t)mapping[i++];
|
||||
}else{
|
||||
mappingData[currentIndex++] = UTF16_LEAD(mapping[i]);
|
||||
if(currentIndex < _IDNA_MAPPING_DATA_SIZE){
|
||||
mappingData[currentIndex++] = UTF16_TRAIL(mapping[i++]);
|
||||
}else{
|
||||
fprintf(stderr, "Data Array index out of bounds.currentIndex = %i size of mapping arry = %i \n",currentIndex, _IDNA_MAPPING_DATA_SIZE);
|
||||
*status = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}else{
|
||||
fprintf(stderr, "Data Array index out of bounds.currentIndex = %i size of mapping arry = %i \n",currentIndex, _IDNA_MAPPING_DATA_SIZE);
|
||||
*status = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
i = utrie_get32(&idnTrie,codepoint,NULL);
|
||||
|
||||
if(i==0){
|
||||
/* now set the value in the trie */
|
||||
if(!utrie_set32(&idnTrie,codepoint,trieWord)){
|
||||
fprintf(stderr, "error: too many mapping entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
|
||||
}else{
|
||||
if(i== UIDNA_PROHIBITED){
|
||||
i += _IDNA_MAP_TO_NOTHING << 5;
|
||||
/* now set the value in the trie */
|
||||
if(!utrie_set32(&idnTrie,codepoint,i)){
|
||||
fprintf(stderr, "error: too many mapping entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}else{
|
||||
fprintf(stderr, "Index array has been set for codepoint 0x%06X. \n",codepoint);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
extern void
|
||||
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UBool withNorm, UErrorCode* status){
|
||||
|
||||
if(withNorm){
|
||||
store(codepoint,mapping,length,UIDNA_MAP_NFKC,status);
|
||||
}else{
|
||||
store(codepoint,mapping,length,_IDNA_MAP_TO_NOTHING,status);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
extern void
|
||||
storeRange(uint32_t start, uint32_t end, int8_t flag,UErrorCode* status){
|
||||
uint32_t trieWord = 0, i=0;
|
||||
|
||||
trieWord += flag;
|
||||
|
||||
if(start == end){
|
||||
uint32_t i = utrie_get32(&idnTrie,start,NULL);
|
||||
if(i == 0 || i==(uint8_t)flag){
|
||||
if(!utrie_set32(&idnTrie,start,trieWord)){
|
||||
fprintf(stderr, "error: too many entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}else{
|
||||
fprintf(stderr, "Index array has been set for codepoint 0x%06X. \n",start);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}else{
|
||||
if(!utrie_setRange32(&idnTrie,start,end+1,trieWord,FALSE)){
|
||||
fprintf(stderr, "error: too many entries\n");
|
||||
exit(U_BUFFER_OVERFLOW_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* folding value: just store the offset (16 bits) if there is any non-0 entry */
|
||||
static uint32_t U_CALLCONV
|
||||
getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
|
||||
uint32_t foldedValue, value;
|
||||
UChar32 limit;
|
||||
UBool inBlockZero;
|
||||
|
||||
foldedValue=0;
|
||||
|
||||
limit=start+0x400;
|
||||
while(start<limit) {
|
||||
value=utrie_get32(trie, start, &inBlockZero);
|
||||
if(inBlockZero) {
|
||||
start+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
foldedValue|=value;
|
||||
++start;
|
||||
}
|
||||
}
|
||||
|
||||
if(foldedValue!=0) {
|
||||
return (uint32_t)(offset|0x8000);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir) {
|
||||
static uint8_t idnTrieBlock[100000];
|
||||
|
||||
UNewDataMemory *pData;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
int32_t size, idnTrieSize, dataLength;
|
||||
|
||||
idnTrieSize=utrie_serialize(&idnTrie, idnTrieBlock, sizeof(idnTrieBlock), getFoldedValue, TRUE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "error: utrie_serialize(idn trie) failed, %s\n", u_errorName(errorCode));
|
||||
exit(errorCode);
|
||||
}
|
||||
size = idnTrieSize + sizeof(mappingData) + sizeof(indexes);
|
||||
if(beVerbose) {
|
||||
printf("size of idn trie %5u bytes\n", idnTrieSize);
|
||||
printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
|
||||
printf("size of mapping data array %5u bytes\n", sizeof(mappingData));
|
||||
printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex);
|
||||
printf("Maximum length of the mapping string is : %i \n", maxLength);
|
||||
}
|
||||
|
||||
|
||||
/* write the data */
|
||||
pData=udata_create(dataDir, DATA_TYPE, U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
|
||||
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
indexes[_IDNA_INDEX_TRIE_SIZE]=idnTrieSize;
|
||||
indexes[_IDNA_INDEX_MAPPING_DATA_SIZE]=sizeof(mappingData);
|
||||
|
||||
udata_writeBlock(pData, indexes, sizeof(indexes));
|
||||
udata_writeBlock(pData, idnTrieBlock, idnTrieSize);
|
||||
udata_writeBlock(pData, mappingData, sizeof(mappingData));
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genidn: error %d writing the output file\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
if(dataLength!=size) {
|
||||
fprintf(stderr, "genidn error: data length %ld != calculated size %ld\n",
|
||||
(long)dataLength, (long)size);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
extern void
|
||||
cleanUpData(void) {
|
||||
|
||||
utrie_close(&idnTrie);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
Loading…
Reference in New Issue
Block a user