ICU-2194 IDNA data trie generator

X-SVN-Rev: 11193
This commit is contained in:
Ram Viswanadha 2003-02-28 21:32:28 +00:00
parent 41d36b7650
commit 78f36c9a5a
6 changed files with 1357 additions and 0 deletions

View File

@ -0,0 +1,101 @@
## Makefile.in for ICU - tools/genidna
## Copyright (c) 1999-2001, International Business Machines Corporation and
## others. All Rights Reserved.
##
## Builds the genidna tool from genidna.o and store.o, generates its man page
## from the .in template through config.status, and installs both.

## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@

top_builddir = ../..

include $(top_builddir)/icudefs.mk

##

SECTION = 8

MAN_FILES = $(TARGET:$(EXEEXT)=).$(SECTION)

## Build directory information
subdir = tools/genidna

## Extra files to remove for 'make clean'
CLEANFILES = *~ $(MAN_FILES) $(DEPS)

## Target information
TARGET = genidna$(EXEEXT)

CPPFLAGS += -I$(top_builddir)/common -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
LIBS = $(LIBICUI18N) $(LIBICUTOOLUTIL) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

OBJECTS = genidna.o store.o

DEPS = $(OBJECTS:.o=.d)

## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check \
check-local install-man

## Clear suffix list
.SUFFIXES :

## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local

all-local: $(TARGET) $(MAN_FILES)

install-local: all-local install-man
	$(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
	$(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)/$(TARGET)

## Nothing to do for a distribution; the rule exists so that 'dist' resolves.
dist-local:

clean-local:
	test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
	$(RMV) $(TARGET) $(OBJECTS)

distclean-local: clean-local
	$(RMV) Makefile

check-local: all-local

## Regenerate this Makefile when the template or the configuration changes
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
	cd $(top_builddir) \
	&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status

$(TARGET) : $(OBJECTS)
	$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)

# man page
install-man: $(MAN_FILES)
	$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
	$(INSTALL_DATA) $< $(DESTDIR)$(mandir)/man$(SECTION)

%.$(SECTION): $(srcdir)/%.$(SECTION).in
	cd $(top_builddir) \
	&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status

# build postscript and pdf formats
#$(TARGET).ps: $(TARGET).$(SECTION)
#	groff -man < $< > $@
#$(TARGET).pdf: $(TARGET).ps
#	ps2pdf $< $@

## Pull in the generated dependency files unless only a *clean goal was given
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View File

@ -0,0 +1,172 @@
#!/usr/bin/perl
####################################################################################
# filterRFC3454.pl:
# This tool filters the RFC-3454 txt file for String prep tables
# Author: Ram Viswanadha
#
####################################################################################
use File::Find;
use File::Basename;
use IO::File;
use Cwd;
use File::Copy;
use Getopt::Long;
use File::Path;
use File::Copy;
# Banner text that the create* routines below prepend to the comment written
# at the top of each generated table file.
$warning = "###################\n# WARNING: This table is generated by filterRFC3454.pl tool. DO NOT EDIT \n#################\n\n";
#run the program
# (main is defined further down; everything else in this file is subs)
main();
#---------------------------------------------------------------------
# The main program
#
# Reads --sourcedir, --destdir and --filename from the command line, opens
# <sourcedir>/<filename> and scans it line by line for "Start Table" marker
# lines. For tables A.1, B.1, B.2 and every C.* table the matching create*
# routine is called with the open input handle so that it consumes the table
# body up to the "End Table" line; marker lines of other tables are ignored.
# Prints the usage text and exits if any option is missing, and dies if the
# input file cannot be opened.
sub main {
    my ($sourceDir, $destDir, $fileName);
    GetOptions(
        "--sourcedir=s" => \$sourceDir,
        "--destdir=s"   => \$destDir,
        "--filename=s"  => \$fileName,
    );
    usage() unless defined $sourceDir;
    usage() unless defined $destDir;
    usage() unless defined $fileName;

    my $infile = $sourceDir."/".$fileName;
    my $inFH = IO::File->new($infile,"r")
        or die "could not open the file for reading: $! \n";

    while(defined (my $line = <$inFH>)){
        next unless $line =~ /Start\sTable/;
        # The dots are escaped and the names word-bounded so that only the
        # intended table identifiers match (a bare /C.*/ matches any line
        # that merely contains the letter C).
        if($line =~ /\bA\.1\b/){
            createUnassignedTable($inFH,$destDir);
        }
        elsif($line =~ /\bB\.1\b/){
            createCaseMapNoNorm($inFH,$destDir);
        }
        elsif($line =~ /\bB\.2\b/){
            createCaseMap($inFH,$destDir);
        }
        elsif($line =~ /\bC\.\d/){
            createProhibitedTable($inFH,$destDir,$line);
        }
    }
    close($inFH);
}
#-----------------------------------------------------------------------
# Copies one table body from $inFH to $outFH.
#
# Parameters:
#   $inFH    - input handle positioned just after a "Start Table" line
#   $outFH   - output handle for the filtered table
#   $comment - text written (followed by a newline) before the table data
#   $print   - optional; when equal to 1 every data line is also echoed to
#              STDOUT before it is rewritten
#
# Page headers/footers, form feeds and blank lines are dropped. For every
# other line the leading whitespace is removed, a "XXXX-YYYY" code point range
# at the start of the line is rewritten to "XXXX..YYYY", and a ';' is appended
# when the line contains none. When the "End Table" line is reached a
# "# Total code points N" trailer is written and the sub returns. If the input
# ends first, the sub returns without writing the trailer.
sub readPrint{
    my ($inFH, $outFH, $comment, $print) = @_;
    my $count = 0;
    print $outFH $comment."\n";
    while(defined (my $line = <$inFH>)){
        next if $line =~ /Hoffman\s\&\sBlanchet/; # ignore heading
        next if $line =~ /RFC\s3454/;             # ignore heading
        next if $line =~ /\f/;                    # ignore form feed
        next if $line eq "\n";                    # ignore blank lines
        # break if "End Table" is found
        if( $line =~ /End\sTable/){
            print $outFH "\n# Total code points $count\n\n";
            return;
        }
        if(defined $print && $print==1){
            print $line;
        }
        $line =~ s/^\s+//;
        # Only a hyphen between two hex numbers at the start of the line is a
        # range separator; hyphens inside character names must stay as-is.
        $line =~ s/^([0-9A-Fa-f]+)-([0-9A-Fa-f]+)/$1..$2/;
        if($line !~ /\;/){
            $line =~ s/$/;/;
        }
        if($line =~ /^([0-9A-Fa-f]+)\.\.([0-9A-Fa-f]+)/){
            # code points are hexadecimal; a range counts both end points
            my ($start, $end) = (hex($1), hex($2));
            $count += $end - $start + 1 if $end >= $start;
        }else{
            $count++;
        }
        print $outFH $line;
    }
}
#-----------------------------------------------------------------------
# Converts a string of hexadecimal digits to its numeric value.
#
# Each character contributes one base-16 digit; a character that is not a
# hexadecimal digit contributes 0. Returns undef for an undefined or empty
# argument.
sub atoi {
    my ($text) = @_;
    my $value;
    return $value unless defined $text;
    foreach my $digit (split(//, $text)) {
        # hex() is needed so that A-F count as 10-15 rather than 0
        my $digitValue = ($digit =~ /^[0-9A-Fa-f]$/) ? hex($digit) : 0;
        $value = ($value || 0) * 16 + $digitValue;
    }
    return $value;
}
#-----------------------------------------------------------------------
# Writes Table A.1 to <destDir>/rfc3454_A_1.txt.
#
# Parameters:
#   $inFH    - input handle positioned just after the "Start Table A.1" line
#   $destDir - directory in which the output file is created
#
# Dies if the output file cannot be opened or closed.
sub createUnassignedTable{
    my ($inFH, $destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_A_1.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table A.1 from RFC 3454\n";
    readPrint($inFH,$outFH, $comment);
    # buffered write errors only surface when the handle is closed
    close($outFH)
        or die "could not close the file $outfile : $! \n";
}
#-----------------------------------------------------------------------
# Writes Table B.1 to <destDir>/rfc3454_B_1.txt.
#
# Parameters:
#   $inFH    - input handle positioned just after the "Start Table B.1" line
#   $destDir - directory in which the output file is created
#
# Dies if the output file cannot be opened or closed.
sub createCaseMapNoNorm{
    my ($inFH, $destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_B_1.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table B.1 from RFC 3454\n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface when the handle is closed
    close($outFH)
        or die "could not close the file $outfile : $! \n";
}
#-----------------------------------------------------------------------
# Writes Table B.2 to <destDir>/rfc3454_B_2.txt.
#
# Parameters:
#   $inFH    - input handle positioned just after the "Start Table B.2" line
#   $destDir - directory in which the output file is created
#
# Dies if the output file cannot be opened or closed.
sub createCaseMap{
    my ($inFH, $destDir) = @_;
    my $outfile = $destDir."/"."rfc3454_B_2.txt";
    my $outFH = IO::File->new($outfile,"w")
        or die "could not open the file $outfile for writing: $! \n";
    my $comment = $warning."# This file contains code points from Table B.2 from RFC 3454\n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface when the handle is closed
    close($outFH)
        or die "could not close the file $outfile : $! \n";
}
#-----------------------------------------------------------------------
# Appends one C.* table to <destDir>/rfc3454_C_X.txt.
#
# Parameters:
#   $inFH    - input handle positioned just after the "Start Table C.*" line
#   $destDir - directory in which the output file is created
#   $line    - the "Start Table" marker line; with "Start" and all hyphens
#              removed it becomes part of the comment that heads the table
#
# All C tables share one output file, so it is opened in append mode. An
# existing file is deleted first when the table is C.1.1, so that a rerun
# starts from an empty file. Dies if the file cannot be deleted, opened or
# closed.
sub createProhibitedTable{
    my ($inFH, $destDir, $line) = @_;
    my $outfile = $destDir."/"."rfc3454_C_X.txt";
    # dots escaped so that only the literal table name "C.1.1" matches
    if($line =~ /C\.1\.1/ && -e $outfile){
        unlink($outfile)
            or die "could not delete the file $outfile : $! \n";
    }
    $line =~ s/Start//;
    $line =~ s/-//g;
    my $comment = $warning."# code points from $line";
    my $outFH = IO::File->new($outfile, "a")
        or die "could not open the file $outfile for writing: $! \n";
    readPrint($inFH,$outFH,$comment);
    # buffered write errors only surface when the handle is closed
    close($outFH)
        or die "could not close the file $outfile : $! \n";
}
#-----------------------------------------------------------------------
# Prints the command line help to STDOUT and terminates the script with
# exit status 0.
sub usage {
    my $helpText = <<"END";
Usage:
filterRFC3454.pl
Options:
--sourcedir=<directory>
--destdir=<directory>
--filename=<name of RFC file>
e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --filename=rfc3454.txt
filterRFC3454.pl filters the RFC file and creates String prep table files.
The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt
END
    print $helpText;
    exit(0);
}

View File

@ -0,0 +1,555 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: genidna.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003-02-06
* created by: Ram Viswanadha
*
* This program reads the rfc3454_*.txt files,
* parses them, and extracts the data for Nameprep conformance.
* It then preprocesses it and writes a binary file for efficient use
* in various IDNA conversion processes.
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "unicode/udata.h"
#include "unewdata.h"
#include "uoptions.h"
#include "uparse.h"
#include "unicode/uset.h"
#include "uprops.h"
U_CDECL_BEGIN
#include "genidna.h"
U_CDECL_END
#ifdef WIN32
# pragma warning(disable: 4100)
#endif
UBool beVerbose=FALSE, haveCopyright=TRUE, printRules = FALSE;
/* prototypes --------------------------------------------------------------- */
static void
parseMappings(const char *filename, UBool withNorm, UBool reportError, UErrorCode *pErrorCode);
static void
parseTable(const char *filename, UBool isUnassigned, UErrorCode *pErrorCode);
static void
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode);
static void
setLDHValues(UErrorCode* pErrorCode);
static void
setLabelSeperators(UErrorCode* pErrorCode);
static void
printMapping(UChar32 cp,UChar32* mapping, int32_t mappingLength);
/*
 * Input files, addressed by position in main():
 * [0]..[3] are read from the "misc" directory, [4] from "unidata".
 */
static const char* fileNames[] = {
"rfc3454_A_1.txt", /* contains unassigned code points */
"rfc3454_C_X.txt", /* contains code points that are prohibited */
"rfc3454_B_1.txt", /* contains case mappings when normalization is turned off */
"rfc3454_B_2.txt", /* contains case mappings when normalization is turned on */
"NormalizationCorrections.txt",/* normalization corrections */
};
/* sub-directories of the source directory that hold the input files */
static const char *UNIDATA_DIR = "unidata";
static const char *MISC_DIR = "misc";
/* -------------------------------------------------------------------------- */
/*
 * Command line options. main() accesses the entries by index, so the order
 * must not change: [0],[1] help, [2] verbose, [3] copyright, [4] destdir,
 * [5] sourcedir, [6] unicode, [7] generate-rules.
 */
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_DESTDIR,
UOPTION_SOURCEDIR,
{ "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 },
{ "generate-rules", NULL, NULL, NULL, 'g', UOPT_NO_ARG, 0 }
};
extern int
main(int argc, char* argv[]) {
char filename[300];
const char *srcDir=NULL, *destDir=NULL, *suffix=NULL;
char *basename=NULL;
char *saveBasename = NULL;
UErrorCode errorCode=U_ZERO_ERROR;
U_MAIN_INIT_ARGS(argc, argv);
/* preset then read command line options */
options[4].value=u_getDataDirectory();
options[5].value="";
options[6].value="3.0.0";
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
/* error handling, printing usage message */
if(argc<0) {
fprintf(stderr,
"error in command line argument \"%s\"\n",
argv[-argc]);
}
if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
/*
* Broken into chucks because the C89 standard says the minimum
* required supported string length is 509 bytes.
*/
fprintf(stderr,
"Usage: %s [-options] [suffix]\n"
"\n"
"Read the rfc3454_*.txt files and\n"
"create a binary file " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " with the normalization data\n"
"\n",
argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
"\t-c or --copyright include a copyright notice\n");
fprintf(stderr,
"\t-d or --destdir destination directory, followed by the path\n"
"\t-s or --sourcedir source directory of ICU data, followed by the path\n"
"\t-g or --generate-rules generate IDN rules for testing. Will print out rules to STDOUT\n"
);
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
/* get the options values */
beVerbose=options[2].doesOccur;
haveCopyright=options[3].doesOccur;
srcDir=options[5].value;
destDir=options[4].value;
printRules = options[7].doesOccur;
if(argc>=2) {
suffix=argv[1];
} else {
suffix=NULL;
}
setUnicodeVersion(options[6].value);
/* prepare the filename beginning with the source dir */
if(srcDir[0] == U_FILE_SEP_CHAR){
filename[0]= 0x2E;
uprv_strcat(filename+1,srcDir);
}else if(uprv_strchr(srcDir,U_FILE_SEP_CHAR) == NULL){
filename[0] = 0x2E;
filename[1] = U_FILE_SEP_CHAR;
uprv_strcpy(filename+2,srcDir);
}else{
uprv_strcpy(filename, srcDir);
}
basename=filename+uprv_strlen(filename);
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
*basename++=U_FILE_SEP_CHAR;
}
/* initialize */
init();
if(printRules){
printf("// Copyright (C) 2003, International Business Machines\n\n");
printf("// WARNING: This file is machine generated by %s tool. Please DO NOT edit.\n\n",argv[0]);
printf("idn_rules{\n");
}
/* first copy misc directory */
saveBasename = basename;
uprv_strcpy(basename,MISC_DIR);
basename = basename + uprv_strlen(MISC_DIR);
*basename++=U_FILE_SEP_CHAR;
/* process unassigned */
uprv_strcpy(basename,fileNames[0]);
parseTable(filename,TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "Could not open file %s for reading \n", filename);
return errorCode;
}
/* process prohibited */
uprv_strcpy(basename,fileNames[1]);
parseTable(filename,FALSE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "Could not open file %s for reading \n", filename);
return errorCode;
}
/* setLDHValues(&errorCode); */
setLabelSeperators(&errorCode);
/* process mappings */
if(printRules){
printf("\n\tMapNoNormalization{\n");
}
uprv_strcpy(basename,fileNames[2]);
parseMappings(filename, FALSE, FALSE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "Could not open file %s for reading \n", filename);
return errorCode;
}
if(printRules){
printf("\n\t}\n");
}
if(printRules){
printf("\n\tMapNFKC{\n");
}
uprv_strcpy(basename,fileNames[3]);
parseMappings(filename, TRUE, FALSE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "Could not open file %s for reading \n", filename);
return errorCode;
}
/* set up directory for NormalizationCorrections.txt */
basename = saveBasename;
uprv_strcpy(basename,UNIDATA_DIR);
basename = basename + uprv_strlen(UNIDATA_DIR);
*basename++=U_FILE_SEP_CHAR;
uprv_strcpy(basename,fileNames[4]);
parseNormalizationCorrections(filename,&errorCode);
if(U_FAILURE(errorCode)){
fprintf(stderr,"Could not open file %s for reading \n", filename);
return errorCode;
}
/* process parsed data */
if(U_SUCCESS(errorCode)) {
/* write the data file */
generateData(destDir);
cleanUpData();
}
if(printRules){
printf("\t\t\"::[:AGE=3.2:]NFKC;\"\n\t}\n}");
}
return errorCode;
}
/*
 * u_parseDelimitedFile() callback for NormalizationCorrections.txt.
 *
 * Field 0 is parsed as the hexadecimal code point, field 1 as a sequence of
 * code points (the original decomposition) and field 3 as a version string.
 * The mapping code -> field 1 is stored (and printed when printRules is set)
 * only if the major.minor part of that version is greater than 3.2.
 * Exits the process on a syntax error.
 *
 * context and fieldCount are not used.
 */
static void U_CALLCONV
normalizationCorrectionsLineFn(void *context,
                               char *fields[][2], int32_t fieldCount,
                               UErrorCode *pErrorCode) {
    uint32_t mapping[40];
    char *end, *s;
    uint32_t code;
    int32_t length;
    UVersionInfo version;
    UVersionInfo thisVersion;

    /* ignore First and Last entries for ranges */
    if( *fields[1][0]=='<' &&
        (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
        (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
    ) {
        return;
    }

    /* get the character code, field 0 */
    code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16);
    /* strtoul reports "no digits" only through end, never through pErrorCode */
    if(end<=fields[0][0]) {
        fprintf(stderr, "genidn: syntax error in field 0 of NormalizationCorrections.txt at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* Original (erroneous) decomposition */
    s = fields[1][0];

    /* parse the mapping string */
    length=u_parseCodePoints(s, mapping, sizeof(mapping)/sizeof(mapping[0]), pErrorCode);

    /* field 2 (the corrected decomposition) is ignored; field 3 is the version */
    u_versionFromString(version,fields[3][0] );
    u_versionFromString(thisVersion, "3.2.0");

    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genidn error parsing NormalizationCorrection of U+%04lx - %s\n",
                (long)code, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    /* store the mapping */
    if( version[0] > thisVersion[0] ||
        ((version[0]==thisVersion[0]) && (version[1] > thisVersion[1]))
    ){
        storeMapping(code,mapping, length, TRUE, pErrorCode);
        if(printRules){
            printMapping(code,mapping,length);
        }
    }
}
/*
 * Runs normalizationCorrectionsLineFn over every line of the given
 * ';'-delimited file (4 fields per line). Does nothing if pErrorCode is NULL
 * or already holds a failure. A U_FILE_ACCESS_ERROR is left in *pErrorCode
 * for the caller; any other failure is reported and terminates the process.
 */
static void
parseNormalizationCorrections(const char *filename, UErrorCode *pErrorCode) {
    char *columns[4][2];

    if(pErrorCode==NULL) {
        return;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', columns, 4, normalizationCorrectionsLineFn, NULL, pErrorCode);

    if(U_SUCCESS(*pErrorCode) || *pErrorCode==U_FILE_ACCESS_ERROR) {
        return;
    }
    fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    exit(*pErrorCode);
}
/*
 * u_parseDelimitedFile() callback for the mapping tables.
 *
 * Field 0 must consist entirely of one hexadecimal code point; field 1 is
 * parsed as the sequence of code points it maps to. The mapping is stored
 * with the UBool that context points to as the "with normalization" flag and
 * is printed when printRules is set. Exits the process on a parse error.
 *
 * fieldCount is not used.
 */
static void U_CALLCONV
caseMapLineFn(void *context,
              char *fields[][2], int32_t fieldCount,
              UErrorCode *pErrorCode) {
    uint32_t target[40];
    char *parseStop;
    uint32_t cp;
    int32_t targetLength;
    UBool *withNorm = (UBool*) context;

    /* ignore First and Last entries for ranges */
    if(*fields[1][0]=='<') {
        int32_t nameLength=(int32_t)(fields[1][1]-fields[1][0]);
        if(nameLength>=9 &&
           (0==uprv_memcmp(", First>", fields[1][1]-8, 8) ||
            0==uprv_memcmp(", Last>", fields[1][1]-7, 7))) {
            return;
        }
    }

    /* get the character code, field 0 */
    cp=(uint32_t)uprv_strtoul(fields[0][0], &parseStop, 16);
    if(parseStop<=fields[0][0] || parseStop!=fields[0][1]) {
        fprintf(stderr, "genidn: syntax error in field 0 at %s\n", fields[0][0]);
        *pErrorCode=U_PARSE_ERROR;
        exit(U_PARSE_ERROR);
    }

    /* parse the mapping string */
    targetLength=u_parseCodePoints(fields[1][0], target, sizeof(target)/4, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genidn error parsing UnicodeData.txt decomposition of U+%04lx - %s\n",
                (long)cp, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    /* store the mapping */
    storeMapping(cp, target, targetLength, *withNorm, pErrorCode);
    if(printRules){
        printMapping(cp, target, targetLength);
    }
}
/*
 * Runs caseMapLineFn over every line of the given ';'-delimited file
 * (3 fields per line), handing it withNorm through the context pointer.
 * Does nothing if pErrorCode is NULL or already holds a failure. A failure
 * terminates the process, except that a U_FILE_ACCESS_ERROR is left in
 * *pErrorCode for the caller when reportError is FALSE.
 */
static void
parseMappings(const char *filename,UBool withNorm, UBool reportError, UErrorCode *pErrorCode) {
    char *columns[3][2];

    if(pErrorCode==NULL) {
        return;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    u_parseDelimitedFile(filename, ';', columns, 3, caseMapLineFn, &withNorm, pErrorCode);

    if(U_SUCCESS(*pErrorCode)) {
        return;
    }
    if(!reportError && *pErrorCode==U_FILE_ACCESS_ERROR) {
        return;
    }
    fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
    exit(*pErrorCode);
}
/* rule printing helpers and parser for the rfc3454 table files ------------- */
static int32_t printedCharCount = 0;
/*
 * Prints one code point to stdout in the escaped form used in the generated
 * rules and adds the number of characters written to printedCharCount.
 * Code points above U+FFFF are written as \\UXXXXXXXX; rule white space and
 * everything outside U+0021..U+007E as \\uXXXX; '.' as \\. ; any other
 * character in U+0021..U+007E as itself.
 */
static void printEscaped(UChar32 ch){
    if(ch > 0xFFFF){
        printf("\\\\U%08X",ch);
        printedCharCount+=11;
        return;
    }
    if(!uprv_isRuleWhiteSpace(ch) && 0x20< ch && ch <0x7f){
        if(ch != 0x2E){
            printf("%c",(char)ch);
            printedCharCount++;
        }else{
            /* double escape dot */
            printf("\\\\%c",(char)ch);
            printedCharCount+=3;
        }
        return;
    }
    /* rule white space (double escaped) and all non-printable-ASCII code points */
    printf("\\\\u%04X",ch);
    printedCharCount+=7;
}
/*
 * Prints a code point range as "start-end " (or just "start " when both ends
 * are equal) in escaped form. Once more than 70 characters have been counted
 * in printedCharCount, the current string literal is closed, a new one is
 * started on the next line and the counter is reset.
 */
static void printEscapedRange(UChar32 rangeStart, UChar32 rangeEnd){
    printEscaped(rangeStart);
    if(rangeStart != rangeEnd){
        printf("-");
        printedCharCount++;
        printEscaped(rangeEnd);
    }
    printf(" ");
    if(printedCharCount > 70){
        printf("\"\n\t\t\t\"");
        printedCharCount =0 ;
    }
}
/*
 * Prints one mapping rule line of the form "cp > m0m1...;" with all code
 * points escaped, then resets printedCharCount.
 */
static void printMapping( UChar32 cp, UChar32* mapping, int32_t mappingLength){
    int32_t pos = 0;

    printf("\t\t\"");
    printEscaped(cp);
    printf(" > ");
    while(pos < mappingLength){
        printEscaped(mapping[pos]);
        ++pos;
    }
    printf(";\"\n");
    printedCharCount=0;
}
/*
 * u_parseDelimitedFile() callback for the unassigned/prohibited tables.
 *
 * Field 0 is parsed as a code point or code point range, which is stored as
 * UIDNA_UNASSIGNED when the UBool that context points to is TRUE and as
 * UIDNA_PROHIBITED otherwise. The range is also printed when printRules is
 * set. On a parse failure a message is printed and the function returns with
 * *pErrorCode set.
 */
static void U_CALLCONV
unicodeDataLineFn(void *context,
                  char *fields[][2], int32_t fieldCount,
                  UErrorCode *pErrorCode) {
    int32_t length;
    uint32_t rangeStart=0,rangeEnd =0;
    UBool* isUnassigned = (UBool*) context;

    /*
     * ignore First and Last entries for ranges
     * (field 1 may only be inspected if the caller asked for more than one
     * field; parseTable() passes a one-element fields array)
     */
    if( fieldCount>1 &&
        *fields[1][0]=='<' &&
        (length=(int32_t)(fields[1][1]-fields[1][0]))>=9 &&
        (0==uprv_memcmp(", First>", fields[1][1]-8, 8) || 0==uprv_memcmp(", Last>", fields[1][1]-7, 7))
    ) {
        return;
    }

    u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode);
    if(U_FAILURE(*pErrorCode)){
        fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode));
        return;
    }

    if(*isUnassigned == TRUE){
        storeRange(rangeStart,rangeEnd,UIDNA_UNASSIGNED, pErrorCode);
    }else{
        storeRange(rangeStart,rangeEnd,UIDNA_PROHIBITED, pErrorCode);
    }

    /*TODO: comment out the printer */
    if(printRules){
        printEscapedRange(rangeStart,rangeEnd);
    }
}
/*
 * Runs unicodeDataLineFn over every line of the given ';'-delimited file
 * (1 field per line), handing it isUnassigned through the context pointer.
 * Does nothing if pErrorCode is NULL or already holds a failure. When
 * printRules is set, the ranges are wrapped in an UnassignedSet or
 * ProhibitedSet rule on stdout. Any failure is reported and terminates the
 * process.
 */
static void
parseTable(const char *filename,UBool isUnassigned, UErrorCode *pErrorCode) {
    char *columns[1][2];

    if(pErrorCode==NULL) {
        return;
    }
    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    /*TODO: comment out the printer */
    if(printRules){
        const char *setHeader = isUnassigned ? "\n\tUnassignedSet{\"[ " : "\n\tProhibitedSet{\"[ ";
        printedCharCount = 0;
        printf("%s", setHeader);
    }

    u_parseDelimitedFile(filename, ';', columns, 1, unicodeDataLineFn, &isUnassigned, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        fprintf(stderr, "genidn error: u_parseDelimitedFile(\"%s\") failed - %s\n", filename, u_errorName(*pErrorCode));
        exit(*pErrorCode);
    }

    if(printRules){
        printf("]\"}\n");
    }
}
/*
static void
setLDHValues(UErrorCode* pErrorCode){
USet* set = uset_openPattern(LDH_PATTERN, LDH_PATTERN_LEN, pErrorCode);
int32_t itemCount;
int32_t index = 0;
UChar32 start,end;
if(U_FAILURE(*pErrorCode)){
fprintf(stderr,"Could not open USet. Error :%s \n",u_errorName(*pErrorCode));
exit(*pErrorCode);
}
itemCount = uset_getItemCount(set);
for(;index < itemCount; index++){
uset_getItem(set,index, &start, &end, NULL, 0, pErrorCode);
storeRange(start,end,UIDNA_LDH_OR_MAP_NFKC, pErrorCode);
}
if(printRules){
printf(PAT);
}
}
*/
/*
 * Stores U+002E, U+3002, U+FF0E and U+FF61 as UIDNA_LABEL_SEPARATOR. A
 * failure is reported on stderr but does not stop processing. When
 * printRules is set, the four code points are also printed as a
 * LabelSeparatorSet rule.
 */
static void
setLabelSeperators(UErrorCode *pErrorCode){
    /* U+002E, U+3002, U+FF0E, U+FF61 */
    static const uint32_t separators[] = { 0x002E, 0x3002, 0xFF0E, 0xFF61 };
    const int32_t separatorCount = (int32_t)(sizeof(separators)/sizeof(separators[0]));
    int32_t idx;

    for(idx=0; idx<separatorCount; ++idx){
        storeRange(separators[idx], separators[idx], UIDNA_LABEL_SEPARATOR, pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)){
        fprintf(stderr, "Could not store values for label separators\n");
    }

    if(printRules){
        printf("\tLabelSeparatorSet{\"[ ");
        for(idx=0; idx<separatorCount; ++idx){
            printEscaped((UChar32)separators[idx]);
        }
        printf(" ]\"}\n\n");
    }
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View File

@ -0,0 +1,121 @@
# Microsoft Developer Studio Project File - Name="genidna" - Package Owner=<4>
# Microsoft Developer Studio Generated Build File, Format Version 6.00
# ** DO NOT EDIT **
# TARGTYPE "Win32 (x86) Console Application" 0x0103
CFG=genidna - Win32 Debug
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
!MESSAGE use the Export Makefile command and run
!MESSAGE
!MESSAGE NMAKE /f "genidna.mak".
!MESSAGE
!MESSAGE You can specify a configuration when running NMAKE
!MESSAGE by defining the macro CFG on the command line. For example:
!MESSAGE
!MESSAGE NMAKE /f "genidna.mak" CFG="genidna - Win32 Debug"
!MESSAGE
!MESSAGE Possible choices for configuration are:
!MESSAGE
!MESSAGE "genidna - Win32 Release" (based on "Win32 (x86) Console Application")
!MESSAGE "genidna - Win32 Debug" (based on "Win32 (x86) Console Application")
!MESSAGE
# Begin Project
# PROP AllowPerConfigDependencies 0
# PROP Scc_ProjName ""
# PROP Scc_LocalPath ""
CPP=cl.exe
RSC=rc.exe
!IF "$(CFG)" == "genidna - Win32 Release"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "Release"
# PROP BASE Intermediate_Dir "Release"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "Release"
# PROP Intermediate_Dir "Release"
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x409 /d "NDEBUG"
# ADD RSC /l 0x409 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
!ELSEIF "$(CFG)" == "genidna - Win32 Debug"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "Debug"
# PROP BASE Intermediate_Dir "Debug"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "Debug"
# PROP Intermediate_Dir "Debug"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
MTL=midl.exe
# ADD BASE CPP /nologo /W3 /Gm /GX /ZI /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
# ADD CPP /nologo /W3 /Gm /GX /ZI /Od /I "..\..\common" /I "..\toolutil" /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /GZ /c
# ADD BASE RSC /l 0x409 /d "_DEBUG"
# ADD RSC /l 0x409 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# ADD LINK32 icutud.lib icuucd.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept /libpath:"..\..\..\lib"
# Begin Custom Build
TargetPath=.\Debug\genidna.exe
InputPath=.\Debug\genidna.exe
InputName=genidna
SOURCE="$(InputPath)"
"..\..\..\bin\$(InputName).exe" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy $(TargetPath) ..\..\..\bin
# End Custom Build
!ENDIF
# Begin Target
# Name "genidna - Win32 Release"
# Name "genidna - Win32 Debug"
# Begin Group "Source Files"
# PROP Default_Filter "cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
# Begin Source File
SOURCE=.\genidna.c
# End Source File
# Begin Source File
SOURCE=.\store.c
# End Source File
# End Group
# Begin Group "Header Files"
# PROP Default_Filter "h;hpp;hxx;hm;inl"
# Begin Source File
SOURCE=.\genidna.h
# End Source File
# End Group
# Begin Group "Resource Files"
# PROP Default_Filter "ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
# End Group
# End Target
# End Project

View File

@ -0,0 +1,76 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: genidna.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003-02-06
* created by: Ram Viswanadha
*/
#ifndef __GENIDN_H__
#define __GENIDN_H__
#include "unicode/utypes.h"
#include "unicode/uset.h"
#include "sprpimpl.h"
/* file definitions */
#define DATA_NAME "uidna"
#define DATA_TYPE "icu"
/*
* data structure that holds the IDN properties for one or more
* code point(s) at build time
*/
/* global flags */
extern UBool beVerbose, haveCopyright;
/* prototypes */
extern void
setUnicodeVersion(const char *v);
extern void
init(void);
extern void
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UBool withNorm, UErrorCode* status);
extern void
storeRange(uint32_t start, uint32_t end, UBool isUnassigned,UErrorCode* status);
extern void
generateData(const char *dataDir);
extern void
cleanUpData(void);
/*
extern void
storeIDN(uint32_t code, IDN *idn);
extern void
processData(void);
*/
#endif
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View File

@ -0,0 +1,332 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2002, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: store.c
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2003-02-06
* created by: Ram Viswanadha
*
*/
#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "unicode/udata.h"
#include "utrie.h"
#include "unicode/uset.h"
#include "unewdata.h"
#include "genidna.h"
#ifdef WIN32
# pragma warning(disable: 4100)
#endif
#define DO_DEBUG_OUT 0
/**
This is a simple Trie with the following structure
16-bit IDN sets:
Each 16-bit IDN word contains:
0..2 Category flags
Contains the enum values IDNStates
3..4 Contains the length of the mapping
If the length of the mapping is <= 2, the length itself is stored
If the length of the mapping is > 2, then the _IDNA_LENGTH_IN_MAPPING_TABLE
enum is stored and the length of the mapping is stored in the first index
in the data array
5..15 Contains the index into the data array that contains the mapping
If it contains _IDNA_MAP_TO_NOTHING, then the codepoint is stripped from
the input
*/
/* file data ---------------------------------------------------------------- */
/* indexes[] value names */
static int32_t indexes[_IDNA_INDEX_TOP]={ 0 };
static uint16_t mappingData[_IDNA_MAPPING_DATA_SIZE]={0};
/* UDataInfo cf. udata.h */
static UDataInfo dataInfo={
sizeof(UDataInfo),
0,
U_IS_BIG_ENDIAN,
U_CHARSET_FAMILY,
U_SIZEOF_UCHAR,
0,
{ 0x49, 0x44, 0x4e, 0x41 }, /* dataFormat="IDNA" */
{ 2, 2, UTRIE_SHIFT, UTRIE_INDEX_SHIFT }, /* formatVersion */
{ 3, 2, 0, 0 } /* dataVersion (Unicode version) */
};
/*
 * Parses the version string v and copies the resulting four version bytes
 * into dataInfo.dataVersion, which is written into the output file header.
 */
void
setUnicodeVersion(const char *v) {
    UVersionInfo parsedVersion;

    u_versionFromString(parsedVersion, v);
    uprv_memcpy(dataInfo.dataVersion, parsedVersion, 4);
}
static UNewTrie idnTrie={ {0},0,0,0,0,0,0,0,0,{0} };
static int32_t currentIndex = 1; /* the current index into the data trie */
static int32_t maxLength = 0; /* maximum length of mapping string */
#define MAX_DATA_LENGTH 11500
extern void
init() {
/* initialize the two tries */
if(NULL==utrie_open(&idnTrie, NULL, MAX_DATA_LENGTH, 0, FALSE)) {
fprintf(stderr, "error: failed to initialize tries\n");
exit(U_MEMORY_ALLOCATION_ERROR);
}
}
/* Reports that mappingData is full and records the failure in *status. */
static void
reportMappingDataOverflow(UErrorCode* status){
    fprintf(stderr, "Data Array index out of bounds.currentIndex = %i size of mapping arry = %i \n",currentIndex, _IDNA_MAPPING_DATA_SIZE);
    *status = U_INDEX_OUTOFBOUNDS_ERROR;
}

/*
 * Builds the trie word for one code point, appends its mapping (as UTF-16
 * code units) to mappingData and sets the word in idnTrie.
 *
 * codepoint - the code point being described
 * mapping   - the code points it maps to (length entries)
 * length    - number of entries in mapping; 0 if there is no mapping
 * flags     - category value for bits 0..2, or _IDNA_MAP_TO_NOTHING
 * status    - set to U_INDEX_OUTOFBOUNDS_ERROR when mappingData is full and
 *             to U_INTERNAL_PROGRAM_ERROR when the next mappingData slot is
 *             unexpectedly in use; the trie is not modified in those cases
 *
 * If the code point already has the value UIDNA_PROHIBITED, the stored value
 * becomes UIDNA_PROHIBITED plus _IDNA_MAP_TO_NOTHING in the index bits; any
 * other pre-existing non-zero value terminates the process.
 */
static void
store(uint32_t codepoint, uint32_t* mapping, int32_t length, uint32_t flags, UErrorCode* status){
    uint32_t trieWord = 0;
    uint32_t existing;
    int32_t i;

    if(flags == _IDNA_MAP_TO_NOTHING){
        trieWord = flags << 5;
    }else if(length==0){
        trieWord = flags;
    }else{
        /* set the 0..2 bits the flags */
        trieWord = flags;
        /* set the 3..4 bits the length */
        if(length > 2){
            trieWord += _IDNA_LENGTH_IN_MAPPING_TABLE << 3;
        }else{
            trieWord += (uint32_t)((length)<<3);
        }
        if(length > maxLength) {
            maxLength = length;
        }
        /* get the current index in the data array
         * and store in 5..15 bits
         */
        trieWord += currentIndex << 5;
        /* NOTE(review): an oversized word is only reported here, as before;
         * confirm whether this should become a hard error. */
        if(trieWord > 0xFFFF){
            fprintf(stderr,"size of trie word is greater than 0xFFFF.\n");
        }

        /* set the length in mapping table */
        if(length > 2){
            if(currentIndex >= _IDNA_MAPPING_DATA_SIZE){
                reportMappingDataOverflow(status);
                return;
            }
            mappingData[currentIndex++] = (uint16_t)length;
        }

        /* load mapping into the data array */
        for(i=0; i<length; ++i){
            if(currentIndex >= _IDNA_MAPPING_DATA_SIZE){
                reportMappingDataOverflow(status);
                return;
            }
            if(mappingData[currentIndex]!=0){
                /* would previously have looped forever without advancing */
                fprintf(stderr, "error: mapping data slot %i is already in use\n", (int)currentIndex);
                *status = U_INTERNAL_PROGRAM_ERROR;
                return;
            }
            if(mapping[i] <= 0xFFFF){
                mappingData[currentIndex++] = (uint16_t)mapping[i];
            }else{
                /* supplementary code point: store as a surrogate pair */
                mappingData[currentIndex++] = UTF16_LEAD(mapping[i]);
                if(currentIndex >= _IDNA_MAPPING_DATA_SIZE){
                    reportMappingDataOverflow(status);
                    return;
                }
                mappingData[currentIndex++] = UTF16_TRAIL(mapping[i]);
            }
        }
    }

    existing = utrie_get32(&idnTrie,codepoint,NULL);
    if(existing==0){
        /* now set the value in the trie */
        if(!utrie_set32(&idnTrie,codepoint,trieWord)){
            fprintf(stderr, "error: too many mapping entries\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }
    }else if(existing==UIDNA_PROHIBITED){
        existing += _IDNA_MAP_TO_NOTHING << 5;
        /* now set the value in the trie */
        if(!utrie_set32(&idnTrie,codepoint,existing)){
            fprintf(stderr, "error: too many mapping entries\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }
    }else{
        fprintf(stderr, "Index array has been set for codepoint 0x%06X. \n",codepoint);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
}
/*
 * Stores the mapping of codepoint via store(), flagged UIDNA_MAP_NFKC when
 * withNorm is set and _IDNA_MAP_TO_NOTHING otherwise.
 */
extern void
storeMapping(uint32_t codepoint, uint32_t* mapping,int32_t length, UBool withNorm, UErrorCode* status){
    uint32_t flags = withNorm ? UIDNA_MAP_NFKC : _IDNA_MAP_TO_NOTHING;

    store(codepoint,mapping,length,flags,status);
}
/*
 * Sets the trie value of every code point in [start, end] to flag.
 *
 * A single code point (start == end) may only be set if it is still 0 or
 * already holds flag; any other existing value terminates the process. A
 * real range is set through utrie_setRange32() with overwrite disabled.
 * status is not modified.
 */
extern void
storeRange(uint32_t start, uint32_t end, int8_t flag,UErrorCode* status){
    uint32_t trieWord = 0;
    uint32_t existing;

    trieWord += flag;

    if(start != end){
        if(!utrie_setRange32(&idnTrie,start,end+1,trieWord,FALSE)){
            fprintf(stderr, "error: too many entries\n");
            exit(U_BUFFER_OVERFLOW_ERROR);
        }
        return;
    }

    existing = utrie_get32(&idnTrie,start,NULL);
    if(existing != 0 && existing != (uint8_t)flag){
        fprintf(stderr, "Index array has been set for codepoint 0x%06X. \n",start);
        exit(U_INTERNAL_PROGRAM_ERROR);
    }
    if(!utrie_set32(&idnTrie,start,trieWord)){
        fprintf(stderr, "error: too many entries\n");
        exit(U_BUFFER_OVERFLOW_ERROR);
    }
}
/*
 * Folding callback for utrie_serialize(): ORs together the trie values of the
 * 0x400 code points starting at start, skipping whole data blocks that the
 * trie reports as the all-zero block. Returns offset|0x8000 if any value was
 * non-zero, otherwise 0.
 */
static uint32_t U_CALLCONV
getFoldedValue(UNewTrie *trie, UChar32 start, int32_t offset) {
    uint32_t combined=0;
    UChar32 cp=start;
    UChar32 end=start+0x400;
    UBool blockIsZero;

    while(cp<end) {
        uint32_t current=utrie_get32(trie, cp, &blockIsZero);
        if(!blockIsZero) {
            combined|=current;
            ++cp;
        } else {
            cp+=UTRIE_DATA_BLOCK_LENGTH;
        }
    }

    return combined!=0 ? (uint32_t)(offset|0x8000) : 0;
}
extern void
generateData(const char *dataDir) {
static uint8_t idnTrieBlock[100000];
UNewDataMemory *pData;
UErrorCode errorCode=U_ZERO_ERROR;
int32_t size, idnTrieSize, dataLength;
idnTrieSize=utrie_serialize(&idnTrie, idnTrieBlock, sizeof(idnTrieBlock), getFoldedValue, TRUE, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "error: utrie_serialize(idn trie) failed, %s\n", u_errorName(errorCode));
exit(errorCode);
}
size = idnTrieSize + sizeof(mappingData) + sizeof(indexes);
if(beVerbose) {
printf("size of idn trie %5u bytes\n", idnTrieSize);
printf("size of " U_ICUDATA_NAME "_" DATA_NAME "." DATA_TYPE " contents: %ld bytes\n", (long)size);
printf("size of mapping data array %5u bytes\n", sizeof(mappingData));
printf("Number of code units in mappingData (currentIndex) are: %i \n", currentIndex);
printf("Maximum length of the mapping string is : %i \n", maxLength);
}
/* write the data */
pData=udata_create(dataDir, DATA_TYPE, U_ICUDATA_NAME "_" DATA_NAME, &dataInfo,
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "gennorm: unable to create the output file, error %d\n", errorCode);
exit(errorCode);
}
indexes[_IDNA_INDEX_TRIE_SIZE]=idnTrieSize;
indexes[_IDNA_INDEX_MAPPING_DATA_SIZE]=sizeof(mappingData);
udata_writeBlock(pData, indexes, sizeof(indexes));
udata_writeBlock(pData, idnTrieBlock, idnTrieSize);
udata_writeBlock(pData, mappingData, sizeof(mappingData));
/* finish up */
dataLength=udata_finish(pData, &errorCode);
if(U_FAILURE(errorCode)) {
fprintf(stderr, "genidn: error %d writing the output file\n", errorCode);
exit(errorCode);
}
if(dataLength!=size) {
fprintf(stderr, "genidn error: data length %ld != calculated size %ld\n",
(long)dataLength, (long)size);
exit(U_INTERNAL_PROGRAM_ERROR);
}
}
/* Closes the build-time trie idnTrie that init() opened. */
extern void
cleanUpData(void) {
utrie_close(&idnTrie);
}
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/