diff --git a/icu4c/source/common/uidna.cpp b/icu4c/source/common/uidna.cpp index b86ed26b67..b7780b84e6 100644 --- a/icu4c/source/common/uidna.cpp +++ b/icu4c/source/common/uidna.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * - * Copyright (C) 2003-2007, International Business Machines + * Copyright (C) 2003-2009, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -45,8 +45,6 @@ static const UChar ACE_PREFIX[] ={ 0x0078,0x006E,0x002d,0x002d } ; #define CAPITAL_A 0x0041 #define CAPITAL_Z 0x005A -#define DATA_FILE_NAME "uidna" - inline static UChar toASCIILower(UChar ch){ if(CAPITAL_A <= ch && ch <= CAPITAL_Z){ @@ -624,7 +622,7 @@ uidna_toASCII(const UChar* src, int32_t srcLength, return 0; } - UStringPrepProfile* nameprep = usprep_open(NULL,DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return -1; @@ -653,7 +651,7 @@ uidna_toUnicode(const UChar* src, int32_t srcLength, return 0; } - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return -1; @@ -684,7 +682,7 @@ uidna_IDNToASCII( const UChar *src, int32_t srcLength, int32_t reqLength = 0; - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return 0; @@ -777,7 +775,7 @@ uidna_IDNToUnicode( const UChar* src, int32_t srcLength, int32_t reqLength = 0; - UStringPrepProfile* nameprep = usprep_open(NULL, DATA_FILE_NAME, status); + UStringPrepProfile* nameprep = usprep_openByType(USPREP_RFC3491_NAMEPREP, status); if(U_FAILURE(*status)){ return 0; diff --git a/icu4c/source/common/unicode/usprep.h 
b/icu4c/source/common/unicode/usprep.h index c7e75a53fa..d73548de5d 100644 --- a/icu4c/source/common/unicode/usprep.h +++ b/icu4c/source/common/unicode/usprep.h @@ -1,7 +1,7 @@ /* ******************************************************************************* * - * Copyright (C) 2003-2006, International Business Machines + * Copyright (C) 2003-2009, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -82,6 +82,85 @@ typedef struct UStringPrepProfile UStringPrepProfile; */ #define USPREP_ALLOW_UNASSIGNED 0x0001 +/** + * enums for the standard stringprep profile types + * supported by usprep_openByType. + * @see usprep_openByType + * @draft ICU 4.2 + */ +typedef enum UStringPrepProfileType { + /** + * RFC3491 Nameprep + * @draft ICU 4.2 + */ + USPREP_RFC3491_NAMEPREP, + /** + * RFC3530 nfs4_cs_prep + * @draft ICU 4.2 + */ + USPREP_RFC3530_NFS4_CS_PREP, + /** + * RFC3530 nfs4_cs_prep with case insensitive option + * @draft ICU 4.2 + */ + USPREP_RFC3530_NFS4_CS_PREP_CI, + /** + * RFC3530 nfs4_cis_prep + * @draft ICU 4.2 + */ + USPREP_RFC3530_NSF4_CIS_PREP, + /** + * RFC3530 nfs4_mixed_prep for prefix + * @draft ICU 4.2 + */ + USPREP_RFC3530_NSF4_MIXED_PREP_PREFIX, + /** + * RFC3530 nfs4_mixed_prep for suffix + * @draft ICU 4.2 + */ + USPREP_RFC3530_NSF4_MIXED_PREP_SUFFIX, + /** + * RFC3722 iSCSI + * @draft ICU 4.2 + */ + USPREP_RFC3722_ISCSI, + /** + * RFC3920 XMPP Nodeprep + * @draft ICU 4.2 + */ + USPREP_RFC3920_NODEPREP, + /** + * RFC3920 XMPP Resourceprep + * @draft ICU 4.2 + */ + USPREP_RFC3920_RESOURCEPREP, + /** + * RFC4011 Policy MIB Stringprep + * @draft ICU 4.2 + */ + USPREP_RFC4011_MIB, + /** + * RFC4013 SASLprep + * @draft ICU 4.2 + */ + USPREP_RFC4013_SASLPREP, + /** + * RFC4505 trace + * @draft ICU 4.2 + */ + USPREP_RFC4505_TRACE, + /** + * RFC4518 LDAP + * @draft ICU 4.2 + */ + USPREP_RFC4518_LDAP, + /** + * RFC4518 LDAP for case ignore, 
numeric and stored prefix + * matching rules + * @draft ICU 4.2 + */ + USPREP_RFC4518_LDAP_CI +} UStringPrepProfileType; /** * Creates a StringPrep profile from the data file. @@ -103,6 +182,20 @@ usprep_open(const char* path, const char* fileName, UErrorCode* status); +/** + * Creates a StringPrep profile for the specified profile type. + * + * @param type The profile type + * @param status ICU error code in/out parameter. Must not be NULL. + * Must fulfill U_SUCCESS before the function call. + * @return Pointer to UStringPrepProfile that is opened. Should be closed by + * calling usprep_close() + * @see usprep_close() + * @draft ICU 4.2 + */ +U_DRAFT UStringPrepProfile* U_EXPORT2 +usprep_openByType(UStringPrepProfileType type, + UErrorCode* status); /** * Closes the profile diff --git a/icu4c/source/common/usprep.cpp b/icu4c/source/common/usprep.cpp index bd9d7cb117..3cb4ca3445 100644 --- a/icu4c/source/common/usprep.cpp +++ b/icu4c/source/common/usprep.cpp @@ -1,7 +1,7 @@ /* ******************************************************************************* * - * Copyright (C) 2003-2008, International Business Machines + * Copyright (C) 2003-2009, International Business Machines * Corporation and others. All Rights Reserved. 
* ******************************************************************************* @@ -50,6 +50,24 @@ static uint8_t formatVersion[4]={ 0, 0, 0, 0 }; /* the Unicode version of the sprep data */ static UVersionInfo dataVersion={ 0, 0, 0, 0 }; +/* Profile names must be aligned to UStringPrepProfileType */ +static const char *PROFILE_NAMES[] = { + "rfc3491", /* USPREP_RFC3491_NAMEPREP */ + "rfc3530cs", /* USPREP_RFC3530_NFS4_CS_PREP */ + "rfc3530csci", /* USPREP_RFC3530_NFS4_CS_PREP_CI */ + "rfc3491", /* USPREP_RFC3530_NSF4_CIS_PREP */ + "rfc3530mixp", /* USPREP_RFC3530_NSF4_MIXED_PREP_PREFIX */ + "rfc3491", /* USPREP_RFC3530_NSF4_MIXED_PREP_SUFFIX */ + "rfc3722", /* USPREP_RFC3722_ISCSI */ + "rfc3920node", /* USPREP_RFC3920_NODEPREP */ + "rfc3920res", /* USPREP_RFC3920_RESOURCEPREP */ + "rfc4011", /* USPREP_RFC4011_MIB */ + "rfc4013", /* USPREP_RFC4013_SASLPREP */ + "rfc4505", /* USPREP_RFC4505_TRACE */ + "rfc4518", /* USPREP_RFC4518_LDAP */ + "rfc4518ci", /* USPREP_RFC4518_LDAP_CI */ +}; + static UBool U_CALLCONV isSPrepAcceptable(void * /* context */, const char * /* type */, @@ -418,6 +436,20 @@ usprep_open(const char* path, return usprep_getProfile(path,name,status); } +U_CAPI UStringPrepProfile* U_EXPORT2 +usprep_openByType(UStringPrepProfileType type, + UErrorCode* status) { + if(status == NULL || U_FAILURE(*status)){ + return NULL; + } + int32_t index = (int32_t)type; + if (index < 0 || index >= (int32_t)(sizeof(PROFILE_NAMES)/sizeof(PROFILE_NAMES[0]))) { /* type has no entry in PROFILE_NAMES; compare as signed to avoid a signed/unsigned mix */ + *status = U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + return usprep_open(NULL, PROFILE_NAMES[index], status); +} + U_CAPI void U_EXPORT2 usprep_close(UStringPrepProfile* profile){ if(profile==NULL){ diff --git a/icu4c/source/data/Makefile.in b/icu4c/source/data/Makefile.in index 23be78023f..4e425344ae 100644 --- a/icu4c/source/data/Makefile.in +++ b/icu4c/source/data/Makefile.in @@ -1,5 +1,5 @@ ## Makefile.in for ICU data -## Copyright (c) 1999-2008, International Business Machines Corporation and +## Copyright (c) 
1999-2009, International Business Machines Corporation and ## others. All Rights Reserved. ## Source directory information @@ -76,6 +76,7 @@ BRKSRCDIR=$(SRCDATADIR)/brkitr BRKBLDDIR=$(BUILDDIR)/brkitr MISCSRCDIR=$(SRCDATADIR)/misc UCMSRCDIR=$(SRCDATADIR)/mappings +SPREPSRCDIR=$(SRCDATADIR)/sprep COMINCDIR=$(top_srcdir)/common/unicode SRCLISTDEPS=Makefile $(srcdir)/Makefile.in BUILD_DIRS=$(OUTDIR) $(MAINBUILDDIR) $(BUILDDIR) $(BRKBLDDIR) $(COLBLDDIR) $(RBNFBLDDIR) $(TRANSLITBLDDIR) $(OUTTMPDIR) $(OUTTMPDIR_390STUB) $(OUTTMPDIR)/$(COLLATION_TREE) $(OUTTMPDIR)/$(RBNF_TREE) $(OUTTMPDIR)/$(TRANSLIT_TREE) $(OUTTMPDIR)/$(BREAK_TREE) @@ -201,7 +202,7 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata # 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu) # from data build. See Jitterbug 4497. (makedata.mak revision 1.117) # -DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu uidna.spp +DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%) ## BRK files @@ -315,11 +316,18 @@ TRANSLIT_TREE=translit TRANSLIT_FILES = $(TRANSLIT_SRC:%.txt=$(TRANSLITBLDDIR)/%.res) TRANSLIT_FILES_SHORT = $(TRANSLIT_SRC:%.txt=$(TRANSLIT_TREE)/%.res) +## SPP files +-include $(SPREPSRCDIR)/sprepfiles.mk +-include $(SPREPSRCDIR)/spreplocal.mk +ALL_SPREP_SOURCE=$(SPREP_SOURCE) $(SPREP_SOURCE_LOCAL) +SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp) +SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp) + ## All generated files -ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) +ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) 
$(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE) # a list to use in the .lst files (package-relative) -ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) +ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu unorm.icu UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%) @@ -416,9 +424,11 @@ $(BUILDDIR)/unames.icu: $(UNICODEDATADIR)/UnicodeData.txt $(BINDIR)/gennames$(EX $(BUILDDIR)/cnvalias.icu: $(UCMSRCDIR)/convrtrs.txt $(BINDIR)/gencnval$(EXEEXT) $(INVOKE) $(BINDIR)/gencnval -d $(BUILDDIR) $(UCMSRCDIR)/convrtrs.txt -# uidna.spp -$(BUILDDIR)/uidna.spp: $(MISCSRCDIR)/NamePrepProfile.txt $(BINDIR)/gensprep$(EXEEXT) $(BUILDDIR)/unames.icu $(BUILDDIR)/pnames.icu - $(INVOKE) $(BINDIR)/gensprep -d $(BUILDDIR) -i $(BUILDDIR) -s $(MISCSRCDIR) -b uidna -n $(UNICODEDATADIR) -u 3.2.0 -k NamePrepProfile.txt +#################################################### SPP +# SPP FILES + +$(BUILDDIR)/%.spp: $(SPREPSRCDIR)/%.txt $(BINDIR)/gensprep$(EXEEXT) $(BUILDDIR)/unames.icu $(BUILDDIR)/pnames.icu + $(INVOKE) $(BINDIR)/gensprep -d $(BUILDDIR) -i $(BUILDDIR) -s $(SPREPSRCDIR) -b $(@F:%.spp=%) -m $(UNICODEDATADIR) -u 3.2.0 $( \$a1, "--B1" => \$b1, "--B2" => \$b2, - "--B3" => \$b3, + "--B3" => \$b3, "--C11" => \$c11, "--C12" => \$c12, "--C21" => \$c21, @@ -49,8 +51,12 @@ sub main(){ "--C7" => \$c7, "--C8" => \$c8, "--C9" => \$c9, - "--ldh-chars" => \$writeLDHChars, - "--iscsi" => \$writeISCSIChars, + 
"--iscsi" => \$writeISCSIProhibitedExtra, + "--xmpp-node" => \$writeXMPPNodeProhibitedExtra, + "--sasl" => \$writeSASLMap, + "--ldap" => \$writeLDAPMap, + "--normalize" => \$norm, + "--check-bidi" => \$checkBidi, ); usage() unless defined $sourceDir; usage() unless defined $destDir; @@ -65,8 +71,18 @@ sub main(){ unlink($outfile); $outFH = IO::File->new($outfile,"a") or die "could not open the file $outfile for writing: $! \n"; + + printf $outFH $icu_copyright, localtime->year()+1900; print $outFH $copyright; print $outFH $warning; + + if(defined $norm) { + print $outFH "\@normalize;;\n"; + } + if(defined $checkBidi) { + print $outFH "\@check-bidi;;\n"; + } + print $outFH "\n"; close($outFH); if(defined $b2 && defined $b3){ @@ -121,9 +137,18 @@ sub main(){ createProhibitedTable($inFH,$outfile,$line); } } - if( defined $writeISCSIChars){ + if( defined $writeISCSIProhibitedExtra){ create_iSCSIExtraProhibitedTable($inFH, $outfile); } + if( defined $writeXMPPNodeProhibitedExtra){ + create_XMPPNodeExtraProhibitedTable($inFH, $outfile); + } + if( defined $writeSASLMap){ + create_SASLMapTable($inFH, $outfile); + } + if( defined $writeLDAPMap){ + create_LDAPMapTable($inFH, $outfile); + } close($inFH); } @@ -239,7 +264,7 @@ sub createProhibitedTable{ #----------------------------------------------------------------------- sub create_iSCSIExtraProhibitedTable{ ($inFH,$outfile,$line) = @_; - $comment ="# Additional prohibitions from draft-ietf-ips-iscsi-string-prep-06.txt\n"; + $comment ="# Additional prohibitions from iSCSI profile (rfc3722.txt)\n\n"; $outFH = IO::File->new($outfile, "a") or die "could not open the file $outfile for writing: $! 
\n"; @@ -254,6 +279,358 @@ sub create_iSCSIExtraProhibitedTable{ close($outFH); } #----------------------------------------------------------------------- +sub create_XMPPNodeExtraProhibitedTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Additional prohibitions from XMPP Nodeprep profile (rfc3920.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! \n"; + print $outFH $comment; + print $outFH "0022; ; PROHIBITED\n"; + print $outFH "0026; ; PROHIBITED\n"; + print $outFH "0027; ; PROHIBITED\n"; + print $outFH "002F; ; PROHIBITED\n"; + print $outFH "003A; ; PROHIBITED\n"; + print $outFH "003C; ; PROHIBITED\n"; + print $outFH "003E; ; PROHIBITED\n"; + print $outFH "0040; ; PROHIBITED\n"; + print $outFH "\n# Total code points 8\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_SASLMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for SASL profile (rfc4013.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + # non-ASCII space characters [C.1.2] to SPACE + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "200B; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + # commonly mapped to nothing characters except U+200B to nothing + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "\n# Total code points 43\n"; + close($outFH); +} +#----------------------------------------------------------------------- +sub create_LDAPMapTable{ + ($inFH,$outfile,$line) = @_; + $comment ="# Map table for LDAP profile (rfc4518.txt)\n\n"; + + $outFH = IO::File->new($outfile, "a") + or die "could not open the file $outfile for writing: $! 
\n"; + print $outFH $comment; + + # SOFT HYPHEN (U+00AD) and MONGOLIAN TODO SOFT HYPHEN (U+1806) code + # points are mapped to nothing. COMBINING GRAPHEME JOINER (U+034F) and + # VARIATION SELECTORs (U+180B-180D, FF00-FE0F) code points are also + # mapped to nothing. The OBJECT REPLACEMENT CHARACTER (U+FFFC) is + # mapped to nothing. + + print $outFH "00AD; ; MAP\n"; + print $outFH "034F; ; MAP\n"; + print $outFH "1806; ; MAP\n"; + print $outFH "180B; ; MAP\n"; + print $outFH "180C; ; MAP\n"; + print $outFH "180D; ; MAP\n"; + print $outFH "FE00; ; MAP\n"; + print $outFH "FE01; ; MAP\n"; + print $outFH "FE02; ; MAP\n"; + print $outFH "FE03; ; MAP\n"; + print $outFH "FE04; ; MAP\n"; + print $outFH "FE05; ; MAP\n"; + print $outFH "FE06; ; MAP\n"; + print $outFH "FE07; ; MAP\n"; + print $outFH "FE08; ; MAP\n"; + print $outFH "FE09; ; MAP\n"; + print $outFH "FE0A; ; MAP\n"; + print $outFH "FE0B; ; MAP\n"; + print $outFH "FE0C; ; MAP\n"; + print $outFH "FE0D; ; MAP\n"; + print $outFH "FE0E; ; MAP\n"; + print $outFH "FE0F; ; MAP\n"; + print $outFH "FFFC; ; MAP\n"; + +# CHARACTER TABULATION (U+0009), LINE FEED (LF) (U+000A), LINE +# TABULATION (U+000B), FORM FEED (FF) (U+000C), CARRIAGE RETURN (CR) +# (U+000D), and NEXT LINE (NEL) (U+0085) are mapped to SPACE (U+0020). + + print $outFH "0009; 0020; MAP\n"; + print $outFH "000A; 0020; MAP\n"; + print $outFH "000B; 0020; MAP\n"; + print $outFH "000C; 0020; MAP\n"; + print $outFH "000D; 0020; MAP\n"; + print $outFH "0085; 0020; MAP\n"; + + # All other control code (e.g., Cc) points or code points with a + # control function (e.g., Cf) are mapped to nothing. The following is + # a complete list of these code points: U+0000-0008, 000E-001F, 007F- + # 0084, 0086-009F, 06DD, 070F, 180E, 200C-200F, 202A-202E, 2060-2063, + # 206A-206F, FEFF, FFF9-FFFB, 1D173-1D17A, E0001, E0020-E007F. 
+ + print $outFH "0000; ; MAP\n"; + print $outFH "0001; ; MAP\n"; + print $outFH "0002; ; MAP\n"; + print $outFH "0003; ; MAP\n"; + print $outFH "0004; ; MAP\n"; + print $outFH "0005; ; MAP\n"; + print $outFH "0006; ; MAP\n"; + print $outFH "0007; ; MAP\n"; + print $outFH "0008; ; MAP\n"; + print $outFH "000E; ; MAP\n"; + print $outFH "000F; ; MAP\n"; + print $outFH "0010; ; MAP\n"; + print $outFH "0011; ; MAP\n"; + print $outFH "0012; ; MAP\n"; + print $outFH "0013; ; MAP\n"; + print $outFH "0014; ; MAP\n"; + print $outFH "0015; ; MAP\n"; + print $outFH "0016; ; MAP\n"; + print $outFH "0017; ; MAP\n"; + print $outFH "0018; ; MAP\n"; + print $outFH "0019; ; MAP\n"; + print $outFH "001A; ; MAP\n"; + print $outFH "001B; ; MAP\n"; + print $outFH "001C; ; MAP\n"; + print $outFH "001D; ; MAP\n"; + print $outFH "001E; ; MAP\n"; + print $outFH "001F; ; MAP\n"; + print $outFH "007F; ; MAP\n"; + print $outFH "0080; ; MAP\n"; + print $outFH "0081; ; MAP\n"; + print $outFH "0082; ; MAP\n"; + print $outFH "0083; ; MAP\n"; + print $outFH "0084; ; MAP\n"; + print $outFH "0086; ; MAP\n"; + print $outFH "0087; ; MAP\n"; + print $outFH "0088; ; MAP\n"; + print $outFH "0089; ; MAP\n"; + print $outFH "008A; ; MAP\n"; + print $outFH "008B; ; MAP\n"; + print $outFH "008C; ; MAP\n"; + print $outFH "008D; ; MAP\n"; + print $outFH "008E; ; MAP\n"; + print $outFH "008F; ; MAP\n"; + print $outFH "0090; ; MAP\n"; + print $outFH "0091; ; MAP\n"; + print $outFH "0092; ; MAP\n"; + print $outFH "0093; ; MAP\n"; + print $outFH "0094; ; MAP\n"; + print $outFH "0095; ; MAP\n"; + print $outFH "0096; ; MAP\n"; + print $outFH "0097; ; MAP\n"; + print $outFH "0098; ; MAP\n"; + print $outFH "0099; ; MAP\n"; + print $outFH "009A; ; MAP\n"; + print $outFH "009B; ; MAP\n"; + print $outFH "009C; ; MAP\n"; + print $outFH "009D; ; MAP\n"; + print $outFH "009E; ; MAP\n"; + print $outFH "009F; ; MAP\n"; + print $outFH "06DD; ; MAP\n"; + print $outFH "070F; ; MAP\n"; + print $outFH "180E; ; MAP\n"; + print 
$outFH "200C; ; MAP\n"; + print $outFH "200D; ; MAP\n"; + print $outFH "200E; ; MAP\n"; + print $outFH "200F; ; MAP\n"; + print $outFH "202A; ; MAP\n"; + print $outFH "202B; ; MAP\n"; + print $outFH "202C; ; MAP\n"; + print $outFH "202D; ; MAP\n"; + print $outFH "202E; ; MAP\n"; + print $outFH "2060; ; MAP\n"; + print $outFH "2061; ; MAP\n"; + print $outFH "2062; ; MAP\n"; + print $outFH "2063; ; MAP\n"; + print $outFH "206A; ; MAP\n"; + print $outFH "206B; ; MAP\n"; + print $outFH "206C; ; MAP\n"; + print $outFH "206D; ; MAP\n"; + print $outFH "206E; ; MAP\n"; + print $outFH "206F; ; MAP\n"; + print $outFH "FEFF; ; MAP\n"; + print $outFH "FFF9; ; MAP\n"; + print $outFH "FFFA; ; MAP\n"; + print $outFH "FFFB; ; MAP\n"; + print $outFH "1D173; ; MAP\n"; + print $outFH "1D174; ; MAP\n"; + print $outFH "1D175; ; MAP\n"; + print $outFH "1D176; ; MAP\n"; + print $outFH "1D177; ; MAP\n"; + print $outFH "1D178; ; MAP\n"; + print $outFH "1D179; ; MAP\n"; + print $outFH "1D17A; ; MAP\n"; + print $outFH "E0001; ; MAP\n"; + print $outFH "E0020; ; MAP\n"; + print $outFH "E0021; ; MAP\n"; + print $outFH "E0022; ; MAP\n"; + print $outFH "E0023; ; MAP\n"; + print $outFH "E0024; ; MAP\n"; + print $outFH "E0025; ; MAP\n"; + print $outFH "E0026; ; MAP\n"; + print $outFH "E0027; ; MAP\n"; + print $outFH "E0028; ; MAP\n"; + print $outFH "E0029; ; MAP\n"; + print $outFH "E002A; ; MAP\n"; + print $outFH "E002B; ; MAP\n"; + print $outFH "E002C; ; MAP\n"; + print $outFH "E002D; ; MAP\n"; + print $outFH "E002E; ; MAP\n"; + print $outFH "E002F; ; MAP\n"; + print $outFH "E0030; ; MAP\n"; + print $outFH "E0031; ; MAP\n"; + print $outFH "E0032; ; MAP\n"; + print $outFH "E0033; ; MAP\n"; + print $outFH "E0034; ; MAP\n"; + print $outFH "E0035; ; MAP\n"; + print $outFH "E0036; ; MAP\n"; + print $outFH "E0037; ; MAP\n"; + print $outFH "E0038; ; MAP\n"; + print $outFH "E0039; ; MAP\n"; + print $outFH "E003A; ; MAP\n"; + print $outFH "E003B; ; MAP\n"; + print $outFH "E003C; ; MAP\n"; + print $outFH 
"E003D; ; MAP\n"; + print $outFH "E003E; ; MAP\n"; + print $outFH "E003F; ; MAP\n"; + print $outFH "E0040; ; MAP\n"; + print $outFH "E0041; ; MAP\n"; + print $outFH "E0042; ; MAP\n"; + print $outFH "E0043; ; MAP\n"; + print $outFH "E0044; ; MAP\n"; + print $outFH "E0045; ; MAP\n"; + print $outFH "E0046; ; MAP\n"; + print $outFH "E0047; ; MAP\n"; + print $outFH "E0048; ; MAP\n"; + print $outFH "E0049; ; MAP\n"; + print $outFH "E004A; ; MAP\n"; + print $outFH "E004B; ; MAP\n"; + print $outFH "E004C; ; MAP\n"; + print $outFH "E004D; ; MAP\n"; + print $outFH "E004E; ; MAP\n"; + print $outFH "E004F; ; MAP\n"; + print $outFH "E0050; ; MAP\n"; + print $outFH "E0051; ; MAP\n"; + print $outFH "E0052; ; MAP\n"; + print $outFH "E0053; ; MAP\n"; + print $outFH "E0054; ; MAP\n"; + print $outFH "E0055; ; MAP\n"; + print $outFH "E0056; ; MAP\n"; + print $outFH "E0057; ; MAP\n"; + print $outFH "E0058; ; MAP\n"; + print $outFH "E0059; ; MAP\n"; + print $outFH "E005A; ; MAP\n"; + print $outFH "E005B; ; MAP\n"; + print $outFH "E005C; ; MAP\n"; + print $outFH "E005D; ; MAP\n"; + print $outFH "E005E; ; MAP\n"; + print $outFH "E005F; ; MAP\n"; + print $outFH "E0060; ; MAP\n"; + print $outFH "E0061; ; MAP\n"; + print $outFH "E0062; ; MAP\n"; + print $outFH "E0063; ; MAP\n"; + print $outFH "E0064; ; MAP\n"; + print $outFH "E0065; ; MAP\n"; + print $outFH "E0066; ; MAP\n"; + print $outFH "E0067; ; MAP\n"; + print $outFH "E0068; ; MAP\n"; + print $outFH "E0069; ; MAP\n"; + print $outFH "E006A; ; MAP\n"; + print $outFH "E006B; ; MAP\n"; + print $outFH "E006C; ; MAP\n"; + print $outFH "E006D; ; MAP\n"; + print $outFH "E006E; ; MAP\n"; + print $outFH "E006F; ; MAP\n"; + print $outFH "E0070; ; MAP\n"; + print $outFH "E0071; ; MAP\n"; + print $outFH "E0072; ; MAP\n"; + print $outFH "E0073; ; MAP\n"; + print $outFH "E0074; ; MAP\n"; + print $outFH "E0075; ; MAP\n"; + print $outFH "E0076; ; MAP\n"; + print $outFH "E0077; ; MAP\n"; + print $outFH "E0078; ; MAP\n"; + print $outFH "E0079; ; MAP\n"; + 
print $outFH "E007A; ; MAP\n"; + print $outFH "E007B; ; MAP\n"; + print $outFH "E007C; ; MAP\n"; + print $outFH "E007D; ; MAP\n"; + print $outFH "E007E; ; MAP\n"; + print $outFH "E007F; ; MAP\n"; + + # ZERO WIDTH SPACE (U+200B) is mapped to nothing. All other code + # points with Separator (space, line, or paragraph) property (e.g., Zs, + # Zl, or Zp) are mapped to SPACE (U+0020). The following is a complete + # list of these code points: U+0020, 00A0, 1680, 2000-200A, 2028-2029, + # 202F, 205F, 3000. + + print $outFH "200B; ; MAP\n"; + print $outFH "00A0; 0020; MAP\n"; + print $outFH "1680; 0020; MAP\n"; + print $outFH "2000; 0020; MAP\n"; + print $outFH "2001; 0020; MAP\n"; + print $outFH "2002; 0020; MAP\n"; + print $outFH "2003; 0020; MAP\n"; + print $outFH "2004; 0020; MAP\n"; + print $outFH "2005; 0020; MAP\n"; + print $outFH "2006; 0020; MAP\n"; + print $outFH "2007; 0020; MAP\n"; + print $outFH "2008; 0020; MAP\n"; + print $outFH "2009; 0020; MAP\n"; + print $outFH "200A; 0020; MAP\n"; + print $outFH "2028; 0020; MAP\n"; + print $outFH "2029; 0020; MAP\n"; + print $outFH "202F; 0020; MAP\n"; + print $outFH "205F; 0020; MAP\n"; + print $outFH "3000; 0020; MAP\n"; + + print $outFH "\n# Total code points 238\n"; + close($outFH); +} +#----------------------------------------------------------------------- sub usage { print << "END"; Usage: @@ -278,11 +655,16 @@ Options: --C7 Generate data for table C.7 --C8 Generate data for table C.8 --C9 Generate data for table C.9 - --iscsi Generate data for extra prohibited iSCSI chars + --iscsi Generate data for iSCSI extra prohibited table + --xmpp-node Generate data for XMPP extra prohibited table + --sasl Generate data for SASL map table + --ldap Generate data for LDAP map table + --normalize Embed the normalization directive in the output file + --check-bidi Embed the check bidi directive in the output file Note, --B2 and --B3 are mutually exclusive. -e.g.: filterRFC3454.pl --sourcedir=. 
--destdir=./output --src-filename=rfc3454.txt --dest-filename=NamePrepProfile.txt --A1 --B1 --B2 --C12 --C22 --C3 --C4 --C5 --C6 --C7 --C8 --C9 +e.g.: filterRFC3454.pl --sourcedir=. --destdir=./output --src-filename=rfc3454.txt --dest-filename=NamePrepProfile.txt --A1 --B1 --B2 --C12 --C22 --C3 --C4 --C5 --C6 --C7 --C8 --C9 --normalize --check-bidi filterRFC3454.pl filters the RFC file and creates String prep table files. The RFC text can be downloaded from ftp://ftp.rfc-editor.org/in-notes/rfc3454.txt diff --git a/icu4c/source/tools/gensprep/gensprep.c b/icu4c/source/tools/gensprep/gensprep.c index 7421545158..c3f1ade571 100644 --- a/icu4c/source/tools/gensprep/gensprep.c +++ b/icu4c/source/tools/gensprep/gensprep.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 2003-2006, International Business Machines +* Copyright (C) 2003-2009, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -44,6 +44,11 @@ UBool beVerbose=FALSE, haveCopyright=TRUE; #define NORM_CORRECTIONS_FILE_NAME "NormalizationCorrections.txt" +#define NORMALIZE_DIRECTIVE "normalize" +#define NORMALIZE_DIRECTIVE_LEN 9 +#define CHECK_BIDI_DIRECTIVE "check-bidi" +#define CHECK_BIDI_DIRECTIVE_LEN 10 + /* prototypes --------------------------------------------------------------- */ static void @@ -65,6 +70,7 @@ static UOption options[]={ UOPTION_ICUDATADIR, UOPTION_BUNDLE_NAME, { "normalization", NULL, NULL, NULL, 'n', UOPT_REQUIRES_ARG, 0 }, + { "norm-correction", NULL, NULL, NULL, 'm', UOPT_REQUIRES_ARG, 0 }, { "check-bidi", NULL, NULL, NULL, 'k', UOPT_NO_ARG, 0}, { "unicode", NULL, NULL, NULL, 'u', UOPT_REQUIRES_ARG, 0 }, }; @@ -79,6 +85,7 @@ enum{ ICUDATADIR, BUNDLE_NAME, NORMALIZE, + NORM_CORRECTION_DIR, CHECK_BIDI, UNICODE_VERSION }; @@ -110,7 +117,13 @@ static int printHelp(int argc, char* argv[]){ fprintf(stderr, "\t-n 
or --normalize turn on the option for normalization and include mappings\n" "\t from NormalizationCorrections.txt from the given path,\n" - "\t e.g: /test/icu/source/data/unidata\n" + "\t e.g: /test/icu/source/data/unidata\n"); + fprintf(stderr, + "\t-m or --norm-correction use NormalizationCorrections.txt from the given path\n" + "\t when the input file contains a normalization directive.\n" + "\t unlike -n/--normalize, this option does not force the\n" + "\t normalization.\n"); + fprintf(stderr, "\t-k or --check-bidi turn on the option for checking for BiDi in the profile\n" "\t-u or --unicode version of Unicode to be used with this profile followed by the version\n" ); @@ -158,7 +171,11 @@ main(int argc, char* argv[]) { srcDir=options[SOURCEDIR].value; destDir=options[DESTDIR].value; bundleName = options[BUNDLE_NAME].value; - icuUniDataDir = options[NORMALIZE].value; + if(options[NORMALIZE].doesOccur) { + icuUniDataDir = options[NORMALIZE].value; + } else { + icuUniDataDir = options[NORM_CORRECTION_DIR].value; + } if(argc<2) { /* print the help message */ @@ -210,7 +227,7 @@ main(int argc, char* argv[]) { return errorCode; } - if(options[NORMALIZE].doesOccur){ + if(options[NORMALIZE].doesOccur){ /* this option might be set by @normalize;; in the source file */ /* set up directory for NormalizationCorrections.txt */ uprv_strcpy(filename,icuUniDataDir); basename=filename+uprv_strlen(filename); @@ -229,7 +246,7 @@ main(int argc, char* argv[]) { sprepOptions |= _SPREP_NORMALIZATION_ON; } - if(options[CHECK_BIDI].doesOccur){ + if(options[CHECK_BIDI].doesOccur){ /* this option might be set by @check-bidi;; in the source file */ sprepOptions |= _SPREP_CHECK_BIDI_ON; } @@ -327,13 +344,34 @@ strprepProfileLineFn(void *context, const char* typeName; uint32_t rangeStart=0,rangeEnd =0; const char* filename = (const char*) context; - + const char *s; + + s = u_skipWhitespace(fields[0][0]); + if (*s == '@') { + /* special directive */ + s++; + length = fields[0][1] - s; + if 
(length >= NORMALIZE_DIRECTIVE_LEN + && uprv_strncmp(s, NORMALIZE_DIRECTIVE, NORMALIZE_DIRECTIVE_LEN) == 0) { + options[NORMALIZE].doesOccur = TRUE; + return; + } + else if (length >= CHECK_BIDI_DIRECTIVE_LEN + && uprv_strncmp(s, CHECK_BIDI_DIRECTIVE, CHECK_BIDI_DIRECTIVE_LEN) == 0) { + options[CHECK_BIDI].doesOccur = TRUE; + return; + } + else { + fprintf(stderr, "gensprep error parsing a directive %s.", fields[0][0]); + } + } + typeName = fields[2][0]; map = fields[1][0]; if(uprv_strstr(typeName, usprepTypeNames[USPREP_UNASSIGNED])!=NULL){ - u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode); + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); if(U_FAILURE(*pErrorCode)){ fprintf(stderr, "Could not parse code point range. Error: %s\n",u_errorName(*pErrorCode)); return; @@ -344,7 +382,7 @@ strprepProfileLineFn(void *context, }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_PROHIBITED])!=NULL){ - u_parseCodePointRange(fields[0][0], &rangeStart,&rangeEnd, pErrorCode); + u_parseCodePointRange(s, &rangeStart,&rangeEnd, pErrorCode); if(U_FAILURE(*pErrorCode)){ fprintf(stderr, "Could not parse code point range. 
Error: %s\n",u_errorName(*pErrorCode)); return; @@ -356,8 +394,8 @@ strprepProfileLineFn(void *context, }else if(uprv_strstr(typeName, usprepTypeNames[USPREP_MAP])!=NULL){ /* get the character code, field 0 */ - code=(uint32_t)uprv_strtoul(fields[0][0], &end, 16); - if(end<=fields[0][0] || end!=fields[0][1]) { + code=(uint32_t)uprv_strtoul(s, &end, 16); + if(end<=s || end!=fields[0][1]) { fprintf(stderr, "gensprep: syntax error in field 0 at %s\n", fields[0][0]); *pErrorCode=U_PARSE_ERROR; exit(U_PARSE_ERROR); diff --git a/icu4c/source/tools/gensprep/store.c b/icu4c/source/tools/gensprep/store.c index 58b88b3ae7..d2ec374121 100644 --- a/icu4c/source/tools/gensprep/store.c +++ b/icu4c/source/tools/gensprep/store.c @@ -1,7 +1,7 @@ /* ******************************************************************************* * -* Copyright (C) 1999-2006, International Business Machines +* Copyright (C) 1999-2009, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* @@ -240,14 +240,19 @@ storeMappingData(){ const UHashElement* element = NULL; ValueStruct* value = NULL; int32_t codepoint = 0; - int32_t elementCount = uhash_count(hashTable); + int32_t elementCount = 0; int32_t writtenElementCount = 0; int32_t mappingLength = 1; /* minimum mapping length */ int32_t oldMappingLength = 0; uint16_t trieWord =0; int32_t limitIndex = 0; - /*initialize the mapping data */ + if (hashTable == NULL) { + return; + } + elementCount = uhash_count(hashTable); + + /*initialize the mapping data */ mappingData = (uint16_t*) uprv_malloc(U_SIZEOF_UCHAR * (mappingDataCapacity)); uprv_memset(mappingData,0,U_SIZEOF_UCHAR * mappingDataCapacity); @@ -647,7 +652,9 @@ generateData(const char *dataDir, const char* bundleName) { #if !UCONFIG_NO_IDNA /* done with writing the data .. close the hashtable */ - uhash_close(hashTable); + if (hashTable != NULL) { + uhash_close(hashTable); + } #endif }