ICU-4790 spoof impl merged to trunk.

X-SVN-Rev: 25534
This commit is contained in:
Andy Heninger 2009-03-09 23:40:15 +00:00
parent a5894c4401
commit 9715eae02c
33 changed files with 36784 additions and 293 deletions

View File

@ -239,6 +239,13 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "letest", "..\test\letest\le
{37FC2C7F-1904-4811-8955-2F478830EAD1} = {37FC2C7F-1904-4811-8955-2F478830EAD1}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gencfu", "..\tools\gencfu\gencfu.vcproj", "{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}"
ProjectSection(ProjectDependencies) = postProject
{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
{6B231032-3CB5-4EED-9210-810D666A23A0} = {6B231032-3CB5-4EED-9210-810D666A23A0}
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@ -511,6 +518,12 @@ Global
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|Win32.Build.0 = Release|Win32
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|x64.ActiveCfg = Release|x64
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|x64.Build.0 = Release|x64
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|Win32.ActiveCfg = Debug|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|Win32.Build.0 = Debug|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|x64.ActiveCfg = Debug|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.ActiveCfg = Release|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.Build.0 = Release|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.ActiveCfg = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1997-2008, International Business Machines
* Copyright (C) 1997-2009, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* Date Name Description
@ -569,7 +569,9 @@ uhash_init(UHashtable *fillinResult,
U_CAPI void U_EXPORT2
uhash_close(UHashtable *hash) {
U_ASSERT(hash != NULL);
if (hash == NULL) {
return;
}
if (hash->elements != NULL) {
if (hash->keyDeleter != NULL || hash->valueDeleter != NULL) {
int32_t pos=-1;

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1997-2007, International Business Machines
* Copyright (C) 1997-2009, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* Date Name Description
@ -246,7 +246,7 @@ uhash_init(UHashtable *hash,
/**
* Close a UHashtable, releasing the memory used.
* @param hash The UHashtable to close.
* @param hash The UHashtable to close. If hash is NULL no operation is performed.
*/
U_CAPI void U_EXPORT2
uhash_close(UHashtable *hash);

View File

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1999-2004, International Business Machines Corporation and *
* Copyright (C) 1999-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
* Date Name Description
@ -10,6 +10,7 @@
#include "uvector.h"
#include "cmemory.h"
#include "uarrsort.h"
U_NAMESPACE_BEGIN
@ -466,5 +467,74 @@ void UVector::sortedInsert(UHashTok tok, USortComparator *compare, UErrorCode& e
}
}
/**
  * Adapter between the two comparator conventions in play.
  *
  * uprv_sortArray() calls back with (context, left, right) pointers, while
  * UVector::sort() is given a USortComparator that takes two UHashTok
  * values.  This function bridges the two.
  *
  * @param context  Address of a variable holding the caller's
  *                 USortComparator pointer (one extra level of indirection).
  * @param left     Address of the first UHashTok element.
  * @param right    Address of the second UHashTok element.
  * @return         Whatever the user comparator returns, widened to int32_t.
  */
static int32_t U_CALLCONV
sortComparator(const void *context, const void *left, const void *right) {
    USortComparator * const *comparatorSlot =
            static_cast<USortComparator * const *>(context);
    const UHashTok *lhs = static_cast<const UHashTok *>(left);
    const UHashTok *rhs = static_cast<const UHashTok *>(right);
    return (**comparatorSlot)(*lhs, *rhs);
}
/**
  * uprv_sortArray() comparison callback used by UVector::sorti().
  * Orders two UHashTok elements by their integer member.
  *
  * @param left   Address of the first UHashTok element.
  * @param right  Address of the second UHashTok element.
  * @return       -1, 0 or 1 as left is less than, equal to, or greater
  *               than right.  The context argument is ignored.
  */
static int32_t U_CALLCONV
sortiComparator(const void * /*context */, const void *left, const void *right) {
    const UHashTok *lhs = static_cast<const UHashTok *>(left);
    const UHashTok *rhs = static_cast<const UHashTok *>(right);
    if (lhs->integer < rhs->integer) {
        return -1;
    }
    if (lhs->integer == rhs->integer) {
        return 0;
    }
    return 1;
}
/**
  * Sort the vector in place, assuming it contains ints.
  * (A more general sort would take a comparison function, but it's
  *  not clear whether UVector's USortComparator or
  *  UComparator from uprv_sortArray would be more appropriate.)
  *
  * @param ec  ICU error code.  Nothing is done if it already indicates
  *            a failure on entry; otherwise it is passed on to
  *            uprv_sortArray().
  */
void UVector::sorti(UErrorCode &ec) {
    if (U_FAILURE(ec)) {
        return;
    }
    uprv_sortArray(elements, count, sizeof(UHashTok),
                   sortiComparator, NULL, FALSE, &ec);
}
/**
  * Sort the vector in place with a user supplied comparator.
  *
  * The comparator handling needs explanation: the function type used by
  * UVector (as defined for sortedInsert()) differs from the signature
  * required by uprv_sortArray().  The UVector comparator is therefore
  * handed to uprv_sortArray() through its context pointer, and the
  * sortComparator() wrapper calls back to the original user function.
  *
  * An additional twist is that it is not safe to pass a pointer-to-function
  * as a (void *) data pointer, so what is passed is a (data) pointer to the
  * local pointer-to-function variable 'compare'.
  *
  * @param compare  The user's comparison function.
  * @param ec       ICU error code.  Nothing is done if it already indicates
  *                 a failure on entry.
  */
void UVector::sort(USortComparator *compare, UErrorCode &ec) {
    if (U_FAILURE(ec)) {
        return;
    }
    uprv_sortArray(elements, count, sizeof(UHashTok),
                   sortComparator, &compare, FALSE, &ec);
}
U_NAMESPACE_END

View File

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2006, International Business Machines
* Copyright (C) 1999-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -245,6 +245,20 @@ public:
*/
void sortedInsert(int32_t obj, USortComparator *compare, UErrorCode& ec);
/**
* Sort the contents of the vector, assuming that the contents of the
* vector are of type int32_t.
*/
void sorti(UErrorCode &ec);
/**
* Sort the contents of this vector, using a caller-supplied function
* to do the comparisons. (It's confusing that
* UVector's USortComparator function is different from the
* UComparator function type defined in uarrsort.h)
*/
void sort(USortComparator *compare, UErrorCode &ec);
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
*/

View File

@ -10170,7 +10170,7 @@ then
fi
# output the Makefiles
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@ -10807,6 +10807,7 @@ do
"tools/icuswap/Makefile") CONFIG_FILES="$CONFIG_FILES tools/icuswap/Makefile" ;;
"tools/pkgdata/Makefile") CONFIG_FILES="$CONFIG_FILES tools/pkgdata/Makefile" ;;
"tools/tzcode/Makefile") CONFIG_FILES="$CONFIG_FILES tools/tzcode/Makefile" ;;
"tools/gencfu/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencfu/Makefile" ;;
"test/Makefile") CONFIG_FILES="$CONFIG_FILES test/Makefile" ;;
"test/compat/Makefile") CONFIG_FILES="$CONFIG_FILES test/compat/Makefile" ;;
"test/testdata/Makefile") CONFIG_FILES="$CONFIG_FILES test/testdata/Makefile" ;;

View File

@ -1116,6 +1116,7 @@ AC_CONFIG_FILES([icudefs.mk \
tools/icuswap/Makefile \
tools/pkgdata/Makefile \
tools/tzcode/Makefile \
tools/gencfu/Makefile \
test/Makefile \
test/compat/Makefile \
test/testdata/Makefile \

View File

@ -230,6 +230,11 @@ BRS_SRC_FILES = $(BRS_SRC:%=$(BRKSRCDIR)/%)
INSTALLED_BRS_FILES = $(BRK_RES_SOURCE:%.txt=%) $(BRK_RES_SOURCE_LOCAL:%.txt=%)
endif
## Confusables (Spoofing) files
ALL_CFU_SOURCE=$(UNICODEDATADIR)/confusables.txt $(UNICODEDATADIR)/confusablesWholeScript.txt
CFU_FILES_SHORT=confusables.cfu
CFU_FILES=$(BUILDDIR)/$(CFU_FILES_SHORT)
## UCM files
-include $(UCMSRCDIR)/ucmcore.mk
-include $(UCMSRCDIR)/ucmfiles.mk
@ -331,10 +336,10 @@ SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp)
SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp)
## All generated files
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES)
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) $(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE)
# a list to use in the .lst files (package-relative)
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT)
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu unorm.icu
UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%)
@ -452,6 +457,20 @@ $(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(BINDIR)/genctd$(EXEEXT) $(DAT_FILES)
$(INVOKE) $(BINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
#################################################### CFU
# CFU FILES
# Build the confusables data file from the two Unicode confusables sources.
# Note: gencfu requires two input files to produce a single output file.
# There will be exactly one target file and two source files, so a pattern
# rule cannot be used here.
# $(word n, ...) selects the nth word of ALL_CFU_SOURCE: word 1 is passed to
# gencfu with -r and word 2 with -w, so the order of ALL_CFU_SOURCE matters.
# There must be a nicer way to do this.
# The three echo commands only print the values of the CFU variables as a
# build diagnostic; they do not contribute to the generated file.
$(CFU_FILES): $(ALL_CFU_SOURCE) $(BINDIR)/gencfu$(EXEEXT) $(DAT_FILES)
$(INVOKE) echo ALL_CFU_SOURCE: $(ALL_CFU_SOURCE)
$(INVOKE) echo CFU_FILES: $(CFU_FILES)
$(INVOKE) echo CFU_FILES_SHORT: $(CFU_FILES_SHORT)
$(INVOKE) $(BINDIR)/gencfu -c -i $(BUILDDIR) -r $(word 1,$(ALL_CFU_SOURCE)) -w $(word 2,$(ALL_CFU_SOURCE)) -o $@
#################################################### CNV
# CNV FILES
$(BUILDDIR)/%.cnv: $(UCMSRCDIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)

View File

@ -422,12 +422,13 @@ uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(IC
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
-@erase "$(ICUTMP)\$(ICUPKG).dat"
!ELSE
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES)
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
@echo Building icu data
cd "$(ICUBLD_PKG)"
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
pnames.icu
unames.icu
confusables.cfu
$(ICUCOL)\ucadata.icu
$(ICUCOL)\invuca.icu
cnvalias.icu
@ -486,6 +487,7 @@ CLEAN : GODATA
-@erase "*.res"
-@erase "*.spp"
-@erase "*.txt"
-@erase "*.cfu"
@cd "$(ICUBLD_PKG)\$(ICUBRK)"
-@erase "*.brk"
-@erase "*.ctd"
@ -497,7 +499,7 @@ CLEAN : GODATA
@cd "$(ICUBLD_PKG)\$(ICURBNF)"
-@erase "*.res"
-@erase "*.txt"
@cd "$(ICUBLD_PKG)\$(ICUTRNS)"
@cd "$(ICUBLD_PKG)\$(ICUTRNS)"
-@erase "*.res"
@cd "$(ICUOUT)"
-@erase "*.dat"
@ -673,6 +675,12 @@ res_index:table(nofallback) {
@echo Creating $@
@"$(ICUTOOLS)\gensprep\$(CFG)\gensprep" -s $(<D) -d "$(ICUBLD_PKG)" -b $(@B) -m "$(ICUUNIDATA)" -u 3.2.0 $(<F)
# Confusables .cfu file generation
# Can't use an inference rule because two .txt source files combine to produce a single .cfu output file
"$(ICUBLD_PKG)\confusables.cfu": "$(ICUUNIDATA)\confusables.txt" "$(ICUUNIDATA)\confusablesWholeScript.txt" "$(ICUTOOLS)\gencfu\$(CFG)\gencfu.exe"
@echo Creating $@
@"$(ICUTOOLS)\gencfu\$(CFG)\gencfu" -c -r "$(ICUUNIDATA)\confusables.txt" -w "$(ICUUNIDATA)\confusablesWholeScript.txt" -o $@ -i "$(ICUBLD_PKG)"
!IFDEF ICUDATA_ARCHIVE
"$(ICUDATA_SOURCE_ARCHIVE)": CREATE_DIRS $(ICUDATA_ARCHIVE) "$(ICUTOOLS)\icupkg\$(CFG)\icupkg.exe"
"$(ICUTOOLS)\icupkg\$(CFG)\icupkg" -t$(U_ICUDATA_ENDIAN_SUFFIX) "$(ICUDATA_ARCHIVE)" "$(ICUDATA_SOURCE_ARCHIVE)"

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -81,7 +81,8 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o \
uspoof.o uspoof_impl.o uspoof_build.o uspoof_buildconf.o uspoof_buildwsconf.o
## Header files to install
HEADERS = $(srcdir)/unicode/*.h

File diff suppressed because it is too large Load Diff

View File

@ -29,6 +29,8 @@
#ifdef XP_CPLUSPLUS
#include "unicode/unistr.h"
#include "unicode/uniset.h"
U_NAMESPACE_USE
#endif
@ -133,8 +135,8 @@ typedef enum USpoofChecks {
USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,
/** Modifier for single, mixed & whole script checks.
Selects between Lower Case Confusable (0) and
Any Case Confusable (1). */
Selects between Lower Case Confusable and
Any Case Confusable. */
USPOOF_ANY_CASE = 8,
/** Check that an identifier contains only characters from a
@ -146,15 +148,13 @@ typedef enum USpoofChecks {
/** Check an identifier for the presence of invisible characters,
* such as zero-width spaces, or character sequences that are
* likely not to display, such as multiple occurrences of the same
* non-spacing mark. This does not test the input string as a whole
* non-spacing mark. This check does not test the input string as a whole
* for conformance to any particular syntax for identifiers.
*/
USPOOF_INVISIBLE = 32,
USPOOF_LOCALE_LIMIT = 64,
USPOOF_CHAR_LIMIT = 128,
USPOOF_CHAR_LIMIT = 64,
USPOOF_ALL_CHECKS = 0x7f
};
} USpoofChecks;
/**
@ -298,10 +298,20 @@ uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
* Supplying an empty string removes all restrictions;
* characters from any script will be allowed.
*
* The USPOOF_LOCALE_LIMIT test is automatically enabled for this
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
* USpoofChecker when calling this function with a non-empty set
* of locales.
*
* The Unicode Set of characters that will be allowed is accessible
* via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales()
* will <i>replace</i> any previously applied set of allowed characters.
*
* Adjustments, such as additions or deletions of certain classes of characters,
* can be made to the result of uspoof_setAllowedLocales() by
* fetching the resulting set with uspoof_getAllowedChars(),
* manipulating it with the Unicode Set API, then resetting the
* spoof detector's limits with uspoof_setAllowedChars().
*
* @param sc The USpoofChecker
* @param localesList A list of locales, from which the language
* and associated script are extracted. The list
@ -318,6 +328,8 @@ uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode
* to be checked. If no limitations on scripts have been specified,
* an empty string will be returned.
*
* uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
*
* The format of the returned list is that of an HTTP Accept-Language
* header field, but it may not be identical to the original string passed
* to uspoof_setAllowedLocales(); the string may be
@ -339,7 +351,8 @@ uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
/**
* Limit the acceptable characters to those specified by a Unicode Set.
* Any previously specified character limit is
* is replaced by the new settings.
* replaced by the new settings. This includes limits on
* characters that were set with the uspoof_setAllowedLocales() function.
*
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
* USpoofChecker by this function.
@ -381,14 +394,15 @@ uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status)
* the USPOOF_CHAR_LIMIT test.
*/
U_DRAFT const USet * U_EXPORT2
uspoof_getAllowedChars(USpoofChecker *sc, UErrorCode *status);
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
#ifdef XP_CPLUSPLUS
/**
* Limit the acceptable characters to those specified by a Unicode Set.
* Any previously specified character limit is
* is replaced by the new settings.
* replaced by the new settings. This includes limits on
* characters that were set with the uspoof_setAllowedLocales() function.
*
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
* USpoofChecker by this function.
@ -425,7 +439,7 @@ uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCo
* the USPOOF_CHAR_LIMIT test.
*/
U_DRAFT const UnicodeSet * U_EXPORT2
uspoof_getAllowedUnicodeSet(USpoofChecker *sc, UErrorCode *status);
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
#endif
@ -441,10 +455,11 @@ uspoof_getAllowedUnicodeSet(USpoofChecker *sc, UErrorCode *status);
* 16 bit UTF-16 code units, or -1 if the string is
* zero terminated.
* @param position An out parameter that receives the index of the
* first string position that fails one of the checks.
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes all of the requested checks the
* If the string passes the requested checks the
* parameter value will not be set.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
@ -473,15 +488,18 @@ uspoof_check(const USpoofChecker *sc,
* @param length the length of the string to be checked, or -1 if the string is
* zero terminated.
* @param position An out parameter that receives the index of the
* first string position that fails one of the checks.
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes all of the requested checks the
* If the string passes the requested checks the
* parameter value will not be set.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
* Spoofing or security issues detected with the input string are
* not reported here, but through the function's return value.
* If the input contains invalid UTF-8 sequences,
* a status of U_INVALID_CHAR_FOUND will be returned.
* @return An integer value with bits set for any potential security
* or spoofing issues detected. The bits are defined by
* enum USpoofChecks. Zero is returned if no issues
@ -504,10 +522,11 @@ uspoof_checkUTF8(const USpoofChecker *sc,
* @param sc The USpoofChecker
* @param text A UnicodeString to be checked for possible security issues.
* @param position An out parameter that receives the index of the
* first string position that fails one of the checks.
* first string position that fails the allowed character
* limitation checks.
* This parameter may be null if the position information
* is not needed.
* If the string passes all of the requested checks the
* If the string passes the requested checks the
* parameter value will not be set.
* @param status The error code, set if an error occurred while attempting to
* perform the check.
@ -645,7 +664,7 @@ U_DRAFT int32_t U_EXPORT2
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
const U_NAMESPACE_QUALIFIER UnicodeString &s1,
const U_NAMESPACE_QUALIFIER UnicodeString &s2,
int32_t *position,
int32_t *position,
UErrorCode *status);
#endif
@ -684,7 +703,7 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
*/
U_DRAFT int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
USpoofChecks type,
uint32_t type,
const UChar *s, int32_t length,
UChar *dest, int32_t destCapacity,
UErrorCode *status);
@ -726,7 +745,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
*/
U_DRAFT int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
USpoofChecks type,
uint32_t type,
const char *s, int32_t length,
char *dest, int32_t destCapacity,
UErrorCode *status);
@ -762,7 +781,7 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
*/
U_DRAFT UnicodeString & U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
USpoofChecks type,
uint32_t type,
const UnicodeString &s,
UnicodeString &dest,
UErrorCode *status);

View File

@ -0,0 +1,540 @@
/*
***************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* file name: uspoof.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2008Feb13
* created by: Andy Heninger
*
* Unicode Spoof Detection
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uassert.h"
#include <stdio.h> // debug
U_NAMESPACE_USE
/**
 * Create a USpoofChecker backed by the default spoof data.
 *
 * @param status  ICU error code.  Nothing is done if it already indicates a
 *                failure on entry.  Set to U_MEMORY_ALLOCATION_ERROR if the
 *                checker object could not be allocated.
 * @return        The new checker, or NULL if status indicates a failure.
 */
U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
    // A NULL result from new must be reported, not silently returned with a
    // success status; uspoof_openFromSerialized() reports it the same way.
    if (si == NULL && U_SUCCESS(*status)) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*status)) {
        delete si;
        si = NULL;
    }
    return reinterpret_cast<USpoofChecker *>(si);
}
/**
 * Create a USpoofChecker from a block of previously serialized spoof data.
 *
 * @param data           The serialized data.
 * @param length         The length of the data, passed on to SpoofData.
 * @param pActualLength  Optional out parameter; on success receives the
 *                       fLength field of the data's raw header.  May be NULL.
 * @param status         ICU error code.  Nothing is done if it already
 *                       indicates a failure on entry.  Set to
 *                       U_MEMORY_ALLOCATION_ERROR if either object could not
 *                       be allocated.
 * @return               The new checker, or NULL on failure.
 */
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
                          UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    SpoofData *spoofData = new SpoofData(data, length, *status);
    SpoofImpl *impl = new SpoofImpl(spoofData, *status);

    // An allocation failure only needs reporting when nothing else has
    // already gone wrong.
    if (U_SUCCESS(*status) && (spoofData == NULL || impl == NULL)) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*status)) {
        // NOTE(review): both objects are deleted independently here; confirm
        // that SpoofImpl's destructor does not also release spoofData.
        delete spoofData;
        delete impl;
        return NULL;
    }
    if (pActualLength != NULL) {
        *pActualLength = spoofData->fRawData->fLength;
    }
    return reinterpret_cast<USpoofChecker *>(impl);
}
/**
 * Create a copy of an existing USpoofChecker.
 *
 * @param sc      The checker to copy; validated with
 *                SpoofImpl::validateThis().
 * @param status  ICU error code.  Set to U_MEMORY_ALLOCATION_ERROR if the
 *                copy could not be allocated.
 * @return        The new checker, or NULL if validation or copying failed.
 */
U_CAPI USpoofChecker * U_EXPORT2
uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
    if (src == NULL) {
        return NULL;
    }
    SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
    // A NULL result from new must be reported, not silently returned with a
    // success status; uspoof_openFromSerialized() reports it the same way.
    if (result == NULL && U_SUCCESS(*status)) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*status)) {
        delete result;
        result = NULL;
    }
    return reinterpret_cast<USpoofChecker *>(result);
}
/**
 * Destroy a USpoofChecker.
 *
 * The handle is run through SpoofImpl::validateThis() first and whatever
 * that returns is deleted.  Errors from validation are discarded because
 * this function has no way to report them.
 *
 * @param sc  The checker to close.
 */
U_CAPI void U_EXPORT2
uspoof_close(USpoofChecker *sc) {
    UErrorCode localStatus = U_ZERO_ERROR;
    SpoofImpl *impl = SpoofImpl::validateThis(sc, localStatus);
    delete impl;
}
/**
 * Select the set of checks that this checker will perform.
 *
 * @param sc      The USpoofChecker.
 * @param checks  Bit set of USpoofChecks values.  Any bit outside
 *                USPOOF_ALL_CHECKS is rejected.
 * @param status  ICU error code; set to U_ILLEGAL_ARGUMENT_ERROR when
 *                checks contains an unknown bit, in which case the
 *                checker is left unchanged.
 */
U_CAPI void U_EXPORT2
uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
    SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl == NULL) {
        return;
    }

    // Only known check bits are acceptable.
    const int32_t unknownBits = checks & ~USPOOF_ALL_CHECKS;
    if (unknownBits != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    impl->fChecks = checks;
}
/**
 * Return the set of checks that this checker is configured to perform.
 *
 * @param sc      The USpoofChecker.
 * @param status  ICU error code, passed to SpoofImpl::validateThis().
 * @return        The checker's fChecks value, or 0 if validation failed.
 */
U_CAPI int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl != NULL) {
        return impl->fChecks;
    }
    return 0;
}
/**
 * Limit the acceptable characters to those used by a list of locales.
 *
 * Not yet implemented: the checker handle is validated (which may set
 * status), but the locales list is ignored and the checker is not changed.
 *
 * @param sc      The USpoofChecker.
 * @param status  ICU error code, passed to SpoofImpl::validateThis().
 */
U_CAPI void U_EXPORT2
uspoof_setAllowedLocales(USpoofChecker *sc, const char * /*localesList*/, UErrorCode *status) {
    SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl != NULL) {
        // TODO: implement.  Nothing is done with the checker yet.
    }
}
/**
 * C API flavor of uspoof_getAllowedUnicodeSet(): returns the same set,
 * viewed through the USet handle type.
 *
 * @param sc      The USpoofChecker.
 * @param status  ICU error code, passed through unchanged.
 * @return        The allowed characters set, or NULL on failure.
 */
U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
    return reinterpret_cast<const USet *>(uspoof_getAllowedUnicodeSet(sc, status));
}
/**
 * Return the set of characters this checker accepts.
 *
 * @param sc      The USpoofChecker.
 * @param status  ICU error code, passed to SpoofImpl::validateThis().
 * @return        The checker's fAllowedCharsSet, still owned by the checker,
 *                or NULL if validation failed.
 */
U_CAPI const UnicodeSet * U_EXPORT2
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    const UnicodeSet *allowed = NULL;
    if (impl != NULL) {
        allowed = impl->fAllowedCharsSet;
    }
    return allowed;
}
/**
 * C API flavor of uspoof_setAllowedUnicodeSet(): the USet handle is
 * reinterpreted as a UnicodeSet and forwarded.
 *
 * @param sc      The USpoofChecker.
 * @param chars   The set of characters to allow.
 * @param status  ICU error code, passed through unchanged.
 */
U_CAPI void U_EXPORT2
uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
    uspoof_setAllowedUnicodeSet(sc, reinterpret_cast<const UnicodeSet *>(chars), status);
}
/**
 * Limit the acceptable characters to those in a UnicodeSet.
 *
 * The checker keeps its own frozen clone of the set; any previously
 * installed set is deleted.  The USPOOF_CHAR_LIMIT check is switched on.
 * On any failure the checker is left unchanged.
 *
 * @param sc      The USpoofChecker.
 * @param chars   The set of characters to allow.  A NULL or bogus set is
 *                rejected with U_ILLEGAL_ARGUMENT_ERROR.
 * @param status  ICU error code.  Set to U_MEMORY_ALLOCATION_ERROR if the
 *                set could not be cloned.
 */
U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return;
    }
    // Reject NULL before dereferencing it.
    if (chars == NULL || chars->isBogus()) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    if (clonedSet == NULL || clonedSet->isBogus()) {
        // A bogus clone is still an allocated object; free it rather than
        // leaking it.  Deleting NULL is harmless.
        delete clonedSet;
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    clonedSet->freeze();
    delete This->fAllowedCharsSet;
    This->fAllowedCharsSet = clonedSet;
    This->fChecks |= USPOOF_CHAR_LIMIT;
}
/**
 * Check a UTF-16 string against the tests enabled in the checker's fChecks.
 *
 * @param sc        The USpoofChecker; validated with SpoofImpl::validateThis().
 * @param text      The UTF-16 text to check.
 * @param length    The number of UChars in text, or -1 if text is NUL
 *                  terminated.  A value below -1 sets U_ILLEGAL_ARGUMENT_ERROR.
 * @param position  Optional out parameter.  Written only when one of the
 *                  checks recorded a failure position; otherwise left
 *                  untouched.  May be NULL.
 * @param status    ICU error code.
 * @return          A bit set of USpoofChecks values, one bit per failed
 *                  check.  0 if nothing failed, or if the checker or length
 *                  argument was rejected.
 */
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
const UChar *text, int32_t length,
int32_t *position,
UErrorCode *status) {
const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
if (This == NULL) {
return 0;
}
if (length < -1) {
*status = U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
if (length == -1) {
// It's not worth the bother to handle nul terminated strings everywhere.
// Just get the length and be done with it.
length = u_strlen(text);
}
int32_t result = 0;
// 0x7fffffff is the "no failure position recorded" sentinel; it is tested
// again just before returning.
int32_t failPos = 0x7fffffff; // TODO: do we have a #define for max int32?
// A count of the number of non-Common or inherited scripts.
// Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCRIPT_CONFUSABLE tests.
// Share the computation when possible. scriptCount == -1 means that we haven't
// done it yet.
int32_t scriptCount = -1;
if ((This->fChecks) & USPOOF_SINGLE_SCRIPT) {
// scriptScan() receives failPos by reference and may update it.
scriptCount = This->scriptScan(text, length, failPos, *status);
// printf("scriptCount (clipped to 2) = %d\n", scriptCount);
if ( scriptCount >= 2) {
// Note: scriptCount == 2 covers all cases of the number of scripts >= 2
result |= USPOOF_SINGLE_SCRIPT;
}
}
if (This->fChecks & USPOOF_CHAR_LIMIT) {
// Walk the text one code point at a time and stop at the first one that
// is not in the allowed set.
int32_t i;
UChar32 c;
for (i=0; i<length ;) {
U16_NEXT(text, i, length, c);
if (!This->fAllowedCharsSet->contains(c)) {
result |= USPOOF_CHAR_LIMIT;
// NOTE(review): U16_NEXT has already advanced i, so the index recorded
// here is the one just past the disallowed code point, not its start.
// Confirm this is the intended meaning of 'position'.
if (i < failPos) {
failPos = i;
}
break;
}
}
}
// TODO: add USPOOF_INVISIBLE check
if (This->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
// The basic test is the same for both whole and mixed script confusables.
// Compute the set of scripts that every input character has a confusable in.
// For this computation an input character is always considered to be
// confusable with itself in its own script.
// If the number of such scripts is two or more, and the input consisted of
// characters all from a single script, we have a whole script confusable.
// (The two scripts will be the original script and the one that is confusable)
// If the number of such scripts >= one, and the original input contained characters from
// more than one script, we have a mixed script confusable. (We can transform
// some of the characters, and end up with a visually similar string all in
// one script.)
// NOTE(review): *status is not examined after the normalization or the
// scans below; confirm the helpers behave safely once status has failed.
NFKDBuffer normalizedInput(text, length, *status);
const UChar *nfkdText = normalizedInput.getBuffer();
int32_t nfkdLength = normalizedInput.getLength();
if (scriptCount == -1) {
// The SINGLE_SCRIPT check did not run, so the script count is still
// needed.  The position reported by this scan is deliberately discarded.
int32_t t;
scriptCount = This->scriptScan(text, length, t, *status);
}
ScriptSet scripts;
This->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
int32_t confusableScriptCount = scripts.countMembers();
//printf("confusableScriptCount = %d\n", confusableScriptCount);
if ((This->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
confusableScriptCount >= 2 &&
scriptCount == 1) {
result |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
}
if ((This->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
confusableScriptCount >= 1 &&
scriptCount > 1) {
result |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
}
}
// Report a position only if some check actually recorded one.
if (position != NULL && failPos != 0x7fffffff) {
*position = failPos;
}
return result;
}
/**
 * Check a UTF-8 string.  The text is converted to UTF-16 and handed to
 * uspoof_check(); a reported failure position is translated back into a
 * UTF-8 offset.
 *
 * @param sc        The USpoofChecker.
 * @param text      The UTF-8 text to check.
 * @param length    The length of text in bytes, or -1 if NUL terminated.
 * @param position  Optional out parameter that receives the UTF-8 offset
 *                  corresponding to the position reported by uspoof_check().
 *                  Left untouched if no position was reported.  May be NULL.
 * @param status    ICU error code.  Nothing is done if it already indicates
 *                  a failure on entry.  Conversion errors from
 *                  u_strFromUTF8() are reported here, and
 *                  U_MEMORY_ALLOCATION_ERROR is set if a conversion buffer
 *                  could not be allocated.
 * @return          The result of uspoof_check(), or 0 on any failure.
 */
U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc,
                 const char *text, int32_t length,
                 int32_t *position,
                 UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *text16 = stackBuf;
    int32_t len16 = 0;

    // First attempt: convert into the stack buffer.
    u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
    if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
        return 0;
    }
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        // Too long for the stack buffer; len16 holds the required length.
        // Allocate room for the text plus a terminating NUL and convert again.
        text16 = static_cast<UChar *>(uprv_malloc((len16 + 1) * sizeof(UChar)));
        if (text16 == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        u_strFromUTF8(text16, len16+1, NULL, text, length, status);
    }

    // From here on there is a single exit path, so that a heap buffer is
    // released even when the check itself fails.
    int32_t result = 0;
    int32_t position16 = -1;
    if (U_SUCCESS(*status)) {
        result = uspoof_check(sc, text16, len16, &position16, status);
    }

    if (U_FAILURE(*status)) {
        result = 0;
    } else if (position != NULL && position16 > 0) {
        // Translate a UTF-16 based error position back to a UTF-8 offset.
        // u_strToUTF8() in preflight mode is an easy way to do it.
        // Preflighting reports U_BUFFER_OVERFLOW_ERROR by design, so it is
        // run against a scratch status to keep that out of the caller's
        // status.
        U_ASSERT(position16 <= len16);
        UErrorCode preflightStatus = U_ZERO_ERROR;
        u_strToUTF8(NULL, 0, position, text16, position16, &preflightStatus);
        if (U_FAILURE(preflightStatus) && preflightStatus != U_BUFFER_OVERFLOW_ERROR) {
            *status = preflightStatus;
            result = 0;
        }
    }

    if (text16 != stackBuf) {
        uprv_free(text16);
    }
    return result;
}
// uspoof_checkUnicodeString   UnicodeString flavor of uspoof_check().
//   Forwards the string's buffer and length, plus the caller's position and
//   status pointers, straight to uspoof_check() and returns its result.
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const U_NAMESPACE_QUALIFIER UnicodeString &text,
                          int32_t *position,
                          UErrorCode *status) {
    const UChar *chars = text.getBuffer();
    const int32_t charCount = text.length();
    return uspoof_check(sc, chars, charCount, position, status);
}
// uspoof_getSkeleton   Compute the confusability "skeleton" of a UTF-16 string.
//
//   The input is first normalized to NFKD, then every code point is replaced by
//   the mapping that confusableLookup() returns for the table selected by 'type'.
//
//   sc            the spoof checker supplying the confusable data.
//   type          0, USPOOF_SINGLE_SCRIPT_CONFUSABLE, USPOOF_ANY_CASE, or both
//                 flags or-ed together; selects one of the four mapping tables.
//   s, length     the input string; length may be -1 for NUL terminated input.
//   dest,
//   destCapacity  the output buffer, capacity in UChars. dest may be NULL only
//                 when destCapacity is 0 (preflighting).
//   status        ICU error code. Set to U_BUFFER_OVERFLOW_ERROR if the result
//                 does not fit, U_STRING_NOT_TERMINATED_WARNING if it fits
//                 exactly with no room for the terminating NUL.
//
//   Returns the length of the skeleton, in UChars. On overflow this is the
//   required length, and the contents of dest are unspecified.
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {
    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
        (destCapacity>0 && dest==NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t tableMask = 0;
    switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFKD transform of the user supplied input
    UChar nfkdBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfkdInput = nfkdBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfkdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        // The overflow status must be cleared, otherwise the retry below
        // would return immediately without doing any work.
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
                                        nfkdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        if (nfkdInput != nfkdBuf) {
            uprv_free(nfkdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the mapping to the NFKD form string
    int32_t inputIndex = 0;
    int32_t resultLen = 0;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        // "<=" so that a result that exactly fills the buffer is actually stored;
        // that case is reported below as U_STRING_NOT_TERMINATED_WARNING.
        if (resultLen + replaceLen <= destCapacity) {
            int32_t i;
            for (i=0; i<replaceLen; i++) {
                dest[resultLen++] = buf[i];
            }
        } else {
            // Storing the transformed string would overflow the dest buffer.
            //   Don't bother storing anything, just sum up the required buffer size.
            //   (We don't guarantee that a truncated buffer is filled to its end)
            resultLen += replaceLen;
        }
    }

    if (resultLen < destCapacity) {
        dest[resultLen] = 0;
    } else if (resultLen == destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    if (nfkdInput != nfkdBuf) {
        uprv_free(nfkdInput);
    }
    return resultLen;
}
// uspoof_getSkeletonUnicodeString   UnicodeString flavor of uspoof_getSkeleton().
//
//   dest is emptied, then set to the skeleton of s. A small stack buffer is
//   tried first; if uspoof_getSkeleton() reports overflow, a heap buffer of the
//   required size is allocated and the operation is repeated.
//
//   On failure dest is left empty and *status holds the error.
//   Returns dest.
U_CAPI UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
                                const UnicodeString &s,
                                UnicodeString &dest,
                                UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return dest;
    }
    dest.remove();

    const UChar *str = s.getBuffer();
    int32_t      strLen = s.length();
    UChar        smallBuf[100];
    UChar       *buf = smallBuf;
    int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, 100, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        // outputSize counts UChars, so scale by sizeof(UChar) for the allocation.
        buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
        if (buf == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
        // Clear the overflow status, otherwise the retry would be a no-op.
        *status = U_ZERO_ERROR;
        uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
    }
    if (U_SUCCESS(*status)) {
        dest.setTo(buf, outputSize);
    }

    if (buf != smallBuf) {
        uprv_free(buf);
    }
    return dest;
}
// uspoof_getSkeletonUTF8   UTF-8 flavor of uspoof_getSkeleton().
//
//   Lacking a UTF-8 normalization API, just converting the input to
//   UTF-16 seems as good an approach as any.  In typical use, input will
//   be an identifier, which is to say not too long for stack buffers.
//
//   s, length     UTF-8 input; length in bytes, or -1 if NUL terminated.
//   dest,
//   destCapacity  UTF-8 output buffer and its capacity in bytes.
//   status        ICU error code; overflow of dest is reported by u_strToUTF8().
//
//   Returns the length of the skeleton in UTF-8 bytes, or 0 if a failure
//   occurred before the final conversion.
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
                       uint32_t type,
                       const char *s,  int32_t length,
                       char *dest, int32_t destCapacity,
                       UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    // Buffers for the UChar form of the input and skeleton strings.
    UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar   *inBuf = smallInBuf;
    UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar   *outBuf = smallOutBuf;

    int32_t  lengthInUChars = 0;
    int32_t  skelLengthInUChars = 0;
    int32_t  skelLengthInUTF8 = 0;

    // Step 1: UTF-8 input -> UTF-16, growing to a heap buffer if needed.
    u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
                  s, length, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        inBuf = static_cast<UChar *>(uprv_malloc((lengthInUChars+1)*sizeof(UChar)));
        if (inBuf == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            *status = U_ZERO_ERROR;
            u_strFromUTF8(inBuf, lengthInUChars+1, &lengthInUChars,
                          s, length, status);
        }
    }

    // Step 2: compute the UTF-16 skeleton, again growing the buffer if needed.
    // uspoof_getSkeleton() returns at once if *status already holds a failure.
    skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
                                            outBuf, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        outBuf = static_cast<UChar *>(uprv_malloc((skelLengthInUChars+1)*sizeof(UChar)));
        if (outBuf == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            *status = U_ZERO_ERROR;
            skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
                                                    outBuf, skelLengthInUChars+1, status);
        }
    }

    // Step 3: UTF-16 skeleton -> caller's UTF-8 buffer.
    if (U_SUCCESS(*status)) {
        u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
                    outBuf, skelLengthInUChars, status);
    }

    // The heap buffers came from uprv_malloc(), so release them with uprv_free().
    if (inBuf != NULL && inBuf != smallInBuf) {
        uprv_free(inBuf);
    }
    if (outBuf != NULL && outBuf != smallOutBuf) {
        uprv_free(outBuf);
    }
    return skelLengthInUTF8;
}
// uspoof_serialize   Copy the checker's raw binary spoof data into a caller buffer.
//
//   Returns the size in bytes of the raw data (its header's fLength field).
//   If capacity is smaller than that, nothing is copied and *status is set to
//   U_BUFFER_OVERFLOW_ERROR. Returns 0 if the checker fails validation.
U_CAPI int32_t U_EXPORT2
uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        U_ASSERT(U_FAILURE(*status));
        return 0;
    }

    const SpoofDataHeader *header = This->fSpoofData->fRawData;
    const int32_t bytesNeeded = header->fLength;
    if (capacity >= bytesNeeded) {
        uprv_memcpy(buf, header, bytesNeeded);
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    return bytesNeeded;
}

View File

@ -0,0 +1,81 @@
/*
***************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* file name: uspoof_build.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2008 Dec 8
* created by: Andy Heninger
*
* Unicode Spoof Detection Data Builder
* Builder-related functions are kept in separate files so that applications not needing
* the builder can more easily exclude them, typically by means of static linking.
*
* There are three relatively independent sets of Spoof data,
* Confusables,
* Whole Script Confusables
* ID character extensions.
*
* The data tables for each are built separately, each from its own definitions
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
#include "uassert.h"
#include "uarrsort.h"
#include "uspoof_buildconf.h"
#include "uspoof_buildwsconf.h"
#include <stdio.h> // DEBUG
U_NAMESPACE_USE
// The main data building function
//
//   Builds a spoof checker from the source (text) form of the confusable data.
//
//   confusables,
//   confusablesLen             text of the confusables data, UTF-8.
//   confusablesWholeScript,
//   confusablesWholeScriptLen  text of the whole script confusables data.
//   errorType                  optional output, reset to 0 on entry.
//   pe                         optional output, its fields are cleared on entry.
//   status                     ICU error code.
//
//   Returns the new checker, or NULL if anything failed.
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
                      int32_t *errorType, UParseError *pe, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (errorType!=NULL) {
        *errorType = 0;
    }
    if (pe != NULL) {
        pe->line = 0;
        pe->offset = 0;
        pe->preContext[0] = 0;
        pe->postContext[0] = 0;
    }

    // Set up a shell of a spoof detector, with empty data.
    SpoofData *newSpoofData = new SpoofData(*status);
    if (newSpoofData == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    SpoofImpl *This = new SpoofImpl(newSpoofData, *status);
    if (This == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        // No SpoofImpl exists to take charge of the data, so release it here.
        // NOTE(review): assumes SpoofData may be destroyed directly with delete;
        //               confirm against its declaration in uspoof_impl.h.
        delete newSpoofData;
        return NULL;
    }

    // Compile the binary data from the source (text) format.
    // Both builders are given the status and do nothing once it holds a failure.
    ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
    buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);

    if (U_FAILURE(*status)) {
        delete This;
        This = NULL;
    }
    return (USpoofChecker *)This;
}

View File

@ -0,0 +1,593 @@
/*
******************************************************************************
*
* Copyright (C) 2008-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_buildconf.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan05 (refactoring earlier files)
* created by: Andy Heninger
*
* Internal classes for compiling confusable data into its binary (runtime) form.
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
#include "uassert.h"
#include "uarrsort.h"
#include "uspoof_buildconf.h"
#include "stdio.h" // DEBUG. Remove.
U_NAMESPACE_USE
//---------------------------------------------------------------------
//
// buildConfusableData Compile the source confusable data, as defined by
// the Unicode data file confusables.txt, into the binary
// structures used by the confusable detector.
//
// The binary structures are described in uspoof_impl.h
//
// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
// tables. Each maps from a UChar32 to a String.
//
// 2. Sort all of the strings encountered by length, since they will need to
// be stored in that order in the final string table.
//
// 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
// list because that will be the ordering of our runtime table.
//
// 4. Generate the run time string table. This is generated before the key & value
// tables because we need the string indexes when building those tables.
//
// 5. Build the run-time key and value tables. These are parallel tables, and are built
// at the same time
//
// Wrap the string s. The string table index starts out as zero and is
// filled in later, once the run time string table is laid out.
SPUString::SPUString(UnicodeString *s)
    : fStr(s),
      fStrTableIndex(0) {
}
// Destructor. The wrapped UnicodeString is deleted along with this object.
SPUString::~SPUString() {
delete fStr;
}
// Create an empty string pool: a vector holding the strings in order, plus a
// hash table keyed by UnicodeString for detecting duplicates.
// On failure, status is set and one or both members may be left NULL.
SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
    fVec = new UVector(status);
    if (fVec == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    fHash = uhash_open(uhash_hashUnicodeString,           // key hash function
                       uhash_compareUnicodeString,        // Key Comparator
                       NULL,                              // Value Comparator
                       &status);
}
// Destructor. Deletes every SPUString held in the vector, then the vector and
// the hash table themselves. Either member may be NULL if construction failed.
SPUStringPool::~SPUStringPool() {
    if (fVec != NULL) {
        for (int32_t i = fVec->size()-1; i >= 0; i--) {
            SPUString *s = static_cast<SPUString *>(fVec->elementAt(i));
            delete s;
        }
        delete fVec;
    }
    if (fHash != NULL) {
        uhash_close(fHash);
    }
}
// Number of strings currently held in the pool.
int32_t SPUStringPool::size() {
    const int32_t count = fVec->size();
    return count;
}
// Fetch the pool entry stored at position index of the underlying vector.
SPUString *SPUStringPool::getByIndex(int32_t index) {
    return (SPUString *)(fVec->elementAt(index));
}
// Ordering function for the strings in the string pool.
//   Shorter strings sort before longer ones; strings of equal length are
//   ordered by UnicodeString::compare().
//   Conforms to the type signature for a USortComparator in uvector.h
static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
    const SPUString *lhs = static_cast<const SPUString *>(left.pointer);
    const SPUString *rhs = static_cast<const SPUString *>(right.pointer);
    const int32_t lhsLength = lhs->fStr->length();
    const int32_t rhsLength = rhs->fStr->length();

    if (lhsLength != rhsLength) {
        return (lhsLength < rhsLength) ? -1 : 1;
    }
    return lhs->fStr->compare(*(rhs->fStr));
}
// Sort the pool's vector with SPUStringCompare: by length first, then by
// string comparison within a length. This changes the order seen by getByIndex().
void SPUStringPool::sort(UErrorCode &status) {
fVec->sort(SPUStringCompare, status);
}
// Add a string to the pool, eliminating duplicates.
//
//   src     the string to add. Ownership passes to the pool in every case:
//           it is either kept, or deleted (duplicate, or failure).
//   status  ICU error code.
//
//   Returns the pool's entry for the string: the existing one if an equal
//   string was already present, otherwise a newly created one.
//   Returns NULL on failure.
SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
    if (U_FAILURE(status)) {
        delete src;
        return NULL;
    }
    SPUString *hashedString = static_cast<SPUString *>(uhash_get(fHash, src));
    if (hashedString != NULL) {
        // Already in the pool; the caller's copy is not needed.
        delete src;
        return hashedString;
    }

    hashedString = new SPUString(src);
    if (hashedString == NULL) {
        delete src;
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    uhash_put(fHash, src, hashedString, &status);
    fVec->addElement(hashedString, status);
    if (U_FAILURE(status)) {
        // addElement() received the failure, so the vector does not hold the
        // new entry. Drop any hash table reference before deleting it.
        uhash_remove(fHash, src);
        delete hashedString;      // also deletes src
        return NULL;
    }
    return hashedString;
}
// Constructor. Sets up the empty intermediate collections used by a build:
// the four mapping hash tables, the key set, the key and value vectors and
// the string pool. Everything else starts out NULL and is created by build().
//
//   spImpl  the spoof checker whose data is being built.
//   status  ICU error code. If it holds a failure on entry nothing is
//           allocated; it is set if any allocation fails.
ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
    fSpoofImpl(spImpl),
    fInput(NULL),
    fSLTable(NULL),
    fSATable(NULL),
    fMLTable(NULL),
    fMATable(NULL),
    fKeySet(NULL),
    fKeyVec(NULL),
    fValueVec(NULL),
    fStringTable(NULL),
    fStringLengthsTable(NULL),
    stringPool(NULL),
    fParseLine(NULL),
    fParseHexNum(NULL),
    fLineNum(0)
{
    if (U_FAILURE(status)) {
        return;
    }
    fSLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
    fSATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
    fMLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
    fMATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
    fKeySet     = new UnicodeSet();
    fKeyVec     = new UVector(status);
    fValueVec   = new UVector(status);
    stringPool  = new SPUStringPool(status);
    if (U_SUCCESS(status) &&
        (fKeySet == NULL || fKeyVec == NULL || fValueVec == NULL || stringPool == NULL)) {
        // Report the failed allocation so that build() is never run against NULL members.
        status = U_MEMORY_ALLOCATION_ERROR;
    }
}
// Destructor. Releases all of the intermediate build data owned by the builder.
// The SpoofImpl being built (fSpoofImpl) is not touched here.
ConfusabledataBuilder::~ConfusabledataBuilder() {
// fInput is allocated with uprv_malloc() in build(), hence uprv_free().
uprv_free(fInput);
uregex_close(fParseLine);
uregex_close(fParseHexNum);
uhash_close(fSLTable);
uhash_close(fSATable);
uhash_close(fMLTable);
uhash_close(fMATable);
delete fKeySet;
delete fKeyVec;
delete fStringTable;
delete fStringLengthsTable;
delete fValueVec;
// The string pool's destructor deletes the SPUStrings that the hash tables point to.
delete stringPool;
}
// buildConfusableData   Static entry point for compiling confusable data.
//
//   Creates a temporary builder, runs the build into spImpl, and on failure
//   reports where the problem was.
//
//   spImpl          the spoof checker that receives the compiled data.
//   confusables,
//   confusablesLen  the source text, UTF-8.
//   errorType       optional; on failure set to USPOOF_SINGLE_SCRIPT_CONFUSABLE.
//   pe              optional; on failure pe->line is set to the number of the
//                   last input line that was processed.
//   status          ICU error code; nothing is done if it already holds a failure.
void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
    int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return;
    }
    ConfusabledataBuilder builder(spImpl, status);
    builder.build(confusables, confusablesLen, status);
    if (U_FAILURE(status)) {
        // The two outputs are independently optional; check each before writing.
        if (errorType != NULL) {
            *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
        }
        if (pe != NULL) {
            pe->line = builder.fLineNum;
        }
    }
}
void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
UErrorCode &status) {
// Convert the user input data from UTF-8 to UChar (UTF-16)
int32_t inputLen = 0;
if (U_FAILURE(status)) {
return;
}
u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
if (status != U_BUFFER_OVERFLOW_ERROR) {
return;
}
status = U_ZERO_ERROR;
fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
if (fInput == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
}
u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
// Regular Expression to parse a line from Confusables.txt. The expression will match
// any line. What was matched is determined by examining which capture groups have a match.
// Capture Group 1: the source char
// Capture Group 2: the replacement chars
// Capture Group 3-6 the table type, SL, SA, ML, or MA
// Capture Group 7: A blank or comment only line.
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
// Example Line from the confusables.txt source file:
// "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
fParseLine = uregex_openC(
"(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
"[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
"(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
"\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
"[ \\t]*(?:#.*?)?$" // Match any trailing #comment
"|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
"|^(.*?)$", // OR match any line, which catches illegal lines.
0, NULL, &status);
// Regular expression for parsing a hex number out of a space-separated list of them.
// Capture group 1 gets the number, with spaces removed.
fParseHexNum = uregex_openC("\\s*([0-9A-F]+)", 0, NULL, &status);
// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
// given the syntax of the input.
if (*fInput == 0xfeff) {
*fInput = 0x20;
}
// Parse the input, one line per iteration of this loop.
uregex_setText(fParseLine, fInput, inputLen, &status);
while (uregex_findNext(fParseLine, &status)) {
fLineNum++;
if (uregex_start(fParseLine, 7, &status) >= 0) {
// this was a blank or comment line.
continue;
}
if (uregex_start(fParseLine, 8, &status) >= 0) {
// input file syntax error.
status = U_PARSE_ERROR;
return;
}
// We have a good input line. Extract the key character and mapping string, and
// put them into the appropriate mapping table.
UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
uregex_end(fParseLine, 1, &status), status);
int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
UnicodeString *mapString = new UnicodeString();
if (mapString == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
while (uregex_findNext(fParseHexNum, &status)) {
UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
uregex_end(fParseHexNum, 1, &status), status);
mapString->append(c);
}
U_ASSERT(mapString->length() >= 1);
// Put the map (value) string into the string pool
// This a little like a Java intern() - any duplicates will be eliminated.
SPUString *smapString = stringPool->addString(mapString, status);
// Add the UChar -> string mapping to the appropriate table.
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
NULL;
U_ASSERT(table != NULL);
uhash_iput(table, keyChar, smapString, &status);
fKeySet->add(keyChar);
if (U_FAILURE(status)) {
return;
}
}
// Input data is now all parsed and collected.
// Now create the run-time binary form of the data.
//
// This is done in two steps. First the data is assembled into vectors and strings,
// for ease of construction, then the contents of these collections are dumped
// into the actual raw-bytes data storage.
// Build up the string array, and record the index of each string therein
// in the (build time only) string pool.
// Strings of length one are not entered into the strings array.
// At the same time, build up the string lengths table, which records the
// position in the string table of the first string of each length >= 4.
// (Strings in the table are sorted by length)
stringPool->sort(status);
fStringTable = new UnicodeString();
fStringLengthsTable = new UVector(status);
int32_t previousStringLength = 0;
int32_t previousStringIndex = 0;
int32_t poolSize = stringPool->size();
int32_t i;
for (i=0; i<poolSize; i++) {
SPUString *s = stringPool->getByIndex(i);
int32_t strLen = s->fStr->length();
int32_t strIndex = fStringTable->length();
U_ASSERT(strLen >= previousStringLength);
if (strLen == 1) {
// strings of length one do not get an entry in the string table.
// Keep the single string character itself here, which is the same
// convention that is used in the final run-time string table index.
s->fStrTableIndex = s->fStr->charAt(0);
} else {
if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
fStringLengthsTable->addElement(previousStringIndex, status);
fStringLengthsTable->addElement(previousStringLength, status);
}
s->fStrTableIndex = strIndex;
fStringTable->append(*(s->fStr));
}
previousStringLength = strLen;
previousStringIndex = strIndex;
}
// Make the final entry to the string lengths table.
// (it holds an entry for the _last_ string of each length, so adding the
// final one doesn't happen in the main loop because no longer string was encountered.)
if (previousStringLength >= 4) {
fStringLengthsTable->addElement(previousStringIndex, status);
fStringLengthsTable->addElement(previousStringLength, status);
}
// Construct the compile-time Key and Value tables
//
// For each key code point, check which mapping tables it applies to,
// and create the final data for the key & value structures.
//
// The four logical mapping tables are conflated into one combined table.
// If multiple logical tables have the same mapping for some key, they
// share a single entry in the combined table.
// If more than one mapping exists for the same key code point, multiple
// entries will be created in the table
for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
// It is an oddity of the UnicodeSet API that simply enumerating the contained
// code points requires a nested loop.
for (UChar32 keyChar=fKeySet->getRangeStart(range);
keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
}
}
// Put the assembled data into the flat runtime array
outputData(status);
// All of the intermediate allocated data belongs to the ConfusabledataBuilder
// object (this), and is deleted in the destructor.
return;
}
//
// outputData The confusable data has been compiled and stored in intermediate
// collections and strings. Copy it from there to the final flat
// binary array.
//
// Four sections are written, in this order: the key table, the value
// table, the string table, and the string lengths table. For each one
// the offset and size are recorded in the raw data header, and the
// corresponding pointer in the SpoofData is set.
//
// Note that as each section is added to the output data, the
// expand (reserveSpace()) function will likely relocate it in memory.
// Be careful with pointers. This is why rawData is fetched again
// after every call to reserveSpace().
//
void ConfusabledataBuilder::outputData(UErrorCode &status) {
U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
// The Key Table
// While copying the keys to the runtime array,
// also sanity check that they are sorted.
int32_t numKeys = fKeyVec->size();
int32_t *keys =
static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
if (U_FAILURE(status)) {
return;
}
int i;
int32_t previousKey = 0;
for (i=0; i<numKeys; i++) {
int32_t key = fKeyVec->elementAti(i);
// The low 24 bits are compared for ordering; the top 8 bits must not all be zero.
U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
U_ASSERT((key & 0xff000000) != 0);
keys[i] = key;
previousKey = key;
}
SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
// Section locations are stored in the header as byte offsets from its start.
rawData->fCFUKeys = (char *)keys - (char *)rawData;
rawData->fCFUKeysSize = numKeys;
fSpoofImpl->fSpoofData->fCFUKeys = keys;
// The Value Table, parallels the key table
int32_t numValues = fValueVec->size();
U_ASSERT(numKeys == numValues);
uint16_t *values =
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
if (U_FAILURE(status)) {
return;
}
for (i=0; i<numValues; i++) {
uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
// Values are narrowed to 16 bits when stored.
U_ASSERT(value < 0xffff);
values[i] = static_cast<uint16_t>(value);
}
rawData = fSpoofImpl->fSpoofData->fRawData;
rawData->fCFUStringIndex = (char *)values - (char *)rawData;
rawData->fCFUStringIndexSize = numValues;
fSpoofImpl->fSpoofData->fCFUValues = values;
// The Strings Table.
uint32_t stringsLength = fStringTable->length();
// Reserve an extra space so the string will be nul-terminated. This is
// only a convenience, for when debugging; it is not needed otherwise.
UChar *strings =
static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
if (U_FAILURE(status)) {
return;
}
fStringTable->extract(strings, stringsLength+1, status);
rawData = fSpoofImpl->fSpoofData->fRawData;
U_ASSERT(rawData->fCFUStringTable == 0);
rawData->fCFUStringTable = (char *)strings - (char *)rawData;
rawData->fCFUStringTableLen = stringsLength;
fSpoofImpl->fSpoofData->fCFUStrings = strings;
// The String Lengths Table
// While copying into the runtime array do some sanity checks on the values
// Each complete entry contains two fields, an index and an offset.
// Lengths should increase with each entry.
// Offsets should be less than the size of the string table.
int32_t lengthTableLength = fStringLengthsTable->size();
uint16_t *stringLengths =
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
if (U_FAILURE(status)) {
return;
}
int32_t destIndex = 0;
uint32_t previousLength = 0;
// The intermediate vector holds (offset, length) pairs, hence the step of two.
for (i=0; i<lengthTableLength; i+=2) {
uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
U_ASSERT(offset < stringsLength);
U_ASSERT(length < 40);
U_ASSERT(length > previousLength);
stringLengths[destIndex++] = static_cast<uint16_t>(offset);
stringLengths[destIndex++] = static_cast<uint16_t>(length);
previousLength = length;
}
rawData = fSpoofImpl->fSpoofData->fRawData;
rawData->fCFUStringLengths = (char *)stringLengths - (char *)rawData;
// Note: StringLengthsSize in the raw data is the number of complete entries,
// each consisting of a pair of 16 bit values, hence the divide by 2.
rawData->fCFUStringLengthsSize = lengthTableLength / 2;
fSpoofImpl->fSpoofData->fCFUStringLengths =
reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
}
// addKeyEntry Construction of the confusable Key and Mapping Values tables.
// This is an intermediate point in the building process.
// We already have the mappings in the hash tables fSLTable, etc.
// This function builds corresponding run-time style table entries into
// fKeyVec and fValueVec
//
// If the most recently added entries for keyChar already include one
// with the same mapping, only tableFlag is or-ed into that entry's key.
// Otherwise a new key/value pair is appended to the two vectors.
void ConfusabledataBuilder::addKeyEntry(
UChar32 keyChar, // The key character
UHashtable *table, // The table, one of SATable, MATable, etc.
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
UErrorCode &status) {
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
if (targetMapping == NULL) {
// No mapping for this key character.
// (This function is called for all four tables for each key char that
// is seen anywhere, so this no entry cases are very much expected.)
return;
}
// Check whether there is already an entry with the correct mapping.
// If so, simply set the flag in the keyTable saying that the existing entry
// applies to the table that we're doing now.
UBool keyHasMultipleValues = FALSE;
int32_t i;
// Scan backwards from the end of the key vector; the scan stops at the
// first entry that belongs to a different key character.
for (i=fKeyVec->size()-1; i>=0 ; i--) {
int32_t key = fKeyVec->elementAti(i);
// The low 24 bits of a key entry hold the code point.
if ((key & 0x0ffffff) != keyChar) {
// We have now checked all existing key entries for this key char (if any)
// without finding one with the same mapping.
break;
}
UnicodeString mapping = getMapping(i);
if (mapping == *(targetMapping->fStr)) {
// The run time entry we are currently testing has the correct mapping.
// Set the flag in it indicating that it applies to the new table also.
key |= tableFlag;
fKeyVec->setElementAt(key, i);
return;
}
keyHasMultipleValues = TRUE;
}
// Need to add a new entry to the binary data being built for this mapping.
// Includes adding entries to both the key table and the parallel values table.
int32_t newKey = keyChar | tableFlag;
if (keyHasMultipleValues) {
newKey |= USPOOF_KEY_MULTIPLE_VALUES;
}
// The length field of the key holds (mapping length - 1), capped at 3.
// getMapping() resolves the capped case through the string lengths table.
int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
if (adjustedMappingLength>3) {
adjustedMappingLength = 3;
}
newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
int32_t newData = targetMapping->fStrTableIndex;
fKeyVec->addElement(newKey, status);
fValueVec->addElement(newData, status);
// If the preceding key entry is for the same key character (but with a different mapping)
// set the multiple-values flag on it.
if (keyHasMultipleValues) {
int32_t previousKeyIndex = fKeyVec->size() - 2;
int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
fKeyVec->setElementAt(previousKey, previousKeyIndex);
}
}
// getMapping   Reconstruct the mapping string for one entry of the
//              key and value vectors being built.
//
//   index   position of the entry in fKeyVec / fValueVec.
//
//   The length field of the key selects how the value is interpreted:
//     0      the value is the single mapped character itself.
//     1, 2   the value is an index into fStringTable; the string has
//            (field + 1) characters.
//     3      the value is an index into fStringTable; the length is taken
//            from the first fStringLengthsTable entry whose index is not
//            less than the value.
UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
    const int32_t keyEntry    = fKeyVec->elementAti(index);
    const int32_t valueEntry  = fValueVec->elementAti(index);
    const int32_t lengthField = USPOOF_KEY_LENGTH_FIELD(keyEntry);

    if (lengthField == 0) {
        return UnicodeString(static_cast<UChar>(valueEntry));
    }
    if (lengthField == 1 || lengthField == 2) {
        return UnicodeString(*fStringTable, valueEntry, lengthField+1);
    }
    if (lengthField == 3) {
        int32_t mappingLength = 0;
        for (int32_t slot = 0; slot < fStringLengthsTable->size(); slot += 2) {
            const int32_t lastIndexWithLen = fStringLengthsTable->elementAti(slot);
            if (valueEntry <= lastIndexWithLen) {
                mappingLength = fStringLengthsTable->elementAti(slot+1);
                break;
            }
        }
        U_ASSERT(mappingLength>=3);
        return UnicodeString(*fStringTable, valueEntry, mappingLength);
    }

    // Any other value of the length field is unexpected.
    U_ASSERT(FALSE);
    return UnicodeString();
}

View File

@ -0,0 +1,123 @@
/*
******************************************************************************
*
* Copyright (C) 2008-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_buildconf.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan05
* created by: Andy Heninger
*
* Internal classes for compiling confusable data into its binary (runtime) form.
*/
#ifndef __USPOOF_BUILDCONF_H__
#define __USPOOF_BUILDCONF_H__
#include "uspoof_impl.h"
// SPUString
// Holds a string that is the result of one of the mappings defined
// by the confusable mapping data (confusables.txt from Unicode.org)
// Instances of SPUString exist during the compilation process only.
struct SPUString : public UMemory {
UnicodeString *fStr; // The actual string.
// Deleted by this object's destructor.
int32_t fStrTableIndex; // Index into the final runtime data for this string.
// (or, for length 1, the single string char itself,
// there being no string table entry for it.)
SPUString(UnicodeString *s); // Wraps s; fStrTableIndex starts at 0.
~SPUString();
};
// String Pool A utility class for holding the strings that are the result of
// the spoof mappings. These strings will ultimately end up in the
// run-time String Table.
// This is sort of like a sorted set of strings, except that ICU's anemic
// built-in collections don't support those, so it is implemented with a
// combination of a uhash and a UVector.
class SPUStringPool : public UMemory {
public:
SPUStringPool(UErrorCode &status);
// The destructor deletes every SPUString held by the pool.
~SPUStringPool();
// Add a string. Return the string from the table.
// If the input parameter string is already in the table, delete the
// input parameter and return the existing string.
SPUString *addString(UnicodeString *src, UErrorCode &status);
// Get the n-th string in the collection.
SPUString *getByIndex(int32_t i);
// Sort the contents; affects the ordering of getByIndex().
// The order is by string length first, then by string comparison.
void sort(UErrorCode &status);
// The number of strings in the pool.
int32_t size();
private:
UVector *fVec; // Elements are SPUString *
UHashtable *fHash; // Key: UnicodeString Value: SPUString
};
// class ConfusabledataBuilder
// An instance of this class exists while the confusable data is being built from source.
// It encapsulates the intermediate data structures that are used for building.
// It exports one static function, to do a confusable data build.
class ConfusabledataBuilder : public UMemory {
private:
SpoofImpl *fSpoofImpl; // The spoof checker receiving the compiled data.
UChar *fInput; // The source text, converted to UTF-16.
// The four mapping tables, filled in while parsing the input.
UHashtable *fSLTable;
UHashtable *fSATable;
UHashtable *fMLTable;
UHashtable *fMATable;
UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables.
// The binary data is first assembled into the following four collections, then
// copied to its final raw-memory destination.
UVector *fKeyVec;
UVector *fValueVec;
UnicodeString *fStringTable;
UVector *fStringLengthsTable;
SPUStringPool *stringPool;
URegularExpression *fParseLine; // Matches one line of the source text.
URegularExpression *fParseHexNum; // Picks one hex number out of a list of them.
int32_t fLineNum; // Count of input lines processed so far.
ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status);
~ConfusabledataBuilder();
void build(const char * confusables, int32_t confusablesLen, UErrorCode &status);
// Add an entry to the key and value tables being built
// input: data from SLTable, MATable, etc.
// output: entry added to fKeyVec and fValueVec
void addKeyEntry(UChar32 keyChar, // The key character
UHashtable *table, // The table, one of SATable, MATable, etc.
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
UErrorCode &status);
// From an index into fKeyVec & fValueVec
// get a UnicodeString with the corresponding mapping.
// (The parameter is that index, despite being named "key" here.)
UnicodeString getMapping(int32_t key);
// Populate the final binary output data array with the compiled data.
void outputData(UErrorCode &status);
public:
static void buildConfusableData(SpoofImpl *spImpl, const char * confusables,
int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status);
};
#endif

View File

@ -0,0 +1,431 @@
/*
******************************************************************************
*
* Copyright (C) 2008-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_buildwsconf.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan05 (refactoring earlier files)
* created by: Andy Heninger
*
* Internal functions for compiling Whole Script confusable source data
* into its binary (runtime) form. The binary data format is described
* in uspoof_impl.h
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "unicode/uregex.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "uspoof_impl.h"
#include "uhash.h"
#include "uvector.h"
#include "uassert.h"
#include "uspoof_buildwsconf.h"
//#include <stdio.h> // TODO: debug. remove.
U_NAMESPACE_USE
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O
//   0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |          |     |    |
//    |          |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |          |     |----------Target script. We need this.
//    |          |----------------Src script. Should match the script of the source
//    |                           code points. Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// The ".." separating the two ends of a code point range is matched literally
// (escaped dots), so that only a real range is accepted; any other separator
// falls through to the error-line alternative (Group 8).
static const char *parseExp =
        "(?m)"                                              // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                              // A blank or comment line. Matches Group 1.
        "|^(?:"                                             //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;"  // Code point range. Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                              // The source script. Group 4.
        "\\s*([A-Za-z]+)\\s*;"                              // The target script. Group 5.
        "\\s*(?:(A)|(L))"                                   // The table A or L. Group 6 or 7
        "[ \\t]*(?:#.*?)?"                                  // Trailing comment
        ")$|"                                               //   OR
        "^(.*?)$";                                          // An error line. Group 8.
                                                            //   Any line not matching the preceding
                                                            //   parts of the expression will match
                                                            //   this, and thus be flagged as an error
// Copy the text of one regular expression match group into a char * string.
// The group must contain only invariant characters.
// Used for script names.
//
// destBuf always receives at least an empty (NUL terminated) string. It is left
// empty when status indicates a failure, when uregex_group() reports -1, or when
// the group text does not fit, with its terminator, into destCapacity chars.
//
static void extractGroup(
        URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
    UChar groupText[50];
    groupText[0] = 0;
    destBuf[0] = 0;

    const int32_t groupLen = uregex_group(e, group, groupText, 50, &status);
    const UBool usable = !U_FAILURE(status) && groupLen != -1 && groupLen < destCapacity;
    if (usable) {
        // Read-only aliasing constructor: wraps groupText without copying it.
        UnicodeString alias(FALSE, groupText, groupLen);
        alias.extract(0, groupLen, destBuf, destCapacity, US_INV);
    }
}
// Build the Whole Script Confusable data
//
// Converts the UTF-8 source text to UTF-16, parses it one line at a time with
// parseExp, and accumulates one set of scripts per (code point, table) in two
// tries (any-case and lower-case). Duplicate script sets are then merged, and
// the two tries plus the array of distinct script sets are serialized into
// spImpl->fSpoofData.
//
//   spImpl            The spoof checker whose fSpoofData receives the built data.
//   confusablesWS     The source text (contents of confusablesWholeScript.txt), UTF-8.
//   confusablesWSLen  Length of confusablesWS, as passed on to u_strFromUTF8().
//   pe                On failure, pe->line receives the number of the last input
//                     line that was read. May be NULL.
//   status            ICU error code. Input syntax errors are reported as
//                     U_PARSE_ERROR; unknown script names, or a source script that
//                     disagrees with the script of the code point, as
//                     U_INVALID_FORMAT_ERROR.
//
// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
//       because everything is local to this one build function anyhow,
//       OR
//       break this function into more reasonably sized pieces, with
//       state in WSConfusableDataBuilder.
//
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
          int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
{
    if (U_FAILURE(status)) {
        return;
    }
    // All locals that are live at the cleanup label are declared (and initialized)
    // up front, so that "goto cleanup" never jumps over an initialization.
    URegularExpression *parseRegexp = NULL;
    int32_t inputLen = 0;
    UChar *input = NULL;
    int32_t lineNum = 0;
    UVector *scriptSets = NULL;
    uint32_t rtScriptSetsCount = 2;
    UTrie2 *anyCaseTrie = NULL;
    UTrie2 *lowerCaseTrie = NULL;

    anyCaseTrie = utrie2_open(0, 0, &status);
    lowerCaseTrie = utrie2_open(0, 0, &status);

    // The scriptSets vector provides a mapping from TRIE values to the set of scripts.
    //
    // Reserved TRIE values:
    //   0: Code point has no whole script confusables.
    //   1: Code point is of script Common or Inherited.
    //      These code points do not participate in whole script confusable detection.
    //      (This is logically equivalent to saying that they contain confusables in
    //      all scripts)
    //
    // Because Trie values are indexes into the ScriptSets vector, pre-fill
    // vector positions 0 and 1 to avoid conflicts with the reserved values.
    scriptSets = new UVector(status);
    if (scriptSets == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    scriptSets->addElement((void *)NULL, status);
    scriptSets->addElement((void *)NULL, status);

    // Convert the user input data from UTF-8 to UChar (UTF-16)
    // The first call only preflights the required length.
    u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        goto cleanup;
    }
    status = U_ZERO_ERROR;
    input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
    if (input == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        goto cleanup;
    }
    u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);

    parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
    if (U_FAILURE(status)) {
        // Nothing below can succeed, and the contents of input are not
        // known to be valid if the conversion failed.
        goto cleanup;
    }

    // Zap any Byte Order Mark at the start of input. Changing it to a space is benign
    // given the syntax of the input.
    if (*input == 0xfeff) {
        *input = 0x20;
    }

    // Parse the input, one line per iteration of this loop.
    uregex_setText(parseRegexp, input, inputLen, &status);
    while (uregex_findNext(parseRegexp, &status)) {
        lineNum++;
        // The text of the whole line (group 0) is deliberately not extracted:
        // nothing uses it, and copying it into a fixed size buffer would make any
        // long input line fail with a spurious U_BUFFER_OVERFLOW_ERROR.
        if (uregex_start(parseRegexp, 1, &status) >= 0) {
            // this was a blank or comment line.
            continue;
        }
        if (uregex_start(parseRegexp, 8, &status) >= 0) {
            // input file syntax error.
            status = U_PARSE_ERROR;
            goto cleanup;
        }
        if (U_FAILURE(status)) {
            goto cleanup;
        }

        // Pick up the start and optional range end code points from the parsed line.
        UChar32 startCodePoint = SpoofImpl::ScanHex(
            input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
        UChar32 endCodePoint = startCodePoint;
        if (uregex_start(parseRegexp, 3, &status) >=0) {
            endCodePoint = SpoofImpl::ScanHex(
                input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
        }

        // Extract the two script names from the source line. We need these in an 8 bit
        // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
        // to the ICU u_getPropertyValueEnum() function. Ugh.
        char srcScriptName[20];
        char targScriptName[20];
        extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
        extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
        UScriptCode srcScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
        UScriptCode targScript =
            static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
        if (U_FAILURE(status)) {
            goto cleanup;
        }
        if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
            status = U_INVALID_FORMAT_ERROR;
            goto cleanup;
        }

        // select the table - (A) any case or (L) lower case only
        UTrie2 *table = anyCaseTrie;
        if (uregex_start(parseRegexp, 7, &status) >= 0) {
            table = lowerCaseTrie;
        }

        // Build the set of scripts containing confusable characters for
        // the code point(s) specified in this input line.
        // Sanity check that the script of the source code point is the same
        // as the source script indicated in the input file. Failure of this check is
        // an error in the input file.
        // Include the source script in the set (needed for Mixed Script Confusable detection).
        //
        UChar32 cp;
        for (cp=startCodePoint; cp<=endCodePoint; cp++) {
            int32_t setIndex = utrie2_get32(table, cp);
            BuilderScriptSet *bsset = NULL;
            if (setIndex > 0) {
                U_ASSERT(setIndex < scriptSets->size());
                bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
            } else {
                bsset = new BuilderScriptSet();
                if (bsset == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                bsset->codePoint = cp;
                bsset->trie = table;
                bsset->sset = new ScriptSet();
                setIndex = scriptSets->size();
                bsset->index = setIndex;
                bsset->rindex = 0;
                if (bsset->sset == NULL) {
                    // bsset has not been added to scriptSets yet, so the
                    // cleanup code would not find it; release it here.
                    delete bsset;
                    status = U_MEMORY_ALLOCATION_ERROR;
                    goto cleanup;
                }
                scriptSets->addElement(bsset, status);
                utrie2_set32(table, cp, setIndex, &status);
            }
            bsset->sset->Union(targScript);
            bsset->sset->Union(srcScript);

            if (U_FAILURE(status)) {
                goto cleanup;
            }
            UScriptCode cpScript = uscript_getScript(cp, &status);
            if (cpScript != srcScript) {
                status = U_INVALID_FORMAT_ERROR;
                goto cleanup;
            }
        }
    }

    // Eliminate duplicate script sets. At this point we have a separate
    // script set for every code point that had data in the input file.
    //
    // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
    //
    // printf("Number of scriptSets: %d\n", scriptSets->size());
    {
        rtScriptSetsCount = 2;
        for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
            BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
            if (outerSet->index != static_cast<uint32_t>(outeri)) {
                // This set was already identified as a duplicate.
                // It will not be allocated a position in the runtime array of ScriptSets.
                continue;
            }
            outerSet->rindex = rtScriptSetsCount++;
            for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
                BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
                if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
                    // Equal contents, distinct objects: share the outer set's ScriptSet.
                    delete innerSet->sset;
                    innerSet->scriptSetOwned = FALSE;
                    innerSet->sset = outerSet->sset;
                    innerSet->index = outeri;
                    innerSet->rindex = outerSet->rindex;
                }
                // But this doesn't get all. We need to fix the TRIE.
            }
        }
        // printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
    }

    // Update the Trie values to reflect the run time script indexes (after duplicate merging).
    // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
    // are unused, which is why the loop index starts at 2.)
    {
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex != (uint32_t)i) {
                utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
            }
        }
    }

    // For code points with script==Common or script==Inherited,
    // Set the reserved value of 1 into both Tries. These characters do not participate
    // in Whole Script Confusable detection; this reserved value is the means
    // by which they are detected.
    {
        UnicodeSet ignoreSet;
        ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
        UnicodeSet inheritedSet;
        inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
        ignoreSet.addAll(inheritedSet);
        for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
            UChar32 rangeStart = ignoreSet.getRangeStart(rn);
            UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
            utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
            utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
        }
    }

    // Serialize the data to the Spoof Detector
    //
    // While space is being reserved, fSpoofData's trie pointers are set to the
    // builder tries. reserveSpace() calls initPtrs(), which opens a trie from the
    // serialized data only when the corresponding pointer is still NULL.
    {
        utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
        // printf("Any case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
        spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
        void *where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(anyCaseTrie, where, size, &status);

        utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
        size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
        // printf("Lower case Trie size: %d\n", size);
        if (status != U_BUFFER_OVERFLOW_ERROR) {
            goto cleanup;
        }
        status = U_ZERO_ERROR;
        spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
        spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
        where = spImpl->fSpoofData->reserveSpace(size, status);
        utrie2_serialize(lowerCaseTrie, where, size, &status);

        spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
        spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
        ScriptSet *rtScriptSets = static_cast<ScriptSet *>
            (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
        if (U_FAILURE(status) || rtScriptSets == NULL) {
            // No destination array to copy the script sets into.
            goto cleanup;
        }
        uint32_t rindex = 2;
        for (int32_t i=2; i<scriptSets->size(); i++) {
            BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            if (bSet->rindex < rindex) {
                // We have already copied this script set to the serialized data.
                continue;
            }
            U_ASSERT(rindex == bSet->rindex);
            rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
            rindex++;
        }
    }

    // Open new utrie2s from the serialized data. We don't want to keep the ones
    // we just built because we would then have two copies of the data, one internal to
    // the utries that we have already constructed, and one in the serialized data area.
    // An alternative would be to not pre-serialize the Trie data, but that makes the
    // spoof detector data different, depending on how the detector was constructed.
    // It's simpler to keep the data always the same.
    spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
        spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
        NULL,
        &status);

    // Each trie must be opened with its own serialized length.
    spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
        UTRIE2_16_VALUE_BITS,
        (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
        spImpl->fSpoofData->fRawData->fLowerCaseTrieLength,
        NULL,
        &status);

cleanup:
    if (U_FAILURE(status) && pe != NULL) {
        pe->line = lineNum;
    }

    // The builder tries are closed below. If an error path left the spoof data
    // still pointing at one of them, clear that pointer so that the spoof data
    // is not left referring to a closed trie.
    if (spImpl != NULL && spImpl->fSpoofData != NULL) {
        if (spImpl->fSpoofData->fAnyCaseTrie == anyCaseTrie) {
            spImpl->fSpoofData->fAnyCaseTrie = NULL;
        }
        if (spImpl->fSpoofData->fLowerCaseTrie == lowerCaseTrie) {
            spImpl->fSpoofData->fLowerCaseTrie = NULL;
        }
    }

    uregex_close(parseRegexp);
    uprv_free(input);

    // scriptSets is NULL if its allocation was what failed.
    if (scriptSets != NULL) {
        for (int32_t i=0; i<scriptSets->size(); i++) {
            BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
            delete bsset;
        }
        delete scriptSets;
    }
    utrie2_close(anyCaseTrie);
    utrie2_close(lowerCaseTrie);
    return;
}
// Construct an empty BuilderScriptSet: no code point (-1), no trie, no script set,
// both indexes zero. A script set attached later is owned by this object unless
// scriptSetOwned is cleared.
BuilderScriptSet::BuilderScriptSet()
    : codePoint(-1),
      trie(NULL),
      sset(NULL),
      index(0),
      rindex(0),
      scriptSetOwned(TRUE) {
}
// Delete the underlying ScriptSet, but only when this object owns it.
BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
}

View File

@ -0,0 +1,56 @@
/*
******************************************************************************
*
* Copyright (C) 2008-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: uspoof_buildwsconf.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009Jan19
* created by: Andy Heninger
*
* Internal classes and functions
* for compiling whole script confusable data into its binary (runtime) form.
*/
#ifndef __USPOOF_BUILDWSCONF_H__
#define __USPOOF_BUILDWSCONF_H__
#include "uspoof_impl.h"
#include "utrie2.h"
//
// class BuilderScriptSet. Represents the set of scripts (Script Codes)
// containing characters that are confusable with one specific
// code point.
//
// All members are public; the class is a plain record that is filled in
// by the code that creates it.
//
class BuilderScriptSet: public UMemory {
public:
UChar32 codePoint; // The source code point.
UTrie2 *trie; // Any-case or Lower-case Trie.
// These Trie tables are the final result of the
// build. This pointer indicates which of the two
// this set of data is for.
ScriptSet *sset; // The set of scripts itself.
// Deleted by the destructor only when
// scriptSetOwned is true.
uint32_t index; // Index of this set in the Build Time vector
// of script sets.
uint32_t rindex; // Index of this set in the final (runtime)
// array of sets.
UBool scriptSetOwned; // True if this BuilderScriptSet owns (should delete)
// its underlying sset.
BuilderScriptSet(); // Sets codePoint to -1, the pointers to NULL, the
// indexes to 0 and scriptSetOwned to TRUE.
~BuilderScriptSet(); // Deletes sset if scriptSetOwned is true.
};
// Build the Whole Script Confusable data from its source text form.
//   spImpl            The spoof checker whose spoof data receives the result.
//   confusablesWS     The source text (confusablesWholeScript.txt format), UTF-8.
//   confusablesWSLen  Length of confusablesWS.
//   pe                Receives the input line number if the build fails.
//   status            ICU error code, in/out.
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status);
#endif

View File

@ -0,0 +1,841 @@
/*
**********************************************************************
* Copyright (C) 2008-2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "unicode/unorm.h"
#include "utrie2.h"
#include "cmemory.h"
#include "udatamem.h"
#include "umutex.h"
#include "udataswp.h"
#include "uassert.h"
#include "uspoof_impl.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
// Construct a spoof checker implementation around the given spoof data.
// All checks are enabled, and every code point (0..0x10ffff) is allowed.
//
// If status indicates a failure on entry, nothing is done beyond zeroing
// fMagic, fSpoofData and fAllowedCharsSet. If the allowed characters set
// cannot be allocated, status is set to U_MEMORY_ALLOCATION_ERROR and
// fAllowedCharsSet is left NULL.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
    fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = USPOOF_MAGIC;
    fSpoofData = data;
    fChecks = USPOOF_ALL_CHECKS;
    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    if (allowedCharsSet == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;     // Must not fall through and dereference the NULL pointer.
    }
    allowedCharsSet->freeze();
    fAllowedCharsSet = allowedCharsSet;
}
// Default constructor. Creates a checker with no spoof data, all checks
// enabled, and every code point (0..0x10ffff) allowed.
// There is no status parameter to report an allocation failure through; if the
// allowed characters set cannot be allocated, fAllowedCharsSet is left NULL.
SpoofImpl::SpoofImpl() {
    fMagic = USPOOF_MAGIC;
    fSpoofData = NULL;
    fChecks = USPOOF_ALL_CHECKS;
    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    if (allowedCharsSet != NULL) {
        allowedCharsSet->freeze();
    }
    fAllowedCharsSet = allowedCharsSet;
}
// Copy Constructor, used by the user level clone() function.
// The new object shares the source's spoof data (taking an additional reference
// to it) and gets its own clone of the source's allowed characters set.
// Sets status to U_MEMORY_ALLOCATION_ERROR if the set could not be cloned.
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
    fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
    if (!U_FAILURE(status)) {
        fMagic = src.fMagic;
        fChecks = src.fChecks;
        fCheckMask = src.fCheckMask;
        if (src.fSpoofData != NULL) {
            fSpoofData = src.fSpoofData->addReference();
        }
        fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
        if (NULL == fAllowedCharsSet) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
// Destructor. Drops this object's reference to the spoof data and deletes
// the allowed characters set.
SpoofImpl::~SpoofImpl() {
    // Zero the magic number first: head off application errors by
    // preventing use of deleted objects.
    fMagic = 0;
    if (NULL != fSpoofData) {
        // Will delete the data if the refCount goes to zero.
        fSpoofData->removeReference();
    }
    delete fAllowedCharsSet;
}
//
//  Incoming parameter check on Status and the SpoofChecker object
//    received from the C API.
//
//  Returns the checker as a SpoofImpl, or NULL with status set:
//    U_ILLEGAL_ARGUMENT_ERROR  for a NULL checker,
//    U_INVALID_FORMAT_ERROR    for a bad magic number or missing spoof data,
//    or whatever validateDataVersion() reports for the raw data.
//  A failure status on entry is left untouched, and NULL is returned.
//
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (sc == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    const SpoofImpl *impl = reinterpret_cast<const SpoofImpl *>(sc);
    const UBool looksValid = (impl->fMagic == USPOOF_MAGIC) && (impl->fSpoofData != NULL);
    if (!looksValid) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }
    return SpoofData::validateDataVersion(impl->fSpoofData->fRawData, status) ? impl : NULL;
}
// Non-const flavor of validateThis(): runs the const version's checks and
// hands back the same object, without the const qualifier.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constSc = sc;
    const SpoofImpl *checked = SpoofImpl::validateThis(constSc, status);
    return const_cast<SpoofImpl *>(checked);
}
//--------------------------------------------------------------------------------------
//
//  confusableLookup()    This is the heart of the confusable skeleton generation
//                        implementation.
//
//                        Given a source character, produce the corresponding
//                        replacement character(s)
//
//    inChar      The source code point.
//    tableMask   Flag bit(s) selecting the mapping table(s) to use; matched
//                against the flag bits (top byte) of the key table entries.
//    destBuf     Receives the replacement, as UTF-16. No capacity is passed in;
//                the caller must supply a buffer big enough for the longest mapping.
//    returns     The number of UChars written to destBuf.
//
//                A character with no entry for the requested table maps to itself.
//
//---------------------------------------------------------------------------------------

// Helper for confusableLookup(): the identity mapping.
// Writes c to destBuf as UTF-16 and returns the number of code units written.
static int32_t mapToSelf(UChar32 c, UChar *destBuf) {
    int32_t i = 0;
    U16_APPEND_UNSAFE(destBuf, i, c)
    return i;
}

int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
    int32_t *keys = fSpoofData->fCFUKeys;
    int32_t keyCount = fSpoofData->fRawData->fCFUKeysSize;
    if (keys == NULL || keyCount <= 0) {
        // No key table at all: every character maps to itself.
        return mapToSelf(inChar, destBuf);
    }
    int32_t *keysLimit = keys + keyCount;

    // Binary search the spoof data key table for the inChar
    // midc must be a full UChar32; key code points can be supplementary,
    // and would be truncated by a 16 bit UChar.
    int32_t *low = keys;
    int32_t *mid = NULL;
    int32_t *limit = keysLimit;
    UChar32 midc;
    do {
        int32_t delta = (limit-low)/2;
        mid = low + delta;
        midc = *mid & 0x1fffff;
        if (inChar == midc) {
            break;
        } else if (inChar < midc) {
            limit = mid;
        } else {
            low = mid;
        }
    } while (low < limit-1);
    if (inChar != midc) {
        // The loop ran to completion without a hit; low is the last candidate.
        mid = low;
        midc = *mid & 0x1fffff;
        if (inChar != midc) {
            // Char not found. It maps to itself.
            return mapToSelf(inChar, destBuf);
        }
    }

    int32_t keyFlags = *mid & 0xff000000;
    if ((keyFlags & tableMask) == 0) {
        // We found the right key char, but the entry doesn't pertain to the
        // table we need. See if there is an adjacent key that does.
        // Both scans are kept within the bounds of the key table.
        UBool altFound = FALSE;
        if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
            int32_t *altMid;
            for (altMid = mid; altMid > keys && (*(altMid-1) & 0x00ffffff) == inChar; ) {
                --altMid;
                keyFlags = *altMid & 0xff000000;
                if (keyFlags & tableMask) {
                    altFound = TRUE;
                    break;
                }
            }
            if (!altFound) {
                for (altMid = mid+1; altMid < keysLimit && (*altMid & 0x00ffffff) == inChar; altMid++) {
                    keyFlags = *altMid & 0xff000000;
                    if (keyFlags & tableMask) {
                        altFound = TRUE;
                        break;
                    }
                }
            }
            if (altFound) {
                mid = altMid;
            }
        }
        if (!altFound) {
            // No key entry for this char & table.
            // The input char maps to itself.
            return mapToSelf(inChar, destBuf);
        }
    }

    int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
    int32_t keyTableIndex = mid - keys;

    // Value is either a UChar (for strings of length 1) or
    // an index into the string table (for longer strings)
    uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
    if (stringLen == 1) {
        destBuf[0] = value;
        return 1;
    }

    // String length of 4 from the above lookup is used for all strings of length >= 4.
    // For these, get the real length from the string lengths table,
    // which maps string table indexes to lengths.
    // All strings of the same length are stored contiguously in the string table.
    // 'value' from the lookup above is the starting index for the desired string.
    int32_t ix;
    if (stringLen == 4) {
        // TODO: (left open in the original code)
        int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
        for (ix = 0; ix < stringLengthsLimit; ix++) {
            if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
                stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
                break;
            }
        }
        U_ASSERT(ix < stringLengthsLimit);
    }

    // A string may end exactly at the end of the string table.
    U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
    UChar *src = &fSpoofData->fCFUStrings[value];
    for (ix=0; ix<stringLen; ix++) {
        destBuf[ix] = src[ix];
    }
    return stringLen;
}
//---------------------------------------------------------------------------------------
//
// wholeScriptCheck()
//
// Input text is already normalized to NFKD
// Return the set of scripts, each of which can represent something that is
// confusable with the input text. The script of the input text
// is included; input consisting of characters from a single script will
// always produce a result consisting of a set containing that script.
//
//---------------------------------------------------------------------------------------
void SpoofImpl::wholeScriptCheck(
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
int32_t inputIdx = 0;
UChar32 c;
UTrie2 *table =
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
result->setAll();
while (inputIdx < length) {
U16_NEXT(text, inputIdx, length, c);
uint32_t index = utrie2_get32(table, c);
if (index == 0) {
// No confusables in another script for this char.
// TODO: we should change the data to have sets with just the single script
// bit for the script of this char. Gets rid of this special case.
// Until then, grab the script from the char and intersect it with the set.
UScriptCode cpScript = uscript_getScript(c, &status);
U_ASSERT(cpScript > USCRIPT_INHERITED);
result->intersect(cpScript);
} else if (index == 1) {
// Script == Common or Inherited. Nothing to do.
} else {
result->intersect(fSpoofData->fScriptSets[index]);
}
}
}
// Scan text, counting script changes among the characters whose script is not
// Common, Inherited or Unknown. The scan stops once the count reaches two.
//   length   Length of text, or -1 if the text is NUL terminated.
//   pos      Set to the input index just past the character that brought the
//            count to two; not modified otherwise.
//   returns  The count: 0, 1 or 2. Returns 0 if status is a failure on entry.
int32_t SpoofImpl::scriptScan
        (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    int32_t inputIdx = 0;
    int32_t scriptCount = 0;
    UScriptCode lastScript = USCRIPT_INVALID_CODE;

    for (;;) {
        const UBool moreInput = (inputIdx < length) || (length == -1);
        if (!moreInput || scriptCount >= 2) {
            break;
        }
        UChar32 c;
        U16_NEXT(text, inputIdx, length, c);
        if (c == 0 && length == -1) {
            // Terminating NUL of a NUL terminated string.
            break;
        }
        const UScriptCode sc = uscript_getScript(c, &status);
        const UBool ignorable =
            (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN);
        if (!ignorable && sc != lastScript) {
            scriptCount++;
            lastScript = sc;
        }
    }

    if (scriptCount == 2) {
        pos = inputIdx;
    }
    return scriptCount;
}
// Convert a text format hex number. Utility function used by builder code. Static.
// Input: UChar *string text, digits at s[start] .. s[limit-1]. Output: a UChar32
// Input has been pre-checked, and will have no non-hex chars.
// The number must fall in the code point range of 0..0x10ffff; otherwise
// status is set to U_PARSE_ERROR and 0 is returned.
// Returns 0, without looking at the input, if status is a failure on entry.
// Static Function.
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    U_ASSERT(limit-start > 0);
    uint32_t val = 0;
    for (int32_t i=start; i<limit; i++) {
        int digitVal = s[i] - 0x30;
        if (digitVal>9) {
            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
        }
        if (digitVal>15) {
            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
        }
        U_ASSERT(digitVal <= 0xf);
        val <<= 4;
        val += digitVal;
        // Range check after every digit, not just at the end. The caller's
        // pattern allows any number of digits, and a long enough digit string
        // would otherwise wrap the 32 bit accumulator back into the valid range.
        if (val > 0x10ffff) {
            status = U_PARSE_ERROR;
            return 0;
        }
    }
    return static_cast<UChar32>(val);
}
//----------------------------------------------------------------------------------------------
//
// class SpoofData Implementation
//
//----------------------------------------------------------------------------------------------
// Check that rawData is spoof data this code can use: a non-NULL header with
// the spoof magic number, and a format version whose first byte is at most 1
// and whose second byte is 0.
// Returns TRUE if so. Otherwise, including when status is already a failure
// on entry, sets status to U_INVALID_FORMAT_ERROR and returns FALSE.
UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
    const UBool acceptable =
        !U_FAILURE(status) &&
        rawData != NULL &&
        rawData->fMagic == USPOOF_MAGIC &&
        rawData->fFormatVersion[0] <= 1 &&
        rawData->fFormatVersion[1] <= 0;
    if (acceptable) {
        return TRUE;
    }
    status = U_INVALID_FORMAT_ERROR;
    return FALSE;
}
//
//  SpoofData::getDefault() - return a wrapper around the spoof data that is
//                            baked into the default ICU data.
//
//  Returns NULL, with status set, if the data cannot be opened, if it fails
//  validation, or if the wrapper object cannot be allocated.
//
SpoofData *SpoofData::getDefault(UErrorCode &status) {
    // TODO: Cache it. Lazy create, keep until cleanup.

    UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }
    SpoofData *This = new SpoofData(udm, status);
    if (This == NULL) {
        // No SpoofData object exists to take ownership of udm; close it here
        // so that it is not leaked.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The SpoofData destructor closes the UDataMemory it was given.
        delete This;
        return NULL;
    }
    return This;
}
// Construct a SpoofData wrapper around spoof data obtained from udata_open().
// The raw spoof data begins right after the ICU data header of udm. udm is
// stored in fUDM, and is closed by the destructor.
// If status is a failure on entry, the object is only reset().
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (!U_FAILURE(status)) {
        char *headerStart = (char *)(udm->pHeader);
        fRawData = reinterpret_cast<SpoofDataHeader *>
                       (headerStart + udm->pHeader->dataHeader.headerSize);
        fUDM = udm;
        validateDataVersion(fRawData, status);
        initPtrs(status);
    }
}
// Construct a SpoofData wrapper around caller supplied, already built
// (serialized) spoof data. The data is used in place, not copied.
//   data     Start of the serialized data, which begins with a SpoofDataHeader.
//   length   Number of bytes available at data.
//   status   Set to U_INVALID_FORMAT_ERROR if length is negative, is too small
//            to hold a header, or is smaller than the length recorded in the
//            header; validateDataVersion() may also report a format error.
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // The explicit negative check is needed: cast to size_t, a negative length
    // becomes a huge value that would pass the size comparison.
    if (length < 0 || (size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(fRawData, status);
    initPtrs(status);
}
// Spoof Data constructor for use from data builder.
// Initializes a new, empty data area that will be populated later.
// The data area is owned by this object, and initially holds only a zeroed
// header carrying the magic number and format version 1.0.0.0.
// Sets status to U_MEMORY_ALLOCATION_ERROR if the header cannot be allocated.
SpoofData::SpoofData(UErrorCode &status) {
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    fDataOwned = TRUE;
    fRefCount = 1;

    // The spoof header should already be sized to be a multiple of 16 bytes.
    // Just in case it's not, round it up.
    const uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15;
    U_ASSERT(initialSize == sizeof(SpoofDataHeader));

    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize));
    fMemLimit = initialSize;
    if (NULL == fRawData) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    uprv_memset(fRawData, 0, initialSize);
    fRawData->fMagic = USPOOF_MAGIC;
    static const int32_t kVersion[4] = {1, 0, 0, 0};
    for (int32_t v = 0; v < 4; v++) {
        fRawData->fFormatVersion[v] = kVersion[v];
    }
    initPtrs(status);
}
// reset() - initialize all fields.
//           Should be updated if any new fields are added.
//           Called by constructors to put things in a known initial state.
void SpoofData::reset() {
    // Ownership and bookkeeping.
    fDataOwned = FALSE;
    fRefCount = 1;
    fMemLimit = 0;
    fUDM = NULL;

    // The raw data, and the pointers into its sections.
    fRawData = NULL;
    fCFUKeys = NULL;
    fCFUValues = NULL;
    fCFUStringLengths = NULL;
    fCFUStrings = NULL;
    fScriptSets = NULL;

    // The tries opened over the raw data.
    fAnyCaseTrie = NULL;
    fLowerCaseTrie = NULL;
}
//  SpoofData::initPtrs()
//            Initialize the pointers to the various sections of the raw data.
//
//            This function is used both during the Trie building process (multiple
//            times, as the individual data sections are added), and
//            during the opening of a Spoof Checker from prebuilt data.
//
//            The pointers for non-existent data sections (identified by an offset of 0)
//            are set to NULL.
//
//            The two tries are opened from the serialized data only if they
//            are not already set.
//
//  Note:     During building the data, adding each new data section
//            reallocs the raw data area, which likely relocates it, which
//            in turn requires reinitializing all of the pointers into it, hence
//            multiple calls to this function during building.
//
void SpoofData::initPtrs(UErrorCode &status) {
    fCFUKeys = NULL;
    fCFUValues = NULL;
    fCFUStringLengths = NULL;
    fCFUStrings = NULL;
    if (U_FAILURE(status)) {
        return;
    }

    // All section offsets in the header are relative to the start of the raw data.
    char *rawBytes = (char *)fRawData;

    if (0 != fRawData->fCFUKeys) {
        fCFUKeys = (int32_t *)(rawBytes + fRawData->fCFUKeys);
    }
    if (0 != fRawData->fCFUStringIndex) {
        fCFUValues = (uint16_t *)(rawBytes + fRawData->fCFUStringIndex);
    }
    if (0 != fRawData->fCFUStringLengths) {
        fCFUStringLengths = (SpoofStringLengthsElement *)(rawBytes + fRawData->fCFUStringLengths);
    }
    if (0 != fRawData->fCFUStringTable) {
        fCFUStrings = (UChar *)(rawBytes + fRawData->fCFUStringTable);
    }

    const UBool needAnyCaseTrie = (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0);
    if (needAnyCaseTrie) {
        fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
            rawBytes + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
    }
    const UBool needLowerCaseTrie = (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0);
    if (needLowerCaseTrie) {
        fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
            rawBytes + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
    }

    if (0 != fRawData->fScriptSets) {
        fScriptSets = (ScriptSet *)(rawBytes + fRawData->fScriptSets);
    }
}
// Destructor. Closes both tries, frees the raw data if this object owns it,
// and closes the UDataMemory, if there is one.
SpoofData::~SpoofData() {
    // The tries.
    utrie2_close(fAnyCaseTrie);
    utrie2_close(fLowerCaseTrie);
    fAnyCaseTrie = NULL;
    fLowerCaseTrie = NULL;

    // The raw data; freed only when it was allocated by this object.
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = NULL;

    // The ICU data item that the raw data came from, if any.
    if (NULL != fUDM) {
        udata_close(fUDM);
    }
    fUDM = NULL;
}
void SpoofData::removeReference() {
if (umtx_atomic_dec(&fRefCount) == 0) {
delete this;
}
}
// Add one reference to this object. Returns this, for the caller's convenience.
SpoofData *SpoofData::addReference() {
    SpoofData *self = this;
    umtx_atomic_inc(&self->fRefCount);
    return self;
}
// Grow the (owned) raw data area by numBytes, rounded up to a multiple of 16,
// and return a pointer to the start of the new, zero filled, space.
// Used only while building spoof data.
//
// The raw data area is reallocated, so pointers into it that were obtained
// earlier may no longer be valid; this object's own section pointers are
// refreshed with initPtrs().
//
// Returns NULL, with status set, if status is a failure on entry, if this
// object does not own its data (U_INTERNAL_PROGRAM_ERROR), or if the memory
// cannot be obtained (U_MEMORY_ALLOCATION_ERROR). In the last case the
// existing data area is left unchanged.
void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        U_ASSERT(FALSE);
        status = U_INTERNAL_PROGRAM_ERROR;
        return NULL;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    uint32_t returnOffset = fMemLimit;
    uint32_t newLimit = fMemLimit + numBytes;

    // Only commit the new pointer and size once the reallocation is known to
    // have worked; on failure the old block is still allocated and still ours.
    SpoofDataHeader *newData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (newData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fRawData = newData;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}
//----------------------------------------------------------------------------
//
// ScriptSet implementation
//
//----------------------------------------------------------------------------
// Construct an empty set: every word of bits is zero.
ScriptSet::ScriptSet() {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = 0;
        ++w;
    }
}
// Destructor. Nothing to do.
ScriptSet::~ScriptSet() {
}
// Equality: TRUE if every word of bits is the same in both sets.
UBool ScriptSet::operator == (const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    // Advance past the leading run of matching words.
    while (w < wordCount && bits[w] == other.bits[w]) {
        ++w;
    }
    return (w == wordCount) ? TRUE : FALSE;
}
// Add one script to this set.
// Each script code is represented by one bit: bit (script % 32) of word (script / 32).
void ScriptSet::Union(UScriptCode script) {
    uint32_t index = script / 32;
    // Unsigned shift: with a signed 1, selecting bit 31 would shift into the sign bit.
    uint32_t bit = (uint32_t)1 << (script & 31);
    // index must be a valid index into bits, i.e. less than the number of words.
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    bits[index] |= bit;
}
void ScriptSet::Union(const ScriptSet &other) {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] |= other.bits[i];
}
}
void ScriptSet::intersect(const ScriptSet &other) {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] &= other.bits[i];
}
}
// Intersect this set with the set containing just one script: afterwards the
// set contains that script if it did before, and nothing else.
void ScriptSet::intersect(UScriptCode script) {
    uint32_t index = script / 32;
    // Unsigned shift: with a signed 1, selecting bit 31 would shift into the sign bit.
    uint32_t bit = (uint32_t)1 << (script & 31);
    // index must be a valid index into bits, i.e. less than the number of words.
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    uint32_t i;
    // Clear every word other than the one holding the script's bit ...
    for (i=0; i<index; i++) {
        bits[i] = 0;
    }
    // ... and every other bit within that word.
    bits[index] &= bit;
    for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) {
        bits[i] = 0;
    }
}
// Assignment: copy the bits of other, a word at a time.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
void ScriptSet::setAll() {
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
bits[i] = 0xffffffffu;
}
}
// Make this set empty: all bits of every word are turned off.
void ScriptSet::resetAll() {
    uint32_t w = sizeof(bits)/sizeof(uint32_t);
    while (w-- > 0) {
        bits[w] = 0;
    }
}
// Return the number of scripts in this set (the number of one bits).
int32_t ScriptSet::countMembers() {
    // Each pass of the inner loop strips the lowest remaining one bit, so the
    // work is proportional to the number of members rather than the number of
    // bits.  Sets are expected to be sparse, which makes this cheap.
    int32_t total = 0;
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    for (uint32_t w = 0; w < wordCount; ++w) {
        for (uint32_t remaining = bits[w]; remaining != 0; remaining &= (remaining - 1)) {
            ++total;
        }
    }
    return total;
}
//-----------------------------------------------------------------------------
//
// NFKDBuffer Implementation.
//
//-----------------------------------------------------------------------------
// Normalize 'text' to NFKD and hold on to the result.
//   text    The input string.
//   length  Length of the input, passed through unchanged to unorm_normalize().
//   status  In/out error code.  Nothing is normalized if it is already a
//           failure on entry; set to U_MEMORY_ALLOCATION_ERROR if a heap
//           buffer is needed and cannot be obtained.
NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
    fOriginalText = text;
    fNormalizedText = NULL;
    fNormalizedTextLength = 0;
    if (U_FAILURE(status)) {
        return;
    }

    // First attempt: normalize into the buffer embedded in this object.
    fNormalizedText = fSmallBuf;
    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFKD, 0, fSmallBuf, USPOOF_STACK_BUFFER_SIZE, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }

    // The embedded buffer was too small.  The first call reported the length
    // that is needed, so allocate that much (plus one) and normalize again.
    status = U_ZERO_ERROR;
    int32_t heapCapacity = fNormalizedTextLength + 1;
    fNormalizedText = (UChar *)uprv_malloc(heapCapacity * sizeof(UChar));
    if (fNormalizedText == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0,
                                            fNormalizedText, heapCapacity, &status);
}
// Release the heap buffer, if the constructor had to allocate one.
// The embedded fSmallBuf is part of this object and must not be freed.
NFKDBuffer::~NFKDBuffer() {
    if (fNormalizedText != NULL && fNormalizedText != fSmallBuf) {
        // The buffer was obtained with uprv_malloc(), so it must be returned
        // with uprv_free(), not with operator delete.
        uprv_free(fNormalizedText);
    }
    fNormalizedText = 0;
}
// Return the normalized text.  This is NULL when the constructor was entered
// with a failing status or could not allocate a heap buffer.
const UChar *NFKDBuffer::getBuffer() {
return fNormalizedText;
}
// Return the length of the normalized text, as reported by unorm_normalize()
// in the constructor (0 if no normalization was attempted).
int32_t NFKDBuffer::getLength() {
return fNormalizedTextLength;
}
U_NAMESPACE_END
U_NAMESPACE_USE
//-----------------------------------------------------------------------------
//
// uspoof_swap - byte swap and char encoding swap of spoof data
//
//-----------------------------------------------------------------------------
// Byte swap (and char encoding swap) binary spoof data.
//
//   ds       The swapper describing the input and output platform properties.
//   inData   The ICU data header followed by the spoof data to be swapped.
//   length   Length of inData in bytes, or -1 to preflight (only compute the size).
//   outData  Destination buffer; may be the same as inData for an in-place swap.
//   status   In/out error code.
//   returns  Total size in bytes of the ICU data header plus the spoof data,
//            or 0 on failure.
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
            UErrorCode *status) {

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==1  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    if (U_FAILURE(*status)) {
        // Without a valid header size the spoof data cannot be located; keep
        // the error reported by the header swap rather than replacing it.
        return 0;
    }

    //
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic)   != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength)  <  sizeof(SpoofDataHeader))
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Preflight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

    //
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                            spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    //
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t          *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader  *outputDH = (SpoofDataHeader *)outBytes;

    int32_t   sectionStart;
    int32_t   sectionLength;

    //
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

    // Confusables Keys Section   (fCFUKeys), 32 bits per entry
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Index Section, 16 bits per entry
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Table Section, 16 bit UChars
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Lengths Section.  Each entry is a pair of 16 bit values (4 bytes),
    //                          swapped as an array of 16 bit items.
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Any Case Trie
    sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
    sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Lower Case Trie
    sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
    sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Script Sets.  The data is an array of 32 bit words.
    //               fScriptSetsLength counts ScriptSets, not words, so the byte
    //               length of the section is the count times the size of one set.
    sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
    sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * sizeof(ScriptSet);
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, but do carry it over
    //   int32_t   all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

    // The format version bytes need no swapping, but when the output is a
    // separate (zeroed) buffer they still have to be copied across.
    if (inBytes != outBytes) {
        for (uint32_t i=0; i<sizeof(spoofDH->fFormatVersion); i++) {
            outputDH->fFormatVersion[i] = spoofDH->fFormatVersion[i];
        }
    }

    // Swap everything from fLength onward (the header minus fMagic and fFormatVersion).
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8, &outputDH->fLength, status);

    return totalSize;
}

View File

@ -0,0 +1,397 @@
/*
***************************************************************************
* Copyright (C) 2008-2009, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
*
* uspoof_impl.h
*
* Implementation header for spoof detection
*
*/
#ifndef USPOOFIM_H
#define USPOOFIM_H
#include "unicode/utypes.h"
#include "unicode/uspoof.h"
#include "utrie2.h"
#include "unicode/uscript.h"
#include "unicode/udata.h"
U_NAMESPACE_BEGIN
// The maximum length (in UTF-16 UChars) of the skeleton replacement string resulting from
// a single input code point. This is a function of the unicode.org data.
#define USPOOF_MAX_SKELETON_EXPANSION 20
// The default stack buffer size for copies or conversions or normalizations
// of input strings being checked. (Used in multiple places.)
#define USPOOF_STACK_BUFFER_SIZE 100
// Magic number for sanity checking spoof data.
#define USPOOF_MAGIC 0x3845fdef
class SpoofData;
struct SpoofDataHeader;
struct SpoofStringLengthsElement;
class ScriptSet;
/**
* Class SpoofImpl corresponds directly to the plain C API opaque type
* USpoofChecker. One can be cast to the other.
*/
class SpoofImpl : public UObject {
public:
SpoofImpl(SpoofData *data, UErrorCode &status);
SpoofImpl();
virtual ~SpoofImpl();
/** Copy constructor, used by the user level uspoof_clone() function.
*/
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
// Convert between the opaque C API type and the implementation class.
// NOTE(review): presumably these also check fMagic and report bad objects
// through status - confirm against the implementation.
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
/** Get the confusable skeleton transform for a single code point.
* The result is a string with a length between 1 and 18.
* @param tableMask bit flag specifying which confusable table to use.
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
* @return The length in UTF-16 code units of the substitution string.
*/
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;
/** parse a hex number. Utility used by the builders. */
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
// Implementation for Whole Script tests.
// Return the test bit flag to be ORed into the eventual user return value
// if a Spoof opportunity is detected.
// NOTE(review): the function is declared void and takes a ScriptSet *result,
// so the "return" described above does not match the signature - confirm.
void wholeScriptCheck(
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
/** Scan a string to determine how many scripts it includes.
* Ignore characters with script=Common and script=Inherited.
* @param text The UChar text to be scanned
* @param length The length of the input text, -1 for nul terminated.
* @param pos An out parameter, set to the first input position at which
* a second script was encountered, ignoring Common and Inherited.
* @param status For errors.
* @return the number of (non-common,inherited) scripts encountered,
* clipped to a max of two.
*/
int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;
// WholeScript and MixedScript check implementation.
//
// NOTE(review): differs from wholeScriptCheck() above only in the case of
// the first letter and in returning a ScriptSet * - confirm both are needed.
ScriptSet *WholeScriptCheck(const UChar *text, int32_t length, UErrorCode &status) const;
static UClassID U_EXPORT2 getStaticClassID(void);
virtual UClassID getDynamicClassID(void) const;
//
// Data Members
//
int32_t fMagic; // Internal sanity check.
int32_t fChecks; // Bit vector of checks to perform.
SpoofData *fSpoofData;
int32_t fCheckMask; // Spoof table selector. f(Check Type)
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
// for this Spoof Checker. Defaults to all chars.
};
//
// Confusable Mappings Data Structures
//
// For the confusable data, we are essentially implementing a map,
// key: a code point
// value: a string. Most commonly one char in length, but can be more.
//
// The keys are stored as a sorted array of 32 bit ints.
// bits 0-23 a code point value
// bits 24-31 flags
// 24: 1 if entry applies to SL table
// 25: 1 if entry applies to SA table
// 26: 1 if entry applies to ML table
// 27: 1 if entry applies to MA table
// 28: 1 if there are multiple entries for this code point.
// 29-30: length of value string, in UChars.
// values are (1, 2, 3, other)
// The key table is sorted in ascending code point order. (not on the
// 32 bit int value, the flag bits do not participate in the sorting.)
//
// Lookup is done by means of a binary search in the key table.
//
// The corresponding values are kept in a parallel array of 16 bit ints.
// If the value string is of length 1, it is literally in the value array.
// For longer strings, the value array contains an index into the strings table.
//
// String Table:
// The strings table contains all of the value strings (those of length two or greater)
// concatenated together into one long UChar (UTF-16) array.
//
// The array is arranged by length of the strings - all strings of the same length
// are stored together. The sections are ordered by length of the strings -
// all two char strings first, followed by all of the three Char strings, etc.
//
// There is no nul character or other mark between adjacent strings.
//
// String Lengths table
// The length of strings from 1 to 3 is flagged in the key table.
// For strings of length 4 or longer, the string length table provides a
// mapping between an index into the string table and the corresponding length.
// Strings of these lengths are rare, so lookup time is not an issue.
// Each entry consists of
// uint16_t index of the _last_ string with this length
// uint16_t the length
//
// Flag bits in the Key entries
#define USPOOF_SL_TABLE_FLAG (1<<24)
#define USPOOF_SA_TABLE_FLAG (1<<25)
#define USPOOF_ML_TABLE_FLAG (1<<26)
#define USPOOF_MA_TABLE_FLAG (1<<27)
#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
#define USPOOF_KEY_LENGTH_SHIFT 29
#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>29) & 3)
// One entry of the String Lengths table: two 16 bit fields, 4 bytes in all.
struct SpoofStringLengthsElement {
uint16_t fLastString; // index in string table of last string with this length
uint16_t fStrLength; // Length of strings
};
//-------------------------------------------------------------------------------
//
// ScriptSet - Wrapper class for the Script code bit sets that are part of the
// whole script confusable data.
//
// This class is used both at data build and at run time.
// The constructor is only used at build time.
// At run time, just point at the prebuilt data and go.
//
//-------------------------------------------------------------------------------
class ScriptSet: public UMemory {
public:
ScriptSet(); // Constructs an empty set.
~ScriptSet();
UBool operator == (const ScriptSet &other);
ScriptSet & operator = (const ScriptSet &other);
void Union(const ScriptSet &other); // Add all scripts of other to this set.
void Union(UScriptCode script); // Add one script to this set.
void intersect(const ScriptSet &other); // Keep only scripts also in other.
void intersect(UScriptCode script); // Keep at most the one given script.
void setAll(); // Turn on every bit.
void resetAll(); // Turn off every bit.
int32_t countMembers(); // Number of bits that are on.
private:
// One bit per script code; script n is bit (n mod 32) of bits[n / 32].
// Six 32 bit words give room for 192 script codes.
uint32_t bits[6];
};
//-------------------------------------------------------------------------------
//
// NFKDBuffer A little class to handle the NFKD normalization that is
// needed on incoming identifiers to be checked.
// Takes care of buffer handling and normalization
//
// Instances of this class are intended to be stack-allocated.
//
// TODO: how to map position offsets back to user values?
//
//--------------------------------------------------------------------------------
class NFKDBuffer: public UMemory {
public:
// Normalize text to NFKD. Uses the embedded fSmallBuf when the result fits,
// otherwise a heap buffer that the destructor releases.
NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status);
~NFKDBuffer();
const UChar *getBuffer(); // The normalized text.
int32_t getLength(); // Length of the normalized text.
// NOTE(review): copy construction and assignment are not disabled. A copy
// would share the heap buffer, or point into the source object's fSmallBuf.
// Confirm that instances are never copied.
private:
const UChar *fOriginalText; // The caller's input text; not owned.
UChar *fNormalizedText; // Either fSmallBuf or a heap buffer.
int32_t fNormalizedTextLength;
UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
};
//-------------------------------------------------------------------------------------
//
// SpoofData
//
// A small class that wraps the raw (usually memory mapped) spoof data.
// Serves two primary functions:
// 1. Convenience. Contains real pointers to the data, to avoid dealing with
// the offsets in the raw data.
// 2. Reference counting. When a spoof checker is cloned, the raw data is shared
// and must be retained until all checkers using the data are closed.
// Nothing in this struct includes state that is specific to any particular
// USpoofDetector object.
//
//---------------------------------------------------------------------------------------
class SpoofData: public UMemory {
public:
static SpoofData *getDefault(UErrorCode &status); // Load standard ICU spoof data.
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
// Only used when building new data from rules.
// Constructor for use when creating from prebuilt default data.
// A UDataMemory is what the ICU internal data loading functions provide.
// The udm is adopted by the SpoofData.
SpoofData(UDataMemory *udm, UErrorCode &status);
// Constructor for use when creating from serialized data.
//
SpoofData(const void *serializedData, int32_t length, UErrorCode &status);
// Check raw Spoof Data Version compatibility.
// Return TRUE if it looks good.
static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
~SpoofData(); // Destructor not normally used.
// Use removeReference() instead.
// Reference Counting functions.
// Clone of a user-level spoof detector increments the ref count on the data.
// Close of a user-level spoof detector decrements the ref count.
// If the data is owned by us, it will be deleted when count goes to zero.
SpoofData *addReference();
void removeReference();
// Reserve space in the raw data. For use by builder when putting together a
// new set of data. Init the new storage to zero, to prevent inconsistent
// results if it is not all otherwise set by the requester.
// The requested size is rounded up to a multiple of 16 bytes. The raw data
// is reallocated, so pointers into it obtained earlier may become invalid.
// Return:
// pointer to the new space that was added by this function.
void *reserveSpace(int32_t numBytes, UErrorCode &status);
// initialize the pointers from this object to the raw data.
void initPtrs(UErrorCode &status);
// Reset all fields to an initial state.
// Called from the top of all constructors.
void reset();
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
UBool fDataOwned; // True if the raw data is owned, and needs
// to be deleted when refcount goes to zero.
UDataMemory *fUDM; // If not NULL, our data came from a
// UDataMemory, which we must close when
// we're done.
uint32_t fMemLimit; // Limit of available raw data space
int32_t fRefCount;
// Confusable data
int32_t *fCFUKeys;
uint16_t *fCFUValues;
SpoofStringLengthsElement *fCFUStringLengths;
UChar *fCFUStrings;
// Whole Script Confusable Data
UTrie2 *fAnyCaseTrie;
UTrie2 *fLowerCaseTrie;
ScriptSet *fScriptSets;
};
//---------------------------------------------------------------------------------------
//
// Raw Binary Data Formats, as loaded from the ICU data file,
// or as built by the builder.
//
//---------------------------------------------------------------------------------------
struct SpoofDataHeader {
int32_t fMagic; // USPOOF_MAGIC (0x3845fdef)
uint8_t fFormatVersion[4]; // Data Format. Same as the value in struct UDataInfo
// if there is one associated with this data.
int32_t fLength; // Total length in bytes of this spoof data,
// including all sections, not just the header.
// The following four sections refer to data representing the confusable data
// from the Unicode.org data from "confusables.txt"
int32_t fCFUKeys; // byte offset to Keys table (from SpoofDataHeader *)
int32_t fCFUKeysSize; // number of entries in keys table (32 bits each)
// TODO: change name to fCFUValues, for consistency.
int32_t fCFUStringIndex; // byte offset to String Indexes table
int32_t fCFUStringIndexSize; // number of entries in String Indexes table (16 bits each)
// (number of entries must be same as in Keys table)
int32_t fCFUStringTable; // byte offset of String table
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
int32_t fCFUStringLengths; // byte offset to String Lengths table
int32_t fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each)
// The following sections are for data from confusablesWholeScript.txt
int32_t fAnyCaseTrie; // byte offset to the serialized Any Case Trie
int32_t fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie
int32_t fLowerCaseTrie; // byte offset to the serialized Lower Case Trie
int32_t fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie
int32_t fScriptSets; // byte offset to array of ScriptSets
int32_t fScriptSetsLength; // Number of ScriptSets (24 bytes each)
// The following sections are for data from xidmodifications.txt
// NOTE(review): no xidmodifications fields are declared yet; only padding follows.
int32_t unused[15]; // Padding, Room for Expansion
};
//
// Structure for the Whole Script Confusable Data
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
// Whole Script confusable data
//
// The data provides mappings from code points to a set of scripts
// that contain characters that might be confused with the code point.
// There are two mappings, one for lower case only, and one for characters
// of any case.
//
// The actual data consists of a utrie2 to map from a code point to an offset,
// and an array of UScriptSets (essentially bit maps) that is indexed
// by the offsets obtained from the Trie.
//
//
U_NAMESPACE_END
/**
* Endianness swap function for binary spoof data.
* @internal
*/
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
UErrorCode *status);
#endif /* USPOOFIM_H */

View File

@ -1,6 +1,6 @@
#******************************************************************************
#
# Copyright (C) 1999-2008, International Business Machines
# Copyright (C) 1999-2009, International Business Machines
# Corporation and others. All Rights Reserved.
#
#******************************************************************************
@ -52,7 +52,7 @@ ncnvfbts.o ncnvtst.o putiltst.o cstrtest.o udatpg_test.o utf8tst.o \
stdnmtst.o usrchtst.o custrtrn.o sorttest.o trietest.o trie2test.o usettest.o \
uenumtst.o utmstest.o currtest.o \
idnatest.o nfsprep.o spreptst.o sprpdata.o \
hpmufn.o tracetst.o reapits.o utexttst.o ucsdetst.o
hpmufn.o tracetst.o reapits.o utexttst.o ucsdetst.o spooftest.o
DEPS = $(OBJECTS:.o=.d)

View File

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2008, International Business Machines Corporation and
* Copyright (c) 1996-2009, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -38,6 +38,7 @@ void addURegexTest(TestNode** root);
void addUTextTest(TestNode** root);
void addUCsdetTest(TestNode** root);
void addCnvSelTest(TestNode** root);
void addUSpoofTest(TestNode** root);
void addAllTests(TestNode** root)
{
@ -75,6 +76,7 @@ void addAllTests(TestNode** root)
#if !UCONFIG_NO_TRANSLITERATION
addUTransTest(root);
#endif
addUSpoofTest(root);
}

View File

@ -931,6 +931,14 @@
>
</File>
</Filter>
<Filter
Name="spoof"
>
<File
RelativePath=".\spooftest.c"
>
</File>
</Filter>
</Files>
<Globals>
</Globals>

View File

@ -886,14 +886,7 @@ static void TestGetKeywordValuesForLocale(void) {
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1
};
UErrorCode status = U_ZERO_ERROR;
int32_t i, j, size;
UEnumeration *pref, *all;
const char *loc = NULL;
UBool matchPref, matchAll;
const char *value = NULL;
int32_t valueLength = 0;
UList *ALLList = NULL;
int32_t i;
UEnumeration *ALL = ucurr_getKeywordValuesForLocale("currency", uloc_getDefault(), FALSE, &status);
if (ALL == NULL) {
@ -902,12 +895,15 @@ static void TestGetKeywordValuesForLocale(void) {
}
for (i = 0; i < PREFERRED_SIZE; i++) {
pref = NULL;
all = NULL;
loc = PREFERRED[i][0];
UEnumeration *pref = NULL;
UEnumeration *all = NULL;
const char *loc = PREFERRED[i][0];
pref = ucurr_getKeywordValuesForLocale("currency", loc, TRUE, &status);
matchPref = FALSE;
matchAll = FALSE;
UBool matchPref = FALSE;
UBool matchAll = FALSE;
int32_t size = 0, j;
const char *value = NULL, *allValue = NULL;
int32_t valueLength = 0, allValueLength = 0;
size = uenum_count(pref, &status);
@ -939,7 +935,7 @@ static void TestGetKeywordValuesForLocale(void) {
if (U_SUCCESS(status) && size == uenum_count(ALL, &status)) {
matchAll = TRUE;
ALLList = ulist_getListFromEnum(ALL);
UList *ALLList = ulist_getListFromEnum(ALL);
for (j = 0; j < size; j++) {
if ((value = uenum_next(all, &valueLength, &status)) != NULL && U_SUCCESS(status)) {
if (!ulist_containsString(ALLList, value, uprv_strlen(value))) {

View File

@ -0,0 +1,152 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2009, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
*
* File spooftest.c
*
*********************************************************************************/
/*C API TEST for the uspoof Unicode Identifier Spoofing and Security API */
/**
* This is an API test for ICU spoof detection in plain C. It doesn't test very many cases, and doesn't
* try to test the full functionality. It just calls each function and verifies that it
* works on a basic level.
*
* More complete testing of spoof detection functionality is done with the C++ tests.
**/
#include "unicode/utypes.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "unicode/uspoof.h"
#include "unicode/ustring.h"
#include "cintltst.h"
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));}}
#define TEST_CHECK_SUCCESS(status) {if (U_FAILURE(status)) { \
log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); \
goto bailout;} \
}
#define TEST_ASSERT_TRUE(expr) {if ((expr)==FALSE) { \
log_err("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);}}
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
log_err("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
#define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
log_err("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
__FILE__, __LINE__, #a, (a), #b, (b)); }}
/*
* TEST_SETUP and TEST_TEARDOWN
* macros to handle the boilerplate around setting up test case.
* Put arbitrary test code between SETUP and TEARDOWN.
* "sc" is the ready-to-go SpoofChecker for use in the tests.
*/
#define TEST_SETUP { \
UErrorCode status = U_ZERO_ERROR; \
USpoofChecker *sc; \
sc = uspoof_open(&status); \
TEST_CHECK_SUCCESS(status); \
{
#define TEST_TEARDOWN \
} \
TEST_ASSERT_SUCCESS(status); \
bailout: \
uspoof_close(sc); \
}
/*
 * Compare a UChar string against an expected invariant-character char string,
 * and log a test failure if they differ.
 *
 *   expected  The expected contents, nul terminated.
 *   actual    The UChar string to be checked.
 *   nulTerm   TRUE if actual is nul terminated and must match expected exactly;
 *             FALSE to compare only the first strlen(expected) characters.
 *   file,line Source position of the caller, for the failure message.
 */
static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
    char     buf_inside_macro[120];
    int32_t  len = (int32_t)strlen(expected);
    UBool    success;

    /* Up to len+1 chars plus a terminator are stored below; refuse anything larger. */
    if (len + 2 > (int32_t)sizeof(buf_inside_macro)) {
        log_err("Failure at file %s, line %d, expected string \"%s\" is too long to check\n",
                file, line, expected);
        return;
    }

    if (nulTerm) {
        /* Copy one char more than expected, so that an over-long actual string is detected. */
        u_austrncpy(buf_inside_macro, (actual), len+1);
        /* u_austrncpy() does not terminate when it copies the full count. */
        buf_inside_macro[len+1] = 0;
        success = (strcmp((expected), buf_inside_macro) == 0);
    } else {
        u_austrncpy(buf_inside_macro, (actual), len);
        buf_inside_macro[len] = 0;
        success = (strncmp((expected), buf_inside_macro, len) == 0);
    }
    if (success == FALSE) {
        log_err("Failure at file %s, line %d, expected \"%s\", got \"%s\"\n",
                file, line, (expected), buf_inside_macro);
    }
}
#define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__)
static void TestUSpoofCAPI(void);
void addUSpoofTest(TestNode** root);
/* Register the spoof detection C API test with the test tree rooted at root. */
void addUSpoofTest(TestNode** root)
{
addTest(root, &TestUSpoofCAPI, "uspoof/TestUSpoofCAPI");
}
/*
* Spoof Detection C API Tests
*/
/*
 * Basic C API test: read the two Unicode confusables source files from the
 * test data source directory and build a spoof checker from them.
 */
static void TestUSpoofCAPI(void) {

    TEST_SETUP
        const char    *dataSrcDir;
        char          *fileName = NULL;
        char          *confusables = NULL;
        int            confusablesLength = 0;
        char          *confusablesWholeScript = NULL;
        int            confusablesWholeScriptLength = 0;
        FILE          *f;
        UParseError    pe;
        int32_t        errType;
        USpoofChecker *rsc;

        dataSrcDir = ctest_dataSrcDir();

        /* Get all of the buffers up front so that one check covers them all. */
        fileName = malloc(strlen(dataSrcDir) + 100);
        confusables = malloc(3000000);
        confusablesWholeScript = malloc(1000000);
        if (fileName == NULL || confusables == NULL || confusablesWholeScript == NULL) {
            log_err("Failure at file %s, line %d, out of memory\n", __FILE__, __LINE__);
            goto cleanup;
        }

        strcpy(fileName, dataSrcDir);
        strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
        f = fopen(fileName, "r");
        if (f == NULL) {
            /* Nothing can be read from a file that did not open; report and stop. */
            log_err("Failure at file %s, line %d, unable to open \"%s\"\n", __FILE__, __LINE__, fileName);
            goto cleanup;
        }
        confusablesLength = (int)fread(confusables, 1, 3000000, f);
        fclose(f);

        strcpy(fileName, dataSrcDir);
        strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
        f = fopen(fileName, "r");
        if (f == NULL) {
            log_err("Failure at file %s, line %d, unable to open \"%s\"\n", __FILE__, __LINE__, fileName);
            goto cleanup;
        }
        confusablesWholeScriptLength = (int)fread(confusablesWholeScript, 1, 1000000, f);
        fclose(f);

        rsc = uspoof_openFromSource(confusables, confusablesLength,
                                    confusablesWholeScript, confusablesWholeScriptLength,
                                    &errType, &pe, &status);
        TEST_ASSERT_SUCCESS(status);
        uspoof_close(rsc);
        /* printf("ParseError Line is %d\n", pe.line); */

    cleanup:
        /* free(NULL) is harmless, so this is safe on every path to this point. */
        free(confusablesWholeScript);
        free(confusables);
        free(fileName);
    TEST_TEARDOWN;
}

View File

@ -1,5 +1,5 @@
## Makefile.in for ICU tools
## Copyright (c) 1999-2008, International Business Machines Corporation and
## Copyright (c) 1999-2009, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
@ -15,7 +15,7 @@ subdir = tools
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk genctd \
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
gentest genprops gencase genbidi gennorm
gentest genprops gencase genbidi gennorm gencfu
## List of phony targets
.PHONY : all all-local all-recursive install install-local \

View File

@ -0,0 +1,96 @@
## Makefile.in for ICU - tools/gencfu
## Copyright (c) 2009 International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = tools/gencfu
TARGET_STUB_NAME = gencfu
SECTION = 1
# MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
ifneq ($(top_builddir),$(top_srcdir))
CPPFLAGS += -I$(top_builddir)/common
endif
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = gencfu.o
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local install-man
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
all-local: $(TARGET) $(MAN_FILES)
install-local: all-local install-man
$(MKINSTALLDIRS) $(DESTDIR)$(bindir)
$(INSTALL) $(TARGET) $(DESTDIR)$(bindir)
install-man: $(MAN_FILES)
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
$(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
dist-local:
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(TARGET) $(OBJECTS)
distclean-local: clean-local
$(RMV) Makefile
check-local: all-local
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
$(TARGET) : $(OBJECTS)
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
$(POST_BUILD_STEP)
%.$(SECTION): $(srcdir)/%.$(SECTION).in
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View File

@ -0,0 +1,326 @@
/*
**********************************************************************
* Copyright (C) 2009, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
* File gencfu.c
*/
//--------------------------------------------------------------------
//
// Tool for generating Unicode Confusable data files (.cfu files).
// .cfu files contain the compiled form of the confusable data
// derived from the Unicode Consortium data described in
// Unicode UAX 39.
//
// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
//
// options: -v verbose
// -? or -h help
//
// The input rule files are plain text files containing confusable character
// definitions in the input format defined by Unicode UAX39 for the files
// confusables.txt and confusablesWholeScript.txt. This source (.txt) format
// is also accepted directly by ICU spoof detectors. The
// files must be encoded in utf-8 format, with or without a BOM.
//
//--------------------------------------------------------------------
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/uclean.h"
#include "unicode/udata.h"
#include "unicode/putil.h"
#include "uoptions.h"
#include "unewdata.h"
#include "ucmndata.h"
#include "uspoof_impl.h"
#include "cmemory.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
U_NAMESPACE_USE
static char *progName;
static UOption options[]={
UOPTION_HELP_H, /* 0 */
UOPTION_HELP_QUESTION_MARK, /* 1 */
UOPTION_VERBOSE, /* 2 */
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
UOPTION_ICUDATADIR, /* 6 */
UOPTION_DESTDIR, /* 7 */
UOPTION_COPYRIGHT, /* 8 */
};
//
// Print the command line usage summary to stdout and terminate the process.
//
//   retCode  the process exit code: 0 for an explicit help request, non-zero
//            when the function is called because of a command line error.
//
// This function does not return.
//
// Only options that are present in the options[] table are listed, so that
// everything advertised here is actually accepted by the option parser.
//
void usageAndDie(int retCode) {
    printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
    printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
        "options:\n"
        "\t-h or -? or --help this usage text\n"
        "\t-c or --copyright include a copyright notice\n"
        "\t-v or --verbose turn on verbose output\n"
        "\t-r or --rules confusables source file, followed by the path\n"
        "\t-w or --wsrules whole script confusables source file, followed by the path\n"
        "\t-o or --out output file name, followed by the name\n"
        "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
        "\t followed by path, defaults to %s\n"
        "\t-d or --destdir destination directory, followed by the path\n",
        u_getDataDirectory());
    exit(retCode);
}
//
// Data file header information.
//
// The choice between the dummy header and the real one is made with
// UCONFIG_NO_SPOOF_DETECTION, the same switch main() uses to decide which of
// the two objects it references. Guarding the definitions with a different
// switch would leave main() referring to an object that was never defined
// whenever the two switches disagree.
// A macro that is not defined evaluates to 0 in #if, which selects the real header.
//
#if UCONFIG_NO_SPOOF_DETECTION
/* dummy UDataInfo cf. udata.h */
static UDataInfo dummyDataInfo = {
    sizeof(UDataInfo),
    0,
    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    U_SIZEOF_UCHAR,
    0,
    { 0, 0, 0, 0 },                 /* dummy dataFormat */
    { 0, 0, 0, 0 },                 /* dummy formatVersion */
    { 0, 0, 0, 0 }                  /* dummy dataVersion */
};
#else
//
// Set up the ICU data header, defined in ucmndata.h
//
DataHeader dh ={
    {sizeof(DataHeader),            // Struct MappedData
        0xda,
        0x27},
    {                               // struct UDataInfo
        sizeof(UDataInfo),          // size
        0,                          // reserved
        U_IS_BIG_ENDIAN,
        U_CHARSET_FAMILY,
        U_SIZEOF_UCHAR,
        0,                          // reserved
        { 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu "
        { 0xff, 0, 0, 0 },          // formatVersion. Filled in later with values
                                    //   from the builder. The values declared
                                    //   here should never appear in any real data.
        { 5, 1, 0, 0 }              // dataVersion (Unicode version)
    }};
#endif
// Forward declaration for function for reading source files.
static const char *readFile(const char *fileName, int32_t *len);
//----------------------------------------------------------------------------
//
// main for gencfu
//
//   Parses the command line, reads the two confusables source files,
//   compiles them with uspoof_openFromSource(), serializes the result and
//   writes it, preceded by an ICU data header, to the output file.
//
//   Returns 0 on success. Every failure is reported on stderr and ends the
//   process with a non-zero exit code.
//
//----------------------------------------------------------------------------
int main(int argc, char **argv) {
    UErrorCode status = U_ZERO_ERROR;
    const char *confFileName;
    const char *confWSFileName;
    const char *outFileName;
    const char *outDir = NULL;
    const char *copyright = NULL;

    //
    // Pick up and check the command line arguments,
    // using the standard ICU tool utils option handling.
    //
    U_MAIN_INIT_ARGS(argc, argv);
    progName = argv[0];
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
    if(argc<0) {
        // Unrecognized option
        fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }

    if(options[0].doesOccur || options[1].doesOccur) {
        // -? or -h for help.
        usageAndDie(0);
    }

    if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
        fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
        usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
    }
    confFileName   = options[3].value;
    confWSFileName = options[4].value;
    outFileName    = options[5].value;

    if (options[6].doesOccur) {
        u_setDataDirectory(options[6].value);
    }

    /* Initialize ICU */
    u_init(&status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
            argv[0], u_errorName(status));
        exit(1);
    }
    status = U_ZERO_ERROR;

    /* Combine the directory with the file name */
    if(options[7].doesOccur) {
        outDir = options[7].value;
    }
    if (options[8].doesOccur) {
        copyright = U_COPYRIGHT_STRING;
    }

#if UCONFIG_NO_SPOOF_DETECTION
    // TODO: implement UCONFIG_NO_SPOOF_DETECTION in uconfig.h, or decide we don't want it and take this out.
    UNewDataMemory *pData;
    char msg[1024];

    /* write message with just the name */
    // The precision on %s bounds the length of the file name so that the
    // formatted message can never overrun msg[].
    sprintf(msg, "gencfu writes dummy %.900s because of UCONFIG_NO_SPOOF_DETECTION, see uconfig.h", outFileName);
    fprintf(stderr, "%s\n", msg);

    /* write the dummy data file */
    pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
    udata_writeBlock(pData, msg, (int32_t)strlen(msg));
    udata_finish(pData, &status);
    return (int)status;

#else
    // Read in the confusables source file
    int32_t confusablesLen = 0;
    const char *confusables = readFile(confFileName, &confusablesLen);
    if (confusables == NULL) {
        fprintf(stderr, "gencfu: error reading file \"%s\"\n", confFileName);
        exit(-1);
    }

    // Read in the whole script confusables source file.
    // On failure, report the name of this file, not of the first one.
    int32_t wsConfusablesLen = 0;
    const char *wsConfusables = readFile(confWSFileName, &wsConfusablesLen);
    if (wsConfusables == NULL) {
        fprintf(stderr, "gencfu: error reading file \"%s\"\n", confWSFileName);
        exit(-1);
    }

    //
    // Create the Spoof Detector from the source confusables files.
    // This will compile the data.
    //
    UParseError parseError;
    parseError.line = 0;
    parseError.offset = 0;
    int32_t errType = 0;     // Examined below only when the open fails; given
                             //   a defined value in case it is never set.
    USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
                                              wsConfusables, wsConfusablesLen,
                                              &errType, &parseError, &status);
    if (U_FAILURE(status)) {
        const char *errFile =
            (errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
        fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
            u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
        exit(status);
    }

    //
    // Get the compiled rule data from the USpoofChecker.
    // The first call, with no buffer, only obtains the required size and is
    // expected to report a buffer overflow.
    //
    uint32_t outDataSize;
    uint8_t *outData;
    outDataSize = uspoof_serialize(sc, NULL, 0, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
        exit(status);
    }
    status = U_ZERO_ERROR;
    outData = new uint8_t[outDataSize];
    uspoof_serialize(sc, outData, outDataSize, &status);
    if (U_FAILURE(status)) {
        // Do not go on to write a buffer that was not filled in.
        fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
        exit(status);
    }

    // Copy the data format version numbers from the spoof data header into the UDataMemory header.
    uprv_memcpy(dh.info.formatVersion,
                reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
                sizeof(dh.info.formatVersion));

    //
    // Create the output file
    //
    size_t bytesWritten;
    UNewDataMemory *pData;
    pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
                         outFileName, u_errorName(status));
        exit(status);
    }

    // Write the data itself.
    udata_writeBlock(pData, outData, outDataSize);
    // finish up
    bytesWritten = udata_finish(pData, &status);
    if(U_FAILURE(status)) {
        fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
        exit(status);
    }

    if (bytesWritten != outDataSize) {
        fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
        exit(-1);
    }

    // Clean up. All three buffers were allocated with new[], so they must be
    // released with delete[].
    uspoof_close(sc);
    delete[] outData;
    delete[] confusables;
    delete[] wsConfusables;
    u_cleanup();
    printf("gencfu: tool completed successfully.\n");
    return 0;
#endif   /* UCONFIG_NO_SPOOF_DETECTION */
}
//
//  Read in a confusables source file
//
//    fileName  path of the file to read. The file is opened in binary mode.
//    len       receives the number of bytes read. Written only on success.
//    returns   a buffer allocated with new[] that holds the file contents
//              followed by a terminating NUL byte, or NULL if the file could
//              not be opened, sized, or read completely.
//              The caller owns the buffer and must release it with delete[].
//
static const char *readFile(const char *fileName, int32_t *len) {
    FILE *file = fopen(fileName, "rb");
    if (file == NULL) {
        return NULL;
    }

    // Find the size of the file by seeking to its end. A failure of any of
    // these calls must not be turned into an allocation or read size.
    long fileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        fileSize = ftell(file);
    }
    if (fileSize < 0 || fseek(file, 0, SEEK_SET) != 0) {
        fclose(file);
        return NULL;
    }

    // Some slack beyond the file contents, which also holds the terminating NUL.
    char *result = new char[fileSize+10];
    if (result == NULL) {
        fclose(file);
        return NULL;
    }

    size_t bytesRead = fread(result, 1, (size_t)fileSize, file);
    // The file is no longer needed, whether or not the read succeeded.
    fclose(file);
    if (bytesRead != (size_t)fileSize) {
        delete[] result;
        return NULL;
    }

    result[fileSize] = 0;
    *len = static_cast<int32_t>(fileSize);
    return result;
}

View File

@ -0,0 +1,404 @@
<?xml version="1.0" encoding="UTF-8"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Name="gencfu"
ProjectGUID="{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}"
Keyword="Win32Proj"
TargetFrameworkVersion="0"
>
<Platforms>
<Platform
Name="Win32"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Debug|Win32"
OutputDirectory="Debug"
IntermediateDirectory="Debug"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin"
Outputs="..\..\..\bin\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
MinimalRebuild="false"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
BufferSecurityCheck="true"
DisableLanguageExtensions="true"
UsePrecompiledHeader="0"
AssemblerListingLocation=".\x86\Debug/"
ObjectFile=".\x86\Debug/"
ProgramDataBaseFileName=".\x86\Debug/"
BrowseInformation="1"
WarningLevel="3"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="4"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x86\Debug\gencfu.exe"
LinkIncremental="2"
SuppressStartupBanner="true"
GenerateDebugInformation="true"
SubSystem="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|Win32"
OutputDirectory="Release"
IntermediateDirectory="Release"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="false"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin"
Outputs="..\..\..\bin\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;"
StringPooling="true"
MinimalRebuild="false"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
UsePrecompiledHeader="0"
AssemblerListingLocation=".\x86\Release/"
ObjectFile=".\x86\Release/"
ProgramDataBaseFileName=".\x86\Release/"
WarningLevel="3"
Detect64BitPortabilityProblems="true"
DebugInformationFormat="3"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x86\Release\gencfu.exe"
LinkIncremental="1"
GenerateDebugInformation="true"
SubSystem="1"
RandomizedBaseAddress="1"
DataExecutionPrevention="0"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|x64"
OutputDirectory=".\x64\Release"
IntermediateDirectory=".\x64\Release"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="false"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
Outputs="..\..\..\bin64\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<!-- Intermediate files are named after this project, gencfu. -->
<Tool
Name="VCMIDLTool"
TargetEnvironment="3"
TypeLibraryName=".\x64\Release/gencfu.tlb"
/>
<!-- ..\..\i18n is on the include path, as in the Win32 configurations of this project. -->
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
PreprocessorDefinitions="WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x64\Release/gencfu.pch"
AssemblerListingLocation=".\x64\Release/"
ObjectFile=".\x64\Release/"
ProgramDataBaseFileName=".\x64\Release/"
WarningLevel="3"
SuppressStartupBanner="true"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<!-- The output must be gencfu.exe: the custom build step of this configuration
     copies $(TargetPath) into bin64. -->
<Tool
Name="VCLinkerTool"
OutputFile=".\x64\Release/gencfu.exe"
LinkIncremental="1"
SuppressStartupBanner="true"
ProgramDatabaseFile=".\x64\Release/gencfu.pdb"
SubSystem="1"
TargetMachine="17"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Debug|x64"
OutputDirectory=".\x64\Debug"
IntermediateDirectory=".\x64\Debug"
ConfigurationType="1"
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
UseOfMFC="0"
ATLMinimizesCRunTimeLibraryUsage="false"
CharacterSet="2"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
Outputs="..\..\..\bin64\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
TargetEnvironment="3"
TypeLibraryName=".\x64\Debug/gencfu.tlb"
/>
<!-- ..\..\i18n is on the include path, as in the Win32 configurations of this project. -->
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
PreprocessorDefinitions="WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
BufferSecurityCheck="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x64\Debug/gencfu.pch"
AssemblerListingLocation=".\x64\Debug/"
ObjectFile=".\x64\Debug/"
ProgramDataBaseFileName=".\x64\Debug/"
BrowseInformation="1"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="3"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x64\Debug/gencfu.exe"
LinkIncremental="2"
SuppressStartupBanner="true"
GenerateDebugInformation="true"
ProgramDatabaseFile=".\x64\Debug/gencfu.pdb"
SubSystem="1"
TargetMachine="17"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
UseFAT32Workaround="true"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCWebDeploymentTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl;inc;xsd"
UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
>
</Filter>
<Filter
Name="Resource Files"
Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
>
</Filter>
<Filter
Name="Source Files"
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
>
<File
RelativePath=".\gencfu.cpp"
>
</File>
</Filter>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

View File

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2008, International Business Machines
* Copyright (C) 2005-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -51,6 +51,8 @@
/* swapping implementations in i18n */
#include "uspoof_impl.h"
/* definitions */
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -556,7 +558,9 @@ static const struct {
{ { 0x54, 0x72, 0x44, 0x63 }, triedict_swap }, /* dataFormat="TrDc " */
#endif
{ { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames } /* dataFormat="unam" */
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */
{ { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap } /* dataFormat="Cfu " */
};
U_CAPI int32_t U_EXPORT2

View File

@ -50,7 +50,7 @@
<Tool
Name="VCCLCompilerTool"
WholeProgramOptimization="true"
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
AdditionalIncludeDirectories="..\..\..\include,..\..\common,..\..\i18n"
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;U_TOOLUTIL_IMPLEMENTATION"
StringPooling="true"
RuntimeLibrary="2"
@ -145,7 +145,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
AdditionalIncludeDirectories="..\..\..\include,..\..\common,..\..\i18n"
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;U_TOOLUTIL_IMPLEMENTATION"
BasicRuntimeChecks="3"
RuntimeLibrary="3"