ICU-4790 spoof impl merged to trunk.
X-SVN-Rev: 25534
This commit is contained in:
parent
a5894c4401
commit
9715eae02c
@ -239,6 +239,13 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "letest", "..\test\letest\le
|
||||
{37FC2C7F-1904-4811-8955-2F478830EAD1} = {37FC2C7F-1904-4811-8955-2F478830EAD1}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gencfu", "..\tools\gencfu\gencfu.vcproj", "{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}"
|
||||
ProjectSection(ProjectDependencies) = postProject
|
||||
{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
|
||||
{6B231032-3CB5-4EED-9210-810D666A23A0} = {6B231032-3CB5-4EED-9210-810D666A23A0}
|
||||
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
@ -511,6 +518,12 @@ Global
|
||||
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|Win32.Build.0 = Release|Win32
|
||||
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|x64.ActiveCfg = Release|x64
|
||||
{67351485-4D18-4245-BE39-A7EF0675ACD2}.Release|x64.Build.0 = Release|x64
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.Build.0 = Release|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.ActiveCfg = Release|Win32
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1997-2008, International Business Machines
|
||||
* Copyright (C) 1997-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
@ -569,7 +569,9 @@ uhash_init(UHashtable *fillinResult,
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_close(UHashtable *hash) {
|
||||
U_ASSERT(hash != NULL);
|
||||
if (hash == NULL) {
|
||||
return;
|
||||
}
|
||||
if (hash->elements != NULL) {
|
||||
if (hash->keyDeleter != NULL || hash->valueDeleter != NULL) {
|
||||
int32_t pos=-1;
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1997-2007, International Business Machines
|
||||
* Copyright (C) 1997-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
@ -246,7 +246,7 @@ uhash_init(UHashtable *hash,
|
||||
|
||||
/**
|
||||
* Close a UHashtable, releasing the memory used.
|
||||
* @param hash The UHashtable to close.
|
||||
* @param hash The UHashtable to close. If hash is NULL no operation is performed.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uhash_close(UHashtable *hash);
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1999-2004, International Business Machines Corporation and *
|
||||
* Copyright (C) 1999-2009, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
* Date Name Description
|
||||
@ -10,6 +10,7 @@
|
||||
|
||||
#include "uvector.h"
|
||||
#include "cmemory.h"
|
||||
#include "uarrsort.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
@ -466,5 +467,74 @@ void UVector::sortedInsert(UHashTok tok, USortComparator *compare, UErrorCode& e
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Array sort comparator function.
|
||||
* Used from UVector::sort()
|
||||
* Conforms to function signature required for uprv_sortArray().
|
||||
* This function is essentially just a wrapper, to make a
|
||||
* UVector style comparator function usable with uprv_sortArray().
|
||||
*
|
||||
* The context pointer to this function is a pointer back
|
||||
* (with some extra indirection) to the user supplied comparator.
|
||||
*
|
||||
*/
|
||||
static int32_t U_CALLCONV
sortComparator(const void *context, const void *left, const void *right) {
    // context holds the address of a pointer-to-function variable, not the
    // function pointer itself, so one extra dereference is needed to call it.
    USortComparator * const *userCompare = static_cast<USortComparator * const *>(context);
    const UHashTok *lhs = static_cast<const UHashTok *>(left);
    const UHashTok *rhs = static_cast<const UHashTok *>(right);

    // The elements are handed to the UVector-style comparator by value.
    return (**userCompare)(*lhs, *rhs);
}
|
||||
|
||||
|
||||
/**
|
||||
* Array sort comparison function for use from UVector::sorti()
|
||||
* Compares int32_t vector elements.
|
||||
*/
|
||||
static int32_t U_CALLCONV
sortiComparator(const void * /*context */, const void *left, const void *right) {
    const UHashTok *lhs = static_cast<const UHashTok *>(left);
    const UHashTok *rhs = static_cast<const UHashTok *>(right);

    // Three-way comparison of the integer members: -1, 0 or 1.
    if (lhs->integer < rhs->integer) {
        return -1;
    }
    if (lhs->integer == rhs->integer) {
        return 0;
    }
    return 1;
}
|
||||
|
||||
/**
|
||||
* Sort the vector, assuming it contains ints.
|
||||
* (A more general sort would take a comparison function, but it's
|
||||
* not clear whether UVector's USortComparator or
|
||||
* UComparator from uprv_sortArray would be more appropriate.)
|
||||
*/
|
||||
void UVector::sorti(UErrorCode &ec) {
    // Do nothing when the incoming error code already signals a failure.
    if (!U_SUCCESS(ec)) {
        return;
    }
    // No context is needed: sortiComparator compares the integer members directly.
    uprv_sortArray(elements, count, sizeof(UHashTok),
                   sortiComparator, NULL, FALSE, &ec);
}
|
||||
|
||||
|
||||
/**
|
||||
* Sort with a user supplied comparator.
|
||||
*
|
||||
* The comparator function handling is confusing because the function type
|
||||
* for UVector (as defined for sortedInsert()) is different from the signature
|
||||
* required by uprv_sortArray(). This is handled by passing the
|
||||
* UVector sort function pointer via the context pointer to a
|
||||
* sortArray() comparator function, which can then call back to
|
||||
* the original user function.
|
||||
*
|
||||
* An additional twist is that it's not safe to pass a pointer-to-function
|
||||
* as a (void *) data pointer, so instead we pass a (data) pointer to a
|
||||
* pointer-to-function variable.
|
||||
*/
|
||||
void UVector::sort(USortComparator *compare, UErrorCode &ec) {
    // Do nothing when the incoming error code already signals a failure.
    if (!U_SUCCESS(ec)) {
        return;
    }
    // Pass the address of the local function-pointer variable as the context;
    // sortComparator dereferences it to reach the caller's comparator.
    uprv_sortArray(elements, count, sizeof(UHashTok),
                   sortComparator, &compare, FALSE, &ec);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2006, International Business Machines
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
@ -245,6 +245,20 @@ public:
|
||||
*/
|
||||
void sortedInsert(int32_t obj, USortComparator *compare, UErrorCode& ec);
|
||||
|
||||
/**
|
||||
* Sort the contents of the vector, assuming that the contents of the
|
||||
* vector are of type int32_t.
|
||||
*/
|
||||
void sorti(UErrorCode &ec);
|
||||
|
||||
/**
|
||||
* Sort the contents of this vector, using a caller-supplied function
|
||||
* to do the comparisons. (It's confusing that
|
||||
* UVector's USortComparator function is different from the
|
||||
* UComparator function type defined in uarrsort.h)
|
||||
*/
|
||||
void sort(USortComparator *compare, UErrorCode &ec);
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
*/
|
||||
|
3
icu4c/source/configure
vendored
3
icu4c/source/configure
vendored
@ -10170,7 +10170,7 @@ then
|
||||
fi
|
||||
|
||||
# output the Makefiles
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
@ -10807,6 +10807,7 @@ do
|
||||
"tools/icuswap/Makefile") CONFIG_FILES="$CONFIG_FILES tools/icuswap/Makefile" ;;
|
||||
"tools/pkgdata/Makefile") CONFIG_FILES="$CONFIG_FILES tools/pkgdata/Makefile" ;;
|
||||
"tools/tzcode/Makefile") CONFIG_FILES="$CONFIG_FILES tools/tzcode/Makefile" ;;
|
||||
"tools/gencfu/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencfu/Makefile" ;;
|
||||
"test/Makefile") CONFIG_FILES="$CONFIG_FILES test/Makefile" ;;
|
||||
"test/compat/Makefile") CONFIG_FILES="$CONFIG_FILES test/compat/Makefile" ;;
|
||||
"test/testdata/Makefile") CONFIG_FILES="$CONFIG_FILES test/testdata/Makefile" ;;
|
||||
|
@ -1116,6 +1116,7 @@ AC_CONFIG_FILES([icudefs.mk \
|
||||
tools/icuswap/Makefile \
|
||||
tools/pkgdata/Makefile \
|
||||
tools/tzcode/Makefile \
|
||||
tools/gencfu/Makefile \
|
||||
test/Makefile \
|
||||
test/compat/Makefile \
|
||||
test/testdata/Makefile \
|
||||
|
@ -230,6 +230,11 @@ BRS_SRC_FILES = $(BRS_SRC:%=$(BRKSRCDIR)/%)
|
||||
INSTALLED_BRS_FILES = $(BRK_RES_SOURCE:%.txt=%) $(BRK_RES_SOURCE_LOCAL:%.txt=%)
|
||||
endif
|
||||
|
||||
## Confusables (Spoofing) files
|
||||
ALL_CFU_SOURCE=$(UNICODEDATADIR)/confusables.txt $(UNICODEDATADIR)/confusablesWholeScript.txt
|
||||
CFU_FILES_SHORT=confusables.cfu
|
||||
CFU_FILES=$(BUILDDIR)/$(CFU_FILES_SHORT)
|
||||
|
||||
## UCM files
|
||||
-include $(UCMSRCDIR)/ucmcore.mk
|
||||
-include $(UCMSRCDIR)/ucmfiles.mk
|
||||
@ -331,10 +336,10 @@ SPREP_FILES = $(ALL_SPREP_SOURCE:%.txt=$(BUILDDIR)/%.spp)
|
||||
SPREP_FILES_SHORT = $(ALL_SPREP_SOURCE:%.txt=%.spp)
|
||||
|
||||
## All generated files
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES)
|
||||
ALL_FILES = $(DAT_FILES) $(CNV_FILES) $(BRK_FILES) $(CTD_FILES) $(RES_FILES) $(INDEX_RES_FILE) $(COLLATION_FILES) $(BRK_RES_FILES) $(RBNF_FILES) $(TRANSLIT_FILES) $(SPREP_FILES) $(CFU_FILES)
|
||||
ALL_INDEX_SRC_FILES = $(PKGDATA_LIST) $(INDEX_FILE) $(COLLATION_INDEX_FILE) $(BRK_RES_INDEX_FILE) $(RBNF_INDEX_FILE)
|
||||
# a list to use in the .lst files (package-relative)
|
||||
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT)
|
||||
ALL_FILES_LIST = $(DAT_FILES_SHORT) $(CNV_FILES_SHORT) $(BRK_FILES_SHORT) $(CTD_FILES_SHORT) $(RES_FILES_SHORT) $(INDEX_RES_FILE_SHORT) $(COLLATION_FILES_SHORT) $(COLLATION_INDEX_RES_SHORT) $(BRK_RES_FILES_SHORT) $(BRK_RES_INDEX_RES_SHORT) $(RBNF_FILES_SHORT) $(RBNF_INDEX_RES_SHORT) $(TRANSLIT_FILES_SHORT) $(SPREP_FILES_SHORT) $(CFU_FILES_SHORT)
|
||||
|
||||
UNI_CORE_DATA=uprops.icu ucase.icu ubidi.icu unorm.icu
|
||||
UNI_CORE_TARGET_DATA=$(UNI_CORE_DATA:%=$(BUILDDIR)/%)
|
||||
@ -452,6 +457,20 @@ $(BRKBLDDIR)/%.brk: $(BRKSRCDIR)/%.txt $(BINDIR)/genbrk$(EXEEXT) $(DAT_FILES)
|
||||
$(BRKBLDDIR)/%.ctd: $(BRKSRCDIR)/%.txt $(BINDIR)/genctd$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) $(BINDIR)/genctd -c -i $(BUILDDIR) -o $@ $<
|
||||
|
||||
#################################################### CFU
|
||||
# CFU FILES
|
||||
# Note: gencfu requires two input files to produce a single output file.
|
||||
# There will be exactly one target file and two source files.
|
||||
# The $(word n, ...) selects the nth word from the following stuff.
|
||||
# There must be a nicer way to do this.
|
||||
|
||||
$(CFU_FILES): $(ALL_CFU_SOURCE) $(BINDIR)/gencfu$(EXEEXT) $(DAT_FILES)
|
||||
$(INVOKE) echo ALL_CFU_SOURCE: $(ALL_CFU_SOURCE)
|
||||
$(INVOKE) echo CFU_FILES: $(CFU_FILES)
|
||||
$(INVOKE) echo CFU_FILES_SHORT: $(CFU_FILES_SHORT)
|
||||
$(INVOKE) $(BINDIR)/gencfu -c -i $(BUILDDIR) -r $(word 1,$(ALL_CFU_SOURCE)) -w $(word 2,$(ALL_CFU_SOURCE)) -o $@
|
||||
|
||||
|
||||
#################################################### CNV
|
||||
# CNV FILES
|
||||
$(BUILDDIR)/%.cnv: $(UCMSRCDIR)/%.ucm $(BINDIR)/makeconv$(EXEEXT)
|
||||
|
@ -422,12 +422,13 @@ uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(IC
|
||||
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
|
||||
-@erase "$(ICUTMP)\$(ICUPKG).dat"
|
||||
!ELSE
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES)
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
@echo Building icu data
|
||||
cd "$(ICUBLD_PKG)"
|
||||
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
|
||||
pnames.icu
|
||||
unames.icu
|
||||
confusables.cfu
|
||||
$(ICUCOL)\ucadata.icu
|
||||
$(ICUCOL)\invuca.icu
|
||||
cnvalias.icu
|
||||
@ -486,6 +487,7 @@ CLEAN : GODATA
|
||||
-@erase "*.res"
|
||||
-@erase "*.spp"
|
||||
-@erase "*.txt"
|
||||
-@erase "*.cfu"
|
||||
@cd "$(ICUBLD_PKG)\$(ICUBRK)"
|
||||
-@erase "*.brk"
|
||||
-@erase "*.ctd"
|
||||
@ -497,7 +499,7 @@ CLEAN : GODATA
|
||||
@cd "$(ICUBLD_PKG)\$(ICURBNF)"
|
||||
-@erase "*.res"
|
||||
-@erase "*.txt"
|
||||
@cd "$(ICUBLD_PKG)\$(ICUTRNS)"
|
||||
@cd "$(ICUBLD_PKG)\$(ICUTRNS)"
|
||||
-@erase "*.res"
|
||||
@cd "$(ICUOUT)"
|
||||
-@erase "*.dat"
|
||||
@ -673,6 +675,12 @@ res_index:table(nofallback) {
|
||||
@echo Creating $@
|
||||
@"$(ICUTOOLS)\gensprep\$(CFG)\gensprep" -s $(<D) -d "$(ICUBLD_PKG)" -b $(@B) -m "$(ICUUNIDATA)" -u 3.2.0 $(<F)
|
||||
|
||||
# Confusables .cfu file generation
|
||||
# Can't use an inference rule because two .txt source files combine to produce a single .cfu output file
|
||||
"$(ICUBLD_PKG)\confusables.cfu": "$(ICUUNIDATA)\confusables.txt" "$(ICUUNIDATA)\confusablesWholeScript.txt" "$(ICUTOOLS)\gencfu\$(CFG)\gencfu.exe"
|
||||
@echo Creating $@
|
||||
@"$(ICUTOOLS)\gencfu\$(CFG)\gencfu" -c -r "$(ICUUNIDATA)\confusables.txt" -w "$(ICUUNIDATA)\confusablesWholeScript.txt" -o $@ -i "$(ICUBLD_PKG)"
|
||||
|
||||
!IFDEF ICUDATA_ARCHIVE
|
||||
"$(ICUDATA_SOURCE_ARCHIVE)": CREATE_DIRS $(ICUDATA_ARCHIVE) "$(ICUTOOLS)\icupkg\$(CFG)\icupkg.exe"
|
||||
"$(ICUTOOLS)\icupkg\$(CFG)\icupkg" -t$(U_ICUDATA_ENDIAN_SUFFIX) "$(ICUDATA_ARCHIVE)" "$(ICUDATA_SOURCE_ARCHIVE)"
|
||||
|
27701
icu4c/source/data/unidata/confusables.txt
Normal file
27701
icu4c/source/data/unidata/confusables.txt
Normal file
File diff suppressed because it is too large
Load Diff
4516
icu4c/source/data/unidata/confusablesWholeScript.txt
Normal file
4516
icu4c/source/data/unidata/confusablesWholeScript.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -81,7 +81,8 @@ ulocdata.o measfmt.o currfmt.o curramt.o currunit.o measure.o utmscale.o \
|
||||
csdetect.o csmatch.o csr2022.o csrecog.o csrmbcs.o csrsbcs.o csrucode.o csrutf8.o inputext.o \
|
||||
wintzimpl.o windtfmt.o winnmfmt.o basictz.o dtrule.o rbtz.o tzrule.o tztrans.o vtzone.o \
|
||||
zonemeta.o zstrfmt.o plurrule.o plurfmt.o dtitvfmt.o dtitvinf.o \
|
||||
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o
|
||||
tmunit.o tmutamt.o tmutfmt.o colldata.o bmsearch.o bms.o currpinf.o \
|
||||
uspoof.o uspoof_impl.o uspoof_build.o uspoof_buildconf.o uspoof_buildwsconf.o
|
||||
|
||||
## Header files to install
|
||||
HEADERS = $(srcdir)/unicode/*.h
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -29,6 +29,8 @@
|
||||
#ifdef XP_CPLUSPLUS
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uniset.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
#endif
|
||||
|
||||
|
||||
@ -133,8 +135,8 @@ typedef enum USpoofChecks {
|
||||
USPOOF_WHOLE_SCRIPT_CONFUSABLE = 4,
|
||||
|
||||
/** Modifier for single, mixed & whole script checks.
|
||||
Selects between Lower Case Confusable (0) and
|
||||
Any Case Confusable (1). */
|
||||
Selects between Lower Case Confusable and
|
||||
Any Case Confusable. */
|
||||
USPOOF_ANY_CASE = 8,
|
||||
|
||||
/** Check that an identifier contains only characters from a
|
||||
@ -146,15 +148,13 @@ typedef enum USpoofChecks {
|
||||
/** Check an identifier for the presence of invisible
|
||||
* characters, such as zero-width spaces, or character sequences that are
|
||||
* likely not to display, such as multiple occurrences of the same
|
||||
* non-spacing mark. This does not test the input string as a whole
|
||||
* non-spacing mark. This check does not test the input string as a whole
|
||||
* for conformance to any particular syntax for identifiers.
|
||||
*/
|
||||
USPOOF_INVISIBLE = 32,
|
||||
|
||||
USPOOF_LOCALE_LIMIT = 64,
|
||||
USPOOF_CHAR_LIMIT = 128,
|
||||
USPOOF_CHAR_LIMIT = 64,
|
||||
USPOOF_ALL_CHECKS = 0x7f
|
||||
};
|
||||
} USpoofChecks;
|
||||
|
||||
|
||||
/**
|
||||
@ -298,10 +298,20 @@ uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status);
|
||||
* Supplying an empty string removes all restrictions;
|
||||
* characters from any script will be allowed.
|
||||
*
|
||||
* The USPOOF_LOCALE_LIMIT test is automatically enabled for this
|
||||
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
|
||||
* USpoofChecker when calling this function with a non-empty set
|
||||
* of locales.
|
||||
*
|
||||
* The Unicode Set of characters that will be allowed is accessible
|
||||
* via the uspoof_getAllowedChars() function. uspoof_setAllowedLocales()
|
||||
* will <i>replace</i> any previously applied set of allowed characters.
|
||||
*
|
||||
* Adjustments, such as additions or deletions of certain classes of characters,
|
||||
* can be made to the result of uspoof_setAllowedLocales() by
|
||||
* fetching the resulting set with uspoof_getAllowedChars(),
|
||||
* manipulating it with the Unicode Set API, then resetting the
|
||||
* spoof detector's limits with uspoof_setAllowedChars().
|
||||
*
|
||||
* @param sc The USpoofChecker
|
||||
* @param localesList A list of locales, from which the language
|
||||
* and associated script are extracted. The list
|
||||
@ -318,6 +328,8 @@ uspoof_setAllowedLocales(USpoofChecker *sc, const char *localesList, UErrorCode
|
||||
* to be checked. If no limitations on scripts have been specified,
|
||||
* an empty string will be returned.
|
||||
*
|
||||
* uspoof_setAllowedChars() will reset the list of allowed locales to be empty.
|
||||
*
|
||||
* The format of the returned list is that of an HTTP Accept-Language
|
||||
* header field, but it may not be identical to the original string passed
|
||||
* to uspoof_setAllowedLocales(); the string may be
|
||||
@ -339,7 +351,8 @@ uspoof_getAllowedLocales(USpoofChecker *sc, UErrorCode *status);
|
||||
/**
|
||||
* Limit the acceptable characters to those specified by a Unicode Set.
|
||||
* Any previously specified character limit
|
||||
* is replaced by the new settings.
|
||||
* is replaced by the new settings. This includes limits on
|
||||
* characters that were set with the uspoof_setAllowedLocales() function.
|
||||
*
|
||||
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
|
||||
* USpoofChecker by this function.
|
||||
@ -381,14 +394,15 @@ uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status)
|
||||
* the USPOOF_CHAR_LIMIT test.
|
||||
*/
|
||||
U_DRAFT const USet * U_EXPORT2
|
||||
uspoof_getAllowedChars(USpoofChecker *sc, UErrorCode *status);
|
||||
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status);
|
||||
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
/**
|
||||
* Limit the acceptable characters to those specified by a Unicode Set.
|
||||
* Any previously specified character limit
|
||||
* is replaced by the new settings.
|
||||
* is replaced by the new settings. This includes limits on
|
||||
* characters that were set with the uspoof_setAllowedLocales() function.
|
||||
*
|
||||
* The USPOOF_CHAR_LIMIT test is automatically enabled for this
|
||||
* USpoofChecker by this function.
|
||||
@ -425,7 +439,7 @@ uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCo
|
||||
* the USPOOF_CHAR_LIMIT test.
|
||||
*/
|
||||
U_DRAFT const UnicodeSet * U_EXPORT2
|
||||
uspoof_getAllowedUnicodeSet(USpoofChecker *sc, UErrorCode *status);
|
||||
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status);
|
||||
#endif
|
||||
|
||||
|
||||
@ -441,10 +455,11 @@ uspoof_getAllowedUnicodeSet(USpoofChecker *sc, UErrorCode *status);
|
||||
* 16 bit UTF-16 code units, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails one of the checks.
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes all of the requested checks the
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
@ -473,15 +488,18 @@ uspoof_check(const USpoofChecker *sc,
|
||||
* @param length the length of the string to be checked, or -1 if the string is
|
||||
* zero terminated.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails one of the checks.
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes all of the requested checks the
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
* Spoofing or security issues detected with the input string are
|
||||
* not reported here, but through the function's return value.
|
||||
* If the input contains invalid UTF-8 sequences,
|
||||
* a status of U_INVALID_CHAR_FOUND will be returned.
|
||||
* @return An integer value with bits set for any potential security
|
||||
* or spoofing issues detected. The bits are defined by
|
||||
* enum USpoofChecks. Zero is returned if no issues
|
||||
@ -504,10 +522,11 @@ uspoof_checkUTF8(const USpoofChecker *sc,
|
||||
* @param sc The USpoofChecker
|
||||
* @param text A UnicodeString to be checked for possible security issues.
|
||||
* @param position An out parameter that receives the index of the
|
||||
* first string position that fails one of the checks.
|
||||
* first string position that fails the allowed character
|
||||
* limitation checks.
|
||||
* This parameter may be null if the position information
|
||||
* is not needed.
|
||||
* If the string passes all of the requested checks the
|
||||
* If the string passes the requested checks the
|
||||
* parameter value will not be set.
|
||||
* @param status The error code, set if an error occurred while attempting to
|
||||
* perform the check.
|
||||
@ -645,7 +664,7 @@ U_DRAFT int32_t U_EXPORT2
|
||||
uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
||||
const U_NAMESPACE_QUALIFIER UnicodeString &s1,
|
||||
const U_NAMESPACE_QUALIFIER UnicodeString &s2,
|
||||
int32_t *position,
|
||||
int32_t *position,
|
||||
UErrorCode *status);
|
||||
#endif
|
||||
|
||||
@ -684,7 +703,7 @@ uspoof_areConfusableUnicodeString(const USpoofChecker *sc,
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uspoof_getSkeleton(const USpoofChecker *sc,
|
||||
USpoofChecks type,
|
||||
uint32_t type,
|
||||
const UChar *s, int32_t length,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
@ -726,7 +745,7 @@ uspoof_getSkeleton(const USpoofChecker *sc,
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
||||
USpoofChecks type,
|
||||
uint32_t type,
|
||||
const char *s, int32_t length,
|
||||
char *dest, int32_t destCapacity,
|
||||
UErrorCode *status);
|
||||
@ -762,7 +781,7 @@ uspoof_getSkeletonUTF8(const USpoofChecker *sc,
|
||||
*/
|
||||
U_DRAFT UnicodeString & U_EXPORT2
|
||||
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
|
||||
USpoofChecks type,
|
||||
uint32_t type,
|
||||
const UnicodeString &s,
|
||||
UnicodeString &dest,
|
||||
UErrorCode *status);
|
||||
|
540
icu4c/source/i18n/uspoof.cpp
Normal file
540
icu4c/source/i18n/uspoof.cpp
Normal file
@ -0,0 +1,540 @@
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2009, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
* file name: uspoof.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2008Feb13
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Unicode Spoof Detection
|
||||
*/
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uassert.h"
|
||||
|
||||
#include <stdio.h> // debug
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
||||
// Create a spoof checker backed by the data from SpoofData::getDefault().
// Returns NULL and leaves a failure code in *status on any error,
// including failure to allocate the implementation object.
U_CAPI USpoofChecker * U_EXPORT2
uspoof_open(UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    SpoofImpl *si = new SpoofImpl(SpoofData::getDefault(*status), *status);
    if (si == NULL) {
        // Allocation failed.  Report it the same way uspoof_openFromSerialized()
        // does, but keep any failure code that was already recorded.
        if (U_SUCCESS(*status)) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if (U_FAILURE(*status)) {
        delete si;
        return NULL;
    }
    return reinterpret_cast<USpoofChecker *>(si);
}
|
||||
|
||||
|
||||
// Create a spoof checker from a serialized data image.
// On success, *pActualLength (when non-NULL) receives the length recorded
// in the data.  On any failure NULL is returned and *status holds the error.
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSerialized(const void *data, int32_t length, int32_t *pActualLength,
                          UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return NULL;
    }
    SpoofData *spoofData = new SpoofData(data, length, *status);
    SpoofImpl *impl = new SpoofImpl(spoofData, *status);

    // An allocation failure is only reported when no other error is pending.
    if (U_SUCCESS(*status) && (spoofData == NULL || impl == NULL)) {
        *status = U_MEMORY_ALLOCATION_ERROR;
    }
    if (U_FAILURE(*status)) {
        // NOTE(review): if SpoofImpl's destructor also releases the SpoofData
        // it was constructed with, deleting both here releases it twice --
        // confirm the ownership rules in uspoof_impl.
        delete spoofData;
        delete impl;
        return NULL;
    }

    if (pActualLength != NULL) {
        *pActualLength = spoofData->fRawData->fLength;
    }
    return reinterpret_cast<USpoofChecker *>(impl);
}
|
||||
|
||||
|
||||
// Create a copy of an existing spoof checker.
// Returns NULL with a failure code in *status if sc does not validate,
// if the copy cannot be allocated, or if the copy constructor reports an error.
U_CAPI USpoofChecker * U_EXPORT2
uspoof_clone(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *src = SpoofImpl::validateThis(sc, *status);
    if (src == NULL) {
        return NULL;
    }
    SpoofImpl *result = new SpoofImpl(*src, *status);   // copy constructor
    if (result == NULL) {
        // Allocation failed.  Report it the same way uspoof_openFromSerialized()
        // does, but keep any failure code that was already recorded.
        if (U_SUCCESS(*status)) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    if (U_FAILURE(*status)) {
        delete result;
        return NULL;
    }
    return reinterpret_cast<USpoofChecker *>(result);
}
|
||||
|
||||
|
||||
// Destroy a spoof checker.  The handle is validated first; the validation
// status is kept in a local and never reported to the caller.
U_CAPI void U_EXPORT2
uspoof_close(USpoofChecker *sc) {
    UErrorCode localStatus = U_ZERO_ERROR;
    SpoofImpl *impl = SpoofImpl::validateThis(sc, localStatus);
    // Deleting a NULL pointer is a no-op, so no explicit NULL check is needed.
    delete impl;
}
|
||||
|
||||
|
||||
// Set the checks that this spoof checker will perform.
// Any bit outside USPOOF_ALL_CHECKS is rejected with U_ILLEGAL_ARGUMENT_ERROR
// and the current setting is left unchanged.
U_CAPI void U_EXPORT2
uspoof_setChecks(USpoofChecker *sc, int32_t checks, UErrorCode *status) {
    SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl == NULL) {
        return;
    }

    // Isolate any requested bits that are not known check values.
    const int32_t unknownBits = checks & ~USPOOF_ALL_CHECKS;
    if (unknownBits != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    impl->fChecks = checks;
}
|
||||
|
||||
|
||||
// Return the checks currently enabled for this spoof checker,
// or 0 when the handle does not validate.
U_CAPI int32_t U_EXPORT2
uspoof_getChecks(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    int32_t enabledChecks = 0;
    if (impl != NULL) {
        enabledChecks = impl->fChecks;
    }
    return enabledChecks;
}
|
||||
|
||||
// Placeholder: only validates the handle.  The locales list is not yet used.
U_CAPI void U_EXPORT2
uspoof_setAllowedLocales(USpoofChecker *sc, const char * /*localesList*/, UErrorCode *status) {
    SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl != NULL) {
        // TODO:
    }
}
|
||||
|
||||
|
||||
// uspoof_getAllowedChars   C API flavor of uspoof_getAllowedUnicodeSet().
//   Returns the same set, re-typed as a (const USet *).
U_CAPI const USet * U_EXPORT2
uspoof_getAllowedChars(const USpoofChecker *sc, UErrorCode *status) {
    return reinterpret_cast<const USet *>(uspoof_getAllowedUnicodeSet(sc, status));
}
|
||||
|
||||
// uspoof_getAllowedUnicodeSet   Return the checker's set of allowed characters.
//   The returned set remains owned by the checker.
//   Returns NULL if the checker handle fails validation.
U_CAPI const UnicodeSet * U_EXPORT2
uspoof_getAllowedUnicodeSet(const USpoofChecker *sc, UErrorCode *status) {
    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    const UnicodeSet *allowed = NULL;
    if (impl != NULL) {
        allowed = impl->fAllowedCharsSet;
    }
    return allowed;
}
|
||||
|
||||
|
||||
// uspoof_setAllowedChars   C API flavor of uspoof_setAllowedUnicodeSet().
//   Re-types the USet as a UnicodeSet and forwards the call.
U_CAPI void U_EXPORT2
uspoof_setAllowedChars(USpoofChecker *sc, const USet *chars, UErrorCode *status) {
    uspoof_setAllowedUnicodeSet(sc, reinterpret_cast<const UnicodeSet *>(chars), status);
}
|
||||
|
||||
|
||||
// uspoof_setAllowedUnicodeSet   Limit the characters acceptable to the checker.
//
//   chars    the set of allowed characters.  It is cloned; the caller keeps
//            ownership of the original.  A NULL or bogus set is rejected with
//            U_ILLEGAL_ARGUMENT_ERROR.
//   status   in/out ICU error code.  U_MEMORY_ALLOCATION_ERROR if the set
//            could not be cloned.
//
//   On success the previous allowed set is deleted, the frozen clone is
//   installed, and the USPOOF_CHAR_LIMIT check is turned on.
//   On any failure the checker is left unchanged.
//
U_CAPI void U_EXPORT2
uspoof_setAllowedUnicodeSet(USpoofChecker *sc, const UnicodeSet *chars, UErrorCode *status) {
    SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (This == NULL) {
        return;
    }
    if (chars == NULL || chars->isBogus()) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    UnicodeSet *clonedSet = static_cast<UnicodeSet *>(chars->clone());
    if (clonedSet == NULL || clonedSet->isBogus()) {
        // A bogus clone is still an allocated object; don't leak it.
        delete clonedSet;
        *status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    clonedSet->freeze();
    delete This->fAllowedCharsSet;
    This->fAllowedCharsSet = clonedSet;
    This->fChecks |= USPOOF_CHAR_LIMIT;
}
|
||||
|
||||
|
||||
// uspoof_check   Run the enabled spoof checks over a UTF-16 string.
//
//   text, length   the string to check.  length == -1 means NUL terminated.
//                  length < -1 is rejected with U_ILLEGAL_ARGUMENT_ERROR.
//   position       optional output.  Written only when one of the checks
//                  recorded a failure position.
//   status         in/out ICU error code.
//   return         bit mask of the USPOOF_* checks that the string failed,
//                  0 if none (or on error).
//
U_CAPI int32_t U_EXPORT2
uspoof_check(const USpoofChecker *sc,
             const UChar *text, int32_t length,
             int32_t *position,
             UErrorCode *status) {

    const SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl == NULL) {
        return 0;
    }
    if (length < -1) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if (length == -1) {
        // It's not worth the bother to handle nul terminated strings everywhere.
        //   Just get the length and be done with it.
        length = u_strlen(text);
    }

    int32_t failedChecks = 0;
    int32_t failPos = 0x7fffffff;   // TODO: do we have a #define for max int32?

    // A count of the number of non-Common or inherited scripts.
    // Needed for both the SINGLE_SCRIPT and the WHOLE/MIXED_SCRIPT_CONFUSABLE tests.
    // Share the computation when possible.  scriptCount == -1 means that we haven't
    // done it yet.
    int32_t scriptCount = -1;

    if (impl->fChecks & USPOOF_SINGLE_SCRIPT) {
        scriptCount = impl->scriptScan(text, length, failPos, *status);
        // Note: scriptCount == 2 covers all cases of the number of scripts >= 2
        if (scriptCount >= 2) {
            failedChecks |= USPOOF_SINGLE_SCRIPT;
        }
    }

    if (impl->fChecks & USPOOF_CHAR_LIMIT) {
        // Scan for the first code point that is not in the allowed set.
        int32_t idx = 0;
        while (idx < length) {
            UChar32 cp;
            U16_NEXT(text, idx, length, cp);
            if (impl->fAllowedCharsSet->contains(cp)) {
                continue;
            }
            failedChecks |= USPOOF_CHAR_LIMIT;
            if (idx < failPos) {
                failPos = idx;
            }
            break;
        }
    }

    // TODO:  add USPOOF_INVISIBLE check

    if (impl->fChecks & (USPOOF_WHOLE_SCRIPT_CONFUSABLE | USPOOF_MIXED_SCRIPT_CONFUSABLE)) {
        // The basic test is the same for both whole and mixed script confusables.
        // Compute the set of scripts that every input character has a confusable in.
        // For this computation an input character is always considered to be
        //    confusable with itself in its own script.
        // If the number of such scripts is two or more, and the input consisted of
        //   characters all from a single script, we have a whole script confusable.
        //   (The two scripts will be the original script and the one that is confusable)
        // If the number of such scripts >= one, and the original input contained characters from
        //   more than one script, we have a mixed script confusable.  (We can transform
        //   some of the characters, and end up with a visually similar string all in
        //   one script.)

        NFKDBuffer normalizedInput(text, length, *status);
        const UChar *nfkdText = normalizedInput.getBuffer();
        int32_t nfkdLength = normalizedInput.getLength();

        if (scriptCount == -1) {
            // The script count wasn't needed earlier; compute it now.
            // The failure position from this scan is not used.
            int32_t unusedPos;
            scriptCount = impl->scriptScan(text, length, unusedPos, *status);
        }

        ScriptSet scripts;
        impl->wholeScriptCheck(nfkdText, nfkdLength, &scripts, *status);
        int32_t confusableScriptCount = scripts.countMembers();

        if (scriptCount == 1) {
            if ((impl->fChecks & USPOOF_WHOLE_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 2) {
                failedChecks |= USPOOF_WHOLE_SCRIPT_CONFUSABLE;
            }
        } else if (scriptCount > 1) {
            if ((impl->fChecks & USPOOF_MIXED_SCRIPT_CONFUSABLE) &&
                confusableScriptCount >= 1) {
                failedChecks |= USPOOF_MIXED_SCRIPT_CONFUSABLE;
            }
        }
    }

    if (position != NULL && failPos != 0x7fffffff) {
        *position = failPos;
    }
    return failedChecks;
}
|
||||
|
||||
|
||||
// uspoof_checkUTF8   Run the enabled spoof checks over a UTF-8 string.
//
//   The input is converted to UTF-16 (on the stack when it fits, otherwise
//   in a heap buffer) and handed to uspoof_check().
//
//   text, length   the UTF-8 string to check.
//   position       optional output.  When uspoof_check() reports a failure
//                  position > 0 it is translated back to a UTF-8 offset.
//   status         in/out ICU error code.
//   return         bit mask of failed checks, 0 on error.
//
U_CAPI int32_t U_EXPORT2
uspoof_checkUTF8(const USpoofChecker *sc,
                 const char *text, int32_t length,
                 int32_t *position,
                 UErrorCode *status) {

    if (U_FAILURE(*status)) {
        return 0;
    }
    UChar stackBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar* text16 = stackBuf;
    int32_t len16;

    u_strFromUTF8(text16, USPOOF_STACK_BUFFER_SIZE, &len16, text, length, status);
    if (U_FAILURE(*status) && *status != U_BUFFER_OVERFLOW_ERROR) {
        return 0;
    }
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        // Didn't fit on the stack.  len16 holds the required length; allow
        // one extra UChar for the terminating NUL.
        text16 = static_cast<UChar *>(uprv_malloc((len16 + 1) * sizeof(UChar)));
        if (text16 == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        *status = U_ZERO_ERROR;
        u_strFromUTF8(text16, len16+1, NULL, text, length, status);
    }

    int32_t position16 = -1;
    int32_t result = uspoof_check(sc, text16, len16, &position16, status);

    if (U_FAILURE(*status)) {
        // Fall through to the cleanup below so that a heap buffer is not leaked.
        result = 0;
    } else if (position16 > 0) {
        // Translate a UTF-16 based error position back to a UTF-8 offset.
        // u_strToUTF8() in preflight mode is an easy way to do it.
        U_ASSERT(position16 <= len16);
        u_strToUTF8(NULL, 0, position, text16, position16, status);
        // Preflighting reports its result by way of U_BUFFER_OVERFLOW_ERROR.
        // That is expected here, and must not be returned to our caller.
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            *status = U_ZERO_ERROR;
        }
    }

    if (text16 != stackBuf) {
        uprv_free(text16);
    }
    return result;
}
|
||||
|
||||
|
||||
|
||||
// uspoof_checkUnicodeString   UnicodeString flavor of uspoof_check().
//   Forwards the string's buffer and length; position and status are
//   passed through unchanged.
U_CAPI int32_t U_EXPORT2
uspoof_checkUnicodeString(const USpoofChecker *sc,
                          const U_NAMESPACE_QUALIFIER UnicodeString &text,
                          int32_t *position,
                          UErrorCode *status) {
    return uspoof_check(sc, text.getBuffer(), text.length(), position, status);
}
|
||||
|
||||
|
||||
// uspoof_getSkeleton   Compute the "skeleton" of a UTF-16 string:  the NFKD
//                      form of the input with each code point replaced by its
//                      mapping from the selected confusables table.
//
//   type           0, or a combination of USPOOF_SINGLE_SCRIPT_CONFUSABLE and
//                  USPOOF_ANY_CASE.  Selects which of the four mapping tables
//                  (ML, SL, MA, SA) is used.
//   s, length      the input string.  length == -1 means NUL terminated.
//   dest, destCapacity
//                  output buffer.  If the skeleton does not fit, nothing more is
//                  stored once the overflow is detected, and only the required
//                  length is computed.
//   status         in/out ICU error code.  U_BUFFER_OVERFLOW_ERROR if dest is too
//                  small, U_STRING_NOT_TERMINATED_WARNING if the skeleton exactly
//                  fills dest.
//   return         the length of the skeleton, in UChars.  0 on other errors.
//
U_CAPI int32_t U_EXPORT2
uspoof_getSkeleton(const USpoofChecker *sc,
                   uint32_t type,
                   const UChar *s,  int32_t length,
                   UChar *dest, int32_t destCapacity,
                   UErrorCode *status) {

    const SpoofImpl *This = SpoofImpl::validateThis(sc, *status);
    if (U_FAILURE(*status)) {
        return 0;
    }
    if (length<-1 || destCapacity<0 || (destCapacity==0 && dest!=NULL) ||
        (type & ~(USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE)) != 0) {
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    int32_t tableMask = 0;
    switch (type) {
      case 0:
        tableMask = USPOOF_ML_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE:
        tableMask = USPOOF_SL_TABLE_FLAG;
        break;
      case USPOOF_ANY_CASE:
        tableMask = USPOOF_MA_TABLE_FLAG;
        break;
      case USPOOF_SINGLE_SCRIPT_CONFUSABLE | USPOOF_ANY_CASE:
        tableMask = USPOOF_SA_TABLE_FLAG;
        break;
      default:
        *status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    // NFKD transform of the user supplied input

    UChar nfkdBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar *nfkdInput = nfkdBuf;
    int32_t normalizedLen = unorm_normalize(
        s, length, UNORM_NFKD, 0, nfkdInput, USPOOF_STACK_BUFFER_SIZE, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        nfkdInput = (UChar *)uprv_malloc((normalizedLen+1)*sizeof(UChar));
        if (nfkdInput == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return 0;
        }
        // The error code must be cleared, otherwise the second normalize call
        // would return immediately without doing anything.
        *status = U_ZERO_ERROR;
        normalizedLen = unorm_normalize(s, length, UNORM_NFKD, 0,
                                        nfkdInput, normalizedLen+1, status);
    }
    if (U_FAILURE(*status)) {
        // Don't leak the heap buffer, if one was allocated.
        if (nfkdInput != nfkdBuf) {
            uprv_free(nfkdInput);
        }
        return 0;
    }

    // buffer to hold the Unicode defined mappings for a single code point
    UChar buf[USPOOF_MAX_SKELETON_EXPANSION];

    // Apply the mapping to the NFKD form string

    int32_t inputIndex = 0;
    int32_t resultLen = 0;
    while (inputIndex < normalizedLen) {
        UChar32 c;
        U16_NEXT(nfkdInput, inputIndex, normalizedLen, c);
        int32_t replaceLen = This->confusableLookup(c, tableMask, buf);
        // "<=":  a replacement that exactly fills the destination must still be
        //        stored, because that case is reported to the caller as a success
        //        (U_STRING_NOT_TERMINATED_WARNING), not as an overflow.
        if (resultLen + replaceLen <= destCapacity) {
            int i;
            for (i=0; i<replaceLen; i++) {
                dest[resultLen++] = buf[i];
            }
        } else {
            // Storing the transformed string would overflow the dest buffer.
            //   Don't bother storing anything, just sum up the required buffer size.
            //   (We don't guarantee that a truncated buffer is filled to its end)
            resultLen += replaceLen;
        }
    }

    if (resultLen < destCapacity) {
        dest[resultLen] = 0;
    } else if (resultLen == destCapacity) {
        *status = U_STRING_NOT_TERMINATED_WARNING;
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    if (nfkdInput != nfkdBuf) {
        uprv_free(nfkdInput);
    }
    return resultLen;
}
|
||||
|
||||
|
||||
// uspoof_getSkeletonUnicodeString   UnicodeString flavor of uspoof_getSkeleton().
//
//   type     selects the confusable mapping table; see uspoof_getSkeleton().
//   s        the input string.
//   dest     receives the skeleton.  It is emptied first, and is left empty
//            if an error occurs.
//   status   in/out ICU error code.
//   return   a reference to dest.
//
U_CAPI UnicodeString &  U_EXPORT2
uspoof_getSkeletonUnicodeString(const USpoofChecker *sc,
                                uint32_t type,
                                const UnicodeString &s,
                                UnicodeString &dest,
                                UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return dest;
    }
    dest.remove();

    const UChar *str = s.getBuffer();
    int32_t      strLen = s.length();
    UChar        smallBuf[100];
    UChar       *buf = smallBuf;
    int32_t outputSize = uspoof_getSkeleton(sc, type, str, strLen, smallBuf, 100, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        // The skeleton did not fit in the stack buffer.  outputSize is the
        // required length in UChars; the allocation size must be in bytes.
        buf = static_cast<UChar *>(uprv_malloc((outputSize+1)*sizeof(UChar)));
        if (buf == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
        // Clear the overflow error, otherwise the retry would return
        // immediately without producing any output.
        *status = U_ZERO_ERROR;
        uspoof_getSkeleton(sc, type, str, strLen, buf, outputSize+1, status);
    }
    if (U_SUCCESS(*status)) {
        dest.setTo(buf, outputSize);
    }

    if (buf != smallBuf) {
        uprv_free(buf);
    }
    return dest;
}
|
||||
|
||||
|
||||
// uspoof_getSkeletonUTF8   UTF-8 flavor of uspoof_getSkeleton().
//
//   Lacking a UTF-8 normalization API, just converting the input to
//   UTF-16 seems as good an approach as any.  In typical use, input will
//   be an identifier, which is to say not too long for stack buffers.
//
//   type           selects the confusable mapping table; see uspoof_getSkeleton().
//   s, length      the UTF-8 input string.
//   dest, destCapacity
//                  output buffer for the UTF-8 skeleton.
//   status         in/out ICU error code.
//   return         the length of the skeleton in bytes of UTF-8, 0 on error.
//
U_CAPI int32_t U_EXPORT2
uspoof_getSkeletonUTF8(const USpoofChecker *sc,
                       uint32_t type,
                       const char *s,  int32_t length,
                       char *dest, int32_t destCapacity,
                       UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return 0;
    }
    // Buffers for the UChar form of the input and skeleton strings.
    UChar    smallInBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar   *inBuf = smallInBuf;
    UChar    smallOutBuf[USPOOF_STACK_BUFFER_SIZE];
    UChar   *outBuf = smallOutBuf;

    int32_t  lengthInUChars = 0;
    int32_t  skelLengthInUChars = 0;
    int32_t  skelLengthInUTF8 = 0;

    // Step 1:  input, UTF-8 -> UTF-16.
    u_strFromUTF8(inBuf, USPOOF_STACK_BUFFER_SIZE, &lengthInUChars,
                  s, length, status);
    if (*status == U_BUFFER_OVERFLOW_ERROR) {
        const int32_t inCapacity = lengthInUChars + 1;
        inBuf = static_cast<UChar *>(uprv_malloc(inCapacity*sizeof(UChar)));
        if (inBuf == NULL) {
            *status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            *status = U_ZERO_ERROR;
            // Retry, this time with the real capacity of the heap buffer.
            u_strFromUTF8(inBuf, inCapacity, &lengthInUChars,
                          s, length, status);
        }
    }

    // Step 2:  skeleton of the UTF-16 input.
    if (U_SUCCESS(*status)) {
        skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
                                                outBuf, USPOOF_STACK_BUFFER_SIZE, status);
        if (*status == U_BUFFER_OVERFLOW_ERROR) {
            const int32_t skelCapacity = skelLengthInUChars + 1;
            outBuf = static_cast<UChar *>(uprv_malloc(skelCapacity*sizeof(UChar)));
            if (outBuf == NULL) {
                *status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                *status = U_ZERO_ERROR;
                skelLengthInUChars = uspoof_getSkeleton(sc, type, inBuf, lengthInUChars,
                                                        outBuf, skelCapacity, status);
            }
        }
    }

    // Step 3:  skeleton, UTF-16 -> UTF-8 into the caller's buffer.
    if (U_SUCCESS(*status)) {
        u_strToUTF8(dest, destCapacity, &skelLengthInUTF8,
                    outBuf, skelLengthInUChars, status);
    }

    // The heap buffers came from uprv_malloc(), and so must be released with
    // uprv_free(), not with delete.
    if (inBuf != smallInBuf && inBuf != NULL) {
        uprv_free(inBuf);
    }
    if (outBuf != smallOutBuf && outBuf != NULL) {
        uprv_free(outBuf);
    }
    return skelLengthInUTF8;
}
|
||||
|
||||
|
||||
// uspoof_serialize   Copy the checker's raw (binary) spoof data into a
//                    caller supplied buffer.
//
//   buf, capacity   destination buffer and its size, in bytes.
//   status          in/out ICU error code.  U_BUFFER_OVERFLOW_ERROR if the
//                   buffer is too small, in which case nothing is copied.
//   return          the size of the data, in bytes, whether or not it was
//                   copied.  0 if the checker handle fails validation.
//
U_CAPI int32_t U_EXPORT2
uspoof_serialize(USpoofChecker *sc,void *buf, int32_t capacity, UErrorCode *status) {
    SpoofImpl *impl = SpoofImpl::validateThis(sc, *status);
    if (impl == NULL) {
        U_ASSERT(U_FAILURE(*status));
        return 0;
    }
    int32_t requiredBytes = impl->fSpoofData->fRawData->fLength;
    if (capacity >= requiredBytes) {
        uprv_memcpy(buf, impl->fSpoofData->fRawData, requiredBytes);
    } else {
        *status = U_BUFFER_OVERFLOW_ERROR;
    }
    return requiredBytes;
}
|
||||
|
81
icu4c/source/i18n/uspoof_build.cpp
Normal file
81
icu4c/source/i18n/uspoof_build.cpp
Normal file
@ -0,0 +1,81 @@
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2009, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
* file name: uspoof_build.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2008 Dec 8
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Unicode Spoof Detection Data Builder
|
||||
* Builder-related functions are kept in separate files so that applications not needing
|
||||
* the builder can more easily exclude them, typically by means of static linking.
|
||||
*
|
||||
* There are three relatively independent sets of Spoof data,
|
||||
* Confusables,
|
||||
* Whole Script Confusables
|
||||
* ID character extensions.
|
||||
*
|
||||
* The data tables for each are built separately, each from its own definitions
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uspoof_buildconf.h"
|
||||
#include "uspoof_buildwsconf.h"
|
||||
|
||||
|
||||
#include <stdio.h> // DEBUG
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
||||
|
||||
// The main data building function
|
||||
|
||||
// uspoof_openFromSource   Build a spoof checker from the source (text) form
//                         of the confusables data.
//
//   confusables, confusablesLen
//                 contents of the confusables source data, and its length.
//   confusablesWholeScript, confusablesWholeScriptLen
//                 contents of the whole script confusables source data, and its length.
//   errorType     optional output, reset to 0 on entry.  Passed to the
//                 confusables builder, which sets it when the build fails.
//   pe            optional output, cleared on entry.  Passed to both builders
//                 for reporting the location of a parse error.
//   status        in/out ICU error code.
//   return        the new checker, or NULL on failure.
//
U_CAPI USpoofChecker * U_EXPORT2
uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
                      int32_t *errorType, UParseError *pe, UErrorCode *status) {

    if (U_FAILURE(*status)) {
        return NULL;
    }
    if (errorType!=NULL) {
        *errorType = 0;
    }
    if (pe != NULL) {
        pe->line = 0;
        pe->offset = 0;
        pe->preContext[0] = 0;
        pe->postContext[0] = 0;
    }

    // Set up a shell of a spoof detector, with empty data.
    SpoofData *newSpoofData = new SpoofData(*status);
    if (newSpoofData == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    SpoofImpl *This = new SpoofImpl(newSpoofData, *status);
    if (This == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        // No SpoofImpl exists to take charge of the data, so delete it here.
        delete newSpoofData;
        return NULL;
    }

    // Compile the binary data from the source (text) format.
    ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
    buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);

    if (U_FAILURE(*status)) {
        delete This;
        This = NULL;
    }
    return (USpoofChecker *)This;
}
|
||||
|
593
icu4c/source/i18n/uspoof_buildconf.cpp
Normal file
593
icu4c/source/i18n/uspoof_buildconf.cpp
Normal file
@ -0,0 +1,593 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_buildconf.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan05 (refactoring earlier files)
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
 *   Internal classes for compiling confusable data into its binary (runtime) form.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "uarrsort.h"
|
||||
#include "uspoof_buildconf.h"
|
||||
|
||||
#include "stdio.h" // DEBUG. Remove.
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
||||
//---------------------------------------------------------------------
|
||||
//
|
||||
// buildConfusableData Compile the source confusable data, as defined by
|
||||
// the Unicode data file confusables.txt, into the binary
|
||||
// structures used by the confusable detector.
|
||||
//
|
||||
// The binary structures are described in uspoof_impl.h
|
||||
//
|
||||
// 1. parse the data, building 4 hash tables, one each for the SL, SA, ML and MA
|
||||
// tables. Each maps from a UChar32 to a String.
|
||||
//
|
||||
// 2. Sort all of the strings encountered by length, since they will need to
|
||||
// be stored in that order in the final string table.
|
||||
//
|
||||
// 3. Build a list of keys (UChar32s) from the four mapping tables. Sort the
|
||||
// list because that will be the ordering of our runtime table.
|
||||
//
|
||||
// 4. Generate the run time string table. This is generated before the key & value
|
||||
// tables because we need the string indexes when building those tables.
|
||||
//
|
||||
// 5. Build the run-time key and value tables. These are parallel tables, and are built
|
||||
// at the same time
|
||||
//
|
||||
|
||||
// Wrap a UnicodeString for use in the string pool.
//   The string pointer is stored as-is (not copied); the destructor deletes it.
//   The string table index starts out as 0.
SPUString::SPUString(UnicodeString *s) {
    this->fStrTableIndex = 0;
    this->fStr = s;
}
|
||||
|
||||
|
||||
// Destructor.  Deletes the wrapped UnicodeString.
SPUString::~SPUString() {
    delete this->fStr;
}
|
||||
|
||||
|
||||
// Construct an empty string pool.
//   fVec   holds the pooled SPUStrings.
//   fHash  maps from a UnicodeString to its pooled SPUString; it is what
//          addString() uses to detect duplicates.
SPUStringPool::SPUStringPool(UErrorCode &status) : fVec(NULL), fHash(NULL) {
    fVec = new UVector(status);

    UHashFunction  *keyHasher       = uhash_hashUnicodeString;
    UKeyComparator *keyComparator   = uhash_compareUnicodeString;
    fHash = uhash_open(keyHasher, keyComparator, NULL /* value comparator */, &status);
}
|
||||
|
||||
|
||||
// Destructor.  Deletes each pooled SPUString (working from the end of the
//   vector back to the start), then the vector, then closes the hash table.
SPUStringPool::~SPUStringPool() {
    int idx = fVec->size();
    while (--idx >= 0) {
        delete static_cast<SPUString *>(fVec->elementAt(idx));
    }
    delete fVec;
    uhash_close(fHash);
}
|
||||
|
||||
|
||||
// Number of strings in the pool.
int32_t SPUStringPool::size() {
    int32_t count = fVec->size();
    return count;
}
|
||||
|
||||
// Fetch the pooled string stored at the given position of the pool's vector.
SPUString *SPUStringPool::getByIndex(int32_t index) {
    return static_cast<SPUString *>(fVec->elementAt(index));
}
|
||||
|
||||
|
||||
// Comparison function for ordering strings in the string pool.
|
||||
// Compare by length first, then, within a group of the same length,
|
||||
// by code point order.
|
||||
// Conforms to the type signature for a USortComparator in uvector.h
|
||||
|
||||
static int8_t U_CALLCONV SPUStringCompare(UHashTok left, UHashTok right) {
    const SPUString *lhs = static_cast<const SPUString *>(left.pointer);
    const SPUString *rhs = static_cast<const SPUString *>(right.pointer);
    const int32_t lhsLen = lhs->fStr->length();
    const int32_t rhsLen = rhs->fStr->length();

    // Primary key: the length of the string.
    if (lhsLen != rhsLen) {
        return (lhsLen < rhsLen) ? -1 : 1;
    }
    // Secondary key, for strings of the same length: UnicodeString::compare().
    return lhs->fStr->compare(*(rhs->fStr));
}
|
||||
|
||||
// Sort the pool's vector of strings, using SPUStringCompare for the ordering.
void SPUStringPool::sort(UErrorCode &status) {
    USortComparator *comparator = SPUStringCompare;
    fVec->sort(comparator, status);
}
|
||||
|
||||
|
||||
// Add a string to the pool, unless an equal string is already there.
//
//   src      the string to add.  The pool takes charge of it:  it is deleted
//            immediately if it duplicates an existing entry, otherwise it is
//            wrapped in a new SPUString that is added to the hash table and vector.
//   return   the pooled SPUString for this string value.
//
SPUString *SPUStringPool::addString(UnicodeString *src, UErrorCode &status) {
    SPUString *pooled = static_cast<SPUString *>(uhash_get(fHash, src));
    if (pooled != NULL) {
        // Duplicate.  The existing entry is used, the new string is not needed.
        delete src;
        return pooled;
    }
    pooled = new SPUString(src);
    uhash_put(fHash, src, pooled, &status);
    fVec->addElement(pooled, status);
    return pooled;
}
|
||||
|
||||
|
||||
|
||||
// Constructor.  All members start out NULL / zero.  If the incoming status
//   is a failure nothing is allocated; otherwise the four mapping hash tables
//   (SL, SA, ML, MA), the key set, the key and value vectors and the string
//   pool are created.
ConfusabledataBuilder::ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status) :
    fSpoofImpl(spImpl),
    fInput(NULL),
    fSLTable(NULL),
    fSATable(NULL),
    fMLTable(NULL),
    fMATable(NULL),
    fKeySet(NULL),
    fKeyVec(NULL),
    fValueVec(NULL),
    fStringTable(NULL),
    fStringLengthsTable(NULL),
    stringPool(NULL),
    fParseLine(NULL),
    fParseHexNum(NULL),
    fLineNum(0)
{
    if (U_SUCCESS(status)) {
        // The mapping tables, keyed by code point.
        fSLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
        fSATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
        fMLTable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);
        fMATable    = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &status);

        // Collections used while assembling the run-time data.
        fKeySet     = new UnicodeSet();
        fKeyVec     = new UVector(status);
        fValueVec   = new UVector(status);
        stringPool  = new SPUStringPool(status);
    }
}
|
||||
|
||||
|
||||
// Destructor.  Releases everything that the builder allocated along the way.
ConfusabledataBuilder::~ConfusabledataBuilder() {
    // The UTF-16 copy of the input, which came from uprv_malloc()
    uprv_free(fInput);

    // The regular expressions used for parsing
    uregex_close(fParseLine);
    uregex_close(fParseHexNum);

    // The four mapping tables
    uhash_close(fSLTable);
    uhash_close(fSATable);
    uhash_close(fMLTable);
    uhash_close(fMATable);

    // Intermediate collections and strings
    delete fKeySet;
    delete fKeyVec;
    delete fValueVec;
    delete fStringTable;
    delete fStringLengthsTable;
    delete stringPool;
}
|
||||
|
||||
|
||||
// buildConfusableData   Static entry point for compiling confusables source
//                       data into a SpoofImpl.
//
//   Creates a temporary builder, and runs build() over the input.
//
//   spImpl       the spoof detector that receives the compiled data.
//   confusables, confusablesLen
//                the source data, and its length.
//   errorType    optional output.  Set to USPOOF_SINGLE_SCRIPT_CONFUSABLE if
//                the build fails.
//   pe           optional output.  On failure, pe->line is set to the number of
//                the last input line that the builder processed.
//   status       in/out ICU error code.
//
void ConfusabledataBuilder::buildConfusableData(SpoofImpl * spImpl, const char * confusables,
    int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return;
    }
    ConfusabledataBuilder builder(spImpl, status);
    builder.build(confusables, confusablesLen, status);
    if (U_FAILURE(status)) {
        // Both of the output parameters are optional, and are independent of
        // each other.  Check each before writing through it.
        if (errorType != NULL) {
            *errorType = USPOOF_SINGLE_SCRIPT_CONFUSABLE;
        }
        if (pe != NULL) {
            pe->line = builder.fLineNum;
        }
    }
}
|
||||
|
||||
|
||||
void ConfusabledataBuilder::build(const char * confusables, int32_t confusablesLen,
|
||||
UErrorCode &status) {
|
||||
|
||||
// Convert the user input data from UTF-8 to UChar (UTF-16)
|
||||
int32_t inputLen = 0;
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
u_strFromUTF8(NULL, 0, &inputLen, confusables, confusablesLen, &status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
return;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
fInput = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
||||
if (fInput == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
u_strFromUTF8(fInput, inputLen+1, NULL, confusables, confusablesLen, &status);
|
||||
|
||||
|
||||
// Regular Expression to parse a line from Confusables.txt. The expression will match
|
||||
// any line. What was matched is determined by examining which capture groups have a match.
|
||||
// Capture Group 1: the source char
|
||||
// Capture Group 2: the replacement chars
|
||||
// Capture Group 3-6 the table type, SL, SA, ML, or MA
|
||||
// Capture Group 7: A blank or comment only line.
|
||||
// Capture Group 8: A syntactically invalid line. Anything that didn't match before.
|
||||
// Example Line from the confusables.txt source file:
|
||||
// "1D702 ; 006E 0329 ; SL # MATHEMATICAL ITALIC SMALL ETA ... "
|
||||
fParseLine = uregex_openC(
|
||||
"(?m)^[ \\t]*([0-9A-Fa-f]+)[ \\t]+;" // Match the source char
|
||||
"[ \\t]*([0-9A-Fa-f]+" // Match the replacement char(s)
|
||||
"(?:[ \\t]+[0-9A-Fa-f]+)*)[ \\t]*;" // (continued)
|
||||
"\\s*(?:(SL)|(SA)|(ML)|(MA))" // Match the table type
|
||||
"[ \\t]*(?:#.*?)?$" // Match any trailing #comment
|
||||
"|^([ \\t]*(?:#.*?)?)$" // OR match empty lines or lines with only a #comment
|
||||
"|^(.*?)$", // OR match any line, which catches illegal lines.
|
||||
0, NULL, &status);
|
||||
|
||||
// Regular expression for parsing a hex number out of a space-separated list of them.
|
||||
// Capture group 1 gets the number, with spaces removed.
|
||||
fParseHexNum = uregex_openC("\\s*([0-9A-F]+)", 0, NULL, &status);
|
||||
|
||||
// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
|
||||
// given the syntax of the input.
|
||||
if (*fInput == 0xfeff) {
|
||||
*fInput = 0x20;
|
||||
}
|
||||
|
||||
// Parse the input, one line per iteration of this loop.
|
||||
uregex_setText(fParseLine, fInput, inputLen, &status);
|
||||
while (uregex_findNext(fParseLine, &status)) {
|
||||
fLineNum++;
|
||||
if (uregex_start(fParseLine, 7, &status) >= 0) {
|
||||
// this was a blank or comment line.
|
||||
continue;
|
||||
}
|
||||
if (uregex_start(fParseLine, 8, &status) >= 0) {
|
||||
// input file syntax error.
|
||||
status = U_PARSE_ERROR;
|
||||
return;
|
||||
}
|
||||
|
||||
// We have a good input line. Extract the key character and mapping string, and
|
||||
// put them into the appropriate mapping table.
|
||||
UChar32 keyChar = SpoofImpl::ScanHex(fInput, uregex_start(fParseLine, 1, &status),
|
||||
uregex_end(fParseLine, 1, &status), status);
|
||||
|
||||
int32_t mapStringStart = uregex_start(fParseLine, 2, &status);
|
||||
int32_t mapStringLength = uregex_end(fParseLine, 2, &status) - mapStringStart;
|
||||
uregex_setText(fParseHexNum, &fInput[mapStringStart], mapStringLength, &status);
|
||||
|
||||
UnicodeString *mapString = new UnicodeString();
|
||||
if (mapString == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
while (uregex_findNext(fParseHexNum, &status)) {
|
||||
UChar32 c = SpoofImpl::ScanHex(&fInput[mapStringStart], uregex_start(fParseHexNum, 1, &status),
|
||||
uregex_end(fParseHexNum, 1, &status), status);
|
||||
mapString->append(c);
|
||||
}
|
||||
U_ASSERT(mapString->length() >= 1);
|
||||
|
||||
// Put the map (value) string into the string pool
|
||||
// This a little like a Java intern() - any duplicates will be eliminated.
|
||||
SPUString *smapString = stringPool->addString(mapString, status);
|
||||
|
||||
// Add the UChar -> string mapping to the appropriate table.
|
||||
UHashtable *table = uregex_start(fParseLine, 3, &status) >= 0 ? fSLTable :
|
||||
uregex_start(fParseLine, 4, &status) >= 0 ? fSATable :
|
||||
uregex_start(fParseLine, 5, &status) >= 0 ? fMLTable :
|
||||
uregex_start(fParseLine, 6, &status) >= 0 ? fMATable :
|
||||
NULL;
|
||||
U_ASSERT(table != NULL);
|
||||
uhash_iput(table, keyChar, smapString, &status);
|
||||
fKeySet->add(keyChar);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Input data is now all parsed and collected.
|
||||
// Now create the run-time binary form of the data.
|
||||
//
|
||||
// This is done in two steps. First the data is assembled into vectors and strings,
|
||||
// for ease of construction, then the contents of these collections are dumped
|
||||
// into the actual raw-bytes data storage.
|
||||
|
||||
// Build up the string array, and record the index of each string therein
|
||||
// in the (build time only) string pool.
|
||||
// Strings of length one are not entered into the strings array.
|
||||
// At the same time, build up the string lengths table, which records the
|
||||
// position in the string table of the first string of each length >= 4.
|
||||
// (Strings in the table are sorted by length)
|
||||
stringPool->sort(status);
|
||||
fStringTable = new UnicodeString();
|
||||
fStringLengthsTable = new UVector(status);
|
||||
int32_t previousStringLength = 0;
|
||||
int32_t previousStringIndex = 0;
|
||||
int32_t poolSize = stringPool->size();
|
||||
int32_t i;
|
||||
for (i=0; i<poolSize; i++) {
|
||||
SPUString *s = stringPool->getByIndex(i);
|
||||
int32_t strLen = s->fStr->length();
|
||||
int32_t strIndex = fStringTable->length();
|
||||
U_ASSERT(strLen >= previousStringLength);
|
||||
if (strLen == 1) {
|
||||
// strings of length one do not get an entry in the string table.
|
||||
// Keep the single string character itself here, which is the same
|
||||
// convention that is used in the final run-time string table index.
|
||||
s->fStrTableIndex = s->fStr->charAt(0);
|
||||
} else {
|
||||
if ((strLen > previousStringLength) && (previousStringLength >= 4)) {
|
||||
fStringLengthsTable->addElement(previousStringIndex, status);
|
||||
fStringLengthsTable->addElement(previousStringLength, status);
|
||||
}
|
||||
s->fStrTableIndex = strIndex;
|
||||
fStringTable->append(*(s->fStr));
|
||||
}
|
||||
previousStringLength = strLen;
|
||||
previousStringIndex = strIndex;
|
||||
}
|
||||
// Make the final entry to the string lengths table.
|
||||
// (it holds an entry for the _last_ string of each length, so adding the
|
||||
// final one doesn't happen in the main loop because no longer string was encountered.)
|
||||
if (previousStringLength >= 4) {
|
||||
fStringLengthsTable->addElement(previousStringIndex, status);
|
||||
fStringLengthsTable->addElement(previousStringLength, status);
|
||||
}
|
||||
|
||||
// Construct the compile-time Key and Value tables
|
||||
//
|
||||
// For each key code point, check which mapping tables it applies to,
|
||||
// and create the final data for the key & value structures.
|
||||
//
|
||||
// The four logical mapping tables are conflated into one combined table.
|
||||
// If multiple logical tables have the same mapping for some key, they
|
||||
// share a single entry in the combined table.
|
||||
// If more than one mapping exists for the same key code point, multiple
|
||||
// entries will be created in the table
|
||||
|
||||
for (int32_t range=0; range<fKeySet->getRangeCount(); range++) {
|
||||
// It is an oddity of the UnicodeSet API that simply enumerating the contained
|
||||
// code points requires a nested loop.
|
||||
for (UChar32 keyChar=fKeySet->getRangeStart(range);
|
||||
keyChar <= fKeySet->getRangeEnd(range); keyChar++) {
|
||||
addKeyEntry(keyChar, fSLTable, USPOOF_SL_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fSATable, USPOOF_SA_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fMLTable, USPOOF_ML_TABLE_FLAG, status);
|
||||
addKeyEntry(keyChar, fMATable, USPOOF_MA_TABLE_FLAG, status);
|
||||
}
|
||||
}
|
||||
|
||||
// Put the assembled data into the flat runtime array
|
||||
outputData(status);
|
||||
|
||||
// All of the intermediate allocated data belongs to the ConfusabledataBuilder
|
||||
// object (this), and is deleted in the destructor.
|
||||
return;
|
||||
}
|
||||
|
||||
//
|
||||
// outputData The confusable data has been compiled and stored in intermediate
|
||||
// collections and strings. Copy it from there to the final flat
|
||||
// binary array.
|
||||
//
|
||||
// Note that as each section is added to the output data, the
|
||||
// expand (reserveSpace() function will likely relocate it in memory.
|
||||
// Be careful with pointers.
|
||||
//
|
||||
void ConfusabledataBuilder::outputData(UErrorCode &status) {
|
||||
|
||||
U_ASSERT(fSpoofImpl->fSpoofData->fDataOwned == TRUE);
|
||||
|
||||
// The Key Table
|
||||
// While copying the keys to the runtime array,
|
||||
// also sanity check that they are sorted.
|
||||
|
||||
int32_t numKeys = fKeyVec->size();
|
||||
int32_t *keys =
|
||||
static_cast<int32_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(int32_t), status));
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int i;
|
||||
int32_t previousKey = 0;
|
||||
for (i=0; i<numKeys; i++) {
|
||||
int32_t key = fKeyVec->elementAti(i);
|
||||
U_ASSERT((key & 0x00ffffff) >= (previousKey & 0x00ffffff));
|
||||
U_ASSERT((key & 0xff000000) != 0);
|
||||
keys[i] = key;
|
||||
previousKey = key;
|
||||
}
|
||||
SpoofDataHeader *rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
rawData->fCFUKeys = (char *)keys - (char *)rawData;
|
||||
rawData->fCFUKeysSize = numKeys;
|
||||
fSpoofImpl->fSpoofData->fCFUKeys = keys;
|
||||
|
||||
|
||||
// The Value Table, parallels the key table
|
||||
int32_t numValues = fValueVec->size();
|
||||
U_ASSERT(numKeys == numValues);
|
||||
uint16_t *values =
|
||||
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(numKeys*sizeof(uint16_t), status));
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
for (i=0; i<numValues; i++) {
|
||||
uint32_t value = static_cast<uint32_t>(fValueVec->elementAti(i));
|
||||
U_ASSERT(value < 0xffff);
|
||||
values[i] = static_cast<uint16_t>(value);
|
||||
}
|
||||
rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
rawData->fCFUStringIndex = (char *)values - (char *)rawData;
|
||||
rawData->fCFUStringIndexSize = numValues;
|
||||
fSpoofImpl->fSpoofData->fCFUValues = values;
|
||||
|
||||
// The Strings Table.
|
||||
|
||||
uint32_t stringsLength = fStringTable->length();
|
||||
// Reserve an extra space so the string will be nul-terminated. This is
|
||||
// only a convenience, for when debugging; it is not needed otherwise.
|
||||
UChar *strings =
|
||||
static_cast<UChar *>(fSpoofImpl->fSpoofData->reserveSpace(stringsLength*sizeof(UChar)+2, status));
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
fStringTable->extract(strings, stringsLength+1, status);
|
||||
rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
U_ASSERT(rawData->fCFUStringTable == 0);
|
||||
rawData->fCFUStringTable = (char *)strings - (char *)rawData;
|
||||
rawData->fCFUStringTableLen = stringsLength;
|
||||
fSpoofImpl->fSpoofData->fCFUStrings = strings;
|
||||
|
||||
// The String Lengths Table
|
||||
// While copying into the runtime array do some sanity checks on the values
|
||||
// Each complete entry contains two fields, an index and an offset.
|
||||
// Lengths should increase with each entry.
|
||||
// Offsets should be less than the size of the string table.
|
||||
int32_t lengthTableLength = fStringLengthsTable->size();
|
||||
uint16_t *stringLengths =
|
||||
static_cast<uint16_t *>(fSpoofImpl->fSpoofData->reserveSpace(lengthTableLength*sizeof(uint16_t), status));
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
int32_t destIndex = 0;
|
||||
uint32_t previousLength = 0;
|
||||
for (i=0; i<lengthTableLength; i+=2) {
|
||||
uint32_t offset = static_cast<uint32_t>(fStringLengthsTable->elementAti(i));
|
||||
uint32_t length = static_cast<uint32_t>(fStringLengthsTable->elementAti(i+1));
|
||||
U_ASSERT(offset < stringsLength);
|
||||
U_ASSERT(length < 40);
|
||||
U_ASSERT(length > previousLength);
|
||||
stringLengths[destIndex++] = static_cast<uint16_t>(offset);
|
||||
stringLengths[destIndex++] = static_cast<uint16_t>(length);
|
||||
previousLength = length;
|
||||
}
|
||||
rawData = fSpoofImpl->fSpoofData->fRawData;
|
||||
rawData->fCFUStringLengths = (char *)stringLengths - (char *)rawData;
|
||||
// Note: StringLengthsSize in the raw data is the number of complete entries,
|
||||
// each consisting of a pair of 16 bit values, hence the divide by 2.
|
||||
rawData->fCFUStringLengthsSize = lengthTableLength / 2;
|
||||
fSpoofImpl->fSpoofData->fCFUStringLengths =
|
||||
reinterpret_cast<SpoofStringLengthsElement *>(stringLengths);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// addKeyEntry Construction of the confusable Key and Mapping Values tables.
|
||||
// This is an intermediate point in the building process.
|
||||
// We already have the mappings in the hash tables fSLTable, etc.
|
||||
// This function builds corresponding run-time style table entries into
|
||||
// fKeyVec and fValueVec
|
||||
|
||||
void ConfusabledataBuilder::addKeyEntry(
|
||||
UChar32 keyChar, // The key character
|
||||
UHashtable *table, // The table, one of SATable, MATable, etc.
|
||||
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
|
||||
UErrorCode &status) {
|
||||
|
||||
SPUString *targetMapping = static_cast<SPUString *>(uhash_iget(table, keyChar));
|
||||
if (targetMapping == NULL) {
|
||||
// No mapping for this key character.
|
||||
// (This function is called for all four tables for each key char that
|
||||
// is seen anywhere, so this no entry cases are very much expected.)
|
||||
return;
|
||||
}
|
||||
|
||||
// Check whether there is already an entry with the correct mapping.
|
||||
// If so, simply set the flag in the keyTable saying that the existing entry
|
||||
// applies to the table that we're doing now.
|
||||
|
||||
UBool keyHasMultipleValues = FALSE;
|
||||
int32_t i;
|
||||
for (i=fKeyVec->size()-1; i>=0 ; i--) {
|
||||
int32_t key = fKeyVec->elementAti(i);
|
||||
if ((key & 0x0ffffff) != keyChar) {
|
||||
// We have now checked all existing key entries for this key char (if any)
|
||||
// without finding one with the same mapping.
|
||||
break;
|
||||
}
|
||||
UnicodeString mapping = getMapping(i);
|
||||
if (mapping == *(targetMapping->fStr)) {
|
||||
// The run time entry we are currently testing has the correct mapping.
|
||||
// Set the flag in it indicating that it applies to the new table also.
|
||||
key |= tableFlag;
|
||||
fKeyVec->setElementAt(key, i);
|
||||
return;
|
||||
}
|
||||
keyHasMultipleValues = TRUE;
|
||||
}
|
||||
|
||||
// Need to add a new entry to the binary data being built for this mapping.
|
||||
// Includes adding entries to both the key table and the parallel values table.
|
||||
|
||||
int32_t newKey = keyChar | tableFlag;
|
||||
if (keyHasMultipleValues) {
|
||||
newKey |= USPOOF_KEY_MULTIPLE_VALUES;
|
||||
}
|
||||
int32_t adjustedMappingLength = targetMapping->fStr->length() - 1;
|
||||
if (adjustedMappingLength>3) {
|
||||
adjustedMappingLength = 3;
|
||||
}
|
||||
newKey |= adjustedMappingLength << USPOOF_KEY_LENGTH_SHIFT;
|
||||
|
||||
int32_t newData = targetMapping->fStrTableIndex;
|
||||
|
||||
fKeyVec->addElement(newKey, status);
|
||||
fValueVec->addElement(newData, status);
|
||||
|
||||
// If the preceding key entry is for the same key character (but with a different mapping)
|
||||
// set the multiple-values flag on it.
|
||||
if (keyHasMultipleValues) {
|
||||
int32_t previousKeyIndex = fKeyVec->size() - 2;
|
||||
int32_t previousKey = fKeyVec->elementAti(previousKeyIndex);
|
||||
previousKey |= USPOOF_KEY_MULTIPLE_VALUES;
|
||||
fKeyVec->setElementAt(previousKey, previousKeyIndex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
//  getMapping    Rebuild, as a UnicodeString, the mapping string belonging to
//                the entry at position 'index' of the parallel vectors
//                fKeyVec and fValueVec.
//
//                The length field of the key decides how the value is read:
//                    0      the value is itself the single mapped UChar.
//                    1, 2   the value is an offset into fStringTable, and the
//                           string is (field + 1) code units long.
//                    3      the value is an offset into fStringTable, and the
//                           length comes from fStringLengthsTable:  the first
//                           (last index, length) pair whose last index is not
//                           below the value supplies the length.
UnicodeString ConfusabledataBuilder::getMapping(int32_t index) {
    const int32_t key         = fKeyVec->elementAti(index);
    const int32_t value       = fValueVec->elementAti(index);
    const int32_t lengthField = USPOOF_KEY_LENGTH_FIELD(key);

    if (lengthField == 0) {
        return UnicodeString(static_cast<UChar>(value));
    }

    if (lengthField == 1 || lengthField == 2) {
        return UnicodeString(*fStringTable, value, lengthField+1);
    }

    if (lengthField == 3) {
        int32_t mappingLength = 0;
        const int32_t tableSize = fStringLengthsTable->size();
        for (int32_t pairStart=0; pairStart<tableSize; pairStart+=2) {
            const int32_t lastIndexWithLen = fStringLengthsTable->elementAti(pairStart);
            if (value <= lastIndexWithLen) {
                mappingLength = fStringLengthsTable->elementAti(pairStart+1);
                break;
            }
        }
        U_ASSERT(mappingLength>=3);
        return UnicodeString(*fStringTable, value, mappingLength);
    }

    // No other value of the length field is expected.
    U_ASSERT(FALSE);
    return UnicodeString();
}
|
123
icu4c/source/i18n/uspoof_buildconf.h
Normal file
123
icu4c/source/i18n/uspoof_buildconf.h
Normal file
@ -0,0 +1,123 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_buildconf.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan05
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Internal classes for compiling confusable data into its binary (runtime) form.
|
||||
*/
|
||||
|
||||
#ifndef __USPOOF_BUILDCONF_H__
|
||||
#define __USPOOF_BUILDCONF_H__
|
||||
|
||||
#include "uspoof_impl.h"
|
||||
|
||||
// SPUString
|
||||
// Holds a string that is the result of one of the mappings defined
|
||||
// by the confusable mapping data (confusables.txt from Unicode.org)
|
||||
// Instances of SPUString exist during the compilation process only.
|
||||
|
||||
struct SPUString : public UMemory {
|
||||
UnicodeString *fStr; // The actual string.
|
||||
int32_t fStrTableIndex; // Index into the final runtime data for this string.
|
||||
// (or, for length 1, the single string char itself,
|
||||
// there being no string table entry for it.)
|
||||
SPUString(UnicodeString *s);
|
||||
~SPUString();
|
||||
};
|
||||
|
||||
|
||||
// String Pool A utility class for holding the strings that are the result of
|
||||
// the spoof mappings. These strings will utimately end up in the
|
||||
// run-time String Table.
|
||||
// This is sort of like a sorted set of strings, except that ICU's anemic
|
||||
// built-in collections don't support those, so it is implemented with a
|
||||
// combination of a uhash and a UVector.
|
||||
|
||||
|
||||
class SPUStringPool : public UMemory {
|
||||
public:
|
||||
SPUStringPool(UErrorCode &status);
|
||||
~SPUStringPool();
|
||||
|
||||
// Add a string. Return the string from the table.
|
||||
// If the input parameter string is already in the table, delete the
|
||||
// input parameter and return the existing string.
|
||||
SPUString *addString(UnicodeString *src, UErrorCode &status);
|
||||
|
||||
|
||||
// Get the n-th string in the collection.
|
||||
SPUString *getByIndex(int32_t i);
|
||||
|
||||
// Sort the contents; affects the ordering of getByIndex().
|
||||
void sort(UErrorCode &status);
|
||||
|
||||
int32_t size();
|
||||
|
||||
private:
|
||||
UVector *fVec; // Elements are SPUString *
|
||||
UHashtable *fHash; // Key: UnicodeString Value: SPUString
|
||||
};
|
||||
|
||||
|
||||
// class ConfusabledataBuilder
|
||||
// An instance of this class exists while the confusable data is being built from source.
|
||||
// It encapsulates the intermediate data structures that are used for building.
|
||||
// It exports one static function, to do a confusable data build.
|
||||
|
||||
class ConfusabledataBuilder : public UMemory {
|
||||
private:
|
||||
SpoofImpl *fSpoofImpl;
|
||||
UChar *fInput;
|
||||
UHashtable *fSLTable;
|
||||
UHashtable *fSATable;
|
||||
UHashtable *fMLTable;
|
||||
UHashtable *fMATable;
|
||||
UnicodeSet *fKeySet; // A set of all keys (UChar32s) that go into the four mapping tables.
|
||||
|
||||
// The binary data is first assembled into the following four collections, then
|
||||
// copied to its final raw-memory destination.
|
||||
UVector *fKeyVec;
|
||||
UVector *fValueVec;
|
||||
UnicodeString *fStringTable;
|
||||
UVector *fStringLengthsTable;
|
||||
|
||||
SPUStringPool *stringPool;
|
||||
URegularExpression *fParseLine;
|
||||
URegularExpression *fParseHexNum;
|
||||
int32_t fLineNum;
|
||||
|
||||
ConfusabledataBuilder(SpoofImpl *spImpl, UErrorCode &status);
|
||||
~ConfusabledataBuilder();
|
||||
void build(const char * confusables, int32_t confusablesLen, UErrorCode &status);
|
||||
|
||||
// Add an entry to the key and value tables being built
|
||||
// input: data from SLTable, MATable, etc.
|
||||
// outut: entry added to fKeyVec and fValueVec
|
||||
void addKeyEntry(UChar32 keyChar, // The key character
|
||||
UHashtable *table, // The table, one of SATable, MATable, etc.
|
||||
int32_t tableFlag, // One of USPOOF_SA_TABLE_FLAG, etc.
|
||||
UErrorCode &status);
|
||||
|
||||
// From an index into fKeyVec & fValueVec
|
||||
// get a UnicodeString with the corresponding mapping.
|
||||
UnicodeString getMapping(int32_t key);
|
||||
|
||||
// Populate the final binary output data array with the compiled data.
|
||||
void outputData(UErrorCode &status);
|
||||
|
||||
public:
|
||||
static void buildConfusableData(SpoofImpl *spImpl, const char * confusables,
|
||||
int32_t confusablesLen, int32_t *errorType, UParseError *pe, UErrorCode &status);
|
||||
};
|
||||
|
||||
#endif
|
431
icu4c/source/i18n/uspoof_buildwsconf.cpp
Normal file
431
icu4c/source/i18n/uspoof_buildwsconf.cpp
Normal file
@ -0,0 +1,431 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_buildwsconf.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan05 (refactoring earlier files)
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Internal functions for compiling Whole Script confusable source data
|
||||
* into its binary (runtime) form. The binary data format is described
|
||||
* in uspoof_impl.h
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uregex.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "uhash.h"
|
||||
#include "uvector.h"
|
||||
#include "uassert.h"
|
||||
#include "uspoof_buildwsconf.h"
|
||||
|
||||
|
||||
//#include <stdio.h> // TODO: debug. remove.
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
||||
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A # (o) LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups.
//
// The ".." between the two ends of a code point range is matched literally
// (each dot is escaped).  A line with anything else between two hex numbers
// does not match the data alternative and is reported through the error group.
static const char *parseExp =

        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error
|
||||
|
||||
|
||||
// Extract a regular expression match group into a char * string.
|
||||
// The group must contain only invariant characters.
|
||||
// Used for script names
|
||||
//
|
||||
static void extractGroup(
|
||||
URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
|
||||
|
||||
UChar ubuf[50];
|
||||
ubuf[0] = 0;
|
||||
destBuf[0] = 0;
|
||||
int32_t len = uregex_group(e, group, ubuf, 50, &status);
|
||||
if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
|
||||
return;
|
||||
}
|
||||
UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
|
||||
s.extract(0, len, destBuf, destCapacity, US_INV);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Build the Whole Script Confusable data
|
||||
//
|
||||
// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
|
||||
// because everything is local to this one build function anyhow,
|
||||
// OR
|
||||
// break this function into more reasonably sized pieces, with
|
||||
// state in WSConfusableDataBuilder.
|
||||
//
|
||||
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
||||
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
|
||||
{
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
URegularExpression *parseRegexp = NULL;
|
||||
int32_t inputLen = 0;
|
||||
UChar *input = NULL;
|
||||
int32_t lineNum = 0;
|
||||
|
||||
UVector *scriptSets = NULL;
|
||||
uint32_t rtScriptSetsCount = 2;
|
||||
|
||||
UTrie2 *anyCaseTrie = NULL;
|
||||
UTrie2 *lowerCaseTrie = NULL;
|
||||
|
||||
anyCaseTrie = utrie2_open(0, 0, &status);
|
||||
lowerCaseTrie = utrie2_open(0, 0, &status);
|
||||
|
||||
|
||||
// The scriptSets vector provides a mapping from TRIE values to the set of scripts.
|
||||
//
|
||||
// Reserved TRIE values:
|
||||
// 0: Code point has no whole script confusables.
|
||||
// 1: Code point is of script Common or Inherited.
|
||||
// These code points do not participate in whole script confusable detection.
|
||||
// (This is logically equivalent to saying that they contain confusables in
|
||||
// all scripts)
|
||||
//
|
||||
// Because Trie values are indexes into the ScriptSets vector, pre-fill
|
||||
// vector positions 0 and 1 to avoid conflicts with the reserved values.
|
||||
|
||||
scriptSets = new UVector(status);
|
||||
if (scriptSets == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
scriptSets->addElement((void *)NULL, status);
|
||||
scriptSets->addElement((void *)NULL, status);
|
||||
|
||||
// Convert the user input data from UTF-8 to UChar (UTF-16)
|
||||
u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
||||
if (input == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
|
||||
|
||||
|
||||
|
||||
parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
|
||||
|
||||
// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
|
||||
// given the syntax of the input.
|
||||
if (*input == 0xfeff) {
|
||||
*input = 0x20;
|
||||
}
|
||||
|
||||
// Parse the input, one line per iteration of this loop.
|
||||
uregex_setText(parseRegexp, input, inputLen, &status);
|
||||
while (uregex_findNext(parseRegexp, &status)) {
|
||||
lineNum++;
|
||||
UChar line[200];
|
||||
uregex_group(parseRegexp, 0, line, 200, &status);
|
||||
if (uregex_start(parseRegexp, 1, &status) >= 0) {
|
||||
// this was a blank or comment line.
|
||||
continue;
|
||||
}
|
||||
if (uregex_start(parseRegexp, 8, &status) >= 0) {
|
||||
// input file syntax error.
|
||||
status = U_PARSE_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// Pick up the start and optional range end code points from the parsed line.
|
||||
UChar32 startCodePoint = SpoofImpl::ScanHex(
|
||||
input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
|
||||
UChar32 endCodePoint = startCodePoint;
|
||||
if (uregex_start(parseRegexp, 3, &status) >=0) {
|
||||
endCodePoint = SpoofImpl::ScanHex(
|
||||
input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
|
||||
}
|
||||
|
||||
// Extract the two script names from the source line. We need these in an 8 bit
|
||||
// default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
|
||||
// to the ICU u_getPropertyValueEnum() function. Ugh.
|
||||
char srcScriptName[20];
|
||||
char targScriptName[20];
|
||||
extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
|
||||
extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
|
||||
UScriptCode srcScript =
|
||||
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
|
||||
UScriptCode targScript =
|
||||
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
// select the table - (A) any case or (L) lower case only
|
||||
UTrie2 *table = anyCaseTrie;
|
||||
if (uregex_start(parseRegexp, 7, &status) >= 0) {
|
||||
table = lowerCaseTrie;
|
||||
}
|
||||
|
||||
// Build the set of scripts containing confusable characters for
|
||||
// the code point(s) specified in this input line.
|
||||
// Sanity check that the script of the source code point is the same
|
||||
// as the source script indicated in the input file. Failure of this check is
|
||||
// an error in the input file.
|
||||
// Include the source script in the set (needed for Mixed Script Confusable detection).
|
||||
//
|
||||
UChar32 cp;
|
||||
for (cp=startCodePoint; cp<=endCodePoint; cp++) {
|
||||
int32_t setIndex = utrie2_get32(table, cp);
|
||||
BuilderScriptSet *bsset = NULL;
|
||||
if (setIndex > 0) {
|
||||
U_ASSERT(setIndex < scriptSets->size());
|
||||
bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
|
||||
} else {
|
||||
bsset = new BuilderScriptSet();
|
||||
if (bsset == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
bsset->codePoint = cp;
|
||||
bsset->trie = table;
|
||||
bsset->sset = new ScriptSet();
|
||||
setIndex = scriptSets->size();
|
||||
bsset->index = setIndex;
|
||||
bsset->rindex = 0;
|
||||
if (bsset->sset == NULL) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
scriptSets->addElement(bsset, status);
|
||||
utrie2_set32(table, cp, setIndex, &status);
|
||||
}
|
||||
bsset->sset->Union(targScript);
|
||||
bsset->sset->Union(srcScript);
|
||||
|
||||
if (U_FAILURE(status)) {
|
||||
goto cleanup;
|
||||
}
|
||||
UScriptCode cpScript = uscript_getScript(cp, &status);
|
||||
if (cpScript != srcScript) {
|
||||
status = U_INVALID_FORMAT_ERROR;
|
||||
goto cleanup;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Eliminate duplicate script sets. At this point we have a separate
|
||||
// script set for every code point that had data in the input file.
|
||||
//
|
||||
// We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
|
||||
//
|
||||
// printf("Number of scriptSets: %d\n", scriptSets->size());
|
||||
{
|
||||
int32_t duplicateCount = 0;
|
||||
rtScriptSetsCount = 2;
|
||||
for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
|
||||
BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
|
||||
if (outerSet->index != static_cast<uint32_t>(outeri)) {
|
||||
// This set was already identified as a duplicate.
|
||||
// It will not be allocated a position in the runtime array of ScriptSets.
|
||||
continue;
|
||||
}
|
||||
outerSet->rindex = rtScriptSetsCount++;
|
||||
for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
|
||||
BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
|
||||
if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
|
||||
delete innerSet->sset;
|
||||
innerSet->scriptSetOwned = FALSE;
|
||||
innerSet->sset = outerSet->sset;
|
||||
innerSet->index = outeri;
|
||||
innerSet->rindex = outerSet->rindex;
|
||||
duplicateCount++;
|
||||
}
|
||||
// But this doesn't get all. We need to fix the TRIE.
|
||||
}
|
||||
}
|
||||
// printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Update the Trie values to be reflect the run time script indexes (after duplicate merging).
|
||||
// (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
|
||||
// are unused, which is why the loop index starts at 2.)
|
||||
{
|
||||
for (int32_t i=2; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
if (bSet->rindex != (uint32_t)i) {
|
||||
utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// For code points with script==Common or script==Inherited,
|
||||
// Set the reserved value of 1 into both Tries. These characters do not participate
|
||||
// in Whole Script Confusable detection; this reserved value is the means
|
||||
// by which they are detected.
|
||||
{
|
||||
UnicodeSet ignoreSet;
|
||||
ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
||||
UnicodeSet inheritedSet;
|
||||
inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
||||
ignoreSet.addAll(inheritedSet);
|
||||
for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
|
||||
UChar32 rangeStart = ignoreSet.getRangeStart(rn);
|
||||
UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
|
||||
utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
||||
utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
||||
}
|
||||
}
|
||||
|
||||
// Serialize the data to the Spoof Detector
|
||||
{
|
||||
utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
||||
int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
|
||||
// printf("Any case Trie size: %d\n", size);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
|
||||
spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
|
||||
void *where = spImpl->fSpoofData->reserveSpace(size, status);
|
||||
utrie2_serialize(anyCaseTrie, where, size, &status);
|
||||
|
||||
utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
||||
size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
|
||||
// printf("Lower case Trie size: %d\n", size);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
goto cleanup;
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
|
||||
spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
|
||||
where = spImpl->fSpoofData->reserveSpace(size, status);
|
||||
utrie2_serialize(lowerCaseTrie, where, size, &status);
|
||||
|
||||
spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
|
||||
spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
|
||||
ScriptSet *rtScriptSets = static_cast<ScriptSet *>
|
||||
(spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
|
||||
uint32_t rindex = 2;
|
||||
for (int32_t i=2; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
if (bSet->rindex < rindex) {
|
||||
// We have already copied this script set to the serialized data.
|
||||
continue;
|
||||
}
|
||||
U_ASSERT(rindex == bSet->rindex);
|
||||
rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
|
||||
rindex++;
|
||||
}
|
||||
}
|
||||
|
||||
// Open new utrie2s from the serialized data. We don't want to keep the ones
|
||||
// we just built because we would then have two copies of the data, one internal to
|
||||
// the utries that we have already constructed, and one in the serialized data area.
|
||||
// An alternative would be to not pre-serialize the Trie data, but that makes the
|
||||
// spoof detector data different, depending on how the detector was constructed.
|
||||
// It's simpler to keep the data always the same.
|
||||
|
||||
spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
|
||||
UTRIE2_16_VALUE_BITS,
|
||||
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
||||
NULL,
|
||||
&status);
|
||||
|
||||
spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
|
||||
UTRIE2_16_VALUE_BITS,
|
||||
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
|
||||
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
||||
NULL,
|
||||
&status);
|
||||
|
||||
|
||||
|
||||
cleanup:
|
||||
if (U_FAILURE(status)) {
|
||||
pe->line = lineNum;
|
||||
}
|
||||
uregex_close(parseRegexp);
|
||||
uprv_free(input);
|
||||
|
||||
int32_t i;
|
||||
for (i=0; i<scriptSets->size(); i++) {
|
||||
BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
||||
delete bsset;
|
||||
}
|
||||
delete scriptSets;
|
||||
utrie2_close(anyCaseTrie);
|
||||
utrie2_close(lowerCaseTrie);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Construct an empty BuilderScriptSet:  no code point (-1), no Trie, no
// ScriptSet, both indexes zero.  A new object is marked as the owner of
// whatever ScriptSet is attached to it later.
BuilderScriptSet::BuilderScriptSet()
    : codePoint(-1),
      trie(NULL),
      sset(NULL),
      index(0),
      rindex(0),
      scriptSetOwned(TRUE) {
}
|
||||
|
||||
// Delete the underlying ScriptSet, but only when this object owns it.
BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
}
|
||||
|
||||
|
||||
|
56
icu4c/source/i18n/uspoof_buildwsconf.h
Normal file
56
icu4c/source/i18n/uspoof_buildwsconf.h
Normal file
@ -0,0 +1,56 @@
|
||||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2008-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: uspoof_buildwsconf.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009Jan19
|
||||
* created by: Andy Heninger
|
||||
*
|
||||
* Internal classes and functions
|
||||
* for compiling whole script confusable data into its binary (runtime) form.
|
||||
*/
|
||||
|
||||
#ifndef __USPOOF_BUILDWSCONF_H__
|
||||
#define __USPOOF_BUILDWSCONF_H__
|
||||
|
||||
#include "uspoof_impl.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
//
|
||||
// class BuilderScriptSet. Represents the set of scripts (Script Codes)
|
||||
// containing characters that are confusable with one specific
|
||||
// code point.
|
||||
//
|
||||
class BuilderScriptSet: public UMemory {
|
||||
public:
|
||||
UChar32 codePoint; // The source code point.
|
||||
UTrie2 *trie; // Any-case or Lower-case Trie.
|
||||
// These Trie tables are the final result of the
|
||||
// build. This flag indicates which of the two
|
||||
// this set of data is for.
|
||||
ScriptSet *sset; // The set of scripts itself.
|
||||
|
||||
// Vectors of all B
|
||||
uint32_t index; // Index of this set in the Build Time vector
|
||||
// of script sets.
|
||||
uint32_t rindex; // Index of this set in the final (runtime)
|
||||
// array of sets.
|
||||
UBool scriptSetOwned; // True if this BuilderScriptSet owns (should delete)
|
||||
// its underlying sset.
|
||||
|
||||
BuilderScriptSet();
|
||||
~BuilderScriptSet();
|
||||
};
|
||||
|
||||
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
||||
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status);
|
||||
|
||||
|
||||
#endif
|
841
icu4c/source/i18n/uspoof_impl.cpp
Normal file
841
icu4c/source/i18n/uspoof_impl.cpp
Normal file
@ -0,0 +1,841 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2008-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "utrie2.h"
|
||||
#include "cmemory.h"
|
||||
#include "udatamem.h"
|
||||
#include "umutex.h"
|
||||
#include "udataswp.h"
|
||||
#include "uassert.h"
|
||||
#include "uspoof_impl.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl)
|
||||
|
||||
// Construct a spoof checker that uses the supplied spoof data.
//
//   data    The spoof data for this checker.  The pointer is stored as-is; no
//           additional reference is taken here.
//   status  In/out error code.  If it already indicates a failure on entry the
//           object is left inert (fMagic == 0, no data, no allowed-chars set).
//           Set to U_MEMORY_ALLOCATION_ERROR if the default set of allowed
//           characters cannot be allocated.
//
// By default all checks are enabled and every code point is allowed.
SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode &status) :
    fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fCheckMask(0),
    fAllowedCharsSet(NULL) {
    if (U_FAILURE(status)) {
        return;
    }
    fMagic = USPOOF_MAGIC;
    fSpoofData = data;

    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    if (allowedCharsSet == NULL) {
        // Must not fall through: freeze() below would dereference NULL.
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    allowedCharsSet->freeze();
    fAllowedCharsSet = allowedCharsSet;
}
|
||||
|
||||
|
||||
// Default constructor: a checker with no spoof data attached, all checks
// enabled, and every code point allowed.
//
// There is no status parameter to report an allocation failure through, so if
// the allowed-characters set cannot be allocated fAllowedCharsSet is left NULL.
SpoofImpl::SpoofImpl() :
    fMagic(USPOOF_MAGIC), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fCheckMask(0),
    fAllowedCharsSet(NULL) {
    UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff);
    if (allowedCharsSet != NULL) {
        allowedCharsSet->freeze();
    }
    fAllowedCharsSet = allowedCharsSet;
}
|
||||
|
||||
|
||||
// Copy Constructor, used by the user level clone() function.
|
||||
SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) :
    fMagic(0), fSpoofData(NULL), fAllowedCharsSet(NULL) {
    if (U_SUCCESS(status)) {
        // Plain settings are copied by value.
        fMagic     = src.fMagic;
        fChecks    = src.fChecks;
        fCheckMask = src.fCheckMask;

        // The spoof data is shared with the source; just take another reference.
        if (src.fSpoofData != NULL) {
            fSpoofData = src.fSpoofData->addReference();
        }

        // The allowed-characters set is cloned, so each checker has its own.
        const UnicodeSet *clonedSet =
            static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone());
        fAllowedCharsSet = clonedSet;
        if (clonedSet == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
||||
|
||||
SpoofImpl::~SpoofImpl() {
    // Clear the magic number first, so that an application that keeps using a
    // deleted checker fails the fMagic test in validateThis().
    fMagic = 0;

    // Give up our reference to the spoof data.
    // The data deletes itself when its reference count goes to zero.
    if (fSpoofData != NULL) {
        fSpoofData->removeReference();
    }

    delete fAllowedCharsSet;
}
|
||||
|
||||
//
|
||||
// Incoming parameter check on Status and the SpoofChecker object
|
||||
// received from the C API.
|
||||
//
|
||||
const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) {
    // An incoming failure is passed through untouched.
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (sc == NULL) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    // The opaque C handle is really a SpoofImpl.  Verify that it looks like a
    // live one: correct magic number and spoof data attached.
    const SpoofImpl *This = reinterpret_cast<const SpoofImpl *>(sc);
    const UBool looksLive = (This->fMagic == USPOOF_MAGIC) && (This->fSpoofData != NULL);
    if (!looksLive) {
        status = U_INVALID_FORMAT_ERROR;
        return NULL;
    }

    // validateDataVersion() sets status itself when the data header is rejected.
    return SpoofData::validateDataVersion(This->fSpoofData->fRawData, status) ? This : NULL;
}
|
||||
|
||||
// Non-const flavor of validateThis().  Delegates to the const overload and
// strips the const from the result.
SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) {
    const USpoofChecker *constChecker = sc;
    const SpoofImpl *validated = SpoofImpl::validateThis(constChecker, status);
    return const_cast<SpoofImpl *>(validated);
}
|
||||
|
||||
|
||||
|
||||
//--------------------------------------------------------------------------------------
|
||||
//
|
||||
// confusableLookup() This is the heart of the confusable skeleton generation
|
||||
// implementation.
|
||||
//
|
||||
// Given a source character, produce the corresponding
|
||||
// replacement character(s)
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
// Look up the skeleton replacement for a single code point.
//
//   inChar     The source code point.
//   tableMask  Flag bit(s) selecting the confusable table the mapping must
//              belong to.  Tested against the flag byte (top 8 bits) of a key.
//   destBuf    Receives the replacement as UTF-16 code units.  No capacity is
//              passed in, so the caller must supply a buffer that is large enough.
//   returns    The number of UChars written to destBuf.
//
// A code point that has no key for the requested table maps to itself.
int32_t SpoofImpl::confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const {
    int32_t *const keys     = fSpoofData->fCFUKeys;
    const int32_t  keyCount = fSpoofData->fRawData->fCFUKeysSize;
    int32_t       *mid      = NULL;
    UBool          haveKey  = FALSE;

    // Binary search the spoof data key table for inChar.
    // The low 21 bits of each key hold the code point, so the comparison value
    // must be a UChar32; a 16 bit UChar would truncate supplementary code points.
    if (keyCount > 0) {
        int32_t *low   = keys;
        int32_t *limit = keys + keyCount;
        do {
            int32_t delta = (int32_t)(limit-low)/2;
            mid = low + delta;
            UChar32 midc = *mid & 0x1fffff;
            if (inChar == midc) {
                haveKey = TRUE;
                break;
            }
            if (inChar < midc) {
                limit = mid;
            } else {
                low = mid;
            }
        } while (low < limit-1);
        if (!haveKey) {
            mid = low;
            haveKey = (inChar == (UChar32)(*mid & 0x1fffff));
        }
    }
    if (!haveKey) {
        // Char not found.  It maps to itself.
        int32_t i = 0;
        U16_APPEND_UNSAFE(destBuf, i, inChar);
        return i;
    }

    int32_t keyFlags = *mid & 0xff000000;
    if ((keyFlags & tableMask) == 0) {
        // We found the right key char, but the entry doesn't pertain to the
        // table we need.  See if there is an adjacent key for the same char that
        // does.  Both scans are bounded by the ends of the key table.
        int32_t *const keysLimit = keys + keyCount;
        int32_t *match = NULL;
        if (keyFlags & USPOOF_KEY_MULTIPLE_VALUES) {
            int32_t *altMid;
            for (altMid = mid;
                 match == NULL && altMid > keys && (*(altMid-1) & 0x00ffffff) == inChar; ) {
                --altMid;
                if ((*altMid & 0xff000000) & tableMask) {
                    match = altMid;
                }
            }
            for (altMid = mid+1;
                 match == NULL && altMid < keysLimit && (*altMid & 0x00ffffff) == inChar;
                 altMid++) {
                if ((*altMid & 0xff000000) & tableMask) {
                    match = altMid;
                }
            }
        }
        if (match == NULL) {
            // No key entry for this char & table.
            // The input char maps to itself.
            int32_t i = 0;
            U16_APPEND_UNSAFE(destBuf, i, inChar);
            return i;
        }
        mid = match;
        keyFlags = *mid & 0xff000000;
    }

    int32_t stringLen = USPOOF_KEY_LENGTH_FIELD(keyFlags) + 1;
    int32_t keyTableIndex = (int32_t)(mid - keys);

    // Value is either a UChar (for strings of length 1) or
    // an index into the string table (for longer strings).
    uint16_t value = fSpoofData->fCFUValues[keyTableIndex];
    if (stringLen == 1) {
        destBuf[0] = value;
        return 1;
    }

    // String length of 4 from the above lookup is used for all strings of length >= 4.
    // For these, get the real length from the string lengths table,
    // which maps string table indexes to lengths.
    // All strings of the same length are stored contiguously in the string table.
    // 'value' from the lookup above is the starting index for the desired string.
    int32_t ix;
    if (stringLen == 4) {
        int32_t stringLengthsLimit = fSpoofData->fRawData->fCFUStringLengthsSize;
        for (ix = 0; ix < stringLengthsLimit; ix++) {
            if (fSpoofData->fCFUStringLengths[ix].fLastString >= value) {
                stringLen = fSpoofData->fCFUStringLengths[ix].fStrLength;
                break;
            }
        }
        U_ASSERT(ix < stringLengthsLimit);
    }

    // A string may end exactly at the end of the string table, hence "<=".
    U_ASSERT(value + stringLen <= fSpoofData->fRawData->fCFUStringTableLen);
    UChar *src = &fSpoofData->fCFUStrings[value];
    for (ix=0; ix<stringLen; ix++) {
        destBuf[ix] = src[ix];
    }
    return stringLen;
}
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// wholeScriptCheck()
|
||||
//
|
||||
// Input text is already normalized to NFKD
|
||||
// Return the set of scripts, each of which can represent something that is
|
||||
// confusable with the input text. The script of the input text
|
||||
// is included; input consisting of characters from a single script will
|
||||
// always produce a result consisting of a set containing that script.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
void SpoofImpl::wholeScriptCheck(
|
||||
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const {
|
||||
|
||||
int32_t inputIdx = 0;
|
||||
UChar32 c;
|
||||
|
||||
UTrie2 *table =
|
||||
(fChecks & USPOOF_ANY_CASE) ? fSpoofData->fAnyCaseTrie : fSpoofData->fLowerCaseTrie;
|
||||
result->setAll();
|
||||
while (inputIdx < length) {
|
||||
U16_NEXT(text, inputIdx, length, c);
|
||||
uint32_t index = utrie2_get32(table, c);
|
||||
if (index == 0) {
|
||||
// No confusables in another script for this char.
|
||||
// TODO: we should change the data to have sets with just the single script
|
||||
// bit for the script of this char. Gets rid of this special case.
|
||||
// Until then, grab the script from the char and intersect it with the set.
|
||||
UScriptCode cpScript = uscript_getScript(c, &status);
|
||||
U_ASSERT(cpScript > USCRIPT_INHERITED);
|
||||
result->intersect(cpScript);
|
||||
} else if (index == 1) {
|
||||
// Script == Common or Inherited. Nothing to do.
|
||||
} else {
|
||||
result->intersect(fSpoofData->fScriptSets[index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
// Count the distinct scripts in a string, stopping as soon as a second one is seen.
// Characters whose script is Common, Inherited or Unknown are ignored.
//
//   text    The text to scan.
//   length  Length of the text, or -1 if it is NUL terminated.
//   pos     Set only when two scripts were found: the input index just past the
//           character at which the second script was encountered.
//   status  In/out error code; also receives errors from uscript_getScript().
//   returns 0, 1 or 2.
int32_t SpoofImpl::scriptScan
        (const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }

    const UBool nulTerminated = (length == -1);
    int32_t     inputIdx      = 0;
    int32_t     scriptCount   = 0;
    UScriptCode lastScript    = USCRIPT_INVALID_CODE;

    while (scriptCount < 2 && (nulTerminated || inputIdx < length)) {
        UChar32 c;
        U16_NEXT(text, inputIdx, length, c);
        if (nulTerminated && c == 0) {
            break;
        }
        const UScriptCode sc = uscript_getScript(c, &status);
        const UBool ignorable =
            (sc == USCRIPT_COMMON || sc == USCRIPT_INHERITED || sc == USCRIPT_UNKNOWN);
        if (!ignorable && sc != lastScript) {
            scriptCount++;
            lastScript = sc;
        }
    }

    if (scriptCount == 2) {
        pos = inputIdx;
    }
    return scriptCount;
}
|
||||
|
||||
|
||||
// Convert a text format hex number. Utility function used by builder code. Static.
|
||||
// Input: UChar *string text. Output: a UChar32
|
||||
// Input has been pre-checked, and will have no non-hex chars.
|
||||
// The number must fall in the code point range of 0..0x10ffff
|
||||
// Static Function.
|
||||
UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return 0;
    }
    U_ASSERT(limit-start > 0);
    uint32_t val = 0;
    for (int32_t i=start; i<limit; i++) {
        // Try the digit as '0'..'9', then as 'A'..'F', then as 'a'..'f'.
        int digitVal = s[i] - 0x30;
        if (digitVal>9) {
            digitVal = 0xa + (s[i] - 0x41);  // Upper Case 'A'
        }
        if (digitVal>15) {
            digitVal = 0xa + (s[i] - 0x61);  // Lower Case 'a'
        }
        U_ASSERT(digitVal <= 0xf);
        val <<= 4;
        val += digitVal;
        // Range check after every digit.  Checking only once at the end would let
        // a long run of digits wrap the 32 bit accumulator back into the valid
        // code point range and be accepted.
        if (val > 0x10ffff) {
            status = U_PARSE_ERROR;
            return 0;
        }
    }
    return (UChar32)val;
}
|
||||
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------------------------
|
||||
//
|
||||
// class SpoofData Implementation
|
||||
//
|
||||
//----------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
// Check that a raw spoof data header is one this code can use: the status is
// not already a failure, the header pointer is non-NULL, the magic number
// matches, and the format version is no newer than 1.0.
// Returns TRUE if so; otherwise sets status to U_INVALID_FORMAT_ERROR and
// returns FALSE.
UBool SpoofData::validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status) {
    const UBool acceptable =
        U_SUCCESS(status) &&
        rawData != NULL &&
        rawData->fMagic == USPOOF_MAGIC &&
        rawData->fFormatVersion[0] <= 1 &&
        rawData->fFormatVersion[1] <= 0;
    if (acceptable) {
        return TRUE;
    }
    status = U_INVALID_FORMAT_ERROR;
    return FALSE;
}
|
||||
|
||||
//
|
||||
// SpoofData::getDefault() - return a wrapper around the spoof data that is
|
||||
// baked into the default ICU data.
|
||||
//
|
||||
SpoofData *SpoofData::getDefault(UErrorCode &status) {
    // TODO:  Cache it.  Lazy create, keep until cleanup.

    UDataMemory *udm = udata_open(NULL, "cfu", "confusables", &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    SpoofData *This = new SpoofData(udm, status);
    if (This == NULL) {
        // No SpoofData object exists to take ownership of the data memory,
        // so it has to be closed here or it leaks.
        udata_close(udm);
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // The SpoofData destructor closes the UDataMemory it was given.
        delete This;
        return NULL;
    }
    return This;
}
|
||||
|
||||
|
||||
// Wrap spoof data that was loaded through the ICU data loading mechanism.
// The object keeps udm and closes it in its destructor.
SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status)
{
    reset();
    if (U_SUCCESS(status)) {
        // The spoof data proper starts right after the generic ICU data header.
        char *icuHeader = (char *)(udm->pHeader);
        fRawData = reinterpret_cast<SpoofDataHeader *>
                       (icuHeader + udm->pHeader->dataHeader.headerSize);
        fUDM = udm;
        validateDataVersion(fRawData, status);
        initPtrs(status);
    }
}
|
||||
|
||||
|
||||
// Wrap caller-supplied serialized spoof data.  The data is used in place; it
// is neither copied nor owned by this object.
//
//   data    Start of the spoof data (a SpoofDataHeader).
//   length  Number of bytes available at data.
//   status  Set to U_INVALID_FORMAT_ERROR if the buffer is too small to hold the
//           header, is smaller than the length recorded in the header, or the
//           header fails validateDataVersion().
SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status)
{
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // A negative length would turn into a huge size_t and slip through the size
    // comparison, so reject it explicitly.
    if (length < 0 || (size_t)length < sizeof(SpoofDataHeader)) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    void *ncData = const_cast<void *>(data);
    fRawData = static_cast<SpoofDataHeader *>(ncData);
    if (length < fRawData->fLength) {
        status = U_INVALID_FORMAT_ERROR;
        return;
    }
    validateDataVersion(fRawData, status);
    initPtrs(status);
}
|
||||
|
||||
|
||||
// Spoof Data constructor for use from data builder.
|
||||
// Initializes a new, empty data area that will be populated later.
|
||||
SpoofData::SpoofData(UErrorCode &status) {
    reset();
    if (U_FAILURE(status)) {
        return;
    }
    // Unlike the other constructors, this one allocates its raw data and owns it.
    fDataOwned = TRUE;
    fRefCount = 1;

    // The spoof header should already be sized to be a multiple of 16 bytes.
    // Just in case it's not, round it up.
    const uint32_t headerBytes = (sizeof(SpoofDataHeader) + 15) & ~15;
    U_ASSERT(headerBytes == sizeof(SpoofDataHeader));

    fMemLimit = headerBytes;
    fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(headerBytes));
    if (fRawData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Start from an all-zero header, then fill in the magic number and
    // format version 1.0.0.0.
    uprv_memset(fRawData, 0, headerBytes);
    fRawData->fMagic = USPOOF_MAGIC;
    fRawData->fFormatVersion[0] = 1;
    fRawData->fFormatVersion[1] = 0;
    fRawData->fFormatVersion[2] = 0;
    fRawData->fFormatVersion[3] = 0;

    initPtrs(status);
}
|
||||
|
||||
// reset() - initialize all fields.
|
||||
// Should be updated if any new fields are added.
|
||||
// Called by constructors to put things in a known initial state.
|
||||
void SpoofData::reset() {
    // Raw data area and its bookkeeping.
    fRawData   = NULL;
    fDataOwned = FALSE;
    fUDM       = NULL;
    fMemLimit  = 0;
    fRefCount  = 1;

    // Pointers to the confusable mapping sections of the raw data.
    fCFUKeys          = NULL;
    fCFUValues        = NULL;
    fCFUStringLengths = NULL;
    fCFUStrings       = NULL;

    // Tries and the script sets.
    fAnyCaseTrie   = NULL;
    fLowerCaseTrie = NULL;
    fScriptSets    = NULL;
}
|
||||
|
||||
|
||||
// SpoofData::initPtrs()
|
||||
// Initialize the pointers to the various sections of the raw data.
|
||||
//
|
||||
// This function is used both during the Trie building process (multiple
|
||||
// times, as the individual data sections are added), and
|
||||
// during the opening of a Spoof Checker from prebuilt data.
|
||||
//
|
||||
// The pointers for non-existent data sections (identified by an offset of 0)
|
||||
// are set to NULL.
|
||||
//
|
||||
// Note: During building the data, adding each new data section
|
||||
// reallocs the raw data area, which likely relocates it, which
|
||||
// in turn requires reinitializing all of the pointers into it, hence
|
||||
// multiple calls to this function during building.
|
||||
//
|
||||
void SpoofData::initPtrs(UErrorCode &status) {
    // The confusable table pointers are always cleared first, even when status
    // already indicates a failure.
    fCFUKeys          = NULL;
    fCFUValues        = NULL;
    fCFUStringLengths = NULL;
    fCFUStrings       = NULL;
    if (U_FAILURE(status)) {
        return;
    }

    // Every section is located by a byte offset from the start of the raw data.
    // An offset of zero means that the section does not exist.
    char *base = (char *)fRawData;

    if (fRawData->fCFUKeys != 0) {
        fCFUKeys = (int32_t *)(base + fRawData->fCFUKeys);
    }
    if (fRawData->fCFUStringIndex != 0) {
        fCFUValues = (uint16_t *)(base + fRawData->fCFUStringIndex);
    }
    if (fRawData->fCFUStringLengths != 0) {
        fCFUStringLengths = (SpoofStringLengthsElement *)(base + fRawData->fCFUStringLengths);
    }
    if (fRawData->fCFUStringTable != 0) {
        fCFUStrings = (UChar *)(base + fRawData->fCFUStringTable);
    }

    // The tries are opened only once; an already opened trie is left alone.
    // NOTE(review): an open trie presumably still refers to the serialized bytes
    //               it was opened from - confirm that is safe across a realloc
    //               in reserveSpace().
    if (fAnyCaseTrie == NULL && fRawData->fAnyCaseTrie != 0) {
        fAnyCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
            base + fRawData->fAnyCaseTrie, fRawData->fAnyCaseTrieLength, NULL, &status);
    }
    if (fLowerCaseTrie == NULL && fRawData->fLowerCaseTrie != 0) {
        fLowerCaseTrie = utrie2_openFromSerialized(UTRIE2_16_VALUE_BITS,
            base + fRawData->fLowerCaseTrie, fRawData->fLowerCaseTrieLength, NULL, &status);
    }

    if (fRawData->fScriptSets != 0) {
        fScriptSets = (ScriptSet *)(base + fRawData->fScriptSets);
    }
}
|
||||
|
||||
|
||||
// Close both tries, free the raw data if this object allocated it, and close
// the UDataMemory if the data was loaded through one.
SpoofData::~SpoofData() {
    utrie2_close(fAnyCaseTrie);
    fAnyCaseTrie = NULL;
    utrie2_close(fLowerCaseTrie);
    fLowerCaseTrie = NULL;

    // The raw data is freed only when it was allocated by this object.
    if (fDataOwned) {
        uprv_free(fRawData);
    }
    fRawData = NULL;

    if (fUDM != NULL) {
        udata_close(fUDM);
        fUDM = NULL;
    }
}
|
||||
|
||||
|
||||
void SpoofData::removeReference() {
|
||||
if (umtx_atomic_dec(&fRefCount) == 0) {
|
||||
delete this;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Add one reference to this SpoofData (atomic increment of fRefCount).
// Returns this, so that the call can be used directly in an assignment.
SpoofData *SpoofData::addReference() {
|
||||
umtx_atomic_inc(&fRefCount);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
// Grow the raw data area by numBytes (rounded up to a multiple of 16), zero the
// new space, and return a pointer to it.  Only valid on data owned by this
// object.
//
// The raw data may move, so every pointer into it is refreshed through
// initPtrs(), and any pointer obtained from an earlier call is invalid.
//
//   status  Set to U_INTERNAL_PROGRAM_ERROR if the data is not owned by this
//           object, or to U_MEMORY_ALLOCATION_ERROR if the area cannot be grown.
//           In the latter case the existing data is left untouched.
//   returns Pointer to the start of the newly reserved space, NULL on failure.
void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    if (!fDataOwned) {
        U_ASSERT(FALSE);
        status = U_INTERNAL_PROGRAM_ERROR;
        return NULL;
    }

    numBytes = (numBytes + 15) & ~15;   // Round up to a multiple of 16
    uint32_t returnOffset = fMemLimit;
    uint32_t newLimit = fMemLimit + numBytes;

    // Keep the old pointer until the realloc is known to have succeeded;
    // assigning the result straight to fRawData would lose (and leak) the
    // existing data on failure, and the header write below would crash.
    SpoofDataHeader *newData =
        static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, newLimit));
    if (newData == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    fRawData = newData;
    fMemLimit = newLimit;
    fRawData->fLength = fMemLimit;
    uprv_memset((char *)fRawData + returnOffset, 0, numBytes);
    initPtrs(status);
    return (char *)fRawData + returnOffset;
}
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet implementation
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
// Construct an empty set: every bit word is cleared.
ScriptSet::ScriptSet() {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w++] = 0;
    }
}
|
||||
|
||||
// Nothing to release.
ScriptSet::~ScriptSet() {
|
||||
}
|
||||
|
||||
// Two sets are equal when all of their bit words are equal.
UBool ScriptSet::operator == (const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount && bits[w] == other.bits[w]) {
        w++;
    }
    // The scan stops early at the first word that differs.
    return (w == wordCount) ? TRUE : FALSE;
}
|
||||
|
||||
// Add a single script to this set.
// Script code N is bit (N mod 32) of word (N / 32).
void ScriptSet::Union(UScriptCode script) {
    uint32_t index = script / 32;
    // Shift an unsigned one: a plain int 1 shifted by 31 overflows a signed int.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // The index must be within the number of words in bits[].
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    bits[index] |= bit;
}
|
||||
|
||||
|
||||
void ScriptSet::Union(const ScriptSet &other) {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] |= other.bits[i];
|
||||
}
|
||||
}
|
||||
|
||||
void ScriptSet::intersect(const ScriptSet &other) {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] &= other.bits[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Intersect this set with the set containing only the given script.
// Afterwards the set holds that one script if it was a member before, and is
// empty otherwise.
void ScriptSet::intersect(UScriptCode script) {
    uint32_t index = script / 32;
    // Shift an unsigned one: a plain int 1 shifted by 31 overflows a signed int.
    uint32_t bit   = (uint32_t)1 << (script & 31);
    // The index must be within the number of words in bits[].
    U_ASSERT(index < sizeof(bits)/sizeof(uint32_t));
    uint32_t i;
    // Clear every word except the one holding the script's bit,
    // and every other bit within that word.
    for (i=0; i<index; i++) {
        bits[i] = 0;
    }
    bits[index] &= bit;
    for (i=index+1; i<sizeof(bits)/sizeof(uint32_t); i++) {
        bits[i] = 0;
    }
}
|
||||
|
||||
|
||||
// Assignment: copy every bit word from other.
ScriptSet & ScriptSet::operator =(const ScriptSet &other) {
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    uint32_t w = 0;
    while (w < wordCount) {
        bits[w] = other.bits[w];
        ++w;
    }
    return *this;
}
|
||||
|
||||
|
||||
void ScriptSet::setAll() {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] = 0xffffffffu;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void ScriptSet::resetAll() {
|
||||
for (uint32_t i=0; i<sizeof(bits)/sizeof(uint32_t); i++) {
|
||||
bits[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Return the number of scripts (one bits) in the set.
int32_t ScriptSet::countMembers() {
    // Each pass of the inner loop strips the least significant one bit, so the
    // cost is proportional to the number of bits set.  That suits the sparse
    // sets that are the usual case.
    const uint32_t wordCount = sizeof(bits)/sizeof(uint32_t);
    int32_t total = 0;
    for (uint32_t w = 0; w < wordCount; w++) {
        for (uint32_t remaining = bits[w]; remaining != 0; remaining &= (remaining - 1)) {
            total++;
        }
    }
    return total;
}
|
||||
|
||||
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// NFKDBuffer Implementation.
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
|
||||
// Normalize text to NFKD.  The result goes into the small buffer inside this
// object when it fits, and into a heap buffer otherwise.
NFKDBuffer::NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status) {
    fNormalizedText = NULL;
    fNormalizedTextLength = 0;
    fOriginalText = text;
    if (U_FAILURE(status)) {
        return;
    }

    // First attempt: normalize into the built-in buffer.
    fNormalizedText = fSmallBuf;
    fNormalizedTextLength = unorm_normalize(
        text, length, UNORM_NFKD, 0, fNormalizedText, USPOOF_STACK_BUFFER_SIZE, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return;
    }

    // Too big.  The first attempt reported the length that is needed, so
    // allocate that much plus one and normalize again.
    status = U_ZERO_ERROR;
    const int32_t capacity = fNormalizedTextLength + 1;
    fNormalizedText = (UChar *)uprv_malloc(capacity*sizeof(UChar));
    if (fNormalizedText == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    fNormalizedTextLength = unorm_normalize(text, length, UNORM_NFKD, 0,
                                            fNormalizedText, capacity, &status);
}
|
||||
|
||||
|
||||
// Release the normalized text if it lives on the heap rather than in the
// buffer built into this object.
NFKDBuffer::~NFKDBuffer() {
    if (fNormalizedText != fSmallBuf) {
        // The heap buffer comes from uprv_malloc() in the constructor, so it
        // must be released with uprv_free(), not with operator delete.
        uprv_free(fNormalizedText);
    }
    fNormalizedText = 0;
}
|
||||
|
||||
// Return the normalized text.  This is NULL if the constructor was entered
// with a failing status or could not allocate a heap buffer.
const UChar *NFKDBuffer::getBuffer() {
|
||||
return fNormalizedText;
|
||||
}
|
||||
|
||||
// Return the length of the normalized text, as reported by unorm_normalize().
int32_t NFKDBuffer::getLength() {
|
||||
return fNormalizedTextLength;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
//-----------------------------------------------------------------------------
|
||||
//
|
||||
// uspoof_swap - byte swap and char encoding swap of spoof data
|
||||
//
|
||||
//-----------------------------------------------------------------------------
|
||||
// Swap the byte order of serialized spoof data.
//
//   ds       The swapper describing the input and output formats.
//   inData   The complete data, starting with the generic ICU data header.
//   length   Size of inData in bytes, or -1 to preflight (compute the size only).
//   outData  Receives the swapped data.  May be the same buffer as inData.
//   status   In/out error code.
//   returns  Total size in bytes of the ICU data header plus the spoof data,
//            or 0 on failure.
U_CAPI int32_t U_EXPORT2
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
            UErrorCode *status) {

    if (status == NULL || U_FAILURE(*status)) {
        return 0;
    }
    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
        *status=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    //
    //  Check that the data header is for spoof data.
    //    (Header contents are defined in gencfu.cpp)
    //
    const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4);
    if(!(  pInfo->dataFormat[0]==0x43 &&   /* dataFormat="Cfu " */
           pInfo->dataFormat[1]==0x66 &&
           pInfo->dataFormat[2]==0x75 &&
           pInfo->dataFormat[3]==0x20 &&
           pInfo->formatVersion[0]==1  )) {
        udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x "
                             "(format version %02x %02x %02x %02x) is not recognized\n",
                         pInfo->dataFormat[0], pInfo->dataFormat[1],
                         pInfo->dataFormat[2], pInfo->dataFormat[3],
                         pInfo->formatVersion[0], pInfo->formatVersion[1],
                         pInfo->formatVersion[2], pInfo->formatVersion[3]);
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Swap the data header.  (This is the generic ICU Data Header, not the uspoof Specific
    //                         header).  This swap also conveniently gets us
    //                         the size of the ICU d.h., which lets us locate the start
    //                         of the uspoof specific data.
    //
    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
    if (U_FAILURE(*status)) {
        // Without a valid header size the spoof data cannot be located.
        return 0;
    }

    //
    // Get the Spoof Data Header, and check that it appears to be OK.
    //
    const uint8_t   *inBytes =(const uint8_t *)inData+headerSize;
    SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes;
    if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC ||
        ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader))
    {
        udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n");
        *status=U_UNSUPPORTED_ERROR;
        return 0;
    }

    //
    // Preflight operation?  Just return the size
    //
    int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength);
    int32_t totalSize = headerSize + spoofDataLength;
    if (length < 0) {
        return totalSize;
    }

    //
    // Check that length passed in is consistent with length from Spoof data header.
    //
    if (length < totalSize) {
        udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n",
                         spoofDataLength);
        *status=U_INDEX_OUTOFBOUNDS_ERROR;
        return 0;
    }

    //
    // Swap the Data.  Do the data itself first, then the Spoof Data Header, because
    //                 we need to reference the header to locate the data, and an
    //                 inplace swap of the header leaves it unusable.
    //
    uint8_t         *outBytes = (uint8_t *)outData + headerSize;
    SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes;

    int32_t   sectionStart;
    int32_t   sectionLength;

    //
    // If not swapping in place, zero out the output buffer before starting.
    //    Gaps may exist between the individual sections, and these must be zeroed in
    //    the output buffer.  The simplest way to do that is to just zero the whole thing.
    //
    if (inBytes != outBytes) {
        uprv_memset(outBytes, 0, spoofDataLength);
    }

    // Confusables Keys Section   (fCFUKeys)
    sectionStart  = ds->readUInt32(spoofDH->fCFUKeys);
    sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Index Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringIndex);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Table Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringTable);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // String Lengths Section
    sectionStart  = ds->readUInt32(spoofDH->fCFUStringLengths);
    sectionLength = ds->readUInt32(spoofDH->fCFUStringLengthsSize) * 4;
    ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Any Case Trie
    sectionStart  = ds->readUInt32(spoofDH->fAnyCaseTrie);
    sectionLength = ds->readUInt32(spoofDH->fAnyCaseTrieLength);
    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Lower Case Trie
    sectionStart  = ds->readUInt32(spoofDH->fLowerCaseTrie);
    sectionLength = ds->readUInt32(spoofDH->fLowerCaseTrieLength);
    utrie2_swap(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // Script Sets.  The data is an array of int32_t
    sectionStart  = ds->readUInt32(spoofDH->fScriptSets);
    sectionLength = ds->readUInt32(spoofDH->fScriptSetsLength) * 4;
    ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status);

    // And, last, swap the header itself.
    //   int32_t   fMagic             // swap this
    //   uint8_t   fFormatVersion[4]  // Do not swap this, but do copy it.
    //   int32_t   all the rest       // Swap the rest, all is 32 bit stuff.
    //
    uint32_t magic = ds->readUInt32(spoofDH->fMagic);
    ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic);

    // The format version bytes are not swapped, but when the output is a separate
    // (zeroed) buffer they still have to be copied across.
    if (inBytes != outBytes) {
        uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion,
                    sizeof(spoofDH->fFormatVersion));
    }

    // Swap starting at fLength; the 8 bytes skipped are fMagic and fFormatVersion[4].
    ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8, &outputDH->fLength, status);

    return totalSize;
}
|
||||
|
||||
|
397
icu4c/source/i18n/uspoof_impl.h
Normal file
397
icu4c/source/i18n/uspoof_impl.h
Normal file
@ -0,0 +1,397 @@
|
||||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 2008-2009, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
*
|
||||
* uspoof_impl.h
|
||||
*
|
||||
* Implementation header for spoof detection
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef USPOOFIM_H
|
||||
#define USPOOFIM_H
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uspoof.h"
|
||||
#include "utrie2.h"
|
||||
#include "unicode/uscript.h"
|
||||
#include "unicode/udata.h"
|
||||
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// The maximum length (in UTF-16 UChars) of the skeleton replacement string resulting from
|
||||
// a single input code point. This is a function of the unicode.org data.
|
||||
#define USPOOF_MAX_SKELETON_EXPANSION 20
|
||||
|
||||
// The default stack buffer size for copies or conversions or normalizations
|
||||
// of input strings being checked. (Used in multiple places.)
|
||||
#define USPOOF_STACK_BUFFER_SIZE 100
|
||||
|
||||
// Magic number for sanity checking spoof data.
|
||||
#define USPOOF_MAGIC 0x3845fdef
|
||||
|
||||
class SpoofData;
|
||||
struct SpoofDataHeader;
|
||||
struct SpoofStringLengthsElement;
|
||||
class ScriptSet;
|
||||
|
||||
/**
|
||||
* Class SpoofImpl corresponds directly to the plain C API opaque type
|
||||
* USpoofChecker. One can be cast to the other.
|
||||
*/
|
||||
class SpoofImpl : public UObject {
|
||||
public:
|
||||
SpoofImpl(SpoofData *data, UErrorCode &status);
|
||||
SpoofImpl();
|
||||
virtual ~SpoofImpl();
|
||||
|
||||
/** Copy constructor, used by the user level uspoof_clone() function.
|
||||
*/
|
||||
SpoofImpl(const SpoofImpl &src, UErrorCode &status);
|
||||
|
||||
// Convert between the opaque C API type (USpoofChecker) and this class.
// Two overloads are declared, one for mutable and one for const checkers.
// NOTE(review): the validation performed is not visible in this header; presumably
// it involves the fMagic sanity-check member declared below - confirm in the implementation.
static SpoofImpl *validateThis(USpoofChecker *sc, UErrorCode &status);
|
||||
static const SpoofImpl *validateThis(const USpoofChecker *sc, UErrorCode &status);
|
||||
|
||||
/** Get the confusable skeleton transform for a single code point.
|
||||
* The result is a string with a length between 1 and 18.
 * NOTE(review): USPOOF_MAX_SKELETON_EXPANSION is defined as 20 in this header;
 * confirm which of the two bounds destBuf must be sized for.
|
||||
* @param tableMask bit flag specifying which confusable table to use.
|
||||
* One of USPOOF_SL_TABLE_FLAG, USPOOF_MA_TABLE_FLAG, etc.
|
||||
* @return The length in UTF-16 code units of the substitution string.
|
||||
*/
|
||||
int32_t confusableLookup(UChar32 inChar, int32_t tableMask, UChar *destBuf) const;
|
||||
|
||||
/** parse a hex number. Utility used by the builders. */
|
||||
static UChar32 ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status);
|
||||
|
||||
// Implementation for Whole Script tests.
|
||||
// This function returns void; its output is delivered through the ScriptSet
|
||||
// pointed to by result. NOTE(review): this comment used to describe a returned
// test bit flag, which does not match the declared signature - confirm the contract.
|
||||
void wholeScriptCheck(
|
||||
const UChar *text, int32_t length, ScriptSet *result, UErrorCode &status) const;
|
||||
|
||||
/** Scan a string to determine how many scripts it includes.
|
||||
* Ignore characters with script=Common and script=Inherited.
|
||||
* @param text The UChar text to be scanned
|
||||
* @param length The length of the input text, -1 for nul terminated.
|
||||
* @param pos An out parameter, set to the first input position at which
|
||||
* a second script was encountered, ignoring Common and Inherited.
|
||||
* @param status For errors.
|
||||
* @return the number of (non-common,inherited) scripts encountered,
|
||||
* clipped to a max of two.
|
||||
*/
|
||||
int32_t scriptScan(const UChar *text, int32_t length, int32_t &pos, UErrorCode &status) const;
|
||||
|
||||
|
||||
// WholeScript and MixedScript check implementation.
|
||||
// Note: distinct from wholeScriptCheck() above (different capitalization); this one returns a ScriptSet pointer.
|
||||
ScriptSet *WholeScriptCheck(const UChar *text, int32_t length, UErrorCode &status) const;
|
||||
|
||||
static UClassID U_EXPORT2 getStaticClassID(void);
|
||||
virtual UClassID getDynamicClassID(void) const;
|
||||
|
||||
//
|
||||
// Data Members
|
||||
//
|
||||
|
||||
int32_t fMagic; // Internal sanity check.
|
||||
int32_t fChecks; // Bit vector of checks to perform.
|
||||
|
||||
// The spoof data used by this checker. The SpoofData class is reference counted
// (see SpoofData::addReference() / removeReference()).
SpoofData *fSpoofData;
|
||||
|
||||
int32_t fCheckMask; // Spoof table selector. f(Check Type)
|
||||
|
||||
const UnicodeSet *fAllowedCharsSet; // The UnicodeSet of allowed characters.
|
||||
// for this Spoof Checker. Defaults to all chars.
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Confusable Mappings Data Structures
|
||||
//
|
||||
// For the confusable data, we are essentially implementing a map,
|
||||
// key: a code point
|
||||
// value: a string. Most commonly one char in length, but can be more.
|
||||
//
|
||||
// The keys are stored as a sorted array of 32 bit ints.
|
||||
// bits 0-23 a code point value
|
||||
// bits 24-31 flags
|
||||
// 24: 1 if entry applies to SL table
|
||||
// 25: 1 if entry applies to SA table
|
||||
// 26: 1 if entry applies to ML table
|
||||
// 27: 1 if entry applies to MA table
|
||||
// 28: 1 if there are multiple entries for this code point.
|
||||
// 29-30: length of value string, in UChars.
|
||||
// values are (1, 2, 3, other)
|
||||
// The key table is sorted in ascending code point order. (not on the
|
||||
// 32 bit int value, the flag bits do not participate in the sorting.)
|
||||
//
|
||||
// Lookup is done by means of a binary search in the key table.
|
||||
//
|
||||
// The corresponding values are kept in a parallel array of 16 bit ints.
|
||||
// If the value string is of length 1, it is literally in the value array.
|
||||
// For longer strings, the value array contains an index into the strings table.
|
||||
//
|
||||
// String Table:
|
||||
// The strings table contains all of the value strings (those of length two or greater)
|
||||
// concatenated together into one long UChar (UTF-16) array.
|
||||
//
|
||||
// The array is arranged by length of the strings - all strings of the same length
|
||||
// are stored together. The sections are ordered by length of the strings -
|
||||
// all two char strings first, followed by all of the three Char strings, etc.
|
||||
//
|
||||
// There is no nul character or other mark between adjacent strings.
|
||||
//
|
||||
// String Lengths table
|
||||
// The length of strings from 1 to 3 is flagged in the key table.
|
||||
// For strings of length 4 or longer, the string length table provides a
|
||||
// mapping between an index into the string table and the corresponding length.
|
||||
// Strings of these lengths are rare, so lookup time is not an issue.
|
||||
// Each entry consists of
|
||||
// uint16_t index of the _last_ string with this length
|
||||
// uint16_t the length
|
||||
//
|
||||
|
||||
// Flag bits in the Key entries
//   Bits 24-27 select the confusable table(s) an entry applies to (SL, SA, ML, MA),
//   bit 28 marks a code point that has multiple entries, and bits 29-30 hold the
//   length field of the value string.
#define USPOOF_SL_TABLE_FLAG (1<<24)
#define USPOOF_SA_TABLE_FLAG (1<<25)
#define USPOOF_ML_TABLE_FLAG (1<<26)
#define USPOOF_MA_TABLE_FLAG (1<<27)
#define USPOOF_KEY_MULTIPLE_VALUES (1<<28)
#define USPOOF_KEY_LENGTH_SHIFT 29
// Extract the two-bit length field from a key. The shift count comes from
// USPOOF_KEY_LENGTH_SHIFT so that the field position is defined in one place only.
#define USPOOF_KEY_LENGTH_FIELD(x) (((x)>>USPOOF_KEY_LENGTH_SHIFT) & 3)
|
||||
|
||||
|
||||
// One entry of the String Lengths table (see the "String Lengths table" notes in the
// Confusable Mappings comment earlier in this header): a pair of 16 bit values
// associating a position in the strings table with a string length.
struct SpoofStringLengthsElement {
|
||||
uint16_t fLastString; // index in string table of last string with this length
|
||||
uint16_t fStrLength; // Length of strings
|
||||
};
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// ScriptSet - Wrapper class for the Script code bit sets that are part of the
|
||||
// whole script confusable data.
|
||||
//
|
||||
// This class is used both at data build and at run time.
|
||||
// The constructor is only used at build time.
|
||||
// At run time, just point at the prebuilt data and go.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
class ScriptSet: public UMemory {
|
||||
public:
|
||||
ScriptSet();
|
||||
~ScriptSet();
|
||||
|
||||
// Comparison and assignment. Note that operator == is declared non-const.
UBool operator == (const ScriptSet &other);
|
||||
ScriptSet & operator = (const ScriptSet &other);
|
||||
|
||||
// Set operations, each available with another ScriptSet or a single script code.
// NOTE(review): presumably these modify this set in place (they return void) - confirm in the implementation.
void Union(const ScriptSet &other);
|
||||
void Union(UScriptCode script);
|
||||
void intersect(const ScriptSet &other);
|
||||
void intersect(UScriptCode script);
|
||||
void setAll();
|
||||
void resetAll();
|
||||
int32_t countMembers();
|
||||
|
||||
private:
|
||||
// 6 x 32 = 192 bits of storage, i.e. 24 bytes, matching the "24 bytes each" noted for
// SpoofDataHeader::fScriptSetsLength. Presumably one bit per UScriptCode value - confirm.
uint32_t bits[6];
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// NFKDBuffer A little class to handle the NFKD normalization that is
|
||||
// needed on incoming identifiers to be checked.
|
||||
// Takes care of buffer handling and normalization
|
||||
//
|
||||
// Instances of this class are intended to be stack-allocated.
|
||||
//
|
||||
// TODO: how to map position offsets back to user values?
|
||||
//
|
||||
//--------------------------------------------------------------------------------
|
||||
class NFKDBuffer: public UMemory {
|
||||
public:
|
||||
// text/length: the input string; status: error code in/out parameter.
// NOTE(review): per the class description the normalization is NFKD; presumably it
// happens in this constructor - confirm in the implementation.
NFKDBuffer(const UChar *text, int32_t length, UErrorCode &status);
|
||||
~NFKDBuffer();
|
||||
// Accessors. NOTE(review): presumably these expose fNormalizedText and fNormalizedTextLength - confirm.
const UChar *getBuffer();
|
||||
int32_t getLength();
|
||||
|
||||
private:
|
||||
const UChar *fOriginalText;
|
||||
UChar *fNormalizedText;
|
||||
int32_t fNormalizedTextLength;
|
||||
// In-object buffer of USPOOF_STACK_BUFFER_SIZE UChars.
// NOTE(review): presumably used to avoid a heap allocation for short inputs - confirm.
UChar fSmallBuf[USPOOF_STACK_BUFFER_SIZE];
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
//-------------------------------------------------------------------------------------
|
||||
//
|
||||
// SpoofData
|
||||
//
|
||||
// A small class that wraps the raw (usually memory mapped) spoof data.
|
||||
// Serves two primary functions:
|
||||
// 1. Convenience. Contains real pointers to the data, to avoid dealing with
|
||||
// the offsets in the raw data.
|
||||
// 2. Reference counting. When a spoof checker is cloned, the raw data is shared
|
||||
// and must be retained until all checkers using the data are closed.
|
||||
// Nothing in this struct includes state that is specific to any particular
|
||||
// USpoofChecker object.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
class SpoofData: public UMemory {
|
||||
public:
|
||||
static SpoofData *getDefault(UErrorCode &status); // Load standard ICU spoof data.
|
||||
SpoofData(UErrorCode &status); // Create new spoof data wrapper.
|
||||
// Only used when building new data from rules.
|
||||
|
||||
// Constructor for use when creating from prebuilt default data.
|
||||
// A UDataMemory is what the ICU internal data loading functions provide.
|
||||
// The udm is adopted by the SpoofData.
|
||||
SpoofData(UDataMemory *udm, UErrorCode &status);
|
||||
|
||||
// Constructor for use when creating from serialized data.
|
||||
//
|
||||
SpoofData(const void *serializedData, int32_t length, UErrorCode &status);
|
||||
|
||||
// Check raw Spoof Data Version compatibility.
|
||||
// Return TRUE if it looks good.
|
||||
static UBool validateDataVersion(const SpoofDataHeader *rawData, UErrorCode &status);
|
||||
~SpoofData(); // Destructor not normally used.
|
||||
// Use removeReference() instead.
|
||||
// Reference Counting functions.
|
||||
// Clone of a user-level spoof detector increments the ref count on the data.
|
||||
// Close of a user-level spoof detector decrements the ref count.
|
||||
// If the data is owned by us, it will be deleted when count goes to zero.
|
||||
SpoofData *addReference();
|
||||
void removeReference();
|
||||
|
||||
// Reserve space in the raw data. For use by builder when putting together a
|
||||
// new set of data. Init the new storage to zero, to prevent inconsistent
|
||||
// results if it is not all otherwise set by the requester.
|
||||
// Return:
|
||||
// pointer to the new space that was added by this function.
|
||||
void *reserveSpace(int32_t numBytes, UErrorCode &status);
|
||||
|
||||
// initialize the pointers from this object to the raw data.
|
||||
void initPtrs(UErrorCode &status);
|
||||
|
||||
// Reset all fields to an initial state.
|
||||
// Called from the top of all constructors.
|
||||
void reset();
|
||||
|
||||
// Data members. Note that these are declared in the public section of the class.
SpoofDataHeader *fRawData; // Ptr to the raw memory-mapped data
|
||||
UBool fDataOwned; // True if the raw data is owned, and needs
|
||||
// to be deleted when refcount goes to zero.
|
||||
UDataMemory *fUDM; // If not NULL, our data came from a
|
||||
// UDataMemory, which we must close when
|
||||
// we're done.
|
||||
|
||||
uint32_t fMemLimit; // Limit of available raw data space
|
||||
int32_t fRefCount; // presumably the count maintained by addReference() / removeReference() - confirm
|
||||
|
||||
// Confusable data
|
||||
// NOTE(review): these members mirror the table offsets in SpoofDataHeader; presumably
// they are the pointers that initPtrs() sets up to point into fRawData - confirm.
int32_t *fCFUKeys;
|
||||
uint16_t *fCFUValues;
|
||||
SpoofStringLengthsElement *fCFUStringLengths;
|
||||
UChar *fCFUStrings;
|
||||
|
||||
// Whole Script Confusable Data
|
||||
UTrie2 *fAnyCaseTrie;
|
||||
UTrie2 *fLowerCaseTrie;
|
||||
ScriptSet *fScriptSets;
|
||||
};
|
||||
|
||||
|
||||
//---------------------------------------------------------------------------------------
|
||||
//
|
||||
// Raw Binary Data Formats, as loaded from the ICU data file,
|
||||
// or as built by the builder.
|
||||
//
|
||||
//---------------------------------------------------------------------------------------
|
||||
struct SpoofDataHeader {
|
||||
int32_t fMagic; // Magic number; presumably USPOOF_MAGIC (0x3845fdef). The value 0x8345fdef
                // previously given in this comment did not match that macro - confirm.
|
||||
uint8_t fFormatVersion[4]; // Data Format. Same as the value in struct UDataInfo
|
||||
// if there is one associated with this data.
|
||||
int32_t fLength; // Total length in bytes of this spoof data,
|
||||
// including all sections, not just the header.
|
||||
|
||||
// The following four sections refer to data representing the confusable data
|
||||
// from the Unicode.org data from "confusables.txt"
|
||||
|
||||
int32_t fCFUKeys; // byte offset to Keys table (from SpoofDataHeader *)
|
||||
int32_t fCFUKeysSize; // number of entries in keys table (32 bits each)
|
||||
|
||||
// TODO: change name to fCFUValues, for consistency.
|
||||
int32_t fCFUStringIndex; // byte offset to String Indexes table
|
||||
int32_t fCFUStringIndexSize; // number of entries in String Indexes table (16 bits each)
|
||||
// (number of entries must be same as in Keys table)
|
||||
|
||||
int32_t fCFUStringTable; // byte offset of String table
|
||||
int32_t fCFUStringTableLen; // length of string table (in 16 bit UChars)
|
||||
|
||||
int32_t fCFUStringLengths; // byte offset to String Lengths table
|
||||
int32_t fCFUStringLengthsSize; // number of entries in lengths table. (2 x 16 bits each)
|
||||
|
||||
|
||||
// The following sections are for data from confusablesWholeScript.txt
|
||||
|
||||
int32_t fAnyCaseTrie; // byte offset to the serialized Any Case Trie
|
||||
int32_t fAnyCaseTrieLength; // Length (bytes) of the serialized Any Case Trie
|
||||
|
||||
int32_t fLowerCaseTrie; // byte offset to the serialized Lower Case Trie
|
||||
int32_t fLowerCaseTrieLength; // Length (bytes) of the serialized Lower Case Trie
|
||||
|
||||
int32_t fScriptSets; // byte offset to array of ScriptSets
|
||||
int32_t fScriptSetsLength; // Number of ScriptSets (24 bytes each)
|
||||
|
||||
|
||||
// The following sections are for data from xidmodifications.txt
// (no fields for that data are declared here; only the padding below follows)
|
||||
|
||||
|
||||
int32_t unused[15]; // Padding, Room for Expansion
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Structure for the Whole Script Confusable Data
|
||||
// See Unicode UAX-39, Unicode Security Mechanisms, for a description of the
|
||||
// Whole Script confusable data
|
||||
//
|
||||
// The data provides mappings from code points to a set of scripts
|
||||
// that contain characters that might be confused with the code point.
|
||||
// There are two mappings, one for lower case only, and one for characters
|
||||
// of any case.
|
||||
//
|
||||
// The actual data consists of a utrie2 to map from a code point to an offset,
|
||||
// and an array of UScriptSets (essentially bit maps) that is indexed
|
||||
// by the offsets obtained from the Trie.
|
||||
//
|
||||
//
|
||||
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/**
|
||||
* Endianness swap function for binary spoof data.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *status);
|
||||
|
||||
|
||||
#endif /* USPOOFIM_H */
|
||||
|
@ -1,6 +1,6 @@
|
||||
#******************************************************************************
|
||||
#
|
||||
# Copyright (C) 1999-2008, International Business Machines
|
||||
# Copyright (C) 1999-2009, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
#******************************************************************************
|
||||
@ -52,7 +52,7 @@ ncnvfbts.o ncnvtst.o putiltst.o cstrtest.o udatpg_test.o utf8tst.o \
|
||||
stdnmtst.o usrchtst.o custrtrn.o sorttest.o trietest.o trie2test.o usettest.o \
|
||||
uenumtst.o utmstest.o currtest.o \
|
||||
idnatest.o nfsprep.o spreptst.o sprpdata.o \
|
||||
hpmufn.o tracetst.o reapits.o utexttst.o ucsdetst.o
|
||||
hpmufn.o tracetst.o reapits.o utexttst.o ucsdetst.o spooftest.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1996-2008, International Business Machines Corporation and
|
||||
* Copyright (c) 1996-2009, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
@ -38,6 +38,7 @@ void addURegexTest(TestNode** root);
|
||||
void addUTextTest(TestNode** root);
|
||||
void addUCsdetTest(TestNode** root);
|
||||
void addCnvSelTest(TestNode** root);
|
||||
void addUSpoofTest(TestNode** root);
|
||||
|
||||
void addAllTests(TestNode** root)
|
||||
{
|
||||
@ -75,6 +76,7 @@ void addAllTests(TestNode** root)
|
||||
#if !UCONFIG_NO_TRANSLITERATION
|
||||
addUTransTest(root);
|
||||
#endif
|
||||
addUSpoofTest(root);
|
||||
|
||||
}
|
||||
|
||||
|
@ -931,6 +931,14 @@
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="spoof"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\spooftest.c"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
|
@ -886,14 +886,7 @@ static void TestGetKeywordValuesForLocale(void) {
|
||||
1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1
|
||||
};
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
int32_t i, j, size;
|
||||
UEnumeration *pref, *all;
|
||||
const char *loc = NULL;
|
||||
UBool matchPref, matchAll;
|
||||
const char *value = NULL;
|
||||
int32_t valueLength = 0;
|
||||
|
||||
UList *ALLList = NULL;
|
||||
int32_t i;
|
||||
|
||||
UEnumeration *ALL = ucurr_getKeywordValuesForLocale("currency", uloc_getDefault(), FALSE, &status);
|
||||
if (ALL == NULL) {
|
||||
@ -902,12 +895,15 @@ static void TestGetKeywordValuesForLocale(void) {
|
||||
}
|
||||
|
||||
for (i = 0; i < PREFERRED_SIZE; i++) {
|
||||
pref = NULL;
|
||||
all = NULL;
|
||||
loc = PREFERRED[i][0];
|
||||
UEnumeration *pref = NULL;
|
||||
UEnumeration *all = NULL;
|
||||
const char *loc = PREFERRED[i][0];
|
||||
pref = ucurr_getKeywordValuesForLocale("currency", loc, TRUE, &status);
|
||||
matchPref = FALSE;
|
||||
matchAll = FALSE;
|
||||
UBool matchPref = FALSE;
|
||||
UBool matchAll = FALSE;
|
||||
int32_t size = 0, j;
|
||||
const char *value = NULL, *allValue = NULL;
|
||||
int32_t valueLength = 0, allValueLength = 0;
|
||||
|
||||
size = uenum_count(pref, &status);
|
||||
|
||||
@ -939,7 +935,7 @@ static void TestGetKeywordValuesForLocale(void) {
|
||||
|
||||
if (U_SUCCESS(status) && size == uenum_count(ALL, &status)) {
|
||||
matchAll = TRUE;
|
||||
ALLList = ulist_getListFromEnum(ALL);
|
||||
UList *ALLList = ulist_getListFromEnum(ALL);
|
||||
for (j = 0; j < size; j++) {
|
||||
if ((value = uenum_next(all, &valueLength, &status)) != NULL && U_SUCCESS(status)) {
|
||||
if (!ulist_containsString(ALLList, value, uprv_strlen(value))) {
|
||||
|
152
icu4c/source/test/cintltst/spooftest.c
Normal file
152
icu4c/source/test/cintltst/spooftest.c
Normal file
@ -0,0 +1,152 @@
|
||||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2009, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
*
|
||||
* File spooftest.c
|
||||
*
|
||||
*********************************************************************************/
|
||||
/*C API TEST for the uspoof Unicode Identifier Spoofing and Security API */
|
||||
/**
|
||||
* This is an API test for ICU spoof detection in plain C. It doesn't test very many cases, and doesn't
|
||||
* try to test the full functionality. It just calls each function and verifies that it
|
||||
* works on a basic level.
|
||||
*
|
||||
* More complete testing of spoof detection functionality is done with the C++ tests.
|
||||
**/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "unicode/uspoof.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cintltst.h"
|
||||
|
||||
/* Log an error (file, line, error name) if status is a failure code; execution continues. */
#define TEST_ASSERT_SUCCESS(status) {if (U_FAILURE(status)) { \
|
||||
log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));}}
|
||||
|
||||
/* Like TEST_ASSERT_SUCCESS, but on failure also jumps to a label named "bailout",
 * which must therefore exist in the function that uses this macro. */
#define TEST_CHECK_SUCCESS(status) {if (U_FAILURE(status)) { \
|
||||
log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status)); \
|
||||
goto bailout;} \
|
||||
}
|
||||
|
||||
/* Log an error quoting the source text of expr if expr compares equal to FALSE. */
#define TEST_ASSERT_TRUE(expr) {if ((expr)==FALSE) { \
|
||||
log_err("Test Failure at file %s, line %d: \"%s\" is false.\n", __FILE__, __LINE__, #expr);}}
|
||||
|
||||
/* Log an error if a != b. Both values are printed with %d, so the operands need to be
 * int-compatible; pointer operands do not match the format string. */
#define TEST_ASSERT_EQ(a, b) { if ((a) != (b)) { \
|
||||
log_err("Test Failure at file %s, line %d: \"%s\" (%d) != \"%s\" (%d) \n", \
|
||||
__FILE__, __LINE__, #a, (a), #b, (b)); }}
|
||||
|
||||
/* Log an error if a == b. Both values are printed with %d, so the operands need to be
 * int-compatible; pointer operands do not match the format string. */
#define TEST_ASSERT_NE(a, b) { if ((a) == (b)) { \
|
||||
log_err("Test Failure at file %s, line %d: \"%s\" (%d) == \"%s\" (%d) \n", \
|
||||
__FILE__, __LINE__, #a, (a), #b, (b)); }}
|
||||
|
||||
|
||||
/*
|
||||
* TEST_SETUP and TEST_TEARDOWN
|
||||
* macros to handle the boilerplate around setting up test case.
|
||||
* Put arbitrary test code between SETUP and TEARDOWN.
|
||||
* "sc" is the ready-to-go SpoofChecker for use in the tests.
|
||||
*/
|
||||
#define TEST_SETUP { \
|
||||
UErrorCode status = U_ZERO_ERROR; \
|
||||
USpoofChecker *sc; \
|
||||
sc = uspoof_open(&status); \
|
||||
TEST_CHECK_SUCCESS(status); \
|
||||
{
|
||||
|
||||
#define TEST_TEARDOWN \
|
||||
} \
|
||||
TEST_ASSERT_SUCCESS(status); \
|
||||
bailout: \
|
||||
uspoof_close(sc); \
|
||||
}
|
||||
|
||||
|
||||
/*
 * Compare an expected char string with UChar output produced by the code under test,
 * and log an error naming the caller's file and line when they differ.
 *
 *   expected  the expected text, as a nul-terminated char string.
 *   actual    the UChar text to check; converted to chars with u_austrncpy().
 *   nulTerm   if TRUE the whole string must match (actual must end where expected ends);
 *             if FALSE only the first strlen(expected) chars are compared.
 *   file,line source position reported in the failure message.
 *
 * Normally invoked through the TEST_ASSERT_STRING macro.
 */
static void test_assert_string(const char *expected, const UChar *actual, UBool nulTerm, const char *file, int line) {
    char     buf_inside_macro[120];
    int32_t  len = (int32_t)strlen(expected);
    UBool    success;

    /* Up to len+1 converted chars plus a terminator are stored below; refuse anything
     * that would not fit rather than writing past the end of the local buffer. */
    if (len + 2 > (int32_t)sizeof(buf_inside_macro)) {
        log_err("Failure at file %s, line %d, expected string \"%s\" is too long for test_assert_string()\n",
            file, line, expected);
        return;
    }

    if (nulTerm) {
        /* Convert one char more than expected, so that an over-long actual string shows up
         * as a mismatch. Terminate directly after the last char that can have been written;
         * u_austrncpy() leaves the output unterminated when it fills all n chars. */
        u_austrncpy(buf_inside_macro, (actual), len+1);
        buf_inside_macro[len+1] = 0;
        success = (strcmp((expected), buf_inside_macro) == 0);
    } else {
        u_austrncpy(buf_inside_macro, (actual), len);
        buf_inside_macro[len] = 0;
        success = (strncmp((expected), buf_inside_macro, len) == 0);
    }
    if (success == FALSE) {
        log_err("Failure at file %s, line %d, expected \"%s\", got \"%s\"\n",
            file, line, (expected), buf_inside_macro);
    }
}
|
||||
|
||||
#define TEST_ASSERT_STRING(expected, actual, nulTerm) test_assert_string(expected, actual, nulTerm, __FILE__, __LINE__)
|
||||
|
||||
|
||||
|
||||
static void TestUSpoofCAPI(void);
|
||||
|
||||
void addUSpoofTest(TestNode** root);
|
||||
|
||||
/* Register the spoof detection C API test under the path "uspoof/TestUSpoofCAPI"
 * in the test tree rooted at root. */
void addUSpoofTest(TestNode** root)
|
||||
{
|
||||
addTest(root, &TestUSpoofCAPI, "uspoof/TestUSpoofCAPI");
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Spoof Detection C API Tests
|
||||
*/
|
||||
static void TestUSpoofCAPI(void) {
|
||||
|
||||
TEST_SETUP
|
||||
const char *dataSrcDir;
|
||||
char *fileName;
|
||||
char *confusables;
|
||||
int confusablesLength;
|
||||
char *confusablesWholeScript;
|
||||
int confusablesWholeScriptLength;
|
||||
FILE *f;
|
||||
UParseError pe;
|
||||
int32_t errType;
|
||||
USpoofChecker *rsc;
|
||||
|
||||
dataSrcDir = ctest_dataSrcDir();
|
||||
fileName = malloc(strlen(dataSrcDir) + 100);
|
||||
strcpy(fileName, dataSrcDir);
|
||||
strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusables.txt");
|
||||
f = fopen(fileName, "r");
|
||||
TEST_ASSERT_NE(f, NULL);
|
||||
confusables = malloc(3000000);
|
||||
confusablesLength = fread(confusables, 1, 3000000, f);
|
||||
fclose(f);
|
||||
|
||||
|
||||
strcpy(fileName, dataSrcDir);
|
||||
strcat(fileName, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "confusablesWholeScript.txt");
|
||||
f = fopen(fileName, "r");
|
||||
TEST_ASSERT_NE(f, NULL);
|
||||
confusablesWholeScript = malloc(1000000);
|
||||
confusablesWholeScriptLength = fread(confusablesWholeScript, 1, 1000000, f);
|
||||
fclose(f);
|
||||
|
||||
rsc = uspoof_openFromSource(confusables, confusablesLength,
|
||||
confusablesWholeScript, confusablesWholeScriptLength,
|
||||
&errType, &pe, &status);
|
||||
TEST_ASSERT_SUCCESS(status);
|
||||
|
||||
free(confusablesWholeScript);
|
||||
free(confusables);
|
||||
free(fileName);
|
||||
uspoof_close(rsc);
|
||||
/* printf("ParseError Line is %d\n", pe.line); */
|
||||
TEST_TEARDOWN;
|
||||
|
||||
}
|
||||
|
@ -1,5 +1,5 @@
|
||||
## Makefile.in for ICU tools
|
||||
## Copyright (c) 1999-2008, International Business Machines Corporation and
|
||||
## Copyright (c) 1999-2009, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
@ -15,7 +15,7 @@ subdir = tools
|
||||
|
||||
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk genctd \
|
||||
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
|
||||
gentest genprops gencase genbidi gennorm
|
||||
gentest genprops gencase genbidi gennorm gencfu
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local all-recursive install install-local \
|
||||
|
96
icu4c/source/tools/gencfu/Makefile.in
Normal file
96
icu4c/source/tools/gencfu/Makefile.in
Normal file
@ -0,0 +1,96 @@
|
||||
## Makefile.in for ICU - tools/gencfu
|
||||
## Copyright (c) 2009 International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
srcdir = @srcdir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
top_builddir = ../..
|
||||
|
||||
include $(top_builddir)/icudefs.mk
|
||||
|
||||
## Build directory information
|
||||
subdir = tools/gencfu
|
||||
|
||||
TARGET_STUB_NAME = gencfu
|
||||
|
||||
SECTION = 1
|
||||
|
||||
# MAN_FILES = $(TARGET_STUB_NAME).$(SECTION)
|
||||
|
||||
|
||||
## Extra files to remove for 'make clean'
|
||||
CLEANFILES = *~ $(DEPS) $(MAN_FILES)
|
||||
|
||||
## Target information
|
||||
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
|
||||
|
||||
ifneq ($(top_builddir),$(top_srcdir))
|
||||
CPPFLAGS += -I$(top_builddir)/common
|
||||
endif
|
||||
CPPFLAGS += -I$(top_srcdir)/common -I$(top_srcdir)/i18n -I$(srcdir)/../toolutil
|
||||
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
|
||||
|
||||
OBJECTS = gencfu.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local install install-local clean clean-local \
|
||||
distclean distclean-local dist dist-local check check-local install-man
|
||||
|
||||
## Clear suffix list
|
||||
.SUFFIXES :
|
||||
|
||||
## List of standard targets
|
||||
all: all-local
|
||||
install: install-local
|
||||
clean: clean-local
|
||||
distclean : distclean-local
|
||||
dist: dist-local
|
||||
check: all check-local
|
||||
|
||||
all-local: $(TARGET) $(MAN_FILES)
|
||||
|
||||
install-local: all-local install-man
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(bindir)
|
||||
$(INSTALL) $(TARGET) $(DESTDIR)$(bindir)
|
||||
|
||||
install-man: $(MAN_FILES)
|
||||
$(MKINSTALLDIRS) $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
$(INSTALL_DATA) $? $(DESTDIR)$(mandir)/man$(SECTION)
|
||||
|
||||
dist-local:
|
||||
|
||||
clean-local:
|
||||
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
|
||||
$(RMV) $(TARGET) $(OBJECTS)
|
||||
|
||||
distclean-local: clean-local
|
||||
$(RMV) Makefile
|
||||
|
||||
check-local: all-local
|
||||
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
$(TARGET) : $(OBJECTS)
|
||||
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
|
||||
$(POST_BUILD_STEP)
|
||||
|
||||
|
||||
%.$(SECTION): $(srcdir)/%.$(SECTION).in
|
||||
cd $(top_builddir) \
|
||||
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
|
||||
|
||||
|
||||
ifeq (,$(MAKECMDGOALS))
|
||||
-include $(DEPS)
|
||||
else
|
||||
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
|
||||
-include $(DEPS)
|
||||
endif
|
||||
endif
|
||||
|
326
icu4c/source/tools/gencfu/gencfu.cpp
Normal file
326
icu4c/source/tools/gencfu/gencfu.cpp
Normal file
@ -0,0 +1,326 @@
|
||||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
* File gencfu.cpp
|
||||
*/
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
//
|
||||
// Tool for generating Unicode Confusable data files (.cfu files).
|
||||
// .cfu files contain the compiled form of the confusable data
|
||||
// derived from the Unicode Consortium data described in
|
||||
// Unicode UAX 39.
|
||||
//
|
||||
// Usage: gencfu [options] -r confusables-file.txt -w whole-script-confusables.txt -o output-file.cfu
|
||||
//
|
||||
// options: -v verbose
|
||||
// -? or -h help
|
||||
//
|
||||
// The input rule files are plain text files containing confusable character
|
||||
// definitions in the input format defined by Unicode UAX39 for the files
|
||||
// confusables.txt and confusablesWholeScript.txt. This source (.txt) format
|
||||
// is also accepted directly by ICU spoof detectors. The
|
||||
// files must be encoded in utf-8 format, with or without a BOM.
|
||||
//
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uclean.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/putil.h"
|
||||
|
||||
#include "uoptions.h"
|
||||
#include "unewdata.h"
|
||||
#include "ucmndata.h"
|
||||
#include "uspoof_impl.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
static char *progName;
|
||||
// Command line option table, handed to u_parseArgs() in main().
// main() refers to entries by their position in this array (options[3].value, etc.),
// so the index noted next to each entry must stay in step with the code that uses it.
static UOption options[]={
|
||||
UOPTION_HELP_H, /* 0 */
|
||||
UOPTION_HELP_QUESTION_MARK, /* 1 */
|
||||
UOPTION_VERBOSE, /* 2 */
|
||||
{ "rules", NULL, NULL, NULL, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */
|
||||
{ "wsrules", NULL, NULL, NULL, 'w', UOPT_REQUIRES_ARG, 0}, /* 4 */
|
||||
{ "out", NULL, NULL, NULL, 'o', UOPT_REQUIRES_ARG, 0 }, /* 5 */
|
||||
UOPTION_ICUDATADIR, /* 6 */
|
||||
UOPTION_DESTDIR, /* 7 */
|
||||
UOPTION_COPYRIGHT, /* 8 */
|
||||
};
|
||||
|
||||
//
//  usageAndDie   Print the command line usage summary for gencfu, then terminate
//                the process.
//
//      retCode   the status value passed to exit().
//
//  Only options that are present in the options[] table are listed. The
//  "-V or --version" line was removed because no such option is registered there,
//  so the option parser rejects it as an unrecognized argument.
//
void usageAndDie(int retCode) {
    printf("Usage: %s [-v] [-options] -r confusablesRules.txt -w wholeScriptConfusables.txt -o output-file\n", progName);
    printf("\tRead in Unicode confusable character definitions and write out the binary data\n"
        "options:\n"
        "\t-h or -? or --help this usage text\n"
        "\t-c or --copyright include a copyright notice\n"
        "\t-v or --verbose turn on verbose output\n"
        "\t-i or --icudatadir directory for locating any needed intermediate data files,\n"
        "\t followed by path, defaults to %s\n"
        "\t-d or --destdir destination directory, followed by the path\n",
        u_getDataDirectory());
    exit (retCode);
}
|
||||
|
||||
|
||||
#if UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
/* dummy UDataInfo cf. udata.h */
|
||||
static UDataInfo dummyDataInfo = {
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
||||
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
||||
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
||||
};
|
||||
|
||||
#else
|
||||
|
||||
//
|
||||
// Set up the ICU data header, defined in ucmndata.h
|
||||
//
|
||||
DataHeader dh ={
|
||||
{sizeof(DataHeader), // Struct MappedData
|
||||
0xda,
|
||||
0x27},
|
||||
|
||||
{ // struct UDataInfo
|
||||
sizeof(UDataInfo), // size
|
||||
0, // reserved
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0, // reserved
|
||||
|
||||
{ 0x43, 0x66, 0x75, 0x20 }, // dataFormat="Cfu "
|
||||
{ 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values
|
||||
// from the builder. The values declared
|
||||
// here should never appear in any real data.
|
||||
{ 5, 1, 0, 0 } // dataVersion (Unicode version)
|
||||
}};
|
||||
|
||||
#endif
|
||||
|
||||
// Forward declaration for function for reading source files.
|
||||
static const char *readFile(const char *fileName, int32_t *len);
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//
|
||||
// main for gencfu
|
||||
//
|
||||
//----------------------------------------------------------------------------
|
||||
int main(int argc, char **argv) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
const char *confFileName;
|
||||
const char *confWSFileName;
|
||||
const char *outFileName;
|
||||
const char *outDir = NULL;
|
||||
const char *copyright = NULL;
|
||||
|
||||
//
|
||||
// Pick up and check the command line arguments,
|
||||
// using the standard ICU tool utils option handling.
|
||||
//
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
progName = argv[0];
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);
|
||||
if(argc<0) {
|
||||
// Unrecognized option
|
||||
fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]);
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
|
||||
if(options[0].doesOccur || options[1].doesOccur) {
|
||||
// -? or -h for help.
|
||||
usageAndDie(0);
|
||||
}
|
||||
|
||||
if (!(options[3].doesOccur && options[4].doesOccur && options[5].doesOccur)) {
|
||||
fprintf(stderr, "confusables file, whole script confusables file and output file must all be specified.\n");
|
||||
usageAndDie(U_ILLEGAL_ARGUMENT_ERROR);
|
||||
}
|
||||
confFileName = options[3].value;
|
||||
confWSFileName = options[4].value;
|
||||
outFileName = options[5].value;
|
||||
|
||||
if (options[6].doesOccur) {
|
||||
u_setDataDirectory(options[6].value);
|
||||
}
|
||||
|
||||
/* Initialize ICU */
|
||||
u_init(&status);
|
||||
if (U_FAILURE(status)) {
|
||||
fprintf(stderr, "%s: can not initialize ICU. status = %s\n",
|
||||
argv[0], u_errorName(status));
|
||||
exit(1);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
/* Combine the directory with the file name */
|
||||
if(options[7].doesOccur) {
|
||||
outDir = options[7].value;
|
||||
}
|
||||
if (options[8].doesOccur) {
|
||||
copyright = U_COPYRIGHT_STRING;
|
||||
}
|
||||
|
||||
#if UCONFIG_NO_SPOOF_DETECTION
|
||||
// TOOD: implement UCONFIG_NO_SPOOF_DETECTION in uconfig.h, or decide we don't want it and take this out.
|
||||
|
||||
UNewDataMemory *pData;
|
||||
char msg[1024];
|
||||
|
||||
/* write message with just the name */
|
||||
sprintf(msg, "gencfu writes dummy %s because of UCONFIG_NO_SPOOF_DETECTION, see uconfig.h", outFileName);
|
||||
fprintf(stderr, "%s\n", msg);
|
||||
|
||||
/* write the dummy data file */
|
||||
pData = udata_create(outDir, NULL, outFileName, &dummyDataInfo, NULL, &status);
|
||||
udata_writeBlock(pData, msg, strlen(msg));
|
||||
udata_finish(pData, &status);
|
||||
return (int)status;
|
||||
|
||||
#else
|
||||
|
||||
// Read in the confusables source file
|
||||
|
||||
int32_t confusablesLen = 0;
|
||||
const char *confusables = readFile(confFileName, &confusablesLen);
|
||||
if (confusables == NULL) {
|
||||
printf("gencfu: error reading file \"%s\"\n", confFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
int32_t wsConfusablesLen = 0;
|
||||
const char *wsConfsables = readFile(confWSFileName, &wsConfusablesLen);
|
||||
if (wsConfsables == NULL) {
|
||||
printf("gencfu: error reading file \"%s\"\n", confFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
//
|
||||
// Create the Spoof Detector from the source confusables files.
|
||||
// This will compile the data.
|
||||
//
|
||||
UParseError parseError;
|
||||
parseError.line = 0;
|
||||
parseError.offset = 0;
|
||||
int32_t errType;
|
||||
USpoofChecker *sc = uspoof_openFromSource(confusables, confusablesLen,
|
||||
wsConfsables, wsConfusablesLen,
|
||||
&errType, &parseError, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
const char *errFile =
|
||||
(errType == USPOOF_WHOLE_SCRIPT_CONFUSABLE)? confWSFileName : confFileName;
|
||||
fprintf(stderr, "gencfu: uspoof_openFromSource error \"%s\" at file %s, line %d, column %d\n",
|
||||
u_errorName(status), errFile, (int)parseError.line, (int)parseError.offset);
|
||||
exit(status);
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// Get the compiled rule data from the USpoofChecker.
|
||||
//
|
||||
uint32_t outDataSize;
|
||||
uint8_t *outData;
|
||||
outDataSize = uspoof_serialize(sc, NULL, 0, &status);
|
||||
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
||||
fprintf(stderr, "gencfu: uspoof_serialize() returned %s\n", u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
status = U_ZERO_ERROR;
|
||||
outData = new uint8_t[outDataSize];
|
||||
uspoof_serialize(sc, outData, outDataSize, &status);
|
||||
|
||||
// Copy the data format version numbers from the spoof data header into the UDataMemory header.
|
||||
|
||||
uprv_memcpy(dh.info.formatVersion,
|
||||
reinterpret_cast<SpoofDataHeader *>(outData)->fFormatVersion,
|
||||
sizeof(dh.info.formatVersion));
|
||||
|
||||
//
|
||||
// Create the output file
|
||||
//
|
||||
size_t bytesWritten;
|
||||
UNewDataMemory *pData;
|
||||
pData = udata_create(outDir, NULL, outFileName, &(dh.info), copyright, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "gencfu: Could not open output file \"%s\", \"%s\"\n",
|
||||
outFileName, u_errorName(status));
|
||||
exit(status);
|
||||
}
|
||||
|
||||
|
||||
// Write the data itself.
|
||||
udata_writeBlock(pData, outData, outDataSize);
|
||||
// finish up
|
||||
bytesWritten = udata_finish(pData, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
fprintf(stderr, "gencfu: Error %d writing the output file\n", status);
|
||||
exit(status);
|
||||
}
|
||||
|
||||
if (bytesWritten != outDataSize) {
|
||||
fprintf(stderr, "gencfu: Error writing to output file \"%s\"\n", outFileName);
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
uspoof_close(sc);
|
||||
delete outData;
|
||||
delete confusables;
|
||||
delete wsConfsables;
|
||||
u_cleanup();
|
||||
printf("gencfu: tool completed successfully.\n");
|
||||
return 0;
|
||||
|
||||
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// Read in a confusables source file
|
||||
//
|
||||
//
//  Read a whole source file into memory.
//
//    fileName   path of the file; it is opened in binary mode.
//    len        receives the number of bytes read, not counting the
//               terminating NUL.  Left unmodified when the read fails.
//
//    Returns a buffer allocated with new char[], holding the file contents
//    followed by a NUL byte, or NULL if the file could not be opened, sized
//    or completely read.  The file is closed on every path.
//
static const char *readFile(const char *fileName, int32_t *len) {
    // Largest accepted size: the length is reported as an int32_t and the
    // buffer is allocated with 10 bytes of padding.
    static const long kMaxFileSize = 0x7ffffff0L;

    FILE *file = fopen(fileName, "rb");
    if (file == NULL) {
        return NULL;
    }

    // Determine the file size; ftell() reports failure as -1.
    long fileSize = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        fileSize = ftell(file);
    }

    char *result = NULL;
    if (fileSize >= 0 && fileSize <= kMaxFileSize && fseek(file, 0, SEEK_SET) == 0) {
        result = new char[fileSize + 10];
        if (result != NULL) {
            size_t bytesRead = fread(result, 1, static_cast<size_t>(fileSize), file);
            if (bytesRead == static_cast<size_t>(fileSize)) {
                result[fileSize] = 0;
                *len = static_cast<int32_t>(fileSize);
            } else {
                // Short read: discard the partial buffer.
                delete[] result;
                result = NULL;
            }
        }
    }

    fclose(file);
    return result;
}
|
404
icu4c/source/tools/gencfu/gencfu.vcproj
Normal file
404
icu4c/source/tools/gencfu/gencfu.vcproj
Normal file
@ -0,0 +1,404 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="9.00"
|
||||
Name="gencfu"
|
||||
ProjectGUID="{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}"
|
||||
Keyword="Win32Proj"
|
||||
TargetFrameworkVersion="0"
|
||||
>
|
||||
<!-- Platforms targeted by the configurations below (Win32 and x64). -->
<Platforms>
	<Platform
		Name="Win32"
	/>
	<Platform
		Name="x64"
	/>
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory="Debug"
|
||||
IntermediateDirectory="Debug"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
MinimalRebuild="false"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="true"
|
||||
DisableLanguageExtensions="true"
|
||||
UsePrecompiledHeader="0"
|
||||
AssemblerListingLocation=".\x86\Debug/"
|
||||
ObjectFile=".\x86\Debug/"
|
||||
ProgramDataBaseFileName=".\x86\Debug/"
|
||||
BrowseInformation="1"
|
||||
WarningLevel="3"
|
||||
Detect64BitPortabilityProblems="true"
|
||||
DebugInformationFormat="4"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Debug\gencfu.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory="Release"
|
||||
IntermediateDirectory="Release"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;"
|
||||
StringPooling="true"
|
||||
MinimalRebuild="false"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
UsePrecompiledHeader="0"
|
||||
AssemblerListingLocation=".\x86\Release/"
|
||||
ObjectFile=".\x86\Release/"
|
||||
ProgramDataBaseFileName=".\x86\Release/"
|
||||
WarningLevel="3"
|
||||
Detect64BitPortabilityProblems="true"
|
||||
DebugInformationFormat="3"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Release\gencfu.exe"
|
||||
LinkIncremental="1"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|x64"
|
||||
OutputDirectory=".\x64\Release"
|
||||
IntermediateDirectory=".\x64\Release"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<!-- MIDL settings for Release|x64; the type library is named after this project. -->
<Tool
	Name="VCMIDLTool"
	TargetEnvironment="3"
	TypeLibraryName=".\x64\Release/gencfu.tlb"
/>
|
||||
<!-- Compiler settings for Release|x64. ..\..\i18n is on the include path, as in the Win32 configurations. -->
<Tool
	Name="VCCLCompilerTool"
	AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
	PreprocessorDefinitions="WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
	StringPooling="true"
	RuntimeLibrary="2"
	EnableFunctionLevelLinking="true"
	DisableLanguageExtensions="true"
	TreatWChar_tAsBuiltInType="true"
	PrecompiledHeaderFile=".\x64\Release/gencfu.pch"
	AssemblerListingLocation=".\x64\Release/"
	ObjectFile=".\x64\Release/"
	ProgramDataBaseFileName=".\x64\Release/"
	WarningLevel="3"
	SuppressStartupBanner="true"
	CompileAs="0"
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<!-- Linker settings for Release|x64; the executable and PDB are named after this project. -->
<Tool
	Name="VCLinkerTool"
	OutputFile=".\x64\Release/gencfu.exe"
	LinkIncremental="1"
	SuppressStartupBanner="true"
	ProgramDatabaseFile=".\x64\Release/gencfu.pdb"
	SubSystem="1"
	TargetMachine="17"
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Debug|x64"
|
||||
OutputDirectory=".\x64\Debug"
|
||||
IntermediateDirectory=".\x64\Debug"
|
||||
ConfigurationType="1"
|
||||
InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
|
||||
UseOfMFC="0"
|
||||
ATLMinimizesCRunTimeLibraryUsage="false"
|
||||
CharacterSet="2"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
TargetEnvironment="3"
|
||||
TypeLibraryName=".\x64\Debug/gencfu.tlb"
|
||||
/>
|
||||
<!-- Compiler settings for Debug|x64. ..\..\i18n is on the include path, as in the Win32 configurations. -->
<Tool
	Name="VCCLCompilerTool"
	Optimization="0"
	AdditionalIncludeDirectories="..\..\common;..\..\i18n;..\toolutil"
	PreprocessorDefinitions="WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
	BasicRuntimeChecks="3"
	RuntimeLibrary="3"
	BufferSecurityCheck="true"
	DisableLanguageExtensions="true"
	TreatWChar_tAsBuiltInType="true"
	PrecompiledHeaderFile=".\x64\Debug/gencfu.pch"
	AssemblerListingLocation=".\x64\Debug/"
	ObjectFile=".\x64\Debug/"
	ProgramDataBaseFileName=".\x64\Debug/"
	BrowseInformation="1"
	WarningLevel="3"
	SuppressStartupBanner="true"
	DebugInformationFormat="3"
	CompileAs="0"
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x64\Debug/gencfu.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\x64\Debug/gencfu.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="17"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
UseFAT32Workaround="true"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebDeploymentTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h;hpp;hxx;hm;inl;inc;xsd"
|
||||
UniqueIdentifier="{93995380-89BD-4b04-88EB-625FBE52EBFB}"
|
||||
>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx"
|
||||
UniqueIdentifier="{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}"
|
||||
>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx"
|
||||
UniqueIdentifier="{4FC737F1-C7A5-4376-A066-2A32D752A2FF}"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\gencfu.cpp"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
@ -51,6 +51,8 @@
|
||||
|
||||
/* swapping implementations in i18n */
|
||||
|
||||
#include "uspoof_impl.h"
|
||||
|
||||
/* definitions */
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
@ -556,7 +558,9 @@ static const struct {
|
||||
{ { 0x54, 0x72, 0x44, 0x63 }, triedict_swap }, /* dataFormat="TrDc " */
|
||||
#endif
|
||||
{ { 0x70, 0x6e, 0x61, 0x6d }, upname_swap }, /* dataFormat="pnam" */
|
||||
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames } /* dataFormat="unam" */
|
||||
{ { 0x75, 0x6e, 0x61, 0x6d }, uchar_swapNames }, /* dataFormat="unam" */
|
||||
|
||||
{ { 0x43, 0x66, 0x75, 0x20 }, uspoof_swap } /* dataFormat="Cfu " */
|
||||
};
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
@ -50,7 +50,7 @@
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
WholeProgramOptimization="true"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common,..\..\i18n"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE;U_TOOLUTIL_IMPLEMENTATION"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
@ -145,7 +145,7 @@
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common"
|
||||
AdditionalIncludeDirectories="..\..\..\include,..\..\common,..\..\i18n"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE;U_TOOLUTIL_IMPLEMENTATION"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
|
Loading…
Reference in New Issue
Block a user