712b165172
X-SVN-Rev: 26637
436 lines
17 KiB
C++
436 lines
17 KiB
C++
/*
|
|
******************************************************************************
|
|
*
|
|
* Copyright (C) 2008-2009, International Business Machines
|
|
* Corporation and others. All Rights Reserved.
|
|
*
|
|
******************************************************************************
|
|
* file name: uspoof_wsconf.cpp
|
|
* encoding: US-ASCII
|
|
* tab size: 8 (not used)
|
|
* indentation:4
|
|
*
|
|
* created on: 2009Jan05 (refactoring earlier files)
|
|
* created by: Andy Heninger
|
|
*
|
|
* Internal functions for compiling Whole Script confusable source data
|
|
* into its binary (runtime) form. The binary data format is described
|
|
* in uspoof_impl.h
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
#include "unicode/uspoof.h"
|
|
|
|
#if !UCONFIG_NO_NORMALIZATION
|
|
|
|
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
|
|
|
|
#include "unicode/unorm.h"
|
|
#include "unicode/uregex.h"
|
|
#include "unicode/ustring.h"
|
|
#include "cmemory.h"
|
|
#include "uspoof_impl.h"
|
|
#include "uhash.h"
|
|
#include "uvector.h"
|
|
#include "uassert.h"
|
|
#include "uspoof_wsconf.h"
|
|
|
|
U_NAMESPACE_USE
|
|
|
|
|
|
// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt
// Example Lines:
//   006F          ; Latn; Deva; A # (o)  LATIN SMALL LETTER O
//   0048..0049    ; Latn; Grek; A #  [2] (H..I)  LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I
//    |               |     |    |
//    |               |     |    |---- Which table, Any Case or Lower Case (A or L)
//    |               |     |----------Target script.   We need this.
//    |               |----------------Src script.  Should match the script of the source
//    |                                code points.  Beyond checking that, we don't keep it.
//    |--------------------------------Source code points or range.
//
// The expression will match _all_ lines, including erroneous lines.
// The result of the parse is returned via the contents of the (match) groups:
//     Group 1       set for a blank or comment-only line
//     Groups 2, 3   first and (optional) last code point of the range
//     Group 4       source script name
//     Group 5       target script name
//     Group 6 or 7  table selector, A or L respectively
//     Group 8       set for a line that fits none of the above (a syntax error)
//
// The ".." range separator is escaped so that only two literal periods are
// accepted between the code points of a range; any other separator makes the
// line fall through to the error alternative (Group 8).
static const char *parseExp =

        "(?m)"                                         // Multi-line mode
        "^([ \\t]*(?:#.*?)?)$"                         // A blank or comment line.  Matches Group 1.
        "|^(?:"                                        //   OR
        "\\s*([0-9A-F]{4,})(?:\\.\\.([0-9A-F]{4,}))?\\s*;" // Code point range.  Groups 2 and 3.
        "\\s*([A-Za-z]+)\\s*;"                         // The source script.  Group 4.
        "\\s*([A-Za-z]+)\\s*;"                         // The target script.  Group 5.
        "\\s*(?:(A)|(L))"                              // The table A or L.   Group 6 or 7
        "[ \\t]*(?:#.*?)?"                             // Trailing comment
        ")$|"                                          //   OR
        "^(.*?)$";                                     // An error line.      Group 8.
                                                       //  Any line not matching the preceding
                                                       //  parts of the expression will match
                                                       //  this, and thus be flagged as an error
|
|
|
|
|
|
// Extract a regular expression match group into a char * string.
|
|
// The group must contain only invariant characters.
|
|
// Used for script names
|
|
//
|
|
static void extractGroup(
|
|
URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) {
|
|
|
|
UChar ubuf[50];
|
|
ubuf[0] = 0;
|
|
destBuf[0] = 0;
|
|
int32_t len = uregex_group(e, group, ubuf, 50, &status);
|
|
if (U_FAILURE(status) || len == -1 || len >= destCapacity) {
|
|
return;
|
|
}
|
|
UnicodeString s(FALSE, ubuf, len); // Aliasing constructor
|
|
s.extract(0, len, destBuf, destCapacity, US_INV);
|
|
}
|
|
|
|
|
|
|
|
// Build the Whole Script Confusable data
|
|
//
|
|
// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class,
|
|
// because everything is local to this one build function anyhow,
|
|
// OR
|
|
// break this function into more reasonably sized pieces, with
|
|
// state in WSConfusableDataBuilder.
|
|
//
|
|
void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS,
|
|
int32_t confusablesWSLen, UParseError *pe, UErrorCode &status)
|
|
{
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
URegularExpression *parseRegexp = NULL;
|
|
int32_t inputLen = 0;
|
|
UChar *input = NULL;
|
|
int32_t lineNum = 0;
|
|
|
|
UVector *scriptSets = NULL;
|
|
uint32_t rtScriptSetsCount = 2;
|
|
|
|
UTrie2 *anyCaseTrie = NULL;
|
|
UTrie2 *lowerCaseTrie = NULL;
|
|
|
|
anyCaseTrie = utrie2_open(0, 0, &status);
|
|
lowerCaseTrie = utrie2_open(0, 0, &status);
|
|
|
|
|
|
// The scriptSets vector provides a mapping from TRIE values to the set of scripts.
|
|
//
|
|
// Reserved TRIE values:
|
|
// 0: Code point has no whole script confusables.
|
|
// 1: Code point is of script Common or Inherited.
|
|
// These code points do not participate in whole script confusable detection.
|
|
// (This is logically equivalent to saying that they contain confusables in
|
|
// all scripts)
|
|
//
|
|
// Because Trie values are indexes into the ScriptSets vector, pre-fill
|
|
// vector positions 0 and 1 to avoid conflicts with the reserved values.
|
|
|
|
scriptSets = new UVector(status);
|
|
if (scriptSets == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
goto cleanup;
|
|
}
|
|
scriptSets->addElement((void *)NULL, status);
|
|
scriptSets->addElement((void *)NULL, status);
|
|
|
|
// Convert the user input data from UTF-8 to UChar (UTF-16)
|
|
u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status);
|
|
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
|
goto cleanup;
|
|
}
|
|
status = U_ZERO_ERROR;
|
|
input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar)));
|
|
if (input == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
goto cleanup;
|
|
}
|
|
u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status);
|
|
|
|
|
|
|
|
parseRegexp = uregex_openC(parseExp, 0, NULL, &status);
|
|
|
|
// Zap any Byte Order Mark at the start of input. Changing it to a space is benign
|
|
// given the syntax of the input.
|
|
if (*input == 0xfeff) {
|
|
*input = 0x20;
|
|
}
|
|
|
|
// Parse the input, one line per iteration of this loop.
|
|
uregex_setText(parseRegexp, input, inputLen, &status);
|
|
while (uregex_findNext(parseRegexp, &status)) {
|
|
lineNum++;
|
|
UChar line[200];
|
|
uregex_group(parseRegexp, 0, line, 200, &status);
|
|
if (uregex_start(parseRegexp, 1, &status) >= 0) {
|
|
// this was a blank or comment line.
|
|
continue;
|
|
}
|
|
if (uregex_start(parseRegexp, 8, &status) >= 0) {
|
|
// input file syntax error.
|
|
status = U_PARSE_ERROR;
|
|
goto cleanup;
|
|
}
|
|
if (U_FAILURE(status)) {
|
|
goto cleanup;
|
|
}
|
|
|
|
// Pick up the start and optional range end code points from the parsed line.
|
|
UChar32 startCodePoint = SpoofImpl::ScanHex(
|
|
input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status);
|
|
UChar32 endCodePoint = startCodePoint;
|
|
if (uregex_start(parseRegexp, 3, &status) >=0) {
|
|
endCodePoint = SpoofImpl::ScanHex(
|
|
input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status);
|
|
}
|
|
|
|
// Extract the two script names from the source line. We need these in an 8 bit
|
|
// default encoding (will be EBCDIC on IBM mainframes) in order to pass them on
|
|
// to the ICU u_getPropertyValueEnum() function. Ugh.
|
|
char srcScriptName[20];
|
|
char targScriptName[20];
|
|
extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status);
|
|
extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status);
|
|
UScriptCode srcScript =
|
|
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName));
|
|
UScriptCode targScript =
|
|
static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName));
|
|
if (U_FAILURE(status)) {
|
|
goto cleanup;
|
|
}
|
|
if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) {
|
|
status = U_INVALID_FORMAT_ERROR;
|
|
goto cleanup;
|
|
}
|
|
|
|
// select the table - (A) any case or (L) lower case only
|
|
UTrie2 *table = anyCaseTrie;
|
|
if (uregex_start(parseRegexp, 7, &status) >= 0) {
|
|
table = lowerCaseTrie;
|
|
}
|
|
|
|
// Build the set of scripts containing confusable characters for
|
|
// the code point(s) specified in this input line.
|
|
// Sanity check that the script of the source code point is the same
|
|
// as the source script indicated in the input file. Failure of this check is
|
|
// an error in the input file.
|
|
// Include the source script in the set (needed for Mixed Script Confusable detection).
|
|
//
|
|
UChar32 cp;
|
|
for (cp=startCodePoint; cp<=endCodePoint; cp++) {
|
|
int32_t setIndex = utrie2_get32(table, cp);
|
|
BuilderScriptSet *bsset = NULL;
|
|
if (setIndex > 0) {
|
|
U_ASSERT(setIndex < scriptSets->size());
|
|
bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex));
|
|
} else {
|
|
bsset = new BuilderScriptSet();
|
|
if (bsset == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
goto cleanup;
|
|
}
|
|
bsset->codePoint = cp;
|
|
bsset->trie = table;
|
|
bsset->sset = new ScriptSet();
|
|
setIndex = scriptSets->size();
|
|
bsset->index = setIndex;
|
|
bsset->rindex = 0;
|
|
if (bsset->sset == NULL) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
goto cleanup;
|
|
}
|
|
scriptSets->addElement(bsset, status);
|
|
utrie2_set32(table, cp, setIndex, &status);
|
|
}
|
|
bsset->sset->Union(targScript);
|
|
bsset->sset->Union(srcScript);
|
|
|
|
if (U_FAILURE(status)) {
|
|
goto cleanup;
|
|
}
|
|
UScriptCode cpScript = uscript_getScript(cp, &status);
|
|
if (cpScript != srcScript) {
|
|
status = U_INVALID_FORMAT_ERROR;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Eliminate duplicate script sets. At this point we have a separate
|
|
// script set for every code point that had data in the input file.
|
|
//
|
|
// We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them
|
|
//
|
|
// printf("Number of scriptSets: %d\n", scriptSets->size());
|
|
{
|
|
int32_t duplicateCount = 0;
|
|
rtScriptSetsCount = 2;
|
|
for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) {
|
|
BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri));
|
|
if (outerSet->index != static_cast<uint32_t>(outeri)) {
|
|
// This set was already identified as a duplicate.
|
|
// It will not be allocated a position in the runtime array of ScriptSets.
|
|
continue;
|
|
}
|
|
outerSet->rindex = rtScriptSetsCount++;
|
|
for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) {
|
|
BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri));
|
|
if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) {
|
|
delete innerSet->sset;
|
|
innerSet->scriptSetOwned = FALSE;
|
|
innerSet->sset = outerSet->sset;
|
|
innerSet->index = outeri;
|
|
innerSet->rindex = outerSet->rindex;
|
|
duplicateCount++;
|
|
}
|
|
// But this doesn't get all. We need to fix the TRIE.
|
|
}
|
|
}
|
|
// printf("Number of distinct script sets: %d\n", rtScriptSetsCount);
|
|
}
|
|
|
|
|
|
|
|
// Update the Trie values to be reflect the run time script indexes (after duplicate merging).
|
|
// (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets
|
|
// are unused, which is why the loop index starts at 2.)
|
|
{
|
|
for (int32_t i=2; i<scriptSets->size(); i++) {
|
|
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
|
if (bSet->rindex != (uint32_t)i) {
|
|
utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status);
|
|
}
|
|
}
|
|
}
|
|
|
|
// For code points with script==Common or script==Inherited,
|
|
// Set the reserved value of 1 into both Tries. These characters do not participate
|
|
// in Whole Script Confusable detection; this reserved value is the means
|
|
// by which they are detected.
|
|
{
|
|
UnicodeSet ignoreSet;
|
|
ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status);
|
|
UnicodeSet inheritedSet;
|
|
inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status);
|
|
ignoreSet.addAll(inheritedSet);
|
|
for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) {
|
|
UChar32 rangeStart = ignoreSet.getRangeStart(rn);
|
|
UChar32 rangeEnd = ignoreSet.getRangeEnd(rn);
|
|
utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
|
utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status);
|
|
}
|
|
}
|
|
|
|
// Serialize the data to the Spoof Detector
|
|
{
|
|
utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
|
int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status);
|
|
// printf("Any case Trie size: %d\n", size);
|
|
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
|
goto cleanup;
|
|
}
|
|
status = U_ZERO_ERROR;
|
|
spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit;
|
|
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size;
|
|
spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie;
|
|
void *where = spImpl->fSpoofData->reserveSpace(size, status);
|
|
utrie2_serialize(anyCaseTrie, where, size, &status);
|
|
|
|
utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status);
|
|
size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status);
|
|
// printf("Lower case Trie size: %d\n", size);
|
|
if (status != U_BUFFER_OVERFLOW_ERROR) {
|
|
goto cleanup;
|
|
}
|
|
status = U_ZERO_ERROR;
|
|
spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit;
|
|
spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size;
|
|
spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie;
|
|
where = spImpl->fSpoofData->reserveSpace(size, status);
|
|
utrie2_serialize(lowerCaseTrie, where, size, &status);
|
|
|
|
spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit;
|
|
spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount;
|
|
ScriptSet *rtScriptSets = static_cast<ScriptSet *>
|
|
(spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status));
|
|
uint32_t rindex = 2;
|
|
for (int32_t i=2; i<scriptSets->size(); i++) {
|
|
BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
|
if (bSet->rindex < rindex) {
|
|
// We have already copied this script set to the serialized data.
|
|
continue;
|
|
}
|
|
U_ASSERT(rindex == bSet->rindex);
|
|
rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits.
|
|
rindex++;
|
|
}
|
|
}
|
|
|
|
// Open new utrie2s from the serialized data. We don't want to keep the ones
|
|
// we just built because we would then have two copies of the data, one internal to
|
|
// the utries that we have already constructed, and one in the serialized data area.
|
|
// An alternative would be to not pre-serialize the Trie data, but that makes the
|
|
// spoof detector data different, depending on how the detector was constructed.
|
|
// It's simpler to keep the data always the same.
|
|
|
|
spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized(
|
|
UTRIE2_16_VALUE_BITS,
|
|
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie,
|
|
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
|
NULL,
|
|
&status);
|
|
|
|
spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized(
|
|
UTRIE2_16_VALUE_BITS,
|
|
(const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie,
|
|
spImpl->fSpoofData->fRawData->fAnyCaseTrieLength,
|
|
NULL,
|
|
&status);
|
|
|
|
|
|
|
|
cleanup:
|
|
if (U_FAILURE(status)) {
|
|
pe->line = lineNum;
|
|
}
|
|
uregex_close(parseRegexp);
|
|
uprv_free(input);
|
|
|
|
int32_t i;
|
|
for (i=0; i<scriptSets->size(); i++) {
|
|
BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i));
|
|
delete bsset;
|
|
}
|
|
delete scriptSets;
|
|
utrie2_close(anyCaseTrie);
|
|
utrie2_close(lowerCaseTrie);
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// Construct an empty BuilderScriptSet: no code point, no trie, no ScriptSet.
// A new object is marked as owning its ScriptSet, so the destructor will
// delete whatever sset is later pointed at unless scriptSetOwned is cleared.
BuilderScriptSet::BuilderScriptSet() {
    // Nothing is referenced yet.
    trie           = NULL;
    sset           = NULL;
    scriptSetOwned = TRUE;

    // -1 marks "no code point assigned".
    codePoint      = -1;

    // Build-time and run-time indexes both start at zero.
    index          = 0;
    rindex         = 0;
}
|
|
|
|
// Destructor. The ScriptSet is deleted only when this object owns it;
// otherwise sset is left alone.
BuilderScriptSet::~BuilderScriptSet() {
    if (!scriptSetOwned) {
        return;
    }
    delete sset;
}
|
|
|
|
#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS (closes the inner conditional)
|
|
#endif // !UCONFIG_NO_NORMALIZATION (closes the outer conditional)
|
|
|