ICU-1491 rewrite parseID to handle all ID form variants; move Mark's code into parseID; enable special inverses (NFC/NFC, etc.)
X-SVN-Rev: 6935
This commit is contained in:
parent
fdd3dc6fd0
commit
59fa6939a1
@ -120,6 +120,10 @@ void NormalizationTransliterator::registerIDs() {
|
||||
_create, integerToken(UNORM_NFD));
|
||||
Transliterator::_registerFactory(UnicodeString("Any-NFKD", ""),
|
||||
_create, integerToken(UNORM_NFKD));
|
||||
Transliterator::_registerSpecialInverses(UnicodeString("NFC", ""),
|
||||
UnicodeString("NFD", ""));
|
||||
Transliterator::_registerSpecialInverses(UnicodeString("NFKC", ""),
|
||||
UnicodeString("NFKD", ""));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -46,6 +46,8 @@ static const UChar VARIANT_SEP = 0x002F; // '/'
|
||||
static const UChar OPEN_PAREN = 40;
|
||||
static const UChar CLOSE_PAREN = 41;
|
||||
|
||||
static const UChar ANY[] = { 65, 110, 121, 0 }; // Any
|
||||
|
||||
/**
|
||||
* Prefix for resource bundle key for the display name for a
|
||||
* transliterator. The ID is appended to this to form the key.
|
||||
@ -77,7 +79,7 @@ static const char RB_DISPLAY_NAME_PATTERN[] = "TransliteratorNamePattern";
|
||||
static const char RB_RULE_BASED_IDS[] = "RuleBasedTransliteratorIDs";
|
||||
|
||||
/**
|
||||
* The mutex controlling access to registry object.
|
||||
* The mutex controlling access to registry object and specialInverses.
|
||||
*/
|
||||
static UMTX registryMutex = 0;
|
||||
|
||||
@ -89,6 +91,8 @@ static TransliteratorRegistry* registry = 0;
|
||||
// Empty string
|
||||
static const UChar EMPTY[] = {0}; //""
|
||||
|
||||
static Hashtable *specialInverses = 0;
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
@ -1104,7 +1108,6 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
|
||||
}
|
||||
|
||||
Transliterator* t = NULL;
|
||||
int32_t sep = 0; // index of the separator ('-') in id
|
||||
|
||||
// If id is empty, then we have either an empty specifier,
|
||||
// which is illegal, or a compound filter, which is legal
|
||||
@ -1124,39 +1127,67 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
|
||||
}
|
||||
|
||||
else {
|
||||
// Fix the id, if necessary, by reversing it (A-B => B-A). This
|
||||
// is only done if the id is NOT of the form Foo(Bar). Record the
|
||||
// position of the separator.
|
||||
//
|
||||
// For both A-B and Foo(Bar) ids, detect the special case of Null,
|
||||
// whose inverse is itself. Given an ID with no separator "Foo",
|
||||
// an abbreviation for "Any-Foo", consider the inverse to be
|
||||
// "Foo-Any".
|
||||
sep = id.indexOf(ID_SEP);
|
||||
if (sep < 0 && id.caseCompare(NullTransliterator::SHORT_ID,
|
||||
U_FOLD_CASE_DEFAULT) == 0) {
|
||||
// Handle "Null"
|
||||
sep = id.length();
|
||||
} else if (dir == UTRANS_REVERSE &&
|
||||
id.caseCompare(NullTransliterator::ID,
|
||||
U_FOLD_CASE_DEFAULT) == 0) {
|
||||
// Reverse of "Any-Null" => "Null"
|
||||
id.removeBetween(0, sep+1);
|
||||
sep = id.length();
|
||||
} else if (dir == UTRANS_REVERSE && revStart < 0) {
|
||||
if (sep >= 0) {
|
||||
id.extractBetween(0, sep, str);
|
||||
id.removeBetween(0, sep+1);
|
||||
} else {
|
||||
str = UnicodeString("Any", "");
|
||||
|
||||
// Normalize the ID. Take IDs of the form T, T/V, S-T, S-T/V, or S/V-T
|
||||
// and produce S-T/V. If the ID needs to be reversed, do so. This
|
||||
// produces T-S/V, with a default S of "Any". If the ID has a special
|
||||
// non-canonical inverse, look it up (e.g., NFC -> NFD, Null -> Null).
|
||||
if (id.length() > 0) { // We handle empty IDs below
|
||||
UnicodeString source(ANY);
|
||||
UnicodeString target;
|
||||
UnicodeString variant; // Variant INCLUDING "/"
|
||||
|
||||
int32_t sep = id.indexOf(ID_SEP);
|
||||
int32_t var = id.indexOf(VARIANT_SEP);
|
||||
if (var < 0) {
|
||||
var = id.length();
|
||||
}
|
||||
|
||||
if (sep < 0) {
|
||||
// Form: T/V or T (or /V)
|
||||
id.extractBetween(0, var, target);
|
||||
id.extractBetween(var, 0x7FFFFFFF, variant);
|
||||
} else if (sep < var) {
|
||||
// Form: S-T/V or S-T
|
||||
id.extractBetween(0, sep++, source);
|
||||
id.extractBetween(sep, var, target);
|
||||
id.extractBetween(var, 0x7FFFFFFF, variant);
|
||||
} else {
|
||||
// Form: S/V-T
|
||||
id.extractBetween(0, var, source);
|
||||
id.extractBetween(var, sep++, variant);
|
||||
id.extractBetween(sep, 0x7FFFFFFF, target);
|
||||
}
|
||||
id.truncate(0);
|
||||
// For forward IDs *or IDs that were part of a Foo(Bar) ID*,
|
||||
// normalize them to canonical form.
|
||||
if (dir == UTRANS_FORWARD || revStart >= 0) {
|
||||
id.append(source).append(ID_SEP).append(target);
|
||||
} else {
|
||||
// Handle special, non-canonical inverse mappings,
|
||||
// e.g. inverse(Any-NFC) = Any-NFD and vice versa.
|
||||
if (source == ANY) {
|
||||
UnicodeString* inverseTarget = (UnicodeString*) specialInverses->get(target);
|
||||
if (inverseTarget != NULL) {
|
||||
// If the original ID contained "Any-" then make the
|
||||
// special inverse "Any-Foo"; otherwise make it "Foo".
|
||||
// So "Any-NFC" => "Any-NFD" but "NFC" => "NFD".
|
||||
if (sep < 0) {
|
||||
id.append(*inverseTarget);
|
||||
} else {
|
||||
source = *inverseTarget;
|
||||
target = ANY;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (id.length() == 0) {
|
||||
id.append(target).append(ID_SEP).append(source);
|
||||
}
|
||||
}
|
||||
// If the variant is empty ("/") then don't append it
|
||||
if (variant.length() > 1) {
|
||||
id.append(variant);
|
||||
}
|
||||
sep = id.length();
|
||||
id.append(ID_SEP).append(str);
|
||||
} else if (sep < 0 && id.length() > 0) {
|
||||
// Don't do anything for empty IDs -- we handle these specially below
|
||||
str = UnicodeString("Any-", "");
|
||||
sep = str.length() - 1;
|
||||
id.insert(0, str);
|
||||
}
|
||||
|
||||
// If we have a reverse part of the ID, e.g., Foo(Bar), then we
|
||||
@ -1230,7 +1261,7 @@ Transliterator* Transliterator::parseID(const UnicodeString& ID,
|
||||
ID.extractBetween(revStart+1, revLimit, id);
|
||||
}
|
||||
} else if (revStart < 0) {
|
||||
id.insert(sep, ID, setStart, setLimit-setStart);
|
||||
id.insert(0, ID, setStart, setLimit-setStart);
|
||||
} else {
|
||||
// Change Foo(Bar) to Bar(Foo)
|
||||
ID.extractBetween(pos, revStart, str);
|
||||
@ -1375,6 +1406,27 @@ void Transliterator::_registerFactory(const UnicodeString& id,
|
||||
registry->put(id, factory, context, TRUE);
|
||||
}
|
||||
|
||||
// For public consumption
|
||||
void Transliterator::registerSpecialInverses(const UnicodeString& target1,
|
||||
const UnicodeString& target2) {
|
||||
if (registry == 0) {
|
||||
initializeRegistry();
|
||||
}
|
||||
Mutex lock(®istryMutex);
|
||||
_registerSpecialInverses(target1, target2);
|
||||
}
|
||||
|
||||
// To be called only by Transliterator subclasses that are called
|
||||
// to register themselves by initializeRegistry().
|
||||
void Transliterator::_registerSpecialInverses(const UnicodeString& target1,
|
||||
const UnicodeString& target2) {
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
specialInverses->put(target1, new UnicodeString(target2), ec);
|
||||
if (0 != target1.caseCompare(target2, U_FOLD_CASE_DEFAULT)) {
|
||||
specialInverses->put(target2, new UnicodeString(target1), ec);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a instance <tt>obj</tt> of a subclass of
|
||||
* <code>Transliterator</code> with the system. This object must
|
||||
@ -1596,6 +1648,11 @@ void Transliterator::initializeRegistry(void) {
|
||||
ures_close(transIDs);
|
||||
ures_close(bundle);
|
||||
|
||||
specialInverses = new Hashtable(TRUE);
|
||||
specialInverses->setValueDeleter(uhash_deleteUnicodeString);
|
||||
_registerSpecialInverses(NullTransliterator::SHORT_ID,
|
||||
NullTransliterator::SHORT_ID);
|
||||
|
||||
// Manually add prototypes that the system knows about to the
|
||||
// cache. This is how new non-rule-based transliterators are
|
||||
// added to the system.
|
||||
|
@ -816,6 +816,35 @@ public:
|
||||
*/
|
||||
static void registerInstance(Transliterator* adoptedObj);
|
||||
|
||||
/**
|
||||
* Register two targets as being inverses of one another. For
|
||||
* example, calling registerSpecialInverses("NFC", "NFD") causes
|
||||
* Transliterator to form the following inverse relationships:
|
||||
*
|
||||
* <pre>NFC => NFD
|
||||
* Any-NFC => Any-NFD
|
||||
* NFD => NFC
|
||||
* Any-NFD => Any-NFC</pre>
|
||||
*
|
||||
* (Without the special inverse registration, the inverse of NFC
|
||||
* would be NFC-Any.) Note that NFD is shorthand for Any-NFD, but
|
||||
* that the presence or absence of "Any-" is preserved.
|
||||
*
|
||||
* <p>The relationship is symmetrical; registering (a, b) is
|
||||
* equivalent to registering (b, a).
|
||||
*
|
||||
* <p>The relevant IDs must still be registered separately as
|
||||
* factories or classes.
|
||||
*
|
||||
* <p>Only the targets are specified. Special inverses always
|
||||
* have the form Any-Target1 <=> Any-Target2. The target should
|
||||
* have canonical casing (the casing desired to be produced when
|
||||
* an inverse is formed) and should contain no whitespace or other
|
||||
* extraneous characters.
|
||||
*/
|
||||
static void registerSpecialInverses(const UnicodeString& target1,
|
||||
const UnicodeString& target2);
|
||||
|
||||
private:
|
||||
|
||||
friend class NormalizationTransliterator;
|
||||
@ -824,6 +853,9 @@ private:
|
||||
Factory factory,
|
||||
Token context);
|
||||
|
||||
static void _registerSpecialInverses(const UnicodeString& target1,
|
||||
const UnicodeString& target2);
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
|
@ -138,6 +138,7 @@ TransliteratorTest::runIndexedTest(int32_t index, UBool exec,
|
||||
TESTCASE(56,TestOutputSet);
|
||||
TESTCASE(57,TestVariableRange);
|
||||
TESTCASE(58,TestInvalidPostContext);
|
||||
TESTCASE(59,TestIDForms);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
@ -1040,18 +1041,18 @@ void TransliteratorTest::TestFilterIDs(void) {
|
||||
// Array of 3n strings:
|
||||
// <id>, <inverse id>, <input>, <expected output>
|
||||
const char* DATA[] = {
|
||||
"Any[aeiou]-Hex",
|
||||
"Hex[aeiou]-Any",
|
||||
"quizzical",
|
||||
"q\\u0075\\u0069zz\\u0069c\\u0061l",
|
||||
"Any[aeiou]-Hex", // ID
|
||||
"[aeiou]Hex-Any", // expected inverse ID
|
||||
"quizzical", // src
|
||||
"q\\u0075\\u0069zz\\u0069c\\u0061l", // expected ID.translit(src)
|
||||
|
||||
"Any[aeiou]-Hex;Hex[^5]-Any",
|
||||
"Any[^5]-Hex;Hex[aeiou]-Any",
|
||||
"[^5]Any-Hex;[aeiou]Hex-Any",
|
||||
"quizzical",
|
||||
"q\\u0075izzical",
|
||||
|
||||
"Null[abc]",
|
||||
"Null[abc]",
|
||||
"[abc]Null",
|
||||
"xyz",
|
||||
"xyz",
|
||||
};
|
||||
@ -2694,6 +2695,51 @@ void TransliteratorTest::TestInvalidPostContext() {
|
||||
errln("FAIL: No syntax error");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test ID form variants
|
||||
*/
|
||||
void TransliteratorTest::TestIDForms() {
|
||||
char* DATA[] = {
|
||||
"NFC", "NFD",
|
||||
"nfd", "NFC", // make sure case is ignored
|
||||
"Any-NFKD", "Any-NFKC",
|
||||
"Null", "Null",
|
||||
"Latin-Greek/UNGEGN", "Greek-Latin/UNGEGN",
|
||||
"Greek/UNGEGN-Latin", "Latin-Greek/UNGEGN",
|
||||
"Bengali-Devanagari/", "Devanagari-Bengali",
|
||||
};
|
||||
const int32_t DATA_length = sizeof(DATA)/sizeof(DATA[0]);
|
||||
|
||||
for (int32_t i=0; i<DATA_length; i+=2) {
|
||||
UParseError pe;
|
||||
UErrorCode ec = U_ZERO_ERROR;
|
||||
Transliterator *t =
|
||||
Transliterator::createInstance(DATA[i], UTRANS_FORWARD, pe, ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
errln((UnicodeString)"FAIL: Couldn't create " + DATA[i]);
|
||||
delete t;
|
||||
continue;
|
||||
}
|
||||
Transliterator *u = t->createInverse(ec);
|
||||
if (U_FAILURE(ec)) {
|
||||
errln((UnicodeString)"FAIL: Couldn't create inverse of " + DATA[i]);
|
||||
delete t;
|
||||
delete u;
|
||||
continue;
|
||||
}
|
||||
if (t->getID() == DATA[i] &&
|
||||
u->getID() == DATA[i+1]) {
|
||||
logln((UnicodeString)"Ok: " + DATA[i] + ".getInverse() => " + DATA[i+1]);
|
||||
} else {
|
||||
errln((UnicodeString)"FAIL: getInstance(" + DATA[i] + ") => " +
|
||||
t->getID() + " x getInverse() => " + u->getID() +
|
||||
", expected " + DATA[i+1]);
|
||||
}
|
||||
delete t;
|
||||
delete u;
|
||||
}
|
||||
}
|
||||
|
||||
//======================================================================
|
||||
// icu4c ONLY
|
||||
// These tests are not mirrored (yet) in icu4j at
|
||||
|
@ -277,6 +277,11 @@ class TransliteratorTest : public IntlTest {
|
||||
*/
|
||||
void TestInvalidPostContext();
|
||||
|
||||
/**
|
||||
* Test ID form variants
|
||||
*/
|
||||
void TestIDForms();
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
Loading…
Reference in New Issue
Block a user