/*
*****************************************************************
* Copyright (c) 2002, International Business Machines Corporation
* and others.  All Rights Reserved.
*****************************************************************
* Date        Name        Description
* 06/06/2002  aliu        Creation.
*****************************************************************
*/
#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uobject.h"
#include "unicode/nultrans.h"
#include "unicode/uscript.h"
#include "anytrans.h"
#include "uvector.h"
#include "tridpars.h"
#include "hash.h"

//------------------------------------------------------------
// Constants

static const UChar TARGET_SEP = 45; // '-'
static const UChar VARIANT_SEP = 47; // '/'
static const UChar ANY[] = {65,110,121,0}; // "Any"
static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"

//------------------------------------------------------------

U_CDECL_BEGIN
/**
 * Deleter function for Transliterator*.  Installed as the value
 * deleter of the per-instance cache hashtable.
 */
static void U_CALLCONV
_deleteTransliterator(void *obj) {
    delete (Transliterator*) obj;
}
U_CDECL_END

//------------------------------------------------------------

U_NAMESPACE_BEGIN

//------------------------------------------------------------
// ScriptRunIterator

/**
 * Returns a series of ranges corresponding to scripts. They will be
 * of the form:
 *
 * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
 * |            |          - first run (start, limit)
 *          |           |  - second run (start, limit)
 *
 * That is, the runs will overlap. The reason for this is so that a
 * transliterator can consider common characters both before and after
 * the scripts.
 */
class ScriptRunIterator : public UObject {
private:
    const Replaceable& text;
    int32_t textStart;
    int32_t textLimit;

public:
    /**
     * The code of the current run, valid after next() returns.  May
     * be USCRIPT_INVALID_CODE if and only if the entire text is
     * COMMON/INHERITED.
     */
    UScriptCode scriptCode;

    /**
     * The start of the run, inclusive, valid after next() returns.
     */
    int32_t start;

    /**
     * The end of the run, exclusive, valid after next() returns.
     */
    int32_t limit;

    /**
     * Constructs a run iterator over the given text from start
     * (inclusive) to limit (exclusive).
     */
    ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);

    /**
     * Returns TRUE if there are any more runs.  TRUE is always
     * returned at least once.  Upon return, the caller should
     * examine scriptCode, start, and limit.
     */
    UBool next();

    /**
     * Adjusts internal indices for a change in the limit index of the
     * given delta.  A positive delta means the limit has increased.
     */
    void adjustLimit(int32_t delta);

    /**
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
     *
     * @draft ICU 2.2
     */
    virtual inline UClassID getDynamicClassID() const { return getStaticClassID(); }

    /**
     * ICU "poor man's RTTI", returns a UClassID for this class.
     *
     * @draft ICU 2.2
     */
    static inline UClassID getStaticClassID() { return (UClassID)&fgClassID; }

private:
    /**
     * The address of this static class variable serves as this class's ID
     * for ICU "poor man's RTTI".
     */
    static const char fgClassID;
};

const char ScriptRunIterator::fgClassID=0;

/**
 * Records the text and its bounds.  The first call to next() begins
 * at myStart because limit is primed to that index.
 */
ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
                                     int32_t myStart, int32_t myLimit) :
    text(theText)
{
    textStart = myStart;
    textLimit = myLimit;
    limit = myStart;
}

/**
 * Advances to the next run.  On a TRUE return, start/limit delimit the
 * run and scriptCode holds the first non-COMMON, non-INHERITED script
 * found in it (or USCRIPT_INVALID_CODE if there was none).  Returns
 * FALSE once the previous run ended at the text limit.
 */
UBool ScriptRunIterator::next() {
    UChar32 ch;
    UScriptCode s;
    UErrorCode ec = U_ZERO_ERROR;

    scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    start = limit;

    // Are we done?
    if (start == textLimit) {
        return FALSE;
    }

    // Move start back to include adjacent COMMON or INHERITED
    // characters
    while (start > textStart) {
        ch = text.char32At(start - 1); // look back
        s = uscript_getScript(ch, &ec);
        if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
            --start;
        } else {
            break;
        }
    }

    // Move limit ahead to include COMMON, INHERITED, and characters
    // of the current script.
    while (limit < textLimit) {
        ch = text.char32At(limit); // look ahead
        s = uscript_getScript(ch, &ec);
        if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
            if (scriptCode == USCRIPT_INVALID_CODE) {
                scriptCode = s;
            } else if (s != scriptCode) {
                break;
            }
        }
        ++limit;
    }

    // Return TRUE even if the entire text is COMMON / INHERITED, in
    // which case scriptCode will be USCRIPT_INVALID_CODE.
    return TRUE;
}

/**
 * Shifts both the current run limit and the overall text limit by
 * delta, so that iteration stays in step after the text changes length.
 */
void ScriptRunIterator::adjustLimit(int32_t delta) {
    limit += delta;
    textLimit += delta;
}

//------------------------------------------------------------
// AnyTransliterator

const char AnyTransliterator::fgClassID=0;

/**
 * Constructs an Any-to-target transliterator.
 *
 * @param id              ID passed to the Transliterator base class
 * @param theTarget       target name; stored, with "/variant" appended
 *                        when theVariant is non-empty
 * @param theVariant      variant name, may be empty
 * @param theTargetScript script code of the target; runs already in
 *                        this script are left untouched
 * @param ec              error code handed to uhash_open for the cache
 */
AnyTransliterator::AnyTransliterator(const UnicodeString& id,
                                     const UnicodeString& theTarget,
                                     const UnicodeString& theVariant,
                                     UScriptCode theTargetScript,
                                     UErrorCode& ec) :
    Transliterator(id, NULL),
    targetScript(theTargetScript)
{
    cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
    // uhash_open yields NULL on failure; do not touch a NULL table.
    if (cache != NULL) {
        uhash_setValueDeleter(cache, _deleteTransliterator);
    }

    target = theTarget;
    if (theVariant.length() > 0) {
        target.append(VARIANT_SEP).append(theVariant);
    }
}

/**
 * Destructor.  Closes the cache (if one was created); the value
 * deleter installed on it deletes the cached transliterators.
 */
AnyTransliterator::~AnyTransliterator() {
    if (cache != NULL) {
        uhash_close(cache);
    }
}

/**
 * Copy constructor.  The copy gets its own, initially empty, cache.
 */
AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    Transliterator(o),
    target(o.target),
    targetScript(o.targetScript)
{
    // Don't copy the cache contents
    UErrorCode ec = U_ZERO_ERROR;
    cache = uhash_open(uhash_hashLong, uhash_compareLong, &ec);
    // As in the primary constructor, guard against a failed open.
    if (cache != NULL) {
        uhash_setValueDeleter(cache, _deleteTransliterator);
    }
}

/**
 * Transliterator API.
 */
Transliterator* AnyTransliterator::clone() const {
    return new AnyTransliterator(*this);
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Walks the script runs between pos.contextStart and pos.contextLimit
 * and, for each run that has a usable transliterator, applies it to
 * the part of the run lying inside [pos.start, pos.limit).
 */
void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
                                            UBool isIncremental) const {
    int32_t allStart = pos.start;
    int32_t allLimit = pos.limit;

    ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);

    while (it.next()) {
        // Ignore runs in the ante context
        if (it.limit <= allStart) continue;

        // Try to instantiate transliterator from it.scriptCode to
        // our target or target/variant
        Transliterator* t = getTransliterator(it.scriptCode);

        if (t == NULL) {
            // We have no transliterator.  Do nothing, but keep
            // pos.start up to date.
            pos.start = it.limit;
            continue;
        }

        // If the run end is before the transliteration limit, do
        // a non-incremental transliteration.  Otherwise do an
        // incremental one.
        UBool incremental = isIncremental && (it.limit >= allLimit);

        pos.start = uprv_max(allStart, it.start);
        pos.limit = uprv_min(allLimit, it.limit);
        int32_t limit = pos.limit;
        t->filteredTransliterate(text, pos, incremental);
        // The transliteration may have changed the text length; carry
        // the change into the overall limit and the run iterator.
        int32_t delta = pos.limit - limit;
        allLimit += delta;
        it.adjustLimit(delta);

        // We're done if we enter the post context
        if (it.limit >= allLimit) break;
    }

    // Restore limit.  pos.start is fine where the last transliterator
    // left it, or at the end of the last run.
    pos.limit = allLimit;
}

/**
 * Returns the transliterator that converts the given source script to
 * this object's target, creating and caching it on first use.  The
 * returned object is held by the cache; callers must not delete it.
 *
 * @param source script code of the run to be transliterated
 * @return the transliterator, or NULL if source is the target script,
 *         is USCRIPT_INVALID_CODE, the cache is unavailable, or no
 *         transliterator could be created
 */
Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {

    if (source == targetScript || source == USCRIPT_INVALID_CODE) {
        return NULL;
    }

    // Without a cache there is nowhere to keep (and later free) a newly
    // created transliterator, so report "none available".
    if (cache == NULL) {
        return NULL;
    }

    Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    if (t == NULL) {
        UErrorCode ec = U_ZERO_ERROR;
        UnicodeString sourceName(uscript_getName(source), "");
        UnicodeString id(sourceName);
        id.append(TARGET_SEP).append(target);

        t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
        if (U_FAILURE(ec) || t == NULL) {
            delete t;

            // Try to pivot around Latin, our most common script
            id = sourceName;
            id.append(LATIN_PIVOT).append(target);
            // Clear the failure left by the first attempt; otherwise
            // the retry is handed an already-failed error code and
            // cannot succeed.
            ec = U_ZERO_ERROR;
            t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
            if (U_FAILURE(ec) || t == NULL) {
                delete t;
                t = NULL;
            }
        }

        if (t != NULL) {
            uhash_iput(cache, (int32_t) source, t, &ec);
        }
    }

    return t;
}

/**
 * Return the script code for a given name, or USCRIPT_INVALID_CODE
 * if not found.
 */
UScriptCode AnyTransliterator::scriptNameToCode(const UnicodeString& name) {
    char buf[128];
    UScriptCode code;
    UErrorCode ec = U_ZERO_ERROR;

    name.extract(0, 128, buf, 128, "");
    // A name that fills the buffer may be left unterminated; make sure
    // uscript_getCode always sees a NUL-terminated string.
    buf[sizeof(buf) - 1] = 0;
    if (uscript_getCode(buf, &code, 1, &ec) != 1 ||
        U_FAILURE(ec)) {
        code = USCRIPT_INVALID_CODE;
    }
    return code;
}

/**
 * Registers standard transliterators with the system.  Called by
 * Transliterator during initialization.  Scan all current targets and
 * register those that are scripts T as Any-T/V.
 */
void AnyTransliterator::registerIDs() {
    UErrorCode ec;
    Hashtable seen(TRUE);
    int32_t sourceCount = Transliterator::_countAvailableSources();
    for (int32_t s=0; s= 1); for (int32_t v=0; v