/* ********************************************************************** * Copyright (C) 1999, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/17/99 aliu Creation. ********************************************************************** */ #include "rbt_rule.h" #include "unicode/rep.h" #include "rbt_data.h" #include "unicode/unifilt.h" #include "unicode/uniset.h" /** * Construct a new rule with the given input, output text, and other * attributes. A cursor position may be specified for the output text. * @param input input string, including key and optional ante and * post context * @param anteContextPos offset into input to end of ante context, or -1 if * none. Must be <= input.length() if not -1. * @param postContextPos offset into input to start of post context, or -1 * if none. Must be <= input.length() if not -1, and must be >= * anteContextPos. * @param output output string * @param cursorPos offset into output at which cursor is located, or -1 if * none. If less than zero, then the cursor is placed after the * output; that is, -1 is equivalent to * output.length(). If greater than * output.length() then an exception is thrown. */ TransliterationRule::TransliterationRule(const UnicodeString& input, int32_t anteContextPos, int32_t postContextPos, const UnicodeString& output, int32_t cursorPos, UErrorCode& status) { if (U_FAILURE(status)) { return; } // Do range checks only when warranted to save time if (anteContextPos < 0) { anteContextLength = 0; } else { if (anteContextPos > input.length()) { // throw new IllegalArgumentException("Invalid ante context"); status = U_ILLEGAL_ARGUMENT_ERROR; return; } anteContextLength = anteContextPos; } if (postContextPos < 0) { keyLength = input.length() - anteContextLength; } else { if (postContextPos < anteContextLength || postContextPos > input.length()) { // throw new IllegalArgumentException("Invalid post context"); status = U_ILLEGAL_ARGUMENT_ERROR; return; } keyLength = postContextPos - anteContextLength; } if (cursorPos < 0) { this->cursorPos = output.length(); } else { if (cursorPos > output.length()) { // throw new IllegalArgumentException("Invalid cursor position"); status = U_ILLEGAL_ARGUMENT_ERROR; return; } this->cursorPos = cursorPos; } pattern = input; this->output = output; } TransliterationRule::~TransliterationRule() {} /** * Return the length of the key. Equivalent to getKey().length(). * @return the length of the match key. */ int32_t TransliterationRule::getKeyLength(void) const { return keyLength; } /** * Return the output string. * @return the output string. */ const UnicodeString& TransliterationRule::getOutput(void) const { return output; } /** * Return the position of the cursor within the output string. * @return a value from 0 to getOutput().length(), inclusive. */ int32_t TransliterationRule::getCursorPos(void) const { return cursorPos; } /** * Return the preceding context length. This method is needed to * support the Transliterator method * getMaximumContextLength(). */ int32_t TransliterationRule::getAnteContextLength(void) const { return anteContextLength; } /** * Internal method. Returns 8-bit index value for this rule. * This is the low byte of the first character of the key, * unless the first character of the key is a set. If it's a * set, or otherwise can match multiple keys, the index value is -1. */ int16_t TransliterationRule::getIndexValue(const TransliterationRuleData& data) { if (anteContextLength == pattern.length()) { // A pattern with just ante context {such as foo)>bar} can // match any key. return -1; } UChar c = pattern.charAt(anteContextLength); return data.lookupSet(c) == NULL ? (c & 0xFF) : -1; } /** * Internal method. Returns true if this rule matches the given * index value. The index value is an 8-bit integer, 0..255, * representing the low byte of the first character of the key. * It matches this rule if it matches the first character of the * key, or if the first character of the key is a set, and the set * contains any character with a low byte equal to the index * value. If the rule contains only ante context, as in foo)>bar, * then it will match any key. */ bool_t TransliterationRule::matchesIndexValue(uint8_t v, const TransliterationRuleData& data) { if (anteContextLength == pattern.length()) { // A pattern with just ante context {such as foo)>bar} can // match any key. return TRUE; } UChar c = pattern.charAt(anteContextLength); UnicodeSet* set = data.lookupSet(c); return set == NULL ? (uint8_t(c) == v) : set->containsIndexValue(v); } /** * Return true if this rule masks another rule. If r1 masks r2 then * r1 matches any input string that r2 matches. If r1 masks r2 and r2 masks * r1 then r1 == r2. Examples: "a>x" masks "ab>y". "a>x" masks "a[b]>y". * "[c]a>x" masks "[dc]a>y". */ bool_t TransliterationRule::masks(const TransliterationRule& r2) const { /* Rule r1 masks rule r2 if the string formed of the * antecontext, key, and postcontext overlaps in the following * way: * * r1: aakkkpppp * r2: aaakkkkkpppp * ^ * * The strings must be aligned at the first character of the * key. The length of r1 to the left of the alignment point * must be <= the length of r2 to the left; ditto for the * right. The characters of r1 must equal (or be a superset * of) the corresponding characters of r2. The superset * operation should be performed to check for UnicodeSet * masking. */ /* LIMITATION of the current mask algorithm: Some rule * maskings are currently not detected. For example, * "{Lu}]a>x" masks "A]a>y". This can be added later. TODO */ int32_t len = pattern.length(); int32_t left = anteContextLength; int32_t left2 = r2.anteContextLength; int32_t right = len - left; int32_t right2 = r2.pattern.length() - left2; return left <= left2 && right <= right2 && 0 == r2.pattern.compare(left2 - left, len, pattern); } /** * Return true if this rule matches the given text. * @param text the text, both translated and untranslated * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param cursor position at which to translate next, representing offset * into text. This value must be between start and * limit. * @param filter the filter. Any character for which * filter.contains() returns false will not be * altered by this transliterator. If filter is * null then no filtering is applied. */ bool_t TransliterationRule::matches(const Replaceable& text, int32_t start, int32_t limit, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { // Match anteContext, key, and postContext cursor -= anteContextLength; if (cursor < start || (cursor + pattern.length()) > limit) { return FALSE; } for (int32_t i=0; i0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param cursor position at which to translate next, representing offset * into text. This value must be between start and * limit. * @param filter the filter. Any character for which * filter.contains() returns false will not be * altered by this transliterator. If filter is * null then no filtering is applied. * @return one of MISMATCH, PARTIAL_MATCH, or * FULL_MATCH. * @see #MISMATCH * @see #PARTIAL_MATCH * @see #FULL_MATCH */ int32_t TransliterationRule::getMatchDegree(const Replaceable& text, int32_t start, int32_t limit, int32_t cursor, const TransliterationRuleData& data, const UnicodeFilter* filter) const { int len = getRegionMatchLength(text, start, limit, cursor - anteContextLength, pattern, data, filter); return len < anteContextLength ? MISMATCH : (len < pattern.length() ? PARTIAL_MATCH : FULL_MATCH); } /** * Return the number of characters of the text that match this rule. If * there is a mismatch, return -1. If the text is not long enough to match * any characters, return 0. * @param text the text, both translated and untranslated * @param start the beginning index, inclusive; 0 <= start * <= limit. * @param limit the ending index, exclusive; start <= limit * <= text.length(). * @param cursor position at which to translate next, representing offset * into text. This value must be between start and * limit. * @param templ the text to match against. All characters must match. * @param data a dictionary of variables mapping Character * to UnicodeSet * @param filter the filter. Any character for which * filter.contains() returns false will not be * altered by this transliterator. If filter is * null then no filtering is applied. * @return -1 if there is a mismatch, 0 if the text is not long enough to * match any characters, otherwise the number of characters of text that * match this rule. */ int32_t TransliterationRule::getRegionMatchLength(const Replaceable& text, int32_t start, int32_t limit, int32_t cursor, const UnicodeString& templ, const TransliterationRuleData& data, const UnicodeFilter* filter) const { if (cursor < start) { return -1; } int32_t i; for (i=0; iCharacter * to UnicodeSet * @param filter the filter. Any character for which * filter.contains() returns false will not be * altered by this transliterator. If filter is * null then no filtering is applied. */ bool_t TransliterationRule::charMatches(UChar keyChar, UChar textChar, const TransliterationRuleData& data, const UnicodeFilter* filter) const { UnicodeSet* set = 0; return (filter == 0 || filter->contains(textChar)) && (((set = data.lookupSet(keyChar)) == 0) ? keyChar == textChar : set->contains(textChar)); }