/*
**********************************************************************
*   Copyright (C) 1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/17/99    aliu        Creation.
**********************************************************************
*/
#include "unicode/hextouni.h"
#include "unicode/rep.h"
#include "unicode/unifilt.h"
#include "unicode/unicode.h"

/**
 * The ID string handed to the Transliterator base class by every
 * constructor of this class.
 */
const char* HexToUnicodeTransliterator::_ID = "Hex-Any";

/**
 * Pattern applied by the filter-only constructor, written out as
 * individual code units and terminated by a zero.  It contains four
 * specs separated by ';':
 *     \\u0000
 *     \\U0000
 *     u+0000
 *     U+0000
 * Each doubled backslash resolves to a single backslash in the
 * effective prefix.
 */
static const UChar gDEFAULT_PATTERN[] = {
    /* "\\u0000;" */
    0x005C, 0x005C, 0x0075, 0x0030, 0x0030, 0x0030, 0x0030, 0x003B,
    /* "\\U0000;" */
    0x005C, 0x005C, 0x0055, 0x0030, 0x0030, 0x0030, 0x0030, 0x003B,
    /* "u+0000;" */
    0x0075, 0x002B, 0x0030, 0x0030, 0x0030, 0x0030, 0x003B,
    /* "U+0000" followed by the terminating zero */
    0x0055, 0x002B, 0x0030, 0x0030, 0x0030, 0x0030, 0x0000
};

/* Whole pattern as text: "\\u0000;\\U0000;u+0000;U+0000" */
// UNICODE_STRING("\\\\u0000;\\\\U0000;u+0000;U+0000", 29);
const UChar *HexToUnicodeTransliterator::DEFAULT_PATTERN = gDEFAULT_PATTERN;

/**
 * Constructs a transliterator that uses DEFAULT_PATTERN and the given
 * filter.
 */
HexToUnicodeTransliterator::HexToUnicodeTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter)
{
    // DEFAULT_PATTERN is known to parse, so the error code stays local
    // and is not reported to the caller.
    UErrorCode ec = U_ZERO_ERROR;
    applyPattern(DEFAULT_PATTERN, ec);
}

/**
 * Constructs a transliterator from the given pattern, with no filter.
 * The outcome of parsing the pattern is reported through status.
 */
HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
                                                       UErrorCode& status)
    : Transliterator(_ID, 0)
{
    applyPattern(thePattern, status);
}

/**
 * Constructs a transliterator.
*/
HexToUnicodeTransliterator::HexToUnicodeTransliterator(const UnicodeString& thePattern,
                                                       UnicodeFilter* adoptedFilter,
                                                       UErrorCode& status) :
    Transliterator(_ID, adoptedFilter) {
    // The result of parsing thePattern is reported to the caller
    // through status.
    applyPattern(thePattern, status);
}

/**
 * Copy constructor.  Copies the Transliterator base part together with
 * the pattern text, the compiled affixes string and the affix count.
 */
HexToUnicodeTransliterator::HexToUnicodeTransliterator(const HexToUnicodeTransliterator& o) :
    Transliterator(o),
    pattern(o.pattern),
    affixes(o.affixes),
    affixCount(o.affixCount) {
}

/**
 * Assignment operator.  Assigns the Transliterator base part first and
 * then the three members of this class.
 * @return a reference to this object
 */
HexToUnicodeTransliterator& HexToUnicodeTransliterator::operator=(
                                             const HexToUnicodeTransliterator& o) {
    Transliterator::operator=(o);
    pattern = o.pattern;
    affixes = o.affixes;
    affixCount = o.affixCount;
    return *this;
}

/**
 * Transliterator API.
 * @return a copy of this object allocated with new, made by the copy
 * constructor.  NOTE(review): presumably the caller owns the returned
 * object -- confirm against the Transliterator base class contract.
 */
Transliterator* HexToUnicodeTransliterator::clone(void) const {
    return new HexToUnicodeTransliterator(*this);
}

/**
 * Parses thePattern and rebuilds the compiled form held in affixes and
 * affixCount.
 *
 * The members pattern, affixes and affixCount are overwritten before
 * parsing begins, so when this function returns early with an error the
 * object keeps the new pattern text and whatever part of affixes had
 * been built up to that point.
 *
 * @param thePattern the pattern to parse; it is also stored in the
 *                   pattern member.
 * @param status     error code.  If it already indicates a failure on
 *                   entry, the function returns without doing anything.
 *                   The visible code sets it to U_ILLEGAL_ARGUMENT_ERROR
 *                   when a spec fails validation.
 */
void HexToUnicodeTransliterator::applyPattern(const UnicodeString& thePattern,
                                              UErrorCode& status) {
    if (U_FAILURE(status)) {
        return;
    }

    /* The pattern is processed and stored in affixes.  The pattern
     * consists of zero or more affixes.  Each affix is parsed to
     * determine the prefix, suffix, minimum digit count, and maximum
     * digit count.  These values are then stored as a four character
     * header.  That is, their numeric values are cast to UChars and
     * stored in the string.  Following these four characters, the prefix
     * characters, then suffix characters are stored.  Each spec takes
     * n+4 characters, where n is the total length of the prefix and
     * suffix.
     */

    // POSSIBLE FUTURE MODIFICATION
    // Parse thePattern, and if this succeeds, set pattern to thePattern.
    // If it fails, call applyPattern(pattern) to restore the original
    // conditions.

    pattern = thePattern;
    affixes.truncate(0);
    affixCount = 0;

    /* The mode specifies where we are in each spec.
     * mode 0 = in prefix
     * mode 1 = in optional digits (#)
     * mode 2 = in required digits (0)
     * mode 3 = in suffix
     */
    int32_t mode = 0;

    int32_t prefixLen = 0, suffixLen = 0, minDigits = 0, maxDigits = 0;
    // Index in affixes at which the header of the spec currently being
    // parsed starts (or will start, if nothing has been appended for it
    // yet).
    int32_t start = 0;

    /* To make parsing easier, we append a virtual ';' at the end of
     * the pattern string, if there isn't one already.  When we get to
     * the index pattern.length() (that is, one past the end), we
     * create a virtual ';' if necessary.
     */
    UChar c = 0;              // These are outside the loop so we can see the
    UBool isLiteral = FALSE;  // previous character...
    for (int32_t i=0; i<=pattern.length(); ++i) {
        // Create the virtual trailing ';' if necessary
        if (i == pattern.length()) {
            // If the last character was not a non-literal ';'...
            if (i > 0 && !(c == SEMICOLON && !isLiteral)) {
                c = SEMICOLON;
                isLiteral = FALSE;
            } else {
                // Empty pattern, or the pattern already ended with an
                // unescaped ';': nothing is left to terminate.
                break;
            }
        } else {
            c = pattern.charAt(i);
            isLiteral = FALSE;
        }

        if (c == BACKSLASH) {
            // NOTE(review): the text of this file appears to be damaged
            // from this point.  "(i+1) 4" is not a valid expression, and
            // the `break;` / `default:` further down have no visible
            // enclosing `switch`.  Presumably the remainder of the
            // backslash handling and the opening of a switch over the
            // non-literal characters were lost; restore them from a
            // pristine copy of the file.  The tokens below are kept
            // exactly as found.
            if ((i+1) 4
                    // Invalid min/max digit count
                    || prefixLen > 0xFFFF || suffixLen > 0xFFFF) {
                    // Suffix or prefix too long
                    // (each length has to fit in the single UChar that
                    // stores it in the header)
                    status = U_ILLEGAL_ARGUMENT_ERROR;
                    return;
                }
                // If there was no prefix and no suffix, then the
                // header will not have been allocated yet.  We need
                // to allocate the header now.
                if (start == affixes.length()) {
                    affixes.append(UNICODE_STRING("AAAA", 4));
                }
                // Fill in 4-character header
                affixes.setCharAt(start++, (UChar) prefixLen);
                affixes.setCharAt(start++, (UChar) suffixLen);
                affixes.setCharAt(start++, (UChar) minDigits);
                affixes.setCharAt(start++, (UChar) maxDigits);
                // The next spec's header starts at the current end of
                // affixes; reset the per-spec state for it.
                start = affixes.length();
                ++affixCount;
                prefixLen = suffixLen = minDigits = maxDigits = mode = 0;
                break;
            default:
                isLiteral = TRUE;
                break;
            }
        }

        if (isLiteral) {
            if (start == affixes.length()) {
                // Make space for the header.  Append any four
                // characters as place holders for the header values.
                // We fill these in when we parse the ';'.
                affixes.append(UNICODE_STRING("AAAA", 4));
            }
            affixes.append(c);
            if (mode == 0) {
                ++prefixLen;
            } else {
                // Any literal outside the prefix moves us into mode 3
                // (suffix)
                mode = 3;
                ++suffixLen;
            }
        }
    }
}

/**
 * Returns the pattern text most recently stored by applyPattern.
 * @return a reference to the pattern member of this object
 */
const UnicodeString& HexToUnicodeTransliterator::toPattern(void) const {
    return pattern;
}

/**
 * Scans text from offsets.start up to offsets.limit and replaces each
 * sequence that matches one of the specs in affixes with the single
 * UChar whose value is given by the sequence's hexadecimal digits.
 *
 * @param text          the text to modify in place.
 * @param offsets       start and limit are read on entry.  On exit,
 *                      start is set to the position reached, limit to
 *                      the limit adjusted for the replacements made,
 *                      and contextLimit is shifted by the same amount
 *                      as limit.
 * @param isIncremental if true, a partial match that runs into limit
 *                      ends processing immediately, leaving the
 *                      position at the start of that partial match.
 */
void HexToUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                     UBool isIncremental) const {
    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;
    int32_t i, j, ipat;

    while (cursor < limit) {
        // Loop over the specs in affixes.  If affixCount is zero (an
        // empty pattern), then we do nothing.  We exit this loop when
        // we match one of the specs.  We exit this function (by
        // jumping to exit: below) if a partial match is detected and
        // isIncremental is true.
        //
        // NOTE(review): text appears to have been lost from the loop
        // header below ("j= limit").  The names prefixLen, suffixLen,
        // minDigits, maxDigits, curs and match are used further down but
        // are not declared anywhere in the visible text; presumably the
        // lost part held the loop bound, the decoding of the spec's
        // header and the start of the prefix-matching loop.  Restore it
        // from a pristine copy of the file.  The tokens are kept exactly
        // as found.
        for (j=0, ipat=0; j= limit) {
                    if (i > 0) {
                        // We've already matched a character.  This is
                        // a partial match, so we return if in
                        // incremental mode.  In non-incremental mode,
                        // go to the next spec.
                        if (isIncremental) {
                            goto exit;
                        }
                        match = FALSE;
                        break;
                    }
                }
                UChar c = text.charAt(curs++);
                if (c != affixes.charAt(ipat + i)) {
                    match = FALSE;
                    break;
                }
            }

            if (match) {
                // Accumulate the value of the hex digits into u, four
                // bits per digit.
                UChar u = 0;
                int32_t digitCount = 0;
                for (;;) {
                    if (curs >= limit) {
                        // Check for partial match in incremental mode.
                        if (curs > cursor && isIncremental) {
                            goto exit;
                        }
                        break;
                    }
                    int8_t digit = Unicode::digit(text.charAt(curs), 16);
                    if (digit < 0) {
                        // Not a hex digit: the digit run ends here.
                        break;
                    }
                    ++curs;
                    u <<= 4;
                    u |= digit;
                    if (++digitCount == maxDigits) {
                        // Never consume more digits than the spec allows.
                        break;
                    }
                }

                match = (digitCount >= minDigits);

                if (match) {
                    // NOTE(review): text appears to have been lost from
                    // the loop header below as well ("i= limit");
                    // presumably it held the bound of the
                    // suffix-matching loop and the test of curs against
                    // limit.  The tokens are kept exactly as found.
                    for (i=0; i= limit) {
                            // Check for partial match in incremental mode.
                            if (curs > cursor && isIncremental) {
                                goto exit;
                            }
                            match = FALSE;
                            break;
                        }
                        UChar c = text.charAt(curs++);
                        // The suffix characters are stored right after
                        // the prefix characters.
                        if (c != affixes.charAt(ipat + prefixLen + i)) {
                            match = FALSE;
                            break;
                        }
                    }

                    if (match) {
                        // This is a temporary one-character string
                        UnicodeString str = UNICODE_STRING("A", 1);

                        // At this point, we have a match
                        str.setCharAt(0, u);
                        text.handleReplaceBetween(cursor, curs, str);
                        // curs - cursor characters were replaced by a
                        // single one, so the limit moves back by the
                        // difference.
                        limit -= curs - cursor - 1;
                        // The following break statement leaves the
                        // loop that is traversing the specs in
                        // affixes.  We then parse the next input
                        // character.
                        break;
                    }
                }
            }

            // Step over this spec's prefix and suffix characters.
            ipat += prefixLen + suffixLen;
        }

        ++cursor;
    }

 exit:
    // Report the change in length and the position reached back to the
    // caller.
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    offsets.start = cursor;
}