/*
**********************************************************************
*   Copyright (C) 2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   06/07/01    aliu        Creation.
**********************************************************************
*/

#include "name2uni.h"
#include "unicode/unifilt.h"
#include "unicode/uchar.h"

// As of Unicode 3.0.0, the longest name is 83 characters long.
#define LONGEST_NAME 83

U_NAMESPACE_BEGIN

const char NameUnicodeTransliterator::_ID[] = "Name-Any";

/**
 * Constructs a transliterator that uses the given delimiters.
 *
 * @param openDelim     code point that starts a character name
 * @param closeDelim    code point that ends a character name
 * @param adoptedFilter filter handed on to the Transliterator base
 *                      class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(
                                 UChar32 openDelim, UChar32 closeDelim,
                                 UnicodeFilter* adoptedFilter) :
    Transliterator(_ID, adoptedFilter),
    openDelimiter(openDelim),
    closeDelimiter(closeDelim) {
}

/**
 * Constructs a transliterator with the default delimiters '{' and
 * '}'.
 *
 * @param adoptedFilter filter handed on to the Transliterator base
 *                      class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :
    Transliterator(_ID, adoptedFilter),
    openDelimiter((UChar) 0x007B /*{*/),
    closeDelimiter((UChar) 0x007D /*}*/) {
}

/**
 * Destructor.
 */
NameUnicodeTransliterator::~NameUnicodeTransliterator() {}

/**
 * Copy constructor.  Copies the base class state and both delimiters.
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :
    Transliterator(o),
    openDelimiter(o.openDelimiter),
    closeDelimiter(o.closeDelimiter) {}

/**
 * Assignment operator.  Assigns the base class state and both
 * delimiters, and returns *this.
 */
NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(
                             const NameUnicodeTransliterator& o) {
    Transliterator::operator=(o);
    openDelimiter = o.openDelimiter;
    closeDelimiter = o.closeDelimiter;
    return *this;
}

/**
 * Transliterator API.  Returns a new heap-allocated copy of this
 * object, created with the copy constructor.
 */
Transliterator* NameUnicodeTransliterator::clone(void) const {
    return new NameUnicodeTransliterator(*this);
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Scans text from offsets.start up to offsets.limit.  Each run of the
 * form openDelimiter NAME closeDelimiter for which u_charFromName()
 * succeeds is replaced by the named code point.  Runs of whitespace
 * inside NAME are collapsed to a single space, and leading and
 * trailing whitespace is dropped before the lookup.  A run whose
 * lookup fails is left untouched.
 *
 * @param text          the text to modify in place
 * @param offsets       on input, the range to process; on output,
 *                      limit and contextLimit are adjusted by the
 *                      change in text length and start is advanced
 * @param isIncremental if true and the scan stops inside an unclosed
 *                      name, start is set to the position of that
 *                      open delimiter rather than to the end of the
 *                      scanned range
 */
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
    // Accommodate the longest possible name plus padding
    UChar buf[LONGEST_NAME + 8];
    // Receives the contents of buf converted to char for u_charFromName()
    char cbuf[LONGEST_NAME + 8];

    // The only characters used in names are (as of Unicode 3.0.0):
    //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // (first character is a space).

    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
    int32_t ibuf = 0;                 // number of UChars collected in buf
    int32_t openPos = offsets.start;  // position of openDelimiter

    UnicodeString str;

    UChar32 c;
    // Every path through the body ends at the loop increment, which
    // steps over the code point just read.  The branches below that
    // move the cursor compensate for that increment.
    for (; cursor < limit; cursor += UTF_CHAR_LENGTH(c)) {
        c = text.char32At(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
            if (c == openDelimiter) {
                openPos = cursor;
                mode = 1;
                ibuf = 0;
            }
            break;

        case 1: // after open delimiter
            // Collect [-a-zA-Z0-9<>] into buf.  A run of whitespace
            // (\s+) is stored as a single space.  closeDelimiter
            // triggers the name lookup.  Any other character abandons
            // the name.  All three outcomes other than collecting
            // return to mode 0 when the name ends or overflows.
            if (u_isWhitespace(c)) {
                // Ignore leading whitespace and collapse repeats
                if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
                    buf[ibuf++] = (UChar)0x0020 /* */;
                    // If we go a bit past the longest possible name then abort
                    if (ibuf == (LONGEST_NAME + 4)) {
                        mode = 0;
                    }
                }
            } else if (c == closeDelimiter) {
                // Delete trailing space, if any
                if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
                    --ibuf;
                }
                buf[ibuf] = 0; // Add terminating zero

                UErrorCode status = U_ZERO_ERROR;
                UChar32 ch;

                // ibuf+1 so that the terminating zero is converted too
                u_UCharsToChars(buf, cbuf, ibuf+1);
                ch = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
                if (U_SUCCESS(status)) {
                    // Lookup succeeded
                    str.truncate(0);
                    str.append(ch);

                    // closeDelimiter is a UChar32 and so may occupy
                    // two code units; replace through its real end
                    // rather than assuming a length of one.
                    int32_t closeLen = UTF_CHAR_LENGTH(c);
                    text.handleReplaceBetween(openPos, cursor + closeLen, str);

                    // Adjust indices for the change in the length of
                    // the string.  Do not assume that str.length() ==
                    // 1, in case of surrogates.
                    int32_t delta = cursor + closeLen - openPos - str.length();
                    cursor -= delta;
                    limit -= delta;
                    // Once the loop increment has added closeLen:
                    // assert(cursor == openPos + str.length());
                }
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
            } else if (c == (UChar)0x002D ||
                       (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
                       (c >= (UChar)0x0061 && c <= (UChar)0x007A) ||
                       (c >= (UChar)0x0030 && c <= (UChar)0x0039) ||
                       c == (UChar)0x003C || c == (UChar)0x003E) {
                // c =~ [-A-Za-z0-9<>]
                buf[ibuf++] = (UChar) c;
                // If we go a bit past the longest possible name then abort
                if (ibuf == (LONGEST_NAME + 4)) {
                    mode = 0;
                }
            } else {
                // Invalid character.  Back up by its full length so
                // that the loop increment lands on it again and it is
                // reprocessed in mode 0.  Backing up by a single code
                // unit would leave the cursor on the trail surrogate
                // of a supplementary character.
                cursor -= UTF_CHAR_LENGTH(c);
                mode = 0;
            }
            break;
        }
    }

    // Report the change in text length through both limits
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter, if we are in mode 1.
    offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
}

U_NAMESPACE_END