scuffed-code/icu4c/source/i18n/name2uni.cpp

/*
**********************************************************************
*   Copyright (C) 2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   06/07/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/name2uni.h"
#include "unicode/unifilt.h"
#include "unicode/unicode.h"
#include "unicode/convert.h"

// Identifier handed to the Transliterator base-class constructor by both
// public constructors of this class.
const char* NameUnicodeTransliterator::_ID = "Name-Any";

// As of Unicode 3.0.0, the longest name is 83 characters long.
// handleTransliterate() sizes its scratch buffers as LONGEST_NAME + 8 and
// abandons a candidate name once LONGEST_NAME + 4 characters have been
// collected.
#define LONGEST_NAME 83

/**
 * Creates a Name-Any transliterator that uses caller-supplied delimiters.
 *
 * @param openDelim     code point stored as the open delimiter
 * @param closeDelim    code point stored as the close delimiter
 * @param adoptedFilter filter forwarded, together with _ID, to the
 *                      Transliterator base-class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UChar32 openDelim,
                                                     UChar32 closeDelim,
                                                     UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      openDelimiter(openDelim),
      closeDelimiter(closeDelim)
{
}

/**
 * Creates a Name-Any transliterator that uses the default delimiters:
 * U+007B '{' opens a name and U+007D '}' closes it.
 *
 * @param adoptedFilter filter forwarded, together with _ID, to the
 *                      Transliterator base-class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      openDelimiter((UChar) 0x007B),    // '{'
      closeDelimiter((UChar) 0x007D)    // '}'
{
}

/**
 * Destructor.  The body is intentionally empty.
 */
NameUnicodeTransliterator::~NameUnicodeTransliterator()
{
}

/**
 * Copy constructor.  Copy-constructs the Transliterator base from the
 * source object and then copies both delimiter fields.
 *
 * @param o the transliterator to copy from
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o)
    : Transliterator(o),
      openDelimiter(o.openDelimiter),
      closeDelimiter(o.closeDelimiter)
{
}

/**
 * Assignment operator.  Assigns the Transliterator base part first and
 * then copies the two delimiter fields.
 *
 * @param o the transliterator to assign from
 * @return a reference to this object
 */
NameUnicodeTransliterator&
NameUnicodeTransliterator::operator=(const NameUnicodeTransliterator& o)
{
    // Base-class state is delegated to the base-class operator.
    Transliterator::operator=(o);

    // The delimiter fields are plain values, so copying them is all that
    // this class itself has to do.
    closeDelimiter = o.closeDelimiter;
    openDelimiter = o.openDelimiter;

    return *this;
}

/**
 * Transliterator API.  Produces a new heap-allocated object that is
 * copy-constructed from this one.
 *
 * @return pointer to the newly allocated copy
 */
Transliterator* NameUnicodeTransliterator::clone(void) const
{
    NameUnicodeTransliterator* duplicate = new NameUnicodeTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Scans text from offsets.start up to offsets.limit looking for
 * sequences of the form openDelimiter NAME closeDelimiter.  NAME is
 * collected into a local buffer (whitespace runs collapsed to a single
 * space, leading and trailing whitespace dropped), converted to char
 * with the default converter, and looked up with u_charFromName().  On
 * a successful lookup the whole delimited sequence is replaced in text
 * by the resulting character; otherwise the text is left unchanged.
 *
 * @param text          the text to scan; read with charAt() and modified
 *                      with handleReplaceBetween()
 * @param offsets       start and limit give the range to scan.  On
 *                      return, limit and contextLimit have been adjusted
 *                      by the net change in text length and start has
 *                      been advanced (see isIncremental)
 * @param isIncremental if true and the scan ends after an open delimiter
 *                      whose close delimiter has not been seen,
 *                      offsets.start is set to that open delimiter
 *                      instead of to the end of the scanned range
 */
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
    // Accommodate the longest possible name plus padding
    UChar buf[LONGEST_NAME + 8];
    char cbuf[LONGEST_NAME + 8]; // Default converter

    // The only characters used in names are (as of Unicode 3.0.0):
    //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // (first character is a space).
    
    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
    int32_t ibuf = 0;
    int32_t openPos = offsets.start; // position of openDelimiter

    UnicodeString str;

    UnicodeConverter converter; // default converter

    for (; cursor < limit; ++cursor) {
        // NOTE(review): c is a single UChar while the delimiters are
        // constructed from UChar32 values; presumably a delimiter outside
        // the range of UChar can never match here -- confirm.
        UChar c = text.charAt(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
            if (c == openDelimiter) {
                openPos = cursor;
                mode = 1;
                ibuf = 0;
            }
            break;

        case 1: // after open delimiter
            // Look for [-A-Z0-9].  A run of whitespace is converted to
            // a single space.  If closeDelimiter is found, attempt the
            // lookup and return to mode 0.  If any other character is
            // found, return to mode 0 and reprocess that character.
            if (Unicode::isWhitespace(c)) {
                // Ignore leading whitespace
                if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
                    buf[ibuf++] = (UChar)0x0020 /* */;
                    // If we go a bit past the longest possible name then abort
                    if (ibuf == (LONGEST_NAME + 4)) {
                        mode = 0;
                    }
                }
                continue;
            }

            if (c == closeDelimiter) {
                // Delete trailing space, if any
                if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
                    --ibuf;
                }
                buf[ibuf] = 0; // Add terminating zero
                UErrorCode status = U_ZERO_ERROR;

                // Convert UChar to char.  The target limit stops one
                // byte short of the end of cbuf so that the terminating
                // zero written below always lands inside the buffer,
                // even if the converter fills all the space it is given.
                char *out = cbuf;
                const UChar *in = buf;
                converter.fromUnicode(out, cbuf + sizeof(cbuf) - 1,
                                      in, buf+ibuf, NULL, TRUE, status);
                *out = 0;

                UChar32 ch = u_charFromName(U_UNICODE_CHAR_NAME, cbuf, &status);
                if (ch != (UChar32) 0xFFFF && U_SUCCESS(status)) {
                    // Lookup succeeded
                    str.truncate(0);
                    str.append(ch);
                    text.handleReplaceBetween(openPos, cursor+1, str);

                    // Adjust indices for the change in the length of
                    // the string.  Do not assume that str.length() ==
                    // 1, in case of surrogates.
                    int32_t delta = cursor + 1 - openPos - str.length();
                    cursor -= delta;
                    limit -= delta;
                    // assert(cursor == openPos + str.length());
                }
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
                continue;
            }
            
            // Lower-case letters are deliberately not folded to upper
            // case, so they fall through to the invalid-character branch.
            //if (c >= (UChar)0x0061 && c <= (UChar)0x007A) {
            //    c -= 0x0020; // [a-z] => [A-Z]
            //}

            // Check if c =~ [-A-Z0-9]
            if (c == (UChar)0x002D ||
                (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
                (c >= (UChar)0x0030 && c <= (UChar)0x0039)) {
                buf[ibuf++] = (UChar) c;
                // If we go a bit past the longest possible name then abort
                if (ibuf == (LONGEST_NAME + 4)) {
                    mode = 0;
                }
            }
            
            // Invalid character
            else {
                --cursor; // Backup and reprocess this character
                mode = 0;
            }

            break;
        }
    }
        
    // limit has shrunk (or grown) by the net length change of all
    // replacements; apply the same change to contextLimit.
    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter, if we are in mode 1.
    offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
}
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`/*`
			`**********************************************************************`
			`* Copyright (C) 2001, International Business Machines`
			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 06/07/01 aliu Creation.`
			`**********************************************************************`
			`*/`

			`#include "unicode/name2uni.h"`
			`#include "unicode/unifilt.h"`
			`#include "unicode/unicode.h"`
ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00			`#include "unicode/convert.h"`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`const char* NameUnicodeTransliterator::_ID = "Name-Any";`

			`// As of Unicode 3.0.0, the longest name is 83 characters long.`
			`#define LONGEST_NAME 83`

			`/**`
			`* Constructs a transliterator.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(`
			`UChar32 openDelim, UChar32 closeDelim,`
			`UnicodeFilter* adoptedFilter) :`
			`Transliterator(_ID, adoptedFilter),`
			`openDelimiter(openDelim),`
			`closeDelimiter(closeDelim) {`
			`}`

			`/**`
			`* Constructs a transliterator with the default delimiters '{' and`
			`* '}'.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :`
			`Transliterator(_ID, adoptedFilter),`
			`openDelimiter((UChar) 0x007B /{/),`
			`closeDelimiter((UChar) 0x007D /}/) {`
			`}`

			`/**`
			`* Destructor.`
			`*/`
			`NameUnicodeTransliterator::~NameUnicodeTransliterator() {}`

			`/**`
			`* Copy constructor.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :`
			`Transliterator(o),`
			`openDelimiter(o.openDelimiter),`
			`closeDelimiter(o.closeDelimiter) {}`

			`/**`
			`* Assignment operator.`
			`*/`
			`NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(`
			`const NameUnicodeTransliterator& o) {`
			`Transliterator::operator=(o);`
			`openDelimiter = o.openDelimiter;`
			`closeDelimiter = o.closeDelimiter;`
			`return *this;`
			`}`

			`/**`
			`* Transliterator API.`
			`*/`
			`Transliterator* NameUnicodeTransliterator::clone(void) const {`
			`return new NameUnicodeTransliterator(*this);`
			`}`

			`/**`
			`* Implements {@link Transliterator#handleTransliterate}.`
			`*/`
			`void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,`
			`UBool isIncremental) const {`
			`// Accomodate the longest possible name plus padding`
ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00			`UChar buf[LONGEST_NAME + 8];`
			`char cbuf[LONGEST_NAME + 8]; // Default converter`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`// The only characters used in names are (as of Unicode 3.0.0):`
			`// -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ`
			`// (first character is a space).`

			`int32_t cursor = offsets.start;`
			`int32_t limit = offsets.limit;`

			`// Modes:`
			`// 0 - looking for open delimiter`
			`// 1 - after open delimiter`
			`int32_t mode = 0;`
			`int32_t ibuf = 0;`
			`int32_t openPos = offsets.start; // position of openDelimiter`

			`UnicodeString str;`

ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00			`UnicodeConverter converter; // default converter`

ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`for (; cursor < limit; ++cursor) {`
ICU-1053 move filter logic into Transliterator.filteredTransliterate X-SVN-Rev: 5258 2001-07-17 23:36:41 +00:00			`UChar c = text.charAt(cursor);`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`switch (mode) {`
			`case 0: // looking for open delimiter`
			`if (c == openDelimiter) {`
			`openPos = cursor;`
			`mode = 1;`
			`ibuf = 0;`
			`}`
			`break;`

			`case 1: // after open delimiter`
			`// Look for [-a-zA-Z0-9]. If \w+ is found, convert it`
			`// to a single space. If closeDelimiter is found, exit`
			`// the loop. If any other character is found, exit the`
			`// loop. If the limit is found, exit the loop.`
			`if (Unicode::isWhitespace(c)) {`
			`// Ignore leading whitespace`
			`if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {`
			`buf[ibuf++] = (UChar)0x0020 /* */;`
			`// If we go a bit past the longest possible name then abort`
			`if (ibuf == (LONGEST_NAME + 4)) {`
			`mode = 0;`
			`}`
			`}`
			`continue;`
			`}`

			`if (c == closeDelimiter) {`
			`// Delete trailing space, if any`
			`if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {`
			`--ibuf;`
			`}`
			`buf[ibuf] = 0; // Add terminating zero`
			`UErrorCode status = U_ZERO_ERROR;`
ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00
			`// Convert UChar to char`
			`char *out = cbuf;`
			`const UChar *in = buf;`
			`converter.fromUnicode(out, cbuf+sizeof(cbuf),`
			`in, buf+ibuf, NULL, TRUE, status);`
			`*out = 0;`

			`UChar32 ch = u_charFromName(U_UNICODE_CHAR_NAME, cbuf, &status);`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`if (ch != (UChar32) 0xFFFF && U_SUCCESS(status)) {`
			`// Lookup succeeded`
			`str.truncate(0);`
			`str.append(ch);`
			`text.handleReplaceBetween(openPos, cursor+1, str);`

			`// Adjust indices for the change in the length of`
			`// the string. Do not assume that str.length() ==`
			`// 1, in case of surrogates.`
			`int32_t delta = cursor + 1 - openPos - str.length();`
			`cursor -= delta;`
			`limit -= delta;`
			`// assert(cursor == openPos + str.length());`
			`}`
			`// If the lookup failed, we leave things as-is and`
			`// still switch to mode 0 and continue.`
			`mode = 0;`
			`continue;`
			`}`

ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00			`//if (c >= (UChar)0x0061 && c <= (UChar)0x007A) {`
			`// c -= 0x0020; // [a-z] => [A-Z]`
			`//}`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`// Check if c =~ [-A-Z0-9]`
			`if (c == (UChar)0x002D \|\|`
			`(c >= (UChar)0x0041 && c <= (UChar)0x005A) \|\|`
			`(c >= (UChar)0x0030 && c <= (UChar)0x0039)) {`
			`buf[ibuf++] = (char) c;`
			`// If we go a bit past the longest possible name then abort`
			`if (ibuf == (LONGEST_NAME + 4)) {`
			`mode = 0;`
			`}`
			`}`

			`// Invalid character`
			`else {`
			`--cursor; // Backup and reprocess this character`
			`mode = 0;`
			`}`

			`break;`
			`}`
			`}`

			`offsets.contextLimit += limit - offsets.limit;`
			`offsets.limit = limit;`
			`// In incremental mode, only advance the cursor up to the last`
			`// open delimiter, if we are in mode 1.`
			`offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;`
			`}`