scuffed-code/icu4c/source/i18n/name2uni.cpp

/*
**********************************************************************
*   Copyright (C) 2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   06/07/01    aliu        Creation.
**********************************************************************
*/

#include "name2uni.h"
#include "unicode/unifilt.h"
#include "unicode/uchar.h"

// As of Unicode 3.0.0, the longest name is 83 characters long.
#define LONGEST_NAME 83

U_NAMESPACE_BEGIN

const char NameUnicodeTransliterator::_ID[] = "Name-Any";

/**
 * Creates a Name-Any transliterator that uses caller-supplied delimiters.
 *
 * @param openDelim      code point stored as the open delimiter
 * @param closeDelim     code point stored as the close delimiter
 * @param adoptedFilter  filter forwarded, together with _ID, to the
 *                       Transliterator base-class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UChar32 openDelim,
                                                     UChar32 closeDelim,
                                                     UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      openDelimiter(openDelim),
      closeDelimiter(closeDelim)
{
}

/**
 * Creates a Name-Any transliterator whose delimiters are the defaults:
 * U+007B '{' to open and U+007D '}' to close.
 *
 * @param adoptedFilter  filter forwarded, together with _ID, to the
 *                       Transliterator base-class constructor
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter)
    : Transliterator(_ID, adoptedFilter),
      openDelimiter(static_cast<UChar>(0x007B)),   // '{'
      closeDelimiter(static_cast<UChar>(0x007D))   // '}'
{
}

/**
 * Destructor.  The body is empty: this class performs no cleanup of its
 * own here.
 */
NameUnicodeTransliterator::~NameUnicodeTransliterator()
{
}

/**
 * Copy constructor.  Copy-constructs the Transliterator base from the
 * source object and duplicates both of its delimiters.
 *
 * @param o  the transliterator to copy
 */
NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o)
    : Transliterator(o),
      openDelimiter(o.openDelimiter),
      closeDelimiter(o.closeDelimiter)
{
}

/**
 * Assignment operator.  Assigns the Transliterator base-class state and
 * then both delimiters from the right-hand side.
 *
 * @param o  the transliterator to assign from
 * @return   a reference to this object
 */
NameUnicodeTransliterator&
NameUnicodeTransliterator::operator=(const NameUnicodeTransliterator& o)
{
    // Base-class portion first, then the members declared by this class.
    this->Transliterator::operator=(o);
    this->openDelimiter = o.openDelimiter;
    this->closeDelimiter = o.closeDelimiter;
    return *this;
}

/**
 * Transliterator API.  Returns a newly allocated copy of this object,
 * produced by the copy constructor.
 *
 * @return  pointer to the new copy
 */
Transliterator* NameUnicodeTransliterator::clone(void) const
{
    NameUnicodeTransliterator* duplicate = new NameUnicodeTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 *
 * Scans the text from offsets.start up to offsets.limit for sequences of
 * the form
 *
 *     openDelimiter  name  closeDelimiter
 *
 * and replaces each sequence whose name is accepted by
 * u_charFromName(U_EXTENDED_CHAR_NAME, ...) with the resulting code point.
 * Inside the delimiters, leading whitespace is dropped, runs of whitespace
 * collapse to a single U+0020, and trailing whitespace is removed before
 * the lookup.  Any character outside [-A-Za-z0-9<>] and whitespace ends
 * the candidate name; that character is then re-examined as a possible
 * open delimiter.  A candidate that reaches LONGEST_NAME + 4 buffered
 * characters is abandoned.
 *
 * @param text           the text to modify in place
 * @param offsets        on input, start and limit bound the scan.  On
 *                       output, limit and contextLimit are shifted by the
 *                       net change in text length and start is set to the
 *                       position where scanning stopped (see isIncremental)
 * @param isIncremental  if true and the scan ended inside an unterminated
 *                       name, offsets.start is set to the position of its
 *                       open delimiter rather than to the end of the scan
 */
void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                    UBool isIncremental) const {
    // Accommodate the longest possible name plus padding
    UChar buf[LONGEST_NAME + 8];
    char cbuf[LONGEST_NAME + 8]; // char copy of buf handed to u_charFromName

    // The only characters used in names are (as of Unicode 3.0.0):
    //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // (first character is a space).

    int32_t cursor = offsets.start;
    int32_t limit = offsets.limit;

    // Modes:
    // 0 - looking for open delimiter
    // 1 - after open delimiter
    int32_t mode = 0;
    int32_t ibuf = 0;                // number of UChars collected in buf
    int32_t openPos = offsets.start; // position of openDelimiter

    UnicodeString str;

    UChar32 c;
    // The increment uses the length of the code point read in the body, so
    // every adjustment of cursor inside the loop must account for it.
    for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {
        c = text.char32At(cursor);

        switch (mode) {
        case 0: // looking for open delimiter
            if (c == openDelimiter) {
                openPos = cursor;
                mode = 1;
                ibuf = 0;
            }
            break;

        case 1: // after open delimiter
            // Look for [-a-zA-Z0-9<>].  If a run of whitespace is found,
            // convert it to a single space.  If closeDelimiter is found,
            // attempt the lookup and return to mode 0.  If any other
            // character is found, return to mode 0 and reprocess it.
            if (u_isWhitespace(c)) {
                // Ignore leading whitespace and collapse runs
                if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {
                    buf[ibuf++] = (UChar)0x0020 /* */;
                    // If we go a bit past the longest possible name then abort
                    if (ibuf == (LONGEST_NAME + 4)) {
                        mode = 0;
                    }
                }
                continue;
            }

            if (c == closeDelimiter) {
                // Index just past the close delimiter.  The delimiter is a
                // UChar32, so it may occupy two code units.
                const int32_t closeEnd = cursor + UTF_CHAR_LENGTH(c);

                // Delete trailing space, if any
                if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {
                    --ibuf;
                }
                buf[ibuf] = 0; // Add terminating zero
                UErrorCode status = U_ZERO_ERROR;

                // Copy the name, including the terminator, into cbuf
                u_UCharsToChars(buf, cbuf, ibuf+1);
                UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);
                if (U_SUCCESS(status)) {
                    // Lookup succeeded
                    str.truncate(0);
                    str.append(ch);
                    text.handleReplaceBetween(openPos, closeEnd, str);

                    // Adjust indices for the change in the length of
                    // the string.  Do not assume that str.length() ==
                    // 1, in case of surrogates.
                    int32_t delta = closeEnd - openPos - str.length();
                    cursor -= delta;
                    limit -= delta;
                    // After the loop increment adds UTF_CHAR_LENGTH(c):
                    // assert(cursor == openPos + str.length());
                }
                // If the lookup failed, we leave things as-is and
                // still switch to mode 0 and continue.
                mode = 0;
                continue;
            }

            // Check if c =~ [-A-Za-z0-9<>]
            if (c == (UChar)0x002D ||
                (c >= (UChar)0x0041 && c <= (UChar)0x005A) ||
                (c >= (UChar)0x0061 && c <= (UChar)0x007A) ||
                (c >= (UChar)0x0030 && c <= (UChar)0x0039) ||
                c == (UChar)0x003C || c == (UChar)0x003E) {
                buf[ibuf++] = (UChar) c;
                // If we go a bit past the longest possible name then abort
                if (ibuf == (LONGEST_NAME + 4)) {
                    mode = 0;
                }
            }

            // Invalid character
            else {
                // Back up by the full length of c so that the loop
                // increment leaves cursor unchanged and this character is
                // reprocessed in mode 0.  Backing up by a single unit would
                // land inside a surrogate pair when c is supplementary.
                cursor -= UTF_CHAR_LENGTH(c);
                mode = 0;
            }

            break;
        }
    }

    offsets.contextLimit += limit - offsets.limit;
    offsets.limit = limit;
    // In incremental mode, only advance the cursor up to the last
    // open delimiter, if we are in mode 1.
    offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
}

U_NAMESPACE_END
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`/*`
			`**********************************************************************`
			`* Copyright (C) 2001, International Business Machines`
			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 06/07/01 aliu Creation.`
			`**********************************************************************`
			`*/`

ICU-1533 Moved new Transliterator subclasses here to make them private. X-SVN-Rev: 6964 2001-11-16 23:51:15 +00:00			`#include "name2uni.h"`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`#include "unicode/unifilt.h"`
ICU-770 Memory cleanup X-SVN-Rev: 6047 2001-10-04 16:36:32 +00:00			`#include "unicode/uchar.h"`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`// As of Unicode 3.0.0, the longest name is 83 characters long.`
			`#define LONGEST_NAME 83`

ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_BEGIN`

			`const char NameUnicodeTransliterator::_ID[] = "Name-Any";`

ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`/**`
			`* Constructs a transliterator.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(`
			`UChar32 openDelim, UChar32 closeDelim,`
			`UnicodeFilter* adoptedFilter) :`
			`Transliterator(_ID, adoptedFilter),`
			`openDelimiter(openDelim),`
			`closeDelimiter(closeDelim) {`
			`}`

			`/**`
			`* Constructs a transliterator with the default delimiters '{' and`
			`* '}'.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(UnicodeFilter* adoptedFilter) :`
			`Transliterator(_ID, adoptedFilter),`
			`openDelimiter((UChar) 0x007B /*{*/),`
			`closeDelimiter((UChar) 0x007D /*}*/) {`
			`}`

			`/**`
			`* Destructor.`
			`*/`
			`NameUnicodeTransliterator::~NameUnicodeTransliterator() {}`

			`/**`
			`* Copy constructor.`
			`*/`
			`NameUnicodeTransliterator::NameUnicodeTransliterator(const NameUnicodeTransliterator& o) :`
			`Transliterator(o),`
			`openDelimiter(o.openDelimiter),`
			`closeDelimiter(o.closeDelimiter) {}`

			`/**`
			`* Assignment operator.`
			`*/`
			`NameUnicodeTransliterator& NameUnicodeTransliterator::operator=(`
			`const NameUnicodeTransliterator& o) {`
			`Transliterator::operator=(o);`
			`openDelimiter = o.openDelimiter;`
			`closeDelimiter = o.closeDelimiter;`
			`return *this;`
			`}`

			`/**`
			`* Transliterator API.`
			`*/`
			`Transliterator* NameUnicodeTransliterator::clone(void) const {`
			`return new NameUnicodeTransliterator(*this);`
			`}`

			`/**`
			`* Implements {@link Transliterator#handleTransliterate}.`
			`*/`
			`void NameUnicodeTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,`
			`UBool isIncremental) const {`
			`// Accomodate the longest possible name plus padding`
ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00			`UChar buf[LONGEST_NAME + 8];`
			`char cbuf[LONGEST_NAME + 8]; // Default converter`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`// The only characters used in names are (as of Unicode 3.0.0):`
			`// -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ`
			`// (first character is a space).`

			`int32_t cursor = offsets.start;`
			`int32_t limit = offsets.limit;`

			`// Modes:`
			`// 0 - looking for open delimiter`
			`// 1 - after open delimiter`
			`int32_t mode = 0;`
			`int32_t ibuf = 0;`
			`int32_t openPos = offsets.start; // position of openDelimiter`

			`UnicodeString str;`

ICU-1373 more fixes to support supplementals X-SVN-Rev: 7286 2001-12-03 21:43:13 +00:00			`UChar32 c;`
			`for (; cursor < limit; cursor+=UTF_CHAR_LENGTH(c)) {`
			`c = text.char32At(cursor);`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00
			`switch (mode) {`
			`case 0: // looking for open delimiter`
			`if (c == openDelimiter) {`
			`openPos = cursor;`
			`mode = 1;`
			`ibuf = 0;`
			`}`
			`break;`

			`case 1: // after open delimiter`
ICU-1681 simply use U_EXTENDED_CHAR_NAME to do the transliteration. X-SVN-Rev: 7658 2002-02-14 05:45:39 +00:00			`// Look for [-a-zA-Z0-9<>]. If \w+ is found, convert it`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`// to a single space. If closeDelimiter is found, exit`
			`// the loop. If any other character is found, exit the`
			`// loop. If the limit is found, exit the loop.`
ICU-770 Memory cleanup X-SVN-Rev: 6047 2001-10-04 16:36:32 +00:00			`if (u_isWhitespace(c)) {`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`// Ignore leading whitespace`
			`if (ibuf != 0 && buf[ibuf-1] != (UChar)0x0020) {`
			`buf[ibuf++] = (UChar)0x0020 /* */;`
			`// If we go a bit past the longest possible name then abort`
			`if (ibuf == (LONGEST_NAME + 4)) {`
			`mode = 0;`
			`}`
			`}`
			`continue;`
			`}`

			`if (c == closeDelimiter) {`
			`// Delete trailing space, if any`
			`if (ibuf > 0 && buf[ibuf-1] == (UChar)0x0020) {`
			`--ibuf;`
			`}`
			`buf[ibuf] = 0; // Add terminating zero`
			`UErrorCode status = U_ZERO_ERROR;`
ICU-989 use default converter for UChar -> char X-SVN-Rev: 5209 2001-07-09 23:26:09 +00:00
ICU-1681 simply use U_EXTENDED_CHAR_NAME to do the transliteration. X-SVN-Rev: 7658 2002-02-14 05:45:39 +00:00			`UChar32 ch;`

			`u_UCharsToChars(buf, cbuf, ibuf+1);`
			`ch = u_charFromName(U_EXTENDED_CHAR_NAME, cbuf, &status);`
			`if (U_SUCCESS(status)) {`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`// Lookup succeeded`
			`str.truncate(0);`
			`str.append(ch);`
			`text.handleReplaceBetween(openPos, cursor+1, str);`

			`// Adjust indices for the change in the length of`
			`// the string. Do not assume that str.length() ==`
			`// 1, in case of surrogates.`
			`int32_t delta = cursor + 1 - openPos - str.length();`
			`cursor -= delta;`
			`limit -= delta;`
			`// assert(cursor == openPos + str.length());`
			`}`
			`// If the lookup failed, we leave things as-is and`
			`// still switch to mode 0 and continue.`
			`mode = 0;`
			`continue;`
			`}`

ICU-1681 simply use U_EXTENDED_CHAR_NAME to do the transliteration. X-SVN-Rev: 7658 2002-02-14 05:45:39 +00:00			`// Check if c =~ [-A-Za-z0-9<> ]`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`if (c == (UChar)0x002D ||`
			`(c >= (UChar)0x0041 && c <= (UChar)0x005A) ||`
ICU-1681 simply use U_EXTENDED_CHAR_NAME to do the transliteration. X-SVN-Rev: 7658 2002-02-14 05:45:39 +00:00			`(c >= (UChar)0x0061 && c <= (UChar)0x007A) ||`
ICU-1681 fix Name-Any and test. Note for Alan: this is just to stabilize the test until we sort out how to change u_charName and the Java issue. Whatever the result of this sorting out is, we can then revert these changes. X-SVN-Rev: 7590 2002-02-07 07:14:42 +00:00			`(c >= (UChar)0x0030 && c <= (UChar)0x0039) ||`
ICU-1681 do not test for U+0020 it is handled already. X-SVN-Rev: 7671 2002-02-14 20:39:27 +00:00			`c == (UChar)0x003C || c == (UChar)0x003E) {`
ICU-989 implement algorithmic Any-Name, Name-Any X-SVN-Rev: 4948 2001-06-11 23:38:54 +00:00			`buf[ibuf++] = (char) c;`
			`// If we go a bit past the longest possible name then abort`
			`if (ibuf == (LONGEST_NAME + 4)) {`
			`mode = 0;`
			`}`
			`}`

			`// Invalid character`
			`else {`
			`--cursor; // Backup and reprocess this character`
			`mode = 0;`
			`}`

			`break;`
			`}`
			`}`

			`offsets.contextLimit += limit - offsets.limit;`
			`offsets.limit = limit;`
			`// In incremental mode, only advance the cursor up to the last`
			`// open delimiter, if we are in mode 1.`
			`offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;`
			`}`
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00
			`U_NAMESPACE_END`