scuffed-code/icu4c/source/i18n/nortrans.cpp

/*
**********************************************************************
*   Copyright (C) 2001-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   07/03/01    aliu        Creation.
**********************************************************************
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_TRANSLITERATION

#include "unicode/uniset.h"
#include "unicode/uiter.h"
#include "nortrans.h"
#include "unormimp.h"
#include "mutex.h"
#include "ucln_in.h"

U_NAMESPACE_BEGIN

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)

/**
 * System registration hook.
 */
void NormalizationTransliterator::registerIDs() {
    UErrorCode errorCode = U_ZERO_ERROR;
    if(!unorm_haveData(&errorCode)) {
        return;
    }

    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
                                     _create, integerToken(UNORM_NFC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
                                     _create, integerToken(UNORM_NFKC));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
                                     _create, integerToken(UNORM_NFD));
    Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
                                     _create, integerToken(UNORM_NFKD));
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
                                            UNICODE_STRING_SIMPLE("NFD"), TRUE);
    Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
                                            UNICODE_STRING_SIMPLE("NFKD"), TRUE);
}

/**
 * Factory method passed to _registerFactory() by registerIDs().
 *
 * @param ID       the ID given to the new transliterator
 * @param context  token whose integer field holds the UNormalizationMode
 *                 that was supplied at registration time
 * @return a newly allocated NormalizationTransliterator constructed with
 *         that mode and an options value of 0
 */
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
                                                     Token context) {
    // Recover the mode that registerIDs() stored via integerToken().
    const UNormalizationMode mode = static_cast<UNormalizationMode>(context.integer);
    return new NormalizationTransliterator(ID, mode, 0);
}

/**
 * Constructs a transliterator.
 *
 * @param id    ID forwarded to the Transliterator base class (together with
 *              0 as the base constructor's second argument)
 * @param mode  normalization mode stored in fMode
 * @param opt   normalization options stored in options
 */
NormalizationTransliterator::NormalizationTransliterator(
                                 const UnicodeString& id,
                                 UNormalizationMode mode, int32_t opt) :
    Transliterator(id, 0),
    fMode(mode),
    options(opt) {
}

/**
 * Destructor.
 *
 * The body is intentionally empty: this class performs no cleanup of its
 * own here. The only members this file assigns (fMode and options) are
 * plain values copied in by the constructors.
 */
NormalizationTransliterator::~NormalizationTransliterator() {
}

/**
 * Copy constructor.
 *
 * Copy-constructs the Transliterator base from o, then duplicates the
 * normalization mode and options of o.
 */
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
    Transliterator(o),
    fMode(o.fMode),
    options(o.options) {
}

/**
 * Assignment operator.
 *
 * Assigns the Transliterator base part first, then copies the
 * normalization options and mode from o.
 *
 * @return a reference to this object
 */
NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
    // Base-class state first.
    Transliterator::operator=(o);

    // Then the two members owned by this class.
    options = o.options;
    fMode = o.fMode;

    return *this;
}

/**
 * Transliterator API.
 *
 * @return a newly allocated copy of this object, produced by the copy
 *         constructor
 */
Transliterator* NormalizationTransliterator::clone(void) const {
    NormalizationTransliterator *duplicate = new NormalizationTransliterator(*this);
    return duplicate;
}

/**
 * Implements {@link Transliterator#handleTransliterate}.
 */
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
                                                      UBool isIncremental) const {
    // start and limit of the input range
    int32_t start = offsets.start;
    int32_t limit = offsets.limit;
    int32_t length, delta;

    if(start >= limit) {
        return;
    }

    // a C code unit iterator, implemented around the Replaceable
    UCharIterator iter;
    uiter_setReplaceable(&iter, &text);

    // the output string and buffer pointer
    UnicodeString output;
    UChar *buffer;
    UBool neededToNormalize;

    UErrorCode errorCode;

    /*
     * Normalize as short chunks at a time as possible even in
     * bulk mode, so that styled text is minimally disrupted.
     * In incremental mode, a chunk that ends with offsets.limit
     * must not be normalized.
     *
     * If it was known that the input text is not styled, then
     * a bulk mode normalization could look like this:
     *

    UChar staticChars[256];
    UnicodeString input;

    length = limit - start;
    input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias

    _Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
    input.releaseBuffer(length);

    UErrorCode status = U_ZERO_ERROR;
    Normalizer::normalize(input, fMode, options, output, status);

    text.handleReplaceBetween(start, limit, output);

    int32_t delta = output.length() - length;
    offsets.contextLimit += delta;
    offsets.limit += delta;
    offsets.start = limit + delta;

     *
     */
    while(start < limit) {
        // set the iterator limits for the remaining input range
        // this is a moving target because of the replacements in the text object
        iter.start = iter.index = start;
        iter.limit = limit;

        // incrementally normalize a small chunk of the input
        buffer = output.getBuffer(-1);
        errorCode = U_ZERO_ERROR;
        length = unorm_next(&iter, buffer, output.getCapacity(),
                            fMode, 0,
                            TRUE, &neededToNormalize,
                            &errorCode);
        output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);

        if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
            // use a larger output string buffer and do it again from the start
            iter.index = start;
            buffer = output.getBuffer(length);
            errorCode = U_ZERO_ERROR;
            length = unorm_next(&iter, buffer, output.getCapacity(),
                                fMode, 0,
                                TRUE, &neededToNormalize,
                                &errorCode);
            output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
        }

        if(U_FAILURE(errorCode)) {
            break;
        }

        limit = iter.index;
        if(isIncremental && limit == iter.limit) {
            // stop in incremental mode when we reach the input limit
            // in case there are additional characters that could change the
            // normalization result

            // UNLESS all characters in the result of the normalization of
            // the last run are in the skippable set
            const UChar *s=output.getBuffer();
            int32_t i=0, outLength=output.length();
            UChar32 c;

            while(i<outLength) {
                U16_NEXT(s, i, outLength, c);
                if(!unorm_isNFSkippable(c, fMode)) {
                    outLength=-1; // I wish C++ had labeled loops and break outer; ...
                    break;
                }
            }
            if (outLength<0) {
                break;
            }
        }

        if(neededToNormalize) {
            // replace the input chunk with its normalized form
            text.handleReplaceBetween(start, limit, output);

            // update all necessary indexes accordingly
            delta = length - (limit - start);   // length change in the text object
            start = limit += delta;             // the next chunk starts where this one ends, with adjustment
            limit = offsets.limit += delta;     // set the iteration limit to the adjusted end of the input range
            offsets.contextLimit += delta;
        } else {
            // delta == 0
            start = limit;
            limit = offsets.limit;
        }
    }

    offsets.start = start;
}

U_NAMESPACE_END

#endif /* #if !UCONFIG_NO_TRANSLITERATION */
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`/*`
			`**********************************************************************`
ICU-4514 pass 0 length to UnicodeString::releaseBuffer() when the buffer contents is undefined due to a U_FAILURE condition X-SVN-Rev: 17972 2005-06-22 02:40:25 +00:00			`* Copyright (C) 2001-2005, International Business Machines`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`* Corporation and others. All Rights Reserved.`
			`**********************************************************************`
			`* Date Name Description`
			`* 07/03/01 aliu Creation.`
			`**********************************************************************`
			`*/`

ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`#include "unicode/utypes.h"`
ICU-2248 modularize icu, allow parts to not be built X-SVN-Rev: 9900 2002-09-20 01:54:48 +00:00
			`#if !UCONFIG_NO_TRANSLITERATION`

ICU-1575 fixed normalizing transliterator to get closer to the end in incremental mode; latin-katakana & fullwidth/halfwidth now pass incremental tests. UnicodeSet has some package-private utilities added -- they should be made public next release. X-SVN-Rev: 7280 2001-12-03 20:50:50 +00:00			`#include "unicode/uniset.h"`
ICU-1686 use public instead of private C API for character iteration X-SVN-Rev: 7621 2002-02-09 21:59:27 +00:00			`#include "unicode/uiter.h"`
ICU-1533 Moved new Transliterator subclasses here to make them private. X-SVN-Rev: 6964 2001-11-16 23:51:15 +00:00			`#include "nortrans.h"`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`#include "unormimp.h"`
ICU-1575 fixed normalizing transliterator to get closer to the end in incremental mode; latin-katakana & fullwidth/halfwidth now pass incremental tests. UnicodeSet has some package-private utilities added -- they should be made public next release. X-SVN-Rev: 7280 2001-12-03 20:50:50 +00:00			`#include "mutex.h"`
			`#include "ucln_in.h"`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00			`U_NAMESPACE_BEGIN`

ICU-3221 Fix AIX linker warnings X-SVN-Rev: 12997 2003-08-31 20:53:46 +00:00			`UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)`
ICU-1962 change UObject: RTTI pure virtual, remove other boilerplate for now X-SVN-Rev: 8977 2002-06-29 00:04:16 +00:00
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`/**`
			`* System registration hook.`
			`*/`
			`void NormalizationTransliterator::registerIDs() {`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`UErrorCode errorCode = U_ZERO_ERROR;`
			`if(!unorm_haveData(&errorCode)) {`
			`return;`
			`}`

ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),`
ICU-1079 add ID and context param to factory X-SVN-Rev: 6276 2001-10-17 17:29:34 +00:00			`_create, integerToken(UNORM_NFC));`
ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),`
ICU-1079 add ID and context param to factory X-SVN-Rev: 6276 2001-10-17 17:29:34 +00:00			`_create, integerToken(UNORM_NFKC));`
ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),`
ICU-1079 add ID and context param to factory X-SVN-Rev: 6276 2001-10-17 17:29:34 +00:00			`_create, integerToken(UNORM_NFD));`
ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),`
ICU-1079 add ID and context param to factory X-SVN-Rev: 6276 2001-10-17 17:29:34 +00:00			`_create, integerToken(UNORM_NFKD));`
ICU-1629 Simplify UnicodeString creation X-SVN-Rev: 7471 2002-01-22 00:27:49 +00:00			`Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),`
			`UNICODE_STRING_SIMPLE("NFD"), TRUE);`
			`Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),`
			`UNICODE_STRING_SIMPLE("NFKD"), TRUE);`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`

			`/**`
			`* Factory methods`
			`*/`
ICU-1079 add ID and context param to factory X-SVN-Rev: 6276 2001-10-17 17:29:34 +00:00			`Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,`
			`Token context) {`
			`return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`

			`/**`
			`* Constructs a transliterator.`
			`*/`
			`NormalizationTransliterator::NormalizationTransliterator(`
			`const UnicodeString& id,`
ICU-1007 change Normalizer::EMode to UNormalizationMode X-SVN-Rev: 5950 2001-09-27 23:19:22 +00:00			`UNormalizationMode mode, int32_t opt) :`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`Transliterator(id, 0) {`
ICU-1007 change Normalizer::EMode to UNormalizationMode X-SVN-Rev: 5950 2001-09-27 23:19:22 +00:00			`fMode = mode;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`options = opt;`
			`}`

			`/**`
			`* Destructor.`
			`*/`
			`NormalizationTransliterator::~NormalizationTransliterator() {`
			`}`

			`/**`
			`* Copy constructor.`
			`*/`
			`NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :`
			`Transliterator(o) {`
ICU-1007 change Normalizer::EMode to UNormalizationMode X-SVN-Rev: 5950 2001-09-27 23:19:22 +00:00			`fMode = o.fMode;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`options = o.options;`
			`}`

			`/**`
			`* Assignment operator.`
			`*/`
			`NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {`
			`Transliterator::operator=(o);`
ICU-1007 change Normalizer::EMode to UNormalizationMode X-SVN-Rev: 5950 2001-09-27 23:19:22 +00:00			`fMode = o.fMode;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`options = o.options;`
			`return *this;`
			`}`

			`/**`
			`* Transliterator API.`
			`*/`
			`Transliterator* NormalizationTransliterator::clone(void) const {`
			`return new NormalizationTransliterator(*this);`
			`}`

			`/**`
			`* Implements {@link Transliterator#handleTransliterate}.`
			`*/`
			`void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,`
			`UBool isIncremental) const {`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`// start and limit of the input range`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`int32_t start = offsets.start;`
			`int32_t limit = offsets.limit;`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`int32_t length, delta;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`if(start >= limit) {`
			`return;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`

ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`// a C code unit iterator, implemented around the Replaceable`
ICU-1686 use public instead of private C API for character iteration X-SVN-Rev: 7621 2002-02-09 21:59:27 +00:00			`UCharIterator iter;`
			`uiter_setReplaceable(&iter, &text);`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`// the output string and buffer pointer`
			`UnicodeString output;`
			`UChar *buffer;`
ICU-1700 public C API for iterative normalization X-SVN-Rev: 7622 2002-02-10 00:11:16 +00:00			`UBool neededToNormalize;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`UErrorCode errorCode;`

			`/*`
			`* Normalize as short chunks at a time as possible even in`
			`* bulk mode, so that styled text is minimally disrupted.`
			`* In incremental mode, a chunk that ends with offsets.limit`
			`* must not be normalized.`
			`*`
			`* If it was known that the input text is not styled, then`
			`* a bulk mode normalization could look like this:`
			`*`

			`UChar staticChars[256];`
			`UnicodeString input;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`length = limit - start;`
			`input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));`
			`input.releaseBuffer(length);`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`UErrorCode status = U_ZERO_ERROR;`
			`Normalizer::normalize(input, fMode, options, output, status);`

			`text.handleReplaceBetween(start, limit, output);`

			`int32_t delta = output.length() - length;`
			`offsets.contextLimit += delta;`
			`offsets.limit += delta;`
			`offsets.start = limit + delta;`

			`*`
			`*/`
			`while(start < limit) {`
			`// set the iterator limits for the remaining input range`
			`// this is a moving target because of the replacements in the text object`
			`iter.start = iter.index = start;`
			`iter.limit = limit;`

			`// incrementally normalize a small chunk of the input`
			`buffer = output.getBuffer(-1);`
			`errorCode = U_ZERO_ERROR;`
ICU-1700 public C API for iterative normalization X-SVN-Rev: 7622 2002-02-10 00:11:16 +00:00			`length = unorm_next(&iter, buffer, output.getCapacity(),`
			`fMode, 0,`
			`TRUE, &neededToNormalize,`
			`&errorCode);`
ICU-4514 pass 0 length to UnicodeString::releaseBuffer() when the buffer contents is undefined due to a U_FAILURE condition X-SVN-Rev: 17972 2005-06-22 02:40:25 +00:00			`output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00
			`if(errorCode == U_BUFFER_OVERFLOW_ERROR) {`
			`// use a larger output string buffer and do it again from the start`
			`iter.index = start;`
			`buffer = output.getBuffer(length);`
			`errorCode = U_ZERO_ERROR;`
ICU-1700 public C API for iterative normalization X-SVN-Rev: 7622 2002-02-10 00:11:16 +00:00			`length = unorm_next(&iter, buffer, output.getCapacity(),`
			`fMode, 0,`
			`TRUE, &neededToNormalize,`
			`&errorCode);`
ICU-4514 pass 0 length to UnicodeString::releaseBuffer() when the buffer contents is undefined due to a U_FAILURE condition X-SVN-Rev: 17972 2005-06-22 02:40:25 +00:00			`output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`

ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`if(U_FAILURE(errorCode)) {`
			`break;`
			`}`

			`limit = iter.index;`
			`if(isIncremental && limit == iter.limit) {`
			`// stop in incremental mode when we reach the input limit`
			`// in case there are additional characters that could change the`
			`// normalization result`
ICU-1575 fixed normalizing transliterator to get closer to the end in incremental mode; latin-katakana & fullwidth/halfwidth now pass incremental tests. UnicodeSet has some package-private utilities added -- they should be made public next release. X-SVN-Rev: 7280 2001-12-03 20:50:50 +00:00
			`// UNLESS all characters in the result of the normalization of`
			`// the last run are in the skippable set`
ICU-1785 replace hardcoded skippables sets with calls to unorm_isNFSkippable() X-SVN-Rev: 10151 2002-11-05 00:53:42 +00:00			`const UChar *s=output.getBuffer();`
			`int32_t i=0, outLength=output.length();`
			`UChar32 c;`

			`while(i<outLength) {`
			`U16_NEXT(s, i, outLength, c);`
			`if(!unorm_isNFSkippable(c, fMode)) {`
			`outLength=-1; // I wish C++ had labeled loops and break outer; ...`
			`break;`
			`}`
			`}`
			`if (outLength<0) {`
ICU-1575 fixed normalizing transliterator to get closer to the end in incremental mode; latin-katakana & fullwidth/halfwidth now pass incremental tests. UnicodeSet has some package-private utilities added -- they should be made public next release. X-SVN-Rev: 7280 2001-12-03 20:50:50 +00:00			`break;`
			`}`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00			`}`

ICU-1700 public C API for iterative normalization X-SVN-Rev: 7622 2002-02-10 00:11:16 +00:00			`if(neededToNormalize) {`
			`// replace the input chunk with its normalized form`
			`text.handleReplaceBetween(start, limit, output);`

			`// update all necessary indexes accordingly`
			`delta = length - (limit - start); // length change in the text object`
			`start = limit += delta; // the next chunk starts where this one ends, with adjustment`
			`limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range`
			`offsets.contextLimit += delta;`
			`} else {`
			`// delta == 0`
			`start = limit;`
			`limit = offsets.limit;`
			`}`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`
ICU-1007 use internal iterative normalization api X-SVN-Rev: 6353 2001-10-19 23:26:48 +00:00
			`offsets.start = start;`
ICU-1029 add NormalizationTransliterator X-SVN-Rev: 5173 2001-07-03 23:44:45 +00:00			`}`
ICU-1264 added namspace support where possible. X-SVN-Rev: 6124 2001-10-08 23:26:58 +00:00
			`U_NAMESPACE_END`
ICU-2248 modularize icu, allow parts to not be built X-SVN-Rev: 9900 2002-09-20 01:54:48 +00:00
			`#endif /* #if !UCONFIG_NO_TRANSLITERATION */`