scuffed-code/icu4c/source/i18n/nortrans.cpp

233 lines
7.4 KiB
C++
Raw Normal View History

/*
**********************************************************************
* Copyright (C) 2001-2005, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 07/03/01 aliu Creation.
**********************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uniset.h"
#include "unicode/uiter.h"
#include "nortrans.h"
#include "unormimp.h"
#include "mutex.h"
#include "ucln_in.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
/**
* System registration hook.
*/
void NormalizationTransliterator::registerIDs() {
UErrorCode errorCode = U_ZERO_ERROR;
if(!unorm_haveData(&errorCode)) {
return;
}
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
_create, integerToken(UNORM_NFC));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
_create, integerToken(UNORM_NFKC));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
_create, integerToken(UNORM_NFD));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
_create, integerToken(UNORM_NFKD));
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
UNICODE_STRING_SIMPLE("NFD"), TRUE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
}
/**
* Factory methods
*/
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
Token context) {
return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
}
/**
* Constructs a transliterator.
*/
NormalizationTransliterator::NormalizationTransliterator(
const UnicodeString& id,
UNormalizationMode mode, int32_t opt) :
Transliterator(id, 0) {
fMode = mode;
options = opt;
}
/**
* Destructor.
*/
NormalizationTransliterator::~NormalizationTransliterator() {
}
/**
* Copy constructor.
*/
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
Transliterator(o) {
fMode = o.fMode;
options = o.options;
}
/**
* Assignment operator.
*/
NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
Transliterator::operator=(o);
fMode = o.fMode;
options = o.options;
return *this;
}
/**
* Transliterator API.
*/
Transliterator* NormalizationTransliterator::clone(void) const {
return new NormalizationTransliterator(*this);
}
/**
* Implements {@link Transliterator#handleTransliterate}.
*/
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
UBool isIncremental) const {
// start and limit of the input range
int32_t start = offsets.start;
int32_t limit = offsets.limit;
int32_t length, delta;
if(start >= limit) {
return;
}
// a C code unit iterator, implemented around the Replaceable
UCharIterator iter;
uiter_setReplaceable(&iter, &text);
// the output string and buffer pointer
UnicodeString output;
UChar *buffer;
UBool neededToNormalize;
UErrorCode errorCode;
/*
* Normalize as short chunks at a time as possible even in
* bulk mode, so that styled text is minimally disrupted.
* In incremental mode, a chunk that ends with offsets.limit
* must not be normalized.
*
* If it was known that the input text is not styled, then
* a bulk mode normalization could look like this:
*
UChar staticChars[256];
UnicodeString input;
length = limit - start;
input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
input.releaseBuffer(length);
UErrorCode status = U_ZERO_ERROR;
Normalizer::normalize(input, fMode, options, output, status);
text.handleReplaceBetween(start, limit, output);
int32_t delta = output.length() - length;
offsets.contextLimit += delta;
offsets.limit += delta;
offsets.start = limit + delta;
*
*/
while(start < limit) {
// set the iterator limits for the remaining input range
// this is a moving target because of the replacements in the text object
iter.start = iter.index = start;
iter.limit = limit;
// incrementally normalize a small chunk of the input
buffer = output.getBuffer(-1);
errorCode = U_ZERO_ERROR;
length = unorm_next(&iter, buffer, output.getCapacity(),
fMode, 0,
TRUE, &neededToNormalize,
&errorCode);
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
// use a larger output string buffer and do it again from the start
iter.index = start;
buffer = output.getBuffer(length);
errorCode = U_ZERO_ERROR;
length = unorm_next(&iter, buffer, output.getCapacity(),
fMode, 0,
TRUE, &neededToNormalize,
&errorCode);
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
}
if(U_FAILURE(errorCode)) {
break;
}
limit = iter.index;
if(isIncremental && limit == iter.limit) {
// stop in incremental mode when we reach the input limit
// in case there are additional characters that could change the
// normalization result
// UNLESS all characters in the result of the normalization of
// the last run are in the skippable set
const UChar *s=output.getBuffer();
int32_t i=0, outLength=output.length();
UChar32 c;
while(i<outLength) {
U16_NEXT(s, i, outLength, c);
if(!unorm_isNFSkippable(c, fMode)) {
outLength=-1; // I wish C++ had labeled loops and break outer; ...
break;
}
}
if (outLength<0) {
break;
}
}
if(neededToNormalize) {
// replace the input chunk with its normalized form
text.handleReplaceBetween(start, limit, output);
// update all necessary indexes accordingly
delta = length - (limit - start); // length change in the text object
start = limit += delta; // the next chunk starts where this one ends, with adjustment
limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
offsets.contextLimit += delta;
} else {
// delta == 0
start = limit;
limit = offsets.limit;
}
}
offsets.start = start;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_TRANSLITERATION */