2001-07-03 23:44:45 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
|
|
|
* Copyright (C) 2001, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 07/03/01 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
2001-07-03 23:44:45 +00:00
|
|
|
#include "unicode/nortrans.h"
|
2001-10-19 23:26:48 +00:00
|
|
|
#include "unormimp.h"
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
U_CDECL_BEGIN
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This is an implementation of a code unit (UChar) iterator
|
|
|
|
* based on a Replaceable object.
|
|
|
|
* It is used with the internal API for incremental normalization.
|
|
|
|
*
|
|
|
|
* The UCharIterator.context field holds a pointer to the Replaceable.
|
|
|
|
* UCharIterator.length and UCharIterator.index hold Replaceable.length()
|
|
|
|
* and the iteration index.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
 * UCharIterator "move" callback for the Replaceable-based iterator.
 *
 * Computes a new index as delta code units relative to the given origin
 * (iter->start, iter->index or iter->limit), pins it into the range
 * [iter->start, iter->limit], stores it in iter->index and returns it.
 *
 * If origin is not one of the three handled values, the iterator is left
 * untouched and the current index is returned. (Previously this path
 * fell through with an uninitialized position, which was then compared
 * and stored into iter->index.)
 */
static int32_t U_CALLCONV
replaceableIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin) {
    int32_t pos;

    switch(origin) {
    case UITERATOR_START:
        pos=iter->start+delta;
        break;
    case UITERATOR_CURRENT:
        pos=iter->index+delta;
        break;
    case UITERATOR_END:
        pos=iter->limit+delta;
        break;
    default:
        /* not a valid origin, no move */
        return iter->index;
    }

    /* pin the requested position to the iteration range */
    if(pos<iter->start) {
        pos=iter->start;
    } else if(pos>iter->limit) {
        pos=iter->limit;
    }

    return iter->index=pos;
}
|
|
|
|
|
|
|
|
/*
 * UCharIterator "hasNext" callback: true while the current index
 * has not yet reached iter->limit.
 */
static UBool U_CALLCONV
replaceableIteratorHasNext(UCharIterator *iter) {
    return iter->limit>iter->index;
}
|
|
|
|
|
|
|
|
/*
 * UCharIterator "hasPrevious" callback: true while the current index
 * is still beyond iter->start.
 */
static UBool U_CALLCONV
replaceableIteratorHasPrevious(UCharIterator *iter) {
    return iter->start<iter->index;
}
|
|
|
|
|
|
|
|
/*
 * UCharIterator "current" callback: returns the code unit at the current
 * index without moving, or 0xffff when the index is at or past iter->limit.
 * The code unit is read from the Replaceable stored in iter->context.
 */
static UChar U_CALLCONV
replaceableIteratorCurrent(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        /* nothing left to read */
        return 0xffff;
    }

    Replaceable *rep=(Replaceable *)(iter->context);
    return rep->charAt(iter->index);
}
|
|
|
|
|
|
|
|
/*
 * UCharIterator "next" callback: returns the code unit at the current
 * index and then advances the index by one (post-increment).
 * Returns 0xffff, without moving, when the index is at or past iter->limit.
 */
static UChar U_CALLCONV
replaceableIteratorNext(UCharIterator *iter) {
    if(iter->index>=iter->limit) {
        /* already at the end */
        return 0xffff;
    }

    Replaceable *rep=(Replaceable *)(iter->context);
    int32_t readAt=iter->index;
    iter->index=readAt+1;
    return rep->charAt(readAt);
}
|
|
|
|
|
|
|
|
/*
 * UCharIterator "previous" callback: moves the index back by one and
 * returns the code unit at the new index (pre-decrement).
 * Returns 0xffff, without moving, when the index is at or before iter->start.
 */
static UChar U_CALLCONV
replaceableIteratorPrevious(UCharIterator *iter) {
    if(iter->index<=iter->start) {
        /* already at the beginning */
        return 0xffff;
    }

    Replaceable *rep=(Replaceable *)(iter->context);
    iter->index-=1;
    return rep->charAt(iter->index);
}
|
|
|
|
|
|
|
|
/*
 * Template for a Replaceable-based UCharIterator.
 * The five leading data fields are all zero here; a user copies this
 * struct and then fills in the context pointer and the iteration
 * indexes it needs. The remaining members are the callbacks above.
 */
static const UCharIterator replaceableIterator={
    /* data fields, set by the user of a copy */
    0, 0, 0, 0, 0,

    /* callbacks */
    replaceableIteratorMove,
    replaceableIteratorHasNext,
    replaceableIteratorHasPrevious,
    replaceableIteratorCurrent,
    replaceableIteratorNext,
    replaceableIteratorPrevious
};
|
|
|
|
|
|
|
|
U_CDECL_END
|
|
|
|
|
2001-07-03 23:44:45 +00:00
|
|
|
/**
|
|
|
|
* System registration hook.
|
|
|
|
*/
|
|
|
|
void NormalizationTransliterator::registerIDs() {
|
2001-10-19 23:26:48 +00:00
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
if(!unorm_haveData(&errorCode)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2001-10-17 17:29:34 +00:00
|
|
|
Transliterator::_registerFactory(UnicodeString("Any-NFC", ""),
|
|
|
|
_create, integerToken(UNORM_NFC));
|
|
|
|
Transliterator::_registerFactory(UnicodeString("Any-NFKC", ""),
|
|
|
|
_create, integerToken(UNORM_NFKC));
|
|
|
|
Transliterator::_registerFactory(UnicodeString("Any-NFD", ""),
|
|
|
|
_create, integerToken(UNORM_NFD));
|
|
|
|
Transliterator::_registerFactory(UnicodeString("Any-NFKD", ""),
|
|
|
|
_create, integerToken(UNORM_NFKD));
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Factory methods
|
|
|
|
*/
|
2001-10-17 17:29:34 +00:00
|
|
|
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
|
|
|
|
Token context) {
|
|
|
|
return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructs a transliterator.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator::NormalizationTransliterator(
|
|
|
|
const UnicodeString& id,
|
2001-09-27 23:19:22 +00:00
|
|
|
UNormalizationMode mode, int32_t opt) :
|
2001-07-03 23:44:45 +00:00
|
|
|
Transliterator(id, 0) {
|
2001-09-27 23:19:22 +00:00
|
|
|
fMode = mode;
|
2001-07-03 23:44:45 +00:00
|
|
|
options = opt;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator::~NormalizationTransliterator() {
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copy constructor.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
|
|
|
|
Transliterator(o) {
|
2001-09-27 23:19:22 +00:00
|
|
|
fMode = o.fMode;
|
2001-07-03 23:44:45 +00:00
|
|
|
options = o.options;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Assignment operator.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
|
|
|
|
Transliterator::operator=(o);
|
2001-09-27 23:19:22 +00:00
|
|
|
fMode = o.fMode;
|
2001-07-03 23:44:45 +00:00
|
|
|
options = o.options;
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Transliterator API.
|
|
|
|
*/
|
|
|
|
Transliterator* NormalizationTransliterator::clone(void) const {
|
|
|
|
return new NormalizationTransliterator(*this);
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO
|
|
|
|
// TODO
|
|
|
|
// TODO
|
|
|
|
// Get rid of this function and use the official Replaceable
|
|
|
|
// extractBetween() method, when possible
|
|
|
|
// TODO
|
|
|
|
// TODO
|
|
|
|
// TODO
|
|
|
|
static void _Replaceable_extractBetween(const Replaceable& text,
|
|
|
|
int32_t start,
|
|
|
|
int32_t limit,
|
|
|
|
UChar* buffer) {
|
|
|
|
while (start < limit) {
|
|
|
|
*buffer++ = text.charAt(start++);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implements {@link Transliterator#handleTransliterate}.
|
|
|
|
*/
|
|
|
|
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
|
|
|
UBool isIncremental) const {
|
2001-10-19 23:26:48 +00:00
|
|
|
// start and limit of the input range
|
2001-07-03 23:44:45 +00:00
|
|
|
int32_t start = offsets.start;
|
|
|
|
int32_t limit = offsets.limit;
|
2001-10-19 23:26:48 +00:00
|
|
|
int32_t length, delta;
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
if(start >= limit) {
|
|
|
|
return;
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
// a C code unit iterator, implemented around the Replaceable
|
|
|
|
UCharIterator iter = replaceableIterator;
|
|
|
|
iter.context = &text;
|
|
|
|
// iter.length = text.length(); is not used
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
// the output string and buffer pointer
|
|
|
|
UnicodeString output;
|
|
|
|
UChar *buffer;
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
UErrorCode errorCode;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Normalize as short chunks at a time as possible even in
|
|
|
|
* bulk mode, so that styled text is minimally disrupted.
|
|
|
|
* In incremental mode, a chunk that ends with offsets.limit
|
|
|
|
* must not be normalized.
|
|
|
|
*
|
|
|
|
* If it was known that the input text is not styled, then
|
|
|
|
* a bulk mode normalization could look like this:
|
|
|
|
*
|
|
|
|
|
|
|
|
UChar staticChars[256];
|
|
|
|
UnicodeString input;
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
length = limit - start;
|
|
|
|
input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
|
|
|
|
input.releaseBuffer(length);
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
|
|
Normalizer::normalize(input, fMode, options, output, status);
|
|
|
|
|
|
|
|
text.handleReplaceBetween(start, limit, output);
|
|
|
|
|
|
|
|
int32_t delta = output.length() - length;
|
|
|
|
offsets.contextLimit += delta;
|
|
|
|
offsets.limit += delta;
|
|
|
|
offsets.start = limit + delta;
|
|
|
|
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
while(start < limit) {
|
|
|
|
// set the iterator limits for the remaining input range
|
|
|
|
// this is a moving target because of the replacements in the text object
|
|
|
|
iter.start = iter.index = start;
|
|
|
|
iter.limit = limit;
|
|
|
|
|
|
|
|
// incrementally normalize a small chunk of the input
|
|
|
|
buffer = output.getBuffer(-1);
|
|
|
|
errorCode = U_ZERO_ERROR;
|
|
|
|
length = unorm_nextNormalize(buffer, output.getCapacity(), &iter,
|
|
|
|
fMode, FALSE, &errorCode);
|
|
|
|
output.releaseBuffer(length);
|
|
|
|
|
|
|
|
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
|
|
|
|
// use a larger output string buffer and do it again from the start
|
|
|
|
iter.index = start;
|
|
|
|
buffer = output.getBuffer(length);
|
|
|
|
errorCode = U_ZERO_ERROR;
|
|
|
|
length = unorm_nextNormalize(buffer, output.getCapacity(), &iter,
|
|
|
|
fMode, FALSE, &errorCode);
|
|
|
|
output.releaseBuffer(length);
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
limit = iter.index;
|
|
|
|
if(isIncremental && limit == iter.limit) {
|
|
|
|
// stop in incremental mode when we reach the input limit
|
|
|
|
// in case there are additional characters that could change the
|
|
|
|
// normalization result
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// replace the input chunk with its normalized form
|
2001-07-03 23:44:45 +00:00
|
|
|
text.handleReplaceBetween(start, limit, output);
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
// update all necessary indexes accordingly
|
|
|
|
delta = length - (limit - start); // length change in the text object
|
|
|
|
start = limit += delta; // the next chunk starts where this one ends, with adjustment
|
|
|
|
limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
|
2001-07-03 23:44:45 +00:00
|
|
|
offsets.contextLimit += delta;
|
|
|
|
}
|
2001-10-19 23:26:48 +00:00
|
|
|
|
|
|
|
offsets.start = start;
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
2001-10-08 23:26:58 +00:00
|
|
|
|
|
|
|
U_NAMESPACE_END
|