2001-07-03 23:44:45 +00:00
|
|
|
/*
|
|
|
|
**********************************************************************
|
2010-01-06 23:50:03 +00:00
|
|
|
* Copyright (C) 2001-2010, International Business Machines
|
2001-07-03 23:44:45 +00:00
|
|
|
* Corporation and others. All Rights Reserved.
|
|
|
|
**********************************************************************
|
|
|
|
* Date Name Description
|
|
|
|
* 07/03/01 aliu Creation.
|
|
|
|
**********************************************************************
|
|
|
|
*/
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
#include "unicode/utypes.h"
|
2002-09-20 01:54:48 +00:00
|
|
|
|
|
|
|
#if !UCONFIG_NO_TRANSLITERATION
|
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
#include "unicode/normalizer2.h"
|
|
|
|
#include "cstring.h"
|
2001-11-16 23:51:15 +00:00
|
|
|
#include "nortrans.h"
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-08 23:26:58 +00:00
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2003-08-31 20:53:46 +00:00
|
|
|
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
|
2002-06-29 00:04:16 +00:00
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
/**
 * Wraps a C string pointer in a Transliterator::Token so that it can be
 * handed to the factory registration as an opaque context value.
 * The pointer itself is stored; the characters are not copied here.
 */
static inline Transliterator::Token cstrToken(const char *s) {
    // pointerToken() takes a non-const void *, so constness is dropped here.
    void *opaque = const_cast<char *>(s);
    return Transliterator::pointerToken(opaque);
}
|
|
|
|
|
2001-07-03 23:44:45 +00:00
|
|
|
/**
|
|
|
|
* System registration hook.
|
|
|
|
*/
|
|
|
|
void NormalizationTransliterator::registerIDs() {
|
2010-01-06 23:50:03 +00:00
|
|
|
// In the Token, the byte after the NUL is the UNormalization2Mode.
|
2002-01-22 00:27:49 +00:00
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
|
2010-01-06 23:50:03 +00:00
|
|
|
_create, cstrToken("nfc\0\0"));
|
2002-01-22 00:27:49 +00:00
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
|
2010-01-06 23:50:03 +00:00
|
|
|
_create, cstrToken("nfkc\0\0"));
|
2002-01-22 00:27:49 +00:00
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
|
2010-01-06 23:50:03 +00:00
|
|
|
_create, cstrToken("nfc\0\1"));
|
2002-01-22 00:27:49 +00:00
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
|
2010-01-06 23:50:03 +00:00
|
|
|
_create, cstrToken("nfkc\0\1"));
|
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
|
|
|
|
_create, cstrToken("nfc\0\2"));
|
|
|
|
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
|
|
|
|
_create, cstrToken("nfc\0\3"));
|
2002-01-22 00:27:49 +00:00
|
|
|
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
|
|
|
|
UNICODE_STRING_SIMPLE("NFD"), TRUE);
|
|
|
|
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
|
|
|
|
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
|
2010-01-06 23:50:03 +00:00
|
|
|
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
|
|
|
|
UNICODE_STRING_SIMPLE("NFD"), FALSE);
|
|
|
|
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
|
|
|
|
UNICODE_STRING_SIMPLE("FCD"), FALSE);
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Factory methods
|
|
|
|
*/
|
2001-10-17 17:29:34 +00:00
|
|
|
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
|
|
|
|
Token context) {
|
2010-01-06 23:50:03 +00:00
|
|
|
const char *name = (const char *)context.pointer;
|
|
|
|
UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
|
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
|
|
|
|
if(U_SUCCESS(errorCode)) {
|
|
|
|
return new NormalizationTransliterator(ID, *norm2);
|
|
|
|
} else {
|
|
|
|
return NULL;
|
|
|
|
}
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Constructs a transliterator.
|
|
|
|
*/
|
2010-01-06 23:50:03 +00:00
|
|
|
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
|
|
|
|
const Normalizer2 &norm2) :
|
|
|
|
Transliterator(id, 0), fNorm2(norm2) {}
|
2001-07-03 23:44:45 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Destructor.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator::~NormalizationTransliterator() {
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Copy constructor.
|
|
|
|
*/
|
|
|
|
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
|
2010-01-06 23:50:03 +00:00
|
|
|
Transliterator(o), fNorm2(o.fNorm2) {}
|
2001-07-03 23:44:45 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Transliterator API.
|
|
|
|
*/
|
|
|
|
Transliterator* NormalizationTransliterator::clone(void) const {
|
|
|
|
return new NormalizationTransliterator(*this);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Implements {@link Transliterator#handleTransliterate}.
|
|
|
|
*/
|
|
|
|
void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
|
|
|
|
UBool isIncremental) const {
|
2001-10-19 23:26:48 +00:00
|
|
|
// start and limit of the input range
|
2001-07-03 23:44:45 +00:00
|
|
|
int32_t start = offsets.start;
|
|
|
|
int32_t limit = offsets.limit;
|
2001-10-19 23:26:48 +00:00
|
|
|
if(start >= limit) {
|
|
|
|
return;
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
/*
|
|
|
|
* Normalize as short chunks at a time as possible even in
|
|
|
|
* bulk mode, so that styled text is minimally disrupted.
|
|
|
|
* In incremental mode, a chunk that ends with offsets.limit
|
|
|
|
* must not be normalized.
|
|
|
|
*
|
|
|
|
* If it was known that the input text is not styled, then
|
|
|
|
* a bulk mode normalization could look like this:
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
UnicodeString input, normalized;
|
|
|
|
int32_t length = limit - start;
|
2001-10-19 23:26:48 +00:00
|
|
|
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
|
|
|
|
input.releaseBuffer(length);
|
2001-07-03 23:44:45 +00:00
|
|
|
|
2001-10-19 23:26:48 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
2010-01-06 23:50:03 +00:00
|
|
|
fNorm2.normalize(input, normalized, status);
|
2001-10-19 23:26:48 +00:00
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
text.handleReplaceBetween(start, limit, normalized);
|
2001-10-19 23:26:48 +00:00
|
|
|
|
2010-01-06 23:50:03 +00:00
|
|
|
int32_t delta = normalized.length() - length;
|
2001-10-19 23:26:48 +00:00
|
|
|
offsets.contextLimit += delta;
|
|
|
|
offsets.limit += delta;
|
|
|
|
offsets.start = limit + delta;
|
|
|
|
|
|
|
|
*/
|
2010-01-06 23:50:03 +00:00
|
|
|
UErrorCode errorCode = U_ZERO_ERROR;
|
|
|
|
UnicodeString segment;
|
|
|
|
UnicodeString normalized;
|
|
|
|
UChar32 c = text.char32At(start);
|
|
|
|
do {
|
|
|
|
int32_t prev = start;
|
|
|
|
// Skip at least one character so we make progress.
|
|
|
|
// c holds the character at start.
|
2010-02-04 23:57:28 +00:00
|
|
|
segment.remove();
|
|
|
|
do {
|
2010-01-06 23:50:03 +00:00
|
|
|
segment.append(c);
|
|
|
|
start += U16_LENGTH(c);
|
2010-02-04 23:57:28 +00:00
|
|
|
} while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start)));
|
2010-01-06 23:50:03 +00:00
|
|
|
if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
|
2001-10-19 23:26:48 +00:00
|
|
|
// stop in incremental mode when we reach the input limit
|
|
|
|
// in case there are additional characters that could change the
|
|
|
|
// normalization result
|
2010-01-06 23:50:03 +00:00
|
|
|
start=prev;
|
|
|
|
break;
|
2001-10-19 23:26:48 +00:00
|
|
|
}
|
2010-01-06 23:50:03 +00:00
|
|
|
fNorm2.normalize(segment, normalized, errorCode);
|
|
|
|
if(U_FAILURE(errorCode)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if(segment != normalized) {
|
2002-02-10 00:11:16 +00:00
|
|
|
// replace the input chunk with its normalized form
|
2010-01-06 23:50:03 +00:00
|
|
|
text.handleReplaceBetween(prev, start, normalized);
|
2002-02-10 00:11:16 +00:00
|
|
|
|
|
|
|
// update all necessary indexes accordingly
|
2010-01-06 23:50:03 +00:00
|
|
|
int32_t delta = normalized.length() - (start - prev);
|
|
|
|
start += delta;
|
|
|
|
limit += delta;
|
2002-02-10 00:11:16 +00:00
|
|
|
}
|
2010-01-06 23:50:03 +00:00
|
|
|
} while(start < limit);
|
2001-10-19 23:26:48 +00:00
|
|
|
|
|
|
|
offsets.start = start;
|
2010-01-06 23:50:03 +00:00
|
|
|
offsets.contextLimit += limit - offsets.limit;
|
|
|
|
offsets.limit = limit;
|
2001-07-03 23:44:45 +00:00
|
|
|
}
|
2001-10-08 23:26:58 +00:00
|
|
|
|
|
|
|
U_NAMESPACE_END
|
2002-09-20 01:54:48 +00:00
|
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_TRANSLITERATION */
|