1999-08-16 21:50:52 +00:00
|
|
|
/*
|
2001-03-21 20:44:20 +00:00
|
|
|
*************************************************************************
|
1999-11-22 21:59:05 +00:00
|
|
|
* COPYRIGHT:
|
2001-03-21 20:44:20 +00:00
|
|
|
* Copyright (c) 1996-2001, International Business Machines Corporation and
|
1999-11-22 21:59:05 +00:00
|
|
|
* others. All Rights Reserved.
|
2001-03-21 20:44:20 +00:00
|
|
|
*************************************************************************
|
1999-11-22 21:59:05 +00:00
|
|
|
*/
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2001-02-03 01:25:27 +00:00
|
|
|
/*
|
|
|
|
* Modification history
|
|
|
|
*
|
|
|
|
* Date Name Description
|
|
|
|
* 02/02/01 synwee Added converters from EMode to UNormalizationMode,
|
|
|
|
* getUNormalizationMode and getNormalizerEMode,
|
|
|
|
* useful in tbcoll and unorm.
|
|
|
|
* Added quickcheck method and incorporated it into
|
|
|
|
* normalize()
|
|
|
|
*/
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
#include "ucmp16.h"
|
|
|
|
#include "dcmpdata.h"
|
|
|
|
#include "compdata.h"
|
|
|
|
|
1999-12-28 23:39:02 +00:00
|
|
|
#include "unicode/normlzr.h"
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#include "unicode/unistr.h"
|
|
|
|
#include "unicode/chariter.h"
|
|
|
|
#include "unicode/schriter.h"
|
|
|
|
#include "unicode/unicode.h"
|
1999-08-16 21:50:52 +00:00
|
|
|
#include "mutex.h"
|
|
|
|
|
2001-06-20 22:24:42 +00:00
|
|
|
/* ### TODO: new implementation */
|
|
|
|
#include "unormimp.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))
|
2001-02-03 01:25:27 +00:00
|
|
|
/**
|
|
|
|
* Maximum initial buffer size.
|
|
|
|
* Used in quickCheck to declare initial array.
|
|
|
|
*/
|
|
|
|
const uint32_t StackBufferLen = 1024;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
inline static void insert(UnicodeString& dest,
|
2000-07-22 00:10:49 +00:00
|
|
|
UTextOffset pos,
|
|
|
|
UChar ch)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
dest.replace(pos, 0, &ch, 1);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Constructors and other boilerplate
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
Normalizer::Normalizer(const UnicodeString& str,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(new StringCharacterIterator(str), mode, 0);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Normalizer::Normalizer(const UnicodeString& str,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode,
|
|
|
|
int32_t opt)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(new StringCharacterIterator(str), mode, opt);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
1999-11-22 21:59:05 +00:00
|
|
|
Normalizer::Normalizer(const UChar* str, int32_t length, EMode mode)
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(new StringCharacterIterator(UnicodeString(str, length)), mode, 0);
|
1999-11-22 21:59:05 +00:00
|
|
|
}
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
Normalizer::Normalizer(const CharacterIterator& iter,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(iter.clone(), mode, 0);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Normalizer::Normalizer(const CharacterIterator& iter,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode,
|
|
|
|
int32_t opt)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(iter.clone(), mode, opt);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::init(CharacterIterator* adoptIter,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode,
|
|
|
|
int32_t options)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
bufferPos = 0;
|
|
|
|
bufferLimit = 0;
|
|
|
|
fOptions = options;
|
|
|
|
currentChar = DONE;
|
|
|
|
fMode = mode;
|
|
|
|
text = adoptIter;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
minDecomp = (int16_t)((fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Normalizer::Normalizer(const Normalizer& copy)
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
init(copy.text->clone(), copy.fMode, copy.fOptions);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
buffer = copy.buffer;
|
|
|
|
bufferPos = copy.bufferPos;
|
|
|
|
bufferLimit = copy.bufferLimit;
|
|
|
|
explodeBuf = copy.explodeBuf;
|
|
|
|
currentChar = copy.currentChar;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Normalizer::~Normalizer()
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
delete text;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
Normalizer*
|
1999-08-16 21:50:52 +00:00
|
|
|
Normalizer::clone() const
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
if(this!=0) {
|
|
|
|
return new Normalizer(*this);
|
|
|
|
} else {
|
|
|
|
return 0;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Generates a hash code for this iterator.
|
|
|
|
*/
|
|
|
|
int32_t Normalizer::hashCode() const
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
return text->hashCode() + fMode + fOptions + bufferPos + bufferLimit;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool Normalizer::operator==(const Normalizer& that) const
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
return *text == *(that.text)
|
|
|
|
&& currentChar == that.currentChar
|
|
|
|
&& buffer == that.buffer
|
|
|
|
&& explodeBuf == that.explodeBuf
|
|
|
|
&& bufferPos == that.bufferPos
|
|
|
|
&& bufferLimit == that.bufferLimit;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Static utility methods
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
void
|
|
|
|
Normalizer::normalize(const UnicodeString& source,
|
2000-07-22 00:10:49 +00:00
|
|
|
EMode mode,
|
|
|
|
int32_t options,
|
|
|
|
UnicodeString& result,
|
2001-08-17 00:21:18 +00:00
|
|
|
UErrorCode &status) {
|
|
|
|
if(source.isBogus()) {
|
|
|
|
result.setToBogus();
|
|
|
|
} else {
|
|
|
|
/* make sure that we do not operate on the same buffer in source and result */
|
|
|
|
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
|
|
|
|
result.fLength=unorm_internalNormalize(result.fArray, result.fCapacity,
|
|
|
|
source.fArray, source.fLength,
|
|
|
|
getUNormalizationMode(mode, status), (options&IGNORE_HANGUL)!=0,
|
|
|
|
UnicodeString::growBuffer, &result,
|
|
|
|
&status);
|
|
|
|
if(U_FAILURE(status)) {
|
2001-06-28 00:34:35 +00:00
|
|
|
result.setToBogus();
|
|
|
|
}
|
|
|
|
}
|
2001-02-03 01:25:27 +00:00
|
|
|
}
|
|
|
|
|
2001-02-15 20:21:21 +00:00
|
|
|
UNormalizationCheckResult
|
|
|
|
Normalizer::quickCheck(const UnicodeString& source,
|
|
|
|
Normalizer::EMode mode,
|
2001-08-17 00:21:18 +00:00
|
|
|
UErrorCode &status) {
|
|
|
|
if(U_FAILURE(status)) {
|
|
|
|
return UNORM_MAYBE;
|
|
|
|
}
|
2001-02-03 01:25:27 +00:00
|
|
|
|
2001-08-17 00:21:18 +00:00
|
|
|
return unorm_quickCheck(source.fArray, source.length(),
|
|
|
|
getUNormalizationMode(mode, status), &status);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Inline functions for 64-bit bitmasks (array of 2 uint32_t)
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
// Clear all bits of the mask
|
|
|
|
inline void emptyBitmask64(uint32_t* mask) {
|
|
|
|
mask[0] = mask[1] = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return TRUE if all bits are clear in the mask
|
|
|
|
inline UBool isEmptyBitmask64(uint32_t* mask) {
|
|
|
|
return (mask[0] == 0) && (mask[1] == 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set a single bit (0..63) of the mask
|
|
|
|
inline void setBitmask64(uint32_t* mask, int32_t bit) {
|
2001-05-22 22:25:48 +00:00
|
|
|
mask[bit >> 5] |= ((uint32_t)1L << (bit & 31));
|
2000-07-26 16:40:43 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Return TRUE if a single bit (0..63) is set in the mask
|
|
|
|
inline UBool isSetBitmask64(uint32_t* mask, int32_t bit) {
|
|
|
|
return (mask[bit >> 5] & (1L << (bit & 31))) != 0;
|
|
|
|
}
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Compose methods
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
void
|
|
|
|
Normalizer::compose(const UnicodeString& source,
|
2000-07-22 00:10:49 +00:00
|
|
|
UBool compat,
|
2001-06-28 00:34:35 +00:00
|
|
|
int32_t options,
|
2000-07-22 00:10:49 +00:00
|
|
|
UnicodeString& result,
|
2001-08-17 00:21:18 +00:00
|
|
|
UErrorCode &status) {
|
|
|
|
if(source.isBogus()) {
|
|
|
|
result.setToBogus();
|
|
|
|
} else {
|
|
|
|
/* make sure that we do not operate on the same buffer in source and result */
|
|
|
|
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
|
|
|
|
result.fLength=unorm_compose(result.fArray, result.fCapacity,
|
|
|
|
source.fArray, source.fLength,
|
|
|
|
compat, (options&IGNORE_HANGUL)!=0,
|
|
|
|
UnicodeString::growBuffer, &result,
|
|
|
|
&status);
|
|
|
|
if(U_FAILURE(status)) {
|
2001-06-28 00:34:35 +00:00
|
|
|
result.setToBogus();
|
2000-07-22 00:10:49 +00:00
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Compose starting with current input character and continuing
|
|
|
|
* until just before the next base char.
|
|
|
|
* <p>
|
|
|
|
* <b>Input</b>:
|
|
|
|
* <ul>
|
|
|
|
* <li>underlying char iter points to first character to decompose
|
|
|
|
* </ul>
|
|
|
|
* <p>
|
|
|
|
* <b>Output:</b>
|
|
|
|
* <ul>
|
|
|
|
* <li>returns first char of decomposition or DONE if at end
|
|
|
|
* <li>Underlying char iter is pointing at next base char or past end
|
|
|
|
* </ul>
|
|
|
|
*/
|
|
|
|
UChar Normalizer::nextCompose()
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
UTextOffset explodePos = EMPTY; // Position in input buffer
|
|
|
|
UTextOffset basePos = 0; // Position of last base in output string
|
|
|
|
uint16_t baseIndex = 0; // Index of last base in "actions" array
|
2000-07-26 16:40:43 +00:00
|
|
|
uint32_t classesSeen[2]; // Combining classes seen since last base
|
1999-08-16 21:50:52 +00:00
|
|
|
uint16_t action;
|
2000-07-22 00:10:49 +00:00
|
|
|
UChar lastBase = 0;
|
|
|
|
UBool chFromText = TRUE;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
// Compatibility explosions have lower indices; skip them if necessary
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT);
|
2001-02-22 19:24:15 +00:00
|
|
|
uint16_t minDecompLocal = (uint16_t)((fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
initBuffer();
|
|
|
|
explodeBuf.truncate(0);
|
|
|
|
|
|
|
|
UChar ch = curForward();
|
|
|
|
|
|
|
|
while (ch != DONE) {
|
|
|
|
// Get the basic info for the character
|
|
|
|
uint16_t charInfo = composeLookup(ch);
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
|
|
|
|
uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
if (type == ComposeData::BASE || (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)) {
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// When we hit a base char in the source text, we can return the text
|
2000-07-26 16:40:43 +00:00
|
|
|
// that's been composed so far. We'll re-process this char next time hrough.
|
1999-08-16 21:50:52 +00:00
|
|
|
break;
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
baseIndex = index;
|
1999-12-08 02:11:04 +00:00
|
|
|
basePos = buffer.length();
|
1999-08-16 21:50:52 +00:00
|
|
|
buffer += ch;
|
|
|
|
lastBase = ch;
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
else if (type == ComposeData::COMBINING)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-26 16:40:43 +00:00
|
|
|
uint32_t cclass = ComposeData::typeBit[index]; // 0..63
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
// We can only combine a character with the base if we haven't
|
|
|
|
// already seen a combining character with the same canonical class.
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index < ComposeData::COMBINING_COUNT
|
|
|
|
&& !isSetBitmask64(classesSeen, cclass)
|
1999-08-16 21:50:52 +00:00
|
|
|
&& (action = composeAction(baseIndex, index)) > 0)
|
|
|
|
{
|
|
|
|
if (action > ComposeData::MAX_COMPOSED) {
|
|
|
|
// Pairwise explosion. Actions above this value are really
|
|
|
|
// indices into an array that in turn contains indices
|
|
|
|
// into the exploding string table
|
|
|
|
// TODO: What if there are unprocessed chars in the explode buffer?
|
|
|
|
UChar newBase = pairExplode(explodeBuf, action);
|
|
|
|
explodePos = 0;
|
|
|
|
buffer[basePos] = newBase;
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
|
1999-08-16 21:50:52 +00:00
|
|
|
lastBase = newBase;
|
|
|
|
} else {
|
|
|
|
// Normal pairwise combination. Replace the base char
|
|
|
|
UChar newBase = (UChar) action;
|
|
|
|
buffer[basePos] = newBase;
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT);
|
1999-08-16 21:50:52 +00:00
|
|
|
lastBase = newBase;
|
|
|
|
}
|
|
|
|
//
|
|
|
|
// Since there are Unicode characters that cannot be combined in arbitrary
|
|
|
|
// order, we have to re-process any combining marks that go with this
|
|
|
|
// base character. There are only four characters in Unicode that have
|
|
|
|
// this problem. If they are fixed in Unicode 3.0, this code can go away.
|
|
|
|
//
|
1999-12-08 02:11:04 +00:00
|
|
|
UTextOffset len = buffer.length();
|
1999-08-16 21:50:52 +00:00
|
|
|
if (len - basePos > 1) {
|
|
|
|
for (UTextOffset j = basePos+1; j < len; j++) {
|
|
|
|
explodeBuf += buffer[j];
|
|
|
|
}
|
|
|
|
buffer.truncate(basePos+1);
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
if (explodePos == EMPTY) explodePos = 0;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// No combination with this character
|
|
|
|
bubbleAppend(buffer, ch, cclass);
|
2000-07-26 16:40:43 +00:00
|
|
|
setBitmask64(classesSeen, cclass); //[cclass >> 5] |= (1L << (cclass & 31));
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
else if (index > minExplode) {
|
|
|
|
// Single exploding character
|
|
|
|
explode(explodeBuf, index);
|
|
|
|
explodePos = 0;
|
|
|
|
}
|
|
|
|
else if (type == ComposeData::HANGUL && minExplode == 0) {
|
|
|
|
// If we're in compatibility mode we need to decompose Hangul to Jamo,
|
|
|
|
// because some of the Jamo might have compatibility decompositions.
|
2001-02-22 19:24:15 +00:00
|
|
|
hangulToJamo(ch, explodeBuf, minDecompLocal);
|
1999-08-16 21:50:52 +00:00
|
|
|
explodePos = 0;
|
|
|
|
}
|
|
|
|
else if (type == ComposeData::INITIAL_JAMO) {
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 0 && chFromText && explodePos == EMPTY) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// When we hit a base char in the source text, we can return the text
|
|
|
|
// that's been composed so far. We'll re-process this char next time through.
|
|
|
|
break;
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
baseIndex = ComposeData::INITIAL_JAMO_INDEX;
|
1999-12-08 02:11:04 +00:00
|
|
|
basePos = buffer.length();
|
1999-08-16 21:50:52 +00:00
|
|
|
buffer += ch;
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
else if (type == ComposeData::MEDIAL_JAMO
|
|
|
|
&& isEmptyBitmask64(classesSeen)
|
2000-07-22 00:10:49 +00:00
|
|
|
&& baseIndex == ComposeData::INITIAL_JAMO_INDEX) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// If the last character was an initial jamo, we can combine it with this
|
|
|
|
// one to create a Hangul character.
|
2000-08-25 15:53:31 +00:00
|
|
|
uint16_t l = (uint16_t)(buffer[basePos] - (UChar)JAMO_LBASE);
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t v = (uint16_t)(ch - JAMO_VBASE);
|
|
|
|
buffer[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
baseIndex = ComposeData::MEDIAL_JAMO_INDEX;
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
else if (type == ComposeData::FINAL_JAMO
|
|
|
|
&& isEmptyBitmask64(classesSeen)
|
2000-07-22 00:10:49 +00:00
|
|
|
&& baseIndex == ComposeData::MEDIAL_JAMO_INDEX) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// If the last character was a medial jamo that we turned into Hangul,
|
|
|
|
// we can add this character too.
|
2000-08-08 22:15:50 +00:00
|
|
|
buffer[basePos] = (UChar)(buffer[basePos] + (ch - JAMO_TBASE));
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
baseIndex = 0;
|
|
|
|
basePos = -1;
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
} else {
|
|
|
|
// TODO: deal with JAMO character types
|
|
|
|
baseIndex = 0;
|
|
|
|
basePos = -1;
|
2000-07-26 16:40:43 +00:00
|
|
|
emptyBitmask64(classesSeen);
|
1999-08-16 21:50:52 +00:00
|
|
|
buffer += ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (explodePos == EMPTY) {
|
|
|
|
ch = text->next();
|
|
|
|
chFromText = TRUE;
|
|
|
|
} else {
|
|
|
|
ch = explodeBuf[explodePos++];
|
1999-12-08 02:11:04 +00:00
|
|
|
if (explodePos >= explodeBuf.length()) {
|
1999-08-16 21:50:52 +00:00
|
|
|
explodePos = EMPTY;
|
|
|
|
explodeBuf.truncate(0);
|
|
|
|
}
|
|
|
|
chFromText = FALSE;
|
|
|
|
}
|
|
|
|
}
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 0) {
|
|
|
|
bufferLimit = buffer.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
ch = buffer[0];
|
|
|
|
} else {
|
|
|
|
ch = DONE;
|
|
|
|
bufferLimit = 0;
|
|
|
|
}
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Compose starting with the input UChar just before the current position
|
|
|
|
* and continuing backward until (and including) the previous base char.
|
|
|
|
* <p>
|
|
|
|
* <b>Input</b>:
|
|
|
|
* <ul>
|
|
|
|
* <li>underlying char iter points just after last char to decompose
|
|
|
|
* </ul>
|
|
|
|
* <p>
|
|
|
|
* <b>Output:</b>
|
|
|
|
* <ul>
|
|
|
|
* <li>returns last char of resulting decomposition sequence
|
|
|
|
* <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
|
|
|
|
* </ul>
|
|
|
|
*/
|
|
|
|
UChar Normalizer::prevCompose()
|
|
|
|
{
|
1999-10-07 00:07:53 +00:00
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
2000-07-26 16:40:43 +00:00
|
|
|
|
|
|
|
// Compatibility explosions have lower indices; skip them if necessary
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT);
|
2000-07-26 16:40:43 +00:00
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
initBuffer();
|
|
|
|
|
|
|
|
// Slurp up characters until we hit a base char or an initial Jamo
|
|
|
|
UChar ch;
|
|
|
|
while ((ch = curBackward()) != DONE) {
|
|
|
|
insert(buffer, 0, ch);
|
|
|
|
|
|
|
|
// Get the basic info for the character
|
|
|
|
uint16_t charInfo = composeLookup(ch);
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
|
|
|
|
uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
if (type == ComposeData::BASE
|
|
|
|
|| (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)
|
|
|
|
|| type == ComposeData::HANGUL
|
|
|
|
|| type == ComposeData::INITIAL_JAMO)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// If there's more than one character in the buffer, compose it all at once....
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 0) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// TODO: The performance of this is awful; add a way to compose
|
|
|
|
// a UnicodeString& in place.
|
2000-07-22 00:10:49 +00:00
|
|
|
UnicodeString composed;
|
2000-08-15 18:25:20 +00:00
|
|
|
compose(buffer, (fMode & COMPAT_BIT) != 0, fOptions, composed, status);
|
2000-07-22 00:10:49 +00:00
|
|
|
buffer.truncate(0);
|
|
|
|
buffer += composed;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 1) {
|
|
|
|
bufferLimit = bufferPos = buffer.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
ch = buffer[bufferPos];
|
|
|
|
} else {
|
|
|
|
ch = buffer[0];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
ch = DONE;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) {
|
|
|
|
UTextOffset i;
|
2001-04-02 19:29:50 +00:00
|
|
|
for (i = target.length() - 1; i >= 0; --i) {
|
1999-08-16 21:50:52 +00:00
|
|
|
uint32_t iClass = getComposeClass(target[i]);
|
|
|
|
|
|
|
|
if (iClass == 1 || iClass <= cclass) { // 1 means combining class 0
|
|
|
|
// We've hit something we can't bubble this character past, so insert here
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// We need to insert just after character "i"
|
|
|
|
insert(target, i+1, ch);
|
|
|
|
}
|
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
/**
|
|
|
|
* Return the composing class of a character, as stored in the ComposeData
|
|
|
|
* table. This is not the composing class as listed in the raw Unicode
|
|
|
|
* database, but an equivalent remapped value. Values are remapped so they
|
|
|
|
* fit in a sequential range from 0..n, where n < 64, and relative order
|
|
|
|
* is preserved.
|
|
|
|
* @return the composing class of ch, from 0..63
|
|
|
|
*/
|
1999-08-16 21:50:52 +00:00
|
|
|
uint32_t Normalizer::getComposeClass(UChar ch) {
|
|
|
|
uint32_t cclass = 0;
|
|
|
|
uint16_t charInfo = composeLookup(ch);
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
if (type == ComposeData::COMBINING) {
|
|
|
|
cclass = ComposeData::typeBit[charInfo >> ComposeData::INDEX_SHIFT];
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
return cclass;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint16_t Normalizer::composeLookup(UChar ch) {
|
2000-07-22 00:10:49 +00:00
|
|
|
return ucmp16_getu(ComposeData::lookup, ch);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex)
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
return ucmp16_getu(ComposeData::actions,
|
|
|
|
((UChar)(baseIndex + ComposeData::MAX_BASES*comIndex)));
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::explode(UnicodeString& target, uint16_t index) {
|
|
|
|
UChar ch;
|
2000-07-22 00:10:49 +00:00
|
|
|
while ((ch = ComposeData::replace[index++]) != 0) {
|
|
|
|
target += ch;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) {
|
|
|
|
uint16_t index = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED];
|
2000-08-11 21:56:35 +00:00
|
|
|
explode(target, (uint16_t)(index + 1));
|
1999-08-16 21:50:52 +00:00
|
|
|
return ComposeData::replace[index]; // New base char
|
|
|
|
}
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Decompose methods
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
void
|
|
|
|
Normalizer::decompose(const UnicodeString& source,
|
2000-07-22 00:10:49 +00:00
|
|
|
UBool compat,
|
|
|
|
int32_t options,
|
|
|
|
UnicodeString& result,
|
2001-08-17 00:21:18 +00:00
|
|
|
UErrorCode &status) {
|
|
|
|
if(source.isBogus()) {
|
|
|
|
result.setToBogus();
|
|
|
|
} else {
|
|
|
|
/* make sure that we do not operate on the same buffer in source and result */
|
|
|
|
result.cloneArrayIfNeeded(-1, source.length()+20, FALSE);
|
|
|
|
result.fLength=unorm_decompose(result.fArray, result.fCapacity,
|
|
|
|
source.fArray, source.fLength,
|
|
|
|
compat, (options&IGNORE_HANGUL)!=0,
|
|
|
|
UnicodeString::growBuffer, &result,
|
|
|
|
&status);
|
|
|
|
if(U_FAILURE(status)) {
|
2001-06-20 22:24:42 +00:00
|
|
|
result.setToBogus();
|
2000-07-22 00:10:49 +00:00
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Decompose starting with current input character and continuing
|
|
|
|
* until just before the next base char.
|
|
|
|
* <p>
|
|
|
|
* <b>Input</b>:
|
|
|
|
* <ul>
|
|
|
|
* <li>underlying char iter points to first character to decompose
|
|
|
|
* </ul>
|
|
|
|
* <p>
|
|
|
|
* <b>Output:</b>
|
|
|
|
* <ul>
|
|
|
|
* <li>returns first char of decomposition or DONE if at end
|
|
|
|
* <li>Underlying char iter is pointing at next base char or past end
|
|
|
|
* </ul>
|
|
|
|
*/
|
|
|
|
UChar Normalizer::nextDecomp()
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
UBool hangul = ((fOptions & IGNORE_HANGUL) == 0);
|
|
|
|
UChar ch = curForward();
|
2000-07-26 16:40:43 +00:00
|
|
|
int32_t i;
|
2000-07-22 00:10:49 +00:00
|
|
|
uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
|
2000-08-08 22:15:50 +00:00
|
|
|
int16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index > minDecomp ||
|
2000-07-22 00:10:49 +00:00
|
|
|
ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
initBuffer();
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index > minDecomp) {
|
|
|
|
doAppend((const UChar*)(DecompData::contents), index, buffer);
|
|
|
|
|
|
|
|
if ((offset & DecompData::DECOMP_RECURSE) != 0) {
|
|
|
|
// Need to decompose the output of this decomposition recursively.
|
|
|
|
for (i = 0; i < buffer.length(); i++) {
|
|
|
|
ch = buffer.charAt(i);
|
2000-08-08 22:15:50 +00:00
|
|
|
int16_t index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
|
|
|
|
& DecompData::DECOMP_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index > minDecomp) {
|
|
|
|
i += doReplace((const UChar*)(DecompData::contents), index, buffer, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2000-07-22 00:10:49 +00:00
|
|
|
} else {
|
|
|
|
buffer += ch;
|
|
|
|
}
|
|
|
|
UBool needToReorder = FALSE;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
// Any other combining chacters that immediately follow the decomposed
|
|
|
|
// character must be included in the buffer too, because they're
|
|
|
|
// conceptually part of the same logical character.
|
|
|
|
while ((ch = text->next()) != DONE
|
|
|
|
&& ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
needToReorder = TRUE;
|
2000-07-26 16:40:43 +00:00
|
|
|
// Decompose any of these characters that need it - Liu
|
2000-08-08 22:15:50 +00:00
|
|
|
index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
|
|
|
|
& DecompData::DECOMP_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index > minDecomp) {
|
|
|
|
doAppend((const UChar*)DecompData::contents, index, buffer);
|
|
|
|
} else {
|
|
|
|
buffer += ch;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
if (buffer.length() > 1 && needToReorder) {
|
|
|
|
// If there is more than one combining character in the buffer,
|
|
|
|
// put them into the canonical order.
|
|
|
|
// But we don't need to sort if only characters are the ones that
|
|
|
|
// resulted from decomosing the base character.
|
|
|
|
fixCanonical(buffer);
|
|
|
|
}
|
|
|
|
bufferLimit = buffer.length() - 1;
|
|
|
|
ch = buffer[0];
|
1999-08-16 21:50:52 +00:00
|
|
|
} else {
|
2000-07-22 00:10:49 +00:00
|
|
|
// Just use this character, but first advance to the next one
|
|
|
|
text->next();
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
// Do Hangul -> Jamo decomposition if necessary
|
|
|
|
if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
|
|
|
|
initBuffer();
|
|
|
|
hangulToJamo(ch, buffer, minDecomp);
|
|
|
|
bufferLimit = buffer.length() - 1;
|
|
|
|
ch = buffer[0];
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
2000-07-22 00:10:49 +00:00
|
|
|
return ch;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Decompose starting with the input char just before the current position
|
|
|
|
* and continuing backward until (and including) the previous base char.
|
|
|
|
* <p>
|
|
|
|
* <b>Input</b>:
|
|
|
|
* <ul>
|
|
|
|
* <li>underlying char iter points just after last char to decompose
|
|
|
|
* </ul>
|
|
|
|
* <p>
|
|
|
|
* <b>Output:</b>
|
|
|
|
* <ul>
|
|
|
|
* <li>returns last char of resulting decomposition sequence
|
|
|
|
* <li>underlying iter points to lowest-index char we decomposed, i.e. the base char
|
|
|
|
* </ul>
|
|
|
|
*/
|
|
|
|
UChar Normalizer::prevDecomp() {
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool hangul = (fOptions & IGNORE_HANGUL) == 0;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
UChar ch = curBackward();
|
|
|
|
|
|
|
|
uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
|
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
if (offset > minDecomp ||
|
|
|
|
ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
|
|
|
initBuffer();
|
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
// This method rewritten to pass conformance tests. - Liu
|
|
|
|
// Collect all characters up to the previous base char
|
|
|
|
while (ch != DONE) {
|
|
|
|
buffer.insert(0, ch);
|
|
|
|
if (ucmp8_get(DecompData::canonClass, ch) == DecompData::BASE) break;
|
1999-08-16 21:50:52 +00:00
|
|
|
ch = text->previous();
|
|
|
|
}
|
2000-07-26 16:40:43 +00:00
|
|
|
|
|
|
|
// Decompose the buffer
|
|
|
|
int32_t i;
|
|
|
|
for (i = 0; i < buffer.length(); i++) {
|
|
|
|
ch = buffer.charAt(i);
|
|
|
|
offset = ucmp16_getu(DecompData::offsets, ch);
|
2000-08-08 22:15:50 +00:00
|
|
|
int16_t index = (int16_t)(offset & DecompData::DECOMP_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
|
|
|
|
if (index > minDecomp) {
|
|
|
|
int j = doReplace((const UChar*)(DecompData::contents), index, buffer, i);
|
|
|
|
if ((offset & DecompData::DECOMP_RECURSE) != 0) {
|
|
|
|
// Need to decompose this recursively
|
|
|
|
for (; i < j; ++i) {
|
|
|
|
ch = buffer.charAt(i);
|
2000-08-08 22:15:50 +00:00
|
|
|
index = (int16_t)(ucmp16_getu(DecompData::offsets, ch)
|
|
|
|
& DecompData::DECOMP_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
if (index > minDecomp) {
|
|
|
|
i += doReplace((const UChar*)(DecompData::contents), index, buffer, i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
i = j;
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
|
1999-12-08 02:11:04 +00:00
|
|
|
if (buffer.length() > 1) {
|
1999-08-16 21:50:52 +00:00
|
|
|
// If there is more than one combining character in the buffer,
|
|
|
|
// put them into the canonical order.
|
|
|
|
fixCanonical(buffer);
|
|
|
|
}
|
1999-12-08 02:11:04 +00:00
|
|
|
bufferLimit = bufferPos = buffer.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
ch = buffer[bufferPos];
|
|
|
|
}
|
|
|
|
else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) {
|
|
|
|
initBuffer();
|
|
|
|
hangulToJamo(ch, buffer, minDecomp);
|
1999-12-08 02:11:04 +00:00
|
|
|
bufferLimit = bufferPos = buffer.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
ch = buffer[bufferPos];
|
|
|
|
}
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint8_t Normalizer::getClass(UChar ch) {
|
|
|
|
return ucmp8_get(DecompData::canonClass, ch);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Fixes the sorting sequence of non-spacing characters according to
|
|
|
|
* their combining class. The algorithm is listed on p.3-11 in the
|
|
|
|
* Unicode Standard 2.0. The table of combining classes is on p.4-2
|
|
|
|
* in the Unicode Standard 2.0.
|
|
|
|
* @param result the string to fix.
|
|
|
|
*/
|
|
|
|
void Normalizer::fixCanonical(UnicodeString& result) {
|
1999-12-08 02:11:04 +00:00
|
|
|
UTextOffset i = result.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
uint8_t currentType = getClass(result[i]);
|
|
|
|
uint8_t lastType;
|
|
|
|
|
|
|
|
for (--i; i >= 0; --i) {
|
|
|
|
lastType = currentType;
|
|
|
|
currentType = getClass(result[i]);
|
|
|
|
|
|
|
|
//
|
|
|
|
// a swap is presumed to be rare (and a double-swap very rare),
|
|
|
|
// so don't worry about efficiency here.
|
|
|
|
//
|
|
|
|
if (currentType > lastType && lastType != DecompData::BASE) {
|
|
|
|
// swap characters
|
|
|
|
UChar temp = result[i];
|
|
|
|
result[i] = result[i+1];
|
|
|
|
result[i+1] = temp;
|
|
|
|
|
|
|
|
// if not at end, backup (one further, to compensate for for-loop)
|
1999-12-08 02:11:04 +00:00
|
|
|
if (i < result.length() - 2) {
|
1999-08-16 21:50:52 +00:00
|
|
|
i += 2;
|
|
|
|
}
|
|
|
|
// reset type, since we swapped.
|
|
|
|
currentType = getClass(result[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// CharacterIterator overrides
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the current character in the normalized text.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer:: current() const
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
// TODO: make this method const and guarantee that currentChar is always set?
|
|
|
|
Normalizer *nonConst = (Normalizer*)this;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
if (currentChar == DONE) {
|
|
|
|
switch (fMode) {
|
|
|
|
case NO_OP:
|
|
|
|
nonConst->currentChar = text->current();
|
|
|
|
break;
|
|
|
|
case COMPOSE:
|
|
|
|
case COMPOSE_COMPAT:
|
|
|
|
nonConst->currentChar = nonConst->nextCompose();
|
|
|
|
break;
|
|
|
|
case DECOMP:
|
|
|
|
case DECOMP_COMPAT:
|
|
|
|
nonConst->currentChar = nonConst->nextDecomp();
|
|
|
|
break;
|
2001-08-17 20:37:13 +00:00
|
|
|
case FCD:
|
|
|
|
/* ### TODO */
|
|
|
|
break;
|
2000-07-22 00:10:49 +00:00
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
2000-07-22 00:10:49 +00:00
|
|
|
return currentChar;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the first character in the normalized text. This resets
|
|
|
|
* the <tt>Normalizer's</tt> position to the beginning of the text.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer::first() {
|
1999-08-16 21:50:52 +00:00
|
|
|
return setIndex(text->startIndex());
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the last character in the normalized text. This resets
|
|
|
|
* the <tt>Normalizer's</tt> position to be just before the
|
|
|
|
* the input text corresponding to that normalized character.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer::last() {
|
2000-07-22 00:10:49 +00:00
|
|
|
text->setIndex(text->endIndex());
|
1999-08-16 21:50:52 +00:00
|
|
|
|
2000-07-22 00:10:49 +00:00
|
|
|
currentChar = DONE; // The current char hasn't been processed
|
|
|
|
clearBuffer(); // The buffer is empty too
|
|
|
|
return previous();
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the next character in the normalized text and advance
|
|
|
|
* the iteration position by one. If the end
|
|
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer::next() {
|
2000-07-22 00:10:49 +00:00
|
|
|
if (bufferPos < bufferLimit) {
|
|
|
|
// There are output characters left in the buffer
|
|
|
|
currentChar = buffer[++bufferPos];
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
2000-07-22 00:10:49 +00:00
|
|
|
else {
|
|
|
|
bufferLimit = bufferPos = 0; // Buffer is now out of date
|
|
|
|
switch (fMode) {
|
|
|
|
case NO_OP:
|
|
|
|
currentChar = text->next();
|
|
|
|
break;
|
|
|
|
case COMPOSE:
|
|
|
|
case COMPOSE_COMPAT:
|
|
|
|
currentChar = nextCompose();
|
|
|
|
break;
|
|
|
|
case DECOMP:
|
|
|
|
case DECOMP_COMPAT:
|
|
|
|
currentChar = nextDecomp();
|
|
|
|
break;
|
2001-08-17 20:37:13 +00:00
|
|
|
case FCD:
|
|
|
|
/* ### TODO */
|
|
|
|
break;
|
2000-07-22 00:10:49 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return currentChar;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Return the previous character in the normalized text and decrement
|
|
|
|
* the iteration position by one. If the beginning
|
|
|
|
* of the text has already been reached, {@link #DONE} is returned.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer::previous()
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
if (bufferPos > 0) {
|
|
|
|
// There are output characters left in the buffer
|
|
|
|
currentChar = buffer[--bufferPos];
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
bufferLimit = bufferPos = 0; // Buffer is now out of date
|
|
|
|
switch (fMode) {
|
|
|
|
case NO_OP:
|
|
|
|
currentChar = text->previous();
|
|
|
|
break;
|
|
|
|
case COMPOSE:
|
|
|
|
case COMPOSE_COMPAT:
|
|
|
|
currentChar = prevCompose();
|
|
|
|
break;
|
|
|
|
case DECOMP:
|
|
|
|
case DECOMP_COMPAT:
|
|
|
|
currentChar = prevDecomp();
|
|
|
|
break;
|
2001-08-17 20:37:13 +00:00
|
|
|
case FCD:
|
|
|
|
/* ### TODO */
|
|
|
|
break;
|
2000-07-22 00:10:49 +00:00
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
2000-07-22 00:10:49 +00:00
|
|
|
return currentChar;
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::reset()
|
|
|
|
{
|
|
|
|
text->setIndex(text->startIndex());
|
|
|
|
currentChar = DONE; // The current char hasn't been processed
|
|
|
|
clearBuffer(); // The buffer is empty too
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Set the iteration position in the input text that is being normalized
|
|
|
|
* and return the first normalized character at that position.
|
|
|
|
* <p>
|
|
|
|
* <b>Note:</b> This method sets the position in the <em>input</em> text,
|
|
|
|
* while {@link #next} and {@link #previous} iterate through characters
|
|
|
|
* in the normalized <em>output</em>. This means that there is not
|
|
|
|
* necessarily a one-to-one correspondence between characters returned
|
|
|
|
* by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
|
|
|
|
* returned from <tt>setIndex</tt> and {@link #getIndex}.
|
|
|
|
* <p>
|
|
|
|
* @param index the desired index in the input text.
|
|
|
|
*
|
|
|
|
* @return the first normalized character that is the result of iterating
|
|
|
|
* forward starting at the given index.
|
|
|
|
*
|
|
|
|
* @throws IllegalArgumentException if the given index is less than
|
|
|
|
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
|
|
|
|
*/
|
2000-05-24 20:56:19 +00:00
|
|
|
UChar32 Normalizer::setIndex(UTextOffset index)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
|
|
|
text->setIndex(index); // Checks range
|
|
|
|
currentChar = DONE; // The current char hasn't been processed
|
|
|
|
clearBuffer(); // The buffer is empty too
|
|
|
|
|
|
|
|
return current();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Retrieve the current iteration position in the input text that is
|
|
|
|
* being normalized. This method is useful in applications such as
|
|
|
|
* searching, where you need to be able to determine the position in
|
|
|
|
* the input text that corresponds to a given normalized output character.
|
|
|
|
* <p>
|
|
|
|
* <b>Note:</b> This method sets the position in the <em>input</em>, while
|
|
|
|
* {@link #next} and {@link #previous} iterate through characters in the
|
|
|
|
* <em>output</em>. This means that there is not necessarily a one-to-one
|
|
|
|
* correspondence between characters returned by <tt>next</tt> and
|
|
|
|
* <tt>previous</tt> and the indices passed to and returned from
|
|
|
|
* <tt>setIndex</tt> and {@link #getIndex}.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
UTextOffset Normalizer::getIndex() const {
|
|
|
|
return text->getIndex();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Retrieve the index of the start of the input text. This is the begin index
|
|
|
|
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
|
|
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
|
|
*/
|
|
|
|
UTextOffset Normalizer::startIndex() const {
|
|
|
|
return text->startIndex();
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Retrieve the index of the end of the input text. This is the end index
|
|
|
|
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
|
|
|
* over which this <tt>Normalizer</tt> is iterating
|
|
|
|
*/
|
|
|
|
UTextOffset Normalizer::endIndex() const {
|
|
|
|
return text->endIndex();
|
|
|
|
}
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Property access methods
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
void
|
|
|
|
Normalizer::setMode(EMode newMode)
|
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
fMode = newMode;
|
2000-08-08 22:15:50 +00:00
|
|
|
minDecomp = (int16_t)(((fMode & COMPAT_BIT) != 0) ? 0 : DecompData::MAX_COMPAT);
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
Normalizer::EMode
|
|
|
|
Normalizer::getMode() const
|
|
|
|
{
|
|
|
|
return fMode;
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
Normalizer::setOption(int32_t option,
|
2000-07-22 00:10:49 +00:00
|
|
|
UBool value)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
if (value) {
|
|
|
|
fOptions |= option;
|
|
|
|
} else {
|
|
|
|
fOptions &= (~option);
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
2000-05-18 22:08:39 +00:00
|
|
|
UBool
|
1999-08-16 21:50:52 +00:00
|
|
|
Normalizer::getOption(int32_t option) const
|
|
|
|
{
|
|
|
|
return (fOptions & option) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
|
|
* The iteration position is set to the beginning of the input text.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
Normalizer::setText(const UnicodeString& newText,
|
2000-07-22 00:10:49 +00:00
|
|
|
UErrorCode &status)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
CharacterIterator *newIter = new StringCharacterIterator(newText);
|
|
|
|
if (newIter == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
delete text;
|
|
|
|
text = newIter;
|
|
|
|
reset();
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
|
|
|
* The iteration position is set to the beginning of the string.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
Normalizer::setText(const CharacterIterator& newText,
|
2000-07-22 00:10:49 +00:00
|
|
|
UErrorCode &status)
|
1999-08-16 21:50:52 +00:00
|
|
|
{
|
2000-07-22 00:10:49 +00:00
|
|
|
if (U_FAILURE(status)) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
CharacterIterator *newIter = newText.clone();
|
|
|
|
if (newIter == NULL) {
|
|
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
delete text;
|
|
|
|
text = newIter;
|
|
|
|
reset();
|
1999-08-16 21:50:52 +00:00
|
|
|
}
|
|
|
|
|
1999-11-22 21:59:05 +00:00
|
|
|
void
|
|
|
|
Normalizer::setText(const UChar* newText,
|
|
|
|
int32_t length,
|
2000-07-22 00:10:49 +00:00
|
|
|
UErrorCode &status)
|
1999-11-22 21:59:05 +00:00
|
|
|
{
|
|
|
|
setText(UnicodeString(newText, length), status);
|
|
|
|
}
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Copies the text under iteration into the UnicodeString referred to by "result".
|
|
|
|
* @param result Receives a copy of the text under iteration.
|
|
|
|
*/
|
|
|
|
void
|
|
|
|
Normalizer::getText(UnicodeString& result)
|
|
|
|
{
|
|
|
|
text->getText(result);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
// Private utility methods
|
|
|
|
//-------------------------------------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
UChar Normalizer::curForward() {
|
|
|
|
UChar ch = text->current();
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
UChar Normalizer::curBackward() {
|
|
|
|
UChar ch = text->previous();
|
|
|
|
return ch;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest) {
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
|
|
|
|
uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
if (length == 0) {
|
|
|
|
UChar ch;
|
|
|
|
while ((ch = source[index++]) != 0x0000) {
|
|
|
|
dest += ch;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
while (length-- > 0) {
|
|
|
|
dest += source[index++];
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos)
|
|
|
|
{
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
|
|
|
|
uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
if (length == 0) {
|
|
|
|
UChar ch;
|
|
|
|
while ((ch = source[index++]) != 0x0000) {
|
|
|
|
insert(dest, pos++, ch);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
while (length-- > 0) {
|
|
|
|
insert(dest, pos++, source[index++]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2000-07-26 16:40:43 +00:00
|
|
|
uint16_t Normalizer::doReplace(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos) {
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT);
|
|
|
|
uint16_t length = (int16_t)(offset & STR_LENGTH_MASK);
|
2000-07-26 16:40:43 +00:00
|
|
|
uint16_t i;
|
|
|
|
|
|
|
|
dest.setCharAt(pos++, source[index++]);
|
|
|
|
if (length == 0) {
|
|
|
|
UChar ch;
|
|
|
|
while ((ch = source[index++]) != 0x0000) {
|
|
|
|
insert(dest, pos++, ch);
|
|
|
|
length++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (i = 1; i < length; i++) {
|
|
|
|
dest.insert(pos++, source[index++]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return length;
|
|
|
|
}
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
void Normalizer::initBuffer() {
|
|
|
|
buffer.truncate(0);
|
|
|
|
clearBuffer();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::clearBuffer() {
|
|
|
|
bufferLimit = bufferPos = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
//-----------------------------------------------------------------------------
|
|
|
|
// Hangul / Jamo conversion utilities for internal use
|
|
|
|
// See section 3.10 of The Unicode Standard, v 2.0.
|
|
|
|
//
|
|
|
|
/**
|
|
|
|
* Convert a single Hangul syllable into one or more Jamo characters.
|
|
|
|
*
|
|
|
|
* @param conjoin If TRUE, decompose Jamo into conjoining Jamo.
|
|
|
|
*/
|
|
|
|
void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit)
|
|
|
|
{
|
|
|
|
UChar sIndex = (UChar)(ch - HANGUL_BASE);
|
|
|
|
UChar leading = (UChar)(JAMO_LBASE + sIndex / JAMO_NCOUNT);
|
|
|
|
UChar vowel = (UChar)(JAMO_VBASE +
|
2000-07-22 00:10:49 +00:00
|
|
|
(sIndex % JAMO_NCOUNT) / JAMO_TCOUNT);
|
1999-08-16 21:50:52 +00:00
|
|
|
UChar trailing= (UChar)(JAMO_TBASE + (sIndex % JAMO_TCOUNT));
|
|
|
|
|
|
|
|
jamoAppend(leading, decompLimit, result);
|
|
|
|
jamoAppend(vowel, decompLimit, result);
|
|
|
|
if (trailing != JAMO_TBASE) {
|
|
|
|
jamoAppend(trailing, decompLimit, result);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest) {
|
2000-07-22 00:10:49 +00:00
|
|
|
uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
|
1999-08-16 21:50:52 +00:00
|
|
|
if (offset > decompLimit) {
|
2000-07-20 00:39:21 +00:00
|
|
|
/* HSYS: Be sure to check this for later. UChar may not always be
|
|
|
|
uint16_t*/
|
|
|
|
doAppend((const UChar*)(DecompData::contents), offset, dest);
|
1999-08-16 21:50:52 +00:00
|
|
|
} else {
|
|
|
|
dest += ch;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) {
|
|
|
|
UTextOffset out = start;
|
1999-12-08 02:11:04 +00:00
|
|
|
UTextOffset limit = buffer.length() - 1;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
UTextOffset in;
|
2000-04-13 23:00:43 +00:00
|
|
|
int16_t l, v = 0, t;
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
for (in = start; in < limit; in++) {
|
|
|
|
UChar ch = buffer[in];
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
if ((l = (int16_t)(ch - JAMO_LBASE)) >= 0 && l < JAMO_LCOUNT
|
2000-08-25 15:53:31 +00:00
|
|
|
&& (v = (int16_t)(buffer[in+1] - (UChar)JAMO_VBASE)) >= 0 && v < JAMO_VCOUNT) {
|
1999-08-16 21:50:52 +00:00
|
|
|
//
|
|
|
|
// We've found a pair of Jamo characters to compose.
|
|
|
|
// Snarf the Jamo vowel and see if there's also a trailing char
|
|
|
|
//
|
|
|
|
in++; // Snarf the Jamo vowel too.
|
|
|
|
|
2000-08-08 22:15:50 +00:00
|
|
|
t = (int16_t)((in < limit) ? buffer.charAt(in+1) : 0);
|
1999-08-16 21:50:52 +00:00
|
|
|
t -= JAMO_TBASE;
|
|
|
|
|
|
|
|
if (t >= 0 && t < JAMO_TCOUNT) {
|
|
|
|
in++; // Snarf the trailing consonant too
|
|
|
|
} else {
|
|
|
|
t = 0; // No trailing consonant
|
|
|
|
}
|
|
|
|
buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE);
|
|
|
|
} else {
|
|
|
|
buffer[out++] = ch;
|
|
|
|
}
|
|
|
|
}
|
1999-12-08 02:11:04 +00:00
|
|
|
while (in < buffer.length()) {
|
1999-08-16 21:50:52 +00:00
|
|
|
buffer[out++] = buffer[in++];
|
|
|
|
}
|
|
|
|
|
|
|
|
buffer.truncate(out);
|
|
|
|
}
|