From 974bfd35196f7e7bd7fe1928dddd258c9487525a Mon Sep 17 00:00:00 2001 From: Markus Scherer Date: Wed, 29 Aug 2001 23:57:15 +0000 Subject: [PATCH] ICU-1007 completely replace the old with the new implementation X-SVN-Rev: 5614 --- icu4c/source/common/normlzr.cpp | 1063 +++++-------------------- icu4c/source/common/unicode/normlzr.h | 96 +-- 2 files changed, 222 insertions(+), 937 deletions(-) diff --git a/icu4c/source/common/normlzr.cpp b/icu4c/source/common/normlzr.cpp index 7ec326b823..b0ded10182 100644 --- a/icu4c/source/common/normlzr.cpp +++ b/icu4c/source/common/normlzr.cpp @@ -15,99 +15,65 @@ * useful in tbcoll and unorm. * Added quickcheck method and incorporated it into * normalize() +* 06/20/01+ Markus Scherer +* total rewrite, implement all normalization in unorm.cpp +* and turn Normalizer into a wrapper; +* fix the very broken iteration API */ -#include "ucmp16.h" -#include "dcmpdata.h" -#include "compdata.h" - #include "unicode/normlzr.h" #include "unicode/utypes.h" #include "unicode/unistr.h" #include "unicode/chariter.h" #include "unicode/schriter.h" -#include "unicode/unicode.h" -#include "mutex.h" - -/* ### TODO: new implementation */ +#include "unicode/uchriter.h" #include "unormimp.h" - - - -#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array)) -/** -* Maximum initial buffer size. -* Used in quickCheck to declare initial array. 
-*/ -const uint32_t StackBufferLen = 1024; - -inline static void insert(UnicodeString& dest, - UTextOffset pos, - UChar ch) -{ - dest.replace(pos, 0, &ch, 1); -} - //------------------------------------------------------------------------- // Constructors and other boilerplate //------------------------------------------------------------------------- Normalizer::Normalizer(const UnicodeString& str, - EMode mode) -{ - init(new StringCharacterIterator(str), mode, 0); -} + EMode mode) : + fMode(mode), fOptions(0), + text(new StringCharacterIterator(str)), nextIndex(-1), + buffer(), bufferPos(0) +{} Normalizer::Normalizer(const UnicodeString& str, EMode mode, - int32_t opt) -{ - init(new StringCharacterIterator(str), mode, opt); -} + int32_t options) : + fMode(mode), fOptions(options), + text(new StringCharacterIterator(str)), nextIndex(-1), + buffer(), bufferPos(0) +{} -Normalizer::Normalizer(const UChar* str, int32_t length, EMode mode) -{ - init(new StringCharacterIterator(UnicodeString(str, length)), mode, 0); -} +Normalizer::Normalizer(const UChar *str, int32_t length, EMode mode) : + fMode(mode), fOptions(0), + text(new UCharCharacterIterator(str, length)), nextIndex(-1), + buffer(), bufferPos(0) +{} Normalizer::Normalizer(const CharacterIterator& iter, - EMode mode) -{ - init(iter.clone(), mode, 0); -} + EMode mode) : + fMode(mode), fOptions(0), + text(iter.clone()), nextIndex(-1), + buffer(), bufferPos(0) +{} Normalizer::Normalizer(const CharacterIterator& iter, EMode mode, - int32_t opt) -{ - init(iter.clone(), mode, opt); -} + int32_t options) : + fMode(mode), fOptions(options), + text(iter.clone()), nextIndex(-1), + buffer(), bufferPos(0) +{} -void Normalizer::init(CharacterIterator* adoptIter, - EMode mode, - int32_t options) -{ - bufferPos = 0; - bufferLimit = 0; - fOptions = options; - currentChar = DONE; - fMode = mode; - text = adoptIter; - - minDecomp = (int16_t)((fMode & COMPAT_BIT) ? 
0 : DecompData::MAX_COMPAT); -} - -Normalizer::Normalizer(const Normalizer& copy) -{ - init(copy.text->clone(), copy.fMode, copy.fOptions); - - buffer = copy.buffer; - bufferPos = copy.bufferPos; - bufferLimit = copy.bufferLimit; - explodeBuf = copy.explodeBuf; - currentChar = copy.currentChar; -} +Normalizer::Normalizer(const Normalizer &copy) : + fMode(copy.fMode), fOptions(copy.fOptions), + text(copy.text->clone()), nextIndex(copy.nextIndex), + buffer(copy.buffer), bufferPos(copy.bufferPos) +{} Normalizer::~Normalizer() { @@ -129,17 +95,19 @@ Normalizer::clone() const */ int32_t Normalizer::hashCode() const { - return text->hashCode() + fMode + fOptions + bufferPos + bufferLimit; + return text->hashCode() + fMode + fOptions + buffer.hashCode() + bufferPos + nextIndex; } UBool Normalizer::operator==(const Normalizer& that) const { - return *text == *(that.text) - && currentChar == that.currentChar - && buffer == that.buffer - && explodeBuf == that.explodeBuf - && bufferPos == that.bufferPos - && bufferLimit == that.bufferLimit; + return + this==&that || + fMode==that.fMode && + fOptions==that.fOptions && + *text==*(that.text) && + buffer==that.buffer && + bufferPos==that.bufferPos && + nextIndex==that.nextIndex; } //------------------------------------------------------------------------- @@ -152,7 +120,7 @@ Normalizer::normalize(const UnicodeString& source, int32_t options, UnicodeString& result, UErrorCode &status) { - if(source.isBogus()) { + if(source.isBogus() || U_FAILURE(status)) { result.setToBogus(); } else { /* make sure that we do not operate on the same buffer in source and result */ @@ -180,34 +148,6 @@ Normalizer::quickCheck(const UnicodeString& source, getUNormalizationMode(mode, status), &status); } -//------------------------------------------------------------------------- -// Inline functions for 64-bit bitmasks (array of 2 uint32_t) -//------------------------------------------------------------------------- - -// Clear all bits of the mask 
-inline void emptyBitmask64(uint32_t* mask) { - mask[0] = mask[1] = 0; -} - -// Return TRUE if all bits are clear in the mask -inline UBool isEmptyBitmask64(uint32_t* mask) { - return (mask[0] == 0) && (mask[1] == 0); -} - -// Set a single bit (0..63) of the mask -inline void setBitmask64(uint32_t* mask, int32_t bit) { - mask[bit >> 5] |= ((uint32_t)1L << (bit & 31)); -} - -// Return TRUE if a single bit (0..63) is set in the mask -inline UBool isSetBitmask64(uint32_t* mask, int32_t bit) { - return (mask[bit >> 5] & (1L << (bit & 31))) != 0; -} - -//------------------------------------------------------------------------- -// Compose methods -//------------------------------------------------------------------------- - void Normalizer::compose(const UnicodeString& source, UBool compat, @@ -230,306 +170,6 @@ Normalizer::compose(const UnicodeString& source, } } -/** - * Compose starting with current input character and continuing - * until just before the next base char. - *

- * Input: - *

- *

- * Output: - *

- */ -UChar Normalizer::nextCompose() -{ - UTextOffset explodePos = EMPTY; // Position in input buffer - UTextOffset basePos = 0; // Position of last base in output string - uint16_t baseIndex = 0; // Index of last base in "actions" array - uint32_t classesSeen[2]; // Combining classes seen since last base - uint16_t action; - UChar lastBase = 0; - UBool chFromText = TRUE; - - // Compatibility explosions have lower indices; skip them if necessary - uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT); - uint16_t minDecompLocal = (uint16_t)((fMode & COMPAT_BIT) ? 0 : DecompData::MAX_COMPAT); - - emptyBitmask64(classesSeen); - initBuffer(); - explodeBuf.truncate(0); - - UChar ch = curForward(); - - while (ch != DONE) { - // Get the basic info for the character - uint16_t charInfo = composeLookup(ch); - uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK); - uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT); - - if (type == ComposeData::BASE || (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode)) { - if (buffer.length() > 0 && chFromText && explodePos == EMPTY) { - // When we hit a base char in the source text, we can return the text - // that's been composed so far. We'll re-process this char next time hrough. - break; - } - emptyBitmask64(classesSeen); - baseIndex = index; - basePos = buffer.length(); - buffer += ch; - lastBase = ch; - } - else if (type == ComposeData::COMBINING) - { - uint32_t cclass = ComposeData::typeBit[index]; // 0..63 - - // We can only combine a character with the base if we haven't - // already seen a combining character with the same canonical class. - if (index < ComposeData::COMBINING_COUNT - && !isSetBitmask64(classesSeen, cclass) - && (action = composeAction(baseIndex, index)) > 0) - { - if (action > ComposeData::MAX_COMPOSED) { - // Pairwise explosion. 
Actions above this value are really - // indices into an array that in turn contains indices - // into the exploding string table - // TODO: What if there are unprocessed chars in the explode buffer? - UChar newBase = pairExplode(explodeBuf, action); - explodePos = 0; - buffer[basePos] = newBase; - - baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT); - lastBase = newBase; - } else { - // Normal pairwise combination. Replace the base char - UChar newBase = (UChar) action; - buffer[basePos] = newBase; - - baseIndex = (uint16_t)(composeLookup(newBase) >> ComposeData::INDEX_SHIFT); - lastBase = newBase; - } - // - // Since there are Unicode characters that cannot be combined in arbitrary - // order, we have to re-process any combining marks that go with this - // base character. There are only four characters in Unicode that have - // this problem. If they are fixed in Unicode 3.0, this code can go away. - // - UTextOffset len = buffer.length(); - if (len - basePos > 1) { - for (UTextOffset j = basePos+1; j < len; j++) { - explodeBuf += buffer[j]; - } - buffer.truncate(basePos+1); - emptyBitmask64(classesSeen); - if (explodePos == EMPTY) explodePos = 0; - } - } else { - // No combination with this character - bubbleAppend(buffer, ch, cclass); - setBitmask64(classesSeen, cclass); //[cclass >> 5] |= (1L << (cclass & 31)); - } - } - else if (index > minExplode) { - // Single exploding character - explode(explodeBuf, index); - explodePos = 0; - } - else if (type == ComposeData::HANGUL && minExplode == 0) { - // If we're in compatibility mode we need to decompose Hangul to Jamo, - // because some of the Jamo might have compatibility decompositions. - hangulToJamo(ch, explodeBuf, minDecompLocal); - explodePos = 0; - } - else if (type == ComposeData::INITIAL_JAMO) { - if (buffer.length() > 0 && chFromText && explodePos == EMPTY) { - // When we hit a base char in the source text, we can return the text - // that's been composed so far. 
We'll re-process this char next time through. - break; - } - emptyBitmask64(classesSeen); - baseIndex = ComposeData::INITIAL_JAMO_INDEX; - basePos = buffer.length(); - buffer += ch; - } - else if (type == ComposeData::MEDIAL_JAMO - && isEmptyBitmask64(classesSeen) - && baseIndex == ComposeData::INITIAL_JAMO_INDEX) { - // If the last character was an initial jamo, we can combine it with this - // one to create a Hangul character. - uint16_t l = (uint16_t)(buffer[basePos] - (UChar)JAMO_LBASE); - uint16_t v = (uint16_t)(ch - JAMO_VBASE); - buffer[basePos] = (UChar)(HANGUL_BASE + (l*JAMO_VCOUNT + v) * JAMO_TCOUNT); - - baseIndex = ComposeData::MEDIAL_JAMO_INDEX; - } - else if (type == ComposeData::FINAL_JAMO - && isEmptyBitmask64(classesSeen) - && baseIndex == ComposeData::MEDIAL_JAMO_INDEX) { - // If the last character was a medial jamo that we turned into Hangul, - // we can add this character too. - buffer[basePos] = (UChar)(buffer[basePos] + (ch - JAMO_TBASE)); - - baseIndex = 0; - basePos = -1; - emptyBitmask64(classesSeen); - } else { - // TODO: deal with JAMO character types - baseIndex = 0; - basePos = -1; - emptyBitmask64(classesSeen); - buffer += ch; - } - - if (explodePos == EMPTY) { - ch = text->next(); - chFromText = TRUE; - } else { - ch = explodeBuf[explodePos++]; - if (explodePos >= explodeBuf.length()) { - explodePos = EMPTY; - explodeBuf.truncate(0); - } - chFromText = FALSE; - } - } - if (buffer.length() > 0) { - bufferLimit = buffer.length() - 1; - ch = buffer[0]; - } else { - ch = DONE; - bufferLimit = 0; - } - return ch; -} - -/** - * Compose starting with the input UChar just before the current position - * and continuing backward until (and including) the previous base char. - *

- * Input: - *

- *

- * Output: - *

- */ -UChar Normalizer::prevCompose() -{ - UErrorCode status = U_ZERO_ERROR; - - // Compatibility explosions have lower indices; skip them if necessary - uint16_t minExplode = (uint16_t)((fMode & COMPAT_BIT) ? 0 : ComposeData::MAX_COMPAT); - - initBuffer(); - - // Slurp up characters until we hit a base char or an initial Jamo - UChar ch; - while ((ch = curBackward()) != DONE) { - insert(buffer, 0, ch); - - // Get the basic info for the character - uint16_t charInfo = composeLookup(ch); - uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK); - uint16_t index = (uint16_t)(charInfo >> ComposeData::INDEX_SHIFT); - - if (type == ComposeData::BASE - || (type == ComposeData::NON_COMPOSING_COMBINING && index < minExplode) - || type == ComposeData::HANGUL - || type == ComposeData::INITIAL_JAMO) - { - break; - } - } - // If there's more than one character in the buffer, compose it all at once.... - if (buffer.length() > 0) { - // TODO: The performance of this is awful; add a way to compose - // a UnicodeString& in place. - UnicodeString composed; - compose(buffer, (fMode & COMPAT_BIT) != 0, fOptions, composed, status); - buffer.truncate(0); - buffer += composed; - - if (buffer.length() > 1) { - bufferLimit = bufferPos = buffer.length() - 1; - ch = buffer[bufferPos]; - } else { - ch = buffer[0]; - } - } - else { - ch = DONE; - } - - return ch; -} - -void Normalizer::bubbleAppend(UnicodeString& target, UChar ch, uint32_t cclass) { - UTextOffset i; - for (i = target.length() - 1; i >= 0; --i) { - uint32_t iClass = getComposeClass(target[i]); - - if (iClass == 1 || iClass <= cclass) { // 1 means combining class 0 - // We've hit something we can't bubble this character past, so insert here - break; - } - } - // We need to insert just after character "i" - insert(target, i+1, ch); -} - -/** - * Return the composing class of a character, as stored in the ComposeData - * table. 
This is not the composing class as listed in the raw Unicode - * database, but an equivalent remapped value. Values are remapped so they - * fit in a sequential range from 0..n, where n < 64, and relative order - * is preserved. - * @return the composing class of ch, from 0..63 - */ -uint32_t Normalizer::getComposeClass(UChar ch) { - uint32_t cclass = 0; - uint16_t charInfo = composeLookup(ch); - uint16_t type = (uint16_t)(charInfo & ComposeData::TYPE_MASK); - if (type == ComposeData::COMBINING) { - cclass = ComposeData::typeBit[charInfo >> ComposeData::INDEX_SHIFT]; - } - return cclass; -} - -uint16_t Normalizer::composeLookup(UChar ch) { - return ucmp16_getu(ComposeData::lookup, ch); -} - -uint16_t Normalizer::composeAction(uint16_t baseIndex, uint16_t comIndex) -{ - return ucmp16_getu(ComposeData::actions, - ((UChar)(baseIndex + ComposeData::MAX_BASES*comIndex))); -} - -void Normalizer::explode(UnicodeString& target, uint16_t index) { - UChar ch; - while ((ch = ComposeData::replace[index++]) != 0) { - target += ch; - } -} - -UChar Normalizer::pairExplode(UnicodeString& target, uint16_t action) { - uint16_t index = ComposeData::actionIndex[action - ComposeData::MAX_COMPOSED]; - explode(target, (uint16_t)(index + 1)); - return ComposeData::replace[index]; // New base char -} - -//------------------------------------------------------------------------- -// Decompose methods -//------------------------------------------------------------------------- - void Normalizer::decompose(const UnicodeString& source, UBool compat, @@ -552,265 +192,36 @@ Normalizer::decompose(const UnicodeString& source, } } -/** - * Decompose starting with current input character and continuing - * until just before the next base char. - *

- * Input: - *

- *

- * Output: - *

- */ -UChar Normalizer::nextDecomp() -{ - UBool hangul = ((fOptions & IGNORE_HANGUL) == 0); - UChar ch = curForward(); - int32_t i; - uint16_t offset = ucmp16_getu(DecompData::offsets, ch); - int16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK); - - if (index > minDecomp || - ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) - { - initBuffer(); - - if (index > minDecomp) { - doAppend((const UChar*)(DecompData::contents), index, buffer); - - if ((offset & DecompData::DECOMP_RECURSE) != 0) { - // Need to decompose the output of this decomposition recursively. - for (i = 0; i < buffer.length(); i++) { - ch = buffer.charAt(i); - int16_t index = (int16_t)(ucmp16_getu(DecompData::offsets, ch) - & DecompData::DECOMP_MASK); - if (index > minDecomp) { - i += doReplace((const UChar*)(DecompData::contents), index, buffer, i); - } - } - } - } else { - buffer += ch; - } - UBool needToReorder = FALSE; - - // Any other combining chacters that immediately follow the decomposed - // character must be included in the buffer too, because they're - // conceptually part of the same logical character. - while ((ch = text->next()) != DONE - && ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) - { - needToReorder = TRUE; - // Decompose any of these characters that need it - Liu - index = (int16_t)(ucmp16_getu(DecompData::offsets, ch) - & DecompData::DECOMP_MASK); - if (index > minDecomp) { - doAppend((const UChar*)DecompData::contents, index, buffer); - } else { - buffer += ch; - } - } - - if (buffer.length() > 1 && needToReorder) { - // If there is more than one combining character in the buffer, - // put them into the canonical order. - // But we don't need to sort if only characters are the ones that - // resulted from decomosing the base character. 
- fixCanonical(buffer); - } - bufferLimit = buffer.length() - 1; - ch = buffer[0]; - } else { - // Just use this character, but first advance to the next one - text->next(); - - // Do Hangul -> Jamo decomposition if necessary - if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) { - initBuffer(); - hangulToJamo(ch, buffer, minDecomp); - bufferLimit = buffer.length() - 1; - ch = buffer[0]; - } - } - return ch; -} - - -/** - * Decompose starting with the input char just before the current position - * and continuing backward until (and including) the previous base char. - *

- * Input: - *

- *

- * Output: - *

- */ -UChar Normalizer::prevDecomp() { - UBool hangul = (fOptions & IGNORE_HANGUL) == 0; - - UChar ch = curBackward(); - - uint16_t offset = ucmp16_getu(DecompData::offsets, ch); - - if (offset > minDecomp || - ucmp8_get(DecompData::canonClass, ch) != DecompData::BASE) - { - initBuffer(); - - // This method rewritten to pass conformance tests. - Liu - // Collect all characters up to the previous base char - while (ch != DONE) { - buffer.insert(0, ch); - if (ucmp8_get(DecompData::canonClass, ch) == DecompData::BASE) break; - ch = text->previous(); - } - - // Decompose the buffer - int32_t i; - for (i = 0; i < buffer.length(); i++) { - ch = buffer.charAt(i); - offset = ucmp16_getu(DecompData::offsets, ch); - int16_t index = (int16_t)(offset & DecompData::DECOMP_MASK); - - if (index > minDecomp) { - int j = doReplace((const UChar*)(DecompData::contents), index, buffer, i); - if ((offset & DecompData::DECOMP_RECURSE) != 0) { - // Need to decompose this recursively - for (; i < j; ++i) { - ch = buffer.charAt(i); - index = (int16_t)(ucmp16_getu(DecompData::offsets, ch) - & DecompData::DECOMP_MASK); - if (index > minDecomp) { - i += doReplace((const UChar*)(DecompData::contents), index, buffer, i); - } - } - } - i = j; - } - } - - - if (buffer.length() > 1) { - // If there is more than one combining character in the buffer, - // put them into the canonical order. - fixCanonical(buffer); - } - bufferLimit = bufferPos = buffer.length() - 1; - ch = buffer[bufferPos]; - } - else if (hangul && ch >= HANGUL_BASE && ch < HANGUL_LIMIT) { - initBuffer(); - hangulToJamo(ch, buffer, minDecomp); - bufferLimit = bufferPos = buffer.length() - 1; - ch = buffer[bufferPos]; - } - return ch; -} - -uint8_t Normalizer::getClass(UChar ch) { - return ucmp8_get(DecompData::canonClass, ch); -} - -/** - * Fixes the sorting sequence of non-spacing characters according to - * their combining class. The algorithm is listed on p.3-11 in the - * Unicode Standard 2.0. 
The table of combining classes is on p.4-2 - * in the Unicode Standard 2.0. - * @param result the string to fix. - */ -void Normalizer::fixCanonical(UnicodeString& result) { - UTextOffset i = result.length() - 1; - uint8_t currentType = getClass(result[i]); - uint8_t lastType; - - for (--i; i >= 0; --i) { - lastType = currentType; - currentType = getClass(result[i]); - - // - // a swap is presumed to be rare (and a double-swap very rare), - // so don't worry about efficiency here. - // - if (currentType > lastType && lastType != DecompData::BASE) { - // swap characters - UChar temp = result[i]; - result[i] = result[i+1]; - result[i+1] = temp; - - // if not at end, backup (one further, to compensate for for-loop) - if (i < result.length() - 2) { - i += 2; - } - // reset type, since we swapped. - currentType = getClass(result[i]); - } - } -} - - //------------------------------------------------------------------------- -// CharacterIterator overrides +// Iteration API //------------------------------------------------------------------------- /** * Return the current character in the normalized text. */ -UChar32 Normalizer:: current() const -{ - // TODO: make this method const and guarantee that currentChar is always set? - Normalizer *nonConst = (Normalizer*)this; - - if (currentChar == DONE) { - switch (fMode) { - case NO_OP: - nonConst->currentChar = text->current(); - break; - case COMPOSE: - case COMPOSE_COMPAT: - nonConst->currentChar = nonConst->nextCompose(); - break; - case DECOMP: - case DECOMP_COMPAT: - nonConst->currentChar = nonConst->nextDecomp(); - break; - case FCD: - /* ### TODO */ - break; +UChar32 Normalizer::current() { + if(bufferPosgetIndex(); + UChar32 c; + + if(nextNormalize()) { + c=buffer.char32At(bufferPos); + nextIndex=text->getIndex(); + } else { + c=DONE; } + text->setIndex(currentIndex); + return c; } - return currentChar; -} - -/** - * Return the first character in the normalized text. 
This resets - * the Normalizer's position to the beginning of the text. - */ -UChar32 Normalizer::first() { - return setIndex(text->startIndex()); -} - -/** - * Return the last character in the normalized text. This resets - * the Normalizer's position to be just before the - * the input text corresponding to that normalized character. - */ -UChar32 Normalizer::last() { - text->setIndex(text->endIndex()); - - currentChar = DONE; // The current char hasn't been processed - clearBuffer(); // The buffer is empty too - return previous(); } /** @@ -819,30 +230,31 @@ UChar32 Normalizer::last() { * of the text has already been reached, {@link #DONE} is returned. */ UChar32 Normalizer::next() { - if (bufferPos < bufferLimit) { - // There are output characters left in the buffer - currentChar = buffer[++bufferPos]; - } - else { - bufferLimit = bufferPos = 0; // Buffer is now out of date - switch (fMode) { - case NO_OP: - currentChar = text->next(); - break; - case COMPOSE: - case COMPOSE_COMPAT: - currentChar = nextCompose(); - break; - case DECOMP: - case DECOMP_COMPAT: - currentChar = nextDecomp(); - break; - case FCD: - /* ### TODO */ - break; + UChar32 c; + + if(bufferPos=0) { + text->setIndex(nextIndex); + } + if(nextNormalize()) { + c=buffer.char32At(bufferPos); + bufferPos+=UTF_CHAR_LENGTH(c); + return c; + } else { + return DONE; } } - return currentChar; } /** @@ -850,39 +262,27 @@ UChar32 Normalizer::next() { * the iteration position by one. If the beginning * of the text has already been reached, {@link #DONE} is returned. 
*/ -UChar32 Normalizer::previous() -{ - if (bufferPos > 0) { - // There are output characters left in the buffer - currentChar = buffer[--bufferPos]; +UChar32 Normalizer::previous() { + UChar32 c; + + if(bufferPos>0 || previousNormalize()) { + c=buffer.char32At(bufferPos-1); + bufferPos-=UTF_CHAR_LENGTH(c); + return c; + } else { + return DONE; } - else { - bufferLimit = bufferPos = 0; // Buffer is now out of date - switch (fMode) { - case NO_OP: - currentChar = text->previous(); - break; - case COMPOSE: - case COMPOSE_COMPAT: - currentChar = prevCompose(); - break; - case DECOMP: - case DECOMP_COMPAT: - currentChar = prevDecomp(); - break; - case FCD: - /* ### TODO */ - break; - } - } - return currentChar; } -void Normalizer::reset() -{ - text->setIndex(text->startIndex()); - currentChar = DONE; // The current char hasn't been processed - clearBuffer(); // The buffer is empty too +void Normalizer::reset() { + text->setToStart(); + clearBuffer(); +} + +void +Normalizer::setIndexOnly(UTextOffset index) { + text->setIndex(index); + clearBuffer(); } /** @@ -904,15 +304,32 @@ void Normalizer::reset() * @throws IllegalArgumentException if the given index is less than * {@link #getBeginIndex} or greater than {@link #getEndIndex}. */ -UChar32 Normalizer::setIndex(UTextOffset index) -{ - text->setIndex(index); // Checks range - currentChar = DONE; // The current char hasn't been processed - clearBuffer(); // The buffer is empty too - +UChar32 Normalizer::setIndex(UTextOffset index) { + setIndexOnly(index); return current(); } +/** + * Return the first character in the normalized text. This resets + * the Normalizer's position to the beginning of the text. + */ +UChar32 Normalizer::first() { + text->setToStart(); + clearBuffer(); + return next(); +} + +/** + * Return the last character in the normalized text. This resets + * the Normalizer's position to be just before the + * the input text corresponding to that normalized character. 
+ */ +UChar32 Normalizer::last() { + text->setToEnd(); + clearBuffer(); + return previous(); +} + /** * Retrieve the current iteration position in the input text that is * being normalized. This method is useful in applications such as @@ -957,7 +374,6 @@ void Normalizer::setMode(EMode newMode) { fMode = newMode; - minDecomp = (int16_t)(((fMode & COMPAT_BIT) != 0) ? 0 : DecompData::MAX_COMPAT); } Normalizer::EMode @@ -1030,7 +446,17 @@ Normalizer::setText(const UChar* newText, int32_t length, UErrorCode &status) { - setText(UnicodeString(newText, length), status); + if (U_FAILURE(status)) { + return; + } + CharacterIterator *newIter = new UCharCharacterIterator(newText, length); + if (newIter == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + delete text; + text = newIter; + reset(); } /** @@ -1043,154 +469,83 @@ Normalizer::getText(UnicodeString& result) text->getText(result); } - //------------------------------------------------------------------------- // Private utility methods //------------------------------------------------------------------------- - -UChar Normalizer::curForward() { - UChar ch = text->current(); - return ch; -} - -UChar Normalizer::curBackward() { - UChar ch = text->previous(); - return ch; -} - -void Normalizer::doAppend(const UChar source[], uint16_t offset, UnicodeString& dest) { - uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT); - uint16_t length = (int16_t)(offset & STR_LENGTH_MASK); - - if (length == 0) { - UChar ch; - while ((ch = source[index++]) != 0x0000) { - dest += ch; - } - } else { - while (length-- > 0) { - dest += source[index++]; - } - } -} - -void Normalizer::doInsert(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos) -{ - uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT); - uint16_t length = (int16_t)(offset & STR_LENGTH_MASK); - - if (length == 0) { - UChar ch; - while ((ch = source[index++]) != 0x0000) { - insert(dest, pos++, ch); - } - } else { - while (length-- > 0) 
{ - insert(dest, pos++, source[index++]); - } - } -} - -uint16_t Normalizer::doReplace(const UChar source[], uint16_t offset, UnicodeString& dest, UTextOffset pos) { - - uint16_t index = (int16_t)(offset >> STR_INDEX_SHIFT); - uint16_t length = (int16_t)(offset & STR_LENGTH_MASK); - uint16_t i; - - dest.setCharAt(pos++, source[index++]); - if (length == 0) { - UChar ch; - while ((ch = source[index++]) != 0x0000) { - insert(dest, pos++, ch); - length++; - } - } else { - for (i = 1; i < length; i++) { - dest.insert(pos++, source[index++]); - } - } - return length; -} - -void Normalizer::initBuffer() { - buffer.truncate(0); - clearBuffer(); -} - void Normalizer::clearBuffer() { - bufferLimit = bufferPos = 0; + nextIndex=-1; + buffer.remove(); + bufferPos=0; } -//----------------------------------------------------------------------------- -// Hangul / Jamo conversion utilities for internal use -// See section 3.10 of The Unicode Standard, v 2.0. -// -/** - * Convert a single Hangul syllable into one or more Jamo characters. - * - * @param conjoin If TRUE, decompose Jamo into conjoining Jamo. 
- */ -void Normalizer::hangulToJamo(UChar ch, UnicodeString& result, uint16_t decompLimit) -{ - UChar sIndex = (UChar)(ch - HANGUL_BASE); - UChar leading = (UChar)(JAMO_LBASE + sIndex / JAMO_NCOUNT); - UChar vowel = (UChar)(JAMO_VBASE + - (sIndex % JAMO_NCOUNT) / JAMO_TCOUNT); - UChar trailing= (UChar)(JAMO_TBASE + (sIndex % JAMO_TCOUNT)); +UBool +Normalizer::nextNormalize() { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length; - jamoAppend(leading, decompLimit, result); - jamoAppend(vowel, decompLimit, result); - if (trailing != JAMO_TBASE) { - jamoAppend(trailing, decompLimit, result); + clearBuffer(); + switch(fMode) { + case NO_OP: + buffer.setTo(text->next32PostInc()); + length=buffer.length(); + break; + case COMPOSE: + case COMPOSE_COMPAT: + length=unorm_nextCompose(buffer.fArray, buffer.fCapacity, *text, + fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; + case DECOMP: + case DECOMP_COMPAT: + length=unorm_nextDecompose(buffer.fArray, buffer.fCapacity, *text, + fMode==DECOMP_COMPAT, (fOptions&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; + case FCD: + length=unorm_nextFCD(buffer.fArray, buffer.fCapacity, *text, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; } + + return U_SUCCESS(errorCode) && length>0; } -void Normalizer::jamoAppend(UChar ch, uint16_t decompLimit, UnicodeString& dest) { - uint16_t offset = ucmp16_getu(DecompData::offsets, ch); - if (offset > decompLimit) { - /* HSYS: Be sure to check this for later. 
UChar may not always be - uint16_t*/ - doAppend((const UChar*)(DecompData::contents), offset, dest); - } else { - dest += ch; +UBool +Normalizer::previousNormalize() { + UErrorCode errorCode=U_ZERO_ERROR; + int32_t length; + + clearBuffer(); + switch(fMode) { + case NO_OP: + buffer.setTo(text->previous32()); + length=buffer.length(); + break; + case COMPOSE: + case COMPOSE_COMPAT: + length=unorm_prevCompose(buffer.fArray, buffer.fCapacity, *text, + fMode==COMPOSE_COMPAT, (fOptions&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; + case DECOMP: + case DECOMP_COMPAT: + length=unorm_prevDecompose(buffer.fArray, buffer.fCapacity, *text, + fMode==DECOMP_COMPAT, (fOptions&IGNORE_HANGUL)!=0, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; + case FCD: + length=unorm_prevFCD(buffer.fArray, buffer.fCapacity, *text, + UnicodeString::growBuffer, &buffer, + &errorCode); + break; } -} - -void Normalizer::jamoToHangul(UnicodeString& buffer, UTextOffset start) { - UTextOffset out = start; - UTextOffset limit = buffer.length() - 1; - - UTextOffset in; - int16_t l, v = 0, t; - - for (in = start; in < limit; in++) { - UChar ch = buffer[in]; - - if ((l = (int16_t)(ch - JAMO_LBASE)) >= 0 && l < JAMO_LCOUNT - && (v = (int16_t)(buffer[in+1] - (UChar)JAMO_VBASE)) >= 0 && v < JAMO_VCOUNT) { - // - // We've found a pair of Jamo characters to compose. - // Snarf the Jamo vowel and see if there's also a trailing char - // - in++; // Snarf the Jamo vowel too. - - t = (int16_t)((in < limit) ? 
buffer.charAt(in+1) : 0); - t -= JAMO_TBASE; - - if (t >= 0 && t < JAMO_TCOUNT) { - in++; // Snarf the trailing consonant too - } else { - t = 0; // No trailing consonant - } - buffer[out++] = (UChar)((l*JAMO_VCOUNT + v) * JAMO_TCOUNT + t + HANGUL_BASE); - } else { - buffer[out++] = ch; - } - } - while (in < buffer.length()) { - buffer[out++] = buffer[in++]; - } - - buffer.truncate(out); + + bufferPos=length; + return U_SUCCESS(errorCode) && length>0; } diff --git a/icu4c/source/common/unicode/normlzr.h b/icu4c/source/common/unicode/normlzr.h index 7b5ff9688e..d935397014 100644 --- a/icu4c/source/common/unicode/normlzr.h +++ b/icu4c/source/common/unicode/normlzr.h @@ -27,9 +27,6 @@ #include "unicode/chariter.h" #include "unicode/unorm.h" -/* forward declaration */ -class ComposedCharIter; - /** * Normalizer transforms Unicode text into an equivalent composed or * decomposed form, allowing for easier sorting and searching of text. @@ -246,6 +243,7 @@ class U_COMMON_API Normalizer * Unicode Normalization Forms. *

* @see #setOption + * @deprecated To be removed (or moved to private for documentation) after 2002-aug-31. Obsolete option. */ IGNORE_HANGUL = 0x001 }; @@ -500,14 +498,14 @@ class U_COMMON_API Normalizer UErrorCode& status); //------------------------------------------------------------------------- - // CharacterIterator overrides + // Iteration API //------------------------------------------------------------------------- /** * Return the current character in the normalized text. * @draft */ - UChar32 current(void) const; + UChar32 current(void); /** * Return the first character in the normalized text. This resets @@ -555,10 +553,12 @@ class U_COMMON_API Normalizer * * @return the first normalized character that is the result of iterating * forward starting at the given index. - * @draft + * @deprecated To be removed after 2002-aug-31. Use setIndexOnly(). */ UChar32 setIndex(UTextOffset index); + void setIndexOnly(UTextOffset index); + /** * Reset the iterator so that it is in the same state that it was just after * it was constructed. 
A subsequent call to next will return the first @@ -740,98 +740,28 @@ class U_COMMON_API Normalizer private: // Private utility methods for iteration // For documentation, see the source code - UChar nextCompose(void); - UChar prevCompose(void); - UChar nextDecomp(void); - UChar prevDecomp(void); + UBool nextNormalize(); + UBool previousNormalize(); - UChar curForward(void); - UChar curBackward(void); - - void init(CharacterIterator* iter, - EMode mode, - int32_t option); - void initBuffer(void); + void init(CharacterIterator* iter, EMode mode, int32_t option); void clearBuffer(void); - // Utilities used by Compose - static void bubbleAppend(UnicodeString& target, - UChar ch, - uint32_t cclass); - static uint32_t getComposeClass(UChar ch); - static uint16_t composeLookup(UChar ch); - static uint16_t composeAction(uint16_t baseIndex, - uint16_t comIndex); - static void explode(UnicodeString& target, - uint16_t index); - static UChar pairExplode(UnicodeString& target, - uint16_t action); - - // Utilities used by Decompose - static void fixCanonical(UnicodeString& result); // Reorders combining marks - static uint8_t getClass(UChar ch); // Gets char's combining class - - // Other static utility methods - static void doAppend(const UChar source[], - uint16_t offset, - UnicodeString& dest); - static void doInsert(const UChar source[], - uint16_t offset, - UnicodeString& dest, - UTextOffset pos); - static uint16_t doReplace(const UChar source[], - uint16_t offset, - UnicodeString& dest, - UTextOffset pos); - - static void hangulToJamo(UChar ch, - UnicodeString& result, - uint16_t decompLimit); - static void jamoAppend(UChar ch, - uint16_t decompLimit, - UnicodeString& dest); - static void jamoToHangul(UnicodeString& buffer, - UTextOffset start); - //------------------------------------------------------------------------- // Private data //------------------------------------------------------------------------- EMode fMode; int32_t fOptions; - int16_t minDecomp; // The 
input text and our position in it CharacterIterator* text; + // The next index (if >= 0) to set in text for next(), which is + // necessary to make current() and setIndex() work reasonably. + UTextOffset nextIndex; // A buffer for holding intermediate results UnicodeString buffer; - UTextOffset bufferPos; - UTextOffset bufferLimit; - UChar currentChar; - - // Another buffer for use during iterative composition - UnicodeString explodeBuf; - - enum { - EMPTY = -1, - STR_INDEX_SHIFT = 2, //Must agree with the constants used in NormalizerBuilder - STR_LENGTH_MASK = 0x0003 - }; - - enum { - HANGUL_BASE = 0xac00, - HANGUL_LIMIT = 0xd7a4, - JAMO_LBASE = 0x1100, - JAMO_VBASE = 0x1161, - JAMO_TBASE = 0x11a7, - JAMO_LCOUNT = 19, - JAMO_VCOUNT = 21, - JAMO_TCOUNT = 28, - JAMO_NCOUNT = JAMO_VCOUNT * JAMO_TCOUNT - }; - - friend class ComposedCharIter; + UTextOffset bufferPos; }; inline UBool