/*
**********************************************************************
*   Copyright (C) 1996-1999, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/

#include "dcmpdata.h"
#include "compitr.h"
#include "unicode/normlzr.h"

/**
 * Construct a new <tt>ComposedCharIter</tt> that iterates over characters
 * with canonical decompositions only.
 * <p>
 * As implemented, this constructor initializes <tt>hangul</tt> to FALSE, so
 * Korean Hangul syllables are NOT returned by this iterator.  To include
 * them, use the two-argument constructor with <tt>options == 0</tt>.
 * <p>
 * NOTE(review): earlier documentation claimed Hangul characters were
 * included here; the code has always excluded them.  Behavior is left
 * unchanged -- confirm which was intended before altering it.
 */
ComposedCharIter::ComposedCharIter()
  : minDecomp(DecompData::MAX_COMPAT),
    hangul(FALSE),
    curChar(0),
    nextChar(ComposedCharIter::DONE)
{
}

/**
 * Constructs a non-default <tt>ComposedCharIter</tt> with optional behavior.
 *
 * @param compat    <tt>false</tt> for canonical decompositions only;
 *                  <tt>true</tt> for both canonical and compatibility
 *                  decompositions.
 *
 * @param options   Optional decomposition features.  Currently, the only
 *                  supported option is {@link Normalizer#IGNORE_HANGUL}, which
 *                  causes this <tt>ComposedCharIter</tt> not to iterate
 *                  over the Hangul characters and their corresponding
 *                  Jamo decompositions.
 */
ComposedCharIter::ComposedCharIter(UBool compat, int32_t options)
  : minDecomp(compat ? 0 : DecompData::MAX_COMPAT),
    hangul((options & Normalizer::IGNORE_HANGUL) == 0),
    curChar(0),
    nextChar(ComposedCharIter::DONE)
{
}

/**
 * Determines whether there are any precomposed Unicode characters not yet
 * returned by {@link #next}.
 * <p>
 * This method is logically const: it may cache the look-ahead character in
 * <tt>nextChar</tt>, but it does not advance the iteration.
 */
UBool ComposedCharIter::hasNext() const {
    if (nextChar == DONE) {
        // Lazily compute the look-ahead; findNextChar() only writes nextChar.
        const_cast<ComposedCharIter*>(this)->findNextChar();
    }
    return nextChar != DONE;
}

/**
 * Returns the next precomposed Unicode character.
 * Repeated calls to <tt>next</tt> return all of the precomposed characters
 * defined by Unicode, in ascending order.  After all precomposed characters
 * have been returned, {@link #hasNext} will return <tt>false</tt> and further
 * calls to <tt>next</tt> will return {@link #DONE}.
 */
UChar ComposedCharIter::next() {
    if (nextChar == DONE) {
        findNextChar();
    }
    // Consume the look-ahead so the following call searches again.
    curChar = nextChar;
    nextChar = DONE;
    return curChar;
}

/**
 * Returns the Unicode decomposition of the current character.
 * This method returns the decomposition of the precomposed character most
 * recently returned by {@link #next}.  The resulting decomposition is
 * affected by the <tt>compat</tt> flag and the
 * {@link Normalizer#IGNORE_HANGUL IGNORE_HANGUL} option passed to the
 * constructor.
 *
 * @param result  Receives the decomposition; any previous contents are
 *                discarded.  If the current character has no applicable
 *                decomposition, it receives the character itself.
 */
void ComposedCharIter::getDecomposition(UnicodeString& result) const
{
    // We duplicate most of the implementation of Normalizer::decompose() here
    // for efficiency.  One thing we don't duplicate is the recursive
    // decomposition code.  If we detect a need to do recursive decomposition
    // (which happens for only 16 characters in Unicode 3.0) then we delegate to
    // Normalizer::decompose().  This gives us optimal performance without
    // having a complete copy of Normalizer::decompose() here, with its extra
    // baggage of recursion buffers, etc. - Liu

    result.truncate(0);

    uint16_t offset = ucmp16_getu(DecompData::offsets, curChar);
    uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);

    if (index > minDecomp) {
        if ((offset & DecompData::DECOMP_RECURSE) != 0) {
            // Let Normalizer::decompose() handle recursive decomp.
            //
            // The arguments must mirror the mapping used by our own
            // constructor: compatibility mode <=> minDecomp == 0, and
            // IGNORE_HANGUL <=> !hangul.  (Previously both were inverted.)
            UnicodeString temp(curChar);
            UErrorCode status = U_ZERO_ERROR;
            // status is not propagated: this method has no error channel.
            Normalizer::decompose(temp,
                                  minDecomp == 0,
                                  hangul ? 0 : Normalizer::IGNORE_HANGUL,
                                  result, status);
        } else {
            Normalizer::doAppend((const UChar*)DecompData::contents, index, result);
        }
    }
    else if (hangul && curChar >= Normalizer::HANGUL_BASE && curChar < Normalizer::HANGUL_LIMIT) {
        Normalizer::hangulToJamo(curChar, result, (uint16_t)minDecomp);
    }
    else {
        result += curChar;
    }
}

/**
 * Searches upward from the character after <tt>curChar</tt> for the next
 * character that either has a decomposition index above <tt>minDecomp</tt>
 * or (when <tt>hangul</tt> is set) lies in the Hangul syllable range, and
 * stores it in <tt>nextChar</tt>.  The code point 0xFFFF is never examined.
 * If no such character is found, or if <tt>curChar</tt> is already
 * <tt>DONE</tt>, <tt>nextChar</tt> is left untouched.
 */
void ComposedCharIter::findNextChar()
{
    if (curChar != DONE) {
        UChar ch = curChar;
        while (++ch < 0xFFFF) {
            uint16_t offset = ucmp16_getu(DecompData::offsets, ch);
            // Strip flag bits (e.g. DECOMP_RECURSE) before comparing, exactly
            // as getDecomposition() does; otherwise a flagged entry would be
            // reported even when its index is not above minDecomp.
            uint16_t index = (uint16_t)(offset & DecompData::DECOMP_MASK);
            if (index > minDecomp
                || (hangul && ch >= Normalizer::HANGUL_BASE && ch < Normalizer::HANGUL_LIMIT) ) {
                nextChar = ch;
                break;
            }
        }
    }
}