/*
*******************************************************************************
* Copyright (C) 2009-2011, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*/

/**
 * \file
 * \brief C API: AlphabeticIndex class
 */

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION

#include "unicode/alphaindex.h"
#include "unicode/coll.h"
#include "unicode/normalizer2.h"
#include "unicode/strenum.h"
#include "unicode/tblcoll.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
#include "unicode/uscript.h"
#include "unicode/usetiter.h"
#include "unicode/ustring.h"

#include "cstring.h"
#include "mutex.h"
#include "uassert.h"
#include "ucln_in.h"
#include "uhash.h"
#include "uvector.h"

//#include <string>
//#include <iostream>
U_NAMESPACE_BEGIN

UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)

// Forward Declarations
static int32_t U_CALLCONV
PreferenceComparator(const void *context, const void *left, const void *right);

static int32_t U_CALLCONV
sortCollateComparator(const void *context, const void *left, const void *right);

static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right);

//  UVector support function, delete a Bucket.
//  The element is stored as void *, so it must be cast back to its real type
//  before delete; deleting through void * would not run the destructor.
static void U_CALLCONV
alphaIndex_deleteBucket(void *obj) {
    delete static_cast<AlphabeticIndex::Bucket *>(obj);
}

//  UVector support function, delete a Record.
//  Same reasoning as alphaIndex_deleteBucket(): restore the type before delete.
static void U_CALLCONV
alphaIndex_deleteRecord(void *obj) {
    delete static_cast<AlphabeticIndex::Record *>(obj);
}


//  NFKD normalizer used by PreferenceComparator. Set by staticInit() and
//  cleared (not deleted) by staticCleanup().
static const Normalizer2 *nfkdNormalizer;

//
//  Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
//  Append everything - individual characters are handled as strings of length 1.
//  The destination vector owns the appended strings.
static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) { UnicodeSetIterator setIter(source); while (setIter.next()) { const UnicodeString &str = setIter.getString(); dest.addElement(str.clone(), status); } } AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) { init(status); if (U_FAILURE(status)) { return; } locale_ = locale; langType_ = langTypeFromLocale(locale_); collator_ = Collator::createInstance(locale, status); if (collator_ != NULL) { collatorPrimaryOnly_ = collator_->clone(); } if (collatorPrimaryOnly_ != NULL) { collatorPrimaryOnly_->setStrength(Collator::PRIMARY); } getIndexExemplars(*initialLabels_, locale, status); indexBuildRequired_ = TRUE; if ((collator_ == NULL || collatorPrimaryOnly_ == NULL) && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } firstScriptCharacters_ = firstStringsInScript(status); } AlphabeticIndex::~AlphabeticIndex() { uhash_close(alreadyIn_); delete bucketList_; delete collator_; delete collatorPrimaryOnly_; delete firstScriptCharacters_; delete labels_; delete inputRecords_; delete noDistinctSorting_; delete notAlphabetic_; delete initialLabels_; } AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } initialLabels_->addAll(additions); return *this; } AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } UnicodeSet additions; getIndexExemplars(additions, locale, status); initialLabels_->addAll(additions); return *this; } int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { buildIndex(status); if (U_FAILURE(status)) { return 0; } return bucketList_->size(); } int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { if (U_FAILURE(status)) { return 0; } return inputRecords_->size(); } void AlphabeticIndex::buildIndex(UErrorCode &status) { if (U_FAILURE(status)) { return; } if 
(!indexBuildRequired_) { return; } // Discard any already-built data. // This is important when the user builds and uses an index, then subsequently modifies it, // necessitating a rebuild. bucketList_->removeAllElements(); labels_->removeAllElements(); uhash_removeAll(alreadyIn_); noDistinctSorting_->clear(); notAlphabetic_->clear(); // first sort the incoming Labels, with a "best" ordering among items // that are the same according to the collator UVector preferenceSorting(status); // Vector of UnicodeStrings; owned by the vector. preferenceSorting.setDeleter(uprv_deleteUObject); appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status); preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status); // We now make a set of Labels. // Some of the input may, however, be redundant. // That is, we might have c, ch, d, where "ch" sorts just like "c", "h" // So we make a pass through, filtering out those cases. // TODO: filtering these out would seem to be at odds with the eventual goal // of being able to split buckets that contain too many items. UnicodeSet labelSet; for (int32_t psIndex=0; psIndex(preferenceSorting.elementAt(psIndex)); // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet, // is it even possible for duplicates to show up in this check? if (labelSet.contains(item)) { UnicodeSetIterator itemAlreadyInIter(labelSet); while (itemAlreadyInIter.next()) { const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString(); if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) { UnicodeSet *targets = static_cast(uhash_get(alreadyIn_, &itemAlreadyIn)); if (targets == NULL) { // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet()); targets = new UnicodeSet(); uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status); } targets->add(item); break; } } } else if (item.moveIndex32(0, 1) < item.length() && // Label contains more than one code point. 
collatorPrimaryOnly_->compare(item, separated(item)) == 0) { noDistinctSorting_->add(item); } else if (!ALPHABETIC->containsSome(item)) { notAlphabetic_->add(item); } else { labelSet.add(item); } } // Move the set of Labels from the set into a vector, and sort // according to the collator. appendUnicodeSetToUVector(*labels_, labelSet, status); labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status); // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element // Implemented by copying the elements to be retained to a new UVector. const int32_t size = labelSet.size() - 1; if (size > maxLabelCount_) { UVector *newLabels = new UVector(status); newLabels->setDeleter(uprv_deleteUObject); int32_t count = 0; int32_t old = -1; for (int32_t srcIndex=0; srcIndexsize(); srcIndex++) { const UnicodeString *str = static_cast(labels_->elementAt(srcIndex)); ++count; const int32_t bump = count * maxLabelCount_ / size; if (bump == old) { // it.remove(); } else { newLabels->addElement(str->clone(), status); old = bump; } } delete labels_; labels_ = newLabels; } // We now know the list of labels. // Create a corresponding list of buckets, one per label. buildBucketList(status); // Corresponds to Java BucketList constructor. // Bin the Records into the Buckets. bucketRecords(status); indexBuildRequired_ = FALSE; resetBucketIterator(status); } // // buildBucketList() Corresponds to the BucketList constructor in the Java version. void AlphabeticIndex::buildBucketList(UErrorCode &status) { UnicodeString labelStr = getUnderflowLabel(); Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status); bucketList_->addElement(b, status); // Build up the list, adding underflow, additions, overflow // insert infix labels as needed, using \uFFFF. 
const UnicodeString *last = static_cast(labels_->elementAt(0)); b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status); bucketList_->addElement(b, status); UnicodeSet lastSet; UnicodeSet set; AlphabeticIndex::getScriptSet(lastSet, *last, status); lastSet.removeAll(*IGNORE_SCRIPTS); for (int i = 1; i < labels_->size(); ++i) { UnicodeString *current = static_cast(labels_->elementAt(i)); getScriptSet(set, *current, status); set.removeAll(*IGNORE_SCRIPTS); if (lastSet.containsNone(set)) { // check for adjacent const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status); if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) { labelStr = getInflowLabel(); b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status); bucketList_->addElement(b, status); i++; lastSet = set; } } b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status); bucketList_->addElement(b, status); last = current; lastSet = set; } const UnicodeString &limitString = getOverflowComparisonString(*last, status); b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status); bucketList_->addElement(b, status); // final overflow bucket } // // Place all of the raw input records into the correct bucket. // // Begin by sorting the input records; this lets us bin them in a single pass. // // Note on storage management: The input records are owned by the // inputRecords_ vector, and will (eventually) be auto-deleted by it. // The Bucket objects have pointers to the Record objects, but do not own them. // void AlphabeticIndex::bucketRecords(UErrorCode &status) { if (U_FAILURE(status)) { return; } inputRecords_->sortWithUComparator(recordCompareFn, collator_, status); U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow // bucket, even if no user labels. 
int32_t bucketIndex = 0; Bucket *destBucket = static_cast(bucketList_->elementAt(bucketIndex)); Bucket *nextBucket = NULL; if (bucketIndex+1 < bucketList_->size()) { nextBucket = static_cast(bucketList_->elementAt(bucketIndex+1)); } int32_t recordIndex = 0; Record *r = static_cast(inputRecords_->elementAt(recordIndex)); while (recordIndex < inputRecords_->size()) { if (nextBucket == NULL || collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) { // Record goes in current bucket. Advance to next record, // stay on current bucket. destBucket->records_->addElement(r, status); ++recordIndex; r = static_cast(inputRecords_->elementAt(recordIndex)); } else { // Advance to the next bucket, stay on current record. bucketIndex++; destBucket = nextBucket; if (bucketIndex+1 < bucketList_->size()) { nextBucket = static_cast(bucketList_->elementAt(bucketIndex+1)); } else { nextBucket = NULL; } U_ASSERT(destBucket != NULL); } } } void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) { if (U_FAILURE(status)) { return; } LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); UnicodeSet exemplars; ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); if (U_SUCCESS(status)) { dest.addAll(exemplars); return; } status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR // Locale data did not include explicit Index characters. // Synthesize a set of them from the locale's standard exemplar characters. ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); if (U_FAILURE(status)) { return; } // Upper-case any that aren't already so. // (We only do this for synthesized index characters.) 
UnicodeSetIterator it(exemplars); UnicodeString upperC; UnicodeSet lowersToRemove; UnicodeSet uppersToAdd; while (it.next()) { const UnicodeString &exemplarC = it.getString(); upperC = exemplarC; upperC.toUpper(locale); if (exemplarC != upperC) { lowersToRemove.add(exemplarC); uppersToAdd.add(upperC); } } exemplars.removeAll(lowersToRemove); exemplars.addAll(uppersToAdd); // get the exemplars, and handle special cases // question: should we add auxiliary exemplars? if (exemplars.containsSome(*CORE_LATIN)) { exemplars.addAll(*CORE_LATIN); } if (exemplars.containsSome(*HANGUL)) { // cut down to small list UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status); exemplars.removeAll(BLOCK_HANGUL_SYLLABLES); exemplars.addAll(*HANGUL); } if (exemplars.containsSome(*ETHIOPIC)) { // cut down to small list // make use of the fact that Ethiopic is allocated in 8's, where // the base is 0 mod 8. UnicodeSetIterator it(*ETHIOPIC); while (it.next() && !it.isString()) { if ((it.getCodepoint() & 0x7) != 0) { exemplars.remove(it.getCodepoint()); } } } dest.addAll(exemplars); } /* * Return the string with interspersed CGJs. Input must have more than 2 codepoints. */ static const UChar32 CGJ = (UChar)0x034F; UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { UnicodeString result; if (item.length() == 0) { return result; } int32_t i = 0; for (;;) { UChar32 cp = item.char32At(i); result.append(cp); i = item.moveIndex32(i, 1); if (i >= item.length()) { break; } result.append(CGJ); } return result; } UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { return FALSE; } UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { return FALSE; } const RuleBasedCollator &AlphabeticIndex::getCollator() const { // There are no known non-RuleBasedCollator collators, and none ever expected. // But, in case that changes, better a null pointer than a wrong type. 
return *dynamic_cast(collator_); } const UnicodeString &AlphabeticIndex::getInflowLabel() const { return inflowLabel_; } const UnicodeString &AlphabeticIndex::getOverflowLabel() const { return overflowLabel_; } const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { return underflowLabel_; } AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { inflowLabel_ = label; indexBuildRequired_ = TRUE; return *this; } AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { overflowLabel_ = label; indexBuildRequired_ = TRUE; return *this; } AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { underflowLabel_ = label; indexBuildRequired_ = TRUE; return *this; } int32_t AlphabeticIndex::getMaxLabelCount() const { return maxLabelCount_; } AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } if (maxLabelCount <= 0) { status = U_ILLEGAL_ARGUMENT_ERROR; return *this; } maxLabelCount_ = maxLabelCount; if (maxLabelCount < bucketList_->size()) { indexBuildRequired_ = TRUE; } return *this; } const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) { for (int32_t i=0; isize(); i++) { const UnicodeString *s = static_cast(firstScriptCharacters_->elementAt(i)); if (collator_->compare(*s, lowerLimit) > 0) { return *s; } } return *EMPTY_STRING; } UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) { if (U_FAILURE(status)) { return &dest; } UChar32 cp = codePoint.char32At(0); UScriptCode scriptCode = uscript_getScript(cp, &status); dest.applyIntPropertyValue(UCHAR_SCRIPT, scriptCode, status); return &dest; } // // init() - Common code for constructors. 
// void AlphabeticIndex::init(UErrorCode &status) { // Initialize statics if needed. AlphabeticIndex::staticInit(status); // Put the object into a known state so that the destructor will function. alreadyIn_ = NULL; bucketList_ = NULL; collator_ = NULL; collatorPrimaryOnly_ = NULL; currentBucket_ = NULL; firstScriptCharacters_ = NULL; initialLabels_ = NULL; indexBuildRequired_ = TRUE; inputRecords_ = NULL; itemsIterIndex_ = 0; labels_ = NULL; labelsIterIndex_ = 0; maxLabelCount_ = 99; noDistinctSorting_ = NULL; notAlphabetic_ = NULL; recordCounter_ = 0; if (U_FAILURE(status)) { return; } alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash, uhash_compareUnicodeString, // key Comparator, NULL, // value Comparator &status); uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject); uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject); bucketList_ = new UVector(status); bucketList_->setDeleter(alphaIndex_deleteBucket); labels_ = new UVector(status); labels_->setDeleter(uprv_deleteUObject); labels_->setComparer(uhash_compareUnicodeString); inputRecords_ = new UVector(status); inputRecords_->setDeleter(alphaIndex_deleteRecord); noDistinctSorting_ = new UnicodeSet(); notAlphabetic_ = new UnicodeSet(); initialLabels_ = new UnicodeSet(); inflowLabel_.remove(); inflowLabel_.append((UChar)0x2026); // Ellipsis overflowLabel_ = inflowLabel_; underflowLabel_ = inflowLabel_; // TODO: check for memory allocation failures. } static UBool indexCharactersAreInitialized = FALSE; // Index Characters Clean up function. Delete statically allocated constant stuff. 
U_CDECL_BEGIN static UBool U_CALLCONV indexCharacters_cleanup(void) { AlphabeticIndex::staticCleanup(); return TRUE; } U_CDECL_END void AlphabeticIndex::staticCleanup() { delete ALPHABETIC; ALPHABETIC = NULL; delete HANGUL; HANGUL = NULL; delete ETHIOPIC; ETHIOPIC = NULL; delete CORE_LATIN; CORE_LATIN = NULL; delete IGNORE_SCRIPTS; IGNORE_SCRIPTS = NULL; delete TO_TRY; TO_TRY = NULL; delete UNIHAN; UNIHAN = NULL; delete EMPTY_STRING; EMPTY_STRING = NULL; nfkdNormalizer = NULL; // ref to a singleton. Do not delete. indexCharactersAreInitialized = FALSE; } UnicodeSet *AlphabeticIndex::ALPHABETIC; UnicodeSet *AlphabeticIndex::HANGUL; UnicodeSet *AlphabeticIndex::ETHIOPIC; UnicodeSet *AlphabeticIndex::CORE_LATIN; UnicodeSet *AlphabeticIndex::IGNORE_SCRIPTS; UnicodeSet *AlphabeticIndex::TO_TRY; UnicodeSet *AlphabeticIndex::UNIHAN; const UnicodeString *AlphabeticIndex::EMPTY_STRING; // // staticInit() One-time initialization of constants. // Thread safe. Called from constructors. // Mutex overhead is not a concern. AlphabeticIndex constructors are // sufficiently heavy that the cost of the mutex check is not significant. void AlphabeticIndex::staticInit(UErrorCode &status) { static UMTX IndexCharsInitMutex; Mutex mutex(&IndexCharsInitMutex); if (indexCharactersAreInitialized || U_FAILURE(status)) { return; } UBool finishedInit = FALSE; { UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]"); ALPHABETIC = new UnicodeSet(alphaString, status); if (ALPHABETIC == NULL) { goto err; } HANGUL = new UnicodeSet(); HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC). 
add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558); if (HANGUL== NULL) { goto err; } UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); ETHIOPIC = new UnicodeSet(EthiopicStr, status); if (ETHIOPIC == NULL) { goto err; } CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z'); if (CORE_LATIN == NULL) { goto err; } UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE( "[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]"); IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status); IGNORE_SCRIPTS->freeze(); if (IGNORE_SCRIPTS == NULL) { goto err; } UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]"); TO_TRY = new UnicodeSet(nfcqcStr, status); if (TO_TRY == NULL) { goto err; } UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]"); UNIHAN = new UnicodeSet(unihanStr, status); if (UNIHAN == NULL) { goto err; } EMPTY_STRING = new UnicodeString(); nfkdNormalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, status); if (nfkdNormalizer == NULL) { goto err; } } finishedInit = TRUE; err: if (!finishedInit && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } if (U_FAILURE(status)) { indexCharacters_cleanup(); return; } ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup); indexCharactersAreInitialized = TRUE; } // // Comparison function for UVector sorting with a collator. 
// static int32_t U_CALLCONV sortCollateComparator(const void *context, const void *left, const void *right) { const UElement *leftElement = static_cast(left); const UElement *rightElement = static_cast(right); const UnicodeString *leftString = static_cast(leftElement->pointer); const UnicodeString *rightString = static_cast(rightElement->pointer); const Collator *col = static_cast(context); if (leftString == rightString) { // Catches case where both are NULL return 0; } if (leftString == NULL) { return 1; }; if (rightString == NULL) { return -1; } Collator::EComparisonResult r = col->compare(*leftString, *rightString); return (int32_t) r; } // // Comparison function for UVector sorting with a collator. // static int32_t U_CALLCONV recordCompareFn(const void *context, const void *left, const void *right) { const UElement *leftElement = static_cast(left); const UElement *rightElement = static_cast(right); const AlphabeticIndex::Record *leftRec = static_cast(leftElement->pointer); const AlphabeticIndex::Record *rightRec = static_cast(rightElement->pointer); const Collator *col = static_cast(context); Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_); if (r == Collator::EQUAL) { if (leftRec->serialNumber_ < rightRec->serialNumber_) { r = Collator::LESS; } else if (leftRec->serialNumber_ > rightRec->serialNumber_) { r = Collator::GREATER; } } return (int32_t) r; } #if 0 // // First characters in scripts. // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. // The vector is sorted according to this index's collation. // // This code is too slow to use, so for now hard code the data. // Hard coded implementation is follows. 
// UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) { if (U_FAILURE(status)) { return NULL; } UnicodeString results[USCRIPT_CODE_LIMIT]; UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a"); UnicodeSetIterator siter(*TO_TRY); while (siter.next()) { const UnicodeString ¤t = siter.getString(); Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A); if (r < 0) { // TODO fix; we only want "real" script characters, not // symbols. continue; } int script = uscript_getScript(current.char32At(0), &status); if (results[script].length() == 0) { results[script] = current; } else if (ruleBasedCollator->compare(current, results[script]) < 0) { results[script] = current; } } UnicodeSet extras; UnicodeSet expansions; RuleBasedCollator *rbc = dynamic_cast(ruleBasedCollator); const UCollator *uRuleBasedCollator = rbc->getUCollator(); ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status); extras.addAll(expansions).removeAll(*TO_TRY); if (extras.size() != 0) { const Normalizer2 *normalizer = Normalizer2::getInstance(NULL, "nfkc", UNORM2_COMPOSE, status); UnicodeSetIterator extrasIter(extras); while (extrasIter.next()) { const UnicodeString ¤t = extrasIter.next(); if (!TO_TRY->containsAll(current)) continue; if (!normalizer->isNormalized(current, status) || ruleBasedCollator->compare(current, LOWER_A) < 0) { continue; } int script = uscript_getScript(current.char32At(0), &status); if (results[script].length() == 0) { results[script] = current; } else if (ruleBasedCollator->compare(current, results[script]) < 0) { results[script] = current; } } } UVector *dest = new UVector(status); dest->setDeleter(uprv_deleteUObject); for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) { if (results[i].length() > 0) { dest->addElement(results[i].clone(), status); } } dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status); return dest; } #endif // 
// First characters in scripts. // Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script. // The vector is sorted according to this index's collation. // // It takes too much time to compute this from character properties, so hard code it for now. // Character constants copied from corresponding declaration in ICU4J. static UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 0x61, 0, 0x03B1, 0, 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 0xABC0, 0, 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 0x1B83, 0, 0xD802, 0xDE00, 0, 0x0E01, 0, 0x0E81, 0, 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 0xD800, 0xDE80, 0, 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, 0xD801, 0xDC80, 0, 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 0x4E00, 0 }; UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { if (U_FAILURE(status)) { return NULL; } UVector *dest = new UVector(status); if (dest == NULL && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; return NULL; } dest->setDeleter(uprv_deleteUObject); const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / 
sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]); do { if (U_FAILURE(status)) { return dest; } UnicodeString *str = new UnicodeString(src, -1); if (str == NULL) { status = U_MEMORY_ALLOCATION_ERROR; } dest->addElement(str, status); src += str->length() + 1; } while (src < limit); dest->sortWithUComparator(sortCollateComparator, collator_, status); return dest; } AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) { const char *lang = loc.getLanguage(); if (uprv_strcmp(lang, "zh") != 0) { return kNormal; } const char *script = loc.getScript(); if (uprv_strcmp(script, "Hant") == 0) { return kTraditional; } const char *country = loc.getCountry(); if (uprv_strcmp(country, "TW") == 0) { return kTraditional; } return kSimplified; } // // Pinyin Hacks. Direct port from Java. // static const UChar32 probeCharInLong = 0x28EAD; static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz" 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; // Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code. 
AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_SHORT = { {(UChar)0, (UChar)0, (UChar)0}, // A {(UChar)0x516B, (UChar)0, (UChar)0}, // B {(UChar)0x5693, (UChar)0, (UChar)0}, // C {(UChar)0x5491, (UChar)0, (UChar)0}, // D {(UChar)0x59B8, (UChar)0, (UChar)0}, // E {(UChar)0x53D1, (UChar)0, (UChar)0}, // F {(UChar)0x65EE, (UChar)0, (UChar)0}, // G {(UChar)0x54C8, (UChar)0, (UChar)0}, // H {(UChar)0x4E0C, (UChar)0, (UChar)0}, // J {(UChar)0x5494, (UChar)0, (UChar)0}, // K {(UChar)0x5783, (UChar)0, (UChar)0}, // L {(UChar)0x5452, (UChar)0, (UChar)0}, // M {(UChar)0x5514, (UChar)0, (UChar)0}, // N {(UChar)0x5594, (UChar)0, (UChar)0}, // O {(UChar)0x5991, (UChar)0, (UChar)0}, // P {(UChar)0x4E03, (UChar)0, (UChar)0}, // Q {(UChar)0x513F, (UChar)0, (UChar)0}, // R {(UChar)0x4EE8, (UChar)0, (UChar)0}, // S {(UChar)0x4ED6, (UChar)0, (UChar)0}, // T {(UChar)0x7A75, (UChar)0, (UChar)0}, // W {(UChar)0x5915, (UChar)0, (UChar)0}, // X {(UChar)0x4E2B, (UChar)0, (UChar)0}, // Y {(UChar)0x5E00, (UChar)0, (UChar)0}, // Z {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array }; static const UChar PINYIN_LOWER_BOUNDS_LONG[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"; 0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D, /*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A}; AlphabeticIndex::PinyinLookup AlphabeticIndex::HACK_PINYIN_LOOKUP_LONG = { {(UChar)0, (UChar)0, (UChar)0}, // A {(UChar)0x516B, (UChar)0, (UChar)0}, // b {(UChar)0xD863, (UChar)0xDEAD, (UChar)0}, // c {(UChar)0xD844, (UChar)0xDE51, (UChar)0}, // d {(UChar)0x59B8, (UChar)0, (UChar)0}, // e {(UChar)0x53D1, (UChar)0, (UChar)0}, // f {(UChar)0xD844, (UChar)0xDE45, (UChar)0}, // g {(UChar)0x54C8, (UChar)0, (UChar)0}, // h {(UChar)0x4E0C, (UChar)0, (UChar)0}, // j {(UChar)0x5494, (UChar)0, (UChar)0}, // k {(UChar)0x3547, (UChar)0, (UChar)0}, // l {(UChar)0x5452, (UChar)0, (UChar)0}, // m {(UChar)0x5514, (UChar)0, (UChar)0}, // n 
{(UChar)0x5594, (UChar)0, (UChar)0}, // o {(UChar)0xD84F, (UChar)0xDC7A, (UChar)0}, // p {(UChar)0x4E03, (UChar)0, (UChar)0}, // q {(UChar)0x513F, (UChar)0, (UChar)0}, // r {(UChar)0x4EE8, (UChar)0, (UChar)0}, // s {(UChar)0x4ED6, (UChar)0, (UChar)0}, // t {(UChar)0x7A75, (UChar)0, (UChar)0}, // w {(UChar)0x5915, (UChar)0, (UChar)0}, // x {(UChar)0x4E2B, (UChar)0, (UChar)0}, // y {(UChar)0x5E00, (UChar)0, (UChar)0}, // z {(UChar)0xFFFF, (UChar)0, (UChar)0}, // mark end of array }; // // Probe the collation data, and decide which Pinyin tables should be used // // ICU can be built with a choice between two Chinese collations. // The hack Pinyin tables to use depend on which one is in use. // We can assume that any given copy of ICU will have only one of the collations available, // and that there is no way, in a given process, to create two alphabetic indexes using // different Chinese collations. Which means the probe can be done once // and the results cached. // // This whole arrangement is temporary. // AlphabeticIndex::PinyinLookup *AlphabeticIndex::HACK_PINYIN_LOOKUP = NULL; const UChar *AlphabeticIndex::PINYIN_LOWER_BOUNDS = NULL; void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) { { Mutex m; if (PINYIN_LOWER_BOUNDS != NULL) { return; } } UnicodeSet *colSet = col->getTailoredSet(status); if (U_FAILURE(status) || colSet == NULL) { delete colSet; if (U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } return; } UBool useLongTables = colSet->contains(probeCharInLong); delete colSet; { Mutex m; if (useLongTables) { PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG; HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG; } else { PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT; HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT; } } } // Pinyin Hack: // Modify a Chinese name by prepending a Latin letter. The modified name is used // when putting records (names) into buckets, to put the name under a Latin index heading. 
void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) { if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) { dest = name; return; } UErrorCode status = U_ZERO_ERROR; initPinyinBounds(col, status); if (U_FAILURE(status)) { dest = name; return; } // TODO: use binary search int index; for (index=0; ; index++) { if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) { index--; break; } int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1)); if (compareResult < 0) { index--; } if (compareResult <= 0) { break; } } UChar c = PINYIN_LOWER_BOUNDS[index]; dest.setTo(c); dest.append(name); return; } /** * Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is * better, and otherwise binary order is better. * * For use with array sort or UVector. * @param context A UErrorCode pointer. * @param left A UElement pointer, which must refer to a UnicodeString * * @param right A UElement pointer, which must refer to a UnicodeString * */ static int32_t U_CALLCONV PreferenceComparator(const void *context, const void *left, const void *right) { const UElement *leftElement = static_cast(left); const UElement *rightElement = static_cast(right); const UnicodeString *s1 = static_cast(leftElement->pointer); const UnicodeString *s2 = static_cast(rightElement->pointer); UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const. if (s1 == s2) { return 0; } UnicodeString n1 = nfkdNormalizer->normalize(*s1, status); UnicodeString n2 = nfkdNormalizer->normalize(*s2, status); int32_t result = n1.length() - n2.length(); if (result != 0) { return result; } result = n1.compareCodePointOrder(n2); if (result != 0) { return result; } return s1->compareCodePointOrder(*s2); } // // Constructor & Destructor for AlphabeticIndex::Record // // Records are internal only, instances are not directly surfaced in the public API. 
// This class is mostly struct-like, with all public fields. AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data): alphaIndex_(alphaIndex), name_(name), data_(data) { UnicodeString prefixedName; alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_); serialNumber_ = ++alphaIndex->recordCounter_; } AlphabeticIndex::Record::~Record() { } AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { if (U_FAILURE(status)) { return *this; } Record *r = new Record(this, name, data); inputRecords_->addElement(r, status); indexBuildRequired_ = TRUE; //std::string ss; //std::string ss2; //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; return *this; } AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { if (U_FAILURE(status)) { return *this; } inputRecords_->removeAllElements(); indexBuildRequired_ = TRUE; return *this; } int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { buildIndex(status); if (U_FAILURE(status)) { return 0; } // For simplified Chinese prepend a prefix to the name. // For non-Chinese locales or non-Chinese names, the name is not modified. UnicodeString prefixedName; hackName(prefixedName, name, collatorPrimaryOnly_); // TODO: use a binary search. for (int32_t i = 0; i < bucketList_->size(); ++i) { Bucket *bucket = static_cast(bucketList_->elementAt(i)); Collator::EComparisonResult comp = collatorPrimaryOnly_->compare(prefixedName, bucket->lowerBoundary_); if (comp < 0) { return i - 1; } } // Loop runs until we find the bucket following the one that would hold prefixedName. // If the prefixedName belongs in the last bucket the loop will drop out the bottom rather // than returning from the middle. 
return bucketList_->size() - 1; } int32_t AlphabeticIndex::getBucketIndex() const { return labelsIterIndex_; } UBool AlphabeticIndex::nextBucket(UErrorCode &status) { if (U_FAILURE(status)) { return FALSE; } if (indexBuildRequired_ && currentBucket_ != NULL) { status = U_ENUM_OUT_OF_SYNC_ERROR; return FALSE; } buildIndex(status); if (U_FAILURE(status)) { return FALSE; } ++labelsIterIndex_; if (labelsIterIndex_ >= bucketList_->size()) { labelsIterIndex_ = bucketList_->size(); return FALSE; } currentBucket_ = static_cast(bucketList_->elementAt(labelsIterIndex_)); resetRecordIterator(); return TRUE; } const UnicodeString &AlphabeticIndex::getBucketLabel() const { if (currentBucket_ != NULL) { return currentBucket_->label_; } else { return *EMPTY_STRING; } } UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { if (currentBucket_ != NULL) { return currentBucket_->labelType_; } else { return U_ALPHAINDEX_NORMAL; } } int32_t AlphabeticIndex::getBucketRecordCount() const { if (currentBucket_ != NULL) { return currentBucket_->records_->size(); } else { return 0; } } AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { if (U_FAILURE(status)) { return *this; } buildIndex(status); labelsIterIndex_ = -1; currentBucket_ = NULL; return *this; } UBool AlphabeticIndex::nextRecord(UErrorCode &status) { if (U_FAILURE(status)) { return FALSE; } if (currentBucket_ == NULL) { // We are trying to iterate over the items in a bucket, but there is no // current bucket from the enumeration of buckets. 
status = U_INVALID_STATE_ERROR; return FALSE; } if (indexBuildRequired_) { status = U_ENUM_OUT_OF_SYNC_ERROR; return FALSE; } ++itemsIterIndex_; if (itemsIterIndex_ >= currentBucket_->records_->size()) { itemsIterIndex_ = currentBucket_->records_->size(); return FALSE; } return TRUE; } const UnicodeString &AlphabeticIndex::getRecordName() const { const UnicodeString *retStr = EMPTY_STRING; if (currentBucket_ != NULL && itemsIterIndex_ >= 0 && itemsIterIndex_ < currentBucket_->records_->size()) { Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); retStr = &item->name_; } return *retStr; } const void *AlphabeticIndex::getRecordData() const { const void *retPtr = NULL; if (currentBucket_ != NULL && itemsIterIndex_ >= 0 && itemsIterIndex_ < currentBucket_->records_->size()) { Record *item = static_cast(currentBucket_->records_->elementAt(itemsIterIndex_)); retPtr = item->data_; } return retPtr; } AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { itemsIterIndex_ = -1; return *this; } AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, const UnicodeString &lowerBoundary, UAlphabeticIndexLabelType type, UErrorCode &status): label_(label), lowerBoundary_(lowerBoundary), labelType_(type), records_(NULL) { if (U_FAILURE(status)) { return; } records_ = new UVector(status); if (records_ == NULL && U_SUCCESS(status)) { status = U_MEMORY_ALLOCATION_ERROR; } } AlphabeticIndex::Bucket::~Bucket() { delete records_; } U_NAMESPACE_END #endif