scuffed-code/icu4c/source/i18n/alphaindex.cpp
2019-08-23 11:45:36 -07:00

1243 lines
41 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_COLLATION
#include "unicode/alphaindex.h"
#include "unicode/coll.h"
#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
#include "unicode/tblcoll.h"
#include "unicode/uchar.h"
#include "unicode/ulocdata.h"
#include "unicode/uniset.h"
#include "unicode/uobject.h"
#include "unicode/usetiter.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cstring.h"
#include "uassert.h"
#include "uvector.h"
#include "uvectr64.h"
//#include <string>
//#include <iostream>
U_NAMESPACE_BEGIN
namespace {
/**
* Prefix string for Chinese index buckets.
* See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
*/
const UChar BASE[1] = { 0xFDD0 };
const int32_t BASE_LENGTH = 1;
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
const UnicodeString &one, const UnicodeString &other);
} // namespace
static int32_t U_CALLCONV
collatorComparator(const void *context, const void *left, const void *right);
static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right);
// UVector<Record *> support function, delete a Record.
static void U_CALLCONV
alphaIndex_deleteRecord(void *obj) {
delete static_cast<AlphabeticIndex::Record *>(obj);
}
namespace {
UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return NULL; }
if (owned.isValid()) {
return owned.orphan();
}
UnicodeString *p = new UnicodeString(s);
if (p == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
}
return p;
}
inline UnicodeString *getString(const UVector &list, int32_t i) {
return static_cast<UnicodeString *>(list[i]);
}
inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
return static_cast<AlphabeticIndex::Bucket *>(list[i]);
}
inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
return static_cast<AlphabeticIndex::Record *>(list[i]);
}
/**
* Like Java Collections.binarySearch(List, String, Comparator).
*
* @return the index>=0 where the item was found,
* or the index<0 for inserting the string at ~index in sorted order
*/
int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
if (list.size() == 0) { return ~0; }
int32_t start = 0;
int32_t limit = list.size();
for (;;) {
int32_t i = (start + limit) / 2;
const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
UErrorCode errorCode = U_ZERO_ERROR;
UCollationResult cmp = coll.compare(s, *si, errorCode);
if (cmp == UCOL_EQUAL) {
return i;
} else if (cmp < 0) {
if (i == start) {
return ~start; // insert s before *si
}
limit = i;
} else {
if (i == start) {
return ~(start + 1); // insert s after *si
}
start = i;
}
}
}
} // namespace
// The BucketList is not in the anonymous namespace because only Clang
// seems to support its use in other classes from there.
// However, we also don't need U_I18N_API because it is not used from outside the i18n library.
class BucketList : public UObject {
public:
BucketList(UVector *bucketList, UVector *publicBucketList)
: bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
int32_t displayIndex = 0;
for (int32_t i = 0; i < publicBucketList->size(); ++i) {
getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
}
}
// The virtual destructor must not be inline.
// See ticket #8454 for details.
virtual ~BucketList();
int32_t getBucketCount() const {
return immutableVisibleList_->size();
}
int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
UErrorCode &errorCode) {
// binary search
int32_t start = 0;
int32_t limit = bucketList_->size();
while ((start + 1) < limit) {
int32_t i = (start + limit) / 2;
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
UCollationResult nameVsBucket =
collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
if (nameVsBucket < 0) {
limit = i;
} else {
start = i;
}
}
const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
if (bucket->displayBucket_ != NULL) {
bucket = bucket->displayBucket_;
}
return bucket->displayIndex_;
}
/** All of the buckets, visible and invisible. */
UVector *bucketList_;
/** Just the visible buckets. */
UVector *immutableVisibleList_;
};
BucketList::~BucketList() {
delete bucketList_;
if (immutableVisibleList_ != bucketList_) {
delete immutableVisibleList_;
}
}
AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
delete buckets_;
delete collatorPrimaryOnly_;
}
int32_t
AlphabeticIndex::ImmutableIndex::getBucketCount() const {
return buckets_->getBucketCount();
}
int32_t
AlphabeticIndex::ImmutableIndex::getBucketIndex(
const UnicodeString &name, UErrorCode &errorCode) const {
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
}
const AlphabeticIndex::Bucket *
AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
if (0 <= index && index < buckets_->getBucketCount()) {
return icu::getBucket(*buckets_->immutableVisibleList_, index);
} else {
return NULL;
}
}
AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
: inputList_(NULL),
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
maxLabelCount_(99),
initialLabels_(NULL), firstCharsInScripts_(NULL),
collator_(NULL), collatorPrimaryOnly_(NULL),
buckets_(NULL) {
init(&locale, status);
}
AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
: inputList_(NULL),
labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
maxLabelCount_(99),
initialLabels_(NULL), firstCharsInScripts_(NULL),
collator_(collator), collatorPrimaryOnly_(NULL),
buckets_(NULL) {
init(NULL, status);
}
AlphabeticIndex::~AlphabeticIndex() {
delete collator_;
delete collatorPrimaryOnly_;
delete firstCharsInScripts_;
delete buckets_;
delete inputList_;
delete initialLabels_;
}
AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
initialLabels_->addAll(additions);
clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
addIndexExemplars(locale, status);
clearBuckets();
return *this;
}
AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
if (U_FAILURE(errorCode)) { return NULL; }
// In C++, the ImmutableIndex must own its copy of the BucketList,
// even if it contains no records, for proper memory management.
// We could clone the buckets_ if they are not NULL,
// but that would be worth it only if this method is called multiple times,
// or called after using the old-style bucket iterator API.
LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
LocalPointer<RuleBasedCollator> coll(collatorPrimaryOnly_->clone());
if (immutableBucketList.isNull() || coll.isNull()) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
if (immIndex == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
// The ImmutableIndex adopted its parameter objects.
immutableBucketList.orphan();
coll.orphan();
return immIndex;
}
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
initBuckets(status);
if (U_FAILURE(status)) {
return 0;
}
return buckets_->getBucketCount();
}
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
if (U_FAILURE(status) || inputList_ == NULL) {
return 0;
}
return inputList_->size();
}
void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
if (U_FAILURE(errorCode)) { return; }
const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
const UnicodeString &overflowBoundary =
*getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
// We make a sorted array of elements.
// Some of the input may be redundant.
// That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
// We filter out those cases.
UnicodeSetIterator iter(*initialLabels_);
while (iter.next()) {
const UnicodeString *item = &iter.getString();
LocalPointer<UnicodeString> ownedItem;
UBool checkDistinct;
int32_t itemLength = item->length();
if (!item->hasMoreChar32Than(0, itemLength, 1)) {
checkDistinct = FALSE;
} else if(item->charAt(itemLength - 1) == 0x2a && // '*'
item->charAt(itemLength - 2) != 0x2a) {
// Use a label if it is marked with one trailing star,
// even if the label string sorts the same when all contractions are suppressed.
ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
item = ownedItem.getAlias();
if (item == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
checkDistinct = FALSE;
} else {
checkDistinct = TRUE;
}
if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
// Ignore a primary-ignorable or non-alphabetic index character.
} else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
// Ignore an index character that will land in the overflow bucket.
} else if (checkDistinct &&
collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
// Ignore a multi-code point index character that does not sort distinctly
// from the sequence of its separate characters.
} else {
int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
if (insertionPoint < 0) {
indexCharacters.insertElementAt(
ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
} else {
const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
indexCharacters.setElementAt(
ownedString(*item, ownedItem, errorCode), insertionPoint);
}
}
}
}
if (U_FAILURE(errorCode)) { return; }
// if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
int32_t size = indexCharacters.size() - 1;
if (size > maxLabelCount_) {
int32_t count = 0;
int32_t old = -1;
for (int32_t i = 0; i < indexCharacters.size();) {
++count;
int32_t bump = count * maxLabelCount_ / size;
if (bump == old) {
indexCharacters.removeElementAt(i);
} else {
old = bump;
++i;
}
}
}
}
namespace {
const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
if (!current.startsWith(BASE, BASE_LENGTH)) {
return current;
}
UChar rest = current.charAt(BASE_LENGTH);
if (0x2800 < rest && rest <= 0x28FF) { // stroke count
int32_t count = rest-0x2800;
temp.setTo((UChar)(0x30 + count % 10));
if (count >= 10) {
count /= 10;
temp.insert(0, (UChar)(0x30 + count % 10));
if (count >= 10) {
count /= 10;
temp.insert(0, (UChar)(0x30 + count));
}
}
return temp.append((UChar)0x5283);
}
return temp.setTo(current, BASE_LENGTH);
}
UBool hasMultiplePrimaryWeights(
const RuleBasedCollator &coll, uint32_t variableTop,
const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) {
ces.removeAllElements();
coll.internalGetCEs(s, ces, errorCode);
if (U_FAILURE(errorCode)) { return FALSE; }
UBool seenPrimary = FALSE;
for (int32_t i = 0; i < ces.size(); ++i) {
int64_t ce = ces.elementAti(i);
uint32_t p = (uint32_t)(ce >> 32);
if (p > variableTop) {
// not primary ignorable
if (seenPrimary) {
return TRUE;
}
seenPrimary = TRUE;
}
}
return FALSE;
}
} // namespace
BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
// Initialize indexCharacters.
UVector indexCharacters(errorCode);
indexCharacters.setDeleter(uprv_deleteUObject);
initLabels(indexCharacters, errorCode);
if (U_FAILURE(errorCode)) { return NULL; }
// Variables for hasMultiplePrimaryWeights().
UVector64 ces(errorCode);
uint32_t variableTop;
if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
variableTop = collatorPrimaryOnly_->getVariableTop(errorCode);
} else {
variableTop = 0;
}
UBool hasInvisibleBuckets = FALSE;
// Helper arrays for Chinese Pinyin collation.
Bucket *asciiBuckets[26] = {
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
Bucket *pinyinBuckets[26] = {
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
};
UBool hasPinyin = FALSE;
LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode);
if (U_FAILURE(errorCode)) {
return NULL;
}
bucketList->setDeleter(uprv_deleteUObject);
// underflow bucket
Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
if (bucket == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList->addElement(bucket, errorCode);
if (U_FAILURE(errorCode)) { return NULL; }
UnicodeString temp;
// fix up the list, adding underflow, additions, overflow
// Insert inflow labels as needed.
int32_t scriptIndex = -1;
const UnicodeString *scriptUpperBoundary = &emptyString_;
for (int32_t i = 0; i < indexCharacters.size(); ++i) {
UnicodeString &current = *getString(indexCharacters, i);
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
// We crossed the script boundary into a new script.
const UnicodeString &inflowBoundary = *scriptUpperBoundary;
UBool skippedScript = FALSE;
for (;;) {
scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
break;
}
skippedScript = TRUE;
}
if (skippedScript && bucketList->size() > 1) {
// We are skipping one or more scripts,
// and we are not just getting out of the underflow label.
bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
if (bucket == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList->addElement(bucket, errorCode);
}
}
// Add a bucket with the current label.
bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
if (bucket == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList->addElement(bucket, errorCode);
// Remember ASCII and Pinyin buckets for Pinyin redirects.
UChar c;
if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z
asciiBuckets[c - 0x41] = bucket;
} else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
pinyinBuckets[c - 0x41] = bucket;
hasPinyin = TRUE;
}
// Check for multiple primary weights.
if (!current.startsWith(BASE, BASE_LENGTH) &&
hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current,
ces, errorCode) &&
current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
// "AE-ligature" or "Sch" etc.
for (int32_t j = bucketList->size() - 2;; --j) {
Bucket *singleBucket = getBucket(*bucketList, j);
if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
// There is no single-character bucket since the last
// underflow or inflow label.
break;
}
if (singleBucket->displayBucket_ == NULL &&
!hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop,
singleBucket->lowerBoundary_,
ces, errorCode)) {
// Add an invisible bucket that redirects strings greater than the expansion
// to the previous single-character bucket.
// For example, after ... Q R S Sch we add Sch\uFFFF->S
// and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
bucket = new Bucket(emptyString_,
UnicodeString(current).append((UChar)0xFFFF),
U_ALPHAINDEX_NORMAL);
if (bucket == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucket->displayBucket_ = singleBucket;
bucketList->addElement(bucket, errorCode);
hasInvisibleBuckets = TRUE;
break;
}
}
}
}
if (U_FAILURE(errorCode)) { return NULL; }
if (bucketList->size() == 1) {
// No real labels, show only the underflow label.
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
if (bl == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList.orphan();
return bl;
}
// overflow bucket
bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
if (bucket == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList->addElement(bucket, errorCode); // final
if (hasPinyin) {
// Redirect Pinyin buckets.
Bucket *asciiBucket = NULL;
for (int32_t i = 0; i < 26; ++i) {
if (asciiBuckets[i] != NULL) {
asciiBucket = asciiBuckets[i];
}
if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
pinyinBuckets[i]->displayBucket_ = asciiBucket;
hasInvisibleBuckets = TRUE;
}
}
}
if (U_FAILURE(errorCode)) { return NULL; }
if (!hasInvisibleBuckets) {
BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
if (bl == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList.orphan();
return bl;
}
// Merge inflow buckets that are visually adjacent.
// Iterate backwards: Merge inflow into overflow rather than the other way around.
int32_t i = bucketList->size() - 1;
Bucket *nextBucket = getBucket(*bucketList, i);
while (--i > 0) {
bucket = getBucket(*bucketList, i);
if (bucket->displayBucket_ != NULL) {
continue; // skip invisible buckets
}
if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
bucket->displayBucket_ = nextBucket;
continue;
}
}
nextBucket = bucket;
}
LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode);
if (U_FAILURE(errorCode)) {
return NULL;
}
// Do not call publicBucketList->setDeleter():
// This vector shares its objects with the bucketList.
for (int32_t j = 0; j < bucketList->size(); ++j) {
bucket = getBucket(*bucketList, j);
if (bucket->displayBucket_ == NULL) {
publicBucketList->addElement(bucket, errorCode);
}
}
if (U_FAILURE(errorCode)) { return NULL; }
BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
if (bl == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
bucketList.orphan();
publicBucketList.orphan();
return bl;
}
/**
* Creates an index, and buckets and sorts the list of records into the index.
*/
void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
if (U_FAILURE(errorCode) || buckets_ != NULL) {
return;
}
buckets_ = createBucketList(errorCode);
if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
return;
}
// Sort the records by name.
// Stable sort preserves input order of collation duplicates.
inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
// Now, we traverse all of the input, which is now sorted.
// If the item doesn't go in the current bucket, we find the next bucket that contains it.
// This makes the process order n*log(n), since we just sort the list and then do a linear process.
// However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
// we need to improve it for that case.
Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
int32_t bucketIndex = 1;
Bucket *nextBucket;
const UnicodeString *upperBoundary;
if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
nextBucket = NULL;
upperBoundary = NULL;
}
for (int32_t i = 0; i < inputList_->size(); ++i) {
Record *r = getRecord(*inputList_, i);
// if the current bucket isn't the right one, find the one that is
// We have a special flag for the last bucket so that we don't look any further
while (upperBoundary != NULL &&
collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
currentBucket = nextBucket;
// now reset the boundary that we compare against
if (bucketIndex < buckets_->bucketList_->size()) {
nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
upperBoundary = &nextBucket->lowerBoundary_;
} else {
upperBoundary = NULL;
}
}
// now put the record into the bucket.
Bucket *bucket = currentBucket;
if (bucket->displayBucket_ != NULL) {
bucket = bucket->displayBucket_;
}
if (bucket->records_ == NULL) {
bucket->records_ = new UVector(errorCode);
if (bucket->records_ == NULL) {
errorCode = U_MEMORY_ALLOCATION_ERROR;
return;
}
}
bucket->records_->addElement(r, errorCode);
}
}
void AlphabeticIndex::clearBuckets() {
if (buckets_ != NULL) {
delete buckets_;
buckets_ = NULL;
internalResetBucketIterator();
}
}
void AlphabeticIndex::internalResetBucketIterator() {
labelsIterIndex_ = -1;
currentBucket_ = NULL;
}
void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
if (U_FAILURE(status)) {
return;
}
UnicodeSet exemplars;
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
if (U_SUCCESS(status)) {
initialLabels_->addAll(exemplars);
return;
}
status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR
// The locale data did not include explicit Index characters.
// Synthesize a set of them from the locale's standard exemplar characters.
ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
if (U_FAILURE(status)) {
return;
}
// question: should we add auxiliary exemplars?
if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.isEmpty()) {
exemplars.add(0x61, 0x7A);
}
if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables
// cut down to small list
exemplars.remove(0xAC00, 0xD7A3).
add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
add(0xD30C).add(0xD558);
}
if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block
// cut down to small list
// make use of the fact that Ethiopic is allocated in 8's, where
// the base is 0 mod 8.
UnicodeSet ethiopic(UnicodeString(u"[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]"), status);
ethiopic.retainAll(exemplars);
exemplars.remove(u'', 0x137F).addAll(ethiopic);
}
// Upper-case any that aren't already so.
// (We only do this for synthesized index characters.)
UnicodeSetIterator it(exemplars);
UnicodeString upperC;
while (it.next()) {
const UnicodeString &exemplarC = it.getString();
upperC = exemplarC;
upperC.toUpper(locale);
initialLabels_->add(upperC);
}
}
UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
UnicodeSet contractions;
collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode);
if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; }
initialLabels_->addAll(contractions);
UnicodeSetIterator iter(contractions);
while (iter.next()) {
const UnicodeString &s = iter.getString();
U_ASSERT (s.startsWith(BASE, BASE_LENGTH));
UChar c = s.charAt(s.length() - 1);
if (0x41 <= c && c <= 0x5A) { // A-Z
// There are Pinyin labels, add ASCII A-Z labels as well.
initialLabels_->add(0x41, 0x5A); // A-Z
break;
}
}
return TRUE;
}
/*
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
*/
static const UChar CGJ = 0x034F;
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
UnicodeString result;
if (item.length() == 0) {
return result;
}
int32_t i = 0;
for (;;) {
UChar32 cp = item.char32At(i);
result.append(cp);
i = item.moveIndex32(i, 1);
if (i >= item.length()) {
break;
}
result.append(CGJ);
}
return result;
}
UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
return FALSE;
}
UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
return FALSE;
}
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
return *collator_;
}
const UnicodeString &AlphabeticIndex::getInflowLabel() const {
return inflowLabel_;
}
const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
return overflowLabel_;
}
const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
return underflowLabel_;
}
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
inflowLabel_ = label;
clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
overflowLabel_ = label;
clearBuckets();
return *this;
}
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
underflowLabel_ = label;
clearBuckets();
return *this;
}
int32_t AlphabeticIndex::getMaxLabelCount() const {
return maxLabelCount_;
}
AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (maxLabelCount <= 0) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return *this;
}
maxLabelCount_ = maxLabelCount;
clearBuckets();
return *this;
}
//
// init() - Common code for constructors.
//
void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
if (U_FAILURE(status)) { return; }
if (locale == NULL && collator_ == NULL) {
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
initialLabels_ = new UnicodeSet();
if (initialLabels_ == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
inflowLabel_.setTo((UChar)0x2026); // Ellipsis
overflowLabel_ = inflowLabel_;
underflowLabel_ = inflowLabel_;
if (collator_ == NULL) {
Collator *coll = Collator::createInstance(*locale, status);
if (U_FAILURE(status)) {
delete coll;
return;
}
if (coll == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
collator_ = dynamic_cast<RuleBasedCollator *>(coll);
if (collator_ == NULL) {
delete coll;
status = U_UNSUPPORTED_ERROR;
return;
}
}
collatorPrimaryOnly_ = collator_->clone();
if (collatorPrimaryOnly_ == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
firstCharsInScripts_ = firstStringsInScript(status);
if (U_FAILURE(status)) { return; }
firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
// Guard against a degenerate collator where
// some script boundary strings are primary ignorable.
for (;;) {
if (U_FAILURE(status)) { return; }
if (firstCharsInScripts_->isEmpty()) {
// AlphabeticIndex requires some non-ignorable script boundary strings.
status = U_ILLEGAL_ARGUMENT_ERROR;
return;
}
if (collatorPrimaryOnly_->compare(
*static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
emptyString_, status) == UCOL_EQUAL) {
firstCharsInScripts_->removeElementAt(0);
} else {
break;
}
}
// Chinese index characters, which are specific to each of the several Chinese tailorings,
// take precedence over the single locale data exemplar set per language.
if (!addChineseIndexCharacters(status) && locale != NULL) {
addIndexExemplars(*locale, status);
}
}
//
// Comparison function for UVector<UnicodeString *> sorting with a collator.
//
static int32_t U_CALLCONV
collatorComparator(const void *context, const void *left, const void *right) {
const UElement *leftElement = static_cast<const UElement *>(left);
const UElement *rightElement = static_cast<const UElement *>(right);
const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
if (leftString == rightString) {
// Catches case where both are NULL
return 0;
}
if (leftString == NULL) {
return 1;
}
if (rightString == NULL) {
return -1;
}
const Collator *col = static_cast<const Collator *>(context);
UErrorCode errorCode = U_ZERO_ERROR;
return col->compare(*leftString, *rightString, errorCode);
}
//
// Comparison function for UVector<Record *> sorting with a collator.
//
static int32_t U_CALLCONV
recordCompareFn(const void *context, const void *left, const void *right) {
const UElement *leftElement = static_cast<const UElement *>(left);
const UElement *rightElement = static_cast<const UElement *>(right);
const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
const Collator *col = static_cast<const Collator *>(context);
UErrorCode errorCode = U_ZERO_ERROR;
return col->compare(leftRec->name_, rightRec->name_, errorCode);
}
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
if (U_FAILURE(status)) {
return NULL;
}
LocalPointer<UVector> dest(new UVector(status), status);
if (U_FAILURE(status)) {
return NULL;
}
dest->setDeleter(uprv_deleteUObject);
// Fetch the script-first-primary contractions which are defined in the root collator.
// They all start with U+FDD1.
UnicodeSet set;
collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status);
if (U_FAILURE(status)) {
return NULL;
}
if (set.isEmpty()) {
status = U_UNSUPPORTED_ERROR;
return NULL;
}
UnicodeSetIterator iter(set);
while (iter.next()) {
const UnicodeString &boundary = iter.getString();
uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1));
if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) {
// Ignore boundaries for the special reordering groups.
// Take only those for "real scripts" (where the sample character is a Letter,
// and the one for unassigned implicit weights (Cn).
continue;
}
UnicodeString *s = new UnicodeString(boundary);
if (s == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
dest->addElement(s, status);
}
return dest.orphan();
}
namespace {
/**
* Returns true if one index character string is "better" than the other.
* Shorter NFKD is better, and otherwise NFKD-binary-less-than is
* better, and otherwise binary-less-than is better.
*/
UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
const UnicodeString &one, const UnicodeString &other) {
// This is called with primary-equal strings, but never with one.equals(other).
UErrorCode status = U_ZERO_ERROR;
UnicodeString n1 = nfkdNormalizer.normalize(one, status);
UnicodeString n2 = nfkdNormalizer.normalize(other, status);
if (U_FAILURE(status)) { return FALSE; }
int32_t result = n1.countChar32() - n2.countChar32();
if (result != 0) {
return result < 0;
}
result = n1.compareCodePointOrder(n2);
if (result != 0) {
return result < 0;
}
return one.compareCodePointOrder(other) < 0;
}
} // namespace
//
// Constructor & Destructor for AlphabeticIndex::Record
//
// Records are internal only, instances are not directly surfaced in the public API.
// This class is mostly struct-like, with all public fields.
AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
: name_(name), data_(data) {}
AlphabeticIndex::Record::~Record() {
}
AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
if (inputList_ == NULL) {
inputList_ = new UVector(status);
if (inputList_ == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return *this;
}
inputList_->setDeleter(alphaIndex_deleteRecord);
}
Record *r = new Record(name, data);
if (r == NULL) {
status = U_MEMORY_ALLOCATION_ERROR;
return *this;
}
inputList_->addElement(r, status);
clearBuckets();
//std::string ss;
//std::string ss2;
//std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
// " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
return *this;
}
AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
inputList_->removeAllElements();
clearBuckets();
}
return *this;
}
int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
initBuckets(status);
if (U_FAILURE(status)) {
return 0;
}
return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
}
int32_t AlphabeticIndex::getBucketIndex() const {
return labelsIterIndex_;
}
UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (buckets_ == NULL && currentBucket_ != NULL) {
status = U_ENUM_OUT_OF_SYNC_ERROR;
return FALSE;
}
initBuckets(status);
if (U_FAILURE(status)) {
return FALSE;
}
++labelsIterIndex_;
if (labelsIterIndex_ >= buckets_->getBucketCount()) {
labelsIterIndex_ = buckets_->getBucketCount();
return FALSE;
}
currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
resetRecordIterator();
return TRUE;
}
const UnicodeString &AlphabeticIndex::getBucketLabel() const {
if (currentBucket_ != NULL) {
return currentBucket_->label_;
} else {
return emptyString_;
}
}
UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
if (currentBucket_ != NULL) {
return currentBucket_->labelType_;
} else {
return U_ALPHAINDEX_NORMAL;
}
}
int32_t AlphabeticIndex::getBucketRecordCount() const {
if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
return currentBucket_->records_->size();
} else {
return 0;
}
}
AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
if (U_FAILURE(status)) {
return *this;
}
internalResetBucketIterator();
return *this;
}
UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
if (U_FAILURE(status)) {
return FALSE;
}
if (currentBucket_ == NULL) {
// We are trying to iterate over the items in a bucket, but there is no
// current bucket from the enumeration of buckets.
status = U_INVALID_STATE_ERROR;
return FALSE;
}
if (buckets_ == NULL) {
status = U_ENUM_OUT_OF_SYNC_ERROR;
return FALSE;
}
if (currentBucket_->records_ == NULL) {
return FALSE;
}
++itemsIterIndex_;
if (itemsIterIndex_ >= currentBucket_->records_->size()) {
itemsIterIndex_ = currentBucket_->records_->size();
return FALSE;
}
return TRUE;
}
const UnicodeString &AlphabeticIndex::getRecordName() const {
const UnicodeString *retStr = &emptyString_;
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
itemsIterIndex_ >= 0 &&
itemsIterIndex_ < currentBucket_->records_->size()) {
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
retStr = &item->name_;
}
return *retStr;
}
const void *AlphabeticIndex::getRecordData() const {
const void *retPtr = NULL;
if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
itemsIterIndex_ >= 0 &&
itemsIterIndex_ < currentBucket_->records_->size()) {
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
retPtr = item->data_;
}
return retPtr;
}
AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
itemsIterIndex_ = -1;
return *this;
}
AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
const UnicodeString &lowerBoundary,
UAlphabeticIndexLabelType type)
: label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
displayBucket_(NULL), displayIndex_(-1),
records_(NULL) {
}
AlphabeticIndex::Bucket::~Bucket() {
delete records_;
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_COLLATION