ce96470daf
X-SVN-Rev: 32159
1345 lines
45 KiB
C++
1345 lines
45 KiB
C++
/*
|
|
*******************************************************************************
|
|
* Copyright (C) 2009-2012, International Business Machines Corporation and *
|
|
* others. All Rights Reserved. *
|
|
*******************************************************************************
|
|
*/
|
|
|
|
/**
|
|
* \file
|
|
* \brief C API: AlphabeticIndex class
|
|
*/
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
|
|
|
|
#include "unicode/alphaindex.h"
|
|
#include "unicode/coll.h"
|
|
#include "unicode/normalizer2.h"
|
|
#include "unicode/strenum.h"
|
|
#include "unicode/tblcoll.h"
|
|
#include "unicode/ulocdata.h"
|
|
#include "unicode/uniset.h"
|
|
#include "unicode/uobject.h"
|
|
#include "unicode/uscript.h"
|
|
#include "unicode/usetiter.h"
|
|
#include "unicode/ustring.h"
|
|
#include "unicode/utf16.h"
|
|
|
|
#include "cstring.h"
|
|
#include "mutex.h"
|
|
#include "uassert.h"
|
|
#include "ucln_in.h"
|
|
#include "uhash.h"
|
|
#include "uvector.h"
|
|
|
|
//#include <string>
|
|
//#include <iostream>
|
|
U_NAMESPACE_BEGIN
|
|
|
|
UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(AlphabeticIndex)
|
|
|
|
// Forward Declarations
|
|
static int32_t U_CALLCONV
|
|
PreferenceComparator(const void *context, const void *left, const void *right);
|
|
|
|
static int32_t U_CALLCONV
|
|
sortCollateComparator(const void *context, const void *left, const void *right);
|
|
|
|
static int32_t U_CALLCONV
|
|
recordCompareFn(const void *context, const void *left, const void *right);
|
|
|
|
// UVector<Bucket *> support function, delete a Bucket.
|
|
static void U_CALLCONV
|
|
alphaIndex_deleteBucket(void *obj) {
|
|
delete static_cast<AlphabeticIndex::Bucket *>(obj);
|
|
}
|
|
|
|
// UVector<Record *> support function, delete a Record.
|
|
static void U_CALLCONV
|
|
alphaIndex_deleteRecord(void *obj) {
|
|
delete static_cast<AlphabeticIndex::Record *>(obj);
|
|
}
|
|
|
|
|
|
typedef const UChar PinyinLookup[24][3];
|
|
|
|
static const Normalizer2 *nfkdNormalizer;
|
|
static UBool indexCharactersAreInitialized = FALSE;
|
|
static UnicodeSet *ALPHABETIC;
|
|
static UnicodeSet *HANGUL;
|
|
static UnicodeSet *ETHIOPIC;
|
|
static UnicodeSet *CORE_LATIN;
|
|
static UnicodeSet *IGNORE_SCRIPTS;
|
|
static UnicodeSet *TO_TRY;
|
|
static UnicodeSet *UNIHAN;
|
|
static const UnicodeString *EMPTY_STRING;
|
|
|
|
|
|
//
|
|
// Append the contents of a UnicodeSet to a UVector of UnicodeStrings.
|
|
// Append everything - individual characters are handled as strings of length 1.
|
|
// The destination vector owns the appended strings.
|
|
|
|
static void appendUnicodeSetToUVector(UVector &dest, const UnicodeSet &source, UErrorCode &status) {
|
|
UnicodeSetIterator setIter(source);
|
|
while (setIter.next()) {
|
|
const UnicodeString &str = setIter.getString();
|
|
dest.addElement(str.clone(), status);
|
|
}
|
|
}
|
|
|
|
|
|
// Construct an index for a locale: a full-strength collator, a primary-only
// clone of it, the locale's index exemplars as the initial labels, and the
// table of first characters per script.
AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) {
    init(status);
    if (U_FAILURE(status)) {
        return;
    }
    locale_ = locale;
    langType_ = langTypeFromLocale(locale_);

    collator_ = Collator::createInstance(locale, status);
    if (collator_ != NULL) {
        // collatorPrimaryOnly_ was set to NULL by init(), so it stays NULL
        // whenever there is no collator to clone.
        collatorPrimaryOnly_ = collator_->clone();
        if (collatorPrimaryOnly_ != NULL) {
            collatorPrimaryOnly_->setStrength(Collator::PRIMARY);
        }
    }

    getIndexExemplars(*initialLabels_, locale, status);
    indexBuildRequired_ = TRUE;

    // A missing collator without a reported error can only be an allocation failure.
    const UBool collatorMissing = (collator_ == NULL || collatorPrimaryOnly_ == NULL);
    if (collatorMissing && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }
    firstScriptCharacters_ = firstStringsInScript(status);
}
|
|
|
|
|
|
// Release everything allocated by init() and the constructor.
// Members that were never allocated are still NULL (see init()).
AlphabeticIndex::~AlphabeticIndex() {
    // Hash table of labels that were already represented by another label.
    uhash_close(alreadyIn_);

    // Vectors, collators and sets owned by this index.
    delete bucketList_;
    delete collator_;
    delete collatorPrimaryOnly_;
    delete firstScriptCharacters_;
    delete labels_;
    delete inputRecords_;
    delete noDistinctSorting_;
    delete notAlphabetic_;
    delete initialLabels_;
}
|
|
|
|
|
|
// Add the contents of a UnicodeSet to the set of candidate labels.
// Returns *this. Does nothing if status is already a failure.
AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    initialLabels_->addAll(additions);
    // buildIndex() is a no-op unless this flag is set, so without it labels
    // added after the index was first built would never show up.
    indexBuildRequired_ = TRUE;
    return *this;
}
|
|
|
|
|
|
// Add the index exemplar characters of a locale to the set of candidate labels.
// Returns *this. Does nothing if status is already a failure.
AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    UnicodeSet additions;
    getIndexExemplars(additions, locale, status);
    initialLabels_->addAll(additions);
    // buildIndex() is a no-op unless this flag is set, so without it labels
    // added after the index was first built would never show up.
    indexBuildRequired_ = TRUE;
    return *this;
}
|
|
|
|
|
|
// Number of buckets in the index; (re)builds the index first if required.
// Returns 0 on failure.
int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
    buildIndex(status);
    return U_SUCCESS(status) ? bucketList_->size() : 0;
}
|
|
|
|
|
|
// Number of input records held by the index. Returns 0 if status is a failure.
int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
    return U_SUCCESS(status) ? inputRecords_->size() : 0;
}
|
|
|
|
|
|
// (Re)build the label list and the bucket list from the candidate labels and
// the input records. Does nothing if status is a failure or if no rebuild is
// pending. On failure the function returns early and the index stays marked
// as needing a build.
void AlphabeticIndex::buildIndex(UErrorCode &status) {
    if (U_FAILURE(status) || !indexBuildRequired_) {
        return;
    }

    // Discard any already-built data.
    // This is important when the user builds and uses an index, then subsequently modifies it,
    // necessitating a rebuild.

    bucketList_->removeAllElements();
    labels_->removeAllElements();
    uhash_removeAll(alreadyIn_);
    noDistinctSorting_->clear();
    notAlphabetic_->clear();

    // first sort the incoming Labels, with a "best" ordering among items
    // that are the same according to the collator

    UVector preferenceSorting(status);   // Vector of UnicodeStrings; owned by the vector.
    preferenceSorting.setDeleter(uprv_deleteUObject);
    appendUnicodeSetToUVector(preferenceSorting, *initialLabels_, status);
    preferenceSorting.sortWithUComparator(PreferenceComparator, &status, status);
    if (U_FAILURE(status)) {
        return;
    }

    // We now make a set of Labels.
    // Some of the input may, however, be redundant.
    // That is, we might have c, ch, d, where "ch" sorts just like "c", "h"
    // So we make a pass through, filtering out those cases.
    // TODO: filtering these out would seem to be at odds with the eventual goal
    //       of being able to split buckets that contain too many items.

    UnicodeSet labelSet;
    for (int32_t psIndex = 0; psIndex < preferenceSorting.size(); psIndex++) {
        // The vector is not modified inside this loop, so a reference is enough.
        const UnicodeString &item =
                *static_cast<const UnicodeString *>(preferenceSorting.elementAt(psIndex));
        // TODO: Since preferenceSorting was originally populated from the contents of a UnicodeSet,
        //       is it even possible for duplicates to show up in this check?
        if (labelSet.contains(item)) {
            UnicodeSetIterator itemAlreadyInIter(labelSet);
            while (itemAlreadyInIter.next()) {
                const UnicodeString &itemAlreadyIn = itemAlreadyInIter.getString();
                if (collatorPrimaryOnly_->compare(item, itemAlreadyIn) == 0) {
                    UnicodeSet *targets = static_cast<UnicodeSet *>(uhash_get(alreadyIn_, &itemAlreadyIn));
                    if (targets == NULL) {
                        // alreadyIn.put(itemAlreadyIn, targets = new LinkedHashSet<String>());
                        targets = new UnicodeSet();
                        if (targets == NULL) {
                            status = U_MEMORY_ALLOCATION_ERROR;
                            return;
                        }
                        uhash_put(alreadyIn_, itemAlreadyIn.clone(), targets, &status);
                        if (U_FAILURE(status)) {
                            // The table has key and value deleters installed (see init()),
                            // so 'targets' must not be touched after a failed put.
                            return;
                        }
                    }
                    targets->add(item);
                    break;
                }
            }
        } else if (item.moveIndex32(0, 1) < item.length() &&  // Label contains more than one code point.
                   collatorPrimaryOnly_->compare(item, separated(item)) == 0) {
            noDistinctSorting_->add(item);
        } else if (!ALPHABETIC->containsSome(item)) {
            notAlphabetic_->add(item);
        } else {
            labelSet.add(item);
        }
    }

    // If we have no labels, hard-code a fallback default set of [A-Z]
    // This case can occur with locales that don't have exemplar character data, including root.
    // A no-labels situation will cause other problems; it needs to be avoided.
    if (labelSet.isEmpty()) {
        labelSet.add((UChar32)0x41, (UChar32)0x5A);
    }

    // Move the set of Labels from the set into a vector, and sort
    // according to the collator.

    appendUnicodeSetToUVector(*labels_, labelSet, status);
    labels_->sortWithUComparator(sortCollateComparator, collatorPrimaryOnly_, status);
    if (U_FAILURE(status)) {
        return;
    }

    // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element
    //    Implemented by copying the elements to be retained to a new UVector.

    const int32_t size = labelSet.size() - 1;
    if (size > maxLabelCount_) {
        UVector *newLabels = new UVector(status);
        if (newLabels == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        newLabels->setDeleter(uprv_deleteUObject);
        // Keep the replacement vector configured like the one created in init().
        newLabels->setComparer(uhash_compareUnicodeString);
        int32_t count = 0;
        int32_t old = -1;
        for (int32_t srcIndex = 0; U_SUCCESS(status) && srcIndex < labels_->size(); srcIndex++) {
            const UnicodeString *str = static_cast<const UnicodeString *>(labels_->elementAt(srcIndex));
            ++count;
            const int32_t bump = count * maxLabelCount_ / size;
            if (bump == old) {
                continue;    // Label is dropped.
            }
            UObject *copy = str->clone();
            if (copy == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }
            newLabels->addElement(copy, status);
            if (U_FAILURE(status)) {
                delete copy;     // Not taken over by the vector.
                break;
            }
            old = bump;
        }
        if (U_FAILURE(status)) {
            delete newLabels;
            return;
        }
        delete labels_;
        labels_ = newLabels;
    }

    // We now know the list of labels.
    // Create a corresponding list of buckets, one per label.

    buildBucketList(status);    // Corresponds to Java BucketList constructor.

    // Bin the Records into the Buckets.
    bucketRecords(status);
    if (U_FAILURE(status)) {
        // Do not mark a partially built index as complete.
        return;
    }

    // Clear the flag before resetting the iterator, as the original code did.
    indexBuildRequired_ = FALSE;
    resetBucketIterator(status);
}
|
|
|
|
//
|
|
// buildBucketList() Corresponds to the BucketList constructor in the Java version.
|
|
|
|
void AlphabeticIndex::buildBucketList(UErrorCode &status) {
|
|
UnicodeString labelStr = getUnderflowLabel();
|
|
Bucket *b = new Bucket(labelStr, *EMPTY_STRING, U_ALPHAINDEX_UNDERFLOW, status);
|
|
bucketList_->addElement(b, status);
|
|
|
|
// Build up the list, adding underflow, additions, overflow
|
|
// insert infix labels as needed, using \uFFFF.
|
|
const UnicodeString *last = static_cast<UnicodeString *>(labels_->elementAt(0));
|
|
b = new Bucket(*last, *last, U_ALPHAINDEX_NORMAL, status);
|
|
bucketList_->addElement(b, status);
|
|
|
|
UnicodeSet lastSet;
|
|
UnicodeSet set;
|
|
AlphabeticIndex::getScriptSet(lastSet, *last, status);
|
|
lastSet.removeAll(*IGNORE_SCRIPTS);
|
|
|
|
for (int i = 1; i < labels_->size(); ++i) {
|
|
UnicodeString *current = static_cast<UnicodeString *>(labels_->elementAt(i));
|
|
getScriptSet(set, *current, status);
|
|
set.removeAll(*IGNORE_SCRIPTS);
|
|
if (lastSet.containsNone(set)) {
|
|
// check for adjacent
|
|
const UnicodeString &overflowComparisonString = getOverflowComparisonString(*last, status);
|
|
if (collatorPrimaryOnly_->compare(overflowComparisonString, *current) < 0) {
|
|
labelStr = getInflowLabel();
|
|
b = new Bucket(labelStr, overflowComparisonString, U_ALPHAINDEX_INFLOW, status);
|
|
bucketList_->addElement(b, status);
|
|
i++;
|
|
lastSet = set;
|
|
}
|
|
}
|
|
b = new Bucket(*current, *current, U_ALPHAINDEX_NORMAL, status);
|
|
bucketList_->addElement(b, status);
|
|
last = current;
|
|
lastSet = set;
|
|
}
|
|
const UnicodeString &limitString = getOverflowComparisonString(*last, status);
|
|
b = new Bucket(getOverflowLabel(), limitString, U_ALPHAINDEX_OVERFLOW, status);
|
|
bucketList_->addElement(b, status);
|
|
// final overflow bucket
|
|
}
|
|
|
|
|
|
//
|
|
// Place all of the raw input records into the correct bucket.
|
|
//
|
|
// Begin by sorting the input records; this lets us bin them in a single pass.
|
|
//
|
|
// Note on storage management: The input records are owned by the
|
|
// inputRecords_ vector, and will (eventually) be auto-deleted by it.
|
|
// The Bucket objects have pointers to the Record objects, but do not own them.
|
|
//
|
|
void AlphabeticIndex::bucketRecords(UErrorCode &status) {
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
|
|
inputRecords_->sortWithUComparator(recordCompareFn, collator_, status);
|
|
U_ASSERT(bucketList_->size() > 0); // Should always have at least an overflow
|
|
// bucket, even if no user labels.
|
|
int32_t bucketIndex = 0;
|
|
Bucket *destBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex));
|
|
Bucket *nextBucket = NULL;
|
|
if (bucketIndex+1 < bucketList_->size()) {
|
|
nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
|
|
}
|
|
int32_t recordIndex = 0;
|
|
Record *r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
|
|
while (recordIndex < inputRecords_->size()) {
|
|
if (nextBucket == NULL ||
|
|
collatorPrimaryOnly_->compare(r->sortingName_, nextBucket->lowerBoundary_) < 0) {
|
|
// Record goes in current bucket. Advance to next record,
|
|
// stay on current bucket.
|
|
destBucket->records_->addElement(r, status);
|
|
++recordIndex;
|
|
r = static_cast<Record *>(inputRecords_->elementAt(recordIndex));
|
|
} else {
|
|
// Advance to the next bucket, stay on current record.
|
|
bucketIndex++;
|
|
destBucket = nextBucket;
|
|
if (bucketIndex+1 < bucketList_->size()) {
|
|
nextBucket = static_cast<Bucket *>(bucketList_->elementAt(bucketIndex+1));
|
|
} else {
|
|
nextBucket = NULL;
|
|
}
|
|
U_ASSERT(destBucket != NULL);
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
|
|
// Add the index characters for a locale to dest.
//
// If the locale data has explicit index exemplars they are used as they are.
// Otherwise a set is synthesized from the standard exemplars: entries are
// upper-cased, a-z is completed if any of it is present, Hangul is reduced to
// the short HANGUL list, and Ethiopic is reduced to code points that are
// multiples of 8.
//
// Does nothing if status is a failure on entry; dest is left unchanged on failure.
void AlphabeticIndex::getIndexExemplars(UnicodeSet &dest, const Locale &locale, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }

    LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    if (U_FAILURE(status)) {
        // Must be checked here: the status reset below would otherwise erase
        // this error and the fallback would go on using a handle that could
        // not be opened.
        return;
    }

    UnicodeSet exemplars;
    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    if (U_SUCCESS(status)) {
        dest.addAll(exemplars);
        return;
    }
    status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR

    // Locale data did not include explicit Index characters.
    // Synthesize a set of them from the locale's standard exemplar characters.

    ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    if (U_FAILURE(status)) {
        return;
    }

    // Upper-case any that aren't already so.
    //   (We only do this for synthesized index characters.)
    // Changes are collected first and applied after the iteration.

    UnicodeSetIterator it(exemplars);
    UnicodeString upperC;
    UnicodeSet lowersToRemove;
    UnicodeSet uppersToAdd;
    while (it.next()) {
        const UnicodeString &exemplarC = it.getString();
        upperC = exemplarC;
        upperC.toUpper(locale);
        if (exemplarC != upperC) {
            lowersToRemove.add(exemplarC);
            uppersToAdd.add(upperC);
        }
    }
    exemplars.removeAll(lowersToRemove);
    exemplars.addAll(uppersToAdd);

    // get the exemplars, and handle special cases

    // question: should we add auxiliary exemplars?
    if (exemplars.containsSome(*CORE_LATIN)) {
        exemplars.addAll(*CORE_LATIN);
    }
    if (exemplars.containsSome(*HANGUL)) {
        // cut down to small list
        UnicodeSet BLOCK_HANGUL_SYLLABLES(UNICODE_STRING_SIMPLE("[:block=hangul_syllables:]"), status);
        exemplars.removeAll(BLOCK_HANGUL_SYLLABLES);
        exemplars.addAll(*HANGUL);
    }
    if (exemplars.containsSome(*ETHIOPIC)) {
        // cut down to small list
        // make use of the fact that Ethiopic is allocated in 8's, where
        // the base is 0 mod 8.
        UnicodeSetIterator ethiopicIter(*ETHIOPIC);
        while (ethiopicIter.next() && !ethiopicIter.isString()) {
            if ((ethiopicIter.getCodepoint() & 0x7) != 0) {
                exemplars.remove(ethiopicIter.getCodepoint());
            }
        }
    }
    dest.addAll(exemplars);
}
|
|
|
|
|
|
/*
|
|
* Return the string with interspersed CGJs. Input must have more than 2 codepoints.
|
|
*/
|
|
static const UChar32 CGJ = (UChar)0x034F;
|
|
UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
|
|
UnicodeString result;
|
|
if (item.length() == 0) {
|
|
return result;
|
|
}
|
|
int32_t i = 0;
|
|
for (;;) {
|
|
UChar32 cp = item.char32At(i);
|
|
result.append(cp);
|
|
i = item.moveIndex32(i, 1);
|
|
if (i >= item.length()) {
|
|
break;
|
|
}
|
|
result.append(CGJ);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
|
|
// Comparison is not implemented for index objects: the result is always FALSE.
UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
    return FALSE;
}
|
|
|
|
|
|
// Comparison is not implemented for index objects: the result is always FALSE,
// the same as for operator==.
UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
    return FALSE;
}
|
|
|
|
|
|
// Return the collator of this index as a RuleBasedCollator.
const RuleBasedCollator &AlphabeticIndex::getCollator() const {
    // There are no known non-RuleBasedCollator collators, and none ever expected.
    // But, in case that changes, better a null pointer than a wrong type.
    RuleBasedCollator *ruleBased = dynamic_cast<RuleBasedCollator *>(collator_);
    return *ruleBased;
}
|
|
|
|
|
|
// Label used for inflow buckets (the buckets inserted between scripts).
const UnicodeString &AlphabeticIndex::getInflowLabel() const {
    return inflowLabel_;
}
|
|
|
|
// Label used for the overflow bucket (the last bucket of the list).
const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
    return overflowLabel_;
}
|
|
|
|
|
|
// Label used for the underflow bucket (the first bucket of the list).
const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
    return underflowLabel_;
}
|
|
|
|
|
|
// Change the inflow label and request a rebuild of the index.
// The status argument is not used. Returns *this.
AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    indexBuildRequired_ = TRUE;
    inflowLabel_ = label;
    return *this;
}
|
|
|
|
|
|
// Change the overflow label and request a rebuild of the index.
// The status argument is not used. Returns *this.
AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    indexBuildRequired_ = TRUE;
    overflowLabel_ = label;
    return *this;
}
|
|
|
|
|
|
// Change the underflow label and request a rebuild of the index.
// The status argument is not used. Returns *this.
AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    indexBuildRequired_ = TRUE;
    underflowLabel_ = label;
    return *this;
}
|
|
|
|
|
|
// Current upper limit on the number of labels kept by buildIndex().
int32_t AlphabeticIndex::getMaxLabelCount() const {
    return maxLabelCount_;
}
|
|
|
|
|
|
// Set the upper limit on the number of labels kept by buildIndex().
// A limit of zero or less sets U_ILLEGAL_ARGUMENT_ERROR and changes nothing.
// Returns *this.
AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    if (maxLabelCount <= 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    maxLabelCount_ = maxLabelCount;
    // Always rebuild. buildIndex() discards labels that exceed the limit, so
    // after the limit is raised the current label list may be missing labels
    // that the new limit allows; comparing against the current bucket count
    // cannot detect that.
    indexBuildRequired_ = TRUE;
    return *this;
}
|
|
|
|
|
|
// Return the first entry of firstScriptCharacters_ that the collator orders
// after lowerLimit, or the shared empty string if there is none.
// The status argument is not used.
const UnicodeString &AlphabeticIndex::getOverflowComparisonString(const UnicodeString &lowerLimit, UErrorCode &/*status*/) {
    const int32_t scriptCount = firstScriptCharacters_->size();
    for (int32_t idx = 0; idx < scriptCount; ++idx) {
        const UnicodeString &candidate =
                *static_cast<const UnicodeString *>(firstScriptCharacters_->elementAt(idx));
        if (collator_->compare(candidate, lowerLimit) > 0) {
            return candidate;
        }
    }
    return *EMPTY_STRING;
}
|
|
|
|
// Set dest to the set of all characters that have the same script as the
// first code point of the given string. Returns a pointer to dest; dest is
// left unchanged if status is a failure on entry.
UnicodeSet *AlphabeticIndex::getScriptSet(UnicodeSet &dest, const UnicodeString &codePoint, UErrorCode &status) {
    if (U_SUCCESS(status)) {
        const UChar32 first = codePoint.char32At(0);
        const UScriptCode script = uscript_getScript(first, &status);
        dest.applyIntPropertyValue(UCHAR_SCRIPT, script, status);
    }
    return &dest;
}
|
|
|
|
//
|
|
// init() - Common code for constructors.
|
|
//
|
|
|
|
void AlphabeticIndex::init(UErrorCode &status) {
|
|
// Initialize statics if needed.
|
|
AlphabeticIndex::staticInit(status);
|
|
|
|
// Put the object into a known state so that the destructor will function.
|
|
|
|
alreadyIn_ = NULL;
|
|
bucketList_ = NULL;
|
|
collator_ = NULL;
|
|
collatorPrimaryOnly_ = NULL;
|
|
currentBucket_ = NULL;
|
|
firstScriptCharacters_ = NULL;
|
|
initialLabels_ = NULL;
|
|
indexBuildRequired_ = TRUE;
|
|
inputRecords_ = NULL;
|
|
itemsIterIndex_ = 0;
|
|
labels_ = NULL;
|
|
labelsIterIndex_ = 0;
|
|
maxLabelCount_ = 99;
|
|
noDistinctSorting_ = NULL;
|
|
notAlphabetic_ = NULL;
|
|
recordCounter_ = 0;
|
|
|
|
if (U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
alreadyIn_ = uhash_open(uhash_hashUnicodeString, // Key Hash,
|
|
uhash_compareUnicodeString, // key Comparator,
|
|
NULL, // value Comparator
|
|
&status);
|
|
uhash_setKeyDeleter(alreadyIn_, uprv_deleteUObject);
|
|
uhash_setValueDeleter(alreadyIn_, uprv_deleteUObject);
|
|
|
|
bucketList_ = new UVector(status);
|
|
bucketList_->setDeleter(alphaIndex_deleteBucket);
|
|
labels_ = new UVector(status);
|
|
labels_->setDeleter(uprv_deleteUObject);
|
|
labels_->setComparer(uhash_compareUnicodeString);
|
|
inputRecords_ = new UVector(status);
|
|
inputRecords_->setDeleter(alphaIndex_deleteRecord);
|
|
|
|
noDistinctSorting_ = new UnicodeSet();
|
|
notAlphabetic_ = new UnicodeSet();
|
|
initialLabels_ = new UnicodeSet();
|
|
|
|
inflowLabel_.remove();
|
|
inflowLabel_.append((UChar)0x2026); // Ellipsis
|
|
overflowLabel_ = inflowLabel_;
|
|
underflowLabel_ = inflowLabel_;
|
|
|
|
// TODO: check for memory allocation failures.
|
|
}
|
|
|
|
|
|
|
|
// Index Characters Clean up function. Delete statically allocated constant stuff.
|
|
U_CDECL_BEGIN
|
|
static UBool U_CALLCONV indexCharacters_cleanup(void) {
|
|
AlphabeticIndex::staticCleanup();
|
|
return TRUE;
|
|
}
|
|
U_CDECL_END
|
|
|
|
// Delete the shared constants created by staticInit() and reset the pointers,
// so that a later staticInit() starts from scratch.
void AlphabeticIndex::staticCleanup() {
    // Owned sets.
    delete ALPHABETIC;      ALPHABETIC = NULL;
    delete HANGUL;          HANGUL = NULL;
    delete ETHIOPIC;        ETHIOPIC = NULL;
    delete CORE_LATIN;      CORE_LATIN = NULL;
    delete IGNORE_SCRIPTS;  IGNORE_SCRIPTS = NULL;
    delete TO_TRY;          TO_TRY = NULL;
    delete UNIHAN;          UNIHAN = NULL;

    // Owned string.
    delete EMPTY_STRING;    EMPTY_STRING = NULL;

    nfkdNormalizer = NULL;  // ref to a singleton.  Do not delete.
    indexCharactersAreInitialized = FALSE;
}
|
|
|
|
|
|
//
|
|
// staticInit() One-time initialization of constants.
|
|
// Thread safe. Called from constructors.
|
|
// Mutex overhead is not a concern. AlphabeticIndex constructors are
|
|
// sufficiently heavy that the cost of the mutex check is not significant.
|
|
|
|
void AlphabeticIndex::staticInit(UErrorCode &status) {
|
|
static UMTX IndexCharsInitMutex;
|
|
|
|
Mutex mutex(&IndexCharsInitMutex);
|
|
if (indexCharactersAreInitialized || U_FAILURE(status)) {
|
|
return;
|
|
}
|
|
UBool finishedInit = FALSE;
|
|
|
|
{
|
|
UnicodeString alphaString = UNICODE_STRING_SIMPLE("[[:alphabetic:]-[:mark:]]");
|
|
ALPHABETIC = new UnicodeSet(alphaString, status);
|
|
if (ALPHABETIC == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
HANGUL = new UnicodeSet();
|
|
HANGUL->add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).add(0xB9C8).add(0xBC14).add(0xC0AC).
|
|
add(0xC544).add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).add(0xD30C).add(0xD558);
|
|
if (HANGUL== NULL) {
|
|
goto err;
|
|
}
|
|
|
|
|
|
UnicodeString EthiopicStr = UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
|
|
ETHIOPIC = new UnicodeSet(EthiopicStr, status);
|
|
if (ETHIOPIC == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
CORE_LATIN = new UnicodeSet((UChar32)0x61, (UChar32)0x7a); // ('a', 'z');
|
|
if (CORE_LATIN == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
UnicodeString IgnoreStr= UNICODE_STRING_SIMPLE(
|
|
"[[:sc=Common:][:sc=inherited:][:script=Unknown:][:script=braille:]]");
|
|
IGNORE_SCRIPTS = new UnicodeSet(IgnoreStr, status);
|
|
IGNORE_SCRIPTS->freeze();
|
|
if (IGNORE_SCRIPTS == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
UnicodeString nfcqcStr = UNICODE_STRING_SIMPLE("[:^nfcqc=no:]");
|
|
TO_TRY = new UnicodeSet(nfcqcStr, status);
|
|
if (TO_TRY == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
UnicodeString unihanStr = UNICODE_STRING_SIMPLE("[:script=Hani:]");
|
|
UNIHAN = new UnicodeSet(unihanStr, status);
|
|
if (UNIHAN == NULL) {
|
|
goto err;
|
|
}
|
|
|
|
EMPTY_STRING = new UnicodeString();
|
|
|
|
nfkdNormalizer = Normalizer2::getNFKDInstance(status);
|
|
if (nfkdNormalizer == NULL) {
|
|
goto err;
|
|
}
|
|
}
|
|
finishedInit = TRUE;
|
|
|
|
err:
|
|
if (!finishedInit && U_SUCCESS(status)) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
if (U_FAILURE(status)) {
|
|
indexCharacters_cleanup();
|
|
return;
|
|
}
|
|
ucln_i18n_registerCleanup(UCLN_I18N_INDEX_CHARACTERS, indexCharacters_cleanup);
|
|
indexCharactersAreInitialized = TRUE;
|
|
}
|
|
|
|
|
|
//
|
|
// Comparison function for UVector<UnicodeString *> sorting with a collator.
|
|
//
|
|
static int32_t U_CALLCONV
|
|
sortCollateComparator(const void *context, const void *left, const void *right) {
|
|
const UElement *leftElement = static_cast<const UElement *>(left);
|
|
const UElement *rightElement = static_cast<const UElement *>(right);
|
|
const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer);
|
|
const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
|
|
const Collator *col = static_cast<const Collator *>(context);
|
|
|
|
if (leftString == rightString) {
|
|
// Catches case where both are NULL
|
|
return 0;
|
|
}
|
|
if (leftString == NULL) {
|
|
return 1;
|
|
};
|
|
if (rightString == NULL) {
|
|
return -1;
|
|
}
|
|
Collator::EComparisonResult r = col->compare(*leftString, *rightString);
|
|
return (int32_t) r;
|
|
}
|
|
|
|
//
|
|
// Comparison function for UVector<Record *> sorting with a collator.
|
|
//
|
|
static int32_t U_CALLCONV
|
|
recordCompareFn(const void *context, const void *left, const void *right) {
|
|
const UElement *leftElement = static_cast<const UElement *>(left);
|
|
const UElement *rightElement = static_cast<const UElement *>(right);
|
|
const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
|
|
const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
|
|
const Collator *col = static_cast<const Collator *>(context);
|
|
|
|
Collator::EComparisonResult r = col->compare(leftRec->sortingName_, rightRec->sortingName_);
|
|
if (r == Collator::EQUAL) {
|
|
if (leftRec->serialNumber_ < rightRec->serialNumber_) {
|
|
r = Collator::LESS;
|
|
} else if (leftRec->serialNumber_ > rightRec->serialNumber_) {
|
|
r = Collator::GREATER;
|
|
}
|
|
}
|
|
return (int32_t) r;
|
|
}
|
|
|
|
|
|
#if 0
//
//  First characters in scripts.
//  Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
//  The vector is sorted according to this index's collation.
//
//  This code is too slow to use, so for now hard code the data.
//  The hard-coded implementation follows.
//
//  (Disabled code, kept for reference.)
//
UVector *AlphabeticIndex::firstStringsInScript(Collator *ruleBasedCollator, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    UnicodeString results[USCRIPT_CODE_LIMIT];
    UnicodeString LOWER_A = UNICODE_STRING_SIMPLE("a");

    UnicodeSetIterator siter(*TO_TRY);
    while (siter.next()) {
        const UnicodeString &current = siter.getString();
        Collator::EComparisonResult r = ruleBasedCollator->compare(current, LOWER_A);
        if (r < 0) {  // TODO fix; we only want "real" script characters, not
                      // symbols.
            continue;
        }

        int script = uscript_getScript(current.char32At(0), &status);
        if (results[script].length() == 0) {
            results[script] = current;
        }
        else if (ruleBasedCollator->compare(current, results[script]) < 0) {
            results[script] = current;
        }
    }

    UnicodeSet extras;
    UnicodeSet expansions;
    RuleBasedCollator *rbc = dynamic_cast<RuleBasedCollator *>(ruleBasedCollator);
    if (rbc == NULL) {
        // Not a rule based collator; there are no contractions/expansions to query.
        status = U_UNSUPPORTED_ERROR;
        return NULL;
    }
    const UCollator *uRuleBasedCollator = rbc->getUCollator();
    ucol_getContractionsAndExpansions(uRuleBasedCollator, extras.toUSet(), expansions.toUSet(), true, &status);
    extras.addAll(expansions).removeAll(*TO_TRY);
    if (extras.size() != 0) {
        const Normalizer2 *normalizer = Normalizer2::getNFKCInstance(status);
        UnicodeSetIterator extrasIter(extras);
        while (extrasIter.next()) {
            // The loop condition already advanced the iterator; read the current string.
            const UnicodeString &current = extrasIter.getString();
            if (!TO_TRY->containsAll(current))
                continue;
            if (!normalizer->isNormalized(current, status) ||
                ruleBasedCollator->compare(current, LOWER_A) < 0) {
                continue;
            }
            int script = uscript_getScript(current.char32At(0), &status);
            if (results[script].length() == 0) {
                results[script] = current;
            } else if (ruleBasedCollator->compare(current, results[script]) < 0) {
                results[script] = current;
            }
        }
    }

    UVector *dest = new UVector(status);
    if (dest == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    for (uint32_t i = 0; i < sizeof(results) / sizeof(results[0]); ++i) {
        if (results[i].length() > 0) {
            dest->addElement(results[i].clone(), status);
        }
    }
    dest->sortWithUComparator(sortCollateComparator, ruleBasedCollator, status);
    return dest;
}
#endif
|
|
|
|
|
|
//
|
|
// First characters in scripts.
|
|
// Create a UVector whose contents are pointers to UnicodeStrings for the First Characters in each script.
|
|
// The vector is sorted according to this index's collation.
|
|
//
|
|
// It takes too much time to compute this from character properties, so hard code it for now.
|
|
// Character constants copied from corresponding declaration in ICU4J.
|
|
// See main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
|
|
|
|
// A sequence of NUL-terminated UTF-16 strings stored back to back.
// firstStringsInScript() walks this array one string at a time.
// Supplementary code points are written as surrogate pairs.
static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
    0x61, 0,
    0x03B1, 0,
    0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0,
    0xD802, 0xDD00, 0,
    0x0800, 0, 0x0621, 0, 0x0710, 0,
    0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0,
    0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
    0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
    0xAAF2, 0,  // Meetei Mayek
    0xA800, 0, 0xA882, 0,
    0xD804, 0xDC83, 0,
    U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
    U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
    0x1B83, 0,
    0xD802, 0xDE00, 0,
    0x0E01, 0,
    0x0EDE, 0,  // Lao
    0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
    0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
    U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
    0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
    0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0,
    0x1401, 0, 0x1681, 0, 0x16A0, 0,
    0xD803, 0xDC00, 0,
    0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
    U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
    0xD800, 0xDE80, 0,
    0xD800, 0xDEA0, 0,
    0xD802, 0xDD20, 0,
    0xD800, 0xDF00, 0,
    0xD800, 0xDF30, 0,
    0xD801, 0xDC28, 0,
    0xD801, 0xDC50, 0,
    0xD801, 0xDC80, 0,
    U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
    0xD800, 0xDC00, 0,
    0xD802, 0xDC00, 0,
    0xD802, 0xDE60, 0,
    0xD802, 0xDF00, 0,
    0xD802, 0xDC40, 0,
    0xD802, 0xDF40, 0,
    0xD802, 0xDF60, 0,
    0xD800, 0xDF80, 0,
    0xD800, 0xDFA0, 0,
    0xD808, 0xDC00, 0,
    0xD80C, 0xDC00, 0,
    U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
    U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
    0x4E00, 0 };
|
|
|
|
// Build a vector with one UnicodeString per entry of HACK_FIRST_CHARS_IN_SCRIPTS,
// sorted with this index's collator. The vector owns its strings.
//
// Returns NULL if status is a failure on entry or the vector cannot be
// allocated. If a failure occurs while filling the vector, the partially
// filled vector is returned together with the failure status.
UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return NULL;
    }
    UVector *dest = new UVector(status);
    if (dest == NULL) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return NULL;
    }
    dest->setDeleter(uprv_deleteUObject);
    const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
    const UChar *limit = src + sizeof(HACK_FIRST_CHARS_IN_SCRIPTS) / sizeof(HACK_FIRST_CHARS_IN_SCRIPTS[0]);
    do {
        if (U_FAILURE(status)) {
            return dest;
        }
        // Each entry is NUL-terminated, hence the length of -1.
        UnicodeString *str = new UnicodeString(src, -1);
        if (str == NULL) {
            status = U_MEMORY_ALLOCATION_ERROR;
        } else {
            // Step over the string and its terminator.
            const int32_t consumed = str->length() + 1;
            dest->addElement(str, status);
            if (U_FAILURE(status)) {
                // status was a success before addElement(), so the vector did
                // not take the string; free it instead of leaking it.
                delete str;
            }
            src += consumed;
        }
    } while (src < limit);
    dest->sortWithUComparator(sortCollateComparator, collator_, status);
    return dest;
}
|
|
|
|
|
|
// Classify a locale: kNormal for every language other than "zh";
// for "zh", kTraditional if the script is "Hant" or the country is "TW",
// and kSimplified otherwise.
AlphabeticIndex::ELangType AlphabeticIndex::langTypeFromLocale(const Locale &loc) {
    if (uprv_strcmp(loc.getLanguage(), "zh") != 0) {
        return kNormal;
    }
    const UBool isTraditional =
            uprv_strcmp(loc.getScript(), "Hant") == 0 ||
            uprv_strcmp(loc.getCountry(), "TW") == 0;
    return isTraditional ? kTraditional : kSimplified;
}
|
|
|
|
|
|
//
|
|
// Pinyin Hacks. Direct port from Java.
|
|
//
|
|
|
|
static const UChar32 probeCharInLong = 0x28EAD;
|
|
|
|
|
|
static const UChar PINYIN_LOWER_BOUNDS_SHORT[] = { // "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
|
|
0x0101, 0x62, 0x63, 0x64, 0x0113, 0x66, 0x67, 0x68, 0x6A, 0x6B, /*l*/0x6C, 0x1E3F, 0x0144, 0x014D,
|
|
/*p*/0x70, 0x71, 0x72, 0x73, 0x74, /*w*/0x77, 0x78, 0x79, 0x7A};
|
|
|
|
|
|
// Pinyin lookup tables copied, pasted (and reformatted) from the ICU4J code.
|
|
|
|
// Each row is a NUL-terminated UTF-16 string holding the boundary character
// for one Pinyin letter. hackName() walks the rows in order, comparing the
// name against each one, and stops at the 0xFFFF sentinel row.
static const PinyinLookup HACK_PINYIN_LOOKUP_SHORT = {
    {0x0000, 0x0000, 0x0000},  // A (empty string)
    {0x516B, 0x0000, 0x0000},  // B
    {0x5693, 0x0000, 0x0000},  // C
    {0x5491, 0x0000, 0x0000},  // D
    {0x59B8, 0x0000, 0x0000},  // E
    {0x53D1, 0x0000, 0x0000},  // F
    {0x65EE, 0x0000, 0x0000},  // G
    {0x54C8, 0x0000, 0x0000},  // H
    {0x4E0C, 0x0000, 0x0000},  // J
    {0x5494, 0x0000, 0x0000},  // K
    {0x5783, 0x0000, 0x0000},  // L
    {0x5452, 0x0000, 0x0000},  // M
    {0x5514, 0x0000, 0x0000},  // N
    {0x5594, 0x0000, 0x0000},  // O
    {0x5991, 0x0000, 0x0000},  // P
    {0x4E03, 0x0000, 0x0000},  // Q
    {0x513F, 0x0000, 0x0000},  // R
    {0x4EE8, 0x0000, 0x0000},  // S
    {0x4ED6, 0x0000, 0x0000},  // T
    {0x7A75, 0x0000, 0x0000},  // W
    {0x5915, 0x0000, 0x0000},  // X
    {0x4E2B, 0x0000, 0x0000},  // Y
    {0x5E00, 0x0000, 0x0000},  // Z
    {0xFFFF, 0x0000, 0x0000},  // mark end of array
};
|
|
|
|
// Latin prefix letters used with HACK_PINYIN_LOOKUP_LONG, one per row of that
// table: "\u0101bcd\u0113fghjkl\u1E3F\u0144\u014Dpqrstwxyz"
static const UChar PINYIN_LOWER_BOUNDS_LONG[] = {
    0x0101,  // a with macron
    0x0062,  // b
    0x0063,  // c
    0x0064,  // d
    0x0113,  // e with macron
    0x0066,  // f
    0x0067,  // g
    0x0068,  // h
    0x006A,  // j
    0x006B,  // k
    0x006C,  // l
    0x1E3F,  // m with acute
    0x0144,  // n with acute
    0x014D,  // o with macron
    0x0070,  // p
    0x0071,  // q
    0x0072,  // r
    0x0073,  // s
    0x0074,  // t
    0x0077,  // w
    0x0078,  // x
    0x0079,  // y
    0x007A   // z
};
|
|
|
|
// Same layout as the short table: one NUL-terminated UTF-16 boundary string
// per Pinyin letter, ending with a 0xFFFF sentinel row. Several rows here are
// surrogate pairs; the code point each pair encodes is noted alongside.
static const PinyinLookup HACK_PINYIN_LOOKUP_LONG = {
    {0x0000, 0x0000, 0x0000},  // A (empty string)
    {0x516B, 0x0000, 0x0000},  // B
    {0xD863, 0xDEAD, 0x0000},  // C  U+28EAD
    {0xD844, 0xDE51, 0x0000},  // D  U+21251
    {0x59B8, 0x0000, 0x0000},  // E
    {0x53D1, 0x0000, 0x0000},  // F
    {0xD844, 0xDE45, 0x0000},  // G  U+21245
    {0x54C8, 0x0000, 0x0000},  // H
    {0x4E0C, 0x0000, 0x0000},  // J
    {0x5494, 0x0000, 0x0000},  // K
    {0x3547, 0x0000, 0x0000},  // L
    {0x5452, 0x0000, 0x0000},  // M
    {0x5514, 0x0000, 0x0000},  // N
    {0x5594, 0x0000, 0x0000},  // O
    {0xD84F, 0xDC7A, 0x0000},  // P  U+23C7A
    {0x4E03, 0x0000, 0x0000},  // Q
    {0x513F, 0x0000, 0x0000},  // R
    {0x4EE8, 0x0000, 0x0000},  // S
    {0x4ED6, 0x0000, 0x0000},  // T
    {0x7A75, 0x0000, 0x0000},  // W
    {0x5915, 0x0000, 0x0000},  // X
    {0x4E2B, 0x0000, 0x0000},  // Y
    {0x5E00, 0x0000, 0x0000},  // Z
    {0xFFFF, 0x0000, 0x0000},  // mark end of array
};
|
|
|
|
|
|
//
|
|
// Probe the collation data, and decide which Pinyin tables should be used
|
|
//
|
|
// ICU can be built with a choice between two Chinese collations.
|
|
// The hack Pinyin tables to use depend on which one is in use.
|
|
// We can assume that any given copy of ICU will have only one of the collations available,
|
|
// and that there is no way, in a given process, to create two alphabetic indexes using
|
|
// different Chinese collations. Which means the probe can be done once
|
|
// and the results cached.
|
|
//
|
|
// This whole arrangement is temporary.
|
|
//
|
|
static const PinyinLookup *HACK_PINYIN_LOOKUP = NULL;
|
|
static const UChar *PINYIN_LOWER_BOUNDS = NULL;
|
|
|
|
void AlphabeticIndex::initPinyinBounds(const Collator *col, UErrorCode &status) {
|
|
{
|
|
Mutex m;
|
|
if (PINYIN_LOWER_BOUNDS != NULL) {
|
|
return;
|
|
}
|
|
}
|
|
UnicodeSet *colSet = col->getTailoredSet(status);
|
|
if (U_FAILURE(status) || colSet == NULL) {
|
|
delete colSet;
|
|
if (U_SUCCESS(status)) {
|
|
status = U_MEMORY_ALLOCATION_ERROR;
|
|
}
|
|
return;
|
|
}
|
|
UBool useLongTables = colSet->contains(probeCharInLong);
|
|
delete colSet;
|
|
{
|
|
Mutex m;
|
|
if (useLongTables) {
|
|
PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_LONG;
|
|
HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_LONG;
|
|
} else {
|
|
PINYIN_LOWER_BOUNDS = PINYIN_LOWER_BOUNDS_SHORT;
|
|
HACK_PINYIN_LOOKUP = &HACK_PINYIN_LOOKUP_SHORT;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Pinyin Hack:
|
|
// Modify a Chinese name by prepending a Latin letter. The modified name is used
|
|
// when putting records (names) into buckets, to put the name under a Latin index heading.
|
|
|
|
void AlphabeticIndex::hackName(UnicodeString &dest, const UnicodeString &name, const Collator *col) {
|
|
|
|
if (langType_ != kSimplified || !UNIHAN->contains(name.char32At(0))) {
|
|
dest = name;
|
|
return;
|
|
}
|
|
|
|
UErrorCode status = U_ZERO_ERROR;
|
|
initPinyinBounds(col, status);
|
|
if (U_FAILURE(status)) {
|
|
dest = name;
|
|
return;
|
|
}
|
|
// TODO: use binary search
|
|
int index;
|
|
for (index=0; ; index++) {
|
|
if ((*HACK_PINYIN_LOOKUP)[index][0] == (UChar)0xffff) {
|
|
index--;
|
|
break;
|
|
}
|
|
int32_t compareResult = col->compare(name, UnicodeString(TRUE, (*HACK_PINYIN_LOOKUP)[index], -1));
|
|
if (compareResult < 0) {
|
|
index--;
|
|
}
|
|
if (compareResult <= 0) {
|
|
break;
|
|
}
|
|
}
|
|
UChar c = PINYIN_LOWER_BOUNDS[index];
|
|
dest.setTo(c);
|
|
dest.append(name);
|
|
return;
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
* Comparator that returns "better" items first, where shorter NFKD is better, and otherwise NFKD binary order is
|
|
* better, and otherwise binary order is better.
|
|
*
|
|
* For use with array sort or UVector.
|
|
* @param context A UErrorCode pointer.
|
|
* @param left A UElement pointer, which must refer to a UnicodeString *
|
|
* @param right A UElement pointer, which must refer to a UnicodeString *
|
|
*/
|
|
|
|
static int32_t U_CALLCONV
|
|
PreferenceComparator(const void *context, const void *left, const void *right) {
|
|
const UElement *leftElement = static_cast<const UElement *>(left);
|
|
const UElement *rightElement = static_cast<const UElement *>(right);
|
|
const UnicodeString *s1 = static_cast<const UnicodeString *>(leftElement->pointer);
|
|
const UnicodeString *s2 = static_cast<const UnicodeString *>(rightElement->pointer);
|
|
UErrorCode &status = *(UErrorCode *)(context); // Cast off both static and const.
|
|
if (s1 == s2) {
|
|
return 0;
|
|
}
|
|
|
|
UnicodeString n1 = nfkdNormalizer->normalize(*s1, status);
|
|
UnicodeString n2 = nfkdNormalizer->normalize(*s2, status);
|
|
int32_t result = n1.length() - n2.length();
|
|
if (result != 0) {
|
|
return result;
|
|
}
|
|
|
|
result = n1.compareCodePointOrder(n2);
|
|
if (result != 0) {
|
|
return result;
|
|
}
|
|
return s1->compareCodePointOrder(*s2);
|
|
}
|
|
|
|
|
|
//
|
|
// Constructor & Destructor for AlphabeticIndex::Record
|
|
//
|
|
// Records are internal only, instances are not directly surfaced in the public API.
|
|
// This class is mostly struct-like, with all public fields.
|
|
|
|
/**
 * Create a record for one (name, data) pair.
 *
 * The sorting name is derived from the name by the owning index's hackName(),
 * using the index's primary-only collator, and the record takes the next
 * serial number from the index's record counter.
 *
 * @param alphaIndex  The index this record belongs to.
 * @param name        The record's name; copied.
 * @param data        Opaque caller data; the pointer is stored as given.
 */
AlphabeticIndex::Record::Record(AlphabeticIndex *alphaIndex, const UnicodeString &name, const void *data):
        alphaIndex_(alphaIndex), name_(name), data_(data)
{
    alphaIndex->hackName(sortingName_, name_, alphaIndex->collatorPrimaryOnly_);
    serialNumber_ = ++alphaIndex->recordCounter_;
}
|
|
|
|
// Nothing to release explicitly here; the destructor body is empty.
AlphabeticIndex::Record::~Record()
{
}
|
|
|
|
|
|
/**
 * Add a (name, data) record to the index.
 *
 * On success the record is appended to inputRecords_ and the index is marked
 * as needing a rebuild. On failure nothing is added.
 *
 * @param name    The record's name.
 * @param data    Opaque caller data stored with the record.
 * @param status  ICU error code. Nothing is done if it indicates a failure on
 *                entry. Set to U_MEMORY_ALLOCATION_ERROR if the record cannot
 *                be allocated; also receives any error from the vector.
 * @return  This index, to allow chaining.
 */
AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    Record *r = new Record(this, name, data);
    if (r == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    inputRecords_->addElement(r, status);
    if (U_FAILURE(status)) {
        // The vector did not store the pointer, so nothing else will ever
        // free this record.
        delete r;
        return *this;
    }
    indexBuildRequired_ = TRUE;
    return *this;
}
|
|
|
|
|
|
/**
 * Remove every record from the index and mark the index as needing a rebuild.
 *
 * @param status  ICU error code; nothing is done if it indicates a failure.
 * @return  This index, to allow chaining.
 */
AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
    if (U_SUCCESS(status)) {
        inputRecords_->removeAllElements();
        indexBuildRequired_ = TRUE;
    }
    return *this;
}
|
|
|
|
|
|
/**
 * Find the bucket that a name would be placed in.
 *
 * Calls buildIndex() first, then compares the name's sorting form (see
 * hackName()) against each bucket's lower boundary using the primary-only
 * collator.
 *
 * @param name    The name to look up.
 * @param status  ICU error code; passed to buildIndex().
 * @return  Index of the bucket for the name, or 0 if status indicates a
 *          failure after buildIndex().
 */
int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
    buildIndex(status);
    if (U_FAILURE(status)) {
        return 0;
    }

    // For simplified Chinese a Latin prefix is prepended to the name.
    // For non-Chinese locales or non-Chinese names the name is used unchanged.
    UnicodeString sortingName;
    hackName(sortingName, name, collatorPrimaryOnly_);

    // TODO: use a binary search.
    // Scan for the first bucket whose lower boundary is greater than the
    // name; the name belongs in the bucket just before that one.
    int32_t pos = 0;
    while (pos < bucketList_->size()) {
        Bucket *candidate = static_cast<Bucket *>(bucketList_->elementAt(pos));
        if (collatorPrimaryOnly_->compare(sortingName, candidate->lowerBoundary_) < 0) {
            return pos - 1;
        }
        ++pos;
    }

    // No bucket's boundary exceeded the name, so it goes in the last bucket.
    return bucketList_->size() - 1;
}
|
|
|
|
|
|
/**
 * @return  The current position of the bucket iterator (labelsIterIndex_).
 */
int32_t AlphabeticIndex::getBucketIndex() const
{
    return this->labelsIterIndex_;
}
|
|
|
|
|
|
/**
 * Advance the bucket iterator to the next bucket.
 *
 * @param status  ICU error code. Set to U_ENUM_OUT_OF_SYNC_ERROR if an
 *                iteration is in progress (there is a current bucket) and the
 *                index needs rebuilding. Also receives errors from
 *                buildIndex().
 * @return  TRUE if the iterator now refers to a bucket; FALSE on error or
 *          when there are no more buckets.
 */
UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (currentBucket_ != NULL && indexBuildRequired_) {
        status = U_ENUM_OUT_OF_SYNC_ERROR;
        return FALSE;
    }
    buildIndex(status);
    if (U_FAILURE(status)) {
        return FALSE;
    }

    const int32_t next = labelsIterIndex_ + 1;
    if (next >= bucketList_->size()) {
        // Park the iterator one past the last bucket.
        labelsIterIndex_ = bucketList_->size();
        return FALSE;
    }
    labelsIterIndex_ = next;
    currentBucket_ = static_cast<Bucket *>(bucketList_->elementAt(next));
    resetRecordIterator();
    return TRUE;
}
|
|
|
|
/**
 * @return  The label of the current bucket, or *EMPTY_STRING when there is no
 *          current bucket.
 */
const UnicodeString &AlphabeticIndex::getBucketLabel() const {
    if (currentBucket_ == NULL) {
        return *EMPTY_STRING;
    }
    return currentBucket_->label_;
}
|
|
|
|
|
|
/**
 * @return  The label type of the current bucket, or U_ALPHAINDEX_NORMAL when
 *          there is no current bucket.
 */
UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
    if (currentBucket_ == NULL) {
        return U_ALPHAINDEX_NORMAL;
    }
    return currentBucket_->labelType_;
}
|
|
|
|
|
|
/**
 * @return  The number of records in the current bucket, or 0 when there is no
 *          current bucket.
 */
int32_t AlphabeticIndex::getBucketRecordCount() const {
    if (currentBucket_ == NULL) {
        return 0;
    }
    return currentBucket_->records_->size();
}
|
|
|
|
|
|
/**
 * Reset the bucket iterator so that the next call to nextBucket() starts from
 * the first bucket.
 *
 * @param status  ICU error code; nothing is done if it indicates a failure on
 *                entry. Receives errors from buildIndex(); the iterator is
 *                reset even if buildIndex() reports one.
 * @return  This index, to allow chaining.
 */
AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
    if (U_SUCCESS(status)) {
        buildIndex(status);
        labelsIterIndex_ = -1;
        currentBucket_ = NULL;
    }
    return *this;
}
|
|
|
|
|
|
/**
 * Advance the record iterator within the current bucket.
 *
 * @param status  ICU error code. Set to U_INVALID_STATE_ERROR if there is no
 *                current bucket, or to U_ENUM_OUT_OF_SYNC_ERROR if the index
 *                needs rebuilding.
 * @return  TRUE if the iterator now refers to a record; FALSE on error or
 *          when the bucket has no more records.
 */
UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return FALSE;
    }
    if (currentBucket_ == NULL) {
        // Records are iterated per bucket, and the bucket enumeration has
        // not supplied a current bucket.
        status = U_INVALID_STATE_ERROR;
        return FALSE;
    }
    if (indexBuildRequired_) {
        status = U_ENUM_OUT_OF_SYNC_ERROR;
        return FALSE;
    }

    const int32_t next = itemsIterIndex_ + 1;
    const int32_t count = currentBucket_->records_->size();
    if (next >= count) {
        // Park the iterator one past the last record.
        itemsIterIndex_ = count;
        return FALSE;
    }
    itemsIterIndex_ = next;
    return TRUE;
}
|
|
|
|
|
|
/**
 * @return  The name of the record the record iterator refers to, or
 *          *EMPTY_STRING if there is no current bucket or the iterator is not
 *          positioned on a record.
 */
const UnicodeString &AlphabeticIndex::getRecordName() const {
    if (currentBucket_ == NULL) {
        return *EMPTY_STRING;
    }
    if (itemsIterIndex_ < 0 || itemsIterIndex_ >= currentBucket_->records_->size()) {
        return *EMPTY_STRING;
    }
    Record *current = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
    return current->name_;
}
|
|
|
|
const void *AlphabeticIndex::getRecordData() const {
|
|
const void *retPtr = NULL;
|
|
if (currentBucket_ != NULL &&
|
|
itemsIterIndex_ >= 0 &&
|
|
itemsIterIndex_ < currentBucket_->records_->size()) {
|
|
Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
|
|
retPtr = item->data_;
|
|
}
|
|
return retPtr;
|
|
}
|
|
|
|
|
|
/**
 * Move the record iterator back to just before the first record.
 *
 * @return  This index, to allow chaining.
 */
AlphabeticIndex & AlphabeticIndex::resetRecordIterator()
{
    this->itemsIterIndex_ = -1;
    return *this;
}
|
|
|
|
|
|
|
|
/**
 * Create a bucket with an empty record list.
 *
 * @param label          The bucket's label; copied.
 * @param lowerBoundary  The bucket's lower boundary string; copied.
 * @param type           The bucket's label type.
 * @param status         ICU error code. If it indicates a failure on entry no
 *                       record list is allocated and records_ stays NULL. Set
 *                       to U_MEMORY_ALLOCATION_ERROR if the allocation fails.
 */
AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
                                const UnicodeString &lowerBoundary,
                                UAlphabeticIndexLabelType type,
                                UErrorCode &status):
        label_(label),
        lowerBoundary_(lowerBoundary),
        labelType_(type),
        records_(NULL)
{
    if (U_SUCCESS(status)) {
        records_ = new UVector(status);
        if (records_ == NULL && U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
|
|
|
|
|
|
// Frees the bucket's record vector.
AlphabeticIndex::Bucket::~Bucket()
{
    delete this->records_;
}
|
|
|
|
U_NAMESPACE_END
|
|
|
|
#endif
|