2017-01-20 00:20:31 +00:00
|
|
|
// © 2016 and later: Unicode, Inc. and others.
|
2016-06-15 18:58:17 +00:00
|
|
|
// License & terms of use: http://www.unicode.org/copyright.html
|
2014-02-25 21:21:49 +00:00
|
|
|
/*
|
|
|
|
*******************************************************************************
|
2016-05-31 21:45:07 +00:00
|
|
|
* Copyright (C) 2013-2015, International Business Machines
|
|
|
|
* Corporation and others. All Rights Reserved.
|
2014-02-25 21:21:49 +00:00
|
|
|
*******************************************************************************
|
|
|
|
* collationsettings.h
|
|
|
|
*
|
|
|
|
* created on: 2013feb07
|
|
|
|
* created by: Markus W. Scherer
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef __COLLATIONSETTINGS_H__
|
|
|
|
#define __COLLATIONSETTINGS_H__
|
|
|
|
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
|
|
|
|
#if !UCONFIG_NO_COLLATION
|
|
|
|
|
|
|
|
#include "unicode/ucol.h"
|
|
|
|
#include "collation.h"
|
|
|
|
#include "sharedobject.h"
|
|
|
|
#include "umutex.h"
|
|
|
|
|
|
|
|
U_NAMESPACE_BEGIN
|
|
|
|
|
2015-01-07 03:37:11 +00:00
|
|
|
struct CollationData;
|
|
|
|
|
2014-02-25 21:21:49 +00:00
|
|
|
/**
|
|
|
|
* Collation settings/options/attributes.
|
|
|
|
* These are the values that can be changed via API.
|
|
|
|
*/
|
|
|
|
struct U_I18N_API CollationSettings : public SharedObject {
|
|
|
|
/**
|
|
|
|
* Options bit 0: Perform the FCD check on the input text and deliver normalized text.
|
|
|
|
*/
|
|
|
|
static const int32_t CHECK_FCD = 1;
|
|
|
|
/**
|
|
|
|
* Options bit 1: Numeric collation.
|
|
|
|
* Also known as CODAN = COllate Digits As Numbers.
|
|
|
|
*
|
|
|
|
* Treat digit sequences as numbers with CE sequences in numeric order,
|
|
|
|
* rather than returning a normal CE for each digit.
|
|
|
|
*/
|
|
|
|
static const int32_t NUMERIC = 2;
|
|
|
|
/**
|
|
|
|
* "Shifted" alternate handling, see ALTERNATE_MASK.
|
|
|
|
*/
|
|
|
|
static const int32_t SHIFTED = 4;
|
|
|
|
/**
|
|
|
|
* Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
|
|
|
|
* Reserve values 8 and 0xc for shift-trimmed and blanked.
|
|
|
|
*/
|
|
|
|
static const int32_t ALTERNATE_MASK = 0xc;
|
|
|
|
/**
|
|
|
|
* Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
|
|
|
|
*/
|
|
|
|
static const int32_t MAX_VARIABLE_SHIFT = 4;
|
|
|
|
/** maxVariable options bit mask before shifting. */
|
|
|
|
static const int32_t MAX_VARIABLE_MASK = 0x70;
|
|
|
|
/** Options bit 7: Reserved/unused/0. */
|
|
|
|
/**
|
|
|
|
* Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
|
|
|
|
*/
|
|
|
|
static const int32_t UPPER_FIRST = 0x100;
|
|
|
|
/**
|
|
|
|
* Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
|
|
|
|
* unless case level is on (when they are *moved* into the separate case level).
|
|
|
|
* By default, the case bits are removed from the tertiary weight (ignored).
|
|
|
|
*
|
|
|
|
* When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
|
|
|
|
* the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
|
|
|
|
*/
|
|
|
|
static const int32_t CASE_FIRST = 0x200;
|
|
|
|
/**
|
|
|
|
* Options bit mask for caseFirst and upperFirst, before shifting.
|
|
|
|
* Same value as caseFirst==upperFirst.
|
|
|
|
*/
|
|
|
|
static const int32_t CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
|
|
|
|
/**
|
|
|
|
* Options bit 10: Insert the case level between the secondary and tertiary levels.
|
|
|
|
*/
|
|
|
|
static const int32_t CASE_LEVEL = 0x400;
|
|
|
|
/**
|
|
|
|
* Options bit 11: Compare secondary weights backwards. ("French secondary")
|
|
|
|
*/
|
|
|
|
static const int32_t BACKWARD_SECONDARY = 0x800;
|
|
|
|
/**
|
|
|
|
* Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
|
|
|
|
* It is the top used bit field in the options. (No need to mask after shifting.)
|
|
|
|
*/
|
|
|
|
static const int32_t STRENGTH_SHIFT = 12;
|
|
|
|
/** Strength options bit mask before shifting. */
|
|
|
|
static const int32_t STRENGTH_MASK = 0xf000;
|
|
|
|
|
|
|
|
/** maxVariable values */
|
|
|
|
enum MaxVariable {
|
|
|
|
MAX_VAR_SPACE,
|
|
|
|
MAX_VAR_PUNCT,
|
|
|
|
MAX_VAR_SYMBOL,
|
|
|
|
MAX_VAR_CURRENCY
|
|
|
|
};
|
|
|
|
|
|
|
|
CollationSettings()
|
|
|
|
: options((UCOL_DEFAULT_STRENGTH << STRENGTH_SHIFT) |
|
|
|
|
(MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT)),
|
|
|
|
variableTop(0),
|
|
|
|
reorderTable(NULL),
|
2015-01-07 03:37:11 +00:00
|
|
|
minHighNoReorder(0),
|
|
|
|
reorderRanges(NULL), reorderRangesLength(0),
|
2014-02-25 21:21:49 +00:00
|
|
|
reorderCodes(NULL), reorderCodesLength(0), reorderCodesCapacity(0),
|
|
|
|
fastLatinOptions(-1) {}
|
|
|
|
|
|
|
|
CollationSettings(const CollationSettings &other);
|
|
|
|
virtual ~CollationSettings();
|
|
|
|
|
|
|
|
UBool operator==(const CollationSettings &other) const;
|
|
|
|
|
|
|
|
inline UBool operator!=(const CollationSettings &other) const {
|
|
|
|
return !operator==(other);
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t hashCode() const;
|
|
|
|
|
|
|
|
void resetReordering();
|
2015-01-07 03:37:11 +00:00
|
|
|
void aliasReordering(const CollationData &data, const int32_t *codes, int32_t length,
|
|
|
|
const uint32_t *ranges, int32_t rangesLength,
|
|
|
|
const uint8_t *table, UErrorCode &errorCode);
|
|
|
|
void setReordering(const CollationData &data, const int32_t *codes, int32_t codesLength,
|
|
|
|
UErrorCode &errorCode);
|
|
|
|
void copyReorderingFrom(const CollationSettings &other, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
inline UBool hasReordering() const { return reorderTable != NULL; }
|
|
|
|
static UBool reorderTableHasSplitBytes(const uint8_t table[256]);
|
|
|
|
inline uint32_t reorder(uint32_t p) const {
|
|
|
|
uint8_t b = reorderTable[p >> 24];
|
|
|
|
if(b != 0 || p <= Collation::NO_CE_PRIMARY) {
|
|
|
|
return ((uint32_t)b << 24) | (p & 0xffffff);
|
|
|
|
} else {
|
|
|
|
return reorderEx(p);
|
|
|
|
}
|
|
|
|
}
|
2014-02-25 21:21:49 +00:00
|
|
|
|
|
|
|
void setStrength(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
static int32_t getStrength(int32_t options) {
|
|
|
|
return options >> STRENGTH_SHIFT;
|
|
|
|
}
|
|
|
|
|
|
|
|
int32_t getStrength() const {
|
|
|
|
return getStrength(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Sets the options bit for an on/off attribute. */
|
|
|
|
void setFlag(int32_t bit, UColAttributeValue value,
|
|
|
|
int32_t defaultOptions, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
UColAttributeValue getFlag(int32_t bit) const {
|
|
|
|
return ((options & bit) != 0) ? UCOL_ON : UCOL_OFF;
|
|
|
|
}
|
|
|
|
|
|
|
|
void setCaseFirst(UColAttributeValue value, int32_t defaultOptions, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
UColAttributeValue getCaseFirst() const {
|
|
|
|
int32_t option = options & CASE_FIRST_AND_UPPER_MASK;
|
|
|
|
return (option == 0) ? UCOL_OFF :
|
|
|
|
(option == CASE_FIRST) ? UCOL_LOWER_FIRST : UCOL_UPPER_FIRST;
|
|
|
|
}
|
|
|
|
|
|
|
|
void setAlternateHandling(UColAttributeValue value,
|
|
|
|
int32_t defaultOptions, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
UColAttributeValue getAlternateHandling() const {
|
|
|
|
return ((options & ALTERNATE_MASK) == 0) ? UCOL_NON_IGNORABLE : UCOL_SHIFTED;
|
|
|
|
}
|
|
|
|
|
|
|
|
void setMaxVariable(int32_t value, int32_t defaultOptions, UErrorCode &errorCode);
|
|
|
|
|
|
|
|
MaxVariable getMaxVariable() const {
|
|
|
|
return (MaxVariable)((options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
|
|
|
|
*/
|
|
|
|
static inline UBool isTertiaryWithCaseBits(int32_t options) {
|
|
|
|
return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
|
|
|
|
}
|
|
|
|
static uint32_t getTertiaryMask(int32_t options) {
|
|
|
|
// Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
|
|
|
|
return isTertiaryWithCaseBits(options) ?
|
|
|
|
Collation::CASE_AND_TERTIARY_MASK : Collation::ONLY_TERTIARY_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static UBool sortsTertiaryUpperCaseFirst(int32_t options) {
|
|
|
|
// On tertiary level, consider case bits and sort uppercase first
|
|
|
|
// if caseLevel is off and caseFirst==upperFirst.
|
|
|
|
return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline UBool dontCheckFCD() const {
|
|
|
|
return (options & CHECK_FCD) == 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline UBool hasBackwardSecondary() const {
|
|
|
|
return (options & BACKWARD_SECONDARY) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline UBool isNumeric() const {
|
|
|
|
return (options & NUMERIC) != 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** CHECK_FCD etc. */
|
|
|
|
int32_t options;
|
|
|
|
/** Variable-top primary weight. */
|
|
|
|
uint32_t variableTop;
|
2015-01-07 03:37:11 +00:00
|
|
|
/**
|
|
|
|
* 256-byte table for reordering permutation of primary lead bytes; NULL if no reordering.
|
|
|
|
* A 0 entry at a non-zero index means that the primary lead byte is "split"
|
|
|
|
* (there are different offsets for primaries that share that lead byte)
|
|
|
|
* and the reordering offset must be determined via the reorderRanges.
|
|
|
|
*/
|
2014-02-25 21:21:49 +00:00
|
|
|
const uint8_t *reorderTable;
|
2015-01-07 03:37:11 +00:00
|
|
|
/** Limit of last reordered range. 0 if no reordering or no split bytes. */
|
|
|
|
uint32_t minHighNoReorder;
|
|
|
|
/**
|
|
|
|
* Primary-weight ranges for script reordering,
|
|
|
|
* to be used by reorder(p) for split-reordered primary lead bytes.
|
|
|
|
*
|
|
|
|
* Each entry is a (limit, offset) pair.
|
|
|
|
* The upper 16 bits of the entry are the upper 16 bits of the
|
|
|
|
* exclusive primary limit of a range.
|
|
|
|
* Primaries between the previous limit and this one have their lead bytes
|
|
|
|
* modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
|
|
|
|
*
|
|
|
|
* CollationData::makeReorderRanges() writes a full list where the first range
|
|
|
|
* (at least for terminators and separators) has a 0 offset.
|
|
|
|
* The last range has a non-zero offset.
|
|
|
|
* minHighNoReorder is set to the limit of that last range.
|
|
|
|
*
|
|
|
|
* In the settings object, the initial ranges before the first split lead byte
|
|
|
|
* are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
|
|
|
|
* If there are no split-reordered lead bytes, then no ranges are needed.
|
|
|
|
*/
|
|
|
|
const uint32_t *reorderRanges;
|
|
|
|
int32_t reorderRangesLength;
|
2014-02-25 21:21:49 +00:00
|
|
|
/** Array of reorder codes; ignored if reorderCodesLength == 0. */
|
|
|
|
const int32_t *reorderCodes;
|
|
|
|
/** Number of reorder codes; 0 if no reordering. */
|
|
|
|
int32_t reorderCodesLength;
|
|
|
|
/**
|
|
|
|
* Capacity of reorderCodes.
|
2015-01-07 03:37:11 +00:00
|
|
|
* If 0, then the codes, the ranges, and the table are aliases.
|
2014-02-25 21:21:49 +00:00
|
|
|
* Otherwise, this object owns the memory via the reorderCodes pointer;
|
2015-01-07 03:37:11 +00:00
|
|
|
* the codes, the ranges, and the table are in the same memory block, in that order.
|
2014-02-25 21:21:49 +00:00
|
|
|
*/
|
|
|
|
int32_t reorderCodesCapacity;
|
|
|
|
|
|
|
|
/** Options for CollationFastLatin. Negative if disabled. */
|
|
|
|
int32_t fastLatinOptions;
|
|
|
|
uint16_t fastLatinPrimaries[0x180];
|
2015-01-07 03:37:11 +00:00
|
|
|
|
|
|
|
private:
|
|
|
|
void setReorderArrays(const int32_t *codes, int32_t codesLength,
|
|
|
|
const uint32_t *ranges, int32_t rangesLength,
|
|
|
|
const uint8_t *table, UErrorCode &errorCode);
|
|
|
|
uint32_t reorderEx(uint32_t p) const;
|
2014-02-25 21:21:49 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
U_NAMESPACE_END
|
|
|
|
|
|
|
|
#endif // !UCONFIG_NO_COLLATION
|
|
|
|
#endif // __COLLATIONSETTINGS_H__
|