// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
|
|
|
|
#include "unicode/utypes.h"
|
|
|
|
#if !UCONFIG_NO_FORMATTING
|
|
|
|
// Allow implicit conversion from char16_t* to UnicodeString for this file:
|
|
// Helpful in toString methods and elsewhere.
|
|
#define UNISTR_FROM_STRING_EXPLICIT
|
|
|
|
#include "numparse_types.h"
|
|
#include "numparse_affixes.h"
|
|
#include "numparse_utils.h"
|
|
#include "number_utils.h"
|
|
#include "string_segment.h"
|
|
|
|
using namespace icu;
|
|
using namespace icu::numparse;
|
|
using namespace icu::numparse::impl;
|
|
using namespace icu::number;
|
|
using namespace icu::number::impl;
|
|
|
|
|
|
namespace {
|
|
|
|
/**
|
|
* Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
|
|
* Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
|
|
* the given pattern string.
|
|
*/
|
|
static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
|
|
return (affix == nullptr && patternString.isBogus()) ||
|
|
(affix != nullptr && affix->getPattern() == patternString);
|
|
}
|
|
|
|
/**
|
|
* Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
|
|
*/
|
|
static int32_t length(const AffixPatternMatcher* matcher) {
|
|
return matcher == nullptr ? 0 : matcher->getPattern().length();
|
|
}
|
|
|
|
/**
|
|
* Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
|
|
* valid, whether they are equal according to operator==. Similar to Java Objects.equals()
|
|
*/
|
|
static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
|
|
if (lhs == nullptr && rhs == nullptr) {
|
|
return true;
|
|
}
|
|
if (lhs == nullptr || rhs == nullptr) {
|
|
return false;
|
|
}
|
|
return *lhs == *rhs;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
/**
 * Sets up a builder with no matchers collected yet.
 *
 * @param pattern    The affix pattern string; stored in fPattern and later handed to build().
 * @param warehouse  Source of the token matchers that consumeToken() adds.
 * @param ignorables Matcher inserted between tokens by consumeToken(); may be nullptr, in which
 *                   case no ignorables matchers are inserted.
 */
AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(const UnicodeString& pattern,
                                                       AffixTokenMatcherWarehouse& warehouse,
                                                       IgnorablesMatcher* ignorables)
        : fMatchersLen(0),
          fLastTypeOrCp(0),
          fPattern(pattern),
          fWarehouse(warehouse),
          fIgnorables(ignorables) {
}
|
|
|
|
/**
 * Token callback: invoked by AffixUtils::iterateWithConsumer() once per token of the affix
 * pattern, appending the corresponding matcher(s) to this builder.
 *
 * @param type   The token type; TYPE_CODEPOINT marks a literal, anything else a symbol.
 * @param cp     The literal code point (only consulted when type is TYPE_CODEPOINT).
 * @param status Passed to the warehouse when it creates currency or code point matchers.
 */
void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
    const bool haveIgnorables = (fIgnorables != nullptr);

    // Separate tokens with an ignorables matcher, but never emit one directly after an
    // ignorable literal (that would place two ignorables matchers in a row). A negative
    // fLastTypeOrCp is never looked up in the set.
    if (haveIgnorables && fMatchersLen > 0) {
        const bool previousWasIgnorableLiteral =
                fLastTypeOrCp >= 0 && fIgnorables->getSet()->contains(fLastTypeOrCp);
        if (!previousWasIgnorableLiteral) {
            addMatcher(*fIgnorables);
        }
    }

    if (type == TYPE_CODEPOINT) {
        if (haveIgnorables && fIgnorables->getSet()->contains(cp)) {
            // Ignorable literal: the ignorables matcher added above already covers it.
        } else {
            // Non-ignorable literal: needs its own code point matcher.
            NumberParseMatcher* literalMatcher = fWarehouse.nextCodePointMatcher(cp, status);
            if (literalMatcher == nullptr) {
                // OOM; unwind the stack without recording this token.
                return;
            }
            addMatcher(*literalMatcher);
        }
        fLastTypeOrCp = cp;
        return;
    }

    // Symbol token.
    switch (type) {
        case TYPE_MINUS_SIGN:
            addMatcher(fWarehouse.minusSign());
            break;
        case TYPE_PLUS_SIGN:
            addMatcher(fWarehouse.plusSign());
            break;
        case TYPE_PERCENT:
            addMatcher(fWarehouse.percent());
            break;
        case TYPE_PERMILLE:
            addMatcher(fWarehouse.permille());
            break;
        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
            // Every currency width shares one matcher.
            addMatcher(fWarehouse.currency(status));
            break;
        default:
            UPRV_UNREACHABLE;
    }
    fLastTypeOrCp = type;
}
|
|
|
|
/**
 * Appends a pointer to the given matcher to fMatchers, asking the array to grow to twice the
 * current length first when it is full.
 *
 * NOTE(review): the return value of resize() is not inspected here — confirm how allocation
 * failure is meant to be handled.
 */
void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
    const bool isFull = fMatchersLen >= fMatchers.getCapacity();
    if (isFull) {
        fMatchers.resize(fMatchersLen * 2, fMatchersLen);
    }
    fMatchers[fMatchersLen] = &matcher;
    ++fMatchersLen;
}
|
|
|
|
/**
 * Produces an AffixPatternMatcher from the matchers collected so far, the number of
 * collected matchers, and the pattern string this builder was constructed with.
 */
AffixPatternMatcher AffixPatternMatcherBuilder::build() {
    return AffixPatternMatcher(fMatchers, fMatchersLen, fPattern);
}
|
|
|
|
/**
 * Stores the setup data pointer; the individual token matchers are (re)built from it on
 * demand by the accessor methods.
 */
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
        : fSetupData(setupData) {
}
|
|
|
|
/**
 * Re-initializes the stored minus sign matcher from the setup data's dfs (with the second
 * constructor argument set to true) and returns a reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
    fMinusSign = {fSetupData->dfs, true};
    return fMinusSign;
}
|
|
|
|
/**
 * Re-initializes the stored plus sign matcher from the setup data's dfs (with the second
 * constructor argument set to true) and returns a reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
    fPlusSign = {fSetupData->dfs, true};
    return fPlusSign;
}
|
|
|
|
/**
 * Re-initializes the stored percent matcher from the setup data's dfs and returns a
 * reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
    fPercent = {fSetupData->dfs};
    return fPercent;
}
|
|
|
|
/**
 * Re-initializes the stored permille matcher from the setup data's dfs and returns a
 * reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
    fPermille = {fSetupData->dfs};
    return fPermille;
}
|
|
|
|
/**
 * Re-initializes the stored currency matcher from the setup data (currency symbols, dfs and
 * parse flags) and returns a reference to it.
 *
 * @param status Forwarded to the currency matcher's constructor.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
    fCurrency = {fSetupData->currencySymbols, fSetupData->dfs, fSetupData->parseFlags, status};
    return fCurrency;
}
|
|
|
|
/**
 * Returns the ignorables matcher referenced by the setup data.
 */
IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    IgnorablesMatcher& fromSetupData = fSetupData->ignorables;
    return fromSetupData;
}
|
|
|
|
/**
 * Creates a new code point matcher for cp inside fCodePoints.
 *
 * @param cp     The code point the new matcher should match.
 * @param status In/out error code. If it already indicates failure, nothing is created. It is
 *               set to U_MEMORY_ALLOCATION_ERROR when creation yields a null pointer.
 * @return The newly created matcher, or nullptr on pre-existing failure or failed creation.
 */
NumberParseMatcher* AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp, UErrorCode& status) {
    NumberParseMatcher* created = nullptr;
    if (!U_FAILURE(status)) {
        created = fCodePoints.create(cp);
        if (created == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    return created;
}
|
|
|
|
|
|
/**
 * Creates a matcher for the single code point cp.
 */
CodePointMatcher::CodePointMatcher(UChar32 cp)
        : fCp(cp) {
}
|
|
|
|
/**
 * If the segment starts with this matcher's code point, advances the segment past that code
 * point and records the consumed characters in the result.
 *
 * @return Always false, whether or not the code point was consumed.
 */
bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
    if (!segment.startsWith(fCp)) {
        return false;
    }
    segment.adjustOffsetByCodePoint();
    result.setCharsConsumed(segment);
    return false;
}
|
|
|
|
bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
|
|
return segment.startsWith(fCp);
|
|
}
|
|
|
|
/**
 * Returns a fixed debug label for this matcher; the code point itself is not included.
 */
UnicodeString CodePointMatcher::toString() const {
    return UnicodeString(u"<CodePoint>");
}
|
|
|
|
|
|
/**
 * Builds an AffixPatternMatcher for the given affix pattern string.
 *
 * @param affixPattern   The affix pattern to tokenize.
 * @param tokenWarehouse Supplies the token matchers and the ignorables matcher.
 * @param parseFlags     When PARSE_FLAG_EXACT_AFFIX is set, no ignorables matcher is used.
 * @param success        Output; set to false when affixPattern is empty, true otherwise.
 *                       Must not be null (it is written unconditionally).
 * @param status         Forwarded to AffixUtils::iterateWithConsumer().
 * @return A default-constructed matcher for an empty pattern; otherwise the built matcher.
 */
AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
                                                          AffixTokenMatcherWarehouse& tokenWarehouse,
                                                          parse_flags_t parseFlags, bool* success,
                                                          UErrorCode& status) {
    const bool hasContent = !affixPattern.isEmpty();
    *success = hasContent;
    if (!hasContent) {
        return {};
    }

    // Exact-affix mode matches the pattern literally, so no ignorables are interleaved.
    const bool exactAffix = 0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX);
    IgnorablesMatcher* ignorables = exactAffix ? nullptr : &tokenWarehouse.ignorables();

    AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
    AffixUtils::iterateWithConsumer(affixPattern, builder, status);
    return builder.build();
}
|
|
|
|
/**
 * Constructs the matcher from an array of component matchers, the number of entries in that
 * array, and the affix pattern string they were built from.
 */
AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
                                         const UnicodeString& pattern)
        : ArraySeriesMatcher(matchers, matchersLen),
          fPattern(pattern) {
}
|
|
|
|
/**
 * Returns the stored affix pattern as a UnicodeString obtained from
 * fPattern.toAliasedUnicodeString().
 */
UnicodeString AffixPatternMatcher::getPattern() const {
    return fPattern.toAliasedUnicodeString();
}
|
|
|
|
/**
 * Two AffixPatternMatchers are equal when their stored pattern strings are equal; the
 * component matcher arrays are not compared.
 */
bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
    const bool samePattern = (fPattern == other.fPattern);
    return samePattern;
}
|
|
|
|
|
|
/**
 * Stores the token warehouse pointer that createAffixMatchers() later dereferences.
 */
AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
        : fTokenWarehouse(tokenWarehouse) {
}
|
|
|
|
bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
|
|
const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
|
|
UErrorCode& status) {
|
|
UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
|
|
UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
|
|
UnicodeString negPrefixString;
|
|
UnicodeString negSuffixString;
|
|
if (patternInfo.hasNegativeSubpattern()) {
|
|
negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
|
|
negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
|
|
}
|
|
|
|
if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
|
|
AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
|
|
AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
|
|
AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
|
|
AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
|
|
// HACK: Plus and minus sign are a special case: we accept them trailing only if they are
|
|
// trailing in the pattern string.
|
|
&& !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
|
|
!AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
|
|
!AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
|
|
!AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
|
|
// The affixes contain only symbols and ignorables.
|
|
// No need to generate affix matchers.
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
|
|
MutableMatcherCollection& output,
|
|
const IgnorablesMatcher& ignorables,
|
|
parse_flags_t parseFlags, UErrorCode& status) {
|
|
if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
|
|
return;
|
|
}
|
|
|
|
// The affixes have interesting characters, or we are in strict mode.
|
|
// Use initial capacity of 6, the highest possible number of AffixMatchers.
|
|
UnicodeString sb;
|
|
bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
|
|
|
|
int32_t numAffixMatchers = 0;
|
|
int32_t numAffixPatternMatchers = 0;
|
|
|
|
AffixPatternMatcher* posPrefix = nullptr;
|
|
AffixPatternMatcher* posSuffix = nullptr;
|
|
|
|
// Pre-process the affix strings to resolve LDML rules like sign display.
|
|
for (int8_t typeInt = 0; typeInt < PATTERN_SIGN_TYPE_COUNT; typeInt++) {
|
|
auto type = static_cast<PatternSignType>(typeInt);
|
|
|
|
// Skip affixes in some cases
|
|
if (type == PATTERN_SIGN_TYPE_POS
|
|
&& 0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
|
|
continue;
|
|
}
|
|
if (type == PATTERN_SIGN_TYPE_POS_SIGN
|
|
&& 0 == (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) {
|
|
continue;
|
|
}
|
|
|
|
// Generate Prefix
|
|
bool hasPrefix = false;
|
|
PatternStringUtils::patternInfoToStringBuilder(
|
|
patternInfo, true, type, StandardPlural::OTHER, false, sb);
|
|
fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
|
|
sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
|
|
AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
|
|
: nullptr;
|
|
|
|
// Generate Suffix
|
|
bool hasSuffix = false;
|
|
PatternStringUtils::patternInfoToStringBuilder(
|
|
patternInfo, false, type, StandardPlural::OTHER, false, sb);
|
|
fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
|
|
sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
|
|
AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
|
|
: nullptr;
|
|
|
|
if (type == PATTERN_SIGN_TYPE_POS) {
|
|
posPrefix = prefix;
|
|
posSuffix = suffix;
|
|
} else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
|
|
// Skip adding these matchers (we already have equivalents)
|
|
continue;
|
|
}
|
|
|
|
// Flags for setting in the ParsedNumber; the token matchers may add more.
|
|
int flags = (type == PATTERN_SIGN_TYPE_NEG) ? FLAG_NEGATIVE : 0;
|
|
|
|
// Note: it is indeed possible for posPrefix and posSuffix to both be null.
|
|
// We still need to add that matcher for strict mode to work.
|
|
fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
|
|
if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
|
|
// The following if statements are designed to prevent adding two identical matchers.
|
|
if (type == PATTERN_SIGN_TYPE_POS || !equals(prefix, posPrefix)) {
|
|
fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
|
|
}
|
|
if (type == PATTERN_SIGN_TYPE_POS || !equals(suffix, posSuffix)) {
|
|
fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
|
|
}
|
|
}
|
|
}
|
|
|
|
// Put the AffixMatchers in order, and then add them to the output.
|
|
// Since there are at most 9 elements, do a simple-to-implement bubble sort.
|
|
bool madeChanges;
|
|
do {
|
|
madeChanges = false;
|
|
for (int32_t i = 1; i < numAffixMatchers; i++) {
|
|
if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
|
|
madeChanges = true;
|
|
AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
|
|
fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
|
|
fAffixMatchers[i] = std::move(temp);
|
|
}
|
|
}
|
|
} while (madeChanges);
|
|
|
|
for (int32_t i = 0; i < numAffixMatchers; i++) {
|
|
// Enable the following line to debug affixes
|
|
//std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
|
|
output.addMatcher(fAffixMatchers[i]);
|
|
}
|
|
}
|
|
|
|
|
|
/**
 * Creates an affix matcher from a prefix matcher, a suffix matcher (either may be nullptr),
 * and the result flags that postProcess() applies when this affix pair was the one matched.
 */
AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
        : fPrefix(prefix),
          fSuffix(suffix),
          fFlags(flags) {
}
|
|
|
|
/**
 * Attempts to match this affix against the segment: the prefix while no number has been
 * seen yet, the suffix afterwards.
 *
 * No attempt is made (and false is returned) when:
 *   - prefix phase: a prefix was already recorded in the result, or this matcher has no prefix;
 *   - suffix phase: a suffix was already recorded, this matcher has no suffix, or the prefix
 *     recorded in the result does not correspond to this matcher's prefix.
 *
 * When the attempt moves the segment offset, the affix pattern is recorded in result.prefix
 * or result.suffix respectively.
 *
 * @return The value returned by the underlying prefix/suffix matcher, or false if skipped.
 */
bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    const bool prefixPhase = !result.seenNumber();

    if (prefixPhase) {
        if (!result.prefix.isBogus() || fPrefix == nullptr) {
            return false;
        }
    } else {
        if (!result.suffix.isBogus() || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
            return false;
        }
    }

    AffixPatternMatcher* affix = prefixPhase ? fPrefix : fSuffix;
    auto& recordedPattern = prefixPhase ? result.prefix : result.suffix;

    // Attempt the match; a changed offset means something was consumed.
    const int offsetBefore = segment.getOffset();
    const bool maybeMore = affix->match(segment, result, status);
    if (segment.getOffset() != offsetBefore) {
        recordedPattern = affix->getPattern();
    }
    return maybeMore;
}
|
|
|
|
bool AffixMatcher::smokeTest(const StringSegment& segment) const {
|
|
return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
|
|
(fSuffix != nullptr && fSuffix->smokeTest(segment));
|
|
}
|
|
|
|
/**
 * If this affix pair is the one recorded in the result, applies this matcher's flags to the
 * result and lets the prefix and suffix matchers post-process it. Otherwise does nothing.
 */
void AffixMatcher::postProcess(ParsedNumber& result) const {
    const bool isOurAffix = matched(fPrefix, result.prefix) && matched(fSuffix, result.suffix);
    if (!isOurAffix) {
        return;
    }

    // Replace bogus prefix/suffix with non-null values (empty string).
    // Used by strict mode to determine whether an entire affix pair was matched.
    if (result.prefix.isBogus()) {
        result.prefix = UnicodeString();
    }
    if (result.suffix.isBogus()) {
        result.suffix = UnicodeString();
    }

    result.flags |= fFlags;

    if (fPrefix != nullptr) {
        fPrefix->postProcess(result);
    }
    if (fSuffix != nullptr) {
        fSuffix->postProcess(result);
    }
}
|
|
|
|
/**
 * Orders affix matchers by pattern length, longest first: prefix length is compared first,
 * then suffix length. A null prefix/suffix counts as length 0.
 *
 * @return -1 if this matcher sorts before rhs, 1 if it sorts after, 0 if both lengths tie.
 */
int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
    const int32_t thisPrefixLen = length(fPrefix);
    const int32_t thatPrefixLen = length(rhs.fPrefix);
    if (thisPrefixLen != thatPrefixLen) {
        return thisPrefixLen > thatPrefixLen ? -1 : 1;
    }

    const int32_t thisSuffixLen = length(fSuffix);
    const int32_t thatSuffixLen = length(rhs.fSuffix);
    if (thisSuffixLen != thatSuffixLen) {
        return thisSuffixLen > thatSuffixLen ? -1 : 1;
    }

    return 0;
}
|
|
|
|
/**
 * Returns a debug description of the form "<Affix PREFIX#SUFFIX>" (or
 * "<Affix:negative PREFIX#SUFFIX>" when FLAG_NEGATIVE is set), where a missing prefix or
 * suffix is rendered as "null".
 */
UnicodeString AffixMatcher::toString() const {
    const bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
    const UnicodeString signLabel(isNegative ? u":negative " : u" ");
    const UnicodeString prefixText =
            (fPrefix != nullptr) ? fPrefix->getPattern() : UnicodeString(u"null");
    const UnicodeString suffixText =
            (fSuffix != nullptr) ? fSuffix->getPattern() : UnicodeString(u"null");

    return UnicodeString(u"<Affix") + signLabel + prefixText + u"#" + suffixText + u">";
}
|
|
|
|
|
|
#endif /* #if !UCONFIG_NO_FORMATTING */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|