scuffed-code/icu4c/source/i18n/affixpatternparser.cpp

697 lines
20 KiB
C++
Raw Normal View History

// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
* Copyright (C) 2015, International Business Machines
* Corporation and others. All Rights Reserved.
*
* file name: affixpatternparser.cpp
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_FORMATTING
#include "unicode/dcfmtsym.h"
#include "unicode/plurrule.h"
#include "unicode/ucurr.h"
#include "affixpatternparser.h"
#include "charstr.h"
#include "precision.h"
#include "uassert.h"
#include "unistrappender.h"
// Three copies of U+00A4 (the generic currency sign). Code in this file uses
// the first 1, 2, or 3 code units of this array as the placeholder symbol,
// ISO code, and long name respectively.
static UChar gDefaultSymbols[] = {0xa4, 0xa4, 0xa4};
// Symbols used by the AffixPatternParser default constructor.
// U+0025 percent sign.
static UChar gPercent = 0x25;
// U+2030 per mille sign.
static UChar gPerMill = 0x2030;
// U+002D hyphen-minus.
static UChar gNegative = 0x2D;
// U+002B plus sign.
static UChar gPositive = 0x2B;
// Token encoding: every token occupies one UChar.
//   bits 0-7  : a length byte
//   bits 8-14 : the AffixPattern::ETokenType
//   bit 15    : "long" flag, set on the extra units that carry the higher
//               bytes of a literal length that does not fit in one byte
//
// Builds a packed token. Both macro arguments are fully parenthesized so that
// arbitrary expressions (e.g. `a | b`) can be passed without operator
// precedence changing the result.
#define PACK_TOKEN_AND_LENGTH(t, l) ((UChar) (((t) << 8) | ((l) & 0xFF)))
// Extracts the token type (bits 8-14), ignoring the "long" flag.
#define UNPACK_TOKEN(c) ((AffixPattern::ETokenType) (((c) >> 8) & 0x7F))
// Non-zero when the "long" flag (bit 15) is set.
#define UNPACK_LONG(c) (((c) >> 8) & 0x80)
// Extracts the length byte (bits 0-7).
#define UNPACK_LENGTH(c) ((c) & 0xFF)
U_NAMESPACE_BEGIN
static int32_t
nextToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
if (buffer[idx] != 0x27 || idx + 1 == len) {
*token = buffer[idx];
return 1;
}
*token = buffer[idx + 1];
if (buffer[idx + 1] == 0xA4) {
int32_t i = 2;
for (; idx + i < len && i < 4 && buffer[idx + i] == buffer[idx + 1]; ++i)
;
return i;
}
return 2;
}
static int32_t
nextUserToken(const UChar *buffer, int32_t idx, int32_t len, UChar *token) {
*token = buffer[idx];
int32_t max;
switch (buffer[idx]) {
case 0x27:
max = 2;
break;
case 0xA4:
max = 3;
break;
default:
max = 1;
break;
}
int32_t i = 1;
for (; idx + i < len && i < max && buffer[idx + i] == buffer[idx]; ++i)
;
return i;
}
// Default constructor: initializes the symbol, ISO code, and long name to
// one, two, and three U+00A4 currency signs (taken from gDefaultSymbols)
// and marks this object as holding default values.
CurrencyAffixInfo::CurrencyAffixInfo()
: fSymbol(gDefaultSymbols, 1),
fISO(gDefaultSymbols, 2),
fLong(DigitAffix(gDefaultSymbols, 3)),
fIsDefault(TRUE) {
}
void
CurrencyAffixInfo::set(
const char *locale,
const PluralRules *rules,
const UChar *currency,
UErrorCode &status) {
if (U_FAILURE(status)) {
return;
}
fIsDefault = FALSE;
if (currency == NULL) {
fSymbol.setTo(gDefaultSymbols, 1);
fISO.setTo(gDefaultSymbols, 2);
fLong.remove();
fLong.append(gDefaultSymbols, 3);
fIsDefault = TRUE;
return;
}
int32_t len;
UBool unusedIsChoice;
const UChar *symbol = ucurr_getName(
currency, locale, UCURR_SYMBOL_NAME, &unusedIsChoice,
&len, &status);
if (U_FAILURE(status)) {
return;
}
fSymbol.setTo(symbol, len);
fISO.setTo(currency, u_strlen(currency));
fLong.remove();
StringEnumeration* keywords = rules->getKeywords(status);
if (U_FAILURE(status)) {
return;
}
const UnicodeString* pluralCount;
while ((pluralCount = keywords->snext(status)) != NULL) {
CharString pCount;
pCount.appendInvariantChars(*pluralCount, status);
const UChar *pluralName = ucurr_getPluralName(
currency, locale, &unusedIsChoice, pCount.data(),
&len, &status);
fLong.setVariant(pCount.data(), UnicodeString(pluralName, len), status);
}
delete keywords;
}
/**
 * Configures a FixedPrecision from the fraction-digit count and rounding
 * increment that ucurr reports for a currency and usage.
 *
 * @param currency  ISO currency code passed to the ucurr lookups.
 * @param usage     currency usage passed to the ucurr lookups.
 * @param precision receives the min/max fraction digit count and the
 *                  rounding increment.
 * @param status    in/out error code. Nothing is done if it already
 *                  indicates failure on entry.
 */
void
CurrencyAffixInfo::adjustPrecision(
        const UChar *currency, const UCurrencyUsage usage,
        FixedPrecision &precision, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    const int32_t fractionDigits =
            ucurr_getDefaultFractionDigitsForUsage(currency, usage, &status);
    precision.fMin.setFracDigitCount(fractionDigits);
    precision.fMax.setFracDigitCount(fractionDigits);

    const double roundingStep =
            ucurr_getRoundingIncrementForUsage(currency, usage, &status);
    if (roundingStep != 0.0) {
        precision.fRoundingIncrement.set(roundingStep);
        // Trim the value to guard against binary round-off error.
        precision.fRoundingIncrement.round(6);
    } else {
        precision.fRoundingIncrement.clear();
    }
}
void
AffixPattern::addLiteral(
const UChar *literal, int32_t start, int32_t len) {
char32Count += u_countChar32(literal + start, len);
literals.append(literal, start, len);
int32_t tlen = tokens.length();
// Takes 4 UChars to encode maximum literal length.
UChar *tokenChars = tokens.getBuffer(tlen + 4);
// find start of literal size. May be tlen if there is no literal.
// While finding start of literal size, compute literal length
int32_t literalLength = 0;
int32_t tLiteralStart = tlen;
while (tLiteralStart > 0 && UNPACK_TOKEN(tokenChars[tLiteralStart - 1]) == kLiteral) {
tLiteralStart--;
literalLength <<= 8;
literalLength |= UNPACK_LENGTH(tokenChars[tLiteralStart]);
}
// Add number of chars we just added to literal
literalLength += len;
// Now encode the new length starting at tLiteralStart
tlen = tLiteralStart;
tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral, literalLength & 0xFF);
literalLength >>= 8;
while (literalLength) {
tokenChars[tlen++] = PACK_TOKEN_AND_LENGTH(kLiteral | 0x80, literalLength & 0xFF);
literalLength >>= 8;
}
tokens.releaseBuffer(tlen);
}
// Adds a single token of type t; shorthand for add(t, 1).
void
AffixPattern::add(ETokenType t) {
add(t, 1);
}
// Adds a kCurrency token whose length byte is count.
void
AffixPattern::addCurrency(uint8_t count) {
add(kCurrency, count);
}
/**
 * Appends a non-literal token to this pattern.
 *
 * @param t     token type; must not be kLiteral (asserted).
 * @param count value stored in the token's length byte; also added to
 *              char32Count.
 */
void
AffixPattern::add(ETokenType t, uint8_t count) {
    U_ASSERT(t != kLiteral);
    char32Count += count;
    // Remember which kinds of special token this pattern contains.
    if (t == kCurrency) {
        hasCurrencyToken = TRUE;
    } else if (t == kPercent) {
        hasPercentToken = TRUE;
    } else if (t == kPerMill) {
        hasPermillToken = TRUE;
    }
    tokens.append(PACK_TOKEN_AND_LENGTH(t, count));
}
/**
 * Appends every token of another pattern to this one.
 *
 * Literal tokens are copied with their text, currency tokens keep their
 * length, and every other token is added with a length of one.
 *
 * @param other the pattern to copy tokens from.
 * @return *this.
 */
AffixPattern &
AffixPattern::append(const AffixPattern &other) {
    AffixPatternIterator it;
    other.iterator(it);
    UnicodeString text;
    while (it.nextToken()) {
        const ETokenType type = it.getTokenType();
        if (type == kLiteral) {
            it.getLiteral(text);
            addLiteral(text.getBuffer(), 0, text.length());
        } else if (type == kCurrency) {
            addCurrency(it.getTokenLength());
        } else {
            add(type);
        }
    }
    return *this;
}
/**
 * Resets this pattern to empty: no tokens, no literal text, no
 * special-token flags, and a code point count of zero.
 */
void
AffixPattern::remove() {
    // Counters and flags first...
    char32Count = 0;
    hasPermillToken = FALSE;
    hasPercentToken = FALSE;
    hasCurrencyToken = FALSE;
    // ...then the stored text.
    literals.remove();
    tokens.remove();
}
// escapes literals for strings where special characters are NOT escaped
// except for apostrophe.
static void escapeApostropheInLiteral(
const UnicodeString &literal, UnicodeStringAppender &appender) {
int32_t len = literal.length();
const UChar *buffer = literal.getBuffer();
for (int32_t i = 0; i < len; ++i) {
UChar ch = buffer[i];
switch (ch) {
case 0x27:
appender.append((UChar) 0x27);
appender.append((UChar) 0x27);
break;
default:
appender.append(ch);
break;
}
}
}
// escapes literals for user strings where special characters in literals
// are escaped with apostrophe.
static void escapeLiteral(
const UnicodeString &literal, UnicodeStringAppender &appender) {
int32_t len = literal.length();
const UChar *buffer = literal.getBuffer();
for (int32_t i = 0; i < len; ++i) {
UChar ch = buffer[i];
switch (ch) {
case 0x27:
appender.append((UChar) 0x27);
appender.append((UChar) 0x27);
break;
case 0x25:
appender.append((UChar) 0x27);
appender.append((UChar) 0x25);
appender.append((UChar) 0x27);
break;
case 0x2030:
appender.append((UChar) 0x27);
appender.append((UChar) 0x2030);
appender.append((UChar) 0x27);
break;
case 0xA4:
appender.append((UChar) 0x27);
appender.append((UChar) 0xA4);
appender.append((UChar) 0x27);
break;
case 0x2D:
appender.append((UChar) 0x27);
appender.append((UChar) 0x2D);
appender.append((UChar) 0x27);
break;
case 0x2B:
appender.append((UChar) 0x27);
appender.append((UChar) 0x2B);
appender.append((UChar) 0x27);
break;
default:
appender.append(ch);
break;
}
}
}
/**
 * Appends the internal string form of this pattern to appendTo.
 *
 * In this form every special token is written as an apostrophe followed by
 * its character (a currency token repeats U+00A4 once per unit of its
 * length), and apostrophes inside literals are doubled.
 *
 * @param appendTo the string to append to.
 * @return appendTo.
 */
UnicodeString &
AffixPattern::toString(UnicodeString &appendTo) const {
    AffixPatternIterator it;
    iterator(it);
    UnicodeStringAppender out(appendTo);
    UnicodeString text;
    while (it.nextToken()) {
        const ETokenType type = it.getTokenType();
        if (type == kLiteral) {
            escapeApostropheInLiteral(it.getLiteral(text), out);
            continue;
        }
        if (type == kCurrency) {
            out.append((UChar) 0x27);
            for (int32_t remaining = it.getTokenLength(); remaining > 0; --remaining) {
                out.append((UChar) 0xA4);
            }
            continue;
        }
        // Single-character special tokens; 0 means "unknown token type".
        UChar symbol = 0;
        switch (type) {
        case kPercent:
            symbol = 0x25;
            break;
        case kPerMill:
            symbol = 0x2030;
            break;
        case kNegative:
            symbol = 0x2D;
            break;
        case kPositive:
            symbol = 0x2B;
            break;
        default:
            U_ASSERT(FALSE);
            break;
        }
        if (symbol != 0) {
            out.append((UChar) 0x27);
            out.append(symbol);
        }
    }
    return appendTo;
}
/**
 * Appends the user string form of this pattern to appendTo.
 *
 * In this form special tokens are written as their bare character (a
 * currency token repeats U+00A4 once per unit of its length), while special
 * characters that occur inside literals are escaped by escapeLiteral().
 *
 * @param appendTo the string to append to.
 * @return appendTo.
 */
UnicodeString &
AffixPattern::toUserString(UnicodeString &appendTo) const {
    AffixPatternIterator it;
    iterator(it);
    UnicodeStringAppender out(appendTo);
    UnicodeString text;
    while (it.nextToken()) {
        const ETokenType type = it.getTokenType();
        if (type == kLiteral) {
            escapeLiteral(it.getLiteral(text), out);
        } else if (type == kCurrency) {
            for (int32_t remaining = it.getTokenLength(); remaining > 0; --remaining) {
                out.append((UChar) 0xA4);
            }
        } else if (type == kPercent) {
            out.append((UChar) 0x25);
        } else if (type == kPerMill) {
            out.append((UChar) 0x2030);
        } else if (type == kNegative) {
            out.append((UChar) 0x2D);
        } else if (type == kPositive) {
            out.append((UChar) 0x2B);
        } else {
            U_ASSERT(FALSE);
        }
    }
    return appendTo;
}
/**
 * Collects literal code units in a small fixed-size buffer and hands them to
 * an AffixPattern via addLiteral() whenever the buffer fills up, flush() is
 * called, or this object is destroyed.
 */
class AffixPatternAppender : public UMemory {
public:
    AffixPatternAppender(AffixPattern &dest) : fDest(&dest), fIdx(0) { }

    /** Buffers one UTF-16 code unit, emptying a full buffer first. */
    inline void append(UChar x) {
        if (fIdx == UPRV_LENGTHOF(fBuffer)) {
            flush();
        }
        fBuffer[fIdx++] = x;
    }

    /**
     * Buffers one code point. The buffer is emptied first unless at least
     * two code units are free, so a surrogate pair always fits.
     */
    inline void append(UChar32 x) {
        if (fIdx >= UPRV_LENGTHOF(fBuffer) - 1) {
            flush();
        }
        U16_APPEND_UNSAFE(fBuffer, fIdx, x);
    }

    /** Passes any buffered code units to the pattern and empties the buffer. */
    inline void flush() {
        if (fIdx != 0) {
            fDest->addLiteral(fBuffer, 0, fIdx);
            fIdx = 0;
        }
    }

    /** Flushes whatever is still buffered when going out of scope. */
    ~AffixPatternAppender() {
        flush();
    }

private:
    AffixPattern *fDest;
    int32_t fIdx;
    UChar fBuffer[32];

    // Declared but not defined: copying is not supported.
    AffixPatternAppender(const AffixPatternAppender &other);
    AffixPatternAppender &operator=(const AffixPatternAppender &other);
};
/**
 * Parses a user-facing affix string and appends the resulting tokens to
 * appendTo.
 *
 * Outside quotes, % (U+0025), per mille (U+2030), - (U+002D), + (U+002B) and
 * runs of one to three currency signs (U+00A4) become special tokens. A lone
 * apostrophe toggles quoting, a doubled apostrophe is a literal apostrophe,
 * and everything inside quotes is literal text.
 *
 * @param affixStr the string to parse.
 * @param appendTo the pattern that receives the tokens.
 * @param status   in/out error code. Nothing is done if it already indicates
 *                 failure on entry.
 * @return appendTo.
 */
AffixPattern &
AffixPattern::parseUserAffixString(
        const UnicodeString &affixStr,
        AffixPattern &appendTo,
        UErrorCode &status) {
    if (U_FAILURE(status)) {
        return appendTo;
    }
    const int32_t length = affixStr.length();
    const UChar *chars = affixStr.getBuffer();
    bool quoted = false;
    // Buffers literal text; must be flushed before any special token is
    // added so tokens stay in order. Flushes itself on destruction.
    AffixPatternAppender literalSink(appendTo);
    int32_t pos = 0;
    while (pos < length) {
        UChar ch;
        const int32_t run = nextUserToken(chars, pos, length, &ch);
        pos += run;

        if (ch == 0x27) {
            if (run == 1) {
                // Lone apostrophe: enter or leave quoted mode.
                quoted = !quoted;
            } else {
                // Doubled apostrophe: a literal apostrophe in either mode.
                literalSink.append((UChar) 0x27);
            }
            continue;
        }

        if (ch == 0xA4) {
            if (quoted) {
                // The run may span several currency signs; keep them all.
                for (int32_t j = 0; j < run; ++j) {
                    literalSink.append((UChar) 0xA4);
                }
            } else {
                literalSink.flush();
                appendTo.add(kCurrency, run);
            }
            continue;
        }

        if (!quoted) {
            bool special = true;
            ETokenType type = kLiteral;
            switch (ch) {
            case 0x25:
                type = kPercent;
                break;
            case 0x2030:
                type = kPerMill;
                break;
            case 0x2D:
                type = kNegative;
                break;
            case 0x2B:
                type = kPositive;
                break;
            default:
                special = false;
                break;
            }
            if (special) {
                literalSink.flush();
                appendTo.add(type, 1);
                continue;
            }
        }
        literalSink.append(ch);
    }
    return appendTo;
}
/**
 * Parses an internal affix string, in which an apostrophe marks the next
 * character as a special token, and appends the resulting tokens to appendTo.
 *
 * @param affixStr the string to parse.
 * @param appendTo the pattern that receives the tokens.
 * @param status   in/out error code. Nothing is done if it already indicates
 *                 failure on entry; set to U_PARSE_ERROR if a currency token
 *                 would be longer than three.
 * @return appendTo.
 */
AffixPattern &
AffixPattern::parseAffixString(
        const UnicodeString &affixStr,
        AffixPattern &appendTo,
        UErrorCode &status) {
    if (U_FAILURE(status)) {
        return appendTo;
    }
    const int32_t length = affixStr.length();
    const UChar *chars = affixStr.getBuffer();
    int32_t pos = 0;
    while (pos < length) {
        UChar ch;
        int32_t width = nextToken(chars, pos, length, &ch);
        if (width == 1) {
            // Gather the whole run of plain code units into one literal.
            const int32_t runStart = pos;
            do {
                ++pos;
            } while (pos < length && (width = nextToken(chars, pos, length, &ch)) == 1);
            appendTo.addLiteral(chars, runStart, pos - runStart);
            if (pos == length) {
                // The literal ran to the end of the string.
                return appendTo;
            }
        }
        // ch/width now describe an apostrophe-introduced token.
        pos += width;
        if (ch == 0x25) {
            appendTo.add(kPercent, 1);
        } else if (ch == 0x2030) {
            appendTo.add(kPerMill, 1);
        } else if (ch == 0x2D) {
            appendTo.add(kNegative, 1);
        } else if (ch == 0x2B) {
            appendTo.add(kPositive, 1);
        } else if (ch == 0xA4) {
            // width counts the leading apostrophe as well.
            if (width - 1 > 3) {
                status = U_PARSE_ERROR;
                return appendTo;
            }
            appendTo.add(kCurrency, width - 1);
        } else {
            // An escaped ordinary character is just literal text.
            appendTo.addLiteral(&ch, 0, 1);
        }
    }
    return appendTo;
}
/**
 * Points an iterator at the start of this pattern.
 *
 * The iterator keeps pointers to this pattern's token and literal strings.
 *
 * @param result the iterator to initialize.
 * @return result.
 */
AffixPatternIterator &
AffixPattern::iterator(AffixPatternIterator &result) const {
    // Attach the iterator to this pattern's storage...
    result.tokens = &tokens;
    result.literals = &literals;
    // ...and rewind every cursor.
    result.nextTokenIndex = 0;
    result.nextLiteralIndex = 0;
    result.lastLiteralLength = 0;
    return result;
}
/**
 * Advances to the next token.
 *
 * For a literal token this also skips the extra "long" units that hold the
 * higher bytes of its length, decodes that length into lastLiteralLength,
 * and advances nextLiteralIndex past the literal's text.
 *
 * @return FALSE if there are no more tokens, TRUE otherwise.
 */
UBool
AffixPatternIterator::nextToken() {
    const int32_t unitCount = tokens->length();
    if (nextTokenIndex == unitCount) {
        return FALSE;
    }
    const int32_t first = nextTokenIndex++;
    const UChar *units = tokens->getBuffer();
    if (UNPACK_TOKEN(units[first]) != AffixPattern::kLiteral) {
        return TRUE;
    }
    // Step over the continuation units of this literal's length.
    while (nextTokenIndex < unitCount && UNPACK_LONG(units[nextTokenIndex])) {
        ++nextTokenIndex;
    }
    // Decode from the last unit (highest byte) back to the first unit, which
    // is the one without the "long" flag.
    lastLiteralLength = 0;
    for (int32_t i = nextTokenIndex - 1; ; --i) {
        lastLiteralLength <<= 8;
        lastLiteralLength |= UNPACK_LENGTH(units[i]);
        if (!UNPACK_LONG(units[i])) {
            break;
        }
    }
    nextLiteralIndex += lastLiteralLength;
    return TRUE;
}
/**
 * Returns the type of the current token, i.e. the one most recently reached
 * by nextToken().
 */
AffixPattern::ETokenType
AffixPatternIterator::getTokenType() const {
    const UChar packed = tokens->charAt(nextTokenIndex - 1);
    return UNPACK_TOKEN(packed);
}
/**
 * Copies the text of the most recently visited literal token into result.
 *
 * @param result receives the lastLiteralLength code units that end at
 *               nextLiteralIndex in the literal storage.
 * @return result.
 */
UnicodeString &
AffixPatternIterator::getLiteral(UnicodeString &result) const {
    const int32_t textStart = nextLiteralIndex - lastLiteralLength;
    const UChar *text = literals->getBuffer() + textStart;
    result.setTo(text, lastLiteralLength);
    return result;
}
int32_t
AffixPatternIterator::getTokenLength() const {
const UChar *tokenBuffer = tokens->getBuffer();
AffixPattern::ETokenType type = UNPACK_TOKEN(tokenBuffer[nextTokenIndex - 1]);
return type == AffixPattern::kLiteral ? lastLiteralLength : UNPACK_LENGTH(tokenBuffer[nextTokenIndex - 1]);
}
// Default constructor: uses the file-level default symbols U+0025 (percent),
// U+2030 (per mille), U+002D (negative) and U+002B (positive).
AffixPatternParser::AffixPatternParser()
: fPercent(gPercent), fPermill(gPerMill), fNegative(gNegative), fPositive(gPositive) {
}
// Constructs a parser whose percent, per mille, minus and plus symbols are
// taken from the given DecimalFormatSymbols (via setDecimalFormatSymbols).
AffixPatternParser::AffixPatternParser(
const DecimalFormatSymbols &symbols) {
setDecimalFormatSymbols(symbols);
}
void
AffixPatternParser::setDecimalFormatSymbols(
const DecimalFormatSymbols &symbols) {
fPercent = symbols.getConstSymbol(DecimalFormatSymbols::kPercentSymbol);
fPermill = symbols.getConstSymbol(DecimalFormatSymbols::kPerMillSymbol);
fNegative = symbols.getConstSymbol(DecimalFormatSymbols::kMinusSignSymbol);
fPositive = symbols.getConstSymbol(DecimalFormatSymbols::kPlusSignSymbol);
}
/**
 * Expands an AffixPattern into a PluralAffix by replacing every special
 * token with its concrete text.
 *
 * Percent, per mille and sign tokens use this parser's symbols. A currency
 * token of length 1, 2 or 3 uses the symbol, ISO code or long name from
 * currencyAffixInfo. Literal tokens are copied as they are.
 *
 * @param affixPattern      the pattern to expand.
 * @param currencyAffixInfo supplies the currency texts.
 * @param appendTo          receives the expanded affix.
 * @param status            in/out error code. Nothing is done if it already
 *                          indicates failure on entry.
 * @return appendTo.
 */
PluralAffix &
AffixPatternParser::parse(
        const AffixPattern &affixPattern,
        const CurrencyAffixInfo &currencyAffixInfo,
        PluralAffix &appendTo,
        UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return appendTo;
    }
    AffixPatternIterator it;
    affixPattern.iterator(it);
    UnicodeString text;
    while (it.nextToken()) {
        const AffixPattern::ETokenType type = it.getTokenType();
        if (type == AffixPattern::kLiteral) {
            appendTo.append(it.getLiteral(text));
        } else if (type == AffixPattern::kPercent) {
            appendTo.append(fPercent, UNUM_PERCENT_FIELD);
        } else if (type == AffixPattern::kPerMill) {
            appendTo.append(fPermill, UNUM_PERMILL_FIELD);
        } else if (type == AffixPattern::kNegative) {
            appendTo.append(fNegative, UNUM_SIGN_FIELD);
        } else if (type == AffixPattern::kPositive) {
            appendTo.append(fPositive, UNUM_SIGN_FIELD);
        } else if (type == AffixPattern::kCurrency) {
            // The token length selects which currency representation to use.
            const int32_t width = it.getTokenLength();
            if (width == 1) {
                appendTo.append(
                        currencyAffixInfo.getSymbol(), UNUM_CURRENCY_FIELD);
            } else if (width == 2) {
                appendTo.append(
                        currencyAffixInfo.getISO(), UNUM_CURRENCY_FIELD);
            } else if (width == 3) {
                appendTo.append(
                        currencyAffixInfo.getLong(), UNUM_CURRENCY_FIELD, status);
            } else {
                U_ASSERT(FALSE);
            }
        } else {
            U_ASSERT(FALSE);
        }
    }
    return appendTo;
}
U_NAMESPACE_END
#endif /* #if !UCONFIG_NO_FORMATTING */