scuffed-code/icu4c/source/i18n/nfrs.cpp

/*
******************************************************************************
*   Copyright (C) 1997-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
******************************************************************************
*   file name:  nfrs.cpp
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
* Modification history
* Date        Name      Comments
* 10/11/2001  Doug      Ported from ICU4J
*/

#include "nfrs.h"
#include "nfrule.h"
#include "nfrlist.h"
#include "unicode/uchar.h"

#ifdef RBNF_DEBUG
#include "cmemory.h"
#endif

U_NAMESPACE_BEGIN

#if 0
// euclid's algorithm works with doubles
// note, doubles only get us up to one quadrillion or so, which
// isn't as much range as we get with longs.  We probably still
// want either 64-bit math, or BigInteger.

static llong
util_lcm(llong x, llong y)
{
    x.abs();
    y.abs();

    if (x == 0 || y == 0) {
        return 0;
    } else {
        do {
            if (x < y) {
                llong t = x; x = y; y = t;
            }
            x -= y * (x/y);
        } while (x != 0);

        return y;
    }
}

#else
/**
 * Calculates the least common multiple of x and y.
 */
static llong
util_lcm(llong x, llong y)
{
    // binary gcd algorithm from Knuth, "The Art of Computer Programming,"
    // vol. 2, 1st ed., pp. 298-299
    llong x1 = x;
    llong y1 = y;

    int p2 = 0;
    while ((x1 & 1) == 0 && (y1 & 1) == 0) {
        ++p2;
        x1 >>= 1;
        y1 >>= 1;
    }

    llong t;
    if ((x1 & 1) == 1) {
        t = -y1;
    } else {
        t = x1;
    }

    while (t != 0) {
        while ((t & 1) == 0) {
            t >>= 1;
        }
        if (t > 0) {
            x1 = t;
        } else {
            y1 = -t;
        }
        t = x1 - y1;
    }

    llong gcd = x1 << p2;

    // x * y == gcd(x, y) * lcm(x, y)
    return x / gcd * y;
}
#endif

static const UChar gPercent = 0x0025;
static const UChar gColon = 0x003a;
static const UChar gSemicolon = 0x003b;
static const UChar gLineFeed = 0x000a;

static const UChar gFourSpaces[] =
{
    0x20, 0x20, 0x20, 0x20, 0
}; /* "    " */
static const UChar gPercentPercent[] =
{
    0x25, 0x25, 0
}; /* "%%" */

NFRuleSet::NFRuleSet(UnicodeString* descriptions, int32_t index, UErrorCode& status)
  : name()
  , rules(0)
  , negativeNumberRule(NULL)
  , fIsFractionRuleSet(FALSE)
  , fIsPublic(FALSE)
{
    for (int i = 0; i < 3; ++i) {
        fractionRules[i] = NULL;
    }

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeString& description = descriptions[index]; // !!! make sure index is valid

    // if the description begins with a rule set name (the rule set
    // name can be omitted in formatter descriptions that consist
    // of only one rule set), copy it out into our "name" member
    // and delete it from the description
    if (description.charAt(0) == gPercent) {
        UTextOffset pos = description.indexOf(gColon);
        if (pos == -1) {
            // throw new IllegalArgumentException("Rule set name doesn't end in colon");
            status = U_PARSE_ERROR;
        } else {
            name.setTo(description, 0, pos);
            while (pos < description.length() && u_isWhitespace(description.charAt(++pos))) {
            }
            description.remove(0, pos);
        }
    } else {
        name.setTo("%default");
    }

    if (description.length() == 0) {
        // throw new IllegalArgumentException("Empty rule set description");
        status = U_PARSE_ERROR;
    }

    fIsPublic = name.indexOf(gPercentPercent) != 0;

    // all of the other members of NFRuleSet are initialized
    // by parseRules()
}

void
NFRuleSet::parseRules(UnicodeString& description, const RuleBasedNumberFormat* owner, UErrorCode& status)
{
    // start by creating a Vector whose elements are Strings containing
    // the descriptions of the rules (one rule per element).  The rules
    // are separated by semicolons (there's no escape facility: ALL
    // semicolons are rule delimiters)

    if (U_FAILURE(status)) {
        return;
    }

    // dlf - the original code kept a separate description array for no reason,
    // so I got rid of it.  The loop was too complex so I simplified it.

    UnicodeString currentDescription;
    UTextOffset oldP = 0;
    while (oldP < description.length()) {
        UTextOffset p = description.indexOf(gSemicolon, oldP);
        if (p == -1) {
            p = description.length();
        }
        currentDescription.setTo(description, oldP, p - oldP);
        NFRule::makeRules(currentDescription, this, rules.last(), owner, rules, status);
        oldP = p + 1;
    }

    // for rules that didn't specify a base value, their base values
    // were initialized to 0.  Make another pass through the list and
    // set all those rules' base values.  We also remove any special
    // rules from the list and put them into their own member variables
    llong defaultBaseValue = (int32_t)0;

    // (this isn't a for loop because we might be deleting items from
    // the vector-- we want to make sure we only increment i when
    // we _didn't_ delete aything from the vector)
    uint32_t i = 0;
    while (i < rules.size()) {
        NFRule* rule = rules[i];

        switch (rule->getType()) {
            // if the rule's base value is 0, fill in a default
            // base value (this will be 1 plus the preceding
            // rule's base value for regular rule sets, and the
            // same as the preceding rule's base value in fraction
            // rule sets)
        case NFRule::kNoBase:
            rule->setBaseValue(defaultBaseValue);
            if (!isFractionRuleSet()) {
                ++defaultBaseValue;
            }
            ++i;
            break;

            // if it's the negative-number rule, copy it into its own
            // data member and delete it from the list
        case NFRule::kNegativeNumberRule:
            negativeNumberRule = rules.remove(i);
            break;

            // if it's the improper fraction rule, copy it into the
            // correct element of fractionRules
        case NFRule::kImproperFractionRule:
            fractionRules[0] = rules.remove(i);
            break;

            // if it's the proper fraction rule, copy it into the
            // correct element of fractionRules
        case NFRule::kProperFractionRule:
            fractionRules[1] = rules.remove(i);
            break;

            // if it's the master rule, copy it into the
            // correct element of fractionRules
        case NFRule::kMasterRule:
            fractionRules[2] = rules.remove(i);
            break;

            // if it's a regular rule that already knows its base value,
            // check to make sure the rules are in order, and update
            // the default base value for the next rule
        default:
            if (rule->getBaseValue() < defaultBaseValue) {
                // throw new IllegalArgumentException("Rules are not in order");
                status = U_PARSE_ERROR;
                return;
            }
            defaultBaseValue = rule->getBaseValue();
            if (!isFractionRuleSet()) {
                ++defaultBaseValue;
            }
            ++i;
            break;
        }
    }
}

NFRuleSet::~NFRuleSet()
{
    delete negativeNumberRule;
    delete fractionRules[0];
    delete fractionRules[1];
    delete fractionRules[2];
}

UBool
util_equalRules(const NFRule* rule1, const NFRule* rule2)
{
    if (rule1) {
        if (rule2) {
            return *rule1 == *rule2;
        }
    } else if (!rule2) {
        return TRUE;
    }
    return FALSE;
}

UBool
NFRuleSet::operator==(const NFRuleSet& rhs) const
{
    if (rules.size() == rhs.rules.size() &&
        fIsFractionRuleSet == rhs.fIsFractionRuleSet &&
        name == rhs.name &&
        util_equalRules(negativeNumberRule, rhs.negativeNumberRule) &&
        util_equalRules(fractionRules[0], rhs.fractionRules[0]) &&
        util_equalRules(fractionRules[1], rhs.fractionRules[1]) &&
        util_equalRules(fractionRules[2], rhs.fractionRules[2])) {

        for (uint32_t i = 0; i < rules.size(); ++i) {
            if (*rules[i] != *rhs.rules[i]) {
                return FALSE;
            }
        }
        return TRUE;
    }
    return FALSE;
}

void
NFRuleSet::format(llong number, UnicodeString& toAppendTo, int32_t pos) const
{
    NFRule *rule = findNormalRule(number);
    rule->doFormat(number, toAppendTo, pos);
}

void
NFRuleSet::format(double number, UnicodeString& toAppendTo, int32_t pos) const
{
    NFRule *rule = findDoubleRule(number);
    rule->doFormat(number, toAppendTo, pos);
}

NFRule*
NFRuleSet::findDoubleRule(double number) const
{
    // if this is a fraction rule set, use findFractionRuleSetRule()
    if (isFractionRuleSet()) {
        return findFractionRuleSetRule(number);
    }

    // if the number is negative, return the negative number rule
    // (if there isn't a negative-number rule, we pretend it's a
    // positive number)
    if (number < 0) {
        if (negativeNumberRule) {
            return  negativeNumberRule;
        } else {
            number = -number;
        }
    }

    // if the number isn't an integer, we use one of the fraction rules...
    if (number != uprv_floor(number)) {
        // if the number is between 0 and 1, return the proper
        // fraction rule
        if (number < 1 && fractionRules[1]) {
            return fractionRules[1];
        }
        // otherwise, return the improper fraction rule
        else if (fractionRules[0]) {
            return fractionRules[0];
        }
    }

    // if there's a master rule, use it to format the number
    if (fractionRules[2]) {
        return fractionRules[2];
    }

    // and if we haven't yet returned a rule, use findNormalRule()
    // to find the applicable rule
    llong r = number + 0.5;
    return findNormalRule(r);
}

NFRule *
NFRuleSet::findNormalRule(llong number) const
{
    // if this is a fraction rule set, use findFractionRuleSetRule()
    // to find the rule (we should only go into this clause if the
    // value is 0)
    if (fIsFractionRuleSet) {
        return findFractionRuleSetRule(number.asDouble());
    }

    // if the number is negative, return the negative-number rule
    // (if there isn't one, pretend the number is positive)
    if (number < 0) {
        if (negativeNumberRule) {
            return negativeNumberRule;
        } else {
            number = -number;
        }
    }

    // we have to repeat the preceding two checks, even though we
    // do them in findRule(), because the version of format() that
    // takes a long bypasses findRule() and goes straight to this
    // function.  This function does skip the fraction rules since
    // we know the value is an integer (it also skips the master
    // rule, since it's considered a fraction rule.  Skipping the
    // master rule in this function is also how we avoid infinite
    // recursion)

	// {dlf} unfortunately this fails if there are no rules except
	// special rules.  If there are no rules, use the master rule.

    // binary-search the rule list for the applicable rule
    // (a rule is used for all values from its base value to
    // the next rule's base value)
    int32_t hi = rules.size();
	if (hi > 0) {
		int32_t lo = 0;

		while (lo < hi) {
			int32_t mid = (lo + hi) / 2;
			if (rules[mid]->getBaseValue() == number) {
				return rules[mid];
			}
			else if (rules[mid]->getBaseValue() > number) {
				hi = mid;
			}
			else {
				lo = mid + 1;
			}
		}
		NFRule *result = rules[hi - 1];

		// use shouldRollBack() to see whether we need to invoke the
		// rollback rule (see shouldRollBack()'s documentation for
		// an explanation of the rollback rule).  If we do, roll back
		// one rule and return that one instead of the one we'd normally
		// return
		if (result->shouldRollBack(number.asDouble())) {
			result = rules[hi - 2];
		}
	     return result;
	}
	// else use the master rule
	return fractionRules[2];
}

/**
 * If this rule is a fraction rule set, this function is used by
 * findRule() to select the most appropriate rule for formatting
 * the number.  Basically, the base value of each rule in the rule
 * set is treated as the denominator of a fraction.  Whichever
 * denominator can produce the fraction closest in value to the
 * number passed in is the result.  If there's a tie, the earlier
 * one in the list wins.  (If there are two rules in a row with the
 * same base value, the first one is used when the numerator of the
 * fraction would be 1, and the second rule is used the rest of the
 * time.
 * @param number The number being formatted (which will always be
 * a number between 0 and 1)
 * @return The rule to use to format this number
 */
NFRule*
NFRuleSet::findFractionRuleSetRule(double number) const
{
    // the obvious way to do this (multiply the value being formatted
    // by each rule's base value until you get an integral result)
    // doesn't work because of rounding error.  This method is more
    // accurate

    // find the least common multiple of the rules' base values
    // and multiply this by the number being formatted.  This is
    // all the precision we need, and we can do all of the rest
    // of the math using integer arithmetic
    llong leastCommonMultiple = rules[0]->getBaseValue();
    llong numerator;
    {
        for (uint32_t i = 1; i < rules.size(); ++i) {
            leastCommonMultiple = util_lcm(leastCommonMultiple, rules[i]->getBaseValue());
        }
        numerator = number * leastCommonMultiple.asDouble() + 0.5;
    }
    // for each rule, do the following...
    llong tempDifference;
    llong difference(uprv_maxMantissa());
    int32_t winner = 0;
    for (uint32_t i = 0; i < rules.size(); ++i) {
        // "numerator" is the numerator of the fraction if the
        // denominator is the LCD.  The numerator if the rule's
        // base value is the denominator is "numerator" times the
        // base value divided bythe LCD.  Here we check to see if
        // that's an integer, and if not, how close it is to being
        // an integer.
        tempDifference = numerator * rules[i]->getBaseValue() % leastCommonMultiple;


        // normalize the result of the above calculation: we want
        // the numerator's distance from the CLOSEST multiple
        // of the LCD
        if (leastCommonMultiple - tempDifference < tempDifference) {
            tempDifference = leastCommonMultiple - tempDifference;
        }

        // if this is as close as we've come, keep track of how close
        // that is, and the line number of the rule that did it.  If
        // we've scored a direct hit, we don't have to look at any more
        // rules
        if (tempDifference < difference) {
            difference = tempDifference;
            winner = i;
            if (difference == 0) {
                break;
            }
        }
    }

    // if we have two successive rules that both have the winning base
    // value, then the first one (the one we found above) is used if
    // the numerator of the fraction is 1 and the second one is used if
    // the numerator of the fraction is anything else (this lets us
    // do things like "one third"/"two thirds" without haveing to define
    // a whole bunch of extra rule sets)
    if ((unsigned)(winner + 1) < rules.size() &&
        rules[winner + 1]->getBaseValue() == rules[winner]->getBaseValue()) {
        double n = rules[winner]->getBaseValue().asDouble() * number;
        if (n < 0.5 || n >= 2) {
            ++winner;
        }
    }

    // finally, return the winning rule
    return rules[winner];
}

/**
 * Parses a string.  Matches the string to be parsed against each
 * of its rules (with a base value less than upperBound) and returns
 * the value produced by the rule that matched the most charcters
 * in the source string.
 * @param text The string to parse
 * @param parsePosition The initial position is ignored and assumed
 * to be 0.  On exit, this object has been updated to point to the
 * first character position this rule set didn't consume.
 * @param upperBound Limits the rules that can be allowed to match.
 * Only rules whose base values are strictly less than upperBound
 * are considered.
 * @return The numerical result of parsing this string.  This will
 * be the matching rule's base value, composed appropriately with
 * the results of matching any of its substitutions.  The object
 * will be an instance of Long if it's an integral value; otherwise,
 * it will be an instance of Double.  This function always returns
 * a valid object: If nothing matched the input string at all,
 * this function returns new Long(0), and the parse position is
 * left unchanged.
 */
#ifdef RBNF_DEBUG
#include <stdio.h>

static void dumpUS(FILE* f, const UnicodeString& us) {
  int len = us.length();
  char* buf = new char[len+1];
  us.extract(0, len, buf);
  buf[len] = 0;
  fprintf(f, "%s", buf);
  delete[] buf;
}
#endif

UBool
NFRuleSet::parse(const UnicodeString& text, ParsePosition& pos, double upperBound, Formattable& result) const
{
    // try matching each rule in the rule set against the text being
    // parsed.  Whichever one matches the most characters is the one
    // that determines the value we return.

    result.setLong(0);

    // dump out if there's no text to parse
    if (text.length() == 0) {
        return 0;
    }

    ParsePosition highWaterMark;
    ParsePosition workingPos = pos;

#ifdef RBNF_DEBUG
    fprintf(stderr, "<nfrs> %x '", this);
    dumpUS(stderr, name);
    fprintf(stderr, "' text '");
    dumpUS(stderr, text);
    fprintf(stderr, "'\n");
    fprintf(stderr, "  parse negative: %d\n", this, negativeNumberRule != 0);
#endif

    // start by trying the negative number rule (if there is one)
    if (negativeNumberRule) {
        Formattable tempResult;
#ifdef RBNF_DEBUG
        fprintf(stderr, "  <nfrs before negative> %x ub: %g\n", negativeNumberRule, upperBound);
#endif
        UBool success = negativeNumberRule->doParse(text, workingPos, 0, upperBound, tempResult);
#ifdef RBNF_DEBUG
        fprintf(stderr, "  <nfrs after negative> success: %d wpi: %d\n", success, workingPos.getIndex());
#endif
        if (success && workingPos.getIndex() > highWaterMark.getIndex()) {
            result = tempResult;
            highWaterMark = workingPos;
        }
        workingPos = pos;
    }
#ifdef RBNF_DEBUG
    fprintf(stderr, "<nfrs> continue fractional with text '");
    dumpUS(stderr, text);
    fprintf(stderr, "' hwm: %d\n", highWaterMark.getIndex());
#endif
    // then try each of the fraction rules
    {
        for (int i = 0; i < 3; i++) {
            if (fractionRules[i]) {
                Formattable tempResult;
                UBool success = fractionRules[i]->doParse(text, workingPos, 0, upperBound, tempResult);
                if (success && (workingPos.getIndex() > highWaterMark.getIndex())) {
                    result = tempResult;
                    highWaterMark = workingPos;
                }
                workingPos = pos;
            }
        }
    }
#ifdef RBNF_DEBUG
    fprintf(stderr, "<nfrs> continue other with text '");
    dumpUS(stderr, text);
    fprintf(stderr, "' hwm: %d\n", highWaterMark.getIndex());
#endif

    // finally, go through the regular rules one at a time.  We start
    // at the end of the list because we want to try matching the most
    // sigificant rule first (this helps ensure that we parse
    // "five thousand three hundred six" as
    // "(five thousand) (three hundred) (six)" rather than
    // "((five thousand three) hundred) (six)").  Skip rules whose
    // base values are higher than the upper bound (again, this helps
    // limit ambiguity by making sure the rules that match a rule's
    // are less significant than the rule containing the substitutions)/
    {
        llong ub(upperBound);
#ifdef RBNF_DEBUG
        {
            char ubstr[64];
            ub.lltoa(ubstr, 64);
            fprintf(stderr, "ub: %g, ll: %s(%x/%x)\n", upperBound, ubstr, ub.hi, ub.lo);
        }
#endif
        for (int32_t i = rules.size(); --i >= 0 && highWaterMark.getIndex() < text.length();) {
            if ((!fIsFractionRuleSet) && (rules[i]->getBaseValue() >= ub)) {
                continue;
            }
            Formattable tempResult;
            UBool success = rules[i]->doParse(text, workingPos, fIsFractionRuleSet, upperBound, tempResult);
            if (success && workingPos.getIndex() > highWaterMark.getIndex()) {
                result = tempResult;
                highWaterMark = workingPos;
            }
            workingPos = pos;
        }
    }
#ifdef RBNF_DEBUG
    fprintf(stderr, "<nfrs> exit\n");
#endif
    // finally, update the parse postion we were passed to point to the
    // first character we didn't use, and return the result that
    // corresponds to that string of characters
    pos = highWaterMark;

    return 1;
}

void
NFRuleSet::appendRules(UnicodeString& result) const
{
    // the rule set name goes first...
    result.append(name);
    result.append(gColon);
    result.append(gLineFeed);

    // followed by the regular rules...
    for (uint32_t i = 0; i < rules.size(); i++) {
        result.append(gFourSpaces);
        rules[i]->appendRuleText(result);
        result.append(gLineFeed);
    }

    // followed by the special rules (if they exist)
    if (negativeNumberRule) {
        result.append(gFourSpaces);
        negativeNumberRule->appendRuleText(result);
        result.append(gLineFeed);
    }

    {
        for (uint32_t i = 0; i < 3; ++i) {
            if (fractionRules[i]) {
                result.append(gFourSpaces);
                fractionRules[i]->appendRuleText(result);
                result.append(gLineFeed);
            }
        }
    }
}

U_NAMESPACE_END