/* ******************************************************************************* * Copyright (C) 1997-2001, International Business Machines Corporation and others. All Rights Reserved. ******************************************************************************* */ #ifndef RBNF_H #define RBNF_H #include "unicode/coll.h" #include "unicode/dcfmtsym.h" #include "unicode/fmtable.h" #include "unicode/locid.h" #include "unicode/numfmt.h" #include "unicode/unistr.h" #include "unicode/utypes.h" U_NAMESPACE_BEGIN class NFRuleSet; /** * \file * \brief C++ API: RuleBasedNumberFormat * *

Rule Based Number Format C++ API

* *

A class that formats numbers according to a set of rules. This number formatter is * typically used for spelling out numeric values in words (e.g., 25,376 as * "twenty-five thousand three hundred seventy-six" or "vingt-cinq mille trois * cents soixante-seize" or * "fünfundzwanzigtausenddreihundertsechsundsiebzig"), but can also be used for * other complicated formatting tasks, such as formatting a number of seconds as hours, * minutes and seconds (e.g., 3,730 as "1:02:10").

* *

The resources contain three predefined formatters for each locale: spellout, which * spells out a value in words (123 is "one hundred twenty-three"); ordinal, which * appends an ordinal suffix to the end of a numeral (123 is "123rd"); and * duration, which shows a duration in seconds as hours, minutes, and seconds (123 is * "2:03"). The client can also define more specialized RuleBasedNumberFormats * by supplying programmer-defined rule sets.

* *

The behavior of a RuleBasedNumberFormat is specified by a textual description * that is either passed to the constructor as a String or loaded from a resource * bundle. In its simplest form, the description consists of a semicolon-delimited list of rules. * Each rule has a string of output text and a value or range of values it is applicable to. * In a typical spellout rule set, the first twenty rules are the words for the numbers from * 0 to 19:

* *

zero; one; two; three; four; five; six; seven; eight; nine;
 * ten; eleven; twelve; thirteen; fourteen; fifteen; sixteen; seventeen; eighteen; nineteen;

* *

For larger numbers, we can use the preceding set of rules to format the ones place, and * we only have to supply the words for the multiples of 10:

* *

 20: twenty[->>];
 * 30: thirty[->>];
 * 40: forty[->>];
 * 50: fifty[->>];
 * 60: sixty[->>];
 * 70: seventy[->>];
 * 80: eighty[->>];
 * 90: ninety[->>];

* *

In these rules, the base value is spelled out explicitly and set off from the * rule's output text with a colon. The rules are in a sorted list, and a rule is applicable * to all numbers from its own base value to one less than the next rule's base value. The * ">>" token is called a substitution and tells the formatter to * isolate the number's ones digit, format it using this same set of rules, and place the * result at the position of the ">>" token. Text in brackets is omitted if * the number being formatted is an even multiple of 10 (the hyphen is a literal hyphen; 24 * is "twenty-four," not "twenty four").

* *

For even larger numbers, we can actually look up several parts of the number in the * list:

* *

100: << hundred[ >>];

* *

The "<<" represents a new kind of substitution. The << isolates * the hundreds digit (and any digits to its left), formats it using this same rule set, and * places the result where the "<<" was. Notice also that the meaning of * >> has changed: it now refers to both the tens and the ones digits. The meaning of * both substitutions depends on the rule's base value. The base value determines the rule's divisor, * which is the highest power of 10 that is less than or equal to the base value (the user * can change this). To fill in the substitutions, the formatter divides the number being * formatted by the divisor. The integral quotient is used to fill in the << * substitution, and the remainder is used to fill in the >> substitution. The meaning * of the brackets changes similarly: text in brackets is omitted if the value being * formatted is an even multiple of the rule's divisor. The rules are applied recursively, so * if a substitution is filled in with text that includes another substitution, that * substitution is also filled in.

* *

This rule covers values up to 999, at which point we add another rule:

* *

1000: << thousand[ >>];

* *

Again, the meanings of the brackets and substitution tokens shift because the rule's * base value is a higher power of 10, changing the rule's divisor. This rule can actually be * used all the way up to 999,999. This allows us to finish out the rules as follows:

* *

 1,000,000: << million[ >>];
 * 1,000,000,000: << billion[ >>];
 * 1,000,000,000,000: << trillion[ >>];
 * 1,000,000,000,000,000: OUT OF RANGE!;

* *

Commas, periods, and spaces can be used in the base values to improve legibility and * are ignored by the rule parser. The last rule in the list is customarily treated as an * "overflow rule," applying to everything from its base value on up, and often (as * in this example) being used to print out an error message or default representation. * Notice also that the size of the major groupings in large numbers is controlled by the * spacing of the rules: because in English we group numbers by thousand, the higher rules * are separated from each other by a factor of 1,000.

* *

To see how these rules actually work in practice, consider the following example: * Formatting 25,340 with this rule set would work like this:

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

	<< thousand >>	[the rule whose base value is 1,000 is applicable to 25,340]
	twenty->> thousand >>	[25,340 over 1,000 is 25. The rule for 20 applies.]
	twenty-five thousand >>	[25 mod 10 is 5. The rule for 5 is "five."]
	twenty-five thousand << hundred >>	[25,340 mod 1,000 is 340. The rule for 100 applies.]
	twenty-five thousand three hundred >>	[340 over 100 is 3. The rule for 3 is "three."]
	twenty-five thousand three hundred forty	[340 mod 100 is 40. The rule for 40 applies. Since 40 divides * evenly by 10, the hyphen and substitution in the brackets are omitted.]

* *

The above syntax suffices only to format positive integers. To format negative numbers, * we add a special rule:

* *

-x: minus >>;

* *

This is called a negative-number rule, and is identified by "-x" * where the base value would be. This rule is used to format all negative numbers. The * >> token here means "find the number's absolute value, format it with these * rules, and put the result here."

* *

We also add a special rule called a fraction rule for numbers with fractional * parts:

* *

x.x: << point >>;

* *

This rule is used for all positive non-integers (negative non-integers pass through the * negative-number rule first and then through this rule). Here, the << token refers to * the number's integral part, and the >> to the number's fractional part. The * fractional part is formatted as a series of single-digit numbers (e.g., 123.456 would be * formatted as "one hundred twenty-three point four five six").

* *

To see how this rule syntax is applied to various languages, examine the resource data.

* *

There is actually much more flexibility built into the rule language than the * description above shows. A formatter may own multiple rule sets, which can be selected by * the caller, and which can use each other to fill in their substitutions. Substitutions can * also be filled in with digits, using a DecimalFormat object. There is syntax that can be * used to alter a rule's divisor in various ways. And there is provision for much more * flexible fraction handling. A complete description of the rule syntax follows:

* *

The description of a RuleBasedNumberFormat's behavior consists of one or more rule * sets. Each rule set consists of a name, a colon, and a list of rules. A rule * set name must begin with a % sign. Rule sets with names that begin with a single % sign * are public: the caller can specify that they be used to format and parse numbers. * Rule sets with names that begin with %% are private: they exist only for the use * of other rule sets. If a formatter only has one rule set, the name may be omitted.

* *

The user can also specify a special "rule set" named %%lenient-parse. * The body of %%lenient-parse isn't a set of number-formatting rules, but a RuleBasedCollator * description which is used to define equivalences for lenient parsing. For more information * on the syntax, see RuleBasedCollator. For more information on lenient parsing, * see setLenient().

* *

The body of a rule set consists of an ordered, semicolon-delimited list of rules. * Internally, every rule has a base value, a divisor, rule text, and zero, one, or two substitutions. * These parameters are controlled by the description syntax, which consists of a rule * descriptor, a colon, and a rule body.

* *

A rule descriptor can take one of the following forms (text in italics is the * name of a token):

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

	bv:	bv specifies the rule's base value. bv is a decimal * number expressed using ASCII digits. bv may contain spaces, periods, and commas, * which are ignored. The rule's divisor is the highest power of 10 less than or equal to * the base value.
	bv/rad:	bv specifies the rule's base value. The rule's divisor is the * highest power of rad less than or equal to the base value.
	bv>:	bv specifies the rule's base value. To calculate the divisor, * let the radix be 10, and the exponent be the highest exponent of the radix that yields a * result less than or equal to the base value. Every > character after the base value * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
	bv/rad>:	bv specifies the rule's base value. To calculate the divisor, * let the radix be rad, and the exponent be the highest exponent of the radix that * yields a result less than or equal to the base value. Every > character after the radix * decreases the exponent by 1. If the exponent is positive or 0, the divisor is the radix * raised to the power of the exponent; otherwise, the divisor is 1.
	-x:	The rule is a negative-number rule.
	x.x:	The rule is an improper fraction rule.
	0.x:	The rule is a proper fraction rule.
	x.0:	The rule is a master rule.
	nothing	If the rule's rule descriptor is left out, the base value is one plus the * preceding rule's base value (or zero if this is the first rule in the list) in a normal * rule set. In a fraction rule set, the base value is the same as the preceding rule's * base value.

* *

A rule set may be either a regular rule set or a fraction rule set, depending * on whether it is used to format a number's integral part (or the whole number) or a * number's fractional part. Using a rule set to format a rule's fractional part makes it a * fraction rule set.

* *

Which rule is used to format a number is defined according to one of the following * algorithms: If the rule set is a regular rule set, do the following: * *

If the rule set includes a master rule (and the number was passed in as a double), * use the master rule. (If the number being formatted was passed in as a long, * the master rule is ignored.)
If the number is negative, use the negative-number rule.
If the number has a fractional part and is greater than 1, use the improper fraction * rule.
If the number has a fractional part and is between 0 and 1, use the proper fraction * rule.
Binary-search the rule list for the rule with the highest base value less than or equal * to the number. If that rule has two substitutions, its base value is not an even multiple * of its divisor, and the number is an even multiple of the rule's divisor, use the * rule that precedes it in the rule list. Otherwise, use the rule itself.

* *

If the rule set is a fraction rule set, do the following: * *

Ignore negative-number and fraction rules.
For each rule in the list, multiply the number being formatted (which will always be * between 0 and 1) by the rule's base value. Keep track of the distance between the result * and the nearest integer.
Use the rule that produced the result closest to zero in the above calculation. In the * event of a tie or a direct hit, use the first matching rule encountered. (The idea here is * to try each rule's base value as a possible denominator of a fraction. Whichever * denominator produces the fraction closest in value to the number being formatted wins.) If * the rule following the matching rule has the same base value, use it if the numerator of * the fraction is anything other than 1; if the numerator is 1, use the original matching * rule. (This is to allow singular and plural forms of the rule text without a lot of extra * hassle.)

* *

A rule's body consists of a string of characters terminated by a semicolon. The rule * may include zero, one, or two substitution tokens, and a range of text in * brackets. The brackets denote optional text (and may also include one or both * substitutions). The exact meanings of the substitution tokens, and under what conditions * optional text is omitted, depend on the syntax of the substitution token and the context. * The rest of the text in a rule body is literal text that is output when the rule matches * the number being formatted.

* *

A substitution token begins and ends with a token character. The token * character and the context together specify a mathematical operation to be performed on the * number being formatted. An optional substitution descriptor specifies how the * value resulting from that operation is used to fill in the substitution. The position of * the substitution token in the rule body specifies the location of the resultant text in * the original rule text.

* *

The meanings of the substitution token characters are as follows:

* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *

>>	in normal rule	Divide the number by the rule's divisor and format the remainder
	in negative-number rule	Find the absolute value of the number and format the result
	in fraction or master rule	Isolate the number's fractional part and format it.
	in rule in fraction rule set	Not allowed.
>>>	in normal rule	Divide the number by the rule's divisor and format the remainder, * but bypass the normal rule-selection process and just use the * rule that precedes this one in this rule list.
	in all other rules	Not allowed.
<<	in normal rule	Divide the number by the rule's divisor and format the quotient
	in negative-number rule	Not allowed.
	in fraction or master rule	Isolate the number's integral part and format it.
	in rule in fraction rule set	Multiply the number by the rule's base value and format the result.
==	in all rule sets	Format the number unchanged
[]	in normal rule	Omit the optional text if the number is an even multiple of the rule's divisor
	in negative-number rule	Not allowed.
	in improper-fraction rule	Omit the optional text if the number is between 0 and 1 (same as specifying both an * x.x rule and a 0.x rule)
	in master rule	Omit the optional text if the number is an integer (same as specifying both an x.x * rule and an x.0 rule)
	in proper-fraction rule	Not allowed.
	in rule in fraction rule set	Omit the optional text if multiplying the number by the rule's base value yields 1.

* *

The substitution descriptor (i.e., the text between the token characters) may take one * of three forms:

* * * * * * * * * * * * * * * * * *

	a rule set name	Perform the mathematical operation on the number, and format the result using the * named rule set.
	a DecimalFormat pattern	Perform the mathematical operation on the number, and format the result using a * DecimalFormat with the specified pattern. The pattern must begin with 0 or #.
	nothing	Perform the mathematical operation on the number, and format the result using the rule * set containing the current rule, except: * You can't have an empty substitution descriptor with a == substitution. * If you omit the substitution descriptor in a >> substitution in a fraction rule, * format the result one digit at a time using the rule set containing the current rule. * If you omit the substitution descriptor in a << substitution in a rule in a * fraction rule set, format the result using the default rule set for this formatter. * *

* *

Whitespace is ignored between a rule set name and a rule set body, between a rule * descriptor and a rule body, or between rules. If a rule body begins with an apostrophe, * the apostrophe is ignored, but all text after it becomes significant (this is how you can * have a rule's rule text begin with whitespace). There is no escape function: the semicolon * is not allowed in rule set names or in rule text, and the colon is not allowed in rule set * names. The characters beginning a substitution token are always treated as the beginning * of a substitution token.

* *

See the resource data and the demo program for annotated examples of real rule sets * using these features.

*
* @author Richard Gillam
* @see NumberFormat
* @see DecimalFormat
* @draft
*/

/**
 * Tags for the predefined rulesets.
 *
 * URBNF_SPELLOUT, URBNF_ORDINAL and URBNF_DURATION are the three selector
 * values documented as legal for the tag-based RuleBasedNumberFormat
 * constructor.
 * NOTE(review): URBNF_COUNT is presumably the number of tags rather than a
 * selectable ruleset - confirm against the implementation.
 */
enum URBNFRuleSetTag {
    URBNF_SPELLOUT,
    URBNF_ORDINAL,
    URBNF_DURATION,
    URBNF_COUNT
};

/**
 * A NumberFormat subclass that formats and parses numbers according to a
 * textual rule description. The rule syntax is described in the file-level
 * documentation of this header.
 */
class U_I18N_API RuleBasedNumberFormat : public NumberFormat {
public:

    //-----------------------------------------------------------------------
    // constructors
    //-----------------------------------------------------------------------

    /**
     * Creates a RuleBasedNumberFormat that behaves according to the rules
     * passed in. The formatter uses the specified locale to determine the
     * characters to use when formatting numerals, and to define equivalences
     * for lenient parsing.
     * @param rules The formatter rules.
     * See the class documentation for a complete explanation of the rule
     * syntax.
     * @param locale A locale, that governs which characters are used for
     * formatting values in numerals, and which characters are equivalent in
     * lenient parsing.
     * @param perror The parse error if an error was encountered.
     * @param status The status indicating whether the constructor succeeded.
     * @draft
     */
    RuleBasedNumberFormat(const UnicodeString& rules, const Locale& locale,
                          UParseError& perror, UErrorCode& status);

    /**
     * Creates a RuleBasedNumberFormat from a predefined ruleset. The selector
     * code chooses among three possible predefined formats: spellout, ordinal,
     * and duration.
     * @param tag A selector code specifying which kind of formatter to create for that
     * locale. There are three legal values: URBNF_SPELLOUT, which creates a formatter that
     * spells out a value in words in the desired language, URBNF_ORDINAL, which attaches
     * an ordinal suffix from the desired language to the end of a number (e.g. "123rd"),
     * and URBNF_DURATION, which formats a duration in seconds as hours, minutes, and seconds.
     * @param locale The locale for the formatter.
     * @param status The status indicating whether the constructor succeeded.
     * @draft
     */
    RuleBasedNumberFormat(URBNFRuleSetTag tag, const Locale& locale, UErrorCode& status);

    //-----------------------------------------------------------------------
    // boilerplate
    //-----------------------------------------------------------------------

    /**
     * Copy constructor
     */
    RuleBasedNumberFormat(const RuleBasedNumberFormat& rhs);

    /**
     * Assignment operator
     */
    RuleBasedNumberFormat& operator=(const RuleBasedNumberFormat& rhs);

    /**
     * Release memory allocated for a RuleBasedNumberFormat when you are finished with it.
     */
    virtual ~RuleBasedNumberFormat();

    /**
     * Clone this object polymorphically. The caller is responsible
     * for deleting the result when done.
     */
    virtual Format* clone(void) const;

    /**
     * Return true if the given Format objects are semantically equal.
     * Objects of different subclasses are considered unequal.
     */
    virtual UBool operator==(const Format& other) const;

    //-----------------------------------------------------------------------
    // public API functions
    //-----------------------------------------------------------------------

    /**
     * Return the rules that were provided to the RuleBasedNumberFormat.
     * @return the rules String that was passed in
     * @draft
     */
    virtual UnicodeString getRules() const;

    /**
     * Return the name of the index'th public ruleSet. If index is not valid,
     * the function returns null.
     * NOTE(review): the return type is a UnicodeString value, so "null" cannot
     * be returned literally - confirm what the implementation yields for an
     * invalid index.
     * @param index the index of the ruleset
     * @return the name of the index'th public ruleSet.
     * @draft
     */
    virtual UnicodeString getRuleSetName(int32_t index) const;

    /**
     * Return the number of public rule set names.
     * @return the number of public rule set names.
     * @draft
     */
    virtual int32_t getNumberOfRuleSetNames() const;

    /**
     * Formats the specified number using the default ruleset.
     * @param number The number to format.
     * @param toAppendTo the string that will hold the (appended) result
     * @param pos the fieldposition
     * @return A textual representation of the number.
     * @draft
     */
    virtual UnicodeString& format(int32_t number,
                                  UnicodeString& toAppendTo,
                                  FieldPosition& pos) const;

    /**
     * Formats the specified number using the default ruleset.
     * @param number The number to format.
     * @param toAppendTo the string that will hold the (appended) result
     * @param pos the fieldposition
     * @return A textual representation of the number.
     * @draft
     */
    virtual UnicodeString& format(double number,
                                  UnicodeString& toAppendTo,
                                  FieldPosition& pos) const;

    /**
     * Formats the specified number using the named ruleset.
     * @param number The number to format.
     * @param ruleSetName The name of the rule set to format the number with.
     * This must be the name of a valid public rule set for this formatter.
     * @param toAppendTo the string that will hold the (appended) result
     * @param pos the fieldposition
     * @param status the status
     * @return A textual representation of the number.
     * @draft
     */
    virtual UnicodeString& format(int32_t number,
                                  const UnicodeString& ruleSetName,
                                  UnicodeString& toAppendTo,
                                  FieldPosition& pos,
                                  UErrorCode& status) const;

    /**
     * Formats the specified number using the named ruleset.
     * @param number The number to format.
     * @param ruleSetName The name of the rule set to format the number with.
     * This must be the name of a valid public rule set for this formatter.
     * @param toAppendTo the string that will hold the (appended) result
     * @param pos the fieldposition
     * @param status the status
     * @return A textual representation of the number.
     * @draft
     */
    virtual UnicodeString& format(double number,
                                  const UnicodeString& ruleSetName,
                                  UnicodeString& toAppendTo,
                                  FieldPosition& pos,
                                  UErrorCode& status) const;

    /**
     * Formats the specified number using the default ruleset.
     * @param obj The number to format.
     * @param toAppendTo the string that will hold the (appended) result
     * @param pos the fieldposition
     * @param status the status
     * @return A textual representation of the number.
     * @draft
     */
    virtual UnicodeString& format(const Formattable& obj,
                                  UnicodeString& toAppendTo,
                                  FieldPosition& pos,
                                  UErrorCode& status) const;

    /**
     * Redeclared Format method.
     * @stable
     */
    UnicodeString& format(const Formattable& obj,
                          UnicodeString& result,
                          UErrorCode& status) const;

    /**
     * Redeclared NumberFormat method.
     * @stable
     */
    UnicodeString& format(double number,
                          UnicodeString& output) const;

    /**
     * Redeclared NumberFormat method.
     * @stable
     */
    UnicodeString& format(int32_t number,
                          UnicodeString& output) const;

    /**
     * Parses the specified string, beginning at the specified position, according
     * to this formatter's rules. This will match the string against all of the
     * formatter's public rule sets and return the value corresponding to the longest
     * parseable substring. This function's behavior is affected by the lenient
     * parse mode.
     * @param text The string to parse
     * @param result the result of the parse, either a double or a long.
     * @param parsePosition On entry, contains the position of the first character
     * in "text" to examine. On exit, has been updated to contain the position
     * of the first character in "text" that wasn't consumed by the parse.
     * @see #setLenient
     * @draft
     */
    virtual void parse(const UnicodeString& text,
                       Formattable& result,
                       ParsePosition& parsePosition) const;

    /**
     * Redeclared Format method.
     * @stable
     */
    virtual inline void parse(const UnicodeString& text,
                              Formattable& result,
                              UErrorCode& status) const;

    /**
     * Turns lenient parse mode on and off.
     *
     * When in lenient parse mode, the formatter uses a Collator for parsing the text.
     * Only primary differences are treated as significant. This means that case
     * differences, accent differences, alternate spellings of the same letter
     * (e.g., ae and a-umlaut in German), ignorable characters, etc. are ignored in
     * matching the text. In many cases, numerals will be accepted in place of words
     * or phrases as well.
     *
     * For example, all of the following will correctly parse as 255 in English in
     * lenient-parse mode:
     * <br>"two hundred fifty-five"
     * <br>"two hundred fifty five"
     * <br>"TWO HUNDRED FIFTY-FIVE"
     * <br>"twohundredfiftyfive"
     * <br>"2 hundred fifty-5"
     *
     * The Collator used is determined by the locale that was
     * passed to this object on construction. The description passed to this object
     * on construction may supply additional collation rules that are appended to the
     * end of the default collator for the locale, enabling additional equivalences
     * (such as adding more ignorable characters or permitting spelled-out version of
     * symbols; see the demo program for examples).
     *
     * It's important to emphasize that even strict parsing is relatively lenient: it
     * will accept some text that it won't produce as output. In English, for example,
     * it will correctly parse "two hundred zero" and "fifteen hundred".
     *
     * @param enabled If true, turns lenient-parse mode on; if false, turns it off.
     * @see RuleBasedCollator
     * @draft
     */
    virtual void setLenient(UBool enabled);

    /**
     * Returns true if lenient-parse mode is turned on. Lenient parsing is off
     * by default.
     * @return true if lenient-parse mode is turned on.
     * @see #setLenient
     * @draft
     */
    virtual inline UBool isLenient(void) const;

private:
    // Internal helpers. Their behavior is defined in the implementation
    // file, which is not visible from this header.
    void init(const UnicodeString& rules, UParseError& perror, UErrorCode& status);
    void dispose();
    void stripWhitespace(UnicodeString& src);
    void setDefaultRuleSet();
    void format(double number, NFRuleSet& ruleSet);
    NFRuleSet* findRuleSet(const UnicodeString& name, UErrorCode& status) const;

    /* friend access */
    friend class NFSubstitution;
    friend class NFRule;
    friend class FractionalPartSubstitution;

    // Private accessors, reachable by the friend classes declared above.
    inline NFRuleSet * getDefaultRuleSet() const;
    Collator * getCollator() const;
    DecimalFormatSymbols * getDecimalFormatSymbols() const;

private:
    // Its address serves as this class's ID (see getStaticClassID below).
    static const char fgClassID;

public:
    /** Returns the class ID, which is the address of fgClassID. */
    static UClassID getStaticClassID(void) { return (UClassID)&fgClassID; }

    /** Returns the same ID as getStaticClassID(). */
    virtual UClassID getDynamicClassID(void) const { return getStaticClassID(); }

private:
    // NOTE(review): ownership and lifetime of the pointer members below are
    // not visible from this header - confirm in the implementation.
    NFRuleSet **ruleSets;
    NFRuleSet *defaultRuleSet;       // returned by getDefaultRuleSet()
    Locale locale;
    Collator* collator;
    DecimalFormatSymbols* decimalFormatSymbols;
    UBool lenient;                   // returned by isLenient()
    UnicodeString* lenientParseRules;
};

//
--------------- inline UnicodeString& RuleBasedNumberFormat::format(const Formattable& obj, UnicodeString& result, UErrorCode& status) const { // Don't use Format:: - use immediate base class only, // in case immediate base modifies behavior later. // dlf - the above comment is bogus, if there were a reason to modify // it, it would be virtual, and there's no reason because it is // a one-line macro in NumberFormat anyway, just like this one. return NumberFormat::format(obj, result, status); } inline UnicodeString& RuleBasedNumberFormat::format(double number, UnicodeString& output) const { FieldPosition pos(0); return format(number, output, pos); } inline UnicodeString& RuleBasedNumberFormat::format(int32_t number, UnicodeString& output) const { FieldPosition pos(0); return format(number, output, pos); } inline void RuleBasedNumberFormat::parse(const UnicodeString& text, Formattable& result, UErrorCode& status) const { NumberFormat::parse(text, result, status); } inline UBool RuleBasedNumberFormat::isLenient(void) const { return lenient; } inline NFRuleSet* RuleBasedNumberFormat::getDefaultRuleSet() const { return defaultRuleSet; } U_NAMESPACE_END /* RBNF_H */ #endif