ICU-4935 add toTitleCase(with options), and options TITLECASE_NO_LOWERCASE and TITLECASE_NO_BREAK_ADJUSTMENT

X-SVN-Rev: 22220
2007-07-31 20:56:05 +00:00 · 2007-07-31 20:56:05 +00:00 · f432dcfdda
commit f432dcfdda
parent 59c60af09a
2 changed files with 185 additions and 17 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
 import com.ibm.icu.util.ULocale;
 import com.ibm.icu.impl.Utility;
 import java.util.Locale;
@ -296,16 +297,36 @@ public final class UCharacterCaseTest extends TestFmwk
                String expected = TITLE_DATA_[i++];
                ULocale locale = new ULocale(TITLE_DATA_[i++]);
                int breakType = Integer.parseInt(TITLE_DATA_[i++]);
+                String optionsString = TITLE_DATA_[i++];
                BreakIterator iter =
                    breakType >= 0 ?
                        BreakIterator.getBreakInstance(locale, breakType) :
-                        null;
-                String result = UCharacter.toTitleCase(locale, test, iter);
+                        breakType == -2 ?
+                            // Open a trivial break iterator that only delivers { 0, length }
+                            // or even just { 0 } as boundaries.
+                            new RuleBasedBreakIterator(".*;") :
+                            null;
+                int options = 0;
+                if (optionsString.indexOf('L') >= 0) {
+                    options |= UCharacter.TITLECASE_NO_LOWERCASE;
+                }
+                if (optionsString.indexOf('A') >= 0) {
+                    options |= UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT;
+                }
+                String result = UCharacter.toTitleCase(locale, test, iter, options);
                if (!expected.equals(result)) {
-                    errln("titlecasing for " + prettify(test) + " should be " +
+                    errln("titlecasing for " + prettify(test) + " (options " + options + ") should be " +
                          prettify(expected) + " but got " +
                          prettify(result));
-                }                
+                }
+                if (options == 0) {
+                    result = UCharacter.toTitleCase(locale, test, iter);
+                    if (!expected.equals(result)) {
+                        errln("titlecasing for " + prettify(test) + " should be " +
+                              prettify(expected) + " but got " +
+                              prettify(result));
+                    }
+                }
            }
         }catch(Exception ex){
            warnln("Could not find data for BreakIterators");
@ -705,9 +726,10 @@ public final class UCharacterCaseTest extends TestFmwk
                      "\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";

    /**
-     * each item is an array with input string, result string, locale ID, break iterator
+     * each item is an array with input string, result string, locale ID, break iterator, options
     * the break iterator is specified as an int, same as in BreakIterator.KIND_*:
-     * 0=KIND_CHARACTER  1=KIND_WORD  2=KIND_LINE  3=KIND_SENTENCE  4=KIND_TITLE  -1=default
+     * 0=KIND_CHARACTER  1=KIND_WORD  2=KIND_LINE  3=KIND_SENTENCE  4=KIND_TITLE  -1=default (null iterator=words)  -2=no breaks (.*)
+     * options: L=UCharacter.TITLECASE_NO_LOWERCASE  A=UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT  (T=U_FOLD_CASE_EXCLUDE_SPECIAL_I is not a titlecasing option)
     * see ICU4C source/test/testdata/casing.txt
     */
    private static final String TITLE_DATA_[] = {
@ -715,32 +737,82 @@ public final class UCharacterCaseTest extends TestFmwk
        "\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
        "",
        "0",
+        "",

        "\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
        "\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
        "",
        "1",
+        "",

        "\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
        "",
        "-1",
+        "",

        " tHe QUIcK bRoWn", " The Quick Brown",
        "",
        "4",
+        "",

        "\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc", 
        "\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
        "",
        "0",
+        "",

        "\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
        "",
        "-1",
+        "",

        "'oH dOn'T tItLeCaSe AfTeR lEtTeR+'",  "'Oh Don't Titlecase After Letter+'",
        "",
-        "-1"
+        "-1",
+        "",
+
+        "a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
+        "A \u02bbCat. A \u02bbDog! \u02bbEtc.",
+        "",
+        "-1",
+        "", // default
+
+        "a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
+        "A \u02bbcat. A \u02bbdog! \u02bbetc.",
+        "",
+        "-1",
+        "A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
+
+        "a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
+        "A \u02bbCaT. A \u02bbdOg! \u02bbETc.",
+        "",
+        "3",
+        "L", // UBRK_SENTENCE and U_TITLECASE_NO_LOWERCASE
+
+
+        "\u02bbcAt! \u02bbeTc.",
+        "\u02bbCat! \u02bbetc.",
+        "",
+        "-2",
+        "", // -2=Trivial break iterator
+
+        "\u02bbcAt! \u02bbeTc.",
+        "\u02bbcat! \u02bbetc.",
+        "",
+        "-2",
+        "A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
+
+        "\u02bbcAt! \u02bbeTc.",
+        "\u02bbCAt! \u02bbeTc.",
+        "",
+        "-2",
+        "L", // U_TITLECASE_NO_LOWERCASE
+
+        "\u02bbcAt! \u02bbeTc.",
+        "\u02bbcAt! \u02bbeTc.",
+        "",
+        "-2",
+        "AL" // Both options
    };


--- a/icu4j/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java
@ -2731,6 +2731,43 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
     */
    public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;

+    /**
+     * Do not lowercase non-initial parts of words when titlecasing.
+     * Option bit for titlecasing APIs that take an options bit set.
+     *
+     * By default, titlecasing will titlecase the first cased character
+     * of a word and lowercase all other characters.
+     * With this option, the other characters will not be modified.
+     *
+     * @see #toTitleCase
+     * @draft ICU 3.8
+     * @provisional This API might change or be removed in a future release.
+     */
+    public static final int TITLECASE_NO_LOWERCASE = 0x100;
+
+    /**
+     * Do not adjust the titlecasing indexes from BreakIterator.next() indexes;
+     * titlecase exactly the characters at breaks from the iterator.
+     * Option bit for titlecasing APIs that take an options bit set.
+     *
+     * By default, titlecasing will take each break iterator index,
+     * adjust it by looking for the next cased character, and titlecase that one.
+     * Other characters are lowercased.
+     *
+     * This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
+     *
+     * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+     * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+     * cased character F. If F exists, map F to default_title(F); then map each
+     * subsequent character C to default_lower(C).
+     *
+     * @see #toTitleCase
+     * @see #TITLECASE_NO_LOWERCASE
+     * @draft ICU 3.8
+     * @provisional This API might change or be removed in a future release.
+     */
+    public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
+
    // public methods ----------------------------------------------------
      
    /**
@ -4062,6 +4099,13 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
            }
        }

+        /**
+         * Move to the iteration limit without fetching code points up to there.
+         */
+        public void moveToLimit() {
+            cpStart=cpLimit=limit;
+        }
+
        /**
         * Iterate forward through the string to fetch the next code point
         * to be case-mapped, and set the context indexes for it.
@ -4105,6 +4149,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
            return cpStart;
        }

+        /**
+         * Get the limit of the code point that was last returned
+         * by nextCaseMapCP().
+         */
+        public int getCPLimit() {
+            return cpLimit;
+        }
+
        // implement UCaseProps.ContextIterator
        public void reset(int dir) {
            if(dir>0) {
@ -4170,7 +4222,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
    /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break 
-     * iterator, hence the user can customized his break iterator for 
+     * iterator, hence the user can customize his break iterator for 
     * a specialized titlecasing. In this case only the forward iteration 
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
@ -4299,7 +4351,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
    /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break 
-     * iterator, hence the user can customized his break iterator for 
+     * iterator, hence the user can customize his break iterator for 
     * a specialized titlecasing. In this case only the forward iteration 
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
@ -4324,7 +4376,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
    /**
     * <p>Gets the titlecase version of the argument string.</p>
     * <p>Position for titlecasing is determined by the argument break 
-     * iterator, hence the user can customized his break iterator for 
+     * iterator, hence the user can customize his break iterator for 
     * a specialized titlecasing. In this case only the forward iteration 
     * needs to be implemented.
     * If the break iterator passed in is null, the default Unicode algorithm
@ -4342,6 +4394,35 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
     */
    public static String toTitleCase(ULocale locale, String str, 
                                     BreakIterator titleIter) {
+        return toTitleCase(locale, str, titleIter, 0);
+    }
+
+    /**
+     * <p>Gets the titlecase version of the argument string.</p>
+     * <p>Position for titlecasing is determined by the argument break 
+     * iterator, hence the user can customize his break iterator for 
+     * a specialized titlecasing. In this case only the forward iteration 
+     * needs to be implemented.
+     * If the break iterator passed in is null, the default Unicode algorithm
+     * will be used to determine the titlecase positions.
+     * </p>
+     * <p>By default, the first cased character at or after each break position is title cased
+     * and the characters in between are lowercased; the options can modify both behaviors.</p>
+     * <p>Casing is dependent on the argument locale and context-sensitive</p>
+     * @param locale the locale whose casing rules are used for the conversion
+     * @param str source string to be titlecased
+     * @param titleIter break iterator to determine the positions in which
+     *        the character should be title cased.
+     * @param options bit set to modify the titlecasing operation
+     * @return titlecase version of the argument string
+     * @draft ICU 3.8
+     * @provisional This API might change or be removed in a future release.
+     * @see #TITLECASE_NO_LOWERCASE
+     * @see #TITLECASE_NO_BREAK_ADJUSTMENT
+     */
+    public static String toTitleCase(ULocale locale, String str, 
+                                     BreakIterator titleIter,
+                                     int options) {
        StringContextIterator iter = new StringContextIterator(str);
        StringBuffer result = new StringBuffer(str.length());
        int[] locCache = new int[1];
@ -4393,11 +4474,16 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
            if(prev<index) {
                /* find and copy uncased characters [prev..titleStart[ */
                iter.setLimit(index);
-                while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
-                titleStart=iter.getCPStart();
-                if(prev<titleStart) {
-                    // TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
-                    result.append(str.substring(prev, titleStart));
+                c=iter.nextCaseMapCP();
+                if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCaseProps.NONE==gCsp.getType(c)) {
+                    while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
+                    titleStart=iter.getCPStart();
+                    if(prev<titleStart) {
+                        // TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
+                        result.append(str.substring(prev, titleStart));
+                    }
+                } else {
+                    titleStart=prev;
                }

                if(titleStart<index) {
@ -4424,8 +4510,18 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
                                UTF16.append(result, c);
                            }
                        }
-                        
-                        if((c=iter.nextCaseMapCP())>=0) {
+
+                        if((options&TITLECASE_NO_LOWERCASE)!=0) {
+                            /* Optionally just copy the rest of the word unchanged. NOTE(review): when titleLimit==index no branch breaks here; confirm the loop still terminates. */
+                            int titleLimit=iter.getCPLimit();
+                            if(titleLimit<index) {
+                                // TODO: With Java 5, this would want to be result.append(str, titleLimit, index);
+                                result.append(str.substring(titleLimit, index));
+                                iter.moveToLimit();
+                                break;
+                            }
+                        } else if((c=iter.nextCaseMapCP())>=0) {
+                            /* Normal operation: Lowercase the rest of the word. */
                            c=gCsp.toFullLower(c, iter, result, locale, locCache);
                        } else {
                            break;