ICU-4933 titlecase first _cased_ letter

X-SVN-Rev: 19962
2006-08-02 21:56:34 +00:00 · 2006-08-02 21:56:34 +00:00 · 885b57fdb6
commit 885b57fdb6
parent 86ff19728b
2 changed files with 132 additions and 86 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java
@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
 import com.ibm.icu.lang.UCharacter;
 import com.ibm.icu.text.UTF16;
 import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.util.ULocale;
 import com.ibm.icu.impl.Utility;
 import java.util.Locale;
 import java.io.BufferedReader;
@ -287,39 +288,40 @@ public final class UCharacterCaseTest extends TestFmwk
        }
    }

+    // BreakIterator.getBreakInstance(ULocale where, int kind) is private, so it is
+    // re-implemented here: maps a BreakIterator.KIND_* value to the iterator for 'where'.
+    private final BreakIterator getBreakInstance(ULocale where, int kind) {
+        switch(kind) {
+        case BreakIterator.KIND_CHARACTER:
+            return BreakIterator.getCharacterInstance(where);
+        case BreakIterator.KIND_WORD:
+            return BreakIterator.getWordInstance(where);
+        case BreakIterator.KIND_LINE:
+            return BreakIterator.getLineInstance(where);
+        case BreakIterator.KIND_SENTENCE:
+            return BreakIterator.getSentenceInstance(where);
+        case BreakIterator.KIND_TITLE:
+            return BreakIterator.getTitleInstance(where);
+        default:
+            return null; // unrecognized kind (e.g. -1 in TITLE_DATA_): caller receives null
+        }
+    }
+
    public void TestTitle()
    {
         try{ 
-              BreakIterator TITLE_BREAKITERATORS_[] =
-              {
-                BreakIterator.getCharacterInstance(),
-                BreakIterator.getWordInstance(),
-                BreakIterator.getTitleInstance(),
-                BreakIterator.getCharacterInstance(),
-                null,
-                null
-              };
-            for (int i = 0; i < TITLE_BREAKITERATORS_.length; i ++) {
-                String test = TITLE_DATA_[i << 1];
-                String expected = TITLE_DATA_[(i << 1) + 1];
-                if (!expected.equals(
-                    UCharacter.toTitleCase(test,
-                                           TITLE_BREAKITERATORS_[i]))) {
-                    errln("error: titlecasing for " + prettify(test) + " should be " +
+            for (int i = 0; i < TITLE_DATA_.length;) {
+                String test = TITLE_DATA_[i++];
+                String expected = TITLE_DATA_[i++];
+                String locale = TITLE_DATA_[i++];
+                String breakType = TITLE_DATA_[i++];
+                ULocale loc = new ULocale(locale);
+                BreakIterator iter = getBreakInstance(loc, Integer.parseInt(breakType));
+                String result = UCharacter.toTitleCase(loc, test, iter);
+                if (!expected.equals(result)) {
+                    errln("titlecasing for " + prettify(test) + " should be " +
                          prettify(expected) + " but got " +
-                          prettify(UCharacter.toTitleCase(test,
-                                                     TITLE_BREAKITERATORS_[i])));
-                }
-                //cover toTitleCase(Locale, String, BreakIterator)
-                Locale def = Locale.getDefault();
-                String data = TITLE_DATA_[i << 1];
-                if (!expected.equals(
-                    UCharacter.toTitleCase(def, data,
-                                           TITLE_BREAKITERATORS_[i]))) {
-                    errln("error: titlecasing for " + prettify(data) + " should be " +
-                          prettify(expected) + " but got " +
-                          prettify(UCharacter.toTitleCase(def, data,
-                                                     TITLE_BREAKITERATORS_[i])));
+                          prettify(result));
                }                
            }
         }catch(Exception ex){
@ -720,24 +722,42 @@ public final class UCharacterCaseTest extends TestFmwk
                      "\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";

    /**
-     * each item is an array with input string, result string, locale
+     * each test case is four consecutive strings: input string, expected result, locale ID, break iterator kind
+     * the break iterator is specified as an int, same as in BreakIterator.KIND_*:
+     * 0=KIND_CHARACTER  1=KIND_WORD  2=KIND_LINE  3=KIND_SENTENCE  4=KIND_TITLE  -1=default
+     * see ICU4C source/test/testdata/casing.txt
     */
    private static final String TITLE_DATA_[] = {
        "\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
        "\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
-        
+        "",
+        "0",
+
        "\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
        "\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
-        
+        "",
+        "1",
+
+        "\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
+        "",
+        "-1",
+
        " tHe QUIcK bRoWn", " The Quick Brown",
-        
+        "",
+        "4",
+
        "\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc", 
        "\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
-        
-        "\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
-        
-        "'oH dOn'T tItLeCaSe AfTeR lEtTeR+'",  "'Oh Don't Titlecase After Letter+'"
+        "",
+        "0",

+        "\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
+        "",
+        "-1",
+
+        "'oH dOn'T tItLeCaSe AfTeR lEtTeR+'",  "'Oh Don't Titlecase After Letter+'",
+        "",
+        "-1"
    };


--- a/icu4j/src/com/ibm/icu/lang/UCharacter.java
+++ b/icu4j/src/com/ibm/icu/lang/UCharacter.java
@ -4140,6 +4140,9 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
         * If the limit parameter is negative or past the string, then the
         * string length is restored as the iteration limit.
         *
+         * This limit does not affect the next() function which always
+         * iterates to the very end of the string.
+         *
         * @param lim The iteration limit.
         */
        public void setLimit(int lim) {
@ -4156,13 +4159,17 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
         * Performance optimization, to save on function calls and redundant
         * tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
         *
+         * When the iteration limit is reached (and -1 is returned),
+         * getCPStart() will be at the iteration limit.
+         *
+         * Iteration with next() does not affect the position for nextCaseMapCP().
+         *
         * @return The next code point to be case-mapped, or <0 when the iteration is done.
         */
        public int nextCaseMapCP() {
-            int c;
+            cpStart=cpLimit;
            if(cpLimit<limit) {
-                cpStart=cpLimit;
-                c=s.charAt(cpLimit++);
+                int c=s.charAt(cpLimit++);
                if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                    char c2;
                    if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
@ -4181,6 +4188,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
            }
        }

+        /**
+         * Get the start of the code point that was last returned by nextCaseMapCP().
+         * @return the value of cpStart, which nextCaseMapCP() sets on every call.
+         */
+        public int getCPStart() {
+            return cpStart;
+        }
+
        // implement UCaseProps.ContextIterator
        public void reset(int dir) {
            if(dir>0) {
@ -4201,7 +4216,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
        public int next() {
            int c;

-            if(dir>0 && index<limit) {
+            if(dir>0 && index<s.length()) {
                c=UTF16.charAt(s, index);
                index+=UTF16.getCharCount(c);
                return c;
@ -4436,14 +4451,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
        }
        titleIter.setText(str);

-        int index;
+        int prev, titleStart, index;
        boolean isFirstIndex;

        /* set up local variables */
+        prev=0;
        isFirstIndex=true;

        /* titlecasing loop */
-        for(;;) {
+        while(prev<srcLength) {
            /* find next index where to titlecase */
            if(isFirstIndex) {
                isFirstIndex=false;
@ -4455,53 +4471,63 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
                index=srcLength;
            }

-            /* lowercase up to index */
-            iter.setLimit(index);
-            while((c=iter.nextCaseMapCP())>=0) {
-                c=gCsp.toFullLower(c, iter, result, locale, locCache);
-
-                /* decode the result */
-                if(c<0) {
-                    /* (not) original code point */
-                    c=~c;
-                } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
-                    /* mapping already appended to result */
-                    continue;
-                /* } else { append single-code point mapping */
+            /*
+             * Unicode 4 & 5 section 3.13 Default Case Operations:
+             *
+             * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
+             * #29, "Text Boundaries." Between each pair of word boundaries, find the first
+             * cased character F. If F exists, map F to default_title(F); then map each
+             * subsequent character C to default_lower(C).
+             *
+             * In this implementation, segment [prev..index[ into 3 parts:
+             * a) uncased characters (copy as-is) [prev..titleStart[
+             * b) first cased letter (titlecase)        [titleStart..titleLimit[
+             * c) subsequent characters (lowercase)                 [titleLimit..index[
+             */
+            if(prev<index) {
+                /* find and copy uncased characters [prev..titleStart[ */
+                iter.setLimit(index);
+                while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
+                titleStart=iter.getCPStart();
+                if(prev<titleStart) {
+                    result.append(str, prev, titleStart);
                }
-                if(c<=0xffff) {
-                    result.append((char)c);
-                } else {
-                    UTF16.append(result, c);
+
+                if(titleStart<index) {
+                    /* titlecase c which is from titleStart */
+                    c=gCsp.toFullTitle(c, iter, result, locale, locCache);
+
+                    /* decode the result and lowercase up to index */
+                    for(;;) {
+                        if(c<0) {
+                            /* (not) original code point */
+                            c=~c;
+                            if(c<=0xffff) {
+                                result.append((char)c);
+                            } else {
+                                UTF16.append(result, c);
+                            }
+                        } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
+                            /* mapping already appended to result */
+                        } else {
+                            /* append single-code point mapping */
+                            if(c<=0xffff) {
+                                result.append((char)c);
+                            } else {
+                                UTF16.append(result, c);
+                            }
+                        }
+                        
+                        if((c=iter.nextCaseMapCP())>=0) {
+                            c=gCsp.toFullLower(c, iter, result, locale, locCache);
+                        } else {
+                            break;
+                        }
+                    }
                }
            }

-            if(index>=srcLength) {
-                break;
-            }
-
-            /* titlecase the character at the found index */
-            iter.setLimit(srcLength);
-            c=iter.nextCaseMapCP();
-            if(c<0) {
-                break; // reached end of str
-            }
-            c=gCsp.toFullTitle(c, iter, result, locale, locCache);
-
-            /* decode the result */
-            if(c<0) {
-                /* (not) original code point */
-                c=~c;
-            } else if(c<=UCaseProps.MAX_STRING_LENGTH) {
-                /* mapping already appended to result */
-                continue;
-            /* } else { append single-code point mapping */
-            }
-            if(c<=0xffff) {
-                result.append((char)c);
-            } else {
-                UTF16.append(result, c);
-            }
+            prev=index;
        }
        return result.toString();
    }