ICU-4935 add toTitleCase(with options), and options TITLECASE_NO_LOWERCASE and TITLECASE_NO_BREAK_ADJUSTMENT

X-SVN-Rev: 22220
This commit is contained in:
Markus Scherer 2007-07-31 20:56:05 +00:00
parent 59c60af09a
commit f432dcfdda
2 changed files with 185 additions and 17 deletions

View File

@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.impl.Utility;
import java.util.Locale;
@ -296,16 +297,36 @@ public final class UCharacterCaseTest extends TestFmwk
String expected = TITLE_DATA_[i++];
ULocale locale = new ULocale(TITLE_DATA_[i++]);
int breakType = Integer.parseInt(TITLE_DATA_[i++]);
String optionsString = TITLE_DATA_[i++];
BreakIterator iter =
breakType >= 0 ?
BreakIterator.getBreakInstance(locale, breakType) :
null;
String result = UCharacter.toTitleCase(locale, test, iter);
breakType == -2 ?
// Open a trivial break iterator that only delivers { 0, length }
// or even just { 0 } as boundaries.
new RuleBasedBreakIterator(".*;") :
null;
int options = 0;
if (optionsString.indexOf('L') >= 0) {
options |= UCharacter.TITLECASE_NO_LOWERCASE;
}
if (optionsString.indexOf('A') >= 0) {
options |= UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT;
}
String result = UCharacter.toTitleCase(locale, test, iter, options);
if (!expected.equals(result)) {
errln("titlecasing for " + prettify(test) + " should be " +
errln("titlecasing for " + prettify(test) + " (options " + options + ") should be " +
prettify(expected) + " but got " +
prettify(result));
}
}
if (options == 0) {
result = UCharacter.toTitleCase(locale, test, iter);
if (!expected.equals(result)) {
errln("titlecasing for " + prettify(test) + " should be " +
prettify(expected) + " but got " +
prettify(result));
}
}
}
}catch(Exception ex){
warnln("Could not find data for BreakIterators");
@ -705,9 +726,10 @@ public final class UCharacterCaseTest extends TestFmwk
"\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";
/**
* each item is an array with input string, result string, locale ID, break iterator
* each item is an array with input string, result string, locale ID, break iterator, options
* the break iterator is specified as an int, same as in BreakIterator.KIND_*:
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default (NULL=words) -2=no breaks (.*)
* options: T=U_FOLD_CASE_EXCLUDE_SPECIAL_I L=U_TITLECASE_NO_LOWERCASE A=U_TITLECASE_NO_BREAK_ADJUSTMENT
* see ICU4C source/test/testdata/casing.txt
*/
private static final String TITLE_DATA_[] = {
@ -715,32 +737,82 @@ public final class UCharacterCaseTest extends TestFmwk
"\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
"",
"0",
"",
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
"\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
"",
"1",
"",
"\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
"",
"-1",
"",
" tHe QUIcK bRoWn", " The Quick Brown",
"",
"4",
"",
"\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc",
"\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
"",
"0",
"",
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
"",
"-1",
"",
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'",
"",
"-1"
"-1",
"",
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
"A \u02bbCat. A \u02bbDog! \u02bbEtc.",
"",
"-1",
"", // default
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
"A \u02bbcat. A \u02bbdog! \u02bbetc.",
"",
"-1",
"A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
"A \u02bbCaT. A \u02bbdOg! \u02bbETc.",
"",
"3",
"L", // UBRK_SENTENCE and U_TITLECASE_NO_LOWERCASE
"\u02bbcAt! \u02bbeTc.",
"\u02bbCat! \u02bbetc.",
"",
"-2",
"", // -2=Trivial break iterator
"\u02bbcAt! \u02bbeTc.",
"\u02bbcat! \u02bbetc.",
"",
"-2",
"A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
"\u02bbcAt! \u02bbeTc.",
"\u02bbCAt! \u02bbeTc.",
"",
"-2",
"L", // U_TITLECASE_NO_LOWERCASE
"\u02bbcAt! \u02bbeTc.",
"\u02bbcAt! \u02bbeTc.",
"",
"-2",
"AL" // Both options
};

View File

@ -2731,6 +2731,43 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
/**
* Do not lowercase non-initial parts of words when titlecasing.
* Option bit for titlecasing APIs that take an options bit set.
*
* <p>By default, titlecasing will titlecase the first cased character
* of a word and lowercase all other characters.
* With this option, the other characters will not be modified.</p>
*
* @see #toTitleCase(ULocale, String, BreakIterator, int)
* @draft ICU 3.8
* @provisional This API might change or be removed in a future release.
*/
public static final int TITLECASE_NO_LOWERCASE = 0x100;
/**
* Do not adjust the titlecasing indexes from BreakIterator.next() indexes;
* titlecase exactly the characters at breaks from the iterator.
* Option bit for titlecasing APIs that take an options bit set.
*
* <p>By default, titlecasing will take each break iterator index,
* adjust it by looking for the next cased character, and titlecase that one.
* Other characters are lowercased.</p>
*
* <p>This default follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:</p>
*
* <p>R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).</p>
*
* @see #toTitleCase(ULocale, String, BreakIterator, int)
* @see #TITLECASE_NO_LOWERCASE
* @draft ICU 3.8
* @provisional This API might change or be removed in a future release.
*/
public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
// public methods ----------------------------------------------------
/**
@ -4062,6 +4099,13 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
}
}
/**
* Jump straight to the iteration limit, skipping (not fetching)
* every code point before it. Afterwards both code point
* boundaries, cpStart and cpLimit, are equal to the limit.
*/
public void moveToLimit() {
    cpLimit=limit;
    cpStart=limit;
}
/**
* Iterate forward through the string to fetch the next code point
* to be case-mapped, and set the context indexes for it.
@ -4105,6 +4149,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
return cpStart;
}
/**
* Get the limit of the code point that was last returned
* by nextCaseMapCP().
* @return the current cpLimit index; presumably the index just past
*         that code point (callers in this file use it as the start
*         of the text that follows it)
*/
public int getCPLimit() {
return cpLimit;
}
// implement UCaseProps.ContextIterator
public void reset(int dir) {
if(dir>0) {
@ -4170,7 +4222,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customized his break iterator for
* iterator, hence the user can customize his break iterator for
* a specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
@ -4299,7 +4351,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customized his break iterator for
* iterator, hence the user can customize his break iterator for
* a specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
@ -4324,7 +4376,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customized his break iterator for
* iterator, hence the user can customize his break iterator for
* a specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
@ -4342,6 +4394,35 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
*/
public static String toTitleCase(ULocale locale, String str,
BreakIterator titleIter) {
// Delegate to the options-taking overload with no option bits set.
return toTitleCase(locale, str, titleIter, 0);
}
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customize his break iterator for
* a specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
* will be used to determine the titlecase positions.
* </p>
* <p>Only positions returned by the break iterator will be title cased;
* characters in between the positions will all be in lower case
* (or left unchanged if TITLECASE_NO_LOWERCASE is set).</p>
* <p>Casing is dependent on the argument locale and context-sensitive</p>
* @param locale locale in which the string is to be converted
* @param str source string to be titlecased
* @param titleIter break iterator to determine the positions in which
* the character should be title cased.
* @param options bit set to modify the titlecasing operation
* @return titlecase version of the argument string
* @draft ICU 3.8
* @provisional This API might change or be removed in a future release.
* @see #TITLECASE_NO_LOWERCASE
* @see #TITLECASE_NO_BREAK_ADJUSTMENT
*/
public static String toTitleCase(ULocale locale, String str,
BreakIterator titleIter,
int options) {
StringContextIterator iter = new StringContextIterator(str);
StringBuffer result = new StringBuffer(str.length());
int[] locCache = new int[1];
@ -4393,11 +4474,16 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
iter.setLimit(index);
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
titleStart=iter.getCPStart();
if(prev<titleStart) {
// TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
result.append(str.substring(prev, titleStart));
c=iter.nextCaseMapCP();
if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCaseProps.NONE==gCsp.getType(c)) {
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
titleStart=iter.getCPStart();
if(prev<titleStart) {
// TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
result.append(str.substring(prev, titleStart));
}
} else {
titleStart=prev;
}
if(titleStart<index) {
@ -4424,8 +4510,18 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
UTF16.append(result, c);
}
}
if((c=iter.nextCaseMapCP())>=0) {
if((options&TITLECASE_NO_LOWERCASE)!=0) {
/* Optionally just copy the rest of the word unchanged. */
int titleLimit=iter.getCPLimit();
if(titleLimit<index) {
// TODO: With Java 5, this would want to be result.append(str, titleLimit, index);
result.append(str.substring(titleLimit, index));
iter.moveToLimit();
break;
}
} else if((c=iter.nextCaseMapCP())>=0) {
/* Normal operation: Lowercase the rest of the word. */
c=gCsp.toFullLower(c, iter, result, locale, locCache);
} else {
break;