ICU-4935 add toTitleCase(with options), and options TITLECASE_NO_LOWERCASE and TITLECASE_NO_BREAK_ADJUSTMENT
X-SVN-Rev: 22220
This commit is contained in:
parent
59c60af09a
commit
f432dcfdda
@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.text.RuleBasedBreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import java.util.Locale;
|
||||
@ -296,16 +297,36 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
String expected = TITLE_DATA_[i++];
|
||||
ULocale locale = new ULocale(TITLE_DATA_[i++]);
|
||||
int breakType = Integer.parseInt(TITLE_DATA_[i++]);
|
||||
String optionsString = TITLE_DATA_[i++];
|
||||
BreakIterator iter =
|
||||
breakType >= 0 ?
|
||||
BreakIterator.getBreakInstance(locale, breakType) :
|
||||
null;
|
||||
String result = UCharacter.toTitleCase(locale, test, iter);
|
||||
breakType == -2 ?
|
||||
// Open a trivial break iterator that only delivers { 0, length }
|
||||
// or even just { 0 } as boundaries.
|
||||
new RuleBasedBreakIterator(".*;") :
|
||||
null;
|
||||
int options = 0;
|
||||
if (optionsString.indexOf('L') >= 0) {
|
||||
options |= UCharacter.TITLECASE_NO_LOWERCASE;
|
||||
}
|
||||
if (optionsString.indexOf('A') >= 0) {
|
||||
options |= UCharacter.TITLECASE_NO_BREAK_ADJUSTMENT;
|
||||
}
|
||||
String result = UCharacter.toTitleCase(locale, test, iter, options);
|
||||
if (!expected.equals(result)) {
|
||||
errln("titlecasing for " + prettify(test) + " should be " +
|
||||
errln("titlecasing for " + prettify(test) + " (options " + options + ") should be " +
|
||||
prettify(expected) + " but got " +
|
||||
prettify(result));
|
||||
}
|
||||
}
|
||||
if (options == 0) {
|
||||
result = UCharacter.toTitleCase(locale, test, iter);
|
||||
if (!expected.equals(result)) {
|
||||
errln("titlecasing for " + prettify(test) + " should be " +
|
||||
prettify(expected) + " but got " +
|
||||
prettify(result));
|
||||
}
|
||||
}
|
||||
}
|
||||
}catch(Exception ex){
|
||||
warnln("Could not find data for BreakIterators");
|
||||
@ -705,9 +726,10 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
"\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";
|
||||
|
||||
/**
|
||||
* each item is an array with input string, result string, locale ID, break iterator
|
||||
* each item is an array with input string, result string, locale ID, break iterator, options
|
||||
* the break iterator is specified as an int, same as in BreakIterator.KIND_*:
|
||||
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default
|
||||
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default (NULL=words) -2=no breaks (.*)
|
||||
* options: T=U_FOLD_CASE_EXCLUDE_SPECIAL_I L=U_TITLECASE_NO_LOWERCASE A=U_TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
* see ICU4C source/test/testdata/casing.txt
|
||||
*/
|
||||
private static final String TITLE_DATA_[] = {
|
||||
@ -715,32 +737,82 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
"\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
|
||||
"",
|
||||
"0",
|
||||
"",
|
||||
|
||||
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
|
||||
"\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
|
||||
"",
|
||||
"1",
|
||||
"",
|
||||
|
||||
"\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
|
||||
"",
|
||||
"-1",
|
||||
"",
|
||||
|
||||
" tHe QUIcK bRoWn", " The Quick Brown",
|
||||
"",
|
||||
"4",
|
||||
"",
|
||||
|
||||
"\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc",
|
||||
"\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
|
||||
"",
|
||||
"0",
|
||||
"",
|
||||
|
||||
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
|
||||
"",
|
||||
"-1",
|
||||
"",
|
||||
|
||||
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'",
|
||||
"",
|
||||
"-1"
|
||||
"-1",
|
||||
"",
|
||||
|
||||
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
|
||||
"A \u02bbCat. A \u02bbDog! \u02bbEtc.",
|
||||
"",
|
||||
"-1",
|
||||
"", // default
|
||||
|
||||
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
|
||||
"A \u02bbcat. A \u02bbdog! \u02bbetc.",
|
||||
"",
|
||||
"-1",
|
||||
"A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
|
||||
"a \u02bbCaT. A \u02bbdOg! \u02bbeTc.",
|
||||
"A \u02bbCaT. A \u02bbdOg! \u02bbETc.",
|
||||
"",
|
||||
"3",
|
||||
"L", // UBRK_SENTENCE and U_TITLECASE_NO_LOWERCASE
|
||||
|
||||
|
||||
"\u02bbcAt! \u02bbeTc.",
|
||||
"\u02bbCat! \u02bbetc.",
|
||||
"",
|
||||
"-2",
|
||||
"", // -2=Trivial break iterator
|
||||
|
||||
"\u02bbcAt! \u02bbeTc.",
|
||||
"\u02bbcat! \u02bbetc.",
|
||||
"",
|
||||
"-2",
|
||||
"A", // U_TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
|
||||
"\u02bbcAt! \u02bbeTc.",
|
||||
"\u02bbCAt! \u02bbeTc.",
|
||||
"",
|
||||
"-2",
|
||||
"L", // U_TITLECASE_NO_LOWERCASE
|
||||
|
||||
"\u02bbcAt! \u02bbeTc.",
|
||||
"\u02bbcAt! \u02bbeTc.",
|
||||
"",
|
||||
"-2",
|
||||
"AL" // Both options
|
||||
};
|
||||
|
||||
|
||||
|
@ -2731,6 +2731,43 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static final int MAX_RADIX = java.lang.Character.MAX_RADIX;
|
||||
|
||||
/**
|
||||
* Do not lowercase non-initial parts of words when titlecasing.
|
||||
* Option bit for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* By default, titlecasing will titlecase the first cased character
|
||||
* of a word and lowercase all other characters.
|
||||
* With this option, the other characters will not be modified.
|
||||
*
|
||||
* @see #toTitleCase
|
||||
* @draft ICU 3.8
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int TITLECASE_NO_LOWERCASE = 0x100;
|
||||
|
||||
/**
|
||||
* Do not adjust the titlecasing indexes from BreakIterator.next() indexes;
|
||||
* titlecase exactly the characters at breaks from the iterator.
|
||||
* Option bit for titlecasing APIs that take an options bit set.
|
||||
*
|
||||
* By default, titlecasing will take each break iterator index,
|
||||
* adjust it by looking for the next cased character, and titlecase that one.
|
||||
* Other characters are lowercased.
|
||||
*
|
||||
* This follows Unicode 4 &amp; 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* @see #toTitleCase
|
||||
* @see #TITLECASE_NO_LOWERCASE
|
||||
* @draft ICU 3.8
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final int TITLECASE_NO_BREAK_ADJUSTMENT = 0x200;
|
||||
|
||||
// public methods ----------------------------------------------------
|
||||
|
||||
/**
|
||||
@ -4062,6 +4099,13 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Move to the iteration limit without fetching code points up to there.
|
||||
*/
|
||||
public void moveToLimit() {
|
||||
cpStart=cpLimit=limit;
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate forward through the string to fetch the next code point
|
||||
* to be case-mapped, and set the context indexes for it.
|
||||
@ -4105,6 +4149,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
return cpStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the limit of the code point that was last returned
|
||||
* by nextCaseMapCP().
|
||||
*/
|
||||
public int getCPLimit() {
|
||||
return cpLimit;
|
||||
}
|
||||
|
||||
// implement UCaseProps.ContextIterator
|
||||
public void reset(int dir) {
|
||||
if(dir>0) {
|
||||
@ -4170,7 +4222,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
/**
|
||||
* <p>Gets the titlecase version of the argument string.</p>
|
||||
* <p>Position for titlecasing is determined by the argument break
|
||||
* iterator, hence the user can customized his break iterator for
|
||||
* iterator, hence the user can customize his break iterator for
|
||||
* a specialized titlecasing. In this case only the forward iteration
|
||||
* needs to be implemented.
|
||||
* If the break iterator passed in is null, the default Unicode algorithm
|
||||
@ -4299,7 +4351,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
/**
|
||||
* <p>Gets the titlecase version of the argument string.</p>
|
||||
* <p>Position for titlecasing is determined by the argument break
|
||||
* iterator, hence the user can customized his break iterator for
|
||||
* iterator, hence the user can customize his break iterator for
|
||||
* a specialized titlecasing. In this case only the forward iteration
|
||||
* needs to be implemented.
|
||||
* If the break iterator passed in is null, the default Unicode algorithm
|
||||
@ -4324,7 +4376,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
/**
|
||||
* <p>Gets the titlecase version of the argument string.</p>
|
||||
* <p>Position for titlecasing is determined by the argument break
|
||||
* iterator, hence the user can customized his break iterator for
|
||||
* iterator, hence the user can customize his break iterator for
|
||||
* a specialized titlecasing. In this case only the forward iteration
|
||||
* needs to be implemented.
|
||||
* If the break iterator passed in is null, the default Unicode algorithm
|
||||
@ -4342,6 +4394,35 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
*/
|
||||
public static String toTitleCase(ULocale locale, String str,
|
||||
BreakIterator titleIter) {
|
||||
return toTitleCase(locale, str, titleIter, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>Gets the titlecase version of the argument string.</p>
|
||||
* <p>Position for titlecasing is determined by the argument break
|
||||
* iterator, hence the user can customize his break iterator for
|
||||
* a specialized titlecasing. In this case only the forward iteration
|
||||
* needs to be implemented.
|
||||
* If the break iterator passed in is null, the default Unicode algorithm
|
||||
* will be used to determine the titlecase positions.
|
||||
* </p>
|
||||
* <p>Only positions returned by the break iterator will be title cased,
|
||||
* characters in between the positions will all be in lower case (unless TITLECASE_NO_LOWERCASE is set).</p>
|
||||
* <p>Casing is dependent on the argument locale and context-sensitive</p>
|
||||
* @param locale locale in which the string is to be converted
|
||||
* @param str source string to be titlecased
|
||||
* @param titleIter break iterator to determine the positions in which
|
||||
* the character should be title cased.
|
||||
* @param options bit set to modify the titlecasing operation
|
||||
* @return titlecase version of the argument string
|
||||
* @draft ICU 3.8
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
* @see #TITLECASE_NO_LOWERCASE
|
||||
* @see #TITLECASE_NO_BREAK_ADJUSTMENT
|
||||
*/
|
||||
public static String toTitleCase(ULocale locale, String str,
|
||||
BreakIterator titleIter,
|
||||
int options) {
|
||||
StringContextIterator iter = new StringContextIterator(str);
|
||||
StringBuffer result = new StringBuffer(str.length());
|
||||
int[] locCache = new int[1];
|
||||
@ -4393,11 +4474,16 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
iter.setLimit(index);
|
||||
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
|
||||
titleStart=iter.getCPStart();
|
||||
if(prev<titleStart) {
|
||||
// TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
|
||||
result.append(str.substring(prev, titleStart));
|
||||
c=iter.nextCaseMapCP();
|
||||
if((options&TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCaseProps.NONE==gCsp.getType(c)) {
|
||||
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
|
||||
titleStart=iter.getCPStart();
|
||||
if(prev<titleStart) {
|
||||
// TODO: With Java 5, this would want to be result.append(str, prev, titleStart);
|
||||
result.append(str.substring(prev, titleStart));
|
||||
}
|
||||
} else {
|
||||
titleStart=prev;
|
||||
}
|
||||
|
||||
if(titleStart<index) {
|
||||
@ -4424,8 +4510,18 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
UTF16.append(result, c);
|
||||
}
|
||||
}
|
||||
|
||||
if((c=iter.nextCaseMapCP())>=0) {
|
||||
|
||||
if((options&TITLECASE_NO_LOWERCASE)!=0) {
|
||||
/* Optionally just copy the rest of the word unchanged. */
|
||||
int titleLimit=iter.getCPLimit();
|
||||
if(titleLimit<index) {
|
||||
// TODO: With Java 5, this would want to be result.append(str, titleLimit, index);
|
||||
result.append(str.substring(titleLimit, index));
|
||||
iter.moveToLimit();
|
||||
break;
|
||||
}
|
||||
} else if((c=iter.nextCaseMapCP())>=0) {
|
||||
/* Normal operation: Lowercase the rest of the word. */
|
||||
c=gCsp.toFullLower(c, iter, result, locale, locCache);
|
||||
} else {
|
||||
break;
|
||||
|
Loading…
Reference in New Issue
Block a user