ICU-4933 titlecase first _cased_ letter
X-SVN-Rev: 19962
This commit is contained in:
parent
86ff19728b
commit
885b57fdb6
@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
|
||||
import com.ibm.icu.lang.UCharacter;
|
||||
import com.ibm.icu.text.UTF16;
|
||||
import com.ibm.icu.text.BreakIterator;
|
||||
import com.ibm.icu.util.ULocale;
|
||||
import com.ibm.icu.impl.Utility;
|
||||
import java.util.Locale;
|
||||
import java.io.BufferedReader;
|
||||
@ -287,39 +288,40 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
}
|
||||
}
|
||||
|
||||
// Unfortunately, BreakIterator.getBreakInstance(ULocale where, int kind)
|
||||
// is private. Re-implement it here.
|
||||
private final BreakIterator getBreakInstance(ULocale where, int kind) {
|
||||
switch(kind) {
|
||||
case BreakIterator.KIND_CHARACTER:
|
||||
return BreakIterator.getCharacterInstance(where);
|
||||
case BreakIterator.KIND_WORD:
|
||||
return BreakIterator.getWordInstance(where);
|
||||
case BreakIterator.KIND_LINE:
|
||||
return BreakIterator.getLineInstance(where);
|
||||
case BreakIterator.KIND_SENTENCE:
|
||||
return BreakIterator.getSentenceInstance(where);
|
||||
case BreakIterator.KIND_TITLE:
|
||||
return BreakIterator.getTitleInstance(where);
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
public void TestTitle()
|
||||
{
|
||||
try{
|
||||
BreakIterator TITLE_BREAKITERATORS_[] =
|
||||
{
|
||||
BreakIterator.getCharacterInstance(),
|
||||
BreakIterator.getWordInstance(),
|
||||
BreakIterator.getTitleInstance(),
|
||||
BreakIterator.getCharacterInstance(),
|
||||
null,
|
||||
null
|
||||
};
|
||||
for (int i = 0; i < TITLE_BREAKITERATORS_.length; i ++) {
|
||||
String test = TITLE_DATA_[i << 1];
|
||||
String expected = TITLE_DATA_[(i << 1) + 1];
|
||||
if (!expected.equals(
|
||||
UCharacter.toTitleCase(test,
|
||||
TITLE_BREAKITERATORS_[i]))) {
|
||||
errln("error: titlecasing for " + prettify(test) + " should be " +
|
||||
for (int i = 0; i < TITLE_DATA_.length;) {
|
||||
String test = TITLE_DATA_[i++];
|
||||
String expected = TITLE_DATA_[i++];
|
||||
String locale = TITLE_DATA_[i++];
|
||||
String breakType = TITLE_DATA_[i++];
|
||||
ULocale loc = new ULocale(locale);
|
||||
BreakIterator iter = getBreakInstance(loc, Integer.parseInt(breakType));
|
||||
String result = UCharacter.toTitleCase(loc, test, iter);
|
||||
if (!expected.equals(result)) {
|
||||
errln("titlecasing for " + prettify(test) + " should be " +
|
||||
prettify(expected) + " but got " +
|
||||
prettify(UCharacter.toTitleCase(test,
|
||||
TITLE_BREAKITERATORS_[i])));
|
||||
}
|
||||
//cover toTitleCase(Locale, String, BreakIterator)
|
||||
Locale def = Locale.getDefault();
|
||||
String data = TITLE_DATA_[i << 1];
|
||||
if (!expected.equals(
|
||||
UCharacter.toTitleCase(def, data,
|
||||
TITLE_BREAKITERATORS_[i]))) {
|
||||
errln("error: titlecasing for " + prettify(data) + " should be " +
|
||||
prettify(expected) + " but got " +
|
||||
prettify(UCharacter.toTitleCase(def, data,
|
||||
TITLE_BREAKITERATORS_[i])));
|
||||
prettify(result));
|
||||
}
|
||||
}
|
||||
}catch(Exception ex){
|
||||
@ -720,24 +722,42 @@ public final class UCharacterCaseTest extends TestFmwk
|
||||
"\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";
|
||||
|
||||
/**
|
||||
* each item is an array with input string, result string, locale
|
||||
* each item is an array with input string, result string, locale ID, break iterator
|
||||
* the break iterator is specified as an int, same as in BreakIterator.KIND_*:
|
||||
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default
|
||||
* see ICU4C source/test/testdata/casing.txt
|
||||
*/
|
||||
private static final String TITLE_DATA_[] = {
|
||||
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
|
||||
"\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
|
||||
|
||||
"",
|
||||
"0",
|
||||
|
||||
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
|
||||
"\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
|
||||
|
||||
"",
|
||||
"1",
|
||||
|
||||
"\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
|
||||
"",
|
||||
"-1",
|
||||
|
||||
" tHe QUIcK bRoWn", " The Quick Brown",
|
||||
|
||||
"",
|
||||
"4",
|
||||
|
||||
"\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc",
|
||||
"\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
|
||||
|
||||
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
|
||||
|
||||
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'"
|
||||
"",
|
||||
"0",
|
||||
|
||||
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
|
||||
"",
|
||||
"-1",
|
||||
|
||||
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'",
|
||||
"",
|
||||
"-1"
|
||||
};
|
||||
|
||||
|
||||
|
@ -4140,6 +4140,9 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* If the limit parameter is negative or past the string, then the
|
||||
* string length is restored as the iteration limit.
|
||||
*
|
||||
* This limit does not affect the next() function which always
|
||||
* iterates to the very end of the string.
|
||||
*
|
||||
* @param lim The iteration limit.
|
||||
*/
|
||||
public void setLimit(int lim) {
|
||||
@ -4156,13 +4159,17 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
* Performance optimization, to save on function calls and redundant
|
||||
* tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
|
||||
*
|
||||
* When the iteration limit is reached (and -1 is returned),
|
||||
* getCPStart() will be at the iteration limit.
|
||||
*
|
||||
* Iteration with next() does not affect the position for nextCaseMapCP().
|
||||
*
|
||||
* @return The next code point to be case-mapped, or <0 when the iteration is done.
|
||||
*/
|
||||
public int nextCaseMapCP() {
|
||||
int c;
|
||||
cpStart=cpLimit;
|
||||
if(cpLimit<limit) {
|
||||
cpStart=cpLimit;
|
||||
c=s.charAt(cpLimit++);
|
||||
int c=s.charAt(cpLimit++);
|
||||
if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
|
||||
char c2;
|
||||
if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
|
||||
@ -4181,6 +4188,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the start of the code point that was last returned
|
||||
* by nextCaseMapCP().
|
||||
*/
|
||||
public int getCPStart() {
|
||||
return cpStart;
|
||||
}
|
||||
|
||||
// implement UCaseProps.ContextIterator
|
||||
public void reset(int dir) {
|
||||
if(dir>0) {
|
||||
@ -4201,7 +4216,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
public int next() {
|
||||
int c;
|
||||
|
||||
if(dir>0 && index<limit) {
|
||||
if(dir>0 && index<s.length()) {
|
||||
c=UTF16.charAt(s, index);
|
||||
index+=UTF16.getCharCount(c);
|
||||
return c;
|
||||
@ -4436,14 +4451,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
}
|
||||
titleIter.setText(str);
|
||||
|
||||
int index;
|
||||
int prev, titleStart, index;
|
||||
boolean isFirstIndex;
|
||||
|
||||
/* set up local variables */
|
||||
prev=0;
|
||||
isFirstIndex=true;
|
||||
|
||||
/* titlecasing loop */
|
||||
for(;;) {
|
||||
while(prev<srcLength) {
|
||||
/* find next index where to titlecase */
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=false;
|
||||
@ -4455,53 +4471,63 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
|
||||
index=srcLength;
|
||||
}
|
||||
|
||||
/* lowercase up to index */
|
||||
iter.setLimit(index);
|
||||
while((c=iter.nextCaseMapCP())>=0) {
|
||||
c=gCsp.toFullLower(c, iter, result, locale, locCache);
|
||||
|
||||
/* decode the result */
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
continue;
|
||||
/* } else { append single-code point mapping */
|
||||
/*
|
||||
* Unicode 4 & 5 section 3.13 Default Case Operations:
|
||||
*
|
||||
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
|
||||
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
|
||||
* cased character F. If F exists, map F to default_title(F); then map each
|
||||
* subsequent character C to default_lower(C).
|
||||
*
|
||||
* In this implementation, segment [prev..index[ into 3 parts:
|
||||
* a) uncased characters (copy as-is) [prev..titleStart[
|
||||
* b) first cased letter (titlecase) [titleStart..titleLimit[
|
||||
* c) subsequent characters (lowercase) [titleLimit..index[
|
||||
*/
|
||||
if(prev<index) {
|
||||
/* find and copy uncased characters [prev..titleStart[ */
|
||||
iter.setLimit(index);
|
||||
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
|
||||
titleStart=iter.getCPStart();
|
||||
if(prev<titleStart) {
|
||||
result.append(str, prev, titleStart);
|
||||
}
|
||||
if(c<=0xffff) {
|
||||
result.append((char)c);
|
||||
} else {
|
||||
UTF16.append(result, c);
|
||||
|
||||
if(titleStart<index) {
|
||||
/* titlecase c which is from titleStart */
|
||||
c=gCsp.toFullTitle(c, iter, result, locale, locCache);
|
||||
|
||||
/* decode the result and lowercase up to index */
|
||||
for(;;) {
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
if(c<=0xffff) {
|
||||
result.append((char)c);
|
||||
} else {
|
||||
UTF16.append(result, c);
|
||||
}
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
} else {
|
||||
/* append single-code point mapping */
|
||||
if(c<=0xffff) {
|
||||
result.append((char)c);
|
||||
} else {
|
||||
UTF16.append(result, c);
|
||||
}
|
||||
}
|
||||
|
||||
if((c=iter.nextCaseMapCP())>=0) {
|
||||
c=gCsp.toFullLower(c, iter, result, locale, locCache);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(index>=srcLength) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* titlecase the character at the found index */
|
||||
iter.setLimit(srcLength);
|
||||
c=iter.nextCaseMapCP();
|
||||
if(c<0) {
|
||||
break; // reached end of str
|
||||
}
|
||||
c=gCsp.toFullTitle(c, iter, result, locale, locCache);
|
||||
|
||||
/* decode the result */
|
||||
if(c<0) {
|
||||
/* (not) original code point */
|
||||
c=~c;
|
||||
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
|
||||
/* mapping already appended to result */
|
||||
continue;
|
||||
/* } else { append single-code point mapping */
|
||||
}
|
||||
if(c<=0xffff) {
|
||||
result.append((char)c);
|
||||
} else {
|
||||
UTF16.append(result, c);
|
||||
}
|
||||
prev=index;
|
||||
}
|
||||
return result.toString();
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user