ICU-4933 titlecase first _cased_ letter

X-SVN-Rev: 19962
This commit is contained in:
Markus Scherer 2006-08-02 21:56:34 +00:00
parent 86ff19728b
commit 885b57fdb6
2 changed files with 132 additions and 86 deletions

View File

@ -14,6 +14,7 @@ import com.ibm.icu.dev.test.TestUtil;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.ULocale;
import com.ibm.icu.impl.Utility;
import java.util.Locale;
import java.io.BufferedReader;
@ -287,39 +288,40 @@ public final class UCharacterCaseTest extends TestFmwk
}
}
// BreakIterator.getBreakInstance(ULocale where, int kind) is private and
// cannot be called from this test, so the same dispatch is repeated here.
// Maps one of the BreakIterator.KIND_* values to the matching locale-specific
// factory method; any other kind value yields null.
private final BreakIterator getBreakInstance(ULocale where, int kind) {
    if (kind == BreakIterator.KIND_CHARACTER) {
        return BreakIterator.getCharacterInstance(where);
    }
    if (kind == BreakIterator.KIND_WORD) {
        return BreakIterator.getWordInstance(where);
    }
    if (kind == BreakIterator.KIND_LINE) {
        return BreakIterator.getLineInstance(where);
    }
    if (kind == BreakIterator.KIND_SENTENCE) {
        return BreakIterator.getSentenceInstance(where);
    }
    if (kind == BreakIterator.KIND_TITLE) {
        return BreakIterator.getTitleInstance(where);
    }
    // Not a recognized KIND_* value: no iterator is created.
    return null;
}
public void TestTitle()
{
try{
BreakIterator TITLE_BREAKITERATORS_[] =
{
BreakIterator.getCharacterInstance(),
BreakIterator.getWordInstance(),
BreakIterator.getTitleInstance(),
BreakIterator.getCharacterInstance(),
null,
null
};
for (int i = 0; i < TITLE_BREAKITERATORS_.length; i ++) {
String test = TITLE_DATA_[i << 1];
String expected = TITLE_DATA_[(i << 1) + 1];
if (!expected.equals(
UCharacter.toTitleCase(test,
TITLE_BREAKITERATORS_[i]))) {
errln("error: titlecasing for " + prettify(test) + " should be " +
for (int i = 0; i < TITLE_DATA_.length;) {
String test = TITLE_DATA_[i++];
String expected = TITLE_DATA_[i++];
String locale = TITLE_DATA_[i++];
String breakType = TITLE_DATA_[i++];
ULocale loc = new ULocale(locale);
BreakIterator iter = getBreakInstance(loc, Integer.parseInt(breakType));
String result = UCharacter.toTitleCase(loc, test, iter);
if (!expected.equals(result)) {
errln("titlecasing for " + prettify(test) + " should be " +
prettify(expected) + " but got " +
prettify(UCharacter.toTitleCase(test,
TITLE_BREAKITERATORS_[i])));
}
//cover toTitleCase(Locale, String, BreakIterator)
Locale def = Locale.getDefault();
String data = TITLE_DATA_[i << 1];
if (!expected.equals(
UCharacter.toTitleCase(def, data,
TITLE_BREAKITERATORS_[i]))) {
errln("error: titlecasing for " + prettify(data) + " should be " +
prettify(expected) + " but got " +
prettify(UCharacter.toTitleCase(def, data,
TITLE_BREAKITERATORS_[i])));
prettify(result));
}
}
}catch(Exception ex){
@ -720,24 +722,42 @@ public final class UCharacterCaseTest extends TestFmwk
"\u0061\u0062\u0131\u03c3\u00df\u03c2\u002f\ud93f\udfff";
/**
* each item is an array with input string, result string, locale
* each item is an array with input string, result string, locale ID, break iterator
* the break iterator is specified as an int, same as in BreakIterator.KIND_*:
* 0=KIND_CHARACTER 1=KIND_WORD 2=KIND_LINE 3=KIND_SENTENCE 4=KIND_TITLE -1=default
* see ICU4C source/test/testdata/casing.txt
*/
private static final String TITLE_DATA_[] = {
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
"\u0041\u0042\u0020\u0049\u03a3\u0020\u0053\u0073\u03a3\u002f\u0046\u0066\u0069\ud93f\udfff",
"",
"0",
"\u0061\u0042\u0020\u0069\u03c2\u0020\u00df\u03c3\u002f\ufb03\ud93f\udfff",
"\u0041\u0062\u0020\u0049\u03c2\u0020\u0053\u0073\u03c3\u002f\u0046\u0066\u0069\ud93f\udfff",
"",
"1",
"\u02bbaMeLikA huI P\u016b \u02bb\u02bb\u02bbiA", "\u02bbAmelika Hui P\u016b \u02bb\u02bb\u02bbIa", // titlecase first _cased_ letter, j4933
"",
"-1",
" tHe QUIcK bRoWn", " The Quick Brown",
"",
"4",
"\u01c4\u01c5\u01c6\u01c7\u01c8\u01c9\u01ca\u01cb\u01cc",
"\u01c5\u01c5\u01c5\u01c8\u01c8\u01c8\u01cb\u01cb\u01cb", // UBRK_CHARACTER
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'"
"",
"0",
"\u01c9ubav ljubav", "\u01c8ubav Ljubav", // Lj vs. L+j
"",
"-1",
"'oH dOn'T tItLeCaSe AfTeR lEtTeR+'", "'Oh Don't Titlecase After Letter+'",
"",
"-1"
};

View File

@ -4140,6 +4140,9 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* If the limit parameter is negative or past the string, then the
* string length is restored as the iteration limit.
*
* This limit does not affect the next() function which always
* iterates to the very end of the string.
*
* @param lim The iteration limit.
*/
public void setLimit(int lim) {
@ -4156,13 +4159,17 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
* Performance optimization, to save on function calls and redundant
* tests. Combines UTF16.charAt(), UTF16.getCharCount(), and setIndex().
*
* When the iteration limit is reached (and -1 is returned),
* getCPStart() will be at the iteration limit.
*
* Iteration with next() does not affect the position for nextCaseMapCP().
*
* @return The next code point to be case-mapped, or <0 when the iteration is done.
*/
public int nextCaseMapCP() {
int c;
cpStart=cpLimit;
if(cpLimit<limit) {
cpStart=cpLimit;
c=s.charAt(cpLimit++);
int c=s.charAt(cpLimit++);
if(UTF16.LEAD_SURROGATE_MIN_VALUE<=c || c<=UTF16.TRAIL_SURROGATE_MAX_VALUE) {
char c2;
if( c<=UTF16.LEAD_SURROGATE_MAX_VALUE && cpLimit<limit &&
@ -4181,6 +4188,14 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
}
}
/**
 * Returns the start index of the code point most recently
 * returned by nextCaseMapCP().
 *
 * @return The value of cpStart as last set by nextCaseMapCP().
 */
public int getCPStart() {
    return this.cpStart;
}
// implement UCaseProps.ContextIterator
public void reset(int dir) {
if(dir>0) {
@ -4201,7 +4216,7 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
public int next() {
int c;
if(dir>0 && index<limit) {
if(dir>0 && index<s.length()) {
c=UTF16.charAt(s, index);
index+=UTF16.getCharCount(c);
return c;
@ -4436,14 +4451,15 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
}
titleIter.setText(str);
int index;
int prev, titleStart, index;
boolean isFirstIndex;
/* set up local variables */
prev=0;
isFirstIndex=true;
/* titlecasing loop */
for(;;) {
while(prev<srcLength) {
/* find next index where to titlecase */
if(isFirstIndex) {
isFirstIndex=false;
@ -4455,53 +4471,63 @@ public final class UCharacter implements ECharacterCategory, ECharacterDirection
index=srcLength;
}
/* lowercase up to index */
iter.setLimit(index);
while((c=iter.nextCaseMapCP())>=0) {
c=gCsp.toFullLower(c, iter, result, locale, locCache);
/* decode the result */
if(c<0) {
/* (not) original code point */
c=~c;
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
/* mapping already appended to result */
continue;
/* } else { append single-code point mapping */
/*
* Unicode 4 & 5 section 3.13 Default Case Operations:
*
* R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
* #29, "Text Boundaries." Between each pair of word boundaries, find the first
* cased character F. If F exists, map F to default_title(F); then map each
* subsequent character C to default_lower(C).
*
* In this implementation, segment [prev..index[ into 3 parts:
* a) uncased characters (copy as-is) [prev..titleStart[
* b) first cased letter (titlecase) [titleStart..titleLimit[
* c) subsequent characters (lowercase) [titleLimit..index[
*/
if(prev<index) {
/* find and copy uncased characters [prev..titleStart[ */
iter.setLimit(index);
while((c=iter.nextCaseMapCP())>=0 && UCaseProps.NONE==gCsp.getType(c)) {}
titleStart=iter.getCPStart();
if(prev<titleStart) {
result.append(str, prev, titleStart);
}
if(c<=0xffff) {
result.append((char)c);
} else {
UTF16.append(result, c);
if(titleStart<index) {
/* titlecase c which is from titleStart */
c=gCsp.toFullTitle(c, iter, result, locale, locCache);
/* decode the result and lowercase up to index */
for(;;) {
if(c<0) {
/* (not) original code point */
c=~c;
if(c<=0xffff) {
result.append((char)c);
} else {
UTF16.append(result, c);
}
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
/* mapping already appended to result */
} else {
/* append single-code point mapping */
if(c<=0xffff) {
result.append((char)c);
} else {
UTF16.append(result, c);
}
}
if((c=iter.nextCaseMapCP())>=0) {
c=gCsp.toFullLower(c, iter, result, locale, locCache);
} else {
break;
}
}
}
}
if(index>=srcLength) {
break;
}
/* titlecase the character at the found index */
iter.setLimit(srcLength);
c=iter.nextCaseMapCP();
if(c<0) {
break; // reached end of str
}
c=gCsp.toFullTitle(c, iter, result, locale, locCache);
/* decode the result */
if(c<0) {
/* (not) original code point */
c=~c;
} else if(c<=UCaseProps.MAX_STRING_LENGTH) {
/* mapping already appended to result */
continue;
/* } else { append single-code point mapping */
}
if(c<=0xffff) {
result.append((char)c);
} else {
UTF16.append(result, c);
}
prev=index;
}
return result.toString();
}