new extended name APIs

X-SVN-Rev: 7677
This commit is contained in:
Syn Wee Quek 2002-02-15 02:53:35 +00:00
parent 6fdea6ffb4
commit d882319b30
10 changed files with 782 additions and 100 deletions

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterTest.java,v $
* $Date: 2002/02/08 23:44:17 $
* $Revision: 1.21 $
* $Date: 2002/02/15 02:53:32 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -555,16 +555,33 @@ public final class UCharacterTest extends TestFmwk
*/
public void TestNames()
{
int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xff08, 0xffe5,
0x23456};
int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xd800, 0xdc00,
0xff08, 0xffe5, 0xffff, 0x23456, 0x9};
String name[] = {"LATIN SMALL LETTER A",
"LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
"CJK UNIFIED IDEOGRAPH-3401",
"CJK UNIFIED IDEOGRAPH-7FED", "HANGUL SYLLABLE GA",
"HANGUL SYLLABLE HIH", "FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN", "CJK UNIFIED IDEOGRAPH-23456"};
"HANGUL SYLLABLE HIH", "", "",
"FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN", "", "CJK UNIFIED IDEOGRAPH-23456",
""};
String oldname[] = {"", "LATIN SMALL LETTER DOTLESS J BAR HOOK", "", "",
"", "", "FULLWIDTH OPENING PARENTHESIS", "", ""};
"", "", "", "", "FULLWIDTH OPENING PARENTHESIS", "",
"", "", "HORIZONTAL TABULATION"};
String extendedname[] = {"LATIN SMALL LETTER A",
"LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
"CJK UNIFIED IDEOGRAPH-3401",
"CJK UNIFIED IDEOGRAPH-7FED",
"HANGUL SYLLABLE GA",
"HANGUL SYLLABLE HIH",
"<lead surrogate-D800>",
"<trail surrogate-DC00>",
"FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN",
"<noncharacter-FFFF>",
"CJK UNIFIED IDEOGRAPH-23456",
"HORIZONTAL TABULATION"};
int size = c.length;
String str;
int uc;
@ -573,7 +590,8 @@ public final class UCharacterTest extends TestFmwk
{
// modern Unicode character name
str = UCharacter.getName(c[i]);
if (!str.equals(name[i]))
if ((str == null && name[i].length() > 0) ||
(str != null && !str.equals(name[i])))
{
errln("FAIL \\u" + hex(c[i]) + " expected name " +
name[i]);
@ -590,9 +608,18 @@ public final class UCharacterTest extends TestFmwk
break;
}
// extended character name
str = UCharacter.getExtendedName(c[i]);
if (str == null || !str.equals(extendedname[i]))
{
errln("FAIL \\u" + hex(c[i]) + " expected extended name " +
extendedname[i]);
break;
}
// retrieving unicode character from modern name
uc = UCharacter.getCharFromName(name[i]);
if (uc != c[i])
if (uc != c[i] && name[i].length() != 0)
{
errln("FAIL " + name[i] + " expected character \\u" + hex(c[i]));
break;
@ -600,9 +627,17 @@ public final class UCharacterTest extends TestFmwk
//retrieving unicode character from 1.0 name
uc = UCharacter.getCharFromName1_0(oldname[i]);
if (uc != c[i] && oldname[i].length() != 0)
{
errln("FAIL " + oldname[i] + " expected 1.0 character \\u" + hex(c[i]));
break;
}
//retrieving unicode character from extended name
uc = UCharacter.getCharFromExtendedName(extendedname[i]);
if (uc != c[i] && i != 0 && (i == 1 || i == 6))
{
errln("FAIL " + name[i] + " expected 1.0 character \\u" + hex(c[i]));
errln("FAIL " + extendedname[i] + " expected extended character \\u" + hex(c[i]));
break;
}
}
@ -1014,8 +1049,8 @@ public final class UCharacterTest extends TestFmwk
try
{
UCharacterTest test = new UCharacterTest();
//test.TestEnumeration();
test.run(arg);
test.TestNames();
//test.run(arg);
}
catch (Exception e)
{

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/lang/UCharacter.java,v $
* $Date: 2002/02/08 01:08:38 $
* $Revision: 1.21 $
* $Date: 2002/02/15 02:53:32 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -17,6 +17,8 @@ package com.ibm.text;
import java.util.Locale;
import com.ibm.util.Utility;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.text.BreakIterator;
import com.ibm.text.RuleBasedBreakIterator;
/**
* <p>
@ -910,8 +912,7 @@ public final class UCharacter
*/
public static String getName(int ch)
{
return NAME_.getName(ch,
UCharacterNameChoice.U_UNICODE_CHAR_NAME);
return NAME_.getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
}
/**
@ -929,10 +930,33 @@ public final class UCharacter
return NAME_.getName(ch,
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
}
/**
* <p>Retrieves a name for a valid codepoint. Unlike getName(int) and
* getName1_0(int), this method will return a name even for codepoints that
* are not assigned a name in UnicodeData.txt.
* </p>
* The first available name in the following order is returned.
* <ul>
* <li> Most current Unicode name if there is any
* <li> Unicode 1.0 name if there is any
* <li> Extended name in the form of
* "&lt;codepoint_type-codepoint_hex_digits&gt;", with the hexadecimal
* digits in uppercase. E.g. &lt;noncharacter-FFFE&gt;
* </ul>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param ch the code point for which to get the name
* @return a name for the argument codepoint
* @draft 2.1
*/
public static String getExtendedName(int ch)
{
// All of the work is done by the shared name table; the name choice
// selects the extended lookup.
return NAME_.getName(ch, UCharacterNameChoice.U_EXTENDED_CHAR_NAME);
}
/**
* Find a Unicode code point by its most current Unicode name and return its
* code point value.<br>
* <p>Find a Unicode code point by its most current Unicode name and
* return its code point value. All Unicode names are in uppercase.</p>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name most current Unicode character name whose code point is to be
@ -946,8 +970,8 @@ public final class UCharacter
}
/**
* Find a Unicode character by its version 1.0 Unicode name and return its
* code point value.<br>
* <p>Find a Unicode character by its version 1.0 Unicode name and return
* its code point value. All Unicode names are in uppercase.</p>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name Unicode 1.0 code point name whose code point is to
@ -959,6 +983,31 @@ public final class UCharacter
return NAME_.getCharFromName(
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name);
}
/**
* <p>Find a Unicode character by any of its names and return its code
* point value. All Unicode names are in uppercase.
* Extended names are in lowercase, apart from the hexadecimal code point
* digits, and are contained within angle brackets.</p>
* The names are searched in the following order
* <ul>
* <li> Most current Unicode name if there is any
* <li> Unicode 1.0 name if there is any
* <li> Extended name in the form of
* "&lt;codepoint_type-codepoint_hex_digits&gt;".
* E.g. &lt;noncharacter-FFFE&gt;
* </ul>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name codepoint name
* @return code point associated with the name or -1 if the name is not
* found.
* @draft 2.1
*/
public static int getCharFromExtendedName(String name)
{
// NOTE(review): the name table appears to examine the angle-bracket
// form before the other name kinds; confirm that the search order
// documented above matches the lookup it performs.
return NAME_.getCharFromName(
UCharacterNameChoice.U_EXTENDED_CHAR_NAME, name);
}
/**
* Returns a code pointcorresponding to the two UTF16 characters.<br>
@ -1016,6 +1065,38 @@ public final class UCharacter
{
return toLowerCase(Locale.getDefault(), str);
}
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customize the break iterator for
* specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
* will be used to determine the titlecase positions.
* </p>
* <p>Only positions returned by the break iterator will be title cased,
* characters in between the positions will all be in lower case.</p>
* <p>Casing is dependent on the default locale and context-sensitive</p>
* <p>NOTE: this implementation is a placeholder. It currently returns the
* argument string unchanged; the break iterator is prepared but not yet
* consulted.</p>
* @param str source string to be performed on
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string (at present the
* argument string itself, as described in the note)
*/
public static String toTitleCase(String str, BreakIterator breakiter)
{
if (breakiter == null) {
// No iterator supplied: build the default rule set. $cased covers the
// Lu, Lt and Ll categories, $case_ignorable covers Mn, Me, Cf, Lm, Sk
// plus a few individual characters, and $not_cased is everything else.
String rules = "$cased=[[:Lu:][:Lt:][:Ll:]];" +
"$case_ignorable=[[:Mn:][:Me:][:Cf:][:Lm:][:Sk:]"
+ " \\u0027\u00AD\u2019];" +
"$not_cased=[^$cased$case_ignorable];" +
"[$not_cased$case_ignorable]*/" +
"$cased[$cased$case_ignorable]*$not_cased*;";
breakiter = new RuleBasedBreakIterator(rules);
}
// The iterator is not used yet; the input is handed back unmodified.
return str;
}
/**
* Gets uppercase version of the argument string.
@ -1111,6 +1192,30 @@ public final class UCharacter
return result.toString();
}
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customize the break iterator for
* specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
* will be used to determine the titlecase positions.
* </p>
* <p>Only positions returned by the break iterator will be title cased,
* characters in between the positions will all be in lower case.</p>
* <p>Casing is dependent on the argument locale and context-sensitive</p>
* <p>NOTE: this implementation is a placeholder. It currently ignores the
* locale and the break iterator and returns the argument string
* unchanged.</p>
* @param locale which string is to be converted in
* @param str source string to be performed on
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string (at present the
* argument string itself, as described in the note)
*/
public static String toTitleCase(Locale locale, String str,
BreakIterator breakiter)
{
// Placeholder: neither locale nor breakiter is consulted yet.
return str;
}
/**
* The given character is mapped to its case folding equivalent according to
* UnicodeData.txt and CaseFolding.txt; if the character has no case folding

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterCategory.java $
* $Date: 2001/10/12 23:53:16 $
* $Revision: 1.3 $
* $Date: 2002/02/15 02:53:35 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -26,22 +26,19 @@ package com.ibm.text;
public class UCharacterCategory
{
// private constructor ===================================================
/**
* Private constructor to prevent initialisation
*/
private UCharacterCategory()
{
}
// public variable =======================================================
// public variable -----------------------------------------------------
/**
* Unassigned character type
*/
public static final int UNASSIGNED = 0;
/**
* Character type Cn
* Not Assigned (no characters in [UnicodeData.txt] have this property)
* @draft 2.1
*/
public static final int GENERAL_OTHER_TYPES = 0;
/**
* Character type Lu
*/
public static final int UPPERCASE_LETTER = 1;
@ -163,17 +160,13 @@ public class UCharacterCategory
* Character type Pf
*/
public static final int FINAL_PUNCTUATION = 29;
/**
* Character type Cn
*/
public static final int GENERAL_OTHER_TYPES = 30;
// start of 31 ------------
/**
* Character type count
*/
public static final int CHAR_CATEGORY_COUNT = 31;
public static final int CHAR_CATEGORY_COUNT = 30;
/**
* Gets the name of the argument category
@ -245,4 +238,72 @@ public class UCharacterCategory
}
return "Unassigned";
}
// private constructor -----------------------------------------------
/**
* Private constructor to prevent initialisation
*/
private UCharacterCategory()
{
}
// package private data members --------------------------------------
// The extended types below are numbered directly after the public
// categories, starting at CHAR_CATEGORY_COUNT, so that they can share one
// index space with the public category values.
/**
* Not a character type
*/
static final int NON_CHARACTER_ = CHAR_CATEGORY_COUNT;
/**
* Lead surrogate type
*/
static final int LEAD_SURROGATE_ = CHAR_CATEGORY_COUNT + 1;
/**
* Trail surrogate type
*/
static final int TRAIL_SURROGATE_ = CHAR_CATEGORY_COUNT + 2;
/**
* Extended category count, i.e. the number of public categories plus the
* three extended types above
*/
static final int EXTENDED_CATEGORY_ = CHAR_CATEGORY_COUNT + 3;
/**
* Type names used for extended names.
* The array is indexed by category value: the entries up to
* "final punctuation" follow the public category constants, and the last
* three entries ("noncharacter", "lead surrogate", "trail surrogate") are
* meant to line up with NON_CHARACTER_, LEAD_SURROGATE_ and
* TRAIL_SURROGATE_. The order must be kept in sync with those values.
*/
static final String TYPE_NAMES_[] = {"unassigned",
"uppercase letter",
"lowercase letter",
"titlecase letter",
"modifier letter",
"other letter",
"non spacing mark",
"enclosing mark",
"combining spacing mark",
"decimal digit number",
"letter number",
"other number",
"space separator",
"line separator",
"paragraph separator",
"control",
"format",
"private use area",
"surrogate",
"dash punctuation",
"start punctuation",
"end punctuation",
"connector punctuation",
"other punctuation",
"math symbol",
"currency symbol",
"modifier symbol",
"other symbol",
"initial punctuation",
"final punctuation",
"noncharacter",
"lead surrogate",
"trail surrogate"};
/**
* Unknown type name, for a type value that has no entry in TYPE_NAMES_
*/
static final String UNKNOWN_TYPE_NAME_ = "unknown";
}

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $
* $Date: 2002/02/08 01:08:38 $
* $Revision: 1.6 $
* $Date: 2002/02/15 02:53:34 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -409,20 +409,29 @@ final class UCharacterName
return null;
}
int tempChoice = choice;
if (tempChoice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
tempChoice = UCharacterNameChoice.U_UNICODE_CHAR_NAME;
}
String result = "";
// Do not write algorithmic Unicode 1.0 names because Unihan names are
// the same as the modern ones, extension A was only introduced with
// Unicode 3.0, and the Hangul syllable block was moved and changed around
// Unicode 1.1.5.
if (choice == UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
if (tempChoice == UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
// try getting algorithmic name first
result = getAlgName(ch);
}
// getting normal character name
if (result == null || result.length() == 0) {
result = getGroupName(ch, choice);
if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
result = getExtendedName(ch);
} else {
result = getGroupName(ch, choice);
}
}
return result;
@ -442,26 +451,42 @@ final class UCharacterName
name == null || name.length() == 0) {
return -1;
}
String uppercasename = UCharacter.toUpperCase(Locale.ENGLISH, name);
// try extended names first
int result = getExtendedChar(name, choice);
if (result >= -1) {
return result;
}
// try algorithmic names first, if fails then try group names
// int result = getAlgorithmChar(choice, uppercasename);
int tempChoice = choice;
if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
tempChoice = UCharacterNameChoice.U_UNICODE_CHAR_NAME;
}
// 1.0 has no algorithmic names
String upperCaseName = UCharacter.toUpperCase(Locale.ENGLISH, name);
// try algorithmic names now, 1.0 has no algorithmic names
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
return getGroupChar(uppercasename, choice);
return getGroupChar(upperCaseName, tempChoice);
}
int count = 0;
if (m_algorithm_ != null) {
count = m_algorithm_.length;
}
for (count --; count >= 0; count --) {
int result = m_algorithm_[count].getAlgorithmChar(name);
result = m_algorithm_[count].getAlgorithmChar(name);
if (result >= 0) {
return result;
}
}
return getGroupChar(uppercasename, choice);
result = getGroupChar(upperCaseName, tempChoice);
if (result == -1 &&
choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
result = getGroupChar(upperCaseName,
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
}
return result;
}
/**
@ -943,4 +968,118 @@ final class UCharacterName
}
return -1;
}
/**
 * Resolves an extended name of the form &lt;type-HEX&gt; to its code point.
 * A name that does not start with '&lt;' is not an extended name and is left
 * for the other lookups. A name that does start with '&lt;' is always
 * answered here: either with the code point, or with -1 when the name
 * choice is not the extended one, the name is malformed, or the type in
 * the name does not agree with the type of the parsed code point.
 * @param name of the character to be found, must not be empty
 * @param choice name choice
 * @return character associated with the name, -1 if such character is not
 *         found and -2 if we should continue with the search.
 */
private int getExtendedChar(String name, int choice)
{
    // Not an angle-bracket name: the caller should keep searching.
    if (name.charAt(0) != '<') {
        return -2;
    }
    // From here on the name can only be an extended name, so every
    // failure is final.
    if (choice != UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
        return -1;
    }
    int closeIndex = name.length() - 1;
    if (name.charAt(closeIndex) != '>') {
        return -1;
    }
    // The last '-' separates the type name from the hex digits.
    int dashIndex = name.lastIndexOf('-');
    if (dashIndex < 0) {
        return -1;
    }
    int codepoint;
    try {
        codepoint = Integer.parseInt(
                        name.substring(dashIndex + 1, closeIndex), 16);
    }
    catch (NumberFormatException e) {
        return -1;
    }
    // Validate the type name: it has to be a known type, and it has to be
    // the type of the code point that was just parsed.
    String typeName = name.substring(1, dashIndex);
    String knownTypes[] = UCharacterCategory.TYPE_NAMES_;
    for (int type = 0; type < knownTypes.length; type ++) {
        if (typeName.compareToIgnoreCase(knownTypes[type]) == 0) {
            if (getType(codepoint) == type) {
                return codepoint;
            }
            return -1;
        }
    }
    return -1;
}
/**
 * Gets the extended type of a character. This is the general category,
 * except that noncharacters, ISO controls and the two kinds of surrogates
 * are reported with their own values.
 * @param ch character to be tested
 * @return extended type it is associated with
 */
private int getType(int ch)
{
    // Noncharacters: low 16 bits equal to FFFE or FFFF, or the range
    // FDD0..FDEF.
    boolean lastTwoOfPlane = (ch & 0xFFFE) == 0xFFFE;
    boolean inFdd0Block = ch >= 0xFDD0 && ch <= 0xFDEF;
    if (lastTwoOfPlane || inFdd0Block) {
        return UCharacterCategory.NON_CHARACTER_;
    }
    // Undo ICU exceptions to the UCD when determining the category.
    if (UCharacter.isISOControl(ch)) {
        return UCharacterCategory.CONTROL;
    }
    int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // Split the surrogate category into its lead and trail halves.
    return UTF16.isLeadSurrogate((char)ch)
               ? UCharacterCategory.LEAD_SURROGATE_
               : UCharacterCategory.TRAIL_SURROGATE_;
}
/**
 * Retrieves the extended name of a code point.
 * The first non-empty name in this order is returned: the most current
 * Unicode name, the Unicode 1.0 name if the code point has the control
 * type, and finally a generated name of the form &lt;type-HEX&gt;, where HEX
 * is the code point in uppercase hexadecimal, zero padded to at least 4
 * digits.
 * A missing name may be reported as null or as an empty string (getName
 * itself treats both alike), so both are handled as "no name" here.
 * @param ch code point whose extended name is to be retrieved
 * @return the extended name of the code point
 */
private String getExtendedName(int ch)
{
    String result = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
    if (result != null && result.length() > 0) {
        return result;
    }

    // The type is needed both for the control check and for the generated
    // name, so it is determined once.
    int type = getType(ch);
    if (type == UCharacterCategory.CONTROL) {
        result = getName(ch, UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
        if (result != null && result.length() > 0) {
            return result;
        }
    }

    // Return unknown if the table of type names is not up to date.
    String typeName;
    if (type >= UCharacterCategory.TYPE_NAMES_.length) {
        typeName = UCharacterCategory.UNKNOWN_TYPE_NAME_;
    }
    else {
        typeName = UCharacterCategory.TYPE_NAMES_[type];
    }

    String hex = Integer.toHexString(ch).toUpperCase();
    StringBuffer extended = new StringBuffer();
    extended.append('<');
    extended.append(typeName);
    extended.append('-');
    // pad to at least 4 hexadecimal digits
    for (int zeros = 4 - hex.length(); zeros > 0; zeros --) {
        extended.append('0');
    }
    extended.append(hex);
    extended.append('>');
    return extended.toString();
}
}

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterNameChoiceEnum.java $
* $Date: 2001/03/23 19:51:38 $
* $Revision: 1.2 $
* $Date: 2002/02/15 02:53:35 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -30,5 +30,6 @@ interface UCharacterNameChoice
static final int U_UNICODE_CHAR_NAME = 0;
static final int U_UNICODE_10_CHAR_NAME = 1;
static final int U_CHAR_NAME_CHOICE_COUNT = 2;
static final int U_EXTENDED_CHAR_NAME = 2;
static final int U_CHAR_NAME_CHOICE_COUNT = 3;
}

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/test/text/Attic/UCharacterTest.java,v $
* $Date: 2002/02/08 23:44:17 $
* $Revision: 1.21 $
* $Date: 2002/02/15 02:53:32 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -555,16 +555,33 @@ public final class UCharacterTest extends TestFmwk
*/
public void TestNames()
{
int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xff08, 0xffe5,
0x23456};
int c[] = {0x0061, 0x0284, 0x3401, 0x7fed, 0xac00, 0xd7a3, 0xd800, 0xdc00,
0xff08, 0xffe5, 0xffff, 0x23456, 0x9};
String name[] = {"LATIN SMALL LETTER A",
"LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
"CJK UNIFIED IDEOGRAPH-3401",
"CJK UNIFIED IDEOGRAPH-7FED", "HANGUL SYLLABLE GA",
"HANGUL SYLLABLE HIH", "FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN", "CJK UNIFIED IDEOGRAPH-23456"};
"HANGUL SYLLABLE HIH", "", "",
"FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN", "", "CJK UNIFIED IDEOGRAPH-23456",
""};
String oldname[] = {"", "LATIN SMALL LETTER DOTLESS J BAR HOOK", "", "",
"", "", "FULLWIDTH OPENING PARENTHESIS", "", ""};
"", "", "", "", "FULLWIDTH OPENING PARENTHESIS", "",
"", "", "HORIZONTAL TABULATION"};
String extendedname[] = {"LATIN SMALL LETTER A",
"LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK",
"CJK UNIFIED IDEOGRAPH-3401",
"CJK UNIFIED IDEOGRAPH-7FED",
"HANGUL SYLLABLE GA",
"HANGUL SYLLABLE HIH",
"<lead surrogate-D800>",
"<trail surrogate-DC00>",
"FULLWIDTH LEFT PARENTHESIS",
"FULLWIDTH YEN SIGN",
"<noncharacter-FFFF>",
"CJK UNIFIED IDEOGRAPH-23456",
"HORIZONTAL TABULATION"};
int size = c.length;
String str;
int uc;
@ -573,7 +590,8 @@ public final class UCharacterTest extends TestFmwk
{
// modern Unicode character name
str = UCharacter.getName(c[i]);
if (!str.equals(name[i]))
if ((str == null && name[i].length() > 0) ||
(str != null && !str.equals(name[i])))
{
errln("FAIL \\u" + hex(c[i]) + " expected name " +
name[i]);
@ -590,9 +608,18 @@ public final class UCharacterTest extends TestFmwk
break;
}
// extended character name
str = UCharacter.getExtendedName(c[i]);
if (str == null || !str.equals(extendedname[i]))
{
errln("FAIL \\u" + hex(c[i]) + " expected extended name " +
extendedname[i]);
break;
}
// retrieving unicode character from modern name
uc = UCharacter.getCharFromName(name[i]);
if (uc != c[i])
if (uc != c[i] && name[i].length() != 0)
{
errln("FAIL " + name[i] + " expected character \\u" + hex(c[i]));
break;
@ -600,9 +627,17 @@ public final class UCharacterTest extends TestFmwk
//retrieving unicode character from 1.0 name
uc = UCharacter.getCharFromName1_0(oldname[i]);
if (uc != c[i] && oldname[i].length() != 0)
{
errln("FAIL " + oldname[i] + " expected 1.0 character \\u" + hex(c[i]));
break;
}
//retrieving unicode character from extended name
uc = UCharacter.getCharFromExtendedName(extendedname[i]);
if (uc != c[i] && i != 0 && (i == 1 || i == 6))
{
errln("FAIL " + name[i] + " expected 1.0 character \\u" + hex(c[i]));
errln("FAIL " + extendedname[i] + " expected extended character \\u" + hex(c[i]));
break;
}
}
@ -1014,8 +1049,8 @@ public final class UCharacterTest extends TestFmwk
try
{
UCharacterTest test = new UCharacterTest();
//test.TestEnumeration();
test.run(arg);
test.TestNames();
//test.run(arg);
}
catch (Exception e)
{

View File

@ -5,8 +5,8 @@
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UCharacter.java,v $
* $Date: 2002/02/08 01:08:38 $
* $Revision: 1.21 $
* $Date: 2002/02/15 02:53:32 $
* $Revision: 1.22 $
*
*******************************************************************************
*/
@ -17,6 +17,8 @@ package com.ibm.text;
import java.util.Locale;
import com.ibm.util.Utility;
import com.ibm.icu.util.RangeValueIterator;
import com.ibm.text.BreakIterator;
import com.ibm.text.RuleBasedBreakIterator;
/**
* <p>
@ -910,8 +912,7 @@ public final class UCharacter
*/
public static String getName(int ch)
{
return NAME_.getName(ch,
UCharacterNameChoice.U_UNICODE_CHAR_NAME);
return NAME_.getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
}
/**
@ -929,10 +930,33 @@ public final class UCharacter
return NAME_.getName(ch,
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
}
/**
* <p>Retrieves a name for a valid codepoint. Unlike getName(int) and
* getName1_0(int), this method will return a name even for codepoints that
* are not assigned a name in UnicodeData.txt.
* </p>
* The first available name in the following order is returned.
* <ul>
* <li> Most current Unicode name if there is any
* <li> Unicode 1.0 name if there is any
* <li> Extended name in the form of
* "&lt;codepoint_type-codepoint_hex_digits&gt;", with the hexadecimal
* digits in uppercase. E.g. &lt;noncharacter-FFFE&gt;
* </ul>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param ch the code point for which to get the name
* @return a name for the argument codepoint
* @draft 2.1
*/
public static String getExtendedName(int ch)
{
// All of the work is done by the shared name table; the name choice
// selects the extended lookup.
return NAME_.getName(ch, UCharacterNameChoice.U_EXTENDED_CHAR_NAME);
}
/**
* Find a Unicode code point by its most current Unicode name and return its
* code point value.<br>
* <p>Find a Unicode code point by its most current Unicode name and
* return its code point value. All Unicode names are in uppercase.</p>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name most current Unicode character name whose code point is to be
@ -946,8 +970,8 @@ public final class UCharacter
}
/**
* Find a Unicode character by its version 1.0 Unicode name and return its
* code point value.<br>
* <p>Find a Unicode character by its version 1.0 Unicode name and return
* its code point value. All Unicode names are in uppercase.</p>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name Unicode 1.0 code point name whose code point is to
@ -959,6 +983,31 @@ public final class UCharacter
return NAME_.getCharFromName(
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME, name);
}
/**
* <p>Find a Unicode character by any of its names and return its code
* point value. All Unicode names are in uppercase.
* Extended names are in lowercase, apart from the hexadecimal code point
* digits, and are contained within angle brackets.</p>
* The names are searched in the following order
* <ul>
* <li> Most current Unicode name if there is any
* <li> Unicode 1.0 name if there is any
* <li> Extended name in the form of
* "&lt;codepoint_type-codepoint_hex_digits&gt;".
* E.g. &lt;noncharacter-FFFE&gt;
* </ul>
* Note calling any methods related to code point names, e.g. get*Name*()
* incurs a one-time initialisation cost to construct the name tables.
* @param name codepoint name
* @return code point associated with the name or -1 if the name is not
* found.
* @draft 2.1
*/
public static int getCharFromExtendedName(String name)
{
// NOTE(review): the name table appears to examine the angle-bracket
// form before the other name kinds; confirm that the search order
// documented above matches the lookup it performs.
return NAME_.getCharFromName(
UCharacterNameChoice.U_EXTENDED_CHAR_NAME, name);
}
/**
* Returns a code pointcorresponding to the two UTF16 characters.<br>
@ -1016,6 +1065,38 @@ public final class UCharacter
{
return toLowerCase(Locale.getDefault(), str);
}
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customize the break iterator for
* specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
* will be used to determine the titlecase positions.
* </p>
* <p>Only positions returned by the break iterator will be title cased,
* characters in between the positions will all be in lower case.</p>
* <p>Casing is dependent on the default locale and context-sensitive</p>
* <p>NOTE: this implementation is a placeholder. It currently returns the
* argument string unchanged; the break iterator is prepared but not yet
* consulted.</p>
* @param str source string to be performed on
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string (at present the
* argument string itself, as described in the note)
*/
public static String toTitleCase(String str, BreakIterator breakiter)
{
if (breakiter == null) {
// No iterator supplied: build the default rule set. $cased covers the
// Lu, Lt and Ll categories, $case_ignorable covers Mn, Me, Cf, Lm, Sk
// plus a few individual characters, and $not_cased is everything else.
String rules = "$cased=[[:Lu:][:Lt:][:Ll:]];" +
"$case_ignorable=[[:Mn:][:Me:][:Cf:][:Lm:][:Sk:]"
+ " \\u0027\u00AD\u2019];" +
"$not_cased=[^$cased$case_ignorable];" +
"[$not_cased$case_ignorable]*/" +
"$cased[$cased$case_ignorable]*$not_cased*;";
breakiter = new RuleBasedBreakIterator(rules);
}
// The iterator is not used yet; the input is handed back unmodified.
return str;
}
/**
* Gets uppercase version of the argument string.
@ -1111,6 +1192,30 @@ public final class UCharacter
return result.toString();
}
/**
* <p>Gets the titlecase version of the argument string.</p>
* <p>Position for titlecasing is determined by the argument break
* iterator, hence the user can customize the break iterator for
* specialized titlecasing. In this case only the forward iteration
* needs to be implemented.
* If the break iterator passed in is null, the default Unicode algorithm
* will be used to determine the titlecase positions.
* </p>
* <p>Only positions returned by the break iterator will be title cased,
* characters in between the positions will all be in lower case.</p>
* <p>Casing is dependent on the argument locale and context-sensitive</p>
* <p>NOTE: this implementation is a placeholder. It currently ignores the
* locale and the break iterator and returns the argument string
* unchanged.</p>
* @param locale which string is to be converted in
* @param str source string to be performed on
* @param breakiter break iterator to determine the positions in which
* the character should be title cased.
* @return titlecase version of the argument string (at present the
* argument string itself, as described in the note)
*/
public static String toTitleCase(Locale locale, String str,
BreakIterator breakiter)
{
// Placeholder: neither locale nor breakiter is consulted yet.
return str;
}
/**
* The given character is mapped to its case folding equivalent according to
* UnicodeData.txt and CaseFolding.txt; if the character has no case folding

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterCategory.java $
* $Date: 2001/10/12 23:53:16 $
* $Revision: 1.3 $
* $Date: 2002/02/15 02:53:35 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
@ -26,22 +26,19 @@ package com.ibm.text;
public class UCharacterCategory
{
// private constructor ===================================================
/**
* Private constructor to prevent initialisation
*/
private UCharacterCategory()
{
}
// public variable =======================================================
// public variable -----------------------------------------------------
/**
* Unassigned character type
*/
public static final int UNASSIGNED = 0;
/**
* Character type Cn
* Not Assigned (no characters in [UnicodeData.txt] have this property)
* @draft 2.1
*/
public static final int GENERAL_OTHER_TYPES = 0;
/**
* Character type Lu
*/
public static final int UPPERCASE_LETTER = 1;
@ -163,17 +160,13 @@ public class UCharacterCategory
* Character type Pf
*/
public static final int FINAL_PUNCTUATION = 29;
/**
* Character type Cn
*/
public static final int GENERAL_OTHER_TYPES = 30;
// start of 31 ------------
/**
* Character type count
*/
public static final int CHAR_CATEGORY_COUNT = 31;
public static final int CHAR_CATEGORY_COUNT = 30;
/**
* Gets the name of the argument category
@ -245,4 +238,72 @@ public class UCharacterCategory
}
return "Unassigned";
}
// private constructor -----------------------------------------------
/**
* Private constructor to prevent initialisation
*/
private UCharacterCategory()
{
}
// package private data members --------------------------------------
// The extended types below are numbered directly after the public
// categories, starting at CHAR_CATEGORY_COUNT, so that they can share one
// index space with the public category values.
/**
* Not a character type
*/
static final int NON_CHARACTER_ = CHAR_CATEGORY_COUNT;
/**
* Lead surrogate type
*/
static final int LEAD_SURROGATE_ = CHAR_CATEGORY_COUNT + 1;
/**
* Trail surrogate type
*/
static final int TRAIL_SURROGATE_ = CHAR_CATEGORY_COUNT + 2;
/**
* Extended category count, i.e. the number of public categories plus the
* three extended types above
*/
static final int EXTENDED_CATEGORY_ = CHAR_CATEGORY_COUNT + 3;
/**
* Type names used for extended names.
* The array is indexed by category value: the entries up to
* "final punctuation" follow the public category constants, and the last
* three entries ("noncharacter", "lead surrogate", "trail surrogate") are
* meant to line up with NON_CHARACTER_, LEAD_SURROGATE_ and
* TRAIL_SURROGATE_. The order must be kept in sync with those values.
*/
static final String TYPE_NAMES_[] = {"unassigned",
"uppercase letter",
"lowercase letter",
"titlecase letter",
"modifier letter",
"other letter",
"non spacing mark",
"enclosing mark",
"combining spacing mark",
"decimal digit number",
"letter number",
"other number",
"space separator",
"line separator",
"paragraph separator",
"control",
"format",
"private use area",
"surrogate",
"dash punctuation",
"start punctuation",
"end punctuation",
"connector punctuation",
"other punctuation",
"math symbol",
"currency symbol",
"modifier symbol",
"other symbol",
"initial punctuation",
"final punctuation",
"noncharacter",
"lead surrogate",
"trail surrogate"};
/**
* Unknown type name, for a type value that has no entry in TYPE_NAMES_
*/
static final String UNKNOWN_TYPE_NAME_ = "unknown";
}

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterName.java $
* $Date: 2002/02/08 01:08:38 $
* $Revision: 1.6 $
* $Date: 2002/02/15 02:53:34 $
* $Revision: 1.7 $
*
*******************************************************************************
*/
@ -409,20 +409,29 @@ final class UCharacterName
return null;
}
int tempChoice = choice;
if (tempChoice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
tempChoice = UCharacterNameChoice.U_UNICODE_CHAR_NAME;
}
String result = "";
// Do not write algorithmic Unicode 1.0 names because Unihan names are
// the same as the modern ones, extension A was only introduced with
// Unicode 3.0, and the Hangul syllable block was moved and changed around
// Unicode 1.1.5.
if (choice == UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
if (tempChoice == UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
// try getting algorithmic name first
result = getAlgName(ch);
}
// getting normal character name
if (result == null || result.length() == 0) {
result = getGroupName(ch, choice);
if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
result = getExtendedName(ch);
} else {
result = getGroupName(ch, choice);
}
}
return result;
@ -442,26 +451,42 @@ final class UCharacterName
name == null || name.length() == 0) {
return -1;
}
String uppercasename = UCharacter.toUpperCase(Locale.ENGLISH, name);
// try extended names first
int result = getExtendedChar(name, choice);
if (result >= -1) {
return result;
}
// try algorithmic names first, if fails then try group names
// int result = getAlgorithmChar(choice, uppercasename);
int tempChoice = choice;
if (choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
tempChoice = UCharacterNameChoice.U_UNICODE_CHAR_NAME;
}
// 1.0 has no algorithmic names
String upperCaseName = UCharacter.toUpperCase(Locale.ENGLISH, name);
// try algorithmic names now, 1.0 has no algorithmic names
if (choice != UCharacterNameChoice.U_UNICODE_CHAR_NAME) {
return getGroupChar(uppercasename, choice);
return getGroupChar(upperCaseName, tempChoice);
}
int count = 0;
if (m_algorithm_ != null) {
count = m_algorithm_.length;
}
for (count --; count >= 0; count --) {
int result = m_algorithm_[count].getAlgorithmChar(name);
result = m_algorithm_[count].getAlgorithmChar(name);
if (result >= 0) {
return result;
}
}
return getGroupChar(uppercasename, choice);
result = getGroupChar(upperCaseName, tempChoice);
if (result == -1 &&
choice == UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
result = getGroupChar(upperCaseName,
UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
}
return result;
}
/**
@ -943,4 +968,118 @@ final class UCharacterName
}
return -1;
}
/**
 * Getting the character with extended name of the form
 * &lt;type-XXXX&gt;, where type is one of the entries of
 * UCharacterCategory.TYPE_NAMES_ (compared ignoring case) and XXXX is the
 * code point in hexadecimal.
 * The name is only accepted if the code point lies within the Unicode code
 * space (0..0x10FFFF) and its extended type matches the type in the name.
 * @param name of the character to be found, must not be empty
 * @param choice name choice
 * @return character associated with the name, -1 if such character is not
 *         found and -2 if we should continue with the search.
 */
private int getExtendedChar(String name, int choice)
{
    if (name.charAt(0) != '<') {
        // not an extended name, the caller has to search the other names
        return -2;
    }
    if (choice != UCharacterNameChoice.U_EXTENDED_CHAR_NAME) {
        return -1;
    }
    int endIndex = name.length() - 1;
    if (name.charAt(endIndex) != '>') {
        return -1;
    }
    int startIndex = name.lastIndexOf('-');
    if (startIndex < 0) {
        // no separator, hence no category and no code point
        return -1;
    }
    startIndex ++; // index of the first hex digit
    // Integer.parseInt accepts an explicit '+' sign on current JDKs, an
    // extended name never contains one.
    if (startIndex < endIndex && name.charAt(startIndex) == '+') {
        return -1;
    }
    int result;
    try {
        result = Integer.parseInt(name.substring(startIndex, endIndex),
                                  16);
    }
    catch (NumberFormatException e) {
        // empty or non hexadecimal code point, the name is malformed
        return -1;
    }
    // parseInt can yield values up to 0x7FFFFFFF, anything past the last
    // code point can not be a character.
    final int maxCodePoint = 0x10FFFF;
    if (result > maxCodePoint) {
        return -1;
    }
    // Now validate the category name. We could use a
    // binary search, or a trie, if we really wanted to.
    String type = name.substring(1, startIndex - 1);
    int length = UCharacterCategory.TYPE_NAMES_.length;
    for (int i = 0; i < length; ++ i) {
        if (type.compareToIgnoreCase(
                               UCharacterCategory.TYPE_NAMES_[i]) == 0) {
            if (getType(result) == i) {
                return result;
            }
            // known type name, but the code point is of another type
            break;
        }
    }
    return -1;
}
/**
 * Determines the extended type of a code point. On top of the types
 * returned by UCharacter.getType this distinguishes noncharacters, lead
 * surrogates and trail surrogates.
 * @param ch character to be tested
 * @return extended type it is associated with
 */
private int getType(int ch)
{
    // Code points whose low 16 bits are FFFE or FFFF, and the range
    // U+FDD0..U+FDEF, are reported as noncharacters.
    boolean lowBitsAreFffeOrFfff = (ch & 0xFFFE) == 0xFFFE;
    boolean inFdd0Range = ch >= 0xFDD0 && ch <= 0xFDEF;
    if (lowBitsAreFffeOrFfff || inFdd0Range) {
        return UCharacterCategory.NON_CHARACTER_;
    }
    // Undo ICU exceptions to the UCD when determining the category.
    if (UCharacter.isISOControl(ch)) {
        return UCharacterCategory.CONTROL;
    }
    int category = UCharacter.getType(ch);
    if (category != UCharacterCategory.SURROGATE) {
        return category;
    }
    // split the surrogate category into its lead and trail halves
    return UTF16.isLeadSurrogate((char)ch)
                                   ? UCharacterCategory.LEAD_SURROGATE_
                                   : UCharacterCategory.TRAIL_SURROGATE_;
}
/**
 * Retrieves the extended name of a character.
 * The modern Unicode name is tried first. If that lookup yields null and
 * the character is a control, its Unicode 1.0 name is tried. If there is
 * still no name, one of the form &lt;type-XXXX&gt; is generated, where type
 * is the name of the extended type of the character and XXXX its code
 * point in upper case hexadecimal, zero padded to at least 4 digits.
 * @param ch character to get the extended name for
 * @return extended name of the character
 */
private String getExtendedName(int ch)
{
    String name = getName(ch, UCharacterNameChoice.U_UNICODE_CHAR_NAME);
    if (name != null) {
        return name;
    }
    if (getType(ch) == UCharacterCategory.CONTROL) {
        name = getName(ch, UCharacterNameChoice.U_UNICODE_10_CHAR_NAME);
        if (name != null) {
            return name;
        }
    }
    // No name in the data, generate one from the type and the code point.
    int type = getType(ch);
    // Fall back on "unknown" if the table of type names is not up to date.
    String typeName = type < UCharacterCategory.TYPE_NAMES_.length
                                  ? UCharacterCategory.TYPE_NAMES_[type]
                                  : UCharacterCategory.UNKNOWN_TYPE_NAME_;
    StringBuffer generated = new StringBuffer();
    generated.append('<');
    generated.append(typeName);
    generated.append('-');
    String hex = Integer.toHexString(ch).toUpperCase();
    // pad with leading zeros to at least 4 hex digits
    for (int digits = hex.length(); digits < 4; digits ++) {
        generated.append('0');
    }
    generated.append(hex);
    generated.append('>');
    return generated.toString();
}
}

View File

@ -6,8 +6,8 @@
*
* $Source:
* /usr/cvs/icu4j/icu4j/src/com/ibm/icu/text/UCharacterNameChoiceEnum.java $
* $Date: 2001/03/23 19:51:38 $
* $Revision: 1.2 $
* $Date: 2002/02/15 02:53:35 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
@ -30,5 +30,6 @@ interface UCharacterNameChoice
static final int U_UNICODE_CHAR_NAME = 0;
static final int U_UNICODE_10_CHAR_NAME = 1;
static final int U_CHAR_NAME_CHOICE_COUNT = 2;
static final int U_EXTENDED_CHAR_NAME = 2;
static final int U_CHAR_NAME_CHOICE_COUNT = 3;
}