/** ******************************************************************************* * Copyright (C) 1996-2001, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/utility/UTF32.java,v $ * $Date: 2001/08/31 00:19:16 $ * $Revision: 1.2 $ * ******************************************************************************* */ package com.ibm.text.utility; /** * Utility class for demonstrating UTF16 character conversions and indexing conversions. * Ideally, these methods would be on existing classes in Java, but they can also be used * in a stand-alone utility class like this one. *
Code that uses strings alone rarely need modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe operation.
* Similarly, concatenation is always safe. Substringing is safe if the start and end are both
* on UTF32 boundaries. In normal code, the values for start and end are on those boundaries,
* since they arose from operations like searching.
* If not, the nearest UTF-32 boundaries can be determined using bounds32()
.
*
Here is a summary of the methods: *
char32At()
, count32()
, and append32()
* are most important methods for most programs.
* They are used for iteration, filtering and copying. See the examples below.
* bounds32()
is useful for finding the nearest UTF-32 boundaries.
* However, in most circumstances it is better to use
*
* BreakIterator.getCharacterInstance(Locale) to find character boundaries
* that are closer to end-user expectations.
* valueOf32()
is occasionally convenient for producing a string containing a UTF-32 value.
* findOffset16()
and findOffset32()
are generally not needed,
* except when interfacing to specifications that use UTF-32 indices (such as XSL).
* isLegal()
can be used to test whether UTF-16 or UTF-32 values are valid.
* isLeadSurrogate()
, isSurrogate()
, and isTrailSurrogate()
* test the type of a char. They are useful for lower-level code.
* getChar32()
, getLead()
, and getTrail()
* are sometimes useful for putting together and taking apart UTF-32 values.
* The following examples illustrate use of some of these methods.
// iteration forwards: Original for (int i = 0; i < s.length(); ++i) { char ch = s.charAt(i); doSomethingWith(ch); } // iteration forwards: Changes for UTF-32 int ch; for (int i = 0; i < s.length(); i+=UTF32.count16(ch)) { ch = UTF32.char32At(s,i); doSomethingWith(ch); } // iteration backwards: Original for (int i = s.length()-1; i >= 0; --i) { char ch = s.charAt(i); doSomethingWith(ch); } // iteration backwards: Changes for UTF-32 int ch; for (int i = s.length()-1; i > 0; i-=UTF32.count16(ch)) { ch = UTF32.char32At(s,i); doSomethingWith(ch); } ** Notes: *
Lead
and Trail
in the API,
* which gives a better sense of their ordering in a string. offset16
and offset32
are used to distinguish
* offsets to UTF-16 boundaries vs offsets to UTF-32 boundaries.
* int char32
is used to contain UTF-32 characters, as opposed to char
, which is a UTF-16 code unit.
* bounds(string, offset16) != TRAIL
.
* isLegal()
can be used to check for validity if desired.
* char32
contains an out-of-bounds UTF-32 value,
* then it is treated as REPLACEMENT_CHAR for consistency across the API.
* bounds32()
.
*/
public static final int SINGLE = 1, LEAD = 2, TRAIL = 5;
/**
* Determines how many chars this char32 requires.
* If a validity check is required, use isLegal()
* on char32 before calling.
* If this were integrated into the Java API, it could be a static method of either Character or String.
* @return 2 if is in surrogate space, otherwise 1.
* @param ch the input character.
*/
public static int count16(int char32) {
if (char32 < MIN_SUPPLEMENTARY) return 1;
return 2;
}
/**
* Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with count16()
, as well as random access.
* If a validity check is required, use isLegal()
on the return value.
*
If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator.
* @return UTF-32 value for the UTF-32 value that contains the char at offset16.
* The boundaries of that codepoint are the same as in bounds32()
.
* @param source array of UTF-16 chars
* @param offset16 UTF-16 offset to the start of the character.
*/
public static int char32At(String source, int offset16) {
char single = source.charAt(offset16);
if (!isSurrogate(single)) return single;
try { // use exception to catch out-of-bounds
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is low,
// look both directions.
if (isLeadSurrogate(single)) {
char trail = source.charAt(++offset16);
if (isTrailSurrogate(trail)) {
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
}
} else { // isTrailSurrogate(single), so
char lead = source.charAt(--offset16);
if (isLeadSurrogate(lead)) {
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
}
}
} catch (StringIndexOutOfBoundsException e) {}
return single; // return unmatched surrogate
}
public static int char32At(StringBuffer source, int offset16) {
char single = source.charAt(offset16);
if (!isSurrogate(single)) return single;
try { // use exception to catch out-of-bounds
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is low,
// look both directions.
if (isLeadSurrogate(single)) {
char trail = source.charAt(++offset16);
if (isTrailSurrogate(trail)) {
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
}
} else { // isTrailSurrogate(single), so
char lead = source.charAt(--offset16);
if (isLeadSurrogate(lead)) {
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
}
}
} catch (StringIndexOutOfBoundsException e) {}
return single; // return unmatched surrogate
}
public static int char32At(char[] source, int start16, int end16, int offset16) {
if (offset16 < start16 || offset16 >= end16) {
throw new ArrayIndexOutOfBoundsException(offset16);
}
char single = source[offset16];
if (!isSurrogate(single)) return single;
try { // use exception to catch out-of-bounds
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is low,
// look both directions.
if (isLeadSurrogate(single)) {
++offset16;
if (offset16 >= end16) return single;
char trail = source[offset16];
if (isTrailSurrogate(trail)) {
return ((int)single << SURROGATE_SHIFT) + trail + SURROGATE_OFFSET;
}
} else { // isTrailSurrogate(single), so
char lead = source[--offset16];
if (isLeadSurrogate(lead)) {
return ((int)lead << SURROGATE_SHIFT) + single + SURROGATE_OFFSET;
}
}
} catch (ArrayIndexOutOfBoundsException e) {}
return single; // return unmatched surrogate
}
// moral equivalent of valueOf32(charAt32(x)), but no memory alloc
public static String getCodePointSubstring(String s, int offset16) {
switch(bounds32(s,offset16)) {
default: return s.substring(offset16,offset16+1);
case LEAD: return s.substring(offset16,offset16+2);
case TRAIL: return s.substring(offset16-1,offset16+1);
}
}
// moral equivalent of valueOf32(charAt32(x)), but no memory alloc
public static String getCodePointSubstring(StringBuffer s, int offset16) {
switch(bounds32(s,offset16)) {
default: return s.substring(offset16,offset16+1);
case LEAD: return s.substring(offset16,offset16+2);
case TRAIL: return s.substring(offset16-1,offset16+1);
}
}
public static int append32(char[] output, int oPosition, int oEnd, int cp) {
if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
output[oPosition++] = UTF32.getLead(cp);
if (UTF32.count16(cp) != 1) {
if (oPosition >= oEnd) throw new ArrayIndexOutOfBoundsException(oPosition);
output[oPosition++] = UTF32.getTrail(cp);
}
return oPosition;
}
public static void setChar32At(StringBuffer b, int position, int codePoint) {
int type = bounds32(b, position);
// handle simple cases: #chars at position match #chars in codePoint
int end = position;
switch (type) {
case SINGLE:
if (isSupplementary(codePoint)) break;
b.setCharAt(position, (char)codePoint);
return;
case LEAD:
if (!isSupplementary(codePoint)) {
++end;
break;
}
b.setCharAt(position++, (char)getLead(codePoint));
b.setCharAt(position, (char)getTrail(codePoint));
return;
case TRAIL:
if (!isSupplementary(codePoint)) {
--position;
break;
}
b.setCharAt(position++, (char)getLead(codePoint));
b.setCharAt(position, (char)getTrail(codePoint));
return;
}
// mismatch, just use long form
b.replace(position, end+1, valueOf32(codePoint));
}
/**
* See if a char value is legal. It can't be:
*
If this were integrated into the Java API, it could be a static method of String or Character. * @param UTF-32 value to test * @return true iff legal. */ public static boolean isLegal(char char16) { return (char16 < 0xFFFE); } /** * See if a UTF32 value is legal. It can't be: *
If this were integrated into the Java API, it could be a static method of String or Character. * @param char32 UTF-32 value to test * @return true iff legal. */ public static boolean isLegal(int char32) { if (char32 < 0) return false; //if (char32 < SURROGATE_BASE) return true; //if (char32 < SURROGATE_LIMIT) return false; if ((char32 & PLANE_MASK) >= NON_CHARACTER_BASE) return false; return (char32 <= MAX_UNICODE); } /** * Determines whether the code unit OR code point is a surrogate. *
If this were integrated into the Java API, it could be a static method of String or Character. * @return true iff the input character is a surrogate. * @param ch the input character. */ public static boolean isSurrogate(int char32) { return (SURROGATE_BASE <= char32 && char32 < SURROGATE_LIMIT); } /** * Determines whether the code point is a supplementary. *
If this were integrated into the Java API, it could be a static method of String or Character. * @return true iff the input character is a surrogate. * @param ch the input character. */ public static boolean isSupplementary(int char32) { return (char32 >= MIN_SUPPLEMENTARY && char32 <= MAX_UNICODE); } /** * Determines whether the code point is a supplementary. *
If this were integrated into the Java API, it could be a static method of String or Character. * @return true iff the input character is a surrogate. * @param ch the input character. */ public static boolean isBasic(int char32) { return (char32 >= 0 && char32 < MIN_SUPPLEMENTARY); } /** * Determines whether the character is a trail surrogate. *
If this were integrated into the Java API, it could be a static method of String or Character. * @return true iff the input character is a trail surrogate. * @param ch the input character. */ public static boolean isTrailSurrogate(char ch) { return (TRAIL_BASE <= ch && ch < TRAIL_LIMIT); } /** * Determines whether the character is a lead surrogate. *
If this were integrated into the Java API, it could be a static method of String or Character.
* @return true iff the input character is a lead surrogate.
* @param ch the input character.
*/
public static boolean isLeadSurrogate(char ch) {
return (LEAD_BASE <= ch && ch < LEAD_LIMIT);
}
/**
* Returns the lead surrogate.
* If a validity check is required, use isLegal()
on char32 before calling.
*
If this were integrated into the Java API, it could be a static method of String or Character.
* @return lead surrogate if the count16(ch) is 2;
*
otherwise the character itself
* @param char32 the input character.
*/
public static char getLead(int char32) {
if (char32 >= MIN_SUPPLEMENTARY) {
return (char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT));
}
return (char)char32;
}
/**
* Returns the trail surrogate.
* If a validity check is required, use isLegal()
on char32 before calling.
*
If this were integrated into the Java API, it could be a static method of String or Character.
* @return the trail surrogate if the count16(ch) is 2;
*
and 0 otherwise (note: 0 is not a valid lead surrogate).
* @param char32 the input character.
*/
public static char getTrail(int char32) {
if (char32 >= MIN_SUPPLEMENTARY) {
return (char)(TRAIL_BASE + (char32 & TRAIL_MASK));
}
return '\u0000';
}
/**
* Convenience method corresponding to String.valueOf(char). It returns a one or two char string containing
* the UTF-32 value. If the input value can't be converted, it substitutes REPLACEMENT_CHAR.
* If a validity check is required, use isLegal()
before calling.
*
If this were integrated into the Java API, it could be a static method of String.
* @return string value of char32
* @param ch the input character.
*/
public static String valueOf32(int char32) {
if (char32 < 0 || MAX_UNICODE < char32) return String.valueOf(REPLACEMENT_CHAR);
if (char32 < MIN_SUPPLEMENTARY) return String.valueOf((char)char32);
synchronized (buf2) { // saves allocations
buf2[0] = (char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT));
buf2[1] = (char)(TRAIL_BASE + (char32 & TRAIL_MASK));
return String.valueOf(buf2);
}
}
private static char[] buf2 = new char[2]; // used to avoid allocations
/**
* Returns the UTF-32 character corresponding to the two chars.
* If a validity check is required, check the arguments with
* isLeadSurrogate()
and isTrailSurrogate()
, respectively before calling.
*
If this were integrated into the Java API, it could be a static method of String or Character. * @return the UTF-32 character, or REPLACEMENT_CHAR if invalid. * @param lead the lead char * @param lead the trail char */ public static int getChar32(char lead, char trail) { if (isLeadSurrogate(lead) && isTrailSurrogate(trail)) { return (lead <<= SURROGATE_SHIFT) + trail + SURROGATE_OFFSET; } return REPLACEMENT_CHAR; } /** * Returns the type of the UTF32 boundaries around the char at offset16. * Used for random access. *
If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator. * @return SINGLE, FIRST, or SECOND: *
If this were integrated into the Java API, it could be a method of String, StringBuffer and possibly CharacterIterator. * @return UTF-16 offset * @param offset32 UTF-32 offset * @param source text to analyse * @exception StringIndexOutOfBoundsException if offset32 is out of bounds. */ public static int findOffset16(String source, int offset32) { int remaining = offset32; // for decrementing boolean hadLeadSurrogate = false; int i; for (i = 0; remaining > 0 && i < source.length(); ++i) { char ch = source.charAt(i); if (hadLeadSurrogate && isTrailSurrogate(ch)) { hadLeadSurrogate = false; // count valid trail as zero } else { hadLeadSurrogate = isLeadSurrogate(ch); --remaining; // count others as 1 } } // if we didn't use up all of remaining (or if we started < 0) // then it is beyond the bounds if (remaining != 0) throw new StringIndexOutOfBoundsException(offset32); // special check for last surrogate if needed, for consistency with // other situations if (hadLeadSurrogate && i < source.length() && isTrailSurrogate(source.charAt(i))) { ++i; // grab extra unicode } return i; } /** * Returns the UTF-32 offset corresponding to the first UTF-32 boundary at or after the given UTF-16 offset. * Used for random access. See the class description * for notes on roundtripping. * Note: If the UTF-16 offset is into the middle of a surrogate pair, then * the UTF-32 offset of the end of the pair is returned. *
To find the UTF-32 length of a string, use: *
* len32 = getOffset32(source, source.length()); **
If this were integrated into the Java API, it could be a methods of String, StringBuffer and possibly CharacterIterator.
* @return UTF-32 offset
* @param source text to analyse
* @param offset16 UTF-16 offset
* @exception StringIndexOutOfBoundsException if offset16 is out of bounds.
*/
public static int findOffset32(String source, int offset16) {
int result = 0;
boolean hadLeadSurrogate = false;
for (int i = 0; i < offset16; ++i) {
char ch = source.charAt(i);
if (hadLeadSurrogate && isTrailSurrogate(ch)) {
hadLeadSurrogate = false; // count valid trail as zero
} else {
hadLeadSurrogate = isLeadSurrogate(ch);
++result; // count others as 1
}
}
return result;
}
public static int length32(String source) {
return findOffset32(source, source.length());
}
/**
* Append a single UTF-32 value to the end of a StringBuffer.
* If a validity check is required, use isLegal()
on char32 before calling.
*
If this were integrated into the Java API, it could be a method of StringBuffer. * @param char32 value to append. If out of bounds, substitutes REPLACEMENT_CHAR. * @param target string to add to */ public static void append32(StringBuffer target, int char32) { // Check for irregular values if (char32 < 0 || char32 > MAX_UNICODE) char32 = REPLACEMENT_CHAR; // Write the UTF-16 values if (char32 >= MIN_SUPPLEMENTARY) { target.append((char)(LEAD_BASE_OFFSET + (char32 >> SURROGATE_SHIFT))); target.append((char)(TRAIL_BASE + (char32 & TRAIL_MASK))); } else { target.append((char)char32); } } /** * Compare strings using Unicode code point order, instead of UTF-16 code unit order. */ public static final class StringComparator implements java.util.Comparator { /** * Standard String compare. Only one small section is different, marked in the code. */ public int compare(Object a, Object b) { if (a == b) { return 0; } if (a == null) { return -1; } else if (b == null) { return 1; } String sa = (String) a; String sb = (String) b; int lena = sa.length(); int lenb = sb.length(); int len = lena; if (len > lenb) len = lenb; for (int i = 0; i < len; ++i) { char ca = sa.charAt(i); char cb = sb.charAt(i); if (ca == cb) continue; // skip remap if equal // start of only different section if (ca >= 0xD800) { // reshuffle to get right codepoint order ca += (ca < 0xE000) ? 0x2000 : -0x800; } if (cb >= 0xD800) { // reshuffle to get right codepoint order cb += (cb < 0xE000) ? 0x2000 : -0x800; } // end of only different section if (ca < cb) return -1; return 1; // wasn't equal, so return 1 } if (lena < lenb) return -1; if (lena > lenb) return 1; return 0; } } // =========================================================== // PRIVATES // =========================================================== /** * Prevent instance from being created. */ private UTF32() {} /** * Maximum code point values for UTF-32. */ private static final int MAX_UNICODE = 0x10FFFF; /** * Maximum values for Basic code points (BMP). */ private static final int MAX_BASIC = 0xFFFF; /** * Minimum value for Supplementary code points (SMP). */ private static final int MIN_SUPPLEMENTARY = 0x10000; /** * Used to mask off single plane in checking for NON_CHARACTER */ private static final int PLANE_MASK = 0xFFFF; /** * Range of non-characters in each plane */ private static final int NON_CHARACTER_BASE = 0xFFFE, NON_CHARACTER_END = 0xFFFF; // useful statics and tables for fast lookup /** * Values for surrogate detection. X is a surrogate iff X & SURROGATE_MASK == SURROGATE_MASK. */ static final int SURROGATE_MASK = 0xD800; /** * Bottom 10 bits for use in surrogates. */ private static final int TRAIL_MASK = 0x3FF; /** * Shift value for surrogates. */ private static final int SURROGATE_SHIFT = 10; /** * Lead surrogates go from LEAD_BASE up to LEAD_LIMIT-1. */ private static final int LEAD_BASE = 0xD800, LEAD_LIMIT = 0xDC00; /** * Trail surrogates go from TRAIL_BASE up to TRAIL_LIMIT-1. */ private static final int TRAIL_BASE = 0xDC00, TRAIL_LIMIT = 0xE000; /** * Surrogates go from SURROGATE_BASE up to SURROGATE_LIMIT-1. */ private static final int SURROGATE_BASE = 0xD800, SURROGATE_LIMIT = 0xE000; /** * Any codepoint at or greater than SURROGATE_SPACE_BASE requires 2 16-bit code units. */ //private static final int SURROGATE_SPACE_BASE = 0x10000; /** * Offset to add to combined surrogate pair to avoid masking. */ private static final int SURROGATE_OFFSET = MIN_SUPPLEMENTARY - (LEAD_BASE << SURROGATE_SHIFT) - TRAIL_BASE; private static final int LEAD_BASE_OFFSET = LEAD_BASE - (MIN_SUPPLEMENTARY >> SURROGATE_SHIFT); };