diff --git a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
index 5bea6b9478..86264792be 100755
--- a/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/test/translit/TransliteratorTest.java,v $
- * $Date: 2001/07/02 19:44:10 $
- * $Revision: 1.38 $
+ * $Date: 2001/07/02 20:54:51 $
+ * $Revision: 1.39 $
  *
  *****************************************************************************************
  */
@@ -795,6 +795,29 @@ public class TransliteratorTest extends TestFmwk {
              "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
     }
 
+    /**
+     * Test the name mapping transliterators: "Any-Name[^abc]" in one
+     * direction and "Name-Any" in the other.
+     */
+    public void TestNameMap() {
+        Transliterator toName = Transliterator.getInstance("Any-Name[^abc]");
+        Transliterator fromName = Transliterator.getInstance("Name-Any");
+
+        // NOTE: the results in icu4j and icu4c are different:
+        //   icu4c: CJK UNIFIED IDEOGRAPH-4E01
+        //   icu4j: CJK UNIFIED IDEOGRAPH-4e01
+
+        String chars = "\u00A0abc\u4E01\u00B5\u0A81\uFFFD\uFFFF";
+        String charsAsNames =
+            "{NO-BREAK SPACE}abc{CJK UNIFIED IDEOGRAPH-4e01}{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}\uFFFF";
+        expect(toName, chars, charsAsNames);
+
+        String names =
+            "{ NO-BREAK SPACE}abc{ CJK UNIFIED IDEOGRAPH-4E01 }{x{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{";
+        String namesAsChars = "\u00A0abc\u4E01{x\u00B5\u0A81\uFFFD{";
+        expect(fromName, names, namesAsChars);
+    }
+
     /**
      * Test the normalization transliterator.
*/
diff --git a/icu4j/src/com/ibm/icu/text/NameUnicodeTransliterator.java b/icu4j/src/com/ibm/icu/text/NameUnicodeTransliterator.java
new file mode 100755
index 0000000000..c31e92fde1
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/NameUnicodeTransliterator.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 1996-2001, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/NameUnicodeTransliterator.java,v $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.1 $
+ */
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that performs name to character mapping.  A name
+ * enclosed in the open and close delimiters is replaced by the
+ * character that UCharacter.getCharFromName() returns for it.
+ * @author Alan Liu
+ */
+public class NameUnicodeTransliterator extends Transliterator {
+
+    char openDelimiter;
+    char closeDelimiter;
+
+    static final String _ID = "Name-Any";
+
+    /**
+     * System registration hook.
+     */
+    static void register() {
+        Transliterator.registerFactory(_ID, new Transliterator.Factory() {
+            public Transliterator getInstance() {
+                return new NameUnicodeTransliterator(null);
+            }
+        });
+    }
+
+    /**
+     * Constructs a transliterator.
+     * @param openDelimiter character that starts a name sequence
+     * @param closeDelimiter character that ends a name sequence
+     * @param filter filter handed to the superclass; may be null
+     */
+    public NameUnicodeTransliterator(char openDelimiter, char closeDelimiter,
+                                     UnicodeFilter filter) {
+        super(_ID, filter);
+        this.openDelimiter = openDelimiter;
+        this.closeDelimiter = closeDelimiter;
+    }
+
+    /**
+     * Constructs a transliterator with the default delimiters '{' and
+     * '}'.
+     */
+    public NameUnicodeTransliterator(UnicodeFilter filter) {
+        this('{', '}', filter);
+    }
+
+    /**
+     * Returns the UTF-16 string for a code point: one char for values up
+     * to U+FFFF, otherwise a high/low surrogate pair.
+     */
+    private static String codePointToString(int ch) {
+        if (ch <= 0xFFFF) {
+            return String.valueOf((char) ch);
+        }
+        int offset = ch - 0x10000;
+        char[] pair = {
+            (char) (0xD800 + (offset >> 10)),
+            (char) (0xDC00 + (offset & 0x3FF))
+        };
+        return new String(pair);
+    }
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.  Scans
+     * [offsets.start, offsets.limit) for delimited names and replaces
+     * each one that resolves to a character; unresolved or malformed
+     * sequences are left untouched.
+     */
+    protected void handleTransliterate(Replaceable text,
+                                       Position offsets, boolean isIncremental) {
+        // Longest name as of 3.0.0 is 83
+        final int LONGEST_NAME = 83;
+
+        // Accommodate the longest possible name plus padding
+        char[] buf = new char[LONGEST_NAME + 8];
+
+        // The only characters used in names are (as of Unicode 3.0.0):
+        //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        // (first character is a space).
+
+        int cursor = offsets.start;
+        int limit = offsets.limit;
+
+        // Modes:
+        // 0 - looking for open delimiter
+        // 1 - after open delimiter
+        int mode = 0;
+        int ibuf = 0;
+        int openPos = offsets.start; // position of openDelimiter
+
+        for (; cursor < limit; ++cursor) {
+            char c = filteredCharAt(text, cursor);
+
+            switch (mode) {
+            case 0: // looking for open delimiter
+                if (c == openDelimiter) {
+                    openPos = cursor;
+                    mode = 1;
+                    ibuf = 0;
+                }
+                break;
+
+            case 1: // after open delimiter
+                // Look for [-a-zA-Z0-9].  If \s+ is found, convert it
+                // to a single space.  If closeDelimiter is found, try
+                // the lookup.  Any other character ends the attempt
+                // and is reprocessed in mode 0.
+                if (UCharacter.isWhitespace(c)) {
+                    // Ignore leading whitespace
+                    if (ibuf != 0 && buf[ibuf-1] != (char)0x0020) {
+                        buf[ibuf++] = (char)0x0020 /* */;
+                        // If we go a bit past the longest possible name then abort
+                        if (ibuf == (LONGEST_NAME + 4)) {
+                            mode = 0;
+                        }
+                    }
+                    continue;
+                }
+
+                if (c == closeDelimiter) {
+                    // Delete trailing space, if any
+                    if (ibuf > 0 && buf[ibuf-1] == (char)0x0020) {
+                        --ibuf;
+                    }
+                    int ch = UCharacter.getCharFromName(new String(buf, 0, ibuf));
+                    if (ch != -1) {
+                        // Lookup succeeded
+                        String str = codePointToString(ch);
+                        text.replace(openPos, cursor+1, str);
+
+                        // Adjust indices for the change in the length of
+                        // the string.  Do not assume that str.length() ==
+                        // 1, in case of surrogates.
+                        int delta = cursor + 1 - openPos - str.length();
+                        cursor -= delta;
+                        limit -= delta;
+                        // Here cursor == openPos + str.length() - 1; the
+                        // loop increment moves it past the replacement.
+                    }
+                    // If the lookup failed, we leave things as-is and
+                    // still switch to mode 0 and continue.
+                    mode = 0;
+                    continue;
+                }
+
+                if (c >= (char)0x0061 && c <= (char)0x007A) {
+                    c -= 0x0020; // [a-z] => [A-Z]
+                }
+
+                // Check if c =~ [-A-Z0-9]
+                if (c == (char)0x002D ||
+                    (c >= (char)0x0041 && c <= (char)0x005A) ||
+                    (c >= (char)0x0030 && c <= (char)0x0039)) {
+                    buf[ibuf++] = (char) c;
+                    // If we go a bit past the longest possible name then abort
+                    if (ibuf == (LONGEST_NAME + 4)) {
+                        mode = 0;
+                    }
+                }
+
+                // Invalid character
+                else {
+                    --cursor; // Backup and reprocess this character
+                    mode = 0;
+                }
+
+                break;
+            }
+        }
+
+        offsets.contextLimit += limit - offsets.limit;
+        offsets.limit = limit;
+        // In incremental mode, only advance the cursor up to the last
+        // open delimiter, if we are in mode 1.
+        offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
+    }
+}
diff --git a/icu4j/src/com/ibm/icu/text/Transliterator.java b/icu4j/src/com/ibm/icu/text/Transliterator.java
index 33c42e8fff..049f62800f 100755
--- a/icu4j/src/com/ibm/icu/text/Transliterator.java
+++ b/icu4j/src/com/ibm/icu/text/Transliterator.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/Transliterator.java,v $
- * $Date: 2001/06/29 22:50:25 $
- * $Revision: 1.35 $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.36 $
  *
  *****************************************************************************************
  */
@@ -240,7 +240,7 @@ import com.ibm.text.resources.ResourceReader;
  *
Copyright © IBM Corporation 1999. All rights reserved.
  *
  * @author Alan Liu
- * @version $RCSfile: Transliterator.java,v $ $Revision: 1.35 $ $Date: 2001/06/29 22:50:25 $
+ * @version $RCSfile: Transliterator.java,v $ $Revision: 1.36 $ $Date: 2001/07/02 20:55:29 $
  */
 public abstract class Transliterator {
     /**
@@ -1100,6 +1100,8 @@ public abstract class Transliterator {
         LowercaseTransliterator.register();
         UppercaseTransliterator.register();
         TitlecaseTransliterator.register();
+        UnicodeNameTransliterator.register();
+        NameUnicodeTransliterator.register();
         NormalizationTransliterator.register();
     }
 
diff --git a/icu4j/src/com/ibm/icu/text/UnicodeNameTransliterator.java b/icu4j/src/com/ibm/icu/text/UnicodeNameTransliterator.java
new file mode 100755
index 0000000000..80071d99fe
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/UnicodeNameTransliterator.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 1996-2001, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/text/UnicodeNameTransliterator.java,v $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.1 $
+ */
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that performs character to name mapping.  Each
+ * character that passes the filter and has a name is replaced by that
+ * name wrapped in the open and close delimiters.
+ * @author Alan Liu
+ */
+public class UnicodeNameTransliterator extends Transliterator {
+
+    char openDelimiter;
+    char closeDelimiter;
+
+    static final String _ID = "Any-Name";
+
+    /**
+     * System registration hook.  Registers a factory for the ID
+     * "Any-Name" that creates instances with a null filter.
+     */
+    static void register() {
+        Transliterator.registerFactory(_ID, new Transliterator.Factory() {
+            public Transliterator getInstance() {
+                return new UnicodeNameTransliterator(null);
+            }
+        });
+    }
+
+    /**
+     * Constructs a transliterator that wraps names in the given
+     * delimiters.
+     */
+    public UnicodeNameTransliterator(char openDelimiter, char closeDelimiter,
+                                     UnicodeFilter filter) {
+        super(_ID, filter);
+        this.openDelimiter = openDelimiter;
+        this.closeDelimiter = closeDelimiter;
+    }
+
+    /**
+     * Constructs a transliterator with the default delimiters '{' and
+     * '}'.
+     */
+    public UnicodeNameTransliterator(UnicodeFilter filter) {
+        this('{', '}', filter);
+    }
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.
+     */
+    protected void handleTransliterate(Replaceable text,
+                                       Position offsets, boolean isIncremental) {
+        final UnicodeFilter filter = getFilter();
+
+        // Reusable buffer; index 0 always holds the open delimiter.
+        final StringBuffer replacement = new StringBuffer();
+        replacement.append(openDelimiter);
+
+        int pos = offsets.start;
+        int end = offsets.limit;
+
+        while (pos < end) {
+            char c = text.charAt(pos);
+            String name = null;
+            if (filter == null || filter.contains(c)) {
+                name = UCharacter.getName(c);
+            }
+            if (name == null) {
+                // Filtered out or nameless: leave the character alone
+                ++pos;
+                continue;
+            }
+
+            replacement.setLength(1);
+            replacement.append(name).append(closeDelimiter);
+            text.replace(pos, pos + 1, replacement.toString());
+
+            int added = replacement.length();
+            pos += added;     // step past the inserted text
+            end += added - 1; // one char was replaced by 'added' chars
+        }
+
+        offsets.contextLimit += end - offsets.limit;
+        offsets.limit = end;
+        offsets.start = pos;
+    }
+}
diff --git a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
index 4404cc7fd3..eb2ae626fd 100755
--- a/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
+++ b/icu4j/src/com/ibm/test/translit/TransliteratorTest.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/test/translit/Attic/TransliteratorTest.java,v $
- * $Date: 2001/07/02 19:44:10 $
- * $Revision: 1.38 $
+ * $Date: 2001/07/02 20:54:51 $
+ * $Revision: 1.39 $
  *
  *****************************************************************************************
  */
@@ -795,6 +795,26 @@
public class TransliteratorTest extends TestFmwk {
              "The Quick Brown FoX Can't Jump Over The LaZy Dogs.");
     }
 
+    /**
+     * Test the name mapping transliterators.
+     */
+    public void TestNameMap() {
+        Transliterator uni2name =
+            Transliterator.getInstance("Any-Name[^abc]");
+        Transliterator name2uni =
+            Transliterator.getInstance("Name-Any");
+
+        /// NOTE NOTE NOTE NOTE NOTE NOTE NOTE
+        // The results in icu4j and icu4c are different:
+        // icu4c: CJK UNIFIED IDEOGRAPH-4E01
+        // icu4j: CJK UNIFIED IDEOGRAPH-4e01
+
+        expect(uni2name, "\u00A0abc\u4E01\u00B5\u0A81\uFFFD\uFFFF",
+               "{NO-BREAK SPACE}abc{CJK UNIFIED IDEOGRAPH-4e01}{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}\uFFFF");
+        expect(name2uni, "{ NO-BREAK SPACE}abc{ CJK UNIFIED IDEOGRAPH-4E01 }{x{MICRO SIGN}{GUJARATI SIGN CANDRABINDU}{REPLACEMENT CHARACTER}{",
+               "\u00A0abc\u4E01{x\u00B5\u0A81\uFFFD{");
+    }
+
     /**
      * Test the normalization transliterator.
      */
diff --git a/icu4j/src/com/ibm/text/NameUnicodeTransliterator.java b/icu4j/src/com/ibm/text/NameUnicodeTransliterator.java
new file mode 100755
index 0000000000..bf66e3d8fe
--- /dev/null
+++ b/icu4j/src/com/ibm/text/NameUnicodeTransliterator.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright (C) 1996-2001, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/NameUnicodeTransliterator.java,v $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.1 $
+ */
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that performs name to character mapping.  A name
+ * enclosed in the open and close delimiters is replaced by the
+ * character that UCharacter.getCharFromName() returns for it.
+ * @author Alan Liu
+ */
+public class NameUnicodeTransliterator extends Transliterator {
+
+    char openDelimiter;
+    char closeDelimiter;
+
+    static final String _ID = "Name-Any";
+
+    /**
+     * System registration hook.
+     */
+    static void register() {
+        Transliterator.registerFactory(_ID, new Transliterator.Factory() {
+            public Transliterator getInstance() {
+                return new NameUnicodeTransliterator(null);
+            }
+        });
+    }
+
+    /**
+     * Constructs a transliterator.
+     * @param openDelimiter character that starts a name sequence
+     * @param closeDelimiter character that ends a name sequence
+     * @param filter filter handed to the superclass; may be null
+     */
+    public NameUnicodeTransliterator(char openDelimiter, char closeDelimiter,
+                                     UnicodeFilter filter) {
+        super(_ID, filter);
+        this.openDelimiter = openDelimiter;
+        this.closeDelimiter = closeDelimiter;
+    }
+
+    /**
+     * Constructs a transliterator with the default delimiters '{' and
+     * '}'.
+     */
+    public NameUnicodeTransliterator(UnicodeFilter filter) {
+        this('{', '}', filter);
+    }
+
+    /**
+     * Returns the UTF-16 string for a code point: one char for values up
+     * to U+FFFF, otherwise a high/low surrogate pair.
+     */
+    private static String codePointToString(int ch) {
+        if (ch <= 0xFFFF) {
+            return String.valueOf((char) ch);
+        }
+        int offset = ch - 0x10000;
+        char[] pair = {
+            (char) (0xD800 + (offset >> 10)),
+            (char) (0xDC00 + (offset & 0x3FF))
+        };
+        return new String(pair);
+    }
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.  Scans
+     * [offsets.start, offsets.limit) for delimited names and replaces
+     * each one that resolves to a character; unresolved or malformed
+     * sequences are left untouched.
+     */
+    protected void handleTransliterate(Replaceable text,
+                                       Position offsets, boolean isIncremental) {
+        // Longest name as of 3.0.0 is 83
+        final int LONGEST_NAME = 83;
+
+        // Accommodate the longest possible name plus padding
+        char[] buf = new char[LONGEST_NAME + 8];
+
+        // The only characters used in names are (as of Unicode 3.0.0):
+        //  -0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ
+        // (first character is a space).
+
+        int cursor = offsets.start;
+        int limit = offsets.limit;
+
+        // Modes:
+        // 0 - looking for open delimiter
+        // 1 - after open delimiter
+        int mode = 0;
+        int ibuf = 0;
+        int openPos = offsets.start; // position of openDelimiter
+
+        for (; cursor < limit; ++cursor) {
+            char c = filteredCharAt(text, cursor);
+
+            switch (mode) {
+            case 0: // looking for open delimiter
+                if (c == openDelimiter) {
+                    openPos = cursor;
+                    mode = 1;
+                    ibuf = 0;
+                }
+                break;
+
+            case 1: // after open delimiter
+                // Look for [-a-zA-Z0-9].  If \s+ is found, convert it
+                // to a single space.  If closeDelimiter is found, try
+                // the lookup.  Any other character ends the attempt
+                // and is reprocessed in mode 0.
+                if (UCharacter.isWhitespace(c)) {
+                    // Ignore leading whitespace
+                    if (ibuf != 0 && buf[ibuf-1] != (char)0x0020) {
+                        buf[ibuf++] = (char)0x0020 /* */;
+                        // If we go a bit past the longest possible name then abort
+                        if (ibuf == (LONGEST_NAME + 4)) {
+                            mode = 0;
+                        }
+                    }
+                    continue;
+                }
+
+                if (c == closeDelimiter) {
+                    // Delete trailing space, if any
+                    if (ibuf > 0 && buf[ibuf-1] == (char)0x0020) {
+                        --ibuf;
+                    }
+                    int ch = UCharacter.getCharFromName(new String(buf, 0, ibuf));
+                    if (ch != -1) {
+                        // Lookup succeeded
+                        String str = codePointToString(ch);
+                        text.replace(openPos, cursor+1, str);
+
+                        // Adjust indices for the change in the length of
+                        // the string.  Do not assume that str.length() ==
+                        // 1, in case of surrogates.
+                        int delta = cursor + 1 - openPos - str.length();
+                        cursor -= delta;
+                        limit -= delta;
+                        // Here cursor == openPos + str.length() - 1; the
+                        // loop increment moves it past the replacement.
+                    }
+                    // If the lookup failed, we leave things as-is and
+                    // still switch to mode 0 and continue.
+                    mode = 0;
+                    continue;
+                }
+
+                if (c >= (char)0x0061 && c <= (char)0x007A) {
+                    c -= 0x0020; // [a-z] => [A-Z]
+                }
+
+                // Check if c =~ [-A-Z0-9]
+                if (c == (char)0x002D ||
+                    (c >= (char)0x0041 && c <= (char)0x005A) ||
+                    (c >= (char)0x0030 && c <= (char)0x0039)) {
+                    buf[ibuf++] = (char) c;
+                    // If we go a bit past the longest possible name then abort
+                    if (ibuf == (LONGEST_NAME + 4)) {
+                        mode = 0;
+                    }
+                }
+
+                // Invalid character
+                else {
+                    --cursor; // Backup and reprocess this character
+                    mode = 0;
+                }
+
+                break;
+            }
+        }
+
+        offsets.contextLimit += limit - offsets.limit;
+        offsets.limit = limit;
+        // In incremental mode, only advance the cursor up to the last
+        // open delimiter, if we are in mode 1.
+        offsets.start = (mode == 1 && isIncremental) ? openPos : cursor;
+    }
+}
diff --git a/icu4j/src/com/ibm/text/Transliterator.java b/icu4j/src/com/ibm/text/Transliterator.java
index 2bf75f8f5c..0a9b6066ba 100755
--- a/icu4j/src/com/ibm/text/Transliterator.java
+++ b/icu4j/src/com/ibm/text/Transliterator.java
@@ -5,8 +5,8 @@
  *******************************************************************************
  *
  * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/Transliterator.java,v $
- * $Date: 2001/06/29 22:50:25 $
- * $Revision: 1.35 $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.36 $
  *
  *****************************************************************************************
  */
@@ -240,7 +240,7 @@ import com.ibm.text.resources.ResourceReader;
  *
Copyright © IBM Corporation 1999. All rights reserved.
  *
  * @author Alan Liu
- * @version $RCSfile: Transliterator.java,v $ $Revision: 1.35 $ $Date: 2001/06/29 22:50:25 $
+ * @version $RCSfile: Transliterator.java,v $ $Revision: 1.36 $ $Date: 2001/07/02 20:55:29 $
  */
 public abstract class Transliterator {
     /**
@@ -1100,6 +1100,8 @@ public abstract class Transliterator {
         LowercaseTransliterator.register();
         UppercaseTransliterator.register();
         TitlecaseTransliterator.register();
+        UnicodeNameTransliterator.register();
+        NameUnicodeTransliterator.register();
         NormalizationTransliterator.register();
     }
 
diff --git a/icu4j/src/com/ibm/text/UnicodeNameTransliterator.java b/icu4j/src/com/ibm/text/UnicodeNameTransliterator.java
new file mode 100755
index 0000000000..5e1e999ca0
--- /dev/null
+++ b/icu4j/src/com/ibm/text/UnicodeNameTransliterator.java
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 1996-2001, International Business Machines Corporation and
+ * others. All Rights Reserved.
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/Attic/UnicodeNameTransliterator.java,v $
+ * $Date: 2001/07/02 20:55:29 $
+ * $Revision: 1.1 $
+ */
+package com.ibm.text;
+import java.util.*;
+
+/**
+ * A transliterator that performs character to name mapping.  Each
+ * character that passes the filter and has a name is replaced by that
+ * name wrapped in the open and close delimiters.
+ * @author Alan Liu
+ */
+public class UnicodeNameTransliterator extends Transliterator {
+
+    char openDelimiter;
+    char closeDelimiter;
+
+    static final String _ID = "Any-Name";
+
+    /**
+     * System registration hook.  Registers a factory for the ID
+     * "Any-Name" that creates instances with a null filter.
+     */
+    static void register() {
+        Transliterator.registerFactory(_ID, new Transliterator.Factory() {
+            public Transliterator getInstance() {
+                return new UnicodeNameTransliterator(null);
+            }
+        });
+    }
+
+    /**
+     * Constructs a transliterator that wraps names in the given
+     * delimiters.
+     */
+    public UnicodeNameTransliterator(char openDelimiter, char closeDelimiter,
+                                     UnicodeFilter filter) {
+        super(_ID, filter);
+        this.openDelimiter = openDelimiter;
+        this.closeDelimiter = closeDelimiter;
+    }
+
+    /**
+     * Constructs a transliterator with the default delimiters '{' and
+     * '}'.
+     */
+    public UnicodeNameTransliterator(UnicodeFilter filter) {
+        this('{', '}', filter);
+    }
+
+    /**
+     * Implements {@link Transliterator#handleTransliterate}.
+     */
+    protected void handleTransliterate(Replaceable text,
+                                       Position offsets, boolean isIncremental) {
+        final UnicodeFilter filter = getFilter();
+
+        // Reusable buffer; index 0 always holds the open delimiter.
+        final StringBuffer replacement = new StringBuffer();
+        replacement.append(openDelimiter);
+
+        int pos = offsets.start;
+        int end = offsets.limit;
+
+        while (pos < end) {
+            char c = text.charAt(pos);
+            String name = null;
+            if (filter == null || filter.contains(c)) {
+                name = UCharacter.getName(c);
+            }
+            if (name == null) {
+                // Filtered out or nameless: leave the character alone
+                ++pos;
+                continue;
+            }
+
+            replacement.setLength(1);
+            replacement.append(name).append(closeDelimiter);
+            text.replace(pos, pos + 1, replacement.toString());
+
+            int added = replacement.length();
+            pos += added;     // step past the inserted text
+            end += added - 1; // one char was replaced by 'added' chars
+        }
+
+        offsets.contextLimit += end - offsets.limit;
+        offsets.limit = end;
+        offsets.start = pos;
+    }
+}