diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java index f746f3a44b..6b89b62a41 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java @@ -1,6 +1,6 @@ /** ******************************************************************************* -* Copyright (C) 2005-2012, International Business Machines Corporation and * +* Copyright (C) 2005-2013, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ @@ -12,6 +12,7 @@ import java.io.Reader; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.List; /** @@ -187,10 +188,14 @@ public class CharsetDetector { // Iterate over all possible charsets, remember all that // give a match quality > 0. - for (CharsetRecognizer csr: fCSRecognizers) { - CharsetMatch m = csr.match(this); - if (m != null) { - matches.add(m); + for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { + CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); + boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled; + if (active) { + CharsetMatch m = rcinfo.recognizer.match(this); + if (m != null) { + matches.add(m); + } } } Collections.sort(matches); // CharsetMatch compares on confidence @@ -278,17 +283,28 @@ public class CharsetDetector { /** - * Get the names of all char sets that can be recognized by the char set detector. + * Get the names of all charsets supported by CharsetDetector class. + *

+ * Note: Multiple different charset encodings in the same family may use + * a single shared name in this implementation. For example, this method returns + * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" + * (Windows Latin 1). However, the actual detection result could be "windows-1252" + * when the input data matches Latin 1 code points with any points only available + * in "windows-1252". * - * @return an array of the names of all charsets that can be recognized - * by the charset detector. + * @return an array of the names of all charsets supported by + * CharsetDetector class. * * @stable ICU 3.4 */ public static String[] getAllDetectableCharsets() { - return fCharsetNames; - } - + String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()]; + for (int i = 0; i < allCharsetNames.length; i++) { + allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName(); + } + return allCharsetNames; + } + /** * Test whether or not input filtering is enabled. * @@ -420,12 +436,8 @@ public class CharsetDetector { false; String fDeclaredEncoding; - - - // - // Stuff private to CharsetDetector - // + byte[] fRawInput; // Original, untouched input bytes. // If user gave us a byte array, this is it. // If user gave us a stream, it's read to a @@ -435,71 +447,136 @@ public class CharsetDetector { InputStream fInputStream; // User's input stream, or null if the user // gave us a byte array. - boolean fStripTags = // If true, setText() will strip tags from input text. + // + // Stuff private to CharsetDetector + // + private boolean fStripTags = // If true, setText() will strip tags from input text. false; - - + + private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers has + // been changed from the default. The array index + // corresponds to ALL_CS_RECOGNIZERS. See setDetectableCharset(). 
+ + private static class CSRecognizerInfo { + CharsetRecognizer recognizer; + boolean isDefaultEnabled; + + CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) { + this.recognizer = recognizer; + this.isDefaultEnabled = isDefaultEnabled; + } + } + /* * List of recognizers for all charsets known to the implementation. */ - private static ArrayList fCSRecognizers = createRecognizers(); - private static String [] fCharsetNames; - - /* - * Create the singleton instances of the CharsetRecognizer classes + private static final List ALL_CS_RECOGNIZERS; + + static { + List list = new ArrayList(); + + list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true)); + + list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true)); + + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true)); + list.add(new CSRecognizerInfo(new 
CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true)); + + // IBM 420/424 recognizers are disabled by default + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false)); + list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false)); + + ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list); + } + + /** + * Get the names of charsets that can be recognized by this CharsetDetector instance. + * + * @return an array of the names of charsets that can be recognized by this CharsetDetector + * instance. + * + * @internal + * @deprecated This API is ICU internal only. 
*/ - private static ArrayList createRecognizers() { - ArrayList recognizers = new ArrayList(); - - recognizers.add(new CharsetRecog_UTF8()); - - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE()); - recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE()); - - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN()); - recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr()); - recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr()); - - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr()); - recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl()); - recognizers.add(new 
CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr()); - - // Create an array of all charset names, as a side effect. - // Needed for the getAllDetectableCharsets() API. - String[] charsetNames = new String [recognizers.size()]; - int out = 0; - - for (int i = 0; i < recognizers.size(); i++) { - String name = recognizers.get(i).getName(); - - if (out == 0 || ! name.equals(charsetNames[out - 1])) { - charsetNames[out++] = name; + public String[] getDetectableCharsets() { + List csnames = new ArrayList(ALL_CS_RECOGNIZERS.size()); + for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { + CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i); + boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i]; + if (active) { + csnames.add(rcinfo.recognizer.getName()); } } - - fCharsetNames = new String[out]; - System.arraycopy(charsetNames, 0, fCharsetNames, 0, out); - - return recognizers; + return csnames.toArray(new String[csnames.size()]); + } + + /** + * Enable or disable individual charset encoding. + * A name of charset encoding must be included in the names returned by + * {@link #getAllDetectableCharsets()}. + * + * @param encoding the name of charset encoding. + * @param enabled true to enable, or false to disable the + * charset encoding. + * @return A reference to this CharsetDetector. + * @throws IllegalArgumentException when the name of charset encoding is + * not supported. + * + * @internal + * @deprecated This API is ICU internal only. 
+ */ + public CharsetDetector setDetectableCharset(String encoding, boolean enabled) { + int modIdx = -1; + boolean isDefaultVal = false; + for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { + CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i); + if (csrinfo.recognizer.getName().equals(encoding)) { + modIdx = i; + isDefaultVal = (csrinfo.isDefaultEnabled == enabled); + break; + } + } + if (modIdx < 0) { + // No matching encoding found + throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\""); + } + + if (fEnabledRecognizers == null && !isDefaultVal) { + // Create an array storing the non default setting + fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()]; + + // Initialize the array with default info + for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) { + fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled; + } + } + + if (fEnabledRecognizers != null) { + fEnabledRecognizers[modIdx] = enabled; + } + + return this; } } diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java index 24321073a3..e051ea25ac 100644 --- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java +++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java @@ -783,10 +783,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { public String getName() { - // return "ISO-8859-8-I"; - // ICU4C returns ISO-8859-8-I - // Ticket #9364 to resolve the difference. - return "ISO-8859-8"; + return "ISO-8859-8-I"; } public String getLanguage() @@ -796,9 +793,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer { public CharsetMatch match(CharsetDetector det) { - // ICU4C returns ISO-8859-8-I - // Ticket #9364 to resolve the difference. - String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8"; + String name = det.fC1Bytes ? 
"windows-1255" : "ISO-8859-8-I"; int confidence = match(det, ngrams, byteMap); return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he"); } diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml index 9dcbf13af2..b281e8c56d 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml @@ -1,6 +1,6 @@ - + @@ -118,7 +118,7 @@ - + אירופה, תוכנה והאינטרנט: @@ -548,4 +548,4 @@ Conference Program şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar. - \ No newline at end of file + diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java index fb6f7b843d..0c21d2c0e5 100644 --- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java +++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java @@ -98,7 +98,38 @@ public class TestCharsetDetector extends TestFmwk CheckAssert(charsetNames[i].equals("") == false); // System.out.println("\"" + charsetNames[i] + "\""); } - } + + final String[] defDisabled = { + "IBM420_rtl", "IBM420_ltr", + "IBM424_rtl", "IBM424_ltr" + }; + String[] activeCharsetNames = det.getDetectableCharsets(); + for (String cs : activeCharsetNames) { + // the charset must be included in all list + boolean found = false; + for (String cs0 : charsetNames) { + if (cs0.equals(cs)) { + found = true; + break; + } + } + if (!found) { + errln(cs + " is not included in the all charset list." 
); + } + + // some charsets are disabled by default + found = false; + for (String cs1 : defDisabled) { + if (cs1.equals(cs)) { + found = true; + break; + } + } + if (found) { + errln(cs + " should not be included in the default charset list."); + } + } + } public void TestInputFilter() throws Exception { @@ -484,6 +515,10 @@ public class TestCharsetDetector extends TestFmwk "\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629."; CharsetDetector det = new CharsetDetector(); + det.setDetectableCharset("IBM424_rtl", true); + det.setDetectableCharset("IBM424_ltr", true); + det.setDetectableCharset("IBM420_rtl", true); + det.setDetectableCharset("IBM420_ltr", true); CharsetMatch m; String charsetMatch; byte[] bytes; @@ -603,7 +638,7 @@ public class TestCharsetDetector extends TestFmwk CharsetMatch m = _test1255(s); String charsetMatch = m.getName(); - CheckAssert(charsetMatch.equals("ISO-8859-8")); + CheckAssert(charsetMatch.equals("ISO-8859-8-I")); CheckAssert(m.getLanguage().equals("he")); m = _test1255_reverse(s); @@ -654,6 +689,10 @@ public class TestCharsetDetector extends TestFmwk private CharsetMatch _testIBM424_he_rtl(String s) throws Exception { byte [] bytes = s.getBytes("IBM424"); CharsetDetector det = new CharsetDetector(); + det.setDetectableCharset("IBM424_rtl", true); + det.setDetectableCharset("IBM424_ltr", true); + det.setDetectableCharset("IBM420_rtl", true); + det.setDetectableCharset("IBM420_ltr", true); det.setText(bytes); CharsetMatch m = det.detect(); return m; @@ -669,6 +708,10 @@ public class TestCharsetDetector extends TestFmwk byte [] bytes = ltrStrBuf.toString().getBytes("IBM424"); CharsetDetector det = new CharsetDetector(); + det.setDetectableCharset("IBM424_rtl", true); + det.setDetectableCharset("IBM424_ltr", true); + det.setDetectableCharset("IBM420_rtl", true); + det.setDetectableCharset("IBM420_ltr", true); det.setText(bytes); CharsetMatch m = det.detect(); return m;