diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
index f746f3a44b..6b89b62a41 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetDetector.java
@@ -1,6 +1,6 @@
/**
*******************************************************************************
-* Copyright (C) 2005-2012, International Business Machines Corporation and *
+* Copyright (C) 2005-2013, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@@ -12,6 +12,7 @@ import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
+import java.util.List;
/**
@@ -187,10 +188,14 @@ public class CharsetDetector {
// Iterate over all possible charsets, remember all that
// give a match quality > 0.
- for (CharsetRecognizer csr: fCSRecognizers) {
- CharsetMatch m = csr.match(this);
- if (m != null) {
- matches.add(m);
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
+ boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
+ if (active) {
+ CharsetMatch m = rcinfo.recognizer.match(this);
+ if (m != null) {
+ matches.add(m);
+ }
}
}
Collections.sort(matches); // CharsetMatch compares on confidence
@@ -278,17 +283,28 @@ public class CharsetDetector {
/**
- * Get the names of all char sets that can be recognized by the char set detector.
+ * Get the names of all charsets supported by <code>CharsetDetector</code> class.
+ *
+ * Note: Multiple different charset encodings in the same family may use
+ * a single shared name in this implementation. For example, this method returns
+ * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
+ * (Windows Latin 1). However, actual detection result could be "windows-1252"
+ * when the input data matches Latin 1 code points with any points only available
+ * in "windows-1252".
*
- * @return an array of the names of all charsets that can be recognized
- * by the charset detector.
+ * @return an array of the names of all charsets supported by
+ * <code>CharsetDetector</code> class.
*
* @stable ICU 3.4
*/
public static String[] getAllDetectableCharsets() {
- return fCharsetNames;
- }
-
+ String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
+ for (int i = 0; i < allCharsetNames.length; i++) {
+ allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
+ }
+ return allCharsetNames;
+ }
+
/**
* Test whether or not input filtering is enabled.
*
@@ -420,12 +436,8 @@ public class CharsetDetector {
false;
String fDeclaredEncoding;
-
-
- //
- // Stuff private to CharsetDetector
- //
+
byte[] fRawInput; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
// If user gave us a stream, it's read to a
@@ -435,71 +447,136 @@ public class CharsetDetector {
InputStream fInputStream; // User's input stream, or null if the user
// gave us a byte array.
- boolean fStripTags = // If true, setText() will strip tags from input text.
+ //
+ // Stuff private to CharsetDetector
+ //
+ private boolean fStripTags = // If true, setText() will strip tags from input text.
false;
-
-
+
+ private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had
+ // been changed from the default. The array index
+ // corresponds to ALL_CS_RECOGNIZERS. See setDetectableCharset().
+
+ private static class CSRecognizerInfo {
+ CharsetRecognizer recognizer;
+ boolean isDefaultEnabled;
+
+ CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
+ this.recognizer = recognizer;
+ this.isDefaultEnabled = isDefaultEnabled;
+ }
+ }
+
/*
* List of recognizers for all charsets known to the implementation.
*/
- private static ArrayListtrue
to enable, or false
to disable the
+ * charset encoding.
+ * @return A reference to this <code>CharsetDetector</code>.
+ * @throws IllegalArgumentException when the name of charset encoding is
+ * not supported.
+ *
+ * @internal
+ * @deprecated This API is ICU internal only.
+ */
+ public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
+ int modIdx = -1;
+ boolean isDefaultVal = false;
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
+ if (csrinfo.recognizer.getName().equals(encoding)) {
+ modIdx = i;
+ isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
+ break;
+ }
+ }
+ if (modIdx < 0) {
+ // No matching encoding found
+ throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
+ }
+
+ if (fEnabledRecognizers == null && !isDefaultVal) {
+ // Create an array storing the non default setting
+ fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
+
+ // Initialize the array with default info
+ for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
+ fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
+ }
+ }
+
+ if (fEnabledRecognizers != null) {
+ fEnabledRecognizers[modIdx] = enabled;
+ }
+
+ return this;
}
}
diff --git a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
index 24321073a3..e051ea25ac 100644
--- a/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@@ -783,10 +783,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
public String getName()
{
- // return "ISO-8859-8-I";
- // ICU4C returns ISO-8859-8-I
- // Ticket #9364 to resolve the difference.
- return "ISO-8859-8";
+ return "ISO-8859-8-I";
}
public String getLanguage()
@@ -796,9 +793,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
public CharsetMatch match(CharsetDetector det)
{
- // ICU4C returns ISO-8859-8-I
- // Ticket #9364 to resolve the difference.
- String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8";
+ String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I";
int confidence = match(det, ngrams, byteMap);
return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
}
diff --git a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
index 9dcbf13af2..b281e8c56d 100644
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/charsetdet/CharsetDetectionTests.xml
@@ -1,6 +1,6 @@
-
+