ICU-10012 Disable EBCDIC Arabic/Hebrew detectors by default. Added APIs to enable/disable each charset and get currently active charsets. Use ISO-8859-8-I for ISO-8859-8/logical/implicit - to make the behavior compatible with ICU4C implementation (#9364).
X-SVN-Rev: 34351
This commit is contained in:
parent
920dadff8d
commit
1e57298e3f
@ -1,6 +1,6 @@
|
|||||||
/**
|
/**
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
* Copyright (C) 2005-2012, International Business Machines Corporation and *
|
* Copyright (C) 2005-2013, International Business Machines Corporation and *
|
||||||
* others. All Rights Reserved. *
|
* others. All Rights Reserved. *
|
||||||
*******************************************************************************
|
*******************************************************************************
|
||||||
*/
|
*/
|
||||||
@ -12,6 +12,7 @@ import java.io.Reader;
|
|||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -187,10 +188,14 @@ public class CharsetDetector {
|
|||||||
|
|
||||||
// Iterate over all possible charsets, remember all that
|
// Iterate over all possible charsets, remember all that
|
||||||
// give a match quality > 0.
|
// give a match quality > 0.
|
||||||
for (CharsetRecognizer csr: fCSRecognizers) {
|
for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
|
||||||
CharsetMatch m = csr.match(this);
|
CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
|
||||||
if (m != null) {
|
boolean active = (fEnabledRecognizers != null) ? fEnabledRecognizers[i] : rcinfo.isDefaultEnabled;
|
||||||
matches.add(m);
|
if (active) {
|
||||||
|
CharsetMatch m = rcinfo.recognizer.match(this);
|
||||||
|
if (m != null) {
|
||||||
|
matches.add(m);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Collections.sort(matches); // CharsetMatch compares on confidence
|
Collections.sort(matches); // CharsetMatch compares on confidence
|
||||||
@ -278,17 +283,28 @@ public class CharsetDetector {
|
|||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the names of all char sets that can be recognized by the char set detector.
|
* Get the names of all charsets supported by <code>CharsetDetector</code> class.
|
||||||
|
* <p>
|
||||||
|
* <b>Note:</b> Multiple different charset encodings in the same family may use
|
||||||
|
* a single shared name in this implementation. For example, this method returns
|
||||||
|
* an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
|
||||||
|
* (Windows Latin 1). However, actual detection result could be "windows-1252"
|
||||||
|
* when the input data matches Latin 1 and contains code points available only
|
||||||
|
* in "windows-1252".
|
||||||
*
|
*
|
||||||
* @return an array of the names of all charsets that can be recognized
|
* @return an array of the names of all charsets supported by
|
||||||
* by the charset detector.
|
* <code>CharsetDetector</code> class.
|
||||||
*
|
*
|
||||||
* @stable ICU 3.4
|
* @stable ICU 3.4
|
||||||
*/
|
*/
|
||||||
public static String[] getAllDetectableCharsets() {
|
public static String[] getAllDetectableCharsets() {
|
||||||
return fCharsetNames;
|
String[] allCharsetNames = new String[ALL_CS_RECOGNIZERS.size()];
|
||||||
}
|
for (int i = 0; i < allCharsetNames.length; i++) {
|
||||||
|
allCharsetNames[i] = ALL_CS_RECOGNIZERS.get(i).recognizer.getName();
|
||||||
|
}
|
||||||
|
return allCharsetNames;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test whether or not input filtering is enabled.
|
* Test whether or not input filtering is enabled.
|
||||||
*
|
*
|
||||||
@ -420,12 +436,8 @@ public class CharsetDetector {
|
|||||||
false;
|
false;
|
||||||
|
|
||||||
String fDeclaredEncoding;
|
String fDeclaredEncoding;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//
|
|
||||||
// Stuff private to CharsetDetector
|
|
||||||
//
|
|
||||||
byte[] fRawInput; // Original, untouched input bytes.
|
byte[] fRawInput; // Original, untouched input bytes.
|
||||||
// If user gave us a byte array, this is it.
|
// If user gave us a byte array, this is it.
|
||||||
// If user gave us a stream, it's read to a
|
// If user gave us a stream, it's read to a
|
||||||
@ -435,71 +447,136 @@ public class CharsetDetector {
|
|||||||
InputStream fInputStream; // User's input stream, or null if the user
|
InputStream fInputStream; // User's input stream, or null if the user
|
||||||
// gave us a byte array.
|
// gave us a byte array.
|
||||||
|
|
||||||
boolean fStripTags = // If true, setText() will strip tags from input text.
|
//
|
||||||
|
// Stuff private to CharsetDetector
|
||||||
|
//
|
||||||
|
private boolean fStripTags = // If true, setText() will strip tags from input text.
|
||||||
false;
|
false;
|
||||||
|
|
||||||
|
private boolean[] fEnabledRecognizers; // If not null, active set of charset recognizers had
|
||||||
|
// been changed from the default. The array index is
|
||||||
|
// corresponding to ALL_CS_RECOGNIZERS. See setDetectableCharset().
|
||||||
|
|
||||||
|
private static class CSRecognizerInfo {
|
||||||
|
CharsetRecognizer recognizer;
|
||||||
|
boolean isDefaultEnabled;
|
||||||
|
|
||||||
|
CSRecognizerInfo(CharsetRecognizer recognizer, boolean isDefaultEnabled) {
|
||||||
|
this.recognizer = recognizer;
|
||||||
|
this.isDefaultEnabled = isDefaultEnabled;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* List of recognizers for all charsets known to the implementation.
|
* List of recognizers for all charsets known to the implementation.
|
||||||
*/
|
*/
|
||||||
private static ArrayList<CharsetRecognizer> fCSRecognizers = createRecognizers();
|
private static final List<CSRecognizerInfo> ALL_CS_RECOGNIZERS;
|
||||||
private static String [] fCharsetNames;
|
|
||||||
|
static {
|
||||||
/*
|
List<CSRecognizerInfo> list = new ArrayList<CSRecognizerInfo>();
|
||||||
* Create the singleton instances of the CharsetRecognizer classes
|
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_UTF8(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE(), true));
|
||||||
|
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_sjis(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022JP(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022CN(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_2022.CharsetRecog_2022KR(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_mbcs.CharsetRecog_big5(), true));
|
||||||
|
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_1(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_2(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_7_el(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_8_he(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1251(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_windows_1256(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_KOI8_R(), true));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr(), true));
|
||||||
|
|
||||||
|
// IBM 420/424 recognizers are disabled by default
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl(), false));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr(), false));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl(), false));
|
||||||
|
list.add(new CSRecognizerInfo(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr(), false));
|
||||||
|
|
||||||
|
ALL_CS_RECOGNIZERS = Collections.unmodifiableList(list);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get the names of charsets that can be recognized by this CharsetDetector instance.
|
||||||
|
*
|
||||||
|
* @return an array of the names of charsets that can be recognized by this CharsetDetector
|
||||||
|
* instance.
|
||||||
|
*
|
||||||
|
* @internal
|
||||||
|
* @deprecated This API is ICU internal only.
|
||||||
*/
|
*/
|
||||||
private static ArrayList<CharsetRecognizer> createRecognizers() {
|
public String[] getDetectableCharsets() {
|
||||||
ArrayList<CharsetRecognizer> recognizers = new ArrayList<CharsetRecognizer>();
|
List<String> csnames = new ArrayList<String>(ALL_CS_RECOGNIZERS.size());
|
||||||
|
for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
|
||||||
recognizers.add(new CharsetRecog_UTF8());
|
CSRecognizerInfo rcinfo = ALL_CS_RECOGNIZERS.get(i);
|
||||||
|
boolean active = (fEnabledRecognizers == null) ? rcinfo.isDefaultEnabled : fEnabledRecognizers[i];
|
||||||
recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_BE());
|
if (active) {
|
||||||
recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_16_LE());
|
csnames.add(rcinfo.recognizer.getName());
|
||||||
recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_BE());
|
|
||||||
recognizers.add(new CharsetRecog_Unicode.CharsetRecog_UTF_32_LE());
|
|
||||||
|
|
||||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_sjis());
|
|
||||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022JP());
|
|
||||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022CN());
|
|
||||||
recognizers.add(new CharsetRecog_2022.CharsetRecog_2022KR());
|
|
||||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_gb_18030());
|
|
||||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_jp());
|
|
||||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_euc.CharsetRecog_euc_kr());
|
|
||||||
recognizers.add(new CharsetRecog_mbcs.CharsetRecog_big5());
|
|
||||||
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_1());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_2());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_5_ru());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_6_ar());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_7_el());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_I_he());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_8_he());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1251());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_windows_1256());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_KOI8_R());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_8859_9_tr());
|
|
||||||
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_rtl());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM424_he_ltr());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_rtl());
|
|
||||||
recognizers.add(new CharsetRecog_sbcs.CharsetRecog_IBM420_ar_ltr());
|
|
||||||
|
|
||||||
// Create an array of all charset names, as a side effect.
|
|
||||||
// Needed for the getAllDetectableCharsets() API.
|
|
||||||
String[] charsetNames = new String [recognizers.size()];
|
|
||||||
int out = 0;
|
|
||||||
|
|
||||||
for (int i = 0; i < recognizers.size(); i++) {
|
|
||||||
String name = recognizers.get(i).getName();
|
|
||||||
|
|
||||||
if (out == 0 || ! name.equals(charsetNames[out - 1])) {
|
|
||||||
charsetNames[out++] = name;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
return csnames.toArray(new String[csnames.size()]);
|
||||||
fCharsetNames = new String[out];
|
}
|
||||||
System.arraycopy(charsetNames, 0, fCharsetNames, 0, out);
|
|
||||||
|
/**
|
||||||
return recognizers;
|
* Enable or disable individual charset encoding.
|
||||||
|
* The name of the charset encoding must be included in the names returned by
|
||||||
|
* {@link #getAllDetectableCharsets()}.
|
||||||
|
*
|
||||||
|
* @param encoding the name of charset encoding.
|
||||||
|
* @param enabled <code>true</code> to enable, or <code>false</code> to disable the
|
||||||
|
* charset encoding.
|
||||||
|
* @return A reference to this <code>CharsetDetector</code>.
|
||||||
|
* @throws IllegalArgumentException when the name of charset encoding is
|
||||||
|
* not supported.
|
||||||
|
*
|
||||||
|
* @internal
|
||||||
|
* @deprecated This API is ICU internal only.
|
||||||
|
*/
|
||||||
|
public CharsetDetector setDetectableCharset(String encoding, boolean enabled) {
|
||||||
|
int modIdx = -1;
|
||||||
|
boolean isDefaultVal = false;
|
||||||
|
for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
|
||||||
|
CSRecognizerInfo csrinfo = ALL_CS_RECOGNIZERS.get(i);
|
||||||
|
if (csrinfo.recognizer.getName().equals(encoding)) {
|
||||||
|
modIdx = i;
|
||||||
|
isDefaultVal = (csrinfo.isDefaultEnabled == enabled);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (modIdx < 0) {
|
||||||
|
// No matching encoding found
|
||||||
|
throw new IllegalArgumentException("Invalid encoding: " + "\"" + encoding + "\"");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fEnabledRecognizers == null && !isDefaultVal) {
|
||||||
|
// Create an array storing the non default setting
|
||||||
|
fEnabledRecognizers = new boolean[ALL_CS_RECOGNIZERS.size()];
|
||||||
|
|
||||||
|
// Initialize the array with default info
|
||||||
|
for (int i = 0; i < ALL_CS_RECOGNIZERS.size(); i++) {
|
||||||
|
fEnabledRecognizers[i] = ALL_CS_RECOGNIZERS.get(i).isDefaultEnabled;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fEnabledRecognizers != null) {
|
||||||
|
fEnabledRecognizers[modIdx] = enabled;
|
||||||
|
}
|
||||||
|
|
||||||
|
return this;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -783,10 +783,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||||||
|
|
||||||
public String getName()
|
public String getName()
|
||||||
{
|
{
|
||||||
// return "ISO-8859-8-I";
|
return "ISO-8859-8-I";
|
||||||
// ICU4C returns ISO-8859-8-I
|
|
||||||
// Ticket #9364 to resolve the difference.
|
|
||||||
return "ISO-8859-8";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getLanguage()
|
public String getLanguage()
|
||||||
@ -796,9 +793,7 @@ abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
|||||||
|
|
||||||
public CharsetMatch match(CharsetDetector det)
|
public CharsetMatch match(CharsetDetector det)
|
||||||
{
|
{
|
||||||
// ICU4C returns ISO-8859-8-I
|
String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8-I";
|
||||||
// Ticket #9364 to resolve the difference.
|
|
||||||
String name = det.fC1Bytes ? "windows-1255" : "ISO-8859-8";
|
|
||||||
int confidence = match(det, ngrams, byteMap);
|
int confidence = match(det, ngrams, byteMap);
|
||||||
return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
|
return confidence == 0 ? null : new CharsetMatch(det, this, confidence, name, "he");
|
||||||
}
|
}
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
<!-- Copyright (c) 2005-2007 IBM Corporation and others. All rights reserved -->
|
<!-- Copyright (c) 2005-2013 IBM Corporation and others. All rights reserved -->
|
||||||
<!-- See individual test cases for their specific copyright. -->
|
<!-- See individual test cases for their specific copyright. -->
|
||||||
|
|
||||||
<charset-detection-tests>
|
<charset-detection-tests>
|
||||||
@ -118,7 +118,7 @@
|
|||||||
|
|
||||||
</test-case>
|
</test-case>
|
||||||
|
|
||||||
<test-case id="IUC10-he" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-8/he">
|
<test-case id="IUC10-he" encodings="UTF-8 UTF-32BE UTF-32LE ISO-8859-8-I/he">
|
||||||
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
<!-- Copyright © 1991-2005 Unicode, Inc. All rights reserved. -->
|
||||||
|
|
||||||
אירופה, תוכנה והאינטרנט:
|
אירופה, תוכנה והאינטרנט:
|
||||||
@ -548,4 +548,4 @@ Conference Program
|
|||||||
şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar.
|
şifrelemeyi desteklemek zorundadırlar; veriler, farklı şifreleme ve altyapılardan geçerken bozulma riski taşırlar.
|
||||||
|
|
||||||
</test-case>
|
</test-case>
|
||||||
</charset-detection-tests>
|
</charset-detection-tests>
|
||||||
|
@ -98,7 +98,38 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
CheckAssert(charsetNames[i].equals("") == false);
|
CheckAssert(charsetNames[i].equals("") == false);
|
||||||
// System.out.println("\"" + charsetNames[i] + "\"");
|
// System.out.println("\"" + charsetNames[i] + "\"");
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
final String[] defDisabled = {
|
||||||
|
"IBM420_rtl", "IBM420_ltr",
|
||||||
|
"IBM424_rtl", "IBM424_ltr"
|
||||||
|
};
|
||||||
|
String[] activeCharsetNames = det.getDetectableCharsets();
|
||||||
|
for (String cs : activeCharsetNames) {
|
||||||
|
// the charset must be included in the list of all charsets
|
||||||
|
boolean found = false;
|
||||||
|
for (String cs0 : charsetNames) {
|
||||||
|
if (cs0.equals(cs)) {
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!found) {
|
||||||
|
errln(cs + " is not included in the all charset list." );
|
||||||
|
}
|
||||||
|
|
||||||
|
// some charsets are disabled by default
|
||||||
|
found = false;
|
||||||
|
for (String cs1 : defDisabled) {
|
||||||
|
if (cs1.equals(cs)) {
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (found) {
|
||||||
|
errln(cs + " should not be included in the default charset list.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public void TestInputFilter() throws Exception
|
public void TestInputFilter() throws Exception
|
||||||
{
|
{
|
||||||
@ -484,6 +515,10 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
"\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
|
"\u0627\u062C\u062A\u0645\u0627\u0639\u064A\u0629.";
|
||||||
|
|
||||||
CharsetDetector det = new CharsetDetector();
|
CharsetDetector det = new CharsetDetector();
|
||||||
|
det.setDetectableCharset("IBM424_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM424_ltr", true);
|
||||||
|
det.setDetectableCharset("IBM420_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM420_ltr", true);
|
||||||
CharsetMatch m;
|
CharsetMatch m;
|
||||||
String charsetMatch;
|
String charsetMatch;
|
||||||
byte[] bytes;
|
byte[] bytes;
|
||||||
@ -603,7 +638,7 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
|
|
||||||
CharsetMatch m = _test1255(s);
|
CharsetMatch m = _test1255(s);
|
||||||
String charsetMatch = m.getName();
|
String charsetMatch = m.getName();
|
||||||
CheckAssert(charsetMatch.equals("ISO-8859-8"));
|
CheckAssert(charsetMatch.equals("ISO-8859-8-I"));
|
||||||
CheckAssert(m.getLanguage().equals("he"));
|
CheckAssert(m.getLanguage().equals("he"));
|
||||||
|
|
||||||
m = _test1255_reverse(s);
|
m = _test1255_reverse(s);
|
||||||
@ -654,6 +689,10 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
|
private CharsetMatch _testIBM424_he_rtl(String s) throws Exception {
|
||||||
byte [] bytes = s.getBytes("IBM424");
|
byte [] bytes = s.getBytes("IBM424");
|
||||||
CharsetDetector det = new CharsetDetector();
|
CharsetDetector det = new CharsetDetector();
|
||||||
|
det.setDetectableCharset("IBM424_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM424_ltr", true);
|
||||||
|
det.setDetectableCharset("IBM420_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM420_ltr", true);
|
||||||
det.setText(bytes);
|
det.setText(bytes);
|
||||||
CharsetMatch m = det.detect();
|
CharsetMatch m = det.detect();
|
||||||
return m;
|
return m;
|
||||||
@ -669,6 +708,10 @@ public class TestCharsetDetector extends TestFmwk
|
|||||||
byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
|
byte [] bytes = ltrStrBuf.toString().getBytes("IBM424");
|
||||||
|
|
||||||
CharsetDetector det = new CharsetDetector();
|
CharsetDetector det = new CharsetDetector();
|
||||||
|
det.setDetectableCharset("IBM424_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM424_ltr", true);
|
||||||
|
det.setDetectableCharset("IBM420_rtl", true);
|
||||||
|
det.setDetectableCharset("IBM420_ltr", true);
|
||||||
det.setText(bytes);
|
det.setText(bytes);
|
||||||
CharsetMatch m = det.detect();
|
CharsetMatch m = det.detect();
|
||||||
return m;
|
return m;
|
||||||
|
Loading…
Reference in New Issue
Block a user