diff --git a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
index 65f89c4568..0497e52fd4 100644
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@@ -77,6 +77,6 @@ public class TestCharsetDetector extends TestFmwk {
         CharsetMatch m = det.detect();
         CheckAssert(m.getName().equals("UTF-8"));
         String retrievedS = m.getString();
-        CheckAssert(s == retrievedS);
+        CheckAssert(s.equals(retrievedS));
     }
 }
diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java b/icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
new file mode 100644
index 0000000000..36609a1fe4
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
@@ -0,0 +1,53 @@
+/*
+ * Created on Apr 12, 2005
+ *
+ * TODO To change the template for this generated file go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+package com.ibm.icu.text;
+
+
+/**
+ * Shift-JIS encoding scheme recognizer
+ *
+ */
+class CharsetDetectEnc_sjis extends CharsetDetectEncoding {
+
+    /**
+     * Read the next character from the detector's input, decoding it as Shift-JIS.
+     *
+     * @param retChar iteration state; receives the start index, value and error
+     *                flag of the character that was read.
+     * @param det     the CharsetDetector supplying the input bytes.
+     * @return false if the input ran out before a complete character was read,
+     *         true otherwise (check retChar.error for an illegal trail byte).
+     */
+    boolean nextChar(iteratedChar retChar, CharsetDetector det) {
+        retChar.index = retChar.nextIndex;
+        retChar.error = false;
+        int firstByte;
+        firstByte = retChar.charValue = retChar.nextByte(det);
+        if (firstByte < 0) {
+            return false;
+        }
+
+        if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
+            // Single byte character.
+            return true;
+        }
+
+        int secondByte = retChar.nextByte(det);
+        if (secondByte < 0) {
+            return false;
+        }
+        // Pack the lead byte into bits 8-15 and the trail byte into bits 0-7.
+        //   The parentheses are required: '<<' binds less tightly than '+'.
+        retChar.charValue = (firstByte << 8) | secondByte;
+        if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
+            // Illegal second byte value.
+            retChar.error = true;
+        }
+        return true;
+    }
+
+}
diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java b/icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
new file mode 100644
index 0000000000..da66032a5e
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
@@ -0,0 +1,64 @@
+/*
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+/**
+ * @author andy
+ *
+ * TODO To change the template for this generated type comment go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+abstract class CharsetDetectEncoding {
+
+    /**
+     * State of one iteration over a CharsetDetector's input bytes, together
+     * with the character most recently read from it.
+     */
+    static class iteratedChar {
+        int             charValue = 0;      // Value of the most recently read character.
+        int             index     = 0;      // Input index at which that character started.
+        int             nextIndex = 0;      // Input index of the next byte to be read.
+        boolean         error     = false;  // Set by nextChar() when the character is malformed.
+        boolean         done      = false;  // Set by nextByte() on reaching the end of the input.
+
+        /** Put this object back into its starting state, ready for a new iteration. */
+        void reset() {
+            charValue = 0;
+            index     = -1;
+            nextIndex = 0;
+            error     = false;
+            done      = false;
+        }
+
+        /**
+         * Read the next byte of the detector's input.
+         *
+         * @param det the CharsetDetector supplying the input bytes.
+         * @return the byte as an unsigned value (0-255), or -1, with done set,
+         *         if the end of the input has been reached.
+         */
+        int nextByte(CharsetDetector det) {
+            if (nextIndex >= det.fInputLen) {
+                done = true;
+                return -1;
+            }
+            int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
+            return byteValue;
+        }
+    }
+
+
+    /**
+     * Read the next character of the detector's input into retChar.
+     *
+     * @param retChar iteration state, updated with the character that was read.
+     * @param det     the CharsetDetector supplying the input bytes.
+     * @return false once the input is exhausted, true otherwise.
+     */
+    abstract boolean nextChar(iteratedChar retChar, CharsetDetector det);
+
+}
diff --git a/icu4j/src/com/ibm/icu/text/CharsetDetector.java b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
index 5feed23c73..618c58462c 100644
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@@ -308,13 +308,13 @@ public class CharsetDetector {
     //
     //  Stuff private to CharsetDetector
     //
-    private byte[]         fRawInput;     // Original, untouched input bytes.
+    byte[]         fRawInput;     // Original, untouched input bytes.
                                           //  If user gave us a byte array, this is it.
                                           //  If user gave us a stream, it's read to a
                                           //   buffer here.
-    private int            fRawLength;    // Length of data in fRawInput array.
+    int            fRawLength;    // Length of data in fRawInput array.
 
-    private InputStream    fInputStream;  // User's input stream, or null if the user
+    InputStream    fInputStream;  // User's input stream, or null if the user
                                           //   gave us a byte array.
 
 
@@ -331,6 +331,7 @@ public class CharsetDetector {
     private static ArrayList createRecognizers() {
         ArrayList recognizers = new ArrayList();
         recognizers.add(new CharsetRecog_UTF8());
+        recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
 
         // Create an array of all charset names, as a side effect.
         //  Needed for the getAllDetectableCharsets() API.
diff --git a/icu4j/src/com/ibm/icu/text/CharsetMatch.java b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
index 3678500279..2e53c892a4 100644
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@@ -61,8 +61,14 @@ public class CharsetMatch implements Comparable {
      * @param maxLength The maximium length of the String to be created.
      * @return a String created from the converted input data.
      */
-    public String getString(int maxLength) {
-        return null;
+    public String getString(int maxLength) throws java.io.IOException {
+        String result = null;
+        if (fInputStream != null) {
+            // TODO:  read the stream in somehow.  Until then, stream input yields null.
+        } else {
+            result = new String(fRawInput, 0, fRawLength, getName());
+        }
+        return result;
     }
 
 
@@ -75,7 +81,7 @@ public class CharsetMatch implements Comparable {
      * @return the confidence in the charset match
      */
     public int getConfidence() {
-        return 0;
+        return fConfidence;
     }
 
     /**
@@ -135,6 +141,17 @@ public class CharsetMatch implements Comparable {
     CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
         fRecognizer = rec;
         fConfidence = conf;
+
+        // The references to the original application input data must be copied out
+        //   of the charset recognizer to here, in case the application resets the
+        //   recognizer before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight from the user,
+            //   not if it is just the head of a stream.
+            fRawInput    = det.fRawInput;
+            fRawLength   = det.fRawLength;
+        }
+        fInputStream = det.fInputStream;
     }
 
 
@@ -143,6 +160,12 @@ public class CharsetMatch implements Comparable {
     //
     private int                 fConfidence;
     private CharsetRecognizer   fRecognizer;
+    private byte[]              fRawInput = null;     // Original, untouched input bytes.
+                                                      //  If user gave us a byte array, this is it.
+    private int                 fRawLength;           // Length of data in fRawInput array.
+
+    private InputStream         fInputStream = null;  // User's input stream, or null if the user
+                                                      //   gave us a byte array.
 
 
 }
diff --git a/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
new file mode 100644
index 0000000000..c7e3ce050b
--- /dev/null
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
@@ -0,0 +1,98 @@
+/*
+ * Created on Apr 12, 2005
+ *
+ * TODO To change the template for this generated file go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+package com.ibm.icu.text;
+
+/**
+ * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
+ *                   Match is determined mostly by the input data adhering to the
+ *                   encoding scheme for the charset, although the hooks are here
+ *                   to also check language based character occurrence frequencies if that
+ *                   proves to be necessary.
+ * <p>
+ *                   Instances of this class are singletons, one per encoding
+ *                   being recognized.  They are created in the main
+ *                   CharsetDetector class and kept in the global list of available
+ *                   encodings to be checked.  The specific encoding being recognized
+ *                   is determined by the CharsetDetectEncoding provided when an
+ *                   instance of this class is created.
+ *
+ */
+class CharsetRecog_mbcs extends CharsetRecognizer {
+
+    private CharsetDetectEncoding   fEnc;
+    private String                  fCharsetName;
+
+    /**
+     * Constructor.
+     * @param charsetName the name of the charset being recognized, as returned by getName().
+     * @param enc         the encoding scheme used to pick characters out of the input.
+     */
+    CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
+        fEnc = enc;
+        fCharsetName = charsetName;
+    }
+
+    /**
+     * Get the IANA name of this charset.
+     * @return the charset name.
+     */
+    String      getName() {
+        return fCharsetName;
+    }
+
+
+    /**
+     * Test the match of this charset with the input text data
+     *      which is obtained via the CharsetDetector object.
+     *
+     * @param det  The CharsetDetector, which contains the input text
+     *             to be checked for being in this charset.
+     * @return     Two values packed into one int
+     *             <br>
+     *             bits 0-7:  the match confidence, ranging from 0-100
+     *             <br>
+     *             bits 8-15: The match reason, an enum-like value.
+     *             (Not set yet; only the confidence is currently returned.)
+     */
+    int match(CharsetDetector det) {
+        // singleByteCharCount and totalCharCount are tallied but do not yet
+        //   contribute to the confidence computed below.
+        int   singleByteCharCount = 0;
+        int   doubleByteCharCount = 0;
+        int   badCharCount        = 0;
+        int   totalCharCount      = 0;
+
+        CharsetDetectEncoding.iteratedChar   ichar = new CharsetDetectEncoding.iteratedChar();
+
+        for (ichar.reset(); fEnc.nextChar(ichar, det);) {
+            totalCharCount++;
+            if (ichar.error) {
+                badCharCount++;
+            } else {
+
+                if (ichar.charValue <= 0xff) {
+                    singleByteCharCount++;
+                } else {
+                    doubleByteCharCount++;
+                }
+            }
+        }
+
+        // Start from a base of 40, add one per double byte character, subtract
+        //   ten per malformed character, then clamp the result to 0-100.
+        int confidence = 40 + doubleByteCharCount - 10*badCharCount;
+        if (confidence < 0) {
+            confidence = 0;
+        }
+        if (confidence > 100) {
+            confidence = 100;
+        }
+
+        return confidence;
+    }
+
+}