ICU-124 charset detector, work in progress.

X-SVN-Rev: 17476
2005-04-13 01:11:26 +00:00 · 2005-04-13 01:11:26 +00:00 · 5f733d65b8
commit 5f733d65b8
parent add48d59b6
6 changed files with 209 additions and 7 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -77,6 +77,6 @@ public class TestCharsetDetector extends TestFmwk {
        CharsetMatch m = det.detect();
        CheckAssert(m.getName().equals("UTF-8"));
        String retrievedS = m.getString();
-        CheckAssert(s == retrievedS);
+        CheckAssert(s.equals(retrievedS));
    }
 }
--- a/icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
@ -0,0 +1,41 @@
+/*
+ * Created on Apr 12, 2005
+ *
+ * TODO To change the template for this generated file go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+package com.ibm.icu.text;
+
+
+/**
+ * Shift-JIS encoding scheme recognizer.
+ */
+class CharsetDetectEnc_sjis extends CharsetDetectEncoding {
+    /**
+     * Reads the next one or two byte character from the detector's input.
+     * @param retChar iteration state; its index, charValue and error fields are updated.
+     * @param det     the detector that supplies the input bytes.
+     * @return false if the input ran out before a whole character was read, true otherwise.
+     */
+     boolean nextChar(iteratedChar retChar, CharsetDetector det) {
+         retChar.index = retChar.nextIndex;
+         retChar.error = false;
+         int firstByte = retChar.charValue = retChar.nextByte(det);
+         if (firstByte < 0) {
+             return false;
+         }
+         if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
+             return true;                 // Single byte character.
+         }
+         int secondByte = retChar.nextByte(det);
+         if (secondByte < 0)  {
+             return false;                // Input ended in the middle of a character.
+         }
+         // Parenthesized on purpose: "firstByte << 8 + secondByte" shifts by (8 + secondByte).
+         retChar.charValue = (firstByte << 8) | secondByte;
+         if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
+             retChar.error = true;        // Illegal second byte value.
+         }
+         return true;
+     }
+}
--- a/icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
@ -0,0 +1,45 @@
+/*
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+/**
+ * Base class for the encoding schemes the charset detector can step through;
+ * a subclass supplies the rules for reading one character of its encoding.
+ *
+ * @author andy
+ */
+abstract class CharsetDetectEncoding {
+    /** Cursor over the detector's input: the last character read plus position bookkeeping. */
+    static class iteratedChar {
+        int             charValue = 0;
+        int             index     = 0;
+        int             nextIndex = 0;
+        boolean         error     = false;
+        boolean         done      = false;
+
+        /** Restores the start-of-input state; index becomes -1 and nextIndex 0. */
+        void reset() {
+            done      = false;
+            error     = false;
+            nextIndex = 0;
+            index     = -1;
+            charValue = 0;
+        }
+
+        /** Returns the next input byte as a value 0-255 and advances, or -1 (setting done) at the end. */
+        int nextByte(CharsetDetector det) {
+            if (nextIndex < det.fInputLen) {
+                return det.fInputBytes[nextIndex++] & 0xff;
+            }
+            done = true;
+            return -1;
+        }
+    }
+
+    /** Reads the next character of det's input into retChar. */
+    abstract boolean nextChar(iteratedChar retChar, CharsetDetector det);
+}
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -308,13 +308,13 @@ public class CharsetDetector {
    //
    //  Stuff private to CharsetDetector
    //
-    private byte[]       fRawInput;     // Original, untouched input bytes.
+    byte[]               fRawInput;     // Original, untouched input bytes.
                                        //  If user gave us a byte array, this is it.
                                        //  If user gave us a stream, it's read to a 
                                        //   buffer here.
-    private int          fRawLength;    // Length of data in fRawInput array.
+    int                  fRawLength;    // Length of data in fRawInput array.
    
-    private InputStream  fInputStream;  // User's input stream, or null if the user
+     InputStream         fInputStream;  // User's input stream, or null if the user
                                        //   gave us a byte array.
    
    
@ -331,6 +331,7 @@ public class CharsetDetector {
    private static ArrayList createRecognizers() {
        ArrayList recognizers = new ArrayList();
        recognizers.add(new CharsetRecog_UTF8());
+        recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
        
        // Create an array of all charset names, as a side effect.
        // Needed for the getAllDetectableCharsets() API.
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@ -61,8 +61,14 @@ public class CharsetMatch implements Comparable {
     * @param maxLength The maximium length of the String to be created.
     * @return a String created from the converted input data.
     */
-    public String getString(int maxLength) {
-        return null;
+    public String getString(int maxLength) throws java.io.IOException {
+        String result = null;   // Stays null for stream input until the TODO below is implemented.
+        if (fInputStream != null) {
+            // TODO:  read the stream in somehow.
+        } else {
+            result = new String(fRawInput, getName());   // NOTE(review): whole array is decoded; maxLength and fRawLength are not consulted.
+        }
+        return result;

    }
    
@ -75,7 +81,7 @@ public class CharsetMatch implements Comparable {
     * @return the confidence in the charset match
     */
    public int getConfidence() {
-        return 0;
+        return fConfidence;   // Set from the constructor's conf argument.
    }
    
    /**
@ -135,6 +141,17 @@ public class CharsetMatch implements Comparable {
    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
        fRecognizer = rec;
        fConfidence = conf;
+
+        // The references to the original application input data must be copied out
+        //   of the charset detector to here, in case the application resets the
+        //   detector before using this CharsetMatch.
+        if (det.fInputStream == null) {
+            // We only want the existing input byte data if it came straight from the user,
+            //   not if it is just the head of a stream.
+            fRawInput    = det.fRawInput;
+            fRawLength   = det.fRawLength;
+        };
+        fInputStream = det.fInputStream;
    }

    
@ -143,6 +160,12 @@ public class CharsetMatch implements Comparable {
    //
    private int                 fConfidence;
    private CharsetRecognizer   fRecognizer;
+    private byte[]              fRawInput = null;     // Original, untouched input bytes.
+                                                      //  If user gave us a byte array, this is it.
+    private int                 fRawLength;           // Length of data in fRawInput array.
+
+    private InputStream         fInputStream = null;  // User's input stream, or null if the user
+                                                      //   gave us a byte array.
    

 }
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
@ -0,0 +1,92 @@
+/*
+ * Created on Apr 12, 2005
+ *
+ * TODO To change the template for this generated file go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+package com.ibm.icu.text;
+
+/**
+ * CharsetRecognizer implemenation for Asian  - double or multi-byte - charsets.
+ *                   Match is determined mostly by the input data adhering to the
+ *                   encoding scheme for the charset, although the hooks are here
+ *                   to also check language based character occurence frequencies if that
+ *                   proves to be necessary.
+ * <p/>
+ *                   Instances of this class are singletons, one per encoding
+ *                   being recognized.  They are created in the main
+ *                   CharsetDetector class and kept in the global list of available
+ *                   encodings to be checked.  The specific encoding being recognized
+ *                   is determined by the CharsetDetectEncoding provided when an
+ *                   instance of this class is created.
+ *                   
+ */
+class CharsetRecog_mbcs extends CharsetRecognizer {
+
+    private CharsetDetectEncoding fEnc;
+    private String                fCharsetName;
+    
+    /**
+     * Constructor.  
+     * @param enc
+     */
+    CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
+        fEnc = enc;
+        fCharsetName = charsetName;
+    }
+    
+    /**
+     * Get the IANA name of this charset.
+     * @return the charset name.
+     */
+    String      getName() {
+        return fCharsetName;
+    }
+    
+    
+    /**
+     * Test the match of this charset with the input text data
+     *      which is obtained via the CharsetDetector object.
+     * 
+     * @param det  The CharsetDetector, which contains the input text
+     *             to be checked for being in this charset.
+     * @return     Two values packed into one int  (Damn java, anyhow)
+     *             <br/>
+     *             bits 0-7:  the match confidence, ranging from 0-100
+     *             <br/>
+     *             bits 8-15: The match reason, an enum-like value.
+     */
+     int         match(CharsetDetector det) {
+        int   singleByteCharCount = 0;
+        int   doubleByteCharCount = 0;
+        int   badCharCount        = 0;
+        int   totalCharCount      = 0;
+        
+        CharsetDetectEncoding.iteratedChar   ichar = new CharsetDetectEncoding.iteratedChar();
+        
+        for (ichar.reset(); fEnc.nextChar(ichar, det);) {
+            totalCharCount++;
+            if (ichar.error) {
+                badCharCount++; 
+            } else {
+                
+                if (ichar.charValue <= 0xff) {
+                    singleByteCharCount++;
+                } else {
+                    doubleByteCharCount++;
+                }
+            }
+        }
+        
+        int confidence = 40 + doubleByteCharCount - 10*badCharCount;
+        if (confidence < 0) {
+            confidence = 0;
+        }
+        if (confidence > 100) {
+            confidence = 100;
+        }
+         
+        return confidence;
+    }
+
+}