ICU-124 charset detector, work in progress.

X-SVN-Rev: 17471
2005-04-08 01:27:36 +00:00 · 2005-04-08 01:27:36 +00:00 · 757d1a33fe
commit 757d1a33fe
parent cc7f22046e
5 changed files with 402 additions and 9 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/dev/test/charsetdet/TestCharsetDetector.java
@ -0,0 +1,58 @@
+/**
+ *******************************************************************************
+ * Copyright (C) 2005, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ */
+package com.ibm.icu.dev.test.charsetdet;
+
+import com.ibm.icu.dev.test.TestFmwk;
+import com.ibm.icu.text.*;
+
+
+/**
+ * @author andy
+ *
+ * TODO To change the template for this generated type comment go to
+ * Window - Preferences - Java - Code Style - Code Templates
+ */
+public class TestCharsetDetector extends TestFmwk {
+
+    
+    /**
+     * Constructor
+     */
+    public TestCharsetDetector()
+    {
+    }
+
+    public static void main(String[] args) {
+        try
+        {
+            TestCharsetDetector test = new TestCharsetDetector();
+            test.run(args);
+        }
+        catch (Exception e)
+        {
+            e.printStackTrace();
+        }
+    }
+
+    
+    public void TestConstruction() {
+        int i;
+        CharsetDetector  det = new CharsetDetector();
+        
+        String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
+        if (charsetNames.length == 0) {
+            errln("TestCharsetDetector TestConstruction #0001");
+        }
+        for (i=0; i<charsetNames.length; i++) {
+            if (charsetNames[i].equals("")) {
+                errln("TestCharsetDetector TestConstruction #0002.  i=" + i);                
+            }
+            // System.out.println("\"" + charsetNames[i] + "\"");
+        }
+        
+     }
+}
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -8,6 +8,11 @@ package com.ibm.icu.text;

 import java.io.InputStream;
 import java.io.Reader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Collections;
+import java.util.Arrays;


 /**
@ -64,7 +69,8 @@ public class CharsetDetector {
    * 
    *  @param encoding The declared encoding 
    */
-    public CharsetDetector setDecaredEncoding(String encoding) {
+    public CharsetDetector setDeclaredEncoding(String encoding) {
+        fDeclaredEncoding = encoding;
        return this;
    }
    
@ -73,7 +79,9 @@ public class CharsetDetector {
     * @param in the input text of unknown encoding
     * @return This CharsetDetector
     */
-    public CharsetDetector setText(byte in[]) {
+    public CharsetDetector setText(byte [] in) {
+        fRawInput  = in;
+        fRawLength = in.length;      
        return this;
    }
    
@ -89,7 +97,14 @@ public class CharsetDetector {
     * @param in the input text of unknown encoding
     * @return This CharsetDetector
     */
-    public CharsetDetector setText(InputStream in) {
+    public CharsetDetector setText(InputStream in) throws IOException {
+        fInputStream = in;
+        fInputStream.mark(4000);
+        fRawInput = new byte[4000];       // Always make a new buffer because the
+                                          //   previous one may have come from the caller,
+                                          //   in which case we can't touch it.
+        fRawLength = fInputStream.read(fRawInput);
+        fInputStream.reset();
        return this;
    }

@ -109,10 +124,15 @@ public class CharsetDetector {
     *  </ul>
     *
     * @return a CharsetMatch object representing the best matching charset.
+     * *
+     * TODO:  A better implementation would be to copy the detect loop from
+     *        detectAll(), and cut it short as soon as a match with a high confidence
+     *        is found.  This is something to be done later, after things are otherwise
+     *        working.
     */
    public CharsetMatch detect() {
-        return null;
-    }
+        return detectAll()[0];
+     }
    
    /**
     *  Return an array of all charsets that appear to be plausible
@ -128,7 +148,28 @@ public class CharsetDetector {
     * @return An array of CharsetMatch objects representing possibly matching charsets.
     */
    public CharsetMatch[] detectAll() {
-        return null;
+        CharsetRecognizer csr;
+        int               i;
+        int               detectResults;
+        int               confidence;
+        ArrayList         matches = new ArrayList();
+        
+        //  Iterate over all possible charsets, remember all that
+        //    give a match quality > 0.
+        for (i=0; i<fCSRecognizers.size(); i++) {
+            csr = (CharsetRecognizer)fCSRecognizers.get(i);
+            detectResults = csr.match(this);
+            confidence = detectResults & 0x000000ff;
+            if (confidence > 0) {
+                CharsetMatch  m = new CharsetMatch(this, csr, confidence);
+                matches.add(csr);
+            }
+        }
+        Collections.sort(matches);      // CharsetMatch compares on confidence
+        Collections.reverse(matches);   //  Put best match first.
+        CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
+        matches.toArray(resultArray);
+        return resultArray;
    }

    
@ -182,8 +223,121 @@ public class CharsetDetector {
     * by the charset detector.
     */
    public static String[] getAllDetectableCharsets() {
-        return null;
+        return fCharsetNames;
    }
+    

+    /**
+     *  MungeInput - after getting a set of raw input data to be analyzed, preprocess
+     *               it by removing what appears to be html markup.
+     */
+    private void MungeInput() {
+        int srci = 0;
+        int dsti = 0;
+        byte b;
+        boolean  inMarkup = false;
+        int      openTags = 0;
+        int      badTags  = 0;
+        
+        //
+        //  html / xml markup stripping.
+        //     quick and dirty, not 100% accurate, but hopefully good enough, statistically.
+        //     discard everything within < brackets >
+        //     Count how many total '<' and illegal (nested) '<' occur, so we can make some
+        //     guess as to whether the input was actually marked up at all.
+        for (srci=0; srci<fRawLength; srci++) {
+            b = fRawInput[srci];
+            if (b == (byte)'<') {
+                if (inMarkup) {
+                    badTags++;
+                }
+                inMarkup = true;
+                openTags++;
+            }
+            if (inMarkup == false) {
+                fInputBytes[dsti++] = b;
+            }
+            
+            if (b == (byte)'>') {
+                inMarkup = false;
+            }        
+        }
+        fInputLen = dsti;
+        
+        //
+        //  If it looks like this input wasn't marked up, or if it looks like it's
+        //    essentially nothing but markup abandon the markup stripping.
+        //    Detection will have to work on the unstripped input.
+        //
+        if (openTags<5 || openTags/5 < badTags || 
+                (fInputLen < 100 && fRawLength>600)) {
+            for (srci=0; srci<fRawLength; srci++) {
+                fInputBytes[srci] = fRawInput[srci];
+            }
+            fInputLen = srci;
+        }
+        
+        //
+        // Tally up the byte occurence statistics.
+        //   These are available for use by the various detectors.
+        //
+        Arrays.fill(fByteStats, (short)0);
+        for (srci=0; srci<fInputLen; srci++) {
+            int val = fInputBytes[srci] & 0x00ff;
+            fByteStats[val]++;
+        }        
+     }

+    /**
+     *  The following items are accessed by individual CharsetRecongizers during
+     *     the recognition process
+     */
+    byte[]      fInputBytes =     // The text to be checked.  Markup will have been
+                   new byte[4000];//   removed if appropriate.
+    
+    int         fInputLen;        // Length of the byte data in fInputText.
+    
+    short       fByteStats[];     // byte frequency statistics for the input text.
+                                  //   Value is percent, not absolute.
+                                  //   Value is rounded up, so zero really means zero occurences.
+    
+    String      fDeclaredEncoding;
+    
+    
+
+    //
+    //  Stuff private to CharsetDetector
+    //
+    private byte[]       fRawInput;     // Original, untouched input bytes.
+                                        //  If user gave us a byte array, this is it.
+                                        //  If user gave us a stream, it's read to a 
+                                        //   buffer here.
+    private int          fRawLength;    // Length of data in fRawInput array.
+    
+    private InputStream  fInputStream;  // User's input stream, or null if the user
+                                        //   gave us a byte array.
+    
+    
+    /**
+     *  List of recognizers for all charsets known to the implementation.
+     *
+     */
+    private static ArrayList fCSRecognizers = createRecognizers();
+    private static String [] fCharsetNames;
+    
+   /**
+     * Create the singleton instances of the CharsetRecognizer classes
+     */
+    private static ArrayList createRecognizers() {
+        ArrayList recognizers = new ArrayList();
+        recognizers.add(new CharsetRecog_UTF8());
+        
+        // Create an array of all charset names, as a side effect.
+        // Needed for the getAllDetectableCharsets() API.
+        fCharsetNames = new String [recognizers.size()];
+        for (int i=0; i<recognizers.size(); i++) {
+            fCharsetNames[i] = ((CharsetRecognizer)recognizers.get(i)).getName();          
+        }
+        return recognizers;
+    }
 }
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@ -17,13 +17,21 @@ import java.io.Reader;
 * or for Java Reader or String to access the original byte data in Unicode form.
 * <p/>
 * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note:  this class has a natural ordering that is inconsistent with equals.
+ *        The natural ordering is based on the match confidence value.
 */
-public class CharsetMatch {
+public class CharsetMatch implements Comparable {

    
    /**
     * Create a java.io.Reader for reading the Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
+     * <p/>
+     * CAUTION:  if the source of the byte data was an InputStream, a Reader
+     * can be created for only one matching char set using this method.  If more 
+     * than one charset needs to be tried, the caller will need to reset
+     * the InputStream and create InputStreamReaders itself, based on the Char Set name.
     *
     * @return the Reader for the Unicode character data.
     */
@ -98,7 +106,43 @@ public class CharsetMatch {
     * @return The name of the charset.
     */
    public String getName() {
-        return "";
+        return fRecognizer.getName();
+    }
+    
+    
+    /**
+     * Comparison function, for java.lang.Comparable
+     * Comparison is based on the match confidence value, which conveniently
+     *   allows CharsetDetector.detectAll() to order its results. 
+     */
+    public int compareTo (Object o) {
+        CharsetMatch other = (CharsetMatch)o;
+        int compareResult = 0;
+        if (this.fConfidence > other.fConfidence) {
+            compareResult = 1;
+        } else if (this.fConfidence < other.fConfidence) {
+            compareResult = -1;
+        }
+        return compareResult;
+    }
+    
+    
+    
+    /**
+     *  Constructor.  Implementation internal
+     *
+     */
+    CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+        fRecognizer = rec;
+        fConfidence = conf;
    }

+    
+    //
+    //   Private Data
+    //
+    private int                 fConfidence;
+    private CharsetRecognizer   fRecognizer;
+    
+
 }
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_UTF8.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_UTF8.java
@ -0,0 +1,92 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+/**
+ * Charset recognizer for UTF-8
+ *
+ */
+class CharsetRecog_UTF8 extends CharsetRecognizer {
+
+    String getName() {
+        return "UTF-8";
+    }
+
+    /* (non-Javadoc)
+     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
+     */
+    int match(CharsetDetector det) {
+        boolean     hasBOM = false;
+        int         numValid = 0;
+        int         numInvalid = 0;
+        byte        input[] = det.fInputBytes;
+        int         i;
+        int         trailBytes = 0;
+        int         confidence;
+        
+        if (det.fInputLen >= 3 && 
+                input[0]==0xef && input[1]==0xbb & input[2]==0xbf) {
+            hasBOM = true;
+        }
+        
+        // Scan for multi-byte sequences
+        for (i=0; i<det.fInputLen; i++) {
+            int b = input[i];
+            if ((b & 0x80) == 0) {
+                continue;   // ASCII
+            }
+            
+            // Hi bit on char found.  Figure out how long the sequence should be
+            if ((b & 0x0e0) == 0x0c0) {
+                trailBytes = 1;                
+            } else if ((b & 0x0f0) == 0x0e0) {
+                trailBytes = 2;
+            } else if ((b & 0x0f8) == 0xf0) {
+                trailBytes = 3;
+            } else {
+                numInvalid++;
+                trailBytes = 0;
+            }
+                
+            // Verify that we've got the right number of trail bytes in the sequence
+            for (;;) {
+                i++;
+                b = input[i];
+                if ((b & 0xc0) != 0x080) {
+                    numInvalid++;
+                    break;
+                }
+                if (--trailBytes == 0) {
+                    numValid++;
+                    break;
+                }
+            }
+                        
+        }
+        
+        // Cook up some sort of confidence score, based on presense of a BOM
+        //    and the existence of valid and/or invalid multi-byte sequences.
+        confidence = 0;
+        if (hasBOM && numInvalid==0) {
+            confidence = 100;
+        } else if (hasBOM && numValid > numInvalid) {
+            confidence = 80;
+        } else if (numValid > 3 && numInvalid == 0) {
+            confidence = 100;            
+        } else if (numValid > 0 && numInvalid == 0) {
+            confidence = 80;
+        } else if (numValid == 0 && numInvalid == 0) {
+            // Plain ASCII.  
+            confidence = 50;            
+        } else if (numValid > numInvalid) {
+            // Probably corruput utf-8 data.  Valid sequences aren't likely by chance.
+            confidence = 60;
+        }
+        return confidence;
+    }
+
+}
--- a/icu4j/src/com/ibm/icu/text/CharsetRecognizer.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecognizer.java
@ -0,0 +1,45 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+/**
+ * Abstract class for recognizing a single charset.
+ * Part of the implementation of ICU's CharsetDetector.
+ * 
+ * Each specific charset that can be recognized will have an instance
+ * of some subclass of this class.  All interaction between the overall
+ * CharsetDetector and the stuff specific to an individual charset happens
+ * via the interface provided here.
+ * 
+ * Instances of CharsetDetector DO NOT have or maintain 
+ * state pertaining to a specific match or detect operation.
+ * The WILL be shared by multiple instances of CharsetDetector.
+ * They encapsulate const charset-specific information.
+ * 
+ */
+abstract class CharsetRecognizer {
+    /**
+     * Get the IANA name of this charset.
+     * @return the charset name.
+     */
+    abstract String      getName();
+    
+    /**
+     * Test the match of this charset with the input text data
+     *      which is obtained via the CharsetDetector object.
+     * 
+     * @param det  The CharsetDetector, which contains the input text
+     *             to be checked for being in this charset.
+     * @return     Two values packed into one int  (Damn java, anyhow)
+     *             <br/>
+     *             bits 0-7:  the match confidence, ranging from 0-100
+     *             <br/>
+     *             bits 8-15: The match reason, an enum-like value.
+     */
+    abstract int         match(CharsetDetector det);
+
+}