ICU-124 Code Page Detection stub classes added.

X-SVN-Rev: 17268
2005-03-02 02:07:29 +00:00 · 2005-03-02 02:07:29 +00:00 · 04313f02eb
commit 04313f02eb
parent c3248589e2
2 changed files with 293 additions and 0 deletions
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -0,0 +1,189 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+import java.io.InputStream;
+import java.io.Reader;
+
+
+/**
+ *
+ * <code>CharsetDetector</code> provides a facility for detecting the
+ * charset or encoding of character data in an unknown format.
+ * The input data can either be from an input stream or an array of bytes.
+ * The result of the detection operation is a list of possibly matching
+ * charsets, or, for simple use, you can just ask for a Java Reader that
+ * will work over the input data.
+ * <p/>
+ * Character set detection is at best an imprecise operation.  The detection
+ * process will attempt to identify the charset that best matches the characteristics
+ * of the byte data, but the process is partly statistical in nature, and
+ * the results can not be guaranteed to always be correct.
+ * <p/>
+ * For best accuracy in charset detection, the input data should be primarily
+ * in a single language, and a minimum of a few hundred bytes worth of plain text
+ * in the language are needed.  The detection process will attempt to
+ * ignore html or xml style markup that could otherwise obscure the content.
+ * <p/>
+ * <b>Question:</b> Should we have getters corresponding to the setters for input text
+ * and declared encoding?
+ * <p/>
+ * <b>A thought:</b>  If we were to create our own type of Java Reader, we could defer
+ * figuring out an actual charset for data that starts out with too much English
+ * only ASCII until the user actually read through to something that didn't look
+ * like 7 bit English.  If nothing else ever appeared, we would never need to
+ * actually choose the "real" charset.  All assuming that the application just
+ * wants the data, and doesn't care about a char set name.
+ * <p/>
+ * NOTE: the methods below are currently stubs that return placeholder values (this or null).
+ */
+public class CharsetDetector {
+
+
+    /**
+     * Constructor.  The body is currently empty.
+     */
+    public CharsetDetector() {
+    }
+
+    /**
+     * Set the declared encoding for charset detection.  The declared encoding of an input text
+     * is an encoding obtained from an http header or xml declaration or similar source that can
+     * be provided as additional information to the charset detector.  A match between a declared
+     * encoding and a possible detected encoding will raise the quality of that detected encoding
+     * by a small delta, and will also appear as a "reason" for the match.
+     * <p/>
+     * A declared encoding that is incompatible with the input data being
+     * analyzed will not be added to the list of possible encodings.
+     * <p/>
+     * NOTE(review): the name is spelled "setDecaredEncoding"; docs below use setDeclaredEncoding.
+     * @param encoding The declared encoding
+     * @return This CharsetDetector
+     */
+    public CharsetDetector setDecaredEncoding(String encoding) {
+        return this;
+    }
+    
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     * @param in the input text of unknown encoding
+     * @return This CharsetDetector
+     */
+    public CharsetDetector setText(byte in[]) {
+        return this;
+    }
+    
+    /**
+     * Set the input text (byte) data whose charset is to be detected.
+     * <p/>
+     * The input stream that supplies the character data must have markSupported()
+     * == true; the charset detection process will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
+     *
+     * @param in the input text of unknown encoding
+     * @return This CharsetDetector
+     */
+    public CharsetDetector setText(InputStream in) {
+        return this;
+    }
+
+  
+    /**
+     * Return the charset that best matches the supplied input data.
+     *
+     * Note though, that because the detection
+     * only looks at the start of the input data,
+     * there is a possibility that the returned charset will fail to handle
+     * the full set of input data.
+     * <p/>
+     * Raise an exception if
+     *  <ul>
+     *    <li>no charset appears to match the data.</li>
+     *    <li>no input text has been provided</li>
+     *  </ul>
+     * (Not yet implemented: this stub raises nothing and always returns null.)
+     * @return a CharsetMatch object representing the best matching charset.
+     */
+    public CharsetMatch detect() {
+        return null;
+    }
+    
+    /**
+     * Return an array of all charsets that appear to be plausible
+     * matches with the input data.  The array is ordered with the
+     * best quality match first.
+     * <p/>
+     * Raise an exception if
+     *  <ul>
+     *    <li>no charsets appear to match the input data.</li>
+     *    <li>no input text has been provided</li>
+     *  </ul>
+     * (Not yet implemented: this stub raises nothing and always returns null.)
+     * @return An array of CharsetMatch objects representing possibly matching charsets.
+     */
+    public CharsetMatch[] detectAll() {
+        return null;
+    }
+
+    
+    /**
+     * Autodetect the charset of an inputStream, and return a Java Reader
+     * to access the converted input data.
+     * <p/>
+     * This is a convenience method that is equivalent to
+     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
+     * <p/>
+     * For the input stream that supplies the character data, markSupported()
+     * must be true; the charset detection will read a small amount of data,
+     * then return the stream to its original position via
+     * the InputStream.reset() operation.  The exact amount that will
+     * be read depends on the characteristics of the data itself.
+     * <p/>
+     * Raise an exception if no charsets appear to match the input data.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     * @return a Reader for the converted input data (this stub currently returns null).
+     */
+    public Reader getReader(InputStream in, String declaredEncoding) {
+        return null;
+    }
+
+    /**
+     * Autodetect the charset of the supplied byte data, and return a String
+     * containing the converted input data.
+     * <p/>
+     * This is a convenience method that is equivalent to
+     *   <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
+     * <p/>
+     * Raise an exception if no charsets appear to match the input data.
+     *
+     * @param in The source of the byte data in the unknown charset.
+     * @param declaredEncoding  A declared encoding for the data, if available,
+     *           or null or an empty string if none is available.
+     * @return a String of the converted input data (this stub currently returns null).
+     */
+    public String getString(byte[] in, String declaredEncoding) {
+        return null;
+    }
+
+ 
+    /**
+     * Get the names of all char sets that can be recognized by the char set detector.
+     *
+     * @return an array of the names of all charsets that can be recognized
+     * by the charset detector (this stub currently returns null).
+     */
+    public static String[] getAllDetectableCharsets() {
+        return null;
+    }
+
+
+}
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@ -0,0 +1,104 @@
+/**
+*******************************************************************************
+* Copyright (C) 2005, International Business Machines Corporation and         *
+* others. All Rights Reserved.                                                *
+*******************************************************************************
+*/
+package com.ibm.icu.text;
+
+import java.io.InputStream;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data.  From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for a Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ */
+public class CharsetMatch {
+
+    
+    /**
+     * Create a java.io.Reader for reading the Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     *
+     * @return the Reader for the Unicode character data (this stub currently returns null).
+     */
+    public Reader getReader() {
+        return null;
+    }
+    
+    
+
+    /**
+     * Create a Java String from Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     *
+     * @return a String created from the converted input data (this stub currently returns null).
+     */
+    public String getString() {
+        return null;
+
+    }
+    /**
+     * Create a Java String from Unicode character data corresponding
+     * to the original byte data supplied to the Charset detect operation.
+     * The length of the returned string is limited to the specified size;
+     * the string will be truncated to this length if necessary.  A limit value of
+     * zero or less is ignored, and treated as no limit.
+     *
+     * @param maxLength The maximum length of the String to be created.
+     * @return a String created from the converted input data (this stub currently returns null).
+     */
+    public String getString(int maxLength) {
+        return null;
+
+    }
+    
+    /**
+     * Get an indication of the confidence in the charset detected.
+     * Confidence values range from 0-100, with larger numbers indicating
+     * a better match of the input data to the characteristics of the
+     * charset.
+     *
+     * @return the confidence in the charset match (this stub currently returns 0).
+     */
+    public int getConfidence() {
+        return 0;
+    }
+    
+    /**
+     * Return an indication of what it was about the input data
+     * that caused this charset to be considered as a possible match.
+     * <p/>
+     * TODO: create a list of enum-like constants for the possible types of matches.
+     *
+     * @return the type of match found for this charset (this stub currently returns 0).
+     */
+    public int getMatchType() {
+        return 0;
+    }
+ 
+    
+
+    /**
+     * Get the name of the detected charset.
+     * The name will be one that can be used with other APIs on the
+     * platform that accept charset names.  It is the "Canonical name"
+     * as defined by the class java.nio.charset.Charset; for
+     * charsets that are registered with the IANA charset registry,
+     * this is the MIME-preferred registered name.
+     *
+     * @see java.nio.charset.Charset
+     * @see java.io.InputStreamReader
+     *
+     * @return The name of the charset (this stub currently returns an empty string).
+     */
+    public String getName() {
+        return "";
+    }
+
+}