ICU-124 Code Page Detection stub classes added.

X-SVN-Rev: 17268
This commit is contained in:
Andy Heninger 2005-03-02 02:07:29 +00:00
parent c3248589e2
commit 04313f02eb
2 changed files with 293 additions and 0 deletions

View File

@ -0,0 +1,189 @@
/**
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.Reader;
/**
 * <code>CharsetDetector</code> provides a facility for detecting the
 * charset or encoding of character data in an unknown format.
 * The input data can either be from an input stream or an array of bytes.
 * The result of the detection operation is a list of possibly matching
 * charsets, or, for simple use, you can just ask for a Java Reader that
 * will work over the input data.
 * <p>
 * Character set detection is at best an imprecise operation. The detection
 * process will attempt to identify the charset that best matches the
 * characteristics of the byte data, but the process is partly statistical
 * in nature, and the results can not be guaranteed to always be correct.
 * <p>
 * For best accuracy in charset detection, the input data should be primarily
 * in a single language, and a minimum of a few hundred bytes worth of plain
 * text in the language are needed. The detection process will attempt to
 * ignore html or xml style markup that could otherwise obscure the content.
 * <p>
 * <b>Implementation status:</b> every method of this class is currently a
 * stub. The setters return <code>this</code> without recording anything and
 * the remaining methods return <code>null</code>; the behavior described in
 * the method documentation is the intended contract, not what the stub
 * bodies do today.
 * <p>
 * <b>Question:</b> Should we have getters corresponding to the setters for
 * input text and declared encoding?
 * <p>
 * <b>A thought:</b> If we were to create our own type of Java Reader, we could
 * defer figuring out an actual charset for data that starts out with too much
 * English only ASCII until the user actually read through to something that
 * didn't look like 7 bit English. If nothing else ever appeared, we would
 * never need to actually choose the "real" charset. All assuming that the
 * application just wants the data, and doesn't care about a char set name.
 */
public class CharsetDetector {

    /**
     * Constructor
     */
    public CharsetDetector() {
    }

    /**
     * Set the declared encoding for charset detection.
     * The declared encoding of an input text is an encoding obtained
     * from an http header or xml declaration or similar source that
     * can be provided as additional information to the charset detector.
     * A match between a declared encoding and a possible detected encoding
     * will raise the quality of that detected encoding by a small delta,
     * and will also appear as a "reason" for the match.
     * <p>
     * A declared encoding that is incompatible with the input data being
     * analyzed will not be added to the list of possible encodings.
     * <p>
     * Stub: the argument is currently ignored.
     *
     * @param encoding The declared encoding
     * @return This CharsetDetector
     */
    public CharsetDetector setDeclaredEncoding(String encoding) {
        return this;
    }

    /**
     * Misspelled alias of {@link #setDeclaredEncoding(String)}, retained only
     * so that callers compiled against the original name keep working.
     *
     * @param encoding The declared encoding
     * @return This CharsetDetector
     * @deprecated Use {@link #setDeclaredEncoding(String)} instead.
     */
    public CharsetDetector setDecaredEncoding(String encoding) {
        // Delegate so both spellings stay in sync once the setter is implemented.
        return setDeclaredEncoding(encoding);
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * <p>
     * Stub: the argument is currently ignored.
     *
     * @param in the input text of unknown encoding
     * @return This CharsetDetector
     */
    public CharsetDetector setText(byte[] in) {
        return this;
    }

    /**
     * Set the input text (byte) data whose charset is to be detected.
     * <p>
     * The input stream that supplies the character data must have
     * markSupported() == true; the charset detection process will read a
     * small amount of data, then return the stream to its original position
     * via the InputStream.reset() operation. The exact amount that will
     * be read depends on the characteristics of the data itself.
     * <p>
     * Stub: the stream is currently neither read nor retained.
     *
     * @param in the input text of unknown encoding
     * @return This CharsetDetector
     */
    public CharsetDetector setText(InputStream in) {
        return this;
    }

    /**
     * Return the charset that best matches the supplied input data.
     * <p>
     * Note though, that because the detection only looks at the start of the
     * input data, there is a possibility that the returned charset will fail
     * to handle the full set of input data.
     * <p>
     * Raise an exception if
     * <ul>
     *   <li>no charset appears to match the data.</li>
     *   <li>no input text has been provided</li>
     * </ul>
     * <p>
     * Stub: currently always returns <code>null</code> and raises nothing.
     *
     * @return a CharsetMatch object representing the best matching charset.
     */
    public CharsetMatch detect() {
        return null;
    }

    /**
     * Return an array of all charsets that appear to be plausible
     * matches with the input data. The array is ordered with the
     * best quality match first.
     * <p>
     * Raise an exception if
     * <ul>
     *   <li>no charsets appear to match the input data.</li>
     *   <li>no input text has been provided</li>
     * </ul>
     * <p>
     * Stub: currently always returns <code>null</code> and raises nothing.
     *
     * @return An array of CharsetMatch objects representing possibly
     *         matching charsets.
     */
    public CharsetMatch[] detectAll() {
        return null;
    }

    /**
     * Autodetect the charset of an inputStream, and return a Java Reader
     * to access the converted input data.
     * <p>
     * This is a convenience method that is equivalent to
     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
     * <p>
     * For the input stream that supplies the character data, markSupported()
     * must be true; the charset detection will read a small amount of data,
     * then return the stream to its original position via
     * the InputStream.reset() operation. The exact amount that will
     * be read depends on the characteristics of the data itself.
     * <p>
     * Raise an exception if no charsets appear to match the input data.
     * <p>
     * Stub: currently always returns <code>null</code> and raises nothing.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *            or null or an empty string if none is available.
     * @return a Reader over the converted input data.
     */
    public Reader getReader(InputStream in, String declaredEncoding) {
        return null;
    }

    /**
     * Autodetect the charset of an array of bytes, and return a String
     * containing the converted input data.
     * <p>
     * This is a convenience method that is equivalent to
     * <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
     * <p>
     * Raise an exception if no charsets appear to match the input data.
     * <p>
     * Stub: currently always returns <code>null</code> and raises nothing.
     *
     * @param in The source of the byte data in the unknown charset.
     *
     * @param declaredEncoding A declared encoding for the data, if available,
     *            or null or an empty string if none is available.
     * @return a String containing the converted input data.
     */
    public String getString(byte[] in, String declaredEncoding) {
        return null;
    }

    /**
     * Get the names of all char sets that can be recognized by the char set
     * detector.
     * <p>
     * Stub: currently always returns <code>null</code>.
     *
     * @return an array of the names of all charsets that can be recognized
     *         by the charset detector.
     */
    public static String[] getAllDetectableCharsets() {
        return null;
    }
}

View File

@ -0,0 +1,104 @@
/**
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import java.io.InputStream;
import java.io.Reader;
/**
 * A single candidate result of a charset detection operation: one charset
 * that a CharsetDetector considers a possible encoding for the input data
 * it was given. A match can report how confident the detector is in the
 * identification, and can hand back the original byte data in Unicode form,
 * either as a Java Reader or as a String.
 * <p>
 * Instances of this class are meant to be created only by CharsetDetectors.
 * <p>
 * All methods are currently stubs that return fixed placeholder values.
 */
public class CharsetMatch {

    /**
     * Create a java.io.Reader over the Unicode character data that
     * corresponds to the original byte data supplied to the charset
     * detect operation.
     * <p>
     * Stub: currently always returns <code>null</code>.
     *
     * @return the Reader for the Unicode character data.
     */
    public Reader getReader() {
        return null;
    }

    /**
     * Create a Java String holding the Unicode character data that
     * corresponds to the original byte data supplied to the charset
     * detect operation.
     * <p>
     * Stub: currently always returns <code>null</code>.
     *
     * @return a String created from the converted input data.
     */
    public String getString() {
        return null;
    }

    /**
     * Length-limited variant of {@link #getString()}.
     * The returned string is limited to the specified size and will be
     * truncated to that length if necessary. A limit value of zero or less
     * is ignored, and treated as no limit.
     * <p>
     * Stub: currently always returns <code>null</code>; the limit is unused.
     *
     * @param maxLength The maximum length of the String to be created.
     * @return a String created from the converted input data.
     */
    public String getString(int maxLength) {
        return null;
    }

    /**
     * Get an indication of the confidence in the charset detected.
     * Confidence values range from 0-100, with larger numbers indicating
     * a better match of the input data to the characteristics of the
     * charset.
     * <p>
     * Stub: currently always returns 0.
     *
     * @return the confidence in the charset match
     */
    public int getConfidence() {
        return 0;
    }

    /**
     * Return an indication of what it was about the input data that
     * caused this charset to be considered as a possible match.
     * <p>
     * TODO: create a list of enum-like constants for the possible types of
     * matches.
     * <p>
     * Stub: currently always returns 0.
     *
     * @return the type of match found for this charset.
     */
    public int getMatchType() {
        return 0;
    }

    /**
     * Get the name of the detected charset.
     * The name will be one that can be used with other APIs on the
     * platform that accept charset names. It is the "Canonical name"
     * as defined by the class java.nio.charset.Charset; for
     * charsets that are registered with the IANA charset registry,
     * this is the MIME-preferred registered name.
     * <p>
     * Stub: currently always returns the empty string.
     *
     * @see java.nio.charset.Charset
     * @see java.io.InputStreamReader
     *
     * @return The name of the charset.
     */
    public String getName() {
        return "";
    }
}