ICU-124 Code Page Detection stub classes added.
X-SVN-Rev: 17268
This commit is contained in:
parent
c3248589e2
commit
04313f02eb
189
icu4j/src/com/ibm/icu/text/CharsetDetector.java
Normal file
189
icu4j/src/com/ibm/icu/text/CharsetDetector.java
Normal file
@ -0,0 +1,189 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* <code>CharsetDetector</code> provides a facility for detecting the
|
||||
* charset or encoding of character data in an unknown format.
|
||||
* The input data can either be from an input stream or an array of bytes.
|
||||
* The result of the detection operation is a list of possibly matching
|
||||
* charsets, or, for simple use, you can just ask for a Java Reader that
|
||||
* will work over the input data.
|
||||
* <p/>
|
||||
* Character set detection is at best an imprecise operation. The detection
|
||||
* process will attempt to identify the charset that best matches the characteristics
|
||||
* of the byte data, but the process is partly statistical in nature, and
|
||||
* the results can not be guaranteed to always be correct.
|
||||
* <p/>
|
||||
* For best accuracy in charset detection, the input data should be primarily
|
||||
* in a single language, and a minimum of a few hundred bytes worth of plain text
|
||||
* in the language are needed. The detection process will attempt to
|
||||
* ignore html or xml style markup that could otherwise obscure the content.
|
||||
* <p/>
|
||||
* <b>Question:</b> Should we have getters corresponding to the setters for input text
|
||||
* and declared encoding?
|
||||
* <p/>
|
||||
* <b>A thought:</b> If we were to create our own type of Java Reader, we could defer
|
||||
* figuring out an actual charset for data that starts out with too much English
|
||||
* only ASCII until the user actually read through to something that didn't look
|
||||
* like 7 bit English. If nothing else ever appeared, we would never need to
|
||||
* actually choose the "real" charset. All assuming that the application just
|
||||
* wants the data, and doesn't care about a char set name.
|
||||
*
|
||||
*
|
||||
*/
|
||||
public class CharsetDetector {
|
||||
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
public CharsetDetector() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the declared encoding for charset detection.
|
||||
* The declared encoding of an input text is an encoding obtained
|
||||
* from an http header or xml declaration or similar source that
|
||||
* can be provided as additional information to the charset detector.
|
||||
* A match between a declared encoding and a possible detected encoding
|
||||
* will raise the quality of that detected encoding by a small delta,
|
||||
* and will also appear as a "reason" for the match.
|
||||
* <p/>
|
||||
* A declared encoding that is incompatible with the input data being
|
||||
* analyzed will not be added to the list of possible encodings.
|
||||
*
|
||||
* @param encoding The declared encoding
|
||||
*/
|
||||
public CharsetDetector setDecaredEncoding(String encoding) {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text (byte) data whose charset is to be detected.
|
||||
* @param in the input text of unknown encoding
|
||||
* @return This CharsetDetector
|
||||
*/
|
||||
public CharsetDetector setText(byte in[]) {
|
||||
return this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set the input text (byte) data whose charset is to be detected.
|
||||
* <p/>
|
||||
* The input stream that supplies the character data must have markSupported()
|
||||
* == true; the charset detection process will read a small amount of data,
|
||||
* then return the stream to its original position via
|
||||
* the InputStream.reset() operation. The exact amount that will
|
||||
* be read depends on the characteristics of the data itself.
|
||||
|
||||
* @param in the input text of unknown encoding
|
||||
* @return This CharsetDetector
|
||||
*/
|
||||
public CharsetDetector setText(InputStream in) {
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Return the charset that best matches the supplied input data.
|
||||
*
|
||||
* Note though, that because the detection
|
||||
* only looks at the start of the input data,
|
||||
* there is a possibility that the returned charset will fail to handle
|
||||
* the full set of input data.
|
||||
* <p/>
|
||||
* Raise an exception if
|
||||
* <ul>
|
||||
* <li>no charset appears to match the data.</li>
|
||||
* <li>no input text has been provided</li>
|
||||
* </ul>
|
||||
*
|
||||
* @return a CharsetMatch object representing the best matching charset.
|
||||
*/
|
||||
public CharsetMatch detect() {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an array of all charsets that appear to be plausible
|
||||
* matches with the input data. The array is ordered with the
|
||||
* best quality match first.
|
||||
* <p/>
|
||||
* Raise an exception if
|
||||
* <ul>
|
||||
* <li>no charsets appear to match the input data.</li>
|
||||
* <li>no input text has been provided</li>
|
||||
* </ul>
|
||||
*
|
||||
* @return An array of CharsetMatch objects representing possibly matching charsets.
|
||||
*/
|
||||
public CharsetMatch[] detectAll() {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Autodetect the charset of an inputStream, and return a Java Reader
|
||||
* to access the converted input data.
|
||||
* <p/>
|
||||
* This is a convenience method that is equivalent to
|
||||
* <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getReader();</code>
|
||||
* <p/>
|
||||
* For the input stream that supplies the character data, markSupported()
|
||||
* must be true; the charset detection will read a small amount of data,
|
||||
* then return the stream to its original position via
|
||||
* the InputStream.reset() operation. The exact amount that will
|
||||
* be read depends on the characteristics of the data itself.
|
||||
*<p/>
|
||||
* Raise an exception if no charsets appear to match the input data.
|
||||
*
|
||||
* @param in The source of the byte data in the unknown charset.
|
||||
*
|
||||
* @param declaredEncoding A declared encoding for the data, if available,
|
||||
* or null or an empty string if none is available.
|
||||
*/
|
||||
public Reader getReader(InputStream in, String declaredEncoding) {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Autodetect the charset of an inputStream, and return a String
|
||||
* containing the converted input data.
|
||||
* <p/>
|
||||
* This is a convenience method that is equivalent to
|
||||
* <code>this.setDeclaredEncoding(declaredEncoding).setText(in).detect().getString();</code>
|
||||
*<p/>
|
||||
* Raise an exception if no charsets appear to match the input data.
|
||||
*
|
||||
* @param in The source of the byte data in the unknown charset.
|
||||
*
|
||||
* @param declaredEncoding A declared encoding for the data, if available,
|
||||
* or null or an empty string if none is available.
|
||||
*/
|
||||
public String getString(byte[] in, String declaredEncoding) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Get the names of all char sets that can be recognized by the char set detector.
|
||||
*
|
||||
* @return an array of the names of all charsets that can be recognized
|
||||
* by the charset detector.
|
||||
*/
|
||||
public static String[] getAllDetectableCharsets() {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
}
|
104
icu4j/src/com/ibm/icu/text/CharsetMatch.java
Normal file
104
icu4j/src/com/ibm/icu/text/CharsetMatch.java
Normal file
@ -0,0 +1,104 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
/**
|
||||
* This class represents a charset that has been identified by a CharsetDetector
|
||||
* as a possible encoding for a set of input data. From an instance of this
|
||||
* class, you can ask for a confidence level in the charset identification,
|
||||
* or for Java Reader or String to access the original byte data in Unicode form.
|
||||
* <p/>
|
||||
* Instances of this class are created only by CharsetDetectors.
|
||||
*/
|
||||
public class CharsetMatch {

    /**
     * Create a java.io.Reader for reading the Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     *
     * @return the Reader for the Unicode character data; the current stub
     *         always returns <code>null</code>.
     */
    public Reader getReader() {
        return null;
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     *
     * @return a String created from the converted input data; the current
     *         stub always returns <code>null</code>.
     */
    public String getString() {
        return null;
    }

    /**
     * Create a Java String from Unicode character data corresponding
     * to the original byte data supplied to the Charset detect operation.
     * The length of the returned string is limited to the specified size;
     * the string will be truncated to this length if necessary. A limit value of
     * zero or less is ignored, and treated as no limit.
     * <p>
     * Stub: the limit is currently ignored.
     *
     * @param maxLength The maximum length of the String to be created.
     * @return a String created from the converted input data; the current
     *         stub always returns <code>null</code>.
     */
    public String getString(int maxLength) {
        return null;
    }

    /**
     * Get an indication of the confidence in the charset detected.
     * Confidence values range from 0-100, with larger numbers indicating
     * a better match of the input data to the characteristics of the
     * charset.
     *
     * @return the confidence in the charset match; the current stub always
     *         returns 0.
     */
    public int getConfidence() {
        return 0;
    }

    /**
     * Return an indication of what it was about the input data
     * that caused this charset to be considered as a possible match.
     * <p>
     * TODO: create a list of enum-like constants for the possible types of matches.
     *
     * @return the type of match found for this charset; the current stub
     *         always returns 0.
     */
    public int getMatchType() {
        return 0;
    }

    /**
     * Get the name of the detected charset.
     * The name will be one that can be used with other APIs on the
     * platform that accept charset names. It is the "Canonical name"
     * as defined by the class java.nio.charset.Charset; for
     * charsets that are registered with the IANA charset registry,
     * this is the MIME-preferred registered name.
     *
     * @see java.nio.charset.Charset
     * @see java.io.InputStreamReader
     *
     * @return The name of the charset; the current stub always returns the
     *         empty string.
     */
    public String getName() {
        return "";
    }

}
|
Loading…
Reference in New Issue
Block a user