ICU-124 charset detector, work in progress.

X-SVN-Rev: 17471
This commit is contained in:
Andy Heninger 2005-04-08 01:27:36 +00:00
parent cc7f22046e
commit 757d1a33fe
5 changed files with 402 additions and 9 deletions

View File

@ -0,0 +1,58 @@
/**
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.dev.test.charsetdet;
import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.text.*;
/**
 * Basic sanity tests for the charset detector.
 *
 * @author andy
 */
public class TestCharsetDetector extends TestFmwk {

    /**
     * Constructor. There is no per-instance state to set up.
     */
    public TestCharsetDetector() {
    }

    /**
     * Command-line entry point: runs this test class with the supplied
     * arguments. An exception escaping the run is printed, not rethrown.
     *
     * @param args arguments handed straight to the test framework's run()
     */
    public static void main(String[] args) {
        try {
            new TestCharsetDetector().run(args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Checks that a detector can be constructed, and that the list of
     * detectable charsets is non-empty and holds no empty names.
     */
    public void TestConstruction() {
        // Constructing a detector without failure is part of what is tested.
        new CharsetDetector();

        String[] names = CharsetDetector.getAllDetectableCharsets();
        if (names.length == 0) {
            errln("TestCharsetDetector TestConstruction #0001");
        }

        int index = 0;
        while (index < names.length) {
            if (names[index].length() == 0) {
                errln("TestCharsetDetector TestConstruction #0002. i=" + index);
            }
            index++;
        }
    }
}

View File

@ -8,6 +8,11 @@ package com.ibm.icu.text;
import java.io.InputStream;
import java.io.Reader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Collections;
import java.util.Arrays;
/**
@ -64,7 +69,8 @@ public class CharsetDetector {
*
* @param encoding The declared encoding
*/
public CharsetDetector setDecaredEncoding(String encoding) {
public CharsetDetector setDeclaredEncoding(String encoding) {
// Only records the caller's hint; nothing is validated or interpreted here.
fDeclaredEncoding = encoding;
// Hands back this same detector object.
return this;
}
@ -73,7 +79,9 @@ public class CharsetDetector {
* @param in the input text of unknown encoding
* @return This CharsetDetector
*/
public CharsetDetector setText(byte in[]) {
public CharsetDetector setText(byte [] in) {
// The caller's array is kept by reference; no copy is made here.
fRawInput = in;
// The entire array is treated as input (throws NullPointerException if in is null).
fRawLength = in.length;
return this;
}
@ -89,7 +97,14 @@ public class CharsetDetector {
* @param in the input text of unknown encoding
* @return This CharsetDetector
*/
public CharsetDetector setText(InputStream in) {
public CharsetDetector setText(InputStream in) throws IOException {
fInputStream = in;
fInputStream.mark(4000);
fRawInput = new byte[4000]; // Always make a new buffer because the
// previous one may have come from the caller,
// in which case we can't touch it.
fRawLength = fInputStream.read(fRawInput);
fInputStream.reset();
return this;
}
@ -109,10 +124,15 @@ public class CharsetDetector {
* </ul>
*
* @return a CharsetMatch object representing the best matching charset.
*
* TODO: A better implementation would be to copy the detect loop from
* detectAll(), and cut it short as soon as a match with a high confidence
* is found. This is something to be done later, after things are otherwise
* working.
*/
public CharsetMatch detect() {
return null;
}
return detectAll()[0];
}
/**
* Return an array of all charsets that appear to be plausible
@ -128,7 +148,28 @@ public class CharsetDetector {
* @return An array of CharsetMatch objects representing possibly matching charsets.
*/
public CharsetMatch[] detectAll() {
    ArrayList matches = new ArrayList();

    // Iterate over all possible charsets, remember all that
    // give a match quality > 0.
    for (int i = 0; i < fCSRecognizers.size(); i++) {
        CharsetRecognizer csr = (CharsetRecognizer) fCSRecognizers.get(i);

        // match() packs two values into one int; the confidence is the low byte.
        int detectResults = csr.match(this);
        int confidence = detectResults & 0x000000ff;

        if (confidence > 0) {
            // Store the CharsetMatch, not the recognizer: the list is sorted
            // through CharsetMatch.compareTo() and copied into a
            // CharsetMatch[] below, both of which fail on any other type.
            matches.add(new CharsetMatch(this, csr, confidence));
        }
    }

    Collections.sort(matches);      // CharsetMatch compares on confidence
    Collections.reverse(matches);   // Put best match first.

    CharsetMatch[] resultArray = new CharsetMatch[matches.size()];
    matches.toArray(resultArray);
    return resultArray;
}
@ -182,8 +223,121 @@ public class CharsetDetector {
* by the charset detector.
*/
public static String[] getAllDetectableCharsets() {
return null;
// NOTE(review): this hands out the shared static array itself, not a copy,
// so a caller that modifies it changes what every later caller sees.
return fCharsetNames;
}
/**
 * MungeInput - after getting a set of raw input data to be analyzed, preprocess
 * it by removing what appears to be html markup.
 * <p/>
 * Reads fRawInput / fRawLength and fills in fInputBytes, fInputLen and
 * fByteStats. Never stores more than fInputBytes.length bytes, so raw input
 * longer than the working buffer is truncated rather than overrunning it.
 */
private void MungeInput() {
    final int capacity = fInputBytes.length;
    int srci;
    int dsti = 0;
    boolean inMarkup = false;
    int openTags = 0;
    int badTags = 0;

    //
    // html / xml markup stripping.
    // quick and dirty, not 100% accurate, but hopefully good enough, statistically.
    // discard everything within < brackets >
    // Count how many total '<' and illegal (nested) '<' occur, so we can make some
    // guess as to whether the input was actually marked up at all.
    //
    // Stop once the working buffer is full: a caller-supplied byte array may be
    // longer than fInputBytes.
    for (srci = 0; srci < fRawLength && dsti < capacity; srci++) {
        byte b = fRawInput[srci];
        if (b == (byte)'<') {
            if (inMarkup) {
                badTags++;
            }
            inMarkup = true;
            openTags++;
        }
        if (!inMarkup) {
            fInputBytes[dsti++] = b;
        }
        if (b == (byte)'>') {
            inMarkup = false;
        }
    }
    fInputLen = dsti;

    //
    // If it looks like this input wasn't marked up, or if it looks like it's
    // essentially nothing but markup abandon the markup stripping.
    // Detection will have to work on the unstripped input.
    //
    if (openTags < 5 || openTags / 5 < badTags ||
            (fInputLen < 100 && fRawLength > 600)) {
        // Copy at most one buffer's worth of the raw bytes.
        int limit = fRawLength;
        if (limit > capacity) {
            limit = capacity;
        }
        for (srci = 0; srci < limit; srci++) {
            fInputBytes[srci] = fRawInput[srci];
        }
        fInputLen = srci;
    }

    //
    // Tally up the byte occurrence statistics.
    // These are available for use by the various detectors.
    //
    if (fByteStats == null) {
        // One counter per possible unsigned byte value. Allocated here in case
        // nothing else has done so yet.
        fByteStats = new short[256];
    }
    Arrays.fill(fByteStats, (short)0);
    for (srci = 0; srci < fInputLen; srci++) {
        int val = fInputBytes[srci] & 0x00ff;   // mask: bytes are signed in Java
        fByteStats[val]++;
    }
}
/**
 * The following items are accessed by individual CharsetRecognizers during
 * the recognition process.
 */
byte[] fInputBytes =            // The text to be checked. Markup will have been
    new byte[4000];             //   removed if appropriate.
int fInputLen;                  // Length of the byte data in fInputBytes.
short fByteStats[] =            // Byte occurrence statistics for the input text,
    new short[256];             //   one slot per unsigned byte value. Allocated up
                                //   front so MungeInput() can always fill it.
                                // NOTE(review): originally described as a rounded-up
                                //   percentage, but MungeInput() stores raw occurrence
                                //   counts - confirm which unit is intended.
String fDeclaredEncoding;       // Encoding hint recorded by setDeclaredEncoding().

//
// Stuff private to CharsetDetector
//
private byte[] fRawInput;           // Original, untouched input bytes.
                                    //   If user gave us a byte array, this is it.
                                    //   If user gave us a stream, it's read to a
                                    //   buffer here.
private int fRawLength;             // Length of data in fRawInput array.
private InputStream fInputStream;   // User's input stream, or null if the user
                                    //   gave us a byte array.

/**
 * List of recognizers for all charsets known to the implementation.
 */
private static ArrayList fCSRecognizers = createRecognizers();

// Names of all detectable charsets. Deliberately declared WITHOUT an initializer
// and AFTER fCSRecognizers: createRecognizers() assigns it as a side effect, and
// static initializers run in textual order, so an initializer here would
// overwrite that value.
private static String [] fCharsetNames;
/**
 * Build the shared recognizer instances, one per charset the detector knows
 * about.
 * <p/>
 * Side effect: also fills in fCharsetNames with the name reported by each
 * recognizer, in list order, for use by getAllDetectableCharsets().
 *
 * @return the list of CharsetRecognizer instances
 */
private static ArrayList createRecognizers() {
    ArrayList list = new ArrayList();
    list.add(new CharsetRecog_UTF8());

    // Record every recognizer's charset name alongside the list itself.
    fCharsetNames = new String [list.size()];
    int slot = 0;
    while (slot < list.size()) {
        CharsetRecognizer rec = (CharsetRecognizer)list.get(slot);
        fCharsetNames[slot] = rec.getName();
        slot++;
    }
    return list;
}
}

View File

@ -17,13 +17,21 @@ import java.io.Reader;
* or for Java Reader or String to access the original byte data in Unicode form.
* <p/>
* Instances of this class are created only by CharsetDetectors.
* <p/>
* Note: this class has a natural ordering that is inconsistent with equals.
* The natural ordering is based on the match confidence value.
*/
public class CharsetMatch {
public class CharsetMatch implements Comparable {
/**
* Create a java.io.Reader for reading the Unicode character data corresponding
* to the original byte data supplied to the Charset detect operation.
* <p/>
* CAUTION: if the source of the byte data was an InputStream, a Reader
* can be created for only one matching char set using this method. If more
* than one charset needs to be tried, the caller will need to reset
* the InputStream and create InputStreamReaders itself, based on the Char Set name.
*
* @return the Reader for the Unicode character data.
*/
@ -98,7 +106,43 @@ public class CharsetMatch {
* @return The name of the charset.
*/
public String getName() {
return "";
// The name comes from the recognizer that produced this match.
return fRecognizer.getName();
}
/**
 * Ordering for java.lang.Comparable, based solely on the match confidence.
 * CharsetDetector.detectAll() relies on this to sort its results.
 *
 * @param o the CharsetMatch to compare against
 * @return 1 if this match has the higher confidence, -1 if it has the
 *         lower confidence, 0 if the two confidences are equal
 * @throws ClassCastException if o is not a CharsetMatch
 */
public int compareTo (Object o) {
    int theirs = ((CharsetMatch)o).fConfidence;
    if (fConfidence == theirs) {
        return 0;
    }
    return (fConfidence > theirs) ? 1 : -1;
}
/**
 * Constructor. Implementation internal.
 *
 * @param det  the detector creating this match (not stored by this constructor)
 * @param rec  the recognizer that matched; supplies the charset name
 * @param conf the match confidence, used for ordering matches
 */
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
    this.fConfidence = conf;
    this.fRecognizer = rec;
}
//
// Private Data
//
// Match confidence; the only value compareTo() orders on.
private int fConfidence;
// Recognizer that produced this match; getName() delegates to it.
private CharsetRecognizer fRecognizer;
}

View File

@ -0,0 +1,92 @@
/**
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
/**
 * Charset recognizer for UTF-8.
 * <p/>
 * Looks for a UTF-8 byte order mark and counts well-formed versus malformed
 * multi-byte sequences in the detector's input, then turns those counts into
 * a confidence score.
 */
class CharsetRecog_UTF8 extends CharsetRecognizer {

    /**
     * @return the name of the charset this recognizer detects, "UTF-8"
     */
    String getName() {
        return "UTF-8";
    }

    /**
     * Score how likely it is that the detector's input is UTF-8.
     *
     * @param det the CharsetDetector holding the input; fInputBytes and
     *            fInputLen are read
     * @return the match confidence, 0-100. This implementation returns the
     *         confidence only and sets no bits above the low byte.
     * @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
     */
    int match(CharsetDetector det) {
        boolean hasBOM = false;
        int numValid = 0;
        int numInvalid = 0;
        byte input[] = det.fInputBytes;
        int i;
        int trailBytes = 0;
        int confidence;

        // Java bytes are signed, so each byte must be masked to 0..255 before
        // being compared with the BOM values EF BB BF; unmasked, the comparison
        // can never be true.
        if (det.fInputLen >= 3 &&
                (input[0] & 0xFF) == 0xef &&
                (input[1] & 0xFF) == 0xbb &&
                (input[2] & 0xFF) == 0xbf) {
            hasBOM = true;
        }

        // Scan for multi-byte sequences
        for (i = 0; i < det.fInputLen; i++) {
            int b = input[i];
            if ((b & 0x80) == 0) {
                continue;   // ASCII
            }

            // Hi bit on char found. Figure out how long the sequence should be
            if ((b & 0x0e0) == 0x0c0) {
                trailBytes = 1;
            } else if ((b & 0x0f0) == 0x0e0) {
                trailBytes = 2;
            } else if ((b & 0x0f8) == 0xf0) {
                trailBytes = 3;
            } else {
                // Not a legal lead byte (a stray trail byte, or 0xF8 and above).
                // Count it once and move on; there are no trail bytes to verify,
                // and the following byte must be examined in its own right.
                numInvalid++;
                continue;
            }

            // Verify that we've got the right number of trail bytes in the sequence
            for (;;) {
                i++;
                if (i >= det.fInputLen) {
                    // Input ended in the middle of a sequence, most likely
                    // because the sample was cut off. Count it as neither
                    // valid nor invalid, and do not read beyond the data.
                    break;
                }
                b = input[i];
                if ((b & 0xc0) != 0x080) {
                    numInvalid++;
                    break;
                }
                if (--trailBytes == 0) {
                    numValid++;
                    break;
                }
            }
        }

        // Cook up some sort of confidence score, based on presence of a BOM
        // and the existence of valid and/or invalid multi-byte sequences.
        confidence = 0;
        if (hasBOM && numInvalid == 0) {
            confidence = 100;
        } else if (hasBOM && numValid > numInvalid) {
            confidence = 80;
        } else if (numValid > 3 && numInvalid == 0) {
            confidence = 100;
        } else if (numValid > 0 && numInvalid == 0) {
            confidence = 80;
        } else if (numValid == 0 && numInvalid == 0) {
            // Plain ASCII.
            confidence = 50;
        } else if (numValid > numInvalid) {
            // Probably corrupt utf-8 data. Valid sequences aren't likely by chance.
            confidence = 60;
        }
        return confidence;
    }
}

View File

@ -0,0 +1,45 @@
/**
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
/**
 * Abstract class for recognizing a single charset.
 * Part of the implementation of ICU's CharsetDetector.
 * <p/>
 * Each specific charset that can be recognized will have an instance
 * of some subclass of this class. All interaction between the overall
 * CharsetDetector and the stuff specific to an individual charset happens
 * via the interface provided here.
 * <p/>
 * Instances of CharsetRecognizer DO NOT have or maintain
 * state pertaining to a specific match or detect operation.
 * They WILL be shared by multiple instances of CharsetDetector.
 * They encapsulate constant, charset-specific information.
 */
abstract class CharsetRecognizer {
/**
 * Get the IANA name of this charset.
 *
 * @return the charset name.
 */
abstract String getName();
/**
 * Test the match of this charset with the input text data
 * which is obtained via the CharsetDetector object.
 *
 * @param det The CharsetDetector, which contains the input text
 *            to be checked for being in this charset.
 * @return Two values packed into one int, since a Java method
 *         can return only a single value:
 *         <br/>
 *         bits 0-7: the match confidence, ranging from 0-100
 *         <br/>
 *         bits 8-15: The match reason, an enum-like value.
 */
abstract int match(CharsetDetector det);
}