ICU-124 charset detector, work in progress.
X-SVN-Rev: 17471
This commit is contained in:
parent
cc7f22046e
commit
757d1a33fe
@ -0,0 +1,58 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.dev.test.charsetdet;
|
||||
|
||||
import com.ibm.icu.dev.test.TestFmwk;
|
||||
import com.ibm.icu.text.*;
|
||||
|
||||
|
||||
/**
|
||||
* @author andy
|
||||
*
|
||||
* TODO To change the template for this generated type comment go to
|
||||
* Window - Preferences - Java - Code Style - Code Templates
|
||||
*/
|
||||
public class TestCharsetDetector extends TestFmwk {
|
||||
|
||||
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
public TestCharsetDetector()
|
||||
{
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
try
|
||||
{
|
||||
TestCharsetDetector test = new TestCharsetDetector();
|
||||
test.run(args);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
e.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void TestConstruction() {
|
||||
int i;
|
||||
CharsetDetector det = new CharsetDetector();
|
||||
|
||||
String [] charsetNames = CharsetDetector.getAllDetectableCharsets();
|
||||
if (charsetNames.length == 0) {
|
||||
errln("TestCharsetDetector TestConstruction #0001");
|
||||
}
|
||||
for (i=0; i<charsetNames.length; i++) {
|
||||
if (charsetNames[i].equals("")) {
|
||||
errln("TestCharsetDetector TestConstruction #0002. i=" + i);
|
||||
}
|
||||
// System.out.println("\"" + charsetNames[i] + "\"");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -8,6 +8,11 @@ package com.ibm.icu.text;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Collections;
|
||||
import java.util.Arrays;
|
||||
|
||||
|
||||
/**
|
||||
@ -64,7 +69,8 @@ public class CharsetDetector {
|
||||
*
|
||||
* @param encoding The declared encoding
|
||||
*/
|
||||
public CharsetDetector setDecaredEncoding(String encoding) {
|
||||
public CharsetDetector setDeclaredEncoding(String encoding) {
|
||||
fDeclaredEncoding = encoding;
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -73,7 +79,9 @@ public class CharsetDetector {
|
||||
* @param in the input text of unknown encoding
|
||||
* @return This CharsetDetector
|
||||
*/
|
||||
public CharsetDetector setText(byte in[]) {
|
||||
public CharsetDetector setText(byte [] in) {
|
||||
fRawInput = in;
|
||||
fRawLength = in.length;
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -89,7 +97,14 @@ public class CharsetDetector {
|
||||
* @param in the input text of unknown encoding
|
||||
* @return This CharsetDetector
|
||||
*/
|
||||
public CharsetDetector setText(InputStream in) {
|
||||
public CharsetDetector setText(InputStream in) throws IOException {
|
||||
fInputStream = in;
|
||||
fInputStream.mark(4000);
|
||||
fRawInput = new byte[4000]; // Always make a new buffer because the
|
||||
// previous one may have come from the caller,
|
||||
// in which case we can't touch it.
|
||||
fRawLength = fInputStream.read(fRawInput);
|
||||
fInputStream.reset();
|
||||
return this;
|
||||
}
|
||||
|
||||
@ -109,10 +124,15 @@ public class CharsetDetector {
|
||||
* </ul>
|
||||
*
|
||||
* @return a CharsetMatch object representing the best matching charset.
|
||||
*
|
||||
* TODO: A better implementation would be to copy the detect loop from
|
||||
* detectAll(), and cut it short as soon as a match with a high confidence
|
||||
* is found. This is something to be done later, after things are otherwise
|
||||
* working.
|
||||
*/
|
||||
public CharsetMatch detect() {
|
||||
return null;
|
||||
}
|
||||
return detectAll()[0];
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an array of all charsets that appear to be plausible
|
||||
@ -128,7 +148,28 @@ public class CharsetDetector {
|
||||
* @return An array of CharsetMatch objects representing possibly matching charsets.
|
||||
*/
|
||||
public CharsetMatch[] detectAll() {
|
||||
return null;
|
||||
CharsetRecognizer csr;
|
||||
int i;
|
||||
int detectResults;
|
||||
int confidence;
|
||||
ArrayList matches = new ArrayList();
|
||||
|
||||
// Iterate over all possible charsets, remember all that
|
||||
// give a match quality > 0.
|
||||
for (i=0; i<fCSRecognizers.size(); i++) {
|
||||
csr = (CharsetRecognizer)fCSRecognizers.get(i);
|
||||
detectResults = csr.match(this);
|
||||
confidence = detectResults & 0x000000ff;
|
||||
if (confidence > 0) {
|
||||
CharsetMatch m = new CharsetMatch(this, csr, confidence);
|
||||
matches.add(csr);
|
||||
}
|
||||
}
|
||||
Collections.sort(matches); // CharsetMatch compares on confidence
|
||||
Collections.reverse(matches); // Put best match first.
|
||||
CharsetMatch [] resultArray = new CharsetMatch[matches.size()];
|
||||
matches.toArray(resultArray);
|
||||
return resultArray;
|
||||
}
|
||||
|
||||
|
||||
@ -182,8 +223,121 @@ public class CharsetDetector {
|
||||
* by the charset detector.
|
||||
*/
|
||||
public static String[] getAllDetectableCharsets() {
|
||||
return null;
|
||||
return fCharsetNames;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
|
||||
* it by removing what appears to be html markup.
|
||||
*/
|
||||
private void MungeInput() {
|
||||
int srci = 0;
|
||||
int dsti = 0;
|
||||
byte b;
|
||||
boolean inMarkup = false;
|
||||
int openTags = 0;
|
||||
int badTags = 0;
|
||||
|
||||
//
|
||||
// html / xml markup stripping.
|
||||
// quick and dirty, not 100% accurate, but hopefully good enough, statistically.
|
||||
// discard everything within < brackets >
|
||||
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
|
||||
// guess as to whether the input was actually marked up at all.
|
||||
for (srci=0; srci<fRawLength; srci++) {
|
||||
b = fRawInput[srci];
|
||||
if (b == (byte)'<') {
|
||||
if (inMarkup) {
|
||||
badTags++;
|
||||
}
|
||||
inMarkup = true;
|
||||
openTags++;
|
||||
}
|
||||
if (inMarkup == false) {
|
||||
fInputBytes[dsti++] = b;
|
||||
}
|
||||
|
||||
if (b == (byte)'>') {
|
||||
inMarkup = false;
|
||||
}
|
||||
}
|
||||
fInputLen = dsti;
|
||||
|
||||
//
|
||||
// If it looks like this input wasn't marked up, or if it looks like it's
|
||||
// essentially nothing but markup abandon the markup stripping.
|
||||
// Detection will have to work on the unstripped input.
|
||||
//
|
||||
if (openTags<5 || openTags/5 < badTags ||
|
||||
(fInputLen < 100 && fRawLength>600)) {
|
||||
for (srci=0; srci<fRawLength; srci++) {
|
||||
fInputBytes[srci] = fRawInput[srci];
|
||||
}
|
||||
fInputLen = srci;
|
||||
}
|
||||
|
||||
//
|
||||
// Tally up the byte occurence statistics.
|
||||
// These are available for use by the various detectors.
|
||||
//
|
||||
Arrays.fill(fByteStats, (short)0);
|
||||
for (srci=0; srci<fInputLen; srci++) {
|
||||
int val = fInputBytes[srci] & 0x00ff;
|
||||
fByteStats[val]++;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The following items are accessed by individual CharsetRecognizers during
|
||||
* the recognition process
|
||||
*/
|
||||
byte[] fInputBytes = // The text to be checked. Markup will have been
|
||||
new byte[4000];// removed if appropriate.
|
||||
|
||||
int fInputLen; // Length of the byte data in fInputBytes.
|
||||
|
||||
short fByteStats[]; // byte frequency statistics for the input text.
|
||||
// Value is percent, not absolute.
|
||||
// Value is rounded up, so zero really means zero occurences.
|
||||
|
||||
String fDeclaredEncoding;
|
||||
|
||||
|
||||
|
||||
//
|
||||
// Stuff private to CharsetDetector
|
||||
//
|
||||
private byte[] fRawInput; // Original, untouched input bytes.
|
||||
// If user gave us a byte array, this is it.
|
||||
// If user gave us a stream, it's read to a
|
||||
// buffer here.
|
||||
private int fRawLength; // Length of data in fRawInput array.
|
||||
|
||||
private InputStream fInputStream; // User's input stream, or null if the user
|
||||
// gave us a byte array.
|
||||
|
||||
|
||||
/**
|
||||
* List of recognizers for all charsets known to the implementation.
|
||||
*
|
||||
*/
|
||||
private static ArrayList fCSRecognizers = createRecognizers();
|
||||
private static String [] fCharsetNames;
|
||||
|
||||
/**
|
||||
* Create the singleton instances of the CharsetRecognizer classes
|
||||
*/
|
||||
private static ArrayList createRecognizers() {
|
||||
ArrayList recognizers = new ArrayList();
|
||||
recognizers.add(new CharsetRecog_UTF8());
|
||||
|
||||
// Create an array of all charset names, as a side effect.
|
||||
// Needed for the getAllDetectableCharsets() API.
|
||||
fCharsetNames = new String [recognizers.size()];
|
||||
for (int i=0; i<recognizers.size(); i++) {
|
||||
fCharsetNames[i] = ((CharsetRecognizer)recognizers.get(i)).getName();
|
||||
}
|
||||
return recognizers;
|
||||
}
|
||||
}
|
||||
|
@ -17,13 +17,21 @@ import java.io.Reader;
|
||||
* or for Java Reader or String to access the original byte data in Unicode form.
|
||||
* <p/>
|
||||
* Instances of this class are created only by CharsetDetectors.
|
||||
* <p/>
|
||||
* Note: this class has a natural ordering that is inconsistent with equals.
|
||||
* The natural ordering is based on the match confidence value.
|
||||
*/
|
||||
public class CharsetMatch {
|
||||
public class CharsetMatch implements Comparable {
|
||||
|
||||
|
||||
/**
|
||||
* Create a java.io.Reader for reading the Unicode character data corresponding
|
||||
* to the original byte data supplied to the Charset detect operation.
|
||||
* <p/>
|
||||
* CAUTION: if the source of the byte data was an InputStream, a Reader
|
||||
* can be created for only one matching char set using this method. If more
|
||||
* than one charset needs to be tried, the caller will need to reset
|
||||
* the InputStream and create InputStreamReaders itself, based on the Char Set name.
|
||||
*
|
||||
* @return the Reader for the Unicode character data.
|
||||
*/
|
||||
@ -98,7 +106,43 @@ public class CharsetMatch {
|
||||
* @return The name of the charset.
|
||||
*/
|
||||
public String getName() {
|
||||
return "";
|
||||
return fRecognizer.getName();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Comparison function, for java.lang.Comparable
|
||||
* Comparison is based on the match confidence value, which conveniently
|
||||
* allows CharsetDetector.detectAll() to order its results.
|
||||
*/
|
||||
public int compareTo (Object o) {
|
||||
CharsetMatch other = (CharsetMatch)o;
|
||||
int compareResult = 0;
|
||||
if (this.fConfidence > other.fConfidence) {
|
||||
compareResult = 1;
|
||||
} else if (this.fConfidence < other.fConfidence) {
|
||||
compareResult = -1;
|
||||
}
|
||||
return compareResult;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Constructor.  Implementation internal (package access only).
 * Stores the recognizer and the confidence for this match; the detector
 * argument is accepted but not used here.
 *
 * @param det  the detector that ran the match.
 * @param rec  the recognizer that produced the match.
 * @param conf the match confidence.
 */
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
    this.fConfidence = conf;
    this.fRecognizer = rec;
}
|
||||
|
||||
|
||||
//
|
||||
// Private Data
|
||||
//
|
||||
private int fConfidence;
|
||||
private CharsetRecognizer fRecognizer;
|
||||
|
||||
|
||||
}
|
||||
|
92
icu4j/src/com/ibm/icu/text/CharsetRecog_UTF8.java
Normal file
92
icu4j/src/com/ibm/icu/text/CharsetRecog_UTF8.java
Normal file
@ -0,0 +1,92 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
/**
|
||||
* Charset recognizer for UTF-8
|
||||
*
|
||||
*/
|
||||
class CharsetRecog_UTF8 extends CharsetRecognizer {
|
||||
|
||||
String getName() {
|
||||
return "UTF-8";
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see com.ibm.icu.text.CharsetRecognizer#match(com.ibm.icu.text.CharsetDetector)
|
||||
*/
|
||||
int match(CharsetDetector det) {
|
||||
boolean hasBOM = false;
|
||||
int numValid = 0;
|
||||
int numInvalid = 0;
|
||||
byte input[] = det.fInputBytes;
|
||||
int i;
|
||||
int trailBytes = 0;
|
||||
int confidence;
|
||||
|
||||
if (det.fInputLen >= 3 &&
|
||||
input[0]==0xef && input[1]==0xbb & input[2]==0xbf) {
|
||||
hasBOM = true;
|
||||
}
|
||||
|
||||
// Scan for multi-byte sequences
|
||||
for (i=0; i<det.fInputLen; i++) {
|
||||
int b = input[i];
|
||||
if ((b & 0x80) == 0) {
|
||||
continue; // ASCII
|
||||
}
|
||||
|
||||
// Hi bit on char found. Figure out how long the sequence should be
|
||||
if ((b & 0x0e0) == 0x0c0) {
|
||||
trailBytes = 1;
|
||||
} else if ((b & 0x0f0) == 0x0e0) {
|
||||
trailBytes = 2;
|
||||
} else if ((b & 0x0f8) == 0xf0) {
|
||||
trailBytes = 3;
|
||||
} else {
|
||||
numInvalid++;
|
||||
trailBytes = 0;
|
||||
}
|
||||
|
||||
// Verify that we've got the right number of trail bytes in the sequence
|
||||
for (;;) {
|
||||
i++;
|
||||
b = input[i];
|
||||
if ((b & 0xc0) != 0x080) {
|
||||
numInvalid++;
|
||||
break;
|
||||
}
|
||||
if (--trailBytes == 0) {
|
||||
numValid++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Cook up some sort of confidence score, based on presense of a BOM
|
||||
// and the existence of valid and/or invalid multi-byte sequences.
|
||||
confidence = 0;
|
||||
if (hasBOM && numInvalid==0) {
|
||||
confidence = 100;
|
||||
} else if (hasBOM && numValid > numInvalid) {
|
||||
confidence = 80;
|
||||
} else if (numValid > 3 && numInvalid == 0) {
|
||||
confidence = 100;
|
||||
} else if (numValid > 0 && numInvalid == 0) {
|
||||
confidence = 80;
|
||||
} else if (numValid == 0 && numInvalid == 0) {
|
||||
// Plain ASCII.
|
||||
confidence = 50;
|
||||
} else if (numValid > numInvalid) {
|
||||
// Probably corruput utf-8 data. Valid sequences aren't likely by chance.
|
||||
confidence = 60;
|
||||
}
|
||||
return confidence;
|
||||
}
|
||||
|
||||
}
|
45
icu4j/src/com/ibm/icu/text/CharsetRecognizer.java
Normal file
45
icu4j/src/com/ibm/icu/text/CharsetRecognizer.java
Normal file
@ -0,0 +1,45 @@
|
||||
/**
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
/**
|
||||
* Abstract class for recognizing a single charset.
|
||||
* Part of the implementation of ICU's CharsetDetector.
|
||||
*
|
||||
* Each specific charset that can be recognized will have an instance
|
||||
* of some subclass of this class. All interaction between the overall
|
||||
* CharsetDetector and the stuff specific to an individual charset happens
|
||||
* via the interface provided here.
|
||||
*
|
||||
* Instances of CharsetDetector DO NOT have or maintain
|
||||
* state pertaining to a specific match or detect operation.
|
||||
* The WILL be shared by multiple instances of CharsetDetector.
|
||||
* They encapsulate const charset-specific information.
|
||||
*
|
||||
*/
|
||||
abstract class CharsetRecognizer {
|
||||
/**
|
||||
* Get the IANA name of this charset.
|
||||
* @return the charset name.
|
||||
*/
|
||||
abstract String getName();
|
||||
|
||||
/**
|
||||
* Test the match of this charset with the input text data
|
||||
* which is obtained via the CharsetDetector object.
|
||||
*
|
||||
* @param det The CharsetDetector, which contains the input text
|
||||
* to be checked for being in this charset.
|
||||
* @return Two values packed into one int (Damn java, anyhow)
|
||||
* <br/>
|
||||
* bits 0-7: the match confidence, ranging from 0-100
|
||||
* <br/>
|
||||
* bits 8-15: The match reason, an enum-like value.
|
||||
*/
|
||||
abstract int match(CharsetDetector det);
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user