ICU-124 charset detector, work in progress.

X-SVN-Rev: 17476
This commit is contained in:
Andy Heninger 2005-04-13 01:11:26 +00:00
parent add48d59b6
commit 5f733d65b8
6 changed files with 209 additions and 7 deletions

View File

@ -77,6 +77,6 @@ public class TestCharsetDetector extends TestFmwk {
CharsetMatch m = det.detect();
CheckAssert(m.getName().equals("UTF-8"));
String retrievedS = m.getString();
CheckAssert(s == retrievedS);
CheckAssert(s.equals(retrievedS));
}
}

View File

@ -0,0 +1,41 @@
/*
* Created on Apr 12, 2005
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package com.ibm.icu.text;
/**
 * Shift-JIS encoding scheme recognizer.
 * <p/>
 * Steps through the detector's input one Shift-JIS character at a time,
 * reporting each character's numeric value and whether its byte sequence
 * was well formed.
 */
class CharsetDetectEnc_sjis extends CharsetDetectEncoding {

    /**
     * Fetch the next Shift-JIS character from the detector's input.
     *
     * @param retChar iteration state; on return its index, charValue and error
     *                fields describe the character that was just read.
     * @param det     the CharsetDetector that supplies the input bytes.
     * @return true if a character was read, false if the input was exhausted
     *         (including the case of a lead byte with no trail byte after it).
     */
    boolean nextChar(iteratedChar retChar, CharsetDetector det) {
        retChar.index = retChar.nextIndex;
        retChar.error = false;

        int firstByte = retChar.nextByte(det);
        retChar.charValue = firstByte;
        if (firstByte < 0) {
            // No more input.
            return false;
        }

        if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
            // These byte values stand alone as a complete one-byte character.
            return true;
        }

        int secondByte = retChar.nextByte(det);
        if (secondByte < 0) {
            // Input ended in the middle of a two-byte character.
            return false;
        }

        // Combine lead and trail bytes into one 16 bit value.  The parentheses
        // are required: '+' binds tighter than '<<', so the unparenthesized
        // "firstByte << 8 + secondByte" shifts by (8 + secondByte) instead.
        retChar.charValue = (firstByte << 8) | secondByte;

        if (! ((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
            // Illegal second byte value.
            retChar.error = true;
        }
        return true;
    }
}

View File

@ -0,0 +1,45 @@
/*
*******************************************************************************
* Copyright (C) 2005, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
/**
 * Base class for the encoding-scheme specific character iterators used by
 * the charset detector.  A subclass knows how to pull one character at a
 * time out of the detector's input bytes for one particular encoding scheme.
 *
 * @author andy
 */
abstract class CharsetDetectEncoding {

    /**
     * Iteration state shared between a caller and nextChar().
     * Holds the most recently read character together with the position
     * of the next unread input byte.
     */
    static class iteratedChar {
        int     charValue = 0;      // Value of the character most recently read.
        int     index     = 0;      // Input index at which that character started.
        int     nextIndex = 0;      // Input index of the next byte to be read.
        boolean error     = false;  // Set by nextChar() when the character was malformed.
        boolean done      = false;  // Set once nextByte() has run off the end of the input.

        /**
         * Put this object back to its start-of-input state.
         * Note that index becomes -1 here, not the 0 it is constructed with.
         */
        void reset() {
            done      = false;
            error     = false;
            nextIndex = 0;
            index     = -1;
            charValue = 0;
        }

        /**
         * Read one input byte and advance past it.
         *
         * @param det the CharsetDetector holding the input bytes.
         * @return the byte as an unsigned value (0 to 255), or -1 when no
         *         input remains, in which case done is also set.
         */
        int nextByte(CharsetDetector det) {
            if (nextIndex < det.fInputLen) {
                // Mask off the sign extension so the result is never negative.
                return det.fInputBytes[nextIndex++] & 0xff;
            }
            done = true;
            return -1;
        }
    }

    /**
     * Read the next character of the input according to this encoding scheme.
     *
     * @param retChar iteration state, updated to describe the character read.
     * @param det     the CharsetDetector holding the input bytes.
     * @return true if a character was read, false otherwise.
     */
    abstract boolean nextChar(iteratedChar retChar, CharsetDetector det);
}

View File

@ -308,13 +308,13 @@ public class CharsetDetector {
//
// Stuff private to CharsetDetector
//
private byte[] fRawInput; // Original, untouched input bytes.
byte[] fRawInput; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
// If user gave us a stream, it's read to a
// buffer here.
private int fRawLength; // Length of data in fRawInput array.
int fRawLength; // Length of data in fRawInput array.
private InputStream fInputStream; // User's input stream, or null if the user
InputStream fInputStream; // User's input stream, or null if the user
// gave us a byte array.
@ -331,6 +331,7 @@ public class CharsetDetector {
private static ArrayList createRecognizers() {
ArrayList recognizers = new ArrayList();
recognizers.add(new CharsetRecog_UTF8());
recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
// Create an array of all charset names, as a side effect.
// Needed for the getAllDetectableCharsets() API.

View File

@ -61,8 +61,14 @@ public class CharsetMatch implements Comparable {
* @param maxLength The maximum length of the String to be created.
* @return a String created from the converted input data.
*/
public String getString(int maxLength) {
return null;
public String getString(int maxLength) throws java.io.IOException {
    // NOTE(review): maxLength is not applied yet; the whole input is converted.
    String result = null;
    if (fInputStream != null) {
        // TODO: read the stream in somehow.
        // Until that is done, stream-based input yields null.
    } else {
        // Convert only the first fRawLength bytes: that field, not the array
        // size, records how much of fRawInput is actual input data.
        result = new String(fRawInput, 0, fRawLength, getName());
    }
    return result;
}
@ -75,7 +81,7 @@ public class CharsetMatch implements Comparable {
* @return the confidence in the charset match
*/
public int getConfidence() {
return 0;
return fConfidence;
}
/**
@ -135,6 +141,17 @@ public class CharsetMatch implements Comparable {
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
    this.fConfidence = conf;
    this.fRecognizer = rec;

    // Take our own references to the application's original input now, so
    // that this match remains usable even if the application later resets
    // the detector it came from.
    this.fInputStream = det.fInputStream;
    if (this.fInputStream == null) {
        // The raw bytes are kept only when the user supplied them directly.
        // With stream input they are just the head of the stream, so they
        // are not retained here.
        this.fRawInput  = det.fRawInput;
        this.fRawLength = det.fRawLength;
    }
}
@ -143,6 +160,12 @@ public class CharsetMatch implements Comparable {
//
private int fConfidence;
private CharsetRecognizer fRecognizer;
private byte[] fRawInput = null; // Original, untouched input bytes.
// If user gave us a byte array, this is it.
private int fRawLength; // Length of data in fRawInput array.
private InputStream fInputStream = null; // User's input stream, or null if the user
// gave us a byte array.
}

View File

@ -0,0 +1,92 @@
/*
* Created on Apr 12, 2005
*
* TODO To change the template for this generated file go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package com.ibm.icu.text;
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * A match is judged by how well the input data adheres to the encoding
 * scheme of the charset.  Single and double byte characters are tallied
 * separately so that language based character occurrence frequencies could
 * also be checked later, should that prove necessary.
 * <p/>
 * One instance exists per encoding being recognized.  Instances are created
 * by the main CharsetDetector class and kept in its list of available
 * encodings.  The encoding handled by a given instance is determined by the
 * CharsetDetectEncoding passed to its constructor.
 */
class CharsetRecog_mbcs extends CharsetRecognizer {

    private CharsetDetectEncoding fEnc;
    private String                fCharsetName;

    /**
     * Constructor.
     *
     * @param charsetName the name to report for this charset.
     * @param enc         the character iterator for this charset's encoding scheme.
     */
    CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
        fCharsetName = charsetName;
        fEnc         = enc;
    }

    /**
     * Get the IANA name of this charset.
     *
     * @return the charset name.
     */
    String getName() {
        return fCharsetName;
    }

    /**
     * Test how well the input text held by the CharsetDetector matches
     * this charset.
     *
     * @param det the CharsetDetector, which contains the input text
     *            to be checked for being in this charset.
     * @return two values packed into one int:
     *         <br/>
     *         bits 0-7:  the match confidence, ranging from 0-100
     *         <br/>
     *         bits 8-15: the match reason, an enum-like value.  The value
     *         returned here never exceeds 100, so these bits are always zero.
     */
    int match(CharsetDetector det) {
        int total   = 0;    // Tallied, but not yet part of the score.
        int singles = 0;    // Tallied, but not yet part of the score.
        int doubles = 0;
        int bad     = 0;

        CharsetDetectEncoding.iteratedChar cursor = new CharsetDetectEncoding.iteratedChar();
        cursor.reset();
        while (fEnc.nextChar(cursor, det)) {
            total++;
            if (cursor.error) {
                bad++;
                continue;
            }
            if (cursor.charValue > 0xff) {
                doubles++;
            } else {
                singles++;
            }
        }

        // Start from 40, add one per good double byte character, subtract
        // ten per malformed character, then clamp to the range 0 to 100.
        int score = 40 + doubles - 10 * bad;
        return Math.min(100, Math.max(0, score));
    }
}