ICU-124 charset detector, work in progress.
X-SVN-Rev: 17476
This commit is contained in:
parent
add48d59b6
commit
5f733d65b8
@ -77,6 +77,6 @@ public class TestCharsetDetector extends TestFmwk {
|
||||
CharsetMatch m = det.detect();
|
||||
CheckAssert(m.getName().equals("UTF-8"));
|
||||
String retrievedS = m.getString();
|
||||
CheckAssert(s == retrievedS);
|
||||
CheckAssert(s.equals(retrievedS));
|
||||
}
|
||||
}
|
||||
|
41
icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
Normal file
41
icu4j/src/com/ibm/icu/text/CharsetDetectEnc_sjis.java
Normal file
@ -0,0 +1,41 @@
|
||||
/*
|
||||
* Created on Apr 12, 2005
|
||||
*
|
||||
* TODO To change the template for this generated file go to
|
||||
* Window - Preferences - Java - Code Style - Code Templates
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
|
||||
/**
|
||||
* Shift-JIS encoding scheme recognizer
|
||||
*
|
||||
*/
|
||||
class CharsetDetectEnc_sjis extends CharsetDetectEncoding {
|
||||
|
||||
boolean nextChar(iteratedChar retChar, CharsetDetector det) {
|
||||
retChar.index = retChar.nextIndex;
|
||||
retChar.error = false;
|
||||
int firstByte;
|
||||
firstByte = retChar.charValue = retChar.nextByte(det);
|
||||
if (firstByte < 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int secondByte = retChar.nextByte(det);
|
||||
if (secondByte < 0) {
|
||||
return false;
|
||||
}
|
||||
retChar.charValue = firstByte << 8 + secondByte;
|
||||
if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
|
||||
// Illegal second byte value.
|
||||
retChar.error = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
45
icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
Normal file
45
icu4j/src/com/ibm/icu/text/CharsetDetectEncoding.java
Normal file
@ -0,0 +1,45 @@
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2005, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
/**
|
||||
* @author andy
|
||||
*
|
||||
* TODO To change the template for this generated type comment go to
|
||||
* Window - Preferences - Java - Code Style - Code Templates
|
||||
*/
|
||||
abstract class CharsetDetectEncoding {
|
||||
|
||||
static class iteratedChar {
|
||||
int charValue = 0;
|
||||
int index = 0;
|
||||
int nextIndex = 0;
|
||||
boolean error = false;
|
||||
boolean done = false;
|
||||
|
||||
void reset() {
|
||||
charValue = 0;
|
||||
index = -1;
|
||||
nextIndex = 0;
|
||||
error = false;
|
||||
done = false;
|
||||
}
|
||||
|
||||
int nextByte(CharsetDetector det) {
|
||||
if (nextIndex >= det.fInputLen) {
|
||||
done = true;
|
||||
return -1;
|
||||
}
|
||||
int byteValue = (int)det.fInputBytes[nextIndex++] & 0x00ff;
|
||||
return byteValue;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
abstract boolean nextChar(iteratedChar retChar, CharsetDetector det);
|
||||
|
||||
}
|
@ -308,13 +308,13 @@ public class CharsetDetector {
|
||||
//
|
||||
// Stuff private to CharsetDetector
|
||||
//
|
||||
private byte[] fRawInput; // Original, untouched input bytes.
|
||||
byte[] fRawInput; // Original, untouched input bytes.
|
||||
// If user gave us a byte array, this is it.
|
||||
// If user gave us a stream, it's read to a
|
||||
// buffer here.
|
||||
private int fRawLength; // Length of data in fRawInput array.
|
||||
int fRawLength; // Length of data in fRawInput array.
|
||||
|
||||
private InputStream fInputStream; // User's input stream, or null if the user
|
||||
InputStream fInputStream; // User's input stream, or null if the user
|
||||
// gave us a byte array.
|
||||
|
||||
|
||||
@ -331,6 +331,7 @@ public class CharsetDetector {
|
||||
private static ArrayList createRecognizers() {
|
||||
ArrayList recognizers = new ArrayList();
|
||||
recognizers.add(new CharsetRecog_UTF8());
|
||||
recognizers.add(new CharsetRecog_mbcs("Shift_JIS", new CharsetDetectEnc_sjis()));
|
||||
|
||||
// Create an array of all charset names, as a side effect.
|
||||
// Needed for the getAllDetectableCharsets() API.
|
||||
|
@ -61,8 +61,14 @@ public class CharsetMatch implements Comparable {
|
||||
* @param maxLength The maximum length of the String to be created.
|
||||
* @return a String created from the converted input data.
|
||||
*/
|
||||
public String getString(int maxLength) {
|
||||
return null;
|
||||
public String getString(int maxLength) throws java.io.IOException {
|
||||
String result = null;
|
||||
if (fInputStream != null) {
|
||||
// TODO: read the stream in somehow.
|
||||
} else {
|
||||
result = new String(fRawInput, getName());
|
||||
}
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
@ -75,7 +81,7 @@ public class CharsetMatch implements Comparable {
|
||||
* @return the confidence in the charset match
|
||||
*/
|
||||
public int getConfidence() {
|
||||
return 0;
|
||||
return fConfidence;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -135,6 +141,17 @@ public class CharsetMatch implements Comparable {
|
||||
CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
    fConfidence = conf;
    fRecognizer = rec;

    // Take our own references to the application's original input now,
    // in case the application resets the detector before it gets around
    // to using this CharsetMatch.
    fInputStream = det.fInputStream;

    // The detector's byte array is only kept when it came straight from
    // the user; when the input is a stream, that array is just the head
    // of the stream and is not wanted here.
    if (det.fInputStream == null) {
        fRawLength = det.fRawLength;
        fRawInput  = det.fRawInput;
    }
}
|
||||
|
||||
|
||||
@ -143,6 +160,12 @@ public class CharsetMatch implements Comparable {
|
||||
//
|
||||
private int fConfidence;
|
||||
private CharsetRecognizer fRecognizer;
|
||||
private byte[] fRawInput = null; // Original, untouched input bytes.
|
||||
// If user gave us a byte array, this is it.
|
||||
private int fRawLength; // Length of data in fRawInput array.
|
||||
|
||||
private InputStream fInputStream = null; // User's input stream, or null if the user
|
||||
// gave us a byte array.
|
||||
|
||||
|
||||
}
|
||||
|
92
icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
Normal file
92
icu4j/src/com/ibm/icu/text/CharsetRecog_mbcs.java
Normal file
@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Created on Apr 12, 2005
|
||||
*
|
||||
* TODO To change the template for this generated file go to
|
||||
* Window - Preferences - Java - Code Style - Code Templates
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
/**
|
||||
* CharsetRecognizer implemenation for Asian - double or multi-byte - charsets.
|
||||
* Match is determined mostly by the input data adhering to the
|
||||
* encoding scheme for the charset, although the hooks are here
|
||||
* to also check language based character occurence frequencies if that
|
||||
* proves to be necessary.
|
||||
* <p/>
|
||||
* Instances of this class are singletons, one per encoding
|
||||
* being recognized. They are created in the main
|
||||
* CharsetDetector class and kept in the global list of available
|
||||
* encodings to be checked. The specific encoding being recognized
|
||||
* is determined by the CharsetDetectEncoding provided when an
|
||||
* instance of this class is created.
|
||||
*
|
||||
*/
|
||||
class CharsetRecog_mbcs extends CharsetRecognizer {
|
||||
|
||||
private CharsetDetectEncoding fEnc;
|
||||
private String fCharsetName;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @param enc
|
||||
*/
|
||||
CharsetRecog_mbcs(String charsetName, CharsetDetectEncoding enc) {
|
||||
fEnc = enc;
|
||||
fCharsetName = charsetName;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the IANA name of this charset.
|
||||
* @return the charset name.
|
||||
*/
|
||||
String getName() {
|
||||
return fCharsetName;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test the match of this charset with the input text data
|
||||
* which is obtained via the CharsetDetector object.
|
||||
*
|
||||
* @param det The CharsetDetector, which contains the input text
|
||||
* to be checked for being in this charset.
|
||||
* @return Two values packed into one int (Damn java, anyhow)
|
||||
* <br/>
|
||||
* bits 0-7: the match confidence, ranging from 0-100
|
||||
* <br/>
|
||||
* bits 8-15: The match reason, an enum-like value.
|
||||
*/
|
||||
int match(CharsetDetector det) {
|
||||
int singleByteCharCount = 0;
|
||||
int doubleByteCharCount = 0;
|
||||
int badCharCount = 0;
|
||||
int totalCharCount = 0;
|
||||
|
||||
CharsetDetectEncoding.iteratedChar ichar = new CharsetDetectEncoding.iteratedChar();
|
||||
|
||||
for (ichar.reset(); fEnc.nextChar(ichar, det);) {
|
||||
totalCharCount++;
|
||||
if (ichar.error) {
|
||||
badCharCount++;
|
||||
} else {
|
||||
|
||||
if (ichar.charValue <= 0xff) {
|
||||
singleByteCharCount++;
|
||||
} else {
|
||||
doubleByteCharCount++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int confidence = 40 + doubleByteCharCount - 10*badCharCount;
|
||||
if (confidence < 0) {
|
||||
confidence = 0;
|
||||
}
|
||||
if (confidence > 100) {
|
||||
confidence = 100;
|
||||
}
|
||||
|
||||
return confidence;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user