ICU-4060 Add enableInputFilter(), let the first two incomplete ngrams get looked up.

X-SVN-Rev: 17606
This commit is contained in:
Eric Mader 2005-05-17 19:38:17 +00:00
parent c7e2f059fa
commit 0c21dcd927
2 changed files with 61 additions and 26 deletions

View File

@ -81,10 +81,15 @@ public class CharsetDetector {
*/
public CharsetDetector setText(byte [] in) {
    // Keep a reference to the caller's array (no copy is made here) and
    // record its length exactly once.
    fRawInput  = in;
    fRawLength = in.length;

    // Preprocess the raw bytes before any detection is attempted.
    MungeInput();

    // Return this detector so calls can be chained.
    return this;
}
private static final int kBufSize = 8000;
/**
* Set the input text (byte) data whose charset is to be detected.
* <p/>
@ -97,7 +102,6 @@ public class CharsetDetector {
* @param in the input text of unknown encoding
* @return This CharsetDetector
*/
private static final int kBufSize = 8000;
public CharsetDetector setText(InputStream in) throws IOException {
fInputStream = in;
@ -225,7 +229,8 @@ public class CharsetDetector {
* @param declaredEncoding A declared encoding for the data, if available,
* or null or an empty string if none is available.
*/
public String getString(byte[] in, String declaredEncoding) {
public String getString(byte[] in, String declaredEncoding)
{
return null;
}
@ -240,7 +245,36 @@ public class CharsetDetector {
return fCharsetNames;
}
/**
 * Report whether input filtering is currently switched on.
 *
 * @return <code>true</code> if input text will be filtered,
 *         <code>false</code> otherwise.
 *
 * @see #enableInputFilter(boolean)
 */
public boolean inputFilterEnabled()
{
    return this.fStripTags;
}
/**
 * Turn filtering of input text on or off. While filtering is on,
 * text enclosed in angle brackets ("&lt;" and "&gt;") is removed
 * before detection.
 *
 * @param filter <code>true</code> to enable input text filtering,
 *               <code>false</code> to disable it.
 *
 * @return The setting that was in effect before this call.
 */
public boolean enableInputFilter(boolean filter)
{
    final boolean wasEnabled = this.fStripTags;

    this.fStripTags = filter;

    return wasEnabled;
}
/**
* MungeInput - after getting a set of raw input data to be analyzed, preprocess
* it by removing what appears to be html markup.
@ -259,24 +293,28 @@ public class CharsetDetector {
// discard everything within < brackets >
// Count how many total '<' and illegal (nested) '<' occur, so we can make some
// guess as to whether the input was actually marked up at all.
for (srci=0; srci<fRawLength; srci++) {
b = fRawInput[srci];
if (b == (byte)'<') {
if (inMarkup) {
badTags++;
if (fStripTags) {
for (srci=0; srci<fRawLength; srci++) {
b = fRawInput[srci];
if (b == (byte)'<') {
if (inMarkup) {
badTags++;
}
inMarkup = true;
openTags++;
}
inMarkup = true;
openTags++;
}
if (inMarkup == false) {
fInputBytes[dsti++] = b;
if (! inMarkup) {
fInputBytes[dsti++] = b;
}
if (b == (byte)'>') {
inMarkup = false;
}
}
if (b == (byte)'>') {
inMarkup = false;
}
fInputLen = dsti;
}
fInputLen = dsti;
//
// If it looks like this input wasn't marked up, or if it looks like it's
@ -325,11 +363,14 @@ public class CharsetDetector {
byte[] fRawInput; // Original, untouched input bytes.
// If the user gave us a byte array, this is it.
// If the user gave us a stream, it is read into
// a buffer here.
int fRawLength; // Length of data in fRawInput array.
InputStream fInputStream; // User's input stream, or null if the user
// gave us a byte array.
boolean fStripTags = // If true, setText() will strip tags from input text.
false;
/**

View File

@ -31,7 +31,6 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
private static final int N_GRAM_SIZE = 3;
private static final int N_GRAM_MASK = 0xFFFFFF;
private int byteCount = 0;
private int byteIndex = 0;
private int ngram = 0;
@ -47,7 +46,6 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
byteMap = theByteMap;
ngram = 0;
byteCount = 0;
ngramCount = hitCount = 0;
}
@ -107,11 +105,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
/**
 * Append one byte to the rolling n-gram and look the result up.
 *
 * The lookup happens exactly once per byte, starting with the very first
 * byte, so the first two (still incomplete) n-grams are looked up as well.
 *
 * @param b the next input byte; only its low 8 bits are used.
 */
private void addByte(int b)
{
    // Shift the new byte into the low end of the n-gram; the mask
    // (0xFFFFFF) discards everything older than the last three bytes.
    ngram = ((ngram << 8) + (b & 0xFF)) & N_GRAM_MASK;

    lookup(ngram);
}
private int nextByte(CharsetDetector det)