ICU-4060 Implement CharsetMatch.getReader(), CharsetMatch.getString(), CharsetDetector.getReader(), CharsetDetector.getString(), relationship between ISO-8859-x and Windows-125x.

X-SVN-Rev: 17637
This commit is contained in:
Eric Mader 2005-05-20 22:33:10 +00:00
parent 58e85492fd
commit ed4a0639ba
3 changed files with 64 additions and 15 deletions

View File

@ -212,7 +212,15 @@ public class CharsetDetector {
* or null or an empty string if none is available.
*/
public Reader getReader(InputStream in, String declaredEncoding) {
    // Remember the caller-supplied encoding hint before detection runs.
    fDeclaredEncoding = declaredEncoding;

    try {
        // Load the stream as the text to analyse, then hand back a Reader
        // from the match that detect() reports.
        setText(in);
        // NOTE(review): the result of detect() is dereferenced directly;
        // confirm it can never be null when nothing matches.
        return detect().getReader();
    } catch (IOException e) {
        // The input could not be read; signal "no reader available" with null.
        return null;
    }
}
/**
@ -231,7 +239,14 @@ public class CharsetDetector {
*/
public String getString(byte[] in, String declaredEncoding)
{
    // Remember the caller-supplied encoding hint before detection runs.
    fDeclaredEncoding = declaredEncoding;

    try {
        // Analyse the bytes and convert them through the detected match.
        setText(in);
        // A negative maximum length is passed to request the text without a cap.
        // NOTE(review): the result of detect() is dereferenced directly;
        // confirm it can never be null when nothing matches.
        return detect().getString(-1);
    } catch (IOException e) {
        // Conversion failed; signal "no string available" with null.
        return null;
    }
}
@ -344,6 +359,13 @@ public class CharsetDetector {
int val = fInputBytes[srci] & 0x00ff;
fByteStats[val]++;
}
for (int i = 0x80; i <= 0x9F; i += 1) {
if (fByteStats[i] != 0) {
fC1Bytes = true;
break;
}
}
}
/**
@ -359,6 +381,9 @@ public class CharsetDetector {
new short[256]; // Value is percent, not absolute.
// Value is rounded up, so zero really means zero occurrences.
// Flag raised by the byte-statistics pass when any of fByteStats[0x80..0x9F] is non-zero.
boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
false;
// Encoding name passed in by the caller of getReader()/getString(); stored before detection starts.
String fDeclaredEncoding;

View File

@ -6,7 +6,10 @@
*/
package com.ibm.icu.text;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
@ -36,7 +39,18 @@ public class CharsetMatch implements Comparable {
* @return the Reader for the Unicode character data.
*/
public Reader getReader() {
    // Prefer the original stream; when there is none, serve the first
    // fRawLength bytes of the raw input buffer instead.
    InputStream inputStream = fInputStream;
    if (inputStream == null) {
        inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
    }

    try {
        // Rewind and decode the stream chosen above. Using fInputStream here
        // would throw a NullPointerException in exactly the fallback case.
        inputStream.reset();
        return new InputStreamReader(inputStream, getName());
    } catch (IOException e) {
        // reset() failed or the charset name is unsupported
        // (UnsupportedEncodingException is an IOException): no Reader available.
        return null;
    }
}
@ -66,7 +80,20 @@ public class CharsetMatch implements Comparable {
public String getString(int maxLength) throws java.io.IOException {
String result = null;
if (fInputStream != null) {
// TODO: read the stream in somehow.
StringBuffer sb = new StringBuffer();
char[] buffer = new char[1024];
Reader reader = getReader();
int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
int bytesRead = 0;
while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
sb.append(buffer, 0, bytesRead);
max -= bytesRead;
}
reader.close();
return sb.toString();
} else {
result = new String(fRawInput, getName());
}

View File

@ -148,17 +148,14 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
}
}
private int byteIndex;
// Rewinds the byte cursor to the start of the input.
void reset()
{
    this.byteIndex = 0;
}
protected boolean haveC1Bytes = false;
// Runs an n-gram parse over the detector's input and returns the parser's result.
// As a side effect, copies the detector's C1-byte flag into haveC1Bytes so that
// getName() can pick between the ISO-8859 and windows-125x names.
int match(CharsetDetector det, int[] ngrams, byte[] byteMap)
{
    final NGramParser ngramParser = new NGramParser(ngrams, byteMap);

    this.haveC1Bytes = det.fC1Bytes;

    final int result = ngramParser.parse(det);
    return result;
}
@ -201,7 +198,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
// Reports "windows-1252" when C1 bytes (0x80 - 0x9F) were seen in the input,
// otherwise "ISO-8859-1".
public String getName()
{
    return haveC1Bytes ? "windows-1252" : "ISO-8859-1";
}
}
@ -444,7 +441,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
// Reports "windows-1250" when C1 bytes (0x80 - 0x9F) were seen in the input,
// otherwise "ISO-8859-2".
public String getName()
{
    return haveC1Bytes ? "windows-1250" : "ISO-8859-2";
}
}
@ -630,7 +627,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
// Reports "windows-1253" when C1 bytes (0x80 - 0x9F) were seen in the input,
// otherwise "ISO-8859-7".
public String getName()
{
    return haveC1Bytes ? "windows-1253" : "ISO-8859-7";
}
}
@ -693,7 +690,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
// Reports "windows-1254" when C1 bytes (0x80 - 0x9F) were seen in the input,
// otherwise "ISO-8859-9".
public String getName()
{
    return haveC1Bytes ? "windows-1254" : "ISO-8859-9";
}
}