ICU-4060 Implement CharsetMatch.getReader(), CharsetMatch.getString(), CharsetDetector.getReader(), CharsetDetector.getString(), relationship between ISO-8859-x and Windows-125x.
X-SVN-Rev: 17637
This commit is contained in:
parent
58e85492fd
commit
ed4a0639ba
@ -212,7 +212,15 @@ public class CharsetDetector {
|
||||
* or null or an empty string if none is available.
|
||||
*/
|
||||
public Reader getReader(InputStream in, String declaredEncoding) {
|
||||
return null;
|
||||
fDeclaredEncoding = declaredEncoding;
|
||||
|
||||
try {
|
||||
setText(in);
|
||||
|
||||
return detect().getReader();
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -231,7 +239,14 @@ public class CharsetDetector {
|
||||
*/
|
||||
public String getString(byte[] in, String declaredEncoding)
|
||||
{
|
||||
return null;
|
||||
fDeclaredEncoding = declaredEncoding;
|
||||
|
||||
try {
|
||||
setText(in);
|
||||
return detect().getString(-1);
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -343,7 +358,14 @@ public class CharsetDetector {
|
||||
for (srci=0; srci<fInputLen; srci++) {
|
||||
int val = fInputBytes[srci] & 0x00ff;
|
||||
fByteStats[val]++;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0x80; i <= 0x9F; i += 1) {
|
||||
if (fByteStats[i] != 0) {
|
||||
fC1Bytes = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@ -359,6 +381,9 @@ public class CharsetDetector {
|
||||
new short[256]; // Value is percent, not absolute.
|
||||
// Value is rounded up, so zero really means zero occurrences.
|
||||
|
||||
boolean fC1Bytes = // True if any bytes in the range 0x80 - 0x9F are in the input;
|
||||
false;
|
||||
|
||||
String fDeclaredEncoding;
|
||||
|
||||
|
||||
|
@ -6,7 +6,10 @@
|
||||
*/
|
||||
package com.ibm.icu.text;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
@ -36,7 +39,18 @@ public class CharsetMatch implements Comparable {
|
||||
* @return the Reader for the Unicode character data.
|
||||
*/
|
||||
public Reader getReader() {
|
||||
return null;
|
||||
InputStream inputStream = fInputStream;
|
||||
|
||||
if (inputStream == null) {
|
||||
inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
|
||||
}
|
||||
|
||||
try {
|
||||
fInputStream.reset();
|
||||
return new InputStreamReader(fInputStream, getName());
|
||||
} catch (IOException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -66,7 +80,20 @@ public class CharsetMatch implements Comparable {
|
||||
public String getString(int maxLength) throws java.io.IOException {
|
||||
String result = null;
|
||||
if (fInputStream != null) {
|
||||
// TODO: read the stream in somehow.
|
||||
StringBuffer sb = new StringBuffer();
|
||||
char[] buffer = new char[1024];
|
||||
Reader reader = getReader();
|
||||
int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
|
||||
int bytesRead = 0;
|
||||
|
||||
while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
|
||||
sb.append(buffer, 0, bytesRead);
|
||||
max -= bytesRead;
|
||||
}
|
||||
|
||||
reader.close();
|
||||
|
||||
return sb.toString();
|
||||
} else {
|
||||
result = new String(fRawInput, getName());
|
||||
}
|
||||
|
@ -148,17 +148,14 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
||||
}
|
||||
}
|
||||
|
||||
private int byteIndex;
|
||||
|
||||
void reset()
|
||||
{
|
||||
byteIndex = 0;
|
||||
}
|
||||
protected boolean haveC1Bytes = false;
|
||||
|
||||
int match(CharsetDetector det, int[] ngrams, byte[] byteMap)
|
||||
{
|
||||
NGramParser parser = new NGramParser(ngrams, byteMap);
|
||||
|
||||
haveC1Bytes = det.fC1Bytes;
|
||||
|
||||
return parser.parse(det);
|
||||
}
|
||||
|
||||
@ -201,7 +198,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
||||
|
||||
public String getName()
|
||||
{
|
||||
return "ISO-8859-1";
|
||||
return haveC1Bytes? "windows-1252" : "ISO-8859-1";
|
||||
}
|
||||
}
|
||||
|
||||
@ -444,7 +441,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
||||
|
||||
public String getName()
|
||||
{
|
||||
return "ISO-8859-2";
|
||||
return haveC1Bytes? "windows-1250" : "ISO-8859-2";
|
||||
}
|
||||
}
|
||||
|
||||
@ -630,7 +627,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
||||
|
||||
public String getName()
|
||||
{
|
||||
return "ISO-8859-7";
|
||||
return haveC1Bytes? "windows-1253" : "ISO-8859-7";
|
||||
}
|
||||
}
|
||||
|
||||
@ -693,7 +690,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
|
||||
|
||||
public String getName()
|
||||
{
|
||||
return "ISO-8859-9";
|
||||
return haveC1Bytes? "windows-1254" : "ISO-8859-9";
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user