ICU-4060 Implement CharsetMatch.getReader(), CharsetMatch.getString(), CharsetDetector.getReader(), CharsetDetector.getString(), relationship between ISO-8859-x and Windows-125x.

X-SVN-Rev: 17637
2005-05-20 22:33:10 +00:00 · 2005-05-20 22:33:10 +00:00 · ed4a0639ba
commit ed4a0639ba
parent 58e85492fd
3 changed files with 64 additions and 15 deletions
--- a/icu4j/src/com/ibm/icu/text/CharsetDetector.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetDetector.java
@ -212,7 +212,15 @@ public class CharsetDetector {
     *           or null or an empty string if none is available.
     */
    public Reader getReader(InputStream in, String declaredEncoding) {
-        return null;
+        fDeclaredEncoding = declaredEncoding;  // recorded on the detector before detection runs
+        
+        try {
+            setText(in);
+            
+            return detect().getReader();  // reader comes from the match returned by detect()
+        } catch (IOException e) {
+            return null;  // an I/O failure is reported to the caller as null
+        }
    }

    /**
@ -231,7 +239,14 @@ public class CharsetDetector {
     */
    public String getString(byte[] in, String declaredEncoding)
    {
-        return null;
+        fDeclaredEncoding = declaredEncoding;  // recorded on the detector before detection runs
+       
+        try {
+            setText(in);
+            return detect().getString(-1);  // negative maxLength: CharsetMatch.getString treats it as "no limit"
+        } catch (IOException e) {
+            return null;  // an I/O failure is reported to the caller as null
+        }
    }

 
@ -344,6 +359,13 @@ public class CharsetDetector {
            int val = fInputBytes[srci] & 0x00ff;
            fByteStats[val]++;
        }
+        
+        for (int i = 0x80; i <= 0x9F; i += 1) {
+            if (fByteStats[i] != 0) {
+                fC1Bytes = true;
+                break;
+            }
+        }
     }

    /**
@ -359,6 +381,9 @@ public class CharsetDetector {
                   new short[256];  //   Value is percent, not absolute.
                                    //   Value is rounded up, so zero really means zero occurences.
    
+    boolean     fC1Bytes =          // True if any bytes in the range 0x80 - 0x9F are in the input;
+                   false;
+    
    String      fDeclaredEncoding;
    
    
--- a/icu4j/src/com/ibm/icu/text/CharsetMatch.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetMatch.java
@ -6,7 +6,10 @@
 */
 package com.ibm.icu.text;

+import java.io.ByteArrayInputStream;
+import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.Reader;


@ -36,7 +39,18 @@ public class CharsetMatch implements Comparable {
     * @return the Reader for the Unicode character data.
     */
    public Reader getReader() {
-        return null;
+        InputStream inputStream = fInputStream;
+        
+        if (inputStream == null) {  // input was supplied as a byte array: wrap the raw bytes
+            inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+        }
+        
+        try {
+            inputStream.reset();  // use the local: fInputStream is null for byte-array input
+            return new InputStreamReader(inputStream, getName());  // decode with the detected charset name
+        } catch (IOException e) {
+            return null;  // reset failure or unsupported charset name is reported as null
+        }
    }
    
    
@ -66,7 +80,20 @@ public class CharsetMatch implements Comparable {
    public String getString(int maxLength) throws java.io.IOException {
        String result = null;
        if (fInputStream != null) {
-            // TODO:  read the stream in somehow.
+            StringBuffer sb = new StringBuffer();
+            char[] buffer = new char[1024];
+            Reader reader = getReader();
+            int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
+            int bytesRead = 0;
+            
+            while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+                sb.append(buffer, 0, bytesRead);
+                max -= bytesRead;
+            }
+            
+            reader.close();
+            
+            return sb.toString();
        } else {
            result = new String(fRawInput, getName());            
        }
--- a/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java
+++ b/icu4j/src/com/ibm/icu/text/CharsetRecog_sbcs.java
@ -148,17 +148,14 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {
        }
    }
    
-    private int byteIndex;
-    
-    void reset()
-    {
-        byteIndex = 0;
-    }
+    protected boolean haveC1Bytes = false;
    
    int match(CharsetDetector det, int[] ngrams,  byte[] byteMap)
    {
        NGramParser parser = new NGramParser(ngrams, byteMap);
        
+        haveC1Bytes = det.fC1Bytes;  // remembered so getName() can choose between the ISO-8859-x and windows-125x names
+        
        return parser.parse(det);
    }
    
@ -201,7 +198,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {

        public String getName()
        {
-            return "ISO-8859-1";
+            return haveC1Bytes? "windows-1252" : "ISO-8859-1";  // bytes in 0x80-0x9F seen by match() => report windows-1252
        }
    }

@ -444,7 +441,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {

        public String getName()
        {
-            return "ISO-8859-2";
+            return haveC1Bytes? "windows-1250" : "ISO-8859-2";  // bytes in 0x80-0x9F seen by match() => report windows-1250
        }
    }
    
@ -630,7 +627,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {

        public String getName()
        {
-            return "ISO-8859-7";
+            return haveC1Bytes? "windows-1253" : "ISO-8859-7";  // bytes in 0x80-0x9F seen by match() => report windows-1253
        }
    }
    
@ -693,7 +690,7 @@ public abstract class CharsetRecog_sbcs extends CharsetRecognizer {

        public String getName()
        {
-            return "ISO-8859-9";
+            return haveC1Bytes? "windows-1254" : "ISO-8859-9";  // bytes in 0x80-0x9F seen by match() => report windows-1254
        }
    }