ICU-1560 added tool for fixing filters

X-SVN-Rev: 7183
2001-11-29 23:21:42 +00:00 · 2001-11-29 23:21:42 +00:00 · 41c6ecbd19
commit 41c6ecbd19
parent 04d996482a
2 changed files with 266 additions and 0 deletions
--- a/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java
+++ b/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java
@ -0,0 +1,133 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/icu/dev/tool/translit/UnicodeSetClosure.java,v $
+ * $Date: 2001/11/29 23:21:42 $
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.tools.translit;
+import com.ibm.text.*;
+import com.ibm.test.*;
+import com.ibm.util.Utility;
+//import java.text.*;
+import java.io.*;
+import java.util.Locale;
+
+public class UnicodeSetClosure {
+    /**
+     * Smoke test for the closure routines: starting from the same small sample
+     * set each time, closes it under NFD, lowercasing, NFC, NFKD and NFKC, and
+     * writes each resulting pattern to TestUnicodeSetClosure.txt (UTF-8, with a
+     * leading U+FEFF byte order mark) in the current working directory.
+     *
+     * @param args ignored
+     * @throws Exception if the output file cannot be opened or a closure step fails
+     */
+    public static void main(String[] args) throws Exception {
+        // test it
+        String testStr = "[\u00E0Bc]";
+
+        File f = new File("TestUnicodeSetClosure.txt");
+        String filename = f.getCanonicalFile().toString();
+        out = new PrintWriter(
+            new OutputStreamWriter(
+                new FileOutputStream(filename), "UTF-8"));
+        try {
+            System.out.println("Writing " + filename);
+            out.print('\uFEFF'); // BOM
+
+            // Each run starts from a fresh copy of the sample set, because
+            // close() modifies its argument in place.
+            UnicodeSet test = new UnicodeSet(testStr);
+            close(test, Normalizer.DECOMP, false);
+            print("NFD", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.NO_OP, true);
+            print("Lower", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.COMPOSE, false);
+            print("NFC", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.DECOMP_COMPAT, false);
+            print("NFKD", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.COMPOSE_COMPAT, false);
+            print("NFKC", test);
+        } finally {
+            // Flush and release the file even when one of the steps above throws;
+            // otherwise the handle leaks and buffered output is lost.
+            out.close();
+        }
+    }
+    
+    // Destination for the report file; assigned by main() and written to by print().
+    // There is no initializer, so print() must not be called before main() has opened the file.
+    static PrintWriter out;
+    
+    /**
+     * Reports one result: echoes the label on standard output, then writes
+     * "label: pattern" followed by an empty line to the report writer.
+     *
+     * @param label name of the closure that produced the set
+     * @param test  the set whose pattern is written
+     */
+    public static void print(String label, UnicodeSet test) {
+        // Console output is just a progress marker.
+        System.out.println(label);
+
+        final String reportLine = label + ": " + test.toPattern(false);
+        out.println(reportLine);
+        out.println();
+    }
+    
+    /**
+     * Convenience overload: closes s under the mapping that NFToString builds
+     * from the given normalization mode and lowercasing flag.
+     *
+     * @param s          the set to extend in place
+     * @param m          normalization mode handed to NFToString
+     * @param lowerFirst whether NFToString should also lowercase
+     */
+    public static void close(UnicodeSet s, Normalizer.Mode m, boolean lowerFirst) {
+        final Char32ToString mapping = new NFToString(m, lowerFirst);
+        close(s, mapping);
+    }
+    
+    // dumb, slow implementations
+    /**
+     * Maps a code point to its normalized form, optionally lowercased.
+     * When lowerFirst is set, lowercasing (Locale.US) is applied both before
+     * and after normalization.
+     */
+    public static class NFToString implements Char32ToString {
+        Normalizer.Mode mode;
+        boolean lowerFirst;
+
+        NFToString(Normalizer.Mode m, boolean lowerFirst) {
+            this.mode = m;
+            this.lowerFirst = lowerFirst;
+        }
+
+        /**
+         * @param cp the code point to map
+         * @return the mapped string, or null when the mapping leaves cp unchanged
+         */
+        public String get(int cp) {
+            final String original = UTF16.valueOf(cp);
+
+            String mapped = lowerFirst
+                ? UCharacter.toLowerCase(Locale.US, original)
+                : original;
+            mapped = Normalizer.normalize(mapped, mode, 0);
+            if (lowerFirst) {
+                mapped = UCharacter.toLowerCase(Locale.US, mapped);
+            }
+
+            // null is the "no change" signal of Char32ToString.
+            return mapped.equals(original) ? null : mapped;
+        }
+    }
+        
+    
+    /**
+     * A mapping from a single code point (char32) to a string.
+     * Implementations of get return null when the mapping leaves the code
+     * point unchanged; close(UnicodeSet, Char32ToString) skips such code points.
+     */
+     
+    interface Char32ToString {
+        public String get(int cp);
+    }
+    
+    /**
+     * Extends s in place with every assigned code point whose mapping under f
+     * contains at least one code point that is in s.
+     *
+     * The code point range is walked once, in ascending order. A code point
+     * added during the walk is therefore seen only when testing higher code
+     * points; the walk is not repeated until the set stops growing.
+     *
+     * @param s the set to extend; modified in place
+     * @param f the mapping to close under; a null result means "unchanged"
+     */
+    public static void close(UnicodeSet s, Char32ToString f) {
+        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+            // NOTE(review): this compares ICU's type value against the
+            // java.lang.Character constant - confirm the two numberings agree.
+            int type = UCharacter.getType(cp);
+            if (type == Character.UNASSIGNED) continue;
+
+            String result = f.get(cp);
+            if (result == null) continue; // f leaves cp unchanged
+
+            // cp belongs in the closure when its mapping touches the set.
+            if (containsSome(s, result)) {
+                s.add(cp);
+            }
+        }
+    }
+    
+    // These should both be public, and on the respective classes
+    
+    public static void addAll(UnicodeSet s, String str) {
+        int cp;
+        for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(str,i);
+            s.add(cp);
+        }
+    }
+    
+    public static boolean containsSome(UnicodeSet s, String str) {
+        int cp;
+        for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(str,i);
+            if (s.contains(cp)) return true;
+        }
+        return false;
+    }
+}
--- a/icu4j/src/com/ibm/tools/translit/UnicodeSetClosure.java
+++ b/icu4j/src/com/ibm/tools/translit/UnicodeSetClosure.java
@ -0,0 +1,133 @@
+/*
+ *******************************************************************************
+ * Copyright (C) 1996-2000, International Business Machines Corporation and    *
+ * others. All Rights Reserved.                                                *
+ *******************************************************************************
+ *
+ * $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/tools/translit/Attic/UnicodeSetClosure.java,v $
+ * $Date: 2001/11/29 23:21:42 $
+ * $Revision: 1.1 $
+ *
+ *****************************************************************************************
+ */
+package com.ibm.tools.translit;
+import com.ibm.text.*;
+import com.ibm.test.*;
+import com.ibm.util.Utility;
+//import java.text.*;
+import java.io.*;
+import java.util.Locale;
+
+public class UnicodeSetClosure {
+    /**
+     * Smoke test for the closure routines: starting from the same small sample
+     * set each time, closes it under NFD, lowercasing, NFC, NFKD and NFKC, and
+     * writes each resulting pattern to TestUnicodeSetClosure.txt (UTF-8, with a
+     * leading U+FEFF byte order mark) in the current working directory.
+     *
+     * @param args ignored
+     * @throws Exception if the output file cannot be opened or a closure step fails
+     */
+    public static void main(String[] args) throws Exception {
+        // test it
+        String testStr = "[\u00E0Bc]";
+
+        File f = new File("TestUnicodeSetClosure.txt");
+        String filename = f.getCanonicalFile().toString();
+        out = new PrintWriter(
+            new OutputStreamWriter(
+                new FileOutputStream(filename), "UTF-8"));
+        try {
+            System.out.println("Writing " + filename);
+            out.print('\uFEFF'); // BOM
+
+            // Each run starts from a fresh copy of the sample set, because
+            // close() modifies its argument in place.
+            UnicodeSet test = new UnicodeSet(testStr);
+            close(test, Normalizer.DECOMP, false);
+            print("NFD", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.NO_OP, true);
+            print("Lower", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.COMPOSE, false);
+            print("NFC", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.DECOMP_COMPAT, false);
+            print("NFKD", test);
+
+            test = new UnicodeSet(testStr);
+            close(test, Normalizer.COMPOSE_COMPAT, false);
+            print("NFKC", test);
+        } finally {
+            // Flush and release the file even when one of the steps above throws;
+            // otherwise the handle leaks and buffered output is lost.
+            out.close();
+        }
+    }
+    
+    // Destination for the report file; assigned by main() and written to by print().
+    // There is no initializer, so print() must not be called before main() has opened the file.
+    static PrintWriter out;
+    
+    /**
+     * Reports one result: echoes the label on standard output, then writes
+     * "label: pattern" followed by an empty line to the report writer.
+     *
+     * @param label name of the closure that produced the set
+     * @param test  the set whose pattern is written
+     */
+    public static void print(String label, UnicodeSet test) {
+        // Console output is just a progress marker.
+        System.out.println(label);
+
+        final String reportLine = label + ": " + test.toPattern(false);
+        out.println(reportLine);
+        out.println();
+    }
+    
+    /**
+     * Convenience overload: closes s under the mapping that NFToString builds
+     * from the given normalization mode and lowercasing flag.
+     *
+     * @param s          the set to extend in place
+     * @param m          normalization mode handed to NFToString
+     * @param lowerFirst whether NFToString should also lowercase
+     */
+    public static void close(UnicodeSet s, Normalizer.Mode m, boolean lowerFirst) {
+        final Char32ToString mapping = new NFToString(m, lowerFirst);
+        close(s, mapping);
+    }
+    
+    // dumb, slow implementations
+    /**
+     * Maps a code point to its normalized form, optionally lowercased.
+     * When lowerFirst is set, lowercasing (Locale.US) is applied both before
+     * and after normalization.
+     */
+    public static class NFToString implements Char32ToString {
+        Normalizer.Mode mode;
+        boolean lowerFirst;
+
+        NFToString(Normalizer.Mode m, boolean lowerFirst) {
+            this.mode = m;
+            this.lowerFirst = lowerFirst;
+        }
+
+        /**
+         * @param cp the code point to map
+         * @return the mapped string, or null when the mapping leaves cp unchanged
+         */
+        public String get(int cp) {
+            final String original = UTF16.valueOf(cp);
+
+            String mapped = lowerFirst
+                ? UCharacter.toLowerCase(Locale.US, original)
+                : original;
+            mapped = Normalizer.normalize(mapped, mode, 0);
+            if (lowerFirst) {
+                mapped = UCharacter.toLowerCase(Locale.US, mapped);
+            }
+
+            // null is the "no change" signal of Char32ToString.
+            return mapped.equals(original) ? null : mapped;
+        }
+    }
+        
+    
+    /**
+     * A mapping from a single code point (char32) to a string.
+     * Implementations of get return null when the mapping leaves the code
+     * point unchanged; close(UnicodeSet, Char32ToString) skips such code points.
+     */
+     
+    interface Char32ToString {
+        public String get(int cp);
+    }
+    
+    /**
+     * Extends s in place with every assigned code point whose mapping under f
+     * contains at least one code point that is in s.
+     *
+     * The code point range is walked once, in ascending order. A code point
+     * added during the walk is therefore seen only when testing higher code
+     * points; the walk is not repeated until the set stops growing.
+     *
+     * @param s the set to extend; modified in place
+     * @param f the mapping to close under; a null result means "unchanged"
+     */
+    public static void close(UnicodeSet s, Char32ToString f) {
+        for (int cp = 0; cp <= 0x10FFFF; ++cp) {
+            // NOTE(review): this compares ICU's type value against the
+            // java.lang.Character constant - confirm the two numberings agree.
+            int type = UCharacter.getType(cp);
+            if (type == Character.UNASSIGNED) continue;
+
+            String result = f.get(cp);
+            if (result == null) continue; // f leaves cp unchanged
+
+            // cp belongs in the closure when its mapping touches the set.
+            if (containsSome(s, result)) {
+                s.add(cp);
+            }
+        }
+    }
+    
+    // These should both be public, and on the respective classes
+    
+    public static void addAll(UnicodeSet s, String str) {
+        int cp;
+        for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(str,i);
+            s.add(cp);
+        }
+    }
+    
+    public static boolean containsSome(UnicodeSet s, String str) {
+        int cp;
+        for (int i = 0; i < str.length(); i += UTF16.getCharCount(cp)) {
+            cp = UTF16.charAt(str,i);
+            if (s.contains(cp)) return true;
+        }
+        return false;
+    }
+}