scuffed-code/tools/unicodetools/com/ibm/text/UCD/NFSkippable.java

package com.ibm.text.UCD;
import com.ibm.text.UTF16;
import com.ibm.text.UnicodeSet;
import java.util.BitSet;
import com.ibm.text.utility.*;
import java.io.PrintWriter;


public final class NFSkippable extends UnicodeProperty {
    
    static final boolean DEBUG = false;
    
    private Normalizer nf;
    private Normalizer nfd;
    private boolean composes;
    private int[] realTrailers = new int[100];
    private int realTrailerCount = 0;
    
    public NFSkippable(byte normalizerMode, String unicodeVersion) {
        isStandard = false;
        ucd = UCD.make(unicodeVersion);
        nf = new Normalizer(normalizerMode, unicodeVersion);
        name = nf.getName() + "_Skippable";
        shortName = nf.getName() + "_Skip";
        header = "# Derived Property: " + name
            + "\r\n#   Generated according to UAX #15."
            + "\r\n#   Characters that don't interact with any others in this normalization form."
            + "\r\n#   WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."
            + "\r\n#            The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";

        nfd = new Normalizer(Normalizer.NFD, unicodeVersion);
        composes = normalizerMode == Normalizer.NFC || normalizerMode == Normalizer.NFKC;
        
        // preprocess to find possible trailers
        
        if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {
            if (nf.isTrailing(cp2)) {
                //System.out.println("Trailing: " + ucd.getCodeAndName(cp2));
                if (ucd.isTrailingJamo(cp2)) {
                    //System.out.println("Jamo: " + ucd.getCodeAndName(cp2));
                    continue;
                }
                realTrailers[realTrailerCount++] = cp2;
            }
        }
        Utility.fixDot();
        //System.out.println("trailer count: " + realTrailerCount);
    }
    
    /** A skippable character is<br>
    * a) unassigned, or ALL of the following:<br>
    * b) of combining class 0.<br>
    * c) not decomposed by this normalization form.<br>
    * AND if NKC or NFKC, <br>
    * d) can never compose with a previous character.<br>
    * e) can never compose with a following character.<br>
    * f) can never change if another character is added.
    *    Example: a-breve might satisfy all but f, but if you
    *    add an ogonek it changes to a-ogonek + breve
    */
    
    String cause = "";
    
    public boolean hasProperty(int cp) {
        // quick check on some special classes
        if (DEBUG) cause = "\t\tunassigned";
        if (!ucd.isAssigned(cp)) return true;
        
        if (DEBUG) cause = "\t\tnf differs";
        if (nf.normalizationDiffers(cp)) return false;
        
        if (DEBUG) cause = "\t\tnon-zero cc";
        if (ucd.getCombiningClass(cp) != 0) return false;
        
        if (DEBUG) cause = "";
        if (!composes) return true;
        
        // now special checks for composing normalizers
        if (DEBUG) cause = "\t\tleading";
        if (nf.isLeading(cp)) return false;

        if (DEBUG) cause = "\t\ttrailing";
        if (nf.isTrailing(cp)) return false;
        
        // OPTIMIZATION -- careful
        // If there is no NFD decomposition, then this character's accents can't be
        // "displaced", so we don't have to test further
        
        if (DEBUG) cause = "\t\tno decomp";
        if (!nfd.normalizationDiffers(cp)) return true;
        
        // OPTIMIZATION -- careful
        // Hangul syllables are skippable IFF they are isLeadingJamoComposition
        if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp);
        
        // We now see if adding another character causes a problem. 
        // brute force for now!!
        // We do skip the trailing Jamo, since those never displace!
        
        StringBuffer base = new StringBuffer(UTF16.valueOf(cp));
        int baseLen = base.length();
        for (int i = 0; i < realTrailerCount; ++i) {
            base.setLength(baseLen); // shorten if needed
            base.append(UTF16.valueOf(realTrailers[i]));
            String probe = base.toString();
            String result = nf.normalize(probe);
            if (!result.equals(probe)) {
                if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]);
                return false;
            }
        }
        
        // passed the sieve, so we are ok
        if (DEBUG) cause = "";
        return true;
    }
    
    // both the following should go into UTF16
    
    public static String replace(String source, int toReplace, int replacement) {
        if (0 <= toReplace && toReplace <= 0xFFFF
            && 0 <= replacement && replacement <= 0xFFFF) {
            return source.replace((char)toReplace, (char)replacement);
        }
        return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement));
    }
    
    public static String replace(String source, String toReplace, String replacement) {
        int pos = 0;
        StringBuffer result = new StringBuffer(source.length());
        while (true) {
            int newPos = source.indexOf(toReplace, pos);
            if (newPos >= 0) {
                result.append(source.substring(pos, newPos));
                result.append(replacement);
                pos = newPos + toReplace.length();
            } else if (pos != 0) {
                result.append(source.substring(pos));
                return result.toString();
            } else {
                return source; // no change necessary
            }
        }
    }
    
    static void writeStringInPieces(PrintWriter pw, String s, String term) {
        int start;
        int end;
        int lineLen = 64;
        for (start = 0; ; start = end) {
            if (start == 0) pw.print("\t  \"");
            else pw.print("\t+ \"");
            end = s.length();
            if (end > start + lineLen) end = start + lineLen;
            
            // if we have a slash in the last 5 characters, backup
            
            int lastSlash = s.lastIndexOf('\\', end);
            if (lastSlash >= end-5) end = lastSlash;
            
            // backup if we broke on a \
            
            while (end > start && s.charAt(end-1) == '\\') --end;
            
            pw.print(s.substring(start, end));
            if (end == s.length()) {
                pw.println('"' + term);
                break;
            } else {
                pw.println('"');
            }
        }
    }
    
    static void testWriteStringInPieces() {
        String test =
	  "[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"
	+ "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F"
	+ "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-"
	+ "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017"
	+ "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u"
	+ "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226";
	    PrintWriter pw = new PrintWriter(System.out);
	    writeStringInPieces(pw,test,"");
	    writeStringInPieces(pw,replace(test, "\\", "\\\\"),"");
	    
	    pw.flush();
	}
        
    static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller
    
    public static void main (String[] args) throws java.io.IOException {
        
        String version = ""; // Unicode version, "" = latest released
        
        PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");
        
        for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {
            UnicodeProperty up = DerivedProperty.getProperty(mode, UCD.make(version));
            generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);
        }
        
        for (byte mode = NFD; mode <= NFKC; ++mode) {
            NFSkippable skipper = new NFSkippable(mode,version);
            generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);
        }
        
        out.close();
    }
    
    static void generateSet(PrintWriter out, String label, UnicodeProperty up) {
        System.out.println("Generating: " + up.getName(NORMAL));
        UnicodeSet result = new UnicodeSet();
        for (int cp = 0; cp <= limit; ++cp) {
            Utility.dot(cp);
            if (up.hasProperty(cp)) result.add(cp);
        }
        Utility.fixDot();
            
        String rSet = result.toPattern(true);          
        rSet = replace(rSet, "\\U", "\\\\U");
        out.println(label + " = new UnicodeSet(");
        writeStringInPieces(out, rSet, ", false);");
        out.println();
            
        rSet = result.toPattern(false);
        out.println("/*Unicode: ");
        writeStringInPieces(out, rSet, "*/");
        out.println();
        out.flush();
    }
    
            /*
       // DerivedProperty dp = new DerivedProperty(UCD.make(version));
        
            System.out.println(skipper.getName(NORMAL));
            
            UnicodeSet result = new UnicodeSet();
            for (int cp = 0; cp <= limit; ++cp) {
                Utility.dot(cp);
                if (skipper.hasProperty(cp)) result.add(cp);
            }
            Utility.fixDot();
            
            String rSet = result.toPattern(true);
            rSet = replace(rSet, "\\U", "\\\\U");
            out.println("\tSKIPPABLE[" + skipper.getName(NORMAL)
                + "] = new UnicodeSet(");
            writeStringInPieces(out, rSet, ", false);");
            out.println();
            
            rSet = result.toPattern(false);
            out.println("/*Unicode: ");
            */
            //writeStringInPieces(out, rSet, "*/");
            /*out.println();
            out.flush();
        
        if (false) {
            NFSkippable skipper = new NFSkippable(Normalizer.NFC,"");
            NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,"");
            for (int cp = 0; cp <= 0x10FFFF; ++cp) {
                if (cp > 0xFF) {
                    if (!skipper.ucd.isAssigned(cp)) continue;
                    byte cat = skipper.ucd.getCategory(cp);
                    if (cat == PRIVATE_USE || cat == SURROGATE) continue;
                    if (skipper.ucd.getCombiningClass(cp) != 0) continue;
                    if (skipper.nf.normalizationDiffers(cp)) continue;
                    if ((cp < 0xAC00 || cp > 0xAE00)
                        && cp != skipper.ucd.mapToRepresentative(cp, false)) continue;
                }
                
                if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue;
                
                String status = (skipper.hasProperty(cp) ? "  SKIPc " : "NOSKIPc ")
                    + (skipper2.hasProperty(cp) ? "  SKIPkc " : "NOSKIPkc ");
                System.out.println(status
                    + skipper.ucd.getCodeAndName(cp)
                    + skipper.cause);
            }
        }
        */
    
}
ICU-0 Added Unicode property NF_SKIPPABLE X-SVN-Rev: 7274 2001-12-03 19:29:35 +00:00			`package com.ibm.text.UCD;`
			`import com.ibm.text.UTF16;`
			`import com.ibm.text.UnicodeSet;`
			`import java.util.BitSet;`
			`import com.ibm.text.utility.*;`
			`import java.io.PrintWriter;`


			`public final class NFSkippable extends UnicodeProperty {`

			`static final boolean DEBUG = false;`

			`private Normalizer nf;`
			`private Normalizer nfd;`
			`private boolean composes;`
			`private int[] realTrailers = new int[100];`
			`private int realTrailerCount = 0;`

			`public NFSkippable(byte normalizerMode, String unicodeVersion) {`
			`isStandard = false;`
			`ucd = UCD.make(unicodeVersion);`
			`nf = new Normalizer(normalizerMode, unicodeVersion);`
			`name = nf.getName() + "_Skippable";`
			`shortName = nf.getName() + "_Skip";`
			`header = "# Derived Property: " + name`
			`+ "\r\n# Generated according to UAX #15."`
			`+ "\r\n# Characters that don't interact with any others in this normalization form."`
			`+ "\r\n# WARNING: Normalization of STRINGS must use the algorithm in UAX #15 because characters may interact."`
			`+ "\r\n# The length of a normalized string is not necessarily the sum of the lengths of the normalized characters!";`

			`nfd = new Normalizer(Normalizer.NFD, unicodeVersion);`
			`composes = normalizerMode == Normalizer.NFC \|\| normalizerMode == Normalizer.NFKC;`

			`// preprocess to find possible trailers`

			`if (composes) for (int cp2 = 0; cp2 <= 0x10FFFF; ++cp2) {`
			`if (nf.isTrailing(cp2)) {`
			`//System.out.println("Trailing: " + ucd.getCodeAndName(cp2));`
			`if (ucd.isTrailingJamo(cp2)) {`
			`//System.out.println("Jamo: " + ucd.getCodeAndName(cp2));`
			`continue;`
			`}`
			`realTrailers[realTrailerCount++] = cp2;`
			`}`
			`}`
			`Utility.fixDot();`
			`//System.out.println("trailer count: " + realTrailerCount);`
			`}`

			`/** A skippable character is<br>`
			`* a) unassigned, or ALL of the following:<br>`
			`* b) of combining class 0.<br>`
			`* c) not decomposed by this normalization form.<br>`
			`* AND if NKC or NFKC, <br>`
			`* d) can never compose with a previous character.<br>`
			`* e) can never compose with a following character.<br>`
			`* f) can never change if another character is added.`
			`* Example: a-breve might satisfy all but f, but if you`
			`* add an ogonek it changes to a-ogonek + breve`
			`*/`

			`String cause = "";`

			`public boolean hasProperty(int cp) {`
			`// quick check on some special classes`
			`if (DEBUG) cause = "\t\tunassigned";`
			`if (!ucd.isAssigned(cp)) return true;`

			`if (DEBUG) cause = "\t\tnf differs";`
			`if (nf.normalizationDiffers(cp)) return false;`

			`if (DEBUG) cause = "\t\tnon-zero cc";`
			`if (ucd.getCombiningClass(cp) != 0) return false;`

			`if (DEBUG) cause = "";`
			`if (!composes) return true;`

			`// now special checks for composing normalizers`
			`if (DEBUG) cause = "\t\tleading";`
			`if (nf.isLeading(cp)) return false;`

			`if (DEBUG) cause = "\t\ttrailing";`
			`if (nf.isTrailing(cp)) return false;`

			`// OPTIMIZATION -- careful`
			`// If there is no NFD decomposition, then this character's accents can't be`
			`// "displaced", so we don't have to test further`

			`if (DEBUG) cause = "\t\tno decomp";`
			`if (!nfd.normalizationDiffers(cp)) return true;`

			`// OPTIMIZATION -- careful`
			`// Hangul syllables are skippable IFF they are isLeadingJamoComposition`
			`if (ucd.isHangulSyllable(cp)) return !ucd.isLeadingJamoComposition(cp);`

			`// We now see if adding another character causes a problem.`
			`// brute force for now!!`
			`// We do skip the trailing Jamo, since those never displace!`

			`StringBuffer base = new StringBuffer(UTF16.valueOf(cp));`
			`int baseLen = base.length();`
			`for (int i = 0; i < realTrailerCount; ++i) {`
			`base.setLength(baseLen); // shorten if needed`
			`base.append(UTF16.valueOf(realTrailers[i]));`
			`String probe = base.toString();`
			`String result = nf.normalize(probe);`
			`if (!result.equals(probe)) {`
			`if (DEBUG) cause = "\t\tinteracts with " + ucd.getCodeAndName(realTrailers[i]);`
			`return false;`
			`}`
			`}`

			`// passed the sieve, so we are ok`
			`if (DEBUG) cause = "";`
			`return true;`
			`}`

			`// both the following should go into UTF16`

			`public static String replace(String source, int toReplace, int replacement) {`
			`if (0 <= toReplace && toReplace <= 0xFFFF`
			`&& 0 <= replacement && replacement <= 0xFFFF) {`
			`return source.replace((char)toReplace, (char)replacement);`
			`}`
			`return replace(source, UTF16.valueOf(toReplace), UTF16.valueOf(replacement));`
			`}`

			`public static String replace(String source, String toReplace, String replacement) {`
			`int pos = 0;`
			`StringBuffer result = new StringBuffer(source.length());`
			`while (true) {`
			`int newPos = source.indexOf(toReplace, pos);`
			`if (newPos >= 0) {`
			`result.append(source.substring(pos, newPos));`
			`result.append(replacement);`
			`pos = newPos + toReplace.length();`
			`} else if (pos != 0) {`
			`result.append(source.substring(pos));`
			`return result.toString();`
			`} else {`
			`return source; // no change necessary`
			`}`
			`}`
			`}`

			`static void writeStringInPieces(PrintWriter pw, String s, String term) {`
			`int start;`
			`int end;`
			`int lineLen = 64;`
			`for (start = 0; ; start = end) {`
			`if (start == 0) pw.print("\t \"");`
			`else pw.print("\t+ \"");`
			`end = s.length();`
			`if (end > start + lineLen) end = start + lineLen;`

			`// if we have a slash in the last 5 characters, backup`

			`int lastSlash = s.lastIndexOf('\\', end);`
			`if (lastSlash >= end-5) end = lastSlash;`

			`// backup if we broke on a \`

			`while (end > start && s.charAt(end-1) == '\\') --end;`

			`pw.print(s.substring(start, end));`
			`if (end == s.length()) {`
			`pw.println('"' + term);`
			`break;`
			`} else {`
			`pw.println('"');`
			`}`
			`}`
			`}`

			`static void testWriteStringInPieces() {`
			`String test =`
			`"[^\\u00C0-\\u00C5\\u00C7-\\u00CF\\u00D1-\\u00D6\\u00D9-\\u00DD"`
			`+ "\\u00E0-\\u00E5\\u00E7-\\u00EF\\u00F1-\\u00F6\\u00F9-\\u00FD\\u00F"`
			`+ "F-\\u010F\\u0112-\\u0125\\u0128-\\u0130\\u0134-\\u0137\\u0139-"`
			`+ "\\u013E\\u0143-\\u0148\\u014C-\\u0151\\u0154-\\u0165\\u0168-\\u017"`
			`+ "E\\u01A0-\\u01A1\\u01AF-\\u01B0\\u01CD-\\u01DC\\u01DE-\\u01E3\\u"`
			`+ "01E6-\\u01F0\\u01F4-\\u01F5\\u01F8-\\u021B\\u021E-\\u021F\\u0226";`
			`PrintWriter pw = new PrintWriter(System.out);`
			`writeStringInPieces(pw,test,"");`
			`writeStringInPieces(pw,replace(test, "\\", "\\\\"),"");`

			`pw.flush();`
			`}`

			`static int limit = 0x10FFFF; // full version = 10ffff, for testing may use smaller`

			`public static void main (String[] args) throws java.io.IOException {`

			`String version = ""; // Unicode version, "" = latest released`

			`PrintWriter out = Utility.openPrintWriter("NFSafeSets.txt");`

			`for (int mode = NFD_UnsafeStart; mode <= NFKC_UnsafeStart; ++mode) {`
			`UnicodeProperty up = DerivedProperty.getProperty(mode, UCD.make(version));`
			`generateSet(out, "UNSAFE[" + Normalizer.getName((byte)(mode-NFD_UnsafeStart)) + "]", up);`
			`}`

			`for (byte mode = NFD; mode <= NFKC; ++mode) {`
			`NFSkippable skipper = new NFSkippable(mode,version);`
			`generateSet(out, "SKIPPABLE[" + Normalizer.getName(mode) + "]", skipper);`
			`}`

			`out.close();`
			`}`

			`static void generateSet(PrintWriter out, String label, UnicodeProperty up) {`
			`System.out.println("Generating: " + up.getName(NORMAL));`
			`UnicodeSet result = new UnicodeSet();`
			`for (int cp = 0; cp <= limit; ++cp) {`
			`Utility.dot(cp);`
			`if (up.hasProperty(cp)) result.add(cp);`
			`}`
			`Utility.fixDot();`

			`String rSet = result.toPattern(true);`
			`rSet = replace(rSet, "\\U", "\\\\U");`
			`out.println(label + " = new UnicodeSet(");`
			`writeStringInPieces(out, rSet, ", false);");`
			`out.println();`

			`rSet = result.toPattern(false);`
			`out.println("/*Unicode: ");`
			`writeStringInPieces(out, rSet, "*/");`
			`out.println();`
			`out.flush();`
			`}`

			`/*`
			`// DerivedProperty dp = new DerivedProperty(UCD.make(version));`

			`System.out.println(skipper.getName(NORMAL));`

			`UnicodeSet result = new UnicodeSet();`
			`for (int cp = 0; cp <= limit; ++cp) {`
			`Utility.dot(cp);`
			`if (skipper.hasProperty(cp)) result.add(cp);`
			`}`
			`Utility.fixDot();`

			`String rSet = result.toPattern(true);`
			`rSet = replace(rSet, "\\U", "\\\\U");`
			`out.println("\tSKIPPABLE[" + skipper.getName(NORMAL)`
			`+ "] = new UnicodeSet(");`
			`writeStringInPieces(out, rSet, ", false);");`
			`out.println();`

			`rSet = result.toPattern(false);`
			`out.println("/*Unicode: ");`
			`*/`
			`//writeStringInPieces(out, rSet, "*/");`
			`/*out.println();`
			`out.flush();`

			`if (false) {`
			`NFSkippable skipper = new NFSkippable(Normalizer.NFC,"");`
			`NFSkippable skipper2 = new NFSkippable(Normalizer.NFKC,"");`
			`for (int cp = 0; cp <= 0x10FFFF; ++cp) {`
			`if (cp > 0xFF) {`
			`if (!skipper.ucd.isAssigned(cp)) continue;`
			`byte cat = skipper.ucd.getCategory(cp);`
			`if (cat == PRIVATE_USE \|\| cat == SURROGATE) continue;`
			`if (skipper.ucd.getCombiningClass(cp) != 0) continue;`
			`if (skipper.nf.normalizationDiffers(cp)) continue;`
			`if ((cp < 0xAC00 \|\| cp > 0xAE00)`
			`&& cp != skipper.ucd.mapToRepresentative(cp, false)) continue;`
			`}`

			`if (skipper2.hasProperty(cp) == skipper.hasProperty(cp)) continue;`

			`String status = (skipper.hasProperty(cp) ? " SKIPc " : "NOSKIPc ")`
			`+ (skipper2.hasProperty(cp) ? " SKIPkc " : "NOSKIPkc ");`
			`System.out.println(status`
			`+ skipper.ucd.getCodeAndName(cp)`
			`+ skipper.cause);`
			`}`
			`}`
			`*/`

			`}`