d1ef79fafd
X-SVN-Rev: 14468
134 lines
5.0 KiB
Java
134 lines
5.0 KiB
Java
/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateThaiBreaks.java,v $
* $Date: 2004/02/07 01:01:14 $
* $Revision: 1.4 $
*
*******************************************************************************
*/
|
|
|
|
package com.ibm.text.UCD;
|
|
import java.io.*;
|
|
import com.ibm.icu.text.UTF16;
|
|
import com.ibm.text.utility.*;
|
|
import com.ibm.icu.text.UnicodeSet;
|
|
import java.util.*;
|
|
|
|
public class GenerateThaiBreaks {
|
|
public static void main(String [] args) throws IOException {
|
|
|
|
BufferedReader br = new BufferedReader(
|
|
new InputStreamReader(
|
|
new FileInputStream("c:\\icu4j\\src\\com\\ibm\\icu\\dev\\data\\thai6.ucs"), "UnicodeLittle"));
|
|
PrintWriter out = null;
|
|
|
|
try {
|
|
|
|
UnicodeSet ignorables = new UnicodeSet();
|
|
/* new UnicodeSet(0xE30, 0xE3A);
|
|
ignorables.add(0x0E40, 0x0E44); // add logical order exception
|
|
ignorables.add(0x0E47, 0x0E4E);
|
|
*/
|
|
ignorables.add(0, ' '); // add controls
|
|
ignorables.add('.');
|
|
|
|
|
|
UnicodeSet initials = new UnicodeSet();
|
|
UnicodeSet finals = new UnicodeSet();
|
|
UnicodeSet medials = new UnicodeSet();
|
|
|
|
char[] buffer = new char[100];
|
|
|
|
while (true) {
|
|
String line = br.readLine();
|
|
if (line == null) break;
|
|
int end = 0;
|
|
|
|
// find 'real' characters
|
|
for (int i = 0; i < line.length(); ++i) {
|
|
char c = line.charAt(i);
|
|
if (ignorables.contains(c)) continue;
|
|
buffer[end++] = c;
|
|
}
|
|
String temp = new String(buffer, 0, end);
|
|
|
|
if (temp.length() <= 1) {
|
|
initials.add(temp);
|
|
finals.add(temp);
|
|
continue;
|
|
}
|
|
|
|
initials.add(temp.substring(0,1));
|
|
//initials.add(temp.substring(0,2));
|
|
finals.add(temp.substring(temp.length()-1));
|
|
//finals.add(temp.substring(temp.length()-1));
|
|
|
|
for (int i = 1; i < temp.length() - 1; ++i) {
|
|
//medials.add(temp.substring(i, i+2));
|
|
medials.add(temp.substring(i, i+1));
|
|
}
|
|
//medials.add(temp.substring(temp.length() - 2, temp.length() - 1));
|
|
}
|
|
|
|
System.out.println("initials size: " + initials.size());
|
|
System.out.println("finals size: " + finals.size());
|
|
System.out.println("medials size: " + medials.size());
|
|
|
|
//out = Utility.openPrintWriter("ThaiData.txt", Utility.UTF8_WINDOWS);
|
|
// out.write('\uFEFF');
|
|
|
|
UnicodeSet marks = new UnicodeSet("[[\u0e00-\u0e7f]&[[:mn:][:me:]]]");
|
|
finals.addAll(marks);
|
|
|
|
UnicodeSet all = new UnicodeSet(initials).addAll(medials).addAll(finals);
|
|
|
|
UnicodeSet missingThai = new UnicodeSet("[[\u0e00-\u0e7f]-[:Cn:]]").removeAll(all);
|
|
|
|
System.out.println("Never occur: " + missingThai.toPattern(true));
|
|
Utility.showSetNames("", missingThai, true, Default.ucd());
|
|
System.out.println();
|
|
|
|
UnicodeSet neverInitial = new UnicodeSet(all).removeAll(initials);
|
|
UnicodeSet neverFinal = new UnicodeSet(all).removeAll(finals);
|
|
|
|
System.out.println("Never initial: " + neverInitial.toPattern(true));
|
|
Utility.showSetNames("", neverInitial, true, Default.ucd());
|
|
System.out.println();
|
|
|
|
System.out.println("Never final: " + neverFinal.toPattern(true));
|
|
Utility.showSetNames("", neverFinal, true, Default.ucd());
|
|
System.out.println();
|
|
|
|
initials.removeAll(medials);
|
|
finals.removeAll(medials);
|
|
|
|
System.out.println("initials size: " + initials.size());
|
|
System.out.println("finals size: " + finals.size());
|
|
|
|
System.out.println("Only Initials" + initials.toPattern(true));
|
|
Utility.showSetNames("", initials, true, Default.ucd());
|
|
System.out.println();
|
|
|
|
System.out.println("Only Finals" + finals.toPattern(true));
|
|
Utility.showSetNames("", finals, true, Default.ucd());
|
|
} finally {
|
|
br.close();
|
|
if (out != null) out.close();
|
|
}
|
|
}
|
|
|
|
static class MyBreaker implements Utility.Breaker {
|
|
public String get(Object current, Object old) {
|
|
if (old == null || UTF16.charAt(current.toString(), 0) == UTF16.charAt(old.toString(), 0)) {
|
|
return current.toString() + "(" + Default.ucd().getCode(current.toString().substring(1)) + "))";
|
|
} else {
|
|
return "\r\n" + current + "(" + Default.ucd().getCode(current.toString()) + "))";
|
|
}
|
|
}
|
|
public boolean filter(Object current) { return true; }
|
|
}
|
|
} |