scuffed-code/tools/unicodetools/com/ibm/text/UCD/BuildNames.java
2001-12-13 23:36:29 +00:00

549 lines
18 KiB
Java

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/BuildNames.java,v $
* $Date: 2001/12/13 23:35:54 $
* $Revision: 1.3 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.io.IOException;
//import com.ibm.text.unicode.UInfo;
import java.util.*;
import java.io.*;
//import java.text.*;
import com.ibm.text.utility.*;
public class BuildNames implements UCD_Types {
/**
 * Compile-time switch for diagnostic output. While it is {@code true},
 * {@code lookup}, {@code putString} and {@code addString} print a trace
 * line to {@code System.out} for every token they resolve or create.
 */
static final boolean DEBUG = true;

/**
 * Command-line entry point. Initialises the shared UCD instance through
 * {@code Main.setUCD()} and then runs {@link #collectWords()}.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException propagated from {@code collectWords()}
 */
public static void main(String[] args) throws IOException {
    Main.setUCD();
    collectWords();
}
/** Distinct words passed to {@link #stash(String)}, ordered by a LengthFirstComparator. */
static Set words = new TreeSet(new LengthFirstComparator());

/** Distinct transformed names added by {@code collectWords()}, ordered by a LengthFirstComparator. */
static Set lines = new TreeSet(new LengthFirstComparator());

/** Occurrence count per character value (index = char code) over every stashed word. */
static int[] letters = new int[128];

/**
 * Records a word in {@link #words} and bumps the {@link #letters} counter
 * of each of its characters.
 *
 * @param word the word to record; a character with a code of 128 or more
 *             makes the counter update throw ArrayIndexOutOfBoundsException
 */
static void stash(String word) {
    words.add(word);
    final char[] chars = word.toCharArray();
    int k = 0;
    while (k < chars.length) {
        letters[chars[k]] += 1;
        k++;
    }
}
/**
 * Normalises a name before it is split into words.
 * <ul>
 * <li>Each of '-', '&lt;' and '&gt;' is surrounded by single spaces (none is
 *     added at the very start, after an existing space in the output, at the
 *     very end, or before an existing space in the input).</li>
 * <li>ASCII lower-case letters are upper-cased.</li>
 * <li>An ASCII digit d becomes '*' followed by the letter 'A' + d.</li>
 * </ul>
 * All other characters are copied unchanged.
 *
 * @param line the name to normalise
 * @return {@code line} itself (untrimmed) when none of the rules fired,
 *         otherwise the rewritten text with leading and trailing
 *         whitespace trimmed
 */
static String transform(String line) {
    final int length = line.length();
    final StringBuffer out = new StringBuffer();
    boolean modified = false;
    for (int pos = 0; pos < length; ++pos) {
        final char ch = line.charAt(pos);
        switch (ch) {
        case '-':
        case '<':
        case '>': {
            // Isolate the punctuation as its own space-delimited word.
            final int outLen = out.length();
            if (outLen != 0 && out.charAt(outLen - 1) != ' ') {
                out.append(' ');
            }
            out.append(ch);
            final int next = pos + 1;
            if (next < length && line.charAt(next) != ' ') {
                out.append(' ');
            }
            modified = true;
            break;
        }
        default:
            if (ch >= 'a' && ch <= 'z') {
                out.append((char) ('A' + (ch - 'a')));
                modified = true;
            } else if (ch >= '0' && ch <= '9') {
                out.append('*');
                out.append((char) ('A' + (ch - '0')));
                modified = true;
            } else {
                out.append(ch);
            }
            break;
        }
    }
    return modified ? out.toString().trim() : line;
}
/**
 * Collects character names, then feeds their words and full lines to
 * {@code CompactName}, reporting every entry that fails to round-trip.
 * <ol>
 * <li>For every code point U+0000..U+10FFFF for which
 *     {@code Main.ucd.hasComputableName} is false, the name is passed through
 *     {@link #transform(String)}, split on spaces, each part is handed to
 *     {@link #stash(String)}, and the whole transformed name is added to
 *     {@link #lines}.</li>
 * <li>Every entry of {@link #words} is passed to {@code CompactName.addWord}.</li>
 * <li>After {@code CompactName.startLines()}, every entry of {@link #lines}
 *     is passed to {@code CompactName.addLine}.</li>
 * </ol>
 * All output goes to {@code System.out}.
 *
 * @throws IOException declared for callers; nothing in this method body
 *         throws it directly
 */
static void collectWords() throws IOException {
    System.out.println("Gathering data");
    String[] parts = new String[100];
    int used = 0;
    int sum = 0;
    // The Unicode code space ends at U+10FFFF inclusive, hence "<=".
    for (int cp = 0; cp <= 0x10FFFF; ++cp) {
        if (Main.ucd.hasComputableName(cp)) continue;
        String name = transform(Main.ucd.getName(cp));
        sum += name.length();
        used++;
        int len = Utility.split(name, ' ', parts);
        for (int j = 0; j < len; ++j) {
            stash(parts[j]);
        }
        lines.add(name);
    }

    // Guard the percentage: with no names collected the division would throw.
    int overhead = lastLink - used;
    System.out.println("Overhead: " + overhead + ", "
        + (used == 0 ? "n/a" : String.valueOf(overhead * 100 / used) + "%"));
    System.out.println("Strings: " + sum + ", " + (lastLink*4));

    System.out.println();
    System.out.println("Compacting Words");
    System.out.println();
    compactAndReport(words.iterator(), false);

    System.out.println();
    System.out.println("Compacting Lines");
    System.out.println();
    CompactName.startLines();
    compactAndReport(lines.iterator(), true);

    System.out.println("Strings: " + sum
        + ", " + (CompactName.spacedMinimum*4)
        + ", " + (CompactName.lastToken*4)
        );
}

/**
 * Adds every string from {@code it} to CompactName and prints the ones whose
 * token does not decode back to the original string.
 *
 * @param it      iterator over the strings to add
 * @param asLines {@code true} to use {@code CompactName.addLine},
 *                {@code false} to use {@code CompactName.addWord}
 */
private static void compactAndReport(Iterator it, boolean asLines) {
    // Flip to true while debugging to print every entry, not only failures.
    final boolean showAll = false;
    int printed = 0; // numbers the printed entries only, as before
    while (it.hasNext()) {
        String s = (String) it.next();
        if (asLines && s.equals("< BELL >")) {
            // Debugging hook kept from the original line loop.
            System.out.println("DEBUG");
        }
        int token = asLines ? CompactName.addLine(s) : CompactName.addWord(s);
        String round = CompactName.stringFromToken(token);
        boolean goesRound = round.equals(s);
        if (showAll || !goesRound) {
            // The failing round-trip text is appended only when the round trip
            // failed; the word loop used to test the opposite condition and so
            // never showed it.
            System.out.println("Compacting: '" + s + "': " + printed++ + "(" + CompactName.lastToken + ")"
                + (!goesRound ? ": NO RT: '" + round + "'" : ""));
        }
    }
}
/*
Set stuff = new TreeSet();
for (int i = 0; i < letters.length; ++i) {
if (letters[i] != 0) {
stuff.add(new Integer((letters[i] << 8) + i));
}
}
it = stuff.iterator();
while (it.hasNext()) {
int in = ((Integer) it.next()).intValue();
System.out.println((char)(in & 0xFF) + ":\t" + String.valueOf(in >> 8));
}
int r = addString(name);
if (!DEBUG && !rname.equals(name)) {
System.out.println("\tNo Round Trip: '" + rname + "'");
}
*/
/** Maps a string to the Integer index of its entry in the link table; filled by putString. */
static Map stringToInt = new HashMap();

/** Declared as the reverse map; nothing in the visible code populates it. */
static Map intToString = new HashMap();

/**
 * Character code to token number. Tokens are numbered from 1 in the order
 * space, '-', '&gt;', '&lt;', 'A'..'Z', '0'..'9'; every other index stays 0.
 */
static final int[] remap = new int['Z'+1];

/** One more than the highest token number handed out in {@link #remap}. */
static final int maxToken;

static {
    // The position in this string (plus one) is the token number.
    final String alphabet = " -><ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
    int code = 0;
    while (code < alphabet.length()) {
        final char ch = alphabet.charAt(code);
        remap[ch] = ++code;
    }
    maxToken = code + 1;
}

/** Token number back to its one-character string; token 0 is the empty string. */
static final String[] unmap = new String[maxToken];

static {
    unmap[0] = "";
    for (char ch = 0; ch < remap.length; ++ch) {
        final int code = remap[ch];
        if (code == 0) continue; // character has no token
        unmap[code] = String.valueOf(ch);
    }
}
/**
 * Link table. Each entry packs a lead token in the upper 16 bits and a
 * trail token in the lower 16 bits (written by putString, read by lookup).
 */
static int[] links = new int[40000];

static final int linkStart = 0;

/** Index of the next free slot in {@link #links}. */
static int lastLink = 0;

/**
 * Highest 15-bit value that denotes a link index; larger values encode a
 * literal of up to two characters as {@code first * maxToken + second}.
 */
static final int LITERAL_BOUND = 0x7FFF - maxToken * maxToken;

/**
 * Tells whether a token, ignoring its 0x8000 flag bit, lies in the literal
 * range above {@link #LITERAL_BOUND}. Note that -1 (the "not found" result
 * of getInt) masks to 0x7FFF and is therefore reported as a literal.
 *
 * @param i the token to classify
 * @return {@code true} when the low 15 bits exceed {@code LITERAL_BOUND}
 */
static boolean isLiteral(int i) {
    final int withoutFlag = i & 0x7FFF;
    return !(withoutFlag <= LITERAL_BOUND);
}
/**
 * Expands a token into the string it stands for.
 * <p>
 * Bit 0x8000 asks for a space to be appended to the result. The remaining
 * value is either a literal (above {@code LITERAL_BOUND}; decoded as two
 * token numbers through {@code unmap}) or an index into {@code links},
 * whose lead and trail halves are expanded recursively and concatenated.
 *
 * @param i the token to expand
 * @return the decoded string
 */
static String lookup(int i) {
    final boolean appendSpace = (i & 0x8000) != 0;
    if (appendSpace) {
        i ^= 0x8000; // strip the flag so only the payload remains
    }

    String text;
    if (i <= LITERAL_BOUND) {
        // Link entry: upper half is the lead token, lower half the trail token.
        final int packed = links[i];
        final int leadToken = packed >>> 16;
        final int trailToken = packed & 0xFFFF;
        text = lookup(leadToken) + lookup(trailToken);
    } else {
        // Literal: two token numbers packed in base maxToken.
        i -= LITERAL_BOUND;
        text = unmap[i / maxToken] + unmap[i % maxToken];
    }

    if (appendSpace) {
        text = text + ' ';
    }
    if (DEBUG) {
        // For literals, i has already had the flag and LITERAL_BOUND removed here.
        System.out.println("token: " + i + " => '" + text + "'");
    }
    return text;
}
/**
 * Finds the token for a string without creating anything.
 * <ul>
 * <li>The empty string yields 0.</li>
 * <li>A string of one or two characters yields the literal token
 *     {@code LITERAL_BOUND + remap[first] * maxToken + remap[second]}
 *     (with character code 0 standing in for a missing second character).</li>
 * <li>A longer string is looked up in {@code stringToInt}.</li>
 * </ul>
 *
 * @param s the string to look up
 * @return the token, or -1 when a string of three or more characters has
 *         not been registered
 */
static int getInt(String s) {
    final int length = s.length();
    if (length == 0) {
        return 0;
    }
    if (length <= 2) {
        final int firstChar = s.charAt(0);
        final int secondChar = (length == 2) ? s.charAt(1) : 0;
        final int packed = remap[firstChar] * maxToken + remap[secondChar];
        return LITERAL_BOUND + packed;
    }
    final Integer known = (Integer) stringToInt.get(s);
    return (known == null) ? -1 : known.intValue();
}
/**
 * Registers a new string as the concatenation of two existing tokens.
 * <p>
 * The pair is stored in the next free slot of {@code links} (lead in the
 * upper 16 bits, trail in the lower 16 bits) and the slot index is recorded
 * for {@code s} in {@code stringToInt}. Callers may OR 0x8000 into
 * {@code lead} to mark a space between the two parts.
 *
 * @param s     the full string being registered
 * @param lead  token for the first part, optionally flagged with 0x8000
 * @param trail token for the second part; only its low 16 bits are stored
 * @return the link index assigned to {@code s}
 * @throws IllegalArgumentException if {@code s} is already registered
 * @throws IllegalStateException if no link index is left: indexes above
 *         {@code LITERAL_BOUND} would be decoded as literals by
 *         {@code lookup}, so they must never be handed out
 */
static int putString(String s, int lead, int trail) {
    if (stringToInt.get(s) != null) {
        throw new IllegalArgumentException("String already has a token: '" + s + "'");
    }
    // A link index must stay inside the table and below the literal range,
    // otherwise lookup()/isLiteral() would misread it as a two-character literal.
    if (lastLink > LITERAL_BOUND || lastLink >= links.length) {
        throw new IllegalStateException("Link table is full at index " + lastLink
            + " while adding '" + s + "'");
    }
    int value = (lead << 16) + (trail & 0xFFFF);
    int result = lastLink;
    links[lastLink++] = value;
    if (DEBUG) {
        System.out.println("'" + s + "', link[" + result + "] = lead: " + lead + ", trail: " + trail);
        // Verify immediately that the new entry decodes back to s.
        String roundTrip = lookup(result);
        if (!roundTrip.equals(s)) {
            System.out.println("\t*** No Round Trip: '" + roundTrip + "'");
        }
    }
    stringToInt.put(s, new Integer(result));
    return result;
}
/**
 * Returns the token for s, creating link entries (via putString) as needed.
 *
 * s cannot have a trailing space. Must be <,>,-,SPACE,0-9,A-Z
 *
 * Outline of the search:
 * 1. If getInt(s) already yields a token, return it.
 * 2. Try every split position i in [1, s.length() - 2]. When the character
 *    before i is a space, that space is dropped from the first part and
 *    0x8000 is OR'd into the lead token instead. If both parts already have
 *    tokens, register the pair and return.
 * 3. Otherwise remember the split with the longest part that already has a
 *    non-literal token, recurse on the other part, and register the pair.
 * 4. If nothing matched, split after the last space seen, or else at an even
 *    index near the middle, and recurse on both parts.
 *
 * @param s the string to tokenize
 * @return the token for s
 */
static int addString(String s) {
int result = getInt(s);
if (result != -1) return result;
// From here on s has at least 3 characters: getInt never returns -1 for shorter strings.
int limit = s.length() - 1;
int bestLen = 0;
int best_i = 0;
int bestSpaceLen = 0;
int bestSpace_i = 0;
int lastSpace = -1;
int spaceBits;
int endOfFirst;
// invariant. We break after a space if there is one.
for (int i = 1; i < limit; ++i) {
// c is the character just before the candidate split position i.
char c = s.charAt(i-1);
spaceBits = 0;
endOfFirst = i;
if (c == ' ') {
// Drop the space from the first part; it is represented by the 0x8000 flag.
lastSpace = i;
endOfFirst--;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(i);
// NOTE(review): looks like a leftover debugging hook; it only prints.
if (firstPart.equals("<START OF ")) {
System.out.println("HUH");
}
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0 && trail >= 0) { // if both match, return immediately with pair
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH BOTH");
return putString(s, spaceBits | lead, trail);
}
// isLiteral(-1) is true (-1 & 0x7FFF == 0x7FFF), so the !isLiteral tests below
// only pass for parts that are already registered as link entries.
if (!isLiteral(lead)) {
if (i > bestLen) {
bestLen = i;
best_i = i;
}
if (i > bestSpaceLen && c == ' ') {
bestSpaceLen = i;
// NOTE(review): the loop itself splits at i (lastPart = s.substring(i)), but i + 1
// is stored here; combined with the --endOfFirst further down this yields
// s.substring(0, i) / s.substring(i + 1), i.e. the space stays in the first part
// and s.charAt(i) is skipped. Possible off-by-one -- confirm before relying on it.
bestSpace_i = i + 1;
}
}
int end_i = s.length() - i;
if (!isLiteral(trail)) {
if (end_i > bestLen) {
bestLen = end_i;
best_i = i;
}
if (end_i > bestSpaceLen && c == ' ') {
bestSpaceLen = end_i;
// NOTE(review): same i + 1 offset as above.
bestSpace_i = i + 1;
}
}
}
// If any split after a space was seen, only space-aligned candidates count
// (this may reset bestLen to 0 and fall through to the fallback below).
if (lastSpace >= 0) {
bestLen = bestSpaceLen;
best_i = bestSpace_i;
}
spaceBits = 0;
if (bestLen > 0) { // if one matches, recurse -- and return pair
endOfFirst = best_i;
// lastSpace is only ever set to a loop index (>= 1), so "> 0" here is
// equivalent to the ">= 0" test above.
if (lastSpace > 0) {
--endOfFirst;
spaceBits = 0x8000;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
int lead = getInt(firstPart);
int trail = getInt(lastPart);
if (lead >= 0) {
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH FIRST");
return putString(s, spaceBits | lead, addString(lastPart));
} else {
// NOTE(review): trail is used as-is here; if it is -1 it would be stored as
// 0xFFFF by putString. Confirm that bestLen > 0 guarantees trail >= 0 on this path.
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' MATCH SECOND");
return putString(s, spaceBits | addString(firstPart), trail);
}
}
// otherwise, we failed to find anything. Then break before the last word, if there is one
// otherwise break in the middle (but at even value)
if (lastSpace >= 0) {
// First part ends before the space, last part starts right after it.
best_i = lastSpace;
endOfFirst = lastSpace - 1;
spaceBits = 0x8000;
} else {
endOfFirst = best_i = ((s.length() + 1) / 4) * 2;
}
String firstPart = s.substring(0, endOfFirst);
String lastPart = s.substring(best_i);
if (DEBUG) System.out.println(s + " => '" + firstPart + (spaceBits != 0 ? "*" : "")
+ "' # '" + lastPart + "' FALLBACK");
return putString(s, spaceBits | addString(firstPart), addString(lastPart));
}
/*
static int addCompression(String s) {
Object in = stringToInt.get(s);
if (in != null) return ((Integer) in).intValue();
// find best match, recursively
int bestBreak = -1;
boolean pickFirst = false;
for (int i = 1; i < s.length() - 1; ++i) {
char c = s.charAt(i);
if (c == ' ' || c == '-') {
Object pos1 = stringToInt.get(s.substring(0,i+1));
//Object pos23 = stringToInt.get(s..substring(i));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
}
}
}
static void gatherData() throws IOException {
System.out.println("Gathering data");
Counter counter = new Counter();
String[] parts = new String[100];
String[] parts2 = new String[100];
int total = 0;
for (int i = 0; i < 0x10FFFF; ++i) {
//if ((i & 0xFF) == 0) System.out.println(Utility.hex(i));
if (!ucd.isRepresented(i)) continue;
String s = ucd.getName(i);
total += s.length();
int len = Utility.split(s, ' ', parts);
for (int j = 0; j < len; ++j) {
if (parts[j].indexOf('-') >= 0) {
// hyphen stuff
int len2 = Utility.split(parts[j], '-', parts2);
for (int k = 0; k < len2; ++k) {
if (k == len2 - 1) {
counter.add(parts2[k] + '-');
} else {
counter.add(parts2[k] + " ");
}
}
} else {
// normal
counter.add(parts[j] + " ");
}
}
}
System.out.println("Sorting data");
Map m = counter.extract();
System.out.println("Printing data");
PrintWriter log = new PrintWriter(
new BufferedWriter(
new OutputStreamWriter(
new FileOutputStream(GEN_DIR + "NameCompression.txt")),
32*1024));
log.println("total: " + total);
Iterator it = m.keySet().iterator();
String mondo = "";
int i = 0;
int strTotal = 0;
int index = 0;
Map fullToCompressed = new HashMap();
String mondoIndex = "";
main:
while (it.hasNext()) {
index++;
if ((i & 255) == 0) System.out.println("#" + i);
Counter.RWInteger key = (Counter.RWInteger) it.next();
String value = (String)m.get(key);
log.println(i++ + ": " + key + ": \"" + value + "\"");
strTotal += value.length();
// first 128 are the highest frequency, inc. space
if (index < 128 - SINGLES) {
mondo += value;
fullToCompressed.put(value, new String((char)(index + reserved)));
continue;
}
int pos = mondo.indexOf(value);
if (pos >= 0) {
// try splitting!
int bestBreak = -1;
boolean pickFirst = false;
if (value.length() > 2) for (int k = 1; k < value.length()-1; ++k) {
int pos2 = mondo.indexOf(value.substring(0,k) + " ");
int pos3 = mondo.indexOf(value.substring(k));
if (pos2 >= 0 && pos3 >= 0) {
fullToCompressed.put(value, new Integer(index + reserved));
continue main;
}
if (pos2 >= 0) {
if (k > bestBreak) {
bestBreak = k;
pickFirst = true;
}
} else if (pos3 >= 0) {
if (value.length() - k > bestBreak) {
bestBreak = k;
pickFirst = false;
}
}
}
if (bestBreak > 0) {
if (pickFirst) {
mondo += value.substring(bestBreak);
} else {
mondo += value.substring(0, bestBreak) + " ";
}
} else {
mondo += value;
}
}
// high bit on, means 2 bytes, look in array
}
log.println("strTotal: " + strTotal);
log.println("mondo: " + mondo.length());
int k = 80;
for (; k < mondo.length(); k += 80) {
log.println(mondo.substring(k-80, k));
}
log.println(mondo.substring(k-80)); // last line
log.close();
}
static int indexOf(StringBuffer target, String source) {
int targetLen = target.length() - source.length();
main:
for (int i = 0; i <= targetLen; ++i) {
for (int j = 0; j < source.length(); ++j) {
if (target.charAt(i) != source.charAt(j)) continue main;
}
return i;
}
return -1;
}
static final int SINGLES = 26 + 10 + 2;
*/
/*
static String decode(int x) {
if (x < SINGLES) {
if (x < 26) return String.valueOf(x + 'A');
if (x < 36) return String.valueOf(x - 26 + '0');
if (x == 36) return "-";
return " ";
}
if (x < binaryLimit) {
x =
*/
}