package com.ibm.text.UCD; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.lang.UCharacter; import com.ibm.text.utility.*; import java.util.*; import java.io.*; // Enumerated properties will be IntCodePointProperty. // The string values they return will be the property value names. // Binary properties are Enumerated properties. They return 0 or 1 public final class TernaryStore { static final int DONE = Integer.MIN_VALUE; static final int NOT_FOUND = Integer.MIN_VALUE+1; // for testing static DepthPrinter dp; static void test() throws java.io.IOException { PrintWriter pw = Utility.openPrintWriter("TestTernary.txt", Utility.LATIN1_WINDOWS); try { dp = new DepthPrinter(pw); String[] tests = {"the", "quick", "fish", "fisherman", "fishes", "brown", "brow", "bracket", "bright", "brat", "brough", "dogs", "upper", "zebra", "fisher"}; test("Simple: ", tests, tests.length); tests = new String[300000]; int counter = 0; int i; for (i = 0; counter < tests.length && i <= 0x10FFFF; ++i) { if (Default.ucd().hasComputableName(i)) continue; String temp = UCharacter.getName(i); if (temp != null) tests[counter++] = temp.trim(); } System.out.println("max-cp: " + Utility.hex(i)); test("Unicode Names: ", tests, counter); //if (true) return; BufferedReader br = Utility.openReadFile(UCD_Types.BASE_DIR + "dict\\DiploFreq.txt", Utility.LATIN1); String line; counter = 0; while (counter < tests.length) { line = Utility.readDataLine(br); if (line == null) break; if (line.length() == 0) continue; Utility.dot(counter); int tabPos = line.indexOf('\t'); if (tabPos < 0) { System.out.println("???" + line); continue; } tests[counter++] = line.substring(tabPos+1); } test("French: ", tests, counter); } finally { pw.close(); } } static void test(String title, String[] tests, int len) { System.out.println(); System.out.println(title); dp.println(); dp.print(title, 0); dp.println(); TernaryStore.Builder builder = new TernaryStore.Builder(); int charCount = 0; for (int i = 0; i < len; ++i) { builder.add(tests[i], i); charCount += tests[i].length(); } System.out.println("charCount: " + charCount); TernaryStore store = builder.build(); store.showNodes(); store.checkNodes(); dp.println("Storage"); dp.println(store.stringStore.toString()); System.out.println("StorageSize: " + store.stringStore.toString().length()); Matcher matcher = store.getMatcher(); for (int i = 0; i < len; ++i) { int check = test(tests[i], matcher); if (check != i) { System.out.println("\tFail, result: " + tests[i] + ", " + check); } } } static int test(String s, Matcher matcher) { matcher.reset(s, 0); int lastResult = -1; for (int result = matcher.next(); result != DONE; result = matcher.next()) { lastResult = result; } return lastResult; } static final class Node { String getString(StringStore stringStore) { if (stringCode < 0) return tempString; return stringStore.get(stringCode); } void setString(String s) { tempString = s; } String tempString; int stringCode = -1; Node less; Node greater; Node next; int result = NOT_FOUND; public String toString(StringStore store) { return getString(store) + (result != NOT_FOUND ? "(" + result + ")" : "") + (next != null ? next.toString() : ""); } } Node base; StringStore stringStore = new StringStore(); final static class Matcher { TernaryStore store; String s; int position; Node lastNode; void reset(String s, int position) { this.s = s; this.position = position; this.lastNode = store.base; } // returns the next result // or DONE when done // sets position to point after end of found string int next() { while (lastNode != null && position < s.length()) { char ch = s.charAt(position++); do { String nodeString = lastNode.getString(store.stringStore); char first = nodeString.charAt(0); if (ch == first) { // now check the rest of the string for (int i = 1; i < nodeString.length(); ++i) { char other = nodeString.charAt(i); if (other != s.charAt(position++)) { return DONE; } } // if we succeed, return result if there is one int result = lastNode.result; lastNode = lastNode.next; if (result != NOT_FOUND) return result; break; // get next char } // otherwise branch sideways, keeping same char if (ch > first) { lastNode = lastNode.greater; } else { lastNode = lastNode.less; } } while (lastNode != null); } return DONE; } } public Matcher getMatcher() { Matcher result = new Matcher(); result.store = this; return result; } public void showNodes() { showNodes2(base, "", 5); } public void showNodes2(Node n, String path, int depth) { if (n.less != null) { showNodes2(n.less, path+"-", depth); } dp.print("", depth); if (false) dp.print(path); dp.print(n.getString(stringStore)); if (n.result != NOT_FOUND) dp.print("/" + n.result); dp.println(); if (n.next != null) { showNodes2(n.next, path+".", depth+n.getString(stringStore).length()); } if (n.greater != null) { showNodes2(n.greater, path+"+", depth); } } static class NodeInfo { int nodeCount; int resultCount; int nullLessCount; int nullGreaterCount; int nullSimpleCount; int nullNextCount; } public void checkNodes() { NodeInfo nodeInfo = new NodeInfo(); checkNodes(base, nodeInfo); System.out.println("Nodes: " + nodeInfo.nodeCount); System.out.println("nullLessCount: " + nodeInfo.nullLessCount); System.out.println("nullGreaterCount: " + nodeInfo.nullGreaterCount); System.out.println("nullNextCount: " + nodeInfo.nullNextCount); System.out.println("resultCount: " + nodeInfo.resultCount); System.out.println("nullSimpleCount: " + nodeInfo.nullSimpleCount); } public void checkNodes(Node n, NodeInfo nodeInfo) { nodeInfo.nodeCount++; if (n.result != NOT_FOUND) nodeInfo.resultCount++; if (n.less != null) { checkNodes(n.less, nodeInfo); } else { nodeInfo.nullLessCount++; if (n.greater == null && n.result == NOT_FOUND) nodeInfo.nullSimpleCount++; } if (n.next != null) { checkNodes(n.next, nodeInfo); } else { nodeInfo.nullNextCount++; } if (n.greater != null) { checkNodes(n.greater, nodeInfo); } else { nodeInfo.nullGreaterCount++; } } final static class DepthPrinter { private PrintWriter pw; private int currentDepth = 0; private String leader = "."; DepthPrinter(PrintWriter pw) { this.pw = pw; } void print(char ch) { print(ch, 0); } void print(String s) { print(s, 0); } void print(char ch, int depth) { print(String.valueOf(ch), depth); } void print(String s, int depth) { int delta = depth - currentDepth; if (delta > 0) { pw.print(Utility.repeat(leader, delta - 1)); currentDepth = depth; } pw.print(s); currentDepth += s.length(); } void println() { pw.println(); currentDepth = 0; } void println(String s) { pw.print(s); pw.println(); currentDepth = 0; } } final static class StringStore { // initially, there is a simple strategy private String buffer = ""; private static final char TERMINATOR = '\u007E'; private static final int PIECE_LENGTH = 5; private static String[] pieces = new String[50]; // HACK private static Set strings = new HashSet(); public void add(String s) { strings.add(s); } public void compact() { System.out.println("Adding Pieces"); // add all the pieces Iterator it = strings.iterator(); Set additions = new HashSet(); while (it.hasNext()) { String s = (String)it.next(); int len = Utility.split(s, ' ', pieces); for (int i = 0; i < len; ++i) { additions.add(pieces[i]); } } store(additions); store(strings); } private void store(Set stuff) { System.out.println("Sorting"); // sort them by length, longest first Set ordered = new TreeSet(); Iterator it = stuff.iterator(); while (it.hasNext()) { String s = (String)it.next(); ordered.add(new Pair(new Integer(-s.length()), s)); } System.out.println("Storing"); // add them it = ordered.iterator(); while (it.hasNext()) { String s = (String)(((Pair)it.next()).second); get(s); } } private int get(String s) { System.out.println("Adding: \'" + s + "\'"); int index; if (s.indexOf(' ') < 0) { index = addNoSplit(s); System.out.println("\tReturning: " + index); return index; } int len = Utility.split(s, ' ', pieces); StringBuffer itemCodes = new StringBuffer(); for (int i = 0; i < len; ++i) { String piece = pieces[i]; itemCodes.append((char)addNoSplit(piece)); /*for (int j = 0; j < piece.length(); j += PIECE_LENGTH) { int maxLen = j + PIECE_LENGTH; if (maxLen > piece.length()) maxLen = piece.length(); itemCodes.append((char)addNoSplit(piece.substring(j, maxLen))); }*/ } index = 0x8000 | addNoSplit(itemCodes.toString()); // mark it as composite System.out.println("\tReturning: " + index); return index; } private int addNoSplit(String s) { System.out.println("\tAdding2: \'" + s + "\'"); String sTerm = s + TERMINATOR; int index = buffer.indexOf(sTerm); if (index >= 0) return index; index = buffer.length(); buffer += sTerm; System.out.println("\t\tReturning2: " + index); return index; } public String get(int index) { String result; System.out.println("Fetching: " + index); if ((index & 0x8000) == 0) { int end = buffer.indexOf(TERMINATOR, index); result = buffer.substring(index, end); System.out.println("\tReturning: '" + result + "'"); return result; } index &= ~0x8000; // remove 1 bit int end = buffer.indexOf(TERMINATOR, index); result = ""; for (int i = index; i < end; ++i) { if (result.length() != 0) result += " "; result += get(buffer.charAt(i)); } System.out.println("\tReturning: '" + result + "'"); return result; } public String toString() { return buffer; } } final static class Builder { Map map = new TreeMap(); String[] names; TernaryStore store; Set set = new TreeSet(); public void add(String name, int result) { map.put(name, new Integer(result)); } public TernaryStore build() { // flatten strings into array names = new String[map.size()]; Iterator it = map.keySet().iterator(); int count = 0; while (it.hasNext()) { names[count++] = (String) it.next(); if (false) { dp.print((count-1) + " " + names[count-1]); dp.println(); } } // build nodes store = new TernaryStore(); addNode(0, names.length); // free storage names = null; map.clear(); System.out.println("compacting"); compactStore(store.base); store.stringStore.compact(); //compactStrings(store); //set.clear(); // free more storage replaceStrings(store.base); //map.clear(); // free storage // free storage TernaryStore result = store; store = null; return result; } /* void compactStrings(TernaryStore t) { // we have a set of Pairs, first is length, second is string // compact them, word by word Iterator it = set.iterator(); while (it.hasNext()) { String string = ((String)((Pair)it.next()).second); int index = t.stringStore.add(string); if (true) { System.out.println("Checking: " + index); String reverse = t.stringStore.get(index); if (!reverse.equals(string)) { System.out.println("source: \'" + string + "\'"); System.out.println("reverse: \'" + reverse + "\'"); throw new IllegalArgumentException("Failed roundtrip"); } } map.put(string, new Integer(index)); } } */ public void replaceStrings(Node n) { n.stringCode = store.stringStore.get(n.getString(store.stringStore)); n.setString(null); if (n.less != null) replaceStrings(n.less); if (n.next != null) replaceStrings(n.next); if (n.greater != null) replaceStrings(n.greater); } public void compactStore(Node n) { Node nextNode = n.next; if (false) dp.println(n.toString()); while (n.result == NOT_FOUND && nextNode != null && nextNode.greater == null && nextNode.less == null) { n.setString(n.getString(store.stringStore) + nextNode.getString(store.stringStore)); n.result = nextNode.result; n.next = nextNode = nextNode.next; // remove old node } // add strings sorted by length, longest first store.stringStore.add(n.getString(store.stringStore)); if (n.less != null) compactStore(n.less); if (n.next != null) compactStore(n.next); if (n.greater != null) compactStore(n.greater); } private void addNode(int start, int limit) { if (start >= limit) return; int mid = (start + limit) / 2; //System.out.println("start: " + start + ", mid: " + mid + ", limit: " + limit); //System.out.println("adding: " + names[mid]); addNode(names[mid], ((Integer)map.get(names[mid])).intValue()); addNode(start, mid); addNode(mid+1, limit); } private void addNode(String s, int result) { if (store.base == null) { store.base = addRest(s, 0, result); return; } Node n = store.base; Node lastNode = n; for (int i = 0; i < s.length(); ++i) { char ch = s.charAt(i); while (true) { char first = n.getString(store.stringStore).charAt(0); if (ch == first) { if (n.next == null) { n.next = addRest(s, i+1, result); return; } lastNode = n; n = n.next; break; // get next char } // otherwise branch sideways, keeping same char if (ch > first) { if (n.greater == null) { n.greater = addRest(s, i, result); return; } n = n.greater; } else { if (n.less == null) { n.less = addRest(s, i, result); return; } n = n.less; } } } lastNode.result = result; } private Node addRest(String s, int position, int result) { Node lastNode = null; for (int i = s.length() - 1; i >= position; --i) { Node n = new Node(); n.setString(s.substring(i, i+1)); // + "" to force a new string if (lastNode == null) { n.result = result; } n.next = lastNode; lastNode = n; } return lastNode; } } }