/** ******************************************************************************* * Copyright (C) 1996-2001, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* * * $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/QuickTest.java,v $ * $Date: 2005/06/24 23:51:52 $ * $Revision: 1.6 $ * ******************************************************************************* */ package com.ibm.text.UCD; import java.util.*; import java.io.*; import com.ibm.icu.dev.test.util.BagFormatter; import com.ibm.icu.text.UTF16; import com.ibm.icu.text.UnicodeSet; import com.ibm.icu.text.UnicodeSetIterator; import com.ibm.text.utility.*; public class QuickTest implements UCD_Types { public static class Length { String title; int bytesPerCodeUnit; int longestCodePoint = -1; int longestLength = 0; UnicodeSet longestSet = new UnicodeSet(); Length(String title, int bytesPerCodeUnit) { this.title = title; this.bytesPerCodeUnit = bytesPerCodeUnit; } void add(int codePoint, int codeUnitLength) { if (codeUnitLength > longestLength) { longestCodePoint = codePoint; longestLength = codeUnitLength; longestSet.clear(); longestSet.add(codePoint); System.out.println(title + " \t(" + codeUnitLength*bytesPerCodeUnit + " bytes, " + codeUnitLength + " code units) \t" + Default.ucd().getCodeAndName(codePoint)); } else if (codeUnitLength == longestLength) { longestSet.add(codePoint); } } } public static void main(String[] args) throws IOException { getLengths("NFC", Default.nfc()); getLengths("NFD", Default.nfd()); getLengths("NFKC", Default.nfkc()); getLengths("NFKD", Default.nfkd()); System.out.println("Done"); } static final int skip = (1< 0) { UnicodeSetIterator it = new UnicodeSetIterator(common); it.next(); System.out.println("Common Exemplar: " + Default.ucd().getCodeAndName(it.codepoint)); } } static ByteArrayOutputStream utf8baos; static Writer utf8bw; static int getUTF8Length(String source) throws IOException { if (utf8bw == null) { utf8baos = new ByteArrayOutputStream(); utf8bw = new OutputStreamWriter(utf8baos, "UTF-8"); } utf8baos.reset(); utf8bw.write(source); utf8bw.flush(); return utf8baos.size(); } static final void test() { String test2 = "ab\u263ac"; StringTokenizer st = new StringTokenizer(test2, "\u263a"); try { while (true) { String s = st.nextToken(); System.out.println(s); } } catch (Exception e) { } StringReader r = new StringReader(test2); StreamTokenizer s = new StreamTokenizer(r); try { while (true) { int x = s.nextToken(); if (x == StreamTokenizer.TT_EOF) break; System.out.println(s.sval); } } catch (Exception e) { } String testString = "en-Arab-200-gaulish-a-abcd-def-x-abcd1234-12345678"; for (int i = testString.length() + 1; i > 0; --i) { String trunc = truncateValidLanguageTag(testString, i); System.out.println(i + "\t" + trunc + "\t" + trunc.length()); } } static String truncateValidLanguageTag(String tag, int limit) { if (tag.length() <= limit) return tag; // legit truncation point has - after, and two letters before do { if (tag.charAt(limit) == '-' && tag.charAt(limit-1) != '-' && tag.charAt(limit-2) != '-') break; } while (--limit > 2); return tag.substring(0,limit); } static final void test2() { UnicodeSet format = new UnicodeSet("[:Cf:]"); /* [4] NameStartChar := ":" | [A-Z] | "_" | [a-z] | [#xC0-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xEFFFF] [4a] NameChar := NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] */ UnicodeSet nameStartChar = new UnicodeSet("[\\: A-Z \\_ a-z" + "\\u00c0-\\u02FF \\u0370-\\u037D \\u037F-\\u1FFF" + "\\u200C-\\u200D \\u2070-\\u218F \\u2C00-\\u2FEF" + "\\u3001-\\uD7FF \\uF900-\\U000EFFFF]"); UnicodeSet nameChar = new UnicodeSet("[\\- \\. 0-9 \\u00B7 " + "\\u0300-\\u036F \\u203F-\\u2040]") .addAll(nameStartChar); UnicodeSet nameAll = new UnicodeSet(nameChar).addAll(nameStartChar); showSet("NameStartChar", nameStartChar); showDiffs("NameChar", nameChar, "NameStartChar", nameStartChar); UnicodeSet ID_Start = new UnicodeSet("[:ID_Start:]"); UnicodeSet ID_Continue = new UnicodeSet("[:ID_Continue:]").removeAll(format); UnicodeSet ID_All = new UnicodeSet(ID_Start).addAll(ID_Continue); showDiffs("ID_All", ID_All, "nameAll", nameAll); showDiffs("ID_Start", ID_Start, "nameStartChar", nameStartChar); UnicodeSet defaultIgnorable = UnifiedBinaryProperty.make(DERIVED | DefaultIgnorable).getSet(); UnicodeSet whitespace = UnifiedBinaryProperty.make(BINARY_PROPERTIES | White_space).getSet(); UnicodeSet notNFKC = new UnicodeSet(); UnicodeSet privateUse = new UnicodeSet(); UnicodeSet noncharacter = new UnicodeSet(); for (int i = 0; i <= 0x10FFFF; ++i) { if (!Default.ucd().isAllocated(i)) continue; if (!Default.nfkc().isNormalized(i)) notNFKC.add(i); if (Default.ucd().isNoncharacter(i)) noncharacter.add(i); if (Default.ucd().getCategory(i) == PRIVATE_USE) privateUse.add(i); } showSet("notNFKC in NameChar", new UnicodeSet(notNFKC).retainAll(nameChar)); showSet("notNFKC outside of NameChar", new UnicodeSet(notNFKC).removeAll(nameChar)); showSet("Whitespace in NameChar", new UnicodeSet(nameChar).retainAll(whitespace)); showSet("Whitespace not in NameChar", new UnicodeSet(whitespace).removeAll(nameChar)); showSet("Noncharacters in NameChar", new UnicodeSet(noncharacter).retainAll(noncharacter)); showSet("Noncharacters outside of NameChar", new UnicodeSet(noncharacter).removeAll(nameChar)); showSet("Format in NameChar", new UnicodeSet(nameChar).retainAll(format)); showSet("Other Default_Ignorables in NameChar", new UnicodeSet(defaultIgnorable).removeAll(format).retainAll(nameChar)); showSet("PrivateUse in NameChar", new UnicodeSet(defaultIgnorable).retainAll(privateUse)); UnicodeSet CID_Start = new UnicodeSet("[:ID_Start:]").removeAll(notNFKC); UnicodeSet CID_Continue = new UnicodeSet("[:ID_Continue:]") .removeAll(notNFKC).removeAll(format); UnicodeSet CID_Continue_extras = new UnicodeSet(CID_Continue).removeAll(CID_Start); showDiffs("NoK_ID_Start", CID_Start, "NameStartChar", nameStartChar); showDiffs("NoK_ID_Continue_Extras", CID_Continue_extras, "NameChar", nameChar); System.out.println("Removing canonical singletons"); } static void showDiffs(String title1, UnicodeSet set1, String title2, UnicodeSet set2) { showSet(title1 + " - " + title2, new UnicodeSet(set1).removeAll(set2)); } static void showSet(String title1, UnicodeSet set1) { System.out.println(); System.out.println(title1); if (set1.size() == 0) { System.out.println("\tNONE"); return; } System.out.println("\tCount:" + set1.size()); System.out.println("\tSet:" + set1.toPattern(true)); System.out.println("\tDetails:"); Utility.showSetNames("", set1, false, Default.ucd()); } }