scuffed-code/tools/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java
Mark Davis 7ca61b13cc utf-8 change
X-SVN-Rev: 15005
2004-04-17 18:21:39 +00:00

777 lines
28 KiB
Java
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
*******************************************************************************
* Copyright (C) 1996-2001, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*
* $Source: /xsrl/Nsvn/icu/unicodetools/com/ibm/text/UCD/GenerateLineBreakTest.java,v $
* $Date: 2004/04/17 18:21:39 $
* $Revision: 1.5 $
*
*******************************************************************************
*/
package com.ibm.text.UCD;
import java.util.*;
import java.io.*;
import com.ibm.text.utility.*;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
public class GenerateLineBreakTest implements UCD_Types {
// COMMON STUFF for Hangul
// Hangul type codes returned by getHangulType(). hNot means "not Hangul";
// hL..hLVT are also the indexes into hNames; hLIMIT is one past the last real code.
static final byte hNot = -1;
static final byte hL = 0;
static final byte hV = 1;
static final byte hT = 2;
static final byte hLV = 3;
static final byte hLVT = 4;
static final byte hLIMIT = 5;
// Display names for the codes hL..hLVT, in code order.
static final String[] hNames = new String[] {"L", "V", "T", "LV", "LVT"};
/**
 * Classifies a code point by Hangul type using the default UCD.
 *
 * @param cp the code point to classify
 * @return hL, hV or hT for leading, vowel or trailing jamo; for a Hangul
 *         syllable, hLV when isDoubleHangul(cp) is true and hLVT otherwise;
 *         hNot for everything else
 */
static byte getHangulType(int cp) {
    if (Default.ucd().isLeadingJamo(cp)) {
        return hL;
    } else if (Default.ucd().isVowelJamo(cp)) {
        return hV;
    } else if (Default.ucd().isTrailingJamo(cp)) {
        return hT;
    } else if (!Default.ucd().isHangulSyllable(cp)) {
        return hNot;
    }
    // only precomposed syllables get here
    return Default.ucd().isDoubleHangul(cp) ? hLV : hLVT;
}
//============================
// Label of the rule that decided the most recent isBreak() call; isBreak() sets it as a side effect.
protected String rule;
// Prefix of the generated file names (see run()); the word-break subclass changes it.
protected String fileName = "Line";
// all the other items are supplied in UCD_TYPES
// Extra type codes appended after the regular line-break classes:
// one per Hangul type, then LB_SUP, which getType() returns for code points above U+FFFF.
static byte LB_L = LB_LIMIT + hL;
static byte LB_V = LB_LIMIT + hV;
static byte LB_T = LB_LIMIT + hT;
static byte LB_LV = LB_LIMIT + hLV;
static byte LB_LVT = LB_LIMIT + hLVT;
static byte LB_SUP = LB_LIMIT + hLIMIT;
// One past the last extended type code.
static byte LB2_LIMIT = (byte)(LB_SUP + 1);
// Sample string for each type code, indexed by the type byte; filled in by findSamples().
String[] samples = new String[100];
// Order in which the types are walked when the table and the test files are generated.
byte[] TypeOrder = new byte[] {
    LB_OP, LB_CL, LB_QU, LB_GL, LB_NS,
    LB_EX, LB_SY, LB_IS, LB_PR, LB_PO,
    LB_NU, LB_AL, LB_ID, LB_IN, LB_HY,
    LB_BA, LB_BB, LB_B2, LB_ZW, LB_CM,
    // missing from Pair Table
    LB_SP, LB_BK, LB_CR, LB_LF,
    // resolved types below
    LB_CB, LB_AI, LB_SA, LB_SG, LB_XX,
    // 3 JAMO CLASSES, plus supplementary
    LB_L, LB_V, LB_T, LB_LV, LB_LVT, LB_SUP
};
/**
 * Entry point: runs the line-break generator, then the word-break generator.
 *
 * @param args ignored
 * @throws IOException if either generator fails to write its output
 */
public static void main(String[] args) throws IOException {
    GenerateLineBreakTest lineGenerator = new GenerateLineBreakTest();
    lineGenerator.run();
    GenerateLineBreakTest wordGenerator = new GenerateWordBreakTest();
    wordGenerator.run();
}
// stuff that subclasses need to override
/**
 * Produces all output files for this break type.
 * <ol>
 * <li>Collects one sample per type (findSamples()).</li>
 * <li>Writes <i>fileName</i>BreakTest.html with the "current" and the
 *     "recommended" pair tables.</li>
 * <li>Writes <i>fileName</i>Test_SHORT.txt (no comments) and
 *     <i>fileName</i>Test.txt (comment on the first item of every pair),
 *     each containing the strings from genTestItems() for every ordered
 *     pair of types in TypeOrder, skipping LB_SG.</li>
 * </ol>
 * Every writer is closed in a finally block so that a failure while
 * generating does not leak the open file.
 *
 * @throws IOException if an output file cannot be opened
 */
public void run() throws IOException {
    findSamples();

    PrintWriter out = Utility.openPrintWriter(fileName + "BreakTest.html", Utility.UTF8_WINDOWS);
    try {
        out.println("<html><head><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><title>"
            + fileName + "</title></head>");
        out.println("<body bgcolor='#FFFFFF'><h3>Current (fixed only for consistency):</h3>");
        generateTable(out, false);
        out.println("<h3>Recommended:</h3>");
        generateTable(out, true);
        out.println("</body></html>");
    } finally {
        out.close();
    }

    String[] testCase = new String[50];
    // do main test: k == 0 is the short file, k == 1 the commented one
    for (int k = 0; k < 2; ++k) {
        out = Utility.openPrintWriter(fileName + (k == 0 ? "Test_SHORT.txt" : "Test.txt"), Utility.LATIN1_WINDOWS);
        try {
            int counter = 0;
            printTestFileHeader(out);
            for (int ii = 0; ii < getLimit(); ++ii) {
                int i = TypeOrder[ii];
                if (i == LB_SG) continue;
                String before = samples[i];
                for (int jj = 0; jj < getLimit(); ++jj) {
                    Utility.dot(counter);
                    int j = TypeOrder[jj];
                    if (j == LB_SG) continue;
                    String after = samples[j];
                    // do line straight
                    int len = genTestItems(before, after, testCase);
                    for (int q = 0; q < len; ++q) {
                        // only the first string of each pair is commented, and only in the full file
                        printLine(out, testCase[q], k != 0 && q == 0, false);
                        ++counter;
                    }
                }
            }
            out.println("# Lines: " + counter);
        } finally {
            out.close();
        }
    }
}

// Writes the explanatory "#" comment header that starts each generated test file.
private void printTestFileHeader(PrintWriter out) {
    out.println("# Default " + fileName + " Break Test");
    out.println("# Generated: " + Default.getDate() + ", MED");
    out.println("#");
    out.println("# Format:");
    out.println("# <string> (# <comment>)? ");
    out.println("# <string> contains hex Unicode code points, with ");
    out.println("#\t" + BREAK + " wherever there is a break opportunity, and ");
    out.println("#\t" + NOBREAK + " wherever there is not.");
    out.println("# <comment> the format can change, but currently it shows:");
    out.println("#\t- the sample character name");
    out.println("#\t- (x) the line_break property* for the sample character");
    out.println("#\t- [x] the rule that determines whether there is a break or not");
    out.println("#");
    out.println("# Samples:");
    out.println("# The test currently takes all pairs of linebreak types*,");
    out.println("# picks a sample for each type, and generates three strings: ");
    out.println("#\t- the pair alone");
    out.println("#\t- the pair alone with an imbeded space");
    out.println("#\t- the pair alone with embedded combining marks");
    out.println("# The sample for each type is simply the first code point (above NULL)");
    out.println("# with that property.");
    out.println("# * Note:");
    out.println("#\t- SG is omitted");
    out.println("#\t- 3 different Jamo characters and a supplementary character are added");
    out.println("#\t The syllable types for the Jamo (L, V, T) are displayed in comments");
    out.println("#\t instead of the linebreak property");
    out.println("# These samples may be extended in the future.");
    out.println("#");
}
// stuff that subclasses need to override
/**
 * Builds the test strings for one ordered pair of samples: the pair alone,
 * the pair separated by a space, and the pair separated by U+0301 U+0308.
 *
 * @param before  sample placed first
 * @param after   sample placed last
 * @param results receives the generated strings starting at index 0
 * @return the number of strings stored in results (3)
 */
public int genTestItems(String before, String after, String[] results) {
    int count = 0;
    results[count++] = before + after;
    results[count++] = before + " " + after;
    results[count++] = before + "\u0301\u0308" + after;
    return count;
}
// stuff that subclasses need to override
/**
 * Tells generateTable() which types to leave out of the pair table.
 *
 * @param type a type code
 * @return true for LB_AI, LB_SA, LB_SG and LB_XX, false otherwise
 */
boolean skipType(byte type) {
    if (type == LB_AI) {
        return true;
    }
    if (type == LB_SA) {
        return true;
    }
    if (type == LB_SG) {
        return true;
    }
    return type == LB_XX;
}
// stuff that subclasses need to override
/**
 * Returns the short display name of the type of a code point.
 *
 * @param cp the code point
 * @return "SUP" for LB_SUP, the matching hNames entry for the Hangul
 *         pseudo-types, otherwise the line-break ID reported by the UCD
 */
public String getTypeID(int cp) {
    final byte type = getType(cp);
    if (type == LB_SUP) {
        return "SUP";
    }
    if (type < LB_LIMIT) {
        return Default.ucd().getLineBreakID_fromIndex(type);
    }
    // Hangul pseudo-types sit directly after the regular classes
    return hNames[type - LB_LIMIT];
}
// stuff that subclasses need to override
/**
 * Returns the (extended) line-break type of a code point.
 *
 * @param cp the code point
 * @return LB_SUP for anything above U+FFFF; LB_LIMIT plus the Hangul type
 *         for Hangul characters; otherwise the UCD line-break value
 */
public byte getType(int cp) {
    if (cp > 0xFFFF) {
        return LB_SUP;
    }
    final byte hangulType = getHangulType(cp);
    if (hangulType == hNot) {
        return Default.ucd().getLineBreak(cp);
    }
    return (byte)(hangulType + LB_LIMIT);
}
/**
 * Number of TypeOrder entries that run() walks when generating the test files.
 *
 * @return LB2_LIMIT
 */
public int getLimit() {
    final int limit = LB2_LIMIT;
    return limit;
}
/**
 * Number of TypeOrder entries that generateTable() walks.
 *
 * @return LB_SUP, which is one less than getLimit() and so leaves out the last entry
 */
public int getTableLimit() {
    final int tableLimit = LB_SUP; // skip last;
    return tableLimit;
}
/**
 * Writes the pair table as an HTML table: one header row and one row per
 * non-skipped type, each cell holding the getTableEntry() result for
 * (row sample, column sample) with the deciding rule as the cell title.
 * A cell is coloured when its entry differs from the entry computed with
 * the opposite "recommended" setting (#CCFFFF if the entry equals NOBREAK,
 * #FFFF00 otherwise), or #CCCCFF when both agree on NOBREAK.
 *
 * @param out         destination writer
 * @param recommended which rule variant the table shows
 */
public void generateTable(PrintWriter out, boolean recommended) {
    final int tableLimit = getTableLimit();
    final String width = "width='" + (100 / (tableLimit + 1)) + "%'";
    out.print("<table border='1' cellspacing='0'><tr><th " + width + "></th>");

    // header row
    for (int col = 0; col < tableLimit; ++col) {
        final byte colType = TypeOrder[col];
        if (!skipType(colType)) {
            out.print("<th " + width + ">" + getTypeID(samples[colType]) + "</th>");
        }
    }
    out.print("</tr>");

    final String[] shownRule = new String[1];
    final String[] otherRule = new String[1];
    for (int row = 0; row < tableLimit; ++row) {
        final byte rowType = TypeOrder[row];
        if (skipType(rowType)) {
            continue;
        }
        final String before = samples[rowType];
        final StringBuffer line = new StringBuffer("<tr><th>");
        line.append(getTypeID(before)).append("</th>");
        for (int col = 0; col < tableLimit; ++col) {
            final byte colType = TypeOrder[col];
            if (skipType(colType)) {
                continue;
            }
            final String after = samples[colType];
            final String entry = getTableEntry(before, after, recommended, shownRule);
            final String otherEntry = getTableEntry(before, after, !recommended, otherRule);
            String background;
            if (!entry.equals(otherEntry)) {
                background = entry.equals(NOBREAK) ? " bgcolor='#CCFFFF'" : " bgcolor='#FFFF00'";
            } else if (entry.equals(NOBREAK)) {
                background = " bgcolor='#CCCCFF'";
            } else {
                background = "";
            }
            line.append("<th title='").append(shownRule[0]).append("'")
                .append(background).append(">").append(entry).append("</th>");
        }
        out.println(line.append("</tr>").toString());
    }
    out.println("</table>");
}
/**
 * Computes the pair-table symbol for "before" followed by "after".
 * <ul>
 * <li>"_" : a break is allowed directly between the two;</li>
 * <li>"%" : no direct break, but a break is reported at one side of an
 *     inserted space;</li>
 * <li>"^" : no break directly, nor before or after an inserted space.</li>
 * </ul>
 * Leaves the field "rule" holding the same text that is stored in ruleOut[0].
 *
 * @param before      first sample
 * @param after       second sample
 * @param recommended rule variant passed through to isBreak()
 * @param ruleOut     ruleOut[0] receives the rule label(s) behind the result
 * @return one of "_", "%", "^"
 */
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
    final String spaced = before + " " + after;
    final int boundary = before.length();

    // isBreak() overwrites "rule" on every call, so capture it each time
    final boolean breakAfterSpace = isBreak(spaced, boundary + 1, recommended);
    final String ruleAfterSpace = rule;
    final boolean breakBeforeSpace = isBreak(spaced, boundary, recommended);
    final String ruleBeforeSpace = rule;
    final boolean directBreak = isBreak(before + after, boundary, recommended);
    final String directRule = rule;

    String symbol;
    if (directBreak) {
        symbol = "_";
    } else if (breakAfterSpace || breakBeforeSpace) {
        symbol = "%";
        rule = directRule;
    } else {
        symbol = "^";
        String combined = ruleAfterSpace.equals(directRule)
            ? directRule
            : ruleAfterSpace + "/" + directRule;
        if (!ruleBeforeSpace.equals(directRule) && !ruleBeforeSpace.equals(ruleAfterSpace)) {
            combined += "/" + ruleBeforeSpace;
        }
        rule = combined;
    }
    ruleOut[0] = rule;
    return symbol;
}
// Marker written to the test files where a break is allowed: U+00F7 DIVISION SIGN.
static final String BREAK = "\u00F7";
// Marker written to the test files where no break is allowed: U+00D7 MULTIPLICATION SIGN.
static final String NOBREAK = "\u00D7";
/**
 * Writes one test line: the break status at offset 0, then for every code
 * point of source its hex value followed by the break status after it.
 * When comments is true a tab-introduced "#" comment is appended that
 * repeats the statuses with the deciding rule in brackets and gives each
 * character's name and type ID.
 *
 * @param out         destination writer
 * @param source      string whose break positions are listed
 * @param comments    whether to append the explanatory comment
 * @param recommended rule variant passed through to isBreak()
 */
public void printLine(PrintWriter out, String source, boolean comments, boolean recommended) {
    final StringBuffer codes = new StringBuffer();
    final StringBuffer notes = new StringBuffer("\t# ");

    String mark = isBreak(source, 0, recommended) ? BREAK : NOBREAK;
    codes.append(mark);
    notes.append(' ').append(mark).append(" [").append(rule).append(']');

    int offset = 0;
    while (offset < source.length()) {
        final int cp = UTF16.charAt(source, offset);
        final int width = UTF16.getCharCount(cp);
        // both buffers already hold the leading status, so a separator is always needed
        codes.append(' ');
        notes.append(' ');
        codes.append(Utility.hex(cp));
        notes.append(Default.ucd().getName(cp) + " (" + getTypeID(cp) + ")");
        mark = isBreak(source, offset + width, recommended) ? BREAK : NOBREAK;
        codes.append(' ').append(mark);
        notes.append(' ').append(mark).append(" [").append(rule).append(']');
        offset += width;
    }
    if (comments) {
        codes.append(notes);
    }
    out.println(codes);
}
/**
 * Fills samples[] with the first allocated, non-surrogate code point
 * (starting at U+0001) of every type returned by getType(), then prints the
 * type ID and the code/name of the first TypeOrder.length sample slots to
 * System.out. Slots for which no code point was found stay null.
 */
public void findSamples() {
    for (int cp = 1; cp <= 0x10FFFF; ++cp) {
        if (!Default.ucd().isAllocated(cp)) continue;
        if (0xD800 <= cp && cp <= 0xDFFF) continue; // surrogates are never used as samples
        byte type = getType(cp);
        if (samples[type] == null) {
            samples[type] = UTF16.valueOf(cp);
        }
    }
    // listing of what was picked (indexed by type code, not by TypeOrder position)
    for (int i = 0; i < TypeOrder.length; ++i) {
        String sample = samples[i];
        System.out.println(getTypeID(sample) + ":\t" + Default.ucd().getCodeAndName(sample));
    }
}
/**
 * Returns the type IDs of all code points of a string, separated by spaces.
 *
 * @param s the string, may be null
 * @return "&lt;null&gt;" for null; for a one-char string the ID of that char;
 *         otherwise the space-separated IDs of each code point
 */
public String getTypeID(String s) {
    if (s == null) {
        return "<null>";
    }
    if (s.length() == 1) {
        return getTypeID(s.charAt(0));
    }
    final StringBuffer ids = new StringBuffer();
    int index = 0;
    while (index < s.length()) {
        final int cp = UTF32.char32At(s, index);
        if (index != 0) {
            ids.append(" ");
        }
        ids.append(getTypeID(cp));
        index += UTF32.count16(cp);
    }
    return ids.toString();
}
/**
 * Scans backwards from offset-1 for the nearest code point whose resolved
 * type differs from notLBType.
 *
 * @param source      text to scan
 * @param offset      scanning starts at the char just before this index
 * @param notLBType   the type to skip over
 * @param recommended rule variant passed to getResolvedType()
 * @return the char index at which that code point was read, or -1 if every
 *         preceding code point has type notLBType
 */
public int findLastNon(String source, int offset, byte notLBType, boolean recommended) {
    int index = offset - 1;
    while (index >= 0) {
        final int cp = UTF16.charAt(source, index);
        if (getResolvedType(cp, recommended) != notLBType) {
            return index;
        }
        index -= UTF16.getCharCount(cp);
    }
    return -1;
}
/**
 * LB 1: maps the raw type of a code point to the class the rules work with.
 * SA and XX become AL; AI, CB and SG are passed through unchanged. When
 * recommended is true every Hangul character (getHangulType() != hNot)
 * becomes ID.
 *
 * @param cp          the code point
 * @param recommended whether to apply the Hangul-to-ID mapping
 * @return the resolved type
 */
public byte getResolvedType (int cp, boolean recommended) {
    byte resolved = getType(cp);
    if (resolved == LB_SA || resolved == LB_XX) {
        resolved = LB_AL;
    }
    if (recommended && getHangulType(cp) != hNot) {
        resolved = LB_ID;
    }
    return resolved;
}
/**
 * Tests whether offset is a valid position between code points of s.
 *
 * @param s      UTF-16 text
 * @param offset index to test
 * @return false if offset is outside 0..s.length() or falls between a lead
 *         and a trail surrogate; true otherwise
 */
public boolean onCodepointBoundary(String s, int offset) {
    if (offset < 0 || offset > s.length()) {
        return false;
    }
    if (offset > 0 && offset < s.length()) {
        final boolean leadBefore = UTF16.isLeadSurrogate(s.charAt(offset-1));
        // a boundary is invalid only when it splits a surrogate pair
        return !(leadBefore && UTF16.isTrailSurrogate(s.charAt(offset)));
    }
    return true;
}
/**
 * Finds out whether a line break is allowed at offset in source.
 * <p>
 * WARNING: as a side effect, sets the field "rule" to the label of the rule
 * that decided the answer.
 * <p>
 * The rules are tried in order and the first one that matches wins. With
 * recommended == true the GL rule is applied before the "break after
 * spaces" rule (as 11b), breaks around CB are added (14a) and the HY/BB
 * break rule is moved after rule 18 (18b); with false the same rules run at
 * their original positions 13 and 15b and there is no CB rule.
 *
 * @param source      text to examine
 * @param offset      char index of the candidate break position
 * @param recommended selects the rule variant described above
 * @return true if a break is allowed at offset
 */
public boolean isBreak(String source, int offset, boolean recommended) {
    // LB 1 Assign a line break category to each character of the input.
    // Resolve AI, CB, SA, SG, XX into other line break classes depending on criteria outside this algorithm.
    // this is taken care of in the getResolvedType function

    // LB 2a Never break at the start of text
    rule="2a";
    if (offset <= 0) return false;

    // LB 2b Always break at the end of text
    rule="2b";
    if (offset >= source.length()) return true;

    // UTF-16: never break in the middle of a code point
    if (!onCodepointBoundary(source, offset)) return false;

    // now get the character before and after, and their types
    int cpBefore = UTF16.charAt(source, offset-1);
    int cpAfter = UTF16.charAt(source, offset);
    byte before = getResolvedType(cpBefore, recommended);
    byte after = getResolvedType(cpAfter, recommended);

    // LB 3a Always break after hard line breaks (but never between CR and LF).
    // CR ^ LF
    rule="3a";
    if (before == LB_CR && after == LB_LF) return false;
    if (before == LB_BK || before == LB_LF || before == LB_CR) return true;

    // LB 3b Don't break before hard line breaks.
    rule="3b";
    if (after == LB_BK || after == LB_LF || after == LB_CR) return false;

    // LB 4 Don't break before spaces or zero-width space.
    // × SP
    // × ZW
    rule="4";
    if (after == LB_SP || after == LB_ZW) return false;

    // LB 5 Break after zero-width space.
    // ZW ÷
    rule="5";
    if (before == LB_ZW) return true;

    // LB 6 Don't break graphemes (before combining marks, around virama or on sequences of conjoining Jamos).
    rule="6";
    if (after == LB_CM) return false;
    if (before == LB_L && (after == LB_L || after == LB_V || after == LB_LV || after == LB_LVT)) return false;
    if ((before == LB_LV || before == LB_V) && (after == LB_V || after == LB_T)) return false;
    if ((before == LB_LVT || before == LB_T) && (after == LB_T)) return false;

    // A combining mark takes the type of its base: look back past the CM run.
    // If there is no base at all, the run is treated as ID.
    boolean setBase = false;
    if (before == LB_CM) {
        setBase = true;
        int backOffset = findLastNon(source, offset, LB_CM, recommended);
        if (backOffset < 0) {
            before = LB_ID;
        } else {
            before = getResolvedType(UTF16.charAt(source, backOffset), recommended);
        }
    }

    // LB 7 In all of the following rules, if a space is the base character for a combining mark,
    // the space is changed to type ID. In other words, break before SP CM* in the same cases as
    // one would break before an ID.
    rule="7";
    if (setBase && before == LB_SP) before = LB_ID;

    // LB 8 Don't break before ] or ! or ; or /, even after spaces.
    // × CL, × EX, × IS, × SY
    rule="8";
    if (after == LB_CL || after == LB_EX || after == LB_SY || after == LB_IS) return false;

    // find the last non-space character; we will need it for the "SP*" rules below.
    // The scan must skip SP (not CM): skipping CM would stop at the space itself
    // and rules 9-11a could never see through intervening spaces.
    byte lastNonSpace = before;
    if (lastNonSpace == LB_SP) {
        int backOffset = findLastNon(source, offset, LB_SP, recommended);
        if (backOffset >= 0) {
            lastNonSpace = getResolvedType(UTF16.charAt(source, backOffset), recommended);
        }
    }

    // LB 9 Don't break after [, even after spaces.
    // OP SP* ×
    rule="9";
    if (lastNonSpace == LB_OP) return false;

    // LB 10 Don't break between a quotation mark and an opening bracket, even with intervening spaces.
    // QU SP* × OP
    rule="10";
    if (lastNonSpace == LB_QU && after == LB_OP) return false;

    // LB 11 Don't break between a closing bracket and a non-starter, even with intervening spaces.
    // CL SP* × NS
    rule="11";
    if (lastNonSpace == LB_CL && after == LB_NS) return false;

    // LB 11a Don't break between two B2 characters, even with intervening spaces.
    // B2 × B2
    rule="11a";
    if (lastNonSpace == LB_B2 && after == LB_B2) return false;

    if (recommended) {
        // LB 11b (recommended position of LB 13) Don't break before or after NBSP or WORD JOINER
        // × GL
        // GL ×
        rule="11b";
        if (after == LB_GL || before == LB_GL) return false;
    }

    // [Note: by this time, all of the "X" in the table are accounted for. We can safely break after spaces.]

    // LB 12 Break after spaces
    // SP ÷
    rule="12";
    if (before == LB_SP) return true;

    if (!recommended) {
        // LB 13 Don't break before or after NBSP or WORD JOINER
        // × GL
        // GL ×
        rule="13";
        if (after == LB_GL || before == LB_GL) return false;
    }

    // LB 14 Don't break before or after quotation marks
    // × QU
    // QU ×
    rule="14";
    if (before == LB_QU || after == LB_QU) return false;

    if (recommended) {
        // LB 14a Break before and after CB
        // CB ÷
        // ÷ CB
        // (labelled explicitly so the break is not reported under rule 14)
        rule="14a";
        if (before == LB_CB || after == LB_CB) return true;
    }

    // LB 15 Don't break before hyphen-minus, other hyphens, fixed-width spaces,
    // small kana and other non-starters, or after acute accents:
    // × BA
    // × HY
    // × NS
    // BB ×
    rule="15";
    if (after == LB_NS) return false;
    if (after == LB_HY) return false;
    if (after == LB_BA) return false;
    if (before == LB_BB) return false;

    if (!recommended) {
        // LB 15b Break after hyphen-minus, and before acute accents:
        // HY ÷
        // ÷ BB
        rule="15b";
        if (before == LB_HY) return true;
        if (after == LB_BB) return true;
    }

    // LB 16 Don't break between two ellipses, or between letters or numbers and ellipsis:
    // AL × IN
    // ID × IN
    // IN × IN
    // NU × IN
    // Examples: 9..., a..., H...
    rule="16";
    if ((before == LB_NU || before == LB_AL || before == LB_ID) && after == LB_IN) return false;
    if (before == LB_IN && after == LB_IN) return false;

    // Don't break alphanumerics.
    // LB 17 Don't break within a9, 3a, or H%
    // ID × PO
    // AL × NU
    // NU × AL
    // Numbers are of the form PR ? ( OP | HY ) ? NU (NU | IS) * CL ? PO ?
    // Examples: $(12.35) 2,1234 (12)¢ 12.54¢
    // This is approximated with the following rules. (Some cases already handled above,
    // like 9,, [9.)
    rule="17";
    if (before == LB_ID && after == LB_PO) return false;
    if (before == LB_AL && after == LB_NU) return false;
    if (before == LB_NU && after == LB_AL) return false;

    // LB 18 Don't break between the following pairs of classes.
    // CL × PO
    // HY × NU
    // IS × NU
    // NU × NU
    // NU × PO
    // PR × AL
    // PR × HY
    // PR × ID
    // PR × NU
    // PR × OP
    // SY × NU
    // Example pairs: $9, $[, $-, -9, /9, 99, ,9, 9% ]%
    rule="18";
    if (before == LB_CL && after == LB_PO) return false;
    if (before == LB_HY && after == LB_NU) return false;
    if (before == LB_IS && after == LB_NU) return false;
    if (before == LB_NU && after == LB_NU) return false;
    if (before == LB_NU && after == LB_PO) return false;
    if (before == LB_PR && after == LB_AL) return false;
    if (before == LB_PR && after == LB_HY) return false;
    if (before == LB_PR && after == LB_ID) return false;
    if (before == LB_PR && after == LB_NU) return false;
    if (before == LB_PR && after == LB_OP) return false;
    if (before == LB_SY && after == LB_NU) return false;

    if (recommended) {
        // LB 18b (recommended position of LB 15b) Break after hyphen-minus, and before acute accents:
        // HY ÷
        // ÷ BB
        rule="18b";
        if (before == LB_HY) return true;
        if (after == LB_BB) return true;
    }

    // LB 19 Don't break between alphabetics ("at")
    // AL × AL
    rule="19";
    if (before == LB_AL && after == LB_AL) return false;

    // LB 20 Break everywhere else
    // ALL ÷
    // ÷ ALL
    rule="20";
    return true;
}
static class GenerateWordBreakTest extends GenerateLineBreakTest {
// Type codes used by this generator. The first nine are the non-Hangul
// types and index into Names.
static final byte CR = 0;
static final byte LF = 1;
static final byte Control = 2;
static final byte Extend = 3;
static final byte Link = 4;
static final byte CGJ = 5;
static final byte Base = 6;
static final byte LetterBase = 7;
static final byte Other = 8;
static final byte oLIMIT = 9; // RESET THIS IF LIST ABOVE CHANGES!
// Hangul types follow directly after the non-Hangul ones.
static final byte L = oLIMIT + hL;
static final byte V = oLIMIT + hV;
static final byte T = oLIMIT + hT;
static final byte LV = oLIMIT + hLV;
static final byte LVT = oLIMIT + hLVT;
// One past the last type code.
static final byte LIMIT = LVT + 1;
// Display names for the codes CR..Other, in code order.
static final String[] Names = new String[] {
    "CR", "LF", "CTL", "Extend", "Link", "CGJ", "Base", "LetterBase", "Other"
};
// Property testers consulted by getType().
static UCDProperty extendProp = UnifiedBinaryProperty.make(DERIVED | GraphemeExtend);
static UCDProperty baseProp = UnifiedBinaryProperty.make(DERIVED | GraphemeBase);
static UCDProperty linkProp = UnifiedBinaryProperty.make(BINARY_PROPERTIES | GraphemeLink);
{
fileName = "Word";
TypeOrder = new byte[LIMIT];
for (byte i = 0; i < TypeOrder.length; ++i) {
TypeOrder[i] = i;
}
}
/**
 * No type is left out of the table for this generator.
 *
 * @param type a type code (ignored)
 * @return always false
 */
boolean skipType(byte type) {
    final boolean skip = false;
    return skip;
}
/**
 * Number of TypeOrder entries walked when generating the test files.
 *
 * @return LIMIT
 */
public int getLimit() {
    final int limit = LIMIT;
    return limit;
}
/**
 * Number of TypeOrder entries walked when generating the table; here the
 * same as getLimit().
 *
 * @return LIMIT
 */
public int getTableLimit() {
    final int tableLimit = LIMIT;
    return tableLimit;
}
// stuff that subclasses need to override
/**
 * Builds the single test string for a pair of samples: the two concatenated.
 *
 * @param before  sample placed first
 * @param after   sample placed last
 * @param results receives the string at index 0
 * @return the number of strings stored in results (1)
 */
public int genTestItems(String before, String after, String[] results) {
    final String pair = before + after;
    results[0] = pair;
    return 1;
}
/**
 * Computes the table cell for "before" directly followed by "after".
 *
 * @param before      first sample
 * @param after       second sample
 * @param recommended rule variant passed through to isBreak()
 * @param ruleOut     ruleOut[0] receives the label of the rule that decided
 * @return BREAK if a break is allowed between the two samples, else NOBREAK
 */
public String getTableEntry(String before, String after, boolean recommended, String[] ruleOut) {
    boolean normalBreak = isBreak(before + after, before.length(), recommended);
    // isBreak() has just set "rule" as a side effect
    ruleOut[0] = rule;
    return normalBreak ? BREAK : NOBREAK;
}
// stuff that subclasses need to override
/**
 * Returns the display name of the type of a code point.
 *
 * @param cp the code point
 * @return the Names entry for non-Hangul types, the hNames entry for Hangul types
 */
public String getTypeID(int cp) {
    final byte type = getType(cp);
    return type < oLIMIT ? Names[type] : hNames[type - oLIMIT];
}
// stuff that subclasses need to override
/**
 * Classifies a code point. The checks run in this order and the first match
 * wins: fixed single characters, Hangul type, general category, then the
 * link / extend / base properties.
 *
 * @param cp the code point
 * @return one of the type codes of this class
 */
public byte getType(int cp) {
    // single characters
    switch (cp) {
        case 0xA:
            return LF;
        case 0xD:
            return CR;
        case 0x034F:
            return CGJ;
        case 0x2028:
        case 0x2029:
            return Control;
        default:
            break;
    }

    // Hangul
    final byte hangulType = getHangulType(cp);
    if (hangulType != hNot) {
        return (byte)(hangulType + oLIMIT);
    }

    // category based
    final byte category = Default.ucd().getCategory(cp);
    if (category == Cc) {
        return Control;
    }
    if (category == Cf) {
        return Extend;
    }
    if ((LETTER_MASK & (1<<category)) != 0) {
        return LetterBase;
    }

    // other binary properties
    if (linkProp.hasValue(cp)) {
        return Link;
    }
    if (extendProp.hasValue(cp)) {
        return Extend;
    }
    return baseProp.hasValue(cp) ? Base : Other;
}
/**
 * No resolution step is applied here: the resolved type is the raw type.
 *
 * @param cp          the code point
 * @param recommended ignored
 * @return getType(cp)
 */
public byte getResolvedType(int cp, boolean recommended) {
    final byte type = getType(cp);
    return type;
}
/**
 * Finds out whether there is a boundary at offset in source under this
 * generator's rules. As a side effect, sets the field "rule" to the label of
 * the rule that decided the answer.
 * <p>
 * With recommended == true, anything followed by Link or CGJ is kept together
 * (still reported as rule 9); otherwise rules 10-13 handle CGJ and Link
 * individually.
 *
 * @param source      text to examine
 * @param offset      char index of the candidate boundary
 * @param recommended selects the rule variant described above
 * @return true if there is a boundary at offset
 */
public boolean isBreak(String source, int offset, boolean recommended) {
    rule="1";
    if (offset < 0 || offset > source.length()) {
        return false;
    }
    if (offset == 0) {
        return true;
    }

    rule = "2";
    if (offset == source.length()) {
        return true;
    }

    // UTF-16: never break in the middle of a code point
    if (!onCodepointBoundary(source, offset)) {
        return false;
    }

    // now get the character before and after, and their types
    final int cpBefore = UTF16.charAt(source, offset-1);
    final int cpAfter = UTF16.charAt(source, offset);
    final byte before = getResolvedType(cpBefore, recommended);
    final byte after = getResolvedType(cpAfter, recommended);

    // letters and every Hangul type; used by rules 10 and 12
    final boolean afterIsLetterLike = after == LetterBase
        || after == L || after == V || after == T || after == LV || after == LVT;

    rule = "3";
    if (before == CR && after == LF) {
        return false;
    }

    rule = "4";
    final boolean controlBefore = before == CR || before == LF || before == Control;
    final boolean controlAfter = after == Control || after == LF || after == CR;
    if (controlBefore || controlAfter) {
        return true;
    }

    // rules 6-8: keep conjoining Hangul sequences together
    rule = "6";
    if (before == L && (after == L || after == V || after == LV || after == LVT)) {
        return false;
    }
    rule = "7";
    if ((before == LV || before == V) && (after == V || after == T)) {
        return false;
    }
    rule = "8";
    if ((before == LVT || before == T) && after == T) {
        return false;
    }

    rule = "9";
    if (after == Extend) {
        return false;
    }

    if (recommended) {
        if (after == Link || after == CGJ) {
            return false;
        }
    } else {
        // Do not break around a CGJ.
        rule = "10";
        if (before == CGJ && (after == Base || afterIsLetterLike)) {
            return false;
        }
        rule = "11";
        if (after == CGJ) {
            return false;
        }

        // Do not break between linking characters and letters, or before linking characters.
        // This provides for Indic graphemes, where virama (halant) will link character clusters together.
        // Link Extend* × LetterBase (12)
        rule = "12";
        if (afterIsLetterLike) {
            final int backOffset = findLastNon(source, offset, Extend, recommended);
            if (backOffset >= 0
                    && getResolvedType(UTF16.charAt(source, backOffset), recommended) == Link) {
                return false;
            }
        }

        rule = "13";
        if (after == Link) {
            return false;
        }
    }

    // Otherwise break after all characters.
    rule = "14";
    return true;
}
}
}