From 2242d9195add3be0d784cbc4795106f4d6572183 Mon Sep 17 00:00:00 2001 From: Andy Heninger Date: Sun, 9 May 2004 18:38:43 +0000 Subject: [PATCH] ICU-3295 RBBI, more tests. X-SVN-Rev: 15223 --- .../icu/dev/test/rbbi/RBBITestExtended.java | 488 +++++++++++++++++- .../ibm/icu/text/RuleBasedBreakIterator.java | 20 +- 2 files changed, 480 insertions(+), 28 deletions(-) diff --git a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java index cce05bc903..e067dd21f2 100644 --- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java +++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java @@ -8,9 +8,18 @@ package com.ibm.icu.dev.test.rbbi; import com.ibm.icu.dev.test.TestFmwk; import com.ibm.icu.impl.ICUData; +import com.ibm.icu.impl.Utility; +import com.ibm.icu.text.BreakIterator; +import com.ibm.icu.text.RuleBasedBreakIterator; +import com.ibm.icu.lang.UCharacter; +import com.ibm.icu.lang.UProperty; +import com.ibm.icu.text.UTF16; import java.io.InputStream; import java.io.InputStreamReader; import java.io.IOException; +import java.util.Arrays; +import java.util.Locale; + /** * @author andy @@ -28,27 +37,470 @@ public class RBBITestExtended extends TestFmwk { public RBBITestExtended() { } - public void TestExtended() { - InputStreamReader isr = null; - try { - InputStream is = ICUData.getStream(RBBITestExtended.class, "rbbitst.txt"); - if (is == null) { - errln("Could not open test data file."); - return; + + +static class TestParams { + BreakIterator bi; + StringBuffer dataToBreak = new StringBuffer(); + int[] expectedBreaks = new int[1000]; + int[] srcLine = new int[1000]; + int[] srcCol = new int[1000]; +}; + + +public void TestExtended() { + + String rules; + TestParams tp = new TestParams(); + + + // + // Open and read the test data file. + // + InputStreamReader isr = null; + StringBuffer testFileBuf = new StringBuffer(); + try { + InputStream is = ICUData.getStream(RBBITestExtended.class, "rbbitst.txt"); + if (is == null) { + errln("Could not open test data file."); + return; + } + isr = new InputStreamReader(is, "UTF-8"); + int c; + int count = 0; + for (;;) { + c = isr.read(); + if (c < 0) { + break; } - isr = new InputStreamReader(is, "UTF-8"); - int c; - for (;;) { - c = isr.read(); - if (c < 0) { - break; + count++; + if (c==0xFEFF && count==1) { + // BOM in the test data file. Discard it. + continue; + } + + UTF16.append(testFileBuf, c); + } + + } catch (IOException e) { + errln(e.toString()); + return; + } + + String testString = testFileBuf.toString(); + + + final int PARSE_COMMENT = 1; + final int PARSE_TAG = 2; + final int PARSE_DATA = 3; + final int PARSE_NUM = 4; + + int parseState = PARSE_TAG; + + int savedState = PARSE_TAG; + + final char CH_LF = 0x0a; + final char CH_CR = 0x0d; + final char CH_HASH = 0x23; + /*static const UChar CH_PERIOD = 0x2e;*/ + final char CH_LT = 0x3c; + final char CH_GT = 0x3e; + final char CH_BACKSLASH = 0x5c; + final char CH_BULLET = 0x2022; + + int lineNum = 1; + int colStart = 0; + int column = 0; + int charIdx = 0; + int i; + + int tagValue = 0; // The numeric value of a tag. + int len = testString.length(); + + for (charIdx = 0; charIdx < len; ) { + int c = UTF16.charAt(testString, charIdx); + charIdx++; + if (c == CH_CR && charIdx", charIdx-1)) { + tp.bi = BreakIterator.getWordInstance(Locale.US); + charIdx += 5; + break; + } + if (testString.startsWith("", charIdx-1)) { + tp.bi = BreakIterator.getCharacterInstance(Locale.US); + charIdx += 5; + break; + } + if (testString.startsWith("", charIdx-1)) { + tp.bi = BreakIterator.getLineInstance(Locale.US); + charIdx += 5; + break; + } + if (testString.startsWith("", charIdx-1)) { + tp.bi = BreakIterator.getSentenceInstance(Locale.US); + charIdx += 5; + break; + } + if (testString.startsWith("", charIdx-1)) { + tp.bi = BreakIterator.getTitleInstance(Locale.US); + charIdx += 6; + break; + } + if (testString.startsWith("<data>", charIdx-1)) { + parseState = PARSE_DATA; + charIdx += 5; + tp.dataToBreak.setLength(0); + Arrays.fill(tp.expectedBreaks, 0); + Arrays.fill(tp.srcCol, 0); + Arrays.fill(tp.srcLine, 0); + break; + } + + errln("line" + lineNum + ": Tag expected in test file."); + return; + //parseState = PARSE_COMMENT; + //savedState = PARSE_DATA; + } + // break; // TODO: don't stop on errors + + case PARSE_DATA: + if (c == CH_BULLET) { + int breakIdx = tp.dataToBreak.length(); + tp.expectedBreaks[breakIdx] = -1; + tp.srcLine[breakIdx] = lineNum; + tp.srcCol[breakIdx] = column; + break; + } + + if (testString.startsWith("</data>", charIdx-1)) { + // Add final entry to mappings from break location to source file position. + // Need one extra because last break position returned is after the + // last char in the data, not at the last char. + int idx = tp.dataToBreak.length(); + tp.srcLine[idx] = lineNum; + tp.srcCol[idx] = column; + + parseState = PARSE_TAG; + charIdx += 7; + + // RUN THE TEST! + executeTest(tp); + break; + } + + if (testString.startsWith("\\N{", charIdx-1)) { + int nameEndIdx = testString.indexOf('}', charIdx); + if (nameEndIdx == -1) { + errln("foo"); // TODO: + } + // Named character, e.g. \N{COMBINING GRAVE ACCENT} + // Get the code point from the name and insert it into the test data. + String charName = testString.substring(charIdx+2, nameEndIdx); + c = UCharacter.getCharFromName(charName); + if (c == -1) { + errln("Error in named character in test file at line " + lineNum + + ", col " + column); + } else { + // Named code point was recognized. Insert it + // into the test data. + UTF16.append(tp.dataToBreak, c); + for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { + tp.srcLine[i] = lineNum; + tp.srcCol[i] = column; + } + + } + if (nameEndIdx > charIdx) { + charIdx = nameEndIdx+1; + } + break; + } + + if (testString.startsWith("<>", charIdx-1)) { + charIdx++; + int breakIdx = tp.dataToBreak.length(); + tp.expectedBreaks[breakIdx] = -1; + tp.srcLine[breakIdx] = lineNum; + tp.srcCol[breakIdx] = column; + break; + } + + if (c == CH_LT) { + tagValue = 0; + parseState = PARSE_NUM; + break; + } + + if (c == CH_HASH && column==3) { // TODO: why is column off so far? + parseState = PARSE_COMMENT; + savedState = PARSE_DATA; + break; + } + + if (c == CH_BACKSLASH) { + // Check for \ at end of line, a line continuation. + // Advance over (discard) the newline + int cp = UTF16.charAt(testString, charIdx); + if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) { + // We have a CR LF + // Need an extra increment of the input ptr to move over both of them + charIdx++; + } + if (cp == CH_LF || cp == CH_CR) { + lineNum++; + colStart = charIdx; + charIdx++; + break; + } + + // Let unescape handle the back slash. + int charIdxAr[] = new int[1]; + charIdxAr[0] = charIdx; + cp = Utility.unescapeAt(testString, charIdxAr); + if (cp != -1) { + // Escape sequence was recognized. Insert the char + // into the test data. + charIdx = charIdxAr[0]; + UTF16.append(tp.dataToBreak, cp); + for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { + tp.srcLine[i] = lineNum; + tp.srcCol[i] = column; + } + + break; + } + + + // Not a recognized backslash escape sequence. + // Take the next char as a literal. + // TODO: Should this be an error? + c = UTF16.charAt(testString,charIdx); + charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1); + } + + // Normal, non-escaped data char. + UTF16.append(tp.dataToBreak, c); + + // Save the mapping from offset in the data to line/column numbers in + // the original input file. Will be used for better error messages only. + // If there's an expected break before this char, the slot in the mapping + // vector will already be set for this char; don't overwrite it. + for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) { + tp.srcLine[i] = lineNum; + tp.srcCol[i] = column; + } + break; + + + case PARSE_NUM: + // We are parsing an expected numeric tag value, like <1234>, + // within a chunk of data. + if (UCharacter.isWhitespace(c)) { + break; + } + + if (c == CH_GT) { + // Finished the number. Add the info to the expected break data, + // and switch parse state back to doing plain data. + parseState = PARSE_DATA; + if (tagValue == 0) { + tagValue = -1; + } + int breakIdx = tp.dataToBreak.length(); + tp.expectedBreaks[breakIdx] = tagValue; + tp.srcLine[breakIdx] = lineNum; + tp.srcCol[breakIdx] = column; + break; + } + + if (UCharacter.isDigit(c)) { + tagValue = tagValue*10 + UCharacter.digit(c); + break; + } + + errln("Syntax Error in test file at line "+ lineNum +", col %d" + column); + return; + + // parseState = PARSE_COMMENT; // TODO: unreachable. Don't stop on errors. + // break; + } + + + + } +} + +void executeTest(TestParams t) { + int bp; + int prevBP; + int i; + + t.bi.setText(t.dataToBreak.toString()); + // + // Run the iterator forward + // + prevBP = -1; + for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) { + if (prevBP == bp) { + // Fail for lack of forward progress. + errln("Forward Iteration, no forward progress. Break Pos=" + bp + + " File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]); + break; + } + + // Check that there were we didn't miss an expected break between the last one + // and this one. + for (i=prevBP+1; i<bp; i++) { + if (t.expectedBreaks[i] != 0) { + int expected[] = {0, i}; + printStringBreaks(t.dataToBreak, expected, 2); + errln("Forward Iteration, break expected, but not found. Pos=" + i + + " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); + } + } + + // Check that the break we did find was expected + if (t.expectedBreaks[bp] == 0) { + int expected[] = {0, bp}; + printStringBreaks(t.dataToBreak, expected, 2); + errln("Forward Iteration, break found, but not expected. Pos=" + bp + + " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); + } else { + // The break was expected. + // Check that the {nnn} tag value is correct. + int expectedTagVal = t.expectedBreaks[bp]; + if (expectedTagVal == -1) { + expectedTagVal = 0; + } + int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus(); + if (rs != expectedTagVal) { + errln("Incorrect status for forward break. Pos= " + bp + + "File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp] + "\n" + + " Actual, Expected status = " + rs + ", " + expectedTagVal); + } + } + + + prevBP = bp; + } + + // Verify that there were no missed expected breaks after the last one found + for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) { + if (t.expectedBreaks[i] != 0) { + errln("Forward Iteration, break expected, but not found. Pos=" + i + + " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); + } + } + + // + // Run the iterator backwards, verify that the same breaks are found. + // + prevBP = t.dataToBreak.length()+2; // start with a phony value for the last break pos seen. + for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) { + if (prevBP == bp) { + // Fail for lack of progress. + errln("Reverse Iteration, no progress. Break Pos=" + bp + + "File line,col=" + t.srcLine[bp] + " " + t.srcCol[bp]); + break; + } + + // Check that there were we didn't miss an expected break between the last one + // and this one. (UVector returns zeros for index out of bounds.) + for (i=prevBP-1; i>bp; i--) { + if (t.expectedBreaks[i] != 0) { + errln("Reverse Itertion, break expected, but not found. Pos=" + i + + " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); + } + } + + // Check that the break we did find was expected + if (t.expectedBreaks[bp] == 0) { + errln("Reverse Itertion, break found, but not expected. Pos=" + bp + + " File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]); + } else { + // The break was expected. + // Check that the {nnn} tag value is correct. + int expectedTagVal = t.expectedBreaks[bp]; + if (expectedTagVal == -1) { + expectedTagVal = 0; + } + int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus(); + if (rs != expectedTagVal) { + errln("Incorrect status for reverse break. Pos= " + bp + + "File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp] + "\n" + + " Actual, Expected status = " + rs + ", " + expectedTagVal); + } + } + + prevBP = bp; + } + + // Verify that there were no missed breaks prior to the last one found + for (i=prevBP-1; i>=0; i--) { + if (t.expectedBreaks[i] != 0) { + errln("Forward Itertion, break expected, but not found. Pos=" + i + + " File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]); + } + } +} + + +void printStringBreaks(StringBuffer ustr, int expected[], + int expectedcount) +{ + String name; + System.out.println("code alpha extend alphanum type line name"); + int j; + for (j = 0; j < ustr.length(); j ++) { + if (expectedcount > 0) { + int k; + for (k = 0; k < expectedcount; k ++) { + if (j == expected[k]) { + System.out.println("------------------------------------------------ " + j); } - //System.out.print((char)c); } - - } catch (IOException e) { - errln(e.toString()); } - + int c = UTF16.charAt(ustr, j); + if (c > 0xffff) { + j ++; + } + name = UCharacter.getName(c); + System.out.println( UCharacter.isUAlphabetic(c) + " " + + UCharacter.hasBinaryProperty(c, UProperty.GRAPHEME_EXTEND) + " " + + UCharacter.isLetterOrDigit(c) + " " + + UCharacter.getPropertyValueName(UProperty.LINE_BREAK, + UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK), + UProperty.NameChoice.SHORT) + ); } } + + +} \ No newline at end of file diff --git a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java index 685deb7f4a..253bb50646 100755 --- a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java +++ b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java @@ -113,26 +113,26 @@ public class RuleBasedBreakIterator extends BreakIterator { /** Tag value for "words" that do not fit into any of other categories. * Includes spaces and most punctuation. */ - public static final int UBRK_WORD_NONE = 0; + public static final int WORD_NONE = 0; /** Upper bound for tags for uncategorized words. */ - public static final int UBRK_WORD_NONE_LIMIT = 100; + public static final int WORD_NONE_LIMIT = 100; /** Tag value for words that appear to be numbers, lower limit. */ - public static final int UBRK_WORD_NUMBER = 100; + public static final int WORD_NUMBER = 100; /** Tag value for words that appear to be numbers, upper limit. */ - public static final int UBRK_WORD_NUMBER_LIMIT = 200; + public static final int WORD_NUMBER_LIMIT = 200; /** Tag value for words that contain letters, excluding * hiragana, katakana or ideographic characters, lower limit. */ - public static final int UBRK_WORD_LETTER = 200; + public static final int WORD_LETTER = 200; /** Tag value for words containing letters, upper limit */ - public static final int UBRK_WORD_LETTER_LIMIT = 300; + public static final int WORD_LETTER_LIMIT = 300; /** Tag value for words containing kana characters, lower limit */ - public static final int UBRK_WORD_KANA = 300; + public static final int WORD_KANA = 300; /** Tag value for words containing kana characters, upper limit */ - public static final int UBRK_WORD_KANA_LIMIT = 400; + public static final int WORD_KANA_LIMIT = 400; /** Tag value for words containing ideographic characters, lower limit */ - public static final int UBRK_WORD_IDEO = 400; + public static final int WORD_IDEO = 400; /** Tag value for words containing ideographic characters, upper limit */ - public static final int UBRK_WORD_IDEO_LIMIT = 500; + public static final int WORD_IDEO_LIMIT = 500; //======================================================================= // BreakIterator overrides