ICU-3295 RBBI, more tests.

X-SVN-Rev: 15223
2004-05-09 18:38:43 +00:00 · 2004-05-09 18:38:43 +00:00 · 2242d9195a
commit 2242d9195a
parent b761d10bb0
2 changed files with 480 additions and 28 deletions
--- a/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
+++ b/icu4j/src/com/ibm/icu/dev/test/rbbi/RBBITestExtended.java
@ -8,9 +8,18 @@ package com.ibm.icu.dev.test.rbbi;

 import com.ibm.icu.dev.test.TestFmwk;
 import com.ibm.icu.impl.ICUData;
+import com.ibm.icu.impl.Utility;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UProperty;
+import com.ibm.icu.text.UTF16;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Locale;
+

 /**
 * @author andy
@ -28,27 +37,470 @@ public class RBBITestExtended extends TestFmwk {
 	public RBBITestExtended() { 
 	}

-	public void TestExtended() {
-		InputStreamReader isr = null;
-		try {
-			InputStream is = ICUData.getStream(RBBITestExtended.class, "rbbitst.txt");
-            if (is == null) {
-                errln("Could not open test data file.");
-                return;
+
+
+static class TestParams {
+    BreakIterator   bi;
+    StringBuffer    dataToBreak    = new StringBuffer();
+    int[]           expectedBreaks = new int[1000];
+    int[]           srcLine        = new int[1000];
+    int[]           srcCol         = new int[1000];
+};
+
+
+public void TestExtended() {
+
+    String             rules;
+    TestParams     tp = new TestParams();
+
+
+    //
+    //  Open and read the test data file.
+    //
+    InputStreamReader isr = null;
+    StringBuffer  testFileBuf = new StringBuffer();
+    try {
+        InputStream is = ICUData.getStream(RBBITestExtended.class, "rbbitst.txt");
+        if (is == null) {
+            errln("Could not open test data file.");
+            return;
+        }
+        isr = new InputStreamReader(is, "UTF-8");           
+        int c;
+        int count = 0;
+        for (;;) {
+            c = isr.read();
+            if (c < 0) {
+                break;
            }
-			isr = new InputStreamReader(is, "UTF-8");			
-			int c;
-			for (;;) {
-				c = isr.read();
-				if (c < 0) {
-					break;
+            count++;
+            if (c==0xFEFF && count==1) {
+               // BOM in the test data file.  Discard it.
+               continue;
+            }
+           
+            UTF16.append(testFileBuf, c);
+        }
+        
+    } catch (IOException e) {
+        errln(e.toString());
+        return;
+    }
+    
+    String testString = testFileBuf.toString();
+    
+
+    final int  PARSE_COMMENT = 1;
+    final int  PARSE_TAG     = 2;
+    final int  PARSE_DATA    = 3;
+    final int  PARSE_NUM     = 4;
+
+    int parseState = PARSE_TAG;
+
+    int savedState = PARSE_TAG;
+
+    final char CH_LF        = 0x0a;
+    final char CH_CR        = 0x0d;
+    final char CH_HASH      = 0x23;
+    /*static const UChar CH_PERIOD    = 0x2e;*/
+    final char CH_LT        = 0x3c;
+    final char CH_GT        = 0x3e;
+    final char CH_BACKSLASH = 0x5c;
+    final char CH_BULLET    = 0x2022;
+
+    int    lineNum  = 1;
+    int    colStart = 0;
+    int    column   = 0;
+    int    charIdx  = 0;
+    int    i;
+
+    int    tagValue = 0;       // The numeric value of a <nnn> tag.
+    int    len = testString.length();
+
+    for (charIdx = 0; charIdx < len; ) {
+        int  c = UTF16.charAt(testString, charIdx);
+        charIdx++;
+        if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
+            // treat CRLF as a unit
+            c = CH_LF;
+            charIdx++;
+        }
+        if (c == CH_LF || c == CH_CR) {
+            lineNum++;
+            colStart = charIdx;
+        }
+        column = charIdx - colStart + 1;
+
+        switch (parseState) {
+        case PARSE_COMMENT:
+            if (c == 0x0a || c == 0x0d) {
+                parseState = savedState;
+            }
+            break;
+
+        case PARSE_TAG:
+            {
+            if (c == CH_HASH) {
+                parseState = PARSE_COMMENT;
+                savedState = PARSE_TAG;
+                break;
+            }
+            if (UCharacter.isWhitespace(c)) {
+                break;
+            }
+           if (testString.startsWith("<word>", charIdx-1)) {
+                tp.bi = BreakIterator.getWordInstance(Locale.US);
+                charIdx += 5;
+                break;
+            }
+            if (testString.startsWith("<char>", charIdx-1)) {
+                tp.bi = BreakIterator.getCharacterInstance(Locale.US);
+                charIdx += 5;
+                break;
+            }
+            if (testString.startsWith("<line>", charIdx-1)) {
+                tp.bi = BreakIterator.getLineInstance(Locale.US);
+                charIdx += 5;
+                break;
+            }
+            if (testString.startsWith("<sent>", charIdx-1)) {
+                tp.bi = BreakIterator.getSentenceInstance(Locale.US);
+                charIdx += 5;
+                break;
+            }
+            if (testString.startsWith("<title>", charIdx-1)) {
+                tp.bi = BreakIterator.getTitleInstance(Locale.US);
+                charIdx += 6;
+                break;
+            }
+            if (testString.startsWith("<data>", charIdx-1)) {
+                parseState = PARSE_DATA;
+                charIdx += 5;
+                tp.dataToBreak.setLength(0);
+                Arrays.fill(tp.expectedBreaks, 0);
+                Arrays.fill(tp.srcCol, 0);
+                Arrays.fill(tp.srcLine, 0);
+                break;
+            }
+
+            errln("line" + lineNum + ": Tag expected in test file.");
+            return;
+            //parseState = PARSE_COMMENT;
+            //savedState = PARSE_DATA;
+            }
+            // break;   // TODO: don't stop on errors
+
+        case PARSE_DATA:
+            if (c == CH_BULLET) {
+                int  breakIdx = tp.dataToBreak.length();
+                tp.expectedBreaks[breakIdx] = -1;
+                tp.srcLine[breakIdx]        = lineNum;
+                tp.srcCol[breakIdx]         = column;
+                break;
+            }
+
+            if (testString.startsWith("</data>", charIdx-1))  {
+                // Add final entry to mappings from break location to source file position.
+                //  Need one extra because last break position returned is after the
+                //    last char in the data, not at the last char.
+                int idx = tp.dataToBreak.length();
+                tp.srcLine[idx] = lineNum;
+                tp.srcCol[idx]  = column;
+
+                parseState = PARSE_TAG;
+                charIdx += 7;
+
+                // RUN THE TEST!
+                executeTest(tp);
+                break;
+            }
+
+           if (testString.startsWith("\\N{", charIdx-1)) {
+               int nameEndIdx = testString.indexOf('}', charIdx);
+               if (nameEndIdx == -1) {
+                   errln("foo");  // TODO:
+               }
+                // Named character, e.g. \N{COMBINING GRAVE ACCENT}
+                // Get the code point from the name and insert it into the test data.
+                String charName = testString.substring(charIdx+2, nameEndIdx);
+                c = UCharacter.getCharFromName(charName);
+                if (c == -1) {
+                    errln("Error in named character in test file at line " + lineNum +
+                            ", col " + column);
+                } else {               
+                    // Named code point was recognized.  Insert it
+                    //   into the test data.
+                    UTF16.append(tp.dataToBreak, c);
+                    for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
+                        tp.srcLine[i] = lineNum;
+                        tp.srcCol[i]  = column;
+                    }
+                    
+                 }
+                if (nameEndIdx > charIdx) {
+                    charIdx = nameEndIdx+1;
+                }
+                break;
+            }
+
+            if (testString.startsWith("<>", charIdx-1)) {
+                charIdx++;
+                int  breakIdx = tp.dataToBreak.length();
+                tp.expectedBreaks[breakIdx] = -1;
+                tp.srcLine[breakIdx]        = lineNum;
+                tp.srcCol[breakIdx]         = column;
+                break;
+            }
+
+            if (c == CH_LT) {
+                tagValue   = 0;
+                parseState = PARSE_NUM;
+                break;
+            }
+
+            if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
+                parseState = PARSE_COMMENT;
+                savedState = PARSE_DATA;
+                break;
+            }
+
+            if (c == CH_BACKSLASH) {
+                // Check for \ at end of line, a line continuation.
+                //     Advance over (discard) the newline
+                int cp = UTF16.charAt(testString, charIdx); 
+                if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
+                    // We have a CR LF
+                    //  Need an extra increment of the input ptr to move over both of them
+                    charIdx++;
+                }
+                if (cp == CH_LF || cp == CH_CR) {
+                    lineNum++;
+                    colStart = charIdx;
+                    charIdx++;
+                    break;
+                }
+
+                // Let unescape handle the back slash.
+                int  charIdxAr[] = new int[1];
+                charIdxAr[0] = charIdx;
+                cp = Utility.unescapeAt(testString, charIdxAr);
+                if (cp != -1) {
+                	// Escape sequence was recognized.  Insert the char
+                	//   into the test data.
+                	charIdx = charIdxAr[0];
+                	UTF16.append(tp.dataToBreak, cp);
+                	for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
+                		tp.srcLine[i] = lineNum;
+                		tp.srcCol[i]  = column;
+                	}
+                	
+                	break;
+                }
+
+
+                // Not a recognized backslash escape sequence.
+                // Take the next char as a literal.
+                //  TODO:  Should this be an error?
+                c = UTF16.charAt(testString,charIdx);
+                charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
+             }
+
+            // Normal, non-escaped data char.
+            UTF16.append(tp.dataToBreak, c);
+ 
+            // Save the mapping from offset in the data to line/column numbers in
+            //   the original input file.  Will be used for better error messages only.
+            //   If there's an expected break before this char, the slot in the mapping
+            //     vector will already be set for this char; don't overwrite it.
+            for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
+                tp.srcLine[i] = lineNum;
+                tp.srcCol[i]  = column;
+            }
+            break;
+
+
+        case PARSE_NUM:
+            // We are parsing an expected numeric tag value, like <1234>,
+            //   within a chunk of data.
+            if (UCharacter.isWhitespace(c)) {
+                break;
+            }
+
+            if (c == CH_GT) {
+                // Finished the number.  Add the info to the expected break data,
+                //   and switch parse state back to doing plain data.
+                parseState = PARSE_DATA;
+                if (tagValue == 0) {
+                    tagValue = -1;
+                }
+                int  breakIdx = tp.dataToBreak.length();
+                tp.expectedBreaks[breakIdx] = tagValue;
+                tp.srcLine[breakIdx]        = lineNum;
+                tp.srcCol[breakIdx]         = column;
+                break;
+            }
+
+            if (UCharacter.isDigit(c)) {
+                tagValue = tagValue*10 + UCharacter.digit(c);
+                break;
+            }
+
+            errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
+            return;
+            
+            // parseState = PARSE_COMMENT;   // TODO: unreachable.  Don't stop on errors.
+            // break;
+        }
+
+
+ 
+    }
+}
+
+void executeTest(TestParams t) {
+    int    bp;
+    int    prevBP;
+    int    i;
+
+    t.bi.setText(t.dataToBreak.toString());
+    //
+    //  Run the iterator forward
+    //
+    prevBP = -1;
+    for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
+        if (prevBP ==  bp) {
+            // Fail for lack of forward progress.
+            errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
+                    "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
+            break;
+        }
+
+        // Check that there were we didn't miss an expected break between the last one
+        //  and this one.
+        for (i=prevBP+1; i<bp; i++) {
+            if (t.expectedBreaks[i] != 0) {
+                int expected[] = {0, i};
+                printStringBreaks(t.dataToBreak, expected, 2);
+                errln("Forward Iteration, break expected, but not found.  Pos=" + i + 
+                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
+            }
+        }
+
+        // Check that the break we did find was expected
+        if (t.expectedBreaks[bp] == 0) {
+            int expected[] = {0, bp};
+            printStringBreaks(t.dataToBreak, expected, 2);
+            errln("Forward Iteration, break found, but not expected.  Pos=" + bp + 
+                    "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
+        } else {
+            // The break was expected.
+            //   Check that the {nnn} tag value is correct.
+            int expectedTagVal = t.expectedBreaks[bp];
+            if (expectedTagVal == -1) {
+                expectedTagVal = 0;
+            }
+            int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
+            if (rs != expectedTagVal) {
+                errln("Incorrect status for forward break.  Pos=  " + bp +
+                        "File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp] + "\n" +
+                      "          Actual, Expected status = " + rs + ", " + expectedTagVal);
+            }
+        }
+
+
+        prevBP = bp;
+    }
+
+    // Verify that there were no missed expected breaks after the last one found
+    for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
+        if (t.expectedBreaks[i] != 0) {
+            errln("Forward Iteration, break expected, but not found.  Pos=" + i + 
+                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
+       }
+    }
+
+    //
+    //  Run the iterator backwards, verify that the same breaks are found.
+    //
+    prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
+    for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
+        if (prevBP ==  bp) {
+            // Fail for lack of progress.
+            errln("Reverse Iteration, no progress.  Break Pos=" + bp +
+                    "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
+            break;
+        }
+
+        // Check that there were we didn't miss an expected break between the last one
+        //  and this one.  (UVector returns zeros for index out of bounds.)
+        for (i=prevBP-1; i>bp; i--) {
+            if (t.expectedBreaks[i] != 0) {
+                errln("Reverse Itertion, break expected, but not found.  Pos=" + i + 
+                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
+            }
+        }
+
+        // Check that the break we did find was expected
+        if (t.expectedBreaks[bp] == 0) {
+            errln("Reverse Itertion, break found, but not expected.  Pos=" + bp + 
+                    "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
+        } else {
+            // The break was expected.
+            //   Check that the {nnn} tag value is correct.
+            int expectedTagVal = t.expectedBreaks[bp];
+            if (expectedTagVal == -1) {
+                expectedTagVal = 0;
+            }
+            int rs = ((RuleBasedBreakIterator)t.bi).getRuleStatus();
+            if (rs != expectedTagVal) {
+                errln("Incorrect status for reverse break.  Pos=  " + bp +
+                        "File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp] + "\n" +
+                      "          Actual, Expected status = " + rs + ", " + expectedTagVal);
+                  }
+        }
+
+        prevBP = bp;
+    }
+
+    // Verify that there were no missed breaks prior to the last one found
+    for (i=prevBP-1; i>=0; i--) {
+        if (t.expectedBreaks[i] != 0) {
+            errln("Forward Itertion, break expected, but not found.  Pos=" + i +
+                    "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
+         }
+    }
+}
+
+
+void printStringBreaks(StringBuffer ustr, int expected[],
+		int expectedcount)
+{
+	String name;
+	System.out.println("code    alpha extend alphanum type line name");
+	int j;
+	for (j = 0; j < ustr.length(); j ++) {
+		if (expectedcount > 0) {
+			int k;
+			for (k = 0; k < expectedcount; k ++) {
+				if (j == expected[k]) {
+					System.out.println("------------------------------------------------ " + j);
 				}
-				//System.out.print((char)c);
 			}
-			
-		} catch (IOException e) {
-			errln(e.toString());
 		}
-		
+		int c = UTF16.charAt(ustr, j);
+		if (c > 0xffff) {
+			j ++;
+		}
+        name = UCharacter.getName(c);
+        System.out.println(  UCharacter.isUAlphabetic(c) + "  " +
+                             UCharacter.hasBinaryProperty(c, UProperty.GRAPHEME_EXTEND) + "  " +
+                             UCharacter.isLetterOrDigit(c) + "  " +
+                             UCharacter.getPropertyValueName(UProperty.LINE_BREAK, 
+                                    UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK), 
+                                            UProperty.NameChoice.SHORT)
+                             );
 	}
 }
+
+
+}
--- a/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java
+++ b/icu4j/src/com/ibm/icu/text/RuleBasedBreakIterator.java
@ -113,26 +113,26 @@ public class RuleBasedBreakIterator extends BreakIterator {
    
        /** Tag value for "words" that do not fit into any of other categories. 
         *  Includes spaces and most punctuation. */
-        public static final int UBRK_WORD_NONE           = 0;
+        public static final int WORD_NONE           = 0;
        /** Upper bound for tags for uncategorized words. */
-        public static final int UBRK_WORD_NONE_LIMIT     = 100;
+        public static final int WORD_NONE_LIMIT     = 100;
        /** Tag value for words that appear to be numbers, lower limit. */
-        public static final int UBRK_WORD_NUMBER         = 100;
+        public static final int WORD_NUMBER         = 100;
        /** Tag value for words that appear to be numbers, upper limit. */
-        public static final int UBRK_WORD_NUMBER_LIMIT   = 200;
+        public static final int WORD_NUMBER_LIMIT   = 200;
        /** Tag value for words that contain letters, excluding
         *  hiragana, katakana or ideographic characters, lower limit. */
-        public static final int UBRK_WORD_LETTER         = 200;
+        public static final int WORD_LETTER         = 200;
        /** Tag value for words containing letters, upper limit  */
-        public static final int UBRK_WORD_LETTER_LIMIT   = 300;
+        public static final int WORD_LETTER_LIMIT   = 300;
        /** Tag value for words containing kana characters, lower limit */
-        public static final int UBRK_WORD_KANA           = 300;
+        public static final int WORD_KANA           = 300;
        /** Tag value for words containing kana characters, upper limit */
-        public static final int UBRK_WORD_KANA_LIMIT     = 400;
+        public static final int WORD_KANA_LIMIT     = 400;
        /** Tag value for words containing ideographic characters, lower limit */
-        public static final int UBRK_WORD_IDEO           = 400;
+        public static final int WORD_IDEO           = 400;
        /** Tag value for words containing ideographic characters, upper limit */
-        public static final int UBRK_WORD_IDEO_LIMIT     = 500;
+        public static final int WORD_IDEO_LIMIT     = 500;

    //=======================================================================
    // BreakIterator overrides