ICU-8564 move updated rbbitst.txt test case file from icu4c to icu4j. A number of test cases are commented out, and tickets 9099 and 9084 opened to fix them.

X-SVN-Rev: 31335
This commit is contained in:
Andy Heninger 2012-02-07 01:59:19 +00:00
parent c113439fad
commit 1b46329c58

View File

@ -1,4 +1,4 @@
# Copyright (c) 2001-2009 International Business Machines
# Copyright (c) 2001-2012 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -20,12 +20,22 @@
# \ Escape. Normal ICU unescape applied.
# \ at end of line -> Line Continuation. Remove both the backslash and the new line
#
# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended.
# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended
#
# There are two copies of this file in the source repository,
# [ICU4C] source/test/testdata/rbbitst.txt
# [ICU4J] main/tests/core/src/com/ibm/icu/dev/test/rbbi/rbbitst.txt
#
# ICU4C's copy is the master. If any changes are made to ICU4J's copy, make sure they
# are merged back into ICU4C's copy of the file, lest they get overwritten later.
# TODO: figure out how to have a single copy of the file for use by both C and Java.
# Temp debugging tests
<locale en>
<line>
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
<data>•Hello, •World.•</data>
########################################################################################
#
@ -214,6 +224,9 @@
#
<data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data>
# User guide example:
<data>•Parlez<200>-•vous<200> •français<200> •?•</data>
########################################################################################
#
#
@ -453,6 +466,9 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
<data>• \u200B\u2028<100>\u200B•</data>
# User Guide example
<data>•Parlez-•vous •français ?•</data>
#
# Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBasedLineIteration()
@ -556,27 +572,233 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
# Test data originally from the test code source file
# // @suwit -- Thai sample data from GVT Guideline
#
# NOTE: Thai Java results differ from C because the new dictionary code is not
# yet ported. All tags are zero with the old Thai impl.
#<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\
#\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\
#\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\
#\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data>
#
## Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
#<data>•กู<200> •กิน<200>กุ้ง<200> •ปิ้่<200>งอ<200>ยู่<200>ใน<200>ถ้ำ<200></data>
#
#<data>•\u0E01\u0E39<200>\u0020•\u0E01\u0E34\u0E19<200>\u0E01\u0E38\u0E49\u0E07<200>\
#\u0020•\u0E1B\u0E34\u0E49\u0E48<200>\u0E07\u0E2D<200>\u0E22\u0E39\u0E48<200>\
#\u0E43\u0E19<200>\u0E16\u0E49\u0E33<200></data>
<line>
<data>•\u0E01\u0E39\u0020•\u0E01\u0E34\u0E19•\u0E01\u0E38\u0E49\u0E07\
\u0020•\u0E1B\u0E34\u0E49\u0E48•\u0E07\u0E2D•\u0E22\u0E39\u0E48•\
\u0E43\u0E19•\u0E16\u0E49\u0E33•</data>
# Data originally from intltest RBBITest::TestThaiLineBreak()
#
# \u0e2f-- the Thai paiyannoi character-- isn't a letter. It's a symbol that
# represents elided letters at the end of a long word. It should be bound to
# the end of the word and not treated as an independent punctuation mark.
#
# The one time where the paiyannoi occurs somewhere other than at the end
# of a word is in the Thai abbreviation for "etc.", which both begins and
# ends with a paiyannoi.
#
<line>
<data>•\u0e2a\u0e16\u0e32\u0e19\u0e35\u0e2f•\
\u0e08\u0e30•\
\u0e23\u0e30\u0e14\u0e21•\
\u0e40\u0e08\u0e49\u0e32•\
\u0e2b\u0e19\u0e49\u0e32\u0e17\u0e35\u0e48•\
\u0e2d\u0e2d\u0e01•\
\u0e21\u0e32•\
\u0e40\u0e23\u0e48\u0e07•\
\u0e23\u0e30\u0e1a\u0e32\u0e22•\
\u0e2d\u0e22\u0e48\u0e32\u0e07•\
\u0e40\u0e15\u0e47\u0e21•\
\u0e2f\u0e25\u0e2f•\
\u0e17\u0e35\u0e48•\
\u0e19\u0e31\u0e49\u0e19•</data>
# Data originally from RBBITest::TestMixedThaiLineBreak()
# @suwit -- Test Arabic numerals, Thai numerals, Punctuation and English characters start
#
<line>
#<data>•\u0E1B\u0E35•\
#\u0E1E\u0E38\u0E17\u0E18\u0E28\u0E31\u0E01\u0E23\u0E32\u0E0A •\
#2545 •\
#\u0E40\u0E1B\u0E47\u0E19•\
#\u0E1B\u0E35•\
#\u0E09\u0E25\u0E2D\u0E07•\
#\u0E04\u0E23\u0E1A•\
#\u0E23\u0E2D\u0E1A •\
#\"\u0E52\u0E52\u0E50 •\
#\u0E1b\u0E35\" •\
#\u0E02\u0E2d\u0E07•\
#\u0E01\u0E23\u0E38\u0E07•\
#\u0E23\u0E31\u0E15\u0E19\u0E42\u0E01\u0E2A\u0E34\u0E19\u0E17\u0E23\u0E4C •\
#(\u0E01\u0E23\u0E38\u0E07\u0E40\u0E17\u0E1e\u0E2F•\
#\u0E2B\u0E23\u0E37\u0E2D •\
#Bangkok)•</data>
# Data originally from RBBITest::TestMaiyamok()
# The Thai maiyamok character is a shorthand symbol that means "repeat the previous
# word". Instead of appearing as a word unto itself, however, it's kept together
# with the word before it.
#
<line>
<data>•\u0e44\u0e1b\u0e46•\
\u0e21\u0e32\u0e46•\
\u0e23\u0e30\u0e2b\u0e27\u0e48\u0e32\u0e07•\
\u0e01\u0e23\u0e38\u0e07•\
\u0e40\u0e17\u0e1e•\
\u0e41\u0e25\u0e30•\
\u0e40\u0e03\u0e35•\
\u0e22\u0e07•\
\u0e43\u0e2b\u0e21\u0e48•</data>
##########################################################################################
#
# Khmer Tests
#
##########################################################################################
# Test data originally from http://bugs.icu-project.org/trac/search?q=r30327
# from the file testdata/wordsegments.txt
<locale th>
<word>
#<data>•តើ<200>លោក<200>មក<200>ពី<200>ប្រទេស<200>ណា<200></data>
#<data>•សណ្ដូក<200>ក<200>បណ្ដែត<200>ខ្លួន<200></data>
#<data>•ពណ៌ស<200>ម្ដេច<200>ថា<200>ខ្មៅ<200></data>
##ប្រយោគ|ពី|របៀប|រួបរួម|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
#<data>•ប្រយោគ<200>ពី<200>របៀប<200>ដែល<200>និង<200>ភាព<200>ផ្សេងគ្នា<200>ដែល<200>អាច<200>ចូល<200></data>
##ប្រយោគ|ពី|របៀប|ជា|មួយ|និង|ភាព|ផ្សេងគ្នា|ដែល|អាច|ចូល<200></data>
#<data>•សូម<200>ចំណាយពេល<200>បន្តិច<200>ដើម្បី<200>អធិស្ឋាន<200>អរព្រះគុណ<200>ដល់<200>ព្រះអង្គ<200></data>
#<data>•ការ<200>ថោកទាប<200>បរិប្បូណ៌<200>ដោយ<200></data>
#<data>•ប្រើប្រាស់<200>ស្អាត<200>ទាំង<200>ចិត្ត<200>សិស្ស<200>នោះ<200></data>
#<data>•បើ<200>អ្នក<200>ប្រព្រឺត្ត<200>អំពើអាក្រក់<200>មុខ<200>ជា<200>មាន<200></data>
#<data>•ប្រដាប់<200>ប្រដា<200>រ<200>រៀនសូត្រ<200>បន្ទប់<200>រៀន<200></data>
#<data>•ដើរតួ<200>មនុស្សគ<200>ឥត<200>បញ្ចេញ<200>យោបល់<200>សោះ<200>ឡើយ<200></data>
#<data>•មិន<200>អាច<200>ឲ្យ<200>យើង<200>ធ្វើ<200>កសិកម្ម<200>បាន<200>ឡើយ<200></data>
#<data>•បន្ត<200>សេចក្ត<200>ទៅទៀត<200></data>
#<data>•ក្រុម<200>ប៉ូលិស<200>បណ្តាក់<200>គ្នា<200></data>
#<data>•គ្មាន<200>សុខ<200>សំរាន្ត<200>ដង<200>ណា<200></data>
#<data>•បាន<200>សុខភាព<200>បរិប្បូណ៌<200></data>
#<data>•ជា<200>មេចោរ<200>ខ្ញុំ<200>នឹង<200>ស្លាប់<200>ទៅវិញ<200>ជា<200>មេចោរ<200></data>
#<data>•ឯ<200>ការ<200>វាយ<200>ផ្ចាល<200>ដែល<200>នាំ<200></data>
#<data>•គេ<200>ដឹក<200>ទៅ<200>សំឡាប់<200></data>
##អ្នក|ដែល|ជា|មន្ត្រី|ធំ|លើ|គាត់|ទេ<200></data>
#<data>•យក<200>ទៅ<200>សម្លាប់ចោល<200>ស្ងាត់<200></data>
#<data>•ត្រូវ<200>បាន<200>គេ<200>សម្លាប់<200></data>
#<data>•នៅក្នុង<200>ស្រុក<200>ខ្ល<200>ងហ្ស៊ុន<200></data>
<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<000>\u0E04\u0E33<000>\u0E44\u0E17\u0E22<000>\
\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<000>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<000>\
\u0E14\u0E49\u0E27\u0E22<000>\u0e2b\u0e25\u0e32\u0e22<000>\
\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<000></data>
#
# Jitterbug 3671 Test Case
#
<data>•สวัสดี<000>ครับ<000>สบาย<000>ดี<000>ไหม<000> •ครับ<000></data>
#<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data>
#
# Trac ticket 5595 Test Case
# (Omitted from ICU4J because of old dictionary implementation.)
#<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\
#ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\
#ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\
#สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\
#ไมล์<200></data>
####################################################################################
#
# <data>•บท<000>ที่๑พายุ<000>ไซโคลน<000>โด<000>โรธี<000>อาศัย<000>อยู่<000>ท่ามกลาง<000>\
# ทุ่งใหญ่<000>ใน<000>แคนซัส<000>กับ<000>ลุง<000>เฮ<000>นรี<000>ชาวไร่<000>และ<000>ป้า<000>เอ็ม<000>\
# ภรรยา<000>ชาวไร่<000>บ้าน<000>ของ<000>พวก<000>เขา<000>หลัง<000>เล็ก<000>เพราะ<000>ไม้<000>\
# สร้าง<000>บ้าน<000>ต้อง<000>ขน<000>มา<000>ด้วย<000>เกวียน<000>เป็น<000>ระยะ<000>ทาง<000>หลาย<000>\
# ไมล์<000></data>
# Tailored (locale specific) breaking.
#
####################################################################################
# Japanese line break tailoring test
<locale ja>
<line>
<data>•\u3041•\u3043•\u3045•\u31f1•</data>
<locale en>
<line>
<data>•\u3041\u3043\u3045\u31f1•</data>
# The following data was originally in RBBITest::TestJapaneseWordBreak()
<locale ja>
<word>
<data>•\u4ECA\u65E5<400>\u306F\u3044\u3044<300>\u5929\u6C17<400>\u3067\u3059\u306D<300>\u3002•\u000D\u000A•</data>
# UBreakIteratorType UBRK_WORD, Locale "ja"
# Don't break in runs of hiragana or runs of ideographs, where the latter includes \u3005 \u3007 \u303B (cldrbug #2009).
# \u79C1\u9054\u306B\u4E00\u3007\u3007\u3007\u306E\u30B3\u30F3\u30D4\u30E5\u30FC\u30BF\u304C\u3042\u308B\u3002\u5948\u3005\u306F\u30EF\u30FC\u30C9\u3067\u3042\u308B\u3002
<locale ja>
<word>
<data>•私達<400>に<300>一〇〇〇<400>の<300>コンピュータ<300>がある<300>。<0>奈々<400>は<300>ワード<300>である<300>。•</data>
<locale root>
<word>
<data>•私<400>達<400>に<300>一<400>〇<400>〇<400>〇<400>の<300>コンピュータ<300>が<300>あ<300>る<300>。<0>奈<400>々<200>は<300>ワード<300>で<300>あ<300>る<300>。•</data>
# UBreakIteratorType UBRK_SENTENCE, Locale "el"
# Add break after Greek question mark (cldrbug #2069).
# "\u0391\u03B2, \u03B3\u03B4; \u0395 \u03B6\u03B7\u037E \u0398 \u03B9\u03BA. "
# "\u039B\u03BC \u03BD\u03BE! \u039F\u03C0, \u03A1\u03C2? \u03A3"
# which is "Αβ, γδ; Ε ζη; Θ ικ. Λμ νξ! Οπ, Ρς? Σ"
<locale root>
<sent>
<data>•Αβ, γδ; Ε ζη; Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data>
<locale el>
<sent>
<data>•Αβ, γδ; •Ε ζη; •Θ ικ. •Λμ νξ! •Οπ, Ρς? •Σ<100></data>
# UBreakIteratorType UBRK_WORD, Locale "en_US_POSIX"
# Words don't include colon or period (cldrbug #1969).
<locale en_US>
<word>
<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx:yy<200> •or<200> •struct.field<200> \
•for<200> •CS<200>-•types<200>.•</data>
<locale en_US_POSIX>
<word>
<data>•Can't<200> •have<200> •breaks<200> •in<200> •xx<200>:•yy<200> •or<200> •struct<200>.•field<200> \
•for<200> •CS<200>-•types<200>.•</data>
# UBreakIteratorType UBRK_CHARACTER, Locale "th"
# Clusters should not include spacing Thai/Lao vowels (prefix or postfix), except for [SARA] AM (cldrbug #2161).
# Update: As of Unicode 6.1 root has same behavior as th for this.
#
# "\u0E01\u0E23\u0E30\u0E17\u0E48\u0E2D\u0E21\u0E23\u0E08\u0E19\u0E32 "
# "(\u0E2A\u0E38\u0E0A\u0E32\u0E15\u0E34-\u0E08\u0E38\u0E11\u0E32\u0E21\u0E32\u0E28) "
# "\u0E40\u0E14\u0E47\u0E01\u0E21\u0E35\u0E1B\u0E31\u0E0D\u0E2B\u0E32 "
# which is "กระท่อมรจนา (สุชาติ-จุฑามาศ) เด็กมีปัญหา "
<locale th>
<char>
<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
<locale root>
<char>
<data>•\u0E01•\u0E23•\u0E30•\u0E17\u0E48•\u0E2D•\u0E21•\u0E23•\u0E08•\u0E19•\u0E32• •\
(•\u0E2A\u0E38•\u0E0A•\u0E32•\u0E15\u0E34•-•\u0E08\u0E38•\u0E11•\u0E32•\u0E21•\u0E32•\u0E28•)• •\
\u0E40•\u0E14\u0E47•\u0E01•\u0E21\u0E35•\u0E1B\u0E31•\u0E0D•\u0E2B•\u0E32• •</data>
# Finnish line breaking
#
# These rules deal with hyphens when there is a space on the leading side.
# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
# See CLDR ticket 3029.
# See ICU ticket 8151
<locale root>
<line>
<data>•abc •- •def •abc •-•def •abc- •def •abc-•def•</data> # With ASCII hyphen
<data>•abc •‐ •def •abc •‐•def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen
<locale fi>
<line>
#<data>•abc •- •def •abc •-def •abc- •def •abc-•def•</data> # With ASCII hyphen
#<data>•abc •‐ •def •abc •‐def •abc‐ •def •abc‐•def•</data> # With Unicode u2010 hyphen