ICU-11556 Line Break rules update for L2/16-043R, don't break CA$; also LB rules refactored for reduced memory consumption. ICU4J Data refreshed from ICU4C.
X-SVN-Rev: 38645
This commit is contained in:
parent
7265eeae4c
commit
c1422845ac
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:1420edbb9a70020f48c545a75738981907657c79f8c8543dbee9cbbb75ada655
|
||||
size 11767515
|
||||
oid sha256:9e60171048ccda76c8c9c0ced344822e21543ef8608d188f0029edfc5a5a87ea
|
||||
size 11718381
|
||||
|
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:75953512893b452eabe5d0b7721ca2c15473e11c14c51526d25d6aa1051b3bc1
|
||||
size 91108
|
||||
oid sha256:6451e003b77fcc7cf03b1e0a0eebdcc112a41209d3a41837964370a893198f24
|
||||
size 91105
|
||||
|
@ -1057,25 +1057,36 @@ public class RBBITestMonkey extends TestFmwk {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 23 (AL | HL) x NU
|
||||
// NU x (AL | HL)
|
||||
if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 23 ID x PO (Note: Leading CM behaves like ID)
|
||||
// AL x NU
|
||||
// NU x AL
|
||||
if (fID.contains(prevChar) && fPO.contains(thisChar) ||
|
||||
fAL.contains(prevChar) && fNU.contains(thisChar) ||
|
||||
fHL.contains(prevChar) && fNU.contains(thisChar) ||
|
||||
fNU.contains(prevChar) && fAL.contains(thisChar) ||
|
||||
fNU.contains(prevChar) && fHL.contains(thisChar) ) {
|
||||
continue;
|
||||
// LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
|
||||
// PR x (ID | EB | EM)
|
||||
// (ID | EB | EM) x PO
|
||||
if (fPR.contains(prevChar) &&
|
||||
(fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
|
||||
fPO.contains(thisChar)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// LB 24 Do not break between prefix and letters or ideographs.
|
||||
// PR x ID
|
||||
// PR x AL
|
||||
// PO x AL
|
||||
if (fPR.contains(prevChar) && fID.contains(thisChar) ||
|
||||
fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
|
||||
fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
|
||||
// (PR | PO) x (AL | HL)
|
||||
// (AL | HL) x (PR | PO)
|
||||
if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
|
||||
(fAL.contains(thisChar) || fHL.contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
|
||||
(fPR.contains(thisChar) || fPO.contains(thisChar))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright (c) 2001-2015 International Business Machines
|
||||
# Copyright (c) 2001-2016 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# RBBI Test Data
|
||||
@ -15,11 +15,11 @@
|
||||
# <locale local_name> Switch to the named locale at the next occurrence of <word>, <sent>, etc.
|
||||
# <data> ... </data> test data. May span multiple lines.
|
||||
# <> Break position, status == 0
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
# • Break position, status == 0 (Bullet, \u2022)
|
||||
# <nnn> Break position, status == nnn
|
||||
# \ Escape. Normal ICU unescape applied.
|
||||
# \ Escape. Normal ICU unescape applied.
|
||||
# \ at end of line -> Line Continuation. Remove both the backslash and the new line
|
||||
#
|
||||
#
|
||||
# In ICU4C, this test data is run by intltest, rbbi/RBBITest/TestExtended.
|
||||
# In ICU4J, this test data is run by com.ibm.icu.dev.test.rbbi.RBBITestExtended
|
||||
#
|
||||
@ -32,7 +32,24 @@
|
||||
# TODO: figure out how to have a single copy of the file for use by both C and Java.
|
||||
|
||||
|
||||
# Temp debugging tests
|
||||
## FILTERED BREAK TESTS
|
||||
|
||||
# (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
|
||||
#<locale en>
|
||||
#<sent>
|
||||
#<data>\
|
||||
#•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
|
||||
#
|
||||
#<locale en@ss=standard>
|
||||
#<sent>
|
||||
#<data>\
|
||||
#•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
|
||||
#
|
||||
## END FILTERED BREAK TESTS
|
||||
|
||||
<locale en>
|
||||
|
||||
# Temp debugging tests
|
||||
<sent>
|
||||
<data>•\u00c0.•</data>
|
||||
|
||||
@ -72,7 +89,7 @@
|
||||
# LVT : \uAC01
|
||||
|
||||
<data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT
|
||||
<data>•\u1100\u1161•\u1100\u1161•</data>
|
||||
<data>•\u1100\u1161•\u1100\u1161•</data>
|
||||
<data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data>
|
||||
<data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data>
|
||||
<data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data>
|
||||
@ -80,7 +97,7 @@
|
||||
|
||||
|
||||
# Hindi combining chars. (An old test)
|
||||
# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters
|
||||
# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters
|
||||
#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930•
|
||||
#•\u0939•\u094c•\u0964•</data>
|
||||
#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data>
|
||||
@ -192,7 +209,7 @@
|
||||
|
||||
# Words with interior formatting characters
|
||||
<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data>
|
||||
|
||||
|
||||
# to test for bug #4097779
|
||||
<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data>
|
||||
|
||||
@ -218,13 +235,13 @@
|
||||
<data>•\u06c9<200>\uc799\ufffa•</data>
|
||||
|
||||
|
||||
#
|
||||
#
|
||||
# Try some words from other scripts.
|
||||
#
|
||||
#
|
||||
|
||||
# Try some words from other scripts.
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
#
|
||||
<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data>
|
||||
|
||||
<data>•\u0301•A<200></data>
|
||||
@ -234,7 +251,7 @@
|
||||
# Hindi word break tests, imported from the old RBBI tests.
|
||||
# An historical note: a much earlier version of ICU break iterators had a number
|
||||
# of special case rules for Hindi, which were tested by an earlier version of
|
||||
# this test data. The current RBBI rules do not special case Hindi in
|
||||
# this test data. The current RBBI rules do not special case Hindi in
|
||||
# any way, making this test data much less significant.
|
||||
#
|
||||
<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200>
|
||||
@ -289,7 +306,7 @@ doing? •This\n<100> costs $20,00,000. •</data>
|
||||
"This isn't it." •Hi! \
|
||||
•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\
|
||||
"This isn't it." •\
|
||||
Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
|
||||
Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why.
|
||||
•Not on my time (el timo.)! •</data>
|
||||
|
||||
<data>•Hello. •So what!!\u2029•"But now," he said, \
|
||||
@ -334,11 +351,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•How do you do? •(fine). •</data>
|
||||
|
||||
#
|
||||
<data>•Hello.123<100></data> # Rule 6
|
||||
<data>•Hello?•123<100></data>
|
||||
|
||||
<data>•HELLO.Bye<100></data> # Rule 7
|
||||
<data>•HELLO?•Bye<100></data>
|
||||
<data>•Hello.123<100></data> # Rule 6
|
||||
<data>•Hello?•123<100></data>
|
||||
|
||||
<data>•HELLO.Bye<100></data> # Rule 7
|
||||
<data>•HELLO?•Bye<100></data>
|
||||
|
||||
<data>•Hello.goodbye<100></data> #Rule 8
|
||||
<data>•Hello. •Goodbye<100></data>
|
||||
@ -351,36 +368,36 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
#
|
||||
<data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible, work the same on all platforms. •</data>
|
||||
<data>•Another test.\u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4143071: Make sure sentences that end with digits
|
||||
# work right
|
||||
#
|
||||
<data>•Today is the 27th of May, 1998. •</data>
|
||||
<data>•Tomorrow with be 28 May 1998. •</data>
|
||||
<data>•The day after will be the 30th.\u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4152416: Make sure sentences ending with a capital
|
||||
# letter are treated correctly
|
||||
#
|
||||
<data>•The type of all primitive \<code>boolean\</code> values accessed in the target VM. •Calls to xxx will return an implementor of this interface. \u2029•</data>
|
||||
|
||||
|
||||
# test for bug #4152117: Make sure sentence breaking is handling
|
||||
# punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS
|
||||
# HERE TO MAKE SURE IT DOESN'T CROP UP]
|
||||
#
|
||||
<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc.
|
||||
<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc.
|
||||
•</data>
|
||||
|
||||
# sentence breaks for hindi which used Devanagari script
|
||||
# make sure there is sentence break after ?,danda(hindi phrase separator),
|
||||
# fullstop followed by space. (VERY old test)
|
||||
#
|
||||
#
|
||||
<data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u094d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\
|
||||
\u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u093e\n\
|
||||
<100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 means "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u0905\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u093f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. •</data>
|
||||
|
||||
# Regression test for bug #1984, Sentence break in Arabic text.
|
||||
|
||||
|
||||
<data>\
|
||||
•\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u0627\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u0645\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u0022\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u0630\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u0649\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data>
|
||||
|
||||
@ -496,6 +513,18 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
|
||||
<data>• \u200B\u2028<100>\u200B•</data>
|
||||
|
||||
# Regional Indicator sequences. They group in pairs. The reverse rules are tricky.
|
||||
# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems.
|
||||
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
|
||||
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
|
||||
|
||||
|
||||
# User Guide example
|
||||
|
||||
<data>•Parlez-•vous •français ?•</data>
|
||||
@ -506,11 +535,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
|
||||
<line>
|
||||
|
||||
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
|
||||
<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence.
|
||||
<100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data>
|
||||
|
||||
<line>
|
||||
<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar
|
||||
<100>How, •are, •you? •This, •costs •$20,00,000.•</data>
|
||||
|
||||
# test for bug #4068133
|
||||
@ -550,11 +579,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
# Regression for bug 836
|
||||
# Note: Unicode 5.1 changed this behavior
|
||||
# Unicode 5.2 changed it again, there is no break following the '('
|
||||
<data>•AAA(AAA •</data>
|
||||
<data>•AAA(AAA •</data>
|
||||
|
||||
# Try some words from other scripts.
|
||||
# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin
|
||||
#
|
||||
#
|
||||
<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data>
|
||||
|
||||
#
|
||||
@ -570,7 +599,7 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data>
|
||||
<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data>
|
||||
<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data>
|
||||
|
||||
|
||||
# Test for #10176 (in root)
|
||||
<line>
|
||||
<data>•abc/•s •def•</data>
|
||||
@ -578,6 +607,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
|
||||
<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
|
||||
<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
|
||||
|
||||
# Ticket #11556 don't break "R$" or "JP¥"
|
||||
<locale en>
|
||||
<line>
|
||||
<data>•R$ •JP¥ •a9 •3a •H% •CA$ •Travi$ •Scott •Ke$ha •Curren$y •A$AP •Rocky•</data>
|
||||
|
||||
|
||||
|
||||
########################################################################################
|
||||
@ -886,10 +920,10 @@ Bangkok)•</data>
|
||||
|
||||
# Finnish line breaking
|
||||
#
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# These rules deal with hyphens when there is a space on the leading side.
|
||||
# There should be a break opportunity between the space and the hyphen, and not after the hyphen.
|
||||
# See CLDR ticket 3029.
|
||||
# See ICU ticket 8151
|
||||
# See ICU ticket 8151
|
||||
|
||||
<locale root>
|
||||
<line>
|
||||
|
Loading…
Reference in New Issue
Block a user