ICU-11556 Line Break rules update for L2/16-043R, don't break CA$; also LB rules refactored for reduced memory consumption. ICU4J Data refreshed from ICU4C.

X-SVN-Rev: 38645
This commit is contained in:
Andy Heninger 2016-04-25 18:23:59 +00:00
parent 7265eeae4c
commit c1422845ac
4 changed files with 95 additions and 50 deletions

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1420edbb9a70020f48c545a75738981907657c79f8c8543dbee9cbbb75ada655
size 11767515
oid sha256:9e60171048ccda76c8c9c0ced344822e21543ef8608d188f0029edfc5a5a87ea
size 11718381

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:75953512893b452eabe5d0b7721ca2c15473e11c14c51526d25d6aa1051b3bc1
size 91108
oid sha256:6451e003b77fcc7cf03b1e0a0eebdcc112a41209d3a41837964370a893198f24
size 91105

View File

@ -1057,25 +1057,36 @@ public class RBBITestMonkey extends TestFmwk {
continue;
}
// LB 23 (AL | HL) x NU
// NU x (AL | HL)
if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
continue;
}
if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
continue;
}
// LB 23 ID x PO (Note: Leading CM behaves like ID)
// AL x NU
// NU x AL
if (fID.contains(prevChar) && fPO.contains(thisChar) ||
fAL.contains(prevChar) && fNU.contains(thisChar) ||
fHL.contains(prevChar) && fNU.contains(thisChar) ||
fNU.contains(prevChar) && fAL.contains(thisChar) ||
fNU.contains(prevChar) && fHL.contains(thisChar) ) {
continue;
// LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
// PR x (ID | EB | EM)
// (ID | EB | EM) x PO
if (fPR.contains(prevChar) &&
(fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) {
continue;
}
if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
fPO.contains(thisChar)) {
continue;
}
// LB 24 Do not break between prefix and letters or ideographs.
// PR x ID
// PR x AL
// PO x AL
if (fPR.contains(prevChar) && fID.contains(thisChar) ||
fPR.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar)) ||
fPO.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
// (PR | PO) x (AL | HL)
// (AL | HL) x (PR | PO)
if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
(fAL.contains(thisChar) || fHL.contains(thisChar))) {
continue;
}
if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
(fPR.contains(thisChar) || fPO.contains(thisChar))) {
continue;
}

View File

@ -1,4 +1,4 @@
# Copyright (c) 2001-2015 International Business Machines
# Copyright (c) 2001-2016 International Business Machines
# Corporation and others. All Rights Reserved.
#
# RBBI Test Data
@ -32,6 +32,23 @@
# TODO: figure out how to have a single copy of the file for use by both C and Java.
## FILTERED BREAK TESTS
# (William Bradford, public domain. http://catalog.hathitrust.org/Record/008651224 ) - edited.
#<locale en>
#<sent>
#<data>\
#•In the meantime Mr. •Weston arrived with his small ship, which he had now recovered. •Capt. •Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. •Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
#
#<locale en@ss=standard>
#<sent>
#<data>\
#•In the meantime Mr. Weston arrived with his small ship, which he had now recovered. •Capt. Gorges, who informed the Sgt. here that one purpose of his going east was to meet with Mr. Weston, took this opportunity to call him to account for some abuses he had to lay to his charge.•</data>
#
## END FILTERED BREAK TESTS
<locale en>
# Temp debugging tests
<sent>
<data>•\u00c0.•</data>
@ -496,6 +513,18 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>• •\uF8FF\u2028<100>\uF8FF•</data>
<data>• \u200B\u2028<100>\u200B•</data>
# Regional Indicator sequences. They group in pairs. The reverse rules are tricky.
# Sequences are long enough that the non-exhaustive monkey test won't reliably pick up problems.
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•</data>
<data>•\U0001F1E6\U0001F1E6•\U0001F1E6\u00a0\U0001F1E6\U0001F1E6•\U0001F1E6\U0001F1E6•\U0001F1E6•</data>
# User Guide example
<data>•Parlez-•vous •français ?•</data>
@ -578,6 +607,11 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal
<data>•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC•</data>
<data>•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•</data>
# Ticket #11556 don't break "R$" or "JP¥"
<locale en>
<line>
<data>•R$ •JP¥ •a9 •3a •H% •CA$ •Travi$ •Scott •Ke$ha •Curren$y •A$AP •Rocky•</data>
########################################################################################