From b6dcdfcd2570b20fdb7c3825d9091f2fca0a34cc Mon Sep 17 00:00:00 2001 From: Peter Edberg Date: Fri, 30 Aug 2013 05:51:27 +0000 Subject: [PATCH] ICU-10176 No line break in $SY $HL; update tests accordingly X-SVN-Rev: 34142 --- icu4c/source/data/brkitr/line.txt | 8 ++++++- icu4c/source/data/brkitr/line_fi.txt | 8 ++++++- icu4c/source/data/brkitr/line_ja.txt | 8 ++++++- icu4c/source/test/intltest/rbbitst.cpp | 6 +++++ icu4c/source/test/testdata/LineBreakTest.txt | 4 ++-- icu4c/source/test/testdata/rbbitst.txt | 23 ++++++++++++++++++++ 6 files changed, 52 insertions(+), 5 deletions(-) diff --git a/icu4c/source/data/brkitr/line.txt b/icu4c/source/data/brkitr/line.txt index df59bffb7a..74560af044 100644 --- a/icu4c/source/data/brkitr/line.txt +++ b/icu4c/source/data/brkitr/line.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2002-2012 International Business Machines Corporation and +# Copyright (c) 2002-2013 International Business Machines Corporation and # others. All Rights Reserved. # # file: line.txt @@ -342,6 +342,10 @@ $BBcm $LB20NonBreaks $CM*; # $HLcm ($HYcm | $BAcm) [^$CB]?; +# LB 21b (forward) Don't break between SY and HL +# (break between HL and SY already disallowed by LB 13 above) +$SYcm $HLcm; + # LB 22 ($ALcm | $HLcm) $INcm; $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL @@ -575,6 +579,8 @@ $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . # LB21a [^$CB] $CM* ($HY | $BA) $CM* $HL; +# LB21b (reverse) +$CM* $HL $CM* $SY; # LB 22 $CM* $IN $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/line_fi.txt b/icu4c/source/data/brkitr/line_fi.txt index 38c835da1a..adf78bd388 100644 --- a/icu4c/source/data/brkitr/line_fi.txt +++ b/icu4c/source/data/brkitr/line_fi.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2002-2012 International Business Machines Corporation and +# Copyright (c) 2002-2013 International Business Machines Corporation and # others. All Rights Reserved. 
# # file: line_fi.txt @@ -348,6 +348,10 @@ $BBcm $LB20NonBreaks $CM*; # $HLcm ($HYcm | $BAcm | $HHcm) [^$CB]?; +# LB 21b (forward) Don't break between SY and HL +# (break between HL and SY already disallowed by LB 13 above) +$SYcm $HLcm; + # LB 22 ($ALcm | $HLcm) $INcm; $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL @@ -585,6 +589,8 @@ $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . # LB21a [^$CB] $CM* ($HY | $BA | $HH) $CM* $HL; +# LB21b (reverse) +$CM* $HL $CM* $SY; # LB 22 $CM* $IN $CM* ($ALPlus | $HL); diff --git a/icu4c/source/data/brkitr/line_ja.txt b/icu4c/source/data/brkitr/line_ja.txt index 2d4781b04a..70b203d1b0 100644 --- a/icu4c/source/data/brkitr/line_ja.txt +++ b/icu4c/source/data/brkitr/line_ja.txt @@ -1,4 +1,4 @@ -# Copyright (c) 2002-2012 International Business Machines Corporation and +# Copyright (c) 2002-2013 International Business Machines Corporation and # others. All Rights Reserved. # # file: line_ja.txt @@ -342,6 +342,10 @@ $BBcm $LB20NonBreaks $CM*; # $HLcm ($HYcm | $BAcm) [^$CB]?; +# LB 21b (forward) Don't break between SY and HL +# (break between HL and SY already disallowed by LB 13 above) +$SYcm $HLcm; + # LB 22 ($ALcm | $HLcm) $INcm; $CM+ $INcm; # by rule 10, any otherwise unattached CM behaves as AL @@ -575,6 +579,8 @@ $CM* [$LB20NonBreaks-$CM] $CM* $BB; # BB x . 
# LB21a [^$CB] $CM* ($HY | $BA) $CM* $HL; +# LB21b (reverse) +$CM* $HL $CM* $SY; # LB 22 $CM* $IN $CM* ($ALPlus | $HL); diff --git a/icu4c/source/test/intltest/rbbitst.cpp b/icu4c/source/test/intltest/rbbitst.cpp index 1007422ea4..51cd6d3757 100644 --- a/icu4c/source/test/intltest/rbbitst.cpp +++ b/icu4c/source/test/intltest/rbbitst.cpp @@ -3189,6 +3189,12 @@ int32_t RBBILineMonkey::next(int32_t startPos) { continue; } + // LB 21b + // SY x HL + if (fSY->contains(prevChar) && fHL->contains(thisChar)) { + continue; + } + // LB 22 if ((fAL->contains(prevChar) && fIN->contains(thisChar)) || (fHL->contains(prevChar) && fIN->contains(thisChar)) || diff --git a/icu4c/source/test/testdata/LineBreakTest.txt b/icu4c/source/test/testdata/LineBreakTest.txt index cc5d66785f..75b398e2d3 100644 --- a/icu4c/source/test/testdata/LineBreakTest.txt +++ b/icu4c/source/test/testdata/LineBreakTest.txt @@ -4914,9 +4914,9 @@ × 002F × 0020 ÷ AC01 ÷ # × [0.3] SOLIDUS (SY) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3] × 002F × 0308 ÷ AC01 ÷ # × [0.3] SOLIDUS (SY) × [9.0] COMBINING DIAERESIS (CM) ÷ [999.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3] × 002F × 0308 × 0020 ÷ AC01 ÷ # × [0.3] SOLIDUS (SY) × [9.0] COMBINING DIAERESIS (CM) × [7.01] SPACE (SP) ÷ [18.0] HANGUL SYLLABLE GAG (H3) ÷ [0.3] -× 002F ÷ 05D0 ÷ # × [0.3] SOLIDUS (SY) ÷ [999.0] HEBREW LETTER ALEF (HL) ÷ [0.3] +× 002F × 05D0 ÷ # × [0.3] SOLIDUS (SY) × [21.05] HEBREW LETTER ALEF (HL) ÷ [0.3] × 002F × 0020 ÷ 05D0 ÷ # × [0.3] SOLIDUS (SY) × [7.01] SPACE (SP) ÷ [18.0] HEBREW LETTER ALEF (HL) ÷ [0.3] -× 002F × 0308 ÷ 05D0 ÷ # × [0.3] SOLIDUS (SY) × [9.0] COMBINING DIAERESIS (CM) ÷ [999.0] HEBREW LETTER ALEF (HL) ÷ [0.3] +× 002F × 0308 × 05D0 ÷ # × [0.3] SOLIDUS (SY) × [9.0] COMBINING DIAERESIS (CM) × [21.05] HEBREW LETTER ALEF (HL) ÷ [0.3] × 002F × 0308 × 0020 ÷ 05D0 ÷ # × [0.3] SOLIDUS (SY) × [9.0] COMBINING DIAERESIS (CM) × [7.01] SPACE (SP) ÷ [18.0] HEBREW LETTER ALEF (HL) ÷ [0.3] × 002F × 002D ÷ # × [0.3] SOLIDUS (SY) × 
[21.02] HYPHEN-MINUS (HY) ÷ [0.3] × 002F × 0020 ÷ 002D ÷ # × [0.3] SOLIDUS (SY) × [7.01] SPACE (SP) ÷ [18.0] HYPHEN-MINUS (HY) ÷ [0.3] diff --git a/icu4c/source/test/testdata/rbbitst.txt b/icu4c/source/test/testdata/rbbitst.txt index 8f185039b7..fc41df5a90 100644 --- a/icu4c/source/test/testdata/rbbitst.txt +++ b/icu4c/source/test/testdata/rbbitst.txt @@ -567,6 +567,14 @@ What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tal •\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3• •\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc• +# Test for #10176 (in root) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA• + + ######################################################################################## # @@ -762,6 +770,14 @@ Bangkok)• •私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。• +# Test for #10176 (in ja) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA• + + •私<400>達<400>に<400>一<400>〇<400>〇〇<400>の<400>コンピュータ<400>が<400>ある<400>。<0>奈々<400>は<400>ワード<400>で<400>ある<400>。• @@ -834,3 +850,10 @@ Bangkok)• •abc •- •def •abc •-def •abc- •def • # With ASCII hyphen •abc •‐ •def •abc •‐def •abc‐ •def • # With Unicode u2010 hyphen + +# Test for #10176 (in fi) + +•abc/•s •def• +•abc/\u05D9 •def• +•\u05E7\u05D7/\u05D9 •\u05DE\u05E2\u05D9\u05DC• +•\u05D3\u05E8\u05D5\u05E9\u05D9\u05DD •\u05E9\u05D7\u05E7\u05E0\u05D9\u05DD/\u05D9\u05D5\u05EA•