ICU-20401 rbbi break rules, update comments to match current UAX versions.

2019-02-06 18:01:04 -08:00 · 2019-02-06 18:01:04 -08:00 · b50f97a58a
commit b50f97a58a
parent 8335adc310
11 changed files with 22 additions and 32 deletions
--- a/icu4c/source/data/brkitr/rules/char.txt
+++ b/icu4c/source/data/brkitr/rules/char.txt
@@ -6,11 +6,9 @@
 #
 #   file:  char.txt
 #
-#   ICU Character Break Rules, also known as Grapheme Cluster Boundaries
-#      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#   ICU Character Break Rules
+#      These rules are based on the Extended Grapheme Cluster rules from
+#      Unicode UAX #29 Revision 34 for Unicode Version 12.0

 !!quoted_literals_only;

@@ -20,9 +18,6 @@
 $CR          = [\p{Grapheme_Cluster_Break = CR}];
 $LF          = [\p{Grapheme_Cluster_Break = LF}];
 $Control     = [[\p{Grapheme_Cluster_Break = Control}]];
-# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
-#$Virama      = [[\p{Grapheme_Cluster_Break = Virama}]];
-#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
 $Extend      = [[\p{Grapheme_Cluster_Break = Extend}]];
 $ZWJ         = [\p{Grapheme_Cluster_Break = ZWJ}];
 $Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
@@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT);
 # GB 9
 [^$Control $CR $LF] ($Extend | $ZWJ);

-# GB 9a (only for extended grapheme clusters)
+# GB 9a
 [^$Control $CR $LF] $SpacingMark;

 # GB 9b
--- a/icu4c/source/data/brkitr/rules/line.txt
+++ b/icu4c/source/data/brkitr/rules/line.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/line_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/line_loose.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose.txt
@@ -8,7 +8,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/line_loose_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_loose_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/line_normal.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/line_normal_cj.txt
+++ b/icu4c/source/data/brkitr/rules/line_normal_cj.txt
@@ -7,7 +7,7 @@
 #
 #         Line Breaking Rules
 #         Implement default line breaking as defined by
-#         Unicode Standard Annex #14 Revision 40 for Unicode 11.0
+#         Unicode Standard Annex #14 Revision 42 for Unicode 12.0
 #         http://www.unicode.org/reports/tr14/, with the following modification:
 #
 #         Boundaries between hyphens and following letters are suppressed when
--- a/icu4c/source/data/brkitr/rules/sent.txt
+++ b/icu4c/source/data/brkitr/rules/sent.txt
@@ -8,7 +8,7 @@
 #
 #   ICU Sentence Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #

 !!quoted_literals_only;
@@ -34,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
 #
 # Define extended forms of the character classes,
 #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.

 $SpEx       = $Sp      ($Extend | $Format)*;
 $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
 #Rule 9, 10, 11
 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

-#Rule 12
+#Rule 998
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
--- a/icu4c/source/data/brkitr/rules/sent_el.txt
+++ b/icu4c/source/data/brkitr/rules/sent_el.txt
@@ -1,7 +1,6 @@
 # Copyright (C) 2016 and later: Unicode, Inc. and others.
 # License & terms of use: http://www.unicode.org/copyright.html
 #
-#
 #   Copyright (C) 2002-2015, International Business Machines Corporation and others.
 #       All Rights Reserved.
 #
@@ -9,7 +8,7 @@
 #
 #   ICU Sentence Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #

 !!quoted_literals_only;
@@ -35,7 +34,7 @@ $Close     = [\p{Sentence_Break = Close}];
 #
 # Define extended forms of the character classes,
 #   incorporate trailing Extend or Format chars.
-#   Rules 4 and 5.  
+#   Rules 4 and 5.

 $SpEx       = $Sp      ($Extend | $Format)*;
 $LowerEx    = $Lower   ($Extend | $Format)*;
@@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
 #Rule 9, 10, 11
 ($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;

-#Rule 12
+#Rule 998
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
 [[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
--- a/icu4c/source/data/brkitr/rules/word.txt
+++ b/icu4c/source/data/brkitr/rules/word.txt
@@ -8,9 +8,7 @@
 #
 # ICU Word Break Rules
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.
@@ -58,7 +56,7 @@ $Hiragana           = [:Hiragana:];
 #   5.0 or later as the definition of Complex_Context was corrected to include all
 #   characters requiring dictionary break.

-$Control        = [\p{Grapheme_Cluster_Break = Control}]; 
+$Control        = [\p{Grapheme_Cluster_Break = Control}];
 $HangulSyllable = [\uac00-\ud7a3];
 $ComplexContext = [:LineBreak = Complex_Context:];
 $KanaKanji      = [$Han $Hiragana $Katakana];
@@ -70,7 +68,7 @@ $ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


 #
-#  Rules 4    Ignore Format and Extend characters, 
+#  Rules 4    Ignore Format and Extend characters,
 #             except when they appear at the beginning of a region of text.
 #
 # TODO: check if handling of katakana in dictionary makes rules incorrect/void
@@ -148,7 +146,7 @@ $NumericEx $NumericEx {100};

 $NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

-# rule 11 and 12 
+# rule 11 and 12

 $NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

@@ -180,7 +178,7 @@ $ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)

 # special handling for CJK characters: chain for later dictionary segmentation
 $HangulSyllable $HangulSyllable {200};
-$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found 
+$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found

 # Rule 999
 #     Match a single code point if no other rule applies.
--- a/icu4c/source/data/brkitr/rules/word_POSIX.txt
+++ b/icu4c/source/data/brkitr/rules/word_POSIX.txt
@@ -8,9 +8,7 @@
 #
 # ICU Word Break Rules, POSIX locale.
 #      See Unicode Standard Annex #29.
-#      These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
-#      with additions for Emoji Sequences from https://goo.gl/cluFCn
-#      Plus additional characters introduces with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
+#      These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
 #
 # Note:  Updates to word.txt will usually need to be merged into
 #        word_POSIX.txt also.