ICU-20401 rbbi break rules, update comments to match current UAX versions.

This commit is contained in:
Andy Heninger 2019-02-06 18:01:04 -08:00
parent 8335adc310
commit b50f97a58a
11 changed files with 22 additions and 32 deletions

View File

@ -6,11 +6,9 @@
#
# file: char.txt
#
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
# ICU Character Break Rules
# These rules are based on the Extended Grapheme Cluster rules from
# Unicode UAX #29 Revision 34 for Unicode Version 12.0
!!quoted_literals_only;
@ -20,9 +18,6 @@
$CR = [\p{Grapheme_Cluster_Break = CR}];
$LF = [\p{Grapheme_Cluster_Break = LF}];
$Control = [[\p{Grapheme_Cluster_Break = Control}]];
# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
#$Virama = [[\p{Grapheme_Cluster_Break = Virama}]];
#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
$Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
$ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT);
# GB 9
[^$Control $CR $LF] ($Extend | $ZWJ);
# GB 9a (only for extended grapheme clusters)
# GB 9a
[^$Control $CR $LF] $SpacingMark;
# GB 9b

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -8,7 +8,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -7,7 +7,7 @@
#
# Line Breaking Rules
# Implement default line breaking as defined by
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
# http://www.unicode.org/reports/tr14/, with the following modification:
#
# Boundaries between hyphens and following letters are suppressed when

View File

@ -8,7 +8,7 @@
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
!!quoted_literals_only;
@ -34,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}];
#
# Define extended forms of the character classes,
# incorporate trailing Extend or Format chars.
# Rules 4 and 5.
# Rules 4 and 5.
$SpEx = $Sp ($Extend | $Format)*;
$LowerEx = $Lower ($Extend | $Format)*;
@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
#Rule 12
#Rule 998
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};

View File

@ -1,7 +1,6 @@
# Copyright (C) 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
#
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
# All Rights Reserved.
#
@ -9,7 +8,7 @@
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
!!quoted_literals_only;
@ -35,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}];
#
# Define extended forms of the character classes,
# incorporate trailing Extend or Format chars.
# Rules 4 and 5.
# Rules 4 and 5.
$SpEx = $Sp ($Extend | $Format)*;
$LowerEx = $Lower ($Extend | $Format)*;
@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
#Rule 9, 10, 11
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
#Rule 12
#Rule 998
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};

View File

@ -8,9 +8,7 @@
#
# ICU Word Break Rules
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# with additions for Emoji Sequences from https://goo.gl/cluFCn
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.
@ -58,7 +56,7 @@ $Hiragana = [:Hiragana:];
# 5.0 or later as the definition of Complex_Context was corrected to include all
# characters requiring dictionary break.
$Control = [\p{Grapheme_Cluster_Break = Control}];
$Control = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
$KanaKanji = [$Han $Hiragana $Katakana];
@ -70,7 +68,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
#
# Rule 4: Ignore Format and Extend characters,
# Rule 4: Ignore Format and Extend characters,
# except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
@ -148,7 +146,7 @@ $NumericEx $NumericEx {100};
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
# rule 11 and 12
# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
@ -180,7 +178,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b)
# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
# Rule 999
# Match a single code point if no other rule applies.

View File

@ -8,9 +8,7 @@
#
# ICU Word Break Rules, POSIX locale.
# See Unicode Standard Annex #29.
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
# with additions for Emoji Sequences from https://goo.gl/cluFCn
# Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
#
# Note: Updates to word.txt will usually need to be merged into
# word_POSIX.txt also.