ICU-20401 rbbi break rules, update comments to match current UAX versions.
This commit is contained in:
parent
8335adc310
commit
b50f97a58a
@ -6,11 +6,9 @@
|
||||
#
|
||||
# file: char.txt
|
||||
#
|
||||
# ICU Character Break Rules, also known as Grapheme Cluster Boundaries
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# Plus revisions to rule GB 11 from http://unicode.org/cldr/trac/ticket/10088
|
||||
#   Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
# ICU Character Break Rules
|
||||
# These rules are based on the Extended Grapheme Cluster rules from
|
||||
# Unicode UAX #29 Revision 34 for Unicode Version 12.0
|
||||
|
||||
!!quoted_literals_only;
|
||||
|
||||
@ -20,9 +18,6 @@
|
||||
$CR = [\p{Grapheme_Cluster_Break = CR}];
|
||||
$LF = [\p{Grapheme_Cluster_Break = LF}];
|
||||
$Control = [[\p{Grapheme_Cluster_Break = Control}]];
|
||||
# TODO: Enable Virama & LinkingConsonant definitions once rule builder allows empty sets.
|
||||
#$Virama = [[\p{Grapheme_Cluster_Break = Virama}]];
|
||||
#$LinkingConsonant = [[\p{Grapheme_Cluster_Break = LinkingConsonant}]];
|
||||
$Extend = [[\p{Grapheme_Cluster_Break = Extend}]];
|
||||
$ZWJ = [\p{Grapheme_Cluster_Break = ZWJ}];
|
||||
$Regional_Indicator = [\p{Grapheme_Cluster_Break = Regional_Indicator}];
|
||||
@ -56,7 +51,7 @@ $L ($L | $V | $LV | $LVT);
|
||||
# GB 9
|
||||
[^$Control $CR $LF] ($Extend | $ZWJ);
|
||||
|
||||
# GB 9a (only for extended grapheme clusters)
|
||||
# GB 9a
|
||||
[^$Control $CR $LF] $SpacingMark;
|
||||
|
||||
# GB 9b
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -8,7 +8,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -7,7 +7,7 @@
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 Revision 40 for Unicode 11.0
|
||||
# Unicode Standard Annex #14 Revision 42 for Unicode 12.0
|
||||
# http://www.unicode.org/reports/tr14/, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
|
@ -8,7 +8,7 @@
|
||||
#
|
||||
# ICU Sentence Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
|
||||
!!quoted_literals_only;
|
||||
@ -34,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}];
|
||||
#
|
||||
# Define extended forms of the character classes,
|
||||
# incorporate trailing Extend or Format chars.
|
||||
# Rules 4 and 5.
|
||||
# Rules 4 and 5.
|
||||
|
||||
$SpEx = $Sp ($Extend | $Format)*;
|
||||
$LowerEx = $Lower ($Extend | $Format)*;
|
||||
@ -78,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
||||
#Rule 9, 10, 11
|
||||
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
|
||||
|
||||
#Rule 12
|
||||
#Rule 998
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
|
||||
|
@ -1,7 +1,6 @@
|
||||
# Copyright (C) 2016 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
#
|
||||
# Copyright (C) 2002-2015, International Business Machines Corporation and others.
|
||||
# All Rights Reserved.
|
||||
#
|
||||
@ -9,7 +8,7 @@
|
||||
#
|
||||
# ICU Sentence Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 26 for Unicode Version 8.0
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
|
||||
!!quoted_literals_only;
|
||||
@ -35,7 +34,7 @@ $Close = [\p{Sentence_Break = Close}];
|
||||
#
|
||||
# Define extended forms of the character classes,
|
||||
# incorporate trailing Extend or Format chars.
|
||||
# Rules 4 and 5.
|
||||
# Rules 4 and 5.
|
||||
|
||||
$SpEx = $Sp ($Extend | $Format)*;
|
||||
$LowerEx = $Lower ($Extend | $Format)*;
|
||||
@ -79,6 +78,6 @@ $ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;
|
||||
#Rule 9, 10, 11
|
||||
($STermEx | $ATermEx) $CloseEx* $SpEx* ($Sep | $CR | $LF)?;
|
||||
|
||||
#Rule 12
|
||||
#Rule 998
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
|
||||
[[^$STerm $ATerm $Close $Sp $Sep $LF $CR $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep $LF $CR {eof}] | $CR $LF){100};
|
||||
|
@ -8,9 +8,7 @@
|
||||
#
|
||||
# ICU Word Break Rules
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
@ -58,7 +56,7 @@ $Hiragana = [:Hiragana:];
|
||||
# 5.0 or later as the definition of Complex_Context was corrected to include all
|
||||
# characters requiring dictionary break.
|
||||
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$Control = [\p{Grapheme_Cluster_Break = Control}];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana];
|
||||
@ -70,7 +68,7 @@ $ALetterPlus = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
|
||||
|
||||
|
||||
#
|
||||
# Rule 4: Ignore Format and Extend characters,
|
||||
# Rule 4: Ignore Format and Extend characters,
|
||||
# except when they appear at the beginning of a region of text.
|
||||
#
|
||||
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
|
||||
@ -148,7 +146,7 @@ $NumericEx $NumericEx {100};
|
||||
|
||||
$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
|
||||
|
||||
# rule 11 and 12
|
||||
# rule 11 and 12
|
||||
|
||||
$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
|
||||
|
||||
@ -180,7 +178,7 @@ $ExtendNumLetEx $KatakanaEx {400}; # (13b)
|
||||
|
||||
# special handling for CJK characters: chain for later dictionary segmentation
|
||||
$HangulSyllable $HangulSyllable {200};
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
|
||||
|
||||
# Rule 999
|
||||
# Match a single code point if no other rule applies.
|
||||
|
@ -8,9 +8,7 @@
|
||||
#
|
||||
# ICU Word Break Rules, POSIX locale.
|
||||
# See Unicode Standard Annex #29.
|
||||
# These rules are based on UAX #29 Revision 29 for Unicode Version 9.0
|
||||
# with additions for Emoji Sequences from https://goo.gl/cluFCn
|
||||
#      Plus additional characters introduced with Emoji 5, http://www.unicode.org/reports/tr51/proposed.html
|
||||
# These rules are based on UAX #29 Revision 34 for Unicode Version 12.0
|
||||
#
|
||||
# Note: Updates to word.txt will usually need to be merged into
|
||||
# word_POSIX.txt also.
|
||||
|
Loading…
Reference in New Issue
Block a user