scuffed-code/icu4c/source/data/brkitr/sent.txt
2003-11-05 23:50:39 +00:00

115 lines
2.9 KiB
Plaintext

#
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: sent.txt
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
# These rules are based on TR 29 version 4.0.0
#
# Rule-set option ("!!" prefix): enables rule chaining.
# NOTE(review): see the ICU break-rules documentation for the exact
# matching semantics of chained rules.
!!chain;
#
# Character categories as defined in TR 29
#
# Each $Name below is a variable bound to a Unicode set; the definitions and
# rules further down refer to the categories by these names.
#
# $Sep: the code points U+000A, U+000D, U+0085, U+2028 and U+2029.
$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
# $Format: characters matched by the Format property.
$Format = [[:Format:]];
# $Sp: Whitespace characters, with the $Sep characters removed.
$Sp = [[:Whitespace:] - $Sep];
# $Lower: characters with the Lowercase property.
$Lower = [[:Lowercase:]];
# $Upper: TitleCase_Letter characters together with Uppercase characters.
$Upper = [[:TitleCase_Letter:] [:Uppercase:]];
# $OLetter ("other letter"): Alphabetic characters plus HEBREW PUNCTUATION
# GERESH; the trailing "- [...]" removes everything in $Lower and $Upper.
$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
# $Numeric: characters whose LineBreak property is Numeric.
$Numeric = [:LineBreak = Numeric:];
# $ATerm: the full stop '.' only. It is kept apart from $Term because
# rules 6, 7 and 8 below mention $ATerm but not $Term.
$ATerm = [.];
# $Term: an explicit list of terminator code points (U+0021 '!', U+003F '?',
# and the others enumerated here). The set literal spans three lines.
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
\u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047
\u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
# $Close: Open_Punctuation, Close_Punctuation and LineBreak=Quotation
# characters, minus HEBREW PUNCTUATION GERESH and anything already in
# $ATerm or $Term.
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
# $Extend: characters with Grapheme_Extend = TRUE.
$Extend = [[:Grapheme_Extend = TRUE:]];
#
# "Ex" (extended) forms used by the forward rules.
# Each one is a single character of the base category followed by zero or
# more $Extend characters, so that a base character and the $Extend
# characters attached to it are matched together.
#
$ATermEx = $ATerm $Extend*;
$NumericEx = $Numeric $Extend*;
$UpperEx = $Upper $Extend*;
$CloseEx = $Close $Extend*;
$SpEx = $Sp $Extend*;
$LowerEx = $Lower $Extend*;
$TermEx = $Term $Extend*;
#
# Forward rules.
# The "rule N" labels are presumably the rule numbers of TR 29, which the
# file header names as the basis for these rules. $Format* is written at
# the points where Format characters are allowed to be interspersed.
#
# rule 6
# An ATerm followed by a Numeric, matched as one sequence.
$ATermEx $Format* $NumericEx;
# rule 7
# Upper, ATerm, Upper matched as one sequence. Note that $Format* is only
# allowed between the ATerm and the second Upper.
$UpperEx $ATermEx $Format* $UpperEx;
# rule 8
# ATerm, optional Close characters, one Sp, then any run of characters that
# are not OLetter/Upper/Lower/Sep, ending at a Lower. The pattern spans two
# lines.
# NOTE(review): exactly one $SpEx is required here; further spaces are
# covered by the negated set that follows, since $Sp is not excluded from
# it. Confirm that requiring at least one space is intended.
$ATermEx $Format* $CloseEx* $Format* $SpEx $Format*
[^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
# rule 9 forced to exit by / [^$Close $Sp]
# Both variants start with a terminator followed by a run of Close
# characters. The first ends at a $Sep. The second ends at one more Close
# or Sp, and the part after '/' is look-ahead context: the next character
# must be neither Close nor Sp.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
# rule 10 forced to exit by / [^$Sp]
# As rule 9, but a run of Sp may follow the Close run. The first variant
# ends at a $Sep. The second ends at a final Sp, with look-ahead context
# requiring that the next character is not Sp.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
# rule 11 partly included in rule 9 and 10
# What remains here is a terminator (with its $Extend characters) on its own.
$TermEx;
$ATermEx;
# rule 12
# First pattern: one or more characters that are not Term, ATerm or Sep.
# Second pattern: the same kind of run, with Format characters also
# allowed, followed by exactly one Term, ATerm or Sep.
([^$Term $ATerm $Sep] $Extend*)+;
([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
#
# Reverse Rules
#
# "Back...Ex" forms used by the reverse rules.
# Each one is the mirror image of the forward definition of the same name
# without the "Back" prefix: the $Extend* run is written first and the base
# character second. This is presumably because the reverse rules see the
# text back to front, so the $Extend characters that follow a base character
# in the text are met before it.
#
$BackATermEx = $Extend* $ATerm;
$BackNumericEx = $Extend* $Numeric;
$BackUpperEx = $Extend* $Upper;
$BackCloseEx = $Extend* $Close;
$BackSpEx = $Extend* $Sp;
$BackLowerEx = $Extend* $Lower;
$BackTermEx = $Extend* $Term;
#
# Every rule below starts with '!', which marks it as a reverse rule. The
# patterns for rules 6, 7 and 8 list their elements in the opposite order to
# the forward rules with the same numbers, i.e. the character that comes
# later in the text is written first.
#
# rule 3
# A $Sep followed, in pattern order, by any single character ('.').
! $Sep .;
# rule 6
# Mirror of forward rule 6: Numeric, optional Format, then ATerm.
! $BackNumericEx $Format* $BackATermEx;
# rule 7
# Mirror of forward rule 7: Upper, optional Format, ATerm, Upper.
! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
# rule 8
# Mirror of forward rule 8: Lower, a run of characters that are not
# OLetter/Upper/Lower/Sep, one Sp, optional Close characters, then ATerm.
# The pattern spans two lines.
! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format*
$BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
# rules 9, 10, 11, 12
# $Any:     any character that is not Term, ATerm or Sep.
# $Safe:    as $Any, but also excluding Sp and Close.
# $BackEnd: a sentence ending read backwards: a run of Sp, a run of Close,
#           then a Term or ATerm (Format characters allowed in between).
$Any = [^$Term $ATerm $Sep];
$Safe = [^$Term $ATerm $Sep $Sp $Close];
$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
# A sentence ending on its own.
! $BackEnd;
# An optional ending, then a run of $Any characters, stopping on a $Safe
# character.
! $BackEnd? $Any* $Safe;
# The same, but stopping on a $Close. The part after '/' is look-ahead
# context: one or more Sp and then a Term or ATerm must come next.
! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
# The same, but stopping on a $Sp whose look-ahead context is a $Sep.
! $BackEnd? $Any* $Sp / $Sep;