2002-08-30 21:37:59 +00:00
|
|
|
#
|
2003-06-03 20:58:22 +00:00
|
|
|
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
2002-08-30 21:37:59 +00:00
|
|
|
# All Rights Reserved.
|
|
|
|
#
|
|
|
|
# file: sent.txt
|
|
|
|
#
|
|
|
|
# ICU Sentence Break Rules
|
2003-05-16 22:05:35 +00:00
|
|
|
# See Unicode Standard Annex #29.
|
2003-05-30 16:07:39 +00:00
|
|
|
# These rules are based on TR 29 version 4.0.0
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# RBBI option: turn on rule chaining for the rules in this file
# (see the ICU break-rule documentation for the exact semantics).
!!chain;
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
|
|
|
# Character categories as defined in TR 29
|
|
|
|
#
|
2003-05-16 22:05:35 +00:00
|
|
|
# Sep: line and paragraph separators -- LF (U+000A), CR (U+000D), NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
|
|
|
|
# Format: every character matched by the [:Format:] property set.
$Format = [[:Format:]];
|
2002-08-30 21:37:59 +00:00
|
|
|
# Sp: Whitespace characters, minus the ones already classified as $Sep.
$Sp = [[:Whitespace:] - $Sep];
|
|
|
|
# Lower: characters with the Lowercase property.
$Lower = [[:Lowercase:]];
|
2003-05-16 22:05:35 +00:00
|
|
|
# Upper: titlecase letters together with characters that have the Uppercase property.
$Upper = [[:TitleCase_Letter:] [:Uppercase:]];
|
|
|
|
# OLetter ("other letter"): Alphabetic characters plus HEBREW PUNCTUATION GERESH,
# with everything already in $Lower or $Upper removed.
$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
|
|
|
|
# Numeric: characters whose LineBreak property value is Numeric.
$Numeric = [:LineBreak = Numeric:];
|
|
|
|
|
|
|
|
# ATerm: the single character '.' (inside a [] set the dot is a literal period,
# unlike the bare '.' used in the reverse rules, which matches any character).
$ATerm = [.];
|
|
|
|
|
|
|
|
# Term: sentence-terminating punctuation other than '.', listed explicitly by
# code point (for example U+0021 '!', U+003F '?', U+3002 IDEOGRAPHIC FULL STOP).
# The set literal continues over the following lines up to the closing ']'.
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
|
|
|
|
\u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047
|
|
|
|
\u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
|
|
|
|
|
|
|
|
# Close: opening punctuation, closing punctuation and characters with
# Linebreak=Quotation, excluding HEBREW PUNCTUATION GERESH and anything that is
# already in $ATerm or $Term (the subtracted set is on the continuation line).
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
|
|
|
|
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
|
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# Extend: characters with Grapheme_Extend=TRUE; they are attached to the
# preceding base character by the "...Ex" definitions that follow.
$Extend = [[:Grapheme_Extend = TRUE:]];
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# "Ex" forms: each base category followed by any number of $Extend characters,
# so a base character and its trailing extenders are treated as one unit by the
# forward rules below.
$ATermEx = $ATerm $Extend*;
|
|
|
|
$NumericEx = $Numeric $Extend*;
|
|
|
|
$UpperEx = $Upper $Extend*;
|
|
|
|
$CloseEx = $Close $Extend*;
|
|
|
|
$SpEx = $Sp $Extend*;
|
|
|
|
$LowerEx = $Lower $Extend*;
|
|
|
|
$TermEx = $Term $Extend*;
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 6
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# An ATerm followed (optionally through Format characters) by a Numeric is
# matched as one piece, e.g. the ".5" in "0.5", so the '.' does not end the match.
$ATermEx $Format* $NumericEx;
|
|
|
|
|
|
|
|
# rule 7
|
|
|
|
|
|
|
|
# Upper, ATerm, optional Format characters, Upper -- e.g. the "U.S" in "U.S.A".
# Note that Format characters are only allowed after the ATerm, not before it.
$UpperEx $ATermEx $Format* $UpperEx;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 8
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# ATerm, any closing punctuation, one space, then any run of characters that
# are not letters or separators, ending in a Lower: a '.' that is followed by a
# lowercase continuation is matched as one piece.
# The rule is split over two lines; the statement ends at the ';'.
# NOTE(review): UAX #29 states rule 8 with "Sp*" (zero or more spaces), whereas
# this rule requires exactly one $SpEx after the Close run (further spaces are
# only absorbed by the negated set). Text such as "a.b" is therefore not
# covered by this rule -- confirm whether that is intentional. Any change here
# must be mirrored in reverse rule 8.
$ATermEx $Format* $CloseEx* $Format* $SpEx $Format*
|
|
|
|
[^$OLetter $Upper $Lower $Sep]* $Extend* $Format* $LowerEx;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 9 -- the match is forced to end by the trailing context / [^$Close $Sp]
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# Form 1: a terminator (Term or ATerm), any closing punctuation, then a $Sep.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* $Sep;
|
|
|
|
# Form 2: a terminator, any closing punctuation, then one more Close or Sp.
# The set after '/' is trailing context: the next character must be neither
# $Close nor $Sp, but it is not part of the match itself.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($CloseEx | $SpEx) / [^$Close $Sp];
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 10 -- the match is forced to end by the trailing context / [^$Sp]
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# Form 1: a terminator, any closing punctuation, any spaces, then a $Sep.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $Sep;
|
|
|
|
# Form 2: a terminator, any closing punctuation, then one or more spaces.
# The set after '/' is trailing context: the next character must not be a
# $Sp, so the match stops after the last space of the run.
($TermEx | $ATermEx) $Format* ($CloseEx $Format*)* ($SpEx $Format*)* $SpEx / [^$Sp];
|
2002-08-30 21:37:59 +00:00
|
|
|
|
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 11 (partly covered by rules 9 and 10 above)
|
|
|
|
# A terminator on its own (plus its extenders) is a complete match, covering
# the case where no Close, Sp or Sep follows it.
$TermEx;
|
|
|
|
# Same for the '.' terminator.
$ATermEx;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 12
|
|
|
|
|
|
|
|
# A run of ordinary characters (anything that is not a Term, ATerm or Sep),
# each with its extenders.
([^$Term $ATerm $Sep] $Extend*)+;
|
|
|
|
# The same kind of run, extended through the Term, ATerm or Sep that follows it.
([^$Term $ATerm $Sep] $Extend* $Format*)+ ($Term | $ATerm | $Sep);
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
|
|
|
# Reverse Rules
|
|
|
|
#
|
2003-11-05 23:50:39 +00:00
|
|
|
|
|
|
|
# Reverse-direction counterparts of the "Ex" forms: the $Extend characters are
# listed before the base character, the mirror image of the forward definitions.
$BackATermEx = $Extend* $ATerm;
|
|
|
|
$BackNumericEx = $Extend* $Numeric;
|
|
|
|
$BackUpperEx = $Extend* $Upper;
|
|
|
|
$BackCloseEx = $Extend* $Close;
|
|
|
|
$BackSpEx = $Extend* $Sp;
|
|
|
|
$BackLowerEx = $Extend* $Lower;
|
|
|
|
$BackTermEx = $Extend* $Term;
|
|
|
|
|
|
|
|
# rule 3
|
|
|
|
|
|
|
|
# The leading '!' marks a reverse rule (every rule in this section carries it).
# Matches a $Sep followed by any single character; the bare '.' here is the
# match-anything wildcard, not the literal period used in $ATerm.
! $Sep .;
|
|
|
|
|
|
|
|
# rule 6
|
|
|
|
|
|
|
|
# Mirror image of forward rule 6: Numeric, optional Format characters, ATerm.
! $BackNumericEx $Format* $BackATermEx;
|
|
|
|
|
|
|
|
# rule 7
|
|
|
|
|
|
|
|
# Mirror image of forward rule 7: Upper, optional Format characters, ATerm, Upper.
! $BackUpperEx $Format* $BackATermEx $BackUpperEx;
|
|
|
|
|
|
|
|
# rule 8
|
|
|
|
|
|
|
|
# Mirror image of forward rule 8: Lower, a run of non-letter/non-separator
# characters, one space, any closing punctuation, then ATerm.
# The rule is split over two lines; the statement ends at the ';'.
# NOTE(review): like forward rule 8, this requires exactly one $BackSpEx,
# whereas UAX #29 states the rule with "Sp*" -- confirm whether that is
# intentional, and keep the two directions in step if either is changed.
! $BackLowerEx $Format* $Extend* [^$OLetter $Upper $Lower $Sep]* $Format*
|
|
|
|
$BackSpEx $Format* $BackCloseEx* $Format* $BackATermEx;
|
|
|
|
|
|
|
|
# rules 9, 10, 11, 12
|
|
|
|
|
|
|
|
# $Any: any character that is not a terminator (Term/ATerm) or a separator.
$Any = [^$Term $ATerm $Sep];
|
|
|
|
# $Safe: like $Any, but additionally excluding $Sp and $Close.
$Safe = [^$Term $ATerm $Sep $Sp $Close];
|
|
|
|
# $BackEnd: the tail of a sentence in reverse order -- any spaces, any closing
# punctuation, then the terminator itself.
$BackEnd = ($BackSpEx $Format*)* ($BackCloseEx $Format*)* ($BackTermEx | $BackATermEx);
|
|
|
|
# A sentence tail on its own.
! $BackEnd;
|
|
|
|
# Optional sentence tail, a run of ordinary characters, ending on a $Safe character.
! $BackEnd? $Any* $Safe;
|
|
|
|
# As above but ending on a $Close; the pattern after '/' is trailing context
# requiring one or more spaces and then a terminator to follow.
! $BackEnd? $Any* $Close / ($BackSpEx $Format*)+ ($BackTermEx | $BackATermEx);
|
|
|
|
# As above but ending on a $Sp, with a $Sep required as trailing context.
! $BackEnd? $Any* $Sp / $Sep;
|