2002-08-30 21:37:59 +00:00
|
|
|
#
|
2003-06-03 20:58:22 +00:00
|
|
|
# Copyright (C) 2002-2003, International Business Machines Corporation and others.
|
2002-08-30 21:37:59 +00:00
|
|
|
# All Rights Reserved.
|
|
|
|
#
|
2003-11-25 00:23:46 +00:00
|
|
|
# file: sent.txt
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
|
|
|
# ICU Sentence Break Rules
|
2003-05-16 22:05:35 +00:00
|
|
|
# See Unicode Standard Annex #29.
|
2003-05-30 16:07:39 +00:00
|
|
|
# These rules are based on TR 29 version 4.0.0
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
2003-11-25 00:23:46 +00:00
|
|
|
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2002-08-30 21:37:59 +00:00
|
|
|
#
|
|
|
|
# Character categories as defined in TR 29
|
|
|
|
#
|
2003-05-16 22:05:35 +00:00
|
|
|
# Sep: the separator characters LF (U+000A), CR (U+000D), NEL (U+0085),
#      LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
$Sep = [\u000a \u000d \u0085 \u2028 \u2029];
|
|
|
|
# Format: every character whose General_Category is Format (Cf).
$Format = [[:Format:]];
|
2002-08-30 21:37:59 +00:00
|
|
|
# Sp: White_Space characters, excluding the separators already in $Sep.
$Sp = [[:Whitespace:] - $Sep];
|
|
|
|
# Lower: characters with the Lowercase property.
$Lower = [[:Lowercase:]];
|
2003-05-16 22:05:35 +00:00
|
|
|
# Upper: titlecase letters together with characters that have the Uppercase property.
$Upper = [[:TitleCase_Letter:] [:Uppercase:]];
|
|
|
|
# OLetter ("other letter"): Alphabetic characters plus HEBREW PUNCTUATION GERESH,
#      with everything that is already in $Lower or $Upper removed.
$OLetter = [[:Alphabetic:] [:name = HEBREW PUNCTUATION GERESH:] - [$Lower $Upper]];
|
|
|
|
# Numeric: characters whose Line_Break property value is Numeric.
$Numeric = [:LineBreak = Numeric:];
|
|
|
|
|
2003-11-25 00:23:46 +00:00
|
|
|
# ATerm: the ambiguous terminator, FULL STOP (U+002E). Whether it ends a
#      sentence depends on what follows it (rules 6, 7 and 8 below).
$ATerm = [.];
|
2003-05-16 22:05:35 +00:00
|
|
|
|
|
|
|
# Term: explicit code-point list of unambiguous sentence terminators, e.g.
#      "!" (U+0021), "?" (U+003F), IDEOGRAPHIC FULL STOP (U+3002) and the
#      fullwidth/halfwidth variants in the U+FFxx range.
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0700 \u0701 \u0702 \u0964 \u1362
|
|
|
|
\u1367 \u1368 \u104a \u104b \u166e \u1803 \u1809 \u203C \u203D \u2047
|
|
|
|
\u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
|
2003-11-25 00:23:46 +00:00
|
|
|
|
2003-05-16 22:05:35 +00:00
|
|
|
# Close: opening and closing punctuation plus characters with
#      Line_Break = Quotation, excluding HEBREW PUNCTUATION GERESH and
#      anything that is already in $ATerm or $Term.
$Close = [[:Open_Punctuation:] [:Close_Punctuation:] [:Linebreak = Quotation:] -
|
|
|
|
[[:name = HEBREW PUNCTUATION GERESH:] $ATerm $Term]];
|
2003-11-25 00:23:46 +00:00
|
|
|
|
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Define extended forms of the character classes,
|
|
|
|
# incorporating trailing grapheme-extend and format characters.
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-25 00:23:46 +00:00
|
|
|
# Extend: characters with Grapheme_Extend = TRUE.
$Extend = [[:Grapheme_Extend = TRUE:]];
|
2003-11-09 06:52:44 +00:00
|
|
|
# Each *Ex pattern is the base class followed by any number of $Extend
# characters and then any number of $Format characters.
$ATermEx = $ATerm $Extend* $Format*;
|
|
|
|
$NumericEx = $Numeric $Extend* $Format*;
|
|
|
|
$UpperEx = $Upper $Extend* $Format*;
|
|
|
|
$TermEx = $Term $Extend* $Format*;
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
#
|
|
|
|
# $SepSeq keeps together CRLF as a separator. (CRLF is a grapheme cluster)
|
|
|
|
#
|
|
|
|
# SepSeq: either a single $Sep character or the two-character sequence
#      CR LF (U+000D U+000A).
$SepSeq = $Sep | \u000d\u000a;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# $InteriorChars are those that never trigger a following break.
|
|
|
|
# InteriorChars: any character that is not in $Term, $ATerm or $Sep.
$InteriorChars = [^$Term $ATerm $Sep]; #Note: includes Extend and Format chars
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
## -------------------------------------------------
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Start of the forward rule section: the rules from here up to the next
# "!!" directive are the ones used for forward iteration.
!!forward;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Rule 6. Match an ATerm (.) that does not cause a break because a number immediately follows it.
|
|
|
|
# NumberFollows: a run of interior characters, then an extended ATerm
#      directly followed by an extended Numeric.
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
|
2003-05-16 22:05:35 +00:00
|
|
|
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Rule 7. $UppersSurround matches a no-break sentence fragment containing a "." surrounded by Uppers.
|
|
|
|
# UppersSurround: a run of interior characters, then
#      UpperEx ATermEx UpperEx (an ATerm with an Upper on each side).
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Rule 8. Matches a sentence fragment containing "." that should not cause a sentence break,
|
|
|
|
# because a lower case word follows the period.
|
|
|
|
# LowerWordFollows: interior characters, an extended ATerm, optional runs of
#      Close and then Sp, any run of characters that are not
#      OLetter/Upper/Lower/Sep, and finally one Lower character.
$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
|
2002-08-30 21:37:59 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Rules 3, 9, 10, 11
|
|
|
|
# Matches a simple sentence, or the trailing part of a complex sentence,
|
|
|
|
# where a simple sentence contains no interior "."s.
|
|
|
|
# TermEndSequence: interior characters, exactly one extended Term or ATerm,
#      then an optional Close run, an optional Sp run and at most one
#      separator sequence.
$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
|
|
|
|
# EndSequence: interior characters followed by at most one separator
#      sequence; no terminator is involved.
$EndSequence = $InteriorChars* $SepSeq?;
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-25 00:23:46 +00:00
|
|
|
# Put them all together.
|
2003-11-09 06:52:44 +00:00
|
|
|
# A sentence is any number of no-break fragments (rules 6, 7, 8) followed by
# one closing sequence. The {n} suffix attaches the rule status value named
# in the trailing comment of each rule.
($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
|
|
|
|
# Same prefix, but the sentence is closed by a separator rather than a terminator.
($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
|
2003-11-25 00:23:46 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
## -------------------------------------------------
|
2002-06-25 17:23:07 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Start of the reverse rule section. The sequences below are written in the
# mirror order of their forward counterparts (compare $RULE6 with
# $NumberFollows).
!!reverse;
|
2003-11-25 00:23:46 +00:00
|
|
|
|
2003-11-09 20:32:00 +00:00
|
|
|
# rule 6
|
|
|
|
|
|
|
|
# RULE6: mirror of the forward rule-6 pattern: Numeric, Format*, Extend*, ATerm.
$RULE6 = $Numeric $Format* $Extend* $ATerm;
|
|
|
|
|
|
|
|
# rule 7
|
|
|
|
|
|
|
|
# RULE7: mirror of the forward rule-7 pattern:
#      Upper, Format*, Extend*, ATerm, Format*, Extend*, Upper.
$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;
|
|
|
|
|
|
|
|
# rule 8
|
|
|
|
|
|
|
|
# RULE8: mirror of the forward rule-8 pattern. Starts at a Lower, then takes
#      runs of non-letter/non-separator characters, Sp characters and Close
#      characters (each optionally preceded by Format* Extend*), and ends
#      at the ATerm.
$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])*
|
|
|
|
($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
|
|
|
|
$Format* $Extend* $ATerm;
|
|
|
|
|
|
|
|
# rule 9, 10, 11
|
|
|
|
|
|
|
|
# $CR $LF
|
|
|
|
# End: the end of a sentence, written in mirror order. Four alternatives:
#      1. a single $Sep;
#      2. the pair U+000A U+000D;
#      3. Sp run and Close run (with Format*/Extend* between) ending at a
#         Term or ATerm;
#      4. alternative 3 preceded by one $Sep.
# NOTE(review): the LF-then-CR order of alternative 2 presumably reflects
#      matching CR LF while scanning backwards - confirm.
$End = $Sep | \u000a\u000d
|
|
|
|
| $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
|
|
|
|
$Extend* ($Term | $ATerm)
|
|
|
|
| $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
|
|
|
|
$Extend* ($Term | $ATerm);
|
|
|
|
|
|
|
|
# rule 12
|
|
|
|
|
|
|
|
# RULE12: any character that is not in $Sep, $Term or $ATerm
#      (the same complement set as the forward $InteriorChars).
$RULE12 = [^$Sep $Term $ATerm];
|
|
|
|
|
|
|
|
# Join: any sequence, possibly empty, of the no-break fragments
#      (RULE6, RULE7, RULE8) and ordinary RULE12 characters.
$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 20:32:00 +00:00
|
|
|
# Reverse rule: a sentence ending on its own.
$End;
|
|
|
|
|
|
|
|
# Reverse rule: an optional sentence ending, then joined material, finishing
# on a RULE12 character that is neither Sp nor Close.
$End? $Join [$RULE12 - $Sp - $Close];
|
|
|
|
|
|
|
|
# forces a break at the beginning of text "$Sp blah blah blah"
|
|
|
|
# Remember: the break iterator takes the longest match.
|
|
|
|
# The "/" is the look-ahead marker: the set after it must match, but the
# match itself ends at the "/", i.e. right after the $Sp.
$End? $Join $Sp / [^$Term $ATerm $Sp $Close];
|
|
|
|
|
|
|
|
# forces a break at the beginning of text "$Close blah blah blah"
|
|
|
|
# Same shape as the $Sp variant: the match ends right after the $Close, and
# the set after "/" is look-ahead context only.
$End? $Join $Close / [^$Term $ATerm $Close];
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
## -------------------------------------------------
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 20:32:00 +00:00
|
|
|
# Start of the safe-reverse rule section.
# NOTE(review): per the ICU break-rule documentation these rules are
# presumably used to back up from an arbitrary position to a point from
# which the main rules can be applied - confirm for this ICU version.
!!safe_reverse;
|
|
|
|
|
|
|
|
# rule 4
|
|
|
|
# One or more Extend characters followed by a single non-Extend character.
$Extend+ [^$Extend];
|
2003-11-25 00:23:46 +00:00
|
|
|
|
2003-11-05 23:50:39 +00:00
|
|
|
# rule 7
|
2003-11-09 20:32:00 +00:00
|
|
|
# Sequence: Extend*, ATerm, Format*, Extend*, Upper.
$Extend* $ATerm $Format* $Extend* $Upper;
|
|
|
|
|
|
|
|
# rule 8
|
|
|
|
# One or more (Extend* Term) groups, then runs of Sp items and Close items
# (each with optional Extend before and Format after), then Extend* ATerm.
# NOTE(review): this is labelled rule 8 but starts with $Term, whereas the
# reverse-section $RULE8 starts with $Lower - confirm this is intended.
($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# rule 11
|
2003-11-09 20:32:00 +00:00
|
|
|
# A run of Sp items followed by a run of Close items. Every part is starred,
# so this pattern can also match the empty string.
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
|
|
|
|
# The same runs followed by Extend* and a single Term or ATerm.
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
## -------------------------------------------------
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# Start of the safe-forward rule section.
# NOTE(review): presumably the forward-direction counterpart of the
# safe-reverse rules - confirm against the ICU break-rule documentation.
!!safe_forward;
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 20:32:00 +00:00
|
|
|
# rule 7
|
|
|
|
|
|
|
|
# ATerm, Extend*, Format*, Upper: the ATerm-to-Upper tail of the
# $UppersSurround pattern.
$ATerm $Extend* $Format* $Upper;
|
|
|
|
|
2003-11-09 06:52:44 +00:00
|
|
|
# rule 8
|
2003-11-05 23:50:39 +00:00
|
|
|
|
2003-11-09 20:32:00 +00:00
|
|
|
# A Lower character followed by any single character ("." matches any character).
$Lower .;
|
|
|
|
|
|
|
|
# rule 11
|
|
|
|
|
|
|
|
# A run of Close items followed by a run of Sp items, each item carrying its
# own Extend*/Format* tail. Both runs are starred, so either may be empty.
($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;
|