scuffed-code/icu4c/source/data/brkitr/sent.txt
2005-04-22 21:49:52 +00:00

147 lines
4.1 KiB
Plaintext

#
# Copyright (C) 2002-2005, International Business Machines Corporation and others.
# All Rights Reserved.
#
# file: sent.txt
#
# ICU Sentence Break Rules
# See Unicode Standard Annex #29.
# These rules are based on TR 29 version 4.0.0
#
#
# Character categories as defined in TR 29
#
# Each variable below is the set of all characters carrying the named
# Sentence_Break property value. The variable names match the property
# value names, with one exception: $Term is bound to the value STerm.
#
$Sep = [\p{Sentence_Break = Sep}];
$Format = [\p{Sentence_Break = Format}];
$Sp = [\p{Sentence_Break = Sp}];
$Lower = [\p{Sentence_Break = Lower}];
$Upper = [\p{Sentence_Break = Upper}];
$OLetter = [\p{Sentence_Break = OLetter}];
$Numeric = [\p{Sentence_Break = Numeric}];
$ATerm = [\p{Sentence_Break = ATerm}];
# Note: variable is named $Term, property value is STerm.
$Term = [\p{Sentence_Break = STerm}];
$Close = [\p{Sentence_Break = Close}];
#
# Define extended forms of the character classes that
# incorporate grapheme cluster + format chars.
#
# $Extend is every character with Grapheme_Extend = TRUE.
# Each "...Ex" variable is one character of the base class, followed by
# any number of $Extend characters and then any number of $Format characters.
# Only ATerm, Numeric, Upper and Term get an extended form here; the other
# classes are used unextended by the forward rules.
$Extend = [[:Grapheme_Extend = TRUE:]];
$ATermEx = $ATerm $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
$UpperEx = $Upper $Extend* $Format*;
$TermEx = $Term $Extend* $Format*;
#
# $SepSeq keeps CRLF together as a single separator. (CRLF is a grapheme cluster.)
# It matches either one $Sep character or the two-character sequence
# U+000D U+000A (CR followed by LF).
#
$SepSeq = $Sep | \u000d\u000a;
# $InteriorChars are those that never trigger a following break:
# everything that is not a $Term, an $ATerm or a $Sep.
$InteriorChars = [^$Term $ATerm $Sep]; # Note: this set includes Extend and Format chars
## -------------------------------------------------
## Forward rules.
## Three "no-break fragment" patterns (rules 6, 7, 8) describe text that
## contains an ATerm but must not end the sentence there. A full match is
## any number of such fragments followed by one of the two end sequences.
!!forward;
# Rule 6. Matches an ATerm (.) that does not cause a break because a number immediately follows it.
# Shape: interior chars, then ATerm (extended), then Numeric (extended).
$NumberFollows = $InteriorChars* $ATermEx $NumericEx;
# Rule 7. $UppersSurround matches a no-break sentence fragment containing a "."
# that is surrounded by Uppers: Upper, ATerm, Upper (each in extended form).
$UppersSurround = $InteriorChars* $UpperEx $ATermEx $UpperEx;
# Rule 8. Matches a sentence fragment containing "." that should not cause a sentence break,
# because a lower case word follows the period.
# After the ATerm: optional Close chars, optional Sp chars, then any run of
# characters that are not OLetter/Upper/Lower/Sep, and finally a Lower.
$LowerWordFollows = $InteriorChars* $ATermEx $Close* $Sp* [^$OLetter $Upper $Lower $Sep]* $Lower;
# Rules 3, 9, 10, 11
# Matches a simple sentence, or the trailing part of a complex sentence,
# where a simple sentence contains no interior "."s.
# $TermEndSequence ends with a Term or ATerm, then optional Close chars,
# optional Sp chars and an optional separator sequence.
$TermEndSequence = $InteriorChars* ($TermEx | $ATermEx) $Close* $Sp* $SepSeq?;
# $EndSequence has no terminator: interior chars and an optional separator sequence.
$EndSequence = $InteriorChars* $SepSeq?;
# Put them all together.
# Each rule is zero or more no-break fragments followed by one end sequence.
# The {n} suffix is the numeric tag attached to the rule; the trailing
# comments name the status constant each tag value is meant to correspond to.
($NumberFollows | $UppersSurround | $LowerWordFollows)* $TermEndSequence{0}; # status = UBRK_SENTENCE_TERM
($NumberFollows | $UppersSurround | $LowerWordFollows)* $EndSequence{100}; # status = UBRK_SENTENCE_SEP
## -------------------------------------------------
## Reverse rules.
## The patterns below are written in the opposite order from their forward
## counterparts (e.g. $Numeric ... $ATerm here versus $ATermEx $NumericEx in
## the forward section, and LF CR here versus CR LF in $SepSeq).
## NOTE(review): this is presumably because reverse rules see the text
## back-to-front -- confirm against the ICU break-rule documentation.
!!reverse;
# rule 6: Numeric, then Format/Extend chars, then ATerm.
$RULE6 = $Numeric $Format* $Extend* $ATerm;
# rule 7: Upper, ATerm, Upper, with Format/Extend chars allowed between them.
$RULE7 = $Upper $Format* $Extend* $ATerm $Format* $Extend* $Upper;
# rule 8: Lower, then any run of chars that are not OLetter/Upper/Lower/Sep,
# then Sp chars, then Close chars, then ATerm. Every element after the
# leading Lower may be preceded by Format/Extend chars.
$RULE8 = $Lower ($Format* $Extend* [^$OLetter $Upper $Lower $Sep])*
($Format* $Extend* $Sp)* ($Format* $Extend* $Close)*
$Format* $Extend* $ATerm;
# rule 9, 10, 11
# $CR $LF
# $End has four alternatives:
#   1. a single $Sep;
#   2. U+000A U+000D (LF then CR, i.e. the CR LF pair in the order used here);
#   3. optional Sp chars, optional Close chars, then a Term or ATerm,
#      with Format/Extend chars allowed before each element;
#   4. the same as (3) but with a leading $Sep.
$End = $Sep | \u000a\u000d
| $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
$Extend* ($Term | $ATerm)
| $Sep $Format* $Extend* $Sp* $Format* $Extend* $Close* $Format*
$Extend* ($Term | $ATerm);
# rule 12: any single char that is not Sep, Term or ATerm
# (the same set expression as $InteriorChars).
$RULE12 = [^$Sep $Term $ATerm];
# $Join is any sequence of the no-break patterns (rules 6, 7, 8) and rule-12 chars.
$Join = ($RULE6 | $RULE7 | $RULE8 | $RULE12)*;
# An end sequence on its own is a complete match.
$End;
# Optional end sequence, joined text, and a final rule-12 char that is
# neither Sp nor Close.
$End? $Join [$RULE12 - $Sp - $Close];
# Forces a break at the beginning of text "$Sp blah blah blah".
# Remember that the break iterator takes the longest match.
# NOTE(review): "/" and {eof} are taken to be ICU's look-ahead marker and
# end-of-input pseudo-character -- confirm against the ICU break-rule syntax.
$NOT_T_A_S_C = [^$Term $ATerm $Sp $Close];
$End? $Join $Sp / [$NOT_T_A_S_C {eof}];
# Forces a break at the beginning of text "$Close blah blah blah".
$NOT_T_A_C = [^$Term $ATerm $Close];
$End? $Join $Close / [$NOT_T_A_C {eof}];
## -------------------------------------------------
## Safe-reverse rules.
## NOTE(review): presumably used by the break iterator to back up to a
## position from which normal iteration can safely resume -- confirm
## against the ICU break-rule documentation.
## As in the !!reverse section, patterns are written in the opposite
## order from the forward rules.
!!safe_reverse;
# rule 4: one or more Extend chars followed by a single non-Extend char.
$Extend+ [^$Extend];
# rule 7: ATerm then Upper, with Extend/Format chars allowed around them.
$Extend* $ATerm $Format* $Extend* $Upper;
# rule 8: one or more Term chars, then Sp chars, then Close chars, then ATerm.
# NOTE(review): this rule is labelled "rule 8" but begins with $Term, whereas
# $RULE8 in the !!reverse section begins with $Lower -- confirm this is intended.
($Extend* $Term)+ ($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* $ATerm;
# rule 11: a run of Sp chars then a run of Close chars, either on its own
# (first rule) or followed by a Term or ATerm (second rule).
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)*;
($Extend* $Sp $Format*)* ($Extend* $Close $Format*)* $Extend* ($Term | $ATerm);
## -------------------------------------------------
## Safe-forward rules.
## NOTE(review): presumably the forward-direction counterpart of the
## !!safe_reverse section -- confirm against the ICU break-rule documentation.
## Patterns here are written in the same order as the forward rules.
!!safe_forward;
# rule 7: ATerm, optional Extend/Format chars, then Upper.
$ATerm $Extend* $Format* $Upper;
# rule 8: a Lower followed by any one character.
$Lower .;
# rule 11: a run of Close chars then a run of Sp chars, each optionally
# followed by Extend/Format chars.
($Close $Extend* $Format*)* ($Sp $Extend* $Format*)*;