#
#   Copyright (C) 2002, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Technical Report #29.
#      These rules are based on the proposed draft dated 2002-08-09
#
#   NOTE: a '#' starts a comment that runs to the end of the line, so every
#         rule below must stay on its own line(s), separate from any comment.
#


#
# Character categories as defined in TR 29
#
$Sep     = [\u000d \u000a \u0085 \u2028 \u2029];
$Format  = [[:Cf:]];
$Sp      = [[:Whitespace:] - $Sep];
$Lower   = [[:Lowercase:]];
$Upper   = [[:Lt:] [:Uppercase:]];
$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];

# The chars listed by number below are those with "Linebreak=QU"
$Close   = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C \u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];

# $ATerm is the plain period, which is ambiguous as a sentence terminator;
# $Term lists the other terminator characters; $AnyTerm is the union of both.
$ATerm   = [\u002e];
$Term    = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964 \u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002 \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
$AnyTerm = [$ATerm $Term];

# From Grapheme Cluster
# FF9E..FF9F ; Other_Grapheme_Extend
$Extend  = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];

#
#  $SepSeq keeps together CRLF as a separator.
#
$SepSeq  = $Sep | \u000d\u000a;

# $InteriorChars are those that never cause a break.
$InteriorChars = [^$AnyTerm $Sep];

# Sentence Break Rules 8, 9, 11
#   $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
#   that do not cause a break for one exceptional reason or another.
$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)* ($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;

# Rule 6.  $LowerWordFollows
#   Matches a sentence fragment containing "." that should not cause a sentence break,
#   because a lower case word follows the period.
$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;

# Rule 7.  $UpperFollowsImmediately
#   Matches a fragment containing a "." that should not cause a sentence break
#   because an uppercase letter follows the period with no intervening spaces.
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;

# Put them all together.
#   Forward rule: any number of non-breaking fragments (Rules 6 and 7),
#   followed by one normal sentence ending.
($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;


#
#  Reverse Rules
#
#   These mirror the forward definitions with their elements in reverse order.

# $EndGorp is any single character that may appear in the tail of a sentence:
# terminators, separators, closing punctuation, extenders, format chars, spaces.
$EndGorp                 = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence          = $EndGorp* $InteriorChars* $EndGorp*;
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
$ReverseUpperFollowsIm   = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;

# The leading '!' marks the reverse rule.
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;

# Disabled alternative reverse rule (matches any sequence); left commented out.
#! .*;