#
#   Copyright (C) 2002, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt   
#
#   ICU Sentence Break Rules
#      See Unicode Technical Report #29.
#      These rules are based on the proposed draft dated 2002-08-09
#
    

#
# Character categories as defined in TR 29
#
# $Sep:     sentence separator characters: CR, LF, U+0085, U+2028 and U+2029.
$Sep     = [\u000d \u000a \u0085 \u2028 \u2029];
# $Format:  format control characters (General_Category Cf).
$Format  = [[:Cf:]];
# $Sp:      white space, excluding the $Sep characters.
$Sp      = [[:Whitespace:] - $Sep];
# $Lower:   characters with the Lowercase property.
$Lower   = [[:Lowercase:]];
# $Upper:   titlecase letters (Lt) together with characters with the Uppercase property.
$Upper   = [[:Lt:] [:Uppercase:]];
# $OLetter: characters with the Alphabetic property, plus the explicitly listed
#           ranges U+02B9-02BA, U+02C2-02CF, U+02D2-02DF, U+02E5-02ED and U+05F3.
$OLetter = [[:Alphabetic:] \u02b9-\u02ba  \u02c2-\u02cf  \u02d2-\u02df  \u02e5-\u02ed  \u05f3];

                           #  $Close: characters that may follow a sentence terminator before the
                           #  trailing spaces: close punctuation (Pe), other punctuation (Po), and
                           #  the individually listed characters.
                           #  The chars listed by number below are those with "Linebreak=QU"
$Close   = [[:Pe:] [:Po:]  \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C 
                           \u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
                           
# $ATerm:   the full stop (U+002E).  It is kept separate from the other terminators
#           because it is the only one the exception rules further down
#           ($LowerWordFollows, $UpperFollowsImmediately) apply to.
$ATerm = [\u002e];  
# $Term:    the remaining sentence terminators, listed by code point.  They take
#           no part in the exception rules.
$Term  = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
          \u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
          \uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
# $AnyTerm: a terminator of either kind.
$AnyTerm = [$ATerm $Term];	

# From Grapheme Cluster
# $Extend: nonspacing, enclosing and spacing combining marks (Mn, Me, Mc), plus
#          the two Other_Grapheme_Extend characters U+FF9E and U+FF9F.
$Extend   = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f];   #  FF9E..FF9F    ; Other_Grapheme_Extend

#
#  $SepSeq keeps together CRLF as a separator.
#  It matches either a single $Sep character or the two-character
#  sequence CR LF (U+000D U+000A).
#
$SepSeq  = $Sep | \u000d\u000a;

# $InteriorChars are those that never cause a break:
# every character that is neither a terminator ($AnyTerm) nor a separator ($Sep).
$InteriorChars = [^$AnyTerm $Sep];


# Sentence Break Rules 8, 9, 11
# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
#              that do not cause a break for one exceptional reason or another.
#
#              The pattern is, in order:
#                - any number of interior (non-terminator, non-separator) characters,
#                - an optional terminator,
#                - any mix of closing punctuation, further terminators, format and extend chars,
#                - any mix of terminators, format chars, spaces and extend chars,
#                - an optional separator sequence.
#              Every piece is optional.
$EndSequence       = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
                               ($AnyTerm | $Format | $Sp | $Extend)*  $SepSeq?;

# Rule 6   Matches a sentence fragment containing "." that should not cause a sentence break,
#          because a lower case word follows the period.
#          The fragment is: interior characters, the period, any run of characters that are
#          neither $OLetter nor $Upper, and then a $Lower character.
$LowerWordFollows  = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;


# Rule 7.  $UpperFollowsImmediately
#          Matches a fragment containing a "." that should not cause a sentence break
#          because an uppercase letter follows the period with no intervening spaces.
#          Only $Format and $Extend characters may sit between the period and the
#          $Upper character.
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;

# Put them all together.
# The forward rule: any number of non-breaking "." fragments (Rule 6 or Rule 7 cases),
# followed by the $EndSequence that actually ends the sentence.
($LowerWordFollows |  $UpperFollowsImmediately)*  $EndSequence;

     
#
#  Reverse Rules
#
#  The definitions below mirror the forward ones with the order of their elements reversed.
#  NOTE(review): the leading "!" on the final rule is taken to be the rule compiler's
#  marker for a reverse rule - confirm against the rule syntax in use.
#
#  $EndGorp:                 any one character that can appear in the tail of $EndSequence,
#                            or a separator.
#  $RevEndSequence:          a run of $EndGorp, then interior characters, then another run
#                            of $EndGorp.
#  $ReverseLowerWordFollows: $LowerWordFollows with its elements in reverse order.
#  $ReverseUpperFollowsIm:   $UpperFollowsImmediately with its elements in reverse order.
#
$EndGorp                  = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
$RevEndSequence           = $EndGorp* $InteriorChars* $EndGorp*;
$ReverseLowerWordFollows  = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
$ReverseUpperFollowsIm    = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;

# The reverse rule: an optional reversed end sequence, any number of reversed
# non-breaking "." fragments, and then optionally one more character of any kind.
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
# Disabled alternative reverse rule (matches everything):
#! .*;