#
#   Copyright (C) 2002-2005, International Business Machines Corporation and others.
#       All Rights Reserved.
#
#   file:  sent.txt
#
#   ICU Sentence Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on TR 29 version 4.1.0
#
#   Layout of this file:
#      1. Character categories (one set per Sentence_Break property value).
#      2. "Ex" (extended) forms: a base character plus trailing Extend / Format characters.
#      3. Forward rules  (after !!forward).
#      4. Reverse rules  (after !!reverse).
#

#
# Character categories as defined in TR 29
#   Each variable is the set of characters having the named Sentence_Break value.
#
$Sep     = [\p{Sentence_Break = Sep}];
$Format  = [\p{Sentence_Break = Format}];
$Sp      = [\p{Sentence_Break = Sp}];
$Lower   = [\p{Sentence_Break = Lower}];
$Upper   = [\p{Sentence_Break = Upper}];
$OLetter = [\p{Sentence_Break = OLetter}];
$Numeric = [\p{Sentence_Break = Numeric}];
$ATerm   = [\p{Sentence_Break = ATerm}];
$STerm   = [\p{Sentence_Break = STerm}];
$Close   = [\p{Sentence_Break = Close}];

# General_Category = Surrogate.
# NOTE(review): $Surrogate is not referenced by any rule in this file.
$Surrogate = [\p{GC=Surrogate}];

#
# Define extended forms of the character classes,
#   incorporate grapheme cluster + format chars.
#   Rules 4 and 5.
#
#   Every "Ex" form is the base class followed by any number of $Extend
#   characters and then any number of $Format characters.
#
$CR      = \u000d;
$LF      = \u000a;
$Extend  = [[:Grapheme_Extend = TRUE:]];
$Control = [\p{Grapheme_Cluster_Break = Control}];

# $SpEx differs from the other Ex forms: only a $Sp character that is NOT also
# in $Control may be followed by $Extend characters; a bare $Sp takes none.
$SpEx      = ($Sp | ([[$Sp]-[$Control]] $Extend*)) $Format*;
$LowerEx   = $Lower $Extend* $Format*;
$UpperEx   = $Upper $Extend* $Format*;
$OLetterEx = $OLetter $Extend* $Format*;
$NumericEx = $Numeric $Extend* $Format*;
$ATermEx   = $ATerm $Extend* $Format*;
$STermEx   = $STerm $Extend* $Format*;
$CloseEx   = $Close $Extend* $Format*;
# NOTE(review): $LowerEx and $OLetterEx are defined but not used by the forward
# rules below (Rule 8 uses the plain $Lower set).

## -------------------------------------------------
## Forward rules
## NOTE(review): !!chain presumably enables rule chaining for every rule that
## follows - confirm against the ICU break-rule syntax documentation.

!!chain;
!!forward;

# Rule 3 - break after separators.  Keep CR/LF together.
#
$CR $LF;

# Rule 4 - don't break grapheme clusters, including optional trailing format chars.
#   Two rules so that at least one $Extend or one $Format must follow the
#   base character (which may be anything except $Control or $Sep).
#
[^$Control $Sep] $Extend+ $Format*;
[^$Control $Sep] $Extend* $Format+;

# Rule 6 - an ATerm followed by a Numeric.
$ATermEx $NumericEx;

# Rule 7 - an ATerm with an Upper on both sides.
$UpperEx $ATermEx $UpperEx;

# Rule 8 - an ATerm, optional Close and Sp characters, any run of
#   "not a letter, separator or terminator" characters, then a Lower.
#   $NotLettersEx has two alternatives: the first excludes $Control and allows
#   trailing $Extend; the second allows $Control but only trailing $Format.
$NotLettersEx = ([^$OLetter $Upper $Lower $Sep $ATerm $STerm $Control] $Extend* $Format*) |
                ([^$OLetter $Upper $Lower $Sep $ATerm $STerm] $Format*);
$ATermEx $CloseEx* $SpEx* $NotLettersEx* $Lower;

# Rule 9, 10, 11 - a sentence terminator (STerm or ATerm) followed by optional
#   Close characters, optional Sp characters and at most one Sep.
($STermEx | $ATermEx) $CloseEx* $SpEx* $Sep?;

# Rule 12 - everything else.  Starts at a character that is none of
#   STerm / ATerm / Close / Sp / Sep / Format / Extend (or at {bof}), skips any
#   Extend / Format / Close / Sp characters, then takes one more character.
#   The second form ends on a Sep, {eof} or a CR LF pair instead.
#   NOTE(review): {bof} / {eof} look like beginning-/end-of-input markers and the
#   trailing {100} like a rule status tag - confirm against the RBBI syntax docs.
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* .;
[[^$STerm $ATerm $Close $Sp $Sep $Format $Extend]{bof}] ($Extend | $Format | $Close | $Sp)* ([$Sep{eof}] | $CR $LF){100};

## -------------------------------------------------
## Reverse rules

!!reverse;

# Reversed ("_R") counterparts of the Ex forms: the same pieces in the opposite
# order, i.e. $Format* first, then $Extend, then the base character.
# Note that $SpEx_R requires $Extend+ before a non-Control $Sp, whereas the
# forward $SpEx uses $Extend*.
$SpEx_R      = $Format* ($Sp | ( $Extend+[[$Sp]-[$Control]]));
$LowerEx_R   = $Format* $Extend* $Lower;
$UpperEx_R   = $Format* $Extend* $Upper;
$OLetterEx_R = $Format* $Extend* $OLetter;
$NumericEx_R = $Format* $Extend* $Numeric;
$ATermEx_R   = $Format* $Extend* $ATerm;
$STermEx_R   = $Format* $Extend* $STerm;
$CloseEx_R   = $Format* $Extend* $Close;
# NOTE(review): only $SpEx_R, $CloseEx_R, $STermEx_R and $ATermEx_R are used by
# the reverse rule below; the Lower / Upper / OLetter / Numeric forms are unused.

#
#  Reverse rules.
#     For now, use the old style inexact reverse rules, which are easier
#     to write, but less efficient.
#     TODO:  exact reverse rules.  It appears that exact reverse rules
#            may require improving support for look-ahead breaks in the
#            builder.  Needs more investigation.
#
#  The single rule below, read in reverse text order: optional {bof}, a run of
#  non-Sep characters, at most one arbitrary character or an LF CR pair
#  (CR LF seen backwards), another run of non-Sep characters, a Sep or {eof},
#  and optionally a reversed "terminator, Close*, Sp*" sequence.
#
[{bof}] [^$Sep]* (.? | $LF $CR) [^$Sep]* [$Sep {eof}] ($SpEx_R* $CloseEx_R* ($STermEx_R | $ATermEx_R))?;