3144b2665e
X-SVN-Rev: 9823
77 lines
2.8 KiB
Plaintext
77 lines
2.8 KiB
Plaintext
#
|
|
# Copyright (C) 2002, International Business Machines Corporation and others.
|
|
# All Rights Reserved.
|
|
#
|
|
# file: sent.txt
|
|
#
|
|
# ICU Sentence Break Rules
|
|
# See Unicode Technical Report #29.
|
|
# These rules are based on the proposed draft dated 2002-08-09
|
|
#
|
|
|
|
|
|
#
|
|
# Character categories as defined in TR 29
|
|
#
|
|
# Sentence separators: CR (U+000D), LF (U+000A), NEL (U+0085),
# LINE SEPARATOR (U+2028) and PARAGRAPH SEPARATOR (U+2029).
$Sep = [\u000d \u000a \u0085 \u2028 \u2029];
|
|
# Characters whose General_Category is Cf (format controls).
$Format = [[:Cf:]];
|
|
# White space, excluding the separator characters already in $Sep.
$Sp = [[:Whitespace:] - $Sep];
|
|
# Characters with the Lowercase property.
$Lower = [[:Lowercase:]];
|
|
# Titlecase letters (General_Category Lt) plus characters with the Uppercase property.
$Upper = [[:Lt:] [:Uppercase:]];
|
|
# Alphabetic characters plus the explicitly listed ranges U+02B9-02BA, U+02C2-02CF,
# U+02D2-02DF, U+02E5-02ED and the single character U+05F3.
# NOTE(review): $Lower and $Upper are not subtracted here, so this set appears to
# overlap them; in this file it is only used negated, as [^$OLetter $Upper] - confirm intent.
$OLetter = [[:Alphabetic:] \u02b9-\u02ba \u02c2-\u02cf \u02d2-\u02df \u02e5-\u02ed \u05f3];
|
|
|
|
# $Close: closing punctuation (General_Category Pe), other punctuation (Po),
# and a list of quotation marks given by code point.
# The chars listed by number below are those with "Linebreak=QU"
# The set definition spans two lines; the closing bracket is on the second one.
# NOTE(review): [:Po:] is broad and presumably also contains characters that are
# listed in $ATerm / $Term - confirm the overlap is intended.
|
|
$Close = [[:Pe:] [:Po:] \u0022 \u0027 \u00AB \u00BB \u2018 \u2019 \u201B-\u201C
|
|
\u201D \u201F \u2039 \u203A \u23B6 \u275B-\u275E ];
|
|
|
|
# $ATerm: the ambiguous terminator, U+002E FULL STOP. A period only ends a
# sentence when neither $LowerWordFollows nor $UpperFollowsImmediately applies.
$ATerm = [\u002e];
|
|
# $Term: the remaining sentence terminators, listed by code point
# (for example U+0021 EXCLAMATION MARK, U+003F QUESTION MARK, U+3002 IDEOGRAPHIC FULL STOP).
# The set definition spans three lines.
$Term = [\u0021 \u003F \u0589 \u061F \u06D4 \u0701 \u0702 \u0700 \u0964
|
|
\u1362 \u1367 \u1368 \u1803 \u203C \u203D \u2048 \u2049 \u3002
|
|
\uFE52 \uFE57 \uFF01 \uFF0E \uFF1F \uFF61];
|
|
# $AnyTerm: union of $ATerm and $Term, i.e. any sentence-terminating character.
$AnyTerm = [$ATerm $Term];
|
|
|
|
# From Grapheme Cluster
# $Extend: nonspacing marks (Mn), enclosing marks (Me) and spacing combining
# marks (Mc), plus the two code points U+FF9E and U+FF9F.
|
|
$Extend = [[:Mn:] [:Me:] [:Mc:] \uff9e-\uff9f]; # FF9E..FF9F ; Other_Grapheme_Extend
|
|
|
|
#
|
|
# $SepSeq keeps together CRLF as a separator.
# It matches either one $Sep character or the two-character sequence
# U+000D U+000A, so a CR LF pair is consumed as a single separator.
|
|
#
|
|
$SepSeq = $Sep | \u000d\u000a;
|
|
|
|
# $InteriorChars are those that never cause a break.
# Defined as the complement of $AnyTerm and $Sep: every character that is
# neither a sentence terminator nor a separator.
|
|
$InteriorChars = [^$AnyTerm $Sep];
|
|
|
|
|
|
|
|
# Sentence Break Rules 8, 9, 11
|
|
# $EndSequence matches a "Normal" sentence, which is one not containing any extra ATerms (periods)
|
|
# that do not cause a break for one exceptional reason or another.
# Shape of the match, in order:
#   1. any run of $InteriorChars,
#   2. an optional terminator ($AnyTerm),
#   3. any run of $Close, $AnyTerm, $Format or $Extend characters,
#   4. any run of $AnyTerm, $Format, $Sp or $Extend characters,
#   5. an optional separator sequence ($SepSeq).
# Every element is optional or repeatable, so the expression can also match
# text that ends without a terminator.
# The definition spans two lines and ends at the ';' on the second one.
|
|
$EndSequence = $InteriorChars* $AnyTerm? ($Close | $AnyTerm | $Format | $Extend)*
|
|
($AnyTerm | $Format | $Sp | $Extend)* $SepSeq?;
|
|
|
|
# Rule 6. $LowerWordFollows
# Matches a sentence fragment containing "." that should not cause a sentence break,
|
|
# because a lower case word follows the period.
# Shape: any run of $InteriorChars, one $ATerm, any run of characters that are
# neither $OLetter nor $Upper, then a single $Lower character.
|
|
$LowerWordFollows = $InteriorChars* $ATerm [^$OLetter $Upper]* $Lower;
|
|
|
|
|
|
# Rule 7. $UpperFollowsImmediately
|
|
# Matches a fragment containing a "." that should not cause a sentence break
|
|
# because an uppercase letter follows the period with no intervening spaces.
# Shape: any run of $InteriorChars, one $ATerm, any run of $Format or $Extend
# characters, then a single $Upper character.
|
|
$UpperFollowsImmediately = $InteriorChars* $ATerm ($Format | $Extend)* $Upper;
|
|
|
|
# Put them all together.
# Forward rule: any number of fragments whose period does not end the sentence
# ($LowerWordFollows or $UpperFollowsImmediately), followed by one $EndSequence.
|
|
($LowerWordFollows | $UpperFollowsImmediately)* $EndSequence;
|
|
|
|
|
|
#
|
|
# Reverse Rules
|
|
#
# The definitions below mirror the forward ones with their elements written in
# the opposite order.
|
|
# $EndGorp: one character of any kind that can appear in the tail of a sentence
# (terminator, separator, closing punctuation, extend, format or space).
$EndGorp = ($AnyTerm | $Sep | $Close | $Extend | $Format | $Sp);
|
|
# $RevEndSequence: a run of $EndGorp, then a run of $InteriorChars, then another
# run of $EndGorp. All three parts may be empty.
$RevEndSequence = $EndGorp* $InteriorChars* $EndGorp*;
|
|
# $LowerWordFollows with its elements in reverse order.
$ReverseLowerWordFollows = $Lower [^$OLetter $Upper]* $ATerm $InteriorChars*;
|
|
# $UpperFollowsImmediately with its elements in reverse order.
$ReverseUpperFollowsIm = $Upper ($Format | $Extend)* $ATerm $InteriorChars*;
|
|
|
|
# The reverse rule itself: an optional $RevEndSequence, any number of reversed
# period fragments, then optionally one more character of any kind ('.').
# NOTE(review): the leading '!' presumably marks this as a reverse rule in this
# rule syntax - confirm against the break iterator rule documentation.
! $RevEndSequence? ($ReverseLowerWordFollows | $ReverseUpperFollowsIm)* .?;
|
|
# Disabled alternative reverse rule, kept commented out:
#! .*;
|
|
|