scuffed-code/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
Mark Davis 7ad388b8d6 updates for clean build
X-SVN-Rev: 16844
2004-11-12 23:17:15 +00:00

183 lines
7.6 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Invariance tests
# Each line indicates an invariant set relationship to be tested,
# and is of the form:
#
# line := set relation set
#
# relation := '=' // has identical contents to
# := ('>' | '⊃') // is proper superset of
# := ('≥' | '⊇') // is superset of
# := ('<' | '⊂') // is proper subset of
# := ('≤' | '⊆') // is subset of
# := '!' // has no intersection
# := '?' // none of the above (they overlap, and neither contains the other)
#
# A set is a standard UnicodeSet, but where $pv can be used to express properties
#
# pv := '$' '×'? prop (('=' | ':') value)?
#
# The × indicates that the property is taken from the previously released version.
# That is, if the version is 4.0.1, then the × version is 4.0.0
# If the value is missing, it is defaulted to true
# If the value is of the form «...», then the ... is interpreted as a regular expression
# The property can be the short or long form as in the PropertyAliases.txt
# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
#
# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
# Perl or other regular-expression languages. Examples:
# [$General_Category:Unassigned-[a-zA-Z]]
# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
#
# WARNING: do not use \p{...} or [:...:] syntax, since those will be
# ICU's current version of properties, not the current snapshot's.
# Use the $ notation for properties (listed above) instead.
#
# When this file is parsed, an error message may contain <@>
# to indicate the location of an error in the input line.
# The following are not very interesting, but show examples of use
#$GC:Zs ! $GC:Zp
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
# '?' asserts that the two sets overlap and that neither contains the other.
# The «...» value is interpreted as a regular expression for the Name property.
$GC:Zs ? $Name:«.*SPACE.*»
# '&' intersects the Greek-script set with the characters whose General_Category
# value matches the regular expression; the result must equal the explicit list.
[$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
# Examples of parsing errors
# $LBA:Neutral = $GC:Zp # example of non-existent property
# $LB:foo = $GC:Zp # example of non-existent value
# $GC:Zs @ $GC:Zp # example of unknown relation
# The following should be real invariants
# For illustration, different alias styles are used
# (long and short names, with and without underscores).
$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
$LB:OP = $GC:Ps
$General_Category:Decimal_Number = $Numeric_Type:Decimal
# '⊃' is a proper superset: $Whitespace must contain all three separator
# categories and at least one code point outside them.
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
# Comparisons across versions
# $× selects the previously released version of a property, so each line below
# requires the current set to be a superset of (or equal to) the previous one.
$ID_Start ⊇ $×ID_Start
$ID_Continue ⊇ $×ID_Continue
# NOTE(review): the disabled line below appears to be missing the ':' between
# the property and the value on the right-hand side ($age4.0.0) - confirm
# before re-enabling.
#$age:4.0.1 = $age4.0.0
# Derivations
# Each derived property on the left must be identical to the set expression on
# the right, which is built from General_Category values, other properties and
# the corresponding Other_* property where one is listed.
$Math = [$GC:Sm $Other_Math]
$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
$Lowercase = [$GC:Ll $Other_Lowercase]
$Uppercase = [$GC:Lu $Other_Uppercase]
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc]
# Union of the contributing sets, minus $White_Space and U+FFF9..U+FFFB.
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
# '^' complements the set: everything except the listed categories and $Grapheme_Extend.
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
# Each line requires the Other_* property to equal the derived property minus
# the part that the remaining terms of its derivation already supply.
$Other_Math = [$Math - $GC:Sm]
$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Lowercase = [$Lowercase - $GC:Ll]
$Other_Uppercase = [$Uppercase - $GC:Lu]
$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
# Testing
# The Greek script set must be identical in the current and previous versions.
$script:greek = $×script:greek
# NOTE(review): this asserts that General_Category Lm equals Script Inherited;
# under the "Testing" heading it looks like a probe of the tool's output rather
# than a real invariant - confirm whether it should stay enabled.
$gc:lm = $script:inherited
# ===========================
# Compatibility Properties (UTS#18)
# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
# constants
# NOTE: the "Let $name = set" form used below is not described in the grammar
# at the top of this file; each line binds a variable that later lines
# reference as $name.
Let $SP = [\u0020] # [\N{space}]
Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}]
Let $LF = [\u000A] # \N{linefeed}
Let $VTAB = [\u000B] # [\N{LINE TABULATION}]
Let $FF = [\u000C] # [\N{formfeed}]
Let $CR = [\u000D] # \N{carriage return}
Let $NEL = [\u0085] # \N{next line}
Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}]
Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}]
# Unions of the General_Category values for punctuation, symbols and marks.
Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol]
Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark]
# U+24B6..U+24CF are the circled Latin capital letters A-Z; this set is
# subtracted from $Uppercase when $upper is defined.
Let $strange = [\u24B6-\u24CF]
# General_Category long value names, for reference:
# Unassigned, Control, Format, Private_Use, Surrogate,
# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter,
# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark,
# Decimal_Number, Letter_Number, Other_Number,
# Space_Separator, Line_Separator, Paragraph_Separator,
# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation
# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol
# UTS Rules
# Definitions of the POSIX-style character classes that the constraint lines
# later in this file are checked against.
# NOTE(review): the trailing comments on $alpha and $graph end in a stray ']',
# which suggests they are elements that were cut out of the set - confirm
# whether they are meant to stay disabled.
Let $alpha = [$Alphabetic $Lowercase] # $Uppercase $ZWNJ $ZWJ]
Let $lower = $Lowercase
# $strange is excluded from the uppercase class.
Let $upper = [$Uppercase - $strange]
# Punctuation and symbols, minus anything already in $alpha.
Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha]
Let $digit = $gc:Decimal_Number
Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both!
Let $alnum = [$alpha $digit]
Let $space = $Whitespace
# $Whitespace with LF, VTAB, FF, CR, NEL and the Line/Paragraph separators removed.
Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]]
Let $cntrl = $gc:Control
# Complement of: $space, Control, Format, Surrogate and Unassigned.
Let $graph = [^$space $gc:Control $gc:Format $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
Let $print = [$graph $blank - $cntrl]
Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation]
# ===========================
# POSIX locale definition file constraints
# '!' requires the two sets to have no intersection; '≥' requires the left set
# to be a superset of (or equal to) the right set.
$upper ! [$cntrl $digit $punct $space]
$upper ≥ [A-Z]
$lower ! [$cntrl $digit $punct $space]
$lower ≥ [a-z]
$alpha ! [$cntrl $digit $punct $space]
$alpha ≥ [$lower $upper]
$digit ≥ [0-9]
$alnum = [$alpha $digit]
$space ! [$upper $lower $alpha $digit $graph $xdigit]
# $TAB, $VTAB and $NEL are currently left out of this check (see trailing comment).
$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL]
$space ≥ $blank
$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit]
$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP]
$graph ≥ [$upper $lower $alpha $digit $xdigit $punct]
$graph ! [$SP $cntrl]
$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP]
$print ! $cntrl
$xdigit ≥ [$digit [a-f A-F]]
$blank ≥ [$SP $TAB]
# Extra POSIX 'POSIX locale' constraints
# $cntrl must contain U+0000..U+001F.
$cntrl ≥ [\u0000-\u001F]
# $punct must contain every character in U+0021..U+007E other than ASCII digits and letters.
$punct ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]]
# The union of all the classes must equal the complement of the unassigned and
# surrogate code points.
[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate]