scuffed-code/tools/unicodetools/com/ibm/text/UCD/UnicodeInvariants.txt
Mark Davis 6509d8087c ICU-4700 misc fixes
X-SVN-Rev: 18773
2005-11-08 05:20:00 +00:00

267 lines
11 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Invariance tests
# Each line indicates an invariant set relationship to be tested,
# and is of the form:
#
# line := set relation set
#
# relation := '=' // has identical contents to
# := ('>' | '⊃') // is proper superset of
# := ('≥' | '⊇') // is superset of
# := ('<' | '⊂') // is proper subset of
# := ('≤' | '⊆') // is subset of
# := '!' // has no intersection
# := '?' // none of the above (they overlap, and neither contains the other)
#
# A set is a standard UnicodeSet, but where $pv can be used to express properties
#
# pv := '$' '×'? prop (('=' | ':') value)?
#
# The × indicates that the property is taken from the previously released version.
# That is, if the current version is 4.0.1, then the × version is 4.0.0.
# If the value is missing, it is defaulted to true
# If the value is of the form «...», then the ... is interpreted as a regular expression
# The property can be the short or long form as in the PropertyAliases.txt
# The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt
#
# A UnicodeSet is a boolean combination of properties and character ranges, as you would see in
# Perl or other regular-expression languages. Examples:
# [$General_Category:Unassigned-[a-zA-Z]]
# For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html
#
# WARNING: do not use \p{...} or [:...:] syntax, since those will be
# ICU's current version of properties, not the current snapshot's.
# Use the $ notation for properties (listed above) instead.
#
# When this file is parsed, an error message may contain <@>
# to indicate the location of an error in the input line.
# General Constants
# Unions of General_Category values that are reused by later lines in this file.
# All punctuation categories
Let $gcAllPunctuation = [$gc:Dash_Punctuation $gc:Open_Punctuation $gc:Close_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation]
# All symbol categories
Let $gcAllSymbols = [$gc:Math_Symbol $gc:Currency_Symbol $gc:Modifier_Symbol $gc:Other_Symbol]
# All mark categories
Let $gcAllMarks = [$gc:Enclosing_Mark $gc:Spacing_Mark $gc:Nonspacing_Mark]
##### EXAMPLES OF USAGE #####
#Show [[^$gc:unassigned]-[^$×gc:unassigned]-[^$dt:none]]
#$GC:Zs ! $GC:Zp
#$East_Asian_Width:Neutral ? $GC:Uppercase_Letter
# NOTE: unlike the other examples in this section, the next line is not commented out,
# so it is evaluated as a real test: the two sets must overlap, with neither containing the other.
$GC:Zs ? $Name:«.*SPACE.*»
#$Script:Common ! [$Alphabetic - $Math]
# $Pattern_Whitespace = [$Whitespace \u200E \u200F]
# $Pattern_Syntax = [$gcAllSymbols $gcAllPunctuation [\u2190-\u2BFF\u2e00-\u2e7F]]
# $Pattern_Syntax ! $Alphabetic
# $Pattern_Syntax ! $ID_Continue
# [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126]
# $script:greek = $×script:greek
# $gc:lm = $script:inherited
# Examples of parsing errors
# $LBA:Neutral = $GC:Zp # example of nonexistent property
# $LB:foo = $GC:Zp # example of nonexistent value
# $GC:Zs @ $GC:Zp # example of unknown relation
#### REAL INVARIANTS FOLLOW ####
# For illustration, different alias styles are used
# (long and short property names, with and without underscores)
# Line_Break=Unknown must be exactly the unassigned plus the private-use code points
$Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse]
# Line_Break=OP must be exactly General_Category=Ps
$LB:OP = $GC:Ps
# Whitespace must be a proper superset of the three separator categories
$Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl]
# Dash must be a proper superset of General_Category=Pd
$Dash ⊃ [$GC:Pd]
# Script=Common must not contain any Mn or Me characters
$Script:Common ! [$GC:Mn $GC:Me]
# Alphabetic must be a proper superset of Uppercase plus Lowercase
$Alphabetic ⊃ [$Uppercase $Lowercase]
# Numbers: the following must be equal
$General_Category:Decimal_Number = $Numeric_Type:Decimal
# Decimals are 0..9
# NOTE(review): the '.' in the regular expression below is unescaped, so it presumably matches
# any character rather than only a decimal point — confirm that this is intended.
Let $decimalValue = $Numeric_Value:«[0-9].0»
$decimalValue ⊇ $General_Category:Decimal_Number
# All and only those items with numeric types have numeric values
# NOTE(review): the '.' in this regular expression is likewise unescaped — confirm.
Let $anyNumericValue = $Numeric_Value:«-?[0-9]+.[0-9]+»
[$Numeric_Type:Decimal $Numeric_Type:Digit $Numeric_Type:Numeric] = $anyNumericValue
# Canonical decompositions (minus exclusions) must be identical across releases
# (the left side uses the current properties, the right side the × properties
# of the previously released version)
[$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion]
# Identifiers must be backwards compatible:
# each identifier property must still contain everything it contained in the previous release
$ID_Start ⊇ $×ID_Start
$ID_Continue ⊇ $×ID_Continue
$XID_Start ⊇ $×XID_Start
$XID_Continue ⊇ $×XID_Continue
# Continue must contain start
$ID_Continue ⊇ $ID_Start
$XID_Continue ⊇ $XID_Start
# Identifiers can't intersect pattern stuff
# ID_Continue, Pattern_Whitespace and Pattern_Syntax must be pairwise disjoint;
# the three lines state this once from the point of view of each set.
$ID_Continue ! [$Pattern_Whitespace $Pattern_Syntax]
$Pattern_Whitespace ! [$ID_Continue $Pattern_Syntax]
$Pattern_Syntax ! [$ID_Continue $Pattern_Whitespace]
# The same three checks with XID_Continue in place of ID_Continue
$XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax]
$Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax]
$Pattern_Syntax ! [$XID_Continue $Pattern_Whitespace]
# Test SA characters
# They are limited to certain scripts:
Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer]
# Every LineBreak=SA character must belong to one of those scripts
$SAScripts ⊇ $LineBreak:SA
# And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf
[$SAScripts & [$Alphabetic $gc:cf]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]]
# Try removing M* from alphabetic, and matching to SA
# NOTE(review): this line and the "adding M*" line below equate two different sets with the
# same $LineBreak:SA, so both can hold only if $SAScripts contains no marks at all.
# They look like alternative experiments — confirm which one is meant to be kept.
[$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA
# Try adding M* to alphabetic, and matching to SA
[$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA
# testing
# [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
# The same seven code points as in the commented-out line above
Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A]
# Everything whose WB value is not Format, Other or MidNum, plus Grapheme_Extend and Alphabetic
Let $currentword = [[^$WB:Format $WB:Other $WB:MidNum] $Grapheme_Extend $alphabetic]
# NOTE(review): the Show lines contain a single set and no relation; presumably they only
# display the resulting set instead of testing an invariant — confirm against the parser.
Show [$currentword $otherword - $ID_Continue]
Show [$currentword $otherword - [$alphabetic $anyNumericValue $gcAllMarks]]
Show [$otherword - $currentword]
Show [$name:«.*LETTER.*» - $alphabetic]
# Pattern characters are invariant!
# Add after 4.1.0
# NOTE(review): the two lines below are not commented out, so they are already active;
# the "Add after 4.1.0" remark may be stale — confirm.
# Both pattern properties must be identical to their values in the previous release (×).
$Pattern_Whitespace = $×Pattern_Whitespace
$Pattern_Syntax = $×Pattern_Syntax
#BIDI invariant constants
# Blocks and explicit ranges that are tested below for Bidi_Class R
Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF]
# Blocks and explicit ranges that are tested below for Bidi_Class AL, minus noncharacters
Let $AL_blocks = [[$block:Arabic_Supplement $block:Arabic $block:Syriac $block:Thaana $block:Arabic_Presentation_Forms_A $block:Arabic_Presentation_Forms_B [\u0750-\u077F]] -$Noncharacter_Code_Point]
#Unassigned characters in these blocks have R or AL respectively
$Bidi_Class:R ⊇ [$R_blocks & $gc:Cn]
$Bidi_Class:AL ⊇ [$AL_blocks & $gc:Cn]
# There are no strong characters of the other directionalities (out of L, AL, R) in these blocks,
# and anything R or AL is in these blocks (or is RLM, \u200F)
$R_blocks ! [$Bidi_Class:L $Bidi_Class:AL]
$AL_blocks ! [$Bidi_Class:L $Bidi_Class:R]
[$R_blocks $AL_blocks \u200F] ⊇ [$Bidi_Class:AL $Bidi_Class:R]
# Derivations must match
# Each derived property on the left must be exactly the set built on the right
# from General_Category values and the corresponding Other_* property.
$Math = [$GC:Sm $Other_Math]
$Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic]
$Lowercase = [$GC:Ll $Other_Lowercase]
$Uppercase = [$GC:Lu $Other_Uppercase]
$ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start]
$ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue]
# Default_Ignorable_Code_Point: the listed sets, minus White_Space and \uFFF9-\uFFFB
$Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]
$Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend]
# Grapheme_Base: everything except the listed categories and Grapheme_Extend
$Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend]
# "Minimal" Other_: NOT hard requirements; just if we want to be minimal
# (Should add way to make these warnings, not errors)
# Each Other_* property must contain only what the rest of its derivation does not already supply.
# NOTE(review): there is no corresponding line for $Other_ID_Continue — confirm whether that is intended.
$Other_Math = [$Math - $GC:Sm]
$Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Lowercase = [$Lowercase - $GC:Ll]
$Other_Uppercase = [$Uppercase - $GC:Lu]
$Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]]
$Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]]
$Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]]
# ===========================
# POSIX Compatibility Properties (UTS#18)
# http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html
# constants
# Single-character sets; the trailing comment gives the character each one stands for.
Let $SP = [\u0020] # [\N{space}]
Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}]
Let $LF = [\u000A] # [\N{linefeed}]
Let $VTAB = [\u000B] # [\N{LINE TABULATION}]
Let $FF = [\u000C] # [\N{formfeed}]
Let $CR = [\u000D] # [\N{carriage return}]
Let $NEL = [\u0085] # [\N{next line}]
# $ZWNJ and $ZWJ are referenced only in commented-out parts of the rules below
Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}]
Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}]
# The circled Latin letters (U+24B6..U+24E9); added to $alpha below
Let $strange = [\u24B6-\u24E9]
# For reference, the General_Category values:
# Unassigned, Control, Format, Private_Use, Surrogate,
# Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter,
# Nonspacing_Mark, Enclosing_Mark, Spacing_Mark,
# Decimal_Number, Letter_Number, Other_Number,
# Space_Separator, Line_Separator, Paragraph_Separator,
# Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation
# Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol
# UTS Rules
# Definitions of the POSIX-style classes that are tested in the constraints section below.
# alpha: Alphabetic plus $strange (the text after '#' is a disabled remainder of the set)
Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ]
Let $lower = $Lowercase
Let $upper = [$Uppercase]
# punct: all punctuation and symbols, except anything that is already in $alpha
Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha]
Let $digit = $gc:Decimal_Number
# NOTE(review): the meaning of "in both!" is not evident from this file — confirm.
Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both!
Let $alnum = [$alpha $digit]
Let $space = $Whitespace
# blank: Whitespace without the listed line-breaking characters and separator categories
Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]]
Let $cntrl = $gc:Control
# graph: everything that is not space, Control, Surrogate or Unassigned
# (the text after '#' is a disabled remainder of the set)
Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ]
# print: graph plus blank, minus cntrl
Let $print = [$graph $blank - $cntrl]
Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation]
# ===========================
# POSIX locale definition file constraints
# Each class is checked with '!' (no intersection) and/or '≥' (superset) relations.
# upper
$upper ! [$cntrl $digit $punct $space]
$upper ≥ [A-Z]
# lower
$lower ! [$cntrl $digit $punct $space]
$lower ≥ [a-z]
# alpha
$alpha ! [$cntrl $digit $punct $space]
$alpha ≥ [$lower $upper]
# digit
$digit ≥ [0-9]
# alnum ($alnum is defined above as exactly this set, so this holds by construction)
$alnum = [$alpha $digit]
# space (the text after '#' in the second line is a disabled remainder of the set)
$space ! [$upper $lower $alpha $digit $graph $xdigit]
$space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL]
$space ≥ $blank
# cntrl
$cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit]
# punct
$punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP]
# graph
$graph ≥ [$upper $lower $alpha $digit $xdigit $punct]
$graph ! [$SP $cntrl]
# print
$print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP]
$print ! $cntrl
# xdigit
$xdigit ≥ [$digit [a-f A-F]]
# blank
$blank ≥ [$SP $TAB]
# Extra POSIX 'POSIX locale' constraints
# cntrl must contain \u0000-\u001F
$cntrl ≥ [\u0000-\u001F]
# punct must contain every character in \u0021-\u007E that is not an ASCII digit or letter
$punct ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]]
# Together, the classes must cover exactly the code points that are neither unassigned nor surrogates
[$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate]