# Invariance tests # Each line indicates an invariant set relationship to be tested, # and is of the form: # # line := set relation set # # relation := '=' // has identical contents to # := ('>' | '⊃') // is proper superset of # := ('≥' | '⊇') // is superset of # := ('<' | '⊂') // is proper subset of # := ('≤' | '⊆') // is subset of # := '!' // has no intersection # := '?' // none of the above (they overlap, and neither contains the other) # # A set is a standard UnicodeSet, but where $pv can be used to express properties # # pv := '$' '×'? prop (('=' | ':') value)? # # The × indicates that the property is taken from the previously released version. # That is, if the version is 4.0.1, then the × version is 4.0.0 # If the value is missing, it defaults to true # If the value is of the form «...», then the ... is interpreted as a regular expression # The property can be the short or long form as in PropertyAliases.txt # The value (if enumerated) can be the short or long form as in PropertyValueAliases.txt # # A UnicodeSet is a boolean combination of properties and character ranges, as you would see in # Perl or other regular-expression languages. Examples: # [$General_Category:Unassigned-[a-zA-Z]] # For details, see http://oss.software.ibm.com/icu/userguide/unicodeSet.html # # WARNING: do not use \p{...} or [:...:] syntax, since those will be # ICU's current version of properties, not the current snapshot's. # Use the $ notation for properties (listed above) instead. # # When this file is parsed, an error message may contain <@> # to indicate the location of an error in the input line. 
# General Constants Let $gcAllPunctuation = [$gc:Open_Punctuation $gc:Close_Punctuation $gc:Dash_Punctuation $gc:Connector_Punctuation $gc:Other_Punctuation $gc:Initial_Punctuation $gc:Final_Punctuation] Let $gcAllSymbols = [$gc:Currency_Symbol $gc:Modifier_Symbol $gc:Math_Symbol $gc:Other_Symbol] Let $gcAllMarks = [$gc:Nonspacing_Mark $gc:Enclosing_Mark $gc:Spacing_Mark] ##### EXAMPLES OF USAGE ##### #Show [[^$gc:unassigned]-[^$×gc:unassigned]-[^$dt:none]] #$GC:Zs ! $GC:Zp #$East_Asian_Width:Neutral ? $GC:Uppercase_Letter $GC:Zs ? $Name:«.*SPACE.*» #$Script:Common ! [$Alphabetic - $Math] # $Pattern_Whitespace = [$Whitespace \u200E \u200F] # $Pattern_Syntax = [$gcAllSymbols $gcAllPunctuation [\u2190-\u2BFF\u2e00-\u2e7F]] # $Pattern_Syntax ! $Alphabetic # $Pattern_Syntax ! $ID_Continue # [$script:greek&$gc:«.*letter.*»] = [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] # $script:greek = $×script:greek # $gc:lm = $script:inherited # Examples of parsing errors # $LBA:Neutral = $GC:Zp # example of nonexistent property # $LB:foo = $GC:Zp # example of nonexistent value # $GC:Zs @ $GC:Zp # example of unknown relation #### REAL INVARIANTS FOLLOW #### # For illustration, 
different alias styles are used $Line_Break:Unknown = [$General_Category:Unassigned $GeneralCategory:PrivateUse] $LB:OP = $GC:Ps $Whitespace ⊃ [$GC:Zs $GC:Zp $GC:Zl] $Dash ⊃ [$GC:Pd] $Script:Common ! [$GC:Mn $GC:Me] $Alphabetic ⊃ [$Uppercase $Lowercase] # Numbers: the following must be equal $General_Category:Decimal_Number = $Numeric_Type:Decimal # Decimals are 0..9 Let $decimalValue = $Numeric_Value:«[0-9].0» $decimalValue ⊇ $General_Category:Decimal_Number # All and only those items with numeric types have numeric values Let $anyNumericValue = $Numeric_Value:«-?[0-9]+.[0-9]+» [$Numeric_Type:Decimal $Numeric_Type:Digit $Numeric_Type:Numeric] = $anyNumericValue # Canonical decompositions (minus exclusions) must be identical across releases [$Decomposition_Type:Canonical - $Full_Composition_Exclusion] = [$×Decomposition_Type:Canonical - $×Full_Composition_Exclusion] # Identifiers must be backwards compatible $ID_Start ⊇ $×ID_Start $ID_Continue ⊇ $×ID_Continue $XID_Start ⊇ $×XID_Start $XID_Continue ⊇ $×XID_Continue # Continue must contain start $ID_Continue ⊇ $ID_Start $XID_Continue ⊇ $XID_Start # Identifiers can't intersect pattern stuff $ID_Continue ! [$Pattern_Whitespace $Pattern_Syntax] $Pattern_Whitespace ! [$ID_Continue $Pattern_Syntax] $Pattern_Syntax ! [$ID_Continue $Pattern_Whitespace] $XID_Continue ! [$Pattern_Whitespace $Pattern_Syntax] $Pattern_Whitespace ! [$XID_Continue $Pattern_Syntax] $Pattern_Syntax ! 
[$XID_Continue $Pattern_Whitespace] # Test SA characters # They are limited to certain scripts: Let $SAScripts = [$script:thai $script:lao $script:myanmar $script:khmer] $SAScripts ⊇ $LineBreak:SA # And in those scripts, they are all the alphabetic spacing characters, plus some odd Cf [$SAScripts & [$Alphabetic $gc:cf]] = [$SAScripts & [$LineBreak:SA $LineBreak:CM]] # Try removing M* from alphabetic, and matching to SA [$SAScripts & [$Alphabetic $gc:cf - $gcAllMarks]] = $LineBreak:SA # Try adding M* to alphabetic, and matching to SA [$SAScripts & [$Alphabetic $gc:cf $gcAllMarks]] = $LineBreak:SA # testing # [$Pattern_Whitespace $Pattern_Syntax] ! [[^$WB:Format $WB:Other] \u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] Let $otherword = [\u2019 \u0027 \u02BC \u002d \u00ad \u2027 \u058A] Let $currentword = [[^$WB:Format $WB:Other $WB:MidNum] $Grapheme_Extend $alphabetic] Show [$currentword $otherword - $ID_Continue] Show [$currentword $otherword - [$alphabetic $anyNumericValue $gcAllMarks]] Show [$otherword - $currentword] Show [$name:«.*LETTER.*» - $alphabetic] # Pattern characters are invariant! # Add after 4.1.0 $Pattern_Whitespace = $×Pattern_Whitespace $Pattern_Syntax = $×Pattern_Syntax #BIDI invariant constants Let $R_blocks = [$block:Kharoshthi $block:Hebrew $block:Cypriot_Syllabary \u07C0-\u08FF \uFB1D-\uFB4F \U00010840-\U00010FFF] Let $AL_blocks = [[$block:Arabic_Supplement $block:Arabic $block:Syriac $block:Arabic $block:Thaana $block:Arabic_Presentation_Forms_A $block:Arabic_Presentation_Forms_B [\u0750-\u077F]] -$Noncharacter_Code_Point] #Unassigned characters in these blocks have R or AL respectively $Bidi_Class:R ⊇ [$R_blocks & $gc:Cn] $Bidi_Class:AL ⊇ [$AL_blocks & $gc:Cn] # There are no strong characters of the other directionalities (out of L, AL, R) in these blocks, # and anything R or AL is in these blocks (or is RLM) $R_blocks ! [$Bidi_Class:L $Bidi_Class:AL] $AL_blocks ! 
[$Bidi_Class:L $Bidi_Class:R] [$R_blocks $AL_blocks \u200F] ⊇ [$Bidi_Class:AL $Bidi_Class:R] # Derivations must match $Math = [$GC:Sm $Other_Math] $Alphabetic = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_Alphabetic] $Lowercase = [$GC:Ll $Other_Lowercase] $Uppercase = [$GC:Lu $Other_Uppercase] $ID_Start = [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl $Other_ID_Start] $ID_Continue = [$ID_Start $GC:Mn $GC:Mc $GC:Nd $GC:Pc $Other_ID_Continue] $Default_Ignorable_Code_Point = [[$Other_Default_Ignorable_Code_Point $GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]] $Grapheme_Extend = [$GC:Me $GC:Mn $Other_Grapheme_Extend] $Grapheme_Base = [^$GC:Cc $GC:Cf $GC:Cs $GC:Co $GC:Cn $GC:Zl $GC:Zp $Grapheme_Extend] # "Minimal" Other_: NOT hard requirements; just if we want to be minimal # (Should add way to make these warnings, not errors) $Other_Math = [$Math - $GC:Sm] $Other_Alphabetic = [$Alphabetic - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] $Other_Lowercase = [$Lowercase - $GC:Ll] $Other_Uppercase = [$Uppercase - $GC:Lu] $Other_ID_Start = [$ID_Start - [$GC:Lu $GC:Ll $GC:Lt $GC:Lm $GC:Lo $GC:Nl]] $Other_Default_Ignorable_Code_Point = [$Default_Ignorable_Code_Point - [[$GC:Cf $GC:Cc $GC:Cs $Variation_Selector $Noncharacter_Code_Point] - [$White_Space\uFFF9-\uFFFB]]] $Other_Grapheme_Extend = [$Grapheme_Extend - [$GC:Me $GC:Mn]] # =========================== # POSIX Compatibility Properties (UTS#18) # http://www.opengroup.org/onlinepubs/007904975/basedefs/xbd_chap07.html # constants Let $SP = [\u0020] # [\N{space}] Let $TAB = [\u0009] # [\N{CHARACTER TABULATION}] Let $LF = [\u000A] # \N{linefeed} Let $VTAB = [\u000B] # [\N{LINE TABULATION}] Let $FF = [\u000C] # [\N{formfeed}] Let $CR = [\u000D] # \N{carriage return} Let $NEL = [\u0085] # \N{next line} Let $ZWNJ = [\u200C] # [\N{ZERO WIDTH NON-JOINER}] Let $ZWJ = [\u200D] # [\N{ZERO WIDTH JOINER}] Let $strange = [\u24B6-\u24E9] # Unassigned, Control, Format, Private_Use, 
Surrogate, # Uppercase_Letter, Lowercase_Letter, Titlecase_Letter, Modifier_Letter, Other_Letter, # Nonspacing_Mark, Enclosing_Mark, Spacing_Mark, # Decimal_Number, Letter_Number, Other_Number, # Space_Separator, Line_Separator, Paragraph_Separator, # Dash_Punctuation, Open_Punctuation, Close_Punctuation, Connector_Punctuation, Other_Punctuation, Initial_Punctuation, Final_Punctuation # Math_Symbol, Currency_Symbol, Modifier_Symbol, Other_Symbol # UTS Rules Let $alpha = [$Alphabetic $strange] # $Uppercase $ZWNJ $ZWJ] Let $lower = $Lowercase Let $upper = [$Uppercase] Let $punct = [$gcAllPunctuation $gcAllSymbols - $alpha] Let $digit = $gc:Decimal_Number Let $xdigit = [$gc:Decimal_Number $Hex_Digit] # in both! Let $alnum = [$alpha $digit] Let $space = $Whitespace Let $blank = [$Whitespace - [$LF $VTAB $FF $CR $NEL $gc:Line_Separator $gc:Paragraph_Separator]] Let $cntrl = $gc:Control Let $graph = [^$space $gc:Control $gc:Surrogate $gc:Unassigned] # $ZWNJ $ZWJ] Let $print = [$graph $blank - $cntrl] Let $word = [$alpha $gcAllMarks $digit $gc:Connector_Punctuation] # =========================== # POSIX locale definition file constraints $upper ! [$cntrl $digit $punct $space] $upper ≥ [A-Z] $lower ! [$cntrl $digit $punct $space] $lower ≥ [a-z] $alpha ! [$cntrl $digit $punct $space] $alpha ≥ [$lower $upper] $digit ≥ [0-9] $alnum = [$alpha $digit] $space ! [$upper $lower $alpha $digit $graph $xdigit] $space ≥ [$SP $FF $LF $CR] # $TAB $VTAB $NEL] $space ≥ $blank $cntrl ! [$upper $lower $alpha $digit $punct $graph $print $xdigit] $punct ! [$upper $lower $alpha $digit $cntrl $xdigit $SP] $graph ≥ [$upper $lower $alpha $digit $xdigit $punct] $graph ! [$SP $cntrl] $print ≥ [$upper $lower $alpha $digit $xdigit $punct $graph $SP] $print ! 
$cntrl $xdigit ≥ [$digit [a-f A-F]] $blank ≥ [$SP $TAB] # Extra POSIX 'POSIX locale' constraints $cntrl ≥ [\u0000-\u001F] $punct ≥ [[\u0021-\u007E] - [0-9 A-Z a-z]] [$alpha $lower $upper $punct $digit $xdigit $alnum $space $blank $cntrl $graph $print $word] = [^$gc:unassigned $gc:surrogate]