e977c057a9
X-SVN-Rev: 35227
2236 lines
38 KiB
Plaintext
2236 lines
38 KiB
Plaintext
# Copyright (c) 2012-2014 International Business Machines
|
||
# Corporation and others. All Rights Reserved.
|
||
#
|
||
# This file should be in UTF-8 with a signature byte sequence ("BOM").
|
||
#
|
||
# collationtest.txt: Collation test data.
|
||
#
|
||
# created on: 2012apr13
|
||
# created by: Markus W. Scherer
|
||
|
||
# A line with "** test: description" is used for verbose and error output.
|
||
|
||
# A collator can be set with "@ root" or "@ locale language-tag",
|
||
# for example "@ locale de-u-co-phonebk".
|
||
|
||
# A collator can be built with "@ rules".
|
||
# An "@ rules" line is followed by one or more lines with the tailoring rules.
|
||
|
||
# A collator can be modified with "% attribute=value" or, for script reordering, with "% reorder" followed by reorder codes (or "% reorder default").
|
||
|
||
# "* compare" tests the order (= or <) of the following strings.
|
||
# The relation can be "=" or "<" (the level of the difference is not specified)
|
||
# or "<1", "<2", "<c", "<3", "<4", "<i" (indicating the level of the difference; "<i" is the identical level).
|
||
|
||
# Test sections ("* compare") are terminated by
|
||
# definitions of new collators, changing attributes, or new test sections.
|
||
|
||
** test: simple CEs & expansions
|
||
# Many types of mappings are tested elsewhere, including via the UCA conformance tests.
|
||
# Here we mostly cover a few unusual mappings.
|
||
@ rules
|
||
&\x01 # most control codes are ignorable
|
||
<<<\u0300 # tertiary CE
|
||
&9<\x00 # NUL not ignorable
|
||
&\uA00A\uA00B=\uA002 # two long-primary CEs
|
||
&\uA00A\uA00B\u00050005=\uA003 # three CEs, require 64 bits
|
||
|
||
* compare
|
||
= \x01
|
||
= \x02
|
||
<3 \u0300
|
||
<1 9
|
||
<1 \x00
|
||
= \x01\x00\x02
|
||
<1 a
|
||
<3 a\u0300
|
||
<2 a\u0308
|
||
= ä
|
||
<1 b
|
||
<1 か # Hiragana Ka (U+304B)
|
||
<2 か\u3099 # plus voiced sound mark
|
||
= が # Hiragana Ga (U+304C)
|
||
<1 \uA00A\uA00B
|
||
= \uA002
|
||
<1 \uA00A\uA00B\u00050004
|
||
<1 \uA00A\uA00B\u00050005
|
||
= \uA003
|
||
<1 \uA00A\uA00B\u00050006
|
||
|
||
** test: contractions
|
||
# Create some interesting mappings, and map some normalization-inert characters
|
||
# (which are not subject to canonical reordering)
|
||
# to some of the same CEs to check the sequence of CEs.
|
||
@ rules
|
||
|
||
# Contractions starting with 'a' should not continue with any character < U+0300
|
||
# so that we can test a shortcut for that.
|
||
&a=ⓐ
|
||
&b<bz=ⓑ
|
||
&d<dz\u0301=ⓓ # d+z+acute
|
||
&z
|
||
<a\u0301=Ⓐ # a+acute sorts after z
|
||
<a\u0301\u0301=Ⓑ # a+acute+acute
|
||
<a\u0301\u0301\u0358=Ⓒ # a+acute+acute+dot above right
|
||
<a\u030a=Ⓓ # a+ring
|
||
<a\u0323=Ⓔ # a+dot below
|
||
<a\u0323\u0358=Ⓕ # a+dot below+dot above right
|
||
<a\u0327\u0323\u030a=Ⓖ # a+cedilla+dot below+ring
|
||
<a\u0327\u0323bz=Ⓗ # a+cedilla+dot below+b+z
|
||
|
||
&\U0001D158=⁰ # musical notehead black (has a symbol primary)
|
||
<\U0001D158\U0001D165=¼ # musical quarter note
|
||
|
||
# deliberately missing prefix contractions:
|
||
# dz
|
||
# a\u0327
|
||
# a\u0327\u0323
|
||
# a\u0327\u0323b
|
||
|
||
&\x01
|
||
<<<\U0001D165=¹ # musical stem (ccc=216)
|
||
<<<\U0001D16D=² # musical augmentation dot (ccc=226)
|
||
<<<\U0001D165\U0001D16D=³ # stem+dot (ccc=216 226)
|
||
&\u0301=❶ # acute (ccc=230)
|
||
&\u030a=❷ # ring (ccc=230)
|
||
&\u0308=❸ # diaeresis (ccc=230)
|
||
<<\u0308\u0301=❹ # diaeresis+acute (=dialytika tonos) (ccc=230 230)
|
||
&\u0327=❺ # cedilla (ccc=202)
|
||
&\u0323=❻ # dot below (ccc=220)
|
||
&\u0331=❼ # macron below (ccc=220)
|
||
<<\u0331\u0358=❽ # macron below+dot above right (ccc=220 232)
|
||
&\u0334=❾ # tilde overlay (ccc=1)
|
||
&\u0358=❿ # dot above right (ccc=232)
|
||
|
||
&\u0f71=① # tibetan vowel sign aa
|
||
&\u0f72=② # tibetan vowel sign i
|
||
# \u0f71\u0f72 # tibetan vowel sign aa + i = ii = U+0F73
|
||
&\u0f73=③ # tibetan vowel sign ii (ccc=0 but lccc=129)
|
||
|
||
** test: simple contractions
|
||
|
||
# Some strings are chosen to cause incremental contiguous contraction matching to
|
||
# go into partial matches for prefixes of contractions
|
||
# (where the prefixes are deliberately not also contractions).
|
||
# When there is no complete match, then the matching code must back out of those
|
||
# so that discontiguous contractions work as specified.
|
||
|
||
* compare
|
||
# contraction starter with no following text, or mismatch, or blocked
|
||
<1 a
|
||
= ⓐ
|
||
<1 aa
|
||
= ⓐⓐ
|
||
<1 ab
|
||
= ⓐb
|
||
<1 az
|
||
= ⓐz
|
||
|
||
* compare
|
||
<1 a
|
||
<2 a\u0308\u030a # ring blocked by diaeresis
|
||
= ⓐ❸❷
|
||
<2 a\u0327
|
||
= ⓐ❺
|
||
|
||
* compare
|
||
<2 \u0308
|
||
= ❸
|
||
<2 \u0308\u030a\u0301 # acute blocked by ring
|
||
= ❸❷❶
|
||
|
||
* compare
|
||
<1 \U0001D158
|
||
= ⁰
|
||
<1 \U0001D158\U0001D165
|
||
= ¼
|
||
|
||
# no discontiguous contraction because of missing prefix contraction d+z,
|
||
# and a starter ('z') after the 'd'
|
||
* compare
|
||
<1 dz\u0323\u0301
|
||
= dz❻❶
|
||
|
||
# contiguous contractions
|
||
* compare
|
||
<1 abz
|
||
= ⓐⓑ
|
||
<1 abzz
|
||
= ⓐⓑz
|
||
|
||
* compare
|
||
<1 a
|
||
<1 z
|
||
<1 a\u0301
|
||
= Ⓐ
|
||
<1 a\u0301\u0301
|
||
= Ⓑ
|
||
<1 a\u0301\u0301\u0358
|
||
= Ⓒ
|
||
<1 a\u030a
|
||
= Ⓓ
|
||
<1 a\u0323\u0358
|
||
= Ⓕ
|
||
<1 a\u0327\u0323\u030a # match despite missing prefix
|
||
= Ⓖ
|
||
<1 a\u0327\u0323bz
|
||
= Ⓗ
|
||
|
||
* compare
|
||
<2 \u0308\u0308\u0301 # acute blocked from first diaeresis, contracts with second
|
||
= ❸❹
|
||
|
||
* compare
|
||
<1 \U0001D158\U0001D165
|
||
= ¼
|
||
|
||
* compare
|
||
<3 \U0001D165\U0001D16D
|
||
= ³
|
||
|
||
** test: discontiguous contractions
|
||
* compare
|
||
<1 a\u0327\u030a # a+ring skips cedilla
|
||
= Ⓓ❺
|
||
<2 a\u0327\u0327\u030a # a+ring skips 2 cedillas
|
||
= Ⓓ❺❺
|
||
<2 a\u0327\u0327\u0327\u030a # a+ring skips 3 cedillas
|
||
= Ⓓ❺❺❺
|
||
<2 a\u0334\u0327\u0327\u030a # a+ring skips tilde overlay & 2 cedillas
|
||
= Ⓓ❾❺❺
|
||
<1 a\u0327\u0323 # a+dot below skips cedilla
|
||
= Ⓔ❺
|
||
<1 a\u0323\u0301\u0358 # a+dot below+dot above right: 2-char match, then skips acute
|
||
= Ⓕ❶
|
||
<2 a\u0334\u0323\u0358 # a+dot below+dot above right skips tilde overlay
|
||
= Ⓕ❾
|
||
|
||
* compare
|
||
<2 \u0331\u0331\u0358 # macron below+dot above right skips the second macron below
|
||
= ❽❼
|
||
|
||
* compare
|
||
<1 a\u0327\u0331\u0323\u030a # a+ring skips cedilla, macron below, dot below (dot blocked by macron)
|
||
= Ⓓ❺❼❻
|
||
<1 a\u0327\u0323\U0001D16D\u030a # a+dot below skips cedilla
|
||
= Ⓔ❺²❷
|
||
<2 a\u0327\u0327\u0323\u030a # a+dot below skips 2 cedillas
|
||
= Ⓔ❺❺❷
|
||
<2 a\u0327\u0323\u0323\u030a # a+dot below skips cedilla
|
||
= Ⓔ❺❻❷
|
||
<2 a\u0334\u0327\u0323\u030a # a+dot below skips tilde overlay & cedilla
|
||
= Ⓔ❾❺❷
|
||
|
||
* compare
|
||
<1 \U0001D158\u0327\U0001D165 # quarter note skips cedilla
|
||
= ¼❺
|
||
<1 a\U0001D165\u0323 # a+dot below skips stem
|
||
= Ⓔ¹
|
||
|
||
# partial contiguous match, backs up, matches discontiguous contraction
|
||
<1 a\u0327\u0323b
|
||
= Ⓔ❺b
|
||
<1 a\u0327\u0323ba
|
||
= Ⓔ❺bⓐ
|
||
|
||
# a+acute+acute+dot above right skips cedilla, continues matching 2 same-ccc combining marks
|
||
* compare
|
||
<1 a\u0327\u0301\u0301\u0358
|
||
= Ⓒ❺
|
||
|
||
# FCD but not NFD
|
||
* compare
|
||
<1 a\u0f73\u0301 # a+acute skips tibetan ii
|
||
= Ⓐ③
|
||
|
||
# FCD but the 0f71 inside the 0f73 must be skipped
|
||
# to match the discontiguous contraction of the first 0f71 with the trailing 0f72 inside the 0f73
|
||
* compare
|
||
<1 \u0f71\u0f73 # == \u0f73\u0f71 == \u0f71\u0f71\u0f72
|
||
= ③①
|
||
|
||
** test: discontiguous contractions with nested contractions
|
||
* compare
|
||
<1 a\u0323\u0308\u0301\u0358
|
||
= Ⓕ❹
|
||
<2 a\u0323\u0308\u0301\u0308\u0301\u0358
|
||
= Ⓕ❹❹
|
||
|
||
** test: discontiguous contractions with interleaved contractions
|
||
* compare
|
||
# a+ring & cedilla & macron below+dot above right
|
||
<1 a\u0327\u0331\u030a\u0358
|
||
= Ⓓ❺❽
|
||
|
||
# a+ring & 1x..3x macron below+dot above right
|
||
<2 a\u0331\u030a\u0358
|
||
= Ⓓ❽
|
||
<2 a\u0331\u0331\u030a\u0358\u0358
|
||
= Ⓓ❽❽
|
||
# also skips acute
|
||
<2 a\u0331\u0331\u0331\u030a\u0301\u0358\u0358\u0358
|
||
= Ⓓ❽❽❽❶
|
||
|
||
# a+dot below & stem+augmentation dot, followed by contiguous d+z+acute
|
||
<1 a\U0001D165\u0323\U0001D16Ddz\u0301
|
||
= Ⓔ³ⓓ
|
||
|
||
** test: some simple string comparisons
|
||
@ root
|
||
* compare
|
||
# first string compares against ""
|
||
= \u0000
|
||
< a
|
||
<1 b
|
||
<3 B
|
||
= \u0000B\u0000
|
||
|
||
** test: compare with strength=primary
|
||
% strength=primary
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
= B
|
||
|
||
** test: compare with strength=secondary
|
||
% strength=secondary
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
= B
|
||
|
||
** test: compare with strength=tertiary
|
||
% strength=tertiary
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<3 B
|
||
|
||
** test: compare with strength=quaternary
|
||
% strength=quaternary
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<3 B
|
||
|
||
** test: compare with strength=identical
|
||
% strength=identical
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<3 B
|
||
|
||
** test: côté with forwards secondary
|
||
@ root
|
||
* compare
|
||
<1 cote
|
||
<2 coté
|
||
<2 côte
|
||
<2 côté
|
||
|
||
** test: côté with forwards secondary vs. U+FFFE merge separator
|
||
# Merged sort keys: On each level, any difference in the first segment
|
||
# must trump any further difference.
|
||
* compare
|
||
<1 cote\uFFFEcôté
|
||
<2 coté\uFFFEcôte
|
||
<2 côte\uFFFEcoté
|
||
<2 côté\uFFFEcote
|
||
|
||
** test: côté with backwards secondary
|
||
% backwards=on
|
||
* compare
|
||
<1 cote
|
||
<2 côte
|
||
<2 coté
|
||
<2 côté
|
||
|
||
** test: côté with backwards secondary vs. U+FFFE merge separator
|
||
# Merged sort keys: On each level, any difference in the first segment
|
||
# must trump any further difference.
|
||
* compare
|
||
<1 cote\uFFFEcôté
|
||
<2 côte\uFFFEcoté
|
||
<2 coté\uFFFEcôte
|
||
<2 côté\uFFFEcote
|
||
|
||
** test: U+FFFE on identical level
|
||
@ root
|
||
% strength=identical
|
||
* compare
|
||
# All of these control codes are completely-ignorable, so that
|
||
# their low code points are compared with the merge separator.
|
||
# The merge separator must compare less than any other character.
|
||
<1 \uFFFE\u0001\u0002\u0003
|
||
<i \u0001\uFFFE\u0002\u0003
|
||
<i \u0001\u0002\uFFFE\u0003
|
||
<i \u0001\u0002\u0003\uFFFE
|
||
|
||
* compare
|
||
# The merge separator must even compare less than U+0000.
|
||
<1 \uFFFE\u0000\u0000
|
||
<i \u0000\uFFFE\u0000
|
||
<i \u0000\u0000\uFFFE
|
||
|
||
** test: Hani < surrogates < U+FFFD
|
||
# Note: compareUTF8() treats unpaired surrogates like U+FFFD,
|
||
# so with that the strings with surrogates will compare equal to each other
|
||
# and equal to the string with U+FFFD.
|
||
@ root
|
||
% strength=identical
|
||
* compare
|
||
<1 abz
|
||
<1 a\u4e00z
|
||
<1 a\U00020000z
|
||
<1 a\ud800z
|
||
<1 a\udbffz
|
||
<1 a\udc00z
|
||
<1 a\udfffz
|
||
<1 a\ufffdz
|
||
|
||
** test: script reordering
|
||
@ root
|
||
% reorder Hani Zzzz digit
|
||
* compare
|
||
<1 ?
|
||
<1 +
|
||
<1 丂
|
||
<1 a
|
||
<1 α
|
||
<1 5
|
||
|
||
% reorder default
|
||
* compare
|
||
<1 ?
|
||
<1 +
|
||
<1 5
|
||
<1 a
|
||
<1 α
|
||
<1 丂
|
||
|
||
** test: empty rules
|
||
@ rules
|
||
* compare
|
||
<1 a
|
||
<2 ä
|
||
<3 Ä
|
||
<1 b
|
||
|
||
** test: very simple rules
|
||
@ rules
|
||
&a=e<<<<q<<<<r<x<<<X<<y<<<Y;z,Z
|
||
% strength=quaternary
|
||
* compare
|
||
<1 a
|
||
= e
|
||
<4 q
|
||
<4 r
|
||
<1 x
|
||
<3 X
|
||
<2 y
|
||
<3 Y
|
||
<2 z
|
||
<3 Z
|
||
|
||
** test: tailoring twice before a root position: primary
|
||
@ rules
|
||
&[before 1]b<p
|
||
&[before 1]b<q
|
||
* compare
|
||
<1 a
|
||
<1 p
|
||
<1 q
|
||
<1 b
|
||
|
||
** test: tailoring twice before a root position: secondary
|
||
@ rules
|
||
&[before 2]ſ<<p
|
||
&[before 2]ſ<<q
|
||
* compare
|
||
<1 s
|
||
<2 p
|
||
<2 q
|
||
<2 ſ
|
||
|
||
# secondary-before common weight
|
||
@ rules
|
||
&[before 2]b<<p
|
||
&[before 2]b<<q
|
||
* compare
|
||
<1 a
|
||
<1 p
|
||
<2 q
|
||
<2 b
|
||
|
||
** test: tailoring twice before a root position: tertiary
|
||
@ rules
|
||
&[before 3]B<<<p
|
||
&[before 3]B<<<q
|
||
* compare
|
||
<1 b
|
||
<3 p
|
||
<3 q
|
||
<3 B
|
||
|
||
# tertiary-before common weight
|
||
@ rules
|
||
&[before 3]b<<<p
|
||
&[before 3]b<<<q
|
||
* compare
|
||
<1 a
|
||
<1 p
|
||
<3 q
|
||
<3 b
|
||
|
||
@ rules
|
||
&[before 2]b<<s
|
||
&[before 3]s<<<p
|
||
&[before 3]s<<<q
|
||
* compare
|
||
<1 a
|
||
<1 p
|
||
<3 q
|
||
<3 s
|
||
<2 b
|
||
|
||
** test: tailor after completely ignorable
|
||
@ rules
|
||
&\x00<<<x<<y
|
||
* compare
|
||
= \x00
|
||
= \x1F
|
||
<3 x
|
||
<2 y
|
||
|
||
** test: secondary tailoring gaps, ICU ticket 9362
|
||
@ rules
|
||
&[before 2]s<<'_'
|
||
&s<<r # secondary between s and ſ (long s)
|
||
&ſ<<*a-q # more than 15 between ſ and secondary CE boundary
|
||
&[before 2][first primary ignorable]<<u<<v # between secondary CE boundary & lowest secondary CE
|
||
&[last primary ignorable]<<y<<z
|
||
|
||
* compare
|
||
<2 u
|
||
<2 v
|
||
<2 \u0332 # lowest secondary CE
|
||
<2 \u0308
|
||
<2 y
|
||
<2 z
|
||
<1 s_
|
||
<2 ss
|
||
<2 sr
|
||
<2 sſ
|
||
<2 sa
|
||
<2 sb
|
||
<2 sp
|
||
<2 sq
|
||
<2 sus
|
||
<2 svs
|
||
<2 rs
|
||
|
||
** test: tertiary tailoring gaps, ICU ticket 9362
|
||
@ rules
|
||
&[before 3]t<<<'_'
|
||
&t<<<r # tertiary between t and fullwidth t
|
||
&ᵀ<<<*a-q # more than 15 between ᵀ (modifier letter T) and tertiary CE boundary
|
||
&[before 3][first secondary ignorable]<<<u<<<v # between tertiary CE boundary & lowest tertiary CE
|
||
&[last secondary ignorable]<<<y<<<z
|
||
|
||
* compare
|
||
<3 u
|
||
<3 v
|
||
# Note: The root collator currently does not map any characters to tertiary CEs.
|
||
<3 y
|
||
<3 z
|
||
<1 t_
|
||
<3 tt
|
||
<3 tr
|
||
<3 tｔ # t + fullwidth t (U+FF54)
|
||
<3 tᵀ
|
||
<3 ta
|
||
<3 tb
|
||
<3 tp
|
||
<3 tq
|
||
<3 tut
|
||
<3 tvt
|
||
<3 rt
|
||
|
||
** test: secondary & tertiary around root character
|
||
@ rules
|
||
&[before 2]m<<r
|
||
&m<<s
|
||
&[before 3]m<<<u
|
||
&m<<<v
|
||
* compare
|
||
<1 l
|
||
<1 r
|
||
<2 u
|
||
<3 m
|
||
<3 v
|
||
<2 s
|
||
<1 n
|
||
|
||
** test: secondary & tertiary around tailored item
|
||
@ rules
|
||
&m<x
|
||
&[before 2]x<<r
|
||
&x<<s
|
||
&[before 3]x<<<u
|
||
&x<<<v
|
||
* compare
|
||
<1 m
|
||
<1 r
|
||
<2 u
|
||
<3 x
|
||
<3 v
|
||
<2 s
|
||
<1 n
|
||
|
||
** test: more nesting of secondary & tertiary before
|
||
@ rules
|
||
&[before 3]m<<<u
|
||
&[before 2]m<<r
|
||
&[before 3]r<<<q
|
||
&m<<<w
|
||
&m<<t
|
||
&[before 3]w<<<v
|
||
&w<<<x
|
||
&w<<s
|
||
* compare
|
||
<1 l
|
||
<1 q
|
||
<3 r
|
||
<2 u
|
||
<3 m
|
||
<3 v
|
||
<3 w
|
||
<3 x
|
||
<2 s
|
||
<2 t
|
||
<1 n
|
||
|
||
** test: case bits
|
||
@ rules
|
||
&w<x # tailored CE getting case bits
|
||
=uv=uV=Uv=UV # 2 chars -> 1 CE
|
||
&ae=ch=cH=Ch=CH # 2 chars -> 2 CEs
|
||
&rst=yz=yZ=Yz=YZ # 2 chars -> 3 CEs
|
||
% caseFirst=lower
|
||
* compare
|
||
<1 ae
|
||
= ch
|
||
<3 cH
|
||
<3 Ch
|
||
<3 CH
|
||
<1 rst
|
||
= yz
|
||
<3 yZ
|
||
<3 Yz
|
||
<3 YZ
|
||
<1 w
|
||
<1 x
|
||
= uv
|
||
<3 uV
|
||
= Uv # mixed case on single CE cannot distinguish variations
|
||
<3 UV
|
||
|
||
** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=lower
|
||
@ rules
|
||
&\u0001<<<t<<<T # tertiary CEs
|
||
% caseFirst=lower
|
||
* compare
|
||
<1 aa
|
||
<3 aat
|
||
<3 aaT
|
||
<3 aA
|
||
<3 aAt
|
||
<3 ata
|
||
<3 aTa
|
||
|
||
** test: tertiary CEs, tertiary, caseLevel=off, caseFirst=upper
|
||
% caseFirst=upper
|
||
* compare
|
||
<1 aA
|
||
<3 aAt
|
||
<3 aa
|
||
<3 aat
|
||
<3 aaT
|
||
<3 ata
|
||
<3 aTa
|
||
|
||
** test: reset on expansion, ICU tickets 9415 & 9593
|
||
@ rules
|
||
&æ<x # tailor the last primary CE so that x sorts between ae and af
|
||
&æb=bæ # copy all reset CEs to make bæ sort the same
|
||
&각<h # copy/tailor 3 CEs to make h sort before the next Hangul syllable 갂
|
||
&⒀<<y # copy/tailor 4 CEs to make y sort with only a secondary difference
|
||
&l·=z # handle the pre-context for · when fetching reset CEs
|
||
<<u # copy/tailor 2 CEs
|
||
|
||
* compare
|
||
<1 ae
|
||
<2 æ
|
||
<1 x
|
||
<1 af
|
||
|
||
* compare
|
||
<1 aeb
|
||
<2 æb
|
||
= bæ
|
||
|
||
* compare
|
||
<1 각
|
||
<1 h
|
||
<1 갂
|
||
<1 갃
|
||
|
||
* compare
|
||
<1 · # by itself: primary CE
|
||
<1 l
|
||
<2 l· # l+middle dot has only a secondary difference from l
|
||
= z
|
||
<2 u
|
||
|
||
* compare
|
||
<1 (13)
|
||
<3 ⒀ # DUCET sets special tertiary weights in all CEs
|
||
<2 y
|
||
<1 (13[
|
||
|
||
% alternate=shifted
|
||
* compare
|
||
<1 (13)
|
||
= 13
|
||
<3 ⒀
|
||
= y # alternate=shifted removes the tailoring difference on the last CE
|
||
<1 14
|
||
|
||
** test: contraction inside extension, ICU ticket 9378
|
||
@ rules
|
||
&а<<х/й # all letters are Cyrillic
|
||
* compare
|
||
<1 ай
|
||
<2 х
|
||
|
||
** test: no duplicate tailored CEs for different reset positions with same CEs, ICU ticket 10104
|
||
@ rules
|
||
&t<x &ᵀ<y # same primary weights
|
||
&q<u &[before 1]ꝗ<v # q and ꝗ are primary adjacent
|
||
* compare
|
||
<1 q
|
||
<1 u
|
||
<1 v
|
||
<1 ꝗ
|
||
<1 t
|
||
<3 ᵀ
|
||
<1 y
|
||
<1 x
|
||
|
||
# Principle: Each rule builds on the state of preceding rules and ignores following rules.
|
||
|
||
** test: later rule does not affect earlier reset position, ICU ticket 10105
|
||
@ rules
|
||
&a < u < v < w &ov < x &b < v
|
||
* compare
|
||
<1 oa
|
||
<1 ou
|
||
<1 x # CE(o) followed by CE between u and w
|
||
<1 ow
|
||
<1 ob
|
||
<1 ov
|
||
|
||
** test: later rule does not affect earlier extension (1), ICU ticket 10105
|
||
@ rules
|
||
&a=x/b &v=b
|
||
% strength=secondary
|
||
* compare
|
||
<1 B
|
||
<1 c
|
||
<1 v
|
||
= b
|
||
* compare
|
||
<1 AB
|
||
= x
|
||
<1 ac
|
||
<1 av
|
||
= ab
|
||
|
||
** test: later rule does not affect earlier extension (2), ICU ticket 10105
|
||
@ rules
|
||
&a <<< c / e &g <<< e / l
|
||
% strength=secondary
|
||
* compare
|
||
<1 AE
|
||
= c
|
||
<2 æ
|
||
<1 agl
|
||
= ae
|
||
|
||
** test: later rule does not affect earlier extension (3), ICU ticket 10105
|
||
@ rules
|
||
&a = b / c &d = c / e
|
||
% strength=secondary
|
||
* compare
|
||
<1 AC # C is still only tertiary different from the original c
|
||
= b
|
||
<1 ade
|
||
= ac
|
||
|
||
** test: extension contains tailored character, ICU ticket 10105
|
||
@ rules
|
||
&a=e &b=u/e
|
||
* compare
|
||
<1 a
|
||
= e
|
||
<1 ba
|
||
= be
|
||
= u
|
||
|
||
** test: add simple mappings for characters with root context
|
||
@ rules
|
||
&z=· # middle dot has a prefix mapping in the CLDR root
|
||
&n=и # и (U+0438) has contractions in the root
|
||
* compare
|
||
<1 l
|
||
<2 l· # root mapping for l|· still works
|
||
<1 z
|
||
= ·
|
||
* compare
|
||
<1 n
|
||
= и
|
||
<1 И
|
||
<1 и\u0306 # root mapping for й=и\u0306 still works
|
||
= й
|
||
<3 Й
|
||
|
||
** test: add context mappings around characters with root context
|
||
@ rules
|
||
&z=·h # middle dot has a prefix mapping in the CLDR root
|
||
&n=ә|и # и (U+0438) has contractions in the root
|
||
* compare
|
||
<1 l
|
||
<2 l· # root mapping for l|· still works
|
||
<1 z
|
||
= ·h
|
||
* compare
|
||
<1 и
|
||
<3 И
|
||
<1 и\u0306 # root mapping for й=и\u0306 still works
|
||
= й
|
||
* compare
|
||
<1 әn
|
||
= әи
|
||
<1 әo
|
||
|
||
** test: many secondary CEs at the top of their range
|
||
@ rules
|
||
&[last primary ignorable]<<*\u2801-\u28ff
|
||
* compare
|
||
<2 \u0308
|
||
<2 \u2801
|
||
<2 \u2802
|
||
<2 \u2803
|
||
<2 \u2804
|
||
<2 \u28fd
|
||
<2 \u28fe
|
||
<2 \u28ff
|
||
<1 \x20
|
||
|
||
** test: many tertiary CEs at the top of their range
|
||
@ rules
|
||
&[last secondary ignorable]<<<*a-z
|
||
* compare
|
||
<3 a
|
||
<3 b
|
||
<3 c
|
||
<3 d
|
||
# e..w
|
||
<3 x
|
||
<3 y
|
||
<3 z
|
||
<2 \u0308
|
||
|
||
** test: tailor contraction together with nearly equivalent prefix, ICU ticket 10101
|
||
@ rules
|
||
&a=p|x &b=px &c=op
|
||
* compare
|
||
<1 b
|
||
= px
|
||
<3 B
|
||
<1 c
|
||
= op
|
||
<3 C
|
||
* compare
|
||
<1 ca
|
||
= opx # first contraction op, then prefix p|x
|
||
<3 cA
|
||
<3 Ca
|
||
|
||
** test: reset position with prefix (pre-context), ICU ticket 10102
|
||
@ rules
|
||
&a=p|x &px=y
|
||
* compare
|
||
<1 pa
|
||
= px
|
||
= y
|
||
<3 pA
|
||
<1 q
|
||
<1 x
|
||
|
||
** test: prefix+contraction together (1), ICU ticket 10071
|
||
@ rules
|
||
&x=a|bc
|
||
* compare
|
||
<1 ab
|
||
<1 Abc
|
||
<1 abd
|
||
<1 ac
|
||
<1 aw
|
||
<1 ax
|
||
= abc
|
||
<3 aX
|
||
<3 Ax
|
||
<1 b
|
||
<1 bb
|
||
<1 bc
|
||
<3 bC
|
||
<3 Bc
|
||
<1 bd
|
||
|
||
** test: prefix+contraction together (2), ICU ticket 10071
|
||
@ rules
|
||
&w=bc &x=a|b
|
||
* compare
|
||
<1 w
|
||
= bc
|
||
<3 W
|
||
* compare
|
||
<1 aw
|
||
<1 ax
|
||
= ab
|
||
<3 aX
|
||
<1 axb
|
||
<1 axc
|
||
= abc # prefix match a|b takes precedence over contraction match bc
|
||
<3 abC
|
||
<1 abd
|
||
<1 ay
|
||
|
||
** test: prefix+contraction together (3), ICU ticket 10071
|
||
@ rules
|
||
&x=a|b &w=bc # same rules as the previous test but in reverse order; the order should not matter here
|
||
* compare # same "compare" sequences as previous test
|
||
<1 w
|
||
= bc
|
||
<3 W
|
||
* compare
|
||
<1 aw
|
||
<1 ax
|
||
= ab
|
||
<3 aX
|
||
<1 axb
|
||
<1 axc
|
||
= abc # prefix match a|b takes precedence over contraction match bc
|
||
<3 abC
|
||
<1 abd
|
||
<1 ay
|
||
|
||
** test: no mapping p|c, falls back to contraction ch, CLDR ticket 5962
|
||
@ rules
|
||
&d=ch &v=p|ci
|
||
* compare
|
||
<1 pc
|
||
<3 pC
|
||
<1 pcH
|
||
<1 pcI
|
||
<1 pd
|
||
= pch # no-prefix contraction ch matches
|
||
<3 pD
|
||
<1 pv
|
||
= pci # prefix+contraction p|ci matches
|
||
<3 pV
|
||
|
||
** test: tailor in & around compact ranges of root primaries
|
||
# The Ogham characters U+1681..U+169A are in simple ascending order of primary CEs
|
||
# which should be reliably encoded as one range in the root elements data.
|
||
@ rules
|
||
&[before 1]ᚁ<a
|
||
&ᚁ<b
|
||
&[before 1]ᚂ<c
|
||
&ᚂ<d
|
||
&[before 1]ᚚ<y
|
||
&ᚚ<z
|
||
&[before 2]ᚁ<<r
|
||
&ᚁ<<s
|
||
&[before 3]ᚚ<<<t
|
||
&ᚚ<<<u
|
||
* compare
|
||
<1 ᣵ # U+18F5 last Canadian Aboriginal
|
||
<1 a
|
||
<1 r
|
||
<2 ᚁ
|
||
<2 s
|
||
<1 b
|
||
<1 c
|
||
<1 ᚂ
|
||
<1 d
|
||
<1 ᚃ
|
||
<1 ᚙ
|
||
<1 y
|
||
<1 t
|
||
<3 ᚚ
|
||
<3 u
|
||
<1 z
|
||
<1 ᚠ # U+16A0 first Runic
|
||
|
||
** test: suppressContractions
|
||
@ rules
|
||
&z<ch<әж [suppressContractions [·cә]]
|
||
* compare
|
||
<1 ch
|
||
<3 cH # ch was suppressed
|
||
<1 l
|
||
<1 l· # primary difference, not secondary, because l|· was suppressed
|
||
<1 ә
|
||
<2 ә\u0308 # secondary difference, not primary, because contractions for ә were suppressed
|
||
<1 әж
|
||
<3 әЖ
|
||
|
||
** test: Hangul & Jamo
|
||
@ rules
|
||
&L=\u1100 # first Jamo L
|
||
&V=\u1161 # first Jamo V
|
||
&T=\u11A8 # first Jamo T
|
||
&\uAC01<<*\u4E00-\u4EFF # first Hangul LVT syllable & lots of secondary diffs
|
||
* compare
|
||
<1 Lv
|
||
<3 LV
|
||
= \u1100\u1161
|
||
= \uAC00
|
||
<1 LVt
|
||
<3 LVT
|
||
= \u1100\u1161\u11A8
|
||
= \uAC00\u11A8
|
||
= \uAC01
|
||
<2 LVT\u0308
|
||
<2 \u4E00
|
||
<2 \u4E01
|
||
<2 \u4E80
|
||
<2 \u4EFF
|
||
<2 LV\u0308T
|
||
<1 \uAC02
|
||
|
||
** test: adjust special reset positions according to previous rules, CLDR ticket 6070
|
||
@ rules
|
||
&[last variable]<x
|
||
[maxVariable space] # has effect only after building, no effect on following rules
|
||
&[last variable]<y
|
||
&[before 1][first regular]<z
|
||
* compare
|
||
<1 ? # some punctuation
|
||
<1 x
|
||
<1 y
|
||
<1 z
|
||
<1 $ # some symbol
|
||
|
||
@ rules
|
||
&[last primary ignorable]<<x<<<y
|
||
&[last primary ignorable]<<z
|
||
* compare
|
||
<2 \u0358
|
||
<2 x
|
||
<3 y
|
||
<2 z
|
||
<1 \x20
|
||
|
||
@ rules
|
||
&[last secondary ignorable]<<<x
|
||
&[last secondary ignorable]<<<y
|
||
* compare
|
||
<3 x
|
||
<3 y
|
||
<2 \u0358
|
||
|
||
@ rules
|
||
&[before 2][first variable]<<z
|
||
&[before 2][first variable]<<y
|
||
&[before 3][first variable]<<<x
|
||
&[before 3][first variable]<<<w
|
||
&[before 1][first variable]<v
|
||
&[before 2][first variable]<<u
|
||
&[before 3][first variable]<<<t
|
||
&[before 2]\uFDD1\xA0<<s # FractionalUCA.txt: FDD1 00A0, SPACE first primary
|
||
* compare
|
||
<2 \u0358
|
||
<1 s
|
||
<2 \uFDD1\xA0
|
||
<1 t
|
||
<3 u
|
||
<2 v
|
||
<1 w
|
||
<3 x
|
||
<3 y
|
||
<2 z
|
||
<2 \t
|
||
|
||
@ rules
|
||
&[before 2][first regular]<<z
|
||
&[before 3][first regular]<<<y
|
||
&[before 1][first regular]<x
|
||
&[before 3][first regular]<<<w
|
||
&[before 2]\uFDD1\u263A<<v # FractionalUCA.txt: FDD1 263A, SYMBOL first primary
|
||
&[before 3][first regular]<<<u
|
||
&[before 1][first regular]<p # primary before the boundary: becomes variable
|
||
&[before 3][first regular]<<<t # not affected by p
|
||
&[last variable]<q # after p!
|
||
* compare
|
||
<1 ?
|
||
<1 p
|
||
<1 q
|
||
<1 t
|
||
<3 u
|
||
<3 v
|
||
<1 w
|
||
<3 x
|
||
<1 y
|
||
<3 z
|
||
<1 $
|
||
|
||
# check that p & q are indeed variable
|
||
% alternate=shifted
|
||
* compare
|
||
= ?
|
||
= p
|
||
= q
|
||
<1 t
|
||
<3 u
|
||
<3 v
|
||
<1 w
|
||
<3 x
|
||
<1 y
|
||
<3 z
|
||
<1 $
|
||
|
||
@ rules
|
||
&[before 2][first trailing]<<z
|
||
&[before 1][first trailing]<y
|
||
&[before 3][first trailing]<<<x
|
||
* compare
|
||
<1 \u4E00 # first Han, first implicit
|
||
<1 \uFDD1\uFDD0 # FractionalUCA.txt: unassigned first primary
|
||
# Note: The root collator currently does not map any characters to the trailing first boundary primary.
|
||
<1 x
|
||
<3 y
|
||
<1 z
|
||
<2 \uFFFD # The root collator currently maps U+FFFD to the first real trailing primary.
|
||
|
||
@ rules
|
||
&[before 2][first primary ignorable]<<z
|
||
&[before 2][first primary ignorable]<<y
|
||
&[before 3][first primary ignorable]<<<x
|
||
&[before 3][first primary ignorable]<<<w
|
||
* compare
|
||
= \x01
|
||
<2 w
|
||
<3 x
|
||
<3 y
|
||
<2 z
|
||
<2 \u0301
|
||
|
||
@ rules
|
||
&[before 3][first secondary ignorable]<<<y
|
||
&[before 3][first secondary ignorable]<<<x
|
||
* compare
|
||
= \x01
|
||
<3 x
|
||
<3 y
|
||
<2 \u0301
|
||
|
||
** test: canonical closure
|
||
@ rules
|
||
&X=A &U=Â
|
||
* compare
|
||
<1 U
|
||
= Â
|
||
= A\u0302
|
||
<2 Ú # U with acute
|
||
= U\u0301
|
||
= Ấ # A with circumflex & acute
|
||
= Â\u0301
|
||
= A\u0302\u0301
|
||
<1 X
|
||
= A
|
||
<2 X\u030A # with ring above
|
||
= Å
|
||
= A\u030A
|
||
= \u212B # Angstrom sign
|
||
|
||
@ rules
|
||
&x=\u5140\u55C0
|
||
* compare
|
||
<1 x
|
||
= \u5140\u55C0
|
||
= \u5140\uFA0D
|
||
= \uFA0C\u55C0
|
||
= \uFA0C\uFA0D # CJK compatibility characters
|
||
<3 X
|
||
|
||
# canonical closure on prefix rules, ICU ticket 9444
|
||
@ rules
|
||
&x=ä|ŝ
|
||
* compare
|
||
<1 äs # not tailored
|
||
<1 äx
|
||
= äŝ
|
||
= a\u0308s\u0302
|
||
= a\u0308ŝ
|
||
= äs\u0302
|
||
<3 äX
|
||
|
||
** test: conjoining Jamo map to expansions
|
||
@ rules
|
||
&gg=\u1101 # Jamo Lead consonant GG
|
||
&nj=\u11AC # Jamo Trail consonant NJ
|
||
* compare
|
||
<1 gg\u1161nj
|
||
= \u1101\u1161\u11AC
|
||
= \uAE4C\u11AC
|
||
= \uAE51
|
||
<3 gg\u1161nJ
|
||
<1 \u1100\u1100
|
||
|
||
** test: canonical tail closure, ICU ticket 5913
|
||
@ rules
|
||
&a<â
|
||
* compare
|
||
<1 a
|
||
<1 â # tailored
|
||
= a\u0302
|
||
<2 a\u0323\u0302 # discontiguous contraction
|
||
= ạ\u0302 # equivalent
|
||
= ậ # equivalent
|
||
<1 b
|
||
|
||
@ rules
|
||
&a<ạ
|
||
* compare
|
||
<1 a
|
||
<1 ạ # tailored
|
||
= a\u0323
|
||
<2 a\u0323\u0302 # contiguous contraction plus extra diacritic
|
||
= ạ\u0302 # equivalent
|
||
= ậ # equivalent
|
||
<1 b
|
||
|
||
# Tail closure should work even if there is a prefix and/or contraction.
|
||
@ rules
|
||
&a<\u5140|câ
|
||
# In order to find discontiguous contractions for \u5140|câ
|
||
# there must exist a mapping for \u5140|ca, regardless of what it maps to.
|
||
# (This follows from the UCA spec.)
|
||
&x=\u5140|ca
|
||
* compare
|
||
<1 \u5140a
|
||
= \uFA0Ca
|
||
<1 \u5140câ # tailored
|
||
= \uFA0Ccâ
|
||
= \u5140ca\u0302
|
||
= \uFA0Cca\u0302
|
||
<2 \u5140ca\u0323\u0302 # discontiguous contraction
|
||
= \uFA0Cca\u0323\u0302
|
||
= \u5140cạ\u0302
|
||
= \uFA0Ccạ\u0302
|
||
= \u5140cậ
|
||
= \uFA0Ccậ
|
||
<1 \u5140b
|
||
= \uFA0Cb
|
||
<1 \u5140x
|
||
= \u5140ca
|
||
|
||
# Double-check that without the extra mapping there will be no discontiguous match.
|
||
@ rules
|
||
&a<\u5140|câ
|
||
* compare
|
||
<1 \u5140a
|
||
= \uFA0Ca
|
||
<1 \u5140câ # tailored
|
||
= \uFA0Ccâ
|
||
= \u5140ca\u0302
|
||
= \uFA0Cca\u0302
|
||
<1 \u5140b
|
||
= \uFA0Cb
|
||
<1 \u5140ca\u0323\u0302 # no discontiguous contraction
|
||
= \uFA0Cca\u0323\u0302
|
||
= \u5140cạ\u0302
|
||
= \uFA0Ccạ\u0302
|
||
= \u5140cậ
|
||
= \uFA0Ccậ
|
||
|
||
@ rules
|
||
&a<cạ
|
||
* compare
|
||
<1 a
|
||
<1 cạ # tailored
|
||
= ca\u0323
|
||
<2 ca\u0323\u0302 # contiguous contraction plus extra diacritic
|
||
= cạ\u0302 # equivalent
|
||
= cậ # equivalent
|
||
<1 b
|
||
|
||
# ᾢ = U+1FA2 GREEK SMALL LETTER OMEGA WITH PSILI AND VARIA AND YPOGEGRAMMENI
|
||
# = 03C9 0313 0300 0345
|
||
# ccc = 0, 230, 230, 240
|
||
@ rules
|
||
&δ=αῳ
|
||
# In order to find discontiguous contractions for αῳ
|
||
# there must exist a mapping for αω, regardless of what it maps to.
|
||
# (This follows from the UCA spec.)
|
||
&ε=αω
|
||
* compare
|
||
<1 δ
|
||
= αῳ
|
||
= αω\u0345
|
||
<2 αω\u0313\u0300\u0345 # discontiguous contraction
|
||
= αὠ\u0300\u0345
|
||
= αὢ\u0345
|
||
= αᾢ
|
||
<2 αω\u0300\u0313\u0345
|
||
= αὼ\u0313\u0345
|
||
= αῲ\u0313 # not FCD
|
||
<1 ε
|
||
= αω
|
||
|
||
# Double-check that without the extra mapping there will be no discontiguous match.
|
||
@ rules
|
||
&δ=αῳ
|
||
* compare
|
||
<1 αω\u0313\u0300\u0345 # no discontiguous contraction
|
||
= αὠ\u0300\u0345
|
||
= αὢ\u0345
|
||
= αᾢ
|
||
<2 αω\u0300\u0313\u0345
|
||
= αὼ\u0313\u0345
|
||
= αῲ\u0313 # not FCD
|
||
<1 δ
|
||
= αῳ
|
||
= αω\u0345
|
||
|
||
# Add U+0315 COMBINING COMMA ABOVE RIGHT which has ccc=232.
|
||
# Tests code paths where the tailored string has a combining mark
|
||
# that does not occur in any composite's decomposition.
|
||
@ rules
|
||
&δ=αὼ\u0315
|
||
* compare
|
||
<1 αω\u0313\u0300\u0315 # Not tailored: The grave accent blocks the comma above.
|
||
= αὠ\u0300\u0315
|
||
= αὢ\u0315
|
||
<1 δ
|
||
= αὼ\u0315
|
||
= αω\u0300\u0315
|
||
<2 αω\u0300\u0315\u0345
|
||
= αὼ\u0315\u0345
|
||
= αῲ\u0315 # not FCD
|
||
|
||
** test: danish a+a vs. a-umlaut, ICU ticket 9319
|
||
@ rules
|
||
&z<aa
|
||
* compare
|
||
<1 z
|
||
<1 aa
|
||
<2 aa\u0308
|
||
= aä
|
||
|
||
** test: Jamo L with and in prefix
|
||
# Useful for the Korean "searchjl" tailoring (instead of contractions of pairs of Jamo L).
|
||
@ rules
|
||
# Jamo Lead consonant G after G or GG
|
||
&[last primary ignorable]<<\u1100|\u1100=\u1101|\u1100
|
||
# Jamo Lead consonant GG sorts like G+G
|
||
&\u1100\u1100=\u1101
|
||
# Note: Making G|GG and GG|GG sort the same as G|G+G
|
||
# would require the ability to reset on G|G+G,
|
||
# or we could make G-after-G equal to some secondary-CE character,
|
||
# and reset on a pair of those.
|
||
# (It does not matter much if there are at most two G in a row in real text.)
|
||
* compare
|
||
<1 \u1100
|
||
<2 \u1100\u1100 # only one primary from a sequence of G lead consonants
|
||
= \u1101
|
||
<2 \u1100\u1100\u1100
|
||
= \u1101\u1100
|
||
# but not = \u1100\u1101, see above
|
||
<1 \u1100\u1161
|
||
= \uAC00
|
||
<2 \u1100\u1100\u1161
|
||
= \u1100\uAC00 # prefix match from the L of the LV syllable
|
||
= \u1101\u1161
|
||
= \uAE4C
|
||
|
||
** test: proposed Korean "searchjl" tailoring with prefixes, CLDR ticket 6546
|
||
@ rules
|
||
# Low secondary CEs for Jamo V & T.
|
||
# Note: T should sort before V for proper syllable order.
|
||
&\u0332 # COMBINING LOW LINE (first primary ignorable)
|
||
<<\u1161<<\u1162
|
||
|
||
# Korean Jamo lead consonant search rules, part 2:
|
||
# Make modern compound L jamo primary equivalent to non-compound forms.
|
||
|
||
# Secondary CEs for Jamo L-after-L, greater than Jamo V & T.
|
||
&\u0313 # COMBINING COMMA ABOVE (second primary ignorable)
|
||
=\u1100|\u1100
|
||
=\u1103|\u1103
|
||
=\u1107|\u1107
|
||
=\u1109|\u1109
|
||
=\u110C|\u110C
|
||
|
||
# Compound L Jamo map to equivalent expansions of primary+secondary CE.
|
||
&\u1100\u0313=\u1101<<<\u3132 # HANGUL CHOSEONG SSANGKIYEOK, HANGUL LETTER SSANGKIYEOK
|
||
&\u1103\u0313=\u1104<<<\u3138 # HANGUL CHOSEONG SSANGTIKEUT, HANGUL LETTER SSANGTIKEUT
|
||
&\u1107\u0313=\u1108<<<\u3143 # HANGUL CHOSEONG SSANGPIEUP, HANGUL LETTER SSANGPIEUP
|
||
&\u1109\u0313=\u110A<<<\u3146 # HANGUL CHOSEONG SSANGSIOS, HANGUL LETTER SSANGSIOS
|
||
&\u110C\u0313=\u110D<<<\u3149 # HANGUL CHOSEONG SSANGCIEUC, HANGUL LETTER SSANGCIEUC
|
||
|
||
* compare
|
||
<1 \u1100\u1161
|
||
= \uAC00
|
||
<2 \u1100\u1162
|
||
= \uAC1C
|
||
<2 \u1100\u1100\u1161
|
||
= \u1100\uAC00
|
||
= \u1101\u1161
|
||
= \uAE4C
|
||
<3 \u3132\u1161
|
||
|
||
** test: Hangul syllables in prefix & in the interior of a contraction
|
||
@ rules
|
||
&x=\u1100\u1161|a\u1102\u1162z
|
||
* compare
|
||
<1 \u1100\u1161x
|
||
= \u1100\u1161a\u1102\u1162z
|
||
= \u1100\u1161a\uB0B4z
|
||
= \uAC00a\u1102\u1162z
|
||
= \uAC00a\uB0B4z
|
||
|
||
** test: digits are unsafe-backwards when numeric=on
|
||
@ root
|
||
% numeric=on
|
||
* compare
|
||
# If digits are not unsafe, then numeric collation sees "1"=="01" and "b">"a".
|
||
# We need to back up before the identical prefix "1" and compare the full numbers.
|
||
<1 11b
|
||
<1 101a
|
||
|
||
** test: simple locale data test
|
||
@ locale de
|
||
* compare
|
||
<1 a
|
||
<2 ä
|
||
<1 ae
|
||
<2 æ
|
||
|
||
@ locale de-u-co-phonebk
|
||
* compare
|
||
<1 a
|
||
<1 ae
|
||
<2 ä
|
||
<2 æ
|
||
|
||
# The following test cases were moved here from ICU 52's DataDrivenCollationTest.txt.
|
||
|
||
** test: DataDrivenCollationTest/TestMorePinyin
|
||
# Testing the primary strength.
|
||
@ locale zh
|
||
% strength=primary
|
||
* compare
|
||
< lā
|
||
= lĀ
|
||
= Lā
|
||
= LĀ
|
||
< lān
|
||
= lĀn
|
||
< lē
|
||
= lĒ
|
||
= Lē
|
||
= LĒ
|
||
< lēn
|
||
= lĒn
|
||
|
||
** test: DataDrivenCollationTest/TestLithuanian
|
||
# Lithuanian sort order.
|
||
@ locale lt
|
||
* compare
|
||
< cz
|
||
< č
|
||
< d
|
||
< iz
|
||
< j
|
||
< sz
|
||
< š
|
||
< t
|
||
< zz
|
||
< ž
|
||
|
||
** test: DataDrivenCollationTest/TestLatvian
|
||
# Latvian sort order.
|
||
@ locale lv
|
||
* compare
|
||
< cz
|
||
< č
|
||
< d
|
||
< gz
|
||
< ģ
|
||
< h
|
||
< iz
|
||
< j
|
||
< kz
|
||
< ķ
|
||
< l
|
||
< lz
|
||
< ļ
|
||
< m
|
||
< nz
|
||
< ņ
|
||
< o
|
||
< rz
|
||
< ŗ
|
||
< s
|
||
< sz
|
||
< š
|
||
< t
|
||
< zz
|
||
< ž
|
||
|
||
** test: DataDrivenCollationTest/TestEstonian
|
||
# Estonian sort order.
|
||
@ locale et
|
||
* compare
|
||
< sy
|
||
< š
|
||
< šy
|
||
< z
|
||
< zy
|
||
< ž
|
||
< v
|
||
< w
|
||
< va
|
||
< õ
|
||
< õy
|
||
< ä
|
||
< äy
|
||
< ö
|
||
< öy
|
||
< ü
|
||
< üy
|
||
< x
|
||
|
||
** test: DataDrivenCollationTest/TestAlbanian
|
||
# Albanian sort order.
|
||
@ locale sq
|
||
* compare
|
||
< cz
|
||
< ç
|
||
< d
|
||
< dz
|
||
< dh
|
||
< e
|
||
< ez
|
||
< ë
|
||
< f
|
||
< gz
|
||
< gj
|
||
< h
|
||
< lz
|
||
< ll
|
||
< m
|
||
< nz
|
||
< nj
|
||
< o
|
||
< rz
|
||
< rr
|
||
< s
|
||
< sz
|
||
< sh
|
||
< t
|
||
< tz
|
||
< th
|
||
< u
|
||
< xz
|
||
< xh
|
||
< y
|
||
< zz
|
||
< zh
|
||
|
||
** test: DataDrivenCollationTest/TestSimplifiedChineseOrder
|
||
# Sorted file has different order.
|
||
@ root
|
||
# normalization=on turned on & off automatically.
|
||
* compare
|
||
< \u5F20
|
||
< \u5F20\u4E00\u8E3F
|
||
|
||
** test: DataDrivenCollationTest/TestTibetanNormalizedIterativeCrash
|
||
# This pretty much crashes.
|
||
@ root
|
||
* compare
|
||
< \u0f71\u0f72\u0f80\u0f71\u0f72
|
||
< \u0f80
|
||
|
||
** test: DataDrivenCollationTest/TestThaiPartialSortKeyProblems
|
||
# These are examples of strings that caused trouble in partial sort key testing.
|
||
@ locale th-TH
|
||
* compare
|
||
< \u0E01\u0E01\u0E38\u0E18\u0E20\u0E31\u0E13\u0E11\u0E4C
|
||
< \u0E01\u0E01\u0E38\u0E2A\u0E31\u0E19\u0E42\u0E18
|
||
* compare
|
||
< \u0E01\u0E07\u0E01\u0E32\u0E23
|
||
< \u0E01\u0E07\u0E42\u0E01\u0E49
|
||
* compare
|
||
< \u0E01\u0E23\u0E19\u0E17\u0E32
|
||
< \u0E01\u0E23\u0E19\u0E19\u0E40\u0E0A\u0E49\u0E32
|
||
* compare
|
||
< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E22\u0E27
|
||
< \u0E01\u0E23\u0E30\u0E40\u0E08\u0E35\u0E4A\u0E22\u0E27
|
||
* compare
|
||
< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E2D
|
||
< \u0E01\u0E23\u0E23\u0E40\u0E0A\u0E49\u0E32
|
||
|
||
** test: DataDrivenCollationTest/TestJavaStyleRule
|
||
# java.text allows rules to start as '<<<x<<<y...'
|
||
# we emulate this by assuming a &[first tertiary ignorable] in this case.
|
||
@ rules
|
||
&\u0001=equal<<<z<<x<<<w &[first tertiary ignorable]=a &[first primary ignorable]=b
|
||
* compare
|
||
= a
|
||
= equal
|
||
< z
|
||
< x
|
||
= b # x had become the new first primary ignorable
|
||
< w
|
||
|
||
** test: DataDrivenCollationTest/TestShiftedIgnorable
|
||
# The UCA states that primary ignorables should be completely
|
||
# ignorable when following a shifted code point.
|
||
@ root
|
||
% alternate=shifted
|
||
% strength=quaternary
|
||
* compare
|
||
< a\u0020b
|
||
= a\u0020\u0300b
|
||
= a\u0020\u0301b
|
||
< a_b
|
||
= a_\u0300b
|
||
= a_\u0301b
|
||
< A\u0020b
|
||
= A\u0020\u0300b
|
||
= A\u0020\u0301b
|
||
< A_b
|
||
= A_\u0300b
|
||
= A_\u0301b
|
||
< a\u0301b
|
||
< A\u0301b
|
||
< a\u0300b
|
||
< A\u0300b
|
||
|
||
** test: DataDrivenCollationTest/TestNShiftedIgnorable
|
||
# The UCA states that primary ignorables should be completely
|
||
# ignorable when following a shifted code point. Here alternate=non-ignorable, so nothing is shifted and the accents below stay significant.
|
||
@ root
|
||
% alternate=non-ignorable
|
||
% strength=tertiary
|
||
* compare
|
||
< a\u0020b
|
||
< A\u0020b
|
||
< a\u0020\u0301b
|
||
< A\u0020\u0301b
|
||
< a\u0020\u0300b
|
||
< A\u0020\u0300b
|
||
< a_b
|
||
< A_b
|
||
< a_\u0301b
|
||
< A_\u0301b
|
||
< a_\u0300b
|
||
< A_\u0300b
|
||
< a\u0301b
|
||
< A\u0301b
|
||
< a\u0300b
|
||
< A\u0300b
|
||
|
||
** test: DataDrivenCollationTest/TestSafeSurrogates
|
||
# It turned out that surrogates were not skipped properly
|
||
# when iterating backwards if they were in the middle of a
|
||
# contraction. This test ensures that this is fixed.
|
||
@ rules
|
||
&a < x\ud800\udc00b
|
||
* compare
|
||
< a
|
||
< x\ud800\udc00b
|
||
|
||
** test: DataDrivenCollationTest/da_TestPrimary
|
||
# This test goes through primary strength cases
|
||
@ locale da
|
||
% strength=primary
|
||
* compare
|
||
< Lvi
|
||
< Lwi
|
||
* compare
|
||
< L\u00e4vi
|
||
< L\u00f6wi
|
||
* compare
|
||
< L\u00fcbeck
|
||
= Lybeck
|
||
|
||
** test: DataDrivenCollationTest/da_TestTertiary
|
||
# This test goes through tertiary strength cases
|
||
@ locale da
|
||
% strength=tertiary
|
||
* compare
|
||
< Luc
|
||
< luck
|
||
* compare
|
||
< luck
|
||
< L\u00fcbeck
|
||
* compare
|
||
< lybeck
|
||
< L\u00fcbeck
|
||
* compare
|
||
< L\u00e4vi
|
||
< L\u00f6we
|
||
* compare
|
||
< L\u00f6ww
|
||
< mast
|
||
|
||
* compare
|
||
< A/S
|
||
< ANDRE
|
||
< ANDR\u00c9
|
||
< ANDREAS
|
||
< AS
|
||
< CA
|
||
< \u00c7A
|
||
< CB
|
||
< \u00c7C
|
||
< D.S.B.
|
||
< DA
|
||
< \u00d0A
|
||
< DB
|
||
< \u00d0C
|
||
< DSB
|
||
< DSC
|
||
< EKSTRA_ARBEJDE
|
||
< EKSTRABUD0
|
||
< H\u00d8ST
|
||
< HAAG
|
||
< H\u00c5NDBOG
|
||
< HAANDV\u00c6RKSBANKEN
|
||
< Karl
|
||
< karl
|
||
< NIELS\u0020J\u00d8RGEN
|
||
< NIELS-J\u00d8RGEN
|
||
< NIELSEN
|
||
< R\u00c9E,\u0020A
|
||
< REE,\u0020B
|
||
< R\u00c9E,\u0020L
|
||
< REE,\u0020V
|
||
< SCHYTT,\u0020B
|
||
< SCHYTT,\u0020H
|
||
< SCH\u00dcTT,\u0020H
|
||
< SCHYTT,\u0020L
|
||
< SCH\u00dcTT,\u0020M
|
||
< SS
|
||
< \u00df
|
||
< SSA
|
||
< STORE\u0020VILDMOSE
|
||
< STOREK\u00c6R0
|
||
< STORM\u0020PETERSEN
|
||
< STORMLY
|
||
< THORVALD
|
||
< THORVARDUR
|
||
< \u00feORVAR\u00d0UR
|
||
< THYGESEN
|
||
< VESTERG\u00c5RD,\u0020A
|
||
< VESTERGAARD,\u0020A
|
||
< VESTERG\u00c5RD,\u0020B
|
||
< \u00c6BLE
|
||
< \u00c4BLE
|
||
< \u00d8BERG
|
||
< \u00d6BERG
|
||
|
||
* compare
|
||
< andere
|
||
< chaque
|
||
< chemin
|
||
< cote
|
||
< cot\u00e9
|
||
< c\u00f4te
|
||
< c\u00f4t\u00e9
|
||
< \u010du\u010d\u0113t
|
||
< Czech
|
||
< hi\u0161a
|
||
< irdisch
|
||
< lie
|
||
< lire
|
||
< llama
|
||
< l\u00f5ug
|
||
< l\u00f2za
|
||
< lu\u010d
|
||
< luck
|
||
< L\u00fcbeck
|
||
< lye
|
||
< l\u00e4vi
|
||
< L\u00f6wen
|
||
< m\u00e0\u0161ta
|
||
< m\u00eer
|
||
< myndig
|
||
< M\u00e4nner
|
||
< m\u00f6chten
|
||
< pi\u00f1a
|
||
< pint
|
||
< pylon
|
||
< \u0161\u00e0ran
|
||
< savoir
|
||
< \u0160erb\u016bra
|
||
< Sietla
|
||
< \u015blub
|
||
< subtle
|
||
< symbol
|
||
< s\u00e4mtlich
|
||
< verkehrt
|
||
< vox
|
||
< v\u00e4ga
|
||
< waffle
|
||
< wood
|
||
< yen
|
||
< yuan
|
||
< yucca
|
||
< \u017eal
|
||
< \u017eena
|
||
< \u017den\u0113va
|
||
< zoo0
|
||
< Zviedrija
|
||
< Z\u00fcrich
|
||
< zysk0
|
||
< \u00e4ndere
|
||
|
||
** test: DataDrivenCollationTest/hi_TestNewRules
|
||
# This test goes through new rules and tests against old rules
|
||
@ locale hi
|
||
* compare
|
||
< कॐ
|
||
< कं
|
||
< कँ
|
||
< कः
|
||
|
||
** test: DataDrivenCollationTest/ro_TestNewRules
|
||
# This test goes through new rules and tests against old rules
|
||
@ locale ro
|
||
* compare
|
||
< xAx
|
||
< xă
|
||
< xĂ
|
||
< Xă
|
||
< XĂ
|
||
< xăx
|
||
< xĂx
|
||
< xâ
|
||
< xÂ
|
||
< Xâ
|
||
< XÂ
|
||
< xâx
|
||
< xÂx
|
||
< xb
|
||
< xIx
|
||
< xî
|
||
< xÎ
|
||
< Xî
|
||
< XÎ
|
||
< xîx
|
||
< xÎx
|
||
< xj
|
||
< xSx
|
||
< xș
|
||
= xş
|
||
< xȘ
|
||
= xŞ
|
||
< Xș
|
||
= Xş
|
||
< XȘ
|
||
= XŞ
|
||
< xșx
|
||
= xşx
|
||
< xȘx
|
||
= xŞx
|
||
< xT
|
||
< xTx
|
||
< xț
|
||
= xţ
|
||
< xȚ
|
||
= xŢ
|
||
< Xț
|
||
= Xţ
|
||
< XȚ
|
||
= XŢ
|
||
< xțx
|
||
= xţx
|
||
< xȚx
|
||
= xŢx
|
||
< xU
|
||
|
||
** test: DataDrivenCollationTest/testOffsets
|
||
# This tests cases where forwards and backwards iteration get different offsets
|
||
@ locale en
|
||
% strength=tertiary
|
||
* compare
|
||
< a\uD800\uDC00\uDC00
|
||
< b\uD800\uDC00\uDC00
|
||
* compare
|
||
< \u0301A\u0301\u0301
|
||
< \u0301B\u0301\u0301
|
||
* compare
|
||
< abcd\r\u0301
|
||
< abce\r\u0301
|
||
# TODO: test offsets in new CollationTest
|
||
|
||
# End of test cases moved here from ICU 52's DataDrivenCollationTest.txt.
|
||
|
||
** test: was ICU 52 cmsccoll/TestRedundantRules
|
||
@ rules
|
||
& a < b < c < d& [before 1] c < m
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<1 m
|
||
<1 c
|
||
<1 d
|
||
|
||
@ rules
|
||
& a < b <<< c << d <<< e& [before 3] e <<< x
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<3 c
|
||
<2 d
|
||
<3 x
|
||
<3 e
|
||
|
||
@ rules
|
||
& a < b <<< c << d <<< e <<< f < g& [before 1] g < x
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<3 c
|
||
<2 d
|
||
<3 e
|
||
<3 f
|
||
<1 x
|
||
<1 g
|
||
|
||
@ rules
|
||
& a <<< b << c < d& a < m
|
||
* compare
|
||
<1 a
|
||
<3 b
|
||
<2 c
|
||
<1 m
|
||
<1 d
|
||
|
||
@ rules
|
||
&a<b<<b\u0301 &z<b
|
||
* compare
|
||
<1 a
|
||
<1 b\u0301
|
||
<1 z
|
||
<1 b
|
||
|
||
@ rules
|
||
&z<m<<<q<<<m
|
||
* compare
|
||
<1 z
|
||
<1 q
|
||
<3 m
|
||
|
||
@ rules
|
||
&z<<<m<q<<<m
|
||
* compare
|
||
<1 z
|
||
<1 q
|
||
<3 m
|
||
|
||
@ rules
|
||
& a < b < c < d& r < c
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<1 d
|
||
<1 r
|
||
<1 c
|
||
|
||
@ rules
|
||
& a < b < c < d& c < m
|
||
* compare
|
||
<1 a
|
||
<1 b
|
||
<1 c
|
||
<1 m
|
||
<1 d
|
||
|
||
@ rules
|
||
& a < b < c < d& a < m
|
||
* compare
|
||
<1 a
|
||
<1 m
|
||
<1 b
|
||
<1 c
|
||
<1 d
|
||
|
||
** test: was ICU 52 cmsccoll/TestExpansionSyntax
|
||
# The following two rules should sort the particular list of strings the same.
|
||
@ rules
|
||
&AE <<< a << b <<< c &d <<< f
|
||
* compare
|
||
<1 AE
|
||
<3 a
|
||
<2 b
|
||
<3 c
|
||
<1 d
|
||
<3 f
|
||
|
||
@ rules
|
||
&A <<< a / E << b / E <<< c /E &d <<< f
|
||
* compare
|
||
<1 AE
|
||
<3 a
|
||
<2 b
|
||
<3 c
|
||
<1 d
|
||
<3 f
|
||
|
||
# The following two rules should sort the particular list of strings the same.
|
||
@ rules
|
||
&AE <<< a <<< b << c << d < e < f <<< g
|
||
* compare
|
||
<1 AE
|
||
<3 a
|
||
<3 b
|
||
<2 c
|
||
<2 d
|
||
<1 e
|
||
<1 f
|
||
<3 g
|
||
|
||
@ rules
|
||
&A <<< a / E <<< b / E << c / E << d / E < e < f <<< g
|
||
* compare
|
||
<1 AE
|
||
<3 a
|
||
<3 b
|
||
<2 c
|
||
<2 d
|
||
<1 e
|
||
<1 f
|
||
<3 g
|
||
|
||
# The following two rules should sort the particular list of strings the same.
|
||
@ rules
|
||
&AE <<< B <<< C / D <<< F
|
||
* compare
|
||
<1 AE
|
||
<3 B
|
||
<3 F
|
||
<1 AED
|
||
<3 C
|
||
|
||
@ rules
|
||
&A <<< B / E <<< C / ED <<< F / E
|
||
* compare
|
||
<1 AE
|
||
<3 B
|
||
<3 F
|
||
<1 AED
|
||
<3 C
|
||
|
||
** test: never reorder trailing primaries
|
||
@ root
|
||
% reorder Zzzz Grek
|
||
* compare
|
||
<1 L
|
||
<1 字
|
||
<1 Ω
|
||
<1 \uFFFD
|
||
<1 \uFFFF
|
||
|
||
** test: fall back to mappings with shorter prefixes, not immediately to ones with no prefixes
|
||
@ rules
|
||
&u=ab|cd
|
||
&v=b|ce
|
||
* compare
|
||
<1 abc
|
||
<1 abcc
|
||
<1 abcf
|
||
<1 abcd
|
||
= abu
|
||
<1 abce
|
||
= abv
|
||
|
||
# With the following rules, there is only one prefix per composite ĉ or ç,
|
||
# but both prefixes apply to just c in NFD form.
|
||
# We would get different results for composed vs. NFD input
|
||
# if we fell back directly from longest-prefix mappings to no-prefix mappings.
|
||
@ rules
|
||
&x=op|ĉ
|
||
&y=p|ç
|
||
* compare
|
||
<1 opc
|
||
<2 opć
|
||
<1 opcz
|
||
<1 opd
|
||
<1 opĉ
|
||
= opc\u0302
|
||
= opx
|
||
<1 opç
|
||
= opc\u0327
|
||
= opy
|
||
|
||
# The mapping is used which has the longest matching prefix for which
|
||
# there is also a suffix match, with the longest suffix match among several for that prefix.
|
||
@ rules
|
||
&❶=d
|
||
&❷=de
|
||
&❸=def
|
||
&①=c|d
|
||
&②=c|de
|
||
&③=c|def
|
||
&④=bc|d
|
||
&⑤=bc|de
|
||
&⑥=bc|def
|
||
&⑦=abc|d
|
||
&⑧=abc|de
|
||
&⑨=abc|def
|
||
* compare
|
||
<1 9aadzz
|
||
= 9aa❶zz
|
||
<1 9aadez
|
||
= 9aa❷z
|
||
<1 9aadef
|
||
= 9aa❸
|
||
<1 9acdzz
|
||
= 9ac①zz
|
||
<1 9acdez
|
||
= 9ac②z
|
||
<1 9acdef
|
||
= 9ac③
|
||
<1 9bcdzz
|
||
= 9bc④zz
|
||
<1 9bcdez
|
||
= 9bc⑤z
|
||
<1 9bcdef
|
||
= 9bc⑥
|
||
<1 abcdzz
|
||
= abc⑦zz
|
||
<1 abcdez
|
||
= abc⑧z
|
||
<1 abcdef
|
||
= abc⑨
|
||
|
||
** test: prefix + discontiguous contraction with missing prefix contraction
|
||
# Unfortunate terminology: The first "prefix" here is the pre-context,
|
||
# the second "prefix" refers to the contraction/relation string that is
|
||
# one shorter than the one being tested.
|
||
@ rules
|
||
&x=p|e
|
||
&y=p|ê
|
||
&z=op|ê
|
||
# No mapping for op|e:
|
||
# Discontiguous contraction matching should not match op|ê in opệ
|
||
# because it would have to skip the dot below and extend a match on op|e by the circumflex,
|
||
# but there is no match on op|e.
|
||
* compare
|
||
<1 oPe
|
||
<1 ope
|
||
= opx
|
||
<1 opệ
|
||
= opy\u0323 # y not z
|
||
<1 opê
|
||
= opz
|
||
|
||
# We cannot test for fallback by whether the contraction default CE32
|
||
# is for another contraction. With the following rules, there is no mapping for op|e,
|
||
# and the fallback to prefix p has no contractions.
|
||
@ rules
|
||
&x=p|e
|
||
&z=op|ê
|
||
* compare
|
||
<1 oPe
|
||
<1 ope
|
||
= opx
|
||
<2 opệ
|
||
= opx\u0323\u0302 # x not z
|
||
<1 opê
|
||
= opz
|
||
|
||
# One more variation: Fallback to the simple code point, no shorter non-empty prefix.
|
||
@ rules
|
||
&x=e
|
||
&z=op|ê
|
||
* compare
|
||
<1 ope
|
||
= opx
|
||
<3 oPe
|
||
= oPx
|
||
<2 opệ
|
||
= opx\u0323\u0302 # x not z
|
||
<1 opê
|
||
= opz
|
||
|
||
** test: maxVariable via rules
|
||
@ rules
|
||
[maxVariable space][alternate shifted]
|
||
* compare
|
||
= \u0020
|
||
= \u000A
|
||
<1 .
|
||
<1 ° # degree sign
|
||
<1 $
|
||
<1 0
|
||
|
||
** test: maxVariable via setting
|
||
@ root
|
||
% maxVariable=currency
|
||
% alternate=shifted
|
||
* compare
|
||
= \u0020
|
||
= \u000A
|
||
= .
|
||
= ° # degree sign
|
||
= $
|
||
<1 0
|
||
|
||
** test: ICU4J CollationMiscTest/TestContractionClosure (ää)
|
||
# This tests canonical closure, but it also tests that CollationFastLatin
|
||
# bails out properly for contractions with combining marks.
|
||
# For that we need pairs of strings that remain in the Latin fastpath
|
||
# long enough, hence the extra "= b" lines.
|
||
@ rules
|
||
&b=\u00e4\u00e4
|
||
* compare
|
||
<1 b
|
||
= \u00e4\u00e4
|
||
= b
|
||
= a\u0308a\u0308
|
||
= b
|
||
= \u00e4a\u0308
|
||
= b
|
||
= a\u0308\u00e4
|
||
|
||
** test: ICU4J CollationMiscTest/TestContractionClosure (Å)
|
||
@ rules
|
||
&b=\u00C5
|
||
* compare
|
||
<1 b
|
||
= \u00C5
|
||
= b
|
||
= A\u030A
|
||
= b
|
||
= \u212B
|