<?xml version="1.0" encoding="UTF-8"?>
<!-- Copyright (c) 2007 - 2009 IBM Corporation and others. All rights reserved -->
<!-- Test data file for string search -->
<!DOCTYPE stringsearch-tests [
<!ELEMENT stringsearch-tests (test-case+)>
<!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
<!ELEMENT test-case (pattern, pre?, m?, post?)>
<!ATTLIST test-case
id ID #REQUIRED
locale CDATA "en"
strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY"
norm (ON | OFF) "OFF"
alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
>
<!ELEMENT pattern (#PCDATA)>
<!ELEMENT pre (#PCDATA)>
<!ELEMENT m (#PCDATA)>
<!ELEMENT post (#PCDATA)>
]>
<stringsearch-tests >
<!-- debug="test11" (for copying into the above element) -->
<!-- Very simple match -->
<test-case id= "test01" >
<pattern > abc</pattern>
<pre > xxx</pre> <m > abc</m> <post > yyy</post>
</test-case>
<!-- Very simple no-match -->
<test-case id= "test02" >
<pattern > abc</pattern>
<pre > xxx</pre> <post > yyy</post>
</test-case>
<!-- Match after several near-misses. -->
<test-case id= "test03" >
<pattern > string</pattern>
<pre > silly spring stling strxng strilg strinx stri</pre> <m > string</m> <post > fling</post>
</test-case>
<test-case id= "test04" strength= "PRIMARY" >
<pattern > FUSS</pattern>
<pre > abc</pre> <m > fuss</m> <post > sss</post>
</test-case>
<test-case id= "test05" strength= "PRIMARY" >
<pattern > FUSS</pattern>
<pre > abc</pre> <m > fuß</m> <post > sss</post>
</test-case>
<test-case id= "test05.5" strength= "PRIMARY" >
<pattern > fuss</pattern>
<pre > a </pre>
<m > fuß</m>
<post > ball table</post>
</test-case>
<test-case id= "test06" strength= "PRIMARY" >
<pattern > fuß</pattern>
<pre > abc</pre> <m > fuss</m> <post > xyz</post>
</test-case>
<test-case id= "test07" strength= "SECONDARY" >
<pattern > fuß</pattern>
<pre > abcfussxyz</pre>
</test-case>
<test-case id= "test08" strength= "PRIMARY" >
<pattern > fus</pattern>
<pre > abcfuß</pre> <post > xyz</post>
</test-case>
<!-- A good match following an initial match that failed because
of not ending on a character boundary -->
<test-case id= "test09" strength= "PRIMARY" >
<pattern > fus</pattern>
<pre > fuß </pre> <m > fus</m> <post > sss</post>
</test-case>
<!-- Test cases from usrchdat.c BREAKITERATOREXACT -->
<test-case id= "test10" strength= "TERTIARY" >
<pattern > fox</pattern>
<m > fox</m> <post > y fox</post>
</test-case>
<test-case id= "test11" strength= "PRIMARY" locale= "de_DE@collation=phonebook" >
<pattern > toe</pattern>
<pre > This is a </pre> <m > Tö</m> <post > ne</post>
</test-case>
<test-case id= "test11a" strength= "SECONDARY" locale= "de_DE@collation=phonebook" >
<pattern > toe</pattern>
<pre > This is a </pre> <post > Töne</post>
</test-case>
<test-case id= "test12" strength= "TERTIARY" >
<pattern > e</pattern>
<pre > tésting that é doés not match </pre> <m > e</m> <post > </post>
</test-case>
<test-case id= "test13" strength= "PRIMARY" locale= "fr" >
<pattern > e</pattern>
<pre > </pre> <m > É</m> <post > É</post>
</test-case>
<test-case id= "test14" strength= "PRIMARY" locale= "fr" >
<pattern > O</pattern>
<pre > C</pre> <m > O\u0302</m> <post > TÉ</post>
</test-case>
<!-- Test cases from usrchdat.c STRENGTH -->
<test-case id= "test15" strength= "PRIMARY" locale= "en" >
<pattern > fox</pattern>
<pre > The quick brown </pre> <m > fox</m> <post > jumps over the lazy foxes</post>
</test-case>
<test-case id= "test16" strength= "PRIMARY" locale= "fr" >
<pattern > peche</pattern>
<pre > blackbirds pat </pre> <m > p\u00E9ch\u00E9</m> <post > </post>
</test-case>
<test-case id= "test17" strength= "PRIMARY" locale= "fr" >
<pattern > peche</pattern>
<pre > blackbirds pat </pre> <m > p\u00EAche</m> <post > </post>
</test-case>
<test-case id= "test18" strength= "PRIMARY" locale= "fr" >
<pattern > peche</pattern>
<pre > blackbirds pat </pre> <m > p\u00E9che</m> <post > r </post>
</test-case>
<test-case id= "test19" strength= "PRIMARY" locale= "fr" >
<pattern > peche</pattern>
<pre > blackbirds pat </pre> <m > p\u00EAche</m> <post > r </post>
</test-case>
<test-case id= "test20" strength= "PRIMARY" locale= "es" >
<pattern > channel</pattern>
<pre > A </pre> <m > channel</m> <post > , </post>
</test-case>
<test-case id= "test21" strength= "PRIMARY" locale= "es" >
<pattern > channel</pattern>
<pre > A </pre> <m > CHANNEL</m> <post > , </post>
</test-case>
<test-case id= "test22" strength= "PRIMARY" locale= "es" >
<pattern > channel</pattern>
<pre > A </pre> <m > Channel</m> <post > s, </post>
</test-case>
<test-case id= "test23" strength= "PRIMARY" locale= "es" >
<pattern > channel</pattern>
<pre > A </pre> <m > channel</m> <post > ... </post>
</test-case>
<test-case id= "test24" strength= "TERTIARY" locale= "en" >
<pattern > A\u0300</pattern>
<pre > A miss, and then </pre> <m > \u00c0</m> <post > should match but not A"</post>
</test-case>
<!-- TODO: In the original test data, this test matched at IDENTICAL strength.
Doesn't seem right. The characters are different.
-->
<test-case id= "test24a" strength= "IDENTICAL" locale= "en" >
<pattern > A\u0300</pattern>
<pre > At IDENTICAL, shoud this match? </pre> <m > \u00c0</m> <post > </post>
</test-case>
<test-case id= "test24b" strength= "IDENTICAL" alternate_handling= "SHIFTED" locale= "en" >
<pattern > A\u0300</pattern>
<pre > At IDENTICAL, shoud this match? </pre>
<m > \u00c0</m>
<post > </post>
</test-case>
<test-case id= "test25" strength= "SECONDARY" locale= "en" >
<pattern > Ű</pattern>
<pre > 12</pre> <m > ű</m> <post > Ű</post>
</test-case>
<test-case id= "test26" strength= "SECONDARY" locale= "en" >
<pattern > A</pattern>
<pre > 12</pre> <m > a</m> <post > ...</post>
</test-case>
<!-- Test Cases from usrchdat.c, VARIABLE -->
<test-case id= "test27" strength= "TERTIARY" locale= "en" >
<pattern > blackbird</pattern>
<pre > black-bird </pre> <m > blackbird</m> <post > ...</post>
</test-case>
<test-case id= "test28" strength= "TERTIARY" locale= "en" >
<pattern > go</pattern>
<pre > on</pre>
</test-case>
<!-- TODO: this gives a U_ILLEGAL_ARGUMENT error when opening
the UStringSearch. How did the original test run? -->
<!--
<test-case id= "test29" strength= "PRIMARY" locale= "en" >
<pattern > </pattern>
<pre > </pre> <m > </m> <post > abc</post>
</test-case>
-->
<test-case id= "test30" strength= "SECONDARY" locale= "en" >
<pattern > abc</pattern>
<pre > a bc ab c a bc ab c"</pre>
</test-case>
<test-case id= "test31" strength= "SECONDARY" locale= "en" >
<pattern > abc</pattern>
<pre > ---------------</pre>
</test-case>
<!-- Normalization test cases from usrchdat.c -->
<test-case id= "test32" strength= "TERTIARY" norm= "ON" >
<pattern > a\u0325\u0300</pattern>
<pre > </pre> <m > a\u0300\u0325</m>
</test-case>
<test-case id= "test32a" strength= "TERTIARY" norm= "OFF" >
<pattern > a\u0325\u0300</pattern>
<pre > a\u0300\u0325</pre>
</test-case>
<!-- COMPOSITEBOUNDARIES from usrchdat.c
Boundaries are not identical to the original test data because
of matching only full combining sequences
-->
<test-case id= "test40" strength= "TERTIARY" >
<pattern > A</pattern>
<pre > À</pre> <!-- \u00C0 -->
</test-case>
<test-case id= "test41" strength= "TERTIARY" >
<pattern > A</pattern>
<pre > À</pre> <m > A</m> <post > C</post>
</test-case>
<test-case id= "test42" strength= "TERTIARY" >
<pattern > A\u030A</pattern>
<pre > À\u01FA</pre>
</test-case>
<!-- SUPPLEMENTARYCANONICAL from usrchdat.c -->
<test-case id= "test50" strength= "TERTIARY" >
<pattern > \uD800\uDC00</pattern>
<pre > abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre> <m > \uD800\uDC00</m>
<post > abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
</test-case>
<test-case id= "test51" strength= "TERTIARY" >
<pattern > \\uD834\\uDDB9</pattern>
<pre > and</pre> <m > \\uD834\\uDDB9</m> <post > this sentence</post>
</test-case>
<test-case id= "test52" strength= "TERTIARY" >
<pattern > \\uD834\\uDDB9 </pattern>
<pre > and</pre> <m > \\uD834\\uDDB9 </m> <post > this sentence</post>
</test-case>
<test-case id= "test53" strength= "TERTIARY" >
<pattern > -\\uD834\\uDDB9-</pattern>
<pre > and</pre> <m > -\\uD834\\uDDB9-</m> <post > this sentence</post>
</test-case>
<test-case id= "test54" strength= "TERTIARY" >
<pattern > ,\\uD834\\uDDB9,</pattern>
<pre > and</pre> <m > ,\\uD834\\uDDB9,</m> <post > this sentence</post>
</test-case>
<test-case id= "test55" strength= "TERTIARY" >
<pattern > ?\\uD834\\uDDB9?</pattern>
<pre > and</pre> <m > ?\\uD834\\uDDB9?</m> <post > this sentence</post>
</test-case>
<!-- Long combining sequences -->
<!-- Backwards search fails because the pattern ends with ignorables
<test-case id= "test60" strength= "PRIMARY" >
<pattern > A\u0301\u0301\u0301\u0301</pattern>
<m > A\u0301\u0301\u0301\u0301\u0301</m>
</test-case>
-->
<test-case id= "test61" strength= "TERTIARY" >
<pattern > A\u0301\u0301\u0301\u0301</pattern>
<pre > A\u0301\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id= "test62" strength= "TERTIARY" >
<pattern > A\u0301\u0301\u0301\u0301</pattern>
<m > A\u0301\u0301\u0301\u0301</m>
</test-case>
<!-- stand-alone combining marks don't match attached marks -->
<test-case id= "test63" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<pre > A\u0301\u0301\u0301\u0301</pre>
</test-case>
<test-case id= "test64" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<post > \u0301\u0301\u0301\u0301</post>
</test-case>
<!-- stand-alone combining mark does match an unattached combining mark -->
<test-case id= "test65" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<m > \u0301</m> <post > A\u0301\u0301</post>
</test-case>
<test-case id= "test66" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<m > \u0301</m>
</test-case>
<!-- stand-alone combining marks at end of the target text -->
<test-case id= "test67" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<pre > abcd\r</pre> <m > \u0301</m>
</test-case>
<!-- attached combining marks at end of the target text, no match -->
<test-case id= "test68" strength= "TERTIARY" >
<pattern > \u0301</pattern>
<pre > abcd\u0301</pre>
</test-case>
<!-- no match within expansions at the start -->
<test-case id= "test70" strength= "PRIMARY" >
<pattern > Eligature</pattern>
<pre > Æligature</pre>
</test-case>
<test-case id= "test71" strength= "PRIMARY" >
<pattern > AEligature</pattern>
<m > Æligature</m>
</test-case>
<test-case id= "test72" strength= "PRIMARY" >
<pattern > AEligature</pattern>
<m > Æligature</m>
</test-case>
<!-- unattached combining Tilde will not match a Tilde that is
part of a composed Ñ (\u00D1) -->
<test-case id= "test73" strength= "SECONDARY" >
<pattern > \u0303</pattern> <!-- combining tilde -->
<pre > Ñ
 </pre> <m > \u0303</m>
</test-case>
<test-case id= "test74" strength= "SECONDARY" >
<pattern > \u0303</pattern> <!-- combining tilde -->
<pre > Ñ 
 </pre> <m > \u0303</m> <post > a</post>
</test-case>
<test-case id= "test75" strength= "TERTIARY" locale= "fr" >
<pattern > \u00EA</pattern>
<pre > p</pre> <m > \u00EA</m> <post > che</post>
</test-case>
<test-case id= "test76" strength= "TERTIARY" locale= "fr" >
<pattern > \u00EA</pattern>
<pre > p</pre> <m > e\u0302</m> <post > che</post>
</test-case>
<test-case id= "test77" strength= "TERTIARY" locale= "fr" >
<pattern > e\u0302</pattern>
<pre > p</pre> <m > \u00EA</m> <post > che</post>
</test-case>
<!-- Test cases from ticket:5382 -->
<test-case id= "test78" strength= "SECONDARY" locale= "hu_HU" >
<pattern > \u0170</pattern>
<m > \u0171</m>
<post > 12</post>
</test-case>
<test-case id= "test79" strength= "SECONDARY" locale= "hu_HU" >
<pattern > \u0170</pattern>
<pre > 1</pre>
<m > \u0171</m>
<post > 2</post>
</test-case>
<test-case id= "test80" strength= "SECONDARY" locale= "hu_HU" >
<pattern > \u0170</pattern>
<pre > 12</pre>
<m > \u0171</m>
</test-case>
<!-- Test cases from ticket:5959 -->
<test-case id= "test81" strength= "SECONDARY" >
<pattern > \u2166</pattern>
<m > VII</m>
</test-case>
<test-case id= "test82" strength= "SECONDARY" >
<pattern > VII</pattern>
<m > \u2166</m>
</test-case>
<test-case id= "test83" strength= "IDENTICAL" alternate_handling= "SHIFTED" locale= "en" >
<pattern > Universal Declaration of Human Rights</pattern>
<pre > Proclaims this </pre> <m > Universal Declaration of Human Rights</m> <post > as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id= "test83b" strength= "TERTIARY" alternate_handling= "SHIFTED" locale= "en" >
<pattern > Universal Declaration of Human Rights</pattern>
<pre > Proclaims this </pre>
<m > Universal-Declaration-of-Human-Rights</m>
<post > as a common standard of achievement for all peoples and all nations</post>
</test-case>
<test-case id= "test84" strength= "TERTIARY" locale= "en" >
<pattern > \u05E9\u0591\u05E9</pattern>
<m > \u05E9\u0592\u05E9</m>
</test-case>
<test-case id= "test84b" strength= "IDENTICAL" locale= "en" >
<pattern > \u05E9\u0591\u05E9</pattern>
<pre > \u05E9\u0592\u05E9</pre>
</test-case>
</stringsearch-tests>