1999-08-16 21:50:52 +00:00
|
|
|
|
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<html xmlns:v="urn:schemas-microsoft-com:vml"
|
|
|
|
|
xmlns:o="urn:schemas-microsoft-com:office:office"
|
|
|
|
|
xmlns:w="urn:schemas-microsoft-com:office:word"
|
|
|
|
|
xmlns="http://www.w3.org/TR/REC-html40">
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
<head>
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<meta http-equiv=Content-Type content="text/html; charset=iso-8859-1">
|
|
|
|
|
<meta name=ProgId content=Word.Document>
|
|
|
|
|
<meta name=Generator content="Microsoft Word 9">
|
|
|
|
|
<meta name=Originator content="Microsoft Word 9">
|
|
|
|
|
<link rel=File-List href="./Collate_files/filelist.xml">
|
|
|
|
|
<link rel=Edit-Time-Data href="./Collate_files/editdata.mso">
|
|
|
|
|
<!--[if !mso]>
|
|
|
|
|
<style>
|
|
|
|
|
v\:* {behavior:url(#default#VML);}
|
|
|
|
|
o\:* {behavior:url(#default#VML);}
|
|
|
|
|
w\:* {behavior:url(#default#VML);}
|
|
|
|
|
.shape {behavior:url(#default#VML);}
|
|
|
|
|
</style>
|
|
|
|
|
<![endif]-->
|
1999-08-16 21:50:52 +00:00
|
|
|
|
<title>International Classes for Unicode - Collation</title>
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<!--[if gte mso 9]><xml>
|
|
|
|
|
<o:DocumentProperties>
|
|
|
|
|
<o:Author>Helena Shih</o:Author>
|
|
|
|
|
<o:Template>Normal</o:Template>
|
|
|
|
|
<o:LastAuthor>Helena Shih</o:LastAuthor>
|
|
|
|
|
<o:Revision>2</o:Revision>
|
|
|
|
|
<o:TotalTime>0</o:TotalTime>
|
|
|
|
|
<o:Created>2000-01-15T02:20:00Z</o:Created>
|
|
|
|
|
<o:LastSaved>2000-01-15T02:20:00Z</o:LastSaved>
|
|
|
|
|
<o:Pages>4</o:Pages>
|
|
|
|
|
<o:Words>982</o:Words>
|
|
|
|
|
<o:Characters>5599</o:Characters>
|
|
|
|
|
<o:Company>IBM</o:Company>
|
|
|
|
|
<o:Lines>46</o:Lines>
|
|
|
|
|
<o:Paragraphs>11</o:Paragraphs>
|
|
|
|
|
<o:CharactersWithSpaces>6875</o:CharactersWithSpaces>
|
|
|
|
|
<o:Version>9.2720</o:Version>
|
|
|
|
|
</o:DocumentProperties>
|
|
|
|
|
</xml><![endif]-->
|
|
|
|
|
<style>
|
|
|
|
|
<!--
|
|
|
|
|
/* Style Definitions */
|
|
|
|
|
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
|
|
|
|
{mso-style-parent:"";
|
|
|
|
|
margin:0in;
|
|
|
|
|
margin-bottom:.0001pt;
|
|
|
|
|
mso-pagination:widow-orphan;
|
|
|
|
|
font-size:12.0pt;
|
|
|
|
|
font-family:"Times New Roman";
|
|
|
|
|
mso-fareast-font-family:"Times New Roman";}
|
|
|
|
|
p
|
|
|
|
|
{font-size:12.0pt;
|
|
|
|
|
font-family:"Times New Roman";
|
|
|
|
|
mso-fareast-font-family:"Times New Roman";}
|
|
|
|
|
@page Section1
|
|
|
|
|
{size:8.5in 11.0in;
|
|
|
|
|
margin:1.0in 1.25in 1.0in 1.25in;
|
|
|
|
|
mso-header-margin:.5in;
|
|
|
|
|
mso-footer-margin:.5in;
|
|
|
|
|
mso-paper-source:0;}
|
|
|
|
|
div.Section1
|
|
|
|
|
{page:Section1;}
|
|
|
|
|
/* List Definitions */
|
|
|
|
|
@list l0
|
|
|
|
|
{mso-list-id:56786128;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:316935058 799580840 -2129604566 1894698424 -1886861812 1076558752 -1316478726 -1694838522 -1962102214 -432647774;}
|
|
|
|
|
@list l0:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l1
|
|
|
|
|
{mso-list-id:218128614;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:726427572 -220806470 -535028662 -1139008068 926857880 19293176 -1336270008 -629910652 1961381030 -112671298;}
|
|
|
|
|
@list l1:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l2
|
|
|
|
|
{mso-list-id:398596625;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:1581174674 -1457777898 -2089911548 -358428948 -724424496 486305342 -803443362 102692998 -557393154 -905049134;}
|
|
|
|
|
@list l3
|
|
|
|
|
{mso-list-id:399836585;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-1362098886 105021812 -1203221970 402953214 116268298 1957075642 -2103935390 135310026 2024683000 -1150503632;}
|
|
|
|
|
@list l4
|
|
|
|
|
{mso-list-id:512963647;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:870891966 718416036 1606166768 -734915736 -457166330 1502492706 -472354846 -1617501690 -1298739844 -1263368320;}
|
|
|
|
|
@list l4:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l4:level2
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:o;
|
|
|
|
|
mso-level-tab-stop:1.0in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:"Courier New";
|
|
|
|
|
mso-bidi-font-family:"Times New Roman";}
|
|
|
|
|
@list l5
|
|
|
|
|
{mso-list-id:1228296673;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-1126373434 672300220 -1390479766 -247709302 -1005183176 586437888 1683491832 2053804606 -1192436542 -1064926734;}
|
|
|
|
|
@list l5:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l6
|
|
|
|
|
{mso-list-id:1292789779;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-517678870 758258566 1452305524 223270358 -53846700 -513747388 -2068313458 1590586680 -1770905550 -1947821216;}
|
|
|
|
|
@list l6:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l7
|
|
|
|
|
{mso-list-id:1316833148;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:1332113200 -424870104 1051208176 892489996 -1233066968 -1810460500 -1022461362 2044245910 -1736145250 -1083136974;}
|
|
|
|
|
@list l7:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l8
|
|
|
|
|
{mso-list-id:1327392266;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-1403883808 -326201746 -1498098978 273696058 1220037092 978733230 -1686724236 -2129999476 1994688346 -162764280;}
|
|
|
|
|
@list l8:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l9
|
|
|
|
|
{mso-list-id:1662541158;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-1664995996 -1721883820 -1345845536 314467622 -225125196 2134916302 970646716 2097594674 1329790124 -20539764;}
|
|
|
|
|
@list l9:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l10
|
|
|
|
|
{mso-list-id:1800419391;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:-160151480 1490450160 1649709476 -912377276 -843295976 955542454 -958090392 1090130474 897190372 -1064773102;}
|
|
|
|
|
@list l10:level1
|
|
|
|
|
{mso-level-number-format:bullet;
|
|
|
|
|
mso-level-text:\F0B7;
|
|
|
|
|
mso-level-tab-stop:.5in;
|
|
|
|
|
mso-level-number-position:left;
|
|
|
|
|
text-indent:-.25in;
|
|
|
|
|
mso-ansi-font-size:10.0pt;
|
|
|
|
|
font-family:Symbol;}
|
|
|
|
|
@list l11
|
|
|
|
|
{mso-list-id:1804076982;
|
|
|
|
|
mso-list-type:hybrid;
|
|
|
|
|
mso-list-template-ids:26769352 -1273701986 819625222 -1355785140 1125281256 -561239646 -1265353568 721578390 882827536 -1507570576;}
|
|
|
|
|
ol
|
|
|
|
|
{margin-bottom:0in;}
|
|
|
|
|
ul
|
|
|
|
|
{margin-bottom:0in;}
|
|
|
|
|
-->
|
|
|
|
|
</style>
|
|
|
|
|
<!--[if gte mso 9]><xml>
|
|
|
|
|
<o:shapedefaults v:ext="edit" spidmax="1027"/>
|
|
|
|
|
</xml><![endif]--><!--[if gte mso 9]><xml>
|
|
|
|
|
<o:shapelayout v:ext="edit">
|
|
|
|
|
<o:idmap v:ext="edit" data="1"/>
|
|
|
|
|
</o:shapelayout></xml><![endif]-->
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</head>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<body bgcolor=white lang=EN-US link=blue vlink=blue style='tab-interval:.5in'>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<div class=Section1>
|
|
|
|
|
|
|
|
|
|
<h1>International Collectanea for Unicode</h1>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
<h2>Collation Framework</h2>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
|
|
|
|
|
<div class=MsoNormal align=center style='text-align:center'>
|
|
|
|
|
|
|
|
|
|
<hr size=2 width="100%" align=center>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
<h3><u>Contents</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l4 level1 lfo1;tab-stops:list .5in'>What is collation?</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l4 level1 lfo1;tab-stops:list .5in'>The rule symbols and their
|
|
|
|
|
usage</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l4 level1 lfo1;tab-stops:list .5in'>Interesting Examples</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l4 level1 lfo1;tab-stops:list .5in'>Implementation Details</li>
|
|
|
|
|
<ul type=circle>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:
|
|
|
|
|
auto;mso-list:l4 level2 lfo1;tab-stops:list 1.0in'>Building the Collation
|
|
|
|
|
Table</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:
|
|
|
|
|
auto;mso-list:l4 level2 lfo1;tab-stops:list 1.0in'>Incremental Comparison
|
|
|
|
|
Diagram</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:
|
|
|
|
|
auto;mso-list:l4 level2 lfo1;tab-stops:list 1.0in'>Generating a Collation
|
|
|
|
|
Key</li>
|
|
|
|
|
</ul>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l4 level1 lfo1;tab-stops:list .5in'>Q and A</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h3><u>What is collation?</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>Collation framework performs locale-sensitive string comparison. The user of
|
|
|
|
|
this class can use this class to build searching and sorting routines for
|
|
|
|
|
natural language text, build table of contents for large documentation or
|
|
|
|
|
create efficient index look up for database entries.<br>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
<br>
|
2000-01-15 02:00:06 +00:00
|
|
|
|
The ICU Collator classes provide services to allow: </p>
|
|
|
|
|
|
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l10 level1 lfo2;tab-stops:list .5in'>Simple, data-driven, table
|
|
|
|
|
based collation.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l10 level1 lfo2;tab-stops:list .5in'>Easily customizable for your
|
|
|
|
|
needs.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l10 level1 lfo2;tab-stops:list .5in'>Merging different resources
|
|
|
|
|
made possible.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l10 level1 lfo2;tab-stops:list .5in'>Behind the scene
|
|
|
|
|
transforming the ASCII data file into a binary file for efficiency.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l10 level1 lfo2;tab-stops:list .5in'>Offering both incremental
|
|
|
|
|
comparison for simple comparison and collation keys for batch processes.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>There are 4 comparison levels in the Collator classes to allow different
|
|
|
|
|
levels of difference to be considered significant: </p>
|
|
|
|
|
|
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l6 level1 lfo3;tab-stops:list .5in'>Primary: a letter difference.
|
|
|
|
|
For example, 'a' and 'b'.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l6 level1 lfo3;tab-stops:list .5in'>Secondary: an accent
|
|
|
|
|
difference. For example, '&auml;' and '&aring;'.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l6 level1 lfo3;tab-stops:list .5in'>Tertiary: a case difference.
|
|
|
|
|
For example, 'a' and 'A'.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l6 level1 lfo3;tab-stops:list .5in'>Identical: no difference. For
|
|
|
|
|
example, 'a' and 'a'.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h3><u>The rule symbols and their usage</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>A string is decomposed into one or more collation elements when used with
|
|
|
|
|
the collation classes. The collation rules specify the order of these collation
|
|
|
|
|
elements. The collation table is composed of a list of collation rules, where
|
|
|
|
|
each rule takes one of three forms: </p>
|
|
|
|
|
|
|
|
|
|
<ol start=1 type=1>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l3 level1 lfo4;tab-stops:list .5in'>&lt;modifier&gt;</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l3 level1 lfo4;tab-stops:list .5in'>&lt;relation&gt;
|
|
|
|
|
&lt;text-argument&gt;</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l3 level1 lfo4;tab-stops:list .5in'>&lt;reset&gt;
|
|
|
|
|
&lt;text-argument1&gt; &lt;relation&gt; &lt;text-argument2&gt;</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ol>
|
|
|
|
|
|
|
|
|
|
<h4>&lt;modifier&gt;</h4>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l7 level1 lfo5;tab-stops:list .5in'>'@': French secondary, accent
|
|
|
|
|
weights sorted backwards.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h4>&lt;text-argument&gt;</h4>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>A text-argument is any sequence of characters, excluding special characters
|
|
|
|
|
(that is, common whitespace characters [0009-000D, 0020] and rule syntax
|
|
|
|
|
characters [0021-002F, 003A-0040, 005B-0060, 007B-007E]). If those characters
|
|
|
|
|
are desired, you can put them in single quotes (e.g. ampersand =&gt; '&amp;').
|
|
|
|
|
Note that unquoted white space characters are ignored; e.g. "b c" is
|
|
|
|
|
treated as "bc".</p>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
<h4>&lt;relation&gt;</h4>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l9 level1 lfo6;tab-stops:list .5in'>'&lt;' : Greater, as a letter
|
|
|
|
|
difference (primary)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l9 level1 lfo6;tab-stops:list .5in'>';' : Greater, as an accent
|
|
|
|
|
difference (secondary)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l9 level1 lfo6;tab-stops:list .5in'>',' : Greater, as a case
|
|
|
|
|
difference (tertiary)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l9 level1 lfo6;tab-stops:list .5in'>'=' : Equal</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h4>&lt;reset&gt;</h4>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l0 level1 lfo7;tab-stops:list .5in'>'&amp;': Indicates that
|
|
|
|
|
text-argument2 follows the position to where the reset text-argument1
|
|
|
|
|
would be sorted.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h3><u>Interesting Examples</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>The following is a list of interesting examples of the rules and some string
|
|
|
|
|
comparison results using those rules. The comparison relation will be denoted
|
|
|
|
|
as "&lt;" of primary difference of less than, "&lt;&lt;" of
|
|
|
|
|
secondary difference of less than, "&lt;&lt;&lt;" of tertiary
|
|
|
|
|
difference of less than and "==" of equal to relationships: </p>
|
|
|
|
|
|
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l1 level1 lfo8;tab-stops:list .5in'>Rule " a, A &lt; b, B
|
|
|
|
|
&lt; c, C &lt; ch, cH, Ch, CH &lt; d, D &lt; e, E": this rule simply
|
|
|
|
|
says, sorts letters 'a', 'b', 'c', 'd' and 'e' in that order with primary
|
|
|
|
|
weights. 'ch' is sorted as a significant letter between 'c' and 'd' with
|
|
|
|
|
primary weights, and uppercase letters sort after lowercase letters
|
|
|
|
|
with tertiary weights. For example, "abc" &lt;&lt;&lt;
|
|
|
|
|
"ABC" and "achb" &lt; "adb".</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l1 level1 lfo8;tab-stops:list .5in'>Rule " a, A &lt; b, B
|
|
|
|
|
&lt; c, C &lt; d, D &lt; e, E &amp; AE; &AElig; ": this will sort letters
|
|
|
|
|
'a', 'b', 'c', 'd' and 'e' in that order with primary weights. '&AElig;' will
|
|
|
|
|
sort as with a secondary less than to the sequence of 'A' following 'E'.
|
|
|
|
|
For example, "aeb" &lt;&lt; "&aelig;b" and "acb"
|
|
|
|
|
&lt; "&aelig;b".</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l1 level1 lfo8;tab-stops:list .5in'>Rule ".... q, Q &amp;
|
|
|
|
|
Question'-'mark = '?' ....": the rule shows how to sort symbols to be
|
|
|
|
|
equivalent to the corresponding text. In this example, "?" ==
|
|
|
|
|
"Question-mark". Note that the special symbols need to be quoted
|
|
|
|
|
in the rule.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l1 level1 lfo8;tab-stops:list .5in'>Rule ".... &amp; aa ; a-
|
|
|
|
|
&amp; ee ; e- &amp; ii ; i- &amp; oo ; o- &amp; uu ; u- ....": this
|
|
|
|
|
rule demonstrates how to specify prolonged vowels in Japanese. In this
|
|
|
|
|
case, "aa" is sorted as with a secondary less than to
|
|
|
|
|
"a-". For example, "baab" &lt;&lt; "ba-b".</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h3><u>Implementation Details</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>Three parts of the code will be carefully examined here: </p>
|
|
|
|
|
|
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l5 level1 lfo9;tab-stops:list .5in'>Building the collation rule
|
|
|
|
|
table. (see mergecol.cpp, ptnentry.cpp and tblcoll.cpp)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l5 level1 lfo9;tab-stops:list .5in'>Incremental comparison
|
|
|
|
|
algorithm for simple string comparison. (RuleBasedCollator.compare() in
|
|
|
|
|
tblcoll.cpp)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l5 level1 lfo9;tab-stops:list .5in'>Collation key generation and
|
|
|
|
|
its format. (RuleBasedCollator.getCollationKey() in tblcoll.cpp)</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<h3><u>Building the Collation Table</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>The process of building a collation table is as follows: </p>
|
|
|
|
|
|
|
|
|
|
<ul type=disc>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l8 level1 lfo10;tab-stops:list .5in'>Parse the rule text into a
|
|
|
|
|
list of pattern entries. Each pattern has the content of current core
|
|
|
|
|
characters, extension character and the strength relation. (In
|
|
|
|
|
ptnentry.cpp)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l8 level1 lfo10;tab-stops:list .5in'>Inserts each entry at the
|
|
|
|
|
correct position based on the &lt;reset&gt; arguments. (In mergecol.cpp)</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l8 level1 lfo10;tab-stops:list .5in'>Build the compacted, highly
|
|
|
|
|
efficient look-up table based on the list of pattern entries. (In
|
|
|
|
|
tblcoll.cpp)</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ul>
|
|
|
|
|
|
|
|
|
|
<p> </p>
|
|
|
|
|
|
|
|
|
|
<h3><u>Incremental Comparison Diagram</u></h3>
|
|
|
|
|
|
|
|
|
|
<p> </p>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p><img width=468 height=800 id="_x0000_i1026" src=collflow.gif alt="Incremental comparison diagram"></p>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
<h3><u>Generating a Collation Key</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p>The control flow of generating a collation key is as follows: </p>
|
|
|
|
|
|
|
|
|
|
<ol start=1 type=1>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l2 level1 lfo11;tab-stops:list .5in'>Retrieve the next collation
|
|
|
|
|
element of the source string. Go to step 5 when the end of the string is reached.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l2 level1 lfo11;tab-stops:list .5in'>Append the primary weight of
|
|
|
|
|
element to the primary weight buffer.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l2 level1 lfo11;tab-stops:list .5in'>Checks if it's necessary to
|
|
|
|
|
process secondary weights. If so, append the secondary weights to the
|
|
|
|
|
secondary weight buffer. If the collator is marked to process French
|
|
|
|
|
secondary, reverse the order of all the secondary weights before encountering
|
|
|
|
|
the next primary weight.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l2 level1 lfo11;tab-stops:list .5in'>Checks if it's necessary to
|
|
|
|
|
process tertiary weights. If so, append the tertiary weights to the
|
|
|
|
|
tertiary weight buffer. </li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l2 level1 lfo11;tab-stops:list .5in'>Concatenate the primary
|
|
|
|
|
weight buffer, secondary weight buffer and tertiary weight buffer and add
|
|
|
|
|
a null delimiter among the weights. Return the concatenated buffer as the
|
|
|
|
|
collation key.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ol>
|
|
|
|
|
|
|
|
|
|
<h3><u>Q &amp; A</u></h3>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<ol start=1 type=1>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l11 level1 lfo12;tab-stops:list .5in'>How do I customize the
|
|
|
|
|
collation sequence?<br>
|
|
|
|
|
A: Using the RuleBasedCollator constructor, the user of the collation
|
|
|
|
|
framework can then create his/her own Collator with a customized rule.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l11 level1 lfo12;tab-stops:list .5in'>Will the collation framework
|
|
|
|
|
support the surrogate and private use characters?<br>
|
|
|
|
|
A: It's part of our future work items. However, no firm schedule has
|
|
|
|
|
been set for this yet.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l11 level1 lfo12;tab-stops:list .5in'>How does the French
|
|
|
|
|
secondary turn-on affect the generation of collation key?<br>
|
|
|
|
|
A: In French, the secondary differences are sorted backwards so this will
|
|
|
|
|
invoke the collation key to reverse the secondary weights in the keys.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l11 level1 lfo12;tab-stops:list .5in'>Is there any support for
|
|
|
|
|
composing characters? If so, how does it work?<br>
|
|
|
|
|
A: Yes, it is based on the Normalizer interface. When an expanding
|
|
|
|
|
character is detected, the rule builder will construct collation entries
|
|
|
|
|
for the precomposed version internally to handle the composed characters
|
|
|
|
|
correctly.</li>
|
|
|
|
|
<li class=MsoNormal style='mso-margin-top-alt:auto;mso-margin-bottom-alt:auto;
|
|
|
|
|
mso-list:l11 level1 lfo12;tab-stops:list .5in'>Is there any plan for
|
|
|
|
|
performance improvement, for instance, contracting/expanding character
|
|
|
|
|
lookup?<br>
|
|
|
|
|
A: Yes, the performance enhancement is an ongoing work item.</li>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</ol>
|
|
|
|
|
|
|
|
|
|
<p> </p>
|
|
|
|
|
|
2000-01-15 02:00:06 +00:00
|
|
|
|
<p><a href="../readme.html">ReadMe for </a><a href="../readme.html#API">International
|
|
|
|
|
Collectanea for Unicode</a></p>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<div class=MsoNormal align=center style='text-align:center'>
|
|
|
|
|
|
|
|
|
|
<hr size=2 width="100%" align=center>
|
|
|
|
|
|
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
</div>
|
1999-08-16 21:50:52 +00:00
|
|
|
|
|
|
|
|
|
</body>
|
2000-01-15 02:00:06 +00:00
|
|
|
|
|
1999-08-16 21:50:52 +00:00
|
|
|
|
</html>
|