123132b8e6
X-SVN-Rev: 16095
384 lines
8.9 KiB
Plaintext
384 lines
8.9 KiB
Plaintext
#--------------------------------------------------------------------
|
|
# Copyright (c) 1999-2004, International Business Machines
|
|
# Corporation and others. All Rights Reserved.
|
|
#--------------------------------------------------------------------
|
|
|
|
# Latin-InterIndic
|
|
#:: NFD;
|
|
# InterIndic variables, part 1: signs and stand-alone (independent) vowels.
# Each $name is bound to a single code point in the \ue0xx range and is
# referenced by the conversion rules further down in this file.
#\ue000 reserved
|
|
# signs (the consonant letters themselves start at $ka)
|
|
$chandrabindu=\ue001;
|
|
$anusvara=\ue002;
|
|
$visarga=\ue003;
|
|
#\ue004 reserved
|
|
# w<vowel> represents the stand-alone form
|
|
$wa=\ue005;
|
|
$waa=\ue006;
|
|
$wi=\ue007;
|
|
$wii=\ue008;
|
|
$wu=\ue009;
|
|
$wuu=\ue00a;
|
|
$wr=\ue00b;
|
|
$wl=\ue00c;
|
|
$wce=\ue00d; # LETTER CANDRA E
|
|
$wse=\ue00e; # LETTER SHORT E
|
|
$we=\ue00f; # \u090f LETTER E
|
|
$wai=\ue010;
|
|
$wco=\ue011; # LETTER CANDRA O
|
|
$wso=\ue012; # LETTER SHORT O
|
|
$wo=\ue013; # \u0913 LETTER O
|
|
$wau=\ue014;
|
|
# InterIndic variables, part 2: consonant letters, nukta and avagraha.
# $ka..$ha occupy the contiguous run \ue015-\ue039; the $consonants set
# defined later relies on that contiguity through the range [$ka-$ha].
$ka=\ue015;
|
|
$kha=\ue016;
|
|
$ga=\ue017;
|
|
$gha=\ue018;
|
|
$nga=\ue019;
|
|
$ca=\ue01a;
|
|
$cha=\ue01b;
|
|
$ja=\ue01c;
|
|
$jha=\ue01d;
|
|
$nya=\ue01e;
|
|
$tta=\ue01f;
|
|
$ttha=\ue020;
|
|
$dda=\ue021;
|
|
$ddha=\ue022;
|
|
$nna=\ue023;
|
|
$ta=\ue024;
|
|
$tha=\ue025;
|
|
$da=\ue026;
|
|
$dha=\ue027;
|
|
$na=\ue028;
|
|
$ena=\ue029; #compatibility
|
|
$pa=\ue02a;
|
|
$pha=\ue02b;
|
|
$ba=\ue02c;
|
|
$bha=\ue02d;
|
|
$ma=\ue02e;
|
|
$ya=\ue02f;
|
|
$ra=\ue030;
|
|
$rra=\ue031;
|
|
$la=\ue032;
|
|
$lla=\ue033;
|
|
$ela=\ue034; #compatibility
|
|
$va=\ue035;
|
|
# NOTE(review): $vva sits at \ue081, outside the \ue015-\ue039 run, so it is
# not a member of [$ka-$ha] (and therefore not of $consonants) — confirm
# that this is intended.
$vva=\ue081;
|
|
$sha=\ue036;
|
|
$ssa=\ue037;
|
|
$sa=\ue038;
|
|
$ha=\ue039;
|
|
#\u093a Reserved
|
|
#\u093b Reserved
|
|
$nukta=\ue03c;
|
|
$avagraha=\ue03d; # SIGN AVAGRAHA
|
|
# InterIndic variables, part 3: dependent vowel signs (\ue03e-\ue04c),
# virama, OM and the length marks. The $depVowelAbove / $depVowelBelow
# sets defined later are expressed as raw code point ranges over these
# values, so the numeric assignments here must stay in step with them.
# <vowel> represents the dependent form
|
|
$aa=\ue03e;
|
|
$i=\ue03f;
|
|
$ii=\ue040;
|
|
$u=\ue041;
|
|
$uu=\ue042;
|
|
$rh=\ue043;
|
|
$lh=\ue044;
|
|
$ce=\ue045; #VOWEL SIGN CANDRA E
|
|
$se=\ue046; #VOWEL SIGN SHORT E
|
|
$e=\ue047;
|
|
$ai=\ue048;
|
|
$co=\ue049; # VOWEL SIGN CANDRA O
|
|
$so=\ue04a; # VOWEL SIGN SHORT O
|
|
$o=\ue04b; # \u094b
|
|
$au=\ue04c;
|
|
$virama=\ue04d;
|
|
# \u094e Reserved
|
|
# \u094f Reserved
|
|
$om = \ue050; # OM
|
|
# The four entries below are left unmapped: no variable is defined for them.
# \u0951>; # UNMAPPED STRESS SIGN UDATTA
|
|
# \u0952>; # UNMAPPED STRESS SIGN ANUDATTA
|
|
# \u0953>; # UNMAPPED GRAVE ACCENT
|
|
# \u0954>; # UNMAPPED ACUTE ACCENT
|
|
$lm = \ue055;# Telugu Length Mark
|
|
$ailm=\ue056;# AI Length Mark
|
|
$aulm=\ue057;# AU Length Mark
|
|
# InterIndic variables, part 4 (\ue058-\ue06f): Urdu compatibility
# consonants, the long vocalic vowels in stand-alone ($wrr, $wll) and
# dependent ($rrh, $llh) form, the two dandas and the ten digits.
#urdu compatibility forms
|
|
$uka=\ue058;
|
|
$ukha=\ue059;
|
|
$ugha=\ue05a;
|
|
$ujha=\ue05b;
|
|
$uddha=\ue05c;
|
|
$udha=\ue05d;
|
|
$ufa=\ue05e;
|
|
$uya=\ue05f;
|
|
$wrr=\ue060;
|
|
$wll=\ue061;
|
|
$rrh=\ue062;
|
|
$llh=\ue063;
|
|
$danda=\ue064;
|
|
$doubleDanda=\ue065;
|
|
$zero=\ue066; # DIGIT ZERO
|
|
$one=\ue067; # DIGIT ONE
|
|
$two=\ue068; # DIGIT TWO
|
|
$three=\ue069; # DIGIT THREE
|
|
$four=\ue06a; # DIGIT FOUR
|
|
$five=\ue06b; # DIGIT FIVE
|
|
$six=\ue06c; # DIGIT SIX
|
|
$seven=\ue06d; # DIGIT SEVEN
|
|
$eight=\ue06e; # DIGIT EIGHT
|
|
$nine=\ue06f; # DIGIT NINE
|
|
# InterIndic variables, part 5: sixteen slots $ecp0..$ecpF at \ue070-\ue07f.
# None of the conversion rules shown in this file reference them.
# NOTE(review): the meaning of "ecp" is not stated here — confirm against
# the per-script InterIndic rule files before relying on these slots.
# For all other scripts
|
|
$ecp0=\ue070;
|
|
$ecp1=\ue071;
|
|
$ecp2=\ue072;
|
|
$ecp3=\ue073;
|
|
$ecp4=\ue074;
|
|
$ecp5=\ue075;
|
|
$ecp6=\ue076;
|
|
$ecp7=\ue077;
|
|
$ecp8=\ue078;
|
|
$ecp9=\ue079;
|
|
$ecpA=\ue07a;
|
|
$ecpB=\ue07b;
|
|
$ecpC=\ue07c;
|
|
$ecpD=\ue07d;
|
|
$ecpE=\ue07e;
|
|
$ecpF=\ue07f;
|
|
# Character sets used as context by the conversion rules below.
# \u0970>; # UNMAPPED ABBREVIATION SIGN
|
|
# \ue03e-\ue040 is $aa,$i,$ii and \ue045-\ue04c is $ce..$au.
$depVowelAbove=[\ue03e-\ue040\ue045-\ue04c];
|
|
# \ue041-\ue044 is $u,$uu,$rh,$lh.
$depVowelBelow=[\ue041-\ue044];
|
|
# Sentence-final punctuation; a virama directly before one of these is
# dropped by the "$virama}$endThing>;" rule near the end of the file.
$endThing=[$danda$doubleDanda];
|
|
# $x was originally called '&'; $z was '%'
|
|
# $x: virama plus dependent vowel signs. No rule shown in this file
# references $x (the bare "x" used in a rule key is the Latin letter).
$x=[$virama$aa$ai$au$ii$i$uu$u$rrh$rh$lh$e$o$se$ce$so$co];
|
|
# $z: the Latin consonant letters (every lowercase letter except a e i o u).
$z=[bcdfghjklmnpqrstvwxyz];
|
|
# $consonants: InterIndic $ka..$ha, the Latin consonants in $z, and the
# code points xx15-xx39 of nine Indic blocks (U+0900, U+0980, U+0A00,
# U+0A80, U+0B00, U+0B80, U+0C00, U+0C80, U+0D00).
# Code points outside \ue015-\ue039, such as $vva and the $u* Urdu
# compatibility letters, are not included.
$consonants=[[$ka-$ha]$z[\u0915-\u0939][\u0995-\u09b9][\u0a15-\u0a39][\u0a95-\u0ab9][\u0b15-\u0b39][\u0b95-\u0bb9][\u0c15-\u0c39][\u0c95-\u0cb9][\u0d15-\u0d39]];
|
|
# Conversion rules start here. Rule syntax: "key > output;". Rules are
# tried in the order written, so a more specific key must appear before
# any shorter key that is a prefix of it.
#
# Signs and one special cluster:
#   U+0315            -> avagraha
#   U+0303            -> chandrabindu followed by anusvara
#   m + U+0310        -> chandrabindu
#   h + U+0323        -> visarga
#   x                 -> ka, virama, sa
\u0315 > $avagraha;
|
|
\u0303>$chandrabindu$anusvara;
|
|
m\u0310>$chandrabindu;
|
|
h\u0323>$visarga;
|
|
x>$ka$virama$sa;
|
|
# Vowels prefixed with U+0314 map to the DEPENDENT vowel sign; a bare
# "U+0314 a" maps to nothing (empty output). Within each vowel family the
# longer key (with U+0304 / U+0306 / second letter) is listed before the
# shorter one so that it is matched first.
# convert to independent forms at start of word or syllable:
|
|
# dependent forms for roundtrip
|
|
\u0314a\u0304>$aa;
|
|
\u0314ai>$ai;
|
|
\u0314au>$au;
|
|
\u0314ii>$ii;
|
|
\u0314i\u0304>$ii;
|
|
\u0314i>$i;
|
|
\u0314u\u0304>$uu;
|
|
\u0314u>$u;
|
|
\u0314r\u0325\u0304>$rrh;
|
|
\u0314r\u0325>$rh;
|
|
\u0314l\u0325\u0304>$llh;
|
|
\u0314lh>$lh;
|
|
\u0314l\u0325>$lh;
|
|
\u0314e\u0304>$e;
|
|
\u0314o\u0304>$o;
|
|
\u0314a>;
|
|
\u0314e\u0306>$ce;
|
|
\u0314o\u0306>$co;
|
|
\u0314e>$se;
|
|
\u0314o>$so;
|
|
|
|
# Latin vowels whose preceding character is a member of $consonants map to
# the DEPENDENT vowel sign. "$consonants{" is context only: the text before
# "{" must match but is not replaced.
# Unlike the other vowel tables in this file, there is no bare "a" rule here.
# preceded by consonants
|
|
$consonants{ a\u0304>$aa;
|
|
$consonants{ ai>$ai;
|
|
$consonants{ au>$au;
|
|
$consonants{ ii>$ii;
|
|
$consonants{ i\u0304>$ii;
|
|
$consonants{ i>$i;
|
|
$consonants{ u\u0304>$uu;
|
|
$consonants{ u>$u;
|
|
$consonants{ r\u0325\u0304>$rrh;
|
|
$consonants{ r\u0325a>$rh;
|
|
$consonants{ r\u0325>$rh;
|
|
$consonants{ l\u0325\u0304>$llh;
|
|
$consonants{ lh>$lh;
|
|
$consonants{ l\u0325>$lh;
|
|
$consonants{ e\u0304>$e;
|
|
$consonants{ o\u0304>$o;
|
|
$consonants{ e\u0306>$ce;
|
|
$consonants{ o\u0306>$co;
|
|
$consonants{ e>$se;
|
|
$consonants{ o>$so;
|
|
|
|
# Latin vowels with no qualifying context map to the STAND-ALONE
# ($w-prefixed) vowel. These rules have no context, so they must come after
# the context-restricted vowel rules above or they would shadow them.
# The "''om" rule (apostrophe + "om" -> $om) sits before the bare "o" rule;
# its key starts with an apostrophe, so it does not compete with "o".
# e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
|
|
a\u0304>$waa;
|
|
ai>$wai;
|
|
au>$wau;
|
|
i\u0304>$wii;
|
|
i>$wi;
|
|
u\u0304>$wuu;
|
|
u>$wu;
|
|
r\u0325\u0304>$wrr;
|
|
r\u0325>$wr;
|
|
l\u0325\u0304>$wll;
|
|
lh>$wl;
|
|
l\u0325>$wl;
|
|
e\u0304>$we;
|
|
o\u0304>$wo;
|
|
a>$wa;
|
|
e\u0306>$wce;
|
|
o\u0306>$wco;
|
|
e>$wse;
|
|
''om>$om;
|
|
o>$wso;
|
|
|
|
# A nasal letter that is followed by a homorganic consonant becomes
# $anusvara. The text after "}" is look-ahead context only and is not
# consumed. The first three rules are exceptions that keep a real $na
# (with the cursor left before $virama) when "n" is followed by a vocalic
# r/l or by "na"; they must precede the general "n}[tdn]" rule.
# rules for anusvara
|
|
n}r\u0325 > $na|$virama;
|
|
n}l\u0325 > $na|$virama;
|
|
n}na > $na|$virama;
|
|
n\u0307}[kg] > $anusvara;
|
|
n\u0307}n\u0307 > $anusvara;
|
|
# NOTE(review): the next two keys use n + U+0304 (macron), while the palatal
# nasal rule further down is keyed on n + U+0303 (tilde) and the look-ahead
# of the second rule is also n + U+0303 — confirm U+0304 is intended here.
n\u0304}[cj] > $anusvara;
|
|
n\u0304}n\u0303 > $anusvara;
|
|
n\u0323}[tdn]\u0323 > $anusvara;
|
|
n}[tdn] > $anusvara;
|
|
m}[pbm] > $anusvara;
|
|
n}[ylvshr] > $anusvara;
|
|
# Unconditional: m + U+0307 is always anusvara.
m\u0307 > $anusvara;
|
|
|
|
# Latin letters mapped to the Urdu compatibility consonants. Each output is
# "<consonant>|$virama": the "|" inside the output places the cursor before
# the emitted $virama so that the later "$virama <vowel>" rules can still
# match against it.
#urdu compatibility
|
|
q>$uka|$virama;
|
|
k\u0331h\u0331>$ukha |$virama;
|
|
g\u0307> $ugha | $virama;
|
|
z > $ujha |$virama;
|
|
f > $ufa|$virama;
|
|
|
|
# Main Latin consonant table. Every rule emits "<consonant>|$virama", i.e.
# the consonant plus a virama with the cursor left in front of the virama;
# a following vowel is then resolved by the "$virama <vowel>" rules.
# Ordering matters: keys carrying a diacritic or a trailing "h" (aspirate /
# digraph) are listed before the plain single-letter key they start with,
# e.g. "t\u0323h" before "t\u0323", "kh" before "k", "tth" before "th"
# before "t".
# NOTE(review): "dev" presumably stands for Devanagari — confirm.
# dev
|
|
y\u0307>$uya|$virama;
|
|
l\u0331>$ela|$virama;
|
|
n\u0331>$ena|$virama;
|
|
n\u0307>$nga|$virama;
|
|
n\u0303>$nya|$virama;
|
|
n\u0323>$nna|$virama;
|
|
t\u0323h>$ttha|$virama;
|
|
t\u0323>$tta|$virama;
|
|
r\u0323h>$udha|$virama;
|
|
r\u0323>$uddha|$virama;
|
|
d\u0323h>$ddha|$virama;
|
|
d\u0323>$dda|$virama;
|
|
kh>$kha|$virama;
|
|
k>$ka|$virama;
|
|
gh>$gha|$virama;
|
|
g>$ga|$virama;
|
|
ch>$cha|$virama;
|
|
c>$ca|$virama;
|
|
jh>$jha|$virama;
|
|
j>$ja|$virama;
|
|
ny>$nya|$virama;
|
|
tth>$ttha|$virama;
|
|
ddh>$ddha|$virama;
|
|
th>$tha|$virama;
|
|
t>$ta|$virama;
|
|
dh>$dha|$virama;
|
|
d>$da|$virama;
|
|
n>$na|$virama;
|
|
ph>$pha|$virama;
|
|
p>$pa|$virama;
|
|
bh>$bha|$virama;
|
|
b>$ba|$virama;
|
|
m>$ma|$virama;
|
|
y>$ya|$virama;
|
|
r\u0331>$rra|$virama;
|
|
r>$ra|$virama;
|
|
l\u0323>$lla|$virama;
|
|
l>$la|$virama;
|
|
v>$va|$virama;
|
|
w\u0307>$vva|$virama;
|
|
# Plain "w" maps to the same letter as "v".
w>$va|$virama;
|
|
sh>$sha|$virama;
|
|
ss>$ssa|$virama;
|
|
s\u0323>$ssa|$virama;
|
|
s\u0301>$sha|$virama;
|
|
s>$sa|$virama;
|
|
h>$ha|$virama;
|
|
# Punctuation: a literal full stop becomes $danda.
'.'>$danda;
|
|
# An existing $danda followed by a full stop becomes $doubleDanda.
# NOTE(review): the key starts with $danda itself, so this rule can only
# fire while the cursor is on a $danda; the rule above leaves the cursor
# after its output — confirm that ".." actually reaches this rule.
$danda'.'>$doubleDanda;
|
|
# A literal "~" becomes anusvara after a vowel sign in $depVowelAbove and
# chandrabindu after one in $depVowelBelow.
$depVowelAbove{'~'>$anusvara;
|
|
$depVowelBelow{'~'>$chandrabindu;
|
|
# These rules consume the $virama left behind by a consonant rule together
# with the Latin vowel that follows it, and emit the dependent vowel sign.
# "$virama a" emits nothing: the virama is simply removed.
# Lines starting with "#$virama" are disabled alternatives kept for
# reference.
# NOTE(review): the doubled-letter key "ii" is still active here while the
# matching "aa" and "uu" keys are commented out — confirm this asymmetry.
# convert to dependent forms after consonant with no vowel:
|
|
# e.g. kai -> {ka}{virama}ai -> {ka}{ai}
|
|
#$virama aa>$aa;
|
|
$virama a\u0304>$aa;
|
|
$virama ai>$ai;
|
|
$virama au>$au;
|
|
$virama ii>$ii;
|
|
$virama i\u0304>$ii;
|
|
$virama i>$i;
|
|
#$virama uu>$uu;
|
|
$virama u\u0304>$uu;
|
|
$virama u>$u;
|
|
#$virama rrh>$rrh;
|
|
$virama r\u0325\u0304>$rrh;
|
|
#$virama rh>$rh;
|
|
$virama r\u0325a>$rh;
|
|
$virama r\u0325>$rh;
|
|
$virama l\u0325\u0304>$llh;
|
|
$virama lh>$lh;
|
|
$virama l\u0325>$lh;
|
|
$virama e\u0304>$e;
|
|
$virama o\u0304>$o;
|
|
$virama a>;
|
|
$virama e\u0306>$ce;
|
|
$virama o\u0306>$co;
|
|
$virama e>$se;
|
|
$virama o>$so;
|
|
|
|
|
|
# $virama + apostrophe + vowel: the whole key, including the $virama, is
# replaced by the STAND-ALONE ($w-prefixed) vowel. In rule syntax '' denotes
# one literal apostrophe. Lines starting with "#$virama" are disabled
# alternatives kept for reference.
# NOTE(review): the existing comment below shows {virama} in the result,
# but the outputs here do not contain $virama — confirm which is intended.
# otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
|
|
#$virama''aa>$waa;
|
|
$virama''a\u0304>$waa;
|
|
$virama''ai>$wai;
|
|
$virama''au>$wau;
|
|
#$virama''ii>$wii;
|
|
$virama''i\u0304>$wii;
|
|
$virama''i>$wi;
|
|
#$virama''uu>$wuu;
|
|
$virama''u\u0304>$wuu;
|
|
$virama''u>$wu;
|
|
#$virama''rrh>$wrr;
|
|
$virama''r\u0325\u0304>$wrr;
|
|
#$virama''rh>$wr;
|
|
$virama''r\u0325>$wr;
|
|
$virama''l\u0325\u0304>$wll;
|
|
#$virama''lh>$wl;
|
|
$virama''l\u0325>$wl;
|
|
$virama''e\u0304>$we;
|
|
$virama''o\u0304>$wo;
|
|
$virama''a>$wa;
|
|
$virama''e\u0306>$wce;
|
|
$virama''o\u0306>$wco;
|
|
$virama''e>$wse;
|
|
$virama''o>$wso;
|
|
# Apostrophe + vowel with no preceding $virama: the apostrophe and the
# vowel are replaced together by the STAND-ALONE ($w-prefixed) vowel.
# These must precede the catch-all "''>;" rule at the end of the file,
# which deletes any remaining apostrophe.
# no virama
|
|
''a\u0304>$waa;
|
|
''ai>$wai;
|
|
''au>$wau;
|
|
''i\u0304>$wii;
|
|
''i>$wi;
|
|
''u\u0304>$wuu;
|
|
''u>$wu;
|
|
''r\u0325\u0304>$wrr;
|
|
''r\u0325>$wr;
|
|
''l\u0325\u0304>$wll;
|
|
''l\u0325>$wl;
|
|
''e\u0304>$we;
|
|
''o\u0304>$wo;
|
|
''a>$wa;
|
|
''e\u0306>$wce;
|
|
''o\u0306>$wco;
|
|
''e>$wse;
|
|
''o>$wso;
|
|
|
|
# What happens to a $virama that no vowel rule consumed:
#  - before a Latin consonant in $z, or before a space, it is kept
#    (the rule rewrites $virama to itself);
#  - before $danda / $doubleDanda it is removed (empty output).
# The text after "}" is look-ahead context only and is not consumed.
$virama } [$z] > $virama;
|
|
$virama } ' ' > $virama ;
|
|
$virama}$endThing>;
|
|
# ASCII digits map one-to-one onto the InterIndic digit variables.
0>$zero;
|
|
1>$one;
|
|
2>$two;
|
|
3>$three;
|
|
4>$four;
|
|
5>$five;
|
|
6>$six;
|
|
7>$seven;
|
|
8>$eight;
|
|
9>$nine;
|
|
# Catch-all: any apostrophe not consumed by an earlier rule is deleted.
# It must stay after every rule whose key contains '' .
''>;
|
|
#:: NFC (NFD) ;
|