123132b8e6
X-SVN-Rev: 16095
529 lines
14 KiB
Plaintext
529 lines
14 KiB
Plaintext
#--------------------------------------------------------------------
|
|
# Copyright (c) 1999-2004, International Business Machines
|
|
# Corporation and others. All Rights Reserved.
|
|
#--------------------------------------------------------------------
|
|
|
|
# InterIndic-Latin
|
|
#\ue000 reserved
|
|
#signs (chandrabindu, anusvara, visarga)
|
|
$chandrabindu=\ue001;
|
|
$anusvara=\ue002;
|
|
$visarga=\ue003;
|
|
#\ue004 reserved
|
|
# w<vowel> represents the stand-alone form
|
|
$wa=\ue005;
|
|
$waa=\ue006;
|
|
$wi=\ue007;
|
|
$wii=\ue008;
|
|
$wu=\ue009;
|
|
$wuu=\ue00a;
|
|
$wr=\ue00b;
|
|
$wl=\ue00c;
|
|
$wce=\ue00d; # LETTER CANDRA E
|
|
$wse=\ue00e; # LETTER SHORT E
|
|
$we=\ue00f; # \u090f LETTER E
|
|
$wai=\ue010;
|
|
$wco=\ue011; # LETTER CANDRA O
|
|
$wso=\ue012; # LETTER SHORT O
|
|
$wo=\ue013; # \u0913 LETTER O
|
|
$wau=\ue014;
|
|
$ka=\ue015;
|
|
$kha=\ue016;
|
|
$ga=\ue017;
|
|
$gha=\ue018;
|
|
$nga=\ue019;
|
|
$ca=\ue01a;
|
|
$cha=\ue01b;
|
|
$ja=\ue01c;
|
|
$jha=\ue01d;
|
|
$nya=\ue01e;
|
|
$tta=\ue01f;
|
|
$ttha=\ue020;
|
|
$dda=\ue021;
|
|
$ddha=\ue022;
|
|
$nna=\ue023;
|
|
$ta=\ue024;
|
|
$tha=\ue025;
|
|
$da=\ue026;
|
|
$dha=\ue027;
|
|
$na=\ue028;
|
|
$ena=\ue029; #compatibility
|
|
$pa=\ue02a;
|
|
$pha=\ue02b;
|
|
$ba=\ue02c;
|
|
$bha=\ue02d;
|
|
$ma=\ue02e;
|
|
$ya=\ue02f;
|
|
$ra=\ue030;
|
|
$vva=\ue081;
|
|
$rra=\ue031;
|
|
$la=\ue032;
|
|
$lla=\ue033;
|
|
$ela=\ue034; #compatibility
|
|
$va=\ue035;
|
|
$sha=\ue036;
|
|
$ssa=\ue037;
|
|
$sa=\ue038;
|
|
$ha=\ue039;
|
|
#\u093a Reserved
|
|
#\u093b Reserved
|
|
$nukta=\ue03c;
|
|
$avagraha=\ue03d; # SIGN AVAGRAHA
|
|
# <vowel> represents the dependent form
|
|
$aa=\ue03e;
|
|
$i=\ue03f;
|
|
$ii=\ue040;
|
|
$u=\ue041;
|
|
$uu=\ue042;
|
|
$rh=\ue043;
|
|
$lh=\ue044;
|
|
$ce=\ue045; #VOWEL SIGN CANDRA E
|
|
$se=\ue046; #VOWEL SIGN SHORT E
|
|
$e=\ue047;
|
|
$ai=\ue048;
|
|
$co=\ue049; # VOWEL SIGN CANDRA O
|
|
$so=\ue04a; # VOWEL SIGN SHORT O
|
|
$o=\ue04b; # \u094b
|
|
$au=\ue04c;
|
|
$virama=\ue04d;
|
|
# \u094e Reserved
|
|
# \u094f Reserved
|
|
$om=\ue050; # OM
|
|
\ue051>; # UNMAPPED STRESS SIGN UDATTA
|
|
\ue052>; # UNMAPPED STRESS SIGN ANUDATTA
|
|
\ue053>; # UNMAPPED GRAVE ACCENT
|
|
\ue054>; # UNMAPPED ACUTE ACCENT
|
|
$lm = \ue055;# Telugu Length Mark
|
|
$ailm=\ue056;# AI Length Mark
|
|
$aulm=\ue057;# AU Length Mark
|
|
#urdu compatibility forms
|
|
$uka=\ue058;
|
|
$ukha=\ue059;
|
|
$ugha=\ue05a;
|
|
$ujha=\ue05b;
|
|
$uddha=\ue05c;
|
|
$udha=\ue05d;
|
|
$ufa=\ue05e;
|
|
$uya=\ue05f;
|
|
$wrr=\ue060;
|
|
$wll=\ue061;
|
|
$rrh=\ue062;
|
|
$llh=\ue063;
|
|
$danda=\ue064;
|
|
$doubleDanda=\ue065;
|
|
$zero=\ue066; # DIGIT ZERO
|
|
$one=\ue067; # DIGIT ONE
|
|
$two=\ue068; # DIGIT TWO
|
|
$three=\ue069; # DIGIT THREE
|
|
$four=\ue06a; # DIGIT FOUR
|
|
$five=\ue06b; # DIGIT FIVE
|
|
$six=\ue06c; # DIGIT SIX
|
|
$seven=\ue06d; # DIGIT SEVEN
|
|
$eight=\ue06e; # DIGIT EIGHT
|
|
$nine=\ue06f; # DIGIT NINE
|
|
|
|
# \u0970>; # UNMAPPED ABBREVIATION SIGN
|
|
# Character classes used as rule context.
# $depVowelAbove / $depVowelBelow: dependent vowel signs selected by code point
# range (\ue03e-\ue040 and \ue045-\ue04c vs. \ue041-\ue044). Neither set is
# referenced by the rules visible in this part of the file.
$depVowelAbove=[\ue03e-\ue040\ue045-\ue04c];
|
|
$depVowelBelow=[\ue041-\ue044];
|
|
# $x was originally called '&'; $z was '%'
|
|
# $x: every dependent vowel variable. Used as after-context (`}$x`) so that a
# consonant followed by a vowel sign is emitted without the inherent "a".
$x=[$aa$ai$au$ii$i$uu$u$rrh$rh$lh$llh$e$o$se$ce$so$co];
|
|
# $z: the lowercase ASCII letters other than a, e, i, o, u. Not referenced by
# the rules visible in this part of the file.
$z=[bcdfghjklmnpqrstvwxyz];
|
|
# $vowels: Latin output characters treated as "vowel" before-context
# (a e i o u r plus combining U+0304, U+0325, U+0306). An independent vowel that
# follows one of these is prefixed with an apostrophe by the `$vowels{...` rules.
$vowels=[aeiour\u0304\u0325\u0306];
|
|
# $forceIndependentMatra: anything that is neither a letter ([:L:]) nor in
# U+0300-U+034C. Used as before-context for dependent vowels, nukta and virama
# that do not follow a letter or a mark in that range.
$forceIndependentMatra = [^[[:L:][\u0300-\u034c]]];
|
|
######################################################################
|
|
# convert from Native letters to Latin letters
|
|
######################################################################
|
|
#transliterations for anusvara
|
|
# $anusvara is rendered according to the consonant that follows it. The
# consonant is only after-context (`}`), so it is not consumed here and is
# transliterated by its own rule later. Rules are tried in order; the last
# rule is the fallback when none of the listed consonants follows.
# NOTE(review): the $ka, $tta, $ta and $pa groups emit the same Latin form as
# their class nasal ($nga, $nna, $na, $ma), but the $ca group emits n\u0304
# while $nya itself is transliterated as n\u0303 - confirm this is intended.
$anusvara} [$ka$kha$ga$gha$nga] > n\u0307;
|
|
$anusvara} [$ca$cha$ja$jha$nya] > n\u0304;
|
|
$anusvara} [$tta$ttha$dda$ddha$nna] > n\u0323;
|
|
$anusvara} [$ta$tha$da$dha$na] > n ;
|
|
$anusvara} [$pa$pha$ba$bha$ma] > m ;
|
|
$anusvara} [$ya$ra$lla$la$va$ssa$sha$sa$ha] > n ;
|
|
# Fallback: anusvara not followed by any consonant listed above.
$anusvara> m\u0307;
|
|
|
|
# Urdu compatibility
|
|
$ya$nukta}$x > y\u0307 ;
|
|
$ya$nukta$virama > y\u0307 ;
|
|
$ya$nukta > y\u0307a ;
|
|
|
|
$la$nukta }$x > l\u0331 ;
|
|
$la$nukta$virama > l\u0331 ;
|
|
$la$nukta > l\u0331a ;
|
|
|
|
$na$nukta }$x > n\u0331 ;
|
|
$na$nukta$virama > n\u0331 ;
|
|
$na$nukta > n\u0331a ;
|
|
|
|
$ena }$x > n\u0331 ;
|
|
$ena$virama > n\u0331 ;
|
|
$ena > n\u0331a ;
|
|
$uka > qa ;
|
|
$ka$nukta }$x > q ;
|
|
$ka$nukta$virama > q ;
|
|
$ka$nukta > qa ;
|
|
$kha$nukta }$x > k\u0331h\u0331 ;
|
|
$kha$nukta$virama > k\u0331h\u0331 ;
|
|
$kha$nukta > k\u0331h\u0331a ;
|
|
$ukha$virama > k\u0331h\u0331;
|
|
$ukha > k\u0331h\u0331a;
|
|
$ugha > g\u0307a ;
|
|
$ga$nukta }$x > g\u0307 ;
|
|
$ga$nukta$virama > g\u0307 ;
|
|
$ga$nukta > g\u0307a ;
|
|
|
|
$ujha > za ;
|
|
$ja$nukta }$x > z ;
|
|
$ja$nukta$virama > z ;
|
|
$ja$nukta > za ;
|
|
$ddha$nukta}$x > r\u0323h ;
|
|
$ddha$nukta$virama > r\u0323h ;
|
|
$ddha$nukta > r\u0323ha;
|
|
|
|
$uddha}$x > r\u0323 ;
|
|
$uddha$virama > r\u0323 ;
|
|
$uddha > r\u0323a;
|
|
|
|
# NOTE(review): unlike the $uddha group just before it, $udha has no `}$x` or
# `$virama` variant, so the inherent "a" is emitted even when a dependent vowel
# or virama follows - confirm this is intended.
$udha > r\u0323a ;
|
|
$dda$nukta}$x > r\u0323 ;
|
|
$dda$nukta$virama > r\u0323 ;
|
|
$dda$nukta > r\u0323a ;
|
|
$pha$nukta }$x > f ;
|
|
$pha$nukta$virama > f ;
|
|
$pha$nukta > fa ;
|
|
$ufa }$x > f ;
|
|
$ufa$virama > f ;
|
|
$ufa > fa ;
|
|
|
|
$ra$nukta}$x > r\u0331;
|
|
$ra$nukta$virama > r\u0331;
|
|
$ra$nukta > r\u0331a;
|
|
$lla$nukta}$x > l\u0331;
|
|
$lla$nukta$virama > l\u0331;
|
|
$lla$nukta > l\u0331a;
|
|
|
|
$ela}$x > l\u0331;
|
|
$ela$virama > l\u0331;
|
|
$ela > l\u0331a;
|
|
|
|
$uya}$x > y\u0307;
|
|
$uya$virama > y\u0307;
|
|
$uya > y\u0307a;
|
|
|
|
|
|
# normal consonants
|
|
$ka$virama}$ha>k'';
|
|
$ka}$x>k;
|
|
$ka$virama>k;
|
|
$ka>ka;
|
|
$kha}$x>kh;
|
|
$kha$virama>kh;
|
|
$kha>kha;
|
|
$ga$virama}$ha>g'';
|
|
$ga}$x>g;
|
|
$ga$virama>g;
|
|
$ga>ga;
|
|
|
|
$gha}$x>gh;
|
|
$gha$virama>gh;
|
|
$gha>gha;
|
|
|
|
$nga}$x>n\u0307;
|
|
$nga$virama>n\u0307;
|
|
$nga>n\u0307a ;
|
|
$ca$virama}$ha>c'';
|
|
$ca}$x>c;
|
|
$ca$virama>c;
|
|
$ca>ca;
|
|
|
|
$cha}$x>ch;
|
|
$cha$virama>ch;
|
|
$cha>cha;
|
|
$ja$virama}$ha>j'';
|
|
$ja}$x>j;
|
|
$ja$virama>j;
|
|
$ja>ja;
|
|
|
|
$jha}$x>jh;
|
|
$jha$virama>jh;
|
|
$jha>jha;
|
|
|
|
$nya }$x>n\u0303 ;
|
|
$nya$virama>n\u0303;
|
|
$nya > n\u0303a ;
|
|
|
|
|
|
$tta$virama}$ha>t\u0323'';
|
|
$tta}$x>t\u0323;
|
|
$tta$virama>t\u0323;
|
|
$tta>t\u0323a;
|
|
|
|
$ttha}$x>t\u0323h;
|
|
$ttha$virama>t\u0323h;
|
|
$ttha>t\u0323ha;
|
|
# $dda -> d\u0323 (with inherent "a" unless a vowel sign or virama follows).
# Rule 1: $dda$virama directly before $ha emits d\u0323 plus a literal
#         apostrophe ('' is an escaped '), so the result is not confused with
#         the $ddha output d\u0323h. The previous pattern `$dda}$x$ha` matched
#         $dda + dependent vowel + $ha instead, which put the apostrophe in
#         front of the vowel and never disambiguated $dda$virama$ha; it now
#         follows the same `$virama}$ha` shape as $tta, $ta, $da, etc.
# Rule 2: before a dependent vowel, emit the bare consonant.
# Rule 3: an explicit virama is consumed and the bare consonant is emitted.
# Rule 4: otherwise emit the consonant with its inherent "a".
$dda$virama}$ha>d\u0323'';
|
|
$dda}$x>d\u0323;
|
|
$dda$virama>d\u0323;
|
|
$dda>d\u0323a;
|
|
|
|
$ddha}$x>d\u0323h;
|
|
$ddha$virama>d\u0323h;
|
|
$ddha>d\u0323ha;
|
|
|
|
$nna}$x>n\u0323 ;
|
|
$nna$virama>n\u0323;
|
|
$nna>n\u0323a ;
|
|
|
|
|
|
$ta$virama}$ha>t'';
|
|
$ta$virama}$ttha>t'';
|
|
$ta$virama}$tta>t'';
|
|
$ta$virama}$tha>t'';
|
|
$ta}$x>t;
|
|
$ta$virama>t;
|
|
$ta>ta;
|
|
$tha}$x>th;
|
|
$tha$virama>th;
|
|
$tha>tha;
|
|
|
|
$da$virama}$ha>d'';
|
|
$da$virama}$ddha>d'';
|
|
$da$virama}$dda>d'';
|
|
$da$virama}$dha>d'';
|
|
$da}$x>d;
|
|
$da$virama>d;
|
|
$da>da;
|
|
$dha}$x>dh;
|
|
$dha$virama>dh;
|
|
$dha>dha;
|
|
$na$virama}$ga>n'';
|
|
$na$virama}$ya>n'';
|
|
$na}$x>n;
|
|
$na$virama>n;
|
|
$na>na;
|
|
|
|
|
|
$pa$virama}$ha>p'';
|
|
$pa}$x>p;
|
|
$pa$virama>p;
|
|
$pa>pa;
|
|
$pha}$x>ph;
|
|
$pha$virama>ph;
|
|
$pha>pha;
|
|
$ba$virama}$ha>b'';
|
|
$ba}$x>b;
|
|
$ba$virama>b;
|
|
$ba>ba;
|
|
|
|
$bha}$x>bh;
|
|
$bha$virama>bh;
|
|
$bha>bha;
|
|
|
|
# $ma$virama before another $ma emits "m" plus a literal apostrophe ('' is an
# escaped '). Without it, $ma$virama$ma would produce "mma", the same output as
# $anusvara followed by $ma (the anusvara rules emit "m" before $ma).
$ma$virama}$ma>m'';
|
|
$ma}$x>m;
|
|
$ma$virama>m;
|
|
$ma>ma;
|
|
|
|
$ya}$x>y;
|
|
$ya$virama>y;
|
|
$ya>ya;
|
|
$ra$virama}$ha>r'';
|
|
$ra}$x>r;
|
|
$ra$virama>r;
|
|
$ra>ra;
|
|
$vva$virama}$ha>w\u0307'';
|
|
$vva}$x>w\u0307;
|
|
$vva$virama>w\u0307;
|
|
$vva>w\u0307a;
|
|
$rra$virama}$ha>r\u0331'';
|
|
$rra}$x>r\u0331;
|
|
$rra$virama>r\u0331;
|
|
$rra>r\u0331a;
|
|
$la$virama}$ha>l'';
|
|
$la}$x>l;
|
|
$la$virama>l;
|
|
$la>la;
|
|
$lla$virama}$ha>l\u0323'';
|
|
$lla}$x>l\u0323;
|
|
$lla$virama>l\u0323;
|
|
$lla>l\u0323a;
|
|
$va}$x>v;
|
|
$va$virama>v;
|
|
$va>va;
|
|
$sa$virama}$ha>s'';
|
|
$sa$virama}$sha>s'';
|
|
$sa$virama}$ssa>s'';
|
|
$sa$virama}$sa>s'';
|
|
$sa}$x>s;
|
|
$sa$virama>s;
|
|
|
|
#for gurmukhi
|
|
$sa$nukta}$x>s\u0301;
|
|
$sa$nukta$virama>s\u0301;
|
|
$sa$nukta>s\u0301a;
|
|
$sa>sa;
|
|
|
|
$sha}$x>s\u0301;
|
|
$sha$virama>s\u0301;
|
|
$sha>s\u0301a;
|
|
|
|
$ssa}$x>s\u0323;
|
|
$ssa$virama>s\u0323;
|
|
$ssa>s\u0323a;
|
|
$ha}$x>h;
|
|
$ha$virama>h;
|
|
$ha>ha;
|
|
|
|
# dependent vowels (should never occur except following consonants)
|
|
$forceIndependentMatra{$aa > \u0314a\u0304 ;
|
|
$forceIndependentMatra{$ai > \u0314ai ;
|
|
$forceIndependentMatra{$au > \u0314au ;
|
|
$forceIndependentMatra{$ii > \u0314i\u0304 ;
|
|
$forceIndependentMatra{$i > \u0314i ;
|
|
$forceIndependentMatra{$uu > \u0314u\u0304 ;
|
|
$forceIndependentMatra{$u > \u0314u ;
|
|
$forceIndependentMatra{$rrh > \u0314r\u0325\u0304 ;
|
|
$forceIndependentMatra{$rh > \u0314r\u0325 ;
|
|
$forceIndependentMatra{$llh > \u0314l\u0325\u0304 ;
|
|
$forceIndependentMatra{$lh > \u0314l\u0325 ;
|
|
$forceIndependentMatra{$e > \u0314e\u0304 ;
|
|
$forceIndependentMatra{$o > \u0314o\u0304 ;
|
|
#extra vowels
|
|
$forceIndependentMatra{$ce > \u0314e\u0306 ;
|
|
$forceIndependentMatra{$co > \u0314o\u0306 ;
|
|
$forceIndependentMatra{$se > \u0314e ;
|
|
$forceIndependentMatra{$so > \u0314o ;
|
|
$forceIndependentMatra{$nukta >; # Nukta cannot appear independently or as first character
|
|
$forceIndependentMatra{$virama >; # Virama cannot appear independently or as first character
|
|
$aa > a\u0304 ;
|
|
$ai > ai ;
|
|
$au > au ;
|
|
$ii > i\u0304 ;
|
|
$i > i ;
|
|
$uu > u\u0304 ;
|
|
$u > u ;
|
|
$rrh > r\u0325\u0304 ;
|
|
$rh > r\u0325 ;
|
|
$llh > l\u0325\u0304 ;
|
|
$lh > l\u0325 ;
|
|
$e > e\u0304 ;
|
|
$o > o\u0304 ;
|
|
#extra vowels
|
|
$ce > e\u0306 ;
|
|
$co > o\u0306 ;
|
|
$se > e ;
|
|
$so > o ;
|
|
#dependent vowels when following independent vowels. Generally illegal; handled only for round-tripping
|
|
$waa} $x > a\u0304\u0314 ;
|
|
$wai} $x > ai\u0314 ;
|
|
$wau} $x > au\u0314 ;
|
|
$wii} $x > i\u0304\u0314 ;
|
|
$wi } $x > i\u0314 ;
|
|
$wuu} $x > u\u0304\u0314 ;
|
|
$wu } $x > u\u0314 ;
|
|
$wrr} $x > r\u0325\u0304\u0314 ;
|
|
$wr } $x > r\u0325\u0314 ;
|
|
$wll} $x > l\u0325\u0304\u0314 ;
|
|
$wl } $x > l\u0325\u0314 ;
|
|
$we } $x > e\u0304\u0314 ;
|
|
$wo } $x > o\u0304\u0314 ;
|
|
$wa } $x > a\u0314 ;
|
|
#extra vowels
|
|
$wce} $x > e\u0306\u0314 ;
|
|
$wco} $x > o\u0306\u0314 ;
|
|
$wse} $x > e\u0314 ;
|
|
$wso} $x > o\u0314 ;
|
|
$om} $x > ''om\u0314 ;
|
|
|
|
# independent vowels when preceded by vowels
|
|
$vowels{$waa > ''a\u0304 ;
|
|
$vowels{$wai > ''ai ;
|
|
$vowels{$wau > ''au ;
|
|
$vowels{$wii > ''i\u0304 ;
|
|
$vowels{$wi > ''i ;
|
|
$vowels{$wuu > ''u\u0304 ;
|
|
$vowels{$wu > ''u ;
|
|
$vowels{$wrr > ''r\u0325\u0304 ;
|
|
$vowels{$wr > ''r\u0325 ;
|
|
$vowels{$wll > ''l\u0325\u0304 ;
|
|
$vowels{$wl > ''l\u0325 ;
|
|
$vowels{$we > ''e\u0304 ;
|
|
$vowels{$wo > ''o\u0304 ;
|
|
$vowels{$wa > ''a ;
|
|
#extra vowels
|
|
$vowels{$wce > ''e\u0306 ;
|
|
$vowels{$wco > ''o\u0306 ;
|
|
$vowels{$wse > ''e ;
|
|
$vowels{$wso > ''o ;
|
|
|
|
# independent vowels (otherwise)
|
|
$waa > a\u0304 ;
|
|
$wai > ai ;
|
|
$wau > au ;
|
|
$wii > i\u0304 ;
|
|
$wi > i ;
|
|
$wuu > u\u0304 ;
|
|
$wu > u ;
|
|
$wrr > r\u0325\u0304 ;
|
|
$wr > r\u0325 ;
|
|
$wll > l\u0325\u0304 ;
|
|
$wl > l\u0325 ;
|
|
$we > e\u0304 ;
|
|
$wo > o\u0304 ;
|
|
$wa > a ;
|
|
#extra vowels
|
|
$wce > e\u0306 ;
|
|
$wco > o\u0306 ;
|
|
$wse > e ;
|
|
$wso > o ;
|
|
$om > ''om ;
|
|
|
|
#other signs (avagraha, chandrabindu, visarga)
|
|
$avagraha > \u0315;
|
|
$chandrabindu$anusvara>\u0303;
|
|
$chandrabindu > m\u0310;
|
|
$visarga>h\u0323;
|
|
#numbers
|
|
$zero > 0;
|
|
$one > 1;
|
|
$two > 2;
|
|
$three > 3;
|
|
$four > 4;
|
|
$five > 5;
|
|
$six > 6;
|
|
$seven > 7;
|
|
$eight > 8;
|
|
$nine > 9;
|
|
$lm >;
|
|
$ailm >;
|
|
$aulm >;
|
|
|
|
$danda>'.';
|
|
$doubleDanda>'.';
|
|
|
|
\ue070>; # ABBREVIATION SIGN
|
|
# LETTER RA WITH MIDDLE DIAGONAL
|
|
\ue071}$x>ra;
|
|
\ue071$virama>r;
|
|
\ue071>ra;
|
|
# LETTER RA WITH LOWER DIAGONAL
|
|
\ue072}$x>ra;
|
|
\ue072$virama>r;
|
|
\ue072>ra;
|
|
|
|
\ue073>; # RUPEE MARK
|
|
\ue074>; # RUPEE SIGN
|
|
\ue075>; # CURRENCY NUMERATOR ONE
|
|
\ue076>; # CURRENCY NUMERATOR TWO
|
|
\ue077>; # CURRENCY NUMERATOR THREE
|
|
\ue078>; # CURRENCY NUMERATOR FOUR
|
|
\ue079>; # CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR
|
|
\ue07A>; # CURRENCY DENOMINATOR SIXTEEN
|
|
\ue07B>; # ISSHAR
|
|
\uE07C>; # TIPPI
|
|
\uE07D>; # ADDAK
|
|
\uE07E>; # IRI
|
|
\uE07F>; # URA
|
|
\uE080>; # EK ONKAR
|
|
\uE004>; # DEVANAGARI VOWEL SIGN SHORT A
|
|
|