146 lines
5.0 KiB
Plaintext
146 lines
5.0 KiB
Plaintext
|
#--------------------------------------------------------------------
|
|||
|
# Copyright (c) 1999-2004, International Business Machines
|
|||
|
# Corporation and others. All Rights Reserved.
|
|||
|
#--------------------------------------------------------------------
|
|||
|
|
|||
|
# Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
|
|||
|
# Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
|
|||
|
# a) where required for disambiguation.
|
|||
|
# b) with underdot instead of cedilla for letters like SAD, since
|
|||
|
# those are explicitly in Unicode for transliteration.
|
|||
|
# c) with extra non-Arabic-language letters, like PEH
|
|||
|
|
|||
|
# Does *not* do assimilation of "al", nor hyphenation.
|
|||
|
# While it could be done, we need to determine whether a prefix "al" could
|
|||
|
# occur other than as the definite article (since no space is used).
|
|||
|
|
|||
|
# Forward filter: only characters in this set are handed to the rules below.
# It covers the Arabic script plus the superscript n and the explicitly listed
# code points (punctuation, tatweel, combining marks, both digit ranges, U+FDFC).
:: [[:Arabic:] [ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;
|
|||
|
# Normalization step: NFKD when running forward, NFC (in parentheses) when
# running in the reverse direction.
:: NFKD (NFC);
|
|||
|
# Combining marks appended to a Latin letter so that two Arabic characters
# sharing the same base letter stay distinguishable on the way back.
# NOTE(review): the marks look like U+0331, U+0330 and U+0323, which matches
# the marks admitted by the reverse filter at the end of the file - confirm
# with a hex dump before editing these lines.
$disambig = ̱ ;
|
|||
|
$disambig2 = ̰ ;
|
|||
|
$under = ̣ ;
|
|||
|
|
|||
|
# Characters whose canonical combining class is neither 0 nor 230; used by the
# reverse TEH MARBUTA rule to skip over intervening marks.
$notAbove = [[:^ccc=0:]&[:^ccc=230:]];
|
|||
|
|
|||
|
# non-letters
|
|||
|
|
|||
|
# The two numeric separators carry $disambig on the Latin side so that, in
# reverse, '.'/',' followed by $disambig map back to them, while a bare ','
# maps to ARABIC COMMA below.
٫ <> '.' $disambig ; # ARABIC DECIMAL SEPARATOR
|
|||
|
٬ <> ',' $disambig ; # ARABIC THOUSANDS SEPARATOR
|
|||
|
# ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate
|
|||
|
|
|||
|
# Plain punctuation: one-to-one mappings in both directions.
، <> ',' ; # ARABIC COMMA
|
|||
|
؛ <> ';' ; # ARABIC SEMICOLON
|
|||
|
؟ <> '?' ; # ARABIC QUESTION MARK
|
|||
|
٪ <> '%' ; # ARABIC PERCENT SIGN
|
|||
|
|
|||
|
# Extended Arabic-Indic digits map to an ASCII digit followed by $disambig.
# They are listed before the plain Arabic-Indic digits so that, in reverse,
# "digit + $disambig" is matched before the bare digit.
۰ <> 0 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ZERO
|
|||
|
۱ <> 1 $disambig ; # EXTENDED ARABIC-INDIC DIGIT ONE
|
|||
|
۲ <> 2 $disambig ; # EXTENDED ARABIC-INDIC DIGIT TWO
|
|||
|
۳ <> 3 $disambig ; # EXTENDED ARABIC-INDIC DIGIT THREE
|
|||
|
۴ <> 4 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FOUR
|
|||
|
۵ <> 5 $disambig ; # EXTENDED ARABIC-INDIC DIGIT FIVE
|
|||
|
۶ <> 6 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SIX
|
|||
|
۷ <> 7 $disambig ; # EXTENDED ARABIC-INDIC DIGIT SEVEN
|
|||
|
۸ <> 8 $disambig ; # EXTENDED ARABIC-INDIC DIGIT EIGHT
|
|||
|
۹ <> 9 $disambig ; # EXTENDED ARABIC-INDIC DIGIT NINE
|
|||
|
|
|||
|
# Arabic-Indic digits map to the bare ASCII digit, so a bare ASCII digit in
# Latin text converts back to this set.
٠ <> 0 ; # ARABIC-INDIC DIGIT ZERO
|
|||
|
١ <> 1 ; # ARABIC-INDIC DIGIT ONE
|
|||
|
٢ <> 2 ; # ARABIC-INDIC DIGIT TWO
|
|||
|
٣ <> 3 ; # ARABIC-INDIC DIGIT THREE
|
|||
|
٤ <> 4 ; # ARABIC-INDIC DIGIT FOUR
|
|||
|
٥ <> 5 ; # ARABIC-INDIC DIGIT FIVE
|
|||
|
٦ <> 6 ; # ARABIC-INDIC DIGIT SIX
|
|||
|
٧ <> 7 ; # ARABIC-INDIC DIGIT SEVEN
|
|||
|
٨ <> 8 ; # ARABIC-INDIC DIGIT EIGHT
|
|||
|
٩ <> 9 ; # ARABIC-INDIC DIGIT NINE
|
|||
|
|
|||
|
# letters
|
|||
|
|
|||
|
# long vowels
# Each rule pairs a short-vowel mark with the following letter and maps the
# pair to one macron vowel. They precede the single-character rules for the
# same marks and letters so the two-character sequence is tried first.
|
|||
|
َا<> ā ; # ARABIC FATHA, ARABIC LETTER ALEF
|
|||
|
ُو <> ū ; # ARABIC DAMMA, ARABIC LETTER WAW
|
|||
|
ِي <> ī ; # ARABIC KASRA, ARABIC LETTER YEH
|
|||
|
|
|||
|
# longer items moved here to prevent masking
# (their Latin sides begin with a letter that also has a shorter rule further
# down - t, d, s, z, g - so the longer sequence has to be listed first)
|
|||
|
ث <> t h $disambig ; # ARABIC LETTER THEH
|
|||
|
ذ <> d h $disambig ; # ARABIC LETTER THAL
|
|||
|
ش <> s h $disambig ; # ARABIC LETTER SHEEN
|
|||
|
ص <> s $under ; # ARABIC LETTER SAD
|
|||
|
ض <> d $under ; # ARABIC LETTER DAD
|
|||
|
ط <> t $under ; # ARABIC LETTER TAH
|
|||
|
ظ <> z $under ; # ARABIC LETTER ZAH
|
|||
|
غ <> g h $disambig ; # ARABIC LETTER GHAIN
|
|||
|
|
|||
|
# WARNING: special case
|
|||
|
# <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
|
|||
|
# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
|
|||
|
# ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
|
|||
|
|
|||
|
# Simple case: t immediately followed by the diaeresis (U+0308), both directions.
ة <> t \u0308 ; # ARABIC LETTER TEH MARBUTA
|
|||
|
# Reverse only: t, then one or more $notAbove marks (captured as $1), then the
# diaeresis. The output is TEH MARBUTA followed by the captured marks; the
# cursor (|) is placed before $1 so those marks are still converted by the
# rules that follow.
ة | $1 < t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
|
|||
|
|
|||
|
# non-Arabic language
# Every Latin side here contains a disambiguating mark, so each is longer than
# the plain z / n / v / y rules that appear later in the file and is listed
# ahead of them.
|
|||
|
ژ <> z h $disambig ; # ARABIC LETTER JEH
|
|||
|
# Note the mark sits on the n, not on the g.
ڭ <> n $disambig g ; # ARABIC LETTER NG
|
|||
|
ۋ <> v $disambig ; # ARABIC LETTER VE
|
|||
|
# Uses $disambig2 because "y $disambig" is taken by ALEF MAKSURA below.
ی <> y $disambig2 ; # ARABIC LETTER FARSI YEH
|
|||
|
|
|||
|
# Arabic language
# One rule per letter. Where two letters share a Latin base, the one with the
# extra mark ($under / $disambig) is listed before the bare one
# (HAH before HEH, KHAH before KAF, ALEF MAKSURA before YEH).
|
|||
|
|
|||
|
ء <> ʾ ; # ARABIC LETTER HAMZA
|
|||
|
ا <> a $under; # ARABIC LETTER ALEF
|
|||
|
ب <> b ; # ARABIC LETTER BEH
|
|||
|
ت <> t ; # ARABIC LETTER TEH
|
|||
|
ج <> j ; # ARABIC LETTER JEEM
|
|||
|
ح <> h $under ; # ARABIC LETTER HAH
|
|||
|
خ <> k h $disambig ; # ARABIC LETTER KHAH
|
|||
|
د <> d ; # ARABIC LETTER DAL
|
|||
|
ر <> r ; # ARABIC LETTER REH
|
|||
|
ز <> z ; # ARABIC LETTER ZAIN
|
|||
|
س <> s ; # ARABIC LETTER SEEN
|
|||
|
ع <> ʿ ; # ARABIC LETTER AIN
|
|||
|
# Forward only (">"): tatweel is dropped from the Latin output and has no
# reverse mapping.
ـ > ; # ARABIC TATWEEL
|
|||
|
ف <> f ; # ARABIC LETTER FEH
|
|||
|
ق <> q ; # ARABIC LETTER QAF
|
|||
|
ك <> k ; # ARABIC LETTER KAF
|
|||
|
ل <> l ; # ARABIC LETTER LAM
|
|||
|
م <> m ; # ARABIC LETTER MEEM
|
|||
|
ن <> n ; # ARABIC LETTER NOON
|
|||
|
ه <> h ; # ARABIC LETTER HEH
|
|||
|
و <> w ; # ARABIC LETTER WAW
|
|||
|
ى <> y $disambig ; # ARABIC LETTER ALEF MAKSURA
|
|||
|
ي <> y ; # ARABIC LETTER YEH
|
|||
|
# Tanween: vowel plus superscript n. Listed before the bare short vowels so
# the two-character Latin sequence is matched first in reverse.
ً <> aⁿ ; # ARABIC FATHATAN
|
|||
|
ٌ <> uⁿ ; # ARABIC DAMMATAN
|
|||
|
ٍ <> iⁿ ; # ARABIC KASRATAN
|
|||
|
# Short vowels.
َ <> a ; # ARABIC FATHA
|
|||
|
ُ <> u ; # ARABIC DAMMA
|
|||
|
ِ <> i ; # ARABIC KASRA
|
|||
|
# Shadda and sukun each map to a single Latin combining mark.
ّ <> ̃ ; # ARABIC SHADDA
|
|||
|
ْ <> ̊ ; # ARABIC SUKUN
|
|||
|
|
|||
|
# special combining marks
# Each Arabic combining mark maps one-to-one to a Latin combining mark.
|
|||
|
ٓ <> ̂ ; # ARABIC MADDAH ABOVE
|
|||
|
ٔ <> ̉ ; # ARABIC HAMZA ABOVE
|
|||
|
# The Latin side is the "half-ring below" referred to in the TEH MARBUTA
# special-case comment earlier in the file.
ٕ <> ̹ ; # ARABIC HAMZA BELOW
|
|||
|
|
|||
|
# Some non-Arabic language (not in UNGEGN)
|
|||
|
پ <> p ; # ARABIC LETTER PEH
|
|||
|
چ <> c h $disambig ; # ARABIC LETTER TCHEH
|
|||
|
# Bare v maps to VEH; "v $disambig" is used by ARABIC LETTER VE earlier in the file.
ڤ <> v ; # ARABIC LETTER VEH
|
|||
|
# The two rules below are disabled: their Latin sides are either already used
# by an earlier rule (v $disambig) or not otherwise needed.
# ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
|
|||
|
# ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
|
|||
|
گ <> g ; # ARABIC LETTER GAF
|
|||
|
|
|||
|
# fallbacks
# Reverse-only rules ("<") for Latin letters that no rule above produces.
# Each one rewrites the letter to Latin text that does have a mapping; the
# leading "|" puts the cursor before the replacement so it is then converted
# by the rules above.
|
|||
|
# c before e, i or y (the "}" introduces following context) becomes s ...
| s < c } [eiy];
|
|||
|
# ... any other c becomes k.
| k < c ;
|
|||
|
| i < e ;
|
|||
|
| u < o ;
|
|||
|
| ks < x ;
|
|||
|
# A superscript n not consumed by the tanween rules (a/u/i + superscript n)
# is treated as a plain n.
| n < ⁿ;
|
|||
|
|
|||
|
# Reverse direction only: lowercase the Latin input.
:: (lower) ;
|
|||
|
# NFC when running forward, NFD (in parentheses) when running in reverse.
::NFC (NFD);
|
|||
|
# Reverse filter: the Latin script plus the listed punctuation, ASCII digits,
# modifier letters and combining marks are the only characters processed in
# the Latin-to-Arabic direction.
:: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );
|