163 lines
5.8 KiB
Plaintext
163 lines
5.8 KiB
Plaintext
|
// -*- Coding: utf-8; -*-
|
|||
|
//--------------------------------------------------------------------
|
|||
|
// Copyright (c) 1999-2002, International Business Machines
|
|||
|
// Corporation and others. All Rights Reserved.
|
|||
|
//--------------------------------------------------------------------
|
|||
|
// THIS IS A MACHINE-GENERATED FILE
|
|||
|
// Tool: dumpicurules.bat
|
|||
|
// Source: ../../../impl/data/Transliterator_Arabic_Latin.txt
|
|||
|
// Date: Sat Jul 27 10:31:01 2002
|
|||
|
//--------------------------------------------------------------------
|
|||
|
|
|||
|
// Arabic_Latin
|
|||
|
|
|||
|
t_Arab_Latn {
|
|||
|
Rule {
|
|||
|
//--------------------------------------------------------------------
|
|||
|
//--------------------------------------------------------------------
|
|||
|
//--------------------------------------------------------------------
|
|||
|
|
|||
|
// Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
|
|||
|
// Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
|
|||
|
// a) where required for disambiguation.
|
|||
|
// b) with underdot instead of cedilla for letters like SAD, since
|
|||
|
// those are explicitly in Unicode for transliteration.
|
|||
|
// c) with extra non-Arabic-language letters, like PEH
|
|||
|
|
|||
|
// Does *not* do assimilation of "al", nor hyphenation.
|
|||
|
// While it could be done, we need to determine whether a prefix "al" could
|
|||
|
// occur other than as the definite article (since no space is used).
|
|||
|
|
|||
|
":: [[:Arabic:] [ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;"
|
|||
|
":: NFKD (NFC);"
|
|||
|
"$disambig = ̱ ;"
|
|||
|
"$disambig2 = ̰ ;"
|
|||
|
"$under = ̣ ;"
|
|||
|
|
|||
|
"$notAbove = [[:^ccc=0:]&[:^ccc=230:]];"
|
|||
|
|
|||
|
// non-letters
|
|||
|
|
|||
|
"٫ <> '.' $disambig ;" // ARABIC DECIMAL SEPARATOR
|
|||
|
"٬ <> ',' $disambig ;" // ARABIC THOUSANDS SEPARATOR
|
|||
|
// ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate
|
|||
|
|
|||
|
"، <> ',' ;" // ARABIC COMMA
|
|||
|
"؛ <> ';' ;" // ARABIC SEMICOLON
|
|||
|
"؟ <> '?' ;" // ARABIC QUESTION MARK
|
|||
|
"٪ <> '%' ;" // ARABIC PERCENT SIGN
|
|||
|
|
|||
|
"۰ <> 0 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ZERO
|
|||
|
"۱ <> 1 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ONE
|
|||
|
"۲ <> 2 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT TWO
|
|||
|
"۳ <> 3 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT THREE
|
|||
|
"۴ <> 4 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FOUR
|
|||
|
"۵ <> 5 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FIVE
|
|||
|
"۶ <> 6 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SIX
|
|||
|
"۷ <> 7 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SEVEN
|
|||
|
"۸ <> 8 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT EIGHT
|
|||
|
"۹ <> 9 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT NINE
|
|||
|
|
|||
|
"٠ <> 0 ;" // ARABIC-INDIC DIGIT ZERO
|
|||
|
"١ <> 1 ;" // ARABIC-INDIC DIGIT ONE
|
|||
|
"٢ <> 2 ;" // ARABIC-INDIC DIGIT TWO
|
|||
|
"٣ <> 3 ;" // ARABIC-INDIC DIGIT THREE
|
|||
|
"٤ <> 4 ;" // ARABIC-INDIC DIGIT FOUR
|
|||
|
"٥ <> 5 ;" // ARABIC-INDIC DIGIT FIVE
|
|||
|
"٦ <> 6 ;" // ARABIC-INDIC DIGIT SIX
|
|||
|
"٧ <> 7 ;" // ARABIC-INDIC DIGIT SEVEN
|
|||
|
"٨ <> 8 ;" // ARABIC-INDIC DIGIT EIGHT
|
|||
|
"٩ <> 9 ;" // ARABIC-INDIC DIGIT NINE
|
|||
|
|
|||
|
// letters
|
|||
|
|
|||
|
// long vowels
|
|||
|
"َا<> ā ;" // ARABIC FATHA, ARABIC LETTER ALEF
|
|||
|
"ُو <> ū ;" // ARABIC DAMMA, ARABIC LETTER WAW
|
|||
|
"ِي <> ī ;" // ARABIC KASRA, ARABIC LETTER YEH
|
|||
|
|
|||
|
// longer items moved here so that the shorter rules below do not mask them
|
|||
|
"ث <> t h $disambig ;" // ARABIC LETTER THEH
|
|||
|
"ذ <> d h $disambig ;" // ARABIC LETTER THAL
|
|||
|
"ش <> s h $disambig ;" // ARABIC LETTER SHEEN
|
|||
|
"ص <> s $under ;" // ARABIC LETTER SAD
|
|||
|
"ض <> d $under ;" // ARABIC LETTER DAD
|
|||
|
"ط <> t $under ;" // ARABIC LETTER TAH
|
|||
|
"ظ <> z $under ;" // ARABIC LETTER ZAH
|
|||
|
"غ <> g h $disambig ;" // ARABIC LETTER GHAIN
|
|||
|
|
|||
|
// WARNING: special case
|
|||
|
// <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
|
|||
|
// so in the reverse direction, we have to skip over (but preserve) the half-ring below (or other marks like it)
|
|||
|
// ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
|
|||
|
|
|||
|
"ة <> t \u0308 ;" // ARABIC LETTER TEH MARBUTA
|
|||
|
"ة | $1 < t ($notAbove+) \u0308 ;" // ARABIC LETTER TEH MARBUTA
|
|||
|
|
|||
|
// non-Arabic language
|
|||
|
"ژ <> z h $disambig ;" // ARABIC LETTER JEH
|
|||
|
"ڭ <> n $disambig g ;" // ARABIC LETTER NG
|
|||
|
"ۋ <> v $disambig ;" // ARABIC LETTER VE
|
|||
|
"ی <> y $disambig2 ;" // ARABIC LETTER FARSI YEH
|
|||
|
|
|||
|
// Arabic language
|
|||
|
|
|||
|
"ء <> ʾ ;" // ARABIC LETTER HAMZA
|
|||
|
"ا <> a $under;" // ARABIC LETTER ALEF
|
|||
|
"ب <> b ;" // ARABIC LETTER BEH
|
|||
|
"ت <> t ;" // ARABIC LETTER TEH
|
|||
|
"ج <> j ;" // ARABIC LETTER JEEM
|
|||
|
"ح <> h $under ;" // ARABIC LETTER HAH
|
|||
|
"خ <> k h $disambig ;" // ARABIC LETTER KHAH
|
|||
|
"د <> d ;" // ARABIC LETTER DAL
|
|||
|
"ر <> r ;" // ARABIC LETTER REH
|
|||
|
"ز <> z ;" // ARABIC LETTER ZAIN
|
|||
|
"س <> s ;" // ARABIC LETTER SEEN
|
|||
|
"ع <> ʿ ;" // ARABIC LETTER AIN
|
|||
|
"ـ > ;" // ARABIC TATWEEL
|
|||
|
"ف <> f ;" // ARABIC LETTER FEH
|
|||
|
"ق <> q ;" // ARABIC LETTER QAF
|
|||
|
"ك <> k ;" // ARABIC LETTER KAF
|
|||
|
"ل <> l ;" // ARABIC LETTER LAM
|
|||
|
"م <> m ;" // ARABIC LETTER MEEM
|
|||
|
"ن <> n ;" // ARABIC LETTER NOON
|
|||
|
"ه <> h ;" // ARABIC LETTER HEH
|
|||
|
"و <> w ;" // ARABIC LETTER WAW
|
|||
|
"ى <> y $disambig ;" // ARABIC LETTER ALEF MAKSURA
|
|||
|
"ي <> y ;" // ARABIC LETTER YEH
|
|||
|
"ً <> aⁿ ;" // ARABIC FATHATAN
|
|||
|
"ٌ <> uⁿ ;" // ARABIC DAMMATAN
|
|||
|
"ٍ <> iⁿ ;" // ARABIC KASRATAN
|
|||
|
"َ <> a ;" // ARABIC FATHA
|
|||
|
"ُ <> u ;" // ARABIC DAMMA
|
|||
|
"ِ <> i ;" // ARABIC KASRA
|
|||
|
"ّ <> ̃ ;" // ARABIC SHADDA
|
|||
|
"ْ <> ̊ ;" // ARABIC SUKUN
|
|||
|
|
|||
|
// special combining marks
|
|||
|
"ٓ <> ̂ ;" // ARABIC MADDAH ABOVE
|
|||
|
"ٔ <> ̉ ;" // ARABIC HAMZA ABOVE
|
|||
|
"ٕ <> ̹ ;" // ARABIC HAMZA BELOW
|
|||
|
|
|||
|
// Some non-Arabic-language letters (not in UNGEGN)
|
|||
|
"پ <> p ;" // ARABIC LETTER PEH
|
|||
|
"چ <> c h $disambig ;" // ARABIC LETTER TCHEH
|
|||
|
"ڤ <> v ;" // ARABIC LETTER VEH
|
|||
|
// ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
|
|||
|
// ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
|
|||
|
"گ <> g ;" // ARABIC LETTER GAF
|
|||
|
|
|||
|
// fallbacks
|
|||
|
"| s < c } [eiy];"
|
|||
|
"| k < c ;"
|
|||
|
"| i < e ;"
|
|||
|
"| u < o ;"
|
|||
|
"| ks < x ;"
|
|||
|
"| n < ⁿ;"
|
|||
|
|
|||
|
":: (lower) ;"
|
|||
|
"::NFC (NFD);"
|
|||
|
":: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );"
|
|||
|
}
|
|||
|
}
|