scuffed-code/icu4c/source/data/translit/t_Arab_Latn.txt

163 lines
5.8 KiB
Plaintext
Raw Normal View History

 // -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2002, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpicurules.bat
// Source: ../../../impl/data/Transliterator_Arabic_Latin.txt
// Date: Sat Jul 27 10:31:01 2002
//--------------------------------------------------------------------
// Arabic_Latin
// Resource entry holding the rule text of the Arabic <-> Latin transliterator.
// The "Rule" value is the sequence of quoted strings below; each string carries
// one line of rule text. Their relative order matters (see "longer items moved
// here to prevent masking" further down), so do not re-sort them.
//
// Notation used by the rules in this file:
//   X <> Y   two-way rule, Arabic text on the left, Latin text on the right
//   X >  Y   one-way rule, Arabic to Latin only
//   X <  Y   one-way rule, Latin to Arabic only
t_Arab_Latn {
Rule {
//--------------------------------------------------------------------
//--------------------------------------------------------------------
//--------------------------------------------------------------------
// Generally follows UNGEGN <http://www.eki.ee/wgrs/rom1_ar.pdf>
// Occasionally deviates in the direction of ISO 233 <http://homepage.mac.com/sirbinks/pdf/Arabic.pdf>
// a) where required for disambiguation.
// b) with underdot instead of cedilla for letters like SAD, since
// those are explicitly in Unicode for transliteration.
// c) with extra non-Arabic-language letters, like PEH
// Does *not* do assimilation of "al", nor hyphenation.
// While it could be done, we need to determine whether a prefix "al" could
// occur other than as the definite article (since no space is used).
//
// Leading "::" lines: a character set, then NFKD with NFC in parentheses.
// NOTE(review): in ICU rule syntax these should be the Arabic-side filter and
// a normalization step whose parenthesized name is used for the reverse
// direction -- confirm against the ICU transform rule documentation.
":: [[:Arabic:] [‎ⁿ\u060C\u061B\u061F\u0640\u064B-\u0655\u0660-\u066C\u06F0-\u06F9\uFDFC]] ;"
":: NFKD (NFC);"
// Variables for the combining marks appended on the Latin side.
// $disambig tags Latin output that would otherwise collide with another rule,
// e.g. "t h $disambig" for THEH versus plain "t" (TEH) followed by "h" (HEH).
// $disambig2 is a second such tag; in this file it is used only for FARSI YEH.
// $under is the underdot mentioned in (b) above.
"$disambig = ̱ ;"
"$disambig2 = ̰ ;"
"$under = ̣ ;"
// $notAbove: characters whose canonical combining class is neither 0 nor 230.
// Used by the TEH MARBUTA special case below.
"$notAbove = [[:^ccc=0:]&[:^ccc=230:]];"
// non-letters
// The Arabic decimal/thousands separators and the EXTENDED ARABIC-INDIC digits
// carry $disambig, while ARABIC COMMA and the ARABIC-INDIC digits map to the
// bare ASCII character; presumably this lets the reverse direction tell them
// apart.
"٫ <> '.' $disambig ;" // ARABIC DECIMAL SEPARATOR
"٬ <> ',' $disambig ;" // ARABIC THOUSANDS SEPARATOR
// ٭ <> ; # ARABIC FIVE POINTED STAR // no need to transliterate
"، <> ',' ;" // ARABIC COMMA
"؛ <> ';' ;" // ARABIC SEMICOLON
"؟ <> '?' ;" // ARABIC QUESTION MARK
"٪ <> '%' ;" // ARABIC PERCENT SIGN
"۰ <> 0 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ZERO
"۱ <> 1 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT ONE
"۲ <> 2 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT TWO
"۳ <> 3 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT THREE
"۴ <> 4 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FOUR
"۵ <> 5 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT FIVE
"۶ <> 6 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SIX
"۷ <> 7 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT SEVEN
"۸ <> 8 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT EIGHT
"۹ <> 9 $disambig ;" // EXTENDED ARABIC-INDIC DIGIT NINE
"٠ <> 0 ;" // ARABIC-INDIC DIGIT ZERO
"١ <> 1 ;" // ARABIC-INDIC DIGIT ONE
"٢ <> 2 ;" // ARABIC-INDIC DIGIT TWO
"٣ <> 3 ;" // ARABIC-INDIC DIGIT THREE
"٤ <> 4 ;" // ARABIC-INDIC DIGIT FOUR
"٥ <> 5 ;" // ARABIC-INDIC DIGIT FIVE
"٦ <> 6 ;" // ARABIC-INDIC DIGIT SIX
"٧ <> 7 ;" // ARABIC-INDIC DIGIT SEVEN
"٨ <> 8 ;" // ARABIC-INDIC DIGIT EIGHT
"٩ <> 9 ;" // ARABIC-INDIC DIGIT NINE
// letters
// long vowels
// A short-vowel mark followed by its matching letter maps to one long vowel.
// These two-character rules are listed ahead of the single-character rules
// for the same marks and letters further down.
"َا<> ā ;" // ARABIC FATHA, ARABIC LETTER ALEF
"ُو <> ū ;" // ARABIC DAMMA, ARABIC LETTER WAW
"ِي <> ī ;" // ARABIC KASRA, ARABIC LETTER YEH
// longer items moved here to prevent masking
// (i.e. multi-character Latin sequences such as "t h $disambig" come before
// the rules for their first letter alone, such as "t")
"ث <> t h $disambig ;" // ARABIC LETTER THEH
"ذ <> d h $disambig ;" // ARABIC LETTER THAL
"ش <> s h $disambig ;" // ARABIC LETTER SHEEN
"ص <> s $under ;" // ARABIC LETTER SAD
"ض <> d $under ;" // ARABIC LETTER DAD
"ط <> t $under ;" // ARABIC LETTER TAH
"ظ <> z $under ;" // ARABIC LETTER ZAH
"غ <> g h $disambig ;" // ARABIC LETTER GHAIN
// WARNING: special case
// <t, umlaut, half-ring below> will be canonically ordered as <t, half-ring below, umlaut>
// so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
// ةٕ < ẗ̹ ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
// The first rule below is the plain two-way mapping to "t" + U+0308.
// The second is Latin-to-Arabic only: it matches "t", one or more $notAbove
// marks (captured as $1), then U+0308, and produces TEH MARBUTA followed by
// "|" and the captured marks.
"ة <> t \u0308 ;" // ARABIC LETTER TEH MARBUTA
"ة | $1 < t ($notAbove+) \u0308 ;" // ARABIC LETTER TEH MARBUTA
// non-Arabic language
"ژ <> z h $disambig ;" // ARABIC LETTER JEH
"ڭ <> n $disambig g ;" // ARABIC LETTER NG
"ۋ <> v $disambig ;" // ARABIC LETTER VE
"ی <> y $disambig2 ;" // ARABIC LETTER FARSI YEH
// Arabic language
"ء <> ʾ ;" // ARABIC LETTER HAMZA
"ا <> a $under;" // ARABIC LETTER ALEF
"ب <> b ;" // ARABIC LETTER BEH
"ت <> t ;" // ARABIC LETTER TEH
"ج <> j ;" // ARABIC LETTER JEEM
"ح <> h $under ;" // ARABIC LETTER HAH
"خ <> k h $disambig ;" // ARABIC LETTER KHAH
"د <> d ;" // ARABIC LETTER DAL
"ر <> r ;" // ARABIC LETTER REH
"ز <> z ;" // ARABIC LETTER ZAIN
"س <> s ;" // ARABIC LETTER SEEN
"ع <> ʿ ;" // ARABIC LETTER AIN
// TATWEEL is the only Arabic-to-Latin-only rule (">"): it maps to nothing and
// has no reverse mapping.
"ـ > ;" // ARABIC TATWEEL
"ف <> f ;" // ARABIC LETTER FEH
"ق <> q ;" // ARABIC LETTER QAF
"ك <> k ;" // ARABIC LETTER KAF
"ل <> l ;" // ARABIC LETTER LAM
"م <> m ;" // ARABIC LETTER MEEM
"ن <> n ;" // ARABIC LETTER NOON
"ه <> h ;" // ARABIC LETTER HEH
"و <> w ;" // ARABIC LETTER WAW
"ى <> y $disambig ;" // ARABIC LETTER ALEF MAKSURA
"ي <> y ;" // ARABIC LETTER YEH
// Tanwin marks map to the short vowel followed by a superscript n.
"ً <> aⁿ ;" // ARABIC FATHATAN
"ٌ <> uⁿ ;" // ARABIC DAMMATAN
"ٍ <> iⁿ ;" // ARABIC KASRATAN
"َ <> a ;" // ARABIC FATHA
"ُ <> u ;" // ARABIC DAMMA
"ِ <> i ;" // ARABIC KASRA
// SHADDA and SUKUN map to Latin combining marks rather than to letters.
"ّ <> ̃ ;" // ARABIC SHADDA
"ْ <> ̊ ;" // ARABIC SUKUN
// special combining marks
"ٓ <> ̂ ;" // ARABIC MADDAH ABOVE
"ٔ <> ̉ ;" // ARABIC HAMZA ABOVE
"ٕ <> ̹ ;" // ARABIC HAMZA BELOW
// Some non-Arabic language (not in UNGEGN)
"پ <> p ;" // ARABIC LETTER PEH
"چ <> c h $disambig ;" // ARABIC LETTER TCHEH
"ڤ <> v ;" // ARABIC LETTER VEH
// ڥ <> v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
// ڢ <> f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
"گ <> g ;" // ARABIC LETTER GAF
// fallbacks
// Latin-to-Arabic-only rules ("<") for Latin input that has no standalone
// rule above: c (s before e/i/y, otherwise k), e, o, x, and a superscript n.
// Each rewrites the input to Latin letters that do have rules.
// NOTE(review): the leading "|" presumably leaves the rewritten text in place
// to be picked up by the rules above -- confirm against ICU rule syntax.
"| s < c } [eiy];"
"| k < c ;"
"| i < e ;"
"| u < o ;"
"| ks < x ;"
"| n < ‎ⁿ;"
// Trailing "::" lines; everything except the leading NFC is parenthesized:
// lower-casing, NFD, and a character set of Latin letters plus the punctuation,
// digits and combining marks produced by the rules above.
// NOTE(review): parenthesized parts presumably apply to the Latin-to-Arabic
// direction only -- confirm against the ICU transform rule documentation.
":: (lower) ;"
"::NFC (NFD);"
":: ( [[:Latin:] [%,.0-9;?\u02BE-\u02BF\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339\u037E]] );"
}
}