scuffed-code/icu4c/source/data/translit/ThaiLogical_Latin.txt
2004-08-02 20:06:55 +00:00

188 lines
6.3 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# Thai-Latin
# This set of rules follows ISO 11940
# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
# except that the standard does not mention an implicit vowel, so we use ọ
#
# The transcription is fairly ugly, so we ought to also do the UNGEGN version
# see: http://www.eki.ee/wgrs/rom1_th.pdf
# and probably make that the main variant.
# Note: this is an internal file. The NFD/NFC is handled externally, in the index
# The insertion of spaces between words, the reversal of the vowels
# and the conversion of space to semicolon are done *outside* of these rules.
# So as far as these rules are concerned, the vowels are in logical order!
# insert implicit vowel (and remove it going the other way)
# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
#$consonant = [ก-ฮ];
#$vowel = [ะ-ฺเ-ไ็];
#{ ( $consonant ) } [^$vowel ] > | $1  ;
# > ọ ;
# < ọ ;
# Characters whose canonical combining class is neither 0 nor "above" (resp. "below").
# $notAbove is used by the backward rules to skip marks that canonical reordering
# may have placed between a base letter and its accent above.
$notAbove = [^\p{ccc=0}\p{ccc=above}] ;
$notBelow = [^\p{ccc=0}\p{ccc=below}] ;
# Consonants
# Warning: the 'h's need to be handled carefully!
# What we really want to say is the following, but we can't
# $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ;
# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
$freeStandingBelow = [\u0325 ];
# Accents that may attach to a preceding 'h'.
# Fix: this definition was missing its terminating ';', which ran it into the
# $notHAccent0 statement on the next line.
$hAccent = [ ̄ ̣] ;
# Any single character that is neither a free-standing below accent nor an h accent.
$notHAccent0 = [^$freeStandingBelow$hAccent];
# A free-standing below accent followed by a character that is not an h accent.
$notHAccent1 = $freeStandingBelow [^$hAccent];
# Consonant rules.
# "X > Y" applies Thai-to-Latin only, "X < Y" Latin-to-Thai only, "X <> Y" both ways.
# Rules are tried in file order, so multi-character Latin sequences (e.g. the aspirated
# "...h" forms) are listed before the shorter sequences they begin with.
# In the "backward case" rules, ($notAbove*) captures any marks sitting between the
# base letter and its macron; they are re-emitted as $1 after the Thai letter, and '|'
# puts the cursor in front of them so they are processed next.
ห > h̄ ; # THAI CHARACTER HO HIP
ห | $1 < h ($notAbove*) ̄; # backward case, account for reordering
ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK
ข <> k̄h ; # THAI CHARACTER KHO KHAI
ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT
ฅ <> kʹh ; # THAI CHARACTER KHO KHON
ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG
# Plain "kh" is converted back only when the 'h' is not followed by an h accent:
# either a free-standing below accent plus a non-h-accent ($notHAccent1), or any
# other non-accent character ($notHAccent0). The same pattern recurs for ph, ch, th.
ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI
ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI
ก <> k ; # THAI CHARACTER KO KAI
ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO
ผ <> p̄h ; # THAI CHARACTER PHO PHUNG
พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN
พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN
ป <> p ; # THAI CHARACTER PO PLA
ฉ <> c̄h ; # THAI CHARACTER CHO CHING
ฌ <> c̣h ; # THAI CHARACTER CHO CHOE
ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG
ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG
จ <> c ; # THAI CHARACTER CHO CHAN
ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN
ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO
ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO
ถ <> t̄h ; # THAI CHARACTER THO THUNG
ธ <> ṭh ; # THAI CHARACTER THO THONG
ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
# Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses a vertical tick.
ฏ <> t̩ ; # THAI CHARACTER TO PATAK
ต <> t ; # THAI CHARACTER TO TAO
# since there is no singleton g (generated), don't worry about that.
ง <> ng ; # THAI CHARACTER NGO NGU
ณ <> ṇ ; # THAI CHARACTER NO NEN
น <> n ; # THAI CHARACTER NO NU
ญ <> ỵ ; # THAI CHARACTER YO YING
ฎ <> ḍ ; # THAI CHARACTER DO CHADA
ด <> d ; # THAI CHARACTER DO DEK
บ <> b ; # THAI CHARACTER BO BAIMAI
ฝ <> f̄ ; # THAI CHARACTER FO FA
ฝ | $1 < f ($notAbove*) ̄; # backward case, account for reordering
ม <> m ; # THAI CHARACTER MO MA
ย <> y ; # THAI CHARACTER YO YAK
ร <> r ; # THAI CHARACTER RO RUA
ฤ <> v ; # THAI CHARACTER RU
ฦ <> ł ; # THAI CHARACTER LU
ว <> w ; # THAI CHARACTER WO WAEN
ศ <> ṣ̄ ; # THAI CHARACTER SO SALA***
ศ | $1 < s ̣ ($notAbove*) ̄; # backward case, account for reordering
ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI
ส > s̄ ; # THAI CHARACTER SO SUA***
ส | $1 < s ($notAbove*) ̄; # backward case, account for reordering
ฬ <> ḷ ; # THAI CHARACTER LO CHULA
ล <> l ; # THAI CHARACTER LO LING
ฟ <> f ; # THAI CHARACTER FO FAN
อ <> x ; # THAI CHARACTER O ANG
ซ <> s ; # THAI CHARACTER SO SO
# vowels
# Vowels, tone marks and other signs. As with the consonants, a "backward case" rule
# follows each macron/hook form: it lets marks matched by ($notAbove*) sit between the
# base letter and the accent, re-emits them as $1, and leaves the cursor ('|') before them.
# Longer Latin forms (e.g. a + macron) come before the bare letter (a) so they win.
ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT
า > ā ; # THAI CHARACTER SARA AA
า | $1 < a ($notAbove*) ̄; # backward case, account for reordering
# We deviate from ISO for SARA AM for disambiguation
ำ > a ̉; # THAI CHARACTER SARA AM
ำ | $1 < a ($notAbove*) ̉ ; # backward case, account for reordering
ะ <> a ; # THAI CHARACTER SARA A
ี <> ī ; # THAI CHARACTER SARA II
ี | $1 < i ($notAbove*) ̄ ; # backward case, account for reordering
ื <> ụ̄ ; # THAI CHARACTER SARA UEE
ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case, account for reordering
ึ <> ụ ; # THAI CHARACTER SARA UE
ู <> ū ; # THAI CHARACTER SARA UU
ู | $1 < u ($notAbove*) ̄ ; # backward case, account for reordering
ุ <> u ; # THAI CHARACTER SARA U
ฯ <> ‡ ; # THAI CHARACTER PAIYANNOI
# ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT
# Leading vowels; per the file header these are already in logical order here.
เ <> e ; # THAI CHARACTER SARA E
แ <> æ ; # THAI CHARACTER SARA AE
โ <> o ; # THAI CHARACTER SARA O
ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN
ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI
ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO
# Thai marks that map to a bare Latin combining mark.
็ <> ̆ ; # THAI CHARACTER MAITAIKHU
่ <> ̀ ; # THAI CHARACTER MAI EK
้ <> ̂ ; # THAI CHARACTER MAI THO
๊ <> ́ ; # THAI CHARACTER MAI TRI
๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA
์ <> ̒ ; # THAI CHARACTER THANTHAKHAT
# The tilde is quoted so that it is taken as a literal character.
๎ <> '~' ; # THAI CHARACTER YAMAKKAN
# We deviate from ISO for disambiguation
ํ <> ̊ ; # THAI CHARACTER NIKHAHIT
๏ <> § ; # THAI CHARACTER FONGMAN
# Thai digits map one-to-one onto ASCII digits in both directions.
# Fix: the rule for zero had lost its left-hand side (THAI DIGIT ZERO, U+0E50),
# leaving a rule with an empty Thai side.
๐ <> 0 ; # THAI DIGIT ZERO
๑ <> 1 ; # THAI DIGIT ONE
๒ <> 2 ; # THAI DIGIT TWO
๓ <> 3 ; # THAI DIGIT THREE
๔ <> 4 ; # THAI DIGIT FOUR
๕ <> 5 ; # THAI DIGIT FIVE
๖ <> 6 ; # THAI DIGIT SIX
๗ <> 7 ; # THAI DIGIT SEVEN
๘ <> 8 ; # THAI DIGIT EIGHT
๙ <> 9 ; # THAI DIGIT NINE
# Punctuation-like signs. '||' is quoted because a bare '|' is the cursor marker in rules.
๚ <> '||' ; # THAI CHARACTER ANGKHANKHU
๛ <> » ; # THAI CHARACTER KHOMUT
ๆ <> « ; # THAI CHARACTER MAIYAMOK
# moved down to make shorter first
# Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
ฺ <> ˌ ; # THAI CHARACTER PHINTHU
# Bare 'i' is placed after every accented 'i' rule so those longer forms match first.
ิ <> i ; # THAI CHARACTER SARA I
# fallbacks
# Reverse-only (Latin-to-Thai) rules for Latin letters that reach this point unmatched.
# Each rewrites the letter to one the main rules handle; the leading '|' puts the
# cursor before the replacement so it is then transliterated by those rules.
| k < g ;
# A bare 'h' gets here only if no earlier rule consumed it.
| k < h ;
| c < j ;
| k < q ;
| s < z ;
# NOTE(review): in ICU '::' syntax a name in parentheses applies only to the reverse
# direction, i.e. Latin input is lowercased when converting to Thai. Confirm against
# the ICU rule documentation.
:: (lower);