#--------------------------------------------------------------------
# Copyright (c) 1999-2004, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------

# Thai-Latin
# This set of rules follows ISO 11940
# see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf
# except that that document does not mention an implicit vowel, so we use ọ
#
# The transcription is fairly ugly, so we ought to also do the UNGEGN version
# see: http://www.eki.ee/wgrs/rom1_th.pdf
# and probably make that the main variant.

# Note: this is an internal file. The NFD/NFC is handled externally, in the index.
# The insertion of spaces between words, the reversal of the vowels
# and the conversion of space to semicolon are done *outside* of these rules.
# So as far as these rules are concerned, the vowels are in logical order!

# insert implicit vowel (and remove it going the other way)
# COMMENTED out: the implicit vowel positions cannot be predicted algorithmically
#$consonant = [ก-ฮ];
#$vowel = [ะ-ฺเ-ไ็];

#{ ( $consonant ) } [^$vowel ] > | $1 ;
# > ọ ;
# < ọ ;

# Helper sets for the reverse ("<") rules. Canonical reordering can place other
# combining marks between a base letter and the accent a rule is looking for;
# these sets describe the marks that may legitimately sit in between.
# $notAbove: characters whose combining class is neither 0 nor "above".
$notAbove = [^\p{ccc=0}\p{ccc=above}] ;
# $notBelow: characters whose combining class is neither 0 nor "below".
# In the rules visible here it is only mentioned in a commented-out definition.
$notBelow = [^\p{ccc=0}\p{ccc=below}] ;

# Consonants
# Warning: the 'h's need to be handled carefully!
# What we really want to say is the following, but we can't
# $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ;

# Since the only accents we care about that could cause problems are free-standing accents below, we use instead:
# NOTE(review): the PHINTHU rule further down maps to a spacing tick rather than
# U+0325 — confirm this set still names the intended character.
$freeStandingBelow = [\u0325 ];
# The two marks that, when they follow an h, make it h-macron (HO HIP) or
# h-dot-below (HO NOKHUK) instead of the second half of a digraph such as "kh".
$hAccent = [ ̄ ̣];
# Following context for a plain digraph: the next character is neither a
# free-standing below mark nor one of the h accents.
$notHAccent0 = [^$freeStandingBelow$hAccent];
# Following context for a plain digraph: a free-standing below mark that is
# itself not followed by one of the h accents.
$notHAccent1 = $freeStandingBelow [^$hAccent];

# HO HIP and HO NOKHUK.
# Forward, ห becomes h + macron. The reverse mapping is a separate rule so that
# any non-"above" marks found between the h and its macron are captured in $1
# and written back after ห, with the cursor (|) placed in front of them so they
# are still processed by the rules.
ห > h̄ ; # THAI CHARACTER HO HIP
ห | $1 < h ($notAbove*) ̄; # backward case, account for reordering
ฮ <> ḥ ; # THAI CHARACTER HO NOKHUK

# K series. The accented digraphs come first, then plain "kh", then bare "k".
# Plain "kh" is read back as ค only when what follows the h does not turn it
# into an accented h: the reverse-only rule covers a free-standing below mark
# after the digraph ($notHAccent1), the two-way rule covers everything else
# that is not an h accent ($notHAccent0).
ข <> k̄h ; # THAI CHARACTER KHO KHAI
ฃ <> ḳ̄h ; # THAI CHARACTER KHO KHUAT
ฅ <> kʹh ; # THAI CHARACTER KHO KHON
ฆ <> ḳh ; # THAI CHARACTER KHO RAKHANG
ค < kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI
ค <> kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI
ก <> k ; # THAI CHARACTER KO KAI

# P series. Accented digraphs first, then plain "ph" (guarded by the same
# "h is not followed by an h accent" contexts used for "kh"), then bare "p".
ภ <> p̣h ; # THAI CHARACTER PHO SAMPHAO
ผ <> p̄h ; # THAI CHARACTER PHO PHUNG
พ < ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN
พ <> ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN
ป <> p ; # THAI CHARACTER PO PLA

# C series. Accented digraphs first, then plain "ch" (guarded by the
# $notHAccent1 / $notHAccent0 following contexts), then bare "c".
ฉ <> c̄h ; # THAI CHARACTER CHO CHING
ฌ <> c̣h ; # THAI CHARACTER CHO CHOE
ช < ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG
ช <> ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG
จ <> c ; # THAI CHARACTER CHO CHAN

# T series. Accented digraphs first, then plain "th" (guarded by the
# $notHAccent1 / $notHAccent0 following contexts), then the single-letter forms.
ฐ <> ṭ̄h ; # THAI CHARACTER THO THAN
ฑ <> ṯh ; # THAI CHARACTER THO NANGMONTHO
ฒ <> tʹh ; # THAI CHARACTER THO PHUTHAO
ถ <> t̄h ; # THAI CHARACTER THO THUNG
ธ <> ṭh ; # THAI CHARACTER THO THONG
ท < th } $notHAccent1 ; # THAI CHARACTER THO THAHAN
ท <> th } $notHAccent0 ; # THAI CHARACTER THO THAHAN
#Note: TO PATAK deviates from ISO since t-dotunder + h would be ambiguous. So it uses vertical tick.
ฏ <> t̩ ; # THAI CHARACTER TO PATAK
ต <> t ; # THAI CHARACTER TO TAO

# Nasals. The digraph "ng" is listed ahead of the single-letter n forms.
# since there is no singleton g (generated), don't worry about that.
ง <> ng ; # THAI CHARACTER NGO NGU
ณ <> ṇ ; # THAI CHARACTER NO NEN
น <> n ; # THAI CHARACTER NO NU

# YO YING and the D consonants: one-to-one, two-way mappings, dotted forms
# listed ahead of the plain letter.
ญ <> ỵ ; # THAI CHARACTER YO YING
ฎ <> ḍ ; # THAI CHARACTER DO CHADA
ด <> d ; # THAI CHARACTER DO DEK

# BO BAIMAI and FO FA.
# The two-way ฝ rule handles f immediately followed by a macron; the extra
# reverse-only rule also accepts non-"above" marks between the f and the
# macron, writing them back after ฝ with the cursor (|) in front of them.
บ <> b ; # THAI CHARACTER BO BAIMAI
ฝ <> f̄ ; # THAI CHARACTER FO FA
ฝ | $1 < f ($notAbove*) ̄; # backward case, account for reordering

# Sonorants and the vocalic letters RU / LU: one-to-one, two-way mappings.
ม <> m ; # THAI CHARACTER MO MA
ย <> y ; # THAI CHARACTER YO YAK
ร <> r ; # THAI CHARACTER RO RUA
ฤ <> v ; # THAI CHARACTER RU
ฦ <> ł ; # THAI CHARACTER LU
ว <> w ; # THAI CHARACTER WO WAEN

# Sibilants built on s + macron. Each reverse-only rule accepts non-"above"
# marks between the base (s, or s + dot below) and the macron, captures them
# in $1 and writes them back after the Thai letter with the cursor (|) in
# front of them.
ศ <> ṣ̄ ; # THAI CHARACTER SO SALA***
ศ | $1 < s ̣ ($notAbove*) ̄; # backward case, account for reordering
ษ <> s̄ʹ ; # THAI CHARACTER SO RUSI
ส > s̄ ; # THAI CHARACTER SO SUA***
ส | $1 < s ($notAbove*) ̄; # backward case, account for reordering

# Remaining single-letter consonants. The plain f and s rules sit below the
# f-macron and s-macron rules that appear earlier in the file.
ฬ <> ḷ ; # THAI CHARACTER LO CHULA
ล <> l ; # THAI CHARACTER LO LING
ฟ <> f ; # THAI CHARACTER FO FAN

อ <> x ; # THAI CHARACTER O ANG
ซ <> s ; # THAI CHARACTER SO SO

# vowels
# Accented forms are listed ahead of the bare letter they are built on
# (a-dot-below, a-macron and a-hook before a; u-dot-below and u-macron before u).
# Wherever the Latin form ends in an above mark, a reverse-only rule
# additionally accepts non-"above" marks between the base and that mark,
# captures them in $1 and writes them back after the Thai vowel with the
# cursor (|) in front of them.

ั <> ạ ; # THAI CHARACTER MAI HAN-AKAT

า > ā ; # THAI CHARACTER SARA AA
า | $1 < a ($notAbove*) ̄; # backward case, account for reordering

# We deviate from ISO for SARA AM for disambiguation
ำ > a ̉; # THAI CHARACTER SARA AM
ำ | $1 < a ($notAbove*) ̉ ; # backward case, account for reordering

ะ <> a ; # THAI CHARACTER SARA A
ี <> ī ; # THAI CHARACTER SARA II
ี | $1 < i ($notAbove*) ̄ ; # backward case, account for reordering

ื <> ụ̄ ; # THAI CHARACTER SARA UEE
ื | $1 < u ̣ ($notAbove*) ̄ ; # backward case, account for reordering

ึ <> ụ ; # THAI CHARACTER SARA UE
ู <> ū ; # THAI CHARACTER SARA UU
ู | $1 < u ($notAbove*) ̄ ; # backward case, account for reordering

ุ <> u ; # THAI CHARACTER SARA U

# PAIYANNOI maps to the double dagger in both directions.
ฯ <> ‡ ; # THAI CHARACTER PAIYANNOI

# The BAHT sign has no mapping; the rule below is a disabled placeholder.
# ฿ <> XXX ; # THAI CURRENCY SYMBOL BAHT

# Leading vowels and LAKKHANGYAO: one-to-one, two-way mappings to Latin letters.
เ <> e ; # THAI CHARACTER SARA E
แ <> æ ; # THAI CHARACTER SARA AE
โ <> o ; # THAI CHARACTER SARA O
ใ <> ı ; # THAI CHARACTER SARA AI MAIMUAN
ไ <> ị ; # THAI CHARACTER SARA AI MAIMALAI
ๅ <> ɨ ; # THAI CHARACTER LAKKHANGYAO
# Thai combining signs: each maps to a single Latin combining mark, except
# YAMAKKAN, which maps to a literal tilde (quoted so it is taken literally).
็ <> ̆ ; # THAI CHARACTER MAITAIKHU
่ <> ̀ ; # THAI CHARACTER MAI EK
้ <> ̂ ; # THAI CHARACTER MAI THO
๊ <> ́ ; # THAI CHARACTER MAI TRI
๋ <> ̌ ; # THAI CHARACTER MAI CHATTAWA
์ <> ̒ ; # THAI CHARACTER THANTHAKHAT
๎ <> '~' ; # THAI CHARACTER YAMAKKAN

# We deviate from ISO for disambiguation
# NIKHAHIT is written as a combining ring above.
ํ <> ̊ ; # THAI CHARACTER NIKHAHIT

๏ <> § ; # THAI CHARACTER FONGMAN

# Thai digits map one-to-one to the ASCII digits in both directions.
๐ <> 0 ; # THAI DIGIT ZERO
๑ <> 1 ; # THAI DIGIT ONE
๒ <> 2 ; # THAI DIGIT TWO
๓ <> 3 ; # THAI DIGIT THREE
๔ <> 4 ; # THAI DIGIT FOUR
๕ <> 5 ; # THAI DIGIT FIVE
๖ <> 6 ; # THAI DIGIT SIX
๗ <> 7 ; # THAI DIGIT SEVEN
๘ <> 8 ; # THAI DIGIT EIGHT
๙ <> 9 ; # THAI DIGIT NINE

# Punctuation. The two vertical bars are quoted because an unquoted | is the
# cursor operator in rule syntax.
๚ <> '||' ; # THAI CHARACTER ANGKHANKHU

๛ <> » ; # THAI CHARACTER KHOMUT
ๆ <> « ; # THAI CHARACTER MAIYAMOK

# moved down to make shorter first
# NOTE(review): presumably these sit below the accented-i rules so that the
# longer Latin sequences are tried before bare "i" — confirm.
#Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below.
ฺ <> ˌ ; # THAI CHARACTER PHINTHU
ิ <> i ; # THAI CHARACTER SARA I

# fallbacks
# Reverse-only rules for Latin letters that have no stand-alone mapping above.
# Each one rewrites the letter to one that does have a Thai mapping and leaves
# the cursor (|) in front of the replacement, so the regular rules then
# convert it.

| k < g ;
| k < h ;
| c < j ;
| k < q ;
| s < z ;

# Parenthesized transform rule: per ICU rule syntax it takes effect only in
# the reverse (Latin-to-Thai) direction, where it lowercases the text.
:: (lower);