scuffed-code/icu4c/source/data/translit/Latn_Kana.txt
2016-09-19 05:09:40 +00:00

389 lines
10 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# © 2016 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html#License
#
# File: Latn_Kana.txt
# Generated from CLDR
#
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn. Small changes to make unambiguous.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yu: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# | de ~i di
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e he
# | hu ~o ho
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
# Variables used for doubled-consonants with tsu
$kana = [ぁ-ゔ] ;
$voice = [\u3099゛];
$semivoice = [\u309A゜];
$k_start = [カキクケコかきくけこ] ;
$s_start = [サシスセソさしすせそ] ;
$j_start = [シし] $voice ;
$t_start = [タチツテトたちつてと] ;
$n_start = [ナニヌネノンなにぬねの] ;
$h_start = [ハヒヘホはひへほ] ;
$f_start = [フふ] ;
$m_start = [マミムメモまみむめも] ;
$y_start = [ヤユヨやゆよ] ;
$r_start = [ラリルレロらりるれろ] ;
$w_start = [ワヰヱヲわゐゑを] ;
$v_start = [ワヰヱヲ]\u3099 ;
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア ! = ナ, so represent as "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ヤ ユ ヨ ン] ;
$small_y = [ャィュェョ] ;
$iteration = ゝ ;
#------------------------------------------------------
# katakana rules
# Punctuation
'.' ↔ 。;
',' ↔ 、;
# ' ' } [a-z] → ; # delete spaces before latin
# ' ' ← [^' '-ヿ] {} ['-ヿ] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter § marks
# TODO
# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
# ~~~ begin shared rules ~~~
#special
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
#normal
a ↔ ア ;
b | '~' ← ヒ \u3099} $small_y ;
by } $vowel → ヒ\u3099 | '~y' ;
ba ↔ ハ\u3099 ;
bi ↔ ヒ\u3099 ;
bu ↔ フ\u3099 ;
be ↔ ヘ\u3099 ;
bo ↔ ホ\u3099 ;
c } i → | s ;
c } e → | s ;
da ↔ タ\u3099 ;
di ↔ テ\u3099ィ ;
du ↔ テ\u3099ゥ ;
de ↔ テ\u3099 ;
do ↔ ト\u3099 ;
dzu ↔ ツ\u3099 ;
dja ← チ\u3099ャ ;
dji'~i' ← チ\u3099ィ ; # liu
dju ← チ\u3099ュ ;
dje ← チ\u3099ェ ;
djo ← チ\u3099ョ ;
dji ↔ チ\u3099 ;
dj } $vowel → チ\u3099 | '~y' ;
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
cha ← チャ ;
chi'~i' ← チィ ; # liu
chu ← チュ ;
che ← チェ ;
cho ← チョ ;
chi ↔ チ ;
ch } $vowel → チ | '~y' ;
e ↔ エ ;
g | '~' ← キ\u3099} $small_y ;
gy } $vowel → キ\u3099 | '~y' ;
ga ↔ カ\u3099 ;
gi ↔ キ\u3099 ;
gu ↔ ク\u3099 ;
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
# j } $vowel → シ\u3099 | '~y' ;
ja ↔ シ\u3099ャ ;
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
je ↔ シ\u3099ェ ;
jo ↔ シ\u3099ョ ;
ji ↔ シ\u3099 ;
k | '~' ← キ} $small_y ;
ky } $vowel → キ | '~y' ;
ka ↔ カ ;
ki ↔ キ ;
ku ↔ ク ;
ke ↔ ケ ;
ko ↔ コ ;
m | '~' ← ミ} $small_y ;
my } $vowel → ミ | '~y' ;
ma ↔ マ ;
mi ↔ ミ ;
mu ↔ ム ;
me ↔ メ ;
mo ↔ モ ;
m } [pbfv] → ン ;
n | '~' ← ニ } $small_y ;
ny } $vowel → ニ | '~y' ;
na ↔ ナ ;
ni ↔ ニ ;
nu ↔ ヌ ;
ne ↔ ネ ;
no ↔ ;
o ↔ オ ;
p | '~' ← ヒ\u309A } $small_y ;
py } $vowel → ヒ\u309A | '~y' ;
pa ↔ ハ\u309A ;
pi ↔ ヒ\u309A ;
pu ↔ フ\u309A ;
pe ↔ ヘ\u309A ;
po ↔ ホ\u309A ;
h | '~' ← ヒ } $small_y ;
hy } $vowel → ヒ | '~y' ;
ha ↔ ハ ;
hi ↔ ヒ ;
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
# f | '~' ← フ } $small_y ;
# f } $vowel → フ | '~' ;
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
fo ↔ フォ ;
fu ↔ フ ;
r | '~' ← リ } $small_y ;
ry } $vowel → リ | '~y' ;
ra ↔ ラ ;
ri ↔ リ ;
ru ↔ ル ;
re ↔ レ ;
ro ↔ ロ ;
za ↔ サ\u3099 ;
zi ↔ セ\u3099ィ ;
zu ↔ ス\u3099 ;
ze ↔ セ\u3099 ;
zo ↔ ソ\u3099 ;
sa ↔ サ ;
si ↔ セィ ;
su ↔ ス ;
se ↔ セ ;
so ↔ ソ ;
sha ← シャ ;
shi'~i' ← シィ ; # liu
shu ← シュ ;
she ← シェ ;
sho ← ショ ;
shi ↔ シ ;
sh } $vowel → シ | '~y' ;
ta ↔ タ ;
ti ↔ ティ ;
tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
# v } $vowel → ウ\u3099 | '~' ;
#'v~a' ← ウ\u3099ァ ; # liu
#'v~i' ← ウ\u3099ィ ; # liu
#'v~e' ← ウ\u3099ェ ; # liu
#'v~o' ← ウ\u3099ォ ; # liu
vu ↔ ウ\u3099 ;
u ↔ ウ ;
# w } $vowel → ウ | '~' ;
wa ↔ ワ ;
wi ↔ ヰ ;
wu → ウ ;
we ↔ ヱ ;
wo ↔ ヲ ;
ya ↔ ヤ ;
yi → イ ;
yu ↔ ユ ;
ye → エ ;
yo ↔ ヨ ;
# double consonants
#specials
s } sh → ッ ;
t } ch → ッ ;
#voiced
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
h } h ↔ ッ } $h_start ;
f } f ↔ ッ } $f_start ;
r } r ↔ ッ } $r_start ;
t } t ↔ ッ } $t_start ;
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
# completeness
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
# y } y → ッ ;
# w } w → ッ ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
#a ← a { ー ; # liu
#e ← e { ー ; # liu
#i ← i { ー ; # liu
#o ← o { ー ; # liu
#u ← u { ー ; # liu
$macron ↔ ー ;
# small forms
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
'~e' ↔ ェ ;
'~o' ↔ ォ ;
'~tsu' ↔ ッ ;
'~wa' ↔ ヮ ;
'~ya' ↔ ャ ;
'~yi' → ィ ;
'~yu' ↔ ュ ;
'~ye' → ェ ;
'~yo' ↔ ョ ;
# iteration marks
# TODO: make more accurate
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
g $1 ← k (y* $vowel) {ヽ$voice ;
z $1 ← s (y* $vowel) {ヽ$voice ;
d $1 ← t (y* $vowel) {ヽ$voice ;
h $1 ← b (y* $vowel) {ヽ$voice ;
v $1 ← w (y* $vowel) {ヽ$voice ;
sh $1 ← sh (y* $vowel) {ヽ$voice ;
j $1 ← j (y* $vowel) {ヽ$voice ;
ch $1 ← ch (y* $vowel) {ヽ$voice ;
dj $1 ← dj(y* $vowel) {ヽ$voice ;
ts $1 ← ts (y* $vowel) {ヽ$voice ;
dz $1 ← dz (y* $vowel) {ヽ$voice ;
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
← ヽ $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
# one-way latin- → kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
cy → セィ ;
dy → テ\u3099ィ ;
hy → ヒ ;
sy → セィ ;
ty → ティ ;
zy → セ\u3099ィ ;
h → ヘ ;
# isolated consonants listed here so as not to mask
# longer rules above.
ch → チ;
sh → シ ;
dz → ツ\u3099 ;
dj → チ\u3099;
b → フ\u3099 ;
d → テ\u3099 ;
g → ク\u3099 ;
k → ク ;
m → ム ;
n'' ← ン } $n_quoter ;
n ↔ ン ;
p → フ\u309A ;
r → ル ;
s → ス ;
t → テ ;
y → イ ;
z → ス\u3099 ;
v → ウ\u3099 ;
f → フ;
j → シ\u3099;
w → ウ;
ß → | ss ;
æ → | e ;
ð → | d ;
ø → | u ;
þ → | th ;
# simple substitutions using backup
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
'~' → ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
:: NFC (NFD) ;
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof