496 lines
9.8 KiB
Plaintext
496 lines
9.8 KiB
Plaintext
|
#--------------------------------------------------------------------
|
|||
|
# Copyright (c) 1999-2004, International Business Machines
|
|||
|
# Corporation and others. All Rights Reserved.
|
|||
|
#--------------------------------------------------------------------
|
|||
|
|
|||
|
# note: a global filter is more efficient, but MUST include all source chars
|
|||
|
#:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
|
|||
|
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
|||
|
### WARNING -- must add width filter, both here and below!!! ###
|
|||
|
:: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;
|
|||
|
|
|||
|
:: [:Latin:] fullwidth-halfwidth ();
|
|||
|
:: NFD (NFC);
|
|||
|
:: Lower (); # whenever transliterating from cased to uncased script, include this
|
|||
|
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
|
|||
|
|
|||
|
# Uses modified Hepburn. Small changes to make unambiguous.
|
|||
|
|
|||
|
# | Kunrei-shiki: Hepburn/MHepburn
|
|||
|
# | ------------------------------
|
|||
|
# | si: shi
|
|||
|
# | si ~ya: sha
|
|||
|
# | si ~yu: shu
|
|||
|
# | si ~yo: sho
|
|||
|
# | zi: ji
|
|||
|
# | zi ~ya: ja
|
|||
|
# | zi ~yu: ju
|
|||
|
# | zi ~yo: jo
|
|||
|
# | ti: chi
|
|||
|
# | ti ~ya: cha
|
|||
|
# | ti ~yu: chu
|
|||
|
# | ti ~yo: cho
|
|||
|
# | tu: tsu
|
|||
|
# | di: ji/dji
|
|||
|
# | du: zu/dzu
|
|||
|
# | hu: fu
|
|||
|
|
|||
|
# | For foreign words:
|
|||
|
# | -----------------
|
|||
|
# | se ~i si
|
|||
|
# | si ~e she
|
|||
|
# |
|
|||
|
# | ze ~i zi
|
|||
|
# | zi ~e je
|
|||
|
# |
|
|||
|
# | te ~i ti
|
|||
|
# | ti ~e che
|
|||
|
# | te ~u tu
|
|||
|
# |
|
|||
|
# | de ~i di
|
|||
|
# | de ~u du
|
|||
|
# | de ~i di
|
|||
|
# |
|
|||
|
# | he ~u: hu
|
|||
|
# | hu ~a fa
|
|||
|
# | hu ~i fi
|
|||
|
# | hu ~e fe
|
|||
|
# | hu ~o fo
|
|||
|
|
|||
|
# Most small forms are generated, but if necessary
|
|||
|
# explicit small forms are given with ~a, ~ya, etc.
|
|||
|
|
|||
|
#------------------------------------------------------
|
|||
|
# Variables
|
|||
|
|
|||
|
$vowel = [aeiou] ;
|
|||
|
$consonant = [bcdfghjklmnpqrstvwxyz] ;
|
|||
|
$macron = \u0304 ;
|
|||
|
|
|||
|
# Variables used for doubled-consonants with tsu
|
|||
|
|
|||
|
# NOTE(review): U+3041-U+3094 lies in the Hiragana block. In the visible rules
# $kana is referenced only by the commented-out iteration-mark TODO rule --
# confirm the range before reviving that rule in this Katakana file.
$kana = [\u3041-\u3094] ;
|
|||
|
|
|||
|
$voice = [\u3099\u309B];
|
|||
|
$semivoice = [\u309A\u309C];
|
|||
|
|
|||
|
$k_start = [カキクケコかきくけこ] ;
|
|||
|
|
|||
|
$s_start = [サシスセソさしすせそ] ;
|
|||
|
|
|||
|
$j_start = [シし] $voice ;
|
|||
|
|
|||
|
$t_start = [タチツテトたちつてと] ;
|
|||
|
|
|||
|
$n_start = [ナニヌネノンなにぬねの] ;
|
|||
|
|
|||
|
$h_start = [ハヒヘホはひへほ] ;
|
|||
|
$f_start = [フふ] ;
|
|||
|
|
|||
|
$m_start = [マミムメモまみむめも] ;
|
|||
|
|
|||
|
$y_start = [ヤユヨやゆよ] ;
|
|||
|
|
|||
|
$r_start = [ラリルレロらりるれろ] ;
|
|||
|
|
|||
|
$w_start = [ワヰヱヲわゐゑを] ;
|
|||
|
|
|||
|
$v_start = [ワヰヱヲ]゙ ;
|
|||
|
|
|||
|
# if ン is followed by $n_quoter, then it needs an
|
|||
|
# apostrophe after its romaji form to disambiguate it.
|
|||
|
# e.g., ン ア != ナ, so represent as "n'a", not "na".
|
|||
|
|
|||
|
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;
|
|||
|
|
|||
|
$small_y = [ャィュェョ] ;
|
|||
|
|
|||
|
# NOTE(review): U+309D is the hiragana iteration mark; the live iteration rules
# in this file match ヽ literally, and $iteration is referenced only by the
# commented-out TODO rule -- confirm whether U+30FD was intended here.
$iteration = \u309D ;
|
|||
|
|
|||
|
#------------------------------------------------------
|
|||
|
# katakana rules
|
|||
|
|
|||
|
# Punctuation
|
|||
|
|
|||
|
'.' <> 。;
|
|||
|
',' <> 、;
|
|||
|
# ' ' } [a-z] > ; # delete spaces before latin
|
|||
|
# ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana
|
|||
|
|
|||
|
# Iteration Mark
|
|||
|
# Copy previous letter & marks
|
|||
|
|
|||
|
# TODO
|
|||
|
# | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
|
|||
|
|
|||
|
# Specials for katakana -- not shared with hiragana
|
|||
|
|
|||
|
va <> ヷ ;
|
|||
|
vi <> ヸ ;
|
|||
|
ve <> ヹ ;
|
|||
|
vo <> ヺ ;
|
|||
|
'~ka' <> ヵ ;
|
|||
|
'~ke' <> ヶ ;
|
|||
|
|
|||
|
# ~~~ begin shared rules ~~~
|
|||
|
|
|||
|
#special
|
|||
|
|
|||
|
ya < '~'ャ;
|
|||
|
yi < '~'ィ ;
|
|||
|
yu < '~'ュ;
|
|||
|
ye < '~'ェ;
|
|||
|
yo < '~'ョ;
|
|||
|
|
|||
|
#normal
|
|||
|
|
|||
|
a <> ア ;
|
|||
|
|
|||
|
b | '~' < ヒ ゙} $small_y ;
|
|||
|
by } $vowel > ビ | '~y' ;
|
|||
|
|
|||
|
ba <> バ ;
|
|||
|
bi <> ビ ;
|
|||
|
bu <> ブ ;
|
|||
|
be <> ベ ;
|
|||
|
bo <> ボ ;
|
|||
|
|
|||
|
c } i > | s ;
|
|||
|
c } e > | s ;
|
|||
|
|
|||
|
da <> ダ ;
|
|||
|
di <> ディ ;
|
|||
|
du <> デゥ ;
|
|||
|
de <> デ ;
|
|||
|
do <> ド ;
|
|||
|
dzu <> ヅ ;
|
|||
|
dja < ヂャ ;
|
|||
|
dji'~i' < ヂィ ; # liu
|
|||
|
dju < ヂュ ;
|
|||
|
dje < ヂェ ;
|
|||
|
djo < ヂョ ;
|
|||
|
dji <> ヂ ;
|
|||
|
dj } $vowel > ヂ | '~y' ;
|
|||
|
|
|||
|
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
|
|||
|
|
|||
|
cha < チャ ;
|
|||
|
chi'~i' < チィ ; # liu
|
|||
|
chu < チュ ;
|
|||
|
che < チェ ;
|
|||
|
cho < チョ ;
|
|||
|
chi <> チ ;
|
|||
|
ch } $vowel > チ | '~y' ;
|
|||
|
|
|||
|
e <> エ ;
|
|||
|
|
|||
|
g | '~' < ギ} $small_y ;
|
|||
|
gy } $vowel > ギ | '~y' ;
|
|||
|
|
|||
|
ga <> ガ ;
|
|||
|
gi <> ギ ;
|
|||
|
gu <> グ ;
|
|||
|
ge <> ゲ ;
|
|||
|
go <> ゴ ;
|
|||
|
|
|||
|
i <> イ ;
|
|||
|
|
|||
|
# j } $vowel > ジ | '~y' ;
|
|||
|
|
|||
|
ja <> ジャ ;
|
|||
|
ji'~i' < ジィ ; # liu
|
|||
|
ju <> ジュ ;
|
|||
|
je <> ジェ ;
|
|||
|
jo <> ジョ ;
|
|||
|
ji <> ジ ;
|
|||
|
|
|||
|
k | '~' < キ} $small_y ;
|
|||
|
ky } $vowel > キ | '~y' ;
|
|||
|
|
|||
|
ka <> カ ;
|
|||
|
ki <> キ ;
|
|||
|
ku <> ク ;
|
|||
|
ke <> ケ ;
|
|||
|
ko <> コ ;
|
|||
|
|
|||
|
m | '~' < ミ} $small_y ;
|
|||
|
my } $vowel > ミ | '~y' ;
|
|||
|
|
|||
|
ma <> マ ;
|
|||
|
mi <> ミ ;
|
|||
|
mu <> ム ;
|
|||
|
me <> メ ;
|
|||
|
mo <> モ ;
|
|||
|
|
|||
|
m } [pbfv] > ン ;
|
|||
|
|
|||
|
n | '~' < ニ } $small_y ;
|
|||
|
ny } $vowel > ニ | '~y' ;
|
|||
|
|
|||
|
na <> ナ ;
|
|||
|
ni <> ニ ;
|
|||
|
nu <> ヌ ;
|
|||
|
ne <> ネ ;
|
|||
|
no <> ノ ;
|
|||
|
|
|||
|
o <> オ ;
|
|||
|
|
|||
|
p | '~' < ピ } $small_y ;
|
|||
|
py } $vowel > ピ | '~y' ;
|
|||
|
|
|||
|
pa <> パ ;
|
|||
|
pi <> ピ ;
|
|||
|
pu <> プ ;
|
|||
|
pe <> ペ ;
|
|||
|
po <> ポ ;
|
|||
|
|
|||
|
h | '~' < ヒ } $small_y ;
|
|||
|
hy } $vowel > ヒ | '~y' ;
|
|||
|
|
|||
|
ha <> ハ ;
|
|||
|
hi <> ヒ ;
|
|||
|
hu <> ヘゥ ;
|
|||
|
he <> ヘ ;
|
|||
|
ho <> ホ ;
|
|||
|
|
|||
|
# f | '~' < フ } $small_y ;
|
|||
|
# f } $vowel > フ | '~' ;
|
|||
|
|
|||
|
fa <> ファ ;
|
|||
|
fi <> フィ ;
|
|||
|
fe <> フェ ;
|
|||
|
fo <> フォ ;
|
|||
|
fu <> フ ;
|
|||
|
|
|||
|
r | '~' < リ } $small_y ;
|
|||
|
ry } $vowel > リ | '~y' ;
|
|||
|
|
|||
|
ra <> ラ ;
|
|||
|
ri <> リ ;
|
|||
|
ru <> ル ;
|
|||
|
re <> レ ;
|
|||
|
ro <> ロ ;
|
|||
|
|
|||
|
za <> ザ ;
|
|||
|
zi <> ゼィ ;
|
|||
|
zu <> ズ ;
|
|||
|
ze <> ゼ ;
|
|||
|
zo <> ゾ ;
|
|||
|
|
|||
|
sa <> サ ;
|
|||
|
si <> セィ ;
|
|||
|
su <> ス ;
|
|||
|
se <> セ ;
|
|||
|
so <> ソ ;
|
|||
|
|
|||
|
sha < シャ ;
|
|||
|
shi'~i' < シィ ; # liu
|
|||
|
shu < シュ ;
|
|||
|
she < シェ ;
|
|||
|
sho < ショ ;
|
|||
|
shi <> シ ;
|
|||
|
sh } $vowel > シ | '~y' ;
|
|||
|
|
|||
|
ta <> タ ;
|
|||
|
ti <> ティ ;
|
|||
|
tu <> テゥ ;
|
|||
|
te <> テ ;
|
|||
|
to <> ト ;
|
|||
|
|
|||
|
tsu <> ツ ;
|
|||
|
|
|||
|
# v } $vowel > ヴ | '~' ;
|
|||
|
|
|||
|
#'v~a' < ヴァ ; # liu
|
|||
|
#'v~i' < ヴィ ; # liu
|
|||
|
#'v~e' < ヴェ ; # liu
|
|||
|
#'v~o' < ヴォ ; # liu
|
|||
|
vu <> ヴ ;
|
|||
|
|
|||
|
u <> ウ ;
|
|||
|
|
|||
|
# w } $vowel > ウ | '~' ;
|
|||
|
|
|||
|
wa <> ワ ;
|
|||
|
wi <> ヰ ;
|
|||
|
wu > ウ ;
|
|||
|
we <> ヱ ;
|
|||
|
wo <> ヲ ;
|
|||
|
|
|||
|
ya <> ヤ ;
|
|||
|
yi > イ ;
|
|||
|
yu <> ユ ;
|
|||
|
ye > エ ;
|
|||
|
yo <> ヨ ;
|
|||
|
|
|||
|
# double consonants
|
|||
|
|
|||
|
#specials
|
|||
|
s } sh > ッ ;
|
|||
|
t } ch > ッ ;
|
|||
|
|
|||
|
#voiced
|
|||
|
|
|||
|
j } j <> ッ } $j_start ;
|
|||
|
b } b <> ッ } [$h_start$f_start] $voice;
|
|||
|
d } d <> ッ } $t_start $voice;
|
|||
|
g } g <> ッ } $k_start $voice;
|
|||
|
p } p <> ッ } [$h_start$f_start] $semivoice;
|
|||
|
# v } v <> ッ } [ワヰウヱヲう] $voice ;
|
|||
|
z } z <> ッ } $s_start $voice;
|
|||
|
v } v <> ッ } $v_start;
|
|||
|
|
|||
|
# normal
|
|||
|
|
|||
|
k } k <> ッ } $k_start ;
|
|||
|
m } m <> ッ } $m_start ;
|
|||
|
n } n <> ッ } $n_start ;
|
|||
|
h } h <> ッ } $h_start ;
|
|||
|
f } f <> ッ } $f_start ;
|
|||
|
r } r <> ッ } $r_start ;
|
|||
|
t } t <> ッ } $t_start ;
|
|||
|
s } s <> ッ } $s_start ;
|
|||
|
|
|||
|
w } w <> ッ } $w_start;
|
|||
|
y } y <> ッ } $y_start;
|
|||
|
|
|||
|
# completeness
|
|||
|
x } x > ッ ;
|
|||
|
c } k > ッ ;
|
|||
|
c } c > ッ ;
|
|||
|
c } q > ッ ;
|
|||
|
l } l > ッ ;
|
|||
|
q } q > ッ ;
|
|||
|
# y } y > ッ ;
|
|||
|
# w } w > ッ ;
|
|||
|
|
|||
|
# prolonged vowel mark. this indicates a doubling of
|
|||
|
# the preceding vowel sound
|
|||
|
|
|||
|
#a < a { ー ; # liu
|
|||
|
#e < e { ー ; # liu
|
|||
|
#i < i { ー ; # liu
|
|||
|
#o < o { ー ; # liu
|
|||
|
#u < u { ー ; # liu
|
|||
|
|
|||
|
$macron <> ー ;
|
|||
|
|
|||
|
# small forms
|
|||
|
|
|||
|
'~a' <> ァ ;
|
|||
|
'~i' <> ィ ;
|
|||
|
'~u' <> ゥ ;
|
|||
|
'~e' <> ェ ;
|
|||
|
'~o' <> ォ ;
|
|||
|
'~tsu' <> ッ ;
|
|||
|
'~wa' <> ヮ ;
|
|||
|
'~ya' <> ャ ;
|
|||
|
'~yi' > ィ ;
|
|||
|
'~yu' <> ュ ;
|
|||
|
'~ye' > ェ ;
|
|||
|
'~yo' <> ョ ;
|
|||
|
|
|||
|
# iteration marks
|
|||
|
# TODO: make more accurate
|
|||
|
|
|||
|
j $1 < sh (y* $vowel) {ヽ$voice ;
|
|||
|
dj $1 < ch (y* $vowel) {ヽ$voice ;
|
|||
|
dz $1 < ts (y* $vowel) {ヽ$voice ;
|
|||
|
|
|||
|
g $1 < k (y* $vowel) {ヽ$voice ;
|
|||
|
z $1 < s (y* $vowel) {ヽ$voice ;
|
|||
|
d $1 < t (y* $vowel) {ヽ$voice ;
|
|||
|
# Voiced iteration mark after an h-syllable repeats it voiced (h -> b),
# e.g. "ha" followed by ヽ + voice mark yields "haba". Like the g/z/d rules
# above, the context is the unvoiced consonant and the output the voiced one.
b $1 < h (y* $vowel) {ヽ$voice ;
|
|||
|
v $1 < w (y* $vowel) {ヽ$voice ;
|
|||
|
|
|||
|
# NOTE(review): the match side here (sh + vowel before ヽ$voice) is identical to
# the earlier `j $1 < sh ...` rule, so this rule looks unreachable; the `ch` and
# `ts` rules of this group repeat earlier patterns the same way. Possibly meant
# for the unvoiced iteration mark (no $voice) -- confirm.
sh $1 < sh (y* $vowel) {ヽ$voice ;
|
|||
|
j $1 < j (y* $vowel) {ヽ$voice ;
|
|||
|
ch $1 < ch (y* $vowel) {ヽ$voice ;
|
|||
|
dj $1 < dj(y* $vowel) {ヽ$voice ;
|
|||
|
ts $1 < ts (y* $vowel) {ヽ$voice ;
|
|||
|
dz $1 < dz (y* $vowel) {ヽ$voice ;
|
|||
|
|
|||
|
$1 < ($consonant y* $vowel) {ヽ$voice? ;
|
|||
|
$1 < (.) {ヽ $voice? ; # otherwise repeat last character
|
|||
|
< ヽ $voice? ; # delete if no characters found
|
|||
|
|
|||
|
# h- rule: lengthens vowel if not followed by a vowel
|
|||
|
|
|||
|
# NOTE(review): as written the vowel is the key and `h` is only trailing
# context (`}`), so the vowel itself would be replaced by ー; the single-vowel
# rules listed earlier also match the same key. No "not followed by a vowel"
# condition is expressed. Verify the intended form of this rule.
[aeiou] } h > ー ;
|
|||
|
|
|||
|
# one-way latin -> kana rules. these do not occur in
|
|||
|
# well-formed romaji representing actual japanese text.
|
|||
|
# their purpose is to make all romaji map to kana of
|
|||
|
# some sort.
|
|||
|
|
|||
|
# the following are not really necessary, but produce
|
|||
|
# slightly more natural results.
|
|||
|
|
|||
|
cy > セィ ;
|
|||
|
dy > ディ ;
|
|||
|
hy > ヒ ;
|
|||
|
sy > セィ ;
|
|||
|
ty > ティ ;
|
|||
|
zy > ゼィ ;
|
|||
|
|
|||
|
h > ヘ ;
|
|||
|
|
|||
|
# isolated consonants listed here so as not to mask
|
|||
|
# longer rules above.
|
|||
|
|
|||
|
ch > チ;
|
|||
|
sh > シ ;
|
|||
|
dz > ヅ ;
|
|||
|
dj > ヂ;
|
|||
|
|
|||
|
b > ブ ;
|
|||
|
d > デ ;
|
|||
|
g > グ ;
|
|||
|
k > ク ;
|
|||
|
m > ム ;
|
|||
|
n'' < ン } $n_quoter ;
|
|||
|
n <> ン ;
|
|||
|
p > プ ;
|
|||
|
r > ル ;
|
|||
|
s > ス ;
|
|||
|
t > テ ;
|
|||
|
y > イ ;
|
|||
|
z > ズ ;
|
|||
|
v > ヴ ;
|
|||
|
|
|||
|
f > フ;
|
|||
|
j > ジ;
|
|||
|
w > ウ;
|
|||
|
|
|||
|
ß > | ss ;
|
|||
|
æ > | e ;
|
|||
|
ð > | d ;
|
|||
|
ø > | u ;
|
|||
|
þ > | th ;
|
|||
|
|
|||
|
# simple substitutions using backup
|
|||
|
|
|||
|
c > | k ;
|
|||
|
l > | r ;
|
|||
|
q > | k ;
|
|||
|
x > | ks ;
|
|||
|
|
|||
|
# ~~~ END shared rules ~~~
|
|||
|
|
|||
|
#------------------------------------------------------
|
|||
|
# Final cleanup
|
|||
|
|
|||
|
'~' > ; # delete stray tildes between letters
|
|||
|
[:Katakana:] { '' } [:Latin:] > ; # delete stray quotes between letters
|
|||
|
# [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
|||
|
|
|||
|
:: NFC (NFD) ;
|
|||
|
:: ([:Katakana:] halfwidth-fullwidth);
|
|||
|
|
|||
|
# note: a global filter is more efficient, but MUST include all source chars!!
|
|||
|
#:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
|
|||
|
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
|||
|
:: ( [[\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;
|
|||
|
|
|||
|
# eof
|