b720561095
X-SVN-Rev: 7224
514 lines
11 KiB
Plaintext
514 lines
11 KiB
Plaintext
// -*- Coding: utf-8; -*-
|
||
//--------------------------------------------------------------------
|
||
// Copyright (c) 1999-2001, International Business Machines
|
||
// Corporation and others. All Rights Reserved.
|
||
//--------------------------------------------------------------------
|
||
// THIS IS A MACHINE-GENERATED FILE
|
||
// Tool: dumpicurules.bat
|
||
// Source: ../../text/resources/Transliterator_Latin_Katakana.txt
|
||
// Date: Fri Nov 30 13:01:42 2001
|
||
//--------------------------------------------------------------------
|
||
|
||
// Latin_Katakana
|
||
|
||
translit_Latin_Katakana {
|
||
Rule {
|
||
//--------------------------------------------------------------------
|
||
// Copyright (c) 1999-2001, International Business Machines
|
||
// Corporation and others. All Rights Reserved.
|
||
//--------------------------------------------------------------------
|
||
// $Source: /xsrl/Nsvn/icu/icu/data/Attic/translit_Latin_Katakana.txt,v $
|
||
// $Date: 2001/11/30 21:24:16 $
|
||
// $Revision: 1.5 $
|
||
//--------------------------------------------------------------------
|
||
|
||
// note: a global filter is more efficient, but MUST include all source chars
|
||
//:: [\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
|
||
// MINIMAL FILTER GENERATED FOR: Latin-Katakana
|
||
":: [',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B] ;"
|
||
|
||
// Normalization and case folding applied around the rules. In ICU's
// ":: Forward (Reverse) ;" form the first ID is the transform used for
// Latin -> Katakana and the parenthesised ID the one used for Katakana -> Latin.
":: NFD (NFC);" // use NFKD to get the fullwidth latin characters
|
||
// Empty parentheses: no transform is named for the reverse direction.
":: Lower ();" // whenever transliterating from cased to uncased script, include this
|
||
// :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
|
||
|
||
// Uses modified Hepburn. Small changes to make unambiguous.
|
||
|
||
// | Kunrei-shiki: Hepburn/MHepburn
|
||
// | ------------------------------
|
||
// | si: shi
|
||
// | si ~ya: sha
|
||
// | si ~yu: shu
|
||
// | si ~yo: sho
|
||
// | zi: ji
|
||
// | zi ~ya: ja
|
||
// | zi ~yu: ju
|
||
// | zi ~yo: jo
|
||
// | ti: chi
|
||
// | ti ~ya: cha
|
||
// | ti ~yu: chu
|
||
// | ti ~yo: cho
|
||
// | tu: tsu
|
||
// | di: ji/dji
|
||
// | du: zu/dzu
|
||
// | hu: fu
|
||
|
||
// | For foreign words:
|
||
// | -----------------
|
||
// | se ~i si
|
||
// | si ~e she
|
||
// |
|
||
// | ze ~i zi
|
||
// | zi ~e je
|
||
// |
|
||
// | te ~i ti
|
||
// | ti ~e che
|
||
// | te ~u tu
|
||
// |
|
||
// | de ~i di
|
||
// | de ~u du
|
||
// | de ~i di
|
||
// |
|
||
// | he ~u: hu
|
||
// | hu ~a fa
|
||
// | hu ~i fi
|
||
// | hu ~e fe
|
||
// | hu ~o fo
|
||
|
||
// Most small forms are generated, but if necessary
|
||
// explicit small forms are given with ~a, ~ya, etc.
|
||
|
||
//------------------------------------------------------
|
||
// Variables
|
||
|
||
// The five Latin vowels. Used as right-hand context by the
// "<consonant>y } $vowel" rules and inside the iteration-mark rules.
"$vowel = [aeiou] ;"
|
||
// Every lowercase ASCII letter that is not in $vowel.
"$consonant = [bcdfghjklmnpqrstvwxyz] ;"
|
||
// U+0304 (combining macron); paired with the prolonged sound mark by the
// "$macron <> ー" rule.
"$macron = \u0304 ;"
|
||
|
||
// Variables used for doubled-consonants with tsu
//
// Each $X_start class lists the kana whose romaji begins with consonant X.
// The double-consonant rules use these classes as the right-hand context of
// ッ to decide which Latin consonant a small tsu stands for.
|
||
|
||
// NOTE(review): U+3041-U+3094 looks like the hiragana range; in the rules
// visible here it is only referenced by the commented-out iteration TODO.
"$kana = [\u3041-\u3094] ;"
|
||
|
||
// Voiced sound marks: U+3099 and U+309B.
"$voice = [\u3099\u309B];"
|
||
// Semi-voiced sound marks: U+309A and U+309C.
"$semivoice = [\u309A\u309C];"
|
||
|
||
"$k_start = [カキクケコかきくけこ] ;"
|
||
|
||
"$s_start = [サシスセソさしすせそ] ;"
|
||
|
||
// "j" syllables are shi + a voiced mark.
"$j_start = [シし] $voice ;"
|
||
|
||
"$t_start = [タチツテトたちつてと] ;"
|
||
|
||
// Includes ン as well as the na-row.
"$n_start = [ナニヌネノンなにぬねの] ;"
|
||
|
||
// フ/ふ is kept out of the h-row because it is romanized "fu".
"$h_start = [ハヒヘホはひへほ] ;"
|
||
"$f_start = [フふ] ;"
|
||
|
||
"$m_start = [マミムメモまみむめも] ;"
|
||
|
||
"$y_start = [ヤユヨやゆよ] ;"
|
||
|
||
"$r_start = [ラリルレロらりるれろ] ;"
|
||
|
||
"$w_start = [ワヰヱヲわゐゑを] ;"
|
||
|
||
// "v" syllables are a base kana + a voiced mark. ウ is included so that ッ
// before ヴ (ウ + voiced mark, see "vu <> ヴ") reverses to a doubled "v", and
// $voice is used, as in $j_start, instead of one hard-coded mark.
"$v_start = [ワヰウヱヲ] $voice ;"
|
||
|
||
// if ン is followed by $n_quoter, then it needs an
|
||
// apostrophe after its romaji form to disambiguate it.
|
||
// e.g., ンア != ナ, so ンア is represented as "n'a", not "na".
|
||
|
||
// Kana before which ン is written "n'": the vowels, the na-row, the ya-row
// and ン itself (see the "n'' < ン } $n_quoter" rule).
"$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;"
|
||
|
||
// Small kana used as right-hand context by the "<c> | '~' < <kana> } $small_y"
// rules; note that it contains small ィ and ェ in addition to ャ ュ ョ.
"$small_y = [ャィュェョ] ;"
|
||
|
||
// U+309D. In the rules visible here this variable is only referenced by the
// commented-out iteration TODO; the live iteration rules use a literal ヽ.
"$iteration = \u309D ;"
|
||
|
||
//------------------------------------------------------
|
||
// katakana rules
|
||
|
||
// Punctuation
|
||
|
||
"'.' <> 。;"
|
||
"',' <> 、;"
|
||
// ' ' } [a-z] > ; # delete spaces before latin
|
||
// ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before katakana
|
||
|
||
// Iteration Mark
|
||
// Copy previous letter & marks
|
||
|
||
// TODO
|
||
// | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
|
||
|
||
// Specials for katakana -- not shared with hiragana
|
||
|
||
"va <> ヷ ;"
|
||
"vi <> ヸ ;"
|
||
"ve <> ヹ ;"
|
||
"vo <> ヺ ;"
|
||
"'~ka' <> ヵ ;"
|
||
"'~ke' <> ヶ ;"
|
||
|
||
// ~~~ begin shared rules ~~~
|
||
|
||
//special
|
||
|
||
"ya < '~'ャ;"
|
||
"yi < '~'ィ ;"
|
||
"yu < '~'ュ;"
|
||
"ye < '~'ェ;"
|
||
"yo < '~'ョ;"
|
||
|
||
//normal
|
||
|
||
"a <> ア ;"
|
||
|
||
"b | '~' < ヒ ゙} $small_y ;"
|
||
"by } $vowel > ビ | '~y' ;"
|
||
|
||
"ba <> バ ;"
|
||
"bi <> ビ ;"
|
||
"bu <> ブ ;"
|
||
"be <> ベ ;"
|
||
"bo <> ボ ;"
|
||
|
||
"c } i > | s ;"
|
||
"c } e > | s ;"
|
||
|
||
"da <> ダ ;"
|
||
"di <> ディ ;"
|
||
"du <> デゥ ;"
|
||
"de <> デ ;"
|
||
"do <> ド ;"
|
||
"dzu <> ヅ ;"
|
||
"dja < ヂャ ;"
|
||
"dji'~i' < ヂィ ;" // liu
|
||
"dju < ヂュ ;"
|
||
"dje < ヂェ ;"
|
||
"djo < ヂョ ;"
|
||
"dji <> ヂ ;"
|
||
"dj } $vowel > ヂ | '~y' ;"
|
||
|
||
// TODO: QUESTION: use ĵĴżŻ instead of dj, dz
|
||
|
||
"cha < チャ ;"
|
||
"chi'~i' < チィ ;" // liu
|
||
"chu < チュ ;"
|
||
"che < チェ ;"
|
||
"cho < チョ ;"
|
||
"chi <> チ ;"
|
||
"ch } $vowel > チ | '~y' ;"
|
||
|
||
"e <> エ ;"
|
||
|
||
"g | '~' < ギ} $small_y ;"
|
||
"gy } $vowel > ギ | '~y' ;"
|
||
|
||
"ga <> ガ ;"
|
||
"gi <> ギ ;"
|
||
"gu <> グ ;"
|
||
"ge <> ゲ ;"
|
||
"go <> ゴ ;"
|
||
|
||
"i <> イ ;"
|
||
|
||
// j } $vowel > ジ | '~y' ;
|
||
|
||
"ja <> ジャ ;"
|
||
"ji'~i' < ジィ ;" // liu
|
||
"ju <> ジュ ;"
|
||
"je <> ジェ ;"
|
||
"jo <> ジョ ;"
|
||
"ji <> ジ ;"
|
||
|
||
"k | '~' < キ} $small_y ;"
|
||
"ky } $vowel > キ | '~y' ;"
|
||
|
||
"ka <> カ ;"
|
||
"ki <> キ ;"
|
||
"ku <> ク ;"
|
||
"ke <> ケ ;"
|
||
"ko <> コ ;"
|
||
|
||
"m | '~' < ミ} $small_y ;"
|
||
"my } $vowel > ミ | '~y' ;"
|
||
|
||
"ma <> マ ;"
|
||
"mi <> ミ ;"
|
||
"mu <> ム ;"
|
||
"me <> メ ;"
|
||
"mo <> モ ;"
|
||
|
||
"m } [pbfv] > ン ;"
|
||
|
||
"n | '~' < ニ } $small_y ;"
|
||
"ny } $vowel > ニ | '~y' ;"
|
||
|
||
"na <> ナ ;"
|
||
"ni <> ニ ;"
|
||
"nu <> ヌ ;"
|
||
"ne <> ネ ;"
|
||
"no <> ノ ;"
|
||
|
||
"o <> オ ;"
|
||
|
||
"p | '~' < ピ } $small_y ;"
|
||
"py } $vowel > ピ | '~y' ;"
|
||
|
||
"pa <> パ ;"
|
||
"pi <> ピ ;"
|
||
"pu <> プ ;"
|
||
"pe <> ペ ;"
|
||
"po <> ポ ;"
|
||
|
||
"h | '~' < ヒ } $small_y ;"
|
||
"hy } $vowel > ヒ | '~y' ;"
|
||
|
||
"ha <> ハ ;"
|
||
"hi <> ヒ ;"
|
||
"hu <> ヘゥ ;"
|
||
"he <> ヘ ;"
|
||
"ho <> ホ ;"
|
||
|
||
// f | '~' < フ } $small_y ;
|
||
// f } $vowel > フ | '~' ;
|
||
|
||
"fa <> ファ ;"
|
||
"fi <> フィ ;"
|
||
"fe <> フェ ;"
|
||
"fo <> フォ ;"
|
||
"fu <> フ ;"
|
||
|
||
"r | '~' < リ } $small_y ;"
|
||
"ry } $vowel > リ | '~y' ;"
|
||
|
||
"ra <> ラ ;"
|
||
"ri <> リ ;"
|
||
"ru <> ル ;"
|
||
"re <> レ ;"
|
||
"ro <> ロ ;"
|
||
|
||
"za <> ザ ;"
|
||
"zi <> ゼィ ;"
|
||
"zu <> ズ ;"
|
||
"ze <> ゼ ;"
|
||
"zo <> ゾ ;"
|
||
|
||
"sa <> サ ;"
|
||
"si <> セィ ;"
|
||
"su <> ス ;"
|
||
"se <> セ ;"
|
||
"so <> ソ ;"
|
||
|
||
"sha < シャ ;"
|
||
"shi'~i' < シィ ;" // liu
|
||
"shu < シュ ;"
|
||
"she < シェ ;"
|
||
"sho < ショ ;"
|
||
"shi <> シ ;"
|
||
"sh } $vowel > シ | '~y' ;"
|
||
|
||
"ta <> タ ;"
|
||
"ti <> ティ ;"
|
||
"tu <> テゥ ;"
|
||
"te <> テ ;"
|
||
"to <> ト ;"
|
||
|
||
"tsu <> ツ ;"
|
||
|
||
// v } $vowel > ヴ | '~' ;
|
||
|
||
//'v~a' < ヴァ ; # liu
|
||
//'v~i' < ヴィ ; # liu
|
||
//'v~e' < ヴェ ; # liu
|
||
//'v~o' < ヴォ ; # liu
|
||
"vu <> ヴ ;"
|
||
|
||
"u <> ウ ;"
|
||
|
||
// w } $vowel > ウ | '~' ;
|
||
|
||
"wa <> ワ ;"
|
||
"wi <> ヰ ;"
|
||
"wu > ウ ;"
|
||
"we <> ヱ ;"
|
||
"wo <> ヲ ;"
|
||
|
||
"ya <> ヤ ;"
|
||
"yi > イ ;"
|
||
"yu <> ユ ;"
|
||
"ye > エ ;"
|
||
"yo <> ヨ ;"
|
||
|
||
// double consonants
|
||
|
||
//specials
//
// Doubled consonants are written with small tsu (ッ).
// Latin -> kana: the first consonant becomes ッ when the text after "}" follows it.
// Kana -> Latin ("<>" rules only): ッ becomes the consonant selected by the
// kana class that follows it.
|
||
// One-way: "ssh" and "tch" spell a doubled sh / ch.
"s } sh > ッ ;"
|
||
"t } ch > ッ ;"
|
||
|
||
//voiced
// These rows match a base kana followed by $voice / $semivoice.
// NOTE(review): presumably listed ahead of the unvoiced rows further down so
// that the longer context is tried first - confirm against ICU rule ordering.
|
||
|
||
"j } j <> ッ } $j_start ;"
|
||
// b and p share the h/f base kana and differ only in the mark that follows.
"b } b <> ッ } [$h_start$f_start] $voice;"
|
||
"d } d <> ッ } $t_start $voice;"
|
||
"g } g <> ッ } $k_start $voice;"
|
||
"p } p <> ッ } [$h_start$f_start] $semivoice;"
|
||
// v } v <> ッ } [ワヰウヱヲう] $voice ;
|
||
"z } z <> ッ } $s_start $voice;"
|
||
"v } v <> ッ } $v_start;"
|
||
|
||
// normal
|
||
|
||
"k } k <> ッ } $k_start ;"
|
||
"m } m <> ッ } $m_start ;"
|
||
"n } n <> ッ } $n_start ;"
|
||
"h } h <> ッ } $h_start ;"
|
||
"f } f <> ッ } $f_start ;"
|
||
"r } r <> ッ } $r_start ;"
|
||
"t } t <> ッ } $t_start ;"
|
||
"s } s <> ッ } $s_start ;"
|
||
|
||
"w } w <> ッ } $w_start;"
|
||
"y } y <> ッ } $y_start;"
|
||
|
||
// completeness
|
||
"x } x > ッ ;"
|
||
"c } k > ッ ;"
|
||
"c } c > ッ ;"
|
||
"c } q > ッ ;"
|
||
"l } l > ッ ;"
|
||
"q } q > ッ ;"
|
||
// y } y > ッ ;
|
||
// w } w > ッ ;
|
||
|
||
// prolonged vowel mark. this indicates a doubling of
|
||
// the preceding vowel sound
|
||
|
||
//a < a { ー ; # liu
|
||
//e < e { ー ; # liu
|
||
//i < i { ー ; # liu
|
||
//o < o { ー ; # liu
|
||
//u < u { ー ; # liu
|
||
|
||
"$macron <> ー ;"
|
||
|
||
// small forms
|
||
|
||
"'~a' <> ァ ;"
|
||
"'~i' <> ィ ;"
|
||
"'~u' <> ゥ ;"
|
||
"'~e' <> ェ ;"
|
||
"'~o' <> ォ ;"
|
||
"'~tsu' <> ッ ;"
|
||
"'~wa' <> ヮ ;"
|
||
"'~ya' <> ャ ;"
|
||
"'~yi' > ィ ;"
|
||
"'~yu' <> ュ ;"
|
||
"'~ye' > ェ ;"
|
||
"'~yo' <> ョ ;"
|
||
|
||
// iteration marks
|
||
// TODO: make more accurate
|
||
|
||
// Kana -> Latin only. Everything before "{" is left-hand context: the romaji
// already produced for the preceding syllable, whose "y* $vowel" tail is
// captured as $1. The iteration mark ヽ is replaced by a repeat of that
// syllable; when a voiced mark follows ヽ, the repeat uses the voiced consonant.
//
// Digraph consonants first: sh -> j, ch -> dj, ts -> dz.
"j $1 < sh (y* $vowel) {ヽ$voice ;"
|
||
"dj $1 < ch (y* $vowel) {ヽ$voice ;"
|
||
"dz $1 < ts (y* $vowel) {ヽ$voice ;"
|
||
|
||
// Single consonants: k -> g, s -> z, t -> d, h -> b, w -> v.
"g $1 < k (y* $vowel) {ヽ$voice ;"
|
||
"z $1 < s (y* $vowel) {ヽ$voice ;"
|
||
"d $1 < t (y* $vowel) {ヽ$voice ;"
|
||
// Fixed: this rule was written "h $1 < b", i.e. inverted relative to its
// siblings. Voicing the h-row gives the b-row (ハ -> バ), so an "h" syllable
// followed by a voiced iteration mark repeats as "b".
"b $1 < h (y* $vowel) {ヽ$voice ;"
|
||
"v $1 < w (y* $vowel) {ヽ$voice ;"
|
||
|
||
// Consonants that repeat unchanged.
// NOTE(review): the sh / ch / ts rules below have the same pattern as the
// first three rules of this section, which are listed earlier, so they look
// unreachable; possibly they were meant for ヽ without $voice - confirm.
"sh $1 < sh (y* $vowel) {ヽ$voice ;"
|
||
"j $1 < j (y* $vowel) {ヽ$voice ;"
|
||
"ch $1 < ch (y* $vowel) {ヽ$voice ;"
|
||
"dj $1 < dj(y* $vowel) {ヽ$voice ;"
|
||
"ts $1 < ts (y* $vowel) {ヽ$voice ;"
|
||
"dz $1 < dz (y* $vowel) {ヽ$voice ;"
|
||
|
||
// Fallbacks, with the voiced mark optional: repeat a whole
// consonant (+ y) + vowel syllable, else a single character, else drop the mark.
"$1 < ($consonant y* $vowel) {ヽ$voice? ;"
|
||
"$1 < (.) {ヽ $voice? ;" // otherwise repeat last character
|
||
"< ヽ $voice? ;" // delete if no characters found
|
||
|
||
// h- rule: lengthens vowel if not followed by a vowel
|
||
|
||
"[aeiou] } h > ー ;"
|
||
|
||
// one-way latin -> kana rules. these do not occur in
|
||
// well-formed romaji representing actual japanese text.
|
||
// their purpose is to make all romaji map to kana of
|
||
// some sort.
|
||
|
||
// the following are not really necessary, but produce
|
||
// slightly more natural results.
|
||
|
||
"cy > セィ ;"
|
||
"dy > ディ ;"
|
||
"hy > ヒ ;"
|
||
"sy > セィ ;"
|
||
"ty > ティ ;"
|
||
"zy > ゼィ ;"
|
||
|
||
"h > ヘ ;"
|
||
|
||
// isolated consonants listed here so as not to mask
|
||
// longer rules above.
|
||
|
||
"ch > チ;"
|
||
"sh > シ ;"
|
||
"dz > ヅ ;"
|
||
"dj > ヂ;"
|
||
|
||
"b > ブ ;"
|
||
"d > デ ;"
|
||
"g > グ ;"
|
||
"k > ク ;"
|
||
"m > ム ;"
|
||
"n'' < ン } $n_quoter ;"
|
||
"n <> ン ;"
|
||
"p > プ ;"
|
||
"r > ル ;"
|
||
"s > ス ;"
|
||
"t > テ ;"
|
||
"y > イ ;"
|
||
"z > ズ ;"
|
||
"v > ヴ ;"
|
||
|
||
"f > フ;"
|
||
"j > ジ;"
|
||
"w > ウ;"
|
||
|
||
"ß > | ss ;"
|
||
"æ > | e ;"
|
||
"ð > | d ;"
|
||
"ø > | u ;"
|
||
"þ > | th ;"
|
||
|
||
// simple substitutions using backup
|
||
|
||
"c > | k ;"
|
||
"l > | r ;"
|
||
"q > | k ;"
|
||
"x > | ks ;"
|
||
|
||
// ~~~ END shared rules ~~~
|
||
|
||
//------------------------------------------------------
|
||
// Final cleanup
// Forward-only (">") rules listed after all the shared rules; they remove
// helper characters that were not consumed by an earlier rule.
|
||
|
||
// Any remaining "~" is replaced by nothing.
"'~' > ;" // delete stray tildes between letters
|
||
// Deletes an apostrophe ('' is the escaped quote) only when a Katakana
// character precedes it and a Latin character follows it.
"[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters
|
||
// [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
|
||
|
||
":: NFC (NFD) ;" // use NFKD to get the halfwidth katakana characters
|
||
|
||
// note: a global filter is more efficient, but MUST include all source chars!!
|
||
//:: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
|
||
// MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
|
||
":: ( [~\u3001-\u3002\u30A1-\u30FA\u30FC-\u30FE] ) ;"
|
||
|
||
// eof
|
||
}
|
||
}
|