scuffed-code/icu4c/source/data/translit/t_Latn_Kana.txt

512 lines
11 KiB
Plaintext
Raw Normal View History

 // -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2002, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpicurules.bat
// Source: ../../../impl/data/Transliterator_Latin_Katakana.txt
// Date: Sat Jul 27 10:31:07 2002
//--------------------------------------------------------------------
// Latin_Katakana
t_Latn_Kana {
Rule {
//--------------------------------------------------------------------
//--------------------------------------------------------------------
//--------------------------------------------------------------------
// note: a global filter is more efficient, but MUST include all source chars
//:: [\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
// MINIMAL FILTER GENERATED FOR: Latin-Katakana
//## WARNING -- must add width filter, both here and below!!! ###
// Global filter for the forward (Latin -> Katakana) direction: only characters
// in this set are handed to the rules that follow.
":: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;"
// Forward only (the parenthesized reverse part is empty): run the
// fullwidth-halfwidth transform over Latin-script characters.
":: [:Latin:] fullwidth-halfwidth ();"
// Decompose to NFD going forward; the parenthesized NFC is the step used
// when the rule set is run in the reverse direction.
":: NFD (NFC);"
// Forward only: lowercase the input so the rules below need only handle a-z.
":: Lower ();" // whenever transliterating from cased to uncased script, include this
// :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
// Uses modified Hepburn. Small changes to make unambiguous.
// | Kunrei-shiki: Hepburn/MHepburn
// | ------------------------------
// | si: shi
// | si ~ya: sha
// | si ~yu: shu
// | si ~yo: sho
// | zi: ji
// | zi ~ya: ja
// | zi ~yu: ju
// | zi ~yo: jo
// | ti: chi
// | ti ~ya: cha
// | ti ~yu: chu
// | ti ~yo: cho
// | tu: tsu
// | di: ji/dji
// | du: zu/dzu
// | hu: fu
// | For foreign words:
// | -----------------
// | se ~i si
// | si ~e she
// |
// | ze ~i zi
// | zi ~e je
// |
// | te ~i ti
// | ti ~e che
// | te ~u tu
// |
// | de ~i di
// | de ~u du
// | de ~i di
// |
// | he ~u: hu
// | hu ~a fa
// | hu ~i fi
// | hu ~e fe
// | hu ~o fo
// Most small forms are generated, but if necessary
// explicit small forms are given with ~a, ~ya, etc.
//------------------------------------------------------
// Variables
// Character classes and literals shared by the rules further down.
"$vowel = [aeiou] ;"
"$consonant = [bcdfghjklmnpqrstvwxyz] ;"
// U+0304 COMBINING MACRON; mapped to/from the prolonged sound mark below.
"$macron = \u0304 ;"
// Variables used for doubled-consonants with tsu
// NOTE(review): $kana is referenced only by a commented-out rule in the
// portion of the file visible here.
"$kana = [\u3041-\u3094] ;"
// Voiced / semi-voiced sound marks, in both combining (U+3099, U+309A) and
// spacing (U+309B, U+309C) forms.
"$voice = [\u3099\u309B];"
"$semivoice = [\u309A\u309C];"
// $X_start: kana (katakana and hiragana) whose romaji begins with consonant X.
// They serve as trailing context for the small-tsu (doubled consonant) rules.
"$k_start = [カキクケコかきくけこ] ;"
"$s_start = [サシスセソさしすせそ] ;"
"$j_start = [シし] $voice ;"
"$t_start = [タチツテトたちつてと] ;"
"$n_start = [ナニヌネノンなにぬねの] ;"
"$h_start = [ハヒヘホはひへほ] ;"
"$f_start = [フふ] ;"
"$m_start = [マミムメモまみむめも] ;"
"$y_start = [ヤユヨやゆよ] ;"
"$r_start = [ラリルレロらりるれろ] ;"
"$w_start = [ワヰヱヲわゐゑを] ;"
// w-row katakana followed by a voiced sound mark (the v-row).
"$v_start = [ワヰヱヲ]゙ ;"
// if ン is followed by $n_quoter, then it needs an
// apostrophe after its romaji form to disambiguate it.
// e.g., ン ア != ナ, so represent as "n'a", not "na".
"$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ヤ ユ ヨ ン] ;"
// Small kana that may follow an i-column kana to form a combined syllable.
"$small_y = [ャィュェョ] ;"
// NOTE(review): $iteration is referenced only by a commented-out rule in the
// portion of the file visible here.
"$iteration = \u309D ;"
//------------------------------------------------------
// katakana rules
// Punctuation
// Two-way mappings between ASCII full stop / comma and the ideographic
// full stop / comma. The ASCII characters are quoted so they are literals.
"'.' <> 。;"
"',' <> 、;"
// ' ' } [a-z] > ; # delete spaces before latin
// ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before hiragana
// Iteration Mark
// Copy previous letter & marks
// TODO
// | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
// Specials for katakana -- not shared with hiragana
// Two-way mappings for the precomposed v-row katakana.
"va <> ヷ ;"
"vi <> ヸ ;"
"ve <> ヹ ;"
"vo <> ヺ ;"
// Small ka / small ke: written in romaji with a literal leading '~'.
"'~ka' <> ヵ ;"
"'~ke' <> ヶ ;"
// ~~~ begin shared rules ~~~
//special
// Reverse only: a literal '~' followed by a small kana becomes "y" + vowel.
// The '~' is left behind by the "X | '~' < ... } $small_y" rules below.
"ya < '~'ャ;"
"yi < '~'ィ ;"
"yu < '~'ュ;"
"ye < '~'ェ;"
"yo < '~'ョ;"
//normal
"a <> ア ;"
// Pattern used for each consonant row that has palatalized syllables:
//   reverse: i-column kana before a small kana -> consonant, then '~' is left
//            after the cursor ('|') so the rules above finish the syllable;
//   forward: consonant + y before a vowel -> i-column kana, then '~y' is left
//            after the cursor so '~y'+vowel becomes the small kana.
"b | '~' < ヒ ゙} $small_y ;"
"by } $vowel > ビ | '~y' ;"
"ba <> バ ;"
"bi <> ビ ;"
"bu <> ブ ;"
"be <> ベ ;"
"bo <> ボ ;"
// Forward only: c before i or e is rewritten to s; the cursor is placed
// before the s so the s-row rules then apply to it.
"c } i > | s ;"
"c } e > | s ;"
"da <> ダ ;"
"di <> ディ ;"
"du <> デゥ ;"
"de <> デ ;"
"do <> ド ;"
"dzu <> ヅ ;"
// dj-row: the combined syllables are reverse only; the forward direction is
// handled by the "dj } $vowel" rule at the end of the group.
"dja < ヂャ ;"
"dji'~i' < ヂィ ;" // liu
"dju < ヂュ ;"
"dje < ヂェ ;"
"djo < ヂョ ;"
"dji <> ヂ ;"
"dj } $vowel > ヂ | '~y' ;"
// TODO: QUESTION: use ĵĴżŻ instead of dj, dz
// ch-row: same layout as the dj-row above.
"cha < チャ ;"
"chi'~i' < チィ ;" // liu
"chu < チュ ;"
"che < チェ ;"
"cho < チョ ;"
"chi <> チ ;"
"ch } $vowel > チ | '~y' ;"
"e <> エ ;"
"g | '~' < ギ} $small_y ;"
"gy } $vowel > ギ | '~y' ;"
"ga <> ガ ;"
"gi <> ギ ;"
"gu <> グ ;"
"ge <> ゲ ;"
"go <> ゴ ;"
"i <> イ ;"
// j } $vowel > ジ | '~y' ;
// j-row: the combined syllables are spelled out explicitly (two-way), except
// ジィ which is reverse only and keeps an explicit '~i'.
"ja <> ジャ ;"
"ji'~i' < ジィ ;" // liu
"ju <> ジュ ;"
"je <> ジェ ;"
"jo <> ジョ ;"
"ji <> ジ ;"
// k-, m-, n-, p- and h-rows share one layout:
//   reverse: i-column kana before a small kana -> consonant, leaving '~'
//            after the cursor ('|') for the '~'+small-kana rules;
//   forward: consonant + y before a vowel -> i-column kana, leaving '~y'
//            after the cursor so '~y'+vowel becomes the small kana;
//   then the five plain two-way syllables.
"k | '~' < キ} $small_y ;"
"ky } $vowel > キ | '~y' ;"
"ka <> カ ;"
"ki <> キ ;"
"ku <> ク ;"
"ke <> ケ ;"
"ko <> コ ;"
"m | '~' < ミ} $small_y ;"
"my } $vowel > ミ | '~y' ;"
"ma <> マ ;"
"mi <> ミ ;"
"mu <> ム ;"
"me <> メ ;"
"mo <> モ ;"
// Forward only: m before p, b, f or v is written as the syllabic n.
"m } [pbfv] > ン ;"
"n | '~' < ニ } $small_y ;"
"ny } $vowel > ニ | '~y' ;"
"na <> ナ ;"
"ni <> ニ ;"
"nu <> ヌ ;"
"ne <> ネ ;"
// The kana side of this rule was empty, which deleted "no" in the forward
// direction and left the reverse direction without a source; ノ restores the
// mapping in line with the other n-row rules and with $n_start.
"no <> ノ ;"
"o <> オ ;"
"p | '~' < ピ } $small_y ;"
"py } $vowel > ピ | '~y' ;"
"pa <> パ ;"
"pi <> ピ ;"
"pu <> プ ;"
"pe <> ペ ;"
"po <> ポ ;"
"h | '~' < ヒ } $small_y ;"
"hy } $vowel > ヒ | '~y' ;"
"ha <> ハ ;"
"hi <> ヒ ;"
// "hu" is the foreign-word spelling (he + small u); "fu" maps to フ elsewhere.
"hu <> ヘゥ ;"
"he <> ヘ ;"
"ho <> ホ ;"
// f | '~' < フ } $small_y ;
// f } $vowel > フ | '~' ;
// f-row: フ plus a small vowel, spelled out explicitly in both directions.
"fa <> ファ ;"
"fi <> フィ ;"
"fe <> フェ ;"
"fo <> フォ ;"
"fu <> フ ;"
// r-row: same palatalization layout as the other consonant rows
// (reverse rule leaves '~', forward rule leaves '~y' after the cursor).
"r | '~' < リ } $small_y ;"
"ry } $vowel > リ | '~y' ;"
"ra <> ラ ;"
"ri <> リ ;"
"ru <> ル ;"
"re <> レ ;"
"ro <> ロ ;"
// z-row: "zi" is the foreign-word spelling (ze + small i); ジ is "ji".
"za <> ザ ;"
"zi <> ゼィ ;"
"zu <> ズ ;"
"ze <> ゼ ;"
"zo <> ゾ ;"
// s-row: "si" is the foreign-word spelling (se + small i); シ is "shi".
"sa <> サ ;"
"si <> セィ ;"
"su <> ス ;"
"se <> セ ;"
"so <> ソ ;"
// sh-row: combined syllables are reverse only; the forward direction is
// handled by the "sh } $vowel" rule at the end of the group.
"sha < シャ ;"
"shi'~i' < シィ ;" // liu
"shu < シュ ;"
"she < シェ ;"
"sho < ショ ;"
"shi <> シ ;"
"sh } $vowel > シ | '~y' ;"
// t-row: "ti"/"tu" are foreign-word spellings (te + small i / small u);
// チ is "chi" and ツ is "tsu".
"ta <> タ ;"
"ti <> ティ ;"
"tu <> テゥ ;"
"te <> テ ;"
"to <> ト ;"
"tsu <> ツ ;"
// v } $vowel > ヴ | '~' ;
//'v~a' < ヴァ ; # liu
//'v~i' < ヴィ ; # liu
//'v~e' < ヴェ ; # liu
//'v~o' < ヴォ ; # liu
"vu <> ヴ ;"
"u <> ウ ;"
// w } $vowel > ウ | '~' ;
// w-row: "wu" is forward only and produces the same kana as plain "u".
"wa <> ワ ;"
"wi <> ヰ ;"
"wu > ウ ;"
"we <> ヱ ;"
"wo <> ヲ ;"
// y-row: "yi" and "ye" are forward only and produce the same kana as plain
// "i" and "e".
"ya <> ヤ ;"
"yi > イ ;"
"yu <> ユ ;"
"ye > エ ;"
"yo <> ヨ ;"
// double consonants
// A consonant that is immediately followed by the same consonant (trailing
// context after '}') maps to small tsu. In the two-way rules the reverse
// direction turns small tsu back into that consonant when the following kana
// belongs to the matching $X_start class.
//specials
// Forward only: s before "sh" and t before "ch".
"s } sh > ッ ;"
"t } ch > ッ ;"
//voiced
// These are listed before the unvoiced rules; the reverse context requires a
// voiced / semi-voiced mark after the base kana.
"j } j <> ッ } $j_start ;"
"b } b <> ッ } [$h_start$f_start] $voice;"
"d } d <> ッ } $t_start $voice;"
"g } g <> ッ } $k_start $voice;"
"p } p <> ッ } [$h_start$f_start] $semivoice;"
// v } v <> ッ } [ワヰウヱヲう] $voice ;
"z } z <> ッ } $s_start $voice;"
"v } v <> ッ } $v_start;"
// normal
"k } k <> ッ } $k_start ;"
"m } m <> ッ } $m_start ;"
"n } n <> ッ } $n_start ;"
"h } h <> ッ } $h_start ;"
"f } f <> ッ } $f_start ;"
"r } r <> ッ } $r_start ;"
"t } t <> ッ } $t_start ;"
"s } s <> ッ } $s_start ;"
"w } w <> ッ } $w_start;"
"y } y <> ッ } $y_start;"
// completeness
// Forward only: doubled letters that have no kana row of their own.
"x } x > ッ ;"
"c } k > ッ ;"
"c } c > ッ ;"
"c } q > ッ ;"
"l } l > ッ ;"
"q } q > ッ ;"
// y } y > ッ ;
// w } w > ッ ;
// prolonged vowel mark. this indicates a doubling of
// the preceding vowel sound
//a < a { ー ; # liu
//e < e { ー ; # liu
//i < i { ー ; # liu
//o < o { ー ; # liu
//u < u { ー ; # liu
// Two-way: the combining macron ($macron) corresponds to the prolonged sound mark.
"$macron <> ー ;"
// small forms
// Explicit small kana, written in romaji with a literal leading '~'.
// '~yi' and '~ye' are forward only; they produce the same kana as '~i' and '~e'.
"'~a' <> ァ ;"
"'~i' <> ィ ;"
"'~u' <> ゥ ;"
"'~e' <> ェ ;"
"'~o' <> ォ ;"
"'~tsu' <> ッ ;"
"'~wa' <> ヮ ;"
"'~ya' <> ャ ;"
"'~yi' > ィ ;"
"'~yu' <> ュ ;"
"'~ye' > ェ ;"
"'~yo' <> ョ ;"
// iteration marks
// TODO: make more accurate
// Reverse-only rules. Each matches the iteration mark ヽ (plus a voicing mark)
// with the already-produced romaji syllable as preceding context (the text
// before '{'). The captured "y* vowel" part ($1) is repeated after the
// consonant given on the left-hand side.
// Unvoiced consonant in the context -> voiced consonant in the output.
"j $1 < sh (y* $vowel) {ヽ$voice ;"
"dj $1 < ch (y* $vowel) {ヽ$voice ;"
"dz $1 < ts (y* $vowel) {ヽ$voice ;"
"g $1 < k (y* $vowel) {ヽ$voice ;"
"z $1 < s (y* $vowel) {ヽ$voice ;"
"d $1 < t (y* $vowel) {ヽ$voice ;"
// NOTE(review): the rules above map the context consonant to its voiced
// counterpart (k->g, s->z, t->d), but this one has h and b the other way
// round -- confirm whether "b $1 < h (...)" was intended.
"h $1 < b (y* $vowel) {ヽ$voice ;"
"v $1 < w (y* $vowel) {ヽ$voice ;"
// Consonant repeated unchanged.
// NOTE(review): the sh, ch and ts rules in this group have the same match
// pattern as the first three rules of this section, which are listed earlier;
// presumably they never fire -- confirm.
"sh $1 < sh (y* $vowel) {ヽ$voice ;"
"j $1 < j (y* $vowel) {ヽ$voice ;"
"ch $1 < ch (y* $vowel) {ヽ$voice ;"
"dj $1 < dj(y* $vowel) {ヽ$voice ;"
"ts $1 < ts (y* $vowel) {ヽ$voice ;"
"dz $1 < dz (y* $vowel) {ヽ$voice ;"
// Any other consonant + vowel syllable is repeated as is; the voicing mark is optional here.
"$1 < ($consonant y* $vowel) {ヽ$voice? ;"
"$1 < (.) {ヽ $voice? ;" // otherwise repeat last character
"< ヽ $voice? ;" // delete if no characters found
// h- rule: lengthens vowel if not followed by a vowel
// NOTE(review): as written, the matched text is the vowel and the h is only
// trailing context, so this rule would replace the vowel itself; the plain
// vowel rules are also listed earlier in the file and would presumably match
// first -- confirm the intent.
"[aeiou] } h > ー ;"
// one-way latin- > kana rules. these do not occur in
// well-formed romaji representing actual japanese text.
// their purpose is to make all romaji map to kana of
// some sort.
// the following are not really necessary, but produce
// slightly more natural results.
// consonant + y with no following vowel (the "Xy } $vowel" rules did not match).
"cy > セィ ;"
"dy > ディ ;"
"hy > ヒ ;"
"sy > セィ ;"
"ty > ティ ;"
"zy > ゼィ ;"
"h > ヘ ;"
// isolated consonants listed here so as not to mask
// longer rules above.
"ch > チ;"
"sh > シ ;"
"dz > ヅ ;"
"dj > ヂ;"
"b > ブ ;"
"d > デ ;"
"g > グ ;"
"k > ク ;"
"m > ム ;"
// Reverse only: ン before a kana in $n_quoter is written "n'" (the doubled
// quote is a literal apostrophe) so it cannot be read as part of the next syllable.
"n'' < ン } $n_quoter ;"
"n <> ン ;"
"p > プ ;"
"r > ル ;"
"s > ス ;"
"t > テ ;"
"y > イ ;"
"z > ズ ;"
"v > ヴ ;"
"f > フ;"
"j > ジ;"
"w > ウ;"
// Letters outside a-z are rewritten to ASCII letters; the cursor ('|') is
// placed before the replacement so the rules above are applied to it.
"ß > | ss ;"
"æ > | e ;"
"ð > | d ;"
"ø > | u ;"
"þ > | th ;"
// simple substitutions using backup
"c > | k ;"
"l > | r ;"
"q > | k ;"
"x > | ks ;"
// ~~~ END shared rules ~~~
//------------------------------------------------------
// Final cleanup
// Both rules are forward only and replace their match with nothing.
"'~' > ;" // delete stray tildes between letters
// The doubled quote is a literal apostrophe; it is removed only when it has
// a katakana character before it and a Latin character after it.
"[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters
// [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
// Recompose to NFC going forward; the parenthesized NFD is the step used
// when the rule set is run in the reverse direction.
":: NFC (NFD) ;"
// Reverse only (the forward part is empty): run the halfwidth-fullwidth
// transform over katakana.
":: ([:Katakana:] halfwidth-fullwidth);"
// note: a global filter is more efficient, but MUST include all source chars!!
//:: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
// MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
// Global filter for the reverse (Katakana -> Latin) direction, given in
// parentheses so that it applies only to that direction.
":: ( [[\\\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;"
// eof
}
}