// -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpICUrules.bat
// Source: ../../text/resources/Transliterator_Latin_Katakana.txt
// Date: Wed Nov 21 18:58:49 2001
//--------------------------------------------------------------------

// Latin_Katakana
//
// Layout of this resource:
//   * Each quoted string below is one piece of the transliterator rule text;
//     the pieces are listed in order inside Rule { }.
//   * "//" starts a resource-file comment that runs to the end of the
//     physical line, so every rule string MUST sit on its own line (or before
//     any trailing comment).  Do not join lines: a comment would then swallow
//     the rules that follow it.
//   * Rule order matters: earlier rules take precedence over later ones
//     (see "isolated consonants listed here so as not to mask longer rules").
//   * "a <> b" is a two-way rule, "a > b" is Latin -> Katakana only and
//     "a < b" is Katakana -> Latin only.
//
// NOTE(review): this file is generated from the Source named above.  Any fix
// made here (see the iteration-mark section) must also be applied to that
// source, or it will be lost on regeneration.

translit_Latin_Katakana {
  Rule {
    //--------------------------------------------------------------------
    // Copyright (c) 1999-2001, International Business Machines
    // Corporation and others. All Rights Reserved.
    //--------------------------------------------------------------------
    // $Source: /xsrl/Nsvn/icu/icu/source/data/translit/Attic/t_Latn_Kana.txt,v $
    // $Date: 2001/11/22 05:50:51 $
    // $Revision: 1.3 $
    //--------------------------------------------------------------------

    // note: a global filter is more efficient, but MUST include all source chars
    ":: [\\u0000-\u00FF [:Latin:][:nonspacing mark:]] ;"
    ":: NFKD (NFC);" // use NFKD to get the fullwidth latin characters
    ":: Lower ();" // whenever transliterating from cased to uncased script, include this
    // :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese

    // Uses modified Hepburn. Small changes to make unambiguous.
    // | Kunrei-shiki: Hepburn/MHepburn
    // | ------------------------------
    // | si: shi
    // | si ~ya: sha
    // | si ~yu: shu
    // | si ~yo: sho
    // | zi: ji
    // | zi ~ya: ja
    // | zi ~yu: ju
    // | zi ~yo: jo
    // | ti: chi
    // | ti ~ya: cha
    // | ti ~yu: chu
    // | ti ~yo: cho
    // | tu: tsu
    // | di: ji/dji
    // | du: zu/dzu
    // | hu: fu
    // | For foreign words:
    // | -----------------
    // | se ~i si
    // | si ~e she
    // |
    // | ze ~i zi
    // | zi ~e je
    // |
    // | te ~i ti
    // | ti ~e che
    // | te ~u tu
    // |
    // | de ~i di
    // | de ~u du
    // | de ~i di
    // |
    // | he ~u: hu
    // | hu ~a fa
    // | hu ~i fi
    // | hu ~e fe
    // | hu ~o fo

    // Most small forms are generated, but if necessary
    // explicit small forms are given with ~a, ~ya, etc.

    //------------------------------------------------------
    // Variables

    "$vowel = [aeiou] ;"
    "$consonant = [bcdfghjklmnpqrstvwxyz] ;"
    "$macron = \u0304 ;"

    // Variables used for doubled-consonants with tsu
    "$kana = [\u3041-\u3094] ;"
    "$voice = [\u3099\u309B];"
    "$semivoice = [\u309A\u309C];"
    "$k_start = [カキクケコかきくけこ] ;"
    "$s_start = [サシスセソさしすせそ] ;"
    "$j_start = [シし] $voice ;"
    "$t_start = [タチツテトたちつてと] ;"
    "$n_start = [ナニヌネノンなにぬねの] ;"
    "$h_start = [ハヒヘホはひへほ] ;"
    "$f_start = [フふ] ;"
    "$m_start = [マミムメモまみむめも] ;"
    "$y_start = [ヤユヨやゆよ] ;"
    "$r_start = [ラリルレロらりるれろ] ;"
    "$w_start = [ワヰヱヲわゐゑを] ;"
    "$v_start = [ワヰヱヲ]゙ ;"

    // if ン is followed by $n_quoter, then it needs an
    // apostrophe after its romaji form to disambiguate it.
    // e.g., ン ア ! = ナ, so represent as "n'a", not "na".
    "$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ノ ヤ ユ ヨ ン] ;"
    "$small_y = [ャィュェョ] ;"
    "$iteration = \u309D ;"

    //------------------------------------------------------
    // katakana rules

    // Punctuation
    "'.' <> 。;"
    "',' <> 、;"
    // ' ' } [a-z] > ; # delete spaces before latin
    // ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before hiragana

    // Iteration Mark
    // Copy previous letter & marks
    // TODO
    // | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration

    // Specials for katakana -- not shared with hiragana
    "va <> ヷ ;"
    "vi <> ヸ ;"
    "ve <> ヹ ;"
    "vo <> ヺ ;"
    "'~ka' <> ヵ ;"
    "'~ke' <> ヶ ;"

    // ~~~ begin shared rules ~~~

    //special
    "ya < '~'ャ;"
    "yi < '~'ィ ;"
    "yu < '~'ュ;"
    "ye < '~'ェ;"
    "yo < '~'ョ;"

    //normal
    "a <> ア ;"

    "b | '~' < ヒ ゙} $small_y ;"
    "by } $vowel > ビ | '~y' ;"
    "ba <> バ ;"
    "bi <> ビ ;"
    "bu <> ブ ;"
    "be <> ベ ;"
    "bo <> ボ ;"

    // c before i/e is re-read as s (the cursor "|" backs up over the output)
    "c } i > | s ;"
    "c } e > | s ;"

    "da <> ダ ;"
    "di <> ディ ;"
    "du <> デゥ ;"
    "de <> デ ;"
    "do <> ド ;"

    "dzu <> ヅ ;"

    "dja < ヂャ ;"
    "dji'~i' < ヂィ ;" // liu
    "dju < ヂュ ;"
    "dje < ヂェ ;"
    "djo < ヂョ ;"
    "dji <> ヂ ;"
    "dj } $vowel > ヂ | '~y' ;"
    // TODO: QUESTION: use ĵĴżŻ instead of dj, dz

    "cha < チャ ;"
    "chi'~i' < チィ ;" // liu
    "chu < チュ ;"
    "che < チェ ;"
    "cho < チョ ;"
    "chi <> チ ;"
    "ch } $vowel > チ | '~y' ;"

    "e <> エ ;"

    "g | '~' < ギ} $small_y ;"
    "gy } $vowel > ギ | '~y' ;"
    "ga <> ガ ;"
    "gi <> ギ ;"
    "gu <> グ ;"
    "ge <> ゲ ;"
    "go <> ゴ ;"

    "i <> イ ;"

    // j } $vowel > ジ | '~y' ;
    "ja <> ジャ ;"
    "ji'~i' < ジィ ;" // liu
    "ju <> ジュ ;"
    "je <> ジェ ;"
    "jo <> ジョ ;"
    "ji <> ジ ;"

    "k | '~' < キ} $small_y ;"
    "ky } $vowel > キ | '~y' ;"
    "ka <> カ ;"
    "ki <> キ ;"
    "ku <> ク ;"
    "ke <> ケ ;"
    "ko <> コ ;"

    "m | '~' < ミ} $small_y ;"
    "my } $vowel > ミ | '~y' ;"
    "ma <> マ ;"
    "mi <> ミ ;"
    "mu <> ム ;"
    "me <> メ ;"
    "mo <> モ ;"
    "m } [pbfv] > ン ;"

    "n | '~' < ニ } $small_y ;"
    "ny } $vowel > ニ | '~y' ;"
    "na <> ナ ;"
    "ni <> ニ ;"
    "nu <> ヌ ;"
    "ne <> ネ ;"
    "no <> ノ ;"

    "o <> オ ;"

    "p | '~' < ピ } $small_y ;"
    "py } $vowel > ピ | '~y' ;"
    "pa <> パ ;"
    "pi <> ピ ;"
    "pu <> プ ;"
    "pe <> ペ ;"
    "po <> ポ ;"

    "h | '~' < ヒ } $small_y ;"
    "hy } $vowel > ヒ | '~y' ;"
    "ha <> ハ ;"
    "hi <> ヒ ;"
    "hu <> ヘゥ ;"
    "he <> ヘ ;"
    "ho <> ホ ;"

    // f | '~' < フ } $small_y ;
    // f } $vowel > フ | '~' ;
    "fa <> ファ ;"
    "fi <> フィ ;"
    "fe <> フェ ;"
    "fo <> フォ ;"
    "fu <> フ ;"

    "r | '~' < リ } $small_y ;"
    "ry } $vowel > リ | '~y' ;"
    "ra <> ラ ;"
    "ri <> リ ;"
    "ru <> ル ;"
    "re <> レ ;"
    "ro <> ロ ;"

    "za <> ザ ;"
    "zi <> ゼィ ;"
    "zu <> ズ ;"
    "ze <> ゼ ;"
    "zo <> ゾ ;"

    "sa <> サ ;"
    "si <> セィ ;"
    "su <> ス ;"
    "se <> セ ;"
    "so <> ソ ;"

    "sha < シャ ;"
    "shi'~i' < シィ ;" // liu
    "shu < シュ ;"
    "she < シェ ;"
    "sho < ショ ;"
    "shi <> シ ;"
    "sh } $vowel > シ | '~y' ;"

    "ta <> タ ;"
    "ti <> ティ ;"
    "tu <> テゥ ;"
    "te <> テ ;"
    "to <> ト ;"

    "tsu <> ツ ;"

    // v } $vowel > ヴ | '~' ;
    //'v~a' < ヴァ ; # liu
    //'v~i' < ヴィ ; # liu
    //'v~e' < ヴェ ; # liu
    //'v~o' < ヴォ ; # liu
    "vu <> ヴ ;"

    "u <> ウ ;"

    // w } $vowel > ウ | '~' ;
    "wa <> ワ ;"
    "wi <> ヰ ;"
    "wu > ウ ;"
    "we <> ヱ ;"
    "wo <> ヲ ;"

    "ya <> ヤ ;"
    "yi > イ ;"
    "yu <> ユ ;"
    "ye > エ ;"
    "yo <> ヨ ;"

    // double consonants

    //specials
    "s } sh > ッ ;"
    "t } ch > ッ ;"

    //voiced
    "j } j <> ッ } $j_start ;"
    "b } b <> ッ } [$h_start$f_start] $voice;"
    "d } d <> ッ } $t_start $voice;"
    "g } g <> ッ } $k_start $voice;"
    "p } p <> ッ } [$h_start$f_start] $semivoice;"
    // v } v <> ッ } [ワヰウヱヲう] $voice ;
    "z } z <> ッ } $s_start $voice;"
    "v } v <> ッ } $v_start;"

    // normal
    "k } k <> ッ } $k_start ;"
    "m } m <> ッ } $m_start ;"
    "n } n <> ッ } $n_start ;"
    "h } h <> ッ } $h_start ;"
    "f } f <> ッ } $f_start ;"
    "r } r <> ッ } $r_start ;"
    "t } t <> ッ } $t_start ;"
    "s } s <> ッ } $s_start ;"
    "w } w <> ッ } $w_start;"
    "y } y <> ッ } $y_start;"

    // completeness
    "x } x > ッ ;"
    "c } k > ッ ;"
    "c } c > ッ ;"
    "c } q > ッ ;"
    "l } l > ッ ;"
    "q } q > ッ ;"
    // y } y > ッ ;
    // w } w > ッ ;

    // prolonged vowel mark. this indicates a doubling of
    // the preceding vowel sound
    //a < a { ー ; # liu
    //e < e { ー ; # liu
    //i < i { ー ; # liu
    //o < o { ー ; # liu
    //u < u { ー ; # liu
    "$macron <> ー ;"

    // small forms
    "'~a' <> ァ ;"
    "'~i' <> ィ ;"
    "'~u' <> ゥ ;"
    "'~e' <> ェ ;"
    "'~o' <> ォ ;"
    "'~tsu' <> ッ ;"
    "'~wa' <> ヮ ;"
    "'~ya' <> ャ ;"
    "'~yi' > ィ ;"
    "'~yu' <> ュ ;"
    "'~ye' > ェ ;"
    "'~yo' <> ョ ;"

    // iteration marks
    // TODO: make more accurate
    //
    // Katakana -> Latin only.  Each rule looks at the romaji already emitted
    // for the preceding syllable (the context before "{") and, for a voiced
    // iteration mark, emits the voiced counterpart of that consonant followed
    // by the captured vowel part ($1).
    "j $1 < sh (y* $vowel) {ヽ$voice ;"
    "dj $1 < ch (y* $vowel) {ヽ$voice ;"
    "dz $1 < ts (y* $vowel) {ヽ$voice ;"
    "g $1 < k (y* $vowel) {ヽ$voice ;"
    "z $1 < s (y* $vowel) {ヽ$voice ;"
    "d $1 < t (y* $vowel) {ヽ$voice ;"
    // Fixed: was "h $1 < b ...", which had the two sides swapped relative to
    // every sibling rule (unvoiced context -> voiced output) and so turned a
    // repeated "ba" into "ha" while never voicing a repeated "ha".
    "b $1 < h (y* $vowel) {ヽ$voice ;"
    "v $1 < w (y* $vowel) {ヽ$voice ;"

    // NOTE(review): the "sh", "ch" and "ts" rules below have exactly the same
    // match pattern as the "j $1 < sh", "dj $1 < ch" and "dz $1 < ts" rules
    // above, so they look unreachable -- confirm intent (see TODO above).
    "sh $1 < sh (y* $vowel) {ヽ$voice ;"
    "j $1 < j (y* $vowel) {ヽ$voice ;"
    "ch $1 < ch (y* $vowel) {ヽ$voice ;"
    "dj $1 < dj(y* $vowel) {ヽ$voice ;"
    "ts $1 < ts (y* $vowel) {ヽ$voice ;"
    "dz $1 < dz (y* $vowel) {ヽ$voice ;"

    "$1 < ($consonant y* $vowel) {ヽ$voice? ;"
    "$1 < (.) {ヽ $voice? ;" // otherwise repeat last character
    "< ヽ $voice? ;" // delete if no characters found

    // h- rule: lengthens vowel if not followed by a vowel
    // NOTE(review): as written the vowel is the matched key and "h" is only
    // trailing context, and the single-vowel rules ("a <> ア" etc.) appear
    // earlier -- confirm this rule behaves as the comment describes.
    "[aeiou] } h > ー ;"

    // one-way latin- > kana rules. these do not occur in
    // well-formed romaji representing actual japanese text.
    // their purpose is to make all romaji map to kana of
    // some sort.

    // the following are not really necessary, but produce
    // slightly more natural results.
    "cy > セィ ;"
    "dy > ディ ;"
    "hy > ヒ ;"
    "sy > セィ ;"
    "ty > ティ ;"
    "zy > ゼィ ;"
    "h > ヘ ;"

    // isolated consonants listed here so as not to mask
    // longer rules above.
    "ch > チ;"
    "sh > シ ;"
    "dz > ヅ ;"
    "dj > ヂ;"
    "b > ブ ;"
    "d > デ ;"
    "g > グ ;"
    "k > ク ;"
    "m > ム ;"
    "n'' < ン } $n_quoter ;"
    "n <> ン ;"
    "p > プ ;"
    "r > ル ;"
    "s > ス ;"
    "t > テ ;"
    "y > イ ;"
    "z > ズ ;"
    "v > ヴ ;"
    "f > フ;"
    "j > ジ;"
    "w > ウ;"

    // non-ASCII Latin letters are rewritten to ASCII and then re-processed
    "ß > | ss ;"
    "æ > | e ;"
    "ð > | d ;"
    "ø > | u ;"
    "þ > | th ;"

    // simple substitutions using backup
    "c > | k ;"
    "l > | r ;"
    "q > | k ;"
    "x > | ks ;"

    // ~~~ END shared rules ~~~

    //------------------------------------------------------
    // Final cleanup
    "'~' > ;" // delete stray tildes between letters
    "[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters
    "[\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ;" // delete any non-spacing marks that we didn't use
    ":: NFC (NFKD) ;" // use NFKD to get the halfwidth katakana characters

    // note: a global filter is more efficient, but MUST include all source chars!!
    ":: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Katakana:] [:nonspacing mark:]]);"

    // eof
  }
}