scuffed-code/icu4c/source/data/translit/t_Latn_Kana.txt
Alan Liu 2443c39da1 ICU-1575 full rule update
X-SVN-Rev: 7281
2001-12-03 20:51:19 +00:00

517 lines
11 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpicurules.bat
// Source: ../../text/resources/Transliterator_Latin_Katakana.txt
// Date: Mon Dec 3 11:44:30 2001
//--------------------------------------------------------------------
// Latin_Katakana
translit_Latin_Katakana {
Rule {
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// $Source: /xsrl/Nsvn/icu/icu/source/data/translit/Attic/t_Latn_Kana.txt,v $
// $Date: 2001/12/03 20:51:19 $
// $Revision: 1.8 $
//--------------------------------------------------------------------
// note: a global filter is more efficient, but MUST include all source chars
//:: [\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]] ;
// MINIMAL FILTER GENERATED FOR: Latin-Katakana
//## WARNING -- must add width filter, both here and below!!! ###
":: [[\u1100-\u1112\u111A\u1121\u1160-\u1175\u11AA\u11AC-\u11AD\u11B0-\u11B5\u2190-\u2193\u2502\u25A0\u25CB\u3000-\u3002\u300C-\u300D\u3099-\u309A\u30A1-\u30ED\u30EF\u30F2-\u30F4\u30F7\u30FA-\u30FC\uFF01-\uFF5E\uFFE0-\uFFE6][',.A-Za-z~\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01ED\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0304\u04E2-\u04E3\u04EE-\u04EF\u1E00-\u1E99\u1EA0-\u1EF9\u1FB1\u1FB9\u1FD1\u1FD9\u1FE1\u1FE9\u212A-\u212B]] ;"
":: [:Latin:] fullwidth-halfwidth ();"
":: NFD (NFC);"
":: Lower ();" // whenever transliterating from cased to uncased script, include this
// :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
// Uses modified Hepburn. Small changes to make unambiguous.
// | Kunrei-shiki: Hepburn/MHepburn
// | ------------------------------
// | si: shi
// | si ~ya: sha
// | si ~yu: shu
// | si ~yo: sho
// | zi: ji
// | zi ~ya: ja
// | zi ~yu: ju
// | zi ~yo: jo
// | ti: chi
// | ti ~ya: cha
// | ti ~yu: chu
// | ti ~yu: cho
// | tu: tsu
// | di: ji/dji
// | du: zu/dzu
// | hu: fu
// | For foreign words:
// | -----------------
// | se ~i si
// | si ~e she
// |
// | ze ~i zi
// | zi ~e je
// |
// | te ~i ti
// | ti ~e che
// | te ~u tu
// |
// | de ~i di
// | de ~u du
// | de ~i di
// |
// | he ~u: hu
// | hu ~a fa
// | hu ~i fi
// | hu ~e he
// | hu ~o ho
// Most small forms are generated, but if necessary
// explicit small forms are given with ~a, ~ya, etc.
//------------------------------------------------------
// Variables
"$vowel = [aeiou] ;"
"$consonant = [bcdfghjklmnpqrstvwxyz] ;"
"$macron = \u0304 ;"
// Variables used for doubled-consonants with tsu
"$kana = [\u3041-\u3094] ;"
"$voice = [\u3099\u309B];"
"$semivoice = [\u309A\u309C];"
"$k_start = [カキクケコかきくけこ] ;"
"$s_start = [サシスセソさしすせそ] ;"
"$j_start = [シし] $voice ;"
"$t_start = [タチツテトたちつてと] ;"
"$n_start = [ナニヌネノンなにぬねの] ;"
"$h_start = [ハヒヘホはひへほ] ;"
"$f_start = [フふ] ;"
"$m_start = [マミムメモまみむめも] ;"
"$y_start = [ヤユヨやゆよ] ;"
"$r_start = [ラリルレロらりるれろ] ;"
"$w_start = [ワヰヱヲわゐゑを] ;"
"$v_start = [ワヰヱヲ]゙ ;"
// if ン is followed by $n_quoter, then it needs an
// apostrophe after its romaji form to disambiguate it.
// e.g., ン ア ! = ナ, so represent as "n'a", not "na".
"$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ヤ ユ ヨ ン] ;"
"$small_y = [ャィュェョ] ;"
"$iteration = \u309D ;"
//------------------------------------------------------
// katakana rules
// Punctuation
"'.' <> 。;"
"',' <> 、;"
// ' ' } [a-z] > ; # delete spaces before latin
// ' ' < [^' '\u30A0-\u30ff] {} ['\u30A0-\u30ff] ; #insert spaces before hiragana
// Iteration Mark
// Copy previous letter & marks
// TODO
// | $1 $1 < ($kana [[:M:]$voice$semivoice]?) $iteration
// Specials for katakana -- not shared with hiragana
"va <> ヷ ;"
"vi <> ヸ ;"
"ve <> ヹ ;"
"vo <> ヺ ;"
"'~ka' <> ヵ ;"
"'~ke' <> ヶ ;"
// ~~~ begin shared rules ~~~
//special
"ya < '~'ャ;"
"yi < '~'ィ ;"
"yu < '~'ュ;"
"ye < '~'ェ;"
"yo < '~'ョ;"
//normal
"a <> ア ;"
"b | '~' < ヒ ゙} $small_y ;"
"by } $vowel > ビ | '~y' ;"
"ba <> バ ;"
"bi <> ビ ;"
"bu <> ブ ;"
"be <> ベ ;"
"bo <> ボ ;"
"c } i > | s ;"
"c } e > | s ;"
"da <> ダ ;"
"di <> ディ ;"
"du <> デゥ ;"
"de <> デ ;"
"do <> ド ;"
"dzu <> ヅ ;"
"dja < ヂャ ;"
"dji'~i' < ヂィ ;" // liu
"dju < ヂュ ;"
"dje < ヂェ ;"
"djo < ヂョ ;"
"dji <> ヂ ;"
"dj } $vowel > ヂ | '~y' ;"
// TODO: QUESTION: use ĵĴżŻ instead of dj, dz
"cha < チャ ;"
"chi'~i' < チィ ;" // liu
"chu < チュ ;"
"che < チェ ;"
"cho < チョ ;"
"chi <> チ ;"
"ch } $vowel > チ | '~y' ;"
"e <> エ ;"
"g | '~' < ギ} $small_y ;"
"gy } $vowel > ギ | '~y' ;"
"ga <> ガ ;"
"gi <> ギ ;"
"gu <> グ ;"
"ge <> ゲ ;"
"go <> ゴ ;"
"i <> イ ;"
// j } $vowel > ジ | '~y' ;
"ja <> ジャ ;"
"ji'~i' < ジィ ;" // liu
"ju <> ジュ ;"
"je <> ジェ ;"
"jo <> ジョ ;"
"ji <> ジ ;"
"k | '~' < キ} $small_y ;"
"ky } $vowel > キ | '~y' ;"
"ka <> カ ;"
"ki <> キ ;"
"ku <> ク ;"
"ke <> ケ ;"
"ko <> コ ;"
"m | '~' < ミ} $small_y ;"
"my } $vowel > ミ | '~y' ;"
"ma <> マ ;"
"mi <> ミ ;"
"mu <> ム ;"
"me <> メ ;"
"mo <> モ ;"
"m } [pbfv] > ン ;"
"n | '~' < ニ } $small_y ;"
"ny } $vowel > ニ | '~y' ;"
"na <> ナ ;"
"ni <> ニ ;"
"nu <> ヌ ;"
"ne <> ネ ;"
"no <> ;"
"o <> オ ;"
"p | '~' < ピ } $small_y ;"
"py } $vowel > ピ | '~y' ;"
"pa <> パ ;"
"pi <> ピ ;"
"pu <> プ ;"
"pe <> ペ ;"
"po <> ポ ;"
"h | '~' < ヒ } $small_y ;"
"hy } $vowel > ヒ | '~y' ;"
"ha <> ハ ;"
"hi <> ヒ ;"
"hu <> ヘゥ ;"
"he <> ヘ ;"
"ho <> ホ ;"
// f | '~' < フ } $small_y ;
// f } $vowel > フ | '~' ;
"fa <> ファ ;"
"fi <> フィ ;"
"fe <> フェ ;"
"fo <> フォ ;"
"fu <> フ ;"
"r | '~' < リ } $small_y ;"
"ry } $vowel > リ | '~y' ;"
"ra <> ラ ;"
"ri <> リ ;"
"ru <> ル ;"
"re <> レ ;"
"ro <> ロ ;"
"za <> ザ ;"
"zi <> ゼィ ;"
"zu <> ズ ;"
"ze <> ゼ ;"
"zo <> ゾ ;"
"sa <> サ ;"
"si <> セィ ;"
"su <> ス ;"
"se <> セ ;"
"so <> ソ ;"
"sha < シャ ;"
"shi'~i' < シィ ;" // liu
"shu < シュ ;"
"she < シェ ;"
"sho < ショ ;"
"shi <> シ ;"
"sh } $vowel > シ | '~y' ;"
"ta <> タ ;"
"ti <> ティ ;"
"tu <> テゥ ;"
"te <> テ ;"
"to <> ト ;"
"tsu <> ツ ;"
// v } $vowel > ヴ | '~' ;
//'v~a' < ヴァ ; # liu
//'v~i' < ヴィ ; # liu
//'v~e' < ヴェ ; # liu
//'v~o' < ヴォ ; # liu
"vu <> ヴ ;"
"u <> ウ ;"
// w } $vowel > ウ | '~' ;
"wa <> ワ ;"
"wi <> ヰ ;"
"wu > ウ ;"
"we <> ヱ ;"
"wo <> ヲ ;"
"ya <> ヤ ;"
"yi > イ ;"
"yu <> ユ ;"
"ye > エ ;"
"yo <> ヨ ;"
// double consonants
//specials
"s } sh > ッ ;"
"t } ch > ッ ;"
//voiced
"j } j <> ッ } $j_start ;"
"b } b <> ッ } [$h_start$f_start] $voice;"
"d } d <> ッ } $t_start $voice;"
"g } g <> ッ } $k_start $voice;"
"p } p <> ッ } [$h_start$f_start] $semivoice;"
// v } v <> ッ } [ワヰウヱヲう] $voice ;
"z } z <> ッ } $s_start $voice;"
"v } v <> ッ } $v_start;"
// normal
"k } k <> ッ } $k_start ;"
"m } m <> ッ } $m_start ;"
"n } n <> ッ } $n_start ;"
"h } h <> ッ } $h_start ;"
"f } f <> ッ } $f_start ;"
"r } r <> ッ } $r_start ;"
"t } t <> ッ } $t_start ;"
"s } s <> ッ } $s_start ;"
"w } w <> ッ } $w_start;"
"y } y <> ッ } $y_start;"
// completeness
"x } x > ッ ;"
"c } k > ッ ;"
"c } c > ッ ;"
"c } q > ッ ;"
"l } l > ッ ;"
"q } q > ッ ;"
// y } y > ッ ;
// w } w > ッ ;
// prolonged vowel mark. this indicates a doubling of
// the preceding vowel sound
//a < a { ー ; # liu
//e < e { ー ; # liu
//i < i { ー ; # liu
//o < o { ー ; # liu
//u < u { ー ; # liu
"$macron <> ー ;"
// small forms
"'~a' <> ァ ;"
"'~i' <> ィ ;"
"'~u' <> ゥ ;"
"'~e' <> ェ ;"
"'~o' <> ォ ;"
"'~tsu' <> ッ ;"
"'~wa' <> ヮ ;"
"'~ya' <> ャ ;"
"'~yi' > ィ ;"
"'~yu' <> ュ ;"
"'~ye' > ェ ;"
"'~yo' <> ョ ;"
// iteration marks
// TODO: make more accurate
"j $1 < sh (y* $vowel) {ヽ$voice ;"
"dj $1 < ch (y* $vowel) {ヽ$voice ;"
"dz $1 < ts (y* $vowel) {ヽ$voice ;"
"g $1 < k (y* $vowel) {ヽ$voice ;"
"z $1 < s (y* $vowel) {ヽ$voice ;"
"d $1 < t (y* $vowel) {ヽ$voice ;"
"h $1 < b (y* $vowel) {ヽ$voice ;"
"v $1 < w (y* $vowel) {ヽ$voice ;"
"sh $1 < sh (y* $vowel) {ヽ$voice ;"
"j $1 < j (y* $vowel) {ヽ$voice ;"
"ch $1 < ch (y* $vowel) {ヽ$voice ;"
"dj $1 < dj(y* $vowel) {ヽ$voice ;"
"ts $1 < ts (y* $vowel) {ヽ$voice ;"
"dz $1 < dz (y* $vowel) {ヽ$voice ;"
"$1 < ($consonant y* $vowel) {ヽ$voice? ;"
"$1 < (.) {ヽ $voice? ;" // otherwise repeat last character
"< ヽ $voice? ;" // delete if no characters found
// h- rule: lengthens vowel if not followed by a vowel
"[aeiou] } h > ー ;"
// one-way latin- > kana rules. these do not occur in
// well-formed romaji representing actual japanese text.
// their purpose is to make all romaji map to kana of
// some sort.
// the following are not really necessary, but produce
// slightly more natural results.
"cy > セィ ;"
"dy > ディ ;"
"hy > ヒ ;"
"sy > セィ ;"
"ty > ティ ;"
"zy > ゼィ ;"
"h > ヘ ;"
// isolated consonants listed here so as not to mask
// longer rules above.
"ch > チ;"
"sh > シ ;"
"dz > ヅ ;"
"dj > ヂ;"
"b > ブ ;"
"d > デ ;"
"g > グ ;"
"k > ク ;"
"m > ム ;"
"n'' < ン } $n_quoter ;"
"n <> ン ;"
"p > プ ;"
"r > ル ;"
"s > ス ;"
"t > テ ;"
"y > イ ;"
"z > ズ ;"
"v > ヴ ;"
"f > フ;"
"j > ジ;"
"w > ウ;"
"ß > | ss ;"
"æ > | e ;"
"ð > | d ;"
"ø > | u ;"
"þ > | th ;"
// simple substitutions using backup
"c > | k ;"
"l > | r ;"
"q > | k ;"
"x > | ks ;"
// ~~~ END shared rules ~~~
//------------------------------------------------------
// Final cleanup
"'~' > ;" // delete stray tildes between letters
"[:Katakana:] { '' } [:Latin:] > ;" // delete stray quotes between letters
// [\u02BE[:Nonspacing Mark:]-[\u3099-\u309C]] > ; # delete any non-spacing marks that we didn't use
":: NFC (NFD) ;"
":: ([:Katakana:] halfwidth-fullwidth);"
// note: a global filter is more efficient, but MUST include all source chars!!
//:: ([\\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9F [:Latin:][:Katakana:] [:nonspacing mark:]]);
// MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
":: ( [[\\\ -~\u00A2-\u00A3\u00A5-\u00A6\u00AC\u0304\u20A9\uFF61-\uFFBE\uFFC2-\uFFC7\uFFCA-\uFFCF\uFFD2-\uFFD7\uFFDA-\uFFDC\uFFE8-\uFFEE][~\u3001-\u3002\u304C\u304E\u3050\u3052\u3054\u3056\u3058\u305A\u305C\u305E\u3060\u3062\u3065\u3067\u3069\u3070-\u3071\u3073-\u3074\u3076-\u3077\u3079-\u307A\u307C-\u307D\u3094\u3099-\u309B\u309E\u30A1-\u30FA\u30FC-\u30FE]] ) ;"
// eof
}
}