208 lines
4.4 KiB
Plaintext
208 lines
4.4 KiB
Plaintext
|
#--------------------------------------------------------------------
|
|||
|
# Copyright (c) 1999-2004, International Business Machines
|
|||
|
# Corporation and others. All Rights Reserved.
|
|||
|
#--------------------------------------------------------------------
|
|||
|
|
|||
|
# note: a global filter is more efficient, but MUST include all source chars
|
|||
|
# Forward-direction (Hiragana to Katakana) global filter: only characters
# in this set are offered to the rules below; all other text passes
# through untouched.
:: [\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;
|
|||
|
# Forward-only normalization step: the empty "()" means this line
# contributes nothing to the reverse direction.
:: NFKC ();
|
|||
|
|
|||
|
# Hiragana-Katakana
|
|||
|
|
|||
|
# This is largely a one-to-one mapping, but it has a
|
|||
|
# few kinks:
|
|||
|
|
|||
|
# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
|
|||
|
# Hiragana equivalents. We use Hiragana wa/wi/we/wo
|
|||
|
# (308F-3092) with a voicing mark (3099), which is
|
|||
|
# semantically equivalent. However, this is a non-
|
|||
|
# roundtripping transformation.
|
|||
|
|
|||
|
# 2. The Katakana small ka/ke (30F5,30F6) have no
|
|||
|
# Hiragana equivalents. We convert them to normal
|
|||
|
# Hiragana ka/ke (304B,3051). This is a one-way
|
|||
|
# information-losing transformation and precludes
|
|||
|
# round-tripping of 30F5 and 30F6.
|
|||
|
|
|||
|
# 3. The combining marks 3099-309C are in the Hiragana
|
|||
|
# block, but they apply to Katakana as well, so we
|
|||
|
# leave them untouched.
|
|||
|
|
|||
|
# 4. The Katakana prolonged sound mark 30FC doubles the
|
|||
|
# preceding vowel. This is a one-way information-
|
|||
|
# losing transformation from Katakana to Hiragana.
|
|||
|
|
|||
|
# 5. The Katakana middle dot separates words in foreign
|
|||
|
# expressions; we leave this unmodified.
|
|||
|
|
|||
|
# The above points preclude successful round-trip
|
|||
|
# transformations of arbitrary input text. However,
|
|||
|
# they provide naturalistic results that should conform
|
|||
|
# to user expectations.
|
|||
|
|
|||
|
|
|||
|
# Combining equivalents va/vi/ve/vo
|
|||
|
# Two-way rules: Hiragana wa/wi/we/wo followed by the combining voiced
# sound mark (3099) <> the precomposed Katakana va/vi/ve/vo (30F7-30FA).
# They are listed ahead of the plain wa/wi/we/wo rules further down so
# that the two-character sequence is tried first.
わ゙ <> ヷ;
|
|||
|
ゐ゙ <> ヸ;
|
|||
|
ゑ゙ <> ヹ;
|
|||
|
を゙ <> ヺ;
|
|||
|
|
|||
|
# One-to-one mappings, main block
|
|||
|
# 3041:3094 <> 30A1:30F4
|
|||
|
# 309D,E <> 30FD,E
|
|||
|
# Vowels a/i/u/e/o, each as a small/full-size pair
# (3041-304A <> 30A1-30AA).  "<>" makes every rule two-way.
ぁ <> ァ;
|
|||
|
あ <> ア;
|
|||
|
ぃ <> ィ;
|
|||
|
い <> イ;
|
|||
|
ぅ <> ゥ;
|
|||
|
う <> ウ;
|
|||
|
ぇ <> ェ;
|
|||
|
え <> エ;
|
|||
|
ぉ <> ォ;
|
|||
|
お <> オ;
|
|||
|
# K row with its voiced (g) forms: ka ga ki gi ku gu ke ge ko go
# (304B-3054 <> 30AB-30B4).
か <> カ;
|
|||
|
が <> ガ;
|
|||
|
き <> キ;
|
|||
|
ぎ <> ギ;
|
|||
|
く <> ク;
|
|||
|
ぐ <> グ;
|
|||
|
け <> ケ;
|
|||
|
げ <> ゲ;
|
|||
|
こ <> コ;
|
|||
|
ご <> ゴ;
|
|||
|
# S row with its voiced (z) forms: sa za shi ji su zu se ze so zo
# (3055-305E <> 30B5-30BE).
さ <> サ;
|
|||
|
ざ <> ザ;
|
|||
|
し <> シ;
|
|||
|
じ <> ジ;
|
|||
|
す <> ス;
|
|||
|
ず <> ズ;
|
|||
|
せ <> セ;
|
|||
|
ぜ <> ゼ;
|
|||
|
そ <> ソ;
|
|||
|
ぞ <> ゾ;
|
|||
|
# T row with its voiced (d) forms, including small tsu
# (305F-3069 <> 30BF-30C9).
た <> タ;
|
|||
|
だ <> ダ;
|
|||
|
ち <> チ;
|
|||
|
ぢ <> ヂ;
|
|||
|
っ <> ッ;
|
|||
|
つ <> ツ;
|
|||
|
づ <> ヅ;
|
|||
|
て <> テ;
|
|||
|
で <> デ;
|
|||
|
と <> ト;
|
|||
|
ど <> ド;
|
|||
|
# N row: na ni nu ne no (306A-306E <> 30CA-30CE).
な <> ナ;
|
|||
|
に <> ニ;
|
|||
|
ぬ <> ヌ;
|
|||
|
ね <> ネ;
|
|||
|
の <> ノ;
|
|||
|
# H row, each syllable followed by its voiced (b) and semi-voiced (p)
# forms (306F-307D <> 30CF-30DD).
は <> ハ;
|
|||
|
ば <> バ;
|
|||
|
ぱ <> パ;
|
|||
|
ひ <> ヒ;
|
|||
|
び <> ビ;
|
|||
|
ぴ <> ピ;
|
|||
|
ふ <> フ;
|
|||
|
ぶ <> ブ;
|
|||
|
ぷ <> プ;
|
|||
|
へ <> ヘ;
|
|||
|
べ <> ベ;
|
|||
|
ぺ <> ペ;
|
|||
|
ほ <> ホ;
|
|||
|
ぼ <> ボ;
|
|||
|
ぽ <> ポ;
|
|||
|
# M row: ma mi mu me mo (307E-3082 <> 30DE-30E2).
ま <> マ;
|
|||
|
み <> ミ;
|
|||
|
む <> ム;
|
|||
|
め <> メ;
|
|||
|
も <> モ;
|
|||
|
# Y row, each as a small/full-size pair: ya yu yo
# (3083-3088 <> 30E3-30E8).
ゃ <> ャ;
|
|||
|
や <> ヤ;
|
|||
|
ゅ <> ュ;
|
|||
|
ゆ <> ユ;
|
|||
|
ょ <> ョ;
|
|||
|
よ <> ヨ;
|
|||
|
# R row: ra ri ru re ro (3089-308D <> 30E9-30ED).
ら <> ラ;
|
|||
|
り <> リ;
|
|||
|
る <> ル;
|
|||
|
れ <> レ;
|
|||
|
ろ <> ロ;
|
|||
|
# W row (small wa, wa, wi, we, wo), syllabic n, and vu
# (308E-3094 <> 30EE-30F4).  The wa/wi/we/wo rules here cover the bare
# letters; the forms carrying a combining voicing mark are handled by the
# va/vi/ve/vo rules earlier in the file.
ゎ <> ヮ;
|
|||
|
わ <> ワ;
|
|||
|
ゐ <> ヰ;
|
|||
|
ゑ <> ヱ;
|
|||
|
を <> ヲ;
|
|||
|
ん <> ン;
|
|||
|
ゔ <> ヴ;
|
|||
|
# Iteration marks, plain and voiced (309D,309E <> 30FD,30FE).
ゝ <> ヽ;
|
|||
|
ゞ <> ヾ;
|
|||
|
|
|||
|
# One-way Katakana-Hiragana xform of small K ka/ke to
|
|||
|
# normal H ka/ke.
|
|||
|
# "<" makes these reverse-only (Katakana to Hiragana) rules.  In the
# forward direction か and け are already mapped to カ and ケ by the
# two-way rules in the main block, so ヵ and ヶ are never produced.
か < ヵ;
|
|||
|
け < ヶ;
|
|||
|
|
|||
|
# Katakana followed by a prolonged sound mark 30FC has
|
|||
|
# its final vowel doubled. This is a Katakana-Hiragana
|
|||
|
# one-way information-losing transformation. We
|
|||
|
# include the small Katakana (e.g., small A 30A1) and
|
|||
|
# do not distinguish them from their large
|
|||
|
# counterparts. It doesn't make sense to double a
|
|||
|
# small counterpart vowel as a small Hiragana vowel, so
|
|||
|
# we don't do so. In natural text this should never
|
|||
|
# occur anyway. If a 30FC is seen without a preceding
|
|||
|
# vowel sound (e.g., after n 30F3) we do not change it.
|
|||
|
|
|||
|
### $long = ー;
|
|||
|
|
|||
|
# The following categories are Hiragana, not Katakana
|
|||
|
# as might be expected, since by the time we get to the
|
|||
|
# 30FC, the preceding character will have already been
|
|||
|
# transformed to Hiragana.
|
|||
|
|
|||
|
# {The following mechanically generated from the
|
|||
|
# Unicode 3.0 data:}
|
|||
|
|
|||
|
# $xa: Hiragana whose vowel is "a" (small, plain, voiced and semi-voiced
# forms).  Used as the left context of the rule that turns ー into あ.
# NOTE(review): the trailing backslashes look like line continuations
# handled by the rule loader -- confirm before reformatting.
$xa = [ \
|
|||
|
ぁ あ か が さ ざ \
|
|||
|
た だ な は ば ぱ \
|
|||
|
ま ゃ や ら ゎ わ \
|
|||
|
];
|
|||
|
|
|||
|
# $xi: Hiragana whose vowel is "i".  Used as the left context of the rule
# that turns ー into い.
$xi = [ \
|
|||
|
ぃ い き ぎ し じ \
|
|||
|
ち ぢ に ひ び ぴ \
|
|||
|
み り ゐ \
|
|||
|
];
|
|||
|
|
|||
|
# $xu: Hiragana whose vowel is "u".  Used as the left context of the rule
# that turns ー into う.
# NOTE(review): small tsu (っ) is in this set although it carries no vowel
# of its own, so っー becomes っう in the reverse direction.  Presumably a
# side effect of the mechanical generation -- confirm it is intended.
$xu = [ \
|
|||
|
ぅ う く ぐ す ず \
|
|||
|
っ つ づ ぬ ふ ぶ \
|
|||
|
ぷ む ゅ ゆ る ゔ \
|
|||
|
];
|
|||
|
|
|||
|
# $xe: Hiragana whose vowel is "e".  Used as the left context of the rule
# that turns ー into え.
$xe = [ \
|
|||
|
ぇ え け げ せ ぜ \
|
|||
|
て で ね へ べ ぺ \
|
|||
|
め れ ゑ \
|
|||
|
];
|
|||
|
|
|||
|
# $xo: Hiragana whose vowel is "o".  Used as the left context of the rule
# that turns ー into お.
$xo = [ \
|
|||
|
ぉ お こ ご そ ぞ \
|
|||
|
と ど の ほ ぼ ぽ \
|
|||
|
も ょ よ ろ を \
|
|||
|
];
|
|||
|
|
|||
|
# Reverse-only rules for the prolonged sound mark (30FC).  The set before
# "{" is context that must precede the match and is left unchanged; only
# the ー inside the braces is replaced, by the Hiragana vowel matching the
# set the preceding character belongs to.  A ー with no such preceding
# character matches none of these rules and is left as is.
あ < $xa {ー};
|
|||
|
い < $xi {ー};
|
|||
|
う < $xu {ー};
|
|||
|
え < $xe {ー};
|
|||
|
お < $xo {ー};
|
|||
|
|
|||
|
# Reverse-only normalization step: NFKC appears only inside the
# parentheses, so this line contributes nothing to the forward direction.
:: (NFKC) ;
|
|||
|
|
|||
|
# note: a global filter is more efficient, but MUST include all source chars!!
|
|||
|
# Reverse-direction (Katakana to Hiragana) global filter: the set is
# wrapped in parentheses, so it applies only in reverse.  Its contents are
# the same as the forward filter at the top of the file.
:: ([\u0000-\u007E \u3001\u3002 \u3099-\u309C \u30A1-\u30FC \uFF61-\uFF9Fー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);
|
|||
|
|
|||
|
# eof
|