ICU-1422 Changed to non-binary in CVS, dropping .utf8
X-SVN-Rev: 6570
This commit is contained in:
parent
d7fa005153
commit
0972265bf5
270
icu4j/src/com/ibm/icu/impl/data/Transliterator_Fullwidth_Halfwidth.txt
Executable file
270
icu4j/src/com/ibm/icu/impl/data/Transliterator_Fullwidth_Halfwidth.txt
Executable file
@ -0,0 +1,270 @@
|
||||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:41:57 2001
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Fullwidth-Halfwidth
|
||||
|
||||
# Mechanically generated from Unicode Character Database
|
||||
|
||||
# multicharacter
|
||||
|
||||
ガ<>ガ; # to KATAKANA LETTER GA
|
||||
ギ<>ギ; # to KATAKANA LETTER GI
|
||||
グ<>グ; # to KATAKANA LETTER GU
|
||||
ゲ<>ゲ; # to KATAKANA LETTER GE
|
||||
ゴ<>ゴ; # to KATAKANA LETTER GO
|
||||
ザ<>ザ; # to KATAKANA LETTER ZA
|
||||
ジ<>ジ; # to KATAKANA LETTER ZI
|
||||
ズ<>ズ; # to KATAKANA LETTER ZU
|
||||
ゼ<>ゼ; # to KATAKANA LETTER ZE
|
||||
ゾ<>ゾ; # to KATAKANA LETTER ZO
|
||||
ダ<>ダ; # to KATAKANA LETTER DA
|
||||
ヂ<>ヂ; # to KATAKANA LETTER DI
|
||||
ヅ<>ヅ; # to KATAKANA LETTER DU
|
||||
デ<>デ; # to KATAKANA LETTER DE
|
||||
ド<>ド; # to KATAKANA LETTER DO
|
||||
バ<>バ; # to KATAKANA LETTER BA
|
||||
パ<>パ; # to KATAKANA LETTER PA
|
||||
ビ<>ビ; # to KATAKANA LETTER BI
|
||||
ピ<>ピ; # to KATAKANA LETTER PI
|
||||
ブ<>ブ; # to KATAKANA LETTER BU
|
||||
プ<>プ; # to KATAKANA LETTER PU
|
||||
ベ<>ベ; # to KATAKANA LETTER BE
|
||||
ペ<>ペ; # to KATAKANA LETTER PE
|
||||
ボ<>ボ; # to KATAKANA LETTER BO
|
||||
ポ<>ポ; # to KATAKANA LETTER PO
|
||||
ヴ<>ヴ; # to KATAKANA LETTER VU
|
||||
ヷ<>ヷ; # to KATAKANA LETTER VA
|
||||
ヺ<>ヺ; # to KATAKANA LETTER VO
|
||||
|
||||
# single character
|
||||
|
||||
!<>'!'; # from FULLWIDTH EXCLAMATION MARK
|
||||
"<>'\"'; # from FULLWIDTH QUOTATION MARK
|
||||
#<>'#'; # from FULLWIDTH NUMBER SIGN
|
||||
$<>'$'; # from FULLWIDTH DOLLAR SIGN
|
||||
%<>'%'; # from FULLWIDTH PERCENT SIGN
|
||||
&<>'&'; # from FULLWIDTH AMPERSAND
|
||||
'<>''; # from FULLWIDTH APOSTROPHE
|
||||
(<>'('; # from FULLWIDTH LEFT PARENTHESIS
|
||||
)<>')'; # from FULLWIDTH RIGHT PARENTHESIS
|
||||
*<>'*'; # from FULLWIDTH ASTERISK
|
||||
+<>'+'; # from FULLWIDTH PLUS SIGN
|
||||
,<>','; # from FULLWIDTH COMMA
|
||||
-<>'-'; # from FULLWIDTH HYPHEN-MINUS
|
||||
.<>'.'; # from FULLWIDTH FULL STOP
|
||||
/<>'/'; # from FULLWIDTH SOLIDUS
|
||||
0<>'0'; # from FULLWIDTH DIGIT ZERO
|
||||
1<>'1'; # from FULLWIDTH DIGIT ONE
|
||||
2<>'2'; # from FULLWIDTH DIGIT TWO
|
||||
3<>'3'; # from FULLWIDTH DIGIT THREE
|
||||
4<>'4'; # from FULLWIDTH DIGIT FOUR
|
||||
5<>'5'; # from FULLWIDTH DIGIT FIVE
|
||||
6<>'6'; # from FULLWIDTH DIGIT SIX
|
||||
7<>'7'; # from FULLWIDTH DIGIT SEVEN
|
||||
8<>'8'; # from FULLWIDTH DIGIT EIGHT
|
||||
9<>'9'; # from FULLWIDTH DIGIT NINE
|
||||
:<>':'; # from FULLWIDTH COLON
|
||||
;<>';'; # from FULLWIDTH SEMICOLON
|
||||
<<>'<'; # from FULLWIDTH LESS-THAN SIGN
|
||||
=<>'='; # from FULLWIDTH EQUALS SIGN
|
||||
><>'>'; # from FULLWIDTH GREATER-THAN SIGN
|
||||
?<>'?'; # from FULLWIDTH QUESTION MARK
|
||||
@<>'@'; # from FULLWIDTH COMMERCIAL AT
|
||||
A<>A; # from FULLWIDTH LATIN CAPITAL LETTER A
|
||||
B<>B; # from FULLWIDTH LATIN CAPITAL LETTER B
|
||||
C<>C; # from FULLWIDTH LATIN CAPITAL LETTER C
|
||||
D<>D; # from FULLWIDTH LATIN CAPITAL LETTER D
|
||||
E<>E; # from FULLWIDTH LATIN CAPITAL LETTER E
|
||||
F<>F; # from FULLWIDTH LATIN CAPITAL LETTER F
|
||||
G<>G; # from FULLWIDTH LATIN CAPITAL LETTER G
|
||||
H<>H; # from FULLWIDTH LATIN CAPITAL LETTER H
|
||||
I<>I; # from FULLWIDTH LATIN CAPITAL LETTER I
|
||||
J<>J; # from FULLWIDTH LATIN CAPITAL LETTER J
|
||||
K<>K; # from FULLWIDTH LATIN CAPITAL LETTER K
|
||||
L<>L; # from FULLWIDTH LATIN CAPITAL LETTER L
|
||||
M<>M; # from FULLWIDTH LATIN CAPITAL LETTER M
|
||||
N<>N; # from FULLWIDTH LATIN CAPITAL LETTER N
|
||||
O<>O; # from FULLWIDTH LATIN CAPITAL LETTER O
|
||||
P<>P; # from FULLWIDTH LATIN CAPITAL LETTER P
|
||||
Q<>Q; # from FULLWIDTH LATIN CAPITAL LETTER Q
|
||||
R<>R; # from FULLWIDTH LATIN CAPITAL LETTER R
|
||||
S<>S; # from FULLWIDTH LATIN CAPITAL LETTER S
|
||||
T<>T; # from FULLWIDTH LATIN CAPITAL LETTER T
|
||||
U<>U; # from FULLWIDTH LATIN CAPITAL LETTER U
|
||||
V<>V; # from FULLWIDTH LATIN CAPITAL LETTER V
|
||||
W<>W; # from FULLWIDTH LATIN CAPITAL LETTER W
|
||||
X<>X; # from FULLWIDTH LATIN CAPITAL LETTER X
|
||||
Y<>Y; # from FULLWIDTH LATIN CAPITAL LETTER Y
|
||||
Z<>Z; # from FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
[<>'['; # from FULLWIDTH LEFT SQUARE BRACKET
|
||||
\<>'\\'; # from FULLWIDTH REVERSE SOLIDUS {double escape - aliu}
|
||||
]<>']'; # from FULLWIDTH RIGHT SQUARE BRACKET
|
||||
^<>'^'; # from FULLWIDTH CIRCUMFLEX ACCENT
|
||||
_<>'_'; # from FULLWIDTH LOW LINE
|
||||
`<>'`'; # from FULLWIDTH GRAVE ACCENT
|
||||
a<>a; # from FULLWIDTH LATIN SMALL LETTER A
|
||||
b<>b; # from FULLWIDTH LATIN SMALL LETTER B
|
||||
c<>c; # from FULLWIDTH LATIN SMALL LETTER C
|
||||
d<>d; # from FULLWIDTH LATIN SMALL LETTER D
|
||||
e<>e; # from FULLWIDTH LATIN SMALL LETTER E
|
||||
f<>f; # from FULLWIDTH LATIN SMALL LETTER F
|
||||
g<>g; # from FULLWIDTH LATIN SMALL LETTER G
|
||||
h<>h; # from FULLWIDTH LATIN SMALL LETTER H
|
||||
i<>i; # from FULLWIDTH LATIN SMALL LETTER I
|
||||
j<>j; # from FULLWIDTH LATIN SMALL LETTER J
|
||||
k<>k; # from FULLWIDTH LATIN SMALL LETTER K
|
||||
l<>l; # from FULLWIDTH LATIN SMALL LETTER L
|
||||
m<>m; # from FULLWIDTH LATIN SMALL LETTER M
|
||||
n<>n; # from FULLWIDTH LATIN SMALL LETTER N
|
||||
o<>o; # from FULLWIDTH LATIN SMALL LETTER O
|
||||
p<>p; # from FULLWIDTH LATIN SMALL LETTER P
|
||||
q<>q; # from FULLWIDTH LATIN SMALL LETTER Q
|
||||
r<>r; # from FULLWIDTH LATIN SMALL LETTER R
|
||||
s<>s; # from FULLWIDTH LATIN SMALL LETTER S
|
||||
t<>t; # from FULLWIDTH LATIN SMALL LETTER T
|
||||
u<>u; # from FULLWIDTH LATIN SMALL LETTER U
|
||||
v<>v; # from FULLWIDTH LATIN SMALL LETTER V
|
||||
w<>w; # from FULLWIDTH LATIN SMALL LETTER W
|
||||
x<>x; # from FULLWIDTH LATIN SMALL LETTER X
|
||||
y<>y; # from FULLWIDTH LATIN SMALL LETTER Y
|
||||
z<>z; # from FULLWIDTH LATIN SMALL LETTER Z
|
||||
{<>'{'; # from FULLWIDTH LEFT CURLY BRACKET
|
||||
|<>'|'; # from FULLWIDTH VERTICAL LINE
|
||||
}<>'}'; # from FULLWIDTH RIGHT CURLY BRACKET
|
||||
~<>'~'; # from FULLWIDTH TILDE
|
||||
。<>。; # to HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
「<>「; # to HALFWIDTH LEFT CORNER BRACKET
|
||||
」<>」; # to HALFWIDTH RIGHT CORNER BRACKET
|
||||
、<>、; # to HALFWIDTH IDEOGRAPHIC COMMA
|
||||
・<>・; # to HALFWIDTH KATAKANA MIDDLE DOT
|
||||
ヲ<>ヲ; # to HALFWIDTH KATAKANA LETTER WO
|
||||
ァ<>ァ; # to HALFWIDTH KATAKANA LETTER SMALL A
|
||||
ィ<>ィ; # to HALFWIDTH KATAKANA LETTER SMALL I
|
||||
ゥ<>ゥ; # to HALFWIDTH KATAKANA LETTER SMALL U
|
||||
ェ<>ェ; # to HALFWIDTH KATAKANA LETTER SMALL E
|
||||
ォ<>ォ; # to HALFWIDTH KATAKANA LETTER SMALL O
|
||||
ャ<>ャ; # to HALFWIDTH KATAKANA LETTER SMALL YA
|
||||
ュ<>ュ; # to HALFWIDTH KATAKANA LETTER SMALL YU
|
||||
ョ<>ョ; # to HALFWIDTH KATAKANA LETTER SMALL YO
|
||||
ッ<>ッ; # to HALFWIDTH KATAKANA LETTER SMALL TU
|
||||
ー<>ー; # to HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
ア<>ア; # to HALFWIDTH KATAKANA LETTER A
|
||||
イ<>イ; # to HALFWIDTH KATAKANA LETTER I
|
||||
ウ<>ウ; # to HALFWIDTH KATAKANA LETTER U
|
||||
エ<>エ; # to HALFWIDTH KATAKANA LETTER E
|
||||
オ<>オ; # to HALFWIDTH KATAKANA LETTER O
|
||||
カ<>カ; # to HALFWIDTH KATAKANA LETTER KA
|
||||
キ<>キ; # to HALFWIDTH KATAKANA LETTER KI
|
||||
ク<>ク; # to HALFWIDTH KATAKANA LETTER KU
|
||||
ケ<>ケ; # to HALFWIDTH KATAKANA LETTER KE
|
||||
コ<>コ; # to HALFWIDTH KATAKANA LETTER KO
|
||||
サ<>サ; # to HALFWIDTH KATAKANA LETTER SA
|
||||
シ<>シ; # to HALFWIDTH KATAKANA LETTER SI
|
||||
ス<>ス; # to HALFWIDTH KATAKANA LETTER SU
|
||||
セ<>セ; # to HALFWIDTH KATAKANA LETTER SE
|
||||
ソ<>ソ; # to HALFWIDTH KATAKANA LETTER SO
|
||||
タ<>タ; # to HALFWIDTH KATAKANA LETTER TA
|
||||
チ<>チ; # to HALFWIDTH KATAKANA LETTER TI
|
||||
ツ<>ツ; # to HALFWIDTH KATAKANA LETTER TU
|
||||
テ<>テ; # to HALFWIDTH KATAKANA LETTER TE
|
||||
ト<>ト; # to HALFWIDTH KATAKANA LETTER TO
|
||||
ナ<>ナ; # to HALFWIDTH KATAKANA LETTER NA
|
||||
ニ<>ニ; # to HALFWIDTH KATAKANA LETTER NI
|
||||
ヌ<>ヌ; # to HALFWIDTH KATAKANA LETTER NU
|
||||
ネ<>ネ; # to HALFWIDTH KATAKANA LETTER NE
|
||||
ノ<>ノ; # to HALFWIDTH KATAKANA LETTER NO
|
||||
ハ<>ハ; # to HALFWIDTH KATAKANA LETTER HA
|
||||
ヒ<>ヒ; # to HALFWIDTH KATAKANA LETTER HI
|
||||
フ<>フ; # to HALFWIDTH KATAKANA LETTER HU
|
||||
ヘ<>ヘ; # to HALFWIDTH KATAKANA LETTER HE
|
||||
ホ<>ホ; # to HALFWIDTH KATAKANA LETTER HO
|
||||
マ<>マ; # to HALFWIDTH KATAKANA LETTER MA
|
||||
ミ<>ミ; # to HALFWIDTH KATAKANA LETTER MI
|
||||
ム<>ム; # to HALFWIDTH KATAKANA LETTER MU
|
||||
メ<>メ; # to HALFWIDTH KATAKANA LETTER ME
|
||||
モ<>モ; # to HALFWIDTH KATAKANA LETTER MO
|
||||
ヤ<>ヤ; # to HALFWIDTH KATAKANA LETTER YA
|
||||
ユ<>ユ; # to HALFWIDTH KATAKANA LETTER YU
|
||||
ヨ<>ヨ; # to HALFWIDTH KATAKANA LETTER YO
|
||||
ラ<>ラ; # to HALFWIDTH KATAKANA LETTER RA
|
||||
リ<>リ; # to HALFWIDTH KATAKANA LETTER RI
|
||||
ル<>ル; # to HALFWIDTH KATAKANA LETTER RU
|
||||
レ<>レ; # to HALFWIDTH KATAKANA LETTER RE
|
||||
ロ<>ロ; # to HALFWIDTH KATAKANA LETTER RO
|
||||
ワ<>ワ; # to HALFWIDTH KATAKANA LETTER WA
|
||||
ン<>ン; # to HALFWIDTH KATAKANA LETTER N
|
||||
゙<>゙; # to HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
゚<>゚; # to HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
ᅠ<>ᅠ; # to HALFWIDTH HANGUL FILLER
|
||||
ᄀ<>ᄀ; # to HALFWIDTH HANGUL LETTER KIYEOK
|
||||
ᄁ<>ᄁ; # to HALFWIDTH HANGUL LETTER SSANGKIYEOK
|
||||
ᆪ<>ᆪ; # to HALFWIDTH HANGUL LETTER KIYEOK-SIOS
|
||||
ᄂ<>ᄂ; # to HALFWIDTH HANGUL LETTER NIEUN
|
||||
ᆬ<>ᆬ; # to HALFWIDTH HANGUL LETTER NIEUN-CIEUC
|
||||
ᆭ<>ᆭ; # to HALFWIDTH HANGUL LETTER NIEUN-HIEUH
|
||||
ᄃ<>ᄃ; # to HALFWIDTH HANGUL LETTER TIKEUT
|
||||
ᄄ<>ᄄ; # to HALFWIDTH HANGUL LETTER SSANGTIKEUT
|
||||
ᄅ<>ᄅ; # to HALFWIDTH HANGUL LETTER RIEUL
|
||||
ᆰ<>ᆰ; # to HALFWIDTH HANGUL LETTER RIEUL-KIYEOK
|
||||
ᆱ<>ᆱ; # to HALFWIDTH HANGUL LETTER RIEUL-MIEUM
|
||||
ᆲ<>ᆲ; # to HALFWIDTH HANGUL LETTER RIEUL-PIEUP
|
||||
ᆳ<>ᆳ; # to HALFWIDTH HANGUL LETTER RIEUL-SIOS
|
||||
ᆴ<>ᆴ; # to HALFWIDTH HANGUL LETTER RIEUL-THIEUTH
|
||||
ᆵ<>ᆵ; # to HALFWIDTH HANGUL LETTER RIEUL-PHIEUPH
|
||||
ᄚ<>ᄚ; # to HALFWIDTH HANGUL LETTER RIEUL-HIEUH
|
||||
ᄆ<>ᄆ; # to HALFWIDTH HANGUL LETTER MIEUM
|
||||
ᄇ<>ᄇ; # to HALFWIDTH HANGUL LETTER PIEUP
|
||||
ᄈ<>ᄈ; # to HALFWIDTH HANGUL LETTER SSANGPIEUP
|
||||
ᄡ<>ᄡ; # to HALFWIDTH HANGUL LETTER PIEUP-SIOS
|
||||
ᄉ<>ᄉ; # to HALFWIDTH HANGUL LETTER SIOS
|
||||
ᄊ<>ᄊ; # to HALFWIDTH HANGUL LETTER SSANGSIOS
|
||||
ᄋ<>ᄋ; # to HALFWIDTH HANGUL LETTER IEUNG
|
||||
ᄌ<>ᄌ; # to HALFWIDTH HANGUL LETTER CIEUC
|
||||
ᄍ<>ᄍ; # to HALFWIDTH HANGUL LETTER SSANGCIEUC
|
||||
ᄎ<>ᄎ; # to HALFWIDTH HANGUL LETTER CHIEUCH
|
||||
ᄏ<>ᄏ; # to HALFWIDTH HANGUL LETTER KHIEUKH
|
||||
ᄐ<>ᄐ; # to HALFWIDTH HANGUL LETTER THIEUTH
|
||||
ᄑ<>ᄑ; # to HALFWIDTH HANGUL LETTER PHIEUPH
|
||||
ᄒ<>ᄒ; # to HALFWIDTH HANGUL LETTER HIEUH
|
||||
ᅡ<>ᅡ; # to HALFWIDTH HANGUL LETTER A
|
||||
ᅢ<>ᅢ; # to HALFWIDTH HANGUL LETTER AE
|
||||
ᅣ<>ᅣ; # to HALFWIDTH HANGUL LETTER YA
|
||||
ᅤ<>ᅤ; # to HALFWIDTH HANGUL LETTER YAE
|
||||
ᅥ<>ᅥ; # to HALFWIDTH HANGUL LETTER EO
|
||||
ᅦ<>ᅦ; # to HALFWIDTH HANGUL LETTER E
|
||||
ᅧ<>ᅧ; # to HALFWIDTH HANGUL LETTER YEO
|
||||
ᅨ<>ᅨ; # to HALFWIDTH HANGUL LETTER YE
|
||||
ᅩ<>ᅩ; # to HALFWIDTH HANGUL LETTER O
|
||||
ᅪ<>ᅪ; # to HALFWIDTH HANGUL LETTER WA
|
||||
ᅫ<>ᅫ; # to HALFWIDTH HANGUL LETTER WAE
|
||||
ᅬ<>ᅬ; # to HALFWIDTH HANGUL LETTER OE
|
||||
ᅭ<>ᅭ; # to HALFWIDTH HANGUL LETTER YO
|
||||
ᅮ<>ᅮ; # to HALFWIDTH HANGUL LETTER U
|
||||
ᅯ<>ᅯ; # to HALFWIDTH HANGUL LETTER WEO
|
||||
ᅰ<>ᅰ; # to HALFWIDTH HANGUL LETTER WE
|
||||
ᅱ<>ᅱ; # to HALFWIDTH HANGUL LETTER WI
|
||||
ᅲ<>ᅲ; # to HALFWIDTH HANGUL LETTER YU
|
||||
ᅳ<>ᅳ; # to HALFWIDTH HANGUL LETTER EU
|
||||
ᅴ<>ᅴ; # to HALFWIDTH HANGUL LETTER YI
|
||||
ᅵ<>ᅵ; # to HALFWIDTH HANGUL LETTER I
|
||||
¢<>'¢'; # from FULLWIDTH CENT SIGN
|
||||
£<>'£'; # from FULLWIDTH POUND SIGN
|
||||
¬<>'¬'; # from FULLWIDTH NOT SIGN
|
||||
 ̄<>' '̄; # from FULLWIDTH MACRON
|
||||
' '<>' '; # ideographic space (place this after MACRON)
|
||||
¦<>'¦'; # from FULLWIDTH BROKEN BAR
|
||||
¥<>'¥'; # from FULLWIDTH YEN SIGN
|
||||
₩<>₩; # from FULLWIDTH WON SIGN
|
||||
│<>│; # to HALFWIDTH FORMS LIGHT VERTICAL
|
||||
←<>←; # to HALFWIDTH LEFTWARDS ARROW
|
||||
↑<>↑; # to HALFWIDTH UPWARDS ARROW
|
||||
→<>→; # to HALFWIDTH RIGHTWARDS ARROW
|
||||
↓<>↓; # to HALFWIDTH DOWNWARDS ARROW
|
||||
■<>■; # to HALFWIDTH BLACK SQUARE
|
||||
○<>○; # to HALFWIDTH WHITE CIRCLE
|
||||
|
||||
# eof
|
515
icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Jamo.txt
Executable file
515
icu4j/src/com/ibm/icu/impl/data/Transliterator_Latin_Jamo.txt
Executable file
@ -0,0 +1,515 @@
|
||||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Latin-Jamo
|
||||
|
||||
:: [:Latin:] NFKD ();
|
||||
:: [:Latin:] Lower ();
|
||||
|
||||
# Transliteration from Latin characters to Korean script is done in
|
||||
# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
|
||||
# transliteration is done algorithmically following Unicode 3.0
|
||||
# section 3.11. This file implements the Latin to Jamo
|
||||
# transliteration using rules.
|
||||
|
||||
# Jamo occupy the block 1100-11FF. Within this block there are three
|
||||
# groups of characters: initial consonants or choseong (I), medial
|
||||
# vowels or jungseong (M), and trailing consonants or jongseong (F).
|
||||
# Standard Korean syllables are of the form I+M+F*.
|
||||
|
||||
# Section 3.11 describes the use of 'filler' jamo to convert
|
||||
# nonstandard syllables to standard form: the choseong filler 115F and
|
||||
# the jungseong filler 1160. In this transliterator, we will not use
|
||||
# 115F or 1160.
|
||||
|
||||
# We will, however, insert two 'null' jamo to make foreign words
|
||||
# conform to Korean syllable structure. These are the null initial
|
||||
# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
|
||||
# we will use the hyphen in order to disambiguate strings,
|
||||
# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
|
||||
|
||||
# We will not use all of the characters in the jamo block. We will
|
||||
# only use the 19 initials, 21 medials, and 27 finals possessing a
|
||||
# jamo short name as defined in section 4.4 of the Unicode book.
|
||||
|
||||
# Rules of thumb. These guidelines provide the basic framework
|
||||
# for the rules. They are phrased in terms of Latin-Jamo transliteration.
|
||||
# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
|
||||
# just context-free transliteration of jamo to corresponding short names,
|
||||
# with the addition of hyphens to maintain round-trip integrity
|
||||
# in the context of the Latin-Jamo rules.
|
||||
|
||||
# A sequence of vowels:
|
||||
# - Take the longest sequence you can. If there are too many, or you don't
|
||||
# have a starting consonant, introduce a 110B as necessary.
|
||||
|
||||
# A sequence of consonants.
|
||||
# - First join the double consonants: G + G -> GG
|
||||
# - In the remaining list,
|
||||
# -- If there is no preceding vowel, take the first consonant, and insert EU
|
||||
# after it. Continue with the rest of the consonants.
|
||||
# -- If there is one consonant, attach to the following vowel
|
||||
# -- If there are two consonants and a following vowel, attach one to the
|
||||
# preceding vowel, and one to the following vowel.
|
||||
# -- If there are more than two consonants, join the first two together if you
|
||||
# can: L + G => LG
|
||||
# -- If you still end up with more than 2 consonants, insert EU after the
|
||||
# first one, and continue with the rest of the consonants.
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Variables
|
||||
|
||||
# Some latin consonants or consonant pairs only occur as initials, and
|
||||
# some only as finals, but some occur as both. This makes some jamo
|
||||
# consonants ambiguous when transliterated into latin.
|
||||
# Initial only: IEUNG BB DD JJ R
|
||||
# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
|
||||
# Initial and Final: B C D G GG H J K M N P S SS T
|
||||
|
||||
$Gi = \u1100;
|
||||
$GGi = \u1101;
|
||||
$Ni = \u1102;
|
||||
$Di = \u1103;
|
||||
$DD = \u1104;
|
||||
$R = \u1105;
|
||||
$Mi = \u1106;
|
||||
$Bi = \u1107;
|
||||
$BB = \u1108;
|
||||
$Si = \u1109;
|
||||
$SSi = \u110A;
|
||||
$IEUNG = \u110B; # null initial, inserted during Latin-Jamo
|
||||
$Ji = \u110C;
|
||||
$JJ = \u110D;
|
||||
$Ci = \u110E;
|
||||
$Ki = \u110F;
|
||||
$Ti = \u1110;
|
||||
$Pi = \u1111;
|
||||
$Hi = \u1112;
|
||||
|
||||
$A = \u1161;
|
||||
$AE = \u1162;
|
||||
$YA = \u1163;
|
||||
$YAE = \u1164;
|
||||
$EO = \u1165;
|
||||
$E = \u1166;
|
||||
$YEO = \u1167;
|
||||
$YE = \u1168;
|
||||
$O = \u1169;
|
||||
$WA = \u116A;
|
||||
$WAE = \u116B;
|
||||
$OE = \u116C;
|
||||
$YO = \u116D;
|
||||
$U = \u116E;
|
||||
$WEO = \u116F;
|
||||
$WE = \u1170;
|
||||
$WI = \u1171;
|
||||
$YU = \u1172;
|
||||
$EU = \u1173; # null medial, inserted during Latin-Jamo
|
||||
$YI = \u1174;
|
||||
$I = \u1175;
|
||||
|
||||
$Gf = \u11A8;
|
||||
$GGf = \u11A9;
|
||||
$GS = \u11AA;
|
||||
$Nf = \u11AB;
|
||||
$NJ = \u11AC;
|
||||
$NH = \u11AD;
|
||||
$Df = \u11AE;
|
||||
$L = \u11AF;
|
||||
$LG = \u11B0;
|
||||
$LM = \u11B1;
|
||||
$LB = \u11B2;
|
||||
$LS = \u11B3;
|
||||
$LT = \u11B4;
|
||||
$LP = \u11B5;
|
||||
$LH = \u11B6;
|
||||
$Mf = \u11B7;
|
||||
$Bf = \u11B8;
|
||||
$BS = \u11B9;
|
||||
$Sf = \u11BA;
|
||||
$SSf = \u11BB;
|
||||
$NG = \u11BC;
|
||||
$Jf = \u11BD;
|
||||
$Cf = \u11BE;
|
||||
$Kf = \u11BF;
|
||||
$Tf = \u11C0;
|
||||
$Pf = \u11C1;
|
||||
$Hf = \u11C2;
|
||||
|
||||
$jamoInitial = [\u1100-\u1112];
|
||||
|
||||
$jamoMedial = [\u1161-\u1175];
|
||||
|
||||
$latinInitial = [bcdghjkmnprst];
|
||||
|
||||
# Any character in the latin transliteration of a medial
|
||||
$latinMedial = [aeiouwy];
|
||||
|
||||
# The last character of the latin transliteration of a medial
|
||||
$latinMedialEnd = [aeiou];
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Jamo-Latin
|
||||
|
||||
# Jamo to latin is relatively simple, since it is the latin that is
|
||||
# ambiguous. Most rules are straightforward, and we encode them below
|
||||
# as simple add-on back rules, e.g.:
|
||||
|
||||
# $jamoMedial {bs} > $BS;
|
||||
|
||||
# becomes
|
||||
|
||||
# $jamoMedial {bs} <> $BS;
|
||||
|
||||
# Furthermore, we don't care about the ordering for Jamo-Latin because
|
||||
# we are going from single characters, so we can very easily piggyback
|
||||
# on the Latin-Jamo.
|
||||
|
||||
# The main issue with Jamo-Latin is when to insert hyphens.
|
||||
# Hyphens are inserted to obtain correct round trip behavior. For
|
||||
# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
|
||||
# would then round trip to Ki A GGi E. To prevent this, we insert a
|
||||
# hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
|
||||
# very specifically on the behavior of the Latin-Jamo rules. A change
|
||||
# in the Latin-Jamo behavior can completely change the way the
|
||||
# hyphen insertion must be done.
|
||||
|
||||
# First try to preserve actual hyphens in the jamo text by doubling
|
||||
# them. This fixes problems like:
|
||||
# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
|
||||
# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
|
||||
# -- if we don't care about losing hyphens in the jamo, we can delete
|
||||
# this rule.
|
||||
|
||||
'--' <> '-';
|
||||
|
||||
# Triple consonants. For three consonants "axxx" we insert a
|
||||
# hyphen between the first and second "x" if XXf, Xf, and Xi all
|
||||
# exist, and we have A Xf XXi. This prevents the reverse
|
||||
# transliteration to A XXf Xi.
|
||||
|
||||
'-' < $latinMedialEnd g {} $GGi;
|
||||
'-' < $latinMedialEnd s {} $SSi;
|
||||
|
||||
# For vowels the rule is similar. If there is a vowel "ae" such that
|
||||
# "a" by itself and "e" by itself are vowels, then we want to map A E
|
||||
# to "a-e" so as not to round trip to AE. However, in the text Ki EO
|
||||
# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
|
||||
# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
|
||||
# tested. NOTE: These rules used to have a left context of
|
||||
# $latinInitial instead of [^$latinMedial]. The problem with this is
|
||||
# sequences where an initial IEUNG is transliterated away:
|
||||
# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
|
||||
|
||||
'-' < [^$latinMedial] [y w] e {} [$O $OE];
|
||||
'-' < [^$latinMedial] e {} [$O $OE $U];
|
||||
'-' < [^$latinMedial] [o a] {} [$E $EO $EU];
|
||||
'-' < [^$latinMedial] [w y] a {} [$E $EO $EU];
|
||||
|
||||
# Similar to the above, but with an intervening $IEUNG.
|
||||
|
||||
'-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];
|
||||
'-' < [^$latinMedial] e {} $IEUNG [$O $OE $U];
|
||||
'-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
|
||||
'-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
|
||||
|
||||
# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
|
||||
# where Xi also exists, must be transliterated as "ax-e" to prevent
|
||||
# the round trip conversion to A Xi E.
|
||||
|
||||
'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;
|
||||
|
||||
# Double finals followed by IEUNG. Similar to the single finals
|
||||
# followed by IEUNG. Any latin consonant pair X Y, between medials,
|
||||
# that we would split by Latin-Jamo, we must handle when it occurs as
|
||||
# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
|
||||
# E.
|
||||
|
||||
'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;
|
||||
|
||||
# Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
|
||||
# we transliterate as "ax-xe" to prevent round trip transliteration as
|
||||
# A XXi E.
|
||||
|
||||
'-' < $latinMedialEnd b {} $Bi $jamoMedial;
|
||||
'-' < $latinMedialEnd d {} $Di $jamoMedial;
|
||||
'-' < $latinMedialEnd j {} $Ji $jamoMedial;
|
||||
'-' < $latinMedialEnd g {} $Gi $jamoMedial;
|
||||
'-' < $latinMedialEnd s {} $Si $jamoMedial;
|
||||
|
||||
# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
|
||||
# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
|
||||
# "xyy" forms that correspond to XYf Yi must be transliterated as
|
||||
# "xy-y".
|
||||
|
||||
'-' < $latinMedialEnd b s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd g s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd l b {} [$Bi $BB];
|
||||
'-' < $latinMedialEnd l g {} [$Gi $GGi];
|
||||
'-' < $latinMedialEnd l s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd n g {} [$Gi $GGi];
|
||||
'-' < $latinMedialEnd n j {} [$Ji $JJ];
|
||||
|
||||
# Deletion of IEUNG is handled below.
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Latin-Jamo
|
||||
|
||||
# [Basic, context-free Jamo-Latin rules are embedded here too. See
|
||||
# above.]
|
||||
|
||||
# Split digraphs: Text of the form 'axye', where 'xy' is a final
|
||||
# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
|
||||
# 'e' are medials, we want to transliterate this as A Xf Yi E rather
|
||||
# than A XYf IEUNG E. We do NOT include text of the form "axxe",
|
||||
# since that is handled differently below. These rules are generated
|
||||
# programmatically from the jamo data.
|
||||
|
||||
$jamoMedial {b s} $latinMedial > $Bf $Si;
|
||||
$jamoMedial {g s} $latinMedial > $Gf $Si;
|
||||
$jamoMedial {l b} $latinMedial > $L $Bi;
|
||||
$jamoMedial {l g} $latinMedial > $L $Gi;
|
||||
$jamoMedial {l h} $latinMedial > $L $Hi;
|
||||
$jamoMedial {l m} $latinMedial > $L $Mi;
|
||||
$jamoMedial {l p} $latinMedial > $L $Pi;
|
||||
$jamoMedial {l s} $latinMedial > $L $Si;
|
||||
$jamoMedial {l t} $latinMedial > $L $Ti;
|
||||
$jamoMedial {n g} $latinMedial > $Nf $Gi;
|
||||
$jamoMedial {n h} $latinMedial > $Nf $Hi;
|
||||
$jamoMedial {n j} $latinMedial > $Nf $Ji;
|
||||
|
||||
# Single consonants are initials: Text of the form 'axe', where 'x'
|
||||
# can be an initial or a final, and 'a' and 'e' are medials, we want
|
||||
# to transliterate as A Xi E rather than A Xf IEUNG E.
|
||||
|
||||
$jamoMedial {b} $latinMedial > $Bi;
|
||||
$jamoMedial {c} $latinMedial > $Ci;
|
||||
$jamoMedial {d} $latinMedial > $Di;
|
||||
$jamoMedial {g} $latinMedial > $Gi;
|
||||
$jamoMedial {h} $latinMedial > $Hi;
|
||||
$jamoMedial {j} $latinMedial > $Ji;
|
||||
$jamoMedial {k} $latinMedial > $Ki;
|
||||
$jamoMedial {m} $latinMedial > $Mi;
|
||||
$jamoMedial {n} $latinMedial > $Ni;
|
||||
$jamoMedial {p} $latinMedial > $Pi;
|
||||
$jamoMedial {s} $latinMedial > $Si;
|
||||
$jamoMedial {t} $latinMedial > $Ti;
|
||||
|
||||
# Doubled initials. The sequence "axxe", where XX exists as an initial
|
||||
# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
|
||||
# to transliterate as A XXi E, rather than split to A Xf Xi E.
|
||||
|
||||
$jamoMedial {b b} $latinMedial > $BB;
|
||||
$jamoMedial {d d} $latinMedial > $DD;
|
||||
$jamoMedial {j j} $latinMedial > $JJ;
|
||||
$jamoMedial {g g} $latinMedial > $GGi;
|
||||
$jamoMedial {s s} $latinMedial > $SSi;
|
||||
|
||||
# XYY. Because doubled consonants bind more strongly than XY
|
||||
# consonants, we must handle the sequence "axyy" specially. Here XYf
|
||||
# and YYi must exist. In these cases, we map to Xf YYi rather than
|
||||
# XYf.
|
||||
|
||||
$jamoMedial {b} s s > $Bf;
|
||||
$jamoMedial {g} s s > $Gf;
|
||||
$jamoMedial {l} b b > $L;
|
||||
$jamoMedial {l} g g > $L;
|
||||
$jamoMedial {l} s s > $L;
|
||||
$jamoMedial {n} g g > $Nf;
|
||||
$jamoMedial {n} j j > $Nf;
|
||||
|
||||
# Finals: Attach consonant with preceding medial to preceding medial.
|
||||
# Do this BEFORE mapping consonants to initials. Longer keys must
|
||||
# precede shorter keys that they start with, e.g., the rule for 'bs'
|
||||
# must precede 'b'.
|
||||
|
||||
# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
|
||||
# block for Jamo-Latin.]
|
||||
|
||||
$jamoMedial {bs} <> $BS;
|
||||
$jamoMedial {b} <> $Bf;
|
||||
$jamoMedial {c} <> $Cf;
|
||||
$jamoMedial {d} <> $Df;
|
||||
$jamoMedial {gg} <> $GGf;
|
||||
$jamoMedial {gs} <> $GS;
|
||||
$jamoMedial {g} <> $Gf;
|
||||
$jamoMedial {h} <> $Hf;
|
||||
$jamoMedial {j} <> $Jf;
|
||||
$jamoMedial {k} <> $Kf;
|
||||
$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;
|
||||
$jamoMedial {lh} <> $LH;
|
||||
$jamoMedial {lm} <> $LM;
|
||||
$jamoMedial {lp} <> $LP;
|
||||
$jamoMedial {ls} <> $LS;
|
||||
$jamoMedial {lt} <> $LT;
|
||||
$jamoMedial {l} <> $L;
|
||||
$jamoMedial {m} <> $Mf;
|
||||
$jamoMedial {ng} <> $NG;
|
||||
$jamoMedial {nh} <> $NH;
|
||||
$jamoMedial {nj} <> $NJ;
|
||||
$jamoMedial {n} <> $Nf;
|
||||
$jamoMedial {p} <> $Pf;
|
||||
$jamoMedial {ss} <> $SSf;
|
||||
$jamoMedial {s} <> $Sf;
|
||||
$jamoMedial {t} <> $Tf;
|
||||
|
||||
# Initials: Attach single consonant to following medial. Do this
|
||||
# AFTER mapping finals. Longer keys must precede shorter keys that
|
||||
# they start with, e.g., the rule for 'gg' must precede 'g'.
|
||||
|
||||
# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
|
||||
# this block for Jamo-Latin.]
|
||||
|
||||
{gg} $latinMedial <> $GGi;
|
||||
{g} $latinMedial <> $Gi;
|
||||
{n} $latinMedial <> $Ni;
|
||||
{dd} $latinMedial <> $DD;
|
||||
{d} $latinMedial <> $Di;
|
||||
{r} $latinMedial <> $R;
|
||||
{m} $latinMedial <> $Mi;
|
||||
{bb} $latinMedial <> $BB;
|
||||
{b} $latinMedial <> $Bi;
|
||||
{ss} $latinMedial <> $SSi;
|
||||
{s} $latinMedial <> $Si;
|
||||
{jj} $latinMedial <> $JJ;
|
||||
{j} $latinMedial <> $Ji;
|
||||
{c} $latinMedial <> $Ci;
|
||||
{k} $latinMedial <> $Ki;
|
||||
{t} $latinMedial <> $Ti;
|
||||
{p} $latinMedial <> $Pi;
|
||||
{h} $latinMedial <> $Hi;
|
||||
|
||||
# 'r' in final position. Because of the equivalency of the 'l' and
|
||||
# 'r' jamo (the glyphs are the same), we try to provide the same
|
||||
# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
|
||||
# below. If we see an 'r' in an apparent final position, treat it
|
||||
# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
|
||||
# Instead, we want Ki A L Ki A.
|
||||
|
||||
$jamoMedial {r} $latinInitial > | l;
|
||||
|
||||
# Initial + Final: If we match the next rule, we have initial then
|
||||
# final consonant with no intervening medial. We insert the null
|
||||
# vowel BEFORE it to create a well-formed syllable. (In the next rule
|
||||
# we insert a null vowel AFTER an anomalous initial.)
|
||||
|
||||
$jamoInitial {} [bcdghjklmnpst] > $EU;
|
||||
|
||||
# Initial + X: This block matches an initial consonant not followed by
|
||||
# a medial. We insert the null vowel after it. We handle double
|
||||
# initials explicitly here; for single initial consonants we insert EU
|
||||
# (as Latin) after them and let standard rules do the rest.
|
||||
|
||||
# BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
gg > $GGi $EU;
|
||||
dd > $DD $EU;
|
||||
bb > $BB $EU;
|
||||
ss > $SSi $EU;
|
||||
jj > $JJ $EU;
|
||||
|
||||
([bcdghjkmnprst]) > | $1 eu;
|
||||
|
||||
# X + Final: Finally we have to deal with a consonant that can only be
|
||||
# interpreted as a final (not an initial) and which is preceded
|
||||
# neither by an initial nor a medial. It is the start of the
|
||||
# syllable, but cannot be. Most of these will already be handled by
|
||||
# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
|
||||
# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
|
||||
# For this isolated case, we could add a null initial and medial,
|
||||
# which would give "la" => IEUNG EU L IEUNG A, for example. A more
|
||||
# economical solution is to transliterate isolated "l" (that is,
|
||||
# initial "l") to "r". (Other similar conversions of consonants that
|
||||
# occur neither as initials nor as finals are handled below.)
|
||||
|
||||
l > | r;
|
||||
|
||||
# Medials. If a medial is preceded by an initial, then we proceed
|
||||
# normally. As usual, longer keys must precede shorter ones.
|
||||
|
||||
# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
|
||||
# this block for Jamo-Latin.]
|
||||
|
||||
$jamoInitial {ae} <> $AE;
|
||||
$jamoInitial {a} <> $A;
|
||||
$jamoInitial {eo} <> $EO;
|
||||
$jamoInitial {eu} <> $EU;
|
||||
$jamoInitial {e} <> $E;
|
||||
$jamoInitial {i} <> $I;
|
||||
$jamoInitial {oe} <> $OE;
|
||||
$jamoInitial {o} <> $O;
|
||||
$jamoInitial {u} <> $U;
|
||||
$jamoInitial {wae} <> $WAE;
|
||||
$jamoInitial {wa} <> $WA;
|
||||
$jamoInitial {weo} <> $WEO;
|
||||
$jamoInitial {we} <> $WE;
|
||||
$jamoInitial {wi} <> $WI;
|
||||
$jamoInitial {yae} <> $YAE;
|
||||
$jamoInitial {ya} <> $YA;
|
||||
$jamoInitial {yeo} <> $YEO;
|
||||
$jamoInitial {ye} <> $YE;
|
||||
$jamoInitial {yi} <> $YI;
|
||||
$jamoInitial {yo} <> $YO;
|
||||
$jamoInitial {yu} <> $YU;
|
||||
|
||||
# We may see an anomalous isolated 'w' or 'y'. In that case, we
|
||||
# interpret it as 'wi' and 'yu', respectively.
|
||||
|
||||
# BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
$jamoInitial {w} > | wi;
|
||||
$jamoInitial {y} > | yu;
|
||||
|
||||
# Otherwise, insert a null consonant IEUNG before the medial (which is
|
||||
# still an untransliterated latin vowel).
|
||||
|
||||
($latinMedial) > $IEUNG | $1;
|
||||
|
||||
# Convert non-jamo latin consonants to equivalents. These occur as
|
||||
# neither initials nor finals in jamo. 'l' occurs as a final, but not
|
||||
# an initial; it is handled above. The following letters (left hand
|
||||
# side) will never be output by Jamo-Latin.
|
||||
|
||||
f > | p;
|
||||
q > | k;
|
||||
v > | b;
|
||||
x > | ks;
|
||||
z > | s;
|
||||
|
||||
# Delete hyphens (Latin-Jamo).
|
||||
|
||||
'-' > ;
|
||||
|
||||
# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
|
||||
# since these may also occur in text.
|
||||
|
||||
< $IEUNG;
|
||||
:: ([[:Hangul:]&[\uFF00-\uFFFF]] NFKD);
|
||||
|
||||
# eof
|
@ -51,9 +51,9 @@ Greek-Latin:file:Transliterator_Greek_Latin.txt:UTF8:FORWARD
|
||||
Latin-el:file:Transliterator_el_Latin.txt:UTF8:REVERSE
|
||||
el-Latin:file:Transliterator_el_Latin.txt:UTF8:FORWARD
|
||||
|
||||
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.utf8.txt:UTF8:FORWARD
|
||||
Latin-Jamo:alias:Any-Lower;LowerLatin-Jamo
|
||||
Jamo-Latin:file:Transliterator_Latin_Jamo.utf8.txt:UTF8:REVERSE
|
||||
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.txt:UTF8:FORWARD
|
||||
Latin-Jamo:alias:[:Latin:]Any-Lower;LowerLatin-Jamo
|
||||
Jamo-Latin:file:Transliterator_Latin_Jamo.txt:UTF8:REVERSE
|
||||
|
||||
Latin-Katakana:file:Transliterator_Latin_Katakana.txt:UTF8:FORWARD
|
||||
Katakana-Latin:file:Transliterator_Latin_Katakana.txt:UTF8:REVERSE
|
||||
@ -78,8 +78,8 @@ Kanji-OnRomaji:file:Transliterator_Kanji_OnRomaji.utf8.txt:UTF8:FORWARD
|
||||
|
||||
# Compound rules
|
||||
|
||||
Latin-Hangul:alias:[\p{Latin}];Latin-Jamo;[\u1100-\u11FF]NFC
|
||||
Hangul-Latin:alias:[\uAC00-\uD7AF];NFD;Jamo-Latin
|
||||
Latin-Hangul:alias:[\p{Latin}];NFD;Latin-Jamo;NFC
|
||||
Hangul-Latin:alias:[:Hangul:];NFD;Jamo-Latin
|
||||
|
||||
# Inter-Indic composed rules
|
||||
Latin-InterIndic:internal:Transliterator_Latin_InterIndic.txt:UTF8:FORWARD
|
||||
|
270
icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.txt
Executable file
270
icu4j/src/com/ibm/text/resources/Transliterator_Fullwidth_Halfwidth.txt
Executable file
@ -0,0 +1,270 @@
|
||||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
# Date: Tue Jan 23 12:41:57 2001
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Fullwidth-Halfwidth
|
||||
|
||||
# Mechanically generated from Unicode Character Database
|
||||
|
||||
# multicharacter
|
||||
|
||||
ガ<>ガ; # to KATAKANA LETTER GA
|
||||
ギ<>ギ; # to KATAKANA LETTER GI
|
||||
グ<>グ; # to KATAKANA LETTER GU
|
||||
ゲ<>ゲ; # to KATAKANA LETTER GE
|
||||
ゴ<>ゴ; # to KATAKANA LETTER GO
|
||||
ザ<>ザ; # to KATAKANA LETTER ZA
|
||||
ジ<>ジ; # to KATAKANA LETTER ZI
|
||||
ズ<>ズ; # to KATAKANA LETTER ZU
|
||||
ゼ<>ゼ; # to KATAKANA LETTER ZE
|
||||
ゾ<>ゾ; # to KATAKANA LETTER ZO
|
||||
ダ<>ダ; # to KATAKANA LETTER DA
|
||||
ヂ<>ヂ; # to KATAKANA LETTER DI
|
||||
ヅ<>ヅ; # to KATAKANA LETTER DU
|
||||
デ<>デ; # to KATAKANA LETTER DE
|
||||
ド<>ド; # to KATAKANA LETTER DO
|
||||
バ<>バ; # to KATAKANA LETTER BA
|
||||
パ<>パ; # to KATAKANA LETTER PA
|
||||
ビ<>ビ; # to KATAKANA LETTER BI
|
||||
ピ<>ピ; # to KATAKANA LETTER PI
|
||||
ブ<>ブ; # to KATAKANA LETTER BU
|
||||
プ<>プ; # to KATAKANA LETTER PU
|
||||
ベ<>ベ; # to KATAKANA LETTER BE
|
||||
ペ<>ペ; # to KATAKANA LETTER PE
|
||||
ボ<>ボ; # to KATAKANA LETTER BO
|
||||
ポ<>ポ; # to KATAKANA LETTER PO
|
||||
ヴ<>ヴ; # to KATAKANA LETTER VU
|
||||
ヷ<>ヷ; # to KATAKANA LETTER VA
|
||||
ヺ<>ヺ; # to KATAKANA LETTER VO
|
||||
|
||||
# single character
|
||||
|
||||
!<>'!'; # from FULLWIDTH EXCLAMATION MARK
|
||||
"<>'\"'; # from FULLWIDTH QUOTATION MARK
|
||||
#<>'#'; # from FULLWIDTH NUMBER SIGN
|
||||
$<>'$'; # from FULLWIDTH DOLLAR SIGN
|
||||
%<>'%'; # from FULLWIDTH PERCENT SIGN
|
||||
&<>'&'; # from FULLWIDTH AMPERSAND
|
||||
'<>''; # from FULLWIDTH APOSTROPHE
|
||||
(<>'('; # from FULLWIDTH LEFT PARENTHESIS
|
||||
)<>')'; # from FULLWIDTH RIGHT PARENTHESIS
|
||||
*<>'*'; # from FULLWIDTH ASTERISK
|
||||
+<>'+'; # from FULLWIDTH PLUS SIGN
|
||||
,<>','; # from FULLWIDTH COMMA
|
||||
-<>'-'; # from FULLWIDTH HYPHEN-MINUS
|
||||
.<>'.'; # from FULLWIDTH FULL STOP
|
||||
/<>'/'; # from FULLWIDTH SOLIDUS
|
||||
0<>'0'; # from FULLWIDTH DIGIT ZERO
|
||||
1<>'1'; # from FULLWIDTH DIGIT ONE
|
||||
2<>'2'; # from FULLWIDTH DIGIT TWO
|
||||
3<>'3'; # from FULLWIDTH DIGIT THREE
|
||||
4<>'4'; # from FULLWIDTH DIGIT FOUR
|
||||
5<>'5'; # from FULLWIDTH DIGIT FIVE
|
||||
6<>'6'; # from FULLWIDTH DIGIT SIX
|
||||
7<>'7'; # from FULLWIDTH DIGIT SEVEN
|
||||
8<>'8'; # from FULLWIDTH DIGIT EIGHT
|
||||
9<>'9'; # from FULLWIDTH DIGIT NINE
|
||||
:<>':'; # from FULLWIDTH COLON
|
||||
;<>';'; # from FULLWIDTH SEMICOLON
|
||||
<<>'<'; # from FULLWIDTH LESS-THAN SIGN
|
||||
=<>'='; # from FULLWIDTH EQUALS SIGN
|
||||
><>'>'; # from FULLWIDTH GREATER-THAN SIGN
|
||||
?<>'?'; # from FULLWIDTH QUESTION MARK
|
||||
@<>'@'; # from FULLWIDTH COMMERCIAL AT
|
||||
A<>A; # from FULLWIDTH LATIN CAPITAL LETTER A
|
||||
B<>B; # from FULLWIDTH LATIN CAPITAL LETTER B
|
||||
C<>C; # from FULLWIDTH LATIN CAPITAL LETTER C
|
||||
D<>D; # from FULLWIDTH LATIN CAPITAL LETTER D
|
||||
E<>E; # from FULLWIDTH LATIN CAPITAL LETTER E
|
||||
F<>F; # from FULLWIDTH LATIN CAPITAL LETTER F
|
||||
G<>G; # from FULLWIDTH LATIN CAPITAL LETTER G
|
||||
H<>H; # from FULLWIDTH LATIN CAPITAL LETTER H
|
||||
I<>I; # from FULLWIDTH LATIN CAPITAL LETTER I
|
||||
J<>J; # from FULLWIDTH LATIN CAPITAL LETTER J
|
||||
K<>K; # from FULLWIDTH LATIN CAPITAL LETTER K
|
||||
L<>L; # from FULLWIDTH LATIN CAPITAL LETTER L
|
||||
M<>M; # from FULLWIDTH LATIN CAPITAL LETTER M
|
||||
N<>N; # from FULLWIDTH LATIN CAPITAL LETTER N
|
||||
O<>O; # from FULLWIDTH LATIN CAPITAL LETTER O
|
||||
P<>P; # from FULLWIDTH LATIN CAPITAL LETTER P
|
||||
Q<>Q; # from FULLWIDTH LATIN CAPITAL LETTER Q
|
||||
R<>R; # from FULLWIDTH LATIN CAPITAL LETTER R
|
||||
S<>S; # from FULLWIDTH LATIN CAPITAL LETTER S
|
||||
T<>T; # from FULLWIDTH LATIN CAPITAL LETTER T
|
||||
U<>U; # from FULLWIDTH LATIN CAPITAL LETTER U
|
||||
V<>V; # from FULLWIDTH LATIN CAPITAL LETTER V
|
||||
W<>W; # from FULLWIDTH LATIN CAPITAL LETTER W
|
||||
X<>X; # from FULLWIDTH LATIN CAPITAL LETTER X
|
||||
Y<>Y; # from FULLWIDTH LATIN CAPITAL LETTER Y
|
||||
Z<>Z; # from FULLWIDTH LATIN CAPITAL LETTER Z
|
||||
[<>'['; # from FULLWIDTH LEFT SQUARE BRACKET
|
||||
\<>'\\'; # from FULLWIDTH REVERSE SOLIDUS {double escape - aliu}
|
||||
]<>']'; # from FULLWIDTH RIGHT SQUARE BRACKET
|
||||
^<>'^'; # from FULLWIDTH CIRCUMFLEX ACCENT
|
||||
_<>'_'; # from FULLWIDTH LOW LINE
|
||||
`<>'`'; # from FULLWIDTH GRAVE ACCENT
|
||||
a<>a; # from FULLWIDTH LATIN SMALL LETTER A
|
||||
b<>b; # from FULLWIDTH LATIN SMALL LETTER B
|
||||
c<>c; # from FULLWIDTH LATIN SMALL LETTER C
|
||||
d<>d; # from FULLWIDTH LATIN SMALL LETTER D
|
||||
e<>e; # from FULLWIDTH LATIN SMALL LETTER E
|
||||
f<>f; # from FULLWIDTH LATIN SMALL LETTER F
|
||||
g<>g; # from FULLWIDTH LATIN SMALL LETTER G
|
||||
h<>h; # from FULLWIDTH LATIN SMALL LETTER H
|
||||
i<>i; # from FULLWIDTH LATIN SMALL LETTER I
|
||||
j<>j; # from FULLWIDTH LATIN SMALL LETTER J
|
||||
k<>k; # from FULLWIDTH LATIN SMALL LETTER K
|
||||
l<>l; # from FULLWIDTH LATIN SMALL LETTER L
|
||||
m<>m; # from FULLWIDTH LATIN SMALL LETTER M
|
||||
n<>n; # from FULLWIDTH LATIN SMALL LETTER N
|
||||
o<>o; # from FULLWIDTH LATIN SMALL LETTER O
|
||||
p<>p; # from FULLWIDTH LATIN SMALL LETTER P
|
||||
q<>q; # from FULLWIDTH LATIN SMALL LETTER Q
|
||||
r<>r; # from FULLWIDTH LATIN SMALL LETTER R
|
||||
s<>s; # from FULLWIDTH LATIN SMALL LETTER S
|
||||
t<>t; # from FULLWIDTH LATIN SMALL LETTER T
|
||||
u<>u; # from FULLWIDTH LATIN SMALL LETTER U
|
||||
v<>v; # from FULLWIDTH LATIN SMALL LETTER V
|
||||
w<>w; # from FULLWIDTH LATIN SMALL LETTER W
|
||||
x<>x; # from FULLWIDTH LATIN SMALL LETTER X
|
||||
y<>y; # from FULLWIDTH LATIN SMALL LETTER Y
|
||||
z<>z; # from FULLWIDTH LATIN SMALL LETTER Z
|
||||
{<>'{'; # from FULLWIDTH LEFT CURLY BRACKET
|
||||
|<>'|'; # from FULLWIDTH VERTICAL LINE
|
||||
}<>'}'; # from FULLWIDTH RIGHT CURLY BRACKET
|
||||
~<>'~'; # from FULLWIDTH TILDE
|
||||
。<>。; # to HALFWIDTH IDEOGRAPHIC FULL STOP
|
||||
「<>「; # to HALFWIDTH LEFT CORNER BRACKET
|
||||
」<>」; # to HALFWIDTH RIGHT CORNER BRACKET
|
||||
、<>、; # to HALFWIDTH IDEOGRAPHIC COMMA
|
||||
・<>・; # to HALFWIDTH KATAKANA MIDDLE DOT
|
||||
ヲ<>ヲ; # to HALFWIDTH KATAKANA LETTER WO
|
||||
ァ<>ァ; # to HALFWIDTH KATAKANA LETTER SMALL A
|
||||
ィ<>ィ; # to HALFWIDTH KATAKANA LETTER SMALL I
|
||||
ゥ<>ゥ; # to HALFWIDTH KATAKANA LETTER SMALL U
|
||||
ェ<>ェ; # to HALFWIDTH KATAKANA LETTER SMALL E
|
||||
ォ<>ォ; # to HALFWIDTH KATAKANA LETTER SMALL O
|
||||
ャ<>ャ; # to HALFWIDTH KATAKANA LETTER SMALL YA
|
||||
ュ<>ュ; # to HALFWIDTH KATAKANA LETTER SMALL YU
|
||||
ョ<>ョ; # to HALFWIDTH KATAKANA LETTER SMALL YO
|
||||
ッ<>ッ; # to HALFWIDTH KATAKANA LETTER SMALL TU
|
||||
ー<>ー; # to HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
ア<>ア; # to HALFWIDTH KATAKANA LETTER A
|
||||
イ<>イ; # to HALFWIDTH KATAKANA LETTER I
|
||||
ウ<>ウ; # to HALFWIDTH KATAKANA LETTER U
|
||||
エ<>エ; # to HALFWIDTH KATAKANA LETTER E
|
||||
オ<>オ; # to HALFWIDTH KATAKANA LETTER O
|
||||
カ<>カ; # to HALFWIDTH KATAKANA LETTER KA
|
||||
キ<>キ; # to HALFWIDTH KATAKANA LETTER KI
|
||||
ク<>ク; # to HALFWIDTH KATAKANA LETTER KU
|
||||
ケ<>ケ; # to HALFWIDTH KATAKANA LETTER KE
|
||||
コ<>コ; # to HALFWIDTH KATAKANA LETTER KO
|
||||
サ<>サ; # to HALFWIDTH KATAKANA LETTER SA
|
||||
シ<>シ; # to HALFWIDTH KATAKANA LETTER SI
|
||||
ス<>ス; # to HALFWIDTH KATAKANA LETTER SU
|
||||
セ<>セ; # to HALFWIDTH KATAKANA LETTER SE
|
||||
ソ<>ソ; # to HALFWIDTH KATAKANA LETTER SO
|
||||
タ<>タ; # to HALFWIDTH KATAKANA LETTER TA
|
||||
チ<>チ; # to HALFWIDTH KATAKANA LETTER TI
|
||||
ツ<>ツ; # to HALFWIDTH KATAKANA LETTER TU
|
||||
テ<>テ; # to HALFWIDTH KATAKANA LETTER TE
|
||||
ト<>ト; # to HALFWIDTH KATAKANA LETTER TO
|
||||
ナ<>ナ; # to HALFWIDTH KATAKANA LETTER NA
|
||||
ニ<>ニ; # to HALFWIDTH KATAKANA LETTER NI
|
||||
ヌ<>ヌ; # to HALFWIDTH KATAKANA LETTER NU
|
||||
ネ<>ネ; # to HALFWIDTH KATAKANA LETTER NE
|
||||
ノ<>ノ; # to HALFWIDTH KATAKANA LETTER NO
|
||||
ハ<>ハ; # to HALFWIDTH KATAKANA LETTER HA
|
||||
ヒ<>ヒ; # to HALFWIDTH KATAKANA LETTER HI
|
||||
フ<>フ; # to HALFWIDTH KATAKANA LETTER HU
|
||||
ヘ<>ヘ; # to HALFWIDTH KATAKANA LETTER HE
|
||||
ホ<>ホ; # to HALFWIDTH KATAKANA LETTER HO
|
||||
マ<>マ; # to HALFWIDTH KATAKANA LETTER MA
|
||||
ミ<>ミ; # to HALFWIDTH KATAKANA LETTER MI
|
||||
ム<>ム; # to HALFWIDTH KATAKANA LETTER MU
|
||||
メ<>メ; # to HALFWIDTH KATAKANA LETTER ME
|
||||
モ<>モ; # to HALFWIDTH KATAKANA LETTER MO
|
||||
ヤ<>ヤ; # to HALFWIDTH KATAKANA LETTER YA
|
||||
ユ<>ユ; # to HALFWIDTH KATAKANA LETTER YU
|
||||
ヨ<>ヨ; # to HALFWIDTH KATAKANA LETTER YO
|
||||
ラ<>ラ; # to HALFWIDTH KATAKANA LETTER RA
|
||||
リ<>リ; # to HALFWIDTH KATAKANA LETTER RI
|
||||
ル<>ル; # to HALFWIDTH KATAKANA LETTER RU
|
||||
レ<>レ; # to HALFWIDTH KATAKANA LETTER RE
|
||||
ロ<>ロ; # to HALFWIDTH KATAKANA LETTER RO
|
||||
ワ<>ワ; # to HALFWIDTH KATAKANA LETTER WA
|
||||
ン<>ン; # to HALFWIDTH KATAKANA LETTER N
|
||||
゙<>゙; # to HALFWIDTH KATAKANA VOICED SOUND MARK
|
||||
゚<>゚; # to HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
|
||||
ᅠ<>ᅠ; # to HALFWIDTH HANGUL FILLER
|
||||
ᄀ<>ᄀ; # to HALFWIDTH HANGUL LETTER KIYEOK
|
||||
ᄁ<>ᄁ; # to HALFWIDTH HANGUL LETTER SSANGKIYEOK
|
||||
ᆪ<>ᆪ; # to HALFWIDTH HANGUL LETTER KIYEOK-SIOS
|
||||
ᄂ<>ᄂ; # to HALFWIDTH HANGUL LETTER NIEUN
|
||||
ᆬ<>ᆬ; # to HALFWIDTH HANGUL LETTER NIEUN-CIEUC
|
||||
ᆭ<>ᆭ; # to HALFWIDTH HANGUL LETTER NIEUN-HIEUH
|
||||
ᄃ<>ᄃ; # to HALFWIDTH HANGUL LETTER TIKEUT
|
||||
ᄄ<>ᄄ; # to HALFWIDTH HANGUL LETTER SSANGTIKEUT
|
||||
ᄅ<>ᄅ; # to HALFWIDTH HANGUL LETTER RIEUL
|
||||
ᆰ<>ᆰ; # to HALFWIDTH HANGUL LETTER RIEUL-KIYEOK
|
||||
ᆱ<>ᆱ; # to HALFWIDTH HANGUL LETTER RIEUL-MIEUM
|
||||
ᆲ<>ᆲ; # to HALFWIDTH HANGUL LETTER RIEUL-PIEUP
|
||||
ᆳ<>ᆳ; # to HALFWIDTH HANGUL LETTER RIEUL-SIOS
|
||||
ᆴ<>ᆴ; # to HALFWIDTH HANGUL LETTER RIEUL-THIEUTH
|
||||
ᆵ<>ᆵ; # to HALFWIDTH HANGUL LETTER RIEUL-PHIEUPH
|
||||
ᄚ<>ᄚ; # to HALFWIDTH HANGUL LETTER RIEUL-HIEUH
|
||||
ᄆ<>ᄆ; # to HALFWIDTH HANGUL LETTER MIEUM
|
||||
ᄇ<>ᄇ; # to HALFWIDTH HANGUL LETTER PIEUP
|
||||
ᄈ<>ᄈ; # to HALFWIDTH HANGUL LETTER SSANGPIEUP
|
||||
ᄡ<>ᄡ; # to HALFWIDTH HANGUL LETTER PIEUP-SIOS
|
||||
ᄉ<>ᄉ; # to HALFWIDTH HANGUL LETTER SIOS
|
||||
ᄊ<>ᄊ; # to HALFWIDTH HANGUL LETTER SSANGSIOS
|
||||
ᄋ<>ᄋ; # to HALFWIDTH HANGUL LETTER IEUNG
|
||||
ᄌ<>ᄌ; # to HALFWIDTH HANGUL LETTER CIEUC
|
||||
ᄍ<>ᄍ; # to HALFWIDTH HANGUL LETTER SSANGCIEUC
|
||||
ᄎ<>ᄎ; # to HALFWIDTH HANGUL LETTER CHIEUCH
|
||||
ᄏ<>ᄏ; # to HALFWIDTH HANGUL LETTER KHIEUKH
|
||||
ᄐ<>ᄐ; # to HALFWIDTH HANGUL LETTER THIEUTH
|
||||
ᄑ<>ᄑ; # to HALFWIDTH HANGUL LETTER PHIEUPH
|
||||
ᄒ<>ᄒ; # to HALFWIDTH HANGUL LETTER HIEUH
|
||||
ᅡ<>ᅡ; # to HALFWIDTH HANGUL LETTER A
|
||||
ᅢ<>ᅢ; # to HALFWIDTH HANGUL LETTER AE
|
||||
ᅣ<>ᅣ; # to HALFWIDTH HANGUL LETTER YA
|
||||
ᅤ<>ᅤ; # to HALFWIDTH HANGUL LETTER YAE
|
||||
ᅥ<>ᅥ; # to HALFWIDTH HANGUL LETTER EO
|
||||
ᅦ<>ᅦ; # to HALFWIDTH HANGUL LETTER E
|
||||
ᅧ<>ᅧ; # to HALFWIDTH HANGUL LETTER YEO
|
||||
ᅨ<>ᅨ; # to HALFWIDTH HANGUL LETTER YE
|
||||
ᅩ<>ᅩ; # to HALFWIDTH HANGUL LETTER O
|
||||
ᅪ<>ᅪ; # to HALFWIDTH HANGUL LETTER WA
|
||||
ᅫ<>ᅫ; # to HALFWIDTH HANGUL LETTER WAE
|
||||
ᅬ<>ᅬ; # to HALFWIDTH HANGUL LETTER OE
|
||||
ᅭ<>ᅭ; # to HALFWIDTH HANGUL LETTER YO
|
||||
ᅮ<>ᅮ; # to HALFWIDTH HANGUL LETTER U
|
||||
ᅯ<>ᅯ; # to HALFWIDTH HANGUL LETTER WEO
|
||||
ᅰ<>ᅰ; # to HALFWIDTH HANGUL LETTER WE
|
||||
ᅱ<>ᅱ; # to HALFWIDTH HANGUL LETTER WI
|
||||
ᅲ<>ᅲ; # to HALFWIDTH HANGUL LETTER YU
|
||||
ᅳ<>ᅳ; # to HALFWIDTH HANGUL LETTER EU
|
||||
ᅴ<>ᅴ; # to HALFWIDTH HANGUL LETTER YI
|
||||
ᅵ<>ᅵ; # to HALFWIDTH HANGUL LETTER I
|
||||
¢<>'¢'; # from FULLWIDTH CENT SIGN
|
||||
£<>'£'; # from FULLWIDTH POUND SIGN
|
||||
¬<>'¬'; # from FULLWIDTH NOT SIGN
|
||||
 ̄<>' '̄; # from FULLWIDTH MACRON
|
||||
' '<>' '; # ideographic space (place this after MACRON)
|
||||
¦<>'¦'; # from FULLWIDTH BROKEN BAR
|
||||
¥<>'¥'; # from FULLWIDTH YEN SIGN
|
||||
₩<>₩; # from FULLWIDTH WON SIGN
|
||||
│<>│; # to HALFWIDTH FORMS LIGHT VERTICAL
|
||||
←<>←; # to HALFWIDTH LEFTWARDS ARROW
|
||||
↑<>↑; # to HALFWIDTH UPWARDS ARROW
|
||||
→<>→; # to HALFWIDTH RIGHTWARDS ARROW
|
||||
↓<>↓; # to HALFWIDTH DOWNWARDS ARROW
|
||||
■<>■; # to HALFWIDTH BLACK SQUARE
|
||||
○<>○; # to HALFWIDTH WHITE CIRCLE
|
||||
|
||||
# eof
|
515
icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.txt
Executable file
515
icu4j/src/com/ibm/text/resources/Transliterator_Latin_Jamo.txt
Executable file
@ -0,0 +1,515 @@
|
||||
#--------------------------------------------------------------------
|
||||
# Copyright (c) 1999-2001, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#--------------------------------------------------------------------
|
||||
|
||||
# Latin-Jamo
|
||||
|
||||
:: [:Latin:] NFKD ();
|
||||
:: [:Latin:] Lower ();
|
||||
|
||||
# Transliteration from Latin characters to Korean script is done in
|
||||
# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
|
||||
# transliteration is done algorithmically following Unicode 3.0
|
||||
# section 3.11. This file implements the Latin to Jamo
|
||||
# transliteration using rules.
|
||||
|
||||
# Jamo occupy the block 1100-11FF. Within this block there are three
|
||||
# groups of characters: initial consonants or choseong (I), medial
|
||||
# vowels or jungseong (M), and trailing consonants or jongseong (F).
|
||||
# Standard Korean syllables are of the form I+M+F*.
|
||||
|
||||
# Section 3.11 describes the use of 'filler' jamo to convert
|
||||
# nonstandard syllables to standard form: the choseong filler 115F and
|
||||
# the jungseong filler 1160. In this transliterator, we will not use
|
||||
# 115F or 1160.
|
||||
|
||||
# We will, however, insert two 'null' jamo to make foreign words
|
||||
# conform to Korean syllable structure. These are the null initial
|
||||
# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
|
||||
# we will use the hyphen in order to disambiguate strings,
|
||||
# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
|
||||
|
||||
# We will not use all of the characters in the jamo block. We will
|
||||
# only use the 19 initials, 21 medials, and 27 finals possessing a
|
||||
# jamo short name as defined in section 4.4 of the Unicode book.
|
||||
|
||||
# Rules of thumb. These guidelines provide the basic framework
|
||||
# for the rules. They are phrased in terms of Latin-Jamo transliteration.
|
||||
# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
|
||||
# just context-free transliteration of jamo to corresponding short names,
|
||||
# with the addition of hyphens to maintain round-trip integrity
|
||||
# in the context of the Latin-Jamo rules.
|
||||
|
||||
# A sequence of vowels:
|
||||
# - Take the longest sequence you can. If there are too many, or you don't
|
||||
# have a starting consonant, introduce a 110B (IEUNG) as necessary.
|
||||
|
||||
# A sequence of consonants:
|
||||
# - First join the double consonants: G + G -> GG
|
||||
# - In the remaining list,
|
||||
# -- If there is no preceding vowel, take the first consonant, and insert EU
|
||||
# after it. Continue with the rest of the consonants.
|
||||
# -- If there is one consonant, attach to the following vowel
|
||||
# -- If there are two consonants and a following vowel, attach one to the
|
||||
# preceding vowel, and one to the following vowel.
|
||||
# -- If there are more than two consonants, join the first two together if you
|
||||
# can: L + G => LG
|
||||
# -- If you still end up with more than 2 consonants, insert EU after the
|
||||
# first one, and continue with the rest of the consonants.
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Variables
|
||||
|
||||
# Some latin consonants or consonant pairs only occur as initials, and
|
||||
# some only as finals, but some occur as both. This makes some jamo
|
||||
# consonants ambiguous when transliterated into latin.
|
||||
# Initial only: IEUNG BB DD JJ R
|
||||
# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
|
||||
# Initial and Final: B C D G GG H J K M N P S SS T
|
||||
|
||||
$Gi = \u1100;
|
||||
$GGi = \u1101;
|
||||
$Ni = \u1102;
|
||||
$Di = \u1103;
|
||||
$DD = \u1104;
|
||||
$R = \u1105;
|
||||
$Mi = \u1106;
|
||||
$Bi = \u1107;
|
||||
$BB = \u1108;
|
||||
$Si = \u1109;
|
||||
$SSi = \u110A;
|
||||
$IEUNG = \u110B; # null initial, inserted during Latin-Jamo
|
||||
$Ji = \u110C;
|
||||
$JJ = \u110D;
|
||||
$Ci = \u110E;
|
||||
$Ki = \u110F;
|
||||
$Ti = \u1110;
|
||||
$Pi = \u1111;
|
||||
$Hi = \u1112;
|
||||
|
||||
$A = \u1161;
|
||||
$AE = \u1162;
|
||||
$YA = \u1163;
|
||||
$YAE = \u1164;
|
||||
$EO = \u1165;
|
||||
$E = \u1166;
|
||||
$YEO = \u1167;
|
||||
$YE = \u1168;
|
||||
$O = \u1169;
|
||||
$WA = \u116A;
|
||||
$WAE = \u116B;
|
||||
$OE = \u116C;
|
||||
$YO = \u116D;
|
||||
$U = \u116E;
|
||||
$WEO = \u116F;
|
||||
$WE = \u1170;
|
||||
$WI = \u1171;
|
||||
$YU = \u1172;
|
||||
$EU = \u1173; # null medial, inserted during Latin-Jamo
|
||||
$YI = \u1174;
|
||||
$I = \u1175;
|
||||
|
||||
$Gf = \u11A8;
|
||||
$GGf = \u11A9;
|
||||
$GS = \u11AA;
|
||||
$Nf = \u11AB;
|
||||
$NJ = \u11AC;
|
||||
$NH = \u11AD;
|
||||
$Df = \u11AE;
|
||||
$L = \u11AF;
|
||||
$LG = \u11B0;
|
||||
$LM = \u11B1;
|
||||
$LB = \u11B2;
|
||||
$LS = \u11B3;
|
||||
$LT = \u11B4;
|
||||
$LP = \u11B5;
|
||||
$LH = \u11B6;
|
||||
$Mf = \u11B7;
|
||||
$Bf = \u11B8;
|
||||
$BS = \u11B9;
|
||||
$Sf = \u11BA;
|
||||
$SSf = \u11BB;
|
||||
$NG = \u11BC;
|
||||
$Jf = \u11BD;
|
||||
$Cf = \u11BE;
|
||||
$Kf = \u11BF;
|
||||
$Tf = \u11C0;
|
||||
$Pf = \u11C1;
|
||||
$Hf = \u11C2;
|
||||
|
||||
$jamoInitial = [\u1100-\u1112];
|
||||
|
||||
$jamoMedial = [\u1161-\u1175];
|
||||
|
||||
$latinInitial = [bcdghjkmnprst];
|
||||
|
||||
# Any character in the latin transliteration of a medial
|
||||
$latinMedial = [aeiouwy];
|
||||
|
||||
# The last character of the latin transliteration of a medial
|
||||
$latinMedialEnd = [aeiou];
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Jamo-Latin
|
||||
|
||||
# Jamo to latin is relatively simple, since it is the latin that is
|
||||
# ambiguous. Most rules are straightforward, and we encode them below
|
||||
# as simple add-on back rules, e.g.:
|
||||
|
||||
# $jamoMedial {bs} > $BS;
|
||||
|
||||
# becomes
|
||||
|
||||
# $jamoMedial {bs} <> $BS;
|
||||
|
||||
# Furthermore, we don't care about the ordering for Jamo-Latin because
|
||||
# we are going from single characters, so we can very easily piggyback
|
||||
# on the Latin-Jamo.
|
||||
|
||||
# The main issue with Jamo-Latin is when to insert hyphens.
|
||||
# Hyphens are inserted to obtain correct round trip behavior. For
|
||||
# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
|
||||
# would then round trip to Ki A GGi E. To prevent this, we insert a
|
||||
# hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
|
||||
# very specifically on the behavior of the Latin-Jamo rules. A change
|
||||
# in the Latin-Jamo behavior can completely change the way the
|
||||
# hyphen insertion must be done.
|
||||
|
||||
# First try to preserve actual hyphens in the jamo text by doubling
|
||||
# them. This fixes problems like:
|
||||
# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
|
||||
# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
|
||||
# -- if we don't care about losing hyphens in the jamo, we can delete
|
||||
# this rule.
|
||||
|
||||
'--' <> '-';
|
||||
|
||||
# Triple consonants. For three consonants "axxx" we insert a
|
||||
# hyphen between the first and second "x" if XXf, Xf, and Xi all
|
||||
# exist, and we have A Xf XXi. This prevents the reverse
|
||||
# transliteration to A XXf Xi.
|
||||
|
||||
'-' < $latinMedialEnd g {} $GGi;
|
||||
'-' < $latinMedialEnd s {} $SSi;
|
||||
|
||||
# For vowels the rule is similar. If there is a vowel "ae" such that
|
||||
# "a" by itself and "e" by itself are vowels, then we want to map A E
|
||||
# to "a-e" so as not to round trip to AE. However, in the text Ki EO
|
||||
# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
|
||||
# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
|
||||
# tested. NOTE: These rules used to have a left context of
|
||||
# $latinInitial instead of [^$latinMedial]. The problem with this is
|
||||
# sequences where an initial IEUNG is transliterated away:
|
||||
# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
|
||||
|
||||
'-' < [^$latinMedial] [y w] e {} [$O $OE];
|
||||
'-' < [^$latinMedial] e {} [$O $OE $U];
|
||||
'-' < [^$latinMedial] [o a] {} [$E $EO $EU];
|
||||
'-' < [^$latinMedial] [w y] a {} [$E $EO $EU];
|
||||
|
||||
# Similar to the above, but with an intervening $IEUNG.
|
||||
|
||||
'-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];
|
||||
'-' < [^$latinMedial] e {} $IEUNG [$O $OE $U];
|
||||
'-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
|
||||
'-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
|
||||
|
||||
# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
|
||||
# where Xi also exists, must be transliterated as "ax-e" to prevent
|
||||
# the round trip conversion to A Xi E.
|
||||
|
||||
'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;
|
||||
|
||||
# Double finals followed by IEUNG. Similar to the single finals
|
||||
# followed by IEUNG. Any latin consonant pair X Y, between medials,
|
||||
# that we would split by Latin-Jamo, we must handle when it occurs as
|
||||
# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
|
||||
# E.
|
||||
|
||||
'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;
|
||||
'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;
|
||||
|
||||
# Split doubles. Text of the form A Xi Xf E, where XXi also occurs,
|
||||
# we transliterate as "ax-xe" to prevent round trip transliteration as
|
||||
# A XXi E.
|
||||
|
||||
'-' < $latinMedialEnd b {} $Bi $jamoMedial;
|
||||
'-' < $latinMedialEnd d {} $Di $jamoMedial;
|
||||
'-' < $latinMedialEnd j {} $Ji $jamoMedial;
|
||||
'-' < $latinMedialEnd g {} $Gi $jamoMedial;
|
||||
'-' < $latinMedialEnd s {} $Si $jamoMedial;
|
||||
|
||||
# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
|
||||
# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
|
||||
# "xyy" forms that correspond to XYf Yi must be transliterated as
|
||||
# "xy-y".
|
||||
|
||||
'-' < $latinMedialEnd b s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd g s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd l b {} [$Bi $BB];
|
||||
'-' < $latinMedialEnd l g {} [$Gi $GGi];
|
||||
'-' < $latinMedialEnd l s {} [$Si $SSi];
|
||||
'-' < $latinMedialEnd n g {} [$Gi $GGi];
|
||||
'-' < $latinMedialEnd n j {} [$Ji $JJ];
|
||||
|
||||
# Deletion of IEUNG is handled below.
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Latin-Jamo
|
||||
|
||||
# [Basic, context-free Jamo-Latin rules are embedded here too. See
|
||||
# above.]
|
||||
|
||||
# Split digraphs: Text of the form 'axye', where 'xy' is a final
|
||||
# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
|
||||
# 'e' are medials, we want to transliterate this as A Xf Yi E rather
|
||||
# than A XYf IEUNG E. We do NOT include text of the form "axxe",
|
||||
# since that is handled differently below. These rules are generated
|
||||
# programmatically from the jamo data.
|
||||
|
||||
$jamoMedial {b s} $latinMedial > $Bf $Si;
|
||||
$jamoMedial {g s} $latinMedial > $Gf $Si;
|
||||
$jamoMedial {l b} $latinMedial > $L $Bi;
|
||||
$jamoMedial {l g} $latinMedial > $L $Gi;
|
||||
$jamoMedial {l h} $latinMedial > $L $Hi;
|
||||
$jamoMedial {l m} $latinMedial > $L $Mi;
|
||||
$jamoMedial {l p} $latinMedial > $L $Pi;
|
||||
$jamoMedial {l s} $latinMedial > $L $Si;
|
||||
$jamoMedial {l t} $latinMedial > $L $Ti;
|
||||
$jamoMedial {n g} $latinMedial > $Nf $Gi;
|
||||
$jamoMedial {n h} $latinMedial > $Nf $Hi;
|
||||
$jamoMedial {n j} $latinMedial > $Nf $Ji;
|
||||
|
||||
# Single consonants are initials: Text of the form 'axe', where 'x'
|
||||
# can be an initial or a final, and 'a' and 'e' are medials, we want
|
||||
# to transliterate as A Xi E rather than A Xf IEUNG E.
|
||||
|
||||
$jamoMedial {b} $latinMedial > $Bi;
|
||||
$jamoMedial {c} $latinMedial > $Ci;
|
||||
$jamoMedial {d} $latinMedial > $Di;
|
||||
$jamoMedial {g} $latinMedial > $Gi;
|
||||
$jamoMedial {h} $latinMedial > $Hi;
|
||||
$jamoMedial {j} $latinMedial > $Ji;
|
||||
$jamoMedial {k} $latinMedial > $Ki;
|
||||
$jamoMedial {m} $latinMedial > $Mi;
|
||||
$jamoMedial {n} $latinMedial > $Ni;
|
||||
$jamoMedial {p} $latinMedial > $Pi;
|
||||
$jamoMedial {s} $latinMedial > $Si;
|
||||
$jamoMedial {t} $latinMedial > $Ti;
|
||||
|
||||
# Doubled initials. The sequence "axxe", where XX exists as an initial
|
||||
# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
|
||||
# to transliterate as A XXi E, rather than split to A Xf Xi E.
|
||||
|
||||
$jamoMedial {b b} $latinMedial > $BB;
|
||||
$jamoMedial {d d} $latinMedial > $DD;
|
||||
$jamoMedial {j j} $latinMedial > $JJ;
|
||||
$jamoMedial {g g} $latinMedial > $GGi;
|
||||
$jamoMedial {s s} $latinMedial > $SSi;
|
||||
|
||||
# XYY. Because doubled consonants bind more strongly than XY
|
||||
# consonants, we must handle the sequence "axyy" specially. Here XYf
|
||||
# and YYi must exist. In these cases, we map to Xf YYi rather than
|
||||
# XYf.
|
||||
|
||||
$jamoMedial {b} s s > $Bf;
|
||||
$jamoMedial {g} s s > $Gf;
|
||||
$jamoMedial {l} b b > $L;
|
||||
$jamoMedial {l} g g > $L;
|
||||
$jamoMedial {l} s s > $L;
|
||||
$jamoMedial {n} g g > $Nf;
|
||||
$jamoMedial {n} j j > $Nf;
|
||||
|
||||
# Finals: Attach a consonant that follows a medial to that preceding medial.
|
||||
# Do this BEFORE mapping consonants to initials. Longer keys must
|
||||
# precede shorter keys that they start with, e.g., the rule for 'bs'
|
||||
# must precede 'b'.
|
||||
|
||||
# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
|
||||
# block for Jamo-Latin.]
|
||||
|
||||
$jamoMedial {bs} <> $BS;
|
||||
$jamoMedial {b} <> $Bf;
|
||||
$jamoMedial {c} <> $Cf;
|
||||
$jamoMedial {d} <> $Df;
|
||||
$jamoMedial {gg} <> $GGf;
|
||||
$jamoMedial {gs} <> $GS;
|
||||
$jamoMedial {g} <> $Gf;
|
||||
$jamoMedial {h} <> $Hf;
|
||||
$jamoMedial {j} <> $Jf;
|
||||
$jamoMedial {k} <> $Kf;
|
||||
$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;
|
||||
$jamoMedial {lh} <> $LH;
|
||||
$jamoMedial {lm} <> $LM;
|
||||
$jamoMedial {lp} <> $LP;
|
||||
$jamoMedial {ls} <> $LS;
|
||||
$jamoMedial {lt} <> $LT;
|
||||
$jamoMedial {l} <> $L;
|
||||
$jamoMedial {m} <> $Mf;
|
||||
$jamoMedial {ng} <> $NG;
|
||||
$jamoMedial {nh} <> $NH;
|
||||
$jamoMedial {nj} <> $NJ;
|
||||
$jamoMedial {n} <> $Nf;
|
||||
$jamoMedial {p} <> $Pf;
|
||||
$jamoMedial {ss} <> $SSf;
|
||||
$jamoMedial {s} <> $Sf;
|
||||
$jamoMedial {t} <> $Tf;
|
||||
|
||||
# Initials: Attach single consonant to following medial. Do this
|
||||
# AFTER mapping finals. Longer keys must precede shorter keys that
|
||||
# they start with, e.g., the rule for 'gg' must precede 'g'.
|
||||
|
||||
# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
|
||||
# this block for Jamo-Latin.]
|
||||
|
||||
{gg} $latinMedial <> $GGi;
|
||||
{g} $latinMedial <> $Gi;
|
||||
{n} $latinMedial <> $Ni;
|
||||
{dd} $latinMedial <> $DD;
|
||||
{d} $latinMedial <> $Di;
|
||||
{r} $latinMedial <> $R;
|
||||
{m} $latinMedial <> $Mi;
|
||||
{bb} $latinMedial <> $BB;
|
||||
{b} $latinMedial <> $Bi;
|
||||
{ss} $latinMedial <> $SSi;
|
||||
{s} $latinMedial <> $Si;
|
||||
{jj} $latinMedial <> $JJ;
|
||||
{j} $latinMedial <> $Ji;
|
||||
{c} $latinMedial <> $Ci;
|
||||
{k} $latinMedial <> $Ki;
|
||||
{t} $latinMedial <> $Ti;
|
||||
{p} $latinMedial <> $Pi;
|
||||
{h} $latinMedial <> $Hi;
|
||||
|
||||
# 'r' in final position. Because of the equivalency of the 'l' and
|
||||
# 'r' jamo (the glyphs are the same), we try to provide the same
|
||||
# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
|
||||
# below. If we see an 'r' in an apparent final position, treat it
|
||||
# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
|
||||
# Instead, we want Ki A L Ki A.
|
||||
|
||||
$jamoMedial {r} $latinInitial > | l;
|
||||
|
||||
# Initial + Final: If we match the next rule, we have initial then
|
||||
# final consonant with no intervening medial. We insert the null
|
||||
# vowel BEFORE it to create a well-formed syllable. (In the next rule
|
||||
# we insert a null vowel AFTER an anomalous initial.)
|
||||
|
||||
$jamoInitial {} [bcdghjklmnpst] > $EU;
|
||||
|
||||
# Initial + X: This block matches an initial consonant not followed by
|
||||
# a medial. We insert the null vowel after it. We handle double
|
||||
# initials explicitly here; for single initial consonants we insert EU
|
||||
# (as Latin) after them and let standard rules do the rest.
|
||||
|
||||
# BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
gg > $GGi $EU;
|
||||
dd > $DD $EU;
|
||||
bb > $BB $EU;
|
||||
ss > $SSi $EU;
|
||||
jj > $JJ $EU;
|
||||
|
||||
([bcdghjkmnprst]) > | $1 eu;
|
||||
|
||||
# X + Final: Finally we have to deal with a consonant that can only be
|
||||
# interpreted as a final (not an initial) and which is preceded
|
||||
# neither by an initial nor a medial. It is the start of the
|
||||
# syllable, but cannot be. Most of these will already be handled by
|
||||
# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
|
||||
# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
|
||||
# For this isolated case, we could add a null initial and medial,
|
||||
# which would give "la" => IEUNG EU L IEUNG A, for example. A more
|
||||
# economical solution is to transliterate isolated "l" (that is,
|
||||
# initial "l") to "r". (Other similar conversions of consonants that
|
||||
# occur neither as initials nor as finals are handled below.)
|
||||
|
||||
l > | r;
|
||||
|
||||
# Medials. If a medial is preceded by an initial, then we proceed
|
||||
# normally. As usual, longer keys must precede shorter ones.
|
||||
|
||||
# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
|
||||
# this block for Jamo-Latin.]
|
||||
|
||||
$jamoInitial {ae} <> $AE;
|
||||
$jamoInitial {a} <> $A;
|
||||
$jamoInitial {eo} <> $EO;
|
||||
$jamoInitial {eu} <> $EU;
|
||||
$jamoInitial {e} <> $E;
|
||||
$jamoInitial {i} <> $I;
|
||||
$jamoInitial {oe} <> $OE;
|
||||
$jamoInitial {o} <> $O;
|
||||
$jamoInitial {u} <> $U;
|
||||
$jamoInitial {wae} <> $WAE;
|
||||
$jamoInitial {wa} <> $WA;
|
||||
$jamoInitial {weo} <> $WEO;
|
||||
$jamoInitial {we} <> $WE;
|
||||
$jamoInitial {wi} <> $WI;
|
||||
$jamoInitial {yae} <> $YAE;
|
||||
$jamoInitial {ya} <> $YA;
|
||||
$jamoInitial {yeo} <> $YEO;
|
||||
$jamoInitial {ye} <> $YE;
|
||||
$jamoInitial {yi} <> $YI;
|
||||
$jamoInitial {yo} <> $YO;
|
||||
$jamoInitial {yu} <> $YU;
|
||||
|
||||
# We may see an anomalous isolated 'w' or 'y'. In that case, we
|
||||
# interpret it as 'wi' and 'yu', respectively.
|
||||
|
||||
# BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
$jamoInitial {w} > | wi;
|
||||
$jamoInitial {y} > | yu;
|
||||
|
||||
# Otherwise, insert a null consonant IEUNG before the medial (which is
|
||||
# still an untransliterated latin vowel).
|
||||
|
||||
($latinMedial) > $IEUNG | $1;
|
||||
|
||||
# Convert non-jamo latin consonants to equivalents. These occur as
|
||||
# neither initials nor finals in jamo. 'l' occurs as a final, but not
|
||||
# an initial; it is handled above. The following letters (left hand
|
||||
# side) will never be output by Jamo-Latin.
|
||||
|
||||
f > | p;
|
||||
q > | k;
|
||||
v > | b;
|
||||
x > | ks;
|
||||
z > | s;
|
||||
|
||||
# Delete hyphens (Latin-Jamo).
|
||||
|
||||
'-' > ;
|
||||
|
||||
# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
|
||||
# since these may also occur in text.
|
||||
|
||||
< $IEUNG;
|
||||
:: ([[:Hangul:]&[\uFF00-\uFFFF]] NFKD);
|
||||
|
||||
# eof
|
@ -51,9 +51,9 @@ Greek-Latin:file:Transliterator_Greek_Latin.txt:UTF8:FORWARD
|
||||
Latin-el:file:Transliterator_el_Latin.txt:UTF8:REVERSE
|
||||
el-Latin:file:Transliterator_el_Latin.txt:UTF8:FORWARD
|
||||
|
||||
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.utf8.txt:UTF8:FORWARD
|
||||
Latin-Jamo:alias:Any-Lower;LowerLatin-Jamo
|
||||
Jamo-Latin:file:Transliterator_Latin_Jamo.utf8.txt:UTF8:REVERSE
|
||||
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.txt:UTF8:FORWARD
|
||||
Latin-Jamo:alias:[:Latin:]Any-Lower;LowerLatin-Jamo
|
||||
Jamo-Latin:file:Transliterator_Latin_Jamo.txt:UTF8:REVERSE
|
||||
|
||||
Latin-Katakana:file:Transliterator_Latin_Katakana.txt:UTF8:FORWARD
|
||||
Katakana-Latin:file:Transliterator_Latin_Katakana.txt:UTF8:REVERSE
|
||||
@ -78,8 +78,8 @@ Kanji-OnRomaji:file:Transliterator_Kanji_OnRomaji.utf8.txt:UTF8:FORWARD
|
||||
|
||||
# Compound rules
|
||||
|
||||
Latin-Hangul:alias:[\p{Latin}];Latin-Jamo;[\u1100-\u11FF]NFC
|
||||
Hangul-Latin:alias:[\uAC00-\uD7AF];NFD;Jamo-Latin
|
||||
Latin-Hangul:alias:[\p{Latin}];NFD;Latin-Jamo;NFC
|
||||
Hangul-Latin:alias:[:Hangul:];NFD;Jamo-Latin
|
||||
|
||||
# Inter-Indic composed rules
|
||||
Latin-InterIndic:internal:Transliterator_Latin_InterIndic.txt:UTF8:FORWARD
|
||||
|
Loading…
Reference in New Issue
Block a user