a82c1b4547
X-SVN-Rev: 7078
537 lines
19 KiB
Plaintext
537 lines
19 KiB
Plaintext
// -*- Coding: utf-8; -*-
|
|
//--------------------------------------------------------------------
|
|
// Copyright (c) 1999-2001, International Business Machines
|
|
// Corporation and others. All Rights Reserved.
|
|
//--------------------------------------------------------------------
|
|
// THIS IS A MACHINE-GENERATED FILE
|
|
// Tool: dumpICUrules.bat
|
|
// Source: ../../text/resources/Transliterator_Latin_Jamo.txt
|
|
// Date: Wed Nov 21 18:58:49 2001
|
|
//--------------------------------------------------------------------
|
|
|
|
// Latin_Jamo
|
|
|
|
translit_Latin_Jamo {
|
|
Rule {
|
|
//--------------------------------------------------------------------
|
|
// Copyright (c) 1999-2001, International Business Machines
|
|
// Corporation and others. All Rights Reserved.
|
|
//--------------------------------------------------------------------
|
|
|
|
// note: a global filter is more efficient, but MUST include all source chars
|
|
":: [\\u0000-\u00FF [:Latin:]] ;"
|
|
|
|
":: NFKD (NFC);"
|
|
":: Lower ();"
|
|
|
|
// Transliteration from Latin characters to Korean script is done in
|
|
// two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
|
|
// transliteration is done algorithmically following Unicode 3.0
|
|
// section 3.11. This file implements the Latin to Jamo
|
|
// transliteration using rules.
|
|
|
|
// Jamo occupy the block 1100-11FF. Within this block there are three
|
|
// groups of characters: initial consonants or choseong (I), medial
|
|
// vowels or jungseong (M), and trailing consonants or jongseong (F).
|
|
// Standard Korean syllables are of the form I+M+F*.
|
|
|
|
// Section 3.11 describes the use of 'filler' jamo to convert
|
|
// nonstandard syllables to standard form: the choseong filler 115F and
|
|
// the jungseong filler 1160. In this transliterator, we will not use
|
|
// 115F or 1160.
|
|
|
|
// We will, however, insert two 'null' jamo to make foreign words
|
|
// conform to Korean syllable structure. These are the null initial
|
|
// consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
|
|
// we will use the hyphen in order to disambiguate strings,
|
|
// e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
|
|
|
|
// We will not use all of the characters in the jamo block. We will
|
|
// only use the 19 initials, 21 medials, and 27 finals possessing a
|
|
// jamo short name as defined in section 4.4 of the Unicode book.
|
|
|
|
// Rules of thumb. These guidelines provide the basic framework
|
|
// for the rules. They are phrased in terms of Latin-Jamo transliteration.
|
|
// The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
|
|
// just context-free transliteration of jamo to corresponding short names,
|
|
// with the addition of hyphens to maintain round-trip integrity
|
|
// in the context of the Latin-Jamo rules.
|
|
|
|
// A sequence of vowels:
|
|
// - Take the longest sequence you can. If there are too many, or you don't
|
|
// have a starting consonant, introduce a 110B as necessary.
|
|
|
|
// A sequence of consonants.
|
|
// - First join the double consonants: G + G -> GG
|
|
// - In the remaining list,
|
|
// -- If there is no preceding vowel, take the first consonant, and insert EU
|
|
// after it. Continue with the rest of the consonants.
|
|
// -- If there is one consonant, attach to the following vowel
|
|
// -- If there are two consonants and a following vowel, attach one to the
|
|
// preceding vowel, and one to the following vowel.
|
|
// -- If there are more than two consonants, join the first two together if you
|
|
// can: L + G => LG
|
|
// -- If you still end up with more than 2 consonants, insert EU after the
|
|
// first one, and continue with the rest of the consonants.
|
|
|
|
//----------------------------------------------------------------------
|
|
// Variables
|
|
|
|
// Some latin consonants or consonant pairs only occur as initials, and
|
|
// some only as finals, but some occur as both. This makes some jamo
|
|
// consonants ambiguous when transliterated into latin.
|
|
// Initial only: IEUNG BB DD JJ R
|
|
// Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
|
|
// Initial and Final: B C D G GG H J K M N P S SS T
|
|
|
|
"$Gi = \u1100;"
|
|
"$GGi = \u1101;"
|
|
"$Ni = \u1102;"
|
|
"$Di = \u1103;"
|
|
"$DD = \u1104;"
|
|
"$R = \u1105;"
|
|
"$Mi = \u1106;"
|
|
"$Bi = \u1107;"
|
|
"$BB = \u1108;"
|
|
"$Si = \u1109;"
|
|
"$SSi = \u110A;"
|
|
"$IEUNG = \u110B;" // null initial, inserted during Latin-Jamo
|
|
"$Ji = \u110C;"
|
|
"$JJ = \u110D;"
|
|
"$Ci = \u110E;"
|
|
"$Ki = \u110F;"
|
|
"$Ti = \u1110;"
|
|
"$Pi = \u1111;"
|
|
"$Hi = \u1112;"
|
|
|
|
"$A = \u1161;"
|
|
"$AE = \u1162;"
|
|
"$YA = \u1163;"
|
|
"$YAE = \u1164;"
|
|
"$EO = \u1165;"
|
|
"$E = \u1166;"
|
|
"$YEO = \u1167;"
|
|
"$YE = \u1168;"
|
|
"$O = \u1169;"
|
|
"$WA = \u116A;"
|
|
"$WAE = \u116B;"
|
|
"$OE = \u116C;"
|
|
"$YO = \u116D;"
|
|
"$U = \u116E;"
|
|
"$WEO = \u116F;"
|
|
"$WE = \u1170;"
|
|
"$WI = \u1171;"
|
|
"$YU = \u1172;"
|
|
"$EU = \u1173;" // null medial, inserted during Latin-Jamo
|
|
"$YI = \u1174;"
|
|
"$I = \u1175;"
|
|
|
|
"$Gf = \u11A8;"
|
|
"$GGf = \u11A9;"
|
|
"$GS = \u11AA;"
|
|
"$Nf = \u11AB;"
|
|
"$NJ = \u11AC;"
|
|
"$NH = \u11AD;"
|
|
"$Df = \u11AE;"
|
|
"$L = \u11AF;"
|
|
"$LG = \u11B0;"
|
|
"$LM = \u11B1;"
|
|
"$LB = \u11B2;"
|
|
"$LS = \u11B3;"
|
|
"$LT = \u11B4;"
|
|
"$LP = \u11B5;"
|
|
"$LH = \u11B6;"
|
|
"$Mf = \u11B7;"
|
|
"$Bf = \u11B8;"
|
|
"$BS = \u11B9;"
|
|
"$Sf = \u11BA;"
|
|
"$SSf = \u11BB;"
|
|
"$NG = \u11BC;"
|
|
"$Jf = \u11BD;"
|
|
"$Cf = \u11BE;"
|
|
"$Kf = \u11BF;"
|
|
"$Tf = \u11C0;"
|
|
"$Pf = \u11C1;"
|
|
"$Hf = \u11C2;"
|
|
|
|
"$jamoInitial = [\u1100-\u1112];"
|
|
|
|
"$jamoMedial = [\u1161-\u1175];"
|
|
|
|
"$latinInitial = [bcdghjkmnprst];"
|
|
|
|
// Any character in the latin transliteration of a medial
|
|
"$latinMedial = [aeiouwy];"
|
|
|
|
// The last character of the latin transliteration of a medial
|
|
"$latinMedialEnd = [aeiou];"
|
|
|
|
//----------------------------------------------------------------------
|
|
// Jamo-Latin
|
|
|
|
// Jamo to latin is relatively simple, since it is the latin that is
|
|
// ambiguous. Most rules are straightforward, and we encode them below
|
|
// as a simple add-on back rule, e.g.:
|
|
|
|
// $jamoMedial {bs} > $BS;
|
|
|
|
// becomes
|
|
|
|
// $jamoMedial {bs} <> $BS;
|
|
|
|
// Furthermore, we don't care about the ordering for Jamo-Latin because
|
|
// we are going from single characters, so we can very easily piggyback
|
|
// on the Latin-Jamo.
|
|
|
|
// The main issue with Jamo-Latin is when to insert hyphens.
|
|
// Hyphens are inserted to obtain correct round trip behavior. For
|
|
// example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
|
|
// would then round trip to Ki A GGi E. To prevent this, we insert a
|
|
// hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
|
|
// very specifically on the behavior of the Latin-Jamo rules. A change
|
|
// in the Latin-Jamo behavior can completely change the way the
|
|
// hyphen insertion must be done.
|
|
|
|
// First try to preserve actual hyphens in the jamo text by doubling
|
|
// them. This fixes problems like:
|
|
// (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
|
|
// => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
|
|
// -- if we don't care about losing hyphens in the jamo, we can delete
|
|
// this rule.
|
|
|
|
"'--' <> '-';"
|
|
|
|
// Triple consonants. For three consonants "axxx" we insert a
|
|
// hyphen between the first and second "x" if XXf, Xf, and Xi all
|
|
// exist, and we have A Xf XXi. This prevents the reverse
|
|
// transliteration to A XXf Xi.
|
|
|
|
"'-' < $latinMedialEnd g {} $GGi;"
|
|
"'-' < $latinMedialEnd s {} $SSi;"
|
|
|
|
// For vowels the rule is similar. If there is a vowel "ae" such that
|
|
// "a" by itself and "e" by itself are vowels, then we want to map A E
|
|
// to "a-e" so as not to round trip to AE. However, in the text Ki EO
|
|
// IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
|
|
// vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
|
|
// tested. NOTE: These rules used to have a left context of
|
|
// $latinInitial instead of [^$latinMedial]. The problem with this is
|
|
// sequences where an initial IEUNG is transliterated away:
|
|
// (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
|
|
|
|
"'-' < [^$latinMedial] [y w] e {} [$O $OE];"
|
|
"'-' < [^$latinMedial] e {} [$O $OE $U];"
|
|
"'-' < [^$latinMedial] [o a] {} [$E $EO $EU];"
|
|
"'-' < [^$latinMedial] [w y] a {} [$E $EO $EU];"
|
|
|
|
// Similar to the above, but with an intervening $IEUNG.
|
|
|
|
"'-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];"
|
|
"'-' < [^$latinMedial] e {} $IEUNG [$O $OE $U];"
|
|
"'-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];"
|
|
"'-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];"
|
|
|
|
// Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
|
|
// where Xi also exists, must be transliterated as "ax-e" to prevent
|
|
// the round trip conversion to A Xi E.
|
|
|
|
"'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;"
|
|
|
|
// Double finals followed by IEUNG. Similar to the single finals
|
|
// followed by IEUNG. Any latin consonant pair X Y, between medials,
|
|
// that we would split by Latin-Jamo, we must handle when it occurs as
|
|
// part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
|
|
// E.
|
|
|
|
"'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;"
|
|
"'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;"
|
|
|
|
// Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
|
|
// we transliterate as "ax-xe" to prevent round trip transliteration as
|
|
// A XXi E.
|
|
|
|
"'-' < $latinMedialEnd b {} $Bi $jamoMedial;"
|
|
"'-' < $latinMedialEnd d {} $Di $jamoMedial;"
|
|
"'-' < $latinMedialEnd j {} $Ji $jamoMedial;"
|
|
"'-' < $latinMedialEnd g {} $Gi $jamoMedial;"
|
|
"'-' < $latinMedialEnd s {} $Si $jamoMedial;"
|
|
|
|
// XYY. This corresponds to the XYY rule in Latin-Jamo. By default
|
|
// Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
|
|
// "xyy" forms that correspond to XYf Yi must be transliterated as
|
|
// "xy-y".
|
|
|
|
"'-' < $latinMedialEnd b s {} [$Si $SSi];"
|
|
"'-' < $latinMedialEnd g s {} [$Si $SSi];"
|
|
"'-' < $latinMedialEnd l b {} [$Bi $BB];"
|
|
"'-' < $latinMedialEnd l g {} [$Gi $GGi];"
|
|
"'-' < $latinMedialEnd l s {} [$Si $SSi];"
|
|
"'-' < $latinMedialEnd n g {} [$Gi $GGi];"
|
|
"'-' < $latinMedialEnd n j {} [$Ji $JJ];"
|
|
|
|
// Deletion of IEUNG is handled below.
|
|
|
|
//----------------------------------------------------------------------
|
|
// Latin-Jamo
|
|
|
|
// [Basic, context-free Jamo-Latin rules are embedded here too. See
|
|
// above.]
|
|
|
|
// Split digraphs: Text of the form 'axye', where 'xy' is a final
|
|
// digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
|
|
// 'e' are medials, we want to transliterate this as A Xf Yi E rather
|
|
// than A XYf IEUNG E. We do NOT include text of the form "axxe",
|
|
// since that is handled differently below. These rules are generated
|
|
// programmatically from the jamo data.
|
|
|
|
"$jamoMedial {b s} $latinMedial > $Bf $Si;"
|
|
"$jamoMedial {g s} $latinMedial > $Gf $Si;"
|
|
"$jamoMedial {l b} $latinMedial > $L $Bi;"
|
|
"$jamoMedial {l g} $latinMedial > $L $Gi;"
|
|
"$jamoMedial {l h} $latinMedial > $L $Hi;"
|
|
"$jamoMedial {l m} $latinMedial > $L $Mi;"
|
|
"$jamoMedial {l p} $latinMedial > $L $Pi;"
|
|
"$jamoMedial {l s} $latinMedial > $L $Si;"
|
|
"$jamoMedial {l t} $latinMedial > $L $Ti;"
|
|
"$jamoMedial {n g} $latinMedial > $Nf $Gi;"
|
|
"$jamoMedial {n h} $latinMedial > $Nf $Hi;"
|
|
"$jamoMedial {n j} $latinMedial > $Nf $Ji;"
|
|
|
|
// Single consonants are initials: Text of the form 'axe', where 'x'
|
|
// can be an initial or a final, and 'a' and 'e' are medials, we want
|
|
// to transliterate as A Xi E rather than A Xf IEUNG E.
|
|
|
|
"$jamoMedial {b} $latinMedial > $Bi;"
|
|
"$jamoMedial {c} $latinMedial > $Ci;"
|
|
"$jamoMedial {d} $latinMedial > $Di;"
|
|
"$jamoMedial {g} $latinMedial > $Gi;"
|
|
"$jamoMedial {h} $latinMedial > $Hi;"
|
|
"$jamoMedial {j} $latinMedial > $Ji;"
|
|
"$jamoMedial {k} $latinMedial > $Ki;"
|
|
"$jamoMedial {m} $latinMedial > $Mi;"
|
|
"$jamoMedial {n} $latinMedial > $Ni;"
|
|
"$jamoMedial {p} $latinMedial > $Pi;"
|
|
"$jamoMedial {s} $latinMedial > $Si;"
|
|
"$jamoMedial {t} $latinMedial > $Ti;"
|
|
|
|
// Doubled initials. The sequence "axxe", where XX exists as an initial
|
|
// (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
|
|
// to transliterate as A XXi E, rather than split to A Xf Xi E.
|
|
|
|
"$jamoMedial {b b} $latinMedial > $BB;"
|
|
"$jamoMedial {d d} $latinMedial > $DD;"
|
|
"$jamoMedial {j j} $latinMedial > $JJ;"
|
|
"$jamoMedial {g g} $latinMedial > $GGi;"
|
|
"$jamoMedial {s s} $latinMedial > $SSi;"
|
|
|
|
// XYY. Because doubled consonants bind more strongly than XY
|
|
// consonants, we must handle the sequence "axyy" specially. Here XYf
|
|
// and YYi must exist. In these cases, we map to Xf YYi rather than
|
|
// XYf.
|
|
|
|
"$jamoMedial {b} s s > $Bf;"
|
|
"$jamoMedial {g} s s > $Gf;"
|
|
"$jamoMedial {l} b b > $L;"
|
|
"$jamoMedial {l} g g > $L;"
|
|
"$jamoMedial {l} s s > $L;"
|
|
"$jamoMedial {n} g g > $Nf;"
|
|
"$jamoMedial {n} j j > $Nf;"
|
|
|
|
// Finals: Attach a consonant that follows a medial to that medial, as a
|
|
// final. Do this BEFORE mapping consonants to initials. Longer keys must
|
|
// precede shorter keys that they start with, e.g., the rule for 'bs'
|
|
// must precede 'b'. One rule per string literal, so that the ordering
|
|
// of keys can be checked by reading straight down the list.
|
|
|
|
// [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
|
|
// block for Jamo-Latin.]
|
|
|
|
"$jamoMedial {bs} <> $BS;"
|
|
"$jamoMedial {b} <> $Bf;"
|
|
"$jamoMedial {c} <> $Cf;"
|
|
"$jamoMedial {d} <> $Df;"
|
|
"$jamoMedial {gg} <> $GGf;"
|
|
"$jamoMedial {gs} <> $GS;"
|
|
"$jamoMedial {g} <> $Gf;"
|
|
"$jamoMedial {h} <> $Hf;"
|
|
"$jamoMedial {j} <> $Jf;"
|
|
"$jamoMedial {k} <> $Kf;"
|
|
"$jamoMedial {lb} <> $LB;"
|
|
"$jamoMedial {lg} <> $LG;"
|
|
"$jamoMedial {lh} <> $LH;"
|
|
"$jamoMedial {lm} <> $LM;"
|
|
"$jamoMedial {lp} <> $LP;"
|
|
"$jamoMedial {ls} <> $LS;"
|
|
"$jamoMedial {lt} <> $LT;"
|
|
"$jamoMedial {l} <> $L;"
|
|
"$jamoMedial {m} <> $Mf;"
|
|
"$jamoMedial {ng} <> $NG;"
|
|
"$jamoMedial {nh} <> $NH;"
|
|
"$jamoMedial {nj} <> $NJ;"
|
|
"$jamoMedial {n} <> $Nf;"
|
|
"$jamoMedial {p} <> $Pf;"
|
|
"$jamoMedial {ss} <> $SSf;"
|
|
"$jamoMedial {s} <> $Sf;"
|
|
"$jamoMedial {t} <> $Tf;"
|
|
|
|
// Initials: Attach single consonant to following medial. Do this
|
|
// AFTER mapping finals. Longer keys must precede shorter keys that
|
|
// they start with, e.g., the rule for 'gg' must precede 'g'.
|
|
|
|
// [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
|
|
// this block for Jamo-Latin.]
|
|
|
|
"{gg} $latinMedial <> $GGi;"
|
|
"{g} $latinMedial <> $Gi;"
|
|
"{n} $latinMedial <> $Ni;"
|
|
"{dd} $latinMedial <> $DD;"
|
|
"{d} $latinMedial <> $Di;"
|
|
"{r} $latinMedial <> $R;"
|
|
"{m} $latinMedial <> $Mi;"
|
|
"{bb} $latinMedial <> $BB;"
|
|
"{b} $latinMedial <> $Bi;"
|
|
"{ss} $latinMedial <> $SSi;"
|
|
"{s} $latinMedial <> $Si;"
|
|
"{jj} $latinMedial <> $JJ;"
|
|
"{j} $latinMedial <> $Ji;"
|
|
"{c} $latinMedial <> $Ci;"
|
|
"{k} $latinMedial <> $Ki;"
|
|
"{t} $latinMedial <> $Ti;"
|
|
"{p} $latinMedial <> $Pi;"
|
|
"{h} $latinMedial <> $Hi;"
|
|
|
|
// 'r' in final position. Because of the equivalency of the 'l' and
|
|
// 'r' jamo (the glyphs are the same), we try to provide the same
|
|
// equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
|
|
// below. If we see an 'r' in an apparent final position, treat it
|
|
// like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
|
|
// Instead, we want Ki A L Ki A.
|
|
|
|
"$jamoMedial {r} $latinInitial > | l;"
|
|
|
|
// Initial + Final: If we match the next rule, we have initial then
|
|
// final consonant with no intervening medial. We insert the null
|
|
// vowel BEFORE it to create a well-formed syllable. (In the next rule
|
|
// we insert a null vowel AFTER an anomalous initial.)
|
|
|
|
"$jamoInitial {} [bcdghjklmnpst] > $EU;"
|
|
|
|
// Initial + X: This block matches an initial consonant not followed by
|
|
// a medial. We insert the null vowel after it. We handle double
|
|
// initials explicitly here; for single initial consonants we insert EU
|
|
// (as Latin) after them and let standard rules do the rest.
|
|
|
|
// BREAKS ROUND TRIP INTEGRITY
|
|
|
|
"gg > $GGi $EU;"
|
|
"dd > $DD $EU;"
|
|
"bb > $BB $EU;"
|
|
"ss > $SSi $EU;"
|
|
"jj > $JJ $EU;"
|
|
|
|
"([bcdghjkmnprst]) > | $1 eu;"
|
|
|
|
// X + Final: Finally we have to deal with a consonant that can only be
|
|
// interpreted as a final (not an initial) and which is preceded
|
|
// neither by an initial nor a medial. It is the start of the
|
|
// syllable, but cannot be. Most of these will already be handled by
|
|
// the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
|
|
// 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
|
|
// For this isolated case, we could add a null initial and medial,
|
|
// which would give "la" => IEUNG EU L IEUNG A, for example. A more
|
|
// economical solution is to transliterate isolated "l" (that is,
|
|
// initial "l") to "r". (Other similar conversions of consonants that
|
|
// occur neither as initials nor as finals are handled below.)
|
|
|
|
"l > | r;"
|
|
|
|
// Medials. If a medial is preceded by an initial, then we proceed
|
|
// normally. As usual, longer keys must precede shorter ones.
|
|
|
|
// [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
|
|
// this block for Jamo-Latin.]
|
|
|
|
"$jamoInitial {ae} <> $AE;"
|
|
"$jamoInitial {a} <> $A;"
|
|
"$jamoInitial {eo} <> $EO;"
|
|
"$jamoInitial {eu} <> $EU;"
|
|
"$jamoInitial {e} <> $E;"
|
|
"$jamoInitial {i} <> $I;"
|
|
"$jamoInitial {oe} <> $OE;"
|
|
"$jamoInitial {o} <> $O;"
|
|
"$jamoInitial {u} <> $U;"
|
|
"$jamoInitial {wae} <> $WAE;"
|
|
"$jamoInitial {wa} <> $WA;"
|
|
"$jamoInitial {weo} <> $WEO;"
|
|
"$jamoInitial {we} <> $WE;"
|
|
"$jamoInitial {wi} <> $WI;"
|
|
"$jamoInitial {yae} <> $YAE;"
|
|
"$jamoInitial {ya} <> $YA;"
|
|
"$jamoInitial {yeo} <> $YEO;"
|
|
"$jamoInitial {ye} <> $YE;"
|
|
"$jamoInitial {yi} <> $YI;"
|
|
"$jamoInitial {yo} <> $YO;"
|
|
"$jamoInitial {yu} <> $YU;"
|
|
|
|
// We may see an anomalous isolated 'w' or 'y'. In that case, we
|
|
// interpret it as 'wi' and 'yu', respectively.
|
|
|
|
// BREAKS ROUND TRIP INTEGRITY
|
|
|
|
"$jamoInitial {w} > | wi;"
|
|
"$jamoInitial {y} > | yu;"
|
|
|
|
// Otherwise, insert a null consonant IEUNG before the medial (which is
|
|
// still an untransliterated latin vowel).
|
|
|
|
"($latinMedial) > $IEUNG | $1;"
|
|
|
|
// Convert non-jamo latin consonants to equivalents. These occur as
|
|
// neither initials nor finals in jamo. 'l' occurs as a final, but not
|
|
// an initial; it is handled above. The following letters (left hand
|
|
// side) will never be output by Jamo-Latin.
|
|
|
|
"f > | p;"
|
|
"q > | k;"
|
|
"v > | b;"
|
|
"x > | ks;"
|
|
"z > | s;"
|
|
|
|
// Delete hyphens (Latin-Jamo).
|
|
|
|
"'-' > ;"
|
|
|
|
// Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
|
|
// since these may also occur in text.
|
|
|
|
"< $IEUNG;"
|
|
":: NFC (NFKD);"
|
|
|
|
// note: a global filter is more efficient, but MUST include all source chars!!
|
|
":: ([:Hangul:]);"
|
|
|
|
// eof
|
|
}
|
|
}
|