// -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpICUrules.bat
// Source: ../../text/resources/Transliterator_Latin_Jamo.txt
// Date: Mon Nov 19 12:15:36 2001
//--------------------------------------------------------------------

// Latin_Jamo

translit_Latin_Jamo {
Rule {
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------

// Latin-Jamo

":: [:Latin:] NFKD ();"
":: [:Latin:] Lower ();"

// Transliteration from Latin characters to Korean script is done in
// two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
// transliteration is done algorithmically following Unicode 3.0
// section 3.11. This file implements the Latin to Jamo
// transliteration using rules.

// Jamo occupy the block 1100-11FF. Within this block there are three
// groups of characters: initial consonants or choseong (I), medial
// vowels or jungseong (M), and trailing consonants or jongseong (F).
// Standard Korean syllables are of the form I+M+F*.

// Section 3.11 describes the use of 'filler' jamo to convert
// nonstandard syllables to standard form: the choseong filler 115F and
// the jungseong filler 1160. In this transliterator, we will not use
// 115F or 1160.

// We will, however, insert two 'null' jamo to make foreign words
// conform to Korean syllable structure. These are the null initial
// consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
// we will use the hyphen in order to disambiguate strings,
// e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
// We will not use all of the characters in the jamo block. We will
// only use the 19 initials, 21 medials, and 27 finals possessing a
// jamo short name as defined in section 4.4 of the Unicode book.

// Rules of thumb. These guidelines provide the basic framework
// for the rules. They are phrased in terms of Latin-Jamo transliteration.
// The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
// just context-free transliteration of jamo to corresponding short names,
// with the addition of hyphens to maintain round-trip integrity
// in the context of the Latin-Jamo rules.

// A sequence of vowels:
// - Take the longest sequence you can. If there are too many, or you don't
//   have a starting consonant, introduce a 110B as necessary.

// A sequence of consonants:
// - First join the double consonants: G + G -> GG
// - In the remaining list,
// -- If there is no preceding vowel, take the first consonant, and insert EU
//    after it. Continue with the rest of the consonants.
// -- If there is one consonant, attach to the following vowel
// -- If there are two consonants and a following vowel, attach one to the
//    preceding vowel, and one to the following vowel.
// -- If there are more than two consonants, join the first two together if you
//    can: L + G => LG
// -- If you still end up with more than 2 consonants, insert EU after the
//    first one, and continue with the rest of the consonants.

//----------------------------------------------------------------------
// Variables

// Some latin consonants or consonant pairs only occur as initials, and
// some only as finals, but some occur as both. This makes some jamo
// consonants ambiguous when transliterated into latin.
// Initial only: IEUNG BB DD JJ R
// Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
// Initial and Final: B C D G GG H J K M N P S SS T

// Initial consonants, U+1100..U+1112. Names that also exist as a final
// carry an 'i' suffix; initial-only names have no suffix.
"$Gi = \u1100;"
"$GGi = \u1101;"
"$Ni = \u1102;"
"$Di = \u1103;"
"$DD = \u1104;"
"$R = \u1105;"
"$Mi = \u1106;"
"$Bi = \u1107;"
"$BB = \u1108;"
"$Si = \u1109;"
"$SSi = \u110A;"
"$IEUNG = \u110B;" // null initial, inserted during Latin-Jamo
"$Ji = \u110C;"
"$JJ = \u110D;"
"$Ci = \u110E;"
"$Ki = \u110F;"
"$Ti = \u1110;"
"$Pi = \u1111;"
"$Hi = \u1112;"

// Medial vowels, U+1161..U+1175.
"$A = \u1161;"
"$AE = \u1162;"
"$YA = \u1163;"
"$YAE = \u1164;"
"$EO = \u1165;"
"$E = \u1166;"
"$YEO = \u1167;"
"$YE = \u1168;"
"$O = \u1169;"
"$WA = \u116A;"
"$WAE = \u116B;"
"$OE = \u116C;"
"$YO = \u116D;"
"$U = \u116E;"
"$WEO = \u116F;"
"$WE = \u1170;"
"$WI = \u1171;"
"$YU = \u1172;"
"$EU = \u1173;" // null medial, inserted during Latin-Jamo
"$YI = \u1174;"
"$I = \u1175;"

// Final consonants, U+11A8..U+11C2. Names that also exist as an initial
// carry an 'f' suffix; final-only names have no suffix.
"$Gf = \u11A8;"
"$GGf = \u11A9;"
"$GS = \u11AA;"
"$Nf = \u11AB;"
"$NJ = \u11AC;"
"$NH = \u11AD;"
"$Df = \u11AE;"
"$L = \u11AF;"
"$LG = \u11B0;"
"$LM = \u11B1;"
"$LB = \u11B2;"
"$LS = \u11B3;"
"$LT = \u11B4;"
"$LP = \u11B5;"
"$LH = \u11B6;"
"$Mf = \u11B7;"
"$Bf = \u11B8;"
"$BS = \u11B9;"
"$Sf = \u11BA;"
"$SSf = \u11BB;"
"$NG = \u11BC;"
"$Jf = \u11BD;"
"$Cf = \u11BE;"
"$Kf = \u11BF;"
"$Tf = \u11C0;"
"$Pf = \u11C1;"
"$Hf = \u11C2;"

// Character classes used as rule context.
"$jamoInitial = [\u1100-\u1112];"
"$jamoMedial = [\u1161-\u1175];"
"$latinInitial = [bcdghjkmnprst];"

// Any character in the latin transliteration of a medial
"$latinMedial = [aeiouwy];"

// The last character of the latin transliteration of a medial
"$latinMedialEnd = [aeiou];"

//----------------------------------------------------------------------
// Jamo-Latin

// Jamo to latin is relatively simple, since it is the latin that is
// ambiguous.
// Most rules are straightforward, and we encode them below
// as a simple add-on back rule, e.g.:
//   $jamoMedial {bs} > $BS;
// becomes
//   $jamoMedial {bs} <> $BS;

// Furthermore, we don't care about the ordering for Jamo-Latin because
// we are going from single characters, so we can very easily piggyback
// on the Latin-Jamo.

// The main issue with Jamo-Latin is when to insert hyphens.
// Hyphens are inserted to obtain correct round trip behavior. For
// example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
// would then round trip to Ki A GGi E. To prevent this, we insert a
// hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
// very specifically on the behavior of the Latin-Jamo rules. A change
// in the Latin-Jamo behavior can completely change the way the
// hyphen insertion must be done.

// First try to preserve actual hyphens in the jamo text by doubling
// them. This fixes problems like:
//   (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
//   => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L).
// This is optional -- if we don't care about losing hyphens in the
// jamo, we can delete this rule.

"'--' <> '-';"

// Triple consonants. For three consonants "axxx" we insert a
// hyphen between the first and second "x" if XXf, Xf, and Xi all
// exist, and we have A Xf XXi. This prevents the reverse
// transliteration to A XXf Xi.

"'-' < $latinMedialEnd g {} $GGi;"
"'-' < $latinMedialEnd s {} $SSi;"

// For vowels the rule is similar. If there is a vowel "ae" such that
// "a" by itself and "e" by itself are vowels, then we want to map A E
// to "a-e" so as not to round trip to AE. However, in the text Ki EO
// IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
// vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
// tested. NOTE: These rules used to have a left context of
// $latinInitial instead of [^$latinMedial].
// The problem with this is
// sequences where an initial IEUNG is transliterated away:
//   (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)

"'-' < [^$latinMedial] [y w] e {} [$O $OE];"
"'-' < [^$latinMedial] e {} [$O $OE $U];"
"'-' < [^$latinMedial] [o a] {} [$E $EO $EU];"
"'-' < [^$latinMedial] [w y] a {} [$E $EO $EU];"

// Similar to the above, but with an intervening $IEUNG.

"'-' < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];"
"'-' < [^$latinMedial] e {} $IEUNG [$O $OE $U];"
"'-' < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];"
"'-' < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];"

// Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
// where Xi also exists, must be transliterated as "ax-e" to prevent
// the round trip conversion to A Xi E.

"'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;"

// Double finals followed by IEUNG. Similar to the single finals
// followed by IEUNG. Any latin consonant pair X Y, between medials,
// that we would split by Latin-Jamo, we must handle when it occurs as
// part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
// E.

"'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;"
"'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;"

// Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
// we transliterate as "ax-xe" to prevent round trip transliteration as
// A XXi E.

"'-' < $latinMedialEnd b {} $Bi $jamoMedial;"
"'-' < $latinMedialEnd d {} $Di $jamoMedial;"
"'-' < $latinMedialEnd j {} $Ji $jamoMedial;"
"'-' < $latinMedialEnd g {} $Gi $jamoMedial;"
"'-' < $latinMedialEnd s {} $Si $jamoMedial;"

// XYY. This corresponds to the XYY rule in Latin-Jamo. By default
// Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
// "xyy" forms that correspond to XYf Yi must be transliterated as
// "xy-y".

"'-' < $latinMedialEnd b s {} [$Si $SSi];"
"'-' < $latinMedialEnd g s {} [$Si $SSi];"
"'-' < $latinMedialEnd l b {} [$Bi $BB];"
"'-' < $latinMedialEnd l g {} [$Gi $GGi];"
"'-' < $latinMedialEnd l s {} [$Si $SSi];"
"'-' < $latinMedialEnd n g {} [$Gi $GGi];"
"'-' < $latinMedialEnd n j {} [$Ji $JJ];"

// Deletion of IEUNG is handled below.

//----------------------------------------------------------------------
// Latin-Jamo

// [Basic, context-free Jamo-Latin rules are embedded here too. See
// above.]
// Split digraphs: Text of the form 'axye', where 'xy' is a final
// digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
// 'e' are medials, we want to transliterate this as A Xf Yi E rather
// than A XYf IEUNG E. We do NOT include text of the form "axxe",
// since that is handled differently below. These rules are generated
// programmatically from the jamo data.

"$jamoMedial {b s} $latinMedial > $Bf $Si;"
"$jamoMedial {g s} $latinMedial > $Gf $Si;"
"$jamoMedial {l b} $latinMedial > $L $Bi;"
"$jamoMedial {l g} $latinMedial > $L $Gi;"
"$jamoMedial {l h} $latinMedial > $L $Hi;"
"$jamoMedial {l m} $latinMedial > $L $Mi;"
"$jamoMedial {l p} $latinMedial > $L $Pi;"
"$jamoMedial {l s} $latinMedial > $L $Si;"
"$jamoMedial {l t} $latinMedial > $L $Ti;"
"$jamoMedial {n g} $latinMedial > $Nf $Gi;"
"$jamoMedial {n h} $latinMedial > $Nf $Hi;"
"$jamoMedial {n j} $latinMedial > $Nf $Ji;"

// Single consonants are initials: Text of the form 'axe', where 'x'
// can be an initial or a final, and 'a' and 'e' are medials, we want
// to transliterate as A Xi E rather than A Xf IEUNG E.

"$jamoMedial {b} $latinMedial > $Bi;"
"$jamoMedial {c} $latinMedial > $Ci;"
"$jamoMedial {d} $latinMedial > $Di;"
"$jamoMedial {g} $latinMedial > $Gi;"
"$jamoMedial {h} $latinMedial > $Hi;"
"$jamoMedial {j} $latinMedial > $Ji;"
"$jamoMedial {k} $latinMedial > $Ki;"
"$jamoMedial {m} $latinMedial > $Mi;"
"$jamoMedial {n} $latinMedial > $Ni;"
"$jamoMedial {p} $latinMedial > $Pi;"
"$jamoMedial {s} $latinMedial > $Si;"
"$jamoMedial {t} $latinMedial > $Ti;"

// Doubled initials. The sequence "axxe", where XX exists as an initial
// (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
// to transliterate as A XXi E, rather than split to A Xf Xi E.

"$jamoMedial {b b} $latinMedial > $BB;"
"$jamoMedial {d d} $latinMedial > $DD;"
"$jamoMedial {j j} $latinMedial > $JJ;"
"$jamoMedial {g g} $latinMedial > $GGi;"
"$jamoMedial {s s} $latinMedial > $SSi;"

// XYY.
// Because doubled consonants bind more strongly than XY
// consonants, we must handle the sequence "axyy" specially. Here XYf
// and YYi must exist. In these cases, we map to Xf YYi rather than
// XYf.

"$jamoMedial {b} s s > $Bf;"
"$jamoMedial {g} s s > $Gf;"
"$jamoMedial {l} b b > $L;"
"$jamoMedial {l} g g > $L;"
"$jamoMedial {l} s s > $L;"
"$jamoMedial {n} g g > $Nf;"
"$jamoMedial {n} j j > $Nf;"

// Finals: Attach consonant with preceding medial to preceding medial.
// Do this BEFORE mapping consonants to initials. Longer keys must
// precede shorter keys that they start with, e.g., the rule for 'bs'
// must precede 'b'.

// [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
// block for Jamo-Latin.]

"$jamoMedial {bs} <> $BS;"
"$jamoMedial {b} <> $Bf;"
"$jamoMedial {c} <> $Cf;"
"$jamoMedial {d} <> $Df;"
"$jamoMedial {gg} <> $GGf;"
"$jamoMedial {gs} <> $GS;"
"$jamoMedial {g} <> $Gf;"
"$jamoMedial {h} <> $Hf;"
"$jamoMedial {j} <> $Jf;"
"$jamoMedial {k} <> $Kf;"
// Note: the next string literal holds two rules (lb and lg).
"$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;"
"$jamoMedial {lh} <> $LH;"
"$jamoMedial {lm} <> $LM;"
"$jamoMedial {lp} <> $LP;"
"$jamoMedial {ls} <> $LS;"
"$jamoMedial {lt} <> $LT;"
"$jamoMedial {l} <> $L;"
"$jamoMedial {m} <> $Mf;"
"$jamoMedial {ng} <> $NG;"
"$jamoMedial {nh} <> $NH;"
"$jamoMedial {nj} <> $NJ;"
"$jamoMedial {n} <> $Nf;"
"$jamoMedial {p} <> $Pf;"
"$jamoMedial {ss} <> $SSf;"
"$jamoMedial {s} <> $Sf;"
"$jamoMedial {t} <> $Tf;"

// Initials: Attach single consonant to following medial. Do this
// AFTER mapping finals. Longer keys must precede shorter keys that
// they start with, e.g., the rule for 'gg' must precede 'g'.

// [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
// this block for Jamo-Latin.]

"{gg} $latinMedial <> $GGi;"
"{g} $latinMedial <> $Gi;"
"{n} $latinMedial <> $Ni;"
"{dd} $latinMedial <> $DD;"
"{d} $latinMedial <> $Di;"
"{r} $latinMedial <> $R;"
"{m} $latinMedial <> $Mi;"
"{bb} $latinMedial <> $BB;"
"{b} $latinMedial <> $Bi;"
"{ss} $latinMedial <> $SSi;"
"{s} $latinMedial <> $Si;"
"{jj} $latinMedial <> $JJ;"
"{j} $latinMedial <> $Ji;"
"{c} $latinMedial <> $Ci;"
"{k} $latinMedial <> $Ki;"
"{t} $latinMedial <> $Ti;"
"{p} $latinMedial <> $Pi;"
"{h} $latinMedial <> $Hi;"

// 'r' in final position. Because of the equivalency of the 'l' and
// 'r' jamo (the glyphs are the same), we try to provide the same
// equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
// below. If we see an 'r' in an apparent final position, treat it
// like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
// Instead, we want Ki A L Ki A.

"$jamoMedial {r} $latinInitial > | l;"

// Initial + Final: If we match the next rule, we have initial then
// final consonant with no intervening medial. We insert the null
// vowel BEFORE it to create a well-formed syllable. (In the next rule
// we insert a null vowel AFTER an anomalous initial.)

"$jamoInitial {} [bcdghjklmnpst] > $EU;"

// Initial + X: This block matches an initial consonant not followed by
// a medial. We insert the null vowel after it. We handle double
// initials explicitly here; for single initial consonants we insert EU
// (as Latin) after them and let standard rules do the rest.

// BREAKS ROUND TRIP INTEGRITY

"gg > $GGi $EU;"
"dd > $DD $EU;"
"bb > $BB $EU;"
"ss > $SSi $EU;"
"jj > $JJ $EU;"

"([bcdghjkmnprst]) > | $1 eu;"

// X + Final: Finally we have to deal with a consonant that can only be
// interpreted as a final (not an initial) and which is preceded
// neither by an initial nor a medial. It is the start of the
// syllable, but cannot be. Most of these will already be handled by
// the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
// 'nh' 'nj'.
// The only problem is 'l' and digraphs starting with 'l'.
// For this isolated case, we could add a null initial and medial,
// which would give "la" => IEUNG EU L IEUNG A, for example. A more
// economical solution is to transliterate isolated "l" (that is,
// initial "l") to "r". (Other similar conversions of consonants that
// occur neither as initials nor as finals are handled below.)

"l > | r;"

// Medials. If a medial is preceded by an initial, then we proceed
// normally. As usual, longer keys must precede shorter ones.

// [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
// this block for Jamo-Latin.]

"$jamoInitial {ae} <> $AE;"
"$jamoInitial {a} <> $A;"
"$jamoInitial {eo} <> $EO;"
"$jamoInitial {eu} <> $EU;"
"$jamoInitial {e} <> $E;"
"$jamoInitial {i} <> $I;"
"$jamoInitial {oe} <> $OE;"
"$jamoInitial {o} <> $O;"
"$jamoInitial {u} <> $U;"
"$jamoInitial {wae} <> $WAE;"
"$jamoInitial {wa} <> $WA;"
"$jamoInitial {weo} <> $WEO;"
"$jamoInitial {we} <> $WE;"
"$jamoInitial {wi} <> $WI;"
"$jamoInitial {yae} <> $YAE;"
"$jamoInitial {ya} <> $YA;"
"$jamoInitial {yeo} <> $YEO;"
"$jamoInitial {ye} <> $YE;"
"$jamoInitial {yi} <> $YI;"
"$jamoInitial {yo} <> $YO;"
"$jamoInitial {yu} <> $YU;"

// We may see an anomalous isolated 'w' or 'y'. In that case, we
// interpret it as 'wi' and 'yu', respectively.

// BREAKS ROUND TRIP INTEGRITY

"$jamoInitial {w} > | wi;"
"$jamoInitial {y} > | yu;"

// Otherwise, insert a null consonant IEUNG before the medial (which is
// still an untransliterated latin vowel).

"($latinMedial) > $IEUNG | $1;"

// Convert non-jamo latin consonants to equivalents. These occur as
// neither initials nor finals in jamo. 'l' occurs as a final, but not
// an initial; it is handled above. The following letters (left hand
// side) will never be output by Jamo-Latin.

"f > | p;"
"q > | k;"
"v > | b;"
"x > | ks;"
"z > | s;"

// Delete hyphens (Latin-Jamo).

"'-' > ;"

// Delete null consonants (Jamo-Latin).
// Do NOT delete null EU vowels,
// since these may also occur in text.

"< $IEUNG;"

":: ([[:Hangul:]&[\uFF00-\uFFFF]] NFKD);"

// eof
}
}