scuffed-code/icu4c/source/data/translit/t_Latn_Jamo.txt

 // -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2002, International Business Machines
// Corporation and others.  All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpicurules.bat
// Source: ../../../impl/data/Transliterator_Latin_Jamo.txt
// Date: Sat Jul 27 10:31:07 2002
//--------------------------------------------------------------------

// Latin_Jamo

t_Latn_Jamo {
  Rule {
//--------------------------------------------------------------------
//--------------------------------------------------------------------
//--------------------------------------------------------------------

//- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
//- the INDEX file.  This transliterator is, by itself, not
//- instantiated.  It is used as a part of Latin-Jamo, Latin-Hangul, or
//- inverses thereof.

// Transliteration from Latin characters to Korean script is done in
// two steps: Latin to Jamo, then Jamo to Hangul.  The Jamo-Hangul
// transliteration is done algorithmically following Unicode 3.0
// section 3.11.  This file implements the Latin to Jamo
// transliteration using rules.

// Jamo occupy the block 1100-11FF.  Within this block there are three
// groups of characters: initial consonants or choseong (I), medial
// vowels or jungseong (M), and trailing consonants or jongseong (F).
// Standard Korean syllables are of the form I+M+F*.

// Section 3.11 describes the use of 'filler' jamo to convert
// nonstandard syllables to standard form: the choseong filler 115F and
// the jungseong filler 1160.  In this transliterator, we will not use
// 115F or 1160.

// We will, however, insert two 'null' jamo to make foreign words
// conform to Korean syllable structure.  These are the null initial
// consonant 110B (IEUNG) and the null vowel 1173 (EU).  In Latin text,
// we will use the separator in order to disambiguate strings,
// e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).

// We will not use all of the characters in the jamo block.  We will
// only use the 19 initials, 21 medials, and 27 finals possessing a
// jamo short name as defined in section 4.4 of the Unicode book.

// Rules of thumb.  These guidelines provide the basic framework
// for the rules.  They are phrased in terms of Latin-Jamo transliteration.
// The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
// just context-free transliteration of jamo to corresponding short names,
// with the addition of separators to maintain round-trip integrity
// in the context of the Latin-Jamo rules.

// A sequence of vowels:
// - Take the longest sequence you can. If there are too many, or you don't
//   have a starting consonant, introduce a 110B (IEUNG) as necessary.

// A sequence of consonants.
// - First join the double consonants: G + G -> GG
// - In the remaining list,
// -- If there is no preceding vowel, take the first consonant, and insert EU
//    after it. Continue with the rest of the consonants.
// -- If there is one consonant, attach to the following vowel
// -- If there are two consonants and a following vowel, attach one to the
//    preceding vowel, and one to the following vowel.
// -- If there are more than two consonants, join the first two together if you
//    can: L + G => LG
// -- If you still end up with more than 2 consonants, insert EU after the
//    first one, and continue with the rest of the consonants.

//----------------------------------------------------------------------
// Variables

// Some latin consonants or consonant pairs only occur as initials, and
// some only as finals, but some occur as both.  This makes some jamo
// consonants ambiguous when transliterated into latin.
//   Initial only: IEUNG BB DD JJ R
//   Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
//   Initial and Final: B C D G GG H J K M N P S SS T

  // Initial consonants (choseong), 1100-1112.  A name ending in "i" is the
  // initial form of a consonant whose latin name is also used by a final;
  // names without the suffix ($DD $R $BB $IEUNG $JJ) occur only as initials.
  "$Gi = \u1100;"
  "$GGi = \u1101;"
  "$Ni = \u1102;"
  "$Di = \u1103;"
  "$DD = \u1104;"
  "$R = \u1105;"
  "$Mi = \u1106;"
  "$Bi = \u1107;"
  "$BB = \u1108;"
  "$Si = \u1109;"
  "$SSi = \u110A;"
  "$IEUNG = \u110B;" // null initial, inserted during Latin-Jamo
  "$Ji = \u110C;"
  "$JJ = \u110D;"
  "$Ci = \u110E;"
  "$Ki = \u110F;"
  "$Ti = \u1110;"
  "$Pi = \u1111;"
  "$Hi = \u1112;"

  // Medial vowels (jungseong), 1161-1175.
  "$A = \u1161;"
  "$AE = \u1162;"
  "$YA = \u1163;"
  "$YAE = \u1164;"
  "$EO = \u1165;"
  "$E = \u1166;"
  "$YEO = \u1167;"
  "$YE = \u1168;"
  "$O = \u1169;"
  "$WA = \u116A;"
  "$WAE = \u116B;"
  "$OE = \u116C;"
  "$YO = \u116D;"
  "$U = \u116E;"
  "$WEO = \u116F;"
  "$WE = \u1170;"
  "$WI = \u1171;"
  "$YU = \u1172;"
  "$EU = \u1173;" // null medial, inserted during Latin-Jamo
  "$YI = \u1174;"
  "$I = \u1175;"

  // Trailing consonants (jongseong), 11A8-11C2.  A name ending in "f" is
  // the final form of a consonant whose latin name is also used by an
  // initial; names without the suffix occur only as finals.
  "$Gf = \u11A8;"
  "$GGf = \u11A9;"
  "$GS = \u11AA;"
  "$Nf = \u11AB;"
  "$NJ = \u11AC;"
  "$NH = \u11AD;"
  "$Df = \u11AE;"
  "$L = \u11AF;"
  "$LG = \u11B0;"
  "$LM = \u11B1;"
  "$LB = \u11B2;"
  "$LS = \u11B3;"
  "$LT = \u11B4;"
  "$LP = \u11B5;"
  "$LH = \u11B6;"
  "$Mf = \u11B7;"
  "$Bf = \u11B8;"
  "$BS = \u11B9;"
  "$Sf = \u11BA;"
  "$SSf = \u11BB;"
  "$NG = \u11BC;"
  "$Jf = \u11BD;"
  "$Cf = \u11BE;"
  "$Kf = \u11BF;"
  "$Tf = \u11C0;"
  "$Pf = \u11C1;"
  "$Hf = \u11C2;"

  // Any of the initial jamo defined above (including IEUNG)
  "$jamoInitial = [\u1100-\u1112];"

  // Any of the medial jamo defined above (including EU)
  "$jamoMedial = [\u1161-\u1175];"

  // The first character of the latin transliteration of an initial
  // (IEUNG has no latin form, and 'l' is not included)
  "$latinInitial = [bcdghjkmnprst];"

  // Any character in the latin transliteration of a medial
  "$latinMedial = [aeiouwy];"

  // The last character of the latin transliteration of a medial
  "$latinMedialEnd = [aeiou];"

  // Disambiguation separator
  // NOTE(review): the worked examples in this file's comments write the
  // separator as '-' ("kan-ggan", "kag-ge"), but the literal below looks
  // like an escaped apostrophe -- confirm which character is intended.
  "$sep = \\\';"

//----------------------------------------------------------------------
// Jamo-Latin

// Jamo to latin is relatively simple, since it is the latin that is
// ambiguous.  Most rules are straightforward, and we encode each of them
// below by adding a reverse direction to the matching Latin-Jamo rule, e.g.:

//   $jamoMedial {bs} > $BS;

// becomes

//   $jamoMedial {bs} <> $BS;

// Furthermore, we don't care about the ordering for Jamo-Latin because
// we are going from single characters, so we can very easily piggyback
// on the Latin-Jamo.

// The main issue with Jamo-Latin is when to insert separators.
// Separators are inserted to obtain correct round trip behavior.  For
// example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
// would then round trip to Ki A GGi E.  To prevent this, we insert a
// separator: "kag-ge".  IMPORTANT: The need for separators depends
// very specifically on the behavior of the Latin-Jamo rules.  A change
// in the Latin-Jamo behavior can completely change the way the
// separator insertion must be done.

// First try to preserve actual separators in the jamo text by doubling
// them.  This fixes problems like:
// (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
// => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L).  This is optional
// -- if we don't care about losing separators in the jamo, we can delete
// this rule.

  // Bidirectional: two separators in the latin text stand for one literal
  // separator in the jamo text, and vice versa.
  "$sep $sep <> $sep;"

// All remaining rules in this section apply only in the Jamo-Latin
// direction ('<').  The key {} is empty, so nothing is consumed: $sep
// is inserted between the latin text to the left of {} (already
// produced) and the jamo to the right of {} (still to be converted).

// Triple consonants.  For three consonants "axxx" we insert a
// separator between the first and second "x" if XXf, Xf, and Xi all
// exist, and we have A Xf XXi.  This prevents the reverse
// transliteration to A XXf Xi.

  "$sep < $latinMedialEnd g {} $GGi;"
  "$sep < $latinMedialEnd s {} $SSi;"

// For vowels the rule is similar.  If there is a vowel "ae" such that
// "a" by itself and "e" by itself are vowels, then we want to map A E
// to "a-e" so as not to round trip to AE.  However, in the text Ki EO
// IEUNG E we don't need to map to "keo-e".  "keoe" suffices.  For
// vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
// tested.  NOTE: These rules used to have a left context of
// $latinInitial instead of [^$latinMedial].  The problem with this is
// sequences where an initial IEUNG is transliterated away:
//   (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)

  "$sep < [^$latinMedial] [y w] e {} [$O $OE];"
  "$sep < [^$latinMedial] e {} [$O $OE $U];"
  "$sep < [^$latinMedial] [o a] {} [$E $EO $EU];"
  "$sep < [^$latinMedial] [w y] a {} [$E $EO $EU];"

// Similar to the above, but with an intervening $IEUNG.

  "$sep < [^$latinMedial] [y w] e {} $IEUNG [$O $OE];"
  "$sep < [^$latinMedial] e {} $IEUNG [$O $OE $U];"
  "$sep < [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];"
  "$sep < [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];"

// Single finals followed by IEUNG.  The jamo sequence A Xf IEUNG E,
// where Xi also exists, must be transliterated as "ax-e" to prevent
// the round trip conversion to A Xi E.

  "$sep < $latinMedialEnd b {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd c {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd d {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd g {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd h {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd j {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd k {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd m {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd n {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd p {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd s {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd t {} $IEUNG $jamoMedial;"

// Double finals followed by IEUNG.  Similar to the single finals
// followed by IEUNG.  Any latin consonant pair X Y, between medials,
// that we would split by Latin-Jamo, we must handle when it occurs as
// part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
// E.

  "$sep < $latinMedialEnd b s {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd g g {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd g s {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l b {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l g {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l h {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l m {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l p {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l s {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd l t {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd n g {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd n h {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd n j {} $IEUNG $jamoMedial;"
  "$sep < $latinMedialEnd s s {} $IEUNG $jamoMedial;"

// Split doubles.  Text of the form A Xf Xi E (a final followed by the
// matching initial), where XXi also occurs, we transliterate as
// "ax-xe" to prevent round trip transliteration as A XXi E.

  "$sep < $latinMedialEnd b {} $Bi $jamoMedial;"
  "$sep < $latinMedialEnd d {} $Di $jamoMedial;"
  "$sep < $latinMedialEnd j {} $Ji $jamoMedial;"
  "$sep < $latinMedialEnd g {} $Gi $jamoMedial;"
  "$sep < $latinMedialEnd s {} $Si $jamoMedial;"

// XYY.  This corresponds to the XYY rule in Latin-Jamo.  By default
// Latin-Jamo maps "xyy" to Xf YYi, to keep YY together.  As a result,
// "xyy" forms that correspond to XYf Yi must be transliterated as
// "xy-y".

  "$sep < $latinMedialEnd b s {} [$Si $SSi];"
  "$sep < $latinMedialEnd g s {} [$Si $SSi];"
  "$sep < $latinMedialEnd l b {} [$Bi $BB];"
  "$sep < $latinMedialEnd l g {} [$Gi $GGi];"
  "$sep < $latinMedialEnd l s {} [$Si $SSi];"
  "$sep < $latinMedialEnd n g {} [$Gi $GGi];"
  "$sep < $latinMedialEnd n j {} [$Ji $JJ];"

// Deletion of IEUNG is handled below.

//----------------------------------------------------------------------
// Latin-Jamo

// [Basic, context-free Jamo-Latin rules are embedded here too.  See
// above.]

// Split digraphs: Text of the form 'axye', where 'xy' is a final
// digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
// 'e' are medials, we want to transliterate this as A Xf Yi E rather
// than A XYf IEUNG E.  We do NOT include text of the form "axxe",
// since that is handled differently below.  These rules are generated
// programmatically from the jamo data.

  // The rules in this section are forward-only (Latin-Jamo, '>').  The
  // left context is a medial that is already jamo; the right context,
  // where present, is latin text that has not been converted yet.
  "$jamoMedial {b s} $latinMedial > $Bf $Si;"
  "$jamoMedial {g s} $latinMedial > $Gf $Si;"
  "$jamoMedial {l b} $latinMedial > $L $Bi;"
  "$jamoMedial {l g} $latinMedial > $L $Gi;"
  "$jamoMedial {l h} $latinMedial > $L $Hi;"
  "$jamoMedial {l m} $latinMedial > $L $Mi;"
  "$jamoMedial {l p} $latinMedial > $L $Pi;"
  "$jamoMedial {l s} $latinMedial > $L $Si;"
  "$jamoMedial {l t} $latinMedial > $L $Ti;"
  "$jamoMedial {n g} $latinMedial > $Nf $Gi;"
  "$jamoMedial {n h} $latinMedial > $Nf $Hi;"
  "$jamoMedial {n j} $latinMedial > $Nf $Ji;"

// Single consonants are initials: Text of the form 'axe', where 'x'
// can be an initial or a final, and 'a' and 'e' are medials, we want
// to transliterate as A Xi E rather than A Xf IEUNG E.

  "$jamoMedial {b} $latinMedial > $Bi;"
  "$jamoMedial {c} $latinMedial > $Ci;"
  "$jamoMedial {d} $latinMedial > $Di;"
  "$jamoMedial {g} $latinMedial > $Gi;"
  "$jamoMedial {h} $latinMedial > $Hi;"
  "$jamoMedial {j} $latinMedial > $Ji;"
  "$jamoMedial {k} $latinMedial > $Ki;"
  "$jamoMedial {m} $latinMedial > $Mi;"
  "$jamoMedial {n} $latinMedial > $Ni;"
  "$jamoMedial {p} $latinMedial > $Pi;"
  "$jamoMedial {s} $latinMedial > $Si;"
  "$jamoMedial {t} $latinMedial > $Ti;"

// Doubled initials.  The sequence "axxe", where XX exists as an initial
// (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
// to transliterate as A XXi E, rather than split to A Xf Xi E.

  "$jamoMedial {b b} $latinMedial > $BB;"
  "$jamoMedial {d d} $latinMedial > $DD;"
  "$jamoMedial {j j} $latinMedial > $JJ;"
  "$jamoMedial {g g} $latinMedial > $GGi;"
  "$jamoMedial {s s} $latinMedial > $SSi;"

// XYY.  Because doubled consonants bind more strongly than XY
// consonants, we must handle the sequence "axyy" specially.  Here XYf
// and YYi must exist.  In these cases, we map to Xf YYi rather than
// XYf.

  // Only the first consonant is the key here; the doubled pair that
  // follows is right context and is left in place for later rules.
  "$jamoMedial {b} s s > $Bf;"
  "$jamoMedial {g} s s > $Gf;"
  "$jamoMedial {l} b b > $L;"
  "$jamoMedial {l} g g > $L;"
  "$jamoMedial {l} s s > $L;"
  "$jamoMedial {n} g g > $Nf;"
  "$jamoMedial {n} j j > $Nf;"

// Finals: Attach consonant with preceding medial to preceding medial.
// Do this BEFORE mapping consonants to initials.  Longer keys must
// precede shorter keys that they start with, e.g., the rule for 'bs'
// must precede 'b'.

// [BASIC Jamo-Latin FINALS handled here.  Order irrelevant within this
// block for Jamo-Latin.]

// Keep one rule per string literal so that the key ordering described
// above can be checked by reading straight down the list.

  "$jamoMedial {bs} <> $BS;"
  "$jamoMedial {b} <> $Bf;"
  "$jamoMedial {c} <> $Cf;"
  "$jamoMedial {d} <> $Df;"
  "$jamoMedial {gg} <> $GGf;"
  "$jamoMedial {gs} <> $GS;"
  "$jamoMedial {g} <> $Gf;"
  "$jamoMedial {h} <> $Hf;"
  "$jamoMedial {j} <> $Jf;"
  "$jamoMedial {k} <> $Kf;"
  "$jamoMedial {lb} <> $LB;"
  "$jamoMedial {lg} <> $LG;"
  "$jamoMedial {lh} <> $LH;"
  "$jamoMedial {lm} <> $LM;"
  "$jamoMedial {lp} <> $LP;"
  "$jamoMedial {ls} <> $LS;"
  "$jamoMedial {lt} <> $LT;"
  "$jamoMedial {l} <> $L;"
  "$jamoMedial {m} <> $Mf;"
  "$jamoMedial {ng} <> $NG;"
  "$jamoMedial {nh} <> $NH;"
  "$jamoMedial {nj} <> $NJ;"
  "$jamoMedial {n} <> $Nf;"
  "$jamoMedial {p} <> $Pf;"
  "$jamoMedial {ss} <> $SSf;"
  "$jamoMedial {s} <> $Sf;"
  "$jamoMedial {t} <> $Tf;"

// Initials: Attach single consonant to following medial.  Do this
// AFTER mapping finals.  Longer keys must precede shorter keys that
// they start with, e.g., the rule for 'gg' must precede 'g'.

// [BASIC Jamo-Latin INITIALS handled here.  Order irrelevant within
// this block for Jamo-Latin.]

  // In the Latin-Jamo direction each of these rules needs a latin medial
  // letter as right context, so a consonant that is not followed by a
  // vowel falls through to the rules further down.
  "{gg} $latinMedial <> $GGi;"
  "{g} $latinMedial <> $Gi;"
  "{n} $latinMedial <> $Ni;"
  "{dd} $latinMedial <> $DD;"
  "{d} $latinMedial <> $Di;"
  "{r} $latinMedial <> $R;"
  "{m} $latinMedial <> $Mi;"
  "{bb} $latinMedial <> $BB;"
  "{b} $latinMedial <> $Bi;"
  "{ss} $latinMedial <> $SSi;"
  "{s} $latinMedial <> $Si;"
  "{jj} $latinMedial <> $JJ;"
  "{j} $latinMedial <> $Ji;"
  "{c} $latinMedial <> $Ci;"
  "{k} $latinMedial <> $Ki;"
  "{t} $latinMedial <> $Ti;"
  "{p} $latinMedial <> $Pi;"
  "{h} $latinMedial <> $Hi;"

// 'r' in final position.  Because of the equivalency of the 'l' and
// 'r' jamo (the glyphs are the same), we try to provide the same
// equivalency in Latin-Jamo.  The 'l' to 'r' conversion is handled
// below.  If we see an 'r' in an apparent final position, treat it
// like 'l'.  For example, "karka" => Ki A R EU Ki A without this rule.
// Instead, we want Ki A L Ki A.

  // The '|' before the replacement places the cursor ahead of the new
  // 'l', so the 'l' is matched again by the rules.
  "$jamoMedial {r} $latinInitial > | l;"

// Initial + Final: If we match the next rule, we have initial then
// final consonant with no intervening medial.  We insert the null
// vowel BEFORE it to create a well-formed syllable.  (In the next rule
// we insert a null vowel AFTER an anomalous initial.)

  // The class is the set of first letters of the latin names of the
  // finals, so it contains 'l' but not 'r'.
  "$jamoInitial {} [bcdghjklmnpst] > $EU;"

// Initial + X: This block matches an initial consonant not followed by
// a medial.  We insert the null vowel after it.  We handle double
// initials explicitly here; for single initial consonants we insert EU
// (as Latin) after them and let standard rules do the rest.

// BREAKS ROUND TRIP INTEGRITY

  "gg > $GGi $EU;"
  "dd > $DD $EU;"
  "bb > $BB $EU;"
  "ss > $SSi $EU;"
  "jj > $JJ $EU;"

  // Same letters as $latinInitial.  The output is latin ("x" + "eu") with
  // the cursor placed before it, so it is converted by the rules above.
  "([bcdghjkmnprst]) > | $1 eu;"

// X + Final: Finally we have to deal with a consonant that can only be
// interpreted as a final (not an initial) and which is preceded
// neither by an initial nor a medial.  It is the start of the
// syllable, but cannot be.  Most of these will already be handled by
// the above rules.  'bs' splits into Bi EU Sf.  Similar for 'gs' 'ng'
// 'nh' 'nj'.  The only problem is 'l' and digraphs starting with 'l'.
// For this isolated case, we could add a null initial and medial,
// which would give "la" => IEUNG EU L IEUNG A, for example.  A more
// economical solution is to transliterate isolated "l" (that is,
// initial "l") to "r".  (Other similar conversions of consonants that
// occur neither as initials nor as finals are handled below.)

  // Forward-only; the cursor is left before the 'r' so that it is
  // matched again as an initial.
  "l > | r;"

// Medials.  If a medial is preceded by an initial, then we proceed
// normally.  As usual, longer keys must precede shorter ones.

// [BASIC Jamo-Latin MEDIALS handled here.  Order irrelevant within
// this block for Jamo-Latin.]

  // Every rule requires an initial jamo as left context; a latin vowel
  // without one is handled by the IEUNG insertion rule further down.
  "$jamoInitial {ae} <> $AE;"
  "$jamoInitial {a} <> $A;"
  "$jamoInitial {eo} <> $EO;"
  "$jamoInitial {eu} <> $EU;"
  "$jamoInitial {e} <> $E;"
  "$jamoInitial {i} <> $I;"
  "$jamoInitial {oe} <> $OE;"
  "$jamoInitial {o} <> $O;"
  "$jamoInitial {u} <> $U;"
  "$jamoInitial {wae} <> $WAE;"
  "$jamoInitial {wa} <> $WA;"
  "$jamoInitial {weo} <> $WEO;"
  "$jamoInitial {we} <> $WE;"
  "$jamoInitial {wi} <> $WI;"
  "$jamoInitial {yae} <> $YAE;"
  "$jamoInitial {ya} <> $YA;"
  "$jamoInitial {yeo} <> $YEO;"
  "$jamoInitial {ye} <> $YE;"
  "$jamoInitial {yi} <> $YI;"
  "$jamoInitial {yo} <> $YO;"
  "$jamoInitial {yu} <> $YU;"

// We may see an anomalous isolated 'w' or 'y'.  In that case, we
// interpret it as 'wi' and 'yu', respectively.

// BREAKS ROUND TRIP INTEGRITY

  // These only fire when none of the w.. / y.. medial rules above matched.
  "$jamoInitial {w} > | wi;"
  "$jamoInitial {y} > | yu;"

// Otherwise, insert a null consonant IEUNG before the medial (which is
// still an untransliterated latin vowel).

  // The cursor is placed after IEUNG and before the vowel, so the vowel
  // is matched again, now with an initial jamo as its left context.
  "($latinMedial) > $IEUNG | $1;"

// Convert non-jamo latin consonants to equivalents.  These occur as
// neither initials nor finals in jamo.  'l' occurs as a final, but not
// an initial; it is handled above.  The following letters (left hand
// side) will never be output by Jamo-Latin.

  // Forward-only; the replacement is latin and is re-scanned from the
  // cursor position.
  "f > | p;"
  "q > | k;"
  "v > | b;"
  "x > | ks;"
  "z > | s;"

// Delete separators (Latin-Jamo).

  // Forward-only rule with an empty replacement.
  "$sep > ;"

// Delete null consonants (Jamo-Latin).  Do NOT delete null EU vowels,
// since these may also occur in text.

  // Reverse-only rule with an empty latin side.
  "< $IEUNG;"

//- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
//- the INDEX file.  This transliterator is, by itself, not
//- instantiated.  It is used as a part of Latin-Jamo, Latin-Hangul, or
//- inverses thereof.

// eof
  }
}