ICU-872 Update Latin-Jamo and Jamo-Latin rules
X-SVN-Rev: 3826
This commit is contained in:
parent
0382d73e2b
commit
650f6510e5
517
icu4c/data/ljamo.txt
Normal file
517
icu4c/data/ljamo.txt
Normal file
@ -0,0 +1,517 @@
|
||||
// -*- Coding: utf-8; -*-
|
||||
//--------------------------------------------------------------------
|
||||
// Copyright (c) 1999-2001, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
// THIS IS A MACHINE-GENERATED FILE
|
||||
// Tool: dumpICUrules.bat
|
||||
// Source: \icu4j\src\com\ibm\text\resources/Transliterator_Latin_Jamo.utf8.txt
|
||||
// Date: Wed Feb 28 11:52:16 2001
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Jamo
|
||||
|
||||
ljamo {
|
||||
Rule {
|
||||
//--------------------------------------------------------------------
|
||||
// Copyright (c) 1999-2001, International Business Machines
|
||||
// Corporation and others. All Rights Reserved.
|
||||
//--------------------------------------------------------------------
|
||||
|
||||
// Latin-Jamo
|
||||
|
||||
// Transliteration from Latin characters to Korean script is done in
|
||||
// two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
|
||||
// transliteration is done algorithmically following Unicode 3.0
|
||||
// section 3.11. This file implements the Latin to Jamo
|
||||
// transliteration using rules.
|
||||
|
||||
// Jamo occupy the block 1100-11FF. Within this block there are three
|
||||
// groups of characters: initial consonants or choseong (I), medial
|
||||
// vowels or jungseong (M), and trailing consonants or jongseong (F).
|
||||
// Standard Korean syllables are of the form I+M+F*.
|
||||
|
||||
// Section 3.11 describes the use of 'filler' jamo to convert
|
||||
// nonstandard syllables to standard form: the choseong filler 115F and
|
||||
// the jungseong filler 1160. In this transliterator, we will not use
|
||||
// 115F or 1160.
|
||||
|
||||
// We will, however, insert two 'null' jamo to make foreign words
|
||||
// conform to Korean syllable structure. These are the null initial
|
||||
// consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
|
||||
// we will use the hyphen in order to disambiguate strings,
|
||||
// e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
|
||||
|
||||
// We will not use all of the characters in the jamo block. We will
|
||||
// only use the 19 initials, 21 medials, and 27 finals possessing a
|
||||
// jamo short name as defined in section 4.4 of the Unicode book.
|
||||
|
||||
// Rules of thumb. These guidelines provide the basic framework
|
||||
// for the rules. They are phrased in terms of Latin-Jamo transliteration.
|
||||
// The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
|
||||
// just context-free transliteration of jamo to corresponding short names,
|
||||
// with the addition of hyphens to maintain round-trip integrity
|
||||
// in the context of the Latin-Jamo rules.
|
||||
|
||||
// A sequence of vowels:
|
||||
// - Take the longest sequence you can. If there are too many, or you don't
|
||||
// have a starting consonant, introduce a 110B as necessary.
|
||||
|
||||
// A sequence of consonants.
|
||||
// - First join the double consonants: G + G -> GG
|
||||
// - In the remaining list,
|
||||
// -- If there is no preceding vowel, take the first consonant, and insert EU
|
||||
// after it. Continue with the rest of the consonants.
|
||||
// -- If there is one consonant, attach to the following vowel
|
||||
// -- If there are two consonants and a following vowel, attach one to the
|
||||
// preceding vowel, and one to the following vowel.
|
||||
// -- If there are more than two consonants, join the first two together if you
|
||||
// can: L + G => LG
|
||||
// -- If you still end up with more than 2 consonants, insert EU after the
|
||||
// first one, and continue with the rest of the consonants.
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Variables
|
||||
|
||||
// Some latin consonants or consonant pairs only occur as initials, and
|
||||
// some only as finals, but some occur as both. This makes some jamo
|
||||
// consonants ambiguous when transliterated into latin.
|
||||
// Initial only: IEUNG BB DD JJ R
|
||||
// Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
|
||||
// Initial and Final: B C D G GG H J K M N P S SS T
|
||||
|
||||
"$Gi = \u1100;"
|
||||
"$GGi = \u1101;"
|
||||
"$Ni = \u1102;"
|
||||
"$Di = \u1103;"
|
||||
"$DD = \u1104;"
|
||||
"$R = \u1105;"
|
||||
"$Mi = \u1106;"
|
||||
"$Bi = \u1107;"
|
||||
"$BB = \u1108;"
|
||||
"$Si = \u1109;"
|
||||
"$SSi = \u110A;"
|
||||
"$IEUNG = \u110B;" // null initial, inserted during Latin-Jamo
|
||||
"$Ji = \u110C;"
|
||||
"$JJ = \u110D;"
|
||||
"$Ci = \u110E;"
|
||||
"$Ki = \u110F;"
|
||||
"$Ti = \u1110;"
|
||||
"$Pi = \u1111;"
|
||||
"$Hi = \u1112;"
|
||||
|
||||
"$A = \u1161;"
|
||||
"$AE = \u1162;"
|
||||
"$YA = \u1163;"
|
||||
"$YAE = \u1164;"
|
||||
"$EO = \u1165;"
|
||||
"$E = \u1166;"
|
||||
"$YEO = \u1167;"
|
||||
"$YE = \u1168;"
|
||||
"$O = \u1169;"
|
||||
"$WA = \u116A;"
|
||||
"$WAE = \u116B;"
|
||||
"$OE = \u116C;"
|
||||
"$YO = \u116D;"
|
||||
"$U = \u116E;"
|
||||
"$WEO = \u116F;"
|
||||
"$WE = \u1170;"
|
||||
"$WI = \u1171;"
|
||||
"$YU = \u1172;"
|
||||
"$EU = \u1173;" // null medial, inserted during Latin-Jamo
|
||||
"$YI = \u1174;"
|
||||
"$I = \u1175;"
|
||||
|
||||
"$Gf = \u11A8;"
|
||||
"$GGf = \u11A9;"
|
||||
"$GS = \u11AA;"
|
||||
"$Nf = \u11AB;"
|
||||
"$NJ = \u11AC;"
|
||||
"$NH = \u11AD;"
|
||||
"$Df = \u11AE;"
|
||||
"$L = \u11AF;"
|
||||
"$LG = \u11B0;"
|
||||
"$LM = \u11B1;"
|
||||
"$LB = \u11B2;"
|
||||
"$LS = \u11B3;"
|
||||
"$LT = \u11B4;"
|
||||
"$LP = \u11B5;"
|
||||
"$LH = \u11B6;"
|
||||
"$Mf = \u11B7;"
|
||||
"$Bf = \u11B8;"
|
||||
"$BS = \u11B9;"
|
||||
"$Sf = \u11BA;"
|
||||
"$SSf = \u11BB;"
|
||||
"$NG = \u11BC;"
|
||||
"$Jf = \u11BD;"
|
||||
"$Cf = \u11BE;"
|
||||
"$Kf = \u11BF;"
|
||||
"$Tf = \u11C0;"
|
||||
"$Pf = \u11C1;"
|
||||
"$Hf = \u11C2;"
|
||||
|
||||
"$jamoInitial = [\u1100-\u1112];"
|
||||
|
||||
"$jamoMedial = [\u1161-\u1175];"
|
||||
|
||||
"$latinInitial = [bcdghjkmnprst];"
|
||||
|
||||
// Any character in the latin transliteration of a medial
|
||||
"$latinMedial = [aeiouwy];"
|
||||
|
||||
// The last character of the latin transliteration of a medial
|
||||
"$latinMedialEnd = [aeiou];"
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Jamo-Latin
|
||||
|
||||
// Jamo to latin is relatively simple, since it is the latin that is
|
||||
// ambiguous. Most rules are straightforward, and we encode them below
|
||||
// as simple add-on back rules, e.g.:
|
||||
|
||||
// $jamoMedial {bs} > $BS;
|
||||
|
||||
// becomes
|
||||
|
||||
// $jamoMedial {bs} <> $BS;
|
||||
|
||||
// Furthermore, we don't care about the ordering for Jamo-Latin because
|
||||
// we are going from single characters, so we can very easily piggyback
|
||||
// on the Latin-Jamo.
|
||||
|
||||
// The main issue with Jamo-Latin is when to insert hyphens.
|
||||
// Hyphens are inserted to obtain correct round trip behavior. For
|
||||
// example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
|
||||
// would then round trip to Ki A GGi E. To prevent this, we insert a
|
||||
// hyphen: "kag-ge". IMPORTANT: The need for hyphens depends
|
||||
// very specifically on the behavior of the Latin-Jamo rules. A change
|
||||
// in the Latin-Jamo behavior can completely change the way the
|
||||
// hyphen insertion must be done.
|
||||
|
||||
// First try to preserve actual hyphens in the jamo text by doubling
|
||||
// them. This fixes problems like:
|
||||
// (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
|
||||
// => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
|
||||
// -- if we don't care about losing hyphens in the jamo, we can delete
|
||||
// this rule.
|
||||
|
||||
"'--' <> '-';"
|
||||
|
||||
// Triple consonants. For three consonants "axxx" we insert a
|
||||
// hyphen between the first and second "x" if XXf, Xf, and Xi all
|
||||
// exist, and we have A Xf XXi. This prevents the reverse
|
||||
// transliteration to A XXf Xi.
|
||||
|
||||
"'-' < $latinMedialEnd g {} $GGi;"
|
||||
"'-' < $latinMedialEnd s {} $SSi;"
|
||||
|
||||
// For vowels the rule is similar. If there is a vowel "ae" such that
|
||||
// "a" by itself and "e" by itself are vowels, then we want to map A E
|
||||
// to "a-e" so as not to round trip to AE. However, in the text Ki EO
|
||||
// IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
|
||||
// vowels of the form "aei", in theory both "ae" + "i" and "a" + "ei"
|
||||
// must be tested, but in practice only the former occurs.
|
||||
|
||||
"'-' < $latinInitial [ye we] {} $O;"
|
||||
"'-' < $latinInitial e {} [$O $U];"
|
||||
"'-' < $latinInitial [o a wa ya] {} $E;"
|
||||
|
||||
// Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
|
||||
// where Xi also exists, must be transliterated as "ax-e" to prevent
|
||||
// the round trip conversion to A Xi E.
|
||||
|
||||
"'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;"
|
||||
|
||||
// Double finals followed by IEUNG. Similar to the single finals
|
||||
// followed by IEUNG. Any latin consonant pair X Y, between medials,
|
||||
// that we would split by Latin-Jamo, we must handle when it occurs as
|
||||
// part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi
|
||||
// E.
|
||||
|
||||
"'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;"
|
||||
"'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;"
|
||||
|
||||
// Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
|
||||
// we transliterate as "ax-xe" to prevent round trip transliteration as
|
||||
// A XXi E.
|
||||
|
||||
"'-' < $latinMedialEnd b {} $Bi $jamoMedial;"
|
||||
"'-' < $latinMedialEnd d {} $Di $jamoMedial;"
|
||||
"'-' < $latinMedialEnd j {} $Ji $jamoMedial;"
|
||||
"'-' < $latinMedialEnd g {} $Gi $jamoMedial;"
|
||||
"'-' < $latinMedialEnd s {} $Si $jamoMedial;"
|
||||
|
||||
// XYY. This corresponds to the XYY rule in Latin-Jamo. By default
|
||||
// Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
|
||||
// "xyy" forms that correspond to XYf Yi must be transliterated as
|
||||
// "xy-y".
|
||||
|
||||
"'-' < $latinMedialEnd b s {} $Si;"
|
||||
"'-' < $latinMedialEnd g s {} $Si;"
|
||||
"'-' < $latinMedialEnd l b {} $Bi;"
|
||||
"'-' < $latinMedialEnd l g {} $Gi;"
|
||||
"'-' < $latinMedialEnd l s {} $Si;"
|
||||
"'-' < $latinMedialEnd n g {} $Gi;"
|
||||
"'-' < $latinMedialEnd n j {} $Ji;"
|
||||
|
||||
// Deletion of IEUNG is handled below.
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
// Latin-Jamo
|
||||
|
||||
// [Basic, context-free Jamo-Latin rules are embedded here too. See
|
||||
// above.]
|
||||
|
||||
// Split digraphs: Text of the form 'axye', where 'xy' is a final
|
||||
// digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
|
||||
// 'e' are medials, we want to transliterate this as A Xf Yi E rather
|
||||
// than A XYf IEUNG E. We do NOT include text of the form "axxe",
|
||||
// since that is handled differently below. These rules are generated
|
||||
// programmatically from the jamo data.
|
||||
|
||||
"$jamoMedial {b s} $latinMedial > $Bf $Si;"
|
||||
"$jamoMedial {g s} $latinMedial > $Gf $Si;"
|
||||
"$jamoMedial {l b} $latinMedial > $L $Bi;"
|
||||
"$jamoMedial {l g} $latinMedial > $L $Gi;"
|
||||
"$jamoMedial {l h} $latinMedial > $L $Hi;"
|
||||
"$jamoMedial {l m} $latinMedial > $L $Mi;"
|
||||
"$jamoMedial {l p} $latinMedial > $L $Pi;"
|
||||
"$jamoMedial {l s} $latinMedial > $L $Si;"
|
||||
"$jamoMedial {l t} $latinMedial > $L $Ti;"
|
||||
"$jamoMedial {n g} $latinMedial > $Nf $Gi;"
|
||||
"$jamoMedial {n h} $latinMedial > $Nf $Hi;"
|
||||
"$jamoMedial {n j} $latinMedial > $Nf $Ji;"
|
||||
|
||||
// Single consonants are initials: Text of the form 'axe', where 'x'
|
||||
// can be an initial or a final, and 'a' and 'e' are medials, we want
|
||||
// to transliterate as A Xi E rather than A Xf IEUNG E.
|
||||
|
||||
"$jamoMedial {b} $latinMedial > $Bi;"
|
||||
"$jamoMedial {c} $latinMedial > $Ci;"
|
||||
"$jamoMedial {d} $latinMedial > $Di;"
|
||||
"$jamoMedial {g} $latinMedial > $Gi;"
|
||||
"$jamoMedial {h} $latinMedial > $Hi;"
|
||||
"$jamoMedial {j} $latinMedial > $Ji;"
|
||||
"$jamoMedial {k} $latinMedial > $Ki;"
|
||||
"$jamoMedial {m} $latinMedial > $Mi;"
|
||||
"$jamoMedial {n} $latinMedial > $Ni;"
|
||||
"$jamoMedial {p} $latinMedial > $Pi;"
|
||||
"$jamoMedial {s} $latinMedial > $Si;"
|
||||
"$jamoMedial {t} $latinMedial > $Ti;"
|
||||
|
||||
// Doubled initials. The sequence "axxe", where XX exists as an initial
|
||||
// (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
|
||||
// to transliterate as A XXi E, rather than split to A Xf Xi E.
|
||||
|
||||
"$jamoMedial {b b} $latinMedial > $BB;"
|
||||
"$jamoMedial {d d} $latinMedial > $DD;"
|
||||
"$jamoMedial {j j} $latinMedial > $JJ;"
|
||||
"$jamoMedial {g g} $latinMedial > $GGi;"
|
||||
"$jamoMedial {s s} $latinMedial > $SSi;"
|
||||
|
||||
// XYY. Because doubled consonants bind more strongly than XY
|
||||
// consonants, we must handle the sequence "axyy" specially. Here XYf
|
||||
// and YYi must exist. In these cases, we map to Xf YYi rather than
|
||||
// XYf.
|
||||
|
||||
"$jamoMedial {b} s s > $Bf;"
|
||||
"$jamoMedial {g} s s > $Gf;"
|
||||
"$jamoMedial {l} b b > $L;"
|
||||
"$jamoMedial {l} g g > $L;"
|
||||
"$jamoMedial {l} s s > $L;"
|
||||
"$jamoMedial {n} g g > $Nf;"
|
||||
"$jamoMedial {n} j j > $Nf;"
|
||||
|
||||
// Finals: Attach consonant with preceding medial to preceding medial.
|
||||
// Do this BEFORE mapping consonants to initials. Longer keys must
|
||||
// precede shorter keys that they start with, e.g., the rule for 'bs'
|
||||
// must precede 'b'.
|
||||
|
||||
// [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
|
||||
// block for Jamo-Latin.]
|
||||
|
||||
"$jamoMedial {bs} <> $BS;"
|
||||
"$jamoMedial {b} <> $Bf;"
|
||||
"$jamoMedial {c} <> $Cf;"
|
||||
"$jamoMedial {d} <> $Df;"
|
||||
"$jamoMedial {gg} <> $GGf;"
|
||||
"$jamoMedial {gs} <> $GS;"
|
||||
"$jamoMedial {g} <> $Gf;"
|
||||
"$jamoMedial {h} <> $Hf;"
|
||||
"$jamoMedial {j} <> $Jf;"
|
||||
"$jamoMedial {k} <> $Kf;"
|
||||
"$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;"
|
||||
"$jamoMedial {lh} <> $LH;"
|
||||
"$jamoMedial {lm} <> $LM;"
|
||||
"$jamoMedial {lp} <> $LP;"
|
||||
"$jamoMedial {ls} <> $LS;"
|
||||
"$jamoMedial {lt} <> $LT;"
|
||||
"$jamoMedial {l} <> $L;"
|
||||
"$jamoMedial {m} <> $Mf;"
|
||||
"$jamoMedial {ng} <> $NG;"
|
||||
"$jamoMedial {nh} <> $NH;"
|
||||
"$jamoMedial {nj} <> $NJ;"
|
||||
"$jamoMedial {n} <> $Nf;"
|
||||
"$jamoMedial {p} <> $Pf;"
|
||||
"$jamoMedial {ss} <> $SSf;"
|
||||
"$jamoMedial {s} <> $Sf;"
|
||||
"$jamoMedial {t} <> $Tf;"
|
||||
|
||||
// Initials: Attach single consonant to following medial. Do this
|
||||
// AFTER mapping finals. Longer keys must precede shorter keys that
|
||||
// they start with, e.g., the rule for 'gg' must precede 'g'.
|
||||
|
||||
// [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
|
||||
// this block for Jamo-Latin.]
|
||||
|
||||
"{gg} $latinMedial <> $GGi;"
|
||||
"{g} $latinMedial <> $Gi;"
|
||||
"{n} $latinMedial <> $Ni;"
|
||||
"{dd} $latinMedial <> $DD;"
|
||||
"{d} $latinMedial <> $Di;"
|
||||
"{r} $latinMedial <> $R;"
|
||||
"{m} $latinMedial <> $Mi;"
|
||||
"{bb} $latinMedial <> $BB;"
|
||||
"{b} $latinMedial <> $Bi;"
|
||||
"{ss} $latinMedial <> $SSi;"
|
||||
"{s} $latinMedial <> $Si;"
|
||||
"{jj} $latinMedial <> $JJ;"
|
||||
"{j} $latinMedial <> $Ji;"
|
||||
"{c} $latinMedial <> $Ci;"
|
||||
"{k} $latinMedial <> $Ki;"
|
||||
"{t} $latinMedial <> $Ti;"
|
||||
"{p} $latinMedial <> $Pi;"
|
||||
"{h} $latinMedial <> $Hi;"
|
||||
|
||||
// 'r' in final position. Because of the equivalency of the 'l' and
|
||||
// 'r' jamo (the glyphs are the same), we try to provide the same
|
||||
// equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
|
||||
// below. If we see an 'r' in an apparent final position, treat it
|
||||
// like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
|
||||
// Instead, we want Ki A L Ki A.
|
||||
|
||||
"$jamoMedial {r} $latinInitial > | l;"
|
||||
|
||||
// Initial + Final: If we match the next rule, we have initial then
|
||||
// final consonant with no intervening medial. We insert the null
|
||||
// vowel BEFORE it to create a well-formed syllable. (In the next rule
|
||||
// we insert a null vowel AFTER an anomalous initial.)
|
||||
|
||||
"$jamoInitial {} [bcdghjklmnpst] > $EU;"
|
||||
|
||||
// Initial + X: This block matches an initial consonant not followed by
|
||||
// a medial. We insert the null vowel after it. We handle double
|
||||
// initials explicitly here; for single initial consonants we insert EU
|
||||
// (as Latin) after them and let standard rules do the rest.
|
||||
|
||||
// BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
"gg > $GGi $EU;"
|
||||
"dd > $DD $EU;"
|
||||
"bb > $BB $EU;"
|
||||
"ss > $SSi $EU;"
|
||||
"jj > $JJ $EU;"
|
||||
|
||||
"([bcdghjkmnprst]) > | $1 eu;"
|
||||
|
||||
// X + Final: Finally we have to deal with a consonant that can only be
|
||||
// interpreted as a final (not an initial) and which is preceded
|
||||
// neither by an initial nor a medial. It is the start of the
|
||||
// syllable, but cannot be. Most of these will already be handled by
|
||||
// the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
|
||||
// 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
|
||||
// For this isolated case, we could add a null initial and medial,
|
||||
// which would give "la" => IEUNG EU L IEUNG A, for example. A more
|
||||
// economical solution is to transliterate isolated "l" (that is,
|
||||
// initial "l") to "r". (Other similar conversions of consonants that
|
||||
// occur neither as initials nor as finals are handled below.)
|
||||
|
||||
"l > | r;"
|
||||
|
||||
// Medials. If a medial is preceded by an initial, then we proceed
|
||||
// normally. As usual, longer keys must precede shorter ones.
|
||||
|
||||
// [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
|
||||
// this block for Jamo-Latin.]
|
||||
|
||||
"$jamoInitial {ae} <> $AE;"
|
||||
"$jamoInitial {a} <> $A;"
|
||||
"$jamoInitial {eo} <> $EO;"
|
||||
"$jamoInitial {eu} <> $EU;"
|
||||
"$jamoInitial {e} <> $E;"
|
||||
"$jamoInitial {i} <> $I;"
|
||||
"$jamoInitial {oe} <> $OE;"
|
||||
"$jamoInitial {o} <> $O;"
|
||||
"$jamoInitial {u} <> $U;"
|
||||
"$jamoInitial {wae} <> $WAE;"
|
||||
"$jamoInitial {wa} <> $WA;"
|
||||
"$jamoInitial {weo} <> $WEO;"
|
||||
"$jamoInitial {we} <> $WE;"
|
||||
"$jamoInitial {wi} <> $WI;"
|
||||
"$jamoInitial {yae} <> $YAE;"
|
||||
"$jamoInitial {ya} <> $YA;"
|
||||
"$jamoInitial {yeo} <> $YEO;"
|
||||
"$jamoInitial {ye} <> $YE;"
|
||||
"$jamoInitial {yi} <> $YI;"
|
||||
"$jamoInitial {yo} <> $YO;"
|
||||
"$jamoInitial {yu} <> $YU;"
|
||||
|
||||
// We may see an anomalous isolated 'w' or 'y'. In that case, we
|
||||
// interpret it as 'wi' and 'yu', respectively.
|
||||
|
||||
// BREAKS ROUND TRIP INTEGRITY
|
||||
|
||||
"$jamoInitial {w} > | wi;"
|
||||
"$jamoInitial {y} > | yu;"
|
||||
|
||||
// Otherwise, insert a null consonant IEUNG before the medial (which is
|
||||
// still an untransliterated latin vowel).
|
||||
|
||||
"($latinMedial) > $IEUNG | $1;"
|
||||
|
||||
// Convert non-jamo latin consonants to equivalents. These occur as
|
||||
// neither initials nor finals in jamo. 'l' occurs as a final, but not
|
||||
// an initial; it is handled above. The following letters (left hand
|
||||
// side) will never be output by Jamo-Latin.
|
||||
|
||||
"f > | p;"
|
||||
"q > | k;"
|
||||
"v > | b;"
|
||||
"x > | ks;"
|
||||
"z > | s;"
|
||||
|
||||
// Delete hyphens (Latin-Jamo).
|
||||
|
||||
"'-' > ;"
|
||||
|
||||
// Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
|
||||
// since these may also occur in text.
|
||||
|
||||
"< $IEUNG;"
|
||||
|
||||
// eof
|
||||
}
|
||||
}
|
@ -46,7 +46,8 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \
|
||||
tsmthred.o tsmutex.o tsnmfmt.o tsputil.o tstnorm.o tzbdtest.o \
|
||||
tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o transtst.o strtest.o thcoll.o \
|
||||
itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o unhxtrts.o hxuntrts.o \
|
||||
jahatrts.o hajatrts.o ufltlgts.o testutil.o transrt.o normconf.o sfwdchit.o
|
||||
jahatrts.o hajatrts.o ufltlgts.o testutil.o transrt.o normconf.o sfwdchit.o \
|
||||
jamotest.o
|
||||
|
||||
DEPS = $(OBJECTS:.o=.d)
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include "ufltlgts.h"
|
||||
#include "transrt.h"
|
||||
#include "usettest.h"
|
||||
#include "jamotest.h"
|
||||
|
||||
#define CASE(id,test) case id: \
|
||||
name = #test; \
|
||||
@ -54,6 +55,7 @@ void IntlTestTransliterator::runIndexedTest( int32_t index, UBool exec, const ch
|
||||
CASE(7, UnicodeFilterLogicTest);
|
||||
CASE(8, TransliteratorRoundTripTest);
|
||||
CASE(9, UnicodeSetTest);
|
||||
CASE(10, JamoTest);
|
||||
default: name=""; break;
|
||||
}
|
||||
}
|
||||
|
418
icu4c/source/test/intltest/jamotest.cpp
Normal file
418
icu4c/source/test/intltest/jamotest.cpp
Normal file
@ -0,0 +1,418 @@
|
||||
#include "jamotest.h"
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/translit.h"
|
||||
#include "unicode/rbt.h"
|
||||
#include "unicode/cpdtrans.h"
|
||||
|
||||
// Dispatch helper for runIndexedTest(): expands to a single `case` arm.
// The arm stores the stringified test method name in `name` and, when
// `exec` is true, logs a "<method>---" banner plus an empty line and then
// invokes the method.  `name` and `exec` must be in scope where the macro
// is expanded.
#define CASE(testIndex, testMethod)         \
    case testIndex:                         \
        name = #testMethod;                 \
        if (exec) {                         \
            logln(#testMethod "---");       \
            logln((UnicodeString)"");       \
            testMethod();                   \
        }                                   \
        break
|
||||
|
||||
void
|
||||
JamoTest::runIndexedTest(int32_t index, UBool exec,
|
||||
const char* &name, char* /*par*/) {
|
||||
switch (index) {
|
||||
CASE(0,TestJamo);
|
||||
CASE(1,TestRealText);
|
||||
default: name = ""; break;
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
JamoTest::TestJamo() {
|
||||
Transliterator* latinJamo = Transliterator::createInstance("Latin-Jamo");
|
||||
|
||||
if (latinJamo == 0) {
|
||||
errln("FAIL: createInstance() returned 0");
|
||||
return;
|
||||
}
|
||||
|
||||
Transliterator* jamoLatin = latinJamo->createInverse();
|
||||
|
||||
if (jamoLatin == 0) {
|
||||
delete latinJamo;
|
||||
errln("FAIL: createInverse() returned 0");
|
||||
return;
|
||||
}
|
||||
|
||||
const char* CASE[] = {
|
||||
// Column 1 is the latin text L1 to be fed to Latin-Jamo
|
||||
// to yield output J.
|
||||
|
||||
// Column 2 is expected value of J. J is fed to
|
||||
// Jamo-Latin to yield output L2.
|
||||
|
||||
// Column 3 is expected value of L2. If the expected
|
||||
// value of L2 is L1, then L2 is NULL.
|
||||
"bab", "(Bi)(A)(Bf)", NULL,
|
||||
"babb", "(Bi)(A)(Bf)(Bi)(EU)", "bab-beu",
|
||||
"babbba", "(Bi)(A)(Bf)(BB)(A)", NULL,
|
||||
"bagg", "(Bi)(A)(GGf)", NULL,
|
||||
"baggga", "(Bi)(A)(GGf)(Gi)(A)", NULL,
|
||||
"bag-gga", "(Bi)(A)(Gf)(GGi)(A)", NULL,
|
||||
"kabsa", "(Ki)(A)(Bf)(Si)(A)", NULL,
|
||||
"kabska", "(Ki)(A)(BS)(Ki)(A)", NULL,
|
||||
"gabsbka", "(Gi)(A)(BS)(Bi)(EU)(Ki)(A)", "gabsbeuka", // not (Kf)
|
||||
"gga", "(GGi)(A)", NULL,
|
||||
"bsa", "(Bi)(EU)(Si)(A)", "beusa",
|
||||
"agg", "(IEUNG)(A)(GGf)", NULL,
|
||||
"agga", "(IEUNG)(A)(GGi)(A)", NULL,
|
||||
"la", "(R)(A)", "ra",
|
||||
"bs", "(Bi)(EU)(Sf)", "beus",
|
||||
"kalgga", "(Ki)(A)(L)(GGi)(A)", NULL,
|
||||
|
||||
// 'r' in a final position is treated like 'l'
|
||||
"karka", "(Ki)(A)(L)(Ki)(A)", "kalka",
|
||||
};
|
||||
|
||||
enum { CASE_length = sizeof(CASE) / sizeof(CASE[0]) };
|
||||
|
||||
int32_t i;
|
||||
for (i=0; i<CASE_length; i+=3) {
|
||||
UnicodeString jamo = nameToJamo(CASE[i+1]);
|
||||
if (CASE[i+2] == NULL) {
|
||||
expect(*latinJamo, CASE[i], jamo, *jamoLatin);
|
||||
} else {
|
||||
// Handle case where round-trip is expected to fail
|
||||
expect(*latinJamo, CASE[i], jamo);
|
||||
expect(*jamoLatin, jamo, CASE[i+2]);
|
||||
}
|
||||
}
|
||||
|
||||
delete latinJamo;
|
||||
delete jamoLatin;
|
||||
}
|
||||
|
||||
void
|
||||
JamoTest::TestRealText() {
|
||||
// Test text taken from the Unicode web site
|
||||
const char* WHAT_IS_UNICODE[] = {
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc\\uc5d0", "\\ub300\\ud574", "?",
|
||||
|
||||
"\\uc5b4\\ub5a4", "\\ud50c\\ub7ab\\ud3fc,", "\\uc5b4\\ub5a4",
|
||||
"\\ud504\\ub85c\\uadf8\\ub7a8,", "\\uc5b4\\ub5a4", "\\uc5b8\\uc5b4\\uc5d0\\ub3c4",
|
||||
"\\uc0c1\\uad00\\uc5c6\\uc774", "\\uc720\\ub2c8\\ucf54\\ub4dc\\ub294", "\\ubaa8\\ub4e0",
|
||||
"\\ubb38\\uc790\\uc5d0", "\\ub300\\ud574", "\\uace0\\uc720", "\\ubc88\\ud638\\ub97c",
|
||||
"\\uc81c\\uacf5\\ud569\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\uae30\\ubcf8\\uc801\\uc73c\\ub85c", "\\ucef4\\ud4e8\\ud130\\ub294",
|
||||
"\\uc22b\\uc790\\ub9cc", "\\ucc98\\ub9ac\\ud569\\ub2c8\\ub2e4.", "\\uae00\\uc790\\ub098",
|
||||
"\\ub2e4\\ub978", "\\ubb38\\uc790\\uc5d0\\ub3c4", "\\uc22b\\uc790\\ub97c",
|
||||
"\\uc9c0\\uc815\\ud558\\uc5ec",
|
||||
"\\uc800\\uc7a5\\ud569\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc\\uac00",
|
||||
"\\uac1c\\ubc1c\\ub418\\uae30", "\\uc804\\uc5d0\\ub294", "\\uc774\\ub7ec\\ud55c",
|
||||
"\\uc22b\\uc790\\ub97c", "\\uc9c0\\uc815\\ud558\\uae30", "\\uc704\\ud574", "\\uc218\\ubc31",
|
||||
"\\uac00\\uc9c0\\uc758", "\\ub2e4\\ub978", "\\uae30\\ud638\\ud654",
|
||||
"\\uc2dc\\uc2a4\\ud15c\\uc744",
|
||||
"\\uc0ac\\uc6a9\\ud588\\uc2b5\\ub2c8\\ub2e4.", "\\ub2e8\\uc77c", "\\uae30\\ud638\\ud654",
|
||||
"\\ubc29\\ubc95\\uc73c\\ub85c\\ub294", "\\ubaa8\\ub4e0", "\\ubb38\\uc790\\ub97c",
|
||||
"\\ud3ec\\ud568\\ud560", "\\uc218", "\\uc5c6\\uc5c8\\uc2b5\\ub2c8\\ub2e4.", "\\uc608\\ub97c",
|
||||
"\\ub4e4\\uc5b4", "\\uc720\\ub7fd", "\\uc5f0\\ud569\\uc5d0\\uc11c\\ub9cc",
|
||||
"\\ubcf4\\ub354\\ub77c\\ub3c4", "\\ubaa8\\ub4e0", "\\uac01", "\\ub098\\ub77c\\ubcc4",
|
||||
"\\uc5b8\\uc5b4\\ub97c", "\\ucc98\\ub9ac\\ud558\\ub824\\uba74", "\\uc5ec\\ub7ec",
|
||||
"\\uac1c\\uc758", "\\ub2e4\\ub978", "\\uae30\\ud638\\ud654", "\\ubc29\\ubc95\\uc774",
|
||||
"\\ud544\\uc694\\ud569\\ub2c8\\ub2e4.", "\\uc601\\uc5b4\\uc640", "\\uac19\\uc740",
|
||||
"\\ub2e8\\uc77c", "\\uc5b8\\uc5b4\\uc758", "\\uacbd\\uc6b0\\ub3c4",
|
||||
"\\uacf5\\ud1b5\\uc801\\uc73c\\ub85c", "\\uc0ac\\uc6a9\\ub418\\ub294", "\\ubaa8\\ub4e0",
|
||||
"\\uae00\\uc790,", "\\ubb38\\uc7a5", "\\ubd80\\ud638", "\\ubc0f",
|
||||
"\\ud14c\\ud06c\\ub2c8\\uceec", "\\uae30\\ud638\\uc5d0", "\\ub9de\\ub294", "\\ub2e8\\uc77c",
|
||||
"\\uae30\\ud638\\ud654", "\\ubc29\\ubc95\\uc744", "\\uac16\\uace0", "\\uc788\\uc9c0",
|
||||
"\\ubabb\\ud558\\uc600\\uc2b5\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\uc774\\ub7ec\\ud55c", "\\uae30\\ud638\\ud654", "\\uc2dc\\uc2a4\\ud15c\\uc740",
|
||||
"\\ub610\\ud55c", "\\ub2e4\\ub978", "\\uae30\\ud638\\ud654", "\\uc2dc\\uc2a4\\ud15c\\uacfc",
|
||||
"\\ucda9\\ub3cc\\ud569\\ub2c8\\ub2e4.", "\\uc989", "\\ub450", "\\uac00\\uc9c0",
|
||||
"\\uae30\\ud638\\ud654", "\\ubc29\\ubc95\\uc774", "\\ub450", "\\uac1c\\uc758", "\\ub2e4\\ub978",
|
||||
"\\ubb38\\uc790\\uc5d0", "\\ub300\\ud574", "\\uac19\\uc740", "\\ubc88\\ud638\\ub97c",
|
||||
"\\uc0ac\\uc6a9\\ud558\\uac70\\ub098", "\\uac19\\uc740", "\\ubb38\\uc790\\uc5d0",
|
||||
"\\ub300\\ud574", "\\ub2e4\\ub978", "\\ubc88\\ud638\\ub97c", "\\uc0ac\\uc6a9\\ud560", "\\uc218",
|
||||
"\\uc788\\uc2b5\\ub2c8\\ub2e4.", "\\uc8fc\\uc5b4\\uc9c4", "\\ubaa8\\ub4e0",
|
||||
"\\ucef4\\ud4e8\\ud130(\\ud2b9\\ud788", "\\uc11c\\ubc84)\\ub294", "\\uc11c\\ub85c",
|
||||
"\\ub2e4\\ub978", "\\uc5ec\\ub7ec", "\\uac00\\uc9c0", "\\uae30\\ud638\\ud654",
|
||||
"\\ubc29\\ubc95\\uc744", "\\uc9c0\\uc6d0\\ud574\\uc57c",
|
||||
"\\ud569\\ub2c8\\ub2e4.", "\\uadf8\\ub7ec\\ub098,", "\\ub370\\uc774\\ud130\\ub97c",
|
||||
"\\uc11c\\ub85c", "\\ub2e4\\ub978", "\\uae30\\ud638\\ud654", "\\ubc29\\ubc95\\uc774\\ub098",
|
||||
"\\ud50c\\ub7ab\\ud3fc", "\\uac04\\uc5d0", "\\uc804\\ub2ec\\ud560", "\\ub54c\\ub9c8\\ub2e4",
|
||||
"\\uadf8", "\\ub370\\uc774\\ud130\\ub294", "\\ud56d\\uc0c1", "\\uc190\\uc0c1\\uc758",
|
||||
"\\uc704\\ud5d8\\uc744", "\\uacaa\\uac8c", "\\ub429\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc\\ub85c", "\\ubaa8\\ub4e0", "\\uac83\\uc744",
|
||||
"\\ud574\\uacb0\\ud560", "\\uc218", "\\uc788\\uc2b5\\ub2c8\\ub2e4!",
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc\\ub294", "\\uc0ac\\uc6a9", "\\uc911\\uc778",
|
||||
"\\ud50c\\ub7ab\\ud3fc,", "\\ud504\\ub85c\\uadf8\\ub7a8,", "\\uc5b8\\uc5b4\\uc5d0",
|
||||
"\\uad00\\uacc4\\uc5c6\\uc774", "\\ubb38\\uc790\\ub9c8\\ub2e4", "\\uace0\\uc720\\ud55c",
|
||||
"\\uc22b\\uc790\\ub97c",
|
||||
"\\uc81c\\uacf5\\ud569\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc",
|
||||
"\\ud45c\\uc900\\uc740", // "Apple, HP, IBM, JustSystem, Microsoft, Oracle, SAP, "
|
||||
// "Sun, Sybase, Unisys "
|
||||
"\\ubc0f", "\\uae30\\ud0c0", "\\uc5ec\\ub7ec",
|
||||
"\\ud68c\\uc0ac\\uc640", "\\uac19\\uc740", "\\uc5c5\\uacc4",
|
||||
"\\uc120\\ub450\\uc8fc\\uc790\\uc5d0", "\\uc758\\ud574",
|
||||
"\\ucc44\\ud0dd\\ub418\\uc5c8\\uc2b5\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc\\ub294",
|
||||
// "XML, Java, ECMAScript(JavaScript), LDAP, CORBA 3.0, WML "
|
||||
"\\ub4f1\\uacfc",
|
||||
"\\uac19\\uc774", "\\ud604\\uc7ac", "\\ub110\\ub9ac", "\\uc0ac\\uc6a9\\ub418\\ub294",
|
||||
"\\ud45c\\uc900\\uc5d0\\uc11c", "\\ud544\\uc694\\ud558\\uba70", "\\uc774\\ub294", "ISO/IEC",
|
||||
"10646\\uc744", "\\uad6c\\ud604\\ud558\\ub294", "\\uacf5\\uc2dd\\uc801\\uc778",
|
||||
"\\ubc29\\ubc95\\uc785\\ub2c8\\ub2e4.", "\\uc774\\ub294", "\\ub9ce\\uc740", "\\uc6b4\\uc601",
|
||||
"\\uccb4\\uc81c,", "\\uc694\\uc998", "\\uc0ac\\uc6a9\\ub418\\ub294", "\\ubaa8\\ub4e0",
|
||||
"\\ube0c\\ub77c\\uc6b0\\uc800", "\\ubc0f", "\\uae30\\ud0c0", "\\ub9ce\\uc740",
|
||||
"\\uc81c\\ud488\\uc5d0\\uc11c",
|
||||
"\\uc9c0\\uc6d0\\ub429\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc",
|
||||
"\\ud45c\\uc900\\uc758", "\\ubd80\\uc0c1\\uacfc", "\\uc774\\ub97c",
|
||||
"\\uc9c0\\uc6d0\\ud558\\ub294", "\\ub3c4\\uad6c\\uc758", "\\uac00\\uc6a9\\uc131\\uc740",
|
||||
"\\ucd5c\\uadfc", "\\uc804", "\\uc138\\uacc4\\uc5d0", "\\ubd88\\uace0", "\\uc788\\ub294",
|
||||
"\\uae30\\uc220", "\\uacbd\\ud5a5\\uc5d0\\uc11c", "\\uac00\\uc7a5", "\\uc911\\uc694\\ud55c",
|
||||
"\\ubd80\\ubd84\\uc744", "\\ucc28\\uc9c0\\ud558\\uace0", "\\uc788\\uc2b5\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc\\ub97c",
|
||||
"\\ud074\\ub77c\\uc774\\uc5b8\\ud2b8-\\uc11c\\ubc84", "\\ub610\\ub294",
|
||||
"\\ub2e4\\uc911-\\uc5f0\\uacb0", "\\uc751\\uc6a9", "\\ud504\\ub85c\\uadf8\\ub7a8\\uacfc",
|
||||
"\\uc6f9", "\\uc0ac\\uc774\\ud2b8\\uc5d0", "\\ud1b5\\ud569\\ud558\\uba74",
|
||||
"\\ub808\\uac70\\uc2dc", "\\ubb38\\uc790", "\\uc138\\ud2b8", "\\uc0ac\\uc6a9\\uc5d0",
|
||||
"\\uc788\\uc5b4\\uc11c", "\\uc0c1\\ub2f9\\ud55c", "\\ube44\\uc6a9", "\\uc808\\uac10",
|
||||
"\\ud6a8\\uacfc\\uac00",
|
||||
"\\ub098\\ud0c0\\ub0a9\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc\\ub97c",
|
||||
"\\ud1b5\\ud574", "\\ub9ac\\uc5d4\\uc9c0\\ub2c8\\uc5b4\\ub9c1", "\\uc5c6\\uc774",
|
||||
"\\ub2e4\\uc911", "\\ud50c\\ub7ab\\ud3fc,", "\\uc5b8\\uc5b4", "\\ubc0f", "\\uad6d\\uac00",
|
||||
"\\uac04\\uc5d0", "\\ub2e8\\uc77c", "\\uc18c\\ud504\\ud2b8\\uc6e8\\uc5b4",
|
||||
"\\ud50c\\ub7ab\\ud3fc", "\\ub610\\ub294", "\\ub2e8\\uc77c", "\\uc6f9",
|
||||
"\\uc0ac\\uc774\\ud2b8\\ub97c", "\\ubaa9\\ud45c\\ub85c", "\\uc0bc\\uc744", "\\uc218",
|
||||
"\\uc788\\uc2b5\\ub2c8\\ub2e4.", "\\uc774\\ub97c", "\\uc0ac\\uc6a9\\ud558\\uba74",
|
||||
"\\ub370\\uc774\\ud130\\ub97c", "\\uc190\\uc0c1", "\\uc5c6\\uc774", "\\uc5ec\\ub7ec",
|
||||
"\\uc2dc\\uc2a4\\ud15c\\uc744", "\\ud1b5\\ud574", "\\uc804\\uc1a1\\ud560", "\\uc218",
|
||||
"\\uc788\\uc2b5\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc", "\\ucf58\\uc18c\\uc2dc\\uc5c4\\uc5d0", "\\ub300\\ud574",
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc", "\\ucf58\\uc18c\\uc2dc\\uc5c4\\uc740",
|
||||
"\\ube44\\uc601\\ub9ac", "\\uc870\\uc9c1\\uc73c\\ub85c\\uc11c", "\\ud604\\ub300",
|
||||
"\\uc18c\\ud504\\ud2b8\\uc6e8\\uc5b4", "\\uc81c\\ud488\\uacfc",
|
||||
"\\ud45c\\uc900\\uc5d0\\uc11c", "\\ud14d\\uc2a4\\ud2b8\\uc758", "\\ud45c\\ud604\\uc744",
|
||||
"\\uc9c0\\uc815\\ud558\\ub294", "\\uc720\\ub2c8\\ucf54\\ub4dc", "\\ud45c\\uc900\\uc758",
|
||||
"\\uc0ac\\uc6a9\\uc744", "\\uac1c\\ubc1c\\ud558\\uace0", "\\ud655\\uc7a5\\ud558\\uba70",
|
||||
"\\uc7a5\\ub824\\ud558\\uae30", "\\uc704\\ud574",
|
||||
"\\uc138\\uc6cc\\uc84c\\uc2b5\\ub2c8\\ub2e4.", "\\ucf58\\uc18c\\uc2dc\\uc5c4",
|
||||
"\\uba64\\ubc84\\uc27d\\uc740", "\\ucef4\\ud4e8\\ud130\\uc640", "\\uc815\\ubcf4",
|
||||
"\\ucc98\\ub9ac", "\\uc0b0\\uc5c5\\uc5d0", "\\uc885\\uc0ac\\ud558\\uace0", "\\uc788\\ub294",
|
||||
"\\uad11\\ubc94\\uc704\\ud55c", "\\ud68c\\uc0ac", "\\ubc0f", "\\uc870\\uc9c1\\uc758",
|
||||
"\\ubc94\\uc704\\ub97c",
|
||||
"\\ub098\\ud0c0\\ub0c5\\ub2c8\\ub2e4.", "\\ucf58\\uc18c\\uc2dc\\uc5c4\\uc758",
|
||||
"\\uc7ac\\uc815\\uc740", "\\uc804\\uc801\\uc73c\\ub85c", "\\ud68c\\ube44\\uc5d0",
|
||||
"\\uc758\\ud574", "\\ucda9\\ub2f9\\ub429\\ub2c8\\ub2e4.", "\\uc720\\ub2c8\\ucf54\\ub4dc",
|
||||
"\\ucee8\\uc18c\\uc2dc\\uc5c4\\uc5d0\\uc11c\\uc758", "\\uba64\\ubc84\\uc27d\\uc740",
|
||||
"\\uc804", "\\uc138\\uacc4", "\\uc5b4\\ub290", "\\uacf3\\uc5d0\\uc11c\\ub098",
|
||||
"\\uc720\\ub2c8\\ucf54\\ub4dc", "\\ud45c\\uc900\\uc744", "\\uc9c0\\uc6d0\\ud558\\uace0",
|
||||
"\\uadf8", "\\ud655\\uc7a5\\uacfc", "\\uad6c\\ud604\\uc744",
|
||||
"\\uc9c0\\uc6d0\\ud558\\uace0\\uc790\\ud558\\ub294", "\\uc870\\uc9c1\\uacfc",
|
||||
"\\uac1c\\uc778\\uc5d0\\uac8c", "\\uac1c\\ubc29\\ub418\\uc5b4",
|
||||
"\\uc788\\uc2b5\\ub2c8\\ub2e4.",
|
||||
|
||||
"\\ub354", "\\uc790\\uc138\\ud55c", "\\ub0b4\\uc6a9\\uc740", "\\uc6a9\\uc5b4\\uc9d1,",
|
||||
"\\uc608\\uc81c", "\\uc720\\ub2c8\\ucf54\\ub4dc", "\\uc0ac\\uc6a9", "\\uac00\\ub2a5",
|
||||
"\\uc81c\\ud488,", "\\uae30\\uc220", "\\uc815\\ubcf4", "\\ubc0f", "\\uae30\\ud0c0",
|
||||
"\\uc720\\uc6a9\\ud55c", "\\uc815\\ubcf4\\ub97c",
|
||||
"\\ucc38\\uc870\\ud558\\uc2ed\\uc2dc\\uc624."
|
||||
};
|
||||
|
||||
enum { WHAT_IS_UNICODE_length = sizeof(WHAT_IS_UNICODE) / sizeof(WHAT_IS_UNICODE[0]) };
|
||||
|
||||
Transliterator* latinJamo = Transliterator::createInstance("Latin-Jamo");
|
||||
Transliterator* jamoHangul = Transliterator::createInstance("Jamo-Hangul");
|
||||
if (latinJamo == 0 || jamoHangul == 0) {
|
||||
delete latinJamo;
|
||||
delete jamoHangul;
|
||||
errln("FAIL: createInstance returned NULL");
|
||||
return;
|
||||
}
|
||||
Transliterator* jamoLatin = latinJamo->createInverse();
|
||||
Transliterator* hangulJamo = jamoHangul->createInverse();
|
||||
if (jamoLatin == 0 || hangulJamo == 0) {
|
||||
errln("FAIL: createInverse returned NULL");
|
||||
delete latinJamo;
|
||||
delete jamoLatin;
|
||||
delete jamoHangul;
|
||||
delete hangulJamo;
|
||||
return;
|
||||
}
|
||||
|
||||
Transliterator* tarray[4] =
|
||||
{ hangulJamo, jamoLatin, latinJamo, jamoHangul };
|
||||
CompoundTransliterator rt(tarray, 4);
|
||||
|
||||
UnicodeString buf;
|
||||
int32_t total = 0;
|
||||
int32_t errors = 0;
|
||||
int32_t i;
|
||||
for (i=0; i < WHAT_IS_UNICODE_length; ++i) {
|
||||
++total;
|
||||
UnicodeString hangul = WHAT_IS_UNICODE[i];
|
||||
hangul = hangul.unescape(); // Parse backslash-u escapes
|
||||
UnicodeString hangulX = hangul;
|
||||
rt.transliterate(hangulX);
|
||||
if (hangul != hangulX) {
|
||||
++errors;
|
||||
UnicodeString jamo = hangul; hangulJamo->transliterate(jamo);
|
||||
UnicodeString latin = jamo; jamoLatin->transliterate(latin);
|
||||
UnicodeString jamo2 = latin; latinJamo->transliterate(jamo2);
|
||||
UnicodeString hangul2 = jamo2; jamoHangul->transliterate(hangul2);
|
||||
|
||||
buf.remove(0);
|
||||
buf.append("FAIL: ");
|
||||
if (hangul2 != hangulX) {
|
||||
buf.append((UnicodeString)"(Weird: " + hangulX + " != " + hangul2 + ")");
|
||||
}
|
||||
// The Hangul-Jamo conversion is not usually the
|
||||
// bug here, so we hide it from display.
|
||||
// Uncomment lines to see the Hangul.
|
||||
buf.append(//hangul + " => " +
|
||||
jamoToName(jamo) + " => " +
|
||||
latin + " => " + jamoToName(jamo2)
|
||||
//+ " => " + hangul2
|
||||
);
|
||||
errln(prettify(buf));
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
errln((UnicodeString)"Test word failures: " + errors + " out of " + total);
|
||||
} else {
|
||||
logln((UnicodeString)"All " + total + " test words passed");
|
||||
}
|
||||
|
||||
delete latinJamo;
|
||||
delete jamoLatin;
|
||||
delete jamoHangul;
|
||||
delete hangulJamo;
|
||||
}
|
||||
|
||||
// Override TransliteratorTest
|
||||
void
|
||||
JamoTest::expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult) {
|
||||
UnicodeString jsum = jamoToName(summary);
|
||||
UnicodeString jexp = jamoToName(expectedResult);
|
||||
TransliteratorTest::expectAux(tag, jsum, pass, jexp);
|
||||
}
|
||||
|
||||
/**
 * Transliterator rules pairing a parenthesized short name with a single
 * jamo code point, e.g. '(Gi)' <> U+1100.  Every rule is bidirectional
 * ("<>"), so the same text builds both the Name-Jamo (forward) and the
 * Jamo-Name (reverse) transliterators used by nameToJamo()/jamoToName().
 *
 * The adjacent string literals are concatenated by the compiler into
 * one rule string; each rule is terminated by ';'.
 */
const char* JamoTest::JAMO_NAMES_RULES =
    // Initial consonants (choseong), U+1100..U+1112
    "'(Gi)' <> \\u1100;"
    "'(GGi)' <> \\u1101;"
    "'(Ni)' <> \\u1102;"
    "'(Di)' <> \\u1103;"
    "'(DD)' <> \\u1104;"
    "'(R)' <> \\u1105;"
    "'(Mi)' <> \\u1106;"
    "'(Bi)' <> \\u1107;"
    "'(BB)' <> \\u1108;"
    "'(Si)' <> \\u1109;"
    "'(SSi)' <> \\u110A;"
    "'(IEUNG)' <> \\u110B;"
    "'(Ji)' <> \\u110C;"
    "'(JJ)' <> \\u110D;"
    "'(Ci)' <> \\u110E;"
    "'(Ki)' <> \\u110F;"
    "'(Ti)' <> \\u1110;"
    "'(Pi)' <> \\u1111;"
    "'(Hi)' <> \\u1112;"

    // Medial vowels (jungseong), U+1161..U+1175
    "'(A)' <> \\u1161;"
    "'(AE)' <> \\u1162;"
    "'(YA)' <> \\u1163;"
    "'(YAE)' <> \\u1164;"
    "'(EO)' <> \\u1165;"
    "'(E)' <> \\u1166;"
    "'(YEO)' <> \\u1167;"
    "'(YE)' <> \\u1168;"
    "'(O)' <> \\u1169;"
    "'(WA)' <> \\u116A;"
    "'(WAE)' <> \\u116B;"
    "'(OE)' <> \\u116C;"
    "'(YO)' <> \\u116D;"
    "'(U)' <> \\u116E;"
    "'(WEO)' <> \\u116F;"
    "'(WE)' <> \\u1170;"
    "'(WI)' <> \\u1171;"
    "'(YU)' <> \\u1172;"
    "'(EU)' <> \\u1173;"
    "'(YI)' <> \\u1174;"
    "'(I)' <> \\u1175;"

    // Trailing consonants (jongseong), U+11A8..U+11C2
    "'(Gf)' <> \\u11A8;"
    "'(GGf)' <> \\u11A9;"
    "'(GS)' <> \\u11AA;"
    "'(Nf)' <> \\u11AB;"
    "'(NJ)' <> \\u11AC;"
    "'(NH)' <> \\u11AD;"
    "'(Df)' <> \\u11AE;"
    "'(L)' <> \\u11AF;"
    "'(LG)' <> \\u11B0;"
    "'(LM)' <> \\u11B1;"
    "'(LB)' <> \\u11B2;"
    "'(LS)' <> \\u11B3;"
    "'(LT)' <> \\u11B4;"
    "'(LP)' <> \\u11B5;"
    "'(LH)' <> \\u11B6;"
    "'(Mf)' <> \\u11B7;"
    "'(Bf)' <> \\u11B8;"
    "'(BS)' <> \\u11B9;"
    "'(Sf)' <> \\u11BA;"
    "'(SSf)' <> \\u11BB;"
    "'(NG)' <> \\u11BC;"
    "'(Jf)' <> \\u11BD;"
    "'(Cf)' <> \\u11BE;"
    "'(Kf)' <> \\u11BF;"
    "'(Tf)' <> \\u11C0;"
    "'(Pf)' <> \\u11C1;"
    "'(Hf)' <> \\u11C2;";
|
||||
|
||||
// Cached Jamo-Name transliterator; stays null until jamoToName()
// builds it from JAMO_NAMES_RULES.
Transliterator* JamoTest::JAMO_NAME = NULL;

// Cached Name-Jamo transliterator; stays null until nameToJamo()
// builds it from JAMO_NAMES_RULES.
Transliterator* JamoTest::NAME_JAMO = NULL;
|
||||
|
||||
/**
|
||||
* Convert short names to actual jamo. E.g., "x(LG)y" returns
|
||||
* "x\u11B0y". See JAMO_NAMES for table of names.
|
||||
*/
|
||||
UnicodeString
|
||||
JamoTest::nameToJamo(const UnicodeString& input) {
|
||||
if (NAME_JAMO == 0) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
NAME_JAMO = new RuleBasedTransliterator("Name-Jamo",
|
||||
JAMO_NAMES_RULES,
|
||||
UTRANS_FORWARD, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete NAME_JAMO;
|
||||
NAME_JAMO = 0;
|
||||
return input;
|
||||
}
|
||||
}
|
||||
UnicodeString result(input);
|
||||
NAME_JAMO->transliterate(result);
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert jamo to short names. E.g., "x\u11B0y" returns
|
||||
* "x(LG)y". See JAMO_NAMES for table of names.
|
||||
*/
|
||||
UnicodeString
|
||||
JamoTest::jamoToName(const UnicodeString& input) {
|
||||
if (JAMO_NAME == 0) {
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
JAMO_NAME = new RuleBasedTransliterator("Jamo-Name",
|
||||
JAMO_NAMES_RULES,
|
||||
UTRANS_REVERSE, status);
|
||||
if (U_FAILURE(status)) {
|
||||
delete JAMO_NAME;
|
||||
JAMO_NAME = 0;
|
||||
return input;
|
||||
}
|
||||
}
|
||||
UnicodeString result(input);
|
||||
JAMO_NAME->transliterate(result);
|
||||
return result;
|
||||
}
|
37
icu4c/source/test/intltest/jamotest.h
Normal file
37
icu4c/source/test/intltest/jamotest.h
Normal file
@ -0,0 +1,37 @@
|
||||
#ifndef JAMOTEST_H
|
||||
#define JAMOTEST_H
|
||||
|
||||
#include "transtst.h"
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @summary Test of Latin-Jamo and Jamo-Latin rules
|
||||
*/
|
||||
class JamoTest : public TransliteratorTest {
|
||||
|
||||
void runIndexedTest(int32_t index, UBool exec, const char* &name,
|
||||
char* par=NULL);
|
||||
|
||||
void TestJamo(void);
|
||||
|
||||
void TestRealText(void);
|
||||
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
|
||||
// Override TransliteratorTest
|
||||
virtual void expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult);
|
||||
|
||||
// Methods to convert Jamo to/from readable short names,
|
||||
// e.g. (Gi) <> U+1100
|
||||
static const char* JAMO_NAMES_RULES;
|
||||
static Transliterator* JAMO_NAME;
|
||||
static Transliterator* NAME_JAMO;
|
||||
static UnicodeString nameToJamo(const UnicodeString& input);
|
||||
static UnicodeString jamoToName(const UnicodeString& input);
|
||||
};
|
||||
|
||||
#endif
|
@ -130,6 +130,7 @@ class TransliteratorTest : public IntlTest {
|
||||
//======================================================================
|
||||
// Support methods
|
||||
//======================================================================
|
||||
protected:
|
||||
void expect(const UnicodeString& rules,
|
||||
const UnicodeString& source,
|
||||
const UnicodeString& expectedResult);
|
||||
@ -148,7 +149,7 @@ class TransliteratorTest : public IntlTest {
|
||||
const UnicodeString& result,
|
||||
const UnicodeString& expectedResult);
|
||||
|
||||
void expectAux(const UnicodeString& tag,
|
||||
virtual void expectAux(const UnicodeString& tag,
|
||||
const UnicodeString& summary, UBool pass,
|
||||
const UnicodeString& expectedResult);
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user