From 650f6510e5a0ea3be66cbe3963f6e0b7bfa832b9 Mon Sep 17 00:00:00 2001 From: Alan Liu Date: Wed, 28 Feb 2001 19:49:07 +0000 Subject: [PATCH] ICU-872 Update Latin-Jamo and Jamo-Latin rules X-SVN-Rev: 3826 --- icu4c/data/ljamo.txt | 517 ++++++++++++++++++++++++ icu4c/source/test/intltest/Makefile.in | 3 +- icu4c/source/test/intltest/ittrans.cpp | 2 + icu4c/source/test/intltest/jamotest.cpp | 418 +++++++++++++++++++ icu4c/source/test/intltest/jamotest.h | 37 ++ icu4c/source/test/intltest/transtst.h | 3 +- 6 files changed, 978 insertions(+), 2 deletions(-) create mode 100644 icu4c/data/ljamo.txt create mode 100644 icu4c/source/test/intltest/jamotest.cpp create mode 100644 icu4c/source/test/intltest/jamotest.h diff --git a/icu4c/data/ljamo.txt b/icu4c/data/ljamo.txt new file mode 100644 index 0000000000..c6c938fc65 --- /dev/null +++ b/icu4c/data/ljamo.txt @@ -0,0 +1,517 @@ + // -*- Coding: utf-8; -*- +//-------------------------------------------------------------------- +// Copyright (c) 1999-2001, International Business Machines +// Corporation and others. All Rights Reserved. +//-------------------------------------------------------------------- +// THIS IS A MACHINE-GENERATED FILE +// Tool: dumpICUrules.bat +// Source: \icu4j\src\com\ibm\text\resources/Transliterator_Latin_Jamo.utf8.txt +// Date: Wed Feb 28 11:52:16 2001 +//-------------------------------------------------------------------- + +// Latin-Jamo + +ljamo { + Rule { +//-------------------------------------------------------------------- +// Copyright (c) 1999-2001, International Business Machines +// Corporation and others. All Rights Reserved. +//-------------------------------------------------------------------- + +// Latin-Jamo + +// Transliteration from Latin characters to Korean script is done in +// two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul +// transliteration is done algorithmically following Unicode 3.0 +// section 3.11. 
This file implements the Latin to Jamo +// transliteration using rules. + +// Jamo occupy the block 1100-11FF. Within this block there are three +// groups of characters: initial consonants or choseong (I), medial +// vowels or jungseong (M), and trailing consonants or jongseong (F). +// Standard Korean syllables are of the form I+M+F*. + +// Section 3.11 describes the use of 'filler' jamo to convert +// nonstandard syllables to standard form: the choseong filler 115F and +// the jungseong filler 1160. In this transliterator, we will not use +// 115F or 1160. + +// We will, however, insert two 'null' jamo to make foreign words +// conform to Korean syllable structure. These are the null initial +// consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text, +// we will use the hyphen in order to disambiguate strings, +// e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G). + +// We will not use all of the characters in the jamo block. We will +// only use the 19 initials, 21 medials, and 27 finals possessing a +// jamo short name as defined in section 4.4 of the Unicode book. + +// Rules of thumb. These guidelines provide the basic framework +// for the rules. They are phrased in terms of Latin-Jamo transliteration. +// The Jamo-Latin rules derive from these, since the Jamo-Latin rules are +// just context-free transliteration of jamo to corresponding short names, +// with the addition of hyphens to maintain round-trip integrity +// in the context of the Latin-Jamo rules. + +// A sequence of vowels: +// - Take the longest sequence you can. If there are too many, or you don't +// have a starting consonant, introduce a 110B as necessary. + +// A sequence of consonants. +// - First join the double consonants: G + G -> GG +// - In the remaining list, +// -- If there is no preceding vowel, take the first consonant, and insert EU +// after it. Continue with the rest of the consonants. 
+// -- If there is one consonant, attach to the following vowel +// -- If there are two consonants and a following vowel, attach one to the +// preceding vowel, and one to the following vowel. +// -- If there are more than two consonants, join the first two together if you +// can: L + G => LG +// -- If you still end up with more than 2 consonants, insert EU after the +// first one, and continue with the rest of the consonants. + +//---------------------------------------------------------------------- +// Variables + +// Some latin consonants or consonant pairs only occur as initials, and +// some only as finals, but some occur as both. This makes some jamo +// consonants ambiguous when transliterated into latin. +// Initial only: IEUNG BB DD JJ R +// Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ +// Initial and Final: B C D G GG H J K M N P S SS T + + "$Gi = \u1100;" + "$GGi = \u1101;" + "$Ni = \u1102;" + "$Di = \u1103;" + "$DD = \u1104;" + "$R = \u1105;" + "$Mi = \u1106;" + "$Bi = \u1107;" + "$BB = \u1108;" + "$Si = \u1109;" + "$SSi = \u110A;" + "$IEUNG = \u110B;" // null initial, inserted during Latin-Jamo + "$Ji = \u110C;" + "$JJ = \u110D;" + "$Ci = \u110E;" + "$Ki = \u110F;" + "$Ti = \u1110;" + "$Pi = \u1111;" + "$Hi = \u1112;" + + "$A = \u1161;" + "$AE = \u1162;" + "$YA = \u1163;" + "$YAE = \u1164;" + "$EO = \u1165;" + "$E = \u1166;" + "$YEO = \u1167;" + "$YE = \u1168;" + "$O = \u1169;" + "$WA = \u116A;" + "$WAE = \u116B;" + "$OE = \u116C;" + "$YO = \u116D;" + "$U = \u116E;" + "$WEO = \u116F;" + "$WE = \u1170;" + "$WI = \u1171;" + "$YU = \u1172;" + "$EU = \u1173;" // null medial, inserted during Latin-Jamo + "$YI = \u1174;" + "$I = \u1175;" + + "$Gf = \u11A8;" + "$GGf = \u11A9;" + "$GS = \u11AA;" + "$Nf = \u11AB;" + "$NJ = \u11AC;" + "$NH = \u11AD;" + "$Df = \u11AE;" + "$L = \u11AF;" + "$LG = \u11B0;" + "$LM = \u11B1;" + "$LB = \u11B2;" + "$LS = \u11B3;" + "$LT = \u11B4;" + "$LP = \u11B5;" + "$LH = \u11B6;" + "$Mf = \u11B7;" + "$Bf = \u11B8;" + "$BS = 
\u11B9;" + "$Sf = \u11BA;" + "$SSf = \u11BB;" + "$NG = \u11BC;" + "$Jf = \u11BD;" + "$Cf = \u11BE;" + "$Kf = \u11BF;" + "$Tf = \u11C0;" + "$Pf = \u11C1;" + "$Hf = \u11C2;" + + "$jamoInitial = [\u1100-\u1112];" + + "$jamoMedial = [\u1161-\u1175];" + + "$latinInitial = [bcdghjkmnprst];" + + // Any character in the latin transliteration of a medial + "$latinMedial = [aeiouwy];" + + // The last character of the latin transliteration of a medial + "$latinMedialEnd = [aeiou];" + +//---------------------------------------------------------------------- +// Jamo-Latin + +// Jamo to latin is relatively simple, since it is the latin that is +// ambiguous. Most rules are straightforward, and we encode them below +// as simple add-on back rule, e.g.: + +// $jamoMedial {bs} > $BS; + +// becomes + +// $jamoMedial {bs} <> $BS; + +// Furthermore, we don't care about the ordering for Jamo-Latin because +// we are going from single characters, so we can very easily piggyback +// on the Latin-Jamo. + +// The main issue with Jamo-Latin is when to insert hyphens. +// Hyphens are inserted to obtain correct round trip behavior. For +// example, the sequence Ki A Gf Gi E, if transliterated to "kagge", +// would then round trip to Ki A GGi E. To prevent this, we insert a +// hyphen: "kag'ge". IMPORTANT: The need for hyphens depends +// very specifically on the behavior of the Latin-Jamo rules. A change +// in the Latin-Jamo behavior can completely change the way the +// hyphen insertion must be done. + +// First try to preserve actual hyphens in the jamo text by doubling +// them. This fixes problems like: +// (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol +// => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional +// -- if we don't care about losing hyphens in the jamo, we can delete +// this rule. + + "'--' <> '-';" + +// Triple consonants. 
For three consonants "axxx" we insert a +// hyphen between the first and second "x" if XXf, Xf, and Xi all +// exist, and we have A Xf XXi. This prevents the reverse +// transliteration to A XXf Xi. + + "'-' < $latinMedialEnd g {} $GGi;" + "'-' < $latinMedialEnd s {} $SSi;" + +// For vowels the rule is similar. If there is a vowel "ae" such that +// "a" by itself and "e" by itself are vowels, then we want to map A E +// to "a'e" so as not to round trip to AE. However, in the text Ki EO +// IEUNG E we don't need to map to "keo'e". "keoe" suffices. For +// vowels of the form "aei", in theory both "ae" + "i" and "a" + "ei" +// must be tested, but in practice only the former occurs. + + "'-' < $latinInitial [ye we] {} $O;" + "'-' < $latinInitial e {} [$O $U];" + "'-' < $latinInitial [o a wa ya] {} $E;" + +// Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E, +// where Xi also exists, must be transliterated as "ax'e" to prevent +// the round trip conversion to A Xi E. + + "'-' < $latinMedialEnd b {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd c {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd d {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd g {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd h {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd j {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd k {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd m {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd n {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd p {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd s {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd t {} $IEUNG $jamoMedial;" + +// Double finals followed by IEUNG. Similar to the single finals +// followed by IEUNG. Any latin consonant pair X Y, between medials, +// that we would split by Latin-Jamo, we must handle when it occurs as +// part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi +// E. 
+ + "'-' < $latinMedialEnd b s {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd g g {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd g s {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l b {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l g {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l h {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l m {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l p {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l s {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd l t {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd n g {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd n h {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd n j {} $IEUNG $jamoMedial;" + "'-' < $latinMedialEnd s s {} $IEUNG $jamoMedial;" + +// Split doubles. Text of the form A Xi Xf E, where XXi also occurs, +// we transliterate as "ax'xe" to prevent round trip transliteration as +// A XXi E. + + "'-' < $latinMedialEnd b {} $Bi $jamoMedial;" + "'-' < $latinMedialEnd d {} $Di $jamoMedial;" + "'-' < $latinMedialEnd j {} $Ji $jamoMedial;" + "'-' < $latinMedialEnd g {} $Gi $jamoMedial;" + "'-' < $latinMedialEnd s {} $Si $jamoMedial;" + +// XYY. This corresponds to the XYY rule in Latin-Jamo. By default +// Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result, +// "xyy" forms that correspond to XYf Yi must be transliterated as +// "xy'y". + + "'-' < $latinMedialEnd b s {} $Si;" + "'-' < $latinMedialEnd g s {} $Si;" + "'-' < $latinMedialEnd l b {} $Bi;" + "'-' < $latinMedialEnd l g {} $Gi;" + "'-' < $latinMedialEnd l s {} $Si;" + "'-' < $latinMedialEnd n g {} $Gi;" + "'-' < $latinMedialEnd n j {} $Ji;" + +// Deletion of IEUNG is handled below. + +//---------------------------------------------------------------------- +// Latin-Jamo + +// [Basic, context-free Jamo-Latin rules are embedded here too. See +// above.] 
+ +// Split digraphs: Text of the form 'axye', where 'xy' is a final +// digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and +// 'e' are medials, we want to transliterate this as A Xf Yi E rather +// than A XYf IEUNG E. We do NOT include text of the form "axxe", +// since that is handled differently below. These rules are generated +// programmatically from the jamo data. + + "$jamoMedial {b s} $latinMedial > $Bf $Si;" + "$jamoMedial {g s} $latinMedial > $Gf $Si;" + "$jamoMedial {l b} $latinMedial > $L $Bi;" + "$jamoMedial {l g} $latinMedial > $L $Gi;" + "$jamoMedial {l h} $latinMedial > $L $Hi;" + "$jamoMedial {l m} $latinMedial > $L $Mi;" + "$jamoMedial {l p} $latinMedial > $L $Pi;" + "$jamoMedial {l s} $latinMedial > $L $Si;" + "$jamoMedial {l t} $latinMedial > $L $Ti;" + "$jamoMedial {n g} $latinMedial > $Nf $Gi;" + "$jamoMedial {n h} $latinMedial > $Nf $Hi;" + "$jamoMedial {n j} $latinMedial > $Nf $Ji;" + +// Single consonants are initials: Text of the form 'axe', where 'x' +// can be an initial or a final, and 'a' and 'e' are medials, we want +// to transliterate as A Xi E rather than A Xf IEUNG E. + + "$jamoMedial {b} $latinMedial > $Bi;" + "$jamoMedial {c} $latinMedial > $Ci;" + "$jamoMedial {d} $latinMedial > $Di;" + "$jamoMedial {g} $latinMedial > $Gi;" + "$jamoMedial {h} $latinMedial > $Hi;" + "$jamoMedial {j} $latinMedial > $Ji;" + "$jamoMedial {k} $latinMedial > $Ki;" + "$jamoMedial {m} $latinMedial > $Mi;" + "$jamoMedial {n} $latinMedial > $Ni;" + "$jamoMedial {p} $latinMedial > $Pi;" + "$jamoMedial {s} $latinMedial > $Si;" + "$jamoMedial {t} $latinMedial > $Ti;" + +// Doubled initials. The sequence "axxe", where XX exists as an initial +// (XXi), and also Xi and Xf exist (true of all digraphs XX), we want +// to transliterate as A XXi E, rather than split to A Xf Xi E. 
+ + "$jamoMedial {b b} $latinMedial > $BB;" + "$jamoMedial {d d} $latinMedial > $DD;" + "$jamoMedial {j j} $latinMedial > $JJ;" + "$jamoMedial {g g} $latinMedial > $GGi;" + "$jamoMedial {s s} $latinMedial > $SSi;" + +// XYY. Because doubled consonants bind more strongly than XY +// consonants, we must handle the sequence "axyy" specially. Here XYf +// and YYi must exist. In these cases, we map to Xf YYi rather than +// XYf. + + "$jamoMedial {b} s s > $Bf;" + "$jamoMedial {g} s s > $Gf;" + "$jamoMedial {l} b b > $L;" + "$jamoMedial {l} g g > $L;" + "$jamoMedial {l} s s > $L;" + "$jamoMedial {n} g g > $Nf;" + "$jamoMedial {n} j j > $Nf;" + +// Finals: Attach consonant with preceding medial to preceding medial. +// Do this BEFORE mapping consonants to initials. Longer keys must +// precede shorter keys that they start with, e.g., the rule for 'bs' +// must precede 'b'. + +// [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this +// block for Jamo-Latin.] + + "$jamoMedial {bs} <> $BS;" + "$jamoMedial {b} <> $Bf;" + "$jamoMedial {c} <> $Cf;" + "$jamoMedial {d} <> $Df;" + "$jamoMedial {gg} <> $GGf;" + "$jamoMedial {gs} <> $GS;" + "$jamoMedial {g} <> $Gf;" + "$jamoMedial {h} <> $Hf;" + "$jamoMedial {j} <> $Jf;" + "$jamoMedial {k} <> $Kf;" + "$jamoMedial {lb} <> $LB; $jamoMedial {lg} <> $LG;" + "$jamoMedial {lh} <> $LH;" + "$jamoMedial {lm} <> $LM;" + "$jamoMedial {lp} <> $LP;" + "$jamoMedial {ls} <> $LS;" + "$jamoMedial {lt} <> $LT;" + "$jamoMedial {l} <> $L;" + "$jamoMedial {m} <> $Mf;" + "$jamoMedial {ng} <> $NG;" + "$jamoMedial {nh} <> $NH;" + "$jamoMedial {nj} <> $NJ;" + "$jamoMedial {n} <> $Nf;" + "$jamoMedial {p} <> $Pf;" + "$jamoMedial {ss} <> $SSf;" + "$jamoMedial {s} <> $Sf;" + "$jamoMedial {t} <> $Tf;" + +// Initials: Attach single consonant to following medial. Do this +// AFTER mapping finals. Longer keys must precede shorter keys that +// they start with, e.g., the rule for 'gg' must precede 'g'. + +// [BASIC Jamo-Latin INITIALS handled here. 
Order irrelevant within +// this block for Jamo-Latin.] + + "{gg} $latinMedial <> $GGi;" + "{g} $latinMedial <> $Gi;" + "{n} $latinMedial <> $Ni;" + "{dd} $latinMedial <> $DD;" + "{d} $latinMedial <> $Di;" + "{r} $latinMedial <> $R;" + "{m} $latinMedial <> $Mi;" + "{bb} $latinMedial <> $BB;" + "{b} $latinMedial <> $Bi;" + "{ss} $latinMedial <> $SSi;" + "{s} $latinMedial <> $Si;" + "{jj} $latinMedial <> $JJ;" + "{j} $latinMedial <> $Ji;" + "{c} $latinMedial <> $Ci;" + "{k} $latinMedial <> $Ki;" + "{t} $latinMedial <> $Ti;" + "{p} $latinMedial <> $Pi;" + "{h} $latinMedial <> $Hi;" + +// 'r' in final position. Because of the equivalency of the 'l' and +// 'r' jamo (the glyphs are the same), we try to provide the same +// equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled +// below. If we see an 'r' in an apparent final position, treat it +// like 'l'. For example, "karka" => Ki A R EU Ki A without this rule. +// Instead, we want Ki A L Ki A. + + "$jamoMedial {r} $latinInitial > | l;" + +// Initial + Final: If we match the next rule, we have initial then +// final consonant with no intervening medial. We insert the null +// vowel BEFORE it to create a well-formed syllable. (In the next rule +// we insert a null vowel AFTER an anomalous initial.) + + "$jamoInitial {} [bcdghjklmnpst] > $EU;" + +// Initial + X: This block matches an initial consonant not followed by +// a medial. We insert the null vowel after it. We handle double +// initials explicitly here; for single initial consonants we insert EU +// (as Latin) after them and let standard rules do the rest. + +// BREAKS ROUND TRIP INTEGRITY + + "gg > $GGi $EU;" + "dd > $DD $EU;" + "bb > $BB $EU;" + "ss > $SSi $EU;" + "jj > $JJ $EU;" + + "([bcdghjkmnprst]) > | $1 eu;" + +// X + Final: Finally we have to deal with a consonant that can only be +// interpreted as a final (not an initial) and which is preceded +// neither by an initial nor a medial. It is the start of the +// syllable, but cannot be. 
Most of these will already be handled by +// the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng' +// 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'. +// For this isolated case, we could add a null initial and medial, +// which would give "la" => IEUNG EU L IEUNG A, for example. A more +// economical solution is to transliterate isolated "l" (that is, +// initial "l") to "r". (Other similar conversions of consonants that +// occur neither as initials nor as finals are handled below.) + + "l > | r;" + +// Medials. If a medial is preceded by an initial, then we proceed +// normally. As usual, longer keys must precede shorter ones. + +// [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within +// this block for Jamo-Latin.] + + "$jamoInitial {ae} <> $AE;" + "$jamoInitial {a} <> $A;" + "$jamoInitial {eo} <> $EO;" + "$jamoInitial {eu} <> $EU;" + "$jamoInitial {e} <> $E;" + "$jamoInitial {i} <> $I;" + "$jamoInitial {oe} <> $OE;" + "$jamoInitial {o} <> $O;" + "$jamoInitial {u} <> $U;" + "$jamoInitial {wae} <> $WAE;" + "$jamoInitial {wa} <> $WA;" + "$jamoInitial {weo} <> $WEO;" + "$jamoInitial {we} <> $WE;" + "$jamoInitial {wi} <> $WI;" + "$jamoInitial {yae} <> $YAE;" + "$jamoInitial {ya} <> $YA;" + "$jamoInitial {yeo} <> $YEO;" + "$jamoInitial {ye} <> $YE;" + "$jamoInitial {yi} <> $YI;" + "$jamoInitial {yo} <> $YO;" + "$jamoInitial {yu} <> $YU;" + +// We may see an anomalous isolated 'w' or 'y'. In that case, we +// interpret it as 'wi' and 'yu', respectively. + +// BREAKS ROUND TRIP INTEGRITY + + "$jamoInitial {w} > | wi;" + "$jamoInitial {y} > | yu;" + +// Otherwise, insert a null consonant IEUNG before the medial (which is +// still an untransliterated latin vowel). + + "($latinMedial) > $IEUNG | $1;" + +// Convert non-jamo latin consonants to equivalents. These occur as +// neither initials nor finals in jamo. 'l' occurs as a final, but not +// an initial; it is handled above. 
The following letters (left hand +// side) will never be output by Jamo-Latin. + + "f > | p;" + "q > | k;" + "v > | b;" + "x > | ks;" + "z > | s;" + +// Delete hyphens (Latin-Jamo). + + "'-' > ;" + +// Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels, +// since these may also occur in text. + + "< $IEUNG;" + +// eof + } +} diff --git a/icu4c/source/test/intltest/Makefile.in b/icu4c/source/test/intltest/Makefile.in index 70cb36fceb..6469ceb0ff 100644 --- a/icu4c/source/test/intltest/Makefile.in +++ b/icu4c/source/test/intltest/Makefile.in @@ -46,7 +46,8 @@ tfsmalls.o tmsgfmt.o trcoll.o tscoll.o tsdate.o tsdcfmsy.o tsdtfmsy.o \ tsmthred.o tsmutex.o tsnmfmt.o tsputil.o tstnorm.o tzbdtest.o \ tzregts.o tztest.o ucdtest.o usettest.o ustrtest.o transtst.o strtest.o thcoll.o \ itrbbi.o rbbiapts.o rbbitst.o ittrans.o transapi.o cpdtrtst.o unhxtrts.o hxuntrts.o \ -jahatrts.o hajatrts.o ufltlgts.o testutil.o transrt.o normconf.o sfwdchit.o +jahatrts.o hajatrts.o ufltlgts.o testutil.o transrt.o normconf.o sfwdchit.o \ +jamotest.o DEPS = $(OBJECTS:.o=.d) diff --git a/icu4c/source/test/intltest/ittrans.cpp b/icu4c/source/test/intltest/ittrans.cpp index e04a2078bc..f4351b71f9 100644 --- a/icu4c/source/test/intltest/ittrans.cpp +++ b/icu4c/source/test/intltest/ittrans.cpp @@ -30,6 +30,7 @@ #include "ufltlgts.h" #include "transrt.h" #include "usettest.h" +#include "jamotest.h" #define CASE(id,test) case id: \ name = #test; \ @@ -54,6 +55,7 @@ void IntlTestTransliterator::runIndexedTest( int32_t index, UBool exec, const ch CASE(7, UnicodeFilterLogicTest); CASE(8, TransliteratorRoundTripTest); CASE(9, UnicodeSetTest); + CASE(10, JamoTest); default: name=""; break; } } diff --git a/icu4c/source/test/intltest/jamotest.cpp b/icu4c/source/test/intltest/jamotest.cpp new file mode 100644 index 0000000000..b6c332cd5f --- /dev/null +++ b/icu4c/source/test/intltest/jamotest.cpp @@ -0,0 +1,418 @@ +#include "jamotest.h" +#include "unicode/utypes.h" +#include 
"unicode/translit.h" +#include "unicode/rbt.h" +#include "unicode/cpdtrans.h" + +#define CASE(id,test) case id: \ + name = #test; \ + if (exec) { \ + logln(#test "---"); \ + logln((UnicodeString)""); \ + test(); \ + } \ + break + +void +JamoTest::runIndexedTest(int32_t index, UBool exec, + const char* &name, char* /*par*/) { /* selects a test by index; CASE sets name and, when exec is true, logs and runs the test */ + switch (index) { + CASE(0,TestJamo); + CASE(1,TestRealText); + default: name = ""; break; /* unrecognized index: report an empty name */ + } +} + +void +JamoTest::TestJamo() { + Transliterator* latinJamo = Transliterator::createInstance("Latin-Jamo"); + + if (latinJamo == 0) { + errln("FAIL: createInstance() returned 0"); + return; + } + + Transliterator* jamoLatin = latinJamo->createInverse(); + + if (jamoLatin == 0) { + delete latinJamo; /* release the forward transliterator before bailing out */ + errln("FAIL: createInverse() returned 0"); + return; + } + + const char* CASE[] = { + // Column 1 is the latin text L1 to be fed to Latin-Jamo + // to yield output J. + + // Column 2 is expected value of J. J is fed to + // Jamo-Latin to yield output L2. + + // Column 3 is expected value of L2. If the expected + // value of L2 is L1, then column 3 is NULL. 
+ "bab", "(Bi)(A)(Bf)", NULL, + "babb", "(Bi)(A)(Bf)(Bi)(EU)", "bab-beu", + "babbba", "(Bi)(A)(Bf)(BB)(A)", NULL, + "bagg", "(Bi)(A)(GGf)", NULL, + "baggga", "(Bi)(A)(GGf)(Gi)(A)", NULL, + "bag-gga", "(Bi)(A)(Gf)(GGi)(A)", NULL, + "kabsa", "(Ki)(A)(Bf)(Si)(A)", NULL, + "kabska", "(Ki)(A)(BS)(Ki)(A)", NULL, + "gabsbka", "(Gi)(A)(BS)(Bi)(EU)(Ki)(A)", "gabsbeuka", // not (Kf) + "gga", "(GGi)(A)", NULL, + "bsa", "(Bi)(EU)(Si)(A)", "beusa", + "agg", "(IEUNG)(A)(GGf)", NULL, + "agga", "(IEUNG)(A)(GGi)(A)", NULL, + "la", "(R)(A)", "ra", + "bs", "(Bi)(EU)(Sf)", "beus", + "kalgga", "(Ki)(A)(L)(GGi)(A)", NULL, + + // 'r' in a final position is treated like 'l' + "karka", "(Ki)(A)(L)(Ki)(A)", "kalka", + }; + + enum { CASE_length = sizeof(CASE) / sizeof(CASE[0]) }; + + int32_t i; + for (i=0; icreateInverse(); + Transliterator* hangulJamo = jamoHangul->createInverse(); + if (jamoLatin == 0 || hangulJamo == 0) { + errln("FAIL: createInverse returned NULL"); + delete latinJamo; + delete jamoLatin; + delete jamoHangul; + delete hangulJamo; + return; + } + + Transliterator* tarray[4] = + { hangulJamo, jamoLatin, latinJamo, jamoHangul }; + CompoundTransliterator rt(tarray, 4); + + UnicodeString buf; + int32_t total = 0; + int32_t errors = 0; + int32_t i; + for (i=0; i < WHAT_IS_UNICODE_length; ++i) { + ++total; + UnicodeString hangul = WHAT_IS_UNICODE[i]; + hangul = hangul.unescape(); // Parse backslash-u escapes + UnicodeString hangulX = hangul; + rt.transliterate(hangulX); + if (hangul != hangulX) { + ++errors; + UnicodeString jamo = hangul; hangulJamo->transliterate(jamo); + UnicodeString latin = jamo; jamoLatin->transliterate(latin); + UnicodeString jamo2 = latin; latinJamo->transliterate(jamo2); + UnicodeString hangul2 = jamo2; jamoHangul->transliterate(hangul2); + + buf.remove(0); + buf.append("FAIL: "); + if (hangul2 != hangulX) { + buf.append((UnicodeString)"(Weird: " + hangulX + " != " + hangul2 + ")"); + } + // The Hangul-Jamo conversion is not usually the + // bug here, 
so we hide it from display. + // Uncomment lines to see the Hangul. + buf.append(//hangul + " => " + + jamoToName(jamo) + " => " + + latin + " => " + jamoToName(jamo2) + //+ " => " + hangul2 + ); + errln(prettify(buf)); + } + } + if (errors != 0) { + errln((UnicodeString)"Test word failures: " + errors + " out of " + total); + } else { + logln((UnicodeString)"All " + total + " test words passed"); + } + + delete latinJamo; + delete jamoLatin; + delete jamoHangul; + delete hangulJamo; +} + +// Override TransliteratorTest: convert jamo to readable short names, then delegate to the base class +void +JamoTest::expectAux(const UnicodeString& tag, + const UnicodeString& summary, UBool pass, + const UnicodeString& expectedResult) { + UnicodeString jsum = jamoToName(summary); + UnicodeString jexp = jamoToName(expectedResult); + TransliteratorTest::expectAux(tag, jsum, pass, jexp); +} + +const char* JamoTest::JAMO_NAMES_RULES = /* two-way "(name)" <> jamo rules; used forward by nameToJamo() and in reverse by jamoToName() */ + "'(Gi)' <> \\u1100;" + "'(GGi)' <> \\u1101;" + "'(Ni)' <> \\u1102;" + "'(Di)' <> \\u1103;" + "'(DD)' <> \\u1104;" + "'(R)' <> \\u1105;" + "'(Mi)' <> \\u1106;" + "'(Bi)' <> \\u1107;" + "'(BB)' <> \\u1108;" + "'(Si)' <> \\u1109;" + "'(SSi)' <> \\u110A;" + "'(IEUNG)' <> \\u110B;" + "'(Ji)' <> \\u110C;" + "'(JJ)' <> \\u110D;" + "'(Ci)' <> \\u110E;" + "'(Ki)' <> \\u110F;" + "'(Ti)' <> \\u1110;" + "'(Pi)' <> \\u1111;" + "'(Hi)' <> \\u1112;" + + "'(A)' <> \\u1161;" + "'(AE)' <> \\u1162;" + "'(YA)' <> \\u1163;" + "'(YAE)' <> \\u1164;" + "'(EO)' <> \\u1165;" + "'(E)' <> \\u1166;" + "'(YEO)' <> \\u1167;" + "'(YE)' <> \\u1168;" + "'(O)' <> \\u1169;" + "'(WA)' <> \\u116A;" + "'(WAE)' <> \\u116B;" + "'(OE)' <> \\u116C;" + "'(YO)' <> \\u116D;" + "'(U)' <> \\u116E;" + "'(WEO)' <> \\u116F;" + "'(WE)' <> \\u1170;" + "'(WI)' <> \\u1171;" + "'(YU)' <> \\u1172;" + "'(EU)' <> \\u1173;" + "'(YI)' <> \\u1174;" + "'(I)' <> \\u1175;" + + "'(Gf)' <> \\u11A8;" + "'(GGf)' <> \\u11A9;" + "'(GS)' <> \\u11AA;" + "'(Nf)' <> \\u11AB;" + "'(NJ)' <> \\u11AC;" + "'(NH)' <> \\u11AD;" + "'(Df)' <> \\u11AE;" + "'(L)' <> \\u11AF;" + "'(LG)' <> \\u11B0;" + 
"'(LM)' <> \\u11B1;" + "'(LB)' <> \\u11B2;" + "'(LS)' <> \\u11B3;" + "'(LT)' <> \\u11B4;" + "'(LP)' <> \\u11B5;" + "'(LH)' <> \\u11B6;" + "'(Mf)' <> \\u11B7;" + "'(Bf)' <> \\u11B8;" + "'(BS)' <> \\u11B9;" + "'(Sf)' <> \\u11BA;" + "'(SSf)' <> \\u11BB;" + "'(NG)' <> \\u11BC;" + "'(Jf)' <> \\u11BD;" + "'(Cf)' <> \\u11BE;" + "'(Kf)' <> \\u11BF;" + "'(Tf)' <> \\u11C0;" + "'(Pf)' <> \\u11C1;" + "'(Hf)' <> \\u11C2;"; + +Transliterator* JamoTest::JAMO_NAME = 0; /* cache filled by the first jamoToName() call */ +Transliterator* JamoTest::NAME_JAMO = 0; /* cache filled by the first nameToJamo() call */ + +/** + * Convert short names to actual jamo. E.g., "x(LG)y" returns + * "x\u11B0y". See JAMO_NAMES_RULES for table of names. + */ +UnicodeString +JamoTest::nameToJamo(const UnicodeString& input) { + if (NAME_JAMO == 0) { /* first use: build the transliterator from the shared rules */ + UErrorCode status = U_ZERO_ERROR; + NAME_JAMO = new RuleBasedTransliterator("Name-Jamo", + JAMO_NAMES_RULES, + UTRANS_FORWARD, status); + if (U_FAILURE(status)) { /* construction failed: drop the object and return the input unchanged */ + delete NAME_JAMO; + NAME_JAMO = 0; + return input; + } + } + UnicodeString result(input); /* work on a copy; the caller's string is const */ + NAME_JAMO->transliterate(result); + return result; +} + +/** + * Convert jamo to short names. E.g., "x\u11B0y" returns + * "x(LG)y". See JAMO_NAMES_RULES for table of names. 
+ */ +UnicodeString +JamoTest::jamoToName(const UnicodeString& input) { + if (JAMO_NAME == 0) { /* first use: build the reverse-direction transliterator from the shared rules */ + UErrorCode status = U_ZERO_ERROR; + JAMO_NAME = new RuleBasedTransliterator("Jamo-Name", + JAMO_NAMES_RULES, + UTRANS_REVERSE, status); + if (U_FAILURE(status)) { /* construction failed: drop the object and return the input unchanged */ + delete JAMO_NAME; + JAMO_NAME = 0; + return input; + } + } + UnicodeString result(input); /* work on a copy; the caller's string is const */ + JAMO_NAME->transliterate(result); + return result; +} diff --git a/icu4c/source/test/intltest/jamotest.h b/icu4c/source/test/intltest/jamotest.h new file mode 100644 index 0000000000..221ccf2a34 --- /dev/null +++ b/icu4c/source/test/intltest/jamotest.h @@ -0,0 +1,37 @@ +#ifndef JAMOTEST_H +#define JAMOTEST_H + +#include "transtst.h" + +/** + * @test + * @summary Test of Latin-Jamo and Jamo-Latin rules + */ +class JamoTest : public TransliteratorTest { + + void runIndexedTest(int32_t index, UBool exec, const char* &name, + char* par=NULL); + + void TestJamo(void); + + void TestRealText(void); + + //====================================================================== + // Support methods + //====================================================================== + + // Override TransliteratorTest + virtual void expectAux(const UnicodeString& tag, + const UnicodeString& summary, UBool pass, + const UnicodeString& expectedResult); + + // Methods to convert Jamo to/from readable short names, + // e.g. 
(Gi) <> U+1100 + static const char* JAMO_NAMES_RULES; /* rule text shared by both conversion directions */ + static Transliterator* JAMO_NAME; /* created on first jamoToName() call; not released in the code visible here */ + static Transliterator* NAME_JAMO; /* created on first nameToJamo() call; not released in the code visible here */ + static UnicodeString nameToJamo(const UnicodeString& input); + static UnicodeString jamoToName(const UnicodeString& input); +}; + +#endif diff --git a/icu4c/source/test/intltest/transtst.h b/icu4c/source/test/intltest/transtst.h index 8826c61a50..f2405f16df 100644 --- a/icu4c/source/test/intltest/transtst.h +++ b/icu4c/source/test/intltest/transtst.h @@ -130,6 +130,7 @@ class TransliteratorTest : public IntlTest { //====================================================================== // Support methods //====================================================================== + protected: void expect(const UnicodeString& rules, const UnicodeString& source, const UnicodeString& expectedResult); @@ -148,7 +149,7 @@ class TransliteratorTest : public IntlTest { const UnicodeString& result, const UnicodeString& expectedResult); - void expectAux(const UnicodeString& tag, + virtual void expectAux(const UnicodeString& tag, const UnicodeString& summary, UBool pass, const UnicodeString& expectedResult); };