scuffed-code/icu4c/data/translit/ljamo.txt

275 lines
9.2 KiB
Plaintext
Raw Normal View History

2000-01-13 14:11:40 +00:00
//--------------------------------------------------------------------
// Copyright (c) 2000, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 01/13/2000 aliu Creation.
//--------------------------------------------------------------------
// Transliteration rules between Latin letters and conjoining Hangul Jamo
// (U+1100..U+11F9), stored as one rule string built from the adjacent
// string literals below.
//
// Notation as used by the rules in this file:
//   a > b      forward (Latin => Jamo) only
//   a < b      reverse (Jamo => Latin) only
//   a <> b     both directions
//   (x)        context that must be present but is not itself replaced
//   {name}     reference to a variable defined in the VARIABLES section
//   |          "revisit" marker: the output after it is processed again
//   ''         a literal apostrophe
// Rules are order-sensitive: longer sequences are listed before their
// prefixes, and several rules are intentionally masked by earlier ones.
ljamo {
Rule {
// VARIABLES
// Character sets referenced from the rules as {name}.
"medial=[\u1160-\u11A7];"
"final=[\u11A8-\u11F9];" // added - aliu
"vowel=[aeiouwyAEIOUWY\u1160-\u11A7];"
// The sets below list the Latin letters that can precede a given letter
// inside a digraph; they are used by the APOSTROPHE rules.
"ye=[yeYE];"
"ywe=[yweYWE];"
"yw=[ywYW];"
"nl=[nlNL];"
"nj=[njNJ];" // "nj" and "jj" both end in j
"gnl=[gnlGNL];"
"lsgb=[lsgbLSGB];"
"ywao=[ywaoYWAO];"
"bl=[blBL];"
// RULES
// Hangul syllable structure is IMF or IM
// (I = initial/choseong, M = medial/jungseong, F = final/jongseong).
// Because syllables are adjacent, the legal neighbouring pairs are:
//   IM, but not II or IF
//   MF or MI, but not MM
//   FI, but not FF or FM
// For English, we just have C (consonant) or V (vowel).
// To generate valid Hangul:
// Vowels:
//   We insert IEUNG between VV, and otherwise map V to M.
//   (The original note continued "We also insert IEUNG if there is no"
//   and was cut off there; only the vowel-vowel case is implemented by
//   the rule directly below.)
// Consonants:
//   We don't break doubles.
//   Cases like lmgg, we have to break at lm.
//   So to guess whether a consonant is I or F,
//   we map all C's to F, except when followed by a vowel, i.e.
//     X followed by {vowel} > CHOSEONG (initial)
//     X                     > JONGSEONG (final)
// Special insertion for funny sequences of vowels: an already-converted
// medial directly followed by another vowel gets an IEUNG between them.
"({medial}) ({vowel}) > \u110B;" // HANGUL CHOSEONG IEUNG
// Fix casing.
// Because Korean is caseless, we just want to treat everything as
// lowercase.
// We could do this by always preceding this transliterator with
// an upper-lowercase transformation, but that wouldn't invert nicely.
// We use the "revisit" syntax to just convert Latin to Latin, so that
// we can avoid having to restate all the Latin=>Jamo rules, with the
// I/F handling.
// We don't have to add titlecase, since that will be picked up
// since the first letter is converted, then revisited. E.g.
// |Gg => |gg => {sang kiyeok}
// We do have to have all caps, since otherwise we could get:
// |GG => |gG => {kiyeok}|G => {kiyeok}|g => {kiyeok}{kiyeok}
"Z > |z;"
"YU > |yu;"
"YO > |yo;"
"YI > |yi;"
"YEO > |yeo;"
"YE > |ye;"
"YAE > |yae;"
"YA > |ya;"
"Y > |y;"
"WI > |wi;"
"WEO > |weo;"
"WE > |we;"
"WAE > |wae;"
"WA > |wa;"
"W > |w;"
"U > |u;"
"T > |t;"
"SS > |ss;"
"S > |s;"
"P > |p;"
"OE > |oe;"
"O > |o;"
"NJ > |nj;"
"NH > |nh;"
"NG > |ng;"
"N > |n;"
"M > |m;"
"LT > |lt;"
"LS > |ls;"
"LP > |lp;"
"LM > |lm;"
"LH > |lh;"
"LG > |lg;"
"LB > |lb;"
"L > |l;"
"K > |k;"
"JJ > |jj;"
"J > |j;"
"I > |i;"
"H > |h;"
"GS > |gs;"
"GG > |gg;"
"G > |g;"
"EU > |eu;"
"EO > |eo;"
"E > |e;"
"DD > |dd;"
"D > |d;"
"BS > |bs;"
"BB > |bb;"
"B > |b;"
"AE > |ae;"
"A > |a;"
// APOSTROPHE
// As always, an apostrophe is used to separate digraphs into
// singles. That is, if you really wanted [KAN][GGAN], instead
// of [KANG][GAN] you would write "kan'ggan".
// Rules for inserting ' when mapping separated digraphs back
// from Hangul to Latin. Catch every letter that can be the
// LAST of a digraph (or multigraph). The context is the Latin letter
// already produced just before; the jamo is the SINGLE-letter jamo whose
// Latin form would otherwise fuse with that letter.
"''u < ({ye}) \u116e;" // hangul jungseong u
"''t < (l) \u11c0;" // hangul jongseong thieuth
"''t < (l) \u1110;" // hangul choseong thieuth
"''s < ({lsgb}) \u11ba;" // hangul jongseong sios
"''s < ({lsgb}) \u1109;" // hangul choseong sios
"''p < (l) \u11c1;" // hangul jongseong phieuph
"''p < (l) \u1111;" // hangul choseong phieuph
"''o < ({ywe}) \u1169;" // hangul jungseong o
"''m < (l) \u11b7;" // hangul jongseong mieum
"''m < (l) \u1106;" // hangul choseong mieum
// j ends both "nj" and "jj", so guard against a preceding n or j
// (previously only n was listed, so final-j + initial-j came back as
// "jj" and re-read as ssangcieuc).
"''j < ({nj}) \u11bd;" // hangul jongseong cieuc
"''j < ({nj}) \u110c;" // hangul choseong cieuc
"''i < ({yw}) \u1175;" // hangul jungseong i
"''h < ({nl}) \u11c2;" // hangul jongseong hieuh
"''h < ({nl}) \u1112;" // hangul choseong hieuh
// Must be the single kiyeok U+11A8 (Latin "g"), not ssangkiyeok U+11A9
// (Latin "gg"), to match the other jongseong rules in this section.
"''g < ({gnl}) \u11a8;" // hangul jongseong kiyeok
"''g < ({gnl}) \u1100;" // hangul choseong kiyeok
"''e < ({ywao}) \u1166;" // hangul jungseong e
"''d < (d) \u11ae;" // hangul jongseong tikeut
"''d < (d) \u1103;" // hangul choseong tikeut
"''b < ({bl}) \u11b8;" // hangul jongseong pieup
"''b < ({bl}) \u1107;" // hangul choseong pieup
"''a < ({yw}) \u1161;" // hangul jungseong a
// INITIALS
// A consonant directly followed by a vowel becomes a choseong.
// Doubles (ss, gg, bb) are listed before their single letters.
"t ({vowel}) <> \u1110;" // hangul choseong thieuth
"ss ({vowel}) <> \u110a;" // hangul choseong ssangsios
"s ({vowel}) <> \u1109;" // hangul choseong sios
"p ({vowel}) <> \u1111;" // hangul choseong phieuph
"n ({vowel}) <> \u1102;" // hangul choseong nieun
"m ({vowel}) <> \u1106;" // hangul choseong mieum
"l ({vowel}) <> \u1105;" // hangul choseong rieul
"k ({vowel}) <> \u110f;" // hangul choseong khieukh
"j ({vowel}) <> \u110c;" // hangul choseong cieuc
"h ({vowel}) <> \u1112;" // hangul choseong hieuh
"gg ({vowel}) <> \u1101;" // hangul choseong ssangkiyeok
"g ({vowel}) <> \u1100;" // hangul choseong kiyeok
"d ({vowel}) <> \u1103;" // hangul choseong tikeut
"c ({vowel}) <> \u110e;" // hangul choseong chieuch
"bb ({vowel}) <> \u1108;" // hangul choseong ssangpieup
"b ({vowel}) <> \u1107;" // hangul choseong pieup
// If we have gotten through to these rules, and we start with
// a consonant, then the remaining mappings would be to F,
// because we must have CC (or C<non-letter>), not CV.
// If we have F before us, then
// we would end up with FF, which is wrong. The simplest fix is
// to still make it an initial, but also insert a "u",
// so we end up with F, I, u, and then continue with the C.
// These rules are forward-only.
"({final}) t > \u1110\u116e;" // hangul choseong thieuth + jungseong u
"({final}) ss > \u110a\u116e;" // hangul choseong ssangsios + jungseong u
"({final}) s > \u1109\u116e;" // hangul choseong sios + jungseong u
"({final}) p > \u1111\u116e;" // hangul choseong phieuph + jungseong u
"({final}) n > \u1102\u116e;" // hangul choseong nieun + jungseong u
"({final}) m > \u1106\u116e;" // hangul choseong mieum + jungseong u
"({final}) l > \u1105\u116e;" // hangul choseong rieul + jungseong u
"({final}) k > \u110f\u116e;" // hangul choseong khieukh + jungseong u
"({final}) j > \u110c\u116e;" // hangul choseong cieuc + jungseong u
"({final}) h > \u1112\u116e;" // hangul choseong hieuh + jungseong u
"({final}) gg > \u1101\u116e;" // hangul choseong ssangkiyeok + jungseong u
"({final}) g > \u1100\u116e;" // hangul choseong kiyeok + jungseong u
"({final}) d > \u1103\u116e;" // hangul choseong tikeut + jungseong u
"({final}) c > \u110e\u116e;" // hangul choseong chieuch + jungseong u
"({final}) bb > \u1108\u116e;" // hangul choseong ssangpieup + jungseong u
"({final}) b > \u1107\u116e;" // hangul choseong pieup + jungseong u
// MEDIALS (vowels) and FINALS
// Context-free fallbacks, listed in reverse alphabetical order so that
// longer Latin sequences come before their prefixes.
"yu <> \u1172;" // hangul jungseong yu
"yo <> \u116d;" // hangul jungseong yo
"yi <> \u1174;" // hangul jungseong yi
"yeo <> \u1167;" // hangul jungseong yeo
"ye <> \u1168;" // hangul jungseong ye
"yae <> \u1164;" // hangul jungseong yae
"ya <> \u1163;" // hangul jungseong ya
"wi <> \u1171;" // hangul jungseong wi
"weo <> \u116f;" // hangul jungseong weo
"we <> \u1170;" // hangul jungseong we
"wae <> \u116b;" // hangul jungseong wae
"wa <> \u116a;" // hangul jungseong wa
"u <> \u116e;" // hangul jungseong u
"t <> \u11c0;" // hangul jongseong thieuth
"ss <> \u11bb;" // hangul jongseong ssangsios
"s <> \u11ba;" // hangul jongseong sios
"p <> \u11c1;" // hangul jongseong phieuph
"oe <> \u116c;" // hangul jungseong oe
"o <> \u1169;" // hangul jungseong o
"nj <> \u11ac;" // hangul jongseong nieun-cieuc
"nh <> \u11ad;" // hangul jongseong nieun-hieuh
"ng <> \u11bc;" // hangul jongseong ieung
"n <> \u11ab;" // hangul jongseong nieun
"m <> \u11b7;" // hangul jongseong mieum
"lt <> \u11b4;" // hangul jongseong rieul-thieuth
"ls <> \u11b3;" // hangul jongseong rieul-sios
"lp <> \u11b5;" // hangul jongseong rieul-phieuph
"lm <> \u11b1;" // hangul jongseong rieul-mieum
"lh <> \u11b6;" // hangul jongseong rieul-hieuh
"lg <> \u11b0;" // hangul jongseong rieul-kiyeok
"lb <> \u11b2;" // hangul jongseong rieul-pieup
"l <> \u11af;" // hangul jongseong rieul
"k <> \u11bf;" // hangul jongseong khieukh
// Unlike its neighbours, this entry targets a choseong; there is no
// INITIALS rule for "jj", so it is handled only here.
"jj <> \u110d;" // hangul choseong ssangcieuc
"j <> \u11bd;" // hangul jongseong cieuc
"i <> \u1175;" // hangul jungseong i
"h <> \u11c2;" // hangul jongseong hieuh
"gs <> \u11aa;" // hangul jongseong kiyeok-sios
"gg <> \u11a9;" // hangul jongseong ssangkiyeok
"g <> \u11a8;" // hangul jongseong kiyeok
"eu <> \u1173;" // hangul jungseong eu
"eo <> \u1165;" // hangul jungseong eo
"e <> \u1166;" // hangul jungseong e
// Same as "jj" above: targets a choseong, and "dd" has no INITIALS rule.
"dd <> \u1104;" // hangul choseong ssangtikeut
"d <> \u11ae;" // hangul jongseong tikeut
"c <> \u11be;" // hangul jongseong chieuch
"bs <> \u11b9;" // hangul jongseong pieup-sios
"b <> \u11b8;" // hangul jongseong pieup
"ae <> \u1162;" // hangul jungseong ae
"a <> \u1161;" // hangul jungseong a
// Extra English letters
// {moved to bottom - aliu}
// Letters with no Jamo of their own are rewritten to a Latin letter that
// has one and then revisited. Entries shown commented out as "masked"
// could never fire because an earlier rule already consumes that letter.
"z > |s;"
//{ "Z > |s;" } masked by "Z > |z;" in the casing rules
"x > |ks;"
"X > |ks;"
"v > |b;"
"V > |b;"
"r > |l;"
"R > |l;"
"q > |k;"
"Q > |k;"
"f > |p;"
"F > |p;"
//{ "c > |k;" } masked by the "c" rules above (c maps to chieuch)
// NOTE(review): uppercase C maps to k while lowercase c maps to chieuch,
// which is at odds with the "treat everything as lowercase" policy in
// the casing section. Confirm whether "C > |c;" was intended.
"C > |k;"
// ====================================
// Normal final rule: remove '
// ====================================
"''>;"
}
}