scuffed-code/icu4c/data/ljamo.txt
2000-05-15 19:09:37 +00:00

323 lines
11 KiB
Plaintext

//--------------------------------------------------------------------
// Copyright (c) 2000, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// Date Name Description
// 01/13/2000 aliu Creation.
//--------------------------------------------------------------------
ljamo {
Rule {
// VARIABLES
// Jamo classes, defined by code point range.
"initial=[\u1100-\u115F];"
"medial=[\u1160-\u11A7];"
"final=[\u11A8-\u11F9];" // added - aliu
// Latin vowel letters (both cases) plus the medial jamo range.
"vowel=[aeiouwyAEIOUWY\u1160-\u11A7];"
// Latin consonant letters (both cases) plus the consonant jamo, i.e. the
// initials and the finals. The medial range is already part of {vowel},
// so it must not be listed here as well.
"consonant=[bcdfghjklmnpqrstvxzBCDFGHJKLMNPQRSTVXZ{initial}{final}];"
// Small letter sets (both cases) referenced by the APOSTROPHE rules.
"ye=[yeYE];"
"ywe=[yweYWE];"
"yw=[ywYW];"
"nl=[nlNL];"
"gnl=[gnlGNL];"
"lsgb=[lsgbLSGB];"
"ywao=[ywaoYWAO];"
"bl=[blBL];"
// RULES
// Hangul structure is IMF or IM
// So you can have, because of adjacent sequences
// IM, but not II or IF
// MF or MI, but not MM
// FI, but not FF or FM
// For English, we just have C or V.
// To generate valid Hangul:
// Vowels:
// We insert IEUNG between VV, and otherwise map V to M
// We also insert IEUNG if there is no initial in front of a vowel
// Consonants:
// We don't break doubles
// Cases like lmgg, we have to break at lm
// So to guess whether a consonant is I or F
// we map all C's to F, except when followed by a vowel, e.g.
// X[{vowel}>CHOSEONG (initial)
// X>JONGSEONG (final)
// special insertion for funny sequences of vowels, and for empty consonant
// \u110B is HANGUL CHOSEONG IEUNG, the "empty" initial consonant.
// NOTE(review): the first rule uses '<' only and the second uses '<>';
// presumably '<' restricts a rule to the Jamo-to-Latin direction - confirm
// against the transliterator rule syntax in use.
"'' < ({consonant}) \u110B;" // insert a break between any consonant and the empty consonant.
"({medial}) ({vowel}) <> \u110B;" // HANGUL CHOSEONG IEUNG
// Below, insert an empty consonant in front of a vowel, if there is no Initial in front.
// Fix casing.
// Because Korean is caseless, we just want to treat everything as
// lowercase.
// We could do this by always preceding this transliterator with
// an upper-to-lowercase transformation, but that wouldn't invert nicely.
// We use the "revisit" syntax to just convert Latin to Latin
// so that we can avoid
// having to restate all the Latin=>Jamo rules, with the I/F handling.
// We don't have to add titlecase, since that will be picked up
// since the first letter is converted, then revisited. E.g.
// |Gg => |gg => {sang kiyeok}
// We do have to have all caps, since otherwise we could get:
// |GG => |gG => {kiyeok}|G => {kiyeok}|g => {kiyeok}{kiyeok}
// Every rule below rewrites an uppercase sequence to the same letters in
// lowercase, with '|' at the start of the output.
// NOTE(review): '|' is presumably the revisit marker mentioned above - confirm.
// Longer sequences are listed ahead of their own prefixes
// (e.g. YEO, then YE, then Y).
"Z > |z;"
"YU > |yu;"
"YO > |yo;"
"YI > |yi;"
"YEO > |yeo;"
"YE > |ye;"
"YAE > |yae;"
"YA > |ya;"
"Y > |y;"
"WI > |wi;"
"WEO > |weo;"
"WE > |we;"
"WAE > |wae;"
"WA > |wa;"
"W > |w;"
"U > |u;"
"T > |t;"
"SS > |ss;"
"S > |s;"
"P > |p;"
"OE > |oe;"
"O > |o;"
"NJ > |nj;"
"NH > |nh;"
"NG > |ng;"
"N > |n;"
"M > |m;"
"LT > |lt;"
"LS > |ls;"
"LP > |lp;"
"LM > |lm;"
"LH > |lh;"
"LG > |lg;"
"LB > |lb;"
"L > |l;"
"K > |k;"
"JJ > |jj;"
"J > |j;"
"I > |i;"
"H > |h;"
"GS > |gs;"
"GG > |gg;"
"G > |g;"
"EU > |eu;"
"EO > |eo;"
"E > |e;"
"DD > |dd;"
"D > |d;"
"BS > |bs;"
"BB > |bb;"
"B > |b;"
"AE > |ae;"
"A > |a;"
// APOSTROPHE
// As always, an apostrophe is used to separate digraphs into
// singles. That is, if you really wanted [KAN][GGAN], instead
// of [KANG][GAN] you would write "kan'ggan".
// Rules for inserting ' when mapping separated digraphs back
// from Hangul to Latin. Catch every letter that can be the
// LAST of a digraph (or multigraph) AND first of an initial.
// Each rule pairs a Latin letter (or one of the letter sets from the
// VARIABLES section) with the jamo named in its trailing comment.
// The rules are grouped as finals (jongseong), then medials (jungseong),
// then initials (choseong).
"'' < (l) (\u11c0;" // hangul jongseong thieuth
"'' < ({lsgb}) (\u11ba;" // hangul jongseong sios
"'' < (l) (\u11c1;" // hangul jongseong phieuph
"'' < (l) (\u11b7;" // hangul jongseong mieum
"'' < (n) (\u11bd;" // hangul jongseong cieuc
"'' < ({nl}) (\u11c2;" // hangul jongseong hieuh
"'' < ({gnl}) (\u11a9;" // hangul jongseong ssangkiyeok
"'' < ({bl}) (\u11b8;" // hangul jongseong pieup
"'' < (d) (\u11ae;" // hangul jongseong tikeut
"'' < ({ye}) (\u116e;" // hangul jungseong u
"'' < ({ywe}) (\u1169;" // hangul jungseong o
"'' < ({yw}) (\u1175;" // hangul jungseong i
"'' < ({ywao}) (\u1166;" // hangul jungseong e
"'' < ({yw}) (\u1161;" // hangul jungseong a
"'' < (l) (\u1110;" // hangul choseong thieuth
"'' < ({lsgb}) (\u110a;" // hangul choseong ssangsios
"'' < ({lsgb}) (\u1109;" // hangul choseong sios
"'' < (l) (\u1111;" // hangul choseong phieuph
"'' < (l) (\u1106;" // hangul choseong mieum
"'' < (n) (\u110c;" // hangul choseong cieuc
"'' < (n) (\u110d;" // hangul choseong ssangcieuc
"'' < ({nl}) (\u1112;" // hangul choseong hieuh
"'' < ({gnl}) (\u1101;" // hangul choseong ssangkiyeok
"'' < ({gnl}) (\u1100;" // hangul choseong kiyeok
"'' < (d) (\u1103;" // hangul choseong tikeut
"'' < (d) (\u1104;" // hangul choseong ssangtikeut
"'' < ({bl}) (\u1107;" // hangul choseong pieup
"'' < ({bl}) (\u1108;" // hangul choseong ssangpieup
// INITIALS
// A Latin consonant (or doubled consonant) directly followed by a
// {vowel} maps to the corresponding initial jamo (choseong). All of
// these rules are two-way ('<>').
"t ({vowel}) <> \u1110;" // hangul choseong thieuth
"ss ({vowel}) <> \u110a;" // hangul choseong ssangsios
"s ({vowel}) <> \u1109;" // hangul choseong sios
"p ({vowel}) <> \u1111;" // hangul choseong phieuph
"n ({vowel}) <> \u1102;" // hangul choseong nieun
"m ({vowel}) <> \u1106;" // hangul choseong mieum
"l ({vowel}) <> \u1105;" // hangul choseong rieul
"k ({vowel}) <> \u110f;" // hangul choseong khieukh
"j ({vowel}) <> \u110c;" // hangul choseong cieuc
"h ({vowel}) <> \u1112;" // hangul choseong hieuh
"gg ({vowel}) <> \u1101;" // hangul choseong ssangkiyeok
"g ({vowel}) <> \u1100;" // hangul choseong kiyeok
"d ({vowel}) <> \u1103;" // hangul choseong tikeut
"c ({vowel}) <> \u110e;" // hangul choseong chieuch
"b ({vowel}) <> \u1107;" // hangul choseong pieup
"bb ({vowel}) <> \u1108;" // hangul choseong ssangpieup
"jj ({vowel}) <> \u110d;" // hangul choseong ssangcieuc
"dd ({vowel}) <> \u1104;" // hangul choseong ssangtikeut
// If we have gotten through to these rules, and we start with
// a consonant, then the remaining mappings would be to F,
// because we must have CC (or C<non-letter>), not CV.
// If we have F before us, then
// we would end up with FF, which is wrong. The simplest fix is
// to still make it an initial, but also insert an "u",
// so we end up with F, I, u, and then continue with the C
// special, only initial: bb, jj and dd have no rule in the FINALS
// section, so they always become the initial followed by "u" (\u116e).
// The output must be exactly one initial plus the medial; two initials
// in a row (II) is not a valid Hangul sequence.
"bb > \u1108\u116e;" // hangul choseong ssangpieup
"jj > \u110d\u116e;" // hangul choseong ssangcieuc
"dd > \u1104\u116e;" // hangul choseong ssangtikeut
// Consonant with a final in front of it: emit the initial plus "u".
"({final}) t > \u1110\u116e;" // hangul choseong thieuth
"({final}) ss > \u110a\u116e;" // hangul choseong ssangsios
"({final}) s > \u1109\u116e;" // hangul choseong sios
"({final}) p > \u1111\u116e;" // hangul choseong phieuph
"({final}) n > \u1102\u116e;" // hangul choseong nieun
"({final}) m > \u1106\u116e;" // hangul choseong mieum
"({final}) l > \u1105\u116e;" // hangul choseong rieul
"({final}) k > \u110f\u116e;" // hangul choseong khieukh
"({final}) j > \u110c\u116e;" // hangul choseong cieuc
"({final}) h > \u1112\u116e;" // hangul choseong hieuh
"({final}) gg > \u1101\u116e;" // hangul choseong ssangkiyeok
"({final}) g > \u1100\u116e;" // hangul choseong kiyeok
"({final}) d > \u1103\u116e;" // hangul choseong tikeut
"({final}) c > \u110e\u116e;" // hangul choseong chieuch
"({final}) b > \u1107\u116e;" // hangul choseong pieup
// MEDIALS after INITIALS
// A Latin vowel sequence that has an {initial} jamo in front of it maps
// to the corresponding medial jamo (jungseong). All rules are two-way.
// Longer sequences are listed ahead of their own prefixes
// (e.g. yeo, then ye).
"({initial}) yu <> \u1172;" // hangul jungseong yu
"({initial}) yo <> \u116d;" // hangul jungseong yo
"({initial}) yi <> \u1174;" // hangul jungseong yi
"({initial}) yeo <> \u1167;" // hangul jungseong yeo
"({initial}) ye <> \u1168;" // hangul jungseong ye
"({initial}) yae <> \u1164;" // hangul jungseong yae
"({initial}) ya <> \u1163;" // hangul jungseong ya
"({initial}) wi <> \u1171;" // hangul jungseong wi
"({initial}) weo <> \u116f;" // hangul jungseong weo
"({initial}) we <> \u1170;" // hangul jungseong we
"({initial}) wae <> \u116b;" // hangul jungseong wae
"({initial}) wa <> \u116a;" // hangul jungseong wa
"({initial}) u <> \u116e;" // hangul jungseong u
"({initial}) oe <> \u116c;" // hangul jungseong oe
"({initial}) o <> \u1169;" // hangul jungseong o
"({initial}) i <> \u1175;" // hangul jungseong i
"({initial}) eu <> \u1173;" // hangul jungseong eu
"({initial}) eo <> \u1165;" // hangul jungseong eo
"({initial}) e <> \u1166;" // hangul jungseong e
"({initial}) ae <> \u1162;" // hangul jungseong ae
"({initial}) a <> \u1161;" // hangul jungseong a
// MEDIALS (vowels) not after INITIALs
// Each rule emits \u110B (HANGUL CHOSEONG IEUNG, the empty consonant)
// ahead of the medial, so that a vowel with no initial in front of it
// still gets one. The trailing comments name the medial only.
// These rules are one-way ('>').
"yu > \u110B\u1172;" // hangul jungseong yu
"yo > \u110B\u116d;" // hangul jungseong yo
"yi > \u110B\u1174;" // hangul jungseong yi
"yeo > \u110B\u1167;" // hangul jungseong yeo
"ye > \u110B\u1168;" // hangul jungseong ye
"yae > \u110B\u1164;" // hangul jungseong yae
"ya > \u110B\u1163;" // hangul jungseong ya
"wi > \u110B\u1171;" // hangul jungseong wi
"weo > \u110B\u116f;" // hangul jungseong weo
"we > \u110B\u1170;" // hangul jungseong we
"wae > \u110B\u116b;" // hangul jungseong wae
"wa > \u110B\u116a;" // hangul jungseong wa
"u > \u110B\u116e;" // hangul jungseong u
"oe > \u110B\u116c;" // hangul jungseong oe
"o > \u110B\u1169;" // hangul jungseong o
"i > \u110B\u1175;" // hangul jungseong i
"eu > \u110B\u1173;" // hangul jungseong eu
"eo > \u110B\u1165;" // hangul jungseong eo
"e > \u110B\u1166;" // hangul jungseong e
"ae > \u110B\u1162;" // hangul jungseong ae
"a > \u110B\u1161;" // hangul jungseong a
// FINALS
// Context-free, two-way mappings between Latin consonant sequences and
// final jamo (jongseong). They are listed after the initial rules, so
// presumably a consonant only reaches them when none of those applied.
// Longer sequences are listed ahead of their own prefixes
// (e.g. ss, then s; lt/ls/lp/lm/lh/lg/lb, then l).
"t <> \u11c0;" // hangul jongseong thieuth
"ss <> \u11bb;" // hangul jongseong ssangsios
"s <> \u11ba;" // hangul jongseong sios
"p <> \u11c1;" // hangul jongseong phieuph
"nj <> \u11ac;" // hangul jongseong nieun-cieuc
"nh <> \u11ad;" // hangul jongseong nieun-hieuh
"ng <> \u11bc;" // hangul jongseong ieung
"n <> \u11ab;" // hangul jongseong nieun
"m <> \u11b7;" // hangul jongseong mieum
"lt <> \u11b4;" // hangul jongseong rieul-thieuth
"ls <> \u11b3;" // hangul jongseong rieul-sios
"lp <> \u11b5;" // hangul jongseong rieul-phieuph
"lm <> \u11b1;" // hangul jongseong rieul-mieum
"lh <> \u11b6;" // hangul jongseong rieul-hieuh
"lg <> \u11b0;" // hangul jongseong rieul-kiyeok
"lb <> \u11b2;" // hangul jongseong rieul-pieup
"l <> \u11af;" // hangul jongseong rieul
"k <> \u11bf;" // hangul jongseong khieukh
"j <> \u11bd;" // hangul jongseong cieuc
"h <> \u11c2;" // hangul jongseong hieuh
"gs <> \u11aa;" // hangul jongseong kiyeok-sios
"gg <> \u11a9;" // hangul jongseong ssangkiyeok
"g <> \u11a8;" // hangul jongseong kiyeok
"d <> \u11ae;" // hangul jongseong tikeut
"c <> \u11be;" // hangul jongseong chieuch
"bs <> \u11b9;" // hangul jongseong pieup-sios
"b <> \u11b8;" // hangul jongseong pieup
// extra English letters
// {moved to bottom - aliu}
// Latin letters that have no jamo rule of their own are rewritten to a
// letter (or letters) that does, with '|' at the start of the output as
// in the case-folding rules.
"z > |s;"
// "Z" is already rewritten by the case-folding rule "Z > |z;", so the
// rule below stays commented out.
//{ + "Z > |s;" } masked
"x > |ks;"
"X > |ks;"
"v > |b;"
"V > |b;"
"r > |l;"
"R > |l;"
"q > |k;"
"Q > |k;"
"f > |p;"
"F > |p;"
// Lowercase "c" is already covered by the chieuch rules in the INITIALS
// and FINALS sections, so the rule below stays commented out.
//{ + "c > |k;" } masked
// NOTE(review): uppercase C becomes k here, while lowercase c maps to
// chieuch - confirm that this difference is intended.
"C > |k;"
// A bare y or w that no earlier rule consumed maps to a medial.
"y > \u1172;" // hangul jungseong yu
"w > \u1171;" // hangul jungseong wi
// ====================================
// Normal final rule: remove '
// An apostrophe that reaches this point has done its job of separating
// letters and is replaced by nothing. This must remain the last rule.
// ====================================
"''>;"
}
}