scuffed-code/icu4c/source/data/translit/t_Grek_Latn.txt
2002-02-09 00:27:09 +00:00

354 lines
9.9 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// -*- Coding: utf-8; -*-
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// THIS IS A MACHINE-GENERATED FILE
// Tool: dumpICUrules.bat
// Source: ../../text/resources/Transliterator_Greek_Latin.txt
// Date: Fri Feb 8 15:53:54 2002
//--------------------------------------------------------------------
// Greek_Latin
// Resource table for the Greek <-> Latin transliterator. The "Rule" entry is a
// sequence of quoted strings; each string carries one or more rule statements
// terminated by ';'. Rule order matters: earlier rules take precedence over
// later ones that could match the same text.
t_Grek_Latn {
Rule {
//--------------------------------------------------------------------
// Copyright (c) 1999-2001, International Business Machines
// Corporation and others. All Rights Reserved.
//--------------------------------------------------------------------
// $Source: /xsrl/Nsvn/icu/icu/source/data/translit/Attic/t_Grek_Latn.txt,v $
// $Date: 2002/02/09 00:27:08 $
// $Revision: 1.10 $
//--------------------------------------------------------------------
// Rules are predicated on running NFD first, and NFC afterwards
// The next (commented-out) line is a broader, hand-written forward filter that
// is kept for reference only; the active filter is the generated one below.
// :: [\\u0000-\u007F \u0370-\u03FF [:Greek:] [:nonspacing mark:]] ;
// MINIMAL FILTER GENERATED FOR: Greek-Latin
// Forward-direction filter: only the code points listed in this set are
// handed to the rules when converting Greek to Latin.
":: [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126] ;"
// Normalization step: NFD is applied in the forward direction; the
// parenthesised NFC is the transform used when the rule set runs in reverse.
":: NFD (NFC) ;"
// TEST CASES
// Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
// ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
// ᾳ ῃ ῳ ὃ ὄ
// ὠς ὡς ὢς ὣς
// Ὠς Ὡς Ὢς Ὣς
// ὨΣ ὩΣ ὪΣ ὫΣ
// Ạ, ạ, Ẹ, ẹ, Ọ, ọ
// Useful variables
// Shared character classes and marks referenced by the conversion rules.
// Letter-case classes: lowercase Latin/Greek, lowercase Greek only, and
// uppercase Latin/Greek.
"$lower = [[:latin:][:greek:] & [:Ll:]];"
"$glower = [[:greek:] & [:Ll:]];"
"$upper = [[:latin:][:greek:] & [:Lu:]] ;"
// Any combining mark.
"$accent = [:M:] ;"
// NOTE: restrict to just the Greek & Latin accents that we care about
// TODO: broaden out once iteration is fixed
// (the original comment read "interation"; presumably "iteration" was meant)
// Marks in U+0300..U+0345, excluding U+0338.
"$accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ;"
// Individual combining marks: U+0304 (macron) and U+0308 (diaeresis), plus a
// set containing either of them.
"$macron = \u0304 ;"
"$ddot = \u0308 ;"
"$ddotmac = [$ddot$macron];"
// Greek vowels, by case and combined.
"$lcgvowel = [αεηιουω] ;"
"$ucgvowel = [ΑΕΗΙΟΥΩ] ;"
"$gvowel = [$lcgvowel $ucgvowel] ;"
// NOTE(review): $lcgvowelC is not referenced by any rule visible in this file.
"$lcgvowelC = [$lcgvowel $accent] ;"
// Latin vowels (including y), and the union of Latin and Greek vowels.
"$evowel = [aeiouyAEIOUY];"
"$vowel = [ $evowel $gvowel] ;"
// Greek letters before which gamma is written as "n", and the Latin letters
// that play the same role when converting back (see the gamma rules).
"$gammaLike = [ΓΚΞΧγκξχϰ] ;"
"$egammaLike = [GKXCgkxc] ;"
// Literal combining marks for the breathings and the iota subscript.
// NOTE(review): these appear to be U+0313, U+0314 and U+0345 respectively;
// confirm with a hex dump, since the marks are easy to confuse visually.
"$smooth = ̓ ;"
"$rough = ̔ ;"
"$iotasub = ͅ ;"
// Latin vowels other than i/I.
"$evowel_i = [$evowel-[iI]] ;"
// U+0331, used by the sigma rules to mark the exceptional sigma form.
"$underbar = \u0331;"
// Context helpers: a letter followed by any marks/apostrophes, any
// marks/apostrophes followed by a letter, any marks followed by a lowercase
// letter, and "neither a letter nor a mark".
"$afterLetter = [:L:] [[:M:]\\\']* ;"
"$beforeLetter = [[:M:]\\\']* [:L:] ;"
"$beforeLower = $accent * $lower ;"
"$notLetter = [^[:L:][:M:]] ;"
// Fix punctuation
// Two-way mappings: ';' <-> '?' and '·' <-> ':'. The backslashes escape the
// punctuation so it is taken literally rather than as rule syntax.
"\\\; <> \\\? ;"
"· <> \\\: ;"
// CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
// U+0342 <-> U+0302, in both directions.
"\u0342 <> \u0302 ;"
// IOTA: convert iota subscript to iota
// first make previous alpha long!
// $accent_minus is every mark except the iota subscript and the macron. It is
// a different set from the similarly named $accentMinus.
"$accent_minus = [[$accent]-[$iotasub$macron]];"
// Forward only: an alpha that is followed (after any other marks) by an iota
// subscript gets a macron appended. The '|' leaves the cursor before the
// alpha so the following rules process the rewritten text again.
"Α } $accent_minus * $iotasub > | Α $macron ;"
"α } $accent_minus * $iotasub > | α $macron ;"
// now convert to uppercase if after uppercase, otherwise to lowercase
"$upper $accent * { $iotasub > I ;"
"$iotasub > i ;"
// Reverse only: a Latin vowel with macron (plus optional accents) followed by
// i or I becomes that same sequence followed by an iota subscript; the cursor
// is placed before it so the vowel itself is converted by later rules.
"| $1 $iotasub < ($evowel $macron $accentMinus *) i ;"
"| $1 $iotasub < ($evowel $macron $accentMinus *) I ;"
// BREATHING
// Convert rough breathing to h, and move before letters.
// Make A ` x = > H a x
// Forward only: an uppercase Greek vowel carrying a rough breathing and
// followed by a lowercase letter becomes "H" plus the lowercase vowel
// (titlecase result). The cursor is left before the lowercase vowel so the
// ordinary letter rules transliterate it afterwards.
// NOTE(review): Ι and Ω accept an optional diaeresis here while Υ does not;
// this looks like it may have been intended for Υ rather than Ω -- confirm
// before changing, since the same pattern repeats in the next group.
"Α ($macron?) $rough } $beforeLower > H | α $1;"
"Ε $rough } $beforeLower > H | ε;"
"Η $rough } $beforeLower > H | η ;"
"Ι ($ddot?) $rough } $beforeLower > H | ι $1;"
"Ο $rough } $beforeLower > H | ο ;"
"Υ $rough } $beforeLower > H | υ ;"
"Ω ($ddot?) $rough } $beforeLower > H | ω $1;"
// Make A x ` = > H a x
// Same idea when the breathing sits on the second letter: uppercase vowel,
// lowercase Greek letter, then the rough breathing.
"Α ($glower $macron?) $rough > H | α $1 ;"
"Ε ($glower) $rough > H | ε $1 ;"
"Η ($glower) $rough > H | η $1 ;"
"Ι ($glower $ddot?) $rough > H | ι $1 ;"
"Ο ($glower) $rough > H | ο $1 ;"
"Υ ($glower) $rough > H | υ $1 ;"
"Ω ($glower $ddot?) $rough > H | ω $1 ;"
//Otherwise, make x ` into h x and X ` into H X
// The all-lowercase rule comes first; the second rule accepts vowels of either
// case and is reached only when the lowercase rule did not match.
"($lcgvowel + $ddotmac? ) $rough > h | $1 ;"
"($gvowel + $ddotmac? ) $rough > H | $1 ;"
// Go backwards with H
// Reverse only: "h"/"H" followed by a Latin vowel group is replaced by that
// group followed by a rough breathing, with the cursor before the group so it
// is converted to Greek by later rules. Within each case the rules are listed
// from the longest vowel pattern to the shortest.
"| $1 $rough < h ($evowel $macron $ddot? $evowel_i $macron?) ;"
"| $1 $rough < h ($evowel $ddot? $evowel $macron?) ;"
"| $1 $rough < h ($evowel $macron? $ddot?) ;"
"| $1 $rough < H ([AEIOUY] $macron $ddot? $evowel_i $macron?) ;"
"| $1 $rough < H ([AEIOUY] $ddot? $evowel $macron?) ;"
"| $1 $rough < H ([AEIOUY] $macron? $ddot?) ;"
// titlecase, have to fix individually
// in the future, we should add &uppercase() to make this easier
// Reverse only: "H" + lowercase Latin vowel (titlecase such as "Ha") yields
// the uppercase vowel followed by the captured remainder and a rough
// breathing. One rule per vowel, in the same three pattern shapes as above.
"| A $1 $rough < H a ($macron $ddot? $evowel_i $macron?) ;"
"| E $1 $rough < H e ($macron $ddot? $evowel_i $macron?) ;"
"| I $1 $rough < H i ($macron $ddot? $evowel_i $macron?) ;"
"| O $1 $rough < H o ($macron $ddot? $evowel_i $macron?) ;"
"| U $1 $rough < H u ($macron $ddot? $evowel_i $macron?) ;"
"| Y $1 $rough < H y ($macron $ddot? $evowel_i $macron?) ;"
"| A $1 $rough < H a ($ddot? $evowel $macron?) ;"
"| E $1 $rough < H e ($ddot? $evowel $macron?) ;"
"| I $1 $rough < H i ($ddot? $evowel $macron?) ;"
"| O $1 $rough < H o ($ddot? $evowel $macron?) ;"
"| U $1 $rough < H u ($ddot? $evowel $macron?) ;"
"| Y $1 $rough < H y ($ddot? $evowel $macron?) ;"
"| A $1 $rough < H a ($macron? $ddot? ) ;"
"| E $1 $rough < H e ($macron? $ddot? ) ;"
"| I $1 $rough < H i ($macron? $ddot? ) ;"
"| O $1 $rough < H o ($macron? $ddot? ) ;"
"| U $1 $rough < H u ($macron? $ddot? ) ;"
"| Y $1 $rough < H y ($macron? $ddot? ) ;"
// Now do smooth
//delete smooth breathing for Latin
// Forward only: the smooth breathing has no Latin counterpart and is dropped.
"$smooth > ;"
// insert in Greek
// the assumption is that all Marks are on letters.
// Reverse only: directly after a non-letter, a smooth breathing is appended to
// (1) r/R not followed by h/H, (2) a pair of Latin vowels, or (3) a single
// Latin vowel not followed by another vowel -- in each case only when no
// breathing follows already. The cursor is placed before the captured letters
// so they are converted to Greek by later rules.
"| $1 $smooth < $notLetter { ([rR]) } [^hH$smooth$rough] ;"
"| $1 $smooth < $notLetter { ($evowel $macron? $evowel $macron?) } [^$smooth$rough] ;"
"| $1 $smooth < $notLetter { ($evowel $macron?) } [^$evowel$smooth$rough] ;"
// TODO: preserve smooth/rough breathing if not
// on initial vowel sequence
// need to have these up here so the rules don't mask
// remove now superfluous macron when returning
// Reverse only: Latin a/A with macron maps back to a plain alpha; the macron
// is not carried over.
"Α < A $macron ;"
"α < a $macron ;"
// Two-way mappings whose Latin side is more than one character (letter plus
// macron, or a digraph). They are listed ahead of the single-letter rules so
// that those do not match first.
"η <> e $macron ;"
"Η <> E $macron ;"
"φ <> ph ;"
// For uppercase digraphs, the titlecase form is used before a lowercase
// letter and the all-caps form otherwise.
"Ψ } $beforeLower <> Ps ;"
"Ψ <> PS ;"
"Φ } $beforeLower <> Ph ;"
"Φ <> PH ;"
"ψ <> ps ;"
"ω <> o $macron ;"
"Ω <> O $macron;"
// NORMAL
// Letter-by-letter two-way mappings, Greek on the left and Latin on the right.
"α <> a ;"
"Α <> A ;"
"β <> b ;"
"Β <> B ;"
// Gamma before a $gammaLike letter is written "n"; going back, "n" before an
// $egammaLike letter becomes gamma. Otherwise gamma <-> g.
"γ } $gammaLike <> n } $egammaLike ;"
"γ <> g ;"
"Γ } $gammaLike <> N } $egammaLike ;"
"Γ <> G ;"
"δ <> d ;"
"Δ <> D ;"
"ε <> e ;"
"Ε <> E ;"
"ζ <> z ;"
"Ζ <> Z ;"
"θ <> th ;"
// Titlecase "Th" before a lowercase letter, all-caps "TH" otherwise.
"Θ } $beforeLower <> Th ;"
"Θ <> TH ;"
"ι <> i ;"
"Ι <> I ;"
"κ <> k ;"
"Κ <> K ;"
"λ <> l ;"
"Λ <> L ;"
"μ <> m ;"
"Μ <> M ;"
// Nu before a $gammaLike letter gets a trailing apostrophe, which keeps it
// distinct from the "n" produced by the gamma rules above.
// NOTE(review): the lowercase rule is forward-only ('>') while the uppercase
// rule is two-way ('<>'); confirm whether this asymmetry is intentional.
"ν } $gammaLike > n\\\' ;"
"ν <> n ;"
"Ν } $gammaLike <> N\\\' ;"
"Ν <> N ;"
"ξ <> x ;"
"Ξ <> X ;"
"ο <> o ;"
"Ο <> O ;"
"π <> p ;"
"Π <> P ;"
// Rho with rough breathing <-> rh / Rh / RH (titlecase before a lowercase
// letter); plain rho <-> r.
"ρ $rough <> rh;"
"Ρ $rough } $beforeLower <> Rh ;"
"Ρ $rough <> RH ;"
"ρ <> r ;"
"Ρ <> R ;"
// insert separator
// Forward only: an apostrophe is inserted between an already-produced p/P and
// a following sigma -- presumably so that pi+sigma is not confused with the
// "ps" digraph used for psi.
"[Pp] { } ς > \\\' ;"
"[Pp] { } σ > \\\' ;"
// underbar means exception
// before a letter, initial
// Here σ is the regular form (plain "s") and ς is the exception ("s" + underbar).
"ς } $beforeLetter <> s $underbar } $beforeLetter;"
"σ } $beforeLetter <> s } $beforeLetter;"
// otherwise, after a letter = final
// Here ς is the regular form (plain "s") and σ is the exception ("s" + underbar).
"$afterLetter { σ <> $afterLetter { s $underbar;"
"$afterLetter { ς <> $afterLetter { s ;"
// otherwise (isolated) = initial
"ς <> s $underbar;"
"σ <> s ;"
// Uppercase sigma: after p/P it carries the separating apostrophe.
"[Pp] { Σ <> \\\'S ;"
"Σ <> S ;"
"τ <> t ;"
"Τ <> T ;"
// Upsilon directly after a vowel <-> u/U; elsewhere <-> y/Y.
"$vowel {υ } <> u ;"
"υ <> y ;"
"$vowel { Υ <> U ;"
"Υ <> Y ;"
"χ <> ch ;"
// Titlecase "Ch" before a lowercase letter, all-caps "CH" otherwise.
"Χ } $beforeLower <> Ch ;"
"Χ <> CH ;"
// Completeness for ASCII
// Zero or more marks/apostrophes; used only by the two uppercase rough
// breathing rules at the end of this section.
"$ignore = [[:Mark:]''] * ;"
// Reverse only: Latin letters that no rule above produces are first rewritten
// into Latin letters that do have a Greek mapping. The '|' puts the cursor
// before the replacement so the earlier rules then convert it to Greek.
"| k < c ;"
"| ph < f ;"
"| i < j ;"
"| k < q ;"
// v and w become b before a vowel, and u otherwise.
"| b < v } $vowel ;"
"| b < w } $vowel;"
"| u < v ;"
"| u < w;"
// Uppercase counterparts of the rewrites above.
"| K < C ;"
"| Ph < F ;"
"| I < J ;"
"| K < Q ;"
"| B < V } $vowel ;"
"| B < W } $vowel ;"
"| U < V ;"
"| U < W ;"
// Rough breathings not consumed by the earlier breathing rules: forward, one
// next to an uppercase letter (ignoring marks/apostrophes) becomes "H";
// reverse, "H" becomes a rough breathing. The last rule maps rough breathing
// <-> "h" in both directions.
"$rough } $ignore [:UppercaseLetter:] > H ;"
"$ignore [:UppercaseLetter:] { $rough > H ;"
"$rough < H ;"
"$rough <> h ;"
// Completeness for Greek
// Forward only: variant/symbol forms of Greek letters are replaced by the
// ordinary letter, with the cursor placed before it so the regular letter
// rules then transliterate it. There are no reverse rules for these forms.
"ϐ > | β ;"
"ϑ > | θ ;"
"ϒ > | Υ ;"
"ϕ > | φ ;"
"ϖ > | π ;"
"ϰ > | κ ;"
"ϱ > | ρ ;"
"ϲ > | σ ;"
// Mapped straight to a Latin letter (no cursor reset).
"ϳ > j ;"
"ϴ > | Θ ;"
"ϵ > | ε ;"
"µ > | μ ;"
"ͺ > i;"
// delete any trailing ' marks used for roundtripping
// Reverse only, with empty output: removes the apostrophe left between a Greek
// pi and a following s/S, and between a Greek nu and a following $egammaLike
// letter.
"< [Ππ] { \\\' } [Ss] ;"
"< [Νν] { \\\' } $egammaLike ;"
// Normalization step: NFC is applied in the forward direction; the
// parenthesised NFD is the transform used when the rule set runs in reverse.
"::NFC (NFD) ;"
// The two commented-out lines below are broader reverse filters kept for
// reference only; the active filter is the generated one.
// ([\\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
// ([\\u0000-\u007F \u00B7 [:Latin:] [:nonspacing mark:]]) ;
// MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
// Reverse-direction filter (parenthesised form): only the code points listed
// in this set are handed to the rules when converting Latin to Greek.
":: ( [':?A-Za-z\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0300-\u0337\u0339-\u0345\u0385-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u1E00-\u1E99\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FC1-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u1FF2-\u1FF4\u1FF6-\u1FFC\u212A-\u212B] ) ;"
// Close the Rule entry and the t_Grek_Latn table.
}
}