253 lines
4.6 KiB
Plaintext
253 lines
4.6 KiB
Plaintext
|
#--------------------------------------------------------------------
|
|||
|
# Copyright (c) 1999-2004, International Business Machines
|
|||
|
# Corporation and others. All Rights Reserved.
|
|||
|
#--------------------------------------------------------------------
|
|||
|
# For modern Greek, based on UNGEGN rules.
|
|||
|
|
|||
|
# Rules are predicated on running NFD first, and NFC afterwards
|
|||
|
# MINIMAL FILTER GENERATED FOR: Greek-Latin/UNGEGN
|
|||
|
# WARNING: need to add accents to both filters ###
|
|||
|
# :: [́̄̆̈;µ·ÀÂÈÊÌÎÒÔÙÛàâèêìîòôùûĈ-ĉĜ-ĝĤ-ĥĴ-ĵŜ-ŝŴ-ŷǛ-ǜǸ-ǹ̀̂̓-̔̀͂-̓ͅͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϖϰ-ϵЀЍѐѝḔ-ḕṐ-ṑẀ-ẁẐ-ẑẤ-ậẰ-ằẾ-ệỐ-ộỜ-ờỪ-ừỲ-ỳἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-῍῏-ΐῖ-Ί῝῟-῭ῲ-ῴῶ-ῼΩ\u03F7-\u07FB\u03F9] ;
|
|||
|
|
|||
|
# Global filter: only characters in this set are touched by the rules in this
# direction -- Greek-script characters, nonspacing/enclosing marks, and the listed
# punctuation (the ':'..';' range, '?', U+00B7, U+037E, U+0387).
:: [[[:Greek:][:Mn:][:Me:]] [\:-;?\u00B7\u037E\u0387]] ;
|
|||
|
# Run NFD before the rules below when going forward; the parenthesized NFC is the
# transform used at this position when the rule set is applied in reverse.
::NFD (NFC) ;
|
|||
|
|
|||
|
# Useful variables
|
|||
|
|
|||
|
# Lowercase letters (General_Category Ll) of the Latin or Greek script.
$lower = [[:latin:][:greek:] & [:Ll:]] ;
|
|||
|
# Uppercase letters (General_Category Lu) of the Latin or Greek script.
# NOTE(review): $upper is not referenced by any rule visible in this file.
$upper = [[:latin:][:greek:] & [:Lu:]] ;
|
|||
|
# Any nonspacing (Mn) or enclosing (Me) mark.
$accent = [[:Mn:][:Me:]] ;
|
|||
|
|
|||
|
# Single combining marks bound to names.
# NOTE(review): neither $macron nor $ddot is referenced by any rule visible in this file.
$macron = ̄ ;
|
|||
|
$ddot = ̈ ;
|
|||
|
|
|||
|
# Lowercase Greek vowels.
$lcgvowel = [αεηιουω] ;
|
|||
|
# Uppercase Greek vowels.
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
|
|||
|
# Greek vowels of either case.
$gvowel = [$lcgvowel $ucgvowel] ;
|
|||
|
# Lowercase Greek vowels plus any mark in $accent.
# NOTE(review): $lcgvowelC is not referenced by any rule visible in this file.
$lcgvowelC = [$lcgvowel $accent] ;
|
|||
|
|
|||
|
# Latin vowels (including y) of either case.
$evowel = [aeiouyAEIOUY];
|
|||
|
# Any Latin or Greek vowel; used as right context by the reverse-only u/w rules.
$vowel = [ $evowel $gvowel] ;
|
|||
|
|
|||
|
# Context pattern: zero or more marks followed by a lowercase letter. Used as the
# right context that selects the title-case outputs Ps / Th / Ch over PS / TH / CH.
$beforeLower = $accent * $lower ;
|
|||
|
|
|||
|
# Greek letters before which γ/Γ is written n/N (see the γ and ν rules).
$gammaLike = [ΓΚΞΧγκξχϰ] ;
|
|||
|
# Latin letters used as the matching right context in the reverse direction.
$egammaLike = [GKXCgkxc] ;
|
|||
|
# Single combining marks bound to names; all three are deleted by forward-only
# rules later in the file ("$smooth > ;", "$rough > ;", "$iotasub > ;").
$smooth = ̓ ;
|
|||
|
$rough = ̔ ;
|
|||
|
$iotasub = ͅ ;
|
|||
|
|
|||
|
# Greek letters (plus every Greek vowel) before which υ after a/e/i becomes v
# rather than f in the upsilon-diphthong rules.
$softener = [βΒγΓδΔζΖλΛμΜνΝρΡ$gvowel] ;
|
|||
|
|
|||
|
# Combining mark appended to a Latin letter to keep the mapping reversible
# (e.g. η <> i $under, ω <> o $under).
$under = ̱;
|
|||
|
|
|||
|
# NOTE(review): $caron is not referenced by any rule visible in this file.
$caron = ̌;
|
|||
|
|
|||
|
# Left-context pattern: a letter followed by any number of apostrophes or marks.
$afterLetter = [:L:] [\'$accent]* ;
|
|||
|
# Right-context pattern: any number of apostrophes or marks followed by a letter.
$beforeLetter = [\'$accent]* [:L:] ;
|
|||
|
|
|||
|
# Fix punctuation
|
|||
|
|
|||
|
# preserve original
|
|||
|
# A literal ':' or '?' in the source gets $under appended so that it stays
# distinguishable from the ':' and '?' produced by the two rules further down.
\: <> \: $under ;
|
|||
|
\? <> \? $under ;
|
|||
|
|
|||
|
# ';' on the Greek side corresponds to '?' on the Latin side.
\; <> \? ;
|
|||
|
# The raised dot on the Greek side corresponds to ':' on the Latin side.
· <> \: ;
|
|||
|
|
|||
|
# Fix any ancient characters that creep in
# All rules in this group are forward-only ('>').
|
|||
|
|
|||
|
# Three different combining accents are each rewritten to the same target accent.
# NOTE(review): presumably polytonic accents folded to the acute -- confirm code points.
͂ > ́ ;
|
|||
|
̂ > ́ ;
|
|||
|
̀ > ́ ;
|
|||
|
# An empty right-hand side deletes the matched character.
$smooth > ;
|
|||
|
$rough > ;
|
|||
|
$iotasub > ;
|
|||
|
ͺ > ;
|
|||
|
|
|||
|
# need to have these up here so the rules don't mask
# (rules are tried in file order, so these must precede the plain i / p / s / o rules)
|
|||
|
|
|||
|
# Eta shares the Latin letter i with iota; $under marks it for round-tripping.
η <> i $under ;
|
|||
|
Η <> I $under ;
|
|||
|
|
|||
|
# Capital psi becomes title-case "Ps" when a lowercase letter follows, else "PS".
Ψ } $beforeLower <> Ps ;
|
|||
|
Ψ <> PS ;
|
|||
|
ψ <> ps ;
|
|||
|
|
|||
|
# Omega shares the Latin letter o with omicron; $under marks it for round-tripping.
ω <> o $under ;
|
|||
|
Ω <> O $under;
|
|||
|
|
|||
|
# at beginning or end of word, convert mp to b
|
|||
|
|
|||
|
# Forward-only: the context [^[:L:]$accent] is any character that is neither a
# letter nor a mark, i.e. a word edge on the left ("X { ...") or right ("... } X").
[^[:L:]$accent] { μπ > b ;
|
|||
|
μπ } [^[:L:]$accent] > b ;
|
|||
|
# Same, for any case combination other than all-lowercase (handled above); yields B.
[^[:L:]$accent] { [Μμ][Ππ] > B ;
|
|||
|
[Μμ][Ππ] } [^[:L:]$accent] > B ;
|
|||
|
|
|||
|
# Reverse-only: Latin b/B always goes back to the digraph; B before a lowercase
# letter yields title-case Μπ, otherwise ΜΠ.
μπ < b ;
|
|||
|
Μπ < B } $beforeLower ;
|
|||
|
ΜΠ < B ;
|
|||
|
|
|||
|
# handle diphthongs ending with upsilon
|
|||
|
|
|||
|
# omicron + upsilon maps to "ou" in all four case combinations.
ου <> ou ;
|
|||
|
ΟΥ <> OU ;
|
|||
|
Ου <> Ou ;
|
|||
|
οΥ <> oU ;
|
|||
|
|
|||
|
# Left context for the v/f rules: a Latin a/e/i of either case, optionally followed
# by $under -- i.e. the already-transliterated output of a preceding vowel.
$fmaker = [aeiAEI] $under ? ;
|
|||
|
$shiftForwardVowels = [[:Mn:]-[\u0308]]; # note: a diaeresis keeps the items separate
|
|||
|
|
|||
|
# After $fmaker and before a $softener, υ becomes v + $under. Any marks on the υ
# (captured as $1) are emitted before the v.
$fmaker { υ ( $shiftForwardVowels )* } $softener > $1 v $under ;
|
|||
|
# Reverse of the rule above: marks followed by v + $under become υ followed by the marks.
υ $1 < ( $shiftForwardVowels )* v $under ;
|
|||
|
|
|||
|
# After $fmaker in any other position, υ becomes f + $under (marks again moved in front).
$fmaker { υ ( $shiftForwardVowels )* } > $1 f $under;
|
|||
|
υ $1 < ( $shiftForwardVowels )* f $under ;
|
|||
|
|
|||
|
# Capital upsilon after $fmaker: V + $under before a $softener, otherwise U + $under.
# NOTE(review): the lowercase counterpart yields f + $under here, not u -- confirm intended.
$fmaker { Υ } $softener <> V $under ;
|
|||
|
$fmaker { Υ <> U $under ;
|
|||
|
|
|||
|
# Any upsilon not matched above maps to y / Y.
υ <> y ;
|
|||
|
Υ <> Y ;
|
|||
|
|
|||
|
# NORMAL
# One-to-one letter mappings; '<>' rules apply in both directions.
|
|||
|
|
|||
|
α <> a ;
|
|||
|
Α <> A ;
|
|||
|
|
|||
|
β <> v ;
|
|||
|
Β <> V ;
|
|||
|
|
|||
|
# Gamma before a $gammaLike letter is written n; in reverse, n before an
# $egammaLike Latin letter goes back to gamma. Otherwise gamma is g.
γ } $gammaLike <> n } $egammaLike ;
|
|||
|
γ <> g ;
|
|||
|
Γ } $gammaLike <> N } $egammaLike ;
|
|||
|
Γ <> G ;
|
|||
|
|
|||
|
δ <> d ;
|
|||
|
Δ <> D ;
|
|||
|
|
|||
|
ε <> e ;
|
|||
|
Ε <> E ;
|
|||
|
|
|||
|
ζ <> z ;
|
|||
|
Ζ <> Z ;
|
|||
|
|
|||
|
# Capital theta becomes title-case "Th" when a lowercase letter follows, else "TH".
θ <> th ;
|
|||
|
Θ } $beforeLower <> Th ;
|
|||
|
Θ <> TH ;
|
|||
|
|
|||
|
ι <> i ;
|
|||
|
Ι <> I ;
|
|||
|
|
|||
|
κ <> k ;
|
|||
|
Κ <> K ;
|
|||
|
|
|||
|
λ <> l ;
|
|||
|
Λ <> L ;
|
|||
|
|
|||
|
μ <> m ;
|
|||
|
Μ <> M ;
|
|||
|
|
|||
|
# Nu before a $gammaLike letter gets a trailing apostrophe so it is not confused
# with the n produced from gamma in that position.
# NOTE(review): the lowercase rule is forward-only ('>') while the uppercase one
# below is '<>' -- confirm the asymmetry is intended.
ν } $gammaLike > n\' ;
|
|||
|
ν <> n ;
|
|||
|
Ν } $gammaLike <> N\' ;
|
|||
|
Ν <> N ;
|
|||
|
|
|||
|
ξ <> x ;
|
|||
|
Ξ <> X ;
|
|||
|
|
|||
|
ο <> o ;
|
|||
|
Ο <> O ;
|
|||
|
|
|||
|
π <> p ;
|
|||
|
Π <> P ;
|
|||
|
|
|||
|
ρ <> r ;
|
|||
|
Ρ <> R ;
|
|||
|
|
|||
|
# insert separator before things that turn into s
# Forward-only, empty match: between an already-produced Latin P/p and a following
# sigma-like Greek letter, insert an apostrophe so the result is not read back as
# the "ps" digraph that stands for psi.
[Pp] { } [ςσΣϷϸϺϻ] > \' ;
|
|||
|
|
|||
|
# special S variants
|
|||
|
|
|||
|
Ϸ <> Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
|
|||
|
ϸ <> š ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
|
|||
|
Ϻ <> Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
|
|||
|
ϻ <> ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
|
|||
|
|
|||
|
# Caron means exception
# NOTE(review): the rules below mark the exceptional sigma form with $under, not
# with $caron -- the heading above appears to be stale.
|
|||
|
|
|||
|
# before a letter, initial
# The final-form ς in non-final position is the exception (s + $under); σ is plain s.
|
|||
|
ς } $beforeLetter <> s $under } $beforeLetter;
|
|||
|
σ } $beforeLetter <> s } $beforeLetter;
|
|||
|
|
|||
|
# otherwise, after a letter = final
# Here the non-final σ is the exception (s + $under); ς is plain s.
|
|||
|
$afterLetter { σ <> $afterLetter { s $under;
|
|||
|
$afterLetter { ς <> $afterLetter { s ;
|
|||
|
|
|||
|
# otherwise (isolated) = initial
|
|||
|
ς <> s $under;
|
|||
|
σ <> s ;
|
|||
|
|
|||
|
# [Pp] { Σ <> \'S ;
|
|||
|
Σ <> S ;
|
|||
|
|
|||
|
τ <> t ;
|
|||
|
Τ <> T ;
|
|||
|
|
|||
|
φ <> f ;
|
|||
|
Φ <> F ;
|
|||
|
|
|||
|
# Capital chi becomes title-case "Ch" when a lowercase letter follows, else "CH".
χ <> ch ;
|
|||
|
Χ } $beforeLower <> Ch ;
|
|||
|
Χ <> CH ;
|
|||
|
|
|||
|
# Completeness for ASCII
# Reverse-only ('<') rules for Latin letters that no rule above produces. Each one
# rewrites the letter to Latin text that does have a Greek mapping; the leading '|'
# leaves the cursor before the replacement so the rules above convert it next.
|
|||
|
|
|||
|
# $ignore = [[:Mark:]''] * ;
|
|||
|
|
|||
|
| ch < h ;
|
|||
|
| k < c ;
|
|||
|
| i < j ;
|
|||
|
| k < q ;
|
|||
|
# u / w before a vowel are rewritten to b; otherwise to y (next two pairs).
| b < u } $vowel ;
|
|||
|
| b < w } $vowel ;
|
|||
|
| y < u ;
|
|||
|
| y < w ;
|
|||
|
|
|||
|
# Uppercase counterparts of the rules above.
| Ch < H ;
|
|||
|
| K < C ;
|
|||
|
| I < J ;
|
|||
|
| K < Q ;
|
|||
|
| B < W } $vowel ;
|
|||
|
| B < U } $vowel ;
|
|||
|
| Y < W ;
|
|||
|
| Y < U ;
|
|||
|
|
|||
|
# Completeness for Greek
# Forward-only ('>') rules: each Greek symbol/variant character is replaced by a
# regular Greek letter, with '|' leaving the cursor before the replacement so the
# ordinary letter rules above then transliterate it.
|
|||
|
|
|||
|
ϐ > | β ;
|
|||
|
ϑ > | θ ;
|
|||
|
ϒ > | Υ ;
|
|||
|
ϕ > | φ ;
|
|||
|
ϖ > | π ;
|
|||
|
|
|||
|
ϰ > | κ ;
|
|||
|
ϱ > | ρ ;
|
|||
|
ϲ > | σ ;
|
|||
|
Ϲ > | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
|
|||
|
# This one maps straight to a Latin letter (no cursor reset).
ϳ > j ;
|
|||
|
ϴ > | Θ ;
|
|||
|
ϵ > | ε ;
|
|||
|
# Left side is the micro sign, right side is the Greek letter mu.
µ > | μ ;
|
|||
|
|
|||
|
# delete any trailing ' marks used for roundtripping
# Both rules are reverse-only with an empty replacement: the apostrophe is removed
# when it sits between an already-converted Greek pi/nu and the following Latin letter.
|
|||
|
|
|||
|
< [Ππ] { \' } [Ss] ;
|
|||
|
< [Νν] { \' } $egammaLike ;
|
|||
|
|
|||
|
# Recompose (NFC) after the rules when going forward; the parenthesized NFD is the
# transform used at this position when the rule set is applied in reverse.
::NFC (NFD) ;
|
|||
|
|
|||
|
# MINIMAL FILTER GENERATED FOR: Latin-Greek/UNGEGN BACKWARD
|
|||
|
# Parenthesized filter = global filter for the reverse direction: Latin-script
# characters, nonspacing/enclosing marks, apostrophe, ':' and '?'.
:: ([[[:Latin:][:Mn:][:Me:]] ['\:?]]) ;
|