346 lines
9.2 KiB
Plaintext
346 lines
9.2 KiB
Plaintext
|
#--------------------------------------------------------------------
|
|||
|
# Copyright (c) 1999-2004, International Business Machines
|
|||
|
# Corporation and others. All Rights Reserved.
|
|||
|
#--------------------------------------------------------------------
|
|||
|
|
|||
|
# Rules are predicated on running NFD first, and NFC afterwards
|
|||
|
# :: [\u0000-\u007F \u0370-\u03FF [:Greek:] [:nonspacing mark:]] ;
|
|||
|
# MINIMAL FILTER GENERATED FOR: Greek-Latin
|
|||
|
# Global filter for the forward (Greek-to-Latin) direction: it lists the code
# points this transform is meant to operate on.
# NOTE(review): the range \u03F7-\u07FB already contains the \u0401 ... \u04F8-\u04F9
# entries listed right after it, and \u03F9 is repeated at the end of the set.
# The upper bound may have been intended as \u03FB -- confirm before changing.
:: [;\u00B5\u00B7\u00C4\u00CB\u00CF\u00D6\u00DC\u00E4\u00EB\u00EF\u00F6\u00FC\u00FF-\u0101\u0112-\u0113\u012A-\u012B\u014C-\u014D\u016A-\u016B\u0178\u01D5-\u01DC\u01DE-\u01E3\u01EC-\u01ED\u022A-\u022D\u0230-\u0233\u0304\u0308\u0313-\u0314\u0342-\u0345\u037A\u037E\u0386-\u038A\u038C\u038E-\u03A1\u03A3-\u03CE\u03D0-\u03D7\u03DB\u03DD\u03DF\u03E1\u03E3\u03E5\u03E7\u03E9\u03EB\u03ED\u03EF-\u03F5\u03F7-\u07FB\u0401\u0407\u0451\u0457\u04D2-\u04D3\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F1\u04F4-\u04F5\u04F8-\u04F9\u1E14-\u1E17\u1E20-\u1E21\u1E26-\u1E27\u1E2E-\u1E2F\u1E38-\u1E39\u1E4E-\u1E53\u1E5C-\u1E5D\u1E7A-\u1E7B\u1E84-\u1E85\u1E8C-\u1E8D\u1E97\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FBE\u1FC1-\u1FC4\u1FC6-\u1FCC\u1FCF-\u1FD3\u1FD6-\u1FDB\u1FDF-\u1FEC\u1FF2-\u1FF4\u1FF6-\u1FFC\u2126\u03F9] ;
|
|||
|
|
|||
|
:: NFD (NFC) ;
|
|||
|
|
|||
|
# TEST CASES
|
|||
|
|
|||
|
# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
|
|||
|
# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
|
|||
|
# ᾳ ῃ ῳ ὃ ὄ
|
|||
|
# ὠς ὡς ὢς ὣς
|
|||
|
# Ὠς Ὡς Ὢς Ὣς
|
|||
|
# ὨΣ ὩΣ ὪΣ ὫΣ
|
|||
|
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
|
|||
|
|
|||
|
# Useful variables
|
|||
|
|
|||
|
# Lowercase letters (General_Category Ll) of the Latin or Greek script.
$lower = [[:latin:][:greek:] & [:Ll:]];
|
|||
|
# Lowercase letters of the Greek script only.
$glower = [[:greek:] & [:Ll:]];
|
|||
|
# Uppercase letters (General_Category Lu) of the Latin or Greek script.
$upper = [[:latin:][:greek:] & [:Lu:]] ;
|
|||
|
# Any mark (General_Category M).
$accent = [:M:] ;
|
|||
|
|
|||
|
# NOTE: restrict to just the Greek & Latin accents that we care about
|
|||
|
# TODO: broaden out once iteration is fixed
|
|||
|
$accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ;
|
|||
|
|
|||
|
# Combining macron (U+0304).
$macron = \u0304 ;
|
|||
|
# Combining diaeresis (U+0308).
$ddot = \u0308 ;
|
|||
|
# Set containing both marks above: a diaeresis or a macron.
$ddotmac = [$ddot$macron];
|
|||
|
|
|||
|
$lcgvowel = [αεηιουω] ;
|
|||
|
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
|
|||
|
$gvowel = [$lcgvowel $ucgvowel] ;
|
|||
|
$lcgvowelC = [$lcgvowel $accent] ;
|
|||
|
|
|||
|
$evowel = [aeiouyAEIOUY];
|
|||
|
$evowel2 = [iuyIUY];
|
|||
|
$vowel = [ $evowel $gvowel] ;
|
|||
|
|
|||
|
$gammaLike = [ΓΚΞΧγκξχϰ] ;
|
|||
|
$egammaLike = [GKXCgkxc] ;
|
|||
|
# Smooth breathing mark, written as a literal combining character
# (presumably U+0313 COMBINING COMMA ABOVE -- confirm the literal).
$smooth = ̓ ;
|
|||
|
# Rough breathing mark, written as a literal combining character
# (presumably U+0314 COMBINING REVERSED COMMA ABOVE -- confirm the literal).
$rough = ̔ ;
|
|||
|
# Iota subscript, written as a literal combining character
# (presumably U+0345 COMBINING GREEK YPOGEGRAMMENI -- confirm the literal).
$iotasub = ͅ ;
|
|||
|
|
|||
|
$evowel_i = [$evowel-[iI]] ;
|
|||
|
$evowel2_i = [uyUY];
|
|||
|
|
|||
|
# U+0331 (combining macron below); the sigma rules attach it to s to mark the
# exceptional sigma form.
$underbar = \u0331;
|
|||
|
|
|||
|
# A letter followed by any number of marks or apostrophes.
$afterLetter = [:L:] [[:M:]\']* ;
|
|||
|
# Any number of marks or apostrophes followed by a letter.
$beforeLetter = [[:M:]\']* [:L:] ;
|
|||
|
# Any number of marks followed by a lowercase Latin or Greek letter; used as a
# trailing context to pick titlecase output (e.g. Th rather than TH).
$beforeLower = $accent * $lower ;
|
|||
|
|
|||
|
# Any code point that is neither a letter nor a mark.
$notLetter = [^[:L:][:M:]] ;
|
|||
|
# Mark appended by the punctuation rules to preserve an original ':' or '?'.
# NOTE(review): this literal appears to be the same code point as $underbar
# (U+0331) -- confirm, and consider whether two variables are needed.
$under = ̱;
|
|||
|
|
|||
|
# Fix punctuation
|
|||
|
# preserve original
|
|||
|
\: <> \: $under ;
|
|||
|
\? <> \? $under ;
|
|||
|
|
|||
|
\; <> \? ;
|
|||
|
· <> \: ;
|
|||
|
|
|||
|
# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
|
|||
|
|
|||
|
\u0342 <> \u0302 ;
|
|||
|
|
|||
|
# IOTA: convert iota subscript to iota
|
|||
|
# first make previous alpha long!
|
|||
|
|
|||
|
# Any mark except the iota subscript and the macron.
# Not to be confused with $accentMinus (no underscore), which is a different set.
$accent_minus = [[$accent]-[$iotasub$macron]];
|
|||
|
|
|||
|
# Alpha followed by zero or more $accent_minus marks and then an iota subscript
# gains a macron. The '|' puts the cursor before the output so the result is
# processed again; because $macron is excluded from $accent_minus, the rewritten
# text should not match these two rules a second time.
Α } $accent_minus * $iotasub > | Α $macron ;
|
|||
|
α } $accent_minus * $iotasub > | α $macron ;
|
|||
|
|
|||
|
# now convert to uppercase if after uppercase, otherwise to lowercase
|
|||
|
|
|||
|
$upper $accent * { $iotasub > I ;
|
|||
|
$iotasub > i ;
|
|||
|
|
|||
|
| $1 $iotasub < ($evowel $macron $accentMinus *) i ;
|
|||
|
| $1 $iotasub < ($evowel $macron $accentMinus *) I ;
|
|||
|
|
|||
|
# BREATHING
|
|||
|
|
|||
|
# Convert rough breathing to h, and move before letters.
|
|||
|
|
|||
|
# Make A ` x = > H a x
|
|||
|
|
|||
|
Α ($macron?) $rough } $beforeLower > H | α $1;
|
|||
|
Ε $rough } $beforeLower > H | ε;
|
|||
|
Η $rough } $beforeLower > H | η ;
|
|||
|
Ι ($ddot?) $rough } $beforeLower > H | ι $1;
|
|||
|
Ο $rough } $beforeLower > H | ο ;
|
|||
|
Υ $rough } $beforeLower > H | υ ;
|
|||
|
Ω ($ddot?) $rough } $beforeLower > H | ω $1;
|
|||
|
|
|||
|
# Make A x ` = > H a x
|
|||
|
|
|||
|
Α ($glower $macron?) $rough > H | α $1 ;
|
|||
|
Ε ($glower) $rough > H | ε $1 ;
|
|||
|
Η ($glower) $rough > H | η $1 ;
|
|||
|
Ι ($glower $ddot?) $rough > H | ι $1 ;
|
|||
|
Ο ($glower) $rough > H | ο $1 ;
|
|||
|
Υ ($glower) $rough > H | υ $1 ;
|
|||
|
Ω ($glower $ddot?) $rough > H | ω $1 ;
|
|||
|
|
|||
|
#Otherwise, make x ` into h x and X ` into H X
|
|||
|
|
|||
|
($lcgvowel + $ddotmac? ) $rough > h | $1 ;
|
|||
|
($gvowel + $ddotmac? ) $rough > H | $1 ;
|
|||
|
|
|||
|
# Go backwards with H
|
|||
|
|
|||
|
| $1 $rough < h ($evowel $macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| $1 $rough < h ($evowel $ddot? $evowel2 $macron?) ;
|
|||
|
| $1 $rough < h ($evowel $macron? $ddot?) ;
|
|||
|
|
|||
|
| $1 $rough < H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| $1 $rough < H ([AEIOUY] $ddot? $evowel2 $macron?) ;
|
|||
|
| $1 $rough < H ([AEIOUY] $macron? $ddot?) ;
|
|||
|
|
|||
|
# titlecase, have to fix individually
|
|||
|
# in the future, we should add &uppercase() to make this easier
|
|||
|
|
|||
|
| A $1 $rough < H a ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| E $1 $rough < H e ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| I $1 $rough < H i ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| O $1 $rough < H o ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| U $1 $rough < H u ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
| Y $1 $rough < H y ($macron $ddot? $evowel2_i $macron?) ;
|
|||
|
|
|||
|
| A $1 $rough < H a ($ddot? $evowel2 $macron?) ;
|
|||
|
| E $1 $rough < H e ($ddot? $evowel2 $macron?) ;
|
|||
|
| I $1 $rough < H i ($ddot? $evowel2 $macron?) ;
|
|||
|
| O $1 $rough < H o ($ddot? $evowel2 $macron?) ;
|
|||
|
| U $1 $rough < H u ($ddot? $evowel2 $macron?) ;
|
|||
|
| Y $1 $rough < H y ($ddot? $evowel2 $macron?) ;
|
|||
|
|
|||
|
| A $1 $rough < H a ($macron? $ddot? ) ;
|
|||
|
| E $1 $rough < H e ($macron? $ddot? ) ;
|
|||
|
| I $1 $rough < H i ($macron? $ddot? ) ;
|
|||
|
| O $1 $rough < H o ($macron? $ddot? ) ;
|
|||
|
| U $1 $rough < H u ($macron? $ddot? ) ;
|
|||
|
| Y $1 $rough < H y ($macron? $ddot? ) ;
|
|||
|
|
|||
|
# Now do smooth
|
|||
|
|
|||
|
#delete smooth breathing for Latin
|
|||
|
$smooth > ;
|
|||
|
|
|||
|
# insert in Greek
|
|||
|
# the assumption is that all Marks are on letters.
|
|||
|
|
|||
|
| $1 $smooth < $notLetter { ([rR]) } [^hH$smooth$rough] ;
|
|||
|
| $1 $smooth < $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ;
|
|||
|
| $1 $smooth < $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ;
|
|||
|
|
|||
|
# TODO: preserve smooth/rough breathing if not
|
|||
|
# on initial vowel sequence
|
|||
|
|
|||
|
# need to have these up here so the rules don't mask
|
|||
|
|
|||
|
# remove now superfluous macron when returning
|
|||
|
|
|||
|
Α < A $macron ;
|
|||
|
α < a $macron ;
|
|||
|
|
|||
|
η <> e $macron ;
|
|||
|
Η <> E $macron ;
|
|||
|
|
|||
|
φ <> ph ;
|
|||
|
Ψ } $beforeLower <> Ps ;
|
|||
|
Ψ <> PS ;
|
|||
|
|
|||
|
Φ } $beforeLower <> Ph ;
|
|||
|
Φ <> PH ;
|
|||
|
ψ <> ps ;
|
|||
|
|
|||
|
ω <> o $macron ;
|
|||
|
Ω <> O $macron;
|
|||
|
|
|||
|
# NORMAL
|
|||
|
|
|||
|
α <> a ;
|
|||
|
Α <> A ;
|
|||
|
|
|||
|
β <> b ;
|
|||
|
Β <> B ;
|
|||
|
|
|||
|
γ } $gammaLike <> n } $egammaLike ;
|
|||
|
γ <> g ;
|
|||
|
Γ } $gammaLike <> N } $egammaLike ;
|
|||
|
Γ <> G ;
|
|||
|
|
|||
|
δ <> d ;
|
|||
|
Δ <> D ;
|
|||
|
|
|||
|
ε <> e ;
|
|||
|
Ε <> E ;
|
|||
|
|
|||
|
ζ <> z ;
|
|||
|
Ζ <> Z ;
|
|||
|
|
|||
|
θ <> th ;
|
|||
|
Θ } $beforeLower <> Th ;
|
|||
|
Θ <> TH ;
|
|||
|
|
|||
|
ι <> i ;
|
|||
|
Ι <> I ;
|
|||
|
|
|||
|
κ <> k ;
|
|||
|
Κ <> K ;
|
|||
|
|
|||
|
λ <> l ;
|
|||
|
Λ <> L ;
|
|||
|
|
|||
|
μ <> m ;
|
|||
|
Μ <> M ;
|
|||
|
|
|||
|
# Nu before a gamma-like letter is written with a trailing apostrophe so that it
# stays distinct from the n that gamma produces in the same position
# (see the rule: γ } $gammaLike <> n } $egammaLike).
# NOTE(review): this lowercase rule is forward-only ('>') while the uppercase
# counterpart below is bidirectional ('<>') -- confirm whether that is intended.
ν } $gammaLike > n\' ;
|
|||
|
# Plain nu otherwise.
ν <> n ;
|
|||
|
Ν } $gammaLike <> N\' ;
|
|||
|
Ν <> N ;
|
|||
|
|
|||
|
ξ <> x ;
|
|||
|
Ξ <> X ;
|
|||
|
|
|||
|
ο <> o ;
|
|||
|
Ο <> O ;
|
|||
|
|
|||
|
π <> p ;
|
|||
|
Π <> P ;
|
|||
|
|
|||
|
ρ $rough <> rh;
|
|||
|
Ρ $rough } $beforeLower <> Rh ;
|
|||
|
Ρ $rough <> RH ;
|
|||
|
ρ <> r ;
|
|||
|
Ρ <> R ;
|
|||
|
|
|||
|
# insert separator before things that turn into s
|
|||
|
|
|||
|
[Pp] { } [ςσΣϷϸϺϻ] > \' ;
|
|||
|
|
|||
|
# special S variants
|
|||
|
|
|||
|
Ϸ <> Š ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
|
|||
|
ϸ <> š ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
|
|||
|
Ϻ <> Ŝ ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
|
|||
|
ϻ <> ŝ ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
|
|||
|
|
|||
|
# underbar means exception
|
|||
|
|
|||
|
# before a letter, initial
|
|||
|
ς } $beforeLetter <> s $underbar } $beforeLetter;
|
|||
|
σ } $beforeLetter <> s } $beforeLetter;
|
|||
|
|
|||
|
# otherwise, after a letter = final
|
|||
|
$afterLetter { σ <> $afterLetter { s $underbar;
|
|||
|
$afterLetter { ς <> $afterLetter { s ;
|
|||
|
|
|||
|
# otherwise (isolated) = initial
|
|||
|
ς <> s $underbar;
|
|||
|
σ <> s ;
|
|||
|
|
|||
|
# [Pp] { Σ <> \'S ;
|
|||
|
Σ <> S ;
|
|||
|
|
|||
|
τ <> t ;
|
|||
|
Τ <> T ;
|
|||
|
|
|||
|
# Upsilon directly after a vowel maps to u; any other upsilon maps to y.
# The $vowel context is written on the Greek side of the rule only.
# NOTE(review): the lowercase rule carries an empty trailing '}' context that
# the uppercase rule does not have -- presumably harmless, confirm.
$vowel {υ } <> u ;
|
|||
|
υ <> y ;
|
|||
|
$vowel { Υ <> U ;
|
|||
|
Υ <> Y ;
|
|||
|
|
|||
|
χ <> ch ;
|
|||
|
Χ } $beforeLower <> Ch ;
|
|||
|
Χ <> CH ;
|
|||
|
|
|||
|
# Completeness for ASCII
|
|||
|
|
|||
|
# Zero or more marks or apostrophes (the doubled '' is presumably the rule-syntax
# escape for a single literal apostrophe -- confirm). Used as skippable context
# by the leftover rough-breathing rules further down.
$ignore = [[:Mark:]''] * ;
|
|||
|
|
|||
|
| k < c ;
|
|||
|
| ph < f ;
|
|||
|
| i < j ;
|
|||
|
| k < q ;
|
|||
|
| b < v } $vowel ;
|
|||
|
| b < w } $vowel;
|
|||
|
| u < v ;
|
|||
|
| u < w;
|
|||
|
| K < C ;
|
|||
|
| Ph < F ;
|
|||
|
| I < J ;
|
|||
|
| K < Q ;
|
|||
|
| B < V } $vowel ;
|
|||
|
| B < W } $vowel ;
|
|||
|
| U < V ;
|
|||
|
| U < W ;
|
|||
|
|
|||
|
# Fallback for any rough breathing not consumed by the BREATHING rules above.
# Forward: a rough breathing that precedes an uppercase letter (skipping $ignore)
# becomes H.
$rough } $ignore [:UppercaseLetter:] > H ;
|
|||
|
# Forward: a rough breathing that follows an uppercase letter (skipping $ignore)
# also becomes H.
$ignore [:UppercaseLetter:] { $rough > H ;
|
|||
|
# Reverse only: H becomes a rough breathing.
$rough < H ;
|
|||
|
# Both directions: any remaining rough breathing corresponds to h.
$rough <> h ;
|
|||
|
|
|||
|
# Completeness for Greek
|
|||
|
|
|||
|
ϐ > | β ;
|
|||
|
ϑ > | θ ;
|
|||
|
ϒ > | Υ ;
|
|||
|
ϕ > | φ ;
|
|||
|
ϖ > | π ;
|
|||
|
|
|||
|
ϰ > | κ ;
|
|||
|
ϱ > | ρ ;
|
|||
|
ϲ > | σ ;
|
|||
|
Ϲ > | Σ; #U+03F9 GREEK CAPITAL LUNATE SIGMA SYMBOL
|
|||
|
ϳ > j ;
|
|||
|
ϴ > | Θ ;
|
|||
|
ϵ > | ε ;
|
|||
|
|
|||
|
µ > | μ ;
|
|||
|
|
|||
|
ͺ > i;
|
|||
|
|
|||
|
# delete any trailing ' marks used for roundtripping
# Both rules are reverse-only (Latin-to-Greek): they have an empty Greek side,
# so the matched apostrophe is removed.
|
|||
|
|
|||
|
# Apostrophe between a Greek pi and a following S or s.
< [Ππ] { \' } [Ss] ;
|
|||
|
# Apostrophe between a Greek nu and a following Latin gamma-like letter.
< [Νν] { \' } $egammaLike ;
|
|||
|
|
|||
|
::NFC (NFD) ;
|
|||
|
# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
|
|||
|
# ([\u0000-\u007F \u00B7 [:Latin:] [:nonspacing mark:]]) ;
|
|||
|
# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
|
|||
|
:: ( [':?A-Za-z\u00C0-\u00C5\u00C7-\u00CF\u00D1-\u00D6\u00D9-\u00DD\u00E0-\u00E5\u00E7-\u00EF\u00F1-\u00F6\u00F9-\u00FD\u00FF-\u010F\u0112-\u0125\u0128-\u0130\u0134-\u0137\u0139-\u013E\u0143-\u0148\u014C-\u0151\u0154-\u0165\u0168-\u017E\u01A0-\u01A1\u01AF-\u01B0\u01CD-\u01DC\u01DE-\u01E3\u01E6-\u01F0\u01F4-\u01F5\u01F8-\u021B\u021E-\u021F\u0226-\u0233\u0300-\u0337\u0339-\u0345\u0385-\u0386\u0388-\u038A\u038C\u038E-\u0390\u03AA-\u03B0\u03CA-\u03CE\u03D3-\u03D4\u0400-\u0401\u0403\u0407\u040C-\u040E\u0419\u0439\u0450-\u0451\u0453\u0457\u045C-\u045E\u0476-\u0477\u04C1-\u04C2\u04D0-\u04D3\u04D6-\u04D7\u04DA-\u04DF\u04E2-\u04E7\u04EA-\u04F5\u04F8-\u04F9\u1E00-\u1E99\u1E9B\u1EA0-\u1EF9\u1F00-\u1F15\u1F18-\u1F1D\u1F20-\u1F45\u1F48-\u1F4D\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F-\u1F7D\u1F80-\u1FB4\u1FB6-\u1FBC\u1FC1-\u1FC4\u1FC6-\u1FD3\u1FD6-\u1FDB\u1FDD-\u1FEE\u1FF2-\u1FF4\u1FF6-\u1FFC\u212A-\u212B] ) ;
|