ICU-1292 Added Latin-el, fixed roundtrip test, additional Greek fixes

X-SVN-Rev: 6541
This commit is contained in:
Mark Davis 2001-11-01 00:39:31 +00:00
parent c0374fa37a
commit a545c3da81
3 changed files with 226 additions and 18 deletions

View File

@ -1,6 +1,6 @@
# Copyright (c) 2001, International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# TRANSLITERATOR INDEX FILE. This file lists the non-algorithmic
# system transliterators. It allows arbitrary mappings between
# transliterator IDs and file names, and also allows the system to
@ -9,29 +9,29 @@
# "Latin-Jamo;Jamo-Hangul". Internal IDs may also be defined; these
# are invisible to the user, but can be composed together by the
# system to create visible transliterators.
#
#
# Blank lines and lines beginning with '#' are ignored.
#
#
# Lines in this file have one of the following forms (text not
# enclosed by <> is literal):
#
#
# <id>:file:<resource>:<encoding>:<direction>
# <id>:internal:<resource>:<encoding>:<direction>
# <id>:alias:<getInstanceArg>
#
#
# <id> is the ID of the system transliterator being defined. These
# are public IDs enumerated by Transliterator.getAvailableIDs(),
# unless the second field is "internal".
#
#
# <resource> is a ResourceReader resource name. Currently these refer
# to file names under com/ibm/text/resources. This string is passed
# directly to ResourceReader, together with <encoding>.
#
#
# <encoding> is the character encoding to use when reading <resource>;
# passed directly to ResourceReader. E.g., "UTF8".
#
#
# <direction> is either "FORWARD" or "REVERSE".
#
#
# <getInstanceArg> is a string to be passed directly to
# Transliterator.getInstance(). The returned Transliterator object
# then has its ID changed to <id> and is returned.
@ -48,6 +48,9 @@ Cyrillic-Latin:file:Transliterator_Cyrillic_Latin.txt:UTF8:FORWARD
Latin-Greek:file:Transliterator_Greek_Latin.txt:UTF8:REVERSE
Greek-Latin:file:Transliterator_Greek_Latin.txt:UTF8:FORWARD
Latin-el:file:Transliterator_el_Latin.txt:UTF8:REVERSE
el-Latin:file:Transliterator_el_Latin.txt:UTF8:FORWARD
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.utf8.txt:UTF8:FORWARD
Latin-Jamo:alias:Any-Lower;LowerLatin-Jamo
Jamo-Latin:file:Transliterator_Latin_Jamo.utf8.txt:UTF8:REVERSE

View File

@ -0,0 +1,202 @@
#--------------------------------------------------------------------
# Copyright (c) 1999-2001, International Business Machines
# Corporation and others. All Rights Reserved.
#--------------------------------------------------------------------
# $Source: /xsrl/Nsvn/icu/icu4j/src/com/ibm/text/resources/Attic/Transliterator_el_Latin.txt,v $
# $Date: 2001/11/01 00:39:31 $
# $Revision: 1.1 $
#--------------------------------------------------------------------
# Rules are predicated on running NFD first, and NFC afterwards
# The line below requests NFD when the rules run forward (el-Latin) and NFC
# when they run in reverse (Latin-el); the file ends with the mirror-image
# "::NFC (NFD)" line, so text is decomposed before the rules and recomposed
# after them in either direction.
::NFD (NFC) ;
# For modern Greek.
# Useful variables
# General-category sets: lowercase letters, uppercase letters, any mark.
# NOTE(review): $upper is not referenced by any rule shown in this file.
$lower = [:Ll:] ;
$upper = [:Lu:] ;
$accent = [:M:] ;
# U+0304 COMBINING MACRON and U+0308 COMBINING DIAERESIS.
# NOTE(review): neither $macron nor $ddot is referenced by the rules shown
# in this file.
$macron = \u0304 ;
$ddot = \u0308 ;
# Greek vowels (lower case, upper case, both).
$lcgvowel = [αεηιουω] ;
$ucgvowel = [ΑΕΗΙΟΥΩ] ;
$gvowel = [$lcgvowel $ucgvowel] ;
# NOTE(review): $lcgvowelC is not referenced by the rules shown in this file.
$lcgvowelC = [$lcgvowel $accent] ;
# Latin vowels, and the union of Latin and Greek vowels.  $vowel is used as
# the preceding context of the upsilon-diphthong rules; it includes Latin
# vowels because text to the left of the current position has already been
# converted when running forward.
$evowel = [aeiouyAEIOUY];
$vowel = [ $evowel $gvowel] ;
# Zero or more marks followed by a lowercase letter.  Used as trailing
# context to choose title-case output (Ps, Th, Ch) over all-caps (PS, TH, CH).
$beforeLower = $accent * $lower ;
# Greek letters before which gamma is written "n", and the Latin letters
# that play the same role for the reverse direction.
$gammaLike = [ΓΚΞΧγκξχϰ] ;
$egammaLike = [GKXCgkxc] ;
# The next three values are written as literal combining characters.
# NOTE(review): presumably U+0313 (smooth breathing), U+0314 (rough
# breathing) and U+0345 (iota subscript) -- confirm in a hex viewer.
$smooth = ̓ ;
$rough = ̔ ;
$iotasub = ͅ ;
# Letters before which a diphthong-final upsilon is written "v"
# (see the upsilon rules): the listed consonants plus every Greek vowel.
$softener = [βΒγΓδΔζΖλΛμΜνΝρΡ$gvowel] ;
# U+0331 COMBINING MACRON BELOW, appended to a Latin letter to keep
# otherwise-ambiguous mappings (eta/iota, omega/omicron, ...) reversible.
$under = \u0331;
# Fix punctuation
# Both rules are bidirectional: the Greek question mark (written ';') is
# exchanged with '?', and the raised dot is exchanged with ':'.
\; <> \? ;
· <> \: ;
# Fix any ancient characters that creep in
# Forward only.  The perispomeni (U+0342), circumflex (U+0302) and grave
# (U+0300) are all folded to the single acute accent (U+0301).
\u0342 > \u0301 ;
\u0302 > \u0301 ;
\u0300 > \u0301 ;
# Forward only: breathing marks and the iota subscript are deleted
# (empty right-hand side), as is the spacing ypogegrammeni U+037A.
$smooth > ;
$rough > ;
$iotasub > ;
\u037A > ;
# need to have these up here so the rules don't mask
# (rules are tried in file order, so these must precede the plain
# single-letter rules further down that would otherwise match first, e.g.
# "i", "o", "p" and "s" in the reverse direction).
# Eta is written as i + macron below, to keep it distinct from iota.
η <> i $under ;
Η <> I $under ;
# Capital psi becomes "Ps" when followed (after optional marks) by a
# lowercase letter, otherwise "PS".
Ψ } $beforeLower <> Ps ;
Ψ <> PS ;
ψ <> ps ;
# Omega is written as o + macron below, to keep it distinct from omicron.
ω <> o $under ;
Ω <> O $under;
# At the beginning or end of a word, convert mp to b.
# A word edge is approximated by a neighbouring character that is neither a
# letter nor a mark.  In rule syntax the text before '{' is preceding
# context and the text after '}' is following context; only the part in
# between is replaced.  The word-initial rules therefore need '{' so that
# the non-letter stays in place and the mu-pi pair is what gets replaced.
[^[:L:][:M:]] { μπ > b ;
μπ } [^[:L:][:M:]] > b ;
# Any other case combination of mu-pi at a word edge becomes capital B.
[^[:L:][:M:]] { [Μμ][Ππ] > B ;
[Μμ][Ππ] } [^[:L:][:M:]] > B ;
# Reverse direction: b always comes back as mu-pi.  Capital B becomes
# title-case "Μπ" when it is followed (after optional marks) by a lowercase
# letter -- the lowercase letter is following context, hence '}' -- and
# all-caps "ΜΠ" otherwise.
μπ < b ;
Μπ < B } $beforeLower ;
ΜΠ < B ;
# handle diphthongs ending with upsilon
# An upsilon that follows a vowel is written "v" + macron below when the
# next letter is in $softener, and "f" + macron below otherwise.  Any other
# upsilon is plain "y".  The macron below keeps these distinct from
# beta -> v and phi -> f so the reverse direction can restore upsilon.
$vowel { υ } $softener <> v $under ;
$vowel { υ } <> f $under;
υ <> y ;
# Upper-case counterparts.
# NOTE(review): the second upper-case rule produces "U" + macron below,
# whereas its lower-case twin produces "f" + macron below -- confirm whether
# the asymmetry is intended.
# NOTE(review): $vowel contains omicron, so omicron-upsilon is also routed
# through these rules -- confirm that is the desired romanization.
$vowel { Υ } $softener <> V $under ;
$vowel { Υ <> U $under ;
Υ <> Y ;
# NORMAL
# Letter-by-letter mappings.  Unless a rule uses '>' or '<', it applies in
# both directions.  Where a capital maps to a digraph, the first rule gives
# title case (when a lowercase letter follows) and the second all caps.
α <> a ;
Α <> A ;
β <> v ;
Β <> V ;
# Gamma before a gamma-like letter is written "n"; in reverse, "n" before
# one of the Latin gamma-like letters becomes gamma again.
γ } $gammaLike <> n } $egammaLike ;
γ <> g ;
Γ } $gammaLike <> N } $egammaLike ;
Γ <> G ;
δ <> d ;
Δ <> D ;
ε <> e ;
Ε <> E ;
ζ <> z ;
Ζ <> Z ;
θ <> th ;
Θ } $beforeLower <> Th ;
Θ <> TH ;
ι <> i ;
Ι <> I ;
κ <> k ;
Κ <> K ;
λ <> l ;
Λ <> L ;
μ <> m ;
Μ <> M ;
# A real nu before a gamma-like letter gets a trailing apostrophe, so that
# it is not mistaken for the gamma -> n rule above on the way back.
# NOTE(review): the lower-case rule is forward-only ('>') while the
# upper-case rule is bidirectional ('<>') -- confirm this is intended.
ν } $gammaLike > n\' ;
ν <> n ;
Ν } $gammaLike <> N\' ;
Ν <> N ;
ξ <> x ;
Ξ <> X ;
ο <> o ;
Ο <> O ;
π <> p ;
Π <> P ;
ρ <> r ;
Ρ <> R ;
# Sigma preceded by a Latin P/p is written with a leading apostrophe.
# NOTE(review): presumably the P/p is an already-converted pi, and the
# apostrophe keeps pi+sigma distinct from "ps" (psi) -- confirm.
[Pp] {ς > \'s ;
[Pp] {σ > \'s ;
# Reverse only: an "s" standing alone (non-letter, optional marks, s,
# non-letter) becomes medial sigma rather than final sigma.
σ < [:^L:] [:M:]* { s } [:^L:] ;
# Final sigma: forward it is simply "s"; in reverse, an "s" followed by a
# non-letter becomes final sigma.  Every other "s" is medial sigma.
ς <> s } [:^L:] ;
σ <> s ;
[Pp] { Σ <> \'S ;
Σ <> S ;
τ <> t ;
Τ <> T ;
φ <> f ;
Φ <> F ;
χ <> ch ;
Χ } $beforeLower <> Ch ;
Χ <> CH ;
# Completeness for ASCII
# Reverse-only rules for Latin letters that no rule above produces.  Each
# one rewrites the letter to a Latin spelling that does have a Greek
# mapping; the leading '|' places the cursor before the replacement so the
# rewritten text is then processed by the ordinary rules.
# NOTE(review): $ignore is not referenced by any rule shown in this file.
$ignore = [[:Mark:]''] * ;
| ch < h ;
| k < c ;
| i < j ;
| k < q ;
| y < u ;
| y < w ;
| Ch < H ;
| K < C ;
| I < J ;
| K < Q ;
| Y < W ;
| Y < U ;
# Completeness for Greek
# Forward-only rules for Greek symbol/variant letter forms.  Each is
# rewritten to its ordinary letter with the cursor ('|') placed before the
# replacement, so the ordinary rules then transliterate it.  The one
# exception is the letter mapped straight to "j", which needs no second pass.
ϐ > | β ;
ϑ > | θ ;
ϒ > | Υ ;
ϕ > | φ ;
ϖ > | π ;
ϰ > | κ ;
ϱ > | ρ ;
ϲ > | σ ;
ϳ > j ;
ϴ > | Θ ;
ϵ > | ε ;
# delete any trailing ' marks used for roundtripping
# Reverse only, with an empty replacement: an apostrophe is removed when it
# sits between an already-restored pi and an S/s, or between an
# already-restored nu and a Latin gamma-like letter.
< [Ππ] { \' } [Ss] ;
< [Νν] { \' } $egammaLike ;
# Closing normalization: NFC after the forward rules, NFD when the file is
# run in reverse (where this line is executed first).
::NFC (NFD) ;

View File

@ -1,6 +1,6 @@
# Copyright (c) 2001, International Business Machines Corporation and
# others. All Rights Reserved.
#
#
# TRANSLITERATOR INDEX FILE. This file lists the non-algorithmic
# system transliterators. It allows arbitrary mappings between
# transliterator IDs and file names, and also allows the system to
@ -9,29 +9,29 @@
# "Latin-Jamo;Jamo-Hangul". Internal IDs may also be defined; these
# are invisible to the user, but can be composed together by the
# system to create visible transliterators.
#
#
# Blank lines and lines beginning with '#' are ignored.
#
#
# Lines in this file have one of the following forms (text not
# enclosed by <> is literal):
#
#
# <id>:file:<resource>:<encoding>:<direction>
# <id>:internal:<resource>:<encoding>:<direction>
# <id>:alias:<getInstanceArg>
#
#
# <id> is the ID of the system transliterator being defined. These
# are public IDs enumerated by Transliterator.getAvailableIDs(),
# unless the second field is "internal".
#
#
# <resource> is a ResourceReader resource name. Currently these refer
# to file names under com/ibm/text/resources. This string is passed
# directly to ResourceReader, together with <encoding>.
#
#
# <encoding> is the character encoding to use when reading <resource>;
# passed directly to ResourceReader. E.g., "UTF8".
#
#
# <direction> is either "FORWARD" or "REVERSE".
#
#
# <getInstanceArg> is a string to be passed directly to
# Transliterator.getInstance(). The returned Transliterator object
# then has its ID changed to <id> and is returned.
@ -48,6 +48,9 @@ Cyrillic-Latin:file:Transliterator_Cyrillic_Latin.txt:UTF8:FORWARD
Latin-Greek:file:Transliterator_Greek_Latin.txt:UTF8:REVERSE
Greek-Latin:file:Transliterator_Greek_Latin.txt:UTF8:FORWARD
Latin-el:file:Transliterator_el_Latin.txt:UTF8:REVERSE
el-Latin:file:Transliterator_el_Latin.txt:UTF8:FORWARD
LowerLatin-Jamo:internal:Transliterator_Latin_Jamo.utf8.txt:UTF8:FORWARD
Latin-Jamo:alias:Any-Lower;LowerLatin-Jamo
Jamo-Latin:file:Transliterator_Latin_Jamo.utf8.txt:UTF8:REVERSE