ICU-12091 Merge preliminary CLDR 29 data for 57m1

X-SVN-Rev: 38278
This commit is contained in:
John Emmons 2016-02-05 03:37:50 +00:00
parent 3376c4a0b7
commit 66aa8c0fa4
304 changed files with 13173 additions and 3036 deletions

View File

@ -1,15 +1,20 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Any_Accents.txt
# Generated from CLDR
#
:: NFD (NFC) ;
# to do: make reversible
# define special conversion characters.
# variants of this could use different characters, or set one or the other to null.
$pre = \← ;
$post = \→ ;
# Provide keyboard equivalents for common diacritics used in transliteration
$pre \` $post ↔ \u0300 ; # COMBINING GRAVE ACCENT
$pre \' $post ↔ \u0301 ; # COMBINING ACUTE ACCENT
$pre \^ $post ↔ \u0302 ; # COMBINING CIRCUMFLEX ACCENT
@ -20,6 +25,7 @@ $pre \* $post ↔ \u030A ; # COMBINING RING ABOVE
$pre \, $post ↔ \u0327 ; # COMBINING CEDILLA
$pre '/' $post ↔ \u0338 ; # COMBINING LONG SOLIDUS OVERLAY
$pre \. $post ↔ \u0323 ; # COMBINING DOT BELOW
# Combine common characters
$pre AE $post ↔ Æ ; # LATIN CAPITAL LETTER AE
$pre ae $post ↔ æ ; # LATIN SMALL LETTER AE
$pre D $post ↔ Ð ; # LATIN CAPITAL LETTER ETH
@ -47,7 +53,227 @@ $pre O $post ↔ Ɔ ; # LATIN CAPITAL LETTER OPEN O
$pre o $post ↔ ɔ ; # LATIN SMALL LETTER OPEN O
$pre E $post ↔ Ɛ ; # LATIN CAPITAL LETTER OPEN E
$pre e $post ↔ ɛ ; # LATIN SMALL LETTER OPEN E
# three that don't have uppercases
$pre '?' $post ↔ ʔ ; # LATIN LETTER GLOTTAL STOP
$pre i $post ↔ ɪ ; # LATIN LETTER SMALL CAPITAL I
$pre v $post ↔ ʌ ; # LATIN SMALL LETTER TURNED V
# Additional Characters that may be added in the future
# $pre XXX $post ↔ \u0306 ; # COMBINING BREVE
# $pre XXX $post ↔ \u0307 ; # COMBINING DOT ABOVE
# $pre XXX $post ↔ \u0309 ; # COMBINING HOOK ABOVE
# $pre XXX $post ↔ \u030B ; # COMBINING DOUBLE ACUTE ACCENT
# $pre XXX $post ↔ \u030C ; # COMBINING CARON
# $pre XXX $post ↔ \u030F ; # COMBINING DOUBLE GRAVE ACCENT
# $pre XXX $post ↔ \u0311 ; # COMBINING INVERTED BREVE
# $pre XXX $post ↔ \u0313 ; # COMBINING COMMA ABOVE
# $pre XXX $post ↔ \u0314 ; # COMBINING REVERSED COMMA ABOVE
# $pre XXX $post ↔ \u031B ; # COMBINING HORN
# $pre XXX $post ↔ \u0324 ; # COMBINING DIAERESIS BELOW
# $pre XXX $post ↔ \u0325 ; # COMBINING RING BELOW
# $pre XXX $post ↔ \u0326 ; # COMBINING COMMA BELOW
# $pre XXX $post ↔ \u0328 ; # COMBINING OGONEK
# $pre XXX $post ↔ \u032D ; # COMBINING CIRCUMFLEX ACCENT BELOW
# $pre XXX $post ↔ \u032E ; # COMBINING BREVE BELOW
# $pre XXX $post ↔ \u0330 ; # COMBINING TILDE BELOW
# $pre XXX $post ↔ \u0331 ; # COMBINING MACRON BELOW
# $pre YYY $post ↔ ª ; # FEMININE ORDINAL INDICATOR
# $pre YYY $post ↔ º ; # MASCULINE ORDINAL INDICATOR
# $pre YYY $post ↔ Đ ; # LATIN CAPITAL LETTER D WITH STROKE
# $pre YYY $post ↔ đ ; # LATIN SMALL LETTER D WITH STROKE
# $pre YYY $post ↔ Ħ ; # LATIN CAPITAL LETTER H WITH STROKE
# $pre YYY $post ↔ ħ ; # LATIN SMALL LETTER H WITH STROKE
# $pre YYY $post ↔ ı ; # LATIN SMALL LETTER DOTLESS I
# $pre YYY $post ↔ ĸ ; # LATIN SMALL LETTER KRA
# $pre YYY $post ↔ Ŀ ; # LATIN CAPITAL LETTER L WITH MIDDLE DOT
# $pre YYY $post ↔ ŀ ; # LATIN SMALL LETTER L WITH MIDDLE DOT
# $pre YYY $post ↔ Ł ; # LATIN CAPITAL LETTER L WITH STROKE
# $pre YYY $post ↔ ł ; # LATIN SMALL LETTER L WITH STROKE
# $pre YYY $post ↔ ʼn ; # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
# $pre YYY $post ↔ Ŧ ; # LATIN CAPITAL LETTER T WITH STROKE
# $pre YYY $post ↔ ŧ ; # LATIN SMALL LETTER T WITH STROKE
# $pre YYY $post ↔ ſ ; # LATIN SMALL LETTER LONG S
# $pre YYY $post ↔ ƀ ; # LATIN SMALL LETTER B WITH STROKE
# $pre YYY $post ↔ Ɓ ; # LATIN CAPITAL LETTER B WITH HOOK
# $pre YYY $post ↔ Ƃ ; # LATIN CAPITAL LETTER B WITH TOPBAR
# $pre YYY $post ↔ ƃ ; # LATIN SMALL LETTER B WITH TOPBAR
# $pre YYY $post ↔ Ƅ ; # LATIN CAPITAL LETTER TONE SIX
# $pre YYY $post ↔ ƅ ; # LATIN SMALL LETTER TONE SIX
# $pre YYY $post ↔ Ƈ ; # LATIN CAPITAL LETTER C WITH HOOK
# $pre YYY $post ↔ ƈ ; # LATIN SMALL LETTER C WITH HOOK
# $pre YYY $post ↔ Ɖ ; # LATIN CAPITAL LETTER AFRICAN D
# $pre YYY $post ↔ Ɗ ; # LATIN CAPITAL LETTER D WITH HOOK
# $pre YYY $post ↔ Ƌ ; # LATIN CAPITAL LETTER D WITH TOPBAR
# $pre YYY $post ↔ ƌ ; # LATIN SMALL LETTER D WITH TOPBAR
# $pre YYY $post ↔ ƍ ; # LATIN SMALL LETTER TURNED DELTA
# $pre YYY $post ↔ Ǝ ; # LATIN CAPITAL LETTER REVERSED E
# $pre YYY $post ↔ Ƒ ; # LATIN CAPITAL LETTER F WITH HOOK
# $pre YYY $post ↔ ƒ ; # LATIN SMALL LETTER F WITH HOOK
# $pre YYY $post ↔ Ɠ ; # LATIN CAPITAL LETTER G WITH HOOK
# $pre YYY $post ↔ Ɣ ; # LATIN CAPITAL LETTER GAMMA
# $pre YYY $post ↔ ƕ ; # LATIN SMALL LETTER HV
# $pre YYY $post ↔ Ɩ ; # LATIN CAPITAL LETTER IOTA
# $pre YYY $post ↔ Ɨ ; # LATIN CAPITAL LETTER I WITH STROKE
# $pre YYY $post ↔ Ƙ ; # LATIN CAPITAL LETTER K WITH HOOK
# $pre YYY $post ↔ ƙ ; # LATIN SMALL LETTER K WITH HOOK
# $pre YYY $post ↔ ƚ ; # LATIN SMALL LETTER L WITH BAR
# $pre YYY $post ↔ ƛ ; # LATIN SMALL LETTER LAMBDA WITH STROKE
# $pre YYY $post ↔ Ɯ ; # LATIN CAPITAL LETTER TURNED M
# $pre YYY $post ↔ Ɲ ; # LATIN CAPITAL LETTER N WITH LEFT HOOK
# $pre YYY $post ↔ ƞ ; # LATIN SMALL LETTER N WITH LONG RIGHT LEG
# $pre YYY $post ↔ Ɵ ; # LATIN CAPITAL LETTER O WITH MIDDLE TILDE
# $pre YYY $post ↔ Ƣ ; # LATIN CAPITAL LETTER OI
# $pre YYY $post ↔ ƣ ; # LATIN SMALL LETTER OI
# $pre YYY $post ↔ Ƥ ; # LATIN CAPITAL LETTER P WITH HOOK
# $pre YYY $post ↔ ƥ ; # LATIN SMALL LETTER P WITH HOOK
# $pre YYY $post ↔ Ʀ ; # LATIN LETTER YR
# $pre YYY $post ↔ Ƨ ; # LATIN CAPITAL LETTER TONE TWO
# $pre YYY $post ↔ ƨ ; # LATIN SMALL LETTER TONE TWO
# $pre YYY $post ↔ ƪ ; # LATIN LETTER REVERSED ESH LOOP
# $pre YYY $post ↔ ƫ ; # LATIN SMALL LETTER T WITH PALATAL HOOK
# $pre YYY $post ↔ Ƭ ; # LATIN CAPITAL LETTER T WITH HOOK
# $pre YYY $post ↔ ƭ ; # LATIN SMALL LETTER T WITH HOOK
# $pre YYY $post ↔ Ʈ ; # LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
# $pre YYY $post ↔ Ʋ ; # LATIN CAPITAL LETTER V WITH HOOK
# $pre YYY $post ↔ Ƴ ; # LATIN CAPITAL LETTER Y WITH HOOK
# $pre YYY $post ↔ ƴ ; # LATIN SMALL LETTER Y WITH HOOK
# $pre YYY $post ↔ Ƶ ; # LATIN CAPITAL LETTER Z WITH STROKE
# $pre YYY $post ↔ ƶ ; # LATIN SMALL LETTER Z WITH STROKE
# $pre YYY $post ↔ Ƹ ; # LATIN CAPITAL LETTER EZH REVERSED
# $pre YYY $post ↔ ƹ ; # LATIN SMALL LETTER EZH REVERSED
# $pre YYY $post ↔ ƺ ; # LATIN SMALL LETTER EZH WITH TAIL
# $pre YYY $post ↔ ƻ ; # LATIN LETTER TWO WITH STROKE
# $pre YYY $post ↔ Ƽ ; # LATIN CAPITAL LETTER TONE FIVE
# $pre YYY $post ↔ ƽ ; # LATIN SMALL LETTER TONE FIVE
# $pre YYY $post ↔ ƾ ; # LATIN LETTER INVERTED GLOTTAL STOP WITH STROKE
# $pre YYY $post ↔ ƿ ; # LATIN LETTER WYNN
# $pre YYY $post ↔ ǀ ; # LATIN LETTER DENTAL CLICK
# $pre YYY $post ↔ ǁ ; # LATIN LETTER LATERAL CLICK
# $pre YYY $post ↔ ǂ ; # LATIN LETTER ALVEOLAR CLICK
# $pre YYY $post ↔ ǃ ; # LATIN LETTER RETROFLEX CLICK
# $pre YYY $post ↔ DŽ ; # LATIN CAPITAL LETTER DZ WITH CARON
# $pre YYY $post ↔ Dž ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON
# $pre YYY $post ↔ dž ; # LATIN SMALL LETTER DZ WITH CARON
# $pre YYY $post ↔ LJ ; # LATIN CAPITAL LETTER LJ
# $pre YYY $post ↔ Lj ; # LATIN CAPITAL LETTER L WITH SMALL LETTER J
# $pre YYY $post ↔ lj ; # LATIN SMALL LETTER LJ
# $pre YYY $post ↔ NJ ; # LATIN CAPITAL LETTER NJ
# $pre YYY $post ↔ Nj ; # LATIN CAPITAL LETTER N WITH SMALL LETTER J
# $pre YYY $post ↔ nj ; # LATIN SMALL LETTER NJ
# $pre YYY $post ↔ ǝ ; # LATIN SMALL LETTER TURNED E
# $pre YYY $post ↔ Ǥ ; # LATIN CAPITAL LETTER G WITH STROKE
# $pre YYY $post ↔ ǥ ; # LATIN SMALL LETTER G WITH STROKE
# $pre YYY $post ↔ DZ ; # LATIN CAPITAL LETTER DZ
# $pre YYY $post ↔ Dz ; # LATIN CAPITAL LETTER D WITH SMALL LETTER Z
# $pre YYY $post ↔ dz ; # LATIN SMALL LETTER DZ
# $pre YYY $post ↔ Ƕ ; # LATIN CAPITAL LETTER HWAIR
# $pre YYY $post ↔ Ƿ ; # LATIN CAPITAL LETTER WYNN
# $pre YYY $post ↔ Ȝ ; # LATIN CAPITAL LETTER YOGH
# $pre YYY $post ↔ ȝ ; # LATIN SMALL LETTER YOGH
# $pre YYY $post ↔ Ȣ ; # LATIN CAPITAL LETTER OU
# $pre YYY $post ↔ ȣ ; # LATIN SMALL LETTER OU
# $pre YYY $post ↔ Ȥ ; # LATIN CAPITAL LETTER Z WITH HOOK
# $pre YYY $post ↔ ȥ ; # LATIN SMALL LETTER Z WITH HOOK
# $pre YYY $post ↔ ɐ ; # LATIN SMALL LETTER TURNED A
# $pre YYY $post ↔ ɑ ; # LATIN SMALL LETTER ALPHA
# $pre YYY $post ↔ ɒ ; # LATIN SMALL LETTER TURNED ALPHA
# $pre YYY $post ↔ ɓ ; # LATIN SMALL LETTER B WITH HOOK
# $pre YYY $post ↔ ɕ ; # LATIN SMALL LETTER C WITH CURL
# $pre YYY $post ↔ ɖ ; # LATIN SMALL LETTER D WITH TAIL
# $pre YYY $post ↔ ɗ ; # LATIN SMALL LETTER D WITH HOOK
# $pre YYY $post ↔ ɘ ; # LATIN SMALL LETTER REVERSED E
# $pre YYY $post ↔ ɚ ; # LATIN SMALL LETTER SCHWA WITH HOOK
# $pre YYY $post ↔ ɜ ; # LATIN SMALL LETTER REVERSED OPEN E
# $pre YYY $post ↔ ɝ ; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK
# $pre YYY $post ↔ ɞ ; # LATIN SMALL LETTER CLOSED REVERSED OPEN E
# $pre YYY $post ↔ ɟ ; # LATIN SMALL LETTER DOTLESS J WITH STROKE
# $pre YYY $post ↔ ɠ ; # LATIN SMALL LETTER G WITH HOOK
# $pre YYY $post ↔ ɡ ; # LATIN SMALL LETTER SCRIPT G
# $pre YYY $post ↔ ɢ ; # LATIN LETTER SMALL CAPITAL G
# $pre YYY $post ↔ ɣ ; # LATIN SMALL LETTER GAMMA
# $pre YYY $post ↔ ɤ ; # LATIN SMALL LETTER RAMS HORN
# $pre YYY $post ↔ ɥ ; # LATIN SMALL LETTER TURNED H
# $pre YYY $post ↔ ɦ ; # LATIN SMALL LETTER H WITH HOOK
# $pre YYY $post ↔ ɧ ; # LATIN SMALL LETTER HENG WITH HOOK
# $pre YYY $post ↔ ɨ ; # LATIN SMALL LETTER I WITH STROKE
# $pre YYY $post ↔ ɩ ; # LATIN SMALL LETTER IOTA
# $pre YYY $post ↔ ɫ ; # LATIN SMALL LETTER L WITH MIDDLE TILDE
# $pre YYY $post ↔ ɬ ; # LATIN SMALL LETTER L WITH BELT
# $pre YYY $post ↔ ɭ ; # LATIN SMALL LETTER L WITH RETROFLEX HOOK
# $pre YYY $post ↔ ɮ ; # LATIN SMALL LETTER LEZH
# $pre YYY $post ↔ ɯ ; # LATIN SMALL LETTER TURNED M
# $pre YYY $post ↔ ɰ ; # LATIN SMALL LETTER TURNED M WITH LONG LEG
# $pre YYY $post ↔ ɱ ; # LATIN SMALL LETTER M WITH HOOK
# $pre YYY $post ↔ ɲ ; # LATIN SMALL LETTER N WITH LEFT HOOK
# $pre YYY $post ↔ ɳ ; # LATIN SMALL LETTER N WITH RETROFLEX HOOK
# $pre YYY $post ↔ ɴ ; # LATIN LETTER SMALL CAPITAL N
# $pre YYY $post ↔ ɵ ; # LATIN SMALL LETTER BARRED O
# $pre YYY $post ↔ ɶ ; # LATIN LETTER SMALL CAPITAL OE
# $pre YYY $post ↔ ɷ ; # LATIN SMALL LETTER CLOSED OMEGA
# $pre YYY $post ↔ ɸ ; # LATIN SMALL LETTER PHI
# $pre YYY $post ↔ ɹ ; # LATIN SMALL LETTER TURNED R
# $pre YYY $post ↔ ɺ ; # LATIN SMALL LETTER TURNED R WITH LONG LEG
# $pre YYY $post ↔ ɻ ; # LATIN SMALL LETTER TURNED R WITH HOOK
# $pre YYY $post ↔ ɼ ; # LATIN SMALL LETTER R WITH LONG LEG
# $pre YYY $post ↔ ɽ ; # LATIN SMALL LETTER R WITH TAIL
# $pre YYY $post ↔ ɾ ; # LATIN SMALL LETTER R WITH FISHHOOK
# $pre YYY $post ↔ ɿ ; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
# $pre YYY $post ↔ ʀ ; # LATIN LETTER SMALL CAPITAL R
# $pre YYY $post ↔ ʁ ; # LATIN LETTER SMALL CAPITAL INVERTED R
# $pre YYY $post ↔ ʂ ; # LATIN SMALL LETTER S WITH HOOK
# $pre YYY $post ↔ ʄ ; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK
# $pre YYY $post ↔ ʅ ; # LATIN SMALL LETTER SQUAT REVERSED ESH
# $pre YYY $post ↔ ʆ ; # LATIN SMALL LETTER ESH WITH CURL
# $pre YYY $post ↔ ʇ ; # LATIN SMALL LETTER TURNED T
# $pre YYY $post ↔ ʈ ; # LATIN SMALL LETTER T WITH RETROFLEX HOOK
# $pre YYY $post ↔ ʉ ; # LATIN SMALL LETTER U BAR
# $pre YYY $post ↔ ʋ ; # LATIN SMALL LETTER V WITH HOOK
# $pre YYY $post ↔ ʍ ; # LATIN SMALL LETTER TURNED W
# $pre YYY $post ↔ ʎ ; # LATIN SMALL LETTER TURNED Y
# $pre YYY $post ↔ ʏ ; # LATIN LETTER SMALL CAPITAL Y
# $pre YYY $post ↔ ʐ ; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
# $pre YYY $post ↔ ʑ ; # LATIN SMALL LETTER Z WITH CURL
# $pre YYY $post ↔ ʓ ; # LATIN SMALL LETTER EZH WITH CURL
# $pre YYY $post ↔ ʔ ; # LATIN LETTER GLOTTAL STOP
# $pre YYY $post ↔ ʕ ; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE
# $pre YYY $post ↔ ʖ ; # LATIN LETTER INVERTED GLOTTAL STOP
# $pre YYY $post ↔ ʗ ; # LATIN LETTER STRETCHED C
# $pre YYY $post ↔ ʘ ; # LATIN LETTER BILABIAL CLICK
# $pre YYY $post ↔ ʙ ; # LATIN LETTER SMALL CAPITAL B
# $pre YYY $post ↔ ʚ ; # LATIN SMALL LETTER CLOSED OPEN E
# $pre YYY $post ↔ ʛ ; # LATIN LETTER SMALL CAPITAL G WITH HOOK
# $pre YYY $post ↔ ʜ ; # LATIN LETTER SMALL CAPITAL H
# $pre YYY $post ↔ ʝ ; # LATIN SMALL LETTER J WITH CROSSED-TAIL
# $pre YYY $post ↔ ʞ ; # LATIN SMALL LETTER TURNED K
# $pre YYY $post ↔ ʟ ; # LATIN LETTER SMALL CAPITAL L
# $pre YYY $post ↔ ʠ ; # LATIN SMALL LETTER Q WITH HOOK
# $pre YYY $post ↔ ʡ ; # LATIN LETTER GLOTTAL STOP WITH STROKE
# $pre YYY $post ↔ ʢ ; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE
# $pre YYY $post ↔ ʣ ; # LATIN SMALL LETTER DZ DIGRAPH
# $pre YYY $post ↔ ʤ ; # LATIN SMALL LETTER DEZH DIGRAPH
# $pre YYY $post ↔ ʥ ; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
# $pre YYY $post ↔ ʦ ; # LATIN SMALL LETTER TS DIGRAPH
# $pre YYY $post ↔ ʧ ; # LATIN SMALL LETTER TESH DIGRAPH
# $pre YYY $post ↔ ʨ ; # LATIN SMALL LETTER TC DIGRAPH WITH CURL
# $pre YYY $post ↔ ʩ ; # LATIN SMALL LETTER FENG DIGRAPH
# $pre YYY $post ↔ ʪ ; # LATIN SMALL LETTER LS DIGRAPH
# $pre YYY $post ↔ ʫ ; # LATIN SMALL LETTER LZ DIGRAPH
# $pre YYY $post ↔ ʬ ; # LATIN LETTER BILABIAL PERCUSSIVE
# $pre YYY $post ↔ ʭ ; # LATIN LETTER BIDENTAL PERCUSSIVE
# $pre YYY $post ↔ ʰ ; # MODIFIER LETTER SMALL H
# $pre YYY $post ↔ ʱ ; # MODIFIER LETTER SMALL H WITH HOOK
# $pre YYY $post ↔ ʲ ; # MODIFIER LETTER SMALL J
# $pre YYY $post ↔ ʳ ; # MODIFIER LETTER SMALL R
# $pre YYY $post ↔ ʴ ; # MODIFIER LETTER SMALL TURNED R
# $pre YYY $post ↔ ʵ ; # MODIFIER LETTER SMALL TURNED R WITH HOOK
# $pre YYY $post ↔ ʶ ; # MODIFIER LETTER SMALL CAPITAL INVERTED R
# $pre YYY $post ↔ ʷ ; # MODIFIER LETTER SMALL W
# $pre YYY $post ↔ ʸ ; # MODIFIER LETTER SMALL Y
# $pre YYY $post ↔ ˠ ; # MODIFIER LETTER SMALL GAMMA
# $pre YYY $post ↔ ˡ ; # MODIFIER LETTER SMALL L
# $pre YYY $post ↔ ˢ ; # MODIFIER LETTER SMALL S
# $pre YYY $post ↔ ˣ ; # MODIFIER LETTER SMALL X
# $pre YYY $post ↔ ˤ ; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
# $pre YYY $post ↔ ẚ ; # LATIN SMALL LETTER A WITH RIGHT HALF RING
# $pre YYY $post ↔ ⁿ ; # SUPERSCRIPT LATIN SMALL LETTER N
:: NFC (NFD) ;

View File

@ -1,23 +1,31 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Any_Publishing.txt
# Generated from CLDR
#
# Test case
# "The" "(quick)" ('brown') `fox' ` jumped -- "over?"
# Variables: the ASCII characters rewritten by the rules below.
$single = \' ;
$space = ' ' ;
$double = \" ;
$back = \` ;
# U+0009 CHARACTER TABULATION (was \u0008, which is BACKSPACE).
# Not referenced by any rule in this file.
$tab = \u0009 ;
# Left contexts after which a straight quote is treated as an opening quote:
# separators, opening punctuation, initial quote punctuation, and the text
# boundary (the bare $ inside the set).
$makeRight = [[:Z:][:Ps:][:Pi:]$] ;
# fix UNIX quotes: a doubled backquote becomes an opening double quote,
# a single backquote becomes an opening single quote
$back $back → “ ;
$back → ‘ ;
# fix typewriter quotes, by context: opening form after $makeRight,
# closing form everywhere else (the contextual rule must come first)
$makeRight {$double} ↔ “ ;
$double ↔ ” ;
$makeRight {$single} ↔ ‘ ;
$single ↔ ’ ;
# fix multiple spaces and hyphens: drop a space that follows a space,
# and turn a double hyphen into an em dash
$space {$space} → ;
'--' ↔ — ;

View File

@ -1,23 +1,38 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Arabic_Latin.txt
# File: Arab_Latn.txt
# Generated from CLDR
#
# Generally follows UNGEGN
# http://www.eki.ee/wgrs/rom1_ar.pdf
# Occasionally deviates in the direction of ISO 233
# http://homepage.mac.com/sirbinks/pdf/Arabic.pdf
# a) where required for disambiguation.
# b) with underdot instead of cedilla for letter like SAD,
# since those are explicitly in Unicode for transliteration.
# c) with extra non-Arabic-language letters, like PEH
#
# Does *not* do assimilation of "al", nor hyphenation.
# While it could be done, we need to determine whether a prefix "al" could
# occur other than as the definite article (since no space is used).
:: [[:Arabic:][:block=ARABIC:][‎ⁿ،؛؟ـ\u064B-\u0655٠-٬۰-۹﷼ښ]] ;
:: NFKD (NFC);
$disambig = \u0331 ;
$disambig2 = \u0330 ;
$under = \u0323 ;
$descender = ˌ;
$notAbove = [[:^ccc=0:]&[:^ccc=230:]];
$notAbove = [[:^ccc=0:] & [:^ccc=230:]];
# non-letters
[:Nd:]{٫}[:Nd:] ↔ [:Nd:]{','}[:Nd:] ; # ARABIC DECIMAL SEPARATOR
[:Nd:]{٬}[:Nd:] ↔ [:Nd:]{'.'}[:Nd:] ; # ARABIC THOUSANDS SEPARATOR
٫ ↔ ',' $disambig ; # ARABIC DECIMAL SEPARATOR
٬ ↔ '.' $disambig ; # ARABIC THOUSANDS SEPARATOR
# ٭ ↔ ; # ARABIC FIVE POINTED STAR // no need to transliterate
، ↔ ',' ; # ARABIC COMMA
؛ ↔ ';' ; # ARABIC SEMICOLON
؟ ↔ '?' ; # ARABIC QUESTION MARK
@ -42,9 +57,12 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
٧ ↔ 7 ; # ARABIC-INDIC DIGIT SEVEN
٨ ↔ 8 ; # ARABIC-INDIC DIGIT EIGHT
٩ ↔ 9 ; # ARABIC-INDIC DIGIT NINE
# letters
# long vowels
\u064Eا↔ a\u0304 ; # ARABIC FATHA, ARABIC LETTER ALEF
\u064Fو ↔ u\u0304 ; # ARABIC DAMMA, ARABIC LETTER WAW
\u0650ي ↔ i\u0304 ; # ARABIC KASRA, ARABIC LETTER YEH
# longer items moved here to prevent masking
ث ↔ t h $disambig ; # ARABIC LETTER THEH
ذ ↔ d h $disambig ; # ARABIC LETTER THAL
ش ↔ s h $disambig ; # ARABIC LETTER SHEEN
@ -53,13 +71,19 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
ط ↔ t $under ; # ARABIC LETTER TAH
ظ ↔ z $under ; # ARABIC LETTER ZAH
غ ↔ g h $disambig ; # ARABIC LETTER GHAIN
# WARNING: special case
# ←t, umlaut, half-ring below→ will be canonically ordered as ←t, half-ring below, umlaut→
# so on the return, we have to skip over (but preserve) the half-ring below (or others like it)
# ة\u0655 ← t\u0339\u0308 ; # LATIN SMALL LETTER T, COMBINING RIGHT HALF RING BELOW, COMBINING DIAERESIS
ة ↔ t \u0308 ; # ARABIC LETTER TEH MARBUTA
ة | $1 ← t ($notAbove+) \u0308 ; # ARABIC LETTER TEH MARBUTA
# non-Arabic language
ژ ↔ z h $disambig ; # ARABIC LETTER JEH
ڭ ↔ n $disambig g ; # ARABIC LETTER NG
ۋ ↔ v $disambig ; # ARABIC LETTER VE
ی ↔ y $disambig2 ; # ARABIC LETTER FARSI YEH
ښ ↔ s $descender;
# Arabic language
ء ↔ ʾ ; # ARABIC LETTER HAMZA
ا ↔ a $under; # ARABIC LETTER ALEF
ب ↔ b ; # ARABIC LETTER BEH
@ -92,13 +116,18 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
\u0650 ↔ i ; # ARABIC KASRA
\u0651 ↔ \u0303 ; # ARABIC SHADDA
\u0652 ↔ \u030A ; # ARABIC SUKUN
# special combining marks
\u0653 ↔ \u0302 ; # ARABIC MADDAH ABOVE
\u0654 ↔ \u0309 ; # ARABIC HAMZA ABOVE
\u0655 ↔ \u0339 ; # ARABIC HAMZA BELOW
# Some non-Arabic language (not in UNGEGN)
پ ↔ p ; # ARABIC LETTER PEH
چ ↔ c h $disambig ; # ARABIC LETTER TCHEH
ڤ ↔ v ; # ARABIC LETTER VEH
# ڥ ↔ v $disambig ; # ARABIC LETTER FEH WITH THREE DOTS BELOW
# ڢ ↔ f $disambig ; # ARABIC LETTER FEH WITH DOT MOVED BELOW
گ ↔ g ; # ARABIC LETTER GAF
# fallbacks
| s ← c } [eiy];
| k ← c ;
| i ← e ;
@ -108,3 +137,4 @@ $notAbove = [[:^ccc=0:]&[:^ccc=230:]];
:: (lower) ;
::NFC (NFD);
:: ( [[:Latin:] [%,.0-9;?ʾ-ʿ\u0302-\u0304\u0308-\u030A\u0323\u0330-\u0331\u0339;ˌ]] );

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Devanagari.txt
# File: Beng_Deva.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Devanagari;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Gujarati.txt
# File: Beng_Gujr.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Gujarati;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Gurmukhi.txt
# File: Beng_Guru.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Gurmukhi;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Kannada.txt
# File: Beng_Knda.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Kannada;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Latin.txt
# File: Beng_Latn.txt
# Generated from CLDR
#
# Global filter: Bengali-script characters plus the listed danda and Bengali
# code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[[:script=bengali:][।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ]];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Latin;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Malayalam.txt
# File: Beng_Mlym.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Malayalam;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Oriya.txt
# File: Beng_Orya.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Oriya;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Tamil.txt
# File: Beng_Taml.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Tamil;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_Telugu.txt
# File: Beng_Telu.txt
# Generated from CLDR
#
# Global filter: only the listed danda and Bengali code points are transformed.
# The digit/sign range is written as ০-৺; the previous "\u09E3-৺" chained two
# ranges with no start character for the second one, which is not a valid set.
::[।-॥\u0981-ঃঅ-ঌএ-ঐও-নপ-রলশ-হ\u09BC-\u09C4ে-ৈো-\u09CDৗড়-ঢ়য়-\u09E3০-৺ৎ];
# Decompose, convert through the InterIndic pivot, then recompose.
::NFD;
::Bengali-InterIndic;
::InterIndic-Telugu;
::NFC;

View File

@ -1,12 +1,14 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bengali_InterIndic.txt
# Generated from CLDR
#
# Bengali-InterIndic
ো→\uE04B; # VOWEL SIGN O
ৌ→\uE04C; # VOWEL SIGN AU
\u0981→\uE001; # SIGN CANDRABINDU
@ -69,9 +71,11 @@
ৈ→\uE048; # VOWEL SIGN AI
ো→\uE04B;
ৌ→\uE04C;
#
\u09CD→\uE04D; # SIGN VIRAMA
ৎ→\uE083; # Khanda-ta
ৗ→\uE057; # AU LENGTH MARK
#
ৠ→\uE060; # LETTER VOCALIC RR
ৡ→\uE061; # LETTER VOCALIC LL
\u09E2→\uE062; # VOWEL SIGN VOCALIC L
@ -99,3 +103,6 @@
৺→\uE07B; # ISSHAR
।→\uE064; # DANDA
॥→\uE065; # DOUBLE DANDA
# :: NFC (NFD) ;
# eof

View File

@ -1,100 +0,0 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Bulgarian_Latin_BGN.txt
# Generated from CLDR
#
:: [АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯѪѢабвгдежзийклмнопрстуфхцчшщъьюяѫѣ] ;
:: NFD (NFC) ;
$upperConsonants = [БВГДЖЗЙКЛМНПРСТФХЦЧШЩЬ] ;
$lowerConsonants = [бвгджзйклмнпрстфхцчшщь] ;
$consonants = [$upperConsonants $lowerConsonants] ;
$upperVowels = [АЕИОУЪЮЯѪѢ] ;
$lowerVowels = [аеиоуъюяѫѣ] ;
$vowels = [$upperVowels $lowerVowels] ;
$lower = [$lowerConsonants $lowerVowels] ;
$bulgarian = [ $lower $upperConsonants $upperVowels ] ;
$wordBoundary = [^[:L:][:M:][:N:]] ;
А → A ; # CYRILLIC CAPITAL LETTER A
а → a ; # CYRILLIC SMALL LETTER A
Б → B ; # CYRILLIC CAPITAL LETTER BE
б → b ; # CYRILLIC SMALL LETTER BE
В → V ; # CYRILLIC CAPITAL LETTER VE
в → v ; # CYRILLIC SMALL LETTER VE
Г → G ; # CYRILLIC CAPITAL LETTER GHE
г → g ; # CYRILLIC SMALL LETTER GHE
Д → D ; # CYRILLIC CAPITAL LETTER DE
д → d ; # CYRILLIC SMALL LETTER DE
Е → E ; # CYRILLIC CAPITAL LETTER IE
е → e ; # CYRILLIC SMALL LETTER IE
Ж} $lower → Zh ; # CYRILLIC CAPITAL LETTER ZHE
Ж → ZH ; # CYRILLIC CAPITAL LETTER ZHE
ж → zh ; # CYRILLIC SMALL LETTER ZHE
З → Z ; # CYRILLIC CAPITAL LETTER ZE
з → z ; # CYRILLIC SMALL LETTER ZE
И → I ; # CYRILLIC CAPITAL LETTER I
и → i ; # CYRILLIC SMALL LETTER I
# NOTE(review): the leading ":: NFD (NFC) ;" pass decomposes SHORT I into
# И/и + U+0306 before these rules run; confirm the left-hand sides here are
# stored in decomposed form, otherwise these two rules may never match.
Й → Y ; # CYRILLIC CAPITAL LETTER SHORT I
й → y ; # CYRILLIC SMALL LETTER SHORT I
К → K ; # CYRILLIC CAPITAL LETTER KA
к → k ; # CYRILLIC SMALL LETTER KA
Л → L ; # CYRILLIC CAPITAL LETTER EL
л → l ; # CYRILLIC SMALL LETTER EL
М → M ; # CYRILLIC CAPITAL LETTER EM
м → m ; # CYRILLIC SMALL LETTER EM
Н → N ; # CYRILLIC CAPITAL LETTER EN
н → n ; # CYRILLIC SMALL LETTER EN
О → O ; # CYRILLIC CAPITAL LETTER O
о → o ; # CYRILLIC SMALL LETTER O
П → P ; # CYRILLIC CAPITAL LETTER PE
п → p ; # CYRILLIC SMALL LETTER PE
Р → R ; # CYRILLIC CAPITAL LETTER ER
р → r ; # CYRILLIC SMALL LETTER ER
С → S ; # CYRILLIC CAPITAL LETTER ES
с → s ; # CYRILLIC SMALL LETTER ES
ТС → T·S ; # CYRILLIC CAPITAL LETTER TE
Тс → T·s ; # CYRILLIC CAPITAL LETTER TE
тс → t·s ; # CYRILLIC SMALL LETTER TE
Т → T ; # CYRILLIC CAPITAL LETTER TE
т → t ; # CYRILLIC SMALL LETTER TE
У → U ; # CYRILLIC CAPITAL LETTER U
у → u ; # CYRILLIC SMALL LETTER U
Ф → F ; # CYRILLIC CAPITAL LETTER EF
ф → f ; # CYRILLIC SMALL LETTER EF
Х} $lower → Kh ; # CYRILLIC CAPITAL LETTER HA
Х → KH ; # CYRILLIC CAPITAL LETTER HA
х → kh ; # CYRILLIC SMALL LETTER HA
Ц} $lower → Ts ; # CYRILLIC CAPITAL LETTER TSE
Ц → TS ; # CYRILLIC CAPITAL LETTER TSE
ц → ts ; # CYRILLIC SMALL LETTER TSE
Ч} $lower → Ch ; # CYRILLIC CAPITAL LETTER CHE
Ч → CH ; # CYRILLIC CAPITAL LETTER CHE
ч → ch ; # CYRILLIC SMALL LETTER CHE
ШТ → SH·T ; # CYRILLIC CAPITAL LETTER SHA
Шт → Sh·t ; # CYRILLIC CAPITAL LETTER SHA
шт → sh·t ; # CYRILLIC SMALL LETTER SHA
Ш} $lower → Sh ; # CYRILLIC CAPITAL LETTER SHA
Ш → SH ; # CYRILLIC CAPITAL LETTER SHA
ш → sh ; # CYRILLIC SMALL LETTER SHA
Щ} $lower → Sht ; # CYRILLIC CAPITAL LETTER SHCHA
Щ → SHT ; # CYRILLIC CAPITAL LETTER SHCHA
щ → sht ; # CYRILLIC SMALL LETTER SHCHA
# A hard sign that follows a Bulgarian letter and precedes a word boundary is
# dropped. This contextual rule must come before the unconditional hard-sign
# rules: the first matching rule wins, so listing it after them made it
# unreachable.
$bulgarian { [Ъъ] } $wordBoundary → ;
Ъ → Ŭ ; # CYRILLIC CAPITAL LETTER HARD SIGN
ъ → ŭ ; # CYRILLIC SMALL LETTER HARD SIGN
# The soft sign is removed in both cases (the capital rule was missing its
# operator and is now written like the lowercase rule).
Ь → ; # CYRILLIC CAPITAL LETTER SOFT SIGN
ь → ; # CYRILLIC SMALL LETTER SOFT SIGN
Ю} $lower → Yu ; # CYRILLIC CAPITAL LETTER YU
Ю → YU ; # CYRILLIC CAPITAL LETTER YU
ю → yu ; # CYRILLIC SMALL LETTER YU
Я} $lower → Ya ; # CYRILLIC CAPITAL LETTER YA
Я → YA ; # CYRILLIC CAPITAL LETTER YA
я → ya ; # CYRILLIC SMALL LETTER YA
Ѫ → Ŭ ; # CYRILLIC CAPITAL LETTER BIG YUS
ѫ → ŭ ; # CYRILLIC SMALL LETTER BIG YUS
Ѣ} $lower → Ye ; # CYRILLIC CAPITAL LETTER YAT
Ѣ → YE ; # CYRILLIC CAPITAL LETTER YAT
ѣ → ye ; # CYRILLIC SMALL LETTER YAT

View File

@ -1,129 +0,0 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Cyrillic_Latin.txt
# Generated from CLDR
#
:: [Ққ\u0308Ă-ăĔ-ĕĞ-ğĬ-ĭŎ-ŏŬ-ŭ\u0306Ѐ-џҐ-ҕҘ-ҙӁ-ӂӐ-ӟӢ-ӧӬ-ӵӸ-ӹḜ-ḝẮ-ặᾰᾸῐῘῠῨ] ;
:: NFD (NFC) ;
$modprime = ʹ;
$modprime2 = ʺ;
$grave = \u0300;
$acute = \u0301;
$hat = \u0302;
$breve = \u0306 ;
$dot = \u0307 ;
$caron = \u030C ;
$comma = \u0326 ;
$under = \u0331 ;
$descender = ˌ;
я ↔ a $hat ; # CYRILLIC SMALL LETTER YA
Я ↔ A $hat ; # CYRILLIC CAPITAL LETTER YA
ч ↔ c $caron ; # CYRILLIC SMALL LETTER CHE
Ч ↔ C $caron; # CYRILLIC CAPITAL LETTER CHE
э ↔ e $acute; # CYRILLIC SMALL LETTER E
Э ↔ E $acute; # CYRILLIC CAPITAL LETTER E
є ↔ e $hat; # CYRILLIC SMALL LETTER UKRAINIAN IE
Є ↔ E $hat; # CYRILLIC CAPITAL LETTER UKRAINIAN IE
ш ↔ s $caron ; # CYRILLIC SMALL LETTER SHA
Ш ↔ S $caron ; # CYRILLIC CAPITAL LETTER SHA
щ ↔ s $hat ; # CYRILLIC SMALL LETTER SHCHA
Щ ↔ S $hat; # CYRILLIC CAPITAL LETTER SHCHA
ѕ ↔ z $hat ; # CYRILLIC SMALL LETTER DZE
Ѕ ↔ Z $hat; # CYRILLIC CAPITAL LETTER DZE
ю ↔ u $hat ; # CYRILLIC SMALL LETTER YU
Ю ↔ U $hat ; # CYRILLIC CAPITAL LETTER YU
і ↔ i $acute; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
І ↔ I $acute; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
ј ↔ j $caron; # CYRILLIC SMALL LETTER JE
Ј ↔ J $caron; # CYRILLIC CAPITAL LETTER JE
љ ↔ l $hat ; # CYRILLIC SMALL LETTER LJE
Љ ↔ L $hat ; # CYRILLIC CAPITAL LETTER LJE
њ ↔ n $hat ; # CYRILLIC SMALL LETTER NJE
Њ ↔ N $hat ; # CYRILLIC CAPITAL LETTER NJE
ћ ↔ c $acute ; # CYRILLIC SMALL LETTER TSHE
Ћ ↔ C $acute ; # CYRILLIC CAPITAL LETTER TSHE
џ ↔ d $hat ; # CYRILLIC SMALL LETTER DZHE
Џ ↔ D $hat ; # CYRILLIC CAPITAL LETTER DZHE
а ↔ a ; # CYRILLIC SMALL LETTER A
А ↔ A ; # CYRILLIC CAPITAL LETTER A
ә ↔ ə ; # CYRILLIC SMALL LETTER SCHWA
Ә ↔ Ə ; # CYRILLIC CAPITAL LETTER SCHWA
ӕ ↔ æ ; # CYRILLIC SMALL LIGATURE A IE
Ӕ ↔ Æ ; # CYRILLIC CAPITAL LIGATURE A IE
б ↔ b ; # CYRILLIC SMALL LETTER BE
Б ↔ B ; # CYRILLIC CAPITAL LETTER BE
в ↔ v ; # CYRILLIC SMALL LETTER VE
В ↔ V ; # CYRILLIC CAPITAL LETTER VE
ґ ↔ g $grave ; # CYRILLIC SMALL LETTER GHE WITH UPTURN
Ґ ↔ G $grave ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
ғ ↔ g $dot ; # CYRILLIC SMALL LETTER GHE WITH STROKE
Ғ ↔ G $dot; # CYRILLIC CAPITAL LETTER GHE WITH STROKE
ҕ ↔ g $breve; # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK
Ҕ ↔ G $breve; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
г ↔ g ; # CYRILLIC SMALL LETTER GHE
Г ↔ G ; # CYRILLIC CAPITAL LETTER GHE
д ↔ d; # CYRILLIC SMALL LETTER DE
Д ↔ D; # CYRILLIC CAPITAL LETTER DE
ђ ↔ đ ; # CYRILLIC SMALL LETTER DJE
Ђ ↔ Đ ; # CYRILLIC CAPITAL LETTER DJE
ҙ ↔ z $comma ; # CYRILLIC SMALL LETTER ZE WITH DESCENDER
Ҙ ↔ Z $comma ; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
е ↔ e ; # CYRILLIC SMALL LETTER IE
Е ↔ E; # CYRILLIC CAPITAL LETTER IE
ж ↔ z $caron; # CYRILLIC SMALL LETTER ZHE
Ж ↔ Z $caron; # CYRILLIC CAPITAL LETTER ZHE
з ↔ z ; # CYRILLIC SMALL LETTER ZE
З ↔ Z; # CYRILLIC CAPITAL LETTER ZE
# SHORT I is matched in decomposed form (base letter + U+0306 COMBINING BREVE)
# and must precede the plain I rules so the longer sequence is not masked.
и\u0306 ↔ j ; # CYRILLIC SMALL LETTER SHORT I
И\u0306 ↔ J ; # CYRILLIC CAPITAL LETTER SHORT I
и ↔ i ; # CYRILLIC SMALL LETTER I
И ↔ I ; # CYRILLIC CAPITAL LETTER I
қ ↔ k $descender ; # CYRILLIC SMALL LETTER KA WITH DESCENDER
Қ ↔ K $descender ; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
к ↔ k ; # CYRILLIC SMALL LETTER KA
К ↔ K; # CYRILLIC CAPITAL LETTER KA
л ↔ l ; # CYRILLIC SMALL LETTER EL
Л ↔ L; # CYRILLIC CAPITAL LETTER EL
м ↔ m ; # CYRILLIC SMALL LETTER EM
М ↔ M ; # CYRILLIC CAPITAL LETTER EM
н ↔ n ; # CYRILLIC SMALL LETTER EN
Н ↔ N; # CYRILLIC CAPITAL LETTER EN
о ↔ o ; # CYRILLIC SMALL LETTER O
О ↔ O ; # CYRILLIC CAPITAL LETTER O
п ↔ p ; # CYRILLIC SMALL LETTER PE
П ↔ P ; # CYRILLIC CAPITAL LETTER PE
р ↔ r ; # CYRILLIC SMALL LETTER ER
Р ↔ R ; # CYRILLIC CAPITAL LETTER ER
с ↔ s ; # CYRILLIC SMALL LETTER ES
С ↔ S ; # CYRILLIC CAPITAL LETTER ES
т ↔ t ; # CYRILLIC SMALL LETTER TE
Т ↔ T ; # CYRILLIC CAPITAL LETTER TE
у ↔ u ; # CYRILLIC SMALL LETTER U
У ↔ U ; # CYRILLIC CAPITAL LETTER U
ф ↔ f ; # CYRILLIC SMALL LETTER EF
Ф ↔ F ; # CYRILLIC CAPITAL LETTER EF
х ↔ h ; # CYRILLIC SMALL LETTER HA
Х ↔ H; # CYRILLIC CAPITAL LETTER HA
ц ↔ c ; # CYRILLIC SMALL LETTER TSE
Ц ↔ C; # CYRILLIC CAPITAL LETTER TSE
Ъ ↔ $modprime2 $under ; # CYRILLIC CAPITAL LETTER HARD SIGN
ъ ↔ $modprime2 ; # CYRILLIC SMALL LETTER HARD SIGN
Ь ↔ $modprime $under ; # CYRILLIC CAPITAL LETTER SOFT SIGN
ь ↔ $modprime ; # CYRILLIC SMALL LETTER SOFT SIGN
ы ↔ y ; # CYRILLIC SMALL LETTER YERU
Ы ↔ Y ; # CYRILLIC CAPITAL LETTER YERU
$ignore = [[:Mark:]''] * ;
| k ← q ;
| K ← Q ;
| u ← w ;
| U ← W ;
| KS ← X } $ignore [:UppercaseLetter:] ;
| KS ← [:UppercaseLetter:] $ignore { X ;
| Ks ← X ;
| ks ← x ;
:: NFC (NFD) ;
:: ( [ˌ\u0308A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ\u0300-\u0302\u0306-\u0307\u030C\u0326\u0331\u0340-\u0341\u0344ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ;

View File

@ -0,0 +1,279 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Cyrl_Latn.txt
# Generated from CLDR
#
# TODO: add remaining characters
# Should add variants for Russian-English, Russian-German
# Those can use this as a base, and then remap cases
# like a $hat to ya or ja.
# :: [\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]] ;
### WARNING, \u0308 must be added to the generated filters, in both directions ###
# MINIMAL FILTER
:: [Ққ\u0308Ă-ăĔ-ĕĞ-ğĬ-ĭŎ-ŏŬ-ŭ\u0306Ѐ-џҐ-ҕҘ-ҙӁ-ӂӐ-ӟӢ-ӧӬ-ӵӸ-ӹḜ-ḝẮ-ặᾰᾸῐῘῠῨ] ;
:: NFD (NFC) ;
# Named shorthands for the Latin-side marks used by the rules below.
# Prime / double prime modifier letters: Latin stand-ins for the soft sign (ь)
# and the hard sign (ъ) respectively.
$modprime = ʹ;
$modprime2 = ʺ;
# Combining diacritics. The rules run on NFD text, so each mark follows its base letter.
# \u0300 COMBINING GRAVE ACCENT
$grave = \u0300;
# \u0301 COMBINING ACUTE ACCENT
$acute = \u0301;
# \u0302 COMBINING CIRCUMFLEX ACCENT
$hat = \u0302;
# \u0306 COMBINING BREVE
$breve = \u0306 ;
# \u0307 COMBINING DOT ABOVE
$dot = \u0307 ;
# \u030C COMBINING CARON
$caron = \u030C ;
# \u0326 COMBINING COMMA BELOW
$comma = \u0326 ;
# \u0331 COMBINING MACRON BELOW; appended to the prime marks for the capital hard/soft signs
$under = \u0331 ;
# Low vertical line modifier; appended to k/K for the KA WITH DESCENDER letters
$descender = ˌ;
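# Rules whose Latin side is a base letter plus a diacritic are moved up here so that
# the plain single-letter rules under "Normal order" do not mask them.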
я ↔ a $hat ; # CYRILLIC SMALL LETTER YA
Я ↔ A $hat ; # CYRILLIC CAPITAL LETTER YA
ч ↔ c $caron ; # CYRILLIC SMALL LETTER CHE
Ч ↔ C $caron; # CYRILLIC CAPITAL LETTER CHE
# ҷ ↔ XXX ; # CYRILLIC SMALL LETTER CHE WITH DESCENDER
# Ҷ ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE WITH DESCENDER
# ӌ ↔ XXX ; # CYRILLIC SMALL LETTER KHAKASSIAN CHE
# Ӌ ↔ XXX ; # CYRILLIC CAPITAL LETTER KHAKASSIAN CHE
# ҹ ↔ XXX ; # CYRILLIC SMALL LETTER CHE WITH VERTICAL STROKE
# Ҹ ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE WITH VERTICAL STROKE
э ↔ e $acute; # CYRILLIC SMALL LETTER E
Э ↔ E $acute; # CYRILLIC CAPITAL LETTER E
є ↔ e $hat; # CYRILLIC SMALL LETTER UKRAINIAN IE
Є ↔ E $hat; # CYRILLIC CAPITAL LETTER UKRAINIAN IE
ш ↔ s $caron ; # CYRILLIC SMALL LETTER SHA
Ш ↔ S $caron ; # CYRILLIC CAPITAL LETTER SHA
щ ↔ s $hat ; # CYRILLIC SMALL LETTER SHCHA
Щ ↔ S $hat; # CYRILLIC CAPITAL LETTER SHCHA
ѕ ↔ z $hat ; # CYRILLIC SMALL LETTER DZE
Ѕ ↔ Z $hat; # CYRILLIC CAPITAL LETTER DZE
# ӡ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN DZE
# Ӡ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN DZE
ю ↔ u $hat ; # CYRILLIC SMALL LETTER YU
Ю ↔ U $hat ; # CYRILLIC CAPITAL LETTER YU
і ↔ i $acute; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
І ↔ I $acute; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
ј ↔ j $caron; # CYRILLIC SMALL LETTER JE
Ј ↔ J $caron; # CYRILLIC CAPITAL LETTER JE
љ ↔ l $hat ; # CYRILLIC SMALL LETTER LJE
Љ ↔ L $hat ; # CYRILLIC CAPITAL LETTER LJE
њ ↔ n $hat ; # CYRILLIC SMALL LETTER NJE
Њ ↔ N $hat ; # CYRILLIC CAPITAL LETTER NJE
ћ ↔ c $acute ; # CYRILLIC SMALL LETTER TSHE
Ћ ↔ C $acute ; # CYRILLIC CAPITAL LETTER TSHE
џ ↔ d $hat ; # CYRILLIC SMALL LETTER DZHE
Џ ↔ D $hat ; # CYRILLIC CAPITAL LETTER DZHE
# Normal order
а ↔ a ; # CYRILLIC SMALL LETTER A
А ↔ A ; # CYRILLIC CAPITAL LETTER A
ә ↔ ə ; # CYRILLIC SMALL LETTER SCHWA
Ә ↔ Ə ; # CYRILLIC CAPITAL LETTER SCHWA
ӕ ↔ æ ; # CYRILLIC SMALL LIGATURE A IE
Ӕ ↔ Æ ; # CYRILLIC CAPITAL LIGATURE A IE
б ↔ b ; # CYRILLIC SMALL LETTER BE
Б ↔ B ; # CYRILLIC CAPITAL LETTER BE
в ↔ v ; # CYRILLIC SMALL LETTER VE
В ↔ V ; # CYRILLIC CAPITAL LETTER VE
ґ ↔ g $grave ; # CYRILLIC SMALL LETTER GHE WITH UPTURN
Ґ ↔ G $grave ; # CYRILLIC CAPITAL LETTER GHE WITH UPTURN
ғ ↔ g $dot ; # CYRILLIC SMALL LETTER GHE WITH STROKE
Ғ ↔ G $dot; # CYRILLIC CAPITAL LETTER GHE WITH STROKE
ҕ ↔ g $breve; # CYRILLIC SMALL LETTER GHE WITH MIDDLE HOOK
Ҕ ↔ G $breve; # CYRILLIC CAPITAL LETTER GHE WITH MIDDLE HOOK
г ↔ g ; # CYRILLIC SMALL LETTER GHE
Г ↔ G ; # CYRILLIC CAPITAL LETTER GHE
д ↔ d; # CYRILLIC SMALL LETTER DE
Д ↔ D; # CYRILLIC CAPITAL LETTER DE
ђ ↔ đ ; # CYRILLIC SMALL LETTER DJE
Ђ ↔ Đ ; # CYRILLIC CAPITAL LETTER DJE
ҙ ↔ z $comma ; # CYRILLIC SMALL LETTER ZE WITH DESCENDER
Ҙ ↔ Z $comma ; # CYRILLIC CAPITAL LETTER ZE WITH DESCENDER
е ↔ e ; # CYRILLIC SMALL LETTER IE
Е ↔ E; # CYRILLIC CAPITAL LETTER IE
ж ↔ z $caron; # CYRILLIC SMALL LETTER ZHE
Ж ↔ Z $caron; # CYRILLIC CAPITAL LETTER ZHE
# җ ↔ XXX ; # CYRILLIC SMALL LETTER ZHE WITH DESCENDER
# Җ ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE WITH DESCENDER
з ↔ z ; # CYRILLIC SMALL LETTER ZE
З ↔ Z; # CYRILLIC CAPITAL LETTER ZE
и\u0306 ↔ j ; # CYRILLIC SMALL LETTER I
И\u0306 ↔ J ; # CYRILLIC CAPITAL LETTER I
и ↔ i ; # CYRILLIC SMALL LETTER I
И ↔ I ; # CYRILLIC CAPITAL LETTER I
қ ↔ k $descender ; # CYRILLIC SMALL LETTER KA WITH DESCENDER
Қ ↔ K $descender ; # CYRILLIC CAPITAL LETTER KA WITH DESCENDER
к ↔ k ; # CYRILLIC SMALL LETTER KA
К ↔ K; # CYRILLIC CAPITAL LETTER KA
# ӄ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH HOOK
# Ӄ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH HOOK
# ҡ ↔ XXX ; # CYRILLIC SMALL LETTER BASHKIR KA
# Ҡ ↔ XXX ; # CYRILLIC CAPITAL LETTER BASHKIR KA
# ҟ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH STROKE
# Ҟ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH STROKE
# ҝ ↔ XXX ; # CYRILLIC SMALL LETTER KA WITH VERTICAL STROKE
# Ҝ ↔ XXX ; # CYRILLIC CAPITAL LETTER KA WITH VERTICAL STROKE
л ↔ l ; # CYRILLIC SMALL LETTER EL
Л ↔ L; # CYRILLIC CAPITAL LETTER EL
м ↔ m ; # CYRILLIC SMALL LETTER EM
М ↔ M ; # CYRILLIC CAPITAL LETTER EM
н ↔ n ; # CYRILLIC SMALL LETTER EN
Н ↔ N; # CYRILLIC CAPITAL LETTER EN
# ң ↔ XXX ; # CYRILLIC SMALL LETTER EN WITH DESCENDER
# Ң ↔ XXX ; # CYRILLIC CAPITAL LETTER EN WITH DESCENDER
# ӈ ↔ XXX ; # CYRILLIC SMALL LETTER EN WITH HOOK
# Ӈ ↔ XXX ; # CYRILLIC CAPITAL LETTER EN WITH HOOK
# ҥ ↔ XXX ; # CYRILLIC SMALL LIGATURE EN GHE
# Ҥ ↔ XXX ; # CYRILLIC CAPITAL LIGATURE EN GHE
о ↔ o ; # CYRILLIC SMALL LETTER O
О ↔ O ; # CYRILLIC CAPITAL LETTER O
# ө ↔ XXX ; # CYRILLIC SMALL LETTER BARRED O
# Ө ↔ XXX ; # CYRILLIC CAPITAL LETTER BARRED O
п ↔ p ; # CYRILLIC SMALL LETTER PE
П ↔ P ; # CYRILLIC CAPITAL LETTER PE
# ҧ ↔ XXX ; # CYRILLIC SMALL LETTER PE WITH MIDDLE HOOK
# Ҧ ↔ XXX ; # CYRILLIC CAPITAL LETTER PE WITH MIDDLE HOOK
# ҁ ↔ XXX ; # CYRILLIC SMALL LETTER KOPPA
# Ҁ ↔ XXX ; # CYRILLIC CAPITAL LETTER KOPPA
р ↔ r ; # CYRILLIC SMALL LETTER ER
Р ↔ R ; # CYRILLIC CAPITAL LETTER ER
# ҏ ↔ XXX ; # CYRILLIC SMALL LETTER ER WITH TICK
# Ҏ ↔ XXX ; # CYRILLIC CAPITAL LETTER ER WITH TICK
с ↔ s ; # CYRILLIC SMALL LETTER ES
С ↔ S ; # CYRILLIC CAPITAL LETTER ES
# ҫ ↔ XXX ; # CYRILLIC SMALL LETTER ES WITH DESCENDER
# Ҫ ↔ XXX ; # CYRILLIC CAPITAL LETTER ES WITH DESCENDER
т ↔ t ; # CYRILLIC SMALL LETTER TE
Т ↔ T ; # CYRILLIC CAPITAL LETTER TE
# ҭ ↔ XXX ; # CYRILLIC SMALL LETTER TE WITH DESCENDER
# Ҭ ↔ XXX ; # CYRILLIC CAPITAL LETTER TE WITH DESCENDER
у ↔ u ; # CYRILLIC SMALL LETTER U
У ↔ U ; # CYRILLIC CAPITAL LETTER U
# ү ↔ XXX ; # CYRILLIC SMALL LETTER STRAIGHT U
# Ү ↔ XXX ; # CYRILLIC CAPITAL LETTER STRAIGHT U
# ұ ↔ XXX ; # CYRILLIC SMALL LETTER STRAIGHT U WITH STROKE
# Ұ ↔ XXX ; # CYRILLIC CAPITAL LETTER STRAIGHT U WITH STROKE
# ѹ ↔ XXX ; # CYRILLIC SMALL LETTER UK
# Ѹ ↔ XXX ; # CYRILLIC CAPITAL LETTER UK
ф ↔ f ; # CYRILLIC SMALL LETTER EF
Ф ↔ F ; # CYRILLIC CAPITAL LETTER EF
х ↔ h ; # CYRILLIC SMALL LETTER HA
Х ↔ H; # CYRILLIC CAPITAL LETTER HA
# ҳ ↔ XXX ; # CYRILLIC SMALL LETTER HA WITH DESCENDER
# Ҳ ↔ XXX ; # CYRILLIC CAPITAL LETTER HA WITH DESCENDER
# һ ↔ XXX ; # CYRILLIC SMALL LETTER SHHA
# Һ ↔ XXX ; # CYRILLIC CAPITAL LETTER SHHA
# ѡ ↔ XXX ; # CYRILLIC SMALL LETTER OMEGA
# Ѡ ↔ XXX ; # CYRILLIC CAPITAL LETTER OMEGA
# ѿ ↔ XXX ; # CYRILLIC SMALL LETTER OT
# Ѿ ↔ XXX ; # CYRILLIC CAPITAL LETTER OT
# ѽ ↔ XXX ; # CYRILLIC SMALL LETTER OMEGA WITH TITLO
# Ѽ ↔ XXX ; # CYRILLIC CAPITAL LETTER OMEGA WITH TITLO
# ѻ ↔ XXX ; # CYRILLIC SMALL LETTER ROUND OMEGA
# Ѻ ↔ XXX ; # CYRILLIC CAPITAL LETTER ROUND OMEGA
ц ↔ c ; # CYRILLIC SMALL LETTER TSE
Ц ↔ C; # CYRILLIC CAPITAL LETTER TSE
# ҵ ↔ XXX ; # CYRILLIC SMALL LIGATURE TE TSE
# Ҵ ↔ XXX ; # CYRILLIC CAPITAL LIGATURE TE TSE
# ҽ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN CHE
# Ҽ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE
# ҿ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN CHE WITH DESCENDER
# Ҿ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN CHE WITH DESCENDER
Ъ ↔ $modprime2 $under ; # CYRILLIC CAPITAL LETTER HARD SIGN
ъ ↔ $modprime2 ; # CYRILLIC SMALL LETTER HARD SIGN
Ь ↔ $modprime $under ; # CYRILLIC CAPITAL LETTER SOFT SIGN
ь ↔ $modprime ; # CYRILLIC SMALL LETTER SOFT SIGN
ы ↔ y ; # CYRILLIC SMALL LETTER YERU
Ы ↔ Y ; # CYRILLIC CAPITAL LETTER YERU
# ҍ ↔ XXX ; # CYRILLIC SMALL LETTER SEMISOFT SIGN
# Ҍ ↔ XXX ; # CYRILLIC CAPITAL LETTER SEMISOFT SIGN
# ѣ ↔ XXX ; # CYRILLIC SMALL LETTER YAT
# Ѣ ↔ XXX ; # CYRILLIC CAPITAL LETTER YAT
# ѥ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED E
# Ѥ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED E
# ѧ ↔ XXX ; # CYRILLIC SMALL LETTER LITTLE YUS
# Ѧ ↔ XXX ; # CYRILLIC CAPITAL LETTER LITTLE YUS
# ѫ ↔ XXX ; # CYRILLIC SMALL LETTER BIG YUS
# Ѫ ↔ XXX ; # CYRILLIC CAPITAL LETTER BIG YUS
# ѩ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED LITTLE YUS
# Ѩ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED LITTLE YUS
# ѭ ↔ XXX ; # CYRILLIC SMALL LETTER IOTIFIED BIG YUS
# Ѭ ↔ XXX ; # CYRILLIC CAPITAL LETTER IOTIFIED BIG YUS
# ѯ ↔ XXX ; # CYRILLIC SMALL LETTER KSI
# Ѯ ↔ XXX ; # CYRILLIC CAPITAL LETTER KSI
# ѱ ↔ XXX ; # CYRILLIC SMALL LETTER PSI
# Ѱ ↔ XXX ; # CYRILLIC CAPITAL LETTER PSI
# ѳ ↔ XXX ; # CYRILLIC SMALL LETTER FITA
# Ѳ ↔ XXX ; # CYRILLIC CAPITAL LETTER FITA
# ѵ ↔ XXX ; # CYRILLIC SMALL LETTER IZHITSA
# Ѵ ↔ XXX ; # CYRILLIC CAPITAL LETTER IZHITSA
# ҩ ↔ XXX ; # CYRILLIC SMALL LETTER ABKHASIAN HA
# Ҩ ↔ XXX ; # CYRILLIC CAPITAL LETTER ABKHASIAN HA
# Ӏ ↔ XXX ; # CYRILLIC LETTER PALOCHKA
### а\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER A
### А\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER A
### а\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER A
### А\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER A
### ә\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER SCHWA
### Ә\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER SCHWA
### г\u0301 ↔ XXX ; # CYRILLIC SMALL LETTER GHE
### Г\u0301 ↔ XXX ; # CYRILLIC CAPITAL LETTER GHE
### е\u0300 ↔ XXX ; # CYRILLIC SMALL LETTER IE
### Е\u0300 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE
### е\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER IE
### Е\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE
### е\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER IE
### Е\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER IE
### ж\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER ZHE
### Ж\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE
### ж\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER ZHE
### Ж\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZHE
### з\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER ZE
### З\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER ZE
### и\u0300 ↔ XXX ; # CYRILLIC SMALL LETTER I
### И\u0300 ↔ XXX ; # CYRILLIC CAPITAL LETTER I
### и\u0304 ↔ XXX ; # CYRILLIC SMALL LETTER I
### И\u0304 ↔ XXX ; # CYRILLIC CAPITAL LETTER I
### и\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER I
### И\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER I
### і\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I
### І\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I
### о\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER O
### О\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER O
### ө\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER BARRED O
### Ө\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER BARRED O
### к\u0301 ↔ XXX ; # CYRILLIC SMALL LETTER KA
### К\u0301 ↔ XXX ; # CYRILLIC CAPITAL LETTER KA
### у\u0304 ↔ XXX ; # CYRILLIC SMALL LETTER U
### У\u0304 ↔ XXX ; # CYRILLIC CAPITAL LETTER U
### у\u0306 ↔ XXX ; # CYRILLIC SMALL LETTER U
### У\u0306 ↔ XXX ; # CYRILLIC CAPITAL LETTER U
### у\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER U
### У\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER U
### у\u030B ↔ XXX ; # CYRILLIC SMALL LETTER U
### У\u030B ↔ XXX ; # CYRILLIC CAPITAL LETTER U
### ч\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER CHE
### Ч\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER CHE
### ы\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER YERU
### Ы\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER YERU
### э\u0308 ↔ XXX ; # CYRILLIC SMALL LETTER E
### Э\u0308 ↔ XXX ; # CYRILLIC CAPITAL LETTER E
### ѵ\u030F ↔ XXX ; # CYRILLIC SMALL LETTER IZHITSA
### Ѵ\u030F ↔ XXX ; # CYRILLIC CAPITAL LETTER IZHITSA
# Completeness
# Reverse-only (Latin → Cyrillic) rules for Latin letters that no two-way rule above
# produces: q, w and x. Each one rewrites the letter into Latin letters that ARE covered
# (k, u, ks); the leading "|" leaves the cursor before the replacement so the rules
# above then convert it to Cyrillic.
# $ignore: any run of combining marks and apostrophes, skipped when looking at the
# neighbouring letter to decide the case of the second letter of KS/Ks.
$ignore = [[:Mark:]''] * ;
| k ← q ;
| K ← Q ;
| u ← w ;
| U ← W ;
# X followed by an uppercase letter → KS (all caps)
| KS ← X } $ignore [:UppercaseLetter:] ;
# X preceded by an uppercase letter → KS (all caps)
| KS ← [:UppercaseLetter:] $ignore { X ;
# any other X → Ks (title case)
| Ks ← X ;
| ks ← x ;
:: NFC (NFD) ;
# note: a global filter is more efficient, but MUST include all source chars!!
# :: ([\u0000-\u007E ʹ ʺ [:Cyrillic:] [:Latin:] [:nonspacing mark:]]);
# MINIMAL FILTER: Latin-Cyrillic
:: ( [ˌ\u0308A-Za-zÀ-ÏÑ-ÖÙ-Ýà-ïñ-öù-ýÿ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƏƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳəʹ-ʺ\u0300-\u0302\u0306-\u0307\u030C\u0326\u0331\u0340-\u0341\u0344ʹ΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЀЃЌ-ЎЙйѐѓќ-ўӁ-ӂӐ-ӑӖ-ӗḀ-ẙẛẠ-ỹἂ-ἅἊ-Ἅἒ-ἕἚ-Ἕἢ-ἥἪ-Ἥἲ-ἵἺ-Ἵὂ-ὅὊ-Ὅὒ-ὕὛὝὢ-ὥὪ-Ὥὰ-ώᾂ-ᾅᾊ-ᾍᾒ-ᾕᾚ-ᾝᾢ-ᾥᾪ-ᾭᾰᾲᾴᾸᾺ-ΆῂῄῈ-Ή῍-῎ῐῒ-ΐῘῚ-Ί῝-῞ῠῢ-ΰῨῪ-Ύ῭-΅ῲῴῸ-ΏK-Å] ) ;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Bengali.txt
# File: Deva_Beng.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Bengali, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Bengali;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Gujarati.txt
# File: Deva_Gujr.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Gujarati, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Gujarati;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Gurmukhi.txt
# File: Deva_Guru.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Gurmukhi, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Gurmukhi;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Kannada.txt
# File: Deva_Knda.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Kannada, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Kannada;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Latin.txt
# File: Deva_Latn.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Latin, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Latin;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Malayalam.txt
# File: Deva_Mlym.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Malayalam, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Malayalam;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Oriya.txt
# File: Deva_Orya.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Oriya, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Oriya;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Tamil.txt
# File: Deva_Taml.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Tamil, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Tamil;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_Telugu.txt
# File: Deva_Telu.txt
# Generated from CLDR
#
::[\u0901-ःऄअ-ह\u093C-\u094Dॐ-\u0954क़-९ॽ];
# Pipeline: decompose (NFD), map Devanagari to the InterIndic intermediate form,
# map InterIndic to Telugu, then recompose (NFC).
::NFD;
::Devanagari-InterIndic;
::InterIndic-Telugu;
::NFC;

View File

@ -1,12 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Devanagari_InterIndic.txt
# Generated from CLDR
#
# Devanagari-InterIndic
# :: NFD;
#Rules for Decomposed characters
\u0901→\uE001; # SIGN CANDRABINDU
\u0902→\uE002; # SIGN ANUSVARA
ः→\uE003; # SIGN VISARGA
@ -113,3 +117,5 @@
९→\uE06F; # DIGIT NINE
॰→\uE070; # Devanagari-InterIndic: ABBREVIATION SIGN
ॽ→\uE082; # Devanagari Glottal Stop
# :: NFC (NFD) ;

View File

@ -1,12 +1,18 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Fullwidth_Halfwidth.txt
# Generated from CLDR
#
# Fullwidth-Halfwidth
# Mechanically generated from Unicode Character Database
# IDEOGRAPHIC SPACE then added, and
# FULLWIDTH MACRON changed to map to MACRON, not SPACE + COMBINING MACRON
# multicharacter
ガ↔ガ; # to KATAKANA LETTER GA
ギ↔ギ; # to KATAKANA LETTER GI
グ↔グ; # to KATAKANA LETTER GU
@ -35,6 +41,7 @@
ヴ↔ヴ; # to KATAKANA LETTER VU
ヷ↔ヷ; # to KATAKANA LETTER VA
ヺ↔ヺ; # to KATAKANA LETTER VO
# single character
!↔'!'; # from FULLWIDTH EXCLAMATION MARK
"↔'"'; # from FULLWIDTH QUOTATION MARK
#↔'#'; # from FULLWIDTH NUMBER SIGN
@ -259,3 +266,5 @@
↓↔↓; # to HALFWIDTH DOWNWARDS ARROW
■↔■; # to HALFWIDTH BLACK SQUARE
○↔○; # to HALFWIDTH WHITE CIRCLE
# eof

View File

@ -1,12 +1,14 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Georgian_Latin.txt
# File: Geor_Latn.txt
# Generated from CLDR
#
# long items moved up
წ ↔ tsʼ ;
კ ↔ kʼ ;
პ ↔ pʼ ;
@ -21,6 +23,7 @@
ძ ↔ dz ;
ხ ↔ kh ;
ჳ ↔ ŭi ;
# normal order
ა ↔ a ;
ბ ↔ b ;
გ ↔ g ;
@ -30,15 +33,30 @@
ზ ↔ z ;
თ ↔ t ;
ი ↔ i ;
#კ ↔ kʼ ;
ლ ↔ l ;
მ ↔ m ;
ნ ↔ n ;
ო ↔ o ;
#პ ↔ pʼ ;
#ჟ ↔ zh ;
რ ↔ r ;
ს ↔ s ;
#ტ ↔ tʼ ;
უ ↔ u ;
ფ ↔ p ;
ქ ↔ k ;
#ღ ↔ gh ;
#ყ ↔ qʼ ;
#შ ↔ sh ;
#ჩ ↔ ch ;
#ც ↔ ts ;
#ძ ↔ dz ;
#წ ↔ tsʼ ;
#ჭ ↔ chʼ ;
#ხ ↔ kh ;
ჯ ↔ j ;
ჰ ↔ h ;
#ჳ ↔ ŭi ;
ჴ ↔ q ;

View File

@ -1,18 +1,33 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Greek_Latin.txt
# File: Grek_Latn.txt
# Generated from CLDR
#
# Rules are predicated on running NFD first, and NFC afterwards
# :: [\u0000-\u007F \u0370-Ͽ [:Greek:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Greek-Latin
:: [;µ·ÄËÏÖÜäëïöüÿ-āĒ-ēĪ-īŌ-ōŪ-ūŸǕ-ǜǞ-ǣǬ-ǭȪ-ȭȰ-ȳ\u0304\u0308\u0313-\u0314\u0342-\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϗϛϝϟϡϣϥϧϩϫϭϯ-ϵϷ-\u07FBЁЇёїӒ-ӓӚ-ӟӢ-ӧӪ-ӱӴ-ӵӸ-ӹḔ-ḗḠ-ḡḦ-ḧḮ-ḯḸ-ḹṎ-ṓṜ-ṝṺ-ṻẄ-ẅẌ-ẍẗἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-ῌ῏-ΐῖ-Ί῟-Ῥῲ-ῴῶ-ῼΩϹ] ;
:: NFD (NFC) ;
# TEST CASES
# Ὀλίγοι ἔμφονες πολλῶν ἀφρόνων φοβερώτεροι — Πλάτωνος
# ᾂ ᾒ ᾢ ᾃ ᾓ ᾣ
# ᾳ ῃ ῳ ὃ ὄ
# ὠς ὡς ὢς ὣς
# Ὠς Ὡς Ὢς Ὣς
# ὨΣ ὩΣ ὪΣ ὫΣ
# Ạ, ạ, Ẹ, ẹ, Ọ, ọ
# Useful variables
$lower = [[:latin:][:greek:] & [:Ll:]];
$glower = [[:greek:] & [:Ll:]];
$upper = [[:latin:][:greek:] & [:Lu:]] ;
$accent = [:M:] ;
# NOTE: restrict to just the Greek & Latin accents that we care about
# TODO: broaden out once iteration is fixed
$accentMinus = [ [\u0300-\u0345] & [:M:] - [\u0338]] ;
$macron = \u0304 ;
$ddot = \u0308 ;
@ -37,18 +52,27 @@ $beforeLetter = [[:M:]\']* [:L:] ;
$beforeLower = $accent * $lower ;
$notLetter = [^[:L:][:M:]] ;
$under = \u0331;
# Fix punctuation
# preserve original
\: ↔ \: $under ;
\? ↔ \? $under ;
\; ↔ \? ;
· ↔ \: ;
# CIRCUMFLEX: convert greek circumflex to normal one. Could use tilde or inverted breve
\u0342 ↔ \u0302 ;
# IOTA: convert iota subscript to iota
# first make previous alpha long!
$accent_minus = [[$accent]-[$iotasub$macron]];
Α } $accent_minus * $iotasub → | Α $macron ;
α } $accent_minus * $iotasub → | α $macron ;
# now convert to uppercase if after uppercase, otherwise to lowercase
$upper $accent * { $iotasub → I ;
$iotasub → i ;
| $1 $iotasub ← ($evowel $macron $accentMinus *) i ;
| $1 $iotasub ← ($evowel $macron $accentMinus *) I ;
# BREATHING
# Convert rough breathing to h, and move before letters.
# Make A ` x = → H a x
Α ($macron?) $rough } $beforeLower → H | α $1;
Ε $rough } $beforeLower → H | ε;
Η $rough } $beforeLower → H | η ;
@ -56,6 +80,7 @@ $iotasub → i ;
Ο $rough } $beforeLower → H | ο ;
Υ $rough } $beforeLower → H | υ ;
Ω ($ddot?) $rough } $beforeLower → H | ω $1;
# Make A x ` = → H a x
Α ($glower $macron?) $rough → H | α $1 ;
Ε ($glower) $rough → H | ε $1 ;
Η ($glower) $rough → H | η $1 ;
@ -63,14 +88,18 @@ $iotasub → i ;
Ο ($glower) $rough → H | ο $1 ;
Υ ($glower) $rough → H | υ $1 ;
Ω ($glower $ddot?) $rough → H | ω $1 ;
#Otherwise, make x ` into h x and X ` into H X
($lcgvowel + $ddotmac? ) $rough → h | $1 ;
($gvowel + $ddotmac? ) $rough → H | $1 ;
# Go backwards with H
| $1 $rough ← h ($evowel $macron $ddot? $evowel2_i $macron?) ;
| $1 $rough ← h ($evowel $ddot? $evowel2 $macron?) ;
| $1 $rough ← h ($evowel $macron? $ddot?) ;
| $1 $rough ← H ([AEIOUY] $macron $ddot? $evowel2_i $macron?) ;
| $1 $rough ← H ([AEIOUY] $ddot? $evowel2 $macron?) ;
| $1 $rough ← H ([AEIOUY] $macron? $ddot?) ;
# titlecase, have to fix individually
# in the future, we should add &uppercase() to make this easier
| A $1 $rough ← H a ($macron $ddot? $evowel2_i $macron?) ;
| E $1 $rough ← H e ($macron $ddot? $evowel2_i $macron?) ;
| I $1 $rough ← H i ($macron $ddot? $evowel2_i $macron?) ;
@ -89,10 +118,18 @@ $iotasub → i ;
| O $1 $rough ← H o ($macron? $ddot? ) ;
| U $1 $rough ← H u ($macron? $ddot? ) ;
| Y $1 $rough ← H y ($macron? $ddot? ) ;
# Now do smooth
#delete smooth breathing for Latin
$smooth → ;
# insert in Greek
# the assumption is that all Marks are on letters.
| $1 $smooth ← $notLetter { ([rR]) } [^hH$smooth$rough] ;
| $1 $smooth ← $notLetter { ($evowel $macron? $evowel2 $macron?) } [^$smooth$rough] ;
| $1 $smooth ← $notLetter { ($evowel $macron?) } [^$evowel2$smooth$rough] ;
# TODO: preserve smooth/rough breathing if not
# on initial vowel sequence
# need to have these up here so the rules don't mask
# remove now superfluous macron when returning
Α ← A $macron ;
α ← a $macron ;
η ↔ e $macron ;
@ -105,6 +142,7 @@ $smooth → ;
ψ ↔ ps ;
ω ↔ o $macron ;
Ω ↔ O $macron;
# NORMAL
α ↔ a ;
Α ↔ A ;
β ↔ b ;
@ -145,17 +183,24 @@ $smooth → ;
Ρ $rough ↔ RH ;
ρ ↔ r ;
Ρ ↔ R ;
# insert separator before things that turn into s
[Pp] { } [ςσΣϷϸϺϻ] → \' ;
# special S variants
Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
# underbar means exception
# before a letter, initial
ς } $beforeLetter ↔ s $underbar } $beforeLetter;
σ } $beforeLetter ↔ s } $beforeLetter;
# otherwise, after a letter = final
$afterLetter { σ ↔ $afterLetter { s $underbar;
$afterLetter { ς ↔ $afterLetter { s ;
# otherwise (isolated) = initial
ς ↔ s $underbar;
σ ↔ s ;
# [Pp] { Σ ↔ \'S ;
Σ ↔ S ;
τ ↔ t ;
Τ ↔ T ;
@ -166,6 +211,7 @@ $vowel { Υ ↔ U ;
χ ↔ ch ;
Χ } $beforeLower ↔ Ch ;
Χ ↔ CH ;
# Completeness for ASCII
$ignore = [[:Mark:]''] * ;
| k ← c ;
| ph ← f ;
@ -187,6 +233,7 @@ $rough } $ignore [:UppercaseLetter:] → H ;
$ignore [:UppercaseLetter:] { $rough → H ;
$rough ← H ;
$rough ↔ h ;
# Completeness for Greek
ϐ → | β ;
ϑ → | θ ;
ϒ → | Υ ;
@ -201,7 +248,12 @@ $rough ↔ h ;
ϵ → | ε ;
µ → | μ ;
ͺ → i;
# delete any trailing ' marks used for roundtripping
← [Ππ] { \' } [Ss] ;
← [Νν] { \' } $egammaLike ;
::NFC (NFD) ;
# ([\u0000-\u007F [:Latin:] [:Greek:] [:nonspacing mark:]]) ;
# ([\u0000-\u007F · [:Latin:] [:nonspacing mark:]]) ;
# MINIMAL FILTER GENERATED FOR: Latin-Greek BACKWARD
:: ( [':?A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0300-\u0337\u0339-\u0345΅-ΆΈ-ΊΌΎ-ΐΪ-ΰϊ-ώϓ-ϔЀ-ЁЃЇЌ-ЎЙйѐ-ёѓїќ-ўѶ-ѷӁ-ӂӐ-ӓӖ-ӗӚ-ӟӢ-ӧӪ-ӵӸ-ӹḀ-ẙẛẠ-ỹἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼ῁-ῄῆ-ΐῖ-Ί῝-΅ῲ-ῴῶ-ῼK-Å] ) ;

View File

@ -1,14 +1,21 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Greek_Latin_UNGEGN.txt
# File: Grek_Latn_UNGEGN.txt
# Generated from CLDR
#
# For modern Greek, based on UNGEGN rules.
# Rules are predicated on running NFD first, and NFC afterwards
# MINIMAL FILTER GENERATED FOR: Greek-Latin/UNGEGN
# WARNING: need to add accents to both filters ###
# :: [\u0301\u0304\u0306\u0308;µ·ÀÂÈÊÌÎÒÔÙÛàâèêìîòôùûĈ-ĉĜ-ĝĤ-ĥĴ-ĵŜ-ŝŴ-ŷǛ-ǜǸ-ǹ\u0300\u0302\u0313-\u0314\u0340\u0342-\u0343\u0345ͺ;Ά-ΊΌΎ-ΡΣ-ώϐ-ϖϰ-ϵЀЍѐѝḔ-ḕṐ-ṑẀ-ẁẐ-ẑẤ-ậẰ-ằẾ-ệỐ-ộỜ-ờỪ-ừỲ-ỳἀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼι῁-ῄῆ-῍῏-ΐῖ-Ί῝῟-῭ῲ-ῴῶ-ῼΩϷ-\u07FBϹ] ;
:: [[[:Greek:][:Mn:][:Me:]] [\:-;?·;·]] ;
::NFD (NFC) ;
# Useful variables
$lower = [[:latin:][:greek:] & [:Ll:]] ;
$upper = [[:latin:][:greek:] & [:Lu:]] ;
$accent = [[:Mn:][:Me:]] ;
@ -31,10 +38,13 @@ $under = \u0331;
$caron = \u030C;
$afterLetter = [:L:] [\'$accent]* ;
$beforeLetter = [\'$accent]* [:L:] ;
# Fix punctuation
# preserve original
\: ↔ \: $under ;
\? ↔ \? $under ;
\; ↔ \? ;
· ↔ \: ;
# Fix any ancient characters that creep in
\u0342 → \u0301 ;
\u0302 → \u0301 ;
\u0300 → \u0301 ;
@ -42,6 +52,7 @@ $smooth → ;
$rough → ;
$iotasub → ;
ͺ → ;
# need to have these up here so the rules don't mask
η ↔ i $under ;
Η ↔ I $under ;
Ψ } $beforeLower ↔ Ps ;
@ -49,6 +60,7 @@ $iotasub → ;
ψ ↔ ps ;
ω ↔ o $under ;
Ω ↔ O $under;
# at beginning or end of word, convert mp to b
[^[:L:]$accent] { μπ → b ;
μπ } [^[:L:]$accent] → b ;
[^[:L:]$accent] { [Μμ][Ππ] → B ;
@ -56,6 +68,7 @@ $iotasub → ;
μπ ← b ;
Μπ ← B } $beforeLower ;
ΜΠ ← B ;
# handle diphthongs ending with upsilon
ου ↔ ou ;
ΟΥ ↔ OU ;
Ου ↔ Ou ;
@ -70,6 +83,7 @@ $fmaker { Υ } $softener ↔ V $under ;
$fmaker { Υ ↔ U $under ;
υ ↔ y ;
Υ ↔ Y ;
# NORMAL
α ↔ a ;
Α ↔ A ;
β ↔ v ;
@ -107,17 +121,24 @@ $fmaker { Υ ↔ U $under ;
Π ↔ P ;
ρ ↔ r ;
Ρ ↔ R ;
# insert separator before things that turn into s
[Pp] { } [ςσΣϷϸϺϻ] → \' ;
# special S variants
Ϸ ↔ S\u030C ; # Ϸ GREEK CAPITAL LETTER SHO Uppercase_Letter Grek - L
ϸ ↔ s\u030C ; #ϸ GREEK SMALL LETTER SHO Lowercase_Letter Grek - L
Ϻ ↔ S\u0302 ; # Ϻ GREEK CAPITAL LETTER SAN Uppercase_Letter Grek - L
ϻ ↔ s\u0302 ; # ϻ GREEK SMALL LETTER SAN Lowercase_Letter Grek - L
# Underbar ($under) means exception
# before a letter, initial
ς } $beforeLetter ↔ s $under } $beforeLetter;
σ } $beforeLetter ↔ s } $beforeLetter;
# otherwise, after a letter = final
$afterLetter { σ ↔ $afterLetter { s $under;
$afterLetter { ς ↔ $afterLetter { s ;
# otherwise (isolated) = initial
ς ↔ s $under;
σ ↔ s ;
# [Pp] { Σ ↔ \'S ;
Σ ↔ S ;
τ ↔ t ;
Τ ↔ T ;
@ -126,6 +147,8 @@ $afterLetter { ς ↔ $afterLetter { s ;
χ ↔ ch ;
Χ } $beforeLower ↔ Ch ;
Χ ↔ CH ;
# Completeness for ASCII
# $ignore = [[:Mark:]''] * ;
| ch ← h ;
| k ← c ;
| i ← j ;
@ -142,6 +165,7 @@ $afterLetter { ς ↔ $afterLetter { s ;
| B ← U } $vowel ;
| Y ← W ;
| Y ← U ;
# Completeness for Greek
ϐ → | β ;
ϑ → | θ ;
ϒ → | Υ ;
@ -155,7 +179,10 @@ $afterLetter { ς ↔ $afterLetter { s ;
ϴ → | Θ ;
ϵ → | ε ;
µ → | μ ;
# delete any trailing ' marks used for roundtripping
← [Ππ] { \' } [Ss] ;
← [Νν] { \' } $egammaLike ;
::NFC (NFD) ;
# MINIMAL FILTER GENERATED FOR: Latin-Greek/UNGEGN BACKWARD
:: ([[[:Latin:][:Mn:][:Me:]] ['\:?]]) ;

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_InterIndic.txt
# Generated from CLDR
#
# Gujarati-InterIndic
#:: NFD (NFC) ;
\u0A81→\uE001; # SIGN CANDRABINDU
\u0A82→\uE002; # SIGN ANUSVARA
ઃ→\uE003; # SIGN VISARGA
@ -90,3 +93,6 @@
।→\uE064; # DANDA
॥→\uE065; # DOUBLE DANDA
૰→\uE070; # ABBREVIATION SIGN
# :: NFC (NFD) ;
# eof

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Bengali.txt
# File: Gujr_Beng.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
# Pipeline: decompose (NFD), map Gujarati to the InterIndic intermediate form,
# map InterIndic to Bengali, then recompose (NFC).
::NFD;
::Gujarati-InterIndic;
::InterIndic-Bengali;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Devanagari.txt
# File: Gujr_Deva.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
# Pipeline: decompose (NFD), map Gujarati to the InterIndic intermediate form,
# map InterIndic to Devanagari, then recompose (NFC).
::NFD;
::Gujarati-InterIndic;
::InterIndic-Devanagari;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Gurmukhi.txt
# File: Gujr_Guru.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Gurmukhi;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Kannada.txt
# File: Gujr_Knda.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Kannada;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Latin.txt
# File: Gujr_Latn.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Latin;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Malayalam.txt
# File: Gujr_Mlym.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Malayalam;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Oriya.txt
# File: Gujr_Orya.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Oriya;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Tamil.txt
# File: Gujr_Taml.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Tamil;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gujarati_Telugu.txt
# File: Gujr_Telu.txt
# Generated from CLDR
#
::[।-॥\u0A81-ઃઅ-ઍએ-ઑઓ-નપ-રલ-ળવ-હ\u0ABC-\u0AC5\u0AC7-ૉો-\u0ACDૐૠૡ-૯];
::NFD;
::Gujarati-InterIndic;
::InterIndic-Telugu;
::NFC;

View File

@ -1,12 +1,21 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_InterIndic.txt
# Generated from CLDR
#
# Gurmukhi-InterIndic
#:: NFD (NFC) ;
#ਖ\u0A3C→\uE059; # LETTER KHHA
#ਗ\u0A3C→\uE05A; # LETTER GHHA
#ਜ\u0A3C→\uE05B; # LETTER ZA
#ਸ\u0A3C→\uE036; # LETTER SHA
#ਲ\u0A3C→\uE033; # LETTER LLA
#ਫ\u0A3C→\uE05E; # LETTER FA
\u0A01→\uE001; # SIGN CHANDRABINDU
\u0A02→\uE002; # SIGN BINDI
ਅ→\uE005; # LETTER A
@ -83,3 +92,6 @@
ੴ→\uE080; # EK ONKAR
।→\uE064; # DANDA
॥→\uE065; # DOUBLE DANDA
# :: NFC (NFD) ;
# eof

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Bengali.txt
# File: Guru_Beng.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Bengali;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Devanagari.txt
# File: Guru_Deva.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Devanagari;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Gujarati.txt
# File: Guru_Gujr.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Gujarati;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Kannada.txt
# File: Guru_Knda.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Kannada;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Latin.txt
# File: Guru_Latn.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Latin;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Malayalam.txt
# File: Guru_Mlym.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Malayalam;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Oriya.txt
# File: Guru_Orya.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Oriya;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Tamil.txt
# File: Guru_Taml.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Tamil;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Gurmukhi_Telugu.txt
# File: Guru_Telu.txt
# Generated from CLDR
#
::[।-॥\u0A01\u0A02ਅ-ਊਏ-ਐਓ-ਨਪ-ਰਲ-ਲ਼ਵ-ਸ਼ਸ-ਹ\u0A3Cਾ-\u0A42\u0A47-\u0A48\u0A4B-\u0A4Dਖ਼-ੜਫ਼੦-ੴ];
::NFD;
::Gurmukhi-InterIndic;
::InterIndic-Telugu;
::NFC;

View File

@ -1,19 +1,33 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Han_Latin_Names.txt
# Generated from CLDR
#
# This transform is primarily intended to produce readings for Chinese surnames, or for full
# Chinese personal names - surname first - that occur at the beginning of a contiguous Han substring
# (i.e. at the beginning of text, or immediately preceded by space or other non-Han characters).
# Several Han characters have different readings in surnames than the readings found in Han-Latin.
# ----
# Insert marker at start of each Han sequence (including Han after space).
# Do this before ::Han-Spacedhan() to catch Han after space in original text,
# and to apply before all other rules.
$startOfHanMarker = \uFDD1;
[:^script=Han:] { ([:script=Han:]) → $startOfHanMarker $1;
# Need Spacedhan so the name transliterations get spaced properly
::Han-Spacedhan();
# Convert special name readings that depend on next character
令 } \u0020? 狐 →líng;
万 } \u0020? 俟 →mò;
澹 } \u0020? 台 →tán;
# The following maps 长 to the standard Han-Latin reading zhǎng for this case,
# to override the normal Han-Latin/Names reading 长→cháng further below
$startOfHanMarker{ 长 } \u0020? 孙 →zhǎng;
# Convert single characters with special name readings
$startOfHanMarker{ 秘→bì;
$startOfHanMarker{ 卜→bǔ;
长→cháng;
@ -48,7 +62,11 @@ $startOfHanMarker{ 员→yùn;
$startOfHanMarker{ 查→zhā;
翟→zhái;
曾→zēng;
# Convert $startOfHanMarkers to space, or to nothing if they are at the beginning of text.
# Need to do this as a separate pass to get the spacing right.
::Null();
[^$]{ $startOfHanMarker →\u0020;
$startOfHanMarker →;
# Then run the normal Han-Latin transform for the rest
::Han-Latin();

View File

@ -1,18 +1,27 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Han_Spacedhan.txt
# Generated from CLDR
#
# Only intended for internal use
# Make sure Han are normalized, including characters that contain them.
# The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]
# Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!
:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc;
:: fullwidth-halfwidth;
。 → '.';
$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];
$initialPunct = [:Ps:][:Pi:];
# add space between any Han or terminal punctuation and letters, and
# between letters and Han or initial punct
[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;
[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;
# remove spacing between ideographs and other letters
← [:Ideographic:] { ' ' } [:Letter:] ;
← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;

View File

@ -1,13 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Hangul_Latin.txt
# File: Hang_Latn.txt
# Generated from CLDR
#
::['ᄀ-하-ᅵᆨ-ᇂㄱ-ㄿㅁ-ㅃㅅ-ㅣ㈀-㈜㉠-㉻가-힣'ᄀ-ᆵᄆ-ᄈᄉ-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ];
::NFKD;
::ConjoiningJamo-Latin;
::NFC;

View File

@ -1,16 +1,22 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Han_Latin.txt
# File: Hani_Latn.txt
# Generated from CLDR
#
# Warning: does not do round-trip mapping!!
# Convert CJK characters
::Han-Spacedhan();
藏 } \u0020? 文 →zàng;
重 } \u0020? 庆 →chóng;
沈 } \u0020? 阳 →shěn;
# Convert compounds; these are added individually, not derived from Unihan kMandarin.
# Note that Han-Spacedhan() has already been applied, so there should be spaces between Han characters.
藏 } \u0020? 文 →zàng;# 藏 is zàng (not cáng) if followed by 文 wén: 藏文 language Zàngwén = Tibetan
重 } \u0020? 庆 →chóng;# 重 is chóng (not zhòng) if followed by 庆 qìng: 重庆 city Chóngqìng
沈 } \u0020? 阳 →shěn;# 沈 is shěn (not chén) if followed by 阳 yáng: 沈阳 city Shěnyáng
# START AUTOGENERATED Han-Latin.xml ( Unihan kMandarin)
[呵锕阿𠼞𥥩𨉚]→ā;
[嗄]→á;
[啊]→a;
@ -1604,3 +1610,31 @@
[㝾佐唨左繓𠂇𥙀𦈛𧲭𨀨]→zuǒ;
[㑅㘀㘴㤰㭮䔘䟶作侳做唑坐岝岞座怍祚糳胙葃葄袏阼飵𠱯𡯨𡹥𥅁𥥏𥽿𦥬𧃘𨐳𨝨𪎲]→zuò;
[咗蓙]→zuo;
# END AUTOGENERATED Han-Latin.xml (Unihan kMandarin)
# fallbacks
## | yi ← i;
## | wu ← u;
## | bi ← b;
## | ci ← c;
## | di ← d;
## | fu ← f;
## | gu ← g;
## | he ← h;
## | ji ← j;
## | ku ← k;
## | li ← l;
## | mi ← m;
## | pi ← p;
## | qi ← q;
## | l ← r;
## | si ← s;
## | ti ← t;
## | f ← v;
## | wa ← w;
## | xi ← x;
## | yi ← y;
## | zi ← z;
# filter out the half-width hangul
# :: [^ᄒ-○] fullwidth-halfwidth ();
## :: (lower) ;

View File

@ -1,25 +1,38 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Simplified_Traditional.txt
# File: Hans_Hant.txt
# Generated from CLDR
#
# Copyright (c) 2005-2007,2010 Apple Inc., Unicode Inc.,
# and others. All Rights Reserved.
# For terms of use, see http://unicode.org/copyright.html#Exhibit1
# Convert between simplified and traditional Chinese
# UTRANS_FORWARD is from simplified to traditional
$SCDigit = [零一二三四五六七八九十百千万亿两] ;
$TCDigit = [零一二三四五六七八九十百千萬億兩] ;
#
# complex mappings for which there is no easy solution
# so we arbitrarily pick one
#
# does not handle the surnames 于 or 於
于飞↔于飛;
于归↔于歸;
于思↔于思;
单于↔單于;
鲜于↔鮮于;
#
姜片↔薑片;
姜末↔薑末;
生姜↔生薑;
鬼子姜↔鬼子薑;
姜↔姜;
姜←薑;
#
赤皮仑↔赤皮崙;
金仑溪↔金崙溪;
下仑↔下崙;
@ -33,6 +46,10 @@ $TCDigit = [零一二三四五六七八九十百千萬億兩] ;
仑↔侖;
曲↔曲;
曲←麯;
#
# Multiple TC to SC
#
#
茶余饭后↔茶餘飯後;
余怒未消↔餘怒未消;
余音绕梁↔餘音繞梁;
@ -69,22 +86,27 @@ $SCDigit { 余 → 餘;
余↔余;
馀→餘;
余←餘;
#
什么↔甚麼;
么↔麼;
么←麽;
么←么;
#
复数↔複數;
复分数↔複分數;
复杂↔複雜;
复制↔複製;
复↔復;
复←複;
#
#
了然↔瞭然;
了解↔瞭解;
了望↔瞭望;
明了↔明瞭;
了↔了;
了←瞭;
#
解铃系铃↔解鈴繫鈴;
系词↔繫詞;
系辞↔繫辭;
@ -97,6 +119,7 @@ $SCDigit { 余 → 餘;
系↔系;
系←係;
系←繫;
#
糊里糊涂↔糊裡糊塗;
稀里糊涂↔稀裡糊塗;
蒙在鼓里↔蒙在鼓裡;
@ -193,6 +216,10 @@ $SCDigit { 海里 → 海里;
里外↔裡外;
里←裡;
里←裏;
#
# 乾 appears in the most compounds
# 幹 is next
# then 干
干什么↔幹甚麼;
干部↔幹部;
干才↔幹才;
@ -234,6 +261,7 @@ $SCDigit { 海里 → 海里;
干↔乾;
干←幹;
干←干;
#
划时代↔劃時代;
划分↔劃分;
划分←畫分;
@ -269,6 +297,7 @@ $SCDigit { 海里 → 海里;
划↔划;
划←劃;
画↔畫;
#
$SCDigit { 斗 → 斗;
斗量↔斗量;
斗胆↔斗胆;
@ -289,6 +318,7 @@ $SCDigit { 斗 → 斗;
斗←斗;
斗←闘;
斗←鬭;
#
周 } $SCDigit → 週;
周末↔週末;
周刊↔週刊;
@ -300,6 +330,8 @@ $SCDigit { 斗 → 斗;
本周↔本週;
周↔周;
周←週;
#
#
松球↔松毬;
球花↔毬花;
球果↔毬果;
@ -316,11 +348,13 @@ $SCDigit { 斗 → 斗;
肉松↔肉鬆;
松↔松;
松←鬆;
#
果↔果;
果←菓;
老板↔老闆;
板↔板;
板←闆;
#
面条↔麵條;
面粉↔麵粉;
面包↔麵包;
@ -343,14 +377,17 @@ $SCDigit { 斗 → 斗;
白面↔白麵;
面↔面;
面←麵;
#
防御↔防禦;
御敌↔禦敵;
御寒↔禦寒;
御↔御;
御←禦;
#
腼腆↔靦腆;
腼←靦;
䩄→靦;
#
宫商角徵羽↔宮商角徵羽;
征伐↔征伐;
征服↔征服;
@ -361,18 +398,21 @@ $SCDigit { 斗 → 斗;
亲征↔親征;
征↔徵;
征←征;
#
台风↔颱風;
写字台↔寫字檯;
台↔台;
台←颱;
台←臺;
台←檯;
#
胡同↔衚衕;
胡子↔鬍子;
胡须↔鬍鬚;
胡↔胡;
胡←鬍;
胡←衚;
#
须根↔鬚根;
须鲸↔鬚鯨;
须眉↔鬚眉;
@ -380,6 +420,7 @@ $SCDigit { 斗 → 斗;
触须↔觸鬚;
须↔須;
须←鬚;
#
$SCDigit { 只 → 隻;
形单影只↔形單影隻;
只贺新禧←祇賀新禧;
@ -396,6 +437,7 @@ $SCDigit { 只 → 隻;
祇↔祇;
只↔只;
只←隻;
#
并发↔併發;
并拢↔併攏;
并入↔併入;
@ -405,10 +447,13 @@ $SCDigit { 只 → 隻;
吞并↔吞併;
并↔並;
并←併;
#
当↔當;
当←噹;
#
药↔藥;
药←葯;
#
布道↔佈道;
布景↔佈景;
布局↔佈局;
@ -421,12 +466,14 @@ $SCDigit { 只 → 隻;
宣布↔宣佈;
布↔布;
布←佈;
#
开天辟地↔開天闢地;
开辟↔開闢;
辟邪↔闢邪;
辟↔辟;
辟←闢;
人言藉藉↔人言藉藉;
#
借口↔藉口;
借故↔藉故;
借使↔藉使;
@ -439,20 +486,25 @@ $SCDigit { 只 → 隻;
借↔借;
借←藉;
藉→藉;
#
尽管↔儘管;
尽↔盡;
尽←儘;
#
叶韵↔叶韻;
叶↔葉;
叶←叶;
#
伙计↔夥計;
伙伴↔夥伴;
伙↔伙;
伙←夥;
#
家具↔傢具;
家伙↔傢伙;
家↔家;
家←傢;
#
奸夫↔姦夫;
奸妇↔姦婦;
奸情↔姦情;
@ -465,6 +517,7 @@ $SCDigit { 只 → 隻;
诱奸↔誘姦;
奸↔奸;
奸←姦;
#
历书↔曆書;
历法↔曆法;
公历↔公曆;
@ -483,18 +536,22 @@ $SCDigit { 只 → 隻;
万历↔萬曆;
历↔歷;
历←曆;
#
万俟↔万俟; # surname
#
气冲冲↔氣沖沖;
气焰↔氣燄;
焰←燄;
气↔氣;
气←气;
#
细致↔細緻;
精致↔精緻;
标致↔標緻;
别致↔別緻;
致↔致;
致←緻;
#
制版↔製版;
制成↔製成;
制品↔製品;
@ -511,6 +568,7 @@ $SCDigit { 只 → 隻;
预制↔預製;
制↔制;
制←製;
#
谷贱伤农↔穀賤傷農;
谷神星↔穀神星;
鬼谷子↔鬼谷子;
@ -535,6 +593,7 @@ $SCDigit { 只 → 隻;
谷↔谷;
谷←穀;
谷←榖;
#
后妃↔后妃;
后稷↔后稷;
后土↔后土;
@ -545,6 +604,7 @@ $SCDigit { 只 → 隻;
太后↔太后;
后↔後;
后←后;
#
地方志↔地方誌;
标志↔標誌;
墓志↔墓誌;
@ -554,19 +614,24 @@ $SCDigit { 只 → 隻;
杂志↔雜誌;
志↔志;
志←誌;
#
别扭↔彆扭;
别↔別;
别←彆;
#
汇报↔彙報;
词汇↔詞彙;
字汇↔字彙;
汇↔匯;
汇←彙;
#
辞↔辭;
辞←辞;
词↔詞;
#
机↔機;
机←机;
#
发廊↔髮廊;
发妻↔髮妻;
发型↔髮型;
@ -631,11 +696,13 @@ $SCDigit { 只 → 隻;
发↔發;
发←髮;
卷←捲;
#
人云亦云↔人云亦云;
不知所云↔不知所云;
云游↔雲遊;
云↔雲;
云←云;
#
# Compounds in which 丑 must stay 丑 instead of being turned into 醜 by the
# general 丑↔醜 rule. The left side is Simplified, the right side Traditional,
# so 净 (Simplified) belongs on the left and 淨 (Traditional) on the right,
# consistent with the 净↔淨 rule elsewhere in this file.
子丑寅卯↔子丑寅卯;
生旦净末丑↔生旦淨末丑;
丑时↔丑時;
@ -644,9 +711,11 @@ $SCDigit { 只 → 隻;
小丑↔小丑;
丑↔醜;
丑←丑;
#
萝卜↔蘿蔔;
卜↔卜;
卜←蔔;
#
冲茶↔沖茶;
冲淡↔沖淡;
冲服↔沖服;
@ -660,16 +729,20 @@ $SCDigit { 只 → 隻;
冲↔衝;
冲←沖;
冲←冲;
#
$SCDigit { 出 } 戏 → 齣;
出游↔出遊;
出↔出;
出←齣;
#
线↔線;
线←綫;
#
核实↔覈實;
核算↔覈算;
核↔核;
核←覈;
#
回路↔迴路;
回廊↔迴廊;
回游↔回遊;
@ -678,12 +751,15 @@ $SCDigit { 出 } 戏 → 齣;
回↔回;
回←迴;
回←廻;
#
冬冬↔鼕鼕;
冬↔冬;
冬←鼕;
#
咸菜↔鹹菜;
咸↔咸;
咸←鹹;
#
清心寡欲↔清心寡慾;
克欲修行↔克慾修行;
欲不可纵↔慾不可縱;
@ -705,6 +781,7 @@ $SCDigit { 出 } 戏 → 齣;
嗜欲↔嗜慾;
欲↔欲;
欲←慾;
#
准绳↔準繩;
准时↔準時;
准头↔準頭;
@ -719,6 +796,7 @@ $SCDigit { 出 } 戏 → 齣;
准↔准;
准←準;
标↔標;
#
注册↔註冊;
注销↔註銷;
注解↔註解;
@ -728,6 +806,9 @@ $SCDigit { 出 } 戏 → 齣;
加注↔加註;
注↔注;
注←註;
#
# variants
#
凶暴↔兇暴;
凶器↔兇器;
凶手↔兇手;
@ -736,23 +817,32 @@ $SCDigit { 出 } 戏 → 齣;
逞凶↔逞兇;
凶↔凶;
凶←兇;
#
扬↔揚;
扬←䬗;
飏↔颺;
#
宴↔宴;
宴←醼;
䜩↔讌;
#
咬↔咬;
咬←齩;
咬←䶧;
#
豆↔豆;
豆←荳;
#
韭↔韭;
韭←韮;
#
#
笺↔箋;
笺←牋;
#
团↔團;
团←糰;
#
卤鸡↔滷雞;
卤味↔滷味;
卤菜↔滷菜;
@ -760,36 +850,50 @@ $SCDigit { 出 } 戏 → 齣;
盐卤↔鹽滷;
卤↔鹵;
卤←滷;
#
呆↔呆;
呆←獃;
#
泛↔泛;
泛←氾;
泛←汎;
#
妫↔媯;
妫←嬀;
#
众↔眾;
众←衆;
#
钩↔鈎;
钩←鉤;
#
绱↔緔;
绱←鞝;
#
锐↔銳;
锐←鋭;
#
赝↔贋;
赝←贗;
赃↔贓;
赃←贜;
#
粗↔粗;
粗←麤;
#
关↔關;
关←関;
#
饥↔飢;
饥←饑;
#
款↔款;
款←欵;
胧↔朧;
#
蒙↔蒙;
蒙←懞;
#
骂↔罵;
骂←駡;
脏↔臟;
@ -819,18 +923,24 @@ $SCDigit { 出 } 戏 → 齣;
炮↔炮;
炮←砲;
炮←礮;
#
启↔啓;
启←啟;
#
茶几↔茶几;
几↔幾;
几←几;
#
德↔德;
德←悳;
#
悫↔愨;
悫←慤;
#
克↔克;
克←剋;
克←尅;
#
坛坛罐罐↔罈罈罐罐;
瓶瓶坛坛↔瓶瓶罈罈;
醋坛↔醋罈;
@ -840,6 +950,7 @@ $SCDigit { 出 } 戏 → 齣;
坛←壜;
坛←罎;
坛←罈;
#
升华↔昇華;
毕升↔畢昇;
高升↔高昇;
@ -847,19 +958,26 @@ $SCDigit { 出 } 戏 → 齣;
升↔升;
升←昇;
升←陞;
#
伪↔偽;
伪←僞;
#
收获→收穫;
获↔獲;
获←穫;
#
绦↔縧;
绦←絛;
#
绣↔繡;
绣←綉;
#
钵↔鉢;
钵←缽;
#
蜡↔蠟;
蜡←蜡;
#
采薪之忧↔采薪之憂;
兴高采烈↔興高采烈;
无精打采↔無精打采;
@ -875,6 +993,7 @@ $SCDigit { 出 } 戏 → 齣;
䌽→綵;
采↔採;
采←埰;
#
厕↔廁;
厕←厠;
捣↔搗;
@ -899,8 +1018,10 @@ $SCDigit { 出 } 戏 → 齣;
凼←氹;
床↔床;
床←牀;
# first form is more common
墙↔牆;
墙←墻;
#
奖↔獎;
奖←奬;
眦↔眥;
@ -927,8 +1048,10 @@ $SCDigit { 出 } 戏 → 齣;
酝←醞;
录↔錄;
录←録;
# 鏽 is more common
锈↔鏽;
锈←銹;
#
镢↔鐝;
䦆←钁;
阅↔閱;
@ -939,6 +1062,7 @@ $SCDigit { 出 } 戏 → 齣;
闲居↔閑居;
闲↔閒;
闲←閑;
#
游山玩水↔遊山玩水;
游伴↔遊伴;
游程↔遊程;
@ -998,6 +1122,7 @@ $SCDigit { 出 } 戏 → 齣;
夜游↔夜遊;
游↔游;
游←遊;
#
表蒙子↔錶蒙子;
表带↔錶帶;
表链↔錶鏈;
@ -1018,11 +1143,14 @@ $SCDigit { 出 } 戏 → 齣;
停表↔停錶;
表↔表;
表←錶;
#
症结↔癥結;
症↔症;
症←癥;
#
痴↔痴;
痴←癡;
#
白洋淀↔白洋淀;
荷花淀↔荷花淀;
水淀↔水淀;
@ -1030,22 +1158,26 @@ $SCDigit { 出 } 戏 → 齣;
东淀↔東淀;
淀↔澱;
淀←淀;
#
向导↔嚮導;
响应←嚮應;
向往↔嚮往;
向↔向;
向←嚮;
向←曏;
#
扎营↔紮營;
驻扎↔駐紮;
扎↔扎;
扎←紮;
#
占卜↔占卜;
占卦↔占卦;
占梦↔占夢;
占星↔占星;
占↔佔;
占←占;
#
托名↔託名;
托收↔託收;
信托↔信託;
@ -1061,14 +1193,18 @@ $SCDigit { 出 } 戏 → 齣;
托↔托;
托←託;
讬→託;
#
涌↔湧;
涌←涌;
#
累↔累;
累←纍;
#
困惫↔睏憊;
困乏↔睏乏;
困↔困;
困←睏;
#
左邻右舍↔左鄰右舍;
舍利↔舍利;
舍弟↔舍弟;
@ -1084,42 +1220,53 @@ $SCDigit { 出 } 戏 → 齣;
猪舍↔豬舍;
舍↔捨;
舍←舍;
#
杠↔槓;
杠←杠;
#
雇员↔僱員;
雇↔雇;
雇←僱;
#
刮倒↔颳倒;
刮↔刮;
刮←颳;
#
狸↔狸;
狸←貍;
#
跌交↔跌跤;
交↔交;
交←跤;
#
侄媳妇↔姪媳婦;
侄女↔姪女;
侄孙↔姪孫;
侄↔侄;
侄←姪;
#
勋↔勳;
勋←勛;
#
秋千↔鞦韆;
荡秋千↔盪鞦韆;
荡↔蕩;
荡←盪;
秋↔秋;
#
不寒而栗↔不寒而慄;
颤栗↔顫慄;
战栗↔戰慄;
栗↔栗;
栗←慄;
#
细嚼慢咽↔細嚼慢嚥;
狼吞虎咽↔狼吞虎嚥;
咽气↔嚥氣;
下咽↔下嚥;
咽↔咽;
咽←嚥;
#
吊民伐罪↔弔民伐罪;
形影相吊↔形影相弔;
提心吊胆↔提心弔膽;
@ -1128,32 +1275,43 @@ $SCDigit { 出 } 戏 → 齣;
吊唁↔弔唁;
吊↔吊;
吊←弔;
#
英寸↔英吋;
#
方腊↔方腊;
腊↔臘;
#
乡愿↔鄉愿;
愿↔願;
愿←愿;
#
古迹↔古蹟;
史迹↔史蹟;
迹↔跡;
迹←蹟;
#
净↔淨;
净←凈;
#
侥幸↔僥倖;
侥↔僥;
幸↔幸;
幸←倖;
#
蚝↔蠔;
蚝←蚝;
#
柜柳↔柜柳; # ju3liu3
柜↔櫃; # gui4
#
拉纤↔拉縴;
纤夫↔縴夫;
纤路↔縴路;
纤绳↔縴繩;
纤↔纖; # reading xian1
纤←縴; # reading qian4
#
# separate readings for po1 or po4 from pu2
厚朴↔厚朴;
朴刀↔朴刀; # po1dao1
朴硝↔朴硝; # po4xiao1
@ -1533,6 +1691,9 @@ $SCDigit { 出 } 戏 → 齣;
镌↔鐫;
镌←鎸;
于↔於;
#
# one-to-one mappings
#
亘↔亙;
铝↔鋁;
极↔極;
@ -1611,6 +1772,7 @@ $SCDigit { 出 } 戏 → 齣;
㑩↔儸;
傩↔儺;
俨↔儼;
# Preserve 丰 for traditional in some cases
丰标不凡→丰標不凡;
丰}[度情茸姿神采]→丰;
丰仪→丰儀;
@ -3985,5 +4147,7 @@ $SCDigit { 出 } 戏 → 齣;
龚↔龔;
龛↔龕;
龟↔龜;
# map some punctuation too
“↔「;
”↔」;

View File

@ -1,15 +1,36 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Hebrew_Latin.txt
# File: Hebr_Latn.txt
# Generated from CLDR
#
# Transliteration table for Hebrew
# Based on the UNGEGN table at:
# http://www.eki.ee/wgrs/rom1_he.pdf
#
# Exceptions:
# - Accents are added to disambiguate letters
# - Combinations of dagesh, shin/sin dot that produce different
# letters are not yet encoded.
#
# To test, open:
# http://www.ibm.com/software/globalization/icu/demo/transform
# Click Edit, paste in this file, Save As hebrew-latin/XXX
# (where XXX is a username)
# Now go back to the main window, and try it out.
# Use hebrew-latin/XXX for Output 1, and (Inverse) for Output 2
# Paste in hebrew text in Input, and hit Transliterate.
#
# For more information, see:
# http://icu.sourceforge.net/userguide/Transform.html
# Global filter: Hebrew-script characters, characters with a non-zero canonical
# combining class, and the explicitly listed points and symbols, with \u05BD
# excluded.
:: [[:Hebrew:] [:^ccc=0:] [\u05B0-\u05B9\u05BB-\u05BC\u05C1-\u05C2ℵ-ℸ\u0304\u05BF] - [\u05BD]] ;
# Forward direction applies NFKD here; the inverse direction applies NFC.
:: nfkd (nfc) ;
# Look-ahead context: any number of marks followed by a letter
# (used by rules of the form  X ↔ y } $letterAfter).
$letterAfter = [:M:]* [:L:] ;
# move longer items here to avoid masking
ח ↔ h\u0331 ;
צ ↔ z\u0331 } $letterAfter;
ץ ↔ z\u0331 ;
@ -43,6 +64,7 @@ $letterAfter = [:M:]* [:L:] ;
\u05BC ↔ \u0307 ; # dagesh just goes to overdot for now
\u05C1 ↔ \u030C ; # shin dot -→ sh
\u05C2 ↔ \u0302 ; # sin dot -→ s
# points
$above = [^[:ccc=0:][:ccc=230:]]*;
\u05B2 → à ;
\u05B2 $1← a ($above) \u0300;
@ -62,6 +84,7 @@ $above = [^[:ccc=0:][:ccc=230:]]*;
\u05B6 ↔ e ;
\u05B3 ↔ o ;
\u05BF ↔ \u0304 ;
# fallbacks
ק ← c ;
פ ← f } $letterAfter;
ף ← f ;
@ -71,3 +94,4 @@ $above = [^[:ccc=0:][:ccc=230:]]*;
:: (lower);
:: nfc (nfd) ;
:: ([[:Latin:] [:^ccc=0:] [ʻ-ʼ\u0300-\u0302\u0307\u030C\u0327\u0331\u0340-\u0341 \u0304 ]]);

View File

@ -0,0 +1,188 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Hira_Kana.txt
# Generated from CLDR
#
# note: a global filter is more efficient, but MUST include all source chars
:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;
:: NFKC ();
# Hiragana-Katakana
# This is largely a one-to-one mapping, but it has a
# few kinks:
# 1. The Katakana va/vi/ve/vo (30F7-30FA) have no
# Hiragana equivalents. We use Hiragana wa/wi/we/wo
# (308F-3092) with a voicing mark (3099), which is
# semantically equivalent. However, this is a non-
# roundtripping transformation.
# 2. The Katakana small ka/ke (30F5,30F6) have no
# Hiragana equivalents. We convert them to normal
# Hiragana ka/ke (304B,3051). This is a one-way
# information-losing transformation and precludes
# round-tripping of 30F5 and 30F6.
# 3. The combining marks 3099-309C are in the Hiragana
# block, but they apply to Katakana as well, so we
# leave them untouched.
# 4. The Katakana prolonged sound mark 30FC doubles the
# preceding vowel. This is a one-way information-
# losing transformation from Katakana to Hiragana.
# 5. The Katakana middle dot separates words in foreign
# expressions; we leave this unmodified.
# The above points preclude successful round-trip
# transformations of arbitrary input text. However,
# they provide naturalistic results that should conform
# to user expectations.
# Combining equivalents va/vi/ve/vo
わ\u3099 ↔ ヷ;
ゐ\u3099 ↔ ヸ;
ゑ\u3099 ↔ ヹ;
を\u3099 ↔ ヺ;
# One-to-one mappings, main block
# 3041:3094 ↔ 30A1:30F4
# 309D,E ↔ 30FD,E
ぁ ↔ ァ;
あ ↔ ア;
ぃ ↔ ィ;
い ↔ イ;
ぅ ↔ ゥ;
う ↔ ウ;
ぇ ↔ ェ;
え ↔ エ;
ぉ ↔ ォ;
お ↔ オ;
か ↔ カ;
が ↔ ガ;
き ↔ キ;
ぎ ↔ ギ;
く ↔ ク;
ぐ ↔ グ;
け ↔ ケ;
げ ↔ ゲ;
こ ↔ コ;
ご ↔ ゴ;
さ ↔ サ;
ざ ↔ ザ;
し ↔ シ;
じ ↔ ジ;
す ↔ ス;
ず ↔ ズ;
せ ↔ セ;
ぜ ↔ ゼ;
そ ↔ ソ;
ぞ ↔ ゾ;
た ↔ タ;
だ ↔ ダ;
ち ↔ チ;
ぢ ↔ ヂ;
っ ↔ ッ;
つ ↔ ツ;
づ ↔ ヅ;
て ↔ テ;
で ↔ デ;
と ↔ ト;
ど ↔ ド;
# na-row (na, ni, nu, ne, no): one-to-one Hiragana ↔ Katakana pairs.
# The target of the last rule had been lost, leaving an empty right-hand side;
# の (U+306E) pairs with ノ (U+30CE).
な ↔ ナ;
に ↔ ニ;
ぬ ↔ ヌ;
ね ↔ ネ;
の ↔ ノ;
は ↔ ハ;
ば ↔ バ;
ぱ ↔ パ;
ひ ↔ ヒ;
び ↔ ビ;
ぴ ↔ ピ;
ふ ↔ フ;
ぶ ↔ ブ;
ぷ ↔ プ;
へ ↔ ヘ;
べ ↔ ベ;
ぺ ↔ ペ;
ほ ↔ ホ;
ぼ ↔ ボ;
ぽ ↔ ポ;
ま ↔ マ;
み ↔ ミ;
む ↔ ム;
め ↔ メ;
も ↔ モ;
ゃ ↔ ャ;
や ↔ ヤ;
ゅ ↔ ュ;
ゆ ↔ ユ;
ょ ↔ ョ;
よ ↔ ヨ;
ら ↔ ラ;
り ↔ リ;
る ↔ ル;
れ ↔ レ;
ろ ↔ ロ;
ゎ ↔ ヮ;
わ ↔ ワ;
ゐ ↔ ヰ;
ゑ ↔ ヱ;
を ↔ ヲ;
ん ↔ ン;
ゔ ↔ ヴ;
ゝ ↔ ヽ;
ゞ ↔ ヾ;
# One-way Katakana-Hiragana xform of small K ka/ke to
# normal H ka/ke.
か ← ヵ;
け ← ヶ;
# Katakana followed by a prolonged sound mark 30FC has
# its final vowel doubled. This is a Katakana-Hiragana
# one-way information-losing transformation. We
# include the small Katakana (e.g., small A 3041) and
# do not distinguish them from their large
# counterparts. It doesn't make sense to double a
# small counterpart vowel as a small Hiragana vowel, so
# we don't do so. In natural text this should never
# occur anyway. If a 30FC is seen without a preceding
# vowel sound (e.g., after n 30F3) we do not change it.
### $long = ー;
# The following categories are Hiragana, not Katakana
# as might be expected, since by the time we get to the
# 30FC, the preceding character will have already been
# transformed to Hiragana.
# {The following mechanically generated from the
# Unicode 3.0 data:}
$xa = [ \
ぁ あ か が さ ざ \
た だ な は ば ぱ \
ま ゃ や ら ゎ わ \
];
$xi = [ \
ぃ い き ぎ し じ \
ち ぢ に ひ び ぴ \
み り ゐ \
];
$xu = [ \
ぅ う く ぐ す ず \
っ つ づ ぬ ふ ぶ \
ぷ む ゅ ゆ る ゔ \
];
$xe = [ \
ぇ え け げ せ ぜ \
て で ね へ べ ぺ \
め れ ゑ \
];
$xo = [ \
ぉ お こ ご そ ぞ \
と ど の ほ ぼ ぽ \
も ょ よ ろ を \
];
あ ← $xa {ー};
い ← $xi {ー};
う ← $xu {ー};
え ← $xe {ー};
お ← $xo {ー};
:: (NFKC) ;
# note: a global filter is more efficient, but MUST include all source chars!!
:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);
# eof

View File

@ -1,12 +1,13 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Hiragana_Latin.txt
# File: Hira_Latn.txt
# Generated from CLDR
#
:: [ぁ-ゔ\u3099ゝ-ゞガギグゲゴザジズゼゾダヂヅデドバビブベボヴヷ-ヺーヾ] ;
:: NFD ;
:: Hiragana-Katakana;
@ -14,3 +15,4 @@
:: NFC ;
:: (Lower) ;
:: ([',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]) ;

View File

@ -1,135 +0,0 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Hiragana_Katakana.txt
# Generated from CLDR
#
:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]] ;
:: NFKC ();
わ\u3099 ↔ ヷ;
ゐ\u3099 ↔ ヸ;
ゑ\u3099 ↔ ヹ;
を\u3099 ↔ ヺ;
ぁ ↔ ァ;
あ ↔ ア;
ぃ ↔ ィ;
い ↔ イ;
ぅ ↔ ゥ;
う ↔ ウ;
ぇ ↔ ェ;
え ↔ エ;
ぉ ↔ ォ;
お ↔ オ;
か ↔ カ;
が ↔ ガ;
き ↔ キ;
ぎ ↔ ギ;
く ↔ ク;
ぐ ↔ グ;
け ↔ ケ;
げ ↔ ゲ;
こ ↔ コ;
ご ↔ ゴ;
さ ↔ サ;
ざ ↔ ザ;
し ↔ シ;
じ ↔ ジ;
す ↔ ス;
ず ↔ ズ;
せ ↔ セ;
ぜ ↔ ゼ;
そ ↔ ソ;
ぞ ↔ ゾ;
た ↔ タ;
だ ↔ ダ;
ち ↔ チ;
ぢ ↔ ヂ;
っ ↔ ッ;
つ ↔ ツ;
づ ↔ ヅ;
て ↔ テ;
で ↔ デ;
と ↔ ト;
ど ↔ ド;
# na-row (na, ni, nu, ne, no): one-to-one Hiragana ↔ Katakana pairs.
# The target of the last rule had been lost, leaving an empty right-hand side;
# の (U+306E) pairs with ノ (U+30CE).
な ↔ ナ;
に ↔ ニ;
ぬ ↔ ヌ;
ね ↔ ネ;
の ↔ ノ;
は ↔ ハ;
ば ↔ バ;
ぱ ↔ パ;
ひ ↔ ヒ;
び ↔ ビ;
ぴ ↔ ピ;
ふ ↔ フ;
ぶ ↔ ブ;
ぷ ↔ プ;
へ ↔ ヘ;
べ ↔ ベ;
ぺ ↔ ペ;
ほ ↔ ホ;
ぼ ↔ ボ;
ぽ ↔ ポ;
ま ↔ マ;
み ↔ ミ;
む ↔ ム;
め ↔ メ;
も ↔ モ;
ゃ ↔ ャ;
や ↔ ヤ;
ゅ ↔ ュ;
ゆ ↔ ユ;
ょ ↔ ョ;
よ ↔ ヨ;
ら ↔ ラ;
り ↔ リ;
る ↔ ル;
れ ↔ レ;
ろ ↔ ロ;
ゎ ↔ ヮ;
わ ↔ ワ;
ゐ ↔ ヰ;
ゑ ↔ ヱ;
を ↔ ヲ;
ん ↔ ン;
ゔ ↔ ヴ;
ゝ ↔ ヽ;
ゞ ↔ ヾ;
か ← ヵ;
け ← ヶ;
$xa = [ \
ぁ あ か が さ ざ \
た だ な は ば ぱ \
ま ゃ や ら ゎ わ \
];
$xi = [ \
ぃ い き ぎ し じ \
ち ぢ に ひ び ぴ \
み り ゐ \
];
$xu = [ \
ぅ う く ぐ す ず \
っ つ づ ぬ ふ ぶ \
ぷ む ゅ ゆ る ゔ \
];
$xe = [ \
ぇ え け げ せ ぜ \
て で ね へ べ ぺ \
め れ ゑ \
];
$xo = [ \
ぉ お こ ご そ ぞ \
と ど の ほ ぼ ぽ \
も ょ よ ろ を \
];
あ ← $xa {ー};
い ← $xi {ー};
う ← $xu {ー};
え ← $xe {ー};
お ← $xo {ー};
:: (NFKC) ;
:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ー[:Hiragana:] [:Katakana:] [:nonspacing mark:]]);

View File

@ -1,177 +0,0 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: IPA_XSampa.txt
# Generated from CLDR
#
$t = '_'; # X-SAMPA representation of IPA tie bar.
::NFD;
ʯ ↔ 'z`_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL
ǁ ↔ '|\|\'; # LATIN LETTER LATERAL CLICK
ʄ ↔ 'J\_<'; # LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK
ʛ ↔ 'G\_<'; # LATIN LETTER SMALL CAPITAL G WITH HOOK
ʮ ↔ 'z_w='; # LATIN SMALL LETTER TURNED H WITH FISHHOOK
\u1DC4 ↔ '_H_T'; # COMBINING MACRON-ACUTE
\u1DC5 ↔ '_B_L'; # COMBINING GRAVE-MACRON
\u1DC8 ↔ '_R_F'; # COMBINING GRAVE-ACUTE-GRAVE
ɓ ↔ 'b_<'; # LATIN SMALL LETTER B WITH HOOK
ɗ ↔ 'd_<'; # LATIN SMALL LETTER D WITH HOOK
ɠ ↔ 'g_<'; # LATIN SMALL LETTER G WITH HOOK
ɻ ↔ 'r\`'; # LATIN SMALL LETTER TURNED R WITH HOOK
↗ ↔ '<R>'; # NORTH EAST ARROW
↘ ↔ '<F>'; # SOUTH EAST ARROW
ħ ↔ 'X\'; # LATIN SMALL LETTER H WITH STROKE
ǀ ↔ '|\'; # LATIN LETTER DENTAL CLICK
ǂ ↔ '=\'; # LATIN LETTER ALVEOLAR CLICK
ǃ ↔ '!\'; # LATIN LETTER RETROFLEX CLICK
ɕ ↔ 's\'; # LATIN SMALL LETTER C WITH CURL
ɖ ↔ 'd`'; # LATIN SMALL LETTER D WITH TAIL
ɘ ↔ '@\'; # LATIN SMALL LETTER REVERSED E
ɚ ↔ '@`'; # LATIN SMALL LETTER SCHWA WITH HOOK
ɝ ↔ '3`'; # LATIN SMALL LETTER REVERSED OPEN E WITH HOOK
ɞ ↔ '3\'; # LATIN SMALL LETTER CLOSED REVERSED OPEN E
ɟ ↔ 'J\'; # LATIN SMALL LETTER DOTLESS J WITH STROKE
ɢ ↔ 'G\'; # LATIN LETTER SMALL CAPITAL G
ɦ ↔ 'h\'; # LATIN SMALL LETTER H WITH HOOK
ɧ ↔ 'x\'; # LATIN SMALL LETTER HENG WITH HOOK
ɭ ↔ 'l`'; # LATIN SMALL LETTER L WITH RETROFLEX HOOK
ɮ ↔ 'K\'; # LATIN SMALL LETTER LEZH
ɰ ↔ 'M\'; # LATIN SMALL LETTER TURNED M WITH LONG LEG
ɳ ↔ 'n`'; # LATIN SMALL LETTER N WITH RETROFLEX HOOK
ɴ ↔ 'N\'; # LATIN LETTER SMALL CAPITAL N
ɸ ↔ 'p\'; # LATIN SMALL LETTER PHI
ɹ ↔ 'r\'; # LATIN SMALL LETTER TURNED R
ɺ ↔ 'l\'; # LATIN SMALL LETTER TURNED R WITH LONG LEG
ɽ ↔ 'r`'; # LATIN SMALL LETTER R WITH TAIL
ʀ ↔ 'R\'; # LATIN LETTER SMALL CAPITAL R
ʂ ↔ 's`'; # LATIN SMALL LETTER S WITH HOOK
ʈ ↔ 't`'; # LATIN SMALL LETTER T WITH RETROFLEX HOOK
ʐ ↔ 'z`'; # LATIN SMALL LETTER Z WITH RETROFLEX HOOK
ʑ ↔ 'z\'; # LATIN SMALL LETTER Z WITH CURL
ʕ ↔ '?\'; # LATIN LETTER PHARYNGEAL VOICED FRICATIVE
ʘ ↔ 'O\'; # LATIN LETTER BILABIAL CLICK
ʙ ↔ 'B\'; # LATIN LETTER SMALL CAPITAL B
ʜ ↔ 'H\'; # LATIN LETTER SMALL CAPITAL H
ʝ ↔ 'j\'; # LATIN SMALL LETTER J WITH CROSSED-TAIL
ʟ ↔ 'L\'; # LATIN LETTER SMALL CAPITAL L
ʡ ↔ '>\'; # LATIN LETTER GLOTTAL STOP WITH STROKE
ʢ ↔ '<\'; # LATIN LETTER REVERSED GLOTTAL STOP WITH STROKE
ʰ ↔ '_h'; # MODIFIER LETTER SMALL H
ʷ ↔ '_w'; # MODIFIER LETTER SMALL W
ʼ ↔ '_>'; # MODIFIER LETTER APOSTROPHE
ˆ ↔ '_\'; # MODIFIER LETTER CIRCUMFLEX ACCENT
ˇ ↔ '_/'; # CARON
ˑ ↔ ':\'; # MODIFIER LETTER HALF TRIANGULAR COLON
ˠ ↔ '_G'; # MODIFIER LETTER SMALL GAMMA
ˡ ↔ '_l'; # MODIFIER LETTER SMALL L
ˤ ↔ '_?\'; # MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
\u0300 ↔ '_L'; # COMBINING GRAVE ACCENT
\u0301 ↔ '_H'; # COMBINING ACUTE ACCENT
\u0302 ↔ '_F'; # COMBINING CIRCUMFLEX ACCENT
\u0304 ↔ '_M'; # COMBINING MACRON
\u0306 ↔ '_X'; # COMBINING BREVE
\u0308 ↔ '_"'; # COMBINING DIAERESIS
\u030B ↔ '_T'; # COMBINING DOUBLE ACUTE ACCENT
\u030C ↔ '_R'; # COMBINING CARON
\u030F ↔ '_B'; # COMBINING DOUBLE GRAVE ACCENT
\u0318 ↔ '_A'; # COMBINING LEFT TACK BELOW
\u0319 ↔ '_q'; # COMBINING RIGHT TACK BELOW
\u031A ↔ '_}'; # COMBINING LEFT ANGLE ABOVE
\u031C ↔ '_c'; # COMBINING LEFT HALF RING BELOW
\u031D ↔ '_r'; # COMBINING UP TACK BELOW
\u031E ↔ '_o'; # COMBINING DOWN TACK BELOW
\u031F ↔ '_+'; # COMBINING PLUS SIGN BELOW
\u0320 ↔ '_-'; # COMBINING MINUS SIGN BELOW
\u0324 ↔ '_t'; # COMBINING DIAERESIS BELOW
\u0325 ↔ '_0'; # COMBINING RING BELOW
\u032A ↔ '_d'; # COMBINING BRIDGE BELOW
\u032C ↔ '_v'; # COMBINING CARON BELOW
\u032F ↔ '_^'; # COMBINING INVERTED BREVE BELOW
\u0330 ↔ '_k'; # COMBINING TILDE BELOW
\u0334 ↔ '_e'; # COMBINING TILDE OVERLAY
\u0339 ↔ '_O'; # COMBINING RIGHT HALF RING BELOW
\u033A ↔ '_a'; # COMBINING INVERTED BRIDGE BELOW
\u033B ↔ '_m'; # COMBINING SQUARE BELOW
\u033C ↔ '_N'; # COMBINING SEAGULL BELOW
\u033D ↔ '_x'; # COMBINING X ABOVE
ᵻ ↔ 'I\'; # LATIN SMALL CAPITAL LETTER I WITH STROKE
ᵿ ↔ 'U\'; # LATIN SMALL CAPITAL LETTER U WITH STROKE
ⁿ ↔ '_n'; # MODIFIER LETTER LATIN SMALL LETTER N
# Reverse-only rules (←): these apply only when converting X-SAMPA to IPA;
# they are not used in the IPA → X-SAMPA direction.
ʋ ← 'v\'; # LATIN SMALL LETTER V WITH HOOK
ʲ ← '_j'; # MODIFIER LETTER SMALL J
\u0303 ← '_~'; # COMBINING TILDE
\u0329 ← '_='; # COMBINING VERTICAL LINE BELOW
c\u0327 ↔ C; # LATIN SMALL LETTER C WITH CEDILLA (decomposed)
æ ↔ '{'; # LATIN SMALL LETTER AE
ð ↔ D; # LATIN SMALL LETTER ETH
ø ↔ 2; # LATIN SMALL LETTER O WITH STROKE
ŋ ↔ N; # LATIN SMALL LETTER ENG
œ ↔ 9; # LATIN SMALL LIGATURE OE
ɐ ↔ 6; # LATIN SMALL LETTER TURNED A
ɑ ↔ A; # LATIN SMALL LETTER ALPHA
ɒ ↔ Q; # LATIN SMALL LETTER TURNED ALPHA
ɔ ↔ O; # LATIN SMALL LETTER OPEN O
ə ↔ '@'; # LATIN SMALL LETTER SCHWA
ɛ ↔ E; # LATIN SMALL LETTER OPEN E
ɜ ↔ 3; # LATIN SMALL LETTER REVERSED OPEN E
ɡ ↔ g; # LATIN SMALL LETTER SCRIPT G
ɣ ↔ G; # LATIN SMALL LETTER GAMMA
ɤ ↔ 7; # LATIN SMALL LETTER RAMS HORN
ɥ ↔ H; # LATIN SMALL LETTER TURNED H
ɨ ↔ 1; # LATIN SMALL LETTER I WITH STROKE
ɪ ↔ I; # LATIN LETTER SMALL CAPITAL I
ɫ ↔ 5; # LATIN SMALL LETTER L WITH MIDDLE TILDE
ɬ ↔ K; # LATIN SMALL LETTER L WITH BELT
ɯ ↔ M; # LATIN SMALL LETTER TURNED M
ɱ ↔ F; # LATIN SMALL LETTER M WITH HOOK
ɲ ↔ J; # LATIN SMALL LETTER N WITH LEFT HOOK
ɵ ↔ 8; # LATIN SMALL LETTER BARRED O
ɶ ↔ '&'; # LATIN LETTER SMALL CAPITAL OE
ɾ ↔ 4; # LATIN SMALL LETTER R WITH FISHHOOK
ʁ ↔ R; # LATIN LETTER SMALL CAPITAL INVERTED R
ʃ ↔ S; # LATIN SMALL LETTER ESH
ʉ ↔ '}'; # LATIN SMALL LETTER U BAR
ʊ ↔ U; # LATIN SMALL LETTER UPSILON
ʋ ↔ P; # LATIN SMALL LETTER V WITH HOOK
ʌ ↔ V; # LATIN SMALL LETTER TURNED V
ʍ ↔ W; # LATIN SMALL LETTER TURNED W
ʎ ↔ L; # LATIN SMALL LETTER TURNED Y
ʏ ↔ Y; # LATIN LETTER SMALL CAPITAL Y
ʒ ↔ Z; # LATIN SMALL LETTER EZH
ʔ ↔ '?'; # LATIN LETTER GLOTTAL STOP
ʲ ↔ \'; # MODIFIER LETTER SMALL J
ˈ ↔ '"'; # MODIFIER LETTER VERTICAL LINE
ˌ ↔ '%'; # MODIFIER LETTER LOW VERTICAL LINE
ː ↔ ':'; # MODIFIER LETTER TRIANGULAR COLON
˞ ↔ '`'; # MODIFIER LETTER RHOTIC HOOK
\u0303 ↔ '~'; # COMBINING TILDE
\u0329 ↔ '='; # COMBINING VERTICAL LINE BELOW
\u0361 ↔ $t; # COMBINING DOUBLE INVERTED BREVE
β ↔ B; # GREEK SMALL LETTER BETA
θ ↔ T; # GREEK SMALL LETTER THETA
χ ↔ X; # GREEK SMALL LETTER CHI
↑ ↔ '^'; # UPWARDS ARROW
↓ ↔ '!'; # DOWNWARDS ARROW
# Forward-only rules (→): these apply only when converting IPA to X-SAMPA;
# the reverse direction never produces the characters on the left-hand side.
φ → 'p\'; # GREEK SMALL LETTER PHI
ɩ → I; # LATIN SMALL LETTER IOTA
ɷ → U; # LATIN SMALL LETTER CLOSED OMEGA
ɼ → 'r_r'; # LATIN SMALL LETTER R WITH LONG LEG
ɿ → 'z='; # LATIN SMALL LETTER REVERSED R WITH FISHHOOK
ʅ → 'z`='; # LATIN SMALL LETTER SQUAT REVERSED ESH
ʆ → S\'; # LATIN SMALL LETTER ESH WITH CURL
ʇ → '|\' ; # LATIN SMALL LETTER TURNED T
ʓ → Z\'; # LATIN SMALL LETTER EZH WITH CURL
ʖ → '|\|\'; # LATIN LETTER INVERTED GLOTTAL STOP
ʗ → '!\'; # LATIN LETTER STRETCHED C
ʚ → '3\'; # LATIN SMALL LETTER CLOSED OPEN E
ʠ → 'G\_<_0'; # LATIN SMALL LETTER Q WITH HOOK
# Digraph letters are written as two X-SAMPA symbols joined by the tie bar $t.
ʣ → d $t z; # LATIN SMALL LETTER DZ DIGRAPH
ʤ → d $t Z; # LATIN SMALL LETTER DEZH DIGRAPH
ʥ → d $t 'z\'; # LATIN SMALL LETTER DZ DIGRAPH WITH CURL
ʦ → t $t s; # LATIN SMALL LETTER TS DIGRAPH
ʧ → t $t S; # LATIN SMALL LETTER TESH DIGRAPH
ʨ → t $t 's\'; # LATIN SMALL LETTER TC DIGRAPH WITH CURL
::NFC;

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Bengali.txt
# Generated from CLDR
#
# InterIndic-Bengali
#:: NFD (NFC) ;
\uE001→\u0981; # SIGN CANDRABINDU
\uE002→ং; # SIGN ANUSVARA
\uE003→ঃ; # SIGN VISARGA
@ -136,3 +139,6 @@
\uE083→ৎ; # Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ১;
# :: NFC (NFD) ;
# eof

View File

@ -1,12 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Devanagari.txt
# Generated from CLDR
#
# InterIndic-Devanagari
#:: NFD (NFC) ;
#Rules for Decomposed characters
\uE028\uE03C → ऩ; #\uE029
\uE030\uE03C → ऱ; #\uE031
\uE033\uE03C → ऴ; #\uE034
@ -18,6 +22,7 @@
\uE022\uE03C → ढ़; #\uE05D LETTER RHA (pronounced RRHA)
\uE02B\uE03C → फ़; #\uE05E LETTER FA
\uE02F\uE03C → य़; #\uE05F LETTER YYA
#Decomposed compatibility transliterations
\uE012\uE057→औ; # FALLBACK FOR TAMIL AU
0 → ; # FALLBACK FOR TAMIL
1 → १;
@ -73,9 +78,11 @@
\uE02F → य; # LETTER YA
\uE030 → र; # LETTER RA
\uE031 → ऱ; # LETTER RRA (Eyelash RA for Southern scripts)
#\uE031 → र;
\uE032 → ल; # LETTER LA
\uE033 → ळ; # LETTER LLA
\uE034 → ऴ; # LETTER LLLA (LLLA for Southern scripts)
#\uE034 → ळ;
\uE035 → व; # LETTER VA
\uE036 → श; # LETTER SHA
\uE037 → ष; # LETTER SSA
@ -148,3 +155,6 @@
\uE081→व; # FALLBACK FOR ORIYA LETTER WA
\uE082→; # Devanagari Glottal Sign
\uE083→त\u094D; # Bengali Khanda-ta
# :: NFC;
# eof

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Gujarati.txt
# Generated from CLDR
#
# InterIndic-Gujarati
#:: NFD (NFC) ;
\uE001→\u0A81; # SIGN CANDRABINDU
\uE002→\u0A82; # SIGN ANUSVARA
\uE003→; # SIGN VISARGA
@ -136,3 +139,7 @@
\uE083→ત\u0ACD; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ૧;
#\uE080→; # UNMAPPED InterIndic-Gujarati: ISSHAR
# :: NFC (NFD) ;
# eof

View File

@ -1,16 +1,22 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Gurmukhi.txt
# Generated from CLDR
#
# InterIndic-Gurmukhi
#:: NFD (NFC) ;
$vowel = [ਅ-ਔ ਾ-\u0A4D];
$consonant = [ਕ-ਹ];
\uE001→\u0A01; # SIGN CHANDRABINDU
#rules for BINDI
# Anusvara is equivalent to BINDI when preceded by a vowel
$vowel{\uE002→\u0A02; # SIGN ANUSVARA (\u0A02 = SIGN BINDI)
# else is equivalent to TIPPI
$consonant{\uE002→\u0A70; # SIGN TIPPI
\uE002→\u0A02;
\uE003→; # FALLBACK BLOW AWAY SIGN VISARGA
@ -140,3 +146,6 @@ $consonant{\uE002→\u0A70; # SIGN TIPPI
\uE083→ਤ\u0A4D; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ;
# :: NFC (NFD) ;
# eof

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Kannada.txt
# Generated from CLDR
#
# InterIndic-Kannada
#:: NFD (NFC) ;
\uE033\uE03C→ೞ; # LETTER FA
\uE001→; # REMAP (indicExceptions.txt): \u0C81→ = SIGN CANDRABINDU→SIGN ANUSVARA
\uE002→; # SIGN ANUSVARA
@ -138,3 +141,6 @@
\uE083→ತ\u0CCD; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ೧;
# :: NFC (NFD) ;
# eof

View File

@ -1,15 +1,21 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Latin.txt
# Generated from CLDR
#
# InterIndic-Latin
#\uE000 reserved
#consonants
$chandrabindu=\uE001;
$anusvara=\uE002;
$visarga=\uE003;
#\uE004 reserved
# w<vowel> represents the stand-alone form
$wa=\uE005;
$waa=\uE006;
$wi=\uE007;
@ -64,8 +70,11 @@ $sha=\uE036;
$ssa=\uE037;
$sa=\uE038;
$ha=\uE039;
#\u093A Reserved
#\u093B Reserved
$nukta=\uE03C;
$avagraha=\uE03D; # SIGN AVAGRAHA
# <vowel> represents the dependent form
$aa=\uE03E;
$i=\uE03F;
$ii=\uE040;
@ -82,6 +91,8 @@ $so=\uE04A; # VOWEL SIGN SHORT O
$o=\uE04B; # ो
$au=\uE04C;
$virama=\uE04D;
# \u094E Reserved
# \u094F Reserved
$om=\uE050; # OM
\uE051→; # UNMAPPED STRESS SIGN UDATTA
\uE052→; # UNMAPPED STRESS SIGN ANUDATTA
@ -90,6 +101,7 @@ $om=\uE050; # OM
$lm = \uE055;# Telugu Length Mark
$ailm=\uE056;# AI Length Mark
$aulm=\uE057;# AU Length Mark
#urdu compatibility forms
$uka=\uE058;
$ukha=\uE059;
$ugha=\uE05A;
@ -114,14 +126,21 @@ $six=\uE06C; # DIGIT SIX
$seven=\uE06D; # DIGIT SEVEN
$eight=\uE06E; # DIGIT EIGHT
$nine=\uE06F; # DIGIT NINE
# Glottal stop
$dgs=\uE082;
#Khanda-ta
$kta=\uE083;
$depVowelAbove=[\uE03E-\uE040\uE045-\uE04C];
$depVowelBelow=[\uE041-\uE044];
# $x was originally called '§'; $z was '%'
$x=[$aa$ai$au$ii$i$uu$u$rrh$rh$lh$llh$e$o$se$ce$so$co];
$z=[bcdfghjklmnpqrstvwxyz];
$vowels=[aeiour\u0304\u0325\u0306];
$forceIndependentMatra = [^[[:L:][\u0300-\u034C]]];
######################################################################
# convert from Native letters to Latin letters
######################################################################
#transliterations for anusvara
$anusvara} [$ka$kha$ga$gha$nga] → n\u0307;
$anusvara} [$ca$cha$ja$jha$nya] → n\u0304;
$anusvara} [$tta$ttha$dda$ddha$nna] → n\u0323;
@ -129,6 +148,7 @@ $anusvara} [$ta$tha$da$dha$na] → n;
$anusvara} [$pa$pha$ba$bha$ma] → m;
$anusvara} [$ya$ra$lla$la$va$ssa$sha$sa$ha] → n;
$anusvara→ m\u0307;
# Urdu compatibility
$ya$nukta}$x → y\u0307;
$ya$nukta$virama → y\u0307;
$ya$nukta → y\u0307a;
@ -186,6 +206,7 @@ $ela → l\u0331a;
$uya}$x → y\u0307;
$uya$virama → y\u0307;
$uya → y\u0307a;
# normal consonants
$ka$virama}$ha→k'';
$ka}$x→k;
$ka$virama→k;
@ -312,6 +333,7 @@ $sa$virama}$ssa→s'';
$sa$virama}$sa→s'';
$sa}$x→s;
$sa$virama→s;
#for gurmukhi
$sa$nukta}$x→s\u0301;
$sa$nukta$virama→s\u0301;
$sa$nukta→s\u0301a;
@ -325,6 +347,7 @@ $ssa→s\u0323a;
$ha}$x→h;
$ha$virama→h;
$ha→ha;
# dependent vowels (should never occur except following consonants)
$forceIndependentMatra{$aa → \u0314a\u0304;
$forceIndependentMatra{$ai → \u0314ai;
$forceIndependentMatra{$au → \u0314au;
@ -338,6 +361,7 @@ $forceIndependentMatra{$llh → \u0314l\u0325\u0304;
$forceIndependentMatra{$lh → \u0314l\u0325;
$forceIndependentMatra{$e → \u0314e\u0304;
$forceIndependentMatra{$o → \u0314o\u0304;
#extra vowels
$forceIndependentMatra{$ce → \u0314e\u0306;
$forceIndependentMatra{$co → \u0314o\u0306;
$forceIndependentMatra{$se → \u0314e;
@ -357,10 +381,12 @@ $llh → l\u0325\u0304;
$lh → l\u0325;
$e → e\u0304;
$o → o\u0304;
#extra vowels
$ce → e\u0306;
$co → o\u0306;
$se → e;
$so → o;
#dependent vowels when following independent vowels. Generally illegal; handled only for round-tripping
$waa} $x → a\u0304\u0314;
$wai} $x → ai\u0314;
$wau} $x → au\u0314;
@ -375,11 +401,13 @@ $wl } $x → l\u0325\u0314;
$we } $x → e\u0304\u0314;
$wo } $x → o\u0304\u0314;
$wa } $x → a\u0314;
#extra vowels
$wce} $x → e\u0306\u0314;
$wco} $x → o\u0306\u0314;
$wse} $x → e\u0314;
$wso} $x → o\u0314;
$om} $x → ''om\u0314;
# independent vowels when preceded by vowels
$vowels{$waa → ''a\u0304;
$vowels{$wai → ''ai;
$vowels{$wau → ''au;
@ -394,10 +422,12 @@ $vowels{$wl → ''l\u0325;
$vowels{$we → ''e\u0304;
$vowels{$wo → ''o\u0304;
$vowels{$wa → ''a;
#extra vowels
$vowels{$wce → ''e\u0306;
$vowels{$wco → ''o\u0306;
$vowels{$wse → ''e;
$vowels{$wso → ''o;
# independent vowels (otherwise)
$waa → a\u0304;
$wai → ai;
$wau → au;
@ -412,15 +442,18 @@ $wl → l\u0325;
$we → e\u0304;
$wo → o\u0304;
$wa → a;
#extra vowels
$wce → e\u0306;
$wco → o\u0306;
$wse → e;
$wso → o;
$om → ''om;
#stress marks
$avagraha → \u0315;
$chandrabindu$anusvara→\u0303;
$chandrabindu → m\u0310;
$visarga→h\u0323;
#numbers
$zero → 0;
$one → 1;
$two → 2;
@ -439,9 +472,11 @@ $kta→t\u0331;
$danda→'.';
$doubleDanda→'.';
\uE070→; # ABBREVIATION SIGN
# LETTER RA WITH MIDDLE DIAGONAL
\uE071}$x→ra;
\uE071$virama→r;
\uE071→ra;
# LETTER RA WITH LOWER DIAGONAL
\uE072}$x→ra;
\uE072$virama→r;
\uE072→ra;
@ -460,3 +495,4 @@ $doubleDanda→'.';
\uE07F→; # URA
\uE080→; # EK ONKAR
\uE004→; # DEVANAGARI VOWEL SIGN SHORT A

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Malayalam.txt
# Generated from CLDR
#
# InterIndic-Malayalam
#:: NFD (NFC) ;
\uE001→; # REMAP (indicExceptions.txt): \u0D01→ = SIGN CANDRABINDU→SIGN ANUSVARA
\uE002→; # SIGN ANUSVARA
\uE003→ഃ; # SIGN VISARGA
@ -138,3 +141,6 @@
\uE083→ത\u0D4D; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ൧;
# :: NFC (NFD) ;
# eof

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Oriya.txt
# Generated from CLDR
#
# InterIndic-Oriya
#:: NFD (NFC) ;
\uE001→\u0B01; # SIGN CANDRABINDU
\uE002→ଂ; # SIGN ANUSVARA
\uE003→; # SIGN VISARGA
@ -136,3 +139,6 @@
\uE083→ତ\u0B4D; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ୧;
# :: NFC (NFD) ;
# eof

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Tamil.txt
# Generated from CLDR
#
# InterIndic-Tamil
#:: NFD (NFC) ;
\uE001→\u0B82; # FALLBACK SIGN CANDRABINDU
\uE002→\u0B82; # SIGN ANUSVARA
\uE003→ஃ; # SIGN VISARGA
@ -137,3 +140,6 @@
\uE081→வ; # FALLBACK FOR ORIYA LETTER WA
\uE082→; # Devanagari Glottal Stop
\uE083→த\u0BCD; # Bengali Khanda-ta
# :: NFC (NFD) ;
# eof

View File

@ -1,12 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: InterIndic_Telugu.txt
# Generated from CLDR
#
# InterIndic-Telugu
#:: NFD (NFC) ;
\uE001→ఁ; # SIGN CANDRABINDU
\uE002→; # SIGN ANUSVARA
\uE003→ః; # SIGN VISARGA
@ -137,3 +140,6 @@
\uE083→త\u0C4D; # Bengali Khanda-ta
0 → ; # FALLBACK FOR TAMIL
1 → ౧;
# :: NFC (NFD) ;
# eof

View File

@ -1,13 +1,15 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Jamo_Latin.txt
# File: Jamo_Latn.txt
# Generated from CLDR
#
::['ᄀ-하-ᅵᆨ-ᇂ가-힣];
::NFD;
::ConjoiningJamo-Latin;
::NFC;

View File

@ -1,12 +1,14 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_InterIndic.txt
# Generated from CLDR
#
# Kannada-InterIndic
\u0CC6ೕ→\uE047; # VOWEL SIGN EE
\u0CC6\u0CCDೖ→\uE048\uE04D; # VOWEL SIGN AI
\u0CC6ೖ→\uE048; # VOWEL SIGN AI
@ -90,3 +92,5 @@
೭→\uE06D; # DIGIT SEVEN
೮→\uE06E; # DIGIT EIGHT
೯→\uE06F; # DIGIT NINE
# eof

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Bengali.txt
# File: Knda_Beng.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Bengali;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Devanagari.txt
# File: Knda_Deva.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Devanagari;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Gujarati.txt
# File: Knda_Gujr.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Gujarati;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Gurmukhi.txt
# File: Knda_Guru.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Gurmukhi;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Latin.txt
# File: Knda_Latn.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBC-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Latin;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Malayalam.txt
# File: Knda_Mlym.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Malayalam;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Oriya.txt
# File: Knda_Orya.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Oriya;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Tamil.txt
# File: Knda_Taml.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Tamil;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Kannada_Telugu.txt
# File: Knda_Telu.txt
# Generated from CLDR
#
::[-ಃಅ-ಌಎ-ಐಒ-ನಪ-ಳವ-ಹ\u0CBCಽಾ-ೄ\u0CC6-ೈೊ-\u0CCDೕ-ೖೞೠ-ೡ೦-೯];
::NFD;
::Kannada-InterIndic;
::InterIndic-Telugu;
::NFC;

View File

@ -1,16 +1,32 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_ASCII.txt
# Generated from CLDR
#
# This handles only Latin, Common, and IDEOGRAPHIC NUMBER ZERO (Han).
#
:: [[:Latin:][:Common:][:Inherited:][]] ;
#
# Don't want NFKD, because that would convert things like superscripts and
# subscripts, which we do not want. So the individual transforms below
# include an appropriate subset of the NFKD ones.
# Here we remove accents from Latin characters. We then recompose to permit rules
# such as mapping NOT EQUAL TO to an ASCII equivalent e.g. "!=" if we choose to.
#
:: NFD() ;
[:Latin:] { [:Mn:]+ → ; # maps to nothing; remove all Mn following Latin letter
:: NFC() ;
#
# Some of the following mappings (noted) are from CLDR character-fallback data.
# (Note, here "character-fallback" uses U+2039/U+203A to avoid XML issues)
#
# Latin letters and IPA
#
Æ → AE ; # 00C6;LATIN CAPITAL LETTER AE (from character-fallback)
Ð → D ; # 00D0;LATIN CAPITAL LETTER ETH
Ø → O ; # 00D8;LATIN CAPITAL LETTER O WITH STROKE
@ -222,6 +238,7 @@
ỽ → v ; # 1EFD;LATIN SMALL LETTER MIDDLE-WELSH V
Ỿ → Y ; # 1EFE;LATIN CAPITAL LETTER Y WITH LOOP
ỿ → y ; # 1EFF;LATIN SMALL LETTER Y WITH LOOP
# Presentation forms
ff → ff ; # FB00;LATIN SMALL LIGATURE FF (compat)
fi → fi ; # FB01;LATIN SMALL LIGATURE FI (compat)
fl → fl ; # FB02;LATIN SMALL LIGATURE FL (compat)
@ -229,6 +246,7 @@
ffl → ffl ; # FB04;LATIN SMALL LIGATURE FFL (compat)
ſt → st ; # FB05;LATIN SMALL LIGATURE LONG S T (compat)
st → st ; # FB06;LATIN SMALL LIGATURE ST (compat)
# Fullwidth
→ A ; # FF21;FULLWIDTH LATIN CAPITAL LETTER A (compat)
→ B ; # FF22;FULLWIDTH LATIN CAPITAL LETTER B (compat)
→ C ; # FF23;FULLWIDTH LATIN CAPITAL LETTER C (compat)
@ -281,6 +299,9 @@
→ x ; # FF58;FULLWIDTH LATIN SMALL LETTER X (compat)
→ y ; # FF59;FULLWIDTH LATIN SMALL LETTER Y (compat)
→ z ; # FF5A;FULLWIDTH LATIN SMALL LETTER Z (compat)
#
# Currency and letterlike
#
© → '(C)' ; # 00A9;COPYRIGHT SIGN (from character-fallback)
® → '(R)' ; # 00AE;REGISTERED SIGN (from character-fallback)
₠ → CE ; # 20A0;EURO-CURRENCY SIGN (from character-fallback)
@ -329,6 +350,9 @@
→ e ; # 2147;DOUBLE-STRUCK ITALIC SMALL E (compat)
→ i ; # 2148;DOUBLE-STRUCK ITALIC SMALL I (compat)
→ j ; # 2149;DOUBLE-STRUCK ITALIC SMALL J (compat)
#
# Squared Latin
#
㍱ → hPa ; # 3371;SQUARE HPA (compat)
㍲ → da ; # 3372;SQUARE DA (compat)
㍳ → AU ; # 3373;SQUARE AU (compat)
@ -410,6 +434,9 @@
㏝ → Wb ; # 33DD;SQUARE WB (compat)
㏞ → 'V/m' ; # 33DE;SQUARE V OVER M (compat) (from character-fallback)
㏟ → 'A/m' ; # 33DF;SQUARE A OVER M (compat) (from character-fallback)
#
# Enclosed Latin
#
⒜ → '(a)' ; # 249C;PARENTHESIZED LATIN SMALL LETTER A (compat)
⒝ → '(b)' ; # 249D;PARENTHESIZED LATIN SMALL LETTER B (compat)
⒞ → '(c)' ; # 249E;PARENTHESIZED LATIN SMALL LETTER C (compat)
@ -436,6 +463,9 @@
⒳ → '(x)' ; # 24B3;PARENTHESIZED LATIN SMALL LETTER X (compat)
⒴ → '(y)' ; # 24B4;PARENTHESIZED LATIN SMALL LETTER Y (compat)
⒵ → '(z)' ; # 24B5;PARENTHESIZED LATIN SMALL LETTER Z (compat)
#
# Roman numerals
#
→ I ; # 2160;ROMAN NUMERAL ONE (compat)
Ⅱ → II ; # 2161;ROMAN NUMERAL TWO (compat)
Ⅲ → III ; # 2162;ROMAN NUMERAL THREE (compat)
@ -468,6 +498,9 @@
→ c ; # 217D;SMALL ROMAN NUMERAL ONE HUNDRED (compat)
→ d ; # 217E;SMALL ROMAN NUMERAL FIVE HUNDRED (compat)
ⅿ → m ; # 217F;SMALL ROMAN NUMERAL ONE THOUSAND (compat)
#
# Fractions
#
¼ → ' 1/4' ; # 00BC;VULGAR FRACTION ONE QUARTER (from character-fallback)
½ → ' 1/2' ; # 00BD;VULGAR FRACTION ONE HALF (from character-fallback)
¾ → ' 3/4' ; # 00BE;VULGAR FRACTION THREE QUARTERS (from character-fallback)
@ -484,6 +517,9 @@
⅝ → ' 5/8' ; # 215D;VULGAR FRACTION FIVE EIGHTHS (from character-fallback)
⅞ → ' 7/8' ; # 215E;VULGAR FRACTION SEVEN EIGHTHS (from character-fallback)
⅟ → ' 1/' ; # 215F;FRACTION NUMERATOR ONE (from character-fallback)
#
# Enclosed numeric
#
⑴ → '(1)' ; # 2474;PARENTHESIZED DIGIT ONE (compat)
⑵ → '(2)' ; # 2475;PARENTHESIZED DIGIT TWO (compat)
⑶ → '(3)' ; # 2476;PARENTHESIZED DIGIT THREE (compat)
@ -524,6 +560,9 @@
⒙ → '18.' ; # 2499;NUMBER EIGHTEEN FULL STOP (compat)
⒚ → '19.' ; # 249A;NUMBER NINETEEN FULL STOP (compat)
⒛ → '20.' ; # 249B;NUMBER TWENTY FULL STOP (compat)
#
# Other numeric (ideographic and fullwidth)
#
→ 0 ; # 3007;IDEOGRAPHIC NUMBER ZERO
→ 0 ; # FF10;FULLWIDTH DIGIT ZERO (compat)
→ 1 ; # FF11;FULLWIDTH DIGIT ONE (compat)
@ -535,6 +574,9 @@
→ 7 ; # FF17;FULLWIDTH DIGIT SEVEN (compat)
→ 8 ; # FF18;FULLWIDTH DIGIT EIGHT (compat)
→ 9 ; # FF19;FULLWIDTH DIGIT NINE (compat)
#
# Spaces
#
\u00A0 → ' ' ; # 00A0;NO-BREAK SPACE
\u2002 → ' ' ; # 2002;EN SPACE (compat)
\u2003 → ' ' ; # 2003;EM SPACE (compat)
@ -547,6 +589,16 @@
\u200A → ' ' ; # 200A;HAIR SPACE (compat)
\u205F → ' ' ; # 205F;MEDIUM MATHEMATICAL SPACE (compat)
\u3000 → ' ' ; # 3000;IDEOGRAPHIC SPACE (from character-fallback)
#
# Quotes, apostrophes
#
ʹ → \' ; # 02B9;MODIFIER LETTER PRIME
ʺ → \" ; # 02BA;MODIFIER LETTER DOUBLE PRIME
ʻ → \' ; # 02BB;MODIFIER LETTER TURNED COMMA
ʼ → \' ; # 02BC;MODIFIER LETTER APOSTROPHE
ʽ → \' ; # 02BD;MODIFIER LETTER REVERSED COMMA
ˈ → \' ; # 02C8;MODIFIER LETTER VERTICAL LINE
ˋ → '`' ; # 02CB;MODIFIER LETTER GRAVE ACCENT
→ \' ; # 2018;LEFT SINGLE QUOTATION MARK (from character-fallback)
→ \' ; # 2019;RIGHT SINGLE QUOTATION MARK (from character-fallback)
→ ',' ; # 201A;SINGLE LOW-9 QUOTATION MARK (from character-fallback)
@ -565,6 +617,9 @@
» → '>>' ; # 00BB;RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK (from character-fallback)
→ '<' ; # 2039;SINGLE LEFT-POINTING ANGLE QUOTATION MARK
→ '>' ; # 203A;SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
#
# Dashes, hyphens...
#
\u00AD → '-' ; # 00AD;SOFT HYPHEN (from character-fallback)
→ '-' ; # 2010;HYPHEN (from character-fallback)
→ '-' ; # 2011;NON-BREAKING HYPHEN (from character-fallback)
@ -577,6 +632,15 @@
→ '-' ; # FE58;SMALL EM DASH (compat)
﹣ → '-' ; # FE63;SMALL HYPHEN-MINUS (compat)
→ '-' ; # FF0D;FULLWIDTH HYPHEN-MINUS (compat)
#
# Other misc punctuation and symbols
#
˂ → '<' ; # 02C2;MODIFIER LETTER LEFT ARROWHEAD
˃ → '>' ; # 02C3;MODIFIER LETTER RIGHT ARROWHEAD
˄ → '^' ; # 02C4;MODIFIER LETTER UP ARROWHEAD
ˆ → '^' ; # 02C6;MODIFIER LETTER CIRCUMFLEX ACCENT
ː → ':' ; # 02D0;MODIFIER LETTER TRIANGULAR COLON
˜ → '~' ; # 02DC;SMALL TILDE
‖ → '||' ; # 2016;DOUBLE VERTICAL LINE
→ '.' ; # 2024;ONE DOT LEADER (compat)
‥ → '..' ; # 2025;TWO DOT LEADER (compat)
@ -589,6 +653,7 @@
⁈ → '?!' ; # 2048;QUESTION EXCLAMATION MARK (compat)
⁉ → '!?' ; # 2049;EXCLAMATION QUESTION MARK (compat)
→ '*' ; # 204E;LOW ASTERISK
# CJK
、 → ',' ; # 3001;IDEOGRAPHIC COMMA
。 → '.' ; # 3002;IDEOGRAPHIC FULL STOP
〈 → '<' ; # 3008;LEFT ANGLE BRACKET
@ -601,6 +666,7 @@
〙 → ']' ; # 3019;RIGHT WHITE TORTOISE SHELL BRACKET
〚 → '[' ; # 301A;LEFT WHITE SQUARE BRACKET
〛 → ']' ; # 301B;RIGHT WHITE SQUARE BRACKET
# Vertical and small forms
︐ → ',' ; # FE10;PRESENTATION FORM FOR VERTICAL COMMA (compat)
︑ → ',' ; # FE11;PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA (compat)
︒ → '.' ; # FE12;PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP (compat)
@ -646,6 +712,7 @@
﹩ → '$' ; # FE69;SMALL DOLLAR SIGN (compat)
﹪ → '%' ; # FE6A;SMALL PERCENT SIGN (compat)
﹫ → '@' ; # FE6B;SMALL COMMERCIAL AT (compat)
# Fullwidth and halfwidth
→ '!' ; # FF01;FULLWIDTH EXCLAMATION MARK (compat)
→ '#' ; # FF03;FULLWIDTH NUMBER SIGN (compat)
→ '$' ; # FF04;FULLWIDTH DOLLAR SIGN (compat)
@ -679,8 +746,13 @@
⦆ → '))' ; # FF60;FULLWIDTH RIGHT WHITE PARENTHESIS (compat)(from character-fallback)
。 → '.' ; # FF61;HALFWIDTH IDEOGRAPHIC FULL STOP (compat)
、 → ',' ; # FF64;HALFWIDTH IDEOGRAPHIC COMMA (compat)
#
# Other math operators (non-ASCII-range)
#
× → '*' ; # 00D7;MULTIPLICATION SIGN
÷ → '/' ; # 00F7;DIVISION SIGN
˖ → '+' ; # 02D6;MODIFIER LETTER PLUS SIGN
˗ → '-' ; # 02D7;MODIFIER LETTER MINUS SIGN
→ '-' ; # 2212;MINUS SIGN (from character-fallback)
→ '/' ; # 2215;DIVISION SLASH (from character-fallback)
→ '\' ; # 2216;SET MINUS (from character-fallback)
@ -693,3 +765,4 @@
⩴ → '::=' ; # 2A74;DOUBLE COLON EQUAL (compat)
⩵ → '==' ; # 2A75;TWO CONSECUTIVE EQUALS SIGNS (compat)
⩶ → '===' ; # 2A76;THREE CONSECUTIVE EQUALS SIGNS (compat)

File diff suppressed because it is too large Load Diff

View File

@ -1,12 +1,69 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_ConjoiningJamo.txt
# Generated from CLDR
#
# Follows the Ministry of Culture and Tourism romanization: see http://www.korea.net/korea/kor_loca.asp?code=A020303
# http://www.unicode.org/cldr/transliteration_guidelines.html#Korean
#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
#- the INDEX file. This transliterator is, by itself, not
#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
#- inverses thereof.
# Transliteration from Latin characters to Korean script is done in
# two steps: Latin to Jamo, then Jamo to Hangul. The Jamo-Hangul
# transliteration is done algorithmically following Unicode 3.0
# section 3.11. This file implements the Latin to Jamo
# transliteration using rules.
# Jamo occupy the block 1100-11FF. Within this block there are three
# groups of characters: initial consonants or choseong (I), medial
# vowels or jungseong (M), and trailing consonants or jongseong (F).
# Standard Korean syllables are of the form I+M+F*.
# Section 3.11 describes the use of 'filler' jamo to convert
# nonstandard syllables to standard form: the choseong filler 115F and
# the jungseong filler 1160. In this transliterator, we will not use
# 115F or 1160.
# We will, however, insert two 'null' jamo to make foreign words
# conform to Korean syllable structure. These are the null initial
# consonant 110B (IEUNG) and the null vowel 1173 (EU). In Latin text,
# we will use the separator in order to disambiguate strings,
# e.g. "kan-ggan" (initial GG) vs. "kanggan" (final NG + initial G).
# We will not use all of the characters in the jamo block. We will
# only use the 19 initials, 21 medials, and 27 finals possessing a
# jamo short name as defined in section 4.4 of the Unicode book.
# Rules of thumb. These guidelines provide the basic framework
# for the rules. They are phrased in terms of Latin-Jamo transliteration.
# The Jamo-Latin rules derive from these, since the Jamo-Latin rules are
# just context-free transliteration of jamo to corresponding short names,
# with the addition of separators to maintain round-trip integrity
# in the context of the Latin-Jamo rules.
# A sequence of vowels:
# - Take the longest sequence you can. If there are too many, or you don't
# have a starting consonant, introduce a 110B as necessary.
# A sequence of consonants.
# - First join the double consonants: G + G -> GG
# - In the remaining list,
# -- If there is no preceding vowel, take the first consonant, and insert EU
# after it. Continue with the rest of the consonants.
# -- If there is one consonant, attach to the following vowel
# -- If there are two consonants and a following vowel, attach one to the
# preceding vowel, and one to the following vowel.
# -- If there are more than two consonants, join the first two together if you
# can: L + G => LG
# -- If you still end up with more than 2 consonants, insert EU after the
# first one, and continue with the rest of the consonants.
#----------------------------------------------------------------------
# Variables
# Some latin consonants or consonant pairs only occur as initials, and
# some only as finals, but some occur as both. This makes some jamo
# consonants ambiguous when transliterated into latin.
# Initial only: IEUNG BB DD JJ R
# Final only: BS GS L LB LG LH LM LP LS LT NG NH NJ
# Initial and Final: B C D G GG H J K M N P S SS T
$Gi = ᄀ;
$KKi = ᄁ;
$Ni = ᄂ;
@ -77,20 +134,81 @@ $Hf = ᇂ;
$jamoInitial = [ᄀ-ᄒ];
$jamoMedial = [ᅡ-ᅵ];
$latinInitial = [bcdghjklmnprst];
# Any character in the latin transliteration of a medial
$latinMedial = [aeiouwy];
# The last character of the latin transliteration of a medial
$latinMedialEnd = [aeiou];
# Disambiguation separator
$sep = \-;
#----------------------------------------------------------------------
# Jamo-Latin
#
# Jamo to latin is relatively simple, since it is the latin that is
# ambiguous. Most rules are straightforward, and we encode them below
# as simple add-on back rule, e.g.:
# $jamoMedial {bs} → $BS;
# becomes
# $jamoMedial {bs} ↔ $BS;
#
# Furthermore, we don't care about the ordering for Jamo-Latin because
# we are going from single characters, so we can very easily piggyback
# on the Latin-Jamo.
#
# The main issue with Jamo-Latin is when to insert separators.
# Separators are inserted to obtain correct round trip behavior. For
# example, the sequence Ki A Gf Gi E, if transliterated to "kagge",
# would then round trip to Ki A GGi E. To prevent this, we insert a
# separator: "kag-ge". IMPORTANT: The need for separators depends
# very specifically on the behavior of the Latin-Jamo rules. A change
# in the Latin-Jamo behavior can completely change the way the
# separator insertion must be done.
# First try to preserve actual separators in the jamo text by doubling
# them. This fixes problems like:
# (Di)(A)(Ji)(U)(NG)-(IEUNG)(YEO)(Nf)(Gi)(YEO)(L) => dajung-yeongyeol
# => (Di)(A)(Ji)(U)(NG)(IEUNG)(YEO)(Nf)(Gi)(YEO)(L). This is optional
# -- if we don't care about losing separators in the jamo, we can delete
# this rule.
$sep $sep ↔ $sep;
# Triple consonants. For three consonants "axxx" we insert a
# separator between the first and second "x" if XXf, Xf, and Xi all
# exist, and we have A Xf XXi. This prevents the reverse
# transliteration to A XXf Xi.
$sep ← $latinMedialEnd s {} $SSi;
# For vowels the rule is similar. If there is a vowel "ae" such that
# "a" by itself and "e" by itself are vowels, then we want to map A E
# to "a-e" so as not to round trip to AE. However, in the text Ki EO
# IEUNG E we don't need to map to "keo-e". "keoe" suffices. For
# vowels of the form "aei", both "ae" + "i" and "a" + "ei" must be
# tested. NOTE: These rules used to have a left context of
# $latinInitial instead of [^$latinMedial]. The problem with this is
# sequences where an initial IEUNG is transliterated away:
# (IEUNG)(A)(IEUNG)(EO) => aeo => (IEUNG)(AE)(IEUNG)(O)
# Also problems in cases like gayeo, which needs to be gaye-o
# The hard case is a chain, like aeoeu. Normally interpreted as ae oe u. So for a-eoeu, we have to insert $sep
# But, we don't insert between the o and the e.
#
# a ae
# e eo eu
# i
# o oe
# u
# ui
# wa wae we wi
# yae ya yeo ye yo yu
# These are simple, since they can't chain. Note that we don't handle extreme cases like [ga][eo][e][o]
$sep ← a {} [$E $EO $EU];
$sep ← [^aow] e {} [$O $OE];
$sep ← [^aowy] e {} [$U $UI];
$sep ← [^ey] o {} [$E $EO $EU];
$sep ← [^y] u {} [$I];
# Similar to the above, but with an intervening $IEUNG.
$sep ← [^$latinMedial] [y] e {} $IEUNG [$O $OE];
$sep ← [^$latinMedial] e {} $IEUNG [$O $OE $U];
$sep ← [^$latinMedial] [o a] {} $IEUNG [$E $EO $EU];
$sep ← [^$latinMedial] [w y] a {} $IEUNG [$E $EO $EU];
# Single finals followed by IEUNG. The jamo sequence A Xf IEUNG E,
# where Xi also exists, must be transliterated as "ax-e" to prevent
# the round trip conversion to A Xi E.
$sep ← $latinMedialEnd b {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd d {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd g {} $IEUNG $jamoMedial;
@ -103,6 +221,10 @@ $sep ← $latinMedialEnd p {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd s {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd t {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd l {} $IEUNG $jamoMedial;
# Double finals followed by IEUNG. Similar to the single finals
# followed by IEUNG. Any latin consonant pair X Y, between medials,
# that we would split by Latin-Jamo, we must handle when it occurs as
# part of A XYf IEUNG E, to prevent round trip conversion to A Xf Yi E
$sep ← $latinMedialEnd b s {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd k k {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd g s {} $IEUNG $jamoMedial;
@ -118,9 +240,16 @@ $sep ← $latinMedialEnd n h {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd n j {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd s s {} $IEUNG $jamoMedial;
$sep ← $latinMedialEnd ch {} $IEUNG $jamoMedial;
# Split doubles. Text of the form A Xf Xi E, where XXi also occurs,
# we transliterate as "ax-xe" to prevent round trip transliteration as
# A XXi E.
$sep ← $latinMedialEnd j {} $Ji $jamoMedial;
$sep ← $latinMedialEnd k {} $Ki $jamoMedial;
$sep ← $latinMedialEnd s {} $Si $jamoMedial;
# XYY. This corresponds to the XYY rule in Latin-Jamo. By default
# Latin-Jamo maps "xyy" to Xf YYi, to keep YY together. As a result,
# "xyy" forms that correspond to XYf Yi must be transliterated as
# "xy-y".
$sep ← $latinMedialEnd b s {} [$Si $SSi];
$sep ← $latinMedialEnd g s {} [$Si $SSi];
$sep ← $latinMedialEnd l b {} [$Bi];
@ -128,12 +257,25 @@ $sep ← $latinMedialEnd l g {} [$Gi];
$sep ← $latinMedialEnd l s {} [$Si $SSi];
$sep ← $latinMedialEnd n g {} [$Gi];
$sep ← $latinMedialEnd n j {} [$Ji $JJi];
# $sep ← $latinMedialEnd l {} [$PPi];
# $sep ← $latinMedialEnd l {} [$TTi];
$sep ← $latinMedialEnd l p {} [$Pi];
$sep ← $latinMedialEnd l t {} [$Ti];
$sep ← $latinMedialEnd k {} [$KKi $Ki];
$sep ← $latinMedialEnd p {} $Pi;
$sep ← $latinMedialEnd t {} $Ti;
$sep ← $latinMedialEnd c {} [$Hi];
# Deletion of IEUNG is handled below.
#----------------------------------------------------------------------
# Latin-Jamo
# [Basic, context-free Jamo-Latin rules are embedded here too. See
# above.]
# Split digraphs: Text of the form 'axye', where 'xy' is a final
# digraph, 'x' is a final (by itself), 'y' is an initial, and 'a' and
# 'e' are medials, we want to transliterate this as A Xf Yi E rather
# than A XYf IEUNG E. We do NOT include text of the form "axxe",
# since that is handled differently below. These rules are generated
# programmatically from the jamo data.
$jamoMedial {b s} $latinMedial → $Bf $Si;
$jamoMedial {g s} $latinMedial → $Gf $Si;
$jamoMedial {l b} $latinMedial → $L $Bi;
@ -146,6 +288,9 @@ $jamoMedial {l t} $latinMedial → $L $Ti;
$jamoMedial {n g} $latinMedial → $Nf $Gi;
$jamoMedial {n h} $latinMedial → $Nf $Hi;
$jamoMedial {n j} $latinMedial → $Nf $Ji;
# Single consonants are initials: Text of the form 'axe', where 'x'
# can be an initial or a final, and 'a' and 'e' are medials, we want
# to transliterate as A Xi E rather than A Xf IEUNG E.
$jamoMedial {b} $latinMedial → $Bi;
$jamoMedial {ch} $latinMedial → $CHi;
$jamoMedial {d} $latinMedial → $Di;
@ -159,13 +304,22 @@ $jamoMedial {p} $latinMedial → $Pi;
$jamoMedial {s} $latinMedial → $Si;
$jamoMedial {t} $latinMedial → $Ti;
$jamoMedial {l} $latinMedial → $Li;
# Doubled initials. The sequence "axxe", where XX exists as an initial
# (XXi), and also Xi and Xf exist (true of all digraphs XX), we want
# to transliterate as A XXi E, rather than split to A Xf Xi E.
$jamoMedial {p p} $latinMedial → $PPi;
$jamoMedial {t t} $latinMedial → $TTi;
$jamoMedial {j j} $latinMedial → $JJi;
$jamoMedial {k k} $latinMedial → $KKi;
$jamoMedial {s s} $latinMedial → $SSi;
# XYY. Because doubled consonants bind more strongly than XY
# consonants, we must handle the sequence "axyy" specially. Here XYf
# and YYi must exist. In these cases, we map to Xf YYi rather than
# XYf.
# However, there are two special cases.
$jamoMedial {lp} p p → $LP;
$jamoMedial {lt} t t → $LT;
# End special cases
$jamoMedial {b} s s → $Bf;
$jamoMedial {g} s s → $Gf;
$jamoMedial {l} b b → $L;
@ -175,6 +329,12 @@ $jamoMedial {l} t t → $L;
$jamoMedial {l} p p → $L;
$jamoMedial {n} g g → $Nf;
$jamoMedial {n} j j → $Nf;
# Finals: Attach consonant with preceding medial to preceding medial.
# Do this BEFORE mapping consonants to initials. Longer keys must
# precede shorter keys that they start with, e.g., the rule for 'bs'
# must precede 'b'.
# [BASIC Jamo-Latin FINALS handled here. Order irrelevant within this
# block for Jamo-Latin.]
$jamoMedial {bs} ↔ $BS;
$jamoMedial {b} ↔ $Bf;
$jamoMedial {ch} ↔ $Cf;
@ -202,6 +362,11 @@ $jamoMedial {p} ↔ $Pf;
$jamoMedial {ss} ↔ $SSf;
$jamoMedial {s} ↔ $Sf;
$jamoMedial {t} ↔ $Tf;
# Initials: Attach single consonant to following medial. Do this
# AFTER mapping finals. Longer keys must precede shorter keys that
# they start with, e.g., the rule for 'gg' must precede 'g'.
# [BASIC Jamo-Latin INITIALS handled here. Order irrelevant within
# this block for Jamo-Latin.]
{kk} $latinMedial ↔ $KKi;
{g} $latinMedial ↔ $Gi;
{n} $latinMedial ↔ $Ni;
@ -221,6 +386,21 @@ $jamoMedial {t} ↔ $Tf;
{t} $latinMedial ↔ $Ti;
{p} $latinMedial ↔ $Pi;
{h} $latinMedial ↔ $Hi;
# 'r' in final position. Because of the equivalency of the 'l' and
# 'r' jamo (the glyphs are the same), we try to provide the same
# equivalency in Latin-Jamo. The 'l' to 'r' conversion is handled
# below. If we see an 'r' in an apparent final position, treat it
# like 'l'. For example, "karka" => Ki A R EU Ki A without this rule.
# Instead, we want Ki A L Ki A.
# Initial + Final: If we match the next rule, we have initial then
# final consonant with no intervening medial. We insert the null
# vowel BEFORE it to create a well-formed syllable. (In the next rule
# we insert a null vowel AFTER an anomalous initial.)
# Initial + X: This block matches an initial consonant not followed by
# a medial. We insert the null vowel after it. We handle double
# initials explicitly here; for single initial consonants we insert EU
# (as Latin) after them and let standard rules do the rest.
# BREAKS ROUND TRIP INTEGRITY
kk → $KKi $EU;
tt → $TTi $EU;
pp → $PPi $EU;
@ -228,7 +408,31 @@ ss → $SSi $EU;
jj → $JJi $EU;
ch → $CHi $EU;
([lbdghjkmnpst]) → | $1 eu;
# X + Final: Finally we have to deal with a consonant that can only be
# interpreted as a final (not an initial) and which is preceded
# neither by an initial nor a medial. It is the start of the
# syllable, but cannot be. Most of these will already be handled by
# the above rules. 'bs' splits into Bi EU Sf. Similar for 'gs' 'ng'
# 'nh' 'nj'. The only problem is 'l' and digraphs starting with 'l'.
# For this isolated case, we could add a null initial and medial,
# which would give "la" => IEUNG EU L IEUNG A, for example. A more
# economical solution is to transliterate isolated "l" (that is,
# initial "l") to "r". (Other similar conversions of consonants that
# occur neither as initials nor as finals are handled below.)
l → | r;
# Medials. If a medial is preceded by an initial, then we proceed
# normally. As usual, longer keys must precede shorter ones.
# [BASIC Jamo-Latin MEDIALS handled here. Order irrelevant within
# this block for Jamo-Latin.]
#
# a e i o u
# ae
# eo eu
# oe
# ui
# wa we wi
# wae
# yae ya yeo ye yo yu
$jamoInitial {ae} ↔ $AE;
$jamoInitial {a} ↔ $A;
$jamoInitial {eo} ↔ $EO;
@ -250,9 +454,18 @@ $jamoInitial {yeo} ↔ $YEO;
$jamoInitial {ye} ↔ $YE;
$jamoInitial {yo} ↔ $YO;
$jamoInitial {yu} ↔ $YU;
# We may see an anomalous isolated 'w' or 'y'. In that case, we
# interpret it as 'wi' and 'yu', respectively.
# BREAKS ROUND TRIP INTEGRITY
$jamoInitial {w} → | wi;
$jamoInitial {y} → | yu;
# Otherwise, insert a null consonant IEUNG before the medial (which is
# still an untransliterated latin vowel).
($latinMedial) → $IEUNG | $1;
# Convert non-jamo latin consonants to equivalents. These occur as
# neither initials nor finals in jamo. 'l' occurs as a final, but not
# an initial; it is handled above. The following letters (left hand
# side) will never be output by Jamo-Latin.
f → | p;
q → | k;
v → | b;
@ -260,5 +473,14 @@ x → | ks;
z → | s;
r → | l;
c → | k;
# Delete separators (Latin-Jamo).
$sep → ;
# Delete null consonants (Jamo-Latin). Do NOT delete null EU vowels,
# since these may also occur in text.
← $IEUNG;
#- N.B. DO NOT put any filters, NFD, etc. here -- those are aliased in
#- the INDEX file. This transliterator is, by itself, not
#- instantiated. It is used as a part of Latin-Jamo, Latin-Hangul, or
#- inverses thereof.
# eof

View File

@ -1,15 +1,22 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_InterIndic.txt
# Generated from CLDR
#
# Latin-InterIndic
#:: NFD;
#\uE000 reserved
#consonants
$chandrabindu=\uE001;
$anusvara=\uE002;
$visarga=\uE003;
#\uE004 reserved
# w<vowel> represents the stand-alone form
$wa=\uE005;
$waa=\uE006;
$wi=\uE007;
@ -64,8 +71,11 @@ $sha=\uE036;
$ssa=\uE037;
$sa=\uE038;
$ha=\uE039;
#\u093A Reserved
#\u093B Reserved
$nukta=\uE03C;
$avagraha=\uE03D; # SIGN AVAGRAHA
# <vowel> represents the dependent form
$aa=\uE03E;
$i=\uE03F;
$ii=\uE040;
@ -82,10 +92,17 @@ $so=\uE04A; # VOWEL SIGN SHORT O
$o=\uE04B; # ो
$au=\uE04C;
$virama=\uE04D;
# \u094E Reserved
# \u094F Reserved
$om = \uE050; # OM
# \u0951→; # UNMAPPED STRESS SIGN UDATTA
# \u0952→; # UNMAPPED STRESS SIGN ANUDATTA
# \u0953→; # UNMAPPED GRAVE ACCENT
# \u0954→; # UNMAPPED ACUTE ACCENT
$lm = \uE055;# Telugu Length Mark
$ailm=\uE056;# AI Length Mark
$aulm=\uE057;# AU Length Mark
#urdu compatibility forms
$uka=\uE058;
$ukha=\uE059;
$ugha=\uE05A;
@ -111,6 +128,7 @@ $seven=\uE06D; # DIGIT SEVEN
$eight=\uE06E; # DIGIT EIGHT
$nine=\uE06F; # DIGIT NINE
$dgs=\uE082;
# For all other scripts
$ecp0=\uE070;
$ecp1=\uE071;
$ecp2=\uE072;
@ -127,10 +145,13 @@ $ecpC=\uE07C;
$ecpD=\uE07D;
$ecpE=\uE07E;
$ecpF=\uE07F;
# Khanda-ta
$kta=\uE083;
# ॰→; # nothing in Latin maps to InterIndic ABBREVIATION SIGN
$depVowelAbove=[\uE03E-\uE040\uE045-\uE04C];
$depVowelBelow=[\uE041-\uE044];
$endThing=[$danda$doubleDanda];
# $x was originally called '§'; $z was '%'
$x=[$virama$aa$ai$au$ii$i$uu$u$rrh$rh$lh$e$o$se$ce$so$co];
$z=[bcdfghjklmnpqrstvwxyz];
$consonants=[[$ka-$ha]$z[क-ह][ক-হ][ਕ-ਹ][ક-હ][କ-ହ][க-ஹ][క-హ][ಕ-ಹ][ക-ഹ]];
@ -139,6 +160,8 @@ $consonants=[[$ka-$ha]$z[क-ह][ক-হ][ਕ-ਹ][ક-હ][କ-ହ][க-ஹ][
m\u0310→$chandrabindu;
h\u0323→$visarga;
x→$ka$virama$sa;
# convert to independent forms at start of word or syllable:
# dependent forms for roundtrip
\u0314a\u0304→$aa;
\u0314ai→$ai;
\u0314au→$au;
@ -159,6 +182,7 @@ x→$ka$virama$sa;
\u0314o\u0306→$co;
\u0314e→$se;
\u0314o→$so;
# preceded by consonants
$consonants{ a\u0304→$aa;
$consonants{ ai→$ai;
$consonants{ au→$au;
@ -179,6 +203,7 @@ $consonants{ e\u0306→$ce;
$consonants{ o\u0306→$co;
$consonants{ e→$se;
$consonants{ o→$so;
# e.g. keai -> {ka}{e}{wai}; k'ai -> {ka}{wai}; (ai) -> ({wai})
a\u0304→$waa;
ai→$wai;
au→$wau;
@ -199,6 +224,7 @@ o\u0306→$wco;
e→$wse;
''om→$om;
o→$wso;
# rules for anusvara
n}r\u0325 → $na|$virama;
n}l\u0325 → $na|$virama;
n}na → $na|$virama;
@ -211,12 +237,14 @@ n}[tdn] → $anusvara;
m}[pbm] → $anusvara;
n}[ylvshr] → $anusvara;
m\u0307 → $anusvara;
#urdu compatibility
q→$uka|$virama;
k\u0331h\u0331→$ukha |$virama;
g\u0307→ $ugha | $virama;
z → $ujha |$virama;
f → $ufa|$virama;
t\u0331→$kta;
# dev
y\u0307→$uya|$virama;
l\u0331→$ela|$virama;
n\u0331→$ena|$virama;
@ -268,15 +296,21 @@ h→$ha|$virama;
$danda'.'→$doubleDanda;
$depVowelAbove{'~'→$anusvara;
$depVowelBelow{'~'→$chandrabindu;
# convert to dependent forms after consonant with no vowel:
# e.g. kai -> {ka}{virama}ai -> {ka}{ai}
#$virama aa→$aa;
$virama a\u0304→$aa;
$virama ai→$ai;
$virama au→$au;
$virama ii→$ii;
$virama i\u0304→$ii;
$virama i→$i;
#$virama uu→$uu;
$virama u\u0304→$uu;
$virama u→$u;
#$virama rrh→$rrh;
$virama r\u0325\u0304→$rrh;
#$virama rh→$rh;
$virama r\u0325a→$rh;
$virama r\u0325→$rh;
$virama l\u0325\u0304→$llh;
@ -289,16 +323,23 @@ $virama e\u0306→$ce;
$virama o\u0306→$co;
$virama e→$se;
$virama o→$so;
# otherwise convert independent forms when separated by ': k'ai -> {ka}{virama}{wai}
#$virama''aa→$waa;
$virama''a\u0304→$waa;
$virama''ai→$wai;
$virama''au→$wau;
#$virama''ii→$wii;
$virama''i\u0304→$wii;
$virama''i→$wi;
#$virama''uu→$wuu;
$virama''u\u0304→$wuu;
$virama''u→$wu;
#$virama''rrh→$wrr;
$virama''r\u0325\u0304→$wrr;
#$virama''rh→$wr;
$virama''r\u0325→$wr;
$virama''l\u0325\u0304→$wll;
#$virama''lh→$wl;
$virama''l\u0325→$wl;
$virama''e\u0304→$we;
$virama''o\u0304→$wo;
@ -307,6 +348,7 @@ $virama''e\u0306→$wce;
$virama''o\u0306→$wco;
$virama''e→$wse;
$virama''o→$wso;
# no virama
''a\u0304→$waa;
''ai→$wai;
''au→$wau;
@ -340,3 +382,5 @@ $virama}$endThing→;
8→$eight;
9→$nine;
''→;
#:: NFC (NFD) ;

View File

@ -1,17 +1,32 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_NumericPinyin.txt
# Generated from CLDR
#
# According to the pinyin definitions I've been able to find:
# 'a', 'e' are the preferred bases
# otherwise 'o'
# otherwise last vowel
# The trailing form of syllables are the following:
# "a", "ai", "ao", "an", "ang",
# "o", "ou", "ong",
# "e", "ei", "er", "en", "eng",
# "i", "ia", "iao", "ie", "iu", "ian", "in", "iang", "ing", "iong",
# "u", "ua", "uo", "uai", "ui", "uan", "un", "uang", "ueng",
# "ü", "üe", "üan", "ün"
# so the letters the tone will 'hop' are:
::NFD (NFC);
# Combining tone marks as seen after the ::NFD step: macron, acute, caron, grave, breve.
$tone = [\u0304\u0301\u030C\u0300\u0306] ;
# Move the tone to the end of a syllable, and convert to number
# "e" + tone + "r": with "e" as left context, emit "r" first and then the converted tone.
e {($tone) r} → r &Pinyin-NumericPinyin($1);
# Tone followed by one of the listed trailing pieces (i, o, n, u, "on", "ng"):
# emit that piece ($2) first, then the converted tone ($1).
($tone) ( [i o n u {o n} {n g}]) → $2 &Pinyin-NumericPinyin($1);
# Any tone mark not handled above is converted in place.
($tone) → &Pinyin-NumericPinyin($1);
# The following backs up until it finds the right vowel, then deposits the tone
$vowel = [aAeEiIoOuU {u\u0308} {U\u0308} vV];
$consonant = [[a-z A-Z] - [$vowel]];
$digit = [1-5];
@ -20,3 +35,4 @@ $1 &NumericPinyin-Pinyin($3) $2 ← ([oO]) ([$vowel-[aeAE]]* $consonant*) ($digi
$1 &NumericPinyin-Pinyin($3) $2 ← ($vowel) ($consonant*) ($digit);
&NumericPinyin-Pinyin($1) ← [:letter:] {($digit)};
::NFC (NFD);

View File

@ -1,12 +1,13 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Armenian.txt
# File: Latn_Armn.txt
# Generated from CLDR
#
::NFD(NFC);
ev ↔ և ;
tʻ ↔ թ ;
@ -89,3 +90,4 @@ W ↔ Ւ ;
Ō ↔ Օ ;
F ↔ Ֆ ;
::NFC(NFD);

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Bengali.txt
# File: Latn_Beng.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
::NFD;
::Lower;
::Latin-InterIndic;
::InterIndic-Bengali;
::NFC;

File diff suppressed because it is too large Load Diff

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Devanagari.txt
# File: Latn_Deva.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
::NFD;
::Lower;
::Latin-InterIndic;
::InterIndic-Devanagari;
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Gujarati.txt
# File: Latn_Gujr.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
::NFD;
::Lower;
::Latin-InterIndic;
::InterIndic-Gujarati;
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Gurmukhi.txt
# File: Latn_Guru.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
::NFD;
::Lower;
::Latin-InterIndic;
::InterIndic-Gurmukhi;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Hangul.txt
# File: Latn_Hang.txt
# Generated from CLDR
#
::[-A-Za-zÀ-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǡǦ-ǭǰǴ-ǵǸ-ǻȀ-țȞ-ȟȦ-ȳḀ-ẙẠ-ỹK-Å];
::NFD;
::Lower;
::Latin-ConjoiningJamo;
::NFC;

View File

@ -1,14 +1,16 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Jamo.txt
# File: Latn_Jamo.txt
# Generated from CLDR
#
# Source filter: Latin-script characters, combining marks ([:M:]) and the
# hyphen, which the Latin-ConjoiningJamo rules use as a disambiguation separator.
::[[:script=Latin:][:M:]-];
# Decompose first so the rules see base letters with separate combining marks.
::NFD;
# The Latin-ConjoiningJamo rules are written against lowercase Latin letters.
::Lower;
# Rule-based Latin -> conjoining jamo conversion.
::Latin-ConjoiningJamo;
# Recompose; the filter limits this NFC step to Latin letters and marks.
::[[:script=Latin:][:M:]] NFC;

View File

@ -1,19 +1,70 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Katakana.txt
# File: Latn_Kana.txt
# Generated from CLDR
#
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
:: [[ᄀ-ᄒᄚᄡ\u1160-ᅵᆪᆬ-ᆭᆰ-ᆵ←-↓│■○\u3000-。「-」\u3099-\u309Aァ-ロワヲ-ヴヷヺ-ー!-~¢-₩][',.A-Za-z~À-ÖØ-öø-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳ\u0304Ӣ-ӣӮ-ӯḀ-ẙẠ-ỹᾱᾹῑῙῡῩK-Å]] ;
:: [:Latin:] fullwidth-halfwidth ();
:: NFD (NFC);
:: Lower (); # whenever transliterating from cased to uncased script, include this
# :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn. Small changes to make unambiguous.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yo: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# | de ~i di
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e fe
# | hu ~o fo
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron = \u0304 ;
# Variables used for doubled-consonants with tsu
$kana = [ぁ-ゔ] ;
$voice = [\u3099゛];
$semivoice = [\u309A゜];
@ -30,22 +81,38 @@ $r_start = [ラリルレロらりるれろ] ;
# Kana whose romaji starts with "w" (katakana and hiragana forms).
$w_start = [ワヰヱヲわゐゑを] ;
# Kana whose romaji starts with "v": a katakana w-row kana followed by the
# combining voiced sound mark U+3099 (see the va/vi/ve/vo rules).
$v_start = [ワヰヱヲ]\u3099 ;
# Base katakana used as the left context of the h- rule
# ("$voweled_basekana [\u3099 \u309A]? { h → ー").
$voweled_basekana = [ァ-オカキクケコサシスセソタチッツテトナ-ノハヒフヘホマ-ヲヵヶ] ;
# If ン is followed by a member of $n_quoter, its romaji form needs a
# trailing apostrophe to stay unambiguous.
# e.g., ン + ア is not the same as ナ, so it is written "n'a", not "na".
$n_quoter = [ア イ ウ エ オ ナ ニ ヌ ネ ヤ ユ ヨ ン] ;
# Small kana that the reverse rules romanize through a "~y…" sequence
# (ャ→ya, ィ→yi, ュ→yu, ェ→ye, ョ→yo).
$small_y = [ャィュェョ] ;
# Iteration mark. NOTE(review): the active iteration rules further down match
# ヽ directly; this variable appears only in a commented-out TODO rule in the
# visible part of the file — confirm whether it is still needed.
$iteration = ゝ ;
#------------------------------------------------------
# katakana rules
# Punctuation
# Two-way mapping between the ASCII full stop / comma and the
# ideographic full stop 。 / ideographic comma 、.
'.' ↔ 。;
',' ↔ 、;
# ' ' } [a-z] → ; # delete spaces before latin
# ' ' ← [^' '-ヿ] {} ['-ヿ] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter § marks
# TODO
# | $1 $1 ← ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
# v + vowel: a w-row katakana followed by the combining voiced sound mark
# U+3099 (text is in NFD at this point, so the mark is a separate character).
va ↔ ワ\u3099 ;
vi ↔ ヰ\u3099 ;
ve ↔ ヱ\u3099 ;
vo ↔ ヲ\u3099 ;
# Small ヵ / ヶ: romanized with a leading "~", the marker this file uses for
# explicit small forms.
'~ka' ↔ ヵ ;
'~ke' ↔ ヶ ;
# ~~~ begin shared rules ~~~
#special
# Reverse-only: a literal "~" followed by a small y-kana becomes the plain
# "y" + vowel romaji. The "~" is produced by the "consonant | '~'" reverse
# rules (such as the b- rule below) and consumed here.
ya ← '~'ャ;
yi ← '~'ィ ;
yu ← '~'ュ;
ye ← '~'ェ;
yo ← '~'ョ;
#normal
a ↔ ア ;
# Reverse: ヒ + voiced mark before a small y-kana emits "b" and then "~";
# the cursor ("|") is left before "~" so that "~" + small kana is picked up
# by the reverse rules above (giving bya, byu, ...).
b | '~' ← ヒ \u3099} $small_y ;
# Forward: "by" before a vowel emits ヒ + voiced mark, then leaves "~y" after
# the cursor so that "~y" + vowel is matched by the small-form rules.
by } $vowel → ヒ\u3099 | '~y' ;
@ -69,6 +136,7 @@ dje ← チ\u3099ェ ;
# Reverse-only: チ + voiced mark + small ョ is romanized as "djo".
djo ← チ\u3099ョ ;
# Two-way: "dji" is チ + combining voiced sound mark (U+3099).
dji ↔ チ\u3099 ;
# Forward: "dj" before a vowel emits チ + voiced mark and leaves "~y" after
# the cursor, so "~y" + vowel is then converted by the small-form rules.
dj } $vowel → チ\u3099 | '~y' ;
# TODO: QUESTION: use ĵĴżŻ instead of dj, dz
# Reverse-only: チ + small kana.
cha ← チャ ;
# The explicit "~i" keeps the small ィ distinguishable from a full-size イ.
chi'~i' ← チィ ; # liu
chu ← チュ ;
@ -85,6 +153,7 @@ gu ↔ ク\u3099 ;
# g-row: the k-row katakana followed by the combining voiced sound mark U+3099.
ge ↔ ケ\u3099 ;
go ↔ コ\u3099 ;
i ↔ イ ;
# Disabled generic j-rule, kept for reference; the explicit rules below are used.
# j } $vowel → シ\u3099 | '~y' ;
# j-row: シ + voiced mark, followed by a small kana.
ja ↔ シ\u3099ャ ;
# Reverse-only; the explicit "~i" marks the small ィ.
ji'~i' ← シ\u3099ィ ; # liu
ju ↔ シ\u3099ュ ;
@ -128,6 +197,8 @@ hi ↔ ヒ ;
# "hu" is written ヘ + small ゥ (the header table lists this as "he ~u: hu").
hu ↔ ヘゥ ;
he ↔ ヘ ;
ho ↔ ホ ;
# Disabled generic f-rules, kept for reference; explicit pairs follow.
# f | '~' ← フ } $small_y ;
# f } $vowel → フ | '~' ;
# f + vowel: フ followed by the small vowel kana.
fa ↔ ファ ;
fi ↔ フィ ;
fe ↔ フェ ;
@ -163,8 +234,14 @@ tu ↔ テゥ ;
te ↔ テ ;
to ↔ ト ;
tsu ↔ ツ ;
# Disabled alternative v-rules that built v + vowel from ウ + voiced mark +
# small vowel, kept for reference.
# v } $vowel → ウ\u3099 | '~' ;
#'v~a' ← ウ\u3099ァ ; # liu
#'v~i' ← ウ\u3099ィ ; # liu
#'v~e' ← ウ\u3099ェ ; # liu
#'v~o' ← ウ\u3099ォ ; # liu
# "vu" is ウ + combining voiced sound mark (U+3099).
vu ↔ ウ\u3099 ;
u ↔ ウ ;
# w } $vowel → ウ | '~' ;
wa ↔ ワ ;
wi ↔ ヰ ;
# Forward-only: "wu" maps to plain ウ; in reverse ウ is romanized by "u ↔ ウ".
wu → ウ ;
@ -175,15 +252,20 @@ yi → イ ;
yu ↔ ユ ;
# Forward-only: "ye" maps to plain エ.
ye → エ ;
yo ↔ ヨ ;
# double consonants
# Rule shape: "c } c ↔ ッ } <row>". Forward: a consonant followed ("}") by the
# same consonant becomes small tsu ッ. Reverse: ッ followed by a kana of the
# given row becomes that consonant. The following text is only context and is
# not consumed.
#specials
# Forward-only: "s" before "sh" and "t" before "ch" also become ッ.
s } sh → ッ ;
t } ch → ッ ;
#voiced
# The voiced/semi-voiced rules precede the plain ones because rules are tried
# in order: ッ before a base kana + $voice must match here first.
# NOTE(review): $j_start, $h_start, $f_start, $t_start, $k_start, $s_start,
# $m_start and $n_start are defined outside the visible part of the file.
j } j ↔ ッ } $j_start ;
b } b ↔ ッ } [$h_start$f_start] $voice;
d } d ↔ ッ } $t_start $voice;
g } g ↔ ッ } $k_start $voice;
p } p ↔ ッ } [$h_start$f_start] $semivoice;
# v } v ↔ ッ } [ワヰウヱヲう] $voice ;
z } z ↔ ッ } $s_start $voice;
v } v ↔ ッ } $v_start;
# normal
k } k ↔ ッ } $k_start ;
m } m ↔ ッ } $m_start ;
n } n ↔ ッ } $n_start ;
@ -194,13 +276,24 @@ t } t ↔ ッ } $t_start ;
# Doubled consonant ↔ small tsu ッ: forward, a consonant followed by the same
# consonant becomes ッ; reverse, ッ before a kana of the named row becomes that
# consonant. NOTE(review): $s_start and $y_start are defined outside the
# visible part of the file.
s } s ↔ ッ } $s_start ;
w } w ↔ ッ } $w_start;
y } y ↔ ッ } $y_start;
# completeness
# Forward-only: doubled letters with no kana row of their own (x, c, l, q)
# still produce ッ so that arbitrary Latin input converts to some kana.
x } x → ッ ;
c } k → ッ ;
c } c → ッ ;
c } q → ッ ;
l } l → ッ ;
q } q → ッ ;
# y } y → ッ ;
# w } w → ッ ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
#a ← a { ー ; # liu
#e ← e { ー ; # liu
#i ← i { ー ; # liu
#o ← o { ー ; # liu
#u ← u { ー ; # liu
# Two-way: the combining macron ($macron, U+0304) pairs with the prolonged
# sound mark ー.
$macron ↔ ー ;
# small forms
# "~" + vowel is the explicit romaji spelling of a small vowel kana.
'~a' ↔ ァ ;
'~i' ↔ ィ ;
'~u' ↔ ゥ ;
@ -213,6 +306,8 @@ $macron ↔ ー ;
# "~y" + vowel is the explicit romaji spelling of a small y-kana.
'~yu' ↔ ュ ;
# Forward-only: "~ye" produces small ェ.
'~ye' → ェ ;
'~yo' ↔ ョ ;
# iteration marks
# TODO: make more accurate
# Reverse-only. Text before "{" is left context: an already-romanized
# syllable whose "y* vowel" tail is captured as $1. The voiced iteration mark
# (ヽ + $voice) is replaced by the voiced counterpart of that syllable's
# consonant (sh→j, ch→dj, ts→dz) followed by the captured tail.
j $1 ← sh (y* $vowel) {ヽ$voice ;
dj $1 ← ch (y* $vowel) {ヽ$voice ;
dz $1 ← ts (y* $vowel) {ヽ$voice ;
@ -230,7 +325,16 @@ dz $1 ← dz (y* $vowel) {ヽ$voice ;
# Reverse-only fallbacks for the iteration mark ヽ (optionally followed by a
# voiced mark), tried in this order:
# 1. repeat the preceding romanized syllable (consonant, optional y's, vowel);
$1 ← ($consonant y* $vowel) {ヽ$voice? ;
# 2. failing that, repeat whatever single character precedes the mark;
$1 ← (.) {ヽ $voice? ; # otherwise repeat last character
# 3. with nothing before it, drop the mark.
← ヽ $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
# Forward-only: an "h" directly after a base katakana (plus an optional
# combining voiced/semi-voiced mark) becomes the prolonged sound mark ー.
$voweled_basekana [\u3099 \u309A]? { h → ー ;
# one-way latin- → kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
# Forward-only fallbacks for consonant + "y" with no following vowel.
cy → セィ ;
# テ + combining voiced sound mark (U+3099) + small ィ.
dy → テ\u3099ィ ;
hy → ヒ ;
@ -238,6 +342,8 @@ sy → セィ ;
# Forward-only fallbacks for consonant + "y" with no following vowel.
ty → ティ ;
# セ + combining voiced sound mark (U+3099) + small ィ.
zy → セ\u3099ィ ;
# Forward-only: a bare "h" not consumed by any earlier rule becomes ヘ.
h → ヘ ;
# isolated consonants listed here so as not to mask
# longer rules above.
# Forward-only: digraphs left without a vowel.
ch → チ;
sh → シ ;
# ツ + combining voiced sound mark (U+3099).
dz → ツ\u3099 ;
@ -264,12 +370,22 @@ w → ウ;
# Forward-only rewrites. The "|" before the replacement leaves the cursor in
# front of it, so the substituted Latin text is run through the rules again
# instead of being emitted as-is.
ð → | d ;
ø → | u ;
þ → | th ;
# simple substitutions using backup
# Letters with no kana row of their own are rewritten to a letter (or pair)
# that has one, then reprocessed.
c → | k ;
l → | r ;
q → | k ;
x → | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
# Forward-only: drop any "~" that no earlier rule consumed.
'~' → ; # delete stray tildes between letters
# '' is the rule-syntax escape for one literal apostrophe; it is removed only
# when it sits between a Katakana character and a Latin character.
[:Katakana:] { '' } [:Latin:] → ; # delete stray quotes between letters
# [ʾ[:Nonspacing Mark:]-[\u3099-゜]] → ; # delete any non-spacing marks that we didn't use
# Forward: recompose (NFC). Reverse: decompose (NFD).
:: NFC (NFD) ;
# Reverse-only (the whole rule is parenthesized): apply halfwidth-fullwidth to
# Katakana and the listed sound-mark / prolonged-mark code points.
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E 、。 \u3099-゜ ァ-ー 。-゚ [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
:: ( [[\ -~¢-£¥-¦¬\u0304₩。-하-ᅦᅧ-ᅬᅭ-ᅲᅳ-ᅵ│-○][~、-。がぎぐげござじずぜぞだぢづでどば-ぱび-ぴぶ-ぷべ-ぺぼ-ぽゔ\u3099-゛ゞァ-ヺー-ヾ][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Kannada.txt
# File: Latn_Knda.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
# Transform pipeline, applied in order to the characters admitted by the
# filter on the preceding line.
# Decompose (NFD).
::NFD;
# Lowercase.
::Lower;
# Latin → InterIndic, then InterIndic → Kannada.
::Latin-InterIndic;
::InterIndic-Kannada;
# Recompose the result (NFC).
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Malayalam.txt
# File: Latn_Mlym.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
# Transform pipeline, applied in order to the characters admitted by the
# filter on the preceding line.
# Decompose (NFD).
::NFD;
# Lowercase.
::Lower;
# Latin → InterIndic, then InterIndic → Malayalam.
::Latin-InterIndic;
::InterIndic-Malayalam;
# Recompose the result (NFC).
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Oriya.txt
# File: Latn_Orya.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
# Transform pipeline, applied in order to the characters admitted by the
# filter on the preceding line.
# Decompose (NFD).
::NFD;
# Lowercase.
::Lower;
# Latin → InterIndic, then InterIndic → Oriya.
::Latin-InterIndic;
::InterIndic-Oriya;
# Recompose the result (NFC).
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Tamil.txt
# File: Latn_Taml.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
# Transform pipeline, applied in order to the characters admitted by the
# filter on the preceding line.
# Decompose (NFD).
::NFD;
# Lowercase.
::Lower;
# Latin → InterIndic, then InterIndic → Tamil.
::Latin-InterIndic;
::InterIndic-Tamil;
# Recompose the result (NFC).
::NFC;

View File

@ -1,15 +1,17 @@
# ***************************************************************************
# *
# * Copyright (C) 2004-2015, International Business Machines
# * Copyright (C) 2004-2016, International Business Machines
# * Corporation; Unicode, Inc.; and others. All Rights Reserved.
# *
# ***************************************************************************
# File: Latin_Telugu.txt
# File: Latn_Telu.txt
# Generated from CLDR
#
::['.0-9A-Za-z~À-ÅÇ-ÏÑ-ÖÙ-Ýà-åç-ïñ-öù-ýÿ-ďĒ-ĥĨ-İĴ-ķĹ-ľŃ-ňŌ-őŔ-ťŨ-žƠ-ơƯ-ưǍ-ǜǞ-ǣǦ-ǭǰǴ-ǵǸ-țȞ-ȟȦ-ȳʔ\u0301\u0303-\u0304\u0306-\u0307\u0310\u0314-\u0315\u0323\u0325\u0331\u0341\u0344΅-ΆΈ-ΊΌΎ-ΐά-ΰό-ώϓЃЌЎЙйѓќўӁ-ӂӐ-ӑӖ-ӗӢ-ӣӮ-ӯḀ-ẙẠ-ỹἁἃ-ἅἇἉἋ-ἍἏἑἓ-ἕἙἛ-Ἕἡἣ-ἥἧἩἫ-ἭἯἱἳ-ἵἷἹἻ-ἽἿὁὃ-ὅὉὋ-Ὅὑὓ-ὕὗὙὛὝὟὡὣ-ὥὧὩὫ-ὭὯάέήίόύώᾁᾃ-ᾅᾇᾉᾋ-ᾍᾏᾑᾓ-ᾕᾗᾙᾛ-ᾝᾟᾡᾣ-ᾥᾧᾩᾫ-ᾭᾯ-ᾱᾴᾸ-ᾹΆῄΈΉ῎ῐ-ῑΐῘ-ῙΊ῞ῠ-ῡΰῥῨ-ῩΎ-Ῥ΅ῴΌΏK-Å\uE04D\uE064];
# Transform pipeline, applied in order to the characters admitted by the
# filter on the preceding line.
# Decompose (NFD).
::NFD;
# Lowercase.
::Lower;
# Latin → InterIndic, then InterIndic → Telugu.
::Latin-InterIndic;
::InterIndic-Telugu;
# Recompose the result (NFC).
::NFC;

Some files were not shown because too many files have changed in this diff Show More