Update transliteration support to Unicode 7.0.0.

The transliteration files are now autogenerated from upstream Unicode
data.
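
The new unicode-gen scripts all work from the semicolon-separated records of UnicodeData.txt. The following is a rough sketch of the input stage of such a generator (it is not the code added by this commit, and parse_unicode_data() is a hypothetical helper):

    def parse_unicode_data(path="UnicodeData.txt"):
        """Hypothetical helper: map code point -> (name, decomposition)."""
        entries = {}
        with open(path, encoding="utf-8") as f:
            for line in f:
                fields = line.split(";")
                # Field 0 is the code point in hex, field 1 the character
                # name, field 5 the canonical/compatibility decomposition.
                entries[int(fields[0], 16)] = (fields[1], fields[5])
        return entries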
Carlos O'Donell 2015-12-09 22:27:41 -05:00
parent 40b59cace2
commit dd8e8e5476
18 changed files with 3928 additions and 713 deletions

ChangeLog

@@ -1,3 +1,35 @@
2015-12-09  Mike FABIAN  <mfabian@redhat.com>

	[BZ #16061]
	* unicode-gen/unicode_utils.py: New file.
	* unicode-gen/gen_translit_circle.py: New file.
	* unicode-gen/gen_translit_cjk_compat.py: New file.
	* unicode-gen/gen_translit_combining.py: New file.
	* unicode-gen/gen_translit_compat.py: New file.
	* unicode-gen/gen_translit_font.py: New file.
	* unicode-gen/gen_translit_fraction.py: New file.
	* unicode-gen/gen_unicode_ctype.py: Use unicode_utils.py.
	* unicode-gen/utf8_compatibility.py: Likewise.
	* unicode-gen/utf8_gen.py: Likewise.
	* unicode-gen/Makefile (GENERATED): Add translit_combining
	translit_compat translit_circle translit_cjk_compat translit_font
	translit_fraction.
	(install): Install translit_combining translit_compat translit_circle
	translit_cjk_compat translit_font translit_fraction.
	(UTF-8-report): Reference UnicodeData.txt and EastAsianWidth.txt.
	(translit_combining): New target.
	(translit_compat): New target.
	(translit_circle): New target.
	(translit_cjk_compat): New target.
	(translit_font): New target.
	(translit_fraction): New target.
	* locales/translit_circle: Regenerate.
	* locales/translit_cjk_compat: Regenerate.
	* locales/translit_combining: Regenerate.
	* locales/translit_compat: Regenerate.
	* locales/translit_font: Regenerate.
	* locales/translit_fraction: Regenerate.

2015-12-09  Mike FABIAN  <mfabian@redhat.com>
	    Marko Myllynen  <myllynen@redhat.com>

locales/translit_circle

@@ -2,9 +2,7 @@ escape_char /
comment_char %
% Transliterations of encircled characters.
% Generated through
% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<circle>[^;]*;' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<circle> \([^;]*\);.*$/<U\1> "<U0028 \3 0029>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
% Generated automatically from UnicodeData.txt by gen_translit_circle.py on 2015-12-09 for Unicode 7.0.0.
LC_CTYPE
@@ -156,6 +154,14 @@ translit_start
<U24E9> "<U0028><U007A><U0029>"
% CIRCLED DIGIT ZERO
<U24EA> "<U0028><U0030><U0029>"
% CIRCLED IDEOGRAPH QUESTION
<U3244> "<U0028><U554F><U0029>"
% CIRCLED IDEOGRAPH KINDERGARTEN
<U3245> "<U0028><U5E7C><U0029>"
% CIRCLED IDEOGRAPH SCHOOL
<U3246> "<U0028><U6587><U0029>"
% CIRCLED IDEOGRAPH KOTO
<U3247> "<U0028><U7B8F><U0029>"
% CIRCLED NUMBER TWENTY ONE
<U3251> "<U0028><U0032><U0031><U0029>"
% CIRCLED NUMBER TWENTY TWO
@@ -242,6 +248,12 @@ translit_start
<U327A> "<U0028><U1111><U1161><U0029>"
% CIRCLED HANGUL HIEUH A
<U327B> "<U0028><U1112><U1161><U0029>"
% CIRCLED KOREAN CHARACTER CHAMKO
<U327C> "<U0028><U110E><U1161><U11B7><U1100><U1169><U0029>"
% CIRCLED KOREAN CHARACTER JUEUI
<U327D> "<U0028><U110C><U116E><U110B><U1174><U0029>"
% CIRCLED HANGUL IEUNG U
<U327E> "<U0028><U110B><U116E><U0029>"
% CIRCLED IDEOGRAPH ONE
<U3280> "<U0028><U4E00><U0029>"
% CIRCLED IDEOGRAPH TWO
@@ -464,6 +476,18 @@ translit_start
<U32FD> "<U0028><U30F1><U0029>"
% CIRCLED KATAKANA WO
<U32FE> "<U0028><U30F2><U0029>"
% CIRCLED ITALIC LATIN CAPITAL LETTER C
<U0001F12B> "<U0028><U0043><U0029>"
% CIRCLED ITALIC LATIN CAPITAL LETTER R
<U0001F12C> "<U0028><U0052><U0029>"
% CIRCLED CD
<U0001F12D> "<U0028><U0043><U0044><U0029>"
% CIRCLED WZ
<U0001F12E> "<U0028><U0057><U005A><U0029>"
% CIRCLED IDEOGRAPH ADVANTAGE
<U0001F250> "<U0028><U5F97><U0029>"
% CIRCLED IDEOGRAPH ACCEPT
<U0001F251> "<U0028><U53EF><U0029>"
translit_end
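
Every entry in this file follows mechanically from a <circle> compatibility decomposition in UnicodeData.txt: the decomposed characters are wrapped in U+0028/U+0029 parentheses. A hedged sketch of that step, reusing the hypothetical parse_unicode_data() helper from the note under the commit message (the actual logic lives in gen_translit_circle.py):

    def ucs_symbol(code):
        # Locale source syntax: <UXXXX> in the BMP, <UXXXXXXXX> above it.
        return "<U%04X>" % code if code <= 0xFFFF else "<U%08X>" % code

    def circle_lines(entries):
        # Sketch of the <circle> rule; not the real generator code.
        for code in sorted(entries):
            name, decomposition = entries[code]
            if not decomposition.startswith("<circle>"):
                continue
            chars = [0x0028] + [int(c, 16) for c in decomposition.split()[1:]] + [0x0029]
            yield "%% %s" % name
            yield "%s \"%s\"" % (ucs_symbol(code), "".join(ucs_symbol(c) for c in chars))

For U+24EA this yields exactly the pair of lines shown above for CIRCLED DIGIT ZERO.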

locales/translit_cjk_compat

@@ -2,18 +2,22 @@ escape_char /
comment_char %
% Transliterations of CJK compatibility characters.
% Generated through
% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<square>[^;]*;' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<square> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
% and
% $ grep '[^;]*;CJK COMPATIBILITY IDEOGRAPH[^;]*;[^;]*;[^;]*;[^;]*;[^;]' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;\([^;]*\);.*$/<U\1> <U\3>% \2/' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G' | \
% sed -e 's/<U\(.....\)>/<U000\1>/g'
% Generated automatically from UnicodeData.txt by gen_translit_cjk_compat.py on 2015-12-09 for Unicode 7.0.0.
LC_CTYPE
translit_start
% PARTNERSHIP SIGN
<U3250> "<U0050><U0054><U0045>"
% SQUARE HG
<U32CC> "<U0048><U0067>"
% SQUARE ERG
<U32CD> "<U0065><U0072><U0067>"
% SQUARE EV
<U32CE> "<U0065><U0056>"
% LIMITED LIABILITY SIGN
<U32CF> "<U004C><U0054><U0044>"
% SQUARE APAATO
<U3300> "<U30A2><U30D1><U30FC><U30C8>"
% SQUARE ARUHUA
@@ -202,6 +206,14 @@ translit_start
<U3375> "<U006F><U0056>"
% SQUARE PC
<U3376> "<U0070><U0063>"
% SQUARE DM
<U3377> "<U0064><U006D>"
% SQUARE DM SQUARED
<U3378> "<U0064><U006D><U00B2>";"<U0064><U006D><U005E><U0032>"
% SQUARE DM CUBED
<U3379> "<U0064><U006D><U00B3>";"<U0064><U006D><U005E><U0033>"
% SQUARE IU
<U337A> "<U0049><U0055>"
% SQUARE ERA NAME HEISEI
<U337B> "<U5E73><U6210>"
% SQUARE ERA NAME SYOUWA
@@ -400,6 +412,170 @@ translit_start
<U33DC> "<U0053><U0076>"
% SQUARE WB
<U33DD> "<U0057><U0062>"
% SQUARE V OVER M
<U33DE> "<U0056><U2215><U006D>";"<U0056><U002F><U006D>"
% SQUARE A OVER M
<U33DF> "<U0041><U2215><U006D>";"<U0041><U002F><U006D>"
% SQUARE GAL
<U33FF> "<U0067><U0061><U006C>"
% SQUARED LATIN CAPITAL LETTER A
<U0001F130> <U0041>
% SQUARED LATIN CAPITAL LETTER B
<U0001F131> <U0042>
% SQUARED LATIN CAPITAL LETTER C
<U0001F132> <U0043>
% SQUARED LATIN CAPITAL LETTER D
<U0001F133> <U0044>
% SQUARED LATIN CAPITAL LETTER E
<U0001F134> <U0045>
% SQUARED LATIN CAPITAL LETTER F
<U0001F135> <U0046>
% SQUARED LATIN CAPITAL LETTER G
<U0001F136> <U0047>
% SQUARED LATIN CAPITAL LETTER H
<U0001F137> <U0048>
% SQUARED LATIN CAPITAL LETTER I
<U0001F138> <U0049>
% SQUARED LATIN CAPITAL LETTER J
<U0001F139> <U004A>
% SQUARED LATIN CAPITAL LETTER K
<U0001F13A> <U004B>
% SQUARED LATIN CAPITAL LETTER L
<U0001F13B> <U004C>
% SQUARED LATIN CAPITAL LETTER M
<U0001F13C> <U004D>
% SQUARED LATIN CAPITAL LETTER N
<U0001F13D> <U004E>
% SQUARED LATIN CAPITAL LETTER O
<U0001F13E> <U004F>
% SQUARED LATIN CAPITAL LETTER P
<U0001F13F> <U0050>
% SQUARED LATIN CAPITAL LETTER Q
<U0001F140> <U0051>
% SQUARED LATIN CAPITAL LETTER R
<U0001F141> <U0052>
% SQUARED LATIN CAPITAL LETTER S
<U0001F142> <U0053>
% SQUARED LATIN CAPITAL LETTER T
<U0001F143> <U0054>
% SQUARED LATIN CAPITAL LETTER U
<U0001F144> <U0055>
% SQUARED LATIN CAPITAL LETTER V
<U0001F145> <U0056>
% SQUARED LATIN CAPITAL LETTER W
<U0001F146> <U0057>
% SQUARED LATIN CAPITAL LETTER X
<U0001F147> <U0058>
% SQUARED LATIN CAPITAL LETTER Y
<U0001F148> <U0059>
% SQUARED LATIN CAPITAL LETTER Z
<U0001F149> <U005A>
% SQUARED HV
<U0001F14A> "<U0048><U0056>"
% SQUARED MV
<U0001F14B> "<U004D><U0056>"
% SQUARED SD
<U0001F14C> "<U0053><U0044>"
% SQUARED SS
<U0001F14D> "<U0053><U0053>"
% SQUARED PPV
<U0001F14E> "<U0050><U0050><U0056>"
% SQUARED WC
<U0001F14F> "<U0057><U0043>"
% SQUARE DJ
<U0001F190> "<U0044><U004A>"
% SQUARE HIRAGANA HOKA
<U0001F200> "<U307B><U304B>"
% SQUARED KATAKANA KOKO
<U0001F201> "<U30B3><U30B3>"
% SQUARED KATAKANA SA
<U0001F202> <U30B5>
% SQUARED CJK UNIFIED IDEOGRAPH-624B
<U0001F210> <U624B>
% SQUARED CJK UNIFIED IDEOGRAPH-5B57
<U0001F211> <U5B57>
% SQUARED CJK UNIFIED IDEOGRAPH-53CC
<U0001F212> <U53CC>
% SQUARED KATAKANA DE
<U0001F213> <U30C7>
% SQUARED CJK UNIFIED IDEOGRAPH-4E8C
<U0001F214> <U4E8C>
% SQUARED CJK UNIFIED IDEOGRAPH-591A
<U0001F215> <U591A>
% SQUARED CJK UNIFIED IDEOGRAPH-89E3
<U0001F216> <U89E3>
% SQUARED CJK UNIFIED IDEOGRAPH-5929
<U0001F217> <U5929>
% SQUARED CJK UNIFIED IDEOGRAPH-4EA4
<U0001F218> <U4EA4>
% SQUARED CJK UNIFIED IDEOGRAPH-6620
<U0001F219> <U6620>
% SQUARED CJK UNIFIED IDEOGRAPH-7121
<U0001F21A> <U7121>
% SQUARED CJK UNIFIED IDEOGRAPH-6599
<U0001F21B> <U6599>
% SQUARED CJK UNIFIED IDEOGRAPH-524D
<U0001F21C> <U524D>
% SQUARED CJK UNIFIED IDEOGRAPH-5F8C
<U0001F21D> <U5F8C>
% SQUARED CJK UNIFIED IDEOGRAPH-518D
<U0001F21E> <U518D>
% SQUARED CJK UNIFIED IDEOGRAPH-65B0
<U0001F21F> <U65B0>
% SQUARED CJK UNIFIED IDEOGRAPH-521D
<U0001F220> <U521D>
% SQUARED CJK UNIFIED IDEOGRAPH-7D42
<U0001F221> <U7D42>
% SQUARED CJK UNIFIED IDEOGRAPH-751F
<U0001F222> <U751F>
% SQUARED CJK UNIFIED IDEOGRAPH-8CA9
<U0001F223> <U8CA9>
% SQUARED CJK UNIFIED IDEOGRAPH-58F0
<U0001F224> <U58F0>
% SQUARED CJK UNIFIED IDEOGRAPH-5439
<U0001F225> <U5439>
% SQUARED CJK UNIFIED IDEOGRAPH-6F14
<U0001F226> <U6F14>
% SQUARED CJK UNIFIED IDEOGRAPH-6295
<U0001F227> <U6295>
% SQUARED CJK UNIFIED IDEOGRAPH-6355
<U0001F228> <U6355>
% SQUARED CJK UNIFIED IDEOGRAPH-4E00
<U0001F229> <U4E00>
% SQUARED CJK UNIFIED IDEOGRAPH-4E09
<U0001F22A> <U4E09>
% SQUARED CJK UNIFIED IDEOGRAPH-904A
<U0001F22B> <U904A>
% SQUARED CJK UNIFIED IDEOGRAPH-5DE6
<U0001F22C> <U5DE6>
% SQUARED CJK UNIFIED IDEOGRAPH-4E2D
<U0001F22D> <U4E2D>
% SQUARED CJK UNIFIED IDEOGRAPH-53F3
<U0001F22E> <U53F3>
% SQUARED CJK UNIFIED IDEOGRAPH-6307
<U0001F22F> <U6307>
% SQUARED CJK UNIFIED IDEOGRAPH-8D70
<U0001F230> <U8D70>
% SQUARED CJK UNIFIED IDEOGRAPH-6253
<U0001F231> <U6253>
% SQUARED CJK UNIFIED IDEOGRAPH-7981
<U0001F232> <U7981>
% SQUARED CJK UNIFIED IDEOGRAPH-7A7A
<U0001F233> <U7A7A>
% SQUARED CJK UNIFIED IDEOGRAPH-5408
<U0001F234> <U5408>
% SQUARED CJK UNIFIED IDEOGRAPH-6E80
<U0001F235> <U6E80>
% SQUARED CJK UNIFIED IDEOGRAPH-6709
<U0001F236> <U6709>
% SQUARED CJK UNIFIED IDEOGRAPH-6708
<U0001F237> <U6708>
% SQUARED CJK UNIFIED IDEOGRAPH-7533
<U0001F238> <U7533>
% SQUARED CJK UNIFIED IDEOGRAPH-5272
<U0001F239> <U5272>
% SQUARED CJK UNIFIED IDEOGRAPH-55B6
<U0001F23A> <U55B6>
% CJK COMPATIBILITY IDEOGRAPH-F900
<UF900> <U8C48>
% CJK COMPATIBILITY IDEOGRAPH-F901
@@ -980,6 +1156,10 @@ translit_start
<UFA2C> <U9928>
% CJK COMPATIBILITY IDEOGRAPH-FA2D
<UFA2D> <U9DB4>
% CJK COMPATIBILITY IDEOGRAPH-FA2E
<UFA2E> <U90DE>
% CJK COMPATIBILITY IDEOGRAPH-FA2F
<UFA2F> <U96B7>
% CJK COMPATIBILITY IDEOGRAPH-FA30
<UFA30> <U4FAE>
% CJK COMPATIBILITY IDEOGRAPH-FA31
@@ -1098,6 +1278,224 @@ translit_start
<UFA69> <U97FF>
% CJK COMPATIBILITY IDEOGRAPH-FA6A
<UFA6A> <U983B>
% CJK COMPATIBILITY IDEOGRAPH-FA6B
<UFA6B> <U6075>
% CJK COMPATIBILITY IDEOGRAPH-FA6C
<UFA6C> <U000242EE>
% CJK COMPATIBILITY IDEOGRAPH-FA6D
<UFA6D> <U8218>
% CJK COMPATIBILITY IDEOGRAPH-FA70
<UFA70> <U4E26>
% CJK COMPATIBILITY IDEOGRAPH-FA71
<UFA71> <U51B5>
% CJK COMPATIBILITY IDEOGRAPH-FA72
<UFA72> <U5168>
% CJK COMPATIBILITY IDEOGRAPH-FA73
<UFA73> <U4F80>
% CJK COMPATIBILITY IDEOGRAPH-FA74
<UFA74> <U5145>
% CJK COMPATIBILITY IDEOGRAPH-FA75
<UFA75> <U5180>
% CJK COMPATIBILITY IDEOGRAPH-FA76
<UFA76> <U52C7>
% CJK COMPATIBILITY IDEOGRAPH-FA77
<UFA77> <U52FA>
% CJK COMPATIBILITY IDEOGRAPH-FA78
<UFA78> <U559D>
% CJK COMPATIBILITY IDEOGRAPH-FA79
<UFA79> <U5555>
% CJK COMPATIBILITY IDEOGRAPH-FA7A
<UFA7A> <U5599>
% CJK COMPATIBILITY IDEOGRAPH-FA7B
<UFA7B> <U55E2>
% CJK COMPATIBILITY IDEOGRAPH-FA7C
<UFA7C> <U585A>
% CJK COMPATIBILITY IDEOGRAPH-FA7D
<UFA7D> <U58B3>
% CJK COMPATIBILITY IDEOGRAPH-FA7E
<UFA7E> <U5944>
% CJK COMPATIBILITY IDEOGRAPH-FA7F
<UFA7F> <U5954>
% CJK COMPATIBILITY IDEOGRAPH-FA80
<UFA80> <U5A62>
% CJK COMPATIBILITY IDEOGRAPH-FA81
<UFA81> <U5B28>
% CJK COMPATIBILITY IDEOGRAPH-FA82
<UFA82> <U5ED2>
% CJK COMPATIBILITY IDEOGRAPH-FA83
<UFA83> <U5ED9>
% CJK COMPATIBILITY IDEOGRAPH-FA84
<UFA84> <U5F69>
% CJK COMPATIBILITY IDEOGRAPH-FA85
<UFA85> <U5FAD>
% CJK COMPATIBILITY IDEOGRAPH-FA86
<UFA86> <U60D8>
% CJK COMPATIBILITY IDEOGRAPH-FA87
<UFA87> <U614E>
% CJK COMPATIBILITY IDEOGRAPH-FA88
<UFA88> <U6108>
% CJK COMPATIBILITY IDEOGRAPH-FA89
<UFA89> <U618E>
% CJK COMPATIBILITY IDEOGRAPH-FA8A
<UFA8A> <U6160>
% CJK COMPATIBILITY IDEOGRAPH-FA8B
<UFA8B> <U61F2>
% CJK COMPATIBILITY IDEOGRAPH-FA8C
<UFA8C> <U6234>
% CJK COMPATIBILITY IDEOGRAPH-FA8D
<UFA8D> <U63C4>
% CJK COMPATIBILITY IDEOGRAPH-FA8E
<UFA8E> <U641C>
% CJK COMPATIBILITY IDEOGRAPH-FA8F
<UFA8F> <U6452>
% CJK COMPATIBILITY IDEOGRAPH-FA90
<UFA90> <U6556>
% CJK COMPATIBILITY IDEOGRAPH-FA91
<UFA91> <U6674>
% CJK COMPATIBILITY IDEOGRAPH-FA92
<UFA92> <U6717>
% CJK COMPATIBILITY IDEOGRAPH-FA93
<UFA93> <U671B>
% CJK COMPATIBILITY IDEOGRAPH-FA94
<UFA94> <U6756>
% CJK COMPATIBILITY IDEOGRAPH-FA95
<UFA95> <U6B79>
% CJK COMPATIBILITY IDEOGRAPH-FA96
<UFA96> <U6BBA>
% CJK COMPATIBILITY IDEOGRAPH-FA97
<UFA97> <U6D41>
% CJK COMPATIBILITY IDEOGRAPH-FA98
<UFA98> <U6EDB>
% CJK COMPATIBILITY IDEOGRAPH-FA99
<UFA99> <U6ECB>
% CJK COMPATIBILITY IDEOGRAPH-FA9A
<UFA9A> <U6F22>
% CJK COMPATIBILITY IDEOGRAPH-FA9B
<UFA9B> <U701E>
% CJK COMPATIBILITY IDEOGRAPH-FA9C
<UFA9C> <U716E>
% CJK COMPATIBILITY IDEOGRAPH-FA9D
<UFA9D> <U77A7>
% CJK COMPATIBILITY IDEOGRAPH-FA9E
<UFA9E> <U7235>
% CJK COMPATIBILITY IDEOGRAPH-FA9F
<UFA9F> <U72AF>
% CJK COMPATIBILITY IDEOGRAPH-FAA0
<UFAA0> <U732A>
% CJK COMPATIBILITY IDEOGRAPH-FAA1
<UFAA1> <U7471>
% CJK COMPATIBILITY IDEOGRAPH-FAA2
<UFAA2> <U7506>
% CJK COMPATIBILITY IDEOGRAPH-FAA3
<UFAA3> <U753B>
% CJK COMPATIBILITY IDEOGRAPH-FAA4
<UFAA4> <U761D>
% CJK COMPATIBILITY IDEOGRAPH-FAA5
<UFAA5> <U761F>
% CJK COMPATIBILITY IDEOGRAPH-FAA6
<UFAA6> <U76CA>
% CJK COMPATIBILITY IDEOGRAPH-FAA7
<UFAA7> <U76DB>
% CJK COMPATIBILITY IDEOGRAPH-FAA8
<UFAA8> <U76F4>
% CJK COMPATIBILITY IDEOGRAPH-FAA9
<UFAA9> <U774A>
% CJK COMPATIBILITY IDEOGRAPH-FAAA
<UFAAA> <U7740>
% CJK COMPATIBILITY IDEOGRAPH-FAAB
<UFAAB> <U78CC>
% CJK COMPATIBILITY IDEOGRAPH-FAAC
<UFAAC> <U7AB1>
% CJK COMPATIBILITY IDEOGRAPH-FAAD
<UFAAD> <U7BC0>
% CJK COMPATIBILITY IDEOGRAPH-FAAE
<UFAAE> <U7C7B>
% CJK COMPATIBILITY IDEOGRAPH-FAAF
<UFAAF> <U7D5B>
% CJK COMPATIBILITY IDEOGRAPH-FAB0
<UFAB0> <U7DF4>
% CJK COMPATIBILITY IDEOGRAPH-FAB1
<UFAB1> <U7F3E>
% CJK COMPATIBILITY IDEOGRAPH-FAB2
<UFAB2> <U8005>
% CJK COMPATIBILITY IDEOGRAPH-FAB3
<UFAB3> <U8352>
% CJK COMPATIBILITY IDEOGRAPH-FAB4
<UFAB4> <U83EF>
% CJK COMPATIBILITY IDEOGRAPH-FAB5
<UFAB5> <U8779>
% CJK COMPATIBILITY IDEOGRAPH-FAB6
<UFAB6> <U8941>
% CJK COMPATIBILITY IDEOGRAPH-FAB7
<UFAB7> <U8986>
% CJK COMPATIBILITY IDEOGRAPH-FAB8
<UFAB8> <U8996>
% CJK COMPATIBILITY IDEOGRAPH-FAB9
<UFAB9> <U8ABF>
% CJK COMPATIBILITY IDEOGRAPH-FABA
<UFABA> <U8AF8>
% CJK COMPATIBILITY IDEOGRAPH-FABB
<UFABB> <U8ACB>
% CJK COMPATIBILITY IDEOGRAPH-FABC
<UFABC> <U8B01>
% CJK COMPATIBILITY IDEOGRAPH-FABD
<UFABD> <U8AFE>
% CJK COMPATIBILITY IDEOGRAPH-FABE
<UFABE> <U8AED>
% CJK COMPATIBILITY IDEOGRAPH-FABF
<UFABF> <U8B39>
% CJK COMPATIBILITY IDEOGRAPH-FAC0
<UFAC0> <U8B8A>
% CJK COMPATIBILITY IDEOGRAPH-FAC1
<UFAC1> <U8D08>
% CJK COMPATIBILITY IDEOGRAPH-FAC2
<UFAC2> <U8F38>
% CJK COMPATIBILITY IDEOGRAPH-FAC3
<UFAC3> <U9072>
% CJK COMPATIBILITY IDEOGRAPH-FAC4
<UFAC4> <U9199>
% CJK COMPATIBILITY IDEOGRAPH-FAC5
<UFAC5> <U9276>
% CJK COMPATIBILITY IDEOGRAPH-FAC6
<UFAC6> <U967C>
% CJK COMPATIBILITY IDEOGRAPH-FAC7
<UFAC7> <U96E3>
% CJK COMPATIBILITY IDEOGRAPH-FAC8
<UFAC8> <U9756>
% CJK COMPATIBILITY IDEOGRAPH-FAC9
<UFAC9> <U97DB>
% CJK COMPATIBILITY IDEOGRAPH-FACA
<UFACA> <U97FF>
% CJK COMPATIBILITY IDEOGRAPH-FACB
<UFACB> <U980B>
% CJK COMPATIBILITY IDEOGRAPH-FACC
<UFACC> <U983B>
% CJK COMPATIBILITY IDEOGRAPH-FACD
<UFACD> <U9B12>
% CJK COMPATIBILITY IDEOGRAPH-FACE
<UFACE> <U9F9C>
% CJK COMPATIBILITY IDEOGRAPH-FACF
<UFACF> <U0002284A>
% CJK COMPATIBILITY IDEOGRAPH-FAD0
<UFAD0> <U00022844>
% CJK COMPATIBILITY IDEOGRAPH-FAD1
<UFAD1> <U000233D5>
% CJK COMPATIBILITY IDEOGRAPH-FAD2
<UFAD2> <U3B9D>
% CJK COMPATIBILITY IDEOGRAPH-FAD3
<UFAD3> <U4018>
% CJK COMPATIBILITY IDEOGRAPH-FAD4
<UFAD4> <U4039>
% CJK COMPATIBILITY IDEOGRAPH-FAD5
<UFAD5> <U00025249>
% CJK COMPATIBILITY IDEOGRAPH-FAD6
<UFAD6> <U00025CD0>
% CJK COMPATIBILITY IDEOGRAPH-FAD7
<UFAD7> <U00027ED3>
% CJK COMPATIBILITY IDEOGRAPH-FAD8
<UFAD8> <U9F43>
% CJK COMPATIBILITY IDEOGRAPH-FAD9
<UFAD9> <U9F8E>
% CJK COMPATIBILITY IDEOGRAPH-2F800
<U0002F800> <U4E3D>
% CJK COMPATIBILITY IDEOGRAPH-2F801
@@ -1307,7 +1705,7 @@ translit_start
% CJK COMPATIBILITY IDEOGRAPH-2F867
<U0002F867> <U36EE>
% CJK COMPATIBILITY IDEOGRAPH-2F868
<U0002F868> <U0002136A>
<U0002F868> <U36FC>
% CJK COMPATIBILITY IDEOGRAPH-2F869
<U0002F869> <U5B08>
% CJK COMPATIBILITY IDEOGRAPH-2F86A
@@ -1331,7 +1729,7 @@ translit_start
% CJK COMPATIBILITY IDEOGRAPH-2F873
<U0002F873> <U5C06>
% CJK COMPATIBILITY IDEOGRAPH-2F874
<U0002F874> <U5F33>
<U0002F874> <U5F53>
% CJK COMPATIBILITY IDEOGRAPH-2F875
<U0002F875> <U5C22>
% CJK COMPATIBILITY IDEOGRAPH-2F876
@@ -1673,7 +2071,7 @@ translit_start
% CJK COMPATIBILITY IDEOGRAPH-2F91E
<U0002F91E> <U719C>
% CJK COMPATIBILITY IDEOGRAPH-2F91F
<U0002F91F> <U43AB>
<U0002F91F> <U000243AB>
% CJK COMPATIBILITY IDEOGRAPH-2F920
<U0002F920> <U7228>
% CJK COMPATIBILITY IDEOGRAPH-2F921
@@ -1801,7 +2199,7 @@ translit_start
% CJK COMPATIBILITY IDEOGRAPH-2F95E
<U0002F95E> <U00025AA7>
% CJK COMPATIBILITY IDEOGRAPH-2F95F
<U0002F95F> <U7AAE>
<U0002F95F> <U7AEE>
% CJK COMPATIBILITY IDEOGRAPH-2F960
<U0002F960> <U4202>
% CJK COMPATIBILITY IDEOGRAPH-2F961
@@ -1993,7 +2391,7 @@ translit_start
% CJK COMPATIBILITY IDEOGRAPH-2F9BE
<U0002F9BE> <U8786>
% CJK COMPATIBILITY IDEOGRAPH-2F9BF
<U0002F9BF> <U4D57>
<U0002F9BF> <U45D7>
% CJK COMPATIBILITY IDEOGRAPH-2F9C0
<U0002F9C0> <U87E1>
% CJK COMPATIBILITY IDEOGRAPH-2F9C1

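The one-line corrections scattered through this file (<U0002F868>, <U0002F874>, <U0002F91F>, <U0002F95F> and <U0002F9BF> above) are where autogeneration pays off: each new value is the canonical decomposition recorded in current UnicodeData.txt. They can be spot-checked against the Unicode database bundled with Python; this check is an illustration, not part of the commit:

    import unicodedata

    # Each corrected entry matches the character's canonical decomposition.
    for cjk, expected in [("\U0002F868", "36FC"), ("\U0002F874", "5F53"),
                          ("\U0002F91F", "243AB"), ("\U0002F95F", "7AEE"),
                          ("\U0002F9BF", "45D7")]:
        assert unicodedata.decomposition(cjk) == expected
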
locales/translit_combining

@@ -3,7 +3,7 @@ comment_char %
% Transliterations that remove all combining characters (accents,
% pronunciation marks, etc.).
% Generated from UnicodeData.txt.
% Generated automatically from UnicodeData.txt by gen_translit_combining.py on 2015-12-09 for Unicode 7.0.0.
LC_CTYPE
@@ -167,6 +167,40 @@ translit_start
<U034D> ""
% COMBINING UPWARDS ARROW BELOW
<U034E> ""
% COMBINING GRAPHEME JOINER
<U034F> ""
% COMBINING RIGHT ARROWHEAD ABOVE
<U0350> ""
% COMBINING LEFT HALF RING ABOVE
<U0351> ""
% COMBINING FERMATA
<U0352> ""
% COMBINING X BELOW
<U0353> ""
% COMBINING LEFT ARROWHEAD BELOW
<U0354> ""
% COMBINING RIGHT ARROWHEAD BELOW
<U0355> ""
% COMBINING RIGHT ARROWHEAD AND UP ARROWHEAD BELOW
<U0356> ""
% COMBINING RIGHT HALF RING ABOVE
<U0357> ""
% COMBINING DOT ABOVE RIGHT
<U0358> ""
% COMBINING ASTERISK BELOW
<U0359> ""
% COMBINING DOUBLE RING BELOW
<U035A> ""
% COMBINING ZIGZAG ABOVE
<U035B> ""
% COMBINING DOUBLE BREVE BELOW
<U035C> ""
% COMBINING DOUBLE BREVE
<U035D> ""
% COMBINING DOUBLE MACRON
<U035E> ""
% COMBINING DOUBLE MACRON BELOW
<U035F> ""
% COMBINING DOUBLE TILDE
<U0360> ""
% COMBINING DOUBLE INVERTED BREVE
@@ -199,6 +233,68 @@ translit_start
<U036E> ""
% COMBINING LATIN SMALL LETTER X
<U036F> ""
% HEBREW ACCENT ETNAHTA
<U0591> ""
% HEBREW ACCENT SEGOL
<U0592> ""
% HEBREW ACCENT SHALSHELET
<U0593> ""
% HEBREW ACCENT ZAQEF QATAN
<U0594> ""
% HEBREW ACCENT ZAQEF GADOL
<U0595> ""
% HEBREW ACCENT TIPEHA
<U0596> ""
% HEBREW ACCENT REVIA
<U0597> ""
% HEBREW ACCENT ZARQA
<U0598> ""
% HEBREW ACCENT PASHTA
<U0599> ""
% HEBREW ACCENT YETIV
<U059A> ""
% HEBREW ACCENT TEVIR
<U059B> ""
% HEBREW ACCENT GERESH
<U059C> ""
% HEBREW ACCENT GERESH MUQDAM
<U059D> ""
% HEBREW ACCENT GERSHAYIM
<U059E> ""
% HEBREW ACCENT QARNEY PARA
<U059F> ""
% HEBREW ACCENT TELISHA GEDOLA
<U05A0> ""
% HEBREW ACCENT PAZER
<U05A1> ""
% HEBREW ACCENT ATNAH HAFUKH
<U05A2> ""
% HEBREW ACCENT MUNAH
<U05A3> ""
% HEBREW ACCENT MAHAPAKH
<U05A4> ""
% HEBREW ACCENT MERKHA
<U05A5> ""
% HEBREW ACCENT MERKHA KEFULA
<U05A6> ""
% HEBREW ACCENT DARGA
<U05A7> ""
% HEBREW ACCENT QADMA
<U05A8> ""
% HEBREW ACCENT TELISHA QETANA
<U05A9> ""
% HEBREW ACCENT YERAH BEN YOMO
<U05AA> ""
% HEBREW ACCENT OLE
<U05AB> ""
% HEBREW ACCENT ILUY
<U05AC> ""
% HEBREW ACCENT DEHI
<U05AD> ""
% HEBREW ACCENT ZINOR
<U05AE> ""
% HEBREW MARK MASORA CIRCLE
<U05AF> ""
% HEBREW POINT SHEVA
<U05B0> ""
% HEBREW POINT HATAF SEGOL
@@ -219,6 +315,8 @@ translit_start
<U05B8> ""
% HEBREW POINT HOLAM
<U05B9> ""
% HEBREW POINT HOLAM HASER FOR VAV
<U05BA> ""
% HEBREW POINT QUBUTS
<U05BB> ""
% HEBREW POINT DAGESH OR MAPIQ
@@ -231,12 +329,358 @@ translit_start
<U05C1> ""
% HEBREW POINT SIN DOT
<U05C2> ""
% HEBREW MARK UPPER DOT
<U05C4> ""
% HEBREW MARK LOWER DOT
<U05C5> ""
% HEBREW POINT QAMATS QATAN
<U05C7> ""
% ARABIC SIGN SALLALLAHOU ALAYHE WASSALLAM
<U0610> ""
% ARABIC SIGN ALAYHE ASSALLAM
<U0611> ""
% ARABIC SIGN RAHMATULLAH ALAYHE
<U0612> ""
% ARABIC SIGN RADI ALLAHOU ANHU
<U0613> ""
% ARABIC SIGN TAKHALLUS
<U0614> ""
% ARABIC SMALL HIGH TAH
<U0615> ""
% ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
<U0616> ""
% ARABIC SMALL HIGH ZAIN
<U0617> ""
% ARABIC SMALL FATHA
<U0618> ""
% ARABIC SMALL DAMMA
<U0619> ""
% ARABIC SMALL KASRA
<U061A> ""
% ARABIC FATHATAN
<U064B> ""
% ARABIC DAMMATAN
<U064C> ""
% ARABIC KASRATAN
<U064D> ""
% ARABIC FATHA
<U064E> ""
% ARABIC DAMMA
<U064F> ""
% ARABIC KASRA
<U0650> ""
% ARABIC SHADDA
<U0651> ""
% ARABIC SUKUN
<U0652> ""
% ARABIC MADDAH ABOVE
<U0653> ""
% ARABIC HAMZA ABOVE
<U0654> ""
% ARABIC HAMZA BELOW
<U0655> ""
% ARABIC SUBSCRIPT ALEF
<U0656> ""
% ARABIC INVERTED DAMMA
<U0657> ""
% ARABIC MARK NOON GHUNNA
<U0658> ""
% ARABIC ZWARAKAY
<U0659> ""
% ARABIC VOWEL SIGN SMALL V ABOVE
<U065A> ""
% ARABIC VOWEL SIGN INVERTED SMALL V ABOVE
<U065B> ""
% ARABIC VOWEL SIGN DOT BELOW
<U065C> ""
% ARABIC REVERSED DAMMA
<U065D> ""
% ARABIC FATHA WITH TWO DOTS
<U065E> ""
% ARABIC WAVY HAMZA BELOW
<U065F> ""
% ARABIC LETTER SUPERSCRIPT ALEF
<U0670> ""
% ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
<U06D6> ""
% ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
<U06D7> ""
% ARABIC SMALL HIGH MEEM INITIAL FORM
<U06D8> ""
% ARABIC SMALL HIGH LAM ALEF
<U06D9> ""
% ARABIC SMALL HIGH JEEM
<U06DA> ""
% ARABIC SMALL HIGH THREE DOTS
<U06DB> ""
% ARABIC SMALL HIGH SEEN
<U06DC> ""
% ARABIC SMALL HIGH ROUNDED ZERO
<U06DF> ""
% ARABIC SMALL HIGH UPRIGHT RECTANGULAR ZERO
<U06E0> ""
% ARABIC SMALL HIGH DOTLESS HEAD OF KHAH
<U06E1> ""
% ARABIC SMALL HIGH MEEM ISOLATED FORM
<U06E2> ""
% ARABIC SMALL LOW SEEN
<U06E3> ""
% ARABIC SMALL HIGH MADDA
<U06E4> ""
% ARABIC SMALL HIGH YEH
<U06E7> ""
% ARABIC SMALL HIGH NOON
<U06E8> ""
% ARABIC EMPTY CENTRE LOW STOP
<U06EA> ""
% ARABIC EMPTY CENTRE HIGH STOP
<U06EB> ""
% ARABIC ROUNDED HIGH STOP WITH FILLED CENTRE
<U06EC> ""
% ARABIC SMALL LOW MEEM
<U06ED> ""
% ARABIC CURLY FATHA
<U08E4> ""
% ARABIC CURLY DAMMA
<U08E5> ""
% ARABIC CURLY KASRA
<U08E6> ""
% ARABIC CURLY FATHATAN
<U08E7> ""
% ARABIC CURLY DAMMATAN
<U08E8> ""
% ARABIC CURLY KASRATAN
<U08E9> ""
% ARABIC TONE ONE DOT ABOVE
<U08EA> ""
% ARABIC TONE TWO DOTS ABOVE
<U08EB> ""
% ARABIC TONE LOOP ABOVE
<U08EC> ""
% ARABIC TONE ONE DOT BELOW
<U08ED> ""
% ARABIC TONE TWO DOTS BELOW
<U08EE> ""
% ARABIC TONE LOOP BELOW
<U08EF> ""
% ARABIC OPEN FATHATAN
<U08F0> ""
% ARABIC OPEN DAMMATAN
<U08F1> ""
% ARABIC OPEN KASRATAN
<U08F2> ""
% ARABIC SMALL HIGH WAW
<U08F3> ""
% ARABIC FATHA WITH RING
<U08F4> ""
% ARABIC FATHA WITH DOT ABOVE
<U08F5> ""
% ARABIC KASRA WITH DOT BELOW
<U08F6> ""
% ARABIC LEFT ARROWHEAD ABOVE
<U08F7> ""
% ARABIC RIGHT ARROWHEAD ABOVE
<U08F8> ""
% ARABIC LEFT ARROWHEAD BELOW
<U08F9> ""
% ARABIC RIGHT ARROWHEAD BELOW
<U08FA> ""
% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE
<U08FB> ""
% ARABIC DOUBLE RIGHT ARROWHEAD ABOVE WITH DOT
<U08FC> ""
% ARABIC RIGHT ARROWHEAD ABOVE WITH DOT
<U08FD> ""
% ARABIC DAMMA WITH DOT
<U08FE> ""
% ARABIC MARK SIDEWAYS NOON GHUNNA
<U08FF> ""
% COMBINING DOUBLED CIRCUMFLEX ACCENT
<U1AB0> ""
% COMBINING DIAERESIS-RING
<U1AB1> ""
% COMBINING INFINITY
<U1AB2> ""
% COMBINING DOWNWARDS ARROW
<U1AB3> ""
% COMBINING TRIPLE DOT
<U1AB4> ""
% COMBINING X-X BELOW
<U1AB5> ""
% COMBINING WIGGLY LINE BELOW
<U1AB6> ""
% COMBINING OPEN MARK BELOW
<U1AB7> ""
% COMBINING DOUBLE OPEN MARK BELOW
<U1AB8> ""
% COMBINING LIGHT CENTRALIZATION STROKE BELOW
<U1AB9> ""
% COMBINING STRONG CENTRALIZATION STROKE BELOW
<U1ABA> ""
% COMBINING PARENTHESES ABOVE
<U1ABB> ""
% COMBINING DOUBLE PARENTHESES ABOVE
<U1ABC> ""
% COMBINING PARENTHESES BELOW
<U1ABD> ""
% COMBINING PARENTHESES OVERLAY
<U1ABE> ""
% COMBINING DOTTED GRAVE ACCENT
<U1DC0> ""
% COMBINING DOTTED ACUTE ACCENT
<U1DC1> ""
% COMBINING SNAKE BELOW
<U1DC2> ""
% COMBINING SUSPENSION MARK
<U1DC3> ""
% COMBINING MACRON-ACUTE
<U1DC4> ""
% COMBINING GRAVE-MACRON
<U1DC5> ""
% COMBINING MACRON-GRAVE
<U1DC6> ""
% COMBINING ACUTE-MACRON
<U1DC7> ""
% COMBINING GRAVE-ACUTE-GRAVE
<U1DC8> ""
% COMBINING ACUTE-GRAVE-ACUTE
<U1DC9> ""
% COMBINING LATIN SMALL LETTER R BELOW
<U1DCA> ""
% COMBINING BREVE-MACRON
<U1DCB> ""
% COMBINING MACRON-BREVE
<U1DCC> ""
% COMBINING DOUBLE CIRCUMFLEX ABOVE
<U1DCD> ""
% COMBINING OGONEK ABOVE
<U1DCE> ""
% COMBINING ZIGZAG BELOW
<U1DCF> ""
% COMBINING IS BELOW
<U1DD0> ""
% COMBINING UR ABOVE
<U1DD1> ""
% COMBINING US ABOVE
<U1DD2> ""
% COMBINING LATIN SMALL LETTER FLATTENED OPEN A ABOVE
<U1DD3> ""
% COMBINING LATIN SMALL LETTER AE
<U1DD4> ""
% COMBINING LATIN SMALL LETTER AO
<U1DD5> ""
% COMBINING LATIN SMALL LETTER AV
<U1DD6> ""
% COMBINING LATIN SMALL LETTER C CEDILLA
<U1DD7> ""
% COMBINING LATIN SMALL LETTER INSULAR D
<U1DD8> ""
% COMBINING LATIN SMALL LETTER ETH
<U1DD9> ""
% COMBINING LATIN SMALL LETTER G
<U1DDA> ""
% COMBINING LATIN LETTER SMALL CAPITAL G
<U1DDB> ""
% COMBINING LATIN SMALL LETTER K
<U1DDC> ""
% COMBINING LATIN SMALL LETTER L
<U1DDD> ""
% COMBINING LATIN LETTER SMALL CAPITAL L
<U1DDE> ""
% COMBINING LATIN LETTER SMALL CAPITAL M
<U1DDF> ""
% COMBINING LATIN SMALL LETTER N
<U1DE0> ""
% COMBINING LATIN LETTER SMALL CAPITAL N
<U1DE1> ""
% COMBINING LATIN LETTER SMALL CAPITAL R
<U1DE2> ""
% COMBINING LATIN SMALL LETTER R ROTUNDA
<U1DE3> ""
% COMBINING LATIN SMALL LETTER S
<U1DE4> ""
% COMBINING LATIN SMALL LETTER LONG S
<U1DE5> ""
% COMBINING LATIN SMALL LETTER Z
<U1DE6> ""
% COMBINING LATIN SMALL LETTER ALPHA
<U1DE7> ""
% COMBINING LATIN SMALL LETTER B
<U1DE8> ""
% COMBINING LATIN SMALL LETTER BETA
<U1DE9> ""
% COMBINING LATIN SMALL LETTER SCHWA
<U1DEA> ""
% COMBINING LATIN SMALL LETTER F
<U1DEB> ""
% COMBINING LATIN SMALL LETTER L WITH DOUBLE MIDDLE TILDE
<U1DEC> ""
% COMBINING LATIN SMALL LETTER O WITH LIGHT CENTRALIZATION STROKE
<U1DED> ""
% COMBINING LATIN SMALL LETTER P
<U1DEE> ""
% COMBINING LATIN SMALL LETTER ESH
<U1DEF> ""
% COMBINING LATIN SMALL LETTER U WITH LIGHT CENTRALIZATION STROKE
<U1DF0> ""
% COMBINING LATIN SMALL LETTER W
<U1DF1> ""
% COMBINING LATIN SMALL LETTER A WITH DIAERESIS
<U1DF2> ""
% COMBINING LATIN SMALL LETTER O WITH DIAERESIS
<U1DF3> ""
% COMBINING LATIN SMALL LETTER U WITH DIAERESIS
<U1DF4> ""
% COMBINING UP TACK ABOVE
<U1DF5> ""
% COMBINING DOUBLE INVERTED BREVE BELOW
<U1DFC> ""
% COMBINING ALMOST EQUAL TO BELOW
<U1DFD> ""
% COMBINING LEFT ARROWHEAD ABOVE
<U1DFE> ""
% COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
<U1DFF> ""
% COMBINING LEFT HARPOON ABOVE
<U20D0> ""
% COMBINING RIGHT HARPOON ABOVE
<U20D1> ""
% COMBINING LONG VERTICAL LINE OVERLAY
<U20D2> ""
% COMBINING SHORT VERTICAL LINE OVERLAY
<U20D3> ""
% COMBINING ANTICLOCKWISE ARROW ABOVE
<U20D4> ""
% COMBINING CLOCKWISE ARROW ABOVE
<U20D5> ""
% COMBINING LEFT ARROW ABOVE
<U20D6> ""
% COMBINING RIGHT ARROW ABOVE
<U20D7> ""
% COMBINING RING OVERLAY
<U20D8> ""
% COMBINING CLOCKWISE RING OVERLAY
<U20D9> ""
% COMBINING ANTICLOCKWISE RING OVERLAY
<U20DA> ""
% COMBINING THREE DOTS ABOVE
<U20DB> ""
% COMBINING FOUR DOTS ABOVE
<U20DC> ""
% COMBINING ENCLOSING CIRCLE
<U20DD> ""
% COMBINING ENCLOSING SQUARE
<U20DE> ""
% COMBINING ENCLOSING DIAMOND
<U20DF> ""
% COMBINING ENCLOSING CIRCLE BACKSLASH
<U20E0> ""
% COMBINING LEFT RIGHT ARROW ABOVE
<U20E1> ""
% COMBINING ENCLOSING SCREEN
<U20E2> ""
% COMBINING ENCLOSING KEYCAP
<U20E3> ""
% COMBINING ENCLOSING UPWARD POINTING TRIANGLE
<U20E4> ""
% COMBINING REVERSE SOLIDUS OVERLAY
@@ -251,10 +695,70 @@ translit_start
<U20E9> ""
% COMBINING LEFTWARDS ARROW OVERLAY
<U20EA> ""
% COMBINING LONG DOUBLE SOLIDUS OVERLAY
<U20EB> ""
% COMBINING RIGHTWARDS HARPOON WITH BARB DOWNWARDS
<U20EC> ""
% COMBINING LEFTWARDS HARPOON WITH BARB DOWNWARDS
<U20ED> ""
% COMBINING LEFT ARROW BELOW
<U20EE> ""
% COMBINING RIGHT ARROW BELOW
<U20EF> ""
% COMBINING ASTERISK ABOVE
<U20F0> ""
% COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
<U3099> ""
% COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
<U309A> ""
% HEBREW POINT JUDEO-SPANISH VARIKA
<UFB1E> ""
% COMBINING LIGATURE LEFT HALF
<UFE20> ""
% COMBINING LIGATURE RIGHT HALF
<UFE21> ""
% COMBINING DOUBLE TILDE LEFT HALF
<UFE22> ""
% COMBINING DOUBLE TILDE RIGHT HALF
<UFE23> ""
% COMBINING MACRON LEFT HALF
<UFE24> ""
% COMBINING MACRON RIGHT HALF
<UFE25> ""
% COMBINING CONJOINING MACRON
<UFE26> ""
% COMBINING LIGATURE LEFT HALF BELOW
<UFE27> ""
% COMBINING LIGATURE RIGHT HALF BELOW
<UFE28> ""
% COMBINING TILDE LEFT HALF BELOW
<UFE29> ""
% COMBINING TILDE RIGHT HALF BELOW
<UFE2A> ""
% COMBINING MACRON LEFT HALF BELOW
<UFE2B> ""
% COMBINING MACRON RIGHT HALF BELOW
<UFE2C> ""
% COMBINING CONJOINING MACRON BELOW
<UFE2D> ""
% PHAISTOS DISC SIGN COMBINING OBLIQUE STROKE
<U000101FD> ""
% COMBINING OLD PERMIC LETTER AN
<U00010376> ""
% COMBINING OLD PERMIC LETTER DOI
<U00010377> ""
% COMBINING OLD PERMIC LETTER ZATA
<U00010378> ""
% COMBINING OLD PERMIC LETTER NENOE
<U00010379> ""
% COMBINING OLD PERMIC LETTER SII
<U0001037A> ""
% COMBINING GREEK MUSICAL TRISEME
<U0001D242> ""
% COMBINING GREEK MUSICAL TETRASEME
<U0001D243> ""
% COMBINING GREEK MUSICAL PENTASEME
<U0001D244> ""
% LATIN CAPITAL LETTER A WITH GRAVE
<U00C0> <U0041>
@@ -268,6 +772,8 @@ translit_start
<U00C4> <U0041>
% LATIN CAPITAL LETTER A WITH RING ABOVE
<U00C5> <U0041>
% LATIN CAPITAL LETTER AE
<U00C6> "<U0041><U0045>"
% LATIN CAPITAL LETTER C WITH CEDILLA
<U00C7> <U0043>
% LATIN CAPITAL LETTER E WITH GRAVE
@@ -298,6 +804,8 @@ translit_start
<U00D5> <U004F>
% LATIN CAPITAL LETTER O WITH DIAERESIS
<U00D6> <U004F>
% LATIN CAPITAL LETTER O WITH STROKE
<U00D8> <U004F>
% LATIN CAPITAL LETTER U WITH GRAVE
<U00D9> <U0055>
% LATIN CAPITAL LETTER U WITH ACUTE
@@ -320,6 +828,8 @@ translit_start
<U00E4> <U0061>
% LATIN SMALL LETTER A WITH RING ABOVE
<U00E5> <U0061>
% LATIN SMALL LETTER AE
<U00E6> "<U0061><U0065>"
% LATIN SMALL LETTER C WITH CEDILLA
<U00E7> <U0063>
% LATIN SMALL LETTER E WITH GRAVE
@ -350,6 +860,8 @@ translit_start
<U00F5> <U006F>
% LATIN SMALL LETTER O WITH DIAERESIS
<U00F6> <U006F>
% LATIN SMALL LETTER O WITH STROKE
<U00F8> <U006F>
% LATIN SMALL LETTER U WITH GRAVE
<U00F9> <U0075>
% LATIN SMALL LETTER U WITH ACUTE
@@ -472,10 +984,6 @@ translit_start
<U013D> <U004C>
% LATIN SMALL LETTER L WITH CARON
<U013E> <U006C>
% LATIN CAPITAL LETTER L WITH STROKE
<U0141> <U004C>
% LATIN SMALL LETTER L WITH STROKE
<U0142> <U006C>
% LATIN CAPITAL LETTER N WITH ACUTE
<U0143> <U004E>
% LATIN SMALL LETTER N WITH ACUTE
@@ -673,9 +1181,9 @@ translit_start
% LATIN SMALL LETTER AE WITH ACUTE
<U01FD> <U00E6>;"<U0061><U0065>"
% LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
<U01FE> <U004F>
<U01FE> <U00D8>;<U004F>
% LATIN SMALL LETTER O WITH STROKE AND ACUTE
<U01FF> <U006F>
<U01FF> <U00F8>;<U006F>
% LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
<U0200> <U0041>
% LATIN SMALL LETTER A WITH DOUBLE GRAVE
@@ -764,14 +1272,6 @@ translit_start
<U0232> <U0059>
% LATIN SMALL LETTER Y WITH MACRON
<U0233> <U0079>
% COMBINING GRAVE TONE MARK
<U0340> <U0300>
% COMBINING ACUTE TONE MARK
<U0341> <U0301>
% COMBINING GREEK KORONIS
<U0343> <U0313>
% COMBINING GREEK DIALYTIKA TONOS
<U0344> <U0308>
% GREEK NUMERAL SIGN
<U0374> <U02B9>
% GREEK QUESTION MARK
@@ -928,6 +1428,8 @@ translit_start
<U04F8> <U042B>
% CYRILLIC SMALL LETTER YERU WITH DIAERESIS
<U04F9> <U044B>
% HEBREW LIGATURE YIDDISH DOUBLE YOD
<U05F2> "<U05D9><U05D9>"
% ARABIC LETTER ALEF WITH MADDA ABOVE
<U0622> <U0627>
% ARABIC LETTER ALEF WITH HAMZA ABOVE
@@ -1017,7 +1519,7 @@ translit_start
% KANNADA VOWEL SIGN O
<U0CCA> "<U0CC6><U0CC2>"
% KANNADA VOWEL SIGN OO
<U0CCB> "<U0CCA><U0CD5>"
<U0CCB> "<U0CC6><U0CC2><U0CD5>"
% MALAYALAM VOWEL SIGN O
<U0D4A> "<U0D46><U0D3E>"
% MALAYALAM VOWEL SIGN OO
@@ -1029,7 +1531,7 @@ translit_start
% SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA
<U0DDC> "<U0DD9><U0DCF>"
% SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA
<U0DDD> "<U0DDC><U0DCA>"
<U0DDD> "<U0DD9><U0DCF><U0DCA>"
% SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA
<U0DDE> "<U0DD9><U0DDF>"
% TIBETAN LETTER GHA
@@ -2020,16 +2522,114 @@ translit_start
<U2000> <U2002>;<U0020>
% EM QUAD
<U2001> <U2003>;<U0020>
% EN SPACE
<U2002> <U0020>
% EM SPACE
<U2003> <U0020>
% OHM SIGN
<U2126> <U03A9>
% KELVIN SIGN
<U212A> <U004B>
% ANGSTROM SIGN
<U212B> <U00C5>
<U212B> <U0041>
% LEFTWARDS ARROW WITH STROKE
<U219A> <U2190>
% RIGHTWARDS ARROW WITH STROKE
<U219B> <U2192>
% LEFT RIGHT ARROW WITH STROKE
<U21AE> "<U0021><U003C><U002D><U003E>"
% LEFTWARDS DOUBLE ARROW WITH STROKE
<U21CD> "<U0021><U003C><U003D>"
% LEFT RIGHT DOUBLE ARROW WITH STROKE
<U21CE> "<U0021><U003C><U003D><U003E>"
% RIGHTWARDS DOUBLE ARROW WITH STROKE
<U21CF> "<U0021><U003D><U003E>"
% THERE DOES NOT EXIST
<U2204> "<U0021><U2203>"
% NOT AN ELEMENT OF
<U2209> "<U0021><U2208>"
% DOES NOT CONTAIN AS MEMBER
<U220C> "<U0021><U220B>"
% DOES NOT DIVIDE
<U2224> "<U0021><U2223>"
% NOT PARALLEL TO
<U2226> "<U0021><U2225>"
% NOT TILDE
<U2241> "<U0021><U007E>"
% NOT ASYMPTOTICALLY EQUAL TO
<U2244> "<U0021><U007E><U002D>"
% NEITHER APPROXIMATELY NOR ACTUALLY EQUAL TO
<U2247> "<U0021><U007E><U003D>"
% NOT ALMOST EQUAL TO
<U2249> "<U0021><U007E><U007E>"
% NOT EQUAL TO
<U2260> "<U0021><U003D>"
% NOT IDENTICAL TO
<U2262> "<U0021><U003D><U003D>"
% NOT EQUIVALENT TO
<U226D> "<U0021><U224D>"
% NOT LESS-THAN
<U226E> "<U0021><U003C>"
% NOT GREATER-THAN
<U226F> "<U0021><U003E>"
% NEITHER LESS-THAN NOR EQUAL TO
<U2270> "<U0021><U003C><U003D>"
% NEITHER GREATER-THAN NOR EQUAL TO
<U2271> "<U0021><U003E><U003D>"
% NEITHER LESS-THAN NOR EQUIVALENT TO
<U2274> "<U0021><U003C><U007E>"
% NEITHER GREATER-THAN NOR EQUIVALENT TO
<U2275> "<U0021><U003E><U007E>"
% NEITHER LESS-THAN NOR GREATER-THAN
<U2278> "<U0021><U003C><U003E>"
% NEITHER GREATER-THAN NOR LESS-THAN
<U2279> "<U0021><U003E><U003C>"
% DOES NOT PRECEDE
<U2280> "<U0021><U227A>"
% DOES NOT SUCCEED
<U2281> "<U0021><U227B>"
% NOT A SUBSET OF
<U2284> "<U0021><U2282>"
% NOT A SUPERSET OF
<U2285> "<U0021><U2283>"
% NEITHER A SUBSET OF NOR EQUAL TO
<U2288> "<U0021><U2282><U003D>"
% NEITHER A SUPERSET OF NOR EQUAL TO
<U2289> "<U0021><U2283><U003D>"
% DOES NOT PROVE
<U22AC> "<U0021><U22A2>"
% NOT TRUE
<U22AD> "<U0021><U22A8>"
% DOES NOT FORCE
<U22AE> "<U0021><U22A9>"
% NEGATED DOUBLE VERTICAL BAR DOUBLE RIGHT TURNSTILE
<U22AF> "<U0021><U22AB>"
% DOES NOT PRECEDE OR EQUAL
<U22E0> "<U0021><U227C>"
% DOES NOT SUCCEED OR EQUAL
<U22E1> "<U0021><U227D>"
% NOT SQUARE IMAGE OF OR EQUAL TO
<U22E2> "<U0021><U2291>"
% NOT SQUARE ORIGINAL OF OR EQUAL TO
<U22E3> "<U0021><U2292>"
% NOT NORMAL SUBGROUP OF
<U22EA> "<U0021><U22B2>"
% DOES NOT CONTAIN AS NORMAL SUBGROUP
<U22EB> "<U0021><U22B3>"
% NOT NORMAL SUBGROUP OF OR EQUAL TO
<U22EC> "<U0021><U22B4>"
% DOES NOT CONTAIN AS NORMAL SUBGROUP OR EQUAL
<U22ED> "<U0021><U22B5>"
% LEFT-POINTING ANGLE BRACKET
<U2329> <U3008>;<U003C>
% RIGHT-POINTING ANGLE BRACKET
<U232A> <U3009>;<U003E>
% FORKING
<U2ADC> "<U0021><U2ADD>"
% LEFT ANGLE BRACKET
<U3008> <U003C>
% RIGHT ANGLE BRACKET
<U3009> <U003E>
% HIRAGANA LETTER GA
<U304C> <U304B>
% HIRAGANA LETTER GI

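For orientation: this file maps combining marks (the U+0300 block, the Hebrew and Arabic points, and so on) to the empty string and precomposed letters to their base letters. That is close to, though not identical with, NFD decomposition followed by dropping nonspacing marks; a rough standard-library equivalent (an illustration of the idea, not how glibc consumes the file) is:

    import unicodedata

    def strip_combining(text):
        # Decompose, then drop nonspacing marks (general category Mn).
        nfd = unicodedata.normalize("NFD", text)
        return "".join(c for c in nfd if unicodedata.category(c) != "Mn")

    print(strip_combining("Å ü ñ"))   # -> "A u n"

The file also carries hand-picked fallbacks that NFD alone cannot produce, such as the <U00C6> "<U0041><U0045>" (AE) entry added above.
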
locales/translit_compat

@@ -2,18 +2,24 @@ escape_char /
comment_char %
% Transliterations of compatibility characters and ligatures.
% Generated through
% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<compat>[^;]*;' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<compat> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' | grep -v '0020 03[0-6][0-9A-F]' | sed -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
% and
% $ grep '[^;]*;[^;]*LIGATURE[^;]*;' UnicodeData.txt
% Generated automatically from UnicodeData.txt by gen_translit_compat.py on 2015-12-09 for Unicode 7.0.0.
LC_CTYPE
translit_start
% FEMININE ORDINAL INDICATOR
<U00AA> "<U0061>"
% SUPERSCRIPT TWO
<U00B2> "<U0032>"
% SUPERSCRIPT THREE
<U00B3> "<U0033>"
% MICRO SIGN
<U00B5> "<U03BC>";<U0075>
<U00B5> "<U03BC>";"<U0075>"
% SUPERSCRIPT ONE
<U00B9> "<U0031>"
% MASCULINE ORDINAL INDICATOR
<U00BA> "<U006F>"
% LATIN CAPITAL LIGATURE IJ
<U0132> "<U0049><U004A>"
% LATIN SMALL LIGATURE IJ
@@ -54,6 +60,38 @@ translit_start
<U01F2> "<U0044><U007A>"
% LATIN SMALL LETTER DZ
<U01F3> "<U0064><U007A>"
% MODIFIER LETTER SMALL H
<U02B0> "<U0068>"
% MODIFIER LETTER SMALL H WITH HOOK
<U02B1> "<U0266>"
% MODIFIER LETTER SMALL J
<U02B2> "<U006A>"
% MODIFIER LETTER SMALL R
<U02B3> "<U0072>"
% MODIFIER LETTER SMALL TURNED R
<U02B4> "<U0279>"
% MODIFIER LETTER SMALL TURNED R WITH HOOK
<U02B5> "<U027B>"
% MODIFIER LETTER SMALL CAPITAL INVERTED R
<U02B6> "<U0281>"
% MODIFIER LETTER SMALL W
<U02B7> "<U0077>"
% MODIFIER LETTER SMALL Y
<U02B8> "<U0079>"
% MODIFIER LETTER APOSTROPHE
<U02BC> "<U0027>"
% MODIFIER LETTER SMALL GAMMA
<U02E0> "<U0263>"
% MODIFIER LETTER SMALL L
<U02E1> "<U006C>"
% MODIFIER LETTER SMALL S
<U02E2> "<U0073>"
% MODIFIER LETTER SMALL X
<U02E3> "<U0078>"
% MODIFIER LETTER SMALL REVERSED GLOTTAL STOP
<U02E4> "<U0295>"
% GREEK SMALL LETTER MU
<U03BC> "<U0075>"
% GREEK BETA SYMBOL
<U03D0> "<U03B2>"
% GREEK THETA SYMBOL
@@ -74,6 +112,20 @@ translit_start
<U03F4> "<U0398>"
% GREEK LUNATE EPSILON SYMBOL
<U03F5> "<U03B5>"
% GREEK CAPITAL LUNATE SIGMA SYMBOL
<U03F9> "<U03A3>"
% CYRILLIC CAPITAL LIGATURE EN GHE
<U04A4> "<U041D><U0413>"
% CYRILLIC SMALL LIGATURE EN GHE
<U04A5> "<U043D><U0433>"
% CYRILLIC CAPITAL LIGATURE TE TSE
<U04B4> "<U0422><U0426>"
% CYRILLIC SMALL LIGATURE TE TSE
<U04B5> "<U0442><U0446>"
% CYRILLIC CAPITAL LIGATURE A IE
<U04D4> "<U0410><U0415>"
% CYRILLIC SMALL LIGATURE A IE
<U04D5> "<U0430><U0435>"
% ARMENIAN SMALL LIGATURE ECH YIWN
<U0587> "<U0565><U0582>"
% HEBREW LIGATURE YIDDISH DOUBLE VAV
@@ -102,6 +154,204 @@ translit_start
<U0F77> "<U0FB2><U0F81>"
% TIBETAN VOWEL SIGN VOCALIC LL
<U0F79> "<U0FB3><U0F81>"
% MODIFIER LETTER GEORGIAN NAR
<U10FC> "<U10DC>"
% MODIFIER LETTER CAPITAL A
<U1D2C> "<U0041>"
% MODIFIER LETTER CAPITAL AE
<U1D2D> "<U00C6>"
% MODIFIER LETTER CAPITAL B
<U1D2E> "<U0042>"
% MODIFIER LETTER CAPITAL D
<U1D30> "<U0044>"
% MODIFIER LETTER CAPITAL E
<U1D31> "<U0045>"
% MODIFIER LETTER CAPITAL REVERSED E
<U1D32> "<U018E>"
% MODIFIER LETTER CAPITAL G
<U1D33> "<U0047>"
% MODIFIER LETTER CAPITAL H
<U1D34> "<U0048>"
% MODIFIER LETTER CAPITAL I
<U1D35> "<U0049>"
% MODIFIER LETTER CAPITAL J
<U1D36> "<U004A>"
% MODIFIER LETTER CAPITAL K
<U1D37> "<U004B>"
% MODIFIER LETTER CAPITAL L
<U1D38> "<U004C>"
% MODIFIER LETTER CAPITAL M
<U1D39> "<U004D>"
% MODIFIER LETTER CAPITAL N
<U1D3A> "<U004E>"
% MODIFIER LETTER CAPITAL O
<U1D3C> "<U004F>"
% MODIFIER LETTER CAPITAL OU
<U1D3D> "<U0222>"
% MODIFIER LETTER CAPITAL P
<U1D3E> "<U0050>"
% MODIFIER LETTER CAPITAL R
<U1D3F> "<U0052>"
% MODIFIER LETTER CAPITAL T
<U1D40> "<U0054>"
% MODIFIER LETTER CAPITAL U
<U1D41> "<U0055>"
% MODIFIER LETTER CAPITAL W
<U1D42> "<U0057>"
% MODIFIER LETTER SMALL A
<U1D43> "<U0061>"
% MODIFIER LETTER SMALL TURNED A
<U1D44> "<U0250>"
% MODIFIER LETTER SMALL ALPHA
<U1D45> "<U0251>"
% MODIFIER LETTER SMALL TURNED AE
<U1D46> "<U1D02>"
% MODIFIER LETTER SMALL B
<U1D47> "<U0062>"
% MODIFIER LETTER SMALL D
<U1D48> "<U0064>"
% MODIFIER LETTER SMALL E
<U1D49> "<U0065>"
% MODIFIER LETTER SMALL SCHWA
<U1D4A> "<U0259>"
% MODIFIER LETTER SMALL OPEN E
<U1D4B> "<U025B>"
% MODIFIER LETTER SMALL TURNED OPEN E
<U1D4C> "<U025C>"
% MODIFIER LETTER SMALL G
<U1D4D> "<U0067>"
% MODIFIER LETTER SMALL K
<U1D4F> "<U006B>"
% MODIFIER LETTER SMALL M
<U1D50> "<U006D>"
% MODIFIER LETTER SMALL ENG
<U1D51> "<U014B>"
% MODIFIER LETTER SMALL O
<U1D52> "<U006F>"
% MODIFIER LETTER SMALL OPEN O
<U1D53> "<U0254>"
% MODIFIER LETTER SMALL TOP HALF O
<U1D54> "<U1D16>"
% MODIFIER LETTER SMALL BOTTOM HALF O
<U1D55> "<U1D17>"
% MODIFIER LETTER SMALL P
<U1D56> "<U0070>"
% MODIFIER LETTER SMALL T
<U1D57> "<U0074>"
% MODIFIER LETTER SMALL U
<U1D58> "<U0075>"
% MODIFIER LETTER SMALL SIDEWAYS U
<U1D59> "<U1D1D>"
% MODIFIER LETTER SMALL TURNED M
<U1D5A> "<U026F>"
% MODIFIER LETTER SMALL V
<U1D5B> "<U0076>"
% MODIFIER LETTER SMALL AIN
<U1D5C> "<U1D25>"
% MODIFIER LETTER SMALL BETA
<U1D5D> "<U03B2>"
% MODIFIER LETTER SMALL GREEK GAMMA
<U1D5E> "<U03B3>"
% MODIFIER LETTER SMALL DELTA
<U1D5F> "<U03B4>"
% MODIFIER LETTER SMALL GREEK PHI
<U1D60> "<U03C6>"
% MODIFIER LETTER SMALL CHI
<U1D61> "<U03C7>"
% LATIN SUBSCRIPT SMALL LETTER I
<U1D62> "<U0069>"
% LATIN SUBSCRIPT SMALL LETTER R
<U1D63> "<U0072>"
% LATIN SUBSCRIPT SMALL LETTER U
<U1D64> "<U0075>"
% LATIN SUBSCRIPT SMALL LETTER V
<U1D65> "<U0076>"
% GREEK SUBSCRIPT SMALL LETTER BETA
<U1D66> "<U03B2>"
% GREEK SUBSCRIPT SMALL LETTER GAMMA
<U1D67> "<U03B3>"
% GREEK SUBSCRIPT SMALL LETTER RHO
<U1D68> "<U03C1>"
% GREEK SUBSCRIPT SMALL LETTER PHI
<U1D69> "<U03C6>"
% GREEK SUBSCRIPT SMALL LETTER CHI
<U1D6A> "<U03C7>"
% MODIFIER LETTER CYRILLIC EN
<U1D78> "<U043D>"
% MODIFIER LETTER SMALL TURNED ALPHA
<U1D9B> "<U0252>"
% MODIFIER LETTER SMALL C
<U1D9C> "<U0063>"
% MODIFIER LETTER SMALL C WITH CURL
<U1D9D> "<U0255>"
% MODIFIER LETTER SMALL ETH
<U1D9E> "<U00F0>"
% MODIFIER LETTER SMALL REVERSED OPEN E
<U1D9F> "<U025C>"
% MODIFIER LETTER SMALL F
<U1DA0> "<U0066>"
% MODIFIER LETTER SMALL DOTLESS J WITH STROKE
<U1DA1> "<U025F>"
% MODIFIER LETTER SMALL SCRIPT G
<U1DA2> "<U0261>"
% MODIFIER LETTER SMALL TURNED H
<U1DA3> "<U0265>"
% MODIFIER LETTER SMALL I WITH STROKE
<U1DA4> "<U0268>"
% MODIFIER LETTER SMALL IOTA
<U1DA5> "<U0269>"
% MODIFIER LETTER SMALL CAPITAL I
<U1DA6> "<U026A>"
% MODIFIER LETTER SMALL CAPITAL I WITH STROKE
<U1DA7> "<U1D7B>"
% MODIFIER LETTER SMALL J WITH CROSSED-TAIL
<U1DA8> "<U029D>"
% MODIFIER LETTER SMALL L WITH RETROFLEX HOOK
<U1DA9> "<U026D>"
% MODIFIER LETTER SMALL L WITH PALATAL HOOK
<U1DAA> "<U1D85>"
% MODIFIER LETTER SMALL CAPITAL L
<U1DAB> "<U029F>"
% MODIFIER LETTER SMALL M WITH HOOK
<U1DAC> "<U0271>"
% MODIFIER LETTER SMALL TURNED M WITH LONG LEG
<U1DAD> "<U0270>"
% MODIFIER LETTER SMALL N WITH LEFT HOOK
<U1DAE> "<U0272>"
% MODIFIER LETTER SMALL N WITH RETROFLEX HOOK
<U1DAF> "<U0273>"
% MODIFIER LETTER SMALL CAPITAL N
<U1DB0> "<U0274>"
% MODIFIER LETTER SMALL BARRED O
<U1DB1> "<U0275>"
% MODIFIER LETTER SMALL PHI
<U1DB2> "<U0278>"
% MODIFIER LETTER SMALL S WITH HOOK
<U1DB3> "<U0282>"
% MODIFIER LETTER SMALL ESH
<U1DB4> "<U0283>"
% MODIFIER LETTER SMALL T WITH PALATAL HOOK
<U1DB5> "<U01AB>"
% MODIFIER LETTER SMALL U BAR
<U1DB6> "<U0289>"
% MODIFIER LETTER SMALL UPSILON
<U1DB7> "<U028A>"
% MODIFIER LETTER SMALL CAPITAL U
<U1DB8> "<U1D1C>"
% MODIFIER LETTER SMALL V WITH HOOK
<U1DB9> "<U028B>"
% MODIFIER LETTER SMALL TURNED V
<U1DBA> "<U028C>"
% MODIFIER LETTER SMALL Z
<U1DBB> "<U007A>"
% MODIFIER LETTER SMALL Z WITH RETROFLEX HOOK
<U1DBC> "<U0290>"
% MODIFIER LETTER SMALL Z WITH CURL
<U1DBD> "<U0291>"
% MODIFIER LETTER SMALL EZH
<U1DBE> "<U0292>"
% MODIFIER LETTER SMALL THETA
<U1DBF> "<U03B8>"
% LATIN SMALL LETTER A WITH RIGHT HALF RING
<U1E9A> "<U0061><U02BE>"
% EN SPACE
@@ -146,6 +396,90 @@ translit_start
<U2057> "<U2032><U2032><U2032><U2032>"
% MEDIUM MATHEMATICAL SPACE
<U205F> "<U0020>"
% SUPERSCRIPT ZERO
<U2070> "<U0030>"
% SUPERSCRIPT LATIN SMALL LETTER I
<U2071> "<U0069>"
% SUPERSCRIPT FOUR
<U2074> "<U0034>"
% SUPERSCRIPT FIVE
<U2075> "<U0035>"
% SUPERSCRIPT SIX
<U2076> "<U0036>"
% SUPERSCRIPT SEVEN
<U2077> "<U0037>"
% SUPERSCRIPT EIGHT
<U2078> "<U0038>"
% SUPERSCRIPT NINE
<U2079> "<U0039>"
% SUPERSCRIPT PLUS SIGN
<U207A> "<U002B>"
% SUPERSCRIPT MINUS
<U207B> "<U2212>"
% SUPERSCRIPT EQUALS SIGN
<U207C> "<U003D>"
% SUPERSCRIPT LEFT PARENTHESIS
<U207D> "<U0028>"
% SUPERSCRIPT RIGHT PARENTHESIS
<U207E> "<U0029>"
% SUPERSCRIPT LATIN SMALL LETTER N
<U207F> "<U006E>"
% SUBSCRIPT ZERO
<U2080> "<U0030>"
% SUBSCRIPT ONE
<U2081> "<U0031>"
% SUBSCRIPT TWO
<U2082> "<U0032>"
% SUBSCRIPT THREE
<U2083> "<U0033>"
% SUBSCRIPT FOUR
<U2084> "<U0034>"
% SUBSCRIPT FIVE
<U2085> "<U0035>"
% SUBSCRIPT SIX
<U2086> "<U0036>"
% SUBSCRIPT SEVEN
<U2087> "<U0037>"
% SUBSCRIPT EIGHT
<U2088> "<U0038>"
% SUBSCRIPT NINE
<U2089> "<U0039>"
% SUBSCRIPT PLUS SIGN
<U208A> "<U002B>"
% SUBSCRIPT MINUS
<U208B> "<U2212>"
% SUBSCRIPT EQUALS SIGN
<U208C> "<U003D>"
% SUBSCRIPT LEFT PARENTHESIS
<U208D> "<U0028>"
% SUBSCRIPT RIGHT PARENTHESIS
<U208E> "<U0029>"
% LATIN SUBSCRIPT SMALL LETTER A
<U2090> "<U0061>"
% LATIN SUBSCRIPT SMALL LETTER E
<U2091> "<U0065>"
% LATIN SUBSCRIPT SMALL LETTER O
<U2092> "<U006F>"
% LATIN SUBSCRIPT SMALL LETTER X
<U2093> "<U0078>"
% LATIN SUBSCRIPT SMALL LETTER SCHWA
<U2094> "<U0259>"
% LATIN SUBSCRIPT SMALL LETTER H
<U2095> "<U0068>"
% LATIN SUBSCRIPT SMALL LETTER K
<U2096> "<U006B>"
% LATIN SUBSCRIPT SMALL LETTER L
<U2097> "<U006C>"
% LATIN SUBSCRIPT SMALL LETTER M
<U2098> "<U006D>"
% LATIN SUBSCRIPT SMALL LETTER N
<U2099> "<U006E>"
% LATIN SUBSCRIPT SMALL LETTER P
<U209A> "<U0070>"
% LATIN SUBSCRIPT SMALL LETTER S
<U209B> "<U0073>"
% LATIN SUBSCRIPT SMALL LETTER T
<U209C> "<U0074>"
% RUPEE SIGN
<U20A8> "<U0052><U0073>"
% ACCOUNT OF
@@ -164,8 +498,12 @@ translit_start
<U2109> "<U00B0><U0046>"
% NUMERO SIGN
<U2116> "<U004E><U006F>"
% SERVICE MARK
<U2120> "<U0053><U004D>"
% TELEPHONE SIGN
<U2121> "<U0054><U0045><U004C>"
% TRADE MARK SIGN
<U2122> "<U0054><U004D>"
% ALEF SYMBOL
<U2135> "<U05D0>"
% BET SYMBOL
@@ -174,6 +512,8 @@ translit_start
<U2137> "<U05D2>"
% DALET SYMBOL
<U2138> "<U05D3>"
% FACSIMILE SIGN
<U213B> "<U0046><U0041><U0058>"
% ROMAN NUMERAL ONE
<U2160> "<U0049>"
% ROMAN NUMERAL TWO
@@ -386,6 +726,12 @@ translit_start
<U2A75> "<U003D><U003D>"
% THREE CONSECUTIVE EQUALS SIGNS
<U2A76> "<U003D><U003D><U003D>"
% LATIN SUBSCRIPT SMALL LETTER J
<U2C7C> "<U006A>"
% MODIFIER LETTER CAPITAL V
<U2C7D> "<U0056>"
% TIFINAGH MODIFIER LETTER LABIALIZATION MARK
<U2D6F> "<U2D61>"
% CJK RADICAL MOTHER
<U2E9F> "<U6BCD>"
% CJK RADICAL C-SIMPLIFIED TURTLE
@@ -830,6 +1176,10 @@ translit_start
<U309B> "<U0020><U3099>"
% KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
<U309C> "<U0020><U309A>"
% HIRAGANA DIGRAPH YORI
<U309F> "<U3088><U308A>"
% KATAKANA DIGRAPH KOTO
<U30FF> "<U30B3><U30C8>"
% HANGUL LETTER KIYEOK
<U3131> "<U1100>"
% HANGUL LETTER SSANGKIYEOK
@@ -1018,6 +1368,34 @@ translit_start
<U318D> "<U119E>"
% HANGUL LETTER ARAEAE
<U318E> "<U11A1>"
% IDEOGRAPHIC ANNOTATION ONE MARK
<U3192> "<U4E00>"
% IDEOGRAPHIC ANNOTATION TWO MARK
<U3193> "<U4E8C>"
% IDEOGRAPHIC ANNOTATION THREE MARK
<U3194> "<U4E09>"
% IDEOGRAPHIC ANNOTATION FOUR MARK
<U3195> "<U56DB>"
% IDEOGRAPHIC ANNOTATION TOP MARK
<U3196> "<U4E0A>"
% IDEOGRAPHIC ANNOTATION MIDDLE MARK
<U3197> "<U4E2D>"
% IDEOGRAPHIC ANNOTATION BOTTOM MARK
<U3198> "<U4E0B>"
% IDEOGRAPHIC ANNOTATION FIRST MARK
<U3199> "<U7532>"
% IDEOGRAPHIC ANNOTATION SECOND MARK
<U319A> "<U4E59>"
% IDEOGRAPHIC ANNOTATION THIRD MARK
<U319B> "<U4E19>"
% IDEOGRAPHIC ANNOTATION FOURTH MARK
<U319C> "<U4E01>"
% IDEOGRAPHIC ANNOTATION HEAVEN MARK
<U319D> "<U5929>"
% IDEOGRAPHIC ANNOTATION EARTH MARK
<U319E> "<U5730>"
% IDEOGRAPHIC ANNOTATION MAN MARK
<U319F> "<U4EBA>"
% PARENTHESIZED HANGUL KIYEOK
<U3200> "<U0028><U1100><U0029>"
% PARENTHESIZED HANGUL NIEUN
@@ -1076,6 +1454,10 @@ translit_start
<U321B> "<U0028><U1112><U1161><U0029>"
% PARENTHESIZED HANGUL CIEUC U
<U321C> "<U0028><U110C><U116E><U0029>"
% PARENTHESIZED KOREAN CHARACTER OJEON
<U321D> "<U0028><U110B><U1169><U110C><U1165><U11AB><U0029>"
% PARENTHESIZED KOREAN CHARACTER O HU
<U321E> "<U0028><U110B><U1169><U1112><U116E><U0029>"
% PARENTHESIZED IDEOGRAPH ONE
<U3220> "<U0028><U4E00><U0029>"
% PARENTHESIZED IDEOGRAPH TWO
@@ -1284,6 +1666,24 @@ translit_start
<U33FD> "<U0033><U0030><U65E5>"
% IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
<U33FE> "<U0033><U0031><U65E5>"
% MODIFIER LETTER CYRILLIC HARD SIGN
<UA69C> "<U044A>"
% MODIFIER LETTER CYRILLIC SOFT SIGN
<UA69D> "<U044C>"
% MODIFIER LETTER US
<UA770> "<UA76F>"
% MODIFIER LETTER CAPITAL H WITH STROKE
<UA7F8> "<U0126>"
% MODIFIER LETTER SMALL LIGATURE OE
<UA7F9> "<U0153>"
% MODIFIER LETTER SMALL HENG
<UAB5C> "<UA727>"
% MODIFIER LETTER SMALL L WITH INVERTED LAZY S
<UAB5D> "<UAB37>"
% MODIFIER LETTER SMALL L WITH MIDDLE TILDE
<UAB5E> "<U026B>"
% MODIFIER LETTER SMALL U WITH LEFT HOOK
<UAB5F> "<UAB52>"
% LATIN SMALL LIGATURE FF
<UFB00> "<U0066><U0066>"
% LATIN SMALL LIGATURE FI
@@ -1295,7 +1695,7 @@ translit_start
% LATIN SMALL LIGATURE FFL
<UFB04> "<U0066><U0066><U006C>"
% LATIN SMALL LIGATURE LONG S T
<UFB05> "<U017F><U0074>"
<UFB05> "<U0073><U0074>"
% LATIN SMALL LIGATURE ST
<UFB06> "<U0073><U0074>"
% ARMENIAN SMALL LIGATURE MEN NOW
@@ -1310,6 +1710,72 @@ translit_start
<UFB17> "<U0574><U056D>"
% HEBREW LIGATURE ALEF LAMED
<UFB4F> "<U05D0><U05DC>"
% PRESENTATION FORM FOR VERTICAL COMMA
<UFE10> "<U002C>"
% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC COMMA
<UFE11> "<U3001>"
% PRESENTATION FORM FOR VERTICAL IDEOGRAPHIC FULL STOP
<UFE12> "<U3002>"
% PRESENTATION FORM FOR VERTICAL COLON
<UFE13> "<U003A>"
% PRESENTATION FORM FOR VERTICAL SEMICOLON
<UFE14> "<U003B>"
% PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK
<UFE15> "<U0021>"
% PRESENTATION FORM FOR VERTICAL QUESTION MARK
<UFE16> "<U003F>"
% PRESENTATION FORM FOR VERTICAL LEFT WHITE LENTICULAR BRACKET
<UFE17> "<U3016>"
% PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRAKCET
<UFE18> "<U3017>"
% PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
<UFE19> "<U002E><U002E><U002E>"
% PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
<UFE30> "<U002E><U002E>"
% PRESENTATION FORM FOR VERTICAL EM DASH
<UFE31> "<U2014>"
% PRESENTATION FORM FOR VERTICAL EN DASH
<UFE32> "<U2013>"
% PRESENTATION FORM FOR VERTICAL LOW LINE
<UFE33> "<U005F>"
% PRESENTATION FORM FOR VERTICAL WAVY LOW LINE
<UFE34> "<U005F>"
% PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
<UFE35> "<U0028>"
% PRESENTATION FORM FOR VERTICAL RIGHT PARENTHESIS
<UFE36> "<U0029>"
% PRESENTATION FORM FOR VERTICAL LEFT CURLY BRACKET
<UFE37> "<U007B>"
% PRESENTATION FORM FOR VERTICAL RIGHT CURLY BRACKET
<UFE38> "<U007D>"
% PRESENTATION FORM FOR VERTICAL LEFT TORTOISE SHELL BRACKET
<UFE39> "<U3014>"
% PRESENTATION FORM FOR VERTICAL RIGHT TORTOISE SHELL BRACKET
<UFE3A> "<U3015>"
% PRESENTATION FORM FOR VERTICAL LEFT BLACK LENTICULAR BRACKET
<UFE3B> "<U3010>"
% PRESENTATION FORM FOR VERTICAL RIGHT BLACK LENTICULAR BRACKET
<UFE3C> "<U3011>"
% PRESENTATION FORM FOR VERTICAL LEFT DOUBLE ANGLE BRACKET
<UFE3D> "<U300A>"
% PRESENTATION FORM FOR VERTICAL RIGHT DOUBLE ANGLE BRACKET
<UFE3E> "<U300B>"
% PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET
<UFE3F> "<U3008>"
% PRESENTATION FORM FOR VERTICAL RIGHT ANGLE BRACKET
<UFE40> "<U3009>"
% PRESENTATION FORM FOR VERTICAL LEFT CORNER BRACKET
<UFE41> "<U300C>"
% PRESENTATION FORM FOR VERTICAL RIGHT CORNER BRACKET
<UFE42> "<U300D>"
% PRESENTATION FORM FOR VERTICAL LEFT WHITE CORNER BRACKET
<UFE43> "<U300E>"
% PRESENTATION FORM FOR VERTICAL RIGHT WHITE CORNER BRACKET
<UFE44> "<U300F>"
% PRESENTATION FORM FOR VERTICAL LEFT SQUARE BRACKET
<UFE47> "<U005B>"
% PRESENTATION FORM FOR VERTICAL RIGHT SQUARE BRACKET
<UFE48> "<U005D>"
% DASHED OVERLINE
<UFE49> "<U203E>"
% CENTRELINE OVERLINE
@@ -1324,6 +1790,104 @@ translit_start
<UFE4E> "<U005F>"
% WAVY LOW LINE
<UFE4F> "<U005F>"
% DIGIT ZERO FULL STOP
<U0001F100> "<U0030><U002E>"
% DIGIT ZERO COMMA
<U0001F101> "<U0030><U002C>"
% DIGIT ONE COMMA
<U0001F102> "<U0031><U002C>"
% DIGIT TWO COMMA
<U0001F103> "<U0032><U002C>"
% DIGIT THREE COMMA
<U0001F104> "<U0033><U002C>"
% DIGIT FOUR COMMA
<U0001F105> "<U0034><U002C>"
% DIGIT FIVE COMMA
<U0001F106> "<U0035><U002C>"
% DIGIT SIX COMMA
<U0001F107> "<U0036><U002C>"
% DIGIT SEVEN COMMA
<U0001F108> "<U0037><U002C>"
% DIGIT EIGHT COMMA
<U0001F109> "<U0038><U002C>"
% DIGIT NINE COMMA
<U0001F10A> "<U0039><U002C>"
% PARENTHESIZED LATIN CAPITAL LETTER A
<U0001F110> "<U0028><U0041><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER B
<U0001F111> "<U0028><U0042><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER C
<U0001F112> "<U0028><U0043><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER D
<U0001F113> "<U0028><U0044><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER E
<U0001F114> "<U0028><U0045><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER F
<U0001F115> "<U0028><U0046><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER G
<U0001F116> "<U0028><U0047><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER H
<U0001F117> "<U0028><U0048><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER I
<U0001F118> "<U0028><U0049><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER J
<U0001F119> "<U0028><U004A><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER K
<U0001F11A> "<U0028><U004B><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER L
<U0001F11B> "<U0028><U004C><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER M
<U0001F11C> "<U0028><U004D><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER N
<U0001F11D> "<U0028><U004E><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER O
<U0001F11E> "<U0028><U004F><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER P
<U0001F11F> "<U0028><U0050><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER Q
<U0001F120> "<U0028><U0051><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER R
<U0001F121> "<U0028><U0052><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER S
<U0001F122> "<U0028><U0053><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER T
<U0001F123> "<U0028><U0054><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER U
<U0001F124> "<U0028><U0055><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER V
<U0001F125> "<U0028><U0056><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER W
<U0001F126> "<U0028><U0057><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER X
<U0001F127> "<U0028><U0058><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER Y
<U0001F128> "<U0028><U0059><U0029>"
% PARENTHESIZED LATIN CAPITAL LETTER Z
<U0001F129> "<U0028><U005A><U0029>"
% TORTOISE SHELL BRACKETED LATIN CAPITAL LETTER S
<U0001F12A> "<U3014><U0053><U3015>"
% RAISED MC SIGN
<U0001F16A> "<U004D><U0043>"
% RAISED MD SIGN
<U0001F16B> "<U004D><U0044>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-672C
<U0001F240> "<U3014><U672C><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E09
<U0001F241> "<U3014><U4E09><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-4E8C
<U0001F242> "<U3014><U4E8C><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-5B89
<U0001F243> "<U3014><U5B89><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-70B9
<U0001F244> "<U3014><U70B9><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6253
<U0001F245> "<U3014><U6253><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-76D7
<U0001F246> "<U3014><U76D7><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-52DD
<U0001F247> "<U3014><U52DD><U3015>"
% TORTOISE SHELL BRACKETED CJK UNIFIED IDEOGRAPH-6557
<U0001F248> "<U3014><U6557><U3015>"
translit_end
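
Most entries in this file track the recursive compatibility (NFKD) decompositions from UnicodeData.txt, which appears to be why the LATIN SMALL LIGATURE LONG S T entry changed above: the old value <U017F><U0074> stopped one step short, since U+017F (LATIN SMALL LETTER LONG S) itself compatibility-decomposes to plain s. A quick check with Python's bundled database (an illustration, not the commit's code):

    import unicodedata

    print(unicodedata.normalize("NFKD", "\uFB05"))  # -> "st", the corrected entry
    print(unicodedata.normalize("NFKD", "\u00B5"))  # MICRO SIGN -> GREEK SMALL LETTER MU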

locales/translit_font

@@ -2,9 +2,7 @@ escape_char /
comment_char %
% Transliterations of font equivalents.
% Generated through
% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<font>[^;]*;' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<font> \([^;]*\);.*$/<U\1> <U\3> % \2/'
% Generated automatically from UnicodeData.txt by gen_translit_font.py on 2015-12-09 for Unicode 7.0.0.
LC_CTYPE
@@ -37,6 +35,7 @@ translit_start
<U2133> <U004D> % SCRIPT CAPITAL M
<U2134> <U006F> % SCRIPT SMALL O
<U2139> <U0069> % INFORMATION SOURCE
<U213C> <U03C0> % DOUBLE-STRUCK SMALL PI
<U213D> <U03B3> % DOUBLE-STRUCK SMALL GAMMA
<U213E> <U0393> % DOUBLE-STRUCK CAPITAL GAMMA
<U213F> <U03A0> % DOUBLE-STRUCK CAPITAL PI
@@ -238,6 +237,7 @@ translit_start
<U0001D4BE> <U0069> % MATHEMATICAL SCRIPT SMALL I
<U0001D4BF> <U006A> % MATHEMATICAL SCRIPT SMALL J
<U0001D4C0> <U006B> % MATHEMATICAL SCRIPT SMALL K
<U0001D4C1> <U006C> % MATHEMATICAL SCRIPT SMALL L
<U0001D4C2> <U006D> % MATHEMATICAL SCRIPT SMALL M
<U0001D4C3> <U006E> % MATHEMATICAL SCRIPT SMALL N
<U0001D4C5> <U0070> % MATHEMATICAL SCRIPT SMALL P
@@ -707,6 +707,8 @@ translit_start
<U0001D6A1> <U0078> % MATHEMATICAL MONOSPACE SMALL X
<U0001D6A2> <U0079> % MATHEMATICAL MONOSPACE SMALL Y
<U0001D6A3> <U007A> % MATHEMATICAL MONOSPACE SMALL Z
<U0001D6A4> <U0131> % MATHEMATICAL ITALIC SMALL DOTLESS I
<U0001D6A5> <U0237> % MATHEMATICAL ITALIC SMALL DOTLESS J
<U0001D6A8> <U0391> % MATHEMATICAL BOLD CAPITAL ALPHA
<U0001D6A9> <U0392> % MATHEMATICAL BOLD CAPITAL BETA
<U0001D6AA> <U0393> % MATHEMATICAL BOLD CAPITAL GAMMA
@@ -997,6 +999,8 @@ translit_start
<U0001D7C7> <U03D5> % MATHEMATICAL SANS-SERIF BOLD ITALIC PHI SYMBOL
<U0001D7C8> <U03F1> % MATHEMATICAL SANS-SERIF BOLD ITALIC RHO SYMBOL
<U0001D7C9> <U03D6> % MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL
<U0001D7CA> <U03DC> % MATHEMATICAL BOLD CAPITAL DIGAMMA
<U0001D7CB> <U03DD> % MATHEMATICAL BOLD SMALL DIGAMMA
<U0001D7CE> <U0030> % MATHEMATICAL BOLD DIGIT ZERO
<U0001D7CF> <U0031> % MATHEMATICAL BOLD DIGIT ONE
<U0001D7D0> <U0032> % MATHEMATICAL BOLD DIGIT TWO
@@ -1047,6 +1051,147 @@ translit_start
<U0001D7FD> <U0037> % MATHEMATICAL MONOSPACE DIGIT SEVEN
<U0001D7FE> <U0038> % MATHEMATICAL MONOSPACE DIGIT EIGHT
<U0001D7FF> <U0039> % MATHEMATICAL MONOSPACE DIGIT NINE
<U0001EE00> <U0627> % ARABIC MATHEMATICAL ALEF
<U0001EE01> <U0628> % ARABIC MATHEMATICAL BEH
<U0001EE02> <U062C> % ARABIC MATHEMATICAL JEEM
<U0001EE03> <U062F> % ARABIC MATHEMATICAL DAL
<U0001EE05> <U0648> % ARABIC MATHEMATICAL WAW
<U0001EE06> <U0632> % ARABIC MATHEMATICAL ZAIN
<U0001EE07> <U062D> % ARABIC MATHEMATICAL HAH
<U0001EE08> <U0637> % ARABIC MATHEMATICAL TAH
<U0001EE09> <U064A> % ARABIC MATHEMATICAL YEH
<U0001EE0A> <U0643> % ARABIC MATHEMATICAL KAF
<U0001EE0B> <U0644> % ARABIC MATHEMATICAL LAM
<U0001EE0C> <U0645> % ARABIC MATHEMATICAL MEEM
<U0001EE0D> <U0646> % ARABIC MATHEMATICAL NOON
<U0001EE0E> <U0633> % ARABIC MATHEMATICAL SEEN
<U0001EE0F> <U0639> % ARABIC MATHEMATICAL AIN
<U0001EE10> <U0641> % ARABIC MATHEMATICAL FEH
<U0001EE11> <U0635> % ARABIC MATHEMATICAL SAD
<U0001EE12> <U0642> % ARABIC MATHEMATICAL QAF
<U0001EE13> <U0631> % ARABIC MATHEMATICAL REH
<U0001EE14> <U0634> % ARABIC MATHEMATICAL SHEEN
<U0001EE15> <U062A> % ARABIC MATHEMATICAL TEH
<U0001EE16> <U062B> % ARABIC MATHEMATICAL THEH
<U0001EE17> <U062E> % ARABIC MATHEMATICAL KHAH
<U0001EE18> <U0630> % ARABIC MATHEMATICAL THAL
<U0001EE19> <U0636> % ARABIC MATHEMATICAL DAD
<U0001EE1A> <U0638> % ARABIC MATHEMATICAL ZAH
<U0001EE1B> <U063A> % ARABIC MATHEMATICAL GHAIN
<U0001EE1C> <U066E> % ARABIC MATHEMATICAL DOTLESS BEH
<U0001EE1D> <U06BA> % ARABIC MATHEMATICAL DOTLESS NOON
<U0001EE1E> <U06A1> % ARABIC MATHEMATICAL DOTLESS FEH
<U0001EE1F> <U066F> % ARABIC MATHEMATICAL DOTLESS QAF
<U0001EE21> <U0628> % ARABIC MATHEMATICAL INITIAL BEH
<U0001EE22> <U062C> % ARABIC MATHEMATICAL INITIAL JEEM
<U0001EE24> <U0647> % ARABIC MATHEMATICAL INITIAL HEH
<U0001EE27> <U062D> % ARABIC MATHEMATICAL INITIAL HAH
<U0001EE29> <U064A> % ARABIC MATHEMATICAL INITIAL YEH
<U0001EE2A> <U0643> % ARABIC MATHEMATICAL INITIAL KAF
<U0001EE2B> <U0644> % ARABIC MATHEMATICAL INITIAL LAM
<U0001EE2C> <U0645> % ARABIC MATHEMATICAL INITIAL MEEM
<U0001EE2D> <U0646> % ARABIC MATHEMATICAL INITIAL NOON
<U0001EE2E> <U0633> % ARABIC MATHEMATICAL INITIAL SEEN
<U0001EE2F> <U0639> % ARABIC MATHEMATICAL INITIAL AIN
<U0001EE30> <U0641> % ARABIC MATHEMATICAL INITIAL FEH
<U0001EE31> <U0635> % ARABIC MATHEMATICAL INITIAL SAD
<U0001EE32> <U0642> % ARABIC MATHEMATICAL INITIAL QAF
<U0001EE34> <U0634> % ARABIC MATHEMATICAL INITIAL SHEEN
<U0001EE35> <U062A> % ARABIC MATHEMATICAL INITIAL TEH
<U0001EE36> <U062B> % ARABIC MATHEMATICAL INITIAL THEH
<U0001EE37> <U062E> % ARABIC MATHEMATICAL INITIAL KHAH
<U0001EE39> <U0636> % ARABIC MATHEMATICAL INITIAL DAD
<U0001EE3B> <U063A> % ARABIC MATHEMATICAL INITIAL GHAIN
<U0001EE42> <U062C> % ARABIC MATHEMATICAL TAILED JEEM
<U0001EE47> <U062D> % ARABIC MATHEMATICAL TAILED HAH
<U0001EE49> <U064A> % ARABIC MATHEMATICAL TAILED YEH
<U0001EE4B> <U0644> % ARABIC MATHEMATICAL TAILED LAM
<U0001EE4D> <U0646> % ARABIC MATHEMATICAL TAILED NOON
<U0001EE4E> <U0633> % ARABIC MATHEMATICAL TAILED SEEN
<U0001EE4F> <U0639> % ARABIC MATHEMATICAL TAILED AIN
<U0001EE51> <U0635> % ARABIC MATHEMATICAL TAILED SAD
<U0001EE52> <U0642> % ARABIC MATHEMATICAL TAILED QAF
<U0001EE54> <U0634> % ARABIC MATHEMATICAL TAILED SHEEN
<U0001EE57> <U062E> % ARABIC MATHEMATICAL TAILED KHAH
<U0001EE59> <U0636> % ARABIC MATHEMATICAL TAILED DAD
<U0001EE5B> <U063A> % ARABIC MATHEMATICAL TAILED GHAIN
<U0001EE5D> <U06BA> % ARABIC MATHEMATICAL TAILED DOTLESS NOON
<U0001EE5F> <U066F> % ARABIC MATHEMATICAL TAILED DOTLESS QAF
<U0001EE61> <U0628> % ARABIC MATHEMATICAL STRETCHED BEH
<U0001EE62> <U062C> % ARABIC MATHEMATICAL STRETCHED JEEM
<U0001EE64> <U0647> % ARABIC MATHEMATICAL STRETCHED HEH
<U0001EE67> <U062D> % ARABIC MATHEMATICAL STRETCHED HAH
<U0001EE68> <U0637> % ARABIC MATHEMATICAL STRETCHED TAH
<U0001EE69> <U064A> % ARABIC MATHEMATICAL STRETCHED YEH
<U0001EE6A> <U0643> % ARABIC MATHEMATICAL STRETCHED KAF
<U0001EE6C> <U0645> % ARABIC MATHEMATICAL STRETCHED MEEM
<U0001EE6D> <U0646> % ARABIC MATHEMATICAL STRETCHED NOON
<U0001EE6E> <U0633> % ARABIC MATHEMATICAL STRETCHED SEEN
<U0001EE6F> <U0639> % ARABIC MATHEMATICAL STRETCHED AIN
<U0001EE70> <U0641> % ARABIC MATHEMATICAL STRETCHED FEH
<U0001EE71> <U0635> % ARABIC MATHEMATICAL STRETCHED SAD
<U0001EE72> <U0642> % ARABIC MATHEMATICAL STRETCHED QAF
<U0001EE74> <U0634> % ARABIC MATHEMATICAL STRETCHED SHEEN
<U0001EE75> <U062A> % ARABIC MATHEMATICAL STRETCHED TEH
<U0001EE76> <U062B> % ARABIC MATHEMATICAL STRETCHED THEH
<U0001EE77> <U062E> % ARABIC MATHEMATICAL STRETCHED KHAH
<U0001EE79> <U0636> % ARABIC MATHEMATICAL STRETCHED DAD
<U0001EE7A> <U0638> % ARABIC MATHEMATICAL STRETCHED ZAH
<U0001EE7B> <U063A> % ARABIC MATHEMATICAL STRETCHED GHAIN
<U0001EE7C> <U066E> % ARABIC MATHEMATICAL STRETCHED DOTLESS BEH
<U0001EE7E> <U06A1> % ARABIC MATHEMATICAL STRETCHED DOTLESS FEH
<U0001EE80> <U0627> % ARABIC MATHEMATICAL LOOPED ALEF
<U0001EE81> <U0628> % ARABIC MATHEMATICAL LOOPED BEH
<U0001EE82> <U062C> % ARABIC MATHEMATICAL LOOPED JEEM
<U0001EE83> <U062F> % ARABIC MATHEMATICAL LOOPED DAL
<U0001EE84> <U0647> % ARABIC MATHEMATICAL LOOPED HEH
<U0001EE85> <U0648> % ARABIC MATHEMATICAL LOOPED WAW
<U0001EE86> <U0632> % ARABIC MATHEMATICAL LOOPED ZAIN
<U0001EE87> <U062D> % ARABIC MATHEMATICAL LOOPED HAH
<U0001EE88> <U0637> % ARABIC MATHEMATICAL LOOPED TAH
<U0001EE89> <U064A> % ARABIC MATHEMATICAL LOOPED YEH
<U0001EE8B> <U0644> % ARABIC MATHEMATICAL LOOPED LAM
<U0001EE8C> <U0645> % ARABIC MATHEMATICAL LOOPED MEEM
<U0001EE8D> <U0646> % ARABIC MATHEMATICAL LOOPED NOON
<U0001EE8E> <U0633> % ARABIC MATHEMATICAL LOOPED SEEN
<U0001EE8F> <U0639> % ARABIC MATHEMATICAL LOOPED AIN
<U0001EE90> <U0641> % ARABIC MATHEMATICAL LOOPED FEH
<U0001EE91> <U0635> % ARABIC MATHEMATICAL LOOPED SAD
<U0001EE92> <U0642> % ARABIC MATHEMATICAL LOOPED QAF
<U0001EE93> <U0631> % ARABIC MATHEMATICAL LOOPED REH
<U0001EE94> <U0634> % ARABIC MATHEMATICAL LOOPED SHEEN
<U0001EE95> <U062A> % ARABIC MATHEMATICAL LOOPED TEH
<U0001EE96> <U062B> % ARABIC MATHEMATICAL LOOPED THEH
<U0001EE97> <U062E> % ARABIC MATHEMATICAL LOOPED KHAH
<U0001EE98> <U0630> % ARABIC MATHEMATICAL LOOPED THAL
<U0001EE99> <U0636> % ARABIC MATHEMATICAL LOOPED DAD
<U0001EE9A> <U0638> % ARABIC MATHEMATICAL LOOPED ZAH
<U0001EE9B> <U063A> % ARABIC MATHEMATICAL LOOPED GHAIN
<U0001EEA1> <U0628> % ARABIC MATHEMATICAL DOUBLE-STRUCK BEH
<U0001EEA2> <U062C> % ARABIC MATHEMATICAL DOUBLE-STRUCK JEEM
<U0001EEA3> <U062F> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAL
<U0001EEA5> <U0648> % ARABIC MATHEMATICAL DOUBLE-STRUCK WAW
<U0001EEA6> <U0632> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAIN
<U0001EEA7> <U062D> % ARABIC MATHEMATICAL DOUBLE-STRUCK HAH
<U0001EEA8> <U0637> % ARABIC MATHEMATICAL DOUBLE-STRUCK TAH
<U0001EEA9> <U064A> % ARABIC MATHEMATICAL DOUBLE-STRUCK YEH
<U0001EEAB> <U0644> % ARABIC MATHEMATICAL DOUBLE-STRUCK LAM
<U0001EEAC> <U0645> % ARABIC MATHEMATICAL DOUBLE-STRUCK MEEM
<U0001EEAD> <U0646> % ARABIC MATHEMATICAL DOUBLE-STRUCK NOON
<U0001EEAE> <U0633> % ARABIC MATHEMATICAL DOUBLE-STRUCK SEEN
<U0001EEAF> <U0639> % ARABIC MATHEMATICAL DOUBLE-STRUCK AIN
<U0001EEB0> <U0641> % ARABIC MATHEMATICAL DOUBLE-STRUCK FEH
<U0001EEB1> <U0635> % ARABIC MATHEMATICAL DOUBLE-STRUCK SAD
<U0001EEB2> <U0642> % ARABIC MATHEMATICAL DOUBLE-STRUCK QAF
<U0001EEB3> <U0631> % ARABIC MATHEMATICAL DOUBLE-STRUCK REH
<U0001EEB4> <U0634> % ARABIC MATHEMATICAL DOUBLE-STRUCK SHEEN
<U0001EEB5> <U062A> % ARABIC MATHEMATICAL DOUBLE-STRUCK TEH
<U0001EEB6> <U062B> % ARABIC MATHEMATICAL DOUBLE-STRUCK THEH
<U0001EEB7> <U062E> % ARABIC MATHEMATICAL DOUBLE-STRUCK KHAH
<U0001EEB8> <U0630> % ARABIC MATHEMATICAL DOUBLE-STRUCK THAL
<U0001EEB9> <U0636> % ARABIC MATHEMATICAL DOUBLE-STRUCK DAD
<U0001EEBA> <U0638> % ARABIC MATHEMATICAL DOUBLE-STRUCK ZAH
<U0001EEBB> <U063A> % ARABIC MATHEMATICAL DOUBLE-STRUCK GHAIN
translit_end
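These <font> mappings are taken verbatim from the decomposition field of UnicodeData.txt. Python's unicodedata module exposes the same field, so individual entries in the generated table can be spot-checked against it (an illustration, not part of the commit; it assumes an interpreter whose Unicode database already contains these characters):

import unicodedata

# The decomposition of a <font> variant names the tag and the plain
# character it folds to, which is exactly what translit_font encodes.
print(unicodedata.decomposition('\U0001D7FF'))  # '<font> 0039'
print(unicodedata.decomposition('\U0001EE00'))  # '<font> 0627'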

View File: locales/translit_fraction

@ -2,10 +2,7 @@ escape_char /
comment_char %
% Transliterations of fractions.
% Generated through
% $ grep '^[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;<fraction>[^;]*;' UnicodeData.txt | \
% sed -e 's/^\([^;]*\);\([^;]*\);[^;]*;[^;]*;[^;]*;<fraction> \([^;]*\);.*$/<U\1> "<U\3>"% \2/' -e 'h' -e 's/^\([^%]*\)% .*$/\1/' -e 's/\([0-9A-F]\) \([0-9A-F]\)/\1><U\2/g' -e 'x' -e 's/^[^%]*\(% .*\)$/\1/' -e 'G'
%
% Generated automatically from UnicodeData.txt by gen_translit_fraction.py on 2015-12-09 for Unicode 7.0.0.
% The replacements have been surrounded with spaces, because fractions are
% often preceded by a decimal number and followed by a unit or a math symbol.
@ -19,6 +16,12 @@ translit_start
<U00BD> "<U0020><U0031><U2044><U0032><U0020>";"<U0020><U0031><U002F><U0032><U0020>"
% VULGAR FRACTION THREE QUARTERS
<U00BE> "<U0020><U0033><U2044><U0034><U0020>";"<U0020><U0033><U002F><U0034><U0020>"
% VULGAR FRACTION ONE SEVENTH
<U2150> "<U0020><U0031><U2044><U0037><U0020>";"<U0020><U0031><U002F><U0037><U0020>"
% VULGAR FRACTION ONE NINTH
<U2151> "<U0020><U0031><U2044><U0039><U0020>";"<U0020><U0031><U002F><U0039><U0020>"
% VULGAR FRACTION ONE TENTH
<U2152> "<U0020><U0031><U2044><U0031><U0030><U0020>";"<U0020><U0031><U002F><U0031><U0030><U0020>"
% VULGAR FRACTION ONE THIRD
<U2153> "<U0020><U0031><U2044><U0033><U0020>";"<U0020><U0031><U002F><U0033><U0020>"
% VULGAR FRACTION TWO THIRDS
@ -44,7 +47,9 @@ translit_start
% VULGAR FRACTION SEVEN EIGHTHS
<U215E> "<U0020><U0037><U2044><U0038><U0020>";"<U0020><U0037><U002F><U0038><U0020>"
% FRACTION NUMERATOR ONE
<U215F> "<U0020><U0031><U2044>";"<U0020><U0031><U002F>"
<U215F> "<U0020><U0031><U2044><U0020>";"<U0020><U0031><U002F><U0020>"
% VULGAR FRACTION ZERO THIRDS
<U2189> "<U0020><U0030><U2044><U0033><U0020>";"<U0020><U0030><U002F><U0033><U0020>"
translit_end
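The padding spaces in these replacements are easy to motivate with a concrete case: without them, a mixed number such as 1½ would collapse into a misleading digit string. A minimal sketch of the effect (illustrative only; glibc applies these rules inside iconv/localedef, not via Python):

# ASCII fallback forms of the rules above, including their padding.
fractions = {'\u00BD': ' 1/2 ', '\u00BE': ' 3/4 '}

def translit(text):
    return ''.join(fractions.get(ch, ch) for ch in text)

print(translit('1\u00BD'))  # '1 1/2 ' -- still readable
print('1' + '1/2')          # '11/2'   -- what unpadded rules would produce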

View File: unicode-gen/Makefile

@ -41,7 +41,7 @@ PYTHON3 = python3
WGET = wget
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
GENERATED = i18n UTF-8
GENERATED = i18n UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
REPORTS = i18n-report UTF-8-report
all: $(GENERATED)
@ -51,6 +51,12 @@ check: check-i18n check-UTF-8
install:
cp -p i18n ../locales/i18n
cp -p UTF-8 ../charmaps/UTF-8
cp -p translit_combining ../locales/translit_combining
cp -p translit_compat ../locales/translit_compat
cp -p translit_circle ../locales/translit_circle
cp -p translit_cjk_compat ../locales/translit_cjk_compat
cp -p translit_font ../locales/translit_font
cp -p translit_fraction ../locales/translit_fraction
clean: mostlyclean
-rm -rf __pycache__
@ -82,13 +88,43 @@ UTF-8: utf8_gen.py
UTF-8-report: UTF-8 ../charmaps/UTF-8
UTF-8-report: utf8_compatibility.py
$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
-n UTF-8 -a -m > $@
$(PYTHON3) ./utf8_compatibility.py -u UnicodeData.txt \
-e EastAsianWidth.txt -o ../charmaps/UTF-8 \
-n UTF-8 -a -m -c > $@
check-UTF-8: UTF-8-report
@if grep '^Total.*: [^0]' UTF-8-report; \
then echo manual verification required; false; else true; fi
translit_combining: UnicodeData.txt
translit_combining: gen_translit_combining.py
$(PYTHON3) ./gen_translit_combining.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
translit_compat: UnicodeData.txt
translit_compat: gen_translit_compat.py
$(PYTHON3) ./gen_translit_compat.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
translit_circle: UnicodeData.txt
translit_circle: gen_translit_circle.py
$(PYTHON3) ./gen_translit_circle.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
translit_cjk_compat: UnicodeData.txt
translit_cjk_compat: gen_translit_cjk_compat.py
$(PYTHON3) ./gen_translit_cjk_compat.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
translit_font: UnicodeData.txt
translit_font: gen_translit_font.py
$(PYTHON3) ./gen_translit_font.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
translit_fraction: UnicodeData.txt
translit_fraction: gen_translit_fraction.py
$(PYTHON3) ./gen_translit_fraction.py -u UnicodeData.txt \
-o $@ --unicode_version $(UNICODE_VERSION)
.PHONY: downloads clean-downloads
downloads: $(DOWNLOADS)
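Each of the six new targets follows the same pattern: run one generator against UnicodeData.txt and pass the Unicode version through. Outside of make, the equivalent regeneration could be scripted in a few lines of Python (a sketch; it assumes the generator scripts and UnicodeData.txt are in the current directory and that UNICODE_VERSION is 7.0.0, as in this commit):

import subprocess

for name in ('translit_combining', 'translit_compat', 'translit_circle',
             'translit_cjk_compat', 'translit_font', 'translit_fraction'):
    # Mirrors the make rules above:
    # gen_<name>.py -u UnicodeData.txt -o <name> --unicode_version 7.0.0
    subprocess.run(['python3', './gen_{}.py'.format(name),
                    '-u', 'UnicodeData.txt', '-o', name,
                    '--unicode_version', '7.0.0'],
                   check=True)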

View File: unicode-gen/gen_translit_circle.py

@ -0,0 +1,150 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_circle file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_circle file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_circle.py -h
prints usage message
'''
import argparse
import time
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_circle file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations of encircled characters.\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_circle.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition.startswith('<circle>'):
decomposition = decomposition[9:]
decomposed_code_points = [int(x, 16)
for x in decomposition.split(' ')]
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} "<U0028>'.format(
unicode_utils.ucs_symbol(code_point)))
for decomposed_code_point in decomposed_code_points:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
translit_file.write('<U0029>"\n')
translit_file.write('\n')
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_circle file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_circle
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_circle.new',
help='''The new translit_circle file, default: %(default)s. If the
original glibc/localedata/locales/translit_circle file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)
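The script leans on unicode_utils.ucs_symbol() for the <U....> spelling of code points; that helper lives in the new unicode-gen/unicode_utils.py, which is not shown in this excerpt. Judging from the generated tables, its behavior amounts to the following stand-in (an assumption, not the actual implementation):

def ucs_symbol(code_point):
    # Four hex digits within the BMP, eight above it, matching the
    # generated tables (<U24EA> vs. <U0001F12B>).
    if code_point <= 0xFFFF:
        return '<U{:04X}>'.format(code_point)
    return '<U{:08X}>'.format(code_point)

print(ucs_symbol(0x24EA))   # <U24EA>
print(ucs_symbol(0x1F12B))  # <U0001F12B>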

View File: unicode-gen/gen_translit_cjk_compat.py

@ -0,0 +1,220 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_cjk_compat file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_cjk_compat file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_cjk_compat.py -h
prints usage message
'''
import argparse
import time
import sys
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_cjk_compat file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations of CJK compatibility ')
translit_file.write('characters.\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_cjk_compat.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def special_decompose(code_point_list):
'''
Decompositions which are not in UnicodeData.txt at all but which
were used in the original translit_cjk_compat file in glibc and
which seem to make sense. I want to keep the update of
translit_cjk_compat close to the spirit of the original file,
therefore I added these special decomposition rules here.
'''
special_decompose_dict = {
(0x2215,): [0x002F], # ∕ → /
(0x00B2,): [0x005E, 0x0032], # ² → ^2
(0x03BC,): [0x00B5], # μ → µ (GREEK SMALL LETTER MU → MICRO SIGN)
(0x2113,): [0x006C], # ℓ → l
(0x00B3,): [0x005E, 0x0033], # ³ → ^3
(0x00B5,): [0x0075], # µ → u
(0x03BC, 0x2113): [0x03BC, 0x006C], # μℓ → μl
(0x0072, 0x0061, 0x0064, 0x2215, 0x0073, 0x00B2): [
0x0072, 0x0061, 0x0064, 0x002F, 0x0073, 0x00B2],
(0x006D, 0x2215, 0x0073, 0x00B2): [0x006D, 0x002F, 0x0073, 0x00B2],
}
if tuple(code_point_list) in special_decompose_dict:
return special_decompose_dict[tuple(code_point_list)]
else:
return code_point_list
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition.startswith('<square>'):
decomposition = decomposition[9:]
decomposed_code_points = [[int(x, 16)
for x in decomposition.split(' ')]]
if decomposed_code_points[0]:
while True:
special_decomposed_code_points = special_decompose(
decomposed_code_points[-1])
if (special_decomposed_code_points
!= decomposed_code_points[-1]):
decomposed_code_points.append(
special_decomposed_code_points)
continue
special_decomposed_code_points = []
for decomposed_code_point in decomposed_code_points[-1]:
special_decomposed_code_points += special_decompose(
[decomposed_code_point])
if (special_decomposed_code_points
== decomposed_code_points[-1]):
break
decomposed_code_points.append(
special_decomposed_code_points)
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for index in range(0, len(decomposed_code_points)):
if index > 0:
translit_file.write(';')
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
for decomposed_code_point in decomposed_code_points[index]:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition and name.startswith('CJK COMPATIBILITY IDEOGRAPH'):
decomposed_code_points = [int(x, 16)
for x in decomposition.split(' ')]
if len(decomposed_code_points) != 1:
sys.stderr.write(
'Unexpected decomposition length {:x} {:s} {:s}\n'.format(
code_point, name, decomposition))
exit(1)
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for decomposed_code_point in decomposed_code_points:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
translit_file.write('\n')
translit_file.write('\n')
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_cjk_compat file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_cjk_compat
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_cjk_compat.new',
help='''The new translit_cjk_compat file, default: %(default)s. If the
original glibc/localedata/locales/translit_cjk_compat file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)
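The while loop in output_transliteration() deserves a note: it applies special_decompose() repeatedly, first to the whole sequence and then code point by code point, until a fixed point is reached, and every distinct stage becomes one more ';'-separated fallback in the output. A condensed sketch with a one-rule table (illustrative; the real rule table is the dict above):

special = {(0x2113,): [0x006C]}  # ℓ → l

def special_decompose(seq):
    return special.get(tuple(seq), list(seq))

def variants(seq):
    '''Collect the successive decomposition stages of seq.'''
    result = [list(seq)]
    while True:
        nxt = special_decompose(result[-1])
        if nxt != result[-1]:
            result.append(nxt)
            continue
        nxt = []
        for code_point in result[-1]:
            nxt += special_decompose([code_point])
        if nxt == result[-1]:
            break
        result.append(nxt)
    return result

# U+3396 SQUARE ML has the decomposition <square> 006D 2113 ("mℓ"):
print([''.join(map(chr, v)) for v in variants([0x006D, 0x2113])])
# ['mℓ', 'ml'] -- each stage becomes one fallback in the generated file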

View File: unicode-gen/gen_translit_combining.py

@ -0,0 +1,442 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_combining file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_combining file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_combining.py -h
prints usage message
'''
import argparse
import time
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_combining file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations that remove all ')
translit_file.write('combining characters (accents,\n')
translit_file.write('% pronunciation marks, etc.).\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_combining.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def is_combining_remove(code_point):
'''Check whether this is a combining character which should be listed
in the section of the translit_combining file where combining
characters are replaced by empty strings.
We ignore combining characters from many scripts here because
the original translit_combining file didn't do this for the
combining characters from these scripts either and I am not
sure yet whether this would be useful to do for all combining
characters or not. For the moment I think it is better to keep
close to the spirit of the original file.
'''
if not unicode_utils.is_combining(code_point):
return False
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
for substring in ('DEVANAGARI',
'BENGALI',
'CYRILLIC',
'SYRIAC',
'THAANA',
'NKO',
'GURMUKHI',
'TAMIL',
'GUJARATI',
'ORIYA',
'TELUGU',
'KANNADA',
'MALAYALAM',
'SINHALA',
'THAI',
'LAO',
'TIBETAN',
'MYANMAR',
'ETHIOPIC',
'TAGALOG',
'HANUNOO',
'BUHID',
'TAGBANWA',
'KHMER',
'MONGOLIAN',
'LIMBU',
'NEW TAI LUE',
'BUGINESE',
'BALINESE',
'SUNDANESE',
'LEPCHA',
'IDEOGRAPHIC',
'HANGUL',
'SYLOTI',
'SAURASHTRA',
'KAYAH',
'REJANG',
'CHAM',
'VARIATION SELECTOR',
'KHAROSHTHI',
'MUSICAL SYMBOL',
'SAMARITAN',
'MANDAIC',
'TAI THAM',
'BATAK',
'VEDIC',
'COPTIC',
'TIFINAGH',
'BAMUM',
'JAVANESE',
'TAI VIET',
'MEETEI',
'MANICHAEAN',
'BRAHMI',
'KAITHI',
'CHAKMA',
'MAHAJANI',
'SHARADA',
'KHOJKI',
'KHUDAWADI',
'GRANTHA',
'TIRHUTA',
'SIDDHAM',
'MODI VOWEL',
'MODI SIGN',
'TAKRI',
'BASSA VAH',
'PAHAWH HMONG',
'MIAO',
'DUPLOYAN',
'MENDE KIKAKUI'
):
if substring in name:
return False
return True
def canonical_decompose(code_point):
'''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
In some instances a canonical mapping or a compatibility mapping
may consist of a single character. For a canonical mapping, this
indicates that the character is a canonical equivalent of another
single character. For a compatibility mapping, this indicates that
the character is a compatibility equivalent of another single
character.
A canonical mapping may also consist of a pair of characters, but
is never longer than two characters. When a canonical mapping
consists of a pair of characters, the first character may itself
be a character with a decomposition mapping, but the second
character never has a decomposition mapping.
We ignore the canonical decomposition for code points
matching certain substrings because the original translit_combining
file didn't include these types of characters either. I am unsure
about the usefulness of including them and want to keep close
to the spirit of the original file for the moment.
'''
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
for substring in ('MUSICAL SYMBOL',
'CJK COMPATIBILITY IDEOGRAPH',
'BALINESE',
'KAITHI LETTER',
'CHAKMA VOWEL',
'GRANTHA VOWEL',
'TIRHUTA VOWEL',
'SIDDHAM VOWEL'):
if substring in name:
return []
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition and not decomposition.startswith('<'):
decomposed_code_points = [int(x, 16) for x in decomposition.split(' ')]
if decomposed_code_points:
cd0 = canonical_decompose(decomposed_code_points[0])
if cd0:
decomposed_code_points = cd0 + decomposed_code_points[1:]
return decomposed_code_points
else:
return []
def special_decompose(code_point_list):
'''
Decompositions which are not canonical or which are not in
UnicodeData.txt at all but some of these were used in the original
translit_combining file in glibc and they seemed to make sense.
I want to keep the update of translit_combining close to the
spirit of the original file, therefore I added these special
decomposition rules here.
'''
special_decompose_dict = {
# Ø U+00D8 is already handled in translit_neutral. But
# translit_combining is usually included after translit_neutral
# and Ǿ U+01FE LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
# has a canonical decomposition to Ø U+00D8 and we want to
# further decompose this to U+004F.
(0x00D8,): [0x004F], # Ø → O
# ø U+00F8 is already handled in translit_neutral. But
# translit_combining is usually included after translit_neutral
# and ǿ U+01FF LATIN SMALL LETTER O WITH STROKE AND ACUTE
# has a canonical decomposition to ø U+00F8 and we want to
# further decompose this to U+006F.
(0x00F8,): [0x006F], # ø → o
# æ U+00E6 is already in translit_compat because ligatures
# are handled in translit_compat. But ǣ U+01E3 has a
# canonical decomposition to U+00E6, U+0304 and we want to
# further decompose this to “ae”.
(0x00E6,): [0x0061, 0x0065], # æ → ae
# Æ U+00C6 is already in translit_compat because ligatures
# are handled in translit_compat. But Ǣ U+01E2 has a
# canonical decomposition to U+00C6, U+0304 and we want to
# further decompose this to “AE”
(0x00C6,): [0x0041, 0x0045], # Æ → AE
# U+05F2 HEBREW LIGATURE YIDDISH DOUBLE YOD is already in
# translit_compat because ligatures are handled in translit_compat.
# But U+FB1F has a canonical decomposition to U+05F2 and
# we want to further decompose this to U+05D9, U+05D9.
(0x05F2,): [0x05D9, 0x05D9], # ײ → יי
# 0x2002 has a <compat> decomposition to 0x0020 in UnicodeData.txt
# But U+2000 EN QUAD has a canonical decomposition U+2002
# and we want to further decompose this to U+0020.
(0x2002,): [0x0020], # EN SPACE → SPACE
# 0x2003 has a <compat> decomposition to 0x0020 in UnicodeData.txt
# But U+2001 EM QUAD has a canonical decomposition to U+2003
# and we want to further decompose this to U+0020.
(0x2003,): [0x0020], # EM SPACE → SPACE
# U+2260 ≠ has the canonical decomposition U+003D U+0338
# (= followed by ̸). After stripping the combining characters,
# the result is only = which reverses the meaning.
# Therefore, we add special rules here for such mathematical
# negations:
(0x21AE,): [0x0021, 0x003C, 0x002D, 0x003E], # ↮ → !<->
(0x21CD,): [0x0021, 0x003C, 0x003D], # ⇍ → !<=
(0x21CE,): [0x0021, 0x003C, 0x003D, 0x003E], # ⇎ → !<=>
(0x21CF,): [0x0021, 0x003D, 0x003E], # ⇏ → !=>
(0x2204,): [0x0021, 0x2203], # ∄ → !∃
(0x2209,): [0x0021, 0x2208], # ∉ → !∈
(0x220C,): [0x0021, 0x220B], # ∌ → !∋
(0x2224,): [0x0021, 0x2223], # ∤ → !∣
(0x2226,): [0x0021, 0x2225], # ∦ → !∥
(0x2241,): [0x0021, 0x007E], # ≁ → !~
(0x2244,): [0x0021, 0x007E, 0x002D], # ≄ → !~-
(0x2247,): [0x0021, 0x007E, 0x003D], # ≇ → !~=
(0x2249,): [0x0021, 0x007E, 0x007E], # ≉ → !~~
(0x2260,): [0x0021, 0x003D], # ≠ → !=
(0x2262,): [0x0021, 0x003D, 0x003D], # ≢ → !==
(0x226D,): [0x0021, 0x224D], # ≭ → !≍
(0x226E,): [0x0021, 0x003C], # ≮ → !<
(0x226F,): [0x0021, 0x003E], # ≯ → !>
(0x2270,): [0x0021, 0x003C, 0x003D], # ≰ → !<=
(0x2271,): [0x0021, 0x003E, 0x003D], # ≱ → !>=
(0x2274,): [0x0021, 0x003C, 0x007E], # ≴ → !<~
(0x2275,): [0x0021, 0x003E, 0x007E], # ≵ → !>~
(0x2278,): [0x0021, 0x003C, 0x003E], # ≸ → !<>
(0x2279,): [0x0021, 0x003E, 0x003C], # ≹ → !><
(0x2280,): [0x0021, 0x227A], # ⊀ → !≺
(0x2281,): [0x0021, 0x227B], # ⊁ → !≻
(0x2284,): [0x0021, 0x2282], # ⊄ → !⊂
(0x2285,): [0x0021, 0x2283], # ⊅ → !⊃
(0x2288,): [0x0021, 0x2282, 0x003D], # ⊈ → !⊂=
(0x2289,): [0x0021, 0x2283, 0x003D], # ⊉ → !⊃=
(0x22AC,): [0x0021, 0x22A2], # ⊬ → !⊢
(0x22AD,): [0x0021, 0x22A8], # ⊭ → !⊨
(0x22AE,): [0x0021, 0x22A9], # ⊮ → !⊩
(0x22AF,): [0x0021, 0x22AB], # ⊯ → !⊫
(0x22E0,): [0x0021, 0x227C], # ⋠ → !≼
(0x22E1,): [0x0021, 0x227D], # ⋡ → !≽
(0x22E2,): [0x0021, 0x2291], # ⋢ → !⊑
(0x22E3,): [0x0021, 0x2292], # ⋣ → !⊒
(0x22EA,): [0x0021, 0x22B2], # ⋪ → !⊲
(0x22EB,): [0x0021, 0x22B3], # ⋫ → !⊳
(0x22EC,): [0x0021, 0x22B4], # ⋬ → !⊴
(0x22ED,): [0x0021, 0x22B5], # ⋭ → !⊵
(0x2ADC,): [0x0021, 0x2ADD], # ⫝̸ → !⫝
# Special rule for 〈 U+3008 is added
# because 〈 U+2329 has the canonical decomposition U+3008
# and we want to further decompose this to < U+003C.
(0x3008,): [0x003C], # 〈 → <
# Special rule for 〉 U+3009 is added
# because 〉 U+232A has the canonical decomposition U+3009
# and we want to further decompose this to > U+003E.
(0x3009,): [0x003E], # 〉 → >
}
if tuple(code_point_list) in special_decompose_dict:
return special_decompose_dict[tuple(code_point_list)]
else:
return code_point_list
def output_combining_remove(translit_file):
'''Write the section of the translit_combining file where combining
characters are replaced by empty strings.
'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
if is_combining_remove(code_point):
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} ""\n'.format(
unicode_utils.ucs_symbol(code_point)))
translit_file.write('\n')
def output_decompositions(translit_file):
'''Write the section of the translit_combining file where characters
are decomposed and combining characters stripped from
the decompositions.
'''
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
if special_decompose([code_point]) != [code_point]:
decomposed_code_points = [special_decompose([code_point])]
else:
decomposed_code_points = [canonical_decompose(code_point)]
if decomposed_code_points[0]:
while True:
special_decomposed_code_points = special_decompose(
decomposed_code_points[-1])
if (special_decomposed_code_points
!= decomposed_code_points[-1]):
decomposed_code_points.append(
special_decomposed_code_points)
continue
special_decomposed_code_points = []
for decomposed_code_point in decomposed_code_points[-1]:
special_decomposed_code_points += special_decompose(
[decomposed_code_point])
if (special_decomposed_code_points
== decomposed_code_points[-1]):
break
decomposed_code_points.append(
special_decomposed_code_points)
for index in range(0, len(decomposed_code_points)):
decomposed_code_points[index] = [
x for x in decomposed_code_points[index]
if not is_combining_remove(x)]
if decomposed_code_points[0]:
translit_file.write('% {:s}\n'.format(
unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for index in range(0, len(decomposed_code_points)):
if index > 0:
translit_file.write(';')
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
for decomposed_code_point in decomposed_code_points[index]:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
translit_file.write('\n')
translit_file.write('\n')
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
output_combining_remove(translit_file)
output_decompositions(translit_file)
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_combining file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_combining
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_combining.new',
help='''The new translit_combining file, default: %(default)s. If the
original glibc/localedata/locales/translit_combining file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)
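The recursion in canonical_decompose() matters because, as the TR44 passage quoted above says, the first character of a canonical pair may itself decompose further. Python's unicodedata shows such a chain directly (a spot-check, not part of the commit):

import unicodedata

# U+1E69 LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE
print(unicodedata.decomposition('\u1E69'))  # '1E63 0307'
# ...and the first element of that pair decomposes again:
print(unicodedata.decomposition('\u1E63'))  # '0073 0323'
# After full recursion and stripping of the combining dots, 's' remains.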

View File: unicode-gen/gen_translit_compat.py

@ -0,0 +1,326 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_compat file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_compat file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_compat.py -h
prints usage message
'''
import argparse
import time
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_compat file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations of compatibility characters ')
translit_file.write('and ligatures.\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_compat.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def compatibility_decompose(code_point):
'''http://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
The compatibility decomposition is formed by recursively applying
the canonical and compatibility mappings, then applying the
Canonical Ordering Algorithm.
We don't do the canonical decomposition here because this is
done in gen_translit_combining.py to generate translit_combining.
And we ignore some of the possible compatibility formatting tags
here. Some of them are used in other translit_* files, not
translit_compat:
<font>: translit_font
<circle>: translit_circle
<wide>: translit_wide
<narrow>: translit_narrow
<square>: translit_cjk_compat
<fraction>: translit_fraction
And we ignore
<noBreak>, <initial>, <medial>, <final>, <isolated>
because they seem to be not useful for transliteration.
'''
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
compatibility_tags = (
'<compat>', '<super>', '<sub>', '<vertical>')
for compatibility_tag in compatibility_tags:
if decomposition.startswith(compatibility_tag):
decomposition = decomposition[len(compatibility_tag)+1:]
decomposed_code_points = [int(x, 16)
for x in decomposition.split(' ')]
if (len(decomposed_code_points) > 1
and decomposed_code_points[0] == 0x0020
and decomposed_code_points[1] >= 0x0300
and decomposed_code_points[1] <= 0x03FF):
# Decomposes into a space followed by a combining character.
# This is not useful for transliteration.
return []
else:
return_value = []
for index in range(0, len(decomposed_code_points)):
cd_code_points = compatibility_decompose(
decomposed_code_points[index])
if cd_code_points:
return_value += cd_code_points
else:
return_value += [decomposed_code_points[index]]
return return_value
return []
def special_decompose(code_point_list):
'''
Decompositions which are not in UnicodeData.txt at all but which
were used in the original translit_compat file in glibc and
which seem to make sense. I want to keep the update of
translit_compat close to the spirit of the original file,
therefore I added these special decomposition rules here.
'''
special_decompose_dict = {
(0x03BC,): [0x0075], # μ → u
(0x02BC,): [0x0027], # ʼ → '
}
if tuple(code_point_list) in special_decompose_dict:
return special_decompose_dict[tuple(code_point_list)]
else:
return code_point_list
def special_ligature_decompose(code_point):
'''
Decompositions for ligatures which are not in UnicodeData.txt at
all but which were used in the original translit_compat file in
glibc and which seem to make sense. I want to keep the update of
translit_compat close to the spirit of the original file,
therefore I added these special ligature decomposition rules here.
'''
special_ligature_decompose_dict = {
0x00E6: [0x0061, 0x0065], # æ → ae
0x00C6: [0x0041, 0x0045], # Æ → AE
# These following 5 special ligature decompositions were
# in the original glibc/localedata/locales/translit_compat file
0x0152: [0x004F, 0x0045], # Œ → OE
0x0153: [0x006F, 0x0065], # œ → oe
0x05F0: [0x05D5, 0x05D5], # װ → וו
0x05F1: [0x05D5, 0x05D9], # ױ → וי
0x05F2: [0x05D9, 0x05D9], # ײ → יי
# The following special ligature decompositions were
# not in the original glibc/localedata/locales/translit_compat file
# U+04A4 CYRILLIC CAPITAL LIGATURE EN GHE
# → U+041D CYRILLIC CAPITAL LETTER EN,
# U+0413 CYRILLIC CAPITAL LETTER GHE
0x04A4: [0x041D, 0x0413], # Ҥ → НГ
# U+04A5 CYRILLIC SMALL LIGATURE EN GHE
# → U+043D CYRILLIC SMALL LETTER EN,
# U+0433 CYRILLIC SMALL LETTER GHE
0x04A5: [0x043D, 0x0433], # ҥ → нг
# U+04B4 CYRILLIC CAPITAL LIGATURE TE TSE
# → U+0422 CYRILLIC CAPITAL LETTER TE,
# U+0426 CYRILLIC CAPITAL LETTER TSE
0x04B4: [0x0422, 0x0426], # Ҵ → ТЦ
# U+04B5 CYRILLIC SMALL LIGATURE TE TSE
# → U+0442 CYRILLIC SMALL LETTER TE,
# U+0446 CYRILLIC SMALL LETTER TSE
0x04B5: [0x0442, 0x0446], # ҵ → тц
# U+04d4 CYRILLIC CAPITAL LIGATURE A IE
# → U+0410 CYRILLIC CAPITAL LETTER A
# U+0415;CYRILLIC CAPITAL LETTER IE
0x04D4: [0x0410, 0x0415], # Ӕ → АЕ
# U+04D5 CYRILLIC SMALL LIGATURE A IE
# → U+0430 CYRILLIC SMALL LETTER A,
# U+0435 CYRILLIC SMALL LETTER IE
0x04D5: [0x0430, 0x0435], # ӕ → ае
# I am not sure what to do with the following ligatures;
# maybe it makes no sense to decompose them:
# U+0616 ARABIC SMALL HIGH LIGATURE ALEF WITH LAM WITH YEH
# U+06d6 ARABIC SMALL HIGH LIGATURE SAD WITH LAM WITH ALEF MAKSURA
# U+06d7 ARABIC SMALL HIGH LIGATURE QAF WITH LAM WITH ALEF MAKSURA
# U+fdfd ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
# U+fe20 COMBINING LIGATURE LEFT HALF
# U+fe21 COMBINING LIGATURE RIGHT HALF
# U+fe27 COMBINING LIGATURE LEFT HALF BELOW
# U+fe28 COMBINING LIGATURE RIGHT HALF BELOW
# U+11176 MAHAJANI LIGATURE SHRI
# U+1f670 SCRIPT LIGATURE ET ORNAMENT
# U+1f671 HEAVY SCRIPT LIGATURE ET ORNAMENT
# U+1f672 LIGATURE OPEN ET ORNAMENT
# U+1f673 HEAVY LIGATURE OPEN ET ORNAMENT
}
if code_point in special_ligature_decompose_dict:
return special_ligature_decompose_dict[code_point]
else:
return [code_point]
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposed_code_points = [compatibility_decompose(code_point)]
if not decomposed_code_points[0]:
if special_decompose([code_point]) != [code_point]:
decomposed_code_points[0] = special_decompose([code_point])
else:
special_decomposed_code_points = []
while True:
special_decomposed_code_points = special_decompose(
decomposed_code_points[-1])
if (special_decomposed_code_points
!= decomposed_code_points[-1]):
decomposed_code_points.append(
special_decomposed_code_points)
continue
special_decomposed_code_points = []
for decomposed_code_point in decomposed_code_points[-1]:
special_decomposed_code_points += special_decompose(
[decomposed_code_point])
if (special_decomposed_code_points
== decomposed_code_points[-1]):
break
decomposed_code_points.append(
special_decomposed_code_points)
if decomposed_code_points[0]:
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for index in range(0, len(decomposed_code_points)):
if index > 0:
translit_file.write(';')
translit_file.write('"')
for decomposed_code_point in decomposed_code_points[index]:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
translit_file.write('"')
translit_file.write('\n')
elif 'LIGATURE' in name and 'ARABIC' not in name:
decomposed_code_points = special_ligature_decompose(code_point)
if decomposed_code_points[0] != code_point:
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
translit_file.write('"')
for decomposed_code_point in decomposed_code_points:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
translit_file.write('"')
translit_file.write('\n')
else:
print('Warning: unhandled ligature: {:x} {:s}'.format(
code_point, name))
translit_file.write('\n')
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_compat file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_compat
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_compat.new',
help='''The new translit_compat file, default: %(default)s. If the
original glibc/localedata/locales/translit_compat file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)
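compatibility_decompose() honors only a handful of formatting tags, and the tag is visible in the raw decomposition field that unicodedata also reports (illustrative spot-checks, not part of the commit):

import unicodedata

print(unicodedata.decomposition('\uFB01'))  # '<compat> 0066 0069' (fi ligature)
print(unicodedata.decomposition('\u00B5'))  # '<compat> 03BC' (micro sign)
# The special rule (0x03BC,) -> [0x0075] above then turns Greek mu into
# plain 'u', so MICRO SIGN is ultimately transliterated as "u".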

View File: unicode-gen/gen_translit_font.py

@ -0,0 +1,156 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_font file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_font file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_font.py -h
prints usage message
'''
import argparse
import time
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_font file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations of font equivalents.\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_font.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition.startswith('<font>'):
decomposition = decomposition[7:]
decomposed_code_points = [[int(x, 16)
for x in decomposition.split(' ')]]
if decomposed_code_points[0]:
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for index in range(0, len(decomposed_code_points)):
if index > 0:
translit_file.write(';')
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
for decomposed_code_point in decomposed_code_points[index]:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
translit_file.write(' % {:s}\n'.format(name))
translit_file.write('\n')
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_font file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_font
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_font.new',
help='''The new translit_font file, default: %(default)s. If the
original glibc/localedata/locales/translit_font file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)

View File: unicode-gen/gen_translit_fraction.py

@ -0,0 +1,197 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
#
# Generate a translit_fraction file from a UnicodeData file.
# Copyright (C) 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
Generate a translit_fraction file from UnicodeData.txt
To see how this script is used, call it with the -h option:
$ ./gen_translit_fraction.py -h
prints usage message
'''
import argparse
import time
import unicode_utils
def read_input_file(filename):
'''Reads the original glibc translit_fraction file to get the
original head and tail.
We want to replace only the part of the file between
translit_start and translit_end
'''
head = tail = ''
with open(filename, mode='r') as translit_file:
for line in translit_file:
head = head + line
if line.startswith('translit_start'):
break
for line in translit_file:
if line.startswith('translit_end'):
tail = line
break
for line in translit_file:
tail = tail + line
return (head, tail)
def output_head(translit_file, unicode_version, head=''):
'''Write the header of the output file, i.e. the part of the file
before the translit_start line.
'''
if ARGS.input_file and head:
translit_file.write(head)
else:
translit_file.write('escape_char /\n')
translit_file.write('comment_char %\n')
translit_file.write('\n')
translit_file.write('% Transliterations of fractions.\n')
translit_file.write('% Generated automatically from UnicodeData.txt '
+ 'by gen_translit_fraction.py '
+ 'on {:s} '.format(time.strftime('%Y-%m-%d'))
+ 'for Unicode {:s}.\n'.format(unicode_version))
translit_file.write('% The replacements have been surrounded ')
translit_file.write('with spaces, because fractions are\n')
translit_file.write('% often preceded by a decimal number and ')
translit_file.write('followed by a unit or a math symbol.\n')
translit_file.write('\n')
translit_file.write('LC_CTYPE\n')
translit_file.write('\n')
translit_file.write('translit_start\n')
def output_tail(translit_file, tail=''):
'''Write the tail of the output file'''
if ARGS.input_file and tail:
translit_file.write(tail)
else:
translit_file.write('translit_end\n')
translit_file.write('\n')
translit_file.write('END LC_CTYPE\n')
def special_decompose(code_point_list):
'''
Decompositions which are not in UnicodeData.txt at all but which
were used in the original translit_fraction file in glibc and
which seem to make sense. I want to keep the update of
translit_fraction close to the spirit of the original file,
therefore I added these special decomposition rules here.
'''
special_decompose_dict = {
(0x2044,): [0x002F], # → /
}
if tuple(code_point_list) in special_decompose_dict:
return special_decompose_dict[tuple(code_point_list)]
else:
return code_point_list
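# Doctest-style illustration, assuming the table above stays as shown:
#
#     >>> special_decompose([0x2044])   # FRACTION SLASH
#     [47]                              # [0x002F], i.e. SOLIDUS
#     >>> special_decompose([0x00BD])   # no special rule applies
#     [189]                             # input returned unchanged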
def output_transliteration(translit_file):
'''Write the new transliteration to the output file'''
translit_file.write('\n')
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
name = unicode_utils.UNICODE_ATTRIBUTES[code_point]['name']
decomposition = unicode_utils.UNICODE_ATTRIBUTES[
code_point]['decomposition']
if decomposition.startswith('<fraction>'):
decomposition = decomposition[11:]
decomposed_code_points = [[int(x, 16)
for x in decomposition.split(' ')]]
if decomposed_code_points[0]:
decomposed_code_points[0] = [0x0020] \
+ decomposed_code_points[0] \
+ [0x0020]
while True:
special_decomposed_code_points = special_decompose(
decomposed_code_points[-1])
if (special_decomposed_code_points
!= decomposed_code_points[-1]):
decomposed_code_points.append(
special_decomposed_code_points)
continue
special_decomposed_code_points = []
for decomposed_code_point in decomposed_code_points[-1]:
special_decomposed_code_points += special_decompose(
[decomposed_code_point])
if (special_decomposed_code_points
== decomposed_code_points[-1]):
break
decomposed_code_points.append(
special_decomposed_code_points)
translit_file.write('% {:s}\n'.format(name))
translit_file.write('{:s} '.format(
unicode_utils.ucs_symbol(code_point)))
for index in range(0, len(decomposed_code_points)):
if index > 0:
translit_file.write(';')
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
for decomposed_code_point in decomposed_code_points[index]:
translit_file.write('{:s}'.format(
unicode_utils.ucs_symbol(decomposed_code_point)))
if len(decomposed_code_points[index]) > 1:
translit_file.write('"')
translit_file.write('\n')
translit_file.write('\n')
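# For example, U+00BD VULGAR FRACTION ONE HALF, whose UnicodeData.txt
# decomposition is '<fraction> 0031 2044 0032', should come out as:
#
#     % VULGAR FRACTION ONE HALF
#     <U00BD> "<U0020><U0031><U2044><U0032><U0020>";"<U0020><U0031><U002F><U0032><U0020>"
#
# The first variant keeps FRACTION SLASH; the second is produced by
# the special_decompose() fall-back to plain SOLIDUS.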
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
description='''
Generate a translit_fraction file from UnicodeData.txt.
''')
PARSER.add_argument(
'-u', '--unicode_data_file',
nargs='?',
type=str,
default='UnicodeData.txt',
help=('The UnicodeData.txt file to read, '
+ 'default: %(default)s'))
PARSER.add_argument(
'-i', '--input_file',
nargs='?',
type=str,
help=''' The original glibc/localedata/locales/translit_fraction
file.''')
PARSER.add_argument(
'-o', '--output_file',
nargs='?',
type=str,
default='translit_fraction.new',
help='''The new translit_fraction file, default: %(default)s. If the
original glibc/localedata/locales/translit_fraction file has
been given as an option, the header up to the
translit_start line and the tail from the translit_end
line to the end of the file will be copied unchanged into the
output file. ''')
PARSER.add_argument(
'--unicode_version',
nargs='?',
required=True,
type=str,
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
unicode_utils.fill_attributes(ARGS.unicode_data_file)
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
with open(ARGS.output_file, mode='w') as TRANSLIT_FILE:
output_head(TRANSLIT_FILE, ARGS.unicode_version, head=HEAD)
output_transliteration(TRANSLIT_FILE)
output_tail(TRANSLIT_FILE, tail=TAIL)
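# A plausible invocation for the Unicode 7.0.0 update (illustrative
# only; it assumes UnicodeData.txt and the old translit_fraction sit
# in the current directory):
#
#     $ ./gen_translit_fraction.py -u UnicodeData.txt \
#           -i translit_fraction -o translit_fraction.new \
#           --unicode_version 7.0.0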

View File

@ -30,345 +30,9 @@ To see how this script is used, call it with the “-h” option:
'''
import argparse
import sys
import time
import re
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
# 'title': None,
# 'digit': '',
# 'name': '<control>',
# 'bidi': 'BN',
# 'combining': '0',
# 'comment': '',
# 'oldname': 'NULL',
# 'decomposition': '',
# 'upper': None,
# 'mirrored': 'N',
# 'lower': None,
# 'decdigit': '',
# 'numeric': ''},
# …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
# …
# }
DERIVED_CORE_PROPERTIES = {}
def fill_attribute(code_point, fields):
'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
One entry in the UNICODE_ATTRIBUTES dictionary represents one line
in the UnicodeData.txt file.
'''
UNICODE_ATTRIBUTES[code_point] = {
'name': fields[1], # Character name
'category': fields[2], # General category
'combining': fields[3], # Canonical combining classes
'bidi': fields[4], # Bidirectional category
'decomposition': fields[5], # Character decomposition mapping
'decdigit': fields[6], # Decimal digit value
'digit': fields[7], # Digit value
'numeric': fields[8], # Numeric value
'mirrored': fields[9], # mirrored
'oldname': fields[10], # Old Unicode 1.0 name
'comment': fields[11], # comment
# Uppercase mapping
'upper': int(fields[12], 16) if fields[12] else None,
# Lowercase mapping
'lower': int(fields[13], 16) if fields[13] else None,
# Titlecase mapping
'title': int(fields[14], 16) if fields[14] else None,
}
def fill_attributes(filename):
'''Stores the entire contents of the UnicodeData.txt file
in the UNICODE_ATTRIBUTES dictionary.
A typical line for a single code point in UnicodeData.txt looks
like this:
0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
Code point ranges are indicated by pairs of lines like this:
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
'''
with open(filename, mode='r') as unicode_data_file:
fields_start = []
for line in unicode_data_file:
fields = line.strip().split(';')
if len(fields) != 15:
sys.stderr.write(
'short line in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
if fields[2] == 'Cs':
# Surrogates are UTF-16 artefacts,
# not real characters. Ignore them.
fields_start = []
continue
if fields[1].endswith(', First>'):
fields_start = fields
fields_start[1] = fields_start[1].split(',')[0][1:]
continue
if fields[1].endswith(', Last>'):
fields[1] = fields[1].split(',')[0][1:]
if fields[1:] != fields_start[1:]:
sys.stderr.write(
'broken code point range in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
for code_point in range(
int(fields_start[0], 16),
int(fields[0], 16)+1):
fill_attribute(code_point, fields)
fields_start = []
continue
fill_attribute(int(fields[0], 16), fields)
fields_start = []
def fill_derived_core_properties(filename):
'''Stores the entire contents of the DerivedCoreProperties.txt file
in the DERIVED_CORE_PROPERTIES dictionary.
Lines in DerivedCoreProperties.txt are either a code point range like
this:
0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
or a single code point like this:
00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
'''
with open(filename, mode='r') as derived_core_properties_file:
for line in derived_core_properties_file:
match = re.match(
r'^(?P<codepoint1>[0-9A-F]{4,6})'
+ r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+ r'\s*;\s*(?P<property>[a-zA-Z_]+)',
line)
if not match:
continue
start = match.group('codepoint1')
end = match.group('codepoint2')
if not end:
end = start
for code_point in range(int(start, 16), int(end, 16)+1):
prop = match.group('property')
if code_point in DERIVED_CORE_PROPERTIES:
DERIVED_CORE_PROPERTIES[code_point].append(prop)
else:
DERIVED_CORE_PROPERTIES[code_point] = [prop]
def to_upper(code_point):
'''Returns the code point of the uppercase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['upper']):
return UNICODE_ATTRIBUTES[code_point]['upper']
else:
return code_point
def to_lower(code_point):
'''Returns the code point of the lowercase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['lower']):
return UNICODE_ATTRIBUTES[code_point]['lower']
else:
return code_point
def to_title(code_point):
'''Returns the code point of the titlecase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['title']):
return UNICODE_ATTRIBUTES[code_point]['title']
else:
return code_point
def is_upper(code_point):
'''Checks whether the character with this code point is uppercase'''
return (to_lower(code_point) != code_point
or (code_point in DERIVED_CORE_PROPERTIES
and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
def is_lower(code_point):
'''Checks whether the character with this code point is lowercase'''
# Some characters are defined as “Lowercase” in
# DerivedCoreProperties.txt but do not have a mapping to upper
# case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
# one of these.
return (to_upper(code_point) != code_point
# <U00DF> is lowercase, but without simple to_upper mapping.
or code_point == 0x00DF
or (code_point in DERIVED_CORE_PROPERTIES
and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
def is_alpha(code_point):
'''Checks whether the character with this code point is alphabetic'''
return ((code_point in DERIVED_CORE_PROPERTIES
and
'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
or
# Consider all the non-ASCII digits as alphabetic.
# ISO C 99 forbids us to have them in category “digit”,
# but we want iswalnum to return true on them.
(UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
and not (code_point >= 0x0030 and code_point <= 0x0039)))
def is_digit(code_point):
'''Checks whether the character with this code point is a digit'''
if False:
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
# Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
# a zero. Must add <0> in front of them by hand.
else:
# SUSV2 gives us some freedom for the "digit" category, but ISO C 99
# takes it away:
# 7.25.2.1.5:
# The iswdigit function tests for any wide character that
# corresponds to a decimal-digit character (as defined in 5.2.1).
# 5.2.1:
# the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
return (code_point >= 0x0030 and code_point <= 0x0039)
def is_outdigit(code_point):
'''Checks whether the character with this code point is outdigit'''
return (code_point >= 0x0030 and code_point <= 0x0039)
def is_blank(code_point):
'''Checks whether the character with this code point is blank'''
return (code_point == 0x0009 # '\t'
# Category Zs without mention of '<noBreak>'
or (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
and '<noBreak>' not in
UNICODE_ATTRIBUTES[code_point]['decomposition']))
def is_space(code_point):
'''Checks whether the character with this code point is a space'''
# Don't make U+00A0 a space. Non-breaking space means that all programs
# should treat it like a punctuation character, not like a space.
return (code_point == 0x0020 # ' '
or code_point == 0x000C # '\f'
or code_point == 0x000A # '\n'
or code_point == 0x000D # '\r'
or code_point == 0x0009 # '\t'
or code_point == 0x000B # '\v'
# Categories Zl, Zp, and Zs without mention of "<noBreak>"
or (UNICODE_ATTRIBUTES[code_point]['name']
and
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
or
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
and
'<noBreak>' not in
UNICODE_ATTRIBUTES[code_point]['decomposition']))))
def is_cntrl(code_point):
'''Checks whether the character with this code point is
a control character'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
or
UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
'''Checks whether the character with this code point is
a hexadecimal digit'''
if False:
return (is_digit(code_point)
or (code_point >= 0x0041 and code_point <= 0x0046)
or (code_point >= 0x0061 and code_point <= 0x0066))
else:
# SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
# takes it away:
# 7.25.2.1.12:
# The iswxdigit function tests for any wide character that
# corresponds to a hexadecimal-digit character (as defined
# in 6.4.4.1).
# 6.4.4.1:
# hexadecimal-digit: one of
# 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
return ((code_point >= 0x0030 and code_point <= 0x0039)
or (code_point >= 0x0041 and code_point <= 0x0046)
or (code_point >= 0x0061 and code_point <= 0x0066))
def is_graph(code_point):
'''Checks whether the character with this code point is
a graphical character'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
and not is_space(code_point))
def is_print(code_point):
'''Checks whether the character with this code point is printable'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
'''Checks whether the character with this code point is punctuation'''
if False:
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
else:
# The traditional POSIX definition of punctuation is every graphic,
# non-alphanumeric character.
return (is_graph(code_point)
and not is_alpha(code_point)
and not is_digit(code_point))
def is_combining(code_point):
'''Checks whether the character with this code point is
a combining character'''
# Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
# file. In 3.0.1 it was identical to the union of the general categories
# "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
# PropList.txt file, so we take the latter definition.
return (UNICODE_ATTRIBUTES[code_point]['name']
and
UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
'''Checks whether the character with this code point is
a combining level3 character'''
return (is_combining(code_point)
and
int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
def ucs_symbol(code_point):
'''Return the UCS symbol string for a Unicode character.'''
if code_point < 0x10000:
return '<U{:04X}>'.format(code_point)
else:
return '<U{:08X}>'.format(code_point)
def ucs_symbol_range(code_point_low, code_point_high):
'''Returns a UCS symbol string for a code point range.
Example:
<U0041>..<U005A>
'''
return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
import unicode_utils
def code_point_ranges(is_class_function):
'''Returns a list of ranges of code points for which is_class_function
@ -379,7 +43,7 @@ def code_point_ranges(is_class_function):
[[65, 90], [192, 214], [216, 222], [256], ]
'''
cp_ranges = []
for code_point in sorted(UNICODE_ATTRIBUTES):
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
if is_class_function(code_point):
if (cp_ranges
and cp_ranges[-1][-1] == code_point - 1):
@ -413,9 +77,9 @@ def output_charclass(i18n_file, class_name, is_class_function):
if line.strip():
line += ';'
if len(code_point_range) == 1:
range_string = ucs_symbol(code_point_range[0])
range_string = unicode_utils.ucs_symbol(code_point_range[0])
else:
range_string = ucs_symbol_range(
range_string = unicode_utils.ucs_symbol_range(
code_point_range[0], code_point_range[-1])
if len(line+range_string) > max_column:
i18n_file.write(line+'/\n')
@ -441,15 +105,15 @@ def output_charmap(i18n_file, map_name, map_function):
line = prefix
map_string = ''
i18n_file.write('%s /\n' %map_name)
for code_point in sorted(UNICODE_ATTRIBUTES):
for code_point in sorted(unicode_utils.UNICODE_ATTRIBUTES):
mapped = map_function(code_point)
if code_point != mapped:
if line.strip():
line += ';'
map_string = '(' \
+ ucs_symbol(code_point) \
+ unicode_utils.ucs_symbol(code_point) \
+ ',' \
+ ucs_symbol(mapped) \
+ unicode_utils.ucs_symbol(mapped) \
+ ')'
if len(line+map_string) > max_column:
i18n_file.write(line+'/\n')
@ -459,110 +123,6 @@ def output_charmap(i18n_file, map_name, map_function):
i18n_file.write(line+'\n')
i18n_file.write('\n')
def verifications():
'''Tests whether the is_* functions observe the known restrictions'''
for code_point in sorted(UNICODE_ATTRIBUTES):
# toupper restriction: "Only characters specified for the keywords
# lower and upper shall be specified."
if (to_upper(code_point) != code_point
and not (is_lower(code_point) or is_upper(code_point))):
sys.stderr.write(
('%(sym)s is not upper|lower '
+ 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
'sym': ucs_symbol(code_point),
'c': code_point,
'uc': to_upper(code_point)})
# tolower restriction: "Only characters specified for the keywords
# lower and upper shall be specified."
if (to_lower(code_point) != code_point
and not (is_lower(code_point) or is_upper(code_point))):
sys.stderr.write(
('%(sym)s is not upper|lower '
+ 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
'sym': ucs_symbol(code_point),
'c': code_point,
'uc': to_lower(code_point)})
# alpha restriction: "Characters classified as either upper or lower
# shall automatically belong to this class."
if ((is_lower(code_point) or is_upper(code_point))
and not is_alpha(code_point)):
sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
'sym': ucs_symbol(code_point)})
# alpha restriction: “No character specified for the keywords cntrl,
# digit, punct or space shall be specified.”
if (is_alpha(code_point) and is_cntrl(code_point)):
sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is alpha and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_punct(code_point)):
sys.stderr.write('%(sym)s is alpha and punct\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_space(code_point)):
sys.stderr.write('%(sym)s is alpha and space\n' %{
'sym': ucs_symbol(code_point)})
# space restriction: “No character specified for the keywords upper,
# lower, alpha, digit, graph or xdigit shall be specified.”
# upper, lower, alpha already checked above.
if (is_space(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is space and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_space(code_point) and is_graph(code_point)):
sys.stderr.write('%(sym)s is space and graph\n' %{
'sym': ucs_symbol(code_point)})
if (is_space(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is space and xdigit\n' %{
'sym': ucs_symbol(code_point)})
# cntrl restriction: “No character specified for the keywords upper,
# lower, alpha, digit, punct, graph, print or xdigit shall be
# specified.” upper, lower, alpha already checked above.
if (is_cntrl(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is cntrl and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_punct(code_point)):
sys.stderr.write('%(sym)s is cntrl and punct\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_graph(code_point)):
sys.stderr.write('%(sym)s is cntrl and graph\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_print(code_point)):
sys.stderr.write('%(sym)s is cntrl and print\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
'sym': ucs_symbol(code_point)})
# punct restriction: “No character specified for the keywords upper,
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
# be specified.” upper, lower, alpha, cntrl already checked above.
if (is_punct(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is punct and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_punct(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is punct and xdigit\n' %{
'sym': ucs_symbol(code_point)})
if (is_punct(code_point) and code_point == 0x0020):
sys.stderr.write('%(sym)s is punct\n' %{
'sym': ucs_symbol(code_point)})
# graph restriction: “No character specified for the keyword cntrl
# shall be specified.” Already checked above.
# print restriction: “No character specified for the keyword cntrl
# shall be specified.” Already checked above.
# graph - print relation: differ only in the <space> character.
# How is this possible if there is more than one space character?!
# I think susv2/xbd/locale.html should speak of “space characters”,
# not “space character”.
if (is_print(code_point)
and not (is_graph(code_point) or is_space(code_point))):
sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
'sym': ucs_symbol(code_point)})
if (not is_print(code_point)
and (is_graph(code_point) or code_point == 0x0020)):
sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
'sym': ucs_symbol(code_point)})
def read_input_file(filename):
'''Reads the original glibc i18n file to get the original head
and tail.
@ -648,18 +208,18 @@ def output_tables(i18n_file, unicode_version):
+ 'program.\n\n')
i18n_file.write('% The "upper" class reflects the uppercase '
+ 'characters of class "alpha"\n')
output_charclass(i18n_file, 'upper', is_upper)
output_charclass(i18n_file, 'upper', unicode_utils.is_upper)
i18n_file.write('% The "lower" class reflects the lowercase '
+ 'characters of class "alpha"\n')
output_charclass(i18n_file, 'lower', is_lower)
output_charclass(i18n_file, 'lower', unicode_utils.is_lower)
i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
+ 'reflecting\n')
i18n_file.write('% the recommendations in TR 10176 annex A\n')
output_charclass(i18n_file, 'alpha', is_alpha)
output_charclass(i18n_file, 'alpha', unicode_utils.is_alpha)
i18n_file.write('% The "digit" class must only contain the '
+ 'BASIC LATIN digits, says ISO C 99\n')
i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
output_charclass(i18n_file, 'digit', is_digit)
output_charclass(i18n_file, 'digit', unicode_utils.is_digit)
i18n_file.write('% The "outdigit" information is by default '
+ '"0" to "9". We don\'t have to\n')
i18n_file.write('% provide it here since localedef will fill '
@ -669,29 +229,30 @@ def output_tables(i18n_file, unicode_version):
i18n_file.write('% outdigit /\n')
i18n_file.write('% <U0030>..<U0039>\n\n')
# output_charclass(i18n_file, 'outdigit', is_outdigit)
output_charclass(i18n_file, 'space', is_space)
output_charclass(i18n_file, 'cntrl', is_cntrl)
output_charclass(i18n_file, 'punct', is_punct)
output_charclass(i18n_file, 'graph', is_graph)
output_charclass(i18n_file, 'print', is_print)
output_charclass(i18n_file, 'space', unicode_utils.is_space)
output_charclass(i18n_file, 'cntrl', unicode_utils.is_cntrl)
output_charclass(i18n_file, 'punct', unicode_utils.is_punct)
output_charclass(i18n_file, 'graph', unicode_utils.is_graph)
output_charclass(i18n_file, 'print', unicode_utils.is_print)
i18n_file.write('% The "xdigit" class must only contain the '
+ 'BASIC LATIN digits and A-F, a-f,\n')
i18n_file.write('% says ISO C 99 '
+ '(sections 7.25.2.1.12 and 6.4.4.1).\n')
output_charclass(i18n_file, 'xdigit', is_xdigit)
output_charclass(i18n_file, 'blank', is_blank)
output_charmap(i18n_file, 'toupper', to_upper)
output_charmap(i18n_file, 'tolower', to_lower)
output_charmap(i18n_file, 'map "totitle";', to_title)
output_charclass(i18n_file, 'xdigit', unicode_utils.is_xdigit)
output_charclass(i18n_file, 'blank', unicode_utils.is_blank)
output_charmap(i18n_file, 'toupper', unicode_utils.to_upper)
output_charmap(i18n_file, 'tolower', unicode_utils.to_lower)
output_charmap(i18n_file, 'map "totitle";', unicode_utils.to_title)
i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
+ 'annex B.1\n')
i18n_file.write('% That is, all combining characters (level 2+3).\n')
output_charclass(i18n_file, 'class "combining";', is_combining)
output_charclass(i18n_file, 'class "combining";',
unicode_utils.is_combining)
i18n_file.write('% The "combining_level3" class reflects '
+ 'ISO/IEC 10646-1 annex B.2\n')
i18n_file.write('% That is, combining characters of level 3.\n')
output_charclass(i18n_file,
'class "combining_level3";', is_combining_level3)
output_charclass(i18n_file, 'class "combining_level3";',
unicode_utils.is_combining_level3)
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
@ -739,9 +300,11 @@ if __name__ == "__main__":
help='The Unicode version of the input files used.')
ARGS = PARSER.parse_args()
fill_attributes(ARGS.unicode_data_file)
fill_derived_core_properties(ARGS.derived_core_properties_file)
verifications()
unicode_utils.fill_attributes(
ARGS.unicode_data_file)
unicode_utils.fill_derived_core_properties(
ARGS.derived_core_properties_file)
unicode_utils.verifications()
HEAD = TAIL = ''
if ARGS.input_file:
(HEAD, TAIL) = read_input_file(ARGS.input_file)
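# A plausible regeneration command (illustrative; the -u/-d/-i/-o
# short options are assumptions mirroring the ARGS attribute names
# used above):
#
#     $ ./gen_unicode_ctype.py -u UnicodeData.txt \
#           -d DerivedCoreProperties.txt -i i18n -o i18n.new \
#           --unicode_version 7.0.0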

View File

@ -0,0 +1,502 @@
# Utilities to generate Unicode data for glibc from upstream Unicode data.
#
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.
'''
This module contains utilities used by the scripts to generate
Unicode data for glibc from upstream Unicode data files.
'''
import sys
import re
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
# 'title': None,
# 'digit': '',
# 'name': '<control>',
# 'bidi': 'BN',
# 'combining': '0',
# 'comment': '',
# 'oldname': 'NULL',
# 'decomposition': '',
# 'upper': None,
# 'mirrored': 'N',
# 'lower': None,
# 'decdigit': '',
# 'numeric': ''},
# …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
# …
# }
DERIVED_CORE_PROPERTIES = {}
# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
One entry in the UNICODE_ATTRIBUTES dictionary represents one line
in the UnicodeData.txt file.
'''
UNICODE_ATTRIBUTES[code_point] = {
'name': fields[1], # Character name
'category': fields[2], # General category
'combining': fields[3], # Canonical combining classes
'bidi': fields[4], # Bidirectional category
'decomposition': fields[5], # Character decomposition mapping
'decdigit': fields[6], # Decimal digit value
'digit': fields[7], # Digit value
'numeric': fields[8], # Numeric value
'mirrored': fields[9], # mirrored
'oldname': fields[10], # Old Unicode 1.0 name
'comment': fields[11], # comment
# Uppercase mapping
'upper': int(fields[12], 16) if fields[12] else None,
# Lowercase mapping
'lower': int(fields[13], 16) if fields[13] else None,
# Titlecase mapping
'title': int(fields[14], 16) if fields[14] else None,
}
def fill_attributes(filename):
'''Stores the entire contents of the UnicodeData.txt file
in the UNICODE_ATTRIBUTES dictionary.
A typical line for a single code point in UnicodeData.txt looks
like this:
0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
Code point ranges are indicated by pairs of lines like this:
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
'''
with open(filename, mode='r') as unicode_data_file:
fields_start = []
for line in unicode_data_file:
fields = line.strip().split(';')
if len(fields) != 15:
sys.stderr.write(
'short line in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
if fields[2] == 'Cs':
# Surrogates are UTF-16 artefacts,
# not real characters. Ignore them.
fields_start = []
continue
if fields[1].endswith(', First>'):
fields_start = fields
fields_start[1] = fields_start[1].split(',')[0][1:]
continue
if fields[1].endswith(', Last>'):
fields[1] = fields[1].split(',')[0][1:]
if fields[1:] != fields_start[1:]:
sys.stderr.write(
'broken code point range in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
for code_point in range(
int(fields_start[0], 16),
int(fields[0], 16)+1):
fill_attribute(code_point, fields)
fields_start = []
continue
fill_attribute(int(fields[0], 16), fields)
fields_start = []
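# Doctest-style illustration, assuming a standard UnicodeData.txt:
#
#     >>> fill_attributes('UnicodeData.txt')
#     >>> UNICODE_ATTRIBUTES[0x0041]['name']
#     'LATIN CAPITAL LETTER A'
#     >>> UNICODE_ATTRIBUTES[0x0041]['lower'] == 0x0061
#     True
#     >>> UNICODE_ATTRIBUTES[0x4E00]['name']   # filled via a range pair
#     'CJK Ideograph'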
def fill_derived_core_properties(filename):
'''Stores the entire contents of the DerivedCoreProperties.txt file
in the DERIVED_CORE_PROPERTIES dictionary.
Lines in DerivedCoreProperties.txt are either a code point range like
this:
0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
or a single code point like this:
00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
'''
with open(filename, mode='r') as derived_core_properties_file:
for line in derived_core_properties_file:
match = re.match(
r'^(?P<codepoint1>[0-9A-F]{4,6})'
+ r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+ r'\s*;\s*(?P<property>[a-zA-Z_]+)',
line)
if not match:
continue
start = match.group('codepoint1')
end = match.group('codepoint2')
if not end:
end = start
for code_point in range(int(start, 16), int(end, 16)+1):
prop = match.group('property')
if code_point in DERIVED_CORE_PROPERTIES:
DERIVED_CORE_PROPERTIES[code_point].append(prop)
else:
DERIVED_CORE_PROPERTIES[code_point] = [prop]
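# Doctest-style illustration, assuming a standard
# DerivedCoreProperties.txt (the sample lines from the docstring):
#
#     >>> fill_derived_core_properties('DerivedCoreProperties.txt')
#     >>> 'Lowercase' in DERIVED_CORE_PROPERTIES[0x0061]   # 'a'
#     True
#     >>> 'Lowercase' in DERIVED_CORE_PROPERTIES[0x00AA]   # ordinal indicator
#     True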
def fill_east_asian_widths(filename):
'''Stores the entire contents of the EastAsianWidth.txt file
in the EAST_ASIAN_WIDTHS dictionary.
Lines in EastAsianWidth.txt are either a code point range like
this:
9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
or a single code point like this:
A015;W # Lm YI SYLLABLE WU
'''
with open(filename, mode='r') as east_asian_widths_file:
for line in east_asian_widths_file:
match = re.match(
r'^(?P<codepoint1>[0-9A-F]{4,6})'
+r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+r'\s*;\s*(?P<property>[a-zA-Z]+)',
line)
if not match:
continue
start = match.group('codepoint1')
end = match.group('codepoint2')
if not end:
end = start
for code_point in range(int(start, 16), int(end, 16)+1):
EAST_ASIAN_WIDTHS[code_point] = match.group('property')
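# Doctest-style illustration, using the sample line from the
# docstring:
#
#     >>> fill_east_asian_widths('EastAsianWidth.txt')
#     >>> EAST_ASIAN_WIDTHS[0xA015]   # YI SYLLABLE WU
#     'W'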
def to_upper(code_point):
'''Returns the code point of the uppercase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['upper']):
return UNICODE_ATTRIBUTES[code_point]['upper']
else:
return code_point
def to_lower(code_point):
'''Returns the code point of the lowercase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['lower']):
return UNICODE_ATTRIBUTES[code_point]['lower']
else:
return code_point
def to_title(code_point):
'''Returns the code point of the titlecase version
of the given code point'''
if (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['title']):
return UNICODE_ATTRIBUTES[code_point]['title']
else:
return code_point
def is_upper(code_point):
'''Checks whether the character with this code point is uppercase'''
return (to_lower(code_point) != code_point
or (code_point in DERIVED_CORE_PROPERTIES
and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
def is_lower(code_point):
'''Checks whether the character with this code point is lowercase'''
# Some characters are defined as “Lowercase” in
# DerivedCoreProperties.txt but do not have a mapping to upper
# case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
# one of these.
return (to_upper(code_point) != code_point
# <U00DF> is lowercase, but without simple to_upper mapping.
or code_point == 0x00DF
or (code_point in DERIVED_CORE_PROPERTIES
and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
def is_alpha(code_point):
'''Checks whether the character with this code point is alphabetic'''
return ((code_point in DERIVED_CORE_PROPERTIES
and
'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
or
# Consider all the non-ASCII digits as alphabetic.
# ISO C 99 forbids us to have them in category “digit”,
# but we want iswalnum to return true on them.
(UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
and not (code_point >= 0x0030 and code_point <= 0x0039)))
def is_digit(code_point):
'''Checks whether the character with this code point is a digit'''
if False:
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
# Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
# a zero. Must add <0> in front of them by hand.
else:
# SUSV2 gives us some freedom for the "digit" category, but ISO C 99
# takes it away:
# 7.25.2.1.5:
# The iswdigit function tests for any wide character that
# corresponds to a decimal-digit character (as defined in 5.2.1).
# 5.2.1:
# the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
return (code_point >= 0x0030 and code_point <= 0x0039)
def is_outdigit(code_point):
'''Checks whether the character with this code point is outdigit'''
return (code_point >= 0x0030 and code_point <= 0x0039)
def is_blank(code_point):
'''Checks whether the character with this code point is blank'''
return (code_point == 0x0009 # '\t'
# Category Zs without mention of '<noBreak>'
or (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
and '<noBreak>' not in
UNICODE_ATTRIBUTES[code_point]['decomposition']))
def is_space(code_point):
'''Checks whether the character with this code point is a space'''
# Don't make U+00A0 a space. Non-breaking space means that all programs
# should treat it like a punctuation character, not like a space.
return (code_point == 0x0020 # ' '
or code_point == 0x000C # '\f'
or code_point == 0x000A # '\n'
or code_point == 0x000D # '\r'
or code_point == 0x0009 # '\t'
or code_point == 0x000B # '\v'
# Categories Zl, Zp, and Zs without mention of "<noBreak>"
or (UNICODE_ATTRIBUTES[code_point]['name']
and
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
or
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
and
'<noBreak>' not in
UNICODE_ATTRIBUTES[code_point]['decomposition']))))
def is_cntrl(code_point):
'''Checks whether the character with this code point is
a control character'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
or
UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
def is_xdigit(code_point):
'''Checks whether the character with this code point is
a hexadecimal digit'''
if False:
return (is_digit(code_point)
or (code_point >= 0x0041 and code_point <= 0x0046)
or (code_point >= 0x0061 and code_point <= 0x0066))
else:
# SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
# takes it away:
# 7.25.2.1.12:
# The iswxdigit function tests for any wide character that
# corresponds to a hexadecimal-digit character (as defined
# in 6.4.4.1).
# 6.4.4.1:
# hexadecimal-digit: one of
# 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
return ((code_point >= 0x0030 and code_point <= 0x0039)
or (code_point >= 0x0041 and code_point <= 0x0046)
or (code_point >= 0x0061 and code_point <= 0x0066))
def is_graph(code_point):
'''Checks whether the character with this code point is
a graphical character'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
and not is_space(code_point))
def is_print(code_point):
'''Checks whether the character with this code point is printable'''
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
def is_punct(code_point):
'''Checks whether the character with this code point is punctuation'''
if False:
return (UNICODE_ATTRIBUTES[code_point]['name']
and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
else:
# The traditional POSIX definition of punctuation is every graphic,
# non-alphanumeric character.
return (is_graph(code_point)
and not is_alpha(code_point)
and not is_digit(code_point))
def is_combining(code_point):
'''Checks whether the character with this code point is
a combining character'''
# Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
# file. In 3.0.1 it was identical to the union of the general categories
# "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
# PropList.txt file, so we take the latter definition.
return (UNICODE_ATTRIBUTES[code_point]['name']
and
UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
def is_combining_level3(code_point):
'''Checks whether the character with this code point is
a combining level3 character'''
return (is_combining(code_point)
and
int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
def ucs_symbol(code_point):
'''Return the UCS symbol string for a Unicode character.'''
if code_point < 0x10000:
return '<U{:04X}>'.format(code_point)
else:
return '<U{:08X}>'.format(code_point)
def ucs_symbol_range(code_point_low, code_point_high):
'''Returns a UCS symbol string for a code point range.
Example:
<U0041>..<U005A>
'''
return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
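# Doctest-style illustration (the 4- vs 8-digit split matches the
# convention used in glibc locale sources):
#
#     >>> ucs_symbol(0x0041)
#     '<U0041>'
#     >>> ucs_symbol(0x1F12B)      # outside the BMP
#     '<U0001F12B>'
#     >>> ucs_symbol_range(0x0041, 0x005A)
#     '<U0041>..<U005A>'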
def verifications():
'''Tests whether the is_* functions observe the known restrictions'''
for code_point in sorted(UNICODE_ATTRIBUTES):
# toupper restriction: "Only characters specified for the keywords
# lower and upper shall be specified."
if (to_upper(code_point) != code_point
and not (is_lower(code_point) or is_upper(code_point))):
sys.stderr.write(
('%(sym)s is not upper|lower '
+ 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
'sym': ucs_symbol(code_point),
'c': code_point,
'uc': to_upper(code_point)})
# tolower restriction: "Only characters specified for the keywords
# lower and upper shall be specified."
if (to_lower(code_point) != code_point
and not (is_lower(code_point) or is_upper(code_point))):
sys.stderr.write(
('%(sym)s is not upper|lower '
+ 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
'sym': ucs_symbol(code_point),
'c': code_point,
'uc': to_lower(code_point)})
# alpha restriction: "Characters classified as either upper or lower
# shall automatically belong to this class."
if ((is_lower(code_point) or is_upper(code_point))
and not is_alpha(code_point)):
sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
'sym': ucs_symbol(code_point)})
# alpha restriction: “No character specified for the keywords cntrl,
# digit, punct or space shall be specified.”
if (is_alpha(code_point) and is_cntrl(code_point)):
sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is alpha and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_punct(code_point)):
sys.stderr.write('%(sym)s is alpha and punct\n' %{
'sym': ucs_symbol(code_point)})
if (is_alpha(code_point) and is_space(code_point)):
sys.stderr.write('%(sym)s is alpha and space\n' %{
'sym': ucs_symbol(code_point)})
# space restriction: “No character specified for the keywords upper,
# lower, alpha, digit, graph or xdigit shall be specified.”
# upper, lower, alpha already checked above.
if (is_space(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is space and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_space(code_point) and is_graph(code_point)):
sys.stderr.write('%(sym)s is space and graph\n' %{
'sym': ucs_symbol(code_point)})
if (is_space(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is space and xdigit\n' %{
'sym': ucs_symbol(code_point)})
# cntrl restriction: “No character specified for the keywords upper,
# lower, alpha, digit, punct, graph, print or xdigit shall be
# specified.” upper, lower, alpha already checked above.
if (is_cntrl(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is cntrl and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_punct(code_point)):
sys.stderr.write('%(sym)s is cntrl and punct\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_graph(code_point)):
sys.stderr.write('%(sym)s is cntrl and graph\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_print(code_point)):
sys.stderr.write('%(sym)s is cntrl and print\n' %{
'sym': ucs_symbol(code_point)})
if (is_cntrl(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
'sym': ucs_symbol(code_point)})
# punct restriction: “No character specified for the keywords upper,
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
# be specified.” upper, lower, alpha, cntrl already checked above.
if (is_punct(code_point) and is_digit(code_point)):
sys.stderr.write('%(sym)s is punct and digit\n' %{
'sym': ucs_symbol(code_point)})
if (is_punct(code_point) and is_xdigit(code_point)):
sys.stderr.write('%(sym)s is punct and xdigit\n' %{
'sym': ucs_symbol(code_point)})
if (is_punct(code_point) and code_point == 0x0020):
sys.stderr.write('%(sym)s is punct\n' %{
'sym': ucs_symbol(code_point)})
# graph restriction: “No character specified for the keyword cntrl
# shall be specified.” Already checked above.
# print restriction: “No character specified for the keyword cntrl
# shall be specified.” Already checked above.
# graph - print relation: differ only in the <space> character.
# How is this possible if there is more than one space character?!
# I think susv2/xbd/locale.html should speak of “space characters”,
# not “space character”.
if (is_print(code_point)
and not (is_graph(code_point) or is_space(code_point))):
sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
'sym': ucs_symbol(code_point)})
if (not is_print(code_point)
and (is_graph(code_point) or code_point == 0x0020)):
sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
'sym': ucs_symbol(code_point)})

View File

@ -30,146 +30,7 @@ To see how this script is used, call it with the “-h” option:
import sys
import re
import argparse
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
# 'title': None,
# 'digit': '',
# 'name': '<control>',
# 'bidi': 'BN',
# 'combining': '0',
# 'comment': '',
# 'oldname': 'NULL',
# 'decomposition': '',
# 'upper': None,
# 'mirrored': 'N',
# 'lower': None,
# 'decdigit': '',
# 'numeric': ''},
# …
# }
UNICODE_ATTRIBUTES = {}
# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}
def fill_attribute(code_point, fields):
'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
One entry in the UNICODE_ATTRIBUTES dictionary represents one line
in the UnicodeData.txt file.
'''
UNICODE_ATTRIBUTES[code_point] = {
'name': fields[1], # Character name
'category': fields[2], # General category
'combining': fields[3], # Canonical combining classes
'bidi': fields[4], # Bidirectional category
'decomposition': fields[5], # Character decomposition mapping
'decdigit': fields[6], # Decimal digit value
'digit': fields[7], # Digit value
'numeric': fields[8], # Numeric value
'mirrored': fields[9], # mirrored
'oldname': fields[10], # Old Unicode 1.0 name
'comment': fields[11], # comment
# Uppercase mapping
'upper': int(fields[12], 16) if fields[12] else None,
# Lowercase mapping
'lower': int(fields[13], 16) if fields[13] else None,
# Titlecase mapping
'title': int(fields[14], 16) if fields[14] else None,
}
def fill_attributes(filename):
'''Stores the entire contents of the UnicodeData.txt file
in the UNICODE_ATTRIBUTES dictionary.
A typical line for a single code point in UnicodeData.txt looks
like this:
0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
Code point ranges are indicated by pairs of lines like this:
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
'''
with open(filename, mode='r') as unicode_data_file:
fields_start = []
for line in unicode_data_file:
fields = line.strip().split(';')
if len(fields) != 15:
sys.stderr.write(
'short line in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
if fields[2] == 'Cs':
# Surrogates are UTF-16 artefacts,
# not real characters. Ignore them.
fields_start = []
continue
if fields[1].endswith(', First>'):
fields_start = fields
fields_start[1] = fields_start[1].split(',')[0][1:]
continue
if fields[1].endswith(', Last>'):
fields[1] = fields[1].split(',')[0][1:]
if fields[1:] != fields_start[1:]:
sys.stderr.write(
'broken code point range in file "%(f)s": %(l)s\n' %{
'f': filename, 'l': line})
exit(1)
for code_point in range(
int(fields_start[0], 16),
int(fields[0], 16)+1):
fill_attribute(code_point, fields)
fields_start = []
continue
fill_attribute(int(fields[0], 16), fields)
fields_start = []
def fill_east_asian_widths(filename):
'''Stores the entire contents of the EastAsianWidth.txt file
in the EAST_ASIAN_WIDTHS dictionary.
Lines in EastAsianWidth.txt are either a code point range like
this:
9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>
or a single code point like this:
A015;W # Lm YI SYLLABLE WU
'''
with open(filename, mode='r') as east_asian_widths_file:
for line in east_asian_widths_file:
match = re.match(
r'^(?P<codepoint1>[0-9A-F]{4,6})'
+r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
+r'\s*;\s*(?P<property>[a-zA-Z]+)',
line)
if not match:
continue
start = match.group('codepoint1')
end = match.group('codepoint2')
if not end:
end = start
for code_point in range(int(start, 16), int(end, 16)+1):
EAST_ASIAN_WIDTHS[code_point] = match.group('property')
def ucs_symbol(code_point):
'''Return the UCS symbol string for a Unicode character.'''
if code_point < 0x10000:
return '<U{:04X}>'.format(code_point)
else:
return '<U{:08X}>'.format(code_point)
import unicode_utils
def create_charmap_dictionary(file_name):
'''Create a dictionary for all code points found in the CHARMAP
@ -217,10 +78,10 @@ def check_charmap(original_file_name, new_file_name):
if ARGS.show_missing_characters:
for key in sorted(set(ocharmap)-set(ncharmap)):
print('removed: {:s} {:s} {:s}'.format(
ucs_symbol(key),
unicode_utils.ucs_symbol(key),
ocharmap[key],
UNICODE_ATTRIBUTES[key]['name'] \
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
print('------------------------------------------------------------')
changed_charmap = {}
for key in set(ocharmap).intersection(set(ncharmap)):
@ -231,21 +92,21 @@ def check_charmap(original_file_name, new_file_name):
if ARGS.show_changed_characters:
for key in sorted(changed_charmap):
print('changed: {:s} {:s}->{:s} {:s}'.format(
ucs_symbol(key),
unicode_utils.ucs_symbol(key),
changed_charmap[key][0],
changed_charmap[key][1],
UNICODE_ATTRIBUTES[key]['name'] \
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
print('------------------------------------------------------------')
print('Total added characters in newly generated CHARMAP: %d'
%len(set(ncharmap)-set(ocharmap)))
if ARGS.show_added_characters:
for key in sorted(set(ncharmap)-set(ocharmap)):
print('added: {:s} {:s} {:s}'.format(
ucs_symbol(key),
unicode_utils.ucs_symbol(key),
ncharmap[key],
UNICODE_ATTRIBUTES[key]['name'] \
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name'] \
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
def create_width_dictionary(file_name):
'''Create a dictionary for all code points found in the WIDTH
@ -290,20 +151,20 @@ def check_width(original_file_name, new_file_name):
+ 'i.e. these have width 1 now.)')
if ARGS.show_missing_characters:
for key in sorted(set(owidth)-set(nwidth)):
print('removed: {:s} '.format(ucs_symbol(key))
print('removed: {:s} '.format(unicode_utils.ucs_symbol(key))
+ '{:d} : '.format(owidth[key])
+ 'eaw={:s} '.format(
EAST_ASIAN_WIDTHS[key]
if key in EAST_ASIAN_WIDTHS else None)
unicode_utils.EAST_ASIAN_WIDTHS[key]
if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
+ 'category={:2s} '.format(
UNICODE_ATTRIBUTES[key]['category']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['category']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'bidi={:3s} '.format(
UNICODE_ATTRIBUTES[key]['bidi']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'name={:s}'.format(
UNICODE_ATTRIBUTES[key]['name']
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
print('------------------------------------------------------------')
changed_width = {}
for key in set(owidth).intersection(set(nwidth)):
@ -313,21 +174,21 @@ def check_width(original_file_name, new_file_name):
%len(changed_width))
if ARGS.show_changed_characters:
for key in sorted(changed_width):
print('changed width: {:s} '.format(ucs_symbol(key))
print('changed width: {:s} '.format(unicode_utils.ucs_symbol(key))
+ '{:d}->{:d} : '.format(changed_width[key][0],
changed_width[key][1])
+ 'eaw={:s} '.format(
EAST_ASIAN_WIDTHS[key]
if key in EAST_ASIAN_WIDTHS else None)
unicode_utils.EAST_ASIAN_WIDTHS[key]
if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
+ 'category={:2s} '.format(
UNICODE_ATTRIBUTES[key]['category']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['category']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'bidi={:3s} '.format(
UNICODE_ATTRIBUTES[key]['bidi']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'name={:s}'.format(
UNICODE_ATTRIBUTES[key]['name']
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
print('------------------------------------------------------------')
print('Total added characters in newly generated WIDTH: %d'
%len(set(nwidth)-set(owidth)))
@ -335,20 +196,20 @@ def check_width(original_file_name, new_file_name):
+ 'i.e. these had width 1 before.)')
if ARGS.show_added_characters:
for key in sorted(set(nwidth)-set(owidth)):
print('added: {:s} '.format(ucs_symbol(key))
print('added: {:s} '.format(unicode_utils.ucs_symbol(key))
+ '{:d} : '.format(nwidth[key])
+ 'eaw={:s} '.format(
EAST_ASIAN_WIDTHS[key]
if key in EAST_ASIAN_WIDTHS else None)
unicode_utils.EAST_ASIAN_WIDTHS[key]
if key in unicode_utils.EAST_ASIAN_WIDTHS else 'None')
+ 'category={:2s} '.format(
UNICODE_ATTRIBUTES[key]['category']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['category']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'bidi={:3s} '.format(
UNICODE_ATTRIBUTES[key]['bidi']
if key in UNICODE_ATTRIBUTES else None)
unicode_utils.UNICODE_ATTRIBUTES[key]['bidi']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None')
+ 'name={:s}'.format(
UNICODE_ATTRIBUTES[key]['name']
if key in UNICODE_ATTRIBUTES else None))
unicode_utils.UNICODE_ATTRIBUTES[key]['name']
if key in unicode_utils.UNICODE_ATTRIBUTES else 'None'))
if __name__ == "__main__":
PARSER = argparse.ArgumentParser(
@ -392,8 +253,8 @@ if __name__ == "__main__":
ARGS = PARSER.parse_args()
if ARGS.unicode_data_file:
fill_attributes(ARGS.unicode_data_file)
unicode_utils.fill_attributes(ARGS.unicode_data_file)
if ARGS.east_asian_width_file:
fill_east_asian_widths(ARGS.east_asian_width_file)
unicode_utils.fill_east_asian_widths(ARGS.east_asian_width_file)
check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
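# A plausible comparison run (illustrative; the long option names are
# assumed to mirror the ARGS attributes used above):
#
#     $ ./utf8_compatibility.py --unicode_data_file UnicodeData.txt \
#           --east_asian_width_file EastAsianWidth.txt \
#           --old_utf8_file UTF-8 --new_utf8_file UTF-8.new \
#           --show_missing_characters --show_added_characters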

View File

@ -29,6 +29,7 @@ It will output UTF-8 file
import sys
import re
import unicode_utils
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.
@ -49,13 +50,6 @@ JAMO_FINAL_SHORT_NAME = (
'P', 'H'
)
def ucs_symbol(code_point):
'''Return the UCS symbol string for a Unicode character.'''
if code_point < 0x10000:
return '<U{:04X}>'.format(code_point)
else:
return '<U{:08X}>'.format(code_point)
def process_range(start, end, outfile, name):
'''Writes a range of code points into the CHARMAP section of the
output file
@ -78,7 +72,7 @@ def process_range(start, end, outfile, name):
+ JAMO_MEDIAL_SHORT_NAME[index2] \
+ JAMO_FINAL_SHORT_NAME[index3]
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
ucs_symbol(i), convert_to_hex(i),
unicode_utils.ucs_symbol(i), convert_to_hex(i),
hangul_syllable_name))
return
# The UnicodeData.txt file contains code point ranges like this:
@ -95,14 +89,14 @@ def process_range(start, end, outfile, name):
for i in range(int(start, 16), int(end, 16), 64):
if i > (int(end, 16)-64):
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
ucs_symbol(i),
ucs_symbol(int(end,16)),
unicode_utils.ucs_symbol(i),
unicode_utils.ucs_symbol(int(end,16)),
convert_to_hex(i),
name))
break
outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
ucs_symbol(i),
ucs_symbol(i+63),
unicode_utils.ucs_symbol(i),
unicode_utils.ucs_symbol(i+63),
convert_to_hex(i),
name))
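# Illustratively (an assumption about the output of convert_to_hex(),
# which is not shown here), a large range comes out in 64-code-point
# chunks like:
#
#     <U4E00>..<U4E3F> /xe4/xb8/x80 <CJK Ideograph>
#     <U4E40>..<U4E7F> /xe4/xb9/x80 <CJK Ideograph>
#     ...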
@ -168,7 +162,7 @@ def process_charmap(flines, outfile):
# comments, so we keep these comment lines.
outfile.write('%')
outfile.write('{:<11s} {:<12s} {:s}\n'.format(
ucs_symbol(int(fields[0], 16)),
unicode_utils.ucs_symbol(int(fields[0], 16)),
convert_to_hex(int(fields[0], 16)),
fields[1]))
@ -230,7 +224,7 @@ def process_width(outfile, ulines, elines):
for line in ulines:
fields = line.split(";")
if fields[4] == "NSM" or fields[2] == "Cf":
width_dict[int(fields[0], 16)] = ucs_symbol(
width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
int(fields[0], 16)) + '\t0'
for line in elines:
@ -238,7 +232,7 @@ def process_width(outfile, ulines, elines):
# UnicodeData.txt:
fields = line.split(";")
if '..' not in fields[0]:
width_dict[int(fields[0], 16)] = ucs_symbol(
width_dict[int(fields[0], 16)] = unicode_utils.ucs_symbol(
int(fields[0], 16)) + '\t2'
else:
code_points = fields[0].split("..")
@ -247,8 +241,8 @@ def process_width(outfile, ulines, elines):
if key in width_dict:
del width_dict[key]
width_dict[int(code_points[0], 16)] = '{:s}...{:s}\t2'.format(
ucs_symbol(int(code_points[0], 16)),
ucs_symbol(int(code_points[1], 16)))
unicode_utils.ucs_symbol(int(code_points[0], 16)),
unicode_utils.ucs_symbol(int(code_points[1], 16)))
for key in sorted(width_dict):
outfile.write(width_dict[key]+'\n')
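# The resulting WIDTH entries should look roughly like this: single
# code points (non-spacing marks, format characters) get width 0,
# collapsed East-Asian-wide ranges get width 2:
#
#     <U0300>	0
#     <U1100>...<U115F>	2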