mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 12:30:06 +00:00
Unicode 7.0.0 update; added generator scripts.
for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators.
This commit is contained in:
parent
e4a399dc3d
commit
4a4839c94a
11
NEWS
11
NEWS
@ -9,8 +9,15 @@ Version 2.22
|
||||
|
||||
* The following bugs are resolved with this release:
|
||||
|
||||
4719, 15319, 15467, 15790, 16560, 17569, 17792, 17912, 17932, 17944,
|
||||
17949, 17964, 17965, 17967, 17969, 17978, 17987, 17991, 17996, 17999.
|
||||
4719, 13064, 14094, 15319, 15467, 15790, 16560, 17569, 17588, 17792,
|
||||
17912, 17932, 17944, 17949, 17964, 17965, 17967, 17969, 17978, 17987,
|
||||
17991, 17996, 17998, 17999.
|
||||
|
||||
* Character encoding and ctype tables were updated to Unicode 7.0.0, using
|
||||
new generator scripts contributed by Pravin Satpute and Mike FABIAN (Red
|
||||
Hat). These updates cause user visible changes, such as the fix for bug
|
||||
17998.
|
||||
|
||||
|
||||
Version 2.21
|
||||
|
||||
|
@ -1,3 +1,30 @@
|
||||
2015-02-20 Alexandre Oliva <aoliva@redhat.com>
|
||||
|
||||
[BZ #17588]
|
||||
[BZ #13064]
|
||||
[BZ #14094]
|
||||
[BZ #17998]
|
||||
* unicode-gen/Makefile: New.
|
||||
* unicode-gen/unicode-license.txt: New, from Unicode.
|
||||
* unicode-gen/UnicodeData.txt: New, from Unicode.
|
||||
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
|
||||
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
|
||||
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
|
||||
FABIAN <mfabian@redhat.com>.
|
||||
* unicode-gen/ctype_compatibility.py: New verifier, from
|
||||
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
|
||||
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
|
||||
module, from Mike FABIAN.
|
||||
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
|
||||
and Mike FABIAN.
|
||||
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
|
||||
Satpute and Mike FABIAN.
|
||||
* charmaps/UTF-8: Update.
|
||||
* locales/i18n: Update.
|
||||
* gen-unicode-ctype.c: Remove.
|
||||
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
|
||||
true for ordinal indicators.
|
||||
|
||||
2015-01-21 Marek Polacek <polacek@redhat.com>
|
||||
|
||||
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,784 +0,0 @@
|
||||
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
||||
Copyright (C) 2000-2015 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Usage example:
|
||||
$ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdbool.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
|
||||
/* This structure represents one line in the UnicodeData.txt file.
   The members appear in the same order as the semicolon-separated
   fields of that file.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Missing fields are represented with "" for strings, and NONE for
   characters.  */
#define NONE (~(unsigned int)0)

/* The entire contents of the UnicodeData.txt file, indexed by code
   point.  An entry whose name is NULL is unassigned (fill_attributes
   initializes every name to NULL, and surrogates are never filled in).  */
struct unicode_attribute unicode_attributes [0x110000];
|
||||
|
||||
/* Stores in unicode_attributes[i] the values from the given fields. */
|
||||
static void
|
||||
fill_attribute (unsigned int i,
|
||||
const char *field1, const char *field2,
|
||||
const char *field3, const char *field4,
|
||||
const char *field5, const char *field6,
|
||||
const char *field7, const char *field8,
|
||||
const char *field9, const char *field10,
|
||||
const char *field11, const char *field12,
|
||||
const char *field13, const char *field14)
|
||||
{
|
||||
struct unicode_attribute * uni;
|
||||
|
||||
if (i >= 0x110000)
|
||||
{
|
||||
fprintf (stderr, "index too large\n");
|
||||
exit (1);
|
||||
}
|
||||
if (strcmp (field2, "Cs") == 0)
|
||||
/* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
|
||||
return;
|
||||
uni = &unicode_attributes[i];
|
||||
/* Copy the strings. */
|
||||
uni->name = strdup (field1);
|
||||
uni->category = (field2[0] == '\0' ? "" : strdup (field2));
|
||||
uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
|
||||
uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
|
||||
uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
|
||||
uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
|
||||
uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
|
||||
uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
|
||||
uni->mirrored = (field9[0] == 'Y');
|
||||
uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
|
||||
uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
|
||||
uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
|
||||
uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
|
||||
uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
|
||||
}
|
||||
|
||||
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM.  CR characters are dropped,
   because the original unicode.org UnicodeData.txt file happens to
   have CR/LF line terminators.
   Returns 1 when a field was successfully read (DELIM was reached),
   otherwise 0 (EOF was reached first).  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int len = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* Silently convert CR/LF to LF.  */
      if (c == '\r')
	continue;

      /* Reserve room for the terminating NUL.  */
      if (++len >= FIELDLEN - 1)
	{
	  fprintf (stderr, "field too long\n");
	  exit (1);
	}
      *buffer++ = c;
    }

  if (c == EOF)
    return 0;

  *buffer = '\0';
  return 1;
}
|
||||
|
||||
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
|
||||
file. */
|
||||
static void
|
||||
fill_attributes (const char *unicodedata_filename)
|
||||
{
|
||||
unsigned int i, j;
|
||||
FILE *stream;
|
||||
char field0[FIELDLEN];
|
||||
char field1[FIELDLEN];
|
||||
char field2[FIELDLEN];
|
||||
char field3[FIELDLEN];
|
||||
char field4[FIELDLEN];
|
||||
char field5[FIELDLEN];
|
||||
char field6[FIELDLEN];
|
||||
char field7[FIELDLEN];
|
||||
char field8[FIELDLEN];
|
||||
char field9[FIELDLEN];
|
||||
char field10[FIELDLEN];
|
||||
char field11[FIELDLEN];
|
||||
char field12[FIELDLEN];
|
||||
char field13[FIELDLEN];
|
||||
char field14[FIELDLEN];
|
||||
int lineno = 0;
|
||||
|
||||
for (i = 0; i < 0x110000; i++)
|
||||
unicode_attributes[i].name = NULL;
|
||||
|
||||
stream = fopen (unicodedata_filename, "r");
|
||||
if (stream == NULL)
|
||||
{
|
||||
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
|
||||
exit (1);
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
int n;
|
||||
|
||||
lineno++;
|
||||
n = getfield (stream, field0, ';');
|
||||
n += getfield (stream, field1, ';');
|
||||
n += getfield (stream, field2, ';');
|
||||
n += getfield (stream, field3, ';');
|
||||
n += getfield (stream, field4, ';');
|
||||
n += getfield (stream, field5, ';');
|
||||
n += getfield (stream, field6, ';');
|
||||
n += getfield (stream, field7, ';');
|
||||
n += getfield (stream, field8, ';');
|
||||
n += getfield (stream, field9, ';');
|
||||
n += getfield (stream, field10, ';');
|
||||
n += getfield (stream, field11, ';');
|
||||
n += getfield (stream, field12, ';');
|
||||
n += getfield (stream, field13, ';');
|
||||
n += getfield (stream, field14, '\n');
|
||||
if (n == 0)
|
||||
break;
|
||||
if (n != 15)
|
||||
{
|
||||
fprintf (stderr, "short line in'%s':%d\n",
|
||||
unicodedata_filename, lineno);
|
||||
exit (1);
|
||||
}
|
||||
i = strtoul (field0, NULL, 16);
|
||||
if (field1[0] == '<'
|
||||
&& strlen (field1) >= 9
|
||||
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
|
||||
{
|
||||
/* Deal with a range. */
|
||||
lineno++;
|
||||
n = getfield (stream, field0, ';');
|
||||
n += getfield (stream, field1, ';');
|
||||
n += getfield (stream, field2, ';');
|
||||
n += getfield (stream, field3, ';');
|
||||
n += getfield (stream, field4, ';');
|
||||
n += getfield (stream, field5, ';');
|
||||
n += getfield (stream, field6, ';');
|
||||
n += getfield (stream, field7, ';');
|
||||
n += getfield (stream, field8, ';');
|
||||
n += getfield (stream, field9, ';');
|
||||
n += getfield (stream, field10, ';');
|
||||
n += getfield (stream, field11, ';');
|
||||
n += getfield (stream, field12, ';');
|
||||
n += getfield (stream, field13, ';');
|
||||
n += getfield (stream, field14, '\n');
|
||||
if (n != 15)
|
||||
{
|
||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
||||
unicodedata_filename, lineno);
|
||||
exit (1);
|
||||
}
|
||||
if (!(field1[0] == '<'
|
||||
&& strlen (field1) >= 8
|
||||
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
|
||||
{
|
||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
||||
unicodedata_filename, lineno);
|
||||
exit (1);
|
||||
}
|
||||
field1[strlen (field1) - 7] = '\0';
|
||||
j = strtoul (field0, NULL, 16);
|
||||
for (; i <= j; i++)
|
||||
fill_attribute (i, field1+1, field2, field3, field4, field5,
|
||||
field6, field7, field8, field9, field10,
|
||||
field11, field12, field13, field14);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* Single character line */
|
||||
fill_attribute (i, field1, field2, field3, field4, field5,
|
||||
field6, field7, field8, field9, field10,
|
||||
field11, field12, field13, field14);
|
||||
}
|
||||
}
|
||||
if (ferror (stream) || fclose (stream))
|
||||
{
|
||||
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
|
||||
exit (1);
|
||||
}
|
||||
}
|
||||
|
||||
/* Character mappings. */
|
||||
|
||||
static unsigned int
|
||||
to_upper (unsigned int ch)
|
||||
{
|
||||
if (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].upper != NONE)
|
||||
return unicode_attributes[ch].upper;
|
||||
else
|
||||
return ch;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
to_lower (unsigned int ch)
|
||||
{
|
||||
if (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].lower != NONE)
|
||||
return unicode_attributes[ch].lower;
|
||||
else
|
||||
return ch;
|
||||
}
|
||||
|
||||
static unsigned int
|
||||
to_title (unsigned int ch)
|
||||
{
|
||||
if (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].title != NONE)
|
||||
return unicode_attributes[ch].title;
|
||||
else
|
||||
return ch;
|
||||
}
|
||||
|
||||
/* Character class properties.  */

/* A character is treated as upper case when lowercasing changes it.  */
static bool
is_upper (unsigned int ch)
{
  return to_lower (ch) != ch;
}

/* A character is treated as lower case when uppercasing changes it.
   <U00DF> (LATIN SMALL LETTER SHARP S) is special-cased: it is
   lowercase, but has no simple to_upper mapping.  */
static bool
is_lower (unsigned int ch)
{
  if (ch == 0x00DF)
    return true;
  return to_upper (ch) != ch;
}
|
||||
|
||||
static bool
|
||||
is_alpha (unsigned int ch)
|
||||
{
|
||||
return (unicode_attributes[ch].name != NULL
|
||||
&& ((unicode_attributes[ch].category[0] == 'L'
|
||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
||||
<U0E2F>, <U0E46> should belong to is_punct. */
|
||||
&& (ch != 0x0E2F) && (ch != 0x0E46))
|
||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
||||
<U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
|
||||
|| (ch == 0x0E31)
|
||||
|| (ch >= 0x0E34 && ch <= 0x0E3A)
|
||||
|| (ch >= 0x0E47 && ch <= 0x0E4E)
|
||||
/* Avoid warning for <U0345>. */
|
||||
|| (ch == 0x0345)
|
||||
/* Avoid warnings for <U2160>..<U217F>. */
|
||||
|| (unicode_attributes[ch].category[0] == 'N'
|
||||
&& unicode_attributes[ch].category[1] == 'l')
|
||||
/* Avoid warnings for <U24B6>..<U24E9>. */
|
||||
|| (unicode_attributes[ch].category[0] == 'S'
|
||||
&& unicode_attributes[ch].category[1] == 'o'
|
||||
&& strstr (unicode_attributes[ch].name, " LETTER ")
|
||||
!= NULL)
|
||||
/* Consider all the non-ASCII digits as alphabetic.
|
||||
ISO C 99 forbids us to have them in category "digit",
|
||||
but we want iswalnum to return true on them. */
|
||||
|| (unicode_attributes[ch].category[0] == 'N'
|
||||
&& unicode_attributes[ch].category[1] == 'd'
|
||||
&& !(ch >= 0x0030 && ch <= 0x0039))));
|
||||
}
|
||||
|
||||
/* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
   takes it away:
   7.25.2.1.5:
      The iswdigit function tests for any wide character that corresponds
      to a decimal-digit character (as defined in 5.2.1).
   5.2.1:
      the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
   (An earlier, disabled variant classified every general-category Nd
   character as a digit; it was abandoned because e.g. U+0BE7..U+0BEF
   and U+1369..U+1371 are digit systems without a zero.)  */
static bool
is_digit (unsigned int ch)
{
  return ch >= 0x0030 && ch <= 0x0039;
}
|
||||
|
||||
/* The "outdigit" keyword: same set as "digit", i.e. the ASCII
   decimal digits.  */
static bool
is_outdigit (unsigned int ch)
{
  return ch >= 0x0030 && ch <= 0x0039;
}
|
||||
|
||||
static bool
|
||||
is_blank (unsigned int ch)
|
||||
{
|
||||
return (ch == 0x0009 /* '\t' */
|
||||
/* Category Zs without mention of "<noBreak>" */
|
||||
|| (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].category[0] == 'Z'
|
||||
&& unicode_attributes[ch].category[1] == 's'
|
||||
&& !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_space (unsigned int ch)
|
||||
{
|
||||
/* Don't make U+00A0 a space. Non-breaking space means that all programs
|
||||
should treat it like a punctuation character, not like a space. */
|
||||
return (ch == 0x0020 /* ' ' */
|
||||
|| ch == 0x000C /* '\f' */
|
||||
|| ch == 0x000A /* '\n' */
|
||||
|| ch == 0x000D /* '\r' */
|
||||
|| ch == 0x0009 /* '\t' */
|
||||
|| ch == 0x000B /* '\v' */
|
||||
/* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
|
||||
|| (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].category[0] == 'Z'
|
||||
&& (unicode_attributes[ch].category[1] == 'l'
|
||||
|| unicode_attributes[ch].category[1] == 'p'
|
||||
|| (unicode_attributes[ch].category[1] == 's'
|
||||
&& !strstr (unicode_attributes[ch].decomposition,
|
||||
"<noBreak>")))));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_cntrl (unsigned int ch)
|
||||
{
|
||||
return (unicode_attributes[ch].name != NULL
|
||||
&& (!strcmp (unicode_attributes[ch].name, "<control>")
|
||||
/* Categories Zl and Zp */
|
||||
|| (unicode_attributes[ch].category[0] == 'Z'
|
||||
&& (unicode_attributes[ch].category[1] == 'l'
|
||||
|| unicode_attributes[ch].category[1] == 'p'))));
|
||||
}
|
||||
|
||||
/* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
   takes it away:
   7.25.2.1.12:
      The iswxdigit function tests for any wide character that corresponds
      to a hexadecimal-digit character (as defined in 6.4.4.1).
   6.4.4.1:
      hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
   */
static bool
is_xdigit (unsigned int ch)
{
  return (ch >= 0x0030 && ch <= 0x0039)
	 || (ch >= 0x0041 && ch <= 0x0046)
	 || (ch >= 0x0061 && ch <= 0x0066);
}
|
||||
|
||||
static bool
|
||||
is_graph (unsigned int ch)
|
||||
{
|
||||
return (unicode_attributes[ch].name != NULL
|
||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
||||
&& !is_space (ch));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_print (unsigned int ch)
|
||||
{
|
||||
return (unicode_attributes[ch].name != NULL
|
||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
||||
/* Categories Zl and Zp */
|
||||
&& !(unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].category[0] == 'Z'
|
||||
&& (unicode_attributes[ch].category[1] == 'l'
|
||||
|| unicode_attributes[ch].category[1] == 'p')));
|
||||
}
|
||||
|
||||
/* The traditional POSIX definition of punctuation is every graphic,
   non-alphanumeric character.  (A disabled earlier variant used the
   Unicode definition, general category 'P', instead.)  */
static bool
is_punct (unsigned int ch)
{
  return is_graph (ch) && !is_alpha (ch) && !is_digit (ch);
}
|
||||
|
||||
static bool
|
||||
is_combining (unsigned int ch)
|
||||
{
|
||||
/* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
||||
file. In 3.0.1 it was identical to the union of the general categories
|
||||
"Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
||||
PropList.txt file, so we take the latter definition. */
|
||||
return (unicode_attributes[ch].name != NULL
|
||||
&& unicode_attributes[ch].category[0] == 'M'
|
||||
&& (unicode_attributes[ch].category[1] == 'n'
|
||||
|| unicode_attributes[ch].category[1] == 'c'
|
||||
|| unicode_attributes[ch].category[1] == 'e'));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_combining_level3 (unsigned int ch)
|
||||
{
|
||||
return is_combining (ch)
|
||||
&& !(unicode_attributes[ch].combining[0] != '\0'
|
||||
&& unicode_attributes[ch].combining[0] != '0'
|
||||
&& strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
|
||||
}
|
||||
|
||||
/* Return the UCS symbol string for a Unicode character: <UXXXX> for
   BMP characters, <UXXXXXXXX> otherwise.  The result lives in a static
   buffer that is overwritten by the next call.  */
static const char *
ucs_symbol (unsigned int i)
{
  static char buf[11+1];

  if (i < 0x10000)
    sprintf (buf, "<U%04X>", i);
  else
    sprintf (buf, "<U%08X>", i);
  return buf;
}
|
||||
|
||||
/* Return the UCS symbol range string for a Unicode characters interval,
   e.g. "<U0041>..<U005A>".  Like ucs_symbol, the result is a static
   buffer overwritten by the next call.  */
static const char *
ucs_symbol_range (unsigned int low, unsigned int high)
{
  static char buf[24+1];

  /* ucs_symbol's buffer is reused across calls, so copy the first
     symbol out before requesting the second.  */
  sprintf (buf, "%s..", ucs_symbol (low));
  strcat (buf, ucs_symbol (high));
  return buf;
}
|
||||
|
||||
/* Output a character class (= property) table.  */

/* Write to STREAM the locale definition line(s) for CLASSNAME, listing
   every code point for which FUNC returns true.  Runs of consecutive
   members are emitted as <Ulow>..<Uhigh> ranges; long lines are
   continued with the '/' escape character.  */
static void
output_charclass (FILE *stream, const char *classname,
		  bool (*func) (unsigned int))
{
  char *table;
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  /* A 0x110000-byte membership table is too large to keep on the
     stack; allocate it on the heap instead.  */
  table = malloc (0x110000);
  if (table == NULL)
    {
      fprintf (stderr, "memory exhausted\n");
      exit (1);
    }
  for (i = 0; i < 0x110000; i++)
    table[i] = (char) func (i);

  fprintf (stream, "%s ", classname);
  need_semicolon = false;
  /* Start past max_column so the first symbol forces a line break.  */
  column = 1000;
  for (i = 0; i < 0x110000; )
    {
      if (!table[i])
	i++;
      else
	{
	  unsigned int low, high;
	  char buf[25];

	  /* Extend the run of member code points as far as possible.  */
	  low = i;
	  do
	    i++;
	  while (i < 0x110000 && table[i]);
	  high = i - 1;

	  if (low == high)
	    strcpy (buf, ucs_symbol (low));
	  else
	    strcpy (buf, ucs_symbol_range (low, high));

	  if (need_semicolon)
	    {
	      fprintf (stream, ";");
	      column++;
	    }

	  if (column + strlen (buf) > max_column)
	    {
	      fprintf (stream, "/\n ");
	      column = 3;
	    }

	  fprintf (stream, "%s", buf);
	  column += strlen (buf);
	  need_semicolon = true;
	}
    }
  fprintf (stream, "\n");
  free (table);
}
|
||||
|
||||
/* Output a character mapping table.  */

/* Write to STREAM the locale definition line(s) for MAPNAME, listing a
   (<Ufrom>,<Uto>) pair for every code point that FUNC maps to a
   different code point.  Long lines are continued with the '/' escape
   character, as in output_charclass.  */
static void
output_charmap (FILE *stream, const char *mapname,
		unsigned int (*func) (unsigned int))
{
  char *table;
  unsigned int i;
  bool need_semicolon;
  const int max_column = 75;
  int column;

  /* A 0x110000-byte membership table is too large to keep on the
     stack; allocate it on the heap instead.  */
  table = malloc (0x110000);
  if (table == NULL)
    {
      fprintf (stderr, "memory exhausted\n");
      exit (1);
    }
  for (i = 0; i < 0x110000; i++)
    table[i] = (func (i) != i);

  fprintf (stream, "%s ", mapname);
  need_semicolon = false;
  /* Start past max_column so the first pair forces a line break.  */
  column = 1000;
  for (i = 0; i < 0x110000; i++)
    if (table[i])
      {
	char buf[25+1];

	strcpy (buf, "(");
	strcat (buf, ucs_symbol (i));
	strcat (buf, ",");
	strcat (buf, ucs_symbol (func (i)));
	strcat (buf, ")");

	if (need_semicolon)
	  {
	    fprintf (stream, ";");
	    column++;
	  }

	if (column + strlen (buf) > max_column)
	  {
	    fprintf (stream, "/\n ");
	    column = 3;
	  }

	fprintf (stream, "%s", buf);
	column += strlen (buf);
	need_semicolon = true;
      }
  fprintf (stream, "\n");
  free (table);
}
|
||||
|
||||
/* Output the width table.  */

/* Intentionally empty: this generator emits no WIDTH information.
   Kept so output_tables runs through the complete sequence of
   LC_CTYPE sections.  */
static void
output_widthmap (FILE *stream)
{
}
|
||||
|
||||
/* Output the tables to the given file.  */

/* Write a complete locale definition file (LC_IDENTIFICATION plus
   LC_CTYPE) for the given Unicode VERSION to FILENAME.  Before
   writing the LC_CTYPE tables, cross-check the classification
   predicates against the POSIX consistency requirements; violations
   are reported to stderr as warnings but do not abort.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *stream;
  unsigned int ch;

  stream = fopen (filename, "w");
  if (stream == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* File prologue: escape and comment characters used by the rest of
     the generated file.  */
  fprintf (stream, "escape_char /\n");
  fprintf (stream, "comment_char %%\n");
  fprintf (stream, "\n");
  fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
	   version);
  fprintf (stream, "\n");

  fprintf (stream, "LC_IDENTIFICATION\n");
  fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
  fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
  fprintf (stream, "address \"\"\n");
  fprintf (stream, "contact \"\"\n");
  fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
  fprintf (stream, "tel \"\"\n");
  fprintf (stream, "fax \"\"\n");
  fprintf (stream, "language \"\"\n");
  fprintf (stream, "territory \"Earth\"\n");
  fprintf (stream, "revision \"%s\"\n", version);
  {
    /* Generation date in UTC, ISO 8601 format (YYYY-MM-DD).  */
    time_t now;
    char date[11];
    now = time (NULL);
    strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
    fprintf (stream, "date \"%s\"\n", date);
  }
  fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
  fprintf (stream, "END LC_IDENTIFICATION\n");
  fprintf (stream, "\n");

  /* Verifications.  Quoted requirements are from the POSIX/SUS
     description of locale definition files.  */
  for (ch = 0; ch < 0x110000; ch++)
    {
      /* toupper restriction: "Only characters specified for the keywords
	 lower and upper shall be specified."  */
      if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
	fprintf (stderr,
		 "%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
		 ucs_symbol (ch), ch, to_upper (ch));

      /* tolower restriction: "Only characters specified for the keywords
	 lower and upper shall be specified."  */
      if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
	fprintf (stderr,
		 "%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
		 ucs_symbol (ch), ch, to_lower (ch));

      /* alpha restriction: "Characters classified as either upper or lower
	 shall automatically belong to this class."  */
      if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
	fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));

      /* alpha restriction: "No character specified for the keywords cntrl,
	 digit, punct or space shall be specified."  */
      if (is_alpha (ch) && is_cntrl (ch))
	fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_digit (ch))
	fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_punct (ch))
	fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
      if (is_alpha (ch) && is_space (ch))
	fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));

      /* space restriction: "No character specified for the keywords upper,
	 lower, alpha, digit, graph or xdigit shall be specified."
	 upper, lower, alpha already checked above.  */
      if (is_space (ch) && is_digit (ch))
	fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
      if (is_space (ch) && is_graph (ch))
	fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
      if (is_space (ch) && is_xdigit (ch))
	fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));

      /* cntrl restriction: "No character specified for the keywords upper,
	 lower, alpha, digit, punct, graph, print or xdigit shall be
	 specified."  upper, lower, alpha already checked above.  */
      if (is_cntrl (ch) && is_digit (ch))
	fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_punct (ch))
	fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_graph (ch))
	fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_print (ch))
	fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
      if (is_cntrl (ch) && is_xdigit (ch))
	fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));

      /* punct restriction: "No character specified for the keywords upper,
	 lower, alpha, digit, cntrl, xdigit or as the <space> character shall
	 be specified."  upper, lower, alpha, cntrl already checked above.  */
      if (is_punct (ch) && is_digit (ch))
	fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
      if (is_punct (ch) && is_xdigit (ch))
	fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
      if (is_punct (ch) && (ch == 0x0020))
	fprintf (stderr, "%s is punct\n", ucs_symbol (ch));

      /* graph restriction: "No character specified for the keyword cntrl
	 shall be specified."  Already checked above.  */

      /* print restriction: "No character specified for the keyword cntrl
	 shall be specified."  Already checked above.  */

      /* graph - print relation: differ only in the <space> character.
	 How is this possible if there are more than one space character?!
	 I think susv2/xbd/locale.html should speak of "space characters",
	 not "space character".  */
      if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
	fprintf (stderr,
		 "%s is print but not graph|<space>\n", ucs_symbol (ch));
      if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
	fprintf (stderr,
		 "%s is graph|<space> but not print\n", ucs_symbol (ch));
    }

  /* Emit the LC_CTYPE section: each classification keyword, the two
     custom "combining" classes, the case mapping tables, the totitle
     map, and (currently empty) width information.  */
  fprintf (stream, "LC_CTYPE\n");
  output_charclass (stream, "upper", is_upper);
  output_charclass (stream, "lower", is_lower);
  output_charclass (stream, "alpha", is_alpha);
  output_charclass (stream, "digit", is_digit);
  output_charclass (stream, "outdigit", is_outdigit);
  output_charclass (stream, "blank", is_blank);
  output_charclass (stream, "space", is_space);
  output_charclass (stream, "cntrl", is_cntrl);
  output_charclass (stream, "punct", is_punct);
  output_charclass (stream, "xdigit", is_xdigit);
  output_charclass (stream, "graph", is_graph);
  output_charclass (stream, "print", is_print);
  output_charclass (stream, "class \"combining\";", is_combining);
  output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
  output_charmap (stream, "toupper", to_upper);
  output_charmap (stream, "tolower", to_lower);
  output_charmap (stream, "map \"totitle\";", to_title);
  output_widthmap (stream);
  fprintf (stream, "END LC_CTYPE\n");

  if (ferror (stream) || fclose (stream))
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
|
||||
|
||||
/* Program entry point.
   Usage: gen-unicode-ctype UnicodeData.txt version
   Reads the given UnicodeData.txt file and writes the generated
   LC_CTYPE definition to a file named "unicode" in the current
   directory.  */
int
main (int argc, char * argv[])
{
  if (argc != 3)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
      exit (1);
    }

  fill_attributes (argv[1]);
  output_tables ("unicode", argv[2]);

  return 0;
}
|
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
||||
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||
000000000000000000000100000000000000000000000000
|
||||
000000000010000000000100001000000000000000000000
|
||||
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
|
||||
000000000000000111111111111111111111111011111111
|
||||
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||
|
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
File diff suppressed because it is too large
Load Diff
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
File diff suppressed because it is too large
Load Diff
99
localedata/unicode-gen/Makefile
Normal file
99
localedata/unicode-gen/Makefile
Normal file
@ -0,0 +1,99 @@
|
||||
# Copyright (C) 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
# Makefile for generating and updating Unicode-extracted files.
|
||||
|
||||
|
||||
# This Makefile is NOT used as part of the GNU libc build. It needs
|
||||
# to be run manually, within the source tree, at Unicode upgrades
|
||||
# (change UNICODE_VERSION below), to update ../locales/i18n ctype
|
||||
# information (part of the file is preserved, so don't wipe it all
|
||||
# out), and ../charmaps/UTF-8.
|
||||
|
||||
# Use make all to generate the files used in the glibc build out of
|
||||
# the original Unicode files; make check to verify that they are what
|
||||
# we expect; make install to copy them to the location expected by the
|
||||
# glibc build; and make clean to remove all generated files.
|
||||
|
||||
# We keep a local copy of the downloaded Unicode files, to avoid
|
||||
# running afoul of the LGPL corresponding sources requirements, even
|
||||
# though it's not clear that they are preferred over the generated
|
||||
# files for making modifications.
|
||||
|
||||
|
||||
UNICODE_VERSION = 7.0.0
|
||||
|
||||
PYTHON3 = python3
|
||||
WGET = wget
|
||||
|
||||
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
|
||||
GENERATED = i18n UTF-8
|
||||
REPORTS = i18n-report UTF-8-report
|
||||
|
||||
all: $(GENERATED)
|
||||
|
||||
check: check-i18n check-UTF-8
|
||||
|
||||
install:
|
||||
cp -p i18n ../locales/i18n
|
||||
cp -p UTF-8 ../charmaps/UTF-8
|
||||
|
||||
clean: mostlyclean
|
||||
-rm -rf __pycache__
|
||||
mostlyclean:
|
||||
-rm -f $(REPORTS) $(GENERATED)
|
||||
|
||||
.PHONY: all check clean mostlyclean install
|
||||
|
||||
i18n: UnicodeData.txt DerivedCoreProperties.txt
|
||||
i18n: ../locales/i18n # Preserve non-ctype information.
|
||||
i18n: gen_unicode_ctype.py
|
||||
$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
|
||||
-d DerivedCoreProperties.txt -i ../locales/i18n -o $@ \
|
||||
--unicode_version $(UNICODE_VERSION)
|
||||
|
||||
i18n-report: i18n ../locales/i18n
|
||||
i18n-report: ctype_compatibility.py ctype_compatibility_test_cases.py
|
||||
$(PYTHON3) ./ctype_compatibility.py -o ../locales/i18n \
|
||||
-n i18n -a -m > $@
|
||||
|
||||
check-i18n: i18n-report
|
||||
@if grep '\(Missing\|Added\) [^0]\|^Number of errors[^=]* = [^0]' \
|
||||
i18n-report; \
|
||||
then echo manual verification required; false; else true; fi
|
||||
|
||||
UTF-8: UnicodeData.txt EastAsianWidth.txt
|
||||
UTF-8: utf8_gen.py
|
||||
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||
|
||||
UTF-8-report: UTF-8 ../charmaps/UTF-8
|
||||
UTF-8-report: utf8_compatibility.py
|
||||
$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
|
||||
-n UTF-8 -a -m > $@
|
||||
|
||||
check-UTF-8: UTF-8-report
|
||||
@if grep '^Total.*: [^0]' UTF-8-report; \
|
||||
then echo manual verification required; false; else true; fi
|
||||
|
||||
|
||||
.PHONY: downloads clean-downloads
|
||||
downloads: $(DOWNLOADS)
|
||||
clean-downloads:
|
||||
-rm -f $(DOWNLOADS)
|
||||
|
||||
$(DOWNLOADS):
|
||||
$(WGET) http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$@
|
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
@ -0,0 +1,546 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
#
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
'''
|
||||
This script is useful for checking the differences between
|
||||
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
|
||||
new one generated by gen_unicode_ctype.py
|
||||
|
||||
To see how it is used, call it with the “-h” option:
|
||||
|
||||
$ ./ctype_compatibility.py -h
|
||||
… prints usage message …
|
||||
'''
|
||||
|
||||
import sys
|
||||
import re
|
||||
import unicodedata
|
||||
import argparse
|
||||
|
||||
from ctype_compatibility_test_cases import TEST_CASES
|
||||
|
||||
def get_lines_from_file(filename):
|
||||
'''Get all non-comment lines from a i18n file
|
||||
|
||||
Also merge all lines which are continued on the next line because
|
||||
they end in “/” into a single line.
|
||||
'''
|
||||
with open(filename) as i18n_file:
|
||||
current_line = ''
|
||||
for line in i18n_file:
|
||||
line = line.strip('\n')
|
||||
if '%' in line:
|
||||
if line.endswith('/'):
|
||||
line = line[0:line.find('%')] + '/'
|
||||
else:
|
||||
line = line[0:line.find('%')]
|
||||
line = line.strip()
|
||||
if line.endswith('/'):
|
||||
current_line += line[:-1]
|
||||
else:
|
||||
yield current_line + line
|
||||
current_line = ''
|
||||
if current_line: # file ends with a continuation line
|
||||
yield current_line
|
||||
|
||||
def extract_character_classes(filename):
|
||||
'''Get all Unicode code points for each character class from a file
|
||||
|
||||
Store these code points in a dictionary using the character classes
|
||||
as keys and the list of code points in this character class as values.
|
||||
|
||||
In case of the character classes “toupper”, “tolower”, and “totitle”,
|
||||
these area actually pairs of code points
|
||||
'''
|
||||
ctype_dict = {}
|
||||
for line in get_lines_from_file(filename):
|
||||
for char_class in [
|
||||
'upper',
|
||||
'lower',
|
||||
'alpha',
|
||||
'digit',
|
||||
'outdigit',
|
||||
'space',
|
||||
'cntrl',
|
||||
'punct',
|
||||
'graph',
|
||||
'print',
|
||||
'xdigit',
|
||||
'blank',
|
||||
'combining',
|
||||
'combining_level3',
|
||||
'toupper',
|
||||
'tolower',
|
||||
'totitle']:
|
||||
match = re.match(r'^('
|
||||
+'(?:(?:class|map)\s+")'
|
||||
+re.escape(char_class)+
|
||||
'(?:";)\s+'
|
||||
+'|'
|
||||
+re.escape(char_class)+'\s+'
|
||||
+')', line)
|
||||
if match:
|
||||
if char_class not in ctype_dict:
|
||||
ctype_dict[char_class] = []
|
||||
process_chars(
|
||||
ctype_dict[char_class],
|
||||
line[match.end():])
|
||||
return ctype_dict
|
||||
|
||||
def process_chars(char_class_list, code_point_line):
|
||||
'''
|
||||
Extract Unicode values from code_point_line
|
||||
and add to the list of code points in a character class
|
||||
'''
|
||||
for code_points in code_point_line.split(';'):
|
||||
code_points = code_points.strip()
|
||||
match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
|
||||
if match: # <Uxxxx>
|
||||
char_class_list.append(
|
||||
int(match.group('codepoint'), 16))
|
||||
continue
|
||||
match = re.match(
|
||||
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||
+'\.\.'+
|
||||
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||
code_points)
|
||||
if match: # <Uxxxx>..<Uxxxx>
|
||||
for codepoint in range(
|
||||
int(match.group('codepoint1'), 16),
|
||||
int(match.group('codepoint2'), 16) + 1):
|
||||
char_class_list.append(codepoint)
|
||||
continue
|
||||
match = re.match(
|
||||
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||
+'\.\.\(2\)\.\.'+
|
||||
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||
code_points)
|
||||
if match: # <Uxxxx>..(2)..<Uxxxx>
|
||||
for codepoint in range(
|
||||
int(match.group('codepoint1'), 16),
|
||||
int(match.group('codepoint2'), 16) + 1,
|
||||
2):
|
||||
char_class_list.append(codepoint)
|
||||
continue
|
||||
match = re.match(
|
||||
r'^\('
|
||||
+'<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||
+','+
|
||||
'<U(?P<codepoint2>[0-9A-F]{4,8})>'
|
||||
+'\)$',
|
||||
code_points)
|
||||
if match: # (<Uxxxx>,<Uxxxx>)
|
||||
char_class_list.append((
|
||||
int(match.group('codepoint1'), 16),
|
||||
int(match.group('codepoint2'), 16)))
|
||||
continue
|
||||
sys.stderr.write(
|
||||
('None of the regexps matched '
|
||||
+ 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
|
||||
'cp': code_points,
|
||||
'cpl': code_point_line
|
||||
})
|
||||
exit(1)
|
||||
|
||||
def compare_lists(old_ctype_dict, new_ctype_dict):
|
||||
'''Compare character classes in the old and the new LC_CTYPE'''
|
||||
print('****************************************************')
|
||||
print('Character classes which are only in the new '
|
||||
+ 'or only in the old file:')
|
||||
for char_class in sorted(old_ctype_dict):
|
||||
if char_class not in new_ctype_dict:
|
||||
print('Character class %s is in old ctype but not in new ctype'
|
||||
%char_class)
|
||||
for char_class in sorted(new_ctype_dict):
|
||||
if char_class not in old_ctype_dict:
|
||||
print('Character class %s is in new ctype but not in old ctype'
|
||||
%char_class)
|
||||
for char_class in sorted(old_ctype_dict):
|
||||
print("****************************************************")
|
||||
print("%s: %d chars in old ctype and %d chars in new ctype" %(
|
||||
char_class,
|
||||
len(old_ctype_dict[char_class]),
|
||||
len(new_ctype_dict[char_class])))
|
||||
print("----------------------------------------------------")
|
||||
report(char_class,
|
||||
old_ctype_dict[char_class],
|
||||
new_ctype_dict[char_class])
|
||||
|
||||
def report_code_points(char_class, code_point_list, text=''):
|
||||
'''Report all code points which have been added to or removed from a
|
||||
character class.
|
||||
'''
|
||||
for code_point in sorted(code_point_list):
|
||||
if type(code_point) == type(int()):
|
||||
print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
|
||||
%{'text': text,
|
||||
'char': chr(code_point),
|
||||
'char_class': char_class,
|
||||
'code_point': hex(code_point),
|
||||
'name': unicodedata.name(chr(code_point), 'name unknown')})
|
||||
else:
|
||||
print(('%(char_class)s: %(text)s: '
|
||||
+ '%(char0)s → %(char1)s '
|
||||
+ '%(code_point0)s → %(code_point1)s '
|
||||
+ '%(name0)s → %(name1)s') %{
|
||||
'text': text,
|
||||
'char_class': char_class,
|
||||
'char0': chr(code_point[0]),
|
||||
'code_point0': hex(code_point[0]),
|
||||
'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
|
||||
'char1': chr(code_point[1]),
|
||||
'code_point1': hex(code_point[1]),
|
||||
'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
|
||||
})
|
||||
|
||||
def report(char_class, old_list, new_list):
|
||||
'''Report the differences for a certain LC_CTYPE character class
|
||||
between the old and the newly generated state
|
||||
'''
|
||||
missing_chars = list(set(old_list)-set(new_list))
|
||||
print(('%(char_class)s: Missing %(number)d characters '
|
||||
+ 'of old ctype in new ctype ')
|
||||
%{'char_class': char_class, 'number': len(missing_chars)})
|
||||
if ARGS.show_missing_characters:
|
||||
report_code_points(char_class, missing_chars, 'Missing')
|
||||
added_chars = list(set(new_list)-set(old_list))
|
||||
print(('%(char_class)s: Added %(number)d characters '
|
||||
+ 'in new ctype which were not in old ctype')
|
||||
%{'char_class': char_class, 'number': len(added_chars)})
|
||||
if ARGS.show_added_characters:
|
||||
report_code_points(char_class, added_chars, 'Added')
|
||||
|
||||
|
||||
def cperror(error_message, errorcounter=0):
|
||||
'''Increase number of errors by one and print an error message'''
|
||||
print(error_message)
|
||||
return errorcounter + 1
|
||||
|
||||
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
|
||||
errorcounter=0):
|
||||
'''The parameter “code_point_list_with_ranges” is a list of
|
||||
integers or pairs of integers, for example:
|
||||
|
||||
[0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]
|
||||
|
||||
where the pairs of integers stand for all the code points in the range
|
||||
of the two integers given, including the two integers of the pair.
|
||||
|
||||
'''
|
||||
for code_point_range in code_point_list_with_ranges:
|
||||
for code_point in ([code_point_range]
|
||||
if type(code_point_range) == type(int())
|
||||
else range(code_point_range[0],
|
||||
code_point_range[1]+1)):
|
||||
for char_class_tuple in char_classes:
|
||||
char_class = char_class_tuple[0]
|
||||
in_char_class = char_class_tuple[1]
|
||||
if (code_point in ctype_dict[char_class]) != in_char_class:
|
||||
errorcounter = cperror(
|
||||
('error: %(code_point)s %(char)s '
|
||||
+ '%(char_class)s %(in)s: %(reason)s') %{
|
||||
'code_point': hex(code_point),
|
||||
'char': chr(code_point),
|
||||
'char_class': char_class,
|
||||
'in': not in_char_class,
|
||||
'reason': reason},
|
||||
errorcounter)
|
||||
return errorcounter
|
||||
|
||||
def tests(ctype_dict, errorcounter = 0):
|
||||
'''Test a LC_CTYPE character class dictionary for known errors'''
|
||||
# copy the information from ctype_dict (which contains lists) in
|
||||
# a new dictionary ctype_dict2 (which contains dictionaries).
|
||||
# The checks below are easier with that type of data structure.
|
||||
|
||||
ctype_dict2 = {}
|
||||
for key in ctype_dict:
|
||||
ctype_dict2[key] = {}
|
||||
if ctype_dict[key]:
|
||||
if type(ctype_dict[key][0]) == type(int()):
|
||||
for value in ctype_dict[key]:
|
||||
ctype_dict2[key][value] = 1
|
||||
else: # key is 'toupper', 'tolower', or 'totitle'
|
||||
for value in ctype_dict[key]:
|
||||
ctype_dict2[key][value[0]] = value[1]
|
||||
|
||||
for test_case in TEST_CASES:
|
||||
errorcounter = cpcheck(ctype_dict2,
|
||||
test_case[0],
|
||||
test_case[1],
|
||||
test_case[2],
|
||||
errorcounter = errorcounter)
|
||||
|
||||
for code_point in range(0, 0x110000):
|
||||
# toupper restriction: "Only characters specified for the keywords
|
||||
# lower and upper shall be specified.
|
||||
if (code_point in ctype_dict2['toupper']
|
||||
and code_point != ctype_dict2['toupper'][code_point]
|
||||
and not (code_point in ctype_dict2['lower']
|
||||
or code_point in ctype_dict2['upper'])):
|
||||
errorcounter = cperror(
|
||||
('error: %(char1)s is not upper|lower '
|
||||
+ 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||
'char1': chr(code_point),
|
||||
'cp1': hex(code_point),
|
||||
'cp2': hex(ctype_dict2['toupper'][code_point]),
|
||||
'char2': chr(ctype_dict2['toupper'][code_point])
|
||||
},
|
||||
errorcounter)
|
||||
# tolower restriction: "Only characters specified for the keywords
|
||||
# lower and upper shall be specified.
|
||||
if (code_point in ctype_dict2['tolower']
|
||||
and code_point != ctype_dict2['tolower'][code_point]
|
||||
and not (code_point in ctype_dict2['lower']
|
||||
or code_point in ctype_dict2['upper'])):
|
||||
errorcounter = cperror(
|
||||
('error: %(char1)s is not upper|lower '
|
||||
+ 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||
'char1': chr(code_point),
|
||||
'cp1': hex(code_point),
|
||||
'cp2': hex(ctype_dict2['tolower'][code_point]),
|
||||
'char2': chr(ctype_dict2['tolower'][code_point])
|
||||
},
|
||||
errorcounter)
|
||||
# alpha restriction: "Characters classified as either upper or lower
|
||||
# shall automatically belong to this class.
|
||||
if ((code_point in ctype_dict2['lower']
|
||||
or code_point in ctype_dict2['upper'])
|
||||
and code_point not in ctype_dict2['alpha']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is upper|lower but not alpha' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
# alpha restriction: "No character specified for the keywords cntrl,
|
||||
# digit, punct or space shall be specified."
|
||||
if (code_point in ctype_dict2['alpha']
|
||||
and code_point in ctype_dict2['cntrl']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is alpha and cntrl' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['alpha']
|
||||
and code_point in ctype_dict2['digit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is alpha and digit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['alpha']
|
||||
and code_point in ctype_dict2['punct']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is alpha and punct' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['alpha']
|
||||
and code_point in ctype_dict2['space']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is alpha and space' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
# space restriction: "No character specified for the keywords upper,
|
||||
# lower, alpha, digit, graph or xdigit shall be specified."
|
||||
# upper, lower, alpha already checked above.
|
||||
if (code_point in ctype_dict2['space']
|
||||
and code_point in ctype_dict2['digit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is space and digit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['space']
|
||||
and code_point in ctype_dict2['graph']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is space and graph' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['space']
|
||||
and code_point in ctype_dict2['xdigit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is space and xdigit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
# cntrl restriction: "No character specified for the keywords upper,
|
||||
# lower, alpha, digit, punct, graph, print or xdigit shall be
|
||||
# specified." upper, lower, alpha already checked above.
|
||||
if (code_point in ctype_dict2['cntrl']
|
||||
and code_point in ctype_dict2['digit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is cntrl and digit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['cntrl']
|
||||
and code_point in ctype_dict2['punct']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is cntrl and punct' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['cntrl']
|
||||
and code_point in ctype_dict2['graph']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is cntrl and graph' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['cntrl']
|
||||
and code_point in ctype_dict2['print']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is cntrl and print' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['cntrl']
|
||||
and code_point in ctype_dict2['xdigit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is cntrl and xdigit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
# punct restriction: "No character specified for the keywords upper,
|
||||
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||||
# be specified." upper, lower, alpha, cntrl already checked above.
|
||||
if (code_point in ctype_dict2['punct']
|
||||
and code_point in ctype_dict2['digit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is punct and digit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['punct']
|
||||
and code_point in ctype_dict2['xdigit']):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is punct and xdigit' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point in ctype_dict2['punct']
|
||||
and code_point == 0x0020):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is punct.' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
# graph restriction: "No character specified for the keyword cntrl
|
||||
# shall be specified." Already checked above.
|
||||
|
||||
# print restriction: "No character specified for the keyword cntrl
|
||||
# shall be specified." Already checked above.
|
||||
|
||||
# graph - print relation: differ only in the <space> character.
|
||||
# How is this possible if there are more than one space character?!
|
||||
# I think susv2/xbd/locale.html should speak of "space characters",
|
||||
# not "space character".
|
||||
if (code_point in ctype_dict2['print']
|
||||
and not (code_point in ctype_dict2['graph']
|
||||
or code_point in ctype_dict2['space'])):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s is print but not graph|space' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
if (code_point not in ctype_dict2['print']
|
||||
and (code_point in ctype_dict2['graph']
|
||||
or code_point == 0x0020)):
|
||||
errorcounter = cperror(
|
||||
'error: %(char)s %(cp)s graph|space but not print' %{
|
||||
'char': chr(code_point),
|
||||
'cp': hex(code_point)
|
||||
},
|
||||
errorcounter)
|
||||
return errorcounter
|
||||
|
||||
if __name__ == "__main__":
|
||||
PARSER = argparse.ArgumentParser(
|
||||
description='''
|
||||
Compare the contents of LC_CTYPE in two files and check for errors.
|
||||
''')
|
||||
PARSER.add_argument(
|
||||
'-o', '--old_ctype_file',
|
||||
nargs='?',
|
||||
type=str,
|
||||
default='i18n',
|
||||
help='The old ctype file, default: %(default)s')
|
||||
PARSER.add_argument(
|
||||
'-n', '--new_ctype_file',
|
||||
nargs='?',
|
||||
type=str,
|
||||
default='unicode-ctype',
|
||||
help='The new ctype file, default: %(default)s')
|
||||
PARSER.add_argument(
|
||||
'-a', '--show_added_characters',
|
||||
action='store_true',
|
||||
help=('Show characters which were added to each '
|
||||
+ 'character class in detail.'))
|
||||
PARSER.add_argument(
|
||||
'-m', '--show_missing_characters',
|
||||
action='store_true',
|
||||
help=('Show characters which were removed from each '
|
||||
+ 'character class in detail.'))
|
||||
ARGS = PARSER.parse_args()
|
||||
|
||||
OLD_CTYPE_DICT = extract_character_classes(
|
||||
ARGS.old_ctype_file)
|
||||
NEW_CTYPE_DICT = extract_character_classes(
|
||||
ARGS.new_ctype_file)
|
||||
compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
|
||||
print('============================================================')
|
||||
print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
|
||||
print('------------------------------------------------------------')
|
||||
NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
|
||||
print('------------------------------------------------------------')
|
||||
print('Old file = %s' %ARGS.old_ctype_file)
|
||||
print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
|
||||
print('------------------------------------------------------------')
|
||||
print('============================================================')
|
||||
print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
|
||||
print('------------------------------------------------------------')
|
||||
NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
|
||||
print('------------------------------------------------------------')
|
||||
print('New file = %s' %ARGS.new_ctype_file)
|
||||
print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
|
||||
print('------------------------------------------------------------')
|
||||
if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
|
||||
exit(1)
|
||||
else:
|
||||
exit(0)
|
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
@ -0,0 +1,951 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
#
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
'''
|
||||
This file contains a list of test cases used by
|
||||
the ctype_compatibility.py script.
|
||||
'''
|
||||
|
||||
TEST_CASES = [
|
||||
[[0x0E2F, 0x0E46], [('alpha', True), ('punct', False)],
|
||||
'''Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
||||
<U0E2F>, <U0E46> should belong to punct. DerivedCoreProperties.txt
|
||||
says it is alpha. We trust DerivedCoreProperties.txt.'''
|
||||
],
|
||||
[[0x0E31, (0x0E34, 0x0E3A)], [('alpha', True)],
|
||||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||
<thep@links.nectec.or.th> says <U0E31>, <U0E34>..<U0E3A>
|
||||
are alpha. DerivedCoreProperties.txt agrees.'''
|
||||
],
|
||||
[[(0x0E47, 0x0E4C), 0x0E4E], [('alpha', False)],
|
||||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||
in that range is alphabetic, the others are *not*. We
|
||||
trust DerivedCoreProperties.txt.'''
|
||||
],
|
||||
[[0x0E4D], [('alpha', True)],
|
||||
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||
in that range is alphabetic, the others are *not*. We
|
||||
trust DerivedCoreProperties.txt.
|
||||
'''
|
||||
],
|
||||
[[0x0345], [('alpha', True), ('lower', True)],
|
||||
'''COMBINING GREEK YPOGEGRAMMENI
|
||||
According to DerivedCoreProperties.txt, this is “Alphabetic”
|
||||
and “Lowercase”.'''
|
||||
],
|
||||
[[(0x2160, 0x2188)], [('alpha', True)],
|
||||
'''Roman Numerals are “Alphabetic” according to
|
||||
DerivedCoreProperties.txt'''
|
||||
],
|
||||
[[(0x24B6, 0x24E9)], [('alpha', True)],
|
||||
'''Circled Latin letters are “Alphabetic” according to
|
||||
DerivedCoreProperties.txt'''
|
||||
],
|
||||
[[0x661], [('alpha', True), ('digit', False)],
|
||||
'''gen-unicode-ctype.c: All non-ASCII digits should be alphabetic.
|
||||
ISO C 99 forbids us to have them in category "digit", but we
|
||||
want iswalnum to return true on them. Don’t forget to
|
||||
have a look at all the other digits, 0x661 is just one
|
||||
example tested here.'''
|
||||
],
|
||||
[[(0x0030, 0x0039)], [('digit', True)],
|
||||
'''gen-unicode-ctype.c: All ASCII digits should be digits.'''
|
||||
],
|
||||
[[0x0009], [('blank', True)],
|
||||
'''gen-unicode-ctype.c: CHARACTER TABULATION'''
|
||||
],
|
||||
[[0x2007], [('blank', False), ('space', False)],
|
||||
'''gen-unicode-ctype.c: FIGURE SPACE, because it has <noBreak>
|
||||
in the description.'''
|
||||
],
|
||||
[[0x0009, 0x000A, 0x000B, 0x000C, 0x000D], [('space', True)],
|
||||
'''gen-unicode-ctype.c: CHARACTER TABULATION, LINE FEED (LF), LINE
|
||||
TABULATION, ;FORM FEED (FF), CARRIAGE RETURN (CR)'''
|
||||
],
|
||||
[[0x2028, 0x2029], [('cntrl', True)],
|
||||
'''gen-unicode-ctype.c: LINE SEPARATOR and PARAGRAPH SEPARATOR
|
||||
should be cntrl.'''
|
||||
],
|
||||
[[(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)],
|
||||
[('xdigit', True)],
|
||||
'''gen-unicode-ctype.c: ISO C 99 says (6.4.4.1): hexadecimal-digit:
|
||||
one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F (nothing else
|
||||
should be considered as a hexadecimal-digit)'''
|
||||
],
|
||||
[[0x0330], [('combining', True), ('combining_level3', False)],
|
||||
'''gen-unicode-ctype.c: COMBINING TILDE BELOW, canonical combining
|
||||
class value >= 200, should be in combining but not in
|
||||
combining_level3'''
|
||||
],
|
||||
[[0x0250, 0x0251, 0x0271], [('lower', True)],
|
||||
'''Should be lower in Unicode 7.0.0 (was not lower in
|
||||
Unicode 5.0.0).
|
||||
'''
|
||||
],
|
||||
[[0x2184], [('lower', True)],
|
||||
'''Should be lower both in Unicode 5.0.0 and 7.0.0'''
|
||||
],
|
||||
[[0xA67F], [('punct', False), ('alpha', True)],
|
||||
'''0xa67f CYRILLIC PAYEROK. Not in Unicode 5.0.0. In Unicode
|
||||
7.0.0. General category Lm (Letter
|
||||
modifier). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||
glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[0xA60C], [('punct', False), ('alpha', True)],
|
||||
'''0xa60c VAI SYLLABLE LENGTHENER. Not in Unicode 5.0.0.
|
||||
In Unicode 7.0.0. General category Lm (Letter
|
||||
modifier). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||
glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[0x2E2F], [('punct', False), ('alpha', True)],
|
||||
'''0x2E2F VERTICAL TILDE. Not in Unicode 5.0.0. In Unicode
|
||||
7.0.0. General category Lm (Letter
|
||||
modifier). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||
glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[(0x1090, 0x1099)], [('punct', False), ('alpha', True)],
|
||||
'''MYANMAR SHAN DIGIT ZERO - MYANMAR SHAN DIGIT NINE.
|
||||
These are digits, but because ISO C 99 forbids to
|
||||
put them into digit they should go into alpha.'''
|
||||
],
|
||||
[[0x103F], [('punct', False), ('alpha', True)],
|
||||
'''0x103F MYANMAR LETTER GREAT SA. Not in Unicode 5.0.0.
|
||||
In Unicode 7.0.0. General category Lo
|
||||
(Other_Letter). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”. Apparently added manually to punct by
|
||||
mistake in glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[0x0374], [('punct', False), ('alpha', True)],
|
||||
'''0x0374 GREEK NUMERAL SIGN. Unicode 5.0.0: general category
|
||||
Sk. Unicode 7.0.0: General category Lm
|
||||
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x02EC], [('punct', False), ('alpha', True)],
|
||||
'''0x02EC MODIFIER LETTER VOICING. Unicode 5.0.0: general category
|
||||
Sk. Unicode 7.0.0: General category Lm
|
||||
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x180E], [('space', False), ('blank', False)],
|
||||
'''0x180e MONGOLIAN VOWEL SEPARATOR. Unicode 5.0.0: General
|
||||
category Zs (Space_Separator) Unicode 7.0.0: General category Cf
|
||||
(Format).'''
|
||||
],
|
||||
[[0x1E9C, 0x1E9D, 0x1E9F],
|
||||
[('lower', True), ('upper', False), ('tolower', False),
|
||||
('toupper', False), ('totitle', False)],
|
||||
'''ẜ 0x1e9c LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE,
|
||||
ẝ 0x1e9d LATIN SMALL LETTER LONG S WITH HIGH STROKE,
|
||||
ẟ 0x1e9f LATIN SMALL LETTER DELTA. These are “Lowercase”
|
||||
according to DerivedCoreProperties.txt but no upper case versions
|
||||
exist.'''
|
||||
],
|
||||
[[0x1E9E],
|
||||
[('lower', False), ('upper', True), ('tolower', True),
|
||||
('toupper', False), ('totitle', False)],
|
||||
'''0x1E9E ẞ LATIN CAPITAL LETTER SHARP S This is “Uppercase”
|
||||
according to DerivedCoreProperties.txt and the lower case
|
||||
version is 0x00DF ß LATIN SMALL LETTER SHARP S.'''
|
||||
],
|
||||
[[0x2188],
|
||||
[('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''0x2188 ROMAN NUMERAL ONE HUNDRED THOUSAND. This is “Alphabetic”
|
||||
according to DerivedCoreProperties.txt. In glibc’s old
|
||||
LC_CTYPE, it was in “lower”, which seems to be a
|
||||
mistake. It is not “Lowercase” in
|
||||
DerivedCoreProperties.txt and does not have case mappings
|
||||
in UnicodeData.txt either.'''
|
||||
],
|
||||
[[0x2C71, 0x2C74, (0x2C77, 0x2C7A)],
|
||||
[('alpha', True), ('lower', True), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These are Latin small letters which were not in Unicode 5.0.0
|
||||
but are in Unicode 7.0.0. According to
|
||||
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||
uppercase versions exist. They have apparently been added
|
||||
manually to glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[0xA730, 0xA731],
|
||||
[('alpha', True), ('lower', True), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These are Latin small “capital” letters which were not in
|
||||
Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||
uppercase versions exist. They have apparently been added
|
||||
manually to glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[(0xA771, 0xA778)],
|
||||
[('alpha', True), ('lower', True), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These are Latin small (or small “capital”) letters which
|
||||
were not in Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||
uppercase versions exist. They have apparently been added
|
||||
manually to glibc’s old LC_CTYPE.'''
|
||||
],
|
||||
[[0x0375],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''“0375;GREEK LOWER NUMERAL SIGN;Sk;0;ON;;;;;N;;;;;”. Has
|
||||
apparently been added manually to glibc’s old LC_CTYPE as
|
||||
“combining_level3”. That seems wrong, it is no combining
|
||||
character because it does not have one of the general
|
||||
categories Mn, Mc, or Me. According to
|
||||
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||
],
|
||||
[[0x108D],
|
||||
[('combining', True), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''“108D;MYANMAR SIGN SHAN COUNCIL EMPHATIC
|
||||
TONE;Mn;220;NSM;;;;;N;;;;;”. Has apparently been added
|
||||
manually to glibc’s old LC_CTYPE as
|
||||
“combining_level3”. That seems wrong, although it is a
|
||||
combining character because it has the general category
|
||||
Mn, it is not “combining_level3” because the canonical
|
||||
combining class value is 220 which is >= 200. According to
|
||||
gen-unicode-ctype.c, “combining_level3” needs a
|
||||
canonical combining class value < 200. According to
|
||||
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||
],
|
||||
[[0x06DE],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
''' UnicodeData.txt 5.0.0: “06DE;ARABIC START OF RUB EL
|
||||
HIZB;Me;0;NSM;;;;;N;;;;;”; UnicodeData.txt 7.0.0:
|
||||
“06DE;ARABIC START OF RUB EL
|
||||
HIZB;So;0;ON;;;;;N;;;;;”. I.e. this used to be a
|
||||
combining character in Unicode 5.0.0 but not anymore in
|
||||
7.0.0. According to DerivedCoreProperties.txt it is not
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0BD0],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||
“0BD0;TAMIL OM;Lo;0;L;;;;;N;;;;;”. Apparently manually added to
|
||||
“combining” and “combining_level3” in glibc’s old
|
||||
LC_CTYPE. That seems wrong. According to
|
||||
DerivedCoreProperties.txt it is “Alphabetic”.'''
|
||||
],
|
||||
[[0x103F],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||
“103F;MYANMAR LETTER GREAT SA;Lo;0;L;;;;;N;;;;;”.
|
||||
Apparently manually added to “combining” and
|
||||
“combining_level3” in glibc’s old LC_CTYPE. That seems
|
||||
wrong. According to DerivedCoreProperties.txt it is
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0901, 0x0903)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These have general category “Mn” i.e. these are combining
|
||||
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||
“0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”,
|
||||
“0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”,
|
||||
“0903;DEVANAGARI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”.
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x093C],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''UnicodeData.txt (5.0.0 and 7.0.0): “093C;DEVANAGARI SIGN
|
||||
NUKTA;Mn;7;NSM;;;;;N;;;;;” According to
|
||||
DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”. glibc’s old LC_CTYPE has this in “alpha”.'''
|
||||
],
|
||||
[[(0x093E, 0x093F)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These have general category “Mc” i.e. these are combining
|
||||
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||
“093E;DEVANAGARI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“093F;DEVANAGARI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0940, 0x094C)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''These are all combining
|
||||
characters (“Mc” or “Mn” both in UnicodeData.txt 5.0.0 and 7.0.0).
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x094D],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
“094D;DEVANAGARI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0951, 0x0954)],
|
||||
[('combining', True), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0962, 0x0963), (0x0981, 0x0983)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x09BC],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“09BC;BENGALI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x09BE, 0x09BF), (0x09C0, 0x09C4), (0x09C7, 0x09C8),
|
||||
(0x09CB, 0x09CC)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“09BE;BENGALI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“09BF;BENGALI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
“09C0;BENGALI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||
“09C1;BENGALI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“09C2;BENGALI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“09C3;BENGALI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||
“09C4;BENGALI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||
“09C7;BENGALI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||
“09C8;BENGALI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||
“09CB;BENGALI VOWEL SIGN O;Mc;0;L;09C7 09BE;;;;N;;;;;”
|
||||
“09CC;BENGALI VOWEL SIGN AU;Mc;0;L;09C7 09D7;;;;N;;;;;”
|
||||
Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x09CD],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“09CD;BENGALI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x09D7, (0x09E2, 0x09E3)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x09F2, 0x09F3],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“09F2;BENGALI RUPEE MARK;Sc;0;ET;;;;;N;;;;;”
|
||||
“09F3;BENGALI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x09F4, 0x09FA)],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“09F4;BENGALI CURRENCY NUMERATOR ONE;No;0;L;;;;1/16;N;;;;;”
|
||||
“09F5;BENGALI CURRENCY NUMERATOR TWO;No;0;L;;;;1/8;N;;;;;”
|
||||
“09F6;BENGALI CURRENCY NUMERATOR THREE;No;0;L;;;;3/16;N;;;;;”
|
||||
“09F7;BENGALI CURRENCY NUMERATOR FOUR;No;0;L;;;;1/4;N;;;;;”
|
||||
“09F8;BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR;
|
||||
No;0;L;;;;3/4;N;;;;;”
|
||||
“09F9;BENGALI CURRENCY DENOMINATOR SIXTEEN;No;0;L;;;;16;N;;;;;”
|
||||
“09FA;BENGALI ISSHAR;So;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0A01, 0x0A03)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0A01;GURMUKHI SIGN ADAK BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A02;GURMUKHI SIGN BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A03;GURMUKHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0A3C],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0A3C;GURMUKHI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0A3E, 0x0A40), (0x0A41, 0x0A42), (0x0A47, 0x0A48),
|
||||
(0x0A4B, 0x0A4C)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0A3E;GURMUKHI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0A3F;GURMUKHI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
“0A40;GURMUKHI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||
“0A41;GURMUKHI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A42;GURMUKHI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A47;GURMUKHI VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A48;GURMUKHI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A4B;GURMUKHI VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A4C;GURMUKHI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0A4D],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0A51, (0x0A70, 0x0A71), 0x0A75, (0x0A81, 0x0A83)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0A51;GURMUKHI SIGN UDAAT;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A70;GURMUKHI TIPPI;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A71;GURMUKHI ADDAK;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A75;GURMUKHI SIGN YAKASH;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A81;GUJARATI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A82;GUJARATI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0A83;GUJARATI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0ABC],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0ABC;GUJARATI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACC)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0ABE;GUJARATI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0ABF;GUJARATI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
“0AC0;GUJARATI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||
“0AC1;GUJARATI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC2;GUJARATI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC3;GUJARATI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC4;GUJARATI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC5;GUJARATI VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC7;GUJARATI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC8;GUJARATI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AC9;GUJARATI VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;;”
|
||||
“0ACB;GUJARATI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;”
|
||||
“0ACC;GUJARATI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0ACD],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0ACD;GUJARATI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0AE2, 0x0AE3)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0AE2;GUJARATI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0AE3;GUJARATI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0AF1],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0AF1;GUJARATI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0B01, 0x0B03)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B01;ORIYA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B02;ORIYA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||
“0B03;ORIYA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0B3C],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B3C;ORIYA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0B3E, 0x0B44), (0x0B47, 0x0B48), (0x0B4B, 0x0B4C)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B3E;ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0B3F;ORIYA VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B40;ORIYA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||
“0B41;ORIYA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B42;ORIYA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B43;ORIYA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B44;ORIYA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B47;ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||
“0B48;ORIYA VOWEL SIGN AI;Mc;0;L;0B47 0B56;;;;N;;;;;”
|
||||
“0B4B;ORIYA VOWEL SIGN O;Mc;0;L;0B47 0B3E;;;;N;;;;;”
|
||||
“0B4C;ORIYA VOWEL SIGN AU;Mc;0;L;0B47 0B57;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0B4D],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B4D;ORIYA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0B56, 0x0B57), (0x0B62, 0x0B63)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B56;ORIYA AI LENGTH MARK;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B57;ORIYA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||
“0B62;ORIYA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0B63;ORIYA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0B70],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B70;ORIYA ISSHAR;So;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0B82],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0B82;TAMIL SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCC)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0BBE;TAMIL VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0BBF;TAMIL VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
“0BC0;TAMIL VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0BC1;TAMIL VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||
“0BC2;TAMIL VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||
“0BC6;TAMIL VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||
“0BC7;TAMIL VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||
“0BC8;TAMIL VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||
“0BCA;TAMIL VOWEL SIGN O;Mc;0;L;0BC6 0BBE;;;;N;;;;;”
|
||||
“0BCB;TAMIL VOWEL SIGN OO;Mc;0;L;0BC7 0BBE;;;;N;;;;;”
|
||||
“0BCC;TAMIL VOWEL SIGN AU;Mc;0;L;0BC6 0BD7;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0BCD],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0BCD;TAMIL SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0BD7],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0BD7;TAMIL AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0BF0, 0x0BFA)],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0BF0;TAMIL NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||
“0BF1;TAMIL NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||
“0BF2;TAMIL NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||
“0BF3;TAMIL DAY SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF4;TAMIL MONTH SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF5;TAMIL YEAR SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF6;TAMIL DEBIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF7;TAMIL CREDIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;”
|
||||
“0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||
“0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0C01, 0x0C03)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;”
|
||||
“0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||
“0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4C)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C3E;TELUGU VOWEL SIGN AA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C3F;TELUGU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C40;TELUGU VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C41;TELUGU VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||
“0C42;TELUGU VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||
“0C43;TELUGU VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||
“0C44;TELUGU VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||
“0C46;TELUGU VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C47;TELUGU VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C48;TELUGU VOWEL SIGN AI;Mn;0;NSM;0C46 0C56;;;;N;;;;;”
|
||||
“0C4A;TELUGU VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C4B;TELUGU VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C4C;TELUGU VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0C4D],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C4D;TELUGU SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0C55, 0x0C56), (0x0C62, 0x0C63)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C55;TELUGU LENGTH MARK;Mn;84;NSM;;;;;N;;;;;”
|
||||
“0C56;TELUGU AI LENGTH MARK;Mn;91;NSM;;;;;N;;;;;”
|
||||
“0C62;TELUGU VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C63;TELUGU VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0C78, 0x0C7F)],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C78;TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR;
|
||||
No;0;ON;;;;0;N;;;;;”
|
||||
“0C79;TELUGU FRACTION DIGIT ONE FOR ODD POWERS OF FOUR;
|
||||
No;0;ON;;;;1;N;;;;;”
|
||||
“0C7A;TELUGU FRACTION DIGIT TWO FOR ODD POWERS OF FOUR;
|
||||
No;0;ON;;;;2;N;;;;;”
|
||||
“0C7B;TELUGU FRACTION DIGIT THREE FOR ODD POWERS OF FOUR;
|
||||
No;0;ON;;;;3;N;;;;;”
|
||||
“0C7C;TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR;
|
||||
No;0;ON;;;;1;N;;;;;”
|
||||
“0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;
|
||||
No;0;ON;;;;2;N;;;;;”
|
||||
“0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;
|
||||
No;0;ON;;;;3;N;;;;;”
|
||||
“0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0C82, 0x0C83)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||
“0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0CBC],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0CBC;KANNADA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCC)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0CBE;KANNADA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0CBF;KANNADA VOWEL SIGN I;Mn;0;L;;;;;N;;;;;”
|
||||
“0CC0;KANNADA VOWEL SIGN II;Mc;0;L;0CBF 0CD5;;;;N;;;;;”
|
||||
“0CC1;KANNADA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||
“0CC2;KANNADA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||
“0CC3;KANNADA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||
“0CC4;KANNADA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||
“0CC6;KANNADA VOWEL SIGN E;Mn;0;L;;;;;N;;;;;”
|
||||
“0CC7;KANNADA VOWEL SIGN EE;Mc;0;L;0CC6 0CD5;;;;N;;;;;”
|
||||
“0CC8;KANNADA VOWEL SIGN AI;Mc;0;L;0CC6 0CD6;;;;N;;;;;”
|
||||
“0CCA;KANNADA VOWEL SIGN O;Mc;0;L;0CC6 0CC2;;;;N;;;;;”
|
||||
“0CCB;KANNADA VOWEL SIGN OO;Mc;0;L;0CCA 0CD5;;;;N;;;;;”
|
||||
“0CCC;KANNADA VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0CCD],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0CD5, 0x0CD6), (0x0CE2, 0x0CE3)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||
0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||
0CE2;KANNADA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
|
||||
0CE3;KANNADA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0D02, 0x0D03), (0x0D3E, 0x0D44), (0x0D46, 0x0D48),
|
||||
(0x0D4A, 0x0D4C)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||
“0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||
“0D3E;MALAYALAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||
“0D3F;MALAYALAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||
“0D40;MALAYALAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||
“0D41;MALAYALAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0D42;MALAYALAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0D43;MALAYALAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0D44;MALAYALAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0D46;MALAYALAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||
“0D47;MALAYALAM VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||
“0D48;MALAYALAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||
“0D4A;MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;”
|
||||
“0D4B;MALAYALAM VOWEL SIGN OO;Mc;0;L;0D47 0D3E;;;;N;;;;;”
|
||||
“0D4C;MALAYALAM VOWEL SIGN AU;Mc;0;L;0D46 0D57;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0D4D],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0D4D;MALAYALAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0D57, (0x0D62, 0x0D63)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0D57;MALAYALAM AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||
“0D62;MALAYALAM VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0D63;MALAYALAM VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0D70, 0x0D79)],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0D70;MALAYALAM NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||
“0D71;MALAYALAM NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||
“0D72;MALAYALAM NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||
“0D73;MALAYALAM FRACTION ONE QUARTER;No;0;L;;;;1/4;N;;;;;”
|
||||
“0D74;MALAYALAM FRACTION ONE HALF;No;0;L;;;;1/2;N;;;;;”
|
||||
“0D75;MALAYALAM FRACTION THREE QUARTERS;No;0;L;;;;3/4;N;;;;;”
|
||||
“0D79;MALAYALAM DATE MARK;So;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0D82, 0x0D83)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0D82;SINHALA SIGN ANUSVARAYA;Mc;0;L;;;;;N;;;;;”
|
||||
“0D83;SINHALA SIGN VISARGAYA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0DCA],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0DCA;SINHALA SIGN AL-LAKUNA;Mn;9;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0x0DCF, 0x0DD4), 0x0DD6, (0x0DD8, 0x0DDF), (0x0DF2, 0x0DF3)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0DCF;SINHALA VOWEL SIGN AELA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DD0;SINHALA VOWEL SIGN KETTI AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DD1;SINHALA VOWEL SIGN DIGA AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DD2;SINHALA VOWEL SIGN KETTI IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0DD3;SINHALA VOWEL SIGN DIGA IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0DD4;SINHALA VOWEL SIGN KETTI PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0DD6;SINHALA VOWEL SIGN DIGA PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||
“0DD8;SINHALA VOWEL SIGN GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DD9;SINHALA VOWEL SIGN KOMBUVA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DDA;SINHALA VOWEL SIGN DIGA KOMBUVA;Mc;0;L;0DD9 0DCA;;;;N;;;;;”
|
||||
“0DDB;SINHALA VOWEL SIGN KOMBU DEKA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DDC;SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA;
|
||||
Mc;0;L;0DD9 0DCF;;;;N;;;;;”
|
||||
“0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;
|
||||
Mc;0;L;0DDC 0DCA;;;;N;;;;;”
|
||||
“0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;
|
||||
Mc;0;L;0DD9 0DDF;;;;N;;;;;”
|
||||
“0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||
“0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[0x0DF4],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0xA789, 0xA78A)],
|
||||
[('combining', False), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“A789;MODIFIER LETTER COLON;Sk;0;L;;;;;N;;;;;”
|
||||
“A78A;MODIFIER LETTER SHORT EQUALS SIGN;Sk;0;L;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0xA926, 0xA92A)],
|
||||
[('combining', True), ('combining_level3', True),
|
||||
('alpha', True), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“A926;KAYAH LI VOWEL UE;Mn;0;NSM;;;;;N;;;;;”
|
||||
“A927;KAYAH LI VOWEL E;Mn;0;NSM;;;;;N;;;;;”
|
||||
“A928;KAYAH LI VOWEL U;Mn;0;NSM;;;;;N;;;;;”
|
||||
“A929;KAYAH LI VOWEL EE;Mn;0;NSM;;;;;N;;;;;”
|
||||
“A92A;KAYAH LI VOWEL O;Mn;0;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||
“Alphabetic”.'''
|
||||
],
|
||||
[[(0xA92B, 0xA92D)],
|
||||
[('combining', True), ('combining_level3', False),
|
||||
('alpha', False), ('lower', False), ('upper', False),
|
||||
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||
'''
|
||||
“A92B;KAYAH LI TONE PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||
“A92C;KAYAH LI TONE CALYA;Mn;220;NSM;;;;;N;;;;;”
|
||||
“A92D;KAYAH LI TONE CALYA PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||
“Alphabetic”.'''
|
||||
]
|
||||
]
|
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
@ -0,0 +1,751 @@
|
||||
#!/usr/bin/python3
|
||||
#
|
||||
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
||||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
|
||||
#
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
'''
|
||||
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
|
||||
DerivedCoreProperties.txt files.
|
||||
|
||||
To see how this script is used, call it with the “-h” option:
|
||||
|
||||
$ ./gen_unicode_ctype.py -h
|
||||
… prints usage message …
|
||||
'''
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import time
|
||||
import re
|
||||
|
||||
# Dictionary holding the entire contents of the UnicodeData.txt file
|
||||
#
|
||||
# Contents of this dictionary look like this:
|
||||
#
|
||||
# {0: {'category': 'Cc',
|
||||
# 'title': None,
|
||||
# 'digit': '',
|
||||
# 'name': '<control>',
|
||||
# 'bidi': 'BN',
|
||||
# 'combining': '0',
|
||||
# 'comment': '',
|
||||
# 'oldname': 'NULL',
|
||||
# 'decomposition': '',
|
||||
# 'upper': None,
|
||||
# 'mirrored': 'N',
|
||||
# 'lower': None,
|
||||
# 'decdigit': '',
|
||||
# 'numeric': ''},
|
||||
# …
|
||||
# }
|
||||
UNICODE_ATTRIBUTES = {}
|
||||
|
||||
# Dictionary holding the entire contents of the DerivedCoreProperties.txt file
|
||||
#
|
||||
# Contents of this dictionary look like this:
|
||||
#
|
||||
# {917504: ['Default_Ignorable_Code_Point'],
|
||||
# 917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
|
||||
# …
|
||||
# }
|
||||
DERIVED_CORE_PROPERTIES = {}
|
||||
|
||||
def fill_attribute(code_point, fields):
|
||||
'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.
|
||||
|
||||
One entry in the UNICODE_ATTRIBUTES dictionary represents one line
|
||||
in the UnicodeData.txt file.
|
||||
|
||||
'''
|
||||
UNICODE_ATTRIBUTES[code_point] = {
|
||||
'name': fields[1], # Character name
|
||||
'category': fields[2], # General category
|
||||
'combining': fields[3], # Canonical combining classes
|
||||
'bidi': fields[4], # Bidirectional category
|
||||
'decomposition': fields[5], # Character decomposition mapping
|
||||
'decdigit': fields[6], # Decimal digit value
|
||||
'digit': fields[7], # Digit value
|
||||
'numeric': fields[8], # Numeric value
|
||||
'mirrored': fields[9], # mirrored
|
||||
'oldname': fields[10], # Old Unicode 1.0 name
|
||||
'comment': fields[11], # comment
|
||||
# Uppercase mapping
|
||||
'upper': int(fields[12], 16) if fields[12] else None,
|
||||
# Lowercase mapping
|
||||
'lower': int(fields[13], 16) if fields[13] else None,
|
||||
# Titlecase mapping
|
||||
'title': int(fields[14], 16) if fields[14] else None,
|
||||
}
|
||||
|
||||
def fill_attributes(filename):
|
||||
'''Stores the entire contents of the UnicodeData.txt file
|
||||
in the UNICODE_ATTRIBUTES dictionary.
|
||||
|
||||
A typical line for a single code point in UnicodeData.txt looks
|
||||
like this:
|
||||
|
||||
0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
|
||||
|
||||
Code point ranges are indicated by pairs of lines like this:
|
||||
|
||||
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
|
||||
9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
|
||||
'''
|
||||
with open(filename, mode='r') as unicode_data_file:
|
||||
fields_start = []
|
||||
for line in unicode_data_file:
|
||||
fields = line.strip().split(';')
|
||||
if len(fields) != 15:
|
||||
sys.stderr.write(
|
||||
'short line in file "%(f)s": %(l)s\n' %{
|
||||
'f': filename, 'l': line})
|
||||
exit(1)
|
||||
if fields[2] == 'Cs':
|
||||
# Surrogates are UTF-16 artefacts,
|
||||
# not real characters. Ignore them.
|
||||
fields_start = []
|
||||
continue
|
||||
if fields[1].endswith(', First>'):
|
||||
fields_start = fields
|
||||
fields_start[1] = fields_start[1].split(',')[0][1:]
|
||||
continue
|
||||
if fields[1].endswith(', Last>'):
|
||||
fields[1] = fields[1].split(',')[0][1:]
|
||||
if fields[1:] != fields_start[1:]:
|
||||
sys.stderr.write(
|
||||
'broken code point range in file "%(f)s": %(l)s\n' %{
|
||||
'f': filename, 'l': line})
|
||||
exit(1)
|
||||
for code_point in range(
|
||||
int(fields_start[0], 16),
|
||||
int(fields[0], 16)+1):
|
||||
fill_attribute(code_point, fields)
|
||||
fields_start = []
|
||||
continue
|
||||
fill_attribute(int(fields[0], 16), fields)
|
||||
fields_start = []
|
||||
|
||||
def fill_derived_core_properties(filename):
|
||||
'''Stores the entire contents of the DerivedCoreProperties.txt file
|
||||
in the DERIVED_CORE_PROPERTIES dictionary.
|
||||
|
||||
Lines in DerivedCoreProperties.txt are either a code point range like
|
||||
this:
|
||||
|
||||
0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z
|
||||
|
||||
or a single code point like this:
|
||||
|
||||
00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR
|
||||
|
||||
'''
|
||||
with open(filename, mode='r') as derived_core_properties_file:
|
||||
for line in derived_core_properties_file:
|
||||
match = re.match(
|
||||
r'^(?P<codepoint1>[0-9A-F]{4,6})'
|
||||
+ r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
|
||||
+ r'\s*;\s*(?P<property>[a-zA-Z_]+)',
|
||||
line)
|
||||
if not match:
|
||||
continue
|
||||
start = match.group('codepoint1')
|
||||
end = match.group('codepoint2')
|
||||
if not end:
|
||||
end = start
|
||||
for code_point in range(int(start, 16), int(end, 16)+1):
|
||||
prop = match.group('property')
|
||||
if code_point in DERIVED_CORE_PROPERTIES:
|
||||
DERIVED_CORE_PROPERTIES[code_point].append(prop)
|
||||
else:
|
||||
DERIVED_CORE_PROPERTIES[code_point] = [prop]
|
||||
|
||||
def to_upper(code_point):
|
||||
'''Returns the code point of the uppercase version
|
||||
of the given code point'''
|
||||
if (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['upper']):
|
||||
return UNICODE_ATTRIBUTES[code_point]['upper']
|
||||
else:
|
||||
return code_point
|
||||
|
||||
def to_lower(code_point):
|
||||
'''Returns the code point of the lowercase version
|
||||
of the given code point'''
|
||||
if (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['lower']):
|
||||
return UNICODE_ATTRIBUTES[code_point]['lower']
|
||||
else:
|
||||
return code_point
|
||||
|
||||
def to_title(code_point):
|
||||
'''Returns the code point of the titlecase version
|
||||
of the given code point'''
|
||||
if (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['title']):
|
||||
return UNICODE_ATTRIBUTES[code_point]['title']
|
||||
else:
|
||||
return code_point
|
||||
|
||||
def is_upper(code_point):
|
||||
'''Checks whether the character with this code point is uppercase'''
|
||||
return (to_lower(code_point) != code_point
|
||||
or (code_point in DERIVED_CORE_PROPERTIES
|
||||
and 'Uppercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||||
|
||||
def is_lower(code_point):
|
||||
'''Checks whether the character with this code point is lowercase'''
|
||||
# Some characters are defined as “Lowercase” in
|
||||
# DerivedCoreProperties.txt but do not have a mapping to upper
|
||||
# case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
|
||||
# one of these.
|
||||
return (to_upper(code_point) != code_point
|
||||
# <U00DF> is lowercase, but without simple to_upper mapping.
|
||||
or code_point == 0x00DF
|
||||
or (code_point in DERIVED_CORE_PROPERTIES
|
||||
and 'Lowercase' in DERIVED_CORE_PROPERTIES[code_point]))
|
||||
|
||||
def is_alpha(code_point):
|
||||
'''Checks whether the character with this code point is alphabetic'''
|
||||
return ((code_point in DERIVED_CORE_PROPERTIES
|
||||
and
|
||||
'Alphabetic' in DERIVED_CORE_PROPERTIES[code_point])
|
||||
or
|
||||
# Consider all the non-ASCII digits as alphabetic.
|
||||
# ISO C 99 forbids us to have them in category “digit”,
|
||||
# but we want iswalnum to return true on them.
|
||||
(UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
|
||||
and not (code_point >= 0x0030 and code_point <= 0x0039)))
|
||||
|
||||
def is_digit(code_point):
|
||||
'''Checks whether the character with this code point is a digit'''
|
||||
if False:
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd')
|
||||
# Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
|
||||
# a zero. Must add <0> in front of them by hand.
|
||||
else:
|
||||
# SUSV2 gives us some freedom for the "digit" category, but ISO C 99
|
||||
# takes it away:
|
||||
# 7.25.2.1.5:
|
||||
# The iswdigit function tests for any wide character that
|
||||
# corresponds to a decimal-digit character (as defined in 5.2.1).
|
||||
# 5.2.1:
|
||||
# the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
|
||||
return (code_point >= 0x0030 and code_point <= 0x0039)
|
||||
|
||||
def is_outdigit(code_point):
|
||||
'''Checks whether the character with this code point is outdigit'''
|
||||
return (code_point >= 0x0030 and code_point <= 0x0039)
|
||||
|
||||
def is_blank(code_point):
|
||||
'''Checks whether the character with this code point is blank'''
|
||||
return (code_point == 0x0009 # '\t'
|
||||
# Category Zs without mention of '<noBreak>'
|
||||
or (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['category'] == 'Zs'
|
||||
and '<noBreak>' not in
|
||||
UNICODE_ATTRIBUTES[code_point]['decomposition']))
|
||||
|
||||
def is_space(code_point):
|
||||
'''Checks whether the character with this code point is a space'''
|
||||
# Don’t make U+00A0 a space. Non-breaking space means that all programs
|
||||
# should treat it like a punctuation character, not like a space.
|
||||
return (code_point == 0x0020 # ' '
|
||||
or code_point == 0x000C # '\f'
|
||||
or code_point == 0x000A # '\n'
|
||||
or code_point == 0x000D # '\r'
|
||||
or code_point == 0x0009 # '\t'
|
||||
or code_point == 0x000B # '\v'
|
||||
# Categories Zl, Zp, and Zs without mention of "<noBreak>"
|
||||
or (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and
|
||||
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']
|
||||
or
|
||||
(UNICODE_ATTRIBUTES[code_point]['category'] in ['Zs']
|
||||
and
|
||||
'<noBreak>' not in
|
||||
UNICODE_ATTRIBUTES[code_point]['decomposition']))))
|
||||
|
||||
def is_cntrl(code_point):
|
||||
'''Checks whether the character with this code point is
|
||||
a control character'''
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and (UNICODE_ATTRIBUTES[code_point]['name'] == '<control>'
|
||||
or
|
||||
UNICODE_ATTRIBUTES[code_point]['category'] in ['Zl', 'Zp']))
|
||||
|
||||
def is_xdigit(code_point):
|
||||
'''Checks whether the character with this code point is
|
||||
a hexadecimal digit'''
|
||||
if False:
|
||||
return (is_digit(code_point)
|
||||
or (code_point >= 0x0041 and code_point <= 0x0046)
|
||||
or (code_point >= 0x0061 and code_point <= 0x0066))
|
||||
else:
|
||||
# SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
|
||||
# takes it away:
|
||||
# 7.25.2.1.12:
|
||||
# The iswxdigit function tests for any wide character that
|
||||
# corresponds to a hexadecimal-digit character (as defined
|
||||
# in 6.4.4.1).
|
||||
# 6.4.4.1:
|
||||
# hexadecimal-digit: one of
|
||||
# 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
|
||||
return ((code_point >= 0x0030 and code_point <= 0x0039)
|
||||
or (code_point >= 0x0041 and code_point <= 0x0046)
|
||||
or (code_point >= 0x0061 and code_point <= 0x0066))
|
||||
|
||||
def is_graph(code_point):
|
||||
'''Checks whether the character with this code point is
|
||||
a graphical character'''
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||||
and not is_space(code_point))
|
||||
|
||||
def is_print(code_point):
|
||||
'''Checks whether the character with this code point is printable'''
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['name'] != '<control>'
|
||||
and UNICODE_ATTRIBUTES[code_point]['category'] not in ['Zl', 'Zp'])
|
||||
|
||||
def is_punct(code_point):
|
||||
'''Checks whether the character with this code point is punctuation'''
|
||||
if False:
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and UNICODE_ATTRIBUTES[code_point]['category'].startswith('P'))
|
||||
else:
|
||||
# The traditional POSIX definition of punctuation is every graphic,
|
||||
# non-alphanumeric character.
|
||||
return (is_graph(code_point)
|
||||
and not is_alpha(code_point)
|
||||
and not is_digit(code_point))
|
||||
|
||||
def is_combining(code_point):
|
||||
'''Checks whether the character with this code point is
|
||||
a combining character'''
|
||||
# Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
||||
# file. In 3.0.1 it was identical to the union of the general categories
|
||||
# "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
||||
# PropList.txt file, so we take the latter definition.
|
||||
return (UNICODE_ATTRIBUTES[code_point]['name']
|
||||
and
|
||||
UNICODE_ATTRIBUTES[code_point]['category'] in ['Mn', 'Mc', 'Me'])
|
||||
|
||||
def is_combining_level3(code_point):
|
||||
'''Checks whether the character with this code point is
|
||||
a combining level3 character'''
|
||||
return (is_combining(code_point)
|
||||
and
|
||||
int(UNICODE_ATTRIBUTES[code_point]['combining']) in range(0, 200))
|
||||
|
||||
def ucs_symbol(code_point):
|
||||
'''Return the UCS symbol string for a Unicode character.'''
|
||||
if code_point < 0x10000:
|
||||
return '<U{:04X}>'.format(code_point)
|
||||
else:
|
||||
return '<U{:08X}>'.format(code_point)
|
||||
|
||||
def ucs_symbol_range(code_point_low, code_point_high):
|
||||
'''Returns a string UCS symbol string for a code point range.
|
||||
|
||||
Example:
|
||||
|
||||
<U0041>..<U005A>
|
||||
'''
|
||||
return ucs_symbol(code_point_low) + '..' + ucs_symbol(code_point_high)
|
||||
|
||||
def code_point_ranges(is_class_function):
|
||||
'''Returns a list of ranges of code points for which is_class_function
|
||||
returns True.
|
||||
|
||||
Example:
|
||||
|
||||
[[65, 90], [192, 214], [216, 222], [256], … ]
|
||||
'''
|
||||
cp_ranges = []
|
||||
for code_point in sorted(UNICODE_ATTRIBUTES):
|
||||
if is_class_function(code_point):
|
||||
if (cp_ranges
|
||||
and cp_ranges[-1][-1] == code_point - 1):
|
||||
if len(cp_ranges[-1]) == 1:
|
||||
cp_ranges[-1].append(code_point)
|
||||
else:
|
||||
cp_ranges[-1][-1] = code_point
|
||||
else:
|
||||
cp_ranges.append([code_point])
|
||||
return cp_ranges
|
||||
|
||||
def output_charclass(i18n_file, class_name, is_class_function):
|
||||
'''Output a LC_CTYPE character class section
|
||||
|
||||
Example:
|
||||
|
||||
upper /
|
||||
<U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
|
||||
…
|
||||
<U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
|
||||
<U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>
|
||||
'''
|
||||
cp_ranges = code_point_ranges(is_class_function)
|
||||
if cp_ranges:
|
||||
i18n_file.write('%s /\n' %class_name)
|
||||
max_column = 75
|
||||
prefix = ' '
|
||||
line = prefix
|
||||
range_string = ''
|
||||
for code_point_range in cp_ranges:
|
||||
if line.strip():
|
||||
line += ';'
|
||||
if len(code_point_range) == 1:
|
||||
range_string = ucs_symbol(code_point_range[0])
|
||||
else:
|
||||
range_string = ucs_symbol_range(
|
||||
code_point_range[0], code_point_range[-1])
|
||||
if len(line+range_string) > max_column:
|
||||
i18n_file.write(line+'/\n')
|
||||
line = prefix
|
||||
line += range_string
|
||||
if line.strip():
|
||||
i18n_file.write(line+'\n')
|
||||
i18n_file.write('\n')
|
||||
|
||||
def output_charmap(i18n_file, map_name, map_function):
|
||||
'''Output a LC_CTYPE character map section
|
||||
|
||||
Example:
|
||||
|
||||
toupper /
|
||||
(<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
|
||||
…
|
||||
(<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
|
||||
(<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)
|
||||
'''
|
||||
max_column = 75
|
||||
prefix = ' '
|
||||
line = prefix
|
||||
map_string = ''
|
||||
i18n_file.write('%s /\n' %map_name)
|
||||
for code_point in sorted(UNICODE_ATTRIBUTES):
|
||||
mapped = map_function(code_point)
|
||||
if code_point != mapped:
|
||||
if line.strip():
|
||||
line += ';'
|
||||
map_string = '(' \
|
||||
+ ucs_symbol(code_point) \
|
||||
+ ',' \
|
||||
+ ucs_symbol(mapped) \
|
||||
+ ')'
|
||||
if len(line+map_string) > max_column:
|
||||
i18n_file.write(line+'/\n')
|
||||
line = prefix
|
||||
line += map_string
|
||||
if line.strip():
|
||||
i18n_file.write(line+'\n')
|
||||
i18n_file.write('\n')
|
||||
|
||||
def verifications():
|
||||
'''Tests whether the is_* functions observe the known restrictions'''
|
||||
for code_point in sorted(UNICODE_ATTRIBUTES):
|
||||
# toupper restriction: "Only characters specified for the keywords
|
||||
# lower and upper shall be specified.
|
||||
if (to_upper(code_point) != code_point
|
||||
and not (is_lower(code_point) or is_upper(code_point))):
|
||||
sys.stderr.write(
|
||||
('%(sym)s is not upper|lower '
|
||||
+ 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||||
'sym': ucs_symbol(code_point),
|
||||
'c': code_point,
|
||||
'uc': to_upper(code_point)})
|
||||
# tolower restriction: "Only characters specified for the keywords
|
||||
# lower and upper shall be specified.
|
||||
if (to_lower(code_point) != code_point
|
||||
and not (is_lower(code_point) or is_upper(code_point))):
|
||||
sys.stderr.write(
|
||||
('%(sym)s is not upper|lower '
|
||||
+ 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
|
||||
'sym': ucs_symbol(code_point),
|
||||
'c': code_point,
|
||||
'uc': to_lower(code_point)})
|
||||
# alpha restriction: "Characters classified as either upper or lower
|
||||
# shall automatically belong to this class.
|
||||
if ((is_lower(code_point) or is_upper(code_point))
|
||||
and not is_alpha(code_point)):
|
||||
sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
# alpha restriction: “No character specified for the keywords cntrl,
|
||||
# digit, punct or space shall be specified.”
|
||||
if (is_alpha(code_point) and is_cntrl(code_point)):
|
||||
sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_alpha(code_point) and is_digit(code_point)):
|
||||
sys.stderr.write('%(sym)s is alpha and digit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_alpha(code_point) and is_punct(code_point)):
|
||||
sys.stderr.write('%(sym)s is alpha and punct\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_alpha(code_point) and is_space(code_point)):
|
||||
sys.stderr.write('%(sym)s is alpha and space\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
# space restriction: “No character specified for the keywords upper,
|
||||
# lower, alpha, digit, graph or xdigit shall be specified.”
|
||||
# upper, lower, alpha already checked above.
|
||||
if (is_space(code_point) and is_digit(code_point)):
|
||||
sys.stderr.write('%(sym)s is space and digit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_space(code_point) and is_graph(code_point)):
|
||||
sys.stderr.write('%(sym)s is space and graph\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_space(code_point) and is_xdigit(code_point)):
|
||||
sys.stderr.write('%(sym)s is space and xdigit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
# cntrl restriction: “No character specified for the keywords upper,
|
||||
# lower, alpha, digit, punct, graph, print or xdigit shall be
|
||||
# specified.” upper, lower, alpha already checked above.
|
||||
if (is_cntrl(code_point) and is_digit(code_point)):
|
||||
sys.stderr.write('%(sym)s is cntrl and digit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_cntrl(code_point) and is_punct(code_point)):
|
||||
sys.stderr.write('%(sym)s is cntrl and punct\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_cntrl(code_point) and is_graph(code_point)):
|
||||
sys.stderr.write('%(sym)s is cntrl and graph\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_cntrl(code_point) and is_print(code_point)):
|
||||
sys.stderr.write('%(sym)s is cntrl and print\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_cntrl(code_point) and is_xdigit(code_point)):
|
||||
sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
# punct restriction: “No character specified for the keywords upper,
|
||||
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||||
# be specified.” upper, lower, alpha, cntrl already checked above.
|
||||
if (is_punct(code_point) and is_digit(code_point)):
|
||||
sys.stderr.write('%(sym)s is punct and digit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_punct(code_point) and is_xdigit(code_point)):
|
||||
sys.stderr.write('%(sym)s is punct and xdigit\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (is_punct(code_point) and code_point == 0x0020):
|
||||
sys.stderr.write('%(sym)s is punct\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
# graph restriction: “No character specified for the keyword cntrl
|
||||
# shall be specified.” Already checked above.
|
||||
|
||||
# print restriction: “No character specified for the keyword cntrl
|
||||
# shall be specified.” Already checked above.
|
||||
|
||||
# graph - print relation: differ only in the <space> character.
|
||||
# How is this possible if there are more than one space character?!
|
||||
# I think susv2/xbd/locale.html should speak of “space characters”,
|
||||
# not “space character”.
|
||||
if (is_print(code_point)
|
||||
and not (is_graph(code_point) or is_space(code_point))):
|
||||
sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
if (not is_print(code_point)
|
||||
and (is_graph(code_point) or code_point == 0x0020)):
|
||||
sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
|
||||
'sym': ucs_symbol(code_point)})
|
||||
|
||||
def read_input_file(filename):
    '''Read the original glibc i18n file and return its head and tail.

    Only the LC_CTYPE character classes and the date stamp are meant to
    be regenerated; everything else in the i18n file must stay
    unchanged.  Returning the original head (everything up to and
    including the “LC_CTYPE” line, with the date stamp refreshed) and
    tail (from “translit_start” to the end of the file) makes it
    possible to emit a complete replacement file without having to cut
    and paste the generated data into the original by hand.
    '''
    head = ''
    tail = ''
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    with open(filename, mode='r') as i18n_file:
        # Collect the head, refreshing the date stamp on the way.
        for line in i18n_file:
            matched = date_pattern.match(line)
            if matched:
                line = (matched.group('key')
                        + '"{:s}"\n'.format(time.strftime('%Y-%m-%d')))
            head += line
            if line.startswith('LC_CTYPE'):
                break
        # Skip the old character classes up to “translit_start”.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail = line
                break
        # Everything from “translit_start” onwards belongs to the tail.
        for line in i18n_file:
            tail += line
    return (head, tail)
|
||||
|
||||
def output_head(i18n_file, unicode_version, head=''):
    '''Write the part of the output file up to and including the
    “LC_CTYPE” line.

    If an original i18n file was given on the command line, its head
    (with the date stamp already refreshed by read_input_file) is
    reused verbatim; otherwise a minimal LC_IDENTIFICATION section is
    generated from scratch.
    '''
    if ARGS.input_file and head:
        i18n_file.write(head)
        return
    # No original file available: generate a fresh header.
    header_lines = [
        'escape_char /',
        'comment_char %',
        '',
        '% Generated automatically by '
        + 'gen_unicode_ctype.py '
        + 'for Unicode {:s}.'.format(unicode_version),
        '',
        'LC_IDENTIFICATION',
        'title "Unicode {:s} FDCC-set"'.format(unicode_version),
        'source "UnicodeData.txt, '
        + 'DerivedCoreProperties.txt"',
        'address ""',
        'contact ""',
        'email "bug-glibc-locales@gnu.org"',
        'tel ""',
        'fax ""',
        'language ""',
        'territory "Earth"',
        'revision "{:s}"'.format(unicode_version),
        'date "{:s}"'.format(time.strftime('%Y-%m-%d')),
        'category "unicode:2014";LC_CTYPE',
        'END LC_IDENTIFICATION',
        '',
        'LC_CTYPE',
    ]
    for header_line in header_lines:
        i18n_file.write(header_line + '\n')
|
||||
|
||||
def output_tail(i18n_file, tail=''):
    '''Write the part of the output file that follows the last
    “LC_CTYPE” character class.

    If an original i18n file was given on the command line, its tail is
    reused verbatim; otherwise only the closing “END LC_CTYPE” line is
    written.
    '''
    text = tail if (ARGS.input_file and tail) else 'END LC_CTYPE\n'
    i18n_file.write(text)
|
||||
|
||||
def output_tables(i18n_file, unicode_version):
    '''Write the new LC_CTYPE character classes to the output file'''
    # Section heading comments for the generated data.
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    # Case classes (upper/lower are the cased subsets of alpha).
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', is_alpha)
    # ISO C 99 restricts "digit" to the BASIC LATIN digits 0-9.
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', is_digit)
    # "outdigit" is deliberately emitted only as a comment: localedef
    # supplies the default "0".."9", and writing real data here would
    # prevent locales that copy this file from defining their own.
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', is_space)
    output_charclass(i18n_file, 'cntrl', is_cntrl)
    output_charclass(i18n_file, 'punct', is_punct)
    output_charclass(i18n_file, 'graph', is_graph)
    output_charclass(i18n_file, 'print', is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', is_xdigit)
    output_charclass(i18n_file, 'blank', is_blank)
    # Case mapping tables.
    output_charmap(i18n_file, 'toupper', to_upper)
    output_charmap(i18n_file, 'tolower', to_lower)
    output_charmap(i18n_file, 'map "totitle";', to_title)
    # Combining character classes, per ISO/IEC 10646-1 annex B.
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";', is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file,
                     'class "combining_level3";', is_combining_level3)
|
||||
|
||||
if __name__ == "__main__":
    # Command-line interface; run with “-h” for the full usage message.
    PARSER = argparse.ArgumentParser(
        description='''
    Generate a Unicode conforming LC_CTYPE category from
    UnicodeData.txt and DerivedCoreProperties.txt files.
    ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data and check it for consistency before
    # generating anything.
    fill_attributes(ARGS.unicode_data_file)
    fill_derived_core_properties(ARGS.derived_core_properties_file)
    verifications()
    # Reuse the head and tail of the original i18n file, if one was
    # given, so everything outside LC_CTYPE is copied unchanged.
    HEAD = TAIL = ''
    if ARGS.input_file:
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version)
        output_tail(I18N_FILE, tail=TAIL)
|
50
localedata/unicode-gen/unicode-license.txt
Normal file
50
localedata/unicode-gen/unicode-license.txt
Normal file
@ -0,0 +1,50 @@
|
||||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
|
||||
Unicode Data Files include all data files under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
|
||||
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
|
||||
online code charts under the directory http://www.unicode.org/Public/.
|
||||
Software includes any source code published in the Unicode Standard or under
|
||||
the directories http://www.unicode.org/Public/,
|
||||
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement. BY
|
||||
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
|
||||
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
|
||||
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
|
||||
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
|
||||
FILES OR SOFTWARE.
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under
|
||||
the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of the Unicode data files and any associated documentation (the "Data
|
||||
Files") or Unicode software and any associated documentation (the "Software")
|
||||
to deal in the Data Files or Software without restriction, including without
|
||||
limitation the rights to use, copy, modify, merge, publish, distribute, and/or
|
||||
sell copies of the Data Files or Software, and to permit persons to whom the
|
||||
Data Files or Software are furnished to do so, provided that (a) the above
|
||||
copyright notice(s) and this permission notice appear with all copies of the
|
||||
Data Files or Software, (b) both the above copyright notice(s) and this
|
||||
permission notice appear in associated documentation, and (c) there is clear
|
||||
notice in each modified Data File or in the Software as well as in the
|
||||
documentation associated with the Data File(s) or Software that the data or
|
||||
software has been modified.
|
||||
|
||||
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
|
||||
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
|
||||
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
||||
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
|
||||
DATA FILES OR SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder shall
|
||||
not be used in advertising or otherwise to promote the sale, use or other
|
||||
dealings in these Data Files or Software without prior written authorization
|
||||
of the copyright holder.
|
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
@ -0,0 +1,399 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
#
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
'''
|
||||
This script is useful for checking backward compatibility of newly
|
||||
generated UTF-8 file from utf8_gen.py script
|
||||
|
||||
To see how this script is used, call it with the “-h” option:
|
||||
|
||||
$ ./utf8_compatibility.py -h
|
||||
… prints usage message …
|
||||
'''
|
||||
|
||||
import sys
|
||||
import re
|
||||
import argparse
|
||||
|
||||
# Dictionary holding the entire contents of the UnicodeData.txt file
|
||||
#
|
||||
# Contents of this dictionary look like this:
|
||||
#
|
||||
# {0: {'category': 'Cc',
|
||||
# 'title': None,
|
||||
# 'digit': '',
|
||||
# 'name': '<control>',
|
||||
# 'bidi': 'BN',
|
||||
# 'combining': '0',
|
||||
# 'comment': '',
|
||||
# 'oldname': 'NULL',
|
||||
# 'decomposition': '',
|
||||
# 'upper': None,
|
||||
# 'mirrored': 'N',
|
||||
# 'lower': None,
|
||||
# 'decdigit': '',
|
||||
# 'numeric': ''},
|
||||
# …
|
||||
# }
|
||||
UNICODE_ATTRIBUTES = {}
|
||||
|
||||
# Dictionary holding the entire contents of the EastAsianWidths.txt file
|
||||
#
|
||||
# Contents of this dictionary look like this:
|
||||
#
|
||||
# {0: 'N', … , 45430: 'W', …}
|
||||
EAST_ASIAN_WIDTHS = {}
|
||||
|
||||
def fill_attribute(code_point, fields):
    '''Store the values from one UnicodeData.txt line in
    UNICODE_ATTRIBUTES[code_point].

    Each entry of the UNICODE_ATTRIBUTES dictionary corresponds to a
    single line of the UnicodeData.txt file.
    '''
    # The three case mappings are hexadecimal code points in the file,
    # or empty strings when no mapping exists.
    upper_case = int(fields[12], 16) if fields[12] else None
    lower_case = int(fields[13], 16) if fields[13] else None
    title_case = int(fields[14], 16) if fields[14] else None
    UNICODE_ATTRIBUTES[code_point] = {
        'name': fields[1],            # Character name
        'category': fields[2],        # General category
        'combining': fields[3],       # Canonical combining classes
        'bidi': fields[4],            # Bidirectional category
        'decomposition': fields[5],   # Character decomposition mapping
        'decdigit': fields[6],        # Decimal digit value
        'digit': fields[7],           # Digit value
        'numeric': fields[8],         # Numeric value
        'mirrored': fields[9],        # Mirrored
        'oldname': fields[10],        # Old Unicode 1.0 name
        'comment': fields[11],        # Comment
        'upper': upper_case,          # Uppercase mapping
        'lower': lower_case,          # Lowercase mapping
        'title': title_case,          # Titlecase mapping
    }
|
||||
|
||||
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Holds the fields of a pending “…, First>” line while we look
        # for the matching “…, Last>” line.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                # Every UnicodeData.txt line has exactly 15
                # semicolon-separated fields; anything else means a
                # corrupt input file.
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                        'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # Start of a code point range: remember the fields and
                # strip the “<…, First>” decoration from the name.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                # End of a code point range: all fields except the code
                # point itself must match the “First” line.
                fields[1] = fields[1].split(',')[0][1:]
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Give every code point in the range the same attributes.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            # Ordinary line describing a single code point.
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
|
||||
|
||||
def fill_east_asian_widths(filename):
    '''Store the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            matched = line_pattern.match(line)
            if not matched:
                # Comments and blank lines do not match; skip them.
                continue
            first = int(matched.group('codepoint1'), 16)
            second = matched.group('codepoint2')
            # A single code point is treated as a one-element range.
            last = int(second, 16) if second else first
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = matched.group('property')
|
||||
|
||||
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points in the BMP use four hex digits (e.g. <U0041>);
    code points outside the BMP use eight (e.g. <U0010FFFD>).
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{0:0{1}X}>'.format(code_point, digits)
|
||||
|
||||
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file.

    Keys are code point integers, values are the “/x..” UTF-8 byte
    sequence strings from the file.  Exits with an error message if the
    file has no complete CHARMAP section.
    '''
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything before the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                # Commented-out entries (e.g. surrogates) are ignored.
                continue
            # Matches a single code point “<U0041> /x41 …” or a range
            # “<U3400>..<U343F> /xe3/x90/x80 …”.
            # Bug fix: the non-capturing group was misspelt “(:?” and
            # the second character class contained a stray “-”
            # (“[0-9-A-F]”); both typos are corrected here.
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                # A single code point is a one-element range.
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
    sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                     %file_name)
    exit(1)
|
||||
|
||||
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    # Code points present in the old file but missing from the new one.
    # NOTE(review): the “else None” fallbacks below feed None into a
    # “{:s}” format spec, which raises TypeError for code points absent
    # from UNICODE_ATTRIBUTES — only hit when -u was not given or the
    # code point is unknown; verify before relying on these reports.
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(set(ocharmap)-set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap)-set(ncharmap)):
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    # Code points whose UTF-8 byte sequence changed.
    print('------------------------------------------------------------')
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    # Code points newly added in the new file.
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          %len(set(ncharmap)-set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap)-set(ocharmap)):
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
|
||||
|
||||
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file.

    Keys are code point integers, values are the widths 0 or 2 (code
    points not listed in WIDTH default to width 1).  Exits with an
    error message if the file has no complete WIDTH section.
    '''
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything before the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            # Matches a single code point “<U3000> 2” or a range
            # “<U1100>...<U115F> 2” (note: ranges in the WIDTH section
            # use three dots, unlike the two dots in CHARMAP).
            # Bug fix: the non-capturing group was misspelt “(:?” and
            # the second character class contained a stray “-”
            # (“[0-9-A-F]”); both typos are corrected here.
            match = re.match(
                r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
                + r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
                + r'\s+(?P<width>[02])',
                line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                # A single code point is a one-element range.
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
    # Bug fix: this error path referenced the undefined name “file”
    # (a NameError at runtime) and then fell through returning None;
    # report against file_name and exit like create_charmap_dictionary.
    sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                     %file_name)
    exit(1)
|
||||
|
||||
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file
    '''
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    # Code points dropped from WIDTH: these fall back to width 1.
    # NOTE(review): the “else None” fallbacks below feed None into
    # “{:s}”/“{:2s}”/“{:3s}” format specs, which raises TypeError for
    # code points absent from EAST_ASIAN_WIDTHS/UNICODE_ATTRIBUTES —
    # only hit when -e/-u were not given; verify before relying on
    # these detail reports.
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          %len(set(owidth)-set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth)-set(nwidth)):
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
    # Code points whose width value changed between the two files.
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
    # Code points newly added to WIDTH: these had width 1 before.
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          %len(set(nwidth)-set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth)-set(owidth)):
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + 'eaw={:s} '.format(
                      EAST_ASIAN_WIDTHS[key]
                      if key in EAST_ASIAN_WIDTHS else None)
                  + 'category={:2s} '.format(
                      UNICODE_ATTRIBUTES[key]['category']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'bidi={:3s} '.format(
                      UNICODE_ATTRIBUTES[key]['bidi']
                      if key in UNICODE_ATTRIBUTES else None)
                  + 'name={:s}'.format(
                      UNICODE_ATTRIBUTES[key]['name']
                      if key in UNICODE_ATTRIBUTES else None))
|
||||
|
||||
if __name__ == "__main__":
    # Command-line interface; the UnicodeData.txt and EastAsianWidth.txt
    # files are optional and only needed for the per-character detail
    # output (-a/-m/-c).
    PARSER = argparse.ArgumentParser(
        description='''
    Compare the contents of LC_CTYPE in two files and check for errors.
    ''')
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        nargs='?',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters whose width was changed in detail.')
    ARGS = PARSER.parse_args()

    # Load the optional Unicode data files first; they enrich the
    # reports produced by check_charmap and check_width.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
|
286
localedata/unicode-gen/utf8_gen.py
Executable file
286
localedata/unicode-gen/utf8_gen.py
Executable file
@ -0,0 +1,286 @@
|
||||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||
# This file is part of the GNU C Library.
|
||||
#
|
||||
# The GNU C Library is free software; you can redistribute it and/or
|
||||
# modify it under the terms of the GNU Lesser General Public
|
||||
# License as published by the Free Software Foundation; either
|
||||
# version 2.1 of the License, or (at your option) any later version.
|
||||
#
|
||||
# The GNU C Library is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
# Lesser General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU Lesser General Public
|
||||
# License along with the GNU C Library; if not, see
|
||||
# <http://www.gnu.org/licenses/>.
|
||||
|
||||
'''glibc/localedata/charmaps/UTF-8 file generator script
|
||||
|
||||
This script generates a glibc/localedata/charmaps/UTF-8 file
|
||||
from Unicode data.
|
||||
|
||||
Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||
|
||||
It will output UTF-8 file
|
||||
'''
|
||||
|
||||
import sys
|
||||
import re
|
||||
|
||||
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
|
||||
# sections 3.11 and 4.4.
|
||||
|
||||
jamo_initial_short_name = [
|
||||
'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
|
||||
'C', 'K', 'T', 'P', 'H'
|
||||
]
|
||||
|
||||
jamo_medial_short_name = [
|
||||
'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
|
||||
'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
|
||||
]
|
||||
|
||||
jamo_final_short_name = [
|
||||
'', 'G', 'GG', 'GS', 'N', 'NI', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
|
||||
'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
|
||||
'P', 'H'
|
||||
]
|
||||
|
||||
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    # Code points beyond the BMP are written with eight hex digits,
    # BMP code points with four.
    if code_point >= 0x10000:
        return '<U{:08X}>'.format(code_point)
    return '<U{:04X}>'.format(code_point)
|
||||
|
||||
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file

    start, end: hexadecimal code point strings (e.g. '3400', '4DB5')
    outfile:    the open output file object
    name:       the character name field from UnicodeData.txt
    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        # 2000-09-24 Bruno Haible <haible@clisp.cons.org>
        # * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        # so they become printable and carry a width. Comment out surrogate
        # ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the syllable into initial/medial/final jamo
            # indices per the Unicode algorithm (0xAC00 is the first
            # Hangul syllable; 21 medials, 28 finals).
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + jamo_initial_short_name[index1] \
                + jamo_medial_short_name[index2] \
                + jamo_final_short_name[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # UnicodeData.txt file has contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        if i > (int(end, 16)-64):
            # Final chunk, which may be shorter than 64 code points.
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        # Full 64-code-point chunk.
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
|
||||
|
||||
def process_charmap(flines, outfile):
    '''Write the CHARMAP part of the UTF-8 file.

    flines is a list containing *all* lines of UnicodeData.txt; the
    generated lines are written to outfile, forming the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    range_start_fields = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name.  Prefer the
        # “Unicode 1.0 Name” (10th field in UnicodeData.txt) for them
        # when it is non-empty.
        #
        # U+0080, U+0081, U+0084 and U+0099 have “<control>” as their
        # name but no “Unicode 1.0 Name” either.  Code could be added
        # to take their alternate names from NameAliases.txt instead.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        char_name = fields[1]
        is_surrogate = 'Surrogate,' in char_name
        # Code point ranges appear as a “…, First>” / “…, Last>” pair
        # of lines, e.g.:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        if char_name.endswith(', First>') and not is_surrogate:
            range_start_fields = fields
            continue
        if char_name.endswith(', Last>') and not is_surrogate:
            process_range(range_start_fields[0], fields[0],
                          outfile, char_name[:-7]+'>')
            range_start_fields = []
            continue
        range_start_fields = []
        if is_surrogate:
            # Comment out the surrogates in the UTF-8 file.  One could
            # of course skip them completely, but the original UTF-8
            # file in glibc had them as comments, so keep these
            # comment lines.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(code_point),
            convert_to_hex(code_point),
            char_name))
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: an integer code point, 0 <= code_point <= 0x10FFFF.
    Returns the string of /xNN byte escapes used in the charmap file.
    '''
    if 0xD800 <= code_point <= 0xDFFF:
        # In Python 3, .encode('UTF-8') raises UnicodeEncodeError for
        # surrogates, but the UTF-8 charmap file contains (commented
        # out) entries for them.  The previous version used a lookup
        # table covering only the six surrogates that happen to appear
        # in UnicodeData.txt; encode the 3-byte UTF-8 pattern directly
        # instead so that *any* surrogate is handled.
        return '/x{:02x}/x{:02x}/x{:02x}'.format(
            0xe0 | (code_point >> 12),
            0x80 | ((code_point >> 6) & 0x3f),
            0x80 | (code_point & 0x3f))
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file.'''
    charmap_header_lines = (
        "<code_set_name> UTF-8\n",
        "<comment_char> %\n",
        "<escape_char> /\n",
        "<mb_cur_min> 1\n",
        "<mb_cur_max> 6\n\n",
        "% CHARMAP generated using utf8_gen.py\n",
        "% alias ISO-10646/UTF-8\n",
        "CHARMAP\n",
    )
    for header_line in charmap_header_lines:
        outfile.write(header_line)
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file.'''
    # Zero-width characters need no rule of their own: they are all in
    # category Cf and therefore already covered below.
    width_header_lines = (
        '% Character width according to Unicode 7.0.0.\n',
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        'generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
        'UnicodeData.txt"\n',
        '% - Format control characters have width 0; '
        'generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        "WIDTH\n",
    )
    for header_line in width_header_lines:
        outfile.write(header_line)
def process_width(outfile, ulines, elines):
    '''Write the WIDTH section of the UTF-8 file.

    ulines: lines from UnicodeData.txt.
    elines: lines from EastAsianWidth.txt (pre-filtered by the caller
            to the wide/fullwidth entries).
    '''
    entries = {}
    # Non-spacing marks (combining class NSM) and format control
    # characters (category Cf) get width 0.
    for unicode_line in ulines:
        fields = unicode_line.split(";")
        if fields[4] == "NSM" or fields[2] == "Cf":
            code_point = int(fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t0'
    # An entry found in EastAsianWidth.txt overrides any entry from
    # UnicodeData.txt:
    for east_asian_line in elines:
        fields = east_asian_line.split(";")
        if '..' in fields[0]:
            # A range entry like “1100..115F;W”: drop every covered
            # single-code-point entry and emit one range entry with
            # width 2 instead.
            first_hex, last_hex = fields[0].split("..")
            first_cp = int(first_hex, 16)
            last_cp = int(last_hex, 16)
            for covered in range(first_cp, last_cp + 1):
                entries.pop(covered, None)
            entries[first_cp] = '{:s}...{:s}\t2'.format(
                ucs_symbol(first_cp), ucs_symbol(last_cp))
        else:
            code_point = int(fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t2'
    # Emit the entries sorted by code point.
    for code_point in sorted(entries):
        outfile.write(entries[code_point] + '\n')
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        # Read UnicodeData.txt completely into memory; it is needed
        # for both the CHARMAP and the WIDTH section.
        with open(sys.argv[1], mode='r') as unidata_file:
            unicode_data_lines = unidata_file.readlines()
        east_asian_width_lines = []
        with open(sys.argv[2], mode='r') as east_asian_width_file:
            for east_asian_line in east_asian_width_file:
                # If characters from EastAsianWidth.txt which are from
                # reserved ranges (i.e. not yet assigned code points)
                # were added to the WIDTH section of the UTF-8 file,
                # “make check” would produce “Unknown Character” errors
                # for these code points, because such unassigned code
                # points are not in the CHARMAP section of the UTF-8
                # file.
                #
                # Therefore, skip all reserved code points while
                # reading the EastAsianWidth.txt file.
                if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*',
                            east_asian_line):
                    continue
                # Keep only the wide (W) and fullwidth (F) entries.
                if re.match(r'^[^;]*;[WF]', east_asian_line):
                    east_asian_width_lines.append(east_asian_line.strip())
        with open('UTF-8', mode='w') as outfile:
            # CHARMAP section, generated from UnicodeData.txt:
            write_header_charmap(outfile)
            process_charmap(unicode_data_lines, outfile)
            outfile.write("END CHARMAP\n\n")
            # WIDTH section, generated from EastAsianWidth.txt
            # (with UnicodeData.txt supplying the width-0 entries):
            write_header_width(outfile)
            process_width(outfile, unicode_data_lines,
                          east_asian_width_lines)
            outfile.write("END WIDTH\n")
|
Loading…
Reference in New Issue
Block a user