mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 12:30:06 +00:00
Unicode 7.0.0 update; added generator scripts.
for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators.
This commit is contained in:
parent
e4a399dc3d
commit
4a4839c94a
11
NEWS
11
NEWS
@ -9,8 +9,15 @@ Version 2.22
|
|||||||
|
|
||||||
* The following bugs are resolved with this release:
|
* The following bugs are resolved with this release:
|
||||||
|
|
||||||
4719, 15319, 15467, 15790, 16560, 17569, 17792, 17912, 17932, 17944,
|
4719, 13064, 14094, 15319, 15467, 15790, 16560, 17569, 17588, 17792,
|
||||||
17949, 17964, 17965, 17967, 17969, 17978, 17987, 17991, 17996, 17999.
|
17912, 17932, 17944, 17949, 17964, 17965, 17967, 17969, 17978, 17987,
|
||||||
|
17991, 17996, 17998, 17999.
|
||||||
|
|
||||||
|
* Character encoding and ctype tables were updated to Unicode 7.0.0, using
|
||||||
|
new generator scripts contributed by Pravin Satpute and Mike FABIAN (Red
|
||||||
|
Hat). These updates cause user visible changes, such as the fix for bug
|
||||||
|
17998.
|
||||||
|
|
||||||
|
|
||||||
Version 2.21
|
Version 2.21
|
||||||
|
|
||||||
|
@ -1,3 +1,30 @@
|
|||||||
|
2015-02-20 Alexandre Oliva <aoliva@redhat.com>
|
||||||
|
|
||||||
|
[BZ #17588]
|
||||||
|
[BZ #13064]
|
||||||
|
[BZ #14094]
|
||||||
|
[BZ #17998]
|
||||||
|
* unicode-gen/Makefile: New.
|
||||||
|
* unicode-gen/unicode-license.txt: New, from Unicode.
|
||||||
|
* unicode-gen/UnicodeData.txt: New, from Unicode.
|
||||||
|
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
|
||||||
|
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
|
||||||
|
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
|
||||||
|
FABIAN <mfabian@redhat.com>.
|
||||||
|
* unicode-gen/ctype_compatibility.py: New verifier, from
|
||||||
|
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
|
||||||
|
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
|
||||||
|
module, from Mike FABIAN.
|
||||||
|
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
|
||||||
|
and Mike FABIAN.
|
||||||
|
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
|
||||||
|
Satpute and Mike FABIAN.
|
||||||
|
* charmaps/UTF-8: Update.
|
||||||
|
* locales/i18n: Update.
|
||||||
|
* gen-unicode-ctype.c: Remove.
|
||||||
|
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
|
||||||
|
true for ordinal indicators.
|
||||||
|
|
||||||
2015-01-21 Marek Polacek <polacek@redhat.com>
|
2015-01-21 Marek Polacek <polacek@redhat.com>
|
||||||
|
|
||||||
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.
|
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,784 +0,0 @@
|
|||||||
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
|
||||||
Copyright (C) 2000-2015 Free Software Foundation, Inc.
|
|
||||||
This file is part of the GNU C Library.
|
|
||||||
Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
|
|
||||||
|
|
||||||
The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
/* Usage example:
|
|
||||||
$ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
/* This structure represents one line in the UnicodeData.txt file. */
|
|
||||||
struct unicode_attribute
|
|
||||||
{
|
|
||||||
const char *name; /* Character name */
|
|
||||||
const char *category; /* General category */
|
|
||||||
const char *combining; /* Canonical combining classes */
|
|
||||||
const char *bidi; /* Bidirectional category */
|
|
||||||
const char *decomposition; /* Character decomposition mapping */
|
|
||||||
const char *decdigit; /* Decimal digit value */
|
|
||||||
const char *digit; /* Digit value */
|
|
||||||
const char *numeric; /* Numeric value */
|
|
||||||
int mirrored; /* mirrored */
|
|
||||||
const char *oldname; /* Old Unicode 1.0 name */
|
|
||||||
const char *comment; /* Comment */
|
|
||||||
unsigned int upper; /* Uppercase mapping */
|
|
||||||
unsigned int lower; /* Lowercase mapping */
|
|
||||||
unsigned int title; /* Titlecase mapping */
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Missing fields are represented with "" for strings, and NONE for
|
|
||||||
characters. */
|
|
||||||
#define NONE (~(unsigned int)0)
|
|
||||||
|
|
||||||
/* The entire contents of the UnicodeData.txt file. */
|
|
||||||
struct unicode_attribute unicode_attributes [0x110000];
|
|
||||||
|
|
||||||
/* Stores in unicode_attributes[i] the values from the given fields. */
|
|
||||||
static void
|
|
||||||
fill_attribute (unsigned int i,
|
|
||||||
const char *field1, const char *field2,
|
|
||||||
const char *field3, const char *field4,
|
|
||||||
const char *field5, const char *field6,
|
|
||||||
const char *field7, const char *field8,
|
|
||||||
const char *field9, const char *field10,
|
|
||||||
const char *field11, const char *field12,
|
|
||||||
const char *field13, const char *field14)
|
|
||||||
{
|
|
||||||
struct unicode_attribute * uni;
|
|
||||||
|
|
||||||
if (i >= 0x110000)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "index too large\n");
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
if (strcmp (field2, "Cs") == 0)
|
|
||||||
/* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
|
|
||||||
return;
|
|
||||||
uni = &unicode_attributes[i];
|
|
||||||
/* Copy the strings. */
|
|
||||||
uni->name = strdup (field1);
|
|
||||||
uni->category = (field2[0] == '\0' ? "" : strdup (field2));
|
|
||||||
uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
|
|
||||||
uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
|
|
||||||
uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
|
|
||||||
uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
|
|
||||||
uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
|
|
||||||
uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
|
|
||||||
uni->mirrored = (field9[0] == 'Y');
|
|
||||||
uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
|
|
||||||
uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
|
|
||||||
uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
|
|
||||||
uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
|
|
||||||
uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Maximum length of a field in the UnicodeData.txt file. */
|
|
||||||
#define FIELDLEN 120
|
|
||||||
|
|
||||||
/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
|
|
||||||
Reads up to (but excluding) DELIM.
|
|
||||||
Returns 1 when a field was successfully read, otherwise 0. */
|
|
||||||
static int
|
|
||||||
getfield (FILE *stream, char *buffer, int delim)
|
|
||||||
{
|
|
||||||
int count = 0;
|
|
||||||
int c;
|
|
||||||
|
|
||||||
for (; (c = getc (stream)), (c != EOF && c != delim); )
|
|
||||||
{
|
|
||||||
/* The original unicode.org UnicodeData.txt file happens to have
|
|
||||||
CR/LF line terminators. Silently convert to LF. */
|
|
||||||
if (c == '\r')
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/* Put c into the buffer. */
|
|
||||||
if (++count >= FIELDLEN - 1)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "field too long\n");
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
*buffer++ = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c == EOF)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
*buffer = '\0';
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
|
|
||||||
file. */
|
|
||||||
static void
|
|
||||||
fill_attributes (const char *unicodedata_filename)
|
|
||||||
{
|
|
||||||
unsigned int i, j;
|
|
||||||
FILE *stream;
|
|
||||||
char field0[FIELDLEN];
|
|
||||||
char field1[FIELDLEN];
|
|
||||||
char field2[FIELDLEN];
|
|
||||||
char field3[FIELDLEN];
|
|
||||||
char field4[FIELDLEN];
|
|
||||||
char field5[FIELDLEN];
|
|
||||||
char field6[FIELDLEN];
|
|
||||||
char field7[FIELDLEN];
|
|
||||||
char field8[FIELDLEN];
|
|
||||||
char field9[FIELDLEN];
|
|
||||||
char field10[FIELDLEN];
|
|
||||||
char field11[FIELDLEN];
|
|
||||||
char field12[FIELDLEN];
|
|
||||||
char field13[FIELDLEN];
|
|
||||||
char field14[FIELDLEN];
|
|
||||||
int lineno = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
unicode_attributes[i].name = NULL;
|
|
||||||
|
|
||||||
stream = fopen (unicodedata_filename, "r");
|
|
||||||
if (stream == NULL)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
int n;
|
|
||||||
|
|
||||||
lineno++;
|
|
||||||
n = getfield (stream, field0, ';');
|
|
||||||
n += getfield (stream, field1, ';');
|
|
||||||
n += getfield (stream, field2, ';');
|
|
||||||
n += getfield (stream, field3, ';');
|
|
||||||
n += getfield (stream, field4, ';');
|
|
||||||
n += getfield (stream, field5, ';');
|
|
||||||
n += getfield (stream, field6, ';');
|
|
||||||
n += getfield (stream, field7, ';');
|
|
||||||
n += getfield (stream, field8, ';');
|
|
||||||
n += getfield (stream, field9, ';');
|
|
||||||
n += getfield (stream, field10, ';');
|
|
||||||
n += getfield (stream, field11, ';');
|
|
||||||
n += getfield (stream, field12, ';');
|
|
||||||
n += getfield (stream, field13, ';');
|
|
||||||
n += getfield (stream, field14, '\n');
|
|
||||||
if (n == 0)
|
|
||||||
break;
|
|
||||||
if (n != 15)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "short line in'%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
i = strtoul (field0, NULL, 16);
|
|
||||||
if (field1[0] == '<'
|
|
||||||
&& strlen (field1) >= 9
|
|
||||||
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
|
|
||||||
{
|
|
||||||
/* Deal with a range. */
|
|
||||||
lineno++;
|
|
||||||
n = getfield (stream, field0, ';');
|
|
||||||
n += getfield (stream, field1, ';');
|
|
||||||
n += getfield (stream, field2, ';');
|
|
||||||
n += getfield (stream, field3, ';');
|
|
||||||
n += getfield (stream, field4, ';');
|
|
||||||
n += getfield (stream, field5, ';');
|
|
||||||
n += getfield (stream, field6, ';');
|
|
||||||
n += getfield (stream, field7, ';');
|
|
||||||
n += getfield (stream, field8, ';');
|
|
||||||
n += getfield (stream, field9, ';');
|
|
||||||
n += getfield (stream, field10, ';');
|
|
||||||
n += getfield (stream, field11, ';');
|
|
||||||
n += getfield (stream, field12, ';');
|
|
||||||
n += getfield (stream, field13, ';');
|
|
||||||
n += getfield (stream, field14, '\n');
|
|
||||||
if (n != 15)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
if (!(field1[0] == '<'
|
|
||||||
&& strlen (field1) >= 8
|
|
||||||
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
field1[strlen (field1) - 7] = '\0';
|
|
||||||
j = strtoul (field0, NULL, 16);
|
|
||||||
for (; i <= j; i++)
|
|
||||||
fill_attribute (i, field1+1, field2, field3, field4, field5,
|
|
||||||
field6, field7, field8, field9, field10,
|
|
||||||
field11, field12, field13, field14);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Single character line */
|
|
||||||
fill_attribute (i, field1, field2, field3, field4, field5,
|
|
||||||
field6, field7, field8, field9, field10,
|
|
||||||
field11, field12, field13, field14);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ferror (stream) || fclose (stream))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Character mappings. */
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_upper (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].upper != NONE)
|
|
||||||
return unicode_attributes[ch].upper;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_lower (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].lower != NONE)
|
|
||||||
return unicode_attributes[ch].lower;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_title (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].title != NONE)
|
|
||||||
return unicode_attributes[ch].title;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Character class properties. */
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_upper (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (to_lower (ch) != ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_lower (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (to_upper (ch) != ch)
|
|
||||||
/* <U00DF> is lowercase, but without simple to_upper mapping. */
|
|
||||||
|| (ch == 0x00DF);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_alpha (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& ((unicode_attributes[ch].category[0] == 'L'
|
|
||||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
|
||||||
<U0E2F>, <U0E46> should belong to is_punct. */
|
|
||||||
&& (ch != 0x0E2F) && (ch != 0x0E46))
|
|
||||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
|
||||||
<U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
|
|
||||||
|| (ch == 0x0E31)
|
|
||||||
|| (ch >= 0x0E34 && ch <= 0x0E3A)
|
|
||||||
|| (ch >= 0x0E47 && ch <= 0x0E4E)
|
|
||||||
/* Avoid warning for <U0345>. */
|
|
||||||
|| (ch == 0x0345)
|
|
||||||
/* Avoid warnings for <U2160>..<U217F>. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'l')
|
|
||||||
/* Avoid warnings for <U24B6>..<U24E9>. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'S'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'o'
|
|
||||||
&& strstr (unicode_attributes[ch].name, " LETTER ")
|
|
||||||
!= NULL)
|
|
||||||
/* Consider all the non-ASCII digits as alphabetic.
|
|
||||||
ISO C 99 forbids us to have them in category "digit",
|
|
||||||
but we want iswalnum to return true on them. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'd'
|
|
||||||
&& !(ch >= 0x0030 && ch <= 0x0039))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_digit (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'd');
|
|
||||||
/* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
|
|
||||||
a zero. Must add <0> in front of them by hand. */
|
|
||||||
#else
|
|
||||||
/* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
|
|
||||||
takes it away:
|
|
||||||
7.25.2.1.5:
|
|
||||||
The iswdigit function tests for any wide character that corresponds
|
|
||||||
to a decimal-digit character (as defined in 5.2.1).
|
|
||||||
5.2.1:
|
|
||||||
the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
|
|
||||||
*/
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_outdigit (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_blank (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (ch == 0x0009 /* '\t' */
|
|
||||||
/* Category Zs without mention of "<noBreak>" */
|
|
||||||
|| (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& unicode_attributes[ch].category[1] == 's'
|
|
||||||
&& !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_space (unsigned int ch)
|
|
||||||
{
|
|
||||||
/* Don't make U+00A0 a space. Non-breaking space means that all programs
|
|
||||||
should treat it like a punctuation character, not like a space. */
|
|
||||||
return (ch == 0x0020 /* ' ' */
|
|
||||||
|| ch == 0x000C /* '\f' */
|
|
||||||
|| ch == 0x000A /* '\n' */
|
|
||||||
|| ch == 0x000D /* '\r' */
|
|
||||||
|| ch == 0x0009 /* '\t' */
|
|
||||||
|| ch == 0x000B /* '\v' */
|
|
||||||
/* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
|
|
||||||
|| (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p'
|
|
||||||
|| (unicode_attributes[ch].category[1] == 's'
|
|
||||||
&& !strstr (unicode_attributes[ch].decomposition,
|
|
||||||
"<noBreak>")))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_cntrl (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& (!strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
/* Categories Zl and Zp */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p'))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_xdigit (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return is_digit (ch)
|
|
||||||
|| (ch >= 0x0041 && ch <= 0x0046)
|
|
||||||
|| (ch >= 0x0061 && ch <= 0x0066);
|
|
||||||
#else
|
|
||||||
/* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
|
|
||||||
takes it away:
|
|
||||||
7.25.2.1.12:
|
|
||||||
The iswxdigit function tests for any wide character that corresponds
|
|
||||||
to a hexadecimal-digit character (as defined in 6.4.4.1).
|
|
||||||
6.4.4.1:
|
|
||||||
hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
|
|
||||||
*/
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039)
|
|
||||||
|| (ch >= 0x0041 && ch <= 0x0046)
|
|
||||||
|| (ch >= 0x0061 && ch <= 0x0066);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_graph (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
&& !is_space (ch));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_print (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
/* Categories Zl and Zp */
|
|
||||||
&& !(unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p')));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_punct (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'P');
|
|
||||||
#else
|
|
||||||
/* The traditional POSIX definition of punctuation is every graphic,
|
|
||||||
non-alphanumeric character. */
|
|
||||||
return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_combining (unsigned int ch)
|
|
||||||
{
|
|
||||||
/* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
|
||||||
file. In 3.0.1 it was identical to the union of the general categories
|
|
||||||
"Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
|
||||||
PropList.txt file, so we take the latter definition. */
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'M'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'n'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'c'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'e'));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_combining_level3 (unsigned int ch)
|
|
||||||
{
|
|
||||||
return is_combining (ch)
|
|
||||||
&& !(unicode_attributes[ch].combining[0] != '\0'
|
|
||||||
&& unicode_attributes[ch].combining[0] != '0'
|
|
||||||
&& strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the UCS symbol string for a Unicode character. */
|
|
||||||
static const char *
|
|
||||||
ucs_symbol (unsigned int i)
|
|
||||||
{
|
|
||||||
static char buf[11+1];
|
|
||||||
|
|
||||||
sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the UCS symbol range string for a Unicode characters interval. */
|
|
||||||
static const char *
|
|
||||||
ucs_symbol_range (unsigned int low, unsigned int high)
|
|
||||||
{
|
|
||||||
static char buf[24+1];
|
|
||||||
|
|
||||||
strcpy (buf, ucs_symbol (low));
|
|
||||||
strcat (buf, "..");
|
|
||||||
strcat (buf, ucs_symbol (high));
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output a character class (= property) table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_charclass (FILE *stream, const char *classname,
|
|
||||||
bool (*func) (unsigned int))
|
|
||||||
{
|
|
||||||
char table[0x110000];
|
|
||||||
unsigned int i;
|
|
||||||
bool need_semicolon;
|
|
||||||
const int max_column = 75;
|
|
||||||
int column;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
table[i] = (int) func (i);
|
|
||||||
|
|
||||||
fprintf (stream, "%s ", classname);
|
|
||||||
need_semicolon = false;
|
|
||||||
column = 1000;
|
|
||||||
for (i = 0; i < 0x110000; )
|
|
||||||
{
|
|
||||||
if (!table[i])
|
|
||||||
i++;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
unsigned int low, high;
|
|
||||||
char buf[25];
|
|
||||||
|
|
||||||
low = i;
|
|
||||||
do
|
|
||||||
i++;
|
|
||||||
while (i < 0x110000 && table[i]);
|
|
||||||
high = i - 1;
|
|
||||||
|
|
||||||
if (low == high)
|
|
||||||
strcpy (buf, ucs_symbol (low));
|
|
||||||
else
|
|
||||||
strcpy (buf, ucs_symbol_range (low, high));
|
|
||||||
|
|
||||||
if (need_semicolon)
|
|
||||||
{
|
|
||||||
fprintf (stream, ";");
|
|
||||||
column++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (column + strlen (buf) > max_column)
|
|
||||||
{
|
|
||||||
fprintf (stream, "/\n ");
|
|
||||||
column = 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "%s", buf);
|
|
||||||
column += strlen (buf);
|
|
||||||
need_semicolon = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output a character mapping table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_charmap (FILE *stream, const char *mapname,
|
|
||||||
unsigned int (*func) (unsigned int))
|
|
||||||
{
|
|
||||||
char table[0x110000];
|
|
||||||
unsigned int i;
|
|
||||||
bool need_semicolon;
|
|
||||||
const int max_column = 75;
|
|
||||||
int column;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
table[i] = (func (i) != i);
|
|
||||||
|
|
||||||
fprintf (stream, "%s ", mapname);
|
|
||||||
need_semicolon = false;
|
|
||||||
column = 1000;
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
if (table[i])
|
|
||||||
{
|
|
||||||
char buf[25+1];
|
|
||||||
|
|
||||||
strcpy (buf, "(");
|
|
||||||
strcat (buf, ucs_symbol (i));
|
|
||||||
strcat (buf, ",");
|
|
||||||
strcat (buf, ucs_symbol (func (i)));
|
|
||||||
strcat (buf, ")");
|
|
||||||
|
|
||||||
if (need_semicolon)
|
|
||||||
{
|
|
||||||
fprintf (stream, ";");
|
|
||||||
column++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (column + strlen (buf) > max_column)
|
|
||||||
{
|
|
||||||
fprintf (stream, "/\n ");
|
|
||||||
column = 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "%s", buf);
|
|
||||||
column += strlen (buf);
|
|
||||||
need_semicolon = true;
|
|
||||||
}
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output the width table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_widthmap (FILE *stream)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output the tables to the given file. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_tables (const char *filename, const char *version)
|
|
||||||
{
|
|
||||||
FILE *stream;
|
|
||||||
unsigned int ch;
|
|
||||||
|
|
||||||
stream = fopen (filename, "w");
|
|
||||||
if (stream == NULL)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "cannot open '%s' for writing\n", filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "escape_char /\n");
|
|
||||||
fprintf (stream, "comment_char %%\n");
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
|
|
||||||
version);
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
|
|
||||||
fprintf (stream, "LC_IDENTIFICATION\n");
|
|
||||||
fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
|
|
||||||
fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
|
|
||||||
fprintf (stream, "address \"\"\n");
|
|
||||||
fprintf (stream, "contact \"\"\n");
|
|
||||||
fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
|
|
||||||
fprintf (stream, "tel \"\"\n");
|
|
||||||
fprintf (stream, "fax \"\"\n");
|
|
||||||
fprintf (stream, "language \"\"\n");
|
|
||||||
fprintf (stream, "territory \"Earth\"\n");
|
|
||||||
fprintf (stream, "revision \"%s\"\n", version);
|
|
||||||
{
|
|
||||||
time_t now;
|
|
||||||
char date[11];
|
|
||||||
now = time (NULL);
|
|
||||||
strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
|
|
||||||
fprintf (stream, "date \"%s\"\n", date);
|
|
||||||
}
|
|
||||||
fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
|
|
||||||
fprintf (stream, "END LC_IDENTIFICATION\n");
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
|
|
||||||
/* Verifications. */
|
|
||||||
for (ch = 0; ch < 0x110000; ch++)
|
|
||||||
{
|
|
||||||
/* toupper restriction: "Only characters specified for the keywords
|
|
||||||
lower and upper shall be specified. */
|
|
||||||
if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
|
|
||||||
ucs_symbol (ch), ch, to_upper (ch));
|
|
||||||
|
|
||||||
/* tolower restriction: "Only characters specified for the keywords
|
|
||||||
lower and upper shall be specified. */
|
|
||||||
if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
|
|
||||||
ucs_symbol (ch), ch, to_lower (ch));
|
|
||||||
|
|
||||||
/* alpha restriction: "Characters classified as either upper or lower
|
|
||||||
shall automatically belong to this class. */
|
|
||||||
if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
|
|
||||||
fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* alpha restriction: "No character specified for the keywords cntrl,
|
|
||||||
digit, punct or space shall be specified." */
|
|
||||||
if (is_alpha (ch) && is_cntrl (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_punct (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_space (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* space restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, graph or xdigit shall be specified."
|
|
||||||
upper, lower, alpha already checked above. */
|
|
||||||
if (is_space (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_space (ch) && is_graph (ch))
|
|
||||||
fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
|
|
||||||
if (is_space (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* cntrl restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, punct, graph, print or xdigit shall be
|
|
||||||
specified." upper, lower, alpha already checked above. */
|
|
||||||
if (is_cntrl (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_punct (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_graph (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_print (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* punct restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
|
||||||
be specified." upper, lower, alpha, cntrl already checked above. */
|
|
||||||
if (is_punct (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_punct (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
|
|
||||||
if (is_punct (ch) && (ch == 0x0020))
|
|
||||||
fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* graph restriction: "No character specified for the keyword cntrl
|
|
||||||
shall be specified." Already checked above. */
|
|
||||||
|
|
||||||
/* print restriction: "No character specified for the keyword cntrl
|
|
||||||
shall be specified." Already checked above. */
|
|
||||||
|
|
||||||
/* graph - print relation: differ only in the <space> character.
|
|
||||||
How is this possible if there are more than one space character?!
|
|
||||||
I think susv2/xbd/locale.html should speak of "space characters",
|
|
||||||
not "space character". */
|
|
||||||
if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is print but not graph|<space>\n", ucs_symbol (ch));
|
|
||||||
if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is graph|<space> but not print\n", ucs_symbol (ch));
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "LC_CTYPE\n");
|
|
||||||
output_charclass (stream, "upper", is_upper);
|
|
||||||
output_charclass (stream, "lower", is_lower);
|
|
||||||
output_charclass (stream, "alpha", is_alpha);
|
|
||||||
output_charclass (stream, "digit", is_digit);
|
|
||||||
output_charclass (stream, "outdigit", is_outdigit);
|
|
||||||
output_charclass (stream, "blank", is_blank);
|
|
||||||
output_charclass (stream, "space", is_space);
|
|
||||||
output_charclass (stream, "cntrl", is_cntrl);
|
|
||||||
output_charclass (stream, "punct", is_punct);
|
|
||||||
output_charclass (stream, "xdigit", is_xdigit);
|
|
||||||
output_charclass (stream, "graph", is_graph);
|
|
||||||
output_charclass (stream, "print", is_print);
|
|
||||||
output_charclass (stream, "class \"combining\";", is_combining);
|
|
||||||
output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
|
|
||||||
output_charmap (stream, "toupper", to_upper);
|
|
||||||
output_charmap (stream, "tolower", to_lower);
|
|
||||||
output_charmap (stream, "map \"totitle\";", to_title);
|
|
||||||
output_widthmap (stream);
|
|
||||||
fprintf (stream, "END LC_CTYPE\n");
|
|
||||||
|
|
||||||
if (ferror (stream) || fclose (stream))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error writing to '%s'\n", filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
main (int argc, char * argv[])
|
|
||||||
{
|
|
||||||
if (argc != 3)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
fill_attributes (argv[1]);
|
|
||||||
|
|
||||||
output_tables ("unicode", argv[2]);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
|||||||
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||||
000000000000000000000100000000000000000000000000
|
000000000010000000000100001000000000000000000000
|
||||||
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
|
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
|
||||||
000000000000000111111111111111111111111011111111
|
000000000000000111111111111111111111111011111111
|
||||||
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||||
|
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
File diff suppressed because it is too large
Load Diff
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
File diff suppressed because it is too large
Load Diff
99
localedata/unicode-gen/Makefile
Normal file
99
localedata/unicode-gen/Makefile
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
# Copyright (C) 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
# Makefile for generating and updating Unicode-extracted files.
|
||||||
|
|
||||||
|
|
||||||
|
# This Makefile is NOT used as part of the GNU libc build. It needs
|
||||||
|
# to be run manually, within the source tree, at Unicode upgrades
|
||||||
|
# (change UNICODE_VERSION below), to update ../locales/i18n ctype
|
||||||
|
# information (part of the file is preserved, so don't wipe it all
|
||||||
|
# out), and ../charmaps/UTF-8.
|
||||||
|
|
||||||
|
# Use make all to generate the files used in the glibc build out of
|
||||||
|
# the original Unicode files; make check to verify that they are what
|
||||||
|
# we expect; make install to copy them to the location expected by the
|
||||||
|
# glibc build; and make clean to remove all generated files.
|
||||||
|
|
||||||
|
# We keep a local copy of the downloaded Unicode files, to avoid
|
||||||
|
# running afoul of the LGPL corresponding sources requirements, even
|
||||||
|
# though it's not clear that they are preferred over the generated
|
||||||
|
# files for making modifications.
|
||||||
|
|
||||||
|
|
||||||
|
UNICODE_VERSION = 7.0.0
|
||||||
|
|
||||||
|
PYTHON3 = python3
|
||||||
|
WGET = wget
|
||||||
|
|
||||||
|
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
|
||||||
|
GENERATED = i18n UTF-8
|
||||||
|
REPORTS = i18n-report UTF-8-report
|
||||||
|
|
||||||
|
all: $(GENERATED)
|
||||||
|
|
||||||
|
check: check-i18n check-UTF-8
|
||||||
|
|
||||||
|
install:
|
||||||
|
cp -p i18n ../locales/i18n
|
||||||
|
cp -p UTF-8 ../charmaps/UTF-8
|
||||||
|
|
||||||
|
clean: mostlyclean
|
||||||
|
-rm -rf __pycache__
|
||||||
|
mostlyclean:
|
||||||
|
-rm -f $(REPORTS) $(GENERATED)
|
||||||
|
|
||||||
|
.PHONY: all check clean mostlyclean install
|
||||||
|
|
||||||
|
i18n: UnicodeData.txt DerivedCoreProperties.txt
|
||||||
|
i18n: ../locales/i18n # Preserve non-ctype information.
|
||||||
|
i18n: gen_unicode_ctype.py
|
||||||
|
$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
|
||||||
|
-d DerivedCoreProperties.txt -i ../locales/i18n -o $@ \
|
||||||
|
--unicode_version $(UNICODE_VERSION)
|
||||||
|
|
||||||
|
i18n-report: i18n ../locales/i18n
|
||||||
|
i18n-report: ctype_compatibility.py ctype_compatibility_test_cases.py
|
||||||
|
$(PYTHON3) ./ctype_compatibility.py -o ../locales/i18n \
|
||||||
|
-n i18n -a -m > $@
|
||||||
|
|
||||||
|
check-i18n: i18n-report
|
||||||
|
@if grep '\(Missing\|Added\) [^0]\|^Number of errors[^=]* = [^0]' \
|
||||||
|
i18n-report; \
|
||||||
|
then echo manual verification required; false; else true; fi
|
||||||
|
|
||||||
|
UTF-8: UnicodeData.txt EastAsianWidth.txt
|
||||||
|
UTF-8: utf8_gen.py
|
||||||
|
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||||
|
|
||||||
|
UTF-8-report: UTF-8 ../charmaps/UTF-8
|
||||||
|
UTF-8-report: utf8_compatibility.py
|
||||||
|
$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
|
||||||
|
-n UTF-8 -a -m > $@
|
||||||
|
|
||||||
|
check-UTF-8: UTF-8-report
|
||||||
|
@if grep '^Total.*: [^0]' UTF-8-report; \
|
||||||
|
then echo manual verification required; false; else true; fi
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: downloads clean-downloads
|
||||||
|
downloads: $(DOWNLOADS)
|
||||||
|
clean-downloads:
|
||||||
|
-rm -f $(DOWNLOADS)
|
||||||
|
|
||||||
|
$(DOWNLOADS):
|
||||||
|
$(WGET) http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$@
|
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
@ -0,0 +1,546 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script is useful for checking the differences between
|
||||||
|
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
|
||||||
|
new one generated by gen_unicode_ctype.py
|
||||||
|
|
||||||
|
To see how it is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./ctype_compatibility.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from ctype_compatibility_test_cases import TEST_CASES
|
||||||
|
|
||||||
|
def get_lines_from_file(filename):
|
||||||
|
'''Get all non-comment lines from a i18n file
|
||||||
|
|
||||||
|
Also merge all lines which are continued on the next line because
|
||||||
|
they end in “/” into a single line.
|
||||||
|
'''
|
||||||
|
with open(filename) as i18n_file:
|
||||||
|
current_line = ''
|
||||||
|
for line in i18n_file:
|
||||||
|
line = line.strip('\n')
|
||||||
|
if '%' in line:
|
||||||
|
if line.endswith('/'):
|
||||||
|
line = line[0:line.find('%')] + '/'
|
||||||
|
else:
|
||||||
|
line = line[0:line.find('%')]
|
||||||
|
line = line.strip()
|
||||||
|
if line.endswith('/'):
|
||||||
|
current_line += line[:-1]
|
||||||
|
else:
|
||||||
|
yield current_line + line
|
||||||
|
current_line = ''
|
||||||
|
if current_line: # file ends with a continuation line
|
||||||
|
yield current_line
|
||||||
|
|
||||||
|
def extract_character_classes(filename):
|
||||||
|
'''Get all Unicode code points for each character class from a file
|
||||||
|
|
||||||
|
Store these code points in a dictionary using the character classes
|
||||||
|
as keys and the list of code points in this character class as values.
|
||||||
|
|
||||||
|
In case of the character classes “toupper”, “tolower”, and “totitle”,
|
||||||
|
these area actually pairs of code points
|
||||||
|
'''
|
||||||
|
ctype_dict = {}
|
||||||
|
for line in get_lines_from_file(filename):
|
||||||
|
for char_class in [
|
||||||
|
'upper',
|
||||||
|
'lower',
|
||||||
|
'alpha',
|
||||||
|
'digit',
|
||||||
|
'outdigit',
|
||||||
|
'space',
|
||||||
|
'cntrl',
|
||||||
|
'punct',
|
||||||
|
'graph',
|
||||||
|
'print',
|
||||||
|
'xdigit',
|
||||||
|
'blank',
|
||||||
|
'combining',
|
||||||
|
'combining_level3',
|
||||||
|
'toupper',
|
||||||
|
'tolower',
|
||||||
|
'totitle']:
|
||||||
|
match = re.match(r'^('
|
||||||
|
+'(?:(?:class|map)\s+")'
|
||||||
|
+re.escape(char_class)+
|
||||||
|
'(?:";)\s+'
|
||||||
|
+'|'
|
||||||
|
+re.escape(char_class)+'\s+'
|
||||||
|
+')', line)
|
||||||
|
if match:
|
||||||
|
if char_class not in ctype_dict:
|
||||||
|
ctype_dict[char_class] = []
|
||||||
|
process_chars(
|
||||||
|
ctype_dict[char_class],
|
||||||
|
line[match.end():])
|
||||||
|
return ctype_dict
|
||||||
|
|
||||||
|
def process_chars(char_class_list, code_point_line):
|
||||||
|
'''
|
||||||
|
Extract Unicode values from code_point_line
|
||||||
|
and add to the list of code points in a character class
|
||||||
|
'''
|
||||||
|
for code_points in code_point_line.split(';'):
|
||||||
|
code_points = code_points.strip()
|
||||||
|
match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
|
||||||
|
if match: # <Uxxxx>
|
||||||
|
char_class_list.append(
|
||||||
|
int(match.group('codepoint'), 16))
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+'\.\.'+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||||
|
code_points)
|
||||||
|
if match: # <Uxxxx>..<Uxxxx>
|
||||||
|
for codepoint in range(
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16) + 1):
|
||||||
|
char_class_list.append(codepoint)
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+'\.\.\(2\)\.\.'+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||||
|
code_points)
|
||||||
|
if match: # <Uxxxx>..(2)..<Uxxxx>
|
||||||
|
for codepoint in range(
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16) + 1,
|
||||||
|
2):
|
||||||
|
char_class_list.append(codepoint)
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^\('
|
||||||
|
+'<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+','+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>'
|
||||||
|
+'\)$',
|
||||||
|
code_points)
|
||||||
|
if match: # (<Uxxxx>,<Uxxxx>)
|
||||||
|
char_class_list.append((
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16)))
|
||||||
|
continue
|
||||||
|
sys.stderr.write(
|
||||||
|
('None of the regexps matched '
|
||||||
|
+ 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
|
||||||
|
'cp': code_points,
|
||||||
|
'cpl': code_point_line
|
||||||
|
})
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def compare_lists(old_ctype_dict, new_ctype_dict):
|
||||||
|
'''Compare character classes in the old and the new LC_CTYPE'''
|
||||||
|
print('****************************************************')
|
||||||
|
print('Character classes which are only in the new '
|
||||||
|
+ 'or only in the old file:')
|
||||||
|
for char_class in sorted(old_ctype_dict):
|
||||||
|
if char_class not in new_ctype_dict:
|
||||||
|
print('Character class %s is in old ctype but not in new ctype'
|
||||||
|
%char_class)
|
||||||
|
for char_class in sorted(new_ctype_dict):
|
||||||
|
if char_class not in old_ctype_dict:
|
||||||
|
print('Character class %s is in new ctype but not in old ctype'
|
||||||
|
%char_class)
|
||||||
|
for char_class in sorted(old_ctype_dict):
|
||||||
|
print("****************************************************")
|
||||||
|
print("%s: %d chars in old ctype and %d chars in new ctype" %(
|
||||||
|
char_class,
|
||||||
|
len(old_ctype_dict[char_class]),
|
||||||
|
len(new_ctype_dict[char_class])))
|
||||||
|
print("----------------------------------------------------")
|
||||||
|
report(char_class,
|
||||||
|
old_ctype_dict[char_class],
|
||||||
|
new_ctype_dict[char_class])
|
||||||
|
|
||||||
|
def report_code_points(char_class, code_point_list, text=''):
|
||||||
|
'''Report all code points which have been added to or removed from a
|
||||||
|
character class.
|
||||||
|
'''
|
||||||
|
for code_point in sorted(code_point_list):
|
||||||
|
if type(code_point) == type(int()):
|
||||||
|
print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
|
||||||
|
%{'text': text,
|
||||||
|
'char': chr(code_point),
|
||||||
|
'char_class': char_class,
|
||||||
|
'code_point': hex(code_point),
|
||||||
|
'name': unicodedata.name(chr(code_point), 'name unknown')})
|
||||||
|
else:
|
||||||
|
print(('%(char_class)s: %(text)s: '
|
||||||
|
+ '%(char0)s → %(char1)s '
|
||||||
|
+ '%(code_point0)s → %(code_point1)s '
|
||||||
|
+ '%(name0)s → %(name1)s') %{
|
||||||
|
'text': text,
|
||||||
|
'char_class': char_class,
|
||||||
|
'char0': chr(code_point[0]),
|
||||||
|
'code_point0': hex(code_point[0]),
|
||||||
|
'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
|
||||||
|
'char1': chr(code_point[1]),
|
||||||
|
'code_point1': hex(code_point[1]),
|
||||||
|
'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
|
||||||
|
})
|
||||||
|
|
||||||
|
def report(char_class, old_list, new_list):
|
||||||
|
'''Report the differences for a certain LC_CTYPE character class
|
||||||
|
between the old and the newly generated state
|
||||||
|
'''
|
||||||
|
missing_chars = list(set(old_list)-set(new_list))
|
||||||
|
print(('%(char_class)s: Missing %(number)d characters '
|
||||||
|
+ 'of old ctype in new ctype ')
|
||||||
|
%{'char_class': char_class, 'number': len(missing_chars)})
|
||||||
|
if ARGS.show_missing_characters:
|
||||||
|
report_code_points(char_class, missing_chars, 'Missing')
|
||||||
|
added_chars = list(set(new_list)-set(old_list))
|
||||||
|
print(('%(char_class)s: Added %(number)d characters '
|
||||||
|
+ 'in new ctype which were not in old ctype')
|
||||||
|
%{'char_class': char_class, 'number': len(added_chars)})
|
||||||
|
if ARGS.show_added_characters:
|
||||||
|
report_code_points(char_class, added_chars, 'Added')
|
||||||
|
|
||||||
|
|
||||||
|
def cperror(error_message, errorcounter=0):
|
||||||
|
'''Increase number of errors by one and print an error message'''
|
||||||
|
print(error_message)
|
||||||
|
return errorcounter + 1
|
||||||
|
|
||||||
|
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
|
||||||
|
errorcounter=0):
|
||||||
|
'''The parameter “code_point_list_with_ranges” is a list of
|
||||||
|
integers or pairs of integers, for example:
|
||||||
|
|
||||||
|
[0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]
|
||||||
|
|
||||||
|
where the pairs of integers stand for all the code points in the range
|
||||||
|
of the two integers given, including the two integers of the pair.
|
||||||
|
|
||||||
|
'''
|
||||||
|
for code_point_range in code_point_list_with_ranges:
|
||||||
|
for code_point in ([code_point_range]
|
||||||
|
if type(code_point_range) == type(int())
|
||||||
|
else range(code_point_range[0],
|
||||||
|
code_point_range[1]+1)):
|
||||||
|
for char_class_tuple in char_classes:
|
||||||
|
char_class = char_class_tuple[0]
|
||||||
|
in_char_class = char_class_tuple[1]
|
||||||
|
if (code_point in ctype_dict[char_class]) != in_char_class:
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(code_point)s %(char)s '
|
||||||
|
+ '%(char_class)s %(in)s: %(reason)s') %{
|
||||||
|
'code_point': hex(code_point),
|
||||||
|
'char': chr(code_point),
|
||||||
|
'char_class': char_class,
|
||||||
|
'in': not in_char_class,
|
||||||
|
'reason': reason},
|
||||||
|
errorcounter)
|
||||||
|
return errorcounter
|
||||||
|
|
||||||
|
def tests(ctype_dict, errorcounter = 0):
|
||||||
|
'''Test a LC_CTYPE character class dictionary for known errors'''
|
||||||
|
# copy the information from ctype_dict (which contains lists) in
|
||||||
|
# a new dictionary ctype_dict2 (which contains dictionaries).
|
||||||
|
# The checks below are easier with that type of data structure.
|
||||||
|
|
||||||
|
ctype_dict2 = {}
|
||||||
|
for key in ctype_dict:
|
||||||
|
ctype_dict2[key] = {}
|
||||||
|
if ctype_dict[key]:
|
||||||
|
if type(ctype_dict[key][0]) == type(int()):
|
||||||
|
for value in ctype_dict[key]:
|
||||||
|
ctype_dict2[key][value] = 1
|
||||||
|
else: # key is 'toupper', 'tolower', or 'totitle'
|
||||||
|
for value in ctype_dict[key]:
|
||||||
|
ctype_dict2[key][value[0]] = value[1]
|
||||||
|
|
||||||
|
for test_case in TEST_CASES:
|
||||||
|
errorcounter = cpcheck(ctype_dict2,
|
||||||
|
test_case[0],
|
||||||
|
test_case[1],
|
||||||
|
test_case[2],
|
||||||
|
errorcounter = errorcounter)
|
||||||
|
|
||||||
|
for code_point in range(0, 0x110000):
|
||||||
|
# toupper restriction: "Only characters specified for the keywords
|
||||||
|
# lower and upper shall be specified.
|
||||||
|
if (code_point in ctype_dict2['toupper']
|
||||||
|
and code_point != ctype_dict2['toupper'][code_point]
|
||||||
|
and not (code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(char1)s is not upper|lower '
|
||||||
|
+ 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||||
|
'char1': chr(code_point),
|
||||||
|
'cp1': hex(code_point),
|
||||||
|
'cp2': hex(ctype_dict2['toupper'][code_point]),
|
||||||
|
'char2': chr(ctype_dict2['toupper'][code_point])
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# tolower restriction: "Only characters specified for the keywords
|
||||||
|
# lower and upper shall be specified.
|
||||||
|
if (code_point in ctype_dict2['tolower']
|
||||||
|
and code_point != ctype_dict2['tolower'][code_point]
|
||||||
|
and not (code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(char1)s is not upper|lower '
|
||||||
|
+ 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||||
|
'char1': chr(code_point),
|
||||||
|
'cp1': hex(code_point),
|
||||||
|
'cp2': hex(ctype_dict2['tolower'][code_point]),
|
||||||
|
'char2': chr(ctype_dict2['tolower'][code_point])
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# alpha restriction: "Characters classified as either upper or lower
|
||||||
|
# shall automatically belong to this class.
|
||||||
|
if ((code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])
|
||||||
|
and code_point not in ctype_dict2['alpha']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is upper|lower but not alpha' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# alpha restriction: "No character specified for the keywords cntrl,
|
||||||
|
# digit, punct or space shall be specified."
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['cntrl']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and cntrl' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['punct']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and punct' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['space']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and space' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# space restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, graph or xdigit shall be specified."
|
||||||
|
# upper, lower, alpha already checked above.
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['graph']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and graph' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# cntrl restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, punct, graph, print or xdigit shall be
|
||||||
|
# specified." upper, lower, alpha already checked above.
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['punct']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and punct' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['graph']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and graph' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['print']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and print' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# punct restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||||||
|
# be specified." upper, lower, alpha, cntrl already checked above.
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point == 0x0020):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct.' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# graph restriction: "No character specified for the keyword cntrl
|
||||||
|
# shall be specified." Already checked above.
|
||||||
|
|
||||||
|
# print restriction: "No character specified for the keyword cntrl
|
||||||
|
# shall be specified." Already checked above.
|
||||||
|
|
||||||
|
# graph - print relation: differ only in the <space> character.
|
||||||
|
# How is this possible if there are more than one space character?!
|
||||||
|
# I think susv2/xbd/locale.html should speak of "space characters",
|
||||||
|
# not "space character".
|
||||||
|
if (code_point in ctype_dict2['print']
|
||||||
|
and not (code_point in ctype_dict2['graph']
|
||||||
|
or code_point in ctype_dict2['space'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is print but not graph|space' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point not in ctype_dict2['print']
|
||||||
|
and (code_point in ctype_dict2['graph']
|
||||||
|
or code_point == 0x0020)):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s graph|space but not print' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
return errorcounter
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
PARSER = argparse.ArgumentParser(
|
||||||
|
description='''
|
||||||
|
Compare the contents of LC_CTYPE in two files and check for errors.
|
||||||
|
''')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-o', '--old_ctype_file',
|
||||||
|
nargs='?',
|
||||||
|
type=str,
|
||||||
|
default='i18n',
|
||||||
|
help='The old ctype file, default: %(default)s')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-n', '--new_ctype_file',
|
||||||
|
nargs='?',
|
||||||
|
type=str,
|
||||||
|
default='unicode-ctype',
|
||||||
|
help='The new ctype file, default: %(default)s')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-a', '--show_added_characters',
|
||||||
|
action='store_true',
|
||||||
|
help=('Show characters which were added to each '
|
||||||
|
+ 'character class in detail.'))
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-m', '--show_missing_characters',
|
||||||
|
action='store_true',
|
||||||
|
help=('Show characters which were removed from each '
|
||||||
|
+ 'character class in detail.'))
|
||||||
|
ARGS = PARSER.parse_args()
|
||||||
|
|
||||||
|
OLD_CTYPE_DICT = extract_character_classes(
|
||||||
|
ARGS.old_ctype_file)
|
||||||
|
NEW_CTYPE_DICT = extract_character_classes(
|
||||||
|
ARGS.new_ctype_file)
|
||||||
|
compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
|
||||||
|
print('============================================================')
|
||||||
|
print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('Old file = %s' %ARGS.old_ctype_file)
|
||||||
|
print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('============================================================')
|
||||||
|
print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('New file = %s' %ARGS.new_ctype_file)
|
||||||
|
print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
exit(0)
|
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
@ -0,0 +1,951 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This file contains a list of test cases used by
|
||||||
|
the ctype_compatibility.py script.
|
||||||
|
'''
|
||||||
|
|
||||||
|
TEST_CASES = [
|
||||||
|
[[0x0E2F, 0x0E46], [('alpha', True), ('punct', False)],
|
||||||
|
'''Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
||||||
|
<U0E2F>, <U0E46> should belong to punct. DerivedCoreProperties.txt
|
||||||
|
says it is alpha. We trust DerivedCoreProperties.txt.'''
|
||||||
|
],
|
||||||
|
[[0x0E31, (0x0E34, 0x0E3A)], [('alpha', True)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E31>, <U0E34>..<U0E3A>
|
||||||
|
are alpha. DerivedCoreProperties.txt agrees.'''
|
||||||
|
],
|
||||||
|
[[(0x0E47, 0x0E4C), 0x0E4E], [('alpha', False)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||||
|
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||||
|
in that range is alphabetic, the others are *not*. We
|
||||||
|
trust DerivedCoreProperties.txt.'''
|
||||||
|
],
|
||||||
|
[[0x0E4D], [('alpha', True)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||||
|
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||||
|
in that range is alphabetic, the others are *not*. We
|
||||||
|
trust DerivedCoreProperties.txt.
|
||||||
|
'''
|
||||||
|
],
|
||||||
|
[[0x0345], [('alpha', True), ('lower', True)],
|
||||||
|
'''COMBINING GREEK YPOGEGRAMMENI
|
||||||
|
According to DerivedCoreProperties.txt, this is “Alphabetic”
|
||||||
|
and “Lowercase”.'''
|
||||||
|
],
|
||||||
|
[[(0x2160, 0x2188)], [('alpha', True)],
|
||||||
|
'''Roman Numerals are “Alphabetic” according to
|
||||||
|
DerivedCoreProperties.txt'''
|
||||||
|
],
|
||||||
|
[[(0x24B6, 0x24E9)], [('alpha', True)],
|
||||||
|
'''Circled Latin letters are “Alphabetic” according to
|
||||||
|
DerivedCoreProperties.txt'''
|
||||||
|
],
|
||||||
|
[[0x661], [('alpha', True), ('digit', False)],
|
||||||
|
'''gen-unicode-ctype.c: All non-ASCII digits should be alphabetic.
|
||||||
|
ISO C 99 forbids us to have them in category "digit", but we
|
||||||
|
want iswalnum to return true on them. Don’t forget to
|
||||||
|
have a look at all the other digits, 0x661 is just one
|
||||||
|
example tested here.'''
|
||||||
|
],
|
||||||
|
[[(0x0030, 0x0039)], [('digit', True)],
|
||||||
|
'''gen-unicode-ctype.c: All ASCII digits should be digits.'''
|
||||||
|
],
|
||||||
|
[[0x0009], [('blank', True)],
|
||||||
|
'''gen-unicode-ctype.c: CHARACTER TABULATION'''
|
||||||
|
],
|
||||||
|
[[0x2007], [('blank', False), ('space', False)],
|
||||||
|
'''gen-unicode-ctype.c: FIGURE SPACE, because it has <noBreak>
|
||||||
|
in the description.'''
|
||||||
|
],
|
||||||
|
[[0x0009, 0x000A, 0x000B, 0x000C, 0x000D], [('space', True)],
|
||||||
|
'''gen-unicode-ctype.c: CHARACTER TABULATION, LINE FEED (LF), LINE
|
||||||
|
TABULATION, ;FORM FEED (FF), CARRIAGE RETURN (CR)'''
|
||||||
|
],
|
||||||
|
[[0x2028, 0x2029], [('cntrl', True)],
|
||||||
|
'''gen-unicode-ctype.c: LINE SEPARATOR and PARAGRAPH SEPARATOR
|
||||||
|
should be cntrl.'''
|
||||||
|
],
|
||||||
|
[[(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)],
|
||||||
|
[('xdigit', True)],
|
||||||
|
'''gen-unicode-ctype.c: ISO C 99 says (6.4.4.1): hexadecimal-digit:
|
||||||
|
one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F (nothing else
|
||||||
|
should be considered as a hexadecimal-digit)'''
|
||||||
|
],
|
||||||
|
[[0x0330], [('combining', True), ('combining_level3', False)],
|
||||||
|
'''gen-unicode-ctype.c: COMBINING TILDE BELOW, canonical combining
|
||||||
|
class value >= 200, should be in combining but not in
|
||||||
|
combining_level3'''
|
||||||
|
],
|
||||||
|
[[0x0250, 0x0251, 0x0271], [('lower', True)],
|
||||||
|
'''Should be lower in Unicode 7.0.0 (was not lower in
|
||||||
|
Unicode 5.0.0).
|
||||||
|
'''
|
||||||
|
],
|
||||||
|
[[0x2184], [('lower', True)],
|
||||||
|
'''Should be lower both in Unicode 5.0.0 and 7.0.0'''
|
||||||
|
],
|
||||||
|
[[0xA67F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0xa67f CYRILLIC PAYEROK. Not in Unicode 5.0.0. In Unicode
|
||||||
|
7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0xA60C], [('punct', False), ('alpha', True)],
|
||||||
|
'''0xa60c VAI SYLLABLE LENGTHENER. Not in Unicode 5.0.0.
|
||||||
|
In Unicode 7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x2E2F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x2E2F VERTICAL TILDE. Not in Unicode 5.0.0. In Unicode
|
||||||
|
7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[(0x1090, 0x1099)], [('punct', False), ('alpha', True)],
|
||||||
|
'''MYANMAR SHAN DIGIT ZERO - MYANMAR SHAN DIGIT NINE.
|
||||||
|
These are digits, but because ISO C 99 forbids to
|
||||||
|
put them into digit they should go into alpha.'''
|
||||||
|
],
|
||||||
|
[[0x103F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x103F MYANMAR LETTER GREAT SA. Not in Unicode 5.0.0.
|
||||||
|
In Unicode 7.0.0. General category Lo
|
||||||
|
(Other_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by
|
||||||
|
mistake in glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x0374], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x0374 GREEK NUMERAL SIGN. Unicode 5.0.0: general category
|
||||||
|
Sk. Unicode 7.0.0: General category Lm
|
||||||
|
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x02EC], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x02EC MODIFIER LETTER VOICING. Unicode 5.0.0: general category
|
||||||
|
Sk. Unicode 7.0.0: General category Lm
|
||||||
|
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x180E], [('space', False), ('blank', False)],
|
||||||
|
'''0x180e MONGOLIAN VOWEL SEPARATOR. Unicode 5.0.0: General
|
||||||
|
category Zs (Space_Separator) Unicode 7.0.0: General category Cf
|
||||||
|
(Format).'''
|
||||||
|
],
|
||||||
|
[[0x1E9C, 0x1E9D, 0x1E9F],
|
||||||
|
[('lower', True), ('upper', False), ('tolower', False),
|
||||||
|
('toupper', False), ('totitle', False)],
|
||||||
|
'''ẜ 0x1e9c LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE,
|
||||||
|
ẝ 0x1e9d LATIN SMALL LETTER LONG S WITH HIGH STROKE,
|
||||||
|
ẟ 0x1e9f LATIN SMALL LETTER DELTA. These are “Lowercase”
|
||||||
|
according to DerivedCoreProperties.txt but no upper case versions
|
||||||
|
exist.'''
|
||||||
|
],
|
||||||
|
[[0x1E9E],
|
||||||
|
[('lower', False), ('upper', True), ('tolower', True),
|
||||||
|
('toupper', False), ('totitle', False)],
|
||||||
|
'''0x1E9E ẞ LATIN CAPITAL LETTER SHARP S This is “Uppercase”
|
||||||
|
according to DerivedCoreProperties.txt and the lower case
|
||||||
|
version is 0x00DF ß LATIN SMALL LETTER SHARP S.'''
|
||||||
|
],
|
||||||
|
[[0x2188],
|
||||||
|
[('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''0x2188 ROMAN NUMERAL ONE HUNDRED THOUSAND. This is “Alphabetic”
|
||||||
|
according to DerivedCoreProperties.txt. In glibc’s old
|
||||||
|
LC_CTYPE, it was in “lower”, which seems to be a
|
||||||
|
mistake. It is not “Lowercase” in
|
||||||
|
DerivedCoreProperties.txt and does not have case mappings
|
||||||
|
in UnicodeData.txt either.'''
|
||||||
|
],
|
||||||
|
[[0x2C71, 0x2C74, (0x2C77, 0x2C7A)],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small letters which were not in Unicode 5.0.0
|
||||||
|
but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0xA730, 0xA731],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small “capital” letters which were not in
|
||||||
|
Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[(0xA771, 0xA778)],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small (or small “capital”) letters which
|
||||||
|
were not in Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x0375],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''“0375;GREEK LOWER NUMERAL SIGN;Sk;0;ON;;;;;N;;;;;”. Has
|
||||||
|
apparently been added manually to glibc’s old LC_CTYPE as
|
||||||
|
“combining_level3”. That seems wrong, it is no combining
|
||||||
|
character because it does not have one of the general
|
||||||
|
categories Mn, Mc, or Me. According to
|
||||||
|
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x108D],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''“108D;MYANMAR SIGN SHAN COUNCIL EMPHATIC
|
||||||
|
TONE;Mn;220;NSM;;;;;N;;;;;”. Has apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE as
|
||||||
|
“combining_level3”. That seems wrong, although it is a
|
||||||
|
combining character because it has the general category
|
||||||
|
Mn, it is not “combining_level3” because the canonical
|
||||||
|
combining class value is 220 which is >= 200. According to
|
||||||
|
gen-unicode-ctype.c, “combining_level3” needs a
|
||||||
|
canonical combining class value < 200. According to
|
||||||
|
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x06DE],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
''' UnicodeData.txt 5.0.0: “06DE;ARABIC START OF RUB EL
|
||||||
|
HIZB;Me;0;NSM;;;;;N;;;;;”; UnicodeData.txt 7.0.0:
|
||||||
|
“06DE;ARABIC START OF RUB EL
|
||||||
|
HIZB;So;0;ON;;;;;N;;;;;”. I.e. this used to be a
|
||||||
|
combining character in Unicode 5.0.0 but not anymore in
|
||||||
|
7.0.0. According to DerivedCoreProperties.txt it is not
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BD0],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||||
|
“0BD0;TAMIL OM;Lo;0;L;;;;;N;;;;;”. Apparently manually added to
|
||||||
|
“combining” and “combining_level3” in glibc’s old
|
||||||
|
LC_CTYPE. That seems wrong. According to
|
||||||
|
DerivedCoreProperties.txt it is “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x103F],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||||
|
“103F;MYANMAR LETTER GREAT SA;Lo;0;L;;;;;N;;;;;”.
|
||||||
|
Apparently manually added to “combining” and
|
||||||
|
“combining_level3” in glibc’s old LC_CTYPE. That seems
|
||||||
|
wrong. According to DerivedCoreProperties.txt it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0901, 0x0903)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These have general category “Mn” i.e. these are combining
|
||||||
|
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||||
|
“0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”,
|
||||||
|
”0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”,
|
||||||
|
“0903;DEVANAGARI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x093C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''UnicodeData.txt (5.0.0 and 7.0.0): “093C;DEVANAGARI SIGN
|
||||||
|
NUKTA;Mn;7;NSM;;;;;N;;;;;” According to
|
||||||
|
DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”. glibc’s old LC_TYPE has this in “alpha”.'''
|
||||||
|
],
|
||||||
|
[[(0x093E, 0x093F)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These have general category “Mc” i.e. these are combining
|
||||||
|
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||||
|
“093E;DEVANAGARI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“093F;DEVANAGARI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0940, 0x094C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are all combining
|
||||||
|
characters (“Mc” or “Mn” both in UnicodeData.txt 5.0.0 and 7.0.0).
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x094D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
“094D;DEVANAGARI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0951, 0x0954)],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0962, 0x0963), (0x0981, 0x0983)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09BC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09BC;BENGALI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x09BE, 0x09BF), (0x09C0, 0x09C4), (0x09C7, 0x09C8),
|
||||||
|
(0x09CB, 0x09CC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09BE;BENGALI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09BF;BENGALI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C0;BENGALI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C1;BENGALI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C2;BENGALI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C3;BENGALI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C4;BENGALI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C7;BENGALI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C8;BENGALI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09CB;BENGALI VOWEL SIGN O;Mc;0;L;09C7 09BE;;;;N;;;;;”
|
||||||
|
“09CC;BENGALI VOWEL SIGN AU;Mc;0;L;09C7 09D7;;;;N;;;;;”
|
||||||
|
Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09CD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09CD;BENGALI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09D7, (0x09E2, 0x09E3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09F2, 0x09F3],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09F2;BENGALI RUPEE MARK;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
“09F3;BENGALI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x09F4, 0x09FA)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09F4;BENGALI CURRENCY NUMERATOR ONE;No;0;L;;;;1/16;N;;;;;”
|
||||||
|
“09F5;BENGALI CURRENCY NUMERATOR TWO;No;0;L;;;;1/8;N;;;;;”
|
||||||
|
“09F6;BENGALI CURRENCY NUMERATOR THREE;No;0;L;;;;3/16;N;;;;;”
|
||||||
|
“09F7;BENGALI CURRENCY NUMERATOR FOUR;No;0;L;;;;1/4;N;;;;;”
|
||||||
|
“09F8;BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR;
|
||||||
|
No;0;L;;;;3/4;N;;;;;”
|
||||||
|
“09F9;BENGALI CURRENCY DENOMINATOR SIXTEEN;No;0;L;;;;16;N;;;;;”
|
||||||
|
“09FA;BENGALI ISSHAR;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0A01, 0x0A03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A01;GURMUKHI SIGN ADAK BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A02;GURMUKHI SIGN BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A03;GURMUKHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A3C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A3C;GURMUKHI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0A3E, 0x0A40), (0x0A41, 0x0A42), (0x0A47, 0x0A48),
|
||||||
|
(0x0A4B, 0x0A4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A3E;GURMUKHI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A3F;GURMUKHI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A40;GURMUKHI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A41;GURMUKHI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A42;GURMUKHI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A47;GURMUKHI VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A48;GURMUKHI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A4B;GURMUKHI VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A4C;GURMUKHI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A51, (0x0A70, 0x0A71), 0x0A75, (0x0A81, 0x0A83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
“0A70;GURMUKHI TIPPI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A71;GURMUKHI ADDAK;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A75;GURMUKHI SIGN YAKASH;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A81;GUJARATI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A82;GUJARATI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A83;GUJARATI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0ABC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ABC;GUJARATI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ABE;GUJARATI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ABF;GUJARATI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0AC0;GUJARATI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0AC1;GUJARATI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC2;GUJARATI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC3;GUJARATI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC4;GUJARATI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC5;GUJARATI VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC7;GUJARATI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC8;GUJARATI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC9;GUJARATI VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ACB;GUJARATI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ACC;GUJARATI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0ACD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ACD;GUJARATI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0AE2, 0x0AE3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0AE2;GUJARATI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AE3;GUJARATI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0AF1],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0AF1;GUJARATI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B01, 0x0B03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B01;ORIYA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B02;ORIYA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B03;ORIYA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B3C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B3C;ORIYA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B3E, 0x0B44), (0x0B47, 0x0B48), (0x0B4B, 0x0B4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B3E;ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B3F;ORIYA VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B40;ORIYA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B41;ORIYA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B42;ORIYA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B43;ORIYA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B44;ORIYA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B47;ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B48;ORIYA VOWEL SIGN AI;Mc;0;L;0B47 0B56;;;;N;;;;;”
|
||||||
|
“0B4B;ORIYA VOWEL SIGN O;Mc;0;L;0B47 0B3E;;;;N;;;;;”
|
||||||
|
“0B4C;ORIYA VOWEL SIGN AU;Mc;0;L;0B47 0B57;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B4D;ORIYA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B56, 0x0B57), (0x0B62, 0x0B63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B56;ORIYA AI LENGTH MARK;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B57;ORIYA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B62;ORIYA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B63;ORIYA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B70],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B70;ORIYA ISSHAR;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B82],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B82;TAMIL SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BBE;TAMIL VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BBF;TAMIL VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC0;TAMIL VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0BC1;TAMIL VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC2;TAMIL VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC6;TAMIL VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC7;TAMIL VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC8;TAMIL VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BCA;TAMIL VOWEL SIGN O;Mc;0;L;0BC6 0BBE;;;;N;;;;;”
|
||||||
|
“0BCB;TAMIL VOWEL SIGN OO;Mc;0;L;0BC7 0BBE;;;;N;;;;;”
|
||||||
|
“0BCC;TAMIL VOWEL SIGN AU;Mc;0;L;0BC6 0BD7;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BCD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BCD;TAMIL SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BD7],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BD7;TAMIL AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0BF0, 0x0BFA)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BF0;TAMIL NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||||
|
“0BF1;TAMIL NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||||
|
“0BF2;TAMIL NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||||
|
“0BF3;TAMIL DAY SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF4;TAMIL MONTH SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF5;TAMIL YEAR SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF6;TAMIL DEBIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF7;TAMIL CREDIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
“0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C01, 0x0C03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C3E;TELUGU VOWEL SIGN AA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C3F;TELUGU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C40;TELUGU VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C41;TELUGU VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C42;TELUGU VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C43;TELUGU VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C44;TELUGU VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C46;TELUGU VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C47;TELUGU VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C48;TELUGU VOWEL SIGN AI;Mn;0;NSM;0C46 0C56;;;;N;;;;;”
|
||||||
|
“0C4A;TELUGU VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C4B;TELUGU VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C4C;TELUGU VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0C4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C4D;TELUGU SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C55, 0x0C56), (0x0C62, 0x0C63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C55;TELUGU LENGTH MARK;Mn;84;NSM;;;;;N;;;;;”
|
||||||
|
“0C56;TELUGU AI LENGTH MARK;Mn;91;NSM;;;;;N;;;;;”
|
||||||
|
“0C62;TELUGU VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C63;TELUGU VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C78, 0x0C7F)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C78;TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;0;N;;;;;”
|
||||||
|
“0C79;TELUGU FRACTION DIGIT ONE FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;1;N;;;;;”
|
||||||
|
“0C7A;TELUGU FRACTION DIGIT TWO FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;2;N;;;;;”
|
||||||
|
“0C7B;TELUGU FRACTION DIGIT THREE FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;3;N;;;;;”
|
||||||
|
“0C7C;TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;1;N;;;;;”
|
||||||
|
“0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;2;N;;;;;”
|
||||||
|
“0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;3;N;;;;;”
|
||||||
|
“0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C82, 0x0C83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0CBC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CBC;KANNADA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CBE;KANNADA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CBF;KANNADA VOWEL SIGN I;Mn;0;L;;;;;N;;;;;”
|
||||||
|
“0CC0;KANNADA VOWEL SIGN II;Mc;0;L;0CBF 0CD5;;;;N;;;;;”
|
||||||
|
“0CC1;KANNADA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC2;KANNADA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC3;KANNADA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC4;KANNADA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC6;KANNADA VOWEL SIGN E;Mn;0;L;;;;;N;;;;;”
|
||||||
|
“0CC7;KANNADA VOWEL SIGN EE;Mc;0;L;0CC6 0CD5;;;;N;;;;;”
|
||||||
|
“0CC8;KANNADA VOWEL SIGN AI;Mc;0;L;0CC6 0CD6;;;;N;;;;;”
|
||||||
|
“0CCA;KANNADA VOWEL SIGN O;Mc;0;L;0CC6 0CC2;;;;N;;;;;”
|
||||||
|
“0CCB;KANNADA VOWEL SIGN OO;Mc;0;L;0CCA 0CD5;;;;N;;;;;”
|
||||||
|
“0CCC;KANNADA VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0CCD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0CD5, 0x0CD6), (0x0CE2, 0x0CE3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||||
|
0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||||
|
0CE2;KANNADA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
|
||||||
|
0CE3;KANNADA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D02, 0x0D03), (0x0D3E, 0x0D44), (0x0D46, 0x0D48),
|
||||||
|
(0x0D4A, 0x0D4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D3E;MALAYALAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D3F;MALAYALAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D40;MALAYALAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D41;MALAYALAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D42;MALAYALAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D43;MALAYALAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D44;MALAYALAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D46;MALAYALAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D47;MALAYALAM VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D48;MALAYALAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D4A;MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;”
|
||||||
|
“0D4B;MALAYALAM VOWEL SIGN OO;Mc;0;L;0D47 0D3E;;;;N;;;;;”
|
||||||
|
“0D4C;MALAYALAM VOWEL SIGN AU;Mc;0;L;0D46 0D57;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0D4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D4D;MALAYALAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0D57, (0x0D62, 0x0D63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D57;MALAYALAM AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D62;MALAYALAM VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D63;MALAYALAM VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D70, 0x0D79)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D70;MALAYALAM NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||||
|
“0D71;MALAYALAM NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||||
|
“0D72;MALAYALAM NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||||
|
“0D73;MALAYALAM FRACTION ONE QUARTER;No;0;L;;;;1/4;N;;;;;”
|
||||||
|
“0D74;MALAYALAM FRACTION ONE HALF;No;0;L;;;;1/2;N;;;;;”
|
||||||
|
“0D75;MALAYALAM FRACTION THREE QUARTERS;No;0;L;;;;3/4;N;;;;;”
|
||||||
|
“0D79;MALAYALAM DATE MARK;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D82, 0x0D83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D82;SINHALA SIGN ANUSVARAYA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D83;SINHALA SIGN VISARGAYA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0DCA],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DCA;SINHALA SIGN AL-LAKUNA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0DCF, 0x0DD4), 0x0DD6, (0x0DD8, 0x0DDF), (0x0DF2, 0x0DF3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DCF;SINHALA VOWEL SIGN AELA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD0;SINHALA VOWEL SIGN KETTI AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD1;SINHALA VOWEL SIGN DIGA AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD2;SINHALA VOWEL SIGN KETTI IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD3;SINHALA VOWEL SIGN DIGA IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD4;SINHALA VOWEL SIGN KETTI PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD6;SINHALA VOWEL SIGN DIGA PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD8;SINHALA VOWEL SIGN GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD9;SINHALA VOWEL SIGN KOMBUVA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DDA;SINHALA VOWEL SIGN DIGA KOMBUVA;Mc;0;L;0DD9 0DCA;;;;N;;;;;”
|
||||||
|
“0DDB;SINHALA VOWEL SIGN KOMBU DEKA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DDC;SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA;
|
||||||
|
Mc;0;L;0DD9 0DCF;;;;N;;;;;”
|
||||||
|
“0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;
|
||||||
|
Mc;0;L;0DDC 0DCA;;;;N;;;;;”
|
||||||
|
“0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;
|
||||||
|
Mc;0;L;0DD9 0DDF;;;;N;;;;;”
|
||||||
|
“0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0DF4],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA789, 0xA78A)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A789;MODIFIER LETTER COLON;Sk;0;L;;;;;N;;;;;”
|
||||||
|
“A78A;MODIFIER LETTER SHORT EQUALS SIGN;Sk;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA926, 0xA92A)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A926;KAYAH LI VOWEL UE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A927;KAYAH LI VOWEL E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A928;KAYAH LI VOWEL U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A929;KAYAH LI VOWEL EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A92A;KAYAH LI VOWEL O;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA92B, 0xA92D)],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A92B;KAYAH LI TONE PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
“A92C;KAYAH LI TONE CALYA;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
“A92D;KAYAH LI TONE CALYA PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
]
|
||||||
|
]
|
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
@ -0,0 +1,751 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
#
|
||||||
|
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
|
||||||
|
DerivedCoreProperties.txt files.
|
||||||
|
|
||||||
|
To see how this script is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./gen_unicode_ctype.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Dictionary holding the entire contents of the UnicodeData.txt file
# (filled in by fill_attributes()).
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file (filled in by fill_derived_core_properties()).
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
|
||||||
|
|
||||||
|
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    def hex_or_none(text):
        # Case mapping fields are either a hexadecimal code point or empty.
        return int(text, 16) if text else None

    attribute = dict(zip(
        ('name',           # Character name
         'category',       # General category
         'combining',      # Canonical combining classes
         'bidi',           # Bidirectional category
         'decomposition',  # Character decomposition mapping
         'decdigit',       # Decimal digit value
         'digit',          # Digit value
         'numeric',        # Numeric value
         'mirrored',       # mirrored
         'oldname',        # Old Unicode 1.0 name
         'comment'),       # comment
        fields[1:12]))
    attribute['upper'] = hex_or_none(fields[12])  # Uppercase mapping
    attribute['lower'] = hex_or_none(fields[13])  # Lowercase mapping
    attribute['title'] = hex_or_none(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = attribute
|
||||||
|
|
||||||
|
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            # Every UnicodeData.txt line has exactly 15
            # semicolon-separated fields.
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # First line of a code point range.  Strip the “<” and
                # the “, First>” part from the name, e.g.
                # “<CJK Ideograph, First>” becomes “CJK Ideograph”.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                # Last line of a code point range.  Strip the “<” and
                # the “, Last>” part from the name as above.
                fields[1] = fields[1].split(',')[0][1:]
                # Everything except the code point itself must match the
                # first line of the range.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Give every code point in the range the same attributes.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            # Ordinary line for a single code point.
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
|
||||||
|
|
||||||
|
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                # Comment lines and blank lines carry no properties.
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                # A single code point is a range of length one.
                end = start
            # The property name is the same for the whole range, so look
            # it up once instead of once per code point in the range.
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(code_point, []).append(prop)
|
||||||
|
|
||||||
|
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple uppercase
    # mapping map to themselves.
    if entry['name'] and entry['upper']:
        return entry['upper']
    return code_point
|
||||||
|
|
||||||
|
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple lowercase
    # mapping map to themselves.
    if entry['name'] and entry['lower']:
        return entry['lower']
    return code_point
|
||||||
|
|
||||||
|
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple titlecase
    # mapping map to themselves.
    if entry['name'] and entry['title']:
        return entry['title']
    return code_point
|
||||||
|
|
||||||
|
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    # A code point is uppercase if it has a simple lowercase mapping …
    if to_lower(code_point) != code_point:
        return True
    # … or if DerivedCoreProperties.txt lists it as “Uppercase”.
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
|
||||||
|
|
||||||
|
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
|
||||||
|
|
||||||
|
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, []):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
|
||||||
|
|
||||||
|
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    # An alternative definition would have been “any code point of
    # Unicode category Nd”.
    # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero.  Must add <0> in front of them by hand.
    #
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    return 0x0030 <= code_point <= 0x0039
|
||||||
|
|
||||||
|
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    # Only the ASCII digits 0-9 are outdigits.
    return code_point in range(0x0030, 0x003A)
|
||||||
|
|
||||||
|
def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    if code_point == 0x0009:  # '\t'
        return True
    # Otherwise: category Zs without mention of '<noBreak>'
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and attrs['category'] == 'Zs'
            and '<noBreak>' not in attrs['decomposition'])
|
||||||
|
|
||||||
|
def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    #        ' '     '\f'    '\n'    '\r'    '\t'    '\v'
    if code_point in (0x0020, 0x000C, 0x000A, 0x000D, 0x0009, 0x000B):
        return True
    # Otherwise: categories Zl, Zp, and Zs without mention of "<noBreak>"
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and
            (attrs['category'] in ['Zl', 'Zp']
             or
             (attrs['category'] in ['Zs']
              and
              '<noBreak>' not in attrs['decomposition'])))
|
||||||
|
|
||||||
|
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    attrs = UNICODE_ATTRIBUTES[code_point]
    name = attrs['name']
    # Control characters proper plus the line and paragraph separators.
    return (name
            and (name == '<control>'
                 or attrs['category'] in ['Zl', 'Zp']))
|
||||||
|
|
||||||
|
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039      # 0-9
            or 0x0041 <= code_point <= 0x0046   # A-F
            or 0x0061 <= code_point <= 0x0066)  # a-f
|
||||||
|
|
||||||
|
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    # Named, not the <control> placeholder, and not a space.
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
|
||||||
|
|
||||||
|
def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    attrs = UNICODE_ATTRIBUTES[code_point]
    name = attrs['name']
    # Named, not the <control> placeholder, and not a line or
    # paragraph separator.
    return (name
            and name != '<control>'
            and attrs['category'] not in ['Zl', 'Zp'])
|
||||||
|
|
||||||
|
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    # An alternative definition would have been “any named code point of
    # a Unicode category starting with 'P'”, but the traditional POSIX
    # definition of punctuation is every graphic, non-alphanumeric
    # character.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
|
||||||
|
|
||||||
|
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and attrs['category'] in ('Mn', 'Mc', 'Me'))
|
||||||
|
|
||||||
|
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    # Combining characters whose canonical combining class is below 200.
    return (is_combining(code_point)
            and
            0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
|
||||||
|
|
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    # Code points in the BMP use 4 hex digits, all others use 8.
    width = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, width)
|
||||||
|
|
||||||
|
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return '..'.join((ucs_symbol(code_point_low),
                      ucs_symbol(code_point_high)))
|
||||||
|
|
||||||
|
def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    ranges = []
    for code_point in sorted(UNICODE_ATTRIBUTES):
        if not is_class_function(code_point):
            continue
        if ranges and ranges[-1][-1] == code_point - 1:
            # Contiguous with the last range: extend it.
            if len(ranges[-1]) == 1:
                ranges[-1].append(code_point)
            else:
                ranges[-1][-1] = code_point
        else:
            # Start a new range (a single element until extended).
            ranges.append([code_point])
    return ranges
|
||||||
|
|
||||||
|
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>

    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        # Wrap continuation lines before they exceed this column.
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # Separate entries with “;” unless the line is still empty.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = ucs_symbol(code_point_range[0])
            else:
                range_string = ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            # Flush the current line with a “/” continuation marker
            # before it would overflow.
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
|
||||||
|
|
||||||
|
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)

    '''
    # Wrap continuation lines before they exceed this column.
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points which actually map to a different code
        # point appear in the output.
        if code_point != mapped:
            # Separate entries with “;” unless the line is still empty.
            if line.strip():
                line += ';'
            map_string = '(' \
                         + ucs_symbol(code_point) \
                         + ',' \
                         + ucs_symbol(mapped) \
                         + ')'
            # Flush the current line with a “/” continuation marker
            # before it would overflow.
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
|
||||||
|
|
||||||
|
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Walks over every code point in UNICODE_ATTRIBUTES and writes a
    diagnostic line to stderr for each violation of the POSIX/ISO 14652
    LC_CTYPE class restrictions.  Returns nothing; it never aborts, it
    only reports.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if ((is_lower(code_point) or is_upper(code_point))
            and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
|
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp.  All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns a (head, tail) tuple of strings: head is everything up to
    and including the “LC_CTYPE” line (with the date stamp refreshed),
    tail is everything from the “translit_start” line to the end.
    '''
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_parts = []
    tail_parts = []
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = date_pattern.match(line)
            if match:
                # Refresh the date stamp to today's date.
                line = (match.group('key')
                        + '"{:s}"\n'.format(time.strftime('%Y-%m-%d')))
            head_parts.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Skip the generated character-class data.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail_parts.append(line)
                break
        # Everything after translit_start is kept verbatim.
        for line in i18n_file:
            tail_parts.append(line)
    return (''.join(head_parts), ''.join(tail_parts))
||||||
|
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file: writable file object for the output.
    unicode_version: Unicode version string inserted into the comments
        and the LC_IDENTIFICATION section.
    head: original header text extracted by read_input_file(); used
        verbatim when an input file was given on the command line,
        otherwise a fresh LC_IDENTIFICATION section is generated.
    '''
    if ARGS.input_file and head:
        # Preserve the original file's header (date stamp already
        # refreshed by read_input_file).
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
||||||
|
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    If an original i18n file was given on the command line and a tail
    was extracted from it, that tail is copied verbatim; otherwise only
    the LC_CTYPE section terminator is written.
    '''
    if not (ARGS.input_file and tail):
        i18n_file.write('END LC_CTYPE\n')
        return
    i18n_file.write(tail)
||||||
|
def output_tables(i18n_file, unicode_version):
    '''Write the new LC_CTYPE character classes to the output file

    Emits the comment banner, then each character class and mapping
    table via output_charclass()/output_charmap().
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # outdigit is deliberately left commented out, see the comment
    # written to the output just above.
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', is_space)
    output_charclass(i18n_file, 'cntrl', is_cntrl)
    output_charclass(i18n_file, 'punct', is_punct)
    output_charclass(i18n_file, 'graph', is_graph)
    output_charclass(i18n_file, 'print', is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', is_xdigit)
    output_charclass(i18n_file, 'blank', is_blank)
    output_charmap(i18n_file, 'toupper', to_upper)
    output_charmap(i18n_file, 'tolower', to_lower)
    output_charmap(i18n_file, 'map "totitle";', to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";', is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file,
                     'class "combining_level3";', is_combining_level3)
||||||
|
if __name__ == "__main__":
    # Command line interface: all input file names have defaults except
    # --unicode_version, which must be given explicitly.
    PARSER = argparse.ArgumentParser(
        description='''
    Generate a Unicode conforming LC_CTYPE category from
    UnicodeData.txt and DerivedCoreProperties.txt files.
    ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data, then sanity-check the derived is_* class
    # functions before writing anything.
    fill_attributes(ARGS.unicode_data_file)
    fill_derived_core_properties(ARGS.derived_core_properties_file)
    verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        # Keep everything from the original file except the LC_CTYPE
        # class data and the date stamp.
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version)
        output_tail(I18N_FILE, tail=TAIL)
|
50
localedata/unicode-gen/unicode-license.txt
Normal file
50
localedata/unicode-gen/unicode-license.txt
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||||
|
|
||||||
|
Unicode Data Files include all data files under the directories
|
||||||
|
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
|
||||||
|
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
|
||||||
|
online code charts under the directory http://www.unicode.org/Public/.
|
||||||
|
Software includes any source code published in the Unicode Standard or under
|
||||||
|
the directories http://www.unicode.org/Public/,
|
||||||
|
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.
|
||||||
|
|
||||||
|
NOTICE TO USER: Carefully read the following legal agreement. BY
|
||||||
|
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
|
||||||
|
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
|
||||||
|
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
|
||||||
|
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
|
||||||
|
FILES OR SOFTWARE.
|
||||||
|
|
||||||
|
COPYRIGHT AND PERMISSION NOTICE
|
||||||
|
|
||||||
|
Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under
|
||||||
|
the Terms of Use in http://www.unicode.org/copyright.html.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of the Unicode data files and any associated documentation (the "Data
|
||||||
|
Files") or Unicode software and any associated documentation (the "Software")
|
||||||
|
to deal in the Data Files or Software without restriction, including without
|
||||||
|
limitation the rights to use, copy, modify, merge, publish, distribute, and/or
|
||||||
|
sell copies of the Data Files or Software, and to permit persons to whom the
|
||||||
|
Data Files or Software are furnished to do so, provided that (a) the above
|
||||||
|
copyright notice(s) and this permission notice appear with all copies of the
|
||||||
|
Data Files or Software, (b) both the above copyright notice(s) and this
|
||||||
|
permission notice appear in associated documentation, and (c) there is clear
|
||||||
|
notice in each modified Data File or in the Software as well as in the
|
||||||
|
documentation associated with the Data File(s) or Software that the data or
|
||||||
|
software has been modified.
|
||||||
|
|
||||||
|
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||||
|
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
|
||||||
|
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
|
||||||
|
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||||
|
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
||||||
|
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||||
|
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
|
||||||
|
DATA FILES OR SOFTWARE.
|
||||||
|
|
||||||
|
Except as contained in this notice, the name of a copyright holder shall
|
||||||
|
not be used in advertising or otherwise to promote the sale, use or other
|
||||||
|
dealings in these Data Files or Software without prior written authorization
|
||||||
|
of the copyright holder.
|
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
@ -0,0 +1,399 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script is useful for checking backward compatibility of newly
|
||||||
|
generated UTF-8 file from utf8_gen.py script
|
||||||
|
|
||||||
|
To see how this script is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./utf8_compatibility.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
#
# Keys are code point integers; filled by fill_attributes() below.
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
#
# Keys are code point integers, values are East_Asian_Width property
# strings; filled by fill_east_asian_widths() below.
EAST_ASIAN_WIDTHS = {}
|
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.  The three case-mapping fields hold
    hexadecimal code points and are converted to integers; an empty
    field becomes None.
    '''
    def case_mapping(field):
        # Empty case-mapping fields mean “no mapping for this case”.
        return int(field, 16) if field else None

    attributes = {}
    attributes['name'] = fields[1]                  # Character name
    attributes['category'] = fields[2]              # General category
    attributes['combining'] = fields[3]             # Canonical combining classes
    attributes['bidi'] = fields[4]                  # Bidirectional category
    attributes['decomposition'] = fields[5]         # Character decomposition mapping
    attributes['decdigit'] = fields[6]              # Decimal digit value
    attributes['digit'] = fields[7]                 # Digit value
    attributes['numeric'] = fields[8]               # Numeric value
    attributes['mirrored'] = fields[9]              # mirrored
    attributes['oldname'] = fields[10]              # Old Unicode 1.0 name
    attributes['comment'] = fields[11]              # comment
    attributes['upper'] = case_mapping(fields[12])  # Uppercase mapping
    attributes['lower'] = case_mapping(fields[13])  # Lowercase mapping
    attributes['title'] = case_mapping(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = attributes
||||||
|
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Exits the process with status 1 on malformed lines or broken
    range pairs.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Holds the “…, First>” line of a pending range until the
        # matching “…, Last>” line is seen.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # Strip the “<” and the “, First>” suffix to get the
                # bare range name for comparison with the Last line.
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # All fields except the code point must match the
                # First line of the range.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Apply the shared attributes to every code point in
                # the inclusive range.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
||||||
|
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    Lines that match neither form (comments, blanks) are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if match is None:
                continue
            width_class = match.group('property')
            first = int(match.group('codepoint1'), 16)
            last_field = match.group('codepoint2')
            # A missing second code point means a single-point entry.
            last = int(last_field, 16) if last_field else first
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = width_class
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 use four hexadecimal digits, all others
    use eight: 0x41 -> '<U0041>', 0x1F600 -> '<U0001F600>'.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{0:0{1}X}>'.format(code_point, digits)
||||||
|
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Maps each code point (int) to the '/x..' byte-sequence string given
    for it.  A line may describe a single code point or a range like
    “<U3400>..<U4DB5>”; in the range case every code point in the range
    gets the same byte-sequence string.  Exits with status 1 when the
    file has no CHARMAP/END CHARMAP section.
    '''
    # Bug fixes in the pattern: the original used “(:?” (a capturing
    # group starting with an optional literal colon) where the
    # non-capturing group “(?:” was intended, and the character class
    # “[0-9-A-F]” which also matches a literal “-”.  The set of valid
    # charmap lines accepted is unchanged.
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything before the CHARMAP section.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                # Comment line.
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
        sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                         %file_name)
        exit(1)
||||||
|
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints counts of removed, changed and added code points; individual
    code points are listed only when the matching --show_* command line
    option was given (read from the global ARGS).
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(set(ocharmap)-set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap)-set(ncharmap)):
            # NOTE(review): '{:s}'.format(None) raises TypeError, so this
            # line would fail for a code point absent from
            # UNICODE_ATTRIBUTES — confirm whether that can happen here.
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Code points present in both files but with different byte
    # sequences.
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          %len(set(ncharmap)-set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap)-set(ocharmap)):
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
|
||||||
|
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Maps each code point (int) to its width, 0 or 2 (code points not
    listed in WIDTH default to width 1 and do not appear here).  A line
    may describe a single code point or a range like
    “<U1100>...<U115F>”.  Returns None after printing an error when the
    file has no WIDTH/END WIDTH section.
    '''
    # Bug fixes in the pattern: “(:?” → non-capturing group “(?:”, and
    # the character class “[0-9-A-F]” (which also matched a literal
    # “-”) → “[0-9A-F]”.  The set of valid WIDTH lines accepted is
    # unchanged.
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything before the WIDTH section.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
        # Bug fix: the original formatted this message with the
        # undefined name “file”, raising NameError instead of printing
        # the intended diagnostic.
        sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                         %file_name)
||||||
|
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file.

    original_file_name: the old UTF-8 charmap file
    new_file_name: the newly generated UTF-8 charmap file

    Prints counts of removed, changed, and added characters; the
    --show_missing_characters, --show_changed_characters, and
    --show_added_characters command line options enable detailed
    per-character listings.
    '''
    def _details(key):
        '''Return the eaw/category/bidi/name detail string for a code point.

        Bug fix: the original passed None straight into '{:s}'.format()
        for code points missing from EAST_ASIAN_WIDTHS or
        UNICODE_ATTRIBUTES, which raises TypeError.  We convert with
        str() first so missing entries print as “None”.
        '''
        eaw = EAST_ASIAN_WIDTHS[key] if key in EAST_ASIAN_WIDTHS else None
        attrs = UNICODE_ATTRIBUTES[key] if key in UNICODE_ATTRIBUTES else {}
        return ('eaw={:s} '.format(str(eaw))
                + 'category={:2s} '.format(str(attrs.get('category')))
                + 'bidi={:3s} '.format(str(attrs.get('bidi')))
                + 'name={:s}'.format(str(attrs.get('name'))))

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          %len(set(owidth)-set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth)-set(nwidth)):
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _details(key))
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + _details(key))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          %len(set(nwidth)-set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth)-set(owidth)):
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _details(key))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # Table-driven option setup: (flags, keyword arguments) per option.
    _OPTION_TABLE = [
        (('-o', '--old_utf8_file'),
         dict(nargs='?', required=True, type=str,
              help='The old UTF-8 file.')),
        (('-n', '--new_utf8_file'),
         dict(nargs='?', required=True, type=str,
              help='The new UTF-8 file.')),
        (('-u', '--unicode_data_file'),
         dict(nargs='?', type=str,
              help='The UnicodeData.txt file to read.')),
        (('-e', '--east_asian_width_file'),
         dict(nargs='?', type=str,
              help='The EastAsianWidth.txt file to read.')),
        (('-a', '--show_added_characters'),
         dict(action='store_true',
              help='Show characters which were added in detail.')),
        (('-m', '--show_missing_characters'),
         dict(action='store_true',
              help='Show characters which were removed in detail.')),
        (('-c', '--show_changed_characters'),
         dict(action='store_true',
              help='Show characters whose width was changed in detail.')),
    ]
    for _FLAGS, _KWARGS in _OPTION_TABLE:
        PARSER.add_argument(*_FLAGS, **_KWARGS)
    ARGS = PARSER.parse_args()

    # The optional Unicode data files enrich the report with character
    # names, categories, and East Asian width properties.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
|
286
localedata/unicode-gen/utf8_gen.py
Executable file
286
localedata/unicode-gen/utf8_gen.py
Executable file
@ -0,0 +1,286 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''glibc/localedata/charmaps/UTF-8 file generator script
|
||||||
|
|
||||||
|
This script generates a glibc/localedata/charmaps/UTF-8 file
|
||||||
|
from Unicode data.
|
||||||
|
|
||||||
|
Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||||
|
|
||||||
|
It will output UTF-8 file
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

# Initial consonants (choseong), 19 entries.
jamo_initial_short_name = [
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
]

# Medial vowels (jungseong), 21 entries.
jamo_medial_short_name = [
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
]

# Final consonants (jongseong), 28 entries; the first entry is empty
# because a syllable may have no final consonant.
#
# Bug fix: the sixth entry read 'NI'; the Unicode jamo short name for
# HANGUL JONGSEONG NIEUN-CIEUC (U+11AC) is 'NJ' (see Jamo.txt in the
# Unicode Character Database), so syllables with that final consonant
# would otherwise get wrong names.
jamo_final_short_name = [
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
]
|
||||||
|
|
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 use four hex digits, all others eight,
    matching the <UXXXX>/<UXXXXXXXX> notation of glibc charmap files.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{0:0{1}X}>'.format(code_point, digits)
|
||||||
|
|
||||||
|
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start: first code point of the range, as a hexadecimal string
    end: last code point of the range, as a hexadecimal string
    outfile: the open output file object
    name: the range name taken from UnicodeData.txt

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the precomposed syllable into initial/medial/final
            # jamo indices (28 finals per medial, 21 medials per initial),
            # see the Unicode book sections 3.11 and 4.4.
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + jamo_initial_short_name[index1] \
                + jamo_medial_short_name[index2] \
                + jamo_final_short_name[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # UnicodeData.txt contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        # The final chunk may be shorter than 64 code points; it ends at
        # “end” rather than at the next multiple of 64.
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
|
||||||
|
|
||||||
|
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # ”Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # A “First” line is remembered in fields_start; the range is
        # written out when the matching “Last” line is seen.
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # fields[1][:-7] strips the trailing “, Last>”; the appended
            # “>” closes the angle bracket of the range name again.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
|
||||||
|
|
||||||
|
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: the code point as an int

    Returns the string of /xNN byte values.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for surrogates.
    # The original code used a fixed lookup table covering only the six
    # surrogate code points that appear as range markers in
    # UnicodeData.txt; here we generalize it by computing the three-byte
    # UTF-8 bit pattern (1110xxxx 10xxxxxx 10xxxxxx) for *any* surrogate
    # directly.  The results are identical for the six original entries.
    if 0xD800 <= code_point <= 0xDFFF:
        return '/x{:02x}/x{:02x}/x{:02x}'.format(
            0xE0 | (code_point >> 12),
            0x80 | ((code_point >> 6) & 0x3F),
            0x80 | (code_point & 0x3F))
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
|
||||||
|
|
||||||
|
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    header_lines = (
        "<code_set_name> UTF-8\n"
        "<comment_char> %\n"
        "<escape_char> /\n"
        "<mb_cur_min> 1\n"
        "<mb_cur_max> 6\n\n"
        "% CHARMAP generated using utf8_gen.py\n"
        "% alias ISO-10646/UTF-8\n"
        "CHARMAP\n")
    outfile.write(header_lines)
|
||||||
|
|
||||||
|
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    # Each entry is emitted verbatim; the grep commands document how the
    # width classes were derived from the Unicode data files.
    comment_lines = [
        '% Character width according to Unicode 7.0.0.\n',
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        + 'generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
        + 'UnicodeData.txt"\n',
        '% - Format control characters have width 0; '
        + 'generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        # Zero-width characters are not listed separately: they are
        # already covered by the Cf category above.
        'WIDTH\n',
    ]
    for comment_line in comment_lines:
        outfile.write(comment_line)
|
||||||
|
|
||||||
|
def process_width(outfile, ulines, elines):
    '''Write the body of the WIDTH section to the output file.

    ulines: lines from UnicodeData.txt
    elines: lines from EastAsianWidth.txt

    '''
    entries = {}
    # Non-spacing marks (bidi class NSM) and format control characters
    # (category Cf) from UnicodeData.txt get width 0.
    for unicode_line in ulines:
        unicode_fields = unicode_line.split(";")
        if unicode_fields[4] == "NSM" or unicode_fields[2] == "Cf":
            code_point = int(unicode_fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t0'

    # Entries found in EastAsianWidth.txt get width 2 and override any
    # entry derived from UnicodeData.txt above.
    for eaw_line in elines:
        eaw_fields = eaw_line.split(";")
        if '..' not in eaw_fields[0]:
            code_point = int(eaw_fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t2'
        else:
            first_hex, last_hex = eaw_fields[0].split("..")
            first = int(first_hex, 16)
            last = int(last_hex, 16)
            # Drop single-point entries shadowed by this range, then
            # record the whole range as one “<U…>...<U…>” entry.
            for code_point in range(first, last + 1):
                entries.pop(code_point, None)
            entries[first] = '{:s}...{:s}\t2'.format(
                ucs_symbol(first), ucs_symbol(last))

    for code_point in sorted(entries):
        outfile.write(entries[code_point] + '\n')
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        with open(sys.argv[1], mode='r') as unidata_file:
            unicode_data_lines = unidata_file.readlines()
        # If characters from EastAsianWidth.txt which are from reserved
        # ranges (i.e. not yet assigned code points) are added to the
        # WIDTH section of the UTF-8 file, then “make check” produces
        # “Unknown Character” errors for these code points because such
        # unassigned code points are not in the CHARMAP section of the
        # UTF-8 file.
        #
        # Therefore, we skip all reserved code points and keep only the
        # wide (W) and fullwidth (F) entries when reading the
        # EastAsianWidth.txt file.
        with open(sys.argv[2], mode='r') as east_asian_width_file:
            east_asian_width_lines = [
                line.strip() for line in east_asian_width_file
                if not re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', line)
                and re.match(r'^[^;]*;[WF]', line)]
        with open('UTF-8', mode='w') as output_file:
            # CHARMAP section, generated from UnicodeData.txt:
            write_header_charmap(output_file)
            process_charmap(unicode_data_lines, output_file)
            output_file.write("END CHARMAP\n\n")
            # WIDTH section, generated from EastAsianWidth.txt:
            write_header_width(output_file)
            process_width(output_file, unicode_data_lines,
                          east_asian_width_lines)
            output_file.write("END WIDTH\n")
|
Loading…
Reference in New Issue
Block a user