mirror of
https://sourceware.org/git/glibc.git
synced 2024-11-21 12:30:06 +00:00
Unicode 7.0.0 update; added generator scripts.
for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators.
This commit is contained in:
parent
e4a399dc3d
commit
4a4839c94a
11
NEWS
11
NEWS
@ -9,8 +9,15 @@ Version 2.22
|
|||||||
|
|
||||||
* The following bugs are resolved with this release:
|
* The following bugs are resolved with this release:
|
||||||
|
|
||||||
4719, 15319, 15467, 15790, 16560, 17569, 17792, 17912, 17932, 17944,
|
4719, 13064, 14094, 15319, 15467, 15790, 16560, 17569, 17588, 17792,
|
||||||
17949, 17964, 17965, 17967, 17969, 17978, 17987, 17991, 17996, 17999.
|
17912, 17932, 17944, 17949, 17964, 17965, 17967, 17969, 17978, 17987,
|
||||||
|
17991, 17996, 17998, 17999.
|
||||||
|
|
||||||
|
* Character encoding and ctype tables were updated to Unicode 7.0.0, using
|
||||||
|
new generator scripts contributed by Pravin Satpute and Mike FABIAN (Red
|
||||||
|
Hat). These updates cause user visible changes, such as the fix for bug
|
||||||
|
17998.
|
||||||
|
|
||||||
|
|
||||||
Version 2.21
|
Version 2.21
|
||||||
|
|
||||||
|
@ -1,3 +1,30 @@
|
|||||||
|
2015-02-20 Alexandre Oliva <aoliva@redhat.com>
|
||||||
|
|
||||||
|
[BZ #17588]
|
||||||
|
[BZ #13064]
|
||||||
|
[BZ #14094]
|
||||||
|
[BZ #17998]
|
||||||
|
* unicode-gen/Makefile: New.
|
||||||
|
* unicode-gen/unicode-license.txt: New, from Unicode.
|
||||||
|
* unicode-gen/UnicodeData.txt: New, from Unicode.
|
||||||
|
* unicode-gen/DerivedCoreProperties.txt: New, from Unicode.
|
||||||
|
* unicode-gen/EastAsianWidth.txt: New, from Unicode.
|
||||||
|
* unicode-gen/gen_unicode_ctype.py: New generator, from Mike
|
||||||
|
FABIAN <mfabian@redhat.com>.
|
||||||
|
* unicode-gen/ctype_compatibility.py: New verifier, from
|
||||||
|
Pravin Satpute <psatpute@redhat.com> and Mike FABIAN.
|
||||||
|
* unicode-gen/ctype_compatibility_test_cases.py: New verifier
|
||||||
|
module, from Mike FABIAN.
|
||||||
|
* unicode-gen/utf8_gen.py: New generator, from Pravin Satpute
|
||||||
|
and Mike FABIAN.
|
||||||
|
* unicode-gen/utf8_compatibility.py: New verifier, from Pravin
|
||||||
|
Satpute and Mike FABIAN.
|
||||||
|
* charmaps/UTF-8: Update.
|
||||||
|
* locales/i18n: Update.
|
||||||
|
* gen-unicode-ctype.c: Remove.
|
||||||
|
* tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns
|
||||||
|
true for ordinal indicators.
|
||||||
|
|
||||||
2015-01-21 Marek Polacek <polacek@redhat.com>
|
2015-01-21 Marek Polacek <polacek@redhat.com>
|
||||||
|
|
||||||
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.
|
* tests-mbwc/tst_wcscpy.c (tst_wcscpy): Fix condition.
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,784 +0,0 @@
|
|||||||
/* Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
|
||||||
Copyright (C) 2000-2015 Free Software Foundation, Inc.
|
|
||||||
This file is part of the GNU C Library.
|
|
||||||
Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
|
|
||||||
|
|
||||||
The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
modify it under the terms of the GNU Lesser General Public
|
|
||||||
License as published by the Free Software Foundation; either
|
|
||||||
version 2.1 of the License, or (at your option) any later version.
|
|
||||||
|
|
||||||
The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
Lesser General Public License for more details.
|
|
||||||
|
|
||||||
You should have received a copy of the GNU Lesser General Public
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<http://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
/* Usage example:
|
|
||||||
$ gen-unicode /usr/local/share/Unidata/UnicodeData.txt 3.1
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <time.h>
|
|
||||||
|
|
||||||
/* This structure represents one line in the UnicodeData.txt file. */
|
|
||||||
struct unicode_attribute
|
|
||||||
{
|
|
||||||
const char *name; /* Character name */
|
|
||||||
const char *category; /* General category */
|
|
||||||
const char *combining; /* Canonical combining classes */
|
|
||||||
const char *bidi; /* Bidirectional category */
|
|
||||||
const char *decomposition; /* Character decomposition mapping */
|
|
||||||
const char *decdigit; /* Decimal digit value */
|
|
||||||
const char *digit; /* Digit value */
|
|
||||||
const char *numeric; /* Numeric value */
|
|
||||||
int mirrored; /* mirrored */
|
|
||||||
const char *oldname; /* Old Unicode 1.0 name */
|
|
||||||
const char *comment; /* Comment */
|
|
||||||
unsigned int upper; /* Uppercase mapping */
|
|
||||||
unsigned int lower; /* Lowercase mapping */
|
|
||||||
unsigned int title; /* Titlecase mapping */
|
|
||||||
};
|
|
||||||
|
|
||||||
/* Missing fields are represented with "" for strings, and NONE for
|
|
||||||
characters. */
|
|
||||||
#define NONE (~(unsigned int)0)
|
|
||||||
|
|
||||||
/* The entire contents of the UnicodeData.txt file. */
|
|
||||||
struct unicode_attribute unicode_attributes [0x110000];
|
|
||||||
|
|
||||||
/* Stores in unicode_attributes[i] the values from the given fields. */
|
|
||||||
static void
|
|
||||||
fill_attribute (unsigned int i,
|
|
||||||
const char *field1, const char *field2,
|
|
||||||
const char *field3, const char *field4,
|
|
||||||
const char *field5, const char *field6,
|
|
||||||
const char *field7, const char *field8,
|
|
||||||
const char *field9, const char *field10,
|
|
||||||
const char *field11, const char *field12,
|
|
||||||
const char *field13, const char *field14)
|
|
||||||
{
|
|
||||||
struct unicode_attribute * uni;
|
|
||||||
|
|
||||||
if (i >= 0x110000)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "index too large\n");
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
if (strcmp (field2, "Cs") == 0)
|
|
||||||
/* Surrogates are UTF-16 artefacts, not real characters. Ignore them. */
|
|
||||||
return;
|
|
||||||
uni = &unicode_attributes[i];
|
|
||||||
/* Copy the strings. */
|
|
||||||
uni->name = strdup (field1);
|
|
||||||
uni->category = (field2[0] == '\0' ? "" : strdup (field2));
|
|
||||||
uni->combining = (field3[0] == '\0' ? "" : strdup (field3));
|
|
||||||
uni->bidi = (field4[0] == '\0' ? "" : strdup (field4));
|
|
||||||
uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
|
|
||||||
uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6));
|
|
||||||
uni->digit = (field7[0] == '\0' ? "" : strdup (field7));
|
|
||||||
uni->numeric = (field8[0] == '\0' ? "" : strdup (field8));
|
|
||||||
uni->mirrored = (field9[0] == 'Y');
|
|
||||||
uni->oldname = (field10[0] == '\0' ? "" : strdup (field10));
|
|
||||||
uni->comment = (field11[0] == '\0' ? "" : strdup (field11));
|
|
||||||
uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
|
|
||||||
uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
|
|
||||||
uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Maximum length of a field in the UnicodeData.txt file. */
|
|
||||||
#define FIELDLEN 120
|
|
||||||
|
|
||||||
/* Reads the next field from STREAM. The buffer BUFFER has size FIELDLEN.
|
|
||||||
Reads up to (but excluding) DELIM.
|
|
||||||
Returns 1 when a field was successfully read, otherwise 0. */
|
|
||||||
static int
|
|
||||||
getfield (FILE *stream, char *buffer, int delim)
|
|
||||||
{
|
|
||||||
int count = 0;
|
|
||||||
int c;
|
|
||||||
|
|
||||||
for (; (c = getc (stream)), (c != EOF && c != delim); )
|
|
||||||
{
|
|
||||||
/* The original unicode.org UnicodeData.txt file happens to have
|
|
||||||
CR/LF line terminators. Silently convert to LF. */
|
|
||||||
if (c == '\r')
|
|
||||||
continue;
|
|
||||||
|
|
||||||
/* Put c into the buffer. */
|
|
||||||
if (++count >= FIELDLEN - 1)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "field too long\n");
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
*buffer++ = c;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c == EOF)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
*buffer = '\0';
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
|
|
||||||
file. */
|
|
||||||
static void
|
|
||||||
fill_attributes (const char *unicodedata_filename)
|
|
||||||
{
|
|
||||||
unsigned int i, j;
|
|
||||||
FILE *stream;
|
|
||||||
char field0[FIELDLEN];
|
|
||||||
char field1[FIELDLEN];
|
|
||||||
char field2[FIELDLEN];
|
|
||||||
char field3[FIELDLEN];
|
|
||||||
char field4[FIELDLEN];
|
|
||||||
char field5[FIELDLEN];
|
|
||||||
char field6[FIELDLEN];
|
|
||||||
char field7[FIELDLEN];
|
|
||||||
char field8[FIELDLEN];
|
|
||||||
char field9[FIELDLEN];
|
|
||||||
char field10[FIELDLEN];
|
|
||||||
char field11[FIELDLEN];
|
|
||||||
char field12[FIELDLEN];
|
|
||||||
char field13[FIELDLEN];
|
|
||||||
char field14[FIELDLEN];
|
|
||||||
int lineno = 0;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
unicode_attributes[i].name = NULL;
|
|
||||||
|
|
||||||
stream = fopen (unicodedata_filename, "r");
|
|
||||||
if (stream == NULL)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (;;)
|
|
||||||
{
|
|
||||||
int n;
|
|
||||||
|
|
||||||
lineno++;
|
|
||||||
n = getfield (stream, field0, ';');
|
|
||||||
n += getfield (stream, field1, ';');
|
|
||||||
n += getfield (stream, field2, ';');
|
|
||||||
n += getfield (stream, field3, ';');
|
|
||||||
n += getfield (stream, field4, ';');
|
|
||||||
n += getfield (stream, field5, ';');
|
|
||||||
n += getfield (stream, field6, ';');
|
|
||||||
n += getfield (stream, field7, ';');
|
|
||||||
n += getfield (stream, field8, ';');
|
|
||||||
n += getfield (stream, field9, ';');
|
|
||||||
n += getfield (stream, field10, ';');
|
|
||||||
n += getfield (stream, field11, ';');
|
|
||||||
n += getfield (stream, field12, ';');
|
|
||||||
n += getfield (stream, field13, ';');
|
|
||||||
n += getfield (stream, field14, '\n');
|
|
||||||
if (n == 0)
|
|
||||||
break;
|
|
||||||
if (n != 15)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "short line in'%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
i = strtoul (field0, NULL, 16);
|
|
||||||
if (field1[0] == '<'
|
|
||||||
&& strlen (field1) >= 9
|
|
||||||
&& !strcmp (field1 + strlen(field1) - 8, ", First>"))
|
|
||||||
{
|
|
||||||
/* Deal with a range. */
|
|
||||||
lineno++;
|
|
||||||
n = getfield (stream, field0, ';');
|
|
||||||
n += getfield (stream, field1, ';');
|
|
||||||
n += getfield (stream, field2, ';');
|
|
||||||
n += getfield (stream, field3, ';');
|
|
||||||
n += getfield (stream, field4, ';');
|
|
||||||
n += getfield (stream, field5, ';');
|
|
||||||
n += getfield (stream, field6, ';');
|
|
||||||
n += getfield (stream, field7, ';');
|
|
||||||
n += getfield (stream, field8, ';');
|
|
||||||
n += getfield (stream, field9, ';');
|
|
||||||
n += getfield (stream, field10, ';');
|
|
||||||
n += getfield (stream, field11, ';');
|
|
||||||
n += getfield (stream, field12, ';');
|
|
||||||
n += getfield (stream, field13, ';');
|
|
||||||
n += getfield (stream, field14, '\n');
|
|
||||||
if (n != 15)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
if (!(field1[0] == '<'
|
|
||||||
&& strlen (field1) >= 8
|
|
||||||
&& !strcmp (field1 + strlen (field1) - 7, ", Last>")))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "missing end range in '%s':%d\n",
|
|
||||||
unicodedata_filename, lineno);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
field1[strlen (field1) - 7] = '\0';
|
|
||||||
j = strtoul (field0, NULL, 16);
|
|
||||||
for (; i <= j; i++)
|
|
||||||
fill_attribute (i, field1+1, field2, field3, field4, field5,
|
|
||||||
field6, field7, field8, field9, field10,
|
|
||||||
field11, field12, field13, field14);
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
/* Single character line */
|
|
||||||
fill_attribute (i, field1, field2, field3, field4, field5,
|
|
||||||
field6, field7, field8, field9, field10,
|
|
||||||
field11, field12, field13, field14);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (ferror (stream) || fclose (stream))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Character mappings. */
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_upper (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].upper != NONE)
|
|
||||||
return unicode_attributes[ch].upper;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_lower (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].lower != NONE)
|
|
||||||
return unicode_attributes[ch].lower;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
static unsigned int
|
|
||||||
to_title (unsigned int ch)
|
|
||||||
{
|
|
||||||
if (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].title != NONE)
|
|
||||||
return unicode_attributes[ch].title;
|
|
||||||
else
|
|
||||||
return ch;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Character class properties. */
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_upper (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (to_lower (ch) != ch);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_lower (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (to_upper (ch) != ch)
|
|
||||||
/* <U00DF> is lowercase, but without simple to_upper mapping. */
|
|
||||||
|| (ch == 0x00DF);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_alpha (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& ((unicode_attributes[ch].category[0] == 'L'
|
|
||||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
|
||||||
<U0E2F>, <U0E46> should belong to is_punct. */
|
|
||||||
&& (ch != 0x0E2F) && (ch != 0x0E46))
|
|
||||||
/* Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
|
||||||
<U0E31>, <U0E34>..<U0E3A>, <U0E47>..<U0E4E> are is_alpha. */
|
|
||||||
|| (ch == 0x0E31)
|
|
||||||
|| (ch >= 0x0E34 && ch <= 0x0E3A)
|
|
||||||
|| (ch >= 0x0E47 && ch <= 0x0E4E)
|
|
||||||
/* Avoid warning for <U0345>. */
|
|
||||||
|| (ch == 0x0345)
|
|
||||||
/* Avoid warnings for <U2160>..<U217F>. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'l')
|
|
||||||
/* Avoid warnings for <U24B6>..<U24E9>. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'S'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'o'
|
|
||||||
&& strstr (unicode_attributes[ch].name, " LETTER ")
|
|
||||||
!= NULL)
|
|
||||||
/* Consider all the non-ASCII digits as alphabetic.
|
|
||||||
ISO C 99 forbids us to have them in category "digit",
|
|
||||||
but we want iswalnum to return true on them. */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'd'
|
|
||||||
&& !(ch >= 0x0030 && ch <= 0x0039))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_digit (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'N'
|
|
||||||
&& unicode_attributes[ch].category[1] == 'd');
|
|
||||||
/* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
|
|
||||||
a zero. Must add <0> in front of them by hand. */
|
|
||||||
#else
|
|
||||||
/* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
|
|
||||||
takes it away:
|
|
||||||
7.25.2.1.5:
|
|
||||||
The iswdigit function tests for any wide character that corresponds
|
|
||||||
to a decimal-digit character (as defined in 5.2.1).
|
|
||||||
5.2.1:
|
|
||||||
the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
|
|
||||||
*/
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_outdigit (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_blank (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (ch == 0x0009 /* '\t' */
|
|
||||||
/* Category Zs without mention of "<noBreak>" */
|
|
||||||
|| (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& unicode_attributes[ch].category[1] == 's'
|
|
||||||
&& !strstr (unicode_attributes[ch].decomposition, "<noBreak>")));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_space (unsigned int ch)
|
|
||||||
{
|
|
||||||
/* Don't make U+00A0 a space. Non-breaking space means that all programs
|
|
||||||
should treat it like a punctuation character, not like a space. */
|
|
||||||
return (ch == 0x0020 /* ' ' */
|
|
||||||
|| ch == 0x000C /* '\f' */
|
|
||||||
|| ch == 0x000A /* '\n' */
|
|
||||||
|| ch == 0x000D /* '\r' */
|
|
||||||
|| ch == 0x0009 /* '\t' */
|
|
||||||
|| ch == 0x000B /* '\v' */
|
|
||||||
/* Categories Zl, Zp, and Zs without mention of "<noBreak>" */
|
|
||||||
|| (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p'
|
|
||||||
|| (unicode_attributes[ch].category[1] == 's'
|
|
||||||
&& !strstr (unicode_attributes[ch].decomposition,
|
|
||||||
"<noBreak>")))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_cntrl (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& (!strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
/* Categories Zl and Zp */
|
|
||||||
|| (unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p'))));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_xdigit (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return is_digit (ch)
|
|
||||||
|| (ch >= 0x0041 && ch <= 0x0046)
|
|
||||||
|| (ch >= 0x0061 && ch <= 0x0066);
|
|
||||||
#else
|
|
||||||
/* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
|
|
||||||
takes it away:
|
|
||||||
7.25.2.1.12:
|
|
||||||
The iswxdigit function tests for any wide character that corresponds
|
|
||||||
to a hexadecimal-digit character (as defined in 6.4.4.1).
|
|
||||||
6.4.4.1:
|
|
||||||
hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
|
|
||||||
*/
|
|
||||||
return (ch >= 0x0030 && ch <= 0x0039)
|
|
||||||
|| (ch >= 0x0041 && ch <= 0x0046)
|
|
||||||
|| (ch >= 0x0061 && ch <= 0x0066);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_graph (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
&& !is_space (ch));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_print (unsigned int ch)
|
|
||||||
{
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& strcmp (unicode_attributes[ch].name, "<control>")
|
|
||||||
/* Categories Zl and Zp */
|
|
||||||
&& !(unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'Z'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'l'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'p')));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_punct (unsigned int ch)
|
|
||||||
{
|
|
||||||
#if 0
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'P');
|
|
||||||
#else
|
|
||||||
/* The traditional POSIX definition of punctuation is every graphic,
|
|
||||||
non-alphanumeric character. */
|
|
||||||
return (is_graph (ch) && !is_alpha (ch) && !is_digit (ch));
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_combining (unsigned int ch)
|
|
||||||
{
|
|
||||||
/* Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
|
|
||||||
file. In 3.0.1 it was identical to the union of the general categories
|
|
||||||
"Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
|
|
||||||
PropList.txt file, so we take the latter definition. */
|
|
||||||
return (unicode_attributes[ch].name != NULL
|
|
||||||
&& unicode_attributes[ch].category[0] == 'M'
|
|
||||||
&& (unicode_attributes[ch].category[1] == 'n'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'c'
|
|
||||||
|| unicode_attributes[ch].category[1] == 'e'));
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool
|
|
||||||
is_combining_level3 (unsigned int ch)
|
|
||||||
{
|
|
||||||
return is_combining (ch)
|
|
||||||
&& !(unicode_attributes[ch].combining[0] != '\0'
|
|
||||||
&& unicode_attributes[ch].combining[0] != '0'
|
|
||||||
&& strtoul (unicode_attributes[ch].combining, NULL, 10) >= 200);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the UCS symbol string for a Unicode character. */
|
|
||||||
static const char *
|
|
||||||
ucs_symbol (unsigned int i)
|
|
||||||
{
|
|
||||||
static char buf[11+1];
|
|
||||||
|
|
||||||
sprintf (buf, (i < 0x10000 ? "<U%04X>" : "<U%08X>"), i);
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Return the UCS symbol range string for a Unicode characters interval. */
|
|
||||||
static const char *
|
|
||||||
ucs_symbol_range (unsigned int low, unsigned int high)
|
|
||||||
{
|
|
||||||
static char buf[24+1];
|
|
||||||
|
|
||||||
strcpy (buf, ucs_symbol (low));
|
|
||||||
strcat (buf, "..");
|
|
||||||
strcat (buf, ucs_symbol (high));
|
|
||||||
return buf;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output a character class (= property) table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_charclass (FILE *stream, const char *classname,
|
|
||||||
bool (*func) (unsigned int))
|
|
||||||
{
|
|
||||||
char table[0x110000];
|
|
||||||
unsigned int i;
|
|
||||||
bool need_semicolon;
|
|
||||||
const int max_column = 75;
|
|
||||||
int column;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
table[i] = (int) func (i);
|
|
||||||
|
|
||||||
fprintf (stream, "%s ", classname);
|
|
||||||
need_semicolon = false;
|
|
||||||
column = 1000;
|
|
||||||
for (i = 0; i < 0x110000; )
|
|
||||||
{
|
|
||||||
if (!table[i])
|
|
||||||
i++;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
unsigned int low, high;
|
|
||||||
char buf[25];
|
|
||||||
|
|
||||||
low = i;
|
|
||||||
do
|
|
||||||
i++;
|
|
||||||
while (i < 0x110000 && table[i]);
|
|
||||||
high = i - 1;
|
|
||||||
|
|
||||||
if (low == high)
|
|
||||||
strcpy (buf, ucs_symbol (low));
|
|
||||||
else
|
|
||||||
strcpy (buf, ucs_symbol_range (low, high));
|
|
||||||
|
|
||||||
if (need_semicolon)
|
|
||||||
{
|
|
||||||
fprintf (stream, ";");
|
|
||||||
column++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (column + strlen (buf) > max_column)
|
|
||||||
{
|
|
||||||
fprintf (stream, "/\n ");
|
|
||||||
column = 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "%s", buf);
|
|
||||||
column += strlen (buf);
|
|
||||||
need_semicolon = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output a character mapping table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_charmap (FILE *stream, const char *mapname,
|
|
||||||
unsigned int (*func) (unsigned int))
|
|
||||||
{
|
|
||||||
char table[0x110000];
|
|
||||||
unsigned int i;
|
|
||||||
bool need_semicolon;
|
|
||||||
const int max_column = 75;
|
|
||||||
int column;
|
|
||||||
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
table[i] = (func (i) != i);
|
|
||||||
|
|
||||||
fprintf (stream, "%s ", mapname);
|
|
||||||
need_semicolon = false;
|
|
||||||
column = 1000;
|
|
||||||
for (i = 0; i < 0x110000; i++)
|
|
||||||
if (table[i])
|
|
||||||
{
|
|
||||||
char buf[25+1];
|
|
||||||
|
|
||||||
strcpy (buf, "(");
|
|
||||||
strcat (buf, ucs_symbol (i));
|
|
||||||
strcat (buf, ",");
|
|
||||||
strcat (buf, ucs_symbol (func (i)));
|
|
||||||
strcat (buf, ")");
|
|
||||||
|
|
||||||
if (need_semicolon)
|
|
||||||
{
|
|
||||||
fprintf (stream, ";");
|
|
||||||
column++;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (column + strlen (buf) > max_column)
|
|
||||||
{
|
|
||||||
fprintf (stream, "/\n ");
|
|
||||||
column = 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "%s", buf);
|
|
||||||
column += strlen (buf);
|
|
||||||
need_semicolon = true;
|
|
||||||
}
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output the width table. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_widthmap (FILE *stream)
|
|
||||||
{
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Output the tables to the given file. */
|
|
||||||
|
|
||||||
static void
|
|
||||||
output_tables (const char *filename, const char *version)
|
|
||||||
{
|
|
||||||
FILE *stream;
|
|
||||||
unsigned int ch;
|
|
||||||
|
|
||||||
stream = fopen (filename, "w");
|
|
||||||
if (stream == NULL)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "cannot open '%s' for writing\n", filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "escape_char /\n");
|
|
||||||
fprintf (stream, "comment_char %%\n");
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
fprintf (stream, "%% Generated automatically by gen-unicode-ctype for Unicode %s.\n",
|
|
||||||
version);
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
|
|
||||||
fprintf (stream, "LC_IDENTIFICATION\n");
|
|
||||||
fprintf (stream, "title \"Unicode %s FDCC-set\"\n", version);
|
|
||||||
fprintf (stream, "source \"UnicodeData.txt, PropList.txt\"\n");
|
|
||||||
fprintf (stream, "address \"\"\n");
|
|
||||||
fprintf (stream, "contact \"\"\n");
|
|
||||||
fprintf (stream, "email \"bug-glibc-locales@gnu.org\"\n");
|
|
||||||
fprintf (stream, "tel \"\"\n");
|
|
||||||
fprintf (stream, "fax \"\"\n");
|
|
||||||
fprintf (stream, "language \"\"\n");
|
|
||||||
fprintf (stream, "territory \"Earth\"\n");
|
|
||||||
fprintf (stream, "revision \"%s\"\n", version);
|
|
||||||
{
|
|
||||||
time_t now;
|
|
||||||
char date[11];
|
|
||||||
now = time (NULL);
|
|
||||||
strftime (date, sizeof (date), "%Y-%m-%d", gmtime (&now));
|
|
||||||
fprintf (stream, "date \"%s\"\n", date);
|
|
||||||
}
|
|
||||||
fprintf (stream, "category \"unicode:2001\";LC_CTYPE\n");
|
|
||||||
fprintf (stream, "END LC_IDENTIFICATION\n");
|
|
||||||
fprintf (stream, "\n");
|
|
||||||
|
|
||||||
/* Verifications. */
|
|
||||||
for (ch = 0; ch < 0x110000; ch++)
|
|
||||||
{
|
|
||||||
/* toupper restriction: "Only characters specified for the keywords
|
|
||||||
lower and upper shall be specified. */
|
|
||||||
if (to_upper (ch) != ch && !(is_lower (ch) || is_upper (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is not upper|lower but toupper(0x%04X) = 0x%04X\n",
|
|
||||||
ucs_symbol (ch), ch, to_upper (ch));
|
|
||||||
|
|
||||||
/* tolower restriction: "Only characters specified for the keywords
|
|
||||||
lower and upper shall be specified. */
|
|
||||||
if (to_lower (ch) != ch && !(is_lower (ch) || is_upper (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is not upper|lower but tolower(0x%04X) = 0x%04X\n",
|
|
||||||
ucs_symbol (ch), ch, to_lower (ch));
|
|
||||||
|
|
||||||
/* alpha restriction: "Characters classified as either upper or lower
|
|
||||||
shall automatically belong to this class. */
|
|
||||||
if ((is_lower (ch) || is_upper (ch)) && !is_alpha (ch))
|
|
||||||
fprintf (stderr, "%s is upper|lower but not alpha\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* alpha restriction: "No character specified for the keywords cntrl,
|
|
||||||
digit, punct or space shall be specified." */
|
|
||||||
if (is_alpha (ch) && is_cntrl (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and cntrl\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_punct (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and punct\n", ucs_symbol (ch));
|
|
||||||
if (is_alpha (ch) && is_space (ch))
|
|
||||||
fprintf (stderr, "%s is alpha and space\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* space restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, graph or xdigit shall be specified."
|
|
||||||
upper, lower, alpha already checked above. */
|
|
||||||
if (is_space (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is space and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_space (ch) && is_graph (ch))
|
|
||||||
fprintf (stderr, "%s is space and graph\n", ucs_symbol (ch));
|
|
||||||
if (is_space (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is space and xdigit\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* cntrl restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, punct, graph, print or xdigit shall be
|
|
||||||
specified." upper, lower, alpha already checked above. */
|
|
||||||
if (is_cntrl (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_punct (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and punct\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_graph (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and graph\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_print (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and print\n", ucs_symbol (ch));
|
|
||||||
if (is_cntrl (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is cntrl and xdigit\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* punct restriction: "No character specified for the keywords upper,
|
|
||||||
lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
|
||||||
be specified." upper, lower, alpha, cntrl already checked above. */
|
|
||||||
if (is_punct (ch) && is_digit (ch))
|
|
||||||
fprintf (stderr, "%s is punct and digit\n", ucs_symbol (ch));
|
|
||||||
if (is_punct (ch) && is_xdigit (ch))
|
|
||||||
fprintf (stderr, "%s is punct and xdigit\n", ucs_symbol (ch));
|
|
||||||
if (is_punct (ch) && (ch == 0x0020))
|
|
||||||
fprintf (stderr, "%s is punct\n", ucs_symbol (ch));
|
|
||||||
|
|
||||||
/* graph restriction: "No character specified for the keyword cntrl
|
|
||||||
shall be specified." Already checked above. */
|
|
||||||
|
|
||||||
/* print restriction: "No character specified for the keyword cntrl
|
|
||||||
shall be specified." Already checked above. */
|
|
||||||
|
|
||||||
/* graph - print relation: differ only in the <space> character.
|
|
||||||
How is this possible if there are more than one space character?!
|
|
||||||
I think susv2/xbd/locale.html should speak of "space characters",
|
|
||||||
not "space character". */
|
|
||||||
if (is_print (ch) && !(is_graph (ch) || /* ch == 0x0020 */ is_space (ch)))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is print but not graph|<space>\n", ucs_symbol (ch));
|
|
||||||
if (!is_print (ch) && (is_graph (ch) || ch == 0x0020))
|
|
||||||
fprintf (stderr,
|
|
||||||
"%s is graph|<space> but not print\n", ucs_symbol (ch));
|
|
||||||
}
|
|
||||||
|
|
||||||
fprintf (stream, "LC_CTYPE\n");
|
|
||||||
output_charclass (stream, "upper", is_upper);
|
|
||||||
output_charclass (stream, "lower", is_lower);
|
|
||||||
output_charclass (stream, "alpha", is_alpha);
|
|
||||||
output_charclass (stream, "digit", is_digit);
|
|
||||||
output_charclass (stream, "outdigit", is_outdigit);
|
|
||||||
output_charclass (stream, "blank", is_blank);
|
|
||||||
output_charclass (stream, "space", is_space);
|
|
||||||
output_charclass (stream, "cntrl", is_cntrl);
|
|
||||||
output_charclass (stream, "punct", is_punct);
|
|
||||||
output_charclass (stream, "xdigit", is_xdigit);
|
|
||||||
output_charclass (stream, "graph", is_graph);
|
|
||||||
output_charclass (stream, "print", is_print);
|
|
||||||
output_charclass (stream, "class \"combining\";", is_combining);
|
|
||||||
output_charclass (stream, "class \"combining_level3\";", is_combining_level3);
|
|
||||||
output_charmap (stream, "toupper", to_upper);
|
|
||||||
output_charmap (stream, "tolower", to_lower);
|
|
||||||
output_charmap (stream, "map \"totitle\";", to_title);
|
|
||||||
output_widthmap (stream);
|
|
||||||
fprintf (stream, "END LC_CTYPE\n");
|
|
||||||
|
|
||||||
if (ferror (stream) || fclose (stream))
|
|
||||||
{
|
|
||||||
fprintf (stderr, "error writing to '%s'\n", filename);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
int
|
|
||||||
main (int argc, char * argv[])
|
|
||||||
{
|
|
||||||
if (argc != 3)
|
|
||||||
{
|
|
||||||
fprintf (stderr, "Usage: %s UnicodeData.txt version\n", argv[0]);
|
|
||||||
exit (1);
|
|
||||||
}
|
|
||||||
|
|
||||||
fill_attributes (argv[1]);
|
|
||||||
|
|
||||||
output_tables ("unicode", argv[2]);
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
@ -1,5 +1,5 @@
|
|||||||
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
lower 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||||
000000000000000000000100000000000000000000000000
|
000000000010000000000100001000000000000000000000
|
||||||
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
|
lower 倳眑婭笫崷窙嗲睧颬睼麧緗鴇膹擨闀貘覷鏷禴矙𡜍𦶠<F0A19C8D>
|
||||||
000000000000000111111111111111111111111011111111
|
000000000000000111111111111111111111111011111111
|
||||||
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
upper 嵗╯丰戍貝物洎悖停眾斯須號獄播噶擱藏霰匸<E99CB0>帊昅恘
|
||||||
|
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
10794
localedata/unicode-gen/DerivedCoreProperties.txt
Normal file
File diff suppressed because it is too large
Load Diff
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
2121
localedata/unicode-gen/EastAsianWidth.txt
Normal file
File diff suppressed because it is too large
Load Diff
99
localedata/unicode-gen/Makefile
Normal file
99
localedata/unicode-gen/Makefile
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
# Copyright (C) 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
# Makefile for generating and updating Unicode-extracted files.
|
||||||
|
|
||||||
|
|
||||||
|
# This Makefile is NOT used as part of the GNU libc build. It needs
|
||||||
|
# to be run manually, within the source tree, at Unicode upgrades
|
||||||
|
# (change UNICODE_VERSION below), to update ../locales/i18n ctype
|
||||||
|
# information (part of the file is preserved, so don't wipe it all
|
||||||
|
# out), and ../charmaps/UTF-8.
|
||||||
|
|
||||||
|
# Use make all to generate the files used in the glibc build out of
|
||||||
|
# the original Unicode files; make check to verify that they are what
|
||||||
|
# we expect; make install to copy them to the location expected by the
|
||||||
|
# glibc build; and make clean to remove all generated files.
|
||||||
|
|
||||||
|
# We keep a local copy of the downloaded Unicode files, to avoid
|
||||||
|
# running afoul of the LGPL corresponding sources requirements, even
|
||||||
|
# though it's not clear that they are preferred over the generated
|
||||||
|
# files for making modifications.
|
||||||
|
|
||||||
|
|
||||||
|
UNICODE_VERSION = 7.0.0
|
||||||
|
|
||||||
|
PYTHON3 = python3
|
||||||
|
WGET = wget
|
||||||
|
|
||||||
|
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
|
||||||
|
GENERATED = i18n UTF-8
|
||||||
|
REPORTS = i18n-report UTF-8-report
|
||||||
|
|
||||||
|
all: $(GENERATED)
|
||||||
|
|
||||||
|
check: check-i18n check-UTF-8
|
||||||
|
|
||||||
|
install:
|
||||||
|
cp -p i18n ../locales/i18n
|
||||||
|
cp -p UTF-8 ../charmaps/UTF-8
|
||||||
|
|
||||||
|
clean: mostlyclean
|
||||||
|
-rm -rf __pycache__
|
||||||
|
mostlyclean:
|
||||||
|
-rm -f $(REPORTS) $(GENERATED)
|
||||||
|
|
||||||
|
.PHONY: all check clean mostlyclean install
|
||||||
|
|
||||||
|
i18n: UnicodeData.txt DerivedCoreProperties.txt
|
||||||
|
i18n: ../locales/i18n # Preserve non-ctype information.
|
||||||
|
i18n: gen_unicode_ctype.py
|
||||||
|
$(PYTHON3) gen_unicode_ctype.py -u UnicodeData.txt \
|
||||||
|
-d DerivedCoreProperties.txt -i ../locales/i18n -o $@ \
|
||||||
|
--unicode_version $(UNICODE_VERSION)
|
||||||
|
|
||||||
|
i18n-report: i18n ../locales/i18n
|
||||||
|
i18n-report: ctype_compatibility.py ctype_compatibility_test_cases.py
|
||||||
|
$(PYTHON3) ./ctype_compatibility.py -o ../locales/i18n \
|
||||||
|
-n i18n -a -m > $@
|
||||||
|
|
||||||
|
check-i18n: i18n-report
|
||||||
|
@if grep '\(Missing\|Added\) [^0]\|^Number of errors[^=]* = [^0]' \
|
||||||
|
i18n-report; \
|
||||||
|
then echo manual verification required; false; else true; fi
|
||||||
|
|
||||||
|
UTF-8: UnicodeData.txt EastAsianWidth.txt
|
||||||
|
UTF-8: utf8_gen.py
|
||||||
|
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||||
|
|
||||||
|
UTF-8-report: UTF-8 ../charmaps/UTF-8
|
||||||
|
UTF-8-report: utf8_compatibility.py
|
||||||
|
$(PYTHON3) ./utf8_compatibility.py -o ../charmaps/UTF-8 \
|
||||||
|
-n UTF-8 -a -m > $@
|
||||||
|
|
||||||
|
check-UTF-8: UTF-8-report
|
||||||
|
@if grep '^Total.*: [^0]' UTF-8-report; \
|
||||||
|
then echo manual verification required; false; else true; fi
|
||||||
|
|
||||||
|
|
||||||
|
.PHONY: downloads clean-downloads
|
||||||
|
downloads: $(DOWNLOADS)
|
||||||
|
clean-downloads:
|
||||||
|
-rm -f $(DOWNLOADS)
|
||||||
|
|
||||||
|
$(DOWNLOADS):
|
||||||
|
$(WGET) http://www.unicode.org/Public/$(UNICODE_VERSION)/ucd/$@
|
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
27268
localedata/unicode-gen/UnicodeData.txt
Normal file
File diff suppressed because it is too large
Load Diff
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
546
localedata/unicode-gen/ctype_compatibility.py
Executable file
@ -0,0 +1,546 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script is useful for checking the differences between
|
||||||
|
an old LC_CTYPE file /usr/share/i18n/locale/i18n and a
|
||||||
|
new one generated by gen_unicode_ctype.py
|
||||||
|
|
||||||
|
To see how it is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./ctype_compatibility.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
from ctype_compatibility_test_cases import TEST_CASES
|
||||||
|
|
||||||
|
def get_lines_from_file(filename):
|
||||||
|
'''Get all non-comment lines from a i18n file
|
||||||
|
|
||||||
|
Also merge all lines which are continued on the next line because
|
||||||
|
they end in “/” into a single line.
|
||||||
|
'''
|
||||||
|
with open(filename) as i18n_file:
|
||||||
|
current_line = ''
|
||||||
|
for line in i18n_file:
|
||||||
|
line = line.strip('\n')
|
||||||
|
if '%' in line:
|
||||||
|
if line.endswith('/'):
|
||||||
|
line = line[0:line.find('%')] + '/'
|
||||||
|
else:
|
||||||
|
line = line[0:line.find('%')]
|
||||||
|
line = line.strip()
|
||||||
|
if line.endswith('/'):
|
||||||
|
current_line += line[:-1]
|
||||||
|
else:
|
||||||
|
yield current_line + line
|
||||||
|
current_line = ''
|
||||||
|
if current_line: # file ends with a continuation line
|
||||||
|
yield current_line
|
||||||
|
|
||||||
|
def extract_character_classes(filename):
|
||||||
|
'''Get all Unicode code points for each character class from a file
|
||||||
|
|
||||||
|
Store these code points in a dictionary using the character classes
|
||||||
|
as keys and the list of code points in this character class as values.
|
||||||
|
|
||||||
|
In case of the character classes “toupper”, “tolower”, and “totitle”,
|
||||||
|
these area actually pairs of code points
|
||||||
|
'''
|
||||||
|
ctype_dict = {}
|
||||||
|
for line in get_lines_from_file(filename):
|
||||||
|
for char_class in [
|
||||||
|
'upper',
|
||||||
|
'lower',
|
||||||
|
'alpha',
|
||||||
|
'digit',
|
||||||
|
'outdigit',
|
||||||
|
'space',
|
||||||
|
'cntrl',
|
||||||
|
'punct',
|
||||||
|
'graph',
|
||||||
|
'print',
|
||||||
|
'xdigit',
|
||||||
|
'blank',
|
||||||
|
'combining',
|
||||||
|
'combining_level3',
|
||||||
|
'toupper',
|
||||||
|
'tolower',
|
||||||
|
'totitle']:
|
||||||
|
match = re.match(r'^('
|
||||||
|
+'(?:(?:class|map)\s+")'
|
||||||
|
+re.escape(char_class)+
|
||||||
|
'(?:";)\s+'
|
||||||
|
+'|'
|
||||||
|
+re.escape(char_class)+'\s+'
|
||||||
|
+')', line)
|
||||||
|
if match:
|
||||||
|
if char_class not in ctype_dict:
|
||||||
|
ctype_dict[char_class] = []
|
||||||
|
process_chars(
|
||||||
|
ctype_dict[char_class],
|
||||||
|
line[match.end():])
|
||||||
|
return ctype_dict
|
||||||
|
|
||||||
|
def process_chars(char_class_list, code_point_line):
|
||||||
|
'''
|
||||||
|
Extract Unicode values from code_point_line
|
||||||
|
and add to the list of code points in a character class
|
||||||
|
'''
|
||||||
|
for code_points in code_point_line.split(';'):
|
||||||
|
code_points = code_points.strip()
|
||||||
|
match = re.match(r'^<U(?P<codepoint>[0-9A-F]{4,8})>$', code_points)
|
||||||
|
if match: # <Uxxxx>
|
||||||
|
char_class_list.append(
|
||||||
|
int(match.group('codepoint'), 16))
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+'\.\.'+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||||
|
code_points)
|
||||||
|
if match: # <Uxxxx>..<Uxxxx>
|
||||||
|
for codepoint in range(
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16) + 1):
|
||||||
|
char_class_list.append(codepoint)
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+'\.\.\(2\)\.\.'+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>$',
|
||||||
|
code_points)
|
||||||
|
if match: # <Uxxxx>..(2)..<Uxxxx>
|
||||||
|
for codepoint in range(
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16) + 1,
|
||||||
|
2):
|
||||||
|
char_class_list.append(codepoint)
|
||||||
|
continue
|
||||||
|
match = re.match(
|
||||||
|
r'^\('
|
||||||
|
+'<U(?P<codepoint1>[0-9A-F]{4,8})>'
|
||||||
|
+','+
|
||||||
|
'<U(?P<codepoint2>[0-9A-F]{4,8})>'
|
||||||
|
+'\)$',
|
||||||
|
code_points)
|
||||||
|
if match: # (<Uxxxx>,<Uxxxx>)
|
||||||
|
char_class_list.append((
|
||||||
|
int(match.group('codepoint1'), 16),
|
||||||
|
int(match.group('codepoint2'), 16)))
|
||||||
|
continue
|
||||||
|
sys.stderr.write(
|
||||||
|
('None of the regexps matched '
|
||||||
|
+ 'code_points=%(cp)s in code_point_line=%(cpl)s\n') %{
|
||||||
|
'cp': code_points,
|
||||||
|
'cpl': code_point_line
|
||||||
|
})
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
def compare_lists(old_ctype_dict, new_ctype_dict):
|
||||||
|
'''Compare character classes in the old and the new LC_CTYPE'''
|
||||||
|
print('****************************************************')
|
||||||
|
print('Character classes which are only in the new '
|
||||||
|
+ 'or only in the old file:')
|
||||||
|
for char_class in sorted(old_ctype_dict):
|
||||||
|
if char_class not in new_ctype_dict:
|
||||||
|
print('Character class %s is in old ctype but not in new ctype'
|
||||||
|
%char_class)
|
||||||
|
for char_class in sorted(new_ctype_dict):
|
||||||
|
if char_class not in old_ctype_dict:
|
||||||
|
print('Character class %s is in new ctype but not in old ctype'
|
||||||
|
%char_class)
|
||||||
|
for char_class in sorted(old_ctype_dict):
|
||||||
|
print("****************************************************")
|
||||||
|
print("%s: %d chars in old ctype and %d chars in new ctype" %(
|
||||||
|
char_class,
|
||||||
|
len(old_ctype_dict[char_class]),
|
||||||
|
len(new_ctype_dict[char_class])))
|
||||||
|
print("----------------------------------------------------")
|
||||||
|
report(char_class,
|
||||||
|
old_ctype_dict[char_class],
|
||||||
|
new_ctype_dict[char_class])
|
||||||
|
|
||||||
|
def report_code_points(char_class, code_point_list, text=''):
|
||||||
|
'''Report all code points which have been added to or removed from a
|
||||||
|
character class.
|
||||||
|
'''
|
||||||
|
for code_point in sorted(code_point_list):
|
||||||
|
if type(code_point) == type(int()):
|
||||||
|
print('%(char_class)s: %(text)s: %(char)s %(code_point)s %(name)s'
|
||||||
|
%{'text': text,
|
||||||
|
'char': chr(code_point),
|
||||||
|
'char_class': char_class,
|
||||||
|
'code_point': hex(code_point),
|
||||||
|
'name': unicodedata.name(chr(code_point), 'name unknown')})
|
||||||
|
else:
|
||||||
|
print(('%(char_class)s: %(text)s: '
|
||||||
|
+ '%(char0)s → %(char1)s '
|
||||||
|
+ '%(code_point0)s → %(code_point1)s '
|
||||||
|
+ '%(name0)s → %(name1)s') %{
|
||||||
|
'text': text,
|
||||||
|
'char_class': char_class,
|
||||||
|
'char0': chr(code_point[0]),
|
||||||
|
'code_point0': hex(code_point[0]),
|
||||||
|
'name0': unicodedata.name(chr(code_point[0]), 'name unknown'),
|
||||||
|
'char1': chr(code_point[1]),
|
||||||
|
'code_point1': hex(code_point[1]),
|
||||||
|
'name1': unicodedata.name(chr(code_point[1]), 'name unknown')
|
||||||
|
})
|
||||||
|
|
||||||
|
def report(char_class, old_list, new_list):
|
||||||
|
'''Report the differences for a certain LC_CTYPE character class
|
||||||
|
between the old and the newly generated state
|
||||||
|
'''
|
||||||
|
missing_chars = list(set(old_list)-set(new_list))
|
||||||
|
print(('%(char_class)s: Missing %(number)d characters '
|
||||||
|
+ 'of old ctype in new ctype ')
|
||||||
|
%{'char_class': char_class, 'number': len(missing_chars)})
|
||||||
|
if ARGS.show_missing_characters:
|
||||||
|
report_code_points(char_class, missing_chars, 'Missing')
|
||||||
|
added_chars = list(set(new_list)-set(old_list))
|
||||||
|
print(('%(char_class)s: Added %(number)d characters '
|
||||||
|
+ 'in new ctype which were not in old ctype')
|
||||||
|
%{'char_class': char_class, 'number': len(added_chars)})
|
||||||
|
if ARGS.show_added_characters:
|
||||||
|
report_code_points(char_class, added_chars, 'Added')
|
||||||
|
|
||||||
|
|
||||||
|
def cperror(error_message, errorcounter=0):
|
||||||
|
'''Increase number of errors by one and print an error message'''
|
||||||
|
print(error_message)
|
||||||
|
return errorcounter + 1
|
||||||
|
|
||||||
|
def cpcheck(ctype_dict, code_point_list_with_ranges, char_classes, reason='',
|
||||||
|
errorcounter=0):
|
||||||
|
'''The parameter “code_point_list_with_ranges” is a list of
|
||||||
|
integers or pairs of integers, for example:
|
||||||
|
|
||||||
|
[0x0E31, (0x0E34, 0x0E3A), (0x0E47, 0x0E4E)]
|
||||||
|
|
||||||
|
where the pairs of integers stand for all the code points in the range
|
||||||
|
of the two integers given, including the two integers of the pair.
|
||||||
|
|
||||||
|
'''
|
||||||
|
for code_point_range in code_point_list_with_ranges:
|
||||||
|
for code_point in ([code_point_range]
|
||||||
|
if type(code_point_range) == type(int())
|
||||||
|
else range(code_point_range[0],
|
||||||
|
code_point_range[1]+1)):
|
||||||
|
for char_class_tuple in char_classes:
|
||||||
|
char_class = char_class_tuple[0]
|
||||||
|
in_char_class = char_class_tuple[1]
|
||||||
|
if (code_point in ctype_dict[char_class]) != in_char_class:
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(code_point)s %(char)s '
|
||||||
|
+ '%(char_class)s %(in)s: %(reason)s') %{
|
||||||
|
'code_point': hex(code_point),
|
||||||
|
'char': chr(code_point),
|
||||||
|
'char_class': char_class,
|
||||||
|
'in': not in_char_class,
|
||||||
|
'reason': reason},
|
||||||
|
errorcounter)
|
||||||
|
return errorcounter
|
||||||
|
|
||||||
|
def tests(ctype_dict, errorcounter = 0):
|
||||||
|
'''Test a LC_CTYPE character class dictionary for known errors'''
|
||||||
|
# copy the information from ctype_dict (which contains lists) in
|
||||||
|
# a new dictionary ctype_dict2 (which contains dictionaries).
|
||||||
|
# The checks below are easier with that type of data structure.
|
||||||
|
|
||||||
|
ctype_dict2 = {}
|
||||||
|
for key in ctype_dict:
|
||||||
|
ctype_dict2[key] = {}
|
||||||
|
if ctype_dict[key]:
|
||||||
|
if type(ctype_dict[key][0]) == type(int()):
|
||||||
|
for value in ctype_dict[key]:
|
||||||
|
ctype_dict2[key][value] = 1
|
||||||
|
else: # key is 'toupper', 'tolower', or 'totitle'
|
||||||
|
for value in ctype_dict[key]:
|
||||||
|
ctype_dict2[key][value[0]] = value[1]
|
||||||
|
|
||||||
|
for test_case in TEST_CASES:
|
||||||
|
errorcounter = cpcheck(ctype_dict2,
|
||||||
|
test_case[0],
|
||||||
|
test_case[1],
|
||||||
|
test_case[2],
|
||||||
|
errorcounter = errorcounter)
|
||||||
|
|
||||||
|
for code_point in range(0, 0x110000):
|
||||||
|
# toupper restriction: "Only characters specified for the keywords
|
||||||
|
# lower and upper shall be specified.
|
||||||
|
if (code_point in ctype_dict2['toupper']
|
||||||
|
and code_point != ctype_dict2['toupper'][code_point]
|
||||||
|
and not (code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(char1)s is not upper|lower '
|
||||||
|
+ 'but toupper(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||||
|
'char1': chr(code_point),
|
||||||
|
'cp1': hex(code_point),
|
||||||
|
'cp2': hex(ctype_dict2['toupper'][code_point]),
|
||||||
|
'char2': chr(ctype_dict2['toupper'][code_point])
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# tolower restriction: "Only characters specified for the keywords
|
||||||
|
# lower and upper shall be specified.
|
||||||
|
if (code_point in ctype_dict2['tolower']
|
||||||
|
and code_point != ctype_dict2['tolower'][code_point]
|
||||||
|
and not (code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
('error: %(char1)s is not upper|lower '
|
||||||
|
+ 'but tolower(%(cp1)s)=%(cp2)s (%(char2)s)') %{
|
||||||
|
'char1': chr(code_point),
|
||||||
|
'cp1': hex(code_point),
|
||||||
|
'cp2': hex(ctype_dict2['tolower'][code_point]),
|
||||||
|
'char2': chr(ctype_dict2['tolower'][code_point])
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# alpha restriction: "Characters classified as either upper or lower
|
||||||
|
# shall automatically belong to this class.
|
||||||
|
if ((code_point in ctype_dict2['lower']
|
||||||
|
or code_point in ctype_dict2['upper'])
|
||||||
|
and code_point not in ctype_dict2['alpha']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is upper|lower but not alpha' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# alpha restriction: "No character specified for the keywords cntrl,
|
||||||
|
# digit, punct or space shall be specified."
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['cntrl']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and cntrl' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['punct']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and punct' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['alpha']
|
||||||
|
and code_point in ctype_dict2['space']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is alpha and space' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# space restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, graph or xdigit shall be specified."
|
||||||
|
# upper, lower, alpha already checked above.
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['graph']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and graph' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['space']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is space and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# cntrl restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, punct, graph, print or xdigit shall be
|
||||||
|
# specified." upper, lower, alpha already checked above.
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['punct']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and punct' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['graph']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and graph' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['print']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and print' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['cntrl']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is cntrl and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# punct restriction: "No character specified for the keywords upper,
|
||||||
|
# lower, alpha, digit, cntrl, xdigit or as the <space> character shall
|
||||||
|
# be specified." upper, lower, alpha, cntrl already checked above.
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point in ctype_dict2['digit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct and digit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point in ctype_dict2['xdigit']):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct and xdigit' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point in ctype_dict2['punct']
|
||||||
|
and code_point == 0x0020):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is punct.' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
# graph restriction: "No character specified for the keyword cntrl
|
||||||
|
# shall be specified." Already checked above.
|
||||||
|
|
||||||
|
# print restriction: "No character specified for the keyword cntrl
|
||||||
|
# shall be specified." Already checked above.
|
||||||
|
|
||||||
|
# graph - print relation: differ only in the <space> character.
|
||||||
|
# How is this possible if there are more than one space character?!
|
||||||
|
# I think susv2/xbd/locale.html should speak of "space characters",
|
||||||
|
# not "space character".
|
||||||
|
if (code_point in ctype_dict2['print']
|
||||||
|
and not (code_point in ctype_dict2['graph']
|
||||||
|
or code_point in ctype_dict2['space'])):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s is print but not graph|space' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
if (code_point not in ctype_dict2['print']
|
||||||
|
and (code_point in ctype_dict2['graph']
|
||||||
|
or code_point == 0x0020)):
|
||||||
|
errorcounter = cperror(
|
||||||
|
'error: %(char)s %(cp)s graph|space but not print' %{
|
||||||
|
'char': chr(code_point),
|
||||||
|
'cp': hex(code_point)
|
||||||
|
},
|
||||||
|
errorcounter)
|
||||||
|
return errorcounter
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
PARSER = argparse.ArgumentParser(
|
||||||
|
description='''
|
||||||
|
Compare the contents of LC_CTYPE in two files and check for errors.
|
||||||
|
''')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-o', '--old_ctype_file',
|
||||||
|
nargs='?',
|
||||||
|
type=str,
|
||||||
|
default='i18n',
|
||||||
|
help='The old ctype file, default: %(default)s')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-n', '--new_ctype_file',
|
||||||
|
nargs='?',
|
||||||
|
type=str,
|
||||||
|
default='unicode-ctype',
|
||||||
|
help='The new ctype file, default: %(default)s')
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-a', '--show_added_characters',
|
||||||
|
action='store_true',
|
||||||
|
help=('Show characters which were added to each '
|
||||||
|
+ 'character class in detail.'))
|
||||||
|
PARSER.add_argument(
|
||||||
|
'-m', '--show_missing_characters',
|
||||||
|
action='store_true',
|
||||||
|
help=('Show characters which were removed from each '
|
||||||
|
+ 'character class in detail.'))
|
||||||
|
ARGS = PARSER.parse_args()
|
||||||
|
|
||||||
|
OLD_CTYPE_DICT = extract_character_classes(
|
||||||
|
ARGS.old_ctype_file)
|
||||||
|
NEW_CTYPE_DICT = extract_character_classes(
|
||||||
|
ARGS.new_ctype_file)
|
||||||
|
compare_lists(OLD_CTYPE_DICT, NEW_CTYPE_DICT)
|
||||||
|
print('============================================================')
|
||||||
|
print('Checking for errors in old ctype file: %s' %ARGS.old_ctype_file)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
NUMBER_OF_ERRORS_IN_OLD_FILE = tests(OLD_CTYPE_DICT, errorcounter = 0)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('Old file = %s' %ARGS.old_ctype_file)
|
||||||
|
print('Number of errors in old file = %s' %NUMBER_OF_ERRORS_IN_OLD_FILE)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('============================================================')
|
||||||
|
print('Checking for errors in new ctype file: %s' %ARGS.new_ctype_file)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
NUMBER_OF_ERRORS_IN_NEW_FILE = tests(NEW_CTYPE_DICT, errorcounter = 0)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
print('New file = %s' %ARGS.new_ctype_file)
|
||||||
|
print('Number of errors in new file = %s' %NUMBER_OF_ERRORS_IN_NEW_FILE)
|
||||||
|
print('------------------------------------------------------------')
|
||||||
|
if NUMBER_OF_ERRORS_IN_NEW_FILE > 0:
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
exit(0)
|
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
951
localedata/unicode-gen/ctype_compatibility_test_cases.py
Normal file
@ -0,0 +1,951 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This file contains a list of test cases used by
|
||||||
|
the ctype_compatibility.py script.
|
||||||
|
'''
|
||||||
|
|
||||||
|
TEST_CASES = [
|
||||||
|
[[0x0E2F, 0x0E46], [('alpha', True), ('punct', False)],
|
||||||
|
'''Theppitak Karoonboonyanan <thep@links.nectec.or.th> says
|
||||||
|
<U0E2F>, <U0E46> should belong to punct. DerivedCoreProperties.txt
|
||||||
|
says it is alpha. We trust DerivedCoreProperties.txt.'''
|
||||||
|
],
|
||||||
|
[[0x0E31, (0x0E34, 0x0E3A)], [('alpha', True)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E31>, <U0E34>..<U0E3A>
|
||||||
|
are alpha. DerivedCoreProperties.txt agrees.'''
|
||||||
|
],
|
||||||
|
[[(0x0E47, 0x0E4C), 0x0E4E], [('alpha', False)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||||
|
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||||
|
in that range is alphabetic, the others are *not*. We
|
||||||
|
trust DerivedCoreProperties.txt.'''
|
||||||
|
],
|
||||||
|
[[0x0E4D], [('alpha', True)],
|
||||||
|
'''gen-unicode-ctype.c: Theppitak Karoonboonyanan
|
||||||
|
<thep@links.nectec.or.th> says <U0E47>..<U0E4E> are
|
||||||
|
is_alpha. DerivedCoreProperties does says *only* <U0E4D>
|
||||||
|
in that range is alphabetic, the others are *not*. We
|
||||||
|
trust DerivedCoreProperties.txt.
|
||||||
|
'''
|
||||||
|
],
|
||||||
|
[[0x0345], [('alpha', True), ('lower', True)],
|
||||||
|
'''COMBINING GREEK YPOGEGRAMMENI
|
||||||
|
According to DerivedCoreProperties.txt, this is “Alphabetic”
|
||||||
|
and “Lowercase”.'''
|
||||||
|
],
|
||||||
|
[[(0x2160, 0x2188)], [('alpha', True)],
|
||||||
|
'''Roman Numerals are “Alphabetic” according to
|
||||||
|
DerivedCoreProperties.txt'''
|
||||||
|
],
|
||||||
|
[[(0x24B6, 0x24E9)], [('alpha', True)],
|
||||||
|
'''Circled Latin letters are “Alphabetic” according to
|
||||||
|
DerivedCoreProperties.txt'''
|
||||||
|
],
|
||||||
|
[[0x661], [('alpha', True), ('digit', False)],
|
||||||
|
'''gen-unicode-ctype.c: All non-ASCII digits should be alphabetic.
|
||||||
|
ISO C 99 forbids us to have them in category "digit", but we
|
||||||
|
want iswalnum to return true on them. Don’t forget to
|
||||||
|
have a look at all the other digits, 0x661 is just one
|
||||||
|
example tested here.'''
|
||||||
|
],
|
||||||
|
[[(0x0030, 0x0039)], [('digit', True)],
|
||||||
|
'''gen-unicode-ctype.c: All ASCII digits should be digits.'''
|
||||||
|
],
|
||||||
|
[[0x0009], [('blank', True)],
|
||||||
|
'''gen-unicode-ctype.c: CHARACTER TABULATION'''
|
||||||
|
],
|
||||||
|
[[0x2007], [('blank', False), ('space', False)],
|
||||||
|
'''gen-unicode-ctype.c: FIGURE SPACE, because it has <noBreak>
|
||||||
|
in the description.'''
|
||||||
|
],
|
||||||
|
[[0x0009, 0x000A, 0x000B, 0x000C, 0x000D], [('space', True)],
|
||||||
|
'''gen-unicode-ctype.c: CHARACTER TABULATION, LINE FEED (LF), LINE
|
||||||
|
TABULATION, ;FORM FEED (FF), CARRIAGE RETURN (CR)'''
|
||||||
|
],
|
||||||
|
[[0x2028, 0x2029], [('cntrl', True)],
|
||||||
|
'''gen-unicode-ctype.c: LINE SEPARATOR and PARAGRAPH SEPARATOR
|
||||||
|
should be cntrl.'''
|
||||||
|
],
|
||||||
|
[[(0x0030, 0x0039), (0x0041, 0x0046), (0x0061, 0x0066)],
|
||||||
|
[('xdigit', True)],
|
||||||
|
'''gen-unicode-ctype.c: ISO C 99 says (6.4.4.1): hexadecimal-digit:
|
||||||
|
one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F (nothing else
|
||||||
|
should be considered as a hexadecimal-digit)'''
|
||||||
|
],
|
||||||
|
[[0x0330], [('combining', True), ('combining_level3', False)],
|
||||||
|
'''gen-unicode-ctype.c: COMBINING TILDE BELOW, canonical combining
|
||||||
|
class value >= 200, should be in combining but not in
|
||||||
|
combining_level3'''
|
||||||
|
],
|
||||||
|
[[0x0250, 0x0251, 0x0271], [('lower', True)],
|
||||||
|
'''Should be lower in Unicode 7.0.0 (was not lower in
|
||||||
|
Unicode 5.0.0).
|
||||||
|
'''
|
||||||
|
],
|
||||||
|
[[0x2184], [('lower', True)],
|
||||||
|
'''Should be lower both in Unicode 5.0.0 and 7.0.0'''
|
||||||
|
],
|
||||||
|
[[0xA67F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0xa67f CYRILLIC PAYEROK. Not in Unicode 5.0.0. In Unicode
|
||||||
|
7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0xA60C], [('punct', False), ('alpha', True)],
|
||||||
|
'''0xa60c VAI SYLLABLE LENGTHENER. Not in Unicode 5.0.0.
|
||||||
|
In Unicode 7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x2E2F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x2E2F VERTICAL TILDE. Not in Unicode 5.0.0. In Unicode
|
||||||
|
7.0.0. General category Lm (Letter
|
||||||
|
modifier). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by mistake in
|
||||||
|
glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[(0x1090, 0x1099)], [('punct', False), ('alpha', True)],
|
||||||
|
'''MYANMAR SHAN DIGIT ZERO - MYANMAR SHAN DIGIT NINE.
|
||||||
|
These are digits, but because ISO C 99 forbids to
|
||||||
|
put them into digit they should go into alpha.'''
|
||||||
|
],
|
||||||
|
[[0x103F], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x103F MYANMAR LETTER GREAT SA. Not in Unicode 5.0.0.
|
||||||
|
In Unicode 7.0.0. General category Lo
|
||||||
|
(Other_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”. Apparently added manually to punct by
|
||||||
|
mistake in glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x0374], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x0374 GREEK NUMERAL SIGN. Unicode 5.0.0: general category
|
||||||
|
Sk. Unicode 7.0.0: General category Lm
|
||||||
|
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x02EC], [('punct', False), ('alpha', True)],
|
||||||
|
'''0x02EC MODIFIER LETTER VOICING. Unicode 5.0.0: general category
|
||||||
|
Sk. Unicode 7.0.0: General category Lm
|
||||||
|
(Modifier_Letter). DerivedCoreProperties.txt says it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x180E], [('space', False), ('blank', False)],
|
||||||
|
'''0x180e MONGOLIAN VOWEL SEPARATOR. Unicode 5.0.0: General
|
||||||
|
category Zs (Space_Separator) Unicode 7.0.0: General category Cf
|
||||||
|
(Format).'''
|
||||||
|
],
|
||||||
|
[[0x1E9C, 0x1E9D, 0x1E9F],
|
||||||
|
[('lower', True), ('upper', False), ('tolower', False),
|
||||||
|
('toupper', False), ('totitle', False)],
|
||||||
|
'''ẜ 0x1e9c LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE,
|
||||||
|
ẝ 0x1e9d LATIN SMALL LETTER LONG S WITH HIGH STROKE,
|
||||||
|
ẟ 0x1e9f LATIN SMALL LETTER DELTA. These are “Lowercase”
|
||||||
|
according to DerivedCoreProperties.txt but no upper case versions
|
||||||
|
exist.'''
|
||||||
|
],
|
||||||
|
[[0x1E9E],
|
||||||
|
[('lower', False), ('upper', True), ('tolower', True),
|
||||||
|
('toupper', False), ('totitle', False)],
|
||||||
|
'''0x1E9E ẞ LATIN CAPITAL LETTER SHARP S This is “Uppercase”
|
||||||
|
according to DerivedCoreProperties.txt and the lower case
|
||||||
|
version is 0x00DF ß LATIN SMALL LETTER SHARP S.'''
|
||||||
|
],
|
||||||
|
[[0x2188],
|
||||||
|
[('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''0x2188 ROMAN NUMERAL ONE HUNDRED THOUSAND. This is “Alphabetic”
|
||||||
|
according to DerivedCoreProperties.txt. In glibc’s old
|
||||||
|
LC_CTYPE, it was in “lower”, which seems to be a
|
||||||
|
mistake. It is not “Lowercase” in
|
||||||
|
DerivedCoreProperties.txt and does not have case mappings
|
||||||
|
in UnicodeData.txt either.'''
|
||||||
|
],
|
||||||
|
[[0x2C71, 0x2C74, (0x2C77, 0x2C7A)],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small letters which were not in Unicode 5.0.0
|
||||||
|
but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0xA730, 0xA731],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small “capital” letters which were not in
|
||||||
|
Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[(0xA771, 0xA778)],
|
||||||
|
[('alpha', True), ('lower', True), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are Latin small (or small “capital”) letters which
|
||||||
|
were not in Unicode 5.0.0 but are in Unicode 7.0.0. According to
|
||||||
|
DerivedCoreProperties.txt they are “Lowercase”. But no
|
||||||
|
uppercase versions exist. They have apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE.'''
|
||||||
|
],
|
||||||
|
[[0x0375],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''“0375;GREEK LOWER NUMERAL SIGN;Sk;0;ON;;;;;N;;;;;”. Has
|
||||||
|
apparently been added manually to glibc’s old LC_CTYPE as
|
||||||
|
“combining_level3”. That seems wrong, it is no combining
|
||||||
|
character because it does not have one of the general
|
||||||
|
categories Mn, Mc, or Me. According to
|
||||||
|
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x108D],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''“108D;MYANMAR SIGN SHAN COUNCIL EMPHATIC
|
||||||
|
TONE;Mn;220;NSM;;;;;N;;;;;”. Has apparently been added
|
||||||
|
manually to glibc’s old LC_CTYPE as
|
||||||
|
“combining_level3”. That seems wrong, although it is a
|
||||||
|
combining character because it has the general category
|
||||||
|
Mn, it is not “combining_level3” because the canonical
|
||||||
|
combining class value is 220 which is >= 200. According to
|
||||||
|
gen-unicode-ctype.c, “combining_level3” needs a
|
||||||
|
canonical combining class value < 200. According to
|
||||||
|
DerivedCoreProperties.txt it is not “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x06DE],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
''' UnicodeData.txt 5.0.0: “06DE;ARABIC START OF RUB EL
|
||||||
|
HIZB;Me;0;NSM;;;;;N;;;;;”; UnicodeData.txt 7.0.0:
|
||||||
|
“06DE;ARABIC START OF RUB EL
|
||||||
|
HIZB;So;0;ON;;;;;N;;;;;”. I.e. this used to be a
|
||||||
|
combining character in Unicode 5.0.0 but not anymore in
|
||||||
|
7.0.0. According to DerivedCoreProperties.txt it is not
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BD0],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||||
|
“0BD0;TAMIL OM;Lo;0;L;;;;;N;;;;;”. Apparently manually added to
|
||||||
|
“combining” and “combining_level3” in glibc’s old
|
||||||
|
LC_CTYPE. That seems wrong. According to
|
||||||
|
DerivedCoreProperties.txt it is “Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x103F],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Not in UnicodeData.txt 5.0.0. UnicodeData.txt 7.0.0:
|
||||||
|
“103F;MYANMAR LETTER GREAT SA;Lo;0;L;;;;;N;;;;;”.
|
||||||
|
Apparently manually added to “combining” and
|
||||||
|
“combining_level3” in glibc’s old LC_CTYPE. That seems
|
||||||
|
wrong. According to DerivedCoreProperties.txt it is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0901, 0x0903)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These have general category “Mn” i.e. these are combining
|
||||||
|
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||||
|
“0901;DEVANAGARI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”,
|
||||||
|
”0902;DEVANAGARI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”,
|
||||||
|
“0903;DEVANAGARI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x093C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''UnicodeData.txt (5.0.0 and 7.0.0): “093C;DEVANAGARI SIGN
|
||||||
|
NUKTA;Mn;7;NSM;;;;;N;;;;;” According to
|
||||||
|
DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”. glibc’s old LC_TYPE has this in “alpha”.'''
|
||||||
|
],
|
||||||
|
[[(0x093E, 0x093F)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These have general category “Mc” i.e. these are combining
|
||||||
|
characters (both in UnicodeData.txt 5.0.0 and 7.0.0):
|
||||||
|
“093E;DEVANAGARI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“093F;DEVANAGARI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0940, 0x094C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''These are all combining
|
||||||
|
characters (“Mc” or “Mn” both in UnicodeData.txt 5.0.0 and 7.0.0).
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x094D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
“094D;DEVANAGARI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0951, 0x0954)],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0962, 0x0963), (0x0981, 0x0983)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09BC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09BC;BENGALI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x09BE, 0x09BF), (0x09C0, 0x09C4), (0x09C7, 0x09C8),
|
||||||
|
(0x09CB, 0x09CC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09BE;BENGALI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09BF;BENGALI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C0;BENGALI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C1;BENGALI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C2;BENGALI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C3;BENGALI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C4;BENGALI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“09C7;BENGALI VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09C8;BENGALI VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“09CB;BENGALI VOWEL SIGN O;Mc;0;L;09C7 09BE;;;;N;;;;;”
|
||||||
|
“09CC;BENGALI VOWEL SIGN AU;Mc;0;L;09C7 09D7;;;;N;;;;;”
|
||||||
|
Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09CD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09CD;BENGALI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
Combining character, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) it is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09D7, (0x09E2, 0x09E3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''Combining characters, both in UnicodeData.txt 5.0.0 and 7.0.0.
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x09F2, 0x09F3],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09F2;BENGALI RUPEE MARK;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
“09F3;BENGALI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x09F4, 0x09FA)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“09F4;BENGALI CURRENCY NUMERATOR ONE;No;0;L;;;;1/16;N;;;;;”
|
||||||
|
“09F5;BENGALI CURRENCY NUMERATOR TWO;No;0;L;;;;1/8;N;;;;;”
|
||||||
|
“09F6;BENGALI CURRENCY NUMERATOR THREE;No;0;L;;;;3/16;N;;;;;”
|
||||||
|
“09F7;BENGALI CURRENCY NUMERATOR FOUR;No;0;L;;;;1/4;N;;;;;”
|
||||||
|
“09F8;BENGALI CURRENCY NUMERATOR ONE LESS THAN THE DENOMINATOR;
|
||||||
|
No;0;L;;;;3/4;N;;;;;”
|
||||||
|
“09F9;BENGALI CURRENCY DENOMINATOR SIXTEEN;No;0;L;;;;16;N;;;;;”
|
||||||
|
“09FA;BENGALI ISSHAR;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0A01, 0x0A03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A01;GURMUKHI SIGN ADAK BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A02;GURMUKHI SIGN BINDI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A03;GURMUKHI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A3C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A3C;GURMUKHI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0A3E, 0x0A40), (0x0A41, 0x0A42), (0x0A47, 0x0A48),
|
||||||
|
(0x0A4B, 0x0A4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A3E;GURMUKHI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A3F;GURMUKHI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A40;GURMUKHI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0A41;GURMUKHI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A42;GURMUKHI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A47;GURMUKHI VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A48;GURMUKHI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A4B;GURMUKHI VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A4C;GURMUKHI VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0A51, (0x0A70, 0x0A71), 0x0A75, (0x0A81, 0x0A83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0A4D;GURMUKHI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
“0A70;GURMUKHI TIPPI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A71;GURMUKHI ADDAK;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A75;GURMUKHI SIGN YAKASH;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A81;GUJARATI SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A82;GUJARATI SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0A83;GUJARATI SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0ABC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ABC;GUJARATI SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0ABE, 0x0AC5), (0x0AC7, 0x0AC9), (0x0ACB, 0x0ACC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ABE;GUJARATI VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ABF;GUJARATI VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0AC0;GUJARATI VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0AC1;GUJARATI VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC2;GUJARATI VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC3;GUJARATI VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC4;GUJARATI VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC5;GUJARATI VOWEL SIGN CANDRA E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC7;GUJARATI VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC8;GUJARATI VOWEL SIGN AI;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AC9;GUJARATI VOWEL SIGN CANDRA O;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ACB;GUJARATI VOWEL SIGN O;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0ACC;GUJARATI VOWEL SIGN AU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0ACD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0ACD;GUJARATI SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0AE2, 0x0AE3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0AE2;GUJARATI VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0AE3;GUJARATI VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0AF1],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0AF1;GUJARATI RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B01, 0x0B03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B01;ORIYA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B02;ORIYA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B03;ORIYA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B3C],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B3C;ORIYA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B3E, 0x0B44), (0x0B47, 0x0B48), (0x0B4B, 0x0B4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B3E;ORIYA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B3F;ORIYA VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B40;ORIYA VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B41;ORIYA VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B42;ORIYA VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B43;ORIYA VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B44;ORIYA VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B47;ORIYA VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B48;ORIYA VOWEL SIGN AI;Mc;0;L;0B47 0B56;;;;N;;;;;”
|
||||||
|
“0B4B;ORIYA VOWEL SIGN O;Mc;0;L;0B47 0B3E;;;;N;;;;;”
|
||||||
|
“0B4C;ORIYA VOWEL SIGN AU;Mc;0;L;0B47 0B57;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B4D;ORIYA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0B56, 0x0B57), (0x0B62, 0x0B63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B56;ORIYA AI LENGTH MARK;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B57;ORIYA AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0B62;ORIYA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0B63;ORIYA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B70],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B70;ORIYA ISSHAR;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0B82],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0B82;TAMIL SIGN ANUSVARA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0BBE, 0x0BC2), (0x0BC6, 0x0BC8), (0x0BCA, 0x0BCC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BBE;TAMIL VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BBF;TAMIL VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC0;TAMIL VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0BC1;TAMIL VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC2;TAMIL VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC6;TAMIL VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC7;TAMIL VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BC8;TAMIL VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0BCA;TAMIL VOWEL SIGN O;Mc;0;L;0BC6 0BBE;;;;N;;;;;”
|
||||||
|
“0BCB;TAMIL VOWEL SIGN OO;Mc;0;L;0BC7 0BBE;;;;N;;;;;”
|
||||||
|
“0BCC;TAMIL VOWEL SIGN AU;Mc;0;L;0BC6 0BD7;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BCD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BCD;TAMIL SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0BD7],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BD7;TAMIL AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0BF0, 0x0BFA)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0BF0;TAMIL NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||||
|
“0BF1;TAMIL NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||||
|
“0BF2;TAMIL NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||||
|
“0BF3;TAMIL DAY SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF4;TAMIL MONTH SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF5;TAMIL YEAR SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF6;TAMIL DEBIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF7;TAMIL CREDIT SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF8;TAMIL AS ABOVE SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
“0BF9;TAMIL RUPEE SIGN;Sc;0;ET;;;;;N;;;;;”
|
||||||
|
“0BFA;TAMIL NUMBER SIGN;So;0;ON;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) this is *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C01, 0x0C03)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C01;TELUGU SIGN CANDRABINDU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C02;TELUGU SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C03;TELUGU SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C3E, 0x0C44), (0x0C46, 0x0C48), (0x0C4A, 0x0C4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C3E;TELUGU VOWEL SIGN AA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C3F;TELUGU VOWEL SIGN I;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C40;TELUGU VOWEL SIGN II;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C41;TELUGU VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C42;TELUGU VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C43;TELUGU VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C44;TELUGU VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C46;TELUGU VOWEL SIGN E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C47;TELUGU VOWEL SIGN EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C48;TELUGU VOWEL SIGN AI;Mn;0;NSM;0C46 0C56;;;;N;;;;;”
|
||||||
|
“0C4A;TELUGU VOWEL SIGN O;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C4B;TELUGU VOWEL SIGN OO;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C4C;TELUGU VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0C4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C4D;TELUGU SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C55, 0x0C56), (0x0C62, 0x0C63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C55;TELUGU LENGTH MARK;Mn;84;NSM;;;;;N;;;;;”
|
||||||
|
“0C56;TELUGU AI LENGTH MARK;Mn;91;NSM;;;;;N;;;;;”
|
||||||
|
“0C62;TELUGU VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C63;TELUGU VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C78, 0x0C7F)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C78;TELUGU FRACTION DIGIT ZERO FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;0;N;;;;;”
|
||||||
|
“0C79;TELUGU FRACTION DIGIT ONE FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;1;N;;;;;”
|
||||||
|
“0C7A;TELUGU FRACTION DIGIT TWO FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;2;N;;;;;”
|
||||||
|
“0C7B;TELUGU FRACTION DIGIT THREE FOR ODD POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;3;N;;;;;”
|
||||||
|
“0C7C;TELUGU FRACTION DIGIT ONE FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;1;N;;;;;”
|
||||||
|
“0C7D;TELUGU FRACTION DIGIT TWO FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;2;N;;;;;”
|
||||||
|
“0C7E;TELUGU FRACTION DIGIT THREE FOR EVEN POWERS OF FOUR;
|
||||||
|
No;0;ON;;;;3;N;;;;;”
|
||||||
|
“0C7F;TELUGU SIGN TUUMU;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0C82, 0x0C83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0C81;KANNADA SIGN CANDRABINDU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0C82;KANNADA SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0C83;KANNADA SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0CBC],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CBC;KANNADA SIGN NUKTA;Mn;7;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0CBE, 0x0CC4), (0x0CC6, 0x0CC8), (0x0CCA, 0x0CCC)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CBE;KANNADA VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CBF;KANNADA VOWEL SIGN I;Mn;0;L;;;;;N;;;;;”
|
||||||
|
“0CC0;KANNADA VOWEL SIGN II;Mc;0;L;0CBF 0CD5;;;;N;;;;;”
|
||||||
|
“0CC1;KANNADA VOWEL SIGN U;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC2;KANNADA VOWEL SIGN UU;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC3;KANNADA VOWEL SIGN VOCALIC R;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC4;KANNADA VOWEL SIGN VOCALIC RR;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0CC6;KANNADA VOWEL SIGN E;Mn;0;L;;;;;N;;;;;”
|
||||||
|
“0CC7;KANNADA VOWEL SIGN EE;Mc;0;L;0CC6 0CD5;;;;N;;;;;”
|
||||||
|
“0CC8;KANNADA VOWEL SIGN AI;Mc;0;L;0CC6 0CD6;;;;N;;;;;”
|
||||||
|
“0CCA;KANNADA VOWEL SIGN O;Mc;0;L;0CC6 0CC2;;;;N;;;;;”
|
||||||
|
“0CCB;KANNADA VOWEL SIGN OO;Mc;0;L;0CCA 0CD5;;;;N;;;;;”
|
||||||
|
“0CCC;KANNADA VOWEL SIGN AU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0CCD],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0CCD;KANNADA SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0CD5, 0x0CD6), (0x0CE2, 0x0CE3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
0CD5;KANNADA LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||||
|
0CD6;KANNADA AI LENGTH MARK;Mc;0;L;;;;;N;;;;;
|
||||||
|
0CE2;KANNADA VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;
|
||||||
|
0CE3;KANNADA VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D02, 0x0D03), (0x0D3E, 0x0D44), (0x0D46, 0x0D48),
|
||||||
|
(0x0D4A, 0x0D4C)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D02;MALAYALAM SIGN ANUSVARA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D03;MALAYALAM SIGN VISARGA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D3E;MALAYALAM VOWEL SIGN AA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D3F;MALAYALAM VOWEL SIGN I;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D40;MALAYALAM VOWEL SIGN II;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D41;MALAYALAM VOWEL SIGN U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D42;MALAYALAM VOWEL SIGN UU;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D43;MALAYALAM VOWEL SIGN VOCALIC R;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D44;MALAYALAM VOWEL SIGN VOCALIC RR;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D46;MALAYALAM VOWEL SIGN E;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D47;MALAYALAM VOWEL SIGN EE;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D48;MALAYALAM VOWEL SIGN AI;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D4A;MALAYALAM VOWEL SIGN O;Mc;0;L;0D46 0D3E;;;;N;;;;;”
|
||||||
|
“0D4B;MALAYALAM VOWEL SIGN OO;Mc;0;L;0D47 0D3E;;;;N;;;;;”
|
||||||
|
“0D4C;MALAYALAM VOWEL SIGN AU;Mc;0;L;0D46 0D57;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0D4D],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D4D;MALAYALAM SIGN VIRAMA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0D57, (0x0D62, 0x0D63)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D57;MALAYALAM AU LENGTH MARK;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D62;MALAYALAM VOWEL SIGN VOCALIC L;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0D63;MALAYALAM VOWEL SIGN VOCALIC LL;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D70, 0x0D79)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D70;MALAYALAM NUMBER TEN;No;0;L;;;;10;N;;;;;”
|
||||||
|
“0D71;MALAYALAM NUMBER ONE HUNDRED;No;0;L;;;;100;N;;;;;”
|
||||||
|
“0D72;MALAYALAM NUMBER ONE THOUSAND;No;0;L;;;;1000;N;;;;;”
|
||||||
|
“0D73;MALAYALAM FRACTION ONE QUARTER;No;0;L;;;;1/4;N;;;;;”
|
||||||
|
“0D74;MALAYALAM FRACTION ONE HALF;No;0;L;;;;1/2;N;;;;;”
|
||||||
|
“0D75;MALAYALAM FRACTION THREE QUARTERS;No;0;L;;;;3/4;N;;;;;”
|
||||||
|
“0D79;MALAYALAM DATE MARK;So;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0D82, 0x0D83)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0D82;SINHALA SIGN ANUSVARAYA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0D83;SINHALA SIGN VISARGAYA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0DCA],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DCA;SINHALA SIGN AL-LAKUNA;Mn;9;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0x0DCF, 0x0DD4), 0x0DD6, (0x0DD8, 0x0DDF), (0x0DF2, 0x0DF3)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DCF;SINHALA VOWEL SIGN AELA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD0;SINHALA VOWEL SIGN KETTI AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD1;SINHALA VOWEL SIGN DIGA AEDA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD2;SINHALA VOWEL SIGN KETTI IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD3;SINHALA VOWEL SIGN DIGA IS-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD4;SINHALA VOWEL SIGN KETTI PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD6;SINHALA VOWEL SIGN DIGA PAA-PILLA;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“0DD8;SINHALA VOWEL SIGN GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DD9;SINHALA VOWEL SIGN KOMBUVA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DDA;SINHALA VOWEL SIGN DIGA KOMBUVA;Mc;0;L;0DD9 0DCA;;;;N;;;;;”
|
||||||
|
“0DDB;SINHALA VOWEL SIGN KOMBU DEKA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DDC;SINHALA VOWEL SIGN KOMBUVA HAA AELA-PILLA;
|
||||||
|
Mc;0;L;0DD9 0DCF;;;;N;;;;;”
|
||||||
|
“0DDD;SINHALA VOWEL SIGN KOMBUVA HAA DIGA AELA-PILLA;
|
||||||
|
Mc;0;L;0DDC 0DCA;;;;N;;;;;”
|
||||||
|
“0DDE;SINHALA VOWEL SIGN KOMBUVA HAA GAYANUKITTA;
|
||||||
|
Mc;0;L;0DD9 0DDF;;;;N;;;;;”
|
||||||
|
“0DDF;SINHALA VOWEL SIGN GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DF2;SINHALA VOWEL SIGN DIGA GAETTA-PILLA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
“0DF3;SINHALA VOWEL SIGN DIGA GAYANUKITTA;Mc;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[0x0DF4],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“0DF4;SINHALA PUNCTUATION KUNDDALIYA;Po;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA789, 0xA78A)],
|
||||||
|
[('combining', False), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A789;MODIFIER LETTER COLON;Sk;0;L;;;;;N;;;;;”
|
||||||
|
“A78A;MODIFIER LETTER SHORT EQUALS SIGN;Sk;0;L;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA926, 0xA92A)],
|
||||||
|
[('combining', True), ('combining_level3', True),
|
||||||
|
('alpha', True), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A926;KAYAH LI VOWEL UE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A927;KAYAH LI VOWEL E;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A928;KAYAH LI VOWEL U;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A929;KAYAH LI VOWEL EE;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
“A92A;KAYAH LI VOWEL O;Mn;0;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are
|
||||||
|
“Alphabetic”.'''
|
||||||
|
],
|
||||||
|
[[(0xA92B, 0xA92D)],
|
||||||
|
[('combining', True), ('combining_level3', False),
|
||||||
|
('alpha', False), ('lower', False), ('upper', False),
|
||||||
|
('tolower', False), ('toupper', False), ('totitle', False)],
|
||||||
|
'''
|
||||||
|
“A92B;KAYAH LI TONE PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
“A92C;KAYAH LI TONE CALYA;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
“A92D;KAYAH LI TONE CALYA PLOPHU;Mn;220;NSM;;;;;N;;;;;”
|
||||||
|
According to DerivedCoreProperties.txt (7.0.0) these are *not*
|
||||||
|
“Alphabetic”.'''
|
||||||
|
]
|
||||||
|
]
|
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
751
localedata/unicode-gen/gen_unicode_ctype.py
Executable file
@ -0,0 +1,751 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
#
|
||||||
|
# Generate a Unicode conforming LC_CTYPE category from a UnicodeData file.
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
# Based on gen-unicode-ctype.c by Bruno Haible <haible@clisp.cons.org>, 2000.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
Generate a Unicode conforming LC_CTYPE category from UnicodeData.txt and
|
||||||
|
DerivedCoreProperties.txt files.
|
||||||
|
|
||||||
|
To see how this script is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./gen_unicode_ctype.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Dictionary holding the entire contents of the UnicodeData.txt file
# (filled in by fill_attributes()).
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the DerivedCoreProperties.txt
# file (filled in by fill_derived_core_properties()).
#
# Contents of this dictionary look like this:
#
# {917504: ['Default_Ignorable_Code_Point'],
#  917505: ['Case_Ignorable', 'Default_Ignorable_Code_Point'],
#  …
# }
DERIVED_CORE_PROPERTIES = {}
|
||||||
|
|
||||||
|
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.

    '''
    def hex_or_none(text):
        # Case mapping fields are either a hexadecimal code point or empty.
        return int(text, 16) if text else None

    attribute = dict(zip(
        ('name',           # Character name
         'category',       # General category
         'combining',      # Canonical combining classes
         'bidi',           # Bidirectional category
         'decomposition',  # Character decomposition mapping
         'decdigit',       # Decimal digit value
         'digit',          # Digit value
         'numeric',        # Numeric value
         'mirrored',       # mirrored
         'oldname',        # Old Unicode 1.0 name
         'comment'),       # comment
        fields[1:12]))
    attribute['upper'] = hex_or_none(fields[12])  # Uppercase mapping
    attribute['lower'] = hex_or_none(fields[13])  # Lowercase mapping
    attribute['title'] = hex_or_none(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = attribute
|
||||||
|
|
||||||
|
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
    '''
    with open(filename, mode='r') as unicode_data_file:
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            # Every UnicodeData.txt line has exactly 15
            # semicolon-separated fields.
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                # First line of a code point range.  Strip the “<” and
                # the “, First>” part from the name, e.g.
                # “<CJK Ideograph, First>” becomes “CJK Ideograph”.
                fields_start = fields
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                # Last line of a code point range.  Strip the “<” and
                # the “, Last>” part from the name as above.
                fields[1] = fields[1].split(',')[0][1:]
                # Everything except the code point itself must match the
                # first line of the range.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Give every code point in the range the same attributes.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            # Ordinary line for a single code point.
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
|
||||||
|
|
||||||
|
def fill_derived_core_properties(filename):
    '''Stores the entire contents of the DerivedCoreProperties.txt file
    in the DERIVED_CORE_PROPERTIES dictionary.

    Lines in DerivedCoreProperties.txt are either a code point range like
    this:

    0061..007A ; Lowercase # L& [26] LATIN SMALL LETTER A..LATIN SMALL LETTER Z

    or a single code point like this:

    00AA ; Lowercase # Lo FEMININE ORDINAL INDICATOR

    '''
    with open(filename, mode='r') as derived_core_properties_file:
        for line in derived_core_properties_file:
            match = re.match(
                r'^(?P<codepoint1>[0-9A-F]{4,6})'
                + r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
                + r'\s*;\s*(?P<property>[a-zA-Z_]+)',
                line)
            if not match:
                # Comment lines and blank lines carry no properties.
                continue
            start = match.group('codepoint1')
            end = match.group('codepoint2')
            if not end:
                # A single code point is a range of length one.
                end = start
            # The property name is the same for the whole range, so look
            # it up once instead of once per code point in the range.
            prop = match.group('property')
            for code_point in range(int(start, 16), int(end, 16)+1):
                DERIVED_CORE_PROPERTIES.setdefault(code_point, []).append(prop)
|
||||||
|
|
||||||
|
def to_upper(code_point):
    '''Returns the code point of the uppercase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple uppercase
    # mapping map to themselves.
    if entry['name'] and entry['upper']:
        return entry['upper']
    return code_point
|
||||||
|
|
||||||
|
def to_lower(code_point):
    '''Returns the code point of the lowercase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple lowercase
    # mapping map to themselves.
    if entry['name'] and entry['lower']:
        return entry['lower']
    return code_point
|
||||||
|
|
||||||
|
def to_title(code_point):
    '''Returns the code point of the titlecase version
    of the given code point'''
    entry = UNICODE_ATTRIBUTES[code_point]
    # Unnamed code points and code points without a simple titlecase
    # mapping map to themselves.
    if entry['name'] and entry['title']:
        return entry['title']
    return code_point
|
||||||
|
|
||||||
|
def is_upper(code_point):
    '''Checks whether the character with this code point is uppercase'''
    # A code point is uppercase if it has a simple lowercase mapping …
    if to_lower(code_point) != code_point:
        return True
    # … or if DerivedCoreProperties.txt lists it as “Uppercase”.
    return 'Uppercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
|
||||||
|
|
||||||
|
def is_lower(code_point):
    '''Checks whether the character with this code point is lowercase'''
    # Some characters are defined as “Lowercase” in
    # DerivedCoreProperties.txt but do not have a mapping to upper
    # case. For example, ꜰ U+A72F “LATIN LETTER SMALL CAPITAL F” is
    # one of these.
    if to_upper(code_point) != code_point:
        return True
    # <U00DF> is lowercase, but without simple to_upper mapping.
    if code_point == 0x00DF:
        return True
    return 'Lowercase' in DERIVED_CORE_PROPERTIES.get(code_point, [])
|
||||||
|
|
||||||
|
def is_alpha(code_point):
    '''Checks whether the character with this code point is alphabetic'''
    if 'Alphabetic' in DERIVED_CORE_PROPERTIES.get(code_point, []):
        return True
    # Consider all the non-ASCII digits as alphabetic.
    # ISO C 99 forbids us to have them in category “digit”,
    # but we want iswalnum to return true on them.
    return (UNICODE_ATTRIBUTES[code_point]['category'] == 'Nd'
            and not 0x0030 <= code_point <= 0x0039)
|
||||||
|
|
||||||
|
def is_digit(code_point):
    '''Checks whether the character with this code point is a digit'''
    # An alternative definition would have been “any code point of
    # Unicode category Nd”.
    # Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
    # a zero.  Must add <0> in front of them by hand.
    #
    # SUSV2 gives us some freedom for the "digit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.5:
    #    The iswdigit function tests for any wide character that
    #    corresponds to a decimal-digit character (as defined in 5.2.1).
    # 5.2.1:
    #    the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
    return 0x0030 <= code_point <= 0x0039
|
||||||
|
|
||||||
|
def is_outdigit(code_point):
    '''Checks whether the character with this code point is outdigit'''
    # Only the ASCII digits 0-9 are outdigits.
    return code_point in range(0x0030, 0x003A)
|
||||||
|
|
||||||
|
def is_blank(code_point):
    '''Checks whether the character with this code point is blank'''
    if code_point == 0x0009:  # '\t'
        return True
    # Otherwise: category Zs without mention of '<noBreak>'
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and attrs['category'] == 'Zs'
            and '<noBreak>' not in attrs['decomposition'])
|
||||||
|
|
||||||
|
def is_space(code_point):
    '''Checks whether the character with this code point is a space'''
    # Don’t make U+00A0 a space. Non-breaking space means that all programs
    # should treat it like a punctuation character, not like a space.
    #        ' '     '\f'    '\n'    '\r'    '\t'    '\v'
    if code_point in (0x0020, 0x000C, 0x000A, 0x000D, 0x0009, 0x000B):
        return True
    # Otherwise: categories Zl, Zp, and Zs without mention of "<noBreak>"
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and
            (attrs['category'] in ['Zl', 'Zp']
             or
             (attrs['category'] in ['Zs']
              and
              '<noBreak>' not in attrs['decomposition'])))
|
||||||
|
|
||||||
|
def is_cntrl(code_point):
    '''Checks whether the character with this code point is
    a control character'''
    attrs = UNICODE_ATTRIBUTES[code_point]
    name = attrs['name']
    # Control characters proper plus the line and paragraph separators.
    return (name
            and (name == '<control>'
                 or attrs['category'] in ['Zl', 'Zp']))
|
||||||
|
|
||||||
|
def is_xdigit(code_point):
    '''Checks whether the character with this code point is
    a hexadecimal digit'''
    # SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
    # takes it away:
    # 7.25.2.1.12:
    #    The iswxdigit function tests for any wide character that
    #    corresponds to a hexadecimal-digit character (as defined
    #    in 6.4.4.1).
    # 6.4.4.1:
    #    hexadecimal-digit: one of
    #    0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
    return (0x0030 <= code_point <= 0x0039      # 0-9
            or 0x0041 <= code_point <= 0x0046   # A-F
            or 0x0061 <= code_point <= 0x0066)  # a-f
|
||||||
|
|
||||||
|
def is_graph(code_point):
    '''Checks whether the character with this code point is
    a graphical character'''
    # Named, not the <control> placeholder, and not a space.
    name = UNICODE_ATTRIBUTES[code_point]['name']
    return (name
            and name != '<control>'
            and not is_space(code_point))
|
||||||
|
|
||||||
|
def is_print(code_point):
    '''Checks whether the character with this code point is printable'''
    attrs = UNICODE_ATTRIBUTES[code_point]
    name = attrs['name']
    # Named, not the <control> placeholder, and not a line or
    # paragraph separator.
    return (name
            and name != '<control>'
            and attrs['category'] not in ['Zl', 'Zp'])
|
||||||
|
|
||||||
|
def is_punct(code_point):
    '''Checks whether the character with this code point is punctuation'''
    # An alternative definition would have been “any named code point of
    # a Unicode category starting with 'P'”, but the traditional POSIX
    # definition of punctuation is every graphic, non-alphanumeric
    # character.
    return (is_graph(code_point)
            and not is_alpha(code_point)
            and not is_digit(code_point))
|
||||||
|
|
||||||
|
def is_combining(code_point):
    '''Checks whether the character with this code point is
    a combining character'''
    # Up to Unicode 3.0.1 we took the Combining property from the PropList.txt
    # file. In 3.0.1 it was identical to the union of the general categories
    # "Mn", "Mc", "Me". In Unicode 3.1 this property has been dropped from the
    # PropList.txt file, so we take the latter definition.
    attrs = UNICODE_ATTRIBUTES[code_point]
    return (attrs['name']
            and attrs['category'] in ('Mn', 'Mc', 'Me'))
|
||||||
|
|
||||||
|
def is_combining_level3(code_point):
    '''Checks whether the character with this code point is
    a combining level3 character'''
    # Combining characters whose canonical combining class is below 200.
    return (is_combining(code_point)
            and
            0 <= int(UNICODE_ATTRIBUTES[code_point]['combining']) < 200)
|
||||||
|
|
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.'''
    # Code points in the BMP use 4 hex digits, all others use 8.
    width = 4 if code_point < 0x10000 else 8
    return '<U{:0{}X}>'.format(code_point, width)
|
||||||
|
|
||||||
|
def ucs_symbol_range(code_point_low, code_point_high):
    '''Returns a string UCS symbol string for a code point range.

    Example:

    <U0041>..<U005A>
    '''
    return '..'.join((ucs_symbol(code_point_low),
                      ucs_symbol(code_point_high)))
|
||||||
|
|
||||||
|
def code_point_ranges(is_class_function):
    '''Returns a list of ranges of code points for which is_class_function
    returns True.

    Example:

    [[65, 90], [192, 214], [216, 222], [256], … ]
    '''
    ranges = []
    for code_point in sorted(UNICODE_ATTRIBUTES):
        if not is_class_function(code_point):
            continue
        if ranges and ranges[-1][-1] == code_point - 1:
            # Contiguous with the last range: extend it.
            if len(ranges[-1]) == 1:
                ranges[-1].append(code_point)
            else:
                ranges[-1][-1] = code_point
        else:
            # Start a new range (a single element until extended).
            ranges.append([code_point])
    return ranges
|
||||||
|
|
||||||
|
def output_charclass(i18n_file, class_name, is_class_function):
    '''Output a LC_CTYPE character class section

    Example:

    upper /
       <U0041>..<U005A>;<U00C0>..<U00D6>;<U00D8>..<U00DE>;<U0100>;<U0102>;/
       …
       <U0001D790>..<U0001D7A8>;<U0001D7CA>;<U0001F130>..<U0001F149>;/
       <U0001F150>..<U0001F169>;<U0001F170>..<U0001F189>

    '''
    cp_ranges = code_point_ranges(is_class_function)
    if cp_ranges:
        i18n_file.write('%s /\n' %class_name)
        # Wrap continuation lines before they exceed this column.
        max_column = 75
        prefix = '   '
        line = prefix
        range_string = ''
        for code_point_range in cp_ranges:
            # Separate entries with “;” unless the line is still empty.
            if line.strip():
                line += ';'
            if len(code_point_range) == 1:
                range_string = ucs_symbol(code_point_range[0])
            else:
                range_string = ucs_symbol_range(
                    code_point_range[0], code_point_range[-1])
            # Flush the current line with a “/” continuation marker
            # before it would overflow.
            if len(line+range_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += range_string
        if line.strip():
            i18n_file.write(line+'\n')
        i18n_file.write('\n')
|
||||||
|
|
||||||
|
def output_charmap(i18n_file, map_name, map_function):
    '''Output a LC_CTYPE character map section

    Example:

    toupper /
      (<U0061>,<U0041>);(<U0062>,<U0042>);(<U0063>,<U0043>);(<U0064>,<U0044>);/
      …
      (<U000118DC>,<U000118BC>);(<U000118DD>,<U000118BD>);/
      (<U000118DE>,<U000118BE>);(<U000118DF>,<U000118BF>)

    '''
    # Wrap continuation lines before they exceed this column.
    max_column = 75
    prefix = '   '
    line = prefix
    map_string = ''
    i18n_file.write('%s /\n' %map_name)
    for code_point in sorted(UNICODE_ATTRIBUTES):
        mapped = map_function(code_point)
        # Only code points which actually map to a different code
        # point appear in the output.
        if code_point != mapped:
            # Separate entries with “;” unless the line is still empty.
            if line.strip():
                line += ';'
            map_string = '(' \
                         + ucs_symbol(code_point) \
                         + ',' \
                         + ucs_symbol(mapped) \
                         + ')'
            # Flush the current line with a “/” continuation marker
            # before it would overflow.
            if len(line+map_string) > max_column:
                i18n_file.write(line+'/\n')
                line = prefix
            line += map_string
    if line.strip():
        i18n_file.write(line+'\n')
    i18n_file.write('\n')
|
||||||
|
|
||||||
|
def verifications():
    '''Tests whether the is_* functions observe the known restrictions

    Walks over every code point in UNICODE_ATTRIBUTES and writes a
    diagnostic line to stderr for each violation of the POSIX/ISO 14652
    LC_CTYPE class restrictions.  Returns nothing; it never aborts, it
    only reports.
    '''
    for code_point in sorted(UNICODE_ATTRIBUTES):
        # toupper restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_upper(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but toupper(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_upper(code_point)})
        # tolower restriction: "Only characters specified for the keywords
        # lower and upper shall be specified.
        if (to_lower(code_point) != code_point
            and not (is_lower(code_point) or is_upper(code_point))):
            sys.stderr.write(
                ('%(sym)s is not upper|lower '
                 + 'but tolower(0x%(c)04X) = 0x%(uc)04X\n') %{
                     'sym': ucs_symbol(code_point),
                     'c': code_point,
                     'uc': to_lower(code_point)})
        # alpha restriction: "Characters classified as either upper or lower
        # shall automatically belong to this class.
        if ((is_lower(code_point) or is_upper(code_point))
            and not is_alpha(code_point)):
            sys.stderr.write('%(sym)s is upper|lower but not alpha\n' %{
                'sym': ucs_symbol(code_point)})
        # alpha restriction: “No character specified for the keywords cntrl,
        # digit, punct or space shall be specified.”
        if (is_alpha(code_point) and is_cntrl(code_point)):
            sys.stderr.write('%(sym)s is alpha and cntrl\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is alpha and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is alpha and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_alpha(code_point) and is_space(code_point)):
            sys.stderr.write('%(sym)s is alpha and space\n' %{
                'sym': ucs_symbol(code_point)})
        # space restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, graph or xdigit shall be specified.”
        # upper, lower, alpha already checked above.
        if (is_space(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is space and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is space and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_space(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is space and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # cntrl restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, punct, graph, print or xdigit shall be
        # specified.”  upper, lower, alpha already checked above.
        if (is_cntrl(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_punct(code_point)):
            sys.stderr.write('%(sym)s is cntrl and punct\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_graph(code_point)):
            sys.stderr.write('%(sym)s is cntrl and graph\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_print(code_point)):
            sys.stderr.write('%(sym)s is cntrl and print\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_cntrl(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is cntrl and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        # punct restriction: “No character specified for the keywords upper,
        # lower, alpha, digit, cntrl, xdigit or as the <space> character shall
        # be specified.”  upper, lower, alpha, cntrl already checked above.
        if (is_punct(code_point) and is_digit(code_point)):
            sys.stderr.write('%(sym)s is punct and digit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and is_xdigit(code_point)):
            sys.stderr.write('%(sym)s is punct and xdigit\n' %{
                'sym': ucs_symbol(code_point)})
        if (is_punct(code_point) and code_point == 0x0020):
            sys.stderr.write('%(sym)s is punct\n' %{
                'sym': ucs_symbol(code_point)})
        # graph restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # print restriction: “No character specified for the keyword cntrl
        # shall be specified.”  Already checked above.

        # graph - print relation: differ only in the <space> character.
        # How is this possible if there are more than one space character?!
        # I think susv2/xbd/locale.html should speak of “space characters”,
        # not “space character”.
        if (is_print(code_point)
            and not (is_graph(code_point) or is_space(code_point))):
            sys.stderr.write('%(sym)s is print but not graph|<space>\n' %{
                'sym': ucs_symbol(code_point)})
        if (not is_print(code_point)
            and (is_graph(code_point) or code_point == 0x0020)):
            sys.stderr.write('%(sym)s is graph|<space> but not print\n' %{
                'sym': ucs_symbol(code_point)})
|
def read_input_file(filename):
    '''Reads the original glibc i18n file to get the original head
    and tail.

    We want to replace only the character classes in LC_CTYPE, and the
    date stamp.  All the rest of the i18n file should stay unchanged.
    To avoid having to cut and paste the generated data into the
    original file, it is helpful to read the original file here
    to be able to generate a complete result file.

    Returns a (head, tail) tuple of strings: head is everything up to
    and including the “LC_CTYPE” line (with the date stamp refreshed),
    tail is everything from the “translit_start” line to the end.
    '''
    date_pattern = re.compile(
        r'^(?P<key>date\s+)(?P<value>"[0-9]{4}-[0-9]{2}-[0-9]{2}")')
    head_parts = []
    tail_parts = []
    with open(filename, mode='r') as i18n_file:
        for line in i18n_file:
            match = date_pattern.match(line)
            if match:
                # Refresh the date stamp to today's date.
                line = (match.group('key')
                        + '"{:s}"\n'.format(time.strftime('%Y-%m-%d')))
            head_parts.append(line)
            if line.startswith('LC_CTYPE'):
                break
        # Skip the generated character-class data.
        for line in i18n_file:
            if line.startswith('translit_start'):
                tail_parts.append(line)
                break
        # Everything after translit_start is kept verbatim.
        for line in i18n_file:
            tail_parts.append(line)
    return (''.join(head_parts), ''.join(tail_parts))
||||||
|
def output_head(i18n_file, unicode_version, head=''):
    '''Write the header of the output file, i.e. the part of the file
    before the “LC_CTYPE” line.

    i18n_file: writable file object for the output.
    unicode_version: Unicode version string inserted into the comments
        and the LC_IDENTIFICATION section.
    head: original header text extracted by read_input_file(); used
        verbatim when an input file was given on the command line,
        otherwise a fresh LC_IDENTIFICATION section is generated.
    '''
    if ARGS.input_file and head:
        # Preserve the original file's header (date stamp already
        # refreshed by read_input_file).
        i18n_file.write(head)
    else:
        i18n_file.write('escape_char /\n')
        i18n_file.write('comment_char %\n')
        i18n_file.write('\n')
        i18n_file.write('% Generated automatically by '
                        + 'gen_unicode_ctype.py '
                        + 'for Unicode {:s}.\n'.format(unicode_version))
        i18n_file.write('\n')
        i18n_file.write('LC_IDENTIFICATION\n')
        i18n_file.write('title "Unicode {:s} FDCC-set"\n'.format(
            unicode_version))
        i18n_file.write('source "UnicodeData.txt, '
                        + 'DerivedCoreProperties.txt"\n')
        i18n_file.write('address ""\n')
        i18n_file.write('contact ""\n')
        i18n_file.write('email "bug-glibc-locales@gnu.org"\n')
        i18n_file.write('tel ""\n')
        i18n_file.write('fax ""\n')
        i18n_file.write('language ""\n')
        i18n_file.write('territory "Earth"\n')
        i18n_file.write('revision "{:s}"\n'.format(unicode_version))
        i18n_file.write('date "{:s}"\n'.format(
            time.strftime('%Y-%m-%d')))
        i18n_file.write('category "unicode:2014";LC_CTYPE\n')
        i18n_file.write('END LC_IDENTIFICATION\n')
        i18n_file.write('\n')
        i18n_file.write('LC_CTYPE\n')
||||||
|
def output_tail(i18n_file, tail=''):
    '''Write the tail of the output file, i.e. the part of the file
    after the last “LC_CTYPE” character class.

    If an original i18n file was given on the command line and a tail
    was extracted from it, that tail is copied verbatim; otherwise only
    the LC_CTYPE section terminator is written.
    '''
    if not (ARGS.input_file and tail):
        i18n_file.write('END LC_CTYPE\n')
        return
    i18n_file.write(tail)
||||||
|
def output_tables(i18n_file, unicode_version):
    '''Write the new LC_CTYPE character classes to the output file

    Emits the comment banner, then each character class and mapping
    table via output_charclass()/output_charmap().
    '''
    i18n_file.write('% The following is the 14652 i18n fdcc-set '
                    + 'LC_CTYPE category.\n')
    i18n_file.write('% It covers Unicode version {:s}.\n'.format(
        unicode_version))
    i18n_file.write('% The character classes and mapping tables were '
                    + 'automatically\n')
    i18n_file.write('% generated using the gen_unicode_ctype.py '
                    + 'program.\n\n')
    i18n_file.write('% The "upper" class reflects the uppercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'upper', is_upper)
    i18n_file.write('% The "lower" class reflects the lowercase '
                    + 'characters of class "alpha"\n')
    output_charclass(i18n_file, 'lower', is_lower)
    i18n_file.write('% The "alpha" class of the "i18n" FDCC-set is '
                    + 'reflecting\n')
    i18n_file.write('% the recommendations in TR 10176 annex A\n')
    output_charclass(i18n_file, 'alpha', is_alpha)
    i18n_file.write('% The "digit" class must only contain the '
                    + 'BASIC LATIN digits, says ISO C 99\n')
    i18n_file.write('% (sections 7.25.2.1.5 and 5.2.1).\n')
    output_charclass(i18n_file, 'digit', is_digit)
    i18n_file.write('% The "outdigit" information is by default '
                    + '"0" to "9". We don\'t have to\n')
    i18n_file.write('% provide it here since localedef will fill '
                    + 'in the bits and it would\n')
    i18n_file.write('% prevent locales copying this file define '
                    + 'their own values.\n')
    i18n_file.write('% outdigit /\n')
    i18n_file.write('% <U0030>..<U0039>\n\n')
    # outdigit is deliberately left commented out, see the comment
    # written to the output just above.
    # output_charclass(i18n_file, 'outdigit', is_outdigit)
    output_charclass(i18n_file, 'space', is_space)
    output_charclass(i18n_file, 'cntrl', is_cntrl)
    output_charclass(i18n_file, 'punct', is_punct)
    output_charclass(i18n_file, 'graph', is_graph)
    output_charclass(i18n_file, 'print', is_print)
    i18n_file.write('% The "xdigit" class must only contain the '
                    + 'BASIC LATIN digits and A-F, a-f,\n')
    i18n_file.write('% says ISO C 99 '
                    + '(sections 7.25.2.1.12 and 6.4.4.1).\n')
    output_charclass(i18n_file, 'xdigit', is_xdigit)
    output_charclass(i18n_file, 'blank', is_blank)
    output_charmap(i18n_file, 'toupper', to_upper)
    output_charmap(i18n_file, 'tolower', to_lower)
    output_charmap(i18n_file, 'map "totitle";', to_title)
    i18n_file.write('% The "combining" class reflects ISO/IEC 10646-1 '
                    + 'annex B.1\n')
    i18n_file.write('% That is, all combining characters (level 2+3).\n')
    output_charclass(i18n_file, 'class "combining";', is_combining)
    i18n_file.write('% The "combining_level3" class reflects '
                    + 'ISO/IEC 10646-1 annex B.2\n')
    i18n_file.write('% That is, combining characters of level 3.\n')
    output_charclass(i18n_file,
                     'class "combining_level3";', is_combining_level3)
||||||
|
if __name__ == "__main__":
    # Command line interface: all input file names have defaults except
    # --unicode_version, which must be given explicitly.
    PARSER = argparse.ArgumentParser(
        description='''
    Generate a Unicode conforming LC_CTYPE category from
    UnicodeData.txt and DerivedCoreProperties.txt files.
    ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-d', '--derived_core_properties_file',
        nargs='?',
        type=str,
        default='DerivedCoreProperties.txt',
        help=('The DerivedCoreProperties.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-i', '--input_file',
        nargs='?',
        type=str,
        help='''The original glibc/localedata/locales/i18n file.''')
    PARSER.add_argument(
        '-o', '--output_file',
        nargs='?',
        type=str,
        default='i18n.new',
        help='''The file which shall contain the generated LC_CTYPE category,
        default: %(default)s. If the original
        glibc/localedata/locales/i18n has been given
        as an option, all data from the original file
        except the newly generated LC_CTYPE character
        classes and the date stamp in
        LC_IDENTIFICATION will be copied unchanged
        into the output file. ''')
    PARSER.add_argument(
        '--unicode_version',
        nargs='?',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # Load the Unicode data, then sanity-check the derived is_* class
    # functions before writing anything.
    fill_attributes(ARGS.unicode_data_file)
    fill_derived_core_properties(ARGS.derived_core_properties_file)
    verifications()
    HEAD = TAIL = ''
    if ARGS.input_file:
        # Keep everything from the original file except the LC_CTYPE
        # class data and the date stamp.
        (HEAD, TAIL) = read_input_file(ARGS.input_file)
    with open(ARGS.output_file, mode='w') as I18N_FILE:
        output_head(I18N_FILE, ARGS.unicode_version, head=HEAD)
        output_tables(I18N_FILE, ARGS.unicode_version)
        output_tail(I18N_FILE, tail=TAIL)
|
50
localedata/unicode-gen/unicode-license.txt
Normal file
50
localedata/unicode-gen/unicode-license.txt
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||||
|
|
||||||
|
Unicode Data Files include all data files under the directories
|
||||||
|
http://www.unicode.org/Public/, http://www.unicode.org/reports/, and
|
||||||
|
http://www.unicode.org/cldr/data/. Unicode Data Files do not include PDF
|
||||||
|
online code charts under the directory http://www.unicode.org/Public/.
|
||||||
|
Software includes any source code published in the Unicode Standard or under
|
||||||
|
the directories http://www.unicode.org/Public/,
|
||||||
|
http://www.unicode.org/reports/, and http://www.unicode.org/cldr/data/.
|
||||||
|
|
||||||
|
NOTICE TO USER: Carefully read the following legal agreement. BY
|
||||||
|
DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES
|
||||||
|
("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND
|
||||||
|
AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF
|
||||||
|
YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA
|
||||||
|
FILES OR SOFTWARE.
|
||||||
|
|
||||||
|
COPYRIGHT AND PERMISSION NOTICE
|
||||||
|
|
||||||
|
Copyright © 1991-2013 Unicode, Inc. All rights reserved. Distributed under
|
||||||
|
the Terms of Use in http://www.unicode.org/copyright.html.
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a
|
||||||
|
copy of the Unicode data files and any associated documentation (the "Data
|
||||||
|
Files") or Unicode software and any associated documentation (the "Software")
|
||||||
|
to deal in the Data Files or Software without restriction, including without
|
||||||
|
limitation the rights to use, copy, modify, merge, publish, distribute, and/or
|
||||||
|
sell copies of the Data Files or Software, and to permit persons to whom the
|
||||||
|
Data Files or Software are furnished to do so, provided that (a) the above
|
||||||
|
copyright notice(s) and this permission notice appear with all copies of the
|
||||||
|
Data Files or Software, (b) both the above copyright notice(s) and this
|
||||||
|
permission notice appear in associated documentation, and (c) there is clear
|
||||||
|
notice in each modified Data File or in the Software as well as in the
|
||||||
|
documentation associated with the Data File(s) or Software that the data or
|
||||||
|
software has been modified.
|
||||||
|
|
||||||
|
THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
|
||||||
|
KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||||
|
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD
|
||||||
|
PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN
|
||||||
|
THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
|
||||||
|
DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
|
||||||
|
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||||
|
ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE
|
||||||
|
DATA FILES OR SOFTWARE.
|
||||||
|
|
||||||
|
Except as contained in this notice, the name of a copyright holder shall
|
||||||
|
not be used in advertising or otherwise to promote the sale, use or other
|
||||||
|
dealings in these Data Files or Software without prior written authorization
|
||||||
|
of the copyright holder.
|
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
399
localedata/unicode-gen/utf8_compatibility.py
Executable file
@ -0,0 +1,399 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''
|
||||||
|
This script is useful for checking backward compatibility of newly
|
||||||
|
generated UTF-8 file from utf8_gen.py script
|
||||||
|
|
||||||
|
To see how this script is used, call it with the “-h” option:
|
||||||
|
|
||||||
|
$ ./utf8_compatibility.py -h
|
||||||
|
… prints usage message …
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
#
# Keys are code point integers; filled by fill_attributes() below.
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the EastAsianWidths.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
#
# Keys are code point integers, values are East_Asian_Width property
# strings; filled by fill_east_asian_widths() below.
EAST_ASIAN_WIDTHS = {}
|
def fill_attribute(code_point, fields):
    '''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.

    One entry in the UNICODE_ATTRIBUTES dictionary represents one line
    in the UnicodeData.txt file.  The three case-mapping fields hold
    hexadecimal code points and are converted to integers; an empty
    field becomes None.
    '''
    def case_mapping(field):
        # Empty case-mapping fields mean “no mapping for this case”.
        return int(field, 16) if field else None

    attributes = {}
    attributes['name'] = fields[1]                  # Character name
    attributes['category'] = fields[2]              # General category
    attributes['combining'] = fields[3]             # Canonical combining classes
    attributes['bidi'] = fields[4]                  # Bidirectional category
    attributes['decomposition'] = fields[5]         # Character decomposition mapping
    attributes['decdigit'] = fields[6]              # Decimal digit value
    attributes['digit'] = fields[7]                 # Digit value
    attributes['numeric'] = fields[8]               # Numeric value
    attributes['mirrored'] = fields[9]              # mirrored
    attributes['oldname'] = fields[10]              # Old Unicode 1.0 name
    attributes['comment'] = fields[11]              # comment
    attributes['upper'] = case_mapping(fields[12])  # Uppercase mapping
    attributes['lower'] = case_mapping(fields[13])  # Lowercase mapping
    attributes['title'] = case_mapping(fields[14])  # Titlecase mapping
    UNICODE_ATTRIBUTES[code_point] = attributes
||||||
|
def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Exits the process with status 1 on malformed lines or broken
    range pairs.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Holds the “…, First>” line of a pending range until the
        # matching “…, Last>” line is seen.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # Strip the “<” and the “, First>” suffix to get the
                # bare range name for comparison with the Last line.
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # All fields except the code point must match the
                # First line of the range.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    exit(1)
                # Apply the shared attributes to every code point in
                # the inclusive range.
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []
||||||
|
def fill_east_asian_widths(filename):
    '''Stores the entire contents of the EastAsianWidths.txt file
    in the EAST_ASIAN_WIDTHS dictionary.

    Lines in EastAsianWidths.txt are either a code point range like
    this:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or a single code point like this:

    A015;W           # Lm         YI SYLLABLE WU

    Lines that match neither form (comments, blanks) are skipped.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            match = line_pattern.match(line)
            if match is None:
                continue
            width_class = match.group('property')
            first = int(match.group('codepoint1'), 16)
            last_field = match.group('codepoint2')
            # A missing second code point means a single-point entry.
            last = int(last_field, 16) if last_field else first
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = width_class
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 use four hexadecimal digits, all others
    use eight: 0x41 -> '<U0041>', 0x1F600 -> '<U0001F600>'.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{0:0{1}X}>'.format(code_point, digits)
||||||
|
def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    Maps each code point (int) to the '/x..' byte-sequence string given
    for it.  A line may describe a single code point or a range like
    “<U3400>..<U4DB5>”; in the range case every code point in the range
    gets the same byte-sequence string.  Exits with status 1 when the
    file has no CHARMAP/END CHARMAP section.
    '''
    # Bug fixes in the pattern: the original used “(:?” (a capturing
    # group starting with an optional literal colon) where the
    # non-capturing group “(?:” was intended, and the character class
    # “[0-9-A-F]” which also matches a literal “-”.  The set of valid
    # charmap lines accepted is unchanged.
    charmap_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything before the CHARMAP section.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                # Comment line.
                continue
            match = charmap_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
        sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                         %file_name)
        exit(1)
||||||
|
def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints counts of removed, changed and added code points; individual
    code points are listed only when the matching --show_* command line
    option was given (read from the global ARGS).
    '''
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(set(ocharmap)-set(ncharmap)))
    if ARGS.show_missing_characters:
        for key in sorted(set(ocharmap)-set(ncharmap)):
            # NOTE(review): '{:s}'.format(None) raises TypeError, so this
            # line would fail for a code point absent from
            # UNICODE_ATTRIBUTES — confirm whether that can happen here.
            print('removed: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    # Code points present in both files but with different byte
    # sequences.
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s} {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated CHARMAP: %d'
          %len(set(ncharmap)-set(ocharmap)))
    if ARGS.show_added_characters:
        for key in sorted(set(ncharmap)-set(ocharmap)):
            print('added: {:s} {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                UNICODE_ATTRIBUTES[key]['name'] \
                if key in UNICODE_ATTRIBUTES else None))
|
||||||
|
def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    Maps each code point (int) to its width, 0 or 2 (code points not
    listed in WIDTH default to width 1 and do not appear here).  A line
    may describe a single code point or a range like
    “<U1100>...<U115F>”.  Returns None after printing an error when the
    file has no WIDTH/END WIDTH section.
    '''
    # Bug fixes in the pattern: “(:?” → non-capturing group “(?:”, and
    # the character class “[0-9-A-F]” (which also matched a literal
    # “-”) → “[0-9A-F]”.  The set of valid WIDTH lines accepted is
    # unchanged.
    width_line = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything before the WIDTH section.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = width_line.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
        # Bug fix: the original formatted this message with the
        # undefined name “file”, raising NameError instead of printing
        # the intended diagnostic.
        sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                         %file_name)
||||||
|
def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file.

    original_file_name: the old UTF-8 charmap file
    new_file_name: the newly generated UTF-8 charmap file

    Prints counts of removed, changed, and added characters; the
    --show_missing_characters, --show_changed_characters, and
    --show_added_characters command line options enable detailed
    per-character listings.
    '''
    def _details(key):
        '''Return the eaw/category/bidi/name detail string for a code point.

        Bug fix: the original passed None straight into '{:s}'.format()
        for code points missing from EAST_ASIAN_WIDTHS or
        UNICODE_ATTRIBUTES, which raises TypeError.  We convert with
        str() first so missing entries print as “None”.
        '''
        eaw = EAST_ASIAN_WIDTHS[key] if key in EAST_ASIAN_WIDTHS else None
        attrs = UNICODE_ATTRIBUTES[key] if key in UNICODE_ATTRIBUTES else {}
        return ('eaw={:s} '.format(str(eaw))
                + 'category={:2s} '.format(str(attrs.get('category')))
                + 'bidi={:3s} '.format(str(attrs.get('bidi')))
                + 'name={:s}'.format(str(attrs.get('name'))))

    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    print('------------------------------------------------------------')
    print('Total removed characters in newly generated WIDTH: %d'
          %len(set(owidth)-set(nwidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in sorted(set(owidth)-set(nwidth)):
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _details(key))
    print('------------------------------------------------------------')
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                           changed_width[key][1])
                  + _details(key))
    print('------------------------------------------------------------')
    print('Total added characters in newly generated WIDTH: %d'
          %len(set(nwidth)-set(owidth)))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in sorted(set(nwidth)-set(owidth)):
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _details(key))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the contents of LC_CTYPE in two files and check for errors.
        ''')
    # Table-driven option setup: (flags, keyword arguments) per option.
    _OPTION_TABLE = [
        (('-o', '--old_utf8_file'),
         dict(nargs='?', required=True, type=str,
              help='The old UTF-8 file.')),
        (('-n', '--new_utf8_file'),
         dict(nargs='?', required=True, type=str,
              help='The new UTF-8 file.')),
        (('-u', '--unicode_data_file'),
         dict(nargs='?', type=str,
              help='The UnicodeData.txt file to read.')),
        (('-e', '--east_asian_width_file'),
         dict(nargs='?', type=str,
              help='The EastAsianWidth.txt file to read.')),
        (('-a', '--show_added_characters'),
         dict(action='store_true',
              help='Show characters which were added in detail.')),
        (('-m', '--show_missing_characters'),
         dict(action='store_true',
              help='Show characters which were removed in detail.')),
        (('-c', '--show_changed_characters'),
         dict(action='store_true',
              help='Show characters whose width was changed in detail.')),
    ]
    for _FLAGS, _KWARGS in _OPTION_TABLE:
        PARSER.add_argument(*_FLAGS, **_KWARGS)
    ARGS = PARSER.parse_args()

    # The optional Unicode data files enrich the report with character
    # names, categories, and East Asian width properties.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
|
286
localedata/unicode-gen/utf8_gen.py
Executable file
286
localedata/unicode-gen/utf8_gen.py
Executable file
@ -0,0 +1,286 @@
|
|||||||
|
#!/usr/bin/python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Copyright (C) 2014, 2015 Free Software Foundation, Inc.
|
||||||
|
# This file is part of the GNU C Library.
|
||||||
|
#
|
||||||
|
# The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
# modify it under the terms of the GNU Lesser General Public
|
||||||
|
# License as published by the Free Software Foundation; either
|
||||||
|
# version 2.1 of the License, or (at your option) any later version.
|
||||||
|
#
|
||||||
|
# The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
# Lesser General Public License for more details.
|
||||||
|
#
|
||||||
|
# You should have received a copy of the GNU Lesser General Public
|
||||||
|
# License along with the GNU C Library; if not, see
|
||||||
|
# <http://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
'''glibc/localedata/charmaps/UTF-8 file generator script
|
||||||
|
|
||||||
|
This script generates a glibc/localedata/charmaps/UTF-8 file
|
||||||
|
from Unicode data.
|
||||||
|
|
||||||
|
Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||||
|
|
||||||
|
It will output UTF-8 file
|
||||||
|
'''
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import re
|
||||||
|
|
||||||
|
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.

# Initial consonants (choseong), 19 entries.
jamo_initial_short_name = [
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
]

# Medial vowels (jungseong), 21 entries.
jamo_medial_short_name = [
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
]

# Final consonants (jongseong), 28 entries; the first entry is empty
# because a syllable may have no final consonant.
#
# Bug fix: the sixth entry read 'NI'; the Unicode jamo short name for
# HANGUL JONGSEONG NIEUN-CIEUC (U+11AC) is 'NJ' (see Jamo.txt in the
# Unicode Character Database), so syllables with that final consonant
# would otherwise get wrong names.
jamo_final_short_name = [
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
]
|
||||||
|
|
||||||
|
def ucs_symbol(code_point):
    '''Return the UCS symbol string for a Unicode character.

    Code points below 0x10000 use four hex digits, all others eight,
    matching the <UXXXX>/<UXXXXXXXX> notation of glibc charmap files.
    '''
    digits = 4 if code_point < 0x10000 else 8
    return '<U{0:0{1}X}>'.format(code_point, digits)
|
||||||
|
|
||||||
|
def process_range(start, end, outfile, name):
    '''Writes a range of code points into the CHARMAP section of the
    output file.

    start: first code point of the range, as a hexadecimal string
    end: last code point of the range, as a hexadecimal string
    outfile: the open output file object
    name: the range name taken from UnicodeData.txt

    '''
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for i in range(int(start, 16), int(end, 16)+1 ):
            # Decompose the precomposed syllable into initial/medial/final
            # jamo indices (28 finals per medial, 21 medials per initial),
            # see the Unicode book sections 3.11 and 4.4.
            index2, index3 = divmod(i - 0xaC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = 'HANGUL SYLLABLE ' \
                + jamo_initial_short_name[index1] \
                + jamo_medial_short_name[index2] \
                + jamo_final_short_name[index3]
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                ucs_symbol(i), convert_to_hex(i),
                hangul_syllable_name))
        return
    # UnicodeData.txt contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5> /xe4/xb6/x80 <CJK Ideograph Extension A>
    for i in range(int(start, 16), int(end, 16), 64 ):
        # The final chunk may be shorter than 64 code points; it ends at
        # “end” rather than at the next multiple of 64.
        if i > (int(end, 16)-64):
            outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
                ucs_symbol(i),
                ucs_symbol(int(end,16)),
                convert_to_hex(i),
                name))
            break
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            ucs_symbol(i),
            ucs_symbol(i+63),
            convert_to_hex(i),
            name))
|
||||||
|
|
||||||
|
def process_charmap(flines, outfile):
    '''This function takes an array which contains *all* lines of
    UnicodeData.txt and writes lines to outfile as used in the

    CHARMAP
    …
    END CHARMAP

    section of the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010> /x10 DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800> /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F> /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    fields_start = []
    for line in flines:
        fields = line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The Characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # ”Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # A “First” line is remembered in fields_start; the range is
        # written out when the matching “Last” line is seen.
        if fields[1].endswith(', First>') and not 'Surrogate,' in fields[1]:
            fields_start = fields
            continue
        if fields[1].endswith(', Last>') and not 'Surrogate,' in fields[1]:
            # fields[1][:-7] strips the trailing “, Last>”; the appended
            # “>” closes the angle bracket of the range name again.
            process_range(fields_start[0], fields[0],
                          outfile, fields[1][:-7]+'>')
            fields_start = []
            continue
        fields_start = []
        if 'Surrogate,' in fields[1]:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            ucs_symbol(int(fields[0], 16)),
            convert_to_hex(int(fields[0], 16)),
            fields[1]))
|
||||||
|
|
||||||
|
def convert_to_hex(code_point):
    '''Converts a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point: the code point as an int

    Returns the string of /xNN byte values.
    '''
    # Getting UTF8 of Unicode characters.
    # In Python3, .encode('UTF-8') does not work for surrogates.
    # The original code used a fixed lookup table covering only the six
    # surrogate code points that appear as range markers in
    # UnicodeData.txt; here we generalize it by computing the three-byte
    # UTF-8 bit pattern (1110xxxx 10xxxxxx 10xxxxxx) for *any* surrogate
    # directly.  The results are identical for the six original entries.
    if 0xD800 <= code_point <= 0xDFFF:
        return '/x{:02x}/x{:02x}/x{:02x}'.format(
            0xE0 | (code_point >> 12),
            0x80 | ((code_point >> 6) & 0x3F),
            0x80 | (code_point & 0x3F))
    return ''.join([
        '/x{:02x}'.format(c) for c in chr(code_point).encode('UTF-8')
    ])
|
||||||
|
|
||||||
|
def write_header_charmap(outfile):
    '''Write the header on top of the CHARMAP section to the output file'''
    header_lines = (
        "<code_set_name> UTF-8\n"
        "<comment_char> %\n"
        "<escape_char> /\n"
        "<mb_cur_min> 1\n"
        "<mb_cur_max> 6\n\n"
        "% CHARMAP generated using utf8_gen.py\n"
        "% alias ISO-10646/UTF-8\n"
        "CHARMAP\n")
    outfile.write(header_lines)
|
||||||
|
|
||||||
|
def write_header_width(outfile):
    '''Writes the header on top of the WIDTH section to the output file'''
    # Each entry is emitted verbatim; the grep commands document how the
    # width classes were derived from the Unicode data files.
    comment_lines = [
        '% Character width according to Unicode 7.0.0.\n',
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        + 'generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
        + 'UnicodeData.txt"\n',
        '% - Format control characters have width 0; '
        + 'generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        # Zero-width characters are not listed separately: they are
        # already covered by the Cf category above.
        'WIDTH\n',
    ]
    for comment_line in comment_lines:
        outfile.write(comment_line)
|
||||||
|
|
||||||
|
def process_width(outfile, ulines, elines):
    '''Write the body of the WIDTH section to the output file.

    ulines: lines from UnicodeData.txt
    elines: lines from EastAsianWidth.txt

    '''
    entries = {}
    # Non-spacing marks (bidi class NSM) and format control characters
    # (category Cf) from UnicodeData.txt get width 0.
    for unicode_line in ulines:
        unicode_fields = unicode_line.split(";")
        if unicode_fields[4] == "NSM" or unicode_fields[2] == "Cf":
            code_point = int(unicode_fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t0'

    # Entries found in EastAsianWidth.txt get width 2 and override any
    # entry derived from UnicodeData.txt above.
    for eaw_line in elines:
        eaw_fields = eaw_line.split(";")
        if '..' not in eaw_fields[0]:
            code_point = int(eaw_fields[0], 16)
            entries[code_point] = ucs_symbol(code_point) + '\t2'
        else:
            first_hex, last_hex = eaw_fields[0].split("..")
            first = int(first_hex, 16)
            last = int(last_hex, 16)
            # Drop single-point entries shadowed by this range, then
            # record the whole range as one “<U…>...<U…>” entry.
            for code_point in range(first, last + 1):
                entries.pop(code_point, None)
            entries[first] = '{:s}...{:s}\t2'.format(
                ucs_symbol(first), ucs_symbol(last))

    for code_point in sorted(entries):
        outfile.write(entries[code_point] + '\n')
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
    else:
        with open(sys.argv[1], mode='r') as unidata_file:
            unicode_data_lines = unidata_file.readlines()
        # If characters from EastAsianWidth.txt which are from reserved
        # ranges (i.e. not yet assigned code points) are added to the
        # WIDTH section of the UTF-8 file, then “make check” produces
        # “Unknown Character” errors for these code points because such
        # unassigned code points are not in the CHARMAP section of the
        # UTF-8 file.
        #
        # Therefore, we skip all reserved code points and keep only the
        # wide (W) and fullwidth (F) entries when reading the
        # EastAsianWidth.txt file.
        with open(sys.argv[2], mode='r') as east_asian_width_file:
            east_asian_width_lines = [
                line.strip() for line in east_asian_width_file
                if not re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', line)
                and re.match(r'^[^;]*;[WF]', line)]
        with open('UTF-8', mode='w') as output_file:
            # CHARMAP section, generated from UnicodeData.txt:
            write_header_charmap(output_file)
            process_charmap(unicode_data_lines, output_file)
            output_file.write("END CHARMAP\n\n")
            # WIDTH section, generated from EastAsianWidth.txt:
            write_header_width(output_file)
            process_width(output_file, unicode_data_lines,
                          east_asian_width_lines)
            output_file.write("END WIDTH\n")
|
Loading…
Reference in New Issue
Block a user