2003-01-22 18:10:43 +00:00
|
|
|
#!/bin/perl -w
|
2003-03-04 19:54:27 +00:00
|
|
|
#*******************************************************************
|
|
|
|
# COPYRIGHT:
|
2005-01-10 18:02:54 +00:00
|
|
|
# Copyright (c) 2002-2005, International Business Machines Corporation and
|
2003-03-04 19:54:27 +00:00
|
|
|
# others. All Rights Reserved.
|
|
|
|
#*******************************************************************
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
# This script reads in UCD files PropertyAliases.txt and
|
|
|
|
# PropertyValueAliases.txt and correlates them with ICU enums
|
|
|
|
# defined in uchar.h and uscript.h. It then outputs a header
|
|
|
|
# file which contains all names and enums. The header is included
|
|
|
|
# by the genpname tool C++ source file, which produces the actual
|
|
|
|
# binary data file.
|
|
|
|
#
|
|
|
|
# See usage note below.
|
|
|
|
#
|
|
|
|
# TODO: The Property[Value]Alias.txt files state that they can support
|
|
|
|
# more than 2 names per property|value. Currently (Unicode 3.2) there
|
|
|
|
# are always 1 or 2 names. If more names were supported, presumably
|
|
|
|
# the format would be something like:
|
|
|
|
# nv ; Numeric_Value
|
|
|
|
# nv ; Value_Numerique
|
|
|
|
# CURRENTLY, this script assumes that there are 1 or two names. Any
|
|
|
|
# duplicates it sees are flagged as an error. If multiple aliases
|
|
|
|
# appear in a future version of Unicode, modify this script to support
|
|
|
|
# that.
|
|
|
|
#
|
2003-01-22 18:10:43 +00:00
|
|
|
# NOTE: As of ICU 2.6, this script has been modified to know about the
|
|
|
|
# pseudo-property gcm/General_Category_Mask, which corresponds to the
|
|
|
|
# uchar.h property UCHAR_GENERAL_CATEGORY_MASK. This property
|
|
|
|
# corresponds to General_Category but is a bitmask value. It does not
|
|
|
|
# exist in the UCD. Therefore, I special case it in several places
|
|
|
|
# (search for General_Category_Mask and gcm).
|
|
|
|
#
|
2003-02-20 22:43:37 +00:00
|
|
|
# NOTE: As of ICU 2.6, this script reads an auxiliary data file,
|
|
|
|
# SyntheticPropertyAliases.txt, containing property aliases not
|
|
|
|
# present in the UCD but present in ICU. This file resides in the
|
|
|
|
# same directory as this script. Its contents are merged into those
|
|
|
|
# of PropertyAliases.txt as if the two files were appended.
|
|
|
|
#
|
2003-03-04 19:54:27 +00:00
|
|
|
# NOTE: The following names are handled specially. See script below
|
|
|
|
# for details.
|
|
|
|
#
|
|
|
|
# T/True
|
|
|
|
# F/False
|
|
|
|
# No_Block
|
|
|
|
#
|
2002-10-30 17:58:20 +00:00
|
|
|
# Author: Alan Liu
|
|
|
|
# Created: October 14 2002
|
|
|
|
# Since: ICU 2.4
|
|
|
|
|
|
|
|
use FileHandle;
|
|
|
|
use strict;
|
2003-01-22 18:10:43 +00:00
|
|
|
use Dumpvalue;
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2004-04-03 07:19:23 +00:00
|
|
|
my $DEBUG = 1;
|
2003-01-22 18:10:43 +00:00
|
|
|
my $DUMPER = new Dumpvalue;
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
my $count = @ARGV;
|
2003-01-22 18:10:43 +00:00
|
|
|
my $ICU_DIR = shift() || '';
|
|
|
|
my $OUT_FILE = shift() || 'data.h';
|
2002-10-30 17:58:20 +00:00
|
|
|
my $HEADER_DIR = "$ICU_DIR/source/common/unicode";
|
|
|
|
my $UNIDATA_DIR = "$ICU_DIR/source/data/unidata";
|
|
|
|
|
2003-06-04 00:02:16 +00:00
|
|
|
# Get the current year from the system
|
|
|
|
my $YEAR = 1900+@{[localtime]}[5]; # Get the current year
|
|
|
|
|
2005-05-28 22:54:36 +00:00
|
|
|
# Used to make "n/a" property aliases (Unicode or Synthetic) unique
|
|
|
|
my $propNA = 0;
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
#----------------------------------------------------------------------
|
2002-10-30 17:58:20 +00:00
|
|
|
# Top level property keys for binary, enumerated, string, and double props
|
2003-01-22 18:10:43 +00:00
|
|
|
my @TOP = qw( _bp _ep _sp _dp _mp );
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
# This hash governs how top level properties are grouped into output arrays.
|
|
|
|
#my %TOP_PROPS = ( "VALUED" => [ '_bp', '_ep' ],
|
|
|
|
# "NO_VALUE" => [ '_sp', '_dp' ] );m
|
|
|
|
#my %TOP_PROPS = ( "BINARY" => [ '_bp' ],
|
|
|
|
# "ENUMERATED" => [ '_ep' ],
|
|
|
|
# "STRING" => [ '_sp' ],
|
|
|
|
# "DOUBLE" => [ '_dp' ] );
|
2003-01-22 18:10:43 +00:00
|
|
|
my %TOP_PROPS = ( "" => [ '_bp', '_ep', '_sp', '_dp', '_mp' ] );
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
my %PROP_TYPE = (Binary => "_bp",
|
|
|
|
String => "_sp",
|
|
|
|
Double => "_dp",
|
|
|
|
Enumerated => "_ep",
|
|
|
|
Bitmask => "_mp");
|
|
|
|
#----------------------------------------------------------------------
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2003-03-04 19:54:27 +00:00
|
|
|
# Properties that are unsupported in ICU
|
|
|
|
my %UNSUPPORTED = (Composition_Exclusion => 1,
|
|
|
|
Decomposition_Mapping => 1,
|
|
|
|
Expands_On_NFC => 1,
|
|
|
|
Expands_On_NFD => 1,
|
|
|
|
Expands_On_NFKC => 1,
|
|
|
|
Expands_On_NFKD => 1,
|
|
|
|
FC_NFKC_Closure => 1,
|
|
|
|
ID_Start_Exceptions => 1,
|
|
|
|
Special_Case_Condition => 1,
|
|
|
|
);
|
|
|
|
|
|
|
|
# Short names of properties that weren't seen in uchar.h. If the
|
|
|
|
# properties weren't seen, don't complain about the property values
|
|
|
|
# missing.
|
|
|
|
my %MISSING_FROM_UCHAR;
|
|
|
|
|
2005-03-11 23:31:44 +00:00
|
|
|
# Additional property aliases beyond short and long names,
|
|
|
|
# like space in addition to WSpace and White_Space in Unicode 4.1.
|
|
|
|
# Hashtable, maps long name to alias.
|
|
|
|
# For example, maps White_Space->space.
|
|
|
|
#
|
|
|
|
# If multiple additional aliases are defined,
|
|
|
|
# then they are separated in the value string with '|'.
|
|
|
|
# For example, White_Space->space|outer_space
|
|
|
|
my %additional_property_aliases;
|
|
|
|
|
2003-03-04 19:54:27 +00:00
|
|
|
#----------------------------------------------------------------------
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
# Emitted class names
|
|
|
|
my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
if ($count < 1 || $count > 2 ||
|
2002-10-30 17:58:20 +00:00
|
|
|
!-d $HEADER_DIR ||
|
|
|
|
!-d $UNIDATA_DIR) {
|
|
|
|
my $me = $0;
|
|
|
|
$me =~ s|.+[/\\]||;
|
|
|
|
my $lm = ' ' x length($me);
|
|
|
|
print <<"END";
|
|
|
|
|
|
|
|
$me: Reads ICU4C headers and Unicode data files and creates
|
|
|
|
$lm a C header file that is included by genpname. The header
|
|
|
|
$lm file matches constants defined in the ICU4C headers with
|
|
|
|
$lm property|value aliases in the Unicode data files.
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
Usage: $me <icu_dir> [<out_file>]
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
<icu_dir> ICU4C root directory, containing
|
|
|
|
source/common/unicode/uchar.h
|
|
|
|
source/common/unicode/uscript.h
|
|
|
|
source/data/unidata/Blocks.txt
|
|
|
|
source/data/unidata/PropertyAliases.txt
|
|
|
|
source/data/unidata/PropertyValueAliases.txt
|
2003-01-22 18:10:43 +00:00
|
|
|
<out_file> File name of header to be written;
|
|
|
|
default is 'data.h'.
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
The Unicode versions of all input files must match.
|
|
|
|
END
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
my ($h, $version) = readAndMerge($HEADER_DIR, $UNIDATA_DIR);
|
|
|
|
|
|
|
|
if ($DEBUG) {
|
|
|
|
print "Merged hash:\n";
|
|
|
|
for my $key (sort keys %$h) {
|
|
|
|
my $hh = $h->{$key};
|
|
|
|
for my $subkey (sort keys %$hh) {
|
|
|
|
print "$key:$subkey:", $hh->{$subkey}, "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
my $out = new FileHandle($OUT_FILE, 'w');
|
|
|
|
die "Error: Can't write to $OUT_FILE: $!" unless (defined $out);
|
|
|
|
my $save = select($out);
|
|
|
|
formatData($h, $version);
|
|
|
|
select($save);
|
|
|
|
$out->close();
|
|
|
|
|
|
|
|
exit(0);
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# From PropList.html: "The properties of the form Other_XXX
|
|
|
|
# are used to generate properties in DerivedCoreProperties.txt.
|
|
|
|
# They are not intended for general use, such as in APIs that
|
|
|
|
# return property values.
|
|
|
|
# Non_Break is not a valid property as of 3.2.
|
|
|
|
sub isIgnoredProperty {
|
|
|
|
local $_ = shift;
|
|
|
|
/^Other_/i || /^Non_Break$/i;
|
|
|
|
}
|
|
|
|
|
|
|
|
# 'qc' is a pseudo-property matching any quick-check property
|
2003-03-04 19:54:27 +00:00
|
|
|
# see PropertyValueAliases.txt file comments. 'binprop' is
|
|
|
|
# a synthetic binary value alias "True"/"False", not present
|
|
|
|
# in PropertyValueAliases.txt.
|
2002-10-30 17:58:20 +00:00
|
|
|
sub isPseudoProperty {
|
|
|
|
$_[0] eq 'qc' ||
|
|
|
|
$_[0] eq 'binprop';
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Emit the combined data from headers and the Unicode database as a
|
|
|
|
# C source code header file.
|
|
|
|
#
|
|
|
|
# @param ref to hash with the data
|
|
|
|
# @param Unicode version, as a string
|
|
|
|
sub formatData {
|
|
|
|
my $h = shift;
|
|
|
|
my $version = shift;
|
|
|
|
|
|
|
|
my $date = scalar localtime();
|
|
|
|
print <<"END";
|
|
|
|
/**
|
2003-06-04 00:02:16 +00:00
|
|
|
* Copyright (C) 2002-$YEAR, International Business Machines Corporation and
|
|
|
|
* others. All Rights Reserved.
|
|
|
|
*
|
2002-10-30 17:58:20 +00:00
|
|
|
* MACHINE GENERATED FILE. !!! Do not edit manually !!!
|
|
|
|
*
|
|
|
|
* Generated from
|
|
|
|
* uchar.h
|
|
|
|
* uscript.h
|
|
|
|
* Blocks.txt
|
|
|
|
* PropertyAliases.txt
|
|
|
|
* PropertyValueAliases.txt
|
|
|
|
*
|
|
|
|
* Date: $date
|
|
|
|
* Unicode version: $version
|
|
|
|
* Script: $0
|
|
|
|
*/
|
|
|
|
|
|
|
|
END
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
# Emit Unicode version
|
|
|
|
print "/* Unicode version $version */\n";
|
|
|
|
my @v = split(/\./, $version);
|
|
|
|
push @v, '0' while (@v < 4);
|
|
|
|
for (my $i=0; $i<@v; ++$i) {
|
|
|
|
print "const uint8_t VERSION_$i = $v[$i];\n";
|
|
|
|
}
|
|
|
|
print "\n";
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
# Emit String table
|
|
|
|
# [A table of all identifiers, that is, all long or short property
|
|
|
|
# or value names. The list need NOT be sorted; it will be sorted
|
|
|
|
# by the C program. Strings are referenced by their index into
|
|
|
|
# this table. After sorting, a REMAP[] array is used to map the
|
|
|
|
# old position indices to the new positions.]
|
|
|
|
my %strings;
|
|
|
|
for my $prop (sort keys %$h) {
|
|
|
|
my $hh = $h->{$prop};
|
|
|
|
for my $enum (sort keys %$hh) {
|
|
|
|
my @a = split(/\|/, $hh->{$enum});
|
|
|
|
for (@a) {
|
|
|
|
$strings{$_} = 1 if (length($_));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
my @strings = sort keys %strings;
|
|
|
|
unshift @strings, "";
|
|
|
|
|
|
|
|
print "const int32_t STRING_COUNT = ", scalar @strings, ";\n\n";
|
|
|
|
|
|
|
|
# while printing, create a mapping hash from string table entry to index
|
|
|
|
my %stringToID;
|
|
|
|
print "/* to be sorted */\n";
|
|
|
|
print "const $STRING_CLASS STRING_TABLE[] = {\n";
|
|
|
|
for (my $i=0; $i<@strings; ++$i) {
|
|
|
|
print " $STRING_CLASS(\"$strings[$i]\", $i),\n";
|
|
|
|
$stringToID{$strings[$i]} = $i;
|
|
|
|
}
|
|
|
|
print "};\n\n";
|
|
|
|
|
|
|
|
# placeholder for the remapping index. this is used to map
|
|
|
|
# indices that we compute here to indices of the sorted
|
|
|
|
# STRING_TABLE. STRING_TABLE will be sorted by the C++ program
|
|
|
|
# using the uprv_comparePropertyNames() function. this will
|
|
|
|
# reshuffle the order. we then use the indices (passed to the
|
|
|
|
# String constructor) to create a REMAP[] array.
|
|
|
|
print "/* to be filled in */\n";
|
|
|
|
print "int32_t REMAP[", scalar @strings, "];\n\n";
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
# Emit the name group table
|
|
|
|
# [A table of name groups. A name group is one or more names
|
|
|
|
# for a property or property value. The Unicode data files specify
|
|
|
|
# that there may be more than 2, although as of Unicode 3.2 there
|
|
|
|
# are at most 2. The name group table looks like this:
|
|
|
|
#
|
|
|
|
# 114, -115, 116, -117, 0, -118, 65, -64, ...
|
|
|
|
# [0] [2] [4] [6]
|
|
|
|
#
|
|
|
|
# The entry at [0] consists of 2 strings, 114 and 115.
|
|
|
|
# The entry at [2] consists of 116 and 117. The entry at
|
|
|
|
# [4] is one string, 118. There is always at least one
|
|
|
|
# string; typically there are two. If there are two, the first
|
|
|
|
# is the SHORT name and the second is the LONG. If there is
|
|
|
|
# one, then the missing entry (always the short name, in 3.2)
|
|
|
|
# is zero, which is by definition the index of "". The
|
|
|
|
# 'preferred' name will generally be the LONG name, if there are
|
|
|
|
# more than 2 entries. The last entry is negative.
|
|
|
|
|
|
|
|
# Build name group list and replace string refs with nameGroup indices
|
|
|
|
my @nameGroups;
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
# Check for duplicate name groups, and reuse them if possible
|
|
|
|
my %groupToInt; # Map group strings to ints
|
2002-10-30 17:58:20 +00:00
|
|
|
for my $prop (sort keys %$h) {
|
|
|
|
my $hh = $h->{$prop};
|
|
|
|
for my $enum (sort keys %$hh) {
|
2003-01-22 18:10:43 +00:00
|
|
|
my $groupString = $hh->{$enum};
|
|
|
|
my $i;
|
|
|
|
if (exists $groupToInt{$groupString}) {
|
|
|
|
$i = $groupToInt{$groupString};
|
|
|
|
} else {
|
|
|
|
my @names = split(/\|/, $groupString);
|
2005-05-28 22:54:36 +00:00
|
|
|
die "Error: Wrong number of names in " . $groupString if (@names < 1);
|
2003-01-22 18:10:43 +00:00
|
|
|
$i = @nameGroups; # index of group we are making
|
|
|
|
$groupToInt{$groupString} = $i; # Cache for reuse
|
|
|
|
push @nameGroups, map { $stringToID{$_} } @names;
|
|
|
|
$nameGroups[$#nameGroups] = -$nameGroups[$#nameGroups]; # mark end
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
# now, replace string list with ref to name group
|
|
|
|
$hh->{$enum} = $i;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
print "const int32_t NAME_GROUP_COUNT = ",
|
|
|
|
scalar @nameGroups, ";\n\n";
|
|
|
|
|
|
|
|
print "int32_t NAME_GROUP[] = {\n";
|
|
|
|
# emit one group per line, with annotations
|
2004-04-03 07:19:23 +00:00
|
|
|
my $max_names = 0;
|
2002-10-30 17:58:20 +00:00
|
|
|
for (my $i=0; $i<@nameGroups; ) {
|
|
|
|
my @a;
|
|
|
|
my $line;
|
|
|
|
my $start = $i;
|
|
|
|
for (;;) {
|
|
|
|
my $j = $nameGroups[$i++];
|
|
|
|
$line .= "$j, ";
|
|
|
|
push @a, abs($j);
|
|
|
|
last if ($j < 0);
|
|
|
|
}
|
|
|
|
print " ",
|
|
|
|
$line,
|
|
|
|
' 'x(20-length($line)),
|
|
|
|
"/* ", sprintf("%3d", $start),
|
|
|
|
": \"", join("\", \"", map { $strings[$_] } @a), "\" */\n";
|
2004-04-03 07:19:23 +00:00
|
|
|
$max_names = @a if(@a > $max_names);
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
print "};\n\n";
|
|
|
|
|
|
|
|
# This is fixed for 3.2 at "2" but should be calculated dynamically
|
|
|
|
# when more than 2 names appear in Property[Value]Aliases.txt.
|
2004-04-03 07:19:23 +00:00
|
|
|
print "#define MAX_NAMES_PER_GROUP $max_names\n\n";
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
# Emit enumerated property values
|
|
|
|
for my $prop (sort keys %$h) {
|
|
|
|
next if ($prop =~ /^_/);
|
|
|
|
my $vh = $h->{$prop};
|
|
|
|
my $count = scalar keys %$vh;
|
|
|
|
|
|
|
|
print "const int32_t VALUES_${prop}_COUNT = ",
|
|
|
|
$count, ";\n\n";
|
|
|
|
|
|
|
|
print "const $ALIAS_CLASS VALUES_${prop}\[] = {\n";
|
|
|
|
for my $enum (sort keys %$vh) {
|
|
|
|
#my @names = split(/\|/, $vh->{$enum});
|
|
|
|
#die "Error: Wrong number of names for $prop:$enum in [" . join(",", @names) . "]"
|
|
|
|
# if (@names != 2);
|
|
|
|
print " $ALIAS_CLASS((int32_t) $enum, ", $vh->{$enum}, "),\n";
|
|
|
|
#$stringToID{$names[0]}, ", ",
|
|
|
|
#$stringToID{$names[1]}, "),\n";
|
|
|
|
# "\"", $names[0], "\", ",
|
|
|
|
# "\"", $names[1], "\"),\n";
|
|
|
|
}
|
|
|
|
print "};\n\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
#------------------------------------------------------------
|
|
|
|
# Emit top-level properties (binary, enumerated, etc.)
|
|
|
|
for my $topName (sort keys %TOP_PROPS) {
|
|
|
|
my $a = $TOP_PROPS{$topName};
|
|
|
|
my $count = 0;
|
|
|
|
for my $type (@$a) { # "_bp", "_ep", etc.
|
|
|
|
$count += scalar keys %{$h->{$type}};
|
|
|
|
}
|
|
|
|
|
|
|
|
print "const int32_t ${topName}PROPERTY_COUNT = $count;\n\n";
|
|
|
|
|
|
|
|
print "const $PROPERTY_CLASS ${topName}PROPERTY[] = {\n";
|
|
|
|
|
|
|
|
for my $type (@$a) { # "_bp", "_ep", etc.
|
|
|
|
my $p = $h->{$type};
|
|
|
|
|
|
|
|
for my $enum (sort keys %$p) {
|
|
|
|
my $name = $strings[$nameGroups[$p->{$enum}]];
|
|
|
|
|
|
|
|
my $valueRef = "0, NULL";
|
|
|
|
if ($type eq '_bp') {
|
|
|
|
$valueRef = "VALUES_binprop_COUNT, VALUES_binprop";
|
|
|
|
}
|
|
|
|
elsif (exists $h->{$name}) {
|
|
|
|
$valueRef = "VALUES_${name}_COUNT, VALUES_$name";
|
|
|
|
}
|
|
|
|
|
|
|
|
print " $PROPERTY_CLASS((int32_t) $enum, ",
|
|
|
|
$p->{$enum}, ", $valueRef),\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
print "};\n\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
print "/*eof*/\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read in the files uchar.h, uscript.h, Blocks.txt,
|
|
|
|
# PropertyAliases.txt, and PropertyValueAliases.txt,
|
|
|
|
# and combine them into one hash.
|
|
|
|
#
|
|
|
|
# @param directory containing headers
|
|
|
|
# @param directory containin Unicode data files
|
|
|
|
#
|
|
|
|
# @return hash ref, Unicode version
|
|
|
|
sub readAndMerge {
|
|
|
|
|
|
|
|
my ($headerDir, $unidataDir) = @_;
|
|
|
|
|
|
|
|
my $h = read_uchar("$headerDir/uchar.h");
|
|
|
|
my $s = read_uscript("$headerDir/uscript.h");
|
|
|
|
my $b = read_Blocks("$unidataDir/Blocks.txt");
|
2003-02-20 22:43:37 +00:00
|
|
|
my $pa = {};
|
|
|
|
read_PropertyAliases($pa, "$unidataDir/PropertyAliases.txt");
|
|
|
|
read_PropertyAliases($pa, "SyntheticPropertyAliases.txt");
|
2002-10-30 17:58:20 +00:00
|
|
|
my $va = read_PropertyValueAliases("$unidataDir/PropertyValueAliases.txt");
|
|
|
|
|
|
|
|
# Extract property family hash
|
|
|
|
my $fam = $pa->{'_family'};
|
|
|
|
delete $pa->{'_family'};
|
|
|
|
|
|
|
|
# Note: uscript.h has no version string, so don't check it
|
2003-06-04 00:02:16 +00:00
|
|
|
my $version = check_versions([ 'uchar.h', $h ],
|
|
|
|
[ 'Blocks.txt', $b ],
|
|
|
|
[ 'PropertyAliases.txt', $pa ],
|
|
|
|
[ 'PropertyValueAliases.txt', $va ]);
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
# Do this BEFORE merging; merging modifies the hashes
|
|
|
|
check_PropertyValueAliases($pa, $va);
|
|
|
|
|
|
|
|
# Dump out the $va hash for debugging
|
|
|
|
if ($DEBUG) {
|
|
|
|
print "Property values hash:\n";
|
|
|
|
for my $key (sort keys %$va) {
|
|
|
|
my $hh = $va->{$key};
|
|
|
|
for my $subkey (sort keys %$hh) {
|
|
|
|
print "$key:$subkey:", $hh->{$subkey}, "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Dump out the $s hash for debugging
|
|
|
|
if ($DEBUG) {
|
|
|
|
print "Script hash:\n";
|
|
|
|
for my $key (sort keys %$s) {
|
|
|
|
print "$key:", $s->{$key}, "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Link in the script data
|
|
|
|
$h->{'sc'} = $s;
|
|
|
|
|
|
|
|
merge_Blocks($h, $b);
|
|
|
|
|
|
|
|
merge_PropertyAliases($h, $pa, $fam);
|
|
|
|
|
|
|
|
merge_PropertyValueAliases($h, $va);
|
|
|
|
|
|
|
|
($h, $version);
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Ensure that the version strings in the given hashes (under the key
|
|
|
|
# '_version') are compatible. Currently this means they must be
|
|
|
|
# identical, with the exception that "X.Y" will match "X.Y.0".
|
|
|
|
# All hashes must define the key '_version'.
|
|
|
|
#
|
2003-06-04 00:02:16 +00:00
|
|
|
# @param a list of pairs of (file name, hash reference)
|
2002-10-30 17:58:20 +00:00
|
|
|
#
|
|
|
|
# @return the version of all the hashes. Upon return, the '_version'
|
|
|
|
# will be removed from all hashes.
|
|
|
|
sub check_versions {
|
|
|
|
my $version = '';
|
2003-06-04 00:02:16 +00:00
|
|
|
my $msg = '';
|
|
|
|
foreach my $a (@_) {
|
|
|
|
my $name = $a->[0];
|
|
|
|
my $h = $a->[1];
|
2002-10-30 17:58:20 +00:00
|
|
|
die "Error: No version found" unless (exists $h->{'_version'});
|
|
|
|
my $v = $h->{'_version'};
|
|
|
|
delete $h->{'_version'};
|
|
|
|
|
|
|
|
# append ".0" if necessary, to standardize to X.Y.Z
|
|
|
|
$v .= '.0' unless ($v =~ /\.\d+\./);
|
2003-03-04 00:39:21 +00:00
|
|
|
$v .= '.0' unless ($v =~ /\.\d+\./);
|
2003-06-04 00:02:16 +00:00
|
|
|
$msg .= "$name = $v\n";
|
2002-10-30 17:58:20 +00:00
|
|
|
if ($version) {
|
2003-06-04 00:02:16 +00:00
|
|
|
die "Error: Mismatched Unicode versions\n$msg"
|
2002-10-30 17:58:20 +00:00
|
|
|
unless ($version eq $v);
|
|
|
|
} else {
|
|
|
|
$version = $v;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
$version;
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Make sure the property names in PropertyValueAliases.txt match those
|
|
|
|
# in PropertyAliases.txt.
|
|
|
|
#
|
|
|
|
# @param a hash ref from read_PropertyAliases.
|
|
|
|
# @param a hash ref from read_PropertyValueAliases.
|
|
|
|
sub check_PropertyValueAliases {
|
|
|
|
my ($pa, $va) = @_;
|
|
|
|
|
|
|
|
# make a reverse hash of short->long
|
|
|
|
my %rev;
|
|
|
|
for (keys %$pa) { $rev{$pa->{$_}} = $_; }
|
|
|
|
|
|
|
|
for my $prop (keys %$va) {
|
|
|
|
if (!exists $rev{$prop} && !isPseudoProperty($prop)) {
|
|
|
|
print "Warning: Property $prop from PropertyValueAliases not listed in PropertyAliases\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Merge blocks data into uchar.h enum data. In the 'blk' subhash all
|
2003-03-04 19:54:27 +00:00
|
|
|
# code point values, as returned from read_uchar, are replaced by
|
2002-10-30 17:58:20 +00:00
|
|
|
# block names, as read from Blocks.txt and returned by read_Blocks.
|
|
|
|
# The match must be 1-to-1. If there is any failure of 1-to-1
|
2003-03-04 19:54:27 +00:00
|
|
|
# mapping, an error is signaled. Upon return, the read_Blocks hash
|
2002-10-30 17:58:20 +00:00
|
|
|
# is emptied of all contents, except for those that failed to match.
|
|
|
|
#
|
2003-03-04 19:54:27 +00:00
|
|
|
# The mapping in the 'blk' subhash, after this function returns, is
|
|
|
|
# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.h
|
|
|
|
# pseudo-name, e.g. "Basic Latin".
|
|
|
|
#
|
|
|
|
# @param a hash ref from read_uchar.
|
2002-10-30 17:58:20 +00:00
|
|
|
# @param a hash ref from read_Blocks.
|
|
|
|
sub merge_Blocks {
|
|
|
|
my ($h, $b) = @_;
|
|
|
|
|
|
|
|
die "Error: No blocks data in uchar.h"
|
|
|
|
unless (exists $h->{'blk'});
|
|
|
|
my $blk = $h->{'blk'};
|
|
|
|
for my $enum (keys %$blk) {
|
|
|
|
my $cp = $blk->{$enum};
|
2003-03-04 19:54:27 +00:00
|
|
|
if ($cp && !exists $b->{$cp}) {
|
|
|
|
die "Error: No block found at $cp in Blocks.txt";
|
|
|
|
}
|
|
|
|
# Convert code point to pseudo-name:
|
|
|
|
$blk->{$enum} = $b->{$cp};
|
2002-10-30 17:58:20 +00:00
|
|
|
delete $b->{$cp};
|
|
|
|
}
|
|
|
|
my $err = '';
|
|
|
|
for my $cp (keys %$b) {
|
|
|
|
$err .= "Error: Block " . $b->{$cp} . " not listed in uchar.h\n";
|
|
|
|
}
|
|
|
|
die $err if ($err);
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Merge property alias names into the uchar.h hash. The subhashes
|
|
|
|
# under the keys _* (b(inary, e(numerated, s(tring, d(ouble) are
|
|
|
|
# examined and the values of those subhashes are assumed to be long
|
|
|
|
# names in PropertyAliases.txt. They are validated and replaced by
|
|
|
|
# "<short>|<long>". Upon return, the read_PropertyAliases hash is
|
|
|
|
# emptied of all contents, except for those that failed to match.
|
|
|
|
# Unmatched names in PropertyAliases are listed as a warning but do
|
|
|
|
# NOT cause the script to die.
|
|
|
|
#
|
2003-03-04 19:54:27 +00:00
|
|
|
# @param a hash ref from read_uchar.
|
2002-10-30 17:58:20 +00:00
|
|
|
# @param a hash ref from read_PropertyAliases.
|
|
|
|
# @param a hash mapping long names to property family (e.g., 'binary')
|
|
|
|
sub merge_PropertyAliases {
|
|
|
|
my ($h, $pa, $fam) = @_;
|
|
|
|
|
|
|
|
for my $k (@TOP) {
|
|
|
|
die "Error: No properties data for $k in uchar.h"
|
|
|
|
unless (exists $h->{$k});
|
|
|
|
}
|
|
|
|
|
|
|
|
for my $subh (map { $h->{$_} } @TOP) {
|
|
|
|
for my $enum (keys %$subh) {
|
2005-03-11 23:31:44 +00:00
|
|
|
my $long_name = $subh->{$enum};
|
|
|
|
if (!exists $pa->{$long_name}) {
|
|
|
|
die "Error: Property $long_name not found (or used more than once)";
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2005-05-28 22:54:36 +00:00
|
|
|
my $value;
|
|
|
|
if($pa->{$long_name} =~ m|^n/a\d*$|) {
|
|
|
|
$value = $long_name;
|
|
|
|
} else {
|
|
|
|
$value = $pa->{$long_name} . "|" . $long_name;
|
|
|
|
}
|
2005-03-11 23:31:44 +00:00
|
|
|
if (exists $additional_property_aliases{$long_name}) {
|
|
|
|
$value .= "|" . $additional_property_aliases{$long_name};
|
|
|
|
}
|
|
|
|
$subh->{$enum} = $value;
|
|
|
|
delete $pa->{$long_name};
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
}
|
2005-03-11 23:31:44 +00:00
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
my @err;
|
|
|
|
for my $name (keys %$pa) {
|
2003-03-04 19:54:27 +00:00
|
|
|
$MISSING_FROM_UCHAR{$pa->{$name}} = 1;
|
|
|
|
if (exists $UNSUPPORTED{$name}) {
|
|
|
|
push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
|
|
|
|
} elsif (!isIgnoredProperty($name)) {
|
|
|
|
push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
print join("\n", sort @err), "\n" if (@err);
|
|
|
|
}
|
|
|
|
|
2003-03-04 19:54:27 +00:00
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Return 1 if two names match ignoring whitespace, '-', and '_'.
|
|
|
|
# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
|
|
|
|
# as of Unicode 4.0.
|
|
|
|
sub matchesLoosely {
|
|
|
|
my ($a, $b) = @_;
|
|
|
|
$a =~ s/[\s\-_]//g;
|
|
|
|
$b =~ s/[\s\-_]//g;
|
|
|
|
$a =~ /^$b$/i;
|
|
|
|
}
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Merge PropertyValueAliases.txt data into the uchar.h hash. All
|
|
|
|
# properties other than blk, _bp, and _ep are analyzed and mapped to
|
|
|
|
# the names listed in PropertyValueAliases. They are then replaced
|
|
|
|
# with a string of the form "<short>|<long>". The short or long name
|
|
|
|
# may be missing.
|
|
|
|
#
|
2003-03-04 19:54:27 +00:00
|
|
|
# @param a hash ref from read_uchar.
|
2002-10-30 17:58:20 +00:00
|
|
|
# @param a hash ref from read_PropertyValueAliases.
|
|
|
|
sub merge_PropertyValueAliases {
|
|
|
|
my ($h, $va) = @_;
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
my %gcCount;
|
2002-10-30 17:58:20 +00:00
|
|
|
for my $prop (keys %$h) {
|
|
|
|
# _bp, _ep handled in merge_PropertyAliases
|
2003-03-04 19:54:27 +00:00
|
|
|
next if ($prop =~ /^_/);
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
# Special case: gcm
|
|
|
|
my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
# find corresponding PropertyValueAliases data
|
|
|
|
die "Error: Can't find $prop in PropertyValueAliases.txt"
|
2003-01-22 18:10:43 +00:00
|
|
|
unless (exists $va->{$prop2});
|
|
|
|
my $pva = $va->{$prop2};
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
# match up data
|
|
|
|
my $hh = $h->{$prop};
|
|
|
|
for my $enum (keys %$hh) {
|
|
|
|
|
|
|
|
my $name = $hh->{$enum};
|
|
|
|
|
|
|
|
# look up both long and short & ignore case
|
|
|
|
my $n;
|
|
|
|
if (exists $pva->{$name}) {
|
|
|
|
$n = $name;
|
|
|
|
} else {
|
|
|
|
# iterate (slow)
|
|
|
|
for my $a (keys %$pva) {
|
|
|
|
# case-insensitive match
|
|
|
|
# & case-insensitive reverse match
|
|
|
|
if ($a =~ /^$name$/i ||
|
|
|
|
$pva->{$a} =~ /^$name$/i) {
|
|
|
|
$n = $a;
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2003-03-04 19:54:27 +00:00
|
|
|
|
|
|
|
# For blocks, do a loose match from Blocks.txt pseudo-name
|
|
|
|
# to PropertyValueAliases long name.
|
|
|
|
if (!$n && $prop eq 'blk') {
|
|
|
|
for my $a (keys %$pva) {
|
|
|
|
# The block is only going to match the long name,
|
|
|
|
# but we check both for completeness. As of Unicode
|
|
|
|
# 4.0, blocks do not have short names.
|
|
|
|
if (matchesLoosely($name, $pva->{$a}) ||
|
|
|
|
matchesLoosely($name, $a)) {
|
|
|
|
$n = $a;
|
|
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
die "Error: Property value $prop:$name not found" unless ($n);
|
|
|
|
|
|
|
|
my $l = $n;
|
|
|
|
my $r = $pva->{$n};
|
|
|
|
# convert |n/a\d+| to blank
|
2005-05-28 22:54:36 +00:00
|
|
|
$l = '' if ($l =~ m|^n/a\d*$|);
|
|
|
|
$r = '' if ($r =~ m|^n/a\d*$|);
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
$hh->{$enum} = "$l|$r";
|
2003-01-22 18:10:43 +00:00
|
|
|
# Don't delete the 'gc' properties because we need to share
|
|
|
|
# them between 'gc' and 'gcm'. Count each use instead.
|
|
|
|
if ($prop2 eq 'gc') {
|
|
|
|
++$gcCount{$n};
|
|
|
|
} else {
|
|
|
|
delete $pva->{$n};
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Merge the combining class values in manually
|
2004-04-07 02:57:06 +00:00
|
|
|
# Add the same values to the synthetic lccc and tccc properties
|
2002-10-30 17:58:20 +00:00
|
|
|
die "Error: No ccc data"
|
|
|
|
unless exists $va->{'ccc'};
|
|
|
|
for my $ccc (keys %{$va->{'ccc'}}) {
|
|
|
|
die "Error: Can't overwrite ccc $ccc"
|
|
|
|
if (exists $h->{'ccc'}->{$ccc});
|
2004-04-07 02:57:06 +00:00
|
|
|
$h->{'lccc'}->{$ccc} =
|
|
|
|
$h->{'tccc'}->{$ccc} =
|
2002-10-30 17:58:20 +00:00
|
|
|
$h->{'ccc'}->{$ccc} = $va->{'ccc'}->{$ccc};
|
|
|
|
}
|
|
|
|
delete $va->{'ccc'};
|
|
|
|
|
2003-03-04 19:54:27 +00:00
|
|
|
# Merge synthetic binary property values in manually.
|
|
|
|
# These are the "True" and "False" value aliases.
|
|
|
|
die "Error: No True/False value aliases"
|
2002-10-30 17:58:20 +00:00
|
|
|
unless exists $va->{'binprop'};
|
|
|
|
for my $bp (keys %{$va->{'binprop'}}) {
|
|
|
|
$h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
|
|
|
|
}
|
|
|
|
delete $va->{'binprop'};
|
|
|
|
|
|
|
|
my $err = '';
|
|
|
|
for my $prop (sort keys %$va) {
|
|
|
|
my $hh = $va->{$prop};
|
|
|
|
for my $subkey (sort keys %$hh) {
|
2003-01-22 18:10:43 +00:00
|
|
|
# 'gc' props are shared with 'gcm'; make sure they were used
|
|
|
|
# once or twice.
|
|
|
|
if ($prop eq 'gc') {
|
|
|
|
my $n = $gcCount{$subkey};
|
|
|
|
next if ($n >= 1 && $n <= 2);
|
|
|
|
}
|
2003-03-04 19:54:27 +00:00
|
|
|
$err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
|
|
|
|
unless exists $MISSING_FROM_UCHAR{$prop};
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
print $err if ($err);
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read the PropertyAliases.txt file. Return a hash that maps the long
|
|
|
|
# name to the short name. The special key '_version' will map to the
|
|
|
|
# Unicode version of the file. The special key '_family' holds a
|
|
|
|
# subhash that maps long names to a family string, for descriptive
|
|
|
|
# purposes.
|
|
|
|
#
|
|
|
|
# @param a filename for PropertyAliases.txt
|
2003-02-20 22:43:37 +00:00
|
|
|
# @param reference to hash to receive data. Keys are long names.
|
|
|
|
# Values are short names.
|
2002-10-30 17:58:20 +00:00
|
|
|
sub read_PropertyAliases {
|
|
|
|
|
2003-02-20 22:43:37 +00:00
|
|
|
my $hash = shift; # result
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2003-02-20 22:43:37 +00:00
|
|
|
my $filename = shift;
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
my $fam = {}; # map long names to family string
|
2003-02-20 22:43:37 +00:00
|
|
|
$fam = $hash->{'_family'} if (exists $hash->{'_family'});
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
my $family; # binary, enumerated, etc.
|
|
|
|
|
|
|
|
my $in = new FileHandle($filename, 'r');
|
|
|
|
die "Error: Cannot open $filename" if (!defined $in);
|
|
|
|
|
|
|
|
while (<$in>) {
|
|
|
|
|
|
|
|
# Read version (embedded in a comment)
|
|
|
|
if (/PropertyAliases-(\d+\.\d+\.\d+)/i) {
|
|
|
|
die "Error: Multiple versions in $filename"
|
|
|
|
if (exists $hash->{'_version'});
|
|
|
|
$hash->{'_version'} = $1;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Read family heading
|
|
|
|
if (/^\s*\#\s*(.+?)\s*Properties\s*$/) {
|
|
|
|
$family = $1;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Ignore comments and blank lines
|
|
|
|
s/\#.*//;
|
|
|
|
next unless (/\S/);
|
|
|
|
|
2005-03-11 23:31:44 +00:00
|
|
|
if (/^\s*(.+?)\s*;/) {
|
|
|
|
my $short = $1;
|
|
|
|
my @fields = /;\s*([^\s;]+)/g;
|
|
|
|
if (@fields < 1 || @fields > 2) {
|
|
|
|
my $number = @fields;
|
|
|
|
die "Error: Wrong number of fields ($number) in $filename at $_";
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
|
2005-03-11 23:31:44 +00:00
|
|
|
# Make "n/a" strings unique
|
2005-05-28 22:54:36 +00:00
|
|
|
if ($short eq 'n/a') {
|
|
|
|
$short .= sprintf("%03d", $propNA++);
|
|
|
|
}
|
2005-03-11 23:31:44 +00:00
|
|
|
my $long = $fields[0];
|
|
|
|
if ($long eq 'n/a') {
|
2005-05-28 22:54:36 +00:00
|
|
|
$long .= sprintf("%03d", $propNA++);
|
2005-03-11 23:31:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
# Add long name->short name to the hash=pa hash table
|
|
|
|
if (exists $hash->{$long}) {
|
|
|
|
die "Error: Duplicate property $long in $filename"
|
|
|
|
}
|
|
|
|
$hash->{$long} = $short;
|
|
|
|
$fam->{$long} = $family;
|
|
|
|
|
|
|
|
# Add the list of further aliases to the additional_property_aliases hash table,
|
|
|
|
# using the long property name as the key.
|
|
|
|
# For example:
|
|
|
|
# White_Space->space|outer_space
|
|
|
|
if (@fields > 1) {
|
|
|
|
my $value = pop @fields;
|
|
|
|
while (@fields > 1) {
|
|
|
|
$value .= "|" . pop @fields;
|
|
|
|
}
|
|
|
|
$additional_property_aliases{$long} = $value;
|
|
|
|
}
|
|
|
|
} else {
|
2002-10-30 17:58:20 +00:00
|
|
|
die "Error: Can't parse $_ in $filename";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$in->close();
|
|
|
|
|
2003-01-22 18:10:43 +00:00
|
|
|
$hash->{'_family'} = $fam;
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read the PropertyValueAliases.txt file. Return a two level hash
|
|
|
|
# that maps property_short_name:value_short_name:value_long_name. In
|
|
|
|
# the case of the 'ccc' property, the short name is the numeric class
|
|
|
|
# and the long name is "<short>|<long>". The special key '_version'
|
|
|
|
# will map to the Unicode version of the file.
|
|
|
|
#
|
|
|
|
# @param a filename for PropertyValueAliases.txt
|
|
|
|
#
|
|
|
|
# @return a hash reference.
|
|
|
|
sub read_PropertyValueAliases {
|
|
|
|
|
|
|
|
my $filename = shift;
|
|
|
|
|
|
|
|
my $hash = {}; # result
|
|
|
|
|
|
|
|
my $in = new FileHandle($filename, 'r');
|
|
|
|
die "Error: Cannot open $filename" if (!defined $in);
|
|
|
|
|
2005-05-28 22:54:36 +00:00
|
|
|
my $valueNA = 0; # Used to make "n/a" strings unique
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
while (<$in>) {
|
|
|
|
|
|
|
|
# Read version (embedded in a comment)
|
|
|
|
if (/PropertyValueAliases-(\d+\.\d+\.\d+)/i) {
|
|
|
|
die "Error: Multiple versions in $filename"
|
|
|
|
if (exists $hash->{'_version'});
|
|
|
|
$hash->{'_version'} = $1;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Ignore comments and blank lines
|
|
|
|
s/\#.*//;
|
|
|
|
next unless (/\S/);
|
|
|
|
|
|
|
|
if (/^\s*(.+?)\s*;/i) {
|
|
|
|
my $prop = $1;
|
|
|
|
my @fields = /;\s*([^\s;]+)/g;
|
2005-03-11 23:31:44 +00:00
|
|
|
die "Error: Wrong number of fields in $filename"
|
2002-10-30 17:58:20 +00:00
|
|
|
if (@fields < 2 || @fields > 3);
|
|
|
|
# Make "n/a" strings unique
|
2005-05-28 22:54:36 +00:00
|
|
|
$fields[0] .= sprintf("%03d", $valueNA++) if ($fields[0] eq 'n/a');
|
2002-10-30 17:58:20 +00:00
|
|
|
# Squash extra fields together
|
|
|
|
while (@fields > 2) {
|
|
|
|
my $f = pop @fields;
|
|
|
|
$fields[$#fields] .= '|' . $f;
|
|
|
|
}
|
|
|
|
addDatum($hash, $prop, @fields);
|
|
|
|
}
|
|
|
|
|
|
|
|
else {
|
|
|
|
die "Error: Can't parse $_ in $filename";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$in->close();
|
|
|
|
|
|
|
|
# Script Qaac (Coptic) is a special case. Handle it here. See UTR#24:
|
|
|
|
# http://www.unicode.org/unicode/reports/tr24/
|
|
|
|
$hash->{'sc'}->{'Qaac'} = 'Coptic'
|
2005-01-10 18:02:54 +00:00
|
|
|
unless (exists $hash->{'sc'}->{'Qaac'} || exists $hash->{'sc'}->{'Copt'});
|
2002-10-30 17:58:20 +00:00
|
|
|
|
|
|
|
# Add T|True and F|False -- these are values we recognize for
|
|
|
|
# binary properties (NOT from PropertyValueAliases.txt). These
|
|
|
|
# are of the same form as the 'ccc' value aliases.
|
|
|
|
$hash->{'binprop'}->{'0'} = 'F|False';
|
|
|
|
$hash->{'binprop'}->{'1'} = 'T|True';
|
|
|
|
|
|
|
|
$hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read the Blocks.txt file. Return a hash that maps the code point
|
|
|
|
# range start to the block name. The special key '_version' will map
|
|
|
|
# to the Unicode version of the file.
|
|
|
|
#
|
2003-03-04 19:54:27 +00:00
|
|
|
# As of Unicode 4.0, the names in the Blocks.txt are no longer the
|
|
|
|
# proper names. The proper names are now listed in PropertyValueAliases.
|
|
|
|
# They are similar but not identical. Furthermore, 4.0 introduces
|
|
|
|
# a new block name, No_Block, which is listed only in PropertyValueAliases
|
|
|
|
# and not in Blocks.txt. As a result, we handle blocks as follows:
|
|
|
|
#
|
|
|
|
# 1. Read Blocks.txt to map code point range start to quasi-block name.
|
|
|
|
# 2. Add to Blocks.txt a synthetic No Block code point & name:
|
|
|
|
# X -> No Block
|
|
|
|
# 3. Map quasi-names from Blocks.txt (including No Block) to actual
|
|
|
|
# names from PropertyValueAliases. This occurs in
|
|
|
|
# merge_PropertyValueAliases.
|
|
|
|
#
|
2002-10-30 17:58:20 +00:00
|
|
|
# @param a filename for Blocks.txt
|
|
|
|
#
|
|
|
|
# @return a ref to a hash. Keys are code points, as text, e.g.,
|
2003-03-04 19:54:27 +00:00
|
|
|
# "1720". Values are pseudo-block names, e.g., "Hanunoo".
|
2002-10-30 17:58:20 +00:00
|
|
|
sub read_Blocks {
|
|
|
|
|
|
|
|
my $filename = shift;
|
|
|
|
|
|
|
|
my $hash = {}; # result
|
|
|
|
|
|
|
|
my $in = new FileHandle($filename, 'r');
|
|
|
|
die "Error: Cannot open $filename" if (!defined $in);
|
|
|
|
|
|
|
|
while (<$in>) {
|
|
|
|
|
|
|
|
# Read version (embedded in a comment)
|
|
|
|
if (/Blocks-(\d+\.\d+\.\d+)/i) {
|
|
|
|
die "Error: Multiple versions in $filename"
|
|
|
|
if (exists $hash->{'_version'});
|
|
|
|
$hash->{'_version'} = $1;
|
|
|
|
}
|
|
|
|
|
|
|
|
# Ignore comments and blank lines
|
|
|
|
s/\#.*//;
|
|
|
|
next unless (/\S/);
|
|
|
|
|
|
|
|
if (/^([0-9a-f]+)\.\.[0-9a-f]+;\s*(.+?)\s*$/i) {
|
|
|
|
die "Error: Duplicate range $1 in $filename"
|
|
|
|
if (exists $hash->{$1});
|
|
|
|
$hash->{$1} = $2;
|
|
|
|
}
|
|
|
|
|
|
|
|
else {
|
|
|
|
die "Error: Can't parse $_ in $filename";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$in->close();
|
|
|
|
|
2003-03-04 19:54:27 +00:00
|
|
|
# Add pseudo-name for No Block
|
|
|
|
$hash->{'none'} = 'No Block';
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
$hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read the uscript.h file and compile a mapping of Unicode symbols to
|
|
|
|
# icu4c enum values.
|
|
|
|
#
|
|
|
|
# @param a filename for uscript.h
|
|
|
|
#
|
|
|
|
# @return a ref to a hash. The keys of the hash are enum symbols from
|
|
|
|
# uscript.h, and the values are script names.
|
|
|
|
sub read_uscript {
|
|
|
|
|
|
|
|
my $filename = shift;
|
|
|
|
|
|
|
|
my $mode = ''; # state machine mode and submode
|
|
|
|
my $submode = '';
|
|
|
|
|
|
|
|
my $last = ''; # for line folding
|
|
|
|
|
|
|
|
my $hash = {}; # result
|
|
|
|
my $key; # first-level key
|
|
|
|
|
|
|
|
my $in = new FileHandle($filename, 'r');
|
|
|
|
die "Error: Cannot open $filename" if (!defined $in);
|
|
|
|
|
|
|
|
while (<$in>) {
|
|
|
|
# Fold continued lines together
|
|
|
|
if (/^(.*)\\$/) {
|
|
|
|
$last = $1;
|
|
|
|
next;
|
|
|
|
} elsif ($last) {
|
|
|
|
$_ = $last . $_;
|
|
|
|
$last = '';
|
|
|
|
}
|
|
|
|
|
|
|
|
# Exit all modes here
|
|
|
|
if ($mode && $mode ne 'DEPRECATED') {
|
|
|
|
if (/^\s*\}/) {
|
|
|
|
$mode = '';
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Handle individual modes
|
|
|
|
|
|
|
|
if ($mode eq 'UScriptCode') {
|
|
|
|
if (m|^\s*(USCRIPT_\w+).+?/\*\s*(\w+)|) {
|
|
|
|
my ($enum, $code) = ($1, $2);
|
|
|
|
die "Error: Duplicate script $enum"
|
|
|
|
if (exists $hash->{$enum});
|
|
|
|
$hash->{$enum} = $code;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'DEPRECATED') {
|
|
|
|
if (/\s*\#ifdef/) {
|
|
|
|
die "Error: Nested #ifdef";
|
|
|
|
}
|
|
|
|
elsif (/\s*\#endif/) {
|
|
|
|
$mode = '';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (!$mode) {
|
|
|
|
if (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
|
|
|
|
/^\s*typedef\s+enum\s+(\w+)\s*$/) {
|
|
|
|
$mode = $1;
|
|
|
|
#print "Parsing $mode\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
|
|
|
|
$mode = 'DEPRECATED';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$in->close();
|
|
|
|
|
|
|
|
$hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Read the uchar.h file and compile a mapping of Unicode symbols to
|
|
|
|
# icu4c enum values.
|
|
|
|
#
|
|
|
|
# @param a filename for uchar.h
|
|
|
|
#
|
|
|
|
# @return a ref to a hash. The keys of the hash are '_bp' for binary
|
2003-01-22 18:10:43 +00:00
|
|
|
# properties, '_ep' for enumerated properties, '_dp'/'_sp'/'_mp' for
|
|
|
|
# double/string/mask properties, and 'gc', 'gcm', 'bc', 'blk',
|
2002-10-30 17:58:20 +00:00
|
|
|
# 'ea', 'dt', 'jt', 'jg', 'lb', or 'nt' for corresponding property
|
|
|
|
# value aliases. The values of the hash are subhashes. The subhashes
|
|
|
|
# have a key of the uchar.h enum symbol, and a value of the alias
|
|
|
|
# string (as listed in PropertyValueAliases.txt). NOTE: The alias
|
|
|
|
# string is whatever alias uchar.h lists. This may be either short or
|
|
|
|
# long, depending on the specific enum. NOTE: For blocks ('blk'), the
|
|
|
|
# value is a hex code point for the start of the associated block.
|
|
|
|
# NOTE: The special key _version will map to the Unicode version of
|
|
|
|
# the file.
|
|
|
|
sub read_uchar {
|
|
|
|
|
|
|
|
my $filename = shift;
|
|
|
|
|
|
|
|
my $mode = ''; # state machine mode and submode
|
|
|
|
my $submode = '';
|
|
|
|
|
|
|
|
my $last = ''; # for line folding
|
|
|
|
|
|
|
|
my $hash = {}; # result
|
|
|
|
my $key; # first-level key
|
|
|
|
|
|
|
|
my $in = new FileHandle($filename, 'r');
|
|
|
|
die "Error: Cannot open $filename" if (!defined $in);
|
|
|
|
|
|
|
|
while (<$in>) {
|
|
|
|
# Fold continued lines together
|
|
|
|
if (/^(.*)\\$/) {
|
|
|
|
$last .= $1;
|
|
|
|
next;
|
|
|
|
} elsif ($last) {
|
|
|
|
$_ = $last . $_;
|
|
|
|
$last = '';
|
|
|
|
}
|
|
|
|
|
|
|
|
# Exit all modes here
|
|
|
|
if ($mode && $mode ne 'DEPRECATED') {
|
|
|
|
if (/^\s*\}/) {
|
|
|
|
$mode = '';
|
|
|
|
next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
# Handle individual modes
|
|
|
|
|
|
|
|
if ($mode eq 'UProperty') {
|
|
|
|
if (/^\s*(UCHAR_\w+)\s*[,=]/ || /^\s+(UCHAR_\w+)\s*$/) {
|
|
|
|
if ($submode) {
|
|
|
|
addDatum($hash, $key, $1, $submode);
|
|
|
|
$submode = '';
|
|
|
|
} else {
|
|
|
|
#print "Warning: Ignoring $1\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (m|^\s*/\*\*\s*(\w+)\s+property\s+(\w+)|i) {
|
|
|
|
die "Error: Unmatched tag $submode" if ($submode);
|
|
|
|
die "Error: Unrecognized UProperty comment: $_"
|
2003-01-22 18:10:43 +00:00
|
|
|
unless (exists $PROP_TYPE{$1});
|
|
|
|
$key = $PROP_TYPE{$1};
|
2002-10-30 17:58:20 +00:00
|
|
|
$submode = $2;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UCharCategory') {
|
2003-01-22 18:10:43 +00:00
|
|
|
if (/^\s*(U_\w+)\s*=/) {
|
|
|
|
if ($submode) {
|
|
|
|
addDatum($hash, 'gc', $1, $submode);
|
|
|
|
$submode = '';
|
|
|
|
} else {
|
|
|
|
#print "Warning: Ignoring $1\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (m|^\s*/\*\*\s*([A-Z][a-z])\s|) {
|
|
|
|
die "Error: Unmatched tag $submode" if ($submode);
|
|
|
|
$submode = $1;
|
|
|
|
}
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UCharDirection') {
|
|
|
|
if (/^\s*(U_\w+)\s*[,=]/ || /^\s+(U_\w+)\s*$/) {
|
|
|
|
if ($submode) {
|
|
|
|
addDatum($hash, $key, $1, $submode);
|
|
|
|
$submode = '';
|
|
|
|
} else {
|
|
|
|
#print "Warning: Ignoring $1\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (m|/\*\*\s*([A-Z]+)\s|) {
|
|
|
|
die "Error: Unmatched tag $submode" if ($submode);
|
|
|
|
$key = 'bc';
|
|
|
|
$submode = $1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UBlockCode') {
|
|
|
|
if (m|^\s*(UBLOCK_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'blk', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UEastAsianWidth') {
|
|
|
|
if (m|^\s*(U_EA_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'ea', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UDecompositionType') {
|
|
|
|
if (m|^\s*(U_DT_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'dt', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UJoiningType') {
|
|
|
|
if (m|^\s*(U_JT_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'jt', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UJoiningGroup') {
|
|
|
|
if (/^\s*(U_JG_(\w+))/) {
|
|
|
|
addDatum($hash, 'jg', $1, $2) unless ($2 eq 'COUNT');
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-01-10 18:02:54 +00:00
|
|
|
elsif ($mode eq 'UGraphemeClusterBreak') {
|
|
|
|
if (m|^\s*(U_GCB_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'GCB', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UWordBreakValues') {
|
|
|
|
if (m|^\s*(U_WB_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'WB', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'USentenceBreak') {
|
|
|
|
if (m|^\s*(U_SB_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'SB', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
elsif ($mode eq 'ULineBreak') {
|
|
|
|
if (m|^\s*(U_LB_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'lb', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($mode eq 'UNumericType') {
|
|
|
|
if (m|^\s*(U_NT_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'nt', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-03-08 01:53:11 +00:00
|
|
|
elsif ($mode eq 'UHangulSyllableType') {
|
|
|
|
if (m|^\s*(U_HST_\w+).+?/\*\[(.+?)\]\*/|) {
|
|
|
|
addDatum($hash, 'hst', $1, $2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
elsif ($mode eq 'DEPRECATED') {
|
|
|
|
if (/\s*\#ifdef/) {
|
|
|
|
die "Error: Nested #ifdef";
|
|
|
|
}
|
|
|
|
elsif (/\s*\#endif/) {
|
|
|
|
$mode = '';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (!$mode) {
|
|
|
|
if (/^\s*\#define\s+(\w+)\s+(.+)/) {
|
|
|
|
# #define $left $right
|
|
|
|
my ($left, $right) = ($1, $2);
|
|
|
|
|
|
|
|
if ($left eq 'U_UNICODE_VERSION') {
|
|
|
|
my $version = $right;
|
2003-06-04 00:02:16 +00:00
|
|
|
$version = $1 if ($version =~ /^\"(.*)\"/);
|
2002-10-30 17:58:20 +00:00
|
|
|
# print "Unicode version: ", $version, "\n";
|
|
|
|
die "Error: Multiple versions in $filename"
|
|
|
|
if (defined $hash->{'_version'});
|
|
|
|
$hash->{'_version'} = $version;
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif ($left =~ /U_GC_(\w+?)_MASK/) {
|
2003-01-22 18:10:43 +00:00
|
|
|
addDatum($hash, 'gcm', $left, $1);
|
2002-10-30 17:58:20 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (/^\s*typedef\s+enum\s+(\w+)\s*\{/ ||
|
|
|
|
/^\s*typedef\s+enum\s+(\w+)\s*$/) {
|
|
|
|
$mode = $1;
|
|
|
|
#print "Parsing $mode\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (/^\s*enum\s+(\w+)\s*\{/ ||
|
|
|
|
/^\s*enum\s+(\w+)\s*$/) {
|
|
|
|
$mode = $1;
|
|
|
|
#print "Parsing $mode\n";
|
|
|
|
}
|
|
|
|
|
|
|
|
elsif (/^\s*\#ifdef\s+ICU_UCHAR_USE_DEPRECATES\b/) {
|
|
|
|
$mode = 'DEPRECATED';
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
$in->close();
|
|
|
|
|
2004-04-07 00:28:39 +00:00
|
|
|
# hardcode known values for the normalization quick check properties
|
|
|
|
# see unorm.h for the UNormalizationCheckResult enum
|
|
|
|
|
|
|
|
addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
|
|
|
|
addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
|
|
|
|
addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
|
|
|
|
|
|
|
|
addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
|
|
|
|
addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
|
|
|
|
addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
|
|
|
|
|
|
|
|
# no "maybe" values for NF[K]D
|
|
|
|
|
|
|
|
addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
|
|
|
|
addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');
|
|
|
|
|
|
|
|
addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
|
|
|
|
addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');
|
|
|
|
|
2002-10-30 17:58:20 +00:00
|
|
|
$hash;
|
|
|
|
}
|
|
|
|
|
|
|
|
#----------------------------------------------------------------------
|
|
|
|
# Add a new value to a two-level hash. That is, given a ref to
|
|
|
|
# a hash, two keys, and a value, add $hash->{$key1}->{$key2} = $value.
|
|
|
|
sub addDatum {
|
|
|
|
my ($h, $k1, $k2, $v) = @_;
|
|
|
|
if (exists $h->{$k1}->{$k2}) {
|
|
|
|
die "Error: $k1:$k2 already set to " .
|
|
|
|
$h->{$k1}->{$k2} . ", cannot set to " . $v;
|
|
|
|
}
|
|
|
|
$h->{$k1}->{$k2} = $v;
|
|
|
|
}
|
|
|
|
|
|
|
|
#eof
|