ICU-2478 update to take 4.0 block names from PropertyValueAliases incl. No_Block; other enhancements
X-SVN-Rev: 11239
This commit is contained in:
parent
f03db87d08
commit
162ab9a922
@ -1,9 +1,9 @@
|
||||
#!/bin/perl -w
|
||||
# ********************************************************************
|
||||
# * COPYRIGHT:
|
||||
# * Copyright (c) 2002, International Business Machines Corporation and
|
||||
# * others. All Rights Reserved.
|
||||
# ********************************************************************
|
||||
#*******************************************************************
|
||||
# COPYRIGHT:
|
||||
# Copyright (c) 2002-2003, International Business Machines Corporation and
|
||||
# others. All Rights Reserved.
|
||||
#*******************************************************************
|
||||
|
||||
# This script reads in UCD files PropertyAliases.txt and
|
||||
# PropertyValueAliases.txt and correlates them with ICU enums
|
||||
@ -38,6 +38,13 @@
|
||||
# same directory as this script. Its contents are merged into those
|
||||
# of PropertyAliases.txt as if the two files were appended.
|
||||
#
|
||||
# NOTE: The following names are handled specially. See script below
|
||||
# for details.
|
||||
#
|
||||
# T/True
|
||||
# F/False
|
||||
# No_Block
|
||||
#
|
||||
# Author: Alan Liu
|
||||
# Created: October 14 2002
|
||||
# Since: ICU 2.4
|
||||
@ -75,6 +82,30 @@ my %PROP_TYPE = (Binary => "_bp",
|
||||
Bitmask => "_mp");
|
||||
#----------------------------------------------------------------------
|
||||
|
||||
# Properties that are unsupported in ICU
|
||||
# Keys are long property names; every listed name maps to 1, so
# membership is tested with exists().
my %UNSUPPORTED = map { $_ => 1 } qw(
    Composition_Exclusion
    Decomposition_Mapping
    Expands_On_NFC
    Expands_On_NFD
    Expands_On_NFKC
    Expands_On_NFKD
    FC_NFKC_Closure
    Hangul_Syllable_Type
    ID_Start_Exceptions
    NFC_Quick_Check
    NFD_Quick_Check
    NFKC_Quick_Check
    NFKD_Quick_Check
    Special_Case_Condition
);
|
||||
|
||||
# Short names of properties that weren't seen in uchar.h. If the
|
||||
# properties weren't seen, don't complain about the property values
|
||||
# missing.
|
||||
my %MISSING_FROM_UCHAR;
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
|
||||
# Emitted class names
|
||||
my ($STRING_CLASS, $ALIAS_CLASS, $PROPERTY_CLASS) = qw(AliasName Alias Property);
|
||||
|
||||
@ -140,7 +171,9 @@ sub isIgnoredProperty {
|
||||
}
|
||||
|
||||
# 'qc' is a pseudo-property matching any quick-check property
|
||||
# see PropertyValueAliases.txt file comments
|
||||
# see PropertyValueAliases.txt file comments. 'binprop' is
|
||||
# a synthetic binary value alias "True"/"False", not present
|
||||
# in PropertyValueAliases.txt.
|
||||
sub isPseudoProperty {
|
||||
$_[0] eq 'qc' ||
|
||||
$_[0] eq 'binprop';
|
||||
@ -473,13 +506,17 @@ sub check_PropertyValueAliases {
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Merge blocks data into uchar.h enum data. In the 'blk' subhash all
|
||||
# code point values, as returned from read_uhash, are replaced by
|
||||
# code point values, as returned from read_uchar, are replaced by
|
||||
# block names, as read from Blocks.txt and returned by read_Blocks.
|
||||
# The match must be 1-to-1. If there is any failure of 1-to-1
|
||||
# mapping, an error is signalled. Upon return, the read_Blocks hash
|
||||
# mapping, an error is signaled. Upon return, the read_Blocks hash
|
||||
# is emptied of all contents, except for those that failed to match.
|
||||
#
|
||||
# @param a hash ref from read_uhash.
|
||||
# The mapping in the 'blk' subhash, after this function returns, is
|
||||
# from uchar.h enum name, e.g. "UBLOCK_BASIC_LATIN", to Blocks.txt
|
||||
# pseudo-name, e.g. "Basic Latin".
|
||||
#
|
||||
# @param a hash ref from read_uchar.
|
||||
# @param a hash ref from read_Blocks.
|
||||
sub merge_Blocks {
|
||||
my ($h, $b) = @_;
|
||||
@ -489,10 +526,11 @@ sub merge_Blocks {
|
||||
my $blk = $h->{'blk'};
|
||||
for my $enum (keys %$blk) {
|
||||
my $cp = $blk->{$enum};
|
||||
die "Error: No block found at $cp in Blocks.txt"
|
||||
unless (exists $b->{$cp});
|
||||
# Convert code point to name:
|
||||
$blk->{$enum} = '|' . $b->{$cp}; # no short names for blocks
|
||||
if ($cp && !exists $b->{$cp}) {
|
||||
die "Error: No block found at $cp in Blocks.txt";
|
||||
}
|
||||
# Convert code point to pseudo-name:
|
||||
$blk->{$enum} = $b->{$cp};
|
||||
delete $b->{$cp};
|
||||
}
|
||||
my $err = '';
|
||||
@ -512,7 +550,7 @@ sub merge_Blocks {
|
||||
# Unmatched names in PropertyAliases are listed as a warning but do
|
||||
# NOT cause the script to die.
|
||||
#
|
||||
# @param a hash ref from read_uhash.
|
||||
# @param a hash ref from read_uchar.
|
||||
# @param a hash ref from read_PropertyAliases.
|
||||
# @param a hash mapping long names to property family (e.g., 'binary')
|
||||
sub merge_PropertyAliases {
|
||||
@ -535,12 +573,27 @@ sub merge_PropertyAliases {
|
||||
}
|
||||
my @err;
|
||||
for my $name (keys %$pa) {
|
||||
push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h"
|
||||
unless isIgnoredProperty($name);
|
||||
$MISSING_FROM_UCHAR{$pa->{$name}} = 1;
|
||||
if (exists $UNSUPPORTED{$name}) {
|
||||
push @err, "Info: No enum for " . $fam->{$name} . " property $name in uchar.h";
|
||||
} elsif (!isIgnoredProperty($name)) {
|
||||
push @err, "Warning: No enum for " . $fam->{$name} . " property $name in uchar.h";
|
||||
}
|
||||
}
|
||||
print join("\n", sort @err), "\n" if (@err);
|
||||
}
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Return 1 if two names match ignoring case, whitespace, '-', and '_'.
|
||||
# Used to match names in Blocks.txt with those in PropertyValueAliases.txt
|
||||
# as of Unicode 4.0.
|
||||
sub matchesLoosely {
    # Loose name comparison: both names are stripped of whitespace,
    # '-', and '_', and then compared case-insensitively as literal
    # text.
    #
    # @param the first name
    # @param the second name
    # @return true (1) if the stripped names are equal ignoring case,
    # false otherwise.

    # Deliberately not named $a/$b: those are the package variables
    # used by sort() and should not be shadowed by lexicals.
    my ($lhs, $rhs) = @_;

    # Remove the characters that are ignored for matching purposes.
    s/[\s\-_]//g for ($lhs, $rhs);

    # \Q...\E quotes $rhs so that any regex metacharacters in a name
    # (e.g. '.', '+', '(') are compared literally rather than being
    # interpreted as pattern syntax.
    return $lhs =~ /\A\Q$rhs\E\z/i;
}
|
||||
|
||||
#----------------------------------------------------------------------
|
||||
# Merge PropertyValueAliases.txt data into the uchar.h hash. All
|
||||
# properties other than blk, _bp, and _ep are analyzed and mapped to
|
||||
@ -548,16 +601,15 @@ sub merge_PropertyAliases {
|
||||
# with a string of the form "<short>|<long>". The short or long name
|
||||
# may be missing.
|
||||
#
|
||||
# @param a hash ref from read_uhash.
|
||||
# @param a hash ref from read_uchar.
|
||||
# @param a hash ref from read_PropertyValueAliases.
|
||||
sub merge_PropertyValueAliases {
|
||||
my ($h, $va) = @_;
|
||||
|
||||
my %gcCount;
|
||||
for my $prop (keys %$h) {
|
||||
# blk handled in merge_Blocks
|
||||
# _bp, _ep handled in merge_PropertyAliases
|
||||
next if ($prop eq 'blk' || $prop =~ /^_/);
|
||||
next if ($prop =~ /^_/);
|
||||
|
||||
# Special case: gcm
|
||||
my $prop2 = ($prop eq 'gcm') ? 'gc' : $prop;
|
||||
@ -589,6 +641,21 @@ sub merge_PropertyValueAliases {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# For blocks, do a loose match from Blocks.txt pseudo-name
|
||||
# to PropertyValueAliases long name.
|
||||
if (!$n && $prop eq 'blk') {
|
||||
for my $a (keys %$pva) {
|
||||
# The block is only going to match the long name,
|
||||
# but we check both for completeness. As of Unicode
|
||||
# 4.0, blocks do not have short names.
|
||||
if (matchesLoosely($name, $pva->{$a}) ||
|
||||
matchesLoosely($name, $a)) {
|
||||
$n = $a;
|
||||
last;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
die "Error: Property value $prop:$name not found" unless ($n);
|
||||
|
||||
@ -619,8 +686,9 @@ sub merge_PropertyValueAliases {
|
||||
}
|
||||
delete $va->{'ccc'};
|
||||
|
||||
# Merge synthetic binary property values in manually
|
||||
die "Error: No synthetic binary properties"
|
||||
# Merge synthetic binary property values in manually.
|
||||
# These are the "True" and "False" value aliases.
|
||||
die "Error: No True/False value aliases"
|
||||
unless exists $va->{'binprop'};
|
||||
for my $bp (keys %{$va->{'binprop'}}) {
|
||||
$h->{'binprop'}->{$bp} = $va->{'binprop'}->{$bp};
|
||||
@ -637,7 +705,8 @@ sub merge_PropertyValueAliases {
|
||||
my $n = $gcCount{$subkey};
|
||||
next if ($n >= 1 && $n <= 2);
|
||||
}
|
||||
$err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n";
|
||||
$err .= "Warning: Enum for value $prop:$subkey not found in uchar.h\n"
|
||||
unless exists $MISSING_FROM_UCHAR{$prop};
|
||||
}
|
||||
}
|
||||
print $err if ($err);
|
||||
@ -777,10 +846,23 @@ sub read_PropertyValueAliases {
|
||||
# range start to the block name. The special key '_version' will map
|
||||
# to the Unicode version of the file.
|
||||
#
|
||||
# As of Unicode 4.0, the names in the Blocks.txt are no longer the
|
||||
# proper names. The proper names are now listed in PropertyValueAliases.
|
||||
# They are similar but not identical. Furthermore, 4.0 introduces
|
||||
# a new block name, No_Block, which is listed only in PropertyValueAliases
|
||||
# and not in Blocks.txt. As a result, we handle blocks as follows:
|
||||
#
|
||||
# 1. Read Blocks.txt to map code point range start to quasi-block name.
|
||||
# 2. Add to Blocks.txt a synthetic No Block code point & name:
|
||||
# X -> No Block
|
||||
# 3. Map quasi-names from Blocks.txt (including No Block) to actual
|
||||
# names from PropertyValueAliases. This occurs in
|
||||
# merge_PropertyValueAliases.
|
||||
#
|
||||
# @param a filename for Blocks.txt
|
||||
#
|
||||
# @return a ref to a hash. Keys are code points, as text, e.g.,
|
||||
# "1720". Values are block names, e.g., "Hanunoo".
|
||||
# "1720". Values are pseudo-block names, e.g., "Hanunoo".
|
||||
sub read_Blocks {
|
||||
|
||||
my $filename = shift;
|
||||
@ -816,6 +898,9 @@ sub read_Blocks {
|
||||
|
||||
$in->close();
|
||||
|
||||
# Add pseudo-name for No Block
|
||||
$hash->{'none'} = 'No Block';
|
||||
|
||||
$hash;
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user