scuffed-code/tools/colprobe/extractCollationData.pl

252 lines
5.4 KiB
Perl
Raw Normal View History

#!/usr/bin/perl
#
# Copyright (C) 2017 and later: Unicode, Inc. and others.
# License & terms of use: http://www.unicode.org/copyright.html
#
use strict;
use Unicode::UCD 'charinfo';
use Unicode::Normalize;
use utf8;
use open ':utf8';
my $printout = 0;
my $braces = 0;
my $colls = 0;
my $aliased = 0;
my $newName = "";
my $filename;
my $suffix;
my $locale;
NEW_FILE:
foreach my $arg (@ARGV) {
if($newName =~ /^$/) {
$locale = $arg;
$locale =~ s#^.*/##g;
$locale =~ s/\.txt//;
} else {
$newName = "";
}
my $command = "/home/weiv/build/current/bin/uconv -x hex-any/Java -f utf8 -t utf8 $arg";
print $command."\n";
my @bundle = `$command`;
foreach $_ (@bundle) {
#while(<>) {
#print $ARGV if eof;
if(/^\/\//) {
next;
}
if(/collations/) {
print "found Collations\n";
$colls = 1;
if(/alias/) {
print "collations are aliased\n";
$aliased = 1;
}
}
if($aliased) {
print "processing aliased data: $_\n";
if(/\{/) {
print "Braces opened\n";
$braces = 1;
}
if($braces && /\"(.*)\"/) {
$newName = $1;
print "Aliasing to $newName\n";
}
if($braces && /\}/) {
$braces = 0;
print "Braces closed\n";
$aliased = 0;
print "Switching from $filename to $newName\n";
$arg =~ s/$locale\.txt$/$newName\.txt/;
print "$arg\n";
redo NEW_FILE;
}
}
if(/standard|phonebook|traditional|pinyin|stroke|direct/ && $colls) {
print "found $& collation\n";
$suffix = "_".uc($&);
if(/standard/) {
$suffix = "";
}
}
if(/Sequence/ && $colls) {
#binmode ARGV, ":utf8";
$printout = 1;
#$filename = $ARGV;
$filename = $locale;
if($suffix) {
$filename .= "_".$suffix;
}
$filename .= "_collation.html";
print "filename is $filename\n";
#open(OUT, ">:utf8", "$filename");
open(OUT, ">$filename");
printHeading($arg);
#next;
}
my $line = $_;
if($line =~ /\{/ && $printout) {
$braces++;
}
if($printout) {
print OUT processLine($line);
print OUT "\n";
}
if( $line =~ /\}/ && $printout) {
$braces--;
if($braces == 0) {
$printout = 0;
printFooting();
close(OUT);
}
}
}
}
sub processLine {
my $line = shift;
$_ = $line;
my $i = 0;
my $j = 0;
my $result;
# remove comments
s#//.*$##g;
# remove "Sequence" if present
s/Sequence\s*//;
# remove leading brace if present
s/^\s*{//;
# remove trailing brace if present
s/}\s*$//;
# remove trailing quote
s/"\s*$//;
#remove lead quote
s/^\s*"//;
#separate options
s/(\[.*\])/\n\1/g;
#separate resets
s/\s*\&\s*/\n\& /g;
#separate strengths and insert spaces
s/\s*(<{1,4})\s*/\n\1 /g;
#separate equals and insert spaces
s/\s*=\s*/\n= /g;
# break into individual reset/strength/setting lines
my @lines = split(/\n/);
my $line;
my $name;
my $spanEnd = "";
my $result = "";
my $names = "";
my $codes = "";
my $lrm = "";
foreach $line (@lines) {
# skip empty lines
if($line =~ /^$/) {
next;
}
$spanEnd = "";
$name = "";
$lrm = "";
$line = NFC($line);
# for resets and strengths we will get name for elements
if($line =~ /<{1,4} |= |& \[.*\]|& /) {
$name = "<span title=\"";
$names = "";
$codes = "";
my $start = $&;
my $rest = $';
for ($j = 0; $j < length($rest); $j++) {
my $char = substr($rest, $j, 1);
my $charVal = ord($char);
# some of elements are part of the syntax, so they are
# entered without translation to the name
if($charVal == 0x002F || $charVal == 0x007C) {
$name .= $codes.$names." $char ";
$codes = "";
$names = "";
} elsif($charVal == 0x0027) { #quote requires more processing
#$name .= "'";
} else {
my $charinfo = charinfo($charVal);
$codes .= $charinfo->{'code'}." ";
$names .= "{".$charinfo->{'name'}."} ";
if($charinfo->{'bidi'} eq "R" || $charinfo->{'bidi'} eq "AL") {
$lrm = "&lrm;";
}
#$name .= $charinfo->{'code'}." {".$charinfo->{'name'}."} ";
}
}
$name .= $codes.$names."\" >";
$spanEnd = "</span>";
}
#print $name."\n";
if($line =~ /^<<<</) {
$line = " $line";
} elsif($line =~ /^<<</) {
$line = " $line";
} elsif($line =~ /^<</) {
$line = " $line";
} elsif($line =~ /^</) {
$line = " $line";
} elsif($line =~ /^=/) {
$line = " $line";
}
# insert spaces around vertical bars (fix prefixes)
# insert spaces around slashes (fix expansions)
$line =~ s#/# / #g;
# replace &
$line =~ s/\&/&amp;/g;
# replace spaces
$line =~ s/ /&nbsp;/g;
# replace <
$line =~ s/</&lt;/g;
# replace >
$line =~ s/>/&gt;/g;
#$lines[$i] = $name.$lrm.$line."</span><br>";
#$i++;
$result .= $name.$lrm.$line.$spanEnd."<br>\n";
}
#$_ = join("\n", @lines);
return $result;
}
sub printHeading {
my $filename = shift;
$filename =~ s/\.txt//;
print OUT <<"EndOfHeading";
<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=utf-8">
</head>
# Collation data resource bundle generated for locale: $filename<br>
# For platform icu reference platform UCA<br><br>
$filename&nbsp;{<br>
&nbsp;&nbsp;CollationElements&nbsp;{<br>
&nbsp;&nbsp;&nbsp;&nbsp;Sequence&nbsp;{<br>
EndOfHeading
}
sub printFooting {
print OUT <<"EndOfFooting";
&nbsp;&nbsp;&nbsp;&nbsp;}<br>
&nbsp;&nbsp;}<br>
}<br>
</pre>
</html>
EndOfFooting
}