#!/usr/bin/perl use strict; use Unicode::UCD 'charinfo'; use Unicode::Normalize; use utf8; use open ':utf8'; my $printout = 0; my $braces = 0; my $colls = 0; my $aliased = 0; my $newName = ""; my $filename; my $suffix; my $locale; NEW_FILE: foreach my $arg (@ARGV) { if($newName =~ /^$/) { $locale = $arg; $locale =~ s#^.*/##g; $locale =~ s/\.txt//; } else { $newName = ""; } my $command = "/home/weiv/build/current/bin/uconv -x hex-any/Java -f utf8 -t utf8 $arg"; print $command."\n"; my @bundle = `$command`; foreach $_ (@bundle) { #while(<>) { #print $ARGV if eof; if(/^\/\//) { next; } if(/collations/) { print "found Collations\n"; $colls = 1; if(/alias/) { print "collations are aliased\n"; $aliased = 1; } } if($aliased) { print "processing aliased data: $_\n"; if(/\{/) { print "Braces opened\n"; $braces = 1; } if($braces && /\"(.*)\"/) { $newName = $1; print "Aliasing to $newName\n"; } if($braces && /\}/) { $braces = 0; print "Braces closed\n"; $aliased = 0; print "Switching from $filename to $newName\n"; $arg =~ s/$locale\.txt$/$newName\.txt/; print "$arg\n"; redo NEW_FILE; } } if(/standard|phonebook|traditional|pinyin|stroke|direct/ && $colls) { print "found $& collation\n"; $suffix = "_".uc($&); if(/standard/) { $suffix = ""; } } if(/Sequence/ && $colls) { #binmode ARGV, ":utf8"; $printout = 1; #$filename = $ARGV; $filename = $locale; if($suffix) { $filename .= "_".$suffix; } $filename .= "_collation.html"; print "filename is $filename\n"; #open(OUT, ">:utf8", "$filename"); open(OUT, ">$filename"); printHeading($arg); #next; } my $line = $_; if($line =~ /\{/ && $printout) { $braces++; } if($printout) { print OUT processLine($line); print OUT "\n"; } if( $line =~ /\}/ && $printout) { $braces--; if($braces == 0) { $printout = 0; printFooting(); close(OUT); } } } } sub processLine { my $line = shift; $_ = $line; my $i = 0; my $j = 0; my $result; # remove comments s#//.*$##g; # remove "Sequence" if present s/Sequence\s*//; # remove leading brace if present s/^\s*{//; 
# ---------------------------------------------------------------------------
# Reviewer documentation (comments only; no code on any line was changed).
#
# NOTE(review): the text of this file looks damaged. Statements that would
# normally sit on separate lines share one physical line (so a "#" comment
# swallows the code that follows it), and processLine() contains fragments
# that are not complete statements: `$name = "{'code'}." ";` uses $charinfo
# with no visible declaration, and `if($line =~ /^<<< $line =~ s/>/>/g;` has
# an unterminated pattern. Presumably line breaks and some markup-like text
# were lost in a conversion step; restore from a pristine copy -- TODO confirm.
#
# Main script, as far as the visible code shows:
#   * For every path in @ARGV, $locale is the path with its directory part
#     and ".txt" removed. The path is fed through a hard-coded uconv binary
#     ("-x hex-any/Java -f utf8 -t utf8") via backticks and the output lines
#     are scanned one by one; lines starting with "//" are skipped.
#   * A line containing "collations" sets $colls. If that line also contains
#     "alias", alias mode starts: after a "{" the quoted string on a line
#     becomes $newName, and at "}" the "<locale>.txt" tail of $arg is replaced
#     by "<newName>.txt" and the same iteration is restarted (redo NEW_FILE).
#     On that restart only $newName is cleared; $locale is not recomputed, so
#     output file names keep the locale of the originally requested file.
#   * A line naming a collation type (standard, phonebook, traditional,
#     pinyin, stroke, direct) sets $suffix to "_" plus the upper-cased name,
#     or to "" for "standard".
#   * A line containing "Sequence" (with $colls set) opens
#     "<locale>[_<suffix>]_collation.html" on the OUT handle and calls
#     printHeading($arg). From that line on, every line is passed through
#     processLine() and printed to OUT until the count of "{" and "}" lines
#     returns to 0; then printFooting() is called and OUT is closed.
#
# NOTE(review), items visible in the code above that deserve a second look:
#   * $suffix already starts with "_" and the file-name code prepends another
#     "_", so suffixed names contain "__" -- confirm whether that is intended.
#   * The result of open(OUT, ">$filename") is not checked.
#   * $colls and $suffix are never reset between input files.
#   * $filename can still be unset when "Switching from ..." is printed.
#   * $arg is interpolated into the shell command, and $locale/$newName into
#     the s/// pattern and replacement, without any quoting.
#
# processLine($line): returns a string built from one bundle line. It removes
#   "//" comments, the word "Sequence", a leading "{", a trailing "}" and the
#   surrounding double quotes; it then inserts a newline before bracketed
#   options, "&" resets, "<" .. "<<<<" strengths and "=" so that the text can
#   be split into individual rules. Empty pieces are skipped and every other
#   piece is NFC-normalised before it is appended to the result. The code
#   that fills $name/$codes/$names from charinfo data is only partly visible
#   (see the first note). The replacement parts use \1 where Perl recommends $1.
#
# printHeading($filename): strips ".txt" from its argument and prints the
#   heading heredoc to OUT. The caller passes $arg, i.e. the path as given on
#   the command line, not the bare locale name.
#
# printFooting(): prints the closing-brace heredoc to OUT.
# ---------------------------------------------------------------------------
# remove trailing brace if present s/}\s*$//; # remove trailing quote s/"\s*$//; #remove lead quote s/^\s*"//; #separate options s/(\[.*\])/\n\1/g; #separate resets s/\s*\&\s*/\n\& /g; #separate strengths and insert spaces s/\s*(<{1,4})\s*/\n\1 /g; #separate equals and insert spaces s/\s*=\s*/\n= /g; # break into individual reset/strength/setting lines my @lines = split(/\n/); my $line; my $name; my $spanEnd = ""; my $result = ""; my $names = ""; my $codes = ""; my $lrm = ""; foreach $line (@lines) { # skip empty lines if($line =~ /^$/) { next; } $spanEnd = ""; $name = ""; $lrm = ""; $line = NFC($line); # for resets and strengths we will get name for elements if($line =~ /<{1,4} |= |& \[.*\]|& /) { $name = "{'code'}." "; $names .= "{".$charinfo->{'name'}."} "; if($charinfo->{'bidi'} eq "R" || $charinfo->{'bidi'} eq "AL") { $lrm = "‎"; } #$name .= $charinfo->{'code'}." {".$charinfo->{'name'}."} "; } } $name .= $codes.$names."\" >"; $spanEnd = ""; } #print $name."\n"; if($line =~ /^<<< $line =~ s/>/>/g; #$lines[$i] = $name.$lrm.$line."
"; #$i++; $result .= $name.$lrm.$line.$spanEnd."
\n"; } #$_ = join("\n", @lines); return $result; } sub printHeading { my $filename = shift; $filename =~ s/\.txt//; print OUT <<"EndOfHeading"; # Collation data resource bundle generated for locale: $filename
# For platform icu reference platform UCA

$filename {
  CollationElements {
    Sequence {
EndOfHeading } sub printFooting { print OUT <<"EndOfFooting";     }
  }
}
EndOfFooting }