scuffed-code/icu4c/source/tools/gentz/tzparse.pm

######################################################################
# Copyright (C) 1999-2001, International Business Machines
# Corporation and others.  All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This package handles the parsing of time zone files.
# Author: Alan Liu
######################################################################
# Usage:
# Call ParseFile for each file to be imported.  Then call ParseZoneTab
# to add country data.  Then call Postprocess to remove unused rules.

package TZ;
use strict;
use Carp;
use vars qw(@ISA @EXPORT $VERSION $YEAR $STANDARD);
require 'dumpvar.pl';

@ISA = qw(Exporter);
@EXPORT = qw(ParseFile
             Postprocess
             ParseZoneTab
             );
$VERSION = '0.2';

$STANDARD = '-'; # Name of the Standard Time rule

######################################################################
# Read the tzdata zone.tab file and add a {country} field to zones
# in the given hash.
# Param: File name (<dir>/zone.tab)
# Param: Ref to hash of zones
# Param: Ref to hash of links
sub ParseZoneTab {
    my ($FILE, $ZONES, $LINKS) = @_;

    my %linkEntries;

    local(*FILE);
    open(FILE,"<$FILE") or confess "Can't open $FILE: $!";
    while (<FILE>) {
        # Handle comments
        s/\#.*//;
        next if (!/\S/);

        if (/^\s*([A-Z]{2})\s+[-+0-9]+\s+(\S+)/) {
            my ($country, $zone) = ($1, $2);
            if (exists $ZONES->{$zone}) {
                $ZONES->{$zone}->{country} = $country;
            } elsif (exists $LINKS->{$zone}) {
                # We have a country mapping for a zone that isn't in
                # our hash.  This means it is a link entry.  Save this
                # then handle it below.
                $linkEntries{$zone} = $country;
            } else {
                print STDERR "Nonexistent zone $zone in $FILE\n";
            }
        } else {
            confess "Can't parse line \"$_\" of $FILE";
        }
    }
    close(FILE);

    # Now that we have mapped all of the zones in %$ZONES (except
    # those without country affiliations), process the link entries.
    # For those zones in the table that differ by country from their
    # source zone, instantiate a new zone in the new country.  An
    # example is Europe/Vatican, which is linked to Europe/Rome.  If
    # we don't instantiate it, we have nothing for Vatican City.
    # Another example is America/Shiprock, which links to
    # America/Denver.  These are identical and both in the US, so we
    # don't instantiate America/Shiprock.
    foreach my $zone (keys %linkEntries) {
        my $country = $linkEntries{$zone};
        my $linkZone = $LINKS->{$zone};
        my $linkCountry = $ZONES->{$linkZone}->{country};
        if ($linkCountry ne $country) {
            # print "Cloning $zone ($country) from $linkZone ($linkCountry)\n";
            _CloneZone($ZONES, $LINKS->{$zone}, $zone);
            $ZONES->{$zone}->{country} = $country;
        }
    }
}

######################################################################
# Param: File name
# Param: Ref to hash of zones
# Param: Ref to hash of rules
# Parma: Ref to hash of links
# Param: Current year
sub ParseFile {
    my ($FILE, $ZONES, $RULES, $LINKS, $YEAR) = @_;

    local(*FILE);
    open(FILE,"<$FILE") or confess "Can't open $FILE: $!";
    my $zone; # Current zone
    my $badLineCount = 0;
    while (<FILE>) {
        # Handle comments and blanks
        s/\#.*//;
        next if (!/\S/);

        #|# Zone NAME           GMTOFF  RULES   FORMAT  [UNTIL]
        #|Zone America/Montreal -4:54:16 -      LMT     1884
        #|                      -5:00   Mont    E%sT
        #|Zone America/Thunder_Bay -5:57:00 -   LMT     1895
        #|                      -5:00   Canada  E%sT    1970
        #|                      -5:00   Mont    E%sT    1973
        #|                      -5:00   -       EST     1974
        #|                      -5:00   Canada  E%sT
        my ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil);
        if (/^zone/i) {
            # Zone block start
            if (/^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/i
                || /^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)()/i) {
                $zone = $1;
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($2, $3, $4, $5);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^\s/ && $zone) {
            # Zone continuation
            if (/^\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/
                || /^\s+(\S+)\s+(\S+)\s+(\S+)()/) {
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($1, $2, $3, $4);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^rule/i) {
            # Here is where we parse a single line of the rule table.
            # Our goal is to accept only rules applying to the current
            # year.  This is normally a matter of accepting rules
            # that match the current year.  However, in some cases this
            # is more complicated.  For example:
            #|# Tonga
            #|# Rule NAME FROM TO  TYPE IN  ON      AT    SAVE LETTER/S
            #|Rule  Tonga 1999 max -    Oct Sat>=1  2:00s 1:00 S
            #|Rule  Tonga 2000 max -    Apr Sun>=16 2:00s 0    -
            # To handle this properly, we save every rule we encounter
            # (thus overwriting older ones with newer ones, since rules
            # are listed in order), and also use slot [2] to mark when
            # we see a current year rule.  When that happens, we stop
            # saving rules.  Thus we match the latest rule we see, or
            # a matching rule if we find one.  The format of slot [2]
            # is just a 2 bit flag ([2]&1 means slot [0] matched,
            # [2]&2 means slot [1] matched).

            # Note that later, when the rules are post processed
            # (see Postprocess), the slot [2] will be overwritten
            # with the compressed rule string used to implement
            # equality testing.

            $zone = undef;
            # Rule
            #|# Rule NAME FROM TO   TYPE IN  ON      AT   SAVE LETTER/S
            #|Rule   US   1918 1919 -    Mar lastSun 2:00 1:00 W # War
            #|Rule   US   1918 1919 -    Oct lastSun 2:00 0    S
            #|Rule   US   1942 only -    Feb 9       2:00 1:00 W # War
            #|Rule   US   1945 only -    Sep 30      2:00 0    S
            #|Rule   US   1967 max  -    Oct lastSun 2:00 0    S
            #|Rule   US   1967 1973 -    Apr lastSun 2:00 1:00 D
            #|Rule   US   1974 only -    Jan 6       2:00 1:00 D
            #|Rule   US   1975 only -    Feb 23      2:00 1:00 D
            #|Rule   US   1976 1986 -    Apr lastSun 2:00 1:00 D
            #|Rule   US   1987 max  -    Apr Sun>=1  2:00 1:00 D
            if (/^rule\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+
                (\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/xi) {
                my ($name, $from, $to, $type, $in, $on, $at, $save, $letter) =
                    ($1, $2, $3, $4, $5, $6, $7, $8, $9);
                my $i = $save ? 0:1;

                if (!exists $RULES->{$name}) {
                    $RULES->{$name} = [];
                }
                my $ruleArray = $RULES->{$name};

                # Check our bit mask to see if we've already matched
                # a current rule.  If so, do nothing.  If not, then
                # save this rule line as the best one so far.
                if (@{$ruleArray} < 3 ||
                    !($ruleArray->[2] & 1 << $i)) {
                    my $h = $ruleArray->[$i];
                    $ruleArray->[$i]->{from} = $from;
                    $ruleArray->[$i]->{to} = $to;
                    $ruleArray->[$i]->{type} = $type;
                    $ruleArray->[$i]->{in} = $in;
                    $ruleArray->[$i]->{on} = $on;
                    $ruleArray->[$i]->{at} = $at;
                    $ruleArray->[$i]->{save} = $save;
                    $ruleArray->[$i]->{letter} = $letter;

                    # Does this rule match the current year?  If so,
                    # set the bit mask so we don't overwrite this rule.
                    # This makes us ingore rules for subsequent years
                    # that are already listed in the database -- as long
                    # as we have an overriding rule for the current year.
                    if (($from == $YEAR && $to =~ /only/i) ||
                        ($from <= $YEAR &&
                         (($to =~ /^\d/ && $YEAR <= $to) || $to =~ /max/i))) {
                        $ruleArray->[2] |= 1 << $i;
                        $ruleArray->[3] |= 1 << $i;
                    }
                }
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^link/i) {
            #|# Old names, for S5 users
            #|
            #|# Link    LINK-FROM               LINK-TO
            #|Link      America/New_York        EST5EDT
            #|Link      America/Chicago         CST6CDT
            #|Link      America/Denver          MST7MDT
            #|Link      America/Los_Angeles     PST8PDT
            #|Link      America/Indianapolis    EST
            #|Link      America/Phoenix         MST
            #|Link      Pacific/Honolulu        HST
            #
            # There are also links for country-specific zones.
            # These are zones the differ only in that they belong
            # to a different country.  E.g.,
            #|Link	Europe/Rome	Europe/Vatican
            #|Link	Europe/Rome	Europe/San_Marino
            if (/^link\s+(\S+)\s+(\S+)/i) {
                my ($from, $to) = ($1, $2);
                # Record all links in $%LINKS
                $LINKS->{$to} = $from;
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } else {
            # Unexpected line
            print STDERR "Ignoring in $FILE: $_";
            ++$badLineCount;
        }
        if ($zoneRule &&
            ($zoneUntil !~ /\S/ || ($zoneUntil =~ /^\d/ &&
                                    $zoneUntil >= $YEAR))) {
            $ZONES->{$zone}->{gmtoff} = $zoneGmtoff;
            $ZONES->{$zone}->{rule} = $zoneRule;
            $ZONES->{$zone}->{format} = $zoneFormat;
            $ZONES->{$zone}->{until} = $zoneUntil;
        }
    }
    close(FILE);
}

######################################################################
# Param: Ref to hash of zones
# Param: Ref to hash of rules
sub Postprocess {
    my ($ZONES, $RULES) = @_;
    my %ruleInUse;

# We no longer store links in the zone hash, so we don't need to do this.
#    # Eliminate zone links that have no corresponding zone
#    foreach (keys %$ZONES) {
#        if (exists $ZONES->{$_}->{link} && !exists $ZONES->{$_}->{rule}) {
#            if (0) {
#                print STDERR
#                    "Deleting link from historical/nonexistent zone: ",
#                    $_, " -> ", $ZONES->{$_}->{link}, "\n";
#            }
#            delete $ZONES->{$_};
#        }
#    }

    # Check that each zone has a corresponding rule.  At the same
    # time, build up a hash that marks each rule that is in use.
    foreach (sort keys %$ZONES) {
        my $ruleName = $ZONES->{$_}->{rule};
        next if ($ruleName eq $STANDARD);
        if (exists $RULES->{$ruleName}) {
            $ruleInUse{$ruleName} = 1;
        } else {
            # This means the zone is using the standard rule now
            $ZONES->{$_}->{rule} = $STANDARD;
        }
    }

    # Check that both parts are there for rules
    # Check for unused rules
    # Make coded string for comparisons
    foreach (keys %$RULES) {
        if (!exists $ruleInUse{$_}) {
            if (0) {
                print STDERR "Deleting historical/unused rule: $_\n";
            }
            delete $RULES->{$_};
        } elsif (!$RULES->{$_}->[0] || !$RULES->{$_}->[1]) {
            print STDERR "Rule doesn't have both parts: $_\n";
        } else {
            # Generate coded string
            # This has all the data about a rule; it can be used
            # to see if two rules behave identically
            $RULES->{$_}->[2] =
                lc($RULES->{$_}->[0]->{in} . "," .
                   $RULES->{$_}->[0]->{on} . "," .
                   $RULES->{$_}->[0]->{at} . "," .
                   $RULES->{$_}->[0]->{save} . ";" .
                   $RULES->{$_}->[1]->{in} . "," .
                   $RULES->{$_}->[1]->{on} . "," .
                   $RULES->{$_}->[1]->{at}); # [1]->{save} is always zero
        }
    }
}

######################################################################
# Create a clone of the zone $oldID named $newID in the hash $ZONES.
# Param: ref to hash of zones
# Param: ID of zone to clone
# Param: ID of new zone
sub _CloneZone {
    my $ZONES = shift;
    my $oldID = shift;
    my $newID = shift;
    for my $field (keys %{$ZONES->{$oldID}}) {
        $ZONES->{$newID}->{$field} = $ZONES->{$oldID}->{$field};
    }
}