scuffed-code/icu4c/source/tools/gentz/tz.pl

#!/usr/bin/perl
######################################################################
# Copyright (C) 1999, International Business Machines
# Corporation and others.  All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This script takes time zone data in elsie.nci.nih.gov format and
# parses it into a form usable by ICU.  The raw data contains more
# data than we need, since it contains historical zone data.  We
# parse out the current zones and create a listing of current zones.
# Author: Alan Liu
######################################################################
# This script reads an alias table, $TZ_ALIAS, and creates clones of
# standard UNIX zones with alias names.
######################################################################
# To update the zone data, download the latest data from the NIH URL
# listed above into a directory.  Run this script with the directory
# name as an argument.  THE DIRECTORY NAME MUST END IN tzdataYYYYR.
######################################################################
# OUTPUT FILE FORMAT (filename $OUT)
#
# As a matter of policy, this script wants to do as much of
# the parsing, data processing, and error checking as possible,
# leaving the C++ program that parses this file to just do the binary
# translation step.
#
# - The file is line based, with one record per line.
# - Lines may be followed by a comment; the parser must ignore
#   anything of the form /\s+#.*$/ in each line.
#   |3065,14400 # Asia/Dubai GMT+4:00
# - The file contains a header and 5 lists.
# - The header contains the version of the unix data, the total
#   zone count, the maximum number of zones sharing the same value
#   of gmtOffset, the length of the name table in bytes, and
#   the length of the longest name (not including the terminating
#   zero byte).
#   | 1999 # (tzdata1999j) version of Olson zone
#   | 10 #  data from ftp://elsie.nci.nih.gov
#   | 387 # total zone count
#   | 40 # max count of zones with same gmtOffset
#   | 25 # max name length not incl final zero
#   | 5906 # length of name table in bytes
# - Lists start with a count of the records to follow, the records
#   themselves (one per line), and a single line with the keyword
#   'end'.
# - The first list is the list of standard zones:
#   | 208 # count of standard zones to follow
#   | 0,0 # Africa/Abidjan GMT+0:00
#   | 28,10800 # Africa/Addis_Ababa GMT+3:00
#   ...
#   | end
#   Each standard zone record contains two integers.  The first
#   is a byte offset into the name table for the name of the zone.
#   The second integer is the GMT offset in SECONDS for this zone.
# - The second list is the list of DST zones:
#   | 179 # count of dst zones to follow
#   | 15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
#   | 184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
#   ...
#   | end
#   Each record starts with the same two integers as a standard
#   zone record.  Following this are data for the onset rule and
#   the cease rule.  Each rule is described by the following integers:
#     month (JAN = 0)
#     dowim } These two values are in SimpleTimeZone encoded
#     dow   } format for DOM, DOWIM, DOW>=DOM, or DOW<=DOM.
#     time MINUTES
#     time mode ('w', 's', 'u')
#   The last integer in the record is the DST savings in MINUTES,
#   typically 60.
# - The third list is the name table:
#   | 387 # count of names to follow
#   | Africa/Abidjan
#   | Africa/Accra
#   ...
#   | end
#   Each name is terminated by a newline (like all lines in the file).
#   The offsets in the first two lists refer to this table.
# - The fourth list is an index list by name.  The index entries
#   themselves are of the form /[sd]\d+/, where the first character
#   indicates standard or DST, and the number that follows indexes
#   into the correpsonding array.
#   | 416 # count of name index table entries to follow
#   | d0 # ACT
#   | d1 # AET
#   | d2 # AGT
#   | d3 # ART
#   | d4 # AST
#   | s0 # Africa/Abidjan
#   ...
#   | end
# - The fifth list is an index by GMT offset.  Each line lists the
#   zones with the same offset.  The first number on the line
#   is the GMT offset in seconds.  The second number is the default
#   zone number in the following list, taken from tz.default.  The
#   third number is the count
#   of zone numbers to follow.  Each zone number is an integer from
#   0..n-1, where n is the total number of zones.  The zone numbers
#   refer to the zone list in alphabetical order.
#   | 39 # index by offset entries to follow
#   | -43200,280,1,280 # -12:00 d=Etc/GMT+12 Etc/GMT+12
#   | -39600,374,6,279,366,374,394,396,399 # -11:00 d=Pacific/Apia Etc/GMT+11 MIT Pacific/Apia Pacific/Midway Pacific/Niue Pacific/Pago_Pago
#   ...
#   | end
######################################################################
# As of 1999j, here are the various possible values taken by the
# rule fields.  See code below that generates this data.
#
# at: 0:00, 0:00s, 1:00, 1:00s, 1:00u, 23:00s, 2:00, 2:00s, 2:30, 2:45s,
#     3:00, 3:00s
# in: Apr, Dec, Feb, Jan, Jun, Mar, May, Nov, Oct, Sep
# letter: -, D, GHST, GMT, HS, S, SLST
# on: 1, 12, 15, 18, 2, 20, 21, 22, 23, 25, 28, 3, 30, 31, 4, 7, Fri>=1,
#     Fri>=15, Sat>=1, Sat>=15, Sun<=14, Sun>=1, Sun>=10, Sun>=11, Sun>=15,
#     Sun>=16, Sun>=23, S un>=8, Sun>=9, lastFri, lastSun, lastThu
# save: 0, 0:20, 0:30, 1:00
# type: -

require 5; # Minimum version of perl needed
use strict;
use Getopt::Long;
use vars qw(@FILES $YEAR $DATA_DIR $OUT $SEP @MONTH
            $VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION
            $TZ_ALIAS $TZ_DEFAULT);
require 'dumpvar.pl';
use tzparse;
use tzutil;

# File names
$OUT = 'tz.txt';
$TZ_ALIAS = 'tz.alias';
$TZ_DEFAULT = 'tz.default';

# Separator between fields in the output file
$SEP = ','; # Don't use ':'!

@FILES = qw(africa
            antarctica
            asia
            australasia
            backward
            etcetera
            europe
            factory
            northamerica
            pacificnew
            solar87
            solar88
            solar89
            southamerica);

# We get the current year from the system here.  Later
# we double check this against the zone data version.
$YEAR = 1900+@{[localtime]}[5]; # Get the current year

$DATA_DIR = shift;

if (!$DATA_DIR || ! -d $DATA_DIR) {
    print STDERR "No data directory or invalid directory specified\n\n";
    usage();
}

if ($DATA_DIR =~ /(tzdata(\d{4})(\w?))/) {
    $RAW_VERSION = $1;
    $VERSION_YEAR = $2;
    $VERSION_SUFFIX = $3;
    if ($YEAR != $VERSION_YEAR) {
        print STDERR "WARNING: You appear to be building $VERSION_YEAR data. Don't you want to use current $YEAR data?\n";
        usage(); # Add an override option for this check, if needed
    }
    $VERSION_SUFFIX =~ tr/a-z/A-Z/;
    if ($VERSION_SUFFIX =~ /[A-Z]/) {
        $VERSION_SUFFIX = ord($VERSION_SUFFIX) - ord('A') + 1;
    } else {
        if ($VERSION_SUFFIX) {
            print STDERR "Warning: Ignoring version suffix '$VERSION_SUFFIX' for \"$DATA_DIR\"\n";
        }
        $VERSION_SUFFIX = 0;
    }
    print "Time zone version $RAW_VERSION = $VERSION_YEAR($VERSION_SUFFIX)\n";
} else {
    print STDERR "The directory specified doesn't contain \"tzdataNNNNR\", so I can't tell what version the data is.  Please rename the directory and try again.\n";
    usage();
}


@MONTH = qw(jan feb mar apr may jun
            jul aug sep oct nov dec);

main();

sub usage {
    print STDERR "Usage: $0 data_dir\n\n";
    print STDERR "data_dir contains the unpacked files from\n";
    print STDERR "ftp://elsie.nci.nih.gov/pub/tzdataYYYYR,\n";
    print STDERR "where YYYY is the year and R is the revision\n";
    print STDERR "letter.\n";
    print STDERR "\n";
    print STDERR "Files that are expected to be present are:\n";
    print STDERR join(", ", @FILES), "\n";
    exit 1;
}

sub main {
    my (%ZONES, %RULES, @EQUIV);

    print "Reading";
    foreach (@FILES) {
        if (! -e "$DATA_DIR/$_") {
            print STDERR "\nMissing file $DATA_DIR/$_\n\n";
            usage();
        }
        print ".";
        TZ::ParseFile("$DATA_DIR/$_", \%ZONES, \%RULES, $YEAR);
    }
    print "done\n";

    TZ::Postprocess(\%ZONES, \%RULES);

    incorporateAliases($TZ_ALIAS, \%ZONES);

    print
        "Read ", scalar keys %ZONES, " current zones and ",
        scalar keys %RULES, " rules for $YEAR\n";

    # Make sure we have a zone named GMT from either the
    # UNIX data or the alias table.  If not, add one.
    if (!exists $ZONES{GMT}) {
        print "Adding GMT zone\n";
        my %GMT = ('format' => 'GMT',
                   'gmtoff' => '0:00',
                   'rule' => $TZ::STANDARD,
                   'until' => '');
        $ZONES{GMT} = \%GMT;
    }

    # Write out the zone data in a compact readable format.

    # Create a name table from the zone names.  The format of
    # the name table is:
    #
    # The names are listed in lexical order, and each name
    # is assigned an offset.  The first name's offset is 0.
    # The offset of name i+1 is the offset of name i + the
    # length of name i + 1 (for the zero byte).
    #
    # Store the offsets in a hash %NAME_OFFSET.  Store the
    # names in a big scalar, $NAME_LIST, with "\n" between
    # each name and after the last.
    #
    # Store the length of the entire name table in $NAME_SIZE.
    #
    # Also, count the number of standard and DST zones.
    my $offset = 0;
    my $NAME_LIST = '';
    my %NAME_OFFSET;
    my $STD_COUNT = 0; # Count of standard zones
    my $DST_COUNT = 0; # Count of DST zones
    my $maxNameLen = 0;
    foreach my $z (sort keys %ZONES) {
        # Make sure zone IDs only contain invariant chars
        assertInvariantChars($z);

        my $len = length($z);
        $NAME_OFFSET{$z} = $offset;
        $offset += $len + 1;
        $NAME_LIST .= "$z\n";
        $maxNameLen = $len if ($len > $maxNameLen);
        if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
            $STD_COUNT++;
        } else {
            $DST_COUNT++;
        }
    }
    my $NAME_SIZE = $offset;

    # Find the maximum number of zones with the same value of
    # gmtOffset.
    my %perOffset; # Hash of offset -> count
    foreach my $z (keys %ZONES) {
        # Use parseOffset to normalize values - probably unnecessary
        ++$perOffset{parseOffset($ZONES{$z}->{gmtoff})};
    }
    my $maxPerOffset = 0;
    foreach (values %perOffset) {
        $maxPerOffset = $_ if ($_ > $maxPerOffset);
    }

    open(OUT,">$OUT") or die "Can't open $OUT for writing: $!";

    ############################################################
    # EMIT HEADER
    ############################################################
    # Zone data version
    print OUT $VERSION_YEAR, " # ($RAW_VERSION) version of Olson zone\n";
    print OUT $VERSION_SUFFIX, " #  data from ftp://elsie.nci.nih.gov\n";
    print OUT scalar keys %ZONES, " # total zone count\n";
    print OUT $maxPerOffset, " # max count of zones with same gmtOffset\n";
    print OUT $maxNameLen, " # max name length not incl final zero\n";
    print OUT $NAME_SIZE, " # length of name table in bytes\n";

    ############################################################
    # EMIT ZONE TABLES
    ############################################################
    # Output first the standard zones, then the dst zones.
    # Precede each list with the count of zones to follow,
    # and follow it with the keyword 'end'.
    for my $type (qw(standard dst)) {
        print OUT ($type eq 'standard'
            ? $STD_COUNT : $DST_COUNT), " # count of $type zones to follow\n";
        foreach my $z (sort keys %ZONES) {
            my $isStd = ($ZONES{$z}->{rule} eq $TZ::STANDARD);
            next if ($isStd ne ($type eq 'standard'));
            print OUT $NAME_OFFSET{$z}, ",";
            print OUT formatZone($z, $ZONES{$z}, \%RULES), "\n";
        }
        print OUT "end\n"; # 'end' keyword for error checking
    }

    ############################################################
    # EMIT NAME TABLE
    ############################################################
    # Output the name table, followed by 'end' keyword
    print OUT scalar keys %ZONES, " # count of names to follow\n";
    print OUT $NAME_LIST, "end\n";

    ############################################################
    # EMIT INDEX BY NAME
    ############################################################
    # Output the name index table.  Since we don't know structure
    # sizes, we output the index number of each zone.  For example,
    # "s0" is the first standard zone, "s1" is the second, etc.
    # Likewise, "d0" is the first DST zone, "d1" is the second, etc.

    # First compute index IDs, as described above.
    my %indexID;
    my $s = 0;
    my $d = 0;
    foreach my $z (sort keys %ZONES) {
        if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
            $indexID{$z} = "s$s";
            $s++;
        } else {
            $indexID{$z} = "d$d";
            $d++;
        }
    }

    # Now emit table sorted by name
    print OUT scalar keys %ZONES, " # count of name index table entries to follow\n";
    foreach my $z (sort keys %ZONES) {
        print OUT $indexID{$z}, " # $z\n";
    }
    print OUT "end\n";

    ############################################################
    # EMIT INDEX BY GMT OFFSET
    ############################################################
    # Create a hash mapping zone name -> integer, from 0..n-1.
    # Create an array mapping zone number -> name.
    my %zoneNumber;
    my @zoneName;
    my $i = 0;
    foreach (sort keys %ZONES) {
        $zoneName[$i] = $_;
        $zoneNumber{$_} = $i++;
    }

    # Create a hash by index.  The hash has offset integers as keys
    # and arrays of index numbers as values.
    my %offsetMap;
    foreach (sort keys %ZONES) {
        my $offset = parseOffset($ZONES{$_}->{gmtoff});
        push @{$offsetMap{$offset}}, $zoneNumber{$_};
    }

    # Select defaults.  We do this by reading the file $TZ_DEFAULT.
    # If there are multiple errors, we want to report them all,
    # so we set a flag and die at the end if there are problems.
    my %defaults; # key=offset integer, value=zone #
    my $ok = 1;
    open(IN, $TZ_DEFAULT) or die "Can't open $TZ_DEFAULT: $!";
    while (<IN>) {
        my $raw = $_;
        s/\#.*//; # Trim comments
        next unless (/\S/); # Skip blank lines
        if (/^\s*(\S+)\s*$/) {
            my $z = $1;
            if (! exists $ZONES{$z}) {
                print "Error: Nonexistent zone $z listed in $TZ_DEFAULT line: $raw";
                $ok = 0;
                next;
            }
            my $offset = parseOffset($ZONES{$z}->{gmtoff});
            if (exists $defaults{$offset}) {
                print
                    "Error: Offset ", formatOffset($offset), " has both ",
                    $zoneName[$defaults{$offset}], " and ", $z,
                    " specified as defaults\n";
                $ok = 0;
                next;
            }
            $defaults{$offset} = $zoneNumber{$z};
        } else {
            print "Error: Can't parse line in $TZ_DEFAULT: $raw";
            $ok = 0;
        }
    }
    close(IN);
    die "Error: Aborting due to errors in $TZ_DEFAULT\n" unless ($ok);
    print "Incorporated ", scalar keys %defaults, " defaults from $TZ_DEFAULT\n";

    # Emit it
    my $missing = 0;
    print OUT scalar keys %offsetMap, " # index by offset entries to follow\n";
    foreach (sort {$a <=> $b} keys %offsetMap) {
        my $aref = $offsetMap{$_};
        my $def = -1;
        if (exists $defaults{$_}) {
            $def = $defaults{$_};
        } else {
            # If there is an offset for which we have no listed default
            # in $TZ_DEFAULT, we try to figure out a reasonable default
            # ourselves.  We ignore any zone named Etc/ because that's not
            # a "real" zone; it's just one listed as a POSIX convience.
            # We take the first (alphabetically) zone of what's left,
            # and if there are more than one of those, we emit a warning.

            my $ambiguous = 0;
            # Ignore zones named Etc/ and take the first one we otherwise see;
            # if there is more than one of those, emit a warning.
            foreach (sort map($zoneName[$_], @{$aref})) {
                next if (m|^Etc/|);
                if ($def < 0) {
                    $def = $zoneNumber{$_}
                } else {
                    $ambiguous = 1;
                }
            }
            $def = $aref->[0] if ($def < 0);
            if ($ambiguous) {
                $missing = 1;
                print
                    "Warning: No default for GMT", formatOffset($_),
                    ", using ", $zoneName[$def], "\n";
            }
        }
        # Emit the actual line, including a descriptive comment
        print OUT
            $_, ",", $def, ",",
            scalar @{$aref}, ",", join(",", @{$aref}),
            " # ", formatOffset($_), " d=", $zoneName[$def], " ",
            join(" ", map($zoneName[$_], @{$aref})), "\n";
    }
    print "Defaults may be specified in $TZ_DEFAULT\n" if ($missing);

    print OUT "end\n";

    ############################################################
    # END
    ############################################################
    close(OUT);
    print "$OUT written.\n";


    if (0) {
        TZ::FormZoneEquivalencyGroups(\%ZONES, \%RULES, \@EQUIV);
        print
            "Equivalency groups (including unique zones): ",
            scalar @EQUIV, "\n";
    }

    #::dumpValue($ZONES{"America/Los_Angeles"});
    #::dumpValue($RULES{"US"});
    #::dumpValue($RULES{"Tonga"});

    # Find all the different values of rule fields:
    # in, at, on, save, type, letter
    if (0) {
        my %RULEVALS;
        foreach my $ruleName (keys %RULES) {
            for (my $i=0; $i<2; ++$i) {
                foreach my $key (qw(in on at save type letter)) {
                    if (@{$RULES{$ruleName}} < 2) {
                        print $ruleName, ":";
                        ::dumpValue($RULES{$ruleName});
                    }
                    my $x = $RULES{$ruleName}->[$i]->{$key};
                    $RULEVALS{$key}->{$x} = 1;
                }
            }
        }
        foreach my $key (sort keys %RULEVALS) {
            print "$key: ", join(", ", sort keys %{$RULEVALS{$key}}), "\n";
        }
    }
}

# Read the alias list and create clones with alias names.  This
# sub should be called AFTER all standard zones have been read in.
# Param: File name of alias list
# Param: Ref to zone hash
sub incorporateAliases {
    my $aliasFile = shift;
    my $zones = shift;
    my $n = 0;
    local *IN;
    open(IN,$aliasFile) or die "Can't open $aliasFile: $!";
    while (<IN>) {
        s/\#.*//; # Trim comments
        next unless (/\S/); # Skip blank lines
        if (/^\s*(\S+)\s+(\S+)\s*$/) {
            my ($alias, $original) = ($1, $2);
            if (exists $zones->{$alias}) {
                die "Bad alias in $aliasFile: $alias is a standard UNIX zone. " .
                    "Please remove $alias from the alias table.\n";
            }
            if (!exists $zones->{$original}) {
                die "Bad alias in $aliasFile: $alias maps to the nonexistent " .
                    "zone $original. Please fix this entry in the alias table.\n";
            }
            # Create the alias!
            $zones->{$alias} = $zones->{$original};
            $n++;
        } else {
            die "Bad line in alias table $aliasFile: $_\n";
        }
    }
    print "Incorporated $n aliases from $aliasFile\n";
    close(IN);
}

# Format a time zone as a machine-readable line of text.  Another
# tool will read this line to construct a binary data structure
# representing this zone.
# Param: Zone name
# Param: Zone hash
# Param: Ref to hash of all rules
# Return: One line description of this zone.
sub formatZone { # ($z, $ZONES{$z}, \%RULES)
    my $name = shift;
    my $zone = shift;
    my $rules = shift;

    my @spec;
    my @notes = ( $name );

    # GMT offset
    push @notes, ($zone->{gmtoff}=~/^-/?"GMT":"GMT+") . $zone->{gmtoff};
    push @spec, parseOffset($zone->{gmtoff});

    #|rawOffset      The new SimpleTimeZone's raw GMT offset
    #|ID             The new SimpleTimeZone's time zone ID.
    #|startMonth     The daylight savings starting month. Month is
    #|               0-based. eg, 0 for January.
    #|startDay       The daylight savings starting
    #|               day-of-week-in-month. See setStartRule() for a
    #|               complete explanation.
    #|startDayOfWeek The daylight savings starting day-of-week. See
    #|               setStartRule() for a complete explanation.
    #|startTime      The daylight savings starting time, expressed as the
    #|               number of milliseconds after midnight.
    #|endMonth       The daylight savings ending month. Month is
    #|               0-based. eg, 0 for January.
    #|endDay         The daylight savings ending day-of-week-in-month.
    #|               See setStartRule() for a complete explanation.
    #|endDayOfWeek   The daylight savings ending day-of-week. See
    #|               setStartRule() for a complete explanation.
    #|endTime        The daylight savings ending time, expressed as the
    #|               number of milliseconds after midnight.

    my $rule = $zone->{rule};
    if ($rule ne $TZ::STANDARD) {
        $rule = $rules->{$rule};
        # $rule is now an array ref, with [0] being the onset and
        # [1] being the cease.

        formatRule($rule->[0], \@spec, \@notes); # Onset
        formatRule($rule->[1], \@spec, \@notes); # Cease

        my @a = parseTime($rule->[0]->{save});
        if ($a[1] ne 'w') {
            die "Strange DST savings value: \"$rule->[0]->{save}\"";
        }
        push @notes, $rule->[0]->{save};
        push @spec, $a[0];
    }

    join($SEP, @spec) . " # " . join(' ', @notes);
}

# Format a rule and return the string
# Param: reference to rule hash
# Param: ref to spec array (this is a result param)
# Param: ref to annotation array (this is a result param)
sub formatRule {
    my $rule = shift;
    my $spec = shift;
    my $notes = shift;
    push @$notes, $rule->{in}, $rule->{on}, $rule->{at};
    push @$spec, parseMonth($rule->{in}); # Month
    push @$spec, parseDaySpecifier($rule->{on}); # Day
    push @$spec, parseTime($rule->{at}); # Time
}

# Parse an offset of the form d, d:dd, or d:dd:dd, or any of the above
# preceded by a '-'.  Return the total number of seconds represented.
# Param: String
# Return: Integer number of seconds
sub parseOffset {
    local $_ = shift;
    if (/^(-)?(\d{1,2})(:(\d\d))?(:(\d\d))?$/) {
        #        1   2      4        6
        my $a = (($2 * 60) + (defined $4?$4:0)) * 60 + (defined $6?$6:0);
        $a = -$a if (defined $1 && $1 eq '-');
        return $a;
    } else {
        die "Cannot parse offset \"$_\"";
    }
}

# Format an offset in seconds and return a string of the form
# /[+-]\d{1,2}:\d\d(:\d\d)?/.
# Param: Offset in seconds
# Return: String
sub formatOffset {
    local $_ = shift;
    my $result = $_<0 ? "-":"+";
    $_ = -$_ if ($_ < 0);
    my $sec = $_ % 60;  $_ = ($_ - $sec) / 60;
    my $min = $_ % 60;  $_ = ($_ - $min) / 60;
    $min = "0$min" if ($min < 10);
    $sec = $sec ? ($sec < 10 ? ":0$sec" : ":$sec") : "";
    $result . $_ . ":" . $min . $sec;
}

# Parse a time of the format dd:dds, where s is a suffix character.
# Return the time, in minutes, and the suffix, in an array.
# Only the suffixes 's' and 'u' are recognized.
# Param: String, with optional suffix
# Return: Array ( seconds, suffix ).  If no suffix, 'w' is used.
sub parseTime {
    local $_ = shift;
    if (/^(\d{1,2}):(\d\d)([su])?$/) {
        my $a = ($1*60) + $2;
        my $s = defined $3?$3:'w';
        return ( $a, $s );
    } else {
        die "Cannot parse time \"$_\"";
    }
}

# Given a month string, return an integer from 0 (Jan) to 11 (Dec).
# Param: Str
# Return: Int 0..11.
sub parseMonth {
    local $_ = shift;
    for (my $i=0; $i<12; $i++) {
        return $i if (/$MONTH[$i]/i);
    }
    die "Can't parse month \"$_\"";
}

# Given a specifier for the day of the month on which a rule triggers,
# return an array of two integers encoding that information.  We use
# the ICU/java.util.SimpleTimeZone encoding scheme using two integers.
# We return the two integers in an array of ( dowim dow ).
# Param: String, such as
#     1, 12, 15, 18, 2, 20, 21, 22, 23, 25, 28, 3, 30, 31, 4, 7, Fri>=1,
#     Fri>=15, Sat>=1, Sat>=15, Sun<=14, Sun>=1, Sun>=10, Sun>=11, Sun>=15,
#     Sun>=16, Sun>=23, Sun>=8, Sun>=9, lastFri, lastSun, lastThu
#   This is the {on} field of the rule hash.
# Return: Array of two integers, ( dowim dow ).
#   The dow has Sunday = 1 .. Saturday = 7.
sub parseDaySpecifier {
    local $_ = shift;

    #|+If both dayOfWeekInMonth and dayOfWeek are positive, they specify the
    #| day of week in the month (e.g., (2, WEDNESDAY) is the second Wednesday
    #| of the month).
    #|+If dayOfWeek is positive and dayOfWeekInMonth is negative, they specify
    #| the day of week in the month counting backward from the end of the month.
    #| (e.g., (-1, MONDAY) is the last Monday in the month)
    #|+If dayOfWeek is zero and dayOfWeekInMonth is positive, dayOfWeekInMonth
    #| specifies the day of the month, regardless of what day of the week it is.
    #| (e.g., (10, 0) is the tenth day of the month)
    #|+If dayOfWeek is zero and dayOfWeekInMonth is negative, dayOfWeekInMonth
    #| specifies the day of the month counting backward from the end of the
    #| month, regardless of what day of the week it is (e.g., (-2, 0) is the
    #| next-to-last day of the month).
    #|+If dayOfWeek is negative and dayOfWeekInMonth is positive, they specify the
    #| first specified day of the week on or after the specfied day of the month.
    #| (e.g., (15, -SUNDAY) is the first Sunday after the 15th of the month
    #| [or the 15th itself if the 15th is a Sunday].)
    #|+If dayOfWeek and dayOfWeekInMonth are both negative, they specify the
    #| last specified day of the week on or before the specified day of the month.
    #| (e.g., (-20, -TUESDAY) is the last Tuesday before the 20th of the month
    #| [or the 20th itself if the 20th is a Tuesday].)

    # dowim dow
    # >0    >0    day of week in month
    # <0    >0    day of week in month (from end)
    # >0    0     day of month
    # <0    0     day of month (from end; -1 is last dom)
    # >0    <0    first dow on or after dom
    # <0    <0    last dow on or before dom

    my $dowim;
    my $dow = 0;

    # Check for straight DOM
    if (/^\d+$/) {
        $dowim = $_;
        $dow = 0;
        return ( $dowim, $dow );
    }

    # Anything else must have a dow embedded in it; parse it out
    my @DOW = ( 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat' );
    for (my $i=0; $i<@DOW; $i++) {
        if (s/$DOW[$i]//) {
            $dow = $i + 1;
            last;
        }
    }
    if ($dow == 0) {
        die "Cannot parse day specifier \"$_\"";
    }

    # Now we have either >=n, <=n, last, or first.
    if (/^last$/) {
        $dowim = -1;
    } elsif (/^first$/) {
        $dowim = 1;
    } elsif (/^>=(\d+)$/) {
        $dowim = $1;
        $dow = -$dow;
    } elsif (/^<=(\d+)$/) {
        $dowim = -$1;
        $dow = -$dow;
    } else {
        die "Cannot parse day specifier \"$_\"";
    }

    ( $dowim, $dow );
}

# Confirm that the given ID contains only invariant characters.
# See utypes.h for an explanation.
# Param: string to be checked
sub assertInvariantChars {
    local $_ = shift;
    if (/[^A-Za-z0-9 \"%&\'()*+,-.\/:;<=>?_]/) {
        die "Error: Zone ID \"$_\" contains non-invariant characters\n";
    }
}

__END__