scuffed-code/icu4c/source/tools/gentz/gentz.cpp

/*
**********************************************************************
*   Copyright (C) 1999-2001, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*   Date        Name        Description
*   11/24/99    aliu        Creation.
*   09/26/00    aliu        Support for equivalency groups added.
*   01/31/01    aliu        Support for ISO 3166 country codes added.
**********************************************************************
*/

/* This program reads a text file full of parsed time zone data and
 * outputs a binary file, tz.dat, which then goes on to become part of
 * the memory-mapped (or dll) ICU data file.
 *
 * The data file read by this program is generated by a perl script,
 * tz.pl.  The input to tz.pl is standard unix time zone data from
 * ftp://elsie.nci.nih.gov.
 *
 * As a matter of policy, the perl script tz.pl wants to do as much of
 * the parsing, data processing, and error checking as possible, and
 * this program wants to just do the binary translation step.
 *
 * See tz.pl for the file format that is READ by this program.
 */

#include <stdio.h>
#include <stdlib.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "unewdata.h"
#include "uoptions.h"
#include "tzdat.h"

#define INPUT_FILE "tz.txt"
#define OUTPUT_FILE "tz.dat"

/*
 * UDataInfo header for the generated data file (cf. udata.h).
 * The dataVersion bytes are zero here; gentz::writeTzDatFile() overwrites
 * them with the version year and suffix read from the input file before
 * the header is handed to udata_create().
 */
static UDataInfo dataInfo = {
    sizeof(UDataInfo),
    0,

    U_IS_BIG_ENDIAN,
    U_CHARSET_FAMILY,
    sizeof(UChar),
    0,

    {TZ_SIG_0, TZ_SIG_1, TZ_SIG_2, TZ_SIG_3},     /* signature bytes, from tzdat.h */
    {TZ_FORMAT_VERSION, 0, 0, 0},                 /* formatVersion */
    {0, 0, 0, 0} /* dataVersion - will be filled in with year.suffix */
};


/**
 * Holds the state and the parsing/writing routines of the gentz tool.
 *
 * An instance reads the text file produced by tz.pl (see MMain() and
 * parseTzTextFile()), builds the binary tables in memory, and writes them
 * out through writeTzDatFile().  All errors are fatal: they are reported
 * through die(), which terminates the process.
 *
 * The class declares no constructor; MMain() and the parse routines
 * assign the members before they are read.
 */
class gentz {
    // These must match SimpleTimeZone!!!
    // parseDSTRule() stores one of these values into TZRule::mode.
    enum { WALL_TIME = 0,
           STANDARD_TIME,
           UTC_TIME
    };

    // The largest number of zones we accept as sensible.  Anything
    // larger is considered an error.  Adjust as needed.
    enum { MAX_ZONES = 1000 };

    // The maximum sensible GMT offset, in seconds
    static const int32_t MAX_GMT_OFFSET;

    // Single characters with special meaning in the input file.
    static const char COMMENT;        // starts a trailing comment (see readLine)
    static const char CR;
    static const char LF;
    static const char MINUS;          // optional sign in front of an integer
    static const char SPACE;
    static const char TAB;
    static const char ZERO;
    static const char STANDARD_MARK;  // equiv table line type: standard zone
    static const char DST_MARK;       // equiv table line type: DST zone
    static const char SEP;            // field separator within a line
    static const char NUL;

    // Text of the line that terminates each section of the input file.
    static const char* END_KEYWORD;

    // Most recently read input line (comment and trailing whitespace
    // removed) and its 1-based line number; die() prints both.
    enum { BUFLEN = 1024 };
    char buffer[BUFLEN];
    int32_t lineNumber;

    // Binary data that we construct from tz.txt and write out as tz.dat
    TZHeader              header;
    TZEquivalencyGroup*   equivTable;
    OffsetIndex*          offsetIndex;
    CountryIndex*         countryIndex;
    uint32_t*             nameToEquiv;
    char*                 nameTable;

    uint32_t equivTableSize;  // Total bytes in equivalency group table
    uint32_t offsetIndexSize; // Total bytes in offset index table
    uint32_t countryIndexSize; // Total bytes in country index table
    uint32_t nameToEquivSize; // Total bytes in nameToEquiv
    uint32_t nameTableSize;   // Total bytes in name table

    // Limits declared in the input file header (see parseHeader)
    uint32_t maxPerOffset; // Maximum number of zones per offset
    uint32_t maxPerEquiv; // Maximum number of zones per equivalency group
    uint32_t equivCount; // Number of equivalency groups

    // Command-line switches, set by MMain()
    UBool useCopyright;
    UBool verbose;


public:
    // Runs the whole tool; called from main() with the process arguments.
    int      MMain(int argc, char *argv[]);
private:
    // Writes header and tables to the data file; returns the byte count.
    int32_t  writeTzDatFile(const char *destdir);
    // Reads every section of the input file into the members above.
    void     parseTzTextFile(FileStream* in);

    // High level parsing
    void          parseHeader(FileStream* in);

    TZEquivalencyGroup* parseEquivTable(FileStream* in);

    // Rewrites nameToEquiv[] from group indices to file offsets.
    void          fixupNameToEquiv();

    void          parseDSTRule(char*& p, TZRule& rule);

    OffsetIndex*  parseOffsetIndexTable(FileStream* in);

    CountryIndex* parseCountryIndexTable(FileStream* in);

    char*         parseNameTable(FileStream* in);

    // Low level parsing and reading
    void     readEndMarker(FileStream* in);
    int32_t  readIntegerLine(FileStream* in, int32_t min, int32_t max);
    int32_t  _parseInteger(char*& p);
    int32_t  parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t);
    int32_t  readLine(FileStream* in);

    // Error handling
    // Prints msg (and the current input line, if any) and exits.
    void    die(const char* msg);
};

int main(int argc, char *argv[]) {
    gentz x;

    U_MAIN_INIT_ARGS(argc, argv);

    return x.MMain(argc, argv);
}

// Definitions of the static constants declared in class gentz.
// STANDARD_MARK and DST_MARK are the line-type letters tested in
// parseEquivTable(); END_KEYWORD is compared in readEndMarker().
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60; // seconds
const char    gentz::COMMENT        = '#';
const char    gentz::CR             = '\r';
const char    gentz::LF             = '\n';
const char    gentz::MINUS          = '-';
const char    gentz::SPACE          = ' ';
const char    gentz::TAB            = '\t';
const char    gentz::ZERO           = '0';
const char    gentz::SEP            = ',';
const char    gentz::STANDARD_MARK  = 's';
const char    gentz::DST_MARK       = 'd';
const char    gentz::NUL            = '\0';
const char*   gentz::END_KEYWORD    = "end";

// Command-line options understood by the tool.
// gentz::MMain() addresses these entries by numeric index
// (0 and 1: help, 2: copyright, 3: destdir, 4: verbose), so the order of
// this array must not change without updating MMain().
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_COPYRIGHT,
    UOPTION_DESTDIR,
    UOPTION_VERBOSE
};

int gentz::MMain(int argc, char* argv[]) {
    /* preset then read command line options */
    options[3].value=u_getDataDirectory();
    argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options);

    /* error handling, printing usage message */
    if(argc<0) {
        fprintf(stderr,
            "error in command line argument \"%s\"\n",
            argv[-argc]);
    } else if(argc<2) {
        argc=-1;
    }
    if(argc<0 || options[0].doesOccur || options[1].doesOccur) {
        fprintf(stderr,
            "usage: %s [-options] timezone-file\n"
            "\tread the timezone file produced by tz.pl and create " TZ_DATA_NAME "." TZ_DATA_TYPE "\n"
            "options:\n"
            "\t-h or -? or --help  this usage text\n"
            "\t-v or --verbose     turn on verbose output\n"
            "\t-c or --copyright   include a copyright notice\n"
            "\t-d or --destdir     destination directory, followed by the path\n",
            argv[0]);
        return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
    }

    /* get the options values */
    useCopyright=options[2].doesOccur;
    verbose = options[4].doesOccur;


    ////////////////////////////////////////////////////////////
    // Read the input file
    ////////////////////////////////////////////////////////////
    *buffer = NUL;
    lineNumber = 0;
    if (verbose) {
        fprintf(stdout, "Input file: %s\n", argv[1]);
    }
    FileStream* in = T_FileStream_open(argv[1], "r");
    if (in == 0) {
        die("Cannot open input file");
    }
    parseTzTextFile(in);
    T_FileStream_close(in);
    *buffer = NUL;

    ////////////////////////////////////////////////////////////
    // Write the output file
    ////////////////////////////////////////////////////////////
    int32_t wlen = writeTzDatFile(options[3].value);
    if (verbose) {
        fprintf(stdout, "Output file: %s.%s, %ld bytes\n",
            TZ_DATA_NAME, TZ_DATA_TYPE, (long)wlen);
    }

    return 0; // success
}

int32_t gentz::writeTzDatFile(const char *destdir) {
    UNewDataMemory *pdata;
    UErrorCode status = U_ZERO_ERROR;

    // Careful: The order in which the tables are written must match the offsets.
    // Our order is:
    // - equiv table
    // - offset index
    // - country index
    // - name index (name to equiv map)
    // - name table (must be last!)
    header.equivTableDelta = sizeof(header);
    header.offsetIndexDelta = header.equivTableDelta + equivTableSize;
    header.countryIndexDelta = header.offsetIndexDelta + offsetIndexSize;
    header.nameIndexDelta = header.countryIndexDelta + countryIndexSize;
    // Must be last:
    header.nameTableDelta = header.nameIndexDelta + nameToEquivSize;

/*  // Don't need to check for negative values on unsigned numbers.
    if (header.equivTableDelta < 0 ||
        header.offsetIndexDelta < 0 ||
        header.countryIndexDelta < 0 ||
        header.nameIndexDelta < 0 ||
        header.nameTableDelta < 0) {
        die("Table too big -- negative delta");
    }
*/

    // Convert equivalency table indices to offsets.  This can only
    // be done after the header offsets have been set up.
    fixupNameToEquiv();

    // Fill in dataInfo with year.suffix
    *(uint16_t*)&(dataInfo.dataVersion[0]) = header.versionYear;
    *(uint16_t*)&(dataInfo.dataVersion[2]) = header.versionSuffix;

    pdata = udata_create(destdir, TZ_DATA_TYPE, TZ_DATA_NAME, &dataInfo,
                         useCopyright ? U_COPYRIGHT_STRING : 0, &status);
    if (U_FAILURE(status)) {
        die("Unable to create data memory");
    }

    udata_writeBlock(pdata, &header, sizeof(header));
    udata_writeBlock(pdata, equivTable, equivTableSize);
    udata_writeBlock(pdata, offsetIndex, offsetIndexSize);
    udata_writeBlock(pdata, countryIndex, countryIndexSize);
    udata_writeBlock(pdata, nameToEquiv, nameToEquivSize);
    udata_writeBlock(pdata, nameTable, nameTableSize);

    uint32_t dataLength = udata_finish(pdata, &status);
    if (U_FAILURE(status)) {
        die("Error writing output file");
    }

    if (dataLength != (sizeof(header) + equivTableSize +
                       offsetIndexSize + countryIndexSize +
                       nameTableSize + nameToEquivSize
                       )) {
        die("Written file doesn't match expected size");
    }
    return dataLength;
}

/**
 * Reads the complete input file, section by section, in the order the
 * sections appear: header, name table, equivalency groups, GMT offset
 * index, ISO 3166 country index.  The results are stored in the
 * corresponding members.
 */
void gentz::parseTzTextFile(FileStream* in) {
    // The header supplies the counts and limits the later sections rely on.
    parseHeader(in);

    // Besides returning the names, this also fills in nameToEquiv.
    nameTable    = parseNameTable(in);

    equivTable   = parseEquivTable(in);        // equivalency groups
    offsetIndex  = parseOffsetIndexTable(in);  // GMT offset index
    countryIndex = parseCountryIndexTable(in); // ISO 3166 country index
}

/**
 * Convert equivalency table indices to offsets.  The equivalency
 * table offset (in the header) must be set already.
 */
void gentz::fixupNameToEquiv() {
    uint32_t i;

    // First make a list that maps indices to offsets
    uint32_t *offsets = (uint32_t*) uprv_malloc(sizeof(uint32_t) * equivCount);
	/* test for NULL */
	if(offsets == NULL) {
		die("Out of memory");
	}
    offsets[0] = header.equivTableDelta;
    if (offsets[0] % 4 != 0) {
        die("Header size is not 4-aligned");
    }
    TZEquivalencyGroup *eg = equivTable;
    for (i=1; i<equivCount; ++i) {
        offsets[i] = offsets[i-1] + eg->nextEntryDelta;
        if (offsets[i] % 4 != 0) {
            die("Equivalency group table is not 4-aligned");
        }
        eg = (TZEquivalencyGroup*) (eg->nextEntryDelta + (int8_t*)eg);
    }

    // Now remap index values to offsets
    for (i=0; i<header.count; ++i) {
        uint32_t x = nameToEquiv[i];
        if (x >= equivCount) {
            die("Equiv index out of range");
        }
        nameToEquiv[i] = offsets[x];
    }

    uprv_free(offsets);
}

/**
 * Parses the equivalency group section of the input file.
 *
 * The section starts with a count line, which must equal the equivCount
 * read from the header, followed by one line per group and the end
 * marker.  Each group line starts with 's,' (standard zone) or 'd,'
 * (DST zone), followed by the GMT offset in seconds, for DST zones the
 * onset rule, cease rule and DST savings, then the number of zones in
 * the group and the zone numbers themselves.
 *
 * The groups are packed one after another into a single malloc'ed
 * buffer as variable-length records, each padded to a multiple of 4
 * bytes.  As a side effect equivTableSize is set to the number of bytes
 * used.
 *
 * @param in  input stream positioned at the section's count line
 * @return the buffer holding the packed groups (allocated with
 *         uprv_malloc); dies on any syntax or range error
 */
TZEquivalencyGroup* gentz::parseEquivTable(FileStream* in) {
    uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
    if (n != equivCount) {
        die("Equivalency table count mismatch");
    }

    // We don't know how big the whole thing will be yet, but we can use
    // the maxPerEquiv number to compute an upper limit.
    //
    // The gmtOffset field within each struct must be
    // 4-aligned for some architectures.  To ensure this, we do three
    // things: 1. The entire struct is 4-aligned.  2. The gmtOffset is
    // placed at a 4-aligned position within the struct.  3. The size
    // of the whole structure is padded out to 4n bytes.  We achieve
    // this last condition by adding two bytes of padding after the
    // last entry, if necessary; the padding is included in the
    // nextEntryDelta of the group.
    uint32_t maxPossibleSize = sizeof(TZEquivalencyGroup) +
        (maxPerEquiv-1) * sizeof(uint16_t);
    // Pad this out
    if ((maxPossibleSize % 4) != 0) {
        maxPossibleSize += 2;
    }
    if ((maxPossibleSize % 4) != 0) {
        die("Bug in 4-align code for equiv table");
    }
    maxPossibleSize *= n; // Get size of entire set of structs.

    int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * maxPossibleSize);
    if (result == 0) {
        die("Out of memory");
    }

    // Read each line and construct the corresponding entry
    TZEquivalencyGroup* eg = (TZEquivalencyGroup*)result;
    for (uint32_t i=0; i<n; ++i) {
        char *p;

        readLine(in);

        // Each line starts with 's,' or 'd,' to specify the zone type
        char flavor = buffer[0];
        if (buffer[1] != SEP) {
            die("Syntax error in equiv table");
        }
        p = buffer + 2;

        // This pointer will be adjusted to point to the start of the
        // list of zones in this group.
        uint16_t* pList = 0;

        switch (flavor) {
        case STANDARD_MARK:
            eg->isDST = 0;
            eg->u.s.zone.gmtOffset = 1000 * // Convert s -> ms
                parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
            pList = &(eg->u.s.count);
            break;
        case DST_MARK:
            eg->isDST = 1;
            eg->u.d.zone.gmtOffset = 1000 * // Convert s -> ms
                parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
            parseDSTRule(p, eg->u.d.zone.onsetRule);
            parseDSTRule(p, eg->u.d.zone.ceaseRule);
            eg->u.d.zone.dstSavings = (uint16_t) parseInteger(p, SEP, 0, 12*60);
            pList = &(eg->u.d.count);
            break;
        default:
            // die() does not return, so pList is never used while null
            die("Invalid equiv table type marker (not s or d)");
        }

        // Now parse the list of zones in this group.  The count is
        // stored first, immediately followed by the zone numbers; the
        // last number must be followed by the end of the line.
        uint16_t egCount = (uint16_t) parseInteger(p, SEP, 1, maxPerEquiv);
        *pList++ = egCount;
        for (uint16_t j=0; j<egCount; ++j) {
            *pList++ = (uint16_t) parseInteger(p, (j==(egCount-1)) ? NUL : SEP,
                                               0, header.count-1);
        }

        // At this point pList points to the byte after the last byte of this
        // equiv group struct.  Time to 4-align it.
        uint16_t structSize = (uint16_t) (((int8_t*)pList) - ((int8_t*)eg));
        if ((structSize % 4) != 0) {
            // assert(structSize % 4 == 2);
            *pList++ = 0xFFFF; // Pad with invalid zone index
            structSize += 2;
        }

        // Set up next entry delta; the last group gets 0
        eg->nextEntryDelta = (i==(n-1)) ? (uint16_t) 0 : structSize;

        eg->reserved = 0; // ignored

        // Step to where the next group record starts
        eg = (TZEquivalencyGroup*) (structSize + (int8_t*)eg);
    }
    // eg now points just past the last record
    equivTableSize = (int8_t*)eg - (int8_t*)result;
    readEndMarker(in);
    if (verbose) {
        fprintf(stdout, " Read %lu equivalency table entries, in-memory size %ld bytes\n",
            (unsigned long)equivCount, (long)equivTableSize);
    }
    return (TZEquivalencyGroup*)result;
}

/**
 * Parses the GMT offset index section of the input file.
 *
 * The section starts with a count line followed by one line per entry
 * and the end marker.  Each entry line holds the GMT offset in seconds,
 * the default zone number, the number of zones, and the zone numbers;
 * the default zone must appear among the listed zones.
 *
 * The entries are packed one after another into a single malloc'ed
 * buffer as variable-length records.  As a side effect offsetIndexSize
 * is set to the number of bytes used.
 *
 * @param in  input stream positioned at the section's count line
 * @return the buffer holding the packed entries (allocated with
 *         uprv_malloc); dies on any syntax, range or consistency error
 */
OffsetIndex* gentz::parseOffsetIndexTable(FileStream* in) {
    uint32_t n = readIntegerLine(in, 1, MAX_ZONES);

    // We don't know how big the whole thing will be yet, but we can use
    // the maxPerOffset number to compute an upper limit.
    //
    // The gmtOffset field within each OffsetIndex struct must be
    // 4-aligned for some architectures.  To ensure this, we do three
    // things: 1. The entire struct is 4-aligned.  2. The gmtOffset is
    // placed at a 4-aligned position within the struct.  3. The size
    // of the whole structure is padded out to 4n bytes.  We achieve
    // this last condition by adding two bytes of padding after the
    // last zoneNumber, if count is _even_.  That is, the struct size
    // is 10+2count+padding, where padding is (count%2==0 ? 2:0).
    //
    // Note that we don't change the count itself, but rather adjust
    // the nextEntryDelta and add 2 bytes of padding if necessary.
    //
    // Don't try to compute the exact size in advance
    // (unless we want to avoid the use of sizeof(), which may
    // introduce padding that we won't actually employ).
    //
    // sizeof(OffsetIndex) already covers one zoneNumber, so an entry
    // needs room for up to (maxPerOffset-1) further zone numbers plus
    // one padding slot.  Reserving the padding slot for every entry
    // guarantees the writes below stay inside the buffer even when every
    // entry has an even count equal to maxPerOffset.
    uint32_t maxPossibleSize = n * (sizeof(OffsetIndex) +
        maxPerOffset * sizeof(uint16_t));

    int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * maxPossibleSize);
    if (result == 0) {
        die("Out of memory");
    }

    // Read each line and construct the corresponding entry
    OffsetIndex* index = (OffsetIndex*)result;
    for (uint32_t i=0; i<n; ++i) {
        uint16_t alignedCount;
        readLine(in);
        char* p = buffer;
        index->gmtOffset = 1000 * // Convert s -> ms
            parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
        index->defaultZone = (uint16_t)parseInteger(p, SEP, 0, header.count-1);
        index->count = (uint16_t)parseInteger(p, SEP, 1, maxPerOffset);
        uint16_t* zoneNumberArray = &(index->zoneNumber);
        UBool sawOffset = FALSE; // Sanity check - make sure offset is in zone list
        for (uint16_t j=0; j<index->count; ++j) {
            // The last zone number must be followed by the end of the line
            zoneNumberArray[j] = (uint16_t)
                parseInteger(p, (j==(index->count-1))?NUL:SEP,
                             0, header.count-1);
            if (zoneNumberArray[j] == index->defaultZone) {
                sawOffset = TRUE;
            }
        }
        if (!sawOffset) {
            die("Error: bad offset index entry; default not in zone list");
        }
        alignedCount = index->count;
        if((alignedCount%2)==0) /* force count to be ODD - see above */
        {
            // Use invalid zoneNumber for 2 bytes of padding
            zoneNumberArray[alignedCount++] = (uint16_t)0xFFFF;
        }
        int8_t* nextIndex = (int8_t*)&(zoneNumberArray[alignedCount]);

        // The last entry gets a delta of 0
        index->nextEntryDelta = (uint16_t) ((i==(n-1)) ? 0 : (nextIndex - (int8_t*)index));
        index = (OffsetIndex*)nextIndex;
    }
    // index now points just past the last entry
    offsetIndexSize = (int8_t*)index - (int8_t*)result;
    if (offsetIndexSize > maxPossibleSize) {
        // Cannot happen with the allocation bound above; kept as a sanity check
        die("Yikes! Interal error while constructing offset index table");
    }
    readEndMarker(in);
    if (verbose) {
        fprintf(stdout, " Read %lu offset index table entries, in-memory size %ld bytes\n",
            (unsigned long)n, (long)offsetIndexSize);
    }
    return (OffsetIndex*)result;
}

/**
 * Parses the ISO 3166 country index section of the input file.
 *
 * The section starts with a count line followed by one line per country
 * and the end marker.  Each country line holds the integer country
 * code, the number of zones, and the zone numbers.
 *
 * The entries are packed one after another into a single malloc'ed
 * buffer.  The zone counts of all countries together must add up to
 * header.count, so the unpadded size is known in advance; the table is
 * then padded with zero bytes to a multiple of 4.  As a side effect
 * countryIndexSize is set to the padded size.
 *
 * @param in  input stream positioned at the section's count line
 * @return the buffer holding the packed entries (allocated with
 *         uprv_malloc); dies on any syntax, range or size error
 */
CountryIndex* gentz::parseCountryIndexTable(FileStream* in) {
    uint32_t n = readIntegerLine(in, 1, MAX_ZONES);

    // We know how big the whole thing will be: Each zone occupies an
    // int, and each country adds 3 ints (one for the intcode, one for
    // next entry offset, one for the zone count).  Each int is 16
    // bits.
    //
    // Everything is 16-bits, so we don't 4-align the entries.
    // However, we do pad at the end of the table to make the whole
    // thing of size 4n, if necessary.
    uint32_t expectedSize = n*(sizeof(CountryIndex)-sizeof(uint16_t)) +
        header.count * sizeof(uint16_t);
    uint32_t pad = (4 - (expectedSize % 4)) % 4; // This will be 0 or 2
    int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * (expectedSize + pad));
    if (result == 0) {
        die("Out of memory");
    }

    // Read each line and construct the corresponding entry.
    // Along the way, make sure we don't write past 'limit'.
    CountryIndex* index = (CountryIndex*)result;
    int8_t* limit = ((int8_t*)result) + expectedSize; // Don't include pad
    uint32_t i;
    for (i=0; i<n && (int8_t*)(&index->zoneNumber) < limit; ++i) {
        readLine(in);
        char* p = buffer;
        index->intcode = (uint16_t)parseInteger(p, SEP, 0, 25*32+25 /*ZZ*/);
        index->count = (uint16_t)parseInteger(p, SEP, 0, header.count-1);
        uint16_t* zoneNumberArray = &(index->zoneNumber);
        if ((int8_t*)(&index->zoneNumber + index->count - 1) >= limit) {
            // Oops -- out of space
            break;
        }
        for (uint16_t j=0; j<index->count; ++j) {
            // The last zone number must be followed by the end of the line
            zoneNumberArray[j] = (uint16_t)
                parseInteger(p, (j==(index->count-1))?NUL:SEP,
                             0, header.count-1);
        }
        int8_t* nextIndex = (int8_t*)&(zoneNumberArray[index->count]);
        // The last entry gets a delta of 0
        index->nextEntryDelta = (uint16_t) ((i==(n-1)) ? 0 : (nextIndex - (int8_t*)index));
        index = (CountryIndex*)nextIndex;
    }
    readEndMarker(in);

    // Make sure the number of bytes actually written matches the expected
    // value.  The comparison must use the unpadded size: expectedSize
    // excludes the pad, which is only added once the check has passed.
    countryIndexSize = (int8_t*)index - (int8_t*)result;
    if (i != n || countryIndexSize != expectedSize) {
        die("Yikes! Internal error while constructing country index table");
    }
    if (pad != 0) {
        // index points at the first pad byte, which lies inside the
        // allocation of expectedSize + pad bytes.
        countryIndexSize += pad;
        *(uint16_t*)index = 0; // Clear pad bits
    }
    if (verbose) {
        fprintf(stdout, " Read %lu country index table entries, in-memory size %ld bytes\n", (unsigned long)n, (long)countryIndexSize);
    }
    return (CountryIndex*)result;
}

/**
 * Parses the header section of the input file: one integer per line,
 * terminated by the end marker.
 *
 * Reads, in order: the format version (must equal TZ_FORMAT_VERSION),
 * the data version year and suffix, the zone count, the number of
 * equivalency groups, the maximum number of zones per offset, the
 * maximum number of zones per equivalency group, and the size of the
 * name table in bytes.  The values are stored in header and in the
 * corresponding members; any value outside its accepted range is fatal.
 *
 * @param in  input stream positioned at the start of the file
 */
void gentz::parseHeader(FileStream* in) {

    int32_t version = readIntegerLine(in, 0, 0xFFFF);
    if (version != TZ_FORMAT_VERSION) {
        die("Version mismatch between gentz and input file");
    }

    // Version string, e.g., "1999j" -> (1999<<16) | 10
    header.versionYear = (uint16_t) readIntegerLine(in, 1990, 0xFFFF);
    header.versionSuffix = (uint16_t) readIntegerLine(in, 0, 0xFFFF);

    header.count = readIntegerLine(in, 1, MAX_ZONES);
    equivCount = readIntegerLine(in, 1, header.count);
    maxPerOffset = readIntegerLine(in, 1, header.count);
    // A single equivalency group may contain more zones than there are
    // groups (e.g. 10 zones in 2 groups of 9 and 1), so the only sound
    // upper bound for the per-group maximum is the zone count.
    maxPerEquiv = readIntegerLine(in, 1, header.count);

    // Size of name table in bytes
    // (0x00FFFFFF is an arbitrary upper limit; adjust as needed.)
    nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF);

    readEndMarker(in);

    if (verbose) {
        fprintf(stdout, " Read header, data version %u(%u), in-memory size %lu bytes\n",
            (unsigned)header.versionYear, (unsigned)header.versionSuffix,
            (unsigned long)sizeof(header));
    }
}

/**
 * Parses one DST rule from the current line, starting at p, and stores
 * it in rule.  The rule consists of four separator-terminated integers
 * (month, dowim, dow, time) followed by a single mode letter and a
 * separator.  The mode letter 'w', 's' or 'u' is translated to
 * WALL_TIME, STANDARD_TIME or UTC_TIME respectively.
 *
 * @param p     parse position; advanced past the rule's final separator
 * @param rule  receives the parsed fields
 */
void gentz::parseDSTRule(char*& p, TZRule& rule) {
    // Numeric fields, each checked against its accepted range.
    rule.month = (uint8_t) parseInteger(p, SEP, 0, 11);
    rule.dowim = (int8_t) parseInteger(p, SEP, -31, 31);
    rule.dow = (int8_t) parseInteger(p, SEP, -7, 7);
    rule.time = (uint16_t) parseInteger(p, SEP, 0, 24*60);

    // One mode letter, which must be followed by a separator.
    const char modeLetter = *p++;
    const char afterMode = *p++;
    if (afterMode != SEP) {
        die("Separator missing");
    }

    if (modeLetter == 'w') {
        rule.mode = WALL_TIME;
    } else if (modeLetter == 's') {
        rule.mode = STANDARD_TIME;
    } else if (modeLetter == 'u') {
        rule.mode = UTC_TIME;
    } else {
        die("Invalid rule time mode");
    }
}

/**
 * Parse the name table.
 * Each entry of the name table looks like this:
 * |36,Africa/Djibouti
 * The integer is an equivalency table index.  We build up a name
 * table, that just contains the names, and we return it.  We also
 * build up the name index, which indexes names to equivalency table
 * entries.  This is stored in the member variable nameToEquiv.
 *
 * The names are stored back to back, each followed by its terminating
 * NUL, and together they must fill exactly nameTableSize bytes (the size
 * declared in the header).  As a side effect nameToEquivSize is set.
 *
 * @param in  input stream positioned at the section's count line
 * @return the name table (allocated with uprv_malloc); dies if the count
 *         differs from header.count or the names do not fit the declared
 *         size exactly
 */
char* gentz::parseNameTable(FileStream* in) {
    int32_t n = readIntegerLine(in, 1, MAX_ZONES);
    if (n != (int32_t)header.count) {
        die("Zone count doesn't match name table count");
    }
    char* names = (char*) uprv_malloc(sizeof(char) * nameTableSize);
    nameToEquiv = (uint32_t*) uprv_malloc(sizeof(uint32_t) * n);
    if (names == 0 || nameToEquiv == 0) {
        die("Out of memory");
    }
    nameToEquivSize = n * sizeof(nameToEquiv[0]);
    char* p = names;
    char* limit = names + nameTableSize;
    for (int32_t i=0; i<n; ++i) {
        readLine(in);
        char* q = buffer;
        // We store an index here for now -- later, in fixupNameToEquiv,
        // we convert it to an offset.
        nameToEquiv[i] = (uint32_t) parseInteger(q, SEP, 0, equivCount-1);
        // Bytes this entry occupies in the table: the name plus its
        // terminating NUL.  The bounds check must cover the NUL as well,
        // otherwise a name ending exactly at 'limit' would write one
        // byte past the end of the allocation.
        int32_t len = (int32_t) uprv_strlen(q) + 1;
        if (len > (limit - p)) {
            die("Name table longer than declared size");
        }
        uprv_memcpy(p, q, len); // copies the NUL too
        p += len;
    }
    if (p != limit) {
        die("Name table shorter than declared size");
    }
    readEndMarker(in);
    if (verbose) {
        fprintf(stdout, " Read %ld names, in-memory size %ld bytes\n",
        (long)n, (long)nameTableSize);
    }
    return names;
}

/**
 * Consumes the next significant line and checks that it is the end
 * keyword closing a section; anything else is a fatal error.
 */
void gentz::readEndMarker(FileStream* in) {
    readLine(in);
    const UBool isEndLine = (UBool)(uprv_strcmp(buffer, END_KEYWORD) == 0);
    if (!isEndLine) {
        die("Keyword 'end' missing");
    }
}

/**
 * Reads the next significant line, which must consist of a single
 * integer and nothing else, and returns that integer.
 *
 * @param min  smallest accepted value
 * @param max  largest accepted value
 * @return the parsed value; dies on a syntax error or a value outside
 *         [min, max]
 */
int32_t gentz::readIntegerLine(FileStream* in, int32_t min, int32_t max) {
    readLine(in);
    // Requiring NUL after the number enforces "nothing else on the line".
    char* cursor = buffer;
    const int32_t value = parseInteger(cursor, NUL, min, max);
    return value;
}

/**
 * Parse an integer from the given character buffer.
 * Advance p past the last parsed character.  Return
 * the result.  The integer must be of the form
 * /-?\d+/.
 *
 * Dies if there is no digit at the parse position or if the magnitude
 * of the number does not fit in an int32_t.
 *
 * @param p  parse position; left pointing at the first character that
 *           is not part of the integer
 * @return the parsed value
 */
int32_t gentz::_parseInteger(char*& p) {
    // Largest magnitude that can be accumulated without overflowing.
    const int32_t MAX_MAGNITUDE = 0x7FFFFFFF;
    int32_t n = 0;
    int32_t digitCount = 0;
    int32_t digit;
    UBool negative = FALSE;
    if (*p == MINUS) {
        ++p;
        negative = TRUE;
    }
    for (;;) {
        digit = *p - ZERO;
        if (digit < 0 || digit > 9) {
            break;
        }
        // Signed overflow is undefined behavior, so reject the number
        // before 10*n + digit could exceed the int32_t range.
        if (n > (MAX_MAGNITUDE - digit) / 10) {
            die("Integer too large");
        }
        n = 10*n + digit;
        p++;
        digitCount++;
    }
    if (digitCount < 1) {
        die("Unable to parse integer");
    }
    if (negative) {
        n = -n;
    }
    return n;
}

int32_t gentz::parseInteger(char*& p, char nextExpectedChar,
                            int32_t min, int32_t max) {
    int32_t n = _parseInteger(p);
    if (*p++ != nextExpectedChar) {
        die("Character following integer unexpected");
    }
    if (n < min || n > max) {
        die("Integer field out of range");
    }
    return n;
}

void gentz::die(const char* msg) {
    fprintf(stderr, "ERROR, %s\n", msg);
    if (*buffer) {
        fprintf(stderr, "Input file line %ld: \"%s\"\n", (long)lineNumber, buffer);
    }
    exit(1);
}

/**
 * Reads the next significant line into buffer.  A trailing comment and
 * trailing whitespace (CR, LF, space, tab) are removed; lines that are
 * empty after that are skipped.  lineNumber is incremented for every
 * line read, including the skipped ones.
 *
 * @return the number of characters left on the line; dies at end of file
 */
int32_t gentz::readLine(FileStream* in) {
    for (;;) {
        ++lineNumber;
        if (T_FileStream_readLine(in, buffer, BUFLEN) == 0) {
            *buffer = 0; // no line to show in the error report
            die("Unexpected end of file");
        }

        // Cut the line at the comment character, if there is one.
        char* commentStart = uprv_strchr(buffer, COMMENT);
        if (commentStart != 0) {
            *commentStart = NUL;
        }

        // Back up over trailing whitespace and terminate the line there.
        char* end = buffer + uprv_strlen(buffer);
        while (end > buffer) {
            const char last = end[-1];
            if (last != CR && last != LF && last != SPACE && last != TAB) {
                break;
            }
            --end;
        }
        *end = NUL;

        // Anything left?  Then this is the line; otherwise read another.
        if (*buffer != NUL) {
            return uprv_strlen(buffer);
        }
    }
}