/* ********************************************************************** * Copyright (C) 1999-2001, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** * Date Name Description * 11/24/99 aliu Creation. * 09/26/00 aliu Support for equivalency groups added. * 01/31/01 aliu Support for ISO 3166 country codes added. ********************************************************************** */ /* This program reads a text file full of parsed time zone data and * outputs a binary file, tz.dat, which then goes on to become part of * the memory-mapped (or dll) ICU data file. * * The data file read by this program is generated by a perl script, * tz.pl. The input to tz.pl is standard unix time zone data from * ftp://elsie.nci.nih.gov. * * As a matter of policy, the perl script tz.pl wants to do as much of * the parsing, data processing, and error checking as possible, and * this program wants to just do the binary translation step. * * See tz.pl for the file format that is READ by this program. */ #include #include #include "unicode/utypes.h" #include "unicode/putil.h" #include "cmemory.h" #include "cstring.h" #include "filestrm.h" #include "unewdata.h" #include "uoptions.h" #include "tzdat.h" #define INPUT_FILE "tz.txt" #define OUTPUT_FILE "tz.dat" /* UDataInfo cf. udata.h */ static UDataInfo dataInfo = { sizeof(UDataInfo), 0, U_IS_BIG_ENDIAN, U_CHARSET_FAMILY, sizeof(UChar), 0, {TZ_SIG_0, TZ_SIG_1, TZ_SIG_2, TZ_SIG_3}, {TZ_FORMAT_VERSION, 0, 0, 0}, /* formatVersion */ {0, 0, 0, 0} /* dataVersion - will be filled in with year.suffix */ }; class gentz { // These must match SimpleTimeZone!!! enum { WALL_TIME = 0, STANDARD_TIME, UTC_TIME }; // The largest number of zones we accept as sensible. Anything // larger is considered an error. Adjust as needed. enum { MAX_ZONES = 1000 }; // The maximum sensible GMT offset, in seconds static const int32_t MAX_GMT_OFFSET; static const char COMMENT; static const char CR; static const char LF; static const char MINUS; static const char SPACE; static const char TAB; static const char ZERO; static const char STANDARD_MARK; static const char DST_MARK; static const char SEP; static const char NUL; static const char* END_KEYWORD; enum { BUFLEN = 1024 }; char buffer[BUFLEN]; int32_t lineNumber; // Binary data that we construct from tz.txt and write out as tz.dat TZHeader header; TZEquivalencyGroup* equivTable; OffsetIndex* offsetIndex; CountryIndex* countryIndex; uint32_t* nameToEquiv; char* nameTable; uint32_t equivTableSize; // Total bytes in equivalency group table uint32_t offsetIndexSize; // Total bytes in offset index table uint32_t countryIndexSize; // Total bytes in country index table uint32_t nameToEquivSize; // Total bytes in nameToEquiv uint32_t nameTableSize; // Total bytes in name table uint32_t maxPerOffset; // Maximum number of zones per offset uint32_t maxPerEquiv; // Maximum number of zones per equivalency group uint32_t equivCount; // Number of equivalency groups UBool useCopyright; UBool verbose; public: int MMain(int argc, char *argv[]); private: int32_t writeTzDatFile(const char *destdir); void parseTzTextFile(FileStream* in); // High level parsing void parseHeader(FileStream* in); TZEquivalencyGroup* parseEquivTable(FileStream* in); void fixupNameToEquiv(); void parseDSTRule(char*& p, TZRule& rule); OffsetIndex* parseOffsetIndexTable(FileStream* in); CountryIndex* parseCountryIndexTable(FileStream* in); char* parseNameTable(FileStream* in); // Low level parsing and reading void readEndMarker(FileStream* in); int32_t readIntegerLine(FileStream* in, int32_t min, int32_t max); int32_t _parseInteger(char*& p); int32_t parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t); int32_t readLine(FileStream* in); // Error handling void die(const char* msg); }; int main(int argc, char *argv[]) { gentz x; U_MAIN_INIT_ARGS(argc, argv); return x.MMain(argc, argv); } const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60; // seconds const char gentz::COMMENT = '#'; const char gentz::CR = '\r'; const char gentz::LF = '\n'; const char gentz::MINUS = '-'; const char gentz::SPACE = ' '; const char gentz::TAB = '\t'; const char gentz::ZERO = '0'; const char gentz::SEP = ','; const char gentz::STANDARD_MARK = 's'; const char gentz::DST_MARK = 'd'; const char gentz::NUL = '\0'; const char* gentz::END_KEYWORD = "end"; static UOption options[]={ UOPTION_HELP_H, UOPTION_HELP_QUESTION_MARK, UOPTION_COPYRIGHT, UOPTION_DESTDIR, UOPTION_VERBOSE }; int gentz::MMain(int argc, char* argv[]) { /* preset then read command line options */ options[3].value=u_getDataDirectory(); argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[0]), options); /* error handling, printing usage message */ if(argc<0) { fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); } else if(argc<2) { argc=-1; } if(argc<0 || options[0].doesOccur || options[1].doesOccur) { fprintf(stderr, "usage: %s [-options] timezone-file\n" "\tread the timezone file produced by tz.pl and create " TZ_DATA_NAME "." TZ_DATA_TYPE "\n" "options:\n" "\t-h or -? or --help this usage text\n" "\t-v or --verbose turn on verbose output\n" "\t-c or --copyright include a copyright notice\n" "\t-d or --destdir destination directory, followed by the path\n", argv[0]); return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR; } /* get the options values */ useCopyright=options[2].doesOccur; verbose = options[4].doesOccur; //////////////////////////////////////////////////////////// // Read the input file //////////////////////////////////////////////////////////// *buffer = NUL; lineNumber = 0; if (verbose) { fprintf(stdout, "Input file: %s\n", argv[1]); } FileStream* in = T_FileStream_open(argv[1], "r"); if (in == 0) { die("Cannot open input file"); } parseTzTextFile(in); T_FileStream_close(in); *buffer = NUL; //////////////////////////////////////////////////////////// // Write the output file //////////////////////////////////////////////////////////// int32_t wlen = writeTzDatFile(options[3].value); if (verbose) { fprintf(stdout, "Output file: %s.%s, %ld bytes\n", TZ_DATA_NAME, TZ_DATA_TYPE, (long)wlen); } return 0; // success } int32_t gentz::writeTzDatFile(const char *destdir) { UNewDataMemory *pdata; UErrorCode status = U_ZERO_ERROR; // Careful: The order in which the tables are written must match the offsets. // Our order is: // - equiv table // - offset index // - country index // - name index (name to equiv map) // - name table (must be last!) header.equivTableDelta = sizeof(header); header.offsetIndexDelta = header.equivTableDelta + equivTableSize; header.countryIndexDelta = header.offsetIndexDelta + offsetIndexSize; header.nameIndexDelta = header.countryIndexDelta + countryIndexSize; // Must be last: header.nameTableDelta = header.nameIndexDelta + nameToEquivSize; /* // Don't need to check for negative values on unsigned numbers. if (header.equivTableDelta < 0 || header.offsetIndexDelta < 0 || header.countryIndexDelta < 0 || header.nameIndexDelta < 0 || header.nameTableDelta < 0) { die("Table too big -- negative delta"); } */ // Convert equivalency table indices to offsets. This can only // be done after the header offsets have been set up. fixupNameToEquiv(); // Fill in dataInfo with year.suffix *(uint16_t*)&(dataInfo.dataVersion[0]) = header.versionYear; *(uint16_t*)&(dataInfo.dataVersion[2]) = header.versionSuffix; pdata = udata_create(destdir, TZ_DATA_TYPE, TZ_DATA_NAME, &dataInfo, useCopyright ? U_COPYRIGHT_STRING : 0, &status); if (U_FAILURE(status)) { die("Unable to create data memory"); } udata_writeBlock(pdata, &header, sizeof(header)); udata_writeBlock(pdata, equivTable, equivTableSize); udata_writeBlock(pdata, offsetIndex, offsetIndexSize); udata_writeBlock(pdata, countryIndex, countryIndexSize); udata_writeBlock(pdata, nameToEquiv, nameToEquivSize); udata_writeBlock(pdata, nameTable, nameTableSize); uint32_t dataLength = udata_finish(pdata, &status); if (U_FAILURE(status)) { die("Error writing output file"); } if (dataLength != (sizeof(header) + equivTableSize + offsetIndexSize + countryIndexSize + nameTableSize + nameToEquivSize )) { die("Written file doesn't match expected size"); } return dataLength; } void gentz::parseTzTextFile(FileStream* in) { parseHeader(in); // Read name table, create it, also create nameToEquiv index table // as a side effect. nameTable = parseNameTable(in); // Parse the equivalency groups equivTable = parseEquivTable(in); // Parse the GMT offset index table offsetIndex = parseOffsetIndexTable(in); // Parse the ISO 3166 country index table countryIndex = parseCountryIndexTable(in); } /** * Convert equivalency table indices to offsets. The equivalency * table offset (in the header) must be set already. */ void gentz::fixupNameToEquiv() { uint32_t i; // First make a list that maps indices to offsets uint32_t *offsets = (uint32_t*) uprv_malloc(sizeof(uint32_t) * equivCount); offsets[0] = header.equivTableDelta; if (offsets[0] % 4 != 0) { die("Header size is not 4-aligned"); } TZEquivalencyGroup *eg = equivTable; for (i=1; inextEntryDelta; if (offsets[i] % 4 != 0) { die("Equivalency group table is not 4-aligned"); } eg = (TZEquivalencyGroup*) (eg->nextEntryDelta + (int8_t*)eg); } // Now remap index values to offsets for (i=0; i= equivCount) { die("Equiv index out of range"); } nameToEquiv[i] = offsets[x]; } uprv_free(offsets); } TZEquivalencyGroup* gentz::parseEquivTable(FileStream* in) { uint32_t n = readIntegerLine(in, 1, MAX_ZONES); if (n != equivCount) { die("Equivalency table count mismatch"); } // We don't know how big the whole thing will be yet, but we can use // the maxPerEquiv number to compute an upper limit. // // The gmtOffset field within each struct must be // 4-aligned for some architectures. To ensure this, we do two // things: 1. The entire struct is 4-aligned. 2. The gmtOffset is // placed at a 4-aligned position within the struct. 3. The size // of the whole structure is padded out to 4n bytes. We achieve // this last condition by adding two bytes of padding after the // last entry, if necessary. We adjust // the nextEntryDelta and add 2 bytes of padding if necessary. uint32_t maxPossibleSize = sizeof(TZEquivalencyGroup) + (maxPerEquiv-1) * sizeof(uint16_t); // Pad this out if ((maxPossibleSize % 4) != 0) { maxPossibleSize += 2; } if ((maxPossibleSize % 4) != 0) { die("Bug in 4-align code for equiv table"); } maxPossibleSize *= n; // Get size of entire set of structs. int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * maxPossibleSize); if (result == 0) { die("Out of memory"); } // Read each line and construct the corresponding entry TZEquivalencyGroup* eg = (TZEquivalencyGroup*)result; for (uint32_t i=0; iisDST = 0; eg->u.s.zone.gmtOffset = 1000 * // Convert s -> ms parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET); pList = &(eg->u.s.count); break; case DST_MARK: eg->isDST = 1; eg->u.d.zone.gmtOffset = 1000 * // Convert s -> ms parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET); parseDSTRule(p, eg->u.d.zone.onsetRule); parseDSTRule(p, eg->u.d.zone.ceaseRule); eg->u.d.zone.dstSavings = (uint16_t) parseInteger(p, SEP, 0, 12*60); pList = &(eg->u.d.count); break; default: die("Invalid equiv table type marker (not s or d)"); } // Now parse the list of zones in this group uint16_t egCount = (uint16_t) parseInteger(p, SEP, 1, maxPerEquiv); *pList++ = egCount; for (uint16_t j=0; jnextEntryDelta = (i==(n-1)) ? (uint16_t) 0 : structSize; eg->reserved = 0; // ignored eg = (TZEquivalencyGroup*) (structSize + (int8_t*)eg); } equivTableSize = (int8_t*)eg - (int8_t*)result; readEndMarker(in); if (verbose) { fprintf(stdout, " Read %lu equivalency table entries, in-memory size %ld bytes\n", (unsigned long)equivCount, (long)equivTableSize); } return (TZEquivalencyGroup*)result; } OffsetIndex* gentz::parseOffsetIndexTable(FileStream* in) { uint32_t n = readIntegerLine(in, 1, MAX_ZONES); // We don't know how big the whole thing will be yet, but we can use // the maxPerOffset number to compute an upper limit. // // The gmtOffset field within each OffsetIndex struct must be // 4-aligned for some architectures. To ensure this, we do two // things: 1. The entire struct is 4-aligned. 2. The gmtOffset is // placed at a 4-aligned position within the struct. 3. The size // of the whole structure is padded out to 4n bytes. We achieve // this last condition by adding two bytes of padding after the // last zoneNumber, if count is _even_. That is, the struct size // is 10+2count+padding, where padding is (count%2==0 ? 2:0). // // Note that we don't change the count itself, but rather adjust // the nextEntryDelta and add 2 bytes of padding if necessary. // // Don't try to compute the exact size in advance // (unless we want to avoid the use of sizeof(), which may // introduce padding that we won't actually employ). uint32_t maxPossibleSize = n * (sizeof(OffsetIndex) + (maxPerOffset-1) * sizeof(uint16_t)); int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * maxPossibleSize); if (result == 0) { die("Out of memory"); } // Read each line and construct the corresponding entry OffsetIndex* index = (OffsetIndex*)result; for (uint32_t i=0; igmtOffset = 1000 * // Convert s -> ms parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET); index->defaultZone = (uint16_t)parseInteger(p, SEP, 0, header.count-1); index->count = (uint16_t)parseInteger(p, SEP, 1, maxPerOffset); uint16_t* zoneNumberArray = &(index->zoneNumber); UBool sawOffset = FALSE; // Sanity check - make sure offset is in zone list for (uint16_t j=0; jcount; ++j) { zoneNumberArray[j] = (uint16_t) parseInteger(p, (j==(index->count-1))?NUL:SEP, 0, header.count-1); if (zoneNumberArray[j] == index->defaultZone) { sawOffset = TRUE; } } if (!sawOffset) { die("Error: bad offset index entry; default not in zone list"); } alignedCount = index->count; if((alignedCount%2)==0) /* force count to be ODD - see above */ { // Use invalid zoneNumber for 2 bytes of padding zoneNumberArray[alignedCount++] = (uint16_t)0xFFFF; } int8_t* nextIndex = (int8_t*)&(zoneNumberArray[alignedCount]); index->nextEntryDelta = (uint16_t) ((i==(n-1)) ? 0 : (nextIndex - (int8_t*)index)); index = (OffsetIndex*)nextIndex; } offsetIndexSize = (int8_t*)index - (int8_t*)result; if (offsetIndexSize > maxPossibleSize) { die("Yikes! Interal error while constructing offset index table"); } readEndMarker(in); if (verbose) { fprintf(stdout, " Read %lu offset index table entries, in-memory size %ld bytes\n", (unsigned long)n, (long)offsetIndexSize); } return (OffsetIndex*)result; } CountryIndex* gentz::parseCountryIndexTable(FileStream* in) { uint32_t n = readIntegerLine(in, 1, MAX_ZONES); // We know how big the whole thing will be: Each zone occupies an // int, and each country adds 3 ints (one for the intcode, one for // next entry offset, one for the zone count). Each int is 16 // bits. // // Everything is 16-bits, so we don't 4-align the entries. // However, we do pad at the end of the table to make the whole // thing of size 4n, if necessary. uint32_t expectedSize = n*(sizeof(CountryIndex)-sizeof(uint16_t)) + header.count * sizeof(uint16_t); uint32_t pad = (4 - (expectedSize % 4)) % 4; // This will be 0 or 2 int8_t *result = (int8_t*) uprv_malloc(sizeof(int8_t) * (expectedSize + pad)); if (result == 0) { die("Out of memory"); } // Read each line and construct the corresponding entry. // Along the way, make sure we don't write past 'limit'. CountryIndex* index = (CountryIndex*)result; int8_t* limit = ((int8_t*)result) + expectedSize; // Don't include pad uint32_t i; for (i=0; izoneNumber) < limit; ++i) { readLine(in); char* p = buffer; index->intcode = (uint16_t)parseInteger(p, SEP, 0, 25*32+25 /*ZZ*/); index->count = (uint16_t)parseInteger(p, SEP, 0, header.count-1); uint16_t* zoneNumberArray = &(index->zoneNumber); if ((int8_t*)(&index->zoneNumber + index->count - 1) >= limit) { // Oops -- out of space break; } for (uint16_t j=0; jcount; ++j) { zoneNumberArray[j] = (uint16_t) parseInteger(p, (j==(index->count-1))?NUL:SEP, 0, header.count-1); } int8_t* nextIndex = (int8_t*)&(zoneNumberArray[index->count]); index->nextEntryDelta = (uint16_t) ((i==(n-1)) ? 0 : (nextIndex - (int8_t*)index)); index = (CountryIndex*)nextIndex; } readEndMarker(in); // Make sure size matches expected value, and pad the total size countryIndexSize = (int8_t*)index - (int8_t*)result + pad; if (i != n || countryIndexSize != expectedSize) { die("Yikes! Interal error while constructing offset index table"); } if (pad != 0) { countryIndexSize += pad; *(uint16_t*)index = 0; // Clear pad bits } if (verbose) { fprintf(stdout, " Read %lu country index table entries, in-memory size %ld bytes\n", (unsigned long)n, (long)countryIndexSize); } return (CountryIndex*)result; } void gentz::parseHeader(FileStream* in) { int32_t version = readIntegerLine(in, 0, 0xFFFF); if (version != TZ_FORMAT_VERSION) { die("Version mismatch between gentz and input file"); } // Version string, e.g., "1999j" -> (1999<<16) | 10 header.versionYear = (uint16_t) readIntegerLine(in, 1990, 0xFFFF); header.versionSuffix = (uint16_t) readIntegerLine(in, 0, 0xFFFF); header.count = readIntegerLine(in, 1, MAX_ZONES); equivCount = readIntegerLine(in, 1, header.count); maxPerOffset = readIntegerLine(in, 1, header.count); maxPerEquiv = readIntegerLine(in, 1, equivCount); // Size of name table in bytes // (0x00FFFFFF is an arbitrary upper limit; adjust as needed.) nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF); readEndMarker(in); if (verbose) { fprintf(stdout, " Read header, data version %u(%u), in-memory size %ld bytes\n", header.versionYear, header.versionSuffix, (unsigned long)sizeof(header)); } } void gentz::parseDSTRule(char*& p, TZRule& rule) { rule.month = (uint8_t) parseInteger(p, SEP, 0, 11); rule.dowim = (int8_t) parseInteger(p, SEP, -31, 31); rule.dow = (int8_t) parseInteger(p, SEP, -7, 7); rule.time = (uint16_t) parseInteger(p, SEP, 0, 24*60); rule.mode = *p++; if (*p++ != SEP) { die("Separator missing"); } switch ((char)rule.mode) { case 'w': rule.mode = WALL_TIME; break; case 's': rule.mode = STANDARD_TIME; break; case 'u': rule.mode = UTC_TIME; break; default: die("Invalid rule time mode"); break; } } /** * Parse the name table. * Each entry of the name table looks like this: * |36,Africa/Djibouti * The integer is an equivalency table index. We build up a name * table, that just contains the names, and we return it. We also * build up the name index, which indexes names to equivalency table * entries. This is stored in the member variable nameToEquiv. */ char* gentz::parseNameTable(FileStream* in) { int32_t n = readIntegerLine(in, 1, MAX_ZONES); if (n != (int32_t)header.count) { die("Zone count doesn't match name table count"); } char* names = (char*) uprv_malloc(sizeof(char) * nameTableSize); nameToEquiv = (uint32_t*) uprv_malloc(sizeof(uint32_t) * n); if (names == 0 || nameToEquiv == 0) { die("Out of memory"); } nameToEquivSize = n * sizeof(nameToEquiv[0]); char* p = names; char* limit = names + nameTableSize; for (int32_t i=0; i 9) { break; } n = 10*n + digit; p++; digitCount++; } if (digitCount < 1) { die("Unable to parse integer"); } if (negative) { n = -n; } return n; } int32_t gentz::parseInteger(char*& p, char nextExpectedChar, int32_t min, int32_t max) { int32_t n = _parseInteger(p); if (*p++ != nextExpectedChar) { die("Character following integer unexpected"); } if (n < min || n > max) { die("Integer field out of range"); } return n; } void gentz::die(const char* msg) { fprintf(stderr, "ERROR, %s\n", msg); if (*buffer) { fprintf(stderr, "Input file line %ld: \"%s\"\n", (long)lineNumber, buffer); } exit(1); } /** * Read a line. Trim trailing comment and whitespace. Ignore (skip) * blank lines, or comment-only lines. Return the number of characters * on the line remaining. On EOF, die. */ int32_t gentz::readLine(FileStream* in) { ++lineNumber; char* result = T_FileStream_readLine(in, buffer, BUFLEN); if (result == 0) { *buffer = 0; die("Unexpected end of file"); } // Trim off trailing comment char* p = uprv_strchr(buffer, COMMENT); if (p != 0) { *p = NUL; } // Delete trailing whitespace p = buffer + uprv_strlen(buffer); while (p > buffer && (p[-1] == CR || p[-1] == LF || p[-1] == SPACE || p[-1] == TAB)) { p--; } *p = NUL; // If line is empty after trimming comments & whitespace, // then read the next line. return (*buffer == NUL) ? readLine(in) : uprv_strlen(buffer); }