scuffed-code/icu4c/source/tools/gentz/gentz.cpp

469 lines
13 KiB
C++
Raw Normal View History

/*
**********************************************************************
* Copyright (C) 1999, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/24/99 aliu Creation.
**********************************************************************
*/
/* This program reads a text file full of parsed time zone data and
* outputs a binary file, tz.dat, which then goes on to become part of
* the memory-mapped (or dll) ICU data file.
*
* The data file read by this program is generated by a perl script,
* tz.pl. The input to tz.pl is standard unix time zone data from
* ftp://elsie.nci.nih.gov.
*
* As a matter of policy, the perl script tz.pl wants to do as much of
* the parsing, data processing, and error checking as possible, and
* this program wants to just do the binary translation step.
*
* See tz.pl for the file format that is READ by this program.
*/
#include <stdio.h>
#include <stdlib.h>
#include "utypes.h"
#include "cmemory.h"
#include "cstring.h"
#include "filestrm.h"
#include "udata.h"
#include "unewdata.h"
#include "tzdat.h"
/* Default file names. Neither macro is referenced in the visible part
 * of this file; the actual names come from the command line. */
#define INPUT_FILE "tz.txt"
#define OUTPUT_FILE "tz.dat"
/* Name and type handed to udata_create() when the output is written. */
#define DATA_NAME "tz"
#define DATA_TYPE "dat"
/* Copyright string handed to udata_create() unless the user passed -c-. */
#define DATA_COPYRIGHT \
"Copyright (C) 1999, International Business Machines " \
"Corporation and others. All Rights Reserved."
/* UDataInfo cf. udata.h
 * Descriptor passed to udata_create() in writeTzDatFile().
 * NOTE(review): the meaning of the two literal 0 entries is not visible
 * here; they look like reserved fields -- confirm against udata.h. */
static const UDataInfo dataInfo = {
sizeof(UDataInfo), /* size of this structure */
0,
U_IS_BIG_ENDIAN, /* byte order of the building platform */
U_CHARSET_FAMILY, /* charset family of the building platform */
sizeof(UChar), /* width of a UChar on the building platform */
0,
'z', 'o', 'n', 'e', /* dataFormat */
1, 0, 0, 0, /* formatVersion */
1, 9, 9, 9 /* dataVersion */
};
/**
 * Driver object for the tool: reads the text zone file line by line
 * into the members below and then writes them out as binary blocks.
 * All parsing errors are fatal and go through die().
 */
class gentz {
// Time modes stored into TZRule::mode by parseDSTRule() for the
// input letters 'w', 's' and 'u' respectively.
// These must match SimpleTimeZone!!!
enum { WALL_TIME = 0,
STANDARD_TIME,
UTC_TIME
};
// The largest number of zones we accept as sensible. Anything
// larger is considered an error. Adjust as needed.
enum { MAX_ZONES = 1000 };
// The maximum sensible GMT offset, in seconds
static const int32_t MAX_GMT_OFFSET;
// Characters with special meaning to the parser; the values are
// defined after the class.
static const char COMMENT; // starts a trailing comment (stripped by readLine)
static const char CR; // stripped from line ends by readLine
static const char LF; // stripped from line ends by readLine
static const char MINUS; // optional sign accepted by _parseInteger
static const char SPACE; // trimmed when it precedes a comment
static const char TAB; // trimmed when it precedes a comment
static const char ZERO; // base for digit conversion in _parseInteger
static const char SEP; // separates fields within one line
static const char NUL; // string terminator / "end of line" marker
// Line that must follow each list of zones.
static const char* END_KEYWORD;
// Size of the line buffer, including the terminating NUL.
enum { BUFLEN = 1024 };
// The most recently read input line (filled by readLine). die()
// echoes it when it is non-empty.
char buffer[BUFLEN];
// Header block; written first by writeTzDatFile().
TZHeader header;
// Arrays allocated by parseStandardZones() / parseDSTZones();
// they hold header.standardCount and header.dstCount entries.
StandardZone* stdZones;
DSTZone* dstZones;
// Block of NUL-terminated names allocated by parseNameTable().
char* nameTable;
int32_t zoneCount; // Total number of zones
int32_t stdZoneSize; // Bytes occupied by stdZones
int32_t dstZoneSize; // Bytes occupied by dstZones
int32_t nameTableSize; // Total bytes in name table
// TRUE if the copyright string is passed to udata_create()
// (command line option -c).
bool_t useCopyright;
public:
// Program entry point proper; the global main() forwards here.
int main(int argc, char *argv[]);
private:
// Writes header, zone arrays and name table; returns the byte
// count reported by udata_finish().
int32_t writeTzDatFile(FileStream* out);
// Parses the whole input file and fixes up the header offsets.
void parseTzTextFile(FileStream* in);
// High level parsing
void parseHeader(FileStream* in);
StandardZone* parseStandardZones(FileStream* in);
void parse1StandardZone(FileStream* in, StandardZone& zone);
DSTZone* parseDSTZones(FileStream* in);
void parse1DSTZone(FileStream* in, DSTZone& zone);
void parseDSTRule(char*& p, TZRule& rule);
char* parseNameTable(FileStream* in);
// Low level parsing and reading
// Reads one line that must consist of a single integer in [min, max].
int32_t readIntegerLine(FileStream* in, int32_t min, int32_t max);
// Parses /-?\d+/ at p and advances p past it.
int32_t _parseInteger(char*& p);
// Parses an integer, checks the character that follows it and the
// allowed range (the last two arguments are minimum and maximum).
int32_t parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t);
// Reads one line into buffer, strips comment and line ending,
// and returns the remaining length.
int32_t readLine(FileStream* in);
// Error handling
// Prints msg (and the current input line) to stderr and exits with 1.
void die(const char* msg);
// Prints the command line synopsis to stderr and exits with 1.
void usage(const char* argv0);
};
int main(int argc, char *argv[]) {
gentz x;
return x.main(argc, argv);
}
// Definitions of the static constants declared in class gentz.
// Largest accepted magnitude of a GMT offset: 24 hours, in seconds.
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60;
const char gentz::COMMENT = '#';
// CR, LF and TAB are given as numeric character codes (13, 10, 9)
// rather than as escape sequences.
const char gentz::CR = ((char)13);
const char gentz::LF = ((char)10);
const char gentz::MINUS = '-';
const char gentz::SPACE = ' ';
const char gentz::TAB = ((char)9);
const char gentz::ZERO = '0';
const char gentz::SEP = ',';
const char gentz::NUL = ((char)0);
// Line that terminates the standard zone list and the DST zone list.
const char* gentz::END_KEYWORD = "end";
/**
 * Print the command line synopsis to stderr and terminate the program
 * with exit status 1. This function does not return.
 *
 * @param argv0 program name to show in the synopsis
 */
void gentz::usage(const char* argv0) {
    static const char* const synopsis =
        "Usage: %s [-c[+|-]] infile outfile\n"
        " -c[+|-] [do|do not] include copyright (default=+)\n"
        " infile text file produced by tz.pl\n"
        " outfile binary file suitable for memory mapping\n";
    fprintf(stderr, synopsis, argv0);
    exit(1);
}
/**
 * Run the tool: parse the command line, read the text zone file,
 * write the binary data file and report what was done on stdout.
 *
 * Command line: [-c[+|-]] infile outfile
 *   -c or -c+  include the copyright string (this is also the default)
 *   -c-        omit the copyright string
 * Any other option, a missing file name or a surplus argument prints
 * the usage text and exits.
 *
 * @param argc argument count as passed to the global main()
 * @param argv argument vector as passed to the global main()
 * @return 0 on success; every failure exits the process via usage()
 *         or die() instead of returning
 */
int gentz::main(int argc, char *argv[]) {
    // Parse arguments
    useCopyright = TRUE;
    const char* infile = 0;
    const char* outfile = 0;
    for (int i=1; i<argc; ++i) {
        const char* arg = argv[i];
        if (arg[0] == '-') {
            if (arg[1] != 'c') {
                usage(argv[0]);
            }
            switch (arg[2]) {
            case '\0':
                // A bare "-c" means "-c+": the usage text documents
                // '+' as the default for the optional suffix.
            case '+':
                useCopyright = TRUE;
                break;
            case '-':
                useCopyright = FALSE;
                break;
            default:
                usage(argv[0]);
            }
        } else if (infile == 0) {
            infile = arg;
        } else if (outfile == 0) {
            outfile = arg;
        } else {
            usage(argv[0]);
        }
    }
    // infile is always assigned before outfile, so this single test
    // also catches a missing input file name.
    if (outfile == 0) {
        usage(argv[0]);
    }

    // Keep the line buffer empty while no input line is current so
    // that die() does not echo a stale line.
    *buffer = NUL;
    FileStream* in = T_FileStream_open(infile, "r");
    if (in == 0) {
        die("Cannot open input file");
    }
    parseTzTextFile(in);
    T_FileStream_close(in);
    *buffer = NUL;

    // The counts are 16/32-bit integers; cast them explicitly to the
    // types the format specifiers require.
    fprintf(stdout, "Input file %s, data version %u(%u)\n",
            infile,
            (unsigned int)header.versionYear,
            (unsigned int)header.versionSuffix);
    fprintf(stdout, "Read %ld standard zones, %ld dst zones, %ld zone names\n",
            (long)header.standardCount, (long)header.dstCount, (long)zoneCount);

    FileStream* out = T_FileStream_open(outfile, "w");
    if (out == 0) {
        die("Cannot open output file");
    }
    int32_t wlen = writeTzDatFile(out);
    T_FileStream_close(out);
    fprintf(stdout, "Wrote to %s: %ld bytes\n",
            outfile, (long)wlen);
    // REMOVE THIS NOTICE when it no longer applies:
    fprintf(stdout, "NOTE: Currently, gentz writes the output file to"
            " the data directory and creates an EMPTY file of the"
            " same name in the target directory. Ignore the empty file.");
    return 0; // success
}
/**
 * Write the parsed data through the udata writer as four consecutive
 * blocks: header, standard zones, DST zones, name table. The byte
 * count reported by udata_finish() must equal the sum of the four
 * block sizes; any failure is fatal.
 *
 * @param out not used by this function
 * @return number of bytes reported by udata_finish()
 */
int32_t gentz::writeTzDatFile(FileStream* out) {
    UErrorCode ec = U_ZERO_ERROR;

    // Pass a null copyright pointer when the user asked for none.
    const char* copyright = 0;
    if (useCopyright) {
        copyright = DATA_COPYRIGHT;
    }

    UNewDataMemory* mem = udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
                                       copyright, &ec);
    if (U_FAILURE(ec)) {
        die("Unable to create data memory");
    }

    udata_writeBlock(mem, &header, sizeof(header));
    udata_writeBlock(mem, stdZones, stdZoneSize);
    udata_writeBlock(mem, dstZones, dstZoneSize);
    udata_writeBlock(mem, nameTable, nameTableSize);

    uint32_t written = udata_finish(mem, &ec);
    if (U_FAILURE(ec)) {
        die("Error writing output file");
    }

    const size_t expected = sizeof(header) + stdZoneSize +
                            dstZoneSize + nameTableSize;
    if (written != expected) {
        die("Written file doesn't match expected size");
    }
    return written;
}
/**
 * Parse the complete input file in its fixed order (header, standard
 * zones, DST zones, name table), then compute the byte sizes of the
 * two zone arrays and the block offsets stored in the header.
 * Inconsistent counts or negative offsets are fatal.
 *
 * @param in stream positioned at the start of the text file
 */
void gentz::parseTzTextFile(FileStream* in) {
    parseHeader(in);
    stdZones = parseStandardZones(in);
    dstZones = parseDSTZones(in);

    // The total announced in the file header must equal the sum of
    // the two list lengths.
    const int32_t listedZones = (int32_t)(header.standardCount + header.dstCount);
    if (listedZones != zoneCount) {
        die("Zone counts don't add up");
    }

    nameTable = parseNameTable(in);

    // Fixup the header offsets: the blocks follow the header back to
    // back in the order standard zones, DST zones, name table.
    stdZoneSize = (char*)(stdZones + header.standardCount) - (char*)stdZones;
    dstZoneSize = (char*)(dstZones + header.dstCount) - (char*)dstZones;
    header.standardOffset = sizeof(header);
    header.dstOffset = header.standardOffset + stdZoneSize;
    header.nameTableOffset = header.dstOffset + dstZoneSize;

    if (!(header.standardOffset >= 0 &&
          header.dstOffset >= 0 &&
          header.nameTableOffset >= 0)) {
        die("Negative offset in header after fixup");
    }
}
/**
 * Read the four integer lines that open the input file: version
 * year, version suffix, total zone count and name table size in
 * bytes. Each value is range checked by readIntegerLine().
 *
 * @param in stream positioned at the first line of the file
 */
void gentz::parseHeader(FileStream* in) {
    // Data version: year and suffix, one line each, 16 bits each.
    const int32_t year = readIntegerLine(in, 0, 0xFFFF);
    header.versionYear = (uint16_t) year;
    const int32_t suffix = readIntegerLine(in, 0, 0xFFFF);
    header.versionSuffix = (uint16_t) suffix;

    // Total number of zones
    zoneCount = readIntegerLine(in, 0, MAX_ZONES);

    // Name table size in bytes; must be at least 1.
    // (0x00FFFFFF is an arbitrary upper limit; adjust as needed.)
    nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF);
}
/**
 * Read the standard zone list: a count line (1..MAX_ZONES), that many
 * zone lines, and a closing "end" line. The count is stored in
 * header.standardCount.
 *
 * @param in stream positioned at the count line
 * @return newly allocated array of header.standardCount zones
 */
StandardZone* gentz::parseStandardZones(FileStream* in) {
    header.standardCount = readIntegerLine(in, 1, MAX_ZONES);

    StandardZone* const table = new StandardZone[header.standardCount];
    if (table == 0) {
        die("Out of memory");
    }

    StandardZone* const tableEnd = table + header.standardCount;
    for (StandardZone* entry = table; entry != tableEnd; ++entry) {
        parse1StandardZone(in, *entry);
    }

    // The list must be closed by the end keyword.
    readLine(in);
    const bool_t closed = (icu_strcmp(buffer, END_KEYWORD) == 0);
    if (!closed) {
        die("Keyword 'end' missing");
    }
    return table;
}
/**
 * Read one line of the form "nameOffset,gmtOffset" and store the two
 * values in the given zone. nameOffset must lie in 0..nameTableSize
 * and gmtOffset within +/- MAX_GMT_OFFSET; nothing may follow.
 *
 * @param in   stream positioned at the zone line
 * @param zone receives the parsed fields
 */
void gentz::parse1StandardZone(FileStream* in, StandardZone& zone) {
    readLine(in);
    char* cursor = buffer;

    const int32_t nameOff = parseInteger(cursor, SEP, 0, nameTableSize);
    zone.nameOffset = nameOff;

    const int32_t gmtOff = parseInteger(cursor, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
    zone.gmtOffset = gmtOff;
}
/**
 * Read the DST zone list: a count line (1..MAX_ZONES), that many zone
 * lines, and a closing "end" line. The count is stored in
 * header.dstCount.
 *
 * @param in stream positioned at the count line
 * @return newly allocated array of header.dstCount zones
 */
DSTZone* gentz::parseDSTZones(FileStream* in) {
    header.dstCount = readIntegerLine(in, 1, MAX_ZONES);

    DSTZone* const table = new DSTZone[header.dstCount];
    if (table == 0) {
        die("Out of memory");
    }

    DSTZone* const tableEnd = table + header.dstCount;
    for (DSTZone* entry = table; entry != tableEnd; ++entry) {
        parse1DSTZone(in, *entry);
    }

    // The list must be closed by the end keyword.
    readLine(in);
    const bool_t closed = (icu_strcmp(buffer, END_KEYWORD) == 0);
    if (!closed) {
        die("Keyword 'end' missing");
    }
    return table;
}
/**
 * Read one DST zone line. The fields, in order, are: name offset,
 * GMT offset, the onset rule, the cease rule, and the DST savings
 * (0..12*60), which must be the last thing on the line.
 *
 * @param in   stream positioned at the zone line
 * @param zone receives the parsed fields
 */
void gentz::parse1DSTZone(FileStream* in, DSTZone& zone) {
    readLine(in);
    char* cursor = buffer;

    zone.nameOffset = parseInteger(cursor, SEP, 0, nameTableSize);
    zone.gmtOffset = parseInteger(cursor, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);

    // Both rules consume their own trailing separator.
    parseDSTRule(cursor, zone.onsetRule);
    parseDSTRule(cursor, zone.ceaseRule);

    const int32_t savings = parseInteger(cursor, NUL, 0, 12*60);
    zone.dstSavings = (uint16_t) savings;
}
/**
 * Parse one DST rule of the form "month,dowim,dow,time,mode," and
 * store it in rule. The mode letter 'w', 's' or 'u' is translated to
 * WALL_TIME, STANDARD_TIME or UTC_TIME. Any malformed field is fatal.
 *
 * @param p    current position in the line buffer; on return it
 *             points just past the separator that follows the mode
 * @param rule receives the parsed fields
 */
void gentz::parseDSTRule(char*& p, TZRule& rule) {
    rule.month = (uint8_t) parseInteger(p, SEP, 0, 11);
    rule.dowim = (int8_t) parseInteger(p, SEP, -31, 31);
    rule.dow = (int8_t) parseInteger(p, SEP, -7, 7);
    rule.time = (uint16_t) parseInteger(p, SEP, 0, 24*60);

    // Validate the mode letter BEFORE stepping over it. If the line
    // ends here, *p is the terminating NUL; advancing first and then
    // testing for the separator would read beyond the end of the
    // string.
    switch (*p) {
    case 'w':
        rule.mode = WALL_TIME;
        break;
    case 's':
        rule.mode = STANDARD_TIME;
        break;
    case 'u':
        rule.mode = UTC_TIME;
        break;
    default:
        die("Invalid rule time mode");
        break;
    }
    ++p;

    if (*p++ != SEP) {
        die("Separator missing");
    }
}
/**
 * Read the name table: a count line that must equal zoneCount,
 * followed by one name per line. The names are stored back to back,
 * each followed by a NUL, in a block of exactly nameTableSize bytes.
 * A table that does not fill the block exactly is a fatal error.
 *
 * @param in stream positioned at the count line
 * @return newly allocated block of nameTableSize bytes
 */
char* gentz::parseNameTable(FileStream* in) {
    int32_t n = readIntegerLine(in, 1, MAX_ZONES);
    if (n != zoneCount) {
        die("Zone count doesn't match name table count");
    }
    char* names = new char[nameTableSize];
    if (names == 0) {
        die("Out of memory");
    }
    char* p = names;
    char* limit = names + nameTableSize;
    for (int32_t i=0; i<n; ++i) {
        int32_t len = readLine(in);
        // Each entry needs len bytes plus one byte for its NUL, so it
        // fits only if len is strictly less than the space left.
        // Comparing against the remaining space also avoids forming
        // a pointer beyond the end of the block.
        if (len >= (limit - p)) {
            die("Name table longer than declared size");
        }
        icu_memcpy(p, buffer, len);
        p += len;
        *p++ = NUL;
    }
    if (p != limit) {
        die("Name table shorter than declared size");
    }
    return names;
}
/**
 * Fetch the next input line and interpret the whole line as one
 * integer; anything after the digits is an error, as is a value
 * outside [min, max].
 *
 * @param in  stream to read the line from
 * @param min smallest accepted value
 * @param max largest accepted value
 * @return the parsed value
 */
int32_t gentz::readIntegerLine(FileStream* in, int32_t min, int32_t max) {
    readLine(in);
    char* cursor = buffer;
    const int32_t value = parseInteger(cursor, NUL, min, max);
    return value;
}
/**
 * Parse an integer from the given character buffer.
 * Advance p past the last parsed character. Return
 * the result. The integer must be of the form
 * /-?\d+/.
 *
 * At least one digit is required. A magnitude that does not fit in
 * a positive int32_t is rejected instead of being allowed to
 * overflow. Both conditions are fatal.
 *
 * @param p current position in the line buffer, advanced on return
 * @return the parsed value
 */
int32_t gentz::_parseInteger(char*& p) {
    // Largest magnitude that can be accumulated in an int32_t.
    const int32_t LIMIT = 0x7FFFFFFF;
    int32_t n = 0;
    int32_t digitCount = 0;
    int32_t digit;
    bool_t negative = FALSE;
    if (*p == MINUS) {
        ++p;
        negative = TRUE;
    }
    for (;;) {
        digit = *p - ZERO;
        if (digit < 0 || digit > 9) {
            break;
        }
        // 10*n + digit would exceed LIMIT; signed overflow is
        // undefined, and a wrapped value might pass the caller's
        // range check.
        if (n > (LIMIT - digit) / 10) {
            die("Integer field out of range");
        }
        n = 10*n + digit;
        p++;
        digitCount++;
    }
    if (digitCount < 1) {
        die("Unable to parse integer");
    }
    if (negative) {
        n = -n;
    }
    return n;
}
/**
 * Parse an integer at p, then require that the character right after
 * it is nextExpectedChar and that the value lies in [min, max].
 * Either violation is fatal.
 *
 * @param p                current position in the line buffer; on
 *                         return it points one past the character
 *                         that followed the integer
 * @param nextExpectedChar character that must follow the digits
 * @param min              smallest accepted value
 * @param max              largest accepted value
 * @return the parsed value
 */
int32_t gentz::parseInteger(char*& p, char nextExpectedChar,
                            int32_t min, int32_t max) {
    const int32_t value = _parseInteger(p);

    // Consume the following character whether or not it matches.
    const char following = *p;
    ++p;
    if (following != nextExpectedChar) {
        die("Character following integer unexpected");
    }

    const bool_t inRange = (min <= value && value <= max);
    if (!inRange) {
        die("Integer field out of range");
    }
    return value;
}
void gentz::die(const char* msg) {
fprintf(stderr, "ERROR, %s\n", msg);
if (*buffer) {
fprintf(stderr, "Current input line: %s\n", buffer);
}
exit(1);
}
/**
 * Read the next line from the stream into the line buffer and clean
 * it up: a trailing comment (from the COMMENT character on, together
 * with the spaces and tabs just before it) is removed, and so are
 * trailing CR and LF characters.
 *
 * @param in stream to read from
 * @return length of the cleaned line left in the buffer
 */
int32_t gentz::readLine(FileStream* in) {
    // Start from an empty buffer. If the read stores nothing (for
    // example at end of file -- presumably fgets-like behaviour,
    // confirm in filestrm.h) the caller then sees an empty line
    // rather than a second copy of the previous one.
    *buffer = NUL;
    T_FileStream_readLine(in, buffer, BUFLEN);
    // Trim off trailing comment
    char* p = icu_strchr(buffer, COMMENT);
    if (p != 0) {
        // Back up past any space or tab characters before
        // the comment character.
        while (p > buffer && (p[-1] == SPACE || p[-1] == TAB)) {
            p--;
        }
        *p = NUL;
    }
    // Delete any trailing ^J and/or ^M characters
    p = buffer + icu_strlen(buffer);
    while (p > buffer && (p[-1] == CR || p[-1] == LF)) {
        p--;
    }
    *p = NUL;
    // p now points at the terminator, so the length is known
    // without scanning the buffer again.
    return (int32_t)(p - buffer);
}