ICU-65 update gentz for new binary format and alias table; make pm file names 8.3

X-SVN-Rev: 300
This commit is contained in:
Alan Liu 1999-12-05 05:55:28 +00:00
parent 76bdfc909a
commit aa7e2fc7b8
6 changed files with 1801 additions and 490 deletions

View file

@ -36,9 +36,6 @@
#define INPUT_FILE "tz.txt"
#define OUTPUT_FILE "tz.dat"
#define DATA_NAME "tz"
#define DATA_TYPE "dat"
#define DATA_COPYRIGHT \
"Copyright (C) 1999, International Business Machines " \
"Corporation and others. All Rights Reserved."
@ -53,9 +50,9 @@ static const UDataInfo dataInfo = {
sizeof(UChar),
0,
'z', 'o', 'n', 'e', /* dataFormat */
1, 0, 0, 0, /* formatVersion */
1, 9, 9, 9 /* dataVersion */
TZ_SIG[0], TZ_SIG[1], TZ_SIG[2], TZ_SIG[3], /* dataFormat */
TZ_FORMAT_VERSION, 0, 0, 0, /* formatVersion */
0, 0, 0, 0 /* dataVersion - will be filled in with year.suffix */
};
@ -70,6 +67,9 @@ class gentz {
// larger is considered an error. Adjust as needed.
enum { MAX_ZONES = 1000 };
// The largest maxNameLength we accept as sensible. Adjust as needed.
enum { MAX_MAX_NAME_LENGTH = 100 };
// The maximum sensible GMT offset, in seconds
static const int32_t MAX_GMT_OFFSET;
@ -87,15 +87,19 @@ class gentz {
enum { BUFLEN = 1024 };
char buffer[BUFLEN];
int32_t lineNumber;
TZHeader header;
StandardZone* stdZones;
DSTZone* dstZones;
char* nameTable;
int32_t* indexByName;
OffsetIndex* indexByOffset;
int32_t zoneCount; // Total number of zones
int32_t maxPerOffset; // Maximum number of zones per offset
int32_t stdZoneSize;
int32_t dstZoneSize;
int32_t offsetIndexSize; // Total bytes in offset index table
int32_t nameTableSize; // Total bytes in name table
bool_t useCopyright;
@ -116,9 +120,13 @@ private:
void parse1DSTZone(FileStream* in, DSTZone& zone);
void parseDSTRule(char*& p, TZRule& rule);
int32_t* parseIndexTable(FileStream* in);
OffsetIndex* parseOffsetIndexTable(FileStream* in);
char* parseNameTable(FileStream* in);
// Low level parsing and reading
void readEndMarker(FileStream* in);
int32_t readIntegerLine(FileStream* in, int32_t min, int32_t max);
int32_t _parseInteger(char*& p);
int32_t parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t);
@ -134,7 +142,7 @@ int main(int argc, char *argv[]) {
return x.main(argc, argv);
}
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60;
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60; // seconds
const char gentz::COMMENT = '#';
const char gentz::CR = ((char)13);
const char gentz::LF = ((char)10);
@ -148,19 +156,19 @@ const char* gentz::END_KEYWORD = "end";
void gentz::usage(const char* argv0) {
fprintf(stderr,
"Usage: %s [-c[+|-]] infile outfile\n"
"Usage: %s [-c[+|-]] infile\n"
" -c[+|-] [do|do not] include copyright (default=+)\n"
" infile text file produced by tz.pl\n"
" outfile binary file suitable for memory mapping\n",
" infile text file produced by tz.pl\n",
argv0);
exit(1);
}
int gentz::main(int argc, char *argv[]) {
////////////////////////////////////////////////////////////
// Parse arguments
////////////////////////////////////////////////////////////
useCopyright = TRUE;
const char* infile = 0;
const char* outfile = 0;
for (int i=1; i<argc; ++i) {
const char* arg = argv[i];
if (arg[0] == '-') {
@ -179,17 +187,20 @@ int gentz::main(int argc, char *argv[]) {
}
} else if (infile == 0) {
infile = arg;
} else if (outfile == 0) {
outfile = arg;
} else {
usage(argv[0]);
}
}
if (outfile == 0) {
if (infile == 0) {
usage(argv[0]);
}
////////////////////////////////////////////////////////////
// Read the input file
////////////////////////////////////////////////////////////
*buffer = NUL;
lineNumber = 0;
fprintf(stdout, "Input file: %s\n", infile);
FileStream* in = T_FileStream_open(infile, "r");
if (in == 0) {
die("Cannot open input file");
@ -197,14 +208,13 @@ int gentz::main(int argc, char *argv[]) {
parseTzTextFile(in);
T_FileStream_close(in);
*buffer = NUL;
fprintf(stdout, "Input file %s, data version %u(%u)\n",
infile, header.versionYear, header.versionSuffix);
fprintf(stdout, "Read %ld standard zones, %ld dst zones, %ld zone names\n",
header.standardCount, header.dstCount, zoneCount);
////////////////////////////////////////////////////////////
// Write the output file
////////////////////////////////////////////////////////////
int32_t wlen = writeTzDatFile();
fprintf(stdout, "Wrote to %s: %ld bytes\n",
outfile, wlen);
fprintf(stdout, "Output file: %s.%s, %ld bytes\n",
TZ_DATA_NAME, TZ_DATA_TYPE, wlen);
return 0; // success
}
@ -213,15 +223,23 @@ int32_t gentz::writeTzDatFile() {
UNewDataMemory *pdata;
UErrorCode status = U_ZERO_ERROR;
pdata = udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
// Fill in dataInfo with year.suffix
*(uint16_t*)&(dataInfo.dataVersion[0]) = header.versionYear;
*(uint16_t*)&(dataInfo.dataVersion[2]) = header.versionSuffix;
pdata = udata_create(TZ_DATA_TYPE, TZ_DATA_NAME, &dataInfo,
useCopyright ? DATA_COPYRIGHT : 0, &status);
if (U_FAILURE(status)) {
die("Unable to create data memory");
}
// Careful: This order cannot be changed (without changing
// the offset fixup code).
udata_writeBlock(pdata, &header, sizeof(header));
udata_writeBlock(pdata, stdZones, stdZoneSize);
udata_writeBlock(pdata, dstZones, dstZoneSize);
udata_writeBlock(pdata, indexByName, header.count * sizeof(indexByName[0]));
udata_writeBlock(pdata, indexByOffset, offsetIndexSize);
udata_writeBlock(pdata, nameTable, nameTableSize);
uint32_t dataLength = udata_finish(pdata, &status);
@ -230,7 +248,10 @@ int32_t gentz::writeTzDatFile() {
}
if (dataLength != (sizeof(header) + stdZoneSize +
dstZoneSize + nameTableSize)) {
dstZoneSize + nameTableSize +
header.count * sizeof(indexByName[0]) +
offsetIndexSize
)) {
die("Written file doesn't match expected size");
}
return dataLength;
@ -240,37 +261,139 @@ void gentz::parseTzTextFile(FileStream* in) {
parseHeader(in);
stdZones = parseStandardZones(in);
dstZones = parseDSTZones(in);
if (zoneCount != (int32_t)(header.standardCount + header.dstCount)) {
if (header.count != (header.standardCount + header.dstCount)) {
die("Zone counts don't add up");
}
nameTable = parseNameTable(in);
// Fixup the header offsets
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
header.standardDelta = sizeof(header);
header.dstDelta = header.standardDelta + stdZoneSize;
header.nameIndexDelta = header.dstDelta + dstZoneSize;
header.standardOffset = sizeof(header);
header.dstOffset = header.standardOffset + stdZoneSize;
header.nameTableOffset = header.dstOffset + dstZoneSize;
// Read in index tables after header is mostly fixed up
indexByName = parseIndexTable(in);
indexByOffset = parseOffsetIndexTable(in);
if (header.standardOffset < 0 ||
header.dstOffset < 0 ||
header.nameTableOffset < 0) {
header.offsetIndexDelta = header.nameIndexDelta + header.count *
sizeof(indexByName[0]);
header.nameTableDelta = header.offsetIndexDelta + offsetIndexSize;
if (header.standardDelta < 0 ||
header.dstDelta < 0 ||
header.nameTableDelta < 0) {
die("Negative offset in header after fixup");
}
}
/**
* Index tables are lists of specifiers of the form /[sd]\d+/, where
* the first character determines if it is a standard or DST zone,
* and the following number is in the range 0..n-1, where n is the
* count of that type of zone.
*
* Header must already be read in and the offsets must be fixed up.
* Standard and DST zones must be read in.
*/
int32_t* gentz::parseIndexTable(FileStream* in) {
// The table must start with a count equal to the total zone count
// already read into the header.
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
if (n != header.count) {
die("Count mismatch in index table");
}
int32_t* result = new int32_t[n];
for (uint32_t i=0; i<n; ++i) {
readLine(in);
// Each entry is /[sd]\d+/: skip the type letter, parse the number.
char* p = buffer+1;
uint32_t index = parseInteger(p, NUL, 0, header.count);
switch (buffer[0]) {
case 's':
if (index >= header.standardCount) {
die("Standard index entry out of range");
}
// Convert the array index into a byte delta from the start of
// the file image: standard-zone base + offset within the array.
result[i] = header.standardDelta +
((char*)&stdZones[index] - (char*)&stdZones[0]);
break;
case 'd':
if (index >= header.dstCount) {
die("DST index entry out of range");
}
// Same computation, but relative to the DST-zone array base.
result[i] = header.dstDelta +
((char*)&dstZones[index] - (char*)&dstZones[0]);
break;
default:
die("Malformed index entry");
break;
}
}
// Every list in the input file is terminated by the 'end' keyword.
readEndMarker(in);
fprintf(stdout, " Read %lu name index table entries, in-memory size %ld bytes\n",
n, n * sizeof(int32_t));
return result;
}
/**
 * Parse the index-by-GMT-offset list.  Each input line holds a GMT
 * offset (seconds), a count, and that many zone numbers.  The entries
 * are packed back-to-back into a single byte buffer as variable-length
 * OffsetIndex records; offsetIndexSize is set to the packed size.
 * Header and zone lists must already be read (uses maxPerOffset and
 * header.count).  Fix: corrected "Interal" typo in the internal-error
 * message.
 */
OffsetIndex* gentz::parseOffsetIndexTable(FileStream* in) {
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
// We don't know how big the whole thing will be yet, but we can use
// the maxPerOffset number to compute an upper limit.
//
// Structs will not be 4-aligned because we'll be writing them out
// ourselves. Don't try to compute the exact size in advance
// (unless we want to avoid the use of sizeof(), which may
// introduce padding that we won't actually employ).
int32_t maxPossibleSize = n * (sizeof(OffsetIndex) +
(maxPerOffset-1) * sizeof(uint16_t));
int8_t *result = new int8_t[maxPossibleSize];
if (result == 0) {
die("Out of memory");
}
// Read each line and construct the corresponding entry
OffsetIndex* index = (OffsetIndex*)result;
for (uint32_t i=0; i<n; ++i) {
readLine(in);
char* p = buffer;
index->gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
index->count = (uint16_t)parseInteger(p, SEP, 1, maxPerOffset);
// The zone numbers form a variable-length array hanging off the end
// of the fixed struct; zoneNumber is its first element.
uint16_t* zoneNumberArray = &(index->zoneNumber);
for (uint16_t j=0; j<index->count; ++j) {
zoneNumberArray[j] = (uint16_t)
parseInteger(p, (j==(index->count-1))?NUL:SEP,
0, header.count-1);
}
// The next record is packed immediately after this one's zone array;
// the last record gets a delta of zero as a terminator.
int8_t* nextIndex = (int8_t*)&(zoneNumberArray[index->count]);
index->nextEntryDelta = (i==(n-1)) ? 0 : (nextIndex - (int8_t*)index);
index = (OffsetIndex*)nextIndex;
}
offsetIndexSize = (int8_t*)index - (int8_t*)result;
if (offsetIndexSize > maxPossibleSize) {
die("Yikes! Internal error while constructing offset index table");
}
readEndMarker(in);
fprintf(stdout, " Read %lu offset index table entries, in-memory size %ld bytes\n",
n, offsetIndexSize);
return (OffsetIndex*)result;
}
void gentz::parseHeader(FileStream* in) {
int32_t ignored;
// Version string, e.g., "1999j" -> (1999<<16) | 10
header.versionYear = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
header.versionYear = (uint16_t) readIntegerLine(in, 1990, 0xFFFF);
header.versionSuffix = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
// Zone count
zoneCount = readIntegerLine(in, 0, MAX_ZONES);
header.count = readIntegerLine(in, 1, MAX_ZONES);
maxPerOffset = readIntegerLine(in, 1, MAX_ZONES);
/*header.maxNameLength*/ ignored = readIntegerLine(in, 1, MAX_MAX_NAME_LENGTH);
// Size of name table in bytes
// (0x00FFFFFF is an arbitrary upper limit; adjust as needed.)
nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF);
fprintf(stdout, " Read header, data version %u(%u), in-memory size %ld bytes\n",
header.versionYear, header.versionSuffix, sizeof(header));
}
StandardZone* gentz::parseStandardZones(FileStream* in) {
@ -282,18 +405,19 @@ StandardZone* gentz::parseStandardZones(FileStream* in) {
for (uint32_t i=0; i<header.standardCount; i++) {
parse1StandardZone(in, zones[i]);
}
readLine(in);
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
die("Keyword 'end' missing");
}
readEndMarker(in);
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
fprintf(stdout, " Read %lu standard zones, in-memory size %ld bytes\n",
header.standardCount, stdZoneSize);
return zones;
}
void gentz::parse1StandardZone(FileStream* in, StandardZone& zone) {
readLine(in);
char* p = buffer;
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
}
DSTZone* gentz::parseDSTZones(FileStream* in) {
@ -305,18 +429,19 @@ DSTZone* gentz::parseDSTZones(FileStream* in) {
for (uint32_t i=0; i<header.dstCount; i++) {
parse1DSTZone(in, zones[i]);
}
readLine(in);
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
die("Keyword 'end' missing");
}
readEndMarker(in);
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
fprintf(stdout, " Read %lu DST zones, in-memory size %ld bytes\n",
header.dstCount, dstZoneSize);
return zones;
}
void gentz::parse1DSTZone(FileStream* in, DSTZone& zone) {
readLine(in);
char* p = buffer;
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
parseDSTRule(p, zone.onsetRule);
parseDSTRule(p, zone.ceaseRule);
zone.dstSavings = (uint16_t) parseInteger(p, NUL, 0, 12*60);
@ -349,7 +474,7 @@ void gentz::parseDSTRule(char*& p, TZRule& rule) {
char* gentz::parseNameTable(FileStream* in) {
int32_t n = readIntegerLine(in, 1, MAX_ZONES);
if (n != zoneCount) {
if (n != (int32_t)header.count) {
die("Zone count doesn't match name table count");
}
char* names = new char[nameTableSize];
@ -371,9 +496,21 @@ char* gentz::parseNameTable(FileStream* in) {
if (p != limit) {
die("Name table shorter than declared size");
}
readEndMarker(in);
fprintf(stdout, " Read %ld names, in-memory size %ld bytes\n", n, nameTableSize);
return names;
}
/**
* Read the end marker (terminates each list).
*/
void gentz::readEndMarker(FileStream* in) {
// Consume one line and verify it is exactly the 'end' keyword that
// terminates every list in the input file; otherwise abort.
readLine(in);
const int isEnd = (icu_strcmp(buffer, END_KEYWORD) == 0);
if (!isEnd) {
die("Keyword 'end' missing");
}
}
/**
* Read a line from the FileStream and parse it as an
* integer. There should be nothing else on the line.
@ -432,12 +569,13 @@ int32_t gentz::parseInteger(char*& p, char nextExpectedChar,
void gentz::die(const char* msg) {
fprintf(stderr, "ERROR, %s\n", msg);
if (*buffer) {
fprintf(stderr, "Current input line: %s\n", buffer);
fprintf(stderr, "Input file line %ld: \"%s\"\n", lineNumber, buffer);
}
exit(1);
}
int32_t gentz::readLine(FileStream* in) {
++lineNumber;
T_FileStream_readLine(in, buffer, BUFLEN);
// Trim off trailing comment
char* p = icu_strchr(buffer, COMMENT);

View file

@ -1,11 +1,18 @@
Readme file for ICU time zone data (source/tools/gentz)
The time zone data in ICU is taken from the UNIX data files at
ftp://elsie.nci.nih.gov/pub/tzdata<year>.
RAW DATA
--------
The time zone data in ICU is taken from the UNIX data files at
ftp://elsie.nci.nih.gov/pub/tzdata<year>. The other input to the
process is an alias table, described below.
BUILD PROCESS
-------------
Two tools are used to process the data into a format suitable for ICU:
tz.pl directory of raw data files -> tz.txt
tz.pl directory of raw data files + tz.alias -> tz.txt
gentz tz.txt -> tz.dat (memory mappable binary file)
After gentz is run, standard ICU data tools are used to incorporate
@ -34,4 +41,35 @@ The tz.txt file is typically checked into CVS, whereas the raw data
files are not, since they are readily available from the URL listed
above.
ALIAS TABLE
-----------
For backward compatibility, we define several three-letter IDs that
have been used since early ICU and correspond to IDs used in old JDKs.
These IDs are listed in tz.alias. The tz.pl script processes this
alias table and issues errors if there are problems.
IDS
---
All *system* zone IDs must consist only of characters in the invariant
set. See utypes.h for an explanation of what this means. If an ID is
encountered that contains a non-invariant character, tz.pl complains.
Non-system zones may try to use non-invariant characters, but they
shouldn't because of possible collisions with system IDs when the
invariant char converter is used (see TimeZone class for details).
Etc/GMT...
----------
Users may be confused by the fact that various zones with names of the
form Etc/GMT+n appear to have an offset of the wrong sign. For
example, Etc/GMT+8 is 8 hours *behind* GMT; that is, it corresponds to
what one typically sees displayed as "GMT-8:00". The reason for this
inversion is explained in the UNIX zone data file "etcetera".
Briefly, this is done intentionally in order to comply with
POSIX-style signedness. In ICU we reproduce the UNIX zone behavior
faithfully, including this confusing aspect.
Alan Liu 1999

View file

@ -12,6 +12,9 @@
# parse out the current zones and create a listing of current zones.
# Author: Alan Liu
######################################################################
# This script reads an alias table, $TZ_ALIAS, and creates clones of
# standard UNIX zones with alias names.
######################################################################
# To update the zone data, download the latest data from the NIH URL
# listed above into a directory. Run this script with the directory
# name as an argument. THE DIRECTORY NAME MUST END IN tzdataYYYYR.
@ -27,31 +30,36 @@
# - Lines may be followed by a comment; the parser must ignore
# anything of the form /\s+#.*$/ in each line.
# |3065,14400 # Asia/Dubai GMT+4:00
# - The file contains a header and 3 lists.
# - The file contains a header and 5 lists.
# - The header contains the version of the unix data, the total
# zone count, and the length of the name table in bytes.
# |1999 # (tzdata1999j) ftp://elsie.nci.nih.gov data version YEAR
# |10 # (tzdata1999j) ftp://elsie.nci.nih.gov data version SUFFIX
# |387 # total zone count
# |5906 # length of name table in bytes
# zone count, the maximum number of zones sharing the same value
# of gmtOffset, the length of the name table in bytes, and
# the length of the longest name (not including the terminating
# zero byte).
# | 1999 # (tzdata1999j) version of Olson zone
# | 10 # data from ftp://elsie.nci.nih.gov
# | 387 # total zone count
# | 40 # max count of zones with same gmtOffset
# | 25 # max name length not incl final zero
# | 5906 # length of name table in bytes
# - Lists start with a count of the records to follow, the records
# themselves (one per line), and a single line with the keyword
# 'end'.
# - The first list is the list of standard zones:
# |208 # count of standard zones to follow
# |0,0 # Africa/Abidjan GMT+0:00
# |28,10800 # Africa/Addis_Ababa GMT+3:00
# | 208 # count of standard zones to follow
# | 0,0 # Africa/Abidjan GMT+0:00
# | 28,10800 # Africa/Addis_Ababa GMT+3:00
# ...
# |end
# | end
# Each standard zone record contains two integers. The first
# is a byte offset into the name table for the name of the zone.
# The second integer is the GMT offset in SECONDS for this zone.
# - The second list is the list of DST zones:
# |179 # count of dst zones to follow
# |15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
# |184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
# | 179 # count of dst zones to follow
# | 15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
# | 184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
# ...
# |end
# | end
# Each record starts with the same two integers as a standard
# zone record. Following this are data for the onset rule and
# the cease rule. Each rule is described by the following integers:
@ -63,13 +71,38 @@
# The last integer in the record is the DST savings in MINUTES,
# typically 60.
# - The third list is the name table:
# |387 # count of names to follow
# |Africa/Abidjan
# |Africa/Accra
# | 387 # count of names to follow
# | Africa/Abidjan
# | Africa/Accra
# ...
# |end
# | end
# Each name is terminated by a newline (like all lines in the file).
# The offsets in the first two lists refer to this table.
# - The fourth list is an index list by name. The index entries
# themselves are of the form /[sd]\d+/, where the first character
# indicates standard or DST, and the number that follows indexes
# into the corresponding array.
# | 416 # count of name index table entries to follow
# | d0 # ACT
# | d1 # AET
# | d2 # AGT
# | d3 # ART
# | d4 # AST
# | s0 # Africa/Abidjan
# ...
# | end
# - The fifth list is an index by GMT offset. Each line lists the
# zones with the same offset. The first number on the line
# is the GMT offset in seconds. The second number is the count
# of zone numbers to follow. Each zone number is an integer from
# 0..n-1, where n is the total number of zones. The zone numbers
# refer to the zone list in alphabetical order.
# | 39 # index by offset entries to follow
# | -43200,1,280
# | -39600,6,279,365,373,393,395,398
# | -36000,8,57,278,349,379,386,387,403,405
# ...
# | end
######################################################################
# As of 1999j, here are the various possible values taken by the
# rule fields. See code below that generates this data.
@ -88,10 +121,32 @@ require 5; # Minimum version of perl needed
use strict;
use Getopt::Long;
use vars qw(@FILES $YEAR $DATA_DIR $OUT $SEP @MONTH
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION);
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION $TZ_ALIAS);
require 'dumpvar.pl';
use TZFileParser;
use TZUtility;
use tzparse;
use tzutil;
# File names
$OUT = 'tz.txt';
$TZ_ALIAS = 'tz.alias';
# Separator between fields in the output file
$SEP = ','; # Don't use ':'!
@FILES = qw(africa
antarctica
asia
australasia
backward
etcetera
europe
factory
northamerica
pacificnew
solar87
solar88
solar89
southamerica);
# We get the current year from the system here. Later
# we double check this against the zone data version.
@ -127,26 +182,6 @@ if ($DATA_DIR =~ /(tzdata(\d{4})(\w?))/) {
usage();
}
# Output file name
$OUT = 'tz.txt';
# Separator between fields in the output file
$SEP = ','; # Don't use ':'!
@FILES = qw(africa
antarctica
asia
australasia
backward
etcetera
europe
factory
northamerica
pacificnew
solar87
solar88
solar89
southamerica);
@MONTH = qw(jan feb mar apr may jun
jul aug sep oct nov dec);
@ -181,6 +216,8 @@ sub main {
TZ::Postprocess(\%ZONES, \%RULES);
incorporateAliases($TZ_ALIAS, \%ZONES);
print
"Read ", scalar keys %ZONES, " current zones and ",
scalar keys %RULES, " rules for $YEAR\n";
@ -207,10 +244,16 @@ sub main {
my %NAME_OFFSET;
my $STD_COUNT = 0; # Count of standard zones
my $DST_COUNT = 0; # Count of DST zones
my $maxNameLen = 0;
foreach my $z (sort keys %ZONES) {
# Make sure zone IDs only contain invariant chars
assertInvariantChars($z);
my $len = length($z);
$NAME_OFFSET{$z} = $offset;
$offset += length($z) + 1;
$offset += $len + 1;
$NAME_LIST .= "$z\n";
$maxNameLen = $len if ($len > $maxNameLen);
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
$STD_COUNT++;
} else {
@ -218,14 +261,35 @@ sub main {
}
}
my $NAME_SIZE = $offset;
# Find the maximum number of zones with the same value of
# gmtOffset.
my %perOffset; # Hash of offset -> count
foreach my $z (keys %ZONES) {
# Use parseOffset to normalize values - probably unnecessary
++$perOffset{parseOffset($ZONES{$z}->{gmtoff})};
}
my $maxPerOffset = 0;
foreach (values %perOffset) {
$maxPerOffset = $_ if ($_ > $maxPerOffset);
}
open(OUT,">$OUT") or die "Can't open $OUT for writing: $!";
############################################################
# EMIT HEADER
############################################################
# Zone data version
print OUT $VERSION_YEAR, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version YEAR\n";
print OUT $VERSION_SUFFIX, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version SUFFIX\n";
print OUT $VERSION_YEAR, " # ($RAW_VERSION) version of Olson zone\n";
print OUT $VERSION_SUFFIX, " # data from ftp://elsie.nci.nih.gov\n";
print OUT scalar keys %ZONES, " # total zone count\n";
print OUT $maxPerOffset, " # max count of zones with same gmtOffset\n";
print OUT $maxNameLen, " # max name length not incl final zero\n";
print OUT $NAME_SIZE, " # length of name table in bytes\n";
############################################################
# EMIT ZONE TABLES
############################################################
# Output first the standard zones, then the dst zones.
# Precede each list with the count of zones to follow,
# and follow it with the keyword 'end'.
@ -241,13 +305,73 @@ sub main {
print OUT "end\n"; # 'end' keyword for error checking
}
############################################################
# EMIT NAME TABLE
############################################################
# Output the name table, followed by 'end' keyword
print OUT scalar keys %ZONES, " # count of names to follow\n";
print OUT $NAME_LIST, "end\n";
############################################################
# EMIT INDEX BY NAME
############################################################
# Output the name index table. Since we don't know structure
# sizes, we output the index number of each zone. For example,
# "s0" is the first standard zone, "s1" is the second, etc.
# Likewise, "d0" is the first DST zone, "d1" is the second, etc.
# First compute index IDs, as described above.
my %indexID;
my $s = 0;
my $d = 0;
foreach my $z (sort keys %ZONES) {
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
$indexID{$z} = "s$s";
$s++;
} else {
$indexID{$z} = "d$d";
$d++;
}
}
# Now emit table sorted by name
print OUT scalar keys %ZONES, " # count of name index table entries to follow\n";
foreach my $z (sort keys %ZONES) {
print OUT $indexID{$z}, " # $z\n";
}
print OUT "end\n";
############################################################
# EMIT INDEX BY GMT OFFSET
############################################################
# Create a hash mapping zone name -> integer, from 0..n-1.
my %zoneNumber;
my $i = 0;
foreach (sort keys %ZONES) { $zoneNumber{$_} = $i++; }
# Create a hash by index. The hash has offset integers as keys
# and arrays of index numbers as values.
my %offsetMap;
foreach (sort keys %ZONES) {
my $offset = parseOffset($ZONES{$_}->{gmtoff});
push @{$offsetMap{$offset}}, $zoneNumber{$_};
}
# Emit it
print OUT scalar keys %offsetMap, " # index by offset entries to follow\n";
foreach (sort {$a <=> $b} keys %offsetMap) {
my $aref = $offsetMap{$_};
print OUT $_, ",", scalar @{$aref}, ",", join(",", @{$aref}), "\n";
}
print OUT "end\n";
############################################################
# END
############################################################
close(OUT);
print "$OUT written.\n";
if (0) {
TZ::FormZoneEquivalencyGroups(\%ZONES, \%RULES, \@EQUIV);
print
@ -281,6 +405,46 @@ sub main {
}
}
# Read the alias list and create clones with alias names. This
# sub should be called AFTER all standard zones have been read in.
# Param: File name of alias list
# Param: Ref to zone hash
sub incorporateAliases {
my ($aliasFile, $zones) = @_;
my $n = 0;
local *IN;
open(IN,$aliasFile) or die "Can't open $aliasFile: $!";
while (<IN>) {
s/\#.*//; # Strip trailing comments
next unless (/\S/); # Ignore blank lines
# Each data line is "<alias> <original>"; anything else is fatal.
unless (/^\s*(\S+)\s+(\S+)\s*$/) {
die "Bad line in alias table $aliasFile: $_\n";
}
my ($alias, $original) = ($1, $2);
if (exists $zones->{$alias}) {
die "Bad alias in $aliasFile: $alias is a standard UNIX zone. " .
"Please remove $alias from the alias table.\n";
}
if (!exists $zones->{$original}) {
die "Bad alias in $aliasFile: $alias maps to the nonexistent " .
"zone $original. Please fix this entry in the alias table.\n";
}
# We hardcode the GMT zone in the TimeZone class; don't include
# it in the tz.txt file.
if ($alias eq "GMT") {
die "Bad alias in $aliasFile: GMT is a hardcoded system zone. " .
"Please remove it from the alias table.\n";
}
# Create the alias!  NOTE(review): this shares the same zone hash
# ref between alias and original (not a deep copy) — appears
# intentional since zones are read-only afterwards; confirm.
$zones->{$alias} = $zones->{$original};
$n++;
}
print "Incorporated $n aliases from $aliasFile\n";
close(IN);
}
# Format a time zone as a machine-readable line of text. Another
# tool will read this line to construct a binary data structure
# representing this zone.
@ -481,4 +645,14 @@ sub parseDaySpecifier {
( $dowim, $dow );
}
# Confirm that the given ID contains only invariant characters.
# See utypes.h for an explanation.
# Param: string to be checked
# Die unless the given zone ID uses only ICU invariant characters
# (see utypes.h for the definition of the invariant set).
sub assertInvariantChars {
local $_ = shift;
die "Error: Zone ID \"$_\" contains non-invariant characters\n"
if (/[^A-Za-z0-9 \"%&\'()*+,-.\/:;<=>?_]/);
}
__END__

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,245 @@
######################################################################
# Copyright (C) 1999, International Business Machines
# Corporation and others. All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This package handles the parsing of time zone files.
# Author: Alan Liu
######################################################################
# Usage:
# Call ParseFile for each file to be imported. Then call Postprocess
# to remove unused rules and links.
# Module preamble: declare the TZ package and export the two entry
# points (ParseFile, Postprocess) used by the driver script.
package TZ;
use strict;
use Carp;
use vars qw(@ISA @EXPORT $VERSION $YEAR $STANDARD);
# dumpvar.pl: core debugging helper; presumably used to dump the parsed
# zone/rule structures during development -- confirm before removing.
require 'dumpvar.pl';
# NOTE(review): Exporter itself is not required here; presumably the
# driver script loads it before using this module -- confirm.
@ISA = qw(Exporter);
@EXPORT = qw(ParseFile
Postprocess
);
$VERSION = '0.1';
$STANDARD = '-'; # Name of the Standard Time rule
######################################################################
# Parse one zoneinfo source file (e.g. "northamerica") and merge its
# zones, rules, and links into the given hashes.  Lines that cannot
# be parsed are reported to STDERR and counted, but processing
# continues.
# Param: File name
# Param: Ref to hash of zones; filled in.  See the module header
#        comment for the layout of a zone entry.
# Param: Ref to hash of rules; filled in.  See the module header
#        comment for the layout of a rule entry.
# Param: Current year; only zone/rule data relevant to this year
#        is retained.
sub ParseFile {
    my ($FILE, $ZONES, $RULES, $YEAR) = @_;
    local(*FILE);
    open(FILE,"<$FILE") or confess "Can't open $FILE: $!";
    my $zone; # Current zone
    my $badLineCount = 0;
    while (<FILE>) {
        s/\#.*//;          # Strip comments
        next if (!/\S/);   # Skip blank lines
        #|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
        #|Zone America/Montreal -4:54:16 - LMT 1884
        #| -5:00 Mont E%sT
        #|Zone America/Thunder_Bay -5:57:00 - LMT 1895
        #| -5:00 Canada E%sT 1970
        #| -5:00 Mont E%sT 1973
        #| -5:00 - EST 1974
        #| -5:00 Canada E%sT
        my ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil);
        if (/^zone/i) {
            # Zone block start; the UNTIL field is optional, hence the
            # two alternative patterns.
            if (/^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/i
                || /^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)()/i) {
                $zone = $1;
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($2, $3, $4, $5);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^\s/ && $zone) {
            # Zone continuation; again the UNTIL field is optional.
            if (/^\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/
                || /^\s+(\S+)\s+(\S+)\s+(\S+)()/) {
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($1, $2, $3, $4);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^rule/i) {
            # Here is where we parse a single line of the rule table.
            # Our goal is to accept only rules applying to the current
            # year. This is normally a matter of accepting rules
            # that match the current year. However, in some cases this
            # is more complicated. For example:
            #|# Tonga
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule Tonga 1999 max - Oct Sat>=1 2:00s 1:00 S
            #|Rule Tonga 2000 max - Apr Sun>=16 2:00s 0 -
            # To handle this properly, we save every rule we encounter
            # (thus overwriting older ones with newer ones, since rules
            # are listed in order), and also use slot [2] to mark when
            # we see a current year rule. When that happens, we stop
            # saving rules. Thus we match the latest rule we see, or
            # a matching rule if we find one. The format of slot [2]
            # is just a 2 bit flag ([2]&1 means slot [0] matched,
            # [2]&2 means slot [1] matched).
            # Note that later, when the rules are post processed
            # (see Postprocess), the slot [2] will be overwritten
            # with the compressed rule string used to implement
            # equality testing.
            $zone = undef;  # A rule line terminates any open zone block
            # Rule
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule US 1918 1919 - Mar lastSun 2:00 1:00 W # War
            #|Rule US 1918 1919 - Oct lastSun 2:00 0 S
            #|Rule US 1942 only - Feb 9 2:00 1:00 W # War
            #|Rule US 1945 only - Sep 30 2:00 0 S
            #|Rule US 1967 max - Oct lastSun 2:00 0 S
            #|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
            #|Rule US 1974 only - Jan 6 2:00 1:00 D
            #|Rule US 1975 only - Feb 23 2:00 1:00 D
            #|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
            #|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
            if (/^rule\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+
                 (\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/xi) {
                my ($name, $from, $to, $type, $in, $on, $at, $save, $letter) =
                    ($1, $2, $3, $4, $5, $6, $7, $8, $9);
                # Slot [0] holds the onset rule (nonzero SAVE); slot [1]
                # holds the cease rule (SAVE of 0).
                my $i = $save ? 0:1;
                # Bit within slot [2] marking "slot $i matched the
                # current year".  Bug fix: the slot index itself was
                # formerly used as the mask, so slot [0] (mask would be
                # 0) could never be latched and current-year onset rules
                # were overwritten by later ones.
                my $mask = $save ? 1:2;
                if (!exists $RULES->{$name}) {
                    $RULES->{$name} = [];
                }
                my $ruleArray = $RULES->{$name};
                # Check our bit mask to see if we've already matched
                # a current rule. If so, do nothing. If not, then
                # save this rule line as the best one so far.
                if (@{$ruleArray} < 3 ||
                    !($ruleArray->[2] & $mask)) {
                    $ruleArray->[$i]->{from} = $from;
                    $ruleArray->[$i]->{to} = $to;
                    $ruleArray->[$i]->{type} = $type;
                    $ruleArray->[$i]->{in} = $in;
                    $ruleArray->[$i]->{on} = $on;
                    $ruleArray->[$i]->{at} = $at;
                    $ruleArray->[$i]->{save} = $save;
                    $ruleArray->[$i]->{letter} = $letter;
                    # Does this rule match the current year? If so,
                    # set the bit mask so we don't overwrite this rule.
                    # This makes us ignore rules for subsequent years
                    # that are already listed in the database -- as long
                    # as we have an overriding rule for the current year.
                    if (($from == $YEAR && $to =~ /only/i) ||
                        ($from <= $YEAR &&
                         (($to =~ /^\d/ && $YEAR <= $to) || $to =~ /max/i))) {
                        $ruleArray->[2] |= $mask;
                    }
                }
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^link/i) {
            #|# Old names, for S5 users
            #|
            #|# Link LINK-FROM LINK-TO
            #|Link America/New_York EST5EDT
            #|Link America/Chicago CST6CDT
            #|Link America/Denver MST7MDT
            #|Link America/Los_Angeles PST8PDT
            #|Link America/Indianapolis EST
            #|Link America/Phoenix MST
            #|Link Pacific/Honolulu HST
            if (/^link\s+(\S+)\s+(\S+)/i) {
                # We currently only record a single link -- if there
                # are more than one, we should modify this.
                my ($from, $to) = ($1, $2);
                $ZONES->{$from}->{link} = $to;
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } else {
            # Unexpected line
            print STDERR "Ignoring in $FILE: $_";
            ++$badLineCount;
        }
        # If this line carried zone data, record it -- but only when the
        # zone is still relevant: either it has no UNTIL year, or its
        # UNTIL year has not yet passed.
        if ($zoneRule &&
            ($zoneUntil !~ /\S/ || ($zoneUntil =~ /^\d/ &&
                                    $zoneUntil >= $YEAR))) {
            $ZONES->{$zone}->{gmtoff} = $zoneGmtoff;
            $ZONES->{$zone}->{rule} = $zoneRule;
            $ZONES->{$zone}->{format} = $zoneFormat;
            $ZONES->{$zone}->{until} = $zoneUntil;
        }
    }
    close(FILE);
}
######################################################################
# Clean up the raw parse results: discard links to zones that no
# longer exist, point zones at the standard rule when their named rule
# is gone, drop rules no zone references, and precompute each
# surviving rule's encoded comparison string (see the module header
# comment for the encoding format).
# Param: Ref to hash of zones
# Param: Ref to hash of rules
sub Postprocess {
    my ($ZONES, $RULES) = @_;
    my %activeRules;
    # Discard entries that consist solely of a link from a
    # historical/nonexistent zone (a link but no rule).
    foreach my $zoneName (keys %$ZONES) {
        if (exists $ZONES->{$zoneName}->{link} &&
            !exists $ZONES->{$zoneName}->{rule}) {
            delete $ZONES->{$zoneName};
        }
    }
    # Verify each zone's rule reference.  A zone naming a rule we no
    # longer have is assumed to follow standard time now.
    foreach my $zoneName (sort keys %$ZONES) {
        my $ruleName = $ZONES->{$zoneName}->{rule};
        next if ($ruleName eq $STANDARD);
        if (exists $RULES->{$ruleName}) {
            $activeRules{$ruleName} = 1;
        } else {
            $ZONES->{$zoneName}->{rule} = $STANDARD;
        }
    }
    # Drop unreferenced rules, complain about half-specified ones, and
    # build the encoded string used for rule equality testing.
    foreach my $ruleName (keys %$RULES) {
        if (!exists $activeRules{$ruleName}) {
            # Historical/unused rule
            delete $RULES->{$ruleName};
        } elsif (!$RULES->{$ruleName}->[0] || !$RULES->{$ruleName}->[1]) {
            print STDERR "Rule doesn't have both parts: $ruleName\n";
        } else {
            # Encode all the data that determines a rule's behavior so
            # two rules can be compared with a single string compare.
            # Format: "in,on,at,save;in,on,at" -- the cease {save} is
            # omitted since it is always zero.
            my ($onset, $cease) = @{$RULES->{$ruleName}}[0, 1];
            $RULES->{$ruleName}->[2] =
                lc(join(',', @{$onset}{qw(in on at save)}) . ";" .
                   join(',', @{$cease}{qw(in on at)}));
        }
    }
}

View file

@ -0,0 +1,197 @@
######################################################################
# Copyright (C) 1999, International Business Machines
# Corporation and others. All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This package contains utility functions for time zone data.
# Author: Alan Liu
######################################################################
# Zones - A time zone object is a hash with the following keys:
# {gmtoff} The offset from GMT, e.g. "-5:00"
# {rule} The name of the rule, e.g. "-", "Canada", "EU", "US"
# {format} The local abbreviation, e.g. "E%sT"
# {until} Data is good until this year, e.g., "2000". Often blank.
# These correspond to file entries:
#|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
#|Zone America/Montreal -4:54:16 - LMT 1884
#| -5:00 Mont E%sT
# Optionally, a zone may also have the key:
# {link} An old name for this zone, e.g. "HST" (for Pacific/Honolulu)
# Links come from the file entries:
#|# Link LINK-FROM LINK-TO
#|Link America/New_York EST5EDT
#|Link America/Chicago CST6CDT
# The name of the zone itself is not kept in the zone object.
# Instead, zones are kept in a big hash. The keys are the names; the
# values are references to the zone objects. The big hash of all
# zones is referred to in all caps: %ZONES ($ZONES if it's a
# reference).
# Example: $ZONES->{"America/Los_Angeles"} =
# 'format' => 'P%sT'
# 'gmtoff' => '-8:00'
# 'link' => 'US/Pacific-New'
# 'rule' => 'US'
# 'until' => ''
######################################################################
# Rules - A time zone rule is an array with the following elements:
# [0] Onset rule
# [1] Cease rule
# [2] Encoded string
# The onset rule and cease rule have the same format. They are each
# references to a hash with keys:
# {from} Start year
# {to} End year, or "only" or "max"
# {type} Unknown, usually "-"
# {in} Month, 3 letters
# {on} Day specifier, e.g. "lastSun", "Sun>=1", "23"
# {at} Time, e.g. "2:00", "1:00u"
# {save} Amount of savings, for the onset; 0 for the cease
# {letter} Guess: the letter that goes into %s in the zone {format}
# These correspond to the file entries thus:
#|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
#|Rule US 1942 only - Feb 9 2:00 1:00 W # War
#|Rule US 1945 only - Sep 30 2:00 0 S
#|Rule US 1967 max - Oct lastSun 2:00 0 S
#|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
#|Rule US 1974 only - Jan 6 2:00 1:00 D
#|Rule US 1975 only - Feb 23 2:00 1:00 D
#|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
#|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
# Entry [2], the encoded string, is used to see if two rules are the
# same. It consists of "[0]->{in},[0]->{on},[0]->{at},[0]->{save};
# [1]->{in},[1]->{on},[1]->{at}". Note that the separator between
# values is a comma, between onset and cease is a semicolon. Also
# note that the cease {save} is not used as this is always 0. The
# whole string is forced to lowercase.
# Rules don't contain their own name. Like zones, rules are kept in a
# big hash; the keys are the names, the values the references to the
# arrays. This hash of all rules is referred to in all caps, %RULES
# or for a reference, $RULES.
# Example: $RULES->{"US"} =
# 0 HASH(0x8fa03c)
# 'at' => '2:00'
# 'from' => 1987
# 'in' => 'Apr'
# 'letter' => 'D'
# 'on' => 'Sun>=1'
# 'save' => '1:00'
# 'to' => 'max'
# 'type' => '-'
# 1 HASH(0x8f9fc4)
# 'at' => '2:00'
# 'from' => 1967
# 'in' => 'Oct'
# 'letter' => 'S'
# 'on' => 'lastSun'
# 'save' => 0
# 'to' => 'max'
# 'type' => '-'
# 2 'apr,sun>=1,2:00,1:00;oct,lastsun,2:00'
# Module preamble: declare the TZ package and export the zone/rule
# equivalency helpers used by the driver script.
package TZ;
use strict;
use Carp;
use vars qw(@ISA @EXPORT $VERSION $STANDARD);
# dumpvar.pl: core debugging helper; presumably used to dump the
# zone/rule structures during development -- confirm before removing.
require 'dumpvar.pl';
# NOTE(review): Exporter itself is not required here; presumably the
# driver script loads it before using this module -- confirm.
@ISA = qw(Exporter);
@EXPORT = qw(ZoneEquals
RuleEquals
FormZoneEquivalencyGroups
);
$VERSION = '0.1';
$STANDARD = '-'; # Name of the Standard Time rule
######################################################################
# Decide whether two zones behave identically.  Zones naming the same
# rule are trivially equivalent; otherwise the rule bodies themselves
# are compared via RuleEquals.
# Param: zone object (hash ref)
# Param: zone object (hash ref)
# Param: ref to hash of all rules
# Return: true if two zones are equivalent
sub ZoneEquals {
    my ($zoneA, $zoneB, $RULES) = @_;
    my $ruleA = $zoneA->{rule};
    my $ruleB = $zoneB->{rule};
    return ($ruleA eq $ruleB) ||
        RuleEquals($RULES->{$ruleA}, $RULES->{$ruleB});
}
######################################################################
# Decide whether two rules behave identically by comparing their
# precomputed encoding strings (slot [2], built by Postprocess).
# Param: rule object (hash ref)
# Param: rule object (hash ref)
# Return: true if two rules are equivalent
sub RuleEquals {
    my ($ruleA, $ruleB) = @_;
    # defined() screens out undefined rules.  The only undefined rule
    # is $STANDARD; any others would be caused by Postprocess().
    my $bothDefined = defined($ruleA) && defined($ruleB);
    return $bothDefined && $ruleA->[2] eq $ruleB->[2];
    # A deeper equivalency analysis would recognize that, e.g.,
    # "Sun>=1" is the same day as "First Sun".  We don't do this yet.
}
######################################################################
# Partition all zones into equivalency groups: zones sharing the same
# GMT offset and equivalent rules (as judged by ZoneEquals /
# RuleEquals) land in the same group.  Each group is an array of one
# or more zone names; the output array receives one ref per group.
# Param: IN ref to hash of all zones
# Param: IN ref to hash of all rules
# Param: OUT ref to array to receive group refs
sub FormZoneEquivalencyGroups {
    my ($ZONES, $RULES, $EQUIV) = @_;
    # Bucket the zone names by GMT offset first.  This keeps the
    # pairwise comparison quadratic only within each bucket instead of
    # across all n zones -- a much smaller total amount of work.
    my %namesByOffset;
    foreach my $name (keys %$ZONES) {
        push @{$namesByOffset{$ZONES->{$name}->{gmtoff}}}, $name;
    }
    foreach my $offset (keys %namesByOffset) {
        # Build the groups for this offset
        # (array of refs to arrays of names).
        my @groups;
      CANDIDATE:
        foreach my $candidate (@{$namesByOffset{$offset}}) {
            # Compare against one representative (the first member) of
            # each existing group; join the first equivalent group.
            foreach my $group (@groups) {
                if (ZoneEquals($ZONES->{$candidate},
                               $ZONES->{$group->[0]}, $RULES)) {
                    push @$group, $candidate;
                    next CANDIDATE;
                }
            }
            # No equivalent group yet -- start a new one.
            push @groups, [ $candidate ];
        }
        push @$EQUIV, @groups;
    }
}