mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-65 update gentz for new binary format and alias table; make pm file names 8.3
X-SVN-Rev: 300
This commit is contained in:
parent
76bdfc909a
commit
aa7e2fc7b8
6 changed files with 1801 additions and 490 deletions
|
@ -36,9 +36,6 @@
|
|||
#define INPUT_FILE "tz.txt"
|
||||
#define OUTPUT_FILE "tz.dat"
|
||||
|
||||
#define DATA_NAME "tz"
|
||||
#define DATA_TYPE "dat"
|
||||
|
||||
#define DATA_COPYRIGHT \
|
||||
"Copyright (C) 1999, International Business Machines " \
|
||||
"Corporation and others. All Rights Reserved."
|
||||
|
@ -53,9 +50,9 @@ static const UDataInfo dataInfo = {
|
|||
sizeof(UChar),
|
||||
0,
|
||||
|
||||
'z', 'o', 'n', 'e', /* dataFormat */
|
||||
1, 0, 0, 0, /* formatVersion */
|
||||
1, 9, 9, 9 /* dataVersion */
|
||||
TZ_SIG[0], TZ_SIG[1], TZ_SIG[2], TZ_SIG[3], /* dataFormat */
|
||||
TZ_FORMAT_VERSION, 0, 0, 0, /* formatVersion */
|
||||
0, 0, 0, 0 /* dataVersion - will be filled in with year.suffix */
|
||||
};
|
||||
|
||||
|
||||
|
@ -70,6 +67,9 @@ class gentz {
|
|||
// larger is considered an error. Adjust as needed.
|
||||
enum { MAX_ZONES = 1000 };
|
||||
|
||||
// The largest maxNameLength we accept as sensible. Adjust as needed.
|
||||
enum { MAX_MAX_NAME_LENGTH = 100 };
|
||||
|
||||
// The maximum sensible GMT offset, in seconds
|
||||
static const int32_t MAX_GMT_OFFSET;
|
||||
|
||||
|
@ -87,15 +87,19 @@ class gentz {
|
|||
|
||||
enum { BUFLEN = 1024 };
|
||||
char buffer[BUFLEN];
|
||||
int32_t lineNumber;
|
||||
|
||||
TZHeader header;
|
||||
StandardZone* stdZones;
|
||||
DSTZone* dstZones;
|
||||
char* nameTable;
|
||||
int32_t* indexByName;
|
||||
OffsetIndex* indexByOffset;
|
||||
|
||||
int32_t zoneCount; // Total number of zones
|
||||
int32_t maxPerOffset; // Maximum number of zones per offset
|
||||
int32_t stdZoneSize;
|
||||
int32_t dstZoneSize;
|
||||
int32_t offsetIndexSize; // Total bytes in offset index table
|
||||
int32_t nameTableSize; // Total bytes in name table
|
||||
|
||||
bool_t useCopyright;
|
||||
|
@ -116,9 +120,13 @@ private:
|
|||
void parse1DSTZone(FileStream* in, DSTZone& zone);
|
||||
void parseDSTRule(char*& p, TZRule& rule);
|
||||
|
||||
int32_t* parseIndexTable(FileStream* in);
|
||||
OffsetIndex* parseOffsetIndexTable(FileStream* in);
|
||||
|
||||
char* parseNameTable(FileStream* in);
|
||||
|
||||
// Low level parsing and reading
|
||||
void readEndMarker(FileStream* in);
|
||||
int32_t readIntegerLine(FileStream* in, int32_t min, int32_t max);
|
||||
int32_t _parseInteger(char*& p);
|
||||
int32_t parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t);
|
||||
|
@ -134,7 +142,7 @@ int main(int argc, char *argv[]) {
|
|||
return x.main(argc, argv);
|
||||
}
|
||||
|
||||
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60;
|
||||
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60; // seconds
|
||||
const char gentz::COMMENT = '#';
|
||||
const char gentz::CR = ((char)13);
|
||||
const char gentz::LF = ((char)10);
|
||||
|
@ -148,19 +156,19 @@ const char* gentz::END_KEYWORD = "end";
|
|||
|
||||
void gentz::usage(const char* argv0) {
|
||||
fprintf(stderr,
|
||||
"Usage: %s [-c[+|-]] infile outfile\n"
|
||||
"Usage: %s [-c[+|-]] infile\n"
|
||||
" -c[+|-] [do|do not] include copyright (default=+)\n"
|
||||
" infile text file produced by tz.pl\n"
|
||||
" outfile binary file suitable for memory mapping\n",
|
||||
" infile text file produced by tz.pl\n",
|
||||
argv0);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int gentz::main(int argc, char *argv[]) {
|
||||
////////////////////////////////////////////////////////////
|
||||
// Parse arguments
|
||||
////////////////////////////////////////////////////////////
|
||||
useCopyright = TRUE;
|
||||
const char* infile = 0;
|
||||
const char* outfile = 0;
|
||||
for (int i=1; i<argc; ++i) {
|
||||
const char* arg = argv[i];
|
||||
if (arg[0] == '-') {
|
||||
|
@ -179,17 +187,20 @@ int gentz::main(int argc, char *argv[]) {
|
|||
}
|
||||
} else if (infile == 0) {
|
||||
infile = arg;
|
||||
} else if (outfile == 0) {
|
||||
outfile = arg;
|
||||
} else {
|
||||
usage(argv[0]);
|
||||
}
|
||||
}
|
||||
if (outfile == 0) {
|
||||
if (infile == 0) {
|
||||
usage(argv[0]);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Read the input file
|
||||
////////////////////////////////////////////////////////////
|
||||
*buffer = NUL;
|
||||
lineNumber = 0;
|
||||
fprintf(stdout, "Input file: %s\n", infile);
|
||||
FileStream* in = T_FileStream_open(infile, "r");
|
||||
if (in == 0) {
|
||||
die("Cannot open input file");
|
||||
|
@ -197,14 +208,13 @@ int gentz::main(int argc, char *argv[]) {
|
|||
parseTzTextFile(in);
|
||||
T_FileStream_close(in);
|
||||
*buffer = NUL;
|
||||
fprintf(stdout, "Input file %s, data version %u(%u)\n",
|
||||
infile, header.versionYear, header.versionSuffix);
|
||||
fprintf(stdout, "Read %ld standard zones, %ld dst zones, %ld zone names\n",
|
||||
header.standardCount, header.dstCount, zoneCount);
|
||||
|
||||
////////////////////////////////////////////////////////////
|
||||
// Write the output file
|
||||
////////////////////////////////////////////////////////////
|
||||
int32_t wlen = writeTzDatFile();
|
||||
fprintf(stdout, "Wrote to %s: %ld bytes\n",
|
||||
outfile, wlen);
|
||||
fprintf(stdout, "Output file: %s.%s, %ld bytes\n",
|
||||
TZ_DATA_NAME, TZ_DATA_TYPE, wlen);
|
||||
|
||||
return 0; // success
|
||||
}
|
||||
|
@ -213,15 +223,23 @@ int32_t gentz::writeTzDatFile() {
|
|||
UNewDataMemory *pdata;
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
pdata = udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
// Fill in dataInfo with year.suffix
|
||||
*(uint16_t*)&(dataInfo.dataVersion[0]) = header.versionYear;
|
||||
*(uint16_t*)&(dataInfo.dataVersion[2]) = header.versionSuffix;
|
||||
|
||||
pdata = udata_create(TZ_DATA_TYPE, TZ_DATA_NAME, &dataInfo,
|
||||
useCopyright ? DATA_COPYRIGHT : 0, &status);
|
||||
if (U_FAILURE(status)) {
|
||||
die("Unable to create data memory");
|
||||
}
|
||||
|
||||
// Careful: This order cannot be changed (without changing
|
||||
// the offset fixup code).
|
||||
udata_writeBlock(pdata, &header, sizeof(header));
|
||||
udata_writeBlock(pdata, stdZones, stdZoneSize);
|
||||
udata_writeBlock(pdata, dstZones, dstZoneSize);
|
||||
udata_writeBlock(pdata, indexByName, header.count * sizeof(indexByName[0]));
|
||||
udata_writeBlock(pdata, indexByOffset, offsetIndexSize);
|
||||
udata_writeBlock(pdata, nameTable, nameTableSize);
|
||||
|
||||
uint32_t dataLength = udata_finish(pdata, &status);
|
||||
|
@ -230,7 +248,10 @@ int32_t gentz::writeTzDatFile() {
|
|||
}
|
||||
|
||||
if (dataLength != (sizeof(header) + stdZoneSize +
|
||||
dstZoneSize + nameTableSize)) {
|
||||
dstZoneSize + nameTableSize +
|
||||
header.count * sizeof(indexByName[0]) +
|
||||
offsetIndexSize
|
||||
)) {
|
||||
die("Written file doesn't match expected size");
|
||||
}
|
||||
return dataLength;
|
||||
|
@ -240,37 +261,139 @@ void gentz::parseTzTextFile(FileStream* in) {
|
|||
parseHeader(in);
|
||||
stdZones = parseStandardZones(in);
|
||||
dstZones = parseDSTZones(in);
|
||||
if (zoneCount != (int32_t)(header.standardCount + header.dstCount)) {
|
||||
if (header.count != (header.standardCount + header.dstCount)) {
|
||||
die("Zone counts don't add up");
|
||||
}
|
||||
nameTable = parseNameTable(in);
|
||||
|
||||
// Fixup the header offsets
|
||||
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
|
||||
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
|
||||
header.standardDelta = sizeof(header);
|
||||
header.dstDelta = header.standardDelta + stdZoneSize;
|
||||
header.nameIndexDelta = header.dstDelta + dstZoneSize;
|
||||
|
||||
header.standardOffset = sizeof(header);
|
||||
header.dstOffset = header.standardOffset + stdZoneSize;
|
||||
header.nameTableOffset = header.dstOffset + dstZoneSize;
|
||||
// Read in index tables after header is mostly fixed up
|
||||
indexByName = parseIndexTable(in);
|
||||
indexByOffset = parseOffsetIndexTable(in);
|
||||
|
||||
if (header.standardOffset < 0 ||
|
||||
header.dstOffset < 0 ||
|
||||
header.nameTableOffset < 0) {
|
||||
header.offsetIndexDelta = header.nameIndexDelta + header.count *
|
||||
sizeof(indexByName[0]);
|
||||
header.nameTableDelta = header.offsetIndexDelta + offsetIndexSize;
|
||||
|
||||
if (header.standardDelta < 0 ||
|
||||
header.dstDelta < 0 ||
|
||||
header.nameTableDelta < 0) {
|
||||
die("Negative offset in header after fixup");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Index tables are lists of specifiers of the form /[sd]\d+/, where
|
||||
* the first character determines if it is a standard or DST zone,
|
||||
* and the following number is in the range 0..n-1, where n is the
|
||||
* count of that type of zone.
|
||||
*
|
||||
* Header must already be read in and the offsets must be fixed up.
|
||||
* Standard and DST zones must be read in.
|
||||
*/
|
||||
int32_t* gentz::parseIndexTable(FileStream* in) {
|
||||
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
|
||||
if (n != header.count) {
|
||||
die("Count mismatch in index table");
|
||||
}
|
||||
int32_t* result = new int32_t[n];
|
||||
for (uint32_t i=0; i<n; ++i) {
|
||||
readLine(in);
|
||||
char* p = buffer+1;
|
||||
uint32_t index = parseInteger(p, NUL, 0, header.count);
|
||||
switch (buffer[0]) {
|
||||
case 's':
|
||||
if (index >= header.standardCount) {
|
||||
die("Standard index entry out of range");
|
||||
}
|
||||
result[i] = header.standardDelta +
|
||||
((char*)&stdZones[index] - (char*)&stdZones[0]);
|
||||
break;
|
||||
case 'd':
|
||||
if (index >= header.dstCount) {
|
||||
die("DST index entry out of range");
|
||||
}
|
||||
result[i] = header.dstDelta +
|
||||
((char*)&dstZones[index] - (char*)&dstZones[0]);
|
||||
break;
|
||||
default:
|
||||
die("Malformed index entry");
|
||||
break;
|
||||
}
|
||||
}
|
||||
readEndMarker(in);
|
||||
fprintf(stdout, " Read %lu name index table entries, in-memory size %ld bytes\n",
|
||||
n, n * sizeof(int32_t));
|
||||
return result;
|
||||
}
|
||||
|
||||
OffsetIndex* gentz::parseOffsetIndexTable(FileStream* in) {
|
||||
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
|
||||
|
||||
// We don't know how big the whole thing will be yet, but we can use
|
||||
// the maxPerOffset number to compute an upper limit.
|
||||
//
|
||||
// Structs will not be 4-aligned because we'll be writing them out
|
||||
// ourselves. Don't try to compute the exact size in advance
|
||||
// (unless we want to avoid the use of sizeof(), which may
|
||||
// introduce padding that we won't actually employ).
|
||||
int32_t maxPossibleSize = n * (sizeof(OffsetIndex) +
|
||||
(maxPerOffset-1) * sizeof(uint16_t));
|
||||
|
||||
int8_t *result = new int8_t[maxPossibleSize];
|
||||
if (result == 0) {
|
||||
die("Out of memory");
|
||||
}
|
||||
|
||||
// Read each line and construct the corresponding entry
|
||||
OffsetIndex* index = (OffsetIndex*)result;
|
||||
for (uint32_t i=0; i<n; ++i) {
|
||||
readLine(in);
|
||||
char* p = buffer;
|
||||
index->gmtOffset = 1000 * // Convert s -> ms
|
||||
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
|
||||
index->count = (uint16_t)parseInteger(p, SEP, 1, maxPerOffset);
|
||||
uint16_t* zoneNumberArray = &(index->zoneNumber);
|
||||
for (uint16_t j=0; j<index->count; ++j) {
|
||||
zoneNumberArray[j] = (uint16_t)
|
||||
parseInteger(p, (j==(index->count-1))?NUL:SEP,
|
||||
0, header.count-1);
|
||||
}
|
||||
int8_t* nextIndex = (int8_t*)&(zoneNumberArray[index->count]);
|
||||
index->nextEntryDelta = (i==(n-1)) ? 0 : (nextIndex - (int8_t*)index);
|
||||
index = (OffsetIndex*)nextIndex;
|
||||
}
|
||||
offsetIndexSize = (int8_t*)index - (int8_t*)result;
|
||||
if (offsetIndexSize > maxPossibleSize) {
|
||||
die("Yikes! Interal error while constructing offset index table");
|
||||
}
|
||||
readEndMarker(in);
|
||||
fprintf(stdout, " Read %lu offset index table entries, in-memory size %ld bytes\n",
|
||||
n, offsetIndexSize);
|
||||
return (OffsetIndex*)result;
|
||||
}
|
||||
|
||||
void gentz::parseHeader(FileStream* in) {
|
||||
int32_t ignored;
|
||||
|
||||
// Version string, e.g., "1999j" -> (1999<<16) | 10
|
||||
header.versionYear = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
|
||||
header.versionYear = (uint16_t) readIntegerLine(in, 1990, 0xFFFF);
|
||||
header.versionSuffix = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
|
||||
|
||||
// Zone count
|
||||
zoneCount = readIntegerLine(in, 0, MAX_ZONES);
|
||||
header.count = readIntegerLine(in, 1, MAX_ZONES);
|
||||
maxPerOffset = readIntegerLine(in, 1, MAX_ZONES);
|
||||
/*header.maxNameLength*/ ignored = readIntegerLine(in, 1, MAX_MAX_NAME_LENGTH);
|
||||
|
||||
// Size of name table in bytes
|
||||
// (0x00FFFFFF is an arbitrary upper limit; adjust as needed.)
|
||||
nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF);
|
||||
|
||||
fprintf(stdout, " Read header, data version %u(%u), in-memory size %ld bytes\n",
|
||||
header.versionYear, header.versionSuffix, sizeof(header));
|
||||
}
|
||||
|
||||
StandardZone* gentz::parseStandardZones(FileStream* in) {
|
||||
|
@ -282,18 +405,19 @@ StandardZone* gentz::parseStandardZones(FileStream* in) {
|
|||
for (uint32_t i=0; i<header.standardCount; i++) {
|
||||
parse1StandardZone(in, zones[i]);
|
||||
}
|
||||
readLine(in);
|
||||
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
|
||||
die("Keyword 'end' missing");
|
||||
}
|
||||
readEndMarker(in);
|
||||
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
|
||||
fprintf(stdout, " Read %lu standard zones, in-memory size %ld bytes\n",
|
||||
header.standardCount, stdZoneSize);
|
||||
return zones;
|
||||
}
|
||||
|
||||
void gentz::parse1StandardZone(FileStream* in, StandardZone& zone) {
|
||||
readLine(in);
|
||||
char* p = buffer;
|
||||
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
|
||||
zone.gmtOffset = parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
|
||||
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
|
||||
zone.gmtOffset = 1000 * // Convert s -> ms
|
||||
parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
|
||||
}
|
||||
|
||||
DSTZone* gentz::parseDSTZones(FileStream* in) {
|
||||
|
@ -305,18 +429,19 @@ DSTZone* gentz::parseDSTZones(FileStream* in) {
|
|||
for (uint32_t i=0; i<header.dstCount; i++) {
|
||||
parse1DSTZone(in, zones[i]);
|
||||
}
|
||||
readLine(in);
|
||||
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
|
||||
die("Keyword 'end' missing");
|
||||
}
|
||||
readEndMarker(in);
|
||||
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
|
||||
fprintf(stdout, " Read %lu DST zones, in-memory size %ld bytes\n",
|
||||
header.dstCount, dstZoneSize);
|
||||
return zones;
|
||||
}
|
||||
|
||||
void gentz::parse1DSTZone(FileStream* in, DSTZone& zone) {
|
||||
readLine(in);
|
||||
char* p = buffer;
|
||||
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
|
||||
zone.gmtOffset = parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
|
||||
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
|
||||
zone.gmtOffset = 1000 * // Convert s -> ms
|
||||
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
|
||||
parseDSTRule(p, zone.onsetRule);
|
||||
parseDSTRule(p, zone.ceaseRule);
|
||||
zone.dstSavings = (uint16_t) parseInteger(p, NUL, 0, 12*60);
|
||||
|
@ -349,7 +474,7 @@ void gentz::parseDSTRule(char*& p, TZRule& rule) {
|
|||
|
||||
char* gentz::parseNameTable(FileStream* in) {
|
||||
int32_t n = readIntegerLine(in, 1, MAX_ZONES);
|
||||
if (n != zoneCount) {
|
||||
if (n != (int32_t)header.count) {
|
||||
die("Zone count doesn't match name table count");
|
||||
}
|
||||
char* names = new char[nameTableSize];
|
||||
|
@ -371,9 +496,21 @@ char* gentz::parseNameTable(FileStream* in) {
|
|||
if (p != limit) {
|
||||
die("Name table shorter than declared size");
|
||||
}
|
||||
readEndMarker(in);
|
||||
fprintf(stdout, " Read %ld names, in-memory size %ld bytes\n", n, nameTableSize);
|
||||
return names;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read the end marker (terminates each list).
|
||||
*/
|
||||
void gentz::readEndMarker(FileStream* in) {
|
||||
readLine(in);
|
||||
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
|
||||
die("Keyword 'end' missing");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read a line from the FileStream and parse it as an
|
||||
* integer. There should be nothing else on the line.
|
||||
|
@ -432,12 +569,13 @@ int32_t gentz::parseInteger(char*& p, char nextExpectedChar,
|
|||
void gentz::die(const char* msg) {
|
||||
fprintf(stderr, "ERROR, %s\n", msg);
|
||||
if (*buffer) {
|
||||
fprintf(stderr, "Current input line: %s\n", buffer);
|
||||
fprintf(stderr, "Input file line %ld: \"%s\"\n", lineNumber, buffer);
|
||||
}
|
||||
exit(1);
|
||||
}
|
||||
|
||||
int32_t gentz::readLine(FileStream* in) {
|
||||
++lineNumber;
|
||||
T_FileStream_readLine(in, buffer, BUFLEN);
|
||||
// Trim off trailing comment
|
||||
char* p = icu_strchr(buffer, COMMENT);
|
||||
|
|
|
@ -1,11 +1,18 @@
|
|||
Readme file for ICU time zone data (source/tools/gentz)
|
||||
|
||||
The time zone data in ICU is taken from the UNIX data files at
|
||||
ftp://elsie.nci.nih.gov/pub/tzdata<year>.
|
||||
|
||||
RAW DATA
|
||||
--------
|
||||
The time zone data in ICU is taken from the UNIX data files at
|
||||
ftp://elsie.nci.nih.gov/pub/tzdata<year>. The other input to the
|
||||
process is an alias table, described below.
|
||||
|
||||
|
||||
BUILD PROCESS
|
||||
-------------
|
||||
Two tools are used to process the data into a format suitable for ICU:
|
||||
|
||||
tz.pl directory of raw data files -> tz.txt
|
||||
tz.pl directory of raw data files + tz.alias -> tz.txt
|
||||
gentz tz.txt -> tz.dat (memory mappable binary file)
|
||||
|
||||
After gentz is run, standard ICU data tools are used to incorporate
|
||||
|
@ -34,4 +41,35 @@ The tz.txt file is typically checked into CVS, whereas the raw data
|
|||
files are not, since they are readily available from the URL listed
|
||||
above.
|
||||
|
||||
|
||||
ALIAS TABLE
|
||||
-----------
|
||||
For backward compatibility, we define several three-letter IDs that
|
||||
have been used since early ICU and correspond to IDs used in old JDKs.
|
||||
These IDs are listed in tz.alias. The tz.pl script processes this
|
||||
alias table and issues errors if there are problems.
|
||||
|
||||
|
||||
IDS
|
||||
---
|
||||
All *system* zone IDs must consist only of characters in the invariant
|
||||
set. See utypes.h for an explanation of what this means. If an ID is
|
||||
encountered that contains a non-invariant character, tz.pl complains.
|
||||
Non-system zones may try to use non-invariant characters, but they
|
||||
shouldn't because of possible collisions with system IDs when the
|
||||
invariant char converter is used (see TimeZone class for details).
|
||||
|
||||
|
||||
Etc/GMT...
|
||||
----------
|
||||
Users may be confused by the fact that various zones with names of the
|
||||
form Etc/GMT+n appear to have an offset of the wrong sign. For
|
||||
example, Etc/GMT+8 is 8 hours *behind* GMT; that is, it corresponds to
|
||||
what one typically sees displayed as "GMT-8:00". The reason for this
|
||||
inversion is explained in the UNIX zone data file "etcetera".
|
||||
Briefly, this is done intentionally in order to comply with
|
||||
POSIX-style signedness. In ICU we reproduce the UNIX zone behavior
|
||||
faithfully, including this confusing aspect.
|
||||
|
||||
|
||||
Alan Liu 1999
|
||||
|
|
|
@ -12,6 +12,9 @@
|
|||
# parse out the current zones and create a listing of current zones.
|
||||
# Author: Alan Liu
|
||||
######################################################################
|
||||
# This script reads an alias table, $TZ_ALIAS, and creates clones of
|
||||
# standard UNIX zones with alias names.
|
||||
######################################################################
|
||||
# To update the zone data, download the latest data from the NIH URL
|
||||
# listed above into a directory. Run this script with the directory
|
||||
# name as an argument. THE DIRECTORY NAME MUST END IN tzdataYYYYR.
|
||||
|
@ -27,31 +30,36 @@
|
|||
# - Lines may be followed by a comment; the parser must ignore
|
||||
# anything of the form /\s+#.*$/ in each line.
|
||||
# |3065,14400 # Asia/Dubai GMT+4:00
|
||||
# - The file contains a header and 3 lists.
|
||||
# - The file contains a header and 5 lists.
|
||||
# - The header contains the version of the unix data, the total
|
||||
# zone count, and the length of the name table in bytes.
|
||||
# |1999 # (tzdata1999j) ftp://elsie.nci.nih.gov data version YEAR
|
||||
# |10 # (tzdata1999j) ftp://elsie.nci.nih.gov data version SUFFIX
|
||||
# |387 # total zone count
|
||||
# |5906 # length of name table in bytes
|
||||
# zone count, the maximum number of zones sharing the same value
|
||||
# of gmtOffset, the length of the name table in bytes, and
|
||||
# the length of the longest name (not including the terminating
|
||||
# zero byte).
|
||||
# | 1999 # (tzdata1999j) version of Olson zone
|
||||
# | 10 # data from ftp://elsie.nci.nih.gov
|
||||
# | 387 # total zone count
|
||||
# | 40 # max count of zones with same gmtOffset
|
||||
# | 25 # max name length not incl final zero
|
||||
# | 5906 # length of name table in bytes
|
||||
# - Lists start with a count of the records to follow, the records
|
||||
# themselves (one per line), and a single line with the keyword
|
||||
# 'end'.
|
||||
# - The first list is the list of standard zones:
|
||||
# |208 # count of standard zones to follow
|
||||
# |0,0 # Africa/Abidjan GMT+0:00
|
||||
# |28,10800 # Africa/Addis_Ababa GMT+3:00
|
||||
# | 208 # count of standard zones to follow
|
||||
# | 0,0 # Africa/Abidjan GMT+0:00
|
||||
# | 28,10800 # Africa/Addis_Ababa GMT+3:00
|
||||
# ...
|
||||
# |end
|
||||
# | end
|
||||
# Each standard zone record contains two integers. The first
|
||||
# is a byte offset into the name table for the name of the zone.
|
||||
# The second integer is the GMT offset in SECONDS for this zone.
|
||||
# - The second list is the list of DST zones:
|
||||
# |179 # count of dst zones to follow
|
||||
# |15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
|
||||
# |184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
|
||||
# | 179 # count of dst zones to follow
|
||||
# | 15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
|
||||
# | 184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
|
||||
# ...
|
||||
# |end
|
||||
# | end
|
||||
# Each record starts with the same two integers as a standard
|
||||
# zone record. Following this are data for the onset rule and
|
||||
# the cease rule. Each rule is described by the following integers:
|
||||
|
@ -63,13 +71,38 @@
|
|||
# The last integer in the record is the DST savings in MINUTES,
|
||||
# typically 60.
|
||||
# - The third list is the name table:
|
||||
# |387 # count of names to follow
|
||||
# |Africa/Abidjan
|
||||
# |Africa/Accra
|
||||
# | 387 # count of names to follow
|
||||
# | Africa/Abidjan
|
||||
# | Africa/Accra
|
||||
# ...
|
||||
# |end
|
||||
# | end
|
||||
# Each name is terminated by a newline (like all lines in the file).
|
||||
# The offsets in the first two lists refer to this table.
|
||||
# - The fourth list is an index list by name. The index entries
|
||||
# themselves are of the form /[sd]\d+/, where the first character
|
||||
# indicates standard or DST, and the number that follows indexes
|
||||
# into the correpsonding array.
|
||||
# | 416 # count of name index table entries to follow
|
||||
# | d0 # ACT
|
||||
# | d1 # AET
|
||||
# | d2 # AGT
|
||||
# | d3 # ART
|
||||
# | d4 # AST
|
||||
# | s0 # Africa/Abidjan
|
||||
# ...
|
||||
# | end
|
||||
# - The fifth list is an index by GMT offset. Each line lists the
|
||||
# zones with the same offset. The first number on the line
|
||||
# is the GMT offset in seconds. The second number is the count
|
||||
# of zone numbers to follow. Each zone number is an integer from
|
||||
# 0..n-1, where n is the total number of zones. The zone numbers
|
||||
# refer to the zone list in alphabetical order.
|
||||
# | 39 # index by offset entries to follow
|
||||
# | -43200,1,280
|
||||
# | -39600,6,279,365,373,393,395,398
|
||||
# | -36000,8,57,278,349,379,386,387,403,405
|
||||
# ...
|
||||
# | end
|
||||
######################################################################
|
||||
# As of 1999j, here are the various possible values taken by the
|
||||
# rule fields. See code below that generates this data.
|
||||
|
@ -88,10 +121,32 @@ require 5; # Minimum version of perl needed
|
|||
use strict;
|
||||
use Getopt::Long;
|
||||
use vars qw(@FILES $YEAR $DATA_DIR $OUT $SEP @MONTH
|
||||
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION);
|
||||
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION $TZ_ALIAS);
|
||||
require 'dumpvar.pl';
|
||||
use TZFileParser;
|
||||
use TZUtility;
|
||||
use tzparse;
|
||||
use tzutil;
|
||||
|
||||
# File names
|
||||
$OUT = 'tz.txt';
|
||||
$TZ_ALIAS = 'tz.alias';
|
||||
|
||||
# Separator between fields in the output file
|
||||
$SEP = ','; # Don't use ':'!
|
||||
|
||||
@FILES = qw(africa
|
||||
antarctica
|
||||
asia
|
||||
australasia
|
||||
backward
|
||||
etcetera
|
||||
europe
|
||||
factory
|
||||
northamerica
|
||||
pacificnew
|
||||
solar87
|
||||
solar88
|
||||
solar89
|
||||
southamerica);
|
||||
|
||||
# We get the current year from the system here. Later
|
||||
# we double check this against the zone data version.
|
||||
|
@ -127,26 +182,6 @@ if ($DATA_DIR =~ /(tzdata(\d{4})(\w?))/) {
|
|||
usage();
|
||||
}
|
||||
|
||||
# Output file name
|
||||
$OUT = 'tz.txt';
|
||||
|
||||
# Separator between fields in the output file
|
||||
$SEP = ','; # Don't use ':'!
|
||||
|
||||
@FILES = qw(africa
|
||||
antarctica
|
||||
asia
|
||||
australasia
|
||||
backward
|
||||
etcetera
|
||||
europe
|
||||
factory
|
||||
northamerica
|
||||
pacificnew
|
||||
solar87
|
||||
solar88
|
||||
solar89
|
||||
southamerica);
|
||||
|
||||
@MONTH = qw(jan feb mar apr may jun
|
||||
jul aug sep oct nov dec);
|
||||
|
@ -181,6 +216,8 @@ sub main {
|
|||
|
||||
TZ::Postprocess(\%ZONES, \%RULES);
|
||||
|
||||
incorporateAliases($TZ_ALIAS, \%ZONES);
|
||||
|
||||
print
|
||||
"Read ", scalar keys %ZONES, " current zones and ",
|
||||
scalar keys %RULES, " rules for $YEAR\n";
|
||||
|
@ -207,10 +244,16 @@ sub main {
|
|||
my %NAME_OFFSET;
|
||||
my $STD_COUNT = 0; # Count of standard zones
|
||||
my $DST_COUNT = 0; # Count of DST zones
|
||||
my $maxNameLen = 0;
|
||||
foreach my $z (sort keys %ZONES) {
|
||||
# Make sure zone IDs only contain invariant chars
|
||||
assertInvariantChars($z);
|
||||
|
||||
my $len = length($z);
|
||||
$NAME_OFFSET{$z} = $offset;
|
||||
$offset += length($z) + 1;
|
||||
$offset += $len + 1;
|
||||
$NAME_LIST .= "$z\n";
|
||||
$maxNameLen = $len if ($len > $maxNameLen);
|
||||
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
|
||||
$STD_COUNT++;
|
||||
} else {
|
||||
|
@ -218,14 +261,35 @@ sub main {
|
|||
}
|
||||
}
|
||||
my $NAME_SIZE = $offset;
|
||||
|
||||
# Find the maximum number of zones with the same value of
|
||||
# gmtOffset.
|
||||
my %perOffset; # Hash of offset -> count
|
||||
foreach my $z (keys %ZONES) {
|
||||
# Use parseOffset to normalize values - probably unnecessary
|
||||
++$perOffset{parseOffset($ZONES{$z}->{gmtoff})};
|
||||
}
|
||||
my $maxPerOffset = 0;
|
||||
foreach (values %perOffset) {
|
||||
$maxPerOffset = $_ if ($_ > $maxPerOffset);
|
||||
}
|
||||
|
||||
open(OUT,">$OUT") or die "Can't open $OUT for writing: $!";
|
||||
|
||||
############################################################
|
||||
# EMIT HEADER
|
||||
############################################################
|
||||
# Zone data version
|
||||
print OUT $VERSION_YEAR, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version YEAR\n";
|
||||
print OUT $VERSION_SUFFIX, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version SUFFIX\n";
|
||||
print OUT $VERSION_YEAR, " # ($RAW_VERSION) version of Olson zone\n";
|
||||
print OUT $VERSION_SUFFIX, " # data from ftp://elsie.nci.nih.gov\n";
|
||||
print OUT scalar keys %ZONES, " # total zone count\n";
|
||||
print OUT $maxPerOffset, " # max count of zones with same gmtOffset\n";
|
||||
print OUT $maxNameLen, " # max name length not incl final zero\n";
|
||||
print OUT $NAME_SIZE, " # length of name table in bytes\n";
|
||||
|
||||
############################################################
|
||||
# EMIT ZONE TABLES
|
||||
############################################################
|
||||
# Output first the standard zones, then the dst zones.
|
||||
# Precede each list with the count of zones to follow,
|
||||
# and follow it with the keyword 'end'.
|
||||
|
@ -241,13 +305,73 @@ sub main {
|
|||
print OUT "end\n"; # 'end' keyword for error checking
|
||||
}
|
||||
|
||||
############################################################
|
||||
# EMIT NAME TABLE
|
||||
############################################################
|
||||
# Output the name table, followed by 'end' keyword
|
||||
print OUT scalar keys %ZONES, " # count of names to follow\n";
|
||||
print OUT $NAME_LIST, "end\n";
|
||||
|
||||
############################################################
|
||||
# EMIT INDEX BY NAME
|
||||
############################################################
|
||||
# Output the name index table. Since we don't know structure
|
||||
# sizes, we output the index number of each zone. For example,
|
||||
# "s0" is the first standard zone, "s1" is the second, etc.
|
||||
# Likewise, "d0" is the first DST zone, "d1" is the second, etc.
|
||||
|
||||
# First compute index IDs, as described above.
|
||||
my %indexID;
|
||||
my $s = 0;
|
||||
my $d = 0;
|
||||
foreach my $z (sort keys %ZONES) {
|
||||
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
|
||||
$indexID{$z} = "s$s";
|
||||
$s++;
|
||||
} else {
|
||||
$indexID{$z} = "d$d";
|
||||
$d++;
|
||||
}
|
||||
}
|
||||
|
||||
# Now emit table sorted by name
|
||||
print OUT scalar keys %ZONES, " # count of name index table entries to follow\n";
|
||||
foreach my $z (sort keys %ZONES) {
|
||||
print OUT $indexID{$z}, " # $z\n";
|
||||
}
|
||||
print OUT "end\n";
|
||||
|
||||
############################################################
|
||||
# EMIT INDEX BY GMT OFFSET
|
||||
############################################################
|
||||
# Create a hash mapping zone name -> integer, from 0..n-1.
|
||||
my %zoneNumber;
|
||||
my $i = 0;
|
||||
foreach (sort keys %ZONES) { $zoneNumber{$_} = $i++; }
|
||||
|
||||
# Create a hash by index. The hash has offset integers as keys
|
||||
# and arrays of index numbers as values.
|
||||
my %offsetMap;
|
||||
foreach (sort keys %ZONES) {
|
||||
my $offset = parseOffset($ZONES{$_}->{gmtoff});
|
||||
push @{$offsetMap{$offset}}, $zoneNumber{$_};
|
||||
}
|
||||
|
||||
# Emit it
|
||||
print OUT scalar keys %offsetMap, " # index by offset entries to follow\n";
|
||||
foreach (sort {$a <=> $b} keys %offsetMap) {
|
||||
my $aref = $offsetMap{$_};
|
||||
print OUT $_, ",", scalar @{$aref}, ",", join(",", @{$aref}), "\n";
|
||||
}
|
||||
print OUT "end\n";
|
||||
|
||||
############################################################
|
||||
# END
|
||||
############################################################
|
||||
close(OUT);
|
||||
print "$OUT written.\n";
|
||||
|
||||
|
||||
if (0) {
|
||||
TZ::FormZoneEquivalencyGroups(\%ZONES, \%RULES, \@EQUIV);
|
||||
print
|
||||
|
@ -281,6 +405,46 @@ sub main {
|
|||
}
|
||||
}
|
||||
|
||||
# Read the alias list and create clones with alias names. This
|
||||
# sub should be called AFTER all standard zones have been read in.
|
||||
# Param: File name of alias list
|
||||
# Param: Ref to zone hash
|
||||
sub incorporateAliases {
    my ($aliasFile, $zones) = @_;
    my $count = 0;
    local *IN;
    open(IN,$aliasFile) or die "Can't open $aliasFile: $!";
    while (<IN>) {
        s/\#.*//;          # Strip trailing comments
        next unless /\S/;  # Ignore blank lines
        /^\s*(\S+)\s+(\S+)\s*$/
            or die "Bad line in alias table $aliasFile: $_\n";
        my ($alias, $original) = ($1, $2);
        # An alias must not shadow a zone that already exists.
        die "Bad alias in $aliasFile: $alias is a standard UNIX zone. " .
            "Please remove $alias from the alias table.\n"
            if exists $zones->{$alias};
        # The target of the alias must be a known zone.
        die "Bad alias in $aliasFile: $alias maps to the nonexistent " .
            "zone $original. Please fix this entry in the alias table.\n"
            unless exists $zones->{$original};
        # We hardcode the GMT zone in the TimeZone class; don't include
        # it in the tz.txt file.
        die "Bad alias in $aliasFile: GMT is a hardcoded system zone. " .
            "Please remove it from the alias table.\n"
            if $alias eq "GMT";
        # Create the alias!  Both names refer to the same zone object.
        $zones->{$alias} = $zones->{$original};
        $count++;
    }
    print "Incorporated $count aliases from $aliasFile\n";
    close(IN);
}
|
||||
|
||||
# Format a time zone as a machine-readable line of text. Another
|
||||
# tool will read this line to construct a binary data structure
|
||||
# representing this zone.
|
||||
|
@ -481,4 +645,14 @@ sub parseDaySpecifier {
|
|||
( $dowim, $dow );
|
||||
}
|
||||
|
||||
# Confirm that the given ID contains only invariant characters.
|
||||
# See utypes.h for an explanation.
|
||||
# Param: string to be checked
|
||||
sub assertInvariantChars {
    my $id = shift;
    # Die if the string contains any character outside the invariant
    # set (letters, digits, space, and a fixed list of punctuation).
    if ($id =~ /[^A-Za-z0-9 \"%&\'()*+,-.\/:;<=>?_]/) {
        die "Error: Zone ID \"$id\" contains non-invariant characters\n";
    }
}
|
||||
|
||||
__END__
|
||||
|
|
File diff suppressed because it is too large
Load diff
245
icu4c/source/tools/gentz/tzparse.pm
Normal file
245
icu4c/source/tools/gentz/tzparse.pm
Normal file
|
@ -0,0 +1,245 @@
|
|||
######################################################################
|
||||
# Copyright (C) 1999, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
######################################################################
|
||||
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
|
||||
# where <year> is "1999b" or a similar string.
|
||||
######################################################################
|
||||
# This package handles the parsing of time zone files.
|
||||
# Author: Alan Liu
|
||||
######################################################################
|
||||
# Usage:
|
||||
# Call ParseFile for each file to be imported. Then call Postprocess
|
||||
# to remove unused rules and links.
|
||||
|
||||
package TZ;
|
||||
use strict;
|
||||
use Carp;
|
||||
use vars qw(@ISA @EXPORT $VERSION $YEAR $STANDARD);
|
||||
require 'dumpvar.pl';
|
||||
|
||||
@ISA = qw(Exporter);
|
||||
@EXPORT = qw(ParseFile
|
||||
Postprocess
|
||||
);
|
||||
$VERSION = '0.1';
|
||||
|
||||
$STANDARD = '-'; # Name of the Standard Time rule
|
||||
|
||||
######################################################################
|
||||
# Param: File name
|
||||
# Param: Ref to hash of zones
|
||||
# Param: Ref to hash of rules
|
||||
# Param: Current year
|
||||
sub ParseFile {
    my ($FILE, $ZONES, $RULES, $YEAR) = @_;

    local(*FILE);
    open(FILE,"<$FILE") or confess "Can't open $FILE: $!";
    my $zone; # Name of the current Zone block, while inside one
    my $badLineCount = 0;
    while (<FILE>) {
        s/\#.*//;        # Strip comments
        next if (!/\S/); # Skip blank lines
        #|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
        #|Zone America/Montreal -4:54:16 - LMT 1884
        #| -5:00 Mont E%sT
        #|Zone America/Thunder_Bay -5:57:00 - LMT 1895
        #| -5:00 Canada E%sT 1970
        #| -5:00 Mont E%sT 1973
        #| -5:00 - EST 1974
        #| -5:00 Canada E%sT
        my ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil);
        if (/^zone/i) {
            # Zone block start; with or without the optional UNTIL field
            if (/^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/i
                || /^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)()/i) {
                $zone = $1;
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($2, $3, $4, $5);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^\s/ && $zone) {
            # Zone continuation (leading whitespace, no "Zone" keyword)
            if (/^\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/
                || /^\s+(\S+)\s+(\S+)\s+(\S+)()/) {
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($1, $2, $3, $4);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^rule/i) {
            # Here is where we parse a single line of the rule table.
            # Our goal is to accept only rules applying to the current
            # year.  This is normally a matter of accepting rules
            # that match the current year.  However, in some cases this
            # is more complicated.  For example:
            #|# Tonga
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule Tonga 1999 max - Oct Sat>=1 2:00s 1:00 S
            #|Rule Tonga 2000 max - Apr Sun>=16 2:00s 0 -
            # To handle this properly, we save every rule we encounter
            # (thus overwriting older ones with newer ones, since rules
            # are listed in order), and also use slot [2] to mark when
            # we see a current year rule.  When that happens, we stop
            # saving rules.  Thus we match the latest rule we see, or
            # a matching rule if we find one.  The format of slot [2]
            # is just a 2 bit flag ([2]&1 means slot [0] matched,
            # [2]&2 means slot [1] matched).

            # Note that later, when the rules are post processed
            # (see Postprocess), the slot [2] will be overwritten
            # with the compressed rule string used to implement
            # equality testing.

            $zone = undef; # A Rule line terminates any open Zone block
            # Rule
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule US 1918 1919 - Mar lastSun 2:00 1:00 W # War
            #|Rule US 1918 1919 - Oct lastSun 2:00 0 S
            #|Rule US 1942 only - Feb 9 2:00 1:00 W # War
            #|Rule US 1945 only - Sep 30 2:00 0 S
            #|Rule US 1967 max - Oct lastSun 2:00 0 S
            #|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
            #|Rule US 1974 only - Jan 6 2:00 1:00 D
            #|Rule US 1975 only - Feb 23 2:00 1:00 D
            #|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
            #|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
            if (/^rule\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+
                 (\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/xi) {
                my ($name, $from, $to, $type, $in, $on, $at, $save, $letter) =
                    ($1, $2, $3, $4, $5, $6, $7, $8, $9);
                # Slot [0] holds the onset rule (nonzero SAVE); slot [1]
                # holds the cease rule (SAVE of 0).
                my $i = $save ? 0:1;
                # Bit for this slot within the [2] mask.  Previously the
                # code used $i itself as the mask, which is always zero
                # for slot [0], so a matched onset rule could still be
                # overwritten by later rules.  Use 1<<$i so slot [0]
                # maps to bit 1 and slot [1] to bit 2, as documented.
                my $bit = 1 << $i;

                if (!exists $RULES->{$name}) {
                    $RULES->{$name} = [];
                }
                my $ruleArray = $RULES->{$name};

                # Check our bit mask to see if we've already matched
                # a current rule.  If so, do nothing.  If not, then
                # save this rule line as the best one so far.
                if (@{$ruleArray} < 3 ||
                    !($ruleArray->[2] & $bit)) {
                    $ruleArray->[$i]->{from} = $from;
                    $ruleArray->[$i]->{to} = $to;
                    $ruleArray->[$i]->{type} = $type;
                    $ruleArray->[$i]->{in} = $in;
                    $ruleArray->[$i]->{on} = $on;
                    $ruleArray->[$i]->{at} = $at;
                    $ruleArray->[$i]->{save} = $save;
                    $ruleArray->[$i]->{letter} = $letter;

                    # Does this rule match the current year?  If so,
                    # set the bit mask so we don't overwrite this rule.
                    # This makes us ignore rules for subsequent years
                    # that are already listed in the database -- as long
                    # as we have an overriding rule for the current year.
                    if (($from == $YEAR && $to =~ /only/i) ||
                        ($from <= $YEAR &&
                         (($to =~ /^\d/ && $YEAR <= $to) || $to =~ /max/i))) {
                        $ruleArray->[2] |= $bit;
                    }
                }
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^link/i) {
            #|# Old names, for S5 users
            #|
            #|# Link LINK-FROM LINK-TO
            #|Link America/New_York EST5EDT
            #|Link America/Chicago CST6CDT
            #|Link America/Denver MST7MDT
            #|Link America/Los_Angeles PST8PDT
            #|Link America/Indianapolis EST
            #|Link America/Phoenix MST
            #|Link Pacific/Honolulu HST
            if (/^link\s+(\S+)\s+(\S+)/i) {
                # We currently only record a single link -- if there
                # are more than one, we should modify this.
                my ($from, $to) = ($1, $2);
                $ZONES->{$from}->{link} = $to;
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } else {
            # Unexpected line
            print STDERR "Ignoring in $FILE: $_";
            ++$badLineCount;
        }
        # Record the zone data from a Zone start or continuation line,
        # but only when it is current: no UNTIL field, or a numeric
        # UNTIL that is >= the target year.
        if ($zoneRule &&
            ($zoneUntil !~ /\S/ || ($zoneUntil =~ /^\d/ &&
                                    $zoneUntil >= $YEAR))) {
            $ZONES->{$zone}->{gmtoff} = $zoneGmtoff;
            $ZONES->{$zone}->{rule} = $zoneRule;
            $ZONES->{$zone}->{format} = $zoneFormat;
            $ZONES->{$zone}->{until} = $zoneUntil;
        }
    }
    close(FILE);
}
|
||||
|
||||
######################################################################
|
||||
# Param: Ref to hash of zones
|
||||
# Param: Ref to hash of rules
|
||||
sub Postprocess {
    # Clean up the parsed data after all files have been read:
    # drop dead links, redirect zones whose rule vanished to the
    # standard rule, delete unused rules, and precompute each rule's
    # encoded comparison string in slot [2].
    my ($ZONES, $RULES) = @_;
    my %ruleInUse; # Rule names referenced by at least one zone

    # Eliminate zone links that have no corresponding zone
    # (a link entry exists but the zone itself was never recorded).
    foreach (keys %$ZONES) {
        if (exists $ZONES->{$_}->{link} && !exists $ZONES->{$_}->{rule}) {
            if (0) { # Debugging output, normally disabled
                print STDERR
                    "Deleting link from historical/nonexistent zone: ",
                    $_, " -> ", $ZONES->{$_}->{link}, "\n";
            }
            delete $ZONES->{$_};
        }
    }

    # Check that each zone has a corresponding rule
    foreach (sort keys %$ZONES) {
        my $ruleName = $ZONES->{$_}->{rule};
        next if ($ruleName eq $STANDARD);
        if (exists $RULES->{$ruleName}) {
            $ruleInUse{$ruleName} = 1;
        } else {
            # This means the zone is using the standard rule now
            $ZONES->{$_}->{rule} = $STANDARD;
        }
    }

    # Check that both parts are there for rules
    # Check for unused rules
    # Make coded string for comparisons
    foreach (keys %$RULES) {
        if (!exists $ruleInUse{$_}) {
            if (0) { # Debugging output, normally disabled
                print STDERR "Deleting historical/unused rule: $_\n";
            }
            delete $RULES->{$_};
        } elsif (!$RULES->{$_}->[0] || !$RULES->{$_}->[1]) {
            # Onset ([0]) or cease ([1]) part missing; warn but keep.
            print STDERR "Rule doesn't have both parts: $_\n";
        } else {
            # Generate code
            # This has all the data about a rule; it can be used
            # to see if two rules behave identically.
            # Format: "in,on,at,save;in,on,at", lowercased (see tzutil.pm).
            $RULES->{$_}->[2] =
                lc($RULES->{$_}->[0]->{in} . "," .
                   $RULES->{$_}->[0]->{on} . "," .
                   $RULES->{$_}->[0]->{at} . "," .
                   $RULES->{$_}->[0]->{save} . ";" .
                   $RULES->{$_}->[1]->{in} . "," .
                   $RULES->{$_}->[1]->{on} . "," .
                   $RULES->{$_}->[1]->{at}); # [1]->{save} is always zero
        }
    }
}
|
197
icu4c/source/tools/gentz/tzutil.pm
Normal file
197
icu4c/source/tools/gentz/tzutil.pm
Normal file
|
@ -0,0 +1,197 @@
|
|||
######################################################################
|
||||
# Copyright (C) 1999, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
######################################################################
|
||||
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
|
||||
# where <year> is "1999b" or a similar string.
|
||||
######################################################################
|
||||
# This package contains utility functions for time zone data.
|
||||
# Author: Alan Liu
|
||||
|
||||
######################################################################
|
||||
# Zones - A time zone object is a hash with the following keys:
|
||||
# {gmtoff} The offset from GMT, e.g. "-5:00"
|
||||
# {rule} The name of the rule, e.g. "-", "Canada", "EU", "US"
|
||||
# {format} The local abbreviation, e.g. "E%sT"
|
||||
# {until} Data is good until this year, e.g., "2000". Often blank.
|
||||
|
||||
# These correspond to file entries:
|
||||
#|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
|
||||
#|Zone America/Montreal -4:54:16 - LMT 1884
|
||||
#| -5:00 Mont E%sT
|
||||
|
||||
# Optionally, a zone may also have the key:
|
||||
# {link} An old name for this zone, e.g. "HST" (for Pacific/Honolulu)
|
||||
# Links come from the file entries:
|
||||
#|# Link LINK-FROM LINK-TO
|
||||
#|Link America/New_York EST5EDT
|
||||
#|Link America/Chicago CST6CDT
|
||||
|
||||
# The name of the zone itself is not kept in the zone object.
|
||||
# Instead, zones are kept in a big hash. The keys are the names; the
|
||||
# values are references to the zone objects. The big hash of all
|
||||
# zones is referred to in all caps: %ZONES ($ZONES if it's a
|
||||
# reference).
|
||||
|
||||
# Example: $ZONES->{"America/Los_Angeles"} =
|
||||
# 'format' => 'P%sT'
|
||||
# 'gmtoff' => '-8:00'
|
||||
# 'link' => 'US/Pacific-New'
|
||||
# 'rule' => 'US'
|
||||
# 'until' => ''
|
||||
|
||||
######################################################################
|
||||
# Rules - A time zone rule is an array with the following elements:
|
||||
# [0] Onset rule
|
||||
# [1] Cease rule
|
||||
# [2] Encoded string
|
||||
|
||||
# The onset rule and cease rule have the same format. They are each
|
||||
# references to a hash with keys:
|
||||
# {from} Start year
|
||||
# {to} End year, or "only" or "max"
|
||||
# {type} Unknown, usually "-"
|
||||
# {in} Month, 3 letters
|
||||
# {on} Day specifier, e.g. "lastSun", "Sun>=1", "23"
|
||||
# {at} Time, e.g. "2:00", "1:00u"
|
||||
# {save} Amount of savings, for the onset; 0 for the cease
|
||||
# {letter} Guess: the letter that goes into %s in the zone {format}
|
||||
|
||||
# These correspond to the file entries thus:
|
||||
#|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
|
||||
#|Rule US 1942 only - Feb 9 2:00 1:00 W # War
|
||||
#|Rule US 1945 only - Sep 30 2:00 0 S
|
||||
#|Rule US 1967 max - Oct lastSun 2:00 0 S
|
||||
#|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
|
||||
#|Rule US 1974 only - Jan 6 2:00 1:00 D
|
||||
#|Rule US 1975 only - Feb 23 2:00 1:00 D
|
||||
#|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
|
||||
#|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
|
||||
|
||||
# Entry [2], the encoded string, is used to see if two rules are the
|
||||
# same. It consists of "[0]->{in},[0]->{on},[0]->{at},[0]->{save};
|
||||
# [1]->{in},[1]->{on},[1]->{at}". Note that the separator between
|
||||
# values is a comma, between onset and cease is a semicolon. Also
|
||||
# note that the cease {save} is not used as this is always 0. The
|
||||
# whole string is forced to lowercase.
|
||||
|
||||
# Rules don't contain their own name. Like zones, rules are kept in a
|
||||
# big hash; the keys are the names, the values the references to the
|
||||
# arrays. This hash of all rules is referred to in all caps, %RULES
|
||||
# or for a reference, $RULES.
|
||||
|
||||
# Example: $RULES->{"US"} =
|
||||
# 0 HASH(0x8fa03c)
|
||||
# 'at' => '2:00'
|
||||
# 'from' => 1987
|
||||
# 'in' => 'Apr'
|
||||
# 'letter' => 'D'
|
||||
# 'on' => 'Sun>=1'
|
||||
# 'save' => '1:00'
|
||||
# 'to' => 'max'
|
||||
# 'type' => '-'
|
||||
# 1 HASH(0x8f9fc4)
|
||||
# 'at' => '2:00'
|
||||
# 'from' => 1967
|
||||
# 'in' => 'Oct'
|
||||
# 'letter' => 'S'
|
||||
# 'on' => 'lastSun'
|
||||
# 'save' => 0
|
||||
# 'to' => 'max'
|
||||
# 'type' => '-'
|
||||
# 2 'apr,sun>=1,2:00,1:00;oct,lastsun,2:00'
|
||||
|
||||
package TZ;
|
||||
use strict;
|
||||
use Carp;
|
||||
use vars qw(@ISA @EXPORT $VERSION $STANDARD);
|
||||
require 'dumpvar.pl';
|
||||
|
||||
@ISA = qw(Exporter);
|
||||
@EXPORT = qw(ZoneEquals
|
||||
RuleEquals
|
||||
FormZoneEquivalencyGroups
|
||||
);
|
||||
$VERSION = '0.1';
|
||||
|
||||
$STANDARD = '-'; # Name of the Standard Time rule
|
||||
|
||||
######################################################################
|
||||
# Param: zone object (hash ref)
|
||||
# Param: zone object (hash ref)
|
||||
# Param: ref to hash of all rules
|
||||
# Return: true if two zones are equivalent
|
||||
sub ZoneEquals {
    my ($zone1, $zone2, $RULES) = @_;

    # Two zones are equivalent when they name the same rule, or when
    # their rules have matching precomputed encoding strings.
    my $rule1 = $zone1->{rule};
    my $rule2 = $zone2->{rule};

    return ($rule1 eq $rule2) ||
        RuleEquals($RULES->{$rule1}, $RULES->{$rule2});
}
|
||||
|
||||
######################################################################
# Param: rule object (array ref; see the Rules layout above)
# Param: rule object (array ref; see the Rules layout above)
# Return: true if two rules are equivalent
||||
sub RuleEquals {
    my ($ruleA, $ruleB) = @_;

    # Just compare the precomputed encoding strings in slot [2].
    # defined() screens out undefined rules; the only undefined rule
    # is $STANDARD -- any others would be caused by Postprocess().
    return defined($ruleA) && defined($ruleB) && $ruleA->[2] eq $ruleB->[2];

    # There's actually one more level of equivalency analysis we could
    # do.  This is to recognize that Sun>=1 is the same as First Sun.
    # We don't do this yet.
}
|
||||
|
||||
######################################################################
|
||||
# Given a hash of all zones and a hash of all rules, create a list
|
||||
# of equivalency groups. These are groups of zones with the same
|
||||
# offset and equivalent rules. Equivalency is tested with
|
||||
# ZoneEquals and RuleEquals. The resultant equivalency list is an
|
||||
# array of refs to groups. Each group is an array of one or more
|
||||
# zone names.
|
||||
# Param: IN ref to hash of all zones
|
||||
# Param: IN ref to hash of all rules
|
||||
# Param: OUT ref to array to receive group refs
|
||||
sub FormZoneEquivalencyGroups {
    # Partition all zones into groups that share a GMT offset and an
    # equivalent rule, appending one array ref per group to @$EQUIV.
    my ($ZONES, $RULES, $EQUIV) = @_;

    # Group the zones by offset. This improves efficiency greatly;
    # instead of an n^2 computation, we just need to do n^2 within
    # each offset; a much smaller total number.
    my %ZONES_BY_OFFSET; # raw gmtoff string -> array ref of zone names
    foreach (keys %$ZONES) {
        push @{$ZONES_BY_OFFSET{$ZONES->{$_}->{gmtoff}}}, $_;
    }

    # Find equivalent rules
    foreach my $gmtoff (keys %ZONES_BY_OFFSET) {
        # Make an array of equivalency groups
        # (array of refs to array of names)
        my @equiv;
        foreach my $name1 (@{$ZONES_BY_OFFSET{$gmtoff}}) {
            my $found = 0;
            # Try to place this zone in an existing group.  Each group
            # is represented by its first member, which is sufficient
            # because equivalence is transitive within a group.
            foreach my $group (@equiv) {
                my $name2 = $group->[0];
                if (ZoneEquals($ZONES->{$name1}, $ZONES->{$name2}, $RULES)) {
                    push @$group, $name1;
                    $found = 1;
                    last;
                }
            }
            # No matching group; this zone starts a new one.
            if (!$found) {
                my @newGroup = ( $name1 );
                push @equiv, \@newGroup;
            }
        }
        push @$EQUIV, @equiv;
    }
}
|
Loading…
Add table
Reference in a new issue