ICU-65 update gentz for new binary format and alias table; make pm file names 8.3

X-SVN-Rev: 300
This commit is contained in:
Alan Liu 1999-12-05 05:55:28 +00:00
parent 76bdfc909a
commit aa7e2fc7b8
6 changed files with 1801 additions and 490 deletions

View file

@ -36,9 +36,6 @@
#define INPUT_FILE "tz.txt"
#define OUTPUT_FILE "tz.dat"
#define DATA_NAME "tz"
#define DATA_TYPE "dat"
#define DATA_COPYRIGHT \
"Copyright (C) 1999, International Business Machines " \
"Corporation and others. All Rights Reserved."
@ -53,9 +50,9 @@ static const UDataInfo dataInfo = {
sizeof(UChar),
0,
'z', 'o', 'n', 'e', /* dataFormat */
1, 0, 0, 0, /* formatVersion */
1, 9, 9, 9 /* dataVersion */
TZ_SIG[0], TZ_SIG[1], TZ_SIG[2], TZ_SIG[3], /* dataFormat */
TZ_FORMAT_VERSION, 0, 0, 0, /* formatVersion */
0, 0, 0, 0 /* dataVersion - will be filled in with year.suffix */
};
@ -70,6 +67,9 @@ class gentz {
// larger is considered an error. Adjust as needed.
enum { MAX_ZONES = 1000 };
// The largest maxNameLength we accept as sensible. Adjust as needed.
enum { MAX_MAX_NAME_LENGTH = 100 };
// The maximum sensible GMT offset, in seconds
static const int32_t MAX_GMT_OFFSET;
@ -87,15 +87,19 @@ class gentz {
enum { BUFLEN = 1024 };
char buffer[BUFLEN];
int32_t lineNumber;
TZHeader header;
StandardZone* stdZones;
DSTZone* dstZones;
char* nameTable;
int32_t* indexByName;
OffsetIndex* indexByOffset;
int32_t zoneCount; // Total number of zones
int32_t maxPerOffset; // Maximum number of zones per offset
int32_t stdZoneSize;
int32_t dstZoneSize;
int32_t offsetIndexSize; // Total bytes in offset index table
int32_t nameTableSize; // Total bytes in name table
bool_t useCopyright;
@ -116,9 +120,13 @@ private:
void parse1DSTZone(FileStream* in, DSTZone& zone);
void parseDSTRule(char*& p, TZRule& rule);
int32_t* parseIndexTable(FileStream* in);
OffsetIndex* parseOffsetIndexTable(FileStream* in);
char* parseNameTable(FileStream* in);
// Low level parsing and reading
void readEndMarker(FileStream* in);
int32_t readIntegerLine(FileStream* in, int32_t min, int32_t max);
int32_t _parseInteger(char*& p);
int32_t parseInteger(char*& p, char nextExpectedChar, int32_t, int32_t);
@ -134,7 +142,7 @@ int main(int argc, char *argv[]) {
return x.main(argc, argv);
}
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60;
const int32_t gentz::MAX_GMT_OFFSET = (int32_t)24*60*60; // seconds
const char gentz::COMMENT = '#';
const char gentz::CR = ((char)13);
const char gentz::LF = ((char)10);
@ -148,19 +156,19 @@ const char* gentz::END_KEYWORD = "end";
void gentz::usage(const char* argv0) {
fprintf(stderr,
"Usage: %s [-c[+|-]] infile outfile\n"
"Usage: %s [-c[+|-]] infile\n"
" -c[+|-] [do|do not] include copyright (default=+)\n"
" infile text file produced by tz.pl\n"
" outfile binary file suitable for memory mapping\n",
" infile text file produced by tz.pl\n",
argv0);
exit(1);
}
int gentz::main(int argc, char *argv[]) {
////////////////////////////////////////////////////////////
// Parse arguments
////////////////////////////////////////////////////////////
useCopyright = TRUE;
const char* infile = 0;
const char* outfile = 0;
for (int i=1; i<argc; ++i) {
const char* arg = argv[i];
if (arg[0] == '-') {
@ -179,17 +187,20 @@ int gentz::main(int argc, char *argv[]) {
}
} else if (infile == 0) {
infile = arg;
} else if (outfile == 0) {
outfile = arg;
} else {
usage(argv[0]);
}
}
if (outfile == 0) {
if (infile == 0) {
usage(argv[0]);
}
////////////////////////////////////////////////////////////
// Read the input file
////////////////////////////////////////////////////////////
*buffer = NUL;
lineNumber = 0;
fprintf(stdout, "Input file: %s\n", infile);
FileStream* in = T_FileStream_open(infile, "r");
if (in == 0) {
die("Cannot open input file");
@ -197,14 +208,13 @@ int gentz::main(int argc, char *argv[]) {
parseTzTextFile(in);
T_FileStream_close(in);
*buffer = NUL;
fprintf(stdout, "Input file %s, data version %u(%u)\n",
infile, header.versionYear, header.versionSuffix);
fprintf(stdout, "Read %ld standard zones, %ld dst zones, %ld zone names\n",
header.standardCount, header.dstCount, zoneCount);
////////////////////////////////////////////////////////////
// Write the output file
////////////////////////////////////////////////////////////
int32_t wlen = writeTzDatFile();
fprintf(stdout, "Wrote to %s: %ld bytes\n",
outfile, wlen);
fprintf(stdout, "Output file: %s.%s, %ld bytes\n",
TZ_DATA_NAME, TZ_DATA_TYPE, wlen);
return 0; // success
}
@ -213,15 +223,23 @@ int32_t gentz::writeTzDatFile() {
UNewDataMemory *pdata;
UErrorCode status = U_ZERO_ERROR;
pdata = udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
// Fill in dataInfo with year.suffix
*(uint16_t*)&(dataInfo.dataVersion[0]) = header.versionYear;
*(uint16_t*)&(dataInfo.dataVersion[2]) = header.versionSuffix;
pdata = udata_create(TZ_DATA_TYPE, TZ_DATA_NAME, &dataInfo,
useCopyright ? DATA_COPYRIGHT : 0, &status);
if (U_FAILURE(status)) {
die("Unable to create data memory");
}
// Careful: This order cannot be changed (without changing
// the offset fixup code).
udata_writeBlock(pdata, &header, sizeof(header));
udata_writeBlock(pdata, stdZones, stdZoneSize);
udata_writeBlock(pdata, dstZones, dstZoneSize);
udata_writeBlock(pdata, indexByName, header.count * sizeof(indexByName[0]));
udata_writeBlock(pdata, indexByOffset, offsetIndexSize);
udata_writeBlock(pdata, nameTable, nameTableSize);
uint32_t dataLength = udata_finish(pdata, &status);
@ -230,7 +248,10 @@ int32_t gentz::writeTzDatFile() {
}
if (dataLength != (sizeof(header) + stdZoneSize +
dstZoneSize + nameTableSize)) {
dstZoneSize + nameTableSize +
header.count * sizeof(indexByName[0]) +
offsetIndexSize
)) {
die("Written file doesn't match expected size");
}
return dataLength;
@ -240,37 +261,139 @@ void gentz::parseTzTextFile(FileStream* in) {
parseHeader(in);
stdZones = parseStandardZones(in);
dstZones = parseDSTZones(in);
if (zoneCount != (int32_t)(header.standardCount + header.dstCount)) {
if (header.count != (header.standardCount + header.dstCount)) {
die("Zone counts don't add up");
}
nameTable = parseNameTable(in);
// Fixup the header offsets
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
header.standardDelta = sizeof(header);
header.dstDelta = header.standardDelta + stdZoneSize;
header.nameIndexDelta = header.dstDelta + dstZoneSize;
header.standardOffset = sizeof(header);
header.dstOffset = header.standardOffset + stdZoneSize;
header.nameTableOffset = header.dstOffset + dstZoneSize;
// Read in index tables after header is mostly fixed up
indexByName = parseIndexTable(in);
indexByOffset = parseOffsetIndexTable(in);
if (header.standardOffset < 0 ||
header.dstOffset < 0 ||
header.nameTableOffset < 0) {
header.offsetIndexDelta = header.nameIndexDelta + header.count *
sizeof(indexByName[0]);
header.nameTableDelta = header.offsetIndexDelta + offsetIndexSize;
if (header.standardDelta < 0 ||
header.dstDelta < 0 ||
header.nameTableDelta < 0) {
die("Negative offset in header after fixup");
}
}
/**
* Index tables are lists of specifiers of the form /[sd]\d+/, where
* the first character determines if it is a standard or DST zone,
* and the following number is in the range 0..n-1, where n is the
* count of that type of zone.
*
* Header must already be read in and the offsets must be fixed up.
* Standard and DST zones must be read in.
*/
int32_t* gentz::parseIndexTable(FileStream* in) {
// The table must start with a count equal to the total zone count
// already read into the header.
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
if (n != header.count) {
die("Count mismatch in index table");
}
int32_t* result = new int32_t[n];
for (uint32_t i=0; i<n; ++i) {
readLine(in);
// Each entry is /[sd]\d+/: skip the type letter, parse the number.
char* p = buffer+1;
uint32_t index = parseInteger(p, NUL, 0, header.count);
switch (buffer[0]) {
case 's':
if (index >= header.standardCount) {
die("Standard index entry out of range");
}
// Convert the array index into a byte delta from the start of
// the file image: standard-zone base + offset within the array.
result[i] = header.standardDelta +
((char*)&stdZones[index] - (char*)&stdZones[0]);
break;
case 'd':
if (index >= header.dstCount) {
die("DST index entry out of range");
}
// Same computation, but relative to the DST-zone array base.
result[i] = header.dstDelta +
((char*)&dstZones[index] - (char*)&dstZones[0]);
break;
default:
die("Malformed index entry");
break;
}
}
// Every list in the input file is terminated by the 'end' keyword.
readEndMarker(in);
fprintf(stdout, " Read %lu name index table entries, in-memory size %ld bytes\n",
n, n * sizeof(int32_t));
return result;
}
/**
 * Parse the index-by-GMT-offset list.  Each input line holds a GMT
 * offset (seconds), a count, and that many zone numbers.  The entries
 * are packed back-to-back into a single byte buffer as variable-length
 * OffsetIndex records; offsetIndexSize is set to the packed size.
 * Header and zone lists must already be read (uses maxPerOffset and
 * header.count).  Fix: corrected "Interal" typo in the internal-error
 * message.
 */
OffsetIndex* gentz::parseOffsetIndexTable(FileStream* in) {
uint32_t n = readIntegerLine(in, 1, MAX_ZONES);
// We don't know how big the whole thing will be yet, but we can use
// the maxPerOffset number to compute an upper limit.
//
// Structs will not be 4-aligned because we'll be writing them out
// ourselves. Don't try to compute the exact size in advance
// (unless we want to avoid the use of sizeof(), which may
// introduce padding that we won't actually employ).
int32_t maxPossibleSize = n * (sizeof(OffsetIndex) +
(maxPerOffset-1) * sizeof(uint16_t));
int8_t *result = new int8_t[maxPossibleSize];
if (result == 0) {
die("Out of memory");
}
// Read each line and construct the corresponding entry
OffsetIndex* index = (OffsetIndex*)result;
for (uint32_t i=0; i<n; ++i) {
readLine(in);
char* p = buffer;
index->gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
index->count = (uint16_t)parseInteger(p, SEP, 1, maxPerOffset);
// The zone numbers form a variable-length array hanging off the end
// of the fixed struct; zoneNumber is its first element.
uint16_t* zoneNumberArray = &(index->zoneNumber);
for (uint16_t j=0; j<index->count; ++j) {
zoneNumberArray[j] = (uint16_t)
parseInteger(p, (j==(index->count-1))?NUL:SEP,
0, header.count-1);
}
// The next record is packed immediately after this one's zone array;
// the last record gets a delta of zero as a terminator.
int8_t* nextIndex = (int8_t*)&(zoneNumberArray[index->count]);
index->nextEntryDelta = (i==(n-1)) ? 0 : (nextIndex - (int8_t*)index);
index = (OffsetIndex*)nextIndex;
}
offsetIndexSize = (int8_t*)index - (int8_t*)result;
if (offsetIndexSize > maxPossibleSize) {
die("Yikes! Internal error while constructing offset index table");
}
readEndMarker(in);
fprintf(stdout, " Read %lu offset index table entries, in-memory size %ld bytes\n",
n, offsetIndexSize);
return (OffsetIndex*)result;
}
void gentz::parseHeader(FileStream* in) {
int32_t ignored;
// Version string, e.g., "1999j" -> (1999<<16) | 10
header.versionYear = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
header.versionYear = (uint16_t) readIntegerLine(in, 1990, 0xFFFF);
header.versionSuffix = (uint16_t) readIntegerLine(in, 0, 0xFFFF);
// Zone count
zoneCount = readIntegerLine(in, 0, MAX_ZONES);
header.count = readIntegerLine(in, 1, MAX_ZONES);
maxPerOffset = readIntegerLine(in, 1, MAX_ZONES);
/*header.maxNameLength*/ ignored = readIntegerLine(in, 1, MAX_MAX_NAME_LENGTH);
// Size of name table in bytes
// (0x00FFFFFF is an arbitrary upper limit; adjust as needed.)
nameTableSize = readIntegerLine(in, 1, 0x00FFFFFF);
fprintf(stdout, " Read header, data version %u(%u), in-memory size %ld bytes\n",
header.versionYear, header.versionSuffix, sizeof(header));
}
StandardZone* gentz::parseStandardZones(FileStream* in) {
@ -282,18 +405,19 @@ StandardZone* gentz::parseStandardZones(FileStream* in) {
for (uint32_t i=0; i<header.standardCount; i++) {
parse1StandardZone(in, zones[i]);
}
readLine(in);
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
die("Keyword 'end' missing");
}
readEndMarker(in);
stdZoneSize = (char*)&stdZones[header.standardCount] - (char*)&stdZones[0];
fprintf(stdout, " Read %lu standard zones, in-memory size %ld bytes\n",
header.standardCount, stdZoneSize);
return zones;
}
void gentz::parse1StandardZone(FileStream* in, StandardZone& zone) {
readLine(in);
char* p = buffer;
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, NUL, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
}
DSTZone* gentz::parseDSTZones(FileStream* in) {
@ -305,18 +429,19 @@ DSTZone* gentz::parseDSTZones(FileStream* in) {
for (uint32_t i=0; i<header.dstCount; i++) {
parse1DSTZone(in, zones[i]);
}
readLine(in);
if (icu_strcmp(buffer, END_KEYWORD) != 0) {
die("Keyword 'end' missing");
}
readEndMarker(in);
dstZoneSize = (char*)&dstZones[header.dstCount] - (char*)&dstZones[0];
fprintf(stdout, " Read %lu DST zones, in-memory size %ld bytes\n",
header.dstCount, dstZoneSize);
return zones;
}
void gentz::parse1DSTZone(FileStream* in, DSTZone& zone) {
readLine(in);
char* p = buffer;
zone.nameOffset = parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
/*zone.nameDelta =*/ parseInteger(p, SEP, 0, nameTableSize);
zone.gmtOffset = 1000 * // Convert s -> ms
parseInteger(p, SEP, -MAX_GMT_OFFSET, MAX_GMT_OFFSET);
parseDSTRule(p, zone.onsetRule);
parseDSTRule(p, zone.ceaseRule);
zone.dstSavings = (uint16_t) parseInteger(p, NUL, 0, 12*60);
@ -349,7 +474,7 @@ void gentz::parseDSTRule(char*& p, TZRule& rule) {
char* gentz::parseNameTable(FileStream* in) {
int32_t n = readIntegerLine(in, 1, MAX_ZONES);
if (n != zoneCount) {
if (n != (int32_t)header.count) {
die("Zone count doesn't match name table count");
}
char* names = new char[nameTableSize];
@ -371,9 +496,21 @@ char* gentz::parseNameTable(FileStream* in) {
if (p != limit) {
die("Name table shorter than declared size");
}
readEndMarker(in);
fprintf(stdout, " Read %ld names, in-memory size %ld bytes\n", n, nameTableSize);
return names;
}
/**
* Read the end marker (terminates each list).
*/
void gentz::readEndMarker(FileStream* in) {
// Consume one line and verify it is exactly the 'end' keyword that
// terminates every list in the input file; otherwise abort.
readLine(in);
const int isEnd = (icu_strcmp(buffer, END_KEYWORD) == 0);
if (!isEnd) {
die("Keyword 'end' missing");
}
}
/**
* Read a line from the FileStream and parse it as an
* integer. There should be nothing else on the line.
@ -432,12 +569,13 @@ int32_t gentz::parseInteger(char*& p, char nextExpectedChar,
void gentz::die(const char* msg) {
fprintf(stderr, "ERROR, %s\n", msg);
if (*buffer) {
fprintf(stderr, "Current input line: %s\n", buffer);
fprintf(stderr, "Input file line %ld: \"%s\"\n", lineNumber, buffer);
}
exit(1);
}
int32_t gentz::readLine(FileStream* in) {
++lineNumber;
T_FileStream_readLine(in, buffer, BUFLEN);
// Trim off trailing comment
char* p = icu_strchr(buffer, COMMENT);

View file

@ -1,11 +1,18 @@
Readme file for ICU time zone data (source/tools/gentz)
The time zone data in ICU is taken from the UNIX data files at
ftp://elsie.nci.nih.gov/pub/tzdata<year>.
RAW DATA
--------
The time zone data in ICU is taken from the UNIX data files at
ftp://elsie.nci.nih.gov/pub/tzdata<year>. The other input to the
process is an alias table, described below.
BUILD PROCESS
-------------
Two tools are used to process the data into a format suitable for ICU:
tz.pl directory of raw data files -> tz.txt
tz.pl directory of raw data files + tz.alias -> tz.txt
gentz tz.txt -> tz.dat (memory mappable binary file)
After gentz is run, standard ICU data tools are used to incorporate
@ -34,4 +41,35 @@ The tz.txt file is typically checked into CVS, whereas the raw data
files are not, since they are readily available from the URL listed
above.
ALIAS TABLE
-----------
For backward compatibility, we define several three-letter IDs that
have been used since early ICU and correspond to IDs used in old JDKs.
These IDs are listed in tz.alias. The tz.pl script processes this
alias table and issues errors if there are problems.
IDS
---
All *system* zone IDs must consist only of characters in the invariant
set. See utypes.h for an explanation of what this means. If an ID is
encountered that contains a non-invariant character, tz.pl complains.
Non-system zones may try to use non-invariant characters, but they
shouldn't because of possible collisions with system IDs when the
invariant char converter is used (see TimeZone class for details).
Etc/GMT...
----------
Users may be confused by the fact that various zones with names of the
form Etc/GMT+n appear to have an offset of the wrong sign. For
example, Etc/GMT+8 is 8 hours *behind* GMT; that is, it corresponds to
what one typically sees displayed as "GMT-8:00". The reason for this
inversion is explained in the UNIX zone data file "etcetera".
Briefly, this is done intentionally in order to comply with
POSIX-style signedness. In ICU we reproduce the UNIX zone behavior
faithfully, including this confusing aspect.
Alan Liu 1999

View file

@ -12,6 +12,9 @@
# parse out the current zones and create a listing of current zones.
# Author: Alan Liu
######################################################################
# This script reads an alias table, $TZ_ALIAS, and creates clones of
# standard UNIX zones with alias names.
######################################################################
# To update the zone data, download the latest data from the NIH URL
# listed above into a directory. Run this script with the directory
# name as an argument. THE DIRECTORY NAME MUST END IN tzdataYYYYR.
@ -27,31 +30,36 @@
# - Lines may be followed by a comment; the parser must ignore
# anything of the form /\s+#.*$/ in each line.
# |3065,14400 # Asia/Dubai GMT+4:00
# - The file contains a header and 3 lists.
# - The file contains a header and 5 lists.
# - The header contains the version of the unix data, the total
# zone count, and the length of the name table in bytes.
# |1999 # (tzdata1999j) ftp://elsie.nci.nih.gov data version YEAR
# |10 # (tzdata1999j) ftp://elsie.nci.nih.gov data version SUFFIX
# |387 # total zone count
# |5906 # length of name table in bytes
# zone count, the maximum number of zones sharing the same value
# of gmtOffset, the length of the name table in bytes, and
# the length of the longest name (not including the terminating
# zero byte).
# | 1999 # (tzdata1999j) version of Olson zone
# | 10 # data from ftp://elsie.nci.nih.gov
# | 387 # total zone count
# | 40 # max count of zones with same gmtOffset
# | 25 # max name length not incl final zero
# | 5906 # length of name table in bytes
# - Lists start with a count of the records to follow, the records
# themselves (one per line), and a single line with the keyword
# 'end'.
# - The first list is the list of standard zones:
# |208 # count of standard zones to follow
# |0,0 # Africa/Abidjan GMT+0:00
# |28,10800 # Africa/Addis_Ababa GMT+3:00
# | 208 # count of standard zones to follow
# | 0,0 # Africa/Abidjan GMT+0:00
# | 28,10800 # Africa/Addis_Ababa GMT+3:00
# ...
# |end
# | end
# Each standard zone record contains two integers. The first
# is a byte offset into the name table for the name of the zone.
# The second integer is the GMT offset in SECONDS for this zone.
# - The second list is the list of DST zones:
# |179 # count of dst zones to follow
# |15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
# |184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
# | 179 # count of dst zones to follow
# | 15,0,8,1,0,0,w,11,31,0,0,w,20 # Africa/Accra GMT+0:00 Sep 1...
# | 184,7200,3,-1,6,0,s,8,-1,5,1380,s,60 # Africa/Cairo GMT+2:0...
# ...
# |end
# | end
# Each record starts with the same two integers as a standard
# zone record. Following this are data for the onset rule and
# the cease rule. Each rule is described by the following integers:
@ -63,13 +71,38 @@
# The last integer in the record is the DST savings in MINUTES,
# typically 60.
# - The third list is the name table:
# |387 # count of names to follow
# |Africa/Abidjan
# |Africa/Accra
# | 387 # count of names to follow
# | Africa/Abidjan
# | Africa/Accra
# ...
# |end
# | end
# Each name is terminated by a newline (like all lines in the file).
# The offsets in the first two lists refer to this table.
# - The fourth list is an index list by name. The index entries
# themselves are of the form /[sd]\d+/, where the first character
# indicates standard or DST, and the number that follows indexes
# into the corresponding array.
# | 416 # count of name index table entries to follow
# | d0 # ACT
# | d1 # AET
# | d2 # AGT
# | d3 # ART
# | d4 # AST
# | s0 # Africa/Abidjan
# ...
# | end
# - The fifth list is an index by GMT offset. Each line lists the
# zones with the same offset. The first number on the line
# is the GMT offset in seconds. The second number is the count
# of zone numbers to follow. Each zone number is an integer from
# 0..n-1, where n is the total number of zones. The zone numbers
# refer to the zone list in alphabetical order.
# | 39 # index by offset entries to follow
# | -43200,1,280
# | -39600,6,279,365,373,393,395,398
# | -36000,8,57,278,349,379,386,387,403,405
# ...
# | end
######################################################################
# As of 1999j, here are the various possible values taken by the
# rule fields. See code below that generates this data.
@ -88,10 +121,32 @@ require 5; # Minimum version of perl needed
use strict;
use Getopt::Long;
use vars qw(@FILES $YEAR $DATA_DIR $OUT $SEP @MONTH
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION);
$VERSION_YEAR $VERSION_SUFFIX $RAW_VERSION $TZ_ALIAS);
require 'dumpvar.pl';
use TZFileParser;
use TZUtility;
use tzparse;
use tzutil;
# File names
$OUT = 'tz.txt';
$TZ_ALIAS = 'tz.alias';
# Separator between fields in the output file
$SEP = ','; # Don't use ':'!
@FILES = qw(africa
antarctica
asia
australasia
backward
etcetera
europe
factory
northamerica
pacificnew
solar87
solar88
solar89
southamerica);
# We get the current year from the system here. Later
# we double check this against the zone data version.
@ -127,26 +182,6 @@ if ($DATA_DIR =~ /(tzdata(\d{4})(\w?))/) {
usage();
}
# Output file name
$OUT = 'tz.txt';
# Separator between fields in the output file
$SEP = ','; # Don't use ':'!
@FILES = qw(africa
antarctica
asia
australasia
backward
etcetera
europe
factory
northamerica
pacificnew
solar87
solar88
solar89
southamerica);
@MONTH = qw(jan feb mar apr may jun
jul aug sep oct nov dec);
@ -181,6 +216,8 @@ sub main {
TZ::Postprocess(\%ZONES, \%RULES);
incorporateAliases($TZ_ALIAS, \%ZONES);
print
"Read ", scalar keys %ZONES, " current zones and ",
scalar keys %RULES, " rules for $YEAR\n";
@ -207,10 +244,16 @@ sub main {
my %NAME_OFFSET;
my $STD_COUNT = 0; # Count of standard zones
my $DST_COUNT = 0; # Count of DST zones
my $maxNameLen = 0;
foreach my $z (sort keys %ZONES) {
# Make sure zone IDs only contain invariant chars
assertInvariantChars($z);
my $len = length($z);
$NAME_OFFSET{$z} = $offset;
$offset += length($z) + 1;
$offset += $len + 1;
$NAME_LIST .= "$z\n";
$maxNameLen = $len if ($len > $maxNameLen);
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
$STD_COUNT++;
} else {
@ -218,14 +261,35 @@ sub main {
}
}
my $NAME_SIZE = $offset;
# Find the maximum number of zones with the same value of
# gmtOffset.
my %perOffset; # Hash of offset -> count
foreach my $z (keys %ZONES) {
# Use parseOffset to normalize values - probably unnecessary
++$perOffset{parseOffset($ZONES{$z}->{gmtoff})};
}
my $maxPerOffset = 0;
foreach (values %perOffset) {
$maxPerOffset = $_ if ($_ > $maxPerOffset);
}
open(OUT,">$OUT") or die "Can't open $OUT for writing: $!";
############################################################
# EMIT HEADER
############################################################
# Zone data version
print OUT $VERSION_YEAR, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version YEAR\n";
print OUT $VERSION_SUFFIX, " # ($RAW_VERSION) ftp://elsie.nci.nih.gov data version SUFFIX\n";
print OUT $VERSION_YEAR, " # ($RAW_VERSION) version of Olson zone\n";
print OUT $VERSION_SUFFIX, " # data from ftp://elsie.nci.nih.gov\n";
print OUT scalar keys %ZONES, " # total zone count\n";
print OUT $maxPerOffset, " # max count of zones with same gmtOffset\n";
print OUT $maxNameLen, " # max name length not incl final zero\n";
print OUT $NAME_SIZE, " # length of name table in bytes\n";
############################################################
# EMIT ZONE TABLES
############################################################
# Output first the standard zones, then the dst zones.
# Precede each list with the count of zones to follow,
# and follow it with the keyword 'end'.
@ -241,13 +305,73 @@ sub main {
print OUT "end\n"; # 'end' keyword for error checking
}
############################################################
# EMIT NAME TABLE
############################################################
# Output the name table, followed by 'end' keyword
print OUT scalar keys %ZONES, " # count of names to follow\n";
print OUT $NAME_LIST, "end\n";
############################################################
# EMIT INDEX BY NAME
############################################################
# Output the name index table. Since we don't know structure
# sizes, we output the index number of each zone. For example,
# "s0" is the first standard zone, "s1" is the second, etc.
# Likewise, "d0" is the first DST zone, "d1" is the second, etc.
# First compute index IDs, as described above.
my %indexID;
my $s = 0;
my $d = 0;
foreach my $z (sort keys %ZONES) {
if ($ZONES{$z}->{rule} eq $TZ::STANDARD) {
$indexID{$z} = "s$s";
$s++;
} else {
$indexID{$z} = "d$d";
$d++;
}
}
# Now emit table sorted by name
print OUT scalar keys %ZONES, " # count of name index table entries to follow\n";
foreach my $z (sort keys %ZONES) {
print OUT $indexID{$z}, " # $z\n";
}
print OUT "end\n";
############################################################
# EMIT INDEX BY GMT OFFSET
############################################################
# Create a hash mapping zone name -> integer, from 0..n-1.
my %zoneNumber;
my $i = 0;
foreach (sort keys %ZONES) { $zoneNumber{$_} = $i++; }
# Create a hash by index. The hash has offset integers as keys
# and arrays of index numbers as values.
my %offsetMap;
foreach (sort keys %ZONES) {
my $offset = parseOffset($ZONES{$_}->{gmtoff});
push @{$offsetMap{$offset}}, $zoneNumber{$_};
}
# Emit it
print OUT scalar keys %offsetMap, " # index by offset entries to follow\n";
foreach (sort {$a <=> $b} keys %offsetMap) {
my $aref = $offsetMap{$_};
print OUT $_, ",", scalar @{$aref}, ",", join(",", @{$aref}), "\n";
}
print OUT "end\n";
############################################################
# END
############################################################
close(OUT);
print "$OUT written.\n";
if (0) {
TZ::FormZoneEquivalencyGroups(\%ZONES, \%RULES, \@EQUIV);
print
@ -281,6 +405,46 @@ sub main {
}
}
# Read the alias list and create clones with alias names. This
# sub should be called AFTER all standard zones have been read in.
# Param: File name of alias list
# Param: Ref to zone hash
sub incorporateAliases {
my ($aliasFile, $zones) = @_;
my $n = 0;
local *IN;
open(IN,$aliasFile) or die "Can't open $aliasFile: $!";
while (<IN>) {
s/\#.*//; # Strip trailing comments
next unless (/\S/); # Ignore blank lines
# Each data line is "<alias> <original>"; anything else is fatal.
unless (/^\s*(\S+)\s+(\S+)\s*$/) {
die "Bad line in alias table $aliasFile: $_\n";
}
my ($alias, $original) = ($1, $2);
if (exists $zones->{$alias}) {
die "Bad alias in $aliasFile: $alias is a standard UNIX zone. " .
"Please remove $alias from the alias table.\n";
}
if (!exists $zones->{$original}) {
die "Bad alias in $aliasFile: $alias maps to the nonexistent " .
"zone $original. Please fix this entry in the alias table.\n";
}
# We hardcode the GMT zone in the TimeZone class; don't include
# it in the tz.txt file.
if ($alias eq "GMT") {
die "Bad alias in $aliasFile: GMT is a hardcoded system zone. " .
"Please remove it from the alias table.\n";
}
# Create the alias!  NOTE(review): this shares the same zone hash
# ref between alias and original (not a deep copy) — appears
# intentional since zones are read-only afterwards; confirm.
$zones->{$alias} = $zones->{$original};
$n++;
}
print "Incorporated $n aliases from $aliasFile\n";
close(IN);
}
# Format a time zone as a machine-readable line of text. Another
# tool will read this line to construct a binary data structure
# representing this zone.
@ -481,4 +645,14 @@ sub parseDaySpecifier {
( $dowim, $dow );
}
# Confirm that the given ID contains only invariant characters.
# See utypes.h for an explanation.
# Param: string to be checked
# Die unless the given zone ID uses only ICU invariant characters
# (see utypes.h for the definition of the invariant set).
sub assertInvariantChars {
local $_ = shift;
die "Error: Zone ID \"$_\" contains non-invariant characters\n"
if (/[^A-Za-z0-9 \"%&\'()*+,-.\/:;<=>?_]/);
}
__END__

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,245 @@
######################################################################
# Copyright (C) 1999, International Business Machines
# Corporation and others. All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This package handles the parsing of time zone files.
# Author: Alan Liu
######################################################################
# Usage:
# Call ParseFile for each file to be imported. Then call Postprocess
# to remove unused rules and links.
# Module preamble: declare the TZ package and export the two entry
# points (ParseFile, Postprocess) used by the driver script.
package TZ;
use strict;
use Carp;
use vars qw(@ISA @EXPORT $VERSION $YEAR $STANDARD);
# dumpvar.pl: core debugging helper; presumably used to dump the parsed
# zone/rule structures during development -- confirm before removing.
require 'dumpvar.pl';
# NOTE(review): Exporter itself is not required here; presumably the
# driver script loads it before using this module -- confirm.
@ISA = qw(Exporter);
@EXPORT = qw(ParseFile
Postprocess
);
$VERSION = '0.1';
$STANDARD = '-'; # Name of the Standard Time rule
######################################################################
# Parse one zoneinfo source file (e.g. "northamerica") and merge its
# zones, rules, and links into the given hashes.  Lines that cannot
# be parsed are reported to STDERR and counted, but processing
# continues.
# Param: File name
# Param: Ref to hash of zones; filled in.  See the module header
#        comment for the layout of a zone entry.
# Param: Ref to hash of rules; filled in.  See the module header
#        comment for the layout of a rule entry.
# Param: Current year; only zone/rule data relevant to this year
#        is retained.
sub ParseFile {
    my ($FILE, $ZONES, $RULES, $YEAR) = @_;
    local(*FILE);
    open(FILE,"<$FILE") or confess "Can't open $FILE: $!";
    my $zone; # Current zone
    my $badLineCount = 0;
    while (<FILE>) {
        s/\#.*//;          # Strip comments
        next if (!/\S/);   # Skip blank lines
        #|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
        #|Zone America/Montreal -4:54:16 - LMT 1884
        #| -5:00 Mont E%sT
        #|Zone America/Thunder_Bay -5:57:00 - LMT 1895
        #| -5:00 Canada E%sT 1970
        #| -5:00 Mont E%sT 1973
        #| -5:00 - EST 1974
        #| -5:00 Canada E%sT
        my ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil);
        if (/^zone/i) {
            # Zone block start; the UNTIL field is optional, hence the
            # two alternative patterns.
            if (/^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/i
                || /^zone\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)()/i) {
                $zone = $1;
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($2, $3, $4, $5);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^\s/ && $zone) {
            # Zone continuation; again the UNTIL field is optional.
            if (/^\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/
                || /^\s+(\S+)\s+(\S+)\s+(\S+)()/) {
                ($zoneGmtoff, $zoneRule, $zoneFormat, $zoneUntil) =
                    ($1, $2, $3, $4);
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^rule/i) {
            # Here is where we parse a single line of the rule table.
            # Our goal is to accept only rules applying to the current
            # year. This is normally a matter of accepting rules
            # that match the current year. However, in some cases this
            # is more complicated. For example:
            #|# Tonga
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule Tonga 1999 max - Oct Sat>=1 2:00s 1:00 S
            #|Rule Tonga 2000 max - Apr Sun>=16 2:00s 0 -
            # To handle this properly, we save every rule we encounter
            # (thus overwriting older ones with newer ones, since rules
            # are listed in order), and also use slot [2] to mark when
            # we see a current year rule. When that happens, we stop
            # saving rules. Thus we match the latest rule we see, or
            # a matching rule if we find one. The format of slot [2]
            # is just a 2 bit flag ([2]&1 means slot [0] matched,
            # [2]&2 means slot [1] matched).
            # Note that later, when the rules are post processed
            # (see Postprocess), the slot [2] will be overwritten
            # with the compressed rule string used to implement
            # equality testing.
            $zone = undef;  # A rule line terminates any open zone block
            # Rule
            #|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
            #|Rule US 1918 1919 - Mar lastSun 2:00 1:00 W # War
            #|Rule US 1918 1919 - Oct lastSun 2:00 0 S
            #|Rule US 1942 only - Feb 9 2:00 1:00 W # War
            #|Rule US 1945 only - Sep 30 2:00 0 S
            #|Rule US 1967 max - Oct lastSun 2:00 0 S
            #|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
            #|Rule US 1974 only - Jan 6 2:00 1:00 D
            #|Rule US 1975 only - Feb 23 2:00 1:00 D
            #|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
            #|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
            if (/^rule\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+
                 (\S+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/xi) {
                my ($name, $from, $to, $type, $in, $on, $at, $save, $letter) =
                    ($1, $2, $3, $4, $5, $6, $7, $8, $9);
                # Slot [0] holds the onset rule (nonzero SAVE); slot [1]
                # holds the cease rule (SAVE of 0).
                my $i = $save ? 0:1;
                # Bit within slot [2] marking "slot $i matched the
                # current year".  Bug fix: the slot index itself was
                # formerly used as the mask, so slot [0] (mask would be
                # 0) could never be latched and current-year onset rules
                # were overwritten by later ones.
                my $mask = $save ? 1:2;
                if (!exists $RULES->{$name}) {
                    $RULES->{$name} = [];
                }
                my $ruleArray = $RULES->{$name};
                # Check our bit mask to see if we've already matched
                # a current rule. If so, do nothing. If not, then
                # save this rule line as the best one so far.
                if (@{$ruleArray} < 3 ||
                    !($ruleArray->[2] & $mask)) {
                    $ruleArray->[$i]->{from} = $from;
                    $ruleArray->[$i]->{to} = $to;
                    $ruleArray->[$i]->{type} = $type;
                    $ruleArray->[$i]->{in} = $in;
                    $ruleArray->[$i]->{on} = $on;
                    $ruleArray->[$i]->{at} = $at;
                    $ruleArray->[$i]->{save} = $save;
                    $ruleArray->[$i]->{letter} = $letter;
                    # Does this rule match the current year? If so,
                    # set the bit mask so we don't overwrite this rule.
                    # This makes us ignore rules for subsequent years
                    # that are already listed in the database -- as long
                    # as we have an overriding rule for the current year.
                    if (($from == $YEAR && $to =~ /only/i) ||
                        ($from <= $YEAR &&
                         (($to =~ /^\d/ && $YEAR <= $to) || $to =~ /max/i))) {
                        $ruleArray->[2] |= $mask;
                    }
                }
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } elsif (/^link/i) {
            #|# Old names, for S5 users
            #|
            #|# Link LINK-FROM LINK-TO
            #|Link America/New_York EST5EDT
            #|Link America/Chicago CST6CDT
            #|Link America/Denver MST7MDT
            #|Link America/Los_Angeles PST8PDT
            #|Link America/Indianapolis EST
            #|Link America/Phoenix MST
            #|Link Pacific/Honolulu HST
            if (/^link\s+(\S+)\s+(\S+)/i) {
                # We currently only record a single link -- if there
                # are more than one, we should modify this.
                my ($from, $to) = ($1, $2);
                $ZONES->{$from}->{link} = $to;
            } else {
                print STDERR "Can't parse in $FILE: $_";
                ++$badLineCount;
            }
        } else {
            # Unexpected line
            print STDERR "Ignoring in $FILE: $_";
            ++$badLineCount;
        }
        # If this line carried zone data, record it -- but only when the
        # zone is still relevant: either it has no UNTIL year, or its
        # UNTIL year has not yet passed.
        if ($zoneRule &&
            ($zoneUntil !~ /\S/ || ($zoneUntil =~ /^\d/ &&
                                    $zoneUntil >= $YEAR))) {
            $ZONES->{$zone}->{gmtoff} = $zoneGmtoff;
            $ZONES->{$zone}->{rule} = $zoneRule;
            $ZONES->{$zone}->{format} = $zoneFormat;
            $ZONES->{$zone}->{until} = $zoneUntil;
        }
    }
    close(FILE);
}
######################################################################
# Clean up the raw parse results: discard links to zones that no
# longer exist, point zones at the standard rule when their named rule
# is gone, drop rules no zone references, and precompute each
# surviving rule's encoded comparison string (see the module header
# comment for the encoding format).
# Param: Ref to hash of zones
# Param: Ref to hash of rules
sub Postprocess {
    my ($ZONES, $RULES) = @_;
    my %activeRules;
    # Discard entries that consist solely of a link from a
    # historical/nonexistent zone (a link but no rule).
    foreach my $zoneName (keys %$ZONES) {
        if (exists $ZONES->{$zoneName}->{link} &&
            !exists $ZONES->{$zoneName}->{rule}) {
            delete $ZONES->{$zoneName};
        }
    }
    # Verify each zone's rule reference.  A zone naming a rule we no
    # longer have is assumed to follow standard time now.
    foreach my $zoneName (sort keys %$ZONES) {
        my $ruleName = $ZONES->{$zoneName}->{rule};
        next if ($ruleName eq $STANDARD);
        if (exists $RULES->{$ruleName}) {
            $activeRules{$ruleName} = 1;
        } else {
            $ZONES->{$zoneName}->{rule} = $STANDARD;
        }
    }
    # Drop unreferenced rules, complain about half-specified ones, and
    # build the encoded string used for rule equality testing.
    foreach my $ruleName (keys %$RULES) {
        if (!exists $activeRules{$ruleName}) {
            # Historical/unused rule
            delete $RULES->{$ruleName};
        } elsif (!$RULES->{$ruleName}->[0] || !$RULES->{$ruleName}->[1]) {
            print STDERR "Rule doesn't have both parts: $ruleName\n";
        } else {
            # Encode all the data that determines a rule's behavior so
            # two rules can be compared with a single string compare.
            # Format: "in,on,at,save;in,on,at" -- the cease {save} is
            # omitted since it is always zero.
            my ($onset, $cease) = @{$RULES->{$ruleName}}[0, 1];
            $RULES->{$ruleName}->[2] =
                lc(join(',', @{$onset}{qw(in on at save)}) . ";" .
                   join(',', @{$cease}{qw(in on at)}));
        }
    }
}

View file

@ -0,0 +1,197 @@
######################################################################
# Copyright (C) 1999, International Business Machines
# Corporation and others. All Rights Reserved.
######################################################################
# See: ftp://elsie.nci.nih.gov/pub/tzdata<year>
# where <year> is "1999b" or a similar string.
######################################################################
# This package contains utility functions for time zone data.
# Author: Alan Liu
######################################################################
# Zones - A time zone object is a hash with the following keys:
# {gmtoff} The offset from GMT, e.g. "-5:00"
# {rule} The name of the rule, e.g. "-", "Canada", "EU", "US"
# {format} The local abbreviation, e.g. "E%sT"
# {until} Data is good until this year, e.g., "2000". Often blank.
# These correspond to file entries:
#|# Zone NAME GMTOFF RULES FORMAT [UNTIL]
#|Zone America/Montreal -4:54:16 - LMT 1884
#| -5:00 Mont E%sT
# Optionally, a zone may also have the key:
# {link} An old name for this zone, e.g. "HST" (for Pacific/Honolulu)
# Links come from the file entries:
#|# Link LINK-FROM LINK-TO
#|Link America/New_York EST5EDT
#|Link America/Chicago CST6CDT
# The name of the zone itself is not kept in the zone object.
# Instead, zones are kept in a big hash. The keys are the names; the
# values are references to the zone objects. The big hash of all
# zones is referred to in all caps: %ZONES ($ZONES if it's a
# reference).
# Example: $ZONES->{"America/Los_Angeles"} =
# 'format' => 'P%sT'
# 'gmtoff' => '-8:00'
# 'link' => 'US/Pacific-New'
# 'rule' => 'US'
# 'until' => ''
######################################################################
# Rules - A time zone rule is an array with the following elements:
# [0] Onset rule
# [1] Cease rule
# [2] Encoded string
# The onset rule and cease rule have the same format. They are each
# references to a hash with keys:
# {from} Start year
# {to} End year, or "only" or "max"
# {type} Unknown, usually "-"
# {in} Month, 3 letters
# {on} Day specifier, e.g. "lastSun", "Sun>=1", "23"
# {at} Time, e.g. "2:00", "1:00u"
# {save} Amount of savings, for the onset; 0 for the cease
# {letter} Guess: the letter that goes into %s in the zone {format}
# These correspond to the file entries thus:
#|# Rule NAME FROM TO TYPE IN ON AT SAVE LETTER/S
#|Rule US 1942 only - Feb 9 2:00 1:00 W # War
#|Rule US 1945 only - Sep 30 2:00 0 S
#|Rule US 1967 max - Oct lastSun 2:00 0 S
#|Rule US 1967 1973 - Apr lastSun 2:00 1:00 D
#|Rule US 1974 only - Jan 6 2:00 1:00 D
#|Rule US 1975 only - Feb 23 2:00 1:00 D
#|Rule US 1976 1986 - Apr lastSun 2:00 1:00 D
#|Rule US 1987 max - Apr Sun>=1 2:00 1:00 D
# Entry [2], the encoded string, is used to see if two rules are the
# same. It consists of "[0]->{in},[0]->{on},[0]->{at},[0]->{save};
# [1]->{in},[1]->{on},[1]->{at}". Note that the separator between
# values is a comma, between onset and cease is a semicolon. Also
# note that the cease {save} is not used as this is always 0. The
# whole string is forced to lowercase.
# Rules don't contain their own name. Like zones, rules are kept in a
# big hash; the keys are the names, the values the references to the
# arrays. This hash of all rules is referred to in all caps, %RULES
# or for a reference, $RULES.
# Example: $RULES->{"US"} =
# 0 HASH(0x8fa03c)
# 'at' => '2:00'
# 'from' => 1987
# 'in' => 'Apr'
# 'letter' => 'D'
# 'on' => 'Sun>=1'
# 'save' => '1:00'
# 'to' => 'max'
# 'type' => '-'
# 1 HASH(0x8f9fc4)
# 'at' => '2:00'
# 'from' => 1967
# 'in' => 'Oct'
# 'letter' => 'S'
# 'on' => 'lastSun'
# 'save' => 0
# 'to' => 'max'
# 'type' => '-'
# 2 'apr,sun>=1,2:00,1:00;oct,lastsun,2:00'
# Module preamble: declare the TZ package and export the zone/rule
# equivalency helpers used by the driver script.
package TZ;
use strict;
use Carp;
use vars qw(@ISA @EXPORT $VERSION $STANDARD);
# dumpvar.pl: core debugging helper; presumably used to dump the
# zone/rule structures during development -- confirm before removing.
require 'dumpvar.pl';
# NOTE(review): Exporter itself is not required here; presumably the
# driver script loads it before using this module -- confirm.
@ISA = qw(Exporter);
@EXPORT = qw(ZoneEquals
RuleEquals
FormZoneEquivalencyGroups
);
$VERSION = '0.1';
$STANDARD = '-'; # Name of the Standard Time rule
######################################################################
# Decide whether two zones behave identically.  Zones naming the same
# rule are trivially equivalent; otherwise the rule bodies themselves
# are compared via RuleEquals.
# Param: zone object (hash ref)
# Param: zone object (hash ref)
# Param: ref to hash of all rules
# Return: true if two zones are equivalent
sub ZoneEquals {
    my ($zoneA, $zoneB, $RULES) = @_;
    my $ruleA = $zoneA->{rule};
    my $ruleB = $zoneB->{rule};
    return ($ruleA eq $ruleB) ||
        RuleEquals($RULES->{$ruleA}, $RULES->{$ruleB});
}
######################################################################
# Decide whether two rules behave identically by comparing their
# precomputed encoding strings (slot [2], built by Postprocess).
# Param: rule object (hash ref)
# Param: rule object (hash ref)
# Return: true if two rules are equivalent
sub RuleEquals {
    my ($ruleA, $ruleB) = @_;
    # defined() screens out undefined rules.  The only undefined rule
    # is $STANDARD; any others would be caused by Postprocess().
    my $bothDefined = defined($ruleA) && defined($ruleB);
    return $bothDefined && $ruleA->[2] eq $ruleB->[2];
    # A deeper equivalency analysis would recognize that, e.g.,
    # "Sun>=1" is the same day as "First Sun".  We don't do this yet.
}
######################################################################
# Partition all zones into equivalency groups: zones sharing the same
# GMT offset and equivalent rules (as judged by ZoneEquals /
# RuleEquals) land in the same group.  Each group is an array of one
# or more zone names; the output array receives one ref per group.
# Param: IN ref to hash of all zones
# Param: IN ref to hash of all rules
# Param: OUT ref to array to receive group refs
sub FormZoneEquivalencyGroups {
    my ($ZONES, $RULES, $EQUIV) = @_;
    # Bucket the zone names by GMT offset first.  This keeps the
    # pairwise comparison quadratic only within each bucket instead of
    # across all n zones -- a much smaller total amount of work.
    my %namesByOffset;
    foreach my $name (keys %$ZONES) {
        push @{$namesByOffset{$ZONES->{$name}->{gmtoff}}}, $name;
    }
    foreach my $offset (keys %namesByOffset) {
        # Build the groups for this offset
        # (array of refs to arrays of names).
        my @groups;
      CANDIDATE:
        foreach my $candidate (@{$namesByOffset{$offset}}) {
            # Compare against one representative (the first member) of
            # each existing group; join the first equivalent group.
            foreach my $group (@groups) {
                if (ZoneEquals($ZONES->{$candidate},
                               $ZONES->{$group->[0]}, $RULES)) {
                    push @$group, $candidate;
                    next CANDIDATE;
                }
            }
            # No equivalent group yet -- start a new one.
            push @groups, [ $candidate ];
        }
        push @$EQUIV, @groups;
    }
}