ICU-130 split genprops into parser & encoder

X-SVN-Rev: 392
2025-04-07 22:44:49 +00:00 · 1999-12-13 22:25:50 +00:00 · 1999-12-13 22:25:50 +00:00 · be1fff8aef
commit be1fff8aef
parent c6eb4d6d20
4 changed files with 606 additions and 234 deletions
--- a/icu4c/source/tools/genprops/genprops.c
+++ b/icu4c/source/tools/genprops/genprops.c
@ -1,12 +1,9 @@
 /*
 *******************************************************************************
-*                                                                             *
-* COPYRIGHT:                                                                  *
-*   (C) Copyright International Business Machines Corporation, 1999           *
-*   Licensed Material - Program-Property of IBM - All Rights Reserved.        *
-*   US Government Users Restricted Rights - Use, duplication, or disclosure   *
-*   restricted by GSA ADP Schedule Contract with IBM Corp.                    *
-*                                                                             *
+*
+*   Copyright (C) 1999, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
 *******************************************************************************
 *   file name:  genprops.c
 *   encoding:   US-ASCII
@ -32,66 +29,14 @@
 #include "filestrm.h"
 #include "udata.h"
 #include "unewdata.h"
+#include "genprops.h"

-#define DATA_NAME "uprops"
-#define DATA_TYPE "dat"
-
-/* UDataInfo cf. udata.h */
-static const UDataInfo dataInfo={
-    sizeof(UDataInfo),
-    0,
-
-    U_IS_BIG_ENDIAN,
-    U_CHARSET_FAMILY,
-    U_SIZEOF_UCHAR,
-    0,
-
-    0x55, 0x50, 0x72, 0x6f,     /* dataFormat="UPro" */
-    1, 0, 0, 0,                 /* formatVersion */
-    3, 0, 0, 0                  /* dataVersion */
-};
-
-static bool_t beVerbose=FALSE, haveCopyright=TRUE;
-
-/*
- * Definitions and arrays for the 3-stage lookup.
- */
-enum {
-    STAGE_1_BITS=11, STAGE_2_BITS=6, STAGE_3_BITS=4,
-
-    STAGE_1_COUNT=0x110000>>(STAGE_2_BITS+STAGE_3_BITS),
-    STAGE_2_COUNT=1<<STAGE_2_BITS,
-    STAGE_3_COUNT=1<<STAGE_3_BITS,
-
-    MAX_PROPS_COUNT=20000,
-    MAX_UCHAR_COUNT=10000,
-    MAX_STAGE_2_COUNT=MAX_PROPS_COUNT/10,
-    MAX_STAGES_1_2_COUNT=STAGE_1_COUNT+MAX_STAGE_2_COUNT
-};
-
-static uint16_t stages1_2[MAX_STAGES_1_2_COUNT];
-
-static uint16_t stage2Top=STAGE_1_COUNT;
-
-/* character properties */
-typedef struct {
-    uint32_t code, lowerCase, upperCase, titleCase;
-    uint32_t numericValue, denominator;
-    /* special casing? */
-    /* decomposition mappping? */
-    uint8_t generalCategory, canonicalCombining, bidi, isMirrored;
-} Props;
-
-static Props props[MAX_PROPS_COUNT];
-
-/* Unicode characters, e.g., for special casing or decomposition */
-
-static UChar uchars[MAX_UCHAR_COUNT];
-static uint16_t ucharsTop=0;
+extern bool_t beVerbose=FALSE, haveCopyright=TRUE;

 /* general categories */

-static const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={
+extern const char *const
+genCategoryNames[U_CHAR_CATEGORY_COUNT]={
 	NULL,
    "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
    "Mc", "Nd", "Nl", "No",
@ -103,7 +48,8 @@ static const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={
    "Cn"
 };

-static const char *const bidiNames[U_CHAR_DIRECTION_COUNT]={
+extern const char *const
+bidiNames[U_CHAR_DIRECTION_COUNT]={
 	"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
    "WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
 };
@ -122,18 +68,6 @@ getField(char *line, int16_t start, int16_t limit);
 static void
 checkLineIndex(uint32_t code, int16_t limit, int16_t length);

-static void
-addProps(Props *p);
-
-static void
-compress();
-
-static void
-generateData();
-
-static uint16_t
-addUChars(const UChar *s, uint16_t length);
-
 /* -------------------------------------------------------------------------- */

 extern int
@ -185,7 +119,8 @@ main(int argc, char *argv[]) {

    init();
    parseDB(in);
-    compress();
+    compactStage3();
+    compactProps();
    generateData();

    if(in!=T_FileStream_stdin()) {
@ -427,160 +362,3 @@ checkLineIndex(uint32_t code, int16_t index, int16_t length) {
        exit(U_PARSE_ERROR);
    }
 }
-
-/* store a character's properties ------------------------------------------- */
-
-static void
-addProps(Props *p) {
-    uint16_t count;
-
-    /*
-     * Simple ideas for reducing the number of bits for one character's
-     * properties:
-     *
-     * Some fields are only used for characters of certain
-     * general categories:
-     * - casing fields for letters and others, not for
-     *     numbers & Mn
-     *   + uppercase not for uppercase letters
-     *   + lowercase not for lowercase letters
-     *   + titlecase not for titlecase letters
-     *
-     *   * most of the time, uppercase=titlecase
-     * - numeric fields for various digit & other types
-     * - canonical combining classes for non-spacing marks (Mn)
-     * * the above is not always true, for all three cases
-     *
-     * Using the same bits for alternate fields saves some space.
-     *
-     * For the canonical categories, there are only few actually used.
-     * They can be stored using 5 bits.
-     *
-     * In the BiDi categories, the 5 explicit codes are only ever
-     * assigned 1:1 to 5 well-known code points. Storing only one
-     * value for all "explicit codes" gets this down to 4 bits.
-     * Client code then needs to check for this special value
-     * and replace it by the real one using a 5-element table.
-     *
-     * The general categories Mn & Me, non-spacing & enclosing marks,
-     * are always NSM, and NSM are always of those categories.
-     *
-     * Digit values can often be derived from the code point value
-     * itself in a simple way.
-     *
-     */
-
-    /* count the case mappings */
-    count=0;
-    if(p->upperCase!=0) {
-        ++count;
-    }
-    if(p->lowerCase!=0) {
-        ++count;
-    }
-    if(p->upperCase!=p->titleCase) {
-        ++count;
-    }
-
-    /* verify that only Mn has a canonical combining class */
-    if(p->generalCategory!=U_NON_SPACING_MARK && p->canonicalCombining>0) {
-        printf("*** code 0x%06x: canonical combining class does not fit expected range ***\n", p->code);
-    }
-
-    /* verify that only numeric categories have numeric values */
-    if(genCategoryNames[p->generalCategory][0]!='N' && p->numericValue!=0) {
-        printf("*** code 0x%06x: non-numeric category but numeric value\n", p->code);
-    }
-
-    /* verify that no numbers and no Mn have case mappings */
-    /* this is not 100% true either (see 0345;COMBINING GREEK YPOGEGRAMMENI) */
-    if( (genCategoryNames[p->generalCategory][0]=='N' ||
-         p->generalCategory==U_NON_SPACING_MARK) &&
-        count>0
-    ) {
-        printf("*** code 0x%06x: number category or Mn but case mapping\n", p->code);
-    } else if(count>1) {
-        /* see for which characters there are two case mappings */
-        /* there are some, but few (12) */
-        printf("*** code 0x%06x: more than one case mapping\n", p->code);
-    }
-
-    /* verify that { Mn, Me } if and only if NSM */
-    if( (p->generalCategory==U_NON_SPACING_MARK ||
-         p->generalCategory==U_ENCLOSING_MARK)
-        ^
-        (p->bidi==U_DIR_NON_SPACING_MARK)) {
-        printf("*** code 0x%06x: bidi class does not fit expected range ***\n", p->code);
-    }
-
-    /*
-     * "Higher-hanging fruit":
-     * For some sets of fields, there are fewer sets of values
-     * than the product of the numbers of values per field.
-     * This means that storing one single value for more than
-     * one field and later looking up both field values in a table
-     * saves space.
-     * Examples:
-     * - general category & BiDi
-     *
-     * There are only few common displacements between a code point
-     * and its case mappings. Store deltas. Store codes for few
-     * occuring deltas.
-     */
-}
-
-/* compressing -------------------------------------------------------------- */
-
-static void
-compress() {
-}
-
-/* generate output data ----------------------------------------------------- */
-
-static void
-generateData() {
-    UNewDataMemory *pData;
-    UErrorCode errorCode=U_ZERO_ERROR;
-    uint32_t size;
-    long dataLength;
-
-    pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
-                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
-    if(U_FAILURE(errorCode)) {
-        fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
-        exit(errorCode);
-    }
-
-    /* ### */
-    size=0;
-
-    /* finish up */
-    dataLength=udata_finish(pData, &errorCode);
-    if(U_FAILURE(errorCode)) {
-        fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
-        exit(errorCode);
-    }
-
-    if(dataLength!=(long)size) {
-        fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", dataLength, size);
-        exit(U_INTERNAL_PROGRAM_ERROR);
-    }
-}
-
-/* helpers ------------------------------------------------------------------ */
-
-static uint16_t
-addUChars(const UChar *s, uint16_t length) {
-    uint16_t top=ucharsTop+length+1;
-    UChar *p;
-
-    if(top>=MAX_UCHAR_COUNT) {
-        fprintf(stderr, "genprops: out of memory\n");
-        exit(U_MEMORY_ALLOCATION_ERROR);
-    }
-    p=uchars+ucharsTop;
-    icu_memcpy(p, s, length);
-    p[length]=0;
-    ucharsTop=top;
-    return (uint16_t)(p-uchars);
-}
--- a/icu4c/source/tools/genprops/genprops.dsp
+++ b/icu4c/source/tools/genprops/genprops.dsp
@ -88,10 +88,18 @@ LINK32=link.exe

 SOURCE=.\genprops.c
 # End Source File
+# Begin Source File
+
+SOURCE=.\store.c
+# End Source File
 # End Group
 # Begin Group "Header Files"

 # PROP Default_Filter "h;hpp;hxx;hm;inl"
+# Begin Source File
+
+SOURCE=.\genprops.h
+# End Source File
 # End Group
 # Begin Group "Resource Files"

--- a/icu4c/source/tools/genprops/genprops.h
+++ b/icu4c/source/tools/genprops/genprops.h
@ -0,0 +1,60 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 1999, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  genprops.h
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 1999dec13
+*   created by: Markus W. Scherer
+*/
+
+#ifndef __GENPROPS_H__
+#define __GENPROPS_H__
+
+#include "utypes.h"
+
+/* file definitions */
+#define DATA_NAME "uprops"
+#define DATA_TYPE "dat"
+
+/* character properties */
+typedef struct {
+    uint32_t code, lowerCase, upperCase, titleCase;
+    uint32_t decomp[16];
+    uint32_t numericValue, denominator;
+    uint8_t generalCategory, canonicalCombining, bidi, isMirrored;
+} Props;
+
+/* global flags */
+extern bool_t beVerbose, haveCopyright;
+
+/* name tables */
+extern const char *const
+bidiNames[];
+
+extern const char *const
+genCategoryNames[];
+
+/* prototypes */
+extern void
+initStore();
+
+extern void
+addProps(Props *p);
+
+extern void
+compactStage3();
+
+extern void
+compactProps();
+
+extern void
+generateData();
+
+#endif
--- a/icu4c/source/tools/genprops/store.c
+++ b/icu4c/source/tools/genprops/store.c
@ -0,0 +1,526 @@
+/*
+*******************************************************************************
+*
+*   Copyright (C) 1999, International Business Machines
+*   Corporation and others.  All Rights Reserved.
+*
+*******************************************************************************
+*   file name:  store.c
+*   encoding:   US-ASCII
+*   tab size:   8 (not used)
+*   indentation:4
+*
+*   created on: 1999dec11
+*   created by: Markus W. Scherer
+*
+*   Store Unicode character properties efficiently for
+*   random access.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "utypes.h"
+#include "uchar.h"
+#include "cmemory.h"
+#include "cstring.h"
+#include "filestrm.h"
+#include "udata.h"
+#include "unewdata.h"
+#include "genprops.h"
+
+/* Unicode character properties file format ------------------------------------
+
+The file format prepared and written here contains several data
+structures that store indexes or data.
+
+The content is a parsed, binary form of several Unicode character
+database files, most prominently UnicodeData.txt.
+
+Any Unicode code point from 0 to 0x10ffff can be looked up to get
+the properties, if any, for that code point. This means that the input
+to the lookup are 21-bit unsigned integers, with not all of the
+21-bit range used.
+
+It is assumed that client code keeps a uint16_t pointer
+to the beginning of the data:
+
+    const uint16_t *p16;
+
+Some indexes assume 32-bit units; although client code should only
+cast the above pointer to (const uint32_t *), it is easier here
+to talk about the result of the indexing with the definition of
+another pointer variable for this:
+
+    const uint32_t *p32=(const uint32_t *)p16;
+
+Formally, the file contains the following structures:
+
+    A const uint16_t exceptionsIndex; -- 32-bit index
+    B const uint16_t ucharsIndex; -- 32-bit index
+    C const uint16_t reservedIndex;
+    D const uint16_t reservedIndex;
+
+    E const uint16_t stage1[0x440]; -- 0x440=0x110000>>10
+    F const uint16_t stage2[variable*2*64];
+    G const uint16_t stage3[variable];
+      (possible 1*uint16_t for padding to 4-alignment)
+
+    H const uint32_t props32[variable];
+    I const uint16_t exceptions[variable];
+      (possible 1*uint16_t for padding to 4-alignment)
+
+    J const UChar uchars[variable];
+
+3-stage lookup and properties:
+
+In order to condense the data for the 21-bit code space, several properties of
+the Unicode code assignment are exploited:
+- The code space is sparse.
+- There are several 10k consecutive codes with the same properties.
+- Characters and scripts are allocated in groups of 16 code points.
+- Inside blocks for scripts the properties are often repetitive.
+- The 21-bit space is not fully used for Unicode.
+
+The three-stage lookup organizes code points in groups of 16 in stage 3.
+64 such groups are grouped again, resulting in blocks of 1k in stage 2.
+The first stage is limited according to all code points being <0x110000.
+Each stage contains indexes to groups or blocks of the next stage
+in an n:1 manner, i.e., multiple entries of one stage index the same
+group or block in the next one.
+In the third stage, groups of 16 may partially or completely overlap to save
+space with repetitive properties.
+In the properties table, only unique 32-bit words are stored to exploit
+non-adjacent overlapping. This is why the third stage does not directly
+contain the 32-bit properties words but only indexes to them.
+
+The indexes in each stage take the offset in the data of the next block into
+account to save additional arithmetic in the access.
+
+The second stage also contains properties for groups of characters:
+Each set of 64 indexes to stage 3 groups is followed by 64 group properties
+words of a uint16_t each. This is used for the script ID, since scripts
+are allocated with multiples of 16 code points each.
+
+With a given Unicode code point
+
+    uint32_t c;
+
+and 0<=c<0x110000, the lookup uses the three stage tables to
+arrive at an index into the props32[] table containing the character
+properties for c.
+For some characters, not all of the properties can be efficiently encoded
+using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
+array. Some exception entries, in turn, may contain indexes into the uchars[]
+array of Unicode strings, especially for non-1:1 case mappings.
+
+The first stage consumes the 11 most significant bits of the 21-bit code point
+and results in an index into the second stage:
+
+    uint16_t i2=p16[4+(c>>10)];
+
+The second stage consumes bits 9 to 4 of c and results in an index into the
+third stage:
+
+    uint16_t i3=p16[i2+((c>>4)&0x3f)];
+
+The third stage consumes bits 3 to 0 of c and results in a code point-
+specific value, which itself is only an index into the props32[] table:
+
+    uint16_t i=p16[i3+(c&0xf)];
+
+There is finally the 32-bit encoded set of properties for c:
+
+    uint32_t props=p32[i];
+
+For some characters, this contains an index into the exceptions array:
+
+    if(props&0x20) {
+        uint16_t e=(uint16_t)(props>>20);
+        ...
+    }
+
+The exception values are a variable number of uint16_t starting at
+
+    const uint16_t *pe=p16+2*p16[0]+e;
+
+The first uint16_t there contains flags about what values actually follow it.
+Some of those may be indexes for case mappings or similar and point to strings
+(zero-terminated) in the uchars[] array:
+
+    ...
+    uint16_t u=pe[depends on pe[0]];
+    const UChar *pu=(const UChar *)(p32+p16[1])+u;
+
+32-bit properties sets:
+
+Each 32-bit properties word contains:
+
+ 0.. 4  general category
+ 5      has exception values
+ 6.. 9  BiDi category (the 5 explicit codes stored as one)
+10      is mirrored
+11..19  reserved
+20..31  value according to bits 0..5:
+        if(has exception) {
+            exception index;
+        } else switch(general category) {
+        case Ll: delta to uppercase; -- same as titlecase
+        case Lu: delta to lowercase; -- titlecase is same as c
+        case Lt: delta to lowercase; -- uppercase is same as c
+        case Mn: canonical combining class;
+        case N*: numeric value;
+        default: *;
+        }
+
+----------------------------------------------------------------------------- */
+
+/* UDataInfo cf. udata.h */
+static const UDataInfo dataInfo={
+    sizeof(UDataInfo),
+    0,
+
+    U_IS_BIG_ENDIAN,
+    U_CHARSET_FAMILY,
+    U_SIZEOF_UCHAR,
+    0,
+
+    0x55, 0x50, 0x72, 0x6f,     /* dataFormat="UPro" */
+    1, 0, 0, 0,                 /* formatVersion */
+    3, 0, 0, 0                  /* dataVersion */
+};
+
+/* definitions and arrays for the 3-stage lookup */
+enum {
+    STAGE_1_BITS=11, STAGE_2_BITS=6, STAGE_3_BITS=4,
+
+    STAGE_1_BLOCK=0x110000>>(STAGE_2_BITS+STAGE_3_BITS),
+    STAGE_2_BLOCK=1<<STAGE_2_BITS,
+    STAGE_3_BLOCK=1<<STAGE_3_BITS,
+
+    MAX_PROPS_COUNT=20000,
+    MAX_UCHAR_COUNT=10000,
+    MAX_EXCEPTIONS_COUNT=4096,
+    MAX_STAGE_2_COUNT=MAX_PROPS_COUNT/8
+};
+
+static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
+                stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
+
+/* stage1Top=STAGE_1_BLOCK never changes */
+static uint16_t stage2Top=0, stage3Top;
+
+/* props[] is used before, props32[] after compacting the array of properties */
+static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
+static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
+
+/* exceptions values */
+static uint16_t exceptions[MAX_EXCEPTIONS_COUNT+20];
+static uint16_t exceptionsTop=0;
+
+/* Unicode characters, e.g. for special casing or decomposition */
+
+static UChar uchars[MAX_UCHAR_COUNT+20];
+static uint16_t ucharsTop=0;
+
+/* prototypes --------------------------------------------------------------- */
+
+static int
+compareProps(const void *l, const void *r);
+
+static uint16_t
+addUChars(const UChar *s, uint16_t length);
+
+/* -------------------------------------------------------------------------- */
+
+extern void
+initStore() {
+    /*
+     * Zero the compacted properties words, the old->new index map,
+     * and the three lookup-stage tables before any data is added.
+     */
+    icu_memset(props32, 0, sizeof(props32));
+    icu_memset(map, 0, sizeof(map));
+
+    icu_memset(stage3, 0, sizeof(stage3));
+    icu_memset(stage2, 0, sizeof(stage2));
+    icu_memset(stage1, 0, sizeof(stage1));
+}
+
+/* store a character's properties ------------------------------------------- */
+
+extern void
+addProps(Props *p) {
+    uint16_t count;
+
+    /*
+     * Currently this function only sanity-checks the parsed properties
+     * of one character (*p) and prints a "***" diagnostic line to stdout
+     * for each expectation that does not hold; it does not store
+     * anything in the lookup tables yet.
+     *
+     * Simple ideas for reducing the number of bits for one character's
+     * properties:
+     *
+     * Some fields are only used for characters of certain
+     * general categories:
+     * - casing fields for letters and others, not for
+     *     numbers & Mn
+     *   + uppercase not for uppercase letters
+     *   + lowercase not for lowercase letters
+     *   + titlecase not for titlecase letters
+     *
+     *   * most of the time, uppercase=titlecase
+     * - numeric fields for various digit & other types
+     * - canonical combining classes for non-spacing marks (Mn)
+     * * the above is not always true, for all three cases
+     *
+     * Using the same bits for alternate fields saves some space.
+     *
+     * For the canonical combining classes, there are only a few actually
+     * used. They can be stored using 5 bits.
+     *
+     * In the BiDi categories, the 5 explicit codes are only ever
+     * assigned 1:1 to 5 well-known code points. Storing only one
+     * value for all "explicit codes" gets this down to 4 bits.
+     * Client code then needs to check for this special value
+     * and replace it by the real one using a 5-element table.
+     *
+     * The general categories Mn & Me, non-spacing & enclosing marks,
+     * are always NSM, and NSM are always of those categories.
+     *
+     * Digit values can often be derived from the code point value
+     * itself in a simple way.
+     *
+     */
+
+    /* count the case mappings */
+    count=0;
+    if(p->upperCase!=0) {
+        ++count;
+    }
+    if(p->lowerCase!=0) {
+        ++count;
+    }
+    if(p->upperCase!=p->titleCase) { /* titlecase counts only when it differs from uppercase */
+        ++count;
+    }
+
+    /* verify that only Mn has a canonical combining class */
+    if(p->generalCategory!=U_NON_SPACING_MARK && p->canonicalCombining>0) {
+        printf("*** code 0x%06x: canonical combining class does not fit expected range ***\n", p->code);
+    }
+
+    /*
+     * verify that only numeric categories have numeric values
+     * (numeric categories are those whose two-letter name starts with 'N')
+     * NOTE(review): genCategoryNames[0] is NULL, so this and the next check
+     * assume generalCategory is never 0 here -- confirm against the parser.
+     */
+    if(genCategoryNames[p->generalCategory][0]!='N' && p->numericValue!=0) {
+        printf("*** code 0x%06x: non-numeric category but numeric value\n", p->code);
+    }
+
+    /* verify that no numbers and no Mn have case mappings */
+    /* this is not 100% true either (see 0345;COMBINING GREEK YPOGEGRAMMENI) */
+    if( (genCategoryNames[p->generalCategory][0]=='N' ||
+         p->generalCategory==U_NON_SPACING_MARK) &&
+        count>0
+    ) {
+        printf("*** code 0x%06x: number category or Mn but case mapping\n", p->code);
+    } else if(count>1) {
+        /* see for which characters there are two case mappings */
+        /* there are some, but few (12) */
+        printf("*** code 0x%06x: more than one case mapping\n", p->code);
+    }
+
+    /* verify that { Mn, Me } if and only if NSM (the ^ reports when exactly one side holds) */
+    if( (p->generalCategory==U_NON_SPACING_MARK ||
+         p->generalCategory==U_ENCLOSING_MARK)
+        ^
+        (p->bidi==U_DIR_NON_SPACING_MARK)) {
+        printf("*** code 0x%06x: bidi class does not fit expected range ***\n", p->code);
+    }
+
+    /*
+     * "Higher-hanging fruit":
+     * For some sets of fields, there are fewer sets of values
+     * than the product of the numbers of values per field.
+     * This means that storing one single value for more than
+     * one field and later looking up both field values in a table
+     * saves space.
+     * Examples:
+     * - general category & BiDi
+     *
+     * There are only few common displacements between a code point
+     * and its case mappings. Store deltas. Store codes for the few
+     * occurring deltas.
+     */
+}
+
+/* compacting --------------------------------------------------------------- */
+
+extern void
+compactStage3() {
+    /*
+     * At this point, all the propsTop properties are in props[].
+     * propsTop is a multiple of 16, and there are always 16 props[] entries
+     * per stage 2 entry which do not overlap.
+     * The first 16 props[] are always the empty ones.
+     * We make them overlap appropriately here and fill every 16th entry in
+     * map[] with the mapping from old to new properties indexes
+     * in order to adjust the stage 2 tables.
+     * This simple algorithm does not find arbitrary overlaps, but only those
+     * where the last i properties of the previous group and the first i of the
+     * current one all have the same value.
+     * This seems reasonable and yields linear performance.
+     *
+     * Side effects: props[] is compacted in place, propsTop is reduced by
+     * the number of entries saved, and the index entries of stage2[] are
+     * rewritten through map[]. With beVerbose set, the old and new propsTop
+     * are printed to stdout.
+     *
+     * Local indexes: start is the read position (old index of the current
+     * group), newStart is the write position (top of the compacted part).
+     */
+    uint16_t i, start, prevEnd, newStart;
+    uint32_t x;
+
+    map[0]=0; /* the first group keeps index 0 */
+    newStart=STAGE_3_BLOCK;
+    for(start=newStart; start<propsTop;) {
+        prevEnd=newStart-1; /* last entry of the already compacted part */
+        x=props[start];
+        if(x==props[prevEnd]) {
+            /* overlap by at least one */
+            for(i=1; i<STAGE_3_BLOCK && x==props[start+i] && x==props[prevEnd-i]; ++i) {} /* empty body: only counts the overlap length i */
+
+            /* overlap by i */
+            map[start]=newStart-i;
+
+            /* move the non-overlapping properties to their new positions */
+            start+=i;
+            for(i=STAGE_3_BLOCK-i; i>0; --i) {
+                props[newStart++]=props[start++];
+            }
+        } else if(newStart<start) {
+            /* move the properties to their new positions */
+            map[start]=newStart;
+            for(i=STAGE_3_BLOCK; i>0; --i) {
+                props[newStart++]=props[start++];
+            }
+        } else /* no overlap && newStart==start */ {
+            map[start]=start;
+            newStart+=STAGE_3_BLOCK;
+            start=newStart;
+        }
+    }
+
+    /* we saved some space: start-newStart is the number of entries removed */
+    if(beVerbose) {
+        printf("compactStage3() reduced propsTop from %u to %u\n", propsTop, propsTop-(start-newStart));
+    }
+    propsTop-=(start-newStart);
+
+    /*
+     * now adjust the stage 2 tables, skipping the data parts in them:
+     * the inner loop advances start over the STAGE_2_BLOCK index entries,
+     * and the outer increment then steps over the following STAGE_2_BLOCK
+     * data entries
+     */
+    for(start=0; start<stage2Top; start+=STAGE_2_BLOCK) {
+        for(i=STAGE_2_BLOCK; i>0; --i) {
+            stage2[start]=map[stage2[start]];
+            ++start;
+        }
+    }
+}
+
+extern void
+compactProps() {
+    /*
+     * At this point, all the propsTop properties are in props[], but they
+     * are not all unique.
+     * Now we sort them, reduce them to unique ones in props32[], and
+     * build an index in stage3[] from the old to the new indexes.
+     * (The quick sort averages at N*log(N) with N=propsTop. The inverting
+     * yields linear performance.)
+     *
+     * On return, stage3Top holds the old propsTop (the number of stage3[]
+     * entries) and propsTop holds the number of unique words in props32[].
+     * With beVerbose set, both numbers are printed to stdout.
+     */
+
+    /*
+     * We are going to sort only an index table in map[] because we need this
+     * index table anyway and qsort() does not allow to sort two tables together
+     * directly. This will thus also reduce the amount of data moved around.
+     */
+    uint16_t i, oldIndex, newIndex;
+    uint32_t x;
+
+    /* build the index table */
+    for(i=propsTop; i>0;) {
+        --i;
+        map[i]=i;
+    }
+
+    /*
+     * do not reorder the first, empty entries;
+     * the element size is taken from map[] itself instead of a literal 2
+     * so that the call stays correct if the index type ever changes
+     */
+    qsort(map+STAGE_3_BLOCK, propsTop-STAGE_3_BLOCK, sizeof(map[0]), compareProps);
+
+    /*
+     * Now invert the reordered table and compact it in the same step.
+     * The result will be props32[] having only unique properties words
+     * and stage3[] having indexes to them.
+     */
+    newIndex=0;
+    for(i=0; i<propsTop;) {
+        /* set the first of a possible series of the same properties */
+        oldIndex=map[i];
+        props32[newIndex]=x=props[oldIndex];
+        stage3[oldIndex]=newIndex;
+
+        /* set the following same properties only in stage3 */
+        while(++i<propsTop && x==props[map[i]]) {
+            stage3[map[i]]=newIndex;
+        }
+
+        ++newIndex;
+    }
+
+    /* we saved some space */
+    stage3Top=propsTop;
+    propsTop=newIndex;
+    if(beVerbose) {
+        printf("compactProps() reduced propsTop from %u to %u\n", stage3Top, propsTop);
+    }
+}
+
+static int
+compareProps(const void *l, const void *r) {
+    /* l and r point to uint16_t indexes into props[] */
+    const uint32_t a=props[*(const uint16_t *)l];
+    const uint32_t b=props[*(const uint16_t *)r];
+    const int categoryA=(int)(a&0x1f), categoryB=(int)(b&0x1f);
+
+    /* primary key: the general category in the low 5 bits */
+    if(categoryA!=categoryB) {
+        return categoryA-categoryB;
+    }
+
+    /* secondary key: the complete properties word, compared unsigned */
+    if(a==b) {
+        return 0;
+    }
+    return a<b ? -1 : 1;
+}
+
+/* generate output data ----------------------------------------------------- */
+
+extern void
+generateData() {
+    /*
+     * Create the output data memory via udata_create(), (eventually) write
+     * the tables, and finish the file with udata_finish().
+     * Any failure is reported on stderr and terminates the program
+     * with the error code as the exit status.
+     * The body is still a placeholder: nothing is written yet, so the
+     * expected size is 0.
+     */
+    UNewDataMemory *pData;
+    UErrorCode errorCode=U_ZERO_ERROR;
+    uint32_t size;
+    long dataLength;
+
+    pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
+                       haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
+        exit(errorCode);
+    }
+
+    /* ### */
+    size=0;
+
+    /* finish up */
+    dataLength=udata_finish(pData, &errorCode);
+    if(U_FAILURE(errorCode)) {
+        fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
+        exit(errorCode);
+    }
+
+    /* the length reported by udata_finish() must match what we calculated */
+    if(dataLength!=(long)size) {
+        /* size is a uint32_t: cast it so that it matches the %lu conversion */
+        fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", dataLength, (unsigned long)size);
+        exit(U_INTERNAL_PROGRAM_ERROR);
+    }
+}
+
+/* helpers ------------------------------------------------------------------ */
+
+static uint16_t
+addUChars(const UChar *s, uint16_t length) {
+    /*
+     * Append the length UChars at s plus a terminating 0 to uchars[]
+     * and return the index of the first appended UChar.
+     * Prints an error and exits if the string does not fit.
+     *
+     * The new top is computed in 32 bits so that ucharsTop+length+1
+     * cannot wrap around 16 bits and slip past the capacity check.
+     */
+    uint32_t top=(uint32_t)ucharsTop+length+1;
+    UChar *p;
+
+    if(top>=MAX_UCHAR_COUNT) {
+        fprintf(stderr, "genprops: out of UChars memory\n");
+        exit(U_MEMORY_ALLOCATION_ERROR);
+    }
+    p=uchars+ucharsTop;
+    /* the copy size is in bytes, not in UChars */
+    icu_memcpy(p, s, length*sizeof(UChar));
+    p[length]=0;
+    ucharsTop=(uint16_t)top; /* top<MAX_UCHAR_COUNT here, so it fits */
+    return (uint16_t)(p-uchars);
+}