mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-07 22:44:49 +00:00
ICU-130 split genprops into parser & encoder
X-SVN-Rev: 392
This commit is contained in:
parent
c6eb4d6d20
commit
be1fff8aef
4 changed files with 606 additions and 234 deletions
|
@ -1,12 +1,9 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* *
|
||||
* COPYRIGHT: *
|
||||
* (C) Copyright International Business Machines Corporation, 1999 *
|
||||
* Licensed Material - Program-Property of IBM - All Rights Reserved. *
|
||||
* US Government Users Restricted Rights - Use, duplication, or disclosure *
|
||||
* restricted by GSA ADP Schedule Contract with IBM Corp. *
|
||||
* *
|
||||
*
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: genprops.c
|
||||
* encoding: US-ASCII
|
||||
|
@ -32,66 +29,14 @@
|
|||
#include "filestrm.h"
|
||||
#include "udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "genprops.h"
|
||||
|
||||
#define DATA_NAME "uprops"
|
||||
#define DATA_TYPE "dat"
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
static const UDataInfo dataInfo={
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
0x55, 0x50, 0x72, 0x6f, /* dataFormat="UPro" */
|
||||
1, 0, 0, 0, /* formatVersion */
|
||||
3, 0, 0, 0 /* dataVersion */
|
||||
};
|
||||
|
||||
static bool_t beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/*
|
||||
* Definitions and arrays for the 3-stage lookup.
|
||||
*/
|
||||
enum {
    /* number of code point bits consumed by each of the three stages */
    STAGE_1_BITS=11,
    STAGE_2_BITS=6,
    STAGE_3_BITS=4,

    /* number of entries in stage 1, and per block in stages 2 and 3 */
    STAGE_1_COUNT=0x110000>>(STAGE_2_BITS+STAGE_3_BITS),
    STAGE_2_COUNT=1<<STAGE_2_BITS,
    STAGE_3_COUNT=1<<STAGE_3_BITS,

    /* capacities of the static work arrays */
    MAX_PROPS_COUNT=20000,
    MAX_UCHAR_COUNT=10000,
    MAX_STAGE_2_COUNT=MAX_PROPS_COUNT/10,
    MAX_STAGES_1_2_COUNT=STAGE_1_COUNT+MAX_STAGE_2_COUNT
};
|
||||
|
||||
static uint16_t stages1_2[MAX_STAGES_1_2_COUNT];
|
||||
|
||||
static uint16_t stage2Top=STAGE_1_COUNT;
|
||||
|
||||
/* character properties */
|
||||
typedef struct {
    /* the code point and its simple case mappings */
    uint32_t code;
    uint32_t lowerCase;
    uint32_t upperCase;
    uint32_t titleCase;

    /* numeric value; the denominator is presumably for fractions - TODO confirm */
    uint32_t numericValue;
    uint32_t denominator;

    /* special casing? */
    /* decomposition mapping? */

    uint8_t generalCategory;
    uint8_t canonicalCombining;
    uint8_t bidi;
    uint8_t isMirrored;
} Props;
|
||||
|
||||
static Props props[MAX_PROPS_COUNT];
|
||||
|
||||
/* Unicode characters, e.g., for special casing or decomposition */
|
||||
|
||||
static UChar uchars[MAX_UCHAR_COUNT];
|
||||
static uint16_t ucharsTop=0;
|
||||
extern bool_t beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
/* general categories */
|
||||
|
||||
static const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
||||
extern const char *const
|
||||
genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
||||
NULL,
|
||||
"Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me",
|
||||
"Mc", "Nd", "Nl", "No",
|
||||
|
@ -103,7 +48,8 @@ static const char *const genCategoryNames[U_CHAR_CATEGORY_COUNT]={
|
|||
"Cn"
|
||||
};
|
||||
|
||||
static const char *const bidiNames[U_CHAR_DIRECTION_COUNT]={
|
||||
extern const char *const
|
||||
bidiNames[U_CHAR_DIRECTION_COUNT]={
|
||||
"L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S",
|
||||
"WS", "ON", "LRE", "LRO", "AL", "RLE", "RLO", "PDF", "NSM", "BN"
|
||||
};
|
||||
|
@ -122,18 +68,6 @@ getField(char *line, int16_t start, int16_t limit);
|
|||
static void
|
||||
checkLineIndex(uint32_t code, int16_t limit, int16_t length);
|
||||
|
||||
static void
|
||||
addProps(Props *p);
|
||||
|
||||
static void
|
||||
compress();
|
||||
|
||||
static void
|
||||
generateData();
|
||||
|
||||
static uint16_t
|
||||
addUChars(const UChar *s, uint16_t length);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
extern int
|
||||
|
@ -185,7 +119,8 @@ main(int argc, char *argv[]) {
|
|||
|
||||
init();
|
||||
parseDB(in);
|
||||
compress();
|
||||
compactStage3();
|
||||
compactProps();
|
||||
generateData();
|
||||
|
||||
if(in!=T_FileStream_stdin()) {
|
||||
|
@ -427,160 +362,3 @@ checkLineIndex(uint32_t code, int16_t index, int16_t length) {
|
|||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/* store a character's properties ------------------------------------------- */
|
||||
|
||||
static void
|
||||
addProps(Props *p) {
|
||||
uint16_t count;
|
||||
|
||||
/*
|
||||
* Simple ideas for reducing the number of bits for one character's
|
||||
* properties:
|
||||
*
|
||||
* Some fields are only used for characters of certain
|
||||
* general categories:
|
||||
* - casing fields for letters and others, not for
|
||||
* numbers & Mn
|
||||
* + uppercase not for uppercase letters
|
||||
* + lowercase not for lowercase letters
|
||||
* + titlecase not for titlecase letters
|
||||
*
|
||||
* * most of the time, uppercase=titlecase
|
||||
* - numeric fields for various digit & other types
|
||||
* - canonical combining classes for non-spacing marks (Mn)
|
||||
* * the above is not always true, for all three cases
|
||||
*
|
||||
* Using the same bits for alternate fields saves some space.
|
||||
*
|
||||
* For the canonical categories, there are only few actually used.
|
||||
* They can be stored using 5 bits.
|
||||
*
|
||||
* In the BiDi categories, the 5 explicit codes are only ever
|
||||
* assigned 1:1 to 5 well-known code points. Storing only one
|
||||
* value for all "explicit codes" gets this down to 4 bits.
|
||||
* Client code then needs to check for this special value
|
||||
* and replace it by the real one using a 5-element table.
|
||||
*
|
||||
* The general categories Mn & Me, non-spacing & enclosing marks,
|
||||
* are always NSM, and NSM are always of those categories.
|
||||
*
|
||||
* Digit values can often be derived from the code point value
|
||||
* itself in a simple way.
|
||||
*
|
||||
*/
|
||||
|
||||
/* count the case mappings */
|
||||
count=0;
|
||||
if(p->upperCase!=0) {
|
||||
++count;
|
||||
}
|
||||
if(p->lowerCase!=0) {
|
||||
++count;
|
||||
}
|
||||
if(p->upperCase!=p->titleCase) {
|
||||
++count;
|
||||
}
|
||||
|
||||
/* verify that only Mn has a canonical combining class */
|
||||
if(p->generalCategory!=U_NON_SPACING_MARK && p->canonicalCombining>0) {
|
||||
printf("*** code 0x%06x: canonical combining class does not fit expected range ***\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that only numeric categories have numeric values */
|
||||
if(genCategoryNames[p->generalCategory][0]!='N' && p->numericValue!=0) {
|
||||
printf("*** code 0x%06x: non-numeric category but numeric value\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that no numbers and no Mn have case mappings */
|
||||
/* this is not 100% true either (see 0345;COMBINING GREEK YPOGEGRAMMENI) */
|
||||
if( (genCategoryNames[p->generalCategory][0]=='N' ||
|
||||
p->generalCategory==U_NON_SPACING_MARK) &&
|
||||
count>0
|
||||
) {
|
||||
printf("*** code 0x%06x: number category or Mn but case mapping\n", p->code);
|
||||
} else if(count>1) {
|
||||
/* see for which characters there are two case mappings */
|
||||
/* there are some, but few (12) */
|
||||
printf("*** code 0x%06x: more than one case mapping\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that { Mn, Me } if and only if NSM */
|
||||
if( (p->generalCategory==U_NON_SPACING_MARK ||
|
||||
p->generalCategory==U_ENCLOSING_MARK)
|
||||
^
|
||||
(p->bidi==U_DIR_NON_SPACING_MARK)) {
|
||||
printf("*** code 0x%06x: bidi class does not fit expected range ***\n", p->code);
|
||||
}
|
||||
|
||||
/*
|
||||
* "Higher-hanging fruit":
|
||||
* For some sets of fields, there are fewer sets of values
|
||||
* than the product of the numbers of values per field.
|
||||
* This means that storing one single value for more than
|
||||
* one field and later looking up both field values in a table
|
||||
* saves space.
|
||||
* Examples:
|
||||
* - general category & BiDi
|
||||
*
|
||||
* There are only few common displacements between a code point
|
||||
* and its case mappings. Store deltas. Store codes for few
|
||||
* occuring deltas.
|
||||
*/
|
||||
}
|
||||
|
||||
/* compressing -------------------------------------------------------------- */
|
||||
|
||||
/* placeholder for the compression step: the body is empty, so calling it has no effect */
static void
compress() {
    /* intentionally empty */
}
|
||||
|
||||
/* generate output data ----------------------------------------------------- */
|
||||
|
||||
static void
|
||||
generateData() {
|
||||
UNewDataMemory *pData;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
uint32_t size;
|
||||
long dataLength;
|
||||
|
||||
pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/* ### */
|
||||
size=0;
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
if(dataLength!=(long)size) {
|
||||
fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", dataLength, size);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/* helpers ------------------------------------------------------------------ */
|
||||
|
||||
static uint16_t
|
||||
addUChars(const UChar *s, uint16_t length) {
|
||||
uint16_t top=ucharsTop+length+1;
|
||||
UChar *p;
|
||||
|
||||
if(top>=MAX_UCHAR_COUNT) {
|
||||
fprintf(stderr, "genprops: out of memory\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
p=uchars+ucharsTop;
|
||||
icu_memcpy(p, s, length);
|
||||
p[length]=0;
|
||||
ucharsTop=top;
|
||||
return (uint16_t)(p-uchars);
|
||||
}
|
||||
|
|
|
@ -88,10 +88,18 @@ LINK32=link.exe
|
|||
|
||||
SOURCE=.\genprops.c
|
||||
# End Source File
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\store.c
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Header Files"
|
||||
|
||||
# PROP Default_Filter "h;hpp;hxx;hm;inl"
|
||||
# Begin Source File
|
||||
|
||||
SOURCE=.\genprops.h
|
||||
# End Source File
|
||||
# End Group
|
||||
# Begin Group "Resource Files"
|
||||
|
||||
|
|
60
icu4c/source/tools/genprops/genprops.h
Normal file
60
icu4c/source/tools/genprops/genprops.h
Normal file
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: genprops.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 1999dec13
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __GENPROPS_H__
|
||||
#define __GENPROPS_H__
|
||||
|
||||
#include "utypes.h"
|
||||
|
||||
/* file definitions */
|
||||
#define DATA_NAME "uprops"
|
||||
#define DATA_TYPE "dat"
|
||||
|
||||
/* character properties */
|
||||
typedef struct {
    /* the code point and its simple case mappings */
    uint32_t code;
    uint32_t lowerCase;
    uint32_t upperCase;
    uint32_t titleCase;

    /* room for 16 values; presumably the decomposition mapping - TODO confirm */
    uint32_t decomp[16];

    /* numeric value; the denominator is presumably for fractions - TODO confirm */
    uint32_t numericValue;
    uint32_t denominator;

    uint8_t generalCategory;
    uint8_t canonicalCombining;
    uint8_t bidi;
    uint8_t isMirrored;
} Props;
|
||||
|
||||
/* global flags */
|
||||
extern bool_t beVerbose, haveCopyright;
|
||||
|
||||
/* name tables */
|
||||
extern const char *const
|
||||
bidiNames[];
|
||||
|
||||
extern const char *const
|
||||
genCategoryNames[];
|
||||
|
||||
/* prototypes */
|
||||
extern void
|
||||
initStore();
|
||||
|
||||
extern void
|
||||
addProps(Props *p);
|
||||
|
||||
extern void
|
||||
compactStage3();
|
||||
|
||||
extern void
|
||||
compactProps();
|
||||
|
||||
extern void
|
||||
generateData();
|
||||
|
||||
#endif
|
526
icu4c/source/tools/genprops/store.c
Normal file
526
icu4c/source/tools/genprops/store.c
Normal file
|
@ -0,0 +1,526 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: store.c
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 1999dec11
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* Store Unicode character properties efficiently for
|
||||
* random access.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "utypes.h"
|
||||
#include "uchar.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "filestrm.h"
|
||||
#include "udata.h"
|
||||
#include "unewdata.h"
|
||||
#include "genprops.h"
|
||||
|
||||
/* Unicode character properties file format ------------------------------------
|
||||
|
||||
The file format prepared and written here contains several data
|
||||
structures that store indexes or data.
|
||||
|
||||
The contents are a parsed, binary form of several Unicode character
|
||||
database files, most prominently UnicodeData.txt.
|
||||
|
||||
Any Unicode code point from 0 to 0x10ffff can be looked up to get
|
||||
the properties, if any, for that code point. This means that the input
|
||||
to the lookup are 21-bit unsigned integers, with not all of the
|
||||
21-bit range used.
|
||||
|
||||
It is assumed that client code keeps a uint16_t pointer
|
||||
to the beginning of the data:
|
||||
|
||||
const uint16_t *p16;
|
||||
|
||||
Some indexes assume 32-bit units; although client code should only
|
||||
cast the above pointer to (const uint32_t *), it is easier here
|
||||
to talk about the result of the indexing with the definition of
|
||||
another pointer variable for this:
|
||||
|
||||
const uint32_t *p32=(const uint32_t *)p16;
|
||||
|
||||
Formally, the file contains the following structures:
|
||||
|
||||
A const uint16_t exceptionsIndex; -- 32-bit index
|
||||
B const uint16_t ucharsIndex; -- 32-bit index
|
||||
C const uint16_t reservedIndex;
|
||||
D const uint16_t reservedIndex;
|
||||
|
||||
E const uint16_t stage1[0x440]; -- 0x440=0x110000>>10
|
||||
F const uint16_t stage2[variable*2*64];
|
||||
G const uint16_t stage3[variable];
|
||||
(possible 1*uint16_t for padding to 4-alignment)
|
||||
|
||||
H const uint32_t props32[variable];
|
||||
I const uint16_t exceptions[variable];
|
||||
(possible 1*uint16_t for padding to 4-alignment)
|
||||
|
||||
J const UChar uchars[variable];
|
||||
|
||||
3-stage lookup and properties:
|
||||
|
||||
In order to condense the data for the 21-bit code space, several properties of
|
||||
the Unicode code assignment are exploited:
|
||||
- The code space is sparse.
|
||||
- There are several 10k consecutive codes with the same properties.
|
||||
- Characters and scripts are allocated in groups of 16 code points.
|
||||
- Inside blocks for scripts the properties are often repetitive.
|
||||
- The 21-bit space is not fully used for Unicode.
|
||||
|
||||
The three-stage lookup organizes code points in groups of 16 in stage 3.
|
||||
64 such groups are grouped again, resulting in blocks of 1k in stage 2.
|
||||
The first stage is limited according to all code points being <0x110000.
|
||||
Each stage contains indexes to groups or blocks of the next stage
|
||||
in an n:1 manner, i.e., multiple entries of one stage index the same
|
||||
group or block in the next one.
|
||||
In the third stage, groups of 16 may partially or completely overlap to save
|
||||
space with repetitive properties.
|
||||
In the properties table, only unique 32-bit words are stored to exploit
|
||||
non-adjacent overlapping. This is why the third stage does not directly
|
||||
contain the 32-bit properties words but only indexes to them.
|
||||
|
||||
The indexes in each stage take the offset in the data of the next block into
|
||||
account to save additional arithmetic in the access.
|
||||
|
||||
The second stage also contains properties for groups of characters:
|
||||
Each set of 64 indexes to stage 3 groups is followed by 64 group properties
|
||||
words of a uint16_t each. This is used for the script ID, since scripts
|
||||
are allocated with multiples of 16 code points each.
|
||||
|
||||
With a given Unicode code point
|
||||
|
||||
uint32_t c;
|
||||
|
||||
and 0<=c<0x110000, the lookup uses the three stage tables to
|
||||
arrive at an index into the props32[] table containing the character
|
||||
properties for c.
|
||||
For some characters, not all of the properties can be efficiently encoded
|
||||
using 32 bits. For them, the 32-bit word contains an index into the exceptions[]
|
||||
array. Some exception entries, in turn, may contain indexes into the uchars[]
|
||||
array of Unicode strings, especially for non-1:1 case mappings.
|
||||
|
||||
The first stage consumes the 11 most significant bits of the 21-bit code point
|
||||
and results in an index into the second stage:
|
||||
|
||||
uint16_t i2=p16[4+(c>>10)];
|
||||
|
||||
The second stage consumes bits 9 to 4 of c and results in an index into the
|
||||
third stage:
|
||||
|
||||
uint16_t i3=p16[i2+((c>>4)&0x3f)];
|
||||
|
||||
The third stage consumes bits 3 to 0 of c and results in a code point-
|
||||
specific value, which itself is only an index into the props32[] table:
|
||||
|
||||
uint16_t i=p16[i3+(c&0xf)];
|
||||
|
||||
There is finally the 32-bit encoded set of properties for c:
|
||||
|
||||
uint32_t props=p32[i];
|
||||
|
||||
For some characters, this contains an index into the exceptions array:
|
||||
|
||||
if(props&0x20) {
|
||||
uint16_t e=(uint16_t)(props>>20);
|
||||
...
|
||||
}
|
||||
|
||||
The exception values are a variable number of uint16_t starting at
|
||||
|
||||
const uint16_t *pe=p16+2*p16[0]+e;
|
||||
|
||||
The first uint16_t there contains flags about what values actually follow it.
|
||||
Some of those may be indexes for case mappings or similar and point to strings
|
||||
(zero-terminated) in the uchars[] array:
|
||||
|
||||
...
|
||||
uint16_t u=pe[depends on pe[0]];
|
||||
const UChar *pu=(const UChar *)(p32+p16[1])+u;
|
||||
|
||||
32-bit properties sets:
|
||||
|
||||
Each 32-bit properties word contains:
|
||||
|
||||
0.. 4 general category
|
||||
5 has exception values
|
||||
6.. 9 BiDi category (the 5 explicit codes stored as one)
|
||||
10 is mirrored
|
||||
11..19 reserved
|
||||
20..31 value according to bits 0..5:
|
||||
if(has exception) {
|
||||
exception index;
|
||||
} else switch(general category) {
|
||||
case Ll: delta to uppercase; -- same as titlecase
|
||||
case Lu: delta to lowercase; -- titlecase is same as c
|
||||
case Lt: delta to lowercase; -- uppercase is same as c
|
||||
case Mn: canonical category;
|
||||
case N*: numeric value;
|
||||
default: *;
|
||||
}
|
||||
|
||||
----------------------------------------------------------------------------- */
|
||||
|
||||
/* UDataInfo cf. udata.h */
|
||||
static const UDataInfo dataInfo={
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
0x55, 0x50, 0x72, 0x6f, /* dataFormat="UPro" */
|
||||
1, 0, 0, 0, /* formatVersion */
|
||||
3, 0, 0, 0 /* dataVersion */
|
||||
};
|
||||
|
||||
/* definitions and arrays for the 3-stage lookup */
|
||||
enum {
    /* number of code point bits consumed by each of the three stages */
    STAGE_1_BITS=11,
    STAGE_2_BITS=6,
    STAGE_3_BITS=4,

    /* number of entries in stage 1, and per block in stages 2 and 3 */
    STAGE_1_BLOCK=0x110000>>(STAGE_2_BITS+STAGE_3_BITS),
    STAGE_2_BLOCK=1<<STAGE_2_BITS,
    STAGE_3_BLOCK=1<<STAGE_3_BITS,

    /* capacities of the static work arrays */
    MAX_PROPS_COUNT=20000,
    MAX_UCHAR_COUNT=10000,
    MAX_EXCEPTIONS_COUNT=4096,
    MAX_STAGE_2_COUNT=MAX_PROPS_COUNT/8
};
|
||||
|
||||
static uint16_t stage1[STAGE_1_BLOCK], stage2[MAX_STAGE_2_COUNT],
|
||||
stage3[MAX_PROPS_COUNT], map[MAX_PROPS_COUNT];
|
||||
|
||||
/* stage1Top=STAGE_1_BLOCK never changes */
|
||||
static uint16_t stage2Top=0, stage3Top;
|
||||
|
||||
/* props[] is used before, props32[] after compacting the array of properties */
|
||||
static uint32_t props[MAX_PROPS_COUNT], props32[MAX_PROPS_COUNT];
|
||||
static uint16_t propsTop=STAGE_3_BLOCK; /* the first props[] are always empty */
|
||||
|
||||
/* exceptions values */
|
||||
static uint16_t exceptions[MAX_EXCEPTIONS_COUNT+20];
|
||||
static uint16_t exceptionsTop=0;
|
||||
|
||||
/* Unicode characters, e.g. for special casing or decomposition */
|
||||
|
||||
static UChar uchars[MAX_UCHAR_COUNT+20];
|
||||
static uint16_t ucharsTop=0;
|
||||
|
||||
/* prototypes --------------------------------------------------------------- */
|
||||
|
||||
static int
|
||||
compareProps(const void *l, const void *r);
|
||||
|
||||
static uint16_t
|
||||
addUChars(const UChar *s, uint16_t length);
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
initStore() {
|
||||
icu_memset(stage1, 0, sizeof(stage1));
|
||||
icu_memset(stage2, 0, sizeof(stage2));
|
||||
icu_memset(stage3, 0, sizeof(stage3));
|
||||
icu_memset(map, 0, sizeof(map));
|
||||
icu_memset(props32, 0, sizeof(props32));
|
||||
}
|
||||
|
||||
/* store a character's properties ------------------------------------------- */
|
||||
|
||||
extern void
|
||||
addProps(Props *p) {
|
||||
uint16_t count;
|
||||
|
||||
/*
|
||||
* Simple ideas for reducing the number of bits for one character's
|
||||
* properties:
|
||||
*
|
||||
* Some fields are only used for characters of certain
|
||||
* general categories:
|
||||
* - casing fields for letters and others, not for
|
||||
* numbers & Mn
|
||||
* + uppercase not for uppercase letters
|
||||
* + lowercase not for lowercase letters
|
||||
* + titlecase not for titlecase letters
|
||||
*
|
||||
* * most of the time, uppercase=titlecase
|
||||
* - numeric fields for various digit & other types
|
||||
* - canonical combining classes for non-spacing marks (Mn)
|
||||
* * the above is not always true, for all three cases
|
||||
*
|
||||
* Using the same bits for alternate fields saves some space.
|
||||
*
|
||||
* For the canonical categories, there are only few actually used.
|
||||
* They can be stored using 5 bits.
|
||||
*
|
||||
* In the BiDi categories, the 5 explicit codes are only ever
|
||||
* assigned 1:1 to 5 well-known code points. Storing only one
|
||||
* value for all "explicit codes" gets this down to 4 bits.
|
||||
* Client code then needs to check for this special value
|
||||
* and replace it by the real one using a 5-element table.
|
||||
*
|
||||
* The general categories Mn & Me, non-spacing & enclosing marks,
|
||||
* are always NSM, and NSM are always of those categories.
|
||||
*
|
||||
* Digit values can often be derived from the code point value
|
||||
* itself in a simple way.
|
||||
*
|
||||
*/
|
||||
|
||||
/* count the case mappings */
|
||||
count=0;
|
||||
if(p->upperCase!=0) {
|
||||
++count;
|
||||
}
|
||||
if(p->lowerCase!=0) {
|
||||
++count;
|
||||
}
|
||||
if(p->upperCase!=p->titleCase) {
|
||||
++count;
|
||||
}
|
||||
|
||||
/* verify that only Mn has a canonical combining class */
|
||||
if(p->generalCategory!=U_NON_SPACING_MARK && p->canonicalCombining>0) {
|
||||
printf("*** code 0x%06x: canonical combining class does not fit expected range ***\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that only numeric categories have numeric values */
|
||||
if(genCategoryNames[p->generalCategory][0]!='N' && p->numericValue!=0) {
|
||||
printf("*** code 0x%06x: non-numeric category but numeric value\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that no numbers and no Mn have case mappings */
|
||||
/* this is not 100% true either (see 0345;COMBINING GREEK YPOGEGRAMMENI) */
|
||||
if( (genCategoryNames[p->generalCategory][0]=='N' ||
|
||||
p->generalCategory==U_NON_SPACING_MARK) &&
|
||||
count>0
|
||||
) {
|
||||
printf("*** code 0x%06x: number category or Mn but case mapping\n", p->code);
|
||||
} else if(count>1) {
|
||||
/* see for which characters there are two case mappings */
|
||||
/* there are some, but few (12) */
|
||||
printf("*** code 0x%06x: more than one case mapping\n", p->code);
|
||||
}
|
||||
|
||||
/* verify that { Mn, Me } if and only if NSM */
|
||||
if( (p->generalCategory==U_NON_SPACING_MARK ||
|
||||
p->generalCategory==U_ENCLOSING_MARK)
|
||||
^
|
||||
(p->bidi==U_DIR_NON_SPACING_MARK)) {
|
||||
printf("*** code 0x%06x: bidi class does not fit expected range ***\n", p->code);
|
||||
}
|
||||
|
||||
/*
|
||||
* "Higher-hanging fruit":
|
||||
* For some sets of fields, there are fewer sets of values
|
||||
* than the product of the numbers of values per field.
|
||||
* This means that storing one single value for more than
|
||||
* one field and later looking up both field values in a table
|
||||
* saves space.
|
||||
* Examples:
|
||||
* - general category & BiDi
|
||||
*
|
||||
* There are only few common displacements between a code point
|
||||
* and its case mappings. Store deltas. Store codes for few
|
||||
* occuring deltas.
|
||||
*/
|
||||
}
|
||||
|
||||
/* compacting --------------------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
compactStage3() {
|
||||
/*
|
||||
* At this point, all the propsTop properties are in props[].
|
||||
* propsTop is a multiple of 16, and there are always 16 props[] entries
|
||||
* per stage 2 entry which do not overlap.
|
||||
* The first 16 props[] are always the empty ones.
|
||||
* We make them overlap appropriately here and fill every 16th entry in
|
||||
* map[] with the mapping from old to new properties indexes
|
||||
* in order to adjust the stage 2 tables.
|
||||
* This simple algorithm does not find arbitrary overlaps, but only those
|
||||
* where the last i properties of the previous group and the first i of the
|
||||
* current one all have the same value.
|
||||
* This seems reasonable and yields linear performance.
|
||||
*/
|
||||
uint16_t i, start, prevEnd, newStart;
|
||||
uint32_t x;
|
||||
|
||||
map[0]=0;
|
||||
newStart=STAGE_3_BLOCK;
|
||||
for(start=newStart; start<propsTop;) {
|
||||
prevEnd=newStart-1;
|
||||
x=props[start];
|
||||
if(x==props[prevEnd]) {
|
||||
/* overlap by at least one */
|
||||
for(i=1; i<STAGE_3_BLOCK && x==props[start+i] && x==props[prevEnd-i]; ++i) {}
|
||||
|
||||
/* overlap by i */
|
||||
map[start]=newStart-i;
|
||||
|
||||
/* move the non-overlapping properties to their new positions */
|
||||
start+=i;
|
||||
for(i=STAGE_3_BLOCK-i; i>0; --i) {
|
||||
props[newStart++]=props[start++];
|
||||
}
|
||||
} else if(newStart<start) {
|
||||
/* move the properties to their new positions */
|
||||
map[start]=newStart;
|
||||
for(i=STAGE_3_BLOCK; i>0; --i) {
|
||||
props[newStart++]=props[start++];
|
||||
}
|
||||
} else /* no overlap && newStart==start */ {
|
||||
map[start]=start;
|
||||
newStart+=STAGE_3_BLOCK;
|
||||
start=newStart;
|
||||
}
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
if(beVerbose) {
|
||||
printf("compactStage3() reduced propsTop from %u to %u\n", propsTop, propsTop-(start-newStart));
|
||||
}
|
||||
propsTop-=(start-newStart);
|
||||
|
||||
/* now adjust the stage 2 tables, skipping the data parts in them */
|
||||
for(start=0; start<stage2Top; start+=STAGE_2_BLOCK) {
|
||||
for(i=STAGE_2_BLOCK; i>0; --i) {
|
||||
stage2[start]=map[stage2[start]];
|
||||
++start;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
extern void
|
||||
compactProps() {
|
||||
/*
|
||||
* At this point, all the propsTop properties are in props[], but they
|
||||
* are not all unique.
|
||||
* Now we sort them, reduce them to unique ones in props32[], and
|
||||
* build an index in stage3[] from the old to the new indexes.
|
||||
* (The quick sort averages at N*log(N) with N=propsTop. The inverting
|
||||
* yields linear performance.)
|
||||
*/
|
||||
|
||||
/*
|
||||
* We are going to sort only an index table in map[] because we need this
|
||||
* index table anyway and qsort() does not allow to sort two tables together
|
||||
* directly. This will thus also reduce the amount of data moved around.
|
||||
*/
|
||||
uint16_t i, oldIndex, newIndex;
|
||||
uint32_t x;
|
||||
|
||||
/* build the index table */
|
||||
for(i=propsTop; i>0;) {
|
||||
--i;
|
||||
map[i]=i;
|
||||
}
|
||||
|
||||
/* do not reorder the first, empty entries */
|
||||
qsort(map+STAGE_3_BLOCK, propsTop-STAGE_3_BLOCK, 2, compareProps);
|
||||
|
||||
/*
|
||||
* Now invert the reordered table and compact it in the same step.
|
||||
* The result will be props32[] having only unique properties words
|
||||
* and stage3[] having indexes to them.
|
||||
*/
|
||||
newIndex=0;
|
||||
for(i=0; i<propsTop;) {
|
||||
/* set the first of a possible series of the same properties */
|
||||
oldIndex=map[i];
|
||||
props32[newIndex]=x=props[oldIndex];
|
||||
stage3[oldIndex]=newIndex;
|
||||
|
||||
/* set the following same properties only in stage3 */
|
||||
while(++i<propsTop && x==props[map[i]]) {
|
||||
stage3[map[i]]=newIndex;
|
||||
}
|
||||
|
||||
++newIndex;
|
||||
}
|
||||
|
||||
/* we saved some space */
|
||||
stage3Top=propsTop;
|
||||
propsTop=newIndex;
|
||||
if(beVerbose) {
|
||||
printf("compactProps() reduced propsTop from %u to %u\n", stage3Top, propsTop);
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
compareProps(const void *l, const void *r) {
|
||||
uint32_t left=props[*(const uint16_t *)l], right=props[*(const uint16_t *)r];
|
||||
|
||||
/* compare general categories first */
|
||||
int rc=(int)(left&0x1f)-(int)(right&0x1f);
|
||||
if(rc==0 && left!=right) {
|
||||
rc= left<right ? -1 : 1;
|
||||
}
|
||||
return rc;
|
||||
}
|
||||
|
||||
/* generate output data ----------------------------------------------------- */
|
||||
|
||||
extern void
|
||||
generateData() {
|
||||
UNewDataMemory *pData;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
uint32_t size;
|
||||
long dataLength;
|
||||
|
||||
pData=udata_create(DATA_TYPE, DATA_NAME, &dataInfo,
|
||||
haveCopyright ? U_COPYRIGHT_STRING : NULL, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops: unable to create data memory, error %d\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/* ### */
|
||||
size=0;
|
||||
|
||||
/* finish up */
|
||||
dataLength=udata_finish(pData, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
fprintf(stderr, "genprops: error %d writing the output file\n", errorCode);
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
if(dataLength!=(long)size) {
|
||||
fprintf(stderr, "genprops: data length %ld != calculated size %lu\n", dataLength, size);
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/* helpers ------------------------------------------------------------------ */
|
||||
|
||||
static uint16_t
|
||||
addUChars(const UChar *s, uint16_t length) {
|
||||
uint16_t top=ucharsTop+length+1;
|
||||
UChar *p;
|
||||
|
||||
if(top>=MAX_UCHAR_COUNT) {
|
||||
fprintf(stderr, "genprops: out of UChars memory\n");
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
p=uchars+ucharsTop;
|
||||
icu_memcpy(p, s, length);
|
||||
p[length]=0;
|
||||
ucharsTop=top;
|
||||
return (uint16_t)(p-uchars);
|
||||
}
|
Loading…
Add table
Reference in a new issue