ICU-1930 new implicit CE generation code. Also allowed setting of immplicit variables from outside, for bootstrap mode (genUCA)

X-SVN-Rev: 8881
This commit is contained in:
Vladimir Weinstein 2002-06-13 18:34:41 +00:00
parent 503c4b104d
commit 6221f1bfc1

View file

@ -85,23 +85,6 @@ isAcceptableUCA(void * /*context*/,
}
U_CDECL_END
/* added for Han implicit CE */
static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
static const uint32_t IMPLICIT_LAST_COUNT2_ =
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
IMPLICIT_LAST_COUNT2_;
static
inline void IInit_collIterate(const UCollator *collator, const UChar *sourceString,
int32_t sourceLen, collIterate *s) {
@ -746,6 +729,206 @@ ucol_cleanup(void)
return TRUE;
}
/* Following is a port of Mark's code for new treatment of implicits.
* It is positioned here, since ucol_initUCA need to initialize the
* variables below according to the data in the fractional UCA.
*/
/*
static boolean isFixedIdeograph(int cp) {
return (0x3400 <= cp && cp <= 0x4DB5
|| 0x4E00 <= cp && cp <= 0x9FA5
|| 0xF900 <= cp && cp <= 0xFA2D // compat: most of these decompose anyway
|| 0x20000 <= cp && cp <= 0x2A6D6
|| 0x2F800 <= cp && cp <= 0x2FA1D // compat: most of these decompose anyway
);
}
*/
/*
3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;;
...
2FA1D;CJK COMPATIBILITY IDEOGRAPH-2FA1D;Lo;0;L;2A600;;;;N;;;;;
*/
/*
static int remapUCA_CompatibilityIdeographToCp(int cp) {
switch (cp) {
case 0x9FA6: return 0xFA0E; // FA0E ; [.9FA6.0020.0002.FA0E] # CJK COMPATIBILITY IDEOGRAPH-FA0E
case 0x9FA7: return 0xFA0F; // FA0F ; [.9FA7.0020.0002.FA0F] # CJK COMPATIBILITY IDEOGRAPH-FA0F
case 0x9FA8: return 0xFA11; // FA11 ; [.9FA8.0020.0002.FA11] # CJK COMPATIBILITY IDEOGRAPH-FA11
case 0x9FA9: return 0xFA13; // FA13 ; [.9FA9.0020.0002.FA13] # CJK COMPATIBILITY IDEOGRAPH-FA13
case 0x9FAA: return 0xFA14; // FA14 ; [.9FAA.0020.0002.FA14] # CJK COMPATIBILITY IDEOGRAPH-FA14
case 0x9FAB: return 0xFA1F; // FA1F ; [.9FAB.0020.0002.FA1F] # CJK COMPATIBILITY IDEOGRAPH-FA1F
case 0x9FAC: return 0xFA21; // FA21 ; [.9FAC.0020.0002.FA21] # CJK COMPATIBILITY IDEOGRAPH-FA21
case 0x9FAD: return 0xFA23; // FA23 ; [.9FAD.0020.0002.FA23] # CJK COMPATIBILITY IDEOGRAPH-FA23
case 0x9FAE: return 0xFA24; // FA24 ; [.9FAE.0020.0002.FA24] # CJK COMPATIBILITY IDEOGRAPH-FA24
case 0x9FAF: return 0xFA27; // FA27 ; [.9FAF.0020.0002.FA27] # CJK COMPATIBILITY IDEOGRAPH-FA27
case 0x9FB0: return 0xFA28; // FA28 ; [.9FB0.0020.0002.FA28] # CJK COMPATIBILITY IDEOGRAPH-FA28
case 0x9FB1: return 0xFA29; // FA29 ; [.9FB1.0020.0002.FA29] # CJK COMPATIBILITY IDEOGRAPH-FA29
}
return cp;
}
*/
/**
* Function used to:
* a) collapse the 2 different Han ranges from UCA into one (in the right order), and
* b) bump any non-CJK characters by 10FFFF.
* The relevant blocks are:
* A: 4E00..9FFF; CJK Unified Ideographs
* F900..FAFF; CJK Compatibility Ideographs
* B: 3400..4DBF; CJK Unified Ideographs Extension A
* 20000..XX; CJK Unified Ideographs Extension B (and others later on)
* As long as
* no new B characters are allocated between 4E00 and FAFF, and
* no new A characters are outside of this range,
* (very high probability) this simple code will work.
* The reordered blocks are:
* Block1 is CJK
* Block2 is CJK_COMPAT_USED
* Block3 is CJK_A
* Any other CJK gets its normal code point
* Any non-CJK gets +10FFFF
* When we reorder Block1, we make sure that it is at the very start,
* so that it will use a 3-byte form.
*/
// CONSTANTS
static const uint32_t
NON_CJK_OFFSET = 0x110000,
BYTES_TO_AVOID = 3,
OTHER_COUNT = 256 - BYTES_TO_AVOID,
LAST_COUNT = OTHER_COUNT / 2,
LAST_COUNT2 = OTHER_COUNT / 21, // room for intervening, without expanding to 5 bytes
IMPLICIT_3BYTE_COUNT = 1;
// These depend on initUCA, and are initialized at that time
static uint32_t
IMPLICIT_BASE_BYTE = 0,
IMPLICIT_LIMIT_BYTE = 0, // leave room for 1 3-byte and 2 4-byte forms
IMPLICIT_4BYTE_BOUNDARY = 0,
LAST_MULTIPLIER = 0,
LAST2_MULTIPLIER = 0,
IMPLICIT_BASE_3BYTE = 0,
IMPLICIT_BASE_4BYTE = 0;
static const uint32_t
CJK_BASE = 0x4E00,
CJK_LIMIT = 0x9FFF+1,
CJK_COMPAT_USED_BASE = 0xFA0E,
CJK_COMPAT_USED_LIMIT = 0xFA2F+1,
CJK_A_BASE = 0x3400,
CJK_A_LIMIT = 0x4DBF+1,
CJK_B_BASE = 0x20000,
CJK_B_LIMIT = 0x2A6DF+1;
static inline UChar32 swapCJK(UChar32 cp) {
if (cp >= CJK_BASE) {
if (cp < CJK_LIMIT) return cp - CJK_BASE;
if (cp < CJK_COMPAT_USED_BASE) return cp + NON_CJK_OFFSET;
if (cp < CJK_COMPAT_USED_LIMIT) return cp - CJK_COMPAT_USED_BASE
+ (CJK_LIMIT - CJK_BASE);
if (cp < CJK_B_BASE) return cp + NON_CJK_OFFSET;
if (cp < CJK_B_LIMIT) return cp; // non-BMP-CJK
return cp + NON_CJK_OFFSET; // non-CJK
}
if (cp < CJK_A_BASE) return cp + NON_CJK_OFFSET;
if (cp < CJK_A_LIMIT) return cp - CJK_A_BASE
+ (CJK_LIMIT - CJK_BASE)
+ (CJK_COMPAT_USED_LIMIT - CJK_COMPAT_USED_BASE);
return cp + NON_CJK_OFFSET; // non-CJK
}
// GET IMPLICIT PRIMARY WEIGHTS
// Return value is left justified primary key
static inline uint32_t getImplicitPrimary(UChar32 cp) {
//if (DEBUG) System.out.println("Incoming: " + Utility.hex(cp));
cp = swapCJK(cp);
//if (DEBUG) System.out.println("CJK swapped: " + Utility.hex(cp));
// we now have a range of numbers from 0 to 21FFFF.
// we must skip all 00, 01, 02 bytes, so most bytes have 253 values
// we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
// we shift so that HAN all has the same first primary, for compression.
// for the 4 byte case, we make the gap as large as we can fit.
// Three byte forms are EC xx xx, ED xx xx, EE xx xx (with a gap of 1)
// Four byte forms (most supplementaries) are EF xx xx xx (with a gap of LAST2_MULTIPLIER == 14)
int32_t last0 = cp - IMPLICIT_4BYTE_BOUNDARY;
if (last0 < 0) {
int32_t last1 = cp / LAST_COUNT;
last0 = cp % LAST_COUNT;
int32_t last2 = last1 / OTHER_COUNT;
last1 %= OTHER_COUNT;
/*
if (DEBUG || last2 > 0xFF-BYTES_TO_AVOID) System.out.println("3B: " + Utility.hex(cp) + " => "
+ Utility.hex(last2) + ", "
+ Utility.hex(last1) + ", "
+ Utility.hex(last0) + ", "
);
*/
return IMPLICIT_BASE_3BYTE + (last2 << 24) + (last1 << 16) + ((last0*LAST_MULTIPLIER) << 8);
} else {
int32_t last1 = last0 / LAST_COUNT2;
last0 %= LAST_COUNT2;
int32_t last2 = last1 / OTHER_COUNT;
last1 %= OTHER_COUNT;
int32_t last3 = last2 / OTHER_COUNT;
last2 %= OTHER_COUNT;
/*
if (DEBUG || last3 > 0xFF-BYTES_TO_AVOID) System.out.println("4B: " + Utility.hex(cp) + " => "
+ Utility.hex(last3) + ", "
+ Utility.hex(last2) + ", "
+ Utility.hex(last1) + ", "
+ Utility.hex(last0 * LAST2_MULTIPLIER) + ", "
);
*/
return IMPLICIT_BASE_4BYTE + (last3 << 24) + (last2 << 16) + (last1 << 8) + (last0 * LAST2_MULTIPLIER);
}
}
/* this function is either called from initUCA or from genUCA before
* doing canonical closure for the UCA.
*/
U_CAPI void U_EXPORT2
uprv_uca_initImplicitConstants(uint32_t baseByte)
{
IMPLICIT_BASE_BYTE = baseByte;
IMPLICIT_LIMIT_BYTE = IMPLICIT_BASE_BYTE + 4; // leave room for 1 3-byte and 2 4-byte forms
IMPLICIT_4BYTE_BOUNDARY = IMPLICIT_3BYTE_COUNT * OTHER_COUNT * LAST_COUNT;
LAST_MULTIPLIER = OTHER_COUNT / LAST_COUNT;
LAST2_MULTIPLIER = OTHER_COUNT / LAST_COUNT2;
IMPLICIT_BASE_3BYTE = (IMPLICIT_BASE_BYTE << 24) + 0x030300;
IMPLICIT_BASE_4BYTE = ((IMPLICIT_BASE_BYTE + IMPLICIT_3BYTE_COUNT) << 24) + 0x030303;
}
void ucol_initUCA(UErrorCode *status) {
if(U_FAILURE(*status))
return;
@ -786,6 +969,9 @@ void ucol_initUCA(UErrorCode *status) {
else {
ucln_i18n_registerCleanup();
}
// Initalize variables for implicit generation
UCAConstants *consts = (UCAConstants *)((uint8_t *)UCA->image + UCA->image->UCAConsts);
uprv_uca_initImplicitConstants(consts->UCA_PRIMARY_IMPLICIT_MIN);
}else{
udata_close(result);
uprv_free(newUCA);
@ -1827,11 +2013,31 @@ uint32_t getDiscontiguous(const UCollator *coll, collIterate *source,
return *(coll->contractionCEs + (constart - coll->contractionIndex));
}
#if 0
/* added for Han implicit CE */
static const uint32_t IMPLICIT_HAN_START_ = 0x3400;
static const uint32_t IMPLICIT_HAN_LIMIT_ = 0xA000;
static const uint32_t IMPLICIT_SUPPLEMENTARY_COUNT_ = 0x100000;
static const uint32_t IMPLICIT_BYTES_TO_AVOID_ = 3;
static const uint32_t IMPLICIT_OTHER_COUNT_ = 256 - IMPLICIT_BYTES_TO_AVOID_;
static const uint32_t IMPLICIT_LAST_COUNT_ = IMPLICIT_OTHER_COUNT_ / 2;
static const uint32_t IMPLICIT_LAST_COUNT2_ =
(IMPLICIT_SUPPLEMENTARY_COUNT_ - 1) /
(IMPLICIT_OTHER_COUNT_ * IMPLICIT_OTHER_COUNT_) + 1;
static const uint32_t IMPLICIT_HAN_SHIFT_ = IMPLICIT_LAST_COUNT_ *
IMPLICIT_OTHER_COUNT_ - IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_BOUNDARY_ = 2 * IMPLICIT_OTHER_COUNT_ *
IMPLICIT_LAST_COUNT_ + IMPLICIT_HAN_START_;
static const uint32_t IMPLICIT_LAST2_MULTIPLIER_ = IMPLICIT_OTHER_COUNT_ /
IMPLICIT_LAST_COUNT2_;
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
}
/*
we must skip all 00, 01, 02 bytes, so most bytes have 253 values
we must leave a gap of 01 between all values of the last byte, so the last byte has 126 values (3 byte case)
@ -1859,6 +2065,18 @@ inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource, uint32_t h
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}
#endif
/* now uses Mark's getImplicitPrimary code */
static
inline uint32_t getImplicit(UChar32 cp, collIterate *collationSource) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
}
uint32_t r = getImplicitPrimary(cp);
*(collationSource->CEpos++) = ((r & 0x0000FFFF)<<16) | 0x000000C0;
return (r & UCOL_PRIMARYMASK) | 0x00000505; // This was 'order'
}
static
inline UChar getPrevNormalizedChar(collIterate *data);
@ -2153,11 +2371,14 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
return CE;
}
/* various implicits optimization */
// TODO: remove CJK_IMPLICIT_TAG completely - handled by the getImplicit
case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
return getImplicit(cp, source, 0x04000000);
//return getImplicit(cp, source, 0x04000000);
return getImplicit(cp, source);
case IMPLICIT_TAG: /* everything that is not defined otherwise */
/* UCA is filled with these. Tailorings are NOT_FOUND */
return getImplicit(cp, source, 0);
//return getImplicit(cp, source, 0);
return getImplicit(cp, source);
case TRAIL_SURROGATE_TAG: /* DC00-DFFF*/
return 0; /* broken surrogate sequence */
case LEAD_SURROGATE_TAG: /* D800-DBFF*/
@ -2166,12 +2387,16 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
UTF_IS_SECOND_SURROGATE((nextChar=*source->pos))) {
cp = ((((uint32_t)ch)<<10UL)+(nextChar)-(((uint32_t)0xd800<<10UL)+0xdc00-0x10000));
source->pos++;
#if 0
// CJKs handled in the getImplicit function. No need for fixup
if((cp >= 0x20000 && cp <= 0x2a6d6) ||
(cp >= 0x2F800 && cp <= 0x2FA1D)) { // this might be a CJK supplementary cp
return getImplicit(cp, source, 0x04000000);
} else { // or a regular one
return getImplicit(cp, source, 0);
}
#endif
return getImplicit(cp, source);
} else {
return 0; /* completely ignorable */
}
@ -2203,14 +2428,17 @@ uint32_t ucol_prv_getSpecialCE(const UCollator *coll, UChar ch, uint32_t CE, col
if (!source->coll->image->jamoSpecial) { // FAST PATH
/**(source->CEpos++) = ucmpe32_get(UCA->mapping, V);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
if (T != TBase) {
/**(source->CEpos++) = ucmpe32_get(UCA->mapping, T);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
}
/*return ucmpe32_get(UCA->mapping, L);*/ // return first one
return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);
/*return UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
return UTRIE_GET32_FROM_LEAD(coll->mapping, L);
} else { // Jamo is Special
// Since Hanguls pass the FCD check, it is
@ -2456,6 +2684,7 @@ inline UChar getPrevNormalizedChar(collIterate *data)
return ch;
}
#if 0
static
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource, uint32_t hanFixup) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
@ -2490,12 +2719,28 @@ inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource, uint32
collationSource->toReturn = collationSource->CEpos;
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
}
#endif
/* now uses Mark's getImplicitPrimary code */
static
inline uint32_t getPrevImplicit(UChar32 cp, collIterate *collationSource) {
if ((cp & 0xFFFE) == 0xFFFE || (0xD800 <= cp && cp <= 0xDC00)) {
return 0; /* illegal code value, use completely ignoreable! */
}
uint32_t r = getImplicitPrimary(cp);
*(collationSource->CEpos++) = (r & UCOL_PRIMARYMASK) | 0x00000505;
collationSource->toReturn = collationSource->CEpos;
return ((r & 0x0000FFFF)<<16) | 0x000000C0;
}
/**
* This function handles the special CEs like contractions, expansions,
* surrogates, Thai.
* It is called by both getPrevCE
*/
* This function handles the special CEs like contractions, expansions,
* surrogates, Thai.
* It is called by both getPrevCE
*/
uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
collIterate *source,
UErrorCode *status)
@ -2769,12 +3014,15 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
if (!source->coll->image->jamoSpecial)
{
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, L);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, L);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, L);
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, V);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, V);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, V);
if (T != TBase)
/**(source->CEpos ++) = ucmpe32_get(UCA->mapping, T);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);
/**(source->CEpos++) = UTRIE_GET32_FROM_LEAD(UCA->mapping, T);*/
*(source->CEpos++) = UTRIE_GET32_FROM_LEAD(coll->mapping, T);
source->toReturn = source->CEpos - 1;
return *(source->toReturn);
@ -2844,12 +3092,13 @@ uint32_t ucol_prv_getSpecialPrevCE(const UCollator *coll, UChar ch, uint32_t CE,
} else {
return 0; /* completely ignorable */
}
return getPrevImplicit(cp, source, 0);
return getPrevImplicit(cp, source);
}
// TODO: Remove CJK implicits as they are handled by the getImplicitPrimary function
case CJK_IMPLICIT_TAG: /* 0x3400-0x4DB5, 0x4E00-0x9FA5, 0xF900-0xFA2D*/
return getPrevImplicit(ch, source, 0x04000000);
return getPrevImplicit(ch, source);
case IMPLICIT_TAG: /* everything that is not defined otherwise */
return getPrevImplicit(ch, source, 0);
return getPrevImplicit(ch, source);
/* UCA is filled with these. Tailorings are NOT_FOUND */
/* not yet implemented */
case CHARSET_TAG: /* this tag always returns */