ICU-1083 Data structure for surrogate support

X-SVN-Rev: 5453
This commit is contained in:
Vladimir Weinstein 2001-08-10 20:39:52 +00:00
parent 20053b3398
commit f255a31517
2 changed files with 289 additions and 886 deletions

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,14 @@
* nicely.
*/
/**
* NOTE: This array is specifically implemented to support surrogates
* in the collation framework. Its interface is minimal and its usage model
* is far from flexible. Use at your own risk outside of collation.
* Risk is also present in the collation framework, but there is hardly
* anything you can do about it, save reimplementing the framework.
*/
#ifndef UCMPE32_H
#define UCMPE32_H
@ -29,24 +37,6 @@
#include "filestrm.h"
#include "umemstrm.h"
/* INTERNAL CONSTANTS */
/* Number of low code point bits that address a position inside one block. */
#define UCMPE32_kBlockShift 7
/* Entries per block: 1<<7 = 128. */
#define UCMPE32_kBlockCount (1<<UCMPE32_kBlockShift)
/* Mask (0x7f) that extracts the in-block offset from a code point. */
#define UCMPE32_kBlockMask (UCMPE32_kBlockCount-1)
/* 10 - 7 = 3; presumably the 10 payload bits of a surrogate minus the block bits - TODO confirm. */
#define UCMPE32_kSurrogateBlockBits (10 - UCMPE32_kBlockShift)
/* 1<<3 = 8. */
#define UCMPE32_kSurrogateBlockCount (1<<UCMPE32_kSurrogateBlockBits)
/* 21 - 7 = 14; presumably the index bits left over from a 21-bit code point - TODO confirm. */
#define UCMPE32_kIndexShift (21-UCMPE32_kBlockShift)
/*#define UCMPE32_kIndexCount (1<<UCMPE32_kIndexShift)*/
/* Index entries for the range 0..0x10FFFF: 0x110000>>7 = 0x2200 (8704). */
#define UCMPE32_kIndexCount (0x110000>>UCMPE32_kBlockShift)
/*#define UCMPE32_kIndexBMPCount (1<<(16-UCMPE32_kBlockShift))*/
/* Index entries for the range 0..0xFFFF: 0x10000>>7 = 0x200 (512). */
#define UCMPE32_kIndexBMPCount (0x10000>>UCMPE32_kBlockShift)
/* Size of the code point range 0..0x10FFFF. */
#define UCMPE32_kUnicodeCount 0x110000
/* trie constants */
enum {
@ -74,161 +64,41 @@ enum {
/* this may be >0xffff and may not work as an enum */
/*
 * Number of stage-1 entries needed to cover 0x110000 code points when each
 * entry stands for (1<<_UCMPE32_TRIE_SHIFT) code points.
 */
#define _UCMPE32_STAGE_1_MAX_COUNT (0x110000>>_UCMPE32_TRIE_SHIFT)
/*
 * Header for a named block of 32-bit words.
 *
 * NOTE(review): only the declaration is visible here; the meaning of the
 * counters and the allocation scheme are assumptions to be confirmed against
 * the code that creates these objects.
 */
typedef struct UToolMemory {
    /* fixed 64-byte label */
    char name[64];
    /* presumably the number of array words in use - TODO confirm */
    uint32_t count;
    /* presumably the number of array words allocated - TODO confirm */
    uint32_t size;
    /* presumably a cursor into array - TODO confirm */
    uint32_t index;
    /* declared with one element; presumably over-allocated as a trailing buffer - TODO confirm */
    uint32_t array[1];
} UToolMemory;
/**
* class CompactATypeArray : use only on primitive data types
* Provides a compact way to store information that is indexed by Unicode
* values, such as character properties, types, keyboard values, etc. This
* is very useful when you have a block of Unicode data that contains
* significant values while the rest of the Unicode data is unused in the
* application or when you have a lot of redundancy, such as where all 21,000
* Han ideographs have the same value. However, lookup is much faster than a
* hash table.
* <P>
* A compact array of any primitive data type serves two purposes:
* <UL type = round>
* <LI>Fast access of the indexed values.
* <LI>Smaller memory footprint.
* </UL>
* <P>
* The index array always points into particular parts of the data array
* it is initially set up to point at regular block boundaries
* The following example uses blocks of 4 for simplicity
* <PRE>
* Example: Expanded
* BLOCK 0 1 2 3 4
* INDEX 0 4 8 12 16 ...
* ARRAY abcdeababcdezyabcdea...
* | | | | | |...
* </PRE>
* <P>
* After compression, the index will point to various places in the data array
* wherever there are runs of the same elements as in the original
* <PRE>
* Example: Compressed
* BLOCK 0 1 2 3 4
* INDEX 0 4 1 8 2 ...
* ARRAY abcdeabazyabc...
* </PRE>
* <P>
* If you look at the example, index number 2 in the expanded version points
* to data position number 8, which has elements "bcde". In the compressed
* version, index number 2 points to data position 1, which also has "bcde"
* @see CompactByteArray
* @see CompactEIntArray
* @see CompactCharArray
* @see CompactStringArray
* @version $Revision: 1.1 $ 8/25/98
* @author Helena Shih
*/
/*====================================
*CompactEIntArray
* Provides a compact way to store information that is indexed by Unicode values,
* such as character properties, types, keyboard values, etc.
* The ATypes are used by value, so should be small, integers or pointers.
*====================================
*/
/*
 * Compact array state.
 *
 * NOTE(review): this span is taken from a diff view whose +/- markers are
 * missing, so members of the previous and the new layout are interleaved
 * (fStructSize, fCompact, fAlias, fBogus, stage1, stage2 and stage1Top each
 * appear twice, some with different types). Reconcile against the real
 * header before relying on the member list or its order.
 */
typedef struct CompactEIntArray{
uint32_t fStructSize;
/* value array read by the disabled (#if 0) ucmpe32_get variant */
int32_t* fArray;
/* index array read by the disabled (#if 0) ucmpe32_get variant */
uint16_t* fIndex;
/* presumably the number of entries in fArray - TODO confirm */
int32_t fCount;
UBool fCompact;
UBool fBogus;
UBool fAlias;
UBool fIAmOwned; /* don't free CBA on close */
UToolMemory *stage2Mem;
uint16_t stage1[_UCMPE32_STAGE_1_MAX_COUNT];
uint32_t *stage2;
uint16_t stage1Top;
/* presumably the size of this struct in bytes - TODO confirm */
uint32_t fStructSize;
/* presumably TRUE once the array has been compacted - TODO confirm */
UBool fCompact;
/* presumably TRUE when the arrays are borrowed rather than owned - TODO confirm */
UBool fAlias;
/* presumably the flag reported by ucmpe32_isBogus - TODO confirm */
UBool fBogus;
/* first lookup stage: indexed by (code point >> _UCMPE32_TRIE_SHIFT) in ucmpe32_get */
uint16_t *stage1;
/* second lookup stage: indexed by stage1 entry + (code point & _UCMPE32_STAGE_2_MASK) in ucmpe32_get */
int32_t *stage2;
/* presumably the number of stage1 entries in use - TODO confirm */
int32_t stage1Top;
/* presumably the number of stage2 entries in use - TODO confirm */
int32_t stage2Top;
/* presumably the defaultValue passed to ucmpe32_open - TODO confirm */
int32_t fDefaultValue;
/* presumably the surrogateValue passed to ucmpe32_open - TODO confirm */
int32_t fSurrogateValue;
} CompactEIntArray;
/*
 * Function accessors taking no arguments and returning int32_t.
 * NOTE(review): presumably they return UCMPE32_kUnicodeCount and
 * UCMPE32_kBlockCount respectively - confirm against the implementation.
 */
U_CAPI int32_t U_EXPORT2 ucmpe32_getkUnicodeCount(void);
U_CAPI int32_t U_EXPORT2 ucmpe32_getkBlockCount(void);
/**
* Construct an empty CompactEIntArray.
*
* NOTE(review): two prototypes of ucmpe32_open follow, a one-argument and a
* three-argument form. They cannot coexist in C; this looks like the old and
* the new declaration shown together by the diff view - confirm which one
* the real header keeps.
*
* @param defaultValue the default value for all characters not explicitly in the array
* @param surrogateValue (three-argument form) presumably the value associated
*        with surrogates - TODO confirm
* @param status (three-argument form) presumably an in/out error code - TODO confirm
* @return the newly constructed array
*/
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_open(int32_t defaultValue);
U_CAPI CompactEIntArray* U_EXPORT2
ucmpe32_open(int32_t defaultValue, int32_t surrogateValue, UErrorCode *status);
/**
* Construct a CompactEIntArray from a pre-computed index and values array. The values
* will be adopted by the CompactEIntArray. Memory is allocated with uprv_malloc.
* Note: for speed, the compact method will only re-use blocks in the values array
* that are on a block boundary. The pre-computed arrays passed in to this constructor
* may re-use blocks at any position in the values array. The indexArray and
* newValues will be uprv_free'd when ucmpe32_close() is called.
*
* @param indexArray the index array to be adopted
* @param newValues the value array to be adopted
* @param count the number of entries in the value array
* @see compact
* Opens a compacted read-only array from
* a block in memory.
*/
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAdopt(uint16_t *indexArray,
int32_t *newValues,
int32_t count);
U_CAPI CompactEIntArray* U_EXPORT2
ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);
/**
* Construct a CompactEIntArray from a pre-computed index and values array. The values
* will be aliased by the CompactEIntArray. Memory is allocated with uprv_malloc.
* Note: for speed, the compact method will only re-use blocks in the values array
* that are on a block boundary. The pre-computed arrays passed in to this constructor
* may re-use blocks at any position in the values array.
*
* @param indexArray the index array to be adopted
* @param newValues the value array to be adopted
* @param count the number of entries in the value array
* @see compact
* Clones an array. It can be either compacted or expanded
*/
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAlias(uint16_t *indexArray,
int32_t *newValues,
int32_t count);
/**
* Initialize a CompactEIntArray from a pre-computed index and values array. The values
* will be adopted by the CompactEIntArray. No memory is allocated. Note: for speed,
* the compact method will only re-use blocks in the values array that are on a block
* boundary. The pre-computed arrays passed in to this constructor may re-use blocks
* at any position in the values array. The indexArray and
* newValues will be uprv_free'd when ucmpe32_close() is called.
*
* @param indexArray the index array to be adopted
* @param newValues the value array to be adopted
* @param count the number of entries in the value array
* @see compact
*/
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAdopt(CompactEIntArray *this_obj,
uint16_t *indexArray,
int32_t *newValues,
int32_t count);
/**
* Initialize a CompactEIntArray from a pre-computed index and values array. The values
* will be aliased by the CompactEIntArray. No memory is allocated. Note: for speed,
* the compact method will only re-use blocks in the values array that are on a block
* boundary. The pre-computed arrays passed in to this constructor may re-use blocks
* at any position in the values array.
*
* @param indexArray the index array to be adopted
* @param newValues the value array to be adopted
* @param count the number of entries in the value array
* @see compact
*/
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
uint16_t *indexArray,
int32_t *newValues,
int32_t count);
/**
* Clones an array. The source may be either compacted or expanded.
*
* @param orig the array to copy
* @param status presumably an in/out error code - TODO confirm
* @return the copy; presumably released with ucmpe32_close - TODO confirm
*/
U_CAPI CompactEIntArray* U_EXPORT2
ucmpe32_clone(CompactEIntArray* orig, UErrorCode *status);
/**
* Free up any allocated memory associated with this compact array.
@ -239,33 +109,39 @@ U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
*/
U_CAPI void U_EXPORT2 ucmpe32_close(CompactEIntArray* array);
/**
* Returns TRUE if the creation of the compact array fails.
*/
U_CAPI UBool U_EXPORT2 ucmpe32_isBogus(const CompactEIntArray* array);
/**
* Get the mapped value of a Unicode character.
*
* @param index the character to get the mapped value with
* @return the mapped value of the given character
*/
/*
 * Two-stage lookup: stage1 is indexed by the high bits of the code point
 * (index >> _UCMPE32_TRIE_SHIFT); that entry plus the low bits
 * (index & _UCMPE32_STAGE_2_MASK) selects the value in stage2.
 *
 * Every macro parameter is parenthesized so that expression arguments such
 * as `&obj` or `a | b` expand with the intended precedence. Both arguments
 * are still evaluated twice, so do not pass expressions with side effects.
 * No bounds checking is performed.
 */
#define ucmpe32_get(this_obj, index) \
    ((this_obj)->stage2[((this_obj)->stage1[((index) >> _UCMPE32_TRIE_SHIFT)]) + \
                        ((index) & _UCMPE32_STAGE_2_MASK)])
#if 0
#define ucmpe32_get(array, index) (array->fArray[(array->fIndex[(index >> UCMPE32_kBlockShift)<< UCMPE32_kBlockShift] )+ \
(index & UCMPE32_kBlockMask)])
#endif
/**
* Get the mapped value of a confirmed surrogate. First value already comes
* from the trie and is combined with the following value in order to get
* the value. THIS CAN BE ONLY USED ON A COMPACTED TRIE. You will get wrong
* results if you try it on the expanded one
* NO ERROR CHECKING IS PERFORMED! PREPARE YOUR DATA CAREFULLY!
* @param leadValue32 the mapping of the leading surrogate.
* @param trail the trailing surrogate
* @return the mapped value of the given character
*/
/* Function form of the lookup: returns the value mapped to the code point index. */
U_CAPI int32_t U_EXPORT2 ucmpe32_get32(CompactEIntArray *array, UChar32 index);
/*
 * Forwards to ucmpe32_get32 after casting index to UChar32.
 * NOTE(review): ucmpe32_get is also defined earlier in this header view as a
 * direct stage1/stage2 lookup macro; the two definitions cannot coexist -
 * confirm which one the real header keeps.
 */
#define ucmpe32_get(array, index) ucmpe32_get32((array), (UChar32)(index))
/* Same lookup with the result cast to uint16_t. */
#define ucmpe32_getu(array, index) (uint16_t)ucmpe32_get(array, index)
/*
 * Looks up the value for a surrogate pair via ucmpe32_get. The lookup index
 * is built from bits 10..19 of leadValue32 (mask 0xffc00) and the low ten
 * bits of trail (mask 0x3ff).
 *
 * Parameters are parenthesized so that expression arguments are masked as a
 * whole. Arguments may be evaluated more than once by ucmpe32_get.
 */
#define ucmpe32_getSurrogate(this_obj, leadValue32, trail) \
    ucmpe32_get((this_obj), (((leadValue32) & 0xffc00) | ((trail) & 0x3ff)))
/**
* This is a slow function that takes lead and trail surrogate and gets
* the mapping regardless of the compaction status.
*/
U_CAPI int32_t U_EXPORT2
ucmpe32_getSurrogateEx(CompactEIntArray *array, UChar lead, UChar trail);
/*
 * NOTE(review): this prototype has the same name as the function-like macro
 * ucmpe32_getSurrogate defined just above it, so with that macro in effect
 * the declaration would be macro-expanded and fail to compile. It also lacks
 * the U_EXPORT2 marker its neighbours carry. Likely the pre-change
 * declaration shown by the diff view - confirm before keeping it.
 */
U_CAPI int32_t ucmpe32_getSurrogate(CompactEIntArray *array, UChar lead, UChar trail);
/**
/**
* Set a new value for a Unicode character.
* Set automatically expands the array if it is compacted.
* Do not set if the array is compacted - nothing will happen.
* @param character the character to set the mapped value with
* @param value the new mapped value
*/
@ -273,69 +149,42 @@ U_CAPI void U_EXPORT2 ucmpe32_set32(CompactEIntArray *array,
UChar32 character,
int32_t value);
/**
* alias for compatibility
*/
#define ucmpe32_set(array, character, value) ucmpe32_set32((array), (UChar32)(character), (value))
U_CAPI void U_EXPORT2 ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead,
UChar trail, int32_t value);
/**
*
* Set new values for a range of Unicode character.
* @param start the starting offset of the range
* @param end the ending offset of the range
* Set a new value for a surrogate character.
* Do not set if the array is compacted - nothing will happen.
* Set automatically expands the array if it is compacted.
* Alternatively you can put the surrogate code point together
* yourself and use set32.
* @param lead leading surrogate unit
* @param trail trailing surrogate unit
* @param value the new mapped value
*/
U_CAPI void U_EXPORT2 ucmpe32_setRange(CompactEIntArray* array,
UChar start,
UChar end,
int32_t value);
/**
* Compact the array. The value of cycle determines how large the overlap can be.
* A cycle of 1 is the most compacted, but takes the most time to do.
* If values stored in the array tend to repeat in cycles of, say, 16,
* then using that will be faster than cycle = 1, and get almost the
* same compression.
U_CAPI void U_EXPORT2
ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead,
UChar trail, int32_t value);
/**
* compacts the array.
* This folds the surrogates and compacts the array.
* No setting will succeed after the array is compacted.
* The array has to be compacted in order to be flattened.
*/
U_CAPI void U_EXPORT2 ucmpe32_compact(CompactEIntArray* array, int32_t cycle);
U_CAPI void U_EXPORT2
ucmpe32_compact(CompactEIntArray* this_object);
/**
* Expands the compacted array.
* Takes the array back to a 65536 element array
/**
* Flattens the array to a memory stream.
* Array has to be compacted beforehand.
* @param MS memory stream to flatten to
* @return number of bytes written.
*/
U_CAPI void U_EXPORT2 ucmpe32_expand(CompactEIntArray* array);
/**
* Get the number of elements in the value array.
*
* @return the number of elements in the value array.
*/
U_CAPI uint32_t U_EXPORT2 ucmpe32_getCount(const CompactEIntArray* array);
/**
* Get the address of the value array.
*
* @return the address of the value array
*/
U_CAPI const int32_t* U_EXPORT2 ucmpe32_getArray(const CompactEIntArray* array);
/**
* Get the address of the index array.
*
* @return the address of the index array
*/
U_CAPI const uint16_t* U_EXPORT2 ucmpe32_getIndex(const CompactEIntArray* array);
/*
 * Serialisation entry points.
 * NOTE(review): none of these carry documentation in this view; the remarks
 * below are inferred from names and parameter types only and must be
 * confirmed against the implementation.
 */
/* presumably reads the array contents from the FileStream is */
U_CAPI void U_EXPORT2 ucmpe32_streamIn( CompactEIntArray* array, FileStream* is);
/* presumably writes the array contents to the FileStream os */
U_CAPI void U_EXPORT2 ucmpe32_streamOut(CompactEIntArray* array, FileStream* os);
/* presumably reads the array contents from the UMemoryStream is */
U_CAPI void U_EXPORT2 ucmpe32_streamMemIn( CompactEIntArray* array, UMemoryStream* is);
/* presumably writes the array contents to the UMemoryStream os */
U_CAPI void U_EXPORT2 ucmpe32_streamMemOut(CompactEIntArray* array, UMemoryStream* os);
/* flattens the array into MS; the header notes elsewhere that the result is the number of bytes written */
U_CAPI uint32_t U_EXPORT2 ucmpe32_flattenMem(const CompactEIntArray* array, UMemoryStream *MS);
/* opens an array from a block in memory; source is passed by address, so presumably it is advanced past the consumed bytes - TODO confirm */
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);
/* presumably the same as ucmpe32_openFromData but filling the caller-provided this_obj - TODO confirm */
U_CAPI void U_EXPORT2 ucmpe32_initFromData(CompactEIntArray *this_obj, const uint8_t **source, UErrorCode *status);
U_CAPI uint32_t U_EXPORT2
ucmpe32_flattenMem(const CompactEIntArray* this_object, UMemoryStream *MS);
#endif