mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-08 06:53:45 +00:00
ICU-1083 Data structure for surrogate support
X-SVN-Rev: 5453
This commit is contained in:
parent
20053b3398
commit
f255a31517
2 changed files with 289 additions and 886 deletions
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,14 @@
|
|||
* nicely.
|
||||
*/
|
||||
|
||||
/**
|
||||
* NOTE: This array is specifically implemented to support surrogates
|
||||
* in the collation framework. Its interface is minimal and usage model
|
||||
* is far from flexible. Use at your own risk outside of collation.
|
||||
* Risk is also present in the collation framework, but there is hardly
|
||||
* anything you can do about it, save reimplementing the framework
|
||||
*/
|
||||
|
||||
#ifndef UCMPE32_H
|
||||
#define UCMPE32_H
|
||||
|
||||
|
@ -29,24 +37,6 @@
|
|||
#include "filestrm.h"
|
||||
#include "umemstrm.h"
|
||||
|
||||
/* INTERNAL CONSTANTS */
|
||||
#define UCMPE32_kBlockShift 7
|
||||
|
||||
#define UCMPE32_kBlockCount (1<<UCMPE32_kBlockShift)
|
||||
#define UCMPE32_kBlockMask (UCMPE32_kBlockCount-1)
|
||||
|
||||
#define UCMPE32_kSurrogateBlockBits (10 - UCMPE32_kBlockShift)
|
||||
#define UCMPE32_kSurrogateBlockCount (1<<UCMPE32_kSurrogateBlockBits)
|
||||
|
||||
#define UCMPE32_kIndexShift (21-UCMPE32_kBlockShift)
|
||||
/*#define UCMPE32_kIndexCount (1<<UCMPE32_kIndexShift)*/
|
||||
#define UCMPE32_kIndexCount (0x110000>>UCMPE32_kBlockShift)
|
||||
|
||||
/*#define UCMPE32_kIndexBMPCount (1<<(16-UCMPE32_kBlockShift))*/
|
||||
#define UCMPE32_kIndexBMPCount (0x10000>>UCMPE32_kBlockShift)
|
||||
|
||||
|
||||
#define UCMPE32_kUnicodeCount 0x110000
|
||||
|
||||
/* trie constants */
|
||||
enum {
|
||||
|
@ -74,161 +64,41 @@ enum {
|
|||
/* this may be >0xffff and may not work as an enum */
|
||||
#define _UCMPE32_STAGE_1_MAX_COUNT (0x110000>>_UCMPE32_TRIE_SHIFT)
|
||||
|
||||
typedef struct UToolMemory {
|
||||
char name[64];
|
||||
uint32_t count, size, index;
|
||||
uint32_t array[1];
|
||||
} UToolMemory;
|
||||
|
||||
/**
|
||||
* class CompactATypeArray : use only on primitive data types
|
||||
* Provides a compact way to store information that is indexed by Unicode
|
||||
* values, such as character properties, types, keyboard values, etc. This
|
||||
* is very useful when you have a block of Unicode data that contains
|
||||
* significant values while the rest of the Unicode data is unused in the
|
||||
* application or when you have a lot of redundancy, such as where all 21,000
|
||||
* Han ideographs have the same value. However, lookup is much faster than a
|
||||
* hash table.
|
||||
* <P>
|
||||
* A compact array of any primitive data type serves two purposes:
|
||||
* <UL type = round>
|
||||
* <LI>Fast access of the indexed values.
|
||||
* <LI>Smaller memory footprint.
|
||||
* </UL>
|
||||
* <P>
|
||||
* The index array always points into particular parts of the data array
|
||||
* it is initially set up to point at regular block boundaries
|
||||
* The following example uses blocks of 4 for simplicity
|
||||
* <PRE>
|
||||
* Example: Expanded
|
||||
* BLOCK 0 1 2 3 4
|
||||
* INDEX 0 4 8 12 16 ...
|
||||
* ARRAY abcdeababcdezyabcdea...
|
||||
* | | | | | |...
|
||||
* </PRE>
|
||||
* <P>
|
||||
* After compression, the index will point to various places in the data array
|
||||
* wherever there is a run of the same elements as in the original
|
||||
* <PRE>
|
||||
* Example: Compressed
|
||||
* BLOCK 0 1 2 3 4
|
||||
* INDEX 0 4 1 8 2 ...
|
||||
* ARRAY abcdeabazyabc...
|
||||
* </PRE>
|
||||
* <P>
|
||||
* If you look at the example, index number 2 in the expanded version points
|
||||
* to data position number 8, which has elements "bcde". In the compressed
|
||||
* version, index number 2 points to data position 1, which also has "bcde"
|
||||
* @see CompactByteArray
|
||||
* @see CompactEIntArray
|
||||
* @see CompactCharArray
|
||||
* @see CompactStringArray
|
||||
* @version $Revision: 1.1 $ 8/25/98
|
||||
* @author Helena Shih
|
||||
*/
|
||||
/*====================================
|
||||
*CompactEIntArray
|
||||
* Provides a compact way to store information that is indexed by Unicode values,
|
||||
* such as character properties, types, keyboard values, etc.
|
||||
* The ATypes are used by value, so should be small, integers or pointers.
|
||||
*====================================
|
||||
*/
|
||||
|
||||
typedef struct CompactEIntArray{
|
||||
uint32_t fStructSize;
|
||||
int32_t* fArray;
|
||||
uint16_t* fIndex;
|
||||
int32_t fCount;
|
||||
UBool fCompact;
|
||||
UBool fBogus;
|
||||
UBool fAlias;
|
||||
UBool fIAmOwned; /* don't free CBA on close */
|
||||
|
||||
UToolMemory *stage2Mem;
|
||||
uint16_t stage1[_UCMPE32_STAGE_1_MAX_COUNT];
|
||||
uint32_t *stage2;
|
||||
uint16_t stage1Top;
|
||||
uint32_t fStructSize;
|
||||
UBool fCompact;
|
||||
UBool fAlias;
|
||||
UBool fBogus;
|
||||
|
||||
uint16_t *stage1;
|
||||
int32_t *stage2;
|
||||
int32_t stage1Top;
|
||||
int32_t stage2Top;
|
||||
int32_t fDefaultValue;
|
||||
int32_t fSurrogateValue;
|
||||
} CompactEIntArray;
|
||||
|
||||
U_CAPI int32_t U_EXPORT2 ucmpe32_getkUnicodeCount(void);
|
||||
U_CAPI int32_t U_EXPORT2 ucmpe32_getkBlockCount(void);
|
||||
|
||||
|
||||
/**
|
||||
* Construct an empty CompactEIntArray.
|
||||
*
|
||||
* @param defaultValue the default value for all characters not explicitly in the array
|
||||
*/
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_open(int32_t defaultValue);
|
||||
U_CAPI CompactEIntArray* U_EXPORT2
|
||||
ucmpe32_open(int32_t defaultValue, int32_t surrogateValue, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Construct a CompactEIntArray from a pre-computed index and values array. The values
|
||||
* will be adopted by the CompactEIntArray. Memory is allocated with uprv_malloc.
|
||||
* Note: for speed, the compact method will only re-use blocks in the values array
|
||||
* that are on a block boundary. The pre-computed arrays passed in to this constructor
|
||||
* may re-use blocks at any position in the values array. The indexArray and
|
||||
* newValues will be uprv_free'd when ucmpe32_close() is called.
|
||||
*
|
||||
* @param indexArray the index array to be adopted
|
||||
* @param newValues the value array to be adopted
|
||||
* @param count the number of entries in the value array
|
||||
* @see compact
|
||||
* Opens a compacted read-only array from
|
||||
* a block in memory.
|
||||
*/
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAdopt(uint16_t *indexArray,
|
||||
int32_t *newValues,
|
||||
int32_t count);
|
||||
U_CAPI CompactEIntArray* U_EXPORT2
|
||||
ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Construct a CompactEIntArray from a pre-computed index and values array. The values
|
||||
* will be aliased by the CompactEIntArray. Memory is allocated with uprv_malloc.
|
||||
* Note: for speed, the compact method will only re-use blocks in the values array
|
||||
* that are on a block boundary. The pre-computed arrays passed in to this constructor
|
||||
* may re-use blocks at any position in the values array.
|
||||
*
|
||||
* @param indexArray the index array to be aliased
|
||||
* @param newValues the value array to be aliased
|
||||
* @param count the number of entries in the value array
|
||||
* @see compact
|
||||
* Clones an array. It can be either compacted or expanded
|
||||
*/
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAlias(uint16_t *indexArray,
|
||||
int32_t *newValues,
|
||||
int32_t count);
|
||||
|
||||
/**
|
||||
* Initialize a CompactEIntArray from a pre-computed index and values array. The values
|
||||
* will be adopted by the CompactEIntArray. No memory is allocated. Note: for speed,
|
||||
* the compact method will only re-use blocks in the values array that are on a block
|
||||
* boundary. The pre-computed arrays passed in to this constructor may re-use blocks
|
||||
* at any position in the values array. The indexArray and
|
||||
* newValues will be uprv_free'd when ucmpe32_close() is called.
|
||||
*
|
||||
* @param indexArray the index array to be adopted
|
||||
* @param newValues the value array to be adopted
|
||||
* @param count the number of entries in the value array
|
||||
* @see compact
|
||||
*/
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAdopt(CompactEIntArray *this_obj,
|
||||
uint16_t *indexArray,
|
||||
int32_t *newValues,
|
||||
int32_t count);
|
||||
|
||||
/**
|
||||
* Initialize a CompactEIntArray from a pre-computed index and values array. The values
|
||||
* will be aliased by the CompactEIntArray. No memory is allocated. Note: for speed,
|
||||
* the compact method will only re-use blocks in the values array that are on a block
|
||||
* boundary. The pre-computed arrays passed in to this constructor may re-use blocks
|
||||
* at any position in the values array.
|
||||
*
|
||||
* @param indexArray the index array to be aliased
|
||||
* @param newValues the value array to be aliased
|
||||
* @param count the number of entries in the value array
|
||||
* @see compact
|
||||
*/
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
|
||||
uint16_t *indexArray,
|
||||
int32_t *newValues,
|
||||
int32_t count);
|
||||
U_CAPI CompactEIntArray* U_EXPORT2
|
||||
ucmpe32_clone(CompactEIntArray* orig, UErrorCode *status);
|
||||
|
||||
/**
|
||||
* Free up any allocated memory associated with this compact array.
|
||||
|
@ -239,33 +109,39 @@ U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
|
|||
*/
|
||||
U_CAPI void U_EXPORT2 ucmpe32_close(CompactEIntArray* array);
|
||||
|
||||
/**
|
||||
* Returns TRUE if the creation of the compact array fails.
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2 ucmpe32_isBogus(const CompactEIntArray* array);
|
||||
|
||||
/**
|
||||
* Get the mapped value of a Unicode character.
|
||||
*
|
||||
* @param index the character to get the mapped value with
|
||||
* @return the mapped value of the given character
|
||||
*/
|
||||
#define ucmpe32_get(this_obj, index) (this_obj->stage2[(this_obj->stage1[(index >> _UCMPE32_TRIE_SHIFT)] )+ \
|
||||
(index & _UCMPE32_STAGE_2_MASK)])
|
||||
|
||||
#if 0
|
||||
#define ucmpe32_get(array, index) (array->fArray[(array->fIndex[(index >> UCMPE32_kBlockShift)<< UCMPE32_kBlockShift] )+ \
|
||||
(index & UCMPE32_kBlockMask)])
|
||||
#endif
|
||||
/**
|
||||
* Get the mapped value of a confirmed surrogate. First value already comes
|
||||
* from the trie and is combined with the following value in order to get
|
||||
* the value. THIS CAN BE ONLY USED ON A COMPACTED TRIE. You will get wrong
|
||||
* results if you try it on the expanded one
|
||||
* NO ERROR CHECKING IS PERFORMED! PREPARE YOUR DATA CAREFULLY!
|
||||
* @param leadValue32 the mapping of the leading surrogate.
|
||||
* @param trail the trailing surrogate
|
||||
* @return the mapped value of the given character
|
||||
*/
|
||||
|
||||
U_CAPI int32_t U_EXPORT2 ucmpe32_get32(CompactEIntArray *array, UChar32 index);
|
||||
#define ucmpe32_get(array, index) ucmpe32_get32((array), (UChar32)(index))
|
||||
#define ucmpe32_getu(array, index) (uint16_t)ucmpe32_get(array, index)
|
||||
#define ucmpe32_getSurrogate(this_obj, leadValue32, trail) ucmpe32_get(this_obj, \
|
||||
((leadValue32 & 0xffc00) | (trail & 0x3ff)))
|
||||
|
||||
/**
|
||||
* This is a slow function that takes lead and trail surrogate and gets
|
||||
* the mapping regardless of the compaction status.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
ucmpe32_getSurrogateEx(CompactEIntArray *array, UChar lead, UChar trail);
|
||||
|
||||
U_CAPI int32_t ucmpe32_getSurrogate(CompactEIntArray *array, UChar lead, UChar trail);
|
||||
|
||||
/**
|
||||
/**
|
||||
* Set a new value for a Unicode character.
|
||||
* Set automatically expands the array if it is compacted.
|
||||
* Do not set if the array is compacted - nothing will happen.
|
||||
* @param character the character to set the mapped value with
|
||||
* @param value the new mapped value
|
||||
*/
|
||||
|
@ -273,69 +149,42 @@ U_CAPI void U_EXPORT2 ucmpe32_set32(CompactEIntArray *array,
|
|||
UChar32 character,
|
||||
int32_t value);
|
||||
|
||||
/**
|
||||
* alias for compatibility
|
||||
*/
|
||||
#define ucmpe32_set(array, character, value) ucmpe32_set32((array), (UChar32)(character), (value))
|
||||
|
||||
U_CAPI void U_EXPORT2 ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead,
|
||||
UChar trail, int32_t value);
|
||||
|
||||
/**
|
||||
*
|
||||
* Set new values for a range of Unicode characters.
|
||||
* @param start the starting offset of the range
|
||||
* @param end the ending offset of the range
|
||||
* Set a new value for a surrogate character.
|
||||
* Do not set if the array is compacted - nothing will happen.
|
||||
* Set automatically expands the array if it is compacted.
|
||||
* Alternatively you can put the surrogate code point together
|
||||
* yourself and use set32.
|
||||
* @param lead leading surrogate unit
|
||||
* @param trail trailing surrogate unit
|
||||
* @param value the new mapped value
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucmpe32_setRange(CompactEIntArray* array,
|
||||
UChar start,
|
||||
UChar end,
|
||||
int32_t value);
|
||||
|
||||
/**
|
||||
* Compact the array. The value of cycle determines how large the overlap can be.
|
||||
* A cycle of 1 is the most compacted, but takes the most time to do.
|
||||
* If values stored in the array tend to repeat in cycles of, say, 16,
|
||||
* then using that will be faster than cycle = 1, and get almost the
|
||||
* same compression.
|
||||
U_CAPI void U_EXPORT2
|
||||
ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead,
|
||||
UChar trail, int32_t value);
|
||||
/**
|
||||
* compacts the array.
|
||||
* This folds the surrogates and compacts the array.
|
||||
* no setting will succeed after the array is compacted.
|
||||
* The array has to be compacted in order to be flattened.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucmpe32_compact(CompactEIntArray* array, int32_t cycle);
|
||||
U_CAPI void U_EXPORT2
|
||||
ucmpe32_compact(CompactEIntArray* this_object);
|
||||
|
||||
/**
|
||||
* Expands the compacted array.
|
||||
* Takes the array back to a 65536 element array
|
||||
/**
|
||||
* Flattens the array to a memory stream.
|
||||
* Array has to be compacted beforehand.
|
||||
* @param MS memory stream to flatten to
|
||||
* @return number of bytes written.
|
||||
*/
|
||||
U_CAPI void U_EXPORT2 ucmpe32_expand(CompactEIntArray* array);
|
||||
|
||||
/**
|
||||
* Get the number of elements in the value array.
|
||||
*
|
||||
* @return the number of elements in the value array.
|
||||
*/
|
||||
U_CAPI uint32_t U_EXPORT2 ucmpe32_getCount(const CompactEIntArray* array);
|
||||
|
||||
/**
|
||||
* Get the address of the value array.
|
||||
*
|
||||
* @return the address of the value array
|
||||
*/
|
||||
U_CAPI const int32_t* U_EXPORT2 ucmpe32_getArray(const CompactEIntArray* array);
|
||||
|
||||
/**
|
||||
* Get the address of the index array.
|
||||
*
|
||||
* @return the address of the index array
|
||||
*/
|
||||
U_CAPI const uint16_t* U_EXPORT2 ucmpe32_getIndex(const CompactEIntArray* array);
|
||||
|
||||
U_CAPI void U_EXPORT2 ucmpe32_streamIn( CompactEIntArray* array, FileStream* is);
|
||||
U_CAPI void U_EXPORT2 ucmpe32_streamOut(CompactEIntArray* array, FileStream* os);
|
||||
|
||||
U_CAPI void U_EXPORT2 ucmpe32_streamMemIn( CompactEIntArray* array, UMemoryStream* is);
|
||||
U_CAPI void U_EXPORT2 ucmpe32_streamMemOut(CompactEIntArray* array, UMemoryStream* os);
|
||||
|
||||
U_CAPI uint32_t U_EXPORT2 ucmpe32_flattenMem(const CompactEIntArray* array, UMemoryStream *MS);
|
||||
|
||||
U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);
|
||||
U_CAPI void U_EXPORT2 ucmpe32_initFromData(CompactEIntArray *this_obj, const uint8_t **source, UErrorCode *status);
|
||||
U_CAPI uint32_t U_EXPORT2
|
||||
ucmpe32_flattenMem(const CompactEIntArray* this_object, UMemoryStream *MS);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue