ICU-1083 Data structure for surrogate support

X-SVN-Rev: 5453
2025-04-08 06:53:45 +00:00 · 2001-08-10 20:39:52 +00:00 · 2001-08-10 20:39:52 +00:00 · f255a31517
commit f255a31517
parent 20053b3398
2 changed files with 289 additions and 886 deletions
--- a/icu4c/source/common/ucmpe32.c
+++ b/icu4c/source/common/ucmpe32.c
--- a/icu4c/source/common/ucmpe32.h
+++ b/icu4c/source/common/ucmpe32.h
@ -19,6 +19,14 @@
 *   nicely.
 */

+/** 
+ * NOTE: This array is specifically implemented to support surrogates
+ * in the collation framework. Its interface is minimal and its usage model
+ * is far from flexible. Use at your own risk outside of collation.
+ * Risk is also present in the collation framework, but there is hardly
+ * anything you can do about it, save reimplementing the framework.
+ */
+
 #ifndef UCMPE32_H
 #define UCMPE32_H

@ -29,24 +37,6 @@
 #include "filestrm.h"
 #include "umemstrm.h"

-/* INTERNAL CONSTANTS */
-#define UCMPE32_kBlockShift    7
-
-#define UCMPE32_kBlockCount    (1<<UCMPE32_kBlockShift)
-#define UCMPE32_kBlockMask     (UCMPE32_kBlockCount-1)
-
-#define UCMPE32_kSurrogateBlockBits (10 - UCMPE32_kBlockShift)
-#define UCMPE32_kSurrogateBlockCount (1<<UCMPE32_kSurrogateBlockBits)
-
-#define UCMPE32_kIndexShift    (21-UCMPE32_kBlockShift)
-/*#define UCMPE32_kIndexCount    (1<<UCMPE32_kIndexShift)*/
-#define UCMPE32_kIndexCount (0x110000>>UCMPE32_kBlockShift)
-
-/*#define UCMPE32_kIndexBMPCount (1<<(16-UCMPE32_kBlockShift))*/
-#define UCMPE32_kIndexBMPCount (0x10000>>UCMPE32_kBlockShift)
-
-
-#define UCMPE32_kUnicodeCount  0x110000

 /* trie constants */
 enum {
@ -74,161 +64,41 @@ enum {
 /* this may be >0xffff and may not work as an enum */
 #define _UCMPE32_STAGE_1_MAX_COUNT (0x110000>>_UCMPE32_TRIE_SHIFT)

-typedef struct UToolMemory {
-    char name[64];
-    uint32_t count, size, index;
-    uint32_t array[1];
-} UToolMemory;
-
-/**
- * class CompactATypeArray : use only on primitive data types
- * Provides a compact way to store information that is indexed by Unicode
- * values, such as character properties, types, keyboard values, etc.This
- * is very useful when you have a block of Unicode data that contains
- * significant values while the rest of the Unicode data is unused in the
- * application or when you have a lot of redundance, such as where all 21,000
- * Han ideographs have the same value.  However, lookup is much faster than a
- * hash table.
- * <P>
- * A compact array of any primitive data type serves two purposes:
- * <UL type = round>
- *     <LI>Fast access of the indexed values.
- *     <LI>Smaller memory footprint.
- * </UL>
- * <P>
- * The index array always points into particular parts of the data array
- * it is initially set up to point at regular block boundaries
- * The following example uses blocks of 4 for simplicity
- * <PRE>
- * Example: Expanded
- * BLOCK  0   1   2   3   4
- * INDEX  0   4   8   12  16 ...
- * ARRAY  abcdeababcdezyabcdea...
- *        |   |   |   |   |   |...
- * </PRE>
- * <P>
- * After compression, the index will point to various places in the data array
- * wherever there is a runs of the same elements as in the original
- * <PRE>
- * Example: Compressed
- * BLOCK  0   1   2   3   4
- * INDEX  0   4   1   8   2 ...
- * ARRAY  abcdeabazyabc...
- * </PRE>
- * <P>
- * If you look at the example, index number 2 in the expanded version points
- * to data position number 8, which has elements "bcde". In the compressed
- * version, index number 2 points to data position 1, which also has "bcde"
- * @see                CompactByteArray
- * @see                CompactEIntArray
- * @see                CompactCharArray
- * @see                CompactStringArray
- * @version            $Revision: 1.1 $ 8/25/98
- * @author             Helena Shih
- */
-/*====================================
- *CompactEIntArray
- * Provides a compact way to store information that is indexed by Unicode values,
- * such as character properties, types, keyboard values, etc.
- * The ATypes are used by value, so should be small, integers or pointers.
- *====================================
- */
-
 typedef struct CompactEIntArray{
-    uint32_t fStructSize;
-    int32_t* fArray;
-    uint16_t* fIndex;
-    int32_t fCount;
-    UBool fCompact;    
-    UBool fBogus;
-    UBool fAlias;
-    UBool fIAmOwned; /* don't free CBA on close */
-
-  UToolMemory *stage2Mem;
-  uint16_t stage1[_UCMPE32_STAGE_1_MAX_COUNT];
-  uint32_t *stage2;
-  uint16_t stage1Top;
+  uint32_t fStructSize;
+  UBool fCompact;    
+  UBool fAlias;
+  UBool fBogus;

+  uint16_t *stage1;
+  int32_t *stage2;
+  int32_t stage1Top;
+  int32_t stage2Top;
+  int32_t fDefaultValue;
+  int32_t fSurrogateValue;
 } CompactEIntArray;

-    U_CAPI int32_t U_EXPORT2 ucmpe32_getkUnicodeCount(void);
-    U_CAPI int32_t U_EXPORT2 ucmpe32_getkBlockCount(void);
-
    
 /**
 * Construct an empty CompactEIntArray.
 *
 * @param defaultValue the default value for all characters not explicitly in the array
 */
-U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_open(int32_t defaultValue);
+U_CAPI CompactEIntArray* U_EXPORT2 
+ucmpe32_open(int32_t defaultValue, int32_t surrogateValue, UErrorCode *status);

 /**
- * Construct a CompactEIntArray from a pre-computed index and values array. The values
- * will be adopted by the CompactEIntArray. Memory is allocated with uprv_malloc.
- * Note: for speed, the compact method will only re-use blocks in the values array
- * that are on a block boundary. The pre-computed arrays passed in to this constructor
- * may re-use blocks at any position in the values array. The indexArray and
- * newValues will be uprv_free'd when ucmp16_close() is called.
- *
- * @param indexArray the index array to be adopted
- * @param newValues the value array to be adopted
- * @param count the number of entries in the value array
- * @see compact
+ * Opens a compacted read-only array from
+ * a block in memory.
 */
-U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAdopt(uint16_t *indexArray,
-                          int32_t *newValues,
-                          int32_t count);
+U_CAPI  CompactEIntArray* U_EXPORT2 
+ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);

 /**
- * Construct a CompactEIntArray from a pre-computed index and values array. The values
- * will be aliased by the CompactEIntArray. Memory is allocated with uprv_malloc.
- * Note: for speed, the compact method will only re-use blocks in the values array
- * that are on a block boundary. The pre-computed arrays passed in to this constructor
- * may re-use blocks at any position in the values array.
- *
- * @param indexArray the index array to be adopted
- * @param newValues the value array to be adopted
- * @param count the number of entries in the value array
- * @see compact
+ * Clones an array. It can be either compacted or expanded
 */
-U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_openAlias(uint16_t *indexArray,
-                          int32_t *newValues,
-                          int32_t count);
-
-/**
- * Initialize a CompactEIntArray from a pre-computed index and values array. The values
- * will be adopted by the CompactEIntArray. No memory is allocated. Note: for speed,
- * the compact method will only re-use blocks in the values array that are on a block
- * boundary. The pre-computed arrays passed in to this constructor may re-use blocks
- * at any position in the values array. The indexArray and
- * newValues will be uprv_free'd when ucmp16_close() is called.
- *
- * @param indexArray the index array to be adopted
- * @param newValues the value array to be adopted
- * @param count the number of entries in the value array
- * @see compact
- */
-U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAdopt(CompactEIntArray *this_obj,
-                          uint16_t *indexArray,
-                          int32_t *newValues,
-                          int32_t count);
-
-/**
- * Initialize a CompactEIntArray from a pre-computed index and values array. The values
- * will be aliased by the CompactEIntArray. No memory is allocated. Note: for speed,
- * the compact method will only re-use blocks in the values array that are on a block
- * boundary. The pre-computed arrays passed in to this constructor may re-use blocks
- * at any position in the values array.
- *
- * @param indexArray the index array to be adopted
- * @param newValues the value array to be adopted
- * @param count the number of entries in the value array
- * @see compact
- */
-U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
-                          uint16_t *indexArray,
-                          int32_t *newValues,
-                          int32_t count);
+U_CAPI CompactEIntArray* U_EXPORT2 
+ucmpe32_clone(CompactEIntArray* orig, UErrorCode *status);

 /**
 * Free up any allocated memory associated with this compact array.
@ -239,33 +109,39 @@ U_CAPI CompactEIntArray* U_EXPORT2 ucmpe32_initAlias(CompactEIntArray *this_obj,
 */
 U_CAPI void U_EXPORT2 ucmpe32_close(CompactEIntArray* array);

-/**
- * Returns TRUE if the creation of the compact array fails.
- */
-U_CAPI  UBool U_EXPORT2 ucmpe32_isBogus(const CompactEIntArray* array);
-
 /**
 * Get the mapped value of a Unicode character.
 *
 * @param index the character to get the mapped value with
 * @return the mapped value of the given character
 */
+#define ucmpe32_get(this_obj, index) ((this_obj)->stage2[((this_obj)->stage1[((index) >> _UCMPE32_TRIE_SHIFT)]) + \
+                           ((index) & _UCMPE32_STAGE_2_MASK)]) /* NOTE: macro; this_obj and index are each evaluated twice */

-#if 0
-#define ucmpe32_get(array, index) (array->fArray[(array->fIndex[(index >> UCMPE32_kBlockShift)<< UCMPE32_kBlockShift] )+ \
-                           (index & UCMPE32_kBlockMask)])
-#endif
+/** 
+ * Get the mapped value of a confirmed surrogate. First value already comes 
+ * from the trie and is combined with the following value in order to get
+ * the value. THIS CAN ONLY BE USED ON A COMPACTED TRIE. You will get wrong
+ * results if you try it on an expanded one.
+ * NO ERROR CHECKING IS PERFORMED! PREPARE YOUR DATA CAREFULLY!
+ * @param leadValue32 the mapping of the leading surrogate.
+ * @param trail the trailing surrogate
+ * @return the mapped value of the given character
+ */

-U_CAPI int32_t U_EXPORT2 ucmpe32_get32(CompactEIntArray *array, UChar32 index);
-#define ucmpe32_get(array, index) ucmpe32_get32((array), (UChar32)(index))
-#define ucmpe32_getu(array, index) (uint16_t)ucmpe32_get(array, index)
+#define ucmpe32_getSurrogate(this_obj, leadValue32, trail) ucmpe32_get((this_obj), \
+                           (((leadValue32) & 0xffc00) | ((trail) & 0x3ff))) /* arguments parenthesized so expressions bind correctly */

+/**
+ * This is a slow function that takes lead and trail surrogate and gets
+ * the mapping regardless of the compaction status. 
+ */
+U_CAPI int32_t U_EXPORT2 
+ucmpe32_getSurrogateEx(CompactEIntArray *array, UChar lead, UChar trail);

-U_CAPI int32_t ucmpe32_getSurrogate(CompactEIntArray *array, UChar lead, UChar trail);
-
- /**
+/**
 * Set a new value for a Unicode character.
- * Set automatically expands the array if it is compacted.
+ * Do not set if the array is compacted - nothing will happen.
 * @param character the character to set the mapped value with
 * @param value the new mapped value
 */
@ -273,69 +149,42 @@ U_CAPI  void U_EXPORT2 ucmpe32_set32(CompactEIntArray *array,
                  UChar32 character,
                  int32_t value);

+/** 
+ * alias for compatibility
+ */
 #define ucmpe32_set(array, character, value) ucmpe32_set32((array), (UChar32)(character), (value))

-U_CAPI void  U_EXPORT2 ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead, 
-                           UChar trail, int32_t value);

 /**
- *
- * Set new values for a range of Unicode character.
- * @param start the starting offset of the range
- * @param end the ending offset of the range
+ * Set a new value for a surrogate character.
+ * Do not set if the array is compacted - nothing will happen
+ * (a compacted array is not expanded automatically).
+ * Alternatively you can put the surrogate code point together 
+ * yourself and use set32.
+ * @param lead leading surrogate unit
+ * @param trail trailing surrogate unit
 * @param value the new mapped value
 */
-U_CAPI  void U_EXPORT2 ucmpe32_setRange(CompactEIntArray* array,
-                   UChar start,
-                   UChar end, 
-                   int32_t value);
-
-/**
- * Compact the array. The value of cycle determines how large the overlap can be.
- * A cycle of 1 is the most compacted, but takes the most time to do.
- * If values stored in the array tend to repeat in cycles of, say, 16,
- * then using that will be faster than cycle = 1, and get almost the
- * same compression.
+U_CAPI void  U_EXPORT2 
+ucmpe32_setSurrogate(CompactEIntArray* this_obj, UChar lead, 
+                           UChar trail, int32_t value);
+/** 
+ * Compacts the array. 
+ * This folds the surrogates and compacts the array.
+ * No setting will succeed after the array is compacted.
+ * The array has to be compacted in order to be flattened.
 */
-U_CAPI  void U_EXPORT2 ucmpe32_compact(CompactEIntArray* array, int32_t cycle);
+U_CAPI  void U_EXPORT2 
+ucmpe32_compact(CompactEIntArray* this_object);

-/**
- * Expands the compacted array.
- * Takes the array back to a 65536 element array
+/** 
+ * Flattens the array to a memory stream.
+ * Array has to be compacted beforehand.
+ * @param MS memory stream to flatten to
+ * @return number of bytes written.
 */
-U_CAPI  void U_EXPORT2 ucmpe32_expand(CompactEIntArray* array);
-
-/**
- * Get the number of elements in the value array.
- *
- * @return the number of elements in the value array.
- */
-U_CAPI  uint32_t U_EXPORT2 ucmpe32_getCount(const CompactEIntArray* array);
-
-/**
- * Get the address of the value array.
- *
- * @return the address of the value array
- */
-U_CAPI  const int32_t* U_EXPORT2 ucmpe32_getArray(const CompactEIntArray* array);
-
-/**
- * Get the address of the index array.
- *
- * @return the address of the index array
- */
-U_CAPI  const uint16_t* U_EXPORT2 ucmpe32_getIndex(const CompactEIntArray* array);
-
-U_CAPI void U_EXPORT2 ucmpe32_streamIn( CompactEIntArray* array, FileStream* is);
-U_CAPI void U_EXPORT2 ucmpe32_streamOut(CompactEIntArray* array, FileStream* os);
-
-U_CAPI void U_EXPORT2 ucmpe32_streamMemIn( CompactEIntArray* array, UMemoryStream* is);
-U_CAPI void U_EXPORT2 ucmpe32_streamMemOut(CompactEIntArray* array, UMemoryStream* os);
-
-U_CAPI  uint32_t U_EXPORT2 ucmpe32_flattenMem(const CompactEIntArray* array, UMemoryStream *MS);
-
-U_CAPI  CompactEIntArray* U_EXPORT2 ucmpe32_openFromData( const uint8_t **source, UErrorCode *status);
-U_CAPI  void U_EXPORT2 ucmpe32_initFromData(CompactEIntArray *this_obj, const uint8_t **source, UErrorCode *status);
+U_CAPI  uint32_t U_EXPORT2 
+ucmpe32_flattenMem(const CompactEIntArray* this_object, UMemoryStream *MS);

 #endif