mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-2481 make the implementation of tailored normalization permanent; remove the a-umlaut option
X-SVN-Rev: 11276
This commit is contained in:
parent
84adae7885
commit
a32165212d
2 changed files with 30 additions and 55 deletions
|
@ -27,6 +27,8 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cmemory.h"
|
||||
#include "ustr_imp.h"
|
||||
|
@ -35,24 +37,31 @@
|
|||
#include "unicode/uset.h"
|
||||
#include "unormimp.h"
|
||||
|
||||
/* ### TODO: These depend on whether tailored normalization becomes permanent. */
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/usetiter.h"
|
||||
|
||||
/*
|
||||
* ### TODO: status of prototype for tailored normalization
|
||||
* Status of tailored normalization
|
||||
*
|
||||
* My main thrust so far was for unorm_normalize() and unorm_quickCheck().
|
||||
* isNormalized() should work, I think.
|
||||
* I have not yet thought about iterative normalization at all.
|
||||
* This was done initially for investigation on Unicode public review issue 7
|
||||
* (http://www.unicode.org/review/). See Jitterbug 2481.
|
||||
* While the UTC at meeting #94 (2003mar) did not take up the issue, this is
|
||||
* a permanent feature in ICU 2.6 in support of IDNA which requires true
|
||||
* Unicode 3.2 normalization.
|
||||
* (NormalizationCorrections are rolled into IDNA mapping tables.)
|
||||
*
|
||||
* Generally, any function that searches for a safe boundary has not been touched,
|
||||
* Tailored normalization as implemented here allows one to "normalize less"
|
||||
* than full Unicode normalization would.
|
||||
* Based internally on a UnicodeSet of code points that are
|
||||
* "excluded from normalization", the normalization functions leave those
|
||||
* code points alone ("inert"). This means that tailored normalization
|
||||
* still transforms text into a canonically equivalent form.
|
||||
* It does not add decompositions to code points that do not have any or
|
||||
* change decomposition results.
|
||||
*
|
||||
* Any function that searches for a safe boundary has not been touched,
|
||||
* which means that these functions will be over-pessimistic when
|
||||
* exclusions are applied.
|
||||
* This may not matter because subsequent checks and normalizations do apply the exclusions.
|
||||
*
|
||||
* 2003feb25: Added support for Unicode 3.2 normalization, for IDNA.
|
||||
* This excludes all post-Unicode 3.2 code points.
|
||||
* This should not matter because subsequent checks and normalizations
|
||||
* do apply the exclusions; only a little more of the text may be processed
|
||||
* than necessary under exclusions.
|
||||
*
|
||||
* Normalization exclusions have the following effect on excluded code points c:
|
||||
* - c is not decomposed
|
||||
|
@ -75,11 +84,12 @@ enum {
|
|||
_STACK_BUFFER_CAPACITY=100
|
||||
};
|
||||
|
||||
/* ### TODO prototype
|
||||
/*
|
||||
* Constants for the bit fields in the options bit set parameter.
|
||||
* These need not be public.
|
||||
* A user only needs to know the currently assigned values.
|
||||
* The number and positions of reserved bits per field can remain private.
|
||||
* The number and positions of reserved bits per field can remain private
|
||||
* and may change in future implementations.
|
||||
*/
|
||||
enum {
|
||||
_NORM_OPTIONS_NX_MASK=0x1f,
|
||||
|
@ -175,7 +185,7 @@ static UBool formatVersion_2_1=FALSE, formatVersion_2_2=FALSE;
|
|||
/* the Unicode version of the normalization data */
|
||||
static UVersionInfo dataVersion={ 3, 1, 0, 0 };
|
||||
|
||||
/* ### TODO: prototype ### cache UnicodeSets for each combination of exclusion flags */
|
||||
/* cache UnicodeSets for each combination of exclusion flags */
|
||||
static UnicodeSet *nxCache[_NORM_OPTIONS_SETS_MASK+1]={ NULL };
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
@ -411,13 +421,11 @@ _getExtraData(uint32_t norm32) {
|
|||
/* normalization exclusion sets --------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Normalization exclusion UnicodeSets are used for tailored normalization,
|
||||
* Unicode public review issue number 7. (http://www.unicode.org/review/)
|
||||
* Normalization exclusion UnicodeSets are used for tailored normalization;
|
||||
* see the comment near the beginning of this file.
|
||||
*
|
||||
* By specifying one or several sets of code points,
|
||||
* those code points become inert for normalization.
|
||||
*
|
||||
* ### TODO: This is a prototype. Assess if it should become a permanent part of ICU.
|
||||
*/
|
||||
|
||||
static const UnicodeSet *
|
||||
|
@ -503,30 +511,6 @@ internalGetNXCJKCompat(UErrorCode &errorCode) {
|
|||
return nxCache[UNORM_NX_CJK_COMPAT];
|
||||
}
|
||||
|
||||
static const UnicodeSet *
|
||||
internalGetNXAUmlaut(UErrorCode &errorCode) {
|
||||
/* internal function, does not check for incoming U_FAILURE */
|
||||
|
||||
if(nxCache[UNORM_NX_A_UMLAUT]==NULL) {
|
||||
UnicodeSet *set=new UnicodeSet(0xe4, 0xe4);
|
||||
if(set==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
umtx_lock(NULL);
|
||||
if(nxCache[UNORM_NX_A_UMLAUT]==NULL) {
|
||||
nxCache[UNORM_NX_A_UMLAUT]=set;
|
||||
set=NULL;
|
||||
}
|
||||
umtx_unlock(NULL);
|
||||
|
||||
delete set;
|
||||
}
|
||||
|
||||
return nxCache[UNORM_NX_A_UMLAUT];
|
||||
}
|
||||
|
||||
static const UnicodeSet *
|
||||
internalGetNXUnicode(uint32_t options, UErrorCode &errorCode) {
|
||||
/* internal function, does not check for incoming U_FAILURE */
|
||||
|
@ -583,9 +567,6 @@ internalGetNX(int32_t options, UErrorCode &errorCode) {
|
|||
if(options==UNORM_NX_CJK_COMPAT) {
|
||||
return internalGetNXCJKCompat(errorCode);
|
||||
}
|
||||
if(options==UNORM_NX_A_UMLAUT) {
|
||||
return internalGetNXAUmlaut(errorCode);
|
||||
}
|
||||
if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && (options&_NORM_OPTIONS_NX_MASK)==0) {
|
||||
return internalGetNXUnicode(options, errorCode);
|
||||
}
|
||||
|
@ -606,9 +587,6 @@ internalGetNX(int32_t options, UErrorCode &errorCode) {
|
|||
if((options&UNORM_NX_CJK_COMPAT)!=0 && NULL!=(other=internalGetNXCJKCompat(errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
if((options&UNORM_NX_A_UMLAUT)!=0 && NULL!=(other=internalGetNXAUmlaut(errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
if((options&_NORM_OPTIONS_UNICODE_MASK)!=0 && NULL!=(other=internalGetNXUnicode(options, errorCode))) {
|
||||
set->addAll(*other);
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ enum {
|
|||
_NORM_EXTRA_SURROGATE_TOP=0x3f0, /* hangul etc. */
|
||||
|
||||
_NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP,
|
||||
_NORM_EXTRA_JAMO_L, /* ### not used */
|
||||
_NORM_EXTRA_JAMO_L,
|
||||
_NORM_EXTRA_JAMO_V,
|
||||
_NORM_EXTRA_JAMO_T
|
||||
};
|
||||
|
@ -162,14 +162,12 @@ enum {
|
|||
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT
|
||||
};
|
||||
|
||||
/* Constants for options flags for normalization. ### TODO prototype, see unorm.cpp */
|
||||
/* Constants for options flags for normalization. @draft ICU 2.6 */
|
||||
enum {
|
||||
/** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
|
||||
UNORM_NX_HANGUL=1,
|
||||
/** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */
|
||||
UNORM_NX_CJK_COMPAT=2,
|
||||
/** Options bit 2, do not decompose a-umlaut, only for testing. @internal */
|
||||
UNORM_NX_A_UMLAUT=4,
|
||||
|
||||
/**
|
||||
* Options bit set value to select Unicode 3.2 normalization (except NormalizationCorrections).
|
||||
|
@ -189,7 +187,6 @@ enum {
|
|||
* to the normalization implementation.
|
||||
* (options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT)
|
||||
*
|
||||
* ### TODO prototype, see unorm.cpp
|
||||
* @draft ICU 2.6
|
||||
*/
|
||||
#define UNORM_COMPARE_NORM_OPTIONS_SHIFT 20
|
||||
|
|
Loading…
Add table
Reference in a new issue