ICU-2757 add APIs for NF*_QC properties

X-SVN-Rev: 14892
This commit is contained in:
Markus Scherer 2004-04-07 00:28:39 +00:00
parent d7242682d5
commit 81f1506e2a
8 changed files with 1430 additions and 1246 deletions

View file

@ -339,6 +339,18 @@ typedef enum UProperty {
/** Enumerated property Hangul_Syllable_Type, new in Unicode 4.
Returns UHangulSyllableType values. @draft ICU 2.6 */
UCHAR_HANGUL_SYLLABLE_TYPE,
/** Enumerated property NFD_Quick_Check.
Returns UNormalizationCheckResult values. @draft ICU 3.0 */
UCHAR_NFD_QUICK_CHECK,
/** Enumerated property NFKD_Quick_Check.
Returns UNormalizationCheckResult values. @draft ICU 3.0 */
UCHAR_NFKD_QUICK_CHECK,
/** Enumerated property NFC_Quick_Check.
Returns UNormalizationCheckResult values. @draft ICU 3.0 */
UCHAR_NFC_QUICK_CHECK,
/** Enumerated property NFKC_Quick_Check.
Returns UNormalizationCheckResult values. @draft ICU 3.0 */
UCHAR_NFKC_QUICK_CHECK,
/** One more than the last constant for enumerated/integer Unicode properties. @stable ICU 2.2 */
UCHAR_INT_LIMIT,

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (c) 1996-2003, International Business Machines
* Copyright (c) 1996-2004, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
* File unorm.cpp
@ -1163,6 +1163,32 @@ unorm_addPropertyStarts(USet *set, UErrorCode *pErrorCode) {
uset_add(set, HANGUL_BASE+HANGUL_COUNT); /* add Hangul+1 to continue with other properties */
}
/*
 * Get the NF*_QC quick check property value of a single code point.
 *
 * c:    the code point whose 32-bit value is fetched from normTrie
 * mode: selects which quick check bits of that value are tested;
 *       a mode whose qcMask[] entry is 0, or a mode outside the table,
 *       yields UNORM_YES
 *
 * Returns UNORM_NO if a "no" bit is set for the mode, UNORM_MAYBE if only
 * a "maybe" bit is set, and UNORM_YES otherwise. Also returns UNORM_YES
 * when _haveData() reports that the normalization data is unavailable.
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
    /* indexed by UNormalizationMode; trailing entries are implicitly 0 */
    static const uint32_t qcMask[UNORM_MODE_COUNT]={
        0, 0, _NORM_QC_NFD, _NORM_QC_NFKD, _NORM_QC_NFC, _NORM_QC_NFKC
    };

    UErrorCode errorCode;
    uint32_t norm32;

    errorCode=U_ZERO_ERROR;
    if(!_haveData(errorCode)) {
        return UNORM_YES;
    }

    /*
     * Never index past qcMask[]: treat an out-of-range mode like a mode
     * with an empty mask, which results in UNORM_YES below.
     */
    if((int32_t)mode<0 || (int32_t)mode>=(int32_t)UNORM_MODE_COUNT) {
        return UNORM_YES;
    }

    UTRIE_GET32(&normTrie, c, norm32);
    norm32&=qcMask[mode];

    if(norm32==0) {
        return UNORM_YES;
    } else if(norm32&_NORM_QC_ANY_NO) {
        return UNORM_NO;
    } else /* _NORM_QC_ANY_MAYBE */ {
        return UNORM_MAYBE;
    }
}
/* reorder UTF-16 in-place -------------------------------------------------- */
/*

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2003, International Business Machines
* Copyright (C) 2001-2004, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -389,6 +389,13 @@ unorm_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
 *
 * @param c    the code point to look up
 * @param mode the normalization mode that selects which quick check
 *             property (NFD, NFKD, NFC or NFKC) is returned
 * @return UNORM_NO or UNORM_MAYBE if the corresponding quick check bits
 *         are set for the code point, otherwise UNORM_YES;
 *         also UNORM_YES if the normalization data is not available
 * @internal
 */
U_CAPI UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
/**
* Description of the format of unorm.dat version 2.2.
*

View file

@ -349,6 +349,11 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
return c%JAMO_T_COUNT==0 ? U_HST_LV_SYLLABLE : U_HST_LVT_SYLLABLE;
}
return U_HST_NOT_APPLICABLE;
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFKD_QUICK_CHECK:
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK)+UNORM_NFD);
default:
return 0; /* undefined */
}
@ -407,6 +412,12 @@ u_getIntPropertyMaxValue(UProperty which) {
return max!=0 ? max : (int32_t)USCRIPT_CODE_LIMIT-1;
case UCHAR_HANGUL_SYLLABLE_TYPE:
return (int32_t)U_HST_COUNT-1;
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFKD_QUICK_CHECK:
return (int32_t)UNORM_YES; /* these are never "maybe", only "no" or "yes" */
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return (int32_t)UNORM_MAYBE;
default:
return -1; /* undefined */
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2003, International Business Machines Corporation and
* Copyright (c) 1997-2004, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -32,7 +32,7 @@ void addNormTest(TestNode** root) {
#include "unicode/unorm.h"
#include "cnormtst.h"
#define ARRAY_LENGTH(array) (sizeof (array) / sizeof (*array))
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof ((array)[0]))
static void
TestAPI(void);
@ -51,6 +51,9 @@ static void TestIsNormalized(void);
static void
TestFCNFKCClosure(void);
static void
TestQuickCheckPerCP(void);
const static char* canonTests[][3] = {
/* Input*/ /*Decomposed*/ /*Composed*/
{ "cat", "cat", "cat" },
@ -121,6 +124,7 @@ void addNormTest(TestNode** root)
addTest(root, &TestCompatDecompCompose, "tscoll/cnormtst/CompatDecompCompose");
addTest(root, &TestNull, "tscoll/cnormtst/TestNull");
addTest(root, &TestQuickCheck, "tscoll/cnormtst/TestQuickCheck");
addTest(root, &TestQuickCheckPerCP, "tscoll/cnormtst/TestQuickCheckPerCP");
addTest(root, &TestIsNormalized, "tscoll/cnormtst/TestIsNormalized");
addTest(root, &TestCheckFCD, "tscoll/cnormtst/TestCheckFCD");
addTest(root, &TestNormCoverage, "tscoll/cnormtst/TestNormCoverage");
@ -137,7 +141,7 @@ void TestDecomp()
status = U_ZERO_ERROR;
resLen=0;
log_verbose("Testing unorm_normalize with Decomp canonical\n");
for(x=0; x < ARRAY_LENGTH(canonTests); x++)
for(x=0; x < LENGTHOF(canonTests); x++)
{
source=CharsToUChars(canonTests[x][0]);
neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFD, 0, NULL, 0, &status);
@ -166,7 +170,7 @@ void TestCompatDecomp()
status = U_ZERO_ERROR;
resLen=0;
log_verbose("Testing unorm_normalize with Decomp compat\n");
for(x=0; x < ARRAY_LENGTH(compatTests); x++)
for(x=0; x < LENGTHOF(compatTests); x++)
{
source=CharsToUChars(compatTests[x][0]);
neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFKD, 0, NULL, 0, &status);
@ -195,7 +199,7 @@ void TestCanonDecompCompose()
status = U_ZERO_ERROR;
resLen=0;
log_verbose("Testing unorm_normalize with Decomp can compose compat\n");
for(x=0; x < ARRAY_LENGTH(canonTests); x++)
for(x=0; x < LENGTHOF(canonTests); x++)
{
source=CharsToUChars(canonTests[x][0]);
neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFC, 0, NULL, 0, &status);
@ -224,7 +228,7 @@ void TestCompatDecompCompose()
status = U_ZERO_ERROR;
resLen=0;
log_verbose("Testing unorm_normalize with compat decomp compose can\n");
for(x=0; x < ARRAY_LENGTH(compatTests); x++)
for(x=0; x < LENGTHOF(compatTests); x++)
{
source=CharsToUChars(compatTests[x][0]);
neededLen= unorm_normalize(source, u_strlen(source), UNORM_NFKC, 0, NULL, 0, &status);
@ -503,7 +507,7 @@ static void TestQuickCheckStringResult()
UChar *c = NULL;
UErrorCode error = U_ZERO_ERROR;
for (count = 0; count < ARRAY_LENGTH(canonTests); count ++)
for (count = 0; count < LENGTHOF(canonTests); count ++)
{
d = CharsToUChars(canonTests[count][1]);
c = CharsToUChars(canonTests[count][2]);
@ -525,7 +529,7 @@ static void TestQuickCheckStringResult()
free(c);
}
for (count = 0; count < ARRAY_LENGTH(compatTests); count ++)
for (count = 0; count < LENGTHOF(compatTests); count ++)
{
d = CharsToUChars(compatTests[count][1]);
c = CharsToUChars(compatTests[count][2]);
@ -607,7 +611,7 @@ static void TestIsNormalized(void) {
}
/* specific cases */
for(i=0; i<ARRAY_LENGTH(notNFC); ++i) {
for(i=0; i<LENGTHOF(notNFC); ++i) {
errorCode=U_ZERO_ERROR;
if(unorm_isNormalized(notNFC[i], -1, UNORM_NFC, &errorCode) || U_FAILURE(errorCode)) {
log_err("error: isNormalized(notNFC[%d], NFC) is wrong (%s)\n", i, u_errorName(errorCode));
@ -617,7 +621,7 @@ static void TestIsNormalized(void) {
log_err("error: isNormalized(notNFC[%d], NFKC) is wrong (%s)\n", i, u_errorName(errorCode));
}
}
for(i=0; i<ARRAY_LENGTH(notNFKC); ++i) {
for(i=0; i<LENGTHOF(notNFKC); ++i) {
errorCode=U_ZERO_ERROR;
if(unorm_isNormalized(notNFKC[i], -1, UNORM_NFKC, &errorCode) || U_FAILURE(errorCode)) {
log_err("error: isNormalized(notNFKC[%d], NFKC) is wrong (%s)\n", i, u_errorName(errorCode));
@ -1360,9 +1364,9 @@ TestFCNFKCClosure(void) {
UErrorCode errorCode;
int32_t i, length;
for(i=0; i<ARRAY_LENGTH(tests); ++i) {
for(i=0; i<LENGTHOF(tests); ++i) {
errorCode=U_ZERO_ERROR;
length=u_getFC_NFKC_Closure(tests[i].c, buffer, ARRAY_LENGTH(buffer), &errorCode);
length=u_getFC_NFKC_Closure(tests[i].c, buffer, LENGTHOF(buffer), &errorCode);
if(U_FAILURE(errorCode) || length!=u_strlen(buffer) || 0!=u_strcmp(tests[i].s, buffer)) {
log_err("u_getFC_NFKC_Closure(U+%04lx) is wrong (%s)\n", tests[i].c, u_errorName(errorCode));
}
@ -1370,15 +1374,71 @@ TestFCNFKCClosure(void) {
/* error handling */
errorCode=U_ZERO_ERROR;
length=u_getFC_NFKC_Closure(0x5c, NULL, ARRAY_LENGTH(buffer), &errorCode);
length=u_getFC_NFKC_Closure(0x5c, NULL, LENGTHOF(buffer), &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
log_err("u_getFC_NFKC_Closure(dest=NULL) is wrong (%s)\n", u_errorName(errorCode));
}
length=u_getFC_NFKC_Closure(0x5c, buffer, ARRAY_LENGTH(buffer), &errorCode);
length=u_getFC_NFKC_Closure(0x5c, buffer, LENGTHOF(buffer), &errorCode);
if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR) {
log_err("u_getFC_NFKC_Closure(U_FAILURE) is wrong (%s)\n", u_errorName(errorCode));
}
}
static void
TestQuickCheckPerCP() {
UErrorCode errorCode;
UChar32 c;
UChar s[U16_MAX_LENGTH];
int32_t length;
UNormalizationCheckResult qc1, qc2;
if(
u_getIntPropertyMaxValue(UCHAR_NFD_QUICK_CHECK)!=(int32_t)UNORM_YES ||
u_getIntPropertyMaxValue(UCHAR_NFKD_QUICK_CHECK)!=(int32_t)UNORM_YES ||
u_getIntPropertyMaxValue(UCHAR_NFC_QUICK_CHECK)!=(int32_t)UNORM_MAYBE ||
u_getIntPropertyMaxValue(UCHAR_NFKC_QUICK_CHECK)!=(int32_t)UNORM_MAYBE
) {
log_err("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK)\n");
}
/*
* compare the quick check property values for some code points
* to the quick check results for checking same-code point strings
*/
errorCode=U_ZERO_ERROR;
c=0;
while(c<0x110000) {
length=0;
U16_APPEND_UNSAFE(s, length, c);
qc1=u_getIntPropertyValue(c, UCHAR_NFC_QUICK_CHECK);
qc2=unorm_quickCheck(s, length, UNORM_NFC, &errorCode);
if(qc1!=qc2) {
log_err("u_getIntPropertyValue(NFC)=%d != %d=unorm_quickCheck(NFC) for U+%04x\n", qc1, qc2, c);
}
qc1=u_getIntPropertyValue(c, UCHAR_NFD_QUICK_CHECK);
qc2=unorm_quickCheck(s, length, UNORM_NFD, &errorCode);
if(qc1!=qc2) {
log_err("u_getIntPropertyValue(NFD)=%d != %d=unorm_quickCheck(NFD) for U+%04x\n", qc1, qc2, c);
}
qc1=u_getIntPropertyValue(c, UCHAR_NFKC_QUICK_CHECK);
qc2=unorm_quickCheck(s, length, UNORM_NFKC, &errorCode);
if(qc1!=qc2) {
log_err("u_getIntPropertyValue(NFKC)=%d != %d=unorm_quickCheck(NFKC) for U+%04x\n", qc1, qc2, c);
}
qc1=u_getIntPropertyValue(c, UCHAR_NFKD_QUICK_CHECK);
qc2=unorm_quickCheck(s, length, UNORM_NFKD, &errorCode);
if(qc1!=qc2) {
log_err("u_getIntPropertyValue(NFKD)=%d != %d=unorm_quickCheck(NFKD) for U+%04x\n", qc1, qc2, c);
}
/* skip some code points */
c=(20*c)/19+1;
}
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2002-2003, International Business Machines
* Copyright (C) 2002-2004, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -33,6 +33,7 @@
#include "unicode/uchar.h"
#include "unicode/uscript.h"
#include "unicode/unorm.h"
class AliasName {
public:

View file

@ -1,7 +1,7 @@
#!/bin/perl -w
#*******************************************************************
# COPYRIGHT:
# Copyright (c) 2002-2003, International Business Machines Corporation and
# Copyright (c) 2002-2004, International Business Machines Corporation and
# others. All Rights Reserved.
#*******************************************************************
@ -94,10 +94,6 @@ my %UNSUPPORTED = (Composition_Exclusion => 1,
Expands_On_NFKD => 1,
FC_NFKC_Closure => 1,
ID_Start_Exceptions => 1,
NFC_Quick_Check => 1,
NFD_Quick_Check => 1,
NFKC_Quick_Check => 1,
NFKD_Quick_Check => 1,
Special_Case_Condition => 1,
);
@ -1199,6 +1195,25 @@ sub read_uchar {
$in->close();
# hardcode known values for the normalization quick check properties
# see unorm.h for the UNormalizationCheckResult enum
addDatum($hash, 'NFC_QC', 'UNORM_NO', 'N');
addDatum($hash, 'NFC_QC', 'UNORM_YES', 'Y');
addDatum($hash, 'NFC_QC', 'UNORM_MAYBE', 'M');
addDatum($hash, 'NFKC_QC', 'UNORM_NO', 'N');
addDatum($hash, 'NFKC_QC', 'UNORM_YES', 'Y');
addDatum($hash, 'NFKC_QC', 'UNORM_MAYBE', 'M');
# no "maybe" values for NF[K]D
addDatum($hash, 'NFD_QC', 'UNORM_NO', 'N');
addDatum($hash, 'NFD_QC', 'UNORM_YES', 'Y');
addDatum($hash, 'NFKD_QC', 'UNORM_NO', 'N');
addDatum($hash, 'NFKD_QC', 'UNORM_YES', 'Y');
$hash;
}