ICU-7182 remove support for Unicode Normalization behavior before corrigendum 5 (PRI #29)

X-SVN-Rev: 26952
This commit is contained in:
Markus Scherer 2009-11-19 00:46:36 +00:00
parent 3b12074b40
commit 9fca8644b5
5 changed files with 34 additions and 119 deletions

View file

@@ -2090,8 +2090,8 @@ _recompose(UCharBuffer &buffer, int32_t options, const UnicodeSet *nx) {
if((combineFlags&_NORM_COMBINES_BACK) && starter!=NULL) {
if(combineBackIndex&0x8000) {
/* c is a Jamo V/T, see if we can compose it with the previous character */
/* for the PRI #29 fix, check that there is no intervening combining mark */
if((options&UNORM_BEFORE_PRI_29) || prevCC==0) {
/* check that there is no intervening combining mark */
if(prevCC==0) {
pRemove=NULL; /* NULL while no Hangul composition */
combineFlags=0;
c2=*starter;
@@ -2104,9 +2104,6 @@ _recompose(UCharBuffer &buffer, int32_t options, const UnicodeSet *nx) {
if(p!=limit && (c2=(UChar)(*p-JAMO_T_BASE))<JAMO_T_COUNT) {
++p;
c+=c2;
} else {
/* the result is an LV syllable, which is a starter (unlike LVT) */
combineFlags=_NORM_COMBINES_FWD;
}
if(!nx_contains(nx, c)) {
*starter=c;
@@ -2119,26 +2116,13 @@ _recompose(UCharBuffer &buffer, int32_t options, const UnicodeSet *nx) {
pRemove=NULL;
}
}
}
/*
* Normally, the following can not occur:
* No "else" for Jamo T:
* Since the input is in NFD, there are no Hangul LV syllables that
* a Jamo T could combine with.
* All Jamo Ts are combined above when handling Jamo Vs.
*
* However, before the PRI #29 fix, this can occur due to
* an intervening combining mark between the Hangul LV and the Jamo T.
*/
} else {
/* Jamo T, compose with previous Hangul that does not have a Jamo T */
if(isHangulWithoutJamoT(c2)) {
c2+=(UChar)(c-JAMO_T_BASE);
if(!nx_contains(nx, c2)) {
pRemove=p-1;
*starter=c2;
}
}
}
if(pRemove!=NULL) {
/* remove the Jamo(s) */
@@ -2152,41 +2136,22 @@ _recompose(UCharBuffer &buffer, int32_t options, const UnicodeSet *nx) {
}
c2=0; /* c2 held *starter temporarily */
if(combineFlags!=0) {
/*
* not starter=NULL because the composition is a Hangul LV syllable
* and might combine once more (but only before the PRI #29 fix)
*/
/* done? */
if(p==limit) {
return prevCC;
}
/* the composition is a Hangul LV syllable which is a starter that combines forward */
combineFwdIndex=0xfff0;
/* we combined; continue with looking for compositions */
continue;
}
} else {
prevCC=0;
}
/*
* now: cc==0 and the combining index does not include "forward" ->
* the rest of the loop body will reset starter to NULL;
* technically, a composed Hangul syllable is a starter, but it
* does not combine forward now that we have consumed all eligible Jamos;
* for Jamo V/T, combineFlags does not contain _NORM_COMBINES_FWD
*/
/* done? */
if(p==limit) {
return prevCC;
}
starter=NULL;
continue;
} else if(
/* the starter is not a Hangul LV or Jamo V/T and */
!(combineFwdIndex&0x8000) &&
/* the combining mark is not blocked and */
((options&UNORM_BEFORE_PRI_29) ?
(prevCC!=cc || prevCC==0) :
(prevCC<cc || prevCC==0)) &&
(prevCC<cc || prevCC==0) &&
/* the starter and the combining mark (c, c2) do combine and */
0!=(result=_combine(combiningTable+combineFwdIndex, combineBackIndex, value, value2)) &&
/* the composition result is not excluded */

View file

@@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2009, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@@ -188,15 +188,6 @@ enum {
UNORM_NX_HANGUL=1,
/** Options bit 1, do not decompose CJK compatibility characters. @draft ICU 2.6 */
UNORM_NX_CJK_COMPAT=2,
/**
* Options bit 8, use buggy recomposition described in
* Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
*
* Used in IDNA implementation according to strict interpretation
* of IDNA definition based on Unicode 3.2 which predates PRI #29.
*/
UNORM_BEFORE_PRI_29=0x100
};
/**

View file

@@ -632,20 +632,9 @@ static int32_t
usprep_normalize( const UChar* src, int32_t srcLength,
UChar* dest, int32_t destCapacity,
UErrorCode* status ){
/*
* Option UNORM_BEFORE_PRI_29:
*
* IDNA as interpreted by IETF members (see unicode mailing list 2004H1)
* requires strict adherence to Unicode 3.2 normalization,
* including buggy composition from before fixing Public Review Issue #29.
* Note that this results in some valid but nonsensical text to be
* either corrupted or rejected, depending on the text.
* See http://www.unicode.org/review/resolved-pri.html#pri29
* See unorm.cpp and cnormtst.c
*/
return unorm_normalize(
src, srcLength,
UNORM_NFKC, UNORM_UNICODE_3_2|UNORM_BEFORE_PRI_29,
UNORM_NFKC, UNORM_UNICODE_3_2,
dest, destCapacity,
status);
}

View file

@@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2007, International Business Machines Corporation and
* Copyright (c) 1997-2009, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@@ -122,21 +122,21 @@ void addNormTest(TestNode** root);
void addNormTest(TestNode** root)
{
addTest(root, &TestAPI, "tscoll/cnormtst/TestAPI");
addTest(root, &TestDecomp, "tscoll/cnormtst/TestDecomp");
addTest(root, &TestCompatDecomp, "tscoll/cnormtst/TestCompatDecomp");
addTest(root, &TestCanonDecompCompose, "tscoll/cnormtst/TestCanonDecompCompose");
addTest(root, &TestCompatDecompCompose, "tscoll/cnormtst/CompatDecompCompose");
addTest(root, &TestNull, "tscoll/cnormtst/TestNull");
addTest(root, &TestQuickCheck, "tscoll/cnormtst/TestQuickCheck");
addTest(root, &TestQuickCheckPerCP, "tscoll/cnormtst/TestQuickCheckPerCP");
addTest(root, &TestIsNormalized, "tscoll/cnormtst/TestIsNormalized");
addTest(root, &TestCheckFCD, "tscoll/cnormtst/TestCheckFCD");
addTest(root, &TestNormCoverage, "tscoll/cnormtst/TestNormCoverage");
addTest(root, &TestConcatenate, "tscoll/cnormtst/TestConcatenate");
addTest(root, &TestNextPrevious, "tscoll/cnormtst/TestNextPrevious");
addTest(root, &TestFCNFKCClosure, "tscoll/cnormtst/TestFCNFKCClosure");
addTest(root, &TestComposition, "tscoll/cnormtst/TestComposition");
addTest(root, &TestAPI, "tsnorm/cnormtst/TestAPI");
addTest(root, &TestDecomp, "tsnorm/cnormtst/TestDecomp");
addTest(root, &TestCompatDecomp, "tsnorm/cnormtst/TestCompatDecomp");
addTest(root, &TestCanonDecompCompose, "tsnorm/cnormtst/TestCanonDecompCompose");
addTest(root, &TestCompatDecompCompose, "tsnorm/cnormtst/CompatDecompCompose");
addTest(root, &TestNull, "tsnorm/cnormtst/TestNull");
addTest(root, &TestQuickCheck, "tsnorm/cnormtst/TestQuickCheck");
addTest(root, &TestQuickCheckPerCP, "tsnorm/cnormtst/TestQuickCheckPerCP");
addTest(root, &TestIsNormalized, "tsnorm/cnormtst/TestIsNormalized");
addTest(root, &TestCheckFCD, "tsnorm/cnormtst/TestCheckFCD");
addTest(root, &TestNormCoverage, "tsnorm/cnormtst/TestNormCoverage");
addTest(root, &TestConcatenate, "tsnorm/cnormtst/TestConcatenate");
addTest(root, &TestNextPrevious, "tsnorm/cnormtst/TestNextPrevious");
addTest(root, &TestFCNFKCClosure, "tsnorm/cnormtst/TestFCNFKCClosure");
addTest(root, &TestComposition, "tsnorm/cnormtst/TestComposition");
}
void TestDecomp()
@@ -1478,19 +1478,15 @@ TestComposition(void) {
} cases[]={
/*
* special cases for UAX #15 bug
* see Unicode Public Review Issue #29
* at http://www.unicode.org/review/resolved-pri.html#pri29
* see Unicode Corrigendum #5: Normalization Idempotency
* at http://unicode.org/versions/corrigendum5.html
* (was Public Review Issue #29)
*/
{ UNORM_NFC, 0, { 0x1100, 0x0300, 0x1161, 0x0327 }, { 0x1100, 0x0300, 0x1161, 0x0327 } },
{ UNORM_NFC, 0, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 }, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 } },
{ UNORM_NFC, 0, { 0xac00, 0x0300, 0x0327, 0x11a8 }, { 0xac00, 0x0327, 0x0300, 0x11a8 } },
{ UNORM_NFC, 0, { 0x0b47, 0x0300, 0x0b3e }, { 0x0b47, 0x0300, 0x0b3e } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x1100, 0x0300, 0x1161, 0x0327 }, { 0xac00, 0x0300, 0x0327 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x1100, 0x0300, 0x1161, 0x0327, 0x11a8 }, { 0xac01, 0x0300, 0x0327 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0xac00, 0x0300, 0x0327, 0x11a8 }, { 0xac01, 0x0327, 0x0300 } },
{ UNORM_NFC, UNORM_BEFORE_PRI_29, { 0x0b47, 0x0300, 0x0b3e }, { 0x0b4b, 0x0300 } }
/* TODO: add test cases for UNORM_FCC here (j2151) */
};

View file

@@ -33,7 +33,6 @@ static void TestToASCII(void);
static void TestIDNToUnicode(void);
static void TestIDNToASCII(void);
static void TestCompare(void);
static void TestUnicode32Norm(void);
static void TestJB4490(void);
static void TestJB4475(void);
static void TestLength(void);
@@ -61,7 +60,6 @@ addIDNATest(TestNode** root)
addTest(root, &TestIDNToUnicode, "idna/TestIDNToUnicode");
addTest(root, &TestIDNToASCII, "idna/TestIDNToASCII");
addTest(root, &TestCompare, "idna/TestCompare");
addTest(root, &TestUnicode32Norm,"idna/TestUnicode32Norm");
addTest(root, &TestJB4490, "idna/TestJB4490");
addTest(root, &TestJB4475, "idna/TestJB4475");
addTest(root, &TestLength, "idna/TestLength");
@@ -639,30 +637,6 @@ TestCompare(){
}
}
static void TestUnicode32Norm() {
/*
* test Unicode 3.2 normalization, before Public Review Issue #29
* see cnormtst.c TestComposition()
*/
static const UChar strings[][8]={
{ 0x1100, 0x0300, 0x1161, 0x0327 },
{ 0x0b47, 0x0300, 0x0b3e, 0x0327 }
};
UChar ascii[20], unicode[20];
int32_t i, length;
UErrorCode errorCode;
for(i=0; i<LENGTHOF(strings); ++i) {
errorCode=U_ZERO_ERROR;
length=uidna_toASCII(strings[i], -1, ascii, LENGTHOF(ascii), 0, NULL, &errorCode);
length=uidna_toUnicode(ascii, length, unicode, LENGTHOF(unicode), 0, NULL, &errorCode);
if(u_strncmp(ascii, unicode, length)!=0) {
log_err("Did not get the correct output\n");
}
}
}
static void TestJB4490(){
static const UChar data[][50]= {
{0x00F5,0x00dE,0x00dF,0x00dD, 0x0000},