ICU-7144 derive the base exclusion set by comparing IDNA2003 with NFKC_Casefold (follow change in UTS #46)

X-SVN-Rev: 27880
This commit is contained in:
Markus Scherer 2010-03-29 22:29:36 +00:00
parent 6015517389
commit 04f3398d30
2 changed files with 71 additions and 5 deletions

View file

@ -22,6 +22,8 @@
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/usprep.h"
#include "sprpimpl.h" // HACK
/**
* icu::ErrorCode subclass for easy UErrorCode handling.
@ -50,6 +52,29 @@ void ExitingErrorCode::handleFailure() const {
exit(errorCode);
}
static int
toIDNA2003(const UStringPrepProfile *prep, UChar32 c, icu::UnicodeString &destString) {
UChar src[2];
int32_t srcLength=0;
U16_APPEND_UNSAFE(src, srcLength, c);
UChar *dest;
int32_t destLength;
dest=destString.getBuffer(32);
if(dest==NULL) {
return FALSE;
}
UErrorCode errorCode=U_ZERO_ERROR;
destLength=usprep_prepare(prep, src, srcLength,
dest, destString.getCapacity(),
USPREP_DEFAULT, NULL, &errorCode);
destString.releaseBuffer(destLength);
if(errorCode==U_STRINGPREP_PROHIBITED_ERROR) {
return -1;
} else {
return U_SUCCESS(errorCode);
}
}
enum Status { DISALLOWED, IGNORED, MAPPED, DEVIATION, VALID };
static const char *const statusNames[]={
"disallowed", "ignored", "mapped", "deviation", "valid"
@ -96,6 +121,7 @@ main(int argc, const char *argv[]) {
"-[:ascii:]]"
"[\\u002Da-zA-Z0-9]]", -1, US_INV), errorCode);
#if 0
icu::UnicodeSet baseExclusionSet(icu::UnicodeString(
"[\\u04C0\\u10A0-\\u10C5\\u2132\\u2183"
"\\U0002F868\\U0002F874\\U0002F91F\\U0002F95F\\U0002F9BF"
@ -107,12 +133,54 @@ main(int argc, const char *argv[]) {
"\\u206A-\\u206F"
"\\U000E0001\\U000E0020-\\U000E007F"
"[:Cn:]]", -1, US_INV), errorCode);
#endif
icu::UnicodeSet deviationSet(
UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
errorCode.assertSuccess();
// derived sets
icu::LocalUStringPrepProfilePointer namePrep(usprep_openByType(USPREP_RFC3491_NAMEPREP, errorCode));
const icu::Normalizer2 *nfkc_cf=
icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
errorCode.assertSuccess();
// HACK: The StringPrep API performs a BiDi check according to the data.
// We need to override that for this data generation, by resetting an internal flag.
namePrep->checkBiDi=FALSE;
icu::UnicodeSet baseExclusionSet;
icu::UnicodeString cString, mapping, namePrepResult;
for(UChar32 c=0; c<=0x10ffff; ++c) {
if(c==0xd800) {
c=0xe000;
}
int namePrepStatus=toIDNA2003(namePrep.getAlias(), c, namePrepResult);
if(namePrepStatus!=0) {
// get the UTS #46 base mapping value
switch(c) {
case 0xff0e:
case 0x3002:
case 0xff61:
mapping.setTo(0x2e);
break;
default:
cString.setTo(c);
nfkc_cf->normalize(cString, mapping, errorCode);
break;
}
if(
namePrepStatus>0 ?
// c is valid or mapped in IDNA2003
namePrepResult!=mapping :
// namePrepStatus<0: c is prohibited in IDNA2003
baseValidSet.contains(c) || (cString!=mapping && baseValidSet.containsAll(mapping))
) {
baseExclusionSet.add(c);
}
}
}
icu::UnicodeSet disallowedSet(0, 0x10ffff);
disallowedSet.
removeAll(labelSeparators).
@ -121,15 +189,13 @@ main(int argc, const char *argv[]) {
removeAll(baseValidSet).
addAll(baseExclusionSet);
const icu::Normalizer2 *nfkc_cf=
icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
const icu::Normalizer2 *nfd=
icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
errorCode.assertSuccess();
icu::UnicodeSet ignoredSet; // will be a subset of mappedSet
icu::UnicodeSet removeSet;
icu::UnicodeString cString, mapping, nfdString;
icu::UnicodeString nfdString;
{
icu::UnicodeSetIterator iter(mappedSet);
while(iter.next()) {

View file

@ -41,7 +41,7 @@
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include, ..\..\..\..\..\icu\trunk\source\common"
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
MinimalRebuild="true"
BasicRuntimeChecks="3"
@ -117,7 +117,7 @@
Name="VCCLCompilerTool"
Optimization="2"
EnableIntrinsicFunctions="true"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include, ..\..\..\..\..\icu\trunk\source\common"
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"