ICU-7144 quick & dirty tool to recreate the UTS #46 data table according to the spec

X-SVN-Rev: 27751
This commit is contained in:
Markus Scherer 2010-03-02 22:59:05 +00:00
parent 1498352330
commit cb3d8ade6d
3 changed files with 454 additions and 0 deletions

View file

@ -0,0 +1,253 @@
/*
*******************************************************************************
* Copyright (C) 2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* file name: genuts46.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2010mar02
* created by: Markus W. Scherer
*
* quick & dirty tool to recreate the UTS #46 data table according to the spec
*/
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
/**
 * icu::ErrorCode subclass for easy UErrorCode handling.
 * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
 */
class ExitingErrorCode : public icu::ErrorCode {
public:
    /**
     * Explicit so that a plain C string is never silently converted
     * into an error-code object.
     * @param loc A short string describing where the ExitingErrorCode is used.
     *            Only the pointer is stored, so the string must outlive this object.
     */
    explicit ExitingErrorCode(const char *loc) : location(loc) {}
    virtual ~ExitingErrorCode();
protected:
    /** Prints the location and the error name to stderr, then exits the process. */
    virtual void handleFailure() const;
private:
    /** Caller-supplied description used in the failure message; not owned. */
    const char *location;
};
/**
 * Reports a failure that is still pending when the object is destroyed.
 */
ExitingErrorCode::~ExitingErrorCode() {
    if(!isFailure()) {
        return;
    }
    // Calling this from a destructor is safe:
    // our handleFailure() does not throw exceptions.
    handleFailure();
}
void ExitingErrorCode::handleFailure() const {
fprintf(stderr, "error at %s: %s\n", location, errorName());
exit(errorCode);
}
/**
 * Status values written to the data table.
 * Each enumerator is used as an index into statusNames[],
 * so the two lists must stay in the same order.
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID
};
/** Printable names of the Status values, indexed by the enum constant. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid"
};
/**
 * Prints one table line to stdout:
 * the code point or inclusive code point range, the status name, and,
 * for mapped/deviation entries or whenever the mapping is not empty,
 * the mapping as a list of hexadecimal code points.
 *
 * @param start First code point of the range.
 * @param end Last code point of the range (same as start for a single code point).
 * @param status Status to print; used as an index into statusNames[].
 * @param mapping Mapping string for the range; may be empty.
 */
static void
printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
    if(start!=end) {
        printf("%04lX..%04lX ", (long)start, (long)end);
    } else {
        printf("%04lX ", (long)start);
    }
    printf("; %s", statusNames[status]);
    // Mapped and deviation lines always get a mapping column, even an empty one.
    const bool hasMappingColumn= status==MAPPED || status==DEVIATION || !mapping.isEmpty();
    if(hasMappingColumn) {
        printf(" ;");
        const UChar *units=mapping.getBuffer();
        const int32_t unitCount=mapping.length();
        // U16_NEXT advances pos past one code point per iteration.
        for(int32_t pos=0; pos<unitCount;) {
            UChar32 cp;
            U16_NEXT(units, pos, unitCount, cp);
            printf(" %04lX", (long)cp);
        }
    }
    puts("");
}
/**
 * Derives the disallowed, ignored, mapped, deviation and valid code point sets
 * and prints the resulting table to stdout, one line per run of code points
 * that share the same status and mapping.
 * Messages about code points that get reclassified as disallowed go to stderr.
 *
 * The command-line arguments are not used.
 *
 * @return 0 on success. ICU failures are reported through ExitingErrorCode,
 *         which terminates the process instead of returning.
 */
extern int
main(int argc, const char *argv[]) {
    (void)argc;
    (void)argv;
    ExitingErrorCode errorCode("genuts46");

    // predefined base sets
    icu::UnicodeSet labelSeparators(
        UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);

    icu::UnicodeSet mappedSet(
        UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
    mappedSet.removeAll(labelSeparators);  // simplifies checking of mapped characters

    icu::UnicodeSet baseValidSet(icu::UnicodeString(
        "[[[:^Changes_When_NFKC_Casefolded:]"
        "-[:C:]-[:Z:]"
        "-[:Block=Ideographic_Description_Characters:]"
        "-[:ascii:]]"
        "[\\u002Da-zA-Z0-9]]", -1, US_INV), errorCode);

    icu::UnicodeSet baseExclusionSet(icu::UnicodeString(
        "[\\u04C0\\u10A0-\\u10C5\\u2132\\u2183"
        "\\U0002F868\\U0002F874\\U0002F91F\\U0002F95F\\U0002F9BF"
        "\\u3164\\uFFA0\\u115F\\u1160\\u17B4\\u17B5\\u1806\\uFFFC\\uFFFD"
        "\\u200E\\u200F\\u202A-\\u202E"
        "\\u2061-\\u2063"
        "\\U0001D173-\\U0001D17A"
        "\\u200B\\u2060\\uFEFF"
        "\\u206A-\\u206F"
        "\\U000E0001\\U000E0020-\\U000E007F"
        "[:Cn:]]", -1, US_INV), errorCode);

    icu::UnicodeSet deviationSet(
        UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
    errorCode.assertSuccess();

    // derived sets
    // Start with everything, remove what has some other status,
    // then force the exclusions back in.
    icu::UnicodeSet disallowedSet(0, 0x10ffff);
    disallowedSet.
        removeAll(labelSeparators).
        removeAll(deviationSet).
        removeAll(mappedSet).
        removeAll(baseValidSet).
        addAll(baseExclusionSet);

    const icu::Normalizer2 *nfkc_cf=
        icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
    // The "nfc" data in decompose mode is used as the NFD normalizer.
    const icu::Normalizer2 *nfd=
        icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
    errorCode.assertSuccess();

    icu::UnicodeSet ignoredSet;  // will be a subset of mappedSet
    icu::UnicodeSet removeSet;   // collects removals so that no set changes while it is iterated
    icu::UnicodeString cString, mapping, nfdString;
    {
        // A mapped code point whose mapping leaves the base valid set becomes disallowed;
        // one that maps to the empty string is additionally recorded as ignored.
        icu::UnicodeSetIterator iter(mappedSet);
        while(iter.next()) {
            UChar32 c=iter.getCodepoint();
            cString.setTo(c);
            nfkc_cf->normalize(cString, mapping, errorCode);
            if(!baseValidSet.containsAll(mapping)) {
                fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
                disallowedSet.add(c);
                removeSet.add(c);
            } else if(mapping.isEmpty()) {
                ignoredSet.add(c);
            }
        }
        mappedSet.removeAll(removeSet);
    }
    errorCode.assertSuccess();

    icu::UnicodeSet validSet(baseValidSet);
    validSet.
        removeAll(labelSeparators).  // non-ASCII label separators will be mapped in the end
        removeAll(deviationSet).
        removeAll(disallowedSet).
        removeAll(mappedSet).
        add(0x2e);  // not mapped, simply valid

    // Repeat until stable: removing a code point from validSet
    // can invalidate other valid or mapped code points in the next pass.
    UBool madeChange;
    do {
        madeChange=FALSE;
        {
            // Valid code points must have a wholly valid NFD form.
            removeSet.clear();
            icu::UnicodeSetIterator iter(validSet);
            while(iter.next()) {
                UChar32 c=iter.getCodepoint();
                cString.setTo(c);
                nfd->normalize(cString, nfdString, errorCode);
                if(!validSet.containsAll(nfdString)) {
                    fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
                    disallowedSet.add(c);
                    removeSet.add(c);
                    madeChange=TRUE;
                }
            }
            validSet.removeAll(removeSet);
        }
        {
            // The NFD form of a mapping must be wholly valid as well.
            removeSet.clear();
            icu::UnicodeSetIterator iter(mappedSet);
            while(iter.next()) {
                UChar32 c=iter.getCodepoint();
                cString.setTo(c);
                nfkc_cf->normalize(cString, mapping, errorCode);
                nfd->normalize(mapping, nfdString, errorCode);
                if(!validSet.containsAll(nfdString)) {
                    fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
                    disallowedSet.add(c);
                    removeSet.add(c);
                    madeChange=TRUE;
                }
            }
            mappedSet.removeAll(removeSet);
        }
    } while(madeChange);
    errorCode.assertSuccess();

    // finish up
    labelSeparators.remove(0x2e).freeze();  // U+002E is simply valid
    deviationSet.freeze();
    ignoredSet.freeze();
    validSet.freeze();
    mappedSet.freeze();

    // output
    // [prevStart, c-1] is the pending run that has not been printed yet;
    // prevStatus and prevMapping describe it. The run is empty while prevStart==c.
    UChar32 prevStart=0, c=0;
    // status is initialized so that the "undetermined" branch below
    // never leads to reading an indeterminate value.
    Status prevStatus=DISALLOWED, status=DISALLOWED;
    icu::UnicodeString prevMapping;
    icu::UnicodeSetIterator iter(disallowedSet);
    while(iter.nextRange()) {
        UChar32 start=iter.getCodepoint();
        // Classify the code points in the gap before this disallowed range.
        while(c<start) {
            mapping.remove();
            if(labelSeparators.contains(c)) {
                status=MAPPED;
                mapping.setTo(0x2e);
            } else if(deviationSet.contains(c)) {
                status=DEVIATION;
                cString.setTo(c);
                nfkc_cf->normalize(cString, mapping, errorCode);
            } else if(ignoredSet.contains(c)) {
                status=IGNORED;
            } else if(validSet.contains(c)) {
                status=VALID;
            } else if(mappedSet.contains(c)) {
                status=MAPPED;
                cString.setTo(c);
                nfkc_cf->normalize(cString, mapping, errorCode);
            } else {
                // status keeps the value of the previous code point in this case.
                fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
            }
            // Flush only a non-empty pending run, and only when status or mapping changes.
            // (Without the parentheses, && binds tighter than ||, and a mapping change
            // alone would flush even an empty run.)
            if(prevStart<c && (status!=prevStatus || mapping!=prevMapping)) {
                printLine(prevStart, c-1, prevStatus, prevMapping);
                prevStart=c;
            }
            // Always record the current properties so that a run
            // which starts at c is described correctly.
            prevStatus=status;
            prevMapping=mapping;
            ++c;
        }
        // c==start is disallowed
        if(prevStart<c) {
            printLine(prevStart, c-1, prevStatus, prevMapping);
        }
        // The whole disallowed range becomes the pending run.
        prevStart=c;
        prevStatus=DISALLOWED;
        prevMapping.remove();
        c=iter.getCodepointEnd()+1;
    }
    if(prevStart<c) {
        printLine(prevStart, c-1, prevStatus, prevMapping);
    }
    // Check the normalize() calls of the output loop, like the earlier phases do.
    errorCode.assertSuccess();
    return 0;
}

View file

@ -0,0 +1,20 @@

Microsoft Visual Studio Solution File, Format Version 10.00
# Visual C++ Express 2008
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "genuts46", "genuts46.vcproj", "{20F9F1B1-E362-4A4D-84BA-548557745CD9}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
Release|Win32 = Release|Win32
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Debug|Win32.ActiveCfg = Debug|Win32
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Debug|Win32.Build.0 = Debug|Win32
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Release|Win32.ActiveCfg = Release|Win32
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Release|Win32.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal

View file

@ -0,0 +1,181 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Name="genuts46"
ProjectGUID="{20F9F1B1-E362-4A4D-84BA-548557745CD9}"
RootNamespace="genuts46"
Keyword="Win32Proj"
TargetFrameworkVersion="196613"
>
<Platforms>
<Platform
Name="Win32"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Debug|Win32"
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
IntermediateDirectory="$(ConfigurationName)"
ConfigurationType="1"
CharacterSet="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
MinimalRebuild="true"
BasicRuntimeChecks="3"
RuntimeLibrary="3"
UsePrecompiledHeader="0"
WarningLevel="3"
DebugInformationFormat="4"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
AdditionalDependencies="icuucd.lib"
LinkIncremental="2"
AdditionalLibraryDirectories="..\..\..\..\..\icu\trunk\lib"
GenerateDebugInformation="true"
SubSystem="1"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|Win32"
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
IntermediateDirectory="$(ConfigurationName)"
ConfigurationType="1"
CharacterSet="1"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="2"
EnableIntrinsicFunctions="true"
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
UsePrecompiledHeader="0"
WarningLevel="3"
DebugInformationFormat="3"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
AdditionalDependencies="icuuc.lib"
LinkIncremental="1"
AdditionalLibraryDirectories="..\..\..\..\..\icu\trunk\lib"
GenerateDebugInformation="true"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
TargetMachine="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<File
RelativePath=".\genuts46.cpp"
>
</File>
</Files>
<Globals>
</Globals>
</VisualStudioProject>