mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-06 22:15:31 +00:00
ICU-7144 quick & dirty tool to recreate the UTS #46 data table according to the spec
X-SVN-Rev: 27751
This commit is contained in:
parent
1498352330
commit
cb3d8ade6d
3 changed files with 454 additions and 0 deletions
253
tools/unicode/c/genuts46/genuts46.cpp
Normal file
253
tools/unicode/c/genuts46/genuts46.cpp
Normal file
|
@ -0,0 +1,253 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* file name: genuts46.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2010mar02
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* quick & dirty tool to recreate the UTS #46 data table according to the spec
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/usetiter.h"
|
||||
|
||||
/**
|
||||
* icu::ErrorCode subclass for easy UErrorCode handling.
|
||||
* The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
|
||||
*/
|
||||
class ExitingErrorCode : public icu::ErrorCode {
|
||||
public:
|
||||
/**
|
||||
* @param loc A short string describing where the ExitingErrorCode is used.
|
||||
*/
|
||||
ExitingErrorCode(const char *loc) : location(loc) {}
|
||||
virtual ~ExitingErrorCode();
|
||||
protected:
|
||||
virtual void handleFailure() const;
|
||||
private:
|
||||
const char *location;
|
||||
};
|
||||
|
||||
/**
 * Destructor: if a failure is still recorded when the object goes away,
 * report it through handleFailure().
 */
ExitingErrorCode::~ExitingErrorCode() {
    if(!isFailure()) {
        return;  // nothing to report
    }
    // Calling this from a destructor is safe because
    // our handleFailure() does not throw exceptions.
    handleFailure();
}
|
||||
|
||||
void ExitingErrorCode::handleFailure() const {
|
||||
fprintf(stderr, "error at %s: %s\n", location, errorName());
|
||||
exit(errorCode);
|
||||
}
|
||||
|
||||
/**
 * Status assigned to a code point in the generated table.
 * The enumerators are used as indexes into statusNames[].
 */
enum Status {
    DISALLOWED,
    IGNORED,
    MAPPED,
    DEVIATION,
    VALID
};

/** Printable names of the Status values; same order as the enum. */
static const char *const statusNames[]={
    "disallowed",
    "ignored",
    "mapped",
    "deviation",
    "valid"
};
|
||||
|
||||
static void
|
||||
printLine(UChar32 start, UChar32 end, Status status, const icu::UnicodeString &mapping) {
|
||||
if(start==end) {
|
||||
printf("%04lX ", (long)start);
|
||||
} else {
|
||||
printf("%04lX..%04lX ", (long)start, (long)end);
|
||||
}
|
||||
printf("; %s", statusNames[status]);
|
||||
if(status==MAPPED || status==DEVIATION || !mapping.isEmpty()) {
|
||||
printf(" ;");
|
||||
const UChar *buffer=mapping.getBuffer();
|
||||
int32_t length=mapping.length();
|
||||
int32_t i=0;
|
||||
UChar32 c;
|
||||
while(i<length) {
|
||||
U16_NEXT(buffer, i, length, c);
|
||||
printf(" %04lX", (long)c);
|
||||
}
|
||||
}
|
||||
puts("");
|
||||
}
|
||||
|
||||
extern int
|
||||
main(int argc, const char *argv[]) {
|
||||
ExitingErrorCode errorCode("genuts46");
|
||||
|
||||
// predefined base sets
|
||||
icu::UnicodeSet labelSeparators(
|
||||
UNICODE_STRING_SIMPLE("[\\u002E\\u3002\\uFF0E\\uFF61]"), errorCode);
|
||||
|
||||
icu::UnicodeSet mappedSet(
|
||||
UNICODE_STRING_SIMPLE("[:Changes_When_NFKC_Casefolded:]"), errorCode);
|
||||
mappedSet.removeAll(labelSeparators); // simplifies checking of mapped characters
|
||||
|
||||
icu::UnicodeSet baseValidSet(icu::UnicodeString(
|
||||
"[[[:^Changes_When_NFKC_Casefolded:]"
|
||||
"-[:C:]-[:Z:]"
|
||||
"-[:Block=Ideographic_Description_Characters:]"
|
||||
"-[:ascii:]]"
|
||||
"[\\u002Da-zA-Z0-9]]", -1, US_INV), errorCode);
|
||||
|
||||
icu::UnicodeSet baseExclusionSet(icu::UnicodeString(
|
||||
"[\\u04C0\\u10A0-\\u10C5\\u2132\\u2183"
|
||||
"\\U0002F868\\U0002F874\\U0002F91F\\U0002F95F\\U0002F9BF"
|
||||
"\\u3164\\uFFA0\\u115F\\u1160\\u17B4\\u17B5\\u1806\\uFFFC\\uFFFD"
|
||||
"\\u200E\\u200F\\u202A-\\u202E"
|
||||
"\\u2061-\\u2063"
|
||||
"\\U0001D173-\\U0001D17A"
|
||||
"\\u200B\\u2060\\uFEFF"
|
||||
"\\u206A-\\u206F"
|
||||
"\\U000E0001\\U000E0020-\\U000E007F"
|
||||
"[:Cn:]]", -1, US_INV), errorCode);
|
||||
|
||||
icu::UnicodeSet deviationSet(
|
||||
UNICODE_STRING_SIMPLE("[\\u00DF\\u03C2\\u200C\\u200D]"), errorCode);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
// derived sets
|
||||
icu::UnicodeSet disallowedSet(0, 0x10ffff);
|
||||
disallowedSet.
|
||||
removeAll(labelSeparators).
|
||||
removeAll(deviationSet).
|
||||
removeAll(mappedSet).
|
||||
removeAll(baseValidSet).
|
||||
addAll(baseExclusionSet);
|
||||
|
||||
const icu::Normalizer2 *nfkc_cf=
|
||||
icu::Normalizer2::getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode);
|
||||
const icu::Normalizer2 *nfd=
|
||||
icu::Normalizer2::getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
icu::UnicodeSet ignoredSet; // will be a subset of mappedSet
|
||||
icu::UnicodeSet removeSet;
|
||||
icu::UnicodeString cString, mapping, nfdString;
|
||||
{
|
||||
icu::UnicodeSetIterator iter(mappedSet);
|
||||
while(iter.next()) {
|
||||
UChar32 c=iter.getCodepoint();
|
||||
cString.setTo(c);
|
||||
nfkc_cf->normalize(cString, mapping, errorCode);
|
||||
if(!baseValidSet.containsAll(mapping)) {
|
||||
fprintf(stderr, "U+%04lX mapped -> disallowed: mapping not wholly in base valid set\n", (long)c);
|
||||
disallowedSet.add(c);
|
||||
removeSet.add(c);
|
||||
} else if(mapping.isEmpty()) {
|
||||
ignoredSet.add(c);
|
||||
}
|
||||
}
|
||||
mappedSet.removeAll(removeSet);
|
||||
}
|
||||
errorCode.assertSuccess();
|
||||
|
||||
icu::UnicodeSet validSet(baseValidSet);
|
||||
validSet.
|
||||
removeAll(labelSeparators). // non-ASCII label separators will be mapped in the end
|
||||
removeAll(deviationSet).
|
||||
removeAll(disallowedSet).
|
||||
removeAll(mappedSet).
|
||||
add(0x2e); // not mapped, simply valid
|
||||
UBool madeChange;
|
||||
do {
|
||||
madeChange=FALSE;
|
||||
{
|
||||
removeSet.clear();
|
||||
icu::UnicodeSetIterator iter(validSet);
|
||||
while(iter.next()) {
|
||||
UChar32 c=iter.getCodepoint();
|
||||
cString.setTo(c);
|
||||
nfd->normalize(cString, nfdString, errorCode);
|
||||
if(!validSet.containsAll(nfdString)) {
|
||||
fprintf(stderr, "U+%04lX valid -> disallowed: NFD not wholly valid\n", (long)c);
|
||||
disallowedSet.add(c);
|
||||
removeSet.add(c);
|
||||
madeChange=TRUE;
|
||||
}
|
||||
}
|
||||
validSet.removeAll(removeSet);
|
||||
}
|
||||
{
|
||||
removeSet.clear();
|
||||
icu::UnicodeSetIterator iter(mappedSet);
|
||||
while(iter.next()) {
|
||||
UChar32 c=iter.getCodepoint();
|
||||
cString.setTo(c);
|
||||
nfkc_cf->normalize(cString, mapping, errorCode);
|
||||
nfd->normalize(mapping, nfdString, errorCode);
|
||||
if(!validSet.containsAll(nfdString)) {
|
||||
fprintf(stderr, "U+%04lX mapped -> disallowed: NFD of mapping not wholly valid\n", (long)c);
|
||||
disallowedSet.add(c);
|
||||
removeSet.add(c);
|
||||
madeChange=TRUE;
|
||||
}
|
||||
}
|
||||
mappedSet.removeAll(removeSet);
|
||||
}
|
||||
} while(madeChange);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
// finish up
|
||||
labelSeparators.remove(0x2e).freeze(); // U+002E is simply valid
|
||||
deviationSet.freeze();
|
||||
ignoredSet.freeze();
|
||||
validSet.freeze();
|
||||
mappedSet.freeze();
|
||||
|
||||
// output
|
||||
UChar32 prevStart=0, c=0;
|
||||
Status prevStatus=DISALLOWED, status;
|
||||
icu::UnicodeString prevMapping;
|
||||
|
||||
icu::UnicodeSetIterator iter(disallowedSet);
|
||||
while(iter.nextRange()) {
|
||||
UChar32 start=iter.getCodepoint();
|
||||
while(c<start) {
|
||||
mapping.remove();
|
||||
if(labelSeparators.contains(c)) {
|
||||
status=MAPPED;
|
||||
mapping.setTo(0x2e);
|
||||
} else if(deviationSet.contains(c)) {
|
||||
status=DEVIATION;
|
||||
cString.setTo(c);
|
||||
nfkc_cf->normalize(cString, mapping, errorCode);
|
||||
} else if(ignoredSet.contains(c)) {
|
||||
status=IGNORED;
|
||||
} else if(validSet.contains(c)) {
|
||||
status=VALID;
|
||||
} else if(mappedSet.contains(c)) {
|
||||
status=MAPPED;
|
||||
cString.setTo(c);
|
||||
nfkc_cf->normalize(cString, mapping, errorCode);
|
||||
} else {
|
||||
fprintf(stderr, "*** undetermined status of U+%04lX\n", (long)c);
|
||||
}
|
||||
if(prevStart<c && status!=prevStatus || mapping!=prevMapping) {
|
||||
printLine(prevStart, c-1, prevStatus, prevMapping);
|
||||
prevStart=c;
|
||||
prevStatus=status;
|
||||
prevMapping=mapping;
|
||||
}
|
||||
++c;
|
||||
}
|
||||
// c==start is disallowed
|
||||
if(prevStart<c) {
|
||||
printLine(prevStart, c-1, prevStatus, prevMapping);
|
||||
}
|
||||
prevStart=c;
|
||||
prevStatus=DISALLOWED;
|
||||
prevMapping.remove();
|
||||
c=iter.getCodepointEnd()+1;
|
||||
}
|
||||
if(prevStart<c) {
|
||||
printLine(prevStart, c-1, prevStatus, prevMapping);
|
||||
}
|
||||
return 0;
|
||||
}
|
20
tools/unicode/c/genuts46/genuts46.sln
Normal file
20
tools/unicode/c/genuts46/genuts46.sln
Normal file
|
@ -0,0 +1,20 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 10.00
|
||||
# Visual C++ Express 2008
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "genuts46", "genuts46.vcproj", "{20F9F1B1-E362-4A4D-84BA-548557745CD9}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
Release|Win32 = Release|Win32
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{20F9F1B1-E362-4A4D-84BA-548557745CD9}.Release|Win32.Build.0 = Release|Win32
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
EndGlobal
|
181
tools/unicode/c/genuts46/genuts46.vcproj
Normal file
181
tools/unicode/c/genuts46/genuts46.vcproj
Normal file
|
@ -0,0 +1,181 @@
|
|||
<?xml version="1.0" encoding="Windows-1252"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="9.00"
|
||||
Name="genuts46"
|
||||
ProjectGUID="{20F9F1B1-E362-4A4D-84BA-548557745CD9}"
|
||||
RootNamespace="genuts46"
|
||||
Keyword="Win32Proj"
|
||||
TargetFrameworkVersion="196613"
|
||||
>
|
||||
<Platforms>
|
||||
<Platform
|
||||
Name="Win32"
|
||||
/>
|
||||
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
|
||||
IntermediateDirectory="$(ConfigurationName)"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
|
||||
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
|
||||
MinimalRebuild="true"
|
||||
BasicRuntimeChecks="3"
|
||||
RuntimeLibrary="3"
|
||||
UsePrecompiledHeader="0"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="4"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
AdditionalDependencies="icuucd.lib"
|
||||
LinkIncremental="2"
|
||||
AdditionalLibraryDirectories="..\..\..\..\..\icu\trunk\lib"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
TargetMachine="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory="$(SolutionDir)$(ConfigurationName)"
|
||||
IntermediateDirectory="$(ConfigurationName)"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
WholeProgramOptimization="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="2"
|
||||
EnableIntrinsicFunctions="true"
|
||||
AdditionalIncludeDirectories="..\..\..\..\..\icu\trunk\include"
|
||||
PreprocessorDefinitions="U_USING_ICU_NAMESPACE=0"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
UsePrecompiledHeader="0"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="3"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
AdditionalDependencies="icuuc.lib"
|
||||
LinkIncremental="1"
|
||||
AdditionalLibraryDirectories="..\..\..\..\..\icu\trunk\lib"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
OptimizeReferences="2"
|
||||
EnableCOMDATFolding="2"
|
||||
TargetMachine="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<File
|
||||
RelativePath=".\genuts46.cpp"
|
||||
>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
Loading…
Add table
Reference in a new issue