mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-7273 merge in Normalizer2 API & code, and ICU-5785 UnicodeSet::span(UnicodeString) and ICU-7296 tempSubString()/retainBetween(); merge -r 26971:27150 branches/markus/norm2
X-SVN-Rev: 27155
This commit is contained in:
parent
11acc7e54f
commit
8ddbd1394c
98 changed files with 24433 additions and 8028 deletions
4
.gitattributes
vendored
4
.gitattributes
vendored
|
@ -49,6 +49,10 @@ README text !eol
|
|||
*.tri2 -text
|
||||
|
||||
icu4c/icu4c.css -text
|
||||
icu4c/source/data/in/nfc.nrm -text
|
||||
icu4c/source/data/in/nfkc.nrm -text
|
||||
icu4c/source/data/in/nfkc_cf.nrm -text
|
||||
icu4c/source/data/in/unorm.icu -text
|
||||
icu4c/source/data/locales/pool.res -text
|
||||
icu4c/source/samples/ucnv/data02.bin -text
|
||||
icu4c/source/test/perf/README -text
|
||||
|
|
14
.gitignore
vendored
14
.gitignore
vendored
|
@ -560,6 +560,20 @@ icu4c/source/tools/gennorm/gennorm.vcproj.*.*.user
|
|||
icu4c/source/tools/gennorm/release
|
||||
icu4c/source/tools/gennorm/x64
|
||||
icu4c/source/tools/gennorm/x86
|
||||
icu4c/source/tools/gennorm2/*.d
|
||||
icu4c/source/tools/gennorm2/*.o
|
||||
icu4c/source/tools/gennorm2/*.pdb
|
||||
icu4c/source/tools/gennorm2/*.plg
|
||||
icu4c/source/tools/gennorm2/Debug
|
||||
icu4c/source/tools/gennorm2/Makefile
|
||||
icu4c/source/tools/gennorm2/Release
|
||||
icu4c/source/tools/gennorm2/debug
|
||||
icu4c/source/tools/gennorm2/gennorm2
|
||||
icu4c/source/tools/gennorm2/gennorm2.[0-9]
|
||||
icu4c/source/tools/gennorm2/gennorm2.vcproj.*.*.user
|
||||
icu4c/source/tools/gennorm2/release
|
||||
icu4c/source/tools/gennorm2/x64
|
||||
icu4c/source/tools/gennorm2/x86
|
||||
icu4c/source/tools/genpname/*.d
|
||||
icu4c/source/tools/genpname/*.o
|
||||
icu4c/source/tools/genpname/*.pdb
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
Microsoft Visual Studio Solution File, Format Version 10.00
|
||||
# Visual Studio 2008
|
||||
# Visual C++ Express 2008
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cal", "..\samples\cal\cal.vcproj", "{F7659D77-09CF-4FE9-ACEE-927287AA9509}"
|
||||
ProjectSection(ProjectDependencies) = postProject
|
||||
{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
|
||||
|
@ -259,6 +259,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gencfu", "..\tools\gencfu\g
|
|||
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gennorm2", "..\tools\gennorm2\gennorm2.vcproj", "{C7891A65-80AB-4245-912E-5F1E17B0E6C4}"
|
||||
ProjectSection(ProjectDependencies) = postProject
|
||||
{6B231032-3CB5-4EED-9210-810D666A23A0} = {6B231032-3CB5-4EED-9210-810D666A23A0}
|
||||
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Win32 = Debug|Win32
|
||||
|
@ -555,6 +561,14 @@ Global
|
|||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.Build.0 = Release|Win32
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.ActiveCfg = Release|x64
|
||||
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.Build.0 = Release|x64
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|Win32.ActiveCfg = Debug|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|Win32.Build.0 = Debug|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|x64.ActiveCfg = Debug|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|x64.Build.0 = Debug|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|Win32.ActiveCfg = Release|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|Win32.Build.0 = Release|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|x64.ActiveCfg = Release|Win32
|
||||
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|x64.Build.0 = Release|Win32
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#******************************************************************************
|
||||
#
|
||||
# Copyright (C) 1999-2009, International Business Machines
|
||||
# Copyright (C) 1999-2010, International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
#
|
||||
#******************************************************************************
|
||||
|
@ -78,7 +78,8 @@ ucat.o locmap.o uloc.o locid.o locutil.o \
|
|||
bytestream.o stringpiece.o \
|
||||
ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
|
||||
utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
|
||||
normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
|
||||
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \
|
||||
chariter.o schriter.o uchriter.o uiter.o \
|
||||
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
|
||||
uscript.o usc_impl.o unames.o \
|
||||
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*****************************************************************************
|
||||
* Copyright (C) 1996-2006, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*****************************************************************************
|
||||
*/
|
||||
|
@ -12,6 +12,7 @@
|
|||
#include "unicode/uset.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "hash.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "unormimp.h"
|
||||
#include "unicode/caniter.h"
|
||||
#include "unicode/normlzr.h"
|
||||
|
@ -68,7 +69,8 @@ CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode
|
|||
pieces_length(0),
|
||||
pieces_lengths(NULL),
|
||||
current(NULL),
|
||||
current_length(0)
|
||||
current_length(0),
|
||||
nfd(*Normalizer2Factory::getNFDInstance(status))
|
||||
{
|
||||
if(U_SUCCESS(status)) {
|
||||
setSource(sourceStr, status);
|
||||
|
@ -499,73 +501,39 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
|
|||
return NULL;
|
||||
}
|
||||
|
||||
const int32_t bufSize = 256;
|
||||
int32_t bufLen = 0;
|
||||
UChar temp[bufSize];
|
||||
|
||||
int32_t inputLen = 0, decompLen;
|
||||
UChar stackBuffer[4];
|
||||
const UChar *decomp;
|
||||
|
||||
U16_APPEND_UNSAFE(temp, inputLen, comp);
|
||||
decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen);
|
||||
if(decomp == NULL) {
|
||||
/* copy temp */
|
||||
stackBuffer[0] = temp[0];
|
||||
if(inputLen > 1) {
|
||||
stackBuffer[1] = temp[1];
|
||||
}
|
||||
decomp = stackBuffer;
|
||||
decompLen = inputLen;
|
||||
}
|
||||
|
||||
UChar *buff = temp+inputLen;
|
||||
UnicodeString temp(comp);
|
||||
int32_t inputLen=temp.length();
|
||||
UnicodeString decompString;
|
||||
nfd.normalize(temp, decompString, status);
|
||||
const UChar *decomp=decompString.getBuffer();
|
||||
int32_t decompLen=decompString.length();
|
||||
|
||||
// See if it matches the start of segment (at segmentPos)
|
||||
UBool ok = FALSE;
|
||||
UChar32 cp;
|
||||
int32_t decompPos = 0;
|
||||
UChar32 decompCp;
|
||||
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
|
||||
U16_NEXT(decomp, decompPos, decompLen, decompCp);
|
||||
|
||||
int32_t i;
|
||||
UBool overflow = FALSE;
|
||||
|
||||
i = segmentPos;
|
||||
int32_t i = segmentPos;
|
||||
while(i < segLen) {
|
||||
UTF_NEXT_CHAR(segment, i, segLen, cp);
|
||||
U16_NEXT(segment, i, segLen, cp);
|
||||
|
||||
if (cp == decompCp) { // if equal, eat another cp from decomp
|
||||
|
||||
//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
|
||||
|
||||
if (decompPos == decompLen) { // done, have all decomp characters!
|
||||
//u_strcat(buff+bufLen, segment+i);
|
||||
uprv_memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar));
|
||||
bufLen+=segLen-i;
|
||||
|
||||
temp.append(segment+i, segLen-i);
|
||||
ok = TRUE;
|
||||
break;
|
||||
}
|
||||
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
|
||||
U16_NEXT(decomp, decompPos, decompLen, decompCp);
|
||||
} else {
|
||||
//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
|
||||
|
||||
// brute force approach
|
||||
|
||||
U16_APPEND(buff, bufLen, bufSize, cp, overflow);
|
||||
|
||||
if(overflow) {
|
||||
/*
|
||||
* ### TODO handle buffer overflow
|
||||
* The buffer is large, but an overflow may still happen with
|
||||
* unusual input (many combining marks?).
|
||||
* Reallocate buffer and continue.
|
||||
* markus 20020929
|
||||
*/
|
||||
|
||||
overflow = FALSE;
|
||||
}
|
||||
temp.append(cp);
|
||||
|
||||
/* TODO: optimize
|
||||
// since we know that the classes are monotonically increasing, after zero
|
||||
|
@ -585,25 +553,20 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
|
|||
|
||||
//if (PROGRESS) printf("Matches\n");
|
||||
|
||||
if (bufLen == 0) {
|
||||
if (inputLen == temp.length()) {
|
||||
fillinResult->put(UnicodeString(), new UnicodeString(), status);
|
||||
return fillinResult; // succeed, but no remainder
|
||||
}
|
||||
|
||||
// brute force approach
|
||||
// check to make sure result is canonically equivalent
|
||||
int32_t tempLen = inputLen + bufLen;
|
||||
|
||||
UChar trial[bufSize];
|
||||
unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
|
||||
|
||||
if(U_FAILURE(status)
|
||||
|| uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0)
|
||||
{
|
||||
UnicodeString trial;
|
||||
nfd.normalize(temp, trial, status);
|
||||
if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return getEquivalents2(fillinResult, buff, bufLen, status);
|
||||
return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -769,7 +769,7 @@
|
|||
Name="collation"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\ucol_swp.c"
|
||||
RelativePath=".\ucol_swp.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
|
@ -961,7 +961,7 @@
|
|||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\utrie2.c"
|
||||
RelativePath=".\utrie2.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
|
@ -1172,6 +1172,10 @@
|
|||
RelativePath=".\locmap.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mutex.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\mutex.h"
|
||||
>
|
||||
|
@ -3057,6 +3061,62 @@
|
|||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filterednormalizer2.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\normalizer2.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\normalizer2.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\normalizer2impl.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\normalizer2impl.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\normlzr.cpp"
|
||||
>
|
||||
|
@ -3145,6 +3205,46 @@
|
|||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unicode\unorm2.h"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(InputPath)" ..\..\include\unicode
"
|
||||
Outputs="..\..\include\unicode\$(InputFileName)"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unorm_it.c"
|
||||
>
|
||||
|
@ -3470,7 +3570,7 @@
|
|||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uprops.c"
|
||||
RelativePath=".\uprops.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2006, International Business Machines
|
||||
* Copyright (C) 1997-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -81,4 +81,15 @@ uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, int32_t srcStart,
|
|||
U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t dstStart, int32_t count)
|
||||
{ uprv_arrayCopy(src+srcStart, dst+dstStart, count); }
|
||||
|
||||
/**
|
||||
* Checks that the string is readable and writable.
|
||||
* Sets U_ILLEGAL_ARGUMENT_ERROR if the string isBogus() or has an open getBuffer().
|
||||
*/
|
||||
inline void
|
||||
uprv_checkCanGetBuffer(const UnicodeString &s, UErrorCode &errorCode) {
|
||||
if(U_SUCCESS(errorCode) && s.isBogus()) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* _CPPUTILS */
|
||||
|
|
261
icu4c/source/common/filterednormalizer2.cpp
Normal file
261
icu4c/source/common/filterednormalizer2.cpp
Normal file
|
@ -0,0 +1,261 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: filterednormalizer2.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009dec10
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cpputils.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Public entry point: validates the arguments, empties dest, and then
// delegates to the appending overload starting with USET_SPAN_SIMPLE.
// On a pre-existing or src-related failure dest is set to bogus;
// passing the same object as src and dest sets U_ILLEGAL_ARGUMENT_ERROR
// and leaves dest untouched.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(&dest!=&src) {
            dest.remove();
            return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
        }
        // src and dest must be distinct objects.
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        dest.setToBogus();
    }
    return dest;
}
|
||||
|
||||
// Internal: No argument checking, and appends to dest.
|
||||
// Pass as input spanCondition the one that is likely to yield a non-zero
|
||||
// span length at the start of src.
|
||||
// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
|
||||
// USET_SPAN_SIMPLE should be passed in for the start of src
|
||||
// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after
|
||||
// an in-filter prefix.
|
||||
UnicodeString &
|
||||
FilteredNormalizer2::normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const {
|
||||
UnicodeString tempDest; // Don't throw away destination buffer between iterations.
|
||||
for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) {
|
||||
int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition);
|
||||
int32_t spanLength=spanLimit-prevSpanLimit;
|
||||
if(spanCondition==USET_SPAN_NOT_CONTAINED) {
|
||||
if(spanLength!=0) {
|
||||
dest.append(src, prevSpanLimit, spanLength);
|
||||
}
|
||||
spanCondition=USET_SPAN_SIMPLE;
|
||||
} else {
|
||||
if(spanLength!=0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit),
|
||||
tempDest, errorCode));
|
||||
if(U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
spanCondition=USET_SPAN_NOT_CONTAINED;
|
||||
}
|
||||
prevSpanLimit=spanLimit;
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
// Public API: forwards to the shared four-argument implementation
// with normalization of the second string switched on.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
|
||||
|
||||
// Public API: forwards to the shared four-argument implementation
// with normalization of the second string switched off.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
|
||||
|
||||
// Shared implementation behind normalizeSecondAndAppend() (doNormalize=TRUE)
// and append() (doNormalize=FALSE).
// Sets U_ILLEGAL_ARGUMENT_ERROR if first and second are the same object.
// Returns first in all cases.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&first==&second) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }
    if(first.isEmpty()) {
        // Nothing to merge with: the result is just (normalized) second.
        if(!doNormalize) {
            return first=second;
        }
        return normalize(second, first, errorCode);
    }
    // Merge the in-filter suffix of the first string
    // with the in-filter prefix of the second.
    const int32_t headLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(headLimit!=0) {
        UnicodeString head(second.tempSubString(0, headLimit));
        const int32_t tailStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(tailStart!=0) {
            // Only the in-filter tail of first may be touched by norm2:
            // work on a copy of the tail and splice the result back in.
            UnicodeString tail(first, tailStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(tail, head, errorCode);
            } else {
                norm2.append(tail, head, errorCode);
            }
            first.replace(tailStart, INT32_MAX, tail);
        } else if(doNormalize) {
            // All of first is inside the filter; operate on it directly.
            norm2.normalizeSecondAndAppend(first, head, errorCode);
        } else {
            norm2.append(first, head, errorCode);
        }
    }
    // Whatever follows the in-filter prefix of second starts out of filter.
    if(headLimit<second.length()) {
        UnicodeString remainder(second.tempSubString(headLimit, INT32_MAX));
        if(doNormalize) {
            normalize(remainder, first, USET_SPAN_NOT_CONTAINED, errorCode);
        } else {
            first.append(remainder);
        }
    }
    return first;
}
|
||||
|
||||
// Returns TRUE if every in-filter span of s is normalized according to norm2.
// Out-of-filter spans are skipped without being examined.
// Returns FALSE on any failure.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    // Alternate between in-filter and out-of-filter spans,
    // starting with an in-filter one.
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            const UBool normalized=
                norm2.isNormalized(s.tempSubStringBetween(start, limit), errorCode);
            if(U_FAILURE(errorCode) || !normalized) {
                return FALSE;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return TRUE;
}
|
||||
|
||||
// Runs norm2.quickCheck() on each in-filter span of s in turn and returns
// the first result that is not UNORM_YES (or the result obtained when an
// error occurred). Returns UNORM_YES if all in-filter spans pass, and
// UNORM_MAYBE if the arguments are rejected up front.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    // Alternate between in-filter and out-of-filter spans,
    // starting with an in-filter one.
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            const UNormalizationCheckResult spanResult=
                norm2.quickCheck(s.tempSubStringBetween(start, limit), errorCode);
            if(spanResult!=UNORM_YES || U_FAILURE(errorCode)) {
                return spanResult;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return UNORM_YES;
}
|
||||
|
||||
// Returns the end index (in s) of the leading part of s for which
// norm2.spanQuickCheckYes() covers every in-filter span completely.
// Out-of-filter spans are skipped without being examined.
// Returns 0 if the arguments are rejected up front.
int32_t
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    // Alternate between in-filter and out-of-filter spans,
    // starting with an in-filter one.
    USetSpanCondition condition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, condition);
        if(condition!=USET_SPAN_NOT_CONTAINED) {
            // norm2 reports a length relative to the substring;
            // convert it back into an index into s.
            const int32_t yesLength=
                norm2.spanQuickCheckYes(s.tempSubStringBetween(start, limit), errorCode);
            const int32_t yesLimit=start+yesLength;
            if(U_FAILURE(errorCode) || yesLimit<limit) {
                return yesLimit;
            }
            condition=USET_SPAN_NOT_CONTAINED;
        } else {
            condition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return s.length();
}
|
||||
|
||||
// A code point outside the filter set always has a boundary before it;
// norm2 is only consulted for code points inside the set.
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    const UBool inFilter=set.contains(c);
    return !inFilter || norm2.hasBoundaryBefore(c);
}
|
||||
|
||||
// A code point outside the filter set always has a boundary after it;
// norm2 is only consulted for code points inside the set.
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    const UBool inFilter=set.contains(c);
    return !inFilter || norm2.hasBoundaryAfter(c);
}
|
||||
|
||||
// A code point outside the filter set is always reported as inert;
// norm2 is only consulted for code points inside the set.
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    const UBool inFilter=set.contains(c);
    return !inFilter || norm2.isInert(c);
}
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(FilteredNormalizer2)
|
||||
|
||||
/**
 * C API: creates a FilteredNormalizer2 from a normalizer and a filter set.
 *
 * @param norm2      the underlying normalizer; must not be NULL
 * @param filterSet  the filter set; must not be NULL
 * @param pErrorCode in/out error code; nothing is done if it already
 *                   indicates a failure on entry
 * @return the new instance cast to UNormalizer2 *, or NULL on failure.
 *         Sets U_ILLEGAL_ARGUMENT_ERROR if norm2 or filterSet is NULL and
 *         U_MEMORY_ALLOCATION_ERROR if the allocation fails.
 *
 * NOTE(review): norm2 and filterSet are handed to the constructor by
 * reference; whether the new object copies them or keeps referring to them
 * is not visible here -- confirm the lifetime requirements.
 */
U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    // Both pointers are dereferenced below to form references,
    // so neither of them may be NULL.
    if(norm2==NULL || filterSet==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
|
@ -1,18 +1,91 @@
|
|||
/**
|
||||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 2008, International Business Machines Corporation. *
|
||||
* All Rights Reserved. *
|
||||
*
|
||||
* Copyright (C) 2008-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: mutex.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "mutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Returns the stored instance, creating it with the instantiator first if
// none has been stored yet. The instantiator runs before the Mutex is
// taken, so the object it produced is handed back through duplicate
// whenever it was not stored (another instance is already in place, or
// errorCode indicates a failure).
void *SimpleSingleton::getInstance(InstantiatorFn *instantiator, const void *context,
                                   void *&duplicate,
                                   UErrorCode &errorCode) {
    duplicate=NULL;
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    void *existing;
    UMTX_CHECK(NULL, fInstance, existing);
    if(existing!=NULL) {
        return existing;
    }
    void *created=instantiator(context, errorCode);
    Mutex mutex;
    if(U_FAILURE(errorCode) || fInstance!=NULL) {
        // Not stored: let the caller dispose of it.
        duplicate=created;
    } else {
        fInstance=created;
    }
    return fInstance;
}
|
||||
|
||||
// Returns the stored instance, or NULL with the remembered error code if an
// earlier creation attempt failed. If no attempt has been recorded yet, the
// instantiator is called (before the Mutex is taken) and its outcome --
// success or failure -- is recorded unless another outcome has been recorded
// in the meantime. An object that was created but not stored is handed back
// through duplicate.
void *TriStateSingleton::getInstance(InstantiatorFn *instantiator, const void *context,
                                     void *&duplicate,
                                     UErrorCode &errorCode) {
    duplicate=NULL;
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    int8_t state;
    UMTX_CHECK(NULL, fHaveInstance, state);
    if(state<0) {
        // instance creation failed
        errorCode=fErrorCode;
        return NULL;
    }
    if(state>0) {
        // instance was created
        return fInstance;
    }
    // state==0: no outcome recorded yet
    void *created=instantiator(context, errorCode);
    Mutex mutex;
    if(fHaveInstance!=0) {
        // An outcome was recorded in the meantime; report that one.
        errorCode=fErrorCode;
    } else if(U_FAILURE(errorCode)) {
        fErrorCode=errorCode;
        fHaveInstance=-1;
    } else {
        fInstance=created;
        created=NULL;  // stored, so there is no duplicate to hand back
        fHaveInstance=1;
    }
    duplicate=created;
    return fInstance;
}
|
||||
|
||||
// Puts all three fields back to their initial values
// (no instance, no error, no creation attempt recorded).
// Does not delete the instance and takes no mutex.
void TriStateSingleton::reset() {
    fInstance=NULL;
    fErrorCode=U_ZERO_ERROR;
    fHaveInstance=0;
}
|
||||
|
||||
#if UCONFIG_NO_SERVICE
|
||||
|
||||
/* If UCONFIG_NO_SERVICE, then there is no invocation of Mutex elsewhere in
|
||||
common, so add one here to force an export */
|
||||
#include "mutex.h"
|
||||
static Mutex *aMutex = 0;
|
||||
|
||||
/* UCONFIG_NO_SERVICE */
|
||||
#endif
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1997-2009, International Business Machines
|
||||
* Copyright (C) 1997-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -71,6 +71,128 @@ inline Mutex::~Mutex()
|
|||
umtx_unlock(fMutex);
|
||||
}
|
||||
|
||||
// common code for singletons ---------------------------------------------- ***
|
||||
|
||||
/**
|
||||
* Function pointer for the instantiator parameter of
|
||||
* SimpleSingleton::getInstance() and TriStateSingleton::getInstance().
|
||||
* The function creates some object, optionally using the context parameter.
|
||||
* The function need not check for U_FAILURE(errorCode).
|
||||
*/
|
||||
typedef void *InstantiatorFn(const void *context, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Singleton struct with shared instantiation/mutexing code.
|
||||
* Simple: Does not remember if a previous instantiation failed.
|
||||
* Best used if the instantiation can really only fail with an out-of-memory error,
|
||||
* otherwise use a TriStateSingleton.
|
||||
* Best used via SimpleSingletonWrapper or similar.
|
||||
* Define a static SimpleSingleton instance via the STATIC_SIMPLE_SINGLETON macro.
|
||||
*/
|
||||
struct SimpleSingleton {
|
||||
void *fInstance;
|
||||
|
||||
/**
|
||||
* Returns the singleton instance, or NULL if it could not be created.
|
||||
* Calls the instantiator with the context if the instance has not been
|
||||
* created yet. In a race condition, the duplicate may not be NULL.
|
||||
* The caller must delete the duplicate.
|
||||
* The caller need not initialize the duplicate before the call.
|
||||
*/
|
||||
void *getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
void *&duplicate,
|
||||
UErrorCode &errorCode);
|
||||
/**
|
||||
* Resets the fields. The caller must have deleted the singleton instance.
|
||||
* Not mutexed.
|
||||
* Call this from a cleanup function.
|
||||
*/
|
||||
void reset() { fInstance=NULL; }
|
||||
};
|
||||
|
||||
#define STATIC_SIMPLE_SINGLETON(name) static SimpleSingleton name={ NULL }
|
||||
|
||||
/**
|
||||
* Handy wrapper for an SimpleSingleton.
|
||||
* Intended for temporary use on the stack, to make the SimpleSingleton easier to deal with.
|
||||
* Takes care of the duplicate deletion and type casting.
|
||||
*/
|
||||
template<typename T>
|
||||
class SimpleSingletonWrapper {
|
||||
public:
|
||||
SimpleSingletonWrapper(SimpleSingleton &s) : singleton(s) {}
|
||||
void deleteInstance() {
|
||||
delete (T *)singleton.fInstance;
|
||||
singleton.reset();
|
||||
}
|
||||
T *getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
UErrorCode &errorCode) {
|
||||
void *duplicate;
|
||||
T *instance=(T *)singleton.getInstance(instantiator, context, duplicate, errorCode);
|
||||
delete (T *)duplicate;
|
||||
return instance;
|
||||
}
|
||||
private:
|
||||
SimpleSingleton &singleton;
|
||||
};
|
||||
|
||||
/**
|
||||
* Singleton struct with shared instantiation/mutexing code.
|
||||
* Tri-state: Instantiation succeeded/failed/not attempted yet.
|
||||
* Best used via TriStateSingletonWrapper or similar.
|
||||
* Define a static TriStateSingleton instance via the STATIC_TRI_STATE_SINGLETON macro.
|
||||
*/
|
||||
struct TriStateSingleton {
|
||||
void *fInstance;
|
||||
UErrorCode fErrorCode;
|
||||
int8_t fHaveInstance;
|
||||
|
||||
/**
|
||||
* Returns the singleton instance, or NULL if it could not be created.
|
||||
* Calls the instantiator with the context if the instance has not been
|
||||
* created yet. In a race condition, the duplicate may not be NULL.
|
||||
* The caller must delete the duplicate.
|
||||
* The caller need not initialize the duplicate before the call.
|
||||
* The singleton creation is only attempted once. If it fails,
|
||||
* the singleton will then always return NULL.
|
||||
*/
|
||||
void *getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
void *&duplicate,
|
||||
UErrorCode &errorCode);
|
||||
/**
|
||||
* Resets the fields. The caller must have deleted the singleton instance.
|
||||
* Not mutexed.
|
||||
* Call this from a cleanup function.
|
||||
*/
|
||||
void reset();
|
||||
};
|
||||
|
||||
#define STATIC_TRI_STATE_SINGLETON(name) static TriStateSingleton name={ NULL, U_ZERO_ERROR, 0 }
|
||||
|
||||
/**
|
||||
* Handy wrapper for an TriStateSingleton.
|
||||
* Intended for temporary use on the stack, to make the TriStateSingleton easier to deal with.
|
||||
* Takes care of the duplicate deletion and type casting.
|
||||
*/
|
||||
template<typename T>
|
||||
class TriStateSingletonWrapper {
|
||||
public:
|
||||
TriStateSingletonWrapper(TriStateSingleton &s) : singleton(s) {}
|
||||
void deleteInstance() {
|
||||
delete (T *)singleton.fInstance;
|
||||
singleton.reset();
|
||||
}
|
||||
T *getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
UErrorCode &errorCode) {
|
||||
void *duplicate;
|
||||
T *instance=(T *)singleton.getInstance(instantiator, context, duplicate, errorCode);
|
||||
delete (T *)duplicate;
|
||||
return instance;
|
||||
}
|
||||
private:
|
||||
TriStateSingleton &singleton;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif //_MUTEX_
|
||||
|
|
744
icu4c/source/common/normalizer2.cpp
Normal file
744
icu4c/source/common/normalizer2.cpp
Normal file
|
@ -0,0 +1,744 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: normalizer2.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov22
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cpputils.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ucln_cmn.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
|
||||
|
||||
// Normalizer2 implementation for the old UNORM_NONE.
|
||||
class NoopNormalizer2 : public Normalizer2 {
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&dest!=&src) {
|
||||
dest=src;
|
||||
} else {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
first.append(second);
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(&first!=&second) {
|
||||
first.append(second);
|
||||
} else {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
return TRUE;
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
return UNORM_YES;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
return s.length();
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return TRUE; }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return TRUE; }
|
||||
virtual UBool isInert(UChar32 c) const { return TRUE; }
|
||||
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
};
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NoopNormalizer2)
|
||||
|
||||
// Intermediate class:
|
||||
// Has Normalizer2Impl and does boilerplate argument checking and setup.
|
||||
class Normalizer2WithImpl : public Normalizer2 {
|
||||
public:
|
||||
Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {}
|
||||
|
||||
// normalize
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dest.setToBogus();
|
||||
return dest;
|
||||
}
|
||||
const UChar *sArray=src.getBuffer();
|
||||
if(&dest==&src || sArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
dest.setToBogus();
|
||||
return dest;
|
||||
}
|
||||
dest.remove();
|
||||
ReorderingBuffer buffer(impl, dest);
|
||||
if(buffer.init(src.length(), errorCode)) {
|
||||
normalize(sArray, sArray+src.length(), buffer, errorCode);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
|
||||
// normalize and append
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
return normalizeSecondAndAppend(first, second, TRUE, errorCode);
|
||||
}
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
return normalizeSecondAndAppend(first, second, FALSE, errorCode);
|
||||
}
|
||||
UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UBool doNormalize,
|
||||
UErrorCode &errorCode) const {
|
||||
uprv_checkCanGetBuffer(first, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return first;
|
||||
}
|
||||
const UChar *secondArray=second.getBuffer();
|
||||
if(&first==&second || secondArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return first;
|
||||
}
|
||||
ReorderingBuffer buffer(impl, first);
|
||||
if(buffer.init(first.length()+second.length(), errorCode)) {
|
||||
normalizeAndAppend(secondArray, secondArray+second.length(), doNormalize,
|
||||
buffer, errorCode);
|
||||
}
|
||||
return first;
|
||||
}
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
|
||||
// quick checks
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
const UChar *sArray=s.getBuffer();
|
||||
if(sArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
const UChar *sLimit=sArray+s.length();
|
||||
return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
const UChar *sArray=s.getBuffer();
|
||||
if(sArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return 0;
|
||||
}
|
||||
return (int32_t)(spanQuickCheckYes(sArray, sArray+s.length(), errorCode)-sArray);
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const = 0;
|
||||
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
|
||||
return UNORM_YES;
|
||||
}
|
||||
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
|
||||
const Normalizer2Impl &impl;
|
||||
};
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer2WithImpl)
|
||||
|
||||
class DecomposeNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
||||
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.decompose(src, limit, &buffer, errorCode);
|
||||
}
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.decomposeAndAppend(src, limit, doNormalize, buffer, errorCode);
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
|
||||
return impl.decompose(src, limit, NULL, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
|
||||
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundary(c, TRUE); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundary(c, FALSE); }
|
||||
virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
|
||||
};
|
||||
|
||||
class ComposeNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) :
|
||||
Normalizer2WithImpl(ni), onlyContiguous(fcc) {}
|
||||
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
|
||||
}
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, buffer, errorCode);
|
||||
}
|
||||
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
const UChar *sArray=s.getBuffer();
|
||||
if(sArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
UnicodeString temp;
|
||||
ReorderingBuffer buffer(impl, temp);
|
||||
if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization
|
||||
return FALSE;
|
||||
}
|
||||
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
const UChar *sArray=s.getBuffer();
|
||||
if(sArray==NULL) {
|
||||
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
UNormalizationCheckResult qcResult=UNORM_YES;
|
||||
impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult);
|
||||
return qcResult;
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
|
||||
return impl.composeQuickCheck(src, limit, onlyContiguous, NULL);
|
||||
}
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
|
||||
return impl.getCompQuickCheck(impl.getNorm16(c));
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const {
|
||||
return impl.hasCompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, FALSE);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, TRUE);
|
||||
}
|
||||
private:
|
||||
UBool onlyContiguous;
|
||||
};
|
||||
|
||||
class FCDNormalizer2 : public Normalizer2WithImpl {
|
||||
public:
|
||||
FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
|
||||
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.makeFCD(src, limit, &buffer, errorCode);
|
||||
}
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
impl.makeFCDAndAppend(src, limit, doNormalize, buffer, errorCode);
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
|
||||
return impl.makeFCD(src, limit, NULL, errorCode);
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); }
|
||||
virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); }
|
||||
};
|
||||
|
||||
// instance cache ---------------------------------------------------------- ***
|
||||
|
||||
struct Norm2AllModes : public UMemory {
|
||||
static Norm2AllModes *createInstance(const char *packageName,
|
||||
const char *name,
|
||||
UErrorCode &errorCode);
|
||||
Norm2AllModes() : comp(impl, FALSE), decomp(impl), fcd(impl), fcc(impl, TRUE) {}
|
||||
|
||||
Normalizer2Impl impl;
|
||||
ComposeNormalizer2 comp;
|
||||
DecomposeNormalizer2 decomp;
|
||||
FCDNormalizer2 fcd;
|
||||
ComposeNormalizer2 fcc;
|
||||
};
|
||||
|
||||
// Allocates a Norm2AllModes and calls impl.load(packageName, name, errorCode).
// Returns the new object, which the caller then owns, or NULL with errorCode set.
Norm2AllModes *
Norm2AllModes::createInstance(const char *packageName,
                              const char *name,
                              UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    LocalPointer<Norm2AllModes> modes(new Norm2AllModes);
    if(modes.isNull()) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    modes->impl.load(packageName, name, errorCode);
    if(U_FAILURE(errorCode)) {
        // The LocalPointer still owns the object and disposes of it on return.
        return NULL;
    }
    return modes.orphan();
}
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV uprv_normalizer2_cleanup();
|
||||
U_CDECL_END
|
||||
|
||||
// Stack helper that ties a TriStateSingleton to the data name used to create it.
class Norm2AllModesSingleton : public TriStateSingletonWrapper<Norm2AllModes> {
public:
    Norm2AllModesSingleton(TriStateSingleton &s, const char *n) :
        TriStateSingletonWrapper<Norm2AllModes>(s), name(n) {}
    // Returns the cached Norm2AllModes; the name is handed to createInstance()
    // as its context.
    Norm2AllModes *getInstance(UErrorCode &errorCode) {
        typedef TriStateSingletonWrapper<Norm2AllModes> Wrapper;
        return Wrapper::getInstance(createInstance, name, errorCode);
    }
private:
    // Instantiator callback: registers the cleanup function, then creates the
    // object with a NULL package name and the context as the data name.
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
        const char *dataName=(const char *)context;
        return Norm2AllModes::createInstance(NULL, dataName, errorCode);
    }

    const char *name;
};
|
||||
|
||||
STATIC_TRI_STATE_SINGLETON(nfcSingleton);
|
||||
STATIC_TRI_STATE_SINGLETON(nfkcSingleton);
|
||||
STATIC_TRI_STATE_SINGLETON(nfkc_cfSingleton);
|
||||
|
||||
// Stack helper around the SimpleSingleton that holds the NoopNormalizer2.
class Norm2Singleton : public SimpleSingletonWrapper<Normalizer2> {
public:
    Norm2Singleton(SimpleSingleton &s) : SimpleSingletonWrapper<Normalizer2>(s) {}
    // Returns the cached instance; createInstance() is passed a NULL context.
    Normalizer2 *getInstance(UErrorCode &errorCode) {
        typedef SimpleSingletonWrapper<Normalizer2> Wrapper;
        return Wrapper::getInstance(createInstance, NULL, errorCode);
    }
private:
    // Instantiator callback: allocates the no-op normalizer.
    // The cleanup function is registered whether or not the allocation worked.
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        Normalizer2 *instance=new NoopNormalizer2;
        if(instance==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
        ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
        return instance;
    }
};
|
||||
|
||||
STATIC_SIMPLE_SINGLETON(noopSingleton);
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
// Cleanup callback: deletes every cached instance in this file and resets its singleton.
// The name argument is irrelevant for deletion, hence NULL.
static UBool U_CALLCONV uprv_normalizer2_cleanup() {
    Norm2AllModesSingleton nfcWrapper(nfcSingleton, NULL);
    nfcWrapper.deleteInstance();

    Norm2AllModesSingleton nfkcWrapper(nfkcSingleton, NULL);
    nfkcWrapper.deleteInstance();

    Norm2AllModesSingleton nfkc_cfWrapper(nfkc_cfSingleton, NULL);
    nfkc_cfWrapper.deleteInstance();

    Norm2Singleton noopWrapper(noopSingleton);
    noopWrapper.deleteInstance();

    return TRUE;
}
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
// Returns the compose normalizer of the cached "nfc" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getNFCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
|
||||
|
||||
// Returns the decompose normalizer of the cached "nfc" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getNFDInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->decomp;
}
|
||||
|
||||
// Returns the FCD normalizer of the cached "nfc" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    // Called for its effect only: the returned trie is not used here, and
    // &modes->fcd is returned even if this call sets errorCode.
    modes->impl.getFCDTrie(errorCode);
    return &modes->fcd;
}
|
||||
|
||||
// Returns the contiguous-compose normalizer of the cached "nfc" data, or NULL if unavailable.
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->fcc;
}
|
||||
|
||||
// Returns the compose normalizer of the cached "nfkc" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
|
||||
|
||||
// Returns the decompose normalizer of the cached "nfkc" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKDInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->decomp;
}
|
||||
|
||||
// Returns the compose normalizer of the cached "nfkc_cf" data, or NULL if that is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKC_CFInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=
        Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
|
||||
|
||||
// Returns the cached NoopNormalizer2 instance.
const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    Norm2Singleton wrapper(noopSingleton);
    return wrapper.getInstance(errorCode);
}
|
||||
|
||||
// Maps an old-style UNormalizationMode to the matching cached Normalizer2.
// Any mode without its own case below gets the no-op instance.
const Normalizer2 *
Normalizer2Factory::getInstance(UNormalizationMode mode, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    const Normalizer2 *result;
    switch(mode) {
    case UNORM_NFC:
        result=getNFCInstance(errorCode);
        break;
    case UNORM_NFD:
        result=getNFDInstance(errorCode);
        break;
    case UNORM_NFKC:
        result=getNFKCInstance(errorCode);
        break;
    case UNORM_NFKD:
        result=getNFKDInstance(errorCode);
        break;
    case UNORM_FCD:
        result=getFCDInstance(errorCode);
        break;
    default:  // UNORM_NONE
        result=getNoopInstance(errorCode);
        break;
    }
    return result;
}
|
||||
|
||||
// Returns the Normalizer2Impl of the cached "nfc" data, or NULL if that is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
|
||||
|
||||
// Returns the Normalizer2Impl of the cached "nfkc" data, or NULL if that is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKCImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
|
||||
|
||||
// Returns the Normalizer2Impl of the cached "nfkc_cf" data, or NULL if that is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKC_CFImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=
        Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
|
||||
|
||||
// Returns the FCD trie of the cached "nfc" data, or NULL if that data is unavailable.
const UTrie2 *
Normalizer2Factory::getFCDTrie(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return modes->impl.getFCDTrie(errorCode);
}
|
||||
|
||||
// Returns a cached Normalizer2 for the given data name and mode.
// Only packageName==NULL together with one of the names "nfc", "nfkc" or "nfkc_cf"
// is served at present; every other combination ends in U_UNSUPPORTED_ERROR.
// A NULL name is rejected with U_ILLEGAL_ARGUMENT_ERROR.
// Returns NULL whenever errorCode indicates a failure on entry or is set here.
const Normalizer2 *
Normalizer2::getInstance(const char *packageName,
                         const char *name,
                         UNormalization2Mode mode,
                         UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    if(name==NULL) {
        // The uprv_strcmp() calls below must never be handed a NULL string.
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if(packageName==NULL) {
        Norm2AllModes *allModes=NULL;
        if(0==uprv_strcmp(name, "nfc")) {
            allModes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
        } else if(0==uprv_strcmp(name, "nfkc")) {
            allModes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
        } else if(0==uprv_strcmp(name, "nfkc_cf")) {
            allModes=Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
        }
        if(allModes!=NULL) {
            switch(mode) {
            case UNORM2_COMPOSE:
                return &allModes->comp;
            case UNORM2_DECOMPOSE:
                return &allModes->decomp;
            case UNORM2_FCD:
                // The trie itself is not needed here; note that &allModes->fcd
                // is returned even if this call sets errorCode.
                allModes->impl.getFCDTrie(errorCode);
                return &allModes->fcd;
            case UNORM2_COMPOSE_CONTIGUOUS:
                return &allModes->fcc;
            default:
                break;  // do nothing
            }
        }
    }
    if(U_SUCCESS(errorCode)) {
        // TODO: Real loading and caching...
        errorCode=U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
|
||||
|
||||
UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Normalizer2)
|
||||
|
||||
// C API ------------------------------------------------------------------- ***
|
||||
|
||||
// C wrapper for Normalizer2::getInstance(); pErrorCode is dereferenced unconditionally.
U_DRAFT const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
                   const char *name,
                   UNormalization2Mode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2::getInstance(packageName, name, mode, *pErrorCode);
    return (const UNormalizer2 *)n2;
}
|
||||
|
||||
// Deletes the Normalizer2 behind the handle.
// NOTE(review): the pointers returned by Normalizer2::getInstance() in this file
// refer to members of cached Norm2AllModes objects -- confirm that callers
// never pass those to this function.
U_DRAFT void U_EXPORT2
unorm2_close(UNormalizer2 *norm2) {
    Normalizer2 *n2=(Normalizer2 *)norm2;
    delete n2;
}
|
||||
|
||||
// Normalizes src[0..length[ (length==-1: NUL-terminated) into dest[0..capacity[
// and returns the result of extracting the output string into dest.
// src and dest must not be the same pointer.
U_DRAFT int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
                 const UChar *src, int32_t length,
                 UChar *dest, int32_t capacity,
                 UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badSrc= src==NULL || length<-1;
    const UBool badDest= capacity<0 || (dest==NULL && capacity>0);
    if(badSrc || badDest || src==dest) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    // Output string on top of the caller's buffer, starting out empty.
    UnicodeString destString(dest, 0, capacity);
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    if(n2->getDynamicClassID()!=Normalizer2WithImpl::getStaticClassID()) {
        // Generic path through the public C++ API.
        UnicodeString srcString(length<0, src, length);
        n2->normalize(srcString, destString, *pErrorCode);
    } else {
        // Avoid duplicate argument checking and support NUL-terminated src.
        const Normalizer2WithImpl *n2wi=(const Normalizer2WithImpl *)n2;
        const UChar *srcLimit= length>=0 ? src+length : NULL;
        // The buffer lives only inside this block, so it is destroyed
        // before destString is extracted below.
        ReorderingBuffer buffer(n2wi->impl, destString);
        if(buffer.init(length, *pErrorCode)) {
            n2wi->normalize(src, srcLimit, buffer, *pErrorCode);
        }
    }
    return destString.extract(dest, capacity, *pErrorCode);
}
|
||||
|
||||
// Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=TRUE)
// and unorm2_append() (doNormalize=FALSE).
// Appends second to the string held in first[0..firstCapacity[ and returns the
// result of extracting the combined string back into first.
// A length of -1 means NUL-terminated. first and second must not be the same pointer.
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
                         UChar *first, int32_t firstLength, int32_t firstCapacity,
                         const UChar *second, int32_t secondLength,
                         UBool doNormalize,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    const UBool badSecond= second==NULL || secondLength<-1;
    const UBool badFirst=
        firstCapacity<0 || (first==NULL && firstCapacity>0) || firstLength<-1;
    if(badSecond || badFirst || first==second) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UnicodeString firstString(first, firstLength, firstCapacity);
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    if(n2->getDynamicClassID()!=Normalizer2WithImpl::getStaticClassID()) {
        // Generic path through the public C++ API.
        UnicodeString secondString(secondLength<0, second, secondLength);
        if(doNormalize) {
            n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
        } else {
            n2->append(firstString, secondString, *pErrorCode);
        }
    } else {
        // Avoid duplicate argument checking and support NUL-terminated src.
        const Normalizer2WithImpl *n2wi=(const Normalizer2WithImpl *)n2;
        const UChar *secondLimit= secondLength>=0 ? second+secondLength : NULL;
        // The buffer lives only inside this block, so it is destroyed
        // before firstString is extracted below.
        ReorderingBuffer buffer(n2wi->impl, firstString);
        if(buffer.init(firstLength+secondLength+1, *pErrorCode)) {  // destCapacity>=-1
            n2wi->normalizeAndAppend(second, secondLimit,
                                     doNormalize, buffer, *pErrorCode);
        }
    }
    return firstString.extract(first, firstCapacity, *pErrorCode);
}
|
||||
|
||||
// Public entry point: forwards to the file-static helper with doNormalize=TRUE.
U_DRAFT int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                UChar *first, int32_t firstLength, int32_t firstCapacity,
                                const UChar *second, int32_t secondLength,
                                UErrorCode *pErrorCode) {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
|
||||
|
||||
// Public entry point: forwards to the file-static helper with doNormalize=FALSE.
U_DRAFT int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              UChar *first, int32_t firstLength, int32_t firstCapacity,
              const UChar *second, int32_t secondLength,
              UErrorCode *pErrorCode) {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(norm2,
                                    first, firstLength, firstCapacity,
                                    second, secondLength,
                                    doNormalize, pErrorCode);
}
|
||||
|
||||
// C wrapper for Normalizer2::isNormalized(); length==-1 means NUL-terminated.
// Returns FALSE on a prior error or on illegal arguments.
U_DRAFT UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const UChar *s, int32_t length,
                    UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return FALSE;
    }
    if(s==NULL || length<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return FALSE;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->isNormalized(sString, *pErrorCode);
}
|
||||
|
||||
// C wrapper for Normalizer2::quickCheck(); length==-1 means NUL-terminated.
// Returns UNORM_NO on a prior error or on illegal arguments.
U_DRAFT UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const UChar *s, int32_t length,
                  UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return UNORM_NO;
    }
    if(s==NULL || length<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return UNORM_NO;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->quickCheck(sString, *pErrorCode);
}
|
||||
|
||||
// C wrapper for Normalizer2::spanQuickCheckYes(); length==-1 means NUL-terminated.
// Returns 0 on a prior error or on illegal arguments.
U_DRAFT int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const UChar *s, int32_t length,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s==NULL || length<-1) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    UnicodeString sString(length<0, s, length);
    return n2->spanQuickCheckYes(sString, *pErrorCode);
}
|
||||
|
||||
// C wrapper for Normalizer2::hasBoundaryBefore().
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryBefore(c);
}
|
||||
|
||||
// C wrapper for Normalizer2::hasBoundaryAfter().
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryAfter(c);
}
|
||||
|
||||
// C wrapper for Normalizer2::isInert().
U_DRAFT UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->isInert(c);
}
|
||||
|
||||
// Some properties APIs ---------------------------------------------------- ***
|
||||
|
||||
// Returns the quick check value of c for the given mode.
// Modes at or below UNORM_NONE, or at or above UNORM_FCD, always yield UNORM_YES;
// a failure to get the normalizer yields UNORM_MAYBE.
U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
    const UBool modeInRange= UNORM_NONE<mode && mode<UNORM_FCD;
    if(!modeInRange) {
        return UNORM_YES;
    }
    UErrorCode errorCode=U_ZERO_ERROR;
    const Normalizer2 *norm2=Normalizer2Factory::getInstance(mode, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    // presumably every mode left at this point maps to a Normalizer2WithImpl
    // subclass in Normalizer2Factory::getInstance() -- verify against the enum
    const Normalizer2WithImpl *n2wi=(const Normalizer2WithImpl *)norm2;
    return n2wi->getQuickCheck(c);
}
|
||||
|
||||
// Returns the index array of the FCD trie and stores the trie's highStart
// in fcdHighStart. On failure, returns NULL and leaves fcdHighStart untouched.
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
    const UTrie2 *fcdTrie=Normalizer2Factory::getFCDTrie(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    fcdHighStart=fcdTrie->highStart;
    return fcdTrie->index;
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
1727
icu4c/source/common/normalizer2impl.cpp
Normal file
1727
icu4c/source/common/normalizer2impl.cpp
Normal file
File diff suppressed because it is too large
Load diff
603
icu4c/source/common/normalizer2impl.h
Normal file
603
icu4c/source/common/normalizer2impl.h
Normal file
|
@ -0,0 +1,603 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: normalizer2impl.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov22
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __NORMALIZER2IMPL_H__
|
||||
#define __NORMALIZER2IMPL_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/udata.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "mutex.h"
|
||||
#include "uset_imp.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Hangul {
|
||||
public:
|
||||
/* Korean Hangul and Jamo constants */
|
||||
enum {
|
||||
JAMO_L_BASE=0x1100, /* "lead" jamo */
|
||||
JAMO_V_BASE=0x1161, /* "vowel" jamo */
|
||||
JAMO_T_BASE=0x11a7, /* "trail" jamo */
|
||||
|
||||
HANGUL_BASE=0xac00,
|
||||
|
||||
JAMO_L_COUNT=19,
|
||||
JAMO_V_COUNT=21,
|
||||
JAMO_T_COUNT=28,
|
||||
|
||||
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
|
||||
HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
|
||||
};
|
||||
|
||||
static inline UBool isHangul(UChar32 c) {
|
||||
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
|
||||
}
|
||||
static inline UBool
|
||||
isHangulWithoutJamoT(UChar c) {
|
||||
c-=HANGUL_BASE;
|
||||
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
}
|
||||
static inline UBool isJamoL(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
|
||||
}
|
||||
static inline UBool isJamoV(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer
|
||||
* and returns the length of the decomposition (2 or 3).
|
||||
*/
|
||||
static inline int32_t decompose(UChar32 c, UChar buffer[3]) {
|
||||
c-=HANGUL_BASE;
|
||||
UChar32 c2=c%JAMO_T_COUNT;
|
||||
c/=JAMO_T_COUNT;
|
||||
buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
|
||||
buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
|
||||
if(c2==0) {
|
||||
return 2;
|
||||
} else {
|
||||
buffer[2]=(UChar)(JAMO_T_BASE+c2);
|
||||
return 3;
|
||||
}
|
||||
}
|
||||
private:
|
||||
Hangul(); // no instantiation
|
||||
};
|
||||
|
||||
class Normalizer2Impl;
|
||||
|
||||
class ReorderingBuffer : public UMemory {
|
||||
public:
|
||||
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
|
||||
impl(ni), str(dest),
|
||||
start(NULL), reorderStart(NULL), limit(NULL),
|
||||
remainingCapacity(0), lastCC(0) {}
|
||||
~ReorderingBuffer() {
|
||||
if(start!=NULL) {
|
||||
str.releaseBuffer((int32_t)(limit-start));
|
||||
}
|
||||
}
|
||||
UBool init(int32_t destCapacity, UErrorCode &errorCode);
|
||||
|
||||
UBool isEmpty() const { return start==limit; }
|
||||
int32_t length() const { return (int32_t)(limit-start); }
|
||||
UChar *getStart() { return start; }
|
||||
UChar *getLimit() { return limit; }
|
||||
uint8_t getLastCC() const { return lastCC; }
|
||||
|
||||
UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
|
||||
return (c<=0xffff) ?
|
||||
appendBMP((UChar)c, cc, errorCode) :
|
||||
appendSupplementary(c, cc, errorCode);
|
||||
}
|
||||
// s must be in NFD, otherwise change the implementation.
|
||||
UBool append(const UChar *s, int32_t length,
|
||||
uint8_t leadCC, uint8_t trailCC,
|
||||
UErrorCode &errorCode);
|
||||
UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
|
||||
if(remainingCapacity==0 && !resize(1, errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
if(lastCC<=cc || cc==0) {
|
||||
*limit++=c;
|
||||
lastCC=cc;
|
||||
if(cc<=1) {
|
||||
reorderStart=limit;
|
||||
}
|
||||
} else {
|
||||
insert(c, cc);
|
||||
}
|
||||
--remainingCapacity;
|
||||
return TRUE;
|
||||
}
|
||||
UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
|
||||
UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);
|
||||
void removeZeroCCSuffix(int32_t length);
|
||||
void setReorderingLimitAndLastCC(UChar *newLimit, uint8_t newLastCC) {
|
||||
remainingCapacity+=(int32_t)(limit-newLimit);
|
||||
reorderStart=limit=newLimit;
|
||||
lastCC=newLastCC;
|
||||
}
|
||||
private:
|
||||
/*
|
||||
* TODO: Revisit whether it makes sense to track reorderStart.
|
||||
* It is set to after the last known character with cc<=1,
|
||||
* which stops previousCC() before it reads that character and looks up its cc.
|
||||
* previousCC() is normally only called from insert().
|
||||
* In other words, reorderStart speeds up the insertion of a combining mark
|
||||
* into a multi-combining mark sequence where it does not belong at the end.
|
||||
* This might not be worth the trouble.
|
||||
* On the other hand, it's not a huge amount of trouble.
|
||||
*
|
||||
* We probably need it for UNORM_SIMPLE_APPEND.
|
||||
*/
|
||||
|
||||
UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
|
||||
void insert(UChar32 c, uint8_t cc);
|
||||
static void writeCodePoint(UChar *p, UChar32 c) {
|
||||
if(c<=0xffff) {
|
||||
*p=(UChar)c;
|
||||
} else {
|
||||
p[0]=U16_LEAD(c);
|
||||
p[1]=U16_TRAIL(c);
|
||||
}
|
||||
}
|
||||
UBool resize(int32_t appendLength, UErrorCode &errorCode);
|
||||
|
||||
const Normalizer2Impl &impl;
|
||||
UnicodeString &str;
|
||||
UChar *start, *reorderStart, *limit;
|
||||
int32_t remainingCapacity;
|
||||
uint8_t lastCC;
|
||||
|
||||
// private backward iterator
|
||||
void setIterator() { codePointStart=limit; }
|
||||
void skipPrevious(); // Requires start<codePointStart.
|
||||
uint8_t previousCC(); // Returns 0 if there is no previous character.
|
||||
|
||||
UChar *codePointStart, *codePointLimit;
|
||||
};
|
||||
|
||||
/**
 * Low-level access to one set of normalization data.
 *
 * Each code point maps, via normTrie, to a 16-bit "norm16" value. The
 * thresholds minYesNo, minNoNo, limitNoNo and minMaybeYes, together with the
 * fixed constants below, partition the norm16 range; the inline predicates
 * in this class classify a norm16 value by comparing it with those
 * thresholds. A second trie, reached through fcdTrie(), holds 16-bit FCD
 * values.
 */
class U_COMMON_API Normalizer2Impl : public UMemory {
public:
    /**
     * Creates an empty object. All data pointers are NULL until load()
     * has been called.
     */
    Normalizer2Impl() :
            memory(NULL), normTrie(NULL),
            maybeYesCompositions(NULL), extraData(NULL) {
        fcdTrieSingleton.fInstance=NULL;
    }
    ~Normalizer2Impl();

    void load(const char *packageName, const char *name, UErrorCode &errorCode);

    void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;

    // low-level properties ------------------------------------------------ ***

    const UTrie2 *getNormTrie() const { return normTrie; }
    const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ;

    // norm16 lookups; the variants differ only in which trie macro they use.
    uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
    uint16_t getNorm16FromBMP(UChar c) const { return UTRIE2_GET16(normTrie, c); }
    uint16_t getNorm16FromSingleLead(UChar c) const {
        return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c);
    }
    uint16_t getNorm16FromSupplementary(UChar32 c) const {
        return UTRIE2_GET16_FROM_SUPP(normTrie, c);
    }
    uint16_t getNorm16FromSurrogatePair(UChar c, UChar c2) const {
        return getNorm16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
    }

    /**
     * Maps a norm16 value to a composition quick check result:
     * UNORM_YES below minNoNo or at/above MIN_YES_YES_WITH_CC,
     * UNORM_MAYBE from minMaybeYes up to that constant, UNORM_NO otherwise.
     */
    UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
            return UNORM_YES;
        } else if(minMaybeYes<=norm16) {
            return UNORM_MAYBE;
        } else {
            return UNORM_NO;
        }
    }
    UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
    UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }

    /**
     * Returns the combining class encoded in or referenced by norm16:
     * the low byte for values at/above MIN_NORMAL_MAYBE_YES,
     * 0 for values below minNoNo or at/above limitNoNo,
     * otherwise the value read from the mapping data (getCCFromNoNo()).
     */
    uint8_t getCC(uint16_t norm16) const {
        if(norm16>=MIN_NORMAL_MAYBE_YES) {
            return (uint8_t)norm16;
        }
        if(norm16<minNoNo || limitNoNo<=norm16) {
            return 0;
        }
        return getCCFromNoNo(norm16);
    }
    static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
        return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
    }

    // FCD value lookups; same variants as for norm16, reading from fcdTrie().
    uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); }
    uint16_t getFCD16FromBMP(UChar c) const { return UTRIE2_GET16(fcdTrie(), c); }
    uint16_t getFCD16FromSingleLead(UChar c) const {
        return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c);
    }
    uint16_t getFCD16FromSupplementary(UChar32 c) const {
        return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c);
    }
    uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const {
        return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
    }

    void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
                            UTrie2 *newFCDTrie, UErrorCode &errorCode) const;

    /**
     * Get the decomposition for one code point.
     * @param c code point
     * @param buffer out-only buffer for algorithmic decompositions
     * @param length out-only, takes the length of the decomposition, if any
     * @return pointer to the decomposition, or NULL if none
     */
    const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;

    enum {
        MIN_CCC_LCCC_CP=0x300
    };

    // Fixed norm16 values and limits.
    enum {
        MIN_YES_YES_WITH_CC=0xff01,
        JAMO_VT=0xff00,
        MIN_NORMAL_MAYBE_YES=0xfe00,
        JAMO_L=1,
        MAX_DELTA=0x40
    };

    // Positions in the index array of the data.
    enum {
        // Byte offsets from the start of the data, after the generic header.
        IX_NORM_TRIE_OFFSET,
        IX_EXTRA_DATA_OFFSET,
        IX_RESERVED2_OFFSET,
        IX_RESERVED3_OFFSET,
        IX_RESERVED4_OFFSET,
        IX_RESERVED5_OFFSET,
        IX_RESERVED6_OFFSET,
        IX_TOTAL_SIZE,

        // Code point thresholds for quick check codes.
        IX_MIN_DECOMP_NO_CP,
        IX_MIN_COMP_NO_MAYBE_CP,

        // Norm16 value thresholds for quick check combinations and types of extra data.
        IX_MIN_YES_NO,
        IX_MIN_NO_NO,
        IX_LIMIT_NO_NO,
        IX_MIN_MAYBE_YES,

        IX_RESERVED14,
        IX_RESERVED15,
        IX_COUNT
    };

    // Bits in the first unit of a mapping.
    enum {
        MAPPING_HAS_CCC_LCCC_WORD=0x80,
        MAPPING_PLUS_COMPOSITION_LIST=0x40,
        MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
        MAPPING_LENGTH_MASK=0x1f
    };

    // Bits and shifts for the units of a compositions list.
    enum {
        COMP_1_LAST_TUPLE=0x8000,
        COMP_1_TRIPLE=1,
        COMP_1_TRAIL_LIMIT=0x3400,
        COMP_1_TRAIL_MASK=0x7ffe,
        COMP_1_TRAIL_SHIFT=9,  // 10-1 for the "triple" bit
        COMP_2_TRAIL_SHIFT=6,
        COMP_2_TRAIL_MASK=0xffc0
    };

    // higher-level functionality ------------------------------------------ ***

    const UChar *decompose(const UChar *src, const UChar *limit,
                           ReorderingBuffer *buffer, UErrorCode &errorCode) const;
    void decomposeAndAppend(const UChar *src, const UChar *limit,
                            UBool doDecompose,
                            ReorderingBuffer &buffer,
                            UErrorCode &errorCode) const;
    UBool compose(const UChar *src, const UChar *limit,
                  UBool onlyContiguous,
                  UBool doCompose,
                  ReorderingBuffer &buffer,
                  UErrorCode &errorCode) const;
    const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
                                   UBool onlyContiguous,
                                   UNormalizationCheckResult *pQCResult) const;
    void composeAndAppend(const UChar *src, const UChar *limit,
                          UBool doCompose,
                          UBool onlyContiguous,
                          ReorderingBuffer &buffer,
                          UErrorCode &errorCode) const;
    const UChar *makeFCD(const UChar *src, const UChar *limit,
                         ReorderingBuffer *buffer, UErrorCode &errorCode) const;
    void makeFCDAndAppend(const UChar *src, const UChar *limit,
                          UBool doMakeFCD,
                          ReorderingBuffer &buffer,
                          UErrorCode &errorCode) const;

    UBool hasDecompBoundary(UChar32 c, UBool before) const;
    UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }

    UBool hasCompBoundaryBefore(UChar32 c) const {
        // Code points below minCompNoMaybeCP are accepted without a trie lookup.
        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
    }
    UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;

    // The FCD value has the lead cc in bits 15..8 and the trail cc in bits 7..0.
    UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
    UBool hasFCDBoundaryAfter(UChar32 c) const {
        uint16_t fcd16=getFCD16(c);
        return fcd16<=1 || (fcd16&0xff)==0;
    }
    UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
private:
    static UBool U_CALLCONV
    isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);

    UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
    UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
    static UBool isInert(uint16_t norm16) { return norm16==0; }
    // static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
    static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
    UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
    UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
    // UBool isCompYes(uint16_t norm16) const {
    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
    // }
    // UBool isCompYesOrMaybe(uint16_t norm16) const {
    //     return norm16<minNoNo || minMaybeYes<=norm16;
    // }
    // Reads only class constants, so it is static like isInert()/isJamoVT()
    // and can be called from const member functions.
    static UBool hasZeroCCFromDecompYes(uint16_t norm16) {
        return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    UBool isDecompYesAndZeroCC(uint16_t norm16) const {
        return norm16<minYesNo ||
               norm16==JAMO_VT ||
               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
    }
    /**
     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
     * the MaybeYes which combine-forward and have ccc=0.
     * (Standard Unicode 5.2 normalization does not have such characters.)
     */
    UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }

    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
    // }
    /**
     * Reads the combining class from the mapping that norm16 points to:
     * the low byte of the second unit if the first unit has
     * MAPPING_HAS_CCC_LCCC_WORD set, otherwise 0.
     */
    uint8_t getCCFromNoNo(uint16_t norm16) const {
        const uint16_t *mapping=getMapping(norm16);
        if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
            return (uint8_t)mapping[1];
        } else {
            return 0;
        }
    }
    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
    uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;

    // Requires algorithmic-NoNo.
    UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
        return c+norm16-(minMaybeYes-MAX_DELTA-1);
    }

    // Requires minYesNo<norm16<limitNoNo.
    const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
    /**
     * Returns the compositions list for a norm16 that passes
     * isDecompYesAndZeroCC(): NULL for 0 and for values at/above
     * MIN_NORMAL_MAYBE_YES, a pointer into extraData below minMaybeYes,
     * otherwise a pointer into maybeYesCompositions.
     */
    const uint16_t *getCompositionsListForDecompYesAndZeroCC(uint16_t norm16) const {
        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
            return NULL;
        } else if(norm16<minMaybeYes) {
            return extraData+norm16;  // for yesYes; if Jamo L: harmless empty list
        } else {
            return maybeYesCompositions+norm16-minMaybeYes;
        }
    }
    /** Skips over the mapping of a composite to reach its compositions list. */
    const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
        const uint16_t *list=extraData+norm16;  // composite has both mapping & compositions list
        return list+  // mapping pointer
            1+  // +1 to skip the first unit with the mapping length
            (*list&MAPPING_LENGTH_MASK)+  // + mapping length
            ((*list>>7)&1);  // +1 if MAPPING_HAS_CCC_LCCC_WORD
    }

    const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
                                                UChar32 minNeedDataCP,
                                                ReorderingBuffer *buffer,
                                                UErrorCode &errorCode) const;
    UBool decomposeShort(const UChar *src, const UChar *limit,
                         ReorderingBuffer &buffer, UErrorCode &errorCode) const;
    UBool decompose(UChar32 c, uint16_t norm16,
                    ReorderingBuffer &buffer, UErrorCode &errorCode) const;

    static int32_t combine(const uint16_t *list, UChar32 trail);
    void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
                   UBool onlyContiguous) const;

    UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
    const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
    const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;

    // Returns the trie stored in fcdTrieSingleton, without creating it.
    const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; }

    const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
    const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;

    UDataMemory *memory;
    UVersionInfo dataVersion;

    // Code point thresholds for quick check codes.
    UChar32 minDecompNoCP;
    UChar32 minCompNoMaybeCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    uint16_t minYesNo;
    uint16_t minNoNo;
    uint16_t limitNoNo;
    uint16_t minMaybeYes;

    UTrie2 *normTrie;
    const uint16_t *maybeYesCompositions;
    const uint16_t *extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters

    SimpleSingleton fcdTrieSingleton;
};
|
||||
|
||||
/**
|
||||
* ICU-internal shortcut for quick access to standard Unicode normalization.
|
||||
*/
|
||||
class U_COMMON_API Normalizer2Factory {
|
||||
public:
|
||||
static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
|
||||
static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
|
||||
|
||||
static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
|
||||
|
||||
static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
|
||||
static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
|
||||
static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
|
||||
|
||||
static const UTrie2 *getFCDTrie(UErrorCode &errorCode);
|
||||
private:
|
||||
Normalizer2Factory(); // No instantiation.
|
||||
};
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm2_swap(const UDataSwapper *ds,
|
||||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UNormalizationCheckResult U_EXPORT2
|
||||
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get access to the internal FCD trie table to be able to perform
|
||||
* incremental, per-code unit, FCD checks in collation.
|
||||
* One pointer is sufficient because the trie index values are offset
|
||||
* by the index size, so that the same pointer is used to access the trie data.
|
||||
* Code points at fcdHighStart and above have a zero FCD value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const uint16_t * U_EXPORT2
|
||||
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value for a code unit, with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* If c is a lead surrogate and the value is not 0,
|
||||
* then some of c's associated supplementary code points have a non-zero FCD value.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
|
||||
return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the next code point (post-increment), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
||||
const UChar *&s, const UChar *limit) {
|
||||
UChar32 c=*s++;
|
||||
uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
if(fcd!=0 && U16_IS_LEAD(c)) {
|
||||
UChar c2;
|
||||
if(s!=limit && U16_IS_TRAIL(c2=*s)) {
|
||||
++s;
|
||||
c=U16_GET_SUPPLEMENTARY(c, c2);
|
||||
if(c<fcdHighStart) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
|
||||
} else {
|
||||
fcd=0;
|
||||
}
|
||||
} else /* unpaired lead surrogate */ {
|
||||
fcd=0;
|
||||
}
|
||||
}
|
||||
return fcd;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the previous code point (pre-decrement), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
|
||||
unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
||||
const UChar *start, const UChar *&s) {
|
||||
UChar32 c=*--s;
|
||||
uint16_t fcd;
|
||||
if(!U16_IS_SURROGATE(c)) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
|
||||
} else {
|
||||
UChar c2;
|
||||
if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) {
|
||||
--s;
|
||||
c=U16_GET_SUPPLEMENTARY(c2, c);
|
||||
if(c<fcdHighStart) {
|
||||
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
|
||||
} else {
|
||||
fcd=0;
|
||||
}
|
||||
} else /* unpaired surrogate */ {
|
||||
fcd=0;
|
||||
}
|
||||
}
|
||||
return fcd;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
#endif /* __NORMALIZER2IMPL_H__ */
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*************************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1996-2005, International Business Machines Corporation and
|
||||
* Copyright (c) 1996-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
*************************************************************************
|
||||
*/
|
||||
|
@ -10,14 +10,15 @@
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/schriter.h"
|
||||
#include "unicode/uchriter.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "cmemory.h"
|
||||
#include "unormimp.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "uprops.h" // for uniset_getUnicode32Instance()
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -28,72 +29,68 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
|
|||
//-------------------------------------------------------------------------
|
||||
|
||||
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
|
||||
UObject(), fUMode(mode), fOptions(0),
|
||||
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
||||
text(new StringCharacterIterator(str)),
|
||||
currentIndex(0), nextIndex(0),
|
||||
buffer(), bufferPos(0)
|
||||
{
|
||||
init(new StringCharacterIterator(str));
|
||||
init();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
|
||||
UObject(), fUMode(mode), fOptions(0),
|
||||
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
||||
text(new UCharCharacterIterator(str, length)),
|
||||
currentIndex(0), nextIndex(0),
|
||||
buffer(), bufferPos(0)
|
||||
{
|
||||
init(new UCharCharacterIterator(str, length));
|
||||
init();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
|
||||
UObject(), fUMode(mode), fOptions(0),
|
||||
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
|
||||
text(iter.clone()),
|
||||
currentIndex(0), nextIndex(0),
|
||||
buffer(), bufferPos(0)
|
||||
{
|
||||
init(iter.clone());
|
||||
init();
|
||||
}
|
||||
|
||||
Normalizer::Normalizer(const Normalizer ©) :
|
||||
UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
|
||||
UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
|
||||
text(copy.text->clone()),
|
||||
currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
|
||||
buffer(copy.buffer), bufferPos(copy.bufferPos)
|
||||
{
|
||||
init(((CharacterIterator *)(copy.text->context))->clone());
|
||||
init();
|
||||
}
|
||||
|
||||
static const UChar _NUL=0;
|
||||
|
||||
void
|
||||
Normalizer::init(CharacterIterator *iter) {
|
||||
Normalizer::init() {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
||||
text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
|
||||
if(text!=NULL) {
|
||||
if(unorm_haveData(&errorCode)) {
|
||||
uiter_setCharacterIterator(text, iter);
|
||||
} else {
|
||||
delete iter;
|
||||
uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
|
||||
}
|
||||
} else {
|
||||
delete iter;
|
||||
fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
|
||||
if(fOptions&UNORM_UNICODE_3_2) {
|
||||
delete fFilteredNorm2;
|
||||
fNorm2=fFilteredNorm2=
|
||||
new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
Normalizer::~Normalizer()
|
||||
{
|
||||
if(text!=NULL) {
|
||||
delete (CharacterIterator *)text->context;
|
||||
uprv_free(text);
|
||||
}
|
||||
delete fFilteredNorm2;
|
||||
delete text;
|
||||
}
|
||||
|
||||
Normalizer*
|
||||
Normalizer::clone() const
|
||||
{
|
||||
if(this!=0) {
|
||||
return new Normalizer(*this);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return new Normalizer(*this);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -101,7 +98,7 @@ Normalizer::clone() const
|
|||
*/
|
||||
int32_t Normalizer::hashCode() const
|
||||
{
|
||||
return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
|
||||
return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
|
||||
}
|
||||
|
||||
UBool Normalizer::operator==(const Normalizer& that) const
|
||||
|
@ -110,7 +107,7 @@ UBool Normalizer::operator==(const Normalizer& that) const
|
|||
this==&that ||
|
||||
fUMode==that.fUMode &&
|
||||
fOptions==that.fOptions &&
|
||||
*((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
|
||||
*text==*that.text &&
|
||||
buffer==that.buffer &&
|
||||
bufferPos==that.bufferPos &&
|
||||
nextIndex==that.nextIndex;
|
||||
|
@ -140,29 +137,18 @@ Normalizer::normalize(const UnicodeString& source,
|
|||
// the source and result strings are the same object, use a temporary one
|
||||
dest=&localDest;
|
||||
}
|
||||
|
||||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
mode, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
status=U_ZERO_ERROR;
|
||||
buffer=dest->getBuffer(length);
|
||||
length=unorm_internalNormalize(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
mode, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
||||
normalize(source, *dest, status);
|
||||
} else {
|
||||
n2->normalize(source, *dest, status);
|
||||
}
|
||||
}
|
||||
|
||||
if(dest==&localDest) {
|
||||
if(dest==&localDest && U_SUCCESS(status)) {
|
||||
result=*dest;
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -171,45 +157,7 @@ Normalizer::compose(const UnicodeString& source,
|
|||
UBool compat, int32_t options,
|
||||
UnicodeString& result,
|
||||
UErrorCode &status) {
|
||||
if(source.isBogus() || U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
if(U_SUCCESS(status)) {
|
||||
status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
}
|
||||
} else {
|
||||
UnicodeString localDest;
|
||||
UnicodeString *dest;
|
||||
|
||||
if(&source!=&result) {
|
||||
dest=&result;
|
||||
} else {
|
||||
// the source and result strings are the same object, use a temporary one
|
||||
dest=&localDest;
|
||||
}
|
||||
|
||||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_compose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
status=U_ZERO_ERROR;
|
||||
buffer=dest->getBuffer(length);
|
||||
length=unorm_compose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
}
|
||||
|
||||
if(dest==&localDest) {
|
||||
result=*dest;
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
}
|
||||
}
|
||||
normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
|
||||
}
|
||||
|
||||
void U_EXPORT2
|
||||
|
@ -217,44 +165,40 @@ Normalizer::decompose(const UnicodeString& source,
|
|||
UBool compat, int32_t options,
|
||||
UnicodeString& result,
|
||||
UErrorCode &status) {
|
||||
if(source.isBogus() || U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
if(U_SUCCESS(status)) {
|
||||
status=U_ILLEGAL_ARGUMENT_ERROR;
|
||||
normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
|
||||
}
|
||||
|
||||
UNormalizationCheckResult
|
||||
Normalizer::quickCheck(const UnicodeString& source,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode &status) {
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
||||
quickCheck(source, status);
|
||||
} else {
|
||||
return n2->quickCheck(source, status);
|
||||
}
|
||||
} else {
|
||||
UnicodeString localDest;
|
||||
UnicodeString *dest;
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
}
|
||||
|
||||
if(&source!=&result) {
|
||||
dest=&result;
|
||||
UBool
|
||||
Normalizer::isNormalized(const UnicodeString& source,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode &status) {
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
|
||||
isNormalized(source, status);
|
||||
} else {
|
||||
// the source and result strings are the same object, use a temporary one
|
||||
dest=&localDest;
|
||||
}
|
||||
|
||||
UChar *buffer=dest->getBuffer(source.length());
|
||||
int32_t length=unorm_decompose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
status=U_ZERO_ERROR;
|
||||
buffer=dest->getBuffer(length);
|
||||
length=unorm_decompose(buffer, dest->getCapacity(),
|
||||
source.getBuffer(), source.length(),
|
||||
compat, options,
|
||||
&status);
|
||||
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
|
||||
}
|
||||
|
||||
if(dest==&localDest) {
|
||||
result=*dest;
|
||||
}
|
||||
if(U_FAILURE(status)) {
|
||||
result.setToBogus();
|
||||
return n2->isNormalized(source, status);
|
||||
}
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -272,37 +216,25 @@ Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
|
|||
UnicodeString localDest;
|
||||
UnicodeString *dest;
|
||||
|
||||
if(&left!=&result && &right!=&result) {
|
||||
if(&right!=&result) {
|
||||
dest=&result;
|
||||
} else {
|
||||
// the source and result strings are the same object, use a temporary one
|
||||
// the right and result strings are the same object, use a temporary one
|
||||
dest=&localDest;
|
||||
}
|
||||
|
||||
UChar *buffer=dest->getBuffer(left.length()+right.length());
|
||||
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
|
||||
right.getBuffer(), right.length(),
|
||||
buffer, dest->getCapacity(),
|
||||
mode, options,
|
||||
&errorCode);
|
||||
dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
buffer=dest->getBuffer(length);
|
||||
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
|
||||
right.getBuffer(), right.length(),
|
||||
buffer, dest->getCapacity(),
|
||||
mode, options,
|
||||
&errorCode);
|
||||
dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
*dest=left;
|
||||
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(options&UNORM_UNICODE_3_2) {
|
||||
FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
|
||||
append(*dest, right, errorCode);
|
||||
} else {
|
||||
n2->append(*dest, right, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
if(dest==&localDest) {
|
||||
if(dest==&localDest && U_SUCCESS(errorCode)) {
|
||||
result=*dest;
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
result.setToBogus();
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
@ -353,19 +285,20 @@ UChar32 Normalizer::previous() {
|
|||
}
|
||||
|
||||
void Normalizer::reset() {
|
||||
currentIndex=nextIndex=text->move(text, 0, UITER_START);
|
||||
currentIndex=nextIndex=text->setToStart();
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
void
|
||||
Normalizer::setIndexOnly(int32_t index) {
|
||||
currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
|
||||
text->setIndex(index); // pins index
|
||||
currentIndex=nextIndex=text->getIndex();
|
||||
clearBuffer();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the first character in the normalized text-> This resets
|
||||
* the <tt>Normalizer's</tt> position to the beginning of the text->
|
||||
* Return the first character in the normalized text. This resets
|
||||
* the <tt>Normalizer's</tt> position to the beginning of the text.
|
||||
*/
|
||||
UChar32 Normalizer::first() {
|
||||
reset();
|
||||
|
@ -373,12 +306,12 @@ UChar32 Normalizer::first() {
|
|||
}
|
||||
|
||||
/**
|
||||
* Return the last character in the normalized text-> This resets
|
||||
* Return the last character in the normalized text. This resets
|
||||
* the <tt>Normalizer's</tt> position to be just before the
|
||||
* the input text corresponding to that normalized character.
|
||||
*/
|
||||
UChar32 Normalizer::last() {
|
||||
currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
|
||||
currentIndex=nextIndex=text->setToEnd();
|
||||
clearBuffer();
|
||||
return previous();
|
||||
}
|
||||
|
@ -406,21 +339,21 @@ int32_t Normalizer::getIndex() const {
|
|||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the start of the input text-> This is the begin index
|
||||
* Retrieve the index of the start of the input text. This is the begin index
|
||||
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
|
||||
* over which this <tt>Normalizer</tt> is iterating
|
||||
*/
|
||||
int32_t Normalizer::startIndex() const {
|
||||
return text->getIndex(text, UITER_START);
|
||||
return text->startIndex();
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve the index of the end of the input text-> This is the end index
|
||||
* Retrieve the index of the end of the input text. This is the end index
|
||||
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
|
||||
* over which this <tt>Normalizer</tt> is iterating
|
||||
*/
|
||||
int32_t Normalizer::endIndex() const {
|
||||
return text->getIndex(text, UITER_LIMIT);
|
||||
return text->endIndex();
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
@ -431,6 +364,7 @@ void
|
|||
Normalizer::setMode(UNormalizationMode newMode)
|
||||
{
|
||||
fUMode = newMode;
|
||||
init();
|
||||
}
|
||||
|
||||
UNormalizationMode
|
||||
|
@ -448,6 +382,7 @@ Normalizer::setOption(int32_t option,
|
|||
} else {
|
||||
fOptions &= (~option);
|
||||
}
|
||||
init();
|
||||
}
|
||||
|
||||
UBool
|
||||
|
@ -458,7 +393,7 @@ Normalizer::getOption(int32_t option) const
|
|||
|
||||
/**
|
||||
* Set the input text over which this <tt>Normalizer</tt> will iterate.
|
||||
* The iteration position is set to the beginning of the input text->
|
||||
* The iteration position is set to the beginning of the input text.
|
||||
*/
|
||||
void
|
||||
Normalizer::setText(const UnicodeString& newText,
|
||||
|
@ -472,8 +407,8 @@ Normalizer::setText(const UnicodeString& newText,
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete (CharacterIterator *)(text->context);
|
||||
text->context = newIter;
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
|
@ -493,8 +428,8 @@ Normalizer::setText(const CharacterIterator& newText,
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete (CharacterIterator *)(text->context);
|
||||
text->context = newIter;
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
|
@ -511,8 +446,8 @@ Normalizer::setText(const UChar* newText,
|
|||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
delete (CharacterIterator *)(text->context);
|
||||
text->context = newIter;
|
||||
delete text;
|
||||
text = newIter;
|
||||
reset();
|
||||
}
|
||||
|
||||
|
@ -523,7 +458,7 @@ Normalizer::setText(const UChar* newText,
|
|||
void
|
||||
Normalizer::getText(UnicodeString& result)
|
||||
{
|
||||
((CharacterIterator *)(text->context))->getText(result);
|
||||
text->getText(result);
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
@ -537,72 +472,48 @@ void Normalizer::clearBuffer() {
|
|||
|
||||
UBool
|
||||
Normalizer::nextNormalize() {
|
||||
UChar *p;
|
||||
int32_t length;
|
||||
UErrorCode errorCode;
|
||||
|
||||
clearBuffer();
|
||||
currentIndex=nextIndex;
|
||||
text->move(text, nextIndex, UITER_ZERO);
|
||||
if(!text->hasNext(text)) {
|
||||
text->setIndex(nextIndex);
|
||||
if(!text->hasNext()) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
p=buffer.getBuffer(-1);
|
||||
length=unorm_next(text, p, buffer.getCapacity(),
|
||||
fUMode, fOptions,
|
||||
TRUE, 0,
|
||||
&errorCode);
|
||||
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
text->move(text, nextIndex, UITER_ZERO);
|
||||
p=buffer.getBuffer(length);
|
||||
length=unorm_next(text, p, buffer.getCapacity(),
|
||||
fUMode, fOptions,
|
||||
TRUE, 0,
|
||||
&errorCode);
|
||||
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
// Skip at least one character so we make progress.
|
||||
UnicodeString segment(text->next32PostInc());
|
||||
while(text->hasNext()) {
|
||||
UChar32 c;
|
||||
if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
|
||||
text->move32(-1, CharacterIterator::kCurrent);
|
||||
break;
|
||||
}
|
||||
segment.append(c);
|
||||
}
|
||||
|
||||
nextIndex=text->getIndex(text, UITER_CURRENT);
|
||||
nextIndex=text->getIndex();
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
fNorm2->normalize(segment, buffer, errorCode);
|
||||
return U_SUCCESS(errorCode) && !buffer.isEmpty();
|
||||
}
|
||||
|
||||
UBool
|
||||
Normalizer::previousNormalize() {
|
||||
UChar *p;
|
||||
int32_t length;
|
||||
UErrorCode errorCode;
|
||||
|
||||
clearBuffer();
|
||||
nextIndex=currentIndex;
|
||||
text->move(text, currentIndex, UITER_ZERO);
|
||||
if(!text->hasPrevious(text)) {
|
||||
text->setIndex(currentIndex);
|
||||
if(!text->hasPrevious()) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
errorCode=U_ZERO_ERROR;
|
||||
p=buffer.getBuffer(-1);
|
||||
length=unorm_previous(text, p, buffer.getCapacity(),
|
||||
fUMode, fOptions,
|
||||
TRUE, 0,
|
||||
&errorCode);
|
||||
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
text->move(text, currentIndex, UITER_ZERO);
|
||||
p=buffer.getBuffer(length);
|
||||
length=unorm_previous(text, p, buffer.getCapacity(),
|
||||
fUMode, fOptions,
|
||||
TRUE, 0,
|
||||
&errorCode);
|
||||
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
UnicodeString segment;
|
||||
while(text->hasPrevious()) {
|
||||
UChar32 c=text->previous32();
|
||||
segment.insert(0, c);
|
||||
if(fNorm2->hasBoundaryBefore(c)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
currentIndex=text->getIndex();
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
fNorm2->normalize(segment, buffer, errorCode);
|
||||
bufferPos=buffer.length();
|
||||
currentIndex=text->getIndex(text, UITER_CURRENT);
|
||||
return U_SUCCESS(errorCode) && !buffer.isEmpty();
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines
|
||||
* Copyright (C) 1996-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
*
|
||||
|
@ -28,7 +28,6 @@
|
|||
#include "ucln_cmn.h"
|
||||
#include "utrie2.h"
|
||||
#include "udataswp.h"
|
||||
#include "unormimp.h" /* JAMO_L_BASE etc. */
|
||||
#include "uprops.h"
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
@ -650,10 +649,6 @@ u_getNumericValue(UChar32 c) {
|
|||
}
|
||||
}
|
||||
|
||||
/* ICU 3.4: bidi/shaping properties moved to ubidi_props.c */
|
||||
|
||||
/* ICU 2.1: u_getCombiningClass() moved to unorm.cpp */
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_digit(UChar32 ch, int8_t radix) {
|
||||
int8_t value;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* *
|
||||
* Copyright (C) 2001-2006, International Business Machines *
|
||||
* Copyright (C) 2001-2010, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
* *
|
||||
******************************************************************************
|
||||
|
@ -41,6 +41,7 @@ typedef enum ECleanupCommonType {
|
|||
UCLN_COMMON_LOCALE,
|
||||
UCLN_COMMON_ULOC,
|
||||
UCLN_COMMON_UNORM,
|
||||
UCLN_COMMON_NORMALIZER2,
|
||||
UCLN_COMMON_USET,
|
||||
UCLN_COMMON_UNAMES,
|
||||
UCLN_COMMON_PNAME,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2003-2009, International Business Machines
|
||||
* Copyright (C) 2003-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2006, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -31,6 +31,7 @@
|
|||
U_NAMESPACE_BEGIN
|
||||
|
||||
class Hashtable;
|
||||
class Normalizer2;
|
||||
|
||||
/**
|
||||
* This class allows one to iterate through all the strings that are canonically equivalent to a given
|
||||
|
@ -174,6 +175,8 @@ private:
|
|||
// transient fields
|
||||
UnicodeString buffer;
|
||||
|
||||
const Normalizer2 &nfd;
|
||||
|
||||
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
|
||||
UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
|
||||
|
||||
|
|
460
icu4c/source/common/unicode/normalizer2.h
Normal file
460
icu4c/source/common/unicode/normalizer2.h
Normal file
|
@ -0,0 +1,460 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: normalizer2.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov22
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __NORMALIZER2_H__
|
||||
#define __NORMALIZER2_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C++ API: New API for Unicode Normalization.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm2.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
|
||||
* Unicode normalization functionality for standard Unicode normalization or
|
||||
* for using custom mapping tables.
|
||||
* All instances of this class are unmodifiable/immutable.
|
||||
* Instances returned by getInstance() are singletons that must not be deleted by the caller.
|
||||
*
|
||||
* Some of the functions in this class identify normalization boundaries.
|
||||
* At a normalization boundary, the portions of the string
|
||||
* before it and starting from it do not interact and can be handled independently.
|
||||
*
|
||||
* The spanQuickCheckYes() stops at a normalization boundary.
|
||||
* When the goal is a normalized string, then the text before the boundary
|
||||
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
|
||||
*
|
||||
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether a character is at a normalization boundary.
|
||||
* This is used for moving from one normalization boundary to the next
|
||||
* or preceding boundary, and for performing iterative normalization.
|
||||
*
|
||||
* Iterative normalization is useful when only a small portion of a
|
||||
* longer string needs to be processed.
|
||||
* In ICU, iterative normalization is used by the NormalizationTransliterator
|
||||
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
|
||||
* (to process only the substring for which sort key bytes are computed).
|
||||
*
|
||||
* The set of normalization boundaries returned by these functions may not be
|
||||
* complete: There may be more boundaries that could be returned.
|
||||
* Different functions may return different boundaries.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
class U_COMMON_API Normalizer2 : public UObject {
|
||||
public:
|
||||
/**
|
||||
* Returns a Normalizer2 instance which uses the specified data file
|
||||
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
|
||||
* and which composes or decomposes text according to the specified mode.
|
||||
* Returns an unmodifiable singleton instance. Do not delete it.
|
||||
*
|
||||
* Use packageName=NULL for data files that are part of ICU's own data.
|
||||
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
|
||||
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
|
||||
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
|
||||
*
|
||||
* @param packageName NULL for ICU built-in data, otherwise application data package name
|
||||
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
|
||||
* @param mode normalization mode (compose or decompose etc.)
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the requested Normalizer2, if successful
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
static const Normalizer2 *
|
||||
getInstance(const char *packageName,
|
||||
const char *name,
|
||||
UNormalization2Mode mode,
|
||||
UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
* Returns the normalized form of the source string.
|
||||
* @param src source string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return normalized src
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UnicodeString
|
||||
normalize(const UnicodeString &src, UErrorCode &errorCode) const {
|
||||
UnicodeString result;
|
||||
normalize(src, result, errorCode);
|
||||
return result;
|
||||
}
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination string
|
||||
* (replacing its contents) and returns the destination string.
|
||||
* The source and destination strings must be different objects.
|
||||
* @param src source string
|
||||
* @param dest destination string; its contents are replaced with normalized src
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return dest
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const = 0;
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
* The result is normalized if the first string was normalized.
|
||||
* The first and second strings must be different objects.
|
||||
* @param first string, should be normalized
|
||||
* @param second string, will be normalized
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return first
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const = 0;
|
||||
/**
|
||||
* Appends the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
* The result is normalized if both the strings were normalized.
|
||||
* The first and second strings must be different objects.
|
||||
* @param first string, should be normalized
|
||||
* @param second string, should be normalized
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return first
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* Internally, in cases where the quickCheck() method would return "maybe"
|
||||
* (which is only possible for the two COMPOSE modes) this method
|
||||
* resolves to "yes" or "no" to provide a definitive result,
|
||||
* at the cost of doing more work in those cases.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return TRUE if s is normalized
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For the two COMPOSE modes, the result could be "maybe" in cases that
|
||||
* would take a little more work to resolve definitively.
|
||||
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
|
||||
* combination of quick check + normalization, to avoid
|
||||
* re-checking the "yes" prefix.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return UNormalizationCheckResult
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
|
||||
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string.
|
||||
* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
|
||||
* the substring <code>UnicodeString(s, 0, end)</code>
|
||||
* will pass the quick check with a "yes" result.
|
||||
*
|
||||
* The returned end index is usually one or more characters before the
|
||||
* "no" or "maybe" character: The end index is at a normalization boundary.
|
||||
* (See the class documentation for more about normalization boundaries.)
|
||||
*
|
||||
* When the goal is a normalized string and most input strings are expected
|
||||
* to be normalized already, then call this method,
|
||||
* and if it returns a prefix shorter than the input string,
|
||||
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return end index of the "quick check yes" prefix of s
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary before it.
|
||||
* If true, then the character does not normalization-interact with
|
||||
* preceding characters.
|
||||
* In other words, a string containing this character can be normalized
|
||||
* by processing portions before this character and starting from this
|
||||
* character independently.
|
||||
* This is used for iterative normalization. See the class documentation for details.
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary before it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary after it.
|
||||
* If true, then the character does not normalization-interact with
|
||||
* following characters.
|
||||
* In other words, a string containing this character can be normalized
|
||||
* by processing portions up to this character and after this
|
||||
* character independently.
|
||||
* This is used for iterative normalization. See the class documentation for details.
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary after it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* Tests if the character is normalization-inert.
|
||||
* If true, then the character does not change, nor normalization-interact with
|
||||
* preceding or following characters.
|
||||
* In other words, a string containing this character can be normalized
|
||||
* by processing portions before this character and after this
|
||||
* character independently.
|
||||
* This is used for iterative normalization. See the class documentation for details.
|
||||
* @param c character to test
|
||||
* @return TRUE if c is normalization-inert
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool isInert(UChar32 c) const = 0;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
* @returns a UClassID for this class.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
* @return a UClassID for the actual class.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* Normalization filtered by a UnicodeSet.
|
||||
* Normalizes portions of the text contained in the filter set and leaves
|
||||
* portions not contained in the filter set unchanged.
|
||||
* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
|
||||
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
|
||||
* This class implements all of (and only) the Normalizer2 API.
|
||||
* An instance of this class is unmodifiable/immutable but is constructed and
|
||||
* must be destructed by the owner.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
|
||||
public:
|
||||
/**
|
||||
* Constructs a filtered normalizer wrapping any Normalizer2 instance
|
||||
* and a filter set.
|
||||
* Both are aliased and must not be modified or deleted while this object
|
||||
* is used.
|
||||
* The filter set should be frozen; otherwise the performance will suffer greatly.
|
||||
* @param n2 wrapped Normalizer2 instance
|
||||
* @param filterSet UnicodeSet which determines the characters to be normalized
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
|
||||
norm2(n2), set(filterSet) {}
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination string
|
||||
* (replacing its contents) and returns the destination string.
|
||||
* The source and destination strings must be different objects.
|
||||
* @param src source string
|
||||
* @param dest destination string; its contents are replaced with normalized src
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return dest
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const;
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
* The result is normalized if the first string was normalized.
|
||||
* The first and second strings must be different objects.
|
||||
* @param first string, should be normalized
|
||||
* @param second string, will be normalized
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return first
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const;
|
||||
/**
|
||||
* Appends the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
* The result is normalized if both the strings were normalized.
|
||||
* The first and second strings must be different objects.
|
||||
* @param first string, should be normalized
|
||||
* @param second string, should be normalized
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return first
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return TRUE if s is normalized
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return UNormalizationCheckResult
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param s input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return end index of the "quick check yes" prefix of s
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary before it.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary before it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary after it.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary after it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* Tests if the character is normalization-inert.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param c character to test
|
||||
* @return TRUE if c is normalization-inert
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UBool isInert(UChar32 c) const;
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for this class.
|
||||
* @returns a UClassID for this class.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
static UClassID U_EXPORT2 getStaticClassID();
|
||||
|
||||
/**
|
||||
* ICU "poor man's RTTI", returns a UClassID for the actual class.
|
||||
* @return a UClassID for the actual class.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
virtual UClassID getDynamicClassID() const;
|
||||
private:
|
||||
UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UBool doNormalize,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
const Normalizer2 &norm2;
|
||||
const UnicodeSet &set;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
#endif // __NORMALIZER2_H__
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1996-2006, International Business Machines Corporation and
|
||||
* Copyright (c) 1996-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************
|
||||
*/
|
||||
|
@ -18,14 +18,11 @@
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/uobject.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
|
||||
|
||||
struct UCharIterator;
|
||||
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
|
||||
#include "unicode/uobject.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
/**
|
||||
|
@ -33,6 +30,10 @@ U_NAMESPACE_BEGIN
|
|||
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
|
||||
* Unicode Standard Annex #15: Unicode Normalization Forms</a>.
|
||||
*
|
||||
* Note: This API has been replaced by the Normalizer2 class and is only available
|
||||
* for backward compatibility. This class simply delegates to the Normalizer2 class.
|
||||
* There is one exception: The new API does not provide a replacement for Normalizer::compare().
|
||||
*
|
||||
* The Normalizer class consists of two parts:
|
||||
* - static functions that normalize strings or test if strings are normalized
|
||||
* - a Normalizer object is an iterator that takes any kind of text and
|
||||
|
@ -40,13 +41,11 @@ U_NAMESPACE_BEGIN
|
|||
*
|
||||
* The Normalizer class is not suitable for subclassing.
|
||||
*
|
||||
* The static functions are basically wrappers around the C implementation,
|
||||
* using UnicodeString instead of UChar*.
|
||||
* For basic information about normalization forms and details about the C API
|
||||
* please see the documentation in unorm.h.
|
||||
*
|
||||
* The iterator API with the Normalizer constructors and the non-static functions
|
||||
* uses a CharacterIterator as input. It is possible to pass a string which
|
||||
* use a CharacterIterator as input. It is possible to pass a string which
|
||||
* is then internally wrapped in a CharacterIterator.
|
||||
* The input text is not normalized all at once, but incrementally where needed
|
||||
* (providing efficient random access).
|
||||
|
@ -287,7 +286,7 @@ public:
|
|||
* @see isNormalized
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
static inline UNormalizationCheckResult
|
||||
static UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
|
||||
|
||||
/**
|
||||
|
@ -328,7 +327,7 @@ public:
|
|||
* @see quickCheck
|
||||
* @stable ICU 2.6
|
||||
*/
|
||||
static inline UBool
|
||||
static UBool
|
||||
isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
|
||||
|
||||
/**
|
||||
|
@ -726,18 +725,20 @@ private:
|
|||
UBool nextNormalize();
|
||||
UBool previousNormalize();
|
||||
|
||||
void init(CharacterIterator *iter);
|
||||
void init();
|
||||
void clearBuffer(void);
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
// Private data
|
||||
//-------------------------------------------------------------------------
|
||||
|
||||
FilteredNormalizer2*fFilteredNorm2; // owned if not NULL
|
||||
const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2
|
||||
UNormalizationMode fUMode;
|
||||
int32_t fOptions;
|
||||
|
||||
// The input text and our position in it
|
||||
UCharIterator *text;
|
||||
CharacterIterator *text;
|
||||
|
||||
// The normalization buffer is the result of normalization
|
||||
// of the source in [currentIndex..nextIndex[ .
|
||||
|
@ -746,7 +747,6 @@ private:
|
|||
// A buffer for holding intermediate results
|
||||
UnicodeString buffer;
|
||||
int32_t bufferPos;
|
||||
|
||||
};
|
||||
|
||||
//-------------------------------------------------------------------------
|
||||
|
@ -761,48 +761,14 @@ inline UNormalizationCheckResult
|
|||
Normalizer::quickCheck(const UnicodeString& source,
|
||||
UNormalizationMode mode,
|
||||
UErrorCode &status) {
|
||||
if(U_FAILURE(status)) {
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
|
||||
return unorm_quickCheck(source.getBuffer(), source.length(),
|
||||
mode, &status);
|
||||
}
|
||||
|
||||
inline UNormalizationCheckResult
|
||||
Normalizer::quickCheck(const UnicodeString& source,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode &status) {
|
||||
if(U_FAILURE(status)) {
|
||||
return UNORM_MAYBE;
|
||||
}
|
||||
|
||||
return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
|
||||
mode, options, &status);
|
||||
return quickCheck(source, mode, 0, status);
|
||||
}
|
||||
|
||||
inline UBool
|
||||
Normalizer::isNormalized(const UnicodeString& source,
|
||||
UNormalizationMode mode,
|
||||
UErrorCode &status) {
|
||||
if(U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return unorm_isNormalized(source.getBuffer(), source.length(),
|
||||
mode, &status);
|
||||
}
|
||||
|
||||
inline UBool
|
||||
Normalizer::isNormalized(const UnicodeString& source,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode &status) {
|
||||
if(U_FAILURE(status)) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
|
||||
mode, options, &status);
|
||||
return isNormalized(source, mode, 0, status);
|
||||
}
|
||||
|
||||
inline int32_t
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1997-2009, International Business Machines
|
||||
* Copyright (C) 1997-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
|
@ -321,51 +321,29 @@ typedef enum UProperty {
|
|||
/** Binary property NFD_Inert.
|
||||
ICU-specific property for characters that are inert under NFD,
|
||||
i.e., they do not interact with adjacent characters.
|
||||
Used for example in normalizing transforms in incremental mode
|
||||
to find the boundary of safely normalizable text despite possible
|
||||
text additions.
|
||||
|
||||
There is one such property per normalization form.
|
||||
These properties are computed as follows - an inert character is:
|
||||
a) unassigned, or ALL of the following:
|
||||
b) of combining class 0.
|
||||
c) not decomposed by this normalization form.
|
||||
AND if NFC or NFKC,
|
||||
d) can never compose with a previous character.
|
||||
e) can never compose with a following character.
|
||||
f) can never change if another character is added.
|
||||
Example: a-breve might satisfy all but f, but if you
|
||||
add an ogonek it changes to a-ogonek + breve
|
||||
|
||||
See also com.ibm.text.UCD.NFSkippable in the ICU4J repository,
|
||||
and icu/source/common/unormimp.h .
|
||||
See the documentation for the Normalizer2 class and the
|
||||
Normalizer2::isInert() method.
|
||||
@stable ICU 3.0 */
|
||||
UCHAR_NFD_INERT=37,
|
||||
/** Binary property NFKD_Inert.
|
||||
ICU-specific property for characters that are inert under NFKD,
|
||||
i.e., they do not interact with adjacent characters.
|
||||
Used for example in normalizing transforms in incremental mode
|
||||
to find the boundary of safely normalizable text despite possible
|
||||
text additions.
|
||||
@see UCHAR_NFD_INERT
|
||||
See the documentation for the Normalizer2 class and the
|
||||
Normalizer2::isInert() method.
|
||||
@stable ICU 3.0 */
|
||||
UCHAR_NFKD_INERT=38,
|
||||
/** Binary property NFC_Inert.
|
||||
ICU-specific property for characters that are inert under NFC,
|
||||
i.e., they do not interact with adjacent characters.
|
||||
Used for example in normalizing transforms in incremental mode
|
||||
to find the boundary of safely normalizable text despite possible
|
||||
text additions.
|
||||
@see UCHAR_NFD_INERT
|
||||
See the documentation for the Normalizer2 class and the
|
||||
Normalizer2::isInert() method.
|
||||
@stable ICU 3.0 */
|
||||
UCHAR_NFC_INERT=39,
|
||||
/** Binary property NFKC_Inert.
|
||||
ICU-specific property for characters that are inert under NFKC,
|
||||
i.e., they do not interact with adjacent characters.
|
||||
Used for example in normalizing transforms in incremental mode
|
||||
to find the boundary of safely normalizable text despite possible
|
||||
text additions.
|
||||
@see UCHAR_NFD_INERT
|
||||
See the documentation for the Normalizer2 class and the
|
||||
Normalizer2::isInert() method.
|
||||
@stable ICU 3.0 */
|
||||
UCHAR_NFKC_INERT=40,
|
||||
/** Binary Property Segment_Starter.
|
||||
|
@ -428,8 +406,10 @@ typedef enum UProperty {
|
|||
UCHAR_CHANGES_WHEN_CASEFOLDED=54,
|
||||
/** Binary property Changes_When_Casemapped. @draft ICU 4.4 */
|
||||
UCHAR_CHANGES_WHEN_CASEMAPPED=55,
|
||||
/** Binary property Changes_When_NFKC_Casefolded. @draft ICU 4.4 */
|
||||
UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56,
|
||||
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
|
||||
UCHAR_BINARY_LIMIT=56,
|
||||
UCHAR_BINARY_LIMIT=57,
|
||||
|
||||
/** Enumerated property Bidi_Class.
|
||||
Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
***************************************************************************
|
||||
* Copyright (C) 1999-2009, International Business Machines Corporation
|
||||
* Copyright (C) 1999-2010, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
***************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -861,6 +861,20 @@ public:
|
|||
*/
|
||||
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the end of the substring of the input string according to the USetSpanCondition.
|
||||
* Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
|
||||
* after pinning start to 0<=start<=s.length().
|
||||
* @param s the string
|
||||
* @param start the start index in the string for the span operation
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the exclusive end of the substring according to the spanCondition;
|
||||
* the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
|
||||
* @draft ICU 4.4
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the start of the trailing substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
|
@ -880,6 +894,21 @@ public:
|
|||
*/
|
||||
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the start of the substring of the input string according to the USetSpanCondition.
|
||||
* Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
|
||||
* after pinning limit to 0<=limit<=s.length().
|
||||
* @param s the string
|
||||
* @param limit the exclusive-end index in the string for the span operation
|
||||
* (use s.length() or INT32_MAX for spanning back from the end of the string)
|
||||
* @param spanCondition specifies the containment condition
|
||||
* @return the start of the substring according to the spanCondition;
|
||||
* the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
|
||||
* @draft ICU 4.4
|
||||
* @see USetSpanCondition
|
||||
*/
|
||||
inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
|
||||
|
||||
/**
|
||||
* Returns the length of the initial substring of the input string which
|
||||
* consists only of characters and strings that are contained in this set
|
||||
|
@ -1619,6 +1648,26 @@ inline const USet *UnicodeSet::toUSet() const {
|
|||
return reinterpret_cast<const USet *>(this);
|
||||
}
|
||||
|
||||
inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
|
||||
int32_t sLength=s.length();
|
||||
if(start<0) {
|
||||
start=0;
|
||||
} else if(start>sLength) {
|
||||
start=sLength;
|
||||
}
|
||||
return start+span(s.getBuffer()+start, sLength-start, spanCondition);
|
||||
}
|
||||
|
||||
inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
|
||||
int32_t sLength=s.length();
|
||||
if(limit<0) {
|
||||
limit=0;
|
||||
} else if(limit>sLength) {
|
||||
limit=sLength;
|
||||
}
|
||||
return spanBack(s.getBuffer(), limit, spanCondition);
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1998-2009, International Business Machines
|
||||
* Copyright (C) 1998-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
*
|
||||
|
@ -1566,6 +1566,33 @@ public:
|
|||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Create a temporary substring for the specified range.
|
||||
* Unlike the substring constructor and setTo() functions,
|
||||
* the object returned here will be a read-only alias (using getBuffer())
|
||||
* rather than copying the text.
|
||||
* As a result, this substring operation is much faster but requires
|
||||
* that the original string not be modified or deleted during the lifetime
|
||||
* of the returned substring object.
|
||||
* @param start offset of the first character visible in the substring
|
||||
* @param length length of the substring
|
||||
* @return a read-only alias UnicodeString object for the substring
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UnicodeString tempSubString(int32_t start=0, int32_t length=INT32_MAX) const;
|
||||
|
||||
/**
|
||||
* Create a temporary substring for the specified range.
|
||||
* Same as tempSubString(start, length) except that the substring range
|
||||
* is specified as a (start, limit) pair (with an exclusive limit index)
|
||||
* rather than a (start, length) pair.
|
||||
* @param start offset of the first character visible in the substring
|
||||
* @param limit offset immediately following the last character visible in the substring
|
||||
* @return a read-only alias UnicodeString object for the substring
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
inline UnicodeString tempSubStringBetween(int32_t start, int32_t limit=INT32_MAX) const;
|
||||
|
||||
/**
|
||||
* Convert the UnicodeString to UTF-8 and write the result
|
||||
* to a ByteSink. This is called by toUTF8String().
|
||||
|
@ -2396,6 +2423,16 @@ public:
|
|||
inline UnicodeString& removeBetween(int32_t start,
|
||||
int32_t limit = (int32_t)INT32_MAX);
|
||||
|
||||
/**
|
||||
* Retain only the characters in the range
|
||||
* [<code>start</code>, <code>limit</code>) from the UnicodeString object.
|
||||
* Removes characters before <code>start</code> and at and after <code>limit</code>.
|
||||
* @param start the offset of the first character to retain
|
||||
* @param limit the offset immediately following the range to retain
|
||||
* @return a reference to this
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
inline UnicodeString &retainBetween(int32_t start, int32_t limit = INT32_MAX);
|
||||
|
||||
/* Length operations */
|
||||
|
||||
|
@ -4068,6 +4105,11 @@ UnicodeString::extractBetween(int32_t start,
|
|||
doExtract(start, limit - start, dst, dstStart);
|
||||
}
|
||||
|
||||
inline UnicodeString
|
||||
UnicodeString::tempSubStringBetween(int32_t start, int32_t limit) const {
|
||||
return tempSubString(start, limit - start);
|
||||
}
|
||||
|
||||
inline UChar
|
||||
UnicodeString::doCharAt(int32_t offset) const
|
||||
{
|
||||
|
@ -4161,7 +4203,13 @@ UnicodeString::getTerminatedBuffer() {
|
|||
} else {
|
||||
UChar *array = getArrayStart();
|
||||
int32_t len = length();
|
||||
if(len < getCapacity()) {
|
||||
if(len < getCapacity() && ((fFlags&kRefCounted) == 0 || refCount() == 1)) {
|
||||
/*
|
||||
* kRefCounted: Do not write the NUL if the buffer is shared.
|
||||
* That is mostly safe, except when the length of one copy was modified
|
||||
* without copy-on-write, e.g., via truncate(newLength) or remove(void).
|
||||
* Then the NUL would be written into the middle of another copy's string.
|
||||
*/
|
||||
if(!(fFlags&kBufferIsReadonly)) {
|
||||
/*
|
||||
* We must not write to a readonly buffer, but it is known to be
|
||||
|
@ -4332,10 +4380,12 @@ inline UnicodeString&
|
|||
UnicodeString::remove()
|
||||
{
|
||||
// remove() of a bogus string makes the string empty and non-bogus
|
||||
if(isBogus()) {
|
||||
unBogus();
|
||||
// we also un-alias a read-only alias to deal with NUL-termination
|
||||
// issues with getTerminatedBuffer()
|
||||
if(fFlags & (kIsBogus|kBufferIsReadonly)) {
|
||||
setToEmpty();
|
||||
} else {
|
||||
setLength(0);
|
||||
fShortLength = 0;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
@ -4356,6 +4406,12 @@ UnicodeString::removeBetween(int32_t start,
|
|||
int32_t limit)
|
||||
{ return doReplace(start, limit - start, NULL, 0, 0); }
|
||||
|
||||
inline UnicodeString &
|
||||
UnicodeString::retainBetween(int32_t start, int32_t limit) {
|
||||
truncate(limit);
|
||||
return doReplace(0, start, NULL, 0, 0);
|
||||
}
|
||||
|
||||
inline UBool
|
||||
UnicodeString::truncate(int32_t targetLength)
|
||||
{
|
||||
|
@ -4365,6 +4421,9 @@ UnicodeString::truncate(int32_t targetLength)
|
|||
return FALSE;
|
||||
} else if((uint32_t)targetLength < (uint32_t)length()) {
|
||||
setLength(targetLength);
|
||||
if(fFlags&kBufferIsReadonly) {
|
||||
fUnion.fFields.fCapacity = targetLength; // not NUL-terminated any more
|
||||
}
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (c) 1996-2007, International Business Machines Corporation
|
||||
* Copyright (c) 1996-2010, International Business Machines Corporation
|
||||
* and others. All Rights Reserved.
|
||||
*******************************************************************************
|
||||
* File unorm.h
|
||||
|
@ -20,6 +20,7 @@
|
|||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/unorm2.h"
|
||||
|
||||
/**
|
||||
* \file
|
||||
|
@ -27,6 +28,11 @@
|
|||
*
|
||||
* <h2>Unicode normalization API</h2>
|
||||
*
|
||||
* Note: This API has been replaced by the unorm2.h API and is only available
|
||||
* for backward compatibility. The functions here simply delegate to the
|
||||
* unorm2.h functions, for example unorm2_getInstance() and unorm2_normalize().
|
||||
* There is one exception: The new API does not provide a replacement for unorm_compare().
|
||||
*
|
||||
* <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <code>unorm_normalize</code> supports the standard normalization forms described in
|
||||
|
@ -202,28 +208,7 @@ unorm_normalize(const UChar *source, int32_t sourceLength,
|
|||
UNormalizationMode mode, int32_t options,
|
||||
UChar *result, int32_t resultLength,
|
||||
UErrorCode *status);
|
||||
#endif
|
||||
/**
|
||||
* Result values for unorm_quickCheck().
|
||||
* For details see Unicode Technical Report 15.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
typedef enum UNormalizationCheckResult {
|
||||
/**
|
||||
* Indicates that string is not in the normalized format
|
||||
*/
|
||||
UNORM_NO,
|
||||
/**
|
||||
* Indicates that string is in the normalized format
|
||||
*/
|
||||
UNORM_YES,
|
||||
/**
|
||||
* Indicates that string cannot be determined if it is in the normalized
|
||||
* format without further thorough checks.
|
||||
*/
|
||||
UNORM_MAYBE
|
||||
} UNormalizationCheckResult;
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/**
|
||||
* Performing quick check on a string, to quickly determine if the string is
|
||||
* in a particular normalization format.
|
||||
|
|
348
icu4c/source/common/unicode/unorm2.h
Normal file
348
icu4c/source/common/unicode/unorm2.h
Normal file
|
@ -0,0 +1,348 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: unorm2.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009dec15
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __UNORM2_H__
|
||||
#define __UNORM2_H__
|
||||
|
||||
/**
|
||||
* \file
|
||||
* \brief C API: New API for Unicode Normalization.
|
||||
*
|
||||
* Unicode normalization functionality for standard Unicode normalization or
|
||||
* for using custom mapping tables.
|
||||
* All instances of UNormalizer2 are unmodifiable/immutable.
|
||||
* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
|
||||
* For more details see the Normalizer2 C++ class.
|
||||
*/
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/uset.h"
|
||||
|
||||
/**
|
||||
* Constants for normalization modes.
|
||||
* For details about standard Unicode normalization forms
|
||||
* and about the algorithms which are also used with custom mapping tables
|
||||
* see http://www.unicode.org/unicode/reports/tr15/
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
typedef enum {
|
||||
/**
|
||||
* Decomposition followed by composition.
|
||||
* Same as standard NFC when using an "nfc" instance.
|
||||
* Same as standard NFKC when using an "nfkc" instance.
|
||||
* For details about standard Unicode normalization forms
|
||||
* see http://www.unicode.org/unicode/reports/tr15/
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UNORM2_COMPOSE,
|
||||
/**
|
||||
* Map, and reorder canonically.
|
||||
* Same as standard NFD when using an "nfc" instance.
|
||||
* Same as standard NFKD when using an "nfkc" instance.
|
||||
* For details about standard Unicode normalization forms
|
||||
* see http://www.unicode.org/unicode/reports/tr15/
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UNORM2_DECOMPOSE,
|
||||
/**
|
||||
* "Fast C or D" form.
|
||||
* Further decomposition <i>without reordering</i>
|
||||
* would yield the same form as DECOMPOSE.
|
||||
* Text in "Fast C or D" form can be processed efficiently with data tables
|
||||
* that are "canonically closed", that is, that provide equivalent data for
|
||||
* equivalent text, without having to be fully normalized.
|
||||
* Not a standard Unicode normalization form.
|
||||
* Not a unique form: Different FCD strings can be canonically equivalent.
|
||||
* For details see http://www.unicode.org/notes/tn5/#FCD
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UNORM2_FCD,
|
||||
/**
|
||||
* Compose only contiguously.
|
||||
* Also known as "FCC" or "Fast C Contiguous".
|
||||
* The result will often but not always be in NFC.
|
||||
* The result will conform to FCD which is useful for processing.
|
||||
* Not a standard Unicode normalization form.
|
||||
* For details see http://www.unicode.org/notes/tn5/#FCC
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
UNORM2_COMPOSE_CONTIGUOUS
|
||||
} UNormalization2Mode;
|
||||
|
||||
/**
|
||||
* Result values for normalization quick check functions.
|
||||
* For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
typedef enum UNormalizationCheckResult {
|
||||
/**
|
||||
* The input string is not in the normalization form.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UNORM_NO,
|
||||
/**
|
||||
* The input string is in the normalization form.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UNORM_YES,
|
||||
/**
|
||||
* The input string may or may not be in the normalization form.
|
||||
* This value is only returned for composition forms like NFC and FCC,
|
||||
* when a backward-combining character is found for which the surrounding text
|
||||
* would have to be analyzed further.
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
UNORM_MAYBE
|
||||
} UNormalizationCheckResult;
|
||||
|
||||
/**
|
||||
* Opaque C service object type for the new normalization API.
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
struct UNormalizer2;
|
||||
typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @draft ICU 4.4 */
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/**
|
||||
* Returns a UNormalizer2 instance which uses the specified data file
|
||||
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
|
||||
* and which composes or decomposes text according to the specified mode.
|
||||
* Returns an unmodifiable singleton instance. Do not delete it.
|
||||
*
|
||||
* Use packageName=NULL for data files that are part of ICU's own data.
|
||||
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
|
||||
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
|
||||
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
|
||||
*
|
||||
* @param packageName NULL for ICU built-in data, otherwise application data package name
|
||||
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
|
||||
* @param mode normalization mode (compose or decompose etc.)
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the requested UNormalizer2, if successful
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT const UNormalizer2 * U_EXPORT2
|
||||
unorm2_getInstance(const char *packageName,
|
||||
const char *name,
|
||||
UNormalization2Mode mode,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Constructs a filtered normalizer wrapping any UNormalizer2 instance
|
||||
* and a filter set.
|
||||
* Both are aliased and must not be modified or deleted while this object
|
||||
* is used.
|
||||
* The filter set should be frozen; otherwise the performance will suffer greatly.
|
||||
* @param norm2 wrapped Normalizer2 instance
|
||||
* @param filterSet USet which determines the characters to be normalized
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the requested UNormalizer2, if successful
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UNormalizer2 * U_EXPORT2
|
||||
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Closes a UNormalizer2 instance from unorm2_openFiltered().
|
||||
* Do not close instances from unorm2_getInstance()!
|
||||
* @param norm2 UNormalizer2 instance to be closed
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT void U_EXPORT2
|
||||
unorm2_close(UNormalizer2 *norm2);
|
||||
|
||||
/**
|
||||
* Writes the normalized form of the source string to the destination string
|
||||
* (replacing its contents) and returns the length of the destination string.
|
||||
* The source and destination strings must be different buffers.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param src source string
|
||||
* @param length length of the source string, or -1 if NUL-terminated
|
||||
* @param dest destination string; its contents are replaced with the normalized src
|
||||
* @param capacity number of UChars that can be written to dest
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the length of the destination string (dest)
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_normalize(const UNormalizer2 *norm2,
|
||||
const UChar *src, int32_t length,
|
||||
UChar *dest, int32_t capacity,
|
||||
UErrorCode *pErrorCode);
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string
|
||||
* (merging them at the boundary) and returns the length of the first string.
|
||||
* The result is normalized if the first string was normalized.
|
||||
* The first and second strings must be different buffers.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param first string, should be normalized
|
||||
* @param firstLength length of the first string, or -1 if NUL-terminated
|
||||
* @param firstCapacity number of UChars that can be written to first
|
||||
* @param second string, will be normalized
|
||||
* @param secondLength length of the second string, or -1 if NUL-terminated
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the length of the first string after the normalized second string was appended
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
|
||||
UChar *first, int32_t firstLength, int32_t firstCapacity,
|
||||
const UChar *second, int32_t secondLength,
|
||||
UErrorCode *pErrorCode);
|
||||
/**
|
||||
* Appends the second string to the first string
|
||||
* (merging them at the boundary) and returns the length of the first string.
|
||||
* The result is normalized if both the strings were normalized.
|
||||
* The first and second strings must be different buffers.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param first string, should be normalized
|
||||
* @param firstLength length of the first string, or -1 if NUL-terminated
|
||||
* @param firstCapacity number of UChars that can be written to first
|
||||
* @param second string, should be normalized
|
||||
* @param secondLength length of the second string, or -1 if NUL-terminated
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the length of the first string after the second string was appended
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_append(const UNormalizer2 *norm2,
|
||||
UChar *first, int32_t firstLength, int32_t firstCapacity,
|
||||
const UChar *second, int32_t secondLength,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* Internally, in cases where the quickCheck() method would return "maybe"
|
||||
* (which is only possible for the two COMPOSE modes) this method
|
||||
* resolves to "yes" or "no" to provide a definitive result,
|
||||
* at the cost of doing more work in those cases.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param s input string
|
||||
* @param length length of the string, or -1 if NUL-terminated
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return TRUE if s is normalized
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
unorm2_isNormalized(const UNormalizer2 *norm2,
|
||||
const UChar *s, int32_t length,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For the two COMPOSE modes, the result could be "maybe" in cases that
|
||||
* would take a little more work to resolve definitively.
|
||||
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
|
||||
* combination of quick check + normalization, to avoid
|
||||
* re-checking the "yes" prefix.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param s input string
|
||||
* @param length length of the string, or -1 if NUL-terminated
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return UNormalizationCheckResult
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UNormalizationCheckResult U_EXPORT2
|
||||
unorm2_quickCheck(const UNormalizer2 *norm2,
|
||||
const UChar *s, int32_t length,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string.
|
||||
* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
|
||||
* the substring <code>UnicodeString(s, 0, end)</code>
|
||||
* will pass the quick check with a "yes" result.
|
||||
*
|
||||
* The returned end index is usually one or more characters before the
|
||||
* "no" or "maybe" character: The end index is at a normalization boundary.
|
||||
* (See the class documentation for more about normalization boundaries.)
|
||||
*
|
||||
* When the goal is a normalized string and most input strings are expected
|
||||
* to be normalized already, then call this method,
|
||||
* and if it returns a prefix shorter than the input string,
|
||||
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param s input string
|
||||
* @param length length of the string, or -1 if NUL-terminated
|
||||
* @param pErrorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @return the end index of the "yes" span, that is, of the normalized prefix of s
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT int32_t U_EXPORT2
|
||||
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
|
||||
const UChar *s, int32_t length,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary before it.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary before it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
|
||||
|
||||
/**
|
||||
* Tests if the character has a normalization boundary after it.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c character to test
|
||||
* @return TRUE if c has a normalization boundary after it
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
|
||||
|
||||
/**
|
||||
* Tests if the character is normalization-inert.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
* @param norm2 UNormalizer2 instance
|
||||
* @param c character to test
|
||||
* @return TRUE if c is normalization-inert
|
||||
* @draft ICU 4.4
|
||||
*/
|
||||
U_DRAFT UBool U_EXPORT2
|
||||
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
#endif /* __UNORM2_H__ */
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -33,12 +33,15 @@
|
|||
#include "uvector.h"
|
||||
#include "uprops.h"
|
||||
#include "propname.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "unormimp.h"
|
||||
#include "ucase.h"
|
||||
#include "ubidi_props.h"
|
||||
#include "uinvchar.h"
|
||||
#include "uprops.h"
|
||||
#include "charstr.h"
|
||||
#include "cstring.h"
|
||||
#include "mutex.h"
|
||||
#include "umutex.h"
|
||||
#include "uassert.h"
|
||||
#include "hash.h"
|
||||
|
@ -91,10 +94,43 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
|
|||
*/
|
||||
//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
|
||||
|
||||
// Cached sets ------------------------------------------------------------- ***
|
||||
|
||||
U_CDECL_BEGIN
|
||||
static UBool U_CALLCONV uset_cleanup();
|
||||
U_CDECL_END
|
||||
|
||||
// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
|
||||
// can only fail with an out-of-memory error
|
||||
// if we have a correct pattern and the properties data is hardcoded and always available.
|
||||
/**
 * Pairs a SimpleSingleton with the pattern string used to build its UnicodeSet.
 * getInstance() hands createInstance() and the pattern to the
 * SimpleSingletonWrapper base class, which decides when the set is created.
 */
class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
public:
    /**
     * @param s       the singleton storage to wrap
     * @param pattern invariant-character UnicodeSet pattern; may be NULL when the
     *                wrapper is only used for deleteInstance()
     */
    UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
            SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}

    /**
     * @param errorCode Standard ICU error code, passed through to the base class.
     * @return the set returned by the base-class getInstance()
     */
    UnicodeSet *getInstance(UErrorCode &errorCode) {
        return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
    }
private:
    /**
     * Instantiation callback: builds a frozen UnicodeSet from the pattern.
     * @param context   the const char * pattern passed to getInstance()
     * @param errorCode set to U_MEMORY_ALLOCATION_ERROR if the set cannot be allocated;
     *                  otherwise carries whatever the UnicodeSet constructor reports
     * @return the new frozen set, or NULL if allocation failed
     */
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        UnicodeString pattern((const char *)context, -1, US_INV);
        UnicodeSet *set=new UnicodeSet(pattern, errorCode);
        if(set==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            // Must not fall through: freeze() below would dereference NULL.
            return NULL;
        }
        set->freeze();
        ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
        return set;
    }

    const char *fPattern;
};
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
|
||||
|
||||
STATIC_SIMPLE_SINGLETON(uni32Singleton);
|
||||
|
||||
//----------------------------------------------------------------
|
||||
// Inclusions list
|
||||
//----------------------------------------------------------------
|
||||
|
@ -128,7 +164,7 @@ static UBool U_CALLCONV uset_cleanup(void) {
|
|||
INCLUSIONS[i] = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
@ -177,6 +213,27 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
|
|||
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
|
||||
unorm_addPropertyStarts(&sa, &status);
|
||||
break;
|
||||
case UPROPS_SRC_NFC: {
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
|
||||
if(U_SUCCESS(status)) {
|
||||
impl->addPropertyStarts(&sa, status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case UPROPS_SRC_NFKC: {
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
|
||||
if(U_SUCCESS(status)) {
|
||||
impl->addPropertyStarts(&sa, status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case UPROPS_SRC_NFKC_CF: {
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
|
||||
if(U_SUCCESS(status)) {
|
||||
impl->addPropertyStarts(&sa, status);
|
||||
}
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
case UPROPS_SRC_CASE:
|
||||
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
|
||||
|
@ -207,6 +264,13 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
|
|||
return INCLUSIONS[src];
|
||||
}
|
||||
|
||||
// Cache some sets for other services -------------------------------------- ***
|
||||
|
||||
U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode &errorCode) {
    // Bind the file-level uni32Singleton to its pattern, then ask the wrapper for the set.
    UnicodeSetSingleton uni32(uni32Singleton, "[:age=3.2:]");
    return uni32.getInstance(errorCode);
}
|
||||
|
||||
// helper functions for matching of pattern syntax pieces ------------------ ***
|
||||
// these functions are parallel to the PERL_OPEN etc. strings above
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1999-2009, International Business Machines Corporation and *
|
||||
* Copyright (C) 1999-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*
|
||||
|
@ -780,6 +780,17 @@ UnicodeString::extract(int32_t start,
|
|||
return u_terminateChars(target, targetCapacity, length, &status);
|
||||
}
|
||||
|
||||
UnicodeString
UnicodeString::tempSubString(int32_t start, int32_t len) const {
    pinIndices(start, len);
    // getBuffer() rather than getArrayStart() so that kIsBogus & kOpenGetBuffer are checked.
    const UChar *chars = getBuffer();
    if(chars != NULL) {
        // Read-only alias of the pinned [start, start+len) range.
        return UnicodeString(FALSE, chars + start, len);
    }
    // No usable buffer: pass any non-NULL pointer (NULL would yield an empty string)
    // together with length -2 so that the result is bogus.
    return UnicodeString(FALSE, fUnion.fStackBuffer + start, -2);
}
|
||||
|
||||
int32_t
|
||||
UnicodeString::toUTF8(int32_t start, int32_t len,
|
||||
char *target, int32_t capacity) const {
|
||||
|
@ -1218,6 +1229,28 @@ UnicodeString::doReplace(int32_t start,
|
|||
return *this;
|
||||
}
|
||||
|
||||
int32_t oldLength = this->length();
|
||||
|
||||
// optimize (read-only alias).remove(0, start) and .remove(start, end)
|
||||
if((fFlags&kBufferIsReadonly) && srcLength == 0) {
|
||||
if(start == 0) {
|
||||
// remove prefix by adjusting the array pointer
|
||||
pinIndex(length);
|
||||
fUnion.fFields.fArray += length;
|
||||
fUnion.fFields.fCapacity -= length;
|
||||
setLength(oldLength - length);
|
||||
return *this;
|
||||
} else {
|
||||
pinIndex(start);
|
||||
if(length >= (oldLength - start)) {
|
||||
// remove suffix by reducing the length (like truncate())
|
||||
setLength(start);
|
||||
fUnion.fFields.fCapacity = start; // not NUL-terminated any more
|
||||
return *this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(srcChars == 0) {
|
||||
srcStart = srcLength = 0;
|
||||
} else if(srcLength < 0) {
|
||||
|
@ -1225,8 +1258,6 @@ UnicodeString::doReplace(int32_t start,
|
|||
srcLength = u_strlen(srcChars + srcStart);
|
||||
}
|
||||
|
||||
int32_t oldLength = this->length();
|
||||
|
||||
// calculate the size of the string after the replace
|
||||
int32_t newSize;
|
||||
|
||||
|
@ -1594,4 +1625,3 @@ static void uprv_UnicodeStringDummy(void) {
|
|||
delete [] (new UnicodeString[2]);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
* file name: unorm_props_data.c
|
||||
|
@ -14,6 +14,7 @@ static const int32_t indexes[_NORM_INDEX_TOP]={
|
|||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
|
||||
};
|
||||
|
||||
#if 0
|
||||
static const uint16_t normTrie_index[2532]={
|
||||
0,8,0x10,0x18,0x28,0x30,0x38,0x40,0x48,0x50,0x58,0x60,0x68,0x70,0x77,0x7f,
|
||||
0x87,0x8f,0x1f,0x27,0x94,0x9c,0xa3,0xab,0xb3,0xbb,0xc3,0xcb,0xd3,0xdb,0xe3,0xeb,
|
||||
|
@ -835,6 +836,7 @@ static const UTrie2 normTrie={
|
|||
0x2810,
|
||||
NULL, 0, FALSE, FALSE, 0, NULL
|
||||
};
|
||||
#endif
|
||||
|
||||
static const uint16_t extraData[16431]={
|
||||
0x1c2,0xff02,0x20,0x3b9,0xff01,0x3c5,0xff01,0x3cd,0xff01,0x3cb,0xff01,0x3c3,0xff01,0x61,0xff01,0xe6,
|
||||
|
@ -1866,6 +1868,7 @@ static const uint16_t extraData[16431]={
|
|||
0x773,0x776,0x77c,0x782,0x788,0x78e,0x794,0x797,0x79a,0x79d,0x7a0,0x7a3,0x7a6,0x7a9,0x7ac
|
||||
};
|
||||
|
||||
#if 0
|
||||
static const uint16_t combiningTable[1967]={
|
||||
0x7af,0xc0,0x7b0,0xc1,0x7b1,0x20c2,0x7b2,0xc3,0x7b3,0x20c4,0x7b4,0x20c5,0x7b6,0x100,0x7b7,0x2102,
|
||||
0x7b8,0x104,0x7b9,0x2226,0x7ba,0x1cd,0x7bd,0x200,0x7be,0x202,0x7d6,0x1e00,0x7d7,0x3ea0,0x87dd,0x1ea2,
|
||||
|
@ -2416,6 +2419,7 @@ static const UTrie2 fcdTrie={
|
|||
0x1968,
|
||||
NULL, 0, FALSE, FALSE, 0, NULL
|
||||
};
|
||||
#endif
|
||||
|
||||
static const uint16_t auxTrie_index[6664]={
|
||||
0x278,0x280,0x288,0x290,0x278,0x280,0x2a8,0x2b0,0x2b8,0x2c0,0x2c8,0x2d0,0x278,0x280,0x2d8,0x2e0,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -22,12 +22,13 @@
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unormimp.h"
|
||||
#include "ucase.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cmemory.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ucase.h"
|
||||
#include "uprops.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
@ -134,12 +135,19 @@ struct CmpEquivLevel {
|
|||
};
|
||||
typedef struct CmpEquivLevel CmpEquivLevel;
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for decomposing.
|
||||
* If not set, just do strcasecmp().
|
||||
*/
|
||||
#define _COMPARE_EQUIV 0x80000
|
||||
|
||||
/* internal function */
|
||||
static int32_t
|
||||
unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
||||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
const Normalizer2Impl *nfcImpl;
|
||||
const UCaseProps *csp;
|
||||
|
||||
/* current-level start/limit - s1/s2 as current */
|
||||
|
@ -152,7 +160,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
|||
/* stacks of previous-level start/current/limit */
|
||||
CmpEquivLevel stack1[2], stack2[2];
|
||||
|
||||
/* decomposition buffers for Hangul */
|
||||
/* buffers for algorithmic decompositions */
|
||||
UChar decomp1[4], decomp2[4];
|
||||
|
||||
/* case folding buffers, only use current-level start/limit */
|
||||
|
@ -173,19 +181,19 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
|||
*/
|
||||
|
||||
/* normalization/properties data loaded? */
|
||||
if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) ||
|
||||
U_FAILURE(*pErrorCode)
|
||||
) {
|
||||
return 0;
|
||||
if((options&_COMPARE_EQUIV)!=0) {
|
||||
nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
|
||||
} else {
|
||||
nfcImpl=NULL;
|
||||
}
|
||||
if((options&U_COMPARE_IGNORE_CASE)!=0) {
|
||||
csp=ucase_getSingleton(pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
} else {
|
||||
csp=NULL;
|
||||
}
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
start1=s1;
|
||||
|
@ -404,7 +412,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
|||
}
|
||||
|
||||
if( level1<2 && (options&_COMPARE_EQUIV) &&
|
||||
0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length))
|
||||
0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
|
||||
) {
|
||||
/* cp1 decomposes into p[length] */
|
||||
if(U_IS_SURROGATE(c1)) {
|
||||
|
@ -445,7 +453,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
|
|||
}
|
||||
|
||||
if( level2<2 && (options&_COMPARE_EQUIV) &&
|
||||
0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length))
|
||||
0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
|
||||
) {
|
||||
/* cp2 decomposes into p[length] */
|
||||
if(U_IS_SURROGATE(c2)) {
|
||||
|
@ -534,14 +542,8 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
const UChar *s2, int32_t length2,
|
||||
uint32_t options,
|
||||
UErrorCode *pErrorCode) {
|
||||
MaybeStackArray<UChar, 300> fcd1, fcd2;
|
||||
const UnicodeSet *nx;
|
||||
UNormalizationMode mode;
|
||||
int32_t normOptions;
|
||||
int32_t result;
|
||||
|
||||
/* argument checking */
|
||||
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s1==0 || length1<-1 || s2==0 || length2<-1) {
|
||||
|
@ -549,21 +551,9 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
return 0;
|
||||
}
|
||||
|
||||
if(!unorm_haveData(pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(!uprv_haveProperties(pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
|
||||
nx=unorm_getNX(normOptions, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
UnicodeString fcd1, fcd2;
|
||||
int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
|
||||
options|=_COMPARE_EQUIV;
|
||||
result=0;
|
||||
|
||||
/*
|
||||
* UAX #21 Case Mappings, as fixed for Unicode version 4
|
||||
|
@ -586,20 +576,30 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
* are first decomposed or not, so an FCD check - a check only for
|
||||
* canonical order - is not sufficient.
|
||||
*/
|
||||
if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
mode=UNORM_NFD;
|
||||
options&=~UNORM_INPUT_IS_FCD;
|
||||
} else {
|
||||
mode=UNORM_FCD;
|
||||
}
|
||||
|
||||
if(!(options&UNORM_INPUT_IS_FCD)) {
|
||||
int32_t _len1, _len2;
|
||||
UBool isFCD1, isFCD2;
|
||||
if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
|
||||
const Normalizer2 *n2;
|
||||
if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
|
||||
n2=Normalizer2Factory::getNFDInstance(*pErrorCode);
|
||||
} else {
|
||||
n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
|
||||
}
|
||||
|
||||
// check if s1 and/or s2 fulfill the FCD conditions
|
||||
isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
|
||||
isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
|
||||
const UnicodeSet *uni32;
|
||||
if(normOptions&UNORM_UNICODE_3_2) {
|
||||
uni32=uniset_getUnicode32Instance(*pErrorCode);
|
||||
} else {
|
||||
uni32=NULL; // unused
|
||||
}
|
||||
FilteredNormalizer2 fn2(*n2, *uni32);
|
||||
if(normOptions&UNORM_UNICODE_3_2) {
|
||||
n2=&fn2;
|
||||
}
|
||||
|
||||
UnicodeString str1(length1<0, s1, length1);
|
||||
UnicodeString str2(length2<0, s2, length2);
|
||||
int32_t spanQCYes1=n2->spanQuickCheckYes(str1, *pErrorCode);
|
||||
int32_t spanQCYes2=n2->spanQuickCheckYes(str2, *pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -613,59 +613,27 @@ unorm_compare(const UChar *s1, int32_t length1,
|
|||
* Therefore, ICU 2.6 removes that optimization.
|
||||
*/
|
||||
|
||||
if(!isFCD1) {
|
||||
_len1=unorm_internalNormalizeWithNX(fcd1.getAlias(), fcd1.getCapacity(),
|
||||
s1, length1,
|
||||
mode, normOptions, nx,
|
||||
pErrorCode);
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
if(fcd1.resize(_len1)==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return result;
|
||||
}
|
||||
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
_len1=unorm_internalNormalizeWithNX(fcd1.getAlias(), fcd1.getCapacity(),
|
||||
s1, length1,
|
||||
mode, normOptions, nx,
|
||||
pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
s1=fcd1.getAlias();
|
||||
length1=_len1;
|
||||
if(spanQCYes1<str1.length()) {
|
||||
UnicodeString unnormalized=str1.tempSubString(spanQCYes1);
|
||||
fcd1.setTo(FALSE, str1.getBuffer(), spanQCYes1);
|
||||
n2->normalizeSecondAndAppend(fcd1, unnormalized, *pErrorCode);
|
||||
s1=fcd1.getBuffer();
|
||||
length1=fcd1.length();
|
||||
}
|
||||
|
||||
if(!isFCD2) {
|
||||
_len2=unorm_internalNormalizeWithNX(fcd2.getAlias(), fcd2.getCapacity(),
|
||||
s2, length2,
|
||||
mode, normOptions, nx,
|
||||
pErrorCode);
|
||||
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
|
||||
if(fcd2.resize(_len2)==NULL) {
|
||||
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return result;
|
||||
}
|
||||
|
||||
*pErrorCode=U_ZERO_ERROR;
|
||||
_len2=unorm_internalNormalizeWithNX(fcd2.getAlias(), fcd2.getCapacity(),
|
||||
s2, length2,
|
||||
mode, normOptions, nx,
|
||||
pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
s2=fcd2.getAlias();
|
||||
length2=_len2;
|
||||
if(spanQCYes2<str2.length()) {
|
||||
UnicodeString unnormalized=str2.tempSubString(spanQCYes2);
|
||||
fcd2.setTo(FALSE, str2.getBuffer(), spanQCYes2);
|
||||
n2->normalizeSecondAndAppend(fcd2, unnormalized, *pErrorCode);
|
||||
s2=fcd2.getBuffer();
|
||||
length2=fcd2.length();
|
||||
}
|
||||
}
|
||||
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
|
||||
return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -163,25 +163,6 @@ enum {
|
|||
_NORM_DECOMP_LENGTH_MASK=0x7f
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
/* Korean Hangul and Jamo constants */
|
||||
enum {
|
||||
JAMO_L_BASE=0x1100, /* "lead" jamo */
|
||||
JAMO_V_BASE=0x1161, /* "vowel" jamo */
|
||||
JAMO_T_BASE=0x11a7, /* "trail" jamo */
|
||||
|
||||
HANGUL_BASE=0xac00,
|
||||
|
||||
JAMO_L_COUNT=19,
|
||||
JAMO_V_COUNT=21,
|
||||
JAMO_T_COUNT=28,
|
||||
|
||||
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT
|
||||
};
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/* Constants for options flags for normalization. @draft ICU 2.6 */
|
||||
enum {
|
||||
/** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
|
||||
|
@ -205,199 +186,6 @@ enum {
|
|||
U_CAPI UBool U_EXPORT2
|
||||
unorm_haveData(UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API for normalizing.
|
||||
* Does not check for bad input.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
/**
|
||||
* Internal API for normalizing.
|
||||
* Does not check for bad input.
|
||||
* Requires _haveData() to be true.
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC int32_t
|
||||
unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UNormalizationMode mode, int32_t options, const U_NAMESPACE_QUALIFIER UnicodeSet *nx,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* internal API, used by normlzr.cpp
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_decompose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* internal API, used by normlzr.cpp
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_compose(UChar *dest, int32_t destCapacity,
|
||||
const UChar *src, int32_t srcLength,
|
||||
UBool compat, int32_t options,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
/**
|
||||
* internal API, used by unormcmp.cpp
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UNormalizationCheckResult
|
||||
unorm_internalQuickCheck(const UChar *src,
|
||||
int32_t srcLength,
|
||||
UNormalizationMode mode,
|
||||
UBool allowMaybe,
|
||||
const U_NAMESPACE_QUALIFIER UnicodeSet *nx,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#endif
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for decomposing.
|
||||
* If not set, just do strcasecmp().
|
||||
* @internal
|
||||
*/
|
||||
#define _COMPARE_EQUIV 0x80000
|
||||
|
||||
#ifndef U_COMPARE_IGNORE_CASE
|
||||
/* see also unorm.h */
|
||||
/**
|
||||
* Option bit for unorm_compare:
|
||||
* Perform case-insensitive comparison.
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
#define U_COMPARE_IGNORE_CASE 0x10000
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for strncmp style.
|
||||
* If set, checks for both string length and terminating NUL.
|
||||
* @internal
|
||||
*/
|
||||
#define _STRNCMP_STYLE 0x1000
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
/**
|
||||
* Internal API to get the 16-bit FCD value (lccc + tccc) for c,
|
||||
* for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC uint16_t U_EXPORT2
|
||||
unorm_getFCD16FromCodePoint(UChar32 c);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get access to the internal FCD trie table to be able to perform
|
||||
* incremental, per-code unit, FCD checks in collation.
|
||||
* One pointer is sufficient because the trie index values are offset
|
||||
* by the index size, so that the same pointer is used to access the trie data.
|
||||
* Code points at fcdHighStart and above have a zero FCD value.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI const uint16_t * U_EXPORT2
|
||||
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value for a code unit, with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* If c is a lead surrogate and the value is not 0,
|
||||
* then some of c's associated supplementary code points have a non-zero FCD value.
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
    /* Look up the code unit as-is; a lead surrogate is not combined with a trail here. */
    const int32_t dataIndex=_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c);
    return fcdTrieIndex[dataIndex];
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the next code point (post-increment), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
                const UChar *&s, const UChar *limit) {
    UChar32 c=*s++;
    uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
    if(fcd==0 || !U16_IS_LEAD(c)) {
        /* BMP code unit, or a lead surrogate whose single-unit value is already 0 */
        return fcd;
    }
    if(s==limit || !U16_IS_TRAIL(*s)) {
        /* unpaired lead surrogate */
        return 0;
    }
    /* consume the trail surrogate and look up the supplementary code point */
    UChar trail=*s++;
    c=U16_GET_SUPPLEMENTARY(c, trail);
    if(c>=fcdHighStart) {
        return 0;
    }
    return fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
}
|
||||
|
||||
/**
|
||||
* Internal API, used by collation code.
|
||||
* Get the FCD value of the previous code point (pre-decrement), with
|
||||
* bits 15..8 lead combining class
|
||||
* bits 7..0 trail combining class
|
||||
*
|
||||
* @internal
|
||||
*/
|
||||
static inline uint16_t
unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
                const UChar *start, const UChar *&s) {
    UChar32 c=*--s;
    if(!U16_IS_SURROGATE(c)) {
        /* ordinary BMP code unit: direct lookup */
        return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
    }
    if(!U16_IS_SURROGATE_TRAIL(c) || s==start || !U16_IS_LEAD(*(s-1))) {
        /* unpaired surrogate */
        return 0;
    }
    /* step back over the lead surrogate and look up the supplementary code point */
    UChar lead=*--s;
    c=U16_GET_SUPPLEMENTARY(lead, c);
    if(c>=fcdHighStart) {
        return 0;
    }
    return fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
}
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* internal API, used by StringPrep
|
||||
* @internal
|
||||
|
@ -405,35 +193,6 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
|
|||
U_CAPI void U_EXPORT2
|
||||
unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Get the canonical decomposition for one code point.
|
||||
* Requires unorm_haveData() and buffer!=NULL and pLength!=NULL.
|
||||
* @param c code point
|
||||
* @param buffer out-only buffer for algorithmic decompositions of Hangul
|
||||
* @param length out-only, takes the length of the decomposition, if any
|
||||
* @return pointer to decomposition, or 0 if none
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC const UChar *
|
||||
unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength);
|
||||
|
||||
/**
|
||||
* internal API, used by the canonical iterator
|
||||
* TODO Consider using signature similar to unorm_getCanonicalDecomposition()
|
||||
* for more efficiency
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
unorm_getDecomposition(UChar32 c, UBool compat,
|
||||
UChar *dest, int32_t destCapacity);
|
||||
|
||||
/**
|
||||
* internal API, used by uprops.cpp
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UBool U_EXPORT2
|
||||
unorm_internalIsFullCompositionExclusion(UChar32 c);
|
||||
|
||||
/**
|
||||
* Internal API, used by enumeration of canonically equivalent strings
|
||||
* @internal
|
||||
|
@ -448,13 +207,6 @@ unorm_isCanonSafeStart(UChar32 c);
|
|||
U_CAPI UBool U_EXPORT2
|
||||
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
|
||||
|
||||
/**
|
||||
* Is c an NF<mode>-skippable code point? See unormimp.h.
|
||||
* @internal
|
||||
*/
|
||||
U_CAPI UBool U_EXPORT2
|
||||
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
/**
|
||||
|
@ -484,13 +236,6 @@ unorm_swap(const UDataSwapper *ds,
|
|||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
|
||||
* @internal
|
||||
*/
|
||||
U_CFUNC UNormalizationCheckResult U_EXPORT2
|
||||
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
|
||||
|
||||
/**
|
||||
* Description of the format of unorm.icu version 2.3.
|
||||
*
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2009, International Business Machines
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: uprops.h
|
||||
* file name: uprops.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
|
@ -26,6 +26,7 @@
|
|||
#include "unicode/uscript.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "cstring.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "umutex.h"
|
||||
#include "unormimp.h"
|
||||
|
@ -106,7 +107,7 @@ static const struct {
|
|||
{ 1, U_MASK(UPROPS_DEPRECATED) },
|
||||
{ 1, U_MASK(UPROPS_DIACRITIC) },
|
||||
{ 1, U_MASK(UPROPS_EXTENDER) },
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
|
||||
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
|
||||
{ 1, U_MASK(UPROPS_GRAPHEME_BASE) },
|
||||
{ 1, U_MASK(UPROPS_GRAPHEME_EXTEND) },
|
||||
{ 1, U_MASK(UPROPS_GRAPHEME_LINK) },
|
||||
|
@ -134,10 +135,10 @@ static const struct {
|
|||
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CASE_SENSITIVE */
|
||||
{ 1, U_MASK(UPROPS_S_TERM) },
|
||||
{ 1, U_MASK(UPROPS_VARIATION_SELECTOR) },
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFD_INERT */
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFKD_INERT */
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFC_INERT */
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFKC_INERT */
|
||||
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFD_INERT */
|
||||
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKD_INERT */
|
||||
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFC_INERT */
|
||||
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKC_INERT */
|
||||
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_SEGMENT_STARTER */
|
||||
{ 1, U_MASK(UPROPS_PATTERN_SYNTAX) },
|
||||
{ 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE) },
|
||||
|
@ -152,7 +153,8 @@ static const struct {
|
|||
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_UPPERCASED */
|
||||
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_TITLECASED */
|
||||
{ UPROPS_SRC_CASE_AND_NORM, 0 }, /* UCHAR_CHANGES_WHEN_CASEFOLDED */
|
||||
{ UPROPS_SRC_CASE, 0 } /* UCHAR_CHANGES_WHEN_CASEMAPPED */
|
||||
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_CASEMAPPED */
|
||||
{ UPROPS_SRC_NFKC_CF, 0 } /* UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED */
|
||||
};
|
||||
|
||||
U_CAPI UBool U_EXPORT2
|
||||
|
@ -173,18 +175,56 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
|||
#if !UCONFIG_NO_NORMALIZATION
|
||||
/* normalization properties from unorm.icu */
|
||||
switch(which) {
|
||||
case UCHAR_FULL_COMPOSITION_EXCLUSION:
|
||||
return unorm_internalIsFullCompositionExclusion(c);
|
||||
case UCHAR_NFD_INERT:
|
||||
case UCHAR_NFKD_INERT:
|
||||
case UCHAR_NFC_INERT:
|
||||
case UCHAR_NFKC_INERT:
|
||||
return unorm_isNFSkippable(c, (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD));
|
||||
case UCHAR_SEGMENT_STARTER:
|
||||
return unorm_isCanonSafeStart(c);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
} else if(column==UPROPS_SRC_NFC || column==UPROPS_SRC_NFKC) {
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
switch(which) {
|
||||
case UCHAR_FULL_COMPOSITION_EXCLUSION: {
|
||||
// By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return impl->isCompNo(impl->getNorm16(c));
|
||||
}
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
// UCHAR_NF..._INERT properties
|
||||
const Normalizer2 *norm2=Normalizer2Factory::getInstance(
|
||||
(UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return norm2->isInert(c);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
} else if(column==UPROPS_SRC_NFKC_CF) {
|
||||
// currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
UnicodeString src(c);
|
||||
UnicodeString dest;
|
||||
{
|
||||
// The ReorderingBuffer must be in a block because its destructor
|
||||
// needs to release dest's buffer before we look at its contents.
|
||||
ReorderingBuffer buffer(*kcf, dest);
|
||||
// Small destCapacity for NFKC_CF(c).
|
||||
if(U_SUCCESS(errorCode) && buffer.init(5, errorCode)) {
|
||||
const UChar *srcArray=src.getBuffer();
|
||||
kcf->compose(srcArray, srcArray+src.length(), FALSE,
|
||||
TRUE, buffer, errorCode);
|
||||
}
|
||||
}
|
||||
return U_SUCCESS(errorCode) && dest!=src;
|
||||
}
|
||||
#endif
|
||||
} else if(column==UPROPS_SRC_BIDI) {
|
||||
/* bidi/shaping properties */
|
||||
|
@ -225,14 +265,16 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
|||
} else if(column==UPROPS_SRC_CASE_AND_NORM) {
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
UChar nfdBuffer[4];
|
||||
const UChar *nfd=NULL;
|
||||
const UChar *nfd;
|
||||
int32_t nfdLength;
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return FALSE;
|
||||
}
|
||||
switch(which) {
|
||||
case UCHAR_CHANGES_WHEN_CASEFOLDED:
|
||||
if(unorm_haveData(&errorCode)) {
|
||||
nfd=unorm_getCanonicalDecomposition(c, nfdBuffer, &nfdLength);
|
||||
}
|
||||
nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength);
|
||||
if(nfd!=NULL) {
|
||||
/* c has a decomposition */
|
||||
if(nfdLength==1) {
|
||||
|
@ -274,6 +316,32 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
|
|||
return FALSE;
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
U_CAPI uint8_t U_EXPORT2
|
||||
u_getCombiningClass(UChar32 c) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return impl->getCC(impl->getNorm16(c));
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
static uint16_t
|
||||
getFCD16(UChar32 c) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
const UTrie2 *trie=Normalizer2Factory::getFCDTrie(errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return UTRIE2_GET16(trie, c);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
|
||||
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
|
||||
|
@ -311,11 +379,9 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
|
|||
return (int32_t)u_charDirection(c);
|
||||
case UCHAR_BLOCK:
|
||||
return (int32_t)ublock_getCode(c);
|
||||
case UCHAR_CANONICAL_COMBINING_CLASS:
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
case UCHAR_CANONICAL_COMBINING_CLASS:
|
||||
return u_getCombiningClass(c);
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
case UCHAR_DECOMPOSITION_TYPE:
|
||||
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
|
||||
|
@ -352,9 +418,9 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
|
|||
case UCHAR_NFKC_QUICK_CHECK:
|
||||
return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD));
|
||||
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
|
||||
return unorm_getFCD16FromCodePoint(c)>>8;
|
||||
return getFCD16(c)>>8;
|
||||
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
|
||||
return unorm_getFCD16FromCodePoint(c)&0xff;
|
||||
return getFCD16(c)&0xff;
|
||||
#endif
|
||||
case UCHAR_GRAPHEME_CLUSTER_BREAK:
|
||||
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
|
||||
|
@ -462,12 +528,13 @@ uprops_getSource(UProperty which) {
|
|||
|
||||
case UCHAR_CANONICAL_COMBINING_CLASS:
|
||||
case UCHAR_NFD_QUICK_CHECK:
|
||||
case UCHAR_NFKD_QUICK_CHECK:
|
||||
case UCHAR_NFC_QUICK_CHECK:
|
||||
case UCHAR_NFKC_QUICK_CHECK:
|
||||
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
|
||||
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
|
||||
return UPROPS_SRC_NORM;
|
||||
return UPROPS_SRC_NFC;
|
||||
case UCHAR_NFKD_QUICK_CHECK:
|
||||
case UCHAR_NFKC_QUICK_CHECK:
|
||||
return UPROPS_SRC_NFKC;
|
||||
|
||||
case UCHAR_BIDI_CLASS:
|
||||
case UCHAR_JOINING_GROUP:
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2002-2009, International Business Machines
|
||||
* Copyright (C) 2002-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -332,6 +332,12 @@ enum UPropertySource {
|
|||
UPROPS_SRC_CHAR_AND_PROPSVEC,
|
||||
/** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
|
||||
UPROPS_SRC_CASE_AND_NORM,
|
||||
/** From normalizer2impl.cpp/nfc.nrm */
|
||||
UPROPS_SRC_NFC,
|
||||
/** From normalizer2impl.cpp/nfkc.nrm */
|
||||
UPROPS_SRC_NFKC,
|
||||
/** From normalizer2impl.cpp/nfkc_cf.nrm */
|
||||
UPROPS_SRC_NFKC_CF,
|
||||
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
|
||||
UPROPS_SRC_COUNT
|
||||
};
|
||||
|
@ -390,4 +396,18 @@ uchar_swapNames(const UDataSwapper *ds,
|
|||
const void *inData, int32_t length, void *outData,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
class UnicodeSet;
|
||||
|
||||
// implemented in uniset_props.cpp
|
||||
U_CFUNC UnicodeSet *
|
||||
uniset_getUnicode32Instance(UErrorCode &errorCode);
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* file name: ustr_imp.h
|
||||
|
@ -25,6 +25,23 @@
|
|||
typedef struct UBreakIterator UBreakIterator;
|
||||
#endif
|
||||
|
||||
#ifndef U_COMPARE_IGNORE_CASE
|
||||
/* see also unorm.h */
|
||||
/**
|
||||
* Option bit for unorm_compare:
|
||||
* Perform case-insensitive comparison.
|
||||
* @draft ICU 2.2
|
||||
*/
|
||||
#define U_COMPARE_IGNORE_CASE 0x10000
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Internal option for unorm_cmpEquivFold() for strncmp style.
|
||||
* If set, checks for both string length and terminating NUL.
|
||||
* @internal
|
||||
*/
|
||||
#define _STRNCMP_STYLE 0x1000
|
||||
|
||||
/**
|
||||
* Compare two strings in code point order or code unit order.
|
||||
* Works in strcmp style (both lengths -1),
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -25,7 +25,6 @@
|
|||
#include "unicode/ubrk.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucase.h"
|
||||
#include "unormimp.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
/* string casing ------------------------------------------------------------ */
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
* file name: utrie2.c
|
||||
* file name: utrie2.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
|
@ -423,7 +423,7 @@ utrie2_swap(const UDataSwapper *ds,
|
|||
trie.indexLength=ds->readUInt16(inTrie->indexLength);
|
||||
trie.shiftedDataLength=ds->readUInt16(inTrie->shiftedDataLength);
|
||||
|
||||
valueBits=trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK;
|
||||
valueBits=(UTrie2ValueBits)(trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK);
|
||||
dataLength=(int32_t)trie.shiftedDataLength<<UTRIE2_INDEX_SHIFT;
|
||||
|
||||
if( trie.signature!=UTRIE2_SIG ||
|
||||
|
@ -696,3 +696,39 @@ utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead,
|
|||
lead=(lead-0xd7c0)<<10; /* start code point */
|
||||
enumEitherTrie(trie, lead, lead+0x400, enumValue, enumRange, context);
|
||||
}
|
||||
|
||||
/* C++ convenience wrappers ------------------------------------------------- */
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
uint16_t BackwardUTrie2StringIterator::previous16() {
|
||||
codePointLimit=codePointStart;
|
||||
if(start>=codePointStart) {
|
||||
codePoint=U_SENTINEL;
|
||||
return 0;
|
||||
}
|
||||
uint16_t result;
|
||||
UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
uint16_t ForwardUTrie2StringIterator::next16() {
|
||||
codePointStart=codePointLimit;
|
||||
if(codePointLimit==limit) {
|
||||
codePoint=U_SENTINEL;
|
||||
return 0;
|
||||
}
|
||||
uint16_t result;
|
||||
UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
UTrie2 *UTrie2Singleton::getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
UErrorCode &errorCode) {
|
||||
void *duplicate;
|
||||
UTrie2 *instance=(UTrie2 *)singleton.getInstance(instantiator, context, duplicate, errorCode);
|
||||
utrie2_close((UTrie2 *)duplicate);
|
||||
return instance;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
******************************************************************************
|
||||
|
@ -605,8 +605,70 @@ utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
|
|||
*/
|
||||
#define UTRIE2_GET32_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), data32, c)
|
||||
|
||||
U_CDECL_END
|
||||
|
||||
/* C++ convenience wrappers ------------------------------------------------- */
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
#include "mutex.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
// Use the Forward/Backward subclasses below.
|
||||
class UTrie2StringIterator : public UMemory {
|
||||
public:
|
||||
UTrie2StringIterator(const UTrie2 *t, const UChar *p) :
|
||||
trie(t), codePointStart(p), codePointLimit(p), codePoint(U_SENTINEL) {}
|
||||
|
||||
const UTrie2 *trie;
|
||||
const UChar *codePointStart, *codePointLimit;
|
||||
UChar32 codePoint;
|
||||
};
|
||||
|
||||
class BackwardUTrie2StringIterator : public UTrie2StringIterator {
|
||||
public:
|
||||
BackwardUTrie2StringIterator(const UTrie2 *t, const UChar *s, const UChar *p) :
|
||||
UTrie2StringIterator(t, p), start(s) {}
|
||||
|
||||
uint16_t previous16();
|
||||
|
||||
const UChar *start;
|
||||
};
|
||||
|
||||
class ForwardUTrie2StringIterator : public UTrie2StringIterator {
|
||||
public:
|
||||
// Iteration limit l can be NULL.
|
||||
// In that case, the caller must detect c==0 and stop.
|
||||
ForwardUTrie2StringIterator(const UTrie2 *t, const UChar *p, const UChar *l) :
|
||||
UTrie2StringIterator(t, p), limit(l) {}
|
||||
|
||||
uint16_t next16();
|
||||
|
||||
const UChar *limit;
|
||||
};
|
||||
|
||||
class UTrie2Singleton {
|
||||
public:
|
||||
UTrie2Singleton(SimpleSingleton &s) : singleton(s) {}
|
||||
void deleteInstance() {
|
||||
utrie2_close((UTrie2 *)singleton.fInstance);
|
||||
singleton.reset();
|
||||
}
|
||||
UTrie2 *getInstance(InstantiatorFn *instantiator, const void *context,
|
||||
UErrorCode &errorCode);
|
||||
private:
|
||||
SimpleSingleton &singleton;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
/* Internal definitions ----------------------------------------------------- */
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
/** Build-time trie structure. */
|
||||
struct UNewTrie2;
|
||||
typedef struct UNewTrie2 UNewTrie2;
|
||||
|
|
5
icu4c/source/configure
vendored
5
icu4c/source/configure
vendored
|
@ -2,7 +2,7 @@
|
|||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.63.
|
||||
#
|
||||
# Copyright (c) 1999-2009, International Business Machines Corporation and others. All Rights Reserved.
|
||||
# Copyright (c) 1999-2010, International Business Machines Corporation and others. All Rights Reserved.
|
||||
#
|
||||
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
|
||||
# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
|
||||
|
@ -10583,7 +10583,7 @@ then
|
|||
fi
|
||||
|
||||
# output the Makefiles
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/gennorm2/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
|
||||
|
||||
cat >confcache <<\_ACEOF
|
||||
# This file is a shell script that caches the results of configure
|
||||
|
@ -11210,6 +11210,7 @@ do
|
|||
"tools/gennames/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennames/Makefile" ;;
|
||||
"tools/gentest/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
|
||||
"tools/gennorm/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm/Makefile" ;;
|
||||
"tools/gennorm2/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm2/Makefile" ;;
|
||||
"tools/genprops/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genprops/Makefile" ;;
|
||||
"tools/gencase/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencase/Makefile" ;;
|
||||
"tools/genbidi/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genbidi/Makefile" ;;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# -*-autoconf-*-
|
||||
AC_COPYRIGHT([ Copyright (c) 1999-2009, International Business Machines Corporation and others. All Rights Reserved. ])
|
||||
AC_COPYRIGHT([ Copyright (c) 1999-2010, International Business Machines Corporation and others. All Rights Reserved. ])
|
||||
# configure.in for ICU
|
||||
# Stephen F. Booth, heavily modified by Yves and others
|
||||
|
||||
|
@ -1223,6 +1223,7 @@ AC_CONFIG_FILES([icudefs.mk \
|
|||
tools/gennames/Makefile \
|
||||
tools/gentest/Makefile \
|
||||
tools/gennorm/Makefile \
|
||||
tools/gennorm2/Makefile \
|
||||
tools/genprops/Makefile \
|
||||
tools/gencase/Makefile \
|
||||
tools/genbidi/Makefile \
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
## Makefile.in for ICU data
|
||||
## Copyright (c) 1999-2009, International Business Machines Corporation and
|
||||
## Copyright (c) 1999-2010, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
|
@ -223,7 +223,7 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata
|
|||
# 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu)
|
||||
# from data build. See Jitterbug 4497. (makedata.mak revision 1.117)
|
||||
#
|
||||
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu
|
||||
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm
|
||||
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
|
||||
|
||||
## BRK files
|
||||
|
@ -488,14 +488,17 @@ $(BUILDDIR)/pnames.icu: $(UNICODEDATADIR)/PropertyAliases.txt $(UNICODEDATADIR)/
|
|||
$(INVOKE) $(TOOLBINDIR)/genpname -d $(BUILDDIR)
|
||||
|
||||
# unorm.icu
|
||||
$(BUILDDIR)/unorm.icu: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProps.txt $(UNICODEDATADIR)/BidiMirroring.txt $(TOOLBINDIR)/gennorm$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)pnames.icu $(BUILDDIR)/$(ICUDT)uprops.icu $(BUILDDIR)/$(ICUDT)ucase.icu
|
||||
$(INVOKE) $(TOOLBINDIR)/gennorm -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(BUILDDIR) -u $(UNICODE_VERSION)
|
||||
# ICU 4.4: $(BUILDDIR)/unorm.icu is now prebuilt, see below.
|
||||
$(OUTTMPDIR)/unorm_props_data.c: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProps.txt $(UNICODEDATADIR)/BidiMirroring.txt $(TOOLBINDIR)/gennorm$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)pnames.icu $(BUILDDIR)/$(ICUDT)uprops.icu $(BUILDDIR)/$(ICUDT)ucase.icu
|
||||
$(INVOKE) $(TOOLBINDIR)/gennorm --csource -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(OUTTMPDIR) -u $(UNICODE_VERSION)
|
||||
|
||||
# unorm.icu used to be built like this:
|
||||
# $(INVOKE) $(TOOLBINDIR)/gennorm -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(BUILDDIR) -u $(UNICODE_VERSION)
|
||||
|
||||
# ucadata.icu
|
||||
# used to depend on $(BUILDDIR)/$(ICUDT)unorm.icu $(BUILDDIR)/$(ICUDT)ucase.icu
|
||||
# see Jitterbug 4497
|
||||
$(COLBLDDIR)/ucadata.icu $(COLBLDDIR)/invuca.icu: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLBINDIR)/genuca$(TOOLEXEEXT)
|
||||
$(COLBLDDIR)/ucadata.icu $(COLBLDDIR)/invuca.icu: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLBINDIR)/genuca$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)nfc.nrm
|
||||
$(INVOKE) $(TOOLBINDIR)/genuca -s $(UNICODEDATADIR) -d $(COLBLDDIR) -i $(BUILDDIR)
|
||||
|
||||
# unames.icu
|
||||
|
@ -506,6 +509,13 @@ $(BUILDDIR)/unames.icu: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/Name
|
|||
$(BUILDDIR)/cnvalias.icu: $(UCMSRCDIR)/convrtrs.txt $(TOOLBINDIR)/gencnval$(TOOLEXEEXT)
|
||||
$(INVOKE) $(TOOLBINDIR)/gencnval -d $(BUILDDIR) $(UCMSRCDIR)/convrtrs.txt
|
||||
|
||||
# Targets for prebuilt Unicode data
|
||||
$(BUILDDIR)/unorm.icu: $(SRCDATADIR)/in/unorm.icu
|
||||
$(INVOKE) $(TOOLBINDIR)/icupkg -t$(ICUDATA_CHAR) $< $@
|
||||
|
||||
$(BUILDDIR)/%.nrm: $(SRCDATADIR)/in/%.nrm
|
||||
$(INVOKE) $(TOOLBINDIR)/icupkg -t$(ICUDATA_CHAR) $< $@
|
||||
|
||||
#################################################### SPP
|
||||
# SPP FILES
|
||||
|
||||
|
@ -751,7 +761,7 @@ $(INDEX_RES_FILE): $(INDEX_FILE) $(TOOLBINDIR)/genrb$(TOOLEXEEXT)
|
|||
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
|
||||
# when updating the Unicode data.
|
||||
# Changed in Makefile.in revision 1.147. See Jitterbug 4497.
|
||||
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA)
|
||||
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA) $(OUTTMPDIR)/unorm_props_data.c
|
||||
@echo Unicode .icu files built to $(BUILDDIR)
|
||||
@echo Unicode .c source files built to $(OUTTMPDIR)
|
||||
|
||||
|
|
BIN
icu4c/source/data/in/nfc.nrm
Normal file
BIN
icu4c/source/data/in/nfc.nrm
Normal file
Binary file not shown.
BIN
icu4c/source/data/in/nfkc.nrm
Normal file
BIN
icu4c/source/data/in/nfkc.nrm
Normal file
Binary file not shown.
BIN
icu4c/source/data/in/nfkc_cf.nrm
Normal file
BIN
icu4c/source/data/in/nfkc_cf.nrm
Normal file
Binary file not shown.
BIN
icu4c/source/data/in/unorm.icu
Normal file
BIN
icu4c/source/data/in/unorm.icu
Normal file
Binary file not shown.
|
@ -1,5 +1,5 @@
|
|||
#**********************************************************************
|
||||
#* Copyright (C) 1999-2009, International Business Machines Corporation
|
||||
#* Copyright (C) 1999-2010, International Business Machines Corporation
|
||||
#* and others. All Rights Reserved.
|
||||
#**********************************************************************
|
||||
# nmake file for creating data files on win32
|
||||
|
@ -28,7 +28,7 @@ ICU_LIB_TARGET=$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll
|
|||
!MESSAGE ICU data make path is $(ICUMAKE)
|
||||
|
||||
# Suffixes for data files
|
||||
.SUFFIXES : .ucm .cnv .dll .dat .res .txt .c
|
||||
.SUFFIXES : .nrm .icu .ucm .cnv .dll .dat .res .txt .c
|
||||
|
||||
ICUOUT=$(ICUMAKE)\out
|
||||
|
||||
|
@ -474,8 +474,8 @@ ALL : GODATA "$(ICU_LIB_TARGET)" "$(TESTDATAOUT)\testdata.dat"
|
|||
# when updating the Unicode data.
|
||||
# Changed in makedata.mak revision 1.117. See Jitterbug 4497.
|
||||
# Command line:
|
||||
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=Debug uni-core-data
|
||||
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu"
|
||||
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=x86\Debug uni-core-data
|
||||
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu" "$(ICUTMP)\unorm_props_data.c"
|
||||
@echo Unicode .icu files built to "$(ICUBLD_PKG)"
|
||||
@echo Unicode .c source files built to "$(ICUTMP)"
|
||||
|
||||
|
@ -553,7 +553,7 @@ testdata.jar: GODATA "$(ICUOUT)\icu4j\testdata.jar"
|
|||
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
|
||||
-@erase "$(ICUTMP)\$(ICUPKG).dat"
|
||||
!ELSE
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
|
||||
@echo Building icu data
|
||||
cd "$(ICUBLD_PKG)"
|
||||
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
|
||||
|
@ -563,6 +563,9 @@ confusables.cfu
|
|||
$(ICUCOL)\ucadata.icu
|
||||
$(ICUCOL)\invuca.icu
|
||||
cnvalias.icu
|
||||
nfc.nrm
|
||||
nfkc.nrm
|
||||
nfkc_cf.nrm
|
||||
$(CNV_FILES:.cnv =.cnv
|
||||
)
|
||||
$(ALL_RES:.res =.res
|
||||
|
@ -627,6 +630,7 @@ CLEAN : GODATA
|
|||
-@erase "*.exp"
|
||||
-@erase "*.icu"
|
||||
-@erase "*.lib"
|
||||
-@erase "*.nrm"
|
||||
-@erase "*.res"
|
||||
-@erase "*.spp"
|
||||
-@erase "*.txt"
|
||||
|
@ -878,9 +882,10 @@ res_index:table(nofallback) {
|
|||
@"$(ICUTOOLS)\gencase\$(CFG)\gencase" --csource -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUTMP)"
|
||||
|
||||
# Targets for unorm.icu
|
||||
"$(ICUBLD_PKG)\unorm.icu": "$(ICUUNIDATA)\*.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu"
|
||||
# ICU 4.4: "$(ICUBLD_PKG)\unorm.icu" is now prebuilt, see below.
|
||||
"$(ICUTMP)\unorm_props_data.c": "$(ICUUNIDATA)\*.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu"
|
||||
@echo Creating data file for Unicode Normalization
|
||||
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUBLD_PKG)"
|
||||
@rem @"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUBLD_PKG)"
|
||||
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" --csource -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUTMP)"
|
||||
|
||||
# Targets for converters
|
||||
|
@ -891,10 +896,23 @@ res_index:table(nofallback) {
|
|||
# Targets for ucadata.icu & invuca.icu
|
||||
# used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\unorm.icu"
|
||||
# see Jitterbug 4497
|
||||
"$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu": "$(ICUUNIDATA)\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\genuca.exe"
|
||||
"$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu": "$(ICUUNIDATA)\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\genuca.exe" "$(ICUBLD_PKG)\nfc.nrm"
|
||||
@echo Creating UCA data files
|
||||
@"$(ICUTOOLS)\genuca\$(CFG)\genuca" -d "$(ICUBLD_PKG)\$(ICUCOL)" -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)"
|
||||
|
||||
# Targets for prebuilt Unicode data
|
||||
"$(ICUBLD_PKG)\unorm.icu": $(ICUSRCDATA_RELATIVE_PATH)\in\unorm.icu
|
||||
"$(ICUPBIN)\icupkg" -tl $? $@
|
||||
|
||||
"$(ICUBLD_PKG)\nfc.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfc.nrm
|
||||
"$(ICUPBIN)\icupkg" -tl $? $@
|
||||
|
||||
"$(ICUBLD_PKG)\nfkc.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfkc.nrm
|
||||
"$(ICUPBIN)\icupkg" -tl $? $@
|
||||
|
||||
"$(ICUBLD_PKG)\nfkc_cf.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfkc_cf.nrm
|
||||
"$(ICUPBIN)\icupkg" -tl $? $@
|
||||
|
||||
# Stringprep .spp file generation.
|
||||
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUSPREP)}.txt.spp:
|
||||
@echo Creating $@
|
||||
|
@ -924,6 +942,6 @@ $(MISC_SOURCE) $(RB_FILES) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FI
|
|||
# This used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu"
|
||||
# This data is now hard coded as a part of the library.
|
||||
# See Jitterbug 4497 for details.
|
||||
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu"
|
||||
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\nfc.nrm"
|
||||
!ENDIF
|
||||
|
||||
|
|
2319
icu4c/source/data/unidata/norm2/nfc.txt
Normal file
2319
icu4c/source/data/unidata/norm2/nfc.txt
Normal file
File diff suppressed because it is too large
Load diff
5786
icu4c/source/data/unidata/norm2/nfkc.txt
Normal file
5786
icu4c/source/data/unidata/norm2/nfkc.txt
Normal file
File diff suppressed because it is too large
Load diff
5376
icu4c/source/data/unidata/norm2/nfkc_cf.txt
Normal file
5376
icu4c/source/data/unidata/norm2/nfkc_cf.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines *
|
||||
* Copyright (C) 1996-2010, International Business Machines *
|
||||
* Corporation and others. All Rights Reserved. *
|
||||
******************************************************************************
|
||||
*/
|
||||
|
@ -27,7 +27,7 @@
|
|||
#include "hash.h"
|
||||
#include "uhash.h"
|
||||
#include "ucol_imp.h"
|
||||
#include "unormimp.h"
|
||||
#include "normalizer2impl.h"
|
||||
|
||||
#include "unicode/colldata.h"
|
||||
#include "unicode/bmsearch.h"
|
||||
|
@ -81,6 +81,7 @@ private:
|
|||
uint32_t variableTop;
|
||||
UBool toShift;
|
||||
UCollator *coll;
|
||||
const Normalizer2 &nfd;
|
||||
|
||||
const UnicodeString *targetString;
|
||||
const UChar *targetBuffer;
|
||||
|
@ -93,6 +94,7 @@ private:
|
|||
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
|
||||
: bufferSize(0), bufferMin(0), bufferMax(0),
|
||||
strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
|
||||
nfd(*Normalizer2Factory::getNFDInstance(status)),
|
||||
targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
|
||||
{
|
||||
strength = ucol_getStrength(coll);
|
||||
|
@ -348,63 +350,14 @@ UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
|
|||
return TRUE;
|
||||
}
|
||||
|
||||
UChar t2[32], p2[32];
|
||||
const UChar *pBuffer = pattern.getBuffer();
|
||||
int32_t pLength = pattern.length();
|
||||
int32_t length = end - start;
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
|
||||
|
||||
int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2),
|
||||
targetBuffer + start, length,
|
||||
FALSE, 0, &status);
|
||||
|
||||
// use separate status2 in case of buffer overflow
|
||||
if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2),
|
||||
pBuffer, pLength,
|
||||
FALSE, 0, &status2)) {
|
||||
return FALSE; // lengths are different
|
||||
}
|
||||
|
||||
// compare contents
|
||||
UChar *text, *pat;
|
||||
|
||||
if(U_SUCCESS(status)) {
|
||||
text = t2;
|
||||
pat = p2;
|
||||
} else if(status == U_BUFFER_OVERFLOW_ERROR) {
|
||||
status = U_ZERO_ERROR;
|
||||
|
||||
// allocate one buffer for both decompositions
|
||||
text = NEW_ARRAY(UChar, decomplength * 2);
|
||||
|
||||
// Check for allocation failure.
|
||||
if (text == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
pat = text + decomplength;
|
||||
|
||||
unorm_decompose(text, decomplength, targetBuffer + start,
|
||||
length, FALSE, 0, &status);
|
||||
|
||||
unorm_decompose(pat, decomplength, pBuffer,
|
||||
pLength, FALSE, 0, &status);
|
||||
} else {
|
||||
// NFD failed, make sure that u_memcmp() does not overrun t2 & p2
|
||||
// and that we don't uprv_free() an undefined text pointer
|
||||
text = pat = t2;
|
||||
decomplength = 0;
|
||||
}
|
||||
|
||||
UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0);
|
||||
|
||||
if(text != t2) {
|
||||
DELETE_ARRAY(text);
|
||||
}
|
||||
|
||||
// Note: We could use Normalizer::compare() or similar, but for short strings
|
||||
// which may not be in FCD it might be faster to just NFD them.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString t2, p2;
|
||||
nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
|
||||
nfd.normalize(pattern, p2, status);
|
||||
// return FALSE if NFD failed
|
||||
return U_SUCCESS(status) && result;
|
||||
return U_SUCCESS(status) && t2 == p2;
|
||||
}
|
||||
|
||||
#define HASH_TABLE_SIZE 257
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
* Copyright (C) 1996-2009, International Business Machines Corporation and *
|
||||
* Copyright (C) 1996-2010, International Business Machines Corporation and *
|
||||
* others. All Rights Reserved. *
|
||||
*******************************************************************************
|
||||
*/
|
||||
|
@ -122,9 +122,9 @@ UBool CollationElementIterator::operator==(
|
|||
}
|
||||
// both are in the normalization buffer
|
||||
if (m_data_->iteratordata_.pos
|
||||
- m_data_->iteratordata_.writableBuffer
|
||||
- m_data_->iteratordata_.writableBuffer.getBuffer()
|
||||
!= that.m_data_->iteratordata_.pos
|
||||
- that.m_data_->iteratordata_.writableBuffer) {
|
||||
- that.m_data_->iteratordata_.writableBuffer.getBuffer()) {
|
||||
// not in the same position in the normalization buffer
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -176,7 +176,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
|
|||
int32_t length = source.length();
|
||||
UChar *string = NULL;
|
||||
if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) {
|
||||
uprv_free(m_data_->iteratordata_.string);
|
||||
uprv_free((UChar *)m_data_->iteratordata_.string);
|
||||
}
|
||||
m_data_->isWritable = TRUE;
|
||||
if (length > 0) {
|
||||
|
@ -200,7 +200,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
|
|||
/* Free offsetBuffer before initializing it. */
|
||||
ucol_freeOffsetBuffer(&(m_data_->iteratordata_));
|
||||
uprv_init_collIterate(m_data_->iteratordata_.coll, string, length,
|
||||
&m_data_->iteratordata_);
|
||||
&m_data_->iteratordata_, &status);
|
||||
|
||||
m_data_->reset_ = TRUE;
|
||||
}
|
||||
|
@ -241,13 +241,13 @@ void CollationElementIterator::setText(CharacterIterator& source,
|
|||
}
|
||||
|
||||
if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) {
|
||||
uprv_free(m_data_->iteratordata_.string);
|
||||
uprv_free((UChar *)m_data_->iteratordata_.string);
|
||||
}
|
||||
m_data_->isWritable = TRUE;
|
||||
/* Free offsetBuffer before initializing it. */
|
||||
ucol_freeOffsetBuffer(&(m_data_->iteratordata_));
|
||||
uprv_init_collIterate(m_data_->iteratordata_.coll, buffer, length,
|
||||
&m_data_->iteratordata_);
|
||||
&m_data_->iteratordata_, &status);
|
||||
m_data_->reset_ = TRUE;
|
||||
}
|
||||
|
||||
|
@ -407,7 +407,7 @@ const CollationElementIterator& CollationElementIterator::operator=(
|
|||
if (length > 0) {
|
||||
coliter->string = (UChar *)uprv_malloc(length * U_SIZEOF_UCHAR);
|
||||
if(coliter->string != NULL) {
|
||||
uprv_memcpy(coliter->string, othercoliter->string,
|
||||
uprv_memcpy((UChar *)coliter->string, othercoliter->string,
|
||||
length * U_SIZEOF_UCHAR);
|
||||
} else { // Error: couldn't allocate memory. No copying should be done
|
||||
length = 0;
|
||||
|
@ -423,27 +423,8 @@ const CollationElementIterator& CollationElementIterator::operator=(
|
|||
/* handle writable buffer here */
|
||||
|
||||
if (othercoliter->flags & UCOL_ITER_INNORMBUF) {
|
||||
uint32_t wlength = u_strlen(othercoliter->writableBuffer) + 1;
|
||||
if (wlength < coliter->writableBufSize) {
|
||||
uprv_memcpy(coliter->stackWritableBuffer,
|
||||
othercoliter->stackWritableBuffer,
|
||||
wlength * U_SIZEOF_UCHAR);
|
||||
}
|
||||
else {
|
||||
if (coliter->writableBuffer != coliter->stackWritableBuffer) {
|
||||
uprv_free(coliter->writableBuffer);
|
||||
}
|
||||
coliter->writableBuffer = (UChar *)uprv_malloc(
|
||||
wlength * U_SIZEOF_UCHAR);
|
||||
if(coliter->writableBuffer != NULL) {
|
||||
uprv_memcpy(coliter->writableBuffer,
|
||||
othercoliter->writableBuffer,
|
||||
wlength * U_SIZEOF_UCHAR);
|
||||
coliter->writableBufSize = wlength;
|
||||
} else { // Error: couldn't allocate memory for writableBuffer
|
||||
coliter->writableBufSize = 0;
|
||||
}
|
||||
}
|
||||
coliter->writableBuffer = othercoliter->writableBuffer;
|
||||
coliter->writableBuffer.getTerminatedBuffer();
|
||||
}
|
||||
|
||||
/* current position */
|
||||
|
@ -453,13 +434,9 @@ const CollationElementIterator& CollationElementIterator::operator=(
|
|||
coliter->pos = coliter->string +
|
||||
(othercoliter->pos - othercoliter->string);
|
||||
}
|
||||
else if (coliter->writableBuffer != NULL) {
|
||||
coliter->pos = coliter->writableBuffer +
|
||||
(othercoliter->pos - othercoliter->writableBuffer);
|
||||
}
|
||||
else {
|
||||
// Error: couldn't allocate memory for writableBuffer
|
||||
coliter->pos = NULL;
|
||||
coliter->pos = coliter->writableBuffer.getTerminatedBuffer() +
|
||||
(othercoliter->pos - othercoliter->writableBuffer.getBuffer());
|
||||
}
|
||||
|
||||
/* CE buffer */
|
||||
|
|
|
@ -895,7 +895,7 @@
|
|||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucol_wgt.c"
|
||||
RelativePath=".\ucol_wgt.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2007, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
|
@ -12,37 +12,43 @@
|
|||
|
||||
#if !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/uiter.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "cstring.h"
|
||||
#include "nortrans.h"
|
||||
#include "unormimp.h"
|
||||
#include "ucln_in.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
|
||||
|
||||
static inline Transliterator::Token cstrToken(const char *s) {
|
||||
return Transliterator::pointerToken((void *)s);
|
||||
}
|
||||
|
||||
/**
|
||||
* System registration hook.
|
||||
*/
|
||||
void NormalizationTransliterator::registerIDs() {
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
if(!unorm_haveData(&errorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// In the Token, the byte after the NUL is the UNormalization2Mode.
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
|
||||
_create, integerToken(UNORM_NFC));
|
||||
_create, cstrToken("nfc\0\0"));
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
|
||||
_create, integerToken(UNORM_NFKC));
|
||||
_create, cstrToken("nfkc\0\0"));
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
|
||||
_create, integerToken(UNORM_NFD));
|
||||
_create, cstrToken("nfc\0\1"));
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
|
||||
_create, integerToken(UNORM_NFKD));
|
||||
_create, cstrToken("nfkc\0\1"));
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
|
||||
_create, cstrToken("nfc\0\2"));
|
||||
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
|
||||
_create, cstrToken("nfc\0\3"));
|
||||
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
|
||||
UNICODE_STRING_SIMPLE("NFD"), TRUE);
|
||||
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
|
||||
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
|
||||
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
|
||||
UNICODE_STRING_SIMPLE("NFD"), FALSE);
|
||||
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
|
||||
UNICODE_STRING_SIMPLE("FCD"), FALSE);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -50,19 +56,23 @@ void NormalizationTransliterator::registerIDs() {
|
|||
*/
|
||||
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
|
||||
Token context) {
|
||||
return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
|
||||
const char *name = (const char *)context.pointer;
|
||||
UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
return new NormalizationTransliterator(ID, *norm2);
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a transliterator.
|
||||
*/
|
||||
NormalizationTransliterator::NormalizationTransliterator(
|
||||
const UnicodeString& id,
|
||||
UNormalizationMode mode, int32_t opt) :
|
||||
Transliterator(id, 0) {
|
||||
fMode = mode;
|
||||
options = opt;
|
||||
}
|
||||
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
|
||||
const Normalizer2 &norm2) :
|
||||
Transliterator(id, 0), fNorm2(norm2) {}
|
||||
|
||||
/**
|
||||
* Destructor.
|
||||
|
@ -74,20 +84,7 @@ NormalizationTransliterator::~NormalizationTransliterator() {
|
|||
* Copy constructor.
|
||||
*/
|
||||
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
|
||||
Transliterator(o) {
|
||||
fMode = o.fMode;
|
||||
options = o.options;
|
||||
}
|
||||
|
||||
/**
|
||||
* Assignment operator.
|
||||
*/
|
||||
/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
|
||||
Transliterator::operator=(o);
|
||||
fMode = o.fMode;
|
||||
options = o.options;
|
||||
return *this;
|
||||
}*/
|
||||
Transliterator(o), fNorm2(o.fNorm2) {}
|
||||
|
||||
/**
|
||||
* Transliterator API.
|
||||
|
@ -104,23 +101,10 @@ void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransP
|
|||
// start and limit of the input range
|
||||
int32_t start = offsets.start;
|
||||
int32_t limit = offsets.limit;
|
||||
int32_t length, delta;
|
||||
|
||||
if(start >= limit) {
|
||||
return;
|
||||
}
|
||||
|
||||
// a C code unit iterator, implemented around the Replaceable
|
||||
UCharIterator iter;
|
||||
uiter_setReplaceable(&iter, &text);
|
||||
|
||||
// the output string and buffer pointer
|
||||
UnicodeString output;
|
||||
UChar *buffer;
|
||||
UBool neededToNormalize;
|
||||
|
||||
UErrorCode errorCode;
|
||||
|
||||
/*
|
||||
* Normalize as short chunks at a time as possible even in
|
||||
* bulk mode, so that styled text is minimally disrupted.
|
||||
|
@ -129,101 +113,62 @@ void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransP
|
|||
*
|
||||
* If it was known that the input text is not styled, then
|
||||
* a bulk mode normalization could look like this:
|
||||
*
|
||||
|
||||
UChar staticChars[256];
|
||||
UnicodeString input;
|
||||
|
||||
length = limit - start;
|
||||
input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
|
||||
|
||||
UnicodeString input, normalized;
|
||||
int32_t length = limit - start;
|
||||
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
|
||||
input.releaseBuffer(length);
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
Normalizer::normalize(input, fMode, options, output, status);
|
||||
fNorm2.normalize(input, normalized, status);
|
||||
|
||||
text.handleReplaceBetween(start, limit, output);
|
||||
text.handleReplaceBetween(start, limit, normalized);
|
||||
|
||||
int32_t delta = output.length() - length;
|
||||
int32_t delta = normalized.length() - length;
|
||||
offsets.contextLimit += delta;
|
||||
offsets.limit += delta;
|
||||
offsets.start = limit + delta;
|
||||
|
||||
*
|
||||
*/
|
||||
while(start < limit) {
|
||||
// set the iterator limits for the remaining input range
|
||||
// this is a moving target because of the replacements in the text object
|
||||
iter.start = iter.index = start;
|
||||
iter.limit = limit;
|
||||
|
||||
// incrementally normalize a small chunk of the input
|
||||
buffer = output.getBuffer(-1);
|
||||
errorCode = U_ZERO_ERROR;
|
||||
length = unorm_next(&iter, buffer, output.getCapacity(),
|
||||
fMode, 0,
|
||||
TRUE, &neededToNormalize,
|
||||
&errorCode);
|
||||
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
|
||||
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
|
||||
// use a larger output string buffer and do it again from the start
|
||||
iter.index = start;
|
||||
buffer = output.getBuffer(length);
|
||||
errorCode = U_ZERO_ERROR;
|
||||
length = unorm_next(&iter, buffer, output.getCapacity(),
|
||||
fMode, 0,
|
||||
TRUE, &neededToNormalize,
|
||||
&errorCode);
|
||||
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
UnicodeString segment;
|
||||
UnicodeString normalized;
|
||||
UChar32 c = text.char32At(start);
|
||||
do {
|
||||
int32_t prev = start;
|
||||
// Skip at least one character so we make progress.
|
||||
// c holds the character at start.
|
||||
segment.setTo(c);
|
||||
start += U16_LENGTH(c);
|
||||
while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))) {
|
||||
segment.append(c);
|
||||
start += U16_LENGTH(c);
|
||||
}
|
||||
|
||||
if(U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
|
||||
limit = iter.index;
|
||||
if(isIncremental && limit == iter.limit) {
|
||||
if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
|
||||
// stop in incremental mode when we reach the input limit
|
||||
// in case there are additional characters that could change the
|
||||
// normalization result
|
||||
|
||||
// UNLESS all characters in the result of the normalization of
|
||||
// the last run are in the skippable set
|
||||
const UChar *s=output.getBuffer();
|
||||
int32_t i=0, outLength=output.length();
|
||||
UChar32 c;
|
||||
|
||||
while(i<outLength) {
|
||||
U16_NEXT(s, i, outLength, c);
|
||||
if(!unorm_isNFSkippable(c, fMode)) {
|
||||
outLength=-1; // I wish C++ had labeled loops and break outer; ...
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (outLength<0) {
|
||||
break;
|
||||
}
|
||||
start=prev;
|
||||
break;
|
||||
}
|
||||
|
||||
if(neededToNormalize) {
|
||||
fNorm2.normalize(segment, normalized, errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
if(segment != normalized) {
|
||||
// replace the input chunk with its normalized form
|
||||
text.handleReplaceBetween(start, limit, output);
|
||||
text.handleReplaceBetween(prev, start, normalized);
|
||||
|
||||
// update all necessary indexes accordingly
|
||||
delta = length - (limit - start); // length change in the text object
|
||||
start = limit += delta; // the next chunk starts where this one ends, with adjustment
|
||||
limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
|
||||
offsets.contextLimit += delta;
|
||||
} else {
|
||||
// delta == 0
|
||||
start = limit;
|
||||
limit = offsets.limit;
|
||||
int32_t delta = normalized.length() - (start - prev);
|
||||
start += delta;
|
||||
limit += delta;
|
||||
}
|
||||
}
|
||||
} while(start < limit);
|
||||
|
||||
offsets.start = start;
|
||||
offsets.contextLimit += limit - offsets.limit;
|
||||
offsets.limit = limit;
|
||||
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2007, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
|
@ -15,7 +15,7 @@
|
|||
#if !UCONFIG_NO_TRANSLITERATION
|
||||
|
||||
#include "unicode/translit.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -24,16 +24,7 @@ U_NAMESPACE_BEGIN
|
|||
* @author Alan Liu
|
||||
*/
|
||||
class NormalizationTransliterator : public Transliterator {
|
||||
|
||||
/**
|
||||
* The normalization mode of this transliterator.
|
||||
*/
|
||||
UNormalizationMode fMode;
|
||||
|
||||
/**
|
||||
* Normalization options for this transliterator.
|
||||
*/
|
||||
int32_t options;
|
||||
const Normalizer2 &fNorm2;
|
||||
|
||||
public:
|
||||
|
||||
|
@ -93,8 +84,7 @@ class NormalizationTransliterator : public Transliterator {
|
|||
* Constructs a transliterator. This method is private.
|
||||
* Public users must use the factory method createInstance().
|
||||
*/
|
||||
NormalizationTransliterator(const UnicodeString& id,
|
||||
UNormalizationMode mode, int32_t opt);
|
||||
NormalizationTransliterator(const UnicodeString& id, const Normalizer2 &norm2);
|
||||
|
||||
private:
|
||||
/**
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2008, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -25,12 +25,12 @@
|
|||
#include "unicode/udata.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ucol_bld.h"
|
||||
#include "ucol_elm.h"
|
||||
#include "ucol_cnt.h"
|
||||
#include "ucln_in.h"
|
||||
#include "umutex.h"
|
||||
#include "unormimp.h"
|
||||
#include "cmemory.h"
|
||||
|
||||
static const InverseUCATableHeader* _staticInvUCA = NULL;
|
||||
|
@ -626,7 +626,7 @@ uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l
|
|||
nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
|
||||
if(U_SUCCESS(*status)) {
|
||||
for(i = 0; i < nLen; i++) {
|
||||
uprv_init_collIterate(UCA, &n[i], 1, &s);
|
||||
uprv_init_collIterate(UCA, &n[i], 1, &s, status);
|
||||
order = ucol_getNextCE(UCA, &s, status);
|
||||
if(isContinuation(order)) {
|
||||
*status = U_INTERNAL_PROGRAM_ERROR;
|
||||
|
@ -878,7 +878,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
|
|||
/* then pick CEs out until there is no more and stuff them into expansion */
|
||||
collIterate s;
|
||||
uint32_t order = 0;
|
||||
uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
|
||||
uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
|
||||
|
||||
for(;;) {
|
||||
order = ucol_getNextCE(src->UCA, &s, status);
|
||||
|
@ -1045,7 +1045,7 @@ ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
|
|||
// it doesn't make any difference whether we have to go to the UCA
|
||||
// or not.
|
||||
{
|
||||
uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
|
||||
uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
|
||||
while(CE != UCOL_NO_MORE_CES) {
|
||||
CE = ucol_getNextCE(src->UCA, &colIt, status);
|
||||
if(CE != UCOL_NO_MORE_CES) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -31,6 +31,7 @@
|
|||
#include "unicode/unistr.h"
|
||||
#include "unicode/ucoleitr.h"
|
||||
#include "unicode/normlzr.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ucol_elm.h"
|
||||
#include "ucol_tok.h"
|
||||
#include "ucol_cnt.h"
|
||||
|
@ -1602,6 +1603,7 @@ struct enumStruct {
|
|||
tempUCATable *t;
|
||||
UCollator *tempColl;
|
||||
UCollationElements* colEl;
|
||||
const Normalizer2Impl *nfcImpl;
|
||||
int32_t noOfClosures;
|
||||
UErrorCode *status;
|
||||
};
|
||||
|
@ -1615,7 +1617,8 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
|
|||
UCollator *tempColl = ((enumStruct *)context)->tempColl;
|
||||
UCollationElements* colEl = ((enumStruct *)context)->colEl;
|
||||
UCAElements el;
|
||||
UChar decomp[256] = { 0 };
|
||||
UChar decompBuffer[4];
|
||||
const UChar *decomp;
|
||||
int32_t noOfDec = 0;
|
||||
|
||||
UChar32 u32 = 0;
|
||||
|
@ -1623,13 +1626,14 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
|
|||
uint32_t len = 0;
|
||||
|
||||
for(u32 = start; u32 < limit; u32++) {
|
||||
noOfDec = unorm_getDecomposition(u32, FALSE, decomp, 256);
|
||||
decomp = ((enumStruct *)context)->nfcImpl->
|
||||
getDecomposition(u32, decompBuffer, noOfDec);
|
||||
//if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
|
||||
//|| (noOfDec == 1 && *decomp != (UChar)u32))
|
||||
if(noOfDec > 0) // if we're positive, that means there is no decomposition
|
||||
if(decomp != NULL)
|
||||
{
|
||||
len = 0;
|
||||
UTF_APPEND_CHAR_UNSAFE(comp, len, u32);
|
||||
U16_APPEND_UNSAFE(comp, len, u32);
|
||||
if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
|
||||
#ifdef UCOL_DEBUG
|
||||
fprintf(stderr, "Closure: %08X -> ", u32);
|
||||
|
@ -1640,7 +1644,7 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
|
|||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
((enumStruct *)context)->noOfClosures++;
|
||||
el.cPoints = decomp;
|
||||
el.cPoints = (UChar *)decomp;
|
||||
el.cSize = noOfDec;
|
||||
el.noOfCEs = 0;
|
||||
el.prefix = el.prefixChars;
|
||||
|
@ -1938,7 +1942,7 @@ uprv_uca_canonicalClosure(tempUCATable *t,
|
|||
UChar baseChar, firstCM;
|
||||
UChar32 fcdHighStart;
|
||||
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
|
||||
|
||||
context.nfcImpl=Normalizer2Factory::getNFCImpl(*status);
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1998-2009, International Business Machines
|
||||
* Copyright (C) 1998-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -41,6 +41,10 @@
|
|||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/unistr.h"
|
||||
#endif
|
||||
#include "unicode/ucol.h"
|
||||
#include "utrie.h"
|
||||
#include "cmemory.h"
|
||||
|
@ -264,12 +268,14 @@ minimum number for special Jamo
|
|||
|
||||
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
|
||||
|
||||
typedef struct collIterate {
|
||||
UChar *string; /* Original string */
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
typedef struct collIterate : public UMemory {
|
||||
const UChar *string; /* Original string */
|
||||
/* UChar *start; Pointer to the start of the source string. Either points to string
|
||||
or to writableBuffer */
|
||||
UChar *endp; /* string end ptr. Is undefined for null terminated strings */
|
||||
UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
const UChar *endp; /* string end ptr. Is undefined for null terminated strings */
|
||||
const UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
|
||||
uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
|
||||
uint32_t *CEpos; /* This is the position to which we have stored processed CEs */
|
||||
|
@ -279,16 +285,15 @@ typedef struct collIterate {
|
|||
int32_t offsetRepeatCount; /* Repeat stored offset if non-zero */
|
||||
int32_t offsetRepeatValue; /* offset value to repeat */
|
||||
|
||||
UChar *writableBuffer;
|
||||
uint32_t writableBufSize;
|
||||
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
UnicodeString writableBuffer;
|
||||
const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
const UCollator *coll;
|
||||
const Normalizer2 *nfd;
|
||||
uint8_t flags;
|
||||
uint8_t origFlags;
|
||||
uint32_t *extendCEs; /* This is use if CEs is not big enough */
|
||||
int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
|
||||
uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
|
||||
UChar stackWritableBuffer[UCOL_WRITABLE_BUFFER_SIZE]; /* A writable buffer. */
|
||||
|
||||
int32_t *offsetBuffer; /* A dynamic buffer to hold offsets */
|
||||
int32_t offsetBufferSize; /* The size of the offset buffer */
|
||||
|
@ -297,6 +302,12 @@ typedef struct collIterate {
|
|||
/*int32_t iteratorIndex;*/
|
||||
} collIterate;
|
||||
|
||||
#else
|
||||
|
||||
typedef struct collIterate collIterate;
|
||||
|
||||
#endif
|
||||
|
||||
#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0))
|
||||
#define headersize (paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)))
|
||||
|
||||
|
@ -305,19 +316,34 @@ struct used internally in getSpecial*CE.
|
|||
data similar to collIterate.
|
||||
*/
|
||||
struct collIterateState {
|
||||
UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
UChar *returnPos;
|
||||
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
UChar *bufferaddress; /* address of the normalization buffer */
|
||||
uint32_t buffersize;
|
||||
const UChar *pos; /* This is position in the string. Can be to original or writable buf */
|
||||
const UChar *returnPos;
|
||||
const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
|
||||
const UChar *bufferaddress; /* address of the normalization buffer */
|
||||
int32_t buffersize;
|
||||
uint8_t flags;
|
||||
uint8_t origFlags;
|
||||
uint32_t iteratorIndex;
|
||||
int32_t iteratorMove;
|
||||
};
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, int32_t sourceLen, collIterate *s);
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_init_collIterate(const UCollator *collator,
|
||||
const UChar *sourceString, int32_t sourceLen,
|
||||
collIterate *s, UErrorCode *status);
|
||||
|
||||
/* Internal functions for C test code. */
|
||||
U_CAPI collIterate * U_EXPORT2
|
||||
uprv_new_collIterate(UErrorCode *status);
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
uprv_delete_collIterate(collIterate *s);
|
||||
|
||||
/* @return s->pos == s->endp */
|
||||
U_CAPI UBool U_EXPORT2
|
||||
uprv_collIterateAtEnd(collIterate *s);
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -326,7 +352,7 @@ typedef struct UCollationPCE UCollationPCE;
|
|||
|
||||
U_NAMESPACE_END
|
||||
|
||||
struct UCollationElements
|
||||
struct UCollationElements : public UMemory
|
||||
{
|
||||
/**
|
||||
* Struct wrapper for source data
|
||||
|
@ -351,6 +377,8 @@ struct UCollationElements
|
|||
U_CAPI void U_EXPORT2
|
||||
uprv_init_pce(const struct UCollationElements *elems);
|
||||
|
||||
#endif
|
||||
|
||||
#define UCOL_LEVELTERMINATOR 1
|
||||
|
||||
/* mask off anything but primary order */
|
||||
|
@ -1066,7 +1094,6 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
|
|||
/* The offsetBuffer in collIterate might need to be freed to avoid memory leaks. */
|
||||
void ucol_freeOffsetBuffer(collIterate *s);
|
||||
|
||||
|
||||
#endif /* #if !UCONFIG_NO_COLLATION */
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -1108,7 +1108,7 @@ reset may be null.
|
|||
handled.
|
||||
*/
|
||||
|
||||
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
|
||||
static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
|
||||
UParseError *parseError, UErrorCode *status)
|
||||
{
|
||||
if(src->resultLen == src->listCapacity) {
|
||||
|
@ -1200,9 +1200,12 @@ inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken,
|
|||
uint32_t CE, SecondCE;
|
||||
uint32_t invPos;
|
||||
if(sourceToken != NULL) {
|
||||
uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
|
||||
uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
|
||||
} else {
|
||||
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
|
||||
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
|
||||
}
|
||||
if(U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
|
||||
|
@ -1684,10 +1687,10 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
|
|||
collIterate s;
|
||||
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
|
||||
|
||||
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
|
||||
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
|
||||
|
||||
CE = ucol_getNextCE(src->UCA, &s, status);
|
||||
UChar *expand = s.pos;
|
||||
const UChar *expand = s.pos;
|
||||
SecondCE = ucol_getNextCE(src->UCA, &s, status);
|
||||
|
||||
ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
******************************************************************************
|
||||
* Copyright (C) 2001-2009, International Business Machines
|
||||
* Copyright (C) 2001-20109, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
******************************************************************************
|
||||
*
|
||||
|
@ -313,19 +313,16 @@ ucol_openElements(const UCollator *coll,
|
|||
int32_t textLength,
|
||||
UErrorCode *status)
|
||||
{
|
||||
UCollationElements *result;
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
|
||||
/* test for NULL */
|
||||
UCollationElements *result = new UCollationElements;
|
||||
if (result == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
result->reset_ = TRUE;
|
||||
result->isWritable = FALSE;
|
||||
result->pce = NULL;
|
||||
|
@ -333,7 +330,7 @@ ucol_openElements(const UCollator *coll,
|
|||
if (text == NULL) {
|
||||
textLength = 0;
|
||||
}
|
||||
uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
|
||||
uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
@ -345,30 +342,24 @@ ucol_closeElements(UCollationElements *elems)
|
|||
if (elems != NULL) {
|
||||
collIterate *ci = &elems->iteratordata_;
|
||||
|
||||
if (ci != NULL) {
|
||||
if (ci->writableBuffer != ci->stackWritableBuffer) {
|
||||
uprv_free(ci->writableBuffer);
|
||||
}
|
||||
if (ci->extendCEs) {
|
||||
uprv_free(ci->extendCEs);
|
||||
}
|
||||
|
||||
if (ci->extendCEs) {
|
||||
uprv_free(ci->extendCEs);
|
||||
}
|
||||
|
||||
if (ci->offsetBuffer) {
|
||||
uprv_free(ci->offsetBuffer);
|
||||
}
|
||||
if (ci->offsetBuffer) {
|
||||
uprv_free(ci->offsetBuffer);
|
||||
}
|
||||
|
||||
if (elems->isWritable && elems->iteratordata_.string != NULL)
|
||||
{
|
||||
uprv_free(elems->iteratordata_.string);
|
||||
uprv_free((UChar *)elems->iteratordata_.string);
|
||||
}
|
||||
|
||||
if (elems->pce != NULL) {
|
||||
delete elems->pce;
|
||||
}
|
||||
|
||||
uprv_free(elems);
|
||||
delete elems;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -387,11 +378,7 @@ ucol_reset(UCollationElements *elems)
|
|||
ci->flags |= UCOL_ITER_NORM;
|
||||
}
|
||||
|
||||
if (ci->stackWritableBuffer != ci->writableBuffer) {
|
||||
uprv_free(ci->writableBuffer);
|
||||
ci->writableBuffer = ci->stackWritableBuffer;
|
||||
ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
|
||||
}
|
||||
ci->writableBuffer.remove();
|
||||
ci->fcdPosition = NULL;
|
||||
|
||||
//ci->offsetReturn = ci->offsetStore = NULL;
|
||||
|
@ -686,7 +673,7 @@ ucol_setText( UCollationElements *elems,
|
|||
|
||||
if (elems->isWritable && elems->iteratordata_.string != NULL)
|
||||
{
|
||||
uprv_free(elems->iteratordata_.string);
|
||||
uprv_free((UChar *)elems->iteratordata_.string);
|
||||
}
|
||||
|
||||
if (text == NULL) {
|
||||
|
@ -698,7 +685,7 @@ ucol_setText( UCollationElements *elems,
|
|||
/* free offset buffer to avoid memory leak before initializing. */
|
||||
ucol_freeOffsetBuffer(&(elems->iteratordata_));
|
||||
uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
|
||||
&elems->iteratordata_);
|
||||
&elems->iteratordata_, status);
|
||||
|
||||
elems->reset_ = TRUE;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2009 IBM and others. All rights reserved.
|
||||
* Copyright (C) 2001-2010 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 07/02/2001 synwee Creation.
|
||||
|
@ -14,12 +14,14 @@
|
|||
#include "unicode/usearch.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "unormimp.h"
|
||||
#include "ucol_imp.h"
|
||||
#include "usrchimp.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucln_in.h"
|
||||
#include "uassert.h"
|
||||
#include "ustr_imp.h"
|
||||
|
||||
U_NAMESPACE_USE
|
||||
|
||||
|
@ -311,7 +313,11 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch,
|
|||
else {
|
||||
uprv_init_collIterate(strsrch->collator, pattern->text,
|
||||
pattern->textLength,
|
||||
&coleiter->iteratordata_);
|
||||
&coleiter->iteratordata_,
|
||||
status);
|
||||
}
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (pattern->CE != cetable && pattern->CE) {
|
||||
|
@ -381,7 +387,11 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
|
|||
} else {
|
||||
uprv_init_collIterate(strsrch->collator, pattern->text,
|
||||
pattern->textLength,
|
||||
&coleiter->iteratordata_);
|
||||
&coleiter->iteratordata_,
|
||||
status);
|
||||
}
|
||||
if(U_FAILURE(*status)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (pattern->PCE != pcetable && pattern->PCE != NULL) {
|
||||
|
@ -1074,54 +1084,20 @@ static
|
|||
inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
|
||||
int32_t end)
|
||||
{
|
||||
UChar t2[32], p2[32];
|
||||
int32_t length = end - start;
|
||||
if (strsrch->strength != UCOL_IDENTICAL) {
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
|
||||
int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
|
||||
strsrch->search->text + start, length,
|
||||
FALSE, 0, &status);
|
||||
// use separate status2 in case of buffer overflow
|
||||
if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
|
||||
strsrch->pattern.text,
|
||||
strsrch->pattern.textLength,
|
||||
FALSE, 0, &status2)) {
|
||||
return FALSE; // lengths are different
|
||||
}
|
||||
|
||||
// compare contents
|
||||
UChar *text, *pattern;
|
||||
if(U_SUCCESS(status)) {
|
||||
text = t2;
|
||||
pattern = p2;
|
||||
} else if(status==U_BUFFER_OVERFLOW_ERROR) {
|
||||
status = U_ZERO_ERROR;
|
||||
// allocate one buffer for both decompositions
|
||||
text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
|
||||
// Check for allocation failure.
|
||||
if (text == NULL) {
|
||||
return FALSE;
|
||||
}
|
||||
pattern = text + decomplength;
|
||||
unorm_decompose(text, decomplength, strsrch->search->text + start,
|
||||
length, FALSE, 0, &status);
|
||||
unorm_decompose(pattern, decomplength, strsrch->pattern.text,
|
||||
strsrch->pattern.textLength, FALSE, 0, &status);
|
||||
} else {
|
||||
// NFD failed, make sure that u_memcmp() does not overrun t2 & p2
|
||||
// and that we don't uprv_free() an undefined text pointer
|
||||
text = pattern = t2;
|
||||
decomplength = 0;
|
||||
}
|
||||
UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
|
||||
if(text != t2) {
|
||||
uprv_free(text);
|
||||
}
|
||||
// Note: We could use Normalizer::compare() or similar, but for short strings
|
||||
// which may not be in FCD it might be faster to just NFD them.
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
UnicodeString t2, p2;
|
||||
strsrch->nfd->normalize(
|
||||
UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
|
||||
strsrch->nfd->normalize(
|
||||
UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
|
||||
// return FALSE if NFD failed
|
||||
return U_SUCCESS(status) && result;
|
||||
return U_SUCCESS(status) && t2 == p2;
|
||||
}
|
||||
|
||||
#if BOYER_MOORE
|
||||
|
@ -2724,6 +2700,8 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
|
|||
UCOL_SHIFTED;
|
||||
result->variableTop = ucol_getVariableTop(collator, status);
|
||||
|
||||
result->nfd = Normalizer2Factory::getNFDInstance(*status);
|
||||
|
||||
if (U_FAILURE(*status)) {
|
||||
uprv_free(result);
|
||||
return NULL;
|
||||
|
@ -3040,7 +3018,8 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
|
|||
ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
|
||||
uprv_init_collIterate(collator, strsrch->search->text,
|
||||
strsrch->search->textLength,
|
||||
&(strsrch->textIter->iteratordata_));
|
||||
&(strsrch->textIter->iteratordata_),
|
||||
status);
|
||||
strsrch->utilIter->iteratordata_.coll = collator;
|
||||
}
|
||||
}
|
||||
|
@ -3432,7 +3411,8 @@ U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
|
|||
ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
|
||||
uprv_init_collIterate(strsrch->collator, strsrch->search->text,
|
||||
strsrch->search->textLength,
|
||||
&(strsrch->textIter->iteratordata_));
|
||||
&(strsrch->textIter->iteratordata_),
|
||||
&status);
|
||||
strsrch->search->matchedLength = 0;
|
||||
strsrch->search->matchedIndex = USEARCH_DONE;
|
||||
strsrch->search->isOverlap = FALSE;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
**********************************************************************
|
||||
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
|
||||
* Copyright (C) 2001-2010 IBM and others. All rights reserved.
|
||||
**********************************************************************
|
||||
* Date Name Description
|
||||
* 08/13/2001 synwee Creation.
|
||||
|
@ -13,6 +13,7 @@
|
|||
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/ucol.h"
|
||||
#include "unicode/ucoleitr.h"
|
||||
#include "unicode/ubrk.h"
|
||||
|
@ -59,6 +60,7 @@ struct UStringSearch {
|
|||
struct USearch *search;
|
||||
struct UPattern pattern;
|
||||
const UCollator *collator;
|
||||
const Normalizer2 *nfd;
|
||||
// positions within the collation element iterator are used to determine
|
||||
// if we are at the start of the text.
|
||||
UCollationElements *textIter;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*******************************************************************************
|
||||
|
@ -52,7 +52,6 @@
|
|||
#include "calldata.h"
|
||||
#include "cstring.h"
|
||||
#include "cmemory.h"
|
||||
#include "ucol_imp.h"
|
||||
|
||||
/* set to 1 to test offsets in backAndForth() */
|
||||
#define TEST_OFFSETS 0
|
||||
|
@ -148,13 +147,14 @@ static char* U_EXPORT2 sortKeyToString(const UCollator *coll, const uint8_t *sor
|
|||
int32_t strength = UCOL_PRIMARY;
|
||||
uint32_t res_size = 0;
|
||||
UBool doneCase = FALSE;
|
||||
UErrorCode errorCode = U_ZERO_ERROR;
|
||||
|
||||
char *current = buffer;
|
||||
const uint8_t *currentSk = sortkey;
|
||||
|
||||
uprv_strcpy(current, "[");
|
||||
|
||||
while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
|
||||
while(strength <= UCOL_QUATERNARY && strength <= ucol_getStrength(coll)) {
|
||||
if(strength > UCOL_PRIMARY) {
|
||||
uprv_strcat(current, " . ");
|
||||
}
|
||||
|
@ -162,20 +162,20 @@ static char* U_EXPORT2 sortKeyToString(const UCollator *coll, const uint8_t *sor
|
|||
uprv_appendByteToHexString(current, *currentSk++);
|
||||
uprv_strcat(current, " ");
|
||||
}
|
||||
if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
|
||||
if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
|
||||
doneCase = TRUE;
|
||||
} else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
|
||||
} else if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
|
||||
strength ++;
|
||||
}
|
||||
if (*currentSk) {
|
||||
uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
|
||||
}
|
||||
if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
|
||||
if(strength == UCOL_QUATERNARY && ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &errorCode) == UCOL_NON_IGNORABLE) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(coll->strength == UCOL_IDENTICAL) {
|
||||
if(ucol_getStrength(coll) == UCOL_IDENTICAL) {
|
||||
uprv_strcat(current, " . ");
|
||||
while(*currentSk != 0) {
|
||||
uprv_appendByteToHexString(current, *currentSk++);
|
||||
|
@ -214,7 +214,7 @@ UBool hasCollationElements(const char *locName) {
|
|||
|
||||
UErrorCode status = U_ZERO_ERROR;
|
||||
|
||||
UResourceBundle *loc = ures_open(U_ICUDATA_COLL, locName, &status);;
|
||||
UResourceBundle *loc = ures_open(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll", locName, &status);;
|
||||
|
||||
if(U_SUCCESS(status)) {
|
||||
status = U_ZERO_ERROR;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -994,11 +994,6 @@ static void TestSmallBuffer()
|
|||
free(orders);
|
||||
|
||||
ucol_reset(testiter);
|
||||
/* ensures that the writable buffer was cleared */
|
||||
if (testiter->iteratordata_.writableBuffer !=
|
||||
testiter->iteratordata_.stackWritableBuffer) {
|
||||
log_err("Error Writable buffer in collation element iterator not reset\n");
|
||||
}
|
||||
|
||||
/* ensures closing of elements done properly to clear writable buffer */
|
||||
ucol_next(testiter, &status);
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 2001-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 2001-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*******************************************************************************
|
||||
|
@ -1093,7 +1093,7 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
|
|||
UColOptionSet opts;
|
||||
UParseError parseError;
|
||||
UChar *rulesCopy = NULL;
|
||||
collIterate c;
|
||||
collIterate *c = uprv_new_collIterate(status);
|
||||
UCAConstants *consts = NULL;
|
||||
uint32_t UCOL_RESET_TOP_VALUE, /*UCOL_RESET_TOP_CONT, */
|
||||
UCOL_NEXT_TOP_VALUE, UCOL_NEXT_TOP_CONT;
|
||||
|
@ -1102,12 +1102,15 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
|
|||
|
||||
if (U_FAILURE(*status)) {
|
||||
log_err("Could not open root collator %s\n", u_errorName(*status));
|
||||
uprv_delete_collIterate(c);
|
||||
return;
|
||||
}
|
||||
|
||||
colLoc = ucol_getLocaleByType(coll, ULOC_ACTUAL_LOCALE, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
log_err("Could not get collator name: %s\n", u_errorName(*status));
|
||||
ucol_close(UCA);
|
||||
uprv_delete_collIterate(c);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1183,15 +1186,15 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
|
|||
varT = (UBool)((specs & UCOL_TOK_VARIABLE_TOP) != 0);
|
||||
top_ = (UBool)((specs & UCOL_TOK_TOP) != 0);
|
||||
|
||||
uprv_init_collIterate(coll, rulesCopy+chOffset, chLen, &c);
|
||||
uprv_init_collIterate(coll, rulesCopy+chOffset, chLen, c, status);
|
||||
|
||||
currCE = ucol_getNextCE(coll, &c, status);
|
||||
currCE = ucol_getNextCE(coll, c, status);
|
||||
if(currCE == 0 && UCOL_ISTHAIPREVOWEL(*(rulesCopy+chOffset))) {
|
||||
log_verbose("Thai prevowel detected. Will pick next CE\n");
|
||||
currCE = ucol_getNextCE(coll, &c, status);
|
||||
currCE = ucol_getNextCE(coll, c, status);
|
||||
}
|
||||
|
||||
currContCE = ucol_getNextCE(coll, &c, status);
|
||||
currContCE = ucol_getNextCE(coll, c, status);
|
||||
if(!isContinuation(currContCE)) {
|
||||
currContCE = 0;
|
||||
}
|
||||
|
@ -1272,6 +1275,7 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
|
|||
free(rulesCopy);
|
||||
}
|
||||
ucol_close(UCA);
|
||||
uprv_delete_collIterate(c);
|
||||
}
|
||||
|
||||
#if 0
|
||||
|
@ -2992,10 +2996,11 @@ static void TestVariableTopSetting(void) {
|
|||
uint32_t CE = UCOL_NO_MORE_CES;
|
||||
|
||||
/* before we start screaming, let's see if there is a problem with the rules */
|
||||
collIterate s;
|
||||
uprv_init_collIterate(coll, rulesCopy+oldChOffset, oldChLen, &s);
|
||||
UErrorCode collIterateStatus = U_ZERO_ERROR;
|
||||
collIterate *s = uprv_new_collIterate(&collIterateStatus);
|
||||
uprv_init_collIterate(coll, rulesCopy+oldChOffset, oldChLen, s, &collIterateStatus);
|
||||
|
||||
CE = ucol_getNextCE(coll, &s, &status);
|
||||
CE = ucol_getNextCE(coll, s, &status);
|
||||
|
||||
for(i = 0; i < oldChLen; i++) {
|
||||
j = sprintf(buf, "%04X ", *(rulesCopy+oldChOffset+i));
|
||||
|
@ -3004,7 +3009,7 @@ static void TestVariableTopSetting(void) {
|
|||
if(status == U_PRIMARY_TOO_LONG_ERROR) {
|
||||
log_verbose("= Expected failure for %s =", buffer);
|
||||
} else {
|
||||
if(s.pos == s.endp) {
|
||||
if(uprv_collIterateAtEnd(s)) {
|
||||
log_err("Unexpected failure setting variable top at offset %d. Error %s. Codepoints: %s\n",
|
||||
oldChOffset, u_errorName(status), buffer);
|
||||
} else {
|
||||
|
@ -3012,6 +3017,7 @@ static void TestVariableTopSetting(void) {
|
|||
buffer);
|
||||
}
|
||||
}
|
||||
uprv_delete_collIterate(s);
|
||||
}
|
||||
varTop2 = ucol_getVariableTop(coll, &status);
|
||||
if((varTop1 & 0xFFFF0000) != (varTop2 & 0xFFFF0000)) {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/********************************************************************************
|
||||
|
@ -1334,17 +1334,6 @@ TestNextPrevious() {
|
|||
log_err("error unorm_next(U_MISPLACED_QUANTIFIER) %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
/* missing pErrorCode */
|
||||
buffer[0]=5;
|
||||
iter.index=1;
|
||||
length=unorm_next(&iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR,
|
||||
UNORM_NFD, 0, TRUE, NULL,
|
||||
NULL);
|
||||
if(iter.index!=1 || buffer[0]!=5) {
|
||||
log_err("error unorm_next(pErrorCode==NULL) %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
/*******************************************************************************
|
||||
|
@ -22,6 +22,7 @@
|
|||
#include "unicode/putil.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/uloc.h"
|
||||
#include "unicode/unorm2.h"
|
||||
|
||||
#include "cintltst.h"
|
||||
#include "putilimp.h"
|
||||
|
@ -2942,6 +2943,7 @@ TestConsistency() {
|
|||
UErrorCode errorCode;
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
const UNormalizer2 *norm2;
|
||||
USerializedSet sset;
|
||||
#endif
|
||||
UChar32 start, end;
|
||||
|
@ -3070,15 +3072,26 @@ TestConsistency() {
|
|||
* In general, the set for the middle such character should be a subset
|
||||
* of the set for the first.
|
||||
*/
|
||||
errorCode=U_ZERO_ERROR;
|
||||
norm2=unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
log_data_err("unorm2_getInstance(NFD) failed - %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
set1=uset_open(1, 0);
|
||||
set2=uset_open(1, 0);
|
||||
|
||||
if (unorm_getCanonStartSet(0x49, &sset)) {
|
||||
UChar source[1];
|
||||
|
||||
_setAddSerialized(set1, &sset);
|
||||
|
||||
/* enumerate all characters that are plausible to be latin letters */
|
||||
for(start=0xa0; start<0x2000; ++start) {
|
||||
if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) {
|
||||
source[0]=(UChar)start;
|
||||
length=unorm2_normalize(norm2, source, 1, buffer16, LENGTHOF(buffer16), &errorCode);
|
||||
if(length>1 && buffer16[0]==0x49) {
|
||||
uset_add(set2, start);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2001-2009 International Business Machines
|
||||
# Copyright (c) 2001-2010 International Business Machines
|
||||
# Corporation and others. All Rights Reserved.
|
||||
# common & i18n
|
||||
bidi.h
|
||||
|
@ -38,6 +38,7 @@ measfmt.h
|
|||
measunit.h
|
||||
measure.h
|
||||
msgfmt.h
|
||||
normalizer2.h
|
||||
normlzr.h
|
||||
numfmt.h
|
||||
numsys.h
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -776,38 +776,10 @@ void BasicNormalizerTest::TestConcatenate() {
|
|||
},
|
||||
/* ### TODO: add more interesting cases */
|
||||
{
|
||||
"D",
|
||||
"\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
|
||||
"\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
|
||||
"\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
|
||||
"\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
|
||||
"\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
|
||||
"\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
|
||||
"\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB",
|
||||
|
||||
"\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000"
|
||||
"\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10"
|
||||
"\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F"
|
||||
"\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31"
|
||||
"\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A"
|
||||
"\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
|
||||
"\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E",
|
||||
|
||||
"\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
|
||||
"\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
|
||||
"\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
|
||||
"\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
|
||||
"\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
|
||||
"\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
|
||||
"\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u0399"
|
||||
"\\u0301\\u03C5\\u0308\\u0301\\u1FEB\\u1FEE\\u1FEF\\u1FF9"
|
||||
"\\u1FFB\\u1FFD\\u2000\\u2001\\u2126\\u212A\\u212B\\u2329"
|
||||
"\\u232A\\uF900\\uFA10\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25"
|
||||
"\\uFA26\\uFA2A\\uFB1F\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E"
|
||||
"\\uFB2F\\uFB30\\uFB31\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36"
|
||||
"\\uFB38\\uFB39\\uFB3A\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41"
|
||||
"\\uFB43\\uFB44\\uFB46\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B"
|
||||
"\\uFB4C\\uFB4D\\uFB4E"
|
||||
"D",
|
||||
"\\u03B1\\u0345",
|
||||
"\\u0C4D\\U000110BA\\U0001D169",
|
||||
"\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345"
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -1743,72 +1715,23 @@ U_CDECL_END
|
|||
|
||||
void
|
||||
BasicNormalizerTest::TestSkippable() {
|
||||
UnicodeSet starts, diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
|
||||
UnicodeSet *startsPtr = &starts;
|
||||
UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
|
||||
UnicodeString s, pattern;
|
||||
UChar32 start, limit, rangeStart, rangeEnd;
|
||||
int32_t i, range, count;
|
||||
|
||||
UErrorCode status;
|
||||
|
||||
/* build NF*Skippable sets from runtime data */
|
||||
status=U_ZERO_ERROR;
|
||||
USetAdder sa = {
|
||||
(USet *)startsPtr,
|
||||
_set_add,
|
||||
_set_addRange,
|
||||
_set_addString,
|
||||
NULL, // don't need remove()
|
||||
NULL
|
||||
};
|
||||
unorm_addPropertyStarts(&sa, &status);
|
||||
if(U_FAILURE(status)) {
|
||||
errln("unable to load normalization data for unorm_addPropertyStarts(() - %s\n", u_errorName(status));
|
||||
IcuTestErrorCode errorCode(*this, "TestSkippable");
|
||||
skipSets[UNORM_NFD].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode);
|
||||
skipSets[UNORM_NFKD].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode);
|
||||
skipSets[UNORM_NFC].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode);
|
||||
skipSets[UNORM_NFKC].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode);
|
||||
if(errorCode.logIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) {
|
||||
return;
|
||||
}
|
||||
count=starts.getRangeCount();
|
||||
|
||||
start=limit=0;
|
||||
rangeStart=rangeEnd=0;
|
||||
range=0;
|
||||
for(;;) {
|
||||
if(start<limit) {
|
||||
/* get properties for start and apply them to [start..limit[ */
|
||||
if(unorm_isNFSkippable(start, UNORM_NFD)) {
|
||||
skipSets[UNORM_NFD].add(start, limit-1);
|
||||
}
|
||||
if(unorm_isNFSkippable(start, UNORM_NFKD)) {
|
||||
skipSets[UNORM_NFKD].add(start, limit-1);
|
||||
}
|
||||
if(unorm_isNFSkippable(start, UNORM_NFC)) {
|
||||
skipSets[UNORM_NFC].add(start, limit-1);
|
||||
}
|
||||
if(unorm_isNFSkippable(start, UNORM_NFKC)) {
|
||||
skipSets[UNORM_NFKC].add(start, limit-1);
|
||||
}
|
||||
}
|
||||
|
||||
/* go to next range of same properties */
|
||||
start=limit;
|
||||
if(++limit>rangeEnd) {
|
||||
if(range<count) {
|
||||
limit=rangeStart=starts.getRangeStart(range);
|
||||
rangeEnd=starts.getRangeEnd(range);
|
||||
++range;
|
||||
} else if(range==count) {
|
||||
/* additional range to complete the Unicode code space */
|
||||
limit=rangeStart=rangeEnd=0x110000;
|
||||
++range;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* get expected sets from hardcoded patterns */
|
||||
initExpectedSkippables(expectSets);
|
||||
|
||||
for(i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
|
||||
for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
|
||||
if(skipSets[i]!=expectSets[i]) {
|
||||
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n"
|
||||
"may need to update hardcoded UnicodeSet patterns in\n"
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2005, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -163,25 +163,6 @@ BasicNormalizerTest::TestNormalizerAPI() {
|
|||
if(s.charAt(1)!=0xe4) {
|
||||
errln("error in Normalizer::decompose(self)");
|
||||
}
|
||||
|
||||
// test internal normalization exclusion options
|
||||
// s contains a compatibility CJK character and a Hangul syllable
|
||||
s=UnicodeString("a\\uFACE\\uD7A3b", -1, US_INV).unescape();
|
||||
status=U_ZERO_ERROR;
|
||||
Normalizer::decompose(s, FALSE, UNORM_NX_HANGUL, out, status);
|
||||
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\u9F9C\\uD7A3b").unescape()) {
|
||||
errln("Normalizer::decompose(UNORM_NX_HANGUL) failed - %s", u_errorName(status));
|
||||
}
|
||||
status=U_ZERO_ERROR;
|
||||
Normalizer::decompose(s, FALSE, UNORM_NX_CJK_COMPAT, out, status);
|
||||
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\uFACE\\u1112\\u1175\\u11c2b").unescape()) {
|
||||
errln("Normalizer::decompose(UNORM_NX_CJK_COMPAT) failed - %s", u_errorName(status));
|
||||
}
|
||||
status=U_ZERO_ERROR;
|
||||
Normalizer::decompose(s, FALSE, UNORM_NX_CJK_COMPAT|UNORM_NX_HANGUL, out, status);
|
||||
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\uFACE\\uD7A3b").unescape()) {
|
||||
errln("Normalizer::decompose(UNORM_NX_CJK_COMPAT|UNORM_NX_HANGUL) failed - %s", u_errorName(status));
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -15,6 +15,19 @@
|
|||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0]))
|
||||
|
||||
static const char *ignorePropNames[]={
|
||||
"FC_NFKC",
|
||||
"NFD_QC",
|
||||
"NFC_QC",
|
||||
"NFKD_QC",
|
||||
"NFKC_QC",
|
||||
"Expands_On_NFD",
|
||||
"Expands_On_NFC",
|
||||
"Expands_On_NFKD",
|
||||
"Expands_On_NFKC",
|
||||
"NFKC_CF"
|
||||
};
|
||||
|
||||
UnicodeTest::UnicodeTest()
|
||||
{
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
|
@ -23,6 +36,10 @@ UnicodeTest::UnicodeTest()
|
|||
delete unknownPropertyNames;
|
||||
unknownPropertyNames=NULL;
|
||||
}
|
||||
// Ignore some property names altogether.
|
||||
for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
|
||||
unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeTest::~UnicodeTest()
|
||||
|
@ -76,7 +93,7 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
|
|||
}
|
||||
|
||||
static const char *const
|
||||
derivedCorePropsNames[]={
|
||||
derivedPropsNames[]={
|
||||
"Math",
|
||||
"Alphabetic",
|
||||
"Lowercase",
|
||||
|
@ -86,6 +103,7 @@ derivedCorePropsNames[]={
|
|||
"XID_Start",
|
||||
"XID_Continue",
|
||||
"Default_Ignorable_Code_Point",
|
||||
"Full_Composition_Exclusion",
|
||||
"Grapheme_Extend",
|
||||
"Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
|
||||
"Grapheme_Base",
|
||||
|
@ -95,11 +113,12 @@ derivedCorePropsNames[]={
|
|||
"Changes_When_Uppercased",
|
||||
"Changes_When_Titlecased",
|
||||
"Changes_When_Casefolded",
|
||||
"Changes_When_Casemapped"
|
||||
"Changes_When_Casemapped",
|
||||
"Changes_When_NFKC_Casefolded"
|
||||
};
|
||||
|
||||
static const UProperty
|
||||
derivedCorePropsIndex[]={
|
||||
derivedPropsIndex[]={
|
||||
UCHAR_MATH,
|
||||
UCHAR_ALPHABETIC,
|
||||
UCHAR_LOWERCASE,
|
||||
|
@ -109,6 +128,7 @@ derivedCorePropsIndex[]={
|
|||
UCHAR_XID_START,
|
||||
UCHAR_XID_CONTINUE,
|
||||
UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
|
||||
UCHAR_FULL_COMPOSITION_EXCLUSION,
|
||||
UCHAR_GRAPHEME_EXTEND,
|
||||
UCHAR_GRAPHEME_LINK,
|
||||
UCHAR_GRAPHEME_BASE,
|
||||
|
@ -118,17 +138,18 @@ derivedCorePropsIndex[]={
|
|||
UCHAR_CHANGES_WHEN_UPPERCASED,
|
||||
UCHAR_CHANGES_WHEN_TITLECASED,
|
||||
UCHAR_CHANGES_WHEN_CASEFOLDED,
|
||||
UCHAR_CHANGES_WHEN_CASEMAPPED
|
||||
UCHAR_CHANGES_WHEN_CASEMAPPED,
|
||||
UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
|
||||
};
|
||||
|
||||
static int32_t numErrors[LENGTHOF(derivedCorePropsIndex)]={ 0 };
|
||||
static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
|
||||
|
||||
enum { MAX_ERRORS=50 };
|
||||
|
||||
U_CFUNC void U_CALLCONV
|
||||
derivedCorePropsLineFn(void *context,
|
||||
char *fields[][2], int32_t /* fieldCount */,
|
||||
UErrorCode *pErrorCode)
|
||||
derivedPropsLineFn(void *context,
|
||||
char *fields[][2], int32_t /* fieldCount */,
|
||||
UErrorCode *pErrorCode)
|
||||
{
|
||||
UnicodeTest *me=(UnicodeTest *)context;
|
||||
uint32_t start, end;
|
||||
|
@ -136,35 +157,35 @@ derivedCorePropsLineFn(void *context,
|
|||
|
||||
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt field 0 at %s\n", fields[0][0]);
|
||||
me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
|
||||
return;
|
||||
}
|
||||
|
||||
/* parse derived binary property name, ignore unknown names */
|
||||
i=getTokenIndex(derivedCorePropsNames, LENGTHOF(derivedCorePropsNames), fields[1][0]);
|
||||
i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
|
||||
if(i<0) {
|
||||
UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
|
||||
propName.trim();
|
||||
if(me->unknownPropertyNames->find(propName)==NULL) {
|
||||
UErrorCode errorCode=U_ZERO_ERROR;
|
||||
me->unknownPropertyNames->puti(propName, 1, errorCode);
|
||||
me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt\n", fields[1][0]);
|
||||
me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
me->derivedCoreProps[i].add(start, end);
|
||||
me->derivedProps[i].add(start, end);
|
||||
}
|
||||
|
||||
void UnicodeTest::TestAdditionalProperties() {
|
||||
// test DerivedCoreProperties.txt
|
||||
if(LENGTHOF(derivedCoreProps)<LENGTHOF(derivedCorePropsNames)) {
|
||||
errln("error: UnicodeTest::derivedCoreProps[] too short, need at least %d UnicodeSets\n",
|
||||
LENGTHOF(derivedCorePropsNames));
|
||||
// test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
|
||||
if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
|
||||
errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
|
||||
LENGTHOF(derivedPropsNames));
|
||||
return;
|
||||
}
|
||||
if(LENGTHOF(derivedCorePropsIndex)!=LENGTHOF(derivedCorePropsNames)) {
|
||||
errln("error in ucdtest.cpp: LENGTHOF(derivedCorePropsIndex)!=LENGTHOF(derivedCorePropsNames)\n");
|
||||
if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
|
||||
errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -188,16 +209,25 @@ void UnicodeTest::TestAdditionalProperties() {
|
|||
strcat(backupPath, U_FILE_SEP_STRING);
|
||||
strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
|
||||
|
||||
u_parseDelimitedFile(newPath, ';', fields, 2, derivedCorePropsLineFn, this, &errorCode);
|
||||
char *path=newPath;
|
||||
u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
|
||||
|
||||
if(errorCode==U_FILE_ACCESS_ERROR) {
|
||||
errorCode=U_ZERO_ERROR;
|
||||
u_parseDelimitedFile(backupPath, ';', fields, 2, derivedCorePropsLineFn, this, &errorCode);
|
||||
path=backupPath;
|
||||
u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
|
||||
}
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
|
||||
strcpy(basename, "DerivedNormalizationProps.txt");
|
||||
u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
|
||||
if(U_FAILURE(errorCode)) {
|
||||
errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
|
||||
return;
|
||||
}
|
||||
|
||||
// now we have all derived core properties in the UnicodeSets
|
||||
// run them all through the API
|
||||
|
@ -206,14 +236,14 @@ void UnicodeTest::TestAdditionalProperties() {
|
|||
UChar32 start, end;
|
||||
|
||||
// test all TRUE properties
|
||||
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
|
||||
rangeCount=derivedCoreProps[i].getRangeCount();
|
||||
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
|
||||
rangeCount=derivedProps[i].getRangeCount();
|
||||
for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
|
||||
start=derivedCoreProps[i].getRangeStart(range);
|
||||
end=derivedCoreProps[i].getRangeEnd(range);
|
||||
start=derivedProps[i].getRangeStart(range);
|
||||
end=derivedProps[i].getRangeEnd(range);
|
||||
for(; start<=end; ++start) {
|
||||
if(!u_hasBinaryProperty(start, derivedCorePropsIndex[i])) {
|
||||
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong\n", start, derivedCorePropsNames[i]);
|
||||
if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
|
||||
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong\n", start, derivedPropsNames[i]);
|
||||
if(++numErrors[i]>=MAX_ERRORS) {
|
||||
errln("Too many errors, moving to the next test");
|
||||
break;
|
||||
|
@ -224,19 +254,19 @@ void UnicodeTest::TestAdditionalProperties() {
|
|||
}
|
||||
|
||||
// invert all properties
|
||||
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
|
||||
derivedCoreProps[i].complement();
|
||||
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
|
||||
derivedProps[i].complement();
|
||||
}
|
||||
|
||||
// test all FALSE properties
|
||||
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
|
||||
rangeCount=derivedCoreProps[i].getRangeCount();
|
||||
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
|
||||
rangeCount=derivedProps[i].getRangeCount();
|
||||
for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
|
||||
start=derivedCoreProps[i].getRangeStart(range);
|
||||
end=derivedCoreProps[i].getRangeEnd(range);
|
||||
start=derivedProps[i].getRangeStart(range);
|
||||
end=derivedProps[i].getRangeEnd(range);
|
||||
for(; start<=end; ++start) {
|
||||
if(u_hasBinaryProperty(start, derivedCorePropsIndex[i])) {
|
||||
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedCorePropsNames[i]);
|
||||
if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
|
||||
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
|
||||
if(++numErrors[i]>=MAX_ERRORS) {
|
||||
errln("Too many errors, moving to the next test");
|
||||
break;
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -13,9 +13,9 @@ U_CFUNC void U_CALLCONV unicodeDataLineFn(void *context,
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CFUNC void U_CALLCONV
|
||||
derivedCorePropsLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode);
|
||||
derivedPropsLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -43,11 +43,11 @@ private:
|
|||
UErrorCode *pErrorCode);
|
||||
|
||||
friend void U_CALLCONV
|
||||
derivedCorePropsLineFn(void *context,
|
||||
derivedPropsLineFn(void *context,
|
||||
char *fields[][2], int32_t fieldCount,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
UnicodeSet derivedCoreProps[30];
|
||||
UnicodeSet derivedProps[30];
|
||||
U_NAMESPACE_QUALIFIER Hashtable *unknownPropertyNames;
|
||||
};
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/*
|
||||
********************************************************************************
|
||||
* Copyright (C) 1999-2009 International Business Machines Corporation and
|
||||
* Copyright (C) 1999-2010 International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************************
|
||||
* Date Name Description
|
||||
|
@ -709,6 +709,37 @@ void UnicodeSetTest::TestAPI() {
|
|||
TEST_ASSERT((void *)constUSet == (void *)constSet);
|
||||
const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
|
||||
TEST_ASSERT((void *)constSetx == (void *)constUSet);
|
||||
|
||||
// span(UnicodeString) and spanBack(UnicodeString) convenience methods
|
||||
UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
|
||||
UnicodeSet ac(0x61, 0x63);
|
||||
ac.remove(0x62).freeze();
|
||||
if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
|
||||
ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
|
||||
ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
|
||||
ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
|
||||
ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
|
||||
ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
|
||||
ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
|
||||
ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
|
||||
ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
|
||||
ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
|
||||
) {
|
||||
errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
|
||||
}
|
||||
if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
|
||||
ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
|
||||
ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
|
||||
ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
|
||||
ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
|
||||
ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
|
||||
ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
|
||||
ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
|
||||
ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
|
||||
ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
|
||||
) {
|
||||
errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
|
||||
}
|
||||
}
|
||||
|
||||
void UnicodeSetTest::TestIteration() {
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -62,6 +62,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
|
|||
case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
|
||||
case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
|
||||
case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;
|
||||
case 20: name = "TestReadOnlyAlias"; if (exec) TestReadOnlyAlias(); break;
|
||||
|
||||
default: name = ""; break; //needed to end loop
|
||||
}
|
||||
|
@ -1120,6 +1121,30 @@ UnicodeStringTest::TestMiscellaneous()
|
|||
if(test1.hasMetaData() || UnicodeString().hasMetaData()) {
|
||||
errln("UnicodeString::hasMetaData() returns TRUE");
|
||||
}
|
||||
|
||||
// test getTerminatedBuffer() on a truncated, shared, heap-allocated string
|
||||
test1=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789.");
|
||||
test1.truncate(36); // ensure length()<getCapacity()
|
||||
test2=test1; // share the buffer
|
||||
test1.truncate(5);
|
||||
if(test1.length()!=5 || test1.getTerminatedBuffer()[5]!=0) {
|
||||
errln("UnicodeString(shared buffer).truncate() failed");
|
||||
}
|
||||
if(test2.length()!=36 || test2[5]!=0x66 || u_strlen(test2.getTerminatedBuffer())!=36) {
|
||||
errln("UnicodeString(shared buffer).truncate().getTerminatedBuffer() "
|
||||
"modified another copy of the string!");
|
||||
}
|
||||
test1=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789.");
|
||||
test1.truncate(36); // ensure length()<getCapacity()
|
||||
test2=test1; // share the buffer
|
||||
test1.remove();
|
||||
if(test1.length()!=0 || test1.getTerminatedBuffer()[0]!=0) {
|
||||
errln("UnicodeString(shared buffer).remove() failed");
|
||||
}
|
||||
if(test2.length()!=36 || test2[0]!=0x61 || u_strlen(test2.getTerminatedBuffer())!=36) {
|
||||
errln("UnicodeString(shared buffer).remove().getTerminatedBuffer() "
|
||||
"modified another copy of the string!");
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
|
@ -1873,3 +1898,108 @@ UnicodeStringTest::TestUTF8() {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
// Test if this compiler supports Return Value Optimization of unnamed temporary objects.
|
||||
// Helper for the RVO probe in TestReadOnlyAlias(): wraps uchars in a UnicodeString
// built with the same (TRUE, buffer, length) constructor form that the test below
// calls the "read-only-aliasing constructor", and returns it as an unnamed temporary.
// The caller compares getBuffer() of the returned object with the original pointer.
// NOTE(review): length -1 presumably means "NUL-terminated, compute the length" - confirm.
static UnicodeString wrapUChars(const UChar *uchars) {
|
||||
// Must stay an unnamed temporary: that is the case being probed.
return UnicodeString(TRUE, uchars, -1);
|
||||
}
|
||||
|
||||
void
|
||||
UnicodeStringTest::TestReadOnlyAlias() {
|
||||
UChar uchars[]={ 0x61, 0x62, 0 };
|
||||
UnicodeString alias(TRUE, uchars, 2);
|
||||
if(alias.length()!=2 || alias.getBuffer()!=uchars || alias.getTerminatedBuffer()!=uchars) {
|
||||
errln("UnicodeString read-only-aliasing constructor does not behave as expected.");
|
||||
return;
|
||||
}
|
||||
alias.truncate(1);
|
||||
if(alias.length()!=1 || alias.getBuffer()!=uchars) {
|
||||
errln("UnicodeString(read-only-alias).truncate() did not preserve aliasing as expected.");
|
||||
}
|
||||
if(alias.getTerminatedBuffer()==uchars) {
|
||||
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
|
||||
"did not allocate and copy as expected.");
|
||||
}
|
||||
if(uchars[1]!=0x62) {
|
||||
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
|
||||
"modified the original buffer.");
|
||||
}
|
||||
if(1!=u_strlen(alias.getTerminatedBuffer())) {
|
||||
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
|
||||
"does not return a buffer terminated at the proper length.");
|
||||
}
|
||||
|
||||
alias.setTo(TRUE, uchars, 2);
|
||||
if(alias.length()!=2 || alias.getBuffer()!=uchars || alias.getTerminatedBuffer()!=uchars) {
|
||||
errln("UnicodeString read-only-aliasing setTo() does not behave as expected.");
|
||||
return;
|
||||
}
|
||||
alias.remove();
|
||||
if(alias.length()!=0) {
|
||||
errln("UnicodeString(read-only-alias).remove() did not work.");
|
||||
}
|
||||
if(alias.getTerminatedBuffer()==uchars) {
|
||||
errln("UnicodeString(read-only-alias).remove().getTerminatedBuffer() "
|
||||
"did not un-alias as expected.");
|
||||
}
|
||||
if(uchars[0]!=0x61) {
|
||||
errln("UnicodeString(read-only-alias).remove().getTerminatedBuffer() "
|
||||
"modified the original buffer.");
|
||||
}
|
||||
if(0!=u_strlen(alias.getTerminatedBuffer())) {
|
||||
errln("UnicodeString.setTo(read-only-alias).remove().getTerminatedBuffer() "
|
||||
"does not return a buffer terminated at length 0.");
|
||||
}
|
||||
|
||||
UnicodeString longString=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789");
|
||||
alias.setTo(FALSE, longString.getBuffer(), longString.length());
|
||||
alias.remove(0, 10);
|
||||
if(longString.compare(10, INT32_MAX, alias)!=0 || alias.getBuffer()!=longString.getBuffer()+10) {
|
||||
errln("UnicodeString.setTo(read-only-alias).remove(0, 10) did not preserve aliasing as expected.");
|
||||
}
|
||||
alias.setTo(FALSE, longString.getBuffer(), longString.length());
|
||||
alias.remove(27, 99);
|
||||
if(longString.compare(0, 27, alias)!=0 || alias.getBuffer()!=longString.getBuffer()) {
|
||||
errln("UnicodeString.setTo(read-only-alias).remove(27, 99) did not preserve aliasing as expected.");
|
||||
}
|
||||
alias.setTo(FALSE, longString.getBuffer(), longString.length());
|
||||
alias.retainBetween(6, 30);
|
||||
if(longString.compare(6, 24, alias)!=0 || alias.getBuffer()!=longString.getBuffer()+6) {
|
||||
errln("UnicodeString.setTo(read-only-alias).retainBetween(6, 30) did not preserve aliasing as expected.");
|
||||
}
|
||||
|
||||
UChar abc[]={ 0x61, 0x62, 0x63, 0 };
|
||||
UBool hasRVO= wrapUChars(abc).getBuffer()==abc;
|
||||
|
||||
UnicodeString temp;
|
||||
temp.fastCopyFrom(longString.tempSubString());
|
||||
if(temp!=longString || (hasRVO && temp.getBuffer()!=longString.getBuffer())) {
|
||||
errln("UnicodeString.tempSubString() failed");
|
||||
}
|
||||
temp.fastCopyFrom(longString.tempSubString(-3, 5));
|
||||
if(longString.compare(0, 5, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer())) {
|
||||
errln("UnicodeString.tempSubString(-3, 5) failed");
|
||||
}
|
||||
temp.fastCopyFrom(longString.tempSubString(17));
|
||||
if(longString.compare(17, INT32_MAX, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+17)) {
|
||||
errln("UnicodeString.tempSubString(17) failed");
|
||||
}
|
||||
temp.fastCopyFrom(longString.tempSubString(99));
|
||||
if(!temp.isEmpty()) {
|
||||
errln("UnicodeString.tempSubString(99) failed");
|
||||
}
|
||||
temp.fastCopyFrom(longString.tempSubStringBetween(6));
|
||||
if(longString.compare(6, INT32_MAX, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+6)) {
|
||||
errln("UnicodeString.tempSubStringBetween(6) failed");
|
||||
}
|
||||
temp.fastCopyFrom(longString.tempSubStringBetween(8, 18));
|
||||
if(longString.compare(8, 10, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+8)) {
|
||||
errln("UnicodeString.tempSubStringBetween(8, 18) failed");
|
||||
}
|
||||
UnicodeString bogusString;
|
||||
bogusString.setToBogus();
|
||||
temp.fastCopyFrom(bogusString.tempSubStringBetween(8, 18));
|
||||
if(!temp.isBogus()) {
|
||||
errln("UnicodeString.setToBogus().tempSubStringBetween(8, 18) failed");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
/********************************************************************
|
||||
* COPYRIGHT:
|
||||
* Copyright (c) 1997-2009, International Business Machines Corporation and
|
||||
* Copyright (c) 1997-2010, International Business Machines Corporation and
|
||||
* others. All Rights Reserved.
|
||||
********************************************************************/
|
||||
|
||||
|
@ -77,6 +77,7 @@ public:
|
|||
void TestNameSpace(void);
|
||||
void TestUTF32(void);
|
||||
void TestUTF8(void);
|
||||
void TestReadOnlyAlias(void);
|
||||
};
|
||||
|
||||
class StringCaseTest: public IntlTest {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
## Makefile.in for ICU tools
|
||||
## Copyright (c) 1999-2009, International Business Machines Corporation and
|
||||
## Copyright (c) 1999-2010, International Business Machines Corporation and
|
||||
## others. All Rights Reserved.
|
||||
|
||||
## Source directory information
|
||||
|
@ -15,7 +15,7 @@ subdir = tools
|
|||
|
||||
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk genctd \
|
||||
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
|
||||
gentest genprops gencase genbidi gennorm gencfu
|
||||
gentest genprops gencase genbidi gennorm gennorm2 gencfu
|
||||
|
||||
## List of phony targets
|
||||
.PHONY : all all-local all-recursive install install-local \
|
||||
|
|
|
@ -389,25 +389,10 @@
|
|||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
<File
|
||||
RelativePath=".\gennames.c"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\gennames.c"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h;hpp;hxx;hm;inl"
|
||||
>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
>
|
||||
</Filter>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2001-2005, International Business Machines
|
||||
* Copyright (C) 2001-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -61,7 +61,8 @@ enum {
|
|||
UNICODE_VERSION,
|
||||
ICUDATADIR,
|
||||
CSOURCE,
|
||||
STORE_FLAGS
|
||||
STORE_FLAGS,
|
||||
WRITE_NORM2
|
||||
};
|
||||
|
||||
static UOption options[]={
|
||||
|
@ -74,7 +75,8 @@ static UOption options[]={
|
|||
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
|
||||
UOPTION_ICUDATADIR,
|
||||
UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
|
||||
UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
|
||||
UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG),
|
||||
UOPTION_DEF("write-norm2", '\1', UOPT_NO_ARG)
|
||||
};
|
||||
|
||||
extern int
|
||||
|
@ -140,6 +142,8 @@ main(int argc, char* argv[]) {
|
|||
"\t to the source file basenames before opening;\n"
|
||||
"\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
|
||||
u_getDataDirectory());
|
||||
fprintf(stderr,
|
||||
"\t--write-norm2 write nfc.txt and nfkc.txt files for gennorm2\n");
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
|
@ -243,7 +247,7 @@ main(int argc, char* argv[]) {
|
|||
/* prepare the filename beginning with the source dir */
|
||||
uprv_strcpy(filename, srcDir);
|
||||
basename=filename+uprv_strlen(filename);
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
|
||||
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR && *(basename-1)!=U_FILE_ALT_SEP_CHAR) {
|
||||
*basename++=U_FILE_SEP_CHAR;
|
||||
}
|
||||
|
||||
|
@ -286,6 +290,10 @@ main(int argc, char* argv[]) {
|
|||
|
||||
/* process parsed data */
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
if(options[WRITE_NORM2].doesOccur) {
|
||||
writeNorm2(destDir);
|
||||
}
|
||||
|
||||
processData();
|
||||
|
||||
/* write the properties data file */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2005, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -83,6 +83,9 @@ setCompositionExclusion(uint32_t code);
|
|||
U_CFUNC void
|
||||
setFNC(uint32_t c, UChar *s);
|
||||
|
||||
extern void
|
||||
writeNorm2(const char *dataDir);
|
||||
|
||||
extern void
|
||||
processData(void);
|
||||
|
||||
|
|
|
@ -389,33 +389,18 @@
|
|||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="c;cpp;rc"
|
||||
<File
|
||||
RelativePath=".\gennorm.c"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\gennorm.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\store.c"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h"
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\gennorm.h"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\gennorm.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\store.c"
|
||||
>
|
||||
</Filter>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -106,11 +106,13 @@ static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
|
|||
|
||||
static Norm *norms;
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
/*
|
||||
* set a flag for each code point that was seen in decompositions -
|
||||
* avoid decomposing ones that have not been used before
|
||||
*/
|
||||
static uint32_t haveSeenFlags[256];
|
||||
#endif
|
||||
|
||||
/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
|
||||
static USet *nfdQCNoSet;
|
||||
|
@ -192,8 +194,10 @@ init() {
|
|||
/* allocate UTF-32 string memory */
|
||||
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
/* reset all "have seen" flags */
|
||||
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
|
||||
#endif
|
||||
|
||||
/* open an empty set */
|
||||
nfdQCNoSet=uset_open(1, 0);
|
||||
|
@ -289,6 +293,7 @@ enumTrie(EnumTrieFn *fn, void *context) {
|
|||
return count;
|
||||
}
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
static void
|
||||
setHaveSeenString(const uint32_t *s, int32_t length) {
|
||||
uint32_t c;
|
||||
|
@ -301,6 +306,7 @@ setHaveSeenString(const uint32_t *s, int32_t length) {
|
|||
}
|
||||
|
||||
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
|
||||
#endif
|
||||
|
||||
/* handle combining data ---------------------------------------------------- */
|
||||
|
||||
|
@ -410,6 +416,7 @@ findCombiningCP(uint32_t code, UBool isLead) {
|
|||
return 0xffff;
|
||||
}
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
static void
|
||||
addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
|
||||
CombiningTriple *triple;
|
||||
|
@ -434,6 +441,7 @@ addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
|
|||
triple->trail=trail;
|
||||
triple->combined=combined;
|
||||
}
|
||||
#endif
|
||||
|
||||
static int
|
||||
compareTriples(const void *l, const void *r) {
|
||||
|
@ -560,6 +568,7 @@ processCombining() {
|
|||
|
||||
/* processing incoming normalization data ----------------------------------- */
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
/*
|
||||
* Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
|
||||
* c must be a Hangul syllable code point.
|
||||
|
@ -594,6 +603,7 @@ getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3])
|
|||
pHangulNorm->lenNFKD=length;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* decompose the one decomposition further, may generate two decompositions
|
||||
|
@ -601,6 +611,20 @@ getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3])
|
|||
*/
|
||||
static void
|
||||
decompStoreNewNF(uint32_t code, Norm *norm) {
|
||||
#if !GENNORM_OBSOLETE
|
||||
/* always allocate the original string */
|
||||
uint32_t *s32;
|
||||
uint8_t length;
|
||||
if((length=norm->lenNFD)!=0) {
|
||||
s32=utm_allocN(utf32Mem, norm->lenNFD);
|
||||
uprv_memcpy(s32, norm->nfd, norm->lenNFD*4);
|
||||
norm->nfd=s32;
|
||||
} else if((length=norm->lenNFKD)!=0) {
|
||||
s32=utm_allocN(utf32Mem, norm->lenNFKD);
|
||||
uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4);
|
||||
norm->nfkd=s32;
|
||||
}
|
||||
#else
|
||||
uint32_t nfd[40], nfkd[40], hangulBuffer[3];
|
||||
Norm hangulNorm;
|
||||
|
||||
|
@ -695,8 +719,10 @@ decompStoreNewNF(uint32_t code, Norm *norm) {
|
|||
norm->nfkd=s32;
|
||||
setHaveSeenString(nfkd, lenNFKD);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
typedef struct DecompSingle {
|
||||
uint32_t c;
|
||||
Norm *norm;
|
||||
|
@ -800,6 +826,7 @@ decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
|
|||
norm->nfkd=s32;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* process the data for one code point listed in UnicodeData;
|
||||
|
@ -807,7 +834,9 @@ decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
|
|||
*/
|
||||
extern void
|
||||
storeNorm(uint32_t code, Norm *norm) {
|
||||
#if GENNORM_OBSOLETE
|
||||
DecompSingle decompSingle;
|
||||
#endif
|
||||
Norm *p;
|
||||
|
||||
if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
|
||||
|
@ -826,6 +855,7 @@ storeNorm(uint32_t code, Norm *norm) {
|
|||
/* decompose this one decomposition further, may generate two decompositions */
|
||||
decompStoreNewNF(code, norm);
|
||||
|
||||
#if GENNORM_OBSOLETE
|
||||
/* has this code point been used in previous decompositions? */
|
||||
if(HAVE_SEEN(code)) {
|
||||
/* use this decomposition to decompose other decompositions further */
|
||||
|
@ -833,6 +863,7 @@ storeNorm(uint32_t code, Norm *norm) {
|
|||
decompSingle.norm=norm;
|
||||
enumTrie(decompWithSingleFn, &decompSingle);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
/* store the data */
|
||||
|
@ -1815,6 +1846,144 @@ getFoldingAuxOffset(uint32_t data) {
|
|||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
||||
static void
|
||||
writeAllCC(FILE *f) {
|
||||
uint32_t i;
|
||||
UChar32 prevCode, code;
|
||||
uint8_t prevCC, cc;
|
||||
UBool isInBlockZero;
|
||||
|
||||
fprintf(f, "# Canonical_Combining_Class (ccc) values\n");
|
||||
prevCode=0;
|
||||
prevCC=0;
|
||||
for(code=0; code<=0x110000;) {
|
||||
if(code==0x110000) {
|
||||
cc=0;
|
||||
} else {
|
||||
i=utrie_get32(normTrie, code, &isInBlockZero);
|
||||
if(i==0 || isInBlockZero) {
|
||||
cc=0;
|
||||
} else {
|
||||
cc=norms[i].udataCC;
|
||||
}
|
||||
}
|
||||
if(prevCC!=cc) {
|
||||
if(prevCC!=0) {
|
||||
uint32_t lastCode=code-1;
|
||||
if(prevCode==lastCode) {
|
||||
fprintf(f, "%04lX:%d\n", (long)lastCode, prevCC);
|
||||
} else {
|
||||
fprintf(f, "%04lX..%04lX:%d\n",
|
||||
(long)prevCode, (long)lastCode, prevCC);
|
||||
}
|
||||
}
|
||||
prevCode=code;
|
||||
prevCC=cc;
|
||||
}
|
||||
if(isInBlockZero) {
|
||||
code+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
++code;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static UBool
|
||||
hasMapping(uint32_t code) {
|
||||
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
||||
return norm->lenNFD!=0 || norm->lenNFKD!=0;
|
||||
}
|
||||
|
||||
static UBool
|
||||
hasOneWayMapping(uint32_t code, UBool withCompat) {
|
||||
for(;;) {
|
||||
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
|
||||
uint8_t length;
|
||||
if((length=norm->lenNFD)!=0) {
|
||||
/*
|
||||
* The canonical decomposition is a one-way mapping if
|
||||
* - it does not map to exactly two code points
|
||||
* - the code has ccc!=0
|
||||
* - the code has the Composition_Exclusion property
|
||||
* - its starter has a one-way mapping (loop for this)
|
||||
* - its non-starter decomposes
|
||||
*/
|
||||
if( length!=2 ||
|
||||
norm->udataCC!=0 ||
|
||||
norm->combiningFlags&0x80 ||
|
||||
hasMapping(norm->nfd[1])
|
||||
) {
|
||||
return TRUE;
|
||||
}
|
||||
code=norm->nfd[0]; /* continue */
|
||||
} else if(withCompat && norm->lenNFKD!=0) {
|
||||
return TRUE;
|
||||
} else {
|
||||
return FALSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
writeAllMappings(FILE *f, UBool withCompat) {
|
||||
uint32_t i, code;
|
||||
UBool isInBlockZero;
|
||||
|
||||
if(withCompat) {
|
||||
fprintf(f, "\n# Canonical and compatibility decomposition mappings\n");
|
||||
} else {
|
||||
fprintf(f, "\n# Canonical decomposition mappings\n");
|
||||
}
|
||||
for(code=0; code<=0x10ffff;) {
|
||||
i=utrie_get32(normTrie, code, &isInBlockZero);
|
||||
if(isInBlockZero) {
|
||||
code+=UTRIE_DATA_BLOCK_LENGTH;
|
||||
} else {
|
||||
if(i!=0) {
|
||||
uint32_t *s32;
|
||||
uint8_t length;
|
||||
char separator;
|
||||
if((length=norms[i].lenNFD)!=0) {
|
||||
s32=norms[i].nfd;
|
||||
separator= hasOneWayMapping(code, withCompat) ? '>' : '=';
|
||||
} else if(withCompat && (length=norms[i].lenNFKD)!=0) {
|
||||
s32=norms[i].nfkd;
|
||||
separator='>';
|
||||
}
|
||||
if(length!=0) {
|
||||
uint8_t j;
|
||||
fprintf(f, "%04lX%c", (long)code, separator);
|
||||
for(j=0; j<length; ++j) {
|
||||
if(j!=0) {
|
||||
fputc(' ', f);
|
||||
}
|
||||
fprintf(f, "%04lX", (long)s32[j]);
|
||||
}
|
||||
fputc('\n', f);
|
||||
}
|
||||
}
|
||||
++code;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
writeNorm2TextFile(const char *path, const char *filename, UBool withCompat) {
|
||||
FILE *f=usrc_createTextData(path, filename);
|
||||
if(f==NULL) {
|
||||
exit(U_FILE_ACCESS_ERROR);
|
||||
}
|
||||
writeAllCC(f);
|
||||
writeAllMappings(f, withCompat);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
extern void
|
||||
writeNorm2(const char *dataDir) {
|
||||
writeNorm2TextFile(dataDir, "nfc.txt", FALSE);
|
||||
writeNorm2TextFile(dataDir, "nfkc.txt", TRUE);
|
||||
}
|
||||
|
||||
extern void
|
||||
generateData(const char *dataDir, UBool csource) {
|
||||
static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];
|
||||
|
|
82
icu4c/source/tools/gennorm2/Makefile.in
Normal file
82
icu4c/source/tools/gennorm2/Makefile.in
Normal file
|
@ -0,0 +1,82 @@
|
|||
## Makefile.in for ICU - tools/gennorm2
## Copyright (c) 2009-2010, International Business Machines Corporation and
## others. All Rights Reserved.
## Steven R. Loomis/Markus W. Scherer

## Source directory information
## (the @...@ placeholders are filled in when config.status generates the Makefile)
srcdir = @srcdir@
top_srcdir = @top_srcdir@

top_builddir = ../..

include $(top_builddir)/icudefs.mk

## Build directory information
subdir = tools/gennorm2

TARGET_STUB_NAME = gennorm2

## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS)

## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)

## When building outside the source tree, also search the build tree's common directory
ifneq ($(top_builddir),$(top_srcdir))
CPPFLAGS += -I$(top_builddir)/common
endif
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)

OBJECTS = gennorm2.o n2builder.o

## One dependency file (.d) per object file
DEPS = $(OBJECTS:.o=.d)

## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local install-man

## Clear suffix list
.SUFFIXES :

## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local

all-local: $(TARGET)

## The install commands below are commented out,
## so 'make install' only builds the tool and does not copy it anywhere.
install-local: all-local
# $(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
# $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)

dist-local:

clean-local:
	test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
	$(RMV) $(TARGET) $(OBJECTS)

distclean-local: clean-local
	$(RMV) Makefile

check-local: all-local

## Regenerate this Makefile when Makefile.in or config.status changes
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
	cd $(top_builddir) \
	&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status

$(TARGET) : $(OBJECTS)
	$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
	$(POST_BUILD_STEP)


## Include the dependency files when no goal is given,
## or when at least one of the given goals does not end in "clean".
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif
|
258
icu4c/source/tools/gennorm2/gennorm2.cpp
Normal file
258
icu4c/source/tools/gennorm2/gennorm2.cpp
Normal file
|
@ -0,0 +1,258 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: gennorm2.cpp
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov25
|
||||
* created by: Markus W. Scherer
|
||||
*
|
||||
* This program reads text files that define Unicode normalization,
|
||||
* parses them, and builds a binary data file.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/localpointer.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/uchar.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "n2builder.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "toolutil.h"
|
||||
#include "uoptions.h"
|
||||
#include "uparse.h"
|
||||
|
||||
#if UCONFIG_NO_NORMALIZATION
|
||||
#include "unewdata.h"
|
||||
#endif
|
||||
|
||||
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
UBool beVerbose=FALSE, haveCopyright=TRUE;
|
||||
|
||||
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
|
||||
#endif
|
||||
|
||||
/* -------------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Indexes into options[] below.
 * main() accesses the table as options[CONSTANT], so the order of these
 * constants must stay in sync with the order of the options[] entries.
 */
enum {
    HELP_H,
    HELP_QUESTION_MARK,
    VERBOSE,
    COPYRIGHT,
    SOURCEDIR,
    OUTPUT_FILENAME,
    UNICODE_VERSION
};

/* Command-line option table that main() passes to u_parseArgs(). */
static UOption options[]={
    UOPTION_HELP_H,
    UOPTION_HELP_QUESTION_MARK,
    UOPTION_VERBOSE,
    UOPTION_COPYRIGHT,
    UOPTION_SOURCEDIR,
    UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),      /* OUTPUT_FILENAME */
    UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG)      /* UNICODE_VERSION */
};
|
||||
|
||||
extern "C" int
|
||||
main(int argc, char* argv[]) {
|
||||
U_MAIN_INIT_ARGS(argc, argv);
|
||||
|
||||
/* preset then read command line options */
|
||||
options[SOURCEDIR].value="";
|
||||
options[UNICODE_VERSION].value=U_UNICODE_VERSION;
|
||||
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
|
||||
|
||||
/* error handling, printing usage message */
|
||||
if(argc<0) {
|
||||
fprintf(stderr,
|
||||
"error in command line argument \"%s\"\n",
|
||||
argv[-argc]);
|
||||
}
|
||||
if(!options[OUTPUT_FILENAME].doesOccur) {
|
||||
argc=-1;
|
||||
}
|
||||
if( argc<2 ||
|
||||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
|
||||
) {
|
||||
/*
|
||||
* Broken into chunks because the C89 standard says the minimum
|
||||
* required supported string length is 509 bytes.
|
||||
*/
|
||||
fprintf(stderr,
|
||||
"Usage: %s [-options] infiles+ -o outputfilename\n"
|
||||
"\n"
|
||||
"Reads the infiles with normalization data and\n"
|
||||
"creates a binary file (outputfilename) with the data.\n"
|
||||
"\n",
|
||||
argv[0]);
|
||||
fprintf(stderr,
|
||||
"Options:\n"
|
||||
"\t-h or -? or --help this usage text\n"
|
||||
"\t-v or --verbose verbose output\n"
|
||||
"\t-c or --copyright include a copyright notice\n"
|
||||
"\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
|
||||
fprintf(stderr,
|
||||
"\t-s or --sourcedir source directory, followed by the path\n"
|
||||
"\t-o or --output output filename\n");
|
||||
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
|
||||
}
|
||||
|
||||
beVerbose=options[VERBOSE].doesOccur;
|
||||
haveCopyright=options[COPYRIGHT].doesOccur;
|
||||
|
||||
IcuToolErrorCode errorCode("gennorm2/main()");
|
||||
|
||||
#if UCONFIG_NO_NORMALIZATION
|
||||
|
||||
fprintf(stderr,
|
||||
"gennorm2 writes a dummy binary data file "
|
||||
"because UCONFIG_NO_NORMALIZATION is set, \n"
|
||||
"see icu/source/common/unicode/uconfig.h\n");
|
||||
udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
|
||||
return U_UNSUPPORTED_ERROR;
|
||||
|
||||
#else
|
||||
|
||||
LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
|
||||
errorCode.assertSuccess();
|
||||
|
||||
builder->setUnicodeVersion(options[UNICODE_VERSION].value);
|
||||
|
||||
// prepare the filename beginning with the source dir
|
||||
std::string filename(options[SOURCEDIR].value);
|
||||
int32_t pathLength=filename.length();
|
||||
if( pathLength>0 &&
|
||||
filename[pathLength-1]!=U_FILE_SEP_CHAR &&
|
||||
filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
|
||||
) {
|
||||
filename.push_back(U_FILE_SEP_CHAR);
|
||||
pathLength=filename.length();
|
||||
}
|
||||
|
||||
for(int i=1; i<argc; ++i) {
|
||||
printf("gennorm2: processing %s\n", argv[i]);
|
||||
filename.append(argv[i]);
|
||||
LocalStdioFilePointer f(fopen(filename.c_str(), "r"));
|
||||
if(f==NULL) {
|
||||
fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.c_str());
|
||||
exit(U_FILE_ACCESS_ERROR);
|
||||
}
|
||||
builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
|
||||
parseFile(f.getAlias(), *builder);
|
||||
filename.erase(pathLength);
|
||||
}
|
||||
|
||||
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
|
||||
|
||||
return errorCode.get();
|
||||
|
||||
#endif
|
||||
}
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
|
||||
IcuToolErrorCode errorCode("gennorm2/parseFile()");
|
||||
char line[300];
|
||||
uint32_t startCP, endCP;
|
||||
while(NULL!=fgets(line, (int)sizeof(line), f)) {
|
||||
char *comment=(char *)strchr(line, '#');
|
||||
if(comment!=NULL) {
|
||||
*comment=0;
|
||||
}
|
||||
u_rtrim(line);
|
||||
if(line[0]==0) {
|
||||
continue; // skip empty and comment-only lines
|
||||
}
|
||||
if(line[0]=='*') {
|
||||
continue; // reserved syntax
|
||||
}
|
||||
const char *delimiter;
|
||||
int32_t rangeLength=
|
||||
u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
|
||||
exit(errorCode.reset());
|
||||
}
|
||||
delimiter=u_skipWhitespace(delimiter);
|
||||
if(*delimiter==':') {
|
||||
const char *s=u_skipWhitespace(delimiter+1);
|
||||
char *end;
|
||||
unsigned long value=strtoul(s, &end, 10);
|
||||
if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
|
||||
fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
|
||||
builder.setCC(c, (uint8_t)value);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(*delimiter=='-') {
|
||||
if(*u_skipWhitespace(delimiter+1)!=0) {
|
||||
fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
|
||||
builder.removeMapping(c);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
if(*delimiter=='=' || *delimiter=='>') {
|
||||
UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
|
||||
int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
|
||||
if(errorCode.isFailure()) {
|
||||
fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
|
||||
exit(errorCode.reset());
|
||||
}
|
||||
UnicodeString mapping(FALSE, uchars, length);
|
||||
if(*delimiter=='=') {
|
||||
if(rangeLength!=1) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
|
||||
line);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
builder.setRoundTripMapping((UChar32)startCP, mapping);
|
||||
} else {
|
||||
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
|
||||
builder.setOneWayMapping(c, mapping);
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
|
||||
exit(U_PARSE_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
/*
|
||||
* Hey, Emacs, please set the following:
|
||||
*
|
||||
* Local Variables:
|
||||
* indent-tabs-mode: nil
|
||||
* End:
|
||||
*
|
||||
*/
|
409
icu4c/source/tools/gennorm2/gennorm2.vcproj
Normal file
409
icu4c/source/tools/gennorm2/gennorm2.vcproj
Normal file
|
@ -0,0 +1,409 @@
|
|||
<?xml version="1.0" encoding="Windows-1252"?>
|
||||
<VisualStudioProject
|
||||
ProjectType="Visual C++"
|
||||
Version="9.00"
|
||||
Name="gennorm2"
|
||||
ProjectGUID="{C7891A65-80AB-4245-912E-5F1E17B0E6C4}"
|
||||
RootNamespace="gennorm2"
|
||||
Keyword="Win32Proj"
|
||||
TargetFrameworkVersion="196613"
|
||||
>
|
||||
<Platforms>
|
||||
<Platform
|
||||
Name="Win32"
|
||||
/>
|
||||
<Platform
|
||||
Name="x64"
|
||||
/>
|
||||
</Platforms>
|
||||
<ToolFiles>
|
||||
</ToolFiles>
|
||||
<Configurations>
|
||||
<Configuration
|
||||
Name="Release|Win32"
|
||||
OutputDirectory=".\x86\Release"
|
||||
IntermediateDirectory=".\x86\Release"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
WholeProgramOptimization="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin
"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
Optimization="2"
|
||||
EnableIntrinsicFunctions="true"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x86\Release\gennorm2.pch"
|
||||
AssemblerListingLocation=".\x86\Release\"
|
||||
ObjectFile=".\x86\Release\"
|
||||
ProgramDataBaseFileName=".\x86\Release\"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="3"
|
||||
SuppressStartupBanner="true"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Release\gennorm2.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
ProgramDatabaseFile=".\x86\Release\gennorm2.pdb"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
OptimizeReferences="2"
|
||||
EnableCOMDATFolding="2"
|
||||
TargetMachine="1"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Debug|Win32"
|
||||
OutputDirectory=".\x86\Debug"
|
||||
IntermediateDirectory=".\x86\Debug"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin
"
|
||||
Outputs="..\..\..\bin\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
EnableIntrinsicFunctions="true"
|
||||
MinimalRebuild="true"
|
||||
BasicRuntimeChecks="3"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x86\Debug\gennorm2.pch"
|
||||
AssemblerListingLocation=".\x86\Debug\"
|
||||
ObjectFile=".\x86\Debug\"
|
||||
ProgramDataBaseFileName=".\x86\Debug\"
|
||||
BrowseInformation="1"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
DebugInformationFormat="4"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x86\Debug\gennorm2.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\x86\Debug\gennorm2.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="1"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Release|x64"
|
||||
OutputDirectory=".\x64\Release"
|
||||
IntermediateDirectory=".\x64\Release"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
WholeProgramOptimization="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin64
"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
Optimization="2"
|
||||
EnableIntrinsicFunctions="true"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="2"
|
||||
EnableFunctionLevelLinking="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x64\Release\gennorm2.pch"
|
||||
AssemblerListingLocation=".\x64\Release\"
|
||||
ObjectFile=".\x64\Release\"
|
||||
ProgramDataBaseFileName=".\x64\Release\"
|
||||
WarningLevel="3"
|
||||
DebugInformationFormat="3"
|
||||
SuppressStartupBanner="true"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="NDEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x64\Release\gennorm2.exe"
|
||||
LinkIncremental="1"
|
||||
SuppressStartupBanner="true"
|
||||
ProgramDatabaseFile=".\x64\Release\gennorm2.pdb"
|
||||
GenerateDebugInformation="true"
|
||||
SubSystem="1"
|
||||
OptimizeReferences="2"
|
||||
EnableCOMDATFolding="2"
|
||||
TargetMachine="17"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
<Configuration
|
||||
Name="Debug|x64"
|
||||
OutputDirectory=".\x64\Debug"
|
||||
IntermediateDirectory=".\x64\Debug"
|
||||
ConfigurationType="1"
|
||||
CharacterSet="1"
|
||||
>
|
||||
<Tool
|
||||
Name="VCPreBuildEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCustomBuildTool"
|
||||
CommandLine="copy "$(TargetPath)" ..\..\..\bin64
"
|
||||
Outputs="..\..\..\bin64\$(TargetFileName)"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXMLDataGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCWebServiceProxyGeneratorTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCMIDLTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
Optimization="0"
|
||||
AdditionalIncludeDirectories="..\..\common;..\toolutil"
|
||||
PreprocessorDefinitions="WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
|
||||
EnableIntrinsicFunctions="true"
|
||||
MinimalRebuild="true"
|
||||
BasicRuntimeChecks="3"
|
||||
StringPooling="true"
|
||||
RuntimeLibrary="3"
|
||||
BufferSecurityCheck="true"
|
||||
DisableLanguageExtensions="true"
|
||||
TreatWChar_tAsBuiltInType="true"
|
||||
PrecompiledHeaderFile=".\x64\Debug\gennorm2.pch"
|
||||
AssemblerListingLocation=".\x64\Debug\"
|
||||
ObjectFile=".\x64\Debug\"
|
||||
ProgramDataBaseFileName=".\x64\Debug\"
|
||||
BrowseInformation="1"
|
||||
WarningLevel="3"
|
||||
SuppressStartupBanner="true"
|
||||
DebugInformationFormat="4"
|
||||
CompileAs="0"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManagedResourceCompilerTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCResourceCompilerTool"
|
||||
PreprocessorDefinitions="_DEBUG"
|
||||
Culture="1033"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPreLinkEventTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCLinkerTool"
|
||||
OutputFile=".\x64\Debug\gennorm2.exe"
|
||||
LinkIncremental="2"
|
||||
SuppressStartupBanner="true"
|
||||
GenerateDebugInformation="true"
|
||||
ProgramDatabaseFile=".\x64\Debug\gennorm2.pdb"
|
||||
SubSystem="1"
|
||||
TargetMachine="17"
|
||||
RandomizedBaseAddress="1"
|
||||
DataExecutionPrevention="1"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCALinkTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCManifestTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCXDCMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCBscMakeTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCFxCopTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCAppVerifierTool"
|
||||
/>
|
||||
<Tool
|
||||
Name="VCPostBuildEventTool"
|
||||
/>
|
||||
</Configuration>
|
||||
</Configurations>
|
||||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<File
|
||||
RelativePath=".\gennorm2.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\n2builder.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\n2builder.h"
|
||||
>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
</VisualStudioProject>
|
1094
icu4c/source/tools/gennorm2/n2builder.cpp
Normal file
1094
icu4c/source/tools/gennorm2/n2builder.cpp
Normal file
File diff suppressed because it is too large
Load diff
113
icu4c/source/tools/gennorm2/n2builder.h
Normal file
113
icu4c/source/tools/gennorm2/n2builder.h
Normal file
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2009-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
* file name: n2builder.h
|
||||
* encoding: US-ASCII
|
||||
* tab size: 8 (not used)
|
||||
* indentation:4
|
||||
*
|
||||
* created on: 2009nov25
|
||||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#ifndef __N2BUILDER_H__
|
||||
#define __N2BUILDER_H__
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "normalizer2impl.h" // for IX_COUNT
|
||||
#include "toolutil.h"
|
||||
#include "utrie2.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
extern UBool beVerbose, haveCopyright;
|
||||
|
||||
struct Norm;
|
||||
|
||||
class BuilderReorderingBuffer;
|
||||
class ExtraDataWriter;
|
||||
|
||||
/**
 * Collects normalization data and writes it to a binary data file.
 * In gennorm2.cpp, parseFile() feeds it via setCC(), setOneWayMapping(),
 * setRoundTripMapping() and removeMapping(), and main() finally calls
 * writeBinaryFile().
 * The member functions are implemented in n2builder.cpp.
 */
class Normalizer2DataBuilder {
public:
    Normalizer2DataBuilder(UErrorCode &errorCode);
    ~Normalizer2DataBuilder();

    // NOTE(review): the exact meaning of each value is defined by the
    // implementation in n2builder.cpp; gennorm2's main() selects
    // OVERRIDE_PREVIOUS before parsing each input file.
    enum OverrideHandling {
        OVERRIDE_NONE,
        OVERRIDE_ANY,
        OVERRIDE_PREVIOUS
    };

    void setOverrideHandling(OverrideHandling oh);

    // Per-code point data setters, called once per code point by the file parser.
    void setCC(UChar32 c, uint8_t cc);
    void setOneWayMapping(UChar32 c, const UnicodeString &m);
    void setRoundTripMapping(UChar32 c, const UnicodeString &m);
    void removeMapping(UChar32 c);

    // v is a version string like "5.2.0" (see the gennorm2 --unicode option).
    void setUnicodeVersion(const char *v);

    void writeBinaryFile(const char *filename);

private:
    friend class CompositionBuilder;
    friend class Decomposer;
    friend class ExtraDataWriter;
    friend class Norm16Writer;

    // No copy constructor nor assignment operator.
    // (Declared private here so that objects cannot be copied from outside the class.)
    Normalizer2DataBuilder(const Normalizer2DataBuilder &other);
    Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other);

    Norm *allocNorm();
    Norm *getNorm(UChar32 c);
    Norm *createNorm(UChar32 c);
    Norm *checkNormForMapping(Norm *p, UChar32 c);  // check for permitted overrides

    const Norm &getNormRef(UChar32 c) const;
    uint8_t getCC(UChar32 c) const;
    UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
    UChar32 combine(const Norm &norm, UChar32 trail) const;

    void addComposition(UChar32 start, UChar32 end, uint32_t value);
    UBool decompose(UChar32 start, UChar32 end, uint32_t value);
    void reorder(Norm *p, BuilderReorderingBuffer &buffer);
    UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
    void setHangulData();
    void writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
    void writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString);
    void writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer);
    // Returns indexes[IX_MIN_MAYBE_YES]-MAX_DELTA-1.
    int32_t getCenterNoNoDelta() {
        return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
    }
    void writeNorm16(UChar32 start, UChar32 end, uint32_t value);
    void processData();

    // NOTE(review): presumably normTrie maps code points to entries of the
    // norms array, whose storage is managed through normMem -- confirm in n2builder.cpp.
    UTrie2 *normTrie;
    UToolMemory *normMem;
    Norm *norms;

    int32_t phase;
    OverrideHandling overrideHandling;

    // One slot per Normalizer2Impl::IX_... index.
    int32_t indexes[Normalizer2Impl::IX_COUNT];
    UTrie2 *norm16Trie;
    UnicodeString extraData;

    UVersionInfo unicodeVersion;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif // #if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#endif // __N2BUILDER_H__
|
File diff suppressed because it is too large
Load diff
|
@ -389,29 +389,14 @@
|
|||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
<File
|
||||
RelativePath=".\data.h"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\genpname.cpp"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h;hpp;hxx;hm;inl"
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\genpname.cpp"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\data.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
>
|
||||
</Filter>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2005-2009, International Business Machines
|
||||
* Copyright (C) 2005-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -50,6 +50,7 @@
|
|||
#include "ucol_swp.h"
|
||||
#include "ucnv_bld.h"
|
||||
#include "unormimp.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "sprpimpl.h"
|
||||
#include "propname.h"
|
||||
#include "rbbidata.h"
|
||||
|
@ -619,6 +620,7 @@ static const struct {
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
{ { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */
|
||||
{ { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */
|
||||
#endif
|
||||
#if !UCONFIG_NO_COLLATION
|
||||
{ { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -21,11 +21,6 @@
|
|||
#include <stdio.h>
|
||||
#include <sys/stat.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "toolutil.h"
|
||||
#include "unicode/ucal.h"
|
||||
|
||||
#ifdef U_WINDOWS
|
||||
# define VC_EXTRALEAN
|
||||
|
@ -42,6 +37,27 @@
|
|||
#endif
|
||||
#include <errno.h>
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "cmemory.h"
|
||||
#include "cstring.h"
|
||||
#include "toolutil.h"
|
||||
#include "unicode/ucal.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * Destructor: if the object still holds a failure code when it is destroyed,
 * reports it via handleFailure().
 */
IcuToolErrorCode::~IcuToolErrorCode() {
    // Safe because our handleFailure() does not throw exceptions.
    if(isFailure()) { handleFailure(); }
}

/**
 * Prints the location string and the error name to stderr,
 * then terminates the process with the error code as the exit status.
 */
void IcuToolErrorCode::handleFailure() const {
    fprintf(stderr, "error at %s: %s\n", location, errorName());
    exit(errorCode);
}
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
static int32_t currentYear = -1;
|
||||
|
||||
U_CAPI int32_t U_EXPORT2 getCurrentYear() {
|
||||
|
@ -235,6 +251,7 @@ utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
|
|||
fprintf(stderr, "error: %s - out of memory\n", mem->name);
|
||||
exit(U_MEMORY_ALLOCATION_ERROR);
|
||||
}
|
||||
mem->capacity=newCapacity;
|
||||
}
|
||||
|
||||
return TRUE;
|
||||
|
@ -242,9 +259,11 @@ utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
|
|||
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_alloc(UToolMemory *mem) {
|
||||
char *p=(char *)mem->array+mem->idx*mem->size;
|
||||
int32_t newIndex=mem->idx+1;
|
||||
char *p=NULL;
|
||||
int32_t oldIndex=mem->idx;
|
||||
int32_t newIndex=oldIndex+1;
|
||||
if(utm_hasCapacity(mem, newIndex)) {
|
||||
p=(char *)mem->array+oldIndex*mem->size;
|
||||
mem->idx=newIndex;
|
||||
uprv_memset(p, 0, mem->size);
|
||||
}
|
||||
|
@ -253,9 +272,11 @@ utm_alloc(UToolMemory *mem) {
|
|||
|
||||
U_CAPI void * U_EXPORT2
|
||||
utm_allocN(UToolMemory *mem, int32_t n) {
|
||||
char *p=(char *)mem->array+mem->idx*mem->size;
|
||||
int32_t newIndex=mem->idx+n;
|
||||
char *p=NULL;
|
||||
int32_t oldIndex=mem->idx;
|
||||
int32_t newIndex=oldIndex+n;
|
||||
if(utm_hasCapacity(mem, newIndex)) {
|
||||
p=(char *)mem->array+oldIndex*mem->size;
|
||||
mem->idx=newIndex;
|
||||
uprv_memset(p, 0, n*mem->size);
|
||||
}
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2009, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -21,6 +21,33 @@
|
|||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
#ifdef XP_CPLUSPLUS
|
||||
|
||||
#include "unicode/errorcode.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
/**
 * ErrorCode subclass for use in ICU command-line tools.
 * The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
 */
class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode {
public:
    /**
     * @param loc A short string describing where the IcuToolErrorCode is used.
     *            Only the pointer is stored, not a copy of the string,
     *            so the string must remain valid for the lifetime of this object.
     */
    IcuToolErrorCode(const char *loc) : location(loc) {}
    virtual ~IcuToolErrorCode();
protected:
    /** Prints an error message that includes the location, then exits the process. */
    virtual void handleFailure() const;
private:
    /** The location string from the constructor (aliased, not owned). */
    const char *location;
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For Windows, a path/filename may be the short (8.3) version
|
||||
* of the "real", long one. In this case, the short one
|
||||
|
|
|
@ -407,261 +407,246 @@
|
|||
<References>
|
||||
</References>
|
||||
<Files>
|
||||
<Filter
|
||||
Name="Source Files"
|
||||
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
|
||||
<File
|
||||
RelativePath=".\filestrm.c"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\filestrm.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filetools.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\flagparser.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\package.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_genc.c"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_gencmn.c"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_icu.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkgitems.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\swapimpl.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\toolutil.c"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucbuf.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucmstate.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unewdata.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uoptions.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uparse.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\writesrc.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.cpp"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Header Files"
|
||||
Filter="h;hpp;hxx;hm;inl"
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filestrm.h"
|
||||
>
|
||||
<File
|
||||
RelativePath=".\filestrm.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filetools.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\flagparser.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\package.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_genc.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_gencmn.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_icu.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_imp.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\platform_xopen_source_extended.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\swapimpl.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\toolutil.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucbuf.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unewdata.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uoptions.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uparse.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\writesrc.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.h"
|
||||
>
|
||||
</File>
|
||||
</Filter>
|
||||
<Filter
|
||||
Name="Resource Files"
|
||||
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filetools.cpp"
|
||||
>
|
||||
</Filter>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\filetools.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\flagparser.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\flagparser.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\package.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\package.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_genc.c"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_genc.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_gencmn.c"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_gencmn.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_icu.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_icu.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkg_imp.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\pkgitems.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\platform_xopen_source_extended.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\swapimpl.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\swapimpl.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\toolutil.cpp"
|
||||
>
|
||||
<FileConfiguration
|
||||
Name="Release|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|Win32"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Release|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
<FileConfiguration
|
||||
Name="Debug|x64"
|
||||
>
|
||||
<Tool
|
||||
Name="VCCLCompilerTool"
|
||||
DisableLanguageExtensions="false"
|
||||
/>
|
||||
</FileConfiguration>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\toolutil.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucbuf.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucbuf.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucm.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\ucmstate.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unewdata.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\unewdata.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uoptions.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uoptions.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uparse.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\uparse.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\writesrc.c"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\writesrc.h"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath=".\xmlparser.h"
|
||||
>
|
||||
</File>
|
||||
</Files>
|
||||
<Globals>
|
||||
</Globals>
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999,2008, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -14,6 +14,7 @@
|
|||
* created by: Markus W. Scherer
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "unicode/utypes.h"
|
||||
#include "unicode/putil.h"
|
||||
#include "unicode/ustring.h"
|
||||
|
@ -162,6 +163,33 @@ udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode) {
|
|||
return fileLength;
|
||||
}
|
||||
|
||||
/* dummy UDataInfo cf. udata.h */
|
||||
static const UDataInfo dummyDataInfo = {
|
||||
sizeof(UDataInfo),
|
||||
0,
|
||||
|
||||
U_IS_BIG_ENDIAN,
|
||||
U_CHARSET_FAMILY,
|
||||
U_SIZEOF_UCHAR,
|
||||
0,
|
||||
|
||||
{ 0, 0, 0, 0 }, /* dummy dataFormat */
|
||||
{ 0, 0, 0, 0 }, /* dummy formatVersion */
|
||||
{ 0, 0, 0, 0 } /* dummy dataVersion */
|
||||
};
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode) {
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
udata_finish(udata_create(dir, type, name, &dummyDataInfo, NULL, pErrorCode), pErrorCode);
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
fprintf(stderr, "error %s writing dummy data file %s" U_FILE_SEP_STRING "%s.%s\n",
|
||||
u_errorName(*pErrorCode), dir, name, type);
|
||||
exit(*pErrorCode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
udata_write8(UNewDataMemory *pData, uint8_t byte) {
|
||||
if(pData!=NULL && pData->file!=NULL) {
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 1999-2000, International Business Machines
|
||||
* Copyright (C) 1999-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -66,6 +66,10 @@ udata_create(const char *dir, const char *type, const char *name,
|
|||
U_CAPI uint32_t U_EXPORT2
|
||||
udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode);
|
||||
|
||||
/** @memo Write a dummy data file. */
|
||||
U_CAPI void U_EXPORT2
|
||||
udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode);
|
||||
|
||||
/** @memo Write an 8-bit byte to the file. */
|
||||
U_CAPI void U_EXPORT2
|
||||
udata_write8(UNewDataMemory *pData, uint8_t byte);
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2009, International Business Machines
|
||||
* Copyright (C) 2000-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -81,7 +81,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
|
|||
char *start, *limit;
|
||||
int32_t i, length;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -193,7 +193,7 @@ u_parseCodePoints(const char *s,
|
|||
uint32_t value;
|
||||
int32_t count;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
|
||||
|
@ -242,7 +242,7 @@ u_parseString(const char *s,
|
|||
uint32_t value;
|
||||
int32_t destLength;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
|
||||
|
@ -275,15 +275,16 @@ u_parseString(const char *s,
|
|||
}
|
||||
|
||||
/* store the first code point */
|
||||
if(destLength==0 && pFirst!=NULL) {
|
||||
if(pFirst!=NULL) {
|
||||
*pFirst=value;
|
||||
pFirst=NULL;
|
||||
}
|
||||
|
||||
/* append it to the destination array */
|
||||
if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
|
||||
UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
|
||||
if((destLength+U16_LENGTH(value))<=destCapacity) {
|
||||
U16_APPEND_UNSAFE(dest, destLength, value);
|
||||
} else {
|
||||
destLength+=UTF_CHAR_LENGTH(value);
|
||||
destLength+=U16_LENGTH(value);
|
||||
}
|
||||
|
||||
/* go to the following characters */
|
||||
|
@ -293,13 +294,14 @@ u_parseString(const char *s,
|
|||
|
||||
/* read a range like start or start..end */
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_parseCodePointRange(const char *s,
|
||||
uint32_t *pStart, uint32_t *pEnd,
|
||||
UErrorCode *pErrorCode) {
|
||||
u_parseCodePointRangeAnyTerminator(const char *s,
|
||||
uint32_t *pStart, uint32_t *pEnd,
|
||||
const char **terminator,
|
||||
UErrorCode *pErrorCode) {
|
||||
char *end;
|
||||
uint32_t value;
|
||||
|
||||
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
|
||||
if(U_FAILURE(*pErrorCode)) {
|
||||
return 0;
|
||||
}
|
||||
if(s==NULL || pStart==NULL || pEnd==NULL) {
|
||||
|
@ -307,15 +309,10 @@ u_parseCodePointRange(const char *s,
|
|||
return 0;
|
||||
}
|
||||
|
||||
s=u_skipWhitespace(s);
|
||||
if(*s==';' || *s==0) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* read the start code point */
|
||||
s=u_skipWhitespace(s);
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 16);
|
||||
if(end<=s || (!IS_INV_WHITESPACE(*end) && *end!='.' && *end!=';' && *end!=0) || value>=0x110000) {
|
||||
if(end<=s || value>=0x110000) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
@ -323,19 +320,15 @@ u_parseCodePointRange(const char *s,
|
|||
|
||||
/* is there a "..end"? */
|
||||
s=u_skipWhitespace(end);
|
||||
if(*s==';' || *s==0) {
|
||||
if(*s!='.' || s[1]!='.') {
|
||||
*terminator=end;
|
||||
return 1;
|
||||
}
|
||||
|
||||
if(*s!='.' || s[1]!='.') {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
s+=2;
|
||||
s=u_skipWhitespace(s+2);
|
||||
|
||||
/* read the end code point */
|
||||
value=(uint32_t)uprv_strtoul(s, &end, 16);
|
||||
if(end<=s || (!IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
|
||||
if(end<=s || value>=0x110000) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
|
@ -347,14 +340,25 @@ u_parseCodePointRange(const char *s,
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* no garbage after that? */
|
||||
s=u_skipWhitespace(end);
|
||||
if(*s==';' || *s==0) {
|
||||
return value-*pStart+1;
|
||||
} else {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
*terminator=end;
|
||||
return value-*pStart+1;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_parseCodePointRange(const char *s,
|
||||
uint32_t *pStart, uint32_t *pEnd,
|
||||
UErrorCode *pErrorCode) {
|
||||
const char *terminator;
|
||||
int32_t rangeLength=
|
||||
u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &terminator, pErrorCode);
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
terminator=u_skipWhitespace(terminator);
|
||||
if(*terminator!=';' && *terminator!=0) {
|
||||
*pErrorCode=U_PARSE_ERROR;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return rangeLength;
|
||||
}
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2000-2009, International Business Machines
|
||||
* Copyright (C) 2000-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -126,6 +126,16 @@ u_parseCodePointRange(const char *s,
|
|||
uint32_t *pStart, uint32_t *pEnd,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
/**
|
||||
* Same as u_parseCodePointRange() but the range may be terminated by
|
||||
* any character. The position of the terminating character is returned via
|
||||
* the *terminator output parameter.
|
||||
*/
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_parseCodePointRangeAnyTerminator(const char *s,
|
||||
uint32_t *pStart, uint32_t *pEnd,
|
||||
const char **terminator,
|
||||
UErrorCode *pErrorCode);
|
||||
|
||||
U_CAPI int32_t U_EXPORT2
|
||||
u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -24,8 +24,8 @@
|
|||
#include "cstring.h"
|
||||
#include "writesrc.h"
|
||||
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_create(const char *path, const char *filename) {
|
||||
static FILE *
|
||||
usrc_createWithHeader(const char *path, const char *filename, const char *header) {
|
||||
char buffer[1024];
|
||||
const char *p;
|
||||
char *q;
|
||||
|
@ -55,19 +55,7 @@ usrc_create(const char *path, const char *filename) {
|
|||
lt=localtime(&t);
|
||||
strftime(year, sizeof(year), "%Y", lt);
|
||||
strftime(buffer, sizeof(buffer), "%Y-%m-%d", lt);
|
||||
fprintf(
|
||||
f,
|
||||
"/*\n"
|
||||
" * Copyright (C) 1999-%s, International Business Machines\n"
|
||||
" * Corporation and others. All Rights Reserved.\n"
|
||||
" *\n"
|
||||
" * file name: %s\n"
|
||||
" *\n"
|
||||
" * machine-generated on: %s\n"
|
||||
" */\n\n",
|
||||
year,
|
||||
filename,
|
||||
buffer);
|
||||
fprintf(f, header, year, filename, buffer);
|
||||
} else {
|
||||
fprintf(
|
||||
stderr,
|
||||
|
@ -77,6 +65,33 @@ usrc_create(const char *path, const char *filename) {
|
|||
return f;
|
||||
}
|
||||
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_create(const char *path, const char *filename) {
|
||||
const char *header=
|
||||
"/*\n"
|
||||
" * Copyright (C) 1999-%s, International Business Machines\n"
|
||||
" * Corporation and others. All Rights Reserved.\n"
|
||||
" *\n"
|
||||
" * file name: %s\n"
|
||||
" *\n"
|
||||
" * machine-generated on: %s\n"
|
||||
" */\n\n";
|
||||
return usrc_createWithHeader(path, filename, header);
|
||||
}
|
||||
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_createTextData(const char *path, const char *filename) {
|
||||
const char *header=
|
||||
"# Copyright (C) 1999-%s, International Business Machines\n"
|
||||
"# Corporation and others. All Rights Reserved.\n"
|
||||
"#\n"
|
||||
"# file name: %s\n"
|
||||
"#\n"
|
||||
"# machine-generated on: %s\n"
|
||||
"#\n\n";
|
||||
return usrc_createWithHeader(path, filename, header);
|
||||
}
|
||||
|
||||
U_CAPI void U_EXPORT2
|
||||
usrc_writeArray(FILE *f,
|
||||
const char *prefix,
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
/*
|
||||
*******************************************************************************
|
||||
*
|
||||
* Copyright (C) 2005-2008, International Business Machines
|
||||
* Copyright (C) 2005-2010, International Business Machines
|
||||
* Corporation and others. All Rights Reserved.
|
||||
*
|
||||
*******************************************************************************
|
||||
|
@ -25,10 +25,18 @@
|
|||
|
||||
/**
|
||||
* Create a source text file and write a header comment with the ICU copyright.
|
||||
* Writes a C/Java-style comment.
|
||||
*/
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_create(const char *path, const char *filename);
|
||||
|
||||
/**
|
||||
* Create a source text file and write a header comment with the ICU copyright.
|
||||
* Writes the comment with # lines, as used in scripts and text data.
|
||||
*/
|
||||
U_CAPI FILE * U_EXPORT2
|
||||
usrc_createTextData(const char *path, const char *filename);
|
||||
|
||||
/**
|
||||
* Write the contents of an array of 8/16/32-bit words.
|
||||
* The prefix and postfix are optional (can be NULL) and are written first/last.
|
||||
|
|
Loading…
Add table
Reference in a new issue