ICU-7273 merge in Normalizer2 API & code, and ICU-5785 UnicodeSet::span(UnicodeString) and ICU-7296 tempSubString()/retainBetween(); merge -r 26971:27150 branches/markus/norm2

X-SVN-Rev: 27155
This commit is contained in:
Markus Scherer 2010-01-06 23:50:03 +00:00
parent 11acc7e54f
commit 8ddbd1394c
98 changed files with 24433 additions and 8028 deletions

4
.gitattributes vendored
View file

@ -49,6 +49,10 @@ README text !eol
*.tri2 -text
icu4c/icu4c.css -text
icu4c/source/data/in/nfc.nrm -text
icu4c/source/data/in/nfkc.nrm -text
icu4c/source/data/in/nfkc_cf.nrm -text
icu4c/source/data/in/unorm.icu -text
icu4c/source/data/locales/pool.res -text
icu4c/source/samples/ucnv/data02.bin -text
icu4c/source/test/perf/README -text

14
.gitignore vendored
View file

@ -560,6 +560,20 @@ icu4c/source/tools/gennorm/gennorm.vcproj.*.*.user
icu4c/source/tools/gennorm/release
icu4c/source/tools/gennorm/x64
icu4c/source/tools/gennorm/x86
icu4c/source/tools/gennorm2/*.d
icu4c/source/tools/gennorm2/*.o
icu4c/source/tools/gennorm2/*.pdb
icu4c/source/tools/gennorm2/*.plg
icu4c/source/tools/gennorm2/Debug
icu4c/source/tools/gennorm2/Makefile
icu4c/source/tools/gennorm2/Release
icu4c/source/tools/gennorm2/debug
icu4c/source/tools/gennorm2/gennorm2
icu4c/source/tools/gennorm2/gennorm2.[0-9]
icu4c/source/tools/gennorm2/gennorm2.vcproj.*.*.user
icu4c/source/tools/gennorm2/release
icu4c/source/tools/gennorm2/x64
icu4c/source/tools/gennorm2/x86
icu4c/source/tools/genpname/*.d
icu4c/source/tools/genpname/*.o
icu4c/source/tools/genpname/*.pdb

View file

@ -1,5 +1,5 @@
Microsoft Visual Studio Solution File, Format Version 10.00
# Visual Studio 2008
# Visual C++ Express 2008
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cal", "..\samples\cal\cal.vcproj", "{F7659D77-09CF-4FE9-ACEE-927287AA9509}"
ProjectSection(ProjectDependencies) = postProject
{0178B127-6269-407D-B112-93877BB62776} = {0178B127-6269-407D-B112-93877BB62776}
@ -259,6 +259,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gencfu", "..\tools\gencfu\g
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gennorm2", "..\tools\gennorm2\gennorm2.vcproj", "{C7891A65-80AB-4245-912E-5F1E17B0E6C4}"
ProjectSection(ProjectDependencies) = postProject
{6B231032-3CB5-4EED-9210-810D666A23A0} = {6B231032-3CB5-4EED-9210-810D666A23A0}
{73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D} = {73C0A65B-D1F2-4DE1-B3A6-15DAD2C23F3D}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Win32 = Debug|Win32
@ -555,6 +561,14 @@ Global
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|Win32.Build.0 = Release|Win32
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.ActiveCfg = Release|x64
{691EE0C0-DC57-4A48-8AEE-8ED75EB3A057}.Release|x64.Build.0 = Release|x64
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|Win32.ActiveCfg = Debug|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|Win32.Build.0 = Debug|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|x64.ActiveCfg = Debug|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Debug|x64.Build.0 = Debug|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|Win32.ActiveCfg = Release|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|Win32.Build.0 = Release|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|x64.ActiveCfg = Release|Win32
{C7891A65-80AB-4245-912E-5F1E17B0E6C4}.Release|x64.Build.0 = Release|Win32
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View file

@ -1,6 +1,6 @@
#******************************************************************************
#
# Copyright (C) 1999-2009, International Business Machines
# Copyright (C) 1999-2010, International Business Machines
# Corporation and others. All Rights Reserved.
#
#******************************************************************************
@ -78,7 +78,8 @@ ucat.o locmap.o uloc.o locid.o locutil.o \
bytestream.o stringpiece.o \
ustr_cnv.o unistr_cnv.o unistr.o unistr_case.o unistr_props.o \
utf_impl.o ustring.o ustrcase.o ucasemap.o cstring.o ustrfmt.o ustrtrns.o ustr_wcs.o utext.o \
normlzr.o unorm.o unormcmp.o unorm_it.o chariter.o schriter.o uchriter.o uiter.o \
normalizer2impl.o normalizer2.o filterednormalizer2.o normlzr.o unorm.o unormcmp.o unorm_it.o \
chariter.o schriter.o uchriter.o uiter.o \
uchar.o uprops.o ucase.o propname.o ubidi_props.o ubidi.o ubidiwrt.o ubidiln.o ushape.o \
uscript.o usc_impl.o unames.o \
utrie.o utrie2.o utrie2_builder.o bmpset.o unisetspan.o uset_props.o uniset_props.o uset.o uniset.o usetiter.o ruleiter.o caniter.o unifilt.o unifunct.o \

View file

@ -1,6 +1,6 @@
/*
*****************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and *
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*****************************************************************************
*/
@ -12,6 +12,7 @@
#include "unicode/uset.h"
#include "unicode/ustring.h"
#include "hash.h"
#include "normalizer2impl.h"
#include "unormimp.h"
#include "unicode/caniter.h"
#include "unicode/normlzr.h"
@ -68,7 +69,8 @@ CanonicalIterator::CanonicalIterator(const UnicodeString &sourceStr, UErrorCode
pieces_length(0),
pieces_lengths(NULL),
current(NULL),
current_length(0)
current_length(0),
nfd(*Normalizer2Factory::getNFDInstance(status))
{
if(U_SUCCESS(status)) {
setSource(sourceStr, status);
@ -499,73 +501,39 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
return NULL;
}
const int32_t bufSize = 256;
int32_t bufLen = 0;
UChar temp[bufSize];
int32_t inputLen = 0, decompLen;
UChar stackBuffer[4];
const UChar *decomp;
U16_APPEND_UNSAFE(temp, inputLen, comp);
decomp = unorm_getCanonicalDecomposition(comp, stackBuffer, &decompLen);
if(decomp == NULL) {
/* copy temp */
stackBuffer[0] = temp[0];
if(inputLen > 1) {
stackBuffer[1] = temp[1];
}
decomp = stackBuffer;
decompLen = inputLen;
}
UChar *buff = temp+inputLen;
UnicodeString temp(comp);
int32_t inputLen=temp.length();
UnicodeString decompString;
nfd.normalize(temp, decompString, status);
const UChar *decomp=decompString.getBuffer();
int32_t decompLen=decompString.length();
// See if it matches the start of segment (at segmentPos)
UBool ok = FALSE;
UChar32 cp;
int32_t decompPos = 0;
UChar32 decompCp;
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
U16_NEXT(decomp, decompPos, decompLen, decompCp);
int32_t i;
UBool overflow = FALSE;
i = segmentPos;
int32_t i = segmentPos;
while(i < segLen) {
UTF_NEXT_CHAR(segment, i, segLen, cp);
U16_NEXT(segment, i, segLen, cp);
if (cp == decompCp) { // if equal, eat another cp from decomp
//if (PROGRESS) printf(" matches: %s\n", UToS(Tr(UnicodeString(cp))));
if (decompPos == decompLen) { // done, have all decomp characters!
//u_strcat(buff+bufLen, segment+i);
uprv_memcpy(buff+bufLen, segment+i, (segLen-i)*sizeof(UChar));
bufLen+=segLen-i;
temp.append(segment+i, segLen-i);
ok = TRUE;
break;
}
UTF_NEXT_CHAR(decomp, decompPos, decompLen, decompCp);
U16_NEXT(decomp, decompPos, decompLen, decompCp);
} else {
//if (PROGRESS) printf(" buffer: %s\n", UToS(Tr(UnicodeString(cp))));
// brute force approach
U16_APPEND(buff, bufLen, bufSize, cp, overflow);
if(overflow) {
/*
* ### TODO handle buffer overflow
* The buffer is large, but an overflow may still happen with
* unusual input (many combining marks?).
* Reallocate buffer and continue.
* markus 20020929
*/
overflow = FALSE;
}
temp.append(cp);
/* TODO: optimize
// since we know that the classes are monotonically increasing, after zero
@ -585,25 +553,20 @@ Hashtable *CanonicalIterator::extract(Hashtable *fillinResult, UChar32 comp, con
//if (PROGRESS) printf("Matches\n");
if (bufLen == 0) {
if (inputLen == temp.length()) {
fillinResult->put(UnicodeString(), new UnicodeString(), status);
return fillinResult; // succeed, but no remainder
}
// brute force approach
// check to make sure result is canonically equivalent
int32_t tempLen = inputLen + bufLen;
UChar trial[bufSize];
unorm_decompose(trial, bufSize, temp, tempLen, FALSE, 0, &status);
if(U_FAILURE(status)
|| uprv_memcmp(segment+segmentPos, trial, (segLen - segmentPos)*sizeof(UChar)) != 0)
{
UnicodeString trial;
nfd.normalize(temp, trial, status);
if(U_FAILURE(status) || trial.compare(segment+segmentPos, segLen - segmentPos) != 0) {
return NULL;
}
return getEquivalents2(fillinResult, buff, bufLen, status);
return getEquivalents2(fillinResult, temp.getBuffer()+inputLen, temp.length()-inputLen, status);
}
U_NAMESPACE_END

View file

@ -769,7 +769,7 @@
Name="collation"
>
<File
RelativePath=".\ucol_swp.c"
RelativePath=".\ucol_swp.cpp"
>
<FileConfiguration
Name="Release|Win32"
@ -961,7 +961,7 @@
>
</File>
<File
RelativePath=".\utrie2.c"
RelativePath=".\utrie2.cpp"
>
</File>
<File
@ -1172,6 +1172,10 @@
RelativePath=".\locmap.h"
>
</File>
<File
RelativePath=".\mutex.cpp"
>
</File>
<File
RelativePath=".\mutex.h"
>
@ -3057,6 +3061,62 @@
/>
</FileConfiguration>
</File>
<File
RelativePath=".\filterednormalizer2.cpp"
>
</File>
<File
RelativePath=".\normalizer2.cpp"
>
</File>
<File
RelativePath=".\unicode\normalizer2.h"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\normalizer2impl.cpp"
>
</File>
<File
RelativePath=".\normalizer2impl.h"
>
</File>
<File
RelativePath=".\normlzr.cpp"
>
@ -3145,6 +3205,46 @@
/>
</FileConfiguration>
</File>
<File
RelativePath=".\unicode\unorm2.h"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(InputPath)&quot; ..\..\include\unicode&#x0D;&#x0A;"
Outputs="..\..\include\unicode\$(InputFileName)"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\unorm_it.c"
>
@ -3470,7 +3570,7 @@
>
</File>
<File
RelativePath=".\uprops.c"
RelativePath=".\uprops.cpp"
>
</File>
<File

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1997-2006, International Business Machines
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -81,4 +81,15 @@ uprv_arrayCopy(const U_NAMESPACE_QUALIFIER UnicodeString *src, int32_t srcStart,
U_NAMESPACE_QUALIFIER UnicodeString *dst, int32_t dstStart, int32_t count)
{ uprv_arrayCopy(src+srcStart, dst+dstStart, count); }
/**
* Checks that the string is readable and writable.
* Sets U_ILLEGAL_ARGUMENT_ERROR if the string isBogus() or has an open getBuffer().
*/
inline void
uprv_checkCanGetBuffer(const UnicodeString &s, UErrorCode &errorCode) {
if(U_SUCCESS(errorCode) && s.isBogus()) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
}
}
#endif /* _CPPUTILS */

View file

@ -0,0 +1,261 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: filterednormalizer2.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009dec10
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"
U_NAMESPACE_BEGIN
/**
 * Writes the filtered normalization of src into dest, replacing dest's contents.
 * On an incoming failure or a bogus src, dest is set to bogus.
 * If dest and src are the same object, sets U_ILLEGAL_ARGUMENT_ERROR and
 * returns dest unchanged.
 */
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(src, errorCode);
    if(U_SUCCESS(errorCode)) {
        if(&dest!=&src) {
            dest.remove();
            // Start with the in-filter condition; the worker alternates from there.
            return normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
        }
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
    } else {
        dest.setToBogus();
    }
    return dest;
}
// Internal worker: performs no argument checking, and appends to dest.
// The caller passes as spanCondition the condition that is likely to produce
// a non-empty span at the start of src.
// Example: for set=[:age=3.2:], almost all common characters are in the set,
// so USET_SPAN_SIMPLE fits the start of src, while USET_SPAN_NOT_CONTAINED
// fits a continuation right after an in-filter prefix.
UnicodeString &
FilteredNormalizer2::normalize(const UnicodeString &src,
                               UnicodeString &dest,
                               USetSpanCondition spanCondition,
                               UErrorCode &errorCode) const {
    // Declared outside the loop so that its buffer is kept between iterations.
    UnicodeString normalized;
    int32_t start=0;
    while(start<src.length()) {
        const int32_t limit=set.span(src, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span: normalize it on its own and append the result.
            if(limit!=start) {
                // Deliberately not norm2.normalizeSecondAndAppend(): that could
                // modify the non-filter text that is already in dest.
                const UnicodeString piece(src.tempSubStringBetween(start, limit));
                dest.append(norm2.normalize(piece, normalized, errorCode));
                if(U_FAILURE(errorCode)) {
                    return dest;
                }
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            // Out-of-filter span: copy it verbatim.
            if(limit!=start) {
                dest.append(src, start, limit-start);
            }
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return dest;
}
// Public entry point: forwards to the shared worker with normalization enabled.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UErrorCode &errorCode) const {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
// Public entry point: forwards to the shared worker with normalization disabled.
UnicodeString &
FilteredNormalizer2::append(UnicodeString &first,
                            const UnicodeString &second,
                            UErrorCode &errorCode) const {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
}
// Shared worker behind normalizeSecondAndAppend() and append().
// Rejects bogus strings and first==second with U_ILLEGAL_ARGUMENT_ERROR.
// The in-filter prefix of second is joined with the in-filter suffix of first
// through norm2; the remainder of second is then appended, filtered-normalized
// if doNormalize is set and verbatim otherwise.
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
                                              const UnicodeString &second,
                                              UBool doNormalize,
                                              UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(first, errorCode);
    uprv_checkCanGetBuffer(second, errorCode);
    if(U_FAILURE(errorCode)) {
        return first;
    }
    if(&first==&second) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return first;
    }
    if(first.isEmpty()) {
        // Nothing to merge with: the result is just (normalized) second.
        if(!doNormalize) {
            first=second;
            return first;
        }
        return normalize(second, first, errorCode);
    }
    // Join the in-filter tail of first with the in-filter head of second.
    const int32_t headLimit=set.span(second, 0, USET_SPAN_SIMPLE);
    if(headLimit!=0) {
        UnicodeString head(second.tempSubString(0, headLimit));
        const int32_t tailStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
        if(tailStart!=0) {
            // Only the in-filter tail of first may be touched by norm2:
            // work on a copy of it and splice the result back in.
            UnicodeString tail(first, tailStart, INT32_MAX);
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(tail, head, errorCode);
            } else {
                norm2.append(tail, head, errorCode);
            }
            first.replace(tailStart, INT32_MAX, tail);
        } else {
            // All of first is in the filter set: operate on it directly.
            if(doNormalize) {
                norm2.normalizeSecondAndAppend(first, head, errorCode);
            } else {
                norm2.append(first, head, errorCode);
            }
        }
    }
    // Append whatever follows the in-filter head of second.
    if(headLimit<second.length()) {
        UnicodeString remainder(second.tempSubString(headLimit, INT32_MAX));
        if(doNormalize) {
            normalize(remainder, first, USET_SPAN_NOT_CONTAINED, errorCode);
        } else {
            first.append(remainder);
        }
    }
    return first;
}
// Returns TRUE if every in-filter span of s is normalized according to norm2.
// Out-of-filter spans are skipped without inspection.
// Returns FALSE on a bogus s, an incoming failure, or a failure reported by norm2.
UBool
FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span: it must be normalized, and the check must not fail.
            if(!(norm2.isNormalized(s.tempSubStringBetween(start, limit), errorCode) &&
                 U_SUCCESS(errorCode))
            ) {
                return FALSE;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return TRUE;
}
// Quick check over the in-filter spans of s; out-of-filter spans are skipped.
// Returns UNORM_NO as soon as any in-filter span is definitely not normalized.
// A UNORM_MAYBE span is remembered but does not end the scan, so that a later
// UNORM_NO span still takes precedence over an earlier UNORM_MAYBE.
// Returns UNORM_YES only if every in-filter span yields UNORM_YES.
// On a bogus s or an incoming failure, returns UNORM_MAYBE; if norm2 reports
// a failure, returns norm2's result for that span.
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return UNORM_MAYBE;
    }
    UNormalizationCheckResult result=UNORM_YES;
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) {
        int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition);
        if(spanCondition==USET_SPAN_NOT_CONTAINED) {
            spanCondition=USET_SPAN_SIMPLE;
        } else {
            UNormalizationCheckResult qcResult=
                norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode);
            if(U_FAILURE(errorCode) || qcResult==UNORM_NO) {
                return qcResult;
            } else if(qcResult==UNORM_MAYBE) {
                // Keep scanning: a later span may still be a definite NO.
                result=qcResult;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        }
        prevSpanLimit=spanLimit;
    }
    return result;
}
// Returns the end index of the initial part of s for which norm2's
// spanQuickCheckYes() covers every in-filter span completely.
// Out-of-filter spans are always counted as covered.
// Returns 0 on a bogus s or an incoming failure.
int32_t
FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
    if(U_FAILURE(errorCode)) {
        return 0;
    }
    USetSpanCondition spanCondition=USET_SPAN_SIMPLE;
    int32_t start=0;
    while(start<s.length()) {
        const int32_t limit=set.span(s, start, spanCondition);
        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
            // In-filter span: ask norm2 how far the "yes" prefix of this span reaches.
            const int32_t yesLength=
                norm2.spanQuickCheckYes(s.tempSubStringBetween(start, limit), errorCode);
            const int32_t yesEnd=start+yesLength;
            if(U_FAILURE(errorCode) || yesEnd<limit) {
                return yesEnd;
            }
            spanCondition=USET_SPAN_NOT_CONTAINED;
        } else {
            spanCondition=USET_SPAN_SIMPLE;
        }
        start=limit;
    }
    return s.length();
}
// TRUE if c is outside the filter set; otherwise defers to norm2.hasBoundaryBefore(c).
UBool
FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
    return !(set.contains(c) && !norm2.hasBoundaryBefore(c));
}
// TRUE if c is outside the filter set; otherwise defers to norm2.hasBoundaryAfter(c).
UBool
FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
    return !(set.contains(c) && !norm2.hasBoundaryAfter(c));
}
// TRUE if c is outside the filter set; otherwise defers to norm2.isInert(c).
UBool
FilteredNormalizer2::isInert(UChar32 c) const {
    return !(set.contains(c) && !norm2.isInert(c));
}
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(FilteredNormalizer2)
/**
 * C API: creates a FilteredNormalizer2 that wraps norm2 and filterSet.
 *
 * @param norm2      the wrapped normalizer; must not be NULL
 * @param filterSet  the filter set; must not be NULL
 * @param pErrorCode in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR if
 *                   norm2 or filterSet is NULL, and to U_MEMORY_ALLOCATION_ERROR
 *                   if the new object cannot be allocated.
 * @return the new normalizer, or NULL on failure (including an incoming failure)
 */
U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    // Both pointers are dereferenced below, so reject NULL for either of them.
    if(norm2==NULL || filterSet==NULL) {
        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
                                             *UnicodeSet::fromUSet(filterSet));
    if(fn2==NULL) {
        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    }
    return (UNormalizer2 *)fn2;
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_NORMALIZATION

View file

@ -1,18 +1,91 @@
/**
/*
*******************************************************************************
* Copyright (C) 2008, International Business Machines Corporation. *
* All Rights Reserved. *
*
* Copyright (C) 2008-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: mutex.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*/
#include "unicode/utypes.h"
#include "mutex.h"
U_NAMESPACE_BEGIN
// Returns the stored instance, creating it through the instantiator if
// fInstance was still NULL when checked.
// duplicate is always reset to NULL first. It receives the freshly created
// object when that object is not stored, that is, when fInstance was already
// set by the time the Mutex is held, or when the instantiator reported a failure.
// Returns NULL right away on an incoming failure.
void *SimpleSingleton::getInstance(InstantiatorFn *instantiator, const void *context,
                                   void *&duplicate,
                                   UErrorCode &errorCode) {
    duplicate=NULL;
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    void *existing;
    UMTX_CHECK(NULL, fInstance, existing);
    if(existing!=NULL) {
        return existing;
    }
    // The instance is created before the Mutex is taken.
    void *created=instantiator(context, errorCode);
    Mutex lock;
    if(fInstance!=NULL || U_FAILURE(errorCode)) {
        duplicate=created;
    } else {
        fInstance=created;
    }
    return fInstance;
}
// Returns the stored instance, or NULL if creation failed.
// fHaveInstance tracks the state: >0 created, <0 creation failed, 0 not attempted.
// In the failed state, errorCode is set to the remembered fErrorCode.
// duplicate is always reset to NULL first; it receives the freshly created
// object whenever that object is not stored into fInstance.
// Returns NULL right away on an incoming failure.
void *TriStateSingleton::getInstance(InstantiatorFn *instantiator, const void *context,
                                     void *&duplicate,
                                     UErrorCode &errorCode) {
    duplicate=NULL;
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    int8_t state;
    UMTX_CHECK(NULL, fHaveInstance, state);
    if(state>0) {
        return fInstance;  // instance was created
    }
    if(state<0) {
        errorCode=fErrorCode;  // instance creation failed
        return NULL;
    }
    // state==0: not attempted yet, as far as this call has seen.
    void *created=instantiator(context, errorCode);
    Mutex lock;
    if(fHaveInstance!=0) {
        // The state was already decided; report the remembered error code.
        errorCode=fErrorCode;
    } else if(U_FAILURE(errorCode)) {
        fErrorCode=errorCode;
        fHaveInstance=-1;
    } else {
        fInstance=created;
        created=NULL;  // now stored in fInstance, so it is not a duplicate
        fHaveInstance=1;
    }
    duplicate=created;
    return fInstance;
}
// Puts all three fields back to the values used by the static initializer:
// no instance, U_ZERO_ERROR, and fHaveInstance==0 (creation not attempted).
// Does not delete the object that fInstance pointed to and takes no Mutex.
void TriStateSingleton::reset() {
fInstance=NULL;
fErrorCode=U_ZERO_ERROR;
fHaveInstance=0;
}
#if UCONFIG_NO_SERVICE
/* If UCONFIG_NO_SERVICE, then there is no invocation of Mutex elsewhere in
common, so add one here to force an export */
#include "mutex.h"
static Mutex *aMutex = 0;
/* UCONFIG_NO_SERVICE */
#endif
U_NAMESPACE_END

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 1997-2009, International Business Machines
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -71,6 +71,128 @@ inline Mutex::~Mutex()
umtx_unlock(fMutex);
}
// common code for singletons ---------------------------------------------- ***
/**
 * Function type for the instantiator parameter (passed as InstantiatorFn *) of
 * SimpleSingleton::getInstance() and TriStateSingleton::getInstance().
 * The function creates some object, optionally using the context parameter,
 * and returns it as a void pointer.
 * The function need not check for U_FAILURE(errorCode).
 *
 * @param context   caller-supplied value that getInstance() passes through unchanged
 * @param errorCode in/out ICU error code, for the function to report a failure
 */
typedef void *InstantiatorFn(const void *context, UErrorCode &errorCode);
/**
 * Singleton holder with shared instantiation and mutexing code.
 * "Simple" means that it does not remember whether an earlier instantiation failed.
 * Suited for cases where instantiation can really only fail with an
 * out-of-memory error; use a TriStateSingleton otherwise.
 * Best used via SimpleSingletonWrapper or similar.
 * Static instances are defined with the STATIC_SIMPLE_SINGLETON macro.
 */
struct SimpleSingleton {
    void *fInstance;

    /**
     * Returns the singleton instance, or NULL if it could not be created.
     * If the instance does not exist yet, the instantiator is called with the context.
     * In a race condition, duplicate may come back non-NULL;
     * the caller must then delete the duplicate.
     * The caller does not need to initialize duplicate before the call.
     */
    void *getInstance(InstantiatorFn *instantiator, const void *context,
                      void *&duplicate,
                      UErrorCode &errorCode);

    /**
     * Clears the fields. The caller must already have deleted the singleton instance.
     * Not mutexed.
     * Meant to be called from a cleanup function.
     */
    void reset() {
        fInstance=NULL;
    }
};

#define STATIC_SIMPLE_SINGLETON(name) static SimpleSingleton name={ NULL }
/**
 * Convenience wrapper for a SimpleSingleton.
 * Intended as a short-lived stack object that makes the SimpleSingleton easier to use:
 * it deletes the duplicate and does the casting to and from T.
 */
template<typename T>
class SimpleSingletonWrapper {
public:
    SimpleSingletonWrapper(SimpleSingleton &s) : singleton(s) {}

    /** Deletes the stored instance as a T and resets the wrapped singleton. */
    void deleteInstance() {
        T *stored=(T *)singleton.fInstance;
        delete stored;
        singleton.reset();
    }

    /**
     * Returns the singleton instance as a T pointer, as delivered by
     * SimpleSingleton::getInstance(); a duplicate, if any, is deleted here.
     */
    T *getInstance(InstantiatorFn *instantiator, const void *context,
                   UErrorCode &errorCode) {
        void *extra;
        void *result=singleton.getInstance(instantiator, context, extra, errorCode);
        delete (T *)extra;
        return (T *)result;
    }
private:
    SimpleSingleton &singleton;
};
/**
 * Singleton holder with shared instantiation and mutexing code.
 * "Tri-state" means that it records whether instantiation
 * succeeded, failed, or has not been attempted yet.
 * Best used via TriStateSingletonWrapper or similar.
 * Static instances are defined with the STATIC_TRI_STATE_SINGLETON macro.
 */
struct TriStateSingleton {
    void *fInstance;
    UErrorCode fErrorCode;
    int8_t fHaveInstance;

    /**
     * Returns the singleton instance, or NULL if it could not be created.
     * If the instance does not exist yet, the instantiator is called with the context.
     * In a race condition, duplicate may come back non-NULL;
     * the caller must then delete the duplicate.
     * The caller does not need to initialize duplicate before the call.
     * Creation is attempted only once: after a failure,
     * the singleton keeps returning NULL.
     */
    void *getInstance(InstantiatorFn *instantiator, const void *context,
                      void *&duplicate,
                      UErrorCode &errorCode);

    /**
     * Clears the fields. The caller must already have deleted the singleton instance.
     * Not mutexed.
     * Meant to be called from a cleanup function.
     */
    void reset();
};

#define STATIC_TRI_STATE_SINGLETON(name) static TriStateSingleton name={ NULL, U_ZERO_ERROR, 0 }
/**
 * Convenience wrapper for a TriStateSingleton.
 * Intended as a short-lived stack object that makes the TriStateSingleton easier to use:
 * it deletes the duplicate and does the casting to and from T.
 */
template<typename T>
class TriStateSingletonWrapper {
public:
    TriStateSingletonWrapper(TriStateSingleton &s) : singleton(s) {}

    /** Deletes the stored instance as a T and resets the wrapped singleton. */
    void deleteInstance() {
        T *stored=(T *)singleton.fInstance;
        delete stored;
        singleton.reset();
    }

    /**
     * Returns the singleton instance as a T pointer, as delivered by
     * TriStateSingleton::getInstance(); a duplicate, if any, is deleted here.
     */
    T *getInstance(InstantiatorFn *instantiator, const void *context,
                   UErrorCode &errorCode) {
        void *extra;
        void *result=singleton.getInstance(instantiator, context, extra, errorCode);
        delete (T *)extra;
        return (T *)result;
    }
private:
    TriStateSingleton &singleton;
};
U_NAMESPACE_END
#endif //_MUTEX_

View file

@ -0,0 +1,744 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/localpointer.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"
#include "cstring.h"
#include "mutex.h"
#include "normalizer2impl.h"
#include "ucln_cmn.h"
U_NAMESPACE_BEGIN
// Public API dispatch via Normalizer2 subclasses -------------------------- ***
// Normalizer2 implementation for the old UNORM_NONE.
// Nothing is ever modified: normalize() copies, the append functions
// concatenate, and every check reports the text as normalized.
// All three string-modifying functions reject an argument pair that is one
// and the same object with U_ILLEGAL_ARGUMENT_ERROR.
class NoopNormalizer2 : public Normalizer2 {
    // Copies src to dest unless an error is already set.
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const {
        if(U_SUCCESS(errorCode)) {
            if(&dest!=&src) {
                dest=src;
            } else {
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            }
        }
        return dest;
    }
    // Appends second to first as is; same aliasing rule as append().
    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const {
        if(U_SUCCESS(errorCode)) {
            if(&first!=&second) {
                first.append(second);
            } else {
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            }
        }
        return first;
    }
    // Appends second to first as is.
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const {
        if(U_SUCCESS(errorCode)) {
            if(&first!=&second) {
                first.append(second);
            } else {
                errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            }
        }
        return first;
    }
    // The checks below ignore their arguments (other than s.length()) and
    // do not look at or modify errorCode.
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
        return TRUE;
    }
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
        return UNORM_YES;
    }
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
        return s.length();
    }
    virtual UBool hasBoundaryBefore(UChar32 c) const { return TRUE; }
    virtual UBool hasBoundaryAfter(UChar32 c) const { return TRUE; }
    virtual UBool isInert(UChar32 c) const { return TRUE; }

    static UClassID U_EXPORT2 getStaticClassID();
    virtual UClassID getDynamicClassID() const;
};
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NoopNormalizer2)
// Intermediate base class:
// Holds a Normalizer2Impl reference and takes care of the boilerplate for the
// UnicodeString-based API (argument checking, ReorderingBuffer setup).
// The real work is forwarded to pure virtual functions that take
// UChar pointer ranges.
class Normalizer2WithImpl : public Normalizer2 {
public:
    Normalizer2WithImpl(const Normalizer2Impl &ni) : impl(ni) {}

    // normalize

    // Replaces dest with the normalized form of src.
    // On an incoming failure, dest==src, or src.getBuffer()==NULL,
    // dest is set to bogus; the latter two also set U_ILLEGAL_ARGUMENT_ERROR.
    virtual UnicodeString &
    normalize(const UnicodeString &src,
              UnicodeString &dest,
              UErrorCode &errorCode) const {
        if(U_SUCCESS(errorCode)) {
            const UChar *srcStart=src.getBuffer();
            if(&dest!=&src && srcStart!=NULL) {
                dest.remove();
                ReorderingBuffer buffer(impl, dest);
                if(buffer.init(src.length(), errorCode)) {
                    normalize(srcStart, srcStart+src.length(), buffer, errorCode);
                }
                return dest;
            }
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        dest.setToBogus();
        return dest;
    }
    // Subclass hook: normalizes [src, limit) into the buffer.
    virtual void
    normalize(const UChar *src, const UChar *limit,
              ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;

    // normalize and append

    virtual UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UErrorCode &errorCode) const {
        const UBool doNormalize=TRUE;
        return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
    }
    virtual UnicodeString &
    append(UnicodeString &first,
           const UnicodeString &second,
           UErrorCode &errorCode) const {
        const UBool doNormalize=FALSE;
        return normalizeSecondAndAppend(first, second, doNormalize, errorCode);
    }
    // Shared worker for the two functions above.
    // Sets U_ILLEGAL_ARGUMENT_ERROR if first is bogus, first==second,
    // or second.getBuffer()==NULL; first is returned in every case.
    UnicodeString &
    normalizeSecondAndAppend(UnicodeString &first,
                             const UnicodeString &second,
                             UBool doNormalize,
                             UErrorCode &errorCode) const {
        uprv_checkCanGetBuffer(first, errorCode);
        if(U_FAILURE(errorCode)) {
            return first;
        }
        const UChar *secondStart=second.getBuffer();
        if(&first!=&second && secondStart!=NULL) {
            ReorderingBuffer buffer(impl, first);
            if(buffer.init(first.length()+second.length(), errorCode)) {
                normalizeAndAppend(secondStart, secondStart+second.length(), doNormalize,
                                   buffer, errorCode);
            }
        } else {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        }
        return first;
    }
    // Subclass hook: appends [src, limit) to the buffer.
    virtual void
    normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
                       ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;

    // quick checks

    // TRUE if the pointer-based spanQuickCheckYes() reaches the end of s.
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
        if(U_FAILURE(errorCode)) {
            return FALSE;
        }
        const UChar *start=s.getBuffer();
        if(start==NULL) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return FALSE;
        }
        const UChar *limit=start+s.length();
        return spanQuickCheckYes(start, limit, errorCode)==limit;
    }
    // Maps this class's own isNormalized() (called non-virtually) to YES/NO.
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
        if(Normalizer2WithImpl::isNormalized(s, errorCode)) {
            return UNORM_YES;
        }
        return UNORM_NO;
    }
    // Converts the pointer result of the subclass hook into an index into s.
    virtual int32_t
    spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
        if(U_FAILURE(errorCode)) {
            return 0;
        }
        const UChar *start=s.getBuffer();
        if(start==NULL) {
            errorCode=U_ILLEGAL_ARGUMENT_ERROR;
            return 0;
        }
        const UChar *yesLimit=spanQuickCheckYes(start, start+s.length(), errorCode);
        return (int32_t)(yesLimit-start);
    }
    // Subclass hook: returns a pointer into [src, limit].
    virtual const UChar *
    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const = 0;

    // Default per-character quick check; subclasses may override.
    virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
        return UNORM_YES;
    }

    static UClassID U_EXPORT2 getStaticClassID();
    virtual UClassID getDynamicClassID() const;

    const Normalizer2Impl &impl;
};
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer2WithImpl)
// Normalizer2WithImpl subclass that forwards every hook to the
// decomposition functions of the Normalizer2Impl.
class DecomposeNormalizer2 : public Normalizer2WithImpl {
public:
    DecomposeNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}

    virtual void
    normalize(const UChar *src, const UChar *limit,
              ReorderingBuffer &buffer, UErrorCode &errorCode) const {
        impl.decompose(src, limit, &buffer, errorCode);
    }
    virtual void
    normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
                       ReorderingBuffer &buffer, UErrorCode &errorCode) const {
        impl.decomposeAndAppend(src, limit, doNormalize, buffer, errorCode);
    }
    // Calls decompose() with a NULL buffer and returns its pointer result.
    virtual const UChar *
    spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
        ReorderingBuffer *noBuffer=NULL;
        return impl.decompose(src, limit, noBuffer, errorCode);
    }
    // UNORM_YES if impl.isDecompYes() holds for the character's norm16 value, else UNORM_NO.
    virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
        if(impl.isDecompYes(impl.getNorm16(c))) {
            return UNORM_YES;
        }
        return UNORM_NO;
    }
    virtual UBool hasBoundaryBefore(UChar32 c) const {
        return impl.hasDecompBoundary(c, TRUE);
    }
    virtual UBool hasBoundaryAfter(UChar32 c) const {
        return impl.hasDecompBoundary(c, FALSE);
    }
    virtual UBool isInert(UChar32 c) const {
        return impl.isDecompInert(c);
    }
};
class ComposeNormalizer2 : public Normalizer2WithImpl {
public:
ComposeNormalizer2(const Normalizer2Impl &ni, UBool fcc) :
Normalizer2WithImpl(ni), onlyContiguous(fcc) {}
virtual void
normalize(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
impl.compose(src, limit, onlyContiguous, TRUE, buffer, errorCode);
}
virtual void
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
impl.composeAndAppend(src, limit, doNormalize, onlyContiguous, buffer, errorCode);
}
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
if(U_FAILURE(errorCode)) {
return FALSE;
}
const UChar *sArray=s.getBuffer();
if(sArray==NULL) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return FALSE;
}
UnicodeString temp;
ReorderingBuffer buffer(impl, temp);
if(!buffer.init(5, errorCode)) { // small destCapacity for substring normalization
return FALSE;
}
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
if(U_FAILURE(errorCode)) {
return UNORM_MAYBE;
}
const UChar *sArray=s.getBuffer();
if(sArray==NULL) {
errorCode=U_ILLEGAL_ARGUMENT_ERROR;
return UNORM_MAYBE;
}
UNormalizationCheckResult qcResult=UNORM_YES;
impl.composeQuickCheck(sArray, sArray+s.length(), onlyContiguous, &qcResult);
return qcResult;
}
virtual const UChar *
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
return impl.composeQuickCheck(src, limit, onlyContiguous, NULL);
}
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
return impl.getCompQuickCheck(impl.getNorm16(c));
}
virtual UBool hasBoundaryBefore(UChar32 c) const {
return impl.hasCompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const {
return impl.hasCompBoundaryAfter(c, onlyContiguous, FALSE);
}
virtual UBool isInert(UChar32 c) const {
return impl.hasCompBoundaryAfter(c, onlyContiguous, TRUE);
}
private:
UBool onlyContiguous;
};
class FCDNormalizer2 : public Normalizer2WithImpl {
public:
FCDNormalizer2(const Normalizer2Impl &ni) : Normalizer2WithImpl(ni) {}
virtual void
normalize(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
impl.makeFCD(src, limit, &buffer, errorCode);
}
virtual void
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
impl.makeFCDAndAppend(src, limit, doNormalize, buffer, errorCode);
}
virtual const UChar *
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
return impl.makeFCD(src, limit, NULL, errorCode);
}
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); }
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); }
virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); }
};
// instance cache ---------------------------------------------------------- ***
// Bundles one Normalizer2Impl with the four Normalizer2 flavors that share it.
struct Norm2AllModes : public UMemory {
// Allocates an instance and loads its data via impl.load(packageName, name, errorCode).
// Returns NULL if errorCode indicates a failure.
static Norm2AllModes *createInstance(const char *packageName,
const char *name,
UErrorCode &errorCode);
// Each normalizer is constructed with a reference to the impl member.
// comp and fcc are both ComposeNormalizer2; they differ only in the
// onlyContiguous constructor argument (FALSE for comp, TRUE for fcc).
Norm2AllModes() : comp(impl, FALSE), decomp(impl), fcd(impl), fcc(impl, TRUE) {}
// Declared first, so it is constructed before the normalizers that refer to it.
Normalizer2Impl impl;
ComposeNormalizer2 comp;
DecomposeNormalizer2 decomp;
FCDNormalizer2 fcd;
ComposeNormalizer2 fcc;
};
/**
 * Creates a Norm2AllModes and loads the named data into its impl.
 * @return the new instance (caller owns it), or NULL if errorCode was already
 *         a failure, the allocation failed (U_MEMORY_ALLOCATION_ERROR),
 *         or loading failed
 */
Norm2AllModes *
Norm2AllModes::createInstance(const char *packageName,
                              const char *name,
                              UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    LocalPointer<Norm2AllModes> modes(new Norm2AllModes);
    if(modes.isNull()) {
        errorCode=U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    modes->impl.load(packageName, name, errorCode);
    if(U_FAILURE(errorCode)) {
        // modes is still held by the LocalPointer here; it is not handed to the caller.
        return NULL;
    }
    return modes.orphan();
}
// Forward declaration: the singleton creators below register this cleanup
// function before its definition appears later in the file.
U_CDECL_BEGIN
static UBool U_CALLCONV uprv_normalizer2_cleanup();
U_CDECL_END
/**
 * Wrapper around a TriStateSingleton that holds a Norm2AllModes.
 * The name given to the constructor is passed as the context to createInstance(),
 * which uses it as the data name to load.
 */
class Norm2AllModesSingleton : public TriStateSingletonWrapper<Norm2AllModes> {
public:
    Norm2AllModesSingleton(TriStateSingleton &s, const char *n) :
        TriStateSingletonWrapper<Norm2AllModes>(s), name(n) {}

    /** Returns the instance from the base class wrapper, using createInstance() as the creator. */
    Norm2AllModes *getInstance(UErrorCode &errorCode) {
        typedef TriStateSingletonWrapper<Norm2AllModes> Base;
        return Base::getInstance(createInstance, name, errorCode);
    }

private:
    /** Creator callback: registers the cleanup function, then loads the data named by context. */
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        const char *dataName=(const char *)context;
        ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
        return Norm2AllModes::createInstance(NULL, dataName, errorCode);
    }

    const char *name;
};
// One singleton per built-in data file; these are passed as TriStateSingleton &
// to Norm2AllModesSingleton together with the names "nfc", "nfkc" and "nfkc_cf".
// (Presumably each macro defines a file-static TriStateSingleton -- TODO confirm.)
STATIC_TRI_STATE_SINGLETON(nfcSingleton);
STATIC_TRI_STATE_SINGLETON(nfkcSingleton);
STATIC_TRI_STATE_SINGLETON(nfkc_cfSingleton);
/**
 * Wrapper around a SimpleSingleton that holds a Normalizer2;
 * its creator callback always makes a NoopNormalizer2.
 */
class Norm2Singleton : public SimpleSingletonWrapper<Normalizer2> {
public:
    Norm2Singleton(SimpleSingleton &s) : SimpleSingletonWrapper<Normalizer2>(s) {}

    /** Returns the instance from the base class wrapper; no context is needed. */
    Normalizer2 *getInstance(UErrorCode &errorCode) {
        typedef SimpleSingletonWrapper<Normalizer2> Base;
        return Base::getInstance(createInstance, NULL, errorCode);
    }

private:
    /**
     * Creator callback. Sets U_MEMORY_ALLOCATION_ERROR if the allocation fails;
     * the cleanup function is registered either way.
     */
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        Normalizer2 *instance=new NoopNormalizer2;
        if(instance==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
        }
        ucln_common_registerCleanup(UCLN_COMMON_NORMALIZER2, uprv_normalizer2_cleanup);
        return instance;
    }
};
// Singleton for the no-op normalizer; passed as SimpleSingleton & to Norm2Singleton.
// (Presumably the macro defines a file-static SimpleSingleton -- TODO confirm.)
STATIC_SIMPLE_SINGLETON(noopSingleton);
U_CDECL_BEGIN
// Cleanup callback registered (under UCLN_COMMON_NORMALIZER2) by the singleton creators.
// Calls deleteInstance() on each of the four singletons via temporary wrappers
// and always returns TRUE.
// The wrappers get a NULL name here; in this file the name is only used by getInstance().
static UBool U_CALLCONV uprv_normalizer2_cleanup() {
Norm2AllModesSingleton(nfcSingleton, NULL).deleteInstance();
Norm2AllModesSingleton(nfkcSingleton, NULL).deleteInstance();
Norm2AllModesSingleton(nfkc_cfSingleton, NULL).deleteInstance();
Norm2Singleton(noopSingleton).deleteInstance();
return TRUE;
}
U_CDECL_END
// Returns the composing normalizer of the "nfc" data, or NULL if the data is unavailable.
const Normalizer2 *Normalizer2Factory::getNFCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
// Returns the decomposing normalizer of the "nfc" data, or NULL if the data is unavailable.
const Normalizer2 *Normalizer2Factory::getNFDInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->decomp;
}
// Returns the FCD normalizer of the "nfc" data.
// The FCD trie is requested here so that it exists before the normalizer is used.
// Returns NULL if the data is unavailable or if getFCDTrie() reports a failure;
// previously a non-NULL normalizer without a usable FCD trie could be returned
// together with a failure code, unlike the other getters which return NULL on failure.
const Normalizer2 *Normalizer2Factory::getFCDInstance(UErrorCode &errorCode) {
    Norm2AllModes *allModes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(allModes==NULL) {
        return NULL;
    }
    allModes->impl.getFCDTrie(errorCode);
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    return &allModes->fcd;
}
// Returns the contiguous-composition (fcc) normalizer of the "nfc" data, or NULL if unavailable.
const Normalizer2 *Normalizer2Factory::getFCCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->fcc;
}
// Returns the composing normalizer of the "nfkc" data, or NULL if the data is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKCInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
// Returns the decomposing normalizer of the "nfkc" data, or NULL if the data is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKDInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->decomp;
}
// Returns the composing normalizer of the "nfkc_cf" data, or NULL if the data is unavailable.
const Normalizer2 *Normalizer2Factory::getNFKC_CFInstance(UErrorCode &errorCode) {
    Norm2AllModes *modes=
        Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->comp;
}
// Returns the cached no-op normalizer (created on first use by Norm2Singleton).
const Normalizer2 *Normalizer2Factory::getNoopInstance(UErrorCode &errorCode) {
    const Normalizer2 *noop=Norm2Singleton(noopSingleton).getInstance(errorCode);
    return noop;
}
/**
 * Maps an old-style UNormalizationMode to the matching cached Normalizer2.
 * Any mode other than NFD, NFKD, NFC, NFKC and FCD (e.g. UNORM_NONE)
 * yields the no-op normalizer.
 * @return the normalizer, or NULL if errorCode indicates a failure on entry
 */
const Normalizer2 *
Normalizer2Factory::getInstance(UNormalizationMode mode, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    if(mode==UNORM_NFD) {
        return getNFDInstance(errorCode);
    }
    if(mode==UNORM_NFKD) {
        return getNFKDInstance(errorCode);
    }
    if(mode==UNORM_NFC) {
        return getNFCInstance(errorCode);
    }
    if(mode==UNORM_NFKC) {
        return getNFKCInstance(errorCode);
    }
    if(mode==UNORM_FCD) {
        return getFCDInstance(errorCode);
    }
    // UNORM_NONE
    return getNoopInstance(errorCode);
}
// Returns the Normalizer2Impl of the "nfc" data, or NULL if the data is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFCImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
// Returns the Normalizer2Impl of the "nfkc" data, or NULL if the data is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKCImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
// Returns the Normalizer2Impl of the "nfkc_cf" data, or NULL if the data is unavailable.
const Normalizer2Impl *
Normalizer2Factory::getNFKC_CFImpl(UErrorCode &errorCode) {
    Norm2AllModes *modes=
        Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return &modes->impl;
}
// Returns the FCD trie of the "nfc" data (as provided by its impl),
// or NULL if the data is unavailable.
const UTrie2 *
Normalizer2Factory::getFCDTrie(UErrorCode &errorCode) {
    Norm2AllModes *modes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
    if(modes==NULL) {
        return NULL;
    }
    return modes->impl.getFCDTrie(errorCode);
}
/**
 * Returns a Normalizer2 for the given data and mode.
 *
 * Only the built-in data ("nfc", "nfkc", "nfkc_cf" with packageName==NULL)
 * is supported so far; any other request sets U_UNSUPPORTED_ERROR.
 *
 * @param packageName NULL for built-in data
 * @param name        data name; must not be NULL (U_ILLEGAL_ARGUMENT_ERROR otherwise;
 *                    previously a NULL name was passed straight to uprv_strcmp())
 * @param mode        which flavor of normalizer to return
 * @param errorCode   in/out error code; nothing is done if it is already a failure
 * @return the cached normalizer, or NULL on failure. For UNORM2_FCD, NULL is also
 *         returned if the FCD trie could not be provided, so that callers never
 *         receive an FCD normalizer without its trie.
 */
const Normalizer2 *
Normalizer2::getInstance(const char *packageName,
                         const char *name,
                         UNormalization2Mode mode,
                         UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    if(name==NULL) {
        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if(packageName==NULL) {
        Norm2AllModes *allModes=NULL;
        if(0==uprv_strcmp(name, "nfc")) {
            allModes=Norm2AllModesSingleton(nfcSingleton, "nfc").getInstance(errorCode);
        } else if(0==uprv_strcmp(name, "nfkc")) {
            allModes=Norm2AllModesSingleton(nfkcSingleton, "nfkc").getInstance(errorCode);
        } else if(0==uprv_strcmp(name, "nfkc_cf")) {
            allModes=Norm2AllModesSingleton(nfkc_cfSingleton, "nfkc_cf").getInstance(errorCode);
        }
        if(allModes!=NULL) {
            switch(mode) {
            case UNORM2_COMPOSE:
                return &allModes->comp;
            case UNORM2_DECOMPOSE:
                return &allModes->decomp;
            case UNORM2_FCD:
                // Make sure the FCD trie exists before handing out the normalizer.
                allModes->impl.getFCDTrie(errorCode);
                if(U_FAILURE(errorCode)) {
                    return NULL;
                }
                return &allModes->fcd;
            case UNORM2_COMPOSE_CONTIGUOUS:
                return &allModes->fcc;
            default:
                break;  // do nothing
            }
        }
    }
    if(U_SUCCESS(errorCode)) {
        // TODO: Real loading and caching...
        errorCode=U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
// Presumably supplies the class ID support for the abstract Normalizer2 class
// -- TODO confirm against the macro definition.
UOBJECT_DEFINE_ABSTRACT_RTTI_IMPLEMENTATION(Normalizer2)
// C API ------------------------------------------------------------------- ***
// C wrapper for Normalizer2::getInstance(); the C++ pointer is returned as an opaque UNormalizer2.
U_DRAFT const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
                   const char *name,
                   UNormalization2Mode mode,
                   UErrorCode *pErrorCode) {
    const Normalizer2 *n2=Normalizer2::getInstance(packageName, name, mode, *pErrorCode);
    return (const UNormalizer2 *)n2;
}
// Deletes the Normalizer2 behind norm2.
// NOTE(review): the pointers returned by unorm2_getInstance() in this file refer to
// members of cached Norm2AllModes objects (e.g. &allModes->comp), which must not be
// deleted individually; this function looks intended only for separately allocated
// instances -- confirm against the public API documentation.
U_DRAFT void U_EXPORT2
unorm2_close(UNormalizer2 *norm2) {
delete (Normalizer2 *)norm2;
}
// C API: normalizes src[0..length[ (length==-1 is accepted for NUL-terminated input)
// into dest[0..capacity[ and returns the result of destString.extract().
// Returns 0 without doing anything if *pErrorCode is already a failure, and sets
// U_ILLEGAL_ARGUMENT_ERROR for NULL src, length<-1, negative capacity,
// NULL dest with capacity>0, or src==dest.
U_DRAFT int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
const UChar *src, int32_t length,
UChar *dest, int32_t capacity,
UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(src==NULL || length<-1 || capacity<0 || (dest==NULL && capacity>0) || src==dest) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
// Initially empty string constructed over the caller's dest buffer.
UnicodeString destString(dest, 0, capacity);
const Normalizer2 *n2=(const Normalizer2 *)norm2;
if(n2->getDynamicClassID()==Normalizer2WithImpl::getStaticClassID()) {
// Avoid duplicate argument checking and support NUL-terminated src.
const Normalizer2WithImpl *n2wi=(const Normalizer2WithImpl *)n2;
// The buffer's destructor hands its contents back to destString
// at the end of this block, before extract() is called below.
ReorderingBuffer buffer(n2wi->impl, destString);
if(buffer.init(length, *pErrorCode)) {
// A NULL limit is passed for NUL-terminated input (length<0).
n2wi->normalize(src, length>=0 ? src+length : NULL, buffer, *pErrorCode);
}
} else {
// Generic path through the public UnicodeString-based API.
UnicodeString srcString(length<0, src, length);
n2->normalize(srcString, destString, *pErrorCode);
}
return destString.extract(dest, capacity, *pErrorCode);
}
// Shared implementation of unorm2_normalizeSecondAndAppend() (doNormalize=TRUE)
// and unorm2_append() (doNormalize=FALSE).
// Appends second[0..secondLength[ to the string in first[0..firstLength[
// (buffer capacity firstCapacity) and returns the result of firstString.extract().
// Either length may be -1. Returns 0 if *pErrorCode is already a failure, and sets
// U_ILLEGAL_ARGUMENT_ERROR for NULL second, a length<-1, negative firstCapacity,
// NULL first with firstCapacity>0, or first==second.
static int32_t
normalizeSecondAndAppend(const UNormalizer2 *norm2,
UChar *first, int32_t firstLength, int32_t firstCapacity,
const UChar *second, int32_t secondLength,
UBool doNormalize,
UErrorCode *pErrorCode) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if( second==NULL || secondLength<-1 ||
firstCapacity<0 || (first==NULL && firstCapacity>0) || firstLength<-1 ||
first==second
) {
*pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
return 0;
}
// String constructed over the caller's first buffer with its current contents.
UnicodeString firstString(first, firstLength, firstCapacity);
const Normalizer2 *n2=(const Normalizer2 *)norm2;
if(n2->getDynamicClassID()==Normalizer2WithImpl::getStaticClassID()) {
// Avoid duplicate argument checking and support NUL-terminated src.
const Normalizer2WithImpl *n2wi=(const Normalizer2WithImpl *)n2;
// The buffer's destructor hands its contents back to firstString
// at the end of this block, before extract() is called below.
ReorderingBuffer buffer(n2wi->impl, firstString);
if(buffer.init(firstLength+secondLength+1, *pErrorCode)) { // destCapacity>=-1
// A NULL limit is passed for NUL-terminated input (secondLength<0).
n2wi->normalizeAndAppend(second, secondLength>=0 ? second+secondLength : NULL,
doNormalize, buffer, *pErrorCode);
}
} else {
// Generic path through the public UnicodeString-based API.
UnicodeString secondString(secondLength<0, second, secondLength);
if(doNormalize) {
n2->normalizeSecondAndAppend(firstString, secondString, *pErrorCode);
} else {
n2->append(firstString, secondString, *pErrorCode);
}
}
return firstString.extract(first, firstCapacity, *pErrorCode);
}
// C API: appends second to first via the shared helper, with doNormalize=TRUE.
U_DRAFT int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
                                UChar *first, int32_t firstLength, int32_t firstCapacity,
                                const UChar *second, int32_t secondLength,
                                UErrorCode *pErrorCode) {
    const UBool doNormalize=TRUE;
    return normalizeSecondAndAppend(norm2, first, firstLength, firstCapacity,
                                    second, secondLength, doNormalize, pErrorCode);
}
// C API: appends second to first via the shared helper, with doNormalize=FALSE.
U_DRAFT int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
              UChar *first, int32_t firstLength, int32_t firstCapacity,
              const UChar *second, int32_t secondLength,
              UErrorCode *pErrorCode) {
    const UBool doNormalize=FALSE;
    return normalizeSecondAndAppend(norm2, first, firstLength, firstCapacity,
                                    second, secondLength, doNormalize, pErrorCode);
}
// C API wrapper for Normalizer2::isNormalized().
// Returns 0 if *pErrorCode is already a failure, or (setting
// U_ILLEGAL_ARGUMENT_ERROR) if s is NULL or length<-1.
U_DRAFT UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
                    const UChar *s, int32_t length,
                    UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s!=NULL && length>=-1) {
        const Normalizer2 *n2=(const Normalizer2 *)norm2;
        // First argument is TRUE for length<0, i.e. NUL-terminated input.
        UnicodeString text(length<0, s, length);
        return n2->isNormalized(text, *pErrorCode);
    }
    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
// C API wrapper for Normalizer2::quickCheck().
// Returns UNORM_NO if *pErrorCode is already a failure, or (setting
// U_ILLEGAL_ARGUMENT_ERROR) if s is NULL or length<-1.
U_DRAFT UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
                  const UChar *s, int32_t length,
                  UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return UNORM_NO;
    }
    if(s!=NULL && length>=-1) {
        const Normalizer2 *n2=(const Normalizer2 *)norm2;
        // First argument is TRUE for length<0, i.e. NUL-terminated input.
        UnicodeString text(length<0, s, length);
        return n2->quickCheck(text, *pErrorCode);
    }
    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return UNORM_NO;
}
// C API wrapper for Normalizer2::spanQuickCheckYes().
// Returns 0 if *pErrorCode is already a failure, or (setting
// U_ILLEGAL_ARGUMENT_ERROR) if s is NULL or length<-1.
U_DRAFT int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
                         const UChar *s, int32_t length,
                         UErrorCode *pErrorCode) {
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(s!=NULL && length>=-1) {
        const Normalizer2 *n2=(const Normalizer2 *)norm2;
        // First argument is TRUE for length<0, i.e. NUL-terminated input.
        UnicodeString text(length<0, s, length);
        return n2->spanQuickCheckYes(text, *pErrorCode);
    }
    *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    return 0;
}
// C API wrapper for Normalizer2::hasBoundaryBefore().
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryBefore(c);
}
// C API wrapper for Normalizer2::hasBoundaryAfter().
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->hasBoundaryAfter(c);
}
// C API wrapper for Normalizer2::isInert().
U_DRAFT UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c) {
    const Normalizer2 *n2=(const Normalizer2 *)norm2;
    return n2->isInert(c);
}
// Some properties APIs ---------------------------------------------------- ***
// Returns the quick check value of c for the given old-style mode.
// Modes outside the open range (UNORM_NONE, UNORM_FCD) yield UNORM_YES without a lookup;
// if the normalizer cannot be obtained, UNORM_MAYBE is returned.
U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode) {
    if(UNORM_NONE<mode && mode<UNORM_FCD) {
        UErrorCode errorCode=U_ZERO_ERROR;
        const Normalizer2 *norm2=Normalizer2Factory::getInstance(mode, errorCode);
        if(U_FAILURE(errorCode)) {
            return UNORM_MAYBE;
        }
        // For these modes the factory returns Normalizer2WithImpl subclasses.
        const Normalizer2WithImpl *withImpl=(const Normalizer2WithImpl *)norm2;
        return withImpl->getQuickCheck(c);
    }
    return UNORM_YES;
}
// Returns the index array of the FCD trie and stores the trie's highStart
// in fcdHighStart. Returns NULL (leaving fcdHighStart untouched) on failure.
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode) {
    const UTrie2 *fcdTrie=Normalizer2Factory::getFCDTrie(*pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return NULL;
    }
    fcdHighStart=fcdTrie->highStart;
    return fcdTrie->index;
}
U_NAMESPACE_END
#endif // !UCONFIG_NO_NORMALIZATION

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,603 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2impl.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#ifndef __NORMALIZER2IMPL_H__
#define __NORMALIZER2IMPL_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/normalizer2.h"
#include "unicode/udata.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "mutex.h"
#include "uset_imp.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
class Hangul {
public:
/* Korean Hangul and Jamo constants */
enum {
JAMO_L_BASE=0x1100, /* "lead" jamo */
JAMO_V_BASE=0x1161, /* "vowel" jamo */
JAMO_T_BASE=0x11a7, /* "trail" jamo */
HANGUL_BASE=0xac00,
JAMO_L_COUNT=19,
JAMO_V_COUNT=21,
JAMO_T_COUNT=28,
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT,
HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT
};
static inline UBool isHangul(UChar32 c) {
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
}
static inline UBool
isHangulWithoutJamoT(UChar c) {
c-=HANGUL_BASE;
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
static inline UBool isJamoL(UChar32 c) {
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
}
static inline UBool isJamoV(UChar32 c) {
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
}
/**
* Decomposes c, which must be a Hangul syllable, into buffer
* and returns the length of the decomposition (2 or 3).
*/
static inline int32_t decompose(UChar32 c, UChar buffer[3]) {
c-=HANGUL_BASE;
UChar32 c2=c%JAMO_T_COUNT;
c/=JAMO_T_COUNT;
buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT);
buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT);
if(c2==0) {
return 2;
} else {
buffer[2]=(UChar)(JAMO_T_BASE+c2);
return 3;
}
}
private:
Hangul(); // no instantiation
};
class Normalizer2Impl;
/**
 * Output buffer on top of a UnicodeString, used by the Normalizer2Impl functions.
 * It tracks the combining class of the last appended character (lastCC) so that
 * characters appended with a lower, non-zero combining class get inserted
 * rather than appended at the end.
 * The destructor hands the written contents back to the UnicodeString.
 */
class ReorderingBuffer : public UMemory {
public:
// Only stores the references and clears the pointers; init() must be called before use.
// NOTE(review): codePointStart/codePointLimit are not initialized here;
// presumably setIterator() is always called before they are read -- TODO confirm.
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
impl(ni), str(dest),
start(NULL), reorderStart(NULL), limit(NULL),
remainingCapacity(0), lastCC(0) {}
// If a buffer was obtained (start!=NULL), releases it back to str
// with the number of code units written so far as the new length.
~ReorderingBuffer() {
if(start!=NULL) {
str.releaseBuffer((int32_t)(limit-start));
}
}
UBool init(int32_t destCapacity, UErrorCode &errorCode);
UBool isEmpty() const { return start==limit; }
int32_t length() const { return (int32_t)(limit-start); }
UChar *getStart() { return start; }
UChar *getLimit() { return limit; }
uint8_t getLastCC() const { return lastCC; }
// Appends c with combining class cc; dispatches on whether c fits into one UChar.
UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
return (c<=0xffff) ?
appendBMP((UChar)c, cc, errorCode) :
appendSupplementary(c, cc, errorCode);
}
// s must be in NFD, otherwise change the implementation.
UBool append(const UChar *s, int32_t length,
uint8_t leadCC, uint8_t trailCC,
UErrorCode &errorCode);
// Appends one BMP code unit with combining class cc.
// Returns FALSE if the buffer is full and cannot be resized.
UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) {
if(remainingCapacity==0 && !resize(1, errorCode)) {
return FALSE;
}
// In order (cc not lower than the last one) or cc==0: append at the end.
if(lastCC<=cc || cc==0) {
*limit++=c;
lastCC=cc;
// After a character with cc<=1, later insertions need not look further back.
if(cc<=1) {
reorderStart=limit;
}
} else {
// Out of order: cc is lower than lastCC, so c goes somewhere before the end.
insert(c, cc);
}
--remainingCapacity;
return TRUE;
}
UBool appendZeroCC(UChar32 c, UErrorCode &errorCode);
UBool appendZeroCC(const UChar *s, const UChar *sLimit, UErrorCode &errorCode);
void removeZeroCCSuffix(int32_t length);
// Moves the end of the contents to newLimit, adjusts the remaining capacity by
// the difference, and restarts reordering there with the given last combining class.
void setReorderingLimitAndLastCC(UChar *newLimit, uint8_t newLastCC) {
remainingCapacity+=(int32_t)(limit-newLimit);
reorderStart=limit=newLimit;
lastCC=newLastCC;
}
private:
/*
* TODO: Revisit whether it makes sense to track reorderStart.
* It is set to after the last known character with cc<=1,
* which stops previousCC() before it reads that character and looks up its cc.
* previousCC() is normally only called from insert().
* In other words, reorderStart speeds up the insertion of a combining mark
* into a multi-combining mark sequence where it does not belong at the end.
* This might not be worth the trouble.
* On the other hand, it's not a huge amount of trouble.
*
* We probably need it for UNORM_SIMPLE_APPEND.
*/
UBool appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &errorCode);
void insert(UChar32 c, uint8_t cc);
// Writes c at p as one code unit, or as a lead/trail surrogate pair if c>0xffff.
// The caller must provide room for up to two code units.
static void writeCodePoint(UChar *p, UChar32 c) {
if(c<=0xffff) {
*p=(UChar)c;
} else {
p[0]=U16_LEAD(c);
p[1]=U16_TRAIL(c);
}
}
UBool resize(int32_t appendLength, UErrorCode &errorCode);
const Normalizer2Impl &impl;
// Destination string; receives the buffer contents in the destructor.
UnicodeString &str;
// [start, limit[ is the written text; reorderStart lies within it (see the TODO above).
UChar *start, *reorderStart, *limit;
// Number of code units that can still be appended without resize().
int32_t remainingCapacity;
// Combining class of the last appended character.
uint8_t lastCC;
// private backward iterator
void setIterator() { codePointStart=limit; }
void skipPrevious(); // Requires start<codePointStart.
uint8_t previousCC(); // Returns 0 if there is no previous character.
UChar *codePointStart, *codePointLimit;
};
/**
 * Low-level normalization data and algorithms for one data file.
 * Each code point has a 16-bit "norm16" value in normTrie; the thresholds
 * minYesNo, minNoNo, limitNoNo and minMaybeYes, together with the
 * fixed constants MIN_NORMAL_MAYBE_YES, JAMO_VT and MIN_YES_YES_WITH_CC, partition
 * the norm16 range into classes that the inline predicates below test.
 */
class U_COMMON_API Normalizer2Impl : public UMemory {
public:
// Creates an empty object; load() must be called before the data is used.
Normalizer2Impl() : memory(NULL), normTrie(NULL) {
fcdTrieSingleton.fInstance=NULL;
}
~Normalizer2Impl();
void load(const char *packageName, const char *name, UErrorCode &errorCode);
void addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const;
// low-level properties ------------------------------------------------ ***
const UTrie2 *getNormTrie() const { return normTrie; }
const UTrie2 *getFCDTrie(UErrorCode &errorCode) const ;
// norm16 lookups for different forms of input (code point, BMP unit, lead unit, pair).
uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
uint16_t getNorm16FromBMP(UChar c) const { return UTRIE2_GET16(normTrie, c); }
uint16_t getNorm16FromSingleLead(UChar c) const {
return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c);
}
uint16_t getNorm16FromSupplementary(UChar32 c) const {
return UTRIE2_GET16_FROM_SUPP(normTrie, c);
}
uint16_t getNorm16FromSurrogatePair(UChar c, UChar c2) const {
return getNorm16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
}
// Composition quick check value from norm16:
// YES below minNoNo or from MIN_YES_YES_WITH_CC up, MAYBE from minMaybeYes up, otherwise NO.
UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const {
if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
return UNORM_YES;
} else if(minMaybeYes<=norm16) {
return UNORM_MAYBE;
} else {
return UNORM_NO;
}
}
UBool isCompNo(uint16_t norm16) const { return minNoNo<=norm16 && norm16<minMaybeYes; }
UBool isDecompYes(uint16_t norm16) const { return norm16<minYesNo || minMaybeYes<=norm16; }
// Combining class from norm16: the low byte for values >=MIN_NORMAL_MAYBE_YES,
// 0 outside [minNoNo, limitNoNo[, otherwise read from the mapping data.
uint8_t getCC(uint16_t norm16) const {
if(norm16>=MIN_NORMAL_MAYBE_YES) {
return (uint8_t)norm16;
}
if(norm16<minNoNo || limitNoNo<=norm16) {
return 0;
}
return getCCFromNoNo(norm16);
}
static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
}
// FCD value lookups; these read the trie in fcdTrieSingleton without creating it,
// so getFCDTrie() presumably must have been called successfully first -- TODO confirm.
uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); }
uint16_t getFCD16FromBMP(UChar c) const { return UTRIE2_GET16(fcdTrie(), c); }
uint16_t getFCD16FromSingleLead(UChar c) const {
return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c);
}
uint16_t getFCD16FromSupplementary(UChar32 c) const {
return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c);
}
uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const {
return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2));
}
void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
UTrie2 *newFCDTrie, UErrorCode &errorCode) const;
/**
* Get the decomposition for one code point.
* @param c code point
* @param buffer out-only buffer for algorithmic decompositions
* @param length out-only, takes the length of the decomposition, if any
* @return pointer to the decomposition, or NULL if none
*/
const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const;
// Code points below this value pass hasFCDBoundaryBefore() without a trie lookup.
enum {
MIN_CCC_LCCC_CP=0x300
};
// Fixed norm16 values and thresholds (as opposed to the per-file thresholds below).
enum {
MIN_YES_YES_WITH_CC=0xff01,
JAMO_VT=0xff00,
MIN_NORMAL_MAYBE_YES=0xfe00,
JAMO_L=1,
MAX_DELTA=0x40
};
// Names for the slots of the data file's index array.
enum {
// Byte offsets from the start of the data, after the generic header.
IX_NORM_TRIE_OFFSET,
IX_EXTRA_DATA_OFFSET,
IX_RESERVED2_OFFSET,
IX_RESERVED3_OFFSET,
IX_RESERVED4_OFFSET,
IX_RESERVED5_OFFSET,
IX_RESERVED6_OFFSET,
IX_TOTAL_SIZE,
// Code point thresholds for quick check codes.
IX_MIN_DECOMP_NO_CP,
IX_MIN_COMP_NO_MAYBE_CP,
// Norm16 value thresholds for quick check combinations and types of extra data.
IX_MIN_YES_NO,
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
IX_RESERVED14,
IX_RESERVED15,
IX_COUNT
};
// Bits of the first unit of a mapping in extraData.
enum {
MAPPING_HAS_CCC_LCCC_WORD=0x80,
MAPPING_PLUS_COMPOSITION_LIST=0x40,
MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
MAPPING_LENGTH_MASK=0x1f
};
// Bit layout constants for the entries of a compositions list.
enum {
COMP_1_LAST_TUPLE=0x8000,
COMP_1_TRIPLE=1,
COMP_1_TRAIL_LIMIT=0x3400,
COMP_1_TRAIL_MASK=0x7ffe,
COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit
COMP_2_TRAIL_SHIFT=6,
COMP_2_TRAIL_MASK=0xffc0
};
// higher-level functionality ------------------------------------------ ***
// In this file's callers, decompose() and makeFCD() are passed a NULL buffer for
// span checks, and a NULL limit for NUL-terminated input.
const UChar *decompose(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
void decomposeAndAppend(const UChar *src, const UChar *limit,
UBool doDecompose,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool compose(const UChar *src, const UChar *limit,
UBool onlyContiguous,
UBool doCompose,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
const UChar *composeQuickCheck(const UChar *src, const UChar *limit,
UBool onlyContiguous,
UNormalizationCheckResult *pQCResult) const;
void composeAndAppend(const UChar *src, const UChar *limit,
UBool doCompose,
UBool onlyContiguous,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
const UChar *makeFCD(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
void makeFCDAndAppend(const UChar *src, const UChar *limit,
UBool doMakeFCD,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool hasDecompBoundary(UChar32 c, UBool before) const;
UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
// Code points below minCompNoMaybeCP pass without a data lookup.
UBool hasCompBoundaryBefore(UChar32 c) const {
return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
}
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
// The tests below treat the FCD value as two bytes: >0xff means a non-zero upper byte,
// (fcd16&0xff) is the lower byte.
UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
UBool hasFCDBoundaryAfter(UChar32 c) const {
uint16_t fcd16=getFCD16(c);
return fcd16<=1 || (fcd16&0xff)==0;
}
UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
private:
static UBool U_CALLCONV
isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo);
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
static UBool isInert(uint16_t norm16) { return norm16==0; }
// static UBool isJamoL(uint16_t norm16) const { return norm16==1; }
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
// UBool isCompYes(uint16_t norm16) const {
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
// }
// UBool isCompYesOrMaybe(uint16_t norm16) const {
// return norm16<minNoNo || minMaybeYes<=norm16;
// }
// NOTE(review): uses only constants but is a non-const, non-static member,
// unlike the neighboring predicates.
UBool hasZeroCCFromDecompYes(uint16_t norm16) {
return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
}
UBool isDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo ||
norm16==JAMO_VT ||
(minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
}
/**
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
* the MaybeYes which combine-forward and have ccc=0.
* (Standard Unicode 5.2 normalization does not have such characters.)
*/
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
}
UBool isDecompNoAlgorithmic(uint16_t norm16) const { return norm16>=limitNoNo; }
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
// }
// Reads the combining class from the low byte of the second mapping unit
// if the first unit has the MAPPING_HAS_CCC_LCCC_WORD bit; otherwise 0.
uint8_t getCCFromNoNo(uint16_t norm16) const {
const uint16_t *mapping=getMapping(norm16);
if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) {
return (uint8_t)mapping[1];
} else {
return 0;
}
}
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
// Requires algorithmic-NoNo.
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
return c+norm16-(minMaybeYes-MAX_DELTA-1);
}
// Requires minYesNo<norm16<limitNoNo.
const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
// Returns NULL for norm16==0 or norm16>=MIN_NORMAL_MAYBE_YES; otherwise the list is
// found in extraData (below minMaybeYes) or in maybeYesCompositions.
const uint16_t *getCompositionsListForDecompYesAndZeroCC(uint16_t norm16) const {
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
return NULL;
} else if(norm16<minMaybeYes) {
return extraData+norm16; // for yesYes; if Jamo L: harmless empty list
} else {
return maybeYesCompositions+norm16-minMaybeYes;
}
}
// Skips over the mapping to reach the compositions list that follows it.
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
return list+ // mapping pointer
1+ // +1 to skip the first unit with the mapping length
(*list&MAPPING_LENGTH_MASK)+ // + mapping length
((*list>>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD
}
const UChar *copyLowPrefixFromNulTerminated(const UChar *src,
UChar32 minNeedDataCP,
ReorderingBuffer *buffer,
UErrorCode &errorCode) const;
UBool decomposeShort(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
UBool decompose(UChar32 c, uint16_t norm16,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
static int32_t combine(const uint16_t *list, UChar32 trail);
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
UBool onlyContiguous) const;
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
// Current FCD trie pointer from the singleton; NULL until it has been set.
const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; }
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
UDataMemory *memory;
UVersionInfo dataVersion;
// Code point thresholds for quick check codes.
UChar32 minDecompNoCP;
UChar32 minCompNoMaybeCP;
// Norm16 value thresholds for quick check combinations and types of extra data.
uint16_t minYesNo;
uint16_t minNoNo;
uint16_t limitNoNo;
uint16_t minMaybeYes;
UTrie2 *normTrie;
const uint16_t *maybeYesCompositions;
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
SimpleSingleton fcdTrieSingleton;
};
/**
* ICU-internal shortcut for quick access to standard Unicode normalization.
* All functions are static; the class cannot be instantiated.
* Each getter takes an in/out error code; see the implementations for
* when NULL is returned.
*/
class U_COMMON_API Normalizer2Factory {
public:
// Normalizer2 instances for the standard modes.
static const Normalizer2 *getNFCInstance(UErrorCode &errorCode);
static const Normalizer2 *getNFDInstance(UErrorCode &errorCode);
static const Normalizer2 *getFCDInstance(UErrorCode &errorCode);
static const Normalizer2 *getFCCInstance(UErrorCode &errorCode);
static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode);
static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode);
static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode);
static const Normalizer2 *getNoopInstance(UErrorCode &errorCode);
// Instance for an old-style UNormalizationMode constant.
static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode);
// Direct access to the low-level implementation objects.
static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode);
static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode);
static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode);
static const UTrie2 *getFCDTrie(UErrorCode &errorCode);
private:
Normalizer2Factory(); // No instantiation.
};
// Data swapping function for normalization data; defined elsewhere.
// (Presumably follows the usual UDataSwapper convention of swapping
// length bytes from inData to outData -- TODO confirm against the definition.)
U_CAPI int32_t U_EXPORT2
unorm2_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
* @internal
*/
U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
/**
* Internal API, used by collation code.
* Get access to the internal FCD trie table to be able to perform
* incremental, per-code unit, FCD checks in collation.
* One pointer is sufficient because the trie index values are offset
* by the index size, so that the same pointer is used to access the trie data.
* Code points at fcdHighStart and above have a zero FCD value.
* @internal
*/
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
/**
* Internal API, used by collation code.
* Get the FCD value for a code unit, with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* If c is a lead surrogate and the value is not 0,
* then some of c's associated supplementary code points have a non-zero FCD value.
*
* @param fcdTrieIndex the trie index pointer (see unorm_getFCDTrieIndex())
* @param c the code unit to look up
* @return the 16-bit FCD value
* @internal
*/
static inline uint16_t
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
}
/**
* Internal API, used by collation code.
* Get the FCD value of the next code point (post-increment), with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* @internal
*/
static inline uint16_t
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
const UChar *&s, const UChar *limit) {
UChar32 c=*s++;
uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
if(fcd!=0 && U16_IS_LEAD(c)) {
UChar c2;
if(s!=limit && U16_IS_TRAIL(c2=*s)) {
++s;
c=U16_GET_SUPPLEMENTARY(c, c2);
if(c<fcdHighStart) {
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
} else {
fcd=0;
}
} else /* unpaired lead surrogate */ {
fcd=0;
}
}
return fcd;
}
/**
* Internal API, used by collation code.
* Get the FCD value of the previous code point (pre-decrement), with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* @internal
*/
static inline uint16_t
unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
const UChar *start, const UChar *&s) {
UChar32 c=*--s;
uint16_t fcd;
if(!U16_IS_SURROGATE(c)) {
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
} else {
UChar c2;
if(U16_IS_SURROGATE_TRAIL(c) && s!=start && U16_IS_LEAD(c2=*(s-1))) {
--s;
c=U16_GET_SUPPLEMENTARY(c2, c);
if(c<fcdHighStart) {
fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, c)];
} else {
fcd=0;
}
} else /* unpaired surrogate */ {
fcd=0;
}
}
return fcd;
}
U_NAMESPACE_END
#endif /* !UCONFIG_NO_NORMALIZATION */
#endif /* __NORMALIZER2IMPL_H__ */

View file

@ -1,7 +1,7 @@
/*
*************************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2005, International Business Machines Corporation and
* Copyright (c) 1996-2010, International Business Machines Corporation and
* others. All Rights Reserved.
*************************************************************************
*/
@ -10,14 +10,15 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/schriter.h"
#include "unicode/uchriter.h"
#include "unicode/uiter.h"
#include "unicode/normlzr.h"
#include "cmemory.h"
#include "unormimp.h"
#include "normalizer2impl.h"
#include "uprops.h" // for uniset_getUnicode32Instance()
U_NAMESPACE_BEGIN
@ -28,72 +29,68 @@ UOBJECT_DEFINE_RTTI_IMPLEMENTATION(Normalizer)
//-------------------------------------------------------------------------
Normalizer::Normalizer(const UnicodeString& str, UNormalizationMode mode) :
UObject(), fUMode(mode), fOptions(0),
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
text(new StringCharacterIterator(str)),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
init(new StringCharacterIterator(str));
init();
}
Normalizer::Normalizer(const UChar *str, int32_t length, UNormalizationMode mode) :
UObject(), fUMode(mode), fOptions(0),
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
text(new UCharCharacterIterator(str, length)),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
init(new UCharCharacterIterator(str, length));
init();
}
Normalizer::Normalizer(const CharacterIterator& iter, UNormalizationMode mode) :
UObject(), fUMode(mode), fOptions(0),
UObject(), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(mode), fOptions(0),
text(iter.clone()),
currentIndex(0), nextIndex(0),
buffer(), bufferPos(0)
{
init(iter.clone());
init();
}
Normalizer::Normalizer(const Normalizer &copy) :
UObject(copy), fUMode(copy.fUMode), fOptions(copy.fOptions),
UObject(copy), fFilteredNorm2(NULL), fNorm2(NULL), fUMode(copy.fUMode), fOptions(copy.fOptions),
text(copy.text->clone()),
currentIndex(copy.currentIndex), nextIndex(copy.nextIndex),
buffer(copy.buffer), bufferPos(copy.bufferPos)
{
init(((CharacterIterator *)(copy.text->context))->clone());
init();
}
static const UChar _NUL=0;
void
Normalizer::init(CharacterIterator *iter) {
Normalizer::init() {
UErrorCode errorCode=U_ZERO_ERROR;
text=(UCharIterator *)uprv_malloc(sizeof(UCharIterator));
if(text!=NULL) {
if(unorm_haveData(&errorCode)) {
uiter_setCharacterIterator(text, iter);
} else {
delete iter;
uiter_setCharacterIterator(text, new UCharCharacterIterator(&_NUL, 0));
}
} else {
delete iter;
fNorm2=Normalizer2Factory::getInstance(fUMode, errorCode);
if(fOptions&UNORM_UNICODE_3_2) {
delete fFilteredNorm2;
fNorm2=fFilteredNorm2=
new FilteredNormalizer2(*fNorm2, *uniset_getUnicode32Instance(errorCode));
}
if(U_FAILURE(errorCode)) {
errorCode=U_ZERO_ERROR;
fNorm2=Normalizer2Factory::getNoopInstance(errorCode);
}
}
Normalizer::~Normalizer()
{
if(text!=NULL) {
delete (CharacterIterator *)text->context;
uprv_free(text);
}
delete fFilteredNorm2;
delete text;
}
Normalizer*
Normalizer::clone() const
{
if(this!=0) {
return new Normalizer(*this);
} else {
return 0;
}
return new Normalizer(*this);
}
/**
@ -101,7 +98,7 @@ Normalizer::clone() const
*/
int32_t Normalizer::hashCode() const
{
return ((CharacterIterator *)(text->context))->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
return text->hashCode() + fUMode + fOptions + buffer.hashCode() + bufferPos + currentIndex + nextIndex;
}
UBool Normalizer::operator==(const Normalizer& that) const
@ -110,7 +107,7 @@ UBool Normalizer::operator==(const Normalizer& that) const
this==&that ||
fUMode==that.fUMode &&
fOptions==that.fOptions &&
*((CharacterIterator *)(text->context))==*((CharacterIterator *)(that.text->context)) &&
*text==*that.text &&
buffer==that.buffer &&
bufferPos==that.bufferPos &&
nextIndex==that.nextIndex;
@ -140,29 +137,18 @@ Normalizer::normalize(const UnicodeString& source,
// the source and result strings are the same object, use a temporary one
dest=&localDest;
}
UChar *buffer=dest->getBuffer(source.length());
int32_t length=unorm_internalNormalize(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
mode, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
if(status==U_BUFFER_OVERFLOW_ERROR) {
status=U_ZERO_ERROR;
buffer=dest->getBuffer(length);
length=unorm_internalNormalize(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
mode, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
if(U_SUCCESS(status)) {
if(options&UNORM_UNICODE_3_2) {
FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
normalize(source, *dest, status);
} else {
n2->normalize(source, *dest, status);
}
}
if(dest==&localDest) {
if(dest==&localDest && U_SUCCESS(status)) {
result=*dest;
}
if(U_FAILURE(status)) {
result.setToBogus();
}
}
}
@ -171,45 +157,7 @@ Normalizer::compose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status) {
if(source.isBogus() || U_FAILURE(status)) {
result.setToBogus();
if(U_SUCCESS(status)) {
status=U_ILLEGAL_ARGUMENT_ERROR;
}
} else {
UnicodeString localDest;
UnicodeString *dest;
if(&source!=&result) {
dest=&result;
} else {
// the source and result strings are the same object, use a temporary one
dest=&localDest;
}
UChar *buffer=dest->getBuffer(source.length());
int32_t length=unorm_compose(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
compat, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
if(status==U_BUFFER_OVERFLOW_ERROR) {
status=U_ZERO_ERROR;
buffer=dest->getBuffer(length);
length=unorm_compose(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
compat, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
}
if(dest==&localDest) {
result=*dest;
}
if(U_FAILURE(status)) {
result.setToBogus();
}
}
normalize(source, compat ? UNORM_NFKC : UNORM_NFC, options, result, status);
}
void U_EXPORT2
@ -217,44 +165,40 @@ Normalizer::decompose(const UnicodeString& source,
UBool compat, int32_t options,
UnicodeString& result,
UErrorCode &status) {
if(source.isBogus() || U_FAILURE(status)) {
result.setToBogus();
if(U_SUCCESS(status)) {
status=U_ILLEGAL_ARGUMENT_ERROR;
normalize(source, compat ? UNORM_NFKD : UNORM_NFD, options, result, status);
}
UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UErrorCode &status) {
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
if(U_SUCCESS(status)) {
if(options&UNORM_UNICODE_3_2) {
return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
quickCheck(source, status);
} else {
return n2->quickCheck(source, status);
}
} else {
UnicodeString localDest;
UnicodeString *dest;
return UNORM_MAYBE;
}
}
if(&source!=&result) {
dest=&result;
UBool
Normalizer::isNormalized(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UErrorCode &status) {
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, status);
if(U_SUCCESS(status)) {
if(options&UNORM_UNICODE_3_2) {
return FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(status)).
isNormalized(source, status);
} else {
// the source and result strings are the same object, use a temporary one
dest=&localDest;
}
UChar *buffer=dest->getBuffer(source.length());
int32_t length=unorm_decompose(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
compat, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
if(status==U_BUFFER_OVERFLOW_ERROR) {
status=U_ZERO_ERROR;
buffer=dest->getBuffer(length);
length=unorm_decompose(buffer, dest->getCapacity(),
source.getBuffer(), source.length(),
compat, options,
&status);
dest->releaseBuffer(U_SUCCESS(status) ? length : 0);
}
if(dest==&localDest) {
result=*dest;
}
if(U_FAILURE(status)) {
result.setToBogus();
return n2->isNormalized(source, status);
}
} else {
return FALSE;
}
}
@ -272,37 +216,25 @@ Normalizer::concatenate(UnicodeString &left, UnicodeString &right,
UnicodeString localDest;
UnicodeString *dest;
if(&left!=&result && &right!=&result) {
if(&right!=&result) {
dest=&result;
} else {
// the source and result strings are the same object, use a temporary one
// the right and result strings are the same object, use a temporary one
dest=&localDest;
}
UChar *buffer=dest->getBuffer(left.length()+right.length());
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
right.getBuffer(), right.length(),
buffer, dest->getCapacity(),
mode, options,
&errorCode);
dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
buffer=dest->getBuffer(length);
int32_t length=unorm_concatenate(left.getBuffer(), left.length(),
right.getBuffer(), right.length(),
buffer, dest->getCapacity(),
mode, options,
&errorCode);
dest->releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
*dest=left;
const Normalizer2 *n2=Normalizer2Factory::getInstance(mode, errorCode);
if(U_SUCCESS(errorCode)) {
if(options&UNORM_UNICODE_3_2) {
FilteredNormalizer2(*n2, *uniset_getUnicode32Instance(errorCode)).
append(*dest, right, errorCode);
} else {
n2->append(*dest, right, errorCode);
}
}
if(dest==&localDest) {
if(dest==&localDest && U_SUCCESS(errorCode)) {
result=*dest;
}
if(U_FAILURE(errorCode)) {
result.setToBogus();
}
}
return result;
}
@ -353,19 +285,20 @@ UChar32 Normalizer::previous() {
}
void Normalizer::reset() {
currentIndex=nextIndex=text->move(text, 0, UITER_START);
currentIndex=nextIndex=text->setToStart();
clearBuffer();
}
void
Normalizer::setIndexOnly(int32_t index) {
currentIndex=nextIndex=text->move(text, index, UITER_ZERO); // validates index
text->setIndex(index); // pins index
currentIndex=nextIndex=text->getIndex();
clearBuffer();
}
/**
* Return the first character in the normalized text-> This resets
* the <tt>Normalizer's</tt> position to the beginning of the text->
* Return the first character in the normalized text. This resets
* the <tt>Normalizer's</tt> position to the beginning of the text.
*/
UChar32 Normalizer::first() {
reset();
@ -373,12 +306,12 @@ UChar32 Normalizer::first() {
}
/**
* Return the last character in the normalized text-> This resets
* Return the last character in the normalized text. This resets
* the <tt>Normalizer's</tt> position to be just before the
* the input text corresponding to that normalized character.
*/
UChar32 Normalizer::last() {
currentIndex=nextIndex=text->move(text, 0, UITER_LIMIT);
currentIndex=nextIndex=text->setToEnd();
clearBuffer();
return previous();
}
@ -406,21 +339,21 @@ int32_t Normalizer::getIndex() const {
}
/**
* Retrieve the index of the start of the input text-> This is the begin index
* Retrieve the index of the start of the input text. This is the begin index
* of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
int32_t Normalizer::startIndex() const {
return text->getIndex(text, UITER_START);
return text->startIndex();
}
/**
* Retrieve the index of the end of the input text-> This is the end index
* Retrieve the index of the end of the input text. This is the end index
* of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
* over which this <tt>Normalizer</tt> is iterating
*/
int32_t Normalizer::endIndex() const {
return text->getIndex(text, UITER_LIMIT);
return text->endIndex();
}
//-------------------------------------------------------------------------
@ -431,6 +364,7 @@ void
Normalizer::setMode(UNormalizationMode newMode)
{
fUMode = newMode;
init();
}
UNormalizationMode
@ -448,6 +382,7 @@ Normalizer::setOption(int32_t option,
} else {
fOptions &= (~option);
}
init();
}
UBool
@ -458,7 +393,7 @@ Normalizer::getOption(int32_t option) const
/**
* Set the input text over which this <tt>Normalizer</tt> will iterate.
* The iteration position is set to the beginning of the input text->
* The iteration position is set to the beginning of the input text.
*/
void
Normalizer::setText(const UnicodeString& newText,
@ -472,8 +407,8 @@ Normalizer::setText(const UnicodeString& newText,
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete (CharacterIterator *)(text->context);
text->context = newIter;
delete text;
text = newIter;
reset();
}
@ -493,8 +428,8 @@ Normalizer::setText(const CharacterIterator& newText,
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete (CharacterIterator *)(text->context);
text->context = newIter;
delete text;
text = newIter;
reset();
}
@ -511,8 +446,8 @@ Normalizer::setText(const UChar* newText,
status = U_MEMORY_ALLOCATION_ERROR;
return;
}
delete (CharacterIterator *)(text->context);
text->context = newIter;
delete text;
text = newIter;
reset();
}
@ -523,7 +458,7 @@ Normalizer::setText(const UChar* newText,
void
Normalizer::getText(UnicodeString& result)
{
((CharacterIterator *)(text->context))->getText(result);
text->getText(result);
}
//-------------------------------------------------------------------------
@ -537,72 +472,48 @@ void Normalizer::clearBuffer() {
UBool
Normalizer::nextNormalize() {
UChar *p;
int32_t length;
UErrorCode errorCode;
clearBuffer();
currentIndex=nextIndex;
text->move(text, nextIndex, UITER_ZERO);
if(!text->hasNext(text)) {
text->setIndex(nextIndex);
if(!text->hasNext()) {
return FALSE;
}
errorCode=U_ZERO_ERROR;
p=buffer.getBuffer(-1);
length=unorm_next(text, p, buffer.getCapacity(),
fUMode, fOptions,
TRUE, 0,
&errorCode);
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
text->move(text, nextIndex, UITER_ZERO);
p=buffer.getBuffer(length);
length=unorm_next(text, p, buffer.getCapacity(),
fUMode, fOptions,
TRUE, 0,
&errorCode);
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
// Skip at least one character so we make progress.
UnicodeString segment(text->next32PostInc());
while(text->hasNext()) {
UChar32 c;
if(fNorm2->hasBoundaryBefore(c=text->next32PostInc())) {
text->move32(-1, CharacterIterator::kCurrent);
break;
}
segment.append(c);
}
nextIndex=text->getIndex(text, UITER_CURRENT);
nextIndex=text->getIndex();
UErrorCode errorCode=U_ZERO_ERROR;
fNorm2->normalize(segment, buffer, errorCode);
return U_SUCCESS(errorCode) && !buffer.isEmpty();
}
UBool
Normalizer::previousNormalize() {
UChar *p;
int32_t length;
UErrorCode errorCode;
clearBuffer();
nextIndex=currentIndex;
text->move(text, currentIndex, UITER_ZERO);
if(!text->hasPrevious(text)) {
text->setIndex(currentIndex);
if(!text->hasPrevious()) {
return FALSE;
}
errorCode=U_ZERO_ERROR;
p=buffer.getBuffer(-1);
length=unorm_previous(text, p, buffer.getCapacity(),
fUMode, fOptions,
TRUE, 0,
&errorCode);
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
errorCode=U_ZERO_ERROR;
text->move(text, currentIndex, UITER_ZERO);
p=buffer.getBuffer(length);
length=unorm_previous(text, p, buffer.getCapacity(),
fUMode, fOptions,
TRUE, 0,
&errorCode);
buffer.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
UnicodeString segment;
while(text->hasPrevious()) {
UChar32 c=text->previous32();
segment.insert(0, c);
if(fNorm2->hasBoundaryBefore(c)) {
break;
}
}
currentIndex=text->getIndex();
UErrorCode errorCode=U_ZERO_ERROR;
fNorm2->normalize(segment, buffer, errorCode);
bufferPos=buffer.length();
currentIndex=text->getIndex(text, UITER_CURRENT);
return U_SUCCESS(errorCode) && !buffer.isEmpty();
}

View file

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 1996-2009, International Business Machines
* Copyright (C) 1996-2010, International Business Machines
* Corporation and others. All Rights Reserved.
********************************************************************************
*
@ -28,7 +28,6 @@
#include "ucln_cmn.h"
#include "utrie2.h"
#include "udataswp.h"
#include "unormimp.h" /* JAMO_L_BASE etc. */
#include "uprops.h"
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
@ -650,10 +649,6 @@ u_getNumericValue(UChar32 c) {
}
}
/* ICU 3.4: bidi/shaping properties moved to ubidi_props.c */
/* ICU 2.1: u_getCombiningClass() moved to unorm.cpp */
U_CAPI int32_t U_EXPORT2
u_digit(UChar32 ch, int8_t radix) {
int8_t value;

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
* *
* Copyright (C) 2001-2006, International Business Machines *
* Copyright (C) 2001-2010, International Business Machines *
* Corporation and others. All Rights Reserved. *
* *
******************************************************************************
@ -41,6 +41,7 @@ typedef enum ECleanupCommonType {
UCLN_COMMON_LOCALE,
UCLN_COMMON_ULOC,
UCLN_COMMON_UNORM,
UCLN_COMMON_NORMALIZER2,
UCLN_COMMON_USET,
UCLN_COMMON_UNAMES,
UCLN_COMMON_PNAME,

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2003-2009, International Business Machines
* Copyright (C) 2003-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2006, International Business Machines Corporation and *
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -31,6 +31,7 @@
U_NAMESPACE_BEGIN
class Hashtable;
class Normalizer2;
/**
* This class allows one to iterate through all the strings that are canonically equivalent to a given
@ -174,6 +175,8 @@ private:
// transient fields
UnicodeString buffer;
const Normalizer2 &nfd;
// we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)

View file

@ -0,0 +1,460 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: normalizer2.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov22
* created by: Markus W. Scherer
*/
#ifndef __NORMALIZER2_H__
#define __NORMALIZER2_H__
/**
* \file
* \brief C++ API: New API for Unicode Normalization.
*/
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm2.h"
U_NAMESPACE_BEGIN
/**
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of this class are unmodifiable/immutable.
* Instances returned by getInstance() are singletons that must not be deleted by the caller.
*
* Some of the functions in this class identify normalization boundaries.
* At a normalization boundary, the portions of the string
* before it and starting from it do not interact and can be handled independently.
*
* The spanQuickCheckYes() stops at a normalization boundary.
* When the goal is a normalized string, then the text before the boundary
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
*
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test
* whether a character has a normalization boundary before and/or after it.
* This is used for moving from one normalization boundary to the next
* or preceding boundary, and for performing iterative normalization.
*
* Iterative normalization is useful when only a small portion of a
* longer string needs to be processed.
* In ICU, iterative normalization is used by the NormalizationTransliterator
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
* (to process only the substring for which sort key bytes are computed).
*
* The set of normalization boundaries returned by these functions may not be
* complete: There may be more boundaries that could be returned.
* Different functions may return different boundaries.
* @draft ICU 4.4
*/
class U_COMMON_API Normalizer2 : public UObject {
public:
/**
* Returns a Normalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
* and which composes or decomposes text according to the specified mode.
* Returns an unmodifiable singleton instance. Do not delete it.
*
* Use packageName=NULL for data files that are part of ICU's own data.
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
*
* @param packageName NULL for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested Normalizer2, if successful
* @draft ICU 4.4
*/
static const Normalizer2 *
getInstance(const char *packageName,
const char *name,
UNormalization2Mode mode,
UErrorCode &errorCode);
/**
* Returns the normalized form of the source string.
* Convenience overload: forwards to normalize(src, dest, errorCode)
* with a local destination string and returns that string by value.
* @param src source string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return normalized src
* @draft ICU 4.4
*/
UnicodeString
normalize(const UnicodeString &src, UErrorCode &errorCode) const {
UnicodeString result;
normalize(src, result, errorCode);
return result;
}
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* @param src source string
* @param dest destination string; its contents are replaced with normalized src
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @draft ICU 4.4
*/
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const = 0;
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, will be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @draft ICU 4.4
*/
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const = 0;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, should be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @draft ICU 4.4
*/
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const = 0;
/**
* Tests if the string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return TRUE if s is normalized
* @draft ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Tests if the string is normalized.
* For the two COMPOSE modes, the result could be "maybe" in cases that
* would take a little more work to resolve definitively.
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
* combination of quick check + normalization, to avoid
* re-checking the "yes" prefix.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @draft ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Returns the end of the normalized substring of the input string.
* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
* the substring <code>UnicodeString(s, 0, end)</code>
* will pass the quick check with a "yes" result.
*
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the class documentation for more about normalization boundaries.)
*
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this method,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the end index of the "yes" span, i.e. the length of the prefix
* of s that passes the quick check
* @draft ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
* Tests if the character has a normalization boundary before it.
* If true, then the character does not normalization-interact with
* preceding characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and starting from this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test
* @return TRUE if c has a normalization boundary before it
* @draft ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
/**
* Tests if the character has a normalization boundary after it.
* If true, then the character does not normalization-interact with
* following characters.
* In other words, a string containing this character can be normalized
* by processing portions up to this character and after this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test
* @return TRUE if c has a normalization boundary after it
* @draft ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
/**
* Tests if the character is normalization-inert.
* If true, then the character does not change, nor normalization-interact with
* preceding or following characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and after this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test
* @return TRUE if c is normalization-inert
* @draft ICU 4.4
*/
virtual UBool isInert(UChar32 c) const = 0;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
* @return a UClassID for this class.
* @draft ICU 4.4
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
* @return a UClassID for the actual class.
* @draft ICU 4.4
*/
virtual UClassID getDynamicClassID() const = 0;
};
/**
* Normalization filtered by a UnicodeSet.
* Normalizes portions of the text contained in the filter set and leaves
* portions not contained in the filter set unchanged.
* Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
* This class implements all of (and only) the Normalizer2 API.
* An instance of this class is unmodifiable/immutable but is constructed and
* must be destructed by the owner.
* @draft ICU 4.4
*/
class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
public:
/**
* Constructs a filtered normalizer wrapping any Normalizer2 instance
* and a filter set.
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
* @param n2 wrapped Normalizer2 instance
* @param filterSet UnicodeSet which determines the characters to be normalized
* @draft ICU 4.4
*/
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
norm2(n2), set(filterSet) {}
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* @param src source string
* @param dest destination string; its contents are replaced with normalized src
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return dest
* @draft ICU 4.4
*/
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const;
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, will be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @draft ICU 4.4
*/
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, should be normalized
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return first
* @draft ICU 4.4
*/
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return TRUE if s is normalized
* @draft ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @draft ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
/**
* Returns the end of the normalized substring of the input string.
* For details see the Normalizer2 base class documentation.
* @param s input string
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the end index of the normalized substring of s
* @draft ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
/**
* Tests if the character has a normalization boundary before it.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return TRUE if c has a normalization boundary before it
* @draft ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const;
/**
* Tests if the character has a normalization boundary after it.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return TRUE if c has a normalization boundary after it
* @draft ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const;
/**
* Tests if the character is normalization-inert.
* For details see the Normalizer2 base class documentation.
* @param c character to test
* @return TRUE if c is normalization-inert
* @draft ICU 4.4
*/
virtual UBool isInert(UChar32 c) const;
/**
* ICU "poor man's RTTI", returns a UClassID for this class.
* @returns a UClassID for this class.
* @draft ICU 4.4
*/
static UClassID U_EXPORT2 getStaticClassID();
/**
* ICU "poor man's RTTI", returns a UClassID for the actual class.
* @return a UClassID for the actual class.
* @draft ICU 4.4
*/
virtual UClassID getDynamicClassID() const;
private:
// Internal overload of normalize() that additionally takes a span condition.
// NOTE(review): the implementation is not visible in this header; presumably
// spanCondition is the condition used for the first UnicodeSet::span() call
// over src -- confirm against the .cpp file.
UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
USetSpanCondition spanCondition,
UErrorCode &errorCode) const;
// Internal overload that additionally takes a doNormalize flag.
// NOTE(review): presumably the shared implementation behind the public
// normalizeSecondAndAppend() and append(), selected by doNormalize --
// confirm against the .cpp file.
UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UBool doNormalize,
UErrorCode &errorCode) const;
// Wrapped normalizer; bound by reference in the constructor (aliased, see constructor docs).
const Normalizer2 &norm2;
// Filter set; bound by reference in the constructor (aliased, see constructor docs).
const UnicodeSet &set;
};
U_NAMESPACE_END
#endif // !UCONFIG_NO_NORMALIZATION
#endif // __NORMALIZER2_H__

View file

@ -1,7 +1,7 @@
/*
********************************************************************
* COPYRIGHT:
* Copyright (c) 1996-2006, International Business Machines Corporation and
* Copyright (c) 1996-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************
*/
@ -18,14 +18,11 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uobject.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
struct UCharIterator;
typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */
#include "unicode/uobject.h"
U_NAMESPACE_BEGIN
/**
@ -33,6 +30,10 @@ U_NAMESPACE_BEGIN
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15: Unicode Normalization Forms</a>.
*
* Note: This API has been replaced by the Normalizer2 class and is only available
* for backward compatibility. This class simply delegates to the Normalizer2 class.
* There is one exception: The new API does not provide a replacement for Normalizer::compare().
*
* The Normalizer class consists of two parts:
* - static functions that normalize strings or test if strings are normalized
* - a Normalizer object is an iterator that takes any kind of text and
@ -40,13 +41,11 @@ U_NAMESPACE_BEGIN
*
* The Normalizer class is not suitable for subclassing.
*
* The static functions are basically wrappers around the C implementation,
* using UnicodeString instead of UChar*.
* For basic information about normalization forms and details about the C API
* please see the documentation in unorm.h.
*
* The iterator API with the Normalizer constructors and the non-static functions
* uses a CharacterIterator as input. It is possible to pass a string which
* use a CharacterIterator as input. It is possible to pass a string which
* is then internally wrapped in a CharacterIterator.
* The input text is not normalized all at once, but incrementally where needed
* (providing efficient random access).
@ -287,7 +286,7 @@ public:
* @see isNormalized
* @stable ICU 2.6
*/
static inline UNormalizationCheckResult
static UNormalizationCheckResult
quickCheck(const UnicodeString &source, UNormalizationMode mode, int32_t options, UErrorCode &status);
/**
@ -328,7 +327,7 @@ public:
* @see quickCheck
* @stable ICU 2.6
*/
static inline UBool
static UBool
isNormalized(const UnicodeString &src, UNormalizationMode mode, int32_t options, UErrorCode &errorCode);
/**
@ -726,18 +725,20 @@ private:
UBool nextNormalize();
UBool previousNormalize();
void init(CharacterIterator *iter);
void init();
void clearBuffer(void);
//-------------------------------------------------------------------------
// Private data
//-------------------------------------------------------------------------
FilteredNormalizer2*fFilteredNorm2; // owned if not NULL
const Normalizer2 *fNorm2; // not owned; may be equal to fFilteredNorm2
UNormalizationMode fUMode;
int32_t fOptions;
// The input text and our position in it
UCharIterator *text;
CharacterIterator *text;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex[ .
@ -746,7 +747,6 @@ private:
// A buffer for holding intermediate results
UnicodeString buffer;
int32_t bufferPos;
};
//-------------------------------------------------------------------------
@ -761,48 +761,14 @@ inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
UNormalizationMode mode,
UErrorCode &status) {
if(U_FAILURE(status)) {
return UNORM_MAYBE;
}
return unorm_quickCheck(source.getBuffer(), source.length(),
mode, &status);
}
inline UNormalizationCheckResult
Normalizer::quickCheck(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UErrorCode &status) {
if(U_FAILURE(status)) {
return UNORM_MAYBE;
}
return unorm_quickCheckWithOptions(source.getBuffer(), source.length(),
mode, options, &status);
return quickCheck(source, mode, 0, status);
}
inline UBool
Normalizer::isNormalized(const UnicodeString& source,
UNormalizationMode mode,
UErrorCode &status) {
if(U_FAILURE(status)) {
return FALSE;
}
return unorm_isNormalized(source.getBuffer(), source.length(),
mode, &status);
}
inline UBool
Normalizer::isNormalized(const UnicodeString& source,
UNormalizationMode mode, int32_t options,
UErrorCode &status) {
if(U_FAILURE(status)) {
return FALSE;
}
return unorm_isNormalizedWithOptions(source.getBuffer(), source.length(),
mode, options, &status);
return isNormalized(source, mode, 0, status);
}
inline int32_t

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1997-2009, International Business Machines
* Copyright (C) 1997-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -321,51 +321,29 @@ typedef enum UProperty {
/** Binary property NFD_Inert.
ICU-specific property for characters that are inert under NFD,
i.e., they do not interact with adjacent characters.
Used for example in normalizing transforms in incremental mode
to find the boundary of safely normalizable text despite possible
text additions.
There is one such property per normalization form.
These properties are computed as follows - an inert character is:
a) unassigned, or ALL of the following:
b) of combining class 0.
c) not decomposed by this normalization form.
AND if NFC or NFKC,
d) can never compose with a previous character.
e) can never compose with a following character.
f) can never change if another character is added.
Example: a-breve might satisfy all but f, but if you
add an ogonek it changes to a-ogonek + breve
See also com.ibm.text.UCD.NFSkippable in the ICU4J repository,
and icu/source/common/unormimp.h .
See the documentation for the Normalizer2 class and the
Normalizer2::isInert() method.
@stable ICU 3.0 */
UCHAR_NFD_INERT=37,
/** Binary property NFKD_Inert.
ICU-specific property for characters that are inert under NFKD,
i.e., they do not interact with adjacent characters.
Used for example in normalizing transforms in incremental mode
to find the boundary of safely normalizable text despite possible
text additions.
@see UCHAR_NFD_INERT
See the documentation for the Normalizer2 class and the
Normalizer2::isInert() method.
@stable ICU 3.0 */
UCHAR_NFKD_INERT=38,
/** Binary property NFC_Inert.
ICU-specific property for characters that are inert under NFC,
i.e., they do not interact with adjacent characters.
Used for example in normalizing transforms in incremental mode
to find the boundary of safely normalizable text despite possible
text additions.
@see UCHAR_NFD_INERT
See the documentation for the Normalizer2 class and the
Normalizer2::isInert() method.
@stable ICU 3.0 */
UCHAR_NFC_INERT=39,
/** Binary property NFKC_Inert.
ICU-specific property for characters that are inert under NFKC,
i.e., they do not interact with adjacent characters.
Used for example in normalizing transforms in incremental mode
to find the boundary of safely normalizable text despite possible
text additions.
@see UCHAR_NFD_INERT
See the documentation for the Normalizer2 class and the
Normalizer2::isInert() method.
@stable ICU 3.0 */
UCHAR_NFKC_INERT=40,
/** Binary Property Segment_Starter.
@ -428,8 +406,10 @@ typedef enum UProperty {
UCHAR_CHANGES_WHEN_CASEFOLDED=54,
/** Binary property Changes_When_Casemapped. @draft ICU 4.4 */
UCHAR_CHANGES_WHEN_CASEMAPPED=55,
/** Binary property Changes_When_NFKC_Casefolded. @draft ICU 4.4 */
UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED=56,
/** One more than the last constant for binary Unicode properties. @stable ICU 2.1 */
UCHAR_BINARY_LIMIT=56,
UCHAR_BINARY_LIMIT=57,
/** Enumerated property Bidi_Class.
Same as u_charDirection, returns UCharDirection values. @stable ICU 2.2 */

View file

@ -1,6 +1,6 @@
/*
***************************************************************************
* Copyright (C) 1999-2009, International Business Machines Corporation
* Copyright (C) 1999-2010, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* Date Name Description
@ -861,6 +861,20 @@ public:
*/
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the end of the substring of the input string according to the USetSpanCondition.
* Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
* after pinning start to 0<=start<=s.length().
* @param s the string
* @param start the start index in the string for the span operation
* @param spanCondition specifies the containment condition
* @return the exclusive end of the substring according to the spanCondition;
* the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
* @draft ICU 4.4
* @see USetSpanCondition
*/
inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
@ -880,6 +894,21 @@ public:
*/
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the start of the substring of the input string according to the USetSpanCondition.
* Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
* after pinning limit to 0<=limit<=s.length().
* @param s the string
* @param limit the exclusive-end index in the string for the span operation
* (use s.length() or INT32_MAX for spanning back from the end of the string)
* @param spanCondition specifies the containment condition
* @return the start of the substring according to the spanCondition;
* the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
* @draft ICU 4.4
* @see USetSpanCondition
*/
inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
@ -1619,6 +1648,26 @@ inline const USet *UnicodeSet::toUSet() const {
return reinterpret_cast<const USet *>(this);
}
// Convenience overload of span() for UnicodeString input:
// pins start into [0, s.length()], spans over the remainder of the string,
// and returns the result as an index into s (not relative to start).
inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
    const int32_t length=s.length();
    // length is never negative, so at most one of these two branches applies.
    if(start>length) {
        start=length;
    } else if(start<0) {
        start=0;
    }
    const UChar *text=s.getBuffer()+start;
    const int32_t remaining=length-start;
    return start+span(text, remaining, spanCondition);
}
// Convenience overload of spanBack() for UnicodeString input:
// pins limit into [0, s.length()] and spans backwards from there
// over the string's buffer.
inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
    const int32_t length=s.length();
    // length is never negative, so at most one of these two branches applies.
    if(limit>length) {
        limit=length;
    } else if(limit<0) {
        limit=0;
    }
    const UChar *text=s.getBuffer();
    return spanBack(text, limit, spanCondition);
}
U_NAMESPACE_END
#endif

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1998-2009, International Business Machines
* Copyright (C) 1998-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*
@ -1566,6 +1566,33 @@ public:
#endif
/**
* Create a temporary substring for the specified range.
* Unlike the substring constructor and setTo() functions,
* the object returned here will be a read-only alias (using getBuffer())
* rather than copying the text.
* As a result, this substring operation is much faster but requires
* that the original string not be modified or deleted during the lifetime
* of the returned substring object.
* @param start offset of the first character visible in the substring
* @param length length of the substring
* @return a read-only alias UnicodeString object for the substring
* @draft ICU 4.4
*/
UnicodeString tempSubString(int32_t start=0, int32_t length=INT32_MAX) const;
/**
* Create a temporary substring for the specified range.
* Same as tempSubString(start, length) except that the substring range
* is specified as a (start, limit) pair (with an exclusive limit index)
* rather than a (start, length) pair.
* @param start offset of the first character visible in the substring
* @param limit offset immediately following the last character visible in the substring
* @return a read-only alias UnicodeString object for the substring
* @draft ICU 4.4
*/
inline UnicodeString tempSubStringBetween(int32_t start, int32_t limit=INT32_MAX) const;
/**
* Convert the UnicodeString to UTF-8 and write the result
* to a ByteSink. This is called by toUTF8String().
@ -2396,6 +2423,16 @@ public:
inline UnicodeString& removeBetween(int32_t start,
int32_t limit = (int32_t)INT32_MAX);
/**
* Retain only the characters in the range
* [<code>start</code>, <code>limit</code>) from the UnicodeString object.
* Removes characters before <code>start</code> and at and after <code>limit</code>.
* @param start the offset of the first character to retain
* @param limit the offset immediately following the range to retain
* @return a reference to this
* @draft ICU 4.4
*/
inline UnicodeString &retainBetween(int32_t start, int32_t limit = INT32_MAX);
/* Length operations */
@ -4068,6 +4105,11 @@ UnicodeString::extractBetween(int32_t start,
doExtract(start, limit - start, dst, dstStart);
}
// Same as tempSubString(), but takes a (start, limit) pair with an exclusive
// limit instead of a (start, length) pair.
inline UnicodeString
UnicodeString::tempSubStringBetween(int32_t start, int32_t limit) const {
    const int32_t subLength = limit - start;
    return tempSubString(start, subLength);
}
inline UChar
UnicodeString::doCharAt(int32_t offset) const
{
@ -4161,7 +4203,13 @@ UnicodeString::getTerminatedBuffer() {
} else {
UChar *array = getArrayStart();
int32_t len = length();
if(len < getCapacity()) {
if(len < getCapacity() && ((fFlags&kRefCounted) == 0 || refCount() == 1)) {
/*
* kRefCounted: Do not write the NUL if the buffer is shared.
* That is mostly safe, except when the length of one copy was modified
* without copy-on-write, e.g., via truncate(newLength) or remove(void).
* Then the NUL would be written into the middle of another copy's string.
*/
if(!(fFlags&kBufferIsReadonly)) {
/*
* We must not write to a readonly buffer, but it is known to be
@ -4332,10 +4380,12 @@ inline UnicodeString&
UnicodeString::remove()
{
// remove() of a bogus string makes the string empty and non-bogus
if(isBogus()) {
unBogus();
// we also un-alias a read-only alias to deal with NUL-termination
// issues with getTerminatedBuffer()
if(fFlags & (kIsBogus|kBufferIsReadonly)) {
setToEmpty();
} else {
setLength(0);
fShortLength = 0;
}
return *this;
}
@ -4356,6 +4406,12 @@ UnicodeString::removeBetween(int32_t start,
int32_t limit)
{ return doReplace(start, limit - start, NULL, 0, 0); }
// Keeps only the characters in [start, limit).
inline UnicodeString &
UnicodeString::retainBetween(int32_t start, int32_t limit) {
    // Cut off the tail first; afterwards the offsets of the head are unchanged.
    truncate(limit);
    // Then delete the leading range [0, start).
    return removeBetween(0, start);
}
inline UBool
UnicodeString::truncate(int32_t targetLength)
{
@ -4365,6 +4421,9 @@ UnicodeString::truncate(int32_t targetLength)
return FALSE;
} else if((uint32_t)targetLength < (uint32_t)length()) {
setLength(targetLength);
if(fFlags&kBufferIsReadonly) {
fUnion.fFields.fCapacity = targetLength; // not NUL-terminated any more
}
return TRUE;
} else {
return FALSE;

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (c) 1996-2007, International Business Machines Corporation
* Copyright (c) 1996-2010, International Business Machines Corporation
* and others. All Rights Reserved.
*******************************************************************************
* File unorm.h
@ -20,6 +20,7 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/uiter.h"
#include "unicode/unorm2.h"
/**
* \file
@ -27,6 +28,11 @@
*
* <h2>Unicode normalization API</h2>
*
* Note: This API has been replaced by the unorm2.h API and is only available
* for backward compatibility. The functions here simply delegate to the
* unorm2.h functions, for example unorm2_getInstance() and unorm2_normalize().
* There is one exception: The new API does not provide a replacement for unorm_compare().
*
* <code>unorm_normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>unorm_normalize</code> supports the standard normalization forms described in
@ -202,28 +208,7 @@ unorm_normalize(const UChar *source, int32_t sourceLength,
UNormalizationMode mode, int32_t options,
UChar *result, int32_t resultLength,
UErrorCode *status);
#endif
/**
* Result values for unorm_quickCheck().
* For details see Unicode Technical Report 15.
* @stable ICU 2.0
*/
typedef enum UNormalizationCheckResult {
/**
* Indicates that string is not in the normalized format
*/
UNORM_NO,
/**
* Indicates that string is in the normalized format
*/
UNORM_YES,
/**
* Indicates that string cannot be determined if it is in the normalized
* format without further thorough checks.
*/
UNORM_MAYBE
} UNormalizationCheckResult;
#if !UCONFIG_NO_NORMALIZATION
/**
* Performing quick check on a string, to quickly determine if the string is
* in a particular normalization format.

View file

@ -0,0 +1,348 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: unorm2.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009dec15
* created by: Markus W. Scherer
*/
#ifndef __UNORM2_H__
#define __UNORM2_H__
/**
* \file
* \brief C API: New API for Unicode Normalization.
*
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of UNormalizer2 are unmodifiable/immutable.
* Instances returned by unorm2_getInstance() are singletons that must not be deleted by the caller.
* For more details see the Normalizer2 C++ class.
*/
#include "unicode/utypes.h"
#include "unicode/uset.h"
/**
* Constants for normalization modes.
* A value of this type is passed as the mode argument of unorm2_getInstance()
* to select how the returned UNormalizer2 processes text.
* For details about standard Unicode normalization forms
* and about the algorithms which are also used with custom mapping tables
* see http://www.unicode.org/unicode/reports/tr15/
* @draft ICU 4.4
*/
typedef enum {
/**
* Decomposition followed by composition.
* Same as standard NFC when using an "nfc" instance.
* Same as standard NFKC when using an "nfkc" instance.
* For details about standard Unicode normalization forms
* see http://www.unicode.org/unicode/reports/tr15/
* @draft ICU 4.4
*/
UNORM2_COMPOSE,
/**
* Map, and reorder canonically.
* Same as standard NFD when using an "nfc" instance.
* Same as standard NFKD when using an "nfkc" instance.
* For details about standard Unicode normalization forms
* see http://www.unicode.org/unicode/reports/tr15/
* @draft ICU 4.4
*/
UNORM2_DECOMPOSE,
/**
* "Fast C or D" form.
* Further decomposition <i>without reordering</i>
* would yield the same form as DECOMPOSE.
* Text in "Fast C or D" form can be processed efficiently with data tables
* that are "canonically closed", that is, that provide equivalent data for
* equivalent text, without having to be fully normalized.
* Not a standard Unicode normalization form.
* Not a unique form: Different FCD strings can be canonically equivalent.
* For details see http://www.unicode.org/notes/tn5/#FCD
* @draft ICU 4.4
*/
UNORM2_FCD,
/**
* Compose only contiguously.
* Also known as "FCC" or "Fast C Contiguous".
* The result will often but not always be in NFC.
* The result will conform to FCD which is useful for processing.
* Not a standard Unicode normalization form.
* For details see http://www.unicode.org/notes/tn5/#FCC
* @draft ICU 4.4
*/
UNORM2_COMPOSE_CONTIGUOUS
} UNormalization2Mode;
/**
* Result values for normalization quick check functions.
* This is the return type of unorm2_quickCheck().
* For details see http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
* @stable ICU 2.0
*/
typedef enum UNormalizationCheckResult {
/**
* The input string is not in the normalization form.
* @stable ICU 2.0
*/
UNORM_NO,
/**
* The input string is in the normalization form.
* @stable ICU 2.0
*/
UNORM_YES,
/**
* The input string may or may not be in the normalization form.
* This value is only returned for composition forms like NFC and FCC,
* when a backward-combining character is found for which the surrounding text
* would have to be analyzed further.
* @stable ICU 2.0
*/
UNORM_MAYBE
} UNormalizationCheckResult;
/**
* Opaque C service object type for the new normalization API.
* @draft ICU 4.4
*/
struct UNormalizer2;
typedef struct UNormalizer2 UNormalizer2; /**< C typedef for struct UNormalizer2. @draft ICU 4.4 */
#if !UCONFIG_NO_NORMALIZATION
/**
* Returns a UNormalizer2 instance which uses the specified data file
* (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
* and which composes or decomposes text according to the specified mode.
* Returns an unmodifiable singleton instance. Do not delete it.
*
* Use packageName=NULL for data files that are part of ICU's own data.
* Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
* Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
* Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
*
* @param packageName NULL for ICU built-in data, otherwise application data package name
* @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
* @param mode normalization mode (compose or decompose etc.)
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2, if successful
* @draft ICU 4.4
*/
U_DRAFT const UNormalizer2 * U_EXPORT2
unorm2_getInstance(const char *packageName,
const char *name,
UNormalization2Mode mode,
UErrorCode *pErrorCode);
/**
* Constructs a filtered normalizer wrapping any UNormalizer2 instance
* and a filter set.
* Both are aliased and must not be modified or deleted while this object
* is used.
* The filter set should be frozen; otherwise the performance will suffer greatly.
* @param norm2 wrapped UNormalizer2 instance
* @param filterSet USet which determines the characters to be normalized
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the requested UNormalizer2, if successful
* @draft ICU 4.4
*/
U_DRAFT UNormalizer2 * U_EXPORT2
unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode);
/**
* Closes a UNormalizer2 instance from unorm2_openFiltered().
* Do not close instances from unorm2_getInstance()!
* @param norm2 UNormalizer2 instance to be closed
* @draft ICU 4.4
*/
U_DRAFT void U_EXPORT2
unorm2_close(UNormalizer2 *norm2);
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the length of the destination string.
* The source and destination strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param src source string
* @param length length of the source string, or -1 if NUL-terminated
* @param dest destination string; its contents are replaced with normalized src
* @param capacity number of UChars that can be written to dest
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the destination string
* @draft ICU 4.4
*/
U_DRAFT int32_t U_EXPORT2
unorm2_normalize(const UNormalizer2 *norm2,
const UChar *src, int32_t length,
UChar *dest, int32_t capacity,
UErrorCode *pErrorCode);
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the length of the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param first string, should be normalized
* @param firstLength length of the first string, or -1 if NUL-terminated
* @param firstCapacity number of UChars that can be written to first
* @param second string, will be normalized
* @param secondLength length of the second string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the first string
* @draft ICU 4.4
*/
U_DRAFT int32_t U_EXPORT2
unorm2_normalizeSecondAndAppend(const UNormalizer2 *norm2,
UChar *first, int32_t firstLength, int32_t firstCapacity,
const UChar *second, int32_t secondLength,
UErrorCode *pErrorCode);
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the length of the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different buffers.
* @param norm2 UNormalizer2 instance
* @param first string, should be normalized
* @param firstLength length of the first string, or -1 if NUL-terminated
* @param firstCapacity number of UChars that can be written to first
* @param second string, should be normalized
* @param secondLength length of the second string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the length of the first string
* @draft ICU 4.4
*/
U_DRAFT int32_t U_EXPORT2
unorm2_append(const UNormalizer2 *norm2,
UChar *first, int32_t firstLength, int32_t firstCapacity,
const UChar *second, int32_t secondLength,
UErrorCode *pErrorCode);
/**
* Tests if the string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return TRUE if s is normalized
* @draft ICU 4.4
*/
U_DRAFT UBool U_EXPORT2
unorm2_isNormalized(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Tests if the string is normalized.
* For the two COMPOSE modes, the result could be "maybe" in cases that
* would take a little more work to resolve definitively.
* Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
* combination of quick check + normalization, to avoid
* re-checking the "yes" prefix.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return UNormalizationCheckResult
* @draft ICU 4.4
*/
U_DRAFT UNormalizationCheckResult U_EXPORT2
unorm2_quickCheck(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Returns the end of the normalized substring of the input string.
* In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
* the substring <code>UnicodeString(s, 0, end)</code>
* will pass the quick check with a "yes" result.
*
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the class documentation for more about normalization boundaries.)
*
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this method,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
* @param norm2 UNormalizer2 instance
* @param s input string
* @param length length of the string, or -1 if NUL-terminated
* @param pErrorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @return the end index of the "yes" span (the end of the normalized substring)
* @draft ICU 4.4
*/
U_DRAFT int32_t U_EXPORT2
unorm2_spanQuickCheckYes(const UNormalizer2 *norm2,
const UChar *s, int32_t length,
UErrorCode *pErrorCode);
/**
* Tests if the character has a normalization boundary before it.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return TRUE if c has a normalization boundary before it
* @draft ICU 4.4
*/
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryBefore(const UNormalizer2 *norm2, UChar32 c);
/**
* Tests if the character has a normalization boundary after it.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return TRUE if c has a normalization boundary after it
* @draft ICU 4.4
*/
U_DRAFT UBool U_EXPORT2
unorm2_hasBoundaryAfter(const UNormalizer2 *norm2, UChar32 c);
/**
* Tests if the character is normalization-inert.
* For details see the Normalizer2 base class documentation.
* @param norm2 UNormalizer2 instance
* @param c character to test
* @return TRUE if c is normalization-inert
* @draft ICU 4.4
*/
U_DRAFT UBool U_EXPORT2
unorm2_isInert(const UNormalizer2 *norm2, UChar32 c);
#endif /* !UCONFIG_NO_NORMALIZATION */
#endif /* __UNORM2_H__ */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -33,12 +33,15 @@
#include "uvector.h"
#include "uprops.h"
#include "propname.h"
#include "normalizer2impl.h"
#include "unormimp.h"
#include "ucase.h"
#include "ubidi_props.h"
#include "uinvchar.h"
#include "uprops.h"
#include "charstr.h"
#include "cstring.h"
#include "mutex.h"
#include "umutex.h"
#include "uassert.h"
#include "hash.h"
@ -91,10 +94,43 @@ static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
*/
//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
// Cached sets ------------------------------------------------------------- ***
U_CDECL_BEGIN
static UBool U_CALLCONV uset_cleanup();
U_CDECL_END
// Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
// can only fail with an out-of-memory error
// if we have a correct pattern and the properties data is hardcoded and always available.
class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
public:
    /**
     * Wraps a SimpleSingleton that caches one frozen UnicodeSet.
     * @param s       the singleton storage to wrap
     * @param pattern invariant-character set pattern, handed to createInstance()
     *                as its context; only read when getInstance() is called
     */
    UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
            SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}

    /**
     * Returns the cached set, building it from fPattern via createInstance() if necessary.
     * @param errorCode in/out ICU error code
     */
    UnicodeSet *getInstance(UErrorCode &errorCode) {
        return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
    }
private:
    /**
     * Instantiator callback: builds and freezes a UnicodeSet from the pattern.
     * @param context   the pattern as a const char * of invariant characters
     * @param errorCode set to U_MEMORY_ALLOCATION_ERROR if the set cannot be allocated
     * @return the new frozen set, or NULL if the allocation failed
     */
    static void *createInstance(const void *context, UErrorCode &errorCode) {
        UnicodeString pattern((const char *)context, -1, US_INV);
        UnicodeSet *set=new UnicodeSet(pattern, errorCode);
        if(set==NULL) {
            errorCode=U_MEMORY_ALLOCATION_ERROR;
            // Must not fall through: freeze() on a NULL set would crash.
            return NULL;
        }
        set->freeze();
        ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
        return set;
    }

    const char *fPattern;
};
U_CDECL_BEGIN
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
STATIC_SIMPLE_SINGLETON(uni32Singleton);
//----------------------------------------------------------------
// Inclusions list
//----------------------------------------------------------------
@ -128,7 +164,7 @@ static UBool U_CALLCONV uset_cleanup(void) {
INCLUSIONS[i] = NULL;
}
}
UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
return TRUE;
}
@ -177,6 +213,27 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
unorm_addPropertyStarts(&sa, &status);
break;
case UPROPS_SRC_NFC: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
if(U_SUCCESS(status)) {
impl->addPropertyStarts(&sa, status);
}
break;
}
case UPROPS_SRC_NFKC: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
if(U_SUCCESS(status)) {
impl->addPropertyStarts(&sa, status);
}
break;
}
case UPROPS_SRC_NFKC_CF: {
const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
if(U_SUCCESS(status)) {
impl->addPropertyStarts(&sa, status);
}
break;
}
#endif
case UPROPS_SRC_CASE:
ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
@ -207,6 +264,13 @@ const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
return INCLUSIONS[src];
}
// Cache some sets for other services -------------------------------------- ***
U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode &errorCode) {
    // Shared set for the pattern [:age=3.2:], created on first use via uni32Singleton.
    UnicodeSetSingleton uni32(uni32Singleton, "[:age=3.2:]");
    return uni32.getInstance(errorCode);
}
// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions are parallel to the PERL_OPEN etc. strings above

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1999-2009, International Business Machines Corporation and *
* Copyright (C) 1999-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*
@ -780,6 +780,17 @@ UnicodeString::extract(int32_t start,
return u_terminateChars(target, targetCapacity, length, &status);
}
UnicodeString
UnicodeString::tempSubString(int32_t start, int32_t len) const {
    pinIndices(start, len);
    // Use getBuffer() rather than getArrayStart() so that kIsBogus & kOpenGetBuffer are checked.
    const UChar *chars = getBuffer();
    if(chars != NULL) {
        // Read-only alias of this string's characters, not NUL-terminated.
        return UnicodeString(FALSE, chars + start, len);
    }
    // No readable buffer: pass any non-NULL pointer (NULL would make an empty string)
    // together with length -2 so that the result is a bogus string.
    return UnicodeString(FALSE, fUnion.fStackBuffer + start, -2);
}
int32_t
UnicodeString::toUTF8(int32_t start, int32_t len,
char *target, int32_t capacity) const {
@ -1218,6 +1229,28 @@ UnicodeString::doReplace(int32_t start,
return *this;
}
int32_t oldLength = this->length();
// optimize (read-only alias).remove(0, start) and .remove(start, end)
if((fFlags&kBufferIsReadonly) && srcLength == 0) {
if(start == 0) {
// remove prefix by adjusting the array pointer
pinIndex(length);
fUnion.fFields.fArray += length;
fUnion.fFields.fCapacity -= length;
setLength(oldLength - length);
return *this;
} else {
pinIndex(start);
if(length >= (oldLength - start)) {
// remove suffix by reducing the length (like truncate())
setLength(start);
fUnion.fFields.fCapacity = start; // not NUL-terminated any more
return *this;
}
}
}
if(srcChars == 0) {
srcStart = srcLength = 0;
} else if(srcLength < 0) {
@ -1225,8 +1258,6 @@ UnicodeString::doReplace(int32_t start,
srcLength = u_strlen(srcChars + srcStart);
}
int32_t oldLength = this->length();
// calculate the size of the string after the replace
int32_t newSize;
@ -1594,4 +1625,3 @@ static void uprv_UnicodeStringDummy(void) {
delete [] (new UnicodeString[2]);
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
/*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
* file name: unorm_props_data.c
@ -14,6 +14,7 @@ static const int32_t indexes[_NORM_INDEX_TOP]={
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
};
#if 0
static const uint16_t normTrie_index[2532]={
0,8,0x10,0x18,0x28,0x30,0x38,0x40,0x48,0x50,0x58,0x60,0x68,0x70,0x77,0x7f,
0x87,0x8f,0x1f,0x27,0x94,0x9c,0xa3,0xab,0xb3,0xbb,0xc3,0xcb,0xd3,0xdb,0xe3,0xeb,
@ -835,6 +836,7 @@ static const UTrie2 normTrie={
0x2810,
NULL, 0, FALSE, FALSE, 0, NULL
};
#endif
static const uint16_t extraData[16431]={
0x1c2,0xff02,0x20,0x3b9,0xff01,0x3c5,0xff01,0x3cd,0xff01,0x3cb,0xff01,0x3c3,0xff01,0x61,0xff01,0xe6,
@ -1866,6 +1868,7 @@ static const uint16_t extraData[16431]={
0x773,0x776,0x77c,0x782,0x788,0x78e,0x794,0x797,0x79a,0x79d,0x7a0,0x7a3,0x7a6,0x7a9,0x7ac
};
#if 0
static const uint16_t combiningTable[1967]={
0x7af,0xc0,0x7b0,0xc1,0x7b1,0x20c2,0x7b2,0xc3,0x7b3,0x20c4,0x7b4,0x20c5,0x7b6,0x100,0x7b7,0x2102,
0x7b8,0x104,0x7b9,0x2226,0x7ba,0x1cd,0x7bd,0x200,0x7be,0x202,0x7d6,0x1e00,0x7d7,0x3ea0,0x87dd,0x1ea2,
@ -2416,6 +2419,7 @@ static const UTrie2 fcdTrie={
0x1968,
NULL, 0, FALSE, FALSE, 0, NULL
};
#endif
static const uint16_t auxTrie_index[6664]={
0x278,0x280,0x288,0x290,0x278,0x280,0x2a8,0x2b0,0x2b8,0x2c0,0x2c8,0x2d0,0x278,0x280,0x2d8,0x2e0,

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -22,12 +22,13 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/ustring.h"
#include "unicode/unorm.h"
#include "unicode/uniset.h"
#include "unormimp.h"
#include "ucase.h"
#include "unicode/ustring.h"
#include "cmemory.h"
#include "normalizer2impl.h"
#include "ucase.h"
#include "uprops.h"
#include "ustr_imp.h"
U_NAMESPACE_USE
@ -134,12 +135,19 @@ struct CmpEquivLevel {
};
typedef struct CmpEquivLevel CmpEquivLevel;
/**
* Internal option for unorm_cmpEquivFold() for decomposing.
* If not set, just do strcasecmp().
*/
#define _COMPARE_EQUIV 0x80000
/* internal function */
static int32_t
unorm_cmpEquivFold(const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
uint32_t options,
UErrorCode *pErrorCode) {
const Normalizer2Impl *nfcImpl;
const UCaseProps *csp;
/* current-level start/limit - s1/s2 as current */
@ -152,7 +160,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
/* stacks of previous-level start/current/limit */
CmpEquivLevel stack1[2], stack2[2];
/* decomposition buffers for Hangul */
/* buffers for algorithmic decompositions */
UChar decomp1[4], decomp2[4];
/* case folding buffers, only use current-level start/limit */
@ -173,19 +181,19 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
*/
/* normalization/properties data loaded? */
if( ((options&_COMPARE_EQUIV)!=0 && !unorm_haveData(pErrorCode)) ||
U_FAILURE(*pErrorCode)
) {
return 0;
if((options&_COMPARE_EQUIV)!=0) {
nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode);
} else {
nfcImpl=NULL;
}
if((options&U_COMPARE_IGNORE_CASE)!=0) {
csp=ucase_getSingleton(pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
} else {
csp=NULL;
}
if(U_FAILURE(*pErrorCode)) {
return 0;
}
/* initialize */
start1=s1;
@ -404,7 +412,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
}
if( level1<2 && (options&_COMPARE_EQUIV) &&
0!=(p=unorm_getCanonicalDecomposition((UChar32)cp1, decomp1, &length))
0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length))
) {
/* cp1 decomposes into p[length] */
if(U_IS_SURROGATE(c1)) {
@ -445,7 +453,7 @@ unorm_cmpEquivFold(const UChar *s1, int32_t length1,
}
if( level2<2 && (options&_COMPARE_EQUIV) &&
0!=(p=unorm_getCanonicalDecomposition((UChar32)cp2, decomp2, &length))
0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length))
) {
/* cp2 decomposes into p[length] */
if(U_IS_SURROGATE(c2)) {
@ -534,14 +542,8 @@ unorm_compare(const UChar *s1, int32_t length1,
const UChar *s2, int32_t length2,
uint32_t options,
UErrorCode *pErrorCode) {
MaybeStackArray<UChar, 300> fcd1, fcd2;
const UnicodeSet *nx;
UNormalizationMode mode;
int32_t normOptions;
int32_t result;
/* argument checking */
if(pErrorCode==0 || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s1==0 || length1<-1 || s2==0 || length2<-1) {
@ -549,21 +551,9 @@ unorm_compare(const UChar *s1, int32_t length1,
return 0;
}
if(!unorm_haveData(pErrorCode)) {
return 0;
}
if(!uprv_haveProperties(pErrorCode)) {
return 0;
}
normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
nx=unorm_getNX(normOptions, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
UnicodeString fcd1, fcd2;
int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT);
options|=_COMPARE_EQUIV;
result=0;
/*
* UAX #21 Case Mappings, as fixed for Unicode version 4
@ -586,20 +576,30 @@ unorm_compare(const UChar *s1, int32_t length1,
* are first decomposed or not, so an FCD check - a check only for
* canonical order - is not sufficient.
*/
if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
mode=UNORM_NFD;
options&=~UNORM_INPUT_IS_FCD;
} else {
mode=UNORM_FCD;
}
if(!(options&UNORM_INPUT_IS_FCD)) {
int32_t _len1, _len2;
UBool isFCD1, isFCD2;
if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) {
const Normalizer2 *n2;
if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) {
n2=Normalizer2Factory::getNFDInstance(*pErrorCode);
} else {
n2=Normalizer2Factory::getFCDInstance(*pErrorCode);
}
// check if s1 and/or s2 fulfill the FCD conditions
isFCD1= UNORM_YES==unorm_internalQuickCheck(s1, length1, mode, TRUE, nx, pErrorCode);
isFCD2= UNORM_YES==unorm_internalQuickCheck(s2, length2, mode, TRUE, nx, pErrorCode);
const UnicodeSet *uni32;
if(normOptions&UNORM_UNICODE_3_2) {
uni32=uniset_getUnicode32Instance(*pErrorCode);
} else {
uni32=NULL; // unused
}
FilteredNormalizer2 fn2(*n2, *uni32);
if(normOptions&UNORM_UNICODE_3_2) {
n2=&fn2;
}
UnicodeString str1(length1<0, s1, length1);
UnicodeString str2(length2<0, s2, length2);
int32_t spanQCYes1=n2->spanQuickCheckYes(str1, *pErrorCode);
int32_t spanQCYes2=n2->spanQuickCheckYes(str2, *pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return 0;
}
@ -613,59 +613,27 @@ unorm_compare(const UChar *s1, int32_t length1,
* Therefore, ICU 2.6 removes that optimization.
*/
if(!isFCD1) {
_len1=unorm_internalNormalizeWithNX(fcd1.getAlias(), fcd1.getCapacity(),
s1, length1,
mode, normOptions, nx,
pErrorCode);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
if(fcd1.resize(_len1)==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
*pErrorCode=U_ZERO_ERROR;
_len1=unorm_internalNormalizeWithNX(fcd1.getAlias(), fcd1.getCapacity(),
s1, length1,
mode, normOptions, nx,
pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return result;
}
}
s1=fcd1.getAlias();
length1=_len1;
if(spanQCYes1<str1.length()) {
UnicodeString unnormalized=str1.tempSubString(spanQCYes1);
fcd1.setTo(FALSE, str1.getBuffer(), spanQCYes1);
n2->normalizeSecondAndAppend(fcd1, unnormalized, *pErrorCode);
s1=fcd1.getBuffer();
length1=fcd1.length();
}
if(!isFCD2) {
_len2=unorm_internalNormalizeWithNX(fcd2.getAlias(), fcd2.getCapacity(),
s2, length2,
mode, normOptions, nx,
pErrorCode);
if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) {
if(fcd2.resize(_len2)==NULL) {
*pErrorCode=U_MEMORY_ALLOCATION_ERROR;
return result;
}
*pErrorCode=U_ZERO_ERROR;
_len2=unorm_internalNormalizeWithNX(fcd2.getAlias(), fcd2.getCapacity(),
s2, length2,
mode, normOptions, nx,
pErrorCode);
if(U_FAILURE(*pErrorCode)) {
return result;
}
}
s2=fcd2.getAlias();
length2=_len2;
if(spanQCYes2<str2.length()) {
UnicodeString unnormalized=str2.tempSubString(spanQCYes2);
fcd2.setTo(FALSE, str2.getBuffer(), spanQCYes2);
n2->normalizeSecondAndAppend(fcd2, unnormalized, *pErrorCode);
s2=fcd2.getBuffer();
length2=fcd2.length();
}
}
if(U_SUCCESS(*pErrorCode)) {
result=unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode);
} else {
return 0;
}
return result;
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -163,25 +163,6 @@ enum {
_NORM_DECOMP_LENGTH_MASK=0x7f
};
#endif /* #if !UCONFIG_NO_NORMALIZATION */
/* Korean Hangul and Jamo constants */
enum {
JAMO_L_BASE=0x1100, /* "lead" jamo */
JAMO_V_BASE=0x1161, /* "vowel" jamo */
JAMO_T_BASE=0x11a7, /* "trail" jamo */
HANGUL_BASE=0xac00,
JAMO_L_COUNT=19,
JAMO_V_COUNT=21,
JAMO_T_COUNT=28,
HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT
};
#if !UCONFIG_NO_NORMALIZATION
/* Constants for options flags for normalization. @draft ICU 2.6 */
enum {
/** Options bit 0, do not decompose Hangul syllables. @draft ICU 2.6 */
@ -205,199 +186,6 @@ enum {
U_CAPI UBool U_EXPORT2
unorm_haveData(UErrorCode *pErrorCode);
/**
* Internal API for normalizing.
* Does not check for bad input.
* @internal
*/
U_CAPI int32_t U_EXPORT2
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UNormalizationMode mode, int32_t options,
UErrorCode *pErrorCode);
#ifdef XP_CPLUSPLUS
/**
* Internal API for normalizing.
* Does not check for bad input.
* Requires _haveData() to be true.
* @internal
*/
U_CFUNC int32_t
unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UNormalizationMode mode, int32_t options, const U_NAMESPACE_QUALIFIER UnicodeSet *nx,
UErrorCode *pErrorCode);
#endif
/**
* internal API, used by normlzr.cpp
* @internal
*/
U_CAPI int32_t U_EXPORT2
unorm_decompose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, int32_t options,
UErrorCode *pErrorCode);
/**
* internal API, used by normlzr.cpp
* @internal
*/
U_CAPI int32_t U_EXPORT2
unorm_compose(UChar *dest, int32_t destCapacity,
const UChar *src, int32_t srcLength,
UBool compat, int32_t options,
UErrorCode *pErrorCode);
#ifdef XP_CPLUSPLUS
/**
* internal API, used by unormcmp.cpp
* @internal
*/
U_CFUNC UNormalizationCheckResult
unorm_internalQuickCheck(const UChar *src,
int32_t srcLength,
UNormalizationMode mode,
UBool allowMaybe,
const U_NAMESPACE_QUALIFIER UnicodeSet *nx,
UErrorCode *pErrorCode);
#endif
#endif /* #if !UCONFIG_NO_NORMALIZATION */
/**
* Internal option for unorm_cmpEquivFold() for decomposing.
* If not set, just do strcasecmp().
* @internal
*/
#define _COMPARE_EQUIV 0x80000
#ifndef U_COMPARE_IGNORE_CASE
/* see also unorm.h */
/**
* Option bit for unorm_compare:
* Perform case-insensitive comparison.
* @draft ICU 2.2
*/
#define U_COMPARE_IGNORE_CASE 0x10000
#endif
/**
* Internal option for unorm_cmpEquivFold() for strncmp style.
* If set, checks for both string length and terminating NUL.
* @internal
*/
#define _STRNCMP_STYLE 0x1000
#if !UCONFIG_NO_NORMALIZATION
/**
* Internal API to get the 16-bit FCD value (lccc + tccc) for c,
* for u_getIntPropertyValue().
* @internal
*/
U_CFUNC uint16_t U_EXPORT2
unorm_getFCD16FromCodePoint(UChar32 c);
#ifdef XP_CPLUSPLUS
/**
* Internal API, used by collation code.
* Get access to the internal FCD trie table to be able to perform
* incremental, per-code unit, FCD checks in collation.
* One pointer is sufficient because the trie index values are offset
* by the index size, so that the same pointer is used to access the trie data.
* Code points at fcdHighStart and above have a zero FCD value.
* @internal
*/
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);
/**
* Internal API, used by collation code.
* Get the FCD value for a code unit, with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* If c is a lead surrogate and the value is not 0,
* then some of c's associated supplementary code points have a non-zero FCD value.
*
* @internal
*/
static inline uint16_t
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
    // Look up the single code unit; lead surrogates use the "single lead" index.
    const int32_t dataIndex=_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c);
    return fcdTrieIndex[dataIndex];
}
/**
* Internal API, used by collation code.
* Get the FCD value of the next code point (post-increment), with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* @internal
*/
static inline uint16_t
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
                const UChar *&s, const UChar *limit) {
    UChar32 first=*s++;
    uint16_t unitFCD=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, first)];
    if(unitFCD==0 || !U16_IS_LEAD(first)) {
        // BMP code point, or a lead surrogate whose supplementary code points all have FCD 0
        return unitFCD;
    }
    if(s==limit || !U16_IS_TRAIL(*s)) {
        // unpaired lead surrogate
        return 0;
    }
    // Consume the trail surrogate and look up the supplementary code point.
    UChar trail=*s++;
    UChar32 supplementary=U16_GET_SUPPLEMENTARY(first, trail);
    if(supplementary>=fcdHighStart) {
        return 0;
    }
    return fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, supplementary)];
}
/**
* Internal API, used by collation code.
* Get the FCD value of the previous code point (pre-decrement), with
* bits 15..8 lead combining class
* bits 7..0 trail combining class
*
* @internal
*/
static inline uint16_t
unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
                const UChar *start, const UChar *&s) {
    UChar32 last=*--s;
    if(!U16_IS_SURROGATE(last)) {
        // ordinary BMP code point
        return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, last)];
    }
    if(!U16_IS_SURROGATE_TRAIL(last) || s==start || !U16_IS_LEAD(*(s-1))) {
        // unpaired surrogate
        return 0;
    }
    // Step back over the lead surrogate and look up the supplementary code point.
    UChar lead=*--s;
    UChar32 supplementary=U16_GET_SUPPLEMENTARY(lead, last);
    if(supplementary>=fcdHighStart) {
        return 0;
    }
    return fcdTrieIndex[_UTRIE2_INDEX_FROM_SUPP(fcdTrieIndex, supplementary)];
}
#endif
/**
* internal API, used by StringPrep
* @internal
@ -405,35 +193,6 @@ unorm_prevFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
U_CAPI void U_EXPORT2
unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode);
/**
* Get the canonical decomposition for one code point.
* Requires unorm_haveData() and buffer!=NULL and pLength!=NULL.
* @param c code point
* @param buffer out-only buffer for algorithmic decompositions of Hangul
* @param length out-only, takes the length of the decomposition, if any
* @return pointer to decomposition, or 0 if none
* @internal
*/
U_CFUNC const UChar *
unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength);
/**
* internal API, used by the canonical iterator
* TODO Consider using signature similar to unorm_getCanonicalDecomposition()
* for more efficiency
* @internal
*/
U_CAPI int32_t U_EXPORT2
unorm_getDecomposition(UChar32 c, UBool compat,
UChar *dest, int32_t destCapacity);
/**
* internal API, used by uprops.cpp
* @internal
*/
U_CFUNC UBool U_EXPORT2
unorm_internalIsFullCompositionExclusion(UChar32 c);
/**
* Internal API, used by enumeration of canonically equivalent strings
* @internal
@ -448,13 +207,6 @@ unorm_isCanonSafeStart(UChar32 c);
U_CAPI UBool U_EXPORT2
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
/**
* Is c an NF<mode>-skippable code point? See unormimp.h.
* @internal
*/
U_CAPI UBool U_EXPORT2
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
#ifdef XP_CPLUSPLUS
/**
@ -484,13 +236,6 @@ unorm_swap(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
/**
* Get the NF*_QC property for a code point, for u_getIntPropertyValue().
* @internal
*/
U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
/**
* Description of the format of unorm.icu version 2.3.
*

View file

@ -1,11 +1,11 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: uprops.h
* file name: uprops.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@ -26,6 +26,7 @@
#include "unicode/uscript.h"
#include "unicode/ustring.h"
#include "cstring.h"
#include "normalizer2impl.h"
#include "ucln_cmn.h"
#include "umutex.h"
#include "unormimp.h"
@ -106,7 +107,7 @@ static const struct {
{ 1, U_MASK(UPROPS_DEPRECATED) },
{ 1, U_MASK(UPROPS_DIACRITIC) },
{ 1, U_MASK(UPROPS_EXTENDER) },
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_FULL_COMPOSITION_EXCLUSION */
{ 1, U_MASK(UPROPS_GRAPHEME_BASE) },
{ 1, U_MASK(UPROPS_GRAPHEME_EXTEND) },
{ 1, U_MASK(UPROPS_GRAPHEME_LINK) },
@ -134,10 +135,10 @@ static const struct {
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CASE_SENSITIVE */
{ 1, U_MASK(UPROPS_S_TERM) },
{ 1, U_MASK(UPROPS_VARIATION_SELECTOR) },
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFD_INERT */
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFKD_INERT */
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFC_INERT */
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_NFKC_INERT */
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFD_INERT */
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKD_INERT */
{ UPROPS_SRC_NFC, 0 }, /* UCHAR_NFC_INERT */
{ UPROPS_SRC_NFKC, 0 }, /* UCHAR_NFKC_INERT */
{ UPROPS_SRC_NORM, 0 }, /* UCHAR_SEGMENT_STARTER */
{ 1, U_MASK(UPROPS_PATTERN_SYNTAX) },
{ 1, U_MASK(UPROPS_PATTERN_WHITE_SPACE) },
@ -152,7 +153,8 @@ static const struct {
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_UPPERCASED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_TITLECASED */
{ UPROPS_SRC_CASE_AND_NORM, 0 }, /* UCHAR_CHANGES_WHEN_CASEFOLDED */
{ UPROPS_SRC_CASE, 0 } /* UCHAR_CHANGES_WHEN_CASEMAPPED */
{ UPROPS_SRC_CASE, 0 }, /* UCHAR_CHANGES_WHEN_CASEMAPPED */
{ UPROPS_SRC_NFKC_CF, 0 } /* UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED */
};
U_CAPI UBool U_EXPORT2
@ -173,18 +175,56 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
#if !UCONFIG_NO_NORMALIZATION
/* normalization properties from unorm.icu */
switch(which) {
case UCHAR_FULL_COMPOSITION_EXCLUSION:
return unorm_internalIsFullCompositionExclusion(c);
case UCHAR_NFD_INERT:
case UCHAR_NFKD_INERT:
case UCHAR_NFC_INERT:
case UCHAR_NFKC_INERT:
return unorm_isNFSkippable(c, (UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD));
case UCHAR_SEGMENT_STARTER:
return unorm_isCanonSafeStart(c);
default:
break;
}
#endif
} else if(column==UPROPS_SRC_NFC || column==UPROPS_SRC_NFKC) {
#if !UCONFIG_NO_NORMALIZATION
UErrorCode errorCode=U_ZERO_ERROR;
switch(which) {
case UCHAR_FULL_COMPOSITION_EXCLUSION: {
// By definition, Full_Composition_Exclusion is the same as NFC_QC=No.
const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(errorCode);
if(U_SUCCESS(errorCode)) {
return impl->isCompNo(impl->getNorm16(c));
}
break;
}
default: {
// UCHAR_NF..._INERT properties
const Normalizer2 *norm2=Normalizer2Factory::getInstance(
(UNormalizationMode)(which-UCHAR_NFD_INERT+UNORM_NFD), errorCode);
if(U_SUCCESS(errorCode)) {
return norm2->isInert(c);
}
break;
}
}
#endif
} else if(column==UPROPS_SRC_NFKC_CF) {
// currently only for UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
#if !UCONFIG_NO_NORMALIZATION
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2Impl *kcf=Normalizer2Factory::getNFKC_CFImpl(errorCode);
if(U_SUCCESS(errorCode)) {
UnicodeString src(c);
UnicodeString dest;
{
// The ReorderingBuffer must be in a block because its destructor
// needs to release dest's buffer before we look at its contents.
ReorderingBuffer buffer(*kcf, dest);
// Small destCapacity for NFKC_CF(c).
if(U_SUCCESS(errorCode) && buffer.init(5, errorCode)) {
const UChar *srcArray=src.getBuffer();
kcf->compose(srcArray, srcArray+src.length(), FALSE,
TRUE, buffer, errorCode);
}
}
return U_SUCCESS(errorCode) && dest!=src;
}
#endif
} else if(column==UPROPS_SRC_BIDI) {
/* bidi/shaping properties */
@ -225,14 +265,16 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
} else if(column==UPROPS_SRC_CASE_AND_NORM) {
#if !UCONFIG_NO_NORMALIZATION
UChar nfdBuffer[4];
const UChar *nfd=NULL;
const UChar *nfd;
int32_t nfdLength;
UErrorCode errorCode = U_ZERO_ERROR;
UErrorCode errorCode=U_ZERO_ERROR;
const Normalizer2Impl *nfcImpl=Normalizer2Factory::getNFCImpl(errorCode);
if(U_FAILURE(errorCode)) {
return FALSE;
}
switch(which) {
case UCHAR_CHANGES_WHEN_CASEFOLDED:
if(unorm_haveData(&errorCode)) {
nfd=unorm_getCanonicalDecomposition(c, nfdBuffer, &nfdLength);
}
nfd=nfcImpl->getDecomposition(c, nfdBuffer, nfdLength);
if(nfd!=NULL) {
/* c has a decomposition */
if(nfdLength==1) {
@ -274,6 +316,32 @@ u_hasBinaryProperty(UChar32 c, UProperty which) {
return FALSE;
}
#if !UCONFIG_NO_NORMALIZATION
U_CAPI uint8_t U_EXPORT2
u_getCombiningClass(UChar32 c) {
    UErrorCode status=U_ZERO_ERROR;
    const Normalizer2Impl *nfc=Normalizer2Factory::getNFCImpl(status);
    if(U_FAILURE(status)) {
        // NFC data could not be obtained: report combining class 0.
        return 0;
    }
    return nfc->getCC(nfc->getNorm16(c));
}
static uint16_t
getFCD16(UChar32 c) {
    UErrorCode status=U_ZERO_ERROR;
    const UTrie2 *fcdTrie=Normalizer2Factory::getFCDTrie(status);
    if(U_FAILURE(status)) {
        // FCD trie could not be obtained: report FCD value 0.
        return 0;
    }
    return UTRIE2_GET16(fcdTrie, c);
}
#endif
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
@ -311,11 +379,9 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
return (int32_t)u_charDirection(c);
case UCHAR_BLOCK:
return (int32_t)ublock_getCode(c);
case UCHAR_CANONICAL_COMBINING_CLASS:
#if !UCONFIG_NO_NORMALIZATION
case UCHAR_CANONICAL_COMBINING_CLASS:
return u_getCombiningClass(c);
#else
return 0;
#endif
case UCHAR_DECOMPOSITION_TYPE:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_DT_MASK);
@ -352,9 +418,9 @@ u_getIntPropertyValue(UChar32 c, UProperty which) {
case UCHAR_NFKC_QUICK_CHECK:
return (int32_t)unorm_getQuickCheck(c, (UNormalizationMode)(which-UCHAR_NFD_QUICK_CHECK+UNORM_NFD));
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
return unorm_getFCD16FromCodePoint(c)>>8;
return getFCD16(c)>>8;
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
return unorm_getFCD16FromCodePoint(c)&0xff;
return getFCD16(c)&0xff;
#endif
case UCHAR_GRAPHEME_CLUSTER_BREAK:
return (int32_t)(u_getUnicodeProperties(c, 2)&UPROPS_GCB_MASK)>>UPROPS_GCB_SHIFT;
@ -462,12 +528,13 @@ uprops_getSource(UProperty which) {
case UCHAR_CANONICAL_COMBINING_CLASS:
case UCHAR_NFD_QUICK_CHECK:
case UCHAR_NFKD_QUICK_CHECK:
case UCHAR_NFC_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
case UCHAR_LEAD_CANONICAL_COMBINING_CLASS:
case UCHAR_TRAIL_CANONICAL_COMBINING_CLASS:
return UPROPS_SRC_NORM;
return UPROPS_SRC_NFC;
case UCHAR_NFKD_QUICK_CHECK:
case UCHAR_NFKC_QUICK_CHECK:
return UPROPS_SRC_NFKC;
case UCHAR_BIDI_CLASS:
case UCHAR_JOINING_GROUP:

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2002-2009, International Business Machines
* Copyright (C) 2002-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -332,6 +332,12 @@ enum UPropertySource {
UPROPS_SRC_CHAR_AND_PROPSVEC,
/** From ucase.c/ucase.icu as well as unorm.cpp/unorm.icu */
UPROPS_SRC_CASE_AND_NORM,
/** From normalizer2impl.cpp/nfc.nrm */
UPROPS_SRC_NFC,
/** From normalizer2impl.cpp/nfkc.nrm */
UPROPS_SRC_NFKC,
/** From normalizer2impl.cpp/nfkc_cf.nrm */
UPROPS_SRC_NFKC_CF,
/** One more than the highest UPropertySource (UPROPS_SRC_) constant. */
UPROPS_SRC_COUNT
};
@ -390,4 +396,18 @@ uchar_swapNames(const UDataSwapper *ds,
const void *inData, int32_t length, void *outData,
UErrorCode *pErrorCode);
// C++-only declarations: hidden from C translation units that include this header.
#ifdef XP_CPLUSPLUS
U_NAMESPACE_BEGIN
// Forward declaration only; the pointer return type below does not need the full class.
class UnicodeSet;
// implemented in uniset_props.cpp
// NOTE(review): presumably returns a shared UnicodeSet instance and reports
// failures through errorCode -- confirm against the definition.
U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode &errorCode);
U_NAMESPACE_END
#endif
#endif

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* file name: ustr_imp.h
@ -25,6 +25,23 @@
typedef struct UBreakIterator UBreakIterator;
#endif
#ifndef U_COMPARE_IGNORE_CASE
/* see also unorm.h */
/**
* Option bit for unorm_compare:
* Perform case-insensitive comparison.
* @draft ICU 2.2
*/
#define U_COMPARE_IGNORE_CASE 0x10000
#endif
/**
* Internal option for unorm_cmpEquivFold() for strncmp style.
* If set, checks for both string length and terminating NUL.
* @internal
*/
#define _STRNCMP_STYLE 0x1000
/**
* Compare two strings in code point order or code unit order.
* Works in strcmp style (both lengths -1),

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -25,7 +25,6 @@
#include "unicode/ubrk.h"
#include "cmemory.h"
#include "ucase.h"
#include "unormimp.h"
#include "ustr_imp.h"
/* string casing ------------------------------------------------------------ */

View file

@ -1,11 +1,11 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
* file name: utrie2.c
* file name: utrie2.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
@ -423,7 +423,7 @@ utrie2_swap(const UDataSwapper *ds,
trie.indexLength=ds->readUInt16(inTrie->indexLength);
trie.shiftedDataLength=ds->readUInt16(inTrie->shiftedDataLength);
valueBits=trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK;
valueBits=(UTrie2ValueBits)(trie.options&UTRIE2_OPTIONS_VALUE_BITS_MASK);
dataLength=(int32_t)trie.shiftedDataLength<<UTRIE2_INDEX_SHIFT;
if( trie.signature!=UTRIE2_SIG ||
@ -696,3 +696,39 @@ utrie2_enumForLeadSurrogate(const UTrie2 *trie, UChar32 lead,
lead=(lead-0xd7c0)<<10; /* start code point */
enumEitherTrie(trie, lead, lead+0x400, enumValue, enumRange, context);
}
/* C++ convenience wrappers ------------------------------------------------- */
U_NAMESPACE_BEGIN
/**
 * Moves the iterator one step backward and returns a 16-bit trie value.
 * The former codePointStart becomes the new codePointLimit.
 * When the iterator is already at (or before) start, codePoint is set to
 * U_SENTINEL and 0 is returned.
 */
uint16_t BackwardUTrie2StringIterator::previous16() {
    // The range read next ends where the previous range began.
    codePointLimit=codePointStart;
    if(codePointStart<=start) {
        // Nothing left in front of the current position.
        codePoint=U_SENTINEL;
        return 0;
    }
    uint16_t trieValue;
    // Presumably moves codePointStart back over one code point and fills in
    // codePoint and trieValue -- see the macro definition in utrie2.h.
    UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, trieValue);
    return trieValue;
}
/**
 * Moves the iterator one step forward and returns a 16-bit trie value.
 * The former codePointLimit becomes the new codePointStart.
 * When codePointLimit has reached limit, codePoint is set to U_SENTINEL
 * and 0 is returned.
 */
uint16_t ForwardUTrie2StringIterator::next16() {
    // The range read next begins where the previous range ended.
    codePointStart=codePointLimit;
    if(limit==codePointLimit) {
        // The iteration limit has been reached.
        codePoint=U_SENTINEL;
        return 0;
    }
    uint16_t trieValue;
    // Presumably advances codePointLimit over one code point and fills in
    // codePoint and trieValue -- see the macro definition in utrie2.h.
    UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, trieValue);
    return trieValue;
}
/**
 * Returns the UTrie2 held by the wrapped SimpleSingleton.
 * instantiator, context and errorCode are passed straight through to
 * SimpleSingleton::getInstance().
 * Whatever the singleton hands back through "duplicate" is closed here.
 *
 * @param instantiator passed through to the singleton
 * @param context      passed through to the singleton
 * @param errorCode    in/out ICU error code, passed through to the singleton
 * @return the singleton's instance, cast to UTrie2 *
 */
UTrie2 *UTrie2Singleton::getInstance(InstantiatorFn *instantiator, const void *context,
                                     UErrorCode &errorCode) {
    // Start from NULL so that utrie2_close() below never receives an
    // indeterminate pointer, even if the singleton returns without
    // assigning "duplicate".
    // NOTE(review): relies on utrie2_close(NULL) being a no-op -- confirm.
    void *duplicate=NULL;
    UTrie2 *instance=(UTrie2 *)singleton.getInstance(instantiator, context, duplicate, errorCode);
    utrie2_close((UTrie2 *)duplicate);
    return instance;
}
U_NAMESPACE_END

View file

@ -1,7 +1,7 @@
/*
******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
@ -605,8 +605,70 @@ utrie2_set32ForLeadSurrogateCodeUnit(UTrie2 *trie,
*/
#define UTRIE2_GET32_FROM_SUPP(trie, c) _UTRIE2_GET_FROM_SUPP((trie), data32, c)
U_CDECL_END
/* C++ convenience wrappers ------------------------------------------------- */
#ifdef XP_CPLUSPLUS
#include "mutex.h"
U_NAMESPACE_BEGIN
// Use the Forward/Backward subclasses below.
// Shared state for walking a UChar string while looking up values in a UTrie2.
class UTrie2StringIterator : public UMemory {
public:
// Starts at position p with an empty current range and no current code point.
UTrie2StringIterator(const UTrie2 *t, const UChar *p) :
trie(t), codePointStart(p), codePointLimit(p), codePoint(U_SENTINEL) {}
// The trie that values are read from.
const UTrie2 *trie;
// String positions updated by previous16()/next16() in the subclasses.
const UChar *codePointStart, *codePointLimit;
// Set to U_SENTINEL when an iteration step finds nothing more to read.
UChar32 codePoint;
};
// Iterator that walks backward from position p toward the text start s.
class BackwardUTrie2StringIterator : public UTrie2StringIterator {
public:
// t: trie to read from; s: start of the text; p: initial position.
BackwardUTrie2StringIterator(const UTrie2 *t, const UChar *s, const UChar *p) :
UTrie2StringIterator(t, p), start(s) {}
// Steps backward and returns a 16-bit trie value;
// sets codePoint=U_SENTINEL and returns 0 once codePointStart has reached start.
uint16_t previous16();
// Backward iteration stops here.
const UChar *start;
};
// Iterator that walks forward from position p toward the limit l.
class ForwardUTrie2StringIterator : public UTrie2StringIterator {
public:
// Iteration limit l can be NULL.
// In that case, the caller must detect c==0 and stop.
// t: trie to read from; p: initial position; l: iteration limit.
ForwardUTrie2StringIterator(const UTrie2 *t, const UChar *p, const UChar *l) :
UTrie2StringIterator(t, p), limit(l) {}
// Steps forward and returns a 16-bit trie value;
// sets codePoint=U_SENTINEL and returns 0 once codePointLimit equals limit.
uint16_t next16();
// Forward iteration stops here (may be NULL, see above).
const UChar *limit;
};
// Wrapper around a SimpleSingleton whose stored instance is treated as a UTrie2.
class UTrie2Singleton {
public:
// Keeps a reference to s; the SimpleSingleton itself is supplied by the caller.
UTrie2Singleton(SimpleSingleton &s) : singleton(s) {}
// Closes the trie currently stored in the singleton, then resets the singleton.
void deleteInstance() {
utrie2_close((UTrie2 *)singleton.fInstance);
singleton.reset();
}
// Returns the singleton's instance as a UTrie2; defined in utrie2.cpp.
UTrie2 *getInstance(InstantiatorFn *instantiator, const void *context,
UErrorCode &errorCode);
private:
// The wrapped singleton, held by reference.
SimpleSingleton &singleton;
};
U_NAMESPACE_END
#endif
/* Internal definitions ----------------------------------------------------- */
U_CDECL_BEGIN
/** Build-time trie structure. */
struct UNewTrie2;
typedef struct UNewTrie2 UNewTrie2;

View file

@ -2,7 +2,7 @@
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.63.
#
# Copyright (c) 1999-2009, International Business Machines Corporation and others. All Rights Reserved.
# Copyright (c) 1999-2010, International Business Machines Corporation and others. All Rights Reserved.
#
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
@ -10583,7 +10583,7 @@ then
fi
# output the Makefiles
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
ac_config_files="$ac_config_files icudefs.mk Makefile data/pkgdataMakefile config/Makefile.inc config/pkgdataMakefile data/Makefile stubdata/Makefile common/Makefile i18n/Makefile layout/Makefile layoutex/Makefile io/Makefile extra/Makefile extra/uconv/Makefile extra/uconv/pkgdataMakefile extra/scrptrun/Makefile tools/Makefile tools/ctestfw/Makefile tools/toolutil/Makefile tools/makeconv/Makefile tools/genrb/Makefile tools/genuca/Makefile tools/genccode/Makefile tools/gencmn/Makefile tools/gencnval/Makefile tools/genctd/Makefile tools/gennames/Makefile tools/gentest/Makefile tools/gennorm/Makefile tools/gennorm2/Makefile tools/genprops/Makefile tools/gencase/Makefile tools/genbidi/Makefile tools/genpname/Makefile tools/genbrk/Makefile tools/gensprep/Makefile tools/icupkg/Makefile tools/icuswap/Makefile tools/pkgdata/Makefile tools/tzcode/Makefile tools/gencfu/Makefile test/Makefile test/compat/Makefile test/testdata/Makefile test/testdata/pkgdataMakefile test/hdrtst/Makefile test/intltest/Makefile test/cintltst/Makefile test/iotest/Makefile test/letest/Makefile test/perf/Makefile test/perf/collationperf/Makefile test/perf/ubrkperf/Makefile test/perf/charperf/Makefile test/perf/convperf/Makefile test/perf/normperf/Makefile test/perf/strsrchperf/Makefile test/perf/unisetperf/Makefile test/perf/usetperf/Makefile test/perf/ustrperf/Makefile test/perf/utfperf/Makefile test/perf/utrie2perf/Makefile samples/Makefile samples/date/Makefile samples/cal/Makefile samples/layout/Makefile common/unicode/platform.h"
cat >confcache <<\_ACEOF
# This file is a shell script that caches the results of configure
@ -11210,6 +11210,7 @@ do
"tools/gennames/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennames/Makefile" ;;
"tools/gentest/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gentest/Makefile" ;;
"tools/gennorm/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm/Makefile" ;;
"tools/gennorm2/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gennorm2/Makefile" ;;
"tools/genprops/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genprops/Makefile" ;;
"tools/gencase/Makefile") CONFIG_FILES="$CONFIG_FILES tools/gencase/Makefile" ;;
"tools/genbidi/Makefile") CONFIG_FILES="$CONFIG_FILES tools/genbidi/Makefile" ;;

View file

@ -1,5 +1,5 @@
# -*-autoconf-*-
AC_COPYRIGHT([ Copyright (c) 1999-2009, International Business Machines Corporation and others. All Rights Reserved. ])
AC_COPYRIGHT([ Copyright (c) 1999-2010, International Business Machines Corporation and others. All Rights Reserved. ])
# configure.in for ICU
# Stephen F. Booth, heavily modified by Yves and others
@ -1223,6 +1223,7 @@ AC_CONFIG_FILES([icudefs.mk \
tools/gennames/Makefile \
tools/gentest/Makefile \
tools/gennorm/Makefile \
tools/gennorm2/Makefile \
tools/genprops/Makefile \
tools/gencase/Makefile \
tools/genbidi/Makefile \

View file

@ -1,5 +1,5 @@
## Makefile.in for ICU data
## Copyright (c) 1999-2009, International Business Machines Corporation and
## Copyright (c) 1999-2010, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
@ -223,7 +223,7 @@ package390: $(OUTTMPDIR)/icudata390.lst $(PKGDATA_LIST) ./icupkg.inc packagedata
# 2005-may-05 Removed Unicode properties files (unorm.icu, uprops.icu, ucase.icu, ubidi.icu)
# from data build. See Jitterbug 4497. (makedata.mak revision 1.117)
#
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu
DAT_FILES_SHORT=pnames.icu unames.icu cnvalias.icu coll/ucadata.icu coll/invuca.icu nfc.nrm nfkc.nrm nfkc_cf.nrm
DAT_FILES=$(DAT_FILES_SHORT:%=$(BUILDDIR)/%)
## BRK files
@ -488,14 +488,17 @@ $(BUILDDIR)/pnames.icu: $(UNICODEDATADIR)/PropertyAliases.txt $(UNICODEDATADIR)/
$(INVOKE) $(TOOLBINDIR)/genpname -d $(BUILDDIR)
# unorm.icu
$(BUILDDIR)/unorm.icu: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProps.txt $(UNICODEDATADIR)/BidiMirroring.txt $(TOOLBINDIR)/gennorm$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)pnames.icu $(BUILDDIR)/$(ICUDT)uprops.icu $(BUILDDIR)/$(ICUDT)ucase.icu
$(INVOKE) $(TOOLBINDIR)/gennorm -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(BUILDDIR) -u $(UNICODE_VERSION)
# ICU 4.4: $(BUILDDIR)/unorm.icu is now prebuilt, see below.
$(OUTTMPDIR)/unorm_props_data.c: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/DerivedNormalizationProps.txt $(UNICODEDATADIR)/BidiMirroring.txt $(TOOLBINDIR)/gennorm$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)pnames.icu $(BUILDDIR)/$(ICUDT)uprops.icu $(BUILDDIR)/$(ICUDT)ucase.icu
$(INVOKE) $(TOOLBINDIR)/gennorm --csource -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(OUTTMPDIR) -u $(UNICODE_VERSION)
# unorm.icu used to be built like this:
# $(INVOKE) $(TOOLBINDIR)/gennorm -s $(UNICODEDATADIR) -i $(BUILDDIR) -d $(BUILDDIR) -u $(UNICODE_VERSION)
# ucadata.icu
# used to depend on $(BUILDDIR)/$(ICUDT)unorm.icu $(BUILDDIR)/$(ICUDT)ucase.icu
# see Jitterbug 4497
$(COLBLDDIR)/ucadata.icu $(COLBLDDIR)/invuca.icu: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLBINDIR)/genuca$(TOOLEXEEXT)
$(COLBLDDIR)/ucadata.icu $(COLBLDDIR)/invuca.icu: $(UNICODEDATADIR)/FractionalUCA.txt $(TOOLBINDIR)/genuca$(TOOLEXEEXT) $(BUILDDIR)/$(ICUDT)nfc.nrm
$(INVOKE) $(TOOLBINDIR)/genuca -s $(UNICODEDATADIR) -d $(COLBLDDIR) -i $(BUILDDIR)
# unames.icu
@ -506,6 +509,13 @@ $(BUILDDIR)/unames.icu: $(UNICODEDATADIR)/UnicodeData.txt $(UNICODEDATADIR)/Name
$(BUILDDIR)/cnvalias.icu: $(UCMSRCDIR)/convrtrs.txt $(TOOLBINDIR)/gencnval$(TOOLEXEEXT)
$(INVOKE) $(TOOLBINDIR)/gencnval -d $(BUILDDIR) $(UCMSRCDIR)/convrtrs.txt
# Targets for prebuilt Unicode data
$(BUILDDIR)/unorm.icu: $(SRCDATADIR)/in/unorm.icu
$(INVOKE) $(TOOLBINDIR)/icupkg -t$(ICUDATA_CHAR) $< $@
$(BUILDDIR)/%.nrm: $(SRCDATADIR)/in/%.nrm
$(INVOKE) $(TOOLBINDIR)/icupkg -t$(ICUDATA_CHAR) $< $@
#################################################### SPP
# SPP FILES
@ -751,7 +761,7 @@ $(INDEX_RES_FILE): $(INDEX_FILE) $(TOOLBINDIR)/genrb$(TOOLEXEEXT)
# They are not built by default but need to be built for ICU4J data and for getting the .c source files
# when updating the Unicode data.
# Changed in Makefile.in revision 1.147. See Jitterbug 4497.
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA)
uni-core-data: build-dir $(UNI_CORE_TARGET_DATA) $(OUTTMPDIR)/unorm_props_data.c
@echo Unicode .icu files built to $(BUILDDIR)
@echo Unicode .c source files built to $(OUTTMPDIR)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,5 +1,5 @@
#**********************************************************************
#* Copyright (C) 1999-2009, International Business Machines Corporation
#* Copyright (C) 1999-2010, International Business Machines Corporation
#* and others. All Rights Reserved.
#**********************************************************************
# nmake file for creating data files on win32
@ -28,7 +28,7 @@ ICU_LIB_TARGET=$(DLL_OUTPUT)\$(U_ICUDATA_NAME).dll
!MESSAGE ICU data make path is $(ICUMAKE)
# Suffixes for data files
.SUFFIXES : .ucm .cnv .dll .dat .res .txt .c
.SUFFIXES : .nrm .icu .ucm .cnv .dll .dat .res .txt .c
ICUOUT=$(ICUMAKE)\out
@ -474,8 +474,8 @@ ALL : GODATA "$(ICU_LIB_TARGET)" "$(TESTDATAOUT)\testdata.dat"
# when updating the Unicode data.
# Changed in makedata.mak revision 1.117. See Jitterbug 4497.
# Command line:
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=Debug uni-core-data
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu"
# C:\svn\icuproj\icu\trunk\source\data>nmake -f makedata.mak ICUMAKE=C:\svn\icuproj\icu\trunk\source\data\ CFG=x86\Debug uni-core-data
uni-core-data: GODATA "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu" "$(ICUTMP)\unorm_props_data.c"
@echo Unicode .icu files built to "$(ICUBLD_PKG)"
@echo Unicode .c source files built to "$(ICUTMP)"
@ -553,7 +553,7 @@ testdata.jar: GODATA "$(ICUOUT)\icu4j\testdata.jar"
copy "$(ICUTMP)\$(ICUPKG).dat" "$(ICUOUT)\$(U_ICUDATA_NAME)$(U_ICUDATA_ENDIAN_SUFFIX).dat"
-@erase "$(ICUTMP)\$(ICUPKG).dat"
!ELSE
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
"$(ICU_LIB_TARGET)" : $(COMMON_ICUDATA_DEPENDENCIES) $(CNV_FILES) "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\cnvalias.icu" "$(ICUBLD_PKG)\nfc.nrm" "$(ICUBLD_PKG)\nfkc.nrm" "$(ICUBLD_PKG)\nfkc_cf.nrm" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu" "$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" $(CURR_RES_FILES) $(LANG_RES_FILES) $(REGION_RES_FILES) $(ZONE_RES_FILES) $(BRK_FILES) $(BRK_CTD_FILES) $(BRK_RES_FILES) $(COL_COL_FILES) $(RBNF_RES_FILES) $(TRANSLIT_RES_FILES) $(ALL_RES) $(SPREP_FILES) "$(ICUBLD_PKG)\confusables.cfu"
@echo Building icu data
cd "$(ICUBLD_PKG)"
"$(ICUPBIN)\pkgdata" $(COMMON_ICUDATA_ARGUMENTS) <<"$(ICUTMP)\icudata.lst"
@ -563,6 +563,9 @@ confusables.cfu
$(ICUCOL)\ucadata.icu
$(ICUCOL)\invuca.icu
cnvalias.icu
nfc.nrm
nfkc.nrm
nfkc_cf.nrm
$(CNV_FILES:.cnv =.cnv
)
$(ALL_RES:.res =.res
@ -627,6 +630,7 @@ CLEAN : GODATA
-@erase "*.exp"
-@erase "*.icu"
-@erase "*.lib"
-@erase "*.nrm"
-@erase "*.res"
-@erase "*.spp"
-@erase "*.txt"
@ -878,9 +882,10 @@ res_index:table(nofallback) {
@"$(ICUTOOLS)\gencase\$(CFG)\gencase" --csource -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUTMP)"
# Targets for unorm.icu
"$(ICUBLD_PKG)\unorm.icu": "$(ICUUNIDATA)\*.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu"
# ICU 4.4: "$(ICUBLD_PKG)\unorm.icu" is now prebuilt, see below.
"$(ICUTMP)\unorm_props_data.c": "$(ICUUNIDATA)\*.txt" "$(ICUTOOLS)\gennorm\$(CFG)\gennorm.exe" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu"
@echo Creating data file for Unicode Normalization
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUBLD_PKG)"
@rem @"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUBLD_PKG)"
@"$(ICUTOOLS)\gennorm\$(CFG)\gennorm" --csource -u $(UNICODE_VERSION) -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)" -d "$(ICUTMP)"
# Targets for converters
@ -891,10 +896,23 @@ res_index:table(nofallback) {
# Targets for ucadata.icu & invuca.icu
# used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\unorm.icu"
# see Jitterbug 4497
"$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu": "$(ICUUNIDATA)\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\genuca.exe"
"$(ICUBLD_PKG)\$(ICUCOL)\invuca.icu" "$(ICUBLD_PKG)\$(ICUCOL)\ucadata.icu": "$(ICUUNIDATA)\FractionalUCA.txt" "$(ICUTOOLS)\genuca\$(CFG)\genuca.exe" "$(ICUBLD_PKG)\nfc.nrm"
@echo Creating UCA data files
@"$(ICUTOOLS)\genuca\$(CFG)\genuca" -d "$(ICUBLD_PKG)\$(ICUCOL)" -i "$(ICUBLD_PKG)" -s "$(ICUUNIDATA)"
# Targets for prebuilt Unicode data
"$(ICUBLD_PKG)\unorm.icu": $(ICUSRCDATA_RELATIVE_PATH)\in\unorm.icu
"$(ICUPBIN)\icupkg" -tl $? $@
"$(ICUBLD_PKG)\nfc.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfc.nrm
"$(ICUPBIN)\icupkg" -tl $? $@
"$(ICUBLD_PKG)\nfkc.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfkc.nrm
"$(ICUPBIN)\icupkg" -tl $? $@
"$(ICUBLD_PKG)\nfkc_cf.nrm": $(ICUSRCDATA_RELATIVE_PATH)\in\nfkc_cf.nrm
"$(ICUPBIN)\icupkg" -tl $? $@
# Stringprep .spp file generation.
{$(ICUSRCDATA_RELATIVE_PATH)\$(ICUSPREP)}.txt.spp:
@echo Creating $@
@ -924,6 +942,6 @@ $(MISC_SOURCE) $(RB_FILES) $(CURR_FILES) $(LANG_FILES) $(REGION_FILES) $(ZONE_FI
# This used to depend on "$(ICUBLD_PKG)\uprops.icu" "$(ICUBLD_PKG)\ucase.icu" "$(ICUBLD_PKG)\ubidi.icu" "$(ICUBLD_PKG)\unorm.icu"
# This data is now hard coded as a part of the library.
# See Jitterbug 4497 for details.
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu"
$(BRK_SOURCE) : "$(ICUBLD_PKG)\unames.icu" "$(ICUBLD_PKG)\pnames.icu" "$(ICUBLD_PKG)\nfc.nrm"
!ENDIF

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 1996-2009, International Business Machines *
* Copyright (C) 1996-2010, International Business Machines *
* Corporation and others. All Rights Reserved. *
******************************************************************************
*/
@ -27,7 +27,7 @@
#include "hash.h"
#include "uhash.h"
#include "ucol_imp.h"
#include "unormimp.h"
#include "normalizer2impl.h"
#include "unicode/colldata.h"
#include "unicode/bmsearch.h"
@ -81,6 +81,7 @@ private:
uint32_t variableTop;
UBool toShift;
UCollator *coll;
const Normalizer2 &nfd;
const UnicodeString *targetString;
const UChar *targetBuffer;
@ -93,6 +94,7 @@ private:
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
: bufferSize(0), bufferMin(0), bufferMax(0),
strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
nfd(*Normalizer2Factory::getNFDInstance(status)),
targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
strength = ucol_getStrength(coll);
@ -348,63 +350,14 @@ UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
return TRUE;
}
UChar t2[32], p2[32];
const UChar *pBuffer = pattern.getBuffer();
int32_t pLength = pattern.length();
int32_t length = end - start;
UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
int32_t decomplength = unorm_decompose(t2, ARRAY_SIZE(t2),
targetBuffer + start, length,
FALSE, 0, &status);
// use separate status2 in case of buffer overflow
if (decomplength != unorm_decompose(p2, ARRAY_SIZE(p2),
pBuffer, pLength,
FALSE, 0, &status2)) {
return FALSE; // lengths are different
}
// compare contents
UChar *text, *pat;
if(U_SUCCESS(status)) {
text = t2;
pat = p2;
} else if(status == U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
// allocate one buffer for both decompositions
text = NEW_ARRAY(UChar, decomplength * 2);
// Check for allocation failure.
if (text == NULL) {
return FALSE;
}
pat = text + decomplength;
unorm_decompose(text, decomplength, targetBuffer + start,
length, FALSE, 0, &status);
unorm_decompose(pat, decomplength, pBuffer,
pLength, FALSE, 0, &status);
} else {
// NFD failed, make sure that u_memcmp() does not overrun t2 & p2
// and that we don't uprv_free() an undefined text pointer
text = pat = t2;
decomplength = 0;
}
UBool result = (UBool)(u_memcmp(pat, text, decomplength) == 0);
if(text != t2) {
DELETE_ARRAY(text);
}
// Note: We could use Normalizer::compare() or similar, but for short strings
// which may not be in FCD it might be faster to just NFD them.
UErrorCode status = U_ZERO_ERROR;
UnicodeString t2, p2;
nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
nfd.normalize(pattern, p2, status);
// return FALSE if NFD failed
return U_SUCCESS(status) && result;
return U_SUCCESS(status) && t2 == p2;
}
#define HASH_TABLE_SIZE 257

View file

@ -1,6 +1,6 @@
/*
*******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and *
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
@ -122,9 +122,9 @@ UBool CollationElementIterator::operator==(
}
// both are in the normalization buffer
if (m_data_->iteratordata_.pos
- m_data_->iteratordata_.writableBuffer
- m_data_->iteratordata_.writableBuffer.getBuffer()
!= that.m_data_->iteratordata_.pos
- that.m_data_->iteratordata_.writableBuffer) {
- that.m_data_->iteratordata_.writableBuffer.getBuffer()) {
// not in the same position in the normalization buffer
return FALSE;
}
@ -176,7 +176,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
int32_t length = source.length();
UChar *string = NULL;
if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) {
uprv_free(m_data_->iteratordata_.string);
uprv_free((UChar *)m_data_->iteratordata_.string);
}
m_data_->isWritable = TRUE;
if (length > 0) {
@ -200,7 +200,7 @@ void CollationElementIterator::setText(const UnicodeString& source,
/* Free offsetBuffer before initializing it. */
ucol_freeOffsetBuffer(&(m_data_->iteratordata_));
uprv_init_collIterate(m_data_->iteratordata_.coll, string, length,
&m_data_->iteratordata_);
&m_data_->iteratordata_, &status);
m_data_->reset_ = TRUE;
}
@ -241,13 +241,13 @@ void CollationElementIterator::setText(CharacterIterator& source,
}
if (m_data_->isWritable && m_data_->iteratordata_.string != NULL) {
uprv_free(m_data_->iteratordata_.string);
uprv_free((UChar *)m_data_->iteratordata_.string);
}
m_data_->isWritable = TRUE;
/* Free offsetBuffer before initializing it. */
ucol_freeOffsetBuffer(&(m_data_->iteratordata_));
uprv_init_collIterate(m_data_->iteratordata_.coll, buffer, length,
&m_data_->iteratordata_);
&m_data_->iteratordata_, &status);
m_data_->reset_ = TRUE;
}
@ -407,7 +407,7 @@ const CollationElementIterator& CollationElementIterator::operator=(
if (length > 0) {
coliter->string = (UChar *)uprv_malloc(length * U_SIZEOF_UCHAR);
if(coliter->string != NULL) {
uprv_memcpy(coliter->string, othercoliter->string,
uprv_memcpy((UChar *)coliter->string, othercoliter->string,
length * U_SIZEOF_UCHAR);
} else { // Error: couldn't allocate memory. No copying should be done
length = 0;
@ -423,27 +423,8 @@ const CollationElementIterator& CollationElementIterator::operator=(
/* handle writable buffer here */
if (othercoliter->flags & UCOL_ITER_INNORMBUF) {
uint32_t wlength = u_strlen(othercoliter->writableBuffer) + 1;
if (wlength < coliter->writableBufSize) {
uprv_memcpy(coliter->stackWritableBuffer,
othercoliter->stackWritableBuffer,
wlength * U_SIZEOF_UCHAR);
}
else {
if (coliter->writableBuffer != coliter->stackWritableBuffer) {
uprv_free(coliter->writableBuffer);
}
coliter->writableBuffer = (UChar *)uprv_malloc(
wlength * U_SIZEOF_UCHAR);
if(coliter->writableBuffer != NULL) {
uprv_memcpy(coliter->writableBuffer,
othercoliter->writableBuffer,
wlength * U_SIZEOF_UCHAR);
coliter->writableBufSize = wlength;
} else { // Error: couldn't allocate memory for writableBuffer
coliter->writableBufSize = 0;
}
}
coliter->writableBuffer = othercoliter->writableBuffer;
coliter->writableBuffer.getTerminatedBuffer();
}
/* current position */
@ -453,13 +434,9 @@ const CollationElementIterator& CollationElementIterator::operator=(
coliter->pos = coliter->string +
(othercoliter->pos - othercoliter->string);
}
else if (coliter->writableBuffer != NULL) {
coliter->pos = coliter->writableBuffer +
(othercoliter->pos - othercoliter->writableBuffer);
}
else {
// Error: couldn't allocate memory for writableBuffer
coliter->pos = NULL;
coliter->pos = coliter->writableBuffer.getTerminatedBuffer() +
(othercoliter->pos - othercoliter->writableBuffer.getBuffer());
}
/* CE buffer */

View file

@ -895,7 +895,7 @@
>
</File>
<File
RelativePath=".\ucol_wgt.c"
RelativePath=".\ucol_wgt.cpp"
>
</File>
<File

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2007, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -12,37 +12,43 @@
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/uniset.h"
#include "unicode/uiter.h"
#include "unicode/normalizer2.h"
#include "cstring.h"
#include "nortrans.h"
#include "unormimp.h"
#include "ucln_in.h"
U_NAMESPACE_BEGIN
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(NormalizationTransliterator)
// Packs a C string pointer into a Transliterator::Token via pointerToken().
static inline Transliterator::Token cstrToken(const char *s) {
    // pointerToken() is given a non-const void *, so constness is cast away here.
    void *opaque = (void *)s;
    return Transliterator::pointerToken(opaque);
}
/**
* System registration hook.
*/
void NormalizationTransliterator::registerIDs() {
UErrorCode errorCode = U_ZERO_ERROR;
if(!unorm_haveData(&errorCode)) {
return;
}
// In the Token, the byte after the NUL is the UNormalization2Mode.
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFC"),
_create, integerToken(UNORM_NFC));
_create, cstrToken("nfc\0\0"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKC"),
_create, integerToken(UNORM_NFKC));
_create, cstrToken("nfkc\0\0"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFD"),
_create, integerToken(UNORM_NFD));
_create, cstrToken("nfc\0\1"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-NFKD"),
_create, integerToken(UNORM_NFKD));
_create, cstrToken("nfkc\0\1"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCD"),
_create, cstrToken("nfc\0\2"));
Transliterator::_registerFactory(UNICODE_STRING_SIMPLE("Any-FCC"),
_create, cstrToken("nfc\0\3"));
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFC"),
UNICODE_STRING_SIMPLE("NFD"), TRUE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("NFKC"),
UNICODE_STRING_SIMPLE("NFKD"), TRUE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCC"),
UNICODE_STRING_SIMPLE("NFD"), FALSE);
Transliterator::_registerSpecialInverse(UNICODE_STRING_SIMPLE("FCD"),
UNICODE_STRING_SIMPLE("FCD"), FALSE);
}
/**
@ -50,19 +56,23 @@ void NormalizationTransliterator::registerIDs() {
*/
Transliterator* NormalizationTransliterator::_create(const UnicodeString& ID,
Token context) {
return new NormalizationTransliterator(ID, (UNormalizationMode) context.integer, 0);
const char *name = (const char *)context.pointer;
UNormalization2Mode mode = (UNormalization2Mode)uprv_strchr(name, 0)[1];
UErrorCode errorCode = U_ZERO_ERROR;
const Normalizer2 *norm2 = Normalizer2::getInstance(NULL, name, mode, errorCode);
if(U_SUCCESS(errorCode)) {
return new NormalizationTransliterator(ID, *norm2);
} else {
return NULL;
}
}
/**
* Constructs a transliterator.
*/
NormalizationTransliterator::NormalizationTransliterator(
const UnicodeString& id,
UNormalizationMode mode, int32_t opt) :
Transliterator(id, 0) {
fMode = mode;
options = opt;
}
NormalizationTransliterator::NormalizationTransliterator(const UnicodeString& id,
const Normalizer2 &norm2) :
Transliterator(id, 0), fNorm2(norm2) {}
/**
* Destructor.
@ -74,20 +84,7 @@ NormalizationTransliterator::~NormalizationTransliterator() {
* Copy constructor.
*/
NormalizationTransliterator::NormalizationTransliterator(const NormalizationTransliterator& o) :
Transliterator(o) {
fMode = o.fMode;
options = o.options;
}
/**
* Assignment operator.
*/
/*NormalizationTransliterator& NormalizationTransliterator::operator=(const NormalizationTransliterator& o) {
Transliterator::operator=(o);
fMode = o.fMode;
options = o.options;
return *this;
}*/
Transliterator(o), fNorm2(o.fNorm2) {}
/**
* Transliterator API.
@ -104,23 +101,10 @@ void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransP
// start and limit of the input range
int32_t start = offsets.start;
int32_t limit = offsets.limit;
int32_t length, delta;
if(start >= limit) {
return;
}
// a C code unit iterator, implemented around the Replaceable
UCharIterator iter;
uiter_setReplaceable(&iter, &text);
// the output string and buffer pointer
UnicodeString output;
UChar *buffer;
UBool neededToNormalize;
UErrorCode errorCode;
/*
* Normalize as short chunks at a time as possible even in
* bulk mode, so that styled text is minimally disrupted.
@ -129,101 +113,62 @@ void NormalizationTransliterator::handleTransliterate(Replaceable& text, UTransP
*
* If it was known that the input text is not styled, then
* a bulk mode normalization could look like this:
*
UChar staticChars[256];
UnicodeString input;
length = limit - start;
input.setTo(staticChars, 0, sizeof(staticChars)/U_SIZEOF_UCHAR); // writable alias
UnicodeString input, normalized;
int32_t length = limit - start;
_Replaceable_extractBetween(text, start, limit, input.getBuffer(length));
input.releaseBuffer(length);
UErrorCode status = U_ZERO_ERROR;
Normalizer::normalize(input, fMode, options, output, status);
fNorm2.normalize(input, normalized, status);
text.handleReplaceBetween(start, limit, output);
text.handleReplaceBetween(start, limit, normalized);
int32_t delta = output.length() - length;
int32_t delta = normalized.length() - length;
offsets.contextLimit += delta;
offsets.limit += delta;
offsets.start = limit + delta;
*
*/
while(start < limit) {
// set the iterator limits for the remaining input range
// this is a moving target because of the replacements in the text object
iter.start = iter.index = start;
iter.limit = limit;
// incrementally normalize a small chunk of the input
buffer = output.getBuffer(-1);
errorCode = U_ZERO_ERROR;
length = unorm_next(&iter, buffer, output.getCapacity(),
fMode, 0,
TRUE, &neededToNormalize,
&errorCode);
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
// use a larger output string buffer and do it again from the start
iter.index = start;
buffer = output.getBuffer(length);
errorCode = U_ZERO_ERROR;
length = unorm_next(&iter, buffer, output.getCapacity(),
fMode, 0,
TRUE, &neededToNormalize,
&errorCode);
output.releaseBuffer(U_SUCCESS(errorCode) ? length : 0);
UErrorCode errorCode = U_ZERO_ERROR;
UnicodeString segment;
UnicodeString normalized;
UChar32 c = text.char32At(start);
do {
int32_t prev = start;
// Skip at least one character so we make progress.
// c holds the character at start.
segment.setTo(c);
start += U16_LENGTH(c);
while(start < limit && !fNorm2.hasBoundaryBefore(c = text.char32At(start))) {
segment.append(c);
start += U16_LENGTH(c);
}
if(U_FAILURE(errorCode)) {
break;
}
limit = iter.index;
if(isIncremental && limit == iter.limit) {
if(start == limit && isIncremental && !fNorm2.hasBoundaryAfter(c)) {
// stop in incremental mode when we reach the input limit
// in case there are additional characters that could change the
// normalization result
// UNLESS all characters in the result of the normalization of
// the last run are in the skippable set
const UChar *s=output.getBuffer();
int32_t i=0, outLength=output.length();
UChar32 c;
while(i<outLength) {
U16_NEXT(s, i, outLength, c);
if(!unorm_isNFSkippable(c, fMode)) {
outLength=-1; // I wish C++ had labeled loops and break outer; ...
break;
}
}
if (outLength<0) {
break;
}
start=prev;
break;
}
if(neededToNormalize) {
fNorm2.normalize(segment, normalized, errorCode);
if(U_FAILURE(errorCode)) {
break;
}
if(segment != normalized) {
// replace the input chunk with its normalized form
text.handleReplaceBetween(start, limit, output);
text.handleReplaceBetween(prev, start, normalized);
// update all necessary indexes accordingly
delta = length - (limit - start); // length change in the text object
start = limit += delta; // the next chunk starts where this one ends, with adjustment
limit = offsets.limit += delta; // set the iteration limit to the adjusted end of the input range
offsets.contextLimit += delta;
} else {
// delta == 0
start = limit;
limit = offsets.limit;
int32_t delta = normalized.length() - (start - prev);
start += delta;
limit += delta;
}
}
} while(start < limit);
offsets.start = start;
offsets.contextLimit += limit - offsets.limit;
offsets.limit = limit;
}
U_NAMESPACE_END

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2007, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
@ -15,7 +15,7 @@
#if !UCONFIG_NO_TRANSLITERATION
#include "unicode/translit.h"
#include "unicode/normlzr.h"
#include "unicode/normalizer2.h"
U_NAMESPACE_BEGIN
@ -24,16 +24,7 @@ U_NAMESPACE_BEGIN
* @author Alan Liu
*/
class NormalizationTransliterator : public Transliterator {
/**
* The normalization mode of this transliterator.
*/
UNormalizationMode fMode;
/**
* Normalization options for this transliterator.
*/
int32_t options;
const Normalizer2 &fNorm2;
public:
@ -93,8 +84,7 @@ class NormalizationTransliterator : public Transliterator {
* Constructs a transliterator. This method is private.
* Public users must use the factory method createInstance().
*/
NormalizationTransliterator(const UnicodeString& id,
UNormalizationMode mode, int32_t opt);
NormalizationTransliterator(const UnicodeString& id, const Normalizer2 &norm2);
private:
/**

File diff suppressed because it is too large Load diff

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2008, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -25,12 +25,12 @@
#include "unicode/udata.h"
#include "unicode/uchar.h"
#include "unicode/uniset.h"
#include "normalizer2impl.h"
#include "ucol_bld.h"
#include "ucol_elm.h"
#include "ucol_cnt.h"
#include "ucln_in.h"
#include "umutex.h"
#include "unormimp.h"
#include "cmemory.h"
static const InverseUCATableHeader* _staticInvUCA = NULL;
@ -626,7 +626,7 @@ uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t l
nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
if(U_SUCCESS(*status)) {
for(i = 0; i < nLen; i++) {
uprv_init_collIterate(UCA, &n[i], 1, &s);
uprv_init_collIterate(UCA, &n[i], 1, &s, status);
order = ucol_getNextCE(UCA, &s, status);
if(isContinuation(order)) {
*status = U_INTERNAL_PROGRAM_ERROR;
@ -878,7 +878,7 @@ U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokL
/* then pick CEs out until there is no more and stuff them into expansion */
collIterate s;
uint32_t order = 0;
uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
for(;;) {
order = ucol_getNextCE(src->UCA, &s, status);
@ -1045,7 +1045,7 @@ ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
// it doesn't make any difference whether we have to go to the UCA
// or not.
{
uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
while(CE != UCOL_NO_MORE_CES) {
CE = ucol_getNextCE(src->UCA, &colIt, status);
if(CE != UCOL_NO_MORE_CES) {

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -31,6 +31,7 @@
#include "unicode/unistr.h"
#include "unicode/ucoleitr.h"
#include "unicode/normlzr.h"
#include "normalizer2impl.h"
#include "ucol_elm.h"
#include "ucol_tok.h"
#include "ucol_cnt.h"
@ -1602,6 +1603,7 @@ struct enumStruct {
tempUCATable *t;
UCollator *tempColl;
UCollationElements* colEl;
const Normalizer2Impl *nfcImpl;
int32_t noOfClosures;
UErrorCode *status;
};
@ -1615,7 +1617,8 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
UCollator *tempColl = ((enumStruct *)context)->tempColl;
UCollationElements* colEl = ((enumStruct *)context)->colEl;
UCAElements el;
UChar decomp[256] = { 0 };
UChar decompBuffer[4];
const UChar *decomp;
int32_t noOfDec = 0;
UChar32 u32 = 0;
@ -1623,13 +1626,14 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
uint32_t len = 0;
for(u32 = start; u32 < limit; u32++) {
noOfDec = unorm_getDecomposition(u32, FALSE, decomp, 256);
decomp = ((enumStruct *)context)->nfcImpl->
getDecomposition(u32, decompBuffer, noOfDec);
//if((noOfDec = unorm_normalize(comp, len, UNORM_NFD, 0, decomp, 256, status)) > 1
//|| (noOfDec == 1 && *decomp != (UChar)u32))
if(noOfDec > 0) // if we're positive, that means there is no decomposition
if(decomp != NULL)
{
len = 0;
UTF_APPEND_CHAR_UNSAFE(comp, len, u32);
U16_APPEND_UNSAFE(comp, len, u32);
if(ucol_strcoll(tempColl, comp, len, decomp, noOfDec) != UCOL_EQUAL) {
#ifdef UCOL_DEBUG
fprintf(stderr, "Closure: %08X -> ", u32);
@ -1640,7 +1644,7 @@ _enumCategoryRangeClosureCategory(const void *context, UChar32 start, UChar32 li
fprintf(stderr, "\n");
#endif
((enumStruct *)context)->noOfClosures++;
el.cPoints = decomp;
el.cPoints = (UChar *)decomp;
el.cSize = noOfDec;
el.noOfCEs = 0;
el.prefix = el.prefixChars;
@ -1938,7 +1942,7 @@ uprv_uca_canonicalClosure(tempUCATable *t,
UChar baseChar, firstCM;
UChar32 fcdHighStart;
const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
context.nfcImpl=Normalizer2Factory::getNFCImpl(*status);
if(U_FAILURE(*status)) {
return 0;
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1998-2009, International Business Machines
* Copyright (C) 1998-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -41,6 +41,10 @@
#if !UCONFIG_NO_COLLATION
#ifdef XP_CPLUSPLUS
#include "unicode/normalizer2.h"
#include "unicode/unistr.h"
#endif
#include "unicode/ucol.h"
#include "utrie.h"
#include "cmemory.h"
@ -264,12 +268,14 @@ minimum number for special Jamo
#define NFC_ZERO_CC_BLOCK_LIMIT_ 0x300
typedef struct collIterate {
UChar *string; /* Original string */
#ifdef XP_CPLUSPLUS
typedef struct collIterate : public UMemory {
const UChar *string; /* Original string */
/* UChar *start; Pointer to the start of the source string. Either points to string
or to writableBuffer */
UChar *endp; /* string end ptr. Is undefined for null terminated strings */
UChar *pos; /* This is position in the string. Can be to original or writable buf */
const UChar *endp; /* string end ptr. Is undefined for null terminated strings */
const UChar *pos; /* This is position in the string. Can be to original or writable buf */
uint32_t *toReturn; /* This is the CE from CEs buffer that should be returned */
uint32_t *CEpos; /* This is the position to which we have stored processed CEs */
@ -279,16 +285,15 @@ typedef struct collIterate {
int32_t offsetRepeatCount; /* Repeat stored offset if non-zero */
int32_t offsetRepeatValue; /* offset value to repeat */
UChar *writableBuffer;
uint32_t writableBufSize;
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
UnicodeString writableBuffer;
const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
const UCollator *coll;
const Normalizer2 *nfd;
uint8_t flags;
uint8_t origFlags;
uint32_t *extendCEs; /* This is used if CEs is not big enough */
int32_t extendCEsSize; /* Holds the size of the dynamic CEs buffer */
uint32_t CEs[UCOL_EXPAND_CE_BUFFER_SIZE]; /* This is where we store CEs */
UChar stackWritableBuffer[UCOL_WRITABLE_BUFFER_SIZE]; /* A writable buffer. */
int32_t *offsetBuffer; /* A dynamic buffer to hold offsets */
int32_t offsetBufferSize; /* The size of the offset buffer */
@ -297,6 +302,12 @@ typedef struct collIterate {
/*int32_t iteratorIndex;*/
} collIterate;
#else
typedef struct collIterate collIterate;
#endif
#define paddedsize(something) ((something)+((((something)%4)!=0)?(4-(something)%4):0))
#define headersize (paddedsize(sizeof(UCATableHeader))+paddedsize(sizeof(UColOptionSet)))
@ -305,19 +316,34 @@ struct used internally in getSpecial*CE.
data similar to collIterate.
*/
struct collIterateState {
UChar *pos; /* This is position in the string. Can be to original or writable buf */
UChar *returnPos;
UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
UChar *bufferaddress; /* address of the normalization buffer */
uint32_t buffersize;
const UChar *pos; /* This is position in the string. Can be to original or writable buf */
const UChar *returnPos;
const UChar *fcdPosition; /* Position in the original string to continue FCD check from. */
const UChar *bufferaddress; /* address of the normalization buffer */
int32_t buffersize;
uint8_t flags;
uint8_t origFlags;
uint32_t iteratorIndex;
int32_t iteratorMove;
};
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator, const UChar *sourceString, int32_t sourceLen, collIterate *s);
U_CAPI void U_EXPORT2
uprv_init_collIterate(const UCollator *collator,
const UChar *sourceString, int32_t sourceLen,
collIterate *s, UErrorCode *status);
/* Internal functions for C test code. */
U_CAPI collIterate * U_EXPORT2
uprv_new_collIterate(UErrorCode *status);
U_CAPI void U_EXPORT2
uprv_delete_collIterate(collIterate *s);
/* @return s->pos == s->endp */
U_CAPI UBool U_EXPORT2
uprv_collIterateAtEnd(collIterate *s);
#ifdef XP_CPLUSPLUS
U_NAMESPACE_BEGIN
@ -326,7 +352,7 @@ typedef struct UCollationPCE UCollationPCE;
U_NAMESPACE_END
struct UCollationElements
struct UCollationElements : public UMemory
{
/**
* Struct wrapper for source data
@ -351,6 +377,8 @@ struct UCollationElements
U_CAPI void U_EXPORT2
uprv_init_pce(const struct UCollationElements *elems);
#endif
#define UCOL_LEVELTERMINATOR 1
/* mask off anything but primary order */
@ -1066,7 +1094,6 @@ static inline UBool ucol_unsafeCP(UChar c, const UCollator *coll) {
/* The offsetBuffer in collIterate might need to be freed to avoid memory leaks. */
void ucol_freeOffsetBuffer(collIterate *s);
#endif /* #if !UCONFIG_NO_COLLATION */
#endif

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -1108,7 +1108,7 @@ reset may be null.
handled.
*/
static UColToken *ucol_tok_initAReset(UColTokenParser *src, UChar *expand, uint32_t *expandNext,
static UColToken *ucol_tok_initAReset(UColTokenParser *src, const UChar *expand, uint32_t *expandNext,
UParseError *parseError, UErrorCode *status)
{
if(src->resultLen == src->listCapacity) {
@ -1200,9 +1200,12 @@ inline UColToken *getVirginBefore(UColTokenParser *src, UColToken *sourceToken,
uint32_t CE, SecondCE;
uint32_t invPos;
if(sourceToken != NULL) {
uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s);
uprv_init_collIterate(src->UCA, src->source+((sourceToken->source)&0xFFFFFF), 1, &s, status);
} else {
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s);
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset /**charsOffset*/, 1, &s, status);
}
if(U_FAILURE(*status)) {
return NULL;
}
baseCE = ucol_getNextCE(src->UCA, &s, status) & 0xFFFFFF3F;
@ -1684,10 +1687,10 @@ uint32_t ucol_tok_assembleTokenList(UColTokenParser *src, UParseError *parseErro
collIterate s;
uint32_t CE = UCOL_NOT_FOUND, SecondCE = UCOL_NOT_FOUND;
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s);
uprv_init_collIterate(src->UCA, src->source+src->parsedToken.charsOffset, src->parsedToken.charsLen, &s, status);
CE = ucol_getNextCE(src->UCA, &s, status);
UChar *expand = s.pos;
const UChar *expand = s.pos;
SecondCE = ucol_getNextCE(src->UCA, &s, status);
ListList[src->resultLen].baseCE = CE & 0xFFFFFF3F;

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************

View file

@ -1,6 +1,6 @@
/*
******************************************************************************
* Copyright (C) 2001-2009, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
******************************************************************************
*
@ -313,19 +313,16 @@ ucol_openElements(const UCollator *coll,
int32_t textLength,
UErrorCode *status)
{
UCollationElements *result;
if (U_FAILURE(*status)) {
return NULL;
}
result = (UCollationElements *)uprv_malloc(sizeof(UCollationElements));
/* test for NULL */
UCollationElements *result = new UCollationElements;
if (result == NULL) {
*status = U_MEMORY_ALLOCATION_ERROR;
return NULL;
}
result->reset_ = TRUE;
result->isWritable = FALSE;
result->pce = NULL;
@ -333,7 +330,7 @@ ucol_openElements(const UCollator *coll,
if (text == NULL) {
textLength = 0;
}
uprv_init_collIterate(coll, text, textLength, &result->iteratordata_);
uprv_init_collIterate(coll, text, textLength, &result->iteratordata_, status);
return result;
}
@ -345,30 +342,24 @@ ucol_closeElements(UCollationElements *elems)
if (elems != NULL) {
collIterate *ci = &elems->iteratordata_;
if (ci != NULL) {
if (ci->writableBuffer != ci->stackWritableBuffer) {
uprv_free(ci->writableBuffer);
}
if (ci->extendCEs) {
uprv_free(ci->extendCEs);
}
if (ci->extendCEs) {
uprv_free(ci->extendCEs);
}
if (ci->offsetBuffer) {
uprv_free(ci->offsetBuffer);
}
if (ci->offsetBuffer) {
uprv_free(ci->offsetBuffer);
}
if (elems->isWritable && elems->iteratordata_.string != NULL)
{
uprv_free(elems->iteratordata_.string);
uprv_free((UChar *)elems->iteratordata_.string);
}
if (elems->pce != NULL) {
delete elems->pce;
}
uprv_free(elems);
delete elems;
}
}
@ -387,11 +378,7 @@ ucol_reset(UCollationElements *elems)
ci->flags |= UCOL_ITER_NORM;
}
if (ci->stackWritableBuffer != ci->writableBuffer) {
uprv_free(ci->writableBuffer);
ci->writableBuffer = ci->stackWritableBuffer;
ci->writableBufSize = UCOL_WRITABLE_BUFFER_SIZE;
}
ci->writableBuffer.remove();
ci->fcdPosition = NULL;
//ci->offsetReturn = ci->offsetStore = NULL;
@ -686,7 +673,7 @@ ucol_setText( UCollationElements *elems,
if (elems->isWritable && elems->iteratordata_.string != NULL)
{
uprv_free(elems->iteratordata_.string);
uprv_free((UChar *)elems->iteratordata_.string);
}
if (text == NULL) {
@ -698,7 +685,7 @@ ucol_setText( UCollationElements *elems,
/* free offset buffer to avoid memory leak before initializing. */
ucol_freeOffsetBuffer(&(elems->iteratordata_));
uprv_init_collIterate(elems->iteratordata_.coll, text, textLength,
&elems->iteratordata_);
&elems->iteratordata_, status);
elems->reset_ = TRUE;
}

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2009 IBM and others. All rights reserved.
* Copyright (C) 2001-2010 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 07/02/2001 synwee Creation.
@ -14,12 +14,14 @@
#include "unicode/usearch.h"
#include "unicode/ustring.h"
#include "unicode/uchar.h"
#include "normalizer2impl.h"
#include "unormimp.h"
#include "ucol_imp.h"
#include "usrchimp.h"
#include "cmemory.h"
#include "ucln_in.h"
#include "uassert.h"
#include "ustr_imp.h"
U_NAMESPACE_USE
@ -311,7 +313,11 @@ inline uint16_t initializePatternCETable(UStringSearch *strsrch,
else {
uprv_init_collIterate(strsrch->collator, pattern->text,
pattern->textLength,
&coleiter->iteratordata_);
&coleiter->iteratordata_,
status);
}
if(U_FAILURE(*status)) {
return 0;
}
if (pattern->CE != cetable && pattern->CE) {
@ -381,7 +387,11 @@ inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
} else {
uprv_init_collIterate(strsrch->collator, pattern->text,
pattern->textLength,
&coleiter->iteratordata_);
&coleiter->iteratordata_,
status);
}
if(U_FAILURE(*status)) {
return 0;
}
if (pattern->PCE != pcetable && pattern->PCE != NULL) {
@ -1074,54 +1084,20 @@ static
inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
int32_t end)
{
UChar t2[32], p2[32];
int32_t length = end - start;
if (strsrch->strength != UCOL_IDENTICAL) {
return TRUE;
}
UErrorCode status = U_ZERO_ERROR, status2 = U_ZERO_ERROR;
int32_t decomplength = unorm_decompose(t2, LENGTHOF(t2),
strsrch->search->text + start, length,
FALSE, 0, &status);
// use separate status2 in case of buffer overflow
if (decomplength != unorm_decompose(p2, LENGTHOF(p2),
strsrch->pattern.text,
strsrch->pattern.textLength,
FALSE, 0, &status2)) {
return FALSE; // lengths are different
}
// compare contents
UChar *text, *pattern;
if(U_SUCCESS(status)) {
text = t2;
pattern = p2;
} else if(status==U_BUFFER_OVERFLOW_ERROR) {
status = U_ZERO_ERROR;
// allocate one buffer for both decompositions
text = (UChar *)uprv_malloc(decomplength * 2 * U_SIZEOF_UCHAR);
// Check for allocation failure.
if (text == NULL) {
return FALSE;
}
pattern = text + decomplength;
unorm_decompose(text, decomplength, strsrch->search->text + start,
length, FALSE, 0, &status);
unorm_decompose(pattern, decomplength, strsrch->pattern.text,
strsrch->pattern.textLength, FALSE, 0, &status);
} else {
// NFD failed, make sure that u_memcmp() does not overrun t2 & p2
// and that we don't uprv_free() an undefined text pointer
text = pattern = t2;
decomplength = 0;
}
UBool result = (UBool)(u_memcmp(pattern, text, decomplength) == 0);
if(text != t2) {
uprv_free(text);
}
// Note: We could use Normalizer::compare() or similar, but for short strings
// which may not be in FCD it might be faster to just NFD them.
UErrorCode status = U_ZERO_ERROR;
UnicodeString t2, p2;
strsrch->nfd->normalize(
UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
strsrch->nfd->normalize(
UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
// return FALSE if NFD failed
return U_SUCCESS(status) && result;
return U_SUCCESS(status) && t2 == p2;
}
#if BOYER_MOORE
@ -2724,6 +2700,8 @@ U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
UCOL_SHIFTED;
result->variableTop = ucol_getVariableTop(collator, status);
result->nfd = Normalizer2Factory::getNFDInstance(*status);
if (U_FAILURE(*status)) {
uprv_free(result);
return NULL;
@ -3040,7 +3018,8 @@ U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
uprv_init_collIterate(collator, strsrch->search->text,
strsrch->search->textLength,
&(strsrch->textIter->iteratordata_));
&(strsrch->textIter->iteratordata_),
status);
strsrch->utilIter->iteratordata_.coll = collator;
}
}
@ -3432,7 +3411,8 @@ U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
ucol_freeOffsetBuffer(&(strsrch->textIter->iteratordata_));
uprv_init_collIterate(strsrch->collator, strsrch->search->text,
strsrch->search->textLength,
&(strsrch->textIter->iteratordata_));
&(strsrch->textIter->iteratordata_),
&status);
strsrch->search->matchedLength = 0;
strsrch->search->matchedIndex = USEARCH_DONE;
strsrch->search->isOverlap = FALSE;

View file

@ -1,6 +1,6 @@
/*
**********************************************************************
* Copyright (C) 2001-2008 IBM and others. All rights reserved.
* Copyright (C) 2001-2010 IBM and others. All rights reserved.
**********************************************************************
* Date Name Description
* 08/13/2001 synwee Creation.
@ -13,6 +13,7 @@
#if !UCONFIG_NO_COLLATION
#include "unicode/normalizer2.h"
#include "unicode/ucol.h"
#include "unicode/ucoleitr.h"
#include "unicode/ubrk.h"
@ -59,6 +60,7 @@ struct UStringSearch {
struct USearch *search;
struct UPattern pattern;
const UCollator *collator;
const Normalizer2 *nfd;
// positions within the collation element iterator is used to determine
// if we are at the start of the text.
UCollationElements *textIter;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
@ -52,7 +52,6 @@
#include "calldata.h"
#include "cstring.h"
#include "cmemory.h"
#include "ucol_imp.h"
/* set to 1 to test offsets in backAndForth() */
#define TEST_OFFSETS 0
@ -148,13 +147,14 @@ static char* U_EXPORT2 sortKeyToString(const UCollator *coll, const uint8_t *sor
int32_t strength = UCOL_PRIMARY;
uint32_t res_size = 0;
UBool doneCase = FALSE;
UErrorCode errorCode = U_ZERO_ERROR;
char *current = buffer;
const uint8_t *currentSk = sortkey;
uprv_strcpy(current, "[");
while(strength <= UCOL_QUATERNARY && strength <= coll->strength) {
while(strength <= UCOL_QUATERNARY && strength <= ucol_getStrength(coll)) {
if(strength > UCOL_PRIMARY) {
uprv_strcat(current, " . ");
}
@ -162,20 +162,20 @@ static char* U_EXPORT2 sortKeyToString(const UCollator *coll, const uint8_t *sor
uprv_appendByteToHexString(current, *currentSk++);
uprv_strcat(current, " ");
}
if(coll->caseLevel == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
doneCase = TRUE;
} else if(coll->caseLevel == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
} else if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
strength ++;
}
if (*currentSk) {
uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
}
if(strength == UCOL_QUATERNARY && coll->alternateHandling == UCOL_NON_IGNORABLE) {
if(strength == UCOL_QUATERNARY && ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &errorCode) == UCOL_NON_IGNORABLE) {
break;
}
}
if(coll->strength == UCOL_IDENTICAL) {
if(ucol_getStrength(coll) == UCOL_IDENTICAL) {
uprv_strcat(current, " . ");
while(*currentSk != 0) {
uprv_appendByteToHexString(current, *currentSk++);
@ -214,7 +214,7 @@ UBool hasCollationElements(const char *locName) {
UErrorCode status = U_ZERO_ERROR;
UResourceBundle *loc = ures_open(U_ICUDATA_COLL, locName, &status);;
UResourceBundle *loc = ures_open(U_ICUDATA_NAME U_TREE_SEPARATOR_STRING "coll", locName, &status);;
if(U_SUCCESS(status)) {
status = U_ZERO_ERROR;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -994,11 +994,6 @@ static void TestSmallBuffer()
free(orders);
ucol_reset(testiter);
/* ensures that the writable buffer was cleared */
if (testiter->iteratordata_.writableBuffer !=
testiter->iteratordata_.stackWritableBuffer) {
log_err("Error Writable buffer in collation element iterator not reset\n");
}
/* ensures closing of elements done properly to clear writable buffer */
ucol_next(testiter, &status);

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 2001-2009, International Business Machines Corporation and
* Copyright (c) 2001-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
@ -1093,7 +1093,7 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
UColOptionSet opts;
UParseError parseError;
UChar *rulesCopy = NULL;
collIterate c;
collIterate *c = uprv_new_collIterate(status);
UCAConstants *consts = NULL;
uint32_t UCOL_RESET_TOP_VALUE, /*UCOL_RESET_TOP_CONT, */
UCOL_NEXT_TOP_VALUE, UCOL_NEXT_TOP_CONT;
@ -1102,12 +1102,15 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
if (U_FAILURE(*status)) {
log_err("Could not open root collator %s\n", u_errorName(*status));
uprv_delete_collIterate(c);
return;
}
colLoc = ucol_getLocaleByType(coll, ULOC_ACTUAL_LOCALE, status);
if (U_FAILURE(*status)) {
log_err("Could not get collator name: %s\n", u_errorName(*status));
ucol_close(UCA);
uprv_delete_collIterate(c);
return;
}
@ -1183,15 +1186,15 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
varT = (UBool)((specs & UCOL_TOK_VARIABLE_TOP) != 0);
top_ = (UBool)((specs & UCOL_TOK_TOP) != 0);
uprv_init_collIterate(coll, rulesCopy+chOffset, chLen, &c);
uprv_init_collIterate(coll, rulesCopy+chOffset, chLen, c, status);
currCE = ucol_getNextCE(coll, &c, status);
currCE = ucol_getNextCE(coll, c, status);
if(currCE == 0 && UCOL_ISTHAIPREVOWEL(*(rulesCopy+chOffset))) {
log_verbose("Thai prevowel detected. Will pick next CE\n");
currCE = ucol_getNextCE(coll, &c, status);
currCE = ucol_getNextCE(coll, c, status);
}
currContCE = ucol_getNextCE(coll, &c, status);
currContCE = ucol_getNextCE(coll, c, status);
if(!isContinuation(currContCE)) {
currContCE = 0;
}
@ -1272,6 +1275,7 @@ static void testCEs(UCollator *coll, UErrorCode *status) {
free(rulesCopy);
}
ucol_close(UCA);
uprv_delete_collIterate(c);
}
#if 0
@ -2992,10 +2996,11 @@ static void TestVariableTopSetting(void) {
uint32_t CE = UCOL_NO_MORE_CES;
/* before we start screaming, let's see if there is a problem with the rules */
collIterate s;
uprv_init_collIterate(coll, rulesCopy+oldChOffset, oldChLen, &s);
UErrorCode collIterateStatus = U_ZERO_ERROR;
collIterate *s = uprv_new_collIterate(&collIterateStatus);
uprv_init_collIterate(coll, rulesCopy+oldChOffset, oldChLen, s, &collIterateStatus);
CE = ucol_getNextCE(coll, &s, &status);
CE = ucol_getNextCE(coll, s, &status);
for(i = 0; i < oldChLen; i++) {
j = sprintf(buf, "%04X ", *(rulesCopy+oldChOffset+i));
@ -3004,7 +3009,7 @@ static void TestVariableTopSetting(void) {
if(status == U_PRIMARY_TOO_LONG_ERROR) {
log_verbose("= Expected failure for %s =", buffer);
} else {
if(s.pos == s.endp) {
if(uprv_collIterateAtEnd(s)) {
log_err("Unexpected failure setting variable top at offset %d. Error %s. Codepoints: %s\n",
oldChOffset, u_errorName(status), buffer);
} else {
@ -3012,6 +3017,7 @@ static void TestVariableTopSetting(void) {
buffer);
}
}
uprv_delete_collIterate(s);
}
varTop2 = ucol_getVariableTop(coll, &status);
if((varTop1 & 0xFFFF0000) != (varTop2 & 0xFFFF0000)) {

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/********************************************************************************
@ -1334,17 +1334,6 @@ TestNextPrevious() {
log_err("error unorm_next(U_MISPLACED_QUANTIFIER) %s\n", u_errorName(errorCode));
return;
}
/* missing pErrorCode */
buffer[0]=5;
iter.index=1;
length=unorm_next(&iter, buffer, sizeof(buffer)/U_SIZEOF_UCHAR,
UNORM_NFD, 0, TRUE, NULL,
NULL);
if(iter.index!=1 || buffer[0]!=5) {
log_err("error unorm_next(pErrorCode==NULL) %s\n", u_errorName(errorCode));
return;
}
}
static void

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
/*******************************************************************************
@ -22,6 +22,7 @@
#include "unicode/putil.h"
#include "unicode/ustring.h"
#include "unicode/uloc.h"
#include "unicode/unorm2.h"
#include "cintltst.h"
#include "putilimp.h"
@ -2942,6 +2943,7 @@ TestConsistency() {
UErrorCode errorCode;
#if !UCONFIG_NO_NORMALIZATION
const UNormalizer2 *norm2;
USerializedSet sset;
#endif
UChar32 start, end;
@ -3070,15 +3072,26 @@ TestConsistency() {
* In general, the set for the middle such character should be a subset
* of the set for the first.
*/
errorCode=U_ZERO_ERROR;
norm2=unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, &errorCode);
if(U_FAILURE(errorCode)) {
log_data_err("unorm2_getInstance(NFD) failed - %s\n", u_errorName(errorCode));
return;
}
set1=uset_open(1, 0);
set2=uset_open(1, 0);
if (unorm_getCanonStartSet(0x49, &sset)) {
UChar source[1];
_setAddSerialized(set1, &sset);
/* enumerate all characters that are plausible to be latin letters */
for(start=0xa0; start<0x2000; ++start) {
if(unorm_getDecomposition(start, FALSE, buffer16, LENGTHOF(buffer16))>1 && buffer16[0]==0x49) {
source[0]=(UChar)start;
length=unorm2_normalize(norm2, source, 1, buffer16, LENGTHOF(buffer16), &errorCode);
if(length>1 && buffer16[0]==0x49) {
uset_add(set2, start);
}
}

View file

@ -1,4 +1,4 @@
# Copyright (c) 2001-2009 International Business Machines
# Copyright (c) 2001-2010 International Business Machines
# Corporation and others. All Rights Reserved.
# common & i18n
bidi.h
@ -38,6 +38,7 @@ measfmt.h
measunit.h
measure.h
msgfmt.h
normalizer2.h
normlzr.h
numfmt.h
numsys.h

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -776,38 +776,10 @@ void BasicNormalizerTest::TestConcatenate() {
},
/* ### TODO: add more interesting cases */
{
"D",
"\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
"\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
"\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
"\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
"\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
"\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
"\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u1FDB",
"\\u1FE3\\u1FEB\\u1FEE\\u1FEF\\u1FF9\\u1FFB\\u1FFD\\u2000"
"\\u2001\\u2126\\u212A\\u212B\\u2329\\u232A\\uF900\\uFA10"
"\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25\\uFA26\\uFA2A\\uFB1F"
"\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E\\uFB2F\\uFB30\\uFB31"
"\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36\\uFB38\\uFB39\\uFB3A"
"\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41\\uFB43\\uFB44\\uFB46"
"\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B\\uFB4C\\uFB4D\\uFB4E",
"\\u0340\\u0341\\u0343\\u0344\\u0374\\u037E\\u0387\\u0958"
"\\u0959\\u095A\\u095B\\u095C\\u095D\\u095E\\u095F\\u09DC"
"\\u09DD\\u09DF\\u0A33\\u0A36\\u0A59\\u0A5A\\u0A5B\\u0A5E"
"\\u0B5C\\u0B5D\\u0F43\\u0F4D\\u0F52\\u0F57\\u0F5C\\u0F69"
"\\u0F73\\u0F75\\u0F76\\u0F78\\u0F81\\u0F93\\u0F9D\\u0FA2"
"\\u0FA7\\u0FAC\\u0FB9\\u1F71\\u1F73\\u1F75\\u1F77\\u1F79"
"\\u1F7B\\u1F7D\\u1FBB\\u1FBE\\u1FC9\\u1FCB\\u1FD3\\u0399"
"\\u0301\\u03C5\\u0308\\u0301\\u1FEB\\u1FEE\\u1FEF\\u1FF9"
"\\u1FFB\\u1FFD\\u2000\\u2001\\u2126\\u212A\\u212B\\u2329"
"\\u232A\\uF900\\uFA10\\uFA12\\uFA15\\uFA20\\uFA22\\uFA25"
"\\uFA26\\uFA2A\\uFB1F\\uFB2A\\uFB2B\\uFB2C\\uFB2D\\uFB2E"
"\\uFB2F\\uFB30\\uFB31\\uFB32\\uFB33\\uFB34\\uFB35\\uFB36"
"\\uFB38\\uFB39\\uFB3A\\uFB3B\\uFB3C\\uFB3E\\uFB40\\uFB41"
"\\uFB43\\uFB44\\uFB46\\uFB47\\uFB48\\uFB49\\uFB4A\\uFB4B"
"\\uFB4C\\uFB4D\\uFB4E"
"D",
"\\u03B1\\u0345",
"\\u0C4D\\U000110BA\\U0001D169",
"\\u03B1\\U0001D169\\U000110BA\\u0C4D\\u0345"
}
};
@ -1743,72 +1715,23 @@ U_CDECL_END
void
BasicNormalizerTest::TestSkippable() {
UnicodeSet starts, diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
UnicodeSet *startsPtr = &starts;
UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
UnicodeString s, pattern;
UChar32 start, limit, rangeStart, rangeEnd;
int32_t i, range, count;
UErrorCode status;
/* build NF*Skippable sets from runtime data */
status=U_ZERO_ERROR;
USetAdder sa = {
(USet *)startsPtr,
_set_add,
_set_addRange,
_set_addString,
NULL, // don't need remove()
NULL
};
unorm_addPropertyStarts(&sa, &status);
if(U_FAILURE(status)) {
errln("unable to load normalization data for unorm_addPropertyStarts(() - %s\n", u_errorName(status));
IcuTestErrorCode errorCode(*this, "TestSkippable");
skipSets[UNORM_NFD].applyPattern(UNICODE_STRING_SIMPLE("[:NFD_Inert:]"), errorCode);
skipSets[UNORM_NFKD].applyPattern(UNICODE_STRING_SIMPLE("[:NFKD_Inert:]"), errorCode);
skipSets[UNORM_NFC].applyPattern(UNICODE_STRING_SIMPLE("[:NFC_Inert:]"), errorCode);
skipSets[UNORM_NFKC].applyPattern(UNICODE_STRING_SIMPLE("[:NFKC_Inert:]"), errorCode);
if(errorCode.logIfFailureAndReset("UnicodeSet(NF..._Inert) failed")) {
return;
}
count=starts.getRangeCount();
start=limit=0;
rangeStart=rangeEnd=0;
range=0;
for(;;) {
if(start<limit) {
/* get properties for start and apply them to [start..limit[ */
if(unorm_isNFSkippable(start, UNORM_NFD)) {
skipSets[UNORM_NFD].add(start, limit-1);
}
if(unorm_isNFSkippable(start, UNORM_NFKD)) {
skipSets[UNORM_NFKD].add(start, limit-1);
}
if(unorm_isNFSkippable(start, UNORM_NFC)) {
skipSets[UNORM_NFC].add(start, limit-1);
}
if(unorm_isNFSkippable(start, UNORM_NFKC)) {
skipSets[UNORM_NFKC].add(start, limit-1);
}
}
/* go to next range of same properties */
start=limit;
if(++limit>rangeEnd) {
if(range<count) {
limit=rangeStart=starts.getRangeStart(range);
rangeEnd=starts.getRangeEnd(range);
++range;
} else if(range==count) {
/* additional range to complete the Unicode code space */
limit=rangeStart=rangeEnd=0x110000;
++range;
} else {
break;
}
}
}
/* get expected sets from hardcoded patterns */
initExpectedSkippables(expectSets);
for(i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
if(skipSets[i]!=expectSets[i]) {
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n"
"may need to update hardcoded UnicodeSet patterns in\n"

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2005, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -163,25 +163,6 @@ BasicNormalizerTest::TestNormalizerAPI() {
if(s.charAt(1)!=0xe4) {
errln("error in Normalizer::decompose(self)");
}
// test internal normalization exclusion options
// s contains a compatibility CJK character and a Hangul syllable
s=UnicodeString("a\\uFACE\\uD7A3b", -1, US_INV).unescape();
status=U_ZERO_ERROR;
Normalizer::decompose(s, FALSE, UNORM_NX_HANGUL, out, status);
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\u9F9C\\uD7A3b").unescape()) {
errln("Normalizer::decompose(UNORM_NX_HANGUL) failed - %s", u_errorName(status));
}
status=U_ZERO_ERROR;
Normalizer::decompose(s, FALSE, UNORM_NX_CJK_COMPAT, out, status);
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\uFACE\\u1112\\u1175\\u11c2b").unescape()) {
errln("Normalizer::decompose(UNORM_NX_CJK_COMPAT) failed - %s", u_errorName(status));
}
status=U_ZERO_ERROR;
Normalizer::decompose(s, FALSE, UNORM_NX_CJK_COMPAT|UNORM_NX_HANGUL, out, status);
if(U_FAILURE(status) || out!=UNICODE_STRING_SIMPLE("a\\uFACE\\uD7A3b").unescape()) {
errln("Normalizer::decompose(UNORM_NX_CJK_COMPAT|UNORM_NX_HANGUL) failed - %s", u_errorName(status));
}
}
#endif

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -15,6 +15,19 @@
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof(array[0]))
// Property names that this test ignores altogether.
// The UnicodeTest constructor registers each of them in unknownPropertyNames,
// so derivedPropsLineFn() finds them there and returns without reporting
// an "unknown property name" warning for them.
static const char *ignorePropNames[]={
"FC_NFKC",
"NFD_QC",
"NFC_QC",
"NFKD_QC",
"NFKC_QC",
"Expands_On_NFD",
"Expands_On_NFC",
"Expands_On_NFKD",
"Expands_On_NFKC",
"NFKC_CF"
};
UnicodeTest::UnicodeTest()
{
UErrorCode errorCode=U_ZERO_ERROR;
@ -23,6 +36,10 @@ UnicodeTest::UnicodeTest()
delete unknownPropertyNames;
unknownPropertyNames=NULL;
}
// Ignore some property names altogether.
for(int32_t i=0; i<LENGTHOF(ignorePropNames); ++i) {
unknownPropertyNames->puti(UnicodeString(ignorePropNames[i], -1, US_INV), 1, errorCode);
}
}
UnicodeTest::~UnicodeTest()
@ -76,7 +93,7 @@ getTokenIndex(const char *const tokens[], int32_t countTokens, const char *s) {
}
static const char *const
derivedCorePropsNames[]={
derivedPropsNames[]={
"Math",
"Alphabetic",
"Lowercase",
@ -86,6 +103,7 @@ derivedCorePropsNames[]={
"XID_Start",
"XID_Continue",
"Default_Ignorable_Code_Point",
"Full_Composition_Exclusion",
"Grapheme_Extend",
"Grapheme_Link", /* Unicode 5 moves this property here from PropList.txt */
"Grapheme_Base",
@ -95,11 +113,12 @@ derivedCorePropsNames[]={
"Changes_When_Uppercased",
"Changes_When_Titlecased",
"Changes_When_Casefolded",
"Changes_When_Casemapped"
"Changes_When_Casemapped",
"Changes_When_NFKC_Casefolded"
};
static const UProperty
derivedCorePropsIndex[]={
derivedPropsIndex[]={
UCHAR_MATH,
UCHAR_ALPHABETIC,
UCHAR_LOWERCASE,
@ -109,6 +128,7 @@ derivedCorePropsIndex[]={
UCHAR_XID_START,
UCHAR_XID_CONTINUE,
UCHAR_DEFAULT_IGNORABLE_CODE_POINT,
UCHAR_FULL_COMPOSITION_EXCLUSION,
UCHAR_GRAPHEME_EXTEND,
UCHAR_GRAPHEME_LINK,
UCHAR_GRAPHEME_BASE,
@ -118,17 +138,18 @@ derivedCorePropsIndex[]={
UCHAR_CHANGES_WHEN_UPPERCASED,
UCHAR_CHANGES_WHEN_TITLECASED,
UCHAR_CHANGES_WHEN_CASEFOLDED,
UCHAR_CHANGES_WHEN_CASEMAPPED
UCHAR_CHANGES_WHEN_CASEMAPPED,
UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED
};
static int32_t numErrors[LENGTHOF(derivedCorePropsIndex)]={ 0 };
static int32_t numErrors[LENGTHOF(derivedPropsIndex)]={ 0 };
enum { MAX_ERRORS=50 };
U_CFUNC void U_CALLCONV
derivedCorePropsLineFn(void *context,
char *fields[][2], int32_t /* fieldCount */,
UErrorCode *pErrorCode)
derivedPropsLineFn(void *context,
char *fields[][2], int32_t /* fieldCount */,
UErrorCode *pErrorCode)
{
UnicodeTest *me=(UnicodeTest *)context;
uint32_t start, end;
@ -136,35 +157,35 @@ derivedCorePropsLineFn(void *context,
u_parseCodePointRange(fields[0][0], &start, &end, pErrorCode);
if(U_FAILURE(*pErrorCode)) {
me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt field 0 at %s\n", fields[0][0]);
me->errln("UnicodeTest: syntax error in DerivedCoreProperties.txt or DerivedNormalizationProps.txt field 0 at %s\n", fields[0][0]);
return;
}
/* parse derived binary property name, ignore unknown names */
i=getTokenIndex(derivedCorePropsNames, LENGTHOF(derivedCorePropsNames), fields[1][0]);
i=getTokenIndex(derivedPropsNames, LENGTHOF(derivedPropsNames), fields[1][0]);
if(i<0) {
UnicodeString propName(fields[1][0], (int32_t)(fields[1][1]-fields[1][0]));
propName.trim();
if(me->unknownPropertyNames->find(propName)==NULL) {
UErrorCode errorCode=U_ZERO_ERROR;
me->unknownPropertyNames->puti(propName, 1, errorCode);
me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt\n", fields[1][0]);
me->errln("UnicodeTest warning: unknown property name '%s' in DerivedCoreProperties.txt or DerivedNormalizationProps.txt\n", fields[1][0]);
}
return;
}
me->derivedCoreProps[i].add(start, end);
me->derivedProps[i].add(start, end);
}
void UnicodeTest::TestAdditionalProperties() {
// test DerivedCoreProperties.txt
if(LENGTHOF(derivedCoreProps)<LENGTHOF(derivedCorePropsNames)) {
errln("error: UnicodeTest::derivedCoreProps[] too short, need at least %d UnicodeSets\n",
LENGTHOF(derivedCorePropsNames));
// test DerivedCoreProperties.txt and DerivedNormalizationProps.txt
if(LENGTHOF(derivedProps)<LENGTHOF(derivedPropsNames)) {
errln("error: UnicodeTest::derivedProps[] too short, need at least %d UnicodeSets\n",
LENGTHOF(derivedPropsNames));
return;
}
if(LENGTHOF(derivedCorePropsIndex)!=LENGTHOF(derivedCorePropsNames)) {
errln("error in ucdtest.cpp: LENGTHOF(derivedCorePropsIndex)!=LENGTHOF(derivedCorePropsNames)\n");
if(LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)) {
errln("error in ucdtest.cpp: LENGTHOF(derivedPropsIndex)!=LENGTHOF(derivedPropsNames)\n");
return;
}
@ -188,16 +209,25 @@ void UnicodeTest::TestAdditionalProperties() {
strcat(backupPath, U_FILE_SEP_STRING);
strcat(backupPath, "unidata" U_FILE_SEP_STRING "DerivedCoreProperties.txt");
u_parseDelimitedFile(newPath, ';', fields, 2, derivedCorePropsLineFn, this, &errorCode);
char *path=newPath;
u_parseDelimitedFile(newPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
if(errorCode==U_FILE_ACCESS_ERROR) {
errorCode=U_ZERO_ERROR;
u_parseDelimitedFile(backupPath, ';', fields, 2, derivedCorePropsLineFn, this, &errorCode);
path=backupPath;
u_parseDelimitedFile(backupPath, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
}
if(U_FAILURE(errorCode)) {
errln("error parsing DerivedCoreProperties.txt: %s\n", u_errorName(errorCode));
return;
}
char *basename=path+strlen(path)-strlen("DerivedCoreProperties.txt");
strcpy(basename, "DerivedNormalizationProps.txt");
u_parseDelimitedFile(path, ';', fields, 2, derivedPropsLineFn, this, &errorCode);
if(U_FAILURE(errorCode)) {
errln("error parsing DerivedNormalizationProps.txt: %s\n", u_errorName(errorCode));
return;
}
// now we have all derived core properties in the UnicodeSets
// run them all through the API
@ -206,14 +236,14 @@ void UnicodeTest::TestAdditionalProperties() {
UChar32 start, end;
// test all TRUE properties
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
rangeCount=derivedCoreProps[i].getRangeCount();
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
rangeCount=derivedProps[i].getRangeCount();
for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
start=derivedCoreProps[i].getRangeStart(range);
end=derivedCoreProps[i].getRangeEnd(range);
start=derivedProps[i].getRangeStart(range);
end=derivedProps[i].getRangeEnd(range);
for(; start<=end; ++start) {
if(!u_hasBinaryProperty(start, derivedCorePropsIndex[i])) {
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong\n", start, derivedCorePropsNames[i]);
if(!u_hasBinaryProperty(start, derivedPropsIndex[i])) {
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==FALSE is wrong\n", start, derivedPropsNames[i]);
if(++numErrors[i]>=MAX_ERRORS) {
errln("Too many errors, moving to the next test");
break;
@ -224,19 +254,19 @@ void UnicodeTest::TestAdditionalProperties() {
}
// invert all properties
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
derivedCoreProps[i].complement();
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
derivedProps[i].complement();
}
// test all FALSE properties
for(i=0; i<LENGTHOF(derivedCorePropsNames); ++i) {
rangeCount=derivedCoreProps[i].getRangeCount();
for(i=0; i<LENGTHOF(derivedPropsNames); ++i) {
rangeCount=derivedProps[i].getRangeCount();
for(range=0; range<rangeCount && numErrors[i]<MAX_ERRORS; ++range) {
start=derivedCoreProps[i].getRangeStart(range);
end=derivedCoreProps[i].getRangeEnd(range);
start=derivedProps[i].getRangeStart(range);
end=derivedProps[i].getRangeEnd(range);
for(; start<=end; ++start) {
if(u_hasBinaryProperty(start, derivedCorePropsIndex[i])) {
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedCorePropsNames[i]);
if(u_hasBinaryProperty(start, derivedPropsIndex[i])) {
errln("UnicodeTest error: u_hasBinaryProperty(U+%04lx, %s)==TRUE is wrong\n", start, derivedPropsNames[i]);
if(++numErrors[i]>=MAX_ERRORS) {
errln("Too many errors, moving to the next test");
break;

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -13,9 +13,9 @@ U_CFUNC void U_CALLCONV unicodeDataLineFn(void *context,
UErrorCode *pErrorCode);
U_CFUNC void U_CALLCONV
derivedCorePropsLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
derivedPropsLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
U_NAMESPACE_BEGIN
@ -43,11 +43,11 @@ private:
UErrorCode *pErrorCode);
friend void U_CALLCONV
derivedCorePropsLineFn(void *context,
derivedPropsLineFn(void *context,
char *fields[][2], int32_t fieldCount,
UErrorCode *pErrorCode);
UnicodeSet derivedCoreProps[30];
UnicodeSet derivedProps[30];
U_NAMESPACE_QUALIFIER Hashtable *unknownPropertyNames;
};

View file

@ -1,6 +1,6 @@
/*
********************************************************************************
* Copyright (C) 1999-2009 International Business Machines Corporation and
* Copyright (C) 1999-2010 International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************************
* Date Name Description
@ -709,6 +709,37 @@ void UnicodeSetTest::TestAPI() {
TEST_ASSERT((void *)constUSet == (void *)constSet);
const UnicodeSet *constSetx = UnicodeSet::fromUSet(constUSet);
TEST_ASSERT((void *)constSetx == (void *)constUSet);
// span(UnicodeString) and spanBack(UnicodeString) convenience methods
UnicodeString longString=UNICODE_STRING_SIMPLE("aaaaaaaaaabbbbbbbbbbcccccccccc");
UnicodeSet ac(0x61, 0x63);
ac.remove(0x62).freeze();
if( ac.span(longString, -5, USET_SPAN_CONTAINED)!=10 ||
ac.span(longString, 0, USET_SPAN_CONTAINED)!=10 ||
ac.span(longString, 5, USET_SPAN_CONTAINED)!=10 ||
ac.span(longString, 10, USET_SPAN_CONTAINED)!=10 ||
ac.span(longString, 15, USET_SPAN_CONTAINED)!=15 ||
ac.span(longString, 20, USET_SPAN_CONTAINED)!=30 ||
ac.span(longString, 25, USET_SPAN_CONTAINED)!=30 ||
ac.span(longString, 30, USET_SPAN_CONTAINED)!=30 ||
ac.span(longString, 35, USET_SPAN_CONTAINED)!=30 ||
ac.span(longString, INT32_MAX, USET_SPAN_CONTAINED)!=30
) {
errln("UnicodeSet.span(UnicodeString, ...) returns incorrect end indexes");
}
if( ac.spanBack(longString, -5, USET_SPAN_CONTAINED)!=0 ||
ac.spanBack(longString, 0, USET_SPAN_CONTAINED)!=0 ||
ac.spanBack(longString, 5, USET_SPAN_CONTAINED)!=0 ||
ac.spanBack(longString, 10, USET_SPAN_CONTAINED)!=0 ||
ac.spanBack(longString, 15, USET_SPAN_CONTAINED)!=15 ||
ac.spanBack(longString, 20, USET_SPAN_CONTAINED)!=20 ||
ac.spanBack(longString, 25, USET_SPAN_CONTAINED)!=20 ||
ac.spanBack(longString, 30, USET_SPAN_CONTAINED)!=20 ||
ac.spanBack(longString, 35, USET_SPAN_CONTAINED)!=20 ||
ac.spanBack(longString, INT32_MAX, USET_SPAN_CONTAINED)!=20
) {
errln("UnicodeSet.spanBack(UnicodeString, ...) returns incorrect start indexes");
}
}
void UnicodeSetTest::TestIteration() {

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -62,6 +62,7 @@ void UnicodeStringTest::runIndexedTest( int32_t index, UBool exec, const char* &
case 17: name = "TestNameSpace"; if (exec) TestNameSpace(); break;
case 18: name = "TestUTF32"; if (exec) TestUTF32(); break;
case 19: name = "TestUTF8"; if (exec) TestUTF8(); break;
case 20: name = "TestReadOnlyAlias"; if (exec) TestReadOnlyAlias(); break;
default: name = ""; break; //needed to end loop
}
@ -1120,6 +1121,30 @@ UnicodeStringTest::TestMiscellaneous()
if(test1.hasMetaData() || UnicodeString().hasMetaData()) {
errln("UnicodeString::hasMetaData() returns TRUE");
}
// test getTerminatedBuffer() on a truncated, shared, heap-allocated string
test1=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789.");
test1.truncate(36); // ensure length()<getCapacity()
test2=test1; // share the buffer
test1.truncate(5);
if(test1.length()!=5 || test1.getTerminatedBuffer()[5]!=0) {
errln("UnicodeString(shared buffer).truncate() failed");
}
if(test2.length()!=36 || test2[5]!=0x66 || u_strlen(test2.getTerminatedBuffer())!=36) {
errln("UnicodeString(shared buffer).truncate().getTerminatedBuffer() "
"modified another copy of the string!");
}
test1=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789.");
test1.truncate(36); // ensure length()<getCapacity()
test2=test1; // share the buffer
test1.remove();
if(test1.length()!=0 || test1.getTerminatedBuffer()[0]!=0) {
errln("UnicodeString(shared buffer).remove() failed");
}
if(test2.length()!=36 || test2[0]!=0x61 || u_strlen(test2.getTerminatedBuffer())!=36) {
errln("UnicodeString(shared buffer).remove().getTerminatedBuffer() "
"modified another copy of the string!");
}
}
void
@ -1873,3 +1898,108 @@ UnicodeStringTest::TestUTF8() {
}
#endif
}
// Test if this compiler supports Return Value Optimization of unnamed temporary objects.
// Returns, as an unnamed temporary, a UnicodeString constructed from (TRUE, uchars, -1);
// the same constructor form is exercised as the read-only-aliasing constructor
// in TestReadOnlyAlias(). The caller compares getBuffer() of the returned object
// with the input pointer: if they are equal, the alias survived the return (hasRVO).
// NOTE(review): length -1 presumably means "uchars is NUL-terminated" - confirm against unistr.h.
static UnicodeString wrapUChars(const UChar *uchars) {
return UnicodeString(TRUE, uchars, -1);
}
// Tests read-only aliasing UnicodeStrings: the aliasing constructor and setTo(),
// how truncate()/remove()/retainBetween() keep or drop the alias,
// that getTerminatedBuffer() never writes into the aliased array,
// and the tempSubString()/tempSubStringBetween() convenience methods.
void
UnicodeStringTest::TestReadOnlyAlias() {
UChar uchars[]={ 0x61, 0x62, 0 };
// Alias exactly the two characters before the NUL: all three buffer views must be uchars itself.
UnicodeString alias(TRUE, uchars, 2);
if(alias.length()!=2 || alias.getBuffer()!=uchars || alias.getTerminatedBuffer()!=uchars) {
errln("UnicodeString read-only-aliasing constructor does not behave as expected.");
return;
}
// Shortening to length 1 must keep pointing at uchars ...
alias.truncate(1);
if(alias.length()!=1 || alias.getBuffer()!=uchars) {
errln("UnicodeString(read-only-alias).truncate() did not preserve aliasing as expected.");
}
// ... but uchars[1] is 0x62, not NUL, so a terminated buffer has to be a separate copy;
// the original array must stay untouched.
if(alias.getTerminatedBuffer()==uchars) {
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
"did not allocate and copy as expected.");
}
if(uchars[1]!=0x62) {
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
"modified the original buffer.");
}
if(1!=u_strlen(alias.getTerminatedBuffer())) {
errln("UnicodeString(read-only-alias).truncate().getTerminatedBuffer() "
"does not return a buffer terminated at the proper length.");
}
// Same checks via setTo() instead of the constructor, then empty the string with remove().
alias.setTo(TRUE, uchars, 2);
if(alias.length()!=2 || alias.getBuffer()!=uchars || alias.getTerminatedBuffer()!=uchars) {
errln("UnicodeString read-only-aliasing setTo() does not behave as expected.");
return;
}
alias.remove();
if(alias.length()!=0) {
errln("UnicodeString(read-only-alias).remove() did not work.");
}
// After remove() the terminated buffer must not be uchars (uchars[0] is not NUL)
// and uchars[0] must not have been overwritten.
if(alias.getTerminatedBuffer()==uchars) {
errln("UnicodeString(read-only-alias).remove().getTerminatedBuffer() "
"did not un-alias as expected.");
}
if(uchars[0]!=0x61) {
errln("UnicodeString(read-only-alias).remove().getTerminatedBuffer() "
"modified the original buffer.");
}
if(0!=u_strlen(alias.getTerminatedBuffer())) {
errln("UnicodeString.setTo(read-only-alias).remove().getTerminatedBuffer() "
"does not return a buffer terminated at length 0.");
}
// Alias the 36-character longString (first setTo() argument is FALSE here) and check that
// removing a prefix, removing a suffix, or retaining a middle range only moves the
// start pointer / changes the length while still pointing into longString's buffer.
UnicodeString longString=UNICODE_STRING_SIMPLE("abcdefghijklmnopqrstuvwxyz0123456789");
alias.setTo(FALSE, longString.getBuffer(), longString.length());
alias.remove(0, 10);
if(longString.compare(10, INT32_MAX, alias)!=0 || alias.getBuffer()!=longString.getBuffer()+10) {
errln("UnicodeString.setTo(read-only-alias).remove(0, 10) did not preserve aliasing as expected.");
}
alias.setTo(FALSE, longString.getBuffer(), longString.length());
alias.remove(27, 99);
if(longString.compare(0, 27, alias)!=0 || alias.getBuffer()!=longString.getBuffer()) {
errln("UnicodeString.setTo(read-only-alias).remove(27, 99) did not preserve aliasing as expected.");
}
alias.setTo(FALSE, longString.getBuffer(), longString.length());
alias.retainBetween(6, 30);
if(longString.compare(6, 24, alias)!=0 || alias.getBuffer()!=longString.getBuffer()+6) {
errln("UnicodeString.setTo(read-only-alias).retainBetween(6, 30) did not preserve aliasing as expected.");
}
// Probe whether an aliasing temporary keeps its buffer pointer when returned by value.
// The pointer-identity parts of the checks below are only enforced when it does (hasRVO);
// the content comparisons are always enforced.
UChar abc[]={ 0x61, 0x62, 0x63, 0 };
UBool hasRVO= wrapUChars(abc).getBuffer()==abc;
UnicodeString temp;
temp.fastCopyFrom(longString.tempSubString());
if(temp!=longString || (hasRVO && temp.getBuffer()!=longString.getBuffer())) {
errln("UnicodeString.tempSubString() failed");
}
// A negative start is expected to behave like start 0: result equals the first 5 characters.
temp.fastCopyFrom(longString.tempSubString(-3, 5));
if(longString.compare(0, 5, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer())) {
errln("UnicodeString.tempSubString(-3, 5) failed");
}
temp.fastCopyFrom(longString.tempSubString(17));
if(longString.compare(17, INT32_MAX, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+17)) {
errln("UnicodeString.tempSubString(17) failed");
}
// A start beyond the end of the string is expected to yield an empty string.
temp.fastCopyFrom(longString.tempSubString(99));
if(!temp.isEmpty()) {
errln("UnicodeString.tempSubString(99) failed");
}
temp.fastCopyFrom(longString.tempSubStringBetween(6));
if(longString.compare(6, INT32_MAX, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+6)) {
errln("UnicodeString.tempSubStringBetween(6) failed");
}
temp.fastCopyFrom(longString.tempSubStringBetween(8, 18));
if(longString.compare(8, 10, temp)!=0 || (hasRVO && temp.getBuffer()!=longString.getBuffer()+8)) {
errln("UnicodeString.tempSubStringBetween(8, 18) failed");
}
// A substring of a bogus string is expected to be bogus as well.
UnicodeString bogusString;
bogusString.setToBogus();
temp.fastCopyFrom(bogusString.tempSubStringBetween(8, 18));
if(!temp.isBogus()) {
errln("UnicodeString.setToBogus().tempSubStringBetween(8, 18) failed");
}
}

View file

@ -1,6 +1,6 @@
/********************************************************************
* COPYRIGHT:
* Copyright (c) 1997-2009, International Business Machines Corporation and
* Copyright (c) 1997-2010, International Business Machines Corporation and
* others. All Rights Reserved.
********************************************************************/
@ -77,6 +77,7 @@ public:
void TestNameSpace(void);
void TestUTF32(void);
void TestUTF8(void);
void TestReadOnlyAlias(void);
};
class StringCaseTest: public IntlTest {

View file

@ -1,5 +1,5 @@
## Makefile.in for ICU tools
## Copyright (c) 1999-2009, International Business Machines Corporation and
## Copyright (c) 1999-2010, International Business Machines Corporation and
## others. All Rights Reserved.
## Source directory information
@ -15,7 +15,7 @@ subdir = tools
SUBDIRS = toolutil ctestfw makeconv genrb genuca genbrk genctd \
gennames genpname gencnval gensprep genccode gencmn icupkg pkgdata \
gentest genprops gencase genbidi gennorm gencfu
gentest genprops gencase genbidi gennorm gennorm2 gencfu
## List of phony targets
.PHONY : all all-local all-recursive install install-local \

View file

@ -389,25 +389,10 @@
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
<File
RelativePath=".\gennames.c"
>
<File
RelativePath=".\gennames.c"
>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl"
>
</Filter>
<Filter
Name="Resource Files"
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
>
</Filter>
</File>
</Files>
<Globals>
</Globals>

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2001-2005, International Business Machines
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -61,7 +61,8 @@ enum {
UNICODE_VERSION,
ICUDATADIR,
CSOURCE,
STORE_FLAGS
STORE_FLAGS,
WRITE_NORM2
};
static UOption options[]={
@ -74,7 +75,8 @@ static UOption options[]={
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG),
UOPTION_ICUDATADIR,
UOPTION_DEF("csource", 'C', UOPT_NO_ARG),
UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG)
UOPTION_DEF("prune", 'p', UOPT_REQUIRES_ARG),
UOPTION_DEF("write-norm2", '\1', UOPT_NO_ARG)
};
extern int
@ -140,6 +142,8 @@ main(int argc, char* argv[]) {
"\t to the source file basenames before opening;\n"
"\t 'gennorm new' will read UnicodeData-new.txt etc.\n",
u_getDataDirectory());
fprintf(stderr,
"\t--write-norm2 write nfc.txt and nfkc.txt files for gennorm2\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
@ -243,7 +247,7 @@ main(int argc, char* argv[]) {
/* prepare the filename beginning with the source dir */
uprv_strcpy(filename, srcDir);
basename=filename+uprv_strlen(filename);
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR) {
if(basename>filename && *(basename-1)!=U_FILE_SEP_CHAR && *(basename-1)!=U_FILE_ALT_SEP_CHAR) {
*basename++=U_FILE_SEP_CHAR;
}
@ -286,6 +290,10 @@ main(int argc, char* argv[]) {
/* process parsed data */
if(U_SUCCESS(errorCode)) {
if(options[WRITE_NORM2].doesOccur) {
writeNorm2(destDir);
}
processData();
/* write the properties data file */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2005, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -83,6 +83,9 @@ setCompositionExclusion(uint32_t code);
U_CFUNC void
setFNC(uint32_t c, UChar *s);
extern void
writeNorm2(const char *dataDir);
extern void
processData(void);

View file

@ -389,33 +389,18 @@
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="c;cpp;rc"
<File
RelativePath=".\gennorm.c"
>
<File
RelativePath=".\gennorm.c"
>
</File>
<File
RelativePath=".\store.c"
>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h"
</File>
<File
RelativePath=".\gennorm.h"
>
<File
RelativePath=".\gennorm.h"
>
</File>
</Filter>
<Filter
Name="Resource Files"
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
</File>
<File
RelativePath=".\store.c"
>
</Filter>
</File>
</Files>
<Globals>
</Globals>

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -106,11 +106,13 @@ static UToolMemory *normMem, *utf32Mem, *extraMem, *combiningTriplesMem;
static Norm *norms;
#if GENNORM_OBSOLETE
/*
* set a flag for each code point that was seen in decompositions -
* avoid to decompose ones that have not been used before
*/
static uint32_t haveSeenFlags[256];
#endif
/* set of characters with NFD_QC=No (i.e., those with canonical decompositions) */
static USet *nfdQCNoSet;
@ -192,8 +194,10 @@ init() {
/* allocate UTF-32 string memory */
utf32Mem=utm_open("gennorm UTF-32 strings", 30000, 30000, 4);
#if GENNORM_OBSOLETE
/* reset all "have seen" flags */
uprv_memset(haveSeenFlags, 0, sizeof(haveSeenFlags));
#endif
/* open an empty set */
nfdQCNoSet=uset_open(1, 0);
@ -289,6 +293,7 @@ enumTrie(EnumTrieFn *fn, void *context) {
return count;
}
#if GENNORM_OBSOLETE
static void
setHaveSeenString(const uint32_t *s, int32_t length) {
uint32_t c;
@ -301,6 +306,7 @@ setHaveSeenString(const uint32_t *s, int32_t length) {
}
#define HAVE_SEEN(c) (haveSeenFlags[((c)>>5)&0xff]&(1<<((c)&0x1f)))
#endif
/* handle combining data ---------------------------------------------------- */
@ -410,6 +416,7 @@ findCombiningCP(uint32_t code, UBool isLead) {
return 0xffff;
}
#if GENNORM_OBSOLETE
static void
addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
CombiningTriple *triple;
@ -434,6 +441,7 @@ addCombiningTriple(uint32_t lead, uint32_t trail, uint32_t combined) {
triple->trail=trail;
triple->combined=combined;
}
#endif
static int
compareTriples(const void *l, const void *r) {
@ -560,6 +568,7 @@ processCombining() {
/* processing incoming normalization data ----------------------------------- */
#if GENNORM_OBSOLETE
/*
* Decompose Hangul syllables algorithmically and fill a pseudo-Norm struct.
* c must be a Hangul syllable code point.
@ -594,6 +603,7 @@ getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3])
pHangulNorm->lenNFKD=length;
}
}
#endif
/*
* decompose the one decomposition further, may generate two decompositions
@ -601,6 +611,20 @@ getHangulDecomposition(uint32_t c, Norm *pHangulNorm, uint32_t hangulBuffer[3])
*/
static void
decompStoreNewNF(uint32_t code, Norm *norm) {
#if !GENNORM_OBSOLETE
/* always allocate the original string */
uint32_t *s32;
uint8_t length;
if((length=norm->lenNFD)!=0) {
s32=utm_allocN(utf32Mem, norm->lenNFD);
uprv_memcpy(s32, norm->nfd, norm->lenNFD*4);
norm->nfd=s32;
} else if((length=norm->lenNFKD)!=0) {
s32=utm_allocN(utf32Mem, norm->lenNFKD);
uprv_memcpy(s32, norm->nfkd, norm->lenNFKD*4);
norm->nfkd=s32;
}
#else
uint32_t nfd[40], nfkd[40], hangulBuffer[3];
Norm hangulNorm;
@ -695,8 +719,10 @@ decompStoreNewNF(uint32_t code, Norm *norm) {
norm->nfkd=s32;
setHaveSeenString(nfkd, lenNFKD);
}
#endif
}
#if GENNORM_OBSOLETE
typedef struct DecompSingle {
uint32_t c;
Norm *norm;
@ -800,6 +826,7 @@ decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
norm->nfkd=s32;
}
}
#endif
/*
* process the data for one code point listed in UnicodeData;
@ -807,7 +834,9 @@ decompWithSingleFn(void *context, uint32_t code, Norm *norm) {
*/
extern void
storeNorm(uint32_t code, Norm *norm) {
#if GENNORM_OBSOLETE
DecompSingle decompSingle;
#endif
Norm *p;
if(DO_NOT_STORE(UGENNORM_STORE_COMPAT)) {
@ -826,6 +855,7 @@ storeNorm(uint32_t code, Norm *norm) {
/* decompose this one decomposition further, may generate two decompositions */
decompStoreNewNF(code, norm);
#if GENNORM_OBSOLETE
/* has this code point been used in previous decompositions? */
if(HAVE_SEEN(code)) {
/* use this decomposition to decompose other decompositions further */
@ -833,6 +863,7 @@ storeNorm(uint32_t code, Norm *norm) {
decompSingle.norm=norm;
enumTrie(decompWithSingleFn, &decompSingle);
}
#endif
}
/* store the data */
@ -1815,6 +1846,144 @@ getFoldingAuxOffset(uint32_t data) {
#endif /* #if !UCONFIG_NO_NORMALIZATION */
/*
 * Write all non-zero Canonical_Combining_Class values to f, one line per
 * run of consecutive code points that share the same value:
 *   "XXXX:ccc" for a single code point, "XXXX..YYYY:ccc" for a range.
 * The loop runs one step past U+10FFFF (to 0x110000, treated as ccc=0)
 * so that a run reaching the end of the code space still gets flushed.
 */
static void
writeAllCC(FILE *f) {
    UChar32 runStart=0, c=0;
    uint8_t runCC=0;
    UBool inZeroBlock;

    fprintf(f, "# Canonical_Combining_Class (ccc) values\n");
    while(c<=0x110000) {
        uint8_t cc=0;

        if(c!=0x110000) {
            uint32_t index=utrie_get32(normTrie, c, &inZeroBlock);
            if(index!=0 && !inZeroBlock) {
                cc=norms[index].udataCC;
            }
        }

        if(cc!=runCC) {
            /* the value changed: flush the previous run if it was non-zero */
            if(runCC!=0) {
                uint32_t runEnd=c-1;
                if(runStart!=runEnd) {
                    fprintf(f, "%04lX..%04lX:%d\n",
                            (long)runStart, (long)runEnd, runCC);
                } else {
                    fprintf(f, "%04lX:%d\n", (long)runEnd, runCC);
                }
            }
            runStart=c;
            runCC=cc;
        }

        /* skip a whole trie data block at once when the lookup reported block zero */
        if(inZeroBlock) {
            c+=UTRIE_DATA_BLOCK_LENGTH;
        } else {
            c+=1;
        }
    }
}
static UBool
hasMapping(uint32_t code) {
Norm *norm=norms+utrie_get32(normTrie, code, NULL);
return norm->lenNFD!=0 || norm->lenNFKD!=0;
}
/*
 * Decide whether the decomposition mapping of code must be written as a
 * one-way mapping rather than a round-trip mapping.
 *
 * A canonical decomposition is one-way if
 * - it does not map to exactly two code points
 * - the code has ccc!=0
 * - the code has the Composition_Exclusion property
 * - its non-starter (second code point) decomposes
 * - its starter (first code point) has a one-way mapping; this is checked
 *   by following the chain of first code points in the loop below
 *
 * Without a canonical decomposition, the result is TRUE only if withCompat
 * is set and the code point has a compatibility decomposition.
 */
static UBool
hasOneWayMapping(uint32_t code, UBool withCompat) {
    const Norm *norm=norms+utrie_get32(normTrie, code, NULL);

    while(norm->lenNFD!=0) {
        if(norm->lenNFD!=2) {
            return TRUE;
        }
        if(norm->udataCC!=0) {
            return TRUE;
        }
        if((norm->combiningFlags&0x80)!=0) {
            return TRUE;
        }
        if(hasMapping(norm->nfd[1])) {
            return TRUE;
        }
        /* continue with the starter of this decomposition */
        norm=norms+utrie_get32(normTrie, norm->nfd[0], NULL);
    }

    return (UBool)(withCompat && norm->lenNFKD!=0);
}
/*
 * Write one line per code point that has a decomposition mapping:
 *   "XXXX=YYYY ZZZZ" for a round-trip mapping,
 *   "XXXX>YYYY ZZZZ" for a one-way mapping.
 * The canonical (NFD) mapping is preferred. The compatibility (NFKD)
 * mapping is written, always as one-way, only if withCompat is set and
 * there is no canonical mapping.
 */
static void
writeAllMappings(FILE *f, UBool withCompat) {
    uint32_t code=0;
    UBool inZeroBlock;

    if(!withCompat) {
        fprintf(f, "\n# Canonical decomposition mappings\n");
    } else {
        fprintf(f, "\n# Canonical and compatibility decomposition mappings\n");
    }

    while(code<=0x10ffff) {
        uint32_t index=utrie_get32(normTrie, code, &inZeroBlock);
        if(inZeroBlock) {
            /* nothing stored for this whole trie data block */
            code+=UTRIE_DATA_BLOCK_LENGTH;
            continue;
        }

        if(index!=0) {
            const Norm *norm=norms+index;
            const uint32_t *mapping=NULL;
            uint8_t mappingLength=0;
            char separator='>';

            if(norm->lenNFD!=0) {
                mapping=norm->nfd;
                mappingLength=norm->lenNFD;
                if(!hasOneWayMapping(code, withCompat)) {
                    separator='=';
                }
            } else if(withCompat && norm->lenNFKD!=0) {
                mapping=norm->nfkd;
                mappingLength=norm->lenNFKD;
            }

            if(mappingLength!=0) {
                uint8_t k;
                fprintf(f, "%04lX%c", (long)code, separator);
                for(k=0; k<mappingLength; ++k) {
                    if(k>0) {
                        fputc(' ', f);
                    }
                    fprintf(f, "%04lX", (long)mapping[k]);
                }
                fputc('\n', f);
            }
        }
        ++code;
    }
}
/*
 * Create the text file path/filename and fill it with the ccc values
 * followed by the decomposition mappings (including compatibility
 * mappings if withCompat). Exits with U_FILE_ACCESS_ERROR if the file
 * cannot be created.
 */
static void
writeNorm2TextFile(const char *path, const char *filename, UBool withCompat) {
    FILE *out=usrc_createTextData(path, filename);
    if(out!=NULL) {
        writeAllCC(out);
        writeAllMappings(out, withCompat);
        fclose(out);
    } else {
        exit(U_FILE_ACCESS_ERROR);
    }
}
extern void
writeNorm2(const char *dataDir) {
writeNorm2TextFile(dataDir, "nfc.txt", FALSE);
writeNorm2TextFile(dataDir, "nfkc.txt", TRUE);
}
extern void
generateData(const char *dataDir, UBool csource) {
static uint8_t normTrieBlock[100000], fcdTrieBlock[100000], auxTrieBlock[100000];

View file

@ -0,0 +1,82 @@
## Makefile.in for ICU - tools/gennorm2
## Copyright (c) 2009-2010, International Business Machines Corporation and
## others. All Rights Reserved.
## Steven R. Loomis/Markus W. Scherer
## Source directory information
srcdir = @srcdir@
top_srcdir = @top_srcdir@
top_builddir = ../..
include $(top_builddir)/icudefs.mk
## Build directory information
subdir = tools/gennorm2
TARGET_STUB_NAME = gennorm2
## Extra files to remove for 'make clean'
CLEANFILES = *~ $(DEPS)
## Target information
TARGET = $(BINDIR)/$(TARGET_STUB_NAME)$(EXEEXT)
ifneq ($(top_builddir),$(top_srcdir))
CPPFLAGS += -I$(top_builddir)/common
endif
CPPFLAGS += -I$(top_srcdir)/common -I$(srcdir)/../toolutil
LIBS = $(LIBICUTOOLUTIL) $(LIBICUI18N) $(LIBICUUC) $(DEFAULT_LIBS) $(LIB_M)
OBJECTS = gennorm2.o n2builder.o
DEPS = $(OBJECTS:.o=.d)
## List of phony targets
.PHONY : all all-local install install-local clean clean-local \
distclean distclean-local dist dist-local check check-local install-man
## Clear suffix list
.SUFFIXES :
## List of standard targets
all: all-local
install: install-local
clean: clean-local
distclean : distclean-local
dist: dist-local
check: all check-local
all-local: $(TARGET)
install-local: all-local
# $(MKINSTALLDIRS) $(DESTDIR)$(sbindir)
# $(INSTALL) $(TARGET) $(DESTDIR)$(sbindir)
dist-local:
clean-local:
test -z "$(CLEANFILES)" || $(RMV) $(CLEANFILES)
$(RMV) $(TARGET) $(OBJECTS)
distclean-local: clean-local
$(RMV) Makefile
check-local: all-local
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
cd $(top_builddir) \
&& CONFIG_FILES=$(subdir)/$@ CONFIG_HEADERS= $(SHELL) ./config.status
$(TARGET) : $(OBJECTS)
$(LINK.cc) $(OUTOPT)$@ $^ $(LIBS)
$(POST_BUILD_STEP)
## Pull in the generated dependency files ($(DEPS)), except when every
## requested goal ends in "clean" (e.g. clean, distclean): in that case the
## patsubst below strips all goals, leaving nothing, and the include is skipped.
## A plain "make" with no goals always includes them.
## The leading "-" makes the include non-fatal if the .d files do not exist yet.
ifeq (,$(MAKECMDGOALS))
-include $(DEPS)
else
ifneq ($(patsubst %clean,,$(MAKECMDGOALS)),)
-include $(DEPS)
endif
endif

View file

@ -0,0 +1,258 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: gennorm2.cpp
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov25
* created by: Markus W. Scherer
*
* This program reads text files that define Unicode normalization,
* parses them, and builds a binary data file.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include "unicode/utypes.h"
#include "unicode/errorcode.h"
#include "unicode/localpointer.h"
#include "unicode/putil.h"
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#include "n2builder.h"
#include "normalizer2impl.h"
#include "toolutil.h"
#include "uoptions.h"
#include "uparse.h"
#if UCONFIG_NO_NORMALIZATION
#include "unewdata.h"
#endif
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
U_NAMESPACE_BEGIN
UBool beVerbose=FALSE, haveCopyright=TRUE;
U_DEFINE_LOCAL_OPEN_POINTER(LocalStdioFilePointer, FILE, fclose);
#if !UCONFIG_NO_NORMALIZATION
void parseFile(FILE *f, Normalizer2DataBuilder &builder);
#endif
/* -------------------------------------------------------------------------- */
// Indexes into options[] below.
// The enumerator order must stay in sync with the initializer order of
// options[], because main() looks options up as options[ENUMERATOR].
enum {
HELP_H,
HELP_QUESTION_MARK,
VERBOSE,
COPYRIGHT,
SOURCEDIR,
OUTPUT_FILENAME,
UNICODE_VERSION
};
// Command-line option table passed to u_parseArgs() in main().
// The last two entries (-o/--output and -u/--unicode) each require an argument.
static UOption options[]={
UOPTION_HELP_H,
UOPTION_HELP_QUESTION_MARK,
UOPTION_VERBOSE,
UOPTION_COPYRIGHT,
UOPTION_SOURCEDIR,
UOPTION_DEF("output", 'o', UOPT_REQUIRES_ARG),
UOPTION_DEF("unicode", 'u', UOPT_REQUIRES_ARG)
};
/*
 * gennorm2 entry point.
 *
 * Parses the command line, then (unless UCONFIG_NO_NORMALIZATION is set)
 * feeds every remaining argument, taken as an input file name relative to
 * the source directory, through parseFile() into one Normalizer2DataBuilder
 * and finally writes the binary output file.
 *
 * Returns U_ILLEGAL_ARGUMENT_ERROR after a command line error, U_ZERO_ERROR
 * after printing help on request, U_UNSUPPORTED_ERROR for the dummy-file
 * build, and otherwise the value of the local errorCode.
 * Unreadable input files terminate the process via exit(U_FILE_ACCESS_ERROR).
 */
extern "C" int
main(int argc, char* argv[]) {
U_MAIN_INIT_ARGS(argc, argv);
/* preset then read command line options */
options[SOURCEDIR].value="";
options[UNICODE_VERSION].value=U_UNICODE_VERSION;
// From here on argc is the u_parseArgs() result: it is treated as negative
// on error (with -argc indexing the offending argument), otherwise as the
// count of remaining arguments including argv[0].
argc=u_parseArgs(argc, argv, sizeof(options)/sizeof(options[HELP_H]), options);
/* error handling, printing usage message */
if(argc<0) {
fprintf(stderr,
"error in command line argument \"%s\"\n",
argv[-argc]);
}
// The output file name is mandatory: force the usage/error path below.
if(!options[OUTPUT_FILENAME].doesOccur) {
argc=-1;
}
// argc<2 covers both an error (negative) and "no input files given".
if( argc<2 ||
options[HELP_H].doesOccur || options[HELP_QUESTION_MARK].doesOccur
) {
/*
* Broken into chunks because the C89 standard says the minimum
* required supported string length is 509 bytes.
*/
fprintf(stderr,
"Usage: %s [-options] infiles+ -o outputfilename\n"
"\n"
"Reads the infiles with normalization data and\n"
"creates a binary file (outputfilename) with the data.\n"
"\n",
argv[0]);
fprintf(stderr,
"Options:\n"
"\t-h or -? or --help this usage text\n"
"\t-v or --verbose verbose output\n"
"\t-c or --copyright include a copyright notice\n"
"\t-u or --unicode Unicode version, followed by the version like 5.2.0\n");
fprintf(stderr,
"\t-s or --sourcedir source directory, followed by the path\n"
"\t-o or --output output filename\n");
return argc<0 ? U_ILLEGAL_ARGUMENT_ERROR : U_ZERO_ERROR;
}
beVerbose=options[VERBOSE].doesOccur;
haveCopyright=options[COPYRIGHT].doesOccur;
// IcuToolErrorCode's destructor reports a failure and exits,
// so a failure left in errorCode cannot go unnoticed.
IcuToolErrorCode errorCode("gennorm2/main()");
#if UCONFIG_NO_NORMALIZATION
fprintf(stderr,
"gennorm2 writes a dummy binary data file "
"because UCONFIG_NO_NORMALIZATION is set, \n"
"see icu/source/common/unicode/uconfig.h\n");
udata_createDummy(NULL, NULL, options[OUTPUT_FILENAME].value, errorCode);
return U_UNSUPPORTED_ERROR;
#else
LocalPointer<Normalizer2DataBuilder> builder(new Normalizer2DataBuilder(errorCode));
errorCode.assertSuccess();
builder->setUnicodeVersion(options[UNICODE_VERSION].value);
// prepare the filename beginning with the source dir
std::string filename(options[SOURCEDIR].value);
int32_t pathLength=filename.length();
// Append a path separator unless the source dir is empty or already ends with one.
if( pathLength>0 &&
filename[pathLength-1]!=U_FILE_SEP_CHAR &&
filename[pathLength-1]!=U_FILE_ALT_SEP_CHAR
) {
filename.push_back(U_FILE_SEP_CHAR);
pathLength=filename.length();
}
// Process the input files in command line order, all into the same builder.
for(int i=1; i<argc; ++i) {
printf("gennorm2: processing %s\n", argv[i]);
filename.append(argv[i]);
// f is closed automatically when it goes out of scope at the end of each iteration.
LocalStdioFilePointer f(fopen(filename.c_str(), "r"));
if(f==NULL) {
fprintf(stderr, "gennorm2 error: unable to open %s\n", filename.c_str());
exit(U_FILE_ACCESS_ERROR);
}
// NOTE(review): the override mode is re-set before every file; presumably this lets
// a later file override data from earlier files - confirm in n2builder.cpp.
builder->setOverrideHandling(Normalizer2DataBuilder::OVERRIDE_PREVIOUS);
parseFile(f.getAlias(), *builder);
// Cut the file name off again, keeping only the source dir prefix.
filename.erase(pathLength);
}
builder->writeBinaryFile(options[OUTPUT_FILENAME].value);
return errorCode.get();
#endif
}
#if !UCONFIG_NO_NORMALIZATION
void parseFile(FILE *f, Normalizer2DataBuilder &builder) {
IcuToolErrorCode errorCode("gennorm2/parseFile()");
char line[300];
uint32_t startCP, endCP;
while(NULL!=fgets(line, (int)sizeof(line), f)) {
char *comment=(char *)strchr(line, '#');
if(comment!=NULL) {
*comment=0;
}
u_rtrim(line);
if(line[0]==0) {
continue; // skip empty and comment-only lines
}
if(line[0]=='*') {
continue; // reserved syntax
}
const char *delimiter;
int32_t rangeLength=
u_parseCodePointRangeAnyTerminator(line, &startCP, &endCP, &delimiter, errorCode);
if(errorCode.isFailure()) {
fprintf(stderr, "gennorm2 error: parsing code point range from %s\n", line);
exit(errorCode.reset());
}
delimiter=u_skipWhitespace(delimiter);
if(*delimiter==':') {
const char *s=u_skipWhitespace(delimiter+1);
char *end;
unsigned long value=strtoul(s, &end, 10);
if(end<=s || *u_skipWhitespace(end)!=0 || value>=0xff) {
fprintf(stderr, "gennorm2 error: parsing ccc from %s\n", line);
exit(U_PARSE_ERROR);
}
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
builder.setCC(c, (uint8_t)value);
}
continue;
}
if(*delimiter=='-') {
if(*u_skipWhitespace(delimiter+1)!=0) {
fprintf(stderr, "gennorm2 error: parsing remove-mapping %s\n", line);
exit(U_PARSE_ERROR);
}
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
builder.removeMapping(c);
}
continue;
}
if(*delimiter=='=' || *delimiter=='>') {
UChar uchars[Normalizer2Impl::MAPPING_LENGTH_MASK];
int32_t length=u_parseString(delimiter+1, uchars, LENGTHOF(uchars), NULL, errorCode);
if(errorCode.isFailure()) {
fprintf(stderr, "gennorm2 error: parsing mapping string from %s\n", line);
exit(errorCode.reset());
}
UnicodeString mapping(FALSE, uchars, length);
if(*delimiter=='=') {
if(rangeLength!=1) {
fprintf(stderr,
"gennorm2 error: round-trip mapping for more than 1 code point on %s\n",
line);
exit(U_PARSE_ERROR);
}
builder.setRoundTripMapping((UChar32)startCP, mapping);
} else {
for(UChar32 c=(UChar32)startCP; c<=(UChar32)endCP; ++c) {
builder.setOneWayMapping(c, mapping);
}
}
continue;
}
fprintf(stderr, "gennorm2 error: unrecognized data line %s\n", line);
exit(U_PARSE_ERROR);
}
}
#endif // !UCONFIG_NO_NORMALIZATION
U_NAMESPACE_END
/*
* Hey, Emacs, please set the following:
*
* Local Variables:
* indent-tabs-mode: nil
* End:
*
*/

View file

@ -0,0 +1,409 @@
<?xml version="1.0" encoding="Windows-1252"?>
<VisualStudioProject
ProjectType="Visual C++"
Version="9.00"
Name="gennorm2"
ProjectGUID="{C7891A65-80AB-4245-912E-5F1E17B0E6C4}"
RootNamespace="gennorm2"
Keyword="Win32Proj"
TargetFrameworkVersion="196613"
>
<Platforms>
<Platform
Name="Win32"
/>
<Platform
Name="x64"
/>
</Platforms>
<ToolFiles>
</ToolFiles>
<Configurations>
<Configuration
Name="Release|Win32"
OutputDirectory=".\x86\Release"
IntermediateDirectory=".\x86\Release"
ConfigurationType="1"
CharacterSet="1"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin&#x0D;&#x0A;"
Outputs="..\..\..\bin\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories="..\..\common;..\toolutil"
PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
Optimization="2"
EnableIntrinsicFunctions="true"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x86\Release\gennorm2.pch"
AssemblerListingLocation=".\x86\Release\"
ObjectFile=".\x86\Release\"
ProgramDataBaseFileName=".\x86\Release\"
WarningLevel="3"
DebugInformationFormat="3"
SuppressStartupBanner="true"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x86\Release\gennorm2.exe"
LinkIncremental="1"
SuppressStartupBanner="true"
ProgramDatabaseFile=".\x86\Release\gennorm2.pdb"
GenerateDebugInformation="true"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
TargetMachine="1"
RandomizedBaseAddress="1"
DataExecutionPrevention="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Debug|Win32"
OutputDirectory=".\x86\Debug"
IntermediateDirectory=".\x86\Debug"
ConfigurationType="1"
CharacterSet="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin&#x0D;&#x0A;"
Outputs="..\..\..\bin\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\common;..\toolutil"
PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
EnableIntrinsicFunctions="true"
MinimalRebuild="true"
BasicRuntimeChecks="3"
StringPooling="true"
RuntimeLibrary="3"
BufferSecurityCheck="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x86\Debug\gennorm2.pch"
AssemblerListingLocation=".\x86\Debug\"
ObjectFile=".\x86\Debug\"
ProgramDataBaseFileName=".\x86\Debug\"
BrowseInformation="1"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="4"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x86\Debug\gennorm2.exe"
LinkIncremental="2"
SuppressStartupBanner="true"
GenerateDebugInformation="true"
ProgramDatabaseFile=".\x86\Debug\gennorm2.pdb"
SubSystem="1"
TargetMachine="1"
RandomizedBaseAddress="1"
DataExecutionPrevention="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Release|x64"
OutputDirectory=".\x64\Release"
IntermediateDirectory=".\x64\Release"
ConfigurationType="1"
CharacterSet="1"
WholeProgramOptimization="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
Outputs="..\..\..\bin64\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
AdditionalIncludeDirectories="..\..\common;..\toolutil"
PreprocessorDefinitions="WIN64;WIN32;NDEBUG;_CRT_SECURE_NO_DEPRECATE"
Optimization="2"
EnableIntrinsicFunctions="true"
StringPooling="true"
RuntimeLibrary="2"
EnableFunctionLevelLinking="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x64\Release\gennorm2.pch"
AssemblerListingLocation=".\x64\Release\"
ObjectFile=".\x64\Release\"
ProgramDataBaseFileName=".\x64\Release\"
WarningLevel="3"
DebugInformationFormat="3"
SuppressStartupBanner="true"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="NDEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x64\Release\gennorm2.exe"
LinkIncremental="1"
SuppressStartupBanner="true"
ProgramDatabaseFile=".\x64\Release\gennorm2.pdb"
GenerateDebugInformation="true"
SubSystem="1"
OptimizeReferences="2"
EnableCOMDATFolding="2"
TargetMachine="17"
RandomizedBaseAddress="1"
DataExecutionPrevention="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
<Configuration
Name="Debug|x64"
OutputDirectory=".\x64\Debug"
IntermediateDirectory=".\x64\Debug"
ConfigurationType="1"
CharacterSet="1"
>
<Tool
Name="VCPreBuildEventTool"
/>
<Tool
Name="VCCustomBuildTool"
CommandLine="copy &quot;$(TargetPath)&quot; ..\..\..\bin64&#x0D;&#x0A;"
Outputs="..\..\..\bin64\$(TargetFileName)"
/>
<Tool
Name="VCXMLDataGeneratorTool"
/>
<Tool
Name="VCWebServiceProxyGeneratorTool"
/>
<Tool
Name="VCMIDLTool"
/>
<Tool
Name="VCCLCompilerTool"
Optimization="0"
AdditionalIncludeDirectories="..\..\common;..\toolutil"
PreprocessorDefinitions="WIN64;WIN32;_DEBUG;_CRT_SECURE_NO_DEPRECATE"
EnableIntrinsicFunctions="true"
MinimalRebuild="true"
BasicRuntimeChecks="3"
StringPooling="true"
RuntimeLibrary="3"
BufferSecurityCheck="true"
DisableLanguageExtensions="true"
TreatWChar_tAsBuiltInType="true"
PrecompiledHeaderFile=".\x64\Debug\gennorm2.pch"
AssemblerListingLocation=".\x64\Debug\"
ObjectFile=".\x64\Debug\"
ProgramDataBaseFileName=".\x64\Debug\"
BrowseInformation="1"
WarningLevel="3"
SuppressStartupBanner="true"
DebugInformationFormat="4"
CompileAs="0"
/>
<Tool
Name="VCManagedResourceCompilerTool"
/>
<Tool
Name="VCResourceCompilerTool"
PreprocessorDefinitions="_DEBUG"
Culture="1033"
/>
<Tool
Name="VCPreLinkEventTool"
/>
<Tool
Name="VCLinkerTool"
OutputFile=".\x64\Debug\gennorm2.exe"
LinkIncremental="2"
SuppressStartupBanner="true"
GenerateDebugInformation="true"
ProgramDatabaseFile=".\x64\Debug\gennorm2.pdb"
SubSystem="1"
TargetMachine="17"
RandomizedBaseAddress="1"
DataExecutionPrevention="1"
/>
<Tool
Name="VCALinkTool"
/>
<Tool
Name="VCManifestTool"
/>
<Tool
Name="VCXDCMakeTool"
/>
<Tool
Name="VCBscMakeTool"
/>
<Tool
Name="VCFxCopTool"
/>
<Tool
Name="VCAppVerifierTool"
/>
<Tool
Name="VCPostBuildEventTool"
/>
</Configuration>
</Configurations>
<References>
</References>
<Files>
<File
RelativePath=".\gennorm2.cpp"
>
</File>
<File
RelativePath=".\n2builder.cpp"
>
</File>
<File
RelativePath=".\n2builder.h"
>
</File>
</Files>
<Globals>
</Globals>
</VisualStudioProject>

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,113 @@
/*
*******************************************************************************
*
* Copyright (C) 2009-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: n2builder.h
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2009nov25
* created by: Markus W. Scherer
*/
#ifndef __N2BUILDER_H__
#define __N2BUILDER_H__
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/errorcode.h"
#include "unicode/unistr.h"
#include "normalizer2impl.h" // for IX_COUNT
#include "toolutil.h"
#include "utrie2.h"
U_NAMESPACE_BEGIN
extern UBool beVerbose, haveCopyright;
struct Norm;
class BuilderReorderingBuffer;
class ExtraDataWriter;
// Collects normalization data (combining classes and mappings) per code point
// and writes it out as a binary data file.
// gennorm2's parseFile() drives the setters; main() then calls writeBinaryFile().
// NOTE(review): the implementation (n2builder.cpp) is not visible here; comments below
// describe only what the declarations and the gennorm2.cpp callers show.
class Normalizer2DataBuilder {
public:
Normalizer2DataBuilder(UErrorCode &errorCode);
~Normalizer2DataBuilder();
// Selects how the builder treats data that is set again for a code point.
// NOTE(review): exact semantics of each mode are defined in n2builder.cpp - confirm there.
enum OverrideHandling {
OVERRIDE_NONE,
OVERRIDE_ANY,
OVERRIDE_PREVIOUS
};
void setOverrideHandling(OverrideHandling oh);
// Data setters, one code point at a time.
// gennorm2 calls setCC() for "range:ccc" lines, setOneWayMapping() for "range>mapping",
// setRoundTripMapping() for "codepoint=mapping" and removeMapping() for "range-".
void setCC(UChar32 c, uint8_t cc);
void setOneWayMapping(UChar32 c, const UnicodeString &m);
void setRoundTripMapping(UChar32 c, const UnicodeString &m);
void removeMapping(UChar32 c);
// v is a version string; gennorm2 passes its --unicode option value (e.g. "5.2.0").
void setUnicodeVersion(const char *v);
// gennorm2 passes its --output option value as the filename.
void writeBinaryFile(const char *filename);
private:
// Helper classes that need access to the private data.
friend class CompositionBuilder;
friend class Decomposer;
friend class ExtraDataWriter;
friend class Norm16Writer;
// No copy constructor nor assignment operator.
Normalizer2DataBuilder(const Normalizer2DataBuilder &other);
Normalizer2DataBuilder &operator=(const Normalizer2DataBuilder &other);
// Access to the per-code point Norm records.
Norm *allocNorm();
Norm *getNorm(UChar32 c);
Norm *createNorm(UChar32 c);
Norm *checkNormForMapping(Norm *p, UChar32 c); // check for permitted overrides
const Norm &getNormRef(UChar32 c) const;
uint8_t getCC(UChar32 c) const;
UBool combinesWithCCBetween(const Norm &norm, uint8_t lowCC, uint8_t highCC) const;
UChar32 combine(const Norm &norm, UChar32 trail) const;
// NOTE(review): the (start, end, value) signatures below look like range-enumeration
// callbacks over a trie - confirm against their callers in n2builder.cpp.
void addComposition(UChar32 start, UChar32 end, uint32_t value);
UBool decompose(UChar32 start, UChar32 end, uint32_t value);
void reorder(Norm *p, BuilderReorderingBuffer &buffer);
UBool hasNoCompBoundaryAfter(BuilderReorderingBuffer &buffer);
void setHangulData();
void writeMapping(UChar32 c, const Norm *p, UnicodeString &dataString);
void writeCompositions(UChar32 c, const Norm *p, UnicodeString &dataString);
void writeExtraData(UChar32 c, uint32_t value, ExtraDataWriter &writer);
// Computed from the IX_MIN_MAYBE_YES index value and Normalizer2Impl::MAX_DELTA;
// only meaningful once indexes[IX_MIN_MAYBE_YES] has been set.
int32_t getCenterNoNoDelta() {
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
}
void writeNorm16(UChar32 start, UChar32 end, uint32_t value);
void processData();
// Builder state.
// NOTE(review): presumably normTrie maps code points to entries of norms, whose storage
// is managed by normMem - confirm in n2builder.cpp.
UTrie2 *normTrie;
UToolMemory *normMem;
Norm *norms;
int32_t phase;
OverrideHandling overrideHandling;
// Output data.
int32_t indexes[Normalizer2Impl::IX_COUNT];
UTrie2 *norm16Trie;
UnicodeString extraData;
UVersionInfo unicodeVersion;
};
U_NAMESPACE_END
#endif // #if !UCONFIG_NO_NORMALIZATION
#endif // __N2BUILDER_H__

File diff suppressed because it is too large Load diff

View file

@ -389,29 +389,14 @@
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
<File
RelativePath=".\data.h"
>
<File
RelativePath=".\genpname.cpp"
>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl"
</File>
<File
RelativePath=".\genpname.cpp"
>
<File
RelativePath=".\data.h"
>
</File>
</Filter>
<Filter
Name="Resource Files"
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
>
</Filter>
</File>
</Files>
<Globals>
</Globals>

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2009, International Business Machines
* Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -50,6 +50,7 @@
#include "ucol_swp.h"
#include "ucnv_bld.h"
#include "unormimp.h"
#include "normalizer2impl.h"
#include "sprpimpl.h"
#include "propname.h"
#include "rbbidata.h"
@ -619,6 +620,7 @@ static const struct {
#if !UCONFIG_NO_NORMALIZATION
{ { 0x4e, 0x6f, 0x72, 0x6d }, unorm_swap }, /* dataFormat="Norm" */
{ { 0x4e, 0x72, 0x6d, 0x32 }, unorm2_swap }, /* dataFormat="Nrm2" */
#endif
#if !UCONFIG_NO_COLLATION
{ { 0x55, 0x43, 0x6f, 0x6c }, ucol_swap }, /* dataFormat="UCol" */

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -21,11 +21,6 @@
#include <stdio.h>
#include <sys/stat.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "toolutil.h"
#include "unicode/ucal.h"
#ifdef U_WINDOWS
# define VC_EXTRALEAN
@ -42,6 +37,27 @@
#endif
#include <errno.h>
#include "unicode/errorcode.h"
#include "unicode/putil.h"
#include "cmemory.h"
#include "cstring.h"
#include "toolutil.h"
#include "unicode/ucal.h"
U_NAMESPACE_BEGIN
// Reports a failure that is still set when the object goes out of scope.
IcuToolErrorCode::~IcuToolErrorCode() {
    if(!isFailure()) {
        return;
    }
    // Calling this from a destructor is safe:
    // our handleFailure() does not throw exceptions.
    handleFailure();
}
void IcuToolErrorCode::handleFailure() const {
fprintf(stderr, "error at %s: %s\n", location, errorName());
exit(errorCode);
}
U_NAMESPACE_END
static int32_t currentYear = -1;
U_CAPI int32_t U_EXPORT2 getCurrentYear() {
@ -235,6 +251,7 @@ utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
fprintf(stderr, "error: %s - out of memory\n", mem->name);
exit(U_MEMORY_ALLOCATION_ERROR);
}
mem->capacity=newCapacity;
}
return TRUE;
@ -242,9 +259,11 @@ utm_hasCapacity(UToolMemory *mem, int32_t capacity) {
U_CAPI void * U_EXPORT2
utm_alloc(UToolMemory *mem) {
char *p=(char *)mem->array+mem->idx*mem->size;
int32_t newIndex=mem->idx+1;
char *p=NULL;
int32_t oldIndex=mem->idx;
int32_t newIndex=oldIndex+1;
if(utm_hasCapacity(mem, newIndex)) {
p=(char *)mem->array+oldIndex*mem->size;
mem->idx=newIndex;
uprv_memset(p, 0, mem->size);
}
@ -253,9 +272,11 @@ utm_alloc(UToolMemory *mem) {
U_CAPI void * U_EXPORT2
utm_allocN(UToolMemory *mem, int32_t n) {
char *p=(char *)mem->array+mem->idx*mem->size;
int32_t newIndex=mem->idx+n;
char *p=NULL;
int32_t oldIndex=mem->idx;
int32_t newIndex=oldIndex+n;
if(utm_hasCapacity(mem, newIndex)) {
p=(char *)mem->array+oldIndex*mem->size;
mem->idx=newIndex;
uprv_memset(p, 0, n*mem->size);
}

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2009, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -21,6 +21,33 @@
#include "unicode/utypes.h"
#ifdef XP_CPLUSPLUS
#include "unicode/errorcode.h"
U_NAMESPACE_BEGIN
/**
* ErrorCode subclass for use in ICU command-line tools.
* The destructor calls handleFailure() which calls exit(errorCode) when isFailure().
*/
class U_TOOLUTIL_API IcuToolErrorCode : public ErrorCode {
public:
/**
* @param loc A short string describing where the IcuToolErrorCode is used.
*            Only the pointer is stored, not a copy of the string,
*            so it must remain valid for the lifetime of this object.
*/
IcuToolErrorCode(const char *loc) : location(loc) {}
/** Calls handleFailure() if a failure is still set. */
virtual ~IcuToolErrorCode();
protected:
/** Prints the location and the error name to stderr, then calls exit(errorCode). */
virtual void handleFailure() const;
private:
/** Caller-provided description printed by handleFailure(); not owned. */
const char *location;
};
U_NAMESPACE_END
#endif
/*
* For Windows, a path/filename may be the short (8.3) version
* of the "real", long one. In this case, the short one

View file

@ -407,261 +407,246 @@
<References>
</References>
<Files>
<Filter
Name="Source Files"
Filter="cpp;c;cxx;rc;def;r;odl;idl;hpj;bat"
<File
RelativePath=".\filestrm.c"
>
<File
RelativePath=".\filestrm.c"
>
</File>
<File
RelativePath=".\filetools.cpp"
>
</File>
<File
RelativePath=".\flagparser.c"
>
</File>
<File
RelativePath=".\package.cpp"
>
</File>
<File
RelativePath=".\pkg_genc.c"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\pkg_gencmn.c"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\pkg_icu.cpp"
>
</File>
<File
RelativePath=".\pkgitems.cpp"
>
</File>
<File
RelativePath=".\swapimpl.cpp"
>
</File>
<File
RelativePath=".\toolutil.c"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\ucbuf.c"
>
</File>
<File
RelativePath=".\ucm.c"
>
</File>
<File
RelativePath=".\ucmstate.c"
>
</File>
<File
RelativePath=".\unewdata.c"
>
</File>
<File
RelativePath=".\uoptions.c"
>
</File>
<File
RelativePath=".\uparse.c"
>
</File>
<File
RelativePath=".\writesrc.c"
>
</File>
<File
RelativePath=".\xmlparser.cpp"
>
</File>
</Filter>
<Filter
Name="Header Files"
Filter="h;hpp;hxx;hm;inl"
</File>
<File
RelativePath=".\filestrm.h"
>
<File
RelativePath=".\filestrm.h"
>
</File>
<File
RelativePath=".\filetools.h"
>
</File>
<File
RelativePath=".\flagparser.h"
>
</File>
<File
RelativePath=".\package.h"
>
</File>
<File
RelativePath=".\pkg_genc.h"
>
</File>
<File
RelativePath=".\pkg_gencmn.h"
>
</File>
<File
RelativePath=".\pkg_icu.h"
>
</File>
<File
RelativePath=".\pkg_imp.h"
>
</File>
<File
RelativePath=".\platform_xopen_source_extended.h"
>
</File>
<File
RelativePath=".\swapimpl.h"
>
</File>
<File
RelativePath=".\toolutil.h"
>
</File>
<File
RelativePath=".\ucbuf.h"
>
</File>
<File
RelativePath=".\ucm.h"
>
</File>
<File
RelativePath=".\unewdata.h"
>
</File>
<File
RelativePath=".\uoptions.h"
>
</File>
<File
RelativePath=".\uparse.h"
>
</File>
<File
RelativePath=".\writesrc.h"
>
</File>
<File
RelativePath=".\xmlparser.h"
>
</File>
</Filter>
<Filter
Name="Resource Files"
Filter="ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe"
</File>
<File
RelativePath=".\filetools.cpp"
>
</Filter>
</File>
<File
RelativePath=".\filetools.h"
>
</File>
<File
RelativePath=".\flagparser.c"
>
</File>
<File
RelativePath=".\flagparser.h"
>
</File>
<File
RelativePath=".\package.cpp"
>
</File>
<File
RelativePath=".\package.h"
>
</File>
<File
RelativePath=".\pkg_genc.c"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\pkg_genc.h"
>
</File>
<File
RelativePath=".\pkg_gencmn.c"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\pkg_gencmn.h"
>
</File>
<File
RelativePath=".\pkg_icu.cpp"
>
</File>
<File
RelativePath=".\pkg_icu.h"
>
</File>
<File
RelativePath=".\pkg_imp.h"
>
</File>
<File
RelativePath=".\pkgitems.cpp"
>
</File>
<File
RelativePath=".\platform_xopen_source_extended.h"
>
</File>
<File
RelativePath=".\swapimpl.cpp"
>
</File>
<File
RelativePath=".\swapimpl.h"
>
</File>
<File
RelativePath=".\toolutil.cpp"
>
<FileConfiguration
Name="Release|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|Win32"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Release|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
<FileConfiguration
Name="Debug|x64"
>
<Tool
Name="VCCLCompilerTool"
DisableLanguageExtensions="false"
/>
</FileConfiguration>
</File>
<File
RelativePath=".\toolutil.h"
>
</File>
<File
RelativePath=".\ucbuf.c"
>
</File>
<File
RelativePath=".\ucbuf.h"
>
</File>
<File
RelativePath=".\ucm.c"
>
</File>
<File
RelativePath=".\ucm.h"
>
</File>
<File
RelativePath=".\ucmstate.c"
>
</File>
<File
RelativePath=".\unewdata.c"
>
</File>
<File
RelativePath=".\unewdata.h"
>
</File>
<File
RelativePath=".\uoptions.c"
>
</File>
<File
RelativePath=".\uoptions.h"
>
</File>
<File
RelativePath=".\uparse.c"
>
</File>
<File
RelativePath=".\uparse.h"
>
</File>
<File
RelativePath=".\writesrc.c"
>
</File>
<File
RelativePath=".\writesrc.h"
>
</File>
<File
RelativePath=".\xmlparser.cpp"
>
</File>
<File
RelativePath=".\xmlparser.h"
>
</File>
</Files>
<Globals>
</Globals>

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999,2008, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -14,6 +14,7 @@
* created by: Markus W. Scherer
*/
#include <stdio.h>
#include "unicode/utypes.h"
#include "unicode/putil.h"
#include "unicode/ustring.h"
@ -162,6 +163,33 @@ udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode) {
return fileLength;
}
/*
 * dummy UDataInfo cf. udata.h
 * Used by udata_createDummy() as the header for a data file without real contents.
 * The field names in the comments below follow the UDataInfo struct in udata.h;
 * verify them against that header if the struct layout changes.
 */
static const UDataInfo dummyDataInfo = {
sizeof(UDataInfo), /* size */
0, /* reservedWord */
U_IS_BIG_ENDIAN, /* isBigEndian: taken from the build platform */
U_CHARSET_FAMILY, /* charsetFamily: taken from the build platform */
U_SIZEOF_UCHAR, /* sizeofUChar */
0, /* reservedByte */
{ 0, 0, 0, 0 }, /* dummy dataFormat */
{ 0, 0, 0, 0 }, /* dummy formatVersion */
{ 0, 0, 0, 0 } /* dummy dataVersion */
};
/*
 * Write a dummy data file: create it with dummyDataInfo and finish it
 * right away, without writing anything in between.
 *
 * Does nothing if *pErrorCode already indicates a failure on entry.
 * If creating or finishing the file fails, an error message naming the file
 * is printed to stderr and the process exits with the error code.
 */
U_CAPI void U_EXPORT2
udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode) {
    UNewDataMemory *pData;

    if(U_FAILURE(*pErrorCode)) {
        return;
    }

    pData=udata_create(dir, type, name, &dummyDataInfo, NULL, pErrorCode);
    udata_finish(pData, pErrorCode);
    if(U_SUCCESS(*pErrorCode)) {
        return;
    }

    /* tools treat a failure to write their output as fatal */
    fprintf(stderr, "error %s writing dummy data file %s" U_FILE_SEP_STRING "%s.%s\n",
            u_errorName(*pErrorCode), dir, name, type);
    exit(*pErrorCode);
}
U_CAPI void U_EXPORT2
udata_write8(UNewDataMemory *pData, uint8_t byte) {
if(pData!=NULL && pData->file!=NULL) {

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 1999-2000, International Business Machines
* Copyright (C) 1999-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -66,6 +66,10 @@ udata_create(const char *dir, const char *type, const char *name,
U_CAPI uint32_t U_EXPORT2
udata_finish(UNewDataMemory *pData, UErrorCode *pErrorCode);
/**
 * @memo Write a dummy data file.
 *
 * Does nothing if *pErrorCode indicates a failure on entry.
 * If the file cannot be written, an error message is printed to stderr
 * and the process exits with the error code, so this is for tools only.
 *
 * @param dir        Directory for the file.
 * @param type       File type; the error message reports the file as name.type
 * @param name       Base name of the file.
 * @param pErrorCode In/out ICU error code; it is dereferenced without a NULL check.
 */
U_CAPI void U_EXPORT2
udata_createDummy(const char *dir, const char *type, const char *name, UErrorCode *pErrorCode);
/** @memo Write an 8-bit byte to the file. */
U_CAPI void U_EXPORT2
udata_write8(UNewDataMemory *pData, uint8_t byte);

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2009, International Business Machines
* Copyright (C) 2000-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -81,7 +81,7 @@ u_parseDelimitedFile(const char *filename, char delimiter,
char *start, *limit;
int32_t i, length;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return;
}
@ -193,7 +193,7 @@ u_parseCodePoints(const char *s,
uint32_t value;
int32_t count;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
@ -242,7 +242,7 @@ u_parseString(const char *s,
uint32_t value;
int32_t destLength;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s==NULL || destCapacity<0 || (destCapacity>0 && dest==NULL)) {
@ -275,15 +275,16 @@ u_parseString(const char *s,
}
/* store the first code point */
if(destLength==0 && pFirst!=NULL) {
if(pFirst!=NULL) {
*pFirst=value;
pFirst=NULL;
}
/* append it to the destination array */
if((destLength+UTF_CHAR_LENGTH(value))<=destCapacity) {
UTF_APPEND_CHAR_UNSAFE(dest, destLength, value);
if((destLength+U16_LENGTH(value))<=destCapacity) {
U16_APPEND_UNSAFE(dest, destLength, value);
} else {
destLength+=UTF_CHAR_LENGTH(value);
destLength+=U16_LENGTH(value);
}
/* go to the following characters */
@ -293,13 +294,14 @@ u_parseString(const char *s,
/* read a range like start or start..end */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
uint32_t *pStart, uint32_t *pEnd,
UErrorCode *pErrorCode) {
u_parseCodePointRangeAnyTerminator(const char *s,
uint32_t *pStart, uint32_t *pEnd,
const char **terminator,
UErrorCode *pErrorCode) {
char *end;
uint32_t value;
if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) {
if(U_FAILURE(*pErrorCode)) {
return 0;
}
if(s==NULL || pStart==NULL || pEnd==NULL) {
@ -307,15 +309,10 @@ u_parseCodePointRange(const char *s,
return 0;
}
s=u_skipWhitespace(s);
if(*s==';' || *s==0) {
*pErrorCode=U_PARSE_ERROR;
return 0;
}
/* read the start code point */
s=u_skipWhitespace(s);
value=(uint32_t)uprv_strtoul(s, &end, 16);
if(end<=s || (!IS_INV_WHITESPACE(*end) && *end!='.' && *end!=';' && *end!=0) || value>=0x110000) {
if(end<=s || value>=0x110000) {
*pErrorCode=U_PARSE_ERROR;
return 0;
}
@ -323,19 +320,15 @@ u_parseCodePointRange(const char *s,
/* is there a "..end"? */
s=u_skipWhitespace(end);
if(*s==';' || *s==0) {
if(*s!='.' || s[1]!='.') {
*terminator=end;
return 1;
}
if(*s!='.' || s[1]!='.') {
*pErrorCode=U_PARSE_ERROR;
return 0;
}
s+=2;
s=u_skipWhitespace(s+2);
/* read the end code point */
value=(uint32_t)uprv_strtoul(s, &end, 16);
if(end<=s || (!IS_INV_WHITESPACE(*end) && *end!=';' && *end!=0) || value>=0x110000) {
if(end<=s || value>=0x110000) {
*pErrorCode=U_PARSE_ERROR;
return 0;
}
@ -347,14 +340,25 @@ u_parseCodePointRange(const char *s,
return 0;
}
/* no garbage after that? */
s=u_skipWhitespace(end);
if(*s==';' || *s==0) {
return value-*pStart+1;
} else {
*pErrorCode=U_PARSE_ERROR;
return 0;
*terminator=end;
return value-*pStart+1;
}
/*
 * Parse a code point range via u_parseCodePointRangeAnyTerminator() and
 * additionally require that, after optional white space, the range is
 * followed by a semicolon or by the end of the string.
 *
 * Returns the result of u_parseCodePointRangeAnyTerminator(), or 0 with
 * *pErrorCode set to U_PARSE_ERROR if anything else follows the range.
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRange(const char *s,
                      uint32_t *pStart, uint32_t *pEnd,
                      UErrorCode *pErrorCode) {
    const char *afterRange;
    int32_t count=u_parseCodePointRangeAnyTerminator(s, pStart, pEnd, &afterRange, pErrorCode);

    if(U_FAILURE(*pErrorCode)) {
        /* afterRange is only looked at after a successful parse */
        return count;
    }

    afterRange=u_skipWhitespace(afterRange);
    if(*afterRange==';' || *afterRange==0) {
        return count;
    }

    /* garbage after the range */
    *pErrorCode=U_PARSE_ERROR;
    return 0;
}
U_CAPI int32_t U_EXPORT2

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2000-2009, International Business Machines
* Copyright (C) 2000-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -126,6 +126,16 @@ u_parseCodePointRange(const char *s,
uint32_t *pStart, uint32_t *pEnd,
UErrorCode *pErrorCode);
/**
 * Same as u_parseCodePointRange() but the range may be terminated by
 * any character. The position of the terminating character is returned via
 * the *terminator output parameter.
 *
 * On success, *terminator points directly behind the last code point that
 * was parsed; white space after the range is not skipped.
 *
 * @param s          Input string with "start" or "start..end" in hexadecimal.
 * @param pStart     Output: start of the range.
 * @param pEnd       Output: end of the range.
 * @param terminator Output: position behind the parsed range. It is written
 *                   without a NULL check in the visible implementation,
 *                   so pass a valid pointer.
 * @param pErrorCode In/out ICU error code; set to U_PARSE_ERROR if no code point
 *                   can be read or if a value is not below 0x110000.
 * @return 1 for a single code point, the number of code points for
 *         a start..end range, or 0 on error.
 */
U_CAPI int32_t U_EXPORT2
u_parseCodePointRangeAnyTerminator(const char *s,
uint32_t *pStart, uint32_t *pEnd,
const char **terminator,
UErrorCode *pErrorCode);
U_CAPI int32_t U_EXPORT2
u_parseUTF8(const char *source, int32_t sLen, char *dest, int32_t destCapacity, UErrorCode *status);

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2008, International Business Machines
* Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -24,8 +24,8 @@
#include "cstring.h"
#include "writesrc.h"
U_CAPI FILE * U_EXPORT2
usrc_create(const char *path, const char *filename) {
static FILE *
usrc_createWithHeader(const char *path, const char *filename, const char *header) {
char buffer[1024];
const char *p;
char *q;
@ -55,19 +55,7 @@ usrc_create(const char *path, const char *filename) {
lt=localtime(&t);
strftime(year, sizeof(year), "%Y", lt);
strftime(buffer, sizeof(buffer), "%Y-%m-%d", lt);
fprintf(
f,
"/*\n"
" * Copyright (C) 1999-%s, International Business Machines\n"
" * Corporation and others. All Rights Reserved.\n"
" *\n"
" * file name: %s\n"
" *\n"
" * machine-generated on: %s\n"
" */\n\n",
year,
filename,
buffer);
fprintf(f, header, year, filename, buffer);
} else {
fprintf(
stderr,
@ -77,6 +65,33 @@ usrc_create(const char *path, const char *filename) {
return f;
}
/*
 * Create a source text file whose header is a C/Java-style block comment.
 * The three %s in the template are filled in by usrc_createWithHeader()
 * with the current year, the file name, and the current date.
 */
U_CAPI FILE * U_EXPORT2
usrc_create(const char *path, const char *filename) {
    static const char cStyleHeader[]=
        "/*\n"
        " * Copyright (C) 1999-%s, International Business Machines\n"
        " * Corporation and others. All Rights Reserved.\n"
        " *\n"
        " * file name: %s\n"
        " *\n"
        " * machine-generated on: %s\n"
        " */\n\n";

    return usrc_createWithHeader(path, filename, cStyleHeader);
}
/*
 * Create a text data file whose header is written as lines starting with #.
 * The three %s in the template are filled in by usrc_createWithHeader()
 * with the current year, the file name, and the current date.
 */
U_CAPI FILE * U_EXPORT2
usrc_createTextData(const char *path, const char *filename) {
    static const char hashStyleHeader[]=
        "# Copyright (C) 1999-%s, International Business Machines\n"
        "# Corporation and others. All Rights Reserved.\n"
        "#\n"
        "# file name: %s\n"
        "#\n"
        "# machine-generated on: %s\n"
        "#\n\n";

    return usrc_createWithHeader(path, filename, hashStyleHeader);
}
U_CAPI void U_EXPORT2
usrc_writeArray(FILE *f,
const char *prefix,

View file

@ -1,7 +1,7 @@
/*
*******************************************************************************
*
* Copyright (C) 2005-2008, International Business Machines
* Copyright (C) 2005-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
@ -25,10 +25,18 @@
/**
* Create a source text file and write a header comment with the ICU copyright.
* Writes a C/Java-style comment.
* The header also contains the file name and the date on which it was generated.
*
* @param path     Directory for the new file.
* @param filename Name of the new file; also written into the header comment.
* @return The open FILE; if the file cannot be opened, an error message is
*         printed to stderr (the return value is then presumably NULL --
*         check the result before writing).
*/
U_CAPI FILE * U_EXPORT2
usrc_create(const char *path, const char *filename);
/**
* Create a source text file and write a header comment with the ICU copyright.
* Writes the comment with # lines, as used in scripts and text data.
* The header also contains the file name and the date on which it was generated.
*
* @param path     Directory for the new file.
* @param filename Name of the new file; also written into the header comment.
* @return The open FILE; if the file cannot be opened, an error message is
*         printed to stderr (the return value is then presumably NULL --
*         check the result before writing).
*/
U_CAPI FILE * U_EXPORT2
usrc_createTextData(const char *path, const char *filename);
/**
* Write the contents of an array of 8/16/32-bit words.
* The prefix and postfix are optional (can be NULL) and are written first/last.