ICU-13197 improved normalization data structure and code; .nrm formatVersion 3; merged from branches/markus/normv3 except for cherry-picks from trunk to there

X-SVN-Rev: 40265
This commit is contained in:
Markus Scherer 2017-07-14 22:38:40 +00:00
parent 3da97c910d
commit e6748afd82
34 changed files with 3372 additions and 2692 deletions

View file

@ -20,11 +20,13 @@
#if !UCONFIG_NO_NORMALIZATION
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"
#include "cpputils.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
U_NAMESPACE_BEGIN
@ -85,6 +87,52 @@ FilteredNormalizer2::normalize(const UnicodeString &src,
return dest;
}
void
FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return;
}
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
options |= U_EDITS_NO_RESET; // Do not reset for each span.
normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
}
/**
 * Walks src[0..length[ span by span, flipping spanCondition between
 * USET_SPAN_NOT_CONTAINED and USET_SPAN_SIMPLE after each set.spanUTF8() call.
 *
 * - A non-empty span found with USET_SPAN_NOT_CONTAINED is passed through as is:
 *   it is recorded as unchanged in edits (if not nullptr) and appended to sink
 *   unless U_OMIT_UNCHANGED_TEXT is set in options.
 * - Any other non-empty span is handed to norm2.normalizeUTF8();
 *   processing stops as soon as that call sets a failure in errorCode.
 */
void
FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
                                   ByteSink &sink, Edits *edits,
                                   USetSpanCondition spanCondition,
                                   UErrorCode &errorCode) const {
    while (length > 0) {
        const int32_t spanLength = set.spanUTF8(src, length, spanCondition);
        const bool passThrough = (spanCondition == USET_SPAN_NOT_CONTAINED);
        // The next span is always looked up with the opposite condition.
        spanCondition = passThrough ? USET_SPAN_SIMPLE : USET_SPAN_NOT_CONTAINED;
        if (spanLength == 0) {
            continue;  // Empty span: nothing to emit, nothing to skip.
        }
        if (passThrough) {
            if (edits != nullptr) {
                edits->addUnchanged(spanLength);
            }
            if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
                sink.Append(src, spanLength);
            }
        } else {
            // Not norm2.normalizeSecondAndAppend() because we do not want
            // to modify the non-filter part of dest.
            norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
            if (U_FAILURE(errorCode)) {
                return;
            }
        }
        src += spanLength;
        length -= spanLength;
    }
}
UnicodeString &
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,

View file

@ -62,7 +62,7 @@ LoadedNormalizer2Impl::isAcceptable(void * /*context*/,
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6d &&
pInfo->dataFormat[3]==0x32 &&
pInfo->formatVersion[0]==2
pInfo->formatVersion[0]==3
) {
// Normalizer2Impl *me=(Normalizer2Impl *)context;
// uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
@ -84,7 +84,7 @@ LoadedNormalizer2Impl::load(const char *packageName, const char *name, UErrorCod
const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
const int32_t *inIndexes=(const int32_t *)inBytes;
int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
if(indexesLength<=IX_MIN_YES_NO_MAPPINGS_ONLY) {
if(indexesLength<=IX_MIN_LCCC_CP) {
errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
return;
}

File diff suppressed because it is too large Load diff

View file

@ -23,6 +23,7 @@
#include "unicode/unistr.h"
#include "cpputils.h"
#include "normalizer2impl.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
U_NAMESPACE_BEGIN
@ -211,8 +212,8 @@ private:
// Single-code-point quick check: UNORM_YES if the norm16 value of c
// passes impl.isDecompYes(), otherwise UNORM_NO.
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
    if (impl.isDecompYes(impl.getNorm16(c))) {
        return UNORM_YES;
    }
    return UNORM_NO;
}
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundary(c, TRUE); }
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundary(c, FALSE); }
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); }
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); }
virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
};
@ -236,12 +237,12 @@ private:
if (U_FAILURE(errorCode)) {
return;
}
if (edits != nullptr) {
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
impl.composeUTF8(options, s, s + src.length(),
onlyContiguous, TRUE, sink, edits, errorCode);
impl.composeUTF8(options, onlyContiguous, s, s + src.length(),
&sink, edits, errorCode);
sink.Flush();
}
@ -295,10 +296,10 @@ private:
return impl.hasCompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const override {
return impl.hasCompBoundaryAfter(c, onlyContiguous, FALSE);
return impl.hasCompBoundaryAfter(c, onlyContiguous);
}
virtual UBool isInert(UChar32 c) const override {
return impl.hasCompBoundaryAfter(c, onlyContiguous, TRUE);
return impl.isCompInert(c, onlyContiguous);
}
const UBool onlyContiguous;

View file

@ -30,6 +30,7 @@
#include "normalizer2impl.h"
#include "uassert.h"
#include "ucln_cmn.h"
#include "ustr_imp.h" // U_EDITS_NO_RESET
using icu::Normalizer2Impl;
@ -90,14 +91,18 @@ class NoopNormalizer2 : public Normalizer2 {
return dest;
}
virtual void
normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override {
if(U_SUCCESS(errorCode)) {
sink.Append(src.data(), src.length());
if (edits != nullptr) {
edits->reset();
if ((options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
edits->addUnchanged(src.length());
}
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
sink.Append(src.data(), src.length());
}
sink.Flush();
}
}

File diff suppressed because it is too large Load diff

View file

@ -37,6 +37,8 @@ struct CanonIterData;
class ByteSink;
class Edits;
class InitCanonIterData;
class LcccContext;
class U_COMMON_API Hangul {
public:
@ -66,9 +68,9 @@ public:
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
}
static inline UBool
isHangulWithoutJamoT(UChar c) {
isHangulLV(UChar32 c) {
c-=HANGUL_BASE;
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
static inline UBool isJamoL(UChar32 c) {
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
@ -76,6 +78,14 @@ public:
static inline UBool isJamoV(UChar32 c) {
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
}
/**
 * Tests whether c lies strictly above JAMO_T_BASE and less than
 * JAMO_T_COUNT code points above it. JAMO_T_BASE itself does not count.
 */
static inline UBool isJamoT(UChar32 c) {
    const int32_t offset=c-JAMO_T_BASE;
    return offset<JAMO_T_COUNT && offset>0;
}
/**
 * Tests whether c is in one of three sub-ranges of [JAMO_L_BASE..JAMO_T_END]:
 * up to JAMO_L_END, [JAMO_V_BASE..JAMO_V_END], or strictly above JAMO_T_BASE.
 */
static UBool isJamo(UChar32 c) {
    const UBool inOverallRange=JAMO_L_BASE<=c && c<=JAMO_T_END;
    const UBool inLeadPart=c<=JAMO_L_END;  // only meaningful together with inOverallRange
    const UBool inVowelPart=JAMO_V_BASE<=c && c<=JAMO_V_END;
    const UBool inTrailPart=JAMO_T_BASE<c;  // excludes JAMO_T_BASE itself
    return inOverallRange && (inLeadPart || inVowelPart || inTrailPart);
}
/**
* Decomposes c, which must be a Hangul syllable, into buffer
@ -120,10 +130,13 @@ class Normalizer2Impl;
class U_COMMON_API ReorderingBuffer : public UMemory {
public:
/** Constructs only; init() should be called. */
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
impl(ni), str(dest),
start(NULL), reorderStart(NULL), limit(NULL),
remainingCapacity(0), lastCC(0) {}
/** Constructs, removes the string contents, and initializes for a small initial capacity. */
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
~ReorderingBuffer() {
if(start!=NULL) {
str.releaseBuffer((int32_t)(limit-start));
@ -140,11 +153,6 @@ public:
UBool equals(const UChar *start, const UChar *limit) const;
UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
// For Hangul composition, replacing the Leading consonant Jamo with the syllable.
void setLastChar(UChar c) {
*(limit-1)=c;
}
UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
return (c<=0xffff) ?
appendBMP((UChar)c, cc, errorCode) :
@ -222,6 +230,12 @@ private:
UChar *codePointStart, *codePointLimit;
};
/**
* Low-level implementation of the Unicode Normalization Algorithm.
* For the data structure and details see the documentation at the end of
* this normalizer2impl.h and in the design doc at
* http://site.icu-project.org/design/normalization/custom
*/
class U_COMMON_API Normalizer2Impl : public UObject {
public:
Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) {
@ -238,8 +252,6 @@ public:
// low-level properties ------------------------------------------------ ***
const UTrie2 *getNormTrie() const { return normTrie; }
UBool ensureCanonIterData(UErrorCode &errorCode) const;
uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
@ -259,15 +271,22 @@ public:
uint8_t getCC(uint16_t norm16) const {
if(norm16>=MIN_NORMAL_MAYBE_YES) {
return (uint8_t)norm16;
return getCCFromNormalYesOrMaybe(norm16);
}
if(norm16<minNoNo || limitNoNo<=norm16) {
return 0;
}
return getCCFromNoNo(norm16);
}
/**
 * Extracts the combining class from a norm16 value by dropping the low
 * OFFSET_SHIFT bits and keeping the low 8 bits of the result.
 * Callers in this class only use it when norm16>=MIN_NORMAL_MAYBE_YES.
 */
static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
    const int32_t shifted=norm16 >> OFFSET_SHIFT;
    return static_cast<uint8_t>(shifted);
}
static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
}
/**
 * Code point variant of getCCFromYesOrMaybe():
 * returns 0 without a trie lookup for c below minCompNoMaybeCP.
 */
uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
    return c < minCompNoMaybeCP ? 0 : getCCFromYesOrMaybe(getNorm16(c));
}
/**
@ -276,10 +295,8 @@ public:
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
uint16_t getFCD16(UChar32 c) const {
if(c<0) {
if(c<minDecompNoCP) {
return 0;
} else if(c<0x180) {
return tccc180[c];
} else if(c<=0xffff) {
if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
}
@ -295,9 +312,7 @@ public:
*/
uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
UChar32 c=*s++;
if(c<0x180) {
return tccc180[c];
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
return 0;
}
UChar c2;
@ -315,8 +330,8 @@ public:
*/
uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
UChar32 c=*--s;
if(c<0x180) {
return tccc180[c];
if(c<minDecompNoCP) {
return 0;
}
if(!U16_IS_TRAIL(c)) {
if(!singleLeadMightHaveNonZeroFCD16(c)) {
@ -332,8 +347,6 @@ public:
return getFCD16FromNormData(c);
}
/** Returns the FCD data for U+0000<=c<U+0180. */
uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
/** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */
UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
// 0<=lead<=0xffff
@ -344,9 +357,6 @@ public:
/** Returns the FCD value from the regular normalization data. */
uint16_t getFCD16FromNormData(UChar32 c) const;
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
CanonIterData &newData, UErrorCode &errorCode) const;
/**
* Gets the decomposition for one code point.
* @param c code point
@ -371,14 +381,25 @@ public:
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
enum {
MIN_CCC_LCCC_CP=0x300
};
// Fixed norm16 values.
MIN_YES_YES_WITH_CC=0xfe02,
JAMO_VT=0xfe00,
MIN_NORMAL_MAYBE_YES=0xfc00,
JAMO_L=2, // offset=1 hasCompBoundaryAfter=FALSE
INERT=1, // offset=0 hasCompBoundaryAfter=TRUE
// norm16 bit 0 is comp-boundary-after.
HAS_COMP_BOUNDARY_AFTER=1,
OFFSET_SHIFT=1,
// For algorithmic one-way mappings, norm16 bits 2..1 indicate the
// tccc (0, 1, >1) for quick FCC boundary-after tests.
DELTA_TCCC_0=0,
DELTA_TCCC_1=2,
DELTA_TCCC_GT_1=4,
DELTA_TCCC_MASK=6,
DELTA_SHIFT=3,
enum {
MIN_YES_YES_WITH_CC=0xff01,
JAMO_VT=0xff00,
MIN_NORMAL_MAYBE_YES=0xfe00,
JAMO_L=1,
MAX_DELTA=0x40
};
@ -398,21 +419,32 @@ public:
IX_MIN_COMP_NO_MAYBE_CP,
// Norm16 value thresholds for quick check combinations and types of extra data.
IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
IX_MIN_YES_NO,
// Mappings are comp-normalized.
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[.
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
IX_MIN_YES_NO_MAPPINGS_ONLY,
// Mappings are not comp-normalized but have a comp boundary before.
IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
// Mappings do not have a comp boundary before.
IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
// Mappings to the empty string.
IX_MIN_NO_NO_EMPTY,
IX_RESERVED15,
IX_MIN_LCCC_CP,
IX_RESERVED19,
IX_COUNT
};
enum {
MAPPING_HAS_CCC_LCCC_WORD=0x80,
MAPPING_HAS_RAW_MAPPING=0x40,
MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
// unused bit 0x20,
MAPPING_LENGTH_MASK=0x1f
};
@ -462,10 +494,10 @@ public:
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool composeUTF8(uint32_t options,
/** sink==nullptr: isNormalized() */
UBool composeUTF8(uint32_t options, UBool onlyContiguous,
const uint8_t *src, const uint8_t *limit,
UBool onlyContiguous, UBool doCompose,
ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) const;
ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
const UChar *makeFCD(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
@ -475,27 +507,42 @@ public:
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool hasDecompBoundary(UChar32 c, UBool before) const;
UBool hasDecompBoundaryBefore(UChar32 c) const;
UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
UBool hasDecompBoundaryAfter(UChar32 c) const;
UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
UBool hasCompBoundaryBefore(UChar32 c) const {
return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
}
/** Looks up the norm16 value of c and delegates to norm16HasCompBoundaryAfter(). */
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
    const uint16_t norm16=getNorm16(c);
    return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}
/**
 * Composition-inert test for c. Its norm16 value must pass isCompYesAndZeroCC()
 * and have the HAS_COMP_BOUNDARY_AFTER bit set.
 * With onlyContiguous, norm16 must additionally be INERT or the first unit
 * of its mapping must be <=0x1ff.
 */
UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
    const uint16_t norm16=getNorm16(c);
    const UBool yesWithBoundaryAfter=
        isCompYesAndZeroCC(norm16) && (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0;
    if(!onlyContiguous) {
        return yesWithBoundaryAfter;
    }
    // The mapping is only looked at when the basic test passed and norm16 is not INERT.
    return yesWithBoundaryAfter && (isInert(norm16) || *getMapping(norm16) <= 0x1ff);
}
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
UBool hasFCDBoundaryAfter(UChar32 c) const {
uint16_t fcd16=getFCD16(c);
return fcd16<=1 || (fcd16&0xff)==0;
}
UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
private:
friend class InitCanonIterData;
friend class LcccContext;
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
static UBool isInert(uint16_t norm16) { return norm16==0; }
static UBool isJamoL(uint16_t norm16) { return norm16==1; }
static UBool isInert(uint16_t norm16) { return norm16==INERT; }
static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
UBool isHangulLVT(uint16_t norm16) const {
return norm16==hangulLVT();
}
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
// UBool isCompYes(uint16_t norm16) const {
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
@ -514,7 +561,7 @@ private:
/**
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
* the MaybeYes which combine-forward and have ccc=0.
* (Standard Unicode 5.2 normalization does not have such characters.)
* (Standard Unicode 10 normalization does not have such characters.)
*/
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
@ -524,7 +571,7 @@ private:
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
// return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
// }
uint8_t getCCFromNoNo(uint16_t norm16) const {
const uint16_t *mapping=getMapping(norm16);
@ -535,30 +582,47 @@ private:
}
}
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
/**
 * Returns the trailing combining class (tccc) for a norm16 value:
 * 0 up to and including minYesNo, otherwise the high byte of the first mapping unit.
 */
uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
    if(norm16>minYesNo) {
        // yesNo: the tccc is stored in bits 15..8 of the first unit.
        // Hangul LVT also gets here; its firstUnit harmlessly yields tccc=0.
        const uint16_t firstUnit=*getMapping(norm16);
        return (uint8_t)(firstUnit>>8);
    }
    // yesYes and Hangul LV: ccc=tccc=0
    return 0;
}
uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const;
uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
// Requires algorithmic-NoNo.
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
return c+norm16-(minMaybeYes-MAX_DELTA-1);
return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
}
/**
 * Returns the signed delta stored in norm16: the bits above DELTA_SHIFT,
 * re-centered by subtracting centerNoNoDelta.
 * NOTE(review): presumably requires an algorithmic-NoNo norm16, like mapAlgorithmic() - confirm.
 */
UChar32 getAlgorithmicDelta(uint16_t norm16) const {
    const int32_t biasedDelta=norm16>>DELTA_SHIFT;
    return biasedDelta-centerNoNoDelta;
}
// Requires minYesNo<norm16<limitNoNo.
const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
return NULL;
} else if(norm16<minMaybeYes) {
return extraData+norm16; // for yesYes; if Jamo L: harmless empty list
return getMapping(norm16); // for yesYes; if Jamo L: harmless empty list
} else {
return maybeYesCompositions+norm16-minMaybeYes;
}
}
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
// A composite has both mapping & compositions list.
const uint16_t *list=getMapping(norm16);
return list+ // mapping pointer
1+ // +1 to skip the first unit with the mapping lenth
1+ // +1 to skip the first unit with the mapping length
(*list&MAPPING_LENGTH_MASK); // + mapping length
}
/**
 * Returns the pointer into maybeYesCompositions for norm16.
 * Expects minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES.
 */
const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
    // The distance from minMaybeYes is shifted down by OFFSET_SHIFT to get the array offset.
    const int32_t offset=(norm16-minMaybeYes)>>OFFSET_SHIFT;
    return maybeYesCompositions+offset;
}
/**
* @param c code point must have compositions
* @return compositions list pointer
@ -573,55 +637,78 @@ private:
UChar32 minNeedDataCP,
ReorderingBuffer *buffer,
UErrorCode &errorCode) const;
UBool decomposeShort(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
const UChar *decomposeShort(const UChar *src, const UChar *limit,
UBool stopAtCompBoundary, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
UBool decompose(UChar32 c, uint16_t norm16,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
UBool stopAtCompBoundary, ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
UBool stopAtCompBoundary, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
static int32_t combine(const uint16_t *list, UChar32 trail);
void addComposites(const uint16_t *list, UnicodeSet &set) const;
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
UBool onlyContiguous) const;
int32_t getCompProps(const uint8_t *src, const uint8_t *limit,
uint16_t norm16, UBool onlyContiguous) const;
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
/**
 * There is a composition boundary before c unless c is at or above
 * minCompNoMaybeCP and its norm16 value fails norm16HasCompBoundaryBefore().
 */
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
    return !(c>=minCompNoMaybeCP && !norm16HasCompBoundaryBefore(norm16));
}
/**
 * Decides from the norm16 value alone: TRUE below the minNoNoCompNoMaybeCC
 * threshold, and also for values that pass isAlgorithmicNoNo().
 */
UBool norm16HasCompBoundaryBefore(uint16_t norm16) const {
    const UBool belowThreshold=norm16 < minNoNoCompNoMaybeCC;
    return belowThreshold || isAlgorithmicNoNo(norm16);
}
UBool hasCompBoundaryBefore(const UChar *src, const UChar *limit) const;
UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
UBool hasCompBoundaryAfter(const UChar *start, const UChar *p,
UBool onlyContiguous) const;
UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
UBool onlyContiguous) const;
/**
 * Requires the HAS_COMP_BOUNDARY_AFTER bit in norm16.
 * With onlyContiguous, isTrailCC01ForCompBoundaryAfter() must pass as well.
 */
UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
    const UBool hasFlag=(norm16 & HAS_COMP_BOUNDARY_AFTER) != 0;
    if(!hasFlag || !onlyContiguous) {
        return hasFlag;  // The flag alone decides.
    }
    return isTrailCC01ForCompBoundaryAfter(norm16);
}
/**
 * For FCC: Given a norm16 with HAS_COMP_BOUNDARY_AFTER, is its tccc<=1?
 * INERT always passes. Algorithmic mappings are judged by their DELTA_TCCC_MASK bits,
 * everything else by the first unit of the mapping (<=0x1ff).
 */
UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
    if(!isInert(norm16) && isDecompNoAlgorithmic(norm16)) {
        return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1;
    }
    // Either INERT (no mapping lookup needed) or a value with a stored mapping.
    return isInert(norm16) || *getMapping(norm16) <= 0x1ff;
}
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p, UBool onlyContiguous) const;
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit, UBool onlyContiguous) const;
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
CanonIterData &newData, UErrorCode &errorCode) const;
int32_t getCanonValue(UChar32 c) const;
const UnicodeSet &getCanonStartSet(int32_t n) const;
// UVersionInfo dataVersion;
// Code point thresholds for quick check codes.
UChar32 minDecompNoCP;
UChar32 minCompNoMaybeCP;
// BMP code point thresholds for quick check loops looking at single UTF-16 code units.
UChar minDecompNoCP;
UChar minCompNoMaybeCP;
UChar minLcccCP;
// Norm16 value thresholds for quick check combinations and types of extra data.
uint16_t minYesNo;
uint16_t minYesNoMappingsOnly;
uint16_t minNoNo;
uint16_t minNoNoCompBoundaryBefore;
uint16_t minNoNoCompNoMaybeCC;
uint16_t minNoNoEmpty;
uint16_t limitNoNo;
uint16_t centerNoNoDelta;
uint16_t minMaybeYes;
const UTrie2 *normTrie;
const uint16_t *maybeYesCompositions;
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F
public: // CanonIterData is public to allow access from C callback functions.
UInitOnce fCanonIterDataInitOnce;
CanonIterData *fCanonIterData;
};
@ -677,13 +764,14 @@ unorm_getFCD16(UChar32 c);
/**
* Format of Normalizer2 .nrm data files.
* Format version 2.0.
* Format version 3.0.
*
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
* ICU ships with data files for standard Unicode Normalization Forms
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
* Custom (application-specific) data can be built into additional .nrm files
* with the gennorm2 build tool.
* ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
*
* Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
* cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
@ -714,14 +802,20 @@ unorm_getFCD16(UChar32 c);
* with a decomposition mapping, that is, with NF*D_QC=No.
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
* minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
* is the lowest code point with lccc!=0.
*
* The next five indexes are thresholds of 16-bit trie values for ranges of
* The next eight indexes are thresholds of 16-bit trie values for ranges of
* values indicating multiple normalization properties.
* They are listed here in threshold order, not in the order they are stored in the indexes.
* minYesNo=indexes[IX_MIN_YES_NO];
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* minNoNo=indexes[IX_MIN_NO_NO];
* minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
* minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
* minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
* limitNoNo=indexes[IX_LIMIT_NO_NO];
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* See the normTrie description below and the design doc for details.
*
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
@ -729,12 +823,14 @@ unorm_getFCD16(UChar32 c);
* The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
* Rather than using independent bits in the value (which would require more than 16 bits),
* information is extracted primarily via range checks.
* Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
* For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
* which means it has a two-way (round-trip) decomposition mapping.
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
* pointing to mappings, compositions lists, or both.
* Value norm16==0 means that the character is normalization-inert, that is,
* Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
* means that the character is normalization-inert, that is,
* it does not have a mapping, does not participate in composition, has a zero
* canonical combining class, and forms a boundary where text before it and after it
* can be normalized independently.
@ -748,7 +844,7 @@ unorm_getFCD16(UChar32 c);
* The trie has a value for each lead surrogate code unit representing the "worst case"
* properties of the 1024 supplementary characters whose UTF-16 form starts with
* the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
* then their lead surrogate code unit has the trie value 0.
* then their lead surrogate code unit has the trie value INERT.
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
* the properties for the full supplementary code point need to be looked up.
*
@ -757,6 +853,7 @@ unorm_getFCD16(UChar32 c);
*
* There is only one byte offset for the end of these two arrays.
* The split between them is given by the constant and variable mentioned above.
* In version 3, the difference must be shifted right by OFFSET_SHIFT.
*
* The maybeYesCompositions array contains compositions lists for characters that
* combine both forward (as starters in composition pairs)
@ -773,6 +870,8 @@ unorm_getFCD16(UChar32 c);
* followed by only mappings for "NoNo" characters.
* (Referring to pairs of NFC/NFD quick check values.)
* The norm16 values of those characters are directly indexes into the extraData array.
* In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
* for accessing extraData.
*
* The data structures for compositions lists and mappings are described in the design doc.
*
@ -803,6 +902,50 @@ unorm_getFCD16(UChar32 c);
* This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
* - Addition of the smallFCD[] bit set.
*
* Changes from format version 2 to format version 3 (ICU 60) ------------------
*
* - norm16 bit 0 indicates hasCompBoundaryAfter(),
* except that for contiguous composition (FCC) the tccc must be checked as well.
* Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
* Thresholds like minNoNo are tested before shifting.
*
* - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
* to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
* See DELTA_TCCC_MASK etc.
* This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
* minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
*
* - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
* and ASCII characters are mapped algorithmically only to other ASCII characters.
* This helps with hasCompBoundaryBefore() and compose() fast paths.
* It is never necessary any more to loop for algorithmic mappings.
*
* - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
* indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
* and separation of the noNo extraData into distinct ranges.
* With this, the noNo norm16 value indicates whether the mapping is
* compose-normalized, not normalized but hasCompBoundaryBefore(),
* not even that, or maps to an empty string.
* hasCompBoundaryBefore() can be determined solely from the norm16 value.
*
* - The norm16 value for Hangul LVT is now different from that for Hangul LV,
* so that hasCompBoundaryAfter() need not check for the syllable type.
* For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
* For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
* The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
* to simplify some code.
*
* - The extraData firstUnit bit 5 is no longer necessary
* (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
* is reserved again, and always set to 0.
*
* - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
* This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
* U+00AD Soft Hyphen maps to an empty string,
* which is artificially assigned "worst case" values lccc=1 and tccc=255.
*
* - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
*/
#endif /* !UCONFIG_NO_NORMALIZATION */

View file

@ -820,7 +820,7 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P
return 0;
}
if(edits!=NULL) {
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR

View file

@ -240,7 +240,7 @@ public:
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @internal ICU 60 technology preview, may be changed or removed in the future
* @draft ICU 60
*/
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
@ -510,7 +510,35 @@ public:
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const;
UErrorCode &errorCode) const override;
/**
* Normalizes a UTF-8 string and optionally records how source substrings
* relate to changed and unchanged result substrings.
*
* Currently implemented completely only for "compose" modes,
* such as for NFC, NFKC, and NFKC_Casefold
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* Otherwise currently converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
* @param src Source UTF-8 string.
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
* sink.Flush() is called at the end.
* @param edits Records edits for index mapping, working with styled text,
* and getting only changes (if any).
* The Edits contents are undefined if any error occurs.
* This function calls edits->reset() first. edits can be nullptr.
* @param errorCode Standard ICU error code. Its input value must
* pass the U_SUCCESS() test, or else the function returns
* immediately. Check for U_FAILURE() on output or use with
* function chaining. (See User Guide for details.)
* @draft ICU 60
*/
virtual void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
Edits *edits, UErrorCode &errorCode) const override;
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
@ -528,7 +556,7 @@ public:
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const;
UErrorCode &errorCode) const override;
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
@ -546,7 +574,7 @@ public:
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const;
UErrorCode &errorCode) const override;
/**
* Gets the decomposition mapping of c.
@ -560,7 +588,7 @@ public:
* @stable ICU 4.6
*/
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const;
getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
/**
* Gets the raw decomposition mapping of c.
@ -574,7 +602,7 @@ public:
* @stable ICU 49
*/
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
/**
* Performs pairwise composition of a & b and returns the composite if there is one.
@ -587,7 +615,7 @@ public:
* @stable ICU 49
*/
virtual UChar32
composePair(UChar32 a, UChar32 b) const;
composePair(UChar32 a, UChar32 b) const override;
/**
* Gets the combining class of c.
@ -598,7 +626,7 @@ public:
* @stable ICU 49
*/
virtual uint8_t
getCombiningClass(UChar32 c) const;
getCombiningClass(UChar32 c) const override;
/**
* Tests if the string is normalized.
@ -612,7 +640,7 @@ public:
* @stable ICU 4.4
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.
@ -625,7 +653,7 @@ public:
* @stable ICU 4.4
*/
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Returns the end of the normalized substring of the input string.
* For details see the Normalizer2 base class documentation.
@ -638,7 +666,7 @@ public:
* @stable ICU 4.4
*/
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
* Tests if the character always has a normalization boundary before it,
@ -648,7 +676,7 @@ public:
* @return TRUE if c has a normalization boundary before it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryBefore(UChar32 c) const;
virtual UBool hasBoundaryBefore(UChar32 c) const override;
/**
* Tests if the character always has a normalization boundary after it,
@ -658,7 +686,7 @@ public:
* @return TRUE if c has a normalization boundary after it
* @stable ICU 4.4
*/
virtual UBool hasBoundaryAfter(UChar32 c) const;
virtual UBool hasBoundaryAfter(UChar32 c) const override;
/**
* Tests if the character is normalization-inert.
@ -667,7 +695,7 @@ public:
* @return TRUE if c is normalization-inert
* @stable ICU 4.4
*/
virtual UBool isInert(UChar32 c) const;
virtual UBool isInert(UChar32 c) const override;
private:
UnicodeString &
normalize(const UnicodeString &src,
@ -675,6 +703,12 @@ private:
USetSpanCondition spanCondition,
UErrorCode &errorCode) const;
void
normalizeUTF8(uint32_t options, const char *src, int32_t length,
ByteSink &sink, Edits *edits,
USetSpanCondition spanCondition,
UErrorCode &errorCode) const;
UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,

View file

@ -182,6 +182,7 @@
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
// ustr_imp.h #define U_EDITS_NO_RESET 0x2000
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
#endif // __STRINGOPTIONS_H__

View file

@ -25,6 +25,11 @@
*/
#define _STRNCMP_STYLE 0x1000
/**
* Internal option for string transformation functions to not first reset the Edits object.
*/
#define U_EDITS_NO_RESET 0x2000
/**
* Compare two strings in code point order or code unit order.
* Works in strcmp style (both lengths -1),

View file

@ -1103,7 +1103,7 @@ ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
return 0;
}
if(edits!=NULL) {
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
edits->reset();
}
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR

View file

@ -746,7 +746,7 @@ uint16_t BackwardUTrie2StringIterator::previous16() {
codePointLimit=codePointStart;
if(start>=codePointStart) {
codePoint=U_SENTINEL;
return 0;
return trie->errorValue;
}
uint16_t result;
UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result);
@ -757,7 +757,7 @@ uint16_t ForwardUTrie2StringIterator::next16() {
codePointStart=codePointLimit;
if(codePointLimit==limit) {
codePoint=U_SENTINEL;
return 0;
return trie->errorValue;
}
uint16_t result;
UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -159,6 +159,7 @@ void addNormTest(TestNode** root)
}
static const char* const modeStrings[]={
"?",
"UNORM_NONE",
"UNORM_NFD",
"UNORM_NFKD",
@ -183,7 +184,7 @@ static void TestNormCases(UNormalizationMode mode,
length2= unorm_normalize(source, -1, mode, 0, NULL, 0, &status2);
if(neededLen!=length2) {
log_err("ERROR in unorm_normalize(%s)[%d]: "
"preflight length/NUL %d!=%d preflight length/srcLength\n",
"preflight length/srcLength %d!=%d preflight length/NUL\n",
modeStrings[mode], (int)x, (int)neededLen, (int)length2);
}
if(status==U_BUFFER_OVERFLOW_ERROR)
@ -192,14 +193,14 @@ static void TestNormCases(UNormalizationMode mode,
}
length2=unorm_normalize(source, u_strlen(source), mode, 0, result, UPRV_LENGTHOF(result), &status);
if(U_FAILURE(status) || neededLen!=length2) {
log_data_err("ERROR in unorm_normalize(%s/NUL) at %s: %s - (Are you missing data?)\n",
log_data_err("ERROR in unorm_normalize(%s/srcLength) at %s: %s - (Are you missing data?)\n",
modeStrings[mode], austrdup(source), myErrorName(status));
} else {
assertEqual(result, cases[x][expIndex], x);
}
length2=unorm_normalize(source, -1, mode, 0, result, UPRV_LENGTHOF(result), &status);
if(U_FAILURE(status) || neededLen!=length2) {
log_data_err("ERROR in unorm_normalize(%s/srcLength) at %s: %s - (Are you missing data?)\n",
log_data_err("ERROR in unorm_normalize(%s/NUL) at %s: %s - (Are you missing data?)\n",
modeStrings[mode], austrdup(source), myErrorName(status));
} else {
assertEqual(result, cases[x][expIndex], x);

View file

@ -406,11 +406,11 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
}
static const char *const kModeStrings[UNORM_MODE_COUNT] = {
"?", "D", "KD", "C", "KC", "FCD"
"?", "none", "D", "KD", "C", "KC", "FCD"
};
static const char *const kMessages[UNORM_MODE_COUNT] = {
"?!=?", "c3!=D(c%d)", "c5!=KC(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
"?!=?", "?!=?", "c3!=D(c%d)", "c5!=KC(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
};
UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t options,
@ -450,6 +450,7 @@ UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t opti
std::string exp8;
exp.toUTF8String(exp8);
std::string out8;
out8.reserve(exp8.length());
Edits edits;
Edits *editsPtr = (mode == UNORM_NFC || mode == UNORM_NFKC) ? &edits : nullptr;
StringByteSink<std::string> sink(&out8);

View file

@ -55,6 +55,9 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
#endif
TESTCASE_AUTO(TestFilteredNormalizer2Coverage);
TESTCASE_AUTO(TestNormalizeUTF8WithEdits);
TESTCASE_AUTO(TestLowMappingToEmpty_D);
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
TESTCASE_AUTO(TestNormalizeIllFormedText);
TESTCASE_AUTO_END;
}
@ -1374,6 +1377,10 @@ initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT], UErrorCode &errorC
delete [] combineBackCharsAndCc;
}
static const char *const kModeStrings[UNORM_MODE_COUNT] = {
"?", "none", "D", "KD", "C", "KC", "FCD"
};
void
BasicNormalizerTest::TestSkippable() {
UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
@ -1395,7 +1402,8 @@ BasicNormalizerTest::TestSkippable() {
for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
if(skipSets[i]!=expectSets[i]) {
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n", i, i);
const char *ms=kModeStrings[i];
errln("error: TestSkippable skipSets[%s]!=expectedSets[%s]\n", ms, ms);
// Note: This used to depend on hardcoded UnicodeSet patterns generated by
// Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
// running com.ibm.text.UCD.Main with the option NFSkippable.
@ -1531,7 +1539,7 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
return;
}
static const char *const src =
u8" AÄA\u0308A\u0308\u0323Ä\u0323,\u1100\u1161\u11A8\u3133 ";
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161\u11A8\u3133 ";
std::string expected = u8" aääạ\u0308\u0308,가각갃 ";
std::string result;
StringByteSink<std::string> sink(&result);
@ -1544,9 +1552,10 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
{ TRUE, 1, 1 }, // A→a
{ TRUE, 2, 2 }, // Ä→ä
{ TRUE, 3, 2 }, // A\u0308→ä
{ TRUE, 5, 5 }, // A\u0308\u0323→ạ\u0308
{ TRUE, 7, 5 }, // A\u0308\u00ad\u0323→ạ\u0308 removes the soft hyphen
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
{ FALSE, 1, 1 }, // comma
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
{ TRUE, 6, 3 }, // 가\u11A8→ 각
{ TRUE, 6, 3 }, // 가\u3133→ 갃
@ -1568,6 +1577,138 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
edits.getFineIterator(), edits.getFineIterator(),
expectedChanges, UPRV_LENGTHOF(expectedChanges),
TRUE, errorCode);
// With filter: The normalization code does not see the "A" substrings.
UnicodeSet filter(u"[^A]", errorCode);
FilteredNormalizer2 fn2(*nfkc_cf, filter);
expected = u8" AäA\u0308A\u0323\u0308\u0308,가각갃 ";
result.clear();
edits.reset();
fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8", errorCode.get());
assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
static const EditChange filteredChanges[] = {
{ FALSE, 3, 3 }, // 2 spaces + A
{ TRUE, 2, 2 }, // Ä→ä
{ FALSE, 4, 4 }, // A\u0308A
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
{ FALSE, 1, 1 }, // comma
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
{ TRUE, 6, 3 }, // 가\u11A8→ 각
{ TRUE, 6, 3 }, // 가\u3133→ 갃
{ FALSE, 2, 2 } // 2 spaces
};
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8",
edits.getFineIterator(), edits.getFineIterator(),
filteredChanges, UPRV_LENGTHOF(filteredChanges),
TRUE, errorCode);
// Omit unchanged text.
// Note that the result is not normalized because the inner normalizer
// does not see text across filter spans.
expected = u8"ä\u0323\u0308\u0308가각갃";
result.clear();
edits.reset();
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
edits.getFineIterator(), edits.getFineIterator(),
filteredChanges, UPRV_LENGTHOF(filteredChanges),
TRUE, errorCode);
}
void
BasicNormalizerTest::TestLowMappingToEmpty_D() {
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_D");
const Normalizer2 *n2 = Normalizer2::getInstance(
nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode);
if (errorCode.logDataIfFailureAndReset("Normalizer2::getInstance() call failed")) {
return;
}
checkLowMappingToEmpty(*n2);
UnicodeString sh(u'\u00AD');
assertFalse("soft hyphen is not normalized", n2->isNormalized(sh, errorCode));
UnicodeString result = n2->normalize(sh, errorCode);
assertTrue("soft hyphen normalizes to empty", result.isEmpty());
assertEquals("soft hyphen QC=No", UNORM_NO, n2->quickCheck(sh, errorCode));
assertEquals("soft hyphen spanQuickCheckYes", 0, n2->spanQuickCheckYes(sh, errorCode));
UnicodeString s(u"\u00ADÄ\u00AD\u0323");
result = n2->normalize(s, errorCode);
assertEquals("normalize string with soft hyphens", u"a\u0323\u0308", result);
}
void
BasicNormalizerTest::TestLowMappingToEmpty_FCD() {
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_FCD");
const Normalizer2 *n2 = Normalizer2::getInstance(
nullptr, "nfkc_cf", UNORM2_FCD, errorCode);
if (errorCode.logDataIfFailureAndReset("Normalizer2::getInstance() call failed")) {
return;
}
checkLowMappingToEmpty(*n2);
UnicodeString sh(u'\u00AD');
assertTrue("soft hyphen is FCD", n2->isNormalized(sh, errorCode));
UnicodeString s(u"\u00ADÄ\u00AD\u0323");
UnicodeString result = n2->normalize(s, errorCode);
assertEquals("normalize string with soft hyphens", u"\u00ADa\u0323\u0308", result);
}
void
BasicNormalizerTest::checkLowMappingToEmpty(const Normalizer2 &n2) {
UnicodeString mapping;
assertTrue("getDecomposition(soft hyphen)", n2.getDecomposition(0xad, mapping));
assertTrue("soft hyphen maps to empty", mapping.isEmpty());
assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad));
assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad));
assertFalse("soft hyphen is not inert", n2.isInert(0xad));
}
void
BasicNormalizerTest::TestNormalizeIllFormedText() {
IcuTestErrorCode errorCode(*this, "TestNormalizeIllFormedText");
const Normalizer2 *nfkc_cf = Normalizer2::getNFKCCasefoldInstance(errorCode);
if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
return;
}
// Normalization behavior for ill-formed text is not defined.
// ICU currently treats ill-formed sequences as normalization-inert
// and copies them unchanged.
UnicodeString src(u" A");
src.append((char16_t)0xD800).append(u"ÄA\u0308").append((char16_t)0xD900).
append(u"A\u0308\u00ad\u0323").append((char16_t)0xDBFF).
append(u"Ä\u0323,\u00ad").append((char16_t)0xDC00).
append(u"\u1100\u1161\u11A8\u3133 ").append((char16_t)0xDFFF);
UnicodeString expected(u" a");
expected.append((char16_t)0xD800).append(u"ää").append((char16_t)0xD900).
append(u"\u0308").append((char16_t)0xDBFF).
append(u"\u0308,").append((char16_t)0xDC00).
append(u"가각갃 ").append((char16_t)0xDFFF);
UnicodeString result = nfkc_cf->normalize(src, errorCode);
assertSuccess("normalize", errorCode.get());
assertEquals("normalize", expected, result);
std::string src8(u8" A");
src8.append("\x80").append(u8"ÄA\u0308").append("\xC0\x80").
append(u8"A\u0308\u00ad\u0323").append("\xED\xA0\x80").
append(u8"Ä\u0323,\u00ad").append("\xF4\x90\x80\x80").
append(u8"\u1100\u1161\u11A8\u3133 ").append("\xF0");
std::string expected8(u8" a");
expected8.append("\x80").append(u8"ää").append("\xC0\x80").
append(u8"\u0308").append("\xED\xA0\x80").
append(u8"\u0308,").append("\xF4\x90\x80\x80").
append(u8"가각갃 ").append("\xF0");
std::string result8;
StringByteSink<std::string> sink(&result8);
nfkc_cf->normalizeUTF8(0, src8, sink, nullptr, errorCode);
assertSuccess("normalizeUTF8", errorCode.get());
assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -48,6 +48,9 @@ public:
void TestCustomFCC();
void TestFilteredNormalizer2Coverage();
void TestNormalizeUTF8WithEdits();
void TestLowMappingToEmpty_D();
void TestLowMappingToEmpty_FCD();
void TestNormalizeIllFormedText();
private:
UnicodeString canonTests[24][3];
@ -83,6 +86,7 @@ private:
static UnicodeString hex(UChar ch);
static UnicodeString hex(const UnicodeString& str);
void checkLowMappingToEmpty(const Normalizer2 &n2);
};
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -26,19 +26,20 @@ U_NAMESPACE_BEGIN
ExtraData::ExtraData(Norms &n, UBool fast) :
Norms::Enumerator(n),
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul, 1=start of normal data
optimizeFast(fast) {}
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data
yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data
optimizeFast(fast) {
// Hangul LV algorithmically decomposes to two Jamo.
// Some code may harmlessly read this firstUnit.
yesNoMappingsAndCompositions.setCharAt(0, 2);
// Hangul LVT algorithmically decomposes to three Jamo.
// Some code may harmlessly read this firstUnit.
yesNoMappingsOnly.setCharAt(0, 3);
}
int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
UnicodeString &m=*norm.mapping;
int32_t length=m.length();
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
fprintf(stderr,
"gennorm2 error: "
"mapping for U+%04lX longer than maximum of %d\n",
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
exit(U_INVALID_FORMAT_ERROR);
}
// Write the mapping & raw mapping extraData.
int32_t firstUnit=length|(norm.trailCC<<8);
int32_t preMappingLength=0;
@ -81,9 +82,6 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
++preMappingLength;
firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
}
if(norm.hasNoCompBoundaryAfter) {
firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
}
dataString.append((UChar)firstUnit);
dataString.append(m);
return preMappingLength;
@ -109,6 +107,22 @@ int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
return offset;
}
/**
 * Tries to replace the stored mapping of c with a compact, algorithmic
 * encoding: a signed delta from c to a single mapping code point.
 *
 * The delta encoding is used only if all of the following hold:
 * - norm has a single-code point mapping (norm.mappingCP>=0),
 * - the mapping does not go from ASCII to non-ASCII,
 * - the type of the mapping target's Norm is below Norm::NO_NO_COMP_YES,
 * - the delta lies within +-Normalizer2Impl::MAX_DELTA.
 *
 * @param c the code point whose data is being written
 * @param norm the data for c; on success its type becomes Norm::NO_NO_DELTA
 *             and its offset is set to the delta, otherwise it is not modified
 * @return TRUE if the delta encoding was set
 */
UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
    const UChar32 target=norm.mappingCP;
    if(target<0) {
        return FALSE;  // not a mapping to exactly one code point
    }
    if(c<=0x7f && target>0x7f) {
        return FALSE;  // do not map from ASCII to non-ASCII
    }
    if(norms.getNormRef(target).type>=Norm::NO_NO_COMP_YES) {
        return FALSE;  // the target's own type rules out the delta encoding
    }
    const int32_t delta=target-c;
    if(delta<-Normalizer2Impl::MAX_DELTA || Normalizer2Impl::MAX_DELTA<delta) {
        return FALSE;  // too far away for the delta encoding
    }
    norm.type=Norm::NO_NO_DELTA;
    norm.offset=delta;
    return TRUE;
}
void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
if(norm.cc!=0) {
fprintf(stderr,
@ -189,29 +203,27 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
norm.offset=yesNoMappingsOnly.length()+
writeMapping(c, norm, yesNoMappingsOnly);
break;
case Norm::NO_NO:
if(norm.cc==0 && !optimizeFast) {
// Try a compact, algorithmic encoding.
// Only for ccc=0, because we can't store additional information
// and we do not recursively follow an algorithmic encoding for access to the ccc.
//
// Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
// if the mappingCP decomposes further, to ensure that there is a place to store it.
// We want to see that the final mapping does not have exactly 1 code point,
// or else we would have to recursively ensure that the final mapping is stored
// in normal extraData.
if(norm.mappingCP>=0 &&
(!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
int32_t delta=norm.mappingCP-c;
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
norm.type=Norm::NO_NO_DELTA;
norm.offset=delta;
break;
}
}
case Norm::NO_NO_COMP_YES:
if(!optimizeFast && setNoNoDelta(c, norm)) {
break;
}
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
norm.offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
break;
case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
if(!optimizeFast && setNoNoDelta(c, norm)) {
break;
}
norm.offset=writeNoNoMapping(
c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
break;
case Norm::NO_NO_COMP_NO_MAYBE_CC:
norm.offset=writeNoNoMapping(
c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
break;
case Norm::NO_NO_EMPTY:
// There can be multiple extra data entries for mappings to the empty string
// if they have different raw mappings.
norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
break;
case Norm::MAYBE_YES_COMBINES_FWD:
norm.offset=maybeYesCompositions.length();

View file

@ -36,7 +36,10 @@ public:
UnicodeString yesYesCompositions;
UnicodeString yesNoMappingsAndCompositions;
UnicodeString yesNoMappingsOnly;
UnicodeString noNoMappings;
UnicodeString noNoMappingsCompYes;
UnicodeString noNoMappingsCompBoundaryBefore;
UnicodeString noNoMappingsCompNoMaybeCC;
UnicodeString noNoMappingsEmpty;
private:
/**
@ -48,12 +51,16 @@ private:
int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString);
int32_t writeNoNoMapping(UChar32 c, const Norm &norm,
UnicodeString &dataString, Hashtable &previousMappings);
UBool setNoNoDelta(UChar32 c, Norm &norm) const;
/** Requires norm.compositions!=nullptr. */
void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString);
void writeExtraData(UChar32 c, Norm &norm);
UBool optimizeFast;
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
Hashtable previousNoNoMappingsCompYes; // If constructed in runtime code, pass in UErrorCode.
Hashtable previousNoNoMappingsCompBoundaryBefore;
Hashtable previousNoNoMappingsCompNoMaybeCC;
Hashtable previousNoNoMappingsEmpty;
};
U_NAMESPACE_END

View file

@ -56,8 +56,8 @@ static UDataInfo dataInfo={
0,
{ 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
{ 2, 0, 0, 0 }, /* formatVersion */
{ 5, 2, 0, 0 } /* dataVersion (Unicode version) */
{ 3, 0, 0, 0 }, /* formatVersion */
{ 10, 0, 0, 0 } /* dataVersion (Unicode version) */
};
U_NAMESPACE_BEGIN
@ -65,8 +65,7 @@ U_NAMESPACE_BEGIN
class HangulIterator {
public:
struct Range {
UChar32 start, limit;
uint16_t norm16;
UChar32 start, end;
};
HangulIterator() : rangeIndex(0) {}
@ -77,18 +76,17 @@ public:
return NULL;
}
}
void reset() { rangeIndex=0; }
private:
static const Range ranges[4];
int32_t rangeIndex;
};
const HangulIterator::Range HangulIterator::ranges[4]={
{ Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
{ Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
{ Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
{ Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
// JAMO_T_BASE+1: not U+11A7
{ Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
{ Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo
{ Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
{ Hangul::HANGUL_BASE, Hangul::HANGUL_END },
};
Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
@ -200,58 +198,109 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
}
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
Norm *p=checkNormForMapping(norms.getNorm(c), c);
if(p!=NULL) {
p->mappingType=Norm::REMOVED;
}
// createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
Norm *p=checkNormForMapping(norms.createNorm(c), c);
p->mappingType=Norm::REMOVED;
}
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
if(buffer.isEmpty()) {
return TRUE; // Maps-to-empty-string is no boundary of any kind.
return FALSE; // Maps-to-empty-string is no boundary of any kind.
}
int32_t lastStarterIndex=buffer.lastStarterIndex();
if(lastStarterIndex<0) {
return TRUE; // no starter
return FALSE; // no starter
}
UChar32 starter=buffer.charAt(lastStarterIndex);
if(lastStarterIndex==0 && norms.combinesBack(starter)) {
// The last starter is at the beginning of the mapping and combines backward.
return FALSE;
}
if(Hangul::isJamoL(starter) ||
(Hangul::isJamoV(starter) &&
0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
// otherwise it is blocked.
return lastStarterIndex==buffer.length()-1;
return lastStarterIndex!=buffer.length()-1;
}
// Note: There can be no Hangul syllable in the fully decomposed mapping.
const Norm *starterNorm=norms.getNorm(starter);
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
return FALSE; // The last starter does not combine forward.
// Multiple starters can combine into one.
// Look for the first of the last sequence of starters, excluding Jamos.
int32_t i=lastStarterIndex;
UChar32 c;
while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
starter=c;
--i;
}
// Compose as far as possible, and see if further compositions with
// characters following this mapping are possible.
const Norm *starterNorm=norms.getNorm(starter);
if(i==lastStarterIndex &&
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
return TRUE; // The last starter does not combine forward.
}
// Compose as far as possible, and see if further compositions are possible.
uint8_t prevCC=0;
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter
if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
while(++i<buffer.length()) {
uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
// The starter combines with a mark that reorders before the current one.
return TRUE;
return FALSE;
}
if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
// The starter combines with this mark into a composite replacement starter.
UChar32 c=buffer.charAt(i);
if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
// The starter combines with c into a composite replacement starter.
starterNorm=norms.getNorm(starter);
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
return FALSE; // The composite does not combine further.
if(i>=lastStarterIndex &&
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
return TRUE; // The composite does not combine further.
}
// Keep prevCC because we "removed" the combining mark.
} else if(cc==0) {
starterNorm=norms.getNorm(c);
if(i==lastStarterIndex &&
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
return TRUE; // The new starter does not combine forward.
}
prevCC=0;
} else {
prevCC=cc;
}
}
if(prevCC==0) {
return TRUE; // forward-combining starter at the very end
return FALSE; // forward-combining starter at the very end
}
if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
// The starter combines with another mark.
return TRUE;
return FALSE;
}
return TRUE;
}
/**
 * Tests whether anything in the mapping held by the buffer would compose.
 *
 * Walks the buffer from the start and returns TRUE as soon as it finds
 * either
 * - a character that combines back with the most recent starter's
 *   compositions and is not blocked (the previous combining class is lower
 *   than the current one, or the previous character was a starter), or
 * - a Hangul Jamo L immediately followed by a Jamo V.
 *
 * @param buffer the mapping with per-character combining classes
 *               (presumably already in canonical order -- confirm with caller)
 * @return TRUE if some pair in the mapping composes; FALSE otherwise,
 *         including when the buffer contains no starter at all
 */
UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
    if(buffer.lastStarterIndex()<0) {
        return FALSE;  // no starter
    }
    const int32_t length=buffer.length();
    const Norm *lastStarterNorm=nullptr;  // data for the most recent non-Jamo-L starter
    uint8_t previousCC=0;
    for(int32_t index=0; index<length; ++index) {
        const UChar32 cp=buffer.charAt(index);
        const uint8_t cc=buffer.ccAt(index);
        const bool notBlocked= previousCC<cc || previousCC==0;
        if(lastStarterNorm!=nullptr && notBlocked &&
                norms.getNormRef(cp).combinesBack && lastStarterNorm->combine(cp)>=0) {
            return TRUE;  // normal composite
        }
        if(cc==0) {
            // A new starter replaces the previous one.
            if(!Hangul::isJamoL(cp)) {
                lastStarterNorm=norms.getNorm(cp);
            } else if((index+1)<length && Hangul::isJamoV(buffer.charAt(index+1))) {
                return TRUE;  // Hangul syllable
            } else {
                // A Jamo L has no composition list to combine with.
                lastStarterNorm=nullptr;
            }
        }
        previousCC=cc;
    }
    return FALSE;
}
@ -264,6 +313,10 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
// Therefore, we cannot compute algorithmic mapping deltas here.
// Error conditions are checked, but printed later when we do know the offending code point.
if(norm.hasMapping()) {
if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
norm.error="mapping longer than maximum of 31";
return;
}
// Ensure canonical order.
BuilderReorderingBuffer buffer;
if(norm.rawMapping!=nullptr) {
@ -272,20 +325,20 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
}
norms.reorder(*norm.mapping, buffer);
if(buffer.isEmpty()) {
norm.leadCC=norm.trailCC=0;
// A character that is deleted (maps to an empty string) must
// get the worst-case lccc and tccc values because arbitrary
// characters on both sides will become adjacent.
norm.leadCC=1;
norm.trailCC=0xff;
} else {
norm.leadCC=buffer.ccAt(0);
norm.trailCC=buffer.ccAt(buffer.length()-1);
}
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
// in Normalizer2Impl::hasCompBoundaryAfter().
// For details see the comments on hasNoCompBoundaryAfter(buffer).
if(norm.compositions!=nullptr) {
norm.hasNoCompBoundaryAfter=TRUE;
} else {
norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
}
norm.hasCompBoundaryBefore=
!buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
norm.hasCompBoundaryAfter=
norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer);
if(norm.combinesBack) {
norm.error="combines-back and decomposes, not possible in Unicode normalization";
@ -299,13 +352,25 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
if(norm.compositions!=NULL) {
norm.error="combines-forward and has a one-way mapping, "
"not possible in Unicode normalization";
} else if(buffer.isEmpty()) {
norm.type=Norm::NO_NO_EMPTY;
} else if(!norm.hasCompBoundaryBefore) {
norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
} else if(mappingRecomposes(buffer)) {
norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
} else {
norm.type=Norm::NO_NO;
// The mapping is comp-normalized.
norm.type=Norm::NO_NO_COMP_YES;
}
}
} else { // no mapping
norm.leadCC=norm.trailCC=norm.cc;
norm.hasCompBoundaryBefore=
norm.cc==0 && !norm.combinesBack;
norm.hasCompBoundaryAfter=
norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
if(norm.combinesBack) {
if(norm.compositions!=nullptr) {
// Earlier code checked ccc=0.
@ -339,13 +404,6 @@ void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
}
void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
if(start<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || norm.leadCC!=0)) {
fprintf(stderr,
"gennorm2 error: "
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
(long)start);
exit(U_INVALID_FORMAT_ERROR);
}
if((norm.leadCC|norm.trailCC)!=0) {
for(UChar32 c=start; c<=end; ++c) {
setSmallFCD(c);
@ -355,37 +413,60 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm)
int32_t norm16;
switch(norm.type) {
case Norm::INERT:
norm16=0;
norm16=Normalizer2Impl::INERT;
break;
case Norm::YES_YES_COMBINES_FWD:
norm16=norm.offset;
norm16=norm.offset*2;
break;
case Norm::YES_NO_COMBINES_FWD:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset;
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
break;
case Norm::YES_NO_MAPPING_ONLY:
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset;
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
break;
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
case Norm::NO_NO:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset;
case Norm::NO_NO_COMP_YES:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
break;
case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
break;
case Norm::NO_NO_COMP_NO_MAYBE_CC:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
break;
case Norm::NO_NO_EMPTY:
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
break;
case Norm::NO_NO_DELTA:
norm16=getCenterNoNoDelta()+norm.offset;
break;
{
// Positive offset from minNoNoDelta, shifted left for additional bits.
int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
if(norm.trailCC==0) {
// DELTA_TCCC_0==0
} else if(norm.trailCC==1) {
offset|=Normalizer2Impl::DELTA_TCCC_1;
} else {
offset|=Normalizer2Impl::DELTA_TCCC_GT_1;
}
norm16=getMinNoNoDelta()+offset;
break;
}
case Norm::MAYBE_YES_COMBINES_FWD:
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset;
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
break;
case Norm::MAYBE_YES_SIMPLE:
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc; // ccc=0..255
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
break;
case Norm::YES_YES_WITH_CC:
U_ASSERT(norm.cc!=0);
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc; // ccc=1..255
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
break;
default: // Should not occur.
exit(U_INTERNAL_PROGRAM_ERROR);
}
U_ASSERT((norm16&1)==0);
if(norm.hasCompBoundaryAfter) {
norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
}
IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
@ -396,10 +477,13 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm)
if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
}
UBool isCompNoMaybe= norm.type>=Norm::NO_NO;
UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
}
if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
}
}
void Normalizer2DataBuilder::setHangulData() {
@ -407,8 +491,8 @@ void Normalizer2DataBuilder::setHangulData() {
const HangulIterator::Range *range;
// Check that none of the Hangul/Jamo code points have data.
while((range=hi.nextRange())!=NULL) {
for(UChar32 c=range->start; c<range->limit; ++c) {
if(utrie2_get32(norm16Trie, c)!=0) {
for(UChar32 c=range->start; c<=range->end; ++c) {
if(utrie2_get32(norm16Trie, c)>Normalizer2Impl::INERT) {
fprintf(stderr,
"gennorm2 error: "
"illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
@ -419,32 +503,62 @@ void Normalizer2DataBuilder::setHangulData() {
}
// Set data for algorithmic runtime handling.
IcuToolErrorCode errorCode("gennorm2/setHangulData()");
hi.reset();
while((range=hi.nextRange())!=NULL) {
uint16_t norm16=range->norm16;
if(norm16==0) {
norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo
if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
}
} else {
if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
}
}
utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
errorCode.assertSuccess();
// Jamo V/T are maybeYes
if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
}
utrie2_setRange32(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
Normalizer2Impl::JAMO_L, TRUE, errorCode);
utrie2_setRange32(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
Normalizer2Impl::JAMO_VT, TRUE, errorCode);
// JAMO_T_BASE+1: not U+11A7
utrie2_setRange32(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
Normalizer2Impl::JAMO_VT, TRUE, errorCode);
// Hangul LV encoded as minYesNo
uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
// Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
}
// Set the first LV, then write all other Hangul syllables as LVT,
// then overwrite the remaining LV.
// The UTrie2 should be able to compact this into 7 32-item blocks
// because JAMO_T_COUNT is 28 and the UTrie2 granularity is 4.
// (7*32=8*28 smallest common multiple)
utrie2_set32(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
utrie2_setRange32(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END,
lvt, TRUE, errorCode);
UChar32 c=Hangul::HANGUL_BASE;
while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
utrie2_set32(norm16Trie, c, lv, errorCode);
}
errorCode.assertSuccess();
}
namespace {
// Summary of the norm16 values visited by enumRangeMaxValue(),
// which receives a pointer to this struct as its context.
struct Norm16Summary {
// Largest norm16 value seen so far.
uint32_t maxNorm16;
// ANDing values yields 0 bits where any value has a 0.
// Used for worst-case HAS_COMP_BOUNDARY_AFTER.
uint32_t andedNorm16;
};
} // namespace
U_CDECL_BEGIN
static UBool U_CALLCONV
enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
uint32_t *pMaxValue=(uint32_t *)context;
if(value>*pMaxValue) {
*pMaxValue=value;
Norm16Summary *p=(Norm16Summary *)context;
if(value>p->maxNorm16) {
p->maxNorm16=value;
}
p->andedNorm16&=value;
return TRUE;
}
@ -452,7 +566,7 @@ U_CDECL_END
void Normalizer2DataBuilder::processData() {
IcuToolErrorCode errorCode("gennorm2/processData()");
norm16Trie=utrie2_open(0, 0, errorCode);
norm16Trie=utrie2_open(Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
errorCode.assertSuccess();
// Build composition lists before recursive decomposition,
@ -479,26 +593,37 @@ void Normalizer2DataBuilder::processData() {
norms.enumRanges(extra);
extraData=extra.yesYesCompositions;
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length();
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
extraData.append(extra.yesNoMappingsAndCompositions);
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length();
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
extraData.append(extra.yesNoMappingsOnly);
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length();
extraData.append(extra.noNoMappings);
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length();
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
extraData.append(extra.noNoMappingsCompYes);
indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
extraData.append(extra.noNoMappingsCompBoundaryBefore);
indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
extraData.append(extra.noNoMappingsCompNoMaybeCC);
indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
extraData.append(extra.noNoMappingsEmpty);
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
// Pad the maybeYesCompositions length to a multiple of 4,
// so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
while(extra.maybeYesCompositions.length()&3) {
extra.maybeYesCompositions.append((UChar)0);
}
extraData.insert(0, extra.maybeYesCompositions);
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
extra.maybeYesCompositions.length();
extra.maybeYesCompositions.length()*2;
// Pad to even length for 4-byte alignment of following data.
if(extraData.length()&1) {
extraData.append((UChar)0);
}
int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
int32_t minNoNoDelta=getMinNoNoDelta();
U_ASSERT((minNoNoDelta&7)==0);
if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
fprintf(stderr,
"gennorm2 error: "
@ -509,6 +634,7 @@ void Normalizer2DataBuilder::processData() {
// writeNorm16() and setHangulData() reduce these as needed.
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
// Map each code point to its norm16 value,
// including the properties that fit directly,
@ -529,17 +655,21 @@ void Normalizer2DataBuilder::processData() {
// inner loops if necessary.
// However, that seems like overkill for an optimization for supplementary characters.
for(UChar lead=0xd800; lead<0xdc00; ++lead) {
uint32_t maxValue=utrie2_get32(norm16Trie, lead);
utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
) {
uint32_t surrogateCPNorm16=utrie2_get32(norm16Trie, lead);
Norm16Summary summary={ surrogateCPNorm16, surrogateCPNorm16 };
utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &summary);
uint32_t norm16=summary.maxNorm16;
if(norm16>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
norm16>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]) {
// Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
// Otherwise it might end up at something like JAMO_VT which stays in
// the inner decomposition quick check loop.
maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
norm16=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
}
utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
norm16=
(norm16&~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
(summary.andedNorm16&Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, norm16, errorCode);
}
// Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
@ -554,6 +684,10 @@ void Normalizer2DataBuilder::processData() {
if(minCP>=0x10000) {
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
}
minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
if(minCP>=0x10000) {
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
}
utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
@ -583,11 +717,15 @@ void Normalizer2DataBuilder::processData() {
printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
}

View file

@ -73,22 +73,21 @@ private:
Norm *checkNormForMapping(Norm *p, UChar32 c); // check for permitted overrides
/**
* Computes the MAPPING_NO_COMP_BOUNDARY_AFTER flag for a character's mapping
* (especially for a "YesNo" which has a round-trip mapping).
* This flag is used in Normalizer2Impl::hasCompBoundaryAfter().
*
* A starter character with a mapping does not have a composition boundary after it
* if the character itself combines-forward (which is tested by the caller of this function),
* or it is deleted (mapped to the empty string),
* or its mapping contains no starter,
* or the last starter combines-forward.
*/
UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer);
UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const;
/** Returns TRUE if the mapping by itself recomposes, that is, it is not comp-normalized. */
UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const;
void postProcess(Norm &norm);
void setSmallFCD(UChar32 c);
int32_t getCenterNoNoDelta() {
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
int32_t getMinNoNoDelta() const {
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-
((2*Normalizer2Impl::MAX_DELTA+1)<<Normalizer2Impl::DELTA_SHIFT);
}
void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
void setHangulData();

View file

@ -99,6 +99,14 @@ Norm *Norms::getNorm(UChar32 c) {
return norms+i;
}
const Norm *Norms::getNorm(UChar32 c) const {
    // The trie stores, per code point, an index into norms; 0 yields nullptr.
    const uint32_t normIndex=utrie2_get32(normTrie, c);
    return normIndex!=0 ? norms+normIndex : nullptr;
}
const Norm &Norms::getNormRef(UChar32 c) const {
    // No special case for index 0: whatever the trie yields is used
    // directly as an index into norms.
    const uint32_t normIndex=utrie2_get32(normTrie, c);
    return norms[normIndex];
}
@ -118,9 +126,7 @@ Norm *Norms::createNorm(UChar32 c) {
void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
int32_t length=mapping.length();
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
return; // writeMapping() will complain about it and print the code point.
}
U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
const char16_t *s=mapping.getBuffer();
int32_t i=0;
UChar32 c;

View file

@ -89,7 +89,7 @@ struct Norm {
UVector32 *compositions; // (trail, composite) pairs
uint8_t cc, leadCC, trailCC;
UBool combinesBack;
UBool hasNoCompBoundaryAfter;
UBool hasCompBoundaryBefore, hasCompBoundaryAfter;
/**
* Overall type of normalization properties.
@ -112,9 +112,14 @@ struct Norm {
YES_NO_COMBINES_FWD,
/** Starter with a round-trip mapping but no compositions. */
YES_NO_MAPPING_ONLY,
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
/** Has a one-way mapping. */
NO_NO,
/** Has a one-way mapping which is comp-normalized. */
NO_NO_COMP_YES,
/** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */
NO_NO_COMP_BOUNDARY_BEFORE,
/** Has a one-way mapping which does not have a comp boundary before. */
NO_NO_COMP_NO_MAYBE_CC,
/** Has a one-way mapping to the empty string. */
NO_NO_EMPTY,
/** Has an algorithmic one-way mapping to a single code point. */
NO_NO_DELTA,
/**
@ -149,11 +154,15 @@ public:
Norm *allocNorm();
/** Returns an existing Norm unit, or nullptr if c has no data. */
Norm *getNorm(UChar32 c);
const Norm *getNorm(UChar32 c) const;
/** Returns a Norm unit, creating a new one if necessary. */
Norm *createNorm(UChar32 c);
/** Returns an existing Norm unit, or an immutable empty object if c has no data. */
const Norm &getNormRef(UChar32 c) const;
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
UBool combinesBack(UChar32 c) const {
    // Jamo V/T are recognized via the Hangul helpers; only for other
    // code points is the stored Norm data consulted.
    const UBool isJamoVT=Hangul::isJamoV(c) || Hangul::isJamoT(c);
    return isJamoVT || getNormRef(c).combinesBack;
}
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;

View file

@ -185,9 +185,9 @@ public final class Norm2AllModes {
return impl.isDecompYes(impl.getNorm16(c)) ? 1 : 0;
}
@Override
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundary(c, true); }
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); }
@Override
public boolean hasBoundaryAfter(int c) { return impl.hasDecompBoundary(c, false); }
public boolean hasBoundaryAfter(int c) { return impl.hasDecompBoundaryAfter(c); }
@Override
public boolean isInert(int c) { return impl.isDecompInert(c); }
}
@ -238,11 +238,11 @@ public final class Norm2AllModes {
public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); }
@Override
public boolean hasBoundaryAfter(int c) {
return impl.hasCompBoundaryAfter(c, onlyContiguous, false);
return impl.hasCompBoundaryAfter(c, onlyContiguous);
}
@Override
public boolean isInert(int c) {
return impl.hasCompBoundaryAfter(c, onlyContiguous, true);
return impl.isCompInert(c, onlyContiguous);
}
private final boolean onlyContiguous;

View file

@ -223,8 +223,10 @@ public abstract class Normalizer2 {
if(spanLength==src.length()) {
return (String)src;
}
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
if (spanLength != 0) {
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
}
}
return normalize(src, new StringBuilder(src.length())).toString();
}

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a9b7099447b42325988ae448908ba0e690cf9c8259667c49019cc15d3a0fb760
size 12224152
oid sha256:d4b1866a85ceb079d912a3283e5ec6a7d6988df8c0e56e98fd67def82c35dcf3
size 12225515

View file

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a7bc00733ee580f117cfecc6c1790df5c495dea56dddd2472f6253c4baafd664
size 812715
oid sha256:fd856769e94b963fb8a0b63148c63349198ef0c0ec3729173170ccbfd94c4999
size 812769

View file

@ -183,8 +183,8 @@ public class BasicTest extends TestFmwk {
@Test
public void TestCanonCompose() throws Exception{
Normalizer norm = new Normalizer("", Normalizer.NFC,0);
iterateTest(norm, canonTests, 2);
staticTest(Normalizer.NFC, canonTests, 2);
iterateTest(norm, canonTests, 2);
composeTest(Normalizer.NFC, canonTests, 2);
}
@ -2412,6 +2412,10 @@ public class BasicTest extends TestFmwk {
return skipSets;
}
private static String[] kModeStrings = {
"D", "C", "KD", "KC"
};
@Test
public void TestSkippable() {
UnicodeSet[] skipSets = new UnicodeSet[] {
@ -2440,7 +2444,8 @@ public class BasicTest extends TestFmwk {
}
for(int i=0; i<expectSets.length; ++i) {
if(!skipSets[i].equals(expectSets[i])) {
errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n");
String ms = kModeStrings[i];
errln("error: TestSkippable skipSets["+ms+"]!=expectedSets["+ms+"]\n");
// Note: This used to depend on hardcoded UnicodeSet patterns generated by
// Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
// running com.ibm.text.UCD.Main with the option NFSkippable.
@ -2797,6 +2802,58 @@ public class BasicTest extends TestFmwk {
" \u1E09", out);
}
@Test
public void TestLowMappingToEmpty_D() {
    // Decomposition with the nfkc_cf data: U+00AD (soft hyphen) is expected
    // to map to the empty string.
    Normalizer2 decomposer = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.DECOMPOSE);
    checkLowMappingToEmpty(decomposer);

    // A lone soft hyphen.
    String softHyphen = "\u00AD";
    assertFalse("soft hyphen is not normalized", decomposer.isNormalized(softHyphen));
    String normalized = decomposer.normalize(softHyphen);
    assertTrue("soft hyphen normalizes to empty", normalized.isEmpty());
    assertEquals("soft hyphen QC=No", Normalizer.NO, decomposer.quickCheck(softHyphen));
    assertEquals("soft hyphen spanQuickCheckYes", 0, decomposer.spanQuickCheckYes(softHyphen));

    // Soft hyphens interleaved with a letter and a combining mark.
    String withSoftHyphens = "\u00ADÄ\u00AD\u0323";
    normalized = decomposer.normalize(withSoftHyphens);
    assertEquals("normalize string with soft hyphens", "a\u0323\u0308", normalized);
}
@Test
public void TestLowMappingToEmpty_FCD() {
    // FCD mode with the nfkc_cf data: the soft hyphen itself is expected
    // to pass the FCD check and to survive normalization.
    Normalizer2 fcd = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.FCD);
    checkLowMappingToEmpty(fcd);

    String softHyphen = "\u00AD";
    assertTrue("soft hyphen is FCD", fcd.isNormalized(softHyphen));

    String withSoftHyphens = "\u00ADÄ\u00AD\u0323";
    String normalized = fcd.normalize(withSoftHyphens);
    assertEquals("normalize string with soft hyphens", "\u00ADa\u0323\u0308", normalized);
}
private void checkLowMappingToEmpty(Normalizer2 n2) {
    // Shared checks for U+00AD (soft hyphen) under the given normalizer:
    // it must decompose to "", and must be neither a boundary nor inert.
    final int softHyphen = 0xad;
    String decomposition = n2.getDecomposition(softHyphen);
    assertNotNull("getDecomposition(soft hyphen)", decomposition);
    assertTrue("soft hyphen maps to empty", decomposition.isEmpty());
    assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(softHyphen));
    assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(softHyphen));
    assertFalse("soft hyphen is not inert", n2.isInert(softHyphen));
}
@Test
public void TestNormalizeIllFormedText() {
    // Normalization behavior for ill-formed text is not defined.
    // ICU currently treats ill-formed sequences as normalization-inert
    // and copies them unchanged.
    Normalizer2 normalizer = Normalizer2.getNFKCCasefoldInstance();
    String illFormed = " A\uD800ÄA\u0308\uD900A\u0308\u00ad\u0323\uDBFFÄ\u0323," +
            "\u00ad\uDC00\u1100\u1161가\u11A8가\u3133 \uDFFF";
    String expected = " a\uD800ää\uD900ạ\u0308\uDBFFạ\u0308,\uDC00가각갃 \uDFFF";
    String actual = normalizer.normalize(illFormed);
    assertEquals("normalize", expected, actual);
}
@Test
public void TestNFC() {
// Coverage tests.
@ -2848,18 +2905,6 @@ public class BasicTest extends TestFmwk {
assertTrue("noop.isInert()", noop.isInert(0x0308));
}
/*
* This unit test covers two 'get' methods in class Normalizer2Impl. It only tests that
* an object is returned.
*/
@Test
public void TestGetsFromImpl() {
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
assertNotEquals("getNormTrie() returns null", null, nfcImpl.getNormTrie());
assertNotEquals("getFCD16FromBelow180() returns null", null,
nfcImpl.getFCD16FromBelow180(0));
}
/*
* Abstract class Normalizer2 has non-abstract methods which are overridden by
* its derived classes. To test these methods a derived class is defined here.

View file

@ -36,7 +36,7 @@ public class ConformanceTest extends TestFmwk {
static String[] moreCases ={
// Markus 2001aug30
"0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
// Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
"0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
};
@ -54,7 +54,7 @@ public class ConformanceTest extends TestFmwk {
public void TestConformance_3_2() throws Exception{
runConformance("unicode/NormalizationTest-3.2.0.txt",Normalizer.UNICODE_3_2);
}
public void runConformance(String fileName, int options) throws Exception{
String line = null;
String[] fields = new String[5];
@ -88,10 +88,10 @@ public class ConformanceTest extends TestFmwk {
// Parse out the fields
hexsplit(line, ';', fields, buf);
// Remove a single code point from the "other" UnicodeSet
if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
c=UTF16.charAt(fields[0],0);
c=UTF16.charAt(fields[0],0);
if(0xac20<=c && c<=0xd73f) {
// not an exhaustive test run: skip most Hangul syllables
if(c==0xac20) {
@ -132,7 +132,7 @@ public class ConformanceTest extends TestFmwk {
logln("Total: " + passCount + " lines passed");
}
}
/**
* Verify the conformance of the given line of the Unicode
* normalization (UTR 15) test suite file. For each line,
@ -154,74 +154,16 @@ public class ConformanceTest extends TestFmwk {
String out,fcd;
int i=0;
for (i=0; i<5; ++i) {
int fieldNum = i+1;
if (i<3) {
out = Normalizer.normalize(field[i], Normalizer.NFC, options);
pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFC, buf, +1,options);
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFC, buf, -1,options);
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, +1,options);
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, -1,options);
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.NFD);
pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFD, buf, +1,options);
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFD, buf, -1,options);
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, +1,options);
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, -1,options);
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
cross(field[2] /*NFD String*/, field[1]/*NFC String*/, Normalizer.NFC);
cross(field[1] /*NFC String*/, field[2]/*NFD String*/, Normalizer.NFD);
pass &= checkNorm(Normalizer.NFC, options, field[i], field[1], fieldNum);
pass &= checkNorm(Normalizer.NFD, options, field[i], field[2], fieldNum);
}
out = Normalizer.normalize(field[i], Normalizer.NFKC,options);
pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1,options);
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1,options);
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, +1,options);
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, -1,options);
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
out = Normalizer.normalize(field[i], Normalizer.NFKD,options);
pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1,options);
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1,options);
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, +1,options);
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, -1,options);
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
pass &= checkNorm(Normalizer.NFKC, options, field[i], field[3], fieldNum);
pass &= checkNorm(Normalizer.NFKD, options, field[i], field[4], fieldNum);
cross(field[4] /*NFKD String*/, field[3]/*NFKC String*/, Normalizer.NFKC);
cross(field[3] /*NFKC String*/, field[4]/*NFKD String*/, Normalizer.NFKD);
}
compare(field[1],field[2]);
compare(field[0],field[1]);
@ -243,7 +185,7 @@ public class ConformanceTest extends TestFmwk {
errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
pass = false;
}
if(!Normalizer.isNormalized(field[1], Normalizer.NFC, options)) {
errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
pass = false;
@ -298,24 +240,24 @@ public class ConformanceTest extends TestFmwk {
errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
pass = false;
}
out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1,options);
out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1,options);
out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1,options);
out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1,options);
out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1,options);
out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1,options);
out=Normalizer.normalize(fcd, Normalizer.NFD);
if(!out.equals(field[2])) {
errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
pass = false;
}
}
if (!pass) {
errln("FAIL: " + line);
}
}
if(field[0]!=field[2]) {
// two strings that are canonically equivalent must test
// equal under a canonical caseless match
@ -327,9 +269,57 @@ public class ConformanceTest extends TestFmwk {
pass=false;
}
}
return pass;
}
private static int getModeNumber(Normalizer.Mode mode) {
    // Position of the mode in this table is its number: NFD=0, NFKD=1, NFC=2, NFKC=3.
    // Modes are compared by identity; anything else yields -1.
    final Normalizer.Mode[] knownModes = {
        Normalizer.NFD, Normalizer.NFKD, Normalizer.NFC, Normalizer.NFKC
    };
    for (int number = 0; number < knownModes.length; ++number) {
        if (mode == knownModes[number]) {
            return number;
        }
    }
    return -1;
}
private static final String[] kModeStrings = {
"D", "KD", "C", "KC"
};
// Failure-message templates indexed by getModeNumber(): NFD, NFKD, NFC, NFKC.
// Each names the conformance-file column that holds the expected result
// (c3=NFD, c5=NFKD, c2=NFC, c4=NFKC); %d is the number of the input column.
private static final String[] kMessages = {
    "c3!=D(c%d)", "c5!=KD(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)"
};
boolean checkNorm(Normalizer.Mode mode, int options, // Normalizer2 norm2,
        String s, String exp, int field) throws Exception {
    // Normalizes s in the given mode through the one-shot API and through the
    // iterative API (String and StringCharacterIterator inputs, both directions),
    // comparing each result with exp. Stops at the first mismatch.
    int modeNumber = getModeNumber(mode);
    String modeString = kModeStrings[modeNumber];
    String msg = String.format(kMessages[modeNumber], field);
    StringBuffer buf = new StringBuffer();

    return assertEqual(modeString, "", s,
                    Normalizer.normalize(s, mode, options), exp, msg)
            && assertEqual(modeString, "(+1)", s,
                    iterativeNorm(s, mode, buf, +1,options), exp, msg)
            && assertEqual(modeString, "(-1)", s,
                    iterativeNorm(s, mode, buf, -1,options), exp, msg)
            && assertEqual(modeString, "(+1)", s,
                    iterativeNorm(new StringCharacterIterator(s), mode, buf, +1,options), exp, msg)
            && assertEqual(modeString, "(-1)", s,
                    iterativeNorm(new StringCharacterIterator(s), mode, buf, -1,options), exp, msg);
}
// two strings that are canonically equivalent must test
// equal under a canonical caseless match
// see UAX #21 Case Mappings and Jitterbug 2021 and
@ -339,26 +329,26 @@ public class ConformanceTest extends TestFmwk {
if(Normalizer.compare(UTF16.charAt(s1,0),UTF16.charAt(s2,0),Normalizer.COMPARE_IGNORE_CASE)!=0){
errln("Normalizer.compare(int,int) failed for s1: "
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
}
}
}
if(s1.length()==1 && s2.length()>1){
if(Normalizer.compare(UTF16.charAt(s1,0),s2,Normalizer.COMPARE_IGNORE_CASE)!=0){
errln("Normalizer.compare(int,String) failed for s1: "
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
}
}
}
if(s1.length()>1 && s2.length()>1){
// TODO: Re-enable these tests after UTC fixes UAX 21
if(Normalizer.compare(s1.toCharArray(),s2.toCharArray(),Normalizer.COMPARE_IGNORE_CASE)!=0){
errln("Normalizer.compare(char[],char[]) failed for s1: "
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
}
}
}
}
}
private void cross(String s1, String s2,Normalizer.Mode mode){
String result = Normalizer.normalize(s1,mode);
if(!result.equals(s2)){
errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
+Utility.hex(s2));
}
}
@ -389,7 +379,7 @@ public class ConformanceTest extends TestFmwk {
}
return buf.toString();
}
/**
* Do a normalization using the iterative API in the given direction.
* @param str a Java StringCharacterIterator
@ -421,18 +411,19 @@ public class ConformanceTest extends TestFmwk {
/**
* @param op name of normalization form, e.g., "KC"
* @param op2 name of test case variant, e.g., "(-1)"
* @param s string being normalized
* @param got value received
* @param exp expected value
* @param msg description of this test
* @return true if got equals exp
*/
private boolean assertEqual(String op, String s, String got,
private boolean assertEqual(String op, String op2, String s, String got,
String exp, String msg) {
if (exp.equals(got)) {
return true;
}
errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
errln((" " + msg + ": " + op + op2 + '(' + s + ")=" + hex(got) +
", exp. " + hex(exp)));
return false;
}
@ -459,7 +450,7 @@ public class ConformanceTest extends TestFmwk {
}
// Our field is from pos..delim-1.
buf.setLength(0);
String toHex = s.substring(pos,delim);
pos = delim;
int index = 0;
@ -478,7 +469,7 @@ public class ConformanceTest extends TestFmwk {
index = spacePos+1;
}
}
if (buf.length() < 1) {
throw new IllegalArgumentException("Empty field " + i + " in " + s);
}
@ -492,13 +483,13 @@ public class ConformanceTest extends TestFmwk {
throw new IllegalArgumentException("Out of range hex " +
hex + " in " + s);
}else if (hex > 0xFFFF){
buf.append((char)((hex>>10)+0xd7c0));
buf.append((char)((hex>>10)+0xd7c0));
buf.append((char)((hex&0x3ff)|0xdc00));
}else{
buf.append((char) hex);
}
}
// Specific tests for debugging. These are generally failures
// taken from the conformance file, but culled out to make
// debugging easier. These can be eliminated without affecting
@ -516,6 +507,6 @@ public class ConformanceTest extends TestFmwk {
hexsplit(line, ';', fields, buf);
checkConformance(fields, line,options);
}
}