mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 21:45:37 +00:00
ICU-13197 improved normalization data structure and code; .nrm formatVersion 3; merged from branches/markus/normv3 except for cherry-picks from trunk to there
X-SVN-Rev: 40265
This commit is contained in:
parent
3da97c910d
commit
e6748afd82
34 changed files with 3372 additions and 2692 deletions
|
@ -20,11 +20,13 @@
|
|||
|
||||
#if !UCONFIG_NO_NORMALIZATION
|
||||
|
||||
#include "unicode/edits.h"
|
||||
#include "unicode/normalizer2.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/unorm.h"
|
||||
#include "cpputils.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -85,6 +87,52 @@ FilteredNormalizer2::normalize(const UnicodeString &src,
|
|||
return dest;
|
||||
}
|
||||
|
||||
void
|
||||
FilteredNormalizer2::normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
options |= U_EDITS_NO_RESET; // Do not reset for each span.
|
||||
normalizeUTF8(options, src.data(), src.length(), sink, edits, USET_SPAN_SIMPLE, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
FilteredNormalizer2::normalizeUTF8(uint32_t options, const char *src, int32_t length,
|
||||
ByteSink &sink, Edits *edits,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const {
|
||||
while (length > 0) {
|
||||
int32_t spanLength = set.spanUTF8(src, length, spanCondition);
|
||||
if (spanCondition == USET_SPAN_NOT_CONTAINED) {
|
||||
if (spanLength != 0) {
|
||||
if (edits != nullptr) {
|
||||
edits->addUnchanged(spanLength);
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(src, spanLength);
|
||||
}
|
||||
}
|
||||
spanCondition = USET_SPAN_SIMPLE;
|
||||
} else {
|
||||
if (spanLength != 0) {
|
||||
// Not norm2.normalizeSecondAndAppend() because we do not want
|
||||
// to modify the non-filter part of dest.
|
||||
norm2.normalizeUTF8(options, StringPiece(src, spanLength), sink, edits, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
spanCondition = USET_SPAN_NOT_CONTAINED;
|
||||
}
|
||||
src += spanLength;
|
||||
length -= spanLength;
|
||||
}
|
||||
}
|
||||
|
||||
UnicodeString &
|
||||
FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
|
|
|
@ -62,7 +62,7 @@ LoadedNormalizer2Impl::isAcceptable(void * /*context*/,
|
|||
pInfo->dataFormat[1]==0x72 &&
|
||||
pInfo->dataFormat[2]==0x6d &&
|
||||
pInfo->dataFormat[3]==0x32 &&
|
||||
pInfo->formatVersion[0]==2
|
||||
pInfo->formatVersion[0]==3
|
||||
) {
|
||||
// Normalizer2Impl *me=(Normalizer2Impl *)context;
|
||||
// uprv_memcpy(me->dataVersion, pInfo->dataVersion, 4);
|
||||
|
@ -84,7 +84,7 @@ LoadedNormalizer2Impl::load(const char *packageName, const char *name, UErrorCod
|
|||
const uint8_t *inBytes=(const uint8_t *)udata_getMemory(memory);
|
||||
const int32_t *inIndexes=(const int32_t *)inBytes;
|
||||
int32_t indexesLength=inIndexes[IX_NORM_TRIE_OFFSET]/4;
|
||||
if(indexesLength<=IX_MIN_YES_NO_MAPPINGS_ONLY) {
|
||||
if(indexesLength<=IX_MIN_LCCC_CP) {
|
||||
errorCode=U_INVALID_FORMAT_ERROR; // Not enough indexes.
|
||||
return;
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -23,6 +23,7 @@
|
|||
#include "unicode/unistr.h"
|
||||
#include "cpputils.h"
|
||||
#include "normalizer2impl.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
||||
|
@ -211,8 +212,8 @@ private:
|
|||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
|
||||
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundary(c, TRUE); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundary(c, FALSE); }
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); }
|
||||
virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
|
||||
};
|
||||
|
||||
|
@ -236,12 +237,12 @@ private:
|
|||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr) {
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
||||
impl.composeUTF8(options, s, s + src.length(),
|
||||
onlyContiguous, TRUE, sink, edits, errorCode);
|
||||
impl.composeUTF8(options, onlyContiguous, s, s + src.length(),
|
||||
&sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
}
|
||||
|
||||
|
@ -295,10 +296,10 @@ private:
|
|||
return impl.hasCompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, FALSE);
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const override {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, TRUE);
|
||||
return impl.isCompInert(c, onlyContiguous);
|
||||
}
|
||||
|
||||
const UBool onlyContiguous;
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "normalizer2impl.h"
|
||||
#include "uassert.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "ustr_imp.h" // U_EDITS_NO_RESET
|
||||
|
||||
using icu::Normalizer2Impl;
|
||||
|
||||
|
@ -90,14 +91,18 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
return dest;
|
||||
}
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t /*options*/, StringPiece src, ByteSink &sink,
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override {
|
||||
if(U_SUCCESS(errorCode)) {
|
||||
sink.Append(src.data(), src.length());
|
||||
if (edits != nullptr) {
|
||||
edits->reset();
|
||||
if ((options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
edits->addUnchanged(src.length());
|
||||
}
|
||||
if ((options & U_OMIT_UNCHANGED_TEXT) == 0) {
|
||||
sink.Append(src.data(), src.length());
|
||||
}
|
||||
sink.Flush();
|
||||
}
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -37,6 +37,8 @@ struct CanonIterData;
|
|||
|
||||
class ByteSink;
|
||||
class Edits;
|
||||
class InitCanonIterData;
|
||||
class LcccContext;
|
||||
|
||||
class U_COMMON_API Hangul {
|
||||
public:
|
||||
|
@ -66,9 +68,9 @@ public:
|
|||
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
|
||||
}
|
||||
static inline UBool
|
||||
isHangulWithoutJamoT(UChar c) {
|
||||
isHangulLV(UChar32 c) {
|
||||
c-=HANGUL_BASE;
|
||||
return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
}
|
||||
static inline UBool isJamoL(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
|
||||
|
@ -76,6 +78,14 @@ public:
|
|||
static inline UBool isJamoV(UChar32 c) {
|
||||
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
|
||||
}
|
||||
/**
 * Tests whether c lies strictly above JAMO_T_BASE and
 * below JAMO_T_BASE+JAMO_T_COUNT.
 */
static inline UBool isJamoT(UChar32 c) {
    const int32_t offset = c - JAMO_T_BASE;
    // Offset 0 is JAMO_T_BASE itself, which does not count as a Jamo T.
    return offset > 0 && offset < JAMO_T_COUNT;
}
|
||||
/**
 * Tests whether c is in one of the three Jamo ranges:
 * [JAMO_L_BASE..JAMO_L_END], [JAMO_V_BASE..JAMO_V_END], or ]JAMO_T_BASE..JAMO_T_END].
 */
static UBool isJamo(UChar32 c) {
    // Outside the overall [JAMO_L_BASE..JAMO_T_END] range: not a Jamo.
    if (c < JAMO_L_BASE || JAMO_T_END < c) {
        return FALSE;
    }
    if (c <= JAMO_L_END) {
        return TRUE;
    }
    if (JAMO_V_BASE <= c && c <= JAMO_V_END) {
        return TRUE;
    }
    // JAMO_T_BASE itself is excluded.
    return JAMO_T_BASE < c;
}
|
||||
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer
|
||||
|
@ -120,10 +130,13 @@ class Normalizer2Impl;
|
|||
|
||||
class U_COMMON_API ReorderingBuffer : public UMemory {
|
||||
public:
|
||||
/**
 * Constructs only; init() should be called.
 * Initializes impl and str from ni and dest; the start, reorderStart and limit
 * pointers are set to NULL, and remainingCapacity and lastCC are set to 0.
 */
|
||||
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
|
||||
impl(ni), str(dest),
|
||||
start(NULL), reorderStart(NULL), limit(NULL),
|
||||
remainingCapacity(0), lastCC(0) {}
|
||||
/** Constructs, removes the string contents, and initializes for a small initial capacity. */
|
||||
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
|
||||
~ReorderingBuffer() {
|
||||
if(start!=NULL) {
|
||||
str.releaseBuffer((int32_t)(limit-start));
|
||||
|
@ -140,11 +153,6 @@ public:
|
|||
UBool equals(const UChar *start, const UChar *limit) const;
|
||||
UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
|
||||
|
||||
// For Hangul composition, replacing the Leading consonant Jamo with the syllable.
|
||||
void setLastChar(UChar c) {
|
||||
*(limit-1)=c;
|
||||
}
|
||||
|
||||
UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
|
||||
return (c<=0xffff) ?
|
||||
appendBMP((UChar)c, cc, errorCode) :
|
||||
|
@ -222,6 +230,12 @@ private:
|
|||
UChar *codePointStart, *codePointLimit;
|
||||
};
|
||||
|
||||
/**
|
||||
* Low-level implementation of the Unicode Normalization Algorithm.
|
||||
* For the data structure and details see the documentation at the end of
|
||||
* this normalizer2impl.h and in the design doc at
|
||||
* http://site.icu-project.org/design/normalization/custom
|
||||
*/
|
||||
class U_COMMON_API Normalizer2Impl : public UObject {
|
||||
public:
|
||||
Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) {
|
||||
|
@ -238,8 +252,6 @@ public:
|
|||
|
||||
// low-level properties ------------------------------------------------ ***
|
||||
|
||||
const UTrie2 *getNormTrie() const { return normTrie; }
|
||||
|
||||
UBool ensureCanonIterData(UErrorCode &errorCode) const;
|
||||
|
||||
uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
|
||||
|
@ -259,15 +271,22 @@ public:
|
|||
|
||||
uint8_t getCC(uint16_t norm16) const {
|
||||
if(norm16>=MIN_NORMAL_MAYBE_YES) {
|
||||
return (uint8_t)norm16;
|
||||
return getCCFromNormalYesOrMaybe(norm16);
|
||||
}
|
||||
if(norm16<minNoNo || limitNoNo<=norm16) {
|
||||
return 0;
|
||||
}
|
||||
return getCCFromNoNo(norm16);
|
||||
}
|
||||
static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
|
||||
return (uint8_t)(norm16 >> OFFSET_SHIFT);
|
||||
}
|
||||
static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
|
||||
return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
|
||||
return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
|
||||
}
|
||||
/**
 * Code point variant of getCCFromYesOrMaybe():
 * returns 0 for c below minCompNoMaybeCP without looking up the norm16 value.
 */
uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
    return c < minCompNoMaybeCP ? 0 : getCCFromYesOrMaybe(getNorm16(c));
}
|
||||
|
||||
/**
|
||||
|
@ -276,10 +295,8 @@ public:
|
|||
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
|
||||
*/
|
||||
uint16_t getFCD16(UChar32 c) const {
|
||||
if(c<0) {
|
||||
if(c<minDecompNoCP) {
|
||||
return 0;
|
||||
} else if(c<0x180) {
|
||||
return tccc180[c];
|
||||
} else if(c<=0xffff) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
|
||||
}
|
||||
|
@ -295,9 +312,7 @@ public:
|
|||
*/
|
||||
uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
|
||||
UChar32 c=*s++;
|
||||
if(c<0x180) {
|
||||
return tccc180[c];
|
||||
} else if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
return 0;
|
||||
}
|
||||
UChar c2;
|
||||
|
@ -315,8 +330,8 @@ public:
|
|||
*/
|
||||
uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
|
||||
UChar32 c=*--s;
|
||||
if(c<0x180) {
|
||||
return tccc180[c];
|
||||
if(c<minDecompNoCP) {
|
||||
return 0;
|
||||
}
|
||||
if(!U16_IS_TRAIL(c)) {
|
||||
if(!singleLeadMightHaveNonZeroFCD16(c)) {
|
||||
|
@ -332,8 +347,6 @@ public:
|
|||
return getFCD16FromNormData(c);
|
||||
}
|
||||
|
||||
/** Returns the FCD data for U+0000<=c<U+0180. */
|
||||
uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
|
||||
/** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */
|
||||
UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
|
||||
// 0<=lead<=0xffff
|
||||
|
@ -344,9 +357,6 @@ public:
|
|||
/** Returns the FCD value from the regular normalization data. */
|
||||
uint16_t getFCD16FromNormData(UChar32 c) const;
|
||||
|
||||
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
|
||||
CanonIterData &newData, UErrorCode &errorCode) const;
|
||||
|
||||
/**
|
||||
* Gets the decomposition for one code point.
|
||||
* @param c code point
|
||||
|
@ -371,14 +381,25 @@ public:
|
|||
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
|
||||
|
||||
enum {
|
||||
MIN_CCC_LCCC_CP=0x300
|
||||
};
|
||||
// Fixed norm16 values.
|
||||
MIN_YES_YES_WITH_CC=0xfe02,
|
||||
JAMO_VT=0xfe00,
|
||||
MIN_NORMAL_MAYBE_YES=0xfc00,
|
||||
JAMO_L=2, // offset=1 hasCompBoundaryAfter=FALSE
|
||||
INERT=1, // offset=0 hasCompBoundaryAfter=TRUE
|
||||
|
||||
// norm16 bit 0 is comp-boundary-after.
|
||||
HAS_COMP_BOUNDARY_AFTER=1,
|
||||
OFFSET_SHIFT=1,
|
||||
|
||||
// For algorithmic one-way mappings, norm16 bits 2..1 indicate the
|
||||
// tccc (0, 1, >1) for quick FCC boundary-after tests.
|
||||
DELTA_TCCC_0=0,
|
||||
DELTA_TCCC_1=2,
|
||||
DELTA_TCCC_GT_1=4,
|
||||
DELTA_TCCC_MASK=6,
|
||||
DELTA_SHIFT=3,
|
||||
|
||||
enum {
|
||||
MIN_YES_YES_WITH_CC=0xff01,
|
||||
JAMO_VT=0xff00,
|
||||
MIN_NORMAL_MAYBE_YES=0xfe00,
|
||||
JAMO_L=1,
|
||||
MAX_DELTA=0x40
|
||||
};
|
||||
|
||||
|
@ -398,21 +419,32 @@ public:
|
|||
IX_MIN_COMP_NO_MAYBE_CP,
|
||||
|
||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||
IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||
|
||||
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||
IX_MIN_YES_NO,
|
||||
// Mappings are comp-normalized.
|
||||
IX_MIN_NO_NO,
|
||||
IX_LIMIT_NO_NO,
|
||||
IX_MIN_MAYBE_YES,
|
||||
|
||||
IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||
IX_MIN_YES_NO_MAPPINGS_ONLY,
|
||||
// Mappings are not comp-normalized but have a comp boundary before.
|
||||
IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
|
||||
// Mappings do not have a comp boundary before.
|
||||
IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
|
||||
// Mappings to the empty string.
|
||||
IX_MIN_NO_NO_EMPTY,
|
||||
|
||||
IX_RESERVED15,
|
||||
IX_MIN_LCCC_CP,
|
||||
IX_RESERVED19,
|
||||
IX_COUNT
|
||||
};
|
||||
|
||||
enum {
|
||||
MAPPING_HAS_CCC_LCCC_WORD=0x80,
|
||||
MAPPING_HAS_RAW_MAPPING=0x40,
|
||||
MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
|
||||
// unused bit 0x20,
|
||||
MAPPING_LENGTH_MASK=0x1f
|
||||
};
|
||||
|
||||
|
@ -462,10 +494,10 @@ public:
|
|||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
UBool composeUTF8(uint32_t options,
|
||||
/** sink==nullptr: isNormalized() */
|
||||
UBool composeUTF8(uint32_t options, UBool onlyContiguous,
|
||||
const uint8_t *src, const uint8_t *limit,
|
||||
UBool onlyContiguous, UBool doCompose,
|
||||
ByteSink &sink, icu::Edits *edits, UErrorCode &errorCode) const;
|
||||
ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
|
||||
|
||||
const UChar *makeFCD(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
|
||||
|
@ -475,27 +507,42 @@ public:
|
|||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
UBool hasDecompBoundary(UChar32 c, UBool before) const;
|
||||
UBool hasDecompBoundaryBefore(UChar32 c) const;
|
||||
UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
|
||||
UBool hasDecompBoundaryAfter(UChar32 c) const;
|
||||
UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
|
||||
UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
|
||||
|
||||
UBool hasCompBoundaryBefore(UChar32 c) const {
|
||||
return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
|
||||
return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
|
||||
}
|
||||
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
|
||||
return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
|
||||
}
|
||||
/**
 * Returns whether c passes isCompYesAndZeroCC(), has the
 * HAS_COMP_BOUNDARY_AFTER bit set, and, when onlyContiguous is set,
 * is either inert or has a mapping whose first unit is <=0x1ff.
 */
UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
    const uint16_t norm16 = getNorm16(c);
    if (!isCompYesAndZeroCC(norm16) || (norm16 & HAS_COMP_BOUNDARY_AFTER) == 0) {
        return FALSE;
    }
    // The mapping is only read when the cheaper tests did not decide.
    return !onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff;
}
|
||||
UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
|
||||
|
||||
UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
|
||||
UBool hasFCDBoundaryAfter(UChar32 c) const {
|
||||
uint16_t fcd16=getFCD16(c);
|
||||
return fcd16<=1 || (fcd16&0xff)==0;
|
||||
}
|
||||
UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
|
||||
UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
|
||||
UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
|
||||
private:
|
||||
friend class InitCanonIterData;
|
||||
friend class LcccContext;
|
||||
|
||||
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
|
||||
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
|
||||
static UBool isInert(uint16_t norm16) { return norm16==0; }
|
||||
static UBool isJamoL(uint16_t norm16) { return norm16==1; }
|
||||
static UBool isInert(uint16_t norm16) { return norm16==INERT; }
|
||||
static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
|
||||
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
|
||||
UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
|
||||
uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
|
||||
UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
|
||||
UBool isHangulLVT(uint16_t norm16) const {
|
||||
return norm16==hangulLVT();
|
||||
}
|
||||
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
|
||||
// UBool isCompYes(uint16_t norm16) const {
|
||||
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
|
||||
|
@ -514,7 +561,7 @@ private:
|
|||
/**
|
||||
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
|
||||
* the MaybeYes which combine-forward and have ccc=0.
|
||||
* (Standard Unicode 5.2 normalization does not have such characters.)
|
||||
* (Standard Unicode 10 normalization does not have such characters.)
|
||||
*/
|
||||
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
|
||||
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
|
||||
|
@ -524,7 +571,7 @@ private:
|
|||
// For use with isCompYes().
|
||||
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
|
||||
// static uint8_t getCCFromYes(uint16_t norm16) {
|
||||
// return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
|
||||
// return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
|
||||
// }
|
||||
uint8_t getCCFromNoNo(uint16_t norm16) const {
|
||||
const uint16_t *mapping=getMapping(norm16);
|
||||
|
@ -535,30 +582,47 @@ private:
|
|||
}
|
||||
}
|
||||
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
|
||||
uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
|
||||
uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
|
||||
if(norm16<=minYesNo) {
|
||||
return 0; // yesYes and Hangul LV have ccc=tccc=0
|
||||
} else {
|
||||
// For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
|
||||
return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo
|
||||
}
|
||||
}
|
||||
uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const;
|
||||
uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
|
||||
|
||||
// Requires algorithmic-NoNo.
|
||||
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
|
||||
return c+norm16-(minMaybeYes-MAX_DELTA-1);
|
||||
return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
|
||||
}
|
||||
UChar32 getAlgorithmicDelta(uint16_t norm16) const {
|
||||
return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
|
||||
}
|
||||
|
||||
// Requires minYesNo<norm16<limitNoNo.
|
||||
const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
|
||||
const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
|
||||
const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
|
||||
if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
|
||||
if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
|
||||
return NULL;
|
||||
} else if(norm16<minMaybeYes) {
|
||||
return extraData+norm16; // for yesYes; if Jamo L: harmless empty list
|
||||
return getMapping(norm16); // for yesYes; if Jamo L: harmless empty list
|
||||
} else {
|
||||
return maybeYesCompositions+norm16-minMaybeYes;
|
||||
}
|
||||
}
|
||||
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
|
||||
const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
|
||||
// A composite has both mapping & compositions list.
|
||||
const uint16_t *list=getMapping(norm16);
|
||||
return list+ // mapping pointer
|
||||
1+ // +1 to skip the first unit with the mapping lenth
|
||||
1+ // +1 to skip the first unit with the mapping length
|
||||
(*list&MAPPING_LENGTH_MASK); // + mapping length
|
||||
}
|
||||
const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
|
||||
// minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
|
||||
return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
|
||||
}
|
||||
/**
|
||||
* @param c code point must have compositions
|
||||
* @return compositions list pointer
|
||||
|
@ -573,55 +637,78 @@ private:
|
|||
UChar32 minNeedDataCP,
|
||||
ReorderingBuffer *buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
UBool decomposeShort(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
const UChar *decomposeShort(const UChar *src, const UChar *limit,
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
UBool decompose(UChar32 c, uint16_t norm16,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
|
||||
UBool stopAtCompBoundary, ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
static int32_t combine(const uint16_t *list, UChar32 trail);
|
||||
void addComposites(const uint16_t *list, UnicodeSet &set) const;
|
||||
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
|
||||
UBool onlyContiguous) const;
|
||||
|
||||
int32_t getCompProps(const uint8_t *src, const uint8_t *limit,
|
||||
uint16_t norm16, UBool onlyContiguous) const;
|
||||
|
||||
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
|
||||
UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
|
||||
return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
|
||||
}
|
||||
/**
 * Returns whether norm16 is below minNoNoCompNoMaybeCC
 * or passes isAlgorithmicNoNo().
 */
UBool norm16HasCompBoundaryBefore(uint16_t norm16) const {
    if (norm16 < minNoNoCompNoMaybeCC) {
        return TRUE;
    }
    return isAlgorithmicNoNo(norm16);
}
|
||||
UBool hasCompBoundaryBefore(const UChar *src, const UChar *limit) const;
|
||||
UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
|
||||
UBool hasCompBoundaryAfter(const UChar *start, const UChar *p,
|
||||
UBool onlyContiguous) const;
|
||||
UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
|
||||
UBool onlyContiguous) const;
|
||||
/**
 * Returns whether norm16 has the HAS_COMP_BOUNDARY_AFTER bit set and,
 * when onlyContiguous is set, also passes isTrailCC01ForCompBoundaryAfter().
 */
UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
    if ((norm16 & HAS_COMP_BOUNDARY_AFTER) == 0) {
        return FALSE;
    }
    return !onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16);
}
|
||||
/** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
|
||||
UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
|
||||
return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
|
||||
(norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
|
||||
}
|
||||
|
||||
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
|
||||
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
|
||||
const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p, UBool onlyContiguous) const;
|
||||
const UChar *findNextCompBoundary(const UChar *p, const UChar *limit, UBool onlyContiguous) const;
|
||||
|
||||
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
|
||||
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
|
||||
|
||||
void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
|
||||
CanonIterData &newData, UErrorCode &errorCode) const;
|
||||
|
||||
int32_t getCanonValue(UChar32 c) const;
|
||||
const UnicodeSet &getCanonStartSet(int32_t n) const;
|
||||
|
||||
// UVersionInfo dataVersion;
|
||||
|
||||
// Code point thresholds for quick check codes.
|
||||
UChar32 minDecompNoCP;
|
||||
UChar32 minCompNoMaybeCP;
|
||||
// BMP code point thresholds for quick check loops looking at single UTF-16 code units.
|
||||
UChar minDecompNoCP;
|
||||
UChar minCompNoMaybeCP;
|
||||
UChar minLcccCP;
|
||||
|
||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||
uint16_t minYesNo;
|
||||
uint16_t minYesNoMappingsOnly;
|
||||
uint16_t minNoNo;
|
||||
uint16_t minNoNoCompBoundaryBefore;
|
||||
uint16_t minNoNoCompNoMaybeCC;
|
||||
uint16_t minNoNoEmpty;
|
||||
uint16_t limitNoNo;
|
||||
uint16_t centerNoNoDelta;
|
||||
uint16_t minMaybeYes;
|
||||
|
||||
const UTrie2 *normTrie;
|
||||
const uint16_t *maybeYesCompositions;
|
||||
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F
|
||||
|
||||
public: // CanonIterData is public to allow access from C callback functions.
|
||||
UInitOnce fCanonIterDataInitOnce;
|
||||
CanonIterData *fCanonIterData;
|
||||
};
|
||||
|
@ -677,13 +764,14 @@ unorm_getFCD16(UChar32 c);
|
|||
|
||||
/**
|
||||
* Format of Normalizer2 .nrm data files.
|
||||
* Format version 2.0.
|
||||
* Format version 3.0.
|
||||
*
|
||||
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
|
||||
* ICU ships with data files for standard Unicode Normalization Forms
|
||||
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
|
||||
* Custom (application-specific) data can be built into additional .nrm files
|
||||
* with the gennorm2 build tool.
|
||||
* ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
|
||||
*
|
||||
* Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
|
||||
* cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
|
||||
|
@ -714,14 +802,20 @@ unorm_getFCD16(UChar32 c);
|
|||
* with a decomposition mapping, that is, with NF*D_QC=No.
|
||||
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
|
||||
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
|
||||
* minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
|
||||
* is the lowest code point with lccc!=0.
|
||||
*
|
||||
* The next five indexes are thresholds of 16-bit trie values for ranges of
|
||||
* The next eight indexes are thresholds of 16-bit trie values for ranges of
|
||||
* values indicating multiple normalization properties.
|
||||
* They are listed here in threshold order, not in the order they are stored in the indexes.
|
||||
* minYesNo=indexes[IX_MIN_YES_NO];
|
||||
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||
* minNoNo=indexes[IX_MIN_NO_NO];
|
||||
* minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
|
||||
* minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
|
||||
* minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
|
||||
* limitNoNo=indexes[IX_LIMIT_NO_NO];
|
||||
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
|
||||
* minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
|
||||
* See the normTrie description below and the design doc for details.
|
||||
*
|
||||
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
|
||||
|
@ -729,12 +823,14 @@ unorm_getFCD16(UChar32 c);
|
|||
* The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
|
||||
* Rather than using independent bits in the value (which would require more than 16 bits),
|
||||
* information is extracted primarily via range checks.
|
||||
* Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
|
||||
* For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
|
||||
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
|
||||
* which means it has a two-way (round-trip) decomposition mapping.
|
||||
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
|
||||
* pointing to mappings, compositions lists, or both.
|
||||
* Value norm16==0 means that the character is normalization-inert, that is,
|
||||
* Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
|
||||
* means that the character is normalization-inert, that is,
|
||||
* it does not have a mapping, does not participate in composition, has a zero
|
||||
* canonical combining class, and forms a boundary where text before it and after it
|
||||
* can be normalized independently.
|
||||
|
@ -748,7 +844,7 @@ unorm_getFCD16(UChar32 c);
|
|||
* The trie has a value for each lead surrogate code unit representing the "worst case"
|
||||
* properties of the 1024 supplementary characters whose UTF-16 form starts with
|
||||
* the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
|
||||
* then their lead surrogate code unit has the trie value 0.
|
||||
* then their lead surrogate code unit has the trie value INERT.
|
||||
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
|
||||
* the properties for the full supplementary code point need to be looked up.
|
||||
*
|
||||
|
@ -757,6 +853,7 @@ unorm_getFCD16(UChar32 c);
|
|||
*
|
||||
* There is only one byte offset for the end of these two arrays.
|
||||
* The split between them is given by the constant and variable mentioned above.
|
||||
* In version 3, the difference must be shifted right by OFFSET_SHIFT.
|
||||
*
|
||||
* The maybeYesCompositions array contains compositions lists for characters that
|
||||
* combine both forward (as starters in composition pairs)
|
||||
|
@ -773,6 +870,8 @@ unorm_getFCD16(UChar32 c);
|
|||
* followed by only mappings for "NoNo" characters.
|
||||
* (Referring to pairs of NFC/NFD quick check values.)
|
||||
* The norm16 values of those characters are directly indexes into the extraData array.
|
||||
* In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
|
||||
* for accessing extraData.
|
||||
*
|
||||
* The data structures for compositions lists and mappings are described in the design doc.
|
||||
*
|
||||
|
@ -803,6 +902,50 @@ unorm_getFCD16(UChar32 c);
|
|||
* This is fully equivalent to formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
|
||||
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
|
||||
* - Addition of the smallFCD[] bit set.
|
||||
*
|
||||
* Changes from format version 2 to format version 3 (ICU 60) ------------------
|
||||
*
|
||||
* - norm16 bit 0 indicates hasCompBoundaryAfter(),
|
||||
* except that for contiguous composition (FCC) the tccc must be checked as well.
|
||||
* Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
|
||||
* Thresholds like minNoNo are tested before shifting.
|
||||
*
|
||||
* - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
|
||||
* to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
|
||||
* See DELTA_TCCC_MASK etc.
|
||||
* This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
|
||||
* minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
|
||||
*
|
||||
* - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
|
||||
* and ASCII characters are mapped algorithmically only to other ASCII characters.
|
||||
* This helps with hasCompBoundaryBefore() and compose() fast paths.
|
||||
* It is never necessary any more to loop for algorithmic mappings.
|
||||
*
|
||||
* - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
|
||||
* indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
|
||||
* and separation of the noNo extraData into distinct ranges.
|
||||
* With this, the noNo norm16 value indicates whether the mapping is
|
||||
* compose-normalized, not normalized but hasCompBoundaryBefore(),
|
||||
* not even that, or maps to an empty string.
|
||||
* hasCompBoundaryBefore() can be determined solely from the norm16 value.
|
||||
*
|
||||
* - The norm16 value for Hangul LVT is now different from that for Hangul LV,
|
||||
* so that hasCompBoundaryAfter() need not check for the syllable type.
|
||||
* For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
|
||||
* For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
|
||||
* The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
|
||||
* to simplify some code.
|
||||
*
|
||||
* - The extraData firstUnit bit 5 is no longer necessary
|
||||
* (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
|
||||
* is reserved again, and always set to 0.
|
||||
*
|
||||
* - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
|
||||
* This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
|
||||
* U+00AD Soft Hyphen maps to an empty string,
|
||||
* which is artificially assigned "worst case" values lccc=1 and tccc=255.
|
||||
*
|
||||
* - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
|
||||
*/
|
||||
|
||||
#endif /* !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -820,7 +820,7 @@ ucasemap_mapUTF8(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_P
|
|||
return 0;
|
||||
}
|
||||
|
||||
if(edits!=NULL) {
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
|
|
|
@ -240,7 +240,7 @@ public:
|
|||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @internal ICU 60 technology preview, may be changed or removed in the future
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
|
@ -510,7 +510,35 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const;
|
||||
UErrorCode &errorCode) const override;
|
||||
|
||||
/**
|
||||
* Normalizes a UTF-8 string and optionally records how source substrings
|
||||
* relate to changed and unchanged result substrings.
|
||||
*
|
||||
* Currently implemented completely only for "compose" modes,
|
||||
* such as for NFC, NFKC, and NFKC_Casefold
|
||||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* Otherwise currently converts to & from UTF-16 and does not support edits.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT.
|
||||
* @param src Source UTF-8 string.
|
||||
* @param sink A ByteSink to which the normalized UTF-8 result string is written.
|
||||
* sink.Flush() is called at the end.
|
||||
* @param edits Records edits for index mapping, working with styled text,
|
||||
* and getting only changes (if any).
|
||||
* The Edits contents are undefined if any error occurs.
|
||||
* This function calls edits->reset() first. edits can be nullptr.
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
* pass the U_SUCCESS() test, or else the function returns
|
||||
* immediately. Check for U_FAILURE() on output or use with
|
||||
* function chaining. (See User Guide for details.)
|
||||
* @draft ICU 60
|
||||
*/
|
||||
virtual void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const override;
|
||||
|
||||
/**
|
||||
* Appends the normalized form of the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
|
@ -528,7 +556,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const;
|
||||
UErrorCode &errorCode) const override;
|
||||
/**
|
||||
* Appends the second string to the first string
|
||||
* (merging them at the boundary) and returns the first string.
|
||||
|
@ -546,7 +574,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const;
|
||||
UErrorCode &errorCode) const override;
|
||||
|
||||
/**
|
||||
* Gets the decomposition mapping of c.
|
||||
|
@ -560,7 +588,7 @@ public:
|
|||
* @stable ICU 4.6
|
||||
*/
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const override;
|
||||
|
||||
/**
|
||||
* Gets the raw decomposition mapping of c.
|
||||
|
@ -574,7 +602,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const override;
|
||||
|
||||
/**
|
||||
* Performs pairwise composition of a & b and returns the composite if there is one.
|
||||
|
@ -587,7 +615,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual UChar32
|
||||
composePair(UChar32 a, UChar32 b) const;
|
||||
composePair(UChar32 a, UChar32 b) const override;
|
||||
|
||||
/**
|
||||
* Gets the combining class of c.
|
||||
|
@ -598,7 +626,7 @@ public:
|
|||
* @stable ICU 49
|
||||
*/
|
||||
virtual uint8_t
|
||||
getCombiningClass(UChar32 c) const;
|
||||
getCombiningClass(UChar32 c) const override;
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
|
@ -612,7 +640,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
@ -625,7 +653,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
/**
|
||||
* Returns the end of the normalized substring of the input string.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
@ -638,7 +666,7 @@ public:
|
|||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const;
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary before it,
|
||||
|
@ -648,7 +676,7 @@ public:
|
|||
* @return TRUE if c has a normalization boundary before it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const;
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const override;
|
||||
|
||||
/**
|
||||
* Tests if the character always has a normalization boundary after it,
|
||||
|
@ -658,7 +686,7 @@ public:
|
|||
* @return TRUE if c has a normalization boundary after it
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const;
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const override;
|
||||
|
||||
/**
|
||||
* Tests if the character is normalization-inert.
|
||||
|
@ -667,7 +695,7 @@ public:
|
|||
* @return TRUE if c is normalization-inert
|
||||
* @stable ICU 4.4
|
||||
*/
|
||||
virtual UBool isInert(UChar32 c) const;
|
||||
virtual UBool isInert(UChar32 c) const override;
|
||||
private:
|
||||
UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
|
@ -675,6 +703,12 @@ private:
|
|||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
void
|
||||
normalizeUTF8(uint32_t options, const char *src, int32_t length,
|
||||
ByteSink &sink, Edits *edits,
|
||||
USetSpanCondition spanCondition,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
|
|
|
@ -182,6 +182,7 @@
|
|||
// ucasemap_imp.h #define U_TITLECASE_ITERATOR_MASK 0xe0
|
||||
// ucasemap_imp.h #define U_TITLECASE_ADJUSTMENT_MASK 0x600
|
||||
// ustr_imp.h #define _STRNCMP_STYLE 0x1000
|
||||
// ustr_imp.h #define U_EDITS_NO_RESET 0x2000
|
||||
// unormcmp.cpp #define _COMPARE_EQUIV 0x80000
|
||||
|
||||
#endif // __STRINGOPTIONS_H__
|
||||
|
|
|
@ -25,6 +25,11 @@
|
|||
*/
|
||||
#define _STRNCMP_STYLE 0x1000
|
||||
|
||||
/**
|
||||
* Internal option for string transformation functions to not first reset the Edits object.
|
||||
*/
|
||||
#define U_EDITS_NO_RESET 0x2000
|
||||
|
||||
/**
|
||||
* Compare two strings in code point order or code unit order.
|
||||
* Works in strcmp style (both lengths -1),
|
||||
|
|
|
@ -1103,7 +1103,7 @@ ustrcase_map(int32_t caseLocale, uint32_t options, UCASEMAP_BREAK_ITERATOR_PARAM
|
|||
return 0;
|
||||
}
|
||||
|
||||
if(edits!=NULL) {
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
destLength=stringCaseMapper(caseLocale, options, UCASEMAP_BREAK_ITERATOR
|
||||
|
|
|
@ -746,7 +746,7 @@ uint16_t BackwardUTrie2StringIterator::previous16() {
|
|||
codePointLimit=codePointStart;
|
||||
if(start>=codePointStart) {
|
||||
codePoint=U_SENTINEL;
|
||||
return 0;
|
||||
return trie->errorValue;
|
||||
}
|
||||
uint16_t result;
|
||||
UTRIE2_U16_PREV16(trie, start, codePointStart, codePoint, result);
|
||||
|
@ -757,7 +757,7 @@ uint16_t ForwardUTrie2StringIterator::next16() {
|
|||
codePointStart=codePointLimit;
|
||||
if(codePointLimit==limit) {
|
||||
codePoint=U_SENTINEL;
|
||||
return 0;
|
||||
return trie->errorValue;
|
||||
}
|
||||
uint16_t result;
|
||||
UTRIE2_U16_NEXT16(trie, codePointLimit, limit, codePoint, result);
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -159,6 +159,7 @@ void addNormTest(TestNode** root)
|
|||
}
|
||||
|
||||
static const char* const modeStrings[]={
|
||||
"?",
|
||||
"UNORM_NONE",
|
||||
"UNORM_NFD",
|
||||
"UNORM_NFKD",
|
||||
|
@ -183,7 +184,7 @@ static void TestNormCases(UNormalizationMode mode,
|
|||
length2= unorm_normalize(source, -1, mode, 0, NULL, 0, &status2);
|
||||
if(neededLen!=length2) {
|
||||
log_err("ERROR in unorm_normalize(%s)[%d]: "
|
||||
"preflight length/NUL %d!=%d preflight length/srcLength\n",
|
||||
"preflight length/srcLength %d!=%d preflight length/NUL\n",
|
||||
modeStrings[mode], (int)x, (int)neededLen, (int)length2);
|
||||
}
|
||||
if(status==U_BUFFER_OVERFLOW_ERROR)
|
||||
|
@ -192,14 +193,14 @@ static void TestNormCases(UNormalizationMode mode,
|
|||
}
|
||||
length2=unorm_normalize(source, u_strlen(source), mode, 0, result, UPRV_LENGTHOF(result), &status);
|
||||
if(U_FAILURE(status) || neededLen!=length2) {
|
||||
log_data_err("ERROR in unorm_normalize(%s/NUL) at %s: %s - (Are you missing data?)\n",
|
||||
log_data_err("ERROR in unorm_normalize(%s/srcLength) at %s: %s - (Are you missing data?)\n",
|
||||
modeStrings[mode], austrdup(source), myErrorName(status));
|
||||
} else {
|
||||
assertEqual(result, cases[x][expIndex], x);
|
||||
}
|
||||
length2=unorm_normalize(source, -1, mode, 0, result, UPRV_LENGTHOF(result), &status);
|
||||
if(U_FAILURE(status) || neededLen!=length2) {
|
||||
log_data_err("ERROR in unorm_normalize(%s/srcLength) at %s: %s - (Are you missing data?)\n",
|
||||
log_data_err("ERROR in unorm_normalize(%s/NUL) at %s: %s - (Are you missing data?)\n",
|
||||
modeStrings[mode], austrdup(source), myErrorName(status));
|
||||
} else {
|
||||
assertEqual(result, cases[x][expIndex], x);
|
||||
|
|
|
@ -406,11 +406,11 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
|
|||
}
|
||||
|
||||
static const char *const kModeStrings[UNORM_MODE_COUNT] = {
|
||||
"?", "D", "KD", "C", "KC", "FCD"
|
||||
"?", "none", "D", "KD", "C", "KC", "FCD"
|
||||
};
|
||||
|
||||
// Failure-message templates. The table is sized by UNORM_MODE_COUNT and its
// entries are in the same order as kModeStrings ("?", "none", "D", "KD", "C", "KC", "FCD").
// The "KD" slot is labelled KD; it used to repeat the "KC" label of the later slot,
// which made the two kinds of failure indistinguishable in the log.
static const char *const kMessages[UNORM_MODE_COUNT] = {
    "?!=?", "?!=?", "c3!=D(c%d)", "c5!=KD(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)", "FCD"
};
|
||||
|
||||
UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t options,
|
||||
|
@ -450,6 +450,7 @@ UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t opti
|
|||
std::string exp8;
|
||||
exp.toUTF8String(exp8);
|
||||
std::string out8;
|
||||
out8.reserve(exp8.length());
|
||||
Edits edits;
|
||||
Edits *editsPtr = (mode == UNORM_NFC || mode == UNORM_NFKC) ? &edits : nullptr;
|
||||
StringByteSink<std::string> sink(&out8);
|
||||
|
|
|
@ -55,6 +55,9 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
|
|||
#endif
|
||||
TESTCASE_AUTO(TestFilteredNormalizer2Coverage);
|
||||
TESTCASE_AUTO(TestNormalizeUTF8WithEdits);
|
||||
TESTCASE_AUTO(TestLowMappingToEmpty_D);
|
||||
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
|
||||
TESTCASE_AUTO(TestNormalizeIllFormedText);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -1374,6 +1377,10 @@ initExpectedSkippables(UnicodeSet skipSets[UNORM_MODE_COUNT], UErrorCode &errorC
|
|||
delete [] combineBackCharsAndCc;
|
||||
}
|
||||
|
||||
// Short mode names for TestSkippable failure messages.
// The table has UNORM_MODE_COUNT entries and is indexed with the same loop
// counter that indexes the per-mode skippable sets.
static const char *const kModeStrings[UNORM_MODE_COUNT] = {
    "?",
    "none",
    "D",
    "KD",
    "C",
    "KC",
    "FCD"
};
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestSkippable() {
|
||||
UnicodeSet diff, skipSets[UNORM_MODE_COUNT], expectSets[UNORM_MODE_COUNT];
|
||||
|
@ -1395,7 +1402,8 @@ BasicNormalizerTest::TestSkippable() {
|
|||
|
||||
for(int32_t i=UNORM_NONE; i<UNORM_MODE_COUNT; ++i) {
|
||||
if(skipSets[i]!=expectSets[i]) {
|
||||
errln("error: TestSkippable skipSets[%d]!=expectedSets[%d]\n", i, i);
|
||||
const char *ms=kModeStrings[i];
|
||||
errln("error: TestSkippable skipSets[%s]!=expectedSets[%s]\n", ms, ms);
|
||||
// Note: This used to depend on hardcoded UnicodeSet patterns generated by
|
||||
// Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
|
||||
// running com.ibm.text.UCD.Main with the option NFSkippable.
|
||||
|
@ -1531,7 +1539,7 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
return;
|
||||
}
|
||||
static const char *const src =
|
||||
u8" AÄA\u0308A\u0308\u0323Ä\u0323,\u1100\u1161가\u11A8가\u3133 ";
|
||||
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ";
|
||||
std::string expected = u8" aääạ\u0308ạ\u0308,가각갃 ";
|
||||
std::string result;
|
||||
StringByteSink<std::string> sink(&result);
|
||||
|
@ -1544,9 +1552,10 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
{ TRUE, 1, 1 }, // A→a
|
||||
{ TRUE, 2, 2 }, // Ä→ä
|
||||
{ TRUE, 3, 2 }, // A\u0308→ä
|
||||
{ TRUE, 5, 5 }, // A\u0308\u0323→ạ\u0308
|
||||
{ TRUE, 7, 5 }, // A\u0308\u00ad\u0323→ạ\u0308 removes the soft hyphen
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
|
||||
{ FALSE, 1, 1 }, // comma
|
||||
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→ 각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→ 갃
|
||||
|
@ -1568,6 +1577,138 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
expectedChanges, UPRV_LENGTHOF(expectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
// With filter: The normalization code does not see the "A" substrings.
|
||||
UnicodeSet filter(u"[^A]", errorCode);
|
||||
FilteredNormalizer2 fn2(*nfkc_cf, filter);
|
||||
expected = u8" AäA\u0308A\u0323\u0308ạ\u0308,가각갃 ";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
fn2.normalizeUTF8(0, src, sink, &edits, errorCode);
|
||||
assertSuccess("filtered normalizeUTF8", errorCode.get());
|
||||
assertEquals("filtered normalizeUTF8", expected.c_str(), result.c_str());
|
||||
static const EditChange filteredChanges[] = {
|
||||
{ FALSE, 3, 3 }, // 2 spaces + A
|
||||
{ TRUE, 2, 2 }, // Ä→ä
|
||||
{ FALSE, 4, 4 }, // A\u0308A
|
||||
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
|
||||
{ FALSE, 1, 1 }, // comma
|
||||
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→ 각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→ 갃
|
||||
{ FALSE, 2, 2 } // 2 spaces
|
||||
};
|
||||
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
filteredChanges, UPRV_LENGTHOF(filteredChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
// Omit unchanged text.
|
||||
// Note that the result is not normalized because the inner normalizer
|
||||
// does not see text across filter spans.
|
||||
expected = u8"ä\u0323\u0308ạ\u0308가각갃";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
fn2.normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
|
||||
assertSuccess("filtered normalizeUTF8 omit unchanged", errorCode.get());
|
||||
assertEquals("filtered normalizeUTF8 omit unchanged", expected.c_str(), result.c_str());
|
||||
TestUtility::checkEditsIter(*this, u"filtered normalizeUTF8 omit unchanged",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
filteredChanges, UPRV_LENGTHOF(filteredChanges),
|
||||
TRUE, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestLowMappingToEmpty_D() {
|
||||
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_D");
|
||||
const Normalizer2 *n2 = Normalizer2::getInstance(
|
||||
nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode);
|
||||
if (errorCode.logDataIfFailureAndReset("Normalizer2::getInstance() call failed")) {
|
||||
return;
|
||||
}
|
||||
checkLowMappingToEmpty(*n2);
|
||||
|
||||
UnicodeString sh(u'\u00AD');
|
||||
assertFalse("soft hyphen is not normalized", n2->isNormalized(sh, errorCode));
|
||||
UnicodeString result = n2->normalize(sh, errorCode);
|
||||
assertTrue("soft hyphen normalizes to empty", result.isEmpty());
|
||||
assertEquals("soft hyphen QC=No", UNORM_NO, n2->quickCheck(sh, errorCode));
|
||||
assertEquals("soft hyphen spanQuickCheckYes", 0, n2->spanQuickCheckYes(sh, errorCode));
|
||||
|
||||
UnicodeString s(u"\u00ADÄ\u00AD\u0323");
|
||||
result = n2->normalize(s, errorCode);
|
||||
assertEquals("normalize string with soft hyphens", u"a\u0323\u0308", result);
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestLowMappingToEmpty_FCD() {
|
||||
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_FCD");
|
||||
const Normalizer2 *n2 = Normalizer2::getInstance(
|
||||
nullptr, "nfkc_cf", UNORM2_FCD, errorCode);
|
||||
if (errorCode.logDataIfFailureAndReset("Normalizer2::getInstance() call failed")) {
|
||||
return;
|
||||
}
|
||||
checkLowMappingToEmpty(*n2);
|
||||
|
||||
UnicodeString sh(u'\u00AD');
|
||||
assertTrue("soft hyphen is FCD", n2->isNormalized(sh, errorCode));
|
||||
|
||||
UnicodeString s(u"\u00ADÄ\u00AD\u0323");
|
||||
UnicodeString result = n2->normalize(s, errorCode);
|
||||
assertEquals("normalize string with soft hyphens", u"\u00ADa\u0323\u0308", result);
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::checkLowMappingToEmpty(const Normalizer2 &n2) {
|
||||
UnicodeString mapping;
|
||||
assertTrue("getDecomposition(soft hyphen)", n2.getDecomposition(0xad, mapping));
|
||||
assertTrue("soft hyphen maps to empty", mapping.isEmpty());
|
||||
assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad));
|
||||
assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad));
|
||||
assertFalse("soft hyphen is not inert", n2.isInert(0xad));
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestNormalizeIllFormedText() {
|
||||
IcuTestErrorCode errorCode(*this, "TestNormalizeIllFormedText");
|
||||
const Normalizer2 *nfkc_cf = Normalizer2::getNFKCCasefoldInstance(errorCode);
|
||||
if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
|
||||
return;
|
||||
}
|
||||
// Normalization behavior for ill-formed text is not defined.
|
||||
// ICU currently treats ill-formed sequences as normalization-inert
|
||||
// and copies them unchanged.
|
||||
UnicodeString src(u" A");
|
||||
src.append((char16_t)0xD800).append(u"ÄA\u0308").append((char16_t)0xD900).
|
||||
append(u"A\u0308\u00ad\u0323").append((char16_t)0xDBFF).
|
||||
append(u"Ä\u0323,\u00ad").append((char16_t)0xDC00).
|
||||
append(u"\u1100\u1161가\u11A8가\u3133 ").append((char16_t)0xDFFF);
|
||||
UnicodeString expected(u" a");
|
||||
expected.append((char16_t)0xD800).append(u"ää").append((char16_t)0xD900).
|
||||
append(u"ạ\u0308").append((char16_t)0xDBFF).
|
||||
append(u"ạ\u0308,").append((char16_t)0xDC00).
|
||||
append(u"가각갃 ").append((char16_t)0xDFFF);
|
||||
UnicodeString result = nfkc_cf->normalize(src, errorCode);
|
||||
assertSuccess("normalize", errorCode.get());
|
||||
assertEquals("normalize", expected, result);
|
||||
|
||||
std::string src8(u8" A");
|
||||
src8.append("\x80").append(u8"ÄA\u0308").append("\xC0\x80").
|
||||
append(u8"A\u0308\u00ad\u0323").append("\xED\xA0\x80").
|
||||
append(u8"Ä\u0323,\u00ad").append("\xF4\x90\x80\x80").
|
||||
append(u8"\u1100\u1161가\u11A8가\u3133 ").append("\xF0");
|
||||
std::string expected8(u8" a");
|
||||
expected8.append("\x80").append(u8"ää").append("\xC0\x80").
|
||||
append(u8"ạ\u0308").append("\xED\xA0\x80").
|
||||
append(u8"ạ\u0308,").append("\xF4\x90\x80\x80").
|
||||
append(u8"가각갃 ").append("\xF0");
|
||||
std::string result8;
|
||||
StringByteSink<std::string> sink(&result8);
|
||||
nfkc_cf->normalizeUTF8(0, src8, sink, nullptr, errorCode);
|
||||
assertSuccess("normalizeUTF8", errorCode.get());
|
||||
assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -48,6 +48,9 @@ public:
|
|||
void TestCustomFCC();
|
||||
void TestFilteredNormalizer2Coverage();
|
||||
void TestNormalizeUTF8WithEdits();
|
||||
void TestLowMappingToEmpty_D();
|
||||
void TestLowMappingToEmpty_FCD();
|
||||
void TestNormalizeIllFormedText();
|
||||
|
||||
private:
|
||||
UnicodeString canonTests[24][3];
|
||||
|
@ -83,6 +86,7 @@ private:
|
|||
static UnicodeString hex(UChar ch);
|
||||
static UnicodeString hex(const UnicodeString& str);
|
||||
|
||||
void checkLowMappingToEmpty(const Normalizer2 &n2);
|
||||
};
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -26,19 +26,20 @@ U_NAMESPACE_BEGIN
|
|||
ExtraData::ExtraData(Norms &n, UBool fast) :
|
||||
Norms::Enumerator(n),
|
||||
yesYesCompositions(1000, (UChar32)0xffff, 2), // 0=inert, 1=Jamo L, 2=start of compositions
|
||||
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul, 1=start of normal data
|
||||
optimizeFast(fast) {}
|
||||
yesNoMappingsAndCompositions(1000, (UChar32)0, 1), // 0=Hangul LV, 1=start of normal data
|
||||
yesNoMappingsOnly(1000, (UChar32)0, 1), // 0=Hangul LVT, 1=start of normal data
|
||||
optimizeFast(fast) {
|
||||
// Hangul LV algorithmically decomposes to two Jamo.
|
||||
// Some code may harmlessly read this firstUnit.
|
||||
yesNoMappingsAndCompositions.setCharAt(0, 2);
|
||||
// Hangul LVT algorithmically decomposes to three Jamo.
|
||||
// Some code may harmlessly read this firstUnit.
|
||||
yesNoMappingsOnly.setCharAt(0, 3);
|
||||
}
|
||||
|
||||
int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) {
|
||||
UnicodeString &m=*norm.mapping;
|
||||
int32_t length=m.length();
|
||||
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"mapping for U+%04lX longer than maximum of %d\n",
|
||||
(long)c, Normalizer2Impl::MAPPING_LENGTH_MASK);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
// Write the mapping & raw mapping extraData.
|
||||
int32_t firstUnit=length|(norm.trailCC<<8);
|
||||
int32_t preMappingLength=0;
|
||||
|
@ -81,9 +82,6 @@ int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &data
|
|||
++preMappingLength;
|
||||
firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD;
|
||||
}
|
||||
if(norm.hasNoCompBoundaryAfter) {
|
||||
firstUnit|=Normalizer2Impl::MAPPING_NO_COMP_BOUNDARY_AFTER;
|
||||
}
|
||||
dataString.append((UChar)firstUnit);
|
||||
dataString.append(m);
|
||||
return preMappingLength;
|
||||
|
@ -109,6 +107,22 @@ int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm,
|
|||
return offset;
|
||||
}
|
||||
|
||||
/**
 * Tries to replace the stored mapping of c with a compact, algorithmic encoding:
 * a delta to a single compYesAndZeroCC code point.
 * On success sets norm.type to NO_NO_DELTA and norm.offset to the delta.
 *
 * @param c    the code point whose mapping is being encoded
 * @param norm the properties of c; modified only when TRUE is returned
 * @return TRUE if the delta encoding was applied, FALSE if norm is untouched
 */
UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const {
    const UChar32 target=norm.mappingCP;
    if(target<0) {
        // A negative mappingCP never qualifies for the single-code-point encoding.
        return FALSE;
    }
    if(c<=0x7f && target>0x7f) {
        // Do not map from ASCII to non-ASCII.
        return FALSE;
    }
    if(norms.getNormRef(target).type>=Norm::NO_NO_COMP_YES) {
        // Only targets whose type sorts below NO_NO_COMP_YES are eligible.
        return FALSE;
    }
    const int32_t delta=target-c;
    if(delta<-Normalizer2Impl::MAX_DELTA || Normalizer2Impl::MAX_DELTA<delta) {
        // The delta does not fit into the encodable range.
        return FALSE;
    }
    norm.type=Norm::NO_NO_DELTA;
    norm.offset=delta;
    return TRUE;
}
|
||||
|
||||
void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) {
|
||||
if(norm.cc!=0) {
|
||||
fprintf(stderr,
|
||||
|
@ -189,29 +203,27 @@ void ExtraData::writeExtraData(UChar32 c, Norm &norm) {
|
|||
norm.offset=yesNoMappingsOnly.length()+
|
||||
writeMapping(c, norm, yesNoMappingsOnly);
|
||||
break;
|
||||
case Norm::NO_NO:
|
||||
if(norm.cc==0 && !optimizeFast) {
|
||||
// Try a compact, algorithmic encoding.
|
||||
// Only for ccc=0, because we can't store additional information
|
||||
// and we do not recursively follow an algorithmic encoding for access to the ccc.
|
||||
//
|
||||
// Also, if hasNoCompBoundaryAfter is set, we can only use the algorithmic encoding
|
||||
// if the mappingCP decomposes further, to ensure that there is a place to store it.
|
||||
// We want to see that the final mapping does not have exactly 1 code point,
|
||||
// or else we would have to recursively ensure that the final mapping is stored
|
||||
// in normal extraData.
|
||||
if(norm.mappingCP>=0 &&
|
||||
(!norm.hasNoCompBoundaryAfter || 1!=norm.mapping->countChar32())) {
|
||||
int32_t delta=norm.mappingCP-c;
|
||||
if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) {
|
||||
norm.type=Norm::NO_NO_DELTA;
|
||||
norm.offset=delta;
|
||||
break;
|
||||
}
|
||||
}
|
||||
case Norm::NO_NO_COMP_YES:
|
||||
if(!optimizeFast && setNoNoDelta(c, norm)) {
|
||||
break;
|
||||
}
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
norm.offset=writeNoNoMapping(c, norm, noNoMappings, previousNoNoMappings);
|
||||
norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes);
|
||||
break;
|
||||
case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
|
||||
if(!optimizeFast && setNoNoDelta(c, norm)) {
|
||||
break;
|
||||
}
|
||||
norm.offset=writeNoNoMapping(
|
||||
c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore);
|
||||
break;
|
||||
case Norm::NO_NO_COMP_NO_MAYBE_CC:
|
||||
norm.offset=writeNoNoMapping(
|
||||
c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC);
|
||||
break;
|
||||
case Norm::NO_NO_EMPTY:
|
||||
// There can be multiple extra data entries for mappings to the empty string
|
||||
// if they have different raw mappings.
|
||||
norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty);
|
||||
break;
|
||||
case Norm::MAYBE_YES_COMBINES_FWD:
|
||||
norm.offset=maybeYesCompositions.length();
|
||||
|
|
|
@ -36,7 +36,10 @@ public:
|
|||
UnicodeString yesYesCompositions;
|
||||
UnicodeString yesNoMappingsAndCompositions;
|
||||
UnicodeString yesNoMappingsOnly;
|
||||
UnicodeString noNoMappings;
|
||||
UnicodeString noNoMappingsCompYes;
|
||||
UnicodeString noNoMappingsCompBoundaryBefore;
|
||||
UnicodeString noNoMappingsCompNoMaybeCC;
|
||||
UnicodeString noNoMappingsEmpty;
|
||||
|
||||
private:
|
||||
/**
|
||||
|
@ -48,12 +51,16 @@ private:
|
|||
int32_t writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString);
|
||||
int32_t writeNoNoMapping(UChar32 c, const Norm &norm,
|
||||
UnicodeString &dataString, Hashtable &previousMappings);
|
||||
UBool setNoNoDelta(UChar32 c, Norm &norm) const;
|
||||
/** Requires norm.compositions!=nullptr. */
|
||||
void writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString);
|
||||
void writeExtraData(UChar32 c, Norm &norm);
|
||||
|
||||
UBool optimizeFast;
|
||||
Hashtable previousNoNoMappings; // If constructed in runtime code, pass in UErrorCode.
|
||||
Hashtable previousNoNoMappingsCompYes; // If constructed in runtime code, pass in UErrorCode.
|
||||
Hashtable previousNoNoMappingsCompBoundaryBefore;
|
||||
Hashtable previousNoNoMappingsCompNoMaybeCC;
|
||||
Hashtable previousNoNoMappingsEmpty;
|
||||
};
|
||||
|
||||
U_NAMESPACE_END
|
||||
|
|
|
@ -56,8 +56,8 @@ static UDataInfo dataInfo={
|
|||
0,
|
||||
|
||||
{ 0x4e, 0x72, 0x6d, 0x32 }, /* dataFormat="Nrm2" */
|
||||
{ 2, 0, 0, 0 }, /* formatVersion */
|
||||
{ 5, 2, 0, 0 } /* dataVersion (Unicode version) */
|
||||
{ 3, 0, 0, 0 }, /* formatVersion */
|
||||
{ 10, 0, 0, 0 } /* dataVersion (Unicode version) */
|
||||
};
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -65,8 +65,7 @@ U_NAMESPACE_BEGIN
|
|||
class HangulIterator {
|
||||
public:
|
||||
struct Range {
|
||||
UChar32 start, limit;
|
||||
uint16_t norm16;
|
||||
UChar32 start, end;
|
||||
};
|
||||
|
||||
HangulIterator() : rangeIndex(0) {}
|
||||
|
@ -77,18 +76,17 @@ public:
|
|||
return NULL;
|
||||
}
|
||||
}
|
||||
void reset() { rangeIndex=0; }
|
||||
private:
|
||||
static const Range ranges[4];
|
||||
int32_t rangeIndex;
|
||||
};
|
||||
|
||||
const HangulIterator::Range HangulIterator::ranges[4]={
|
||||
{ Hangul::JAMO_L_BASE, Hangul::JAMO_L_BASE+Hangul::JAMO_L_COUNT, 1 },
|
||||
{ Hangul::JAMO_V_BASE, Hangul::JAMO_V_BASE+Hangul::JAMO_V_COUNT, Normalizer2Impl::JAMO_VT },
|
||||
{ Hangul::JAMO_L_BASE, Hangul::JAMO_L_END },
|
||||
{ Hangul::JAMO_V_BASE, Hangul::JAMO_V_END },
|
||||
// JAMO_T_BASE+1: not U+11A7
|
||||
{ Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_BASE+Hangul::JAMO_T_COUNT, Normalizer2Impl::JAMO_VT },
|
||||
{ Hangul::HANGUL_BASE, Hangul::HANGUL_BASE+Hangul::HANGUL_COUNT, 0 }, // will become minYesNo
|
||||
{ Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END },
|
||||
{ Hangul::HANGUL_BASE, Hangul::HANGUL_END },
|
||||
};
|
||||
|
||||
Normalizer2DataBuilder::Normalizer2DataBuilder(UErrorCode &errorCode) :
|
||||
|
@ -200,58 +198,109 @@ void Normalizer2DataBuilder::setRoundTripMapping(UChar32 c, const UnicodeString
|
|||
}
|
||||
|
||||
void Normalizer2DataBuilder::removeMapping(UChar32 c) {
|
||||
Norm *p=checkNormForMapping(norms.getNorm(c), c);
|
||||
if(p!=NULL) {
|
||||
p->mappingType=Norm::REMOVED;
|
||||
}
|
||||
// createNorm(c), not getNorm(c), to record a non-mapping and detect conflicting data.
|
||||
Norm *p=checkNormForMapping(norms.createNorm(c), c);
|
||||
p->mappingType=Norm::REMOVED;
|
||||
}
|
||||
|
||||
UBool Normalizer2DataBuilder::hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer) {
|
||||
UBool Normalizer2DataBuilder::mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const {
|
||||
if(buffer.isEmpty()) {
|
||||
return TRUE; // Maps-to-empty-string is no boundary of any kind.
|
||||
return FALSE; // Maps-to-empty-string is no boundary of any kind.
|
||||
}
|
||||
int32_t lastStarterIndex=buffer.lastStarterIndex();
|
||||
if(lastStarterIndex<0) {
|
||||
return TRUE; // no starter
|
||||
return FALSE; // no starter
|
||||
}
|
||||
UChar32 starter=buffer.charAt(lastStarterIndex);
|
||||
if(lastStarterIndex==0 && norms.combinesBack(starter)) {
|
||||
// The last starter is at the beginning of the mapping and combines backward.
|
||||
return FALSE;
|
||||
}
|
||||
if(Hangul::isJamoL(starter) ||
|
||||
(Hangul::isJamoV(starter) &&
|
||||
0<lastStarterIndex && Hangul::isJamoL(buffer.charAt(lastStarterIndex-1)))) {
|
||||
// A Jamo leading consonant or an LV pair combines-forward if it is at the end,
|
||||
// otherwise it is blocked.
|
||||
return lastStarterIndex==buffer.length()-1;
|
||||
return lastStarterIndex!=buffer.length()-1;
|
||||
}
|
||||
// Note: There can be no Hangul syllable in the fully decomposed mapping.
|
||||
const Norm *starterNorm=norms.getNorm(starter);
|
||||
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
|
||||
return FALSE; // The last starter does not combine forward.
|
||||
|
||||
// Multiple starters can combine into one.
|
||||
// Look for the first of the last sequence of starters, excluding Jamos.
|
||||
int32_t i=lastStarterIndex;
|
||||
UChar32 c;
|
||||
while(0<i && buffer.ccAt(i-1)==0 && !Hangul::isJamo(c=buffer.charAt(i-1))) {
|
||||
starter=c;
|
||||
--i;
|
||||
}
|
||||
// Compose as far as possible, and see if further compositions with
|
||||
// characters following this mapping are possible.
|
||||
const Norm *starterNorm=norms.getNorm(starter);
|
||||
if(i==lastStarterIndex &&
|
||||
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
|
||||
return TRUE; // The last starter does not combine forward.
|
||||
}
|
||||
// Compose as far as possible, and see if further compositions are possible.
|
||||
uint8_t prevCC=0;
|
||||
for(int32_t combMarkIndex=lastStarterIndex+1; combMarkIndex<buffer.length(); ++combMarkIndex) {
|
||||
uint8_t cc=buffer.ccAt(combMarkIndex); // !=0 because after last starter
|
||||
if(norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
|
||||
while(++i<buffer.length()) {
|
||||
uint8_t cc=buffer.ccAt(i); // !=0 if after last starter
|
||||
if(i>lastStarterIndex && norms.combinesWithCCBetween(*starterNorm, prevCC, cc)) {
|
||||
// The starter combines with a mark that reorders before the current one.
|
||||
return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
if(prevCC<cc && (starter=starterNorm->combine(buffer.charAt(combMarkIndex)))>=0) {
|
||||
// The starter combines with this mark into a composite replacement starter.
|
||||
UChar32 c=buffer.charAt(i);
|
||||
if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
|
||||
norms.getNormRef(c).combinesBack && (starter=starterNorm->combine(c))>=0) {
|
||||
// The starter combines with c into a composite replacement starter.
|
||||
starterNorm=norms.getNorm(starter);
|
||||
if(starterNorm==nullptr || starterNorm->compositions==nullptr) {
|
||||
return FALSE; // The composite does not combine further.
|
||||
if(i>=lastStarterIndex &&
|
||||
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
|
||||
return TRUE; // The composite does not combine further.
|
||||
}
|
||||
// Keep prevCC because we "removed" the combining mark.
|
||||
} else if(cc==0) {
|
||||
starterNorm=norms.getNorm(c);
|
||||
if(i==lastStarterIndex &&
|
||||
(starterNorm==nullptr || starterNorm->compositions==nullptr)) {
|
||||
return TRUE; // The new starter does not combine forward.
|
||||
}
|
||||
prevCC=0;
|
||||
} else {
|
||||
prevCC=cc;
|
||||
}
|
||||
}
|
||||
if(prevCC==0) {
|
||||
return TRUE; // forward-combining starter at the very end
|
||||
return FALSE; // forward-combining starter at the very end
|
||||
}
|
||||
if(norms.combinesWithCCBetween(*starterNorm, prevCC, 256)) {
|
||||
// The starter combines with another mark.
|
||||
return TRUE;
|
||||
return FALSE;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
/**
 * Tests whether the mapping held in buffer recomposes by itself,
 * that is, whether some part of it would be changed by composition.
 *
 * @param buffer the mapping's code points together with their ccc values
 *               (read via charAt()/ccAt())
 * @return TRUE if a starter in the mapping combines with a later character
 *         (starterNorm->combine(c)>=0), or if a Jamo L is immediately
 *         followed by a Jamo V; FALSE otherwise, including when the
 *         mapping contains no starter at all
 */
UBool Normalizer2DataBuilder::mappingRecomposes(const BuilderReorderingBuffer &buffer) const {
|
||||
if(buffer.lastStarterIndex()<0) {
|
||||
return FALSE; // no starter
|
||||
}
|
||||
// Norm data of the most recent starter, or nullptr if there is none to combine with.
const Norm *starterNorm=nullptr;
|
||||
// ccc of the previous character in the buffer; 0 before the first character.
uint8_t prevCC=0;
|
||||
for(int32_t i=0; i<buffer.length(); ++i) {
|
||||
UChar32 c=buffer.charAt(i);
|
||||
uint8_t cc=buffer.ccAt(i);
|
||||
// Try to combine c with the current starter, but only if the preceding
// character has a lower ccc than c or is itself a starter (ccc=0).
if(starterNorm!=nullptr && (prevCC<cc || prevCC==0) &&
|
||||
norms.getNormRef(c).combinesBack && starterNorm->combine(c)>=0) {
|
||||
return TRUE; // normal composite
|
||||
} else if(cc==0) {
|
||||
// c is a starter that did not combine with the previous starter.
if(Hangul::isJamoL(c)) {
|
||||
if((i+1)<buffer.length() && Hangul::isJamoV(buffer.charAt(i+1))) {
|
||||
return TRUE; // Hangul syllable
|
||||
}
|
||||
// Lone Jamo L: no Norm-based combine() is attempted against it.
starterNorm=nullptr;
|
||||
} else {
|
||||
// c becomes the starter for the following characters
// (getNorm() yields nullptr when c has no data).
starterNorm=norms.getNorm(c);
|
||||
}
|
||||
}
|
||||
prevCC=cc;
|
||||
}
|
||||
return FALSE;
|
||||
}
|
||||
|
@ -264,6 +313,10 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
|
|||
// Therefore, we cannot compute algorithmic mapping deltas here.
|
||||
// Error conditions are checked, but printed later when we do know the offending code point.
|
||||
if(norm.hasMapping()) {
|
||||
if(norm.mapping->length()>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
norm.error="mapping longer than maximum of 31";
|
||||
return;
|
||||
}
|
||||
// Ensure canonical order.
|
||||
BuilderReorderingBuffer buffer;
|
||||
if(norm.rawMapping!=nullptr) {
|
||||
|
@ -272,20 +325,20 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
|
|||
}
|
||||
norms.reorder(*norm.mapping, buffer);
|
||||
if(buffer.isEmpty()) {
|
||||
norm.leadCC=norm.trailCC=0;
|
||||
// A character that is deleted (maps to an empty string) must
|
||||
// get the worst-case lccc and tccc values because arbitrary
|
||||
// characters on both sides will become adjacent.
|
||||
norm.leadCC=1;
|
||||
norm.trailCC=0xff;
|
||||
} else {
|
||||
norm.leadCC=buffer.ccAt(0);
|
||||
norm.trailCC=buffer.ccAt(buffer.length()-1);
|
||||
}
|
||||
|
||||
// Set the hasNoCompBoundaryAfter flag for use by the last code branch
|
||||
// in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
// For details see the comments on hasNoCompBoundaryAfter(buffer).
|
||||
if(norm.compositions!=nullptr) {
|
||||
norm.hasNoCompBoundaryAfter=TRUE;
|
||||
} else {
|
||||
norm.hasNoCompBoundaryAfter=hasNoCompBoundaryAfter(buffer);
|
||||
}
|
||||
norm.hasCompBoundaryBefore=
|
||||
!buffer.isEmpty() && norm.leadCC==0 && !norms.combinesBack(buffer.charAt(0));
|
||||
norm.hasCompBoundaryAfter=
|
||||
norm.compositions==nullptr && mappingHasCompBoundaryAfter(buffer);
|
||||
|
||||
if(norm.combinesBack) {
|
||||
norm.error="combines-back and decomposes, not possible in Unicode normalization";
|
||||
|
@ -299,13 +352,25 @@ void Normalizer2DataBuilder::postProcess(Norm &norm) {
|
|||
if(norm.compositions!=NULL) {
|
||||
norm.error="combines-forward and has a one-way mapping, "
|
||||
"not possible in Unicode normalization";
|
||||
} else if(buffer.isEmpty()) {
|
||||
norm.type=Norm::NO_NO_EMPTY;
|
||||
} else if(!norm.hasCompBoundaryBefore) {
|
||||
norm.type=Norm::NO_NO_COMP_NO_MAYBE_CC;
|
||||
} else if(mappingRecomposes(buffer)) {
|
||||
norm.type=Norm::NO_NO_COMP_BOUNDARY_BEFORE;
|
||||
} else {
|
||||
norm.type=Norm::NO_NO;
|
||||
// The mapping is comp-normalized.
|
||||
norm.type=Norm::NO_NO_COMP_YES;
|
||||
}
|
||||
}
|
||||
} else { // no mapping
|
||||
norm.leadCC=norm.trailCC=norm.cc;
|
||||
|
||||
norm.hasCompBoundaryBefore=
|
||||
norm.cc==0 && !norm.combinesBack;
|
||||
norm.hasCompBoundaryAfter=
|
||||
norm.cc==0 && !norm.combinesBack && norm.compositions==nullptr;
|
||||
|
||||
if(norm.combinesBack) {
|
||||
if(norm.compositions!=nullptr) {
|
||||
// Earlier code checked ccc=0.
|
||||
|
@ -339,13 +404,6 @@ void Normalizer2DataBuilder::setSmallFCD(UChar32 c) {
|
|||
}
|
||||
|
||||
void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm) {
|
||||
if(start<Normalizer2Impl::MIN_CCC_LCCC_CP && (norm.cc!=0 || norm.leadCC!=0)) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"U+%04lX below U+0300 has ccc!=0 or lccc!=0, not supported by ICU\n",
|
||||
(long)start);
|
||||
exit(U_INVALID_FORMAT_ERROR);
|
||||
}
|
||||
if((norm.leadCC|norm.trailCC)!=0) {
|
||||
for(UChar32 c=start; c<=end; ++c) {
|
||||
setSmallFCD(c);
|
||||
|
@ -355,37 +413,60 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm)
|
|||
int32_t norm16;
|
||||
switch(norm.type) {
|
||||
case Norm::INERT:
|
||||
norm16=0;
|
||||
norm16=Normalizer2Impl::INERT;
|
||||
break;
|
||||
case Norm::YES_YES_COMBINES_FWD:
|
||||
norm16=norm.offset;
|
||||
norm16=norm.offset*2;
|
||||
break;
|
||||
case Norm::YES_NO_COMBINES_FWD:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset;
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO]+norm.offset*2;
|
||||
break;
|
||||
case Norm::YES_NO_MAPPING_ONLY:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset;
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]+norm.offset*2;
|
||||
break;
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
case Norm::NO_NO:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset;
|
||||
case Norm::NO_NO_COMP_YES:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO]+norm.offset*2;
|
||||
break;
|
||||
case Norm::NO_NO_COMP_BOUNDARY_BEFORE:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]+norm.offset*2;
|
||||
break;
|
||||
case Norm::NO_NO_COMP_NO_MAYBE_CC:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]+norm.offset*2;
|
||||
break;
|
||||
case Norm::NO_NO_EMPTY:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]+norm.offset*2;
|
||||
break;
|
||||
case Norm::NO_NO_DELTA:
|
||||
norm16=getCenterNoNoDelta()+norm.offset;
|
||||
break;
|
||||
{
|
||||
// Positive offset from minNoNoDelta, shifted left for additional bits.
|
||||
int32_t offset=(norm.offset+Normalizer2Impl::MAX_DELTA)<<Normalizer2Impl::DELTA_SHIFT;
|
||||
if(norm.trailCC==0) {
|
||||
// DELTA_TCCC_0==0
|
||||
} else if(norm.trailCC==1) {
|
||||
offset|=Normalizer2Impl::DELTA_TCCC_1;
|
||||
} else {
|
||||
offset|=Normalizer2Impl::DELTA_TCCC_GT_1;
|
||||
}
|
||||
norm16=getMinNoNoDelta()+offset;
|
||||
break;
|
||||
}
|
||||
case Norm::MAYBE_YES_COMBINES_FWD:
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset;
|
||||
norm16=indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]+norm.offset*2;
|
||||
break;
|
||||
case Norm::MAYBE_YES_SIMPLE:
|
||||
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc; // ccc=0..255
|
||||
norm16=Normalizer2Impl::MIN_NORMAL_MAYBE_YES+norm.cc*2; // ccc=0..255
|
||||
break;
|
||||
case Norm::YES_YES_WITH_CC:
|
||||
U_ASSERT(norm.cc!=0);
|
||||
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-1+norm.cc; // ccc=1..255
|
||||
norm16=Normalizer2Impl::MIN_YES_YES_WITH_CC-2+norm.cc*2; // ccc=1..255
|
||||
break;
|
||||
default: // Should not occur.
|
||||
exit(U_INTERNAL_PROGRAM_ERROR);
|
||||
}
|
||||
U_ASSERT((norm16&1)==0);
|
||||
if(norm.hasCompBoundaryAfter) {
|
||||
norm16|=Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
|
||||
}
|
||||
IcuToolErrorCode errorCode("gennorm2/writeNorm16()");
|
||||
utrie2_setRange32(norm16Trie, start, end, (uint32_t)norm16, TRUE, errorCode);
|
||||
|
||||
|
@ -396,10 +477,13 @@ void Normalizer2DataBuilder::writeNorm16(UChar32 start, UChar32 end, Norm &norm)
|
|||
if(isDecompNo && start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=start;
|
||||
}
|
||||
UBool isCompNoMaybe= norm.type>=Norm::NO_NO;
|
||||
UBool isCompNoMaybe= norm.type>=Norm::NO_NO_COMP_YES;
|
||||
if(isCompNoMaybe && start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=start;
|
||||
}
|
||||
if(norm.leadCC!=0 && start<indexes[Normalizer2Impl::IX_MIN_LCCC_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=start;
|
||||
}
|
||||
}
|
||||
|
||||
void Normalizer2DataBuilder::setHangulData() {
|
||||
|
@ -407,8 +491,8 @@ void Normalizer2DataBuilder::setHangulData() {
|
|||
const HangulIterator::Range *range;
|
||||
// Check that none of the Hangul/Jamo code points have data.
|
||||
while((range=hi.nextRange())!=NULL) {
|
||||
for(UChar32 c=range->start; c<range->limit; ++c) {
|
||||
if(utrie2_get32(norm16Trie, c)!=0) {
|
||||
for(UChar32 c=range->start; c<=range->end; ++c) {
|
||||
if(utrie2_get32(norm16Trie, c)>Normalizer2Impl::INERT) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
"illegal mapping/composition/ccc data for Hangul or Jamo U+%04lX\n",
|
||||
|
@ -419,32 +503,62 @@ void Normalizer2DataBuilder::setHangulData() {
|
|||
}
|
||||
// Set data for algorithmic runtime handling.
|
||||
IcuToolErrorCode errorCode("gennorm2/setHangulData()");
|
||||
hi.reset();
|
||||
while((range=hi.nextRange())!=NULL) {
|
||||
uint16_t norm16=range->norm16;
|
||||
if(norm16==0) {
|
||||
norm16=(uint16_t)indexes[Normalizer2Impl::IX_MIN_YES_NO]; // Hangul LV/LVT encoded as minYesNo
|
||||
if(range->start<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=range->start;
|
||||
}
|
||||
} else {
|
||||
if(range->start<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) { // Jamo V/T are maybeYes
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=range->start;
|
||||
}
|
||||
}
|
||||
utrie2_setRange32(norm16Trie, range->start, range->limit-1, norm16, TRUE, errorCode);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
// Jamo V/T are maybeYes
|
||||
if(Hangul::JAMO_V_BASE<indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=Hangul::JAMO_V_BASE;
|
||||
}
|
||||
utrie2_setRange32(norm16Trie, Hangul::JAMO_L_BASE, Hangul::JAMO_L_END,
|
||||
Normalizer2Impl::JAMO_L, TRUE, errorCode);
|
||||
utrie2_setRange32(norm16Trie, Hangul::JAMO_V_BASE, Hangul::JAMO_V_END,
|
||||
Normalizer2Impl::JAMO_VT, TRUE, errorCode);
|
||||
// JAMO_T_BASE+1: not U+11A7
|
||||
utrie2_setRange32(norm16Trie, Hangul::JAMO_T_BASE+1, Hangul::JAMO_T_END,
|
||||
Normalizer2Impl::JAMO_VT, TRUE, errorCode);
|
||||
|
||||
// Hangul LV encoded as minYesNo
|
||||
uint32_t lv=indexes[Normalizer2Impl::IX_MIN_YES_NO];
|
||||
// Hangul LVT encoded as minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER
|
||||
uint32_t lvt=indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]|
|
||||
Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER;
|
||||
if(Hangul::HANGUL_BASE<indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]) {
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=Hangul::HANGUL_BASE;
|
||||
}
|
||||
// Set the first LV, then write all other Hangul syllables as LVT,
|
||||
// then overwrite the remaining LV.
|
||||
// The UTrie2 should be able to compact this into 7 32-item blocks
|
||||
// because JAMO_T_COUNT is 28 and the UTrie2 granularity is 4.
|
||||
// (7*32=8*28 is the least common multiple of 32 and 28.)
|
||||
utrie2_set32(norm16Trie, Hangul::HANGUL_BASE, lv, errorCode);
|
||||
utrie2_setRange32(norm16Trie, Hangul::HANGUL_BASE+1, Hangul::HANGUL_END,
|
||||
lvt, TRUE, errorCode);
|
||||
UChar32 c=Hangul::HANGUL_BASE;
|
||||
while((c+=Hangul::JAMO_T_COUNT)<=Hangul::HANGUL_END) {
|
||||
utrie2_set32(norm16Trie, c, lv, errorCode);
|
||||
}
|
||||
errorCode.assertSuccess();
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * Accumulator passed as the context pointer to enumRangeMaxValue():
 * collects a summary of all norm16 values visited during one trie enumeration.
 */
struct Norm16Summary {
|
||||
// Largest norm16 value seen so far.
uint32_t maxNorm16;
|
||||
// ANDing values yields 0 bits where any value has a 0.
|
||||
// Used for worst-case HAS_COMP_BOUNDARY_AFTER.
|
||||
uint32_t andedNorm16;
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
U_CDECL_BEGIN
|
||||
|
||||
static UBool U_CALLCONV
|
||||
enumRangeMaxValue(const void *context, UChar32 /*start*/, UChar32 /*end*/, uint32_t value) {
|
||||
uint32_t *pMaxValue=(uint32_t *)context;
|
||||
if(value>*pMaxValue) {
|
||||
*pMaxValue=value;
|
||||
Norm16Summary *p=(Norm16Summary *)context;
|
||||
if(value>p->maxNorm16) {
|
||||
p->maxNorm16=value;
|
||||
}
|
||||
p->andedNorm16&=value;
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
@ -452,7 +566,7 @@ U_CDECL_END
|
|||
|
||||
void Normalizer2DataBuilder::processData() {
|
||||
IcuToolErrorCode errorCode("gennorm2/processData()");
|
||||
norm16Trie=utrie2_open(0, 0, errorCode);
|
||||
norm16Trie=utrie2_open(Normalizer2Impl::INERT, Normalizer2Impl::INERT, errorCode);
|
||||
errorCode.assertSuccess();
|
||||
|
||||
// Build composition lists before recursive decomposition,
|
||||
|
@ -479,26 +593,37 @@ void Normalizer2DataBuilder::processData() {
|
|||
norms.enumRanges(extra);
|
||||
|
||||
extraData=extra.yesYesCompositions;
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO]=extraData.length()*2;
|
||||
extraData.append(extra.yesNoMappingsAndCompositions);
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]=extraData.length()*2;
|
||||
extraData.append(extra.yesNoMappingsOnly);
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length();
|
||||
extraData.append(extra.noNoMappings);
|
||||
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length();
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO]=extraData.length()*2;
|
||||
extraData.append(extra.noNoMappingsCompYes);
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]=extraData.length()*2;
|
||||
extraData.append(extra.noNoMappingsCompBoundaryBefore);
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]=extraData.length()*2;
|
||||
extraData.append(extra.noNoMappingsCompNoMaybeCC);
|
||||
indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]=extraData.length()*2;
|
||||
extraData.append(extra.noNoMappingsEmpty);
|
||||
indexes[Normalizer2Impl::IX_LIMIT_NO_NO]=extraData.length()*2;
|
||||
|
||||
// Pad the maybeYesCompositions length to a multiple of 4,
|
||||
// so that NO_NO_DELTA bits 2..1 can be used without subtracting the center.
|
||||
while(extra.maybeYesCompositions.length()&3) {
|
||||
extra.maybeYesCompositions.append((UChar)0);
|
||||
}
|
||||
extraData.insert(0, extra.maybeYesCompositions);
|
||||
indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]=
|
||||
Normalizer2Impl::MIN_NORMAL_MAYBE_YES-
|
||||
extra.maybeYesCompositions.length();
|
||||
extra.maybeYesCompositions.length()*2;
|
||||
|
||||
// Pad to even length for 4-byte alignment of following data.
|
||||
if(extraData.length()&1) {
|
||||
extraData.append((UChar)0);
|
||||
}
|
||||
|
||||
int32_t minNoNoDelta=getCenterNoNoDelta()-Normalizer2Impl::MAX_DELTA;
|
||||
int32_t minNoNoDelta=getMinNoNoDelta();
|
||||
U_ASSERT((minNoNoDelta&7)==0);
|
||||
if(indexes[Normalizer2Impl::IX_LIMIT_NO_NO]>minNoNoDelta) {
|
||||
fprintf(stderr,
|
||||
"gennorm2 error: "
|
||||
|
@ -509,6 +634,7 @@ void Normalizer2DataBuilder::processData() {
|
|||
// writeNorm16() and setHangulData() reduce these as needed.
|
||||
indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]=0x110000;
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=0x110000;
|
||||
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=0x110000;
|
||||
|
||||
// Map each code point to its norm16 value,
|
||||
// including the properties that fit directly,
|
||||
|
@ -529,17 +655,21 @@ void Normalizer2DataBuilder::processData() {
|
|||
// inner loops if necessary.
|
||||
// However, that seems like overkill for an optimization for supplementary characters.
|
||||
for(UChar lead=0xd800; lead<0xdc00; ++lead) {
|
||||
uint32_t maxValue=utrie2_get32(norm16Trie, lead);
|
||||
utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &maxValue);
|
||||
if( maxValue>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
|
||||
maxValue>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]
|
||||
) {
|
||||
uint32_t surrogateCPNorm16=utrie2_get32(norm16Trie, lead);
|
||||
Norm16Summary summary={ surrogateCPNorm16, surrogateCPNorm16 };
|
||||
utrie2_enumForLeadSurrogate(norm16Trie, lead, NULL, enumRangeMaxValue, &summary);
|
||||
uint32_t norm16=summary.maxNorm16;
|
||||
if(norm16>=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO] &&
|
||||
norm16>(uint32_t)indexes[Normalizer2Impl::IX_MIN_NO_NO]) {
|
||||
// Set noNo ("worst" value) if it got into "less-bad" maybeYes or ccc!=0.
|
||||
// Otherwise it might end up at something like JAMO_VT which stays in
|
||||
// the inner decomposition quick check loop.
|
||||
maxValue=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
|
||||
norm16=(uint32_t)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]-1;
|
||||
}
|
||||
utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, maxValue, errorCode);
|
||||
norm16=
|
||||
(norm16&~Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER)|
|
||||
(summary.andedNorm16&Normalizer2Impl::HAS_COMP_BOUNDARY_AFTER);
|
||||
utrie2_set32ForLeadSurrogateCodeUnit(norm16Trie, lead, norm16, errorCode);
|
||||
}
|
||||
|
||||
// Adjust supplementary minimum code points to break quick check loops at their lead surrogates.
|
||||
|
@ -554,6 +684,10 @@ void Normalizer2DataBuilder::processData() {
|
|||
if(minCP>=0x10000) {
|
||||
indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]=U16_LEAD(minCP);
|
||||
}
|
||||
minCP=indexes[Normalizer2Impl::IX_MIN_LCCC_CP];
|
||||
if(minCP>=0x10000) {
|
||||
indexes[Normalizer2Impl::IX_MIN_LCCC_CP]=U16_LEAD(minCP);
|
||||
}
|
||||
|
||||
utrie2_freeze(norm16Trie, UTRIE2_16_VALUE_BITS, errorCode);
|
||||
norm16TrieLength=utrie2_serialize(norm16Trie, NULL, 0, errorCode);
|
||||
|
@ -583,11 +717,15 @@ void Normalizer2DataBuilder::processData() {
|
|||
printf("size of binary data file contents: %5ld bytes\n", (long)totalSize);
|
||||
printf("minDecompNoCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_DECOMP_NO_CP]);
|
||||
printf("minCompNoMaybeCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_COMP_NO_MAYBE_CP]);
|
||||
printf("minYesNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
|
||||
printf("minLcccCodePoint: U+%04lX\n", (long)indexes[Normalizer2Impl::IX_MIN_LCCC_CP]);
|
||||
printf("minYesNo: (with compositions) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO]);
|
||||
printf("minYesNoMappingsOnly: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY]);
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
printf("minNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
|
||||
printf("minNoNo: (comp-normalized) 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO]);
|
||||
printf("minNoNoCompBoundaryBefore: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
|
||||
printf("minNoNoCompNoMaybeCC: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
|
||||
printf("minNoNoEmpty: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_NO_NO_EMPTY]);
|
||||
printf("limitNoNo: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_LIMIT_NO_NO]);
|
||||
printf("minNoNoDelta: 0x%04x\n", (int)minNoNoDelta);
|
||||
printf("minMaybeYes: 0x%04x\n", (int)indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]);
|
||||
}
|
||||
|
||||
|
|
|
@ -73,22 +73,21 @@ private:
|
|||
Norm *checkNormForMapping(Norm *p, UChar32 c); // check for permitted overrides
|
||||
|
||||
/**
|
||||
* Computes the MAPPING_NO_COMP_BOUNDARY_AFTER flag for a character's mapping
|
||||
* (especially for a "YesNo" which has a round-trip mapping).
|
||||
* This flag is used in Normalizer2Impl::hasCompBoundaryAfter().
|
||||
*
|
||||
* A starter character with a mapping does not have a composition boundary after it
|
||||
* if the character itself combines-forward (which is tested by the caller of this function),
|
||||
* or it is deleted (mapped to the empty string),
|
||||
* or its mapping contains no starter,
|
||||
* or the last starter combines-forward.
|
||||
*/
|
||||
UBool hasNoCompBoundaryAfter(const BuilderReorderingBuffer &buffer);
|
||||
UBool mappingHasCompBoundaryAfter(const BuilderReorderingBuffer &buffer) const;
|
||||
/** Returns TRUE if the mapping by itself recomposes, that is, it is not comp-normalized. */
|
||||
UBool mappingRecomposes(const BuilderReorderingBuffer &buffer) const;
|
||||
void postProcess(Norm &norm);
|
||||
|
||||
void setSmallFCD(UChar32 c);
|
||||
int32_t getCenterNoNoDelta() {
|
||||
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-Normalizer2Impl::MAX_DELTA-1;
|
||||
int32_t getMinNoNoDelta() const {
|
||||
return indexes[Normalizer2Impl::IX_MIN_MAYBE_YES]-
|
||||
((2*Normalizer2Impl::MAX_DELTA+1)<<Normalizer2Impl::DELTA_SHIFT);
|
||||
}
|
||||
void writeNorm16(UChar32 start, UChar32 end, Norm &norm);
|
||||
void setHangulData();
|
||||
|
|
|
@ -99,6 +99,14 @@ Norm *Norms::getNorm(UChar32 c) {
|
|||
return norms+i;
|
||||
}
|
||||
|
||||
/**
 * Const overload of getNorm(): looks up the index stored for c in normTrie.
 *
 * @param c the code point to look up
 * @return pointer to the Norm at that index in the norms array,
 *         or nullptr when the stored index is 0
 */
const Norm *Norms::getNorm(UChar32 c) const {
|
||||
uint32_t i=utrie2_get32(normTrie, c);
|
||||
// Index 0 is treated as "no Norm for this code point".
if(i==0) {
|
||||
return nullptr;
|
||||
}
|
||||
return norms+i;
|
||||
}
|
||||
|
||||
/**
 * Returns a reference to the Norm whose index is stored for c in normTrie.
 * Unlike getNorm(), index 0 is not special-cased here: norms[0] is returned
 * as-is (presumably the shared empty Norm for code points without data;
 * confirm against the class declaration).
 */
const Norm &Norms::getNormRef(UChar32 c) const {
|
||||
return norms[utrie2_get32(normTrie, c)];
|
||||
}
|
||||
|
@ -118,9 +126,7 @@ Norm *Norms::createNorm(UChar32 c) {
|
|||
|
||||
void Norms::reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const {
|
||||
int32_t length=mapping.length();
|
||||
if(length>Normalizer2Impl::MAPPING_LENGTH_MASK) {
|
||||
return; // writeMapping() will complain about it and print the code point.
|
||||
}
|
||||
U_ASSERT(length<=Normalizer2Impl::MAPPING_LENGTH_MASK);
|
||||
const char16_t *s=mapping.getBuffer();
|
||||
int32_t i=0;
|
||||
UChar32 c;
|
||||
|
|
|
@ -89,7 +89,7 @@ struct Norm {
|
|||
UVector32 *compositions; // (trail, composite) pairs
|
||||
uint8_t cc, leadCC, trailCC;
|
||||
UBool combinesBack;
|
||||
UBool hasNoCompBoundaryAfter;
|
||||
UBool hasCompBoundaryBefore, hasCompBoundaryAfter;
|
||||
|
||||
/**
|
||||
* Overall type of normalization properties.
|
||||
|
@ -112,9 +112,14 @@ struct Norm {
|
|||
YES_NO_COMBINES_FWD,
|
||||
/** Starter with a round-trip mapping but no compositions. */
|
||||
YES_NO_MAPPING_ONLY,
|
||||
// TODO: minMappingNotCompYes, minMappingNoCompBoundaryBefore
|
||||
/** Has a one-way mapping. */
|
||||
NO_NO,
|
||||
/** Has a one-way mapping which is comp-normalized. */
|
||||
NO_NO_COMP_YES,
|
||||
/** Has a one-way mapping which is not comp-normalized but has a comp boundary before. */
|
||||
NO_NO_COMP_BOUNDARY_BEFORE,
|
||||
/** Has a one-way mapping which does not have a comp boundary before. */
|
||||
NO_NO_COMP_NO_MAYBE_CC,
|
||||
/** Has a one-way mapping to the empty string. */
|
||||
NO_NO_EMPTY,
|
||||
/** Has an algorithmic one-way mapping to a single code point. */
|
||||
NO_NO_DELTA,
|
||||
/**
|
||||
|
@ -149,11 +154,15 @@ public:
|
|||
Norm *allocNorm();
|
||||
/** Returns an existing Norm unit, or nullptr if c has no data. */
|
||||
Norm *getNorm(UChar32 c);
|
||||
const Norm *getNorm(UChar32 c) const;
|
||||
/** Returns a Norm unit, creating a new one if necessary. */
|
||||
Norm *createNorm(UChar32 c);
|
||||
/** Returns an existing Norm unit, or an immutable empty object if c has no data. */
|
||||
const Norm &getNormRef(UChar32 c) const;
|
||||
/** Returns the cc field of the Norm that getNormRef() yields for c. */
uint8_t getCC(UChar32 c) const { return getNormRef(c).cc; }
|
||||
/**
 * Returns TRUE if c is a Hangul Jamo V or Jamo T,
 * or if the Norm data for c has its combinesBack flag set.
 */
UBool combinesBack(UChar32 c) const {
|
||||
return Hangul::isJamoV(c) || Hangul::isJamoT(c) || getNormRef(c).combinesBack;
|
||||
}
|
||||
|
||||
void reorder(UnicodeString &mapping, BuilderReorderingBuffer &buffer) const;
|
||||
|
||||
|
|
|
@ -185,9 +185,9 @@ public final class Norm2AllModes {
|
|||
return impl.isDecompYes(impl.getNorm16(c)) ? 1 : 0;
|
||||
}
|
||||
@Override
|
||||
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundary(c, true); }
|
||||
public boolean hasBoundaryBefore(int c) { return impl.hasDecompBoundaryBefore(c); }
|
||||
@Override
|
||||
public boolean hasBoundaryAfter(int c) { return impl.hasDecompBoundary(c, false); }
|
||||
public boolean hasBoundaryAfter(int c) { return impl.hasDecompBoundaryAfter(c); }
|
||||
@Override
|
||||
public boolean isInert(int c) { return impl.isDecompInert(c); }
|
||||
}
|
||||
|
@ -238,11 +238,11 @@ public final class Norm2AllModes {
|
|||
public boolean hasBoundaryBefore(int c) { return impl.hasCompBoundaryBefore(c); }
|
||||
@Override
|
||||
public boolean hasBoundaryAfter(int c) {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, false);
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous);
|
||||
}
|
||||
@Override
|
||||
public boolean isInert(int c) {
|
||||
return impl.hasCompBoundaryAfter(c, onlyContiguous, true);
|
||||
return impl.isCompInert(c, onlyContiguous);
|
||||
}
|
||||
|
||||
private final boolean onlyContiguous;
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -223,8 +223,10 @@ public abstract class Normalizer2 {
|
|||
if(spanLength==src.length()) {
|
||||
return (String)src;
|
||||
}
|
||||
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
|
||||
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
|
||||
if (spanLength != 0) {
|
||||
StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
|
||||
return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
|
||||
}
|
||||
}
|
||||
return normalize(src, new StringBuilder(src.length())).toString();
|
||||
}
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a9b7099447b42325988ae448908ba0e690cf9c8259667c49019cc15d3a0fb760
|
||||
size 12224152
|
||||
oid sha256:d4b1866a85ceb079d912a3283e5ec6a7d6988df8c0e56e98fd67def82c35dcf3
|
||||
size 12225515
|
||||
|
|
|
@ -1,3 +1,3 @@
|
|||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a7bc00733ee580f117cfecc6c1790df5c495dea56dddd2472f6253c4baafd664
|
||||
size 812715
|
||||
oid sha256:fd856769e94b963fb8a0b63148c63349198ef0c0ec3729173170ccbfd94c4999
|
||||
size 812769
|
||||
|
|
|
@ -183,8 +183,8 @@ public class BasicTest extends TestFmwk {
|
|||
@Test
|
||||
public void TestCanonCompose() throws Exception{
|
||||
Normalizer norm = new Normalizer("", Normalizer.NFC,0);
|
||||
iterateTest(norm, canonTests, 2);
|
||||
staticTest(Normalizer.NFC, canonTests, 2);
|
||||
iterateTest(norm, canonTests, 2);
|
||||
composeTest(Normalizer.NFC, canonTests, 2);
|
||||
}
|
||||
|
||||
|
@ -2412,6 +2412,10 @@ public class BasicTest extends TestFmwk {
|
|||
return skipSets;
|
||||
}
|
||||
|
||||
// Short labels used in TestSkippable failure messages;
// indexed in parallel with the skipSets/expectSets arrays.
private static String[] kModeStrings = {
|
||||
"D", "C", "KD", "KC"
|
||||
};
|
||||
|
||||
@Test
|
||||
public void TestSkippable() {
|
||||
UnicodeSet[] skipSets = new UnicodeSet[] {
|
||||
|
@ -2440,7 +2444,8 @@ public class BasicTest extends TestFmwk {
|
|||
}
|
||||
for(int i=0; i<expectSets.length; ++i) {
|
||||
if(!skipSets[i].equals(expectSets[i])) {
|
||||
errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n");
|
||||
String ms = kModeStrings[i];
|
||||
errln("error: TestSkippable skipSets["+ms+"]!=expectedSets["+ms+"]\n");
|
||||
// Note: This used to depend on hardcoded UnicodeSet patterns generated by
|
||||
// Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
|
||||
// running com.ibm.text.UCD.Main with the option NFSkippable.
|
||||
|
@ -2797,6 +2802,58 @@ public class BasicTest extends TestFmwk {
|
|||
" \u1E09", out);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestLowMappingToEmpty_D() {
|
||||
Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.DECOMPOSE);
|
||||
checkLowMappingToEmpty(n2);
|
||||
|
||||
String sh = "\u00AD";
|
||||
assertFalse("soft hyphen is not normalized", n2.isNormalized(sh));
|
||||
String result = n2.normalize(sh);
|
||||
assertTrue("soft hyphen normalizes to empty", result.isEmpty());
|
||||
assertEquals("soft hyphen QC=No", Normalizer.NO, n2.quickCheck(sh));
|
||||
assertEquals("soft hyphen spanQuickCheckYes", 0, n2.spanQuickCheckYes(sh));
|
||||
|
||||
String s = "\u00ADÄ\u00AD\u0323";
|
||||
result = n2.normalize(s);
|
||||
assertEquals("normalize string with soft hyphens", "a\u0323\u0308", result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestLowMappingToEmpty_FCD() {
|
||||
Normalizer2 n2 = Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.FCD);
|
||||
checkLowMappingToEmpty(n2);
|
||||
|
||||
String sh = "\u00AD";
|
||||
assertTrue("soft hyphen is FCD", n2.isNormalized(sh));
|
||||
|
||||
String s = "\u00ADÄ\u00AD\u0323";
|
||||
String result = n2.normalize(s);
|
||||
assertEquals("normalize string with soft hyphens", "\u00ADa\u0323\u0308", result);
|
||||
}
|
||||
|
||||
private void checkLowMappingToEmpty(Normalizer2 n2) {
|
||||
String mapping = n2.getDecomposition(0xad);
|
||||
assertNotNull("getDecomposition(soft hyphen)", mapping);
|
||||
assertTrue("soft hyphen maps to empty", mapping.isEmpty());
|
||||
assertFalse("soft hyphen has no boundary before", n2.hasBoundaryBefore(0xad));
|
||||
assertFalse("soft hyphen has no boundary after", n2.hasBoundaryAfter(0xad));
|
||||
assertFalse("soft hyphen is not inert", n2.isInert(0xad));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestNormalizeIllFormedText() {
|
||||
Normalizer2 nfkc_cf = Normalizer2.getNFKCCasefoldInstance();
|
||||
// Normalization behavior for ill-formed text is not defined.
|
||||
// ICU currently treats ill-formed sequences as normalization-inert
|
||||
// and copies them unchanged.
|
||||
String src = " A\uD800ÄA\u0308\uD900A\u0308\u00ad\u0323\uDBFFÄ\u0323," +
|
||||
"\u00ad\uDC00\u1100\u1161가\u11A8가\u3133 \uDFFF";
|
||||
String expected = " a\uD800ää\uD900ạ\u0308\uDBFFạ\u0308,\uDC00가각갃 \uDFFF";
|
||||
String result = nfkc_cf.normalize(src);
|
||||
assertEquals("normalize", expected, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestNFC() {
|
||||
// Coverage tests.
|
||||
|
@ -2848,18 +2905,6 @@ public class BasicTest extends TestFmwk {
|
|||
assertTrue("noop.isInert()", noop.isInert(0x0308));
|
||||
}
|
||||
|
||||
/*
|
||||
* This unit test covers two 'get' methods in class Normalizer2Impl. It only tests that
|
||||
* an object is returned.
|
||||
*/
|
||||
@Test
|
||||
public void TestGetsFromImpl() {
|
||||
Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
|
||||
assertNotEquals("getNormTrie() returns null", null, nfcImpl.getNormTrie());
|
||||
assertNotEquals("getFCD16FromBelow180() returns null", null,
|
||||
nfcImpl.getFCD16FromBelow180(0));
|
||||
}
|
||||
|
||||
/*
|
||||
* Abstract class Normalizer2 has non-abstract methods which are overwritten by
|
||||
* its derived classes. To test these methods a derived class is defined here.
|
||||
|
|
|
@ -36,7 +36,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
static String[] moreCases ={
|
||||
// Markus 2001aug30
|
||||
"0061 0332 0308;00E4 0332;0061 0332 0308;00E4 0332;0061 0332 0308; # Markus 0",
|
||||
|
||||
|
||||
// Markus 2001oct26 - test edge case for iteration: U+0f73.cc==0 but decomposition.lead.cc==129
|
||||
"0061 0301 0F73;00E1 0F71 0F72;0061 0F71 0F72 0301;00E1 0F71 0F72;0061 0F71 0F72 0301; # Markus 1"
|
||||
};
|
||||
|
@ -54,7 +54,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
public void TestConformance_3_2() throws Exception{
|
||||
runConformance("unicode/NormalizationTest-3.2.0.txt",Normalizer.UNICODE_3_2);
|
||||
}
|
||||
|
||||
|
||||
public void runConformance(String fileName, int options) throws Exception{
|
||||
String line = null;
|
||||
String[] fields = new String[5];
|
||||
|
@ -88,10 +88,10 @@ public class ConformanceTest extends TestFmwk {
|
|||
|
||||
// Parse out the fields
|
||||
hexsplit(line, ';', fields, buf);
|
||||
|
||||
|
||||
// Remove a single code point from the "other" UnicodeSet
|
||||
if(fields[0].length()==UTF16.moveCodePointOffset(fields[0],0, 1)) {
|
||||
c=UTF16.charAt(fields[0],0);
|
||||
c=UTF16.charAt(fields[0],0);
|
||||
if(0xac20<=c && c<=0xd73f) {
|
||||
// not an exhaustive test run: skip most Hangul syllables
|
||||
if(c==0xac20) {
|
||||
|
@ -132,7 +132,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
logln("Total: " + passCount + " lines passed");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Verify the conformance of the given line of the Unicode
|
||||
* normalization (UTR 15) test suite file. For each line,
|
||||
|
@ -154,74 +154,16 @@ public class ConformanceTest extends TestFmwk {
|
|||
String out,fcd;
|
||||
int i=0;
|
||||
for (i=0; i<5; ++i) {
|
||||
int fieldNum = i+1;
|
||||
if (i<3) {
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFC, options);
|
||||
pass &= assertEqual("C", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFC, buf, +1,options);
|
||||
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFC, buf, -1,options);
|
||||
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, +1,options);
|
||||
pass &= assertEqual("C(+1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFC, buf, -1,options);
|
||||
pass &= assertEqual("C(-1)", field[i], out, field[1], "c2!=C(c" + (i+1));
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFD);
|
||||
pass &= assertEqual("D", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFD, buf, +1,options);
|
||||
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFD, buf, -1,options);
|
||||
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, +1,options);
|
||||
pass &= assertEqual("D(+1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFD, buf, -1,options);
|
||||
pass &= assertEqual("D(-1)", field[i], out, field[2], "c3!=D(c" + (i+1));
|
||||
|
||||
cross(field[2] /*NFD String*/, field[1]/*NFC String*/, Normalizer.NFC);
|
||||
cross(field[1] /*NFC String*/, field[2]/*NFD String*/, Normalizer.NFD);
|
||||
pass &= checkNorm(Normalizer.NFC, options, field[i], field[1], fieldNum);
|
||||
pass &= checkNorm(Normalizer.NFD, options, field[i], field[2], fieldNum);
|
||||
}
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFKC,options);
|
||||
pass &= assertEqual("KC", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKC, buf, +1,options);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKC, buf, -1,options);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, +1,options);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKC, buf, -1,options);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[3], "c4!=KC(c" + (i+1));
|
||||
|
||||
|
||||
out = Normalizer.normalize(field[i], Normalizer.NFKD,options);
|
||||
pass &= assertEqual("KD", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKD, buf, +1,options);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(field[i], Normalizer.NFKD, buf, -1,options);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, +1,options);
|
||||
pass &= assertEqual("KD(+1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[i]), Normalizer.NFKD, buf, -1,options);
|
||||
pass &= assertEqual("KD(-1)", field[i], out, field[4], "c5!=KD(c" + (i+1));
|
||||
|
||||
pass &= checkNorm(Normalizer.NFKC, options, field[i], field[3], fieldNum);
|
||||
pass &= checkNorm(Normalizer.NFKD, options, field[i], field[4], fieldNum);
|
||||
cross(field[4] /*NFKD String*/, field[3]/*NFKC String*/, Normalizer.NFKC);
|
||||
cross(field[3] /*NFKC String*/, field[4]/*NFKD String*/, Normalizer.NFKD);
|
||||
|
||||
|
||||
}
|
||||
compare(field[1],field[2]);
|
||||
compare(field[0],field[1]);
|
||||
|
@ -243,7 +185,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
errln("Normalizer error: quickCheck(NFKD(s), Normalizer.NFKD) is Normalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
|
||||
if(!Normalizer.isNormalized(field[1], Normalizer.NFC, options)) {
|
||||
errln("Normalizer error: isNormalized(NFC(s), Normalizer.NFC) is false");
|
||||
pass = false;
|
||||
|
@ -298,24 +240,24 @@ public class ConformanceTest extends TestFmwk {
|
|||
errln("Normalizer error: quickCheck(NFKD(s), Normalizer.FCD) is Normalizer.NO");
|
||||
pass = false;
|
||||
}
|
||||
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, +1,options);
|
||||
out = iterativeNorm(new StringCharacterIterator(field[0]), Normalizer.FCD, buf, -1,options);
|
||||
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, +1,options);
|
||||
out = iterativeNorm(new StringCharacterIterator(field[2]), Normalizer.FCD, buf, -1,options);
|
||||
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, +1,options);
|
||||
out = iterativeNorm(new StringCharacterIterator(field[4]), Normalizer.FCD, buf, -1,options);
|
||||
|
||||
|
||||
out=Normalizer.normalize(fcd, Normalizer.NFD);
|
||||
if(!out.equals(field[2])) {
|
||||
errln("Normalizer error: NFD(FCD(s))!=NFD(s)");
|
||||
pass = false;
|
||||
}
|
||||
}
|
||||
if (!pass) {
|
||||
errln("FAIL: " + line);
|
||||
}
|
||||
}
|
||||
if(field[0]!=field[2]) {
|
||||
// two strings that are canonically equivalent must test
|
||||
// equal under a canonical caseless match
|
||||
|
@ -327,9 +269,57 @@ public class ConformanceTest extends TestFmwk {
|
|||
pass=false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return pass;
|
||||
}
|
||||
|
||||
private static int getModeNumber(Normalizer.Mode mode) {
|
||||
if (mode == Normalizer.NFD) { return 0; }
|
||||
if (mode == Normalizer.NFKD) { return 1; }
|
||||
if (mode == Normalizer.NFC) { return 2; }
|
||||
if (mode == Normalizer.NFKC) { return 3; }
|
||||
return -1;
|
||||
}
|
||||
private static final String[] kModeStrings = {
|
||||
"D", "KD", "C", "KC"
|
||||
};
|
||||
private static final String[] kMessages = {
|
||||
"c3!=D(c%d)", "c5!=KC(c%d)", "c2!=C(c%d)", "c4!=KC(c%d)"
|
||||
};
|
||||
|
||||
boolean checkNorm(Normalizer.Mode mode, int options, // Normalizer2 norm2,
|
||||
String s, String exp, int field) throws Exception {
|
||||
String modeString = kModeStrings[getModeNumber(mode)];
|
||||
String msg = String.format(kMessages[getModeNumber(mode)], field);
|
||||
StringBuffer buf = new StringBuffer();
|
||||
String out = Normalizer.normalize(s, mode, options);
|
||||
if (!assertEqual(modeString, "", s, out, exp, msg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
out = iterativeNorm(s, mode, buf, +1,options);
|
||||
if (!assertEqual(modeString, "(+1)", s, out, exp, msg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
out = iterativeNorm(s, mode, buf, -1,options);
|
||||
if (!assertEqual(modeString, "(-1)", s, out, exp, msg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(s), mode, buf, +1,options);
|
||||
if (!assertEqual(modeString, "(+1)", s, out, exp, msg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
out = iterativeNorm(new StringCharacterIterator(s), mode, buf, -1,options);
|
||||
if (!assertEqual(modeString, "(-1)", s, out, exp, msg)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// two strings that are canonically equivalent must test
|
||||
// equal under a canonical caseless match
|
||||
// see UAX #21 Case Mappings and Jitterbug 2021 and
|
||||
|
@ -339,26 +329,26 @@ public class ConformanceTest extends TestFmwk {
|
|||
if(Normalizer.compare(UTF16.charAt(s1,0),UTF16.charAt(s2,0),Normalizer.COMPARE_IGNORE_CASE)!=0){
|
||||
errln("Normalizer.compare(int,int) failed for s1: "
|
||||
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
}
|
||||
if(s1.length()==1 && s2.length()>1){
|
||||
if(Normalizer.compare(UTF16.charAt(s1,0),s2,Normalizer.COMPARE_IGNORE_CASE)!=0){
|
||||
errln("Normalizer.compare(int,String) failed for s1: "
|
||||
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
}
|
||||
if(s1.length()>1 && s2.length()>1){
|
||||
// TODO: Re-enable this test after UTC fixes UAX 21
|
||||
if(Normalizer.compare(s1.toCharArray(),s2.toCharArray(),Normalizer.COMPARE_IGNORE_CASE)!=0){
|
||||
errln("Normalizer.compare(char[],char[]) failed for s1: "
|
||||
+Utility.hex(s1) + " s2: " + Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
private void cross(String s1, String s2,Normalizer.Mode mode){
|
||||
String result = Normalizer.normalize(s1,mode);
|
||||
if(!result.equals(s2)){
|
||||
errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
|
||||
errln("cross test failed s1: " + Utility.hex(s1) + " s2: "
|
||||
+Utility.hex(s2));
|
||||
}
|
||||
}
|
||||
|
@ -389,7 +379,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
}
|
||||
return buf.toString();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Do a normalization using the iterative API in the given direction.
|
||||
* @param str a Java StringCharacterIterator
|
||||
|
@ -421,18 +411,19 @@ public class ConformanceTest extends TestFmwk {
|
|||
|
||||
/**
|
||||
* @param op name of normalization form, e.g., "KC"
|
||||
* @param op2 name of test case variant, e.g., "(-1)"
|
||||
* @param s string being normalized
|
||||
* @param got value received
|
||||
* @param exp expected value
|
||||
* @param msg description of this test
|
||||
* @return true if got equals exp
|
||||
*/
|
||||
private boolean assertEqual(String op, String s, String got,
|
||||
private boolean assertEqual(String op, String op2, String s, String got,
|
||||
String exp, String msg) {
|
||||
if (exp.equals(got)) {
|
||||
return true;
|
||||
}
|
||||
errln((" " + msg + ") " + op + "(" + s + ")=" + hex(got) +
|
||||
errln((" " + msg + ": " + op + op2 + '(' + s + ")=" + hex(got) +
|
||||
", exp. " + hex(exp)));
|
||||
return false;
|
||||
}
|
||||
|
@ -459,7 +450,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
}
|
||||
// Our field is from pos..delim-1.
|
||||
buf.setLength(0);
|
||||
|
||||
|
||||
String toHex = s.substring(pos,delim);
|
||||
pos = delim;
|
||||
int index = 0;
|
||||
|
@ -478,7 +469,7 @@ public class ConformanceTest extends TestFmwk {
|
|||
index = spacePos+1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (buf.length() < 1) {
|
||||
throw new IllegalArgumentException("Empty field " + i + " in " + s);
|
||||
}
|
||||
|
@ -492,13 +483,13 @@ public class ConformanceTest extends TestFmwk {
|
|||
throw new IllegalArgumentException("Out of range hex " +
|
||||
hex + " in " + s);
|
||||
}else if (hex > 0xFFFF){
|
||||
buf.append((char)((hex>>10)+0xd7c0));
|
||||
buf.append((char)((hex>>10)+0xd7c0));
|
||||
buf.append((char)((hex&0x3ff)|0xdc00));
|
||||
}else{
|
||||
buf.append((char) hex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Specific tests for debugging. These are generally failures
|
||||
// taken from the conformance file, but culled out to make
|
||||
// debugging easier. These can be eliminated without affecting
|
||||
|
@ -516,6 +507,6 @@ public class ConformanceTest extends TestFmwk {
|
|||
hexsplit(line, ';', fields, buf);
|
||||
checkConformance(fields, line,options);
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue