mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-05 13:35:32 +00:00
parent
25eb1510ec
commit
fc12cf095c
7 changed files with 311 additions and 64 deletions
|
@ -38,7 +38,7 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalize(const UnicodeString &src,
|
||||
UnicodeString &dest,
|
||||
UErrorCode &errorCode) const {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
dest.setToBogus();
|
||||
return dest;
|
||||
|
@ -64,13 +64,13 @@ public:
|
|||
virtual UnicodeString &
|
||||
normalizeSecondAndAppend(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return normalizeSecondAndAppend(first, second, true, errorCode);
|
||||
}
|
||||
virtual UnicodeString &
|
||||
append(UnicodeString &first,
|
||||
const UnicodeString &second,
|
||||
UErrorCode &errorCode) const {
|
||||
UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return normalizeSecondAndAppend(first, second, false, errorCode);
|
||||
}
|
||||
UnicodeString &
|
||||
|
@ -107,7 +107,7 @@ public:
|
|||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
|
||||
virtual UBool
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const {
|
||||
getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
|
||||
UChar buffer[4];
|
||||
int32_t length;
|
||||
const UChar *d=impl.getDecomposition(c, buffer, length);
|
||||
|
@ -122,7 +122,7 @@ public:
|
|||
return true;
|
||||
}
|
||||
virtual UBool
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
|
||||
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
|
||||
UChar buffer[30];
|
||||
int32_t length;
|
||||
const UChar *d=impl.getRawDecomposition(c, buffer, length);
|
||||
|
@ -137,18 +137,18 @@ public:
|
|||
return true;
|
||||
}
|
||||
virtual UChar32
|
||||
composePair(UChar32 a, UChar32 b) const {
|
||||
composePair(UChar32 a, UChar32 b) const U_OVERRIDE {
|
||||
return impl.composePair(a, b);
|
||||
}
|
||||
|
||||
virtual uint8_t
|
||||
getCombiningClass(UChar32 c) const {
|
||||
getCombiningClass(UChar32 c) const U_OVERRIDE {
|
||||
return impl.getCC(impl.getNorm16(c));
|
||||
}
|
||||
|
||||
// quick checks
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -161,11 +161,11 @@ public:
|
|||
return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
|
||||
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual int32_t
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -194,27 +194,57 @@ public:
|
|||
private:
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.decompose(src, limit, &buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
|
||||
Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
if (edits != nullptr && (options & U_EDITS_NO_RESET) == 0) {
|
||||
edits->reset();
|
||||
}
|
||||
const uint8_t *s = reinterpret_cast<const uint8_t *>(src.data());
|
||||
impl.decomposeUTF8(options, s, s + src.length(), &sink, edits, errorCode);
|
||||
sink.Flush();
|
||||
}
|
||||
// Returns whether the UTF-8 text in sp is already in this normalizer's form.
// Returns false if errorCode already indicates a failure.
// Runs impl.decomposeUTF8() without a sink (its quick-check mode) and reports
// true only if that scan reaches the end of the input.
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE {
    if (U_SUCCESS(errorCode)) {
        const uint8_t *const begin = reinterpret_cast<const uint8_t *>(sp.data());
        const uint8_t *const end = begin + sp.length();
        const uint8_t *const stop =
            impl.decomposeUTF8(0, begin, end, nullptr, nullptr, errorCode);
        return end == stop;
    }
    return false;
}
|
||||
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return impl.decompose(src, limit, NULL, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
|
||||
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE {
|
||||
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
|
||||
}
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); }
|
||||
virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasDecompBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasDecompBoundaryAfter(c);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
|
||||
return impl.isDecompInert(c);
|
||||
}
|
||||
};
|
||||
|
||||
class ComposeNormalizer2 : public Normalizer2WithImpl {
|
||||
|
@ -321,24 +351,30 @@ public:
|
|||
private:
|
||||
virtual void
|
||||
normalize(const UChar *src, const UChar *limit,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.makeFCD(src, limit, &buffer, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
|
||||
virtual void
|
||||
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
|
||||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
|
||||
}
|
||||
virtual const UChar *
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
|
||||
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
|
||||
return impl.makeFCD(src, limit, NULL, errorCode);
|
||||
}
|
||||
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); }
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); }
|
||||
virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); }
|
||||
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasFCDBoundaryBefore(c);
|
||||
}
|
||||
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
|
||||
return impl.hasFCDBoundaryAfter(c);
|
||||
}
|
||||
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
|
||||
return impl.isFCDInert(c);
|
||||
}
|
||||
};
|
||||
|
||||
struct Norm2AllModes : public UMemory {
|
||||
|
|
|
@ -731,9 +731,131 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
|
|||
return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
|
||||
}
|
||||
|
||||
// Dual functionality:
|
||||
// sink != nullptr: normalize
|
||||
// sink == nullptr: isNormalized/spanQuickCheckYes
|
||||
const uint8_t *
|
||||
Normalizer2Impl::decomposeUTF8(uint32_t options,
|
||||
const uint8_t *src, const uint8_t *limit,
|
||||
ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
|
||||
U_ASSERT(limit != nullptr);
|
||||
UnicodeString s16;
|
||||
uint8_t minNoLead = leadByteForCP(minDecompNoCP);
|
||||
|
||||
const uint8_t *prevBoundary = src;
|
||||
// only for quick check
|
||||
uint8_t prevCC = 0;
|
||||
|
||||
for (;;) {
|
||||
// Fast path: Scan over a sequence of characters below the minimum "no" code point,
|
||||
// or with (decompYes && ccc==0) properties.
|
||||
const uint8_t *fastStart = src;
|
||||
const uint8_t *prevSrc;
|
||||
uint16_t norm16 = 0;
|
||||
|
||||
for (;;) {
|
||||
if (src == limit) {
|
||||
if (prevBoundary != limit && sink != nullptr) {
|
||||
ByteSinkUtil::appendUnchanged(prevBoundary, limit,
|
||||
*sink, options, edits, errorCode);
|
||||
}
|
||||
return src;
|
||||
}
|
||||
if (*src < minNoLead) {
|
||||
++src;
|
||||
} else {
|
||||
prevSrc = src;
|
||||
UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
|
||||
if (!isMostDecompYesAndZeroCC(norm16)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
|
||||
// and the current character at [prevSrc..src[ is not a common case with cc=0
|
||||
// (MIN_NORMAL_MAYBE_YES or JAMO_VT).
|
||||
// It could still be a maybeYes with cc=0.
|
||||
if (prevSrc != fastStart) {
|
||||
// The fast path looped over yes/0 characters before the current one.
|
||||
if (sink != nullptr &&
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
prevBoundary = prevSrc;
|
||||
prevCC = 0;
|
||||
}
|
||||
|
||||
// Medium-fast path: Quick check.
|
||||
if (isMaybeOrNonZeroCC(norm16)) {
|
||||
// Does not decompose.
|
||||
uint8_t cc = getCCFromYesOrMaybe(norm16);
|
||||
if (prevCC <= cc || cc == 0) {
|
||||
prevCC = cc;
|
||||
if (cc <= 1) {
|
||||
if (sink != nullptr &&
|
||||
!ByteSinkUtil::appendUnchanged(prevBoundary, src,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
prevBoundary = src;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (sink == nullptr) {
|
||||
return prevBoundary; // quick check: "no" or cc out of order
|
||||
}
|
||||
|
||||
// Slow path
|
||||
// Decompose up to and including the current character.
|
||||
if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
|
||||
if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
prevBoundary = prevSrc;
|
||||
}
|
||||
ReorderingBuffer buffer(*this, s16, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
|
||||
buffer, errorCode);
|
||||
// Decompose until the next boundary.
|
||||
if (buffer.getLastCC() > 1) {
|
||||
src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
|
||||
buffer, errorCode);
|
||||
}
|
||||
if (U_FAILURE(errorCode)) {
|
||||
break;
|
||||
}
|
||||
if ((src - prevSrc) > INT32_MAX) { // guard before buffer.equals()
|
||||
errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
break;
|
||||
}
|
||||
// We already know there was a change if the original character decomposed;
|
||||
// otherwise compare.
|
||||
if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
|
||||
if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
|
||||
*sink, options, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
|
||||
*sink, edits, errorCode)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
prevBoundary = src;
|
||||
prevCC = 0;
|
||||
}
|
||||
return src;
|
||||
}
|
||||
|
||||
const uint8_t *
|
||||
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
StopAt stopAt, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return nullptr;
|
||||
|
@ -746,21 +868,28 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
|
|||
UChar32 c = U_SENTINEL;
|
||||
if (norm16 >= limitNoNo) {
|
||||
if (isMaybeOrNonZeroCC(norm16)) {
|
||||
// No boundaries around this character.
|
||||
// No comp boundaries around this character.
|
||||
uint8_t cc = getCCFromYesOrMaybe(norm16);
|
||||
if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
|
||||
return prevSrc;
|
||||
}
|
||||
c = codePointFromValidUTF8(prevSrc, src);
|
||||
if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
|
||||
if (!buffer.append(c, cc, errorCode)) {
|
||||
return nullptr;
|
||||
}
|
||||
if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
|
||||
return src;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
// Maps to an isCompYesAndZeroCC.
|
||||
if (stopAtCompBoundary) {
|
||||
if (stopAt != STOP_AT_LIMIT) {
|
||||
return prevSrc;
|
||||
}
|
||||
c = codePointFromValidUTF8(prevSrc, src);
|
||||
c = mapAlgorithmic(c, norm16);
|
||||
norm16 = getRawNorm16(c);
|
||||
} else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
|
||||
} else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
|
||||
return prevSrc;
|
||||
}
|
||||
// norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
|
||||
|
@ -768,7 +897,8 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
|
|||
// its norm16==INERT is normalization-inert,
|
||||
// so it gets copied unchanged in the fast path,
|
||||
// and we stop the slow path where invalid UTF-8 begins.
|
||||
U_ASSERT(norm16 != INERT);
|
||||
// c >= 0 is the result of an algorithmic mapping.
|
||||
U_ASSERT(c >= 0 || norm16 != INERT);
|
||||
if (norm16 < minYesNo) {
|
||||
if (c < 0) {
|
||||
c = codePointFromValidUTF8(prevSrc, src);
|
||||
|
@ -798,11 +928,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
|
|||
} else {
|
||||
leadCC = 0;
|
||||
}
|
||||
if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
|
||||
return prevSrc;
|
||||
}
|
||||
if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
|
||||
return nullptr;
|
||||
}
|
||||
}
|
||||
if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
|
||||
if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
|
||||
(stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
|
@ -1954,10 +2088,10 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
|
|||
break;
|
||||
}
|
||||
// We know there is not a boundary here.
|
||||
decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
|
||||
decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
|
||||
buffer, errorCode);
|
||||
// Decompose until the next boundary.
|
||||
src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
|
||||
src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
|
||||
buffer, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
break;
|
||||
|
|
|
@ -491,6 +491,12 @@ public:
|
|||
UnicodeString &safeMiddle,
|
||||
ReorderingBuffer &buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
/** sink==nullptr: isNormalized()/spanQuickCheckYes() */
|
||||
const uint8_t *decomposeUTF8(uint32_t options,
|
||||
const uint8_t *src, const uint8_t *limit,
|
||||
ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
|
||||
|
||||
UBool compose(const UChar *src, const UChar *limit,
|
||||
UBool onlyContiguous,
|
||||
UBool doCompose,
|
||||
|
@ -649,6 +655,9 @@ private:
|
|||
UChar32 minNeedDataCP,
|
||||
ReorderingBuffer *buffer,
|
||||
UErrorCode &errorCode) const;
|
||||
|
||||
// Tells the UTF-8 decomposeShort() where to stop:
// STOP_AT_LIMIT: do not stop early at a boundary.
// STOP_AT_DECOMP_BOUNDARY: stop before a character whose (lead) combining class is 0,
//   or after appending when the buffer's last combining class is <= 1.
// STOP_AT_COMP_BOUNDARY: stop at a composition boundary.
enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };
|
||||
|
||||
const UChar *decomposeShort(const UChar *src, const UChar *limit,
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
@ -656,7 +665,7 @@ private:
|
|||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
|
||||
UBool stopAtCompBoundary, UBool onlyContiguous,
|
||||
StopAt stopAt, UBool onlyContiguous,
|
||||
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
|
||||
|
||||
static int32_t combine(const uint16_t *list, UChar32 trail);
|
||||
|
|
|
@ -225,10 +225,8 @@ public:
|
|||
* Normalizes a UTF-8 string and optionally records how source substrings
|
||||
* relate to changed and unchanged result substrings.
|
||||
*
|
||||
* Currently implemented completely only for "compose" modes,
|
||||
* such as for NFC, NFKC, and NFKC_Casefold
|
||||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* Otherwise currently converts to & from UTF-16 and does not support edits.
|
||||
* Implemented completely for all built-in modes except for FCD.
|
||||
* The base class implementation converts to & from UTF-16 and does not support edits.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src Source UTF-8 string.
|
||||
|
@ -381,11 +379,9 @@ public:
|
|||
* resolves to "yes" or "no" to provide a definitive result,
|
||||
* at the cost of doing more work in those cases.
|
||||
*
|
||||
* This works for all normalization modes,
|
||||
* but it is currently optimized for UTF-8 only for "compose" modes,
|
||||
* such as for NFC, NFKC, and NFKC_Casefold
|
||||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* For other modes it currently converts to UTF-16 and calls isNormalized().
|
||||
* This works for all normalization modes.
|
||||
* It is optimized for UTF-8 for all built-in modes except for FCD.
|
||||
* The base class implementation converts to UTF-16 and calls isNormalized().
|
||||
*
|
||||
* @param s UTF-8 input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
|
@ -543,10 +539,8 @@ public:
|
|||
* Normalizes a UTF-8 string and optionally records how source substrings
|
||||
* relate to changed and unchanged result substrings.
|
||||
*
|
||||
* Currently implemented completely only for "compose" modes,
|
||||
* such as for NFC, NFKC, and NFKC_Casefold
|
||||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* Otherwise currently converts to & from UTF-16 and does not support edits.
|
||||
* Implemented completely for all built-in modes except for FCD.
|
||||
* The base class implementation converts to & from UTF-16 and does not support edits.
|
||||
*
|
||||
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
|
||||
* @param src Source UTF-8 string.
|
||||
|
@ -676,11 +670,9 @@ public:
|
|||
* resolves to "yes" or "no" to provide a definitive result,
|
||||
* at the cost of doing more work in those cases.
|
||||
*
|
||||
* This works for all normalization modes,
|
||||
* but it is currently optimized for UTF-8 only for "compose" modes,
|
||||
* such as for NFC, NFKC, and NFKC_Casefold
|
||||
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
|
||||
* For other modes it currently converts to UTF-16 and calls isNormalized().
|
||||
* This works for all normalization modes.
|
||||
* It is optimized for UTF-8 for all built-in modes except for FCD.
|
||||
* The base class implementation converts to UTF-16 and calls isNormalized().
|
||||
*
|
||||
* @param s UTF-8 input string
|
||||
* @param errorCode Standard ICU error code. Its input value must
|
||||
|
|
|
@ -365,6 +365,10 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
|
|||
pass = FALSE;
|
||||
}
|
||||
}
|
||||
if(options==0 && !isNormalizedUTF8(*nfd, field[2], status)) {
|
||||
dataerrln("Normalizer error: nfd.isNormalizedUTF8(NFD(s)) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
|
||||
dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
|
||||
pass = FALSE;
|
||||
|
@ -384,6 +388,10 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
|
|||
}
|
||||
}
|
||||
}
|
||||
if(options==0 && !isNormalizedUTF8(*nfkd, field[4], status)) {
|
||||
dataerrln("Normalizer error: nfkd.isNormalizedUTF8(NFKD(s)) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
|
||||
// test FCD quick check and "makeFCD"
|
||||
Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
|
||||
|
@ -481,7 +489,7 @@ UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t opti
|
|||
exp.toUTF8String(exp8);
|
||||
std::string out8;
|
||||
Edits edits;
|
||||
Edits *editsPtr = (mode == UNORM_NFC || mode == UNORM_NFKC) ? &edits : nullptr;
|
||||
Edits *editsPtr = mode != UNORM_FCD ? &edits : nullptr;
|
||||
StringByteSink<std::string> sink(&out8, static_cast<int32_t>(exp8.length()));
|
||||
norm2->normalizeUTF8(0, s8, sink, editsPtr, errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
|
|
|
@ -55,7 +55,8 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestCustomFCC);
|
||||
#endif
|
||||
TESTCASE_AUTO(TestFilteredNormalizer2Coverage);
|
||||
TESTCASE_AUTO(TestNormalizeUTF8WithEdits);
|
||||
TESTCASE_AUTO(TestComposeUTF8WithEdits);
|
||||
TESTCASE_AUTO(TestDecomposeUTF8WithEdits);
|
||||
TESTCASE_AUTO(TestLowMappingToEmpty_D);
|
||||
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
|
||||
TESTCASE_AUTO(TestNormalizeIllFormedText);
|
||||
|
@ -1568,8 +1569,8 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
|
|||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestNormalizeUTF8WithEdits");
|
||||
BasicNormalizerTest::TestComposeUTF8WithEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestComposeUTF8WithEdits");
|
||||
const Normalizer2 *nfkc_cf=Normalizer2::getNFKCCasefoldInstance(errorCode);
|
||||
if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
|
||||
return;
|
||||
|
@ -1589,12 +1590,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
{ TRUE, 2, 2 }, // Ä→ä
|
||||
{ TRUE, 3, 2 }, // A\u0308→ä
|
||||
{ TRUE, 7, 5 }, // A\u0308\u00ad\u0323→ạ\u0308 removes the soft hyphen
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ạ\u0308
|
||||
{ FALSE, 1, 1 }, // comma
|
||||
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→ 각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→ 갃
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→갃
|
||||
{ FALSE, 2, 2 } // 2 spaces
|
||||
};
|
||||
assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges());
|
||||
|
@ -1635,12 +1636,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
{ TRUE, 2, 2 }, // Ä→ä
|
||||
{ FALSE, 4, 4 }, // A\u0308A
|
||||
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→ạ\u0308
|
||||
{ FALSE, 1, 1 }, // comma
|
||||
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→ 가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→ 각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→ 갃
|
||||
{ TRUE, 6, 3 }, // \u1100\u1161→가
|
||||
{ TRUE, 6, 3 }, // 가\u11A8→각
|
||||
{ TRUE, 6, 3 }, // 가\u3133→갃
|
||||
{ FALSE, 2, 2 } // 2 spaces
|
||||
};
|
||||
assertTrue("filtered normalizeUTF8 hasChanges", edits.hasChanges());
|
||||
|
@ -1670,6 +1671,72 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
TRUE, errorCode);
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestDecomposeUTF8WithEdits() {
|
||||
IcuTestErrorCode errorCode(*this, "TestDecomposeUTF8WithEdits");
|
||||
const Normalizer2 *nfkd_cf =
|
||||
Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode);
|
||||
if(errorCode.errDataIfFailureAndReset("Normalizer2::getInstance(nfkc_cf/decompose) call failed")) {
|
||||
return;
|
||||
}
|
||||
static const StringPiece src =
|
||||
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161가\u11A8가\u3133 ";
|
||||
StringPiece expected =
|
||||
u8" aa\u0308a\u0308a\u0323\u0308a\u0323\u0308,"
|
||||
u8"\u1100\u1161\u1100\u1161\u11A8\u1100\u1161\u11AA ";
|
||||
std::string result;
|
||||
StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
|
||||
Edits edits;
|
||||
nfkd_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
|
||||
assertSuccess("normalizeUTF8 with Edits", errorCode.get());
|
||||
assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
|
||||
static const EditChange expectedChanges[] = {
|
||||
{ FALSE, 2, 2 }, // 2 spaces
|
||||
{ TRUE, 1, 1 }, // A→a
|
||||
{ TRUE, 2, 3 }, // Ä→a\u0308
|
||||
{ TRUE, 1, 1 }, // A→a
|
||||
{ FALSE, 2, 2 }, // \u0308→\u0308 unchanged
|
||||
{ TRUE, 1, 1 }, // A→a
|
||||
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
|
||||
{ TRUE, 4, 5 }, // Ä\u0323→a\u0323\u0308
|
||||
{ FALSE, 1, 1 }, // comma
|
||||
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
|
||||
{ FALSE, 6, 6 }, // \u1100\u1161 unchanged
|
||||
{ TRUE, 3, 6 }, // 가→\u1100\u1161
|
||||
{ FALSE, 3, 3 }, // \u11A8 unchanged
|
||||
{ TRUE, 3, 6 }, // 가→\u1100\u1161
|
||||
{ TRUE, 3, 3 }, // \u3133→\u11AA
|
||||
{ FALSE, 2, 2 } // 2 spaces
|
||||
};
|
||||
assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges());
|
||||
assertEquals("normalizeUTF8 with Edits numberOfChanges", 10, edits.numberOfChanges());
|
||||
TestUtility::checkEditsIter(*this, u"normalizeUTF8 with Edits",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
expectedChanges, UPRV_LENGTHOF(expectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
assertFalse("isNormalizedUTF8(source)", nfkd_cf->isNormalizedUTF8(src, errorCode));
|
||||
assertTrue("isNormalizedUTF8(normalized)", nfkd_cf->isNormalizedUTF8(result, errorCode));
|
||||
|
||||
// Omit unchanged text.
|
||||
expected = u8"aa\u0308aa\u0323\u0308a\u0323\u0308\u1100\u1161\u1100\u1161\u11AA";
|
||||
result.clear();
|
||||
edits.reset();
|
||||
nfkd_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
|
||||
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
|
||||
assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
|
||||
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
|
||||
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 10, edits.numberOfChanges());
|
||||
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
|
||||
edits.getFineIterator(), edits.getFineIterator(),
|
||||
expectedChanges, UPRV_LENGTHOF(expectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
// Not testing FilteredNormalizer2:
|
||||
// The code there is the same for all normalization modes, and
|
||||
// TestComposeUTF8WithEdits() covers it well.
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestLowMappingToEmpty_D() {
|
||||
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_D");
|
||||
|
|
|
@ -47,7 +47,8 @@ public:
|
|||
void TestCustomComp();
|
||||
void TestCustomFCC();
|
||||
void TestFilteredNormalizer2Coverage();
|
||||
void TestNormalizeUTF8WithEdits();
|
||||
void TestComposeUTF8WithEdits();
|
||||
void TestDecomposeUTF8WithEdits();
|
||||
void TestLowMappingToEmpty_D();
|
||||
void TestLowMappingToEmpty_FCD();
|
||||
void TestNormalizeIllFormedText();
|
||||
|
|
Loading…
Add table
Reference in a new issue