ICU-21279 decompose (NFD/NFKD) UTF-8 with Edits

See #1518
This commit is contained in:
Markus Scherer 2020-12-21 23:06:24 +00:00
parent 25eb1510ec
commit fc12cf095c
7 changed files with 311 additions and 64 deletions

View file

@ -38,7 +38,7 @@ public:
virtual UnicodeString &
normalize(const UnicodeString &src,
UnicodeString &dest,
UErrorCode &errorCode) const {
UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
dest.setToBogus();
return dest;
@ -64,13 +64,13 @@ public:
virtual UnicodeString &
normalizeSecondAndAppend(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const {
UErrorCode &errorCode) const U_OVERRIDE {
return normalizeSecondAndAppend(first, second, true, errorCode);
}
virtual UnicodeString &
append(UnicodeString &first,
const UnicodeString &second,
UErrorCode &errorCode) const {
UErrorCode &errorCode) const U_OVERRIDE {
return normalizeSecondAndAppend(first, second, false, errorCode);
}
UnicodeString &
@ -107,7 +107,7 @@ public:
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const = 0;
virtual UBool
getDecomposition(UChar32 c, UnicodeString &decomposition) const {
getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
UChar buffer[4];
int32_t length;
const UChar *d=impl.getDecomposition(c, buffer, length);
@ -122,7 +122,7 @@ public:
return true;
}
virtual UBool
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE {
UChar buffer[30];
int32_t length;
const UChar *d=impl.getRawDecomposition(c, buffer, length);
@ -137,18 +137,18 @@ public:
return true;
}
virtual UChar32
composePair(UChar32 a, UChar32 b) const {
composePair(UChar32 a, UChar32 b) const U_OVERRIDE {
return impl.composePair(a, b);
}
virtual uint8_t
getCombiningClass(UChar32 c) const {
getCombiningClass(UChar32 c) const U_OVERRIDE {
return impl.getCC(impl.getNorm16(c));
}
// quick checks
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
return false;
}
@ -161,11 +161,11 @@ public:
return sLimit==spanQuickCheckYes(sArray, sLimit, errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
return Normalizer2WithImpl::isNormalized(s, errorCode) ? UNORM_YES : UNORM_NO;
}
virtual int32_t
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE {
if(U_FAILURE(errorCode)) {
return 0;
}
@ -194,27 +194,57 @@ public:
private:
virtual void
normalize(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.decompose(src, limit, &buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
virtual void
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.decomposeAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
}
// Decomposes the UTF-8 text in src and writes the result to sink.
// If edits is non-null it receives the change records; it is reset first
// unless the caller passed U_EDITS_NO_RESET in options.
// Does nothing if errorCode already indicates a failure.
void
normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
              Edits *edits, UErrorCode &errorCode) const U_OVERRIDE {
    if (!U_FAILURE(errorCode)) {
        const bool keepExistingEdits = (options & U_EDITS_NO_RESET) != 0;
        if (!keepExistingEdits && edits != nullptr) {
            edits->reset();
        }
        // The implementation works on unsigned bytes.
        const uint8_t *start = reinterpret_cast<const uint8_t *>(src.data());
        const uint8_t *end = start + src.length();
        impl.decomposeUTF8(options, start, end, &sink, edits, errorCode);
        // Flush even if decomposeUTF8() set an error, as the original code does.
        sink.Flush();
    }
}
// Returns true if the UTF-8 text in sp is already in this decomposition form.
// Calls impl.decomposeUTF8() with a null sink and null edits (its check-only mode)
// and reports whether the returned pointer reached the end of the input.
// Returns false if errorCode already indicates a failure.
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const U_OVERRIDE {
    if (!U_FAILURE(errorCode)) {
        const uint8_t *begin = reinterpret_cast<const uint8_t *>(sp.data());
        const uint8_t *end = begin + sp.length();
        const uint8_t *checkedUpTo =
            impl.decomposeUTF8(0, begin, end, nullptr, nullptr, errorCode);
        return checkedUpTo == end;
    }
    return false;
}
virtual const UChar *
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
return impl.decompose(src, limit, NULL, errorCode);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const {
virtual UNormalizationCheckResult getQuickCheck(UChar32 c) const U_OVERRIDE {
return impl.isDecompYes(impl.getNorm16(c)) ? UNORM_YES : UNORM_NO;
}
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasDecompBoundaryBefore(c); }
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasDecompBoundaryAfter(c); }
virtual UBool isInert(UChar32 c) const { return impl.isDecompInert(c); }
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
return impl.hasDecompBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
return impl.hasDecompBoundaryAfter(c);
}
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
return impl.isDecompInert(c);
}
};
class ComposeNormalizer2 : public Normalizer2WithImpl {
@ -321,24 +351,30 @@ public:
private:
virtual void
normalize(const UChar *src, const UChar *limit,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.makeFCD(src, limit, &buffer, errorCode);
}
using Normalizer2WithImpl::normalize; // Avoid warning about hiding base class function.
virtual void
normalizeAndAppend(const UChar *src, const UChar *limit, UBool doNormalize,
UnicodeString &safeMiddle,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
ReorderingBuffer &buffer, UErrorCode &errorCode) const U_OVERRIDE {
impl.makeFCDAndAppend(src, limit, doNormalize, safeMiddle, buffer, errorCode);
}
virtual const UChar *
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const {
spanQuickCheckYes(const UChar *src, const UChar *limit, UErrorCode &errorCode) const U_OVERRIDE {
return impl.makeFCD(src, limit, NULL, errorCode);
}
using Normalizer2WithImpl::spanQuickCheckYes; // Avoid warning about hiding base class function.
virtual UBool hasBoundaryBefore(UChar32 c) const { return impl.hasFCDBoundaryBefore(c); }
virtual UBool hasBoundaryAfter(UChar32 c) const { return impl.hasFCDBoundaryAfter(c); }
virtual UBool isInert(UChar32 c) const { return impl.isFCDInert(c); }
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE {
return impl.hasFCDBoundaryBefore(c);
}
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE {
return impl.hasFCDBoundaryAfter(c);
}
virtual UBool isInert(UChar32 c) const U_OVERRIDE {
return impl.isFCDInert(c);
}
};
struct Norm2AllModes : public UMemory {

View file

@ -731,9 +731,131 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
}
// Decomposes the UTF-8 text [src, limit[.
//
// Dual functionality:
// sink != nullptr: normalize
//     Unchanged and changed pieces are written to *sink via ByteSinkUtil,
//     which also receives options and edits.
// sink == nullptr: isNormalized/spanQuickCheckYes
//     Nothing is written; the function stops at the first character that
//     decomposes or whose combining class is out of order.
//
// Return value:
// - limit when the whole input was processed;
// - with sink == nullptr, the last recorded boundary (prevBoundary) when the
//   quick check fails;
// - the current src position when a ByteSinkUtil call returned false or
//   errorCode was set along the way.
//
// This function does not test errorCode on entry.
const uint8_t *
Normalizer2Impl::decomposeUTF8(uint32_t options,
                               const uint8_t *src, const uint8_t *limit,
                               ByteSink *sink, Edits *edits, UErrorCode &errorCode) const {
    U_ASSERT(limit != nullptr);
    UnicodeString s16;
    uint8_t minNoLead = leadByteForCP(minDecompNoCP);

    // Start of the input that has not yet been written to the sink.
    const uint8_t *prevBoundary = src;
    // only for quick check
    uint8_t prevCC = 0;

    for (;;) {
        // Fast path: Scan over a sequence of characters below the minimum "no" code point,
        // or with (decompYes && ccc==0) properties.
        const uint8_t *fastStart = src;
        const uint8_t *prevSrc;
        uint16_t norm16 = 0;

        for (;;) {
            if (src == limit) {
                // End of input: flush the pending unchanged tail, if any.
                if (prevBoundary != limit && sink != nullptr) {
                    ByteSinkUtil::appendUnchanged(prevBoundary, limit,
                                                  *sink, options, edits, errorCode);
                }
                return src;
            }
            if (*src < minNoLead) {
                // Lead byte below that of minDecompNoCP: no trie lookup is done.
                ++src;
            } else {
                prevSrc = src;
                UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
                if (!isMostDecompYesAndZeroCC(norm16)) {
                    break;
                }
            }
        }
        // isMostDecompYesAndZeroCC(norm16) is false, that is, norm16>=minYesNo,
        // and the current character at [prevSrc..src[ is not a common case with cc=0
        // (MIN_NORMAL_MAYBE_YES or JAMO_VT).
        // It could still be a maybeYes with cc=0.
        if (prevSrc != fastStart) {
            // The fast path looped over yes/0 characters before the current one.
            if (sink != nullptr &&
                    !ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                                   *sink, options, edits, errorCode)) {
                break;
            }
            prevBoundary = prevSrc;
            prevCC = 0;
        }

        // Medium-fast path: Quick check.
        if (isMaybeOrNonZeroCC(norm16)) {
            // Does not decompose.
            uint8_t cc = getCCFromYesOrMaybe(norm16);
            if (prevCC <= cc || cc == 0) {
                // Combining classes are still in order.
                prevCC = cc;
                if (cc <= 1) {
                    // Flush through this character and record a new boundary after it.
                    // For cc > 1, prevBoundary intentionally stays behind so that the
                    // slow path can redo the whole pending sequence if needed.
                    if (sink != nullptr &&
                            !ByteSinkUtil::appendUnchanged(prevBoundary, src,
                                                           *sink, options, edits, errorCode)) {
                        break;
                    }
                    prevBoundary = src;
                }
                continue;
            }
        }
        if (sink == nullptr) {
            return prevBoundary;  // quick check: "no" or cc out of order
        }

        // Slow path
        // Decompose up to and including the current character.
        if (prevBoundary != prevSrc && norm16HasDecompBoundaryBefore(norm16)) {
            if (!ByteSinkUtil::appendUnchanged(prevBoundary, prevSrc,
                                               *sink, options, edits, errorCode)) {
                break;
            }
            prevBoundary = prevSrc;
        }
        ReorderingBuffer buffer(*this, s16, errorCode);
        if (U_FAILURE(errorCode)) {
            break;
        }
        decomposeShort(prevBoundary, src, STOP_AT_LIMIT, FALSE /* onlyContiguous */,
                       buffer, errorCode);
        // Decompose until the next boundary.
        if (buffer.getLastCC() > 1) {
            src = decomposeShort(src, limit, STOP_AT_DECOMP_BOUNDARY, FALSE /* onlyContiguous */,
                                 buffer, errorCode);
        }
        if (U_FAILURE(errorCode)) {
            break;
        }
        // Guard before buffer.equals() and appendChange(): both take the span
        // [prevBoundary, src[, so that is the length that must fit into int32_t.
        // prevBoundary can lie before prevSrc (pending cc>1 characters from the
        // quick-check path), so measuring from prevSrc could under-count.
        if ((src - prevBoundary) > INT32_MAX) {
            errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
            break;
        }
        // We already know there was a change if the original character decomposed;
        // otherwise compare.
        if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) {
            if (!ByteSinkUtil::appendUnchanged(prevBoundary, src,
                                               *sink, options, edits, errorCode)) {
                break;
            }
        } else {
            if (!ByteSinkUtil::appendChange(prevBoundary, src, buffer.getStart(), buffer.length(),
                                            *sink, edits, errorCode)) {
                break;
            }
        }
        prevBoundary = src;
        prevCC = 0;
    }
    return src;
}
const uint8_t *
Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
UBool stopAtCompBoundary, UBool onlyContiguous,
StopAt stopAt, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const {
if (U_FAILURE(errorCode)) {
return nullptr;
@ -746,21 +868,28 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
UChar32 c = U_SENTINEL;
if (norm16 >= limitNoNo) {
if (isMaybeOrNonZeroCC(norm16)) {
// No boundaries around this character.
// No comp boundaries around this character.
uint8_t cc = getCCFromYesOrMaybe(norm16);
if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
return prevSrc;
}
c = codePointFromValidUTF8(prevSrc, src);
if (!buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode)) {
if (!buffer.append(c, cc, errorCode)) {
return nullptr;
}
if (stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1) {
return src;
}
continue;
}
// Maps to an isCompYesAndZeroCC.
if (stopAtCompBoundary) {
if (stopAt != STOP_AT_LIMIT) {
return prevSrc;
}
c = codePointFromValidUTF8(prevSrc, src);
c = mapAlgorithmic(c, norm16);
norm16 = getRawNorm16(c);
} else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
} else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) {
return prevSrc;
}
// norm16!=INERT guarantees that [prevSrc, src[ is valid UTF-8.
@ -768,7 +897,8 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
// its norm16==INERT is normalization-inert,
// so it gets copied unchanged in the fast path,
// and we stop the slow path where invalid UTF-8 begins.
U_ASSERT(norm16 != INERT);
// c >= 0 is the result of an algorithmic mapping.
U_ASSERT(c >= 0 || norm16 != INERT);
if (norm16 < minYesNo) {
if (c < 0) {
c = codePointFromValidUTF8(prevSrc, src);
@ -798,11 +928,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
} else {
leadCC = 0;
}
if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) {
return prevSrc;
}
if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
return nullptr;
}
}
if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
if ((stopAt == STOP_AT_COMP_BOUNDARY && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) ||
(stopAt == STOP_AT_DECOMP_BOUNDARY && buffer.getLastCC() <= 1)) {
return src;
}
}
@ -1954,10 +2088,10 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
break;
}
// We know there is not a boundary here.
decomposeShort(prevSrc, src, FALSE /* !stopAtCompBoundary */, onlyContiguous,
decomposeShort(prevSrc, src, STOP_AT_LIMIT, onlyContiguous,
buffer, errorCode);
// Decompose until the next boundary.
src = decomposeShort(src, limit, TRUE /* stopAtCompBoundary */, onlyContiguous,
src = decomposeShort(src, limit, STOP_AT_COMP_BOUNDARY, onlyContiguous,
buffer, errorCode);
if (U_FAILURE(errorCode)) {
break;

View file

@ -491,6 +491,12 @@ public:
UnicodeString &safeMiddle,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
/** sink==nullptr: isNormalized()/spanQuickCheckYes() */
const uint8_t *decomposeUTF8(uint32_t options,
const uint8_t *src, const uint8_t *limit,
ByteSink *sink, Edits *edits, UErrorCode &errorCode) const;
UBool compose(const UChar *src, const UChar *limit,
UBool onlyContiguous,
UBool doCompose,
@ -649,6 +655,9 @@ private:
UChar32 minNeedDataCP,
ReorderingBuffer *buffer,
UErrorCode &errorCode) const;
enum StopAt { STOP_AT_LIMIT, STOP_AT_DECOMP_BOUNDARY, STOP_AT_COMP_BOUNDARY };
const UChar *decomposeShort(const UChar *src, const UChar *limit,
UBool stopAtCompBoundary, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
@ -656,7 +665,7 @@ private:
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
UBool stopAtCompBoundary, UBool onlyContiguous,
StopAt stopAt, UBool onlyContiguous,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
static int32_t combine(const uint16_t *list, UChar32 trail);

View file

@ -225,10 +225,8 @@ public:
* Normalizes a UTF-8 string and optionally records how source substrings
* relate to changed and unchanged result substrings.
*
* Currently implemented completely only for "compose" modes,
* such as for NFC, NFKC, and NFKC_Casefold
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* Otherwise currently converts to & from UTF-16 and does not support edits.
* Implemented completely for all built-in modes except for FCD.
* The base class implementation converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
@ -381,11 +379,9 @@ public:
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
*
* This works for all normalization modes,
* but it is currently optimized for UTF-8 only for "compose" modes,
* such as for NFC, NFKC, and NFKC_Casefold
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* For other modes it currently converts to UTF-16 and calls isNormalized().
* This works for all normalization modes.
* It is optimized for UTF-8 for all built-in modes except for FCD.
* The base class implementation converts to UTF-16 and calls isNormalized().
*
* @param s UTF-8 input string
* @param errorCode Standard ICU error code. Its input value must
@ -543,10 +539,8 @@ public:
* Normalizes a UTF-8 string and optionally records how source substrings
* relate to changed and unchanged result substrings.
*
* Currently implemented completely only for "compose" modes,
* such as for NFC, NFKC, and NFKC_Casefold
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* Otherwise currently converts to & from UTF-16 and does not support edits.
* Implemented completely for all built-in modes except for FCD.
* The base class implementation converts to & from UTF-16 and does not support edits.
*
* @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
* @param src Source UTF-8 string.
@ -676,11 +670,9 @@ public:
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
*
* This works for all normalization modes,
* but it is currently optimized for UTF-8 only for "compose" modes,
* such as for NFC, NFKC, and NFKC_Casefold
* (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
* For other modes it currently converts to UTF-16 and calls isNormalized().
* This works for all normalization modes.
* It is optimized for UTF-8 for all built-in modes except for FCD.
* The base class implementation converts to UTF-16 and calls isNormalized().
*
* @param s UTF-8 input string
* @param errorCode Standard ICU error code. Its input value must

View file

@ -365,6 +365,10 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
pass = FALSE;
}
}
if(options==0 && !isNormalizedUTF8(*nfd, field[2], status)) {
dataerrln("Normalizer error: nfd.isNormalizedUTF8(NFD(s)) is FALSE");
pass = FALSE;
}
if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
pass = FALSE;
@ -384,6 +388,10 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
}
}
}
if(options==0 && !isNormalizedUTF8(*nfkd, field[4], status)) {
dataerrln("Normalizer error: nfkd.isNormalizedUTF8(NFKD(s)) is FALSE");
pass = FALSE;
}
// test FCD quick check and "makeFCD"
Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
@ -481,7 +489,7 @@ UBool NormalizerConformanceTest::checkNorm(UNormalizationMode mode, int32_t opti
exp.toUTF8String(exp8);
std::string out8;
Edits edits;
Edits *editsPtr = (mode == UNORM_NFC || mode == UNORM_NFKC) ? &edits : nullptr;
Edits *editsPtr = mode != UNORM_FCD ? &edits : nullptr;
StringByteSink<std::string> sink(&out8, static_cast<int32_t>(exp8.length()));
norm2->normalizeUTF8(0, s8, sink, editsPtr, errorCode);
if (U_FAILURE(errorCode)) {

View file

@ -55,7 +55,8 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestCustomFCC);
#endif
TESTCASE_AUTO(TestFilteredNormalizer2Coverage);
TESTCASE_AUTO(TestNormalizeUTF8WithEdits);
TESTCASE_AUTO(TestComposeUTF8WithEdits);
TESTCASE_AUTO(TestDecomposeUTF8WithEdits);
TESTCASE_AUTO(TestLowMappingToEmpty_D);
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
TESTCASE_AUTO(TestNormalizeIllFormedText);
@ -1568,8 +1569,8 @@ BasicNormalizerTest::TestFilteredNormalizer2Coverage() {
}
void
BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
IcuTestErrorCode errorCode(*this, "TestNormalizeUTF8WithEdits");
BasicNormalizerTest::TestComposeUTF8WithEdits() {
IcuTestErrorCode errorCode(*this, "TestComposeUTF8WithEdits");
const Normalizer2 *nfkc_cf=Normalizer2::getNFKCCasefoldInstance(errorCode);
if(errorCode.errDataIfFailureAndReset("Normalizer2::getNFKCCasefoldInstance() call failed")) {
return;
@ -1589,12 +1590,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
{ TRUE, 2, 2 }, // Ä→ä
{ TRUE, 3, 2 }, // A\u0308→ä
{ TRUE, 7, 5 }, // A\u0308\u00ad\u0323→ạ\u0308 removes the soft hyphen
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
{ TRUE, 4, 5 }, // Ä\u0323→ạ\u0308
{ FALSE, 1, 1 }, // comma
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
{ TRUE, 6, 3 }, // \u1100\u1161→
{ TRUE, 6, 3 }, // 가\u11A8→
{ TRUE, 6, 3 }, // 가\u3133→
{ TRUE, 6, 3 }, // \u1100\u1161→
{ TRUE, 6, 3 }, // 가\u11A8→
{ TRUE, 6, 3 }, // 가\u3133→
{ FALSE, 2, 2 } // 2 spaces
};
assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges());
@ -1635,12 +1636,12 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
{ TRUE, 2, 2 }, // Ä→ä
{ FALSE, 4, 4 }, // A\u0308A
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
{ TRUE, 4, 5 }, // Ä\u0323→ ạ\u0308
{ TRUE, 4, 5 }, // Ä\u0323→ạ\u0308
{ FALSE, 1, 1 }, // comma
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
{ TRUE, 6, 3 }, // \u1100\u1161→
{ TRUE, 6, 3 }, // 가\u11A8→
{ TRUE, 6, 3 }, // 가\u3133→
{ TRUE, 6, 3 }, // \u1100\u1161→
{ TRUE, 6, 3 }, // 가\u11A8→
{ TRUE, 6, 3 }, // 가\u3133→
{ FALSE, 2, 2 } // 2 spaces
};
assertTrue("filtered normalizeUTF8 hasChanges", edits.hasChanges());
@ -1670,6 +1671,72 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
TRUE, errorCode);
}
// Tests Normalizer2::normalizeUTF8() with Edits, and isNormalizedUTF8(), for a
// "decompose" mode normalizer: the "nfkc_cf" data used with UNORM2_DECOMPOSE.
// The input is normalized twice, once with options 0 and once with
// U_OMIT_UNCHANGED_TEXT. Both runs are checked against the same expectedChanges
// table; only the expected output string differs.
void
BasicNormalizerTest::TestDecomposeUTF8WithEdits() {
IcuTestErrorCode errorCode(*this, "TestDecomposeUTF8WithEdits");
const Normalizer2 *nfkd_cf =
Normalizer2::getInstance(nullptr, "nfkc_cf", UNORM2_DECOMPOSE, errorCode);
// Report missing data as a data error rather than a test failure, and bail out.
if(errorCode.errDataIfFailureAndReset("Normalizer2::getInstance(nfkc_cf/decompose) call failed")) {
return;
}
// NOTE(review): expectedChanges below expects two unchanged bytes ("2 spaces") at
// each end and precomposed Hangul syllables in the source; verify that the
// whitespace and characters of these literals are intact.
static const StringPiece src =
u8" AÄA\u0308A\u0308\u00ad\u0323Ä\u0323,\u00ad\u1100\u1161\u11A8\u3133 ";
StringPiece expected =
u8" aa\u0308a\u0308a\u0323\u0308a\u0323\u0308,"
u8"\u1100\u1161\u1100\u1161\u11A8\u1100\u1161\u11AA ";
std::string result;
// The sink appends to `result`; the second argument is a capacity hint.
StringByteSink<std::string> sink(&result, static_cast<int32_t>(expected.length()));
Edits edits;
// First run: options 0, so unchanged text is copied to the sink as well.
nfkd_cf->normalizeUTF8(0, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 with Edits", errorCode.get());
assertEquals("normalizeUTF8 with Edits", expected.data(), result.c_str());
// Each entry appears to be { changed?, source length, result length }, with lengths
// in UTF-8 bytes (e.g. Ä is 2 bytes, a\u0308 is 3) -- confirm against EditChange.
static const EditChange expectedChanges[] = {
{ FALSE, 2, 2 }, // 2 spaces
{ TRUE, 1, 1 }, // A→a
{ TRUE, 2, 3 }, // Ä→a\u0308
{ TRUE, 1, 1 }, // A→a
{ FALSE, 2, 2 }, // \u0308→\u0308 unchanged
{ TRUE, 1, 1 }, // A→a
{ TRUE, 6, 4 }, // \u0308\u00ad\u0323→\u0323\u0308 removes the soft hyphen
{ TRUE, 4, 5 }, // Ä\u0323→a\u0323\u0308
{ FALSE, 1, 1 }, // comma
{ TRUE, 2, 0 }, // U+00AD soft hyphen maps to empty
{ FALSE, 6, 6 }, // \u1100\u1161 unchanged
{ TRUE, 3, 6 }, // 가→\u1100\u1161
{ FALSE, 3, 3 }, // \u11A8 unchanged
{ TRUE, 3, 6 }, // 가→\u1100\u1161
{ TRUE, 3, 3 }, // \u3133→\u11AA
{ FALSE, 2, 2 } // 2 spaces
};
assertTrue("normalizeUTF8 with Edits hasChanges", edits.hasChanges());
// 10 == number of TRUE entries in expectedChanges.
assertEquals("normalizeUTF8 with Edits numberOfChanges", 10, edits.numberOfChanges());
TestUtility::checkEditsIter(*this, u"normalizeUTF8 with Edits",
edits.getFineIterator(), edits.getFineIterator(),
expectedChanges, UPRV_LENGTHOF(expectedChanges),
TRUE, errorCode);
// The source must be reported as not normalized, the output as normalized.
assertFalse("isNormalizedUTF8(source)", nfkd_cf->isNormalizedUTF8(src, errorCode));
assertTrue("isNormalizedUTF8(normalized)", nfkd_cf->isNormalizedUTF8(result, errorCode));
// Omit unchanged text.
// Second run: the same sink is reused, so `result` is cleared first; the expected
// output now contains only the replacement text of the changed spans.
expected = u8"aa\u0308aa\u0323\u0308a\u0323\u0308\u1100\u1161\u1100\u1161\u11AA";
result.clear();
edits.reset();
nfkd_cf->normalizeUTF8(U_OMIT_UNCHANGED_TEXT, src, sink, &edits, errorCode);
assertSuccess("normalizeUTF8 omit unchanged", errorCode.get());
assertEquals("normalizeUTF8 omit unchanged", expected.data(), result.c_str());
assertTrue("normalizeUTF8 omit unchanged hasChanges", edits.hasChanges());
assertEquals("normalizeUTF8 omit unchanged numberOfChanges", 10, edits.numberOfChanges());
// The edits are checked against the same table as in the first run.
TestUtility::checkEditsIter(*this, u"normalizeUTF8 omit unchanged",
edits.getFineIterator(), edits.getFineIterator(),
expectedChanges, UPRV_LENGTHOF(expectedChanges),
TRUE, errorCode);
// Not testing FilteredNormalizer2:
// The code there is the same for all normalization modes, and
// TestComposeUTF8WithEdits() covers it well.
}
void
BasicNormalizerTest::TestLowMappingToEmpty_D() {
IcuTestErrorCode errorCode(*this, "TestLowMappingToEmpty_D");

View file

@ -47,7 +47,8 @@ public:
void TestCustomComp();
void TestCustomFCC();
void TestFilteredNormalizer2Coverage();
void TestNormalizeUTF8WithEdits();
void TestComposeUTF8WithEdits();
void TestDecomposeUTF8WithEdits();
void TestLowMappingToEmpty_D();
void TestLowMappingToEmpty_FCD();
void TestNormalizeIllFormedText();