ICU-13271 add Normalizer2::isNormalizedUTF8()

X-SVN-Rev: 40280
2025-04-10 07:39:16 +00:00 · 2017-07-20 22:08:30 +00:00 · 2017-07-20 22:08:30 +00:00 · aa6d5e3e76
commit aa6d5e3e76
parent 09b77193dc
10 changed files with 186 additions and 18 deletions
--- a/icu4c/source/common/filterednormalizer2.cpp
+++ b/icu4c/source/common/filterednormalizer2.cpp
@ -244,6 +244,31 @@ FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode)
    return TRUE;
 }

+UBool
+FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {  // Walks sp span by span; only spans taken with USET_SPAN_SIMPLE are handed to norm2.
+    if(U_FAILURE(errorCode)) {
+        return FALSE;  // Incoming failure: answer FALSE without looking at sp.
+    }
+    const char *s = sp.data();
+    int32_t length = sp.length();  // Bytes of sp not yet processed.
+    USetSpanCondition spanCondition = USET_SPAN_SIMPLE;  // The condition flips after every span.
+    while (length > 0) {
+        int32_t spanLength = set.spanUTF8(s, length, spanCondition);  // Byte length of the next span under the current condition.
+        if (spanCondition == USET_SPAN_NOT_CONTAINED) {  // This span is skipped unchecked; the next one gets checked.
+            spanCondition = USET_SPAN_SIMPLE;
+        } else {
+            if (!norm2.isNormalizedUTF8(StringPiece(s, spanLength), errorCode) ||  // Each checked span is tested on its own by the wrapped normalizer.
+                    U_FAILURE(errorCode)) {
+                return FALSE;  // The span is not normalized, or norm2 reported an error.
+            }
+            spanCondition = USET_SPAN_NOT_CONTAINED;
+        }
+        s += spanLength;  // Advance past the span just handled.
+        length -= spanLength;
+    }
+    return TRUE;  // No checked span was rejected; also reached when sp is empty.
+}
+
 UNormalizationCheckResult
 FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
    uprv_checkCanGetBuffer(s, errorCode);
--- a/icu4c/source/common/norm2allmodes.h
+++ b/icu4c/source/common/norm2allmodes.h
@ -270,6 +270,14 @@ private:
        }
        return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
    }
+    virtual UBool
+    isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {  // UTF-8 normalization check answered directly by impl.composeUTF8().
+        if(U_FAILURE(errorCode)) {
+            return FALSE;  // Incoming failure: return without calling impl.
+        }
+        const uint8_t *s = reinterpret_cast<const uint8_t *>(sp.data());  // composeUTF8() is called with uint8_t pointers.
+        return impl.composeUTF8(0, onlyContiguous, s, s + sp.length(), nullptr, nullptr, errorCode);  // NOTE(review): the two nullptr arguments presumably mean "no output sink, no edits" (check only) -- confirm against composeUTF8()'s declaration.
+    }
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
        if(U_FAILURE(errorCode)) {
--- a/icu4c/source/common/normalizer2.cpp
+++ b/icu4c/source/common/normalizer2.cpp
@ -73,6 +73,11 @@ Normalizer2::getCombiningClass(UChar32 /*c*/) const {
    return 0;
 }

+UBool
+Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {  // Default implementation: convert to a UnicodeString and defer to isNormalized().
+    return U_SUCCESS(errorCode) && isNormalized(UnicodeString::fromUTF8(s), errorCode);  // && short-circuits: on an incoming failure nothing is converted and FALSE is returned.
+}
+
 // Normalizer2 implementation for the old UNORM_NONE.
 class NoopNormalizer2 : public Normalizer2 {
    virtual ~NoopNormalizer2();
@ -139,8 +144,12 @@ class NoopNormalizer2 : public Normalizer2 {
    }
    // No need to override the default getRawDecomposition().
    virtual UBool
-    isNormalized(const UnicodeString &, UErrorCode &) const override {
-        return TRUE;
+    isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {  // The string argument is ignored.
+        return U_SUCCESS(errorCode);  // TRUE exactly when errorCode is not already a failure.
+    }
+    virtual UBool
+    isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {  // The UTF-8 input is ignored.
+        return U_SUCCESS(errorCode);  // TRUE exactly when errorCode is not already a failure.
    }
    virtual UNormalizationCheckResult
    quickCheck(const UnicodeString &, UErrorCode &) const override {
--- a/icu4c/source/common/normalizer2impl.h
+++ b/icu4c/source/common/normalizer2impl.h
@ -420,20 +420,20 @@ public:

        // Norm16 value thresholds for quick check combinations and types of extra data.

-        // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+        /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
        IX_MIN_YES_NO,
-        // Mappings are comp-normalized.
+        /** Mappings are comp-normalized. */
        IX_MIN_NO_NO,
        IX_LIMIT_NO_NO,
        IX_MIN_MAYBE_YES,

-        // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+        /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
        IX_MIN_YES_NO_MAPPINGS_ONLY,
-        // Mappings are not comp-normalized but have a comp boundary before.
+        /** Mappings are not comp-normalized but have a comp boundary before. */
        IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
-        // Mappings do not have a comp boundary before.
+        /** Mappings do not have a comp boundary before. */
        IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
-        // Mappings to the empty string.
+        /** Mappings to the empty string. */
        IX_MIN_NO_NO_EMPTY,

        IX_MIN_LCCC_CP,
--- a/icu4c/source/common/unicode/normalizer2.h
+++ b/icu4c/source/common/unicode/normalizer2.h
@ -371,6 +371,30 @@ public:
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
+    /**
+     * Tests if the UTF-8 string is normalized.
+     * Internally, in cases where the quickCheck() method would return "maybe"
+     * (which is only possible for the two COMPOSE modes) this method
+     * resolves to "yes" or "no" to provide a definitive result,
+     * at the cost of doing more work in those cases.
+     *
+     * This works for all normalization modes,
+     * but it is currently optimized for UTF-8 only for "compose" modes,
+     * such as for NFC, NFKC, and NFKC_Casefold
+     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     *
+     * @param s UTF-8 input string
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @return TRUE if s is normalized
+     * @draft ICU 60
+     */
+    virtual UBool
+    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
+

    /**
     * Tests if the string is normalized.
@ -641,6 +665,29 @@ public:
     */
    virtual UBool
    isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
+    /**
+     * Tests if the UTF-8 string is normalized.
+     * Internally, in cases where the quickCheck() method would return "maybe"
+     * (which is only possible for the two COMPOSE modes) this method
+     * resolves to "yes" or "no" to provide a definitive result,
+     * at the cost of doing more work in those cases.
+     *
+     * This works for all normalization modes,
+     * but it is currently optimized for UTF-8 only for "compose" modes,
+     * such as for NFC, NFKC, and NFKC_Casefold
+     * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
+     * For other modes it currently converts to UTF-16 and calls isNormalized().
+     *
+     * @param s UTF-8 input string
+     * @param errorCode Standard ICU error code. Its input value must
+     *                  pass the U_SUCCESS() test, or else the function returns
+     *                  immediately. Check for U_FAILURE() on output or use with
+     *                  function chaining. (See User Guide for details.)
+     * @return TRUE if s is normalized
+     * @draft ICU 60
+     */
+    virtual UBool
+    isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
    /**
     * Tests if the string is normalized.
     * For details see the Normalizer2 base class documentation.
--- a/icu4c/source/test/intltest/normconf.cpp
+++ b/icu4c/source/test/intltest/normconf.cpp
@ -280,6 +280,15 @@ void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t optio
    }
 }

+namespace {
+
+UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {  // Test helper: runs norm2.isNormalizedUTF8() on the UTF-8 form of s.
+    std::string s8;  // Receives the UTF-8 conversion of s.
+    return norm2.isNormalizedUTF8(s.toUTF8String(s8), errorCode);  // The result of toUTF8String(s8) is passed straight on as the UTF-8 input.
+}
+
+}  // namespace
+
 /**
 * Verify the conformance of the given line of the Unicode
 * normalization (UTR 15) test suite file.  For each line,
@ -342,18 +351,38 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
        dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
        pass = FALSE;
    }
-    if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
-        errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+    if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
+        dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
        pass = FALSE;
    }
+    if(field[0]!=field[1]) {
+        if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
+            errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
+            pass = FALSE;
+        }
+        if(isNormalizedUTF8(*nfc, field[0], status)) {
+            errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
+            pass = FALSE;
+        }
+    }
    if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
        dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
        pass = FALSE;
    }
-    if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
-        errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+    if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
+        dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
        pass = FALSE;
    }
+    if(field[0]!=field[3]) {
+        if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
+            errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
+            pass = FALSE;
+        }
+        if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
+            errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
+            pass = FALSE;
+        }
+    }

    // test FCD quick check and "makeFCD"
    Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
--- a/icu4c/source/test/intltest/tstnorm.cpp
+++ b/icu4c/source/test/intltest/tstnorm.cpp
@ -58,6 +58,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
    TESTCASE_AUTO(TestLowMappingToEmpty_D);
    TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
    TESTCASE_AUTO(TestNormalizeIllFormedText);
+    TESTCASE_AUTO(TestComposeJamoTBase);
    TESTCASE_AUTO_END;
 }

@ -1566,6 +1567,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
            expectedChanges, UPRV_LENGTHOF(expectedChanges),
            TRUE, errorCode);

+    assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode));
+    assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
+
    // Omit unchanged text.
    expected = u8"aääạ\u0308ạ\u0308가각갃";
    result.clear();
@ -1605,6 +1609,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
            filteredChanges, UPRV_LENGTHOF(filteredChanges),
            TRUE, errorCode);

+    assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode));
+    assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode));
+
    // Omit unchanged text.
    // Note that the result is not normalized because the inner normalizer
    // does not see text across filter spans.
@ -1711,4 +1718,32 @@ BasicNormalizerTest::TestNormalizeIllFormedText() {
    assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
 }

+void
+BasicNormalizerTest::TestComposeJamoTBase() {
+    // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
+    // which is not a conjoining Jamo Trailing consonant.
+    IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase");
+    const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
+    if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
+        return;  // No NFKC instance available; the rest of the test cannot run.
+    }
+    UnicodeString s(u"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");  // Three sequences, each ending in U+11A7: U+1100 U+1161, U+1100 U+314F, and the precomposed syllable.
+    UnicodeString expected(u"가\u11A7가\u11A7가\u11A7");  // Each sequence becomes the syllable followed by an uncombined U+11A7.
+    UnicodeString result = nfkc->normalize(s, errorCode);  // UTF-16 API first.
+    assertSuccess("normalize(LV+11A7)", errorCode.get());
+    assertEquals("normalize(LV+11A7)", expected, result);
+    assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
+    assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
+
+    std::string s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");  // Same input and expectation, now through the UTF-8 APIs.
+    std::string expected8(u8"가\u11A7가\u11A7가\u11A7");
+    std::string result8;
+    StringByteSink<std::string> sink(&result8, expected8.length());  // Sink writes into result8. NOTE(review): the length argument is presumably a capacity hint -- confirm.
+    nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
+    assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
+    assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
+    assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));  // The un-normalized source must be rejected.
+    assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));  // The normalizer's own output must be accepted.
+}
+
 #endif /* #if !UCONFIG_NO_NORMALIZATION */
--- a/icu4c/source/test/intltest/tstnorm.h
+++ b/icu4c/source/test/intltest/tstnorm.h
@ -51,6 +51,7 @@ public:
    void TestLowMappingToEmpty_D();
    void TestLowMappingToEmpty_FCD();
    void TestNormalizeIllFormedText();
+    void TestComposeJamoTBase();

 private:
    UnicodeString canonTests[24][3];
--- a/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java
+++ b/icu4j/main/classes/core/src/com/ibm/icu/impl/Normalizer2Impl.java
@ -901,20 +901,21 @@ public final class Normalizer2Impl {
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.
-    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+
+    /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
    public static final int IX_MIN_YES_NO=10;
-    // Mappings are comp-normalized.
+    /** Mappings are comp-normalized. */
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

-    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
+    /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
-    // Mappings are not comp-normalized but have a comp boundary before.
+    /** Mappings are not comp-normalized but have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
-    // Mappings do not have a comp boundary before.
+    /** Mappings do not have a comp boundary before. */
    public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
-    // Mappings to the empty string.
+    /** Mappings to the empty string. */
    public static final int IX_MIN_NO_NO_EMPTY=17;

    public static final int IX_MIN_LCCC_CP=18;
--- a/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
+++ b/icu4j/main/tests/core/src/com/ibm/icu/dev/test/normalizer/BasicTest.java
@ -2854,6 +2854,19 @@ public class BasicTest extends TestFmwk {
        assertEquals("normalize", expected, result);
    }

+    @Test
+    public void TestComposeJamoTBase() {
+        // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
+        // which is not a conjoining Jamo Trailing consonant.
+        Normalizer2 nfkc = Normalizer2.getNFKCInstance();
+        String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7";  // Three sequences, each ending in U+11A7: U+1100 U+1161, U+1100 U+314F, and the precomposed syllable.
+        String expected = "가\u11A7가\u11A7가\u11A7";  // Each sequence becomes the syllable followed by an uncombined U+11A7.
+        String result = nfkc.normalize(s);
+        assertEquals("normalize(LV+11A7)", expected, result);
+        assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s));  // The un-normalized source must be rejected.
+        assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));  // The normalizer's own output must be accepted.
+    }
+
    @Test
    public void TestNFC() {
        // Coverage tests.