ICU-13271 add Normalizer2::isNormalizedUTF8()

X-SVN-Rev: 40280
This commit is contained in:
Markus Scherer 2017-07-20 22:08:30 +00:00
parent 09b77193dc
commit aa6d5e3e76
10 changed files with 186 additions and 18 deletions

View file

@ -244,6 +244,31 @@ FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode)
return TRUE;
}
/**
 * Tests whether the UTF-8 text is normalized with respect to this filtered normalizer.
 *
 * The text is walked as alternating spans, starting with a USET_SPAN_SIMPLE span.
 * Each USET_SPAN_SIMPLE span is handed, on its own, to the wrapped normalizer's
 * isNormalizedUTF8(); each USET_SPAN_NOT_CONTAINED span is skipped without being checked.
 *
 * @param sp UTF-8 input text
 * @param errorCode ICU error code; if it already indicates a failure, FALSE is returned at once
 * @return TRUE if every checked span is reported as normalized and no error occurred
 */
UBool
FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const char *cursor = sp.data();
    USetSpanCondition condition = USET_SPAN_SIMPLE;
    for(int32_t remaining = sp.length(); remaining > 0;) {
        const int32_t spanLength = set.spanUTF8(cursor, remaining, condition);
        if(condition != USET_SPAN_NOT_CONTAINED) {
            // This span was selected with USET_SPAN_SIMPLE: the wrapped normalizer decides.
            const StringPiece span(cursor, spanLength);
            if(!norm2.isNormalizedUTF8(span, errorCode)) {
                return FALSE;
            }
            if(U_FAILURE(errorCode)) {
                return FALSE;
            }
            condition = USET_SPAN_NOT_CONTAINED;
        } else {
            // This span was selected with USET_SPAN_NOT_CONTAINED: nothing to check.
            condition = USET_SPAN_SIMPLE;
        }
        // Step past the span just handled and flip to the other kind of span.
        cursor += spanLength;
        remaining -= spanLength;
    }
    return TRUE;
}
UNormalizationCheckResult
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
uprv_checkCanGetBuffer(s, errorCode);

View file

@ -270,6 +270,14 @@ private:
}
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
}
/**
 * UTF-8 normalization check that delegates to impl.composeUTF8().
 * Returns FALSE immediately if errorCode already indicates a failure.
 */
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    // composeUTF8() takes the text as a [start, limit[ range of unsigned bytes.
    const uint8_t *start = reinterpret_cast<const uint8_t *>(sp.data());
    const uint8_t *limit = start + sp.length();
    // NOTE(review): the two null pointer arguments presumably mean "no output, check only";
    // confirm against the composeUTF8() declaration.
    return impl.composeUTF8(0, onlyContiguous, start, limit, nullptr, nullptr, errorCode);
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
if(U_FAILURE(errorCode)) {

View file

@ -73,6 +73,11 @@ Normalizer2::getCombiningClass(UChar32 /*c*/) const {
return 0;
}
/**
 * Default implementation: converts the UTF-8 text to a UnicodeString
 * and defers to isNormalized(). No conversion is attempted, and FALSE is returned,
 * if errorCode already indicates a failure.
 */
UBool
Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const UnicodeString utf16 = UnicodeString::fromUTF8(s);
    return isNormalized(utf16, errorCode) ? TRUE : FALSE;
}
// Normalizer2 implementation for the old UNORM_NONE.
class NoopNormalizer2 : public Normalizer2 {
virtual ~NoopNormalizer2();
@ -139,8 +144,12 @@ class NoopNormalizer2 : public Normalizer2 {
}
// No need to override the default getRawDecomposition().
/**
 * This normalizer implements the old UNORM_NONE, so the text itself is never examined.
 * The result only reflects the incoming error state:
 * TRUE if errorCode passes U_SUCCESS(), FALSE if it already indicates a failure.
 */
virtual UBool
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
    return U_SUCCESS(errorCode);
}
/**
 * UTF-8 counterpart of isNormalized(): the text is ignored, and the result is
 * TRUE unless errorCode already indicates a failure.
 */
virtual UBool
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return TRUE;
}
virtual UNormalizationCheckResult
quickCheck(const UnicodeString &, UErrorCode &) const override {

View file

@ -420,20 +420,20 @@ public:
// Norm16 value thresholds for quick check combinations and types of extra data.
/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
IX_MIN_YES_NO,
/** Mappings are comp-normalized. */
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
IX_MIN_YES_NO_MAPPINGS_ONLY,
/** Mappings are not comp-normalized but have a comp boundary before. */
IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
/** Mappings do not have a comp boundary before. */
IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
/** Mappings to the empty string. */
IX_MIN_NO_NO_EMPTY,
IX_MIN_LCCC_CP,

View file

@ -371,6 +371,30 @@ public:
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
/**
 * Tests if the UTF-8 string is normalized.
 * Internally, in cases where the quickCheck() method would return "maybe"
 * (which is only possible for the two COMPOSE modes) this method
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 *
 * This works for all normalization modes,
 * but it is currently optimized for UTF-8 only for "compose" modes,
 * such as for NFC, NFKC, and NFKC_Casefold
 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
 * For other modes it currently converts to UTF-16 and calls isNormalized().
 *
 * @param s UTF-8 input string
 * @param errorCode Standard ICU error code. Its input value must
 *                  pass the U_SUCCESS() test, or else the function returns
 *                  immediately. Check for U_FAILURE() on output or use with
 *                  function chaining. (See User Guide for details.)
 * @return TRUE if s is normalized; FALSE otherwise, including when
 *         errorCode already indicates a failure on input
 * @draft ICU 60
 */
virtual UBool
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
/**
* Tests if the string is normalized.
@ -641,6 +665,29 @@ public:
*/
virtual UBool
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
/**
 * Tests if the UTF-8 string is normalized.
 * Internally, in cases where the quickCheck() method would return "maybe"
 * (which is only possible for the two COMPOSE modes) this method
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 *
 * This works for all normalization modes,
 * but it is currently optimized for UTF-8 only for "compose" modes,
 * such as for NFC, NFKC, and NFKC_Casefold
 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
 * For other modes it currently converts to UTF-16 and calls isNormalized().
 *
 * In this filtered implementation the text is split into alternating spans
 * according to the filter set. Each filtered span is checked separately
 * by the wrapped normalizer, which therefore does not see text across
 * span boundaries; the spans in between are not checked.
 *
 * @param s UTF-8 input string
 * @param errorCode Standard ICU error code. Its input value must
 *                  pass the U_SUCCESS() test, or else the function returns
 *                  immediately. Check for U_FAILURE() on output or use with
 *                  function chaining. (See User Guide for details.)
 * @return TRUE if s is normalized; FALSE otherwise, including when
 *         errorCode already indicates a failure on input
 * @draft ICU 60
 */
virtual UBool
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
/**
* Tests if the string is normalized.
* For details see the Normalizer2 base class documentation.

View file

@ -280,6 +280,15 @@ void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t optio
}
}
namespace {

/**
 * Test helper: converts the UTF-16 string to UTF-8 in a temporary std::string
 * and returns what norm2.isNormalizedUTF8() reports for that UTF-8 text.
 */
UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {
    std::string utf8;
    s.toUTF8String(utf8);
    return norm2.isNormalizedUTF8(utf8, errorCode);
}

}  // namespace
/**
* Verify the conformance of the given line of the Unicode
* normalization (UTR 15) test suite file. For each line,
@ -342,18 +351,38 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
pass = FALSE;
}
if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
pass = FALSE;
}
if(field[0]!=field[1]) {
if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
pass = FALSE;
}
if(isNormalizedUTF8(*nfc, field[0], status)) {
errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
pass = FALSE;
}
}
if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
pass = FALSE;
}
if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
pass = FALSE;
}
if(field[0]!=field[3]) {
if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
pass = FALSE;
}
if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
pass = FALSE;
}
}
// test FCD quick check and "makeFCD"
Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);

View file

@ -58,6 +58,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
TESTCASE_AUTO(TestLowMappingToEmpty_D);
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
TESTCASE_AUTO(TestNormalizeIllFormedText);
TESTCASE_AUTO(TestComposeJamoTBase);
TESTCASE_AUTO_END;
}
@ -1566,6 +1567,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
expectedChanges, UPRV_LENGTHOF(expectedChanges),
TRUE, errorCode);
assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode));
assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
// Omit unchanged text.
expected = u8"aääạ\u0308\u0308가각갃";
result.clear();
@ -1605,6 +1609,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
filteredChanges, UPRV_LENGTHOF(filteredChanges),
TRUE, errorCode);
assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode));
assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode));
// Omit unchanged text.
// Note that the result is not normalized because the inner normalizer
// does not see text across filter spans.
@ -1711,4 +1718,32 @@ BasicNormalizerTest::TestNormalizeIllFormedText() {
assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
}
/**
 * Regression test: algorithmic Hangul composition must not treat U+11A7 as a trailing consonant.
 * Runs the same input through the UTF-16 and the UTF-8 NFKC APIs
 * (normalize/isNormalized and normalizeUTF8/isNormalizedUTF8).
 */
void
BasicNormalizerTest::TestComposeJamoTBase() {
    // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
    // which is not a conjoining Jamo Trailing consonant.
    IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase");
    const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
    if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
        return;
    }
    // Three routes to the LV syllable U+AC00, each followed by U+11A7:
    //   L+V jamo (U+1100 U+1161), L + compatibility vowel (U+1100 U+314F),
    //   and the precomposed syllable U+AC00 itself.
    // Each must normalize to U+AC00 followed by an unabsorbed U+11A7.
    UnicodeString s(u"\u1100\u1161\u11A7\u1100\u314F\u11A7\uAC00\u11A7");
    UnicodeString expected(u"\uAC00\u11A7\uAC00\u11A7\uAC00\u11A7");
    UnicodeString result = nfkc->normalize(s, errorCode);
    assertSuccess("normalize(LV+11A7)", errorCode.get());
    assertEquals("normalize(LV+11A7)", expected, result);
    assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
    assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));

    // Same text and expectations through the UTF-8 code paths.
    std::string s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7\uAC00\u11A7");
    std::string expected8(u8"\uAC00\u11A7\uAC00\u11A7\uAC00\u11A7");
    std::string result8;
    StringByteSink<std::string> sink(&result8, expected8.length());
    nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
    assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
    assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
    assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
    assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
}
#endif /* #if !UCONFIG_NO_NORMALIZATION */

View file

@ -51,6 +51,7 @@ public:
void TestLowMappingToEmpty_D();
void TestLowMappingToEmpty_FCD();
void TestNormalizeIllFormedText();
void TestComposeJamoTBase();
private:
UnicodeString canonTests[24][3];

View file

@ -901,20 +901,21 @@ public final class Normalizer2Impl {
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
// Norm16 value thresholds for quick check combinations and types of extra data.
/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
public static final int IX_MIN_YES_NO=10;
/** Mappings are comp-normalized. */
public static final int IX_MIN_NO_NO=11;
public static final int IX_LIMIT_NO_NO=12;
public static final int IX_MIN_MAYBE_YES=13;
/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
/** Mappings are not comp-normalized but have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
/** Mappings do not have a comp boundary before. */
public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
/** Mappings to the empty string. */
public static final int IX_MIN_NO_NO_EMPTY=17;
public static final int IX_MIN_LCCC_CP=18;

View file

@ -2854,6 +2854,19 @@ public class BasicTest extends TestFmwk {
assertEquals("normalize", expected, result);
}
/**
 * Regression test: algorithmic Hangul composition must not treat U+11A7 as a trailing consonant.
 * The input reaches the LV syllable 가 (U+AC00) three ways (L+V jamo, L + compatibility vowel,
 * precomposed), each followed by U+11A7; every one must yield 가 followed by a separate U+11A7.
 */
@Test
public void TestComposeJamoTBase() {
    // Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
    // which is not a conjoining Jamo Trailing consonant.
    Normalizer2 nfkc = Normalizer2.getNFKCInstance();
    String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7";
    String expected = "가\u11A7가\u11A7가\u11A7";
    String result = nfkc.normalize(s);
    assertEquals("normalize(LV+11A7)", expected, result);
    assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s));
    assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));
}
@Test
public void TestNFC() {
// Coverage tests.