mirror of
https://github.com/unicode-org/icu.git
synced 2025-04-10 07:39:16 +00:00
ICU-13271 add Normalizer2::isNormalizedUTF8()
X-SVN-Rev: 40280
This commit is contained in:
parent
09b77193dc
commit
aa6d5e3e76
10 changed files with 186 additions and 18 deletions
|
@ -244,6 +244,31 @@ FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode)
|
|||
return TRUE;
|
||||
}
|
||||
|
||||
// Walks the UTF-8 input as alternating runs reported by set.spanUTF8():
// runs obtained with USET_SPAN_SIMPLE are passed to norm2.isNormalizedUTF8(),
// runs obtained with USET_SPAN_NOT_CONTAINED are stepped over unchecked.
// Returns FALSE if errorCode indicates a failure on entry or after a check,
// or if any checked run is reported as not normalized; otherwise TRUE.
UBool
FilteredNormalizer2::isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const char *cursor = sp.data();
    USetSpanCondition condition = USET_SPAN_SIMPLE;
    for (int32_t remaining = sp.length(); remaining > 0;) {
        const int32_t runLength = set.spanUTF8(cursor, remaining, condition);
        if (condition != USET_SPAN_NOT_CONTAINED) {
            // This run was spanned with USET_SPAN_SIMPLE: let the wrapped normalizer judge it.
            const UBool runIsNormalized =
                norm2.isNormalizedUTF8(StringPiece(cursor, runLength), errorCode);
            if (!runIsNormalized || U_FAILURE(errorCode)) {
                return FALSE;
            }
            condition = USET_SPAN_NOT_CONTAINED;
        } else {
            // This run was spanned with USET_SPAN_NOT_CONTAINED: nothing to verify.
            condition = USET_SPAN_SIMPLE;
        }
        cursor += runLength;
        remaining -= runLength;
    }
    return TRUE;
}
|
||||
|
||||
UNormalizationCheckResult
|
||||
FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
|
||||
uprv_checkCanGetBuffer(s, errorCode);
|
||||
|
|
|
@ -270,6 +270,14 @@ private:
|
|||
}
|
||||
return impl.compose(sArray, sArray+s.length(), onlyContiguous, FALSE, buffer, errorCode);
|
||||
}
|
||||
// UTF-8 normalization check: forwards the byte range [start, limit) to
// impl.composeUTF8() with null sink and edits arguments and returns its result.
// Returns FALSE without calling composeUTF8() if errorCode already indicates a failure.
virtual UBool
isNormalizedUTF8(StringPiece sp, UErrorCode &errorCode) const override {
    if(U_SUCCESS(errorCode)) {
        const uint8_t *start = reinterpret_cast<const uint8_t *>(sp.data());
        const uint8_t *limit = start + sp.length();
        return impl.composeUTF8(0, onlyContiguous, start, limit, nullptr, nullptr, errorCode);
    }
    return FALSE;
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &s, UErrorCode &errorCode) const override {
|
||||
if(U_FAILURE(errorCode)) {
|
||||
|
|
|
@ -73,6 +73,11 @@ Normalizer2::getCombiningClass(UChar32 /*c*/) const {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// Default implementation: converts the UTF-8 input to a UnicodeString and
// delegates to isNormalized(). Returns FALSE without converting if errorCode
// already indicates a failure.
UBool
Normalizer2::isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    const UnicodeString utf16 = UnicodeString::fromUTF8(s);
    return isNormalized(utf16, errorCode) ? TRUE : FALSE;
}
|
||||
|
||||
// Normalizer2 implementation for the old UNORM_NONE.
|
||||
class NoopNormalizer2 : public Normalizer2 {
|
||||
virtual ~NoopNormalizer2();
|
||||
|
@ -139,8 +144,12 @@ class NoopNormalizer2 : public Normalizer2 {
|
|||
}
|
||||
// No need to override the default getRawDecomposition().
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &, UErrorCode &) const override {
|
||||
return TRUE;
|
||||
isNormalized(const UnicodeString &, UErrorCode &errorCode) const override {
|
||||
return U_SUCCESS(errorCode);
|
||||
}
|
||||
// The input text is ignored; the answer depends only on the incoming error code:
// FALSE if it indicates a failure, TRUE otherwise.
virtual UBool
isNormalizedUTF8(StringPiece, UErrorCode &errorCode) const override {
    if(U_FAILURE(errorCode)) {
        return FALSE;
    }
    return TRUE;
}
|
||||
virtual UNormalizationCheckResult
|
||||
quickCheck(const UnicodeString &, UErrorCode &) const override {
|
||||
|
|
|
@ -420,20 +420,20 @@ public:
|
|||
|
||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||
|
||||
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||
/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
|
||||
IX_MIN_YES_NO,
|
||||
// Mappings are comp-normalized.
|
||||
/** Mappings are comp-normalized. */
|
||||
IX_MIN_NO_NO,
|
||||
IX_LIMIT_NO_NO,
|
||||
IX_MIN_MAYBE_YES,
|
||||
|
||||
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||
/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
|
||||
IX_MIN_YES_NO_MAPPINGS_ONLY,
|
||||
// Mappings are not comp-normalized but have a comp boundary before.
|
||||
/** Mappings are not comp-normalized but have a comp boundary before. */
|
||||
IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
|
||||
// Mappings do not have a comp boundary before.
|
||||
/** Mappings do not have a comp boundary before. */
|
||||
IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
|
||||
// Mappings to the empty string.
|
||||
/** Mappings to the empty string. */
|
||||
IX_MIN_NO_NO_EMPTY,
|
||||
|
||||
IX_MIN_LCCC_CP,
|
||||
|
|
|
@ -371,6 +371,30 @@ public:
|
|||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
|
||||
/**
 * Tests if the UTF-8 string is normalized.
 * Internally, in cases where the quickCheck() method would return "maybe"
 * (which is only possible for the two COMPOSE modes) this method
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 *
 * This works for all normalization modes,
 * but a UTF-8-optimized implementation currently exists only for the "compose" modes,
 * such as for NFC, NFKC, and NFKC_Casefold
 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
 * For other modes it currently converts to UTF-16 and calls isNormalized().
 *
 * @param s UTF-8 input string
 * @param errorCode Standard ICU error code. Its input value must
 *                  pass the U_SUCCESS() test, or else the function returns
 *                  immediately. Check for U_FAILURE() on output or use with
 *                  function chaining. (See User Guide for details.)
 * @return TRUE if s is normalized
 * @draft ICU 60
 */
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
|
||||
|
||||
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
|
@ -641,6 +665,29 @@ public:
|
|||
*/
|
||||
virtual UBool
|
||||
isNormalized(const UnicodeString &s, UErrorCode &errorCode) const override;
|
||||
/**
 * Tests if the UTF-8 string is normalized.
 * Internally, in cases where the quickCheck() method would return "maybe"
 * (which is only possible for the two COMPOSE modes) this method
 * resolves to "yes" or "no" to provide a definitive result,
 * at the cost of doing more work in those cases.
 *
 * This works for all normalization modes,
 * but a UTF-8-optimized implementation currently exists only for the "compose" modes,
 * such as for NFC, NFKC, and NFKC_Casefold
 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
 * For other modes it currently converts to UTF-16 and calls isNormalized().
 *
 * @param s UTF-8 input string
 * @param errorCode Standard ICU error code. Its input value must
 *                  pass the U_SUCCESS() test, or else the function returns
 *                  immediately. Check for U_FAILURE() on output or use with
 *                  function chaining. (See User Guide for details.)
 * @return TRUE if s is normalized
 * @draft ICU 60
 */
|
||||
virtual UBool
|
||||
isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const override;
|
||||
/**
|
||||
* Tests if the string is normalized.
|
||||
* For details see the Normalizer2 base class documentation.
|
||||
|
|
|
@ -280,6 +280,15 @@ void NormalizerConformanceTest::TestConformance(FileStream *input, int32_t optio
|
|||
}
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Test helper: converts the UTF-16 string to UTF-8 in a temporary std::string
// and asks norm2.isNormalizedUTF8() about the converted text.
UBool isNormalizedUTF8(const Normalizer2 &norm2, const UnicodeString &s, UErrorCode &errorCode) {
    std::string utf8;
    s.toUTF8String(utf8);
    return norm2.isNormalizedUTF8(utf8, errorCode);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
/**
|
||||
* Verify the conformance of the given line of the Unicode
|
||||
* normalization (UTR 15) test suite file. For each line,
|
||||
|
@ -342,18 +351,38 @@ UBool NormalizerConformanceTest::checkConformance(const UnicodeString* field,
|
|||
dataerrln("Normalizer error: isNormalized(NFC(s), UNORM_NFC) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(field[0]!=field[1] && Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
|
||||
errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
|
||||
if(options==0 && !isNormalizedUTF8(*nfc, field[1], status)) {
|
||||
dataerrln("Normalizer error: nfc.isNormalizedUTF8(NFC(s)) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(field[0]!=field[1]) {
|
||||
if(Normalizer::isNormalized(field[0], UNORM_NFC, options, status)) {
|
||||
errln("Normalizer error: isNormalized(s, UNORM_NFC) is TRUE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(isNormalizedUTF8(*nfc, field[0], status)) {
|
||||
errln("Normalizer error: nfc.isNormalizedUTF8(s) is TRUE");
|
||||
pass = FALSE;
|
||||
}
|
||||
}
|
||||
if(!Normalizer::isNormalized(field[3], UNORM_NFKC, options, status)) {
|
||||
dataerrln("Normalizer error: isNormalized(NFKC(s), UNORM_NFKC) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(field[0]!=field[3] && Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
|
||||
errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
|
||||
if(options==0 && !isNormalizedUTF8(*nfkc, field[3], status)) {
|
||||
dataerrln("Normalizer error: nfkc.isNormalizedUTF8(NFKC(s)) is FALSE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(field[0]!=field[3]) {
|
||||
if(Normalizer::isNormalized(field[0], UNORM_NFKC, options, status)) {
|
||||
errln("Normalizer error: isNormalized(s, UNORM_NFKC) is TRUE");
|
||||
pass = FALSE;
|
||||
}
|
||||
if(options==0 && isNormalizedUTF8(*nfkc, field[0], status)) {
|
||||
errln("Normalizer error: nfkc.isNormalizedUTF8(s) is TRUE");
|
||||
pass = FALSE;
|
||||
}
|
||||
}
|
||||
|
||||
// test FCD quick check and "makeFCD"
|
||||
Normalizer::normalize(field[0], UNORM_FCD, options, fcd, status);
|
||||
|
|
|
@ -58,6 +58,7 @@ void BasicNormalizerTest::runIndexedTest(int32_t index, UBool exec,
|
|||
TESTCASE_AUTO(TestLowMappingToEmpty_D);
|
||||
TESTCASE_AUTO(TestLowMappingToEmpty_FCD);
|
||||
TESTCASE_AUTO(TestNormalizeIllFormedText);
|
||||
TESTCASE_AUTO(TestComposeJamoTBase);
|
||||
TESTCASE_AUTO_END;
|
||||
}
|
||||
|
||||
|
@ -1566,6 +1567,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
expectedChanges, UPRV_LENGTHOF(expectedChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
assertFalse("isNormalizedUTF8(source)", nfkc_cf->isNormalizedUTF8(src, errorCode));
|
||||
assertTrue("isNormalizedUTF8(normalized)", nfkc_cf->isNormalizedUTF8(result, errorCode));
|
||||
|
||||
// Omit unchanged text.
|
||||
expected = u8"aääạ\u0308ạ\u0308가각갃";
|
||||
result.clear();
|
||||
|
@ -1605,6 +1609,9 @@ BasicNormalizerTest::TestNormalizeUTF8WithEdits() {
|
|||
filteredChanges, UPRV_LENGTHOF(filteredChanges),
|
||||
TRUE, errorCode);
|
||||
|
||||
assertFalse("filtered isNormalizedUTF8(source)", fn2.isNormalizedUTF8(src, errorCode));
|
||||
assertTrue("filtered isNormalizedUTF8(normalized)", fn2.isNormalizedUTF8(result, errorCode));
|
||||
|
||||
// Omit unchanged text.
|
||||
// Note that the result is not normalized because the inner normalizer
|
||||
// does not see text across filter spans.
|
||||
|
@ -1711,4 +1718,32 @@ BasicNormalizerTest::TestNormalizeIllFormedText() {
|
|||
assertEquals("normalizeUTF8", expected8.c_str(), result8.c_str());
|
||||
}
|
||||
|
||||
void
|
||||
BasicNormalizerTest::TestComposeJamoTBase() {
|
||||
// Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
|
||||
// which is not a conjoining Jamo Trailing consonant.
|
||||
IcuTestErrorCode errorCode(*this, "TestComposeJamoTBase");
|
||||
const Normalizer2 *nfkc = Normalizer2::getNFKCInstance(errorCode);
|
||||
if(errorCode.logDataIfFailureAndReset("Normalizer2::getNFKCInstance() call failed")) {
|
||||
return;
|
||||
}
|
||||
UnicodeString s(u"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
|
||||
UnicodeString expected(u"가\u11A7가\u11A7가\u11A7");
|
||||
UnicodeString result = nfkc->normalize(s, errorCode);
|
||||
assertSuccess("normalize(LV+11A7)", errorCode.get());
|
||||
assertEquals("normalize(LV+11A7)", expected, result);
|
||||
assertFalse("isNormalized(LV+11A7)", nfkc->isNormalized(s, errorCode));
|
||||
assertTrue("isNormalized(normalized)", nfkc->isNormalized(result, errorCode));
|
||||
|
||||
std::string s8(u8"\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7");
|
||||
std::string expected8(u8"가\u11A7가\u11A7가\u11A7");
|
||||
std::string result8;
|
||||
StringByteSink<std::string> sink(&result8, expected8.length());
|
||||
nfkc->normalizeUTF8(0, s8, sink, nullptr, errorCode);
|
||||
assertSuccess("normalizeUTF8(LV+11A7)", errorCode.get());
|
||||
assertEquals("normalizeUTF8(LV+11A7)", expected8.c_str(), result8.c_str());
|
||||
assertFalse("isNormalizedUTF8(LV+11A7)", nfkc->isNormalizedUTF8(s8, errorCode));
|
||||
assertTrue("isNormalizedUTF8(normalized)", nfkc->isNormalizedUTF8(result8, errorCode));
|
||||
}
|
||||
|
||||
#endif /* #if !UCONFIG_NO_NORMALIZATION */
|
||||
|
|
|
@ -51,6 +51,7 @@ public:
|
|||
void TestLowMappingToEmpty_D();
|
||||
void TestLowMappingToEmpty_FCD();
|
||||
void TestNormalizeIllFormedText();
|
||||
void TestComposeJamoTBase();
|
||||
|
||||
private:
|
||||
UnicodeString canonTests[24][3];
|
||||
|
|
|
@ -901,20 +901,21 @@ public final class Normalizer2Impl {
|
|||
public static final int IX_MIN_COMP_NO_MAYBE_CP=9;
|
||||
|
||||
// Norm16 value thresholds for quick check combinations and types of extra data.
|
||||
// Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
|
||||
|
||||
/** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
|
||||
public static final int IX_MIN_YES_NO=10;
|
||||
// Mappings are comp-normalized.
|
||||
/** Mappings are comp-normalized. */
|
||||
public static final int IX_MIN_NO_NO=11;
|
||||
public static final int IX_LIMIT_NO_NO=12;
|
||||
public static final int IX_MIN_MAYBE_YES=13;
|
||||
|
||||
// Mappings only in [minYesNoMappingsOnly..minNoNo[.
|
||||
/** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
|
||||
public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;
|
||||
// Mappings are not comp-normalized but have a comp boundary before.
|
||||
/** Mappings are not comp-normalized but have a comp boundary before. */
|
||||
public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15;
|
||||
// Mappings do not have a comp boundary before.
|
||||
/** Mappings do not have a comp boundary before. */
|
||||
public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16;
|
||||
// Mappings to the empty string.
|
||||
/** Mappings to the empty string. */
|
||||
public static final int IX_MIN_NO_NO_EMPTY=17;
|
||||
|
||||
public static final int IX_MIN_LCCC_CP=18;
|
||||
|
|
|
@ -2854,6 +2854,19 @@ public class BasicTest extends TestFmwk {
|
|||
assertEquals("normalize", expected, result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestComposeJamoTBase() {
|
||||
// Algorithmic composition of Hangul syllables must not combine with JAMO_T_BASE = U+11A7
|
||||
// which is not a conjoining Jamo Trailing consonant.
|
||||
Normalizer2 nfkc = Normalizer2.getNFKCInstance();
|
||||
String s = "\u1100\u1161\u11A7\u1100\u314F\u11A7가\u11A7";
|
||||
String expected = "가\u11A7가\u11A7가\u11A7";
|
||||
String result = nfkc.normalize(s);
|
||||
assertEquals("normalize(LV+11A7)", expected, result);
|
||||
assertFalse("isNormalized(LV+11A7)", nfkc.isNormalized(s));
|
||||
assertTrue("isNormalized(normalized)", nfkc.isNormalized(result));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestNFC() {
|
||||
// Coverage tests.
|
||||
|
|
Loading…
Add table
Reference in a new issue