strings::NormalizeDigits for full-width unicode numbers.

This commit is contained in:
Alex Zolotarev 2016-03-31 09:51:32 +03:00
parent 86d4446e86
commit f4c112b59f
3 changed files with 31 additions and 0 deletions

View file

@ -612,3 +612,16 @@ UNIT_TEST(EditDistance)
testUniStringEditDistance("ll", "l1", 1);
testUniStringEditDistance("\u0132ij", "\u0133IJ", 3);
}
// Checks that strings::NormalizeDigits converts full-width digits (U+FF10..U+FF19) to ASCII
// digits and leaves every other character untouched.
UNIT_TEST(NormalizeDigits)
{
  // Takes the string by value so that every case is normalized on its own copy.
  auto const nd = [](string str) -> string
  {
    strings::NormalizeDigits(str);
    return str;
  };
  // Nothing to do for an empty string or for plain ASCII.
  TEST_EQUAL(nd(""), "", ());
  TEST_EQUAL(nd("z12345"), "z12345", ());
  // Full-width 0, 1 and 2 mixed with ASCII 'a', '9' and a trailing space. The literal is split
  // so that the ASCII '9' is visibly separate from the preceding \u escape.
  TEST_EQUAL(nd("a\uFF10\uFF11" "9" "\uFF12 "), "a0192 ", ());
  // A string that consists only of full-width digits 3..9.
  TEST_EQUAL(nd("\uFF13\uFF14\uFF15\uFF16\uFF17\uFF18\uFF19"), "3456789", ());
  // A full-width character that is not a digit (U+FF21, full-width 'A') must stay as is.
  TEST_EQUAL(nd("\uFF21"), "\uFF21", ());
}

View file

@ -113,6 +113,21 @@ UniString Normalize(UniString const & s)
return result;
}
/// Replaces full-width digits (U+FF10..U+FF19) in a UTF-8 string with ASCII '0'..'9', in place.
///
/// Each full-width digit is encoded in UTF-8 as the three bytes EF BC 90 .. EF BC 99. Every such
/// sequence is collapsed into the matching single ASCII byte; all other bytes are kept as is.
/// The string is compacted in one pass instead of erasing two bytes per digit, so the cost is
/// linear in the string length.
///
/// @param utf8 String to normalize; it is modified in place and may become shorter.
void NormalizeDigits(string & utf8)
{
  size_t const size = utf8.size();
  size_t src = 0;  // Next byte to read.
  size_t dst = 0;  // Next byte to write; never runs ahead of src.
  while (src < size)
  {
    if (src + 2 < size && utf8[src] == '\xEF' && utf8[src + 1] == '\xBC')
    {
      // Compare as an unsigned byte so the range check and the digit arithmetic do not depend
      // on whether plain char is signed.
      unsigned char const third = static_cast<unsigned char>(utf8[src + 2]);
      if (third >= 0x90 && third <= 0x99)
      {
        utf8[dst++] = static_cast<char>('0' + (third - 0x90));
        src += 3;
        continue;
      }
    }
    // Not a full-width digit: keep the byte. Only already consumed positions are overwritten.
    utf8[dst++] = utf8[src++];
  }
  utf8.resize(dst);
}
namespace
{
char ascii_to_lower(char in)

View file

@ -43,6 +43,9 @@ UniString MakeLowerCase(UniString const & s);
void NormalizeInplace(UniString & s);
UniString Normalize(UniString const & s);
/// Replaces "full width" unicode digits with ascii ones.
/// Works in place on a UTF-8 encoded string: every 3-byte sequence EF BC 90 .. EF BC 99
/// (U+FF10 .. U+FF19) is replaced by the single ASCII byte '0' .. '9', so the string may
/// become shorter. All other bytes are left unchanged.
void NormalizeDigits(string & utf8);
/// Counts the number of leading symbols in string s (which is neither lowercased nor normalized)
/// that match the lowercased and normalized string low_s. If s doesn't start with low_s, returns 0;
/// otherwise returns the number of leading symbols in s that are equivalent to lowStr