forked from organicmaps/organicmaps
strings::NormalizeDigits for full-width unicode numbers.
This commit is contained in:
parent
86d4446e86
commit
f4c112b59f
3 changed files with 31 additions and 0 deletions
|
@ -612,3 +612,16 @@ UNIT_TEST(EditDistance)
|
|||
testUniStringEditDistance("ll", "l1", 1);
|
||||
testUniStringEditDistance("\u0132ij", "\u0133IJ", 3);
|
||||
}
|
||||
|
||||
// Checks that strings::NormalizeDigits() turns full-width digits into ASCII digits
// and leaves every other byte of the string untouched.
UNIT_TEST(NormalizeDigits)
{
  // Applies the in-place normalization to a copy so that calls can be used as expressions.
  auto const nd = [](string str) -> string
  {
    strings::NormalizeDigits(str);
    return str;
  };

  // Full-width characters are spelled as explicit UTF-8 bytes (U+FF10..U+FF19 is
  // EF BC 90..99) so the test does not depend on the source file encoding. Each
  // escape sits in its own literal: a hex escape would otherwise swallow a
  // following hex-digit character.

  // Nothing to do for an empty or a pure ASCII string.
  TEST_EQUAL(nd(""), "", ());
  TEST_EQUAL(nd("z12345//"), "z12345//", ());

  // Full-width 1..5 are converted; the full-width solidus (EF BC 8F) is not a digit and stays.
  TEST_EQUAL(nd("z" "\xEF\xBC\x91" "\xEF\xBC\x92" "\xEF\xBC\x93" "\xEF\xBC\x94" "\xEF\xBC\x95"
                "/" "\xEF\xBC\x8F"),
             "z12345/" "\xEF\xBC\x8F", ());

  // Mix of full-width and ASCII digits.
  TEST_EQUAL(nd("a" "\xEF\xBC\x90" "\xEF\xBC\x91" "9" "\xEF\xBC\x92" " "), "a0192 ", ());

  // String made only of full-width digits, including one at the very end.
  TEST_EQUAL(nd("\xEF\xBC\x93" "\xEF\xBC\x94" "\xEF\xBC\x95" "\xEF\xBC\x96" "\xEF\xBC\x97"
                "\xEF\xBC\x98" "\xEF\xBC\x99"),
             "3456789", ());

  // A truncated sequence at the end of the string must be kept as is.
  TEST_EQUAL(nd("7" "\xEF\xBC"), "7" "\xEF\xBC", ());
}
|
||||
|
|
|
@ -113,6 +113,21 @@ UniString Normalize(UniString const & s)
|
|||
return result;
|
||||
}
|
||||
|
||||
/// Replaces every full-width Unicode digit (U+FF10..U+FF19) in |utf8| with the
/// matching ASCII digit '0'..'9'. The string is modified in place.
///
/// A full-width digit is encoded in UTF-8 as the three bytes EF BC 90..99, so each
/// replacement shrinks the string by two bytes. All other bytes, including other
/// EF BC xx sequences and truncated sequences at the end, are kept untouched.
///
/// @param utf8 String to normalize; expected to hold UTF-8 encoded text.
void NormalizeDigits(std::string & utf8)
{
  // Bytes are compared as unsigned values so the result does not depend on
  // whether plain char is signed on the target platform.
  auto const byteAt = [&utf8](size_t pos) { return static_cast<unsigned char>(utf8[pos]); };

  // Single-pass compaction: |src| scans the input, |dst| is the next output slot.
  // |dst| never overtakes |src|, so unread bytes are never overwritten. This keeps
  // the work linear, unlike calling erase() for every digit found.
  size_t const size = utf8.size();
  size_t dst = 0;
  size_t src = 0;
  while (src < size)
  {
    if (src + 2 < size && byteAt(src) == 0xEF && byteAt(src + 1) == 0xBC)
    {
      unsigned char const last = byteAt(src + 2);
      if (last >= 0x90 && last <= 0x99)
      {
        utf8[dst++] = static_cast<char>('0' + (last - 0x90));
        src += 3;
        continue;
      }
    }

    utf8[dst++] = utf8[src++];
  }

  utf8.resize(dst);
}
|
||||
|
||||
namespace
|
||||
{
|
||||
char ascii_to_lower(char in)
|
||||
|
|
|
@ -43,6 +43,9 @@ UniString MakeLowerCase(UniString const & s);
|
|||
void NormalizeInplace(UniString & s);
|
||||
UniString Normalize(UniString const & s);
|
||||
|
||||
/// Replaces full-width Unicode digits (U+FF10..U+FF19) in a UTF-8 string with their ASCII counterparts, in place.
|
||||
void NormalizeDigits(string & utf8);
|
||||
|
||||
/// Counts number of start symbols in string s (that is not lower and not normalized) that match
|
||||
/// to lower and normalized string low_s. If s doesn't start with low_s then returns 0; otherwise
|
||||
/// returns number of start symbols in s that are equivalent to lowStr
|
||||
|
|
Loading…
Add table
Reference in a new issue