From f4c112b59fcee233ba33b0308c2db9d6e49f3a9f Mon Sep 17 00:00:00 2001
From: Alex Zolotarev
Date: Thu, 31 Mar 2016 09:51:32 +0300
Subject: [PATCH] strings::NormalizeDigits for full-width unicode numbers.

---
 base/base_tests/string_utils_test.cpp | 13 +++++++++++++
 base/string_utils.cpp                 | 15 +++++++++++++++
 base/string_utils.hpp                 |  3 +++
 3 files changed, 31 insertions(+)

diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp
index 55b256bacf..05c269d29c 100644
--- a/base/base_tests/string_utils_test.cpp
+++ b/base/base_tests/string_utils_test.cpp
@@ -612,3 +612,16 @@ UNIT_TEST(EditDistance)
   testUniStringEditDistance("ll", "l1", 1);
   testUniStringEditDistance("\u0132ij", "\u0133IJ", 3);
 }
+
+UNIT_TEST(NormalizeDigits)
+{
+  auto const nd = [](string str) -> string
+  {
+    strings::NormalizeDigits(str);
+    return str;
+  };
+  TEST_EQUAL(nd(""), "", ());
+  TEST_EQUAL(nd("z12345//"), "z12345//", ());
+  TEST_EQUAL(nd("a0192 "), "a0192 ", ());
+  TEST_EQUAL(nd("3456789"), "3456789", ());
+}
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index e9f2aa1d39..6e643d18f2 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -113,6 +113,21 @@ UniString Normalize(UniString const & s)
   return result;
 }
 
+void NormalizeDigits(string & utf8)
+{
+  for (size_t i = 0; i + 2 < utf8.size(); ++i)
+  {
+    if (utf8[i] == '\xEF' && utf8[i + 1] == '\xBC')
+    {
+      char const n = utf8[i + 2];
+      if (n < '\x90' || n > '\x99')
+        continue;
+      utf8[i] = n - 0x90 + '0';
+      utf8.erase(i + 1, 2);
+    }
+  }
+}
+
 namespace
 {
 char ascii_to_lower(char in)
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index 4d006a0d08..a7a6a4290d 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -43,6 +43,9 @@ UniString MakeLowerCase(UniString const & s);
 void NormalizeInplace(UniString & s);
 UniString Normalize(UniString const & s);
 
+/// Replaces "full width" unicode digits with ascii ones.
+void NormalizeDigits(string & utf8);
+
 /// Counts number of start symbols in string s (that is not lower and not normalized) that maches
 /// to lower and normalized string low_s. If s doen't starts with low_s then returns 0; otherwise
 /// returns number of start symbols in s that equivalent to lowStr