// Replaces every fullwidth digit (U+FF10..U+FF19, encoded in UTF-8 as the
// three bytes EF BC 90 .. EF BC 99) with the corresponding ASCII digit
// '0'..'9'. All other bytes are preserved unchanged and in order.
//
// The string is compacted in place in a single left-to-right pass: a read
// cursor scans the input while a write cursor (never ahead of the read
// cursor) stores the normalized output, and the string is truncated once at
// the end. This avoids calling erase() for every converted digit.
//
// @param utf8 String to normalize; modified in place.
void NormalizeDigits(std::string & utf8)
{
  // Nothing in front of the first 0xEF lead byte can change, so that prefix
  // is skipped without being copied onto itself.
  size_t read = utf8.find('\xEF');
  if (read == std::string::npos)
    return;
  size_t write = read;

  size_t const size = utf8.size();
  // Exclusive upper bound for positions where a whole 3-byte sequence fits.
  size_t const lastStart = size >= 2 ? size - 2 : 0;

  // Invariant: utf8[0, read) has been examined and utf8[0, write) holds its
  // normalized form.
  while (read < lastStart)
  {
    // Compare as unsigned so the range check does not depend on whether
    // plain char is signed on the target platform.
    unsigned char const third = static_cast<unsigned char>(utf8[read + 2]);
    bool const isFullwidthDigit = utf8[read] == '\xEF' && utf8[read + 1] == '\xBC' &&
                                  third >= 0x90 && third <= 0x99;
    if (isFullwidthDigit)
    {
      utf8[write++] = static_cast<char>('0' + (third - 0x90));
      read += 3;
    }
    else
    {
      utf8[write++] = utf8[read++];
    }
  }

  // Fewer than three bytes remain, so no complete sequence can start here;
  // copy the tail verbatim.
  while (read < size)
    utf8[write++] = utf8[read++];

  utf8.resize(write);
}