Add function to calculate equal range for UTF-16 strings.

This commit is contained in:
Denis Koronchik 2014-06-19 15:22:01 +03:00 committed by Alex Zolotarev
parent 23f76dc513
commit 93917f9c7e
3 changed files with 95 additions and 0 deletions

View file

@ -434,3 +434,63 @@ UNIT_TEST(IsUtf8Test)
TEST(strings::IsASCIIString("YES"), ());
TEST(strings::IsASCIIString("Nice places in Zhodino.kml"), ());
}
/// Table-driven check of strings::CountNormLowerSymbols: the i-th raw input is matched against
/// the i-th lower-cased/normalized string and the returned count is compared with results[i].
UNIT_TEST(CountNormLowerSymbols)
{
  // Source strings, passed as-is (neither lower-cased nor normalized).
  char const * rawInputs[] = {
    "æüßs",
    "üßü",
    "İŉẖtestὒ",
    "İŉẖ",
    "İŉẖtestὒ",
    "HelloWorld",
    "üßü",
    "",
    "",
    "Тест на не корректную русскую строку",
    "В ответе пустая строка",
    "Überstraße"
  };

  // Lower-cased and normalized strings the sources are matched against, index by index.
  char const * normalizedPrefixes[] = {
    "æusss",
    "ussu",
    "i\u0307\u02bcnh\u0331testυ\u0313\u0300",
    "i\u0307\u02bcnh\u0331testυ\u0313\u0300",
    "i\u0307\u02bcnh\u0331",
    "helloworld",
    "usu",
    "",
    "empty",
    "Тест на не корректную строку",
    "",
    "uberstras"
  };

  // Expected number of leading source symbols reported for each pair above.
  size_t const results [] = {
    4,   // 0: low string covers the whole source
    3,   // 1: low string covers the whole source
    8,   // 2: low string covers the whole source
    0,   // 3: low string is longer than the source
    3,   // 4: low string covers only the beginning of the source
    10,  // 5: plain ASCII, differs by case only
    0,   // 6: low string diverges from the source
    0,   // 7: both strings are empty
    0,   // 8: empty source, non-empty low string
    0,   // 9: low string diverges from the source
    0,   // 10: empty low string
    9    // 11: low string ends inside the expansion of the ninth source symbol
  };

  size_t const caseCount = ARRAY_SIZE(rawInputs);
  for (size_t i = 0; i != caseCount; ++i)
  {
    strings::UniString const raw = strings::MakeUniString(rawInputs[i]);
    strings::UniString const prefix = strings::MakeUniString(normalizedPrefixes[i]);

    size_t res = strings::CountNormLowerSymbols(raw, prefix);
    TEST_EQUAL(res, results[i], ());
  }
}

View file

@ -194,4 +194,33 @@ void MakeLowerCase(UniString & s)
s.swap(r);
}
/// Walks s one symbol at a time, lower-cases and normalizes that single symbol, and matches
/// the resulting symbols against consecutive symbols of lowStr.
/// @param s       source string, taken as-is.
/// @param lowStr  string to match against; compared symbol by symbol with the folded form of s.
/// @return number of leading symbols of s consumed when lowStr has been matched completely,
///         or 0 if a symbol differs, s runs out before lowStr does, or lowStr is empty.
size_t CountNormLowerSymbols(UniString const & s, UniString const & lowStr)
{
  size_t const srcLen = s.size();
  size_t const lowLen = lowStr.size();

  size_t consumed = 0;  // symbols of s processed so far
  size_t matched = 0;   // symbols of lowStr matched so far

  while (matched < lowLen)
  {
    // s is exhausted while lowStr still has unmatched symbols.
    if (consumed == srcLen)
      return 0;

    // Fold a single source symbol; the result may contain several symbols.
    UniString folded;
    folded.push_back(s[consumed]);
    ++consumed;
    MakeLowerCase(folded);
    Normalize(folded);

    size_t const foldedLen = folded.size();
    for (size_t k = 0; k != foldedLen; ++k)
    {
      // lowStr ended in the middle of this symbol's folded form: the symbol still counts.
      if (matched >= lowLen)
        return consumed;

      if (lowStr[matched] != folded[k])
        return 0;
      ++matched;
    }
  }

  return consumed;
}
} // namespace strings

View file

@ -36,6 +36,12 @@ UniString Normalize(UniString const & s);
/// For implementation @see base/normilize_unicode.cpp
void Normalize(UniString & s);
/// Counts the number of leading symbols of string s (which is neither lower-cased nor
/// normalized) that match the lower-cased and normalized string lowStr.
/// If s doesn't start with lowStr then returns 0; otherwise returns the number of leading
/// symbols of s that are equivalent to lowStr.
/// For implementation @see base/lower_case.cpp
size_t CountNormLowerSymbols(UniString const & s, UniString const & lowStr);
void AsciiToLower(string & s);
void Trim(string & s);