forked from organicmaps/organicmaps
[search] Turkish I fix, Russian й fix, Danish æ,ø and œ fix.
This commit is contained in:
parent
06ac7e7387
commit
2b6dd058f2
3 changed files with 75 additions and 3 deletions
|
@ -34,6 +34,7 @@ SOURCES += \
|
|||
mwm_set_test.cpp \
|
||||
point_to_int64_test.cpp \
|
||||
scales_test.cpp \
|
||||
search_string_utils_test.cpp \
|
||||
sort_and_merge_intervals_test.cpp \
|
||||
test_polylines.cpp \
|
||||
test_type.cpp \
|
||||
|
|
21
indexer/indexer_tests/search_string_utils_test.cpp
Normal file
21
indexer/indexer_tests/search_string_utils_test.cpp
Normal file
|
@ -0,0 +1,21 @@
|
|||
#include "../../testing/testing.hpp"
|
||||
#include "../search_string_utils.hpp"
|
||||
|
||||
#include "../../base/string_utils.hpp"
|
||||
|
||||
// Checks the textual token produced for a numeric feature type id.
UNIT_TEST(FeatureTypeToString)
{
  string const expected = "!type:123";
  string const actual = strings::ToUtf8(search::FeatureTypeToString(123));
  TEST_EQUAL(expected, actual, ());
}
|
||||
|
||||
// Table-driven check of search::NormalizeAndSimplifyString.
// The table is a flat list of pairs: the raw input followed by the expected normalized form.
UNIT_TEST(NormalizeAndSimplifyStringWithOurTambourines)
{
  string const cases[] = {
    // Mixed diacritics; the expected form has them stripped and "ß" expanded to "ss".
    "ÜbërÅłłęšß", "uberallesss",
    // All four Turkish I variants are expected to collapse into a plain 'i'.
    "Iiİı", "iiii",
    // Russian 'й' must survive, while 'ё' is expected to become 'е'.
    "ЙЁйёШКИЙй", "йейешкийй",
    // Danish letters and ligatures are expected to be spelled with plain Latin letters.
    "ØøÆ挜", "ooaeaeoeoe",
    // Katakana: the expected form has no voiced sound mark.
    "バス", "ハス"
  };

  size_t const count = ARRAY_SIZE(cases);
  for (size_t i = 0; i < count; i += 2)
  {
    string const & input = cases[i];
    string const & expected = cases[i + 1];
    TEST_EQUAL(expected, strings::ToUtf8(search::NormalizeAndSimplifyString(input)), (i));
  }
}
|
|
@ -1,15 +1,65 @@
|
|||
#pragma once
|
||||
#include "../base/string_utils.hpp"
|
||||
|
||||
#include "../std/algorithm.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
|
||||
// This function should be used for all search strings normalization.
|
||||
// It does some magic text transformation which greatly helps us to improve our search.
|
||||
inline strings::UniString NormalizeAndSimplifyString(string const & s)
|
||||
{
|
||||
strings::UniString uniS = strings::MakeLowerCase(strings::MakeUniString(s));
|
||||
strings::Normalize(uniS);
|
||||
return uniS;
|
||||
using namespace strings;
|
||||
UniString uniString = MakeUniString(s);
|
||||
for (size_t i = 0; i < uniString.size(); ++i)
|
||||
{
|
||||
UniChar & c = uniString[i];
|
||||
switch (c)
|
||||
{
|
||||
// Replace small turkish dotless 'ı' with dotted 'i'.
|
||||
// Our own invented hack to avoid well-known Turkish I-letter bug.
|
||||
case 0x0131: c = 'i'; break;
|
||||
// Replace capital turkish dotted 'İ' with dotted lowercased 'i'.
|
||||
// Here we need to handle this case manually too, because default unicode-compliant implementation
|
||||
// of MakeLowerCase converts 'İ' to 'i' + 0x0307.
|
||||
case 0x0130: c = 'i'; break;
|
||||
// Some Danish-specific hacks.
|
||||
case 0x00d8: // Ø
|
||||
case 0x00f8: c = 'o'; break; // ø
|
||||
case 0x0152: // Œ
|
||||
case 0x0153: // œ
|
||||
c = 'o';
|
||||
uniString.insert(uniString.begin() + (i++) + 1, 'e');
|
||||
break;
|
||||
case 0x00c6: // Æ
|
||||
case 0x00e6: // æ
|
||||
c = 'a';
|
||||
uniString.insert(uniString.begin() + (i++) + 1, 'e');
|
||||
break;
|
||||
}
|
||||
}
|
||||
MakeLowerCase(uniString);
|
||||
// Just after lower casing is a correct place to avoid normalization for specific chars.
|
||||
static auto const isSpecificChar = [](UniChar c) -> bool
|
||||
{
|
||||
return c == 0x0439; // й
|
||||
};
|
||||
UniString result;
|
||||
result.reserve(uniString.size());
|
||||
for (auto i = uniString.begin(), end = uniString.end(); i != end;)
|
||||
{
|
||||
auto j = find_if(i, end, isSpecificChar);
|
||||
// We don't check if (j != i) because UniString and Normalize handle it correctly.
|
||||
UniString normString(i, j);
|
||||
Normalize(normString);
|
||||
result.insert(result.end(), normString.begin(), normString.end());
|
||||
if (j == end)
|
||||
break;
|
||||
result.push_back(*j);
|
||||
i = j + 1;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
template <class DelimsT, typename F>
|
||||
|
|
Loading…
Add table
Reference in a new issue