diff --git a/indexer/indexer_tests/indexer_tests.pro b/indexer/indexer_tests/indexer_tests.pro index 5a7bcd5b56..fb2b05f02c 100644 --- a/indexer/indexer_tests/indexer_tests.pro +++ b/indexer/indexer_tests/indexer_tests.pro @@ -34,6 +34,7 @@ SOURCES += \ mwm_set_test.cpp \ point_to_int64_test.cpp \ scales_test.cpp \ + search_string_utils_test.cpp \ sort_and_merge_intervals_test.cpp \ test_polylines.cpp \ test_type.cpp \ diff --git a/indexer/indexer_tests/search_string_utils_test.cpp b/indexer/indexer_tests/search_string_utils_test.cpp new file mode 100644 index 0000000000..7a2a68b10c --- /dev/null +++ b/indexer/indexer_tests/search_string_utils_test.cpp @@ -0,0 +1,21 @@ +#include "../../testing/testing.hpp" +#include "../search_string_utils.hpp" + +#include "../../base/string_utils.hpp" + +UNIT_TEST(FeatureTypeToString) +{ + TEST_EQUAL("!type:123", strings::ToUtf8(search::FeatureTypeToString(123)), ()); +} + +UNIT_TEST(NormalizeAndSimplifyStringWithOurTambourines) +{ + string const arr[] = {"ÜbërÅłłęšß", "uberallesss", // Basic test case: diacritics are stripped and ß is expected to become "ss". + "Iiİı", "iiii", // Famous Turkish "I" letter bug: I, i, İ and ı are all expected to become plain i. + "ЙЁйёШКИЙй", "йейешкийй", // Russian й is expected to stay intact, while ё is simplified to е. + "ØøÆ挜", "ooaeaeoeoe", + "バス", "ハス" + }; + for (size_t i = 0; i < ARRAY_SIZE(arr); i += 2) + TEST_EQUAL(arr[i + 1], strings::ToUtf8(search::NormalizeAndSimplifyString(arr[i])), (i)); +} diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index b729e08327..d3cf48f8f1 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -1,15 +1,65 @@ #pragma once #include "../base/string_utils.hpp" +#include "../std/algorithm.hpp" + namespace search { // This function should be used for all search strings normalization. +// It folds a few special letters (Turkish İ/ı, Ø/ø, Œ/œ, Æ/æ) to plain Latin ones, lower-cases the text and passes everything except the Russian й through Normalize. 
inline strings::UniString NormalizeAndSimplifyString(string const & s) { - strings::UniString uniS = strings::MakeLowerCase(strings::MakeUniString(s)); - strings::Normalize(uniS); - return uniS; + using namespace strings; + UniString uniString = MakeUniString(s); + for (size_t i = 0; i < uniString.size(); ++i) + { + UniChar & c = uniString[i]; + switch (c) + { + // Replace small turkish dotless 'ı' with dotted 'i'. + // Our own invented hack to avoid well-known Turkish I-letter bug. + case 0x0131: c = 'i'; break; + // Replace capital turkish dotted 'İ' with dotted lowercased 'i'. + // Here we need to handle this case manually too, because default unicode-compliant implementation + // of MakeLowerCase converts 'İ' to 'i' + 0x0307. + case 0x0130: c = 'i'; break; + // Some Danish-specific hacks. + case 0x00d8: // Ø + case 0x00f8: c = 'o'; break; // ø + case 0x0152: // Œ + case 0x0153: // œ + c = 'o'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + case 0x00c6: // Æ + case 0x00e6: // æ + c = 'a'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + } + } + MakeLowerCase(uniString); + // Just after lower casing is a correct place to avoid normalization for specific chars. + static auto const isSpecificChar = [](UniChar c) -> bool + { + return c == 0x0439; // й + }; + UniString result; + result.reserve(uniString.size()); + for (auto i = uniString.begin(), end = uniString.end(); i != end;) + { + auto j = find_if(i, end, isSpecificChar); + // We don't check if (j != i) because UniString and Normalize handle it correctly. + UniString normString(i, j); + Normalize(normString); + result.insert(result.end(), normString.begin(), normString.end()); + if (j == end) + break; + result.push_back(*j); + i = j + 1; + } + return result; } template