diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index 2a047c2aeb..34c86579a7 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -25,7 +25,6 @@ #include "platform/platform.hpp" -#include "coding/map_uint32_to_val.hpp" #include "coding/reader_writer_ops.hpp" #include "coding/succinct_mapper.hpp" #include "coding/writer.hpp" @@ -49,6 +48,11 @@ #define SYNONYMS_FILE "synonyms.txt" +namespace indexer +{ +using namespace strings; +using namespace search; + namespace { class SynonymsHolder @@ -130,23 +134,22 @@ void GetCategoryTypes(CategoriesHolder const & categories, std::pair s template class FeatureNameInserter { - strings::UniString m_Str, m_Strasse; + String2StringMap const & m_suffixes; public: FeatureNameInserter(uint32_t index, SynonymsHolder * synonyms, std::vector> & keyValuePairs, bool hasStreetType) - : m_val(index) + : m_suffixes(GetDACHStreets()) + , m_val(index) , m_synonyms(synonyms) , m_keyValuePairs(keyValuePairs) , m_hasStreetType(hasStreetType) { - m_Strasse = strings::MakeUniString("strasse"); - m_Str = strings::MakeUniString("str"); } - void AddToken(uint8_t lang, strings::UniString const & s) const + void AddToken(uint8_t lang, UniString const & s) const { - strings::UniString key; + UniString key; key.reserve(s.size() + 1); key.push_back(lang); key.append(s.begin(), s.end()); @@ -157,28 +160,34 @@ public: // Adds search tokens for different ways of writing strasse: // Hauptstrasse -> Haupt strasse, Hauptstr. // Haupt strasse -> Hauptstrasse, Hauptstr. - void AddStrasseNames(int8_t lang, std::vector const & tokens) const + void AddDACHNames(int8_t lang, std::vector const & tokens) const { for (size_t i = 0; i < tokens.size(); ++i) { auto const & token = tokens[i]; - if (!strings::EndsWith(token, m_Strasse)) - continue; + for (auto const & sx : m_suffixes) + { + if (!EndsWith(token, sx.first)) + continue; - if (token == m_Strasse) - { - if (i != 0) + // We expect that suffixes are street synonyms, so no need to add them separately into index. + ASSERT(IsStreetSynonym(sx.first) || IsStreetSynonym(sx.second), (sx.first)); + + if (token == sx.first) { - AddToken(lang, tokens[i - 1] + m_Strasse); - AddToken(lang, tokens[i - 1] + m_Str); + if (i != 0) + { + AddToken(lang, tokens[i - 1] + sx.first); + AddToken(lang, tokens[i - 1] + sx.second); + } + } + else + { + auto const name = UniString(token.begin(), token.end() - sx.first.size()); + AddToken(lang, name); + AddToken(lang, name + sx.second); } - } - else - { - auto const name = strings::UniString(token.begin(), token.end() - m_Strasse.size()); - AddToken(lang, name); - AddToken(lang, name + m_Str); } } } @@ -186,7 +195,7 @@ public: void operator()(int8_t lang, std::string_view name) const { /// @todo No problem here if we will have duplicating tokens? (POI name like "Step by Step"). - auto tokens = search::NormalizeAndTokenizeString(name); + auto tokens = NormalizeAndTokenizeString(name); // add synonyms for input native string if (m_synonyms) @@ -194,7 +203,7 @@ public: /// @todo Avoid creating temporary std::string. m_synonyms->ForEach(std::string(name), [&](std::string const & utf8str) { - tokens.push_back(search::NormalizeAndSimplifyString(utf8str)); + tokens.push_back(NormalizeAndSimplifyString(utf8str)); }); } @@ -208,14 +217,14 @@ public: if (m_hasStreetType) { - search::StreetTokensFilter filter( - [&](strings::UniString const & token, size_t /* tag */) { AddToken(lang, token); }, + StreetTokensFilter filter( + [&](UniString const & token, size_t /* tag */) { AddToken(lang, token); }, false /* withMisprints */); for (auto const & token : tokens) filter.Put(token, false /* isPrefix */, 0 /* tag */); - AddStrasseNames(lang, tokens); + AddDACHNames(lang, tokens); } else { @@ -507,8 +516,6 @@ void BuildAddressTable(FilesContainerR & container, std::string const & addressD } } // namespace -namespace indexer -{ void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter); bool BuildSearchIndexFromDataFile(std::string const & country, feature::GenerateInfo const & info, diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 270522a5e7..89ff482acc 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -356,7 +356,7 @@ private: "ქუჩა", // German - Deutsch - "straße", "str", + "straße", "str", "platz", "pl", // Hungarian - Magyar "utca", "út", @@ -498,4 +498,14 @@ void StreetTokensFilter::Put(UniString const & token, bool isPrefix, size_t tag) m_callback(token, tag); } + +String2StringMap const & GetDACHStreets() +{ + static String2StringMap res = { + { MakeUniString("strasse"), MakeUniString("str") }, + { MakeUniString("platz"), MakeUniString("pl") }, + }; + return res; +} + } // namespace search diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 7e306ab72b..c3f32b86ad 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -128,4 +128,9 @@ private: Callback m_callback; bool m_withMisprints = false; }; -} // namespace search + +// D-A-CH countries have special street suffixes processing. +using String2StringMap = std::map; +String2StringMap const & GetDACHStreets(); + +} // namespace search diff --git a/search/ranker.cpp b/search/ranker.cpp index a1a87c8961..aec46fa380 100644 --- a/search/ranker.cpp +++ b/search/ranker.cpp @@ -54,41 +54,44 @@ void UpdateNameScores(TokensVector & tokens, uint8_t lang, Slice const & slice, } // This function supports only street names like "abcdstrasse"/"abcd strasse". -vector> ModifyStrasse(vector const & streetTokens) +/// @see Also FeatureNameInserter::AddDACHNames +vector> ModifyDACHStreet(vector const & streetTokens) { - vector> result; - auto static const kStrasse = strings::MakeUniString("strasse"); - auto static const kStr = strings::MakeUniString("str"); auto const size = streetTokens.size(); + ASSERT_GREATER(size, 0, ()); - if (size == 0 || !strings::EndsWith(streetTokens.back(), kStrasse)) - return {}; - - if (streetTokens.back() == kStrasse) + vector> result; + for (auto const & sx : GetDACHStreets()) { - if (size == 1) - return {}; + if (!strings::EndsWith(streetTokens.back(), sx.first)) + continue; - // "Abcd strasse" -> "abcdstrasse". - result.emplace_back(streetTokens.begin(), streetTokens.end() - 1); - result.back().back() += kStrasse; + if (streetTokens.back() == sx.first) + { + if (size == 1) + return {}; - // "Abcd strasse" -> "abcdstr". - result.emplace_back(streetTokens.begin(), streetTokens.end() - 1); - result.back().back() += kStr; - return result; + // "Abcd strasse" -> "abcdstrasse". + result.emplace_back(streetTokens.begin(), streetTokens.end() - 1); + result.back().back() += sx.first; + + // "Abcd strasse" -> "abcdstr". + result.emplace_back(streetTokens.begin(), streetTokens.end() - 1); + result.back().back() += sx.second; + return result; + } + + // "Abcdstrasse" -> "abcd strasse". + auto const name = strings::UniString(streetTokens.back().begin(), streetTokens.back().end() - sx.first.size()); + result.push_back(streetTokens); + result.back().back() = name; + result.back().push_back(sx.first); + + // "Abcdstrasse" -> "abcdstr". + result.push_back(streetTokens); + result.back().back() = name + sx.second; } - // "Abcdstrasse" -> "abcd strasse". - auto const name = - strings::UniString(streetTokens.back().begin(), streetTokens.back().end() - kStrasse.size()); - result.push_back(streetTokens); - result.back().back() = name; - result.back().push_back(kStrasse); - - // "Abcdstrasse" -> "abcdstr". - result.push_back(streetTokens); - result.back().back() = name + kStr; return result; } @@ -129,19 +132,18 @@ NameScores GetNameScores(FeatureType & ft, Geocoder::Params const & params, /// 2. Make an optimization: If there are no synonyms or "strasse", skip this step. if (type == Model::TYPE_STREET) { - // Searching for "Santa Fe" should rank "Avenida Satna Fe" like FULL_MATCH or FULL_PREFIX, but not SUBSTRING. + // Searching for "Santa Fe" should rank "Avenida Santa Fe" like FULL_MATCH or FULL_PREFIX, but not SUBSTRING. { TokensVector cleaned(RemoveStreetSynonyms(vec.GetTokens())); UpdateNameScores(cleaned, lang, slice, bestScores); UpdateNameScores(cleaned, lang, sliceNoCategories, bestScores); } - /// @todo Should definitely add "platz"-"pl". And maybe "gasse"-"g", "allee"-"al"? - for (auto & variant : ModifyStrasse(vec.GetTokens())) + for (auto & variant : ModifyDACHStreet(vec.GetTokens())) { - TokensVector vec(std::move(variant)); - UpdateNameScores(vec, lang, slice, bestScores); - UpdateNameScores(vec, lang, sliceNoCategories, bestScores); + TokensVector modified(std::move(variant)); + UpdateNameScores(modified, lang, slice, bestScores); + UpdateNameScores(modified, lang, sliceNoCategories, bestScores); } } }; diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp index 653643b654..cf0883f834 100644 --- a/search/ranking_utils.hpp +++ b/search/ranking_utils.hpp @@ -129,7 +129,11 @@ struct NameScores void UpdateIfBetter(NameScores const & rhs) { - auto const newNameScoreIsBetter = m_nameScore < rhs.m_nameScore; + auto newNameScoreIsBetter = m_nameScore < rhs.m_nameScore; + // FULL_PREFIX with 0 errors is better than FULL_MATCH with 2 errors. + if (newNameScoreIsBetter && m_nameScore == NameScore::FULL_PREFIX && m_errorsMade.IsBetterThan(rhs.m_errorsMade)) + newNameScoreIsBetter = false; + auto const nameScoresAreEqual = m_nameScore == rhs.m_nameScore; auto const newLanguageIsBetter = m_isAltOrOldName && !rhs.m_isAltOrOldName; auto const languagesAreEqual = m_isAltOrOldName == rhs.m_isAltOrOldName; diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 0c60565634..210d39e141 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -2108,18 +2108,20 @@ UNIT_CLASS_TEST(ProcessorTest, StreetSynonymPrefix) UNIT_CLASS_TEST(ProcessorTest, Strasse) { - TestStreet s1({{-1.0, -1.0},{1.0, 1.0}}, "abcdstraße", "en"); - TestStreet s2({{1.0, -1.0}, {-1.0, 1.0}}, "xyz strasse", "en"); + TestStreet s1({{-1.0, -1.0},{1.0, 1.0}}, "abcdstraße", "de"); + TestStreet s2({{1.0, -1.0}, {-1.0, 1.0}}, "xyz strasse", "de"); + TestStreet s3({{-2.0, -2.0},{2.0, 2.0}}, "bahnhofplatz", "de"); auto countryId = BuildCountry("Wonderland", [&](TestMwmBuilder & builder) { builder.Add(s1); builder.Add(s2); + builder.Add(s3); }); auto checkNoErrors = [&](string const & query, Rules const & rules) { - auto request = MakeRequest(query, "en"); + auto request = MakeRequest(query, "de"); auto const & results = request->Results(); TEST(ResultsMatch(results, rules), (query)); @@ -2129,7 +2131,7 @@ UNIT_CLASS_TEST(ProcessorTest, Strasse) TEST(nameScore == NameScore::FULL_MATCH || nameScore == NameScore::FULL_PREFIX, (query)); }; - SetViewport(m2::RectD(0.0, 0.0, 1.0, 2.0)); + SetViewport(m2::RectD(-1, -1, 1, 1)); { Rules rules = {ExactMatch(countryId, s1)}; checkNoErrors("abcdstrasse ", rules); @@ -2160,6 +2162,15 @@ UNIT_CLASS_TEST(ProcessorTest, Strasse) checkNoErrors("xyz ", rules); checkNoErrors("xyz", rules); } + { + Rules rules = {ExactMatch(countryId, s3)}; + checkNoErrors("bahnhofplatz", rules); + checkNoErrors("bahnhof platz", rules); + checkNoErrors("bahnhof", rules); + checkNoErrors("bahnhof ", rules); + checkNoErrors("bahnhofpl", rules); + checkNoErrors("bahnhofpl ", rules); + } } UNIT_CLASS_TEST(ProcessorTest, StreetSynonymsWithMisprints)