From b053c2de1feb1cb56cb813aa6bde6e973af05285 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 25 Jan 2017 13:47:47 +0300 Subject: [PATCH] Review fixes. --- search/feature_offset_match.hpp | 7 +- search/geocoder.cpp | 48 +++++------- search/geocoder.hpp | 2 +- search/processor.cpp | 2 +- search/query_params.cpp | 64 +++++++++------- search/query_params.hpp | 73 +++++++++++++++---- search/ranking_utils.cpp | 16 ++-- search/ranking_utils.hpp | 8 +- search/retrieval.cpp | 4 +- .../processor_test.cpp | 2 +- search/search_tests/locality_scorer_test.cpp | 6 +- search/search_tests/ranking_tests.cpp | 2 +- search/streets_matcher.cpp | 2 +- search/token_slice.hpp | 10 +-- search/utils.hpp | 2 +- 15 files changed, 149 insertions(+), 99 deletions(-) diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 1eb7409462..cb88fbddf7 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -320,16 +320,13 @@ void MatchPostcodesInTrie(TokenSlice const & slice, if (slice.IsPrefix(i)) { vector> dfas; - for (auto const & s : slice.Get(i)) - dfas.emplace_back(UniStringDFA(s)); - + slice.Get(i).ForEach([&dfas](UniString const & s) { dfas.emplace_back(UniStringDFA(s)); }); MatchInTrie(dfas, TrieRootPrefix(*postcodesRoot, edge), intersector); } else { vector dfas; - for (auto const & s : slice.Get(i)) - dfas.emplace_back(s); + slice.Get(i).ForEach([&dfas](UniString const & s) { dfas.emplace_back(s); }); MatchInTrie(dfas, TrieRootPrefix(*postcodesRoot, edge), intersector); } diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 0900d58dc7..e5c60b8e80 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -180,10 +180,7 @@ void JoinQueryTokens(QueryParams const & params, size_t curToken, size_t endToke ASSERT_LESS_OR_EQUAL(curToken, endToken, ()); for (size_t i = curToken; i < endToken; ++i) { - auto const & syms = params.GetTokens(i); - CHECK(!syms.empty(), ("Empty synonyms for token:", i)); - res.append(syms.front()); - + res.append(params.GetToken(i).m_original); if (i + 1 != endToken) res.append(sep); } @@ -384,12 +381,16 @@ void Geocoder::SetParams(Params const & params) continue; } - auto & syms = m_params.GetTokens(i); - my::EraseIf(syms, &IsStopWord); - if (syms.empty()) + auto & token = m_params.GetToken(i); + if (IsStopWord(token.m_original)) + { m_params.RemoveToken(i); + } else + { + my::EraseIf(token.m_synonyms, &IsStopWord); ++i; + } } // If all tokens are stop words - give up. @@ -401,10 +402,8 @@ void Geocoder::SetParams(Params const & params) // individually. for (size_t i = 0; i < m_params.GetNumTokens(); ++i) { - auto & synonyms = m_params.GetTokens(i); - ASSERT(!synonyms.empty(), ()); - - if (IsStreetSynonym(synonyms.front())) + auto & token = m_params.GetToken(i); + if (IsStreetSynonym(token.m_original)) m_params.GetTypeIndices(i).clear(); } @@ -416,13 +415,12 @@ void Geocoder::SetParams(Params const & params) { m_tokenRequests.emplace_back(); auto & request = m_tokenRequests.back(); - for (auto const & name : m_params.GetTokens(i)) - { + m_params.GetToken(i).ForEach([&request](UniString const & s) { // Here and below, we use LevenshteinDFAs for fuzzy // matching. But due to performance reasons, we assume that the // first letter is always correct. - request.m_names.emplace_back(name, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(name)); - } + request.m_names.emplace_back(s, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(s)); + }); for (auto const & index : m_params.GetTypeIndices(i)) request.m_categories.emplace_back(FeatureTypeToString(index)); request.m_langs = m_params.GetLangs(); @@ -430,22 +428,17 @@ void Geocoder::SetParams(Params const & params) else { auto & request = m_prefixTokenRequest; - for (auto const & name : m_params.GetTokens(i)) - { - LOG(LINFO, ("Adding prefix name:", name)); + m_params.GetToken(i).ForEach([&request](UniString const & s) { request.m_names.emplace_back( - LevenshteinDFA(name, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(name))); - } + LevenshteinDFA(s, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(s))); + }); for (auto const & index : m_params.GetTypeIndices(i)) - { - LOG(LINFO, ("Adding category:", FeatureTypeToString(index))); request.m_categories.emplace_back(FeatureTypeToString(index)); - } request.m_langs = m_params.GetLangs(); } } - LOG(LDEBUG, (DebugPrint(static_cast(m_params)))); + LOG(LDEBUG, (static_cast(m_params))); } void Geocoder::GoEverywhere() @@ -623,16 +616,9 @@ void Geocoder::InitBaseContext(BaseContext & ctx) for (size_t i = 0; i < ctx.m_features.size(); ++i) { if (m_params.IsPrefixToken(i)) - { - LOG(LINFO, ("Token is prefix.")); ctx.m_features[i] = RetrieveAddressFeatures(*m_context, m_cancellable, m_prefixTokenRequest); - } else - { - LOG(LINFO, ("Token is full.")); ctx.m_features[i] = RetrieveAddressFeatures(*m_context, m_cancellable, m_tokenRequests[i]); - } - LOG(LINFO, ("For i-th token:", i, ctx.m_features[i].PopCount())); } ctx.m_hotelsFilter = m_hotelsFilter.MakeScopedFilter(*m_context, m_params.m_hotelsFilter); } diff --git a/search/geocoder.hpp b/search/geocoder.hpp index 6aa412594d..229c8c65a8 100644 --- a/search/geocoder.hpp +++ b/search/geocoder.hpp @@ -184,7 +184,7 @@ private: template using TLocalitiesCache = map, vector>; - QueryParams::Synonyms const & GetTokens(size_t i) const; + QueryParams::Token const & GetTokens(size_t i) const; // Creates a cache of posting lists corresponding to features in m_context // for each token and saves it to m_addressFeatures. diff --git a/search/processor.cpp b/search/processor.cpp index fd8c40eb7e..63d325e7e8 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -614,7 +614,7 @@ int GetOldTypeFromIndex(size_t index) void Processor::InitParams(QueryParams & params) { if (m_prefix.empty()) - params.Init(m_tokens.begin(), m_tokens.end()); + params.InitNoPrefix(m_tokens.begin(), m_tokens.end()); else params.InitWithPrefix(m_tokens.begin(), m_tokens.end(), m_prefix); diff --git a/search/query_params.cpp b/search/query_params.cpp index d404996671..db5e31cac8 100644 --- a/search/query_params.cpp +++ b/search/query_params.cpp @@ -8,7 +8,7 @@ namespace search { namespace { -// TODO (@y, @m): reuse this class in search::Processor. +// TODO (@y, @m): reuse this class in Processor. class DoAddStreetSynonyms { public: @@ -22,26 +22,25 @@ public: // All synonyms should be lowercase! if (ss == "n") - AddSym(i, "north"); + AddSynonym(i, "north"); if (ss == "w") - AddSym(i, "west"); + AddSynonym(i, "west"); if (ss == "s") - AddSym(i, "south"); + AddSynonym(i, "south"); if (ss == "e") - AddSym(i, "east"); + AddSynonym(i, "east"); if (ss == "nw") - AddSym(i, "northwest"); + AddSynonym(i, "northwest"); if (ss == "ne") - AddSym(i, "northeast"); + AddSynonym(i, "northeast"); if (ss == "sw") - AddSym(i, "southwest"); + AddSynonym(i, "southwest"); if (ss == "se") - AddSym(i, "southeast"); + AddSynonym(i, "southeast"); } private: - QueryParams::Synonyms & GetSyms(size_t i) const { return m_params.GetTokens(i); } - void AddSym(size_t i, string const & sym) { GetSyms(i).push_back(strings::MakeUniString(sym)); } + void AddSynonym(size_t i, string const & synonym) { m_params.GetToken(i).AddSynonym(synonym); } QueryParams & m_params; }; @@ -50,7 +49,8 @@ private: void QueryParams::Clear() { m_tokens.clear(); - m_prefixTokens.clear(); + m_prefixToken.Clear(); + m_hasPrefix = false; m_typeIndices.clear(); m_langs.clear(); m_scale = scales::GetUpperScale(); @@ -76,16 +76,16 @@ bool QueryParams::IsPrefixToken(size_t i) const return i == m_tokens.size(); } -QueryParams::Synonyms const & QueryParams::GetTokens(size_t i) const +QueryParams::Token const & QueryParams::GetToken(size_t i) const { ASSERT_LESS(i, GetNumTokens(), ()); - return i < m_tokens.size() ? m_tokens[i] : m_prefixTokens; + return i < m_tokens.size() ? m_tokens[i] : m_prefixToken; } -QueryParams::Synonyms & QueryParams::GetTokens(size_t i) +QueryParams::Token & QueryParams::GetToken(size_t i) { ASSERT_LESS(i, GetNumTokens(), ()); - return i < m_tokens.size() ? m_tokens[i] : m_prefixTokens; + return i < m_tokens.size() ? m_tokens[i] : m_prefixToken; } bool QueryParams::IsNumberTokens(size_t start, size_t end) const @@ -96,14 +96,15 @@ bool QueryParams::IsNumberTokens(size_t start, size_t end) const for (; start != end; ++start) { bool number = false; - for (auto const & t : GetTokens(start)) - { - if (feature::IsNumber(t)) + GetToken(start).ForEach([&number](String const & s) { + if (feature::IsNumber(s)) { number = true; - break; + return false; // breaks ForEach } - } + return true; // continues ForEach + }); + if (!number) return false; } @@ -115,19 +116,32 @@ void QueryParams::RemoveToken(size_t i) { ASSERT_LESS(i, GetNumTokens(), ()); if (i == m_tokens.size()) - m_prefixTokens.clear(); + { + m_prefixToken.Clear(); + m_hasPrefix = false; + } else + { m_tokens.erase(m_tokens.begin() + i); + } m_typeIndices.erase(m_typeIndices.begin() + i); } -string DebugPrint(search::QueryParams const & params) +string DebugPrint(QueryParams const & params) { ostringstream os; - os << "QueryParams [ m_tokens=" << DebugPrint(params.m_tokens) - << ", m_prefixTokens=" << DebugPrint(params.m_prefixTokens) + os << "QueryParams [ m_tokens=" << ::DebugPrint(params.m_tokens) + << ", m_prefixToken=" << DebugPrint(params.m_prefixToken) << ", m_typeIndices=" << ::DebugPrint(params.m_typeIndices) << ", m_langs=" << ::DebugPrint(params.m_langs) << " ]"; return os.str(); } + +string DebugPrint(QueryParams::Token const & token) +{ + ostringstream os; + os << "Token [ m_original=" << DebugPrint(token.m_original) + << ", m_synonyms=" << DebugPrint(token.m_synonyms) << " ]"; + return os.str(); +} } // namespace search diff --git a/search/query_params.hpp b/search/query_params.hpp index e879fe9ed5..e1844bfd89 100644 --- a/search/query_params.hpp +++ b/search/query_params.hpp @@ -6,7 +6,9 @@ #include "base/string_utils.hpp" #include "std/cstdint.hpp" +#include "std/type_traits.hpp" #include "std/unordered_set.hpp" +#include "std/utility.hpp" #include "std/vector.hpp" namespace search @@ -15,18 +17,58 @@ class QueryParams { public: using String = strings::UniString; - using Synonyms = vector; using TypeIndices = vector; using Langs = unordered_set; + struct Token + { + Token() = default; + Token(String const & original) : m_original(original) {} + + void AddSynonym(String const & s) { m_synonyms.push_back(s); } + void AddSynonym(string const & s) { m_synonyms.push_back(strings::MakeUniString(s)); } + + // Calls |fn| on the original token and on synonyms. + template + typename enable_if::type, void>::value>::type ForEach( + Fn && fn) const + { + fn(m_original); + for_each(m_synonyms.begin(), m_synonyms.end(), forward(fn)); + } + + // Calls |fn| on the original token and on synonyms until |fn| return false. + template + typename enable_if::type, bool>::value>::type ForEach( + Fn && fn) const + { + if (!fn(m_original)) + return; + for (auto const & synonym : m_synonyms) + { + if (!fn(synonym)) + return; + } + } + + void Clear() + { + m_original.clear(); + m_synonyms.clear(); + } + + String m_original; + vector m_synonyms; + }; + QueryParams() = default; template - void Init(It tokenBegin, It tokenEnd) + void InitNoPrefix(It tokenBegin, It tokenEnd) { Clear(); for (; tokenBegin != tokenEnd; ++tokenBegin) - m_tokens.push_back({*tokenBegin}); + m_tokens.emplace_back(*tokenBegin); m_typeIndices.resize(GetNumTokens()); } @@ -35,17 +77,18 @@ public: { Clear(); for (; tokenBegin != tokenEnd; ++tokenBegin) - m_tokens.push_back({*tokenBegin}); - m_prefixTokens.push_back(prefix); + m_tokens.push_back(*tokenBegin); + m_prefixToken.m_original = prefix; + m_hasPrefix = true; m_typeIndices.resize(GetNumTokens()); } inline size_t GetNumTokens() const { - return m_prefixTokens.empty() ? m_tokens.size() : m_tokens.size() + 1; + return m_hasPrefix ? m_tokens.size() + 1: m_tokens.size(); } - inline bool LastTokenIsPrefix() const { return !m_prefixTokens.empty(); } + inline bool LastTokenIsPrefix() const { return m_hasPrefix; } inline bool IsEmpty() const { return GetNumTokens() == 0; } void Clear(); @@ -55,8 +98,8 @@ public: TypeIndices const & GetTypeIndices(size_t i) const; bool IsPrefixToken(size_t i) const; - Synonyms const & GetTokens(size_t i) const; - Synonyms & GetTokens(size_t i); + Token const & GetToken(size_t i) const; + Token & GetToken(size_t i); // Returns true if all tokens in [start, end) range have integral // synonyms. @@ -66,18 +109,22 @@ public: inline Langs & GetLangs() { return m_langs; } inline Langs const & GetLangs() const { return m_langs; } - inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; } + inline bool LangExists(int8_t lang) const { return m_langs.count(lang) != 0; } inline int GetScale() const { return m_scale; } private: - friend string DebugPrint(search::QueryParams const & params); + friend string DebugPrint(QueryParams const & params); + + vector m_tokens; + Token m_prefixToken; + bool m_hasPrefix = false; - vector m_tokens; - Synonyms m_prefixTokens; vector m_typeIndices; Langs m_langs; int m_scale = scales::GetUpperScale(); }; + +string DebugPrint(QueryParams::Token const & token); } // namespace search diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp index 39dcf2fe7f..9d8a97b978 100644 --- a/search/ranking_utils.cpp +++ b/search/ranking_utils.cpp @@ -8,16 +8,22 @@ namespace search { namespace impl { -bool Match(vector const & tokens, UniString const & token) +bool FullMatch(QueryParams::Token const & token, UniString const & text) { - return find(tokens.begin(), tokens.end(), token) != tokens.end(); + if (token.m_original == text) + return true; + auto const & synonyms = token.m_synonyms; + return find(synonyms.begin(), synonyms.end(), text) != synonyms.end(); } -bool PrefixMatch(vector const & prefixes, UniString const & token) +bool PrefixMatch(QueryParams::Token const & token, UniString const & text) { - for (auto const & prefix : prefixes) + if (StartsWith(text, token.m_original)) + return true; + + for (auto const & synonym : token.m_synonyms) { - if (StartsWith(token, prefix)) + if (StartsWith(text, synonym)) return true; } return false; diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp index 5dd9496360..5e336d4289 100644 --- a/search/ranking_utils.hpp +++ b/search/ranking_utils.hpp @@ -20,9 +20,9 @@ class QueryParams; namespace impl { -bool Match(vector const & tokens, strings::UniString const & token); +bool FullMatch(QueryParams::Token const & token, strings::UniString const & text); -bool PrefixMatch(vector const & prefixes, strings::UniString const & token); +bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & text); } // namespace impl // The order and numeric values are important here. Please, check all @@ -65,11 +65,11 @@ NameScore GetNameScore(vector const & tokens, TSlice const & { bool match = true; for (size_t i = 0; i < m - 1 && match; ++i) - match = match && impl::Match(slice.Get(i), tokens[offset + i]); + match = match && impl::FullMatch(slice.Get(i), tokens[offset + i]); if (!match) continue; - if (impl::Match(slice.Get(m - 1), tokens[offset + m - 1])) + if (impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1])) { if (m == n) return NAME_SCORE_FULL_MATCH; diff --git a/search/retrieval.cpp b/search/retrieval.cpp index e68e420ed3..69f736beac 100644 --- a/search/retrieval.cpp +++ b/search/retrieval.cpp @@ -184,10 +184,10 @@ bool MatchFeatureByPostcode(FeatureType const & ft, TokenSlice const & slice) { if (slice.IsPrefix(i)) { - if (!StartsWith(tokens[i], slice.Get(i).front())) + if (!StartsWith(tokens[i], slice.Get(i).m_original)) return false; } - else if (tokens[i] != slice.Get(i).front()) + else if (tokens[i] != slice.Get(i).m_original) { return false; } diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 13c000fbf5..670cc5232b 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -534,7 +534,7 @@ UNIT_CLASS_TEST(ProcessorTest, TestPostcodes) QueryParams params; { strings::UniString const tokens[] = {strings::MakeUniString("141702")}; - params.Init(tokens, tokens + ARRAY_SIZE(tokens)); + params.InitNoPrefix(tokens, tokens + ARRAY_SIZE(tokens)); } auto features = RetrievePostcodeFeatures(context, cancellable, TokenSlice(params, 0, params.GetNumTokens())); diff --git a/search/search_tests/locality_scorer_test.cpp b/search/search_tests/locality_scorer_test.cpp index 17306796bd..4e01f56e4e 100644 --- a/search/search_tests/locality_scorer_test.cpp +++ b/search/search_tests/locality_scorer_test.cpp @@ -35,7 +35,7 @@ void InitParams(string const & query, bool lastTokenIsPrefix, QueryParams & para } else { - params.Init(tokens.begin(), tokens.end()); + params.InitNoPrefix(tokens.begin(), tokens.end()); } } @@ -47,7 +47,7 @@ void AddLocality(string const & name, uint32_t featureId, QueryParams & params, Delimiters delims; SplitUniString(NormalizeAndSimplifyString(name), MakeInsertFunctor(tokens), delims); - size_t numTokens = params.GetNumTokens(); + size_t const numTokens = params.GetNumTokens(); for (size_t startToken = 0; startToken != numTokens; ++startToken) { @@ -56,7 +56,7 @@ void AddLocality(string const & name, uint32_t featureId, QueryParams & params, bool matches = true; for (size_t i = startToken; i != endToken && matches; ++i) { - UniString const & queryToken = params.GetTokens(i).front(); + UniString const & queryToken = params.GetToken(i).m_original; if (params.IsPrefixToken(i)) { matches = any_of(tokens.begin(), tokens.end(), [&queryToken](UniString const & token) diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp index e313ebbee5..d5b14c004d 100644 --- a/search/search_tests/ranking_tests.cpp +++ b/search/search_tests/ranking_tests.cpp @@ -33,7 +33,7 @@ NameScore GetScore(string const & name, string const & query, size_t startToken, } else { - params.Init(tokens.begin(), tokens.end()); + params.InitNoPrefix(tokens.begin(), tokens.end()); } return GetNameScore(name, TokenSlice(params, startToken, endToken)); diff --git a/search/streets_matcher.cpp b/search/streets_matcher.cpp index e28088abce..733f19c8b4 100644 --- a/search/streets_matcher.cpp +++ b/search/streets_matcher.cpp @@ -154,7 +154,7 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const & for (; curToken < ctx.m_numTokens && !ctx.m_usedTokens[curToken] && !streets.IsEmpty(); ++curToken) { - auto const & token = params.GetTokens(curToken).front(); + auto const & token = params.GetToken(curToken).m_original; bool const isPrefix = params.IsPrefixToken(curToken); if (house_numbers::LooksLikeHouseNumber(token, isPrefix)) diff --git a/search/token_slice.hpp b/search/token_slice.hpp index 999f542370..7301103420 100644 --- a/search/token_slice.hpp +++ b/search/token_slice.hpp @@ -17,10 +17,10 @@ class TokenSlice public: TokenSlice(QueryParams const & params, size_t startToken, size_t endToken); - inline QueryParams::Synonyms const & Get(size_t i) const + inline QueryParams::Token const & Get(size_t i) const { ASSERT_LESS(i, Size(), ()); - return m_params.GetTokens(m_offset + i); + return m_params.GetToken(m_offset + i); } inline size_t Size() const { return m_size; } @@ -42,10 +42,10 @@ class TokenSliceNoCategories public: TokenSliceNoCategories(QueryParams const & params, size_t startToken, size_t endToken); - inline QueryParams::Synonyms const & Get(size_t i) const + inline QueryParams::Token const & Get(size_t i) const { ASSERT_LESS(i, Size(), ()); - return m_params.GetTokens(m_indexes[i]); + return m_params.GetToken(m_indexes[i]); } inline size_t Size() const { return m_indexes.size(); } @@ -64,7 +64,7 @@ public: QuerySlice(TokenSlice const & slice) : m_slice(slice) {} // QuerySlice overrides: - QueryParams::String const & Get(size_t i) const override { return m_slice.Get(i).front(); } + QueryParams::String const & Get(size_t i) const override { return m_slice.Get(i).m_original; } size_t Size() const override { return m_slice.Size(); } private: diff --git a/search/utils.hpp b/search/utils.hpp index 2a1ae8402f..c36f74736a 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -17,7 +17,7 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales { for (size_t i = 0; i < slice.Size(); ++i) { - auto token = slice.Get(i); + auto const & token = slice.Get(i); for (size_t j = 0; j < locales.size(); ++j) categories.ForEachTypeByName(locales[j], token, bind(todo, i, _1));