From ba145c115de9f6172cd54b3840a3ed15d36f73b8 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Wed, 6 Jan 2016 20:32:31 +0300 Subject: [PATCH] [search] [indexer] Do not index streets by tokens synonymous to "street". It is enough to index them in the categories branch of the trie. --- generator/dumper.hpp | 2 +- generator/search_index_builder.cpp | 132 ++++++++++++++++++++--------- indexer/feature.hpp | 2 +- 3 files changed, 92 insertions(+), 44 deletions(-) diff --git a/generator/dumper.hpp b/generator/dumper.hpp index 91f68c522a..08d27a2b31 100644 --- a/generator/dumper.hpp +++ b/generator/dumper.hpp @@ -14,6 +14,6 @@ namespace feature // Writes the names of all features in the locale provided by lang // (e.g. "en", "ru", "sv"). If the locale is not recognized, writes all names - // preceded by locale. + // preceded by their locales. void DumpFeatureNames(string const & fPath, string const & lang); } diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index a8250249c1..aafbcef201 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -12,6 +12,7 @@ #include "indexer/feature_utils.hpp" #include "indexer/feature_visibility.hpp" #include "indexer/features_vector.hpp" +#include "indexer/ftypes_matcher.hpp" #include "indexer/index.hpp" #include "indexer/trie_builder.hpp" #include "indexer/types_skipper.hpp" @@ -95,6 +96,30 @@ public: } }; +void GetCategoryTypes(CategoriesHolder const & categories, pair const & scaleRange, + feature::TypesHolder const & types, vector & result) +{ + Classificator const & c = classif(); + + for (uint32_t t : types) + { + // Leave only 2 levels of types - for example, do not distinguish: + // highway-primary-bridge or amenity-parking-fee. + ftype::TruncValue(t, 2); + + // Only categorized types will be added to index. + if (!categories.IsTypeExist(t)) + continue; + + // Index only those types that are visible. 
+ pair const r = feature::GetDrawableScaleRange(t); + CHECK(r.first <= r.second && r.first != -1, (c.GetReadableObjectName(t))); + + if (r.second >= scaleRange.first && r.first <= scaleRange.second) + result.push_back(t); + } +} + template struct FeatureNameInserter { @@ -102,9 +127,14 @@ struct FeatureNameInserter vector> & m_keyValuePairs; TValue m_val; - FeatureNameInserter(SynonymsHolder * synonyms, vector> & keyValuePairs) + bool m_hasStreetType; + + FeatureNameInserter(SynonymsHolder * synonyms, vector> & keyValuePairs, + vector const & categoryTypes) : m_synonyms(synonyms), m_keyValuePairs(keyValuePairs) { + auto const & streetChecker = ftypes::IsStreetChecker::Instance(); + m_hasStreetType = streetChecker(categoryTypes); } void AddToken(signed char lang, strings::UniString const & s) const @@ -117,21 +147,6 @@ struct FeatureNameInserter m_keyValuePairs.emplace_back(key, m_val); } -private: - using TTokensArray = buffer_vector; - - class PushSynonyms - { - TTokensArray & m_tokens; - - public: - PushSynonyms(TTokensArray & tokens) : m_tokens(tokens) {} - void operator() (string const & utf8str) const - { - m_tokens.push_back(search::NormalizeAndSimplifyString(utf8str)); - } - }; - public: bool operator()(signed char lang, string const & name) const { @@ -143,7 +158,12 @@ public: // add synonyms for input native string if (m_synonyms) - m_synonyms->ForEach(name, PushSynonyms(tokens)); + { + m_synonyms->ForEach(name, [&](string const & utf8str) + { + tokens.push_back(search::NormalizeAndSimplifyString(utf8str)); + }); + } int const maxTokensCount = search::MAX_TOKENS - 1; if (tokens.size() > maxTokensCount) @@ -152,8 +172,32 @@ public: tokens.resize(maxTokensCount); } + // Streets are a special case: we do not add the token "street" and its + // synonyms when the feature's name contains it because in + // the search phase this part of the query will be matched against the + // "street" in the categories branch of the search index. 
+ // However, we still add it when there are two or more street tokens + // ("industrial st", "улица набережная"). + size_t numStreets = 0; + vector isStreet(tokens.size()); for (size_t i = 0; i < tokens.size(); ++i) + { + if (search::IsStreetSynonym(strings::ToUtf8(tokens[i]))) + { + isStreet[i] = true; + ++numStreets; + } + } + + for (size_t i = 0; i < tokens.size(); ++i) + { + if (numStreets == 1 && isStreet[i] && m_hasStreetType) + { + LOG(LDEBUG, ("skipping street:", name)); + continue; + } AddToken(lang, tokens[i]); + } return true; } @@ -224,10 +268,13 @@ public: if (types.Empty()) return; + vector categoryTypes; + GetCategoryTypes(m_categories, m_scales, types, categoryTypes); + // Init inserter with serialized value. // Insert synonyms only for countries and states (maybe will add cities in future). FeatureNameInserter inserter( - skipIndex.IsCountryOrState(types) ? m_synonyms : nullptr, m_keyValuePairs); + skipIndex.IsCountryOrState(types) ? m_synonyms : nullptr, m_keyValuePairs, categoryTypes); m_valueBuilder.MakeValue(f, types, index, inserter.m_val); // Skip types for features without names. @@ -238,27 +285,12 @@ public: Classificator const & c = classif(); + categoryTypes.clear(); + GetCategoryTypes(m_categories, m_scales, types, categoryTypes); + // add names of categories of the feature - for (uint32_t t : types) - { - // Leave only 2 level of type - for example, do not distinguish: - // highway-primary-bridge or amenity-parking-fee. - ftype::TruncValue(t, 2); - - // Push to index only categorized types. - if (m_categories.IsTypeExist(t)) - { - // Do index only for visible types in mwm. 
- pair const r = feature::GetDrawableScaleRange(t); - CHECK(r.first <= r.second && r.first != -1, (c.GetReadableObjectName(t))); - - if (r.second >= m_scales.first && r.first <= m_scales.second) - { - inserter.AddToken(search::kCategoriesLang, - search::FeatureTypeToString(c.GetIndexForType(t))); - } - } - } + for (uint32_t t : categoryTypes) + inserter.AddToken(search::kCategoriesLang, search::FeatureTypeToString(c.GetIndexForType(t))); } }; @@ -371,15 +403,31 @@ bool BuildSearchIndexFromDataFile(string const & filename, bool forceRebuild) LOG(LINFO, ("Search address table size =", writer.Size())); } { - FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); - writeContainer.DeleteSection(SEARCH_TOKENS_FILE_TAG); - + // The behaviour of generator_tool's generate_search_index + // is currently broken: this section is generated elsewhere + // and is deleted here before the final step of the mwm generation + // so it does not pollute the resulting mwm. + // So using and deleting this section is fine when generating + // an mwm from scratch but does not work when regenerating the + // search index section. Comment out the call to DeleteSection + // if you need to regenerate the search index. + // todo(@m) Is it possible to make it work? { + FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); + writeContainer.DeleteSection(SEARCH_TOKENS_FILE_TAG); + } + + // Separate scopes because FilesContainerW cannot write two sections at once. 
+ { + FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); FileWriter writer = writeContainer.GetWriter(SEARCH_INDEX_FILE_TAG); rw_ops::Reverse(FileReader(indexFilePath), writer); } - writeContainer.Write(addrFilePath, SEARCH_ADDRESS_FILE_TAG); + { + FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); + writeContainer.Write(addrFilePath, SEARCH_ADDRESS_FILE_TAG); + } } } catch (Reader::Exception const & e) diff --git a/indexer/feature.hpp b/indexer/feature.hpp index 4b4b4a1195..b64face4a3 100644 --- a/indexer/feature.hpp +++ b/indexer/feature.hpp @@ -80,7 +80,7 @@ public: */ template - inline bool ForEachNameRef(T & functor) const + inline bool ForEachNameRef(T && functor) const { if (!HasName()) return false;