diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index 4953cecd02..a37ed25e0e 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -33,15 +33,15 @@ public: void ForEach(ToDo && toDo) { TString prefix; - ForEach(&m_root, prefix, toDo); + ForEach(&m_root, prefix, forward(toDo)); } template - void ForEachInSubtree(TString prefix, ToDo && toDo) + void ForEachInSubtree(TString prefix, ToDo && toDo) const { - Node * nd = MoveTo(prefix); - if (nd) - ForEach(nd, prefix, toDo); + Node const * node = MoveTo(prefix); + if (node) + ForEach(node, prefix, forward(toDo)); } size_t GetNumNodes() const { return m_numNodes; } @@ -79,22 +79,21 @@ private: DISALLOW_COPY_AND_MOVE(Node); }; - Node * MoveTo(TString const & key) + Node const * MoveTo(TString const & key) const { - Node * cur = &m_root; + Node const * cur = &m_root; for (auto const & c : key) { auto const it = cur->m_moves.find(c); - if (it != cur->m_moves.end()) - cur = it->second; - else + if (it == cur->m_moves.end()) return nullptr; + cur = it->second; } return cur; } template - void ForEach(Node * root, TString & prefix, ToDo && toDo) + void ForEach(Node const * root, TString & prefix, ToDo && toDo) const { if (!root->m_values.empty()) { diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp index 8b49341ab8..cf96e2a223 100644 --- a/indexer/categories_holder.cpp +++ b/indexer/categories_holder.cpp @@ -277,10 +277,12 @@ int8_t CategoriesHolder::MapLocaleToInteger(string const & locale) {"he", 29 }, {"sw", 30 } }; - ASSERT_EQUAL(ARRAY_SIZE(mapping), kNumLanguages, ()); + static_assert(ARRAY_SIZE(mapping) == kNumLanguages, ""); for (size_t i = 0; i < kNumLanguages; ++i) + { if (locale.find(mapping[i].m_name) == 0) return mapping[i].m_code; + } // Special cases for different Chinese variations if (locale.find("zh") == 0) diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index 68b543e003..bba60a738c 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -60,8 +60,8 @@ public: template void ForEachTypeAndCategory(ToDo && toDo) const { - for (IteratorT i = m_type2cat.begin(); i != m_type2cat.end(); ++i) - toDo(i->first, *i->second); + for (auto const it : m_type2cat) + toDo(it.first, *it.second); } template diff --git a/indexer/categories_index.cpp b/indexer/categories_index.cpp index 344c2b3721..0c39adc53b 100644 --- a/indexer/categories_index.cpp +++ b/indexer/categories_index.cpp @@ -1,34 +1,60 @@ #include "categories_index.hpp" +#include "search_delimiters.hpp" +#include "search_string_utils.hpp" +#include "base/assert.hpp" +#include "base/stl_add.hpp" #include "base/stl_helpers.hpp" +#include "base/string_utils.hpp" #include "std/algorithm.hpp" #include "std/set.hpp" namespace { -void AddAllSubstrings(my::MemTrie & trie, string const & s, uint32_t value) +void AddAllNonemptySubstrings(my::MemTrie & trie, string const & s, + uint32_t value) { + ASSERT(!s.empty(), ()); for (size_t i = 0; i < s.length(); ++i) { string t; for (size_t j = i; j < s.length(); ++j) { - t.append(1, s[j]); + t.push_back(s[j]); trie.Add(t, value); } } } + +template +void ForEachToken(string const & s, TF && fn) +{ + vector tokens; + SplitUniString(search::NormalizeAndSimplifyString(s), MakeBackInsertFunctor(tokens), search::Delimiters()); + for (auto const & token : tokens) + fn(strings::ToUtf8(token)); +} + +void TokenizeAndAddAllSubstrings(my::MemTrie & trie, string const & s, + uint32_t value) +{ + auto fn = [&](string const & token) + { + AddAllNonemptySubstrings(trie, token, value); + }; + ForEachToken(s, fn); +} } // namespace namespace indexer { void CategoriesIndex::AddCategoryByTypeAndLang(uint32_t type, int8_t lang) { - m_catHolder.ForEachNameByType(type, [&](CategoriesHolder::Category::Name const & name) + m_catHolder.ForEachNameByType(type, [&](TCategory::Name const & name) { if (name.m_locale == lang) - AddAllSubstrings(m_trie, name.m_name, type); + TokenizeAndAddAllSubstrings(m_trie, name.m_name, type); }); } @@ -40,45 +66,62 @@ void CategoriesIndex::AddCategoryByTypeAllLangs(uint32_t type) void CategoriesIndex::AddAllCategoriesInLang(int8_t lang) { - m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat) + m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat) { for (auto const & name : cat.m_synonyms) { if (name.m_locale == lang) - AddAllSubstrings(m_trie, name.m_name, type); + TokenizeAndAddAllSubstrings(m_trie, name.m_name, type); } }); } -void CategoriesIndex::AddAllCategoriesAllLangs() +void CategoriesIndex::AddAllCategoriesInAllLangs() { - m_catHolder.ForEachTypeAndCategory([this](uint32_t type, Category const & cat) + m_catHolder.ForEachTypeAndCategory([this](uint32_t type, TCategory const & cat) { for (auto const & name : cat.m_synonyms) - AddAllSubstrings(m_trie, name.m_name, type); + TokenizeAndAddAllSubstrings(m_trie, name.m_name, type); }); } -void CategoriesIndex::GetCategories(string const & query, vector & result) +void CategoriesIndex::GetCategories(string const & query, vector & result) const { vector types; GetAssociatedTypes(query, types); my::SortUnique(types); - m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat) + m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat) { if (binary_search(types.begin(), types.end(), type)) result.push_back(cat); }); } -void CategoriesIndex::GetAssociatedTypes(string const & query, vector & result) +void CategoriesIndex::GetAssociatedTypes(string const & query, vector & result) const { - set types; - auto fn = [&](string const & s, uint32_t type) + bool first = true; + set intersection; + ForEachToken(query, [&](string const & token) { - types.insert(type); - }; - m_trie.ForEachInSubtree(query, fn); - result.insert(result.end(), types.begin(), types.end()); + set types; + auto fn = [&](string const &, uint32_t type) + { + types.insert(type); + }; + m_trie.ForEachInSubtree(token, fn); + if (first) + { + intersection.swap(types); + } + else + { + set tmp; + set_intersection(intersection.begin(),intersection.end(),types.begin(),types.end(),inserter(tmp,tmp.begin())); + intersection.swap(tmp); + } + first = false; + }); + + result.insert(result.end(), intersection.begin(), intersection.end()); } } // namespace indexer diff --git a/indexer/categories_index.hpp b/indexer/categories_index.hpp index 0429976096..09859499ad 100644 --- a/indexer/categories_index.hpp +++ b/indexer/categories_index.hpp @@ -16,7 +16,7 @@ namespace indexer class CategoriesIndex { public: - using Category = CategoriesHolder::Category; + using TCategory = CategoriesHolder::Category; CategoriesIndex() : m_catHolder(GetDefaultCategories()) {} @@ -36,21 +36,22 @@ public: void AddAllCategoriesInLang(int8_t lang); // Adds all categories from data/classificator.txt. - void AddAllCategoriesAllLangs(); + void AddAllCategoriesInAllLangs(); // Returns all categories that have |query| as a substring. Note // that all synonyms for a category are contained in a returned // value even if only one language was used when adding this // category's name to index. // Beware weird results when query is a malformed UTF-8 string. - void GetCategories(string const & query, vector & result); + void GetCategories(string const & query, vector & result) const; // Returns all types that match to categories that have |query| as substring. // Beware weird results when query is a malformed UTF-8 string. - void GetAssociatedTypes(string const & query, vector & result); + // Note: no types are returned if the query is empty. + void GetAssociatedTypes(string const & query, vector & result) const; #ifdef DEBUG - int GetNumTrieNodes() { return m_trie.GetNumNodes(); } + inline int GetNumTrieNodes() const { return m_trie.GetNumNodes(); } #endif private: diff --git a/indexer/indexer_tests/categories_test.cpp b/indexer/indexer_tests/categories_test.cpp index 3e46c2834c..d7b340d305 100644 --- a/indexer/indexer_tests/categories_test.cpp +++ b/indexer/indexer_tests/categories_test.cpp @@ -16,7 +16,7 @@ using namespace indexer; -char const * g_testCategoriesTxt = +char const g_testCategoriesTxt[] = "amenity-bench\n" "en:1bench|sit down|to sit\n" "de:0bank|auf die strafbank schicken\n" @@ -94,7 +94,7 @@ UNIT_TEST(LoadCategories) { classificator::Load(); - CategoriesHolder h(make_unique(g_testCategoriesTxt, strlen(g_testCategoriesTxt))); + CategoriesHolder h(make_unique(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1)); size_t count = 0; Checker f(count); h.ForEachCategory(f); @@ -105,62 +105,110 @@ UNIT_TEST(CategoriesIndex_Smoke) { classificator::Load(); - CategoriesHolder catHolder( - make_unique(g_testCategoriesTxt, strlen(g_testCategoriesTxt))); - CategoriesIndex catIndex(catHolder); - + CategoriesHolder holder( + make_unique(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1)); + CategoriesIndex index(holder); + uint32_t type1 = classif().GetTypeByPath({"amenity", "bench"}); uint32_t type2 = classif().GetTypeByPath({"place", "village"}); if (type1 > type2) swap(type1, type2); int8_t lang1 = CategoriesHolder::MapLocaleToInteger("en"); int8_t lang2 = CategoriesHolder::MapLocaleToInteger("de"); - - auto testTypes = [&](string const & query, vector && expected) + + auto testTypes = [&](string const & query, vector const & expected) { vector result; - catIndex.GetAssociatedTypes(query, result); + index.GetAssociatedTypes(query, result); TEST_EQUAL(result, expected, (query)); }; - - catIndex.AddCategoryByTypeAndLang(type1, lang1); + + index.AddCategoryByTypeAndLang(type1, lang1); testTypes("bench", {type1}); + testTypes("BENCH", {type1}); testTypes("down", {type1}); testTypes("benck", {}); testTypes("strafbank", {}); - catIndex.AddCategoryByTypeAndLang(type1, lang2); + index.AddCategoryByTypeAndLang(type1, lang2); testTypes("strafbank", {type1}); - catIndex.AddCategoryByTypeAndLang(type2, lang1); + testTypes("ie strafbank sc", {type1}); + testTypes("rafb", {type1}); + index.AddCategoryByTypeAndLang(type2, lang1); testTypes("i", {type1, type2}); + + CategoriesIndex fullIndex(holder); + fullIndex.AddCategoryByTypeAllLangs(type1); + fullIndex.AddCategoryByTypeAllLangs(type2); + vector cats; + + // The letter 'a' matches "strafbank" and "village". + // One language is not enough. + fullIndex.GetCategories("a", cats); + + TEST_EQUAL(cats.size(), 2, ()); + + TEST_EQUAL(cats[0].m_synonyms.size(), 8, ()); + TEST_EQUAL(cats[0].m_synonyms[4].m_locale, CategoriesHolder::MapLocaleToInteger("de"), ()); + TEST_EQUAL(cats[0].m_synonyms[4].m_name, "auf die strafbank schicken", ()); + + TEST_EQUAL(cats[1].m_synonyms.size(), 3, ()); + TEST_EQUAL(cats[1].m_synonyms[0].m_locale, CategoriesHolder::MapLocaleToInteger("en"), ()); + TEST_EQUAL(cats[1].m_synonyms[0].m_name, "village", ()); } +UNIT_TEST(CategoriesIndex_MultipleTokens) +{ + char const kCategories[] = + "shop-bakery\n" + "en:shop of buns\n" + "\n" + "shop-butcher\n" + "en:shop of meat"; + + classificator::Load(); + CategoriesHolder holder(make_unique(kCategories, sizeof(kCategories) - 1)); + CategoriesIndex index(holder); + + index.AddAllCategoriesInAllLangs(); + auto testTypes = [&](string const & query, vector const & expected) + { + vector result; + index.GetAssociatedTypes(query, result); + TEST_EQUAL(result, expected, (query)); + }; + + uint32_t type1 = classif().GetTypeByPath({"shop", "bakery"}); + uint32_t type2 = classif().GetTypeByPath({"shop", "butcher"}); + if (type1 > type2) + swap(type1, type2); + + testTypes("shop", {type1, type2}); + testTypes("shop buns", {type1}); + testTypes("shop meat", {type2}); +} + +#ifdef DEBUG +// A check that this data structure is not too heavy. UNIT_TEST(CategoriesIndex_AllCategories) { classificator::Load(); - CategoriesIndex catIndex; + CategoriesIndex index; - catIndex.AddAllCategoriesAllLangs(); - vector types; - catIndex.GetAssociatedTypes("", types); - TEST_LESS(types.size(), 300, ()); -#ifdef DEBUG - TEST_LESS(catIndex.GetNumTrieNodes(), 400000, ()); -#endif + index.AddAllCategoriesInAllLangs(); + TEST_LESS(index.GetNumTrieNodes(), 250000, ()); } +#endif +#ifdef DEBUG +// A check that this data structure is not too heavy. UNIT_TEST(CategoriesIndex_AllCategoriesEnglishName) { classificator::Load(); - CategoriesIndex catIndex; + CategoriesIndex index; - catIndex.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en")); - vector types; - catIndex.GetAssociatedTypes("", types); - my::SortUnique(types); - TEST_LESS(types.size(), 300, ()); -#ifdef DEBUG - TEST_LESS(catIndex.GetNumTrieNodes(), 10000, ()); -#endif + index.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en")); + TEST_LESS(index.GetNumTrieNodes(), 6000, ()); } +#endif diff --git a/std/iterator.hpp b/std/iterator.hpp index d72f99eed1..998fc33f3e 100644 --- a/std/iterator.hpp +++ b/std/iterator.hpp @@ -13,6 +13,7 @@ using std::begin; using std::distance; using std::end; using std::insert_iterator; +using std::inserter; using std::istream_iterator; using std::iterator_traits; using std::next;