// Patch overview (review notes; this text precedes the first "diff --git" header and belongs to no hunk):
//  - base/mem_trie.hpp: MemTrie gains a node counter (GetNumNodes), ForEachInSubtree() and a MoveTo()
//    lookup helper; ForEach() no longer sorts a node's values before visiting them.
//  - indexer/categories_holder.{hpp,cpp}: adds kNumLanguages (30), ForEachTypeAndCategory() and
//    ForEachNameByType(); MapLocaleToInteger() now loops up to kNumLanguages.
//  - indexer/categories_index.{hpp,cpp}: new CategoriesIndex class; it adds every substring of a
//    category name to a MemTrie, with the category's type as the stored value.
//  - indexer/indexer.pro, indexer/indexer_tests/categories_test.cpp: build entries and unit tests.
// NOTE(review): template parameter lists look stripped in this copy (e.g. "template class MemTrie",
// "vector & result") - confirm against the original patch before applying.
// NOTE(review): the categories_index.hpp hunk starts directly with #include; no include guard or
// #pragma once is visible - confirm.
diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index dd19256f0a..4953cecd02 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -7,39 +7,49 @@ namespace my { -/// This class is a simple in-memory trie which allows to add -/// key-value pairs and then traverse them in a sorted order. -template +// This class is a simple in-memory trie which allows adding +// key-value pairs and then traversing them in a sorted order. +template class MemTrie { public: MemTrie() = default; - /// Adds a key-value pair to the trie. - void Add(StringT const & key, ValueT const & value) + // Adds a key-value pair to the trie. + void Add(TString const & key, TValue const & value) { Node * cur = &m_root; for (auto const & c : key) - cur = cur->GetMove(c); + { + size_t numNewNodes; + cur = cur->GetMove(c, numNewNodes); + m_numNodes += numNewNodes; + } cur->AddValue(value); } - /// Traverses all key-value pairs in the trie in a sorted order. - /// - /// \param toDo A callable object that will be called on an each - /// key-value pair. + // Traverses all key-value pairs in the trie and calls |toDo| on each of them. 
template - void ForEach(ToDo const & toDo) + void ForEach(ToDo && toDo) { - StringT prefix; + TString prefix; ForEach(&m_root, prefix, toDo); } + template + void ForEachInSubtree(TString prefix, ToDo && toDo) + { + Node * nd = MoveTo(prefix); + if (nd) + ForEach(nd, prefix, toDo); + } + + size_t GetNumNodes() const { return m_numNodes; } + private: struct Node { - using CharT = typename StringT::value_type; - using MovesMap = map; + using TChar = typename TString::value_type; Node() = default; @@ -49,28 +59,45 @@ private: delete move.second; } - Node * GetMove(CharT const & c) + Node * GetMove(TChar const & c, size_t & numNewNodes) { + numNewNodes = 0; Node *& node = m_moves[c]; if (!node) + { node = new Node(); + ++numNewNodes; + } return node; } - void AddValue(const ValueT & value) { m_values.push_back(value); } + void AddValue(TValue const & value) { m_values.push_back(value); } - MovesMap m_moves; - vector m_values; + map m_moves; + vector m_values; DISALLOW_COPY_AND_MOVE(Node); }; + Node * MoveTo(TString const & key) + { + Node * cur = &m_root; + for (auto const & c : key) + { + auto const it = cur->m_moves.find(c); + if (it != cur->m_moves.end()) + cur = it->second; + else + return nullptr; + } + return cur; + } + template - void ForEach(Node * root, StringT & prefix, ToDo const & toDo) + void ForEach(Node * root, TString & prefix, ToDo && toDo) { if (!root->m_values.empty()) { - sort(root->m_values.begin(), root->m_values.end()); for (auto const & value : root->m_values) toDo(prefix, value); } @@ -84,6 +111,7 @@ private: } Node m_root; + size_t m_numNodes = 0; DISALLOW_COPY_AND_MOVE(MemTrie); }; // class MemTrie diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp index bbef05a57b..8b49341ab8 100644 --- a/indexer/categories_holder.cpp +++ b/indexer/categories_holder.cpp @@ -21,6 +21,8 @@ enum State } // unnamed namespace +// static +size_t const CategoriesHolder::kNumLanguages = 30; CategoriesHolder::CategoriesHolder(unique_ptr && 
reader) { @@ -275,7 +277,8 @@ int8_t CategoriesHolder::MapLocaleToInteger(string const & locale) {"he", 29 }, {"sw", 30 } }; - for (size_t i = 0; i < ARRAY_SIZE(mapping); ++i) + ASSERT_EQUAL(ARRAY_SIZE(mapping), kNumLanguages, ()); + for (size_t i = 0; i < kNumLanguages; ++i) if (locale.find(mapping[i].m_name) == 0) return mapping[i].m_code; diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index 97a56733f9..68b543e003 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -45,6 +45,8 @@ private: Name2CatContT m_name2type; public: + static size_t const kNumLanguages; + explicit CategoriesHolder(unique_ptr && reader); void LoadFromStream(istream & s); @@ -55,6 +57,13 @@ public: toDo(*i->second); } + template + void ForEachTypeAndCategory(ToDo && toDo) const + { + for (IteratorT i = m_type2cat.begin(); i != m_type2cat.end(); ++i) + toDo(i->first, *i->second); + } + template void ForEachName(ToDo && toDo) const { @@ -63,6 +72,16 @@ public: toDo(i->second->m_synonyms[j]); } + template + void ForEachNameByType(uint32_t type, ToDo && toDo) const + { + auto it = m_type2cat.find(type); + if (it == m_type2cat.end()) + return; + for (auto const & name : it->second->m_synonyms) + toDo(name); + } + template void ForEachTypeByName(int8_t locale, StringT const & name, ToDo && toDo) const { diff --git a/indexer/categories_index.cpp b/indexer/categories_index.cpp new file mode 100644 index 0000000000..344c2b3721 --- /dev/null +++ b/indexer/categories_index.cpp @@ -0,0 +1,84 @@ +#include "categories_index.hpp" + +#include "base/stl_helpers.hpp" + +#include "std/algorithm.hpp" +#include "std/set.hpp" + +namespace +{ +void AddAllSubstrings(my::MemTrie & trie, string const & s, uint32_t value) +{ + for (size_t i = 0; i < s.length(); ++i) + { + string t; + for (size_t j = i; j < s.length(); ++j) + { + t.append(1, s[j]); + trie.Add(t, value); + } + } +} +} // namespace + +namespace indexer +{ +void 
CategoriesIndex::AddCategoryByTypeAndLang(uint32_t type, int8_t lang) +{ + m_catHolder.ForEachNameByType(type, [&](CategoriesHolder::Category::Name const & name) + { + if (name.m_locale == lang) + AddAllSubstrings(m_trie, name.m_name, type); + }); +} + +void CategoriesIndex::AddCategoryByTypeAllLangs(uint32_t type) +{ + for (size_t i = 1; i <= CategoriesHolder::kNumLanguages; ++i) + AddCategoryByTypeAndLang(type, i); +} + +void CategoriesIndex::AddAllCategoriesInLang(int8_t lang) +{ + m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat) + { + for (auto const & name : cat.m_synonyms) + { + if (name.m_locale == lang) + AddAllSubstrings(m_trie, name.m_name, type); + } + }); +} + +void CategoriesIndex::AddAllCategoriesAllLangs() +{ + m_catHolder.ForEachTypeAndCategory([this](uint32_t type, Category const & cat) + { + for (auto const & name : cat.m_synonyms) + AddAllSubstrings(m_trie, name.m_name, type); + }); +} + +void CategoriesIndex::GetCategories(string const & query, vector & result) +{ + vector types; + GetAssociatedTypes(query, types); + my::SortUnique(types); + m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat) + { + if (binary_search(types.begin(), types.end(), type)) + result.push_back(cat); + }); +} + +void CategoriesIndex::GetAssociatedTypes(string const & query, vector & result) +{ + set types; + auto fn = [&](string const & s, uint32_t type) + { + types.insert(type); + }; + m_trie.ForEachInSubtree(query, fn); + result.insert(result.end(), types.begin(), types.end()); +} +} // namespace indexer diff --git a/indexer/categories_index.hpp b/indexer/categories_index.hpp new file mode 100644 index 0000000000..0429976096 --- /dev/null +++ b/indexer/categories_index.hpp @@ -0,0 +1,60 @@ +#include "categories_holder.hpp" + +#include "base/mem_trie.hpp" + +#include "std/string.hpp" +#include "std/vector.hpp" + +namespace indexer +{ +// This class is used to simplify searches of categories by +// synonyms to their 
names (in various languages). +// An example usage is helping a user who is trying to add +// a new feature with our editor. +// All category data is taken from data/categories.txt. +// All types returned are those from classificator. +class CategoriesIndex +{ +public: + using Category = CategoriesHolder::Category; + + CategoriesIndex() : m_catHolder(GetDefaultCategories()) {} + + CategoriesIndex(CategoriesHolder const & catHolder) : m_catHolder(catHolder) {} + + // Adds all categories that match |type|. Only synonyms + // in language |lang| are added. See indexer/categories_holder.cpp + // for language enumeration. + void AddCategoryByTypeAndLang(uint32_t type, int8_t lang); + + // Adds all categories that match |type|. All known synonyms + // are added. + void AddCategoryByTypeAllLangs(uint32_t type); + + // Adds all categories known to the categories holder. Only + // names in language |lang| are added. + void AddAllCategoriesInLang(int8_t lang); + + // Adds all categories known to the categories holder, with all their names. + void AddAllCategoriesAllLangs(); + + // Returns all categories that have |query| as a substring. Note + // that all synonyms for a category are contained in a returned + // value even if only one language was used when adding this + // category's name to index. + // Beware weird results when query is a malformed UTF-8 string. + void GetCategories(string const & query, vector & result); + + // Returns the types of all categories that have an added name containing |query| as a substring. + // Beware weird results when query is a malformed UTF-8 string. 
+ void GetAssociatedTypes(string const & query, vector & result); + +#ifdef DEBUG + int GetNumTrieNodes() { return m_trie.GetNumNodes(); } +#endif + +private: + CategoriesHolder const & m_catHolder; + my::MemTrie m_trie; +}; +} // namespace indexer diff --git a/indexer/indexer.pro b/indexer/indexer.pro index b2b9fe13f7..6e309cddd7 100644 --- a/indexer/indexer.pro +++ b/indexer/indexer.pro @@ -12,10 +12,11 @@ include($$ROOT_DIR/common.pri) SOURCES += \ categories_holder.cpp \ categories_holder_loader.cpp \ + categories_index.cpp \ classificator.cpp \ classificator_loader.cpp \ - cuisines.cpp \ coding_params.cpp \ + cuisines.cpp \ data_factory.cpp \ data_header.cpp \ drawing_rule_def.cpp \ @@ -57,6 +58,7 @@ SOURCES += \ HEADERS += \ categories_holder.hpp \ + categories_index.hpp \ cell_coverer.hpp \ cell_id.hpp \ classificator.hpp \ @@ -118,6 +120,7 @@ HEADERS += \ types_mapping.hpp \ unique_index.hpp \ + OTHER_FILES += drules_struct.proto SOURCES += drules_struct.pb.cc diff --git a/indexer/indexer_tests/categories_test.cpp b/indexer/indexer_tests/categories_test.cpp index 62ca8df964..3e46c2834c 100644 --- a/indexer/indexer_tests/categories_test.cpp +++ b/indexer/indexer_tests/categories_test.cpp @@ -1,25 +1,32 @@ #include "testing/testing.hpp" #include "indexer/categories_holder.hpp" +#include "indexer/categories_index.hpp" #include "indexer/classificator.hpp" #include "indexer/classificator_loader.hpp" #include "coding/multilang_utf8_string.hpp" #include "coding/reader.hpp" +#include "std/algorithm.hpp" #include "std/sstream.hpp" +#include "std/vector.hpp" +#include "base/stl_helpers.hpp" -char const * TEST_STRING = "amenity-bench\n" - "en:1bench|sit down|to sit\n" - "de:0bank|auf die strafbank schicken\n" - "zh-Hans:长凳\n" - "zh-Hant:長板凳\n" - "da:bænk\n" - "\n" - "place-village|place-hamlet\n" - "en:village\n" - "de:2dorf|4weiler"; +using namespace indexer; + +char const * g_testCategoriesTxt = + "amenity-bench\n" + "en:1bench|sit down|to sit\n" + "de:0bank|auf die 
strafbank schicken\n" + "zh-Hans:长凳\n" + "zh-Hant:長板凳\n" + "da:bænk\n" + "\n" + "place-village|place-hamlet\n" + "en:village\n" + "de:2dorf|4weiler"; struct Checker { @@ -87,9 +94,73 @@ UNIT_TEST(LoadCategories) { classificator::Load(); - CategoriesHolder h(make_unique(TEST_STRING, strlen(TEST_STRING))); + CategoriesHolder h(make_unique(g_testCategoriesTxt, strlen(g_testCategoriesTxt))); size_t count = 0; Checker f(count); h.ForEachCategory(f); TEST_EQUAL(count, 3, ()); } + +UNIT_TEST(CategoriesIndex_Smoke) +{ + classificator::Load(); + + CategoriesHolder catHolder( + make_unique(g_testCategoriesTxt, strlen(g_testCategoriesTxt))); + CategoriesIndex catIndex(catHolder); + + uint32_t type1 = classif().GetTypeByPath({"amenity", "bench"}); + uint32_t type2 = classif().GetTypeByPath({"place", "village"}); + if (type1 > type2) + swap(type1, type2); + int8_t lang1 = CategoriesHolder::MapLocaleToInteger("en"); + int8_t lang2 = CategoriesHolder::MapLocaleToInteger("de"); + + auto testTypes = [&](string const & query, vector && expected) + { + vector result; + catIndex.GetAssociatedTypes(query, result); + TEST_EQUAL(result, expected, (query)); + }; + + catIndex.AddCategoryByTypeAndLang(type1, lang1); + testTypes("bench", {type1}); + testTypes("down", {type1}); + testTypes("benck", {}); + testTypes("strafbank", {}); + catIndex.AddCategoryByTypeAndLang(type1, lang2); + testTypes("strafbank", {type1}); + catIndex.AddCategoryByTypeAndLang(type2, lang1); + testTypes("i", {type1, type2}); +} + +UNIT_TEST(CategoriesIndex_AllCategories) +{ + classificator::Load(); + + CategoriesIndex catIndex; + + catIndex.AddAllCategoriesAllLangs(); + vector types; + catIndex.GetAssociatedTypes("", types); + TEST_LESS(types.size(), 300, ()); +#ifdef DEBUG + TEST_LESS(catIndex.GetNumTrieNodes(), 400000, ()); +#endif +} + +UNIT_TEST(CategoriesIndex_AllCategoriesEnglishName) +{ + classificator::Load(); + + CategoriesIndex catIndex; + + 
catIndex.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en")); + vector types; + catIndex.GetAssociatedTypes("", types); + my::SortUnique(types); + TEST_LESS(types.size(), 300, ()); +#ifdef DEBUG + TEST_LESS(catIndex.GetNumTrieNodes(), 10000, ()); +#endif +}