forked from organicmaps/organicmaps
Review fixes.
This commit is contained in:
parent
360bafbc29
commit
9243f3130a
7 changed files with 161 additions and 67 deletions
|
@ -33,15 +33,15 @@ public:
|
|||
void ForEach(ToDo && toDo)
|
||||
{
|
||||
TString prefix;
|
||||
ForEach(&m_root, prefix, toDo);
|
||||
ForEach(&m_root, prefix, forward<ToDo>(toDo));
|
||||
}
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEachInSubtree(TString prefix, ToDo && toDo)
|
||||
void ForEachInSubtree(TString prefix, ToDo && toDo) const
|
||||
{
|
||||
Node * nd = MoveTo(prefix);
|
||||
if (nd)
|
||||
ForEach(nd, prefix, toDo);
|
||||
Node const * node = MoveTo(prefix);
|
||||
if (node)
|
||||
ForEach(node, prefix, forward<ToDo>(toDo));
|
||||
}
|
||||
|
||||
size_t GetNumNodes() const { return m_numNodes; }
|
||||
|
@ -79,22 +79,21 @@ private:
|
|||
DISALLOW_COPY_AND_MOVE(Node);
|
||||
};
|
||||
|
||||
Node * MoveTo(TString const & key)
|
||||
Node const * MoveTo(TString const & key) const
|
||||
{
|
||||
Node * cur = &m_root;
|
||||
Node const * cur = &m_root;
|
||||
for (auto const & c : key)
|
||||
{
|
||||
auto const it = cur->m_moves.find(c);
|
||||
if (it != cur->m_moves.end())
|
||||
cur = it->second;
|
||||
else
|
||||
if (it == cur->m_moves.end())
|
||||
return nullptr;
|
||||
cur = it->second;
|
||||
}
|
||||
return cur;
|
||||
}
|
||||
|
||||
template <typename ToDo>
|
||||
void ForEach(Node * root, TString & prefix, ToDo && toDo)
|
||||
void ForEach(Node const * root, TString & prefix, ToDo && toDo) const
|
||||
{
|
||||
if (!root->m_values.empty())
|
||||
{
|
||||
|
|
|
@ -277,10 +277,12 @@ int8_t CategoriesHolder::MapLocaleToInteger(string const & locale)
|
|||
{"he", 29 },
|
||||
{"sw", 30 }
|
||||
};
|
||||
ASSERT_EQUAL(ARRAY_SIZE(mapping), kNumLanguages, ());
|
||||
static_assert(ARRAY_SIZE(mapping) == kNumLanguages, "");
|
||||
for (size_t i = 0; i < kNumLanguages; ++i)
|
||||
{
|
||||
if (locale.find(mapping[i].m_name) == 0)
|
||||
return mapping[i].m_code;
|
||||
}
|
||||
|
||||
// Special cases for different Chinese variations
|
||||
if (locale.find("zh") == 0)
|
||||
|
|
|
@ -60,8 +60,8 @@ public:
|
|||
template <class ToDo>
|
||||
void ForEachTypeAndCategory(ToDo && toDo) const
|
||||
{
|
||||
for (IteratorT i = m_type2cat.begin(); i != m_type2cat.end(); ++i)
|
||||
toDo(i->first, *i->second);
|
||||
for (auto const it : m_type2cat)
|
||||
toDo(it.first, *it.second);
|
||||
}
|
||||
|
||||
template <class ToDo>
|
||||
|
|
|
@ -1,34 +1,60 @@
|
|||
#include "categories_index.hpp"
|
||||
#include "search_delimiters.hpp"
|
||||
#include "search_string_utils.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/stl_add.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/set.hpp"
|
||||
|
||||
namespace
|
||||
{
|
||||
void AddAllSubstrings(my::MemTrie<string, uint32_t> & trie, string const & s, uint32_t value)
|
||||
void AddAllNonemptySubstrings(my::MemTrie<string, uint32_t> & trie, string const & s,
|
||||
uint32_t value)
|
||||
{
|
||||
ASSERT(!s.empty(), ());
|
||||
for (size_t i = 0; i < s.length(); ++i)
|
||||
{
|
||||
string t;
|
||||
for (size_t j = i; j < s.length(); ++j)
|
||||
{
|
||||
t.append(1, s[j]);
|
||||
t.push_back(s[j]);
|
||||
trie.Add(t, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename TF>
|
||||
void ForEachToken(string const & s, TF && fn)
|
||||
{
|
||||
vector<strings::UniString> tokens;
|
||||
SplitUniString(search::NormalizeAndSimplifyString(s), MakeBackInsertFunctor(tokens), search::Delimiters());
|
||||
for (auto const & token : tokens)
|
||||
fn(strings::ToUtf8(token));
|
||||
}
|
||||
|
||||
void TokenizeAndAddAllSubstrings(my::MemTrie<string, uint32_t> & trie, string const & s,
|
||||
uint32_t value)
|
||||
{
|
||||
auto fn = [&](string const & token)
|
||||
{
|
||||
AddAllNonemptySubstrings(trie, token, value);
|
||||
};
|
||||
ForEachToken(s, fn);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
namespace indexer
|
||||
{
|
||||
void CategoriesIndex::AddCategoryByTypeAndLang(uint32_t type, int8_t lang)
|
||||
{
|
||||
m_catHolder.ForEachNameByType(type, [&](CategoriesHolder::Category::Name const & name)
|
||||
m_catHolder.ForEachNameByType(type, [&](TCategory::Name const & name)
|
||||
{
|
||||
if (name.m_locale == lang)
|
||||
AddAllSubstrings(m_trie, name.m_name, type);
|
||||
TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -40,45 +66,62 @@ void CategoriesIndex::AddCategoryByTypeAllLangs(uint32_t type)
|
|||
|
||||
void CategoriesIndex::AddAllCategoriesInLang(int8_t lang)
|
||||
{
|
||||
m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat)
|
||||
m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat)
|
||||
{
|
||||
for (auto const & name : cat.m_synonyms)
|
||||
{
|
||||
if (name.m_locale == lang)
|
||||
AddAllSubstrings(m_trie, name.m_name, type);
|
||||
TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
void CategoriesIndex::AddAllCategoriesAllLangs()
|
||||
void CategoriesIndex::AddAllCategoriesInAllLangs()
|
||||
{
|
||||
m_catHolder.ForEachTypeAndCategory([this](uint32_t type, Category const & cat)
|
||||
m_catHolder.ForEachTypeAndCategory([this](uint32_t type, TCategory const & cat)
|
||||
{
|
||||
for (auto const & name : cat.m_synonyms)
|
||||
AddAllSubstrings(m_trie, name.m_name, type);
|
||||
TokenizeAndAddAllSubstrings(m_trie, name.m_name, type);
|
||||
});
|
||||
}
|
||||
|
||||
void CategoriesIndex::GetCategories(string const & query, vector<Category> & result)
|
||||
void CategoriesIndex::GetCategories(string const & query, vector<TCategory> & result) const
|
||||
{
|
||||
vector<uint32_t> types;
|
||||
GetAssociatedTypes(query, types);
|
||||
my::SortUnique(types);
|
||||
m_catHolder.ForEachTypeAndCategory([&](uint32_t type, Category const & cat)
|
||||
m_catHolder.ForEachTypeAndCategory([&](uint32_t type, TCategory const & cat)
|
||||
{
|
||||
if (binary_search(types.begin(), types.end(), type))
|
||||
result.push_back(cat);
|
||||
});
|
||||
}
|
||||
|
||||
void CategoriesIndex::GetAssociatedTypes(string const & query, vector<uint32_t> & result)
|
||||
void CategoriesIndex::GetAssociatedTypes(string const & query, vector<uint32_t> & result) const
|
||||
{
|
||||
set<uint32_t> types;
|
||||
auto fn = [&](string const & s, uint32_t type)
|
||||
bool first = true;
|
||||
set<uint32_t> intersection;
|
||||
ForEachToken(query, [&](string const & token)
|
||||
{
|
||||
types.insert(type);
|
||||
};
|
||||
m_trie.ForEachInSubtree(query, fn);
|
||||
result.insert(result.end(), types.begin(), types.end());
|
||||
set<uint32_t> types;
|
||||
auto fn = [&](string const &, uint32_t type)
|
||||
{
|
||||
types.insert(type);
|
||||
};
|
||||
m_trie.ForEachInSubtree(token, fn);
|
||||
if (first)
|
||||
{
|
||||
intersection.swap(types);
|
||||
}
|
||||
else
|
||||
{
|
||||
set<uint32_t> tmp;
|
||||
set_intersection(intersection.begin(),intersection.end(),types.begin(),types.end(),inserter(tmp,tmp.begin()));
|
||||
intersection.swap(tmp);
|
||||
}
|
||||
first = false;
|
||||
});
|
||||
|
||||
result.insert(result.end(), intersection.begin(), intersection.end());
|
||||
}
|
||||
} // namespace indexer
|
||||
|
|
|
@ -16,7 +16,7 @@ namespace indexer
|
|||
class CategoriesIndex
|
||||
{
|
||||
public:
|
||||
using Category = CategoriesHolder::Category;
|
||||
using TCategory = CategoriesHolder::Category;
|
||||
|
||||
CategoriesIndex() : m_catHolder(GetDefaultCategories()) {}
|
||||
|
||||
|
@ -36,21 +36,22 @@ public:
|
|||
void AddAllCategoriesInLang(int8_t lang);
|
||||
|
||||
// Adds all categories from data/classificator.txt.
|
||||
void AddAllCategoriesAllLangs();
|
||||
void AddAllCategoriesInAllLangs();
|
||||
|
||||
// Returns all categories that have |query| as a substring. Note
|
||||
// that all synonyms for a category are contained in a returned
|
||||
// value even if only one language was used when adding this
|
||||
// category's name to the index.
|
||||
// Beware weird results when query is a malformed UTF-8 string.
|
||||
void GetCategories(string const & query, vector<Category> & result);
|
||||
void GetCategories(string const & query, vector<TCategory> & result) const;
|
||||
|
||||
// Returns all types that match categories that have |query| as a substring.
|
||||
// Beware weird results when query is a malformed UTF-8 string.
|
||||
void GetAssociatedTypes(string const & query, vector<uint32_t> & result);
|
||||
// Note: no types are returned if the query is empty.
|
||||
void GetAssociatedTypes(string const & query, vector<uint32_t> & result) const;
|
||||
|
||||
#ifdef DEBUG
|
||||
int GetNumTrieNodes() { return m_trie.GetNumNodes(); }
|
||||
inline int GetNumTrieNodes() const { return m_trie.GetNumNodes(); }
|
||||
#endif
|
||||
|
||||
private:
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
using namespace indexer;
|
||||
|
||||
char const * g_testCategoriesTxt =
|
||||
char const g_testCategoriesTxt[] =
|
||||
"amenity-bench\n"
|
||||
"en:1bench|sit down|to sit\n"
|
||||
"de:0bank|auf die strafbank schicken\n"
|
||||
|
@ -94,7 +94,7 @@ UNIT_TEST(LoadCategories)
|
|||
{
|
||||
classificator::Load();
|
||||
|
||||
CategoriesHolder h(make_unique<MemReader>(g_testCategoriesTxt, strlen(g_testCategoriesTxt)));
|
||||
CategoriesHolder h(make_unique<MemReader>(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1));
|
||||
size_t count = 0;
|
||||
Checker f(count);
|
||||
h.ForEachCategory(f);
|
||||
|
@ -105,62 +105,110 @@ UNIT_TEST(CategoriesIndex_Smoke)
|
|||
{
|
||||
classificator::Load();
|
||||
|
||||
CategoriesHolder catHolder(
|
||||
make_unique<MemReader>(g_testCategoriesTxt, strlen(g_testCategoriesTxt)));
|
||||
CategoriesIndex catIndex(catHolder);
|
||||
|
||||
CategoriesHolder holder(
|
||||
make_unique<MemReader>(g_testCategoriesTxt, sizeof(g_testCategoriesTxt) - 1));
|
||||
CategoriesIndex index(holder);
|
||||
|
||||
uint32_t type1 = classif().GetTypeByPath({"amenity", "bench"});
|
||||
uint32_t type2 = classif().GetTypeByPath({"place", "village"});
|
||||
if (type1 > type2)
|
||||
swap(type1, type2);
|
||||
int8_t lang1 = CategoriesHolder::MapLocaleToInteger("en");
|
||||
int8_t lang2 = CategoriesHolder::MapLocaleToInteger("de");
|
||||
|
||||
auto testTypes = [&](string const & query, vector<uint32_t> && expected)
|
||||
|
||||
auto testTypes = [&](string const & query, vector<uint32_t> const & expected)
|
||||
{
|
||||
vector<uint32_t> result;
|
||||
catIndex.GetAssociatedTypes(query, result);
|
||||
index.GetAssociatedTypes(query, result);
|
||||
TEST_EQUAL(result, expected, (query));
|
||||
};
|
||||
|
||||
catIndex.AddCategoryByTypeAndLang(type1, lang1);
|
||||
|
||||
index.AddCategoryByTypeAndLang(type1, lang1);
|
||||
testTypes("bench", {type1});
|
||||
testTypes("BENCH", {type1});
|
||||
testTypes("down", {type1});
|
||||
testTypes("benck", {});
|
||||
testTypes("strafbank", {});
|
||||
catIndex.AddCategoryByTypeAndLang(type1, lang2);
|
||||
index.AddCategoryByTypeAndLang(type1, lang2);
|
||||
testTypes("strafbank", {type1});
|
||||
catIndex.AddCategoryByTypeAndLang(type2, lang1);
|
||||
testTypes("ie strafbank sc", {type1});
|
||||
testTypes("rafb", {type1});
|
||||
index.AddCategoryByTypeAndLang(type2, lang1);
|
||||
testTypes("i", {type1, type2});
|
||||
|
||||
CategoriesIndex fullIndex(holder);
|
||||
fullIndex.AddCategoryByTypeAllLangs(type1);
|
||||
fullIndex.AddCategoryByTypeAllLangs(type2);
|
||||
vector<CategoriesHolder::Category> cats;
|
||||
|
||||
// The letter 'a' matches "strafbank" and "village".
|
||||
// One language is not enough.
|
||||
fullIndex.GetCategories("a", cats);
|
||||
|
||||
TEST_EQUAL(cats.size(), 2, ());
|
||||
|
||||
TEST_EQUAL(cats[0].m_synonyms.size(), 8, ());
|
||||
TEST_EQUAL(cats[0].m_synonyms[4].m_locale, CategoriesHolder::MapLocaleToInteger("de"), ());
|
||||
TEST_EQUAL(cats[0].m_synonyms[4].m_name, "auf die strafbank schicken", ());
|
||||
|
||||
TEST_EQUAL(cats[1].m_synonyms.size(), 3, ());
|
||||
TEST_EQUAL(cats[1].m_synonyms[0].m_locale, CategoriesHolder::MapLocaleToInteger("en"), ());
|
||||
TEST_EQUAL(cats[1].m_synonyms[0].m_name, "village", ());
|
||||
}
|
||||
|
||||
UNIT_TEST(CategoriesIndex_MultipleTokens)
|
||||
{
|
||||
char const kCategories[] =
|
||||
"shop-bakery\n"
|
||||
"en:shop of buns\n"
|
||||
"\n"
|
||||
"shop-butcher\n"
|
||||
"en:shop of meat";
|
||||
|
||||
classificator::Load();
|
||||
CategoriesHolder holder(make_unique<MemReader>(kCategories, sizeof(kCategories) - 1));
|
||||
CategoriesIndex index(holder);
|
||||
|
||||
index.AddAllCategoriesInAllLangs();
|
||||
auto testTypes = [&](string const & query, vector<uint32_t> const & expected)
|
||||
{
|
||||
vector<uint32_t> result;
|
||||
index.GetAssociatedTypes(query, result);
|
||||
TEST_EQUAL(result, expected, (query));
|
||||
};
|
||||
|
||||
uint32_t type1 = classif().GetTypeByPath({"shop", "bakery"});
|
||||
uint32_t type2 = classif().GetTypeByPath({"shop", "butcher"});
|
||||
if (type1 > type2)
|
||||
swap(type1, type2);
|
||||
|
||||
testTypes("shop", {type1, type2});
|
||||
testTypes("shop buns", {type1});
|
||||
testTypes("shop meat", {type2});
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
// A check that this data structure is not too heavy.
|
||||
UNIT_TEST(CategoriesIndex_AllCategories)
|
||||
{
|
||||
classificator::Load();
|
||||
|
||||
CategoriesIndex catIndex;
|
||||
CategoriesIndex index;
|
||||
|
||||
catIndex.AddAllCategoriesAllLangs();
|
||||
vector<uint32_t> types;
|
||||
catIndex.GetAssociatedTypes("", types);
|
||||
TEST_LESS(types.size(), 300, ());
|
||||
#ifdef DEBUG
|
||||
TEST_LESS(catIndex.GetNumTrieNodes(), 400000, ());
|
||||
#endif
|
||||
index.AddAllCategoriesInAllLangs();
|
||||
TEST_LESS(index.GetNumTrieNodes(), 250000, ());
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
// A check that this data structure is not too heavy.
|
||||
UNIT_TEST(CategoriesIndex_AllCategoriesEnglishName)
|
||||
{
|
||||
classificator::Load();
|
||||
|
||||
CategoriesIndex catIndex;
|
||||
CategoriesIndex index;
|
||||
|
||||
catIndex.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en"));
|
||||
vector<uint32_t> types;
|
||||
catIndex.GetAssociatedTypes("", types);
|
||||
my::SortUnique(types);
|
||||
TEST_LESS(types.size(), 300, ());
|
||||
#ifdef DEBUG
|
||||
TEST_LESS(catIndex.GetNumTrieNodes(), 10000, ());
|
||||
#endif
|
||||
index.AddAllCategoriesInLang(CategoriesHolder::MapLocaleToInteger("en"));
|
||||
TEST_LESS(index.GetNumTrieNodes(), 6000, ());
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -13,6 +13,7 @@ using std::begin;
|
|||
using std::distance;
|
||||
using std::end;
|
||||
using std::insert_iterator;
|
||||
using std::inserter;
|
||||
using std::istream_iterator;
|
||||
using std::iterator_traits;
|
||||
using std::next;
|
||||
|
|
Loading…
Add table
Reference in a new issue