forked from organicmaps/organicmaps
[geocoder] Index now uses integer document ids instead of pointers.
This commit is contained in:
parent
d321b3c05b
commit
5bc2854421
7 changed files with 142 additions and 125 deletions
|
@ -73,22 +73,6 @@ geocoder::Type NextType(geocoder::Type type)
|
|||
return static_cast<geocoder::Type>(t + 1);
|
||||
}
|
||||
|
||||
// Returns true if at least one entry of the last layer in |layers|
// is a parent of |e|. |layers| must not be empty.
bool HasParent(vector<geocoder::Geocoder::Layer> const & layers,
               geocoder::Hierarchy::Entry const & e)
{
  CHECK(!layers.empty(), ());
  auto const & candidates = layers.back().m_entries;
  // The relationship is stored "upside down": the address of an entry
  // lists every ancestor, while an ancestor holds no information about
  // its children. Hence each candidate is asked whether it is a parent
  // of |e|.
  for (auto it = candidates.begin(); it != candidates.end(); ++it)
  {
    if ((*it)->IsParentTo(e))
      return true;
  }
  return false;
}
|
||||
|
||||
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
|
||||
{
|
||||
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
|
||||
|
@ -285,8 +269,11 @@ void Geocoder::Go(Context & ctx, Type type) const
|
|||
allTypes.push_back(t);
|
||||
}
|
||||
|
||||
for (auto const * e : curLayer.m_entries)
|
||||
ctx.AddResult(e->m_osmId, certainty, type, move(allTypes), ctx.AllTokensUsed());
|
||||
for (auto const & docId : curLayer.m_entries)
|
||||
{
|
||||
ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, move(allTypes),
|
||||
ctx.AllTokensUsed());
|
||||
}
|
||||
|
||||
ctx.GetLayers().emplace_back(move(curLayer));
|
||||
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
|
||||
|
@ -316,36 +303,43 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer
|
|||
// let's stay on the safer side and set the house number bit.
|
||||
ctx.SetHouseNumberBit();
|
||||
|
||||
for (auto const & se : layer.m_entries)
|
||||
for (auto const & streetDocId : layer.m_entries)
|
||||
{
|
||||
auto const * buildings = m_index.GetBuildingsOnStreet(se->m_osmId);
|
||||
if (buildings == nullptr)
|
||||
continue;
|
||||
for (auto const & be : *buildings)
|
||||
{
|
||||
m_index.ForEachBuildingOnStreet(streetDocId, [&](Index::DocId const & buildingDocId) {
|
||||
auto const & bld = m_index.GetDoc(buildingDocId);
|
||||
auto const bt = static_cast<size_t>(Type::Building);
|
||||
auto const & realHN = MakeHouseNumber(be->m_address[bt]);
|
||||
auto const & realHN = MakeHouseNumber(bld.m_address[bt]);
|
||||
if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */))
|
||||
curLayer.m_entries.emplace_back(be);
|
||||
}
|
||||
curLayer.m_entries.emplace_back(buildingDocId);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
|
||||
Layer & curLayer) const
|
||||
{
|
||||
auto const * entries = m_index.GetEntries(subquery);
|
||||
if (!entries || entries->empty())
|
||||
return;
|
||||
m_index.ForEachDocId(subquery, [&](Index::DocId const & docId) {
|
||||
auto const & d = m_index.GetDoc(docId);
|
||||
if (d.m_type != type)
|
||||
return;
|
||||
|
||||
for (auto const * e : *entries)
|
||||
if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), d))
|
||||
curLayer.m_entries.emplace_back(docId);
|
||||
});
|
||||
}
|
||||
|
||||
bool Geocoder::HasParent(vector<Geocoder::Layer> const & layers, Hierarchy::Entry const & e) const
|
||||
{
|
||||
CHECK(!layers.empty(), ());
|
||||
auto const & layer = layers.back();
|
||||
for (auto const & docId : layer.m_entries)
|
||||
{
|
||||
CHECK(e, ());
|
||||
if (e->m_type != type)
|
||||
continue;
|
||||
|
||||
if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), *e))
|
||||
curLayer.m_entries.emplace_back(e);
|
||||
// Note that the relationship is somewhat inverted: every ancestor
|
||||
// is stored in the address but the nodes have no information
|
||||
// about their children.
|
||||
if (m_hierarchy.IsParent(m_index.GetDoc(docId), e))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -42,7 +42,7 @@ public:
|
|||
struct Layer
|
||||
{
|
||||
Type m_type = Type::Count;
|
||||
std::vector<Hierarchy::Entry const *> m_entries;
|
||||
std::vector<Index::DocId> m_entries;
|
||||
};
|
||||
|
||||
// This class is very similar to the one we use in search/.
|
||||
|
@ -134,6 +134,10 @@ private:
|
|||
void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
|
||||
Layer & curLayer) const;
|
||||
|
||||
// Returns whether any of the paths through |layers| can be extended
|
||||
// by appending |e|.
|
||||
bool HasParent(std::vector<Geocoder::Layer> const & layers, Hierarchy::Entry const & e) const;
|
||||
|
||||
Hierarchy m_hierarchy;
|
||||
|
||||
Index m_index;
|
||||
|
|
|
@ -72,15 +72,15 @@ UNIT_TEST(Geocoder_Hierarchy)
|
|||
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
|
||||
Geocoder geocoder(regionsJsonFile.GetFullPath());
|
||||
|
||||
auto entries = geocoder.GetIndex().GetEntries({("florencia")});
|
||||
vector<Hierarchy::Entry> entries;
|
||||
geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) {
|
||||
entries.emplace_back(geocoder.GetIndex().GetDoc(docId));
|
||||
});
|
||||
|
||||
TEST(entries, ());
|
||||
TEST_EQUAL(entries->size(), 1, ());
|
||||
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
|
||||
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"),
|
||||
());
|
||||
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"),
|
||||
());
|
||||
TEST_EQUAL(entries.size(), 1, ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"), ());
|
||||
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"), ());
|
||||
}
|
||||
|
||||
UNIT_TEST(Geocoder_OnlyBuildings)
|
||||
|
|
|
@ -127,16 +127,6 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
|
|||
return true;
|
||||
}
|
||||
|
||||
// Returns true if every non-empty address component of this entry is
// equal to the component of |e| at the same position, false otherwise.
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
{
  size_t const numTypes = static_cast<size_t>(geocoder::Type::Count);
  for (size_t level = 0; level != numTypes; ++level)
  {
    // Components this entry does not define place no constraint on |e|.
    if (m_address[level].empty())
      continue;

    if (m_address[level] != e.m_address[level])
      return false;
  }
  return true;
}
|
||||
|
||||
// Hierarchy ---------------------------------------------------------------------------------------
|
||||
Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
||||
{
|
||||
|
@ -216,4 +206,14 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o
|
|||
|
||||
return &(*it);
|
||||
}
|
||||
|
||||
// Returns true if every non-empty address component of |pe| is equal
// to the component of |e| at the same position, false otherwise.
bool Hierarchy::IsParent(Hierarchy::Entry const & pe, Hierarchy::Entry const & e) const
{
  size_t const numTypes = static_cast<size_t>(geocoder::Type::Count);
  for (size_t level = 0; level != numTypes; ++level)
  {
    // Components that |pe| does not define place no constraint on |e|.
    if (pe.m_address[level].empty())
      continue;

    if (pe.m_address[level] != e.m_address[level])
      return false;
  }
  return true;
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -57,9 +57,6 @@ public:
|
|||
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
|
||||
ParsingStats & stats);
|
||||
|
||||
// Checks whether this entry is a parent of |e|.
|
||||
bool IsParentTo(Entry const & e) const;
|
||||
|
||||
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
|
||||
|
||||
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
|
||||
|
@ -81,6 +78,9 @@ public:
|
|||
|
||||
Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const;
|
||||
|
||||
// Checks whether |pe| is a parent of |e|.
|
||||
bool IsParent(Entry const & pe, Entry const & e) const;
|
||||
|
||||
private:
|
||||
std::vector<Entry> m_entries;
|
||||
};
|
||||
|
|
|
@ -8,69 +8,58 @@
|
|||
#include "base/logging.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Information will be logged for every |kLogBatch| entries.
|
||||
// Information will be logged for every |kLogBatch| docs.
|
||||
size_t const kLogBatch = 100000;
|
||||
|
||||
// Joins |tokens| with single spaces into one string.
string MakeIndexKey(geocoder::Tokens const & tokens)
{
  return strings::JoinStrings(tokens, " ");
}
|
||||
} // namespace
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
Index::Index(Hierarchy const & hierarchy) : m_entries(hierarchy.GetEntries())
|
||||
Index::Index(Hierarchy const & hierarchy) : m_docs(hierarchy.GetEntries())
|
||||
{
|
||||
LOG(LINFO, ("Indexing entries..."));
|
||||
LOG(LINFO, ("Indexing hierarchy entries..."));
|
||||
AddEntries();
|
||||
LOG(LINFO, ("Indexing houses..."));
|
||||
AddHouses();
|
||||
AddHouses(hierarchy);
|
||||
}
|
||||
|
||||
vector<Index::EntryPtr> const * const Index::GetEntries(Tokens const & tokens) const
|
||||
Index::Doc const & Index::GetDoc(DocId const id) const
|
||||
{
|
||||
auto const it = m_entriesByTokens.find(MakeIndexKey(tokens));
|
||||
if (it == m_entriesByTokens.end())
|
||||
return {};
|
||||
|
||||
return &it->second;
|
||||
ASSERT_LESS(static_cast<size_t>(id), m_docs.size(), ());
|
||||
return m_docs[static_cast<size_t>(id)];
|
||||
}
|
||||
|
||||
vector<Index::EntryPtr> const * const Index::GetBuildingsOnStreet(
|
||||
base::GeoObjectId const & osmId) const
|
||||
string Index::MakeIndexKey(Tokens const & tokens) const
|
||||
{
|
||||
auto const it = m_buildingsOnStreet.find(osmId);
|
||||
if (it == m_buildingsOnStreet.end())
|
||||
return {};
|
||||
|
||||
return &it->second;
|
||||
return strings::JoinStrings(tokens, " ");
|
||||
}
|
||||
|
||||
void Index::AddEntries()
|
||||
{
|
||||
size_t numIndexed = 0;
|
||||
for (auto const & e : m_entries)
|
||||
for (DocId docId = 0; docId < static_cast<DocId>(m_docs.size()); ++docId)
|
||||
{
|
||||
// The entry is indexed only by its address.
|
||||
auto const & doc = m_docs[static_cast<size_t>(docId)];
|
||||
// The doc is indexed only by its address.
|
||||
// todo(@m) Index it by name too.
|
||||
if (e.m_type == Type::Count)
|
||||
if (doc.m_type == Type::Count)
|
||||
continue;
|
||||
|
||||
if (e.m_type == Type::Street)
|
||||
if (doc.m_type == Type::Street)
|
||||
{
|
||||
AddStreet(e);
|
||||
AddStreet(docId, doc);
|
||||
}
|
||||
else
|
||||
{
|
||||
size_t const t = static_cast<size_t>(e.m_type);
|
||||
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
|
||||
size_t const t = static_cast<size_t>(doc.m_type);
|
||||
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
|
||||
}
|
||||
|
||||
// Index every token but do not index prefixes.
|
||||
// for (auto const & tok : entry.m_address[t])
|
||||
// m_entriesByTokens[{tok}].emplace_back(entry);
|
||||
|
||||
++numIndexed;
|
||||
if (numIndexed % kLogBatch == 0)
|
||||
LOG(LINFO, ("Indexed", numIndexed, "entries"));
|
||||
|
@ -80,45 +69,45 @@ void Index::AddEntries()
|
|||
LOG(LINFO, ("Indexed", numIndexed, "entries"));
|
||||
}
|
||||
|
||||
void Index::AddStreet(Hierarchy::Entry const & e)
|
||||
void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
|
||||
{
|
||||
CHECK_EQUAL(e.m_type, Type::Street, ());
|
||||
size_t const t = static_cast<size_t>(e.m_type);
|
||||
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
|
||||
CHECK_EQUAL(doc.m_type, Type::Street, ());
|
||||
size_t const t = static_cast<size_t>(doc.m_type);
|
||||
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
|
||||
|
||||
for (size_t i = 0; i < e.m_address[t].size(); ++i)
|
||||
for (size_t i = 0; i < doc.m_address[t].size(); ++i)
|
||||
{
|
||||
if (!search::IsStreetSynonym(strings::MakeUniString(e.m_address[t][i])))
|
||||
if (!search::IsStreetSynonym(strings::MakeUniString(doc.m_address[t][i])))
|
||||
continue;
|
||||
auto addr = e.m_address[t];
|
||||
auto addr = doc.m_address[t];
|
||||
addr.erase(addr.begin() + i);
|
||||
m_entriesByTokens[MakeIndexKey(addr)].emplace_back(&e);
|
||||
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
|
||||
}
|
||||
}
|
||||
|
||||
void Index::AddHouses()
|
||||
void Index::AddHouses(Hierarchy const & hierarchy)
|
||||
{
|
||||
size_t numIndexed = 0;
|
||||
for (auto & be : m_entries)
|
||||
for (DocId docId = 0; docId < static_cast<DocId>(hierarchy.GetEntries().size()); ++docId)
|
||||
{
|
||||
if (be.m_type != Type::Building)
|
||||
auto const & buildingDoc = GetDoc(docId);
|
||||
|
||||
if (buildingDoc.m_type != Type::Building)
|
||||
continue;
|
||||
|
||||
size_t const t = static_cast<size_t>(Type::Street);
|
||||
|
||||
auto const * streetCandidates = GetEntries(be.m_address[t]);
|
||||
if (streetCandidates == nullptr)
|
||||
continue;
|
||||
ForEachDocId(buildingDoc.m_address[t], [&](DocId const & streetCandidate) {
|
||||
auto const & streetDoc = GetDoc(streetCandidate);
|
||||
if (hierarchy.IsParent(streetDoc, buildingDoc))
|
||||
{
|
||||
m_buildingsOnStreet[streetCandidate].emplace_back(docId);
|
||||
|
||||
for (auto & se : *streetCandidates)
|
||||
{
|
||||
if (se->IsParentTo(be))
|
||||
m_buildingsOnStreet[se->m_osmId].emplace_back(&be);
|
||||
}
|
||||
|
||||
++numIndexed;
|
||||
if (numIndexed % kLogBatch == 0)
|
||||
LOG(LINFO, ("Indexed", numIndexed, "houses"));
|
||||
++numIndexed;
|
||||
if (numIndexed % kLogBatch == 0)
|
||||
LOG(LINFO, ("Indexed", numIndexed, "houses"));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
if (numIndexed % kLogBatch != 0)
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
#include "base/geo_object_id.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
@ -13,36 +14,65 @@ namespace geocoder
|
|||
class Index
|
||||
{
|
||||
public:
|
||||
using EntryPtr = Hierarchy::Entry const *;
|
||||
// Number of the entry in the list of all hierarchy entries
|
||||
// that the index was constructed from.
|
||||
using DocId = uint32_t;
|
||||
|
||||
using Doc = Hierarchy::Entry;
|
||||
|
||||
explicit Index(Hierarchy const & hierarchy);
|
||||
|
||||
// Returns a pointer to entries whose names exactly match |tokens|
|
||||
// (the order matters) or nullptr if there are no such entries.
|
||||
Doc const & GetDoc(DocId const id) const;
|
||||
|
||||
// Calls |fn| for DocIds of Docs whose names exactly match |tokens| (the order matters).
|
||||
//
|
||||
// todo This method (and the whole class, in fact) is in the
|
||||
// prototype stage and may be too slow. Proper indexing should
|
||||
// be implemented to perform this type of queries.
|
||||
std::vector<EntryPtr> const * const GetEntries(Tokens const & tokens) const;
|
||||
template <typename Fn>
|
||||
void ForEachDocId(Tokens const & tokens, Fn && fn) const
|
||||
{
|
||||
auto const it = m_docIdsByTokens.find(MakeIndexKey(tokens));
|
||||
if (it == m_docIdsByTokens.end())
|
||||
return;
|
||||
|
||||
std::vector<EntryPtr> const * const GetBuildingsOnStreet(base::GeoObjectId const & osmId) const;
|
||||
for (DocId const & docId : it->second)
|
||||
fn(docId);
|
||||
}
|
||||
|
||||
// Calls |fn| for DocIds of buildings that are located on the
|
||||
// street whose DocId is |streetDocId|.
|
||||
template <typename Fn>
|
||||
void ForEachBuildingOnStreet(DocId const & streetDocId, Fn && fn) const
|
||||
{
|
||||
auto const it = m_buildingsOnStreet.find(streetDocId);
|
||||
if (it == m_buildingsOnStreet.end())
|
||||
return;
|
||||
|
||||
for (DocId const & docId : it->second)
|
||||
fn(docId);
|
||||
}
|
||||
|
||||
private:
|
||||
// Adds address information of entries to the index.
|
||||
// Converts |tokens| to a single UTF-8 string that can be used
|
||||
// as a key in the |m_docIdsByTokens| map.
|
||||
std::string MakeIndexKey(Tokens const & tokens) const;
|
||||
|
||||
// Adds address information of |m_docs| to the index.
|
||||
void AddEntries();
|
||||
|
||||
// Adds the street |e| to the index, with and without synonyms
|
||||
// of the word "street".
|
||||
void AddStreet(Hierarchy::Entry const & e);
|
||||
// Adds the street |e| (which has the id of |docId|) to the index,
|
||||
// with and without synonyms of the word "street".
|
||||
void AddStreet(DocId const & docId, Doc const & e);
|
||||
|
||||
// Fills the |m_buildingsOnStreet| field.
|
||||
void AddHouses();
|
||||
void AddHouses(Hierarchy const & hierarchy);
|
||||
|
||||
std::vector<Hierarchy::Entry> const & m_entries;
|
||||
std::vector<Doc> const & m_docs;
|
||||
|
||||
std::unordered_map<std::string, std::vector<EntryPtr>> m_entriesByTokens;
|
||||
std::unordered_map<std::string, std::vector<DocId>> m_docIdsByTokens;
|
||||
|
||||
// Lists of houses grouped by the streets they belong to.
|
||||
std::unordered_map<base::GeoObjectId, std::vector<EntryPtr>> m_buildingsOnStreet;
|
||||
std::unordered_map<DocId, std::vector<DocId>> m_buildingsOnStreet;
|
||||
};
|
||||
} // namespace geocoder
|
||||
|
|
Loading…
Add table
Reference in a new issue