[geocoder] Index now uses integer document ids instead of pointers.

This commit is contained in:
Maxim Pimenov 2018-12-29 18:29:29 +03:00 committed by Tatiana Yan
parent d321b3c05b
commit 5bc2854421
7 changed files with 142 additions and 125 deletions

View file

@ -73,22 +73,6 @@ geocoder::Type NextType(geocoder::Type type)
return static_cast<geocoder::Type>(t + 1);
}
// Returns true if at least one entry of the last layer in |layers|
// reports itself as a parent of |e|. |layers| must not be empty.
bool HasParent(vector<geocoder::Geocoder::Layer> const & layers,
               geocoder::Hierarchy::Entry const & e)
{
  CHECK(!layers.empty(), ());

  // The check is made from the candidate parent's side: an entry
  // stores all of its ancestors in its address, while a node keeps
  // no information about its children.
  auto const & candidates = layers.back().m_entries;
  for (auto it = candidates.begin(); it != candidates.end(); ++it)
  {
    if ((*it)->IsParentTo(e))
      return true;
  }

  return false;
}
strings::UniString MakeHouseNumber(geocoder::Tokens const & tokens)
{
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
@ -285,8 +269,11 @@ void Geocoder::Go(Context & ctx, Type type) const
allTypes.push_back(t);
}
for (auto const * e : curLayer.m_entries)
ctx.AddResult(e->m_osmId, certainty, type, move(allTypes), ctx.AllTokensUsed());
// Reports every doc of the current layer as a result, identified by its osm id.
// NOTE(review): |allTypes| is wrapped in move() on every iteration of this loop.
// If AddResult really moves from that argument, every result after the first
// would be built from a moved-from vector -- confirm against AddResult's signature.
for (auto const & docId : curLayer.m_entries)
{
ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, move(allTypes),
ctx.AllTokensUsed());
}
ctx.GetLayers().emplace_back(move(curLayer));
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
@ -316,36 +303,43 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer
// let's stay on the safer side and set the house number bit.
ctx.SetHouseNumberBit();
for (auto const & se : layer.m_entries)
for (auto const & streetDocId : layer.m_entries)
{
auto const * buildings = m_index.GetBuildingsOnStreet(se->m_osmId);
if (buildings == nullptr)
continue;
for (auto const & be : *buildings)
{
m_index.ForEachBuildingOnStreet(streetDocId, [&](Index::DocId const & buildingDocId) {
auto const & bld = m_index.GetDoc(buildingDocId);
auto const bt = static_cast<size_t>(Type::Building);
auto const & realHN = MakeHouseNumber(be->m_address[bt]);
auto const & realHN = MakeHouseNumber(bld.m_address[bt]);
if (search::house_numbers::HouseNumbersMatch(realHN, subqueryHN, false /* queryIsPrefix */))
curLayer.m_entries.emplace_back(be);
}
curLayer.m_entries.emplace_back(buildingDocId);
});
}
}
void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const
{
auto const * entries = m_index.GetEntries(subquery);
if (!entries || entries->empty())
return;
m_index.ForEachDocId(subquery, [&](Index::DocId const & docId) {
auto const & d = m_index.GetDoc(docId);
if (d.m_type != type)
return;
for (auto const * e : *entries)
if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), d))
curLayer.m_entries.emplace_back(docId);
});
}
bool Geocoder::HasParent(vector<Geocoder::Layer> const & layers, Hierarchy::Entry const & e) const
{
CHECK(!layers.empty(), ());
auto const & layer = layers.back();
for (auto const & docId : layer.m_entries)
{
CHECK(e, ());
if (e->m_type != type)
continue;
if (ctx.GetLayers().empty() || HasParent(ctx.GetLayers(), *e))
curLayer.m_entries.emplace_back(e);
// Note that the relationship is somewhat inverted: every ancestor
// is stored in the address but the nodes have no information
// about their children.
if (m_hierarchy.IsParent(m_index.GetDoc(docId), e))
return true;
}
return false;
}
} // namespace geocoder

View file

@ -42,7 +42,7 @@ public:
struct Layer
{
Type m_type = Type::Count;
std::vector<Hierarchy::Entry const *> m_entries;
std::vector<Index::DocId> m_entries;
};
// This class is very similar to the one we use in search/.
@ -134,6 +134,10 @@ private:
void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const;
// Returns whether any of the paths through |layers| can be extended
// by appending |e|.
bool HasParent(std::vector<Geocoder::Layer> const & layers, Hierarchy::Entry const & e) const;
Hierarchy m_hierarchy;
Index m_index;

View file

@ -72,15 +72,15 @@ UNIT_TEST(Geocoder_Hierarchy)
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
auto entries = geocoder.GetIndex().GetEntries({("florencia")});
vector<Hierarchy::Entry> entries;
geocoder.GetIndex().ForEachDocId({("florencia")}, [&](Index::DocId const & docId) {
entries.emplace_back(geocoder.GetIndex().GetDoc(docId));
});
TEST(entries, ());
TEST_EQUAL(entries->size(), 1, ());
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"),
());
TEST_EQUAL((*entries)[0]->m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"),
());
TEST_EQUAL(entries.size(), 1, ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Country)], Split("cuba"), ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Region)], Split("ciego de avila"), ());
TEST_EQUAL(entries[0].m_address[static_cast<size_t>(Type::Subregion)], Split("florencia"), ());
}
UNIT_TEST(Geocoder_OnlyBuildings)

View file

@ -127,16 +127,6 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const
return true;
}
// Returns true if every non-empty address component of this entry
// is equal to the component of |e| at the same position.
// Components that are empty in this entry are not compared.
bool Hierarchy::Entry::IsParentTo(Hierarchy::Entry const & e) const
{
  size_t const numTypes = static_cast<size_t>(geocoder::Type::Count);
  for (size_t level = 0; level < numTypes; ++level)
  {
    auto const & own = m_address[level];
    if (own.empty())
      continue;

    if (own != e.m_address[level])
      return false;
  }

  return true;
}
// Hierarchy ---------------------------------------------------------------------------------------
Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
{
@ -216,4 +206,14 @@ Hierarchy::Entry const * Hierarchy::GetEntryForOsmId(base::GeoObjectId const & o
return &(*it);
}
// Returns true if every non-empty address component of |pe| is equal
// to the component of |e| at the same position. Components that are
// empty in |pe| place no restriction on |e|.
bool Hierarchy::IsParent(Hierarchy::Entry const & pe, Hierarchy::Entry const & e) const
{
  size_t const numTypes = static_cast<size_t>(geocoder::Type::Count);
  for (size_t level = 0; level < numTypes; ++level)
  {
    auto const & parentPart = pe.m_address[level];
    if (parentPart.empty())
      continue;

    if (parentPart != e.m_address[level])
      return false;
  }

  return true;
}
} // namespace geocoder

View file

@ -57,9 +57,6 @@ public:
bool DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr,
ParsingStats & stats);
// Checks whether this entry is a parent of |e|.
bool IsParentTo(Entry const & e) const;
bool operator<(Entry const & rhs) const { return m_osmId < rhs.m_osmId; }
base::GeoObjectId m_osmId = base::GeoObjectId(base::GeoObjectId::kInvalid);
@ -81,6 +78,9 @@ public:
Entry const * GetEntryForOsmId(base::GeoObjectId const & osmId) const;
// Checks whether |pe| is a parent of |e|.
bool IsParent(Entry const & pe, Entry const & e) const;
private:
std::vector<Entry> m_entries;
};

View file

@ -8,69 +8,58 @@
#include "base/logging.hpp"
#include "base/string_utils.hpp"
#include <cstddef>
using namespace std;
namespace
{
// Information will be logged for every |kLogBatch| entries.
// Information will be logged for every |kLogBatch| docs.
size_t const kLogBatch = 100000;
string MakeIndexKey(geocoder::Tokens const & tokens) { return strings::JoinStrings(tokens, " "); }
} // namespace
namespace geocoder
{
Index::Index(Hierarchy const & hierarchy) : m_entries(hierarchy.GetEntries())
Index::Index(Hierarchy const & hierarchy) : m_docs(hierarchy.GetEntries())
{
LOG(LINFO, ("Indexing entries..."));
LOG(LINFO, ("Indexing hierarchy entries..."));
AddEntries();
LOG(LINFO, ("Indexing houses..."));
AddHouses();
AddHouses(hierarchy);
}
vector<Index::EntryPtr> const * const Index::GetEntries(Tokens const & tokens) const
Index::Doc const & Index::GetDoc(DocId const id) const
{
auto const it = m_entriesByTokens.find(MakeIndexKey(tokens));
if (it == m_entriesByTokens.end())
return {};
return &it->second;
ASSERT_LESS(static_cast<size_t>(id), m_docs.size(), ());
return m_docs[static_cast<size_t>(id)];
}
vector<Index::EntryPtr> const * const Index::GetBuildingsOnStreet(
base::GeoObjectId const & osmId) const
string Index::MakeIndexKey(Tokens const & tokens) const
{
auto const it = m_buildingsOnStreet.find(osmId);
if (it == m_buildingsOnStreet.end())
return {};
return &it->second;
return strings::JoinStrings(tokens, " ");
}
void Index::AddEntries()
{
size_t numIndexed = 0;
for (auto const & e : m_entries)
for (DocId docId = 0; docId < static_cast<DocId>(m_docs.size()); ++docId)
{
// The entry is indexed only by its address.
auto const & doc = m_docs[static_cast<size_t>(docId)];
// The doc is indexed only by its address.
// todo(@m) Index it by name too.
if (e.m_type == Type::Count)
if (doc.m_type == Type::Count)
continue;
if (e.m_type == Type::Street)
if (doc.m_type == Type::Street)
{
AddStreet(e);
AddStreet(docId, doc);
}
else
{
size_t const t = static_cast<size_t>(e.m_type);
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
size_t const t = static_cast<size_t>(doc.m_type);
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
}
// Index every token but do not index prefixes.
// for (auto const & tok : entry.m_address[t])
// m_entriesByTokens[{tok}].emplace_back(entry);
++numIndexed;
if (numIndexed % kLogBatch == 0)
LOG(LINFO, ("Indexed", numIndexed, "entries"));
@ -80,45 +69,45 @@ void Index::AddEntries()
LOG(LINFO, ("Indexed", numIndexed, "entries"));
}
void Index::AddStreet(Hierarchy::Entry const & e)
void Index::AddStreet(DocId const & docId, Index::Doc const & doc)
{
CHECK_EQUAL(e.m_type, Type::Street, ());
size_t const t = static_cast<size_t>(e.m_type);
m_entriesByTokens[MakeIndexKey(e.m_address[t])].emplace_back(&e);
CHECK_EQUAL(doc.m_type, Type::Street, ());
size_t const t = static_cast<size_t>(doc.m_type);
m_docIdsByTokens[MakeIndexKey(doc.m_address[t])].emplace_back(docId);
for (size_t i = 0; i < e.m_address[t].size(); ++i)
for (size_t i = 0; i < doc.m_address[t].size(); ++i)
{
if (!search::IsStreetSynonym(strings::MakeUniString(e.m_address[t][i])))
if (!search::IsStreetSynonym(strings::MakeUniString(doc.m_address[t][i])))
continue;
auto addr = e.m_address[t];
auto addr = doc.m_address[t];
addr.erase(addr.begin() + i);
m_entriesByTokens[MakeIndexKey(addr)].emplace_back(&e);
m_docIdsByTokens[MakeIndexKey(addr)].emplace_back(docId);
}
}
void Index::AddHouses()
void Index::AddHouses(Hierarchy const & hierarchy)
{
size_t numIndexed = 0;
for (auto & be : m_entries)
for (DocId docId = 0; docId < static_cast<DocId>(hierarchy.GetEntries().size()); ++docId)
{
if (be.m_type != Type::Building)
auto const & buildingDoc = GetDoc(docId);
if (buildingDoc.m_type != Type::Building)
continue;
size_t const t = static_cast<size_t>(Type::Street);
auto const * streetCandidates = GetEntries(be.m_address[t]);
if (streetCandidates == nullptr)
continue;
ForEachDocId(buildingDoc.m_address[t], [&](DocId const & streetCandidate) {
auto const & streetDoc = GetDoc(streetCandidate);
if (hierarchy.IsParent(streetDoc, buildingDoc))
{
m_buildingsOnStreet[streetCandidate].emplace_back(docId);
for (auto & se : *streetCandidates)
{
if (se->IsParentTo(be))
m_buildingsOnStreet[se->m_osmId].emplace_back(&be);
}
++numIndexed;
if (numIndexed % kLogBatch == 0)
LOG(LINFO, ("Indexed", numIndexed, "houses"));
++numIndexed;
if (numIndexed % kLogBatch == 0)
LOG(LINFO, ("Indexed", numIndexed, "houses"));
}
});
}
if (numIndexed % kLogBatch != 0)

View file

@ -4,6 +4,7 @@
#include "base/geo_object_id.hpp"
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>
@ -13,36 +14,65 @@ namespace geocoder
class Index
{
public:
using EntryPtr = Hierarchy::Entry const *;
// Number of the entry in the list of all hierarchy entries
// that the index was constructed from.
using DocId = uint32_t;
using Doc = Hierarchy::Entry;
explicit Index(Hierarchy const & hierarchy);
// Returns a pointer to entries whose names exactly match |tokens|
// (the order matters) or nullptr if there are no such entries.
Doc const & GetDoc(DocId const id) const;
// Calls |fn| for DocIds of Docs whose names exactly match |tokens| (the order matters).
//
// todo This method (and the whole class, in fact) is in the
// prototype stage and may be too slow. Proper indexing should
// be implemented to perform this type of queries.
std::vector<EntryPtr> const * const GetEntries(Tokens const & tokens) const;
template <typename Fn>
void ForEachDocId(Tokens const & tokens, Fn && fn) const
{
auto const it = m_docIdsByTokens.find(MakeIndexKey(tokens));
if (it == m_docIdsByTokens.end())
return;
std::vector<EntryPtr> const * const GetBuildingsOnStreet(base::GeoObjectId const & osmId) const;
for (DocId const & docId : it->second)
fn(docId);
}
// Invokes |fn| with the DocId of every building stored in
// |m_buildingsOnStreet| under the key |streetDocId|.
// Does nothing when there is no record for that street.
template <typename Fn>
void ForEachBuildingOnStreet(DocId const & streetDocId, Fn && fn) const
{
  auto const found = m_buildingsOnStreet.find(streetDocId);
  if (found != m_buildingsOnStreet.end())
  {
    auto const & buildings = found->second;
    for (auto b = buildings.begin(); b != buildings.end(); ++b)
      fn(*b);
  }
}
private:
// Adds address information of entries to the index.
// Converts |tokens| to a single UTF-8 string that can be used
// as a key in the |m_docIdsByTokens| map.
std::string MakeIndexKey(Tokens const & tokens) const;
// Adds address information of |m_docs| to the index.
void AddEntries();
// Adds the street |e| to the index, with and without synonyms
// of the word "street".
void AddStreet(Hierarchy::Entry const & e);
// Adds the street |e| (which has the id of |docId|) to the index,
// with and without synonyms of the word "street".
void AddStreet(DocId const & docId, Doc const & e);
// Fills the |m_buildingsOnStreet| field.
void AddHouses();
void AddHouses(Hierarchy const & hierarchy);
std::vector<Hierarchy::Entry> const & m_entries;
std::vector<Doc> const & m_docs;
std::unordered_map<std::string, std::vector<EntryPtr>> m_entriesByTokens;
std::unordered_map<std::string, std::vector<DocId>> m_docIdsByTokens;
// Lists of houses grouped by the streets they belong to.
std::unordered_map<base::GeoObjectId, std::vector<EntryPtr>> m_buildingsOnStreet;
std::unordered_map<DocId, std::vector<DocId>> m_buildingsOnStreet;
};
} // namespace geocoder