forked from organicmaps/organicmaps
[geocoder] Simple hierarchy-assisted backtracking.
This is the baseline version of the hierarchy-based geocoder.
This commit is contained in:
parent
579b08ea3a
commit
58ccd704e9
5 changed files with 273 additions and 23 deletions
|
@ -3,31 +3,208 @@
|
|||
#include "indexer/search_string_utils.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/osm_id.hpp"
|
||||
#include "base/logging.hpp"
|
||||
#include "base/scope_guard.hpp"
|
||||
#include "base/timer.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <utility>
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace
|
||||
{
|
||||
// todo(@m) This is taken from search/geocoder.hpp. Refactor.
|
||||
struct ScopedMarkTokens
|
||||
{
|
||||
using Type = geocoder::Hierarchy::EntryType;
|
||||
|
||||
// The range is [l, r).
|
||||
ScopedMarkTokens(geocoder::Geocoder::Context & context, Type const & type, size_t l, size_t r)
|
||||
: m_context(context), m_type(type), m_l(l), m_r(r)
|
||||
{
|
||||
ASSERT_LESS_OR_EQUAL(l, r, ());
|
||||
ASSERT_LESS_OR_EQUAL(r, context.GetNumTokens(), ());
|
||||
|
||||
for (size_t i = m_l; i < m_r; ++i)
|
||||
m_context.MarkToken(i, m_type);
|
||||
}
|
||||
|
||||
~ScopedMarkTokens()
|
||||
{
|
||||
for (size_t i = m_l; i < m_r; ++i)
|
||||
m_context.MarkToken(i, Type::Count);
|
||||
}
|
||||
|
||||
geocoder::Geocoder::Context & m_context;
|
||||
Type const m_type;
|
||||
size_t m_l;
|
||||
size_t m_r;
|
||||
};
|
||||
|
||||
// Returns the enumerator whose numeric value follows that of |type|.
// |type| must not be EntryType::Count.
geocoder::Hierarchy::EntryType NextType(geocoder::Hierarchy::EntryType const & type)
{
  CHECK_NOT_EQUAL(type, geocoder::Hierarchy::EntryType::Count, ());

  size_t const successor = static_cast<size_t>(type) + 1;
  return static_cast<geocoder::Hierarchy::EntryType>(successor);
}
|
||||
|
||||
// Returns true if at least one entry stored in |layers| is an ancestor of |e|.
bool FindParent(vector<geocoder::Geocoder::Layer> const & layers,
                geocoder::Hierarchy::Entry const & e)
{
  // The relationship is looked up "bottom-up": an entry keeps the names of
  // all its ancestors in its address (indexed by the ancestor's type), while
  // an ancestor knows nothing about its children.
  auto const isAncestor = [&e](geocoder::Hierarchy::Entry const * candidate) {
    auto const slot = static_cast<size_t>(candidate->m_type);
    return e.m_address[slot] == candidate->m_nameTokens;
  };

  return std::any_of(layers.begin(), layers.end(),
                     [&isAncestor](geocoder::Geocoder::Layer const & layer) {
                       return std::any_of(layer.m_entries.begin(), layer.m_entries.end(),
                                          isAncestor);
                     });
}
|
||||
} // namespace
|
||||
|
||||
namespace geocoder
|
||||
{
|
||||
// Geocoder::Context -------------------------------------------------------------------------------
|
||||
// Tokenizes |query| and starts with every token unused (type Count).
Geocoder::Context::Context(string const & query)
{
  search::NormalizeAndTokenizeString(query, m_tokens);

  m_numUsedTokens = 0;
  m_tokenTypes.assign(m_tokens.size(), Hierarchy::EntryType::Count);
}
|
||||
|
||||
// Gives mutable access to the per-token types.
vector<Hierarchy::EntryType> & Geocoder::Context::GetTokenTypes()
{
  return m_tokenTypes;
}
|
||||
|
||||
// Number of tokens the query was split into.
size_t Geocoder::Context::GetNumTokens() const
{
  return m_tokens.size();
}
|
||||
|
||||
size_t Geocoder::Context::GetNumUsedTokens() const
|
||||
{
|
||||
ASSERT_LESS_OR_EQUAL(m_numUsedTokens, m_tokens.size(), ());
|
||||
return m_numUsedTokens;
|
||||
}
|
||||
|
||||
// Returns the token with index |id|; |id| must be a valid token index.
strings::UniString const & Geocoder::Context::GetToken(size_t id) const
{
  ASSERT_LESS(id, m_tokens.size(), ());

  auto const & token = m_tokens[id];
  return token;
}
|
||||
|
||||
void Geocoder::Context::MarkToken(size_t id, Hierarchy::EntryType const & type)
|
||||
{
|
||||
ASSERT_LESS(id, m_tokens.size(), ());
|
||||
bool wasUsed = m_tokenTypes[id] != Hierarchy::EntryType::Count;
|
||||
m_tokenTypes[id] = type;
|
||||
bool nowUsed = m_tokenTypes[id] != Hierarchy::EntryType::Count;
|
||||
|
||||
if (wasUsed && !nowUsed)
|
||||
--m_numUsedTokens;
|
||||
if (!wasUsed && nowUsed)
|
||||
++m_numUsedTokens;
|
||||
}
|
||||
|
||||
bool Geocoder::Context::IsTokenUsed(size_t id) const
|
||||
{
|
||||
ASSERT_LESS(id, m_tokens.size(), ());
|
||||
return m_tokenTypes[id] != Hierarchy::EntryType::Count;
|
||||
}
|
||||
|
||||
// Returns true iff the used-token counter equals the number of tokens.
bool Geocoder::Context::AllTokensUsed() const
{
  return m_tokens.size() == m_numUsedTokens;
}
|
||||
|
||||
// Records |certainty| for |osmId|, keeping the largest value seen so far.
void Geocoder::Context::AddResult(osm::Id const & osmId, double certainty)
{
  // operator[] value-initializes the certainty of an id seen for the first
  // time, so a new id starts from 0.0.
  double & best = m_results[osmId];
  best = max(best, certainty);
}
|
||||
|
||||
void Geocoder::Context::FillResults(std::vector<Result> & results) const
|
||||
{
|
||||
results.clear();
|
||||
results.reserve(m_results.size());
|
||||
for (auto const & e : m_results)
|
||||
results.emplace_back(e.first /* osmId */, e.second /* certainty */);
|
||||
}
|
||||
|
||||
// Mutable access to the stack of layers matched so far.
std::vector<Geocoder::Layer> & Geocoder::Context::GetLayers()
{
  return m_layers;
}
|
||||
|
||||
// Read-only access to the stack of layers matched so far.
std::vector<Geocoder::Layer> const & Geocoder::Context::GetLayers() const
{
  return m_layers;
}
|
||||
|
||||
// Geocoder ----------------------------------------------------------------------------------------
|
||||
// Builds the hierarchy from the file at |pathToJsonHierarchy|.
Geocoder::Geocoder(string pathToJsonHierarchy)
  : m_hierarchy(pathToJsonHierarchy)
{
}
|
||||
|
||||
// Geocodes |query|: tokenizes it into a fresh Context, runs the backtracking
// search starting from EntryType::Country and stores the collected results
// in |results|. Whatever |results| held before is discarded, because
// Context::FillResults() clears the vector before filling it.
//
// In DEBUG builds the total processing time is logged when the function exits.
void Geocoder::ProcessQuery(string const & query, vector<Result> & results) const
{
#if defined(DEBUG)
  my::Timer timer;
  MY_SCOPE_GUARD(printDuration, [&timer]() {
    LOG(LINFO, ("Total geocoding time:", timer.ElapsedSeconds(), "seconds"));
  });
#endif

  Context ctx(query);
  Go(ctx, Hierarchy::EntryType::Country);
  ctx.FillResults(results);
}
|
||||
|
||||
// Read-only access to the hierarchy this geocoder was built from.
Hierarchy const & Geocoder::GetHierarchy() const
{
  return m_hierarchy;
}
|
||||
|
||||
// One step of the backtracking search. Tries to interpret every run of
// consecutive unused tokens as the name of an entry of type |type|; for each
// run that matches, marks the tokens, records the matched entries as results
// and recurses into the next type. Finally recurses into the next type
// without consuming any token, i.e. allows |type| to be absent from the query.
void Geocoder::Go(Context & ctx, Hierarchy::EntryType const & type) const
{
  size_t const numTokens = ctx.GetNumTokens();
  if (numTokens == 0 || ctx.AllTokensUsed() || type == Hierarchy::EntryType::Count)
    return;

  vector<strings::UniString> candidate;
  for (size_t first = 0; first < numTokens; ++first)
  {
    candidate.clear();

    // Extend the run [first, last] one token at a time until a used token
    // (or the end of the query) is hit.
    for (size_t last = first; last < numTokens && !ctx.IsTokenUsed(last); ++last)
    {
      candidate.push_back(ctx.GetToken(last));

      auto const * matches = m_hierarchy.GetEntries(candidate);
      if (matches == nullptr || matches->empty())
        continue;

      // Keep the entries of the requested type which are either top-level
      // (no layers matched yet) or descend from an already matched entry.
      Layer layer;
      layer.m_type = type;
      for (auto const & entry : *matches)
      {
        if (entry.m_type != type)
          continue;

        if (ctx.GetLayers().empty() || FindParent(ctx.GetLayers(), entry))
          layer.m_entries.emplace_back(&entry);
      }

      if (layer.m_entries.empty())
        continue;

      // The tokens stay marked until the end of this iteration.
      ScopedMarkTokens mark(ctx, type, first, last + 1);

      // Certainty is the fraction of query tokens consumed so far.
      double const usedTokens = static_cast<double>(ctx.GetNumUsedTokens());
      double const allTokens = static_cast<double>(ctx.GetNumTokens());
      double const certainty = usedTokens / allTokens;

      for (auto const * matched : layer.m_entries)
        ctx.AddResult(matched->m_osmId, certainty);

      ctx.GetLayers().emplace_back(std::move(layer));
      MY_SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });

      Go(ctx, NextType(type));
    }
  }

  Go(ctx, NextType(type));
}
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -3,9 +3,11 @@
|
|||
#include "geocoder/hierarchy.hpp"
|
||||
#include "geocoder/result.hpp"
|
||||
|
||||
#include "base/osm_id.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace geocoder
|
||||
|
@ -30,6 +32,57 @@ namespace geocoder
|
|||
class Geocoder
|
||||
{
|
||||
public:
|
||||
// A Layer contains all entries matched by a subquery of consecutive tokens.
|
||||
struct Layer
|
||||
{
|
||||
Hierarchy::EntryType m_type = Hierarchy::EntryType::Count;
|
||||
std::vector<Hierarchy::Entry const *> m_entries;
|
||||
};
|
||||
|
||||
// This class is very similar to the one we use in search/.
|
||||
// See search/geocoder_context.hpp.
|
||||
class Context
|
||||
{
|
||||
public:
|
||||
Context(std::string const & query);
|
||||
|
||||
void Clear();
|
||||
|
||||
std::vector<Hierarchy::EntryType> & GetTokenTypes();
|
||||
size_t GetNumTokens() const;
|
||||
size_t GetNumUsedTokens() const;
|
||||
|
||||
strings::UniString const & GetToken(size_t id) const;
|
||||
|
||||
void MarkToken(size_t id, Hierarchy::EntryType const & type);
|
||||
|
||||
// Returns true if |token| is marked as used.
|
||||
bool IsTokenUsed(size_t id) const;
|
||||
|
||||
// Returns true iff all tokens are used.
|
||||
bool AllTokensUsed() const;
|
||||
|
||||
void AddResult(osm::Id const & osmId, double certainty);
|
||||
|
||||
void FillResults(std::vector<Result> & results) const;
|
||||
|
||||
std::vector<Layer> & GetLayers();
|
||||
|
||||
std::vector<Layer> const & GetLayers() const;
|
||||
|
||||
private:
|
||||
// todo(@m) std::string?
|
||||
std::vector<strings::UniString> m_tokens;
|
||||
std::vector<Hierarchy::EntryType> m_tokenTypes;
|
||||
|
||||
size_t m_numUsedTokens = 0;
|
||||
|
||||
// The highest value of certainty for each retrieved osm id.
|
||||
std::unordered_map<osm::Id, double, osm::HashId> m_results;
|
||||
|
||||
std::vector<Layer> m_layers;
|
||||
};
|
||||
|
||||
explicit Geocoder(std::string pathToJsonHierarchy);
|
||||
|
||||
void ProcessQuery(std::string const & query, std::vector<Result> & results) const;
|
||||
|
@ -37,6 +90,10 @@ public:
|
|||
Hierarchy const & GetHierarchy() const;
|
||||
|
||||
private:
|
||||
void Go(Context & ctx, Hierarchy::EntryType const & type) const;
|
||||
|
||||
void EmitResult() const;
|
||||
|
||||
Hierarchy m_hierarchy;
|
||||
};
|
||||
} // namespace geocoder
|
||||
|
|
|
@ -7,7 +7,10 @@
|
|||
#include "platform/platform_tests_support/scoped_file.hpp"
|
||||
|
||||
#include "base/math.hpp"
|
||||
#include "base/osm_id.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
|
@ -19,6 +22,8 @@ namespace
|
|||
double const kCertaintyEps = 1e-6;
|
||||
|
||||
string const kRegionsData = R"#(
|
||||
-4611686018427080071 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"name": "Cuba", "rank": 2, "address": {"country": "Cuba"}}}
|
||||
-4611686018425533273 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"name": "Ciego de Ávila", "rank": 4, "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}
|
||||
-4611686018421500235 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"name": "Florencia", "rank": 6, "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}}
|
||||
)#";
|
||||
|
||||
|
@ -32,26 +37,32 @@ geocoder::Tokens Split(string const & s)
|
|||
|
||||
namespace geocoder
|
||||
{
|
||||
void TestGeocoder(Geocoder const & geocoder, string const & query, vector<Result> const & expected)
|
||||
void TestGeocoder(Geocoder & geocoder, string const & query, vector<Result> && expected)
|
||||
{
|
||||
vector<Result> actual;
|
||||
geocoder.ProcessQuery(query, actual);
|
||||
TEST_EQUAL(actual.size(), expected.size(), ());
|
||||
TEST_EQUAL(actual.size(), expected.size(), (actual, expected));
|
||||
sort(actual.begin(), actual.end(), my::LessBy(&Result::m_osmId));
|
||||
sort(expected.begin(), expected.end(), my::LessBy(&Result::m_osmId));
|
||||
for (size_t i = 0; i < actual.size(); ++i)
|
||||
{
|
||||
TEST_EQUAL(actual[i].m_osmId, expected[i].m_osmId, ());
|
||||
TEST(my::AlmostEqualAbs(actual[i].m_certainty, expected[i].m_certainty, kCertaintyEps), ());
|
||||
TEST(my::AlmostEqualAbs(actual[i].m_certainty, expected[i].m_certainty, kCertaintyEps),
|
||||
(query, actual[i].m_certainty, expected[i].m_certainty));
|
||||
}
|
||||
}
|
||||
|
||||
UNIT_TEST(Geocoder_Smoke)
|
||||
{
|
||||
Geocoder geocoder("" /* pathToJsonHierarchy */);
|
||||
ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData);
|
||||
Geocoder geocoder(regionsJsonFile.GetFullPath());
|
||||
|
||||
TestGeocoder(geocoder, "a",
|
||||
{{osm::Id(0xC00000000026FCFDULL), 0.5}, {osm::Id(0x40000000C4D63818ULL), 1.0}});
|
||||
TestGeocoder(geocoder, "b",
|
||||
{{osm::Id(0x8000000014527125ULL), 0.8}, {osm::Id(0x40000000F26943B9ULL), 0.1}});
|
||||
osm::Id const florenciaId(13835058055288051381ULL);
|
||||
osm::Id const cubaId(13835058055282471545ULL);
|
||||
|
||||
TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}});
|
||||
TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.5}});
|
||||
TestGeocoder(geocoder, "florencia somewhere in cuba", {{cubaId, 0.25}, {florenciaId, 0.5}});
|
||||
}
|
||||
|
||||
UNIT_TEST(Geocoder_Hierarchy)
|
||||
|
|
|
@ -95,10 +95,10 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * root)
|
|||
// Hierarchy ---------------------------------------------------------------------------------------
|
||||
Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
||||
{
|
||||
fstream fs(pathToJsonHierarchy);
|
||||
ifstream ifs(pathToJsonHierarchy);
|
||||
string line;
|
||||
|
||||
while (getline(fs, line))
|
||||
while (getline(ifs, line))
|
||||
{
|
||||
if (line.empty())
|
||||
continue;
|
||||
|
@ -116,6 +116,8 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy)
|
|||
CHECK(entry.DeserializeFromJSON(line), (line));
|
||||
m_entries[entry.m_nameTokens].emplace_back(entry);
|
||||
}
|
||||
|
||||
LOG(LINFO, ("Finished reading the hierarchy"));
|
||||
}
|
||||
|
||||
vector<Hierarchy::Entry> const * const Hierarchy::GetEntries(
|
||||
|
|
|
@ -44,7 +44,10 @@ public:
|
|||
void DeserializeFromJSONImpl(json_t * root);
|
||||
|
||||
osm::Id m_osmId = osm::Id(osm::Id::kInvalid);
|
||||
|
||||
// Original name of the entry. Useful for debugging.
|
||||
std::string m_name;
|
||||
// Tokenized and simplified name of the entry.
|
||||
std::vector<strings::UniString> m_nameTokens;
|
||||
|
||||
EntryType m_type = EntryType::Count;
|
||||
|
|
Loading…
Add table
Reference in a new issue