[geocoder] Relax number matching for streets

This commit is contained in:
Anatoly Serdtcev 2019-05-30 17:17:53 +03:00 committed by mpimenov
parent 03e2cdd140
commit 4c2f6be287
4 changed files with 201 additions and 60 deletions

View file

@ -299,6 +299,23 @@ bool IsSortedAndUnique(Iter beg, Iter end)
return IsSortedAndUnique(beg, end, std::less<typename std::iterator_traits<Iter>::value_type>());
}
// Returns true if every element of the sorted range [first2, last2) can be found in the
// sorted range [first1, last1), i.e. the second range is a subsequence of the first one.
// Same contract as std::includes(); both ranges must be sorted with operator<.
template <typename Iter1, typename Iter2>
bool Includes(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2)
{
  assert(std::is_sorted(first1, last1));
  assert(std::is_sorted(first2, last2));

  while (first2 != last2)
  {
    // The haystack is exhausted while some needles are still unmatched.
    if (first1 == last1)
      return false;

    // The current needle is smaller than everything left in the haystack.
    if (*first2 < *first1)
      return false;

    // Neither is less than the other: the elements are equivalent, so the needle is matched.
    if (!(*first1 < *first2))
      ++first2;

    ++first1;
  }

  return true;
}
struct DeleteFunctor
{
template <typename T>

View file

@ -14,6 +14,7 @@
#include "base/timer.hpp"
#include <algorithm>
#include <numeric>
#include <set>
#include <thread>
#include <utility>
@ -138,9 +139,9 @@ bool Geocoder::Context::IsTokenUsed(size_t id) const
// Returns true when the number of used tokens equals the total number of query tokens.
bool Geocoder::Context::AllTokensUsed() const
{
  return m_tokens.size() == m_numUsedTokens;
}
// Adds a candidate to |m_beam|: the arguments are packed into a BeamKey and
// |certainty| is passed along as the value associated with that key.
// NOTE(review): this rendering appears to show both the previous variant (taking
// |allTokensUsed|) and the current variant (taking |tokenIds|) of the parameter list
// and of the Add() call — confirm against the actual file.
void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
vector<Type> const & allTypes, bool allTokensUsed)
vector<size_t> const & tokenIds, vector<Type> const & allTypes)
{
m_beam.Add(BeamKey(osmId, type, allTypes, allTokensUsed), certainty);
m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes), certainty);
}
void Geocoder::Context::FillResults(vector<Result> & results) const
@ -149,30 +150,13 @@ void Geocoder::Context::FillResults(vector<Result> & results) const
results.reserve(m_beam.GetEntries().size());
set<base::GeoObjectId> seen;
bool const hasPotentialHouseNumber = !m_houseNumberPositionsInQuery.empty();
for (auto const & e : m_beam.GetEntries())
{
if (!seen.insert(e.m_key.m_osmId).second)
continue;
bool isGoodHouseHumber = false;
if (e.m_key.m_type == Type::Building)
{
bool gotLocality = false;
bool gotStreet = false;
bool gotBuilding = false;
for (Type t : e.m_key.m_allTypes)
{
if (t == Type::Region || t == Type::Subregion || t == Type::Locality)
gotLocality = true;
if (t == Type::Street)
gotStreet = true;
if (t == Type::Building)
gotBuilding = true;
}
isGoodHouseHumber = gotLocality && gotStreet && gotBuilding;
}
if (m_surelyGotHouseNumber && !isGoodHouseHumber && !e.m_key.m_allTokensUsed)
if (hasPotentialHouseNumber && !IsGoodForPotentialHouseNumberAt(e.m_key, m_houseNumberPositionsInQuery))
continue;
results.emplace_back(e.m_key.m_osmId, e.m_value /* certainty */);
@ -197,6 +181,64 @@ vector<Geocoder::Layer> & Geocoder::Context::GetLayers() { return m_layers; }
// Read-only accessor for |m_layers|.
vector<Geocoder::Layer> const & Geocoder::Context::GetLayers() const
{
  return m_layers;
}
// Adds every index from |tokenIds| to |m_houseNumberPositionsInQuery|.
// Indexes that are already present are left untouched.
void Geocoder::Context::MarkHouseNumberPositionsInQuery(vector<size_t> const & tokenIds)
{
  for (size_t const tokenId : tokenIds)
    m_houseNumberPositionsInQuery.insert(tokenId);
}
// Decides whether the beam entry |beamKey| may be emitted when the query tokens at
// positions |tokenIds| are potential house numbers. The entry is accepted if
//   - it has as many matched token ids as there are tokens in the query, or
//   - it is a building with a full address (see IsBuildingWithAddress), or
//   - it has a locality or region among its types and its token ids include all of |tokenIds|.
bool Geocoder::Context::IsGoodForPotentialHouseNumberAt(BeamKey const & beamKey,
                                                        set<size_t> const & tokenIds) const
{
  bool const coversWholeQuery = beamKey.m_tokenIds.size() == m_tokens.size();
  if (coversWholeQuery || IsBuildingWithAddress(beamKey))
    return true;

  // Pass street, locality or region with number in query address parts.
  return HasLocalityOrRegion(beamKey) && ContainsTokenIds(beamKey, tokenIds);
}
// Returns true if |beamKey| is of type Building and its |m_allTypes| contain
// a locality-level type (Region, Subregion or Locality), Street and Building.
bool Geocoder::Context::IsBuildingWithAddress(BeamKey const & beamKey) const
{
  if (beamKey.m_type != Type::Building)
    return false;

  bool hasLocality = false;
  bool hasStreet = false;
  bool hasBuilding = false;
  for (Type const matched : beamKey.m_allTypes)
  {
    hasLocality = hasLocality || matched == Type::Region || matched == Type::Subregion ||
                  matched == Type::Locality;
    hasStreet = hasStreet || matched == Type::Street;
    hasBuilding = hasBuilding || matched == Type::Building;
  }

  return hasLocality && hasStreet && hasBuilding;
}
// Returns true if at least one of the types in |beamKey.m_allTypes| is
// Region, Subregion or Locality.
bool Geocoder::Context::HasLocalityOrRegion(BeamKey const & beamKey) const
{
  auto const isLocalityOrRegion = [](Type t) {
    return t == Type::Region || t == Type::Subregion || t == Type::Locality;
  };

  auto const & types = beamKey.m_allTypes;
  return std::any_of(types.begin(), types.end(), isLocalityOrRegion);
}
// Returns true if every id from |needTokenIds| is present among |beamKey.m_tokenIds|.
// The check is delegated to base::Includes().
bool Geocoder::Context::ContainsTokenIds(BeamKey const & beamKey, set<size_t> const & needTokenIds) const
{
  return base::Includes(beamKey.m_tokenIds.begin(), beamKey.m_tokenIds.end(),
                        needTokenIds.begin(), needTokenIds.end());
}
// Geocoder ----------------------------------------------------------------------------------------
Geocoder::Geocoder(string const & pathToJsonHierarchy, unsigned int loadThreadsCount)
: Geocoder{HierarchyReader{pathToJsonHierarchy}.Read(loadThreadsCount), loadThreadsCount}
@ -243,15 +285,18 @@ void Geocoder::Go(Context & ctx, Type type) const
return;
Tokens subquery;
vector<size_t> subqueryTokenIds;
for (size_t i = 0; i < ctx.GetNumTokens(); ++i)
{
subquery.clear();
subqueryTokenIds.clear();
for (size_t j = i; j < ctx.GetNumTokens(); ++j)
{
if (ctx.IsTokenUsed(j))
break;
subquery.push_back(ctx.GetToken(j));
subqueryTokenIds.push_back(j);
Layer curLayer;
curLayer.m_type = type;
@ -259,7 +304,7 @@ void Geocoder::Go(Context & ctx, Type type) const
// Buildings are indexed separately.
if (type == Type::Building)
{
FillBuildingsLayer(ctx, subquery, curLayer);
FillBuildingsLayer(ctx, subquery, subqueryTokenIds, curLayer);
}
else
{
@ -273,6 +318,7 @@ void Geocoder::Go(Context & ctx, Type type) const
boost::optional<ScopedMarkTokens> streetSynonymMark;
double certainty = 0;
vector<size_t> tokenIds;
vector<Type> allTypes;
for (size_t tokId = 0; tokId < ctx.GetNumTokens(); ++tokId)
{
@ -285,11 +331,14 @@ void Geocoder::Go(Context & ctx, Type type) const
certainty += GetWeight(t);
if (t != Type::Count)
{
tokenIds.push_back(tokId);
allTypes.push_back(t);
}
}
for (auto const & docId : curLayer.m_entries)
ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, allTypes, ctx.AllTokensUsed());
ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, tokenIds, allTypes);
ctx.GetLayers().emplace_back(move(curLayer));
SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); });
@ -301,7 +350,8 @@ void Geocoder::Go(Context & ctx, Type type) const
Go(ctx, NextType(type));
}
void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer & curLayer) const
void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector<size_t> const & subqueryTokenIds,
Layer & curLayer) const
{
if (ctx.GetLayers().empty())
return;
@ -317,8 +367,8 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer
// We've already filled a street/location layer and now see something that resembles
// a house number. While it still can be something else (a zip code, for example)
// let's stay on the safer side and set the house number bit.
ctx.SetHouseNumberBit();
// let's stay on the safer side and mark the tokens as potential house number.
ctx.MarkHouseNumberPositionsInQuery(subqueryTokenIds);
for (auto const & docId : layer.m_entries)
{

View file

@ -11,6 +11,7 @@
#include "base/string_utils.hpp"
#include <cstddef>
#include <set>
#include <string>
#include <thread>
#include <unordered_map>
@ -53,19 +54,20 @@ public:
public:
// Describes a single candidate kept in the beam: the object id, its type and
// the information about how the query was matched against it.
// NOTE(review): this rendering appears to show both the previous variant (with
// |allTokensUsed| / |m_allTokensUsed|) and the current variant (with |tokenIds| /
// |m_tokenIds|) of the constructor and of the fields — confirm against the actual file.
struct BeamKey
{
BeamKey(base::GeoObjectId osmId, Type type, std::vector<Type> const & allTypes, bool allTokensUsed)
BeamKey(base::GeoObjectId osmId, Type type, std::vector<size_t> const & tokenIds,
std::vector<Type> const & allTypes)
: m_osmId(osmId)
, m_type(type)
, m_tokenIds{tokenIds}
, m_allTypes(allTypes)
, m_allTokensUsed(allTokensUsed)
{
// Presumably sorts |m_allTypes| and drops duplicates — verify against base::SortUnique.
base::SortUnique(m_allTypes);
}
// Id of the object this entry refers to.
base::GeoObjectId m_osmId;
// Type passed to the constructor as |type|.
Type m_type;
// Token ids passed to the constructor, stored as given.
std::vector<size_t> m_tokenIds;
// Types passed to the constructor, post-processed by base::SortUnique.
std::vector<Type> m_allTypes;
bool m_allTokensUsed;
};
Context(std::string const & query);
@ -89,7 +91,7 @@ public:
bool AllTokensUsed() const;
void AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
std::vector<Type> const & allTypes, bool allTokensUsed);
std::vector<size_t> const & tokenIds, std::vector<Type> const & allTypes);
void FillResults(std::vector<Result> & results) const;
@ -97,20 +99,25 @@ public:
std::vector<Layer> const & GetLayers() const;
void SetHouseNumberBit() { m_surelyGotHouseNumber = true; }
void MarkHouseNumberPositionsInQuery(std::vector<size_t> const & tokenIds);
private:
bool IsGoodForPotentialHouseNumberAt(BeamKey const & beamKey, std::set<size_t> const & tokenIds) const;
bool IsBuildingWithAddress(BeamKey const & beamKey) const;
bool HasLocalityOrRegion(BeamKey const & beamKey) const;
bool ContainsTokenIds(BeamKey const & beamKey, std::set<size_t> const & needTokenIds) const;
Tokens m_tokens;
std::vector<Type> m_tokenTypes;
size_t m_numUsedTokens = 0;
// Sticky bit that records a heuristic check whether
// the current query contains a house number.
// |m_houseNumberPositionsInQuery| holds the indexes of the query tokens that stand
// in positions where, judging by the context, a house number may be expected.
// The rationale is that we must only emit buildings in this case
// and implement a fallback to a more powerful geocoder if we
// could not find a building.
bool m_surelyGotHouseNumber = false;
std::set<size_t> m_houseNumberPositionsInQuery;
// The highest value of certainty for a fixed amount of
// the most relevant retrieved osm ids.
@ -133,8 +140,8 @@ private:
void Go(Context & ctx, Type type) const;
void FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer & curLayer) const;
void FillBuildingsLayer(Context & ctx, Tokens const & subquery, std::vector<size_t> const & subqueryTokenIds,
Layer & curLayer) const;
void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery,
Layer & curLayer) const;

View file

@ -20,8 +20,9 @@ using namespace std;
namespace
{
double const kCertaintyEps = 1e-6;
using Id = base::GeoObjectId;
double const kCertaintyEps = 1e-6;
string const kRegionsData = R"#(
-4611686018427080071 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"name": "Cuba", "rank": 2, "address": {"country": "Cuba"}}}
-4611686018425533273 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"name": "Ciego de Ávila", "rank": 4, "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}
@ -153,42 +154,108 @@ UNIT_TEST(Geocoder_MismatchedLocality)
TestGeocoder(geocoder, "Moscow Krymskaya 3", {});
}
// Checks lookups of streets whose names contain a number when the locality is
// given in the query.
// NOTE(review): this rendering appears to interleave the previous and the current
// version of the test (two UNIT_TEST headers, the entry with id 20 listed twice) —
// confirm against the actual file.
UNIT_TEST(Geocoder_StreetWithNumber)
// Geocoder_StreetWithNumber* ----------------------------------------------------------------------
UNIT_TEST(Geocoder_StreetWithNumberInCity)
{
string const kData = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
20 {"properties": {"address": {"locality": "Краснокамск"}}}
11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}
12 {"properties": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}}
13 {"properties": {"address": {"locality": "Москва", "street": "8 Марта"}}}
21 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 8 Марта"}}}
25 {"properties": {"address": {"locality": "Краснокамск", "street": "Январская улица"}}}
26 {"properties": {"address": {"locality": "Краснокамск", "street": "Январская улица", "building": "8"}}}
20 {"properties": {"address": {"locality": "Краснокамск"}}}
28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}}
)#";
ScopedFile const regionsJsonFile("regions.jsonl", kData);
Geocoder geocoder(regionsJsonFile.GetFullPath());
using Id = base::GeoObjectId;
TestGeocoder(geocoder, "Москва, улица 1905 года", {{Id{11}, 1.0}});
TestGeocoder(geocoder, "Москва, 1905 года", {{Id{11}, 1.0}});
TestGeocoder(geocoder, "Краснокамск, улица 1905 года", {});
TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{12}, 1.0}});
TestGeocoder(geocoder, "Москва, 4-я 8 Марта", {{Id{12}, 1.0}});
TestGeocoder(geocoder, "Москва, 8 Марта", {{Id{13}, 1.0}});
TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{13}, 1.0}});
TestGeocoder(geocoder, "Краснокамск, улица 8 Марта", {{Id{21}, 1.0}});
TestGeocoder(geocoder, "Краснокамск, 8 Марта", {{Id{21}, 1.0}});
TestGeocoder(geocoder, "Краснокамск, Январская 8", {{Id{26}, 1.0}});
}
// A street with a number in its name must be found when the locality in the query
// is preceded by its type word.
UNIT_TEST(Geocoder_StreetWithNumberInClassifiedCity)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "город Москва, улица 1905 года", {{Id{11}, 1.0}});
}
// Without a locality in the query, a street with a number in its name must be
// returned for every locality that has such a street.
UNIT_TEST(Geocoder_StreetWithNumberInAnyCity)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}
20 {"properties": {"address": {"locality": "Краснокамск"}}}
28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "улица 1905 года", {{Id{11}, 1.0}, {Id{28}, 1.0}});
}
// A street with a number in its name must be found even when the query omits
// the street synonym that is part of the stored street name.
UNIT_TEST(Geocoder_StreetWithNumberAndWithoutStreetSynonym)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "Москва, 1905 года", {{Id{11}, 1.0}});
}
// A street stored without a street synonym in its name must be found when the
// query adds the synonym before the name.
UNIT_TEST(Geocoder_UntypedStreetWithNumberAndStreetSynonym)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
13 {"properties": {"address": {"locality": "Москва", "street": "8 Марта"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{13}, 1.0}});
}
// When two streets differ only by an extra leading ordinal in the name, a query
// containing both numbers must yield only the street with two numbers.
UNIT_TEST(Geocoder_StreetWithTwoNumbers)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
12 {"properties": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}}
13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{12}, 1.0}});
}
// A building on a street whose name contains a number must be found by
// "locality, street, house number"; the street itself must not be returned.
UNIT_TEST(Geocoder_BuildingOnStreetWithNumber)
{
  string const kHierarchy = R"#(
10 {"properties": {"address": {"locality": "Москва"}}}
13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}}
15 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта", "building": "4"}}}
)#";

  ScopedFile const hierarchyFile("regions.jsonl", kHierarchy);
  Geocoder geocoder(hierarchyFile.GetFullPath());

  TestGeocoder(geocoder, "Москва, улица 8 Марта, 4", {{Id{15}, 1.0}});
}
//--------------------------------------------------------------------------------------------------
UNIT_TEST(Geocoder_LocalityBuilding)
{
string const kData = R"#(