[geocoder] Fix rank for house number candidate with extra tokens

This commit is contained in:
Anatoly Serdtcev 2019-12-16 14:33:25 +03:00 committed by Sergey Yershov
parent eaf07a172d
commit 45609ef3cd
5 changed files with 162 additions and 27 deletions

View file

@ -150,6 +150,13 @@ strings::UniString MakeHouseNumber(Tokens const & tokens)
{
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
}
// Extends |houseNumber| in place with a single space followed by |token|
// and returns a reference to the same |houseNumber| object.
strings::UniString & AppendToHouseNumber(strings::UniString & houseNumber, std::string const & token)
{
  auto const separator = strings::MakeUniString(" ");
  auto const tail = strings::MakeUniString(token);

  houseNumber += separator;
  houseNumber += tail;
  return houseNumber;
}
} // namespace
// Geocoder::Layer ---------------------------------------------------------------------------------
@ -222,9 +229,10 @@ bool Geocoder::Context::IsTokenUsed(size_t id) const
bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_tokens.size(); }
void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
vector<size_t> const & tokenIds, vector<Type> const & allTypes)
vector<size_t> const & tokenIds, vector<Type> const & allTypes,
bool isOtherSimilar)
{
m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes), certainty);
m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes, isOtherSimilar), certainty);
}
void Geocoder::Context::FillResults(vector<Result> & results) const
@ -232,6 +240,8 @@ void Geocoder::Context::FillResults(vector<Result> & results) const
results.clear();
results.reserve(m_beam.GetEntries().size());
auto normalizationCertainty = 0.0;
set<base::GeoObjectId> seen;
bool const hasPotentialHouseNumber = !m_houseNumberPositionsInQuery.empty();
for (auto const & e : m_beam.GetEntries())
@ -242,18 +252,21 @@ void Geocoder::Context::FillResults(vector<Result> & results) const
if (hasPotentialHouseNumber && !IsGoodForPotentialHouseNumberAt(e.m_key, m_houseNumberPositionsInQuery))
continue;
results.emplace_back(e.m_key.m_osmId, e.m_value /* certainty */);
}
if (!results.empty())
{
auto const by = results.front().m_certainty;
for (auto & r : results)
if (!normalizationCertainty)
{
r.m_certainty /= by;
ASSERT_GREATER_OR_EQUAL(r.m_certainty, 0.0, ());
ASSERT_LESS_OR_EQUAL(r.m_certainty, 1.0, ());
normalizationCertainty = e.m_value;
// If the best result is an other-similar candidate, scale so that its certainty becomes 0.95.
if (e.m_key.m_isOtherSimilar)
normalizationCertainty /= 0.95;
}
ASSERT_GREATER_OR_EQUAL(normalizationCertainty, e.m_value, ());
auto resultCertainty = e.m_value / normalizationCertainty;
ASSERT_GREATER_OR_EQUAL(resultCertainty, 0.0, ());
ASSERT_LESS_OR_EQUAL(resultCertainty, 1.0, ());
results.emplace_back(e.m_key.m_osmId, resultCertainty);
}
ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ());
@ -427,6 +440,11 @@ void Geocoder::Go(Context & ctx, Type type) const
// Buildings are indexed separately.
if (type == Type::Building)
{
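// The house number parser has its own tokenizer, so pass the longest house
// number token sequence to the house number matcher: skip this subquery if
// it still looks like a house number when extended with the next unused token.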
if (IsValidHouseNumberWithNextUnusedToken(ctx, subquery, subqueryTokenIds))
continue;
FillBuildingsLayer(ctx, subquery, subqueryTokenIds, curLayer);
}
else
@ -476,6 +494,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
// let's stay on the safer side and mark the tokens as potential house number.
ctx.MarkHouseNumberPositionsInQuery(subqueryTokenIds);
auto subqueryNumberParse = std::vector<search::house_numbers::Token>{};
ParseQuery(subqueryHN, false /* queryIsPrefix */, subqueryNumberParse);
auto candidates = std::vector<Candidate>{};
auto const & lastLayer = ctx.GetLayers().back();
@ -490,8 +511,8 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
Type::Building, m_hierarchy.GetNormalizedNameDictionary());
auto const & realHN = multipleHN.GetMainName();
auto const & realHNUniStr = strings::MakeUniString(realHN);
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
false /* queryIsPrefix */))
auto matchResult = search::house_numbers::MatchResult{};
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryNumberParse, matchResult))
{
auto && parentCandidateCertainty =
forSublocalityLayer ? FindMaxCertaintyInParentCandidates(ctx.GetLayers(), building)
@ -499,11 +520,11 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
if (!parentCandidateCertainty)
return;
static auto const buildingTokenWeight = GetWeight(Kind::Building);
auto totalCertainty =
*parentCandidateCertainty + buildingTokenWeight * subqueryTokenIds.size();
candidates.push_back({buildingDocId, totalCertainty});
*parentCandidateCertainty + SumHouseNumberSubqueryCertainty(matchResult);
auto const isOtherSimilar =
matchResult.queryMismatchedTokensCount || matchResult.houseNumberMismatchedTokensCount;
candidates.push_back({buildingDocId, totalCertainty, isOtherSimilar});
}
});
}
@ -535,7 +556,7 @@ void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & s
(d.m_kind != Kind::Unknown ? GetWeight(d.m_kind) : GetWeight(d.m_type)) * subquery.size();
auto totalCertainty = *parentCandidateCertainty + subqueryWeight;
candidates.push_back({docId, totalCertainty});
candidates.push_back({docId, totalCertainty, false /* m_isOtherSimilar */});
});
if (!candidates.empty())
@ -571,10 +592,56 @@ void Geocoder::AddResults(Context & ctx, std::vector<Candidate> const & candidat
entryCertainty += kCityStateExtraWeight;
}
ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes);
ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes,
candidate.m_isOtherSimilar);
}
}
// Returns true if |subquery|, extended with the query token whose id follows
// the last id in |subqueryTokenIds|, still looks like a house number.
// Returns false if |subqueryTokenIds| is empty, if there is no such next
// token, or if that token is already used.
bool Geocoder::IsValidHouseNumberWithNextUnusedToken(
    Context const & ctx, Tokens const & subquery, vector<size_t> const & subqueryTokenIds) const
{
  // back() on an empty vector is undefined behavior; with no token ids there
  // is nothing to extend.
  if (subqueryTokenIds.empty())
    return false;

  auto const nextTokenId = subqueryTokenIds.back() + 1;
  if (nextTokenId >= ctx.GetNumTokens() || ctx.IsTokenUsed(nextTokenId))
    return false;

  // Build "<subquery tokens> <next token>" and let the house number parser
  // decide whether the longer sequence is still a plausible house number.
  auto subqueryHouseNumber = MakeHouseNumber(subquery);
  AppendToHouseNumber(subqueryHouseNumber, ctx.GetToken(nextTokenId));
  return search::house_numbers::LooksLikeHouseNumber(subqueryHouseNumber, false /* isPrefix */);
}
// Computes the certainty contributed by a house number subquery described by
// |matchResult|: the building token weight for every matched token, reduced
// by one penalty when the candidate lacks some query tokens and by another
// penalty when the candidate has tokens that are absent from the query.
double Geocoder::SumHouseNumberSubqueryCertainty(
    search::house_numbers::MatchResult const & matchResult) const
{
  static auto const buildingTokenWeight = GetWeight(Kind::Building);

  auto const matched = matchResult.matchedTokensCount;
  auto const missing = matchResult.queryMismatchedTokensCount;
  auto const extra = matchResult.houseNumberMismatchedTokensCount;

  auto certainty = matched * buildingTokenWeight;

  // The candidate does not have all of the query tokens.
  if (missing != 0)
  {
    // A token missing from the candidate is penalized more heavily than
    // an extra token in the candidate.
    double const missingTokenRelativeWeight = 4.0;  // <missing token weight> / <extra token weight>
    auto const weightedMissing = missingTokenRelativeWeight * missing;
    auto const penaltyRatio = weightedMissing / (weightedMissing + matched);
    certainty -= penaltyRatio * buildingTokenWeight;
  }

  // The candidate has extra tokens that are not in the query.
  if (extra != 0)
  {
    auto const penaltyRatio = static_cast<double>(extra) / (matched + extra);
    certainty -= penaltyRatio * buildingTokenWeight;
  }

  return certainty;
}
bool Geocoder::InCityState(Hierarchy::Entry const & entry) const
{
if (!entry.HasFieldInAddress(Type::Locality))

View file

@ -1,6 +1,7 @@
#pragma once
#include "geocoder/hierarchy.hpp"
#include "geocoder/house_numbers_matcher.hpp"
#include "geocoder/index.hpp"
#include "geocoder/result.hpp"
#include "geocoder/types.hpp"
@ -51,6 +52,7 @@ public:
{
Index::DocId m_entry;
double m_totalCertainty;
bool m_isOtherSimilar;
};
// A Layer contains all entries matched by a subquery of consecutive tokens.
@ -80,11 +82,12 @@ public:
struct BeamKey
{
BeamKey(base::GeoObjectId osmId, Type type, std::vector<size_t> const & tokenIds,
std::vector<Type> const & allTypes)
std::vector<Type> const & allTypes, bool isOtherSimilar)
: m_osmId(osmId)
, m_type(type)
, m_tokenIds{tokenIds}
, m_allTypes(allTypes)
, m_isOtherSimilar(isOtherSimilar)
{
base::SortUnique(m_allTypes);
}
@ -93,6 +96,7 @@ public:
Type m_type;
std::vector<size_t> m_tokenIds;
std::vector<Type> m_allTypes;
bool m_isOtherSimilar;
};
Context(std::string const & query);
@ -116,7 +120,8 @@ public:
bool AllTokensUsed() const;
void AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
std::vector<size_t> const & tokenIds, std::vector<Type> const & allTypes);
std::vector<size_t> const & tokenIds, std::vector<Type> const & allTypes,
bool isOtherSimilar);
void FillResults(std::vector<Result> & results) const;
@ -180,6 +185,11 @@ private:
Layer & curLayer) const;
void AddResults(Context & ctx, std::vector<Candidate> const & candidates) const;
// Returns true if |subquery|, extended with the query token whose id follows
// the last id in |subqueryTokenIds|, still looks like a house number.
// Returns false if that token does not exist or is already used.
bool IsValidHouseNumberWithNextUnusedToken(Context const & ctx, Tokens const & subquery,
std::vector<size_t> const & subqueryTokenIds) const;
// Returns the certainty added by a house number subquery with the given
// |matchResult|: the building token weight per matched token, reduced by
// penalties for query tokens missing from the candidate and for extra
// tokens present in the candidate.
double SumHouseNumberSubqueryCertainty(
search::house_numbers::MatchResult const & matchResult) const;
bool InCityState(Hierarchy::Entry const & entry) const;
// Find max certainty in parent candidates.

View file

@ -22,7 +22,7 @@ namespace
{
using Id = base::GeoObjectId;
double const kCertaintyEps = 1e-4;
double const kCertaintyEps = 1e-3;
string const kRegionsData = R"#(
C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"kind": "country", "locales": {"default": {"name": "Cuba", "address": {"country": "Cuba"}}}, "rank": 2}}
C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"kind": "province", "locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}}
@ -170,6 +170,29 @@ UNIT_TEST(Geocoder_MismatchedLocality)
TestGeocoder(geocoder, "Moscow Krymskaya 3", {});
}
//--------------------------------------------------------------------------------------------------
// Checks the ranking of building candidates whose house number matches the
// house number in the query only partially (missing or extra tokens).
UNIT_TEST(Geocoder_HouseNumberPartialMatch)
{
// Fixture: one city, one street and three buildings on that street with
// house numbers "7", "7 к2" and "7 к2 с3".
string const kData = R"#(
10 {"properties": {"kind": "city", "locales": {"default": {"address": {"locality": "Москва"}}}}}
11 {"properties": {"kind": "street", "locales": {"default": {"address": {"street": "Зорге", "locality": "Москва"}}}}}
12 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7", "street": "Зорге", "locality": "Москва"}}}}}
13 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2", "street": "Зорге", "locality": "Москва"}}}}}
14 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2 с3", "street": "Зорге", "locality": "Москва"}}}}}
)#";
Geocoder geocoder;
ScopedFile const regionsJsonFile("regions.jsonl", kData);
geocoder.LoadFromJsonl(regionsJsonFile.GetFullPath());
// Query "7к2" (with or without a space): building "7 к2" is expected first
// with certainty 1.0, then "7 к2 с3" (extra tokens), then "7" (missing tokens).
TestGeocoder(geocoder, "Москва, Зорге 7к2", {{Id{0x13}, 1.0}, {Id{0x14}, 0.995}, {Id{0x12}, 0.975}});
TestGeocoder(geocoder, "Москва, Зорге 7 к2", {{Id{0x13}, 1.0}, {Id{0x14}, 0.995}, {Id{0x12}, 0.975}});
// Query "7": building "7" is expected first; buildings with extra tokens
// follow, ordered by how few extra tokens they have.
TestGeocoder(geocoder, "Москва, Зорге 7", {{Id{0x12}, 1.0}, {Id{0x13}, 0.993}, {Id{0x14}, 0.990}});
// Queries with a suffix that no building has: only building "7" is expected,
// and its certainty is 0.95 rather than 1.0.
TestGeocoder(geocoder, "Москва, Зорге 7к1", {{Id{0x12}, 0.95}});
TestGeocoder(geocoder, "Москва, Зорге 7A", {{Id{0x12}, 0.95}});
TestGeocoder(geocoder, "Москва, Зорге 7 A", {{Id{0x12}, 0.95}});
}
// Geocoder_Moscow* -----------------------------------------------------------------------------
UNIT_TEST(Geocoder_MoscowLocalityRank)
{

View file

@ -535,15 +535,26 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin
}
// Overload for callers that only need the yes/no answer: the detailed match
// info is collected into a local object and discarded.
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse)
{
  MatchResult discardedMatchInfo{};
  return HouseNumbersMatch(houseNumber, queryParse, discardedMatchInfo);
}
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse,
MatchResult & matchResult)
{
if (houseNumber.empty() || queryParse.empty())
{
matchResult = {};
return false;
}
// Fast pre-check, helps to early exit without complex house number
// parsing.
if (IsASCIIDigit(houseNumber[0]) && IsASCIIDigit(queryParse[0].m_value[0]) &&
houseNumber[0] != queryParse[0].m_value[0])
{
matchResult = {};
return false;
}
@ -554,13 +565,25 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> con
{
if (parse.empty())
continue;
if (parse[0] == queryParse[0] &&
(IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()) ||
IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end())))
if (parse[0] == queryParse[0])
{
return true;
if (IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()))
{
matchResult = {queryParse.size(), parse.size() - queryParse.size(),
0 /* queryMismatchedTokensCount */};
return true;
}
if (IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end()))
{
matchResult = {parse.size(), 0 /* houseNumberMismatchedTokensCount */,
queryParse.size() - parse.size()};
return true;
}
}
}
matchResult = {};
return false;
}

View file

@ -52,6 +52,13 @@ struct Token
bool m_prefix = false;
};
// Token counts describing how a house number compares with a parsed query.
// All counts default to zero, so a default-constructed MatchResult never
// holds indeterminate values. The struct is still brace-initializable in
// declaration order (requires C++14 or later).
struct MatchResult
{
  // Number of tokens counted as matched.
  size_t matchedTokensCount = 0;
  // Number of house number tokens that are not present in the query.
  size_t houseNumberMismatchedTokensCount = 0;
  // Number of query tokens that are not present in the house number.
  size_t queryMismatchedTokensCount = 0;
};
// Tokenizes |s| that may be a house number.
void Tokenize(strings::UniString s, bool isPrefix, std::vector<Token> & ts);
@ -70,6 +77,11 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin
bool HouseNumbersMatch(strings::UniString const & houseNumber,
std::vector<Token> const & queryParse);
// Returns true if |houseNumber| matches the given parsed query |queryParse|.
// When true is returned, |matchResult| holds the matching info (counts of
// matched and mismatched tokens). When false is returned, |matchResult| is
// reset to a value-initialized MatchResult.
bool HouseNumbersMatch(strings::UniString const & houseNumber, std::vector<Token> const & queryParse,
MatchResult & matchResult);
// Returns true if |s| looks like a house number.
bool LooksLikeHouseNumber(strings::UniString const & s, bool isPrefix);
bool LooksLikeHouseNumber(std::string const & s, bool isPrefix);