[geocoder] Fix rank for house number candidate with extra tokens
This commit is contained in:
parent
eaf07a172d
commit
45609ef3cd
5 changed files with 162 additions and 27 deletions
|
@ -150,6 +150,13 @@ strings::UniString MakeHouseNumber(Tokens const & tokens)
|
|||
{
|
||||
return strings::MakeUniString(strings::JoinStrings(tokens, " "));
|
||||
}
|
||||
|
||||
// Extends |houseNumber| in place with a single space followed by |token|.
// Returns a reference to the very same |houseNumber| object.
strings::UniString & AppendToHouseNumber(strings::UniString & houseNumber, std::string const & token)
{
  // Separator and token are converted together, as one space-prefixed string.
  houseNumber += strings::MakeUniString(" " + token);
  return houseNumber;
}
|
||||
} // namespace
|
||||
|
||||
// Geocoder::Layer ---------------------------------------------------------------------------------
|
||||
|
@ -222,9 +229,10 @@ bool Geocoder::Context::IsTokenUsed(size_t id) const
|
|||
bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_tokens.size(); }
|
||||
|
||||
void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
|
||||
vector<size_t> const & tokenIds, vector<Type> const & allTypes)
|
||||
vector<size_t> const & tokenIds, vector<Type> const & allTypes,
|
||||
bool isOtherSimilar)
|
||||
{
|
||||
m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes), certainty);
|
||||
m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes, isOtherSimilar), certainty);
|
||||
}
|
||||
|
||||
void Geocoder::Context::FillResults(vector<Result> & results) const
|
||||
|
@ -232,6 +240,8 @@ void Geocoder::Context::FillResults(vector<Result> & results) const
|
|||
results.clear();
|
||||
results.reserve(m_beam.GetEntries().size());
|
||||
|
||||
auto normalizationCertainty = 0.0;
|
||||
|
||||
set<base::GeoObjectId> seen;
|
||||
bool const hasPotentialHouseNumber = !m_houseNumberPositionsInQuery.empty();
|
||||
for (auto const & e : m_beam.GetEntries())
|
||||
|
@ -242,18 +252,21 @@ void Geocoder::Context::FillResults(vector<Result> & results) const
|
|||
if (hasPotentialHouseNumber && !IsGoodForPotentialHouseNumberAt(e.m_key, m_houseNumberPositionsInQuery))
|
||||
continue;
|
||||
|
||||
results.emplace_back(e.m_key.m_osmId, e.m_value /* certainty */);
|
||||
}
|
||||
|
||||
if (!results.empty())
|
||||
{
|
||||
auto const by = results.front().m_certainty;
|
||||
for (auto & r : results)
|
||||
if (!normalizationCertainty)
|
||||
{
|
||||
r.m_certainty /= by;
|
||||
ASSERT_GREATER_OR_EQUAL(r.m_certainty, 0.0, ());
|
||||
ASSERT_LESS_OR_EQUAL(r.m_certainty, 1.0, ());
|
||||
normalizationCertainty = e.m_value;
|
||||
// Normalize other-similar candidate certainty to 0.95 in the best results.
|
||||
if (e.m_key.m_isOtherSimilar)
|
||||
normalizationCertainty /= 0.95;
|
||||
}
|
||||
|
||||
ASSERT_GREATER_OR_EQUAL(normalizationCertainty, e.m_value, ());
|
||||
|
||||
auto resultCertainty = e.m_value / normalizationCertainty;
|
||||
ASSERT_GREATER_OR_EQUAL(resultCertainty, 0.0, ());
|
||||
ASSERT_LESS_OR_EQUAL(resultCertainty, 1.0, ());
|
||||
|
||||
results.emplace_back(e.m_key.m_osmId, resultCertainty);
|
||||
}
|
||||
|
||||
ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ());
|
||||
|
@ -427,6 +440,11 @@ void Geocoder::Go(Context & ctx, Type type) const
|
|||
// Buildings are indexed separately.
|
||||
if (type == Type::Building)
|
||||
{
|
||||
// House building parser has specific tokenizer.
|
||||
// Pass the longest house number token sequence to the house number matcher.
|
||||
if (IsValidHouseNumberWithNextUnusedToken(ctx, subquery, subqueryTokenIds))
|
||||
continue;
|
||||
|
||||
FillBuildingsLayer(ctx, subquery, subqueryTokenIds, curLayer);
|
||||
}
|
||||
else
|
||||
|
@ -476,6 +494,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
|
|||
// let's stay on the safer side and mark the tokens as potential house number.
|
||||
ctx.MarkHouseNumberPositionsInQuery(subqueryTokenIds);
|
||||
|
||||
auto subqueryNumberParse = std::vector<search::house_numbers::Token>{};
|
||||
ParseQuery(subqueryHN, false /* queryIsPrefix */, subqueryNumberParse);
|
||||
|
||||
auto candidates = std::vector<Candidate>{};
|
||||
|
||||
auto const & lastLayer = ctx.GetLayers().back();
|
||||
|
@ -490,8 +511,8 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
|
|||
Type::Building, m_hierarchy.GetNormalizedNameDictionary());
|
||||
auto const & realHN = multipleHN.GetMainName();
|
||||
auto const & realHNUniStr = strings::MakeUniString(realHN);
|
||||
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN,
|
||||
false /* queryIsPrefix */))
|
||||
auto matchResult = search::house_numbers::MatchResult{};
|
||||
if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryNumberParse, matchResult))
|
||||
{
|
||||
auto && parentCandidateCertainty =
|
||||
forSublocalityLayer ? FindMaxCertaintyInParentCandidates(ctx.GetLayers(), building)
|
||||
|
@ -499,11 +520,11 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector
|
|||
if (!parentCandidateCertainty)
|
||||
return;
|
||||
|
||||
static auto const buildingTokenWeight = GetWeight(Kind::Building);
|
||||
auto totalCertainty =
|
||||
*parentCandidateCertainty + buildingTokenWeight * subqueryTokenIds.size();
|
||||
|
||||
candidates.push_back({buildingDocId, totalCertainty});
|
||||
*parentCandidateCertainty + SumHouseNumberSubqueryCertainty(matchResult);
|
||||
auto const isOtherSimilar =
|
||||
matchResult.queryMismatchedTokensCount || matchResult.houseNumberMismatchedTokensCount;
|
||||
candidates.push_back({buildingDocId, totalCertainty, isOtherSimilar});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -535,7 +556,7 @@ void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & s
|
|||
(d.m_kind != Kind::Unknown ? GetWeight(d.m_kind) : GetWeight(d.m_type)) * subquery.size();
|
||||
auto totalCertainty = *parentCandidateCertainty + subqueryWeight;
|
||||
|
||||
candidates.push_back({docId, totalCertainty});
|
||||
candidates.push_back({docId, totalCertainty, false /* m_isOtherSimilar */});
|
||||
});
|
||||
|
||||
if (!candidates.empty())
|
||||
|
@ -571,10 +592,56 @@ void Geocoder::AddResults(Context & ctx, std::vector<Candidate> const & candidat
|
|||
entryCertainty += kCityStateExtraWeight;
|
||||
}
|
||||
|
||||
ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes);
|
||||
ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes,
|
||||
candidate.m_isOtherSimilar);
|
||||
}
|
||||
}
|
||||
|
||||
// Tells whether |subquery| still looks like a house number after the query token
// that directly follows it is appended.
//
// Returns false when there is no following token or when that token is already used.
// NOTE(review): assumes |subqueryTokenIds| is non-empty (back() is called on it) —
// verify against the caller.
bool Geocoder::IsValidHouseNumberWithNextUnusedToken(
    Context const & ctx, Tokens const & subquery, vector<size_t> const & subqueryTokenIds) const
{
  auto const followingId = subqueryTokenIds.back() + 1;

  // The usage check is evaluated only for ids that are inside the query.
  bool const canExtend = followingId < ctx.GetNumTokens() && !ctx.IsTokenUsed(followingId);
  if (!canExtend)
    return false;

  auto extendedHouseNumber = MakeHouseNumber(subquery);
  AppendToHouseNumber(extendedHouseNumber, ctx.GetToken(followingId));

  return search::house_numbers::LooksLikeHouseNumber(extendedHouseNumber, false /* isPrefix */);
}
|
||||
|
||||
// Converts a house number |matchResult| into a certainty contribution.
//
// Every matched token adds one building token weight. A partial match is then
// penalized by at most one building token weight per kind of mismatch:
//  - query tokens absent from the candidate: ratio 4m / (4m + matched);
//  - candidate tokens absent from the query: ratio e / (matched + e).
double Geocoder::SumHouseNumberSubqueryCertainty(
    search::house_numbers::MatchResult const & matchResult) const
{
  static auto const buildingTokenWeight = GetWeight(Kind::Building);

  auto const matched = matchResult.matchedTokensCount;
  auto const missing = matchResult.queryMismatchedTokensCount;
  auto const extra = matchResult.houseNumberMismatchedTokensCount;

  auto certainty = matched * buildingTokenWeight;

  // The candidate does not contain all of the query tokens.
  if (missing != 0)
  {
    // A token missing from the candidate is penalized more heavily than an extra
    // token in the candidate: <missing token weight> / <extra token weight>.
    auto const missingTokenRelativeWeight = 4.0;
    auto const weightedMissing = missingTokenRelativeWeight * missing;
    certainty -= weightedMissing / (weightedMissing + matched) * buildingTokenWeight;
  }

  // The candidate has tokens that the query does not mention.
  if (extra != 0)
    certainty -= double(extra) / (matched + extra) * buildingTokenWeight;

  return certainty;
}
|
||||
|
||||
bool Geocoder::InCityState(Hierarchy::Entry const & entry) const
|
||||
{
|
||||
if (!entry.HasFieldInAddress(Type::Locality))
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "geocoder/hierarchy.hpp"
|
||||
#include "geocoder/house_numbers_matcher.hpp"
|
||||
#include "geocoder/index.hpp"
|
||||
#include "geocoder/result.hpp"
|
||||
#include "geocoder/types.hpp"
|
||||
|
@ -51,6 +52,7 @@ public:
|
|||
{
|
||||
Index::DocId m_entry;
|
||||
double m_totalCertainty;
|
||||
bool m_isOtherSimilar;
|
||||
};
|
||||
|
||||
// A Layer contains all entries matched by a subquery of consecutive tokens.
|
||||
|
@ -80,11 +82,12 @@ public:
|
|||
struct BeamKey
|
||||
{
|
||||
BeamKey(base::GeoObjectId osmId, Type type, std::vector<size_t> const & tokenIds,
|
||||
std::vector<Type> const & allTypes)
|
||||
std::vector<Type> const & allTypes, bool isOtherSimilar)
|
||||
: m_osmId(osmId)
|
||||
, m_type(type)
|
||||
, m_tokenIds{tokenIds}
|
||||
, m_allTypes(allTypes)
|
||||
, m_isOtherSimilar(isOtherSimilar)
|
||||
{
|
||||
base::SortUnique(m_allTypes);
|
||||
}
|
||||
|
@ -93,6 +96,7 @@ public:
|
|||
Type m_type;
|
||||
std::vector<size_t> m_tokenIds;
|
||||
std::vector<Type> m_allTypes;
|
||||
bool m_isOtherSimilar;
|
||||
};
|
||||
|
||||
Context(std::string const & query);
|
||||
|
@ -116,7 +120,8 @@ public:
|
|||
bool AllTokensUsed() const;
|
||||
|
||||
void AddResult(base::GeoObjectId const & osmId, double certainty, Type type,
|
||||
std::vector<size_t> const & tokenIds, std::vector<Type> const & allTypes);
|
||||
std::vector<size_t> const & tokenIds, std::vector<Type> const & allTypes,
|
||||
bool isOtherSimilar);
|
||||
|
||||
void FillResults(std::vector<Result> & results) const;
|
||||
|
||||
|
@ -180,6 +185,11 @@ private:
|
|||
Layer & curLayer) const;
|
||||
void AddResults(Context & ctx, std::vector<Candidate> const & candidates) const;
|
||||
|
||||
bool IsValidHouseNumberWithNextUnusedToken(Context const & ctx, Tokens const & subquery,
|
||||
std::vector<size_t> const & subqueryTokenIds) const;
|
||||
double SumHouseNumberSubqueryCertainty(
|
||||
search::house_numbers::MatchResult const & matchResult) const;
|
||||
|
||||
bool InCityState(Hierarchy::Entry const & entry) const;
|
||||
|
||||
// Find max certainty in parent candidates.
|
||||
|
|
|
@ -22,7 +22,7 @@ namespace
|
|||
{
|
||||
using Id = base::GeoObjectId;
|
||||
|
||||
double const kCertaintyEps = 1e-4;
|
||||
double const kCertaintyEps = 1e-3;
|
||||
string const kRegionsData = R"#(
|
||||
C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"kind": "country", "locales": {"default": {"name": "Cuba", "address": {"country": "Cuba"}}}, "rank": 2}}
|
||||
C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"kind": "province", "locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}}
|
||||
|
@ -170,6 +170,29 @@ UNIT_TEST(Geocoder_MismatchedLocality)
|
|||
TestGeocoder(geocoder, "Moscow Krymskaya 3", {});
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------------------------------
|
||||
UNIT_TEST(Geocoder_HouseNumberPartialMatch)
{
  // One city, one street and three buildings on it: "7", "7 к2" and "7 к2 с3".
  string const kData = R"#(
10 {"properties": {"kind": "city", "locales": {"default": {"address": {"locality": "Москва"}}}}}
11 {"properties": {"kind": "street", "locales": {"default": {"address": {"street": "Зорге", "locality": "Москва"}}}}}
12 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7", "street": "Зорге", "locality": "Москва"}}}}}
13 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2", "street": "Зорге", "locality": "Москва"}}}}}
14 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2 с3", "street": "Зорге", "locality": "Москва"}}}}}
)#";

  Geocoder geocoder;
  ScopedFile const hierarchyFile("regions.jsonl", kData);
  geocoder.LoadFromJsonl(hierarchyFile.GetFullPath());

  // "7 к2", written with or without the space: the exact building goes first, the
  // building with an extra token second, the building lacking a query token last.
  for (auto const * query : {"Москва, Зорге 7к2", "Москва, Зорге 7 к2"})
    TestGeocoder(geocoder, query, {{Id{0x13}, 1.0}, {Id{0x14}, 0.995}, {Id{0x12}, 0.975}});

  // Plain "7": the exact building first, then the buildings with one and two extra tokens.
  TestGeocoder(geocoder, "Москва, Зорге 7", {{Id{0x12}, 1.0}, {Id{0x13}, 0.993}, {Id{0x14}, 0.990}});

  // The query has a token no building carries: only building "7" is returned, at 0.95.
  for (auto const * query : {"Москва, Зорге 7к1", "Москва, Зорге 7A", "Москва, Зорге 7 A"})
    TestGeocoder(geocoder, query, {{Id{0x12}, 0.95}});
}
|
||||
|
||||
// Geocoder_Moscow* -----------------------------------------------------------------------------
|
||||
UNIT_TEST(Geocoder_MoscowLocalityRank)
|
||||
{
|
||||
|
|
|
@ -535,15 +535,26 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin
|
|||
}
|
||||
|
||||
// Overload for callers that only need the yes/no answer: the detailed match info
// is computed by the three-argument overload and then dropped.
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse)
{
  MatchResult unusedDetails{};
  return HouseNumbersMatch(houseNumber, queryParse, unusedDetails);
}
|
||||
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse,
|
||||
MatchResult & matchResult)
|
||||
{
|
||||
if (houseNumber.empty() || queryParse.empty())
|
||||
{
|
||||
matchResult = {};
|
||||
return false;
|
||||
}
|
||||
|
||||
// Fast pre-check, helps to early exit without complex house number
|
||||
// parsing.
|
||||
if (IsASCIIDigit(houseNumber[0]) && IsASCIIDigit(queryParse[0].m_value[0]) &&
|
||||
houseNumber[0] != queryParse[0].m_value[0])
|
||||
{
|
||||
matchResult = {};
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -554,13 +565,25 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> con
|
|||
{
|
||||
if (parse.empty())
|
||||
continue;
|
||||
if (parse[0] == queryParse[0] &&
|
||||
(IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()) ||
|
||||
IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end())))
|
||||
if (parse[0] == queryParse[0])
|
||||
{
|
||||
return true;
|
||||
if (IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()))
|
||||
{
|
||||
matchResult = {queryParse.size(), parse.size() - queryParse.size(),
|
||||
0 /* queryMismatchedTokensCount */};
|
||||
return true;
|
||||
}
|
||||
|
||||
if (IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end()))
|
||||
{
|
||||
matchResult = {parse.size(), 0 /* houseNumberMismatchedTokensCount */,
|
||||
queryParse.size() - parse.size()};
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
matchResult = {};
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -52,6 +52,13 @@ struct Token
|
|||
bool m_prefix = false;
|
||||
};
|
||||
|
||||
// Token counts produced by matching a house number against a parsed query.
// All counts default to zero, so a default-constructed value means "nothing matched".
struct MatchResult
{
  // Number of tokens shared by the house number and the query.
  size_t matchedTokensCount = 0;
  // Number of house number tokens that the query does not contain.
  size_t houseNumberMismatchedTokensCount = 0;
  // Number of query tokens that the house number does not contain.
  size_t queryMismatchedTokensCount = 0;
};
|
||||
|
||||
// Tokenizes |s| that may be a house number.
|
||||
void Tokenize(strings::UniString s, bool isPrefix, std::vector<Token> & ts);
|
||||
|
||||
|
@ -70,6 +77,11 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin
|
|||
bool HouseNumbersMatch(strings::UniString const & houseNumber,
|
||||
std::vector<Token> const & queryParse);
|
||||
|
||||
// Returns true if the house number matches the given parsed query.
|
||||
// On success |matchResult| holds the match token counts; on failure it is reset to zeros.
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber, std::vector<Token> const & queryParse,
|
||||
MatchResult & matchResult);
|
||||
|
||||
// Returns true if |s| looks like a house number.
|
||||
bool LooksLikeHouseNumber(strings::UniString const & s, bool isPrefix);
|
||||
bool LooksLikeHouseNumber(std::string const & s, bool isPrefix);
|
||||
|
|
Loading…
Add table
Reference in a new issue