From 45609ef3cdd63fde576904d2657e24b36235abe4 Mon Sep 17 00:00:00 2001 From: Anatoly Serdtcev Date: Mon, 16 Dec 2019 14:33:25 +0300 Subject: [PATCH] [geocoder] Fix rank for house number candidate with extra tokens --- geocoder/geocoder.cpp | 107 +++++++++++++++++---- geocoder/geocoder.hpp | 14 ++- geocoder/geocoder_tests/geocoder_tests.cpp | 25 ++++- geocoder/house_numbers_matcher.cpp | 31 +++++- geocoder/house_numbers_matcher.hpp | 12 +++ 5 files changed, 162 insertions(+), 27 deletions(-) diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index f380c45..f02b20b 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -150,6 +150,13 @@ strings::UniString MakeHouseNumber(Tokens const & tokens) { return strings::MakeUniString(strings::JoinStrings(tokens, " ")); } + +strings::UniString & AppendToHouseNumber(strings::UniString & houseNumber, std::string const & token) +{ + houseNumber += strings::MakeUniString(" "); + houseNumber += strings::MakeUniString(token); + return houseNumber; +} } // namespace // Geocoder::Layer --------------------------------------------------------------------------------- @@ -222,9 +229,10 @@ bool Geocoder::Context::IsTokenUsed(size_t id) const bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_tokens.size(); } void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty, Type type, - vector const & tokenIds, vector const & allTypes) + vector const & tokenIds, vector const & allTypes, + bool isOtherSimilar) { - m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes), certainty); + m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes, isOtherSimilar), certainty); } void Geocoder::Context::FillResults(vector & results) const @@ -232,6 +240,8 @@ void Geocoder::Context::FillResults(vector & results) const results.clear(); results.reserve(m_beam.GetEntries().size()); + auto normalizationCertainty = 0.0; + set seen; bool const hasPotentialHouseNumber = 
!m_houseNumberPositionsInQuery.empty(); for (auto const & e : m_beam.GetEntries()) @@ -242,18 +252,21 @@ void Geocoder::Context::FillResults(vector & results) const if (hasPotentialHouseNumber && !IsGoodForPotentialHouseNumberAt(e.m_key, m_houseNumberPositionsInQuery)) continue; - results.emplace_back(e.m_key.m_osmId, e.m_value /* certainty */); - } - - if (!results.empty()) - { - auto const by = results.front().m_certainty; - for (auto & r : results) + if (!normalizationCertainty) { - r.m_certainty /= by; - ASSERT_GREATER_OR_EQUAL(r.m_certainty, 0.0, ()); - ASSERT_LESS_OR_EQUAL(r.m_certainty, 1.0, ()); + normalizationCertainty = e.m_value; + // Normalize an other-similar candidate's certainty to 0.95 when it is the best result. + if (e.m_key.m_isOtherSimilar) + normalizationCertainty /= 0.95; } + + ASSERT_GREATER_OR_EQUAL(normalizationCertainty, e.m_value, ()); + + auto resultCertainty = e.m_value / normalizationCertainty; + ASSERT_GREATER_OR_EQUAL(resultCertainty, 0.0, ()); + ASSERT_LESS_OR_EQUAL(resultCertainty, 1.0, ()); + + results.emplace_back(e.m_key.m_osmId, resultCertainty); } ASSERT(is_sorted(results.rbegin(), results.rend(), base::LessBy(&Result::m_certainty)), ()); @@ -427,6 +440,11 @@ void Geocoder::Go(Context & ctx, Type type) const // Buildings are indexed separately. if (type == Type::Building) { + // The house number parser has its own tokenizer. + // Pass the longest house number token sequence to the house number matcher. + if (IsValidHouseNumberWithNextUnusedToken(ctx, subquery, subqueryTokenIds)) + continue; + FillBuildingsLayer(ctx, subquery, subqueryTokenIds, curLayer); } else @@ -476,6 +494,9 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector // let's stay on the safer side and mark the tokens as potential house number. 
ctx.MarkHouseNumberPositionsInQuery(subqueryTokenIds); + auto subqueryNumberParse = std::vector{}; + ParseQuery(subqueryHN, false /* queryIsPrefix */, subqueryNumberParse); + auto candidates = std::vector{}; auto const & lastLayer = ctx.GetLayers().back(); @@ -490,8 +511,8 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector Type::Building, m_hierarchy.GetNormalizedNameDictionary()); auto const & realHN = multipleHN.GetMainName(); auto const & realHNUniStr = strings::MakeUniString(realHN); - if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryHN, - false /* queryIsPrefix */)) + auto matchResult = search::house_numbers::MatchResult{}; + if (search::house_numbers::HouseNumbersMatch(realHNUniStr, subqueryNumberParse, matchResult)) { auto && parentCandidateCertainty = forSublocalityLayer ? FindMaxCertaintyInParentCandidates(ctx.GetLayers(), building) @@ -499,11 +520,11 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector if (!parentCandidateCertainty) return; - static auto const buildingTokenWeight = GetWeight(Kind::Building); auto totalCertainty = - *parentCandidateCertainty + buildingTokenWeight * subqueryTokenIds.size(); - - candidates.push_back({buildingDocId, totalCertainty}); + *parentCandidateCertainty + SumHouseNumberSubqueryCertainty(matchResult); + auto const isOtherSimilar = + matchResult.queryMismatchedTokensCount || matchResult.houseNumberMismatchedTokensCount; + candidates.push_back({buildingDocId, totalCertainty, isOtherSimilar}); } }); } @@ -535,7 +556,7 @@ void Geocoder::FillRegularLayer(Context const & ctx, Type type, Tokens const & s (d.m_kind != Kind::Unknown ? 
GetWeight(d.m_kind) : GetWeight(d.m_type)) * subquery.size(); auto totalCertainty = *parentCandidateCertainty + subqueryWeight; - candidates.push_back({docId, totalCertainty}); + candidates.push_back({docId, totalCertainty, false /* m_isOtherSimilar */}); }); if (!candidates.empty()) @@ -571,10 +592,56 @@ void Geocoder::AddResults(Context & ctx, std::vector const & candidat entryCertainty += kCityStateExtraWeight; } - ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes); + ctx.AddResult(entry.m_osmId, entryCertainty, entry.m_type, tokenIds, allTypes, + candidate.m_isOtherSimilar); } } +bool Geocoder::IsValidHouseNumberWithNextUnusedToken( + Context const & ctx, Tokens const & subquery, vector const & subqueryTokenIds) const +{ + auto const nextTokenId = subqueryTokenIds.back() + 1; + if (nextTokenId >= ctx.GetNumTokens() || ctx.IsTokenUsed(nextTokenId)) + return false; + + auto subqueryHouseNumber = MakeHouseNumber(subquery); + AppendToHouseNumber(subqueryHouseNumber, ctx.GetToken(nextTokenId)); + + return search::house_numbers::LooksLikeHouseNumber(subqueryHouseNumber, false /* isPrefix */); +} + +double Geocoder::SumHouseNumberSubqueryCertainty( + search::house_numbers::MatchResult const & matchResult) const +{ + static auto const buildingTokenWeight = GetWeight(Kind::Building); + auto const matchedTokensCount = matchResult.matchedTokensCount; + auto certainty = matchedTokensCount * buildingTokenWeight; + + // The candidate doesn't have all query tokens. + if (matchResult.queryMismatchedTokensCount) + { + auto const missingTokensCount = matchResult.queryMismatchedTokensCount; + // Missing tokens in the candidate are penalized more than extra tokens + // in other candidates. 
+ auto missingTokenRelativeWeight = 4.0; // Weight of a missing token relative to a matched one. + auto const penaltyRatio = + missingTokenRelativeWeight * missingTokensCount / + (missingTokenRelativeWeight * missingTokensCount + matchedTokensCount); + certainty -= penaltyRatio * buildingTokenWeight; + } + + // The candidate has extra tokens. + if (matchResult.houseNumberMismatchedTokensCount) + { + auto const extraTokensCount = matchResult.houseNumberMismatchedTokensCount; + auto const penaltyRatio = + double(extraTokensCount) / (matchedTokensCount + extraTokensCount); + certainty -= penaltyRatio * buildingTokenWeight; + } + + return certainty; +} + bool Geocoder::InCityState(Hierarchy::Entry const & entry) const { if (!entry.HasFieldInAddress(Type::Locality)) diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp index bda87f0..83e54e7 100644 --- a/geocoder/geocoder.hpp +++ b/geocoder/geocoder.hpp @@ -1,6 +1,7 @@ #pragma once #include "geocoder/hierarchy.hpp" +#include "geocoder/house_numbers_matcher.hpp" #include "geocoder/index.hpp" #include "geocoder/result.hpp" #include "geocoder/types.hpp" @@ -51,6 +52,7 @@ public: { Index::DocId m_entry; double m_totalCertainty; + bool m_isOtherSimilar; }; // A Layer contains all entries matched by a subquery of consecutive tokens. 
@@ -80,11 +82,12 @@ public: struct BeamKey { BeamKey(base::GeoObjectId osmId, Type type, std::vector const & tokenIds, - std::vector const & allTypes) + std::vector const & allTypes, bool isOtherSimilar) : m_osmId(osmId) , m_type(type) , m_tokenIds{tokenIds} , m_allTypes(allTypes) + , m_isOtherSimilar(isOtherSimilar) { base::SortUnique(m_allTypes); } @@ -93,6 +96,7 @@ public: Type m_type; std::vector m_tokenIds; std::vector m_allTypes; + bool m_isOtherSimilar; }; Context(std::string const & query); @@ -116,7 +120,8 @@ public: bool AllTokensUsed() const; void AddResult(base::GeoObjectId const & osmId, double certainty, Type type, - std::vector const & tokenIds, std::vector const & allTypes); + std::vector const & tokenIds, std::vector const & allTypes, + bool isOtherSimilar); void FillResults(std::vector & results) const; @@ -180,6 +185,11 @@ private: Layer & curLayer) const; void AddResults(Context & ctx, std::vector const & candidates) const; + bool IsValidHouseNumberWithNextUnusedToken(Context const & ctx, Tokens const & subquery, + std::vector const & subqueryTokenIds) const; + double SumHouseNumberSubqueryCertainty( + search::house_numbers::MatchResult const & matchResult) const; + bool InCityState(Hierarchy::Entry const & entry) const; // Find max certainty in parent candidates. 
diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 65110fb..ace6224 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -22,7 +22,7 @@ namespace { using Id = base::GeoObjectId; -double const kCertaintyEps = 1e-4; +double const kCertaintyEps = 1e-3; string const kRegionsData = R"#( C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"kind": "country", "locales": {"default": {"name": "Cuba", "address": {"country": "Cuba"}}}, "rank": 2}} C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"kind": "province", "locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}} @@ -170,6 +170,29 @@ UNIT_TEST(Geocoder_MismatchedLocality) TestGeocoder(geocoder, "Moscow Krymskaya 3", {}); } +//-------------------------------------------------------------------------------------------------- +UNIT_TEST(Geocoder_HouseNumberPartialMatch) +{ + string const kData = R"#( +10 {"properties": {"kind": "city", "locales": {"default": {"address": {"locality": "Москва"}}}}} +11 {"properties": {"kind": "street", "locales": {"default": {"address": {"street": "Зорге", "locality": "Москва"}}}}} +12 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7", "street": "Зорге", "locality": "Москва"}}}}} +13 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2", "street": "Зорге", "locality": "Москва"}}}}} +14 {"properties": {"kind": "building", "locales": {"default": {"address": {"building": "7 к2 с3", "street": "Зорге", "locality": "Москва"}}}}} +)#"; + + Geocoder geocoder; + ScopedFile const regionsJsonFile("regions.jsonl", kData); + geocoder.LoadFromJsonl(regionsJsonFile.GetFullPath()); + + 
TestGeocoder(geocoder, "Москва, Зорге 7к2", {{Id{0x13}, 1.0}, {Id{0x14}, 0.995}, {Id{0x12}, 0.975}}); + TestGeocoder(geocoder, "Москва, Зорге 7 к2", {{Id{0x13}, 1.0}, {Id{0x14}, 0.995}, {Id{0x12}, 0.975}}); + TestGeocoder(geocoder, "Москва, Зорге 7", {{Id{0x12}, 1.0}, {Id{0x13}, 0.993}, {Id{0x14}, 0.990}}); + TestGeocoder(geocoder, "Москва, Зорге 7к1", {{Id{0x12}, 0.95}}); + TestGeocoder(geocoder, "Москва, Зорге 7A", {{Id{0x12}, 0.95}}); + TestGeocoder(geocoder, "Москва, Зорге 7 A", {{Id{0x12}, 0.95}}); +} + // Geocoder_Moscow* ----------------------------------------------------------------------------- UNIT_TEST(Geocoder_MoscowLocalityRank) { diff --git a/geocoder/house_numbers_matcher.cpp b/geocoder/house_numbers_matcher.cpp index b1f926c..d5c835f 100644 --- a/geocoder/house_numbers_matcher.cpp +++ b/geocoder/house_numbers_matcher.cpp @@ -535,15 +535,26 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin } bool HouseNumbersMatch(strings::UniString const & houseNumber, vector const & queryParse) +{ + auto && matchResult = MatchResult{}; + return HouseNumbersMatch(houseNumber, queryParse, matchResult); +} + +bool HouseNumbersMatch(strings::UniString const & houseNumber, vector const & queryParse, + MatchResult & matchResult) { if (houseNumber.empty() || queryParse.empty()) + { + matchResult = {}; return false; + } // Fast pre-check, helps to early exit without complex house number // parsing. 
if (IsASCIIDigit(houseNumber[0]) && IsASCIIDigit(queryParse[0].m_value[0]) && houseNumber[0] != queryParse[0].m_value[0]) { + matchResult = {}; return false; } @@ -554,13 +565,25 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, vector con { if (parse.empty()) continue; - if (parse[0] == queryParse[0] && - (IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()) || - IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end()))) + if (parse[0] == queryParse[0]) { - return true; + if (IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end())) + { + matchResult = {queryParse.size(), parse.size() - queryParse.size(), + 0 /* queryMismatchedTokensCount */}; + return true; + } + + if (IsSubsequence(queryParse.begin() + 1, queryParse.end(), parse.begin() + 1, parse.end())) + { + matchResult = {parse.size(), 0 /* houseNumberMismatchedTokensCount */, + queryParse.size() - parse.size()}; + return true; + } } } + + matchResult = {}; return false; } diff --git a/geocoder/house_numbers_matcher.hpp b/geocoder/house_numbers_matcher.hpp index 6214c8b..597c086 100644 --- a/geocoder/house_numbers_matcher.hpp +++ b/geocoder/house_numbers_matcher.hpp @@ -52,6 +52,13 @@ struct Token bool m_prefix = false; }; +struct MatchResult +{ + size_t matchedTokensCount; + size_t houseNumberMismatchedTokensCount; + size_t queryMismatchedTokensCount; +}; + // Tokenizes |s| that may be a house number. void Tokenize(strings::UniString s, bool isPrefix, std::vector & ts); @@ -70,6 +77,11 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin bool HouseNumbersMatch(strings::UniString const & houseNumber, std::vector const & queryParse); +// Returns true if the house number matches a given parsed query. +// If true is returned then |matchResult| has matching info; otherwise it is reset to zeros. 
+bool HouseNumbersMatch(strings::UniString const & houseNumber, std::vector const & queryParse, + MatchResult & matchResult); + // Returns true if |s| looks like a house number. bool LooksLikeHouseNumber(strings::UniString const & s, bool isPrefix); bool LooksLikeHouseNumber(std::string const & s, bool isPrefix);