diff --git a/base/stl_helpers.hpp b/base/stl_helpers.hpp index f86395da52..c7b3c5e28f 100644 --- a/base/stl_helpers.hpp +++ b/base/stl_helpers.hpp @@ -299,6 +299,23 @@ bool IsSortedAndUnique(Iter beg, Iter end) return IsSortedAndUnique(beg, end, std::less::value_type>()); } +// Same contract as std::includes(): true iff sorted range [first2, last2) is a subsequence of sorted range [first1, last1). +template +bool Includes(Iter1 first1, Iter1 last1, Iter2 first2, Iter2 last2) +{ + assert(std::is_sorted(first1, last1)); + assert(std::is_sorted(first2, last2)); + + for (; first2 != last2; ++first1) + { + if (first1 == last1 || *first2 < *first1) + return false; + if (!(*first1 < *first2)) + ++first2; + } + return true; +} + struct DeleteFunctor { template diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 8d96fdecec..57fce89ac9 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -14,6 +14,7 @@ #include "base/timer.hpp" #include +#include #include #include #include @@ -138,9 +139,9 @@ bool Geocoder::Context::IsTokenUsed(size_t id) const bool Geocoder::Context::AllTokensUsed() const { return m_numUsedTokens == m_tokens.size(); } void Geocoder::Context::AddResult(base::GeoObjectId const & osmId, double certainty, Type type, - vector const & allTypes, bool allTokensUsed) + vector const & tokenIds, vector const & allTypes) { - m_beam.Add(BeamKey(osmId, type, allTypes, allTokensUsed), certainty); + m_beam.Add(BeamKey(osmId, type, tokenIds, allTypes), certainty); } void Geocoder::Context::FillResults(vector & results) const @@ -149,30 +150,13 @@ void Geocoder::Context::FillResults(vector & results) const results.reserve(m_beam.GetEntries().size()); set seen; + bool const hasPotentialHouseNumber = !m_houseNumberPositionsInQuery.empty(); for (auto const & e : m_beam.GetEntries()) { if (!seen.insert(e.m_key.m_osmId).second) continue; - bool isGoodHouseHumber = false; - if (e.m_key.m_type == Type::Building) - { - bool gotLocality = false; - bool gotStreet = false; - bool gotBuilding = false; - for (Type t : e.m_key.m_allTypes) - { - if (t == 
Type::Region || t == Type::Subregion || t == Type::Locality) - gotLocality = true; - if (t == Type::Street) - gotStreet = true; - if (t == Type::Building) - gotBuilding = true; - } - isGoodHouseHumber = gotLocality && gotStreet && gotBuilding; - } - - if (m_surelyGotHouseNumber && !isGoodHouseHumber && !e.m_key.m_allTokensUsed) + if (hasPotentialHouseNumber && !IsGoodForPotentialHouseNumberAt(e.m_key, m_houseNumberPositionsInQuery)) continue; results.emplace_back(e.m_key.m_osmId, e.m_value /* certainty */); @@ -197,6 +181,64 @@ vector & Geocoder::Context::GetLayers() { return m_layers; } vector const & Geocoder::Context::GetLayers() const { return m_layers; } +void Geocoder::Context::MarkHouseNumberPositionsInQuery(vector const & tokenIds) +{ + m_houseNumberPositionsInQuery.insert(tokenIds.begin(), tokenIds.end()); +} + +bool Geocoder::Context::IsGoodForPotentialHouseNumberAt(BeamKey const & beamKey, + set const & tokenIds) const +{ + if (beamKey.m_tokenIds.size() == m_tokens.size()) + return true; + + if (IsBuildingWithAddress(beamKey)) + return true; + + // Accept a result that has a locality or region and whose matched tokens cover every potential house number token. 
+ if (HasLocalityOrRegion(beamKey) && ContainsTokenIds(beamKey, tokenIds)) + return true; + + return false; +} + +bool Geocoder::Context::IsBuildingWithAddress(BeamKey const & beamKey) const +{ + if (beamKey.m_type != Type::Building) + return false; + + bool gotLocality = false; + bool gotStreet = false; + bool gotBuilding = false; + for (Type t : beamKey.m_allTypes) + { + if (t == Type::Region || t == Type::Subregion || t == Type::Locality) + gotLocality = true; + if (t == Type::Street) + gotStreet = true; + if (t == Type::Building) + gotBuilding = true; + } + return gotLocality && gotStreet && gotBuilding; +} + +bool Geocoder::Context::HasLocalityOrRegion(BeamKey const & beamKey) const +{ + for (Type t : beamKey.m_allTypes) + { + if (t == Type::Region || t == Type::Subregion || t == Type::Locality) + return true; + } + + return false; +} + +bool Geocoder::Context::ContainsTokenIds(BeamKey const & beamKey, set const & needTokenIds) const +{ + auto const & keyTokenIds = beamKey.m_tokenIds; + return base::Includes(keyTokenIds.begin(), keyTokenIds.end(), needTokenIds.begin(), needTokenIds.end()); +} + // Geocoder ---------------------------------------------------------------------------------------- Geocoder::Geocoder(string const & pathToJsonHierarchy, unsigned int loadThreadsCount) : Geocoder{HierarchyReader{pathToJsonHierarchy}.Read(loadThreadsCount), loadThreadsCount} @@ -243,15 +285,18 @@ void Geocoder::Go(Context & ctx, Type type) const return; Tokens subquery; + vector subqueryTokenIds; for (size_t i = 0; i < ctx.GetNumTokens(); ++i) { subquery.clear(); + subqueryTokenIds.clear(); for (size_t j = i; j < ctx.GetNumTokens(); ++j) { if (ctx.IsTokenUsed(j)) break; subquery.push_back(ctx.GetToken(j)); + subqueryTokenIds.push_back(j); Layer curLayer; curLayer.m_type = type; @@ -259,7 +304,7 @@ void Geocoder::Go(Context & ctx, Type type) const // Buildings are indexed separately. 
if (type == Type::Building) { - FillBuildingsLayer(ctx, subquery, curLayer); + FillBuildingsLayer(ctx, subquery, subqueryTokenIds, curLayer); } else { @@ -273,6 +318,7 @@ void Geocoder::Go(Context & ctx, Type type) const boost::optional streetSynonymMark; double certainty = 0; + vector tokenIds; vector allTypes; for (size_t tokId = 0; tokId < ctx.GetNumTokens(); ++tokId) { @@ -285,11 +331,14 @@ void Geocoder::Go(Context & ctx, Type type) const certainty += GetWeight(t); if (t != Type::Count) + { + tokenIds.push_back(tokId); allTypes.push_back(t); + } } for (auto const & docId : curLayer.m_entries) - ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, allTypes, ctx.AllTokensUsed()); + ctx.AddResult(m_index.GetDoc(docId).m_osmId, certainty, type, tokenIds, allTypes); ctx.GetLayers().emplace_back(move(curLayer)); SCOPE_GUARD(pop, [&] { ctx.GetLayers().pop_back(); }); @@ -301,7 +350,8 @@ void Geocoder::Go(Context & ctx, Type type) const Go(ctx, NextType(type)); } -void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer & curLayer) const +void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, vector const & subqueryTokenIds, + Layer & curLayer) const { if (ctx.GetLayers().empty()) return; @@ -317,8 +367,8 @@ void Geocoder::FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer // We've already filled a street/location layer and now see something that resembles // a house number. While it still can be something else (a zip code, for example) - // let's stay on the safer side and set the house number bit. - ctx.SetHouseNumberBit(); + // let's stay on the safer side and mark the tokens as potential house number. 
+ ctx.MarkHouseNumberPositionsInQuery(subqueryTokenIds); for (auto const & docId : layer.m_entries) { diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp index 27be2587ca..4e9f09a15d 100644 --- a/geocoder/geocoder.hpp +++ b/geocoder/geocoder.hpp @@ -11,6 +11,7 @@ #include "base/string_utils.hpp" #include +#include #include #include #include @@ -53,19 +54,20 @@ public: public: struct BeamKey { - BeamKey(base::GeoObjectId osmId, Type type, std::vector const & allTypes, bool allTokensUsed) + BeamKey(base::GeoObjectId osmId, Type type, std::vector const & tokenIds, + std::vector const & allTypes) : m_osmId(osmId) , m_type(type) + , m_tokenIds{tokenIds} , m_allTypes(allTypes) - , m_allTokensUsed(allTokensUsed) { base::SortUnique(m_allTypes); } base::GeoObjectId m_osmId; Type m_type; + std::vector m_tokenIds; std::vector m_allTypes; - bool m_allTokensUsed; }; Context(std::string const & query); @@ -89,7 +91,7 @@ public: bool AllTokensUsed() const; void AddResult(base::GeoObjectId const & osmId, double certainty, Type type, - std::vector const & allTypes, bool allTokensUsed); + std::vector const & tokenIds, std::vector const & allTypes); void FillResults(std::vector & results) const; @@ -97,20 +99,25 @@ public: std::vector const & GetLayers() const; - void SetHouseNumberBit() { m_surelyGotHouseNumber = true; } + void MarkHouseNumberPositionsInQuery(std::vector const & tokenIds); private: + bool IsGoodForPotentialHouseNumberAt(BeamKey const & beamKey, std::set const & tokenIds) const; + bool IsBuildingWithAddress(BeamKey const & beamKey) const; + bool HasLocalityOrRegion(BeamKey const & beamKey) const; + bool ContainsTokenIds(BeamKey const & beamKey, std::set const & needTokenIds) const; + Tokens m_tokens; std::vector m_tokenTypes; size_t m_numUsedTokens = 0; - // Sticky bit that records a heuristic check whether - // the current query contains a house number. 
+ // |m_houseNumberPositionsInQuery| holds the indexes of the query tokens that occupy + // positions where, judging by the context, a house number may stand. // The rationale is that we must only emit buildings in this case // and implement a fallback to a more powerful geocoder if we // could not find a building. - bool m_surelyGotHouseNumber = false; + std::set m_houseNumberPositionsInQuery; // The highest value of certainty for a fixed amount of // the most relevant retrieved osm ids. @@ -133,8 +140,8 @@ private: void Go(Context & ctx, Type type) const; - void FillBuildingsLayer(Context & ctx, Tokens const & subquery, Layer & curLayer) const; - + void FillBuildingsLayer(Context & ctx, Tokens const & subquery, std::vector const & subqueryTokenIds, + Layer & curLayer) const; void FillRegularLayer(Context const & ctx, Type type, Tokens const & subquery, Layer & curLayer) const; diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 1f61253e59..af89f5a79b 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -20,8 +20,9 @@ using namespace std; namespace { -double const kCertaintyEps = 1e-6; +using Id = base::GeoObjectId; +double const kCertaintyEps = 1e-6; string const kRegionsData = R"#( -4611686018427080071 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"name": "Cuba", "rank": 2, "address": {"country": "Cuba"}}} -4611686018425533273 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"name": "Ciego de Ávila", "rank": 4, "address": {"region": "Ciego de Ávila", "country": "Cuba"}}} @@ -153,42 +154,108 @@ UNIT_TEST(Geocoder_MismatchedLocality) TestGeocoder(geocoder, "Moscow Krymskaya 3", {}); } -UNIT_TEST(Geocoder_StreetWithNumber) +// Geocoder_StreetWithNumber* ---------------------------------------------------------------------- 
+UNIT_TEST(Geocoder_StreetWithNumberInCity) { string const kData = R"#( 10 {"properties": {"address": {"locality": "Москва"}}} -20 {"properties": {"address": {"locality": "Краснокамск"}}} - 11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} -12 {"properties": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}} - -13 {"properties": {"address": {"locality": "Москва", "street": "8 Марта"}}} - -21 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 8 Марта"}}} -25 {"properties": {"address": {"locality": "Краснокамск", "street": "Январская улица"}}} -26 {"properties": {"address": {"locality": "Краснокамск", "street": "Январская улица", "building": "8"}}} +20 {"properties": {"address": {"locality": "Краснокамск"}}} +28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - using Id = base::GeoObjectId; TestGeocoder(geocoder, "Москва, улица 1905 года", {{Id{11}, 1.0}}); - TestGeocoder(geocoder, "Москва, 1905 года", {{Id{11}, 1.0}}); - TestGeocoder(geocoder, "Краснокамск, улица 1905 года", {}); - - TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{12}, 1.0}}); - TestGeocoder(geocoder, "Москва, 4-я 8 Марта", {{Id{12}, 1.0}}); - - TestGeocoder(geocoder, "Москва, 8 Марта", {{Id{13}, 1.0}}); - TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{13}, 1.0}}); - - TestGeocoder(geocoder, "Краснокамск, улица 8 Марта", {{Id{21}, 1.0}}); - TestGeocoder(geocoder, "Краснокамск, 8 Марта", {{Id{21}, 1.0}}); - TestGeocoder(geocoder, "Краснокамск, Январская 8", {{Id{26}, 1.0}}); } +UNIT_TEST(Geocoder_StreetWithNumberInClassifiedCity) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + 
Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "город Москва, улица 1905 года", {{Id{11}, 1.0}}); +} + +UNIT_TEST(Geocoder_StreetWithNumberInAnyCity) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} + +20 {"properties": {"address": {"locality": "Краснокамск"}}} +28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "улица 1905 года", {{Id{11}, 1.0}, {Id{28}, 1.0}}); +} + +UNIT_TEST(Geocoder_StreetWithNumberAndWithoutStreetSynonym) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "Москва, 1905 года", {{Id{11}, 1.0}}); +} + +UNIT_TEST(Geocoder_UntypedStreetWithNumberAndStreetSynonym) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +13 {"properties": {"address": {"locality": "Москва", "street": "8 Марта"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{13}, 1.0}}); +} + +UNIT_TEST(Geocoder_StreetWithTwoNumbers) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +12 {"properties": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}} + +13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + 
TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{12}, 1.0}}); +} + +UNIT_TEST(Geocoder_BuildingOnStreetWithNumber) +{ + string const kData = R"#( +10 {"properties": {"address": {"locality": "Москва"}}} +13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}} +15 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта", "building": "4"}}} +)#"; + + ScopedFile const regionsJsonFile("regions.jsonl", kData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + TestGeocoder(geocoder, "Москва, улица 8 Марта, 4", {{Id{15}, 1.0}}); +} + +//-------------------------------------------------------------------------------------------------- UNIT_TEST(Geocoder_LocalityBuilding) { string const kData = R"#(