From 88d5775c230e5ac2caa06ad2266b893b300a2211 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Mon, 11 Jul 2016 22:42:43 +0300 Subject: [PATCH] Refactor. Add Matching by name. --- generator/booking_dataset.cpp | 56 +---------- generator/booking_dataset.hpp | 5 - .../booking_quality_check.cpp | 9 +- generator/booking_scoring.cpp | 97 +++++++++++++++++++ generator/booking_scoring.hpp | 21 ++++ generator/osm_element.cpp | 14 +++ generator/osm_element.hpp | 3 +- .../generator.xcodeproj/project.pbxproj | 9 ++ 8 files changed, 151 insertions(+), 63 deletions(-) create mode 100644 generator/booking_scoring.cpp create mode 100644 generator/booking_scoring.hpp diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 8239f68344..fec4af1689 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,11 +1,11 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" + #include "platform/local_country_file_utils.hpp" #include "platform/platform.hpp" #include "indexer/ftypes_matcher.hpp" -#include "indexer/search_delimiters.hpp" -#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" @@ -171,45 +171,6 @@ vector BookingDataset::GetNearestHotels(double lat, double lon, size_t l return indexes; } -bool BookingDataset::MatchByName(string const & osmName, - vector const & bookingIndexes) const -{ - return false; - - // Match name. - // vector osmTokens; - // NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); - // - // cout << "\n------------- " << name << endl; - // - // bool matched = false; - // for (auto const & index : indexes) - // { - // vector bookingTokens; - // NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); - // - // map>> weightPair; - // - // for (size_t j = 0; j < osmTokens.size(); ++j) - // { - // for (size_t i = 0; i < bookingTokens.size(); ++i) - // { - // size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), - // bookingTokens[i].begin(), - // bookingTokens[i].end()); - // if (distance < 3) - // weightPair[distance].emplace_back(i, j); - // } - // } - // - // if (!weightPair.empty()) - // { - // cout << m_hotels[e.second] << endl; - // matched = true; - // } - // } -} - void BookingDataset::BuildFeatures(function const & fn) const { for (auto const & hotel : m_hotels) @@ -302,13 +263,6 @@ void BookingDataset::BuildFeatures(function const & fn) cons } } -// static -double BookingDataset::ScoreByLinearNormDistance(double distance) -{ - distance = my::clamp(distance, 0, kDistanceLimitInMeters); - return 1.0 - distance / kDistanceLimitInMeters; -} - void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath) { m_hotels.clear(); @@ -374,11 +328,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const for (size_t const j : bookingIndexes) { - auto const & hotel = GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = ScoreByLinearNormDistance(distanceMeters); - matched = score > kOptimalThreshold; - if (matched) + if (booking_scoring::Match(GetHotel(j), e).IsMatched()) break; } diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index f56bba4d17..c37859cca6 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -22,9 +22,6 @@ public: double static constexpr kDistanceLimitInMeters = 150; size_t static constexpr kMaxSelectedElements = 3; - // Calculated with tools/python/booking_hotels_quality.py - double static constexpr kOptimalThreshold = 0.709283; - struct Hotel { enum class Fields @@ -92,8 +89,6 @@ public: void BuildFeatures(function const & fn) const; - static double ScoreByLinearNormDistance(double distance); - protected: vector m_hotels; diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 6602687ec4..0331a7f9d3 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,4 +1,5 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" #include "generator/osm_source.hpp" #include "geometry/distance_on_sphere.hpp" @@ -73,15 +74,15 @@ int main(int argc, char * argv[]) for (size_t const j : bookingIndexes) { auto const & hotel = bookingDataset.GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double const score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); + auto const score = booking_scoring::Match(hotel, e); - bool matched = score > BookingDataset::kOptimalThreshold; + double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); + bool matched = score.IsMatched(); outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score << endl; + << " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp new file mode 100644 index 0000000000..ff2aaec177 --- /dev/null +++ b/generator/booking_scoring.cpp @@ -0,0 +1,97 @@ +#include "generator/booking_scoring.hpp" + +#include "generator/booking_dataset.hpp" + +#include "indexer/search_string_utils.hpp" +#include "indexer/search_delimiters.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "base/collection_cast.hpp" + +namespace generator +{ +namespace booking_scoring +{ +namespace +{ +// Calculated with tools/python/booking_hotels_quality.py. +double constexpr kOptimalThreshold = 0.151001; + +template +struct decay_equiv : + std::is_same::type, U>::type +{}; + +set StringToSetOfWords(string const & str) +{ + vector result; + search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); + return my::collection_cast(result); +} + +// TODO(mgsergio): Update existing one in base or wherever... +// Or just use one from boost. +struct CounterIterator +{ + template::value>::type> + CounterIterator & operator=(T const &) { ++m_count; return *this; } + CounterIterator & operator++() { return *this; } + CounterIterator & operator++(int) { return *this; } + CounterIterator & operator*() { return *this; } + uint32_t Count() const { return m_count; } + + uint32_t m_count = 0; +}; + +double StringSimilarityScore(string const & a, string const & b) +{ + auto const aWords = StringToSetOfWords(a); + auto const bWords = StringToSetOfWords(b); + + auto const intersectionCard = set_intersection(begin(aWords), end(aWords), + begin(bWords), end(bWords), + CounterIterator()).Count(); + auto const aLikeBScore = static_cast(intersectionCard) / aWords.size(); + auto const bLikeAScore = static_cast(intersectionCard) / bWords.size(); + + return aLikeBScore * bLikeAScore; +} + +double GetLinearNormDistanceScrore(double distance) +{ + distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); + return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; +} + +double GetNameSimilarityScore(string const & booking_name, string const & osm_name) +{ + return StringSimilarityScore(booking_name, osm_name); +} +} // namespace + +double BookingMatchScore::GetMatchingScore() const +{ + return m_linearNormDistanceScore * m_nameSimilarityScore; +} + +bool BookingMatchScore::IsMatched() const +{ + return GetMatchingScore() > kOptimalThreshold; +} + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) +{ + BookingMatchScore score; + + auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); + score.m_linearNormDistanceScore = GetLinearNormDistanceScrore(distance); + + string osmHotelName; + score.m_nameSimilarityScore = e.GetTag("name", osmHotelName) + ? GetNameSimilarityScore(h.name, osmHotelName) : 0; + + return score; +} +} // namespace booking_scoring +} // namespace generator diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp new file mode 100644 index 0000000000..d92482cf35 --- /dev/null +++ b/generator/booking_scoring.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "generator/booking_dataset.hpp" +#include "generator/osm_element.hpp" + +namespace generator +{ +namespace booking_scoring +{ +struct BookingMatchScore +{ + double GetMatchingScore() const; + bool IsMatched() const; + + double m_linearNormDistanceScore{}; + double m_nameSimilarityScore{}; +}; + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e); +} // namespace booking_scoring +} // namespace generator diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 1c65dd599d..8ae3340132 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,6 +121,20 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } +bool OsmElement::GetTag(string const & key, string & value) const +{ + auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) + { + return tag.key == key; + }); + + if (it == end(m_tags)) + return false; + + value = it->value; + return true; +} + string DebugPrint(OsmElement const & e) { return e.ToString(); diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index fc1187c6a7..c473f7d175 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -152,7 +152,8 @@ struct OsmElement if (!v.empty()) AddTag(k, v); } + + bool GetTag(string const & key, string & value) const; }; string DebugPrint(OsmElement const & e); - diff --git a/xcode/generator/generator.xcodeproj/project.pbxproj b/xcode/generator/generator.xcodeproj/project.pbxproj index d95c9fb035..4d668aeb25 100644 --- a/xcode/generator/generator.xcodeproj/project.pbxproj +++ b/xcode/generator/generator.xcodeproj/project.pbxproj @@ -63,6 +63,8 @@ 67BC92E31D1A9ED800A4A378 /* test_feature.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92D91D1A9E9800A4A378 /* test_feature.hpp */; }; 67BC92E41D1A9ED800A4A378 /* test_mwm_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */; }; 67BC92E51D1A9ED800A4A378 /* test_mwm_builder.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */; }; + E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E9502E311D34012200CAB86B /* booking_scoring.cpp */; }; + E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */ = {isa = PBXBuildFile; fileRef = E9502E321D34012200CAB86B /* booking_scoring.hpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -127,6 +129,8 @@ 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = test_mwm_builder.cpp; sourceTree = ""; }; 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = test_mwm_builder.hpp; sourceTree = ""; }; 67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = ""; }; + E9502E311D34012200CAB86B /* booking_scoring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_scoring.cpp; sourceTree = ""; }; + E9502E321D34012200CAB86B /* booking_scoring.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_scoring.hpp; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -169,6 +173,8 @@ 6753401D1A3F2A1B00A0A8C3 /* generator */ = { isa = PBXGroup; children = ( + E9502E311D34012200CAB86B /* booking_scoring.cpp */, + E9502E321D34012200CAB86B /* booking_scoring.hpp */, 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */, 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */, 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */, @@ -253,6 +259,7 @@ 6753407F1A3F2A7400A0A8C3 /* osm2type.hpp in Headers */, 670B84BD1A8CDB0000CE4492 /* osm_source.hpp in Headers */, 675340631A3F2A7400A0A8C3 /* coastlines_generator.hpp in Headers */, + E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */, 675340641A3F2A7400A0A8C3 /* intermediate_data.hpp in Headers */, 675340781A3F2A7400A0A8C3 /* intermediate_elements.hpp in Headers */, 6753406B1A3F2A7400A0A8C3 /* feature_emitter_iface.hpp in Headers */, @@ -384,6 +391,7 @@ 677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */, 6753405C1A3F2A7400A0A8C3 /* borders_generator.cpp in Sources */, 675340671A3F2A7400A0A8C3 /* dumper.cpp in Sources */, + E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */, 675340831A3F2A7400A0A8C3 /* statistics.cpp in Sources */, 6753407E1A3F2A7400A0A8C3 /* osm2type.cpp in Sources */, 675340601A3F2A7400A0A8C3 /* check_model.cpp in Sources */, @@ -581,6 +589,7 @@ 67BC92D51D1A9E5F00A4A378 /* Release */, ); defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; }; /* End XCConfigurationList section */ };