From 0888234b6cc7ac14c1b95870250b956781f18f2e Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Tue, 28 Jun 2016 11:38:50 +0300 Subject: [PATCH 1/6] shuffle is used instead of random_shuffle. --- generator/booking_dataset.cpp | 1 + generator/booking_quality_check/booking_quality_check.cpp | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index e8c103ee07..8239f68344 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -248,6 +248,7 @@ void BookingDataset::BuildFeatures(function const & fn) cons if (!hotel.houseNumber.empty()) e.AddTag("addr:housenumber", hotel.houseNumber); + // TODO(mgsergio): Add a comment or use enum. switch (hotel.type) { case 19: diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 1696a69d6c..a03a270c4b 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -6,6 +6,7 @@ #include "std/fstream.hpp" #include "std/iostream.hpp" #include "std/numeric.hpp" +#include "std/random.hpp" #include "3party/gflags/src/gflags/gflags.h" @@ -15,6 +16,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file"); DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); DEFINE_string(sample_data, "", "Sample output path"); DEFINE_uint64(selection_size, 1000, "Selection size"); +DEFINE_uint64(random_seed, minstd_rand::default_seed, "Seed for random shuffle"); using namespace generator; @@ -57,9 +59,7 @@ int main(int argc, char * argv[]) vector elementIndexes(elements.size()); iota(elementIndexes.begin(), elementIndexes.end(), 0); - // In first implementation, we used random_shufle for reference dataset. - // Next time we are going to replace random_shuffle by shuffle with defined seed. 
- random_shuffle(elementIndexes.begin(), elementIndexes.end()); + shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_random_seed)); if (FLAGS_selection_size < elementIndexes.size()) elementIndexes.resize(FLAGS_selection_size); From 2eed098a18e1eff26a2737d7cdcff70cdce036c7 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Tue, 5 Jul 2016 12:21:59 +0300 Subject: [PATCH 2/6] Fix error in booking_hotels.py. --- tools/python/booking_hotels.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index f684505036..dda40cc7cf 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -126,7 +126,8 @@ def translate(source, output): ''' Reads *.pkl files and produces a single list of hotels as tab separated values. ''' - files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')] + files = [os.path.join(source, filename) + for filename in os.listdir(source) if filename.endswith('.pkl')] data = [] for filename in sorted(files): @@ -207,6 +208,7 @@ def process_options(): if not options.download and not options.translate: parser.print_help() + # TODO(mgsergio): implement it with argparse facilities. if options.translate and not options.output: print("--output isn't set") parser.print_help() From cd5b765877776bb3c36b9ca64f807a8e1b69b041 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Sat, 9 Jul 2016 21:53:34 +0300 Subject: [PATCH 3/6] Small changes. Xcode build fixed. 
--- .../booking_quality_check.cpp | 2 +- tools/python/booking_hotels.py | 2 +- tools/python/booking_hotels_quality.py | 5 +++-- .../generator_tool.xcodeproj/project.pbxproj | 22 +++++++++++++++++++ 4 files changed, 27 insertions(+), 4 deletions(-) diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index a03a270c4b..6602687ec4 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -74,7 +74,7 @@ int main(int argc, char * argv[]) { auto const & hotel = bookingDataset.GetHotel(j); double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); + double const score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); bool matched = score > BookingDataset::kOptimalThreshold; diff --git a/tools/python/booking_hotels.py b/tools/python/booking_hotels.py index dda40cc7cf..8912fdebe9 100755 --- a/tools/python/booking_hotels.py +++ b/tools/python/booking_hotels.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # coding: utf8 from __future__ import print_function diff --git a/tools/python/booking_hotels_quality.py b/tools/python/booking_hotels_quality.py index 9c81dad52a..914e4c797f 100755 --- a/tools/python/booking_hotels_quality.py +++ b/tools/python/booking_hotels_quality.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/env python # coding: utf8 from __future__ import print_function @@ -21,7 +21,7 @@ logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %( def load_binary_list(path): """ - Loads binary classifier output. + Loads referance binary classifier output. """ bits = [] with open(path, 'r') as fd: @@ -46,6 +46,7 @@ def load_score_list(path): def process_options(): + # TODO(mgsergio): Fix description. 
parser = argparse.ArgumentParser(description="Download and process booking hotels.") parser.add_argument("-v", "--verbose", action="store_true", dest="verbose") parser.add_argument("-q", "--quiet", action="store_false", dest="verbose") diff --git a/xcode/generator_tool/generator_tool.xcodeproj/project.pbxproj b/xcode/generator_tool/generator_tool.xcodeproj/project.pbxproj index 4b8eb4c9be..2b9e27fa15 100644 --- a/xcode/generator_tool/generator_tool.xcodeproj/project.pbxproj +++ b/xcode/generator_tool/generator_tool.xcodeproj/project.pbxproj @@ -140,6 +140,11 @@ 67BC92C51D17FD5800A4A378 /* libstb_image.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624BF1D11775300AE4E3C /* libstb_image.a */; }; 67BC92C61D17FDE600A4A378 /* OpenGL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624CE1D11779700AE4E3C /* OpenGL.framework */; }; 67BC92C71D17FDF800A4A378 /* libapi.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624B61D11775300AE4E3C /* libapi.a */; }; + E9502E241D2BD26C00CAB86B /* libalohalitics.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E231D2BD26C00CAB86B /* libalohalitics.a */; }; + E9502E261D2BD28C00CAB86B /* liboauthcpp.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E251D2BD28C00CAB86B /* liboauthcpp.a */; }; + E9502E281D2BD2CC00CAB86B /* libopening_hours.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E271D2BD2CC00CAB86B /* libopening_hours.a */; }; + E9502E2D1D2BD47B00CAB86B /* libsearch.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E291D2BD34A00CAB86B /* libsearch.a */; }; + E9502E301D2BD6E600CAB86B /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 677E2A091CAAC771001DC42A /* libz.tbd */; }; /* End PBXBuildFile section */ /* Begin PBXCopyFilesBuildPhase section */ @@ -307,6 +312,12 @@ 67AB92C61B73D03500AB5194 /* libmap.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libmap.a; path = "../../../omim-xcode-build/Debug/libmap.a"; sourceTree = ""; }; 
67AB92C81B73D10200AB5194 /* libosrm.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libosrm.a; path = "../../../omim-xcode-build/Debug/libosrm.a"; sourceTree = ""; }; 67AB92CA1B73D10B00AB5194 /* libsuccinct.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libsuccinct.a; path = "../../../omim-xcode-build/Debug/libsuccinct.a"; sourceTree = ""; }; + E9502E231D2BD26C00CAB86B /* libalohalitics.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libalohalitics.a; path = "/Users/mgsergio/omim/xcode/alohalitics/../../../omim-xcode-build/Debug/libalohalitics.a"; sourceTree = ""; }; + E9502E251D2BD28C00CAB86B /* liboauthcpp.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = liboauthcpp.a; path = "/Users/mgsergio/omim/xcode/oauthcpp/../../../omim-xcode-build/Debug/liboauthcpp.a"; sourceTree = ""; }; + E9502E271D2BD2CC00CAB86B /* libopening_hours.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libopening_hours.a; path = "/Users/mgsergio/omim/xcode/opening_hours/../../../omim-xcode-build/Debug/libopening_hours.a"; sourceTree = ""; }; + E9502E291D2BD34A00CAB86B /* libsearch.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libsearch.a; path = "/Users/mgsergio/omim/xcode/search/../../../omim-xcode-build/Debug/libsearch.a"; sourceTree = ""; }; + E9502E2B1D2BD44E00CAB86B /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libprotobuf.a; path = "/Users/mgsergio/omim/xcode/protobuf/../../../omim-xcode-build/Debug/libprotobuf.a"; sourceTree = ""; }; + E9502E2E1D2BD5BE00CAB86B /* libminizip.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libminizip.a; path = "/Users/mgsergio/omim/xcode/minizip/../../../omim-xcode-build/Debug/libminizip.a"; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -342,6 +353,11 @@ isa = PBXFrameworksBuildPhase; buildActionMask 
= 2147483647; files = ( + E9502E301D2BD6E600CAB86B /* libz.tbd in Frameworks */, + E9502E2D1D2BD47B00CAB86B /* libsearch.a in Frameworks */, + E9502E281D2BD2CC00CAB86B /* libopening_hours.a in Frameworks */, + E9502E261D2BD28C00CAB86B /* liboauthcpp.a in Frameworks */, + E9502E241D2BD26C00CAB86B /* libalohalitics.a in Frameworks */, 673746771CF47E83005E6D1F /* Cocoa.framework in Frameworks */, 673746701CF47E14005E6D1F /* libeditor.a in Frameworks */, 6737466B1CF47D82005E6D1F /* libplatform.a in Frameworks */, @@ -555,6 +571,12 @@ 6753414F1A3F54D800A0A8C3 = { isa = PBXGroup; children = ( + E9502E2E1D2BD5BE00CAB86B /* libminizip.a */, + E9502E2B1D2BD44E00CAB86B /* libprotobuf.a */, + E9502E291D2BD34A00CAB86B /* libsearch.a */, + E9502E271D2BD2CC00CAB86B /* libopening_hours.a */, + E9502E251D2BD28C00CAB86B /* liboauthcpp.a */, + E9502E231D2BD26C00CAB86B /* libalohalitics.a */, 670D05AD1B0E08260013A7AC /* defaults.xcconfig */, 6737465A1CF46324005E6D1F /* booking_quality_check */, 670B84C41A9F73AB00CE4492 /* std */, From 6f423fbac4dc92d0bbca7f4d2a7c34c600a59633 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Mon, 11 Jul 2016 13:27:55 +0300 Subject: [PATCH 4/6] build.gradle update. 
--- android/3rd_party/BottomSheet/build.gradle | 2 +- android/UnitTests/build.gradle | 2 +- android/build.gradle | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/android/3rd_party/BottomSheet/build.gradle b/android/3rd_party/BottomSheet/build.gradle index f93a7a5d96..70f45ba9ad 100644 --- a/android/3rd_party/BottomSheet/build.gradle +++ b/android/3rd_party/BottomSheet/build.gradle @@ -4,7 +4,7 @@ buildscript { } dependencies { - classpath 'com.android.tools.build:gradle:2.1.0' + classpath 'com.android.tools.build:gradle:2.1.2' } } diff --git a/android/UnitTests/build.gradle b/android/UnitTests/build.gradle index e30a3877d5..cbd1b88ad0 100644 --- a/android/UnitTests/build.gradle +++ b/android/UnitTests/build.gradle @@ -5,7 +5,7 @@ buildscript { jcenter() } dependencies { - classpath 'com.android.tools.build:gradle:2.1.0' + classpath 'com.android.tools.build:gradle:2.1.2' // NOTE: Do not place your application dependencies here; they belong // in the individual module build.gradle files diff --git a/android/build.gradle b/android/build.gradle index e5ca7b798e..6e87bac5fe 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -7,7 +7,7 @@ buildscript { } dependencies { - classpath 'com.android.tools.build:gradle:2.1.0' + classpath 'com.android.tools.build:gradle:2.1.2' classpath 'io.fabric.tools:gradle:1.+' } } From 88d5775c230e5ac2caa06ad2266b893b300a2211 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Mon, 11 Jul 2016 22:42:43 +0300 Subject: [PATCH 5/6] Refactor. Add Matching by name. 
--- generator/booking_dataset.cpp | 56 +---------- generator/booking_dataset.hpp | 5 - .../booking_quality_check.cpp | 9 +- generator/booking_scoring.cpp | 97 +++++++++++++++++++ generator/booking_scoring.hpp | 21 ++++ generator/osm_element.cpp | 14 +++ generator/osm_element.hpp | 3 +- .../generator.xcodeproj/project.pbxproj | 9 ++ 8 files changed, 151 insertions(+), 63 deletions(-) create mode 100644 generator/booking_scoring.cpp create mode 100644 generator/booking_scoring.hpp diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 8239f68344..fec4af1689 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,11 +1,11 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" + #include "platform/local_country_file_utils.hpp" #include "platform/platform.hpp" #include "indexer/ftypes_matcher.hpp" -#include "indexer/search_delimiters.hpp" -#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" @@ -171,45 +171,6 @@ vector BookingDataset::GetNearestHotels(double lat, double lon, size_t l return indexes; } -bool BookingDataset::MatchByName(string const & osmName, - vector const & bookingIndexes) const -{ - return false; - - // Match name. 
- // vector osmTokens; - // NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); - // - // cout << "\n------------- " << name << endl; - // - // bool matched = false; - // for (auto const & index : indexes) - // { - // vector bookingTokens; - // NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); - // - // map>> weightPair; - // - // for (size_t j = 0; j < osmTokens.size(); ++j) - // { - // for (size_t i = 0; i < bookingTokens.size(); ++i) - // { - // size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), - // bookingTokens[i].begin(), - // bookingTokens[i].end()); - // if (distance < 3) - // weightPair[distance].emplace_back(i, j); - // } - // } - // - // if (!weightPair.empty()) - // { - // cout << m_hotels[e.second] << endl; - // matched = true; - // } - // } -} - void BookingDataset::BuildFeatures(function const & fn) const { for (auto const & hotel : m_hotels) @@ -302,13 +263,6 @@ void BookingDataset::BuildFeatures(function const & fn) cons } } -// static -double BookingDataset::ScoreByLinearNormDistance(double distance) -{ - distance = my::clamp(distance, 0, kDistanceLimitInMeters); - return 1.0 - distance / kDistanceLimitInMeters; -} - void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath) { m_hotels.clear(); @@ -374,11 +328,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const for (size_t const j : bookingIndexes) { - auto const & hotel = GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = ScoreByLinearNormDistance(distanceMeters); - matched = score > kOptimalThreshold; - if (matched) + if (booking_scoring::Match(GetHotel(j), e).IsMatched()) break; } diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index f56bba4d17..c37859cca6 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -22,9 +22,6 @@ public: double static 
constexpr kDistanceLimitInMeters = 150; size_t static constexpr kMaxSelectedElements = 3; - // Calculated with tools/python/booking_hotels_quality.py - double static constexpr kOptimalThreshold = 0.709283; - struct Hotel { enum class Fields @@ -92,8 +89,6 @@ public: void BuildFeatures(function const & fn) const; - static double ScoreByLinearNormDistance(double distance); - protected: vector m_hotels; diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 6602687ec4..0331a7f9d3 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,4 +1,5 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" #include "generator/osm_source.hpp" #include "geometry/distance_on_sphere.hpp" @@ -73,15 +74,15 @@ int main(int argc, char * argv[]) for (size_t const j : bookingIndexes) { auto const & hotel = bookingDataset.GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double const score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); + auto const score = booking_scoring::Match(hotel, e); - bool matched = score > BookingDataset::kOptimalThreshold; + double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); + bool matched = score.IsMatched(); outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 
'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score << endl; + << " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp new file mode 100644 index 0000000000..ff2aaec177 --- /dev/null +++ b/generator/booking_scoring.cpp @@ -0,0 +1,97 @@ +#include "generator/booking_scoring.hpp" + +#include "generator/booking_dataset.hpp" + +#include "indexer/search_string_utils.hpp" +#include "indexer/search_delimiters.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "base/collection_cast.hpp" + +namespace generator +{ +namespace booking_scoring +{ +namespace +{ +// Calculated with tools/python/booking_hotels_quality.py. +double constexpr kOptimalThreshold = 0.151001; + +template +struct decay_equiv : + std::is_same::type, U>::type +{}; + +set StringToSetOfWords(string const & str) +{ + vector result; + search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); + return my::collection_cast(result); +} + +// TODO(mgsergio): Update existing one in base or wherever... +// Or just use one from boost. 
+struct CounterIterator +{ + template::value>::type> + CounterIterator & operator=(T const &) { ++m_count; return *this; } + CounterIterator & operator++() { return *this; } + CounterIterator & operator++(int) { return *this; } + CounterIterator & operator*() { return *this; } + uint32_t Count() const { return m_count; } + + uint32_t m_count = 0; +}; + +double StringSimilarityScore(string const & a, string const & b) +{ + auto const aWords = StringToSetOfWords(a); + auto const bWords = StringToSetOfWords(b); + + auto const intersectionCard = set_intersection(begin(aWords), end(aWords), + begin(bWords), end(bWords), + CounterIterator()).Count(); + auto const aLikeBScore = static_cast(intersectionCard) / aWords.size(); + auto const bLikeAScore = static_cast(intersectionCard) / bWords.size(); + + return aLikeBScore * bLikeAScore; +} + +double GetLinearNormDistanceScrore(double distance) +{ + distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); + return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; +} + +double GetNameSimilarityScore(string const & booking_name, string const & osm_name) +{ + return StringSimilarityScore(booking_name, osm_name); +} +} // namespace + +double BookingMatchScore::GetMatchingScore() const +{ + return m_linearNormDistanceScore * m_nameSimilarityScore; +} + +bool BookingMatchScore::IsMatched() const +{ + return GetMatchingScore() > kOptimalThreshold; +} + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) +{ + BookingMatchScore score; + + auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); + score.m_linearNormDistanceScore = GetLinearNormDistanceScrore(distance); + + string osmHotelName; + score.m_nameSimilarityScore = e.GetTag("name", osmHotelName) + ? 
GetNameSimilarityScore(h.name, osmHotelName) : 0; + + return score; +} +} // namespace booking_scoring +} // namespace generator diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp new file mode 100644 index 0000000000..d92482cf35 --- /dev/null +++ b/generator/booking_scoring.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "generator/booking_dataset.hpp" +#include "generator/osm_element.hpp" + +namespace generator +{ +namespace booking_scoring +{ +struct BookingMatchScore +{ + double GetMatchingScore() const; + bool IsMatched() const; + + double m_linearNormDistanceScore{}; + double m_nameSimilarityScore{}; +}; + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e); +} // namespace booking_scoring +} // namespace generator diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 1c65dd599d..8ae3340132 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,6 +121,20 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } +bool OsmElement::GetTag(string const & key, string & value) const +{ + auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) + { + return tag.key == key; + }); + + if (it == end(m_tags)) + return false; + + value = it->value; + return true; +} + string DebugPrint(OsmElement const & e) { return e.ToString(); diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index fc1187c6a7..c473f7d175 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -152,7 +152,8 @@ struct OsmElement if (!v.empty()) AddTag(k, v); } + + bool GetTag(string const & key, string & value) const; }; string DebugPrint(OsmElement const & e); - diff --git a/xcode/generator/generator.xcodeproj/project.pbxproj b/xcode/generator/generator.xcodeproj/project.pbxproj index d95c9fb035..4d668aeb25 100644 --- a/xcode/generator/generator.xcodeproj/project.pbxproj +++ b/xcode/generator/generator.xcodeproj/project.pbxproj @@ -63,6 
+63,8 @@ 67BC92E31D1A9ED800A4A378 /* test_feature.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92D91D1A9E9800A4A378 /* test_feature.hpp */; }; 67BC92E41D1A9ED800A4A378 /* test_mwm_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */; }; 67BC92E51D1A9ED800A4A378 /* test_mwm_builder.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */; }; + E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E9502E311D34012200CAB86B /* booking_scoring.cpp */; }; + E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */ = {isa = PBXBuildFile; fileRef = E9502E321D34012200CAB86B /* booking_scoring.hpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ @@ -127,6 +129,8 @@ 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = test_mwm_builder.cpp; sourceTree = ""; }; 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = test_mwm_builder.hpp; sourceTree = ""; }; 67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = ""; }; + E9502E311D34012200CAB86B /* booking_scoring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_scoring.cpp; sourceTree = ""; }; + E9502E321D34012200CAB86B /* booking_scoring.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_scoring.hpp; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -169,6 +173,8 @@ 6753401D1A3F2A1B00A0A8C3 /* generator */ = { isa = PBXGroup; children = ( + E9502E311D34012200CAB86B /* booking_scoring.cpp */, + 
E9502E321D34012200CAB86B /* booking_scoring.hpp */, 677E2A111CAACC5F001DC42A /* tag_admixer.hpp */, 677E2A121CAACC5F001DC42A /* towns_dumper.cpp */, 677E2A131CAACC5F001DC42A /* towns_dumper.hpp */, @@ -253,6 +259,7 @@ 6753407F1A3F2A7400A0A8C3 /* osm2type.hpp in Headers */, 670B84BD1A8CDB0000CE4492 /* osm_source.hpp in Headers */, 675340631A3F2A7400A0A8C3 /* coastlines_generator.hpp in Headers */, + E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */, 675340641A3F2A7400A0A8C3 /* intermediate_data.hpp in Headers */, 675340781A3F2A7400A0A8C3 /* intermediate_elements.hpp in Headers */, 6753406B1A3F2A7400A0A8C3 /* feature_emitter_iface.hpp in Headers */, @@ -384,6 +391,7 @@ 677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */, 6753405C1A3F2A7400A0A8C3 /* borders_generator.cpp in Sources */, 675340671A3F2A7400A0A8C3 /* dumper.cpp in Sources */, + E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */, 675340831A3F2A7400A0A8C3 /* statistics.cpp in Sources */, 6753407E1A3F2A7400A0A8C3 /* osm2type.cpp in Sources */, 675340601A3F2A7400A0A8C3 /* check_model.cpp in Sources */, @@ -581,6 +589,7 @@ 67BC92D51D1A9E5F00A4A378 /* Release */, ); defaultConfigurationIsVisible = 0; + defaultConfigurationName = Release; }; /* End XCConfigurationList section */ }; From 61babee342ffe9da76e920aba8c61294607ab2f1 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Tue, 12 Jul 2016 16:21:46 +0300 Subject: [PATCH 6/6] Code review. 
--- generator/booking_dataset.cpp | 3 +- .../booking_quality_check.cpp | 10 +- generator/booking_scoring.cpp | 112 ++++++++++++------ generator/booking_scoring.hpp | 3 +- generator/osm_element.cpp | 7 +- generator/osm_element.hpp | 2 +- tools/python/booking_hotels_quality.py | 5 +- 7 files changed, 96 insertions(+), 46 deletions(-) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index fec4af1689..60745a6364 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -209,7 +209,8 @@ void BookingDataset::BuildFeatures(function const & fn) cons if (!hotel.houseNumber.empty()) e.AddTag("addr:housenumber", hotel.houseNumber); - // TODO(mgsergio): Add a comment or use enum. + // Matching booking.com hotel types to OpenStreetMap values. + // Booking types are listed in the closed API docs. switch (hotel.type) { case 19: diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 0331a7f9d3..b4e817fc18 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -17,7 +17,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file"); DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); DEFINE_string(sample_data, "", "Sample output path"); DEFINE_uint64(selection_size, 1000, "Selection size"); -DEFINE_uint64(random_seed, minstd_rand::default_seed, "Seed for random shuffle"); +DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); using namespace generator; @@ -60,7 +60,7 @@ int main(int argc, char * argv[]) vector elementIndexes(elements.size()); iota(elementIndexes.begin(), elementIndexes.end(), 0); - shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_random_seed)); + shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed)); if (FLAGS_selection_size < elementIndexes.size()) 
elementIndexes.resize(FLAGS_selection_size); @@ -82,7 +82,11 @@ int main(int argc, char * argv[]) outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl; + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp index ff2aaec177..2ddd071f6f 100644 --- a/generator/booking_scoring.cpp +++ b/generator/booking_scoring.cpp @@ -2,12 +2,16 @@ #include "generator/booking_dataset.hpp" -#include "indexer/search_string_utils.hpp" #include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" #include "base/collection_cast.hpp" +#include "base/stl_iterator.hpp" + +#include "std/algorithm.hpp" +#include "std/vector.hpp" namespace generator { @@ -16,49 +20,82 @@ namespace booking_scoring namespace { // Calculated with tools/python/booking_hotels_quality.py. -double constexpr kOptimalThreshold = 0.151001; +double constexpr kOptimalThreshold = 0.317324; template struct decay_equiv : std::is_same::type, U>::type {}; -set StringToSetOfWords(string const & str) +using WeightedBagOfWords = vector>; + +vector StringToSetOfWords(string const & str) { vector result; search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); - return my::collection_cast(result); + sort(begin(result), end(result)); + return result; } -// TODO(mgsergio): Update existing one in base or wherever... -// Or just use one from boost. 
-struct CounterIterator +WeightedBagOfWords MakeWeightedBagOfWords(vector const & words) { - template::value>::type> - CounterIterator & operator=(T const &) { ++m_count; return *this; } - CounterIterator & operator++() { return *this; } - CounterIterator & operator++(int) { return *this; } - CounterIterator & operator*() { return *this; } - uint32_t Count() const { return m_count; } + // TODO(mgsergio): Calculate tf-idf score for every word. + auto constexpr kTfIdfScorePlaceholder = 1; - uint32_t m_count = 0; -}; - -double StringSimilarityScore(string const & a, string const & b) -{ - auto const aWords = StringToSetOfWords(a); - auto const bWords = StringToSetOfWords(b); - - auto const intersectionCard = set_intersection(begin(aWords), end(aWords), - begin(bWords), end(bWords), - CounterIterator()).Count(); - auto const aLikeBScore = static_cast(intersectionCard) / aWords.size(); - auto const bLikeAScore = static_cast(intersectionCard) / bWords.size(); - - return aLikeBScore * bLikeAScore; + WeightedBagOfWords result; + for (auto i = 0; i < words.size(); ++i) + { + result.emplace_back(words[i], kTfIdfScorePlaceholder); + while (i + 1 < words.size() && words[i] == words[i + 1]) + { + result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result[i].first; + ++i; + } + } + return result; } -double GetLinearNormDistanceScrore(double distance) +double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + double result{}; + + auto lhsIt = begin(lhs); + auto rhsIt = begin(rhs); + + while (lhsIt != end(lhs) && rhsIt != end(rhs)) + { + if (lhsIt->first == rhsIt->first) + { + result += lhsIt->second * rhsIt->second; + ++lhsIt; + ++rhsIt; + } + else if (lhsIt->first < rhsIt->first) + { + ++lhsIt; + } + else + { + ++rhsIt; + } + } + + return result; +} + +double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + auto const product = WeightedBagsDotProduct(lhs, rhs); + 
auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs)); + auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs)); + + if (product == 0.0) + return 0.0; + + return product / (lhsLength * rhsLength); +} + +double GetLinearNormDistanceScore(double distance) { distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; @@ -66,7 +103,15 @@ double GetLinearNormDistanceScrore(double distance) double GetNameSimilarityScore(string const & booking_name, string const & osm_name) { - return StringSimilarityScore(booking_name, osm_name); + auto const aws = MakeWeightedBagOfWords(StringToSetOfWords(booking_name)); + auto const bws = MakeWeightedBagOfWords(StringToSetOfWords(osm_name)); + + if (aws.empty() && bws.empty()) + return 1.0; + if (aws.empty() || bws.empty()) + return 0.0; + + return WeightedBagOfWordsCos(aws, bws); } } // namespace @@ -85,11 +130,10 @@ BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) BookingMatchScore score; auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); - score.m_linearNormDistanceScore = GetLinearNormDistanceScrore(distance); + score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance); - string osmHotelName; - score.m_nameSimilarityScore = e.GetTag("name", osmHotelName) - ? GetNameSimilarityScore(h.name, osmHotelName) : 0; + // TODO(mgsergio): Check all translations and use the best one. 
+ score.m_nameSimilarityScore = GetNameSimilarityScore(h.name, e.GetTag("name")); return score; } diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp index d92482cf35..e5516a4470 100644 --- a/generator/booking_scoring.hpp +++ b/generator/booking_scoring.hpp @@ -1,7 +1,8 @@ #pragma once #include "generator/booking_dataset.hpp" -#include "generator/osm_element.hpp" + +struct OsmElement; namespace generator { diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 8ae3340132..5e54137283 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,7 +121,7 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } -bool OsmElement::GetTag(string const & key, string & value) const +string OsmElement::GetTag(string const & key) const { auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) { @@ -129,10 +129,9 @@ bool OsmElement::GetTag(string const & key, string & value) const }); if (it == end(m_tags)) - return false; + return {}; - value = it->value; - return true; + return it->value; } string DebugPrint(OsmElement const & e) diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index c473f7d175..6a89905b9b 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -153,7 +153,7 @@ struct OsmElement AddTag(k, v); } - bool GetTag(string const & key, string & value) const; + string GetTag(string const & key) const; }; string DebugPrint(OsmElement const & e); diff --git a/tools/python/booking_hotels_quality.py b/tools/python/booking_hotels_quality.py index 914e4c797f..cd82adfd2d 100755 --- a/tools/python/booking_hotels_quality.py +++ b/tools/python/booking_hotels_quality.py @@ -14,6 +14,7 @@ import os import pickle import time import urllib2 +import re # init logging logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') @@ -21,7 +22,7 @@ logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] 
%(levelname)s: %( def load_binary_list(path): """ - Loads referance binary classifier output. + Loads reference binary classifier output. """ bits = [] with open(path, 'r') as fd: @@ -41,7 +42,7 @@ def load_score_list(path): for line in fd: if (not line.strip()) or line[0] == '#': continue - scores.append(float(line[line.rfind(':')+2:])) + scores.append(float(re.search(r'result score: (\d*\.\d+)', line).group(1))) return scores