From 61babee342ffe9da76e920aba8c61294607ab2f1 Mon Sep 17 00:00:00 2001 From: Sergey Magidovich Date: Tue, 12 Jul 2016 16:21:46 +0300 Subject: [PATCH] Code review. --- generator/booking_dataset.cpp | 3 +- .../booking_quality_check.cpp | 10 +- generator/booking_scoring.cpp | 112 ++++++++++++------ generator/booking_scoring.hpp | 3 +- generator/osm_element.cpp | 7 +- generator/osm_element.hpp | 2 +- tools/python/booking_hotels_quality.py | 5 +- 7 files changed, 96 insertions(+), 46 deletions(-) diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index fec4af1689..60745a6364 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -209,7 +209,8 @@ void BookingDataset::BuildFeatures(function const & fn) cons if (!hotel.houseNumber.empty()) e.AddTag("addr:housenumber", hotel.houseNumber); - // TODO(mgsergio): Add a comment or use enum. + // Matching booking.com hotel types to OpenStreetMap values. + // Booking types are listed in the closed API docs. switch (hotel.type) { case 19: diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 0331a7f9d3..b4e817fc18 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -17,7 +17,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file"); DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); DEFINE_string(sample_data, "", "Sample output path"); DEFINE_uint64(selection_size, 1000, "Selection size"); -DEFINE_uint64(random_seed, minstd_rand::default_seed, "Seed for random shuffle"); +DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); using namespace generator; @@ -60,7 +60,7 @@ int main(int argc, char * argv[]) vector elementIndexes(elements.size()); iota(elementIndexes.begin(), elementIndexes.end(), 0); - shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_random_seed)); + shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed)); if (FLAGS_selection_size < elementIndexes.size()) elementIndexes.resize(FLAGS_selection_size); @@ -82,7 +82,11 @@ int main(int argc, char * argv[]) outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl; + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp index ff2aaec177..2ddd071f6f 100644 --- a/generator/booking_scoring.cpp +++ b/generator/booking_scoring.cpp @@ -2,12 +2,16 @@ #include "generator/booking_dataset.hpp" -#include "indexer/search_string_utils.hpp" #include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" #include "base/collection_cast.hpp" +#include "base/stl_iterator.hpp" + +#include "std/algorithm.hpp" +#include "std/vector.hpp" namespace generator { @@ -16,49 +20,82 @@ namespace booking_scoring namespace { // Calculated with tools/python/booking_hotels_quality.py. -double constexpr kOptimalThreshold = 0.151001; +double constexpr kOptimalThreshold = 0.317324; template struct decay_equiv : std::is_same::type, U>::type {}; -set StringToSetOfWords(string const & str) +using WeightedBagOfWords = vector>; + +vector StringToSetOfWords(string const & str) { vector result; search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); - return my::collection_cast(result); + sort(begin(result), end(result)); + return result; } -// TODO(mgsergio): Update existing one in base or wherever... -// Or just use one from boost. -struct CounterIterator +WeightedBagOfWords MakeWeightedBagOfWords(vector const & words) { - template::value>::type> - CounterIterator & operator=(T const &) { ++m_count; return *this; } - CounterIterator & operator++() { return *this; } - CounterIterator & operator++(int) { return *this; } - CounterIterator & operator*() { return *this; } - uint32_t Count() const { return m_count; } + // TODO(mgsergio): Calculate tf-idsf score for every word. + auto constexpr kTfIdfScorePlaceholder = 1; - uint32_t m_count = 0; -}; - -double StringSimilarityScore(string const & a, string const & b) -{ - auto const aWords = StringToSetOfWords(a); - auto const bWords = StringToSetOfWords(b); - - auto const intersectionCard = set_intersection(begin(aWords), end(aWords), - begin(bWords), end(bWords), - CounterIterator()).Count(); - auto const aLikeBScore = static_cast(intersectionCard) / aWords.size(); - auto const bLikeAScore = static_cast(intersectionCard) / bWords.size(); - - return aLikeBScore * bLikeAScore; + WeightedBagOfWords result; + for (auto i = 0; i < words.size(); ++i) + { + result.emplace_back(words[i], kTfIdfScorePlaceholder); + while (i + 1 < words.size() && words[i] == words[i + 1]) + { + result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result[i].frist; + ++i; + } + } + return result; } -double GetLinearNormDistanceScrore(double distance) +double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + double result{}; + + auto lhsIt = begin(lhs); + auto rhsIt = begin(rhs); + + while (lhsIt != end(lhs) && rhsIt != end(rhs)) + { + if (lhsIt->first == rhsIt->first) + { + result += lhsIt->second * rhsIt->second; + ++lhsIt; + ++rhsIt; + } + else if (lhsIt->first < rhsIt->first) + { + ++lhsIt; + } + else + { + ++rhsIt; + } + } + + return result; +} + +double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + auto const product = WeightedBagsDotProduct(lhs, rhs); + auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs)); + auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs)); + + if (product == 0.0) + return 0.0; + + return product / (lhsLength * rhsLength); +} + +double GetLinearNormDistanceScore(double distance) { distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; @@ -66,7 +103,15 @@ double GetLinearNormDistanceScrore(double distance) double GetNameSimilarityScore(string const & booking_name, string const & osm_name) { - return StringSimilarityScore(booking_name, osm_name); + auto const aws = MakeWeightedBagOfWords(StringToSetOfWords(booking_name)); + auto const bws = MakeWeightedBagOfWords(StringToSetOfWords(osm_name)); + + if (aws.empty() && bws.empty()) + return 1.0; + if (aws.empty() || bws.empty()) + return 0.0; + + return WeightedBagOfWordsCos(aws, bws); } } // namespace @@ -85,11 +130,10 @@ BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) BookingMatchScore score; auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); - score.m_linearNormDistanceScore = GetLinearNormDistanceScrore(distance); + score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance); - string osmHotelName; - score.m_nameSimilarityScore = e.GetTag("name", osmHotelName) - ? GetNameSimilarityScore(h.name, osmHotelName) : 0; + // TODO(mgsergio): Check all translations and use the best one. + score.m_nameSimilarityScore = GetNameSimilarityScore(h.name, e.GetTag("name")); return score; } diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp index d92482cf35..e5516a4470 100644 --- a/generator/booking_scoring.hpp +++ b/generator/booking_scoring.hpp @@ -1,7 +1,8 @@ #pragma once #include "generator/booking_dataset.hpp" -#include "generator/osm_element.hpp" + +struct OsmElement; namespace generator { diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 8ae3340132..5e54137283 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,7 +121,7 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } -bool OsmElement::GetTag(string const & key, string & value) const +string OsmElement::GetTag(string const & key) const { auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) { @@ -129,10 +129,9 @@ bool OsmElement::GetTag(string const & key, string & value) const }); if (it == end(m_tags)) - return false; + return {}; - value = it->value; - return true; + return it->value; } string DebugPrint(OsmElement const & e) diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index c473f7d175..6a89905b9b 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -153,7 +153,7 @@ struct OsmElement AddTag(k, v); } - bool GetTag(string const & key, string & value) const; + string GetTag(string const & key) const; }; string DebugPrint(OsmElement const & e); diff --git a/tools/python/booking_hotels_quality.py b/tools/python/booking_hotels_quality.py index 914e4c797f..cd82adfd2d 100755 --- a/tools/python/booking_hotels_quality.py +++ b/tools/python/booking_hotels_quality.py @@ -14,6 +14,7 @@ import os import pickle import time import urllib2 +import re # init logging logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s') @@ -21,7 +22,7 @@ logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %( def load_binary_list(path): """ - Loads referance binary classifier output. + Loads reference binary classifier output. """ bits = [] with open(path, 'r') as fd: @@ -41,7 +42,7 @@ def load_score_list(path): for line in fd: if (not line.strip()) or line[0] == '#': continue - scores.append(float(line[line.rfind(':')+2:])) + scores.append(float(re.search(r'result score: (\d*\.\d+)', line).group(1))) return scores