Merge pull request #3761 from mgsergio/booking-name-matching

[booking] Booking name matching
This commit is contained in:
ygorshenin 2016-07-14 12:23:40 +03:00 committed by GitHub
commit 9934545216
11 changed files with 235 additions and 71 deletions

View file

@ -1,11 +1,11 @@
#include "generator/booking_dataset.hpp"
#include "generator/booking_scoring.hpp"
#include "platform/local_country_file_utils.hpp"
#include "platform/platform.hpp"
#include "indexer/ftypes_matcher.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
@ -171,45 +171,6 @@ vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t l
return indexes;
}
// Placeholder for name-based matching: currently always returns false and reads
// neither |osmName| nor |bookingIndexes|.
// The commented-out body below is an unfinished sketch that tokenizes both names and
// pairs tokens whose edit distance is below 3. It refers to identifiers (name, indexes, e)
// that are not parameters of this function, so it cannot be enabled as is.
bool BookingDataset::MatchByName(string const & osmName,
vector<size_t> const & bookingIndexes) const
{
return false;
// Match name.
// vector<strings::UniString> osmTokens;
// NormalizeAndTokenizeString(name, osmTokens, search::Delimiters());
//
// cout << "\n------------- " << name << endl;
//
// bool matched = false;
// for (auto const & index : indexes)
// {
// vector<strings::UniString> bookingTokens;
// NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters());
//
// map<size_t, vector<pair<size_t, size_t>>> weightPair;
//
// for (size_t j = 0; j < osmTokens.size(); ++j)
// {
// for (size_t i = 0; i < bookingTokens.size(); ++i)
// {
// size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(),
// bookingTokens[i].begin(),
// bookingTokens[i].end());
// if (distance < 3)
// weightPair[distance].emplace_back(i, j);
// }
// }
//
// if (!weightPair.empty())
// {
// cout << m_hotels[e.second] << endl;
// matched = true;
// }
// }
}
void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const
{
for (auto const & hotel : m_hotels)
@ -248,6 +209,8 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons
if (!hotel.houseNumber.empty())
e.AddTag("addr:housenumber", hotel.houseNumber);
// Matching booking.com hotel types to OpenStreetMap values.
// Booking types are listed in the closed API docs.
switch (hotel.type)
{
case 19:
@ -301,13 +264,6 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons
}
}
// static
// Converts a distance into a score that falls linearly from 1.0 at distance 0
// to 0.0 at kDistanceLimitInMeters; the distance is clamped to that range first.
double BookingDataset::ScoreByLinearNormDistance(double distance)
{
  double const clamped = my::clamp(distance, 0, kDistanceLimitInMeters);
  return 1.0 - clamped / kDistanceLimitInMeters;
}
void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath)
{
m_hotels.clear();
@ -373,11 +329,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const
for (size_t const j : bookingIndexes)
{
auto const & hotel = GetHotel(j);
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
double score = ScoreByLinearNormDistance(distanceMeters);
matched = score > kOptimalThreshold;
if (matched)
if (booking_scoring::Match(GetHotel(j), e).IsMatched())
break;
}

View file

@ -22,9 +22,6 @@ public:
double static constexpr kDistanceLimitInMeters = 150;
size_t static constexpr kMaxSelectedElements = 3;
// Calculated with tools/python/booking_hotels_quality.py
double static constexpr kOptimalThreshold = 0.709283;
struct Hotel
{
enum class Fields
@ -92,8 +89,6 @@ public:
void BuildFeatures(function<void(OsmElement *)> const & fn) const;
static double ScoreByLinearNormDistance(double distance);
protected:
vector<Hotel> m_hotels;

View file

@ -1,4 +1,5 @@
#include "generator/booking_dataset.hpp"
#include "generator/booking_scoring.hpp"
#include "generator/osm_source.hpp"
#include "geometry/distance_on_sphere.hpp"
@ -6,6 +7,7 @@
#include "std/fstream.hpp"
#include "std/iostream.hpp"
#include "std/numeric.hpp"
#include "std/random.hpp"
#include "3party/gflags/src/gflags/gflags.h"
@ -15,6 +17,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file");
DEFINE_string(booking_data, "", "Path to booking data in .tsv format");
DEFINE_string(sample_data, "", "Sample output path");
DEFINE_uint64(selection_size, 1000, "Selection size");
DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle");
using namespace generator;
@ -57,9 +60,7 @@ int main(int argc, char * argv[])
vector<size_t> elementIndexes(elements.size());
iota(elementIndexes.begin(), elementIndexes.end(), 0);
// In the first implementation, we used random_shuffle for the reference dataset.
// Next time we are going to replace random_shuffle by shuffle with defined seed.
random_shuffle(elementIndexes.begin(), elementIndexes.end());
shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed));
if (FLAGS_selection_size < elementIndexes.size())
elementIndexes.resize(FLAGS_selection_size);
@ -73,15 +74,19 @@ int main(int argc, char * argv[])
for (size_t const j : bookingIndexes)
{
auto const & hotel = bookingDataset.GetHotel(j);
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
double score = BookingDataset::ScoreByLinearNormDistance(distanceMeters);
auto const score = booking_scoring::Match(hotel, e);
bool matched = score > BookingDataset::kOptimalThreshold;
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
bool matched = score.IsMatched();
outStream << "# ------------------------------------------" << fixed << setprecision(6)
<< endl;
outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j
<< " distance: " << distanceMeters << " score: " << score << endl;
<< "\tdistance: " << distanceMeters
<< "\tdistance score: " << score.m_linearNormDistanceScore
<< "\tname score: " << score.m_nameSimilarityScore
<< "\tresult score: " << score.GetMatchingScore()
<< endl;
outStream << "# " << e << endl;
outStream << "# " << hotel << endl;
outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat

View file

@ -0,0 +1,141 @@
#include "generator/booking_scoring.hpp"
#include "generator/booking_dataset.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
#include "base/collection_cast.hpp"
#include "base/stl_iterator.hpp"
#include "std/algorithm.hpp"
#include "std/vector.hpp"
namespace generator
{
namespace booking_scoring
{
namespace
{
// Calculated with tools/python/booking_hotels_quality.py.
double constexpr kOptimalThreshold = 0.317324;
// Type trait: derives from std::true_type when T, after std::decay, is exactly U,
// and from std::false_type otherwise.
// NOTE(review): not referenced anywhere in the visible part of this file — confirm it is needed.
template <typename T, typename U>
struct decay_equiv :
std::is_same<typename std::decay<T>::type, U>::type
{};
using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
// Normalizes |str|, splits it into tokens on search::Delimiters and returns the tokens
// in sorted order. Despite the name, repeated tokens are kept: nothing here removes
// duplicates, they merely end up adjacent after sorting.
vector<strings::UniString> StringToSetOfWords(string const & str)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(str, tokens, search::Delimiters{});

  sort(tokens.begin(), tokens.end());
  return tokens;
}
// Collapses a list of words into (word, weight) pairs, one pair per run of equal
// adjacent words. |words| is expected to be sorted so that equal words are adjacent
// (StringToSetOfWords returns its tokens sorted).
// Until real tf-idf scores are available every occurrence contributes a weight of 1,
// so the weight of a word is currently the number of its occurrences.
WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
{
  // TODO(mgsergio): Calculate tf-idf score for every word.
  // Declared as double because the bag stores double weights.
  double constexpr kTfIdfScorePlaceholder = 1.0;

  WeightedBagOfWords result;
  // size_t index: words.size() is unsigned, so an int counter would mix signedness
  // in the comparisons below.
  for (size_t i = 0; i < words.size(); ++i)
  {
    result.emplace_back(words[i], kTfIdfScorePlaceholder);
    // Fold the run of duplicates that follows words[i] into the entry just added.
    while (i + 1 < words.size() && words[i] == words[i + 1])
    {
      // TODO(mgsergio): use the tf-idf score for result.back().first.
      result.back().second += kTfIdfScorePlaceholder;
      ++i;
    }
  }
  return result;
}
// Dot product of two weighted bags of words: the sum, over words present in both bags,
// of the product of their weights.
// Implemented as a single merge-style pass, so both bags must be ordered by word
// (ascending with respect to operator<) for matching words to be found.
double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  double dotProduct = 0.0;

  auto left = lhs.begin();
  auto right = rhs.begin();
  auto const leftEnd = lhs.end();
  auto const rightEnd = rhs.end();

  while (left != leftEnd && right != rightEnd)
  {
    if (left->first == right->first)
    {
      // Same word in both bags: it contributes to the product.
      dotProduct += left->second * right->second;
      ++left;
      ++right;
      continue;
    }

    // Words differ: advance whichever side currently holds the smaller word.
    if (left->first < right->first)
      ++left;
    else
      ++right;
  }

  return dotProduct;
}
// Cosine of the angle between two weighted bags of words:
// dot(lhs, rhs) / (|lhs| * |rhs|).
// Returns 0.0 when the dot product is zero, which also keeps a zero-length bag
// from producing a division by zero.
double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
{
  double const dot = WeightedBagsDotProduct(lhs, rhs);
  if (dot == 0.0)
    return 0.0;

  double const lhsNorm = sqrt(WeightedBagsDotProduct(lhs, lhs));
  double const rhsNorm = sqrt(WeightedBagsDotProduct(rhs, rhs));
  return dot / (lhsNorm * rhsNorm);
}
// Maps a distance to a score that decreases linearly from 1.0 at distance 0 to 0.0 at
// BookingDataset::kDistanceLimitInMeters. The distance is clamped to that range first,
// so anything at or beyond the limit scores 0.0.
double GetLinearNormDistanceScore(double distance)
{
  double const limit = BookingDataset::kDistanceLimitInMeters;
  double const clamped = my::clamp(distance, 0, limit);
  return 1.0 - clamped / limit;
}
// Similarity of two names, computed as the cosine between their weighted bags of words.
// Special cases: two names without any tokens are treated as identical (1.0);
// exactly one name without tokens yields 0.0.
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
  auto const bookingBag = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
  auto const osmBag = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));

  bool const bookingEmpty = bookingBag.empty();
  bool const osmEmpty = osmBag.empty();
  if (bookingEmpty || osmEmpty)
    return (bookingEmpty && osmEmpty) ? 1.0 : 0.0;

  return WeightedBagOfWordsCos(bookingBag, osmBag);
}
} // namespace
// Overall score: the distance score multiplied by the name similarity score,
// so a zero in either component gives a zero result.
double BookingMatchScore::GetMatchingScore() const
{
  double const combined = m_linearNormDistanceScore * m_nameSimilarityScore;
  return combined;
}
// A pair counts as matched when its overall score is strictly above kOptimalThreshold.
bool BookingMatchScore::IsMatched() const
{
  return kOptimalThreshold < GetMatchingScore();
}
// Scores how well booking hotel |h| corresponds to OSM element |e|.
// Fills the distance component from the distance between the two coordinates and the
// name component from the hotel name versus the element's "name" tag.
BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e)
{
  // TODO(mgsergio): Check all translations and use the best one.
  string const osmName = e.GetTag("name");
  auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon);

  BookingMatchScore result;
  result.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance);
  result.m_nameSimilarityScore = GetNameSimilarityScore(h.name, osmName);
  return result;
}
} // namespace booking_scoring
} // namespace generator

View file

@ -0,0 +1,22 @@
#pragma once
#include "generator/booking_dataset.hpp"
struct OsmElement;
namespace generator
{
namespace booking_scoring
{
// Result of scoring one booking hotel against one OSM element.
// Holds the two partial scores and derives the overall verdict from them.
struct BookingMatchScore
{
// Overall score; implemented in booking_scoring.cpp as the product of the two partial scores.
double GetMatchingScore() const;
// True when the overall score exceeds the threshold defined in booking_scoring.cpp.
bool IsMatched() const;
// Distance-based component; value-initialized to 0.
double m_linearNormDistanceScore{};
// Name-similarity component; value-initialized to 0.
double m_nameSimilarityScore{};
};
BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e);
} // namespace booking_scoring
} // namespace generator

View file

@ -121,6 +121,19 @@ string OsmElement::ToString(string const & shift) const
return ss.str();
}
// Returns the value of the first tag whose key equals |key|,
// or an empty string when the element has no such tag.
string OsmElement::GetTag(string const & key) const
{
  for (Tag const & tag : m_tags)
  {
    if (tag.key == key)
      return tag.value;
  }
  return {};
}
string DebugPrint(OsmElement const & e)
{
return e.ToString();

View file

@ -152,7 +152,8 @@ struct OsmElement
if (!v.empty())
AddTag(k, v);
}
string GetTag(string const & key) const;
};
string DebugPrint(OsmElement const & e);

View file

@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
# coding: utf8
from __future__ import print_function
@ -126,7 +126,8 @@ def translate(source, output):
'''
Reads *.pkl files and produces a single list of hotels as tab separated values.
'''
files = [filename for filename in os.listdir(source) if filename.endswith('.pkl')]
files = [os.path.join(source, filename)
for filename in os.listdir(source) if filename.endswith('.pkl')]
data = []
for filename in sorted(files):
@ -207,6 +208,7 @@ def process_options():
if not options.download and not options.translate:
parser.print_help()
# TODO(mgsergio): implement it with argparse facilities.
if options.translate and not options.output:
print("--output isn't set")
parser.print_help()

View file

@ -1,4 +1,4 @@
#!/usr/bin/python
#!/usr/bin/env python
# coding: utf8
from __future__ import print_function
@ -14,6 +14,7 @@ import os
import pickle
import time
import urllib2
import re
# init logging
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(message)s')
@ -21,7 +22,7 @@ logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(levelname)s: %(
def load_binary_list(path):
"""
Loads binary classifier output.
Loads reference binary classifier output.
"""
bits = []
with open(path, 'r') as fd:
@ -41,11 +42,12 @@ def load_score_list(path):
for line in fd:
if (not line.strip()) or line[0] == '#':
continue
scores.append(float(line[line.rfind(':')+2:]))
scores.append(float(re.search(r'result score: (\d*\.\d+)', line).group(1)))
return scores
def process_options():
# TODO(mgsergio): Fix description.
parser = argparse.ArgumentParser(description="Download and process booking hotels.")
parser.add_argument("-v", "--verbose", action="store_true", dest="verbose")
parser.add_argument("-q", "--quiet", action="store_false", dest="verbose")

View file

@ -63,6 +63,8 @@
67BC92E31D1A9ED800A4A378 /* test_feature.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92D91D1A9E9800A4A378 /* test_feature.hpp */; };
67BC92E41D1A9ED800A4A378 /* test_mwm_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */; };
67BC92E51D1A9ED800A4A378 /* test_mwm_builder.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */; };
E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E9502E311D34012200CAB86B /* booking_scoring.cpp */; };
E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */ = {isa = PBXBuildFile; fileRef = E9502E321D34012200CAB86B /* booking_scoring.hpp */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@ -127,6 +129,8 @@
67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = test_mwm_builder.cpp; sourceTree = "<group>"; };
67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = test_mwm_builder.hpp; sourceTree = "<group>"; };
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = "<group>"; };
E9502E311D34012200CAB86B /* booking_scoring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_scoring.cpp; sourceTree = "<group>"; };
E9502E321D34012200CAB86B /* booking_scoring.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_scoring.hpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -169,6 +173,8 @@
6753401D1A3F2A1B00A0A8C3 /* generator */ = {
isa = PBXGroup;
children = (
E9502E311D34012200CAB86B /* booking_scoring.cpp */,
E9502E321D34012200CAB86B /* booking_scoring.hpp */,
677E2A111CAACC5F001DC42A /* tag_admixer.hpp */,
677E2A121CAACC5F001DC42A /* towns_dumper.cpp */,
677E2A131CAACC5F001DC42A /* towns_dumper.hpp */,
@ -253,6 +259,7 @@
6753407F1A3F2A7400A0A8C3 /* osm2type.hpp in Headers */,
670B84BD1A8CDB0000CE4492 /* osm_source.hpp in Headers */,
675340631A3F2A7400A0A8C3 /* coastlines_generator.hpp in Headers */,
E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */,
675340641A3F2A7400A0A8C3 /* intermediate_data.hpp in Headers */,
675340781A3F2A7400A0A8C3 /* intermediate_elements.hpp in Headers */,
6753406B1A3F2A7400A0A8C3 /* feature_emitter_iface.hpp in Headers */,
@ -384,6 +391,7 @@
677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */,
6753405C1A3F2A7400A0A8C3 /* borders_generator.cpp in Sources */,
675340671A3F2A7400A0A8C3 /* dumper.cpp in Sources */,
E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */,
675340831A3F2A7400A0A8C3 /* statistics.cpp in Sources */,
6753407E1A3F2A7400A0A8C3 /* osm2type.cpp in Sources */,
675340601A3F2A7400A0A8C3 /* check_model.cpp in Sources */,
@ -581,6 +589,7 @@
67BC92D51D1A9E5F00A4A378 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};

View file

@ -140,6 +140,11 @@
67BC92C51D17FD5800A4A378 /* libstb_image.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624BF1D11775300AE4E3C /* libstb_image.a */; };
67BC92C61D17FDE600A4A378 /* OpenGL.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624CE1D11779700AE4E3C /* OpenGL.framework */; };
67BC92C71D17FDF800A4A378 /* libapi.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 679624B61D11775300AE4E3C /* libapi.a */; };
E9502E241D2BD26C00CAB86B /* libalohalitics.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E231D2BD26C00CAB86B /* libalohalitics.a */; };
E9502E261D2BD28C00CAB86B /* liboauthcpp.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E251D2BD28C00CAB86B /* liboauthcpp.a */; };
E9502E281D2BD2CC00CAB86B /* libopening_hours.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E271D2BD2CC00CAB86B /* libopening_hours.a */; };
E9502E2D1D2BD47B00CAB86B /* libsearch.a in Frameworks */ = {isa = PBXBuildFile; fileRef = E9502E291D2BD34A00CAB86B /* libsearch.a */; };
E9502E301D2BD6E600CAB86B /* libz.tbd in Frameworks */ = {isa = PBXBuildFile; fileRef = 677E2A091CAAC771001DC42A /* libz.tbd */; };
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
@ -307,6 +312,12 @@
67AB92C61B73D03500AB5194 /* libmap.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libmap.a; path = "../../../omim-xcode-build/Debug/libmap.a"; sourceTree = "<group>"; };
67AB92C81B73D10200AB5194 /* libosrm.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libosrm.a; path = "../../../omim-xcode-build/Debug/libosrm.a"; sourceTree = "<group>"; };
67AB92CA1B73D10B00AB5194 /* libsuccinct.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libsuccinct.a; path = "../../../omim-xcode-build/Debug/libsuccinct.a"; sourceTree = "<group>"; };
E9502E231D2BD26C00CAB86B /* libalohalitics.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libalohalitics.a; path = "/Users/mgsergio/omim/xcode/alohalitics/../../../omim-xcode-build/Debug/libalohalitics.a"; sourceTree = "<absolute>"; };
E9502E251D2BD28C00CAB86B /* liboauthcpp.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = liboauthcpp.a; path = "/Users/mgsergio/omim/xcode/oauthcpp/../../../omim-xcode-build/Debug/liboauthcpp.a"; sourceTree = "<absolute>"; };
E9502E271D2BD2CC00CAB86B /* libopening_hours.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libopening_hours.a; path = "/Users/mgsergio/omim/xcode/opening_hours/../../../omim-xcode-build/Debug/libopening_hours.a"; sourceTree = "<absolute>"; };
E9502E291D2BD34A00CAB86B /* libsearch.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libsearch.a; path = "/Users/mgsergio/omim/xcode/search/../../../omim-xcode-build/Debug/libsearch.a"; sourceTree = "<absolute>"; };
E9502E2B1D2BD44E00CAB86B /* libprotobuf.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libprotobuf.a; path = "/Users/mgsergio/omim/xcode/protobuf/../../../omim-xcode-build/Debug/libprotobuf.a"; sourceTree = "<absolute>"; };
E9502E2E1D2BD5BE00CAB86B /* libminizip.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = libminizip.a; path = "/Users/mgsergio/omim/xcode/minizip/../../../omim-xcode-build/Debug/libminizip.a"; sourceTree = "<absolute>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -342,6 +353,11 @@
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
E9502E301D2BD6E600CAB86B /* libz.tbd in Frameworks */,
E9502E2D1D2BD47B00CAB86B /* libsearch.a in Frameworks */,
E9502E281D2BD2CC00CAB86B /* libopening_hours.a in Frameworks */,
E9502E261D2BD28C00CAB86B /* liboauthcpp.a in Frameworks */,
E9502E241D2BD26C00CAB86B /* libalohalitics.a in Frameworks */,
673746771CF47E83005E6D1F /* Cocoa.framework in Frameworks */,
673746701CF47E14005E6D1F /* libeditor.a in Frameworks */,
6737466B1CF47D82005E6D1F /* libplatform.a in Frameworks */,
@ -555,6 +571,12 @@
6753414F1A3F54D800A0A8C3 = {
isa = PBXGroup;
children = (
E9502E2E1D2BD5BE00CAB86B /* libminizip.a */,
E9502E2B1D2BD44E00CAB86B /* libprotobuf.a */,
E9502E291D2BD34A00CAB86B /* libsearch.a */,
E9502E271D2BD2CC00CAB86B /* libopening_hours.a */,
E9502E251D2BD28C00CAB86B /* liboauthcpp.a */,
E9502E231D2BD26C00CAB86B /* libalohalitics.a */,
670D05AD1B0E08260013A7AC /* defaults.xcconfig */,
6737465A1CF46324005E6D1F /* booking_quality_check */,
670B84C41A9F73AB00CE4492 /* std */,