Refactor. Add Matching by name.

This commit is contained in:
Sergey Magidovich 2016-07-11 22:42:43 +03:00
parent 6f423fbac4
commit 88d5775c23
8 changed files with 151 additions and 63 deletions

View file

@ -1,11 +1,11 @@
#include "generator/booking_dataset.hpp"
#include "generator/booking_scoring.hpp"
#include "platform/local_country_file_utils.hpp"
#include "platform/platform.hpp"
#include "indexer/ftypes_matcher.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
@ -171,45 +171,6 @@ vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t l
return indexes;
}
// Placeholder for matching an OSM object against booking hotels by name.
// Currently always returns false: the unconditional return below makes both
// parameters unused, and the only matching logic present is the commented-out
// sketch that follows it.
bool BookingDataset::MatchByName(string const & osmName,
vector<size_t> const & bookingIndexes) const
{
return false;
// NOTE(review): the disabled sketch below tokenizes both names and pairs tokens whose
// edit distance is less than 3. It refers to `name`, `indexes` and `e`, none of which
// are parameters of this function, so it would not compile if simply uncommented.
// Match name.
// vector<strings::UniString> osmTokens;
// NormalizeAndTokenizeString(name, osmTokens, search::Delimiters());
//
// cout << "\n------------- " << name << endl;
//
// bool matched = false;
// for (auto const & index : indexes)
// {
// vector<strings::UniString> bookingTokens;
// NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters());
//
// map<size_t, vector<pair<size_t, size_t>>> weightPair;
//
// for (size_t j = 0; j < osmTokens.size(); ++j)
// {
// for (size_t i = 0; i < bookingTokens.size(); ++i)
// {
// size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(),
// bookingTokens[i].begin(),
// bookingTokens[i].end());
// if (distance < 3)
// weightPair[distance].emplace_back(i, j);
// }
// }
//
// if (!weightPair.empty())
// {
// cout << m_hotels[e.second] << endl;
// matched = true;
// }
// }
}
void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const
{
for (auto const & hotel : m_hotels)
@ -302,13 +263,6 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons
}
}
// static
// Maps a distance in meters to a score that falls linearly from 1.0 at zero distance
// to 0.0 at kDistanceLimitInMeters; distances outside that range are clamped first.
double BookingDataset::ScoreByLinearNormDistance(double distance)
{
  double const clampedMeters = my::clamp(distance, 0, kDistanceLimitInMeters);
  double const fractionOfLimit = clampedMeters / kDistanceLimitInMeters;
  return 1.0 - fractionOfLimit;
}
void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath)
{
m_hotels.clear();
@ -374,11 +328,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const
for (size_t const j : bookingIndexes)
{
auto const & hotel = GetHotel(j);
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
double score = ScoreByLinearNormDistance(distanceMeters);
matched = score > kOptimalThreshold;
if (matched)
if (booking_scoring::Match(GetHotel(j), e).IsMatched())
break;
}

View file

@ -22,9 +22,6 @@ public:
double static constexpr kDistanceLimitInMeters = 150;
size_t static constexpr kMaxSelectedElements = 3;
// Calculated with tools/python/booking_hotels_quality.py
double static constexpr kOptimalThreshold = 0.709283;
struct Hotel
{
enum class Fields
@ -92,8 +89,6 @@ public:
void BuildFeatures(function<void(OsmElement *)> const & fn) const;
static double ScoreByLinearNormDistance(double distance);
protected:
vector<Hotel> m_hotels;

View file

@ -1,4 +1,5 @@
#include "generator/booking_dataset.hpp"
#include "generator/booking_scoring.hpp"
#include "generator/osm_source.hpp"
#include "geometry/distance_on_sphere.hpp"
@ -73,15 +74,15 @@ int main(int argc, char * argv[])
for (size_t const j : bookingIndexes)
{
auto const & hotel = bookingDataset.GetHotel(j);
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
double const score = BookingDataset::ScoreByLinearNormDistance(distanceMeters);
auto const score = booking_scoring::Match(hotel, e);
bool matched = score > BookingDataset::kOptimalThreshold;
double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
bool matched = score.IsMatched();
outStream << "# ------------------------------------------" << fixed << setprecision(6)
<< endl;
outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j
<< " distance: " << distanceMeters << " score: " << score << endl;
<< " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl;
outStream << "# " << e << endl;
outStream << "# " << hotel << endl;
outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat

View file

@ -0,0 +1,97 @@
#include "generator/booking_scoring.hpp"
#include "generator/booking_dataset.hpp"
#include "indexer/search_string_utils.hpp"
#include "indexer/search_delimiters.hpp"
#include "geometry/distance_on_sphere.hpp"
#include "base/collection_cast.hpp"
namespace generator
{
namespace booking_scoring
{
namespace
{
// Calculated with tools/python/booking_hotels_quality.py.
double constexpr kOptimalThreshold = 0.151001;
// Compile-time predicate: `value` is true when T, after std::decay, is exactly U.
template <typename T, typename U>
struct decay_equiv
  : std::integral_constant<bool, std::is_same<typename std::decay<T>::type, U>::value>
{
};
// Normalizes and tokenizes |str| with the search delimiters and returns the
// resulting tokens as a set.
set<strings::UniString> StringToSetOfWords(string const & str)
{
  vector<strings::UniString> tokens;
  search::NormalizeAndTokenizeString(str, tokens, search::Delimiters{});

  auto uniqueTokens = my::collection_cast<set>(tokens);
  return uniqueTokens;
}
// TODO(mgsergio): Update existing one in base or wherever...
// Or just use one from boost.
// Output-iterator-like sink: it stores nothing and only counts how many values
// were assigned through it.
struct CounterIterator
{
  uint32_t m_count = 0;

  // Every assignment of a value bumps the counter. The overload is disabled when T
  // decays to CounterIterator, so iterator-to-iterator assignment is not counted.
  template <typename T,
            typename = typename enable_if<!decay_equiv<T, CounterIterator>::value>::type>
  CounterIterator & operator=(T const &)
  {
    m_count += 1;
    return *this;
  }

  // Advancing and dereferencing are no-ops that hand back the iterator itself.
  CounterIterator & operator++()
  {
    return *this;
  }

  CounterIterator & operator++(int)
  {
    return *this;
  }

  CounterIterator & operator*()
  {
    return *this;
  }

  uint32_t Count() const
  {
    return m_count;
  }
};
// Returns a symmetric word-overlap similarity of |a| and |b| in the range [0, 1].
// Both strings are split into sets of normalized words; the result is the product of
// (common words / words in a) and (common words / words in b).
// Returns 0 when either string yields no words.
double StringSimilarityScore(string const & a, string const & b)
{
  auto const aWords = StringToSetOfWords(a);
  auto const bWords = StringToSetOfWords(b);

  // An empty word set shares nothing with the other side. Bail out early, otherwise
  // the divisions below evaluate 0.0 / 0 and the whole score becomes NaN.
  if (aWords.empty() || bWords.empty())
    return 0.0;

  // Only the size of the intersection is needed, so count it instead of storing it.
  auto const intersectionCard = set_intersection(begin(aWords), end(aWords),
                                                 begin(bWords), end(bWords),
                                                 CounterIterator()).Count();
  auto const aLikeBScore = static_cast<double>(intersectionCard) / aWords.size();
  auto const bLikeAScore = static_cast<double>(intersectionCard) / bWords.size();
  return aLikeBScore * bLikeAScore;
}
// Converts a distance in meters to a score: 1.0 at zero distance, decreasing linearly
// to 0.0 at BookingDataset::kDistanceLimitInMeters. Out-of-range input is clamped.
double GetLinearNormDistanceScrore(double distance)
{
  double const limitMeters = BookingDataset::kDistanceLimitInMeters;
  double const clampedMeters = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters);
  return 1.0 - clampedMeters / limitMeters;
}
// Scores how similar a booking hotel name is to an OSM name; currently this is just
// the word-set similarity of the two strings.
double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
{
  double const similarity = StringSimilarityScore(booking_name, osm_name);
  return similarity;
}
} // namespace
// Combined score: the distance score multiplied by the name similarity score.
double BookingMatchScore::GetMatchingScore() const
{
  double const combined = m_linearNormDistanceScore * m_nameSimilarityScore;
  return combined;
}
bool BookingMatchScore::IsMatched() const
{
return GetMatchingScore() > kOptimalThreshold;
}
// Scores how well booking hotel |h| corresponds to OSM element |e|.
// The distance component comes from the distance between the two coordinates; the
// name component compares h.name with the element's "name" tag and is 0 when the
// element has no such tag.
BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e)
{
  BookingMatchScore result;

  double const metersApart = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon);
  result.m_linearNormDistanceScore = GetLinearNormDistanceScrore(metersApart);

  string osmName;
  if (e.GetTag("name", osmName))
    result.m_nameSimilarityScore = GetNameSimilarityScore(h.name, osmName);
  else
    result.m_nameSimilarityScore = 0;

  return result;
}
} // namespace booking_scoring
} // namespace generator

View file

@ -0,0 +1,21 @@
#pragma once
#include "generator/booking_dataset.hpp"
#include "generator/osm_element.hpp"
namespace generator
{
namespace booking_scoring
{
// Result of comparing a booking hotel with an OSM element: two component scores,
// both zero until filled in, plus accessors that combine them.
struct BookingMatchScore
{
  // Combined score computed from the two components below.
  double GetMatchingScore() const;
  // Whether the combined score is high enough to treat the pair as a match.
  bool IsMatched() const;

  // Score derived from the distance between the two objects.
  double m_linearNormDistanceScore = 0.0;
  // Score derived from comparing the two objects' names.
  double m_nameSimilarityScore = 0.0;
};
BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e);
} // namespace booking_scoring
} // namespace generator

View file

@ -121,6 +121,20 @@ string OsmElement::ToString(string const & shift) const
return ss.str();
}
// Looks up the first tag whose key equals |key|.
// On success copies its value into |value| and returns true; otherwise returns false
// and leaves |value| untouched.
bool OsmElement::GetTag(string const & key, string & value) const
{
  for (Tag const & candidate : m_tags)
  {
    if (candidate.key == key)
    {
      value = candidate.value;
      return true;
    }
  }
  return false;
}
string DebugPrint(OsmElement const & e)
{
return e.ToString();

View file

@ -152,7 +152,8 @@ struct OsmElement
if (!v.empty())
AddTag(k, v);
}
bool GetTag(string const & key, string & value) const;
};
string DebugPrint(OsmElement const & e);

View file

@ -63,6 +63,8 @@
67BC92E31D1A9ED800A4A378 /* test_feature.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92D91D1A9E9800A4A378 /* test_feature.hpp */; };
67BC92E41D1A9ED800A4A378 /* test_mwm_builder.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */; };
67BC92E51D1A9ED800A4A378 /* test_mwm_builder.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */; };
E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */ = {isa = PBXBuildFile; fileRef = E9502E311D34012200CAB86B /* booking_scoring.cpp */; };
E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */ = {isa = PBXBuildFile; fileRef = E9502E321D34012200CAB86B /* booking_scoring.hpp */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@ -127,6 +129,8 @@
67BC92DA1D1A9E9800A4A378 /* test_mwm_builder.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = test_mwm_builder.cpp; sourceTree = "<group>"; };
67BC92DB1D1A9E9800A4A378 /* test_mwm_builder.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = test_mwm_builder.hpp; sourceTree = "<group>"; };
67F0F6761B8C9DCE003F52FF /* osm_xml_source.hpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = osm_xml_source.hpp; sourceTree = "<group>"; };
E9502E311D34012200CAB86B /* booking_scoring.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = booking_scoring.cpp; sourceTree = "<group>"; };
E9502E321D34012200CAB86B /* booking_scoring.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = booking_scoring.hpp; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -169,6 +173,8 @@
6753401D1A3F2A1B00A0A8C3 /* generator */ = {
isa = PBXGroup;
children = (
E9502E311D34012200CAB86B /* booking_scoring.cpp */,
E9502E321D34012200CAB86B /* booking_scoring.hpp */,
677E2A111CAACC5F001DC42A /* tag_admixer.hpp */,
677E2A121CAACC5F001DC42A /* towns_dumper.cpp */,
677E2A131CAACC5F001DC42A /* towns_dumper.hpp */,
@ -253,6 +259,7 @@
6753407F1A3F2A7400A0A8C3 /* osm2type.hpp in Headers */,
670B84BD1A8CDB0000CE4492 /* osm_source.hpp in Headers */,
675340631A3F2A7400A0A8C3 /* coastlines_generator.hpp in Headers */,
E9502E341D34012200CAB86B /* booking_scoring.hpp in Headers */,
675340641A3F2A7400A0A8C3 /* intermediate_data.hpp in Headers */,
675340781A3F2A7400A0A8C3 /* intermediate_elements.hpp in Headers */,
6753406B1A3F2A7400A0A8C3 /* feature_emitter_iface.hpp in Headers */,
@ -384,6 +391,7 @@
677E2A171CAACC5F001DC42A /* towns_dumper.cpp in Sources */,
6753405C1A3F2A7400A0A8C3 /* borders_generator.cpp in Sources */,
675340671A3F2A7400A0A8C3 /* dumper.cpp in Sources */,
E9502E331D34012200CAB86B /* booking_scoring.cpp in Sources */,
675340831A3F2A7400A0A8C3 /* statistics.cpp in Sources */,
6753407E1A3F2A7400A0A8C3 /* osm2type.cpp in Sources */,
675340601A3F2A7400A0A8C3 /* check_model.cpp in Sources */,
@ -581,6 +589,7 @@
67BC92D51D1A9E5F00A4A378 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};