diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index d6f291e838..a5984abbc9 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,7 +1,8 @@ #include "generator/booking_dataset.hpp" -#include "generator/booking_scoring.hpp" #include "generator/feature_builder.hpp" +#include "generator/opentable_dataset.hpp" #include "generator/osm_source.hpp" +#include "generator/sponsored_scoring.hpp" #include "indexer/classificator_loader.hpp" @@ -21,12 +22,19 @@ #include "3party/gflags/src/gflags/gflags.h" +#include "boost/range/adaptor/map.hpp" +#include "boost/range/algorithm/copy.hpp" + + DEFINE_string(osm, "", "Input .o5m file"); DEFINE_string(booking, "", "Path to booking data in .tsv format"); +DEFINE_string(opentable, "", "Path to opentable data in .tsv format"); DEFINE_string(factors, "", "Factors output path"); DEFINE_string(sample, "", "Path so sample file"); -DEFINE_uint64(selection_size, 1000, "Selection size"); + DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); +DEFINE_uint64(selection_size, 1000, "Selection size"); +DEFINE_bool(generate, false, "Generate unmarked sample"); using namespace generator; @@ -89,11 +97,12 @@ osm::Id ReadDebuggedPrintedOsmId(string const & str) MYTHROW(ParseError, ("Can't make osmId from string", str)); } +template class Emitter : public EmitterBase { public: - Emitter(BookingDataset const & booking, map & features) - : m_bookingDataset(booking) + Emitter(Dataset const & dataset, map & features) + : m_dataset(dataset) , m_features(features) { LOG_SHORT(LINFO, ("OSM data:", FLAGS_osm)); @@ -101,7 +110,7 @@ public: void operator()(FeatureBuilder1 & fb) override { - if (m_bookingDataset.NecessaryMatchingConditionHolds(fb)) + if (m_dataset.NecessaryMatchingConditionHolds(fb)) m_features.emplace(fb.GetMostGenericOsmId(), fb); } @@ -117,7 +126,7 @@ public: } private: - BookingDataset const & m_bookingDataset; + Dataset const & m_dataset; map & m_features; }; @@ -125,6 +134,7 @@ feature::GenerateInfo GetGenerateInfo() { feature::GenerateInfo info; info.m_bookingDatafileName = FLAGS_booking; + info.m_opentableDataFile = FLAGS_opentable; info.m_osmFileName = FLAGS_osm; info.SetNodeStorageType("map"); info.SetOsmFileType("o5m"); @@ -144,15 +154,15 @@ struct SampleItem SampleItem() = default; - SampleItem(osm::Id const & osmId, ObjectId const bookingId, MatchStatus const match = Uninitialized) + SampleItem(osm::Id const & osmId, ObjectId const sponsoredId, MatchStatus const match = Uninitialized) : m_osmId(osmId) - , m_bookingId(bookingId) + , m_sponsoredId(sponsoredId) , m_match(match) { } osm::Id m_osmId; - ObjectId m_bookingId = Object::InvalidObjectId(); + ObjectId m_sponsoredId = Object::InvalidObjectId(); MatchStatus m_match = Uninitialized; }; @@ -182,7 +192,7 @@ SampleItem ReadSampleItem(string const & str) "due to wrong number of fields.")); item.m_osmId = ReadDebuggedPrintedOsmId(parts[0]); - if (!strings::to_uint(parts[1], item.m_bookingId.Get())) + if (!strings::to_uint(parts[1], item.m_sponsoredId.Get())) MYTHROW(ParseError, ("Can't make uint32 from string:", parts[1])); item.m_match = ReadMatchStatus(parts[2]); @@ -219,35 +229,159 @@ vector> ReadSampleFromFile(string const & name) return ReadSample(ist); } -template -void GenerateFactors(BookingDataset const & booking, map const & features, +template +void GenerateFactors(Dataset const & dataset, + map const & features, vector> const & sampleItems, ostream & ost) { for (auto const & item : sampleItems) { - auto const & hotel = booking.GetObjectById(item.m_bookingId); + auto const & object = dataset.GetObjectById(item.m_sponsoredId); auto const & feature = features.at(item.m_osmId); - auto const score = booking_scoring::Match(hotel, feature); + auto const score = generator::sponsored_scoring::Match(object, feature); auto const center = MercatorBounds::ToLatLon(feature.GetKeyPoint()); double const distanceMeters = ms::DistanceOnEarth(center.lat, center.lon, - hotel.m_lat, hotel.m_lon); + object.m_lat, object.m_lon); auto const matched = score.IsMatched(); ost << "# ------------------------------------------" << fixed << setprecision(6) << endl; ost << (matched ? 'y' : 'n') << " \t" << DebugPrint(feature.GetMostGenericOsmId()) - << "\t " << hotel.m_id + << "\t " << object.m_id << "\tdistance: " << distanceMeters << "\tdistance score: " << score.m_linearNormDistanceScore << "\tname score: " << score.m_nameSimilarityScore << "\tresult score: " << score.GetMatchingScore() << endl; ost << "# " << PrintBuilder(feature) << endl; - ost << "# " << hotel << endl; - ost << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.m_lat - << "&mlon=" << hotel.m_lon << "#map=18/" << hotel.m_lat << "/" << hotel.m_lon << endl; + ost << "# " << object << endl; + ost << "# URL: https://www.openstreetmap.org/?mlat=" << object.m_lat + << "&mlon=" << object.m_lon << "#map=18/" << object.m_lat << "/" << object.m_lon << endl; + } +} + +enum class DatasetType +{ + Booking, + Opentable +}; + +template +void GenerateSample(Dataset const & dataset, + map const & features, + ostream & ost) +{ + LOG_SHORT(LINFO, ("Num of elements:", features.size())); + vector elementIndexes(features.size()); + boost::copy(features | boost::adaptors::map_keys, begin(elementIndexes)); + + shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed)); + if (FLAGS_selection_size < elementIndexes.size()) + elementIndexes.resize(FLAGS_selection_size); + + stringstream outStream; + + for (auto osmId : elementIndexes) + { + auto const & fb = features.at(osmId); + auto const sponsoredIndexes = dataset.GetNearestObjects( + MercatorBounds::ToLatLon(fb.GetKeyPoint()), + Dataset::kMaxSelectedElements, + Dataset::kDistanceLimitInMeters); + + for (auto const sponsoredId : sponsoredIndexes) + { + auto const & object = dataset.GetObjectById(sponsoredId); + auto const score = sponsored_scoring::Match(object, fb); + + auto const center = MercatorBounds::ToLatLon(fb.GetKeyPoint()); + double const distanceMeters = ms::DistanceOnEarth(center.lat, center.lon, + object.m_lat, object.m_lon); + auto const matched = score.IsMatched(); + + outStream << "# ------------------------------------------" << fixed << setprecision(6) + << endl; + outStream << (matched ? 'y' : 'n') << " \t" << DebugPrint(osmId) << "\t " << sponsoredId + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; + outStream << "# " << PrintBuilder(fb) << endl; + outStream << "# " << object << endl; + outStream << "# URL: https://www.openstreetmap.org/?mlat=" + << object.m_lat << "&mlon=" << object.m_lon + << "#map=18/" << object.m_lat << "/" << object.m_lon << endl; + } + if (!sponsoredIndexes.empty()) + outStream << endl << endl; + } + + if (FLAGS_sample.empty()) + { + cout << outStream.str(); + } + else + { + ofstream file(FLAGS_sample); + if (file.is_open()) + file << outStream.str(); + else + LOG(LERROR, ("Can't output into", FLAGS_sample, strerror(errno))); + } +} + +template +string GetDatasetFilePath(feature::GenerateInfo const & info); + +template <> +string GetDatasetFilePath(feature::GenerateInfo const & info) +{ + return info.m_bookingDatafileName; +} + +template <> +string GetDatasetFilePath(feature::GenerateInfo const & info) +{ + return info.m_opentableDataFile; +} + +template +void RunImpl(feature::GenerateInfo & info) +{ + // TODO(mgsergio): Log correctly LOG_SHORT(LINFO, ("Booking data:", FLAGS_booking)); + Dataset dataset(GetDatasetFilePath(info)); + LOG_SHORT(LINFO, (dataset.Size(), "objects are loaded from a Dataset.")); + + map features; + GenerateFeatures(info, [&dataset, &features](feature::GenerateInfo const & /* info */) + { + return make_unique>(dataset, features); + }); + + if (FLAGS_generate) + { + ofstream ost(FLAGS_sample); + GenerateSample(dataset, features, ost); + } + else + { + auto const sample = ReadSampleFromFile(FLAGS_sample); + LOG(LINFO, ("Sample size is", sample.size())); + ofstream ost(FLAGS_factors); + CHECK(ost.is_open(), ("Can't open file", FLAGS_factors, strerror(errno))); + GenerateFactors(dataset, features, sample, ost); + } +} + +void Run(DatasetType const datasetType, feature::GenerateInfo & info) +{ + switch (datasetType) + { + case DatasetType::Booking: RunImpl(info); break; + case DatasetType::Opentable: RunImpl(info); break; } } } // namespace @@ -266,31 +400,19 @@ int main(int argc, char * argv[]) CHECK(!FLAGS_sample.empty(), ("Please specify sample path.")); CHECK(!FLAGS_osm.empty(), ("Please specify osm path.")); - CHECK(!FLAGS_booking.empty(), ("Please specify booking path.")); - CHECK(!FLAGS_factors.empty(), ("Please specify factors path.")); + CHECK(!FLAGS_booking.empty() ^ !FLAGS_opentable.empty(), + ("Please specify either booking or opentable path.")); + CHECK(!FLAGS_factors.empty() ^ FLAGS_generate, ("Please either specify factors path" + "or use -generate.")); + + auto const datasetType = FLAGS_booking.empty() ? DatasetType::Opentable : DatasetType::Booking; classificator::Load(); auto info = GetGenerateInfo(); GenerateIntermediateData(info); - LOG_SHORT(LINFO, ("Booking data:", FLAGS_booking)); - BookingDataset booking(info.m_bookingDatafileName); - LOG_SHORT(LINFO, (booking.Size(), "hotels are loaded from Booking.")); - - map features; - GenerateFeatures(info, [&booking, &features](feature::GenerateInfo const & /* info */) - { - return make_unique(booking, features); - }); - - auto const sample = ReadSampleFromFile(FLAGS_sample); - LOG(LINFO, ("Sample size is", sample.size())); - { - ofstream ost(FLAGS_factors); - CHECK(ost.is_open(), ("Can't open file", FLAGS_factors, strerror(errno))); - GenerateFactors(booking, features, sample, ost); - } + Run(datasetType, info); return 0; }