From d8ab0550707ddb5114d18509b936b49093da1230 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Thu, 22 Aug 2019 15:27:16 +0300 Subject: [PATCH] [search_quality] Add address mode for booking_dataset_generator tool. --- .../booking_dataset_generator.cpp | 137 ++++++++++++++++-- 1 file changed, 124 insertions(+), 13 deletions(-) diff --git a/search/search_quality/booking_dataset_generator/booking_dataset_generator.cpp b/search/search_quality/booking_dataset_generator/booking_dataset_generator.cpp index ab550c53cc..fcaca4839a 100644 --- a/search/search_quality/booking_dataset_generator/booking_dataset_generator.cpp +++ b/search/search_quality/booking_dataset_generator/booking_dataset_generator.cpp @@ -27,7 +27,9 @@ #include #include #include +#include #include +#include #include #include @@ -40,23 +42,32 @@ using namespace search; using namespace std; using namespace storage; -DEFINE_string(data_path, "", "Path to data directory (resources dir)"); -DEFINE_string(mwm_path, "", "Path to mwm files (writable dir)"); -DEFINE_string(out_path, "samples.json", "Path to output samples file"); +DEFINE_string(data_path, "", "Path to data directory (resources dir)."); +DEFINE_string(mwm_path, "", "Path to mwm files (writable dir)."); +DEFINE_string(out_path, "samples.json", "Path to output samples file."); +DEFINE_string(dataset_type, "name", + "Dataset type: name (search hotel by name) or address (search hotel by address)."); +DEFINE_string(address_dataset_path, "", "Path to address dataset."); -string GetSampleString(FeatureType & hotel, m2::PointD const & userPos) +string GetSampleString(FeatureType & hotel, m2::PointD const & userPos, string const & address) { Sample sample; string hotelName; double constexpr kViewportRadiusM = 1000.0; - if (!hotel.GetName(StringUtf8Multilang::kEnglishCode, hotelName) && - !hotel.GetName(StringUtf8Multilang::kDefaultCode, hotelName)) + if (!address.empty()) { - LOG(LINFO, ("Cannot get name for", hotel.GetID())); - return ""; + sample.m_query = strings::MakeUniString(address + " "); + } + else + { + if (!hotel.GetName(StringUtf8Multilang::kEnglishCode, hotelName) && + !hotel.GetName(StringUtf8Multilang::kDefaultCode, hotelName)) + { + LOG(LINFO, ("Cannot get name for", hotel.GetID())); + return ""; + } + sample.m_query = strings::MakeUniString(hotelName + " "); } - - sample.m_query = strings::MakeUniString(hotelName + " "); sample.m_locale = "en"; sample.m_pos = userPos; sample.m_viewport = MercatorBounds::RectByCenterXYAndSizeInMeters(userPos, kViewportRadiusM); @@ -66,6 +77,56 @@ string GetSampleString(FeatureType & hotel, m2::PointD const & userPos) return json; } +enum class Fields : uint8_t +{ + SponsoredId = 0, + Address = 1, + Zip = 2, + City = 3, + District = 4, + Country = 5, + Count = 6 +}; + +string CreateAddress(vector const & fields) +{ + string result = fields[base::Underlying(Fields::Address)]; + if (result.empty()) + return {}; + + auto const district = fields[base::Underlying(Fields::District)]; + if (district != "None") + result += ", " + district; + result += ", " + fields[base::Underlying(Fields::Zip)]; + result += ", " + fields[base::Underlying(Fields::City)]; + result += ", " + fields[base::Underlying(Fields::Country)]; + return result; +} + +map ParseAddressDataset(string const & filename) +{ + if (filename.empty()) + return {}; + + map result; + ifstream data(filename); + for (string line; getline(data, line);) + { + vector fields; + strings::ParseCSVRow(line, '\t', fields); + CHECK_EQUAL(fields.size(), base::Underlying(Fields::Count), ()); + auto const id = fields[base::Underlying(Fields::SponsoredId)]; + auto const address = CreateAddress(fields); + if (address.empty()) + continue; + auto const ret = result.emplace(id, address); + // Hotel may appear several times. + if (!ret.second) + CHECK_EQUAL(result[id], address, ()); + } + return result; +} + int main(int argc, char * argv[]) { ChangeMaxNumberOfOpenFiles(kMaxOpenFiles); @@ -74,6 +135,20 @@ int main(int argc, char * argv[]) google::SetUsageMessage("Booking dataset generator."); google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_dataset_type != "name" && FLAGS_dataset_type != "address") + { + LOG(LERROR, ("Wrong dataset type:", FLAGS_dataset_type, ". Supported types: name, address")); + return -1; + } + + auto const generateAddress = FLAGS_dataset_type == "address"; + + if (generateAddress && FLAGS_address_dataset_path.empty()) + { + LOG(LERROR, ("Set address_dataset_path.")); + return -1; + } + SetPlatformDirs(FLAGS_data_path, FLAGS_mwm_path); classificator::Load(); @@ -91,11 +166,29 @@ int main(int argc, char * argv[]) auto const & hotelChecker = ftypes::IsBookingHotelChecker::Instance(); + map addressData; + if (generateAddress) + { + addressData = ParseAddressDataset(FLAGS_address_dataset_path); + } + + auto const getAddress = [&](FeatureType & hotel) -> string { + auto const id = hotel.GetMetadata().Get(feature::Metadata::FMD_SPONSORED_ID); + if (id.empty()) + return {}; + + auto const it = addressData.find(id); + if (it == addressData.end()) + return {}; + + return it->second; + }; + // For all airports from World.mwm (international or other important airports) and all // hotels which are closer than 100 km from airport we create sample with query=|hotel name| and // viewport and position in the airport. double constexpr kDistanceToHotelM = 1e5; - std::set hotelsNextToAirport; + set hotelsNextToAirport; { auto const handle = indexer::FindWorld(dataSource); if (!handle.IsAlive()) @@ -123,7 +216,15 @@ int main(int argc, char * argv[]) return; } - string json = GetSampleString(hotel, airportPos); + string address; + if (generateAddress) + { + address = getAddress(hotel); + if (address.empty()) + return; + } + + string const json = GetSampleString(hotel, airportPos, address); if (json.empty()) return; out << json; @@ -154,13 +255,23 @@ int main(int argc, char * argv[]) auto hotel = guard.GetFeatureByIndex(i); if (!hotelChecker(*hotel)) continue; + if (hotelsNextToAirport.count(hotel->GetID()) != 0) continue; + string address; + if (generateAddress) + { + address = getAddress(*hotel); + if (address.empty()) + continue; + } + static double kRadiusToHotelM = kDistanceToHotelM / sqrt(2.0); string json = GetSampleString( *hotel, - MercatorBounds::GetSmPoint(feature::GetCenter(*hotel), kRadiusToHotelM, kRadiusToHotelM)); + MercatorBounds::GetSmPoint(feature::GetCenter(*hotel), kRadiusToHotelM, kRadiusToHotelM), + address); if (!json.empty()) out << json;