[search_quality] Add address mode for booking_dataset_generator tool.

This commit is contained in:
tatiana-yan 2019-08-22 15:27:16 +03:00 committed by mpimenov
parent 4d6b881baa
commit d8ab055070

View file

@ -27,7 +27,9 @@
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <string>
#include <vector>
@ -40,23 +42,32 @@ using namespace search;
using namespace std;
using namespace storage;
DEFINE_string(data_path, "", "Path to data directory (resources dir)");
DEFINE_string(mwm_path, "", "Path to mwm files (writable dir)");
DEFINE_string(out_path, "samples.json", "Path to output samples file");
DEFINE_string(data_path, "", "Path to data directory (resources dir).");
DEFINE_string(mwm_path, "", "Path to mwm files (writable dir).");
DEFINE_string(out_path, "samples.json", "Path to output samples file.");
DEFINE_string(dataset_type, "name",
"Dataset type: name (search hotel by name) or address (search hotel by address).");
DEFINE_string(address_dataset_path, "", "Path to address dataset.");
string GetSampleString(FeatureType & hotel, m2::PointD const & userPos)
string GetSampleString(FeatureType & hotel, m2::PointD const & userPos, string const & address)
{
Sample sample;
string hotelName;
double constexpr kViewportRadiusM = 1000.0;
if (!hotel.GetName(StringUtf8Multilang::kEnglishCode, hotelName) &&
!hotel.GetName(StringUtf8Multilang::kDefaultCode, hotelName))
if (!address.empty())
{
LOG(LINFO, ("Cannot get name for", hotel.GetID()));
return "";
sample.m_query = strings::MakeUniString(address + " ");
}
else
{
if (!hotel.GetName(StringUtf8Multilang::kEnglishCode, hotelName) &&
!hotel.GetName(StringUtf8Multilang::kDefaultCode, hotelName))
{
LOG(LINFO, ("Cannot get name for", hotel.GetID()));
return "";
}
sample.m_query = strings::MakeUniString(hotelName + " ");
}
sample.m_query = strings::MakeUniString(hotelName + " ");
sample.m_locale = "en";
sample.m_pos = userPos;
sample.m_viewport = MercatorBounds::RectByCenterXYAndSizeInMeters(userPos, kViewportRadiusM);
@ -66,6 +77,56 @@ string GetSampleString(FeatureType & hotel, m2::PointD const & userPos)
return json;
}
// Column layout of one row of the tab-separated address dataset. The numeric value of every
// enumerator is the index of the corresponding column in the parsed row, so the order below
// must match the order of the columns in the file.
enum class Fields : uint8_t
{
  SponsoredId,  // Column 0: sponsored id of the hotel.
  Address,      // Column 1: address line.
  Zip,          // Column 2: zip code.
  City,         // Column 3: city.
  District,     // Column 4: district.
  Country,      // Column 5: country.
  Count         // 6: number of columns, not a column itself.
};
// Joins the columns of one parsed dataset row into a single address string of the form
// "<address>[, <district>], <zip>, <city>, <country>".
//
// |fields| is indexed by the values of Fields and is not bounds-checked here, so the caller
// must pass a row that has all Fields::Count columns.
// Returns an empty string when the address column is empty. The district is left out when it
// holds the literal "None".
string CreateAddress(vector<string> const & fields)
{
  auto const column = [&fields](Fields f) -> string const & {
    return fields[base::Underlying(f)];
  };

  string const & street = column(Fields::Address);
  if (street.empty())
    return {};

  string address = street;

  string const & district = column(Fields::District);
  if (district != "None")
    address.append(", ").append(district);

  // The remaining parts are always appended, in this fixed order.
  for (auto const f : {Fields::Zip, Fields::City, Fields::Country})
    address.append(", ").append(column(f));

  return address;
}
// Reads the tab-separated address dataset |filename| and returns a mapping from the sponsored
// id of a hotel to the address string built by CreateAddress().
//
// Returns an empty map when |filename| is empty. Rows for which CreateAddress() returns an
// empty string are skipped. A hotel may appear in several rows as long as all of them yield
// the same address.
//
// Fails a CHECK when the file cannot be opened, when a row does not have exactly Fields::Count
// columns, or when one id is seen with two different addresses.
map<string, string> ParseAddressDataset(string const & filename)
{
  if (filename.empty())
    return {};

  ifstream data(filename);
  // Without this check an unreadable path is indistinguishable from an empty dataset: the loop
  // below simply never runs and the tool silently produces no samples.
  CHECK_EQUAL(data.is_open(), true, ("Cannot open address dataset:", filename));

  map<string, string> result;
  for (string line; getline(data, line);)
  {
    vector<string> fields;
    strings::ParseCSVRow(line, '\t', fields);
    CHECK_EQUAL(fields.size(), base::Underlying(Fields::Count), ());

    auto const address = CreateAddress(fields);
    if (address.empty())
      continue;

    auto const & id = fields[base::Underlying(Fields::SponsoredId)];
    auto const ret = result.emplace(id, address);
    // Hotel may appear several times; compare against the stored value through the iterator
    // returned by emplace instead of looking the id up again.
    if (!ret.second)
      CHECK_EQUAL(ret.first->second, address, ());
  }

  return result;
}
int main(int argc, char * argv[])
{
ChangeMaxNumberOfOpenFiles(kMaxOpenFiles);
@ -74,6 +135,20 @@ int main(int argc, char * argv[])
google::SetUsageMessage("Booking dataset generator.");
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_dataset_type != "name" && FLAGS_dataset_type != "address")
{
LOG(LERROR, ("Wrong dataset type:", FLAGS_dataset_type, ". Supported types: name, address"));
return -1;
}
auto const generateAddress = FLAGS_dataset_type == "address";
if (generateAddress && FLAGS_address_dataset_path.empty())
{
LOG(LERROR, ("Set address_dataset_path."));
return -1;
}
SetPlatformDirs(FLAGS_data_path, FLAGS_mwm_path);
classificator::Load();
@ -91,11 +166,29 @@ int main(int argc, char * argv[])
auto const & hotelChecker = ftypes::IsBookingHotelChecker::Instance();
map<string, string> addressData;
if (generateAddress)
{
addressData = ParseAddressDataset(FLAGS_address_dataset_path);
}
auto const getAddress = [&](FeatureType & hotel) -> string {
auto const id = hotel.GetMetadata().Get(feature::Metadata::FMD_SPONSORED_ID);
if (id.empty())
return {};
auto const it = addressData.find(id);
if (it == addressData.end())
return {};
return it->second;
};
// For every airport from World.mwm (international or other important airports) and every
// hotel closer than 100 km to that airport we create a sample with query=|hotel name| (or the
// hotel address in address mode) and with both viewport and position at the airport.
double constexpr kDistanceToHotelM = 1e5;
std::set<FeatureID> hotelsNextToAirport;
set<FeatureID> hotelsNextToAirport;
{
auto const handle = indexer::FindWorld(dataSource);
if (!handle.IsAlive())
@ -123,7 +216,15 @@ int main(int argc, char * argv[])
return;
}
string json = GetSampleString(hotel, airportPos);
string address;
if (generateAddress)
{
address = getAddress(hotel);
if (address.empty())
return;
}
string const json = GetSampleString(hotel, airportPos, address);
if (json.empty())
return;
out << json;
@ -154,13 +255,23 @@ int main(int argc, char * argv[])
auto hotel = guard.GetFeatureByIndex(i);
if (!hotelChecker(*hotel))
continue;
if (hotelsNextToAirport.count(hotel->GetID()) != 0)
continue;
string address;
if (generateAddress)
{
address = getAddress(*hotel);
if (address.empty())
continue;
}
static double kRadiusToHotelM = kDistanceToHotelM / sqrt(2.0);
string json = GetSampleString(
*hotel,
MercatorBounds::GetSmPoint(feature::GetCenter(*hotel), kRadiusToHotelM, kRadiusToHotelM));
MercatorBounds::GetSmPoint(feature::GetCenter(*hotel), kRadiusToHotelM, kRadiusToHotelM),
address);
if (!json.empty())
out << json;