[search_quality] Booking dataset generation tool.

This commit is contained in:
tatiana-yan 2019-04-25 16:36:57 +03:00 committed by mpimenov
parent c08ce405d3
commit 58d86f929f
5 changed files with 250 additions and 0 deletions

View file

@ -550,6 +550,11 @@ char const * IsHotelChecker::GetHotelTypeTag(Type type)
UNREACHABLE();
}
// Registers the classificator type found at path {"sponsored", "booking"} as the only
// type stored in |m_types| by this checker.
IsBookingHotelChecker::IsBookingHotelChecker()
{
  auto const bookingType = classif().GetTypeByPath({"sponsored", "booking"});
  m_types.push_back(bookingType);
}
IsWifiChecker::IsWifiChecker()
{
m_types.push_back(classif().GetTypeByPath({"internet_access", "wlan"}));

View file

@ -303,6 +303,14 @@ private:
std::array<std::pair<uint32_t, Type>, base::Underlying(Type::Count)> m_sortedTypes;
};
// Checker for the classificator type at path {"sponsored", "booking"} (the type is
// registered in the constructor); the matching logic itself is inherited from BaseChecker.
class IsBookingHotelChecker : public BaseChecker
{
// The constructor is private (default class access); callers use Instance().
// NOTE(review): Instance() is presumably generated by DECLARE_CHECKER_INSTANCE - confirm.
IsBookingHotelChecker();
public:
DECLARE_CHECKER_INSTANCE(IsBookingHotelChecker);
};
// WiFi is a type in classificator.txt,
// it should be checked for filling metadata in MapObject.
class IsWifiChecker : public BaseChecker

View file

@ -17,6 +17,7 @@ if (NOT SKIP_DESKTOP)
add_subdirectory(assessment_tool)
endif()
# Booking dataset generation tool.
add_subdirectory(booking_dataset_generator)
add_subdirectory(features_collector_tool)
add_subdirectory(search_quality_tool)
# Tests for the search quality code.
omim_add_test_subdirectory(search_quality_tests)

View file

@ -0,0 +1,35 @@
# Booking dataset generation tool: a single-source command-line executable.
project(booking_dataset_generator)
# Make the bundled gflags headers (3party copy) available to the compiler.
include_directories(${OMIM_ROOT}/3party/gflags/src)
set(SRC booking_dataset_generator.cpp)
omim_add_executable(${PROJECT_NAME} ${SRC})
# Libraries the executable is linked against.
# NOTE(review): the order may matter when these are static libraries - do not reorder
# without checking the link step.
omim_link_libraries(
${PROJECT_NAME}
search
search_quality
storage
editor
indexer
platform
mwm_diff
bsdiff
geometry
coding
base
oauthcpp
gflags
jansson
protobuf
stats_client
minizip
succinct
opening_hours
pugixml
icu
${Qt5Core_LIBRARIES}
${Qt5Network_LIBRARIES}
${LIBZ}
)

View file

@ -0,0 +1,201 @@
#include "search/result.hpp"
#include "search/search_quality/helpers.hpp"
#include "search/search_quality/sample.hpp"
#include "search/utils.hpp"
#include "indexer/classificator_loader.hpp"
#include "indexer/data_source.hpp"
#include "indexer/feature_algo.hpp"
#include "indexer/ftypes_matcher.hpp"
#include "indexer/scales.hpp"
#include "storage/country_info_getter.hpp"
#include "storage/storage.hpp"
#include "storage/storage_defines.hpp"
#include "coding/file_name_utils.hpp"
#include "platform/local_country_file.hpp"
#include "platform/local_country_file_utils.hpp"
#include "platform/platform.hpp"
#include "geometry/mercator.hpp"
#include "base/macros.hpp"
#include "base/string_utils.hpp"
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <set>
#include <string>
#include <vector>
#include "3party/gflags/src/gflags/gflags.h"
#include "defines.hpp"
using namespace search;
using namespace std;
using namespace storage;
// Command-line flags (gflags).
// When non-empty, |data_path| becomes the platform resources dir and the countries file
// is looked up inside it.
DEFINE_string(data_path, "", "Path to data directory (resources dir)");
// When non-empty, |mwm_path| becomes the platform writable dir.
DEFINE_string(mwm_path, "", "Path to mwm files (writable dir)");
// File the generated samples are written to.
DEFINE_string(out_path, "samples.json", "Path to output samples file");
// Builds a search-quality sample for |hotel| as seen by a user located at |userPos| and
// returns it serialized with Sample::SerializeToJSONLines.
//
// The query is the hotel name (English name first, default name as a fallback) followed by
// a space; the viewport is built around |userPos| with kViewportRadiusM; the hotel itself
// is the only result and it is marked as Vital.
//
// Returns an empty string (and logs the feature id) when the hotel has neither an English
// nor a default name.
string GetSampleString(FeatureType & hotel, m2::PointD const & userPos)
{
  double constexpr kViewportRadiusM = 1000.0;

  string name;
  // Short-circuit: the default name is requested only if there is no English one.
  bool const hasName = hotel.GetName(StringUtf8Multilang::kEnglishCode, name) ||
                       hotel.GetName(StringUtf8Multilang::kDefaultCode, name);
  if (!hasName)
  {
    LOG(LINFO, ("Cannot get name for", hotel.GetID()));
    return {};
  }

  Sample sample;
  sample.m_query = strings::MakeUniString(name + " ");
  sample.m_locale = "en";
  sample.m_pos = userPos;
  sample.m_viewport = MercatorBounds::RectByCenterXYAndSizeInMeters(userPos, kViewportRadiusM);

  auto const expected = Sample::Result::Build(hotel, Sample::Result::Relevance::Vital);
  sample.m_results.push_back(expected);

  string serialized;
  Sample::SerializeToJSONLines({sample}, serialized);
  return serialized;
}
// Entry point of the Booking dataset generator.
//
// Registers all local mwms and writes search-quality samples (see GetSampleString) to
// |FLAGS_out_path|:
//   1. For every airport in World.mwm and every Booking hotel within kDistanceToHotelM of
//      it, a sample with the user position at the airport. A hotel close to several
//      airports gets one sample per airport.
//   2. For every Booking hotel that got no sample in step 1, a sample with the user
//      position shifted away from the hotel.
//
// Returns 0 on success and -1 when the output file cannot be opened or written, World.mwm
// is not found, or an mwm cannot be read.
int main(int argc, char * argv[])
{
  ChangeMaxNumberOfOpenFiles(kMaxOpenFiles);
  CheckLocale();

  google::SetUsageMessage("Booking dataset generator.");
  google::ParseCommandLineFlags(&argc, &argv, true);

  Platform & platform = GetPlatform();

  string countriesFile = COUNTRIES_FILE;
  if (!FLAGS_data_path.empty())
  {
    platform.SetResourceDir(FLAGS_data_path);
    countriesFile = base::JoinPath(FLAGS_data_path, COUNTRIES_FILE);
  }

  if (!FLAGS_mwm_path.empty())
    platform.SetWritableDirForTests(FLAGS_mwm_path);

  ofstream out;
  out.open(FLAGS_out_path);
  if (!out.is_open())
  {
    LOG(LERROR, ("Can't open output file", FLAGS_out_path));
    return -1;
  }

  LOG(LINFO, ("writable dir =", platform.WritableDir()));
  LOG(LINFO, ("resources dir =", platform.ResourcesDir()));

  // The tool never downloads or deletes maps, so the storage callbacks are no-ops.
  auto didDownload = [](CountryId const &, shared_ptr<platform::LocalCountryFile> const &) {};
  auto willDelete = [](CountryId const &, shared_ptr<platform::LocalCountryFile> const &) {
    return false;
  };

  Storage storage(countriesFile);
  storage.Init(didDownload, willDelete);
  auto infoGetter = CountryInfoReader::CreateCountryInfoReader(platform);
  infoGetter->InitAffiliationsInfo(&storage.GetAffiliations());

  classificator::Load();
  FrozenDataSource dataSource;

  vector<platform::LocalCountryFile> mwms;
  platform::FindAllLocalMapsAndCleanup(numeric_limits<int64_t>::max() /* the latest version */,
                                       mwms);
  for (auto & mwm : mwms)
  {
    mwm.SyncWithDisk();
    dataSource.RegisterMap(mwm);
  }

  auto const & hotelChecker = ftypes::IsBookingHotelChecker::Instance();

  // For all airports from World.mwm (international or other important airports) and all
  // hotels which are closer than 100 km to the airport we create a sample with
  // query=|hotel name| and both viewport and position at the airport.
  double constexpr kDistanceToHotelM = 1e5;
  std::set<FeatureID> hotelsNextToAirport;
  {
    auto const handle = FindWorld(dataSource);
    if (!handle.IsAlive())
    {
      LOG(LERROR, ("Cannot find World.mwm"));
      return -1;
    }

    auto const & airportChecker = ftypes::IsAirportChecker::Instance();
    FeaturesLoaderGuard const guard(dataSource, handle.GetId());
    for (uint32_t i = 0; i < guard.GetNumFeatures(); ++i)
    {
      auto airport = guard.GetFeatureByIndex(i);
      // The loader returns a pointer-like object; skip features that could not be loaded
      // instead of dereferencing an empty pointer.
      if (!airport || !airportChecker(*airport))
        continue;

      auto const airportPos = feature::GetCenter(*airport);
      auto addHotel = [&](FeatureType & hotel) {
        if (!hotelChecker(hotel))
          return;

        // The rect passed to ForEachInRect is only a coarse filter; check the real distance.
        if (MercatorBounds::DistanceOnEarth(airportPos, feature::GetCenter(hotel)) >
            kDistanceToHotelM)
        {
          return;
        }

        string json = GetSampleString(hotel, airportPos);
        if (json.empty())
          return;
        out << json;
        hotelsNextToAirport.insert(hotel.GetID());
      };

      dataSource.ForEachInRect(
          addHotel, MercatorBounds::RectByCenterXYAndSizeInMeters(airportPos, kDistanceToHotelM),
          scales::GetUpperScale());
    }
    LOG(LINFO, (hotelsNextToAirport.size(), "hotels have nearby airport."));
  }

  // For all hotels without an airport nearby we set the user position 100 km away from the
  // hotel: the same offset is passed for both axes, so each one is 100 km / sqrt(2).
  double const kRadiusToHotelM = kDistanceToHotelM / sqrt(2.0);

  vector<shared_ptr<MwmInfo>> infos;
  dataSource.GetMwmsInfo(infos);
  for (auto const & info : infos)
  {
    auto const handle = dataSource.GetMwmHandleById(MwmSet::MwmId(info));
    if (!handle.IsAlive())
    {
      LOG(LERROR, ("Mwm reading error", info));
      return -1;
    }

    FeaturesLoaderGuard const guard(dataSource, handle.GetId());
    for (uint32_t i = 0; i < guard.GetNumFeatures(); ++i)
    {
      auto hotel = guard.GetFeatureByIndex(i);
      // Same guard as for airports: never dereference a feature that failed to load.
      if (!hotel || !hotelChecker(*hotel))
        continue;

      if (hotelsNextToAirport.count(hotel->GetID()) != 0)
        continue;

      string json = GetSampleString(
          *hotel,
          MercatorBounds::GetSmPoint(feature::GetCenter(*hotel), kRadiusToHotelM, kRadiusToHotelM));

      if (!json.empty())
        out << json;
    }
  }

  // Do not report success for a truncated dataset (e.g. the disk is full).
  out.flush();
  if (!out)
  {
    LOG(LERROR, ("Can't write to output file", FLAGS_out_path));
    return -1;
  }

  return 0;
}