From a82efed41564a5900637d7e362e05defa52c7af6 Mon Sep 17 00:00:00 2001
From: tatiana-yan
Date: Mon, 16 Dec 2019 17:03:09 +0300
Subject: [PATCH] [search][search_quality] Add tool which creates search samples from aloha.

---
 search/search_quality/CMakeLists.txt          |   1 +
 .../aloha_to_samples_tool/CMakeLists.txt      |  36 +++
 .../aloha_to_samples_tool.cpp                 | 280 ++++++++++++++++++
 3 files changed, 317 insertions(+)
 create mode 100644 search/search_quality/aloha_to_samples_tool/CMakeLists.txt
 create mode 100644 search/search_quality/aloha_to_samples_tool/aloha_to_samples_tool.cpp

diff --git a/search/search_quality/CMakeLists.txt b/search/search_quality/CMakeLists.txt
index 3bf9ab751c..1250f8ee12 100644
--- a/search/search_quality/CMakeLists.txt
+++ b/search/search_quality/CMakeLists.txt
@@ -19,6 +19,7 @@ if (NOT SKIP_DESKTOP)
   add_subdirectory(assessment_tool)
 endif()
 
+add_subdirectory(aloha_to_samples_tool)
 add_subdirectory(booking_dataset_generator)
 add_subdirectory(features_collector_tool)
 add_subdirectory(samples_generation_tool)
diff --git a/search/search_quality/aloha_to_samples_tool/CMakeLists.txt b/search/search_quality/aloha_to_samples_tool/CMakeLists.txt
new file mode 100644
index 0000000000..a4f974e654
--- /dev/null
+++ b/search/search_quality/aloha_to_samples_tool/CMakeLists.txt
@@ -0,0 +1,36 @@
+project(aloha_to_samples_tool)
+
+include_directories(${OMIM_ROOT}/3party/gflags/src)
+
+set(SRC aloha_to_samples_tool.cpp)
+
+omim_add_executable(${PROJECT_NAME} ${SRC})
+
+omim_link_libraries(
+  ${PROJECT_NAME}
+  search_quality
+  search_tests_support
+  search
+  storage
+  editor
+  indexer
+  platform
+  mwm_diff
+  bsdiff
+  geometry
+  coding
+  base
+  oauthcpp
+  gflags
+  jansson
+  protobuf
+  stats_client
+  minizip
+  succinct
+  opening_hours
+  pugixml
+  icu
+  ${Qt5Core_LIBRARIES}
+  ${Qt5Network_LIBRARIES}
+  ${LIBZ}
+)
diff --git a/search/search_quality/aloha_to_samples_tool/aloha_to_samples_tool.cpp b/search/search_quality/aloha_to_samples_tool/aloha_to_samples_tool.cpp
new file mode 100644
index 0000000000..a115b98fcc
--- /dev/null
+++ b/search/search_quality/aloha_to_samples_tool/aloha_to_samples_tool.cpp
@@ -0,0 +1,280 @@
+#include "search/search_quality/sample.hpp"
+
+#include "indexer/categories_holder.hpp"
+#include "indexer/classificator_loader.hpp"
+#include "indexer/search_string_utils.hpp"
+
+#include "platform/platform.hpp"
+
+#include "geometry/mercator.hpp"
+#include "geometry/point2d.hpp"
+#include "geometry/rect2d.hpp"
+
+#include "base/logging.hpp"
+#include "base/string_utils.hpp"
+
+#include <cstdint>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <boost/optional.hpp>
+
+#include "3party/gflags/src/gflags/gflags.h"
+
+#define ALOHALYTICS_SERVER
+#include "3party/Alohalytics/src/event_base.h"
+
+using namespace search;
+using namespace std;
+using namespace strings;
+
+DEFINE_string(data_path, "", "Path to data directory (resources dir).");
+DEFINE_bool(categorial_only, false, "Save only categorial requests.");
+
+// Data parsed from a "searchEmitResultsAndCoords" event.
+struct EmitInfo
+{
+  m2::PointD m_pos;
+  m2::RectD m_viewport;
+  // Raw result strings with the leading results-count token removed.
+  vector<string> m_results;
+  string m_query;
+  string m_locale;
+  bool m_isCategorial = false;
+};
+
+// Data parsed from a "searchShowResult" event.
+struct ResultInfo
+{
+  // Index of the shown result in EmitInfo::m_results.
+  size_t m_pos = 0;
+  string m_result;
+};
+
+// Returns true if |query| ends with a space and, after trimming and normalization, equals
+// a category name in |locale| or in English.
+bool IsCategorialRequest(string query, uint8_t locale)
+{
+  if (query.empty() || query.back() != ' ')
+    return false;
+
+  Trim(query);
+  auto const normalizedQuery = NormalizeAndSimplifyString(query);
+
+  auto const & catHolder = GetDefaultCategories();
+
+  bool isCategorialRequest = false;
+  catHolder.ForEachName([&](auto const & categorySynonym) {
+    // A match has already been found; nothing more to check.
+    if (isCategorialRequest)
+      return;
+
+    if (categorySynonym.m_locale != locale &&
+        categorySynonym.m_locale != CategoriesHolder::kEnglishCode)
+    {
+      return;
+    }
+
+    auto const normalizedCat = NormalizeAndSimplifyString(categorySynonym.m_name);
+    if (normalizedCat == normalizedQuery)
+      isCategorialRequest = true;
+  });
+  return isCategorialRequest;
+}
+
+// Builds EmitInfo from the key-value pairs of a "searchEmitResultsAndCoords" event.
+// Returns an empty optional if a required key is missing or a coordinate is not a number.
+boost::optional<EmitInfo> ParseEmitResultsAndCoords(map<string, string> const & kpePairs)
+{
+  EmitInfo info;
+  try
+  {
+    if (!to_double(kpePairs.at("posX"), info.m_pos.x))
+      return {};
+    if (!to_double(kpePairs.at("posY"), info.m_pos.y))
+      return {};
+
+    double minX, minY, maxX, maxY;
+    bool gotViewport = true;
+    gotViewport = gotViewport && to_double(kpePairs.at("viewportMinX"), minX);
+    gotViewport = gotViewport && to_double(kpePairs.at("viewportMinY"), minY);
+    gotViewport = gotViewport && to_double(kpePairs.at("viewportMaxX"), maxX);
+    gotViewport = gotViewport && to_double(kpePairs.at("viewportMaxY"), maxY);
+    if (!gotViewport)
+      return {};
+    info.m_viewport = m2::RectD(minX, minY, maxX, maxY);
+
+    info.m_locale = kpePairs.at("locale");
+    info.m_query = kpePairs.at("query");
+
+    auto const locale = CategoriesHolder::MapLocaleToInteger(info.m_locale);
+    info.m_isCategorial = IsCategorialRequest(info.m_query, locale);
+
+    info.m_results = Tokenize(kpePairs.at("results"), "\t");
+    // Skip results number.
+    if (!info.m_results.empty())
+      info.m_results.erase(info.m_results.begin());
+  }
+  catch (out_of_range const &)
+  {
+    // map::at() throws when a required key is absent.
+    return {};
+  }
+
+  return info;
+}
+
+// Builds ResultInfo from the key-value pairs of a "searchShowResult" event.
+// Returns an empty optional if a required key is missing or "pos" is not a number.
+boost::optional<ResultInfo> ParseShowResult(map<string, string> const & kpePairs)
+{
+  ResultInfo info;
+  try
+  {
+    if (!to_size_t(kpePairs.at("pos"), info.m_pos))
+      return {};
+    info.m_result = kpePairs.at("result");
+  }
+  catch (out_of_range const &)
+  {
+    // map::at() throws when a required key is absent.
+    return {};
+  }
+
+  return info;
+}
+
+// Parses one "|"-separated result string: name, type, suggest flag, lat, lon.
+// Returns an empty optional for suggests and for results without valid coordinates.
+boost::optional<Sample::Result> ParseResultWithCoords(string const & str)
+{
+  auto const tokens = Tokenize(str, "|");
+  // No coords.
+  if (tokens.size() < 5)
+    return {};
+
+  // Suggest.
+  if (tokens[2] == "1")
+    return {};
+
+  Sample::Result res;
+  res.m_name = MakeUniString(tokens[0]);
+  res.m_types = {tokens[1]};
+
+  double lat;
+  if (!to_double(tokens[3], lat))
+    return {};
+
+  double lon;
+  if (!to_double(tokens[4], lon))
+    return {};
+
+  res.m_pos = mercator::FromLatLon(lat, lon);
+
+  return res;
+}
+
+// Converts |info| to a Sample where only the result at |relevantPos| is marked relevant.
+// Returns an empty optional if any result string cannot be parsed.
+boost::optional<Sample> MakeSample(EmitInfo const & info, size_t relevantPos)
+{
+  Sample sample;
+  sample.m_query = MakeUniString(info.m_query);
+  sample.m_locale = info.m_locale;
+  sample.m_pos = info.m_pos;
+  sample.m_viewport = info.m_viewport;
+  sample.m_results.reserve(info.m_results.size());
+  for (size_t i = 0; i < info.m_results.size(); ++i)
+  {
+    auto res = ParseResultWithCoords(info.m_results[i]);
+    if (!res)
+      return {};
+
+    res->m_relevance = i == relevantPos ? Sample::Result::Relevance::Relevant
+                                        : Sample::Result::Relevance::Irrelevant;
+    sample.m_results.push_back(*res);
+  }
+  return sample;
+}
+
+// Reads serialized events from stdin and writes samples to stdout via SerializeToJSONLines.
+int main(int argc, char * argv[])
+{
+  google::SetUsageMessage("This tool converts events from Alohalytics to search samples.");
+  google::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (!FLAGS_data_path.empty())
+  {
+    Platform & platform = GetPlatform();
+    platform.SetResourceDir(FLAGS_data_path);
+    platform.SetWritableDirForTests(FLAGS_data_path);
+  }
+
+  classificator::Load();
+
+  cereal::BinaryInputArchive ar(std::cin);
+  unique_ptr<AlohalyticsBaseEvent> ptr;
+  // Prevents matching a show event against an emit event that precedes the latest id event.
+  bool newUser = true;
+  boost::optional<EmitInfo> info;
+
+  while (true)
+  {
+    try
+    {
+      ar(ptr);
+    }
+    catch (cereal::Exception const & ex)
+    {
+      // This particular exception is a normal one: Cereal cannot detect the end of the stream.
+      if (string("Failed to read 4 bytes from input stream! Read 0") == ex.what())
+        return 0;
+
+      cerr << ex.what() << endl;
+      return -1;
+    }
+
+    auto const * event = dynamic_cast<AlohalyticsIdServerEvent const *>(ptr.get());
+    if (event)
+    {
+      newUser = true;
+      continue;
+    }
+
+    auto const * kpe = dynamic_cast<AlohalyticsKeyPairsEvent const *>(ptr.get());
+    if (!kpe)
+      continue;
+
+    if (kpe->key == "searchEmitResultsAndCoords")
+    {
+      info = ParseEmitResultsAndCoords(kpe->pairs);
+      newUser = false;
+    }
+    else if (kpe->key == "searchShowResult" && !newUser)
+    {
+      auto const result = ParseShowResult(kpe->pairs);
+      if (!info || !result)
+        continue;
+
+      if (FLAGS_categorial_only && !info->m_isCategorial)
+        continue;
+
+      auto const resultMatches = info->m_results.size() > result->m_pos &&
+                                 info->m_results[result->m_pos] == result->m_result;
+      if (!resultMatches)
+        continue;
+
+      if (auto const sample = MakeSample(*info, result->m_pos))
+      {
+        string json;
+        Sample::SerializeToJSONLines({*sample}, json);
+        cout << json;
+      }
+    }
+  }
+  return 0;
+}