From 58021db34c45a87752f1c4e9b42e3c6c1838941b Mon Sep 17 00:00:00 2001
From: Maksim Andrianov
Date: Tue, 29 Jan 2019 11:25:49 +0300
Subject: [PATCH] [generator] Improved descriptions using wikidata.

---
 generator/CMakeLists.txt                        |   2 +
 generator/collector_interface.hpp               |   2 -
 generator/collector_tag.cpp                     |  42 ++++++
 generator/collector_tag.hpp                     |  37 +++++
 generator/descriptions_section_builder.cpp      |  67 ++++++++-
 generator/descriptions_section_builder.hpp      |  79 ++++++++++-
 generator/generate_info.hpp                     |   2 +
 .../descriptions_section_builder_tests.cpp      |  17 ++-
 generator/generator_tool/generator_tool.cpp     |  18 ++-
 generator/osm_element.cpp                       |  14 ++
 generator/osm_element.hpp                       |   3 +
 generator/translator_planet.cpp                 |  28 +++-
 generator/translator_planet.hpp                 |   2 +
 generator/wiki_url_dumper.cpp                   |  63 +++++++++
 generator/wiki_url_dumper.hpp                   |  21 ++-
 tools/python/descriptions_downloader.py         | 130 ++++++++++++++----
 tools/unix/generate_planet.sh                   |  16 ++-
 17 files changed, 489 insertions(+), 54 deletions(-)
 create mode 100644 generator/collector_tag.cpp
 create mode 100644 generator/collector_tag.hpp

diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt
index 858ca4bb17..4eda851148 100644
--- a/generator/CMakeLists.txt
+++ b/generator/CMakeLists.txt
@@ -35,6 +35,8 @@ set(SRC
     coastlines_generator.cpp
     coastlines_generator.hpp
     collector_interface.hpp
+    collector_tag.cpp
+    collector_tag.hpp
     descriptions_section_builder.cpp
     descriptions_section_builder.hpp
     dumper.cpp

diff --git a/generator/collector_interface.hpp b/generator/collector_interface.hpp
index 876d51eb65..9ce9259331 100644
--- a/generator/collector_interface.hpp
+++ b/generator/collector_interface.hpp
@@ -1,7 +1,5 @@
 #pragma once

-#include "base/geo_object_id.hpp"
-
 #include

 struct OsmElement;

diff --git a/generator/collector_tag.cpp b/generator/collector_tag.cpp
new file mode 100644
index 0000000000..29a17c0fb2
--- /dev/null
+++ b/generator/collector_tag.cpp
@@ -0,0 +1,42 @@
+#include "generator/collector_tag.hpp"
+
+#include "generator/osm_element.hpp"
+
+#include "base/geo_object_id.hpp"
+#include "base/logging.hpp"
+
+namespace generator
+{
+CollectorTag::CollectorTag(std::string const & filename, std::string const & tagKey,
+                           Validator const & validator, bool ignoreIfNotOpen)
+  : m_tagKey(tagKey), m_validator(validator), m_needCollect(true)
+{
+  m_stream.exceptions(std::fstream::failbit | std::fstream::badbit);
+  try
+  {
+    m_stream.open(filename);
+  }
+  catch (std::ios::failure const & e)
+  {
+    if (ignoreIfNotOpen)
+    {
+      m_needCollect = false;
+      LOG(LINFO, ("Could not open file", filename, "- it will be ignored."));
+    }
+    else
+    {
+      throw e;
+    }
+  }
+}
+
+void CollectorTag::Collect(base::GeoObjectId const & osmId, OsmElement const & el)
+{
+  if (!m_needCollect)
+    return;
+
+  auto const tag = el.GetTag(m_tagKey);
+  if (!tag.empty() && m_validator(tag))
+    m_stream << osmId.GetEncodedId() << "\t" << tag << "\n";
+}
+}  // namespace generator

diff --git a/generator/collector_tag.hpp b/generator/collector_tag.hpp
new file mode 100644
index 0000000000..a776ab28f4
--- /dev/null
+++ b/generator/collector_tag.hpp
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "generator/collector_interface.hpp"
+
+#include <fstream>
+#include <functional>
+#include <string>
+
+struct OsmElement;
+namespace base
+{
+class GeoObjectId;
+}  // namespace base
+
+namespace generator
+{
+// CollectorTag collects the validated value of a tag and saves it to a file,
+// one "osmId<tab>tagValue" entry per line.
+class CollectorTag : public CollectorInterface
+{
+public:
+  using Validator = std::function<bool(std::string const &)>;
+
+  explicit CollectorTag(std::string const & filename, std::string const & tagKey,
+                        Validator const & validator, bool ignoreIfNotOpen = false);
+
+  // CollectorInterface overrides:
+  void Collect(base::GeoObjectId const & osmId, OsmElement const & el) override;
+  void Save() override {}
+
+private:
+  std::ofstream m_stream;
+  std::string m_tagKey;
+  Validator m_validator;
+  bool m_needCollect;
+};
+}  // namespace generator
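How the new collector is meant to be wired up may be easier to see with a minimal usage sketch (not part of the patch): the file name, element values, and the inline validator are illustrative, and AddTag is assumed to be OsmElement's existing tag setter; translator_planet.cpp below wires the real WikiDataValidator in the same way.

    #include "generator/collector_tag.hpp"
    #include "generator/osm_element.hpp"

    void CollectorTagExample()
    {
      // Accept only wikidata item ids such as "Q42" (simplified check for this sketch).
      generator::CollectorTag collector(
          "/tmp/id2wikidata.csv" /* filename */, "wikidata" /* tagKey */,
          [](std::string const & value) { return value.size() > 1 && value[0] == 'Q'; },
          true /* ignoreIfNotOpen */);

      OsmElement element;
      element.type = OsmElement::EntityType::Node;
      element.id = 42;
      element.AddTag("wikidata", "Q42");  // assumed helper on OsmElement

      // Appends an "<encoded osm id>\tQ42" line to /tmp/id2wikidata.csv.
      collector.Collect(GetGeoObjectId(element), element);
    }

GetGeoObjectId is introduced by this same patch, in osm_element.cpp below.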
diff --git a/generator/descriptions_section_builder.cpp b/generator/descriptions_section_builder.cpp
index 71086b8451..f58708d5e8 100644
--- a/generator/descriptions_section_builder.cpp
+++ b/generator/descriptions_section_builder.cpp
@@ -1,9 +1,13 @@
 #include "generator/descriptions_section_builder.hpp"

+#include "generator/routing_helpers.hpp"
+
 #include "platform/platform.hpp"

 #include "base/string_utils.hpp"

+#include "defines.hpp"
+
 #include
 #include
 #include
@@ -26,6 +30,39 @@ std::string GetFileName(std::string path)

 namespace generator
 {
+WikidataHelper::WikidataHelper(std::string const & mwmPath, std::string const & id2wikidataPath)
+  : m_mwmPath(mwmPath)
+  , m_id2wikidataPath(id2wikidataPath)
+{
+  std::string const osmIdsToFeatureIdsPath = m_mwmPath + OSM2FEATURE_FILE_EXTENSION;
+  if (!routing::ParseFeatureIdToOsmIdMapping(osmIdsToFeatureIdsPath, m_featureIdToOsmId))
+    LOG(LCRITICAL, ("Error parsing the OsmIdToFeatureId mapping."));
+
+  std::ifstream stream;
+  stream.exceptions(std::fstream::failbit | std::fstream::badbit);
+  stream.open(m_id2wikidataPath);
+  stream.exceptions(std::fstream::badbit);
+  uint64_t id;
+  std::string wikidataId;
+  while (stream)
+  {
+    stream >> id >> wikidataId;
+    strings::Trim(wikidataId);
+    m_osmIdToWikidataId.emplace(base::GeoObjectId(id), wikidataId);
+  }
+}
+
+boost::optional<std::string> WikidataHelper::GetWikidataId(uint32_t featureId) const
+{
+  auto const itFeatureIdToOsmId = m_featureIdToOsmId.find(featureId);
+  if (itFeatureIdToOsmId == std::end(m_featureIdToOsmId))
+    return {};
+
+  auto const itOsmIdToWikidataId = m_osmIdToWikidataId.find(itFeatureIdToOsmId->second);
+  return itOsmIdToWikidataId == std::end(m_osmIdToWikidataId) ?
+      boost::optional<std::string>() : itOsmIdToWikidataId->second;
+}
+
 std::string DescriptionsCollectionBuilderStat::LangStatisticsToString() const
 {
   std::stringstream stream;
@@ -45,12 +82,18 @@ std::string DescriptionsCollectionBuilderStat::LangStatisticsToString() const
   return stream.str();
 }

+DescriptionsCollectionBuilder::DescriptionsCollectionBuilder(std::string const & wikipediaDir,
+                                                             std::string const & mwmFile,
+                                                             std::string const & id2wikidataPath)
+  : m_wikidataHelper(mwmFile, id2wikidataPath), m_wikipediaDir(wikipediaDir), m_mwmFile(mwmFile) {}
+
 DescriptionsCollectionBuilder::DescriptionsCollectionBuilder(std::string const & wikipediaDir,
                                                              std::string const & mwmFile)
   : m_wikipediaDir(wikipediaDir), m_mwmFile(mwmFile) {}

 // static
-std::string DescriptionsCollectionBuilder::MakePath(std::string const & wikipediaDir, std::string wikipediaUrl)
+std::string DescriptionsCollectionBuilder::MakePathForWikipedia(std::string const & wikipediaDir,
+                                                                std::string wikipediaUrl)
 {
   strings::Trim(wikipediaUrl);
   strings::ReplaceFirst(wikipediaUrl, "http://", "");
@@ -61,6 +104,13 @@ std::string DescriptionsCollectionBuilder::MakePath(std::string const & wikipedi
   return base::JoinPath(wikipediaDir, wikipediaUrl);
 }

+// static
+std::string DescriptionsCollectionBuilder::MakePathForWikidata(std::string const & wikipediaDir,
+                                                               std::string wikidataId)
+{
+  return base::JoinPath(wikipediaDir, "wikidata", wikidataId);
+}
+
 // static
 size_t DescriptionsCollectionBuilder::FillStringFromFile(std::string const & fullPath, int8_t code,
                                                          StringUtf8Multilang & str)
@@ -76,10 +126,9 @@ size_t DescriptionsCollectionBuilder::FillStringFromFile(std::string const & ful
   return contentSize;
 }

-boost::optional<size_t> DescriptionsCollectionBuilder::FindPageAndFill(std::string wikipediaUrl,
+boost::optional<size_t> DescriptionsCollectionBuilder::FindPageAndFill(std::string path,
                                                                        StringUtf8Multilang & str)
 {
-  auto const path = MakePath(m_wikipediaDir, wikipediaUrl);
   if (!IsValidDir(path))
   {
     LOG(LWARNING, ("Directory", path, "not found."));
@@ -108,14 +157,14 @@ boost::optional DescriptionsCollectionBuilder::FindPageAndFill(std::stri
   return size;
 }

-size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const & wikiUrl, uint32_t featureId,
+size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const & path, uint32_t featureId,
                                                             descriptions::FeatureDescription & description)
 {
-  if (wikiUrl.empty())
+  if (path.empty())
     return 0;

   StringUtf8Multilang string;
-  auto const ret = FindPageAndFill(wikiUrl, string);
+  auto const ret = FindPageAndFill(path, string);
   if (!ret || *ret == 0)
     return 0;

@@ -123,6 +172,12 @@ size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const &
   return *ret;
 }

+void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile,
+                              std::string const & id2wikidataPath)
+{
+  DescriptionsSectionBuilder<>::Build(wikipediaDir, mwmFile, id2wikidataPath);
+}
+
 void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile)
 {
   DescriptionsSectionBuilder<>::Build(wikipediaDir, mwmFile);
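As an illustration of the resulting on-disk layout (the arguments are assumed; the values mirror the unit tests further down), the two path helpers point to:

    // MakePathForWikipedia strips the protocol and a trailing slash, then joins:
    auto const wikiPath = generator::DescriptionsCollectionBuilder::MakePathForWikipedia(
        "/wiki", "https://en.wikipedia.org/wiki/Helsinki_Olympic_Stadium");
    // wikiPath == "/wiki/en.wikipedia.org/wiki/Helsinki_Olympic_Stadium"

    // MakePathForWikidata files pages under a fixed "wikidata" subdirectory:
    auto const wikidataPath =
        generator::DescriptionsCollectionBuilder::MakePathForWikidata("/wiki", "Q42");
    // wikidataPath == "/wiki/wikidata/Q42"

The downloader script mirrors this layout: wikipedia pages are saved under <output_dir>/<netloc>/<path> and wikidata pages under <output_dir>/wikidata/<id>.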
diff --git a/generator/descriptions_section_builder.hpp b/generator/descriptions_section_builder.hpp
index 458d5ad1c5..68e3c0e7b7 100644
--- a/generator/descriptions_section_builder.hpp
+++ b/generator/descriptions_section_builder.hpp
@@ -12,6 +12,7 @@
 #include "coding/string_utf8_multilang.hpp"

 #include "base/assert.hpp"
+#include "base/geo_object_id.hpp"
 #include "base/logging.hpp"

 #include "defines.hpp"

@@ -19,6 +20,8 @@
 #include
 #include
 #include
+#include <functional>
+#include <map>
 #include
 #include

@@ -31,6 +34,21 @@ class TestDescriptionSectionBuilder;

 namespace generator
 {
+class WikidataHelper
+{
+public:
+  WikidataHelper() = default;
+  explicit WikidataHelper(std::string const & mwmPath, std::string const & id2wikidataPath);
+
+  boost::optional<std::string> GetWikidataId(uint32_t featureId) const;
+
+private:
+  std::string m_mwmPath;
+  std::string m_id2wikidataPath;
+  std::map<uint32_t, base::GeoObjectId> m_featureIdToOsmId;
+  std::map<base::GeoObjectId, std::string> m_osmIdToWikidataId;
+};
+
 template <class T>
 struct ForEachFromDatAdapt
 {
@@ -61,13 +79,19 @@
   }
   void AddSize(size_t size) { m_size += size; }
   void IncPage() { ++m_pages; }
+  void IncNumberWikipediaUrls() { ++m_numberWikipediaUrls; }
+  void IncNumberWikidataIds() { ++m_numberWikidataIds; }
   size_t GetSize() const { return m_size; }
   size_t GetPages() const { return m_pages; }
+  size_t GetNumberWikipediaUrls() const { return m_numberWikipediaUrls; }
+  size_t GetNumberWikidataIds() const { return m_numberWikidataIds; }
   LangStatistics const & GetLangStatistics() const { return m_langsStat; }

 private:
   size_t m_size = 0;
   size_t m_pages = 0;
+  size_t m_numberWikipediaUrls = 0;
+  size_t m_numberWikidataIds = 0;
   LangStatistics m_langsStat = {};
 };

 class DescriptionsCollectionBuilder
 {
 public:
   friend class generator_tests::TestDescriptionSectionBuilder;

+  DescriptionsCollectionBuilder(std::string const & wikipediaDir, std::string const & mwmFile,
+                                std::string const & id2wikidataPath);
   DescriptionsCollectionBuilder(std::string const & wikipediaDir, std::string const & mwmFile);

   template <template <class> class ForEachFromDatAdapter>
@@ -87,13 +113,35 @@
       if (!wikiChecker.NeedFeature(f))
         return;

+      std::function<void()> incSource = []() {};
       descriptions::FeatureDescription description;
+      std::string path;
+      // First, try to get the feature's wikipedia url.
       auto const wikiUrl = f.GetMetadata().GetWikiURL();
-      auto const ret = GetFeatureDescription(wikiUrl, featureId, description);
-      CHECK_GREATER_OR_EQUAL(ret, 0, ());
+      if (!wikiUrl.empty())
+      {
+        path = MakePathForWikipedia(m_wikipediaDir, wikiUrl);
+        incSource = std::bind(&DescriptionsCollectionBuilderStat::IncNumberWikipediaUrls, std::ref(m_stat));
+      }
+      else
+      {
+        // Otherwise, fall back to the feature's wikidata id.
+        auto const wikidataId = m_wikidataHelper.GetWikidataId(featureId);
+        if (wikidataId)
+        {
+          path = MakePathForWikidata(m_wikipediaDir, *wikidataId);
+          incSource = std::bind(&DescriptionsCollectionBuilderStat::IncNumberWikidataIds, std::ref(m_stat));
+        }
+      }
+
+      if (path.empty())
+        return;
+
+      auto const ret = GetFeatureDescription(path, featureId, description);
       if (ret == 0)
         return;

+      incSource();
       m_stat.AddSize(ret);
       m_stat.IncPage();
       descriptionList.emplace_back(std::move(description));
   }

   DescriptionsCollectionBuilderStat const & GetStat() const { return m_stat; }
-  static std::string MakePath(std::string const & wikipediaDir, std::string wikipediaUrl);
+  static std::string MakePathForWikipedia(std::string const & wikipediaDir, std::string wikipediaUrl);
+  static std::string MakePathForWikidata(std::string const & wikipediaDir, std::string wikidataId);

 private:
   static size_t FillStringFromFile(std::string const & fullPath, int8_t code,
@@ -114,6 +163,7 @@ private:
                    descriptions::FeatureDescription & description);

   DescriptionsCollectionBuilderStat m_stat;
+  WikidataHelper m_wikidataHelper;
   std::string m_wikipediaDir;
   std::string m_mwmFile;
 };

 template <template <class> class ForEachFromDatAdapter = ForEachFromDatAdapt>
 struct DescriptionsSectionBuilder
 {
+  static void Build(std::string const & wikipediaDir, std::string const & mwmFile,
+                    std::string const & id2wikidataPath)
+  {
+    DescriptionsCollectionBuilder descriptionsCollectionBuilder(wikipediaDir, mwmFile, id2wikidataPath);
+    Build(mwmFile, descriptionsCollectionBuilder);
+  }
+
   static void Build(std::string const & wikipediaDir, std::string const & mwmFile)
   {
     DescriptionsCollectionBuilder descriptionsCollectionBuilder(wikipediaDir, mwmFile);
-    auto descriptionList = descriptionsCollectionBuilder.MakeDescriptions<ForEachFromDatAdapter>();
+    Build(mwmFile, descriptionsCollectionBuilder);
+  }

-    auto const & stat = descriptionsCollectionBuilder.GetStat();
+private:
+  static void Build(std::string const & mwmFile, DescriptionsCollectionBuilder & builder)
+  {
+    auto descriptionList = builder.MakeDescriptions<ForEachFromDatAdapter>();
+    auto const & stat = builder.GetStat();
     auto const size = stat.GetSize();
-    LOG(LINFO, ("Found", stat.GetPages(), "pages for", mwmFile));
+    LOG(LINFO, ("Added", stat.GetNumberWikipediaUrls(), "pages from wikipedia urls for", mwmFile));
+    LOG(LINFO, ("Added", stat.GetNumberWikidataIds(), "pages from wikidata ids for", mwmFile));
+    LOG(LINFO, ("Added", stat.GetPages(), "pages for", mwmFile));
     LOG(LINFO, ("Total size of added pages (before writing to section):", size));
     CHECK_GREATER_OR_EQUAL(size, 0, ());
     if (size == 0)
@@ -147,5 +211,8 @@
   }
 };

+void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile,
+                              std::string const & id2wikidataPath);
+
 void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile);
 }  // namespace generator

diff --git a/generator/generate_info.hpp b/generator/generate_info.hpp
index 3fb11fc403..2ac4448cfb 100644
--- a/generator/generate_info.hpp
+++ b/generator/generate_info.hpp
@@ -53,6 +53,8 @@ struct GenerateInfo

   std::string m_popularPlacesFilename;

+  std::string m_id2wikidataFilename;
+
   std::shared_ptr m_boundariesTable;

   uint32_t m_versionDate = 0;
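With the corrected wording, the per-mwm statistics printed by Build come out roughly like this (the counts and mwm name are illustrative, and the exact LOG prefix depends on the logging configuration):

    LINFO: Added 1200 pages from wikipedia urls for Finland_Helsinki.mwm
    LINFO: Added 340 pages from wikidata ids for Finland_Helsinki.mwm
    LINFO: Added 1540 pages for Finland_Helsinki.mwm
    LINFO: Total size of added pages (before writing to section): 48123456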
diff --git a/generator/generator_tests/descriptions_section_builder_tests.cpp b/generator/generator_tests/descriptions_section_builder_tests.cpp
index dd813513a6..c25770563a 100644
--- a/generator/generator_tests/descriptions_section_builder_tests.cpp
+++ b/generator/generator_tests/descriptions_section_builder_tests.cpp
@@ -77,7 +77,7 @@ public:
 {
   for (auto const & m : kWikiData)
   {
-    auto const dir = DescriptionsCollectionBuilder::MakePath(m_wikiDir, m.m_url);
+    auto const dir = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, m.m_url);
     CHECK(Platform::MkDirRecursively(dir), ());
     for (auto const & d : m.m_pages)
     {
@@ -112,13 +112,13 @@ public:
   {
     std::string const wikiDir = "/wikiDir/";
     std::string const wikiUrl = "http://en.wikipedia.org/wiki/Helsinki_Olympic_Stadium/";
-    auto const answer = DescriptionsCollectionBuilder::MakePath(wikiDir, wikiUrl);
+    auto const answer = DescriptionsCollectionBuilder::MakePathForWikipedia(wikiDir, wikiUrl);
     TEST_EQUAL(trueAnswer, answer, ());
   }
   {
     std::string const wikiDir = "/wikiDir";
     std::string const wikiUrl = "https://en.wikipedia.org/wiki/Helsinki_Olympic_Stadium";
-    auto const answer = DescriptionsCollectionBuilder::MakePath(wikiDir, wikiUrl);
+    auto const answer = DescriptionsCollectionBuilder::MakePathForWikipedia(wikiDir, wikiUrl);
     TEST_EQUAL(trueAnswer, answer, ());
   }
 }
@@ -130,7 +130,8 @@ public:
   CHECK(!kWikiData.empty(), ());
   auto const & first = kWikiData.front();
   StringUtf8Multilang str;
-  auto const size = b.FindPageAndFill(first.m_url, str);
+  auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, first.m_url);
+  auto const size = b.FindPageAndFill(path, str);
   TEST(size, ());
   TEST_EQUAL(*size, GetPageSize(first.m_pages), ());
   TEST(CheckLangs(str, first.m_pages), ());
@@ -139,7 +140,8 @@ public:
   DescriptionsCollectionBuilder b(m_wikiDir, kMwmFile);
   StringUtf8Multilang str;
   std::string const badUrl = "https://en.wikipedia.org/wiki/Not_exists";
-  auto const size = b.FindPageAndFill(badUrl, str);
+  auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, badUrl);
+  auto const size = b.FindPageAndFill(path, str);
   TEST(!size, ());
 }
@@ -151,7 +153,7 @@ public:
   auto const & first = kWikiData.front();
   std::string const lang = "en";
   auto const langIndex = StringUtf8Multilang::GetLangIndex(lang);
-  auto const path = DescriptionsCollectionBuilder::MakePath(m_wikiDir, first.m_url);
+  auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, first.m_url);
   auto const fullPath = base::JoinPath(path, (lang + ".html"));
   StringUtf8Multilang str;
   // FillStringFromFile is a private helper and must be given the full page path, fullPath.
@@ -172,7 +174,8 @@ public:
   auto ft = MakeFeature(first.m_url);
   descriptions::FeatureDescription description;
   auto const wikiUrl = ft.GetMetadata().GetWikiURL();
-  auto const size = b.GetFeatureDescription(wikiUrl, featureId, description);
+  auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, wikiUrl);
+  auto const size = b.GetFeatureDescription(path, featureId, description);
   TEST_EQUAL(size, GetPageSize(first.m_pages), ());
   CHECK_NOT_EQUAL(size, 0, ());

diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp
index 082b21c8fd..0dc6c3b01a 100644
--- a/generator/generator_tool/generator_tool.cpp
+++ b/generator/generator_tool/generator_tool.cpp
@@ -153,6 +153,7 @@
 DEFINE_string(opentable_data, "", "Path to opentable data in .tsv format.");
 DEFINE_string(ugc_data, "", "Input UGC source database file name.");
 DEFINE_string(wikipedia_pages, "", "Input dir with wikipedia pages.");
+DEFINE_string(id2wikidata, "", "Path to the file with the osm id to wikidata id mapping.");
 DEFINE_string(dump_wikipedia_urls, "", "Output file with wikipedia urls.");

 DEFINE_bool(generate_popular_places, false, "Generate popular places section.");
@@ -306,6 +307,7 @@
     genInfo.m_emitCoasts = FLAGS_emit_coasts;
     genInfo.m_fileName = FLAGS_output;
     genInfo.m_genAddresses = FLAGS_generate_addresses_file;
+    genInfo.m_id2wikidataFilename = FLAGS_id2wikidata;

     auto emitter = CreateEmitter(EmitterType::Planet, genInfo);
     if (!GenerateFeatures(genInfo, emitter))
@@ -433,8 +435,15 @@
   {
     auto const tmpPath = base::JoinPath(genInfo.m_intermediateDir, "tmp");
     auto const datFiles = platform_helpers::GetFullDataTmpFilePaths(tmpPath);
+
     WikiUrlDumper wikiUrlDumper(FLAGS_dump_wikipedia_urls, datFiles);
     wikiUrlDumper.Dump(threadsCount);
+
+    if (!FLAGS_id2wikidata.empty())
+    {
+      WikiDataFilter wikiDataFilter(FLAGS_id2wikidata, datFiles);
+      wikiDataFilter.Filter(threadsCount);
+    }
   }

   // Enumerate over all dat files that were created.
@@ -601,7 +610,12 @@
     }

     if (!FLAGS_wikipedia_pages.empty())
-      BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile);
+    {
+      if (!FLAGS_id2wikidata.empty())
+        BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile, FLAGS_id2wikidata);
+      else
+        BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile);
+    }

     if (FLAGS_generate_popular_places)
     {
@@ -674,7 +688,7 @@

 int main(int argc, char ** argv)
-{ 
+{
   try
   {
     return GeneratorToolMain(argc, argv);

diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp
index 1af4ba8134..9957b7e431 100644
--- a/generator/osm_element.cpp
+++ b/generator/osm_element.cpp
@@ -153,3 +153,17 @@ std::string DebugPrint(OsmElement::Tag const & tag)
   ss << tag.key << '=' << tag.value;
   return ss.str();
 }
+
+base::GeoObjectId GetGeoObjectId(OsmElement const & element)
+{
+  switch (element.type)
+  {
+  case OsmElement::EntityType::Node:
+    return base::MakeOsmNode(element.id);
+  case OsmElement::EntityType::Way:
+    return base::MakeOsmWay(element.id);
+  case OsmElement::EntityType::Relation:
+    return base::MakeOsmRelation(element.id);
+  }
+  UNREACHABLE();
+}

diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp
index d6f05466f9..dca542b96c 100644
--- a/generator/osm_element.hpp
+++ b/generator/osm_element.hpp
@@ -1,6 +1,7 @@
 #pragma once

 #include "base/assert.hpp"
+#include "base/geo_object_id.hpp"
 #include "base/math.hpp"
 #include "base/string_utils.hpp"

@@ -162,6 +163,8 @@ struct OsmElement
   std::string GetTag(std::string const & key) const;
 };

+base::GeoObjectId GetGeoObjectId(OsmElement const & element);
+
 std::string DebugPrint(OsmElement const & e);
 std::string DebugPrint(OsmElement::EntityType e);
 std::string DebugPrint(OsmElement::Tag const & tag);
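GetGeoObjectId exists because node, way, and relation ids live in overlapping numeric ranges in OSM; the encoded GeoObjectId folds the entity type into the key, so the collected tag file cannot confuse a node with a way. A small sketch of the intended behaviour (the ids are illustrative):

    #include "generator/osm_element.hpp"

    void GeoObjectIdExample()
    {
      OsmElement node, way;
      node.type = OsmElement::EntityType::Node;
      node.id = 42;
      way.type = OsmElement::EntityType::Way;
      way.id = 42;

      // The raw ids are equal, but the encoded ids differ by entity type.
      CHECK(GetGeoObjectId(node).GetEncodedId() != GetGeoObjectId(way).GetEncodedId(), ());
      CHECK_EQUAL(GetGeoObjectId(way).GetEncodedId(), base::MakeOsmWay(42).GetEncodedId(), ());
    }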
diff --git a/generator/translator_planet.cpp b/generator/translator_planet.cpp
index 0da628f1a0..cbbe37b975 100644
--- a/generator/translator_planet.cpp
+++ b/generator/translator_planet.cpp
@@ -15,12 +15,37 @@
 #include "geometry/point2d.hpp"

 #include "base/assert.hpp"
+#include "base/string_utils.hpp"

+#include <cctype>
 #include
 #include

 namespace generator
 {
+namespace
+{
+// https://www.wikidata.org/wiki/Wikidata:Identifiers
+bool WikiDataValidator(std::string const & tagValue)
+{
+  if (tagValue.size() < 2)
+    return false;
+
+  size_t pos = 0;
+  // Only items (ids of the form Q<number>) are needed.
+  if (tagValue[pos++] != 'Q')
+    return false;
+
+  while (pos != tagValue.size())
+  {
+    if (!std::isdigit(tagValue[pos++]))
+      return false;
+  }
+
+  return true;
+}
+}  // namespace
+
 TranslatorPlanet::TranslatorPlanet(std::shared_ptr emitter,
                                    cache::IntermediateDataReader & holder,
                                    feature::GenerateInfo const & info)
   , m_nodeRelations(m_routingTagsProcessor)
   , m_wayRelations(m_routingTagsProcessor)
   , m_metalinesBuilder(info.GetIntermediateFileName(METALINES_FILENAME))
+  , m_wikiDataCollector(info.m_id2wikidataFilename, "wikidata", WikiDataValidator, true /* ignoreIfNotOpen */)
 {
   auto const addrFilePath = info.GetAddressesFileName();
   if (!addrFilePath.empty())
@@ -182,7 +208,7 @@
     m_routingTagsProcessor.m_cameraNodeWriter.Process(*p, params, m_cache);
     m_routingTagsProcessor.m_roadAccessWriter.Process(*p);
-
+    m_wikiDataCollector.Collect(GetGeoObjectId(*p), *p);
     return true;
   }

diff --git a/generator/translator_planet.hpp b/generator/translator_planet.hpp
index 8cee841ff0..f370112d07 100644
--- a/generator/translator_planet.hpp
+++ b/generator/translator_planet.hpp
@@ -1,6 +1,7 @@
 #pragma once

 #include "generator/camera_info_collector.hpp"
+#include "generator/collector_tag.hpp"
 #include "generator/metalines_builder.hpp"
 #include "generator/relation_tags.hpp"
 #include "generator/routing_helpers.hpp"

@@ -58,5 +59,6 @@
   RelationTagsNode m_nodeRelations;
   RelationTagsWay m_wayRelations;
   feature::MetalinesBuilder m_metalinesBuilder;
+  CollectorTag m_wikiDataCollector;
 };
 }  // namespace generator

diff --git a/generator/wiki_url_dumper.cpp b/generator/wiki_url_dumper.cpp
index 781d20f2ff..42991c1c5a 100644
--- a/generator/wiki_url_dumper.cpp
+++ b/generator/wiki_url_dumper.cpp
@@ -65,4 +65,67 @@ void WikiUrlDumper::DumpOne(std::string const & path, std::ostream & stream)
     stream << path << "\t" << feature.GetMostGenericOsmId() << "\t" << wikiUrl << "\n";
   });
 }
+
+WikiDataFilter::WikiDataFilter(std::string const & path, std::vector<std::string> const & datFiles)
+  : m_path(path), m_dataFiles(datFiles)
+{
+  std::ifstream stream;
+  stream.exceptions(std::fstream::failbit | std::fstream::badbit);
+  stream.open(m_path);
+  stream.exceptions(std::fstream::badbit);
+  uint64_t id;
+  std::string wikidata;
+  while (stream)
+  {
+    stream >> id >> wikidata;
+    m_id2wikiData.emplace(base::GeoObjectId(id), wikidata);
+  }
+}
+
+// static
+void WikiDataFilter::FilterOne(std::string const & path,
+                               std::map<base::GeoObjectId, std::string> const & id2wikiData,
+                               std::ostream & stream)
+{
+  auto const & needWikiUrl = ftypes::WikiChecker::Instance();
+  feature::ForEachFromDatRawFormat(path, [&](FeatureBuilder1 const & feature, uint64_t /* pos */) {
+    if (!needWikiUrl(feature.GetTypesHolder()))
+      return;
+
+    auto const it = id2wikiData.find(feature.GetMostGenericOsmId());
+    if (it == std::end(id2wikiData))
+      return;
+
+    stream << it->first.GetEncodedId() << "\t" << it->second << "\n";
+  });
+}
+
+void WikiDataFilter::Filter(size_t cpuCount)
+{
+  CHECK_GREATER(cpuCount, 0, ());
+
+  base::thread_pool::computational::ThreadPool threadPool(cpuCount);
+  std::vector<std::future<std::string>> futures;
+  futures.reserve(m_dataFiles.size());
+
+  auto const fn = [&](std::string const & filename) {
+    std::stringstream stringStream;
+    FilterOne(filename, m_id2wikiData, stringStream);
+    return stringStream.str();
+  };
+
+  for (auto const & path : m_dataFiles)
+  {
+    auto result = threadPool.Submit(fn, path);
+    futures.emplace_back(std::move(result));
+  }
+
+  std::ofstream stream;
+  stream.exceptions(std::fstream::failbit | std::fstream::badbit);
+  stream.open(m_path);
+  for (auto & f : futures)
+  {
+    auto lines = f.get();
+    stream << lines;
+  }
+}
 }  // namespace generator

diff --git a/generator/wiki_url_dumper.hpp b/generator/wiki_url_dumper.hpp
index df924edf3d..4f64aaceff 100644
--- a/generator/wiki_url_dumper.hpp
+++ b/generator/wiki_url_dumper.hpp
@@ -1,6 +1,9 @@
 #pragma once

+#include "base/geo_object_id.hpp"
+
 #include
+#include <map>
 #include
 #include

@@ -9,7 +12,7 @@ namespace generator
 class WikiUrlDumper
 {
 public:
-  WikiUrlDumper(std::string const & path, std::vector<std::string> const & datFiles);
+  explicit WikiUrlDumper(std::string const & path, std::vector<std::string> const & datFiles);

   static void DumpOne(std::string const & path, std::ostream & stream);

@@ -19,4 +22,20 @@
 private:
   std::string m_path;
   std::vector<std::string> m_dataFiles;
 };
+
+class WikiDataFilter
+{
+public:
+  explicit WikiDataFilter(std::string const & path, std::vector<std::string> const & datFiles);
+
+  static void FilterOne(std::string const & path,
+                        std::map<base::GeoObjectId, std::string> const & id2wikiData,
+                        std::ostream & stream);
+
+  void Filter(size_t cpuCount);
+
+private:
+  std::string m_path;
+  std::map<base::GeoObjectId, std::string> m_id2wikiData;
+  std::vector<std::string> m_dataFiles;
+};
 }  // namespace generator
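For orientation, the dump-and-filter pair added above is driven from generator_tool during the intermediate stage; a condensed sketch under assumed paths (only the Dump and Filter calls are the real entry points):

    #include "generator/wiki_url_dumper.hpp"

    #include <string>
    #include <vector>

    void DumpAndFilterExample()
    {
      std::vector<std::string> const datFiles = {"/intdir/tmp/Sample.mwm.tmp"};  // illustrative

      // Writes "<dat path>\t<osm id>\t<wiki url>" lines for wiki-eligible features.
      generator::WikiUrlDumper dumper("/intdir/wiki_urls.txt", datFiles);
      dumper.Dump(4 /* threadsCount */);

      // Rewrites the collected id2wikidata file in place, keeping only entries whose
      // osm ids belong to wiki-eligible features found in the dat files.
      generator::WikiDataFilter filter("/intdir/id2wikidata.csv", datFiles);
      filter.Filter(4 /* cpuCount */);
    }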
diff --git a/tools/python/descriptions_downloader.py b/tools/python/descriptions_downloader.py
index 071cd41596..d7391c731c 100644
--- a/tools/python/descriptions_downloader.py
+++ b/tools/python/descriptions_downloader.py
@@ -6,6 +6,8 @@ import logging
 import os
 import random
 import time
+import types
+import urllib.error
 import urllib.parse
 from multiprocessing.pool import ThreadPool

@@ -13,6 +15,7 @@ import htmlmin
 import requests
 import wikipediaapi
 from bs4 import BeautifulSoup
+from wikidata.client import Client

 """
 This script downloads Wikipedia pages for different languages.
 """
 log = logging.getLogger(__name__)

 WORKERS = 80
-CHUNK_SIZE = 128
+CHUNK_SIZE = 16
 REQUEST_ATTEMPTS = 32
 ATTEMPTS_PAUSE_MS = 4000

@@ -48,16 +51,21 @@ class GettingError(MyException):
     pass


-def try_get(obj, prop):
+def try_get(obj, prop, *args, **kwargs):
     attempts = REQUEST_ATTEMPTS
     while attempts != 0:
         try:
-            return getattr(obj, prop)
+            attr = getattr(obj, prop)
+            is_method = isinstance(attr, types.MethodType)
+            return attr(*args, **kwargs) if is_method else attr
         except (requests.exceptions.ConnectionError,
                 requests.exceptions.ReadTimeout,
                 json.decoder.JSONDecodeError):
             time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
             attempts -= 1
+        except urllib.error.HTTPError as e:
+            if e.code == 404:
+                raise GettingError(f"Page not found {e.msg}")
+            time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
+            attempts -= 1
         except KeyError:
             raise GettingError(f"Getting {prop} field failed. "
                                f"{prop} not found.")
@@ -80,7 +88,7 @@ def read_popularity(path):
     return ids


-def should_download_wikipage(popularity_set):
+def should_download_page(popularity_set):
     @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
@@ -184,7 +192,7 @@ def get_wiki_langs(url):
     return curr_lang


-def download_all(path, url, langs):
+def download_all_from_wikipedia(path, url, langs):
     try:
         available_langs = get_wiki_langs(url)
     except ParseError:
@@ -195,8 +203,8 @@
         download(path, lang[1])


-def worker(output_dir, checker, langs):
-    @functools.wraps(worker)
+def wikipedia_worker(output_dir, checker, langs):
+    @functools.wraps(wikipedia_worker)
     def wrapped(line):
         if not line.strip():
             return
@@ -211,20 +219,94 @@
             return
         parsed = urllib.parse.urlparse(url)
         path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
-        download_all(path, url, langs)
+        download_all_from_wikipedia(path, url, langs)
     return wrapped


+def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
+    with open(input_file) as file:
+        _ = file.readline()
+        pool = ThreadPool(processes=WORKERS)
+        pool.map(wikipedia_worker(output_dir, checker, langs), file, CHUNK_SIZE)
+        pool.close()
+        pool.join()
+
+
+def get_wikidata_urls(entity, langs):
+    try:
+        keys = entity.data["sitelinks"].keys()
+    except (KeyError, AttributeError):
+        log.exception(f"Sitelinks not found for {entity.id}.")
+        return None
+    return [
+        entity.data["sitelinks"][k]["url"] for k in keys
+        if any([k.startswith(lang) for lang in langs])
+    ]
+
+
+def wikidata_worker(output_dir, checker, langs):
+    @functools.wraps(wikidata_worker)
+    def wrapped(line):
+        if not line.strip():
+            return
+        try:
+            ident, wikidata_id = line.split("\t")
+            ident = int(ident)
+            wikidata_id = wikidata_id.strip()
+            if not checker(ident):
+                return
+        except (AttributeError, IndexError, ValueError):
+            log.exception(f"{line} is incorrect.")
+            return
+        client = Client()
+        try:
+            entity = try_get(client, "get", wikidata_id, load=True)
+        except GettingError:
+            log.exception(f"Error: page is not downloaded {wikidata_id}.")
+            return
+        urls = get_wikidata_urls(entity, langs)
+        if not urls:
+            return
+        path = os.path.join(output_dir, wikidata_id)
+        for url in urls:
+            download(path, url)
+    return wrapped
+
+
+def download_from_wikidata_tags(input_file, output_dir, langs, checker):
+    wikidata_output_dir = os.path.join(output_dir, "wikidata")
+    os.makedirs(wikidata_output_dir, exist_ok=True)
+    with open(input_file) as file:
+        pool = ThreadPool(processes=WORKERS)
+        pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE)
+        pool.close()
+        pool.join()
+
+
+def check_and_get_checker(popularity_file):
+    popularity_set = None
+    if popularity_file is None:
+        log.warning("Popularity file not set.")
+    elif os.path.exists(popularity_file):
+        popularity_set = read_popularity(popularity_file)
+        log.info(f"Popularity set size: {len(popularity_set)}.")
+    else:
+        log.error(f"Popularity file ({popularity_file}) not found.")
+    return should_download_page(popularity_set)
+
+
 def parse_args():
     parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument("--o", metavar="PATH", type=str,
+    parser.add_argument("--output_dir", metavar="PATH", type=str,
                         help="Output dir for saving pages")
-    parser.add_argument("--p", metavar="PATH", type=str,
+    parser.add_argument("--popularity", metavar="PATH", type=str,
                         help="File with popular object ids for which we "
"download wikipedia data. If not given, download " "for all objects.") - parser.add_argument('--i', metavar="PATH", type=str, required=True, + parser.add_argument('--wikipedia', metavar="PATH", type=str, required=True, help="Input file with wikipedia url.") + parser.add_argument('--wikidata', metavar="PATH", type=str, + help="Input file with wikidata ids.") parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+', action='append', help="Languages ​​for pages. If left blank, pages in all " @@ -236,22 +318,20 @@ def main(): log.setLevel(logging.WARNING) wikipediaapi.log.setLevel(logging.WARNING) args = parse_args() - input_file = args.i - output_dir = args.o - popularity_file = args.p + wikipedia_file = args.wikipedia + wikidata_file = args.wikidata + output_dir = args.output_dir + popularity_file = args.popularity langs = list(itertools.chain.from_iterable(args.langs)) os.makedirs(output_dir, exist_ok=True) - popularity_set = read_popularity(popularity_file) if popularity_file else None - if popularity_set: - log.info(f"Popularity set size: {len(popularity_set)}.") - checker = should_download_wikipage(popularity_set) - with open(input_file) as file: - _ = file.readline() - pool = ThreadPool(processes=WORKERS) - pool.map(worker(output_dir, checker, langs), file, CHUNK_SIZE) - pool.close() - pool.join() - + checker = check_and_get_checker(popularity_file) + download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker) + if wikidata_file is None: + log.warning(f"Wikidata file not set.") + elif os.path.exists(wikidata_file): + download_from_wikidata_tags(wikidata_file, output_dir, langs, checker) + else: + log.warning(f"Wikidata ({wikidata_file}) file not set.") if __name__ == "__main__": main() diff --git a/tools/unix/generate_planet.sh b/tools/unix/generate_planet.sh index b1b6598443..b0116c1b7b 100755 --- a/tools/unix/generate_planet.sh +++ b/tools/unix/generate_planet.sh @@ -185,6 +185,7 @@ DESCRIPTIONS_DOWNLOADER="$PYTHON_SCRIPTS_PATH/descriptions_downloader.py" LOCALADS_SCRIPT="$PYTHON_SCRIPTS_PATH/local_ads/mwm_to_csv_4localads.py" UGC_FILE="${UGC_FILE:-$INTDIR/ugc_db.sqlite3}" POPULAR_PLACES_FILE="${POPULAR_PLACES_FILE:-$INTDIR/popular_places.csv}" +WIKIDATA_FILE="${WIKIDATA_FILE:-$INTDIR/id2wikidata.csv}" BOOKING_SCRIPT="$PYTHON_SCRIPTS_PATH/booking_hotels.py" BOOKING_FILE="${BOOKING_FILE:-$INTDIR/hotels.csv}" OPENTABLE_SCRIPT="$PYTHON_SCRIPTS_PATH/opentable_restaurants.py" @@ -453,6 +454,9 @@ if [ "$MODE" == "features" ]; then [ -f "$BOOKING_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --booking_data=$BOOKING_FILE" [ -f "$OPENTABLE_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --opentable_data=$OPENTABLE_FILE" [ -f "$POPULAR_PLACES_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --popular_places_data=$POPULAR_PLACES_FILE" + [ -n "$OPT_DESCRIPTIONS" ] && PARAMS_SPLIT="$PARAMS_SPLIT --id2wikidata=$WIKIDATA_FILE" + + "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" \ --node_storage=$NODE_STORAGE \ --osm_file_type=o5m \ @@ -555,14 +559,18 @@ if [ "$MODE" == "descriptions" ]; then LOG="$LOG_PATH/descriptions.log" LANGS="en ru es" - "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG - $PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs $LANGS 2>> $LOG + "$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" \ + --dump_wikipedia_urls="$URLS_PATH" --id2wikidata="$WIKIDATA_FILE" 2>> $LOG + + PARAMS="--wikipedia $URLS_PATH --wikidata 
+  [ -f "$POPULAR_PLACES_FILE" ] && PARAMS="$PARAMS --popularity=$POPULAR_PLACES_FILE"
+  $PYTHON36 $DESCRIPTIONS_DOWNLOADER $PARAMS --langs $LANGS 2>> $LOG

   for file in "$TARGET"/*.mwm; do
     if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then
       BASENAME="$(basename "$file" .mwm)"
-      "$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --data_path="$TARGET" --user_resource_path="$DATA_PATH/" \
-        --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
+      "$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --id2wikidata="$WIKIDATA_FILE" \
+        --data_path="$TARGET" --user_resource_path="$DATA_PATH/" --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
       forky
     fi
   done