[generator] Improved descriptions using wikidata.

This commit is contained in:
Maksim Andrianov 2019-01-29 11:25:49 +03:00 committed by mpimenov
parent 2d9e4a90ee
commit 58021db34c
17 changed files with 489 additions and 54 deletions

View file

@ -35,6 +35,8 @@ set(SRC
coastlines_generator.cpp
coastlines_generator.hpp
collector_interface.hpp
collector_tag.cpp
collector_tag.hpp
descriptions_section_builder.cpp
descriptions_section_builder.hpp
dumper.cpp

View file

@ -1,7 +1,5 @@
#pragma once
#include "base/geo_object_id.hpp"
#include <string>
struct OsmElement;

View file

@ -0,0 +1,42 @@
#include "generator/collector_tag.hpp"
#include "generator/osm_element.hpp"
#include "base/geo_object_id.hpp"
#include "base/logging.hpp"
namespace generator
{
// Opens |filename| for writing the collected tag values.
// When the file cannot be opened: with |ignoreIfNotOpen| the collector is
// silently disabled (Collect becomes a no-op), otherwise the stream's
// std::ios::failure is propagated to the caller.
CollectorTag::CollectorTag(std::string const & filename, std::string const & tagKey,
                           Validator const & validator, bool ignoreIfNotOpen)
  : m_tagKey(tagKey), m_validator(validator), m_needCollect(true)
{
  m_stream.exceptions(std::fstream::failbit | std::fstream::badbit);
  try
  {
    m_stream.open(filename);
  }
  catch (std::ios::failure const & e)
  {
    if (ignoreIfNotOpen)
    {
      m_needCollect = false;
      // Was "Сould" with a Cyrillic 'С' — fixed to plain ASCII.
      LOG(LINFO, ("Could not open file", filename, ". This was ignored."));
    }
    else
    {
      // Rethrow the original exception object; |throw e;| would throw a copy.
      throw;
    }
  }
}
// Writes "osmId<tab>tagValue" for |el| if it carries a valid |m_tagKey| tag.
// Does nothing when the output file failed to open (see the constructor).
void CollectorTag::Collect(base::GeoObjectId const & osmId, OsmElement const & el)
{
  if (!m_needCollect)
    return;

  auto const value = el.GetTag(m_tagKey);
  if (value.empty() || !m_validator(value))
    return;

  m_stream << osmId.GetEncodedId() << "\t" << value << "\n";
}
} // namespace generator

View file

@ -0,0 +1,37 @@
#pragma once
#include "generator/collector_interface.hpp"
#include <fstream>
#include <functional>
#include <string>
struct OsmElement;
namespace base
{
class GeoObjectId;
} // namespace base
namespace generator
{
// CollectorTag collects the validated value of one OSM tag and saves it to a
// file, one line per object, in the format: osmId<tab>tagValue.
class CollectorTag : public CollectorInterface
{
public:
  // Predicate deciding whether a tag value should be written out.
  using Validator = std::function<bool(std::string const & tagValue)>;

  // |ignoreIfNotOpen| == true turns the collector into a no-op when |filename|
  // cannot be opened instead of throwing.
  explicit CollectorTag(std::string const & filename, std::string const & tagKey,
                        Validator const & validator, bool ignoreIfNotOpen = false);

  // CollectorInterface overrides:
  void Collect(base::GeoObjectId const & osmId, OsmElement const & el) override;
  // Nothing to flush explicitly: data is written as it is collected.
  void Save() override {}

private:
  std::ofstream m_stream;
  std::string m_tagKey;
  Validator m_validator;
  // False when the output could not be opened and errors were ignored.
  bool m_needCollect;
};
} // namespace generator

View file

@ -1,9 +1,13 @@
#include "generator/descriptions_section_builder.hpp"
#include "generator/routing_helpers.hpp"
#include "platform/platform.hpp"
#include "base/string_utils.hpp"
#include "defines.hpp"
#include <fstream>
#include <iterator>
#include <limits>
@ -26,6 +30,39 @@ std::string GetFileName(std::string path)
namespace generator
{
// Loads two mappings needed to resolve featureId -> wikidata id:
// 1) featureId -> osmId from the mwm's .osm2ft companion file;
// 2) osmId -> wikidata id from the tab-separated |id2wikidataPath| file.
// Failure to open/parse either input is fatal (LCRITICAL / stream throw).
WikidataHelper::WikidataHelper(std::string const & mwmPath, std::string const & id2wikidataPath)
  : m_mwmPath(mwmPath)
  , m_id2wikidataPath(id2wikidataPath)
{
  std::string const osmIdsToFeatureIdsPath = m_mwmPath + OSM2FEATURE_FILE_EXTENSION;
  if (!routing::ParseFeatureIdToOsmIdMapping(osmIdsToFeatureIdsPath, m_featureIdToOsmId))
    LOG(LCRITICAL, ("Error parse OsmIdToFeatureId mapping."));

  std::ifstream stream;
  stream.exceptions(std::fstream::failbit | std::fstream::badbit);
  stream.open(m_id2wikidataPath);
  stream.exceptions(std::fstream::badbit);
  uint64_t id = 0;
  std::string wikidataId;
  // Test the stream state right after extraction. The previous
  // |while (stream) { stream >> ...; emplace; }| form executed the body one
  // extra time at EOF with stale values and, on an empty file, read |id|
  // uninitialized.
  while (stream >> id >> wikidataId)
  {
    strings::Trim(wikidataId);
    // NOTE(review): despite its name, m_osmIdToFeatureId maps osmId -> wikidata id.
    m_osmIdToFeatureId.emplace(base::GeoObjectId(id), wikidataId);
  }
}
// Resolves featureId -> osmId -> wikidata id using the maps loaded in the
// constructor. Returns an empty optional when either step has no mapping.
boost::optional<std::string> WikidataHelper::GetWikidataId(uint32_t featureId) const
{
  auto const featureIt = m_featureIdToOsmId.find(featureId);
  if (featureIt == std::end(m_featureIdToOsmId))
    return {};

  auto const wikidataIt = m_osmIdToFeatureId.find(featureIt->second);
  if (wikidataIt == std::end(m_osmIdToFeatureId))
    return {};

  return wikidataIt->second;
}
std::string DescriptionsCollectionBuilderStat::LangStatisticsToString() const
{
std::stringstream stream;
@ -45,12 +82,18 @@ std::string DescriptionsCollectionBuilderStat::LangStatisticsToString() const
return stream.str();
}
DescriptionsCollectionBuilder::DescriptionsCollectionBuilder(std::string const & wikipediaDir,
std::string const & mwmFile,
std::string const & id2wikidataPath)
: m_wikidataHelper(mwmFile, id2wikidataPath), m_wikipediaDir(wikipediaDir), m_mwmFile(mwmFile) {}
DescriptionsCollectionBuilder::DescriptionsCollectionBuilder(std::string const & wikipediaDir,
std::string const & mwmFile)
: m_wikipediaDir(wikipediaDir), m_mwmFile(mwmFile) {}
// static
std::string DescriptionsCollectionBuilder::MakePath(std::string const & wikipediaDir, std::string wikipediaUrl)
std::string DescriptionsCollectionBuilder::MakePathForWikipedia(std::string const & wikipediaDir,
std::string wikipediaUrl)
{
strings::Trim(wikipediaUrl);
strings::ReplaceFirst(wikipediaUrl, "http://", "");
@ -61,6 +104,13 @@ std::string DescriptionsCollectionBuilder::MakePath(std::string const & wikipedi
return base::JoinPath(wikipediaDir, wikipediaUrl);
}
// static
// Builds the on-disk location of downloaded wikidata pages for one item:
// <wikipediaDir>/wikidata/<wikidataId>. Mirrors the layout produced by the
// descriptions_downloader.py script.
std::string DescriptionsCollectionBuilder::MakePathForWikidata(std::string const & wikipediaDir,
                                                               std::string wikidataId)
{
  return base::JoinPath(wikipediaDir, "wikidata", wikidataId);
}
// static
size_t DescriptionsCollectionBuilder::FillStringFromFile(std::string const & fullPath, int8_t code,
StringUtf8Multilang & str)
@ -76,10 +126,9 @@ size_t DescriptionsCollectionBuilder::FillStringFromFile(std::string const & ful
return contentSize;
}
boost::optional<size_t> DescriptionsCollectionBuilder::FindPageAndFill(std::string wikipediaUrl,
boost::optional<size_t> DescriptionsCollectionBuilder::FindPageAndFill(std::string path,
StringUtf8Multilang & str)
{
auto const path = MakePath(m_wikipediaDir, wikipediaUrl);
if (!IsValidDir(path))
{
LOG(LWARNING, ("Directory", path, "not found."));
@ -108,14 +157,14 @@ boost::optional<size_t> DescriptionsCollectionBuilder::FindPageAndFill(std::stri
return size;
}
size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const & wikiUrl, uint32_t featureId,
size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const & path, uint32_t featureId,
descriptions::FeatureDescription & description)
{
if (wikiUrl.empty())
if (path.empty())
return 0;
StringUtf8Multilang string;
auto const ret = FindPageAndFill(wikiUrl, string);
auto const ret = FindPageAndFill(path, string);
if (!ret || *ret == 0)
return 0;
@ -123,6 +172,12 @@ size_t DescriptionsCollectionBuilder::GetFeatureDescription(std::string const &
return *ret;
}
void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile,
std::string const & id2wikidataPath)
{
DescriptionsSectionBuilder<FeatureType>::Build(wikipediaDir, mwmFile, id2wikidataPath);
}
void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile)
{
DescriptionsSectionBuilder<FeatureType>::Build(wikipediaDir, mwmFile);

View file

@ -12,6 +12,7 @@
#include "coding/string_utf8_multilang.hpp"
#include "base/assert.hpp"
#include "base/geo_object_id.hpp"
#include "base/logging.hpp"
#include "defines.hpp"
@ -19,6 +20,8 @@
#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <map>
#include <string>
#include <utility>
@ -31,6 +34,21 @@ class TestDescriptionSectionBuilder;
namespace generator
{
// Resolves a feature id to its wikidata item id via the osm id, using the
// mwm's .osm2ft mapping and an "osmId<tab>wikidataId" file.
class WikidataHelper
{
public:
  WikidataHelper() = default;
  explicit WikidataHelper(std::string const & mwmPath, std::string const & id2wikidataPath);

  // Returns the wikidata id for |featureId| or an empty optional if the
  // feature has no osm mapping or no wikidata entry.
  boost::optional<std::string> GetWikidataId(uint32_t featureId) const;

private:
  std::string m_mwmPath;
  std::string m_id2wikidataPath;
  std::map<uint32_t, base::GeoObjectId> m_featureIdToOsmId;
  // NOTE(review): despite the name this maps osmId -> wikidata id, not to a
  // feature id — consider renaming (requires touching the .cpp as well).
  std::map<base::GeoObjectId, std::string> m_osmIdToFeatureId;
};
template <class T>
struct ForEachFromDatAdapt
{
@ -61,13 +79,19 @@ public:
}
void AddSize(size_t size) { m_size += size; }
void IncPage() { ++m_pages; }
void IncNumberWikipediaUrls() { ++m_numberWikipediaUrls; }
void IncNumberWikidataIds() { ++m_numberWikidataIds; }
size_t GetSize() const { return m_size; }
size_t GetPages() const { return m_pages; }
size_t GetNumberWikipediaUrls() const { return m_numberWikipediaUrls; }
size_t GetNumberWikidataIds() const { return m_numberWikidataIds; }
LangStatistics const & GetLangStatistics() const { return m_langsStat; }
private:
size_t m_size = 0;
size_t m_pages = 0;
size_t m_numberWikipediaUrls = 0;
size_t m_numberWikidataIds = 0;
LangStatistics m_langsStat = {};
};
@ -76,6 +100,8 @@ class DescriptionsCollectionBuilder
public:
friend class generator_tests::TestDescriptionSectionBuilder;
DescriptionsCollectionBuilder(std::string const & wikipediaDir, std::string const & mwmFile,
std::string const & id2wikidataPath);
DescriptionsCollectionBuilder(std::string const & wikipediaDir, std::string const & mwmFile);
template <typename Ft, template <typename> class ForEachFromDatAdapter>
@ -87,13 +113,35 @@ public:
if (!wikiChecker.NeedFeature(f))
return;
std::function<void()> incSource = []() {};
descriptions::FeatureDescription description;
std::string path;
// We first try to get wikipedia url.
auto const wikiUrl = f.GetMetadata().GetWikiURL();
auto const ret = GetFeatureDescription(wikiUrl, featureId, description);
CHECK_GREATER_OR_EQUAL(ret, 0, ());
if (!wikiUrl.empty())
{
path = MakePathForWikipedia(m_wikipediaDir, wikiUrl);
incSource = std::bind(&DescriptionsCollectionBuilderStat::IncNumberWikipediaUrls, std::ref(m_stat));
}
else
{
// We second try to get wikidata id.
auto const wikidataId = m_wikidataHelper.GetWikidataId(featureId);
if (wikidataId)
{
path = MakePathForWikidata(m_wikipediaDir, *wikidataId);
incSource = std::bind(&DescriptionsCollectionBuilderStat::IncNumberWikidataIds, std::ref(m_stat));
}
}
if (path.empty())
return;
auto const ret = GetFeatureDescription(path, featureId, description);
if (ret == 0)
return;
incSource();
m_stat.AddSize(ret);
m_stat.IncPage();
descriptionList.emplace_back(std::move(description));
@ -104,7 +152,8 @@ public:
}
DescriptionsCollectionBuilderStat const & GetStat() const { return m_stat; }
static std::string MakePath(std::string const & wikipediaDir, std::string wikipediaUrl);
static std::string MakePathForWikipedia(std::string const & wikipediaDir, std::string wikipediaUrl);
static std::string MakePathForWikidata(std::string const & wikipediaDir, std::string wikidataId);
private:
static size_t FillStringFromFile(std::string const & fullPath, int8_t code,
@ -114,6 +163,7 @@ private:
descriptions::FeatureDescription & description);
DescriptionsCollectionBuilderStat m_stat;
WikidataHelper m_wikidataHelper;
std::string m_wikipediaDir;
std::string m_mwmFile;
};
@ -121,14 +171,28 @@ private:
template <typename Ft, template <typename> class ForEachFromDatAdapter = ForEachFromDatAdapt>
struct DescriptionsSectionBuilder
{
static void Build(std::string const & wikipediaDir, std::string const & mwmFile,
std::string const & id2wikidataPath)
{
DescriptionsCollectionBuilder descriptionsCollectionBuilder(wikipediaDir, mwmFile, id2wikidataPath);
Build(mwmFile, descriptionsCollectionBuilder);
}
static void Build(std::string const & wikipediaDir, std::string const & mwmFile)
{
DescriptionsCollectionBuilder descriptionsCollectionBuilder(wikipediaDir, mwmFile);
auto descriptionList = descriptionsCollectionBuilder.MakeDescriptions<Ft, ForEachFromDatAdapter>();
Build(mwmFile, descriptionsCollectionBuilder);
}
auto const & stat = descriptionsCollectionBuilder.GetStat();
private:
static void Build(std::string const & mwmFile, DescriptionsCollectionBuilder & builder)
{
auto descriptionList = builder.MakeDescriptions<Ft, ForEachFromDatAdapter>();
auto const & stat = builder.GetStat();
auto const size = stat.GetSize();
LOG(LINFO, ("Found", stat.GetPages(), "pages for", mwmFile));
LOG(LINFO, ("Added", stat.GetNumberWikipediaUrls(), "pages form wikipedia urls for", mwmFile));
LOG(LINFO, ("Added", stat.GetNumberWikidataIds(), "pages form wikidata ids for", mwmFile));
LOG(LINFO, ("Added", stat.GetPages(), "pages for", mwmFile));
LOG(LINFO, ("Total size of added pages (before writing to section):", size));
CHECK_GREATER_OR_EQUAL(size, 0, ());
if (size == 0)
@ -147,5 +211,8 @@ struct DescriptionsSectionBuilder
}
};
void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile,
std::string const & id2wikidataPath);
void BuildDescriptionsSection(std::string const & wikipediaDir, std::string const & mwmFile);
} // namespace generator

View file

@ -53,6 +53,8 @@ struct GenerateInfo
std::string m_popularPlacesFilename;
std::string m_id2wikidataFilename;
std::shared_ptr<generator::OsmIdToBoundariesTable> m_boundariesTable;
uint32_t m_versionDate = 0;

View file

@ -77,7 +77,7 @@ public:
{
for (auto const & m : kWikiData)
{
auto const dir = DescriptionsCollectionBuilder::MakePath(m_wikiDir, m.m_url);
auto const dir = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, m.m_url);
CHECK(Platform::MkDirRecursively(dir), ());
for (auto const & d : m.m_pages)
{
@ -112,13 +112,13 @@ public:
{
std::string const wikiDir = "/wikiDir/";
std::string const wikiUrl = "http://en.wikipedia.org/wiki/Helsinki_Olympic_Stadium/";
auto const answer = DescriptionsCollectionBuilder::MakePath(wikiDir, wikiUrl);
auto const answer = DescriptionsCollectionBuilder::MakePathForWikipedia(wikiDir, wikiUrl);
TEST_EQUAL(trueAnswer, answer, ());
}
{
std::string const wikiDir = "/wikiDir";
std::string const wikiUrl = "https://en.wikipedia.org/wiki/Helsinki_Olympic_Stadium";
auto const answer = DescriptionsCollectionBuilder::MakePath(wikiDir, wikiUrl);
auto const answer = DescriptionsCollectionBuilder::MakePathForWikipedia(wikiDir, wikiUrl);
TEST_EQUAL(trueAnswer, answer, ());
}
}
@ -130,7 +130,8 @@ public:
CHECK(!kWikiData.empty(), ());
auto const & first = kWikiData.front();
StringUtf8Multilang str;
auto const size = b.FindPageAndFill(first.m_url, str);
auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, first.m_url);
auto const size = b.FindPageAndFill(path, str);
TEST(size, ());
TEST_EQUAL(*size, GetPageSize(first.m_pages), ());
TEST(CheckLangs(str, first.m_pages), ());
@ -139,7 +140,8 @@ public:
DescriptionsCollectionBuilder b(m_wikiDir, kMwmFile);
StringUtf8Multilang str;
std::string const badUrl = "https://en.wikipedia.org/wiki/Not_exists";
auto const size = b.FindPageAndFill(badUrl, str);
auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, badUrl);
auto const size = b.FindPageAndFill(path, str);
TEST(!size, ());
}
}
@ -151,7 +153,7 @@ public:
auto const & first = kWikiData.front();
std::string const lang = "en";
auto const langIndex = StringUtf8Multilang::GetLangIndex(lang);
auto const path = DescriptionsCollectionBuilder::MakePath(m_wikiDir, first.m_url);
auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, first.m_url);
auto const fullPath = base::JoinPath(path, (lang + ".html"));
StringUtf8Multilang str;
// This is a private function and should take the right path fullPath.
@ -172,7 +174,8 @@ public:
auto ft = MakeFeature(first.m_url);
descriptions::FeatureDescription description;
auto const wikiUrl = ft.GetMetadata().GetWikiURL();
auto const size = b.GetFeatureDescription(wikiUrl, featureId, description);
auto const path = DescriptionsCollectionBuilder::MakePathForWikipedia(m_wikiDir, wikiUrl);
auto const size = b.GetFeatureDescription(path, featureId, description);
TEST_EQUAL(size, GetPageSize(first.m_pages), ());
CHECK_NOT_EQUAL(size, 0, ());

View file

@ -153,6 +153,7 @@ DEFINE_string(opentable_data, "", "Path to opentable data in .tsv format.");
DEFINE_string(ugc_data, "", "Input UGC source database file name.");
DEFINE_string(wikipedia_pages, "", "Input dir with wikipedia pages.");
DEFINE_string(id2wikidata, "", "Path to file with id to wikidata mapping.");
DEFINE_string(dump_wikipedia_urls, "", "Output file with wikipedia urls.");
DEFINE_bool(generate_popular_places, false, "Generate popular places section.");
@ -306,6 +307,7 @@ int GeneratorToolMain(int argc, char ** argv)
genInfo.m_emitCoasts = FLAGS_emit_coasts;
genInfo.m_fileName = FLAGS_output;
genInfo.m_genAddresses = FLAGS_generate_addresses_file;
genInfo.m_id2wikidataFilename = FLAGS_id2wikidata;
auto emitter = CreateEmitter(EmitterType::Planet, genInfo);
if (!GenerateFeatures(genInfo, emitter))
@ -433,8 +435,15 @@ int GeneratorToolMain(int argc, char ** argv)
{
auto const tmpPath = base::JoinPath(genInfo.m_intermediateDir, "tmp");
auto const datFiles = platform_helpers::GetFullDataTmpFilePaths(tmpPath);
WikiUrlDumper wikiUrlDumper(FLAGS_dump_wikipedia_urls, datFiles);
wikiUrlDumper.Dump(threadsCount);
if (!FLAGS_id2wikidata.empty())
{
WikiDataFilter wikiDataFilter(FLAGS_id2wikidata, datFiles);
wikiDataFilter.Filter(threadsCount);
}
}
// Enumerate over all dat files that were created.
@ -601,7 +610,12 @@ int GeneratorToolMain(int argc, char ** argv)
}
if (!FLAGS_wikipedia_pages.empty())
BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile);
{
if (!FLAGS_id2wikidata.empty())
BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile, FLAGS_id2wikidata);
else
BuildDescriptionsSection(FLAGS_wikipedia_pages, datFile);
}
if (FLAGS_generate_popular_places)
{
@ -674,7 +688,7 @@ int GeneratorToolMain(int argc, char ** argv)
int main(int argc, char ** argv)
{
{
try
{
return GeneratorToolMain(argc, argv);

View file

@ -153,3 +153,17 @@ std::string DebugPrint(OsmElement::Tag const & tag)
ss << tag.key << '=' << tag.value;
return ss.str();
}
// Maps an OsmElement to the typed GeoObjectId corresponding to its entity
// kind (node / way / relation). Any other entity type is a logic error.
base::GeoObjectId GetGeoObjectId(OsmElement const & element)
{
  auto const type = element.type;
  if (type == OsmElement::EntityType::Node)
    return base::MakeOsmNode(element.id);
  if (type == OsmElement::EntityType::Way)
    return base::MakeOsmWay(element.id);
  if (type == OsmElement::EntityType::Relation)
    return base::MakeOsmRelation(element.id);
  UNREACHABLE();
}

View file

@ -1,6 +1,7 @@
#pragma once
#include "base/assert.hpp"
#include "base/geo_object_id.hpp"
#include "base/math.hpp"
#include "base/string_utils.hpp"
@ -162,6 +163,8 @@ struct OsmElement
std::string GetTag(std::string const & key) const;
};
base::GeoObjectId GetGeoObjectId(OsmElement const & element);
std::string DebugPrint(OsmElement const & e);
std::string DebugPrint(OsmElement::EntityType e);
std::string DebugPrint(OsmElement::Tag const & tag);

View file

@ -15,12 +15,37 @@
#include "geometry/point2d.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
#include <cctype>
#include <string>
#include <vector>
namespace generator
{
namespace
{
// https://www.wikidata.org/wiki/Wikidata:Identifiers
// Returns true iff |tagValue| looks like a wikidata *item* id: the letter 'Q'
// followed by one or more decimal digits, e.g. "Q42". Only items are needed;
// other entity kinds (properties "P...", lexemes "L...") are rejected.
bool WikiDataValidator(std::string const & tagValue)
{
  if (tagValue.size() < 2)
    return false;

  if (tagValue.front() != 'Q')
    return false;

  for (size_t pos = 1; pos < tagValue.size(); ++pos)
  {
    // Cast to unsigned char: passing a negative char (possible for non-ASCII
    // bytes when char is signed) to std::isdigit is undefined behavior.
    if (!std::isdigit(static_cast<unsigned char>(tagValue[pos])))
      return false;
  }
  return true;
}
} // namespace
TranslatorPlanet::TranslatorPlanet(std::shared_ptr<EmitterInterface> emitter,
cache::IntermediateDataReader & holder,
feature::GenerateInfo const & info)
@ -31,6 +56,7 @@ TranslatorPlanet::TranslatorPlanet(std::shared_ptr<EmitterInterface> emitter,
, m_nodeRelations(m_routingTagsProcessor)
, m_wayRelations(m_routingTagsProcessor)
, m_metalinesBuilder(info.GetIntermediateFileName(METALINES_FILENAME))
, m_wikiDataCollector(info.m_id2wikidataFilename, "wikidata", WikiDataValidator, true /* ignoreIfNotOpen */)
{
auto const addrFilePath = info.GetAddressesFileName();
if (!addrFilePath.empty())
@ -182,7 +208,7 @@ bool TranslatorPlanet::ParseType(OsmElement * p, FeatureParams & params)
m_routingTagsProcessor.m_cameraNodeWriter.Process(*p, params, m_cache);
m_routingTagsProcessor.m_roadAccessWriter.Process(*p);
m_wikiDataCollector.Collect(GetGeoObjectId(*p), *p);
return true;
}

View file

@ -1,6 +1,7 @@
#pragma once
#include "generator/camera_info_collector.hpp"
#include "generator/collector_tag.hpp"
#include "generator/metalines_builder.hpp"
#include "generator/relation_tags.hpp"
#include "generator/routing_helpers.hpp"
@ -58,5 +59,6 @@ private:
RelationTagsNode m_nodeRelations;
RelationTagsWay m_wayRelations;
feature::MetalinesBuilder m_metalinesBuilder;
CollectorTag m_wikiDataCollector;
};
} // namespace generator

View file

@ -65,4 +65,67 @@ void WikiUrlDumper::DumpOne(std::string const & path, std::ostream & stream)
stream << path << "\t" << feature.GetMostGenericOsmId() << "\t" << wikiUrl << "\n";
});
}
// Reads the "osmId<tab>wikidataId" mapping from |path| into m_id2wikiData.
// Remembers |datFiles| (the intermediate feature dumps) for later filtering.
// Throws std::ios::failure if |path| cannot be opened.
WikiDataFilter::WikiDataFilter(std::string const & path, std::vector<std::string> const & datFiles)
  : m_path(path), m_dataFiles(datFiles)
{
  std::ifstream stream;
  stream.exceptions(std::fstream::failbit | std::fstream::badbit);
  stream.open(m_path);
  stream.exceptions(std::fstream::badbit);
  uint64_t id = 0;
  std::string wikidata;
  // Test the stream state right after extraction. The previous
  // |while (stream) { stream >> ...; emplace; }| form ran the body one extra
  // time at EOF with stale values and, on an empty file, read |id|
  // uninitialized.
  while (stream >> id >> wikidata)
    m_id2wikiData.emplace(base::GeoObjectId(id), wikidata);
}
// static
// Streams "osmId<tab>wikidataId" lines for every feature in the dat file at
// |path| that both passes the wiki checker and has an entry in |id2wikiData|.
void WikiDataFilter::FilterOne(std::string const & path, std::map<base::GeoObjectId, std::string> const & id2wikiData,
                               std::ostream & stream)
{
  auto const & wikiChecker = ftypes::WikiChecker::Instance();
  auto const processFeature = [&](FeatureBuilder1 const & feature, uint64_t /* pos */) {
    if (!wikiChecker(feature.GetTypesHolder()))
      return;

    auto const match = id2wikiData.find(feature.GetMostGenericOsmId());
    if (match != std::end(id2wikiData))
      stream << match->first.GetEncodedId() << "\t" << match->second << "\n";
  };
  feature::ForEachFromDatRawFormat(path, processFeature);
}
void WikiDataFilter::Filter(size_t cpuCount)
{
CHECK_GREATER(cpuCount, 0, ());
base::thread_pool::computational::ThreadPool threadPool(cpuCount);
std::vector<std::future<std::string>> futures;
futures.reserve(m_dataFiles.size());
auto const fn = [&](std::string const & filename) {
std::stringstream stringStream;
FilterOne(filename, m_id2wikiData, stringStream);
return stringStream.str();
};
for (auto const & path : m_dataFiles)
{
auto result = threadPool.Submit(fn, path);
futures.emplace_back(std::move(result));
}
std::ofstream stream;
stream.exceptions(std::fstream::failbit | std::fstream::badbit);
stream.open(m_path);
for (auto & f : futures)
{
auto lines = f.get();
stream << lines;
}
}
} // namespace generator

View file

@ -1,6 +1,9 @@
#pragma once
#include "base/geo_object_id.hpp"
#include <iosfwd>
#include <map>
#include <string>
#include <vector>
@ -9,7 +12,7 @@ namespace generator
class WikiUrlDumper
{
public:
WikiUrlDumper(std::string const & path, std::vector<std::string> const & datFiles);
explicit WikiUrlDumper(std::string const & path, std::vector<std::string> const & datFiles);
static void DumpOne(std::string const & path, std::ostream & stream);
@ -19,4 +22,20 @@ private:
std::string m_path;
std::vector<std::string> m_dataFiles;
};
class WikiDataFilter
{
public:
explicit WikiDataFilter(std::string const & path, std::vector<std::string> const & datFiles);
static void FilterOne(std::string const & path, std::map<base::GeoObjectId, std::string> const & id2wikiData,
std::ostream & stream);
void Filter(size_t cpuCount);
private:
std::string m_path;
std::map<base::GeoObjectId, std::string> m_id2wikiData;
std::vector<std::string> m_dataFiles;
};
} // namespace generator

View file

@ -6,6 +6,8 @@ import logging
import os
import random
import time
import types
import urllib.error
import urllib.parse
from multiprocessing.pool import ThreadPool
@ -13,6 +15,7 @@ import htmlmin
import requests
import wikipediaapi
from bs4 import BeautifulSoup
from wikidata.client import Client
"""
This script downloads Wikipedia pages for different languages.
@ -20,7 +23,7 @@ This script downloads Wikipedia pages for different languages.
log = logging.getLogger(__name__)
WORKERS = 80
CHUNK_SIZE = 128
CHUNK_SIZE = 16
REQUEST_ATTEMPTS = 32
ATTEMPTS_PAUSE_MS = 4000
@ -48,16 +51,21 @@ class GettingError(MyException):
pass
def try_get(obj, prop):
def try_get(obj, prop, *args, **kwargs):
attempts = REQUEST_ATTEMPTS
while attempts != 0:
try:
return getattr(obj, prop)
attr = getattr(obj, prop)
is_method = isinstance(attr, types.MethodType)
return attr(*args, **kwargs) if is_method else attr
except (requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout,
json.decoder.JSONDecodeError):
time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
attempts -= 1
except urllib.error.HTTPError as e:
if e.code == 404:
raise GettingError(f"Page not found {e.msg}")
except KeyError:
raise GettingError(f"Getting {prop} field failed. {prop} not found.")
@ -80,7 +88,7 @@ def read_popularity(path):
return ids
def should_download_wikipage(popularity_set):
def should_download_page(popularity_set):
@functools.wraps(popularity_set)
def wrapped(ident):
return popularity_set is None or ident in popularity_set
@ -184,7 +192,7 @@ def get_wiki_langs(url):
return curr_lang
def download_all(path, url, langs):
def download_all_from_wikipedia(path, url, langs):
try:
available_langs = get_wiki_langs(url)
except ParseError:
@ -195,8 +203,8 @@ def download_all(path, url, langs):
download(path, lang[1])
def worker(output_dir, checker, langs):
@functools.wraps(worker)
def wikipedia_worker(output_dir, checker, langs):
@functools.wraps(wikipedia_worker)
def wrapped(line):
if not line.strip():
return
@ -211,20 +219,94 @@ def worker(output_dir, checker, langs):
return
parsed = urllib.parse.urlparse(url)
path = os.path.join(output_dir, parsed.netloc, parsed.path[1:])
download_all(path, url, langs)
download_all_from_wikipedia(path, url, langs)
return wrapped
def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
    """Download wikipedia pages for every url listed in input_file.

    Skips the first line (presumably a header — confirm against the url
    dumper's output format) and processes the rest with a thread pool.
    """
    worker = wikipedia_worker(output_dir, checker, langs)
    with open(input_file) as lines:
        lines.readline()  # discard the first line
        pool = ThreadPool(processes=WORKERS)
        pool.map(worker, lines, CHUNK_SIZE)
        pool.close()
        pool.join()
def get_wikidata_urls(entity, langs):
    """Return sitelink urls of a wikidata entity for the wanted languages.

    A sitelink key that starts with one of the codes in langs matches
    (e.g. "en" matches "enwiki"). Returns None when the entity carries
    no sitelinks at all.
    """
    try:
        # Look the sitelinks up once instead of re-indexing entity.data
        # for every matching key.
        sitelinks = entity.data["sitelinks"]
    except (KeyError, AttributeError):
        log.exception(f"Sitelinks not found for {entity.id}.")
        return None
    # Generator inside any() avoids building a throwaway list per key.
    return [
        link["url"] for key, link in sitelinks.items()
        if any(key.startswith(lang) for lang in langs)
    ]
def wikidata_worker(output_dir, checker, langs):
    """Build a worker that downloads wiki pages for one id2wikidata line.

    Each line is "<ident>\t<wikidata_id>". The worker resolves the wikidata
    entity, picks sitelink urls for langs and downloads each page into
    <output_dir>/<wikidata_id>/.
    """
    @functools.wraps(wikidata_worker)
    def wrapped(line):
        if not line.strip():
            return
        try:
            ident, wikidata_id = line.split("\t")
            ident = int(ident)
            wikidata_id = wikidata_id.strip()
            if not checker(ident):
                return
        except (AttributeError, IndexError, ValueError):
            # ValueError is what a malformed line actually raises: both the
            # 2-tuple unpacking of split("\t") and int() raise it.
            log.exception(f"{line} is incorrect.")
            return
        client = Client()
        try:
            entity = try_get(client, "get", wikidata_id, load=True)
        except GettingError:
            log.exception(f"Error: page is not downloaded {wikidata_id}.")
            return
        urls = get_wikidata_urls(entity, langs)
        if not urls:
            return
        path = os.path.join(output_dir, wikidata_id)
        for url in urls:
            download(path, url)
    return wrapped
def download_from_wikidata_tags(input_file, output_dir, langs, checker):
    """Download wiki pages for every wikidata id listed in input_file.

    Pages are stored under <output_dir>/wikidata/, one subdirectory per id.
    """
    target_dir = os.path.join(output_dir, "wikidata")
    os.makedirs(target_dir, exist_ok=True)
    worker = wikidata_worker(target_dir, checker, langs)
    with open(input_file) as lines:
        pool = ThreadPool(processes=WORKERS)
        pool.map(worker, lines, CHUNK_SIZE)
        pool.close()
        pool.join()
def check_and_get_checker(popularity_file):
    """Build the predicate deciding which object ids to download pages for.

    If popularity_file is unset or missing, the returned checker accepts
    every id; otherwise only ids listed in the file pass.
    """
    popularity_set = None
    if popularity_file is None:
        # Plain string: the previous f-string had no placeholders.
        log.warning("Popularity file not set.")
    elif os.path.exists(popularity_file):
        popularity_set = read_popularity(popularity_file)
        log.info(f"Popularity set size: {len(popularity_set)}.")
    else:
        log.error(f"Popularity file ({popularity_file}) not found.")
    return should_download_page(popularity_set)
def parse_args():
parser = argparse.ArgumentParser(description="Download wiki pages.")
parser.add_argument("--o", metavar="PATH", type=str,
parser.add_argument("--output_dir", metavar="PATH", type=str,
help="Output dir for saving pages")
parser.add_argument("--p", metavar="PATH", type=str,
parser.add_argument("--popularity", metavar="PATH", type=str,
help="File with popular object ids for which we "
"download wikipedia data. If not given, download "
"for all objects.")
parser.add_argument('--i', metavar="PATH", type=str, required=True,
parser.add_argument('--wikipedia', metavar="PATH", type=str, required=True,
help="Input file with wikipedia url.")
parser.add_argument('--wikidata', metavar="PATH", type=str,
help="Input file with wikidata ids.")
parser.add_argument('--langs', metavar="LANGS", type=str, nargs='+',
action='append',
help="Languages for pages. If left blank, pages in all "
@ -236,22 +318,20 @@ def main():
log.setLevel(logging.WARNING)
wikipediaapi.log.setLevel(logging.WARNING)
args = parse_args()
input_file = args.i
output_dir = args.o
popularity_file = args.p
wikipedia_file = args.wikipedia
wikidata_file = args.wikidata
output_dir = args.output_dir
popularity_file = args.popularity
langs = list(itertools.chain.from_iterable(args.langs))
os.makedirs(output_dir, exist_ok=True)
popularity_set = read_popularity(popularity_file) if popularity_file else None
if popularity_set:
log.info(f"Popularity set size: {len(popularity_set)}.")
checker = should_download_wikipage(popularity_set)
with open(input_file) as file:
_ = file.readline()
pool = ThreadPool(processes=WORKERS)
pool.map(worker(output_dir, checker, langs), file, CHUNK_SIZE)
pool.close()
pool.join()
checker = check_and_get_checker(popularity_file)
download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
if wikidata_file is None:
log.warning(f"Wikidata file not set.")
elif os.path.exists(wikidata_file):
download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
else:
log.warning(f"Wikidata ({wikidata_file}) file not set.")
if __name__ == "__main__":
main()

View file

@ -185,6 +185,7 @@ DESCRIPTIONS_DOWNLOADER="$PYTHON_SCRIPTS_PATH/descriptions_downloader.py"
LOCALADS_SCRIPT="$PYTHON_SCRIPTS_PATH/local_ads/mwm_to_csv_4localads.py"
UGC_FILE="${UGC_FILE:-$INTDIR/ugc_db.sqlite3}"
POPULAR_PLACES_FILE="${POPULAR_PLACES_FILE:-$INTDIR/popular_places.csv}"
WIKIDATA_FILE="${WIKIDATA_FILE:-$INTDIR/id2wikidata.csv}"
BOOKING_SCRIPT="$PYTHON_SCRIPTS_PATH/booking_hotels.py"
BOOKING_FILE="${BOOKING_FILE:-$INTDIR/hotels.csv}"
OPENTABLE_SCRIPT="$PYTHON_SCRIPTS_PATH/opentable_restaurants.py"
@ -453,6 +454,9 @@ if [ "$MODE" == "features" ]; then
[ -f "$BOOKING_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --booking_data=$BOOKING_FILE"
[ -f "$OPENTABLE_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --opentable_data=$OPENTABLE_FILE"
[ -f "$POPULAR_PLACES_FILE" ] && PARAMS_SPLIT="$PARAMS_SPLIT --popular_places_data=$POPULAR_PLACES_FILE"
[ -n "$OPT_DESCRIPTIONS" ] && PARAMS_SPLIT="$PARAMS_SPLIT --id2wikidata=$WIKIDATA_FILE"
"$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" \
--node_storage=$NODE_STORAGE \
--osm_file_type=o5m \
@ -555,14 +559,18 @@ if [ "$MODE" == "descriptions" ]; then
LOG="$LOG_PATH/descriptions.log"
LANGS="en ru es"
"$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" --dump_wikipedia_urls="$URLS_PATH" 2>> $LOG
$PYTHON36 $DESCRIPTIONS_DOWNLOADER --i "$URLS_PATH" --o "$WIKI_PAGES_PATH" --langs $LANGS 2>> $LOG
"$GENERATOR_TOOL" --intermediate_data_path="$INTDIR/" --user_resource_path="$DATA_PATH/" \
--dump_wikipedia_urls="$URLS_PATH" --id2wikidata="$WIKIDATA_FILE" 2>> $LOG
PARAMS="--wikipedia $URLS_PATH --wikidata $WIKIDATA_FILE --output_dir $WIKI_PAGES_PATH"
[ -f "$POPULAR_PLACES_FILE" ] && PARAMS="$PARAMS --popularity=$POPULAR_PLACES_FILE"
$PYTHON36 $DESCRIPTIONS_DOWNLOADER $PARAMS --langs $LANGS 2>> $LOG
for file in "$TARGET"/*.mwm; do
if [[ "$file" != *minsk-pass* && "$file" != *World* ]]; then
BASENAME="$(basename "$file" .mwm)"
"$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --data_path="$TARGET" --user_resource_path="$DATA_PATH/" \
--output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
"$GENERATOR_TOOL" --wikipedia_pages="$WIKI_PAGES_PATH/" --id2wikidata="$WIKIDATA_FILE" \
--data_path="$TARGET" --user_resource_path="$DATA_PATH/" --output="$BASENAME" 2>> "$LOG_PATH/$BASENAME.log" &
forky
fi
done