[generator][search] Move postcode_points section build to separate file.

This commit is contained in:
tatiana-yan 2019-12-05 14:08:29 +03:00 committed by Maksim Andrianov
parent 6a70dc7160
commit 8eaf9f862f
7 changed files with 247 additions and 199 deletions

View file

@ -146,6 +146,8 @@ set(
platform_helpers.hpp
popular_places_section_builder.cpp
popular_places_section_builder.hpp
postcode_points_builder.cpp
postcode_points_builder.hpp
postcodes_section_builder.cpp
postcodes_section_builder.hpp
processor_booking.hpp

View file

@ -6,6 +6,7 @@
#include "generator/feature_generator.hpp"
#include "generator/feature_sorter.hpp"
#include "generator/generator_tests_support/test_feature.hpp"
#include "generator/postcode_points_builder.hpp"
#include "generator/postcodes_section_builder.hpp"
#include "generator/search_index_builder.hpp"
@ -157,9 +158,9 @@ void TestMwmBuilder::Finish()
if (!m_postcodesPath.empty() && m_postcodesCountryInfoGetter)
{
CHECK(indexer::BuildPostcodesWithInfoGetter(m_file.GetDirectory(), m_file.GetCountryName(),
m_postcodesPath, true /* forceRebuild */,
*m_postcodesCountryInfoGetter),
CHECK(indexer::BuildPostcodePointsWithInfoGetter(m_file.GetDirectory(), m_file.GetCountryName(),
m_postcodesPath, true /* forceRebuild */,
*m_postcodesCountryInfoGetter),
("Can't build postcodes section."));
}

View file

@ -17,6 +17,7 @@
#include "generator/osm_source.hpp"
#include "generator/platform_helpers.hpp"
#include "generator/popular_places_section_builder.hpp"
#include "generator/postcode_points_builder.hpp"
#include "generator/postcodes_section_builder.hpp"
#include "generator/processor_factory.hpp"
#include "generator/ratings_section_builder.hpp"
@ -378,7 +379,7 @@ MAIN_WITH_ERROR_HANDLING([](int argc, char ** argv)
if (!FLAGS_postcodes_dataset.empty())
{
if (!indexer::BuildPostcodes(path, country, FLAGS_postcodes_dataset, true /*forceRebuild*/))
if (!indexer::BuildPostcodePoints(path, country, FLAGS_postcodes_dataset, true /*forceRebuild*/))
LOG(LCRITICAL, ("Error generating postcodes section."));
}

View file

@ -0,0 +1,219 @@
#include "generator/postcode_points_builder.hpp"
#include "search/postcode_points.hpp"
#include "search/search_index_values.hpp"
#include "search/search_trie.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "indexer/trie_builder.hpp"
#include "storage/country_info_getter.hpp"
#include "storage/storage_defines.hpp"
#include "platform/platform.hpp"
#include "coding/map_uint32_to_val.hpp"
#include "coding/reader.hpp"
#include "coding/reader_writer_ops.hpp"
#include "coding/writer.hpp"
#include "geometry/mercator.hpp"
#include "geometry/point2d.hpp"
#include "base/assert.hpp"
#include "base/checked_cast.hpp"
#include "base/file_name_utils.hpp"
#include "base/logging.hpp"
#include "base/scope_guard.hpp"
#include "base/string_utils.hpp"
#include <algorithm>
#include <cstdint>
#include <fstream>
#include <functional>
#include <string>
#include <utility>
#include <vector>
#include "defines.hpp"
using namespace std;
namespace
{
// Reads the UK postcodes CSV |filename| and collects the postcodes that belong to |countryId|.
//
// Every row must consist of exactly three comma-separated fields: postcode, latitude and
// longitude. A row with a different field count or with unparsable coordinates aborts via CHECK.
// A row is accepted when its postcode is at least 5 characters long and |infoGetter| reports
// |countryId| among the regions found around its point. For each accepted row the mercator point
// is appended to |valueMapping| and the pair (normalized postcode, Value(index of that point)) is
// appended to |keyValuePairs|.
//
// |valueMapping| is expected to be empty on entry: indices are counted from zero and compared
// with its size (CHECK) each time a row is accepted.
// Failing to open |filename| throws, because stream exceptions are enabled for failbit during
// open().
template <typename Key, typename Value>
void GetUKPostcodes(string const & filename, storage::CountryId const & countryId,
                    storage::CountryInfoGetter & infoGetter, vector<m2::PointD> & valueMapping,
                    vector<pair<Key, Value>> & keyValuePairs)
{
  // Original dataset uses UK National Grid UTM coordinates.
  // It was converted to WGS84 by https://pypi.org/project/OSGridConverter/.
  size_t constexpr kPostcodeIndex = 0;
  size_t constexpr kLatIndex = 1;
  size_t constexpr kLongIndex = 2;
  size_t constexpr kDatasetCount = 3;

  ifstream data;
  // Report a dataset that cannot be opened ...
  data.exceptions(fstream::failbit | fstream::badbit);
  data.open(filename);
  // ... but do not throw on failbit afterwards: getline() sets it when the input is exhausted,
  // which is the normal way out of the loop below.
  data.exceptions(fstream::badbit);

  string line;
  size_t index = 0;
  while (getline(data, line))
  {
    vector<string> fields;
    strings::ParseCSVRow(line, ',', fields);
    CHECK_EQUAL(fields.size(), kDatasetCount, (line));

    // The offending row is attached to the failure message to make a broken dataset debuggable.
    double lat;
    CHECK(strings::to_double(fields[kLatIndex], lat), (line));
    double lon;
    CHECK(strings::to_double(fields[kLongIndex], lon), (line));
    auto const p = mercator::FromLatLon(lat, lon);

    // UK postcodes formats are: aana naa, ana naa, an naa, ann naa, aan naa, aann naa.
    auto postcode = fields[kPostcodeIndex];
    // Do not index outer postcodes.
    // This cheap filter runs before the region lookup so that rejected rows cost no lookup.
    if (postcode.size() < 5)
      continue;

    vector<storage::CountryId> countries;
    infoGetter.GetRegionsCountryId(p, countries, 200.0 /* lookupRadiusM */);
    if (find(countries.begin(), countries.end(), countryId) == countries.end())
      continue;

    // Space is skipped in dataset for |aana naa| and |aann naa| to make it fit 7 symbols in csv.
    // Let's fix it here.
    if (postcode.find(' ') == string::npos)
      postcode.insert(static_cast<size_t>(postcode.size() - 3), " ");

    CHECK_EQUAL(valueMapping.size(), index, ());
    valueMapping.push_back(p);
    keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(postcode), Value(index));
    ++index;
  }
}
// Serializes the postcode points section into |writer|, which must be positioned at 0.
//
// Written in this order: header, padding, trie of normalized postcodes, padding, centers table
// holding one point per postcode. The header is serialized twice: first as a placeholder that
// reserves its space, then again at position 0 once all offsets and sizes are known.
// |tmpName| names a scratch file: the trie is built there and then copied into |writer| with
// rw_ops::Reverse(). The geometry params of the centers table come from the data header of
// |container|.
//
// Returns false when the dataset yields no postcodes for |country|; by then only the placeholder
// header and the first padding have been written. Returns true otherwise, leaving |writer|
// positioned at the end of the section.
bool BuildPostcodePointsImpl(FilesContainerR & container, storage::CountryId const & country,
                             string const & dataset, string const & tmpName,
                             storage::CountryInfoGetter & infoGetter, Writer & writer)
{
  using Key = strings::UniString;
  using Value = Uint64IndexValue;

  // Writes padding computed from the current position of |writer|.
  auto const writePadding = [&writer]() {
    uint64_t written = writer.Pos();
    coding::WritePadding(writer, written);
  };

  CHECK_EQUAL(writer.Pos(), 0, ());

  search::PostcodePoints::Header header;
  // Placeholder, overwritten with the final values at the end.
  header.Serialize(writer);
  writePadding();
  header.m_trieOffset = base::asserted_cast<uint32_t>(writer.Pos());

  vector<pair<Key, Value>> postcodes;
  vector<m2::PointD> points;
  GetUKPostcodes(dataset, country, infoGetter, points, postcodes);
  if (postcodes.empty())
    return false;

  sort(postcodes.begin(), postcodes.end());

  {
    // Scoped so that the scratch file is closed before it is read back.
    FileWriter trieWriter(tmpName);
    SingleValueSerializer<Value> valueSerializer;
    trie::Build<Writer, Key, SingleUint64Value, SingleValueSerializer<Value>>(
        trieWriter, valueSerializer, postcodes);
  }
  rw_ops::Reverse(FileReader(tmpName), writer);
  header.m_trieSize = base::asserted_cast<uint32_t>(writer.Pos() - header.m_trieOffset);

  writePadding();
  header.m_pointsOffset = base::asserted_cast<uint32_t>(writer.Pos());

  {
    search::CentersTableBuilder centers;
    centers.SetGeometryParams(feature::DataHeader(container).GetBounds());
    // Points are stored under the same indices the trie values refer to.
    size_t id = 0;
    for (auto const & point : points)
      centers.Put(base::asserted_cast<uint32_t>(id++), point);
    centers.Freeze(writer);
  }
  header.m_pointsSize = base::asserted_cast<uint32_t>(writer.Pos() - header.m_pointsOffset);

  // Go back, store the completed header and restore the position.
  auto const sectionEnd = writer.Pos();
  writer.Seek(0);
  header.Serialize(writer);
  writer.Seek(sectionEnd);
  return true;
}
} // namespace
namespace indexer
{
// Builds the postcode points section from |datasetPath| and writes it into the mwm of |country|
// located in directory |path|. |infoGetter| is used to decide which postcodes belong to
// |country|.
//
// Returns true when the section was written and also when there was nothing to do: |country| is
// one of the world maps, the section already exists and |forceRebuild| is false, or the dataset
// has no postcodes for |country|. Returns false if a Reader or Writer exception was caught.
// Both temporary files are removed on every way out of the function.
bool BuildPostcodePointsWithInfoGetter(string const & path, string const & country,
                                       string const & datasetPath, bool forceRebuild,
                                       storage::CountryInfoGetter & infoGetter)
{
  // World maps get no postcode points. The bare country name is compared here: the joined path
  // built below also carries |path| and DATA_FILE_EXTENSION, so comparing it with the world names
  // could not match.
  // NOTE(review): assumes WORLD_FILE_NAME / WORLD_COASTS_FILE_NAME are plain map names without
  // directory or extension - confirm against defines.hpp.
  if (country == WORLD_FILE_NAME || country == WORLD_COASTS_FILE_NAME)
    return true;

  auto const filename = base::JoinPath(path, country + DATA_FILE_EXTENSION);

  Platform & platform = GetPlatform();
  FilesContainerR readContainer(platform.GetReader(filename, "f"));
  if (readContainer.IsExist(POSTCODE_POINTS_FILE_TAG) && !forceRebuild)
    return true;

  string const postcodesFilePath = filename + "." + POSTCODE_POINTS_FILE_TAG EXTENSION_TMP;
  // Temporary file used to reverse trie part of postcodes section.
  string const trieTmpFilePath =
      filename + "." + POSTCODE_POINTS_FILE_TAG + "_trie" + EXTENSION_TMP;
  SCOPE_GUARD(postcodesFileGuard, bind(&FileWriter::DeleteFileX, postcodesFilePath));
  SCOPE_GUARD(trieTmpFileGuard, bind(&FileWriter::DeleteFileX, trieTmpFilePath));

  try
  {
    {
      // The writer lives in its own scope: it is closed before the container below reads the
      // temporary file back, so the result does not depend on when the writer flushes.
      FileWriter writer(postcodesFilePath);
      if (!BuildPostcodePointsImpl(readContainer, storage::CountryId(country), datasetPath,
                                   trieTmpFilePath, infoGetter, writer))
      {
        // No postcodes for country.
        return true;
      }

      LOG(LINFO, ("Postcodes section size =", writer.Size()));
    }

    FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING);
    writeContainer.Write(postcodesFilePath, POSTCODE_POINTS_FILE_TAG);
  }
  catch (Reader::Exception const & e)
  {
    LOG(LERROR, ("Error while reading file:", e.Msg()));
    return false;
  }
  catch (Writer::Exception const & e)
  {
    LOG(LERROR, ("Error writing file:", e.Msg()));
    return false;
  }

  return true;
}
// Convenience entry point: creates a country info reader for the current platform (CHECKed to be
// non-null) and forwards all arguments to BuildPostcodePointsWithInfoGetter().
bool BuildPostcodePoints(string const & path, string const & country, string const & datasetPath,
                         bool forceRebuild)
{
  Platform const & currentPlatform = GetPlatform();
  auto countryInfo = storage::CountryInfoReader::CreateCountryInfoReader(currentPlatform);
  CHECK(countryInfo, ());

  return BuildPostcodePointsWithInfoGetter(path, country, datasetPath, forceRebuild, *countryInfo);
}
} // namespace indexer

View file

@ -0,0 +1,19 @@
#pragma once
#include <string>
namespace storage
{
class CountryInfoGetter;
}
namespace indexer
{
// Builds the postcode points section from the external postcodes dataset |datasetPath| and
// writes it to the mwm file of |country| located in directory |path|.
// An already existing section is kept unless |forceRebuild| is set.
// Returns true when the section was written or when there was nothing to do (the section is
// already present and |forceRebuild| is false, or the dataset has no postcodes for |country|).
// Returns false if reading or writing failed.
bool BuildPostcodePoints(std::string const & path, std::string const & country,
std::string const & datasetPath, bool forceRebuild);
// Same as BuildPostcodePoints(), but the country lookup is done with the caller-provided
// |infoGetter| instead of one created from the platform.
// Exposed for testing.
bool BuildPostcodePointsWithInfoGetter(std::string const & path, std::string const & country,
std::string const & datasetPath, bool forceRebuild,
storage::CountryInfoGetter & infoGetter);
} // namespace indexer

View file

@ -1,8 +1,7 @@
#include "search_index_builder.hpp"
#include "generator/search_index_builder.hpp"
#include "search/common.hpp"
#include "search/mwm_context.hpp"
#include "search/postcode_points.hpp"
#include "search/reverse_geocoder.hpp"
#include "search/search_index_values.hpp"
#include "search/search_trie.hpp"
@ -23,9 +22,6 @@
#include "indexer/search_string_utils.hpp"
#include "indexer/trie_builder.hpp"
#include "storage/country_info_getter.hpp"
#include "storage/storage_defines.hpp"
#include "platform/platform.hpp"
#include "coding/map_uint32_to_val.hpp"
@ -243,63 +239,6 @@ struct FeatureNameInserter
bool m_hasStreetType = false;
};
// Reads the UK postcodes CSV |filename| and collects the postcodes that belong to |countryId|.
// Every row must hold exactly three comma-separated fields (postcode, latitude, longitude);
// a wrong field count or unparsable coordinates abort via CHECK.
// For each accepted row the mercator point is appended to |valueMapping| and the pair
// (normalized postcode, Value(index of that point)) is appended to |keyValuePairs|.
// |valueMapping| is expected to be empty on entry: its size is CHECKed against a counter that
// starts at zero.
template <typename Key, typename Value>
void GetUKPostcodes(string const & filename, storage::CountryId const & countryId,
storage::CountryInfoGetter & infoGetter, vector<m2::PointD> & valueMapping,
vector<pair<Key, Value>> & keyValuePairs)
{
// Original dataset uses UK National Grid UTM coordinates.
// It was converted to WGS84 by https://pypi.org/project/OSGridConverter/.
size_t constexpr kPostcodeIndex = 0;
size_t constexpr kLatIndex = 1;
size_t constexpr kLongIndex = 2;
size_t constexpr kDatasetCount = 3;
ifstream data;
// Throw if the dataset cannot be opened ...
data.exceptions(fstream::failbit | fstream::badbit);
data.open(filename);
// ... but not on failbit afterwards: getline() sets it when the input is exhausted.
data.exceptions(fstream::badbit);
string line;
// Index of the next accepted postcode inside |valueMapping|.
size_t index = 0;
while (getline(data, line))
{
vector<string> fields;
strings::ParseCSVRow(line, ',', fields);
CHECK_EQUAL(fields.size(), kDatasetCount, (line));
double lat;
CHECK(strings::to_double(fields[kLatIndex], lat), ());
double lon;
CHECK(strings::to_double(fields[kLongIndex], lon), ());
auto const p = mercator::FromLatLon(lat, lon);
// Skip rows whose point is not reported as belonging to |countryId|.
vector<storage::CountryId> countries;
infoGetter.GetRegionsCountryId(p, countries, 200.0 /* lookupRadiusM */);
if (find(countries.begin(), countries.end(), countryId) == countries.end())
continue;
// UK postcodes formats are: aana naa, ana naa, an naa, ann naa, aan naa, aann naa.
auto postcode = fields[kPostcodeIndex];
// Do not index outer postcodes.
if (postcode.size() < 5)
continue;
// Space is skipped in dataset for |aana naa| and |aann naa| to make it fit 7 symbols in csv.
// Let's fix it here.
if (postcode.find(' ') == string::npos)
postcode.insert(static_cast<size_t>(postcode.size() - 3), " ");
CHECK_EQUAL(valueMapping.size(), index, ());
valueMapping.push_back(p);
keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(postcode), Value(index));
++index;
}
}
// Returns true iff feature name was indexed as postcode and should be ignored for name indexing.
bool InsertPostcodes(FeatureType & f, function<void(strings::UniString const &)> const & fn)
{
@ -588,9 +527,6 @@ void BuildAddressTable(FilesContainerR & container, string const & addressDataFi
namespace indexer
{
void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter);
bool BuildPostcodesImpl(FilesContainerR & container, storage::CountryId const & country,
string const & dataset, string const & tmpFileName,
storage::CountryInfoGetter & infoGetter, Writer & indexWriter);
bool BuildSearchIndexFromDataFile(string const & path, string const & country, bool forceRebuild,
uint32_t threadsCount)
@ -649,123 +585,6 @@ bool BuildSearchIndexFromDataFile(string const & path, string const & country, b
return true;
}
// Builds the postcodes section from |datasetPath| and writes it into the mwm of |country|
// located in directory |path|, using |infoGetter| to match postcodes to the country.
// Returns true when the section was written or when there was nothing to do (the section exists
// and |forceRebuild| is false, or there are no postcodes for the country); returns false if a
// Reader or Writer exception was caught.
bool BuildPostcodesWithInfoGetter(string const & path, string const & country,
string const & datasetPath, bool forceRebuild,
storage::CountryInfoGetter & infoGetter)
{
auto const filename = base::JoinPath(path, country + DATA_FILE_EXTENSION);
// NOTE(review): |filename| includes |path| and DATA_FILE_EXTENSION; if the WORLD_* macros are
// bare map names this guard never fires - confirm whether |country| was meant here.
if (filename == WORLD_FILE_NAME || filename == WORLD_COASTS_FILE_NAME)
return true;
Platform & platform = GetPlatform();
FilesContainerR readContainer(platform.GetReader(filename, "f"));
// Keep an existing section unless a rebuild is requested.
if (readContainer.IsExist(POSTCODE_POINTS_FILE_TAG) && !forceRebuild)
return true;
string const postcodesFilePath = filename + "." + POSTCODE_POINTS_FILE_TAG EXTENSION_TMP;
// Temporary file used to reverse trie part of postcodes section.
string const trieTmpFilePath =
filename + "." + POSTCODE_POINTS_FILE_TAG + "_trie" + EXTENSION_TMP;
// Both temporary files are deleted when the function is left.
SCOPE_GUARD(postcodesFileGuard, bind(&FileWriter::DeleteFileX, postcodesFilePath));
SCOPE_GUARD(trieTmpFileGuard, bind(&FileWriter::DeleteFileX, trieTmpFilePath));
try
{
FileWriter writer(postcodesFilePath);
if (!BuildPostcodesImpl(readContainer, storage::CountryId(country), datasetPath,
trieTmpFilePath, infoGetter, writer))
{
// No postcodes for country.
return true;
}
LOG(LINFO, ("Postcodes section size =", writer.Size()));
// Copy the temporary section file into the mwm container under the postcode points tag.
FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING);
writeContainer.Write(postcodesFilePath, POSTCODE_POINTS_FILE_TAG);
}
catch (Reader::Exception const & e)
{
LOG(LERROR, ("Error while reading file:", e.Msg()));
return false;
}
catch (Writer::Exception const & e)
{
LOG(LERROR, ("Error writing file:", e.Msg()));
return false;
}
return true;
}
// Convenience entry point: creates a country info reader for the current platform (CHECKed to be
// non-null) and forwards all arguments to BuildPostcodesWithInfoGetter().
bool BuildPostcodes(string const & path, string const & country, string const & datasetPath,
bool forceRebuild)
{
auto const & platform = GetPlatform();
auto infoGetter = storage::CountryInfoReader::CreateCountryInfoReader(platform);
CHECK(infoGetter, ());
return BuildPostcodesWithInfoGetter(path, country, datasetPath, forceRebuild, *infoGetter);
}
// Serializes the postcodes section into |writer|, which must be positioned at 0.
// Written in this order: header, padding, trie of normalized postcodes, padding, centers table
// with one point per postcode. |tmpName| names a scratch file in which the trie is built before
// it is copied into |writer| with rw_ops::Reverse().
// Returns false when the dataset yields no postcodes for |country|, true otherwise.
bool BuildPostcodesImpl(FilesContainerR & container, storage::CountryId const & country,
string const & dataset, string const & tmpName,
storage::CountryInfoGetter & infoGetter, Writer & writer)
{
using Key = strings::UniString;
using Value = Uint64IndexValue;
CHECK_EQUAL(writer.Pos(), 0, ());
search::PostcodePoints::Header header;
// Placeholder header: reserves the space, the final values are written at the end.
header.Serialize(writer);
uint64_t bytesWritten = writer.Pos();
coding::WritePadding(writer, bytesWritten);
header.m_trieOffset = base::asserted_cast<uint32_t>(writer.Pos());
vector<pair<Key, Value>> ukPostcodesKeyValuePairs;
vector<m2::PointD> valueMapping;
GetUKPostcodes(dataset, country, infoGetter, valueMapping, ukPostcodesKeyValuePairs);
if (ukPostcodesKeyValuePairs.empty())
return false;
sort(ukPostcodesKeyValuePairs.begin(), ukPostcodesKeyValuePairs.end());
{
// Scoped so that the scratch file is closed before it is read back below.
FileWriter tmpWriter(tmpName);
SingleValueSerializer<Value> serializer;
trie::Build<Writer, Key, SingleUint64Value, SingleValueSerializer<Value>>(
tmpWriter, serializer, ukPostcodesKeyValuePairs);
}
rw_ops::Reverse(FileReader(tmpName), writer);
header.m_trieSize = base::asserted_cast<uint32_t>(writer.Pos() - header.m_trieOffset);
bytesWritten = writer.Pos();
coding::WritePadding(writer, bytesWritten);
header.m_pointsOffset = base::asserted_cast<uint32_t>(writer.Pos());
{
search::CentersTableBuilder builder;
builder.SetGeometryParams(feature::DataHeader(container).GetBounds());
// Points are stored under the same indices the trie values refer to.
for (size_t i = 0; i < valueMapping.size(); ++i)
builder.Put(base::asserted_cast<uint32_t>(i), valueMapping[i]);
builder.Freeze(writer);
}
header.m_pointsSize = base::asserted_cast<uint32_t>(writer.Pos() - header.m_pointsOffset);
// Go back, store the completed header and restore the position.
auto const endOffset = writer.Pos();
writer.Seek(0);
header.Serialize(writer);
writer.Seek(endOffset);
return true;
}
void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter)
{
using Key = strings::UniString;

View file

@ -2,11 +2,6 @@
#include <string>
namespace storage
{
class CountryInfoGetter;
}
namespace indexer
{
// Builds the latest version of the search index section and writes it to the mwm file.
@ -15,12 +10,4 @@ namespace indexer
// in version mismatch when trying to read the index.
bool BuildSearchIndexFromDataFile(std::string const & path, std::string const & country,
bool forceRebuild, uint32_t threadsCount);
// Builds postcodes section with external postcodes data and writes it to the mwm file.
bool BuildPostcodes(std::string const & path, std::string const & country,
std::string const & datasetPath, bool forceRebuild);
// Exposed for testing.
bool BuildPostcodesWithInfoGetter(std::string const & path, std::string const & country,
std::string const & datasetPath, bool forceRebuild,
storage::CountryInfoGetter & infoGetter);
} // namespace indexer