From 8eaf9f862f24b17d9e0f0c2bc821ac0b04a65ed2 Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Thu, 5 Dec 2019 14:08:29 +0300 Subject: [PATCH] [generator][search] Move postcode_points section build to separate file. --- generator/CMakeLists.txt | 2 + .../test_mwm_builder.cpp | 7 +- generator/generator_tool/generator_tool.cpp | 3 +- generator/postcode_points_builder.cpp | 219 ++++++++++++++++++ generator/postcode_points_builder.hpp | 19 ++ generator/search_index_builder.cpp | 183 +-------------- generator/search_index_builder.hpp | 13 -- 7 files changed, 247 insertions(+), 199 deletions(-) create mode 100644 generator/postcode_points_builder.cpp create mode 100644 generator/postcode_points_builder.hpp diff --git a/generator/CMakeLists.txt b/generator/CMakeLists.txt index 2d93810c90..ab358724f0 100644 --- a/generator/CMakeLists.txt +++ b/generator/CMakeLists.txt @@ -146,6 +146,8 @@ set( platform_helpers.hpp popular_places_section_builder.cpp popular_places_section_builder.hpp + postcode_points_builder.cpp + postcode_points_builder.hpp postcodes_section_builder.cpp postcodes_section_builder.hpp processor_booking.hpp diff --git a/generator/generator_tests_support/test_mwm_builder.cpp b/generator/generator_tests_support/test_mwm_builder.cpp index 2d8fd28f4a..cef3de94b0 100644 --- a/generator/generator_tests_support/test_mwm_builder.cpp +++ b/generator/generator_tests_support/test_mwm_builder.cpp @@ -6,6 +6,7 @@ #include "generator/feature_generator.hpp" #include "generator/feature_sorter.hpp" #include "generator/generator_tests_support/test_feature.hpp" +#include "generator/postcode_points_builder.hpp" #include "generator/postcodes_section_builder.hpp" #include "generator/search_index_builder.hpp" @@ -157,9 +158,9 @@ void TestMwmBuilder::Finish() if (!m_postcodesPath.empty() && m_postcodesCountryInfoGetter) { - CHECK(indexer::BuildPostcodesWithInfoGetter(m_file.GetDirectory(), m_file.GetCountryName(), - m_postcodesPath, true /* forceRebuild */, - 
*m_postcodesCountryInfoGetter), + CHECK(indexer::BuildPostcodePointsWithInfoGetter(m_file.GetDirectory(), m_file.GetCountryName(), + m_postcodesPath, true /* forceRebuild */, + *m_postcodesCountryInfoGetter), ("Can't build postcodes section.")); } diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp index f0b8d35104..4ab04e316b 100644 --- a/generator/generator_tool/generator_tool.cpp +++ b/generator/generator_tool/generator_tool.cpp @@ -17,6 +17,7 @@ #include "generator/osm_source.hpp" #include "generator/platform_helpers.hpp" #include "generator/popular_places_section_builder.hpp" +#include "generator/postcode_points_builder.hpp" #include "generator/postcodes_section_builder.hpp" #include "generator/processor_factory.hpp" #include "generator/ratings_section_builder.hpp" @@ -378,7 +379,7 @@ MAIN_WITH_ERROR_HANDLING([](int argc, char ** argv) if (!FLAGS_postcodes_dataset.empty()) { - if (!indexer::BuildPostcodes(path, country, FLAGS_postcodes_dataset, true /*forceRebuild*/)) + if (!indexer::BuildPostcodePoints(path, country, FLAGS_postcodes_dataset, true /*forceRebuild*/)) LOG(LCRITICAL, ("Error generating postcodes section.")); } diff --git a/generator/postcode_points_builder.cpp b/generator/postcode_points_builder.cpp new file mode 100644 index 0000000000..2c2e57f802 --- /dev/null +++ b/generator/postcode_points_builder.cpp @@ -0,0 +1,219 @@ +#include "generator/postcode_points_builder.hpp" + +#include "search/postcode_points.hpp" +#include "search/search_index_values.hpp" +#include "search/search_trie.hpp" + +#include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" +#include "indexer/trie_builder.hpp" + +#include "storage/country_info_getter.hpp" +#include "storage/storage_defines.hpp" + +#include "platform/platform.hpp" + +#include "coding/map_uint32_to_val.hpp" +#include "coding/reader.hpp" +#include "coding/reader_writer_ops.hpp" +#include "coding/writer.hpp" + +#include 
"geometry/mercator.hpp" +#include "geometry/point2d.hpp" + +#include "base/assert.hpp" +#include "base/checked_cast.hpp" +#include "base/file_name_utils.hpp" +#include "base/logging.hpp" +#include "base/scope_guard.hpp" +#include "base/string_utils.hpp" + +#include +#include +#include +#include + +#include "defines.hpp" + +using namespace std; + +namespace +{ +template +void GetUKPostcodes(string const & filename, storage::CountryId const & countryId, + storage::CountryInfoGetter & infoGetter, vector & valueMapping, + vector> & keyValuePairs) +{ + // Original dataset uses UK National Grid UTM coordinates. It was converted to WGS84 by + // https://pypi.org/project/OSGridConverter/. Each row is |postcode,lat,lon|. + size_t constexpr kPostcodeIndex = 0; + size_t constexpr kLatIndex = 1; + size_t constexpr kLongIndex = 2; + size_t constexpr kDatasetCount = 3; + + ifstream data; + data.exceptions(fstream::failbit | fstream::badbit); + data.open(filename); + data.exceptions(fstream::badbit); + + string line; + size_t index = 0; + while (getline(data, line)) + { + vector fields; + strings::ParseCSVRow(line, ',', fields); + CHECK_EQUAL(fields.size(), kDatasetCount, (line)); + + double lat; + CHECK(strings::to_double(fields[kLatIndex], lat), ()); + + double lon; + CHECK(strings::to_double(fields[kLongIndex], lon), ()); + + auto const p = mercator::FromLatLon(lat, lon); + + vector countries; + infoGetter.GetRegionsCountryId(p, countries, 200.0 /* lookupRadiusM */); + if (find(countries.begin(), countries.end(), countryId) == countries.end()) + continue; + + // UK postcode formats are: aana naa, ana naa, an naa, ann naa, aan naa, aann naa. + + auto postcode = fields[kPostcodeIndex]; + // Do not index outer postcodes (shorter than 5 symbols). + if (postcode.size() < 5) + continue; + + // Space is skipped in dataset for |aana naa| and |aann naa| to make it fit 7 symbols in csv. + // Restore it before the last 3 symbols. 
+ if (postcode.find(' ') == string::npos) + postcode.insert(static_cast(postcode.size() - 3), " "); + + CHECK_EQUAL(valueMapping.size(), index, ()); + valueMapping.push_back(p); + keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(postcode), Value(index)); + ++index; + } +} + +bool BuildPostcodePointsImpl(FilesContainerR & container, storage::CountryId const & country, + string const & dataset, string const & tmpName, + storage::CountryInfoGetter & infoGetter, Writer & writer) +{ + using Key = strings::UniString; + using Value = Uint64IndexValue; + + CHECK_EQUAL(writer.Pos(), 0, ()); + + search::PostcodePoints::Header header; + header.Serialize(writer); + + uint64_t bytesWritten = writer.Pos(); + coding::WritePadding(writer, bytesWritten); + + header.m_trieOffset = base::asserted_cast(writer.Pos()); + + vector> ukPostcodesKeyValuePairs; + vector valueMapping; + GetUKPostcodes(dataset, country, infoGetter, valueMapping, ukPostcodesKeyValuePairs); + + if (ukPostcodesKeyValuePairs.empty()) + return false; + + sort(ukPostcodesKeyValuePairs.begin(), ukPostcodesKeyValuePairs.end()); + + { + FileWriter tmpWriter(tmpName); + SingleValueSerializer serializer; + trie::Build>( + tmpWriter, serializer, ukPostcodesKeyValuePairs); + } + + rw_ops::Reverse(FileReader(tmpName), writer); + + header.m_trieSize = base::asserted_cast(writer.Pos() - header.m_trieOffset); + + bytesWritten = writer.Pos(); + coding::WritePadding(writer, bytesWritten); + + header.m_pointsOffset = base::asserted_cast(writer.Pos()); + + { + search::CentersTableBuilder builder; + + builder.SetGeometryParams(feature::DataHeader(container).GetBounds()); + for (size_t i = 0; i < valueMapping.size(); ++i) + builder.Put(base::asserted_cast(i), valueMapping[i]); + + builder.Freeze(writer); + } + + header.m_pointsSize = base::asserted_cast(writer.Pos() - header.m_pointsOffset); + auto const endOffset = writer.Pos(); + writer.Seek(0); + header.Serialize(writer); + writer.Seek(endOffset); + return true; +} 
+} // namespace + +namespace indexer +{ +bool BuildPostcodePointsWithInfoGetter(string const & path, string const & country, + string const & datasetPath, bool forceRebuild, + storage::CountryInfoGetter & infoGetter) +{ + auto const filename = base::JoinPath(path, country + DATA_FILE_EXTENSION); + if (filename == WORLD_FILE_NAME || filename == WORLD_COASTS_FILE_NAME) + return true; + + Platform & platform = GetPlatform(); + FilesContainerR readContainer(platform.GetReader(filename, "f")); + if (readContainer.IsExist(POSTCODE_POINTS_FILE_TAG) && !forceRebuild) + return true; + + string const postcodesFilePath = filename + "." + POSTCODE_POINTS_FILE_TAG EXTENSION_TMP; + // Temporary file used to reverse trie part of postcodes section. + string const trieTmpFilePath = + filename + "." + POSTCODE_POINTS_FILE_TAG + "_trie" + EXTENSION_TMP; + SCOPE_GUARD(postcodesFileGuard, bind(&FileWriter::DeleteFileX, postcodesFilePath)); + SCOPE_GUARD(trieTmpFileGuard, bind(&FileWriter::DeleteFileX, trieTmpFilePath)); + + try + { + FileWriter writer(postcodesFilePath); + if (!BuildPostcodePointsImpl(readContainer, storage::CountryId(country), datasetPath, + trieTmpFilePath, infoGetter, writer)) + { + // No postcodes for country. 
+ return true; + } + + LOG(LINFO, ("Postcodes section size =", writer.Size())); + FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); + writeContainer.Write(postcodesFilePath, POSTCODE_POINTS_FILE_TAG); + } + catch (Reader::Exception const & e) + { + LOG(LERROR, ("Error while reading file:", e.Msg())); + return false; + } + catch (Writer::Exception const & e) + { + LOG(LERROR, ("Error writing file:", e.Msg())); + return false; + } + + return true; +} + +bool BuildPostcodePoints(string const & path, string const & country, string const & datasetPath, + bool forceRebuild) +{ + auto const & platform = GetPlatform(); + auto infoGetter = storage::CountryInfoReader::CreateCountryInfoReader(platform); + CHECK(infoGetter, ()); + return BuildPostcodePointsWithInfoGetter(path, country, datasetPath, forceRebuild, *infoGetter); +} + +} // namespace indexer diff --git a/generator/postcode_points_builder.hpp b/generator/postcode_points_builder.hpp new file mode 100644 index 0000000000..e39f77e28b --- /dev/null +++ b/generator/postcode_points_builder.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include + +namespace storage +{ +class CountryInfoGetter; +} + +namespace indexer +{ +// Builds postcodes section with external postcodes data and writes it to the mwm file. +bool BuildPostcodePoints(std::string const & path, std::string const & country, + std::string const & datasetPath, bool forceRebuild); +// Exposed for testing. 
+bool BuildPostcodePointsWithInfoGetter(std::string const & path, std::string const & country, + std::string const & datasetPath, bool forceRebuild, + storage::CountryInfoGetter & infoGetter); +} // namespace indexer diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index 083da5aa72..eb58ee7431 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -1,8 +1,7 @@ -#include "search_index_builder.hpp" +#include "generator/search_index_builder.hpp" #include "search/common.hpp" #include "search/mwm_context.hpp" -#include "search/postcode_points.hpp" #include "search/reverse_geocoder.hpp" #include "search/search_index_values.hpp" #include "search/search_trie.hpp" @@ -23,9 +22,6 @@ #include "indexer/search_string_utils.hpp" #include "indexer/trie_builder.hpp" -#include "storage/country_info_getter.hpp" -#include "storage/storage_defines.hpp" - #include "platform/platform.hpp" #include "coding/map_uint32_to_val.hpp" @@ -243,63 +239,6 @@ struct FeatureNameInserter bool m_hasStreetType = false; }; -template -void GetUKPostcodes(string const & filename, storage::CountryId const & countryId, - storage::CountryInfoGetter & infoGetter, vector & valueMapping, - vector> & keyValuePairs) -{ - // Original dataset uses UK National Grid UTM coordinates. - // It was converted to WGS84 by https://pypi.org/project/OSGridConverter/. 
- size_t constexpr kPostcodeIndex = 0; - size_t constexpr kLatIndex = 1; - size_t constexpr kLongIndex = 2; - size_t constexpr kDatasetCount = 3; - - ifstream data; - data.exceptions(fstream::failbit | fstream::badbit); - data.open(filename); - data.exceptions(fstream::badbit); - - string line; - size_t index = 0; - while (getline(data, line)) - { - vector fields; - strings::ParseCSVRow(line, ',', fields); - CHECK_EQUAL(fields.size(), kDatasetCount, (line)); - - double lat; - CHECK(strings::to_double(fields[kLatIndex], lat), ()); - - double lon; - CHECK(strings::to_double(fields[kLongIndex], lon), ()); - - auto const p = mercator::FromLatLon(lat, lon); - - vector countries; - infoGetter.GetRegionsCountryId(p, countries, 200.0 /* lookupRadiusM */); - if (find(countries.begin(), countries.end(), countryId) == countries.end()) - continue; - - // UK postcodes formats are: aana naa, ana naa, an naa, ann naa, aan naa, aann naa. - - auto postcode = fields[kPostcodeIndex]; - // Do not index outer postcodes. - if (postcode.size() < 5) - continue; - - // Space is skipped in dataset for |aana naa| and |aann naa| to make it fit 7 symbols in csv. - // Let's fix it here. - if (postcode.find(' ') == string::npos) - postcode.insert(static_cast(postcode.size() - 3), " "); - - CHECK_EQUAL(valueMapping.size(), index, ()); - valueMapping.push_back(p); - keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(postcode), Value(index)); - ++index; - } -} - // Returns true iff feature name was indexed as postcode and should be ignored for name indexing. 
bool InsertPostcodes(FeatureType & f, function const & fn) { @@ -588,9 +527,6 @@ void BuildAddressTable(FilesContainerR & container, string const & addressDataFi namespace indexer { void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter); -bool BuildPostcodesImpl(FilesContainerR & container, storage::CountryId const & country, - string const & dataset, string const & tmpFileName, - storage::CountryInfoGetter & infoGetter, Writer & indexWriter); bool BuildSearchIndexFromDataFile(string const & path, string const & country, bool forceRebuild, uint32_t threadsCount) @@ -649,123 +585,6 @@ bool BuildSearchIndexFromDataFile(string const & path, string const & country, b return true; } -bool BuildPostcodesWithInfoGetter(string const & path, string const & country, - string const & datasetPath, bool forceRebuild, - storage::CountryInfoGetter & infoGetter) -{ - auto const filename = base::JoinPath(path, country + DATA_FILE_EXTENSION); - if (filename == WORLD_FILE_NAME || filename == WORLD_COASTS_FILE_NAME) - return true; - - Platform & platform = GetPlatform(); - FilesContainerR readContainer(platform.GetReader(filename, "f")); - if (readContainer.IsExist(POSTCODE_POINTS_FILE_TAG) && !forceRebuild) - return true; - - string const postcodesFilePath = filename + "." + POSTCODE_POINTS_FILE_TAG EXTENSION_TMP; - // Temporary file used to reverse trie part of postcodes section. - string const trieTmpFilePath = - filename + "." + POSTCODE_POINTS_FILE_TAG + "_trie" + EXTENSION_TMP; - SCOPE_GUARD(postcodesFileGuard, bind(&FileWriter::DeleteFileX, postcodesFilePath)); - SCOPE_GUARD(trieTmpFileGuard, bind(&FileWriter::DeleteFileX, trieTmpFilePath)); - - try - { - FileWriter writer(postcodesFilePath); - if (!BuildPostcodesImpl(readContainer, storage::CountryId(country), datasetPath, - trieTmpFilePath, infoGetter, writer)) - { - // No postcodes for country. 
- return true; - } - - LOG(LINFO, ("Postcodes section size =", writer.Size())); - FilesContainerW writeContainer(readContainer.GetFileName(), FileWriter::OP_WRITE_EXISTING); - writeContainer.Write(postcodesFilePath, POSTCODE_POINTS_FILE_TAG); - } - catch (Reader::Exception const & e) - { - LOG(LERROR, ("Error while reading file:", e.Msg())); - return false; - } - catch (Writer::Exception const & e) - { - LOG(LERROR, ("Error writing file:", e.Msg())); - return false; - } - - return true; -} - -bool BuildPostcodes(string const & path, string const & country, string const & datasetPath, - bool forceRebuild) -{ - auto const & platform = GetPlatform(); - auto infoGetter = storage::CountryInfoReader::CreateCountryInfoReader(platform); - CHECK(infoGetter, ()); - return BuildPostcodesWithInfoGetter(path, country, datasetPath, forceRebuild, *infoGetter); -} - -bool BuildPostcodesImpl(FilesContainerR & container, storage::CountryId const & country, - string const & dataset, string const & tmpName, - storage::CountryInfoGetter & infoGetter, Writer & writer) -{ - using Key = strings::UniString; - using Value = Uint64IndexValue; - - CHECK_EQUAL(writer.Pos(), 0, ()); - - search::PostcodePoints::Header header; - header.Serialize(writer); - - uint64_t bytesWritten = writer.Pos(); - coding::WritePadding(writer, bytesWritten); - - header.m_trieOffset = base::asserted_cast(writer.Pos()); - - vector> ukPostcodesKeyValuePairs; - vector valueMapping; - GetUKPostcodes(dataset, country, infoGetter, valueMapping, ukPostcodesKeyValuePairs); - - if (ukPostcodesKeyValuePairs.empty()) - return false; - - sort(ukPostcodesKeyValuePairs.begin(), ukPostcodesKeyValuePairs.end()); - - { - FileWriter tmpWriter(tmpName); - SingleValueSerializer serializer; - trie::Build>( - tmpWriter, serializer, ukPostcodesKeyValuePairs); - } - - rw_ops::Reverse(FileReader(tmpName), writer); - - header.m_trieSize = base::asserted_cast(writer.Pos() - header.m_trieOffset); - - bytesWritten = writer.Pos(); - 
coding::WritePadding(writer, bytesWritten); - - header.m_pointsOffset = base::asserted_cast(writer.Pos()); - - { - search::CentersTableBuilder builder; - - builder.SetGeometryParams(feature::DataHeader(container).GetBounds()); - for (size_t i = 0; i < valueMapping.size(); ++i) - builder.Put(base::asserted_cast(i), valueMapping[i]); - - builder.Freeze(writer); - } - - header.m_pointsSize = base::asserted_cast(writer.Pos() - header.m_pointsOffset); - auto const endOffset = writer.Pos(); - writer.Seek(0); - header.Serialize(writer); - writer.Seek(endOffset); - return true; -} - void BuildSearchIndex(FilesContainerR & container, Writer & indexWriter) { using Key = strings::UniString; diff --git a/generator/search_index_builder.hpp b/generator/search_index_builder.hpp index e5168392cc..95f89c184b 100644 --- a/generator/search_index_builder.hpp +++ b/generator/search_index_builder.hpp @@ -2,11 +2,6 @@ #include -namespace storage -{ -class CountryInfoGetter; -} - namespace indexer { // Builds the latest version of the search index section and writes it to the mwm file. @@ -15,12 +10,4 @@ namespace indexer // in version mismatch when trying to read the index. bool BuildSearchIndexFromDataFile(std::string const & path, std::string const & country, bool forceRebuild, uint32_t threadsCount); - -// Builds postcodes section with external postcodes data and writes it to the mwm file. -bool BuildPostcodes(std::string const & path, std::string const & country, - std::string const & datasetPath, bool forceRebuild); -// Exposed for testing. -bool BuildPostcodesWithInfoGetter(std::string const & path, std::string const & country, - std::string const & datasetPath, bool forceRebuild, - storage::CountryInfoGetter & infoGetter); } // namespace indexer