From 0e8477ad8bd0f5d6f0ce63a0e766f32ccc2885c4 Mon Sep 17 00:00:00 2001 From: Anatoly Serdtcev Date: Thu, 31 Jan 2019 19:32:25 +0300 Subject: [PATCH] [geocoder] Fix for review --- geocoder/geocoder.cpp | 4 ++-- geocoder/geocoder_tests/geocoder_tests.cpp | 2 +- geocoder/hierarchy.hpp | 16 +++++++-------- geocoder/hierarchy_reader.cpp | 23 +++++++++++----------- geocoder/hierarchy_reader.hpp | 5 +++-- geocoder/index.cpp | 10 ++++++++-- geocoder/index.hpp | 4 +++- 7 files changed, 37 insertions(+), 27 deletions(-) diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 7d5784e17c..266d68c923 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -2,10 +2,10 @@ #include "geocoder/hierarchy_reader.hpp" -#include "indexer/search_string_utils.hpp" - #include "search/house_numbers_matcher.hpp" +#include "indexer/search_string_utils.hpp" + #include "base/assert.hpp" #include "base/logging.hpp" #include "base/scope_guard.hpp" diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 27a9d8a5ba..1f0329e4d5 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -164,7 +164,7 @@ UNIT_TEST(Geocoder_EmptyFileConcurrentRead) UNIT_TEST(Geocoder_BigFileConcurrentRead) { - int const kEntryCount = 1000000; + int const kEntryCount = 100000; stringstream s; for (int i = 0; i < kEntryCount; ++i) diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 89198bd7af..8579db75fb 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -20,31 +20,31 @@ public: struct ParsingStats { // Number of entries that the hierarchy was constructed from. - uint64_t m_numLoaded{0}; + uint64_t m_numLoaded = 0; // Number of corrupted json lines. - uint64_t m_badJsons{0}; + uint64_t m_badJsons = 0; // Number of entries with unreadable base::GeoObjectIds. - uint64_t m_badOsmIds{0}; + uint64_t m_badOsmIds = 0; // Number of base::GeoObjectsIds that occur as keys in at least two entries. - uint64_t m_duplicateOsmIds{0}; + uint64_t m_duplicateOsmIds = 0; // Number of entries with duplicate subfields in the address field. - uint64_t m_duplicateAddresses{0}; + uint64_t m_duplicateAddresses = 0; // Number of entries whose address field either does // not exist or consists of empty lines. - uint64_t m_emptyAddresses{0}; + uint64_t m_emptyAddresses = 0; // Number of entries without the name field or with an empty one. - uint64_t m_emptyNames{0}; + uint64_t m_emptyNames = 0; // Number of entries whose names do not match the most // specific parts of their addresses. // This is expected from POIs but not from regions or streets. - uint64_t m_mismatchedNames{0}; + uint64_t m_mismatchedNames = 0; }; // A single entry in the hierarchy directed acyclic graph. diff --git a/geocoder/hierarchy_reader.cpp b/geocoder/hierarchy_reader.cpp index f917d4bb6c..c86e60d490 100644 --- a/geocoder/hierarchy_reader.cpp +++ b/geocoder/hierarchy_reader.cpp @@ -14,7 +14,7 @@ namespace // Information will be logged for every |kLogBatch| entries. size_t const kLogBatch = 100000; -void operator += (Hierarchy::ParsingStats & accumulator, Hierarchy::ParsingStats & stats) +void operator+=(Hierarchy::ParsingStats & accumulator, Hierarchy::ParsingStats & stats) { struct ValidationStats { @@ -32,7 +32,6 @@ void operator += (Hierarchy::ParsingStats & accumulator, Hierarchy::ParsingStats accumulator.m_emptyNames += stats.m_emptyNames; accumulator.m_mismatchedNames += stats.m_mismatchedNames; } - } // namespace HierarchyReader::HierarchyReader(string const & pathToJsonHierarchy) @@ -47,11 +46,13 @@ HierarchyReader::HierarchyReader(std::istream & in) { } -Hierarchy HierarchyReader::Read(size_t readersCount) +Hierarchy HierarchyReader::Read(unsigned int readersCount) { LOG(LINFO, ("Reading entries...")); - readersCount = min(readersCount, size_t{thread::hardware_concurrency()}); + if (auto hardwareConcurrency = thread::hardware_concurrency()) + readersCount = min(hardwareConcurrency, readersCount); + readersCount = max(1U, readersCount); vector> taskEntries(readersCount); vector tasksStats(readersCount); @@ -109,7 +110,7 @@ vector HierarchyReader::MergeEntries(vector, ReferenceGreater> (entryParts.begin(), entryParts.end()); - while (partsQueue.size()) + while (!partsQueue.empty()) { auto & minPart = partsQueue.top().get(); partsQueue.pop(); @@ -120,7 +121,7 @@ vector HierarchyReader::MergeEntries(vector & entries, // Temporary local object for efficient concurent processing (individual cache line for container). auto localEntries = multimap{}; - int const kLineBufferCapacity = 10000; + size_t const kLineBufferCapacity = 10000; vector linesBuffer(kLineBufferCapacity); - int bufferSize = 0; + size_t bufferSize = 0; while (true) { @@ -180,10 +181,10 @@ void HierarchyReader::ReadEntryMap(multimap & entries, entries = move(localEntries); } -void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, int const bufferSize, +void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, size_t const bufferSize, multimap & entries, ParsingStats & stats) { - for (int i = 0; i < bufferSize; ++i) + for (size_t i = 0; i < bufferSize; ++i) { auto & line = linesBuffer[i]; @@ -220,4 +221,4 @@ void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, in entries.emplace(osmId, move(entry)); } } -} // namespace geocoder +} // namespace geocoder diff --git a/geocoder/hierarchy_reader.hpp b/geocoder/hierarchy_reader.hpp index 48ae156764..54b38073a5 100644 --- a/geocoder/hierarchy_reader.hpp +++ b/geocoder/hierarchy_reader.hpp @@ -25,12 +25,13 @@ public: explicit HierarchyReader(std::string const & pathToJsonHierarchy); explicit HierarchyReader(std::istream & in); - Hierarchy Read(size_t readersCount = 4); + // Read hierarchy file/stream concurrency in |readersCount| threads. + Hierarchy Read(unsigned int readersCount = 4); private: void ReadEntryMap(std::multimap & entries, ParsingStats & stats); - void DeserializeEntryMap(std::vector const & linesBuffer, int const bufferSize, + void DeserializeEntryMap(std::vector const & linesBuffer, std::size_t const bufferSize, std::multimap & entries, ParsingStats & stats); std::vector MergeEntries(std::vector> & entryParts); diff --git a/geocoder/index.cpp b/geocoder/index.cpp index f11e991c0a..7117805040 100644 --- a/geocoder/index.cpp +++ b/geocoder/index.cpp @@ -23,8 +23,13 @@ size_t const kLogBatch = 100000; namespace geocoder { -Index::Index(Hierarchy const & hierarchy) : m_docs(hierarchy.GetEntries()) +Index::Index(Hierarchy const & hierarchy, unsigned int processingThreadsCount) + : m_docs(hierarchy.GetEntries()), m_processingThreadsCount{processingThreadsCount} { + if (auto hardwareConcurrency = thread::hardware_concurrency()) + m_processingThreadsCount = min(hardwareConcurrency, m_processingThreadsCount); + m_processingThreadsCount = max(1U, m_processingThreadsCount); + LOG(LINFO, ("Indexing hierarchy entries...")); AddEntries(); LOG(LINFO, ("Indexing houses...")); @@ -94,7 +99,8 @@ void Index::AddHouses() atomic numIndexed{0}; std::mutex mutex; - vector threads(thread::hardware_concurrency()); + vector threads(m_processingThreadsCount); + CHECK_GREATER(threads.size(), 0, ()); for (size_t t = 0; t < threads.size(); ++t) { diff --git a/geocoder/index.hpp b/geocoder/index.hpp index 8a56fb4271..a2c96dc1ca 100644 --- a/geocoder/index.hpp +++ b/geocoder/index.hpp @@ -20,7 +20,7 @@ public: // that the index was constructed from. using DocId = std::vector::size_type; - explicit Index(Hierarchy const & hierarchy); + explicit Index(Hierarchy const & hierarchy, unsigned int processingThreadsCount = 4); Doc const & GetDoc(DocId const id) const; @@ -74,5 +74,7 @@ private: // Lists of houses grouped by the streets they belong to. std::unordered_map> m_buildingsOnStreet; + + unsigned int m_processingThreadsCount; }; } // namespace geocoder