From 9d1db68c980ad58a7f3aa704462a4c784dc4d706 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 8 Feb 2017 13:33:28 +0300 Subject: [PATCH 1/5] [search] SmallSet for languages. --- base/CMakeLists.txt | 1 + base/base.pro | 1 + base/base_tests/CMakeLists.txt | 1 + base/base_tests/base_tests.pro | 1 + base/base_tests/small_set_test.cpp | 74 +++++++++++++ base/small_set.hpp | 167 +++++++++++++++++++++++++++++ search/feature_offset_match.hpp | 6 +- search/processor.cpp | 2 +- search/query_params.cpp | 4 +- search/query_params.hpp | 39 ++++++- 10 files changed, 288 insertions(+), 8 deletions(-) create mode 100644 base/base_tests/small_set_test.cpp create mode 100644 base/small_set.hpp diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 78b4bc815d..d4e2ad4bd4 100644 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -42,6 +42,7 @@ set( set_operations.hpp shared_buffer_manager.cpp shared_buffer_manager.hpp + small_set.hpp src_point.cpp src_point.hpp stats.hpp diff --git a/base/base.pro b/base/base.pro index d8f90faf79..7eda31055f 100644 --- a/base/base.pro +++ b/base/base.pro @@ -66,6 +66,7 @@ HEADERS += \ scope_guard.hpp \ set_operations.hpp \ shared_buffer_manager.hpp \ + small_set.hpp \ src_point.hpp \ stats.hpp \ std_serialization.hpp \ diff --git a/base/base_tests/CMakeLists.txt b/base/base_tests/CMakeLists.txt index 7d3106daaa..2a31f84d0c 100644 --- a/base/base_tests/CMakeLists.txt +++ b/base/base_tests/CMakeLists.txt @@ -23,6 +23,7 @@ set( regexp_test.cpp rolling_hash_test.cpp scope_guard_test.cpp + small_set_test.cpp stl_add_test.cpp stl_helpers_test.cpp string_format_test.cpp diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro index 32049b5282..a057e05ed7 100644 --- a/base/base_tests/base_tests.pro +++ b/base/base_tests/base_tests.pro @@ -33,6 +33,7 @@ SOURCES += \ regexp_test.cpp \ rolling_hash_test.cpp \ scope_guard_test.cpp \ + small_set_test.cpp \ stl_add_test.cpp \ stl_helpers_test.cpp \ string_format_test.cpp \ diff --git a/base/base_tests/small_set_test.cpp b/base/base_tests/small_set_test.cpp new file mode 100644 index 0000000000..bb2ae235cf --- /dev/null +++ b/base/base_tests/small_set_test.cpp @@ -0,0 +1,74 @@ +#include "testing/testing.hpp" + +#include "base/small_set.hpp" + +#include +#include +#include + +using namespace base; + +namespace +{ +UNIT_TEST(SmallSet_Empty) +{ + SmallSet<0> set; + TEST_EQUAL(set.Size(), 0, ()); +} + +UNIT_TEST(SmallSet_Smoke) +{ + SmallSet<300> set; + TEST_EQUAL(set.Size(), 0, ()); + + set.Insert(0); + TEST_EQUAL(set.Size(), 1, ()); + TEST(set.Contains(0), ()); + + set.Insert(0); + TEST_EQUAL(set.Size(), 1, ()); + TEST(set.Contains(0), ()); + + set.Insert(5); + TEST_EQUAL(set.Size(), 2, ()); + TEST(set.Contains(0), ()); + TEST(set.Contains(5), ()); + + set.Insert(64); + TEST_EQUAL(set.Size(), 3, ()); + TEST(set.Contains(0), ()); + TEST(set.Contains(5), ()); + TEST(set.Contains(64), ()); + + { + auto cur = set.begin(); + auto end = set.end(); + for (uint64_t i : {0, 5, 64}) + { + TEST(cur != end, ()); + TEST_EQUAL(*cur, i, ()); + ++cur; + } + TEST(cur == end, ()); + } + + set.Remove(5); + TEST_EQUAL(set.Size(), 2, ()); + TEST(set.Contains(0), ()); + TEST(!set.Contains(5), ()); + TEST(set.Contains(64), ()); + + set.Insert(297); + set.Insert(298); + set.Insert(299); + TEST_EQUAL(set.Size(), 5, ()); + + { + std::vector const actual(set.begin(), set.end()); + std::vector const expected = {0, 64, 297, 298, 299}; + TEST_EQUAL(actual, expected, ()); + } + + TEST_EQUAL(set.Size(), std::distance(set.begin(), set.end()), ()); +} +} // namespace diff --git a/base/small_set.hpp b/base/small_set.hpp new file mode 100644 index 0000000000..de1dad1406 --- /dev/null +++ b/base/small_set.hpp @@ -0,0 +1,167 @@ +#pragma once + +#include "base/assert.hpp" +#include "base/bits.hpp" +#include "base/macros.hpp" + +#include +#include +#include +#include + +namespace base +{ +// A set of values less than |UpperBound|. +// +// Requires UpperBound + O(1) bits of memory. All operations except +// Clear() and iteration are O(1). Clear() and iteration require +// O(UpperBound) steps. +// +// *NOTE* This class *IS NOT* thread safe. +template +class SmallSet +{ +public: + static uint64_t constexpr kNumBlocks = (UpperBound + 63) / 64; + + class Iterator + { + public: + using difference_type = uint64_t; + using value_type = uint64_t; + using pointer = void; + using reference = void; + using iterator_category = std::forward_iterator_tag; + + Iterator(uint64_t const * blocks, uint64_t current_block_index) + : m_blocks(blocks), m_current_block_index(current_block_index), m_current_block(0) + { + ASSERT_LESS_OR_EQUAL(current_block_index, kNumBlocks, ()); + if (current_block_index < kNumBlocks) + m_current_block = m_blocks[current_block_index]; + SkipZeroes(); + } + + bool operator==(Iterator const & rhs) const + { + return m_blocks == rhs.m_blocks && m_current_block_index == rhs.m_current_block_index && + m_current_block == rhs.m_current_block; + } + + bool operator!=(Iterator const & rhs) const { return !(*this == rhs); } + + uint64_t operator*() const + { + ASSERT_NOT_EQUAL(m_current_block, 0, ()); + auto const bit = m_current_block & -m_current_block; + return bits::PopCount(bit - 1) + m_current_block_index * 64; + } + + Iterator const & operator++() + { + ASSERT(m_current_block_index < kNumBlocks, ()); + ASSERT_NOT_EQUAL(m_current_block, 0, ()); + m_current_block = m_current_block & (m_current_block - 1); + SkipZeroes(); + return *this; + } + + private: + void SkipZeroes() + { + ASSERT_LESS_OR_EQUAL(m_current_block_index, kNumBlocks, ()); + + if (m_current_block != 0 || m_current_block_index == kNumBlocks) + return; + + do + ++m_current_block_index; + while (m_current_block_index < kNumBlocks && m_blocks[m_current_block_index] == 0); + if (m_current_block_index < kNumBlocks) + m_current_block = m_blocks[m_current_block_index]; + else + m_current_block = 0; + } + + uint64_t const * m_blocks; + uint64_t m_current_block_index; + uint64_t m_current_block; + }; + +#define DEFINE_BLOCK_OFFSET(value) \ + uint64_t const block = value / 64; \ + uint64_t const offset = value % 64 + + // This invalidates all iterators except end(). + void Insert(uint64_t value) + { + ASSERT_LESS(value, UpperBound, ()); + + DEFINE_BLOCK_OFFSET(value); + auto const bit = kOne << offset; + m_size += (m_blocks[block] & bit) == 0; + m_blocks[block] |= bit; + } + + // This invalidates all iterators except end(). + void Remove(uint64_t value) + { + ASSERT_LESS(value, UpperBound, ()); + + DEFINE_BLOCK_OFFSET(value); + auto const bit = kOne << offset; + m_size -= (m_blocks[block] & bit) != 0; + m_blocks[block] &= ~bit; + } + + bool Contains(uint64_t value) const + { + ASSERT_LESS(value, UpperBound, ()); + + DEFINE_BLOCK_OFFSET(value); + return m_blocks[block] & (kOne << offset); + } + +#undef DEFINE_BLOCK_OFFSET + + uint64_t Size() const { return m_size; } + + // This invalidates all iterators except end(). + void Clear() + { + std::fill(std::begin(m_blocks), std::end(m_blocks), static_cast(0)); + m_size = 0; + } + + Iterator begin() const { return Iterator(m_blocks, 0); } + Iterator cbegin() const { return Iterator(m_blocks, 0); } + + Iterator end() const { return Iterator(m_blocks, kNumBlocks); } + Iterator cend() const { return Iterator(m_blocks, kNumBlocks); } + +private: + static uint64_t constexpr kOne = 1; + + uint64_t m_blocks[kNumBlocks] = {}; + uint64_t m_size = 0; +}; + +// static +template +uint64_t constexpr SmallSet::kNumBlocks; + +// static +template +uint64_t constexpr SmallSet::kOne; + +template +std::string DebugPrint(SmallSet const & set) +{ + std::ostringstream os; + os << "SmallSet<" << UpperBound << "> [" << set.Size() << ": "; + for (auto const & v : set) + os << v << " "; + os << "]"; + return os.str(); +} +} // namespace base diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index cb88fbddf7..1c57683304 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -207,18 +207,18 @@ private: template struct SearchTrieRequest { - inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; } + inline bool IsLangExist(int8_t lang) const { return m_langs.Contains(lang); } inline void Clear() { m_names.clear(); m_categories.clear(); - m_langs.clear(); + m_langs.Clear(); } vector m_names; vector m_categories; - unordered_set m_langs; + QueryParams::Langs m_langs; }; // Calls |toDo| for each feature accepted but at least one DFA. diff --git a/search/processor.cpp b/search/processor.cpp index ab6caeb387..9fab4d0e5c 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -677,7 +677,7 @@ void Processor::InitParams(QueryParams & params) auto & langs = params.GetLangs(); for (int i = 0; i < LANG_COUNT; ++i) - langs.insert(GetLanguage(i)); + langs.Insert(GetLanguage(i)); RemoveStopWordsIfNeeded(params); diff --git a/search/query_params.cpp b/search/query_params.cpp index 573e4cffca..dec6d4ead4 100644 --- a/search/query_params.cpp +++ b/search/query_params.cpp @@ -54,7 +54,7 @@ void QueryParams::Clear() m_prefixToken.Clear(); m_hasPrefix = false; m_typeIndices.clear(); - m_langs.clear(); + m_langs.Clear(); m_scale = scales::GetUpperScale(); } @@ -135,7 +135,7 @@ string DebugPrint(QueryParams const & params) os << "QueryParams [ m_tokens=" << ::DebugPrint(params.m_tokens) << ", m_prefixToken=" << DebugPrint(params.m_prefixToken) << ", m_typeIndices=" << ::DebugPrint(params.m_typeIndices) - << ", m_langs=" << ::DebugPrint(params.m_langs) << " ]"; + << ", m_langs=" << DebugPrint(params.m_langs) << " ]"; return os.str(); } diff --git a/search/query_params.hpp b/search/query_params.hpp index 42a890a288..480ebee23c 100644 --- a/search/query_params.hpp +++ b/search/query_params.hpp @@ -2,7 +2,10 @@ #include "indexer/scales.hpp" +#include "coding/multilang_utf8_string.hpp" + #include "base/assert.hpp" +#include "base/small_set.hpp" #include "base/string_utils.hpp" #include "std/cstdint.hpp" @@ -20,7 +23,39 @@ class QueryParams public: using String = strings::UniString; using TypeIndices = vector; - using Langs = unordered_set; + + class Langs + { + public: + using Set = base::SmallSet; + using Iterator = Set::Iterator; + + void Insert(int8_t lang) + { + if (IsValid(lang)) + m_set.Insert(lang); + } + + bool Contains(int8_t lang) const { return IsValid(lang) ? m_set.Contains(lang) : false; } + + void Clear() { m_set.Clear(); } + + Iterator begin() const { return m_set.begin(); } + Iterator cbegin() const { return m_set.cbegin(); } + + Iterator end() const { return m_set.end(); } + Iterator cend() const { return m_set.cend(); } + + private: + friend string DebugPrint(Langs const & langs) { return DebugPrint(langs.m_set); } + + bool IsValid(int8_t lang) const + { + return lang >= 0 && lang < StringUtf8Multilang::kMaxSupportedLanguages; + } + + Set m_set; + }; struct Token { @@ -110,7 +145,7 @@ public: inline Langs & GetLangs() { return m_langs; } inline Langs const & GetLangs() const { return m_langs; } - inline bool LangExists(int8_t lang) const { return m_langs.count(lang) != 0; } + inline bool LangExists(int8_t lang) const { return m_langs.Contains(lang); } inline int GetScale() const { return m_scale; } From b8bcf82802d73612ab6402516074834d8f85550e Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 8 Feb 2017 15:51:53 +0300 Subject: [PATCH 2/5] Review fixes. --- base/base_tests/small_set_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/base/base_tests/small_set_test.cpp b/base/base_tests/small_set_test.cpp index bb2ae235cf..41d08935d1 100644 --- a/base/base_tests/small_set_test.cpp +++ b/base/base_tests/small_set_test.cpp @@ -20,6 +20,8 @@ UNIT_TEST(SmallSet_Smoke) { SmallSet<300> set; TEST_EQUAL(set.Size(), 0, ()); + for (uint64_t i = 0; i < 300; ++i) + TEST(!set.Contains(i), ()); set.Insert(0); TEST_EQUAL(set.Size(), 1, ()); From acb0e55ebb6a21ad8a570f27d407a05a949fe6d9 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 8 Feb 2017 16:36:03 +0300 Subject: [PATCH 3/5] CeilLog is renamed to FloorLog. --- base/base_tests/bits_test.cpp | 18 +++++++++--------- base/bits.hpp | 2 +- base/small_set.hpp | 2 +- coding/compressed_bit_vector.cpp | 2 +- coding/elias_coder.hpp | 4 ++-- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/base/base_tests/bits_test.cpp b/base/base_tests/bits_test.cpp index c2a43733ea..5263246dae 100644 --- a/base/base_tests/bits_test.cpp +++ b/base/base_tests/bits_test.cpp @@ -120,14 +120,14 @@ UNIT_TEST(PopCount64) UNIT_TEST(CeilLog) { - TEST_EQUAL(0, bits::CeilLog(0x0), ()); - TEST_EQUAL(0, bits::CeilLog(0x1), ()); - TEST_EQUAL(1, bits::CeilLog(0x2), ()); - TEST_EQUAL(1, bits::CeilLog(0x3), ()); - TEST_EQUAL(2, bits::CeilLog(0x4), ()); + TEST_EQUAL(0, bits::FloorLog(0x0), ()); + TEST_EQUAL(0, bits::FloorLog(0x1), ()); + TEST_EQUAL(1, bits::FloorLog(0x2), ()); + TEST_EQUAL(1, bits::FloorLog(0x3), ()); + TEST_EQUAL(2, bits::FloorLog(0x4), ()); - TEST_EQUAL(6, bits::CeilLog(0x7f), ()); - TEST_EQUAL(7, bits::CeilLog(0x80), ()); - TEST_EQUAL(31, bits::CeilLog(0xFFFFFFFF), ()); - TEST_EQUAL(63, bits::CeilLog(0xFFFFFFFFFFFFFFFF), ()); + TEST_EQUAL(6, bits::FloorLog(0x7f), ()); + TEST_EQUAL(7, bits::FloorLog(0x80), ()); + TEST_EQUAL(31, bits::FloorLog(0xFFFFFFFF), ()); + TEST_EQUAL(63, bits::FloorLog(0xFFFFFFFFFFFFFFFF), ()); } diff --git a/base/bits.hpp b/base/bits.hpp index ad0eeac236..fda447185a 100644 --- a/base/bits.hpp +++ b/base/bits.hpp @@ -70,7 +70,7 @@ namespace bits return static_cast(x); } - inline uint8_t CeilLog(uint64_t x) noexcept + inline uint8_t FloorLog(uint64_t x) noexcept { #define CHECK_RSH(x, msb, offset) \ if (x >> offset) \ diff --git a/base/small_set.hpp b/base/small_set.hpp index de1dad1406..35c7afe7a1 100644 --- a/base/small_set.hpp +++ b/base/small_set.hpp @@ -54,7 +54,7 @@ public: { ASSERT_NOT_EQUAL(m_current_block, 0, ()); auto const bit = m_current_block & -m_current_block; - return bits::PopCount(bit - 1) + m_current_block_index * 64; + return bits::FloorLog(bit) + m_current_block_index * 64; } Iterator const & operator++() diff --git a/coding/compressed_bit_vector.cpp b/coding/compressed_bit_vector.cpp index d52eaffb6d..88a863c27f 100644 --- a/coding/compressed_bit_vector.cpp +++ b/coding/compressed_bit_vector.cpp @@ -434,7 +434,7 @@ unique_ptr CompressedBitVectorBuilder::FromBitGroups( if (bitGroups.empty()) return make_unique(move(bitGroups)); - uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::CeilLog(bitGroups.back()); + uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::FloorLog(bitGroups.back()); uint64_t popCount = 0; for (size_t i = 0; i < bitGroups.size(); ++i) popCount += bits::PopCount(bitGroups[i]); diff --git a/coding/elias_coder.hpp b/coding/elias_coder.hpp index e7cc92e494..47b183b4b1 100644 --- a/coding/elias_coder.hpp +++ b/coding/elias_coder.hpp @@ -18,7 +18,7 @@ public: if (value == 0) return false; - uint8_t const n = bits::CeilLog(value); + uint8_t const n = bits::FloorLog(value); ASSERT_LESS_OR_EQUAL(n, 63, ()); uint64_t const msb = static_cast(1) << n; @@ -50,7 +50,7 @@ public: if (value == 0) return false; - uint8_t const n = bits::CeilLog(value); + uint8_t const n = bits::FloorLog(value); ASSERT_LESS_OR_EQUAL(n, 63, ()); if (!GammaCoder::Encode(writer, n + 1)) return false; From 41c18b845c5379a3c8f5ecdfeea7dc05fde7dcf4 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 8 Feb 2017 16:50:50 +0300 Subject: [PATCH 4/5] [base] Implemented SafeSmallSet. --- base/small_set.hpp | 53 ++++++++++++++++++++++++++++++++++++++++- search/query_params.hpp | 34 +------------------------- 2 files changed, 53 insertions(+), 34 deletions(-) diff --git a/base/small_set.hpp b/base/small_set.hpp index 35c7afe7a1..1e1995d38a 100644 --- a/base/small_set.hpp +++ b/base/small_set.hpp @@ -154,7 +154,7 @@ uint64_t constexpr SmallSet::kNumBlocks; template uint64_t constexpr SmallSet::kOne; -template +template std::string DebugPrint(SmallSet const & set) { std::ostringstream os; @@ -164,4 +164,55 @@ std::string DebugPrint(SmallSet const & set) os << "]"; return os.str(); } + +// This is a delegate for SmallSet<>, that checks the validity of +// argument in Insert(), Remove() and Contains() methods and does +// nothing when the argument is not valid. +template +class SafeSmallSet +{ +public: + using Set = SmallSet; + using Iterator = typename Set::Iterator; + + void Insert(uint64_t value) + { + if (IsValid(value)) + m_set.Insert(value); + } + + void Remove(uint64_t value) + { + if (IsValid(value)) + m_set.Remove(value); + } + + bool Contains(uint64_t value) const { return IsValid(value) && m_set.Contains(value); } + + uint64_t Size() const { return m_set.Size(); } + + void Clear() { m_set.Clear(); } + + Iterator begin() const { return m_set.begin(); } + Iterator cbegin() const { return m_set.cbegin(); } + + Iterator end() const { return m_set.end(); } + Iterator cend() const { return m_set.cend(); } + +private: + bool IsValid(uint64_t value) const { return value < UpperBound; } + + Set m_set; +}; + +template +std::string DebugPrint(SafeSmallSet const & set) +{ + std::ostringstream os; + os << "SafeSmallSet<" << UpperBound << "> [" << set.Size() << ": "; + for (auto const & v : set) + os << v << " "; + os << "]"; + return os.str(); +} } // namespace base diff --git a/search/query_params.hpp b/search/query_params.hpp index 480ebee23c..d489deb6bc 100644 --- a/search/query_params.hpp +++ b/search/query_params.hpp @@ -23,39 +23,7 @@ class QueryParams public: using String = strings::UniString; using TypeIndices = vector; - - class Langs - { - public: - using Set = base::SmallSet; - using Iterator = Set::Iterator; - - void Insert(int8_t lang) - { - if (IsValid(lang)) - m_set.Insert(lang); - } - - bool Contains(int8_t lang) const { return IsValid(lang) ? m_set.Contains(lang) : false; } - - void Clear() { m_set.Clear(); } - - Iterator begin() const { return m_set.begin(); } - Iterator cbegin() const { return m_set.cbegin(); } - - Iterator end() const { return m_set.end(); } - Iterator cend() const { return m_set.cend(); } - - private: - friend string DebugPrint(Langs const & langs) { return DebugPrint(langs.m_set); } - - bool IsValid(int8_t lang) const - { - return lang >= 0 && lang < StringUtf8Multilang::kMaxSupportedLanguages; - } - - Set m_set; - }; + using Langs = base::SafeSmallSet; struct Token { From fa02e26e43a3ac8d274b8626f887dc52141a9146 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 9 Feb 2017 13:09:05 +0300 Subject: [PATCH 5/5] Review fixes. --- base/small_set.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/small_set.hpp b/base/small_set.hpp index 1e1995d38a..c99fade4e6 100644 --- a/base/small_set.hpp +++ b/base/small_set.hpp @@ -11,7 +11,7 @@ namespace base { -// A set of values less than |UpperBound|. +// A set of nonnegative integers less than |UpperBound|. // // Requires UpperBound + O(1) bits of memory. All operations except // Clear() and iteration are O(1). Clear() and iteration require