forked from organicmaps/organicmaps
Merge pull request #5379 from ygorshenin/small-set
[search] SmallSet for languages.
This commit is contained in:
commit
00db23ed79
14 changed files with 322 additions and 21 deletions
|
@ -42,6 +42,7 @@ set(
|
|||
set_operations.hpp
|
||||
shared_buffer_manager.cpp
|
||||
shared_buffer_manager.hpp
|
||||
small_set.hpp
|
||||
src_point.cpp
|
||||
src_point.hpp
|
||||
stats.hpp
|
||||
|
|
|
@ -66,6 +66,7 @@ HEADERS += \
|
|||
scope_guard.hpp \
|
||||
set_operations.hpp \
|
||||
shared_buffer_manager.hpp \
|
||||
small_set.hpp \
|
||||
src_point.hpp \
|
||||
stats.hpp \
|
||||
std_serialization.hpp \
|
||||
|
|
|
@ -23,6 +23,7 @@ set(
|
|||
regexp_test.cpp
|
||||
rolling_hash_test.cpp
|
||||
scope_guard_test.cpp
|
||||
small_set_test.cpp
|
||||
stl_add_test.cpp
|
||||
stl_helpers_test.cpp
|
||||
string_format_test.cpp
|
||||
|
|
|
@ -33,6 +33,7 @@ SOURCES += \
|
|||
regexp_test.cpp \
|
||||
rolling_hash_test.cpp \
|
||||
scope_guard_test.cpp \
|
||||
small_set_test.cpp \
|
||||
stl_add_test.cpp \
|
||||
stl_helpers_test.cpp \
|
||||
string_format_test.cpp \
|
||||
|
|
|
@ -120,14 +120,14 @@ UNIT_TEST(PopCount64)
|
|||
|
||||
UNIT_TEST(CeilLog)
|
||||
{
|
||||
TEST_EQUAL(0, bits::CeilLog(0x0), ());
|
||||
TEST_EQUAL(0, bits::CeilLog(0x1), ());
|
||||
TEST_EQUAL(1, bits::CeilLog(0x2), ());
|
||||
TEST_EQUAL(1, bits::CeilLog(0x3), ());
|
||||
TEST_EQUAL(2, bits::CeilLog(0x4), ());
|
||||
TEST_EQUAL(0, bits::FloorLog(0x0), ());
|
||||
TEST_EQUAL(0, bits::FloorLog(0x1), ());
|
||||
TEST_EQUAL(1, bits::FloorLog(0x2), ());
|
||||
TEST_EQUAL(1, bits::FloorLog(0x3), ());
|
||||
TEST_EQUAL(2, bits::FloorLog(0x4), ());
|
||||
|
||||
TEST_EQUAL(6, bits::CeilLog(0x7f), ());
|
||||
TEST_EQUAL(7, bits::CeilLog(0x80), ());
|
||||
TEST_EQUAL(31, bits::CeilLog(0xFFFFFFFF), ());
|
||||
TEST_EQUAL(63, bits::CeilLog(0xFFFFFFFFFFFFFFFF), ());
|
||||
TEST_EQUAL(6, bits::FloorLog(0x7f), ());
|
||||
TEST_EQUAL(7, bits::FloorLog(0x80), ());
|
||||
TEST_EQUAL(31, bits::FloorLog(0xFFFFFFFF), ());
|
||||
TEST_EQUAL(63, bits::FloorLog(0xFFFFFFFFFFFFFFFF), ());
|
||||
}
|
||||
|
|
76
base/base_tests/small_set_test.cpp
Normal file
76
base/base_tests/small_set_test.cpp
Normal file
|
@ -0,0 +1,76 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "base/small_set.hpp"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iterator>
|
||||
#include <vector>
|
||||
|
||||
using namespace base;
|
||||
|
||||
namespace
|
||||
{
|
||||
UNIT_TEST(SmallSet_Empty)
|
||||
{
|
||||
SmallSet<0> set;
|
||||
TEST_EQUAL(set.Size(), 0, ());
|
||||
}
|
||||
|
||||
UNIT_TEST(SmallSet_Smoke)
|
||||
{
|
||||
SmallSet<300> set;
|
||||
TEST_EQUAL(set.Size(), 0, ());
|
||||
for (uint64_t i = 0; i < 300; ++i)
|
||||
TEST(!set.Contains(i), ());
|
||||
|
||||
set.Insert(0);
|
||||
TEST_EQUAL(set.Size(), 1, ());
|
||||
TEST(set.Contains(0), ());
|
||||
|
||||
set.Insert(0);
|
||||
TEST_EQUAL(set.Size(), 1, ());
|
||||
TEST(set.Contains(0), ());
|
||||
|
||||
set.Insert(5);
|
||||
TEST_EQUAL(set.Size(), 2, ());
|
||||
TEST(set.Contains(0), ());
|
||||
TEST(set.Contains(5), ());
|
||||
|
||||
set.Insert(64);
|
||||
TEST_EQUAL(set.Size(), 3, ());
|
||||
TEST(set.Contains(0), ());
|
||||
TEST(set.Contains(5), ());
|
||||
TEST(set.Contains(64), ());
|
||||
|
||||
{
|
||||
auto cur = set.begin();
|
||||
auto end = set.end();
|
||||
for (uint64_t i : {0, 5, 64})
|
||||
{
|
||||
TEST(cur != end, ());
|
||||
TEST_EQUAL(*cur, i, ());
|
||||
++cur;
|
||||
}
|
||||
TEST(cur == end, ());
|
||||
}
|
||||
|
||||
set.Remove(5);
|
||||
TEST_EQUAL(set.Size(), 2, ());
|
||||
TEST(set.Contains(0), ());
|
||||
TEST(!set.Contains(5), ());
|
||||
TEST(set.Contains(64), ());
|
||||
|
||||
set.Insert(297);
|
||||
set.Insert(298);
|
||||
set.Insert(299);
|
||||
TEST_EQUAL(set.Size(), 5, ());
|
||||
|
||||
{
|
||||
std::vector<uint64_t> const actual(set.begin(), set.end());
|
||||
std::vector<uint64_t> const expected = {0, 64, 297, 298, 299};
|
||||
TEST_EQUAL(actual, expected, ());
|
||||
}
|
||||
|
||||
TEST_EQUAL(set.Size(), std::distance(set.begin(), set.end()), ());
|
||||
}
|
||||
} // namespace
|
|
@ -70,7 +70,7 @@ namespace bits
|
|||
return static_cast<uint32_t>(x);
|
||||
}
|
||||
|
||||
inline uint8_t CeilLog(uint64_t x) noexcept
|
||||
inline uint8_t FloorLog(uint64_t x) noexcept
|
||||
{
|
||||
#define CHECK_RSH(x, msb, offset) \
|
||||
if (x >> offset) \
|
||||
|
|
218
base/small_set.hpp
Normal file
218
base/small_set.hpp
Normal file
|
@ -0,0 +1,218 @@
|
|||
#pragma once
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/bits.hpp"
|
||||
#include "base/macros.hpp"
|
||||
|
||||
#include <cstdint>
|
||||
#include <iterator>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
namespace base
|
||||
{
|
||||
// A set of nonnegative integers less than |UpperBound|.
|
||||
//
|
||||
// Requires UpperBound + O(1) bits of memory. All operations except
|
||||
// Clear() and iteration are O(1). Clear() and iteration require
|
||||
// O(UpperBound) steps.
|
||||
//
|
||||
// *NOTE* This class *IS NOT* thread safe.
|
||||
template <uint64_t UpperBound>
|
||||
class SmallSet
|
||||
{
|
||||
public:
|
||||
static uint64_t constexpr kNumBlocks = (UpperBound + 63) / 64;
|
||||
|
||||
class Iterator
|
||||
{
|
||||
public:
|
||||
using difference_type = uint64_t;
|
||||
using value_type = uint64_t;
|
||||
using pointer = void;
|
||||
using reference = void;
|
||||
using iterator_category = std::forward_iterator_tag;
|
||||
|
||||
Iterator(uint64_t const * blocks, uint64_t current_block_index)
|
||||
: m_blocks(blocks), m_current_block_index(current_block_index), m_current_block(0)
|
||||
{
|
||||
ASSERT_LESS_OR_EQUAL(current_block_index, kNumBlocks, ());
|
||||
if (current_block_index < kNumBlocks)
|
||||
m_current_block = m_blocks[current_block_index];
|
||||
SkipZeroes();
|
||||
}
|
||||
|
||||
bool operator==(Iterator const & rhs) const
|
||||
{
|
||||
return m_blocks == rhs.m_blocks && m_current_block_index == rhs.m_current_block_index &&
|
||||
m_current_block == rhs.m_current_block;
|
||||
}
|
||||
|
||||
bool operator!=(Iterator const & rhs) const { return !(*this == rhs); }
|
||||
|
||||
uint64_t operator*() const
|
||||
{
|
||||
ASSERT_NOT_EQUAL(m_current_block, 0, ());
|
||||
auto const bit = m_current_block & -m_current_block;
|
||||
return bits::FloorLog(bit) + m_current_block_index * 64;
|
||||
}
|
||||
|
||||
Iterator const & operator++()
|
||||
{
|
||||
ASSERT(m_current_block_index < kNumBlocks, ());
|
||||
ASSERT_NOT_EQUAL(m_current_block, 0, ());
|
||||
m_current_block = m_current_block & (m_current_block - 1);
|
||||
SkipZeroes();
|
||||
return *this;
|
||||
}
|
||||
|
||||
private:
|
||||
void SkipZeroes()
|
||||
{
|
||||
ASSERT_LESS_OR_EQUAL(m_current_block_index, kNumBlocks, ());
|
||||
|
||||
if (m_current_block != 0 || m_current_block_index == kNumBlocks)
|
||||
return;
|
||||
|
||||
do
|
||||
++m_current_block_index;
|
||||
while (m_current_block_index < kNumBlocks && m_blocks[m_current_block_index] == 0);
|
||||
if (m_current_block_index < kNumBlocks)
|
||||
m_current_block = m_blocks[m_current_block_index];
|
||||
else
|
||||
m_current_block = 0;
|
||||
}
|
||||
|
||||
uint64_t const * m_blocks;
|
||||
uint64_t m_current_block_index;
|
||||
uint64_t m_current_block;
|
||||
};
|
||||
|
||||
#define DEFINE_BLOCK_OFFSET(value) \
|
||||
uint64_t const block = value / 64; \
|
||||
uint64_t const offset = value % 64
|
||||
|
||||
// This invalidates all iterators except end().
|
||||
void Insert(uint64_t value)
|
||||
{
|
||||
ASSERT_LESS(value, UpperBound, ());
|
||||
|
||||
DEFINE_BLOCK_OFFSET(value);
|
||||
auto const bit = kOne << offset;
|
||||
m_size += (m_blocks[block] & bit) == 0;
|
||||
m_blocks[block] |= bit;
|
||||
}
|
||||
|
||||
// This invalidates all iterators except end().
|
||||
void Remove(uint64_t value)
|
||||
{
|
||||
ASSERT_LESS(value, UpperBound, ());
|
||||
|
||||
DEFINE_BLOCK_OFFSET(value);
|
||||
auto const bit = kOne << offset;
|
||||
m_size -= (m_blocks[block] & bit) != 0;
|
||||
m_blocks[block] &= ~bit;
|
||||
}
|
||||
|
||||
bool Contains(uint64_t value) const
|
||||
{
|
||||
ASSERT_LESS(value, UpperBound, ());
|
||||
|
||||
DEFINE_BLOCK_OFFSET(value);
|
||||
return m_blocks[block] & (kOne << offset);
|
||||
}
|
||||
|
||||
#undef DEFINE_BLOCK_OFFSET
|
||||
|
||||
uint64_t Size() const { return m_size; }
|
||||
|
||||
// This invalidates all iterators except end().
|
||||
void Clear()
|
||||
{
|
||||
std::fill(std::begin(m_blocks), std::end(m_blocks), static_cast<uint64_t>(0));
|
||||
m_size = 0;
|
||||
}
|
||||
|
||||
Iterator begin() const { return Iterator(m_blocks, 0); }
|
||||
Iterator cbegin() const { return Iterator(m_blocks, 0); }
|
||||
|
||||
Iterator end() const { return Iterator(m_blocks, kNumBlocks); }
|
||||
Iterator cend() const { return Iterator(m_blocks, kNumBlocks); }
|
||||
|
||||
private:
|
||||
static uint64_t constexpr kOne = 1;
|
||||
|
||||
uint64_t m_blocks[kNumBlocks] = {};
|
||||
uint64_t m_size = 0;
|
||||
};
|
||||
|
||||
// static
|
||||
template <uint64_t UpperBound>
|
||||
uint64_t constexpr SmallSet<UpperBound>::kNumBlocks;
|
||||
|
||||
// static
|
||||
template <uint64_t UpperBound>
|
||||
uint64_t constexpr SmallSet<UpperBound>::kOne;
|
||||
|
||||
template <uint64_t UpperBound>
|
||||
std::string DebugPrint(SmallSet<UpperBound> const & set)
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << "SmallSet<" << UpperBound << "> [" << set.Size() << ": ";
|
||||
for (auto const & v : set)
|
||||
os << v << " ";
|
||||
os << "]";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
// This is a delegate for SmallSet<>, that checks the validity of
|
||||
// argument in Insert(), Remove() and Contains() methods and does
|
||||
// nothing when the argument is not valid.
|
||||
template <uint64_t UpperBound>
|
||||
class SafeSmallSet
|
||||
{
|
||||
public:
|
||||
using Set = SmallSet<UpperBound>;
|
||||
using Iterator = typename Set::Iterator;
|
||||
|
||||
void Insert(uint64_t value)
|
||||
{
|
||||
if (IsValid(value))
|
||||
m_set.Insert(value);
|
||||
}
|
||||
|
||||
void Remove(uint64_t value)
|
||||
{
|
||||
if (IsValid(value))
|
||||
m_set.Remove(value);
|
||||
}
|
||||
|
||||
bool Contains(uint64_t value) const { return IsValid(value) && m_set.Contains(value); }
|
||||
|
||||
uint64_t Size() const { return m_set.Size(); }
|
||||
|
||||
void Clear() { m_set.Clear(); }
|
||||
|
||||
Iterator begin() const { return m_set.begin(); }
|
||||
Iterator cbegin() const { return m_set.cbegin(); }
|
||||
|
||||
Iterator end() const { return m_set.end(); }
|
||||
Iterator cend() const { return m_set.cend(); }
|
||||
|
||||
private:
|
||||
bool IsValid(uint64_t value) const { return value < UpperBound; }
|
||||
|
||||
Set m_set;
|
||||
};
|
||||
|
||||
template <uint64_t UpperBound>
|
||||
std::string DebugPrint(SafeSmallSet<UpperBound> const & set)
|
||||
{
|
||||
std::ostringstream os;
|
||||
os << "SafeSmallSet<" << UpperBound << "> [" << set.Size() << ": ";
|
||||
for (auto const & v : set)
|
||||
os << v << " ";
|
||||
os << "]";
|
||||
return os.str();
|
||||
}
|
||||
} // namespace base
|
|
@ -434,7 +434,7 @@ unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups(
|
|||
if (bitGroups.empty())
|
||||
return make_unique<SparseCBV>(move(bitGroups));
|
||||
|
||||
uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::CeilLog(bitGroups.back());
|
||||
uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::FloorLog(bitGroups.back());
|
||||
uint64_t popCount = 0;
|
||||
for (size_t i = 0; i < bitGroups.size(); ++i)
|
||||
popCount += bits::PopCount(bitGroups[i]);
|
||||
|
|
|
@ -18,7 +18,7 @@ public:
|
|||
if (value == 0)
|
||||
return false;
|
||||
|
||||
uint8_t const n = bits::CeilLog(value);
|
||||
uint8_t const n = bits::FloorLog(value);
|
||||
ASSERT_LESS_OR_EQUAL(n, 63, ());
|
||||
|
||||
uint64_t const msb = static_cast<uint64_t>(1) << n;
|
||||
|
@ -50,7 +50,7 @@ public:
|
|||
if (value == 0)
|
||||
return false;
|
||||
|
||||
uint8_t const n = bits::CeilLog(value);
|
||||
uint8_t const n = bits::FloorLog(value);
|
||||
ASSERT_LESS_OR_EQUAL(n, 63, ());
|
||||
if (!GammaCoder::Encode(writer, n + 1))
|
||||
return false;
|
||||
|
|
|
@ -207,18 +207,18 @@ private:
|
|||
template <typename DFA>
|
||||
struct SearchTrieRequest
|
||||
{
|
||||
inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; }
|
||||
inline bool IsLangExist(int8_t lang) const { return m_langs.Contains(lang); }
|
||||
|
||||
inline void Clear()
|
||||
{
|
||||
m_names.clear();
|
||||
m_categories.clear();
|
||||
m_langs.clear();
|
||||
m_langs.Clear();
|
||||
}
|
||||
|
||||
vector<DFA> m_names;
|
||||
vector<strings::UniStringDFA> m_categories;
|
||||
unordered_set<int8_t> m_langs;
|
||||
QueryParams::Langs m_langs;
|
||||
};
|
||||
|
||||
// Calls |toDo| for each feature accepted but at least one DFA.
|
||||
|
|
|
@ -675,7 +675,7 @@ void Processor::InitParams(QueryParams & params)
|
|||
|
||||
auto & langs = params.GetLangs();
|
||||
for (int i = 0; i < LANG_COUNT; ++i)
|
||||
langs.insert(GetLanguage(i));
|
||||
langs.Insert(GetLanguage(i));
|
||||
|
||||
RemoveStopWordsIfNeeded(params);
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ void QueryParams::Clear()
|
|||
m_prefixToken.Clear();
|
||||
m_hasPrefix = false;
|
||||
m_typeIndices.clear();
|
||||
m_langs.clear();
|
||||
m_langs.Clear();
|
||||
m_scale = scales::GetUpperScale();
|
||||
}
|
||||
|
||||
|
@ -135,7 +135,7 @@ string DebugPrint(QueryParams const & params)
|
|||
os << "QueryParams [ m_tokens=" << ::DebugPrint(params.m_tokens)
|
||||
<< ", m_prefixToken=" << DebugPrint(params.m_prefixToken)
|
||||
<< ", m_typeIndices=" << ::DebugPrint(params.m_typeIndices)
|
||||
<< ", m_langs=" << ::DebugPrint(params.m_langs) << " ]";
|
||||
<< ", m_langs=" << DebugPrint(params.m_langs) << " ]";
|
||||
return os.str();
|
||||
}
|
||||
|
||||
|
|
|
@ -2,7 +2,10 @@
|
|||
|
||||
#include "indexer/scales.hpp"
|
||||
|
||||
#include "coding/multilang_utf8_string.hpp"
|
||||
|
||||
#include "base/assert.hpp"
|
||||
#include "base/small_set.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/cstdint.hpp"
|
||||
|
@ -20,7 +23,7 @@ class QueryParams
|
|||
public:
|
||||
using String = strings::UniString;
|
||||
using TypeIndices = vector<uint32_t>;
|
||||
using Langs = unordered_set<int8_t>;
|
||||
using Langs = base::SafeSmallSet<StringUtf8Multilang::kMaxSupportedLanguages>;
|
||||
|
||||
struct Token
|
||||
{
|
||||
|
@ -110,7 +113,7 @@ public:
|
|||
|
||||
inline Langs & GetLangs() { return m_langs; }
|
||||
inline Langs const & GetLangs() const { return m_langs; }
|
||||
inline bool LangExists(int8_t lang) const { return m_langs.count(lang) != 0; }
|
||||
inline bool LangExists(int8_t lang) const { return m_langs.Contains(lang); }
|
||||
|
||||
inline int GetScale() const { return m_scale; }
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue