[search] SmallSet for languages.

This commit is contained in:
Yuri Gorshenin 2017-02-08 13:33:28 +03:00
parent 803e85ea59
commit 9d1db68c98
10 changed files with 288 additions and 8 deletions

View file

@ -42,6 +42,7 @@ set(
set_operations.hpp
shared_buffer_manager.cpp
shared_buffer_manager.hpp
small_set.hpp
src_point.cpp
src_point.hpp
stats.hpp

View file

@ -66,6 +66,7 @@ HEADERS += \
scope_guard.hpp \
set_operations.hpp \
shared_buffer_manager.hpp \
small_set.hpp \
src_point.hpp \
stats.hpp \
std_serialization.hpp \

View file

@ -23,6 +23,7 @@ set(
regexp_test.cpp
rolling_hash_test.cpp
scope_guard_test.cpp
small_set_test.cpp
stl_add_test.cpp
stl_helpers_test.cpp
string_format_test.cpp

View file

@ -33,6 +33,7 @@ SOURCES += \
regexp_test.cpp \
rolling_hash_test.cpp \
scope_guard_test.cpp \
small_set_test.cpp \
stl_add_test.cpp \
stl_helpers_test.cpp \
string_format_test.cpp \

View file

@ -0,0 +1,74 @@
#include "testing/testing.hpp"
#include "base/small_set.hpp"
#include <algorithm>
#include <iterator>
#include <vector>
using namespace base;
namespace
{
// A set whose upper bound is zero can still be constructed and holds nothing.
UNIT_TEST(SmallSet_Empty)
{
  SmallSet<0> emptySet;
  TEST_EQUAL(emptySet.Size(), 0, ());
}
// Walks a SmallSet<300> through insertion, duplicate insertion, removal and
// iteration, checking size and membership after every step.
UNIT_TEST(SmallSet_Smoke)
{
  SmallSet<300> values;
  TEST_EQUAL(values.Size(), 0, ());

  // Inserting the same value a second time must not change the set.
  for (int attempt = 0; attempt < 2; ++attempt)
  {
    values.Insert(0);
    TEST_EQUAL(values.Size(), 1, ());
    TEST(values.Contains(0), ());
  }

  values.Insert(5);
  TEST_EQUAL(values.Size(), 2, ());
  for (uint64_t present : {0, 5})
  {
    TEST(values.Contains(present), ());
  }

  // 64 is the first value that lands in the second 64-bit block.
  values.Insert(64);
  TEST_EQUAL(values.Size(), 3, ());
  for (uint64_t present : {0, 5, 64})
  {
    TEST(values.Contains(present), ());
  }

  {
    // Manual iteration must yield exactly 0, 5, 64 and then reach end().
    auto it = values.begin();
    auto const last = values.end();
    for (uint64_t wanted : {0, 5, 64})
    {
      TEST(it != last, ());
      TEST_EQUAL(*it, wanted, ());
      ++it;
    }
    TEST(it == last, ());
  }

  values.Remove(5);
  TEST_EQUAL(values.Size(), 2, ());
  TEST(values.Contains(0), ());
  TEST(!values.Contains(5), ());
  TEST(values.Contains(64), ());

  // The three largest values that SmallSet<300> accepts.
  for (uint64_t tail = 297; tail < 300; ++tail)
    values.Insert(tail);
  TEST_EQUAL(values.Size(), 5, ());

  {
    // Range construction from the iterators must see every element in order.
    std::vector<uint64_t> const wanted = {0, 64, 297, 298, 299};
    std::vector<uint64_t> const contents(values.begin(), values.end());
    TEST_EQUAL(contents, wanted, ());
  }

  TEST_EQUAL(values.Size(), std::distance(values.begin(), values.end()), ());
}
} // namespace

167
base/small_set.hpp Normal file
View file

@ -0,0 +1,167 @@
#pragma once
#include "base/assert.hpp"
#include "base/bits.hpp"
#include "base/macros.hpp"
#include <array>
#include <cstdint>
#include <iterator>
#include <sstream>
#include <string>
namespace base
{
// A set of values less than |UpperBound|.
//
// Requires UpperBound + O(1) bits of memory. All operations except
// Clear() and iteration are O(1). Clear() and iteration require
// O(UpperBound) steps.
//
// Every value is represented by one bit inside an array of 64-bit
// blocks. Iteration scans the blocks from the first to the last and,
// inside a block, from the lowest set bit to the highest.
//
// *NOTE* This class *IS NOT* thread safe.
template <uint64_t UpperBound>
class SmallSet
{
public:
  // Number of 64-bit blocks needed to hold |UpperBound| bits (zero when
  // |UpperBound| is zero).
  static uint64_t constexpr kNumBlocks = (UpperBound + 63) / 64;

  // Read-only forward iterator over the values stored in the set.
  //
  // It keeps a private copy of the block it is currently scanning and
  // clears bits from that copy while advancing, so the set itself is
  // never modified by iteration.
  class Iterator
  {
  public:
    // NOTE: unsigned; callers compare std::distance() results with Size().
    using difference_type = uint64_t;
    using value_type = uint64_t;
    using pointer = void;
    // Matches the type returned by operator*(): elements are produced by value.
    using reference = uint64_t;
    using iterator_category = std::forward_iterator_tag;

    // Positions the iterator at the first stored value found in the block
    // |current_block_index| or in any later block. Passing |kNumBlocks|
    // yields the past-the-end iterator.
    Iterator(uint64_t const * blocks, uint64_t current_block_index)
      : m_blocks(blocks), m_current_block_index(current_block_index), m_current_block(0)
    {
      ASSERT_LESS_OR_EQUAL(current_block_index, kNumBlocks, ());
      if (current_block_index < kNumBlocks)
        m_current_block = m_blocks[current_block_index];
      SkipZeroes();
    }

    bool operator==(Iterator const & rhs) const
    {
      return m_blocks == rhs.m_blocks && m_current_block_index == rhs.m_current_block_index &&
             m_current_block == rhs.m_current_block;
    }

    bool operator!=(Iterator const & rhs) const { return !(*this == rhs); }

    // Returns the value the iterator points to. Must not be called on end().
    uint64_t operator*() const
    {
      ASSERT_NOT_EQUAL(m_current_block, 0, ());
      // Isolate the lowest set bit; the population count of |bit - 1| is
      // then used as the position of that bit inside the block.
      auto const bit = m_current_block & -m_current_block;
      return bits::PopCount(bit - 1) + m_current_block_index * 64;
    }

    // Advances to the next stored value (or to end()).
    Iterator & operator++()
    {
      ASSERT(m_current_block_index < kNumBlocks, ());
      ASSERT_NOT_EQUAL(m_current_block, 0, ());
      // Drop the lowest set bit of the local block copy.
      m_current_block = m_current_block & (m_current_block - 1);
      SkipZeroes();
      return *this;
    }

    // Post-increment, required by the forward iterator category.
    Iterator operator++(int)
    {
      Iterator const previous = *this;
      ++*this;
      return previous;
    }

  private:
    // When the current block copy has no bits left, moves on to the next
    // non-empty block, or to the past-the-end state if there is none.
    void SkipZeroes()
    {
      ASSERT_LESS_OR_EQUAL(m_current_block_index, kNumBlocks, ());
      if (m_current_block != 0 || m_current_block_index == kNumBlocks)
        return;

      do
        ++m_current_block_index;
      while (m_current_block_index < kNumBlocks && m_blocks[m_current_block_index] == 0);

      if (m_current_block_index < kNumBlocks)
        m_current_block = m_blocks[m_current_block_index];
      else
        m_current_block = 0;
    }

    uint64_t const * m_blocks;
    uint64_t m_current_block_index;
    // Copy of the current block with the already visited bits cleared.
    uint64_t m_current_block;
  };

  // Adds |value| to the set; |value| must be less than |UpperBound|.
  // This invalidates all iterators except end().
  void Insert(uint64_t value)
  {
    ASSERT_LESS(value, UpperBound, ());
    auto & block = m_blocks[BlockIndex(value)];
    auto const bit = BitMask(value);
    // The size grows only when the bit was not set before.
    m_size += (block & bit) == 0;
    block |= bit;
  }

  // Removes |value| from the set; |value| must be less than |UpperBound|.
  // This invalidates all iterators except end().
  void Remove(uint64_t value)
  {
    ASSERT_LESS(value, UpperBound, ());
    auto & block = m_blocks[BlockIndex(value)];
    auto const bit = BitMask(value);
    // The size shrinks only when the bit was actually set.
    m_size -= (block & bit) != 0;
    block &= ~bit;
  }

  // Returns true iff |value| is in the set; |value| must be less than
  // |UpperBound|.
  bool Contains(uint64_t value) const
  {
    ASSERT_LESS(value, UpperBound, ());
    return (m_blocks[BlockIndex(value)] & BitMask(value)) != 0;
  }

  // Number of values currently stored.
  uint64_t Size() const { return m_size; }

  // Removes all values.
  // This invalidates all iterators except end().
  void Clear()
  {
    m_blocks.fill(0);
    m_size = 0;
  }

  Iterator begin() const { return Iterator(m_blocks.data(), 0); }
  Iterator cbegin() const { return Iterator(m_blocks.data(), 0); }

  Iterator end() const { return Iterator(m_blocks.data(), kNumBlocks); }
  Iterator cend() const { return Iterator(m_blocks.data(), kNumBlocks); }

private:
  static uint64_t constexpr kOne = 1;

  // Index of the 64-bit block that stores the bit for |value|.
  static uint64_t BlockIndex(uint64_t value) { return value / 64; }

  // Mask with only the bit for |value| set, relative to its block.
  static uint64_t BitMask(uint64_t value) { return kOne << (value % 64); }

  // std::array rather than a built-in array: a built-in array of zero
  // elements is not valid C++, but SmallSet<0> has to remain usable.
  std::array<uint64_t, kNumBlocks> m_blocks = {};
  uint64_t m_size = 0;
};
// static
// Namespace-scope definition of the constant declared inside the class;
// needed before C++17 whenever the constant is odr-used.
template <uint64_t UpperBound>
uint64_t constexpr SmallSet<UpperBound>::kNumBlocks;
// static
// Namespace-scope definition of the constant declared inside the class;
// needed before C++17 whenever the constant is odr-used.
template <uint64_t UpperBound>
uint64_t constexpr SmallSet<UpperBound>::kOne;
// Formats |set| for debug output as "SmallSet<UpperBound> [size: v1 v2 ... ]".
// Every value, including the last one, is followed by a single space.
template <uint64_t UpperBound>
std::string DebugPrint(SmallSet<UpperBound> const & set)
{
  std::ostringstream out;
  out << "SmallSet<" << UpperBound << "> [" << set.Size() << ": ";

  auto const last = set.end();
  for (auto it = set.begin(); it != last; ++it)
    out << *it << " ";

  out << "]";
  return out.str();
}
} // namespace base

View file

@ -207,18 +207,18 @@ private:
template <typename DFA>
struct SearchTrieRequest
{
inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; }
inline bool IsLangExist(int8_t lang) const { return m_langs.Contains(lang); }
inline void Clear()
{
m_names.clear();
m_categories.clear();
m_langs.clear();
m_langs.Clear();
}
vector<DFA> m_names;
vector<strings::UniStringDFA> m_categories;
unordered_set<int8_t> m_langs;
QueryParams::Langs m_langs;
};
// Calls |toDo| for each feature accepted by at least one DFA.

View file

@ -677,7 +677,7 @@ void Processor::InitParams(QueryParams & params)
auto & langs = params.GetLangs();
for (int i = 0; i < LANG_COUNT; ++i)
langs.insert(GetLanguage(i));
langs.Insert(GetLanguage(i));
RemoveStopWordsIfNeeded(params);

View file

@ -54,7 +54,7 @@ void QueryParams::Clear()
m_prefixToken.Clear();
m_hasPrefix = false;
m_typeIndices.clear();
m_langs.clear();
m_langs.Clear();
m_scale = scales::GetUpperScale();
}
@ -135,7 +135,7 @@ string DebugPrint(QueryParams const & params)
os << "QueryParams [ m_tokens=" << ::DebugPrint(params.m_tokens)
<< ", m_prefixToken=" << DebugPrint(params.m_prefixToken)
<< ", m_typeIndices=" << ::DebugPrint(params.m_typeIndices)
<< ", m_langs=" << ::DebugPrint(params.m_langs) << " ]";
<< ", m_langs=" << DebugPrint(params.m_langs) << " ]";
return os.str();
}

View file

@ -2,7 +2,10 @@
#include "indexer/scales.hpp"
#include "coding/multilang_utf8_string.hpp"
#include "base/assert.hpp"
#include "base/small_set.hpp"
#include "base/string_utils.hpp"
#include "std/cstdint.hpp"
@ -20,7 +23,39 @@ class QueryParams
public:
using String = strings::UniString;
using TypeIndices = vector<uint32_t>;
using Langs = unordered_set<int8_t>;
// Set of language codes stored in a base::SmallSet bounded by
// StringUtf8Multilang::kMaxSupportedLanguages. Codes outside of the
// range [0, kMaxSupportedLanguages) are ignored on insertion and are
// always reported as absent.
class Langs
{
public:
  using Set = base::SmallSet<StringUtf8Multilang::kMaxSupportedLanguages>;
  using Iterator = Set::Iterator;

  // Adds |lang| to the set; does nothing for an out-of-range code.
  void Insert(int8_t lang)
  {
    if (!IsValid(lang))
      return;
    m_set.Insert(lang);
  }

  // Returns true iff |lang| is in range and was inserted before.
  bool Contains(int8_t lang) const { return IsValid(lang) && m_set.Contains(lang); }

  // Removes all languages.
  void Clear() { m_set.Clear(); }

  Iterator begin() const { return m_set.begin(); }
  Iterator cbegin() const { return m_set.cbegin(); }

  Iterator end() const { return m_set.end(); }
  Iterator cend() const { return m_set.cend(); }

private:
  // Debug representation; forwards to DebugPrint() of the underlying set.
  friend string DebugPrint(Langs const & langs) { return DebugPrint(langs.m_set); }

  // True when |lang| can be stored in the underlying set.
  bool IsValid(int8_t lang) const
  {
    return 0 <= lang && lang < StringUtf8Multilang::kMaxSupportedLanguages;
  }

  Set m_set;
};
struct Token
{
@ -110,7 +145,7 @@ public:
inline Langs & GetLangs() { return m_langs; }
inline Langs const & GetLangs() const { return m_langs; }
inline bool LangExists(int8_t lang) const { return m_langs.count(lang) != 0; }
inline bool LangExists(int8_t lang) const { return m_langs.Contains(lang); }
inline int GetScale() const { return m_scale; }