diff --git a/coding/simple_dense_coding.cpp b/coding/simple_dense_coding.cpp index 4fd359a6f5..6df600fa73 100644 --- a/coding/simple_dense_coding.cpp +++ b/coding/simple_dense_coding.cpp @@ -4,7 +4,8 @@ #include "std/algorithm.hpp" #include "std/limits.hpp" -#include "std/utility.hpp" + +#include "3party/boost/boost/range/adaptor/transformed.hpp" namespace coding { @@ -12,47 +13,6 @@ namespace { size_t const kAlphabetSize = static_cast(numeric_limits::max()) + 1; -struct Code -{ - Code() : m_code(0), m_length(0) {} - - uint8_t m_code; - uint8_t m_length; -}; - -// Initializes code table for simple dense coding with following code -// words: 0, 1, 00, 01, 10, 11, 000, 001, ... -struct CodeTable -{ -public: - CodeTable() - { - size_t rank = 0; - uint8_t length = 1; - while (rank < kAlphabetSize) - { - // Number of codes with the same bit length. - size_t const numCodes = static_cast(1) << length; - - uint8_t code = 0; - for (; code < numCodes && rank + code < kAlphabetSize; ++code) - { - size_t const pos = rank + code; - m_table[pos].m_code = code; - m_table[pos].m_length = length; - } - - rank += code; - length += 1; - } - } - - inline Code const & GetCode(uint8_t rank) const { return m_table[rank]; } - -private: - Code m_table[kAlphabetSize]; -}; - // Calculates frequences for data symbols. void CalcFrequences(vector const & data, uint64_t frequency[]) { @@ -65,8 +25,6 @@ void CalcFrequences(vector const & data, uint64_t frequency[]) SimpleDenseCoding::SimpleDenseCoding(vector const & data) { // This static initialization isn't thread safe prior to C++11. - static CodeTable codeTable; - uint64_t frequency[kAlphabetSize]; // Maps symbols to frequences. CalcFrequences(data, frequency); @@ -75,57 +33,34 @@ SimpleDenseCoding::SimpleDenseCoding(vector const & data) for (size_t i = 0; i < kAlphabetSize; ++i) symbols[i] = i; - sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lsym, uint8_t rsym) + + auto frequencyCmp = [&frequency](uint8_t lsym, uint8_t rsym) { return frequency[lsym] > frequency[rsym]; - }); + }; + sort(symbols, symbols + kAlphabetSize, frequencyCmp); for (size_t r = 0; r < kAlphabetSize; ++r) rank[symbols[r]] = r; - uint64_t bitLength = 0; - for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol) - bitLength += frequency[symbol] * codeTable.GetCode(rank[symbol]).m_length; - - succinct::bit_vector_builder bitsBuilder; - bitsBuilder.reserve(bitLength); - vector indexBuilder(bitLength); - size_t pos = 0; - for (uint8_t symbol : data) + auto getRank = [&rank](uint8_t sym) { - Code const & code = codeTable.GetCode(rank[symbol]); - ASSERT_LESS(pos, bitLength, ()); - indexBuilder[pos] = 1; + return rank[sym]; + }; - bitsBuilder.append_bits(code.m_code, code.m_length); - pos += code.m_length; - } - ASSERT_EQUAL(pos, bitLength, ()); - - succinct::bit_vector(&bitsBuilder).swap(m_bits); - succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index); + using namespace boost::adaptors; + succinct::elias_fano_compressed_list(data | transformed(getRank)).swap(m_ranks); m_symbols.assign(symbols); } SimpleDenseCoding::SimpleDenseCoding(SimpleDenseCoding && rhs) { - m_bits.swap(rhs.m_bits); - m_index.swap(rhs.m_index); + m_ranks.swap(rhs.m_ranks); m_symbols.swap(rhs.m_symbols); } uint8_t SimpleDenseCoding::Get(uint64_t i) const { ASSERT_LESS(i, Size(), ()); - uint64_t const start = m_index.select(i); - uint64_t const end = i + 1 == Size() ? m_index.size() : m_index.select(i + 1); - - ASSERT_LESS(start, end, ()); - - uint8_t const length = static_cast(end - start); - ASSERT_LESS_OR_EQUAL(length, 8, ()); - - uint8_t const code = m_bits.get_bits(start, length); - uint8_t const rank = (1 << length) - 2 + code; - return m_symbols[rank]; + return m_symbols[m_ranks[i]]; } } // namespace coding diff --git a/coding/simple_dense_coding.hpp b/coding/simple_dense_coding.hpp index 025767e722..dee95e6804 100644 --- a/coding/simple_dense_coding.hpp +++ b/coding/simple_dense_coding.hpp @@ -2,16 +2,14 @@ #include "std/vector.hpp" -#include "3party/succinct/bit_vector.hpp" -#include "3party/succinct/mappable_vector.hpp" -#include "3party/succinct/rs_bit_vector.hpp" +#include "3party/succinct/elias_fano_compressed_list.hpp" namespace coding { -// This class represents so-called simple dense coding for byte -// strings. It can be used when it's necessary to compress strings -// with skewed entropy and nevertheless efficient access to the -// string's elements is needed. +// This class represents a variant of a so-called simple dense coding +// scheme for byte strings. It can be used when it's necessary to +// compress strings with skewed entropy and nevertheless efficient +// access to the string's elements is needed. // // The main idea is to assign codewords from the set { 0, 1, 00, 01, // 10, 11, 000, ... } to string's symbols in accordance with their @@ -40,21 +38,19 @@ public: uint8_t Get(uint64_t i) const; - inline uint64_t Size() const { return m_index.num_ones(); } + inline uint64_t Size() const { return m_ranks.size(); } // map is used here (instead of Map) for compatibility with succinct // structures. template void map(TVisitor & visitor) { - visitor(m_bits, "m_bits"); - visitor(m_index, "m_index"); + visitor(m_ranks, "m_ranks"); visitor(m_symbols, "m_symbols"); } private: - succinct::bit_vector m_bits; - succinct::rs_bit_vector m_index; + succinct::elias_fano_compressed_list m_ranks; succinct::mapper::mappable_vector m_symbols; }; } // namespace coding diff --git a/indexer/indexer_tests/rank_table_test.cpp b/indexer/indexer_tests/rank_table_test.cpp index 8a496ce2d5..65ab0e2ff3 100644 --- a/indexer/indexer_tests/rank_table_test.cpp +++ b/indexer/indexer_tests/rank_table_test.cpp @@ -49,7 +49,7 @@ void TestTable(vector const & ranks, string const & path) } } // namespace -UNIT_TEST(FeatureRankTableBuilder_Smoke) +UNIT_TEST(RankTableBuilder_Smoke) { char const kTestCont[] = "test.tmp"; size_t const kNumRanks = 256; @@ -69,7 +69,7 @@ UNIT_TEST(FeatureRankTableBuilder_Smoke) TestTable(ranks, kTestCont); } -UNIT_TEST(FeatureRankTableBuilder_EndToEnd) +UNIT_TEST(RankTableBuilder_EndToEnd) { classificator::Load(); @@ -89,7 +89,10 @@ UNIT_TEST(FeatureRankTableBuilder_EndToEnd) search::RankTableBuilder::CalcSearchRanks(rcont, ranks); } - search::RankTableBuilder::Create(localFile); + { + FilesContainerW wcont(mapPath, FileWriter::OP_WRITE_EXISTING); + search::RankTableBuilder::Create(ranks, wcont); + } Index index; auto regResult = index.RegisterMap(localFile); diff --git a/indexer/rank_table.cpp b/indexer/rank_table.cpp index 34484f7e84..16077937ae 100644 --- a/indexer/rank_table.cpp +++ b/indexer/rank_table.cpp @@ -94,7 +94,7 @@ private: DISALLOW_COPY(CopiedMemoryRegion); }; -unique_ptr GetMemoryRegionForTag(FilesContainerR & rcont, +unique_ptr GetMemoryRegionForTag(FilesContainerR const & rcont, FilesContainerBase::Tag const & tag) { if (!rcont.IsExist(tag)) @@ -105,7 +105,7 @@ unique_ptr GetMemoryRegionForTag(FilesContainerR & rcont, return make_unique(move(buffer)); } -unique_ptr GetMemoryRegionForTag(FilesMappingContainer & mcont, +unique_ptr GetMemoryRegionForTag(FilesMappingContainer const & mcont, FilesContainerBase::Tag const & tag) { if (!mcont.IsExist(tag)) @@ -242,13 +242,13 @@ uint8_t CalcSearchRank(FeatureType const & ft) } // namespace // static -unique_ptr RankTable::Load(FilesContainerR & rcont) +unique_ptr RankTable::Load(FilesContainerR const & rcont) { return LoadRankTable(GetMemoryRegionForTag(rcont, RANKS_FILE_TAG)); } // static -unique_ptr RankTable::Load(FilesMappingContainer & mcont) +unique_ptr RankTable::Load(FilesMappingContainer const & mcont) { return LoadRankTable(GetMemoryRegionForTag(mcont, RANKS_FILE_TAG)); } @@ -266,7 +266,7 @@ void RankTableBuilder::CalcSearchRanks(FilesContainerR & rcont, vector } // static -void RankTableBuilder::Create(platform::LocalCountryFile const & localFile) +void RankTableBuilder::CreateIfNotExists(platform::LocalCountryFile const & localFile) { string const mapPath = localFile.GetPath(MapOptions::Map); diff --git a/indexer/rank_table.hpp b/indexer/rank_table.hpp index 113675bedb..0917c4d639 100644 --- a/indexer/rank_table.hpp +++ b/indexer/rank_table.hpp @@ -65,7 +65,7 @@ public: // *NOTE* Return value can outlive |rcont|. Also note that there is // undefined behaviour if ranks section exists but internally // damaged. - static unique_ptr Load(FilesContainerR & rcont); + static unique_ptr Load(FilesContainerR const & rcont); // Maps whole section corresponding to a rank table and deserializes // it. Returns nullptr if there're no ranks section, rank table's @@ -76,7 +76,7 @@ public: // destructed before |mcont| is closed. Also note that there're // undefined behaviour if ranks section exists but internally // damaged. - static unique_ptr Load(FilesMappingContainer & mcont); + static unique_ptr Load(FilesMappingContainer const & mcont); }; // A builder class for rank tables. @@ -92,7 +92,7 @@ public: // reverse mapping. // * When rank table does not exist or exists but is damaged, calculates all // features' ranks and creates rank table. - static void Create(platform::LocalCountryFile const & localFile); + static void CreateIfNotExists(platform::LocalCountryFile const & localFile); // Force creation of a rank table from array of ranks. Existing rank // table is removed (if any). Note that |wcont| must be instantiated