[coding, indexer] Refactored SimpleDenseCoding.

This commit is contained in:
Yuri Gorshenin 2015-09-09 13:21:16 +03:00 committed by Sergey Yershov
parent 070a14d8ac
commit 00001c40c2
5 changed files with 35 additions and 101 deletions

View file

@ -4,7 +4,8 @@
#include "std/algorithm.hpp"
#include "std/limits.hpp"
#include "std/utility.hpp"
#include "3party/boost/boost/range/adaptor/transformed.hpp"
namespace coding
{
@ -12,47 +13,6 @@ namespace
{
size_t const kAlphabetSize = static_cast<size_t>(numeric_limits<uint8_t>::max()) + 1;
// A single codeword: |m_code| holds the codeword's bits and |m_length|
// holds the number of bits in the codeword.
struct Code
{
  // Creates an empty, zero-length codeword.
  Code() : m_code{0}, m_length{0} {}

  uint8_t m_code;    // Bits of the codeword.
  uint8_t m_length;  // Length of the codeword, in bits.
};
// Initializes code table for simple dense coding with following code
// words: 0, 1, 00, 01, 10, 11, 000, 001, ...
struct CodeTable
{
public:
CodeTable()
{
size_t rank = 0;
uint8_t length = 1;
while (rank < kAlphabetSize)
{
// Number of codes with the same bit length.
size_t const numCodes = static_cast<size_t>(1) << length;
uint8_t code = 0;
for (; code < numCodes && rank + code < kAlphabetSize; ++code)
{
size_t const pos = rank + code;
m_table[pos].m_code = code;
m_table[pos].m_length = length;
}
rank += code;
length += 1;
}
}
inline Code const & GetCode(uint8_t rank) const { return m_table[rank]; }
private:
Code m_table[kAlphabetSize];
};
// Calculates frequencies for data symbols.
void CalcFrequences(vector<uint8_t> const & data, uint64_t frequency[])
{
@ -65,8 +25,6 @@ void CalcFrequences(vector<uint8_t> const & data, uint64_t frequency[])
SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
{
// This static initialization isn't thread safe prior to C++11.
static CodeTable codeTable;
uint64_t frequency[kAlphabetSize]; // Maps symbols to frequences.
CalcFrequences(data, frequency);
@ -75,57 +33,34 @@ SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
for (size_t i = 0; i < kAlphabetSize; ++i)
symbols[i] = i;
sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lsym, uint8_t rsym)
auto frequencyCmp = [&frequency](uint8_t lsym, uint8_t rsym)
{
return frequency[lsym] > frequency[rsym];
});
};
sort(symbols, symbols + kAlphabetSize, frequencyCmp);
for (size_t r = 0; r < kAlphabetSize; ++r)
rank[symbols[r]] = r;
uint64_t bitLength = 0;
for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol)
bitLength += frequency[symbol] * codeTable.GetCode(rank[symbol]).m_length;
succinct::bit_vector_builder bitsBuilder;
bitsBuilder.reserve(bitLength);
vector<bool> indexBuilder(bitLength);
size_t pos = 0;
for (uint8_t symbol : data)
auto getRank = [&rank](uint8_t sym)
{
Code const & code = codeTable.GetCode(rank[symbol]);
ASSERT_LESS(pos, bitLength, ());
indexBuilder[pos] = 1;
return rank[sym];
};
bitsBuilder.append_bits(code.m_code, code.m_length);
pos += code.m_length;
}
ASSERT_EQUAL(pos, bitLength, ());
succinct::bit_vector(&bitsBuilder).swap(m_bits);
succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index);
using namespace boost::adaptors;
succinct::elias_fano_compressed_list(data | transformed(getRank)).swap(m_ranks);
m_symbols.assign(symbols);
}
SimpleDenseCoding::SimpleDenseCoding(SimpleDenseCoding && rhs)
{
m_bits.swap(rhs.m_bits);
m_index.swap(rhs.m_index);
m_ranks.swap(rhs.m_ranks);
m_symbols.swap(rhs.m_symbols);
}
uint8_t SimpleDenseCoding::Get(uint64_t i) const
{
ASSERT_LESS(i, Size(), ());
uint64_t const start = m_index.select(i);
uint64_t const end = i + 1 == Size() ? m_index.size() : m_index.select(i + 1);
ASSERT_LESS(start, end, ());
uint8_t const length = static_cast<uint8_t>(end - start);
ASSERT_LESS_OR_EQUAL(length, 8, ());
uint8_t const code = m_bits.get_bits(start, length);
uint8_t const rank = (1 << length) - 2 + code;
return m_symbols[rank];
return m_symbols[m_ranks[i]];
}
} // namespace coding

View file

@ -2,16 +2,14 @@
#include "std/vector.hpp"
#include "3party/succinct/bit_vector.hpp"
#include "3party/succinct/mappable_vector.hpp"
#include "3party/succinct/rs_bit_vector.hpp"
#include "3party/succinct/elias_fano_compressed_list.hpp"
namespace coding
{
// This class represents so-called simple dense coding for byte
// strings. It can be used when it's necessary to compress strings
// with skewed entropy and nevertheless efficient access to the
// string's elements is needed.
// This class represents a variant of a so-called simple dense coding
// scheme for byte strings. It can be used when it's necessary to
// compress strings with skewed entropy and nevertheless efficient
// access to the string's elements is needed.
//
// The main idea is to assign codewords from the set { 0, 1, 00, 01,
// 10, 11, 000, ... } to string's symbols in accordance with their
@ -40,21 +38,19 @@ public:
uint8_t Get(uint64_t i) const;
inline uint64_t Size() const { return m_index.num_ones(); }
inline uint64_t Size() const { return m_ranks.size(); }
// map is used here (instead of Map) for compatibility with succinct
// structures.
template <typename TVisitor>
void map(TVisitor & visitor)
{
visitor(m_bits, "m_bits");
visitor(m_index, "m_index");
visitor(m_ranks, "m_ranks");
visitor(m_symbols, "m_symbols");
}
private:
succinct::bit_vector m_bits;
succinct::rs_bit_vector m_index;
succinct::elias_fano_compressed_list m_ranks;
succinct::mapper::mappable_vector<uint8_t> m_symbols;
};
} // namespace coding

View file

@ -49,7 +49,7 @@ void TestTable(vector<uint8_t> const & ranks, string const & path)
}
} // namespace
UNIT_TEST(FeatureRankTableBuilder_Smoke)
UNIT_TEST(RankTableBuilder_Smoke)
{
char const kTestCont[] = "test.tmp";
size_t const kNumRanks = 256;
@ -69,7 +69,7 @@ UNIT_TEST(FeatureRankTableBuilder_Smoke)
TestTable(ranks, kTestCont);
}
UNIT_TEST(FeatureRankTableBuilder_EndToEnd)
UNIT_TEST(RankTableBuilder_EndToEnd)
{
classificator::Load();
@ -89,7 +89,10 @@ UNIT_TEST(FeatureRankTableBuilder_EndToEnd)
search::RankTableBuilder::CalcSearchRanks(rcont, ranks);
}
search::RankTableBuilder::Create(localFile);
{
FilesContainerW wcont(mapPath, FileWriter::OP_WRITE_EXISTING);
search::RankTableBuilder::Create(ranks, wcont);
}
Index index;
auto regResult = index.RegisterMap(localFile);

View file

@ -94,7 +94,7 @@ private:
DISALLOW_COPY(CopiedMemoryRegion);
};
unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR & rcont,
unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR const & rcont,
FilesContainerBase::Tag const & tag)
{
if (!rcont.IsExist(tag))
@ -105,7 +105,7 @@ unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR & rcont,
return make_unique<CopiedMemoryRegion>(move(buffer));
}
unique_ptr<MappedMemoryRegion> GetMemoryRegionForTag(FilesMappingContainer & mcont,
unique_ptr<MappedMemoryRegion> GetMemoryRegionForTag(FilesMappingContainer const & mcont,
FilesContainerBase::Tag const & tag)
{
if (!mcont.IsExist(tag))
@ -242,13 +242,13 @@ uint8_t CalcSearchRank(FeatureType const & ft)
} // namespace
// static
unique_ptr<RankTable> RankTable::Load(FilesContainerR & rcont)
unique_ptr<RankTable> RankTable::Load(FilesContainerR const & rcont)
{
return LoadRankTable(GetMemoryRegionForTag(rcont, RANKS_FILE_TAG));
}
// static
unique_ptr<RankTable> RankTable::Load(FilesMappingContainer & mcont)
unique_ptr<RankTable> RankTable::Load(FilesMappingContainer const & mcont)
{
return LoadRankTable(GetMemoryRegionForTag(mcont, RANKS_FILE_TAG));
}
@ -266,7 +266,7 @@ void RankTableBuilder::CalcSearchRanks(FilesContainerR & rcont, vector<uint8_t>
}
// static
void RankTableBuilder::Create(platform::LocalCountryFile const & localFile)
void RankTableBuilder::CreateIfNotExists(platform::LocalCountryFile const & localFile)
{
string const mapPath = localFile.GetPath(MapOptions::Map);

View file

@ -65,7 +65,7 @@ public:
// *NOTE* Return value can outlive |rcont|. Also note that there is
// undefined behaviour if the ranks section exists but is internally
// damaged.
static unique_ptr<RankTable> Load(FilesContainerR & rcont);
static unique_ptr<RankTable> Load(FilesContainerR const & rcont);
// Maps whole section corresponding to a rank table and deserializes
// it. Returns nullptr if there is no ranks section, rank table's
@ -76,7 +76,7 @@ public:
// destructed before |mcont| is closed. Also note that there is
// undefined behaviour if the ranks section exists but is internally
// damaged.
static unique_ptr<RankTable> Load(FilesMappingContainer & mcont);
static unique_ptr<RankTable> Load(FilesMappingContainer const & mcont);
};
// A builder class for rank tables.
@ -92,7 +92,7 @@ public:
// reverse mapping.
// * When rank table does not exist or exists but is damaged, calculates all
// features' ranks and creates rank table.
static void Create(platform::LocalCountryFile const & localFile);
static void CreateIfNotExists(platform::LocalCountryFile const & localFile);
// Force creation of a rank table from array of ranks. Existing rank
// table is removed (if any). Note that |wcont| must be instantiated