forked from organicmaps/organicmaps
[coding, indexer] Refactored SimpleDenseCoding.
This commit is contained in:
parent
070a14d8ac
commit
00001c40c2
5 changed files with 35 additions and 101 deletions
|
@ -4,7 +4,8 @@
|
|||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/limits.hpp"
|
||||
#include "std/utility.hpp"
|
||||
|
||||
#include "3party/boost/boost/range/adaptor/transformed.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
|
@ -12,47 +13,6 @@ namespace
|
|||
{
|
||||
size_t const kAlphabetSize = static_cast<size_t>(numeric_limits<uint8_t>::max()) + 1;
|
||||
|
||||
struct Code
|
||||
{
|
||||
Code() : m_code(0), m_length(0) {}
|
||||
|
||||
uint8_t m_code;
|
||||
uint8_t m_length;
|
||||
};
|
||||
|
||||
// Initializes code table for simple dense coding with the following code
|
||||
// words: 0, 1, 00, 01, 10, 11, 000, 001, ...
|
||||
struct CodeTable
|
||||
{
|
||||
public:
|
||||
CodeTable()
|
||||
{
|
||||
size_t rank = 0;
|
||||
uint8_t length = 1;
|
||||
while (rank < kAlphabetSize)
|
||||
{
|
||||
// Number of codes with the same bit length.
|
||||
size_t const numCodes = static_cast<size_t>(1) << length;
|
||||
|
||||
uint8_t code = 0;
|
||||
for (; code < numCodes && rank + code < kAlphabetSize; ++code)
|
||||
{
|
||||
size_t const pos = rank + code;
|
||||
m_table[pos].m_code = code;
|
||||
m_table[pos].m_length = length;
|
||||
}
|
||||
|
||||
rank += code;
|
||||
length += 1;
|
||||
}
|
||||
}
|
||||
|
||||
inline Code const & GetCode(uint8_t rank) const { return m_table[rank]; }
|
||||
|
||||
private:
|
||||
Code m_table[kAlphabetSize];
|
||||
};
|
||||
|
||||
// Calculates frequencies for data symbols.
|
||||
void CalcFrequences(vector<uint8_t> const & data, uint64_t frequency[])
|
||||
{
|
||||
|
@ -65,8 +25,6 @@ void CalcFrequences(vector<uint8_t> const & data, uint64_t frequency[])
|
|||
SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
|
||||
{
|
||||
// This static initialization isn't thread safe prior to C++11.
|
||||
static CodeTable codeTable;
|
||||
|
||||
uint64_t frequency[kAlphabetSize]; // Maps symbols to frequences.
|
||||
CalcFrequences(data, frequency);
|
||||
|
||||
|
@ -75,57 +33,34 @@ SimpleDenseCoding::SimpleDenseCoding(vector<uint8_t> const & data)
|
|||
|
||||
for (size_t i = 0; i < kAlphabetSize; ++i)
|
||||
symbols[i] = i;
|
||||
sort(symbols, symbols + kAlphabetSize, [&frequency](uint8_t lsym, uint8_t rsym)
|
||||
|
||||
auto frequencyCmp = [&frequency](uint8_t lsym, uint8_t rsym)
|
||||
{
|
||||
return frequency[lsym] > frequency[rsym];
|
||||
});
|
||||
};
|
||||
sort(symbols, symbols + kAlphabetSize, frequencyCmp);
|
||||
for (size_t r = 0; r < kAlphabetSize; ++r)
|
||||
rank[symbols[r]] = r;
|
||||
|
||||
uint64_t bitLength = 0;
|
||||
for (size_t symbol = 0; symbol < kAlphabetSize; ++symbol)
|
||||
bitLength += frequency[symbol] * codeTable.GetCode(rank[symbol]).m_length;
|
||||
|
||||
succinct::bit_vector_builder bitsBuilder;
|
||||
bitsBuilder.reserve(bitLength);
|
||||
vector<bool> indexBuilder(bitLength);
|
||||
size_t pos = 0;
|
||||
for (uint8_t symbol : data)
|
||||
auto getRank = [&rank](uint8_t sym)
|
||||
{
|
||||
Code const & code = codeTable.GetCode(rank[symbol]);
|
||||
ASSERT_LESS(pos, bitLength, ());
|
||||
indexBuilder[pos] = 1;
|
||||
return rank[sym];
|
||||
};
|
||||
|
||||
bitsBuilder.append_bits(code.m_code, code.m_length);
|
||||
pos += code.m_length;
|
||||
}
|
||||
ASSERT_EQUAL(pos, bitLength, ());
|
||||
|
||||
succinct::bit_vector(&bitsBuilder).swap(m_bits);
|
||||
succinct::rs_bit_vector(indexBuilder, true /* with_select_hints */).swap(m_index);
|
||||
using namespace boost::adaptors;
|
||||
succinct::elias_fano_compressed_list(data | transformed(getRank)).swap(m_ranks);
|
||||
m_symbols.assign(symbols);
|
||||
}
|
||||
|
||||
SimpleDenseCoding::SimpleDenseCoding(SimpleDenseCoding && rhs)
|
||||
{
|
||||
m_bits.swap(rhs.m_bits);
|
||||
m_index.swap(rhs.m_index);
|
||||
m_ranks.swap(rhs.m_ranks);
|
||||
m_symbols.swap(rhs.m_symbols);
|
||||
}
|
||||
|
||||
uint8_t SimpleDenseCoding::Get(uint64_t i) const
|
||||
{
|
||||
ASSERT_LESS(i, Size(), ());
|
||||
uint64_t const start = m_index.select(i);
|
||||
uint64_t const end = i + 1 == Size() ? m_index.size() : m_index.select(i + 1);
|
||||
|
||||
ASSERT_LESS(start, end, ());
|
||||
|
||||
uint8_t const length = static_cast<uint8_t>(end - start);
|
||||
ASSERT_LESS_OR_EQUAL(length, 8, ());
|
||||
|
||||
uint8_t const code = m_bits.get_bits(start, length);
|
||||
uint8_t const rank = (1 << length) - 2 + code;
|
||||
return m_symbols[rank];
|
||||
return m_symbols[m_ranks[i]];
|
||||
}
|
||||
} // namespace coding
|
||||
|
|
|
@ -2,16 +2,14 @@
|
|||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
#include "3party/succinct/bit_vector.hpp"
|
||||
#include "3party/succinct/mappable_vector.hpp"
|
||||
#include "3party/succinct/rs_bit_vector.hpp"
|
||||
#include "3party/succinct/elias_fano_compressed_list.hpp"
|
||||
|
||||
namespace coding
|
||||
{
|
||||
// This class represents so-called simple dense coding for byte
|
||||
// strings. It can be used when it's necessary to compress strings
|
||||
// with skewed entropy and nevertheless efficient access to the
|
||||
// string's elements is needed.
|
||||
// This class represents a variant of a so-called simple dense coding
|
||||
// scheme for byte strings. It can be used when it's necessary to
|
||||
// compress strings with skewed entropy and nevertheless efficient
|
||||
// access to the string's elements is needed.
|
||||
//
|
||||
// The main idea is to assign codewords from the set { 0, 1, 00, 01,
|
||||
// 10, 11, 000, ... } to string's symbols in accordance with their
|
||||
|
@ -40,21 +38,19 @@ public:
|
|||
|
||||
uint8_t Get(uint64_t i) const;
|
||||
|
||||
inline uint64_t Size() const { return m_index.num_ones(); }
|
||||
inline uint64_t Size() const { return m_ranks.size(); }
|
||||
|
||||
// map is used here (instead of Map) for compatibility with succinct
|
||||
// structures.
|
||||
template <typename TVisitor>
|
||||
void map(TVisitor & visitor)
|
||||
{
|
||||
visitor(m_bits, "m_bits");
|
||||
visitor(m_index, "m_index");
|
||||
visitor(m_ranks, "m_ranks");
|
||||
visitor(m_symbols, "m_symbols");
|
||||
}
|
||||
|
||||
private:
|
||||
succinct::bit_vector m_bits;
|
||||
succinct::rs_bit_vector m_index;
|
||||
succinct::elias_fano_compressed_list m_ranks;
|
||||
succinct::mapper::mappable_vector<uint8_t> m_symbols;
|
||||
};
|
||||
} // namespace coding
|
||||
|
|
|
@ -49,7 +49,7 @@ void TestTable(vector<uint8_t> const & ranks, string const & path)
|
|||
}
|
||||
} // namespace
|
||||
|
||||
UNIT_TEST(FeatureRankTableBuilder_Smoke)
|
||||
UNIT_TEST(RankTableBuilder_Smoke)
|
||||
{
|
||||
char const kTestCont[] = "test.tmp";
|
||||
size_t const kNumRanks = 256;
|
||||
|
@ -69,7 +69,7 @@ UNIT_TEST(FeatureRankTableBuilder_Smoke)
|
|||
TestTable(ranks, kTestCont);
|
||||
}
|
||||
|
||||
UNIT_TEST(FeatureRankTableBuilder_EndToEnd)
|
||||
UNIT_TEST(RankTableBuilder_EndToEnd)
|
||||
{
|
||||
classificator::Load();
|
||||
|
||||
|
@ -89,7 +89,10 @@ UNIT_TEST(FeatureRankTableBuilder_EndToEnd)
|
|||
search::RankTableBuilder::CalcSearchRanks(rcont, ranks);
|
||||
}
|
||||
|
||||
search::RankTableBuilder::Create(localFile);
|
||||
{
|
||||
FilesContainerW wcont(mapPath, FileWriter::OP_WRITE_EXISTING);
|
||||
search::RankTableBuilder::Create(ranks, wcont);
|
||||
}
|
||||
|
||||
Index index;
|
||||
auto regResult = index.RegisterMap(localFile);
|
||||
|
|
|
@ -94,7 +94,7 @@ private:
|
|||
DISALLOW_COPY(CopiedMemoryRegion);
|
||||
};
|
||||
|
||||
unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR & rcont,
|
||||
unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR const & rcont,
|
||||
FilesContainerBase::Tag const & tag)
|
||||
{
|
||||
if (!rcont.IsExist(tag))
|
||||
|
@ -105,7 +105,7 @@ unique_ptr<CopiedMemoryRegion> GetMemoryRegionForTag(FilesContainerR & rcont,
|
|||
return make_unique<CopiedMemoryRegion>(move(buffer));
|
||||
}
|
||||
|
||||
unique_ptr<MappedMemoryRegion> GetMemoryRegionForTag(FilesMappingContainer & mcont,
|
||||
unique_ptr<MappedMemoryRegion> GetMemoryRegionForTag(FilesMappingContainer const & mcont,
|
||||
FilesContainerBase::Tag const & tag)
|
||||
{
|
||||
if (!mcont.IsExist(tag))
|
||||
|
@ -242,13 +242,13 @@ uint8_t CalcSearchRank(FeatureType const & ft)
|
|||
} // namespace
|
||||
|
||||
// static
|
||||
unique_ptr<RankTable> RankTable::Load(FilesContainerR & rcont)
|
||||
unique_ptr<RankTable> RankTable::Load(FilesContainerR const & rcont)
|
||||
{
|
||||
return LoadRankTable(GetMemoryRegionForTag(rcont, RANKS_FILE_TAG));
|
||||
}
|
||||
|
||||
// static
|
||||
unique_ptr<RankTable> RankTable::Load(FilesMappingContainer & mcont)
|
||||
unique_ptr<RankTable> RankTable::Load(FilesMappingContainer const & mcont)
|
||||
{
|
||||
return LoadRankTable(GetMemoryRegionForTag(mcont, RANKS_FILE_TAG));
|
||||
}
|
||||
|
@ -266,7 +266,7 @@ void RankTableBuilder::CalcSearchRanks(FilesContainerR & rcont, vector<uint8_t>
|
|||
}
|
||||
|
||||
// static
|
||||
void RankTableBuilder::Create(platform::LocalCountryFile const & localFile)
|
||||
void RankTableBuilder::CreateIfNotExists(platform::LocalCountryFile const & localFile)
|
||||
{
|
||||
string const mapPath = localFile.GetPath(MapOptions::Map);
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ public:
|
|||
// *NOTE* Return value can outlive |rcont|. Also note that there is
|
||||
// undefined behaviour if ranks section exists but internally
|
||||
// damaged.
|
||||
static unique_ptr<RankTable> Load(FilesContainerR & rcont);
|
||||
static unique_ptr<RankTable> Load(FilesContainerR const & rcont);
|
||||
|
||||
// Maps whole section corresponding to a rank table and deserializes
|
||||
// it. Returns nullptr if there is no ranks section, rank table's
|
||||
|
@ -76,7 +76,7 @@ public:
|
|||
// destructed before |mcont| is closed. Also note that there is
|
||||
// undefined behaviour if ranks section exists but internally
|
||||
// damaged.
|
||||
static unique_ptr<RankTable> Load(FilesMappingContainer & mcont);
|
||||
static unique_ptr<RankTable> Load(FilesMappingContainer const & mcont);
|
||||
};
|
||||
|
||||
// A builder class for rank tables.
|
||||
|
@ -92,7 +92,7 @@ public:
|
|||
// reverse mapping.
|
||||
// * When rank table does not exist or exists but is damaged, calculates all
|
||||
// features' ranks and creates rank table.
|
||||
static void Create(platform::LocalCountryFile const & localFile);
|
||||
static void CreateIfNotExists(platform::LocalCountryFile const & localFile);
|
||||
|
||||
// Force creation of a rank table from array of ranks. Existing rank
|
||||
// table is removed (if any). Note that |wcont| must be instantiated
|
||||
|
|
Loading…
Add table
Reference in a new issue