[3party] Removed the PagedArray from our bsdiff implementation.

We have enough address space even when computing diffs for the largest mwms.
This commit is contained in:
Maxim Pimenov 2019-01-24 13:48:15 +03:00 committed by Arsentiy Milchakov
parent 17af61c395
commit 878329aefc
7 changed files with 24 additions and 524 deletions

View file

@ -9,7 +9,6 @@ set(
bsdiff/bsdiff.h
bsdiff/bsdiff_common.h
bsdiff/bsdiff_search.h
bsdiff/paged_array.h
divsufsort/divsufsort.cc
divsufsort/divsufsort.h
divsufsort/divsufsort_private.h

View file

@ -38,6 +38,9 @@
// all routines to use MAPS.ME readers and writers instead
// of Courgette streams and files.
// --Maxim Pimenov <m@maps.me>
// 2019-01-24 - Got rid of the paged array. We have enough address space
// for our application of bsdiff.
// --Maxim Pimenov <m@maps.me>
// Changelog for bsdiff_apply:
// 2009-03-31 - Change to use Streams. Move CRC code to crc.{h,cc}
@ -89,7 +92,6 @@
#include "3party/bsdiff-courgette/bsdiff/bsdiff_common.h"
#include "3party/bsdiff-courgette/bsdiff/bsdiff_search.h"
#include "3party/bsdiff-courgette/bsdiff/paged_array.h"
#include "3party/bsdiff-courgette/divsufsort/divsufsort.h"
#include "zlib.h"
@ -141,16 +143,20 @@ BSDiffStatus CreateBinaryPatch(OldReader & old_reader,
old_source.Read(old_buf.data(), old_buf.size());
const uint8_t * old = old_buf.data();
courgette::PagedArray<divsuf::saidx_t> I;
if (!I.Allocate(old_size + 1)) {
LOG(LERROR, ("Could not allocate I[], ", ((old_size + 1) * sizeof(int)), "bytes"));
return MEM_ERROR;
std::vector<divsuf::saidx_t> I;
try
{
I.resize(old_size + 1);
}
catch (...)
{
LOG(LERROR, ("Could not allocate I[], ", ((old_size + 1) * sizeof(int)), "bytes"));
return MEM_ERROR;
}
base::Timer suf_sort_timer;
divsuf::saint_t result = divsuf::divsufsort_include_empty(
old, I.begin(), old_size);
old, I.data(), old_size);
LOG(LINFO, ("Done divsufsort", suf_sort_timer.ElapsedSeconds()));
if (result != 0)
return UNEXPECTED_ERROR;
@ -215,7 +221,7 @@ BSDiffStatus CreateBinaryPatch(OldReader & old_reader,
scan += match.size;
for (int scsc = scan; scan < new_size; ++scan) {
match = search<courgette::PagedArray<divsuf::saidx_t>&>(
match = search<decltype(I)>(
I, old, old_size, newbuf + scan, new_size - scan);
for (; scsc < scan + match.size; scsc++)

View file

@ -71,14 +71,14 @@ inline int matchlen(const unsigned char* buf1,
// |pos| a position of best match in |old|. If multiple such positions exist,
// |pos| would take an arbitrary one.
template <class T>
SearchResult search(T I,
SearchResult search(const T & I,
const unsigned char* srcbuf,
int srcsize,
const unsigned char* keybuf,
int keysize) {
int lo = 0;
int hi = srcsize;
while (hi - lo >= 2) {
while (hi - lo > 1) {
int mid = (lo + hi) / 2;
if (std::lexicographical_compare(
srcbuf + I[mid], srcbuf + srcsize, keybuf, keybuf + keysize)) {

View file

@ -5,9 +5,9 @@
#include "courgette/third_party/bsdiff/bsdiff_search.h"
#include <cstring>
#include <vector>
#include "base/macros.h"
#include "courgette/third_party/bsdiff/paged_array.h"
#include "courgette/third_party/divsufsort/divsufsort.h"
#include "testing/gtest/include/gtest/gtest.h"
@ -18,9 +18,8 @@ TEST(BSDiffSearchTest, Search) {
const char* str = "the quick brown fox jumps over the lazy dog.";
int size = static_cast<int>(::strlen(str));
const unsigned char* buf = reinterpret_cast<const unsigned char*>(str);
courgette::PagedArray<divsuf::saidx_t> I;
ASSERT_TRUE(I.Allocate(size + 1));
divsuf::divsufsort_include_empty(buf, I.begin(), size);
std::vector<divsuf::saidx_t> I(size + 1);
divsuf::divsufsort_include_empty(buf, I.data(), size);
// Specific queries.
const struct {
@ -65,7 +64,7 @@ TEST(BSDiffSearchTest, Search) {
// Perform the search.
bsdiff::SearchResult match =
bsdiff::search<courgette::PagedArray<divsuf::saidx_t>&>(
bsdiff::search<decltype(I)>(
I, buf, size, query_buf, query_size);
// Check basic properties and match with expected values.
@ -101,9 +100,9 @@ TEST(BSDiffSearchTest, SearchExact) {
int size = static_cast<int>(::strlen(test_cases[idx]));
const unsigned char* buf =
reinterpret_cast<const unsigned char*>(test_cases[idx]);
courgette::PagedArray<divsuf::saidx_t> I;
ASSERT_TRUE(I.Allocate(size + 1));
divsuf::divsufsort_include_empty(buf, I.begin(), size);
std::vector<divsuf::saidx_t> I(size + 1);
divsuf::divsufsort_include_empty(buf, I.data(), size);
// Test exact matches for every non-empty substring.
for (int lo = 0; lo < size; ++lo) {
@ -114,7 +113,7 @@ TEST(BSDiffSearchTest, SearchExact) {
const unsigned char* query_buf =
reinterpret_cast<const unsigned char*>(query.c_str());
bsdiff::SearchResult match =
bsdiff::search<courgette::PagedArray<divsuf::saidx_t>&>(
bsdiff::search<decltype(I)>(
I, buf, size, query_buf, query_size);
EXPECT_EQ(query_size, match.size);

View file

@ -1,249 +0,0 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// PagedArray implements an array stored using many fixed-size pages.
//
// PagedArray is a work-around to allow large arrays to be allocated when there
// is too much address space fragmentation for allocating the large arrays as
// contiguous arrays.
#ifndef COURGETTE_THIRD_PARTY_BSDIFF_PAGED_ARRAY_H_
#define COURGETTE_THIRD_PARTY_BSDIFF_PAGED_ARRAY_H_
#include <cstddef>
#include <cstdlib>
#include <iterator>
#include <type_traits>
#include "base/macros.hpp"
namespace courgette {
// Page size of 2^18 * sizeof(T) is 1MB for T = int32_t.
constexpr int kPagedArrayDefaultPageLogSize = 18;
template <typename T, int LOG_PAGE_SIZE = kPagedArrayDefaultPageLogSize>
class PagedArray;
// A random access iterator with pointer-like semantics, for PagedArray.
template <typename ContainerType, typename T>
class PagedArray_iterator {
public:
using ThisType = PagedArray_iterator<ContainerType, T>;
using difference_type = ptrdiff_t;
using value_type = typename std::remove_const<T>::type;
using reference = T&;
using pointer = T*;
using iterator_category = std::random_access_iterator_tag;
PagedArray_iterator() : array_(nullptr), index_(0U) {}
PagedArray_iterator(ContainerType* array, size_t index)
: array_(array), index_(index) {}
template <typename ContainerType2, typename T2>
PagedArray_iterator(const PagedArray_iterator<ContainerType2, T2>& it)
: array_(it.array_), index_(it.index_) {}
PagedArray_iterator(std::nullptr_t) : array_(nullptr), index_(0) {}
~PagedArray_iterator() = default;
reference operator*() const { return (*array_)[index_]; }
reference operator[](size_t idx) const { return (*array_)[index_ + idx]; }
pointer operator->() const { return &(*array_)[index_]; }
ThisType& operator=(std::nullptr_t) {
array_ = nullptr;
index_ = 0;
return *this;
}
ThisType& operator++() {
++index_;
return *this;
}
ThisType& operator--() {
--index_;
return *this;
}
ThisType operator++(int) { return ThisType(array_, index_++); }
ThisType operator--(int) { return ThisType(array_, index_--); }
ThisType& operator+=(difference_type delta) {
index_ += delta;
return *this;
}
ThisType& operator-=(difference_type delta) {
index_ -= delta;
return *this;
}
ThisType operator+(difference_type offset) const {
return ThisType(array_, index_ + offset);
}
ThisType operator-(difference_type offset) const {
return ThisType(array_, index_ - offset);
}
template <typename ContainerType2, typename T2>
bool operator==(const PagedArray_iterator<ContainerType2, T2>& it) const {
return index_ == it.index_ && array_ == it.array_;
}
bool operator==(std::nullptr_t) const {
return index_ == 0 && array_ == nullptr;
}
template <typename ContainerType2, typename T2>
bool operator!=(const PagedArray_iterator<ContainerType2, T2>& it) const {
return !(*this == it);
}
template <typename ContainerType2, typename T2>
bool operator<(const PagedArray_iterator<ContainerType2, T2>& it) const {
#ifdef DEBUG
// For performance, skip the |array_| check in Release builds.
if (array_ != it.array_)
return false;
#endif
return index_ < it.index_;
}
template <typename ContainerType2, typename T2>
bool operator<=(const PagedArray_iterator<ContainerType2, T2>& it) const {
#ifdef DEBUG
// For performance, skip the |array_| check in Release builds.
if (array_ != it.array_)
return false;
#endif
return index_ <= it.index_;
}
template <typename ContainerType2, typename T2>
bool operator>(const PagedArray_iterator<ContainerType2, T2>& it) const {
#ifdef DEBUG
// For performance, skip the |array_| check in Release builds.
if (array_ != it.array_)
return false;
#endif
return index_ > it.index_;
}
template <typename ContainerType2, typename T2>
bool operator>=(const PagedArray_iterator<ContainerType2, T2>& it) const {
#ifdef DEBUG
// For performance, skip the |array_| check in Release builds.
if (array_ != it.array_)
return false;
#endif
return index_ >= it.index_;
}
template <typename ContainerType2, typename T2>
difference_type operator-(
const PagedArray_iterator<ContainerType2, T2>& it) const {
return index_ - it.index_;
}
private:
template <typename, typename>
friend class PagedArray_iterator;
ContainerType* array_;
size_t index_;
};
// PagedArray implements an array stored using many fixed-size pages.
template <typename T, int LOG_PAGE_SIZE>
class PagedArray {
enum {
// Page size in elements.
kLogPageSize = LOG_PAGE_SIZE,
kPageSize = 1 << LOG_PAGE_SIZE
};
public:
using ThisType = PagedArray<T, LOG_PAGE_SIZE>;
using const_iterator = PagedArray_iterator<const ThisType, const T>;
using iterator = PagedArray_iterator<ThisType, T>;
PagedArray() = default;
~PagedArray() { clear(); }
iterator begin() { return iterator(this, 0); }
iterator end() { return iterator(this, size_); }
const_iterator begin() const { return const_iterator(this, 0); }
const_iterator end() const { return const_iterator(this, size_); }
T& operator[](size_t i) {
size_t page = i >> kLogPageSize;
size_t offset = i & (kPageSize - 1);
// *NOTE* CHECK() would significaltly slow down bsdiff_create
// even in optimized Release build (about 1.4x).
ASSERT_LESS(page, page_count_, ());
return pages_[page][offset];
}
const T& operator[](size_t i) const {
// Duplicating code here for performance. If we use common code for this
// then bsdiff_create slows down by ~5% in optimized Release build.
size_t page = i >> kLogPageSize;
size_t offset = i & (kPageSize - 1);
// *NOTE* CHECK() would significaltly slow down bsdiff_create
// even in optimized Release build (about 1.4x).
ASSERT_LESS(page, page_count_, ());
return pages_[page][offset];
}
// Allocates storage for |size| elements. Returns true on success and false if
// allocation fails.
bool Allocate(size_t size) {
clear();
size_ = size;
size_t pages_needed = (size_ + kPageSize - 1) >> kLogPageSize;
if (!UncheckedMalloc(sizeof(T*) * pages_needed,
reinterpret_cast<void**>(&pages_))) {
return false;
}
for (page_count_ = 0; page_count_ < pages_needed; ++page_count_) {
T* block = nullptr;
if (!UncheckedMalloc(sizeof(T) * kPageSize,
reinterpret_cast<void**>(&block))) {
clear();
return false;
}
pages_[page_count_] = block;
}
return true;
}
// Releases all storage. May be called more than once.
void clear() {
if (pages_ != nullptr) {
while (page_count_ != 0) {
--page_count_;
free(pages_[page_count_]);
}
free(pages_);
pages_ = nullptr;
}
}
private:
T** pages_ = nullptr;
size_t size_ = 0U;
size_t page_count_ = 0U;
bool UncheckedMalloc(size_t size, void** result) {
*result = malloc(size);
return *result != NULL;
}
DISALLOW_COPY_AND_MOVE(PagedArray);
};
} // namespace courgette
#endif // COURGETTE_THIRD_PARTY_BSDIFF_PAGED_ARRAY_H_

View file

@ -1,248 +0,0 @@
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "courgette/third_party/bsdiff/paged_array.h"
#include <stdint.h>
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>
#include "testing/gtest/include/gtest/gtest.h"
namespace {
#if !defined(ADDRESS_SANITIZER) || !defined(OS_WIN)
// Total allocation of 4GB will fail in 32 bit programs if allocations are
// leaked.
const int kIterations = 20;
const int kSizeBig = 200 * 1024 * 1024 / sizeof(int); // 200MB
#endif
const size_t kLogBlockSizeSmall = 10;
const size_t kBlockSizeSmall = 1 << kLogBlockSizeSmall;
const size_t kSizeList[] = {1,
16,
123,
kBlockSizeSmall,
kBlockSizeSmall + 1,
123 * kBlockSizeSmall + 567};
const size_t kIteratorTestDataSize = 7;
const int32_t kIteratorTestData[kIteratorTestDataSize] = {2, 3, 5, 7,
11, 13, 17};
} // namespace
class PagedArrayTest : public testing::Test {
public:
template <typename Iterator>
void TestIteratorBasic(Iterator begin, Iterator end) {
EXPECT_FALSE(begin == nullptr);
EXPECT_FALSE(end == nullptr);
// Test TestPagedArray::iterator data read.
Iterator it = begin;
EXPECT_EQ(2, *it);
EXPECT_EQ(3, *(it + 1));
EXPECT_EQ(5, it[2]); // Pseudo-pointer.
EXPECT_EQ(7, (it + 1)[2]);
EXPECT_EQ(3, *(++it));
EXPECT_EQ(3, *it);
EXPECT_EQ(3, *(it++));
EXPECT_EQ(5, *it);
EXPECT_EQ(11, *(it += 2));
EXPECT_EQ(11, *it);
EXPECT_FALSE(it == begin);
EXPECT_TRUE(it != begin);
EXPECT_TRUE(it == (begin + 4));
EXPECT_FALSE(it != (begin + 4));
EXPECT_EQ(static_cast<ptrdiff_t>(kIteratorTestDataSize), end - begin);
it = begin + 5;
EXPECT_EQ(begin, it - 5);
EXPECT_EQ(13, *it);
EXPECT_EQ(11, *(--it));
EXPECT_EQ(11, *it);
EXPECT_EQ(11, *(it--));
EXPECT_EQ(7, *it);
EXPECT_EQ(3, *(it -= 2));
EXPECT_EQ(3, *it);
// Test binary operators for every pair of iterator.
EXPECT_TRUE(begin <= end);
EXPECT_TRUE(begin < end);
for (size_t i = 0; i < kIteratorTestDataSize; ++i) {
Iterator it1 = begin + i;
EXPECT_TRUE(it1 == it1);
EXPECT_FALSE(it1 != it1);
EXPECT_TRUE(begin <= it1);
EXPECT_FALSE(begin > it1);
EXPECT_TRUE(it1 >= begin);
EXPECT_FALSE(it1 < begin);
EXPECT_EQ(begin, it1 - i);
EXPECT_EQ(end, it1 + (kIteratorTestDataSize - i));
EXPECT_EQ(kIteratorTestData[i], *it1);
EXPECT_EQ(kIteratorTestData[i], begin[i]);
for (size_t j = 0; i + j < kIteratorTestDataSize; ++j) {
Iterator it2 = it1 + j;
EXPECT_TRUE(it1 <= it2);
EXPECT_FALSE(it1 > it2);
EXPECT_TRUE(it2 >= it1);
EXPECT_FALSE(it2 < it1);
if (j > 0) {
EXPECT_TRUE(it1 < it2);
EXPECT_FALSE(it1 >= it2);
EXPECT_TRUE(it2 > it1);
EXPECT_FALSE(it2 <= it1);
EXPECT_FALSE(it1 == it2);
EXPECT_TRUE(it1 != it2);
}
EXPECT_EQ(kIteratorTestData[i + j], it1[j]); // Pseudo-pointer.
EXPECT_EQ(kIteratorTestData[i + j], *it2);
EXPECT_EQ(static_cast<ptrdiff_t>(j), it2 - it1);
EXPECT_EQ(it1, it2 - j);
}
EXPECT_TRUE(it1 < end);
EXPECT_FALSE(it1 >= end);
EXPECT_TRUE(it1 <= end);
EXPECT_FALSE(it1 > end);
EXPECT_TRUE(end > it1);
EXPECT_FALSE(end <= it1);
EXPECT_TRUE(end >= it1);
EXPECT_FALSE(end < it1);
EXPECT_TRUE(it1 != end);
EXPECT_FALSE(it1 == end);
}
// Test initialize with null.
Iterator it_dummy;
it_dummy = nullptr;
EXPECT_TRUE(it_dummy == nullptr);
// Test copy constructor.
Iterator begin_copy(begin);
EXPECT_TRUE(begin_copy == begin);
// Test STL read-only usage.
std::vector<typename std::iterator_traits<Iterator>::value_type> v(begin,
end);
EXPECT_TRUE(std::equal(begin, end, v.begin()));
EXPECT_TRUE(std::equal(v.begin(), v.end(), begin));
}
};
// AddressSanitizer on Windows adds additional memory overhead, which
// causes these tests to go OOM and fail.
#if !defined(ADDRESS_SANITIZER) || !defined(OS_WIN)
TEST_F(PagedArrayTest, TestManyAllocationsDestructorFree) {
for (int i = 0; i < kIterations; ++i) {
courgette::PagedArray<int> a;
EXPECT_TRUE(a.Allocate(kSizeBig));
}
}
TEST_F(PagedArrayTest, TestManyAllocationsManualFree) {
courgette::PagedArray<int> a;
for (int i = 0; i < kIterations; ++i) {
EXPECT_TRUE(a.Allocate(kSizeBig));
a.clear();
}
}
#endif
TEST_F(PagedArrayTest, TestAccess) {
for (size_t size : kSizeList) {
courgette::PagedArray<int32_t, kLogBlockSizeSmall> a;
EXPECT_TRUE(a.Allocate(size));
for (size_t i = 0; i < size; ++i)
a[i] = i;
for (size_t i = 0; i < size; ++i)
EXPECT_EQ(static_cast<int32_t>(i), a[i]);
}
}
TEST_F(PagedArrayTest, TestIterator) {
using TestPagedArray = courgette::PagedArray<int32_t, 1>; // Page size = 2.
TestPagedArray::const_iterator uninit_const_it;
EXPECT_TRUE(uninit_const_it == nullptr);
TestPagedArray::iterator uninit_it;
EXPECT_TRUE(uninit_it == nullptr);
TestPagedArray a;
EXPECT_TRUE(a.Allocate(kIteratorTestDataSize));
std::copy(kIteratorTestData, kIteratorTestData + kIteratorTestDataSize,
a.begin());
const TestPagedArray& a_const = a;
// Test TestPagedArray::const_iterator.
TestIteratorBasic(a_const.begin(), a_const.end());
// Test TestPagedArray::iterator.
TestIteratorBasic(a.begin(), a.end());
// Test equality of const and non-const.
EXPECT_TRUE(a.begin() == a_const.begin());
EXPECT_TRUE(a_const.begin() == a.begin());
// Test casting from non-const to const.
TestPagedArray::iterator non_const_it = a.begin();
EXPECT_TRUE(non_const_it == a.begin());
TestPagedArray::const_iterator const_it = non_const_it;
EXPECT_TRUE(const_it == non_const_it);
// The following should and will emit compile error:
// non_const_it = a_const.begin();
// Test copy constructor from non-const to const.
TestPagedArray::iterator const_it2(a.begin());
EXPECT_TRUE(const_it2 == a.begin());
// Test pointer distance from non-const to const.
EXPECT_EQ(static_cast<ptrdiff_t>(kIteratorTestDataSize),
a.end() - a_const.begin());
EXPECT_EQ(static_cast<ptrdiff_t>(kIteratorTestDataSize),
a_const.end() - a.begin());
// Test operator->().
struct TestStruct {
int32_t value = 0;
};
using TestStructPagedArray = courgette::PagedArray<TestStruct, 1>;
TestStructPagedArray b;
b.Allocate(3);
b[0].value = 100;
b[1].value = 200;
b[2].value = 300;
const TestStructPagedArray& b_const = b;
EXPECT_EQ(100, b.begin()->value);
EXPECT_EQ(100, b_const.begin()->value);
EXPECT_EQ(200, (b.begin() + 1)->value);
EXPECT_EQ(200, (b_const.begin() + 1)->value);
(b.begin() + 2)->value *= -1;
EXPECT_EQ(-300, (b.begin() + 2)->value);
EXPECT_EQ(-300, (b_const.begin() + 2)->value);
}
// Test generic read-write of itrators by sorting pseudo-random numbers.
TEST_F(PagedArrayTest, TestSort) {
courgette::PagedArray<int> a;
std::minstd_rand pseudo_rand_gen; // Deterministic, using defaults.
for (size_t size : kSizeList) {
std::vector<int32_t> v(size);
courgette::PagedArray<int32_t, kLogBlockSizeSmall> a;
EXPECT_TRUE(a.Allocate(size));
for (size_t i = 0; i < size; ++i) {
v[i] = pseudo_rand_gen();
a[i] = v[i];
}
std::sort(v.begin(), v.end());
std::sort(a.begin(), a.end());
for (size_t i = 0; i < size; ++i)
EXPECT_EQ(v[i], a[i]);
}
}

View file

@ -30,8 +30,6 @@
#include <stdint.h>
#include "3party/bsdiff-courgette/bsdiff/paged_array.h"
namespace divsuf {
/*- Datatypes -*/
@ -39,13 +37,8 @@ typedef int32_t saint_t;
typedef int32_t saidx_t;
typedef uint8_t sauchar_t;
#ifdef DIVSUFSORT_NO_PAGED_ARRAY
typedef saidx_t* saidx_it;
typedef const saidx_t* const_saidx_it;
#else
typedef courgette::PagedArray<saidx_t>::iterator saidx_it;
typedef courgette::PagedArray<saidx_t>::const_iterator const_saidx_it;
#endif
/*- Prototypes -*/