Review fixes.

This commit is contained in:
Maxim Pimenov 2018-07-09 18:35:33 +03:00 committed by Tatiana Yan
parent 82bfd52260
commit 29ded65c09
6 changed files with 102 additions and 47 deletions

View file

@ -8,7 +8,6 @@
#include "coding/reader.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
@ -88,30 +87,31 @@ private:
// A PostingsFetcher backed by an in-memory map from tokens to postings
// vectors. Tokens are visited in the map's iteration order.
class MemPostingsFetcher : public PostingsFetcher
{
public:
// |postingsByToken| is stored by reference in |m_postingsByToken| and
// |m_it| points into it, so the map must outlive this fetcher.
MemPostingsFetcher(std::map<Token, std::vector<Posting>> const & postingsByToken)
explicit MemPostingsFetcher(std::map<Token, std::vector<Posting>> const & postingsByToken)
: m_postingsByToken(postingsByToken), m_it(m_postingsByToken.begin())
{
// todo(@m) An unnecessary copy?
m_postings.reserve(postingsByToken.size());
for (auto const & entry : postingsByToken)
m_postings.emplace_back(entry.second);
}
// PostingsFetcher overrides:
bool GetPostingsForNextToken(std::vector<uint32_t> & postings)
// True while |m_it| has not reached the end of |m_postingsByToken|.
bool IsValid() const override { return m_it != m_postingsByToken.end(); }
// Steps |m_it| to the next map entry; does nothing once the end is reached.
void Advance() override
{
CHECK_LESS_OR_EQUAL(m_tokenId, m_postings.size(), ());
if (m_tokenId == m_postings.size())
return false;
postings.swap(m_postings[m_tokenId++]);
return true;
if (m_it != m_postingsByToken.end())
++m_it;
}
// Calls |fn| for every element of the postings vector of the entry
// |m_it| points to. CHECKs that the fetcher is still valid first.
void ForEachPosting(Fn const & fn) const override
{
CHECK(IsValid(), ());
for (uint32_t p : m_it->second)
fn(p);
}
private:
std::vector<std::vector<uint32_t>> m_postings;
// Index of the next token to be processed. The
// copy of the postings list in |m_postings| is not guaranteed
// to be valid after it's been processed.
size_t m_tokenId = 0;
// The source map; held by reference, not owned.
std::map<Token, std::vector<Posting>> const & m_postingsByToken;
// Iterator to the current token that will be processed when ForEachPosting is called.
std::map<Token, std::vector<Posting>>::const_iterator m_it;
};
void SortPostings();

View file

@ -14,6 +14,8 @@
#include "base/stl_helpers.hpp"
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <utility>
#include <vector>
@ -30,45 +32,67 @@ public:
TextIndexReader const & index2)
: m_dict(dict), m_index1(index1), m_index2(index2)
{
ReadPostings();
}
// PostingsFetcher overrides:
// Returns true while |m_tokenId| is a valid index into the tokens of
// |m_dict|. CHECKs that |m_tokenId| never exceeds the number of tokens.
bool GetPostingsForNextToken(std::vector<uint32_t> & postings)
bool IsValid() const override
{
postings.clear();
auto const & tokens = m_dict.GetTokens();
CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
return m_tokenId < tokens.size();
}
// Moves on to the next token of |m_dict| and calls ReadPostings() for it.
// Returns early, changing nothing, when |m_tokenId| already equals the
// number of tokens.
void Advance() override
{
auto const & tokens = m_dict.GetTokens();
CHECK_LESS_OR_EQUAL(m_tokenId, tokens.size(), ());
if (m_tokenId == tokens.size())
return false;
return;
m_index1.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(postings));
m_index2.ForEachPosting(tokens[m_tokenId], MakeBackInsertFunctor(postings));
my::SortUnique(postings);
++m_tokenId;
return true;
ReadPostings();
}
// Hands every element of |m_postings| to |fn|, front to back.
// The fetcher must be valid; this is enforced with CHECK.
void ForEachPosting(Fn const & fn) const override
{
  CHECK(IsValid(), ());
  for (auto it = m_postings.begin(); it != m_postings.end(); ++it)
    fn(*it);
}
private:
// Refills |m_postings| for the token at |m_tokenId|. The buffer is always
// emptied first; when the fetcher is still valid, the postings reported by
// both indexes for that token are appended and the result is passed
// through my::SortUnique.
void ReadPostings()
{
  m_postings.clear();
  if (IsValid())
  {
    auto const & tokens = m_dict.GetTokens();
    auto const & token = tokens[m_tokenId];
    m_index1.ForEachPosting(token, MakeBackInsertFunctor(m_postings));
    m_index2.ForEachPosting(token, MakeBackInsertFunctor(m_postings));
    my::SortUnique(m_postings);
  }
}
TextIndexDictionary const & m_dict;
TextIndexReader const & m_index1;
TextIndexReader const & m_index2;
// Index of the next token from |m_dict| to be processed.
size_t m_tokenId = 0;
vector<uint32_t> m_postings;
};
TextIndexDictionary MergeDictionaries(TextIndexDictionary const & dict1,
TextIndexDictionary const & dict2)
{
vector<Token> commonTokens = dict1.GetTokens();
for (auto const & token : dict2.GetTokens())
{
size_t dummy;
if (!dict1.GetTokenId(token, dummy))
commonTokens.emplace_back(token);
}
vector<Token> commonTokens;
auto const & ts1 = dict1.GetTokens();
auto const & ts2 = dict2.GetTokens();
merge(ts1.begin(), ts1.end(), ts2.begin(), ts2.end(), back_inserter(commonTokens));
ASSERT(is_sorted(commonTokens.begin(), commonTokens.end()), ());
commonTokens.erase(unique(commonTokens.begin(), commonTokens.end()), commonTokens.end());
sort(commonTokens.begin(), commonTokens.end());
TextIndexDictionary dict;
dict.SetTokens(move(commonTokens));
return dict;

View file

@ -5,7 +5,10 @@
#include "search/base/text_index/utils.hpp"
#include "coding/varint.hpp"
#include "coding/write_to_sink.hpp"
#include <cstdint>
#include <functional>
#include <vector>
namespace search
@ -20,11 +23,21 @@ struct TextIndexHeader;
// Abstract source of postings lists that is consumed one token at a time.
// A consumer loops while IsValid() holds, calling ForEachPosting() for
// the current token and then Advance().
class PostingsFetcher
{
public:
// Returns true and fills |postings| with the postings list of the next token
// when there is one.
// Returns false if the underlying source is exhausted, i.e. there are
// no more tokens left.
virtual bool GetPostingsForNextToken(std::vector<uint32_t> & postings) = 0;
// Callback type that receives a single posting.
using Fn = std::function<void(uint32_t)>;
virtual ~PostingsFetcher() = default;
// Returns true when there are tokens left in the fetcher and false otherwise.
virtual bool IsValid() const = 0;
// Advances fetcher to the next token.
virtual void Advance() = 0;
// Calls |fn| for every posting for the current token. Initially,
// current token is the first token and then calls to Advance
// may be used to process the next token until the underlying
// source of the tokens is exhausted and the fetcher is no longer valid.
virtual void ForEachPosting(Fn const & fn) const = 0;
};
// Fetches the postings list one by one from |fetcher| and writes them
@ -44,20 +57,21 @@ void WritePostings(Sink & sink, uint64_t startPos, TextIndexHeader & header,
std::vector<uint32_t> postingsStarts;
postingsStarts.reserve(header.m_numTokens);
// todo(@m) s/uint32_t/Posting/ ?
std::vector<uint32_t> postings;
while (fetcher.GetPostingsForNextToken(postings))
{
postingsStarts.emplace_back(RelativePos(sink, startPos));
uint32_t last = 0;
for (auto const p : postings)
{
uint32_t last;
// todo(@m) s/uint32_t/Posting/ ?
auto writePostings = [&](uint32_t p) {
CHECK(last == 0 || last < p, (last, p));
uint32_t const delta = p - last;
WriteVarUint(sink, delta);
last = p;
};
while (fetcher.IsValid())
{
postingsStarts.emplace_back(RelativePos(sink, startPos));
last = 0;
fetcher.ForEachPosting(writePostings);
fetcher.Advance();
}
}
// One more for convenience.

View file

@ -1,5 +1,6 @@
#pragma once
#include "search/base/text_index/dictionary.hpp"
#include "search/base/text_index/text_index.hpp"
#include "coding/file_reader.hpp"

View file

@ -9,7 +9,7 @@ namespace search
namespace base
{
// Returns the distance from |startPos| to the current position of |sink|
// (sink.Pos()), converted to uint32_t via ::base::checked_cast.
template <typename Sink>
static uint32_t RelativePos(Sink & sink, uint64_t startPos)
uint32_t RelativePos(Sink & sink, uint64_t startPos)
{
return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
}

View file

@ -100,6 +100,10 @@
3974BB921FB4723000F265E5 /* testingmain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 671C62241AE9229A00076BD0 /* testingmain.cpp */; };
397AFE061D6C9AC700F583E7 /* downloader_search_callback.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 397AFE041D6C9AC700F583E7 /* downloader_search_callback.cpp */; };
397AFE071D6C9AC700F583E7 /* downloader_search_callback.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 397AFE051D6C9AC700F583E7 /* downloader_search_callback.hpp */; };
39831A3820F3C5C6005FF294 /* merger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 39831A3420F3C5C6005FF294 /* merger.cpp */; };
39831A3920F3C5C6005FF294 /* utils.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3520F3C5C6005FF294 /* utils.hpp */; };
39831A3A20F3C5C6005FF294 /* merger.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3620F3C5C6005FF294 /* merger.hpp */; };
39831A3B20F3C5C6005FF294 /* postings.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39831A3720F3C5C6005FF294 /* postings.hpp */; };
39AEF8361FB4597300943FC9 /* tracer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 39AEF8351FB4597300943FC9 /* tracer.cpp */; };
39AEF8381FB4597F00943FC9 /* point_rect_matcher.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39AEF8371FB4597E00943FC9 /* point_rect_matcher.hpp */; };
39AEF83A1FB4598900943FC9 /* tracer.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 39AEF8391FB4598900943FC9 /* tracer.hpp */; };
@ -379,6 +383,10 @@
3936A60C20EA2F5F00A68C09 /* mem.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; name = mem.hpp; path = ../text_index/mem.hpp; sourceTree = "<group>"; };
397AFE041D6C9AC700F583E7 /* downloader_search_callback.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = downloader_search_callback.cpp; sourceTree = "<group>"; };
397AFE051D6C9AC700F583E7 /* downloader_search_callback.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = downloader_search_callback.hpp; sourceTree = "<group>"; };
39831A3420F3C5C6005FF294 /* merger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = merger.cpp; sourceTree = "<group>"; };
39831A3520F3C5C6005FF294 /* utils.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = utils.hpp; sourceTree = "<group>"; };
39831A3620F3C5C6005FF294 /* merger.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = merger.hpp; sourceTree = "<group>"; };
39831A3720F3C5C6005FF294 /* postings.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = postings.hpp; sourceTree = "<group>"; };
39AEF8351FB4597300943FC9 /* tracer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tracer.cpp; sourceTree = "<group>"; };
39AEF8371FB4597E00943FC9 /* point_rect_matcher.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = point_rect_matcher.hpp; sourceTree = "<group>"; };
39AEF8391FB4598900943FC9 /* tracer.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = tracer.hpp; sourceTree = "<group>"; };
@ -673,6 +681,10 @@
3936A60420EA2F3E00A68C09 /* text_index */ = {
isa = PBXGroup;
children = (
39831A3420F3C5C6005FF294 /* merger.cpp */,
39831A3620F3C5C6005FF294 /* merger.hpp */,
39831A3720F3C5C6005FF294 /* postings.hpp */,
39831A3520F3C5C6005FF294 /* utils.hpp */,
3936A60920EA2F5E00A68C09 /* dictionary.hpp */,
3936A60520EA2F5E00A68C09 /* header.cpp */,
3936A60620EA2F5E00A68C09 /* header.hpp */,
@ -999,11 +1011,14 @@
3461C9A21D79949600E6E6F5 /* categories_set.hpp in Headers */,
397AFE071D6C9AC700F583E7 /* downloader_search_callback.hpp in Headers */,
3936A60F20EA2F5F00A68C09 /* text_index.hpp in Headers */,
39831A3A20F3C5C6005FF294 /* merger.hpp in Headers */,
675346EE1A40560D00A0A8C3 /* locality_finder.hpp in Headers */,
675346EC1A40560D00A0A8C3 /* latlon_match.hpp in Headers */,
A1347D521B8758C3009050FF /* query_saver.hpp in Headers */,
3459A7A81E4C4D0200ED235F /* geocoder_locality.hpp in Headers */,
675346EA1A40560D00A0A8C3 /* keyword_matcher.hpp in Headers */,
39831A3B20F3C5C6005FF294 /* postings.hpp in Headers */,
39831A3920F3C5C6005FF294 /* utils.hpp in Headers */,
347F33211C4540A8009758CC /* projection_on_street.hpp in Headers */,
F652D8EF1CFDE21900FC29A0 /* features_layer.hpp in Headers */,
0831F255200E56110034C365 /* mem_search_index.hpp in Headers */,
@ -1225,6 +1240,7 @@
39B2B9491FB4620200AB85A1 /* pre_ranker_test.cpp in Sources */,
3461C9A31D79949600E6E6F5 /* editor_delegate.cpp in Sources */,
39BBC13B1F9FD65C009D1687 /* highlighting.cpp in Sources */,
39831A3820F3C5C6005FF294 /* merger.cpp in Sources */,
F652D8BF1CFDE1E800FC29A0 /* engine.cpp in Sources */,
675346DD1A40560D00A0A8C3 /* approximate_string_match.cpp in Sources */,
34586B8B1DCB1E8300CF7FC9 /* interval_set_test.cpp in Sources */,