[search] Got rid of templated Token type in the text index.

The tokens are now stored as UTF-8-encoded strings.
This commit is contained in:
Maxim Pimenov 2018-06-29 16:26:21 +03:00 committed by Tatiana Yan
parent 7ec094b9f4
commit cb504d0504
2 changed files with 27 additions and 19 deletions

View file

@ -45,6 +45,7 @@ namespace search
{
namespace base
{
using Token = std::string;
using Posting = uint32_t;
enum class TextIndexVersion : uint8_t
@ -97,7 +98,6 @@ struct TextIndexHeader
// The dictionary contains all tokens that are present
// in the text index.
template <typename Token>
class TextIndexDictionary
{
public:
@ -194,7 +194,6 @@ private:
std::vector<Token> m_tokens;
};
template <typename Token>
class MemTextIndex
{
public:
@ -217,6 +216,13 @@ public:
fn(p);
}
// Convenience overload for callers that hold the token as a UniString.
// Converts |token| with strings::ToUtf8 and forwards the result, together
// with |fn|, to another ForEachPosting overload (not visible in this hunk;
// presumably the one taking a UTF-8 Token -- confirm against the class).
template <typename Fn>
void ForEachPosting(strings::UniString const & token, Fn && fn) const
{
  // Hand the conversion result over as a temporary. Naming it as a const
  // local and wrapping it in std::move would not move anything: a const
  // object cannot be moved from, so the cast only obscures the intent.
  ForEachPosting(strings::ToUtf8(token), std::forward<Fn>(fn));
}
template <typename Sink>
void Serialize(Sink & sink)
{
@ -358,11 +364,10 @@ private:
}
std::map<Token, std::vector<Posting>> m_postingsByToken;
TextIndexDictionary<Token> m_dictionary;
TextIndexDictionary m_dictionary;
};
// A reader class for on-demand reading of postings lists from disk.
template <typename Token>
class TextIndexReader
{
public:
@ -406,9 +411,16 @@ public:
}
}
// Convenience overload for callers that hold the token as a UniString.
// Converts |token| with strings::ToUtf8 and forwards the result, together
// with |fn|, to another ForEachPosting overload (not visible in this hunk;
// presumably the one taking a UTF-8 Token -- confirm against the class).
template <typename Fn>
void ForEachPosting(strings::UniString const & token, Fn && fn) const
{
  // Hand the conversion result over as a temporary. Naming it as a const
  // local and wrapping it in std::move would not move anything: a const
  // object cannot be moved from, so the cast only obscures the intent.
  ForEachPosting(strings::ToUtf8(token), std::forward<Fn>(fn));
}
private:
FileReader m_fileReader;
TextIndexDictionary<Token> m_dictionary;
TextIndexDictionary m_dictionary;
std::vector<uint32_t> m_postingsStarts;
};

View file

@ -31,9 +31,7 @@ namespace
// Prepend several bytes to serialized indexes in order to check the relative offsets.
size_t const kSkip = 10;
template <typename Token>
void Serdes(MemTextIndex<Token> & memIndex, MemTextIndex<Token> & deserializedMemIndex,
vector<uint8_t> & buf)
void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex, vector<uint8_t> & buf)
{
buf.clear();
{
@ -62,14 +60,14 @@ namespace search
{
UNIT_TEST(TextIndex_Smoke)
{
using Token = string;
using Token = base::Token;
vector<Token> const docsCollection = {
"a b c",
"a c",
};
MemTextIndex<Token> memIndex;
MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
@ -82,7 +80,7 @@ UNIT_TEST(TextIndex_Smoke)
}
vector<uint8_t> indexData;
MemTextIndex<Token> deserializedMemIndex;
MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})
@ -98,7 +96,7 @@ UNIT_TEST(TextIndex_Smoke)
copy_n(indexData.begin() + kSkip, indexData.size() - kSkip, back_inserter(contents));
ScopedFile file("text_index_tmp", contents);
FileReader fileReader(file.GetFullPath());
TextIndexReader<Token> textIndexReader(fileReader);
TextIndexReader textIndexReader(fileReader);
TestForEach(textIndexReader, "a", {0, 1});
TestForEach(textIndexReader, "b", {0});
TestForEach(textIndexReader, "c", {0, 1});
@ -108,29 +106,27 @@ UNIT_TEST(TextIndex_Smoke)
UNIT_TEST(TextIndex_UniString)
{
using Token = strings::UniString;
vector<std::string> const docsCollectionUtf8s = {
"â b ç",
"â ç",
};
vector<Token> const docsCollection(
vector<strings::UniString> const docsCollection(
make_transform_iterator(docsCollectionUtf8s.begin(), &strings::MakeUniString),
make_transform_iterator(docsCollectionUtf8s.end(), &strings::MakeUniString));
MemTextIndex<Token> memIndex;
MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
auto addToIndex = [&](Token const & token) {
memIndex.AddPosting(token, static_cast<uint32_t>(docId));
auto addToIndex = [&](strings::UniString const & token) {
memIndex.AddPosting(strings::ToUtf8(token), static_cast<uint32_t>(docId));
};
auto delims = [](strings::UniChar const & c) { return c == ' '; };
SplitUniString(docsCollection[docId], addToIndex, delims);
}
vector<uint8_t> indexData;
MemTextIndex<Token> deserializedMemIndex;
MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})