diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp
index 6f546610cf..3f69898389 100644
--- a/search/base/text_index.hpp
+++ b/search/base/text_index.hpp
@@ -45,6 +45,7 @@ namespace search
 {
 namespace base
 {
+using Token = std::string;
 using Posting = uint32_t;
 
 enum class TextIndexVersion : uint8_t
@@ -97,7 +98,6 @@ struct TextIndexHeader
 
 // The dictionary contains all tokens that are present
 // in the text index.
-template <typename Token>
 class TextIndexDictionary
 {
 public:
@@ -194,7 +194,6 @@ private:
   std::vector<Token> m_tokens;
 };
 
-template <typename Token>
 class MemTextIndex
 {
 public:
@@ -217,6 +216,13 @@ public:
       fn(p);
   }
 
+  template <typename Fn>
+  void ForEachPosting(strings::UniString const & token, Fn && fn) const
+  {
+    auto const utf8s = strings::ToUtf8(token);
+    ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+  }
+
   template <typename Sink>
   void Serialize(Sink & sink)
   {
@@ -358,11 +364,10 @@ private:
   }
 
   std::map<Token, std::vector<Posting>> m_postingsByToken;
-  TextIndexDictionary<Token> m_dictionary;
+  TextIndexDictionary m_dictionary;
 };
 
 // A reader class for on-demand reading of postings lists from disk.
-template <typename Token>
class TextIndexReader
 {
 public:
@@ -406,9 +411,16 @@ public:
     }
   }
 
+  template <typename Fn>
+  void ForEachPosting(strings::UniString const & token, Fn && fn) const
+  {
+    auto const utf8s = strings::ToUtf8(token);
+    ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+  }
+
 private:
   FileReader m_fileReader;
-  TextIndexDictionary<Token> m_dictionary;
+  TextIndexDictionary m_dictionary;
   std::vector<uint32_t> m_postingsStarts;
 };
diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp
index 4007d1cb9d..3b29218b21 100644
--- a/search/search_tests/text_index_tests.cpp
+++ b/search/search_tests/text_index_tests.cpp
@@ -31,9 +31,7 @@ namespace
 // Prepend several bytes to serialized indexes in order to check the relative offsets.
 size_t const kSkip = 10;
 
-template <typename Token>
-void Serdes(MemTextIndex<Token> & memIndex, MemTextIndex<Token> & deserializedMemIndex,
-            vector<uint8_t> & buf)
+void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex, vector<uint8_t> & buf)
 {
   buf.clear();
   {
@@ -62,14 +60,14 @@ namespace search
 {
 UNIT_TEST(TextIndex_Smoke)
 {
-  using Token = string;
+  using Token = base::Token;
 
   vector<Token> const docsCollection = {
       "a b c",
       "a c",
   };
 
-  MemTextIndex<Token> memIndex;
+  MemTextIndex memIndex;
 
   for (size_t docId = 0; docId < docsCollection.size(); ++docId)
   {
@@ -82,7 +80,7 @@ UNIT_TEST(TextIndex_Smoke)
   }
 
   vector<uint8_t> indexData;
-  MemTextIndex<Token> deserializedMemIndex;
+  MemTextIndex deserializedMemIndex;
   Serdes(memIndex, deserializedMemIndex, indexData);
 
   for (auto const & index : {memIndex, deserializedMemIndex})
@@ -98,7 +96,7 @@ UNIT_TEST(TextIndex_Smoke)
     copy_n(indexData.begin() + kSkip, indexData.size() - kSkip, back_inserter(contents));
     ScopedFile file("text_index_tmp", contents);
     FileReader fileReader(file.GetFullPath());
-    TextIndexReader<Token> textIndexReader(fileReader);
+    TextIndexReader textIndexReader(fileReader);
     TestForEach(textIndexReader, "a", {0, 1});
     TestForEach(textIndexReader, "b", {0});
     TestForEach(textIndexReader, "c", {0, 1});
@@ -108,29 +106,27 @@ UNIT_TEST(TextIndex_Smoke)
 
 UNIT_TEST(TextIndex_UniString)
 {
-  using Token = strings::UniString;
-
   vector<string> const docsCollectionUtf8s = {
       "â b ç",
       "â ç",
   };
-  vector<Token> const docsCollection(
+  vector<strings::UniString> const docsCollection(
       make_transform_iterator(docsCollectionUtf8s.begin(), &strings::MakeUniString),
       make_transform_iterator(docsCollectionUtf8s.end(), &strings::MakeUniString));
 
-  MemTextIndex<Token> memIndex;
+  MemTextIndex memIndex;
 
   for (size_t docId = 0; docId < docsCollection.size(); ++docId)
   {
-    auto addToIndex = [&](Token const & token) {
-      memIndex.AddPosting(token, static_cast<uint32_t>(docId));
+    auto addToIndex = [&](strings::UniString const & token) {
+      memIndex.AddPosting(strings::ToUtf8(token), static_cast<uint32_t>(docId));
     };
     auto delims = [](strings::UniChar const & c) { return c == ' '; };
     SplitUniString(docsCollection[docId], addToIndex, delims);
   }
 
   vector<uint8_t> indexData;
-  MemTextIndex<Token> deserializedMemIndex;
+  MemTextIndex deserializedMemIndex;
   Serdes(memIndex, deserializedMemIndex, indexData);
 
   for (auto const & index : {memIndex, deserializedMemIndex})