From 76197901a91ecb59ab0dffb1ad66a8570ce47604 Mon Sep 17 00:00:00 2001 From: vng Date: Tue, 13 Dec 2011 17:19:56 +0300 Subject: [PATCH] Store feature names in temporary file during search index generation. --- coding/coding_tests/trie_test.cpp | 8 ++- coding/trie_builder.hpp | 20 +++++-- indexer/indexer.pro | 4 ++ indexer/search_index_builder.cpp | 81 ++++++++------------------ indexer/string_file.cpp | 76 +++++++++++++++++++++++++ indexer/string_file.hpp | 95 +++++++++++++++++++++++++++++++ std/iterator_facade.hpp | 1 + 7 files changed, 220 insertions(+), 65 deletions(-) create mode 100644 indexer/string_file.cpp create mode 100644 indexer/string_file.hpp diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp index 2b1c2c7b33..b32253dca0 100644 --- a/coding/coding_tests/trie_test.cpp +++ b/coding/coding_tests/trie_test.cpp @@ -45,8 +45,12 @@ struct KeyValuePair uint32_t GetKeySize() const { return m_key.size(); } trie::TrieChar const * GetKeyData() const { return m_key.data(); } - uint32_t GetValueSize() const { return 4; } - void const * GetValueData() const { return &m_value; } + + template void SerializeValue(TCont & cont) const + { + cont.resize(4); + memcpy(cont.data(), &m_value, 4); + } bool operator == (KeyValuePair const & p) const { diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp index 8e72164c7f..cc29b3d605 100644 --- a/coding/trie_builder.hpp +++ b/coding/trie_builder.hpp @@ -235,9 +235,14 @@ template void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & edgeBuilder) { typedef buffer_vector TrieString; - buffer_vector, 32> nodes; - nodes.push_back(builder::NodeInfo(sink.Pos(), DEFAULT_CHAR, edgeBuilder)); + typedef buffer_vector TrieValue; + typedef builder::NodeInfo NodeInfoT; + + buffer_vector nodes; + nodes.push_back(NodeInfoT(sink.Pos(), DEFAULT_CHAR, edgeBuilder)); + TrieString prevKey; + TrieValue value; for (IterT it = beg; it != end; ++it) { TrieChar const * const pKeyData = it->GetKeyData(); @@ -246,14 +251,19 @@ void Build(SinkT & sink, IterT const beg, IterT const end, EdgeBuilderT const & size_t nCommon = 0; while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon]) ++nCommon; + builder::PopNodes(sink, nodes, nodes.size() - nCommon - 1); // Root is also a common node. + uint64_t const pos = sink.Pos(); for (size_t i = nCommon; i < key.size(); ++i) nodes.push_back(builder::NodeInfo(pos, key[i], edgeBuilder)); - uint8_t const * const pValue = static_cast(it->GetValueData()); - nodes.back().m_values.insert(nodes.back().m_values.end(), pValue, pValue + it->GetValueSize()); + + it->SerializeValue(value); + nodes.back().m_values.insert(nodes.back().m_values.end(), + value.begin(), value.end()); nodes.back().m_valueCount += 1; - nodes.back().m_edgeBuilder.AddValue(pValue, it->GetValueSize()); + nodes.back().m_edgeBuilder.AddValue(value.data(), value.size()); + prevKey.swap(key); } diff --git a/indexer/indexer.pro b/indexer/indexer.pro index b212d89707..d44f87d82d 100644 --- a/indexer/indexer.pro +++ b/indexer/indexer.pro @@ -41,6 +41,7 @@ SOURCES += \ categories_holder.cpp \ search_string_utils.cpp \ drules_struct.pb.cc \ + string_file.cpp HEADERS += \ feature.hpp \ @@ -86,5 +87,8 @@ HEADERS += \ mwm_set.hpp \ categories_holder.hpp \ drules_struct.pb.h \ + string_file.hpp OTHER_FILES += drules_struct.proto + + diff --git a/indexer/search_index_builder.cpp b/indexer/search_index_builder.cpp index 829d2a8a5a..fc6a10490b 100644 --- a/indexer/search_index_builder.cpp +++ b/indexer/search_index_builder.cpp @@ -5,6 +5,7 @@ #include "search_delimiters.hpp" #include "search_trie.hpp" #include "search_string_utils.hpp" +#include "string_file.hpp" #include "../defines.hpp" @@ -24,59 +25,13 @@ namespace { -struct FeatureName -{ - strings::UniString m_name; - char m_Value[5]; - - FeatureName(strings::UniString const & name, signed char lang, uint32_t id, uint8_t rank) - { - m_name.reserve(name.size() + 1); - m_name.push_back(static_cast(lang)); - m_name.append(name.begin(), name.end()); - - m_Value[0] = rank; - uint32_t const idToWrite = SwapIfBigEndian(id); - memcpy(&m_Value[1], &idToWrite, 4); - } - - uint32_t GetKeySize() const { return m_name.size(); } - uint32_t const * GetKeyData() const { return m_name.data(); } - uint32_t GetValueSize() const { return 5; } - void const * GetValueData() const { return &m_Value; } - - uint8_t GetRank() const { return static_cast(m_Value[0]); } - uint32_t GetOffset() const - { - uint32_t offset; - memcpy(&offset, &m_Value[1], 4); - return SwapIfBigEndian(offset); - } - - inline bool operator < (FeatureName const & name) const - { - if (m_name != name.m_name) - return m_name < name.m_name; - if (GetRank() != name.GetRank()) - return GetRank() > name.GetRank(); - if (GetOffset() != name.GetOffset()) - return GetOffset() < name.GetOffset(); - return false; - } - - inline bool operator == (FeatureName const & name) const - { - return m_name == name.m_name && 0 == memcmp(&m_Value, &name.m_Value, sizeof(m_Value)); - } -}; - struct FeatureNameInserter { - vector & m_names; + StringsFile & m_names; uint32_t m_pos; uint32_t m_rank; - FeatureNameInserter(vector & names, uint32_t pos, uint8_t rank) + FeatureNameInserter(StringsFile & names, uint32_t pos, uint8_t rank) : m_names(names), m_pos(pos), m_rank(rank) {} void AddToken(signed char lang, strings::UniString const & s) const @@ -86,12 +41,11 @@ struct FeatureNameInserter void AddToken(signed char lang, strings::UniString const & s, uint32_t rank) const { - m_names.push_back(FeatureName(s, lang, m_pos, static_cast(min(rank, 255U)))); + m_names.AddString(StringsFile::StringT(s, lang, m_pos, static_cast(min(rank, 255U)))); } bool operator()(signed char lang, string const & name) const { - // m_names.push_back(FeatureName(, m_pos, m_rank)); strings::UniString uniName = search::NormalizeAndSimplifyString(name); buffer_vector tokens; SplitUniString(uniName, MakeBackInsertFunctor(tokens), search::Delimiters()); @@ -108,9 +62,9 @@ struct FeatureNameInserter struct FeatureInserter { - vector & m_names; + StringsFile & m_names; - explicit FeatureInserter(vector & names) : m_names(names) {} + explicit FeatureInserter(StringsFile & names) : m_names(names) {} void operator() (FeatureType const & feature, uint64_t pos) const { @@ -141,19 +95,29 @@ struct MaxValueCalc void indexer::BuildSearchIndex(FeaturesVector const & featuresVector, Writer & writer) { - vector names; - featuresVector.ForEachOffset(FeatureInserter(names)); - sort(names.begin(), names.end()); - names.erase(unique(names.begin(), names.end()), names.end()); - trie::Build(writer, names.begin(), names.end(), + StringsFile names; + string const tmpFile = GetPlatform().WritablePathForFile("search_index_1.tmp"); + + { + FileWriter writer(tmpFile); + names.OpenForWrite(&writer); + featuresVector.ForEachOffset(FeatureInserter(names)); + } + + names.OpenForRead(new FileReader(tmpFile)); + names.SortStrings(); + + trie::Build(writer, names.Begin(), names.End(), trie::builder::MaxValueEdgeBuilder()); + + FileWriter::DeleteFileX(tmpFile); } bool indexer::BuildSearchIndexFromDatFile(string const & datFile) { try { - string const tmpFile = GetPlatform().WritablePathForFile(datFile + ".search.tmp"); + string const tmpFile = GetPlatform().WritablePathForFile("search_index_2.tmp"); { FilesContainerR readCont(datFile); @@ -182,6 +146,7 @@ bool indexer::BuildSearchIndexFromDatFile(string const & datFile) catch (Writer::Exception const & e) { LOG(LERROR, ("Error writing index file: ", e.what())); + return false; } return true; diff --git a/indexer/string_file.cpp b/indexer/string_file.cpp new file mode 100644 index 0000000000..65840c13f1 --- /dev/null +++ b/indexer/string_file.cpp @@ -0,0 +1,76 @@ +#include "string_file.hpp" + +#include "../coding/read_write_utils.hpp" +#include "../coding/reader.hpp" +#include "../coding/writer.hpp" + +#include "../std/algorithm.hpp" + + +template +StringsFile::IdT StringsFile::StringT::Write(TWriter & writer) const +{ + IdT const pos = static_cast(writer.Pos()); + CHECK_EQUAL(static_cast(pos), writer.Pos(), ()); + + rw::Write(writer, m_name); + WriteVarUint(writer, m_pos); + WriteVarUint(writer, m_rank); + + return pos; +} + +template +void StringsFile::StringT::Read(IdT id, TReader & reader) +{ + ReaderSource src(reader); + src.Skip(id); + + rw::Read(src, m_name); + m_pos = ReadVarUint(src); + m_rank = ReadPrimitiveFromSource(src); +} + +bool StringsFile::StringT::operator<(StringT const & name) const +{ + if (m_name != name.m_name) + return m_name < name.m_name; + if (GetRank() != name.GetRank()) + return GetRank() > name.GetRank(); + if (GetOffset() != name.GetOffset()) + return GetOffset() < name.GetOffset(); + return false; +} + +bool StringsFile::StringT::operator==(StringT const & name) const +{ + return (m_name == name.m_name && m_pos == name.m_pos && m_rank == name.m_rank); +} + +void StringsFile::AddString(StringT const & s) +{ + ASSERT ( m_writer != 0, () ); + m_ids.push_back(s.Write(*m_writer)); +} + +bool StringsFile::StringCompare::operator() (IdT const & id1, IdT const & id2) const +{ + StringT str[2]; + str[0].Read(id1, m_file.m_reader); + str[1].Read(id2, m_file.m_reader); + return (str[0] < str[1]); +} + +void StringsFile::SortStrings() +{ + stable_sort(m_ids.begin(), m_ids.end(), StringCompare(*this)); +} + +StringsFile::StringT StringsFile::IteratorT::dereference() const +{ + ASSERT_LESS ( m_index, m_file->m_ids.size(), () ); + + StringT s; + s.Read(m_file->m_ids[m_index], m_file->m_reader); + return s; +} diff --git a/indexer/string_file.hpp b/indexer/string_file.hpp new file mode 100644 index 0000000000..92dbf60f56 --- /dev/null +++ b/indexer/string_file.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include "../coding/writer.hpp" +#include "../coding/reader.hpp" + +#include "../base/string_utils.hpp" + +#include "../std/iterator_facade.hpp" + + +class StringsFile +{ +public: + + typedef uint32_t IdT; + + struct StringT + { + strings::UniString m_name; + uint32_t m_pos; + uint8_t m_rank; + + StringT() {} + StringT(strings::UniString const & name, signed char lang, uint32_t pos, uint8_t rank) + : m_pos(pos), m_rank(rank) + { + m_name.reserve(name.size() + 1); + m_name.push_back(static_cast(lang)); + m_name.append(name.begin(), name.end()); + } + + uint32_t GetKeySize() const { return m_name.size(); } + uint32_t const * GetKeyData() const { return m_name.data(); } + + template void SerializeValue(TCont & cont) const + { + cont.resize(5); + cont[0] = m_rank; + uint32_t const i = SwapIfBigEndian(m_pos); + memcpy(&cont[1], &i, 4); + } + + uint8_t GetRank() const { return m_rank; } + uint32_t GetOffset() const { return m_pos; } + + bool operator<(StringT const & name) const; + bool operator==(StringT const & name) const; + + template IdT Write(TWriter & writer) const; + template void Read(IdT id, TReader & reader); + }; + + class StringCompare + { + StringsFile & m_file; + public: + StringCompare(StringsFile & file) : m_file(file) {} + bool operator() (IdT const & id1, IdT const & id2) const; + }; + + class IteratorT : public iterator_facade + { + size_t m_index; + StringsFile const * m_file; + + public: + IteratorT(size_t index, StringsFile const & file) + : m_index(index), m_file(&file) {} + + StringT dereference() const; + bool equal(IteratorT const & r) const { return m_index == r.m_index; } + void increment() { ++m_index; } + }; + + StringsFile() : m_writer(0), m_reader(0) {} + + void OpenForWrite(Writer * w) { m_writer = w; } + /// Note! r should be in dynamic memory and this class takes shared ownership of it. + void OpenForRead(Reader * r) { m_reader = ReaderPtr(r); } + + /// @precondition Should be opened for writing. + void AddString(StringT const & s); + + /// @precondition Should be opened for reading. + void SortStrings(); + + IteratorT Begin() const { return IteratorT(0, *this); } + IteratorT End() const { return IteratorT(m_ids.size(), *this); } + +private: + vector m_ids; + + Writer * m_writer; + ReaderPtr m_reader; +}; diff --git a/std/iterator_facade.hpp b/std/iterator_facade.hpp index 232956644e..613240d19f 100644 --- a/std/iterator_facade.hpp +++ b/std/iterator_facade.hpp @@ -8,6 +8,7 @@ #include using boost::iterator_facade; using boost::random_access_traversal_tag; +using boost::forward_traversal_tag; #ifdef DEBUG_NEW #define new DEBUG_NEW