[search] New index generation algorithm.

This commit is contained in:
vng 2012-01-03 16:07:55 +03:00 committed by Alex Zolotarev
parent edc3df3fc6
commit 1e5a568fcd
3 changed files with 120 additions and 62 deletions

View file

@ -97,16 +97,11 @@ void indexer::BuildSearchIndex(FeaturesVector const & featuresVector, Writer & w
string const & tmpFilePath)
{
{
StringsFile names;
StringsFile names(tmpFilePath);
featuresVector.ForEachOffset(FeatureInserter(names));
{
FileWriter writer(tmpFilePath);
names.OpenForWrite(&writer);
featuresVector.ForEachOffset(FeatureInserter(names));
}
names.OpenForRead(new FileReader(tmpFilePath));
names.SortStrings();
names.EndAdding();
names.OpenForRead();
trie::Build(writer, names.Begin(), names.End(),
trie::builder::MaxValueEdgeBuilder<MaxValueCalc>());

View file

@ -1,10 +1,13 @@
#include "string_file.hpp"
#include "../coding/read_write_utils.hpp"
#include "../coding/reader.hpp"
#include "../coding/writer.hpp"
#include "../coding/file_reader.hpp"
#include "../coding/file_writer.hpp"
#include "../base/logging.hpp"
#include "../std/algorithm.hpp"
#include "../std/bind.hpp"
template <class TWriter>
@ -21,11 +24,8 @@ StringsFile::IdT StringsFile::StringT::Write(TWriter & writer) const
}
template <class TReader>
void StringsFile::StringT::Read(IdT id, TReader & reader)
void StringsFile::StringT::Read(TReader & src)
{
ReaderSource<TReader> src(reader);
src.Skip(id);
rw::Read(src, m_name);
m_pos = ReadVarUint<uint32_t>(src);
m_rank = ReadPrimitiveFromSource<uint8_t>(src);
@ -47,30 +47,81 @@ bool StringsFile::StringT::operator == (StringT const & name) const
return (m_name == name.m_name && m_pos == name.m_pos && m_rank == name.m_rank);
}
/// Drops all chunk readers and then deletes every temporary chunk file
/// with an index in [0, m_index).
StringsFile::~StringsFile()
{
  // Readers are released first, before the files they were opened on are removed.
  m_readers.clear();

  int chunk = 0;
  while (chunk < m_index)
  {
    FileWriter::DeleteFileX(FormatFilePath(chunk));
    ++chunk;
  }
}
void StringsFile::AddString(StringT const & s)
{
ASSERT ( m_writer != 0, () );
m_ids.push_back(s.Write(*m_writer));
}
if (m_strings.size() >= 30000)
Flush();
bool StringsFile::StringCompare::operator() (IdT const & id1, IdT const & id2) const
{
StringT str[2];
str[0].Read(id1, m_file.m_reader);
str[1].Read(id2, m_file.m_reader);
return (str[0] < str[1]);
}
void StringsFile::SortStrings()
{
stable_sort(m_ids.begin(), m_ids.end(), StringCompare(*this));
m_strings.push_back(s);
}
StringsFile::StringT StringsFile::IteratorT::dereference() const
{
ASSERT_LESS ( m_index, m_file->m_ids.size(), () );
StringT s;
s.Read(m_file->m_ids[m_index], m_file->m_reader);
return s;
ASSERT ( !m_file.m_queue.empty(), () );
return m_file.m_queue.top().m_string;
}
/// Advances the iteration: pops the top element of the file's queue and tries
/// to push the next value read from the same chunk the popped element came from.
/// When that chunk yields nothing more, the iterator becomes the end iterator
/// as soon as the queue is empty.
void StringsFile::IteratorT::increment()
{
  ASSERT ( !m_file.m_queue.empty(), () );

  int const source = m_file.m_queue.top().m_index;
  m_file.m_queue.pop();

  // The chunk supplied a replacement value: the end flag stays untouched.
  if (m_file.PushNextValue(source))
    return;

  m_end = m_file.m_queue.empty();
}
/// Builds the path of the i-th temporary chunk file: "<m_filePath>.<i>".
/// @param i  Index of the chunk.
/// @return   m_filePath followed by a dot and the textual form of @p i.
string StringsFile::FormatFilePath(int i) const
{
  string path(m_filePath);
  path += string(".");
  path += strings::to_string(i);
  return path;
}
/// Sorts the strings buffered in m_strings, writes them to the next numbered
/// chunk file (see FormatFilePath) and clears the buffer.
///
/// Does nothing when the buffer is empty. Without this guard a zero-length
/// chunk file would be created and counted in m_index, and OpenForRead() would
/// then fail its CHECK because that chunk cannot yield a first record
/// (e.g. EndAdding() called when no string has been added).
void StringsFile::Flush()
{
  if (m_strings.empty())
    return;

  sort(m_strings.begin(), m_strings.end());

  // m_index is the number of chunk files created so far; this chunk takes the next index.
  FileWriter w(FormatFilePath(m_index++));
  for_each(m_strings.begin(), m_strings.end(), bind(&StringT::Write<FileWriter>, _1, ref(w)));

  m_strings.clear();
}
bool StringsFile::PushNextValue(int i)
{
try
{
StringT s;
s.Read(m_readers[i]);
m_queue.push(QValue(s, i));
return true;
}
catch (SourceOutOfBoundsException const &)
{
return false;
}
}
void StringsFile::EndAdding()
{
Flush();
}
void StringsFile::OpenForRead()
{
for (int i = 0; i < m_index; ++i)
{
m_readers.push_back(ReaderT(new FileReader(FormatFilePath(i), 6, 1)));
CHECK ( PushNextValue(i), () );
}
}

View file

@ -1,11 +1,12 @@
#pragma once
#include "../coding/writer.hpp"
#include "../coding/reader.hpp"
#include "../base/string_utils.hpp"
#include "../std/iterator_facade.hpp"
#include "../std/queue.hpp"
#include "../std/functional.hpp"
class StringsFile
@ -33,6 +34,8 @@ public:
uint32_t GetKeySize() const { return m_name.size(); }
uint32_t const * GetKeyData() const { return m_name.data(); }
strings::UniString const & GetString() const { return m_name; }
template <class TCont> void SerializeValue(TCont & cont) const
{
cont.resize(5);
@ -48,7 +51,7 @@ public:
bool operator == (StringT const & name) const;
template <class TWriter> IdT Write(TWriter & writer) const;
template <class TReader> void Read(IdT id, TReader & reader);
template <class TReader> void Read(TReader & src);
void Swap(StringT & r)
{
@ -58,46 +61,55 @@ public:
}
};
class StringCompare
{
StringsFile & m_file;
public:
StringCompare(StringsFile & file) : m_file(file) {}
bool operator() (IdT const & id1, IdT const & id2) const;
};
class IteratorT : public iterator_facade<IteratorT, StringT, forward_traversal_tag, StringT>
{
size_t m_index;
StringsFile const * m_file;
StringsFile & m_file;
bool m_end;
public:
IteratorT(size_t index, StringsFile const & file)
: m_index(index), m_file(&file) {}
IteratorT(StringsFile & file, bool isEnd)
: m_file(file), m_end(isEnd)
{
}
StringT dereference() const;
bool equal(IteratorT const & r) const { return m_index == r.m_index; }
void increment() { ++m_index; }
bool equal(IteratorT const & r) const { return m_end == r.m_end; }
void increment();
};
StringsFile() : m_writer(0), m_reader(0) {}
StringsFile(string const & fPath) : m_filePath(fPath), m_index(0) {}
~StringsFile();
void OpenForWrite(Writer * w) { m_writer = w; }
/// Note! r should be in dynamic memory and this class takes shared ownership of it.
void OpenForRead(Reader * r) { m_reader = ReaderPtr<Reader>(r); }
void EndAdding();
void OpenForRead();
/// @precondition Should be opened for writing.
void AddString(StringT const & s);
/// @precondition Should be opened for reading.
void SortStrings();
IteratorT Begin() const { return IteratorT(0, *this); }
IteratorT End() const { return IteratorT(m_ids.size(), *this); }
/// Returns an iterator positioned at the current top of the queue.
/// The iterator is created as an end iterator when the queue is empty, so an
/// empty queue gives Begin() == End() (iterators compare by their end flag)
/// rather than an iterator whose dereference would hit an empty queue.
IteratorT Begin() { return IteratorT(*this, m_queue.empty()); }
IteratorT End() { return IteratorT(*this, true); }
private:
vector<IdT> m_ids;
string FormatFilePath(int i) const;
void Flush();
bool PushNextValue(int i);
Writer * m_writer;
ReaderPtr<Reader> m_reader;
vector<StringT> m_strings;
string m_filePath;
int m_index;
typedef ReaderSource<ReaderPtr<Reader> > ReaderT;
vector<ReaderT> m_readers;
/// Queue element: a string together with the index of the chunk it was read from.
struct QValue
{
  StringT m_string;   ///< The value itself.
  int m_index;        ///< Index of the chunk reader that produced m_string.

  QValue(StringT const & s, int i) : m_string(s), m_index(i) {}

  /// Strict "greater than" on the strings, expressed through StringT's operator <.
  /// It must be strict (false for equal strings): the queue orders elements with
  /// greater<QValue>, and a comparator for which x > x holds is not a strict
  /// weak ordering, which the heap operations require.
  inline bool operator > (QValue const & rhs) const { return rhs.m_string < m_string; }
};
priority_queue<QValue, vector<QValue>, greater<QValue> > m_queue;
};