Add --dump_search_tokens to generator_tool.

This commit is contained in:
Yury Melnichek 2011-10-19 17:16:03 +02:00 committed by Alex Zolotarev
parent a6345ddf68
commit c058e5fca8
3 changed files with 68 additions and 4 deletions

View file

@ -2,16 +2,19 @@
#include "../indexer/search_delimiters.hpp"
#include "../indexer/search_string_utils.hpp"
#include "../indexer/classificator.hpp"
#include "../indexer/feature_processor.hpp"
#include "../indexer/search_trie.hpp"
#include "../coding/multilang_utf8_string.hpp"
#include "../indexer/classificator.hpp"
#include "../indexer/feature_processor.hpp"
#include "../base/logging.hpp"
#include "../std/algorithm.hpp"
#include "../std/bind.hpp"
#include "../std/iostream.hpp"
#include "../std/map.hpp"
#include "../std/queue.hpp"
#include "../std/vector.hpp"
namespace feature
@ -149,7 +152,63 @@ namespace feature
{
Print(it->first, it->second);
}
}
}
// Functor for trie::ForEachRef that counts, for every token string, how many times in a row
// it is reported, and keeps the kMaxTokens tokens with the highest counts.
//
// Only consecutive calls with an equal string are merged into one count, so the traversal
// has to report all values of a token back to back.
//
// Usage: pass the collector to the traversal, then call Finish(). Only after Finish() does
// the public |tokens| queue hold the result; its top() is the most frequent token.
struct SearchTokensCollector
{
  typedef pair<uint32_t, strings::UniString> TokenT;

  // Result: (count, token) pairs, most frequent on top. Filled by Finish().
  priority_queue<TokenT> tokens;
  // Token of the run that is currently being counted and its count so far.
  strings::UniString m_currentS;
  uint32_t m_currentCount;

  SearchTokensCollector() : m_currentS(), m_currentCount(0) {}

  // Called once per (token, value) pair; the value itself is not needed for the statistics.
  void operator() (strings::UniString const & s, search::trie::ValueReader::ValueType /* value */)
  {
    if (m_currentS == s)
    {
      ++m_currentCount;
      return;
    }

    FlushCurrent();
    m_currentS = s;
    // The call that starts a new run is itself the first occurrence of the token.
    m_currentCount = 1;
  }

  // Records the last pending run and publishes the collected tokens into |tokens|.
  void Finish()
  {
    FlushCurrent();
    // Reset so that a repeated Finish() does not record the last token twice.
    m_currentCount = 0;

    for (; !m_best.empty(); m_best.pop())
      tokens.push(m_best.top());
  }

private:
  static size_t const kMaxTokens = 100;

  // Orders the heap so that its top() is the smallest (count, token) pair.
  struct GreaterToken
  {
    bool operator() (TokenT const & a, TokenT const & b) const { return b < a; }
  };

  // Min-heap with the best tokens seen so far. When it overflows, pop() evicts the least
  // frequent entry; a max-heap would evict the most frequent one instead.
  priority_queue<TokenT, vector<TokenT>, GreaterToken> m_best;

  // Stores the finished run (if any) and keeps at most kMaxTokens entries.
  void FlushCurrent()
  {
    if (m_currentCount == 0)
      return;

    m_best.push(make_pair(m_currentCount, m_currentS));
    if (m_best.size() > kMaxTokens)
      m_best.pop();
  }
};
void DumpSearchTokens(string const & fPath)
{
FilesContainerR container(new FileReader(fPath));
scoped_ptr<search::TrieIterator> pTrieRoot(
::trie::reader::ReadTrie(container.GetReader(SEARCH_INDEX_FILE_TAG),
::search::trie::ValueReader(),
::search::trie::EdgeValueReader()));
SearchTokensCollector f;
trie::ForEachRef(*pTrieRoot, f, strings::UniString());
f.Finish();
while (!f.tokens.empty())
{
strings::UniString const & s = f.tokens.top().second;
cout << f.tokens.top().first << " '" << strings::ToUtf8(s) << "'" << endl;
f.tokens.pop();
}
}
} // namespace feature

View file

@ -6,4 +6,5 @@ namespace feature
{
// NOTE(review): presumably backs --dump_types ("prints all types combinations and their
// total count") - confirm against the implementation.
void DumpTypes(string const & fPath);
// Backs --dump_prefixes: prints statistics on feature name prefixes for the file fPath.
void DumpPrefixes(string const & fPath);
// Backs --dump_search_tokens: prints per-token statistics for the search index section
// of the file fPath.
void DumpSearchTokens(string const & fPath);
}

View file

@ -56,6 +56,7 @@ DEFINE_string(generate_borders, "",
"specify tag name and optional value: ISO3166-1 or admin_level=4");
DEFINE_bool(dump_types, false, "If defined, prints all types combinations and their total count");
DEFINE_bool(dump_prefixes, false, "If defined, prints statistics on feature name prefixes");
DEFINE_bool(dump_search_tokens, false, "Print statistics on search tokens.");
DEFINE_bool(unpack_mwm, false, "Unpack each section of mwm into a separate file with name filePath.sectionName.");
DEFINE_bool(generate_packed_borders, false, "Generate packed file with country polygons");
@ -238,6 +239,9 @@ int main(int argc, char ** argv)
if (FLAGS_dump_prefixes)
feature::DumpPrefixes(path + FLAGS_output + ".mwm");
if (FLAGS_dump_search_tokens)
feature::DumpSearchTokens(path + FLAGS_output + ".mwm");
if (FLAGS_unpack_mwm)
UnpackMwm(path + FLAGS_output + ".mwm");