From c058e5fca884b6904ba7fbdaf8b270d716e1f53c Mon Sep 17 00:00:00 2001
From: Yury Melnichek
Date: Wed, 19 Oct 2011 17:16:03 +0200
Subject: [PATCH] Add --dump_search_tokens to generator_tool.

---
Reviewer notes (not part of the commit message):
 * SearchTokensCollector started every new run at 0, so the first occurrence
   of a token was never counted; runs now start at 1.
 * The token heap was a max-heap, so trimming it to 100 entries evicted the
   most frequent tokens; it is now a min-heap and the dump is still printed
   most-frequent-first.
 * This patch was recovered from a whitespace-collapsed capture. Indentation
   and blank context lines are reconstructed to satisfy the hunk line counts,
   and the blob ids on the dumper.cpp "index" line predate the fixes above.
   Re-check hunk context against the target tree before applying.

 generator/dumper.cpp                        | 95 +++++++++++++++++++--
 generator/dumper.hpp                        |  1 +
 generator/generator_tool/generator_tool.cpp |  4 ++
 3 files changed, 96 insertions(+), 4 deletions(-)

diff --git a/generator/dumper.cpp b/generator/dumper.cpp
index 980f6332e7..4a62c572d4 100644
--- a/generator/dumper.cpp
+++ b/generator/dumper.cpp
@@ -2,16 +2,19 @@
 
 #include "../indexer/search_delimiters.hpp"
 #include "../indexer/search_string_utils.hpp"
+#include "../indexer/classificator.hpp"
+#include "../indexer/feature_processor.hpp"
+#include "../indexer/search_trie.hpp"
 
 #include "../coding/multilang_utf8_string.hpp"
 
-#include "../indexer/classificator.hpp"
-#include "../indexer/feature_processor.hpp"
+#include "../base/logging.hpp"
 
 #include "../std/algorithm.hpp"
 #include "../std/bind.hpp"
 #include "../std/iostream.hpp"
 #include "../std/map.hpp"
+#include "../std/queue.hpp"
 #include "../std/vector.hpp"
 
 namespace feature
@@ -149,7 +152,91 @@ namespace feature
     {
       Print(it->first, it->second);
     }
-  }
-}
+  /// Counts consecutive runs of equal tokens fed by a trie traversal and keeps the
+  /// MAX_TOKENS most frequent ones. Equal tokens are expected to arrive back to back;
+  /// call Finish() after the traversal so the last run is recorded too.
+  struct SearchTokensCollector
+  {
+    typedef pair<uint32_t, strings::UniString> TokenT;
+
+    /// Reverses the default heap order so that top() is the smallest (count, token) pair.
+    struct GreaterToken
+    {
+      bool operator() (TokenT const & a, TokenT const & b) const { return b < a; }
+    };
+
+    /// Maximum number of tokens kept in the heap.
+    static size_t const MAX_TOKENS = 100;
+
+    /// Min-heap: evicting top() drops the least frequent token, so the most frequent stay.
+    priority_queue<TokenT, vector<TokenT>, GreaterToken> tokens;
+    strings::UniString m_currentS;
+    uint32_t m_currentCount;
+
+    SearchTokensCollector() : m_currentS(), m_currentCount(0) {}
+
+    /// Called once per (token, value) pair; the value itself is not used.
+    void operator() (strings::UniString const & s, search::trie::ValueReader::ValueType)
+    {
+      if (m_currentS == s)
+      {
+        ++m_currentCount;
+      }
+      else
+      {
+        Flush();
+        m_currentS = s;
+        // This call is itself the first occurrence of the new token.
+        m_currentCount = 1;
+      }
+    }
+
+    /// Records the run that was still open when the traversal ended.
+    void Finish()
+    {
+      Flush();
+    }
+
+    /// Pushes the current run (if any) into the heap, trims the heap and resets the run.
+    void Flush()
+    {
+      if (m_currentCount == 0)
+        return;
+
+      tokens.push(make_pair(m_currentCount, m_currentS));
+      if (tokens.size() > MAX_TOKENS)
+        tokens.pop();
+      m_currentCount = 0;
+    }
+  };
+
+  /// Prints the most frequent tokens of the search index stored in the file fPath to
+  /// cout, one per line as "<count> '<token>'", most frequent first.
+  void DumpSearchTokens(string const & fPath)
+  {
+    FilesContainerR container(new FileReader(fPath));
+    // NOTE(review): the template argument was lost in the captured patch;
+    // search::TrieIterator is presumed - confirm against indexer/search_trie.hpp.
+    scoped_ptr<search::TrieIterator> pTrieRoot(
+        ::trie::reader::ReadTrie(container.GetReader(SEARCH_INDEX_FILE_TAG),
+                                 ::search::trie::ValueReader(),
+                                 ::search::trie::EdgeValueReader()));
+    SearchTokensCollector f;
+    trie::ForEachRef(*pTrieRoot, f, strings::UniString());
+    f.Finish();
+
+    // The heap pops the least frequent token first; buffer and print in reverse.
+    vector<SearchTokensCollector::TokenT> top;
+    top.reserve(f.tokens.size());
+    while (!f.tokens.empty())
+    {
+      top.push_back(f.tokens.top());
+      f.tokens.pop();
+    }
+    for (size_t i = top.size(); i > 0; --i)
+      cout << top[i - 1].first << " '" << strings::ToUtf8(top[i - 1].second) << "'" << endl;
+  }
+
+} // namespace feature
 
 
diff --git a/generator/dumper.hpp b/generator/dumper.hpp
index 846e3d8e3b..0b32e8199a 100644
--- a/generator/dumper.hpp
+++ b/generator/dumper.hpp
@@ -6,4 +6,5 @@ namespace feature
 {
   void DumpTypes(string const & fPath);
   void DumpPrefixes(string const & fPath);
+  void DumpSearchTokens(string const & fPath);
 }
diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp
index e6f9b35e01..48bff56a47 100644
--- a/generator/generator_tool/generator_tool.cpp
+++ b/generator/generator_tool/generator_tool.cpp
@@ -56,6 +56,7 @@ DEFINE_string(generate_borders, "",
             "specify tag name and optional value: ISO3166-1 or admin_level=4");
 DEFINE_bool(dump_types, false, "If defined, prints all types combinations and their total count");
 DEFINE_bool(dump_prefixes, false, "If defined, prints statistics on feature name prefixes");
+DEFINE_bool(dump_search_tokens, false, "Print statistics on search tokens.");
 DEFINE_bool(unpack_mwm, false, "Unpack each section of mwm into a separate file with name filePath.sectionName.");
 DEFINE_bool(generate_packed_borders, false, "Generate packed file with country polygons");
 
@@ -238,6 +239,9 @@ int main(int argc, char ** argv)
 
   if (FLAGS_dump_prefixes)
     feature::DumpPrefixes(path + FLAGS_output + ".mwm");
 
+  if (FLAGS_dump_search_tokens)
+    feature::DumpSearchTokens(path + FLAGS_output + ".mwm");
+
   if (FLAGS_unpack_mwm)
     UnpackMwm(path + FLAGS_output + ".mwm");