[generator_tool] Added -dump_prefixes for displaying feature name token statistics

This commit is contained in:
Alex Zolotarev 2011-08-31 16:49:54 +03:00 committed by Alex Zolotarev
parent a1ee16f983
commit aebf7deb82
5 changed files with 108 additions and 59 deletions

View file

@ -2,26 +2,33 @@
#include "../defines.hpp"
/// Table of supported language names. The position of a name in this table is
/// the int8_t language code: GetLangIndex() returns it and GetLangByCode()
/// uses it as the index. Entry 0 is "default".
/// GetLangIndex() statically asserts that the table holds exactly
/// MAX_SUPPORTED_LANGUAGES entries, so additions here must keep that constant in sync.
static char const * gLangs[] = { "default",
"en", "ja", "fr", "ko_rm", "ar", "de", "ru", "sv", "zh", "fi",
"ko", "ka", "he", "be", "nl", "ga", "ja_rm", "el", "it", "es",
"th", "zh_pinyin", "ca", "cy", "hu", "hsb", "sr", "fa", "eu", "pl",
"br", "uk", "sl", "ro", "sq", "am", "fy", "gd", "cs", "sk",
"af", "hr", "hy", "tr", "kn", "pt", "lt", "lb", "bg", "eo",
"kk", "la", "et", "vi", "mn", "mk", "lv", "fur", "gsw", "ja_kana",
"is", "hi", "ku" };
/// Finds the language code for a textual language name.
/// @param lang language name to look up, compared for exact equality with the gLangs entries
/// @return index of lang inside gLangs, or -1 if lang is not in the table
int8_t StringUtf8Multilang::GetLangIndex(string const & lang)
{
  // The file-scope gLangs table is the single source of truth; it must stay
  // in sync with the number of languages the storage format supports.
  STATIC_ASSERT(ARRAY_SIZE(gLangs) == MAX_SUPPORTED_LANGUAGES);

  for (size_t i = 0; i < ARRAY_SIZE(gLangs); ++i)
    if (lang == gLangs[i])
      return static_cast<int8_t>(i);

  return -1;
}
/// Maps a language code back to its textual name from gLangs.
/// @param langCode index into gLangs
/// @return the language name, or an empty string if langCode is negative or
///         not less than the number of entries in gLangs
char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
{
  // Negative codes are rejected first, so the cast to size_t below is safe.
  bool const isValid = langCode >= 0 && static_cast<size_t>(langCode) < ARRAY_SIZE(gLangs);
  return isValid ? gLangs[langCode] : "";
}
size_t StringUtf8Multilang::GetNextIndex(size_t i) const
{
++i;

View file

@ -36,6 +36,8 @@ class StringUtf8Multilang
public:
static int8_t GetLangIndex(string const & lang);
/// @return empty string if langCode is invalid
static char const * GetLangByCode(int8_t langCode);
inline bool operator== (StringUtf8Multilang const & rhs) const
{

View file

@ -1,11 +1,15 @@
#include "dumper.hpp"
#include "../indexer/feature_processor.hpp"
#include "../indexer/classificator.hpp"
#include "../coding/multilang_utf8_string.hpp"
#include "../std/vector.hpp"
#include "../std/unordered_map.hpp"
#include "../indexer/classificator.hpp"
#include "../indexer/feature_processor.hpp"
#include "../std/algorithm.hpp"
#include "../std/bind.hpp"
#include "../std/iostream.hpp"
#include "../std/map.hpp"
#include "../std/vector.hpp"
namespace feature
{
@ -14,7 +18,7 @@ namespace feature
vector<uint32_t> m_currFeatureTypes;
public:
typedef unordered_map<vector<uint32_t>, size_t> value_type;
typedef map<vector<uint32_t>, size_t> value_type;
value_type m_stats;
size_t m_namesCount;
size_t m_totalCount;
@ -68,53 +72,88 @@ namespace feature
cout << "Features with names: " << doClass.m_namesCount << endl;
}
class NamesCollector
///////////////////////////////////////////////////////////////////
typedef map<int8_t, map<string, size_t> > TokensContainerT;
class PrefixesCollector
{
typedef unordered_map<string, size_t> NamesContainerT;
class LangsFunctor
{
public:
vector<string> m_names;
bool operator()(signed char, string const & name)
{
m_names.push_back(name);
return true;
}
};
public:
NamesContainerT m_stats;
TokensContainerT m_stats;
bool operator()(int8_t langCode, string const & name)
{
CHECK(!name.empty(), ("Feature name is empty"));
vector<string> tokens;
strings::SimpleTokenizer tok(name, " ");
while (tok)
{
tokens.push_back(*tok);
++tok;
}
if (tokens.empty())
return true;
// ignore token if it's first letter is an uppercase letter
strings::UniString const s1 = strings::MakeUniString(tokens[0]);
strings::UniString const s2 = strings::MakeLowerCase(s1);
if (s1[0] != s2[0])
return true;
for (size_t i = 1; i < tokens.size(); ++i)
{
string s;
for (size_t numTokens = 0; numTokens < i; ++numTokens)
{
s += tokens[numTokens];
s += " ";
}
pair<TokensContainerT::mapped_type::iterator, bool> found = m_stats[langCode].insert(make_pair(s, 1));
if (!found.second)
found.first->second++;
}
return true;
}
void operator()(FeatureType & f, uint32_t)
{
LangsFunctor doLangs;
f.ForEachNameRef(doLangs);
for (size_t i = 0; i < doLangs.m_names.size(); ++i)
{
strings::SimpleTokenizer tok(doLangs.m_names[i], " ");
while (tok)
{
pair<NamesContainerT::iterator, bool> found = m_stats.insert(make_pair(*tok, 1));
if (!found.second)
found.first->second++;
++tok;
}
}
f.ForEachNameRef(*this);
}
};
typedef pair<string, size_t> NameElemT;
/// Prefixes are printed only if they occur strictly more often than this.
static size_t const MIN_OCCURRENCE = 3;

/// Prints the prefix statistics of one language to stdout.
/// The language header and the entries are printed only when the first entry
/// after sorting exceeds MIN_OCCURRENCE; printing stops at the first entry
/// that does not exceed it.
/// NOTE(review): this relies on SortFunc ordering entries by descending
/// count - confirm against its definition.
/// @param langCode  language code, converted to a name via GetLangByCode
/// @param container prefix -> occurrence count for that language
void Print(int8_t langCode, TokensContainerT::mapped_type const & container)
{
  typedef pair<string, size_t> NameElemT;
  typedef vector<NameElemT> VecToSortT;

  VecToSortT v(container.begin(), container.end());
  // Nothing to report for an empty container; this also keeps the v[0]
  // access below within bounds.
  if (v.empty())
    return;

  sort(v.begin(), v.end(), &SortFunc<NameElemT>);

  // do not display prefixes with low occurrences
  if (v[0].second > MIN_OCCURRENCE)
  {
    cout << "Language code: " << StringUtf8Multilang::GetLangByCode(langCode) << endl;

    for (VecToSortT::iterator it = v.begin(); it != v.end(); ++it)
    {
      if (it->second <= MIN_OCCURRENCE)
        break;
      cout << it->second << " " << it->first << endl;
    }
  }
}
/// Collects name-prefix statistics from all features of the file at fPath
/// and prints them to stdout, one language after another in ascending
/// language-code order (the iteration order of the underlying map).
/// @param fPath path of the features file passed to feature::ForEachFromDat
void DumpPrefixes(string const & fPath)
{
  PrefixesCollector doClass;
  feature::ForEachFromDat(fPath, doClass);

  for (TokensContainerT::iterator it = doClass.m_stats.begin();
       it != doClass.m_stats.end(); ++it)
  {
    Print(it->first, it->second);
  }
}
}

View file

@ -5,5 +5,5 @@
namespace feature
{
/// Prints all types combinations and their total count for the features
/// stored in the file at fPath.
void DumpTypes(string const & fPath);
/// NOTE(review): no complete definition of DumpNames is visible next to this
/// declaration - confirm it is still implemented before calling it.
void DumpNames(string const & fPath);
/// Prints statistics on feature name prefixes for the features stored in the
/// file at fPath.
void DumpPrefixes(string const & fPath);
}

View file

@ -51,6 +51,7 @@ DEFINE_string(generate_borders, "",
"Create binary country .borders file for osm xml file given in 'output' parameter,"
"specify tag name and optional value: ISO3166-1 or admin_level=4");
DEFINE_bool(dump_types, false, "If defined, prints all types combinations and their total count");
DEFINE_bool(dump_prefixes, false, "If defined, prints statistics on feature name prefixes");
DEFINE_bool(unpack_mwm, false, "Unpack each section of mwm into a separate file with name filePath.sectionName.");
string AddSlashIfNeeded(string const & str)
@ -106,7 +107,7 @@ int main(int argc, char ** argv)
// load classificator only if necessary
if (FLAGS_generate_features || FLAGS_generate_geometry ||
FLAGS_generate_index || FLAGS_generate_search_index ||
FLAGS_calc_statistics || FLAGS_dump_types)
FLAGS_calc_statistics || FLAGS_dump_types || FLAGS_dump_prefixes)
{
classificator::Read(pl.GetReader("drawing_rules.bin"),
pl.GetReader("classificator.txt"),
@ -218,10 +219,10 @@ int main(int argc, char ** argv)
}
if (FLAGS_dump_types)
{
//feature::DumpNames(path + FLAGS_output + ".mwm");
feature::DumpTypes(path + FLAGS_output + ".mwm");
}
if (FLAGS_dump_prefixes)
feature::DumpPrefixes(path + FLAGS_output + ".mwm");
if (FLAGS_unpack_mwm)
{