forked from organicmaps/organicmaps
[generator_tool] Added -dump_prefixes for displaying feature name tokens stats
This commit is contained in:
parent
a1ee16f983
commit
aebf7deb82
5 changed files with 108 additions and 59 deletions
|
@ -2,26 +2,33 @@
|
|||
|
||||
#include "../defines.hpp"
|
||||
|
||||
/// Table of supported language codes.
/// The position of a code in this table is the index returned by
/// StringUtf8Multilang::GetLangIndex() and accepted by
/// StringUtf8Multilang::GetLangByCode(); entry 0 is "default".
/// NOTE(review): indexes are presumably persisted in generated data, so new
/// codes should only be appended - confirm before reordering.
static char const * const gLangs[] = { "default",
  "en", "ja", "fr", "ko_rm", "ar", "de", "ru", "sv", "zh", "fi",
  "ko", "ka", "he", "be", "nl", "ga", "ja_rm", "el", "it", "es",
  "th", "zh_pinyin", "ca", "cy", "hu", "hsb", "sr", "fa", "eu", "pl",
  "br", "uk", "sl", "ro", "sq", "am", "fy", "gd", "cs", "sk",
  "af", "hr", "hy", "tr", "kn", "pt", "lt", "lb", "bg", "eo",
  "kk", "la", "et", "vi", "mn", "mk", "lv", "fur", "gsw", "ja_kana",
  "is", "hi", "ku" };
|
||||
|
||||
/// Finds the index of a language code in the file-level gLangs table.
/// @param lang language code to look up; it must match a table entry exactly.
/// @return zero-based position of the code in gLangs, or -1 if it is not listed.
int8_t StringUtf8Multilang::GetLangIndex(string const & lang)
{
  // The table size must agree with the declared number of supported languages.
  STATIC_ASSERT(ARRAY_SIZE(gLangs) == MAX_SUPPORTED_LANGUAGES);

  for (size_t i = 0; i < ARRAY_SIZE(gLangs); ++i)
    if (lang == gLangs[i])
      return static_cast<int8_t>(i);

  return -1;
}
|
||||
|
||||
char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
|
||||
{
|
||||
if (langCode < 0 || langCode > ARRAY_SIZE(gLangs) - 1)
|
||||
return "";
|
||||
return gLangs[langCode];
|
||||
}
|
||||
|
||||
size_t StringUtf8Multilang::GetNextIndex(size_t i) const
|
||||
{
|
||||
++i;
|
||||
|
|
|
@ -36,6 +36,8 @@ class StringUtf8Multilang
|
|||
|
||||
public:
|
||||
static int8_t GetLangIndex(string const & lang);
|
||||
/// @return empty string if langCode is invalid
|
||||
static char const * GetLangByCode(int8_t langCode);
|
||||
|
||||
inline bool operator== (StringUtf8Multilang const & rhs) const
|
||||
{
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
#include "dumper.hpp"
|
||||
|
||||
#include "../indexer/feature_processor.hpp"
|
||||
#include "../indexer/classificator.hpp"
|
||||
#include "../coding/multilang_utf8_string.hpp"
|
||||
|
||||
#include "../std/vector.hpp"
|
||||
#include "../std/unordered_map.hpp"
|
||||
#include "../indexer/classificator.hpp"
|
||||
#include "../indexer/feature_processor.hpp"
|
||||
|
||||
#include "../std/algorithm.hpp"
|
||||
#include "../std/bind.hpp"
|
||||
#include "../std/iostream.hpp"
|
||||
#include "../std/map.hpp"
|
||||
#include "../std/vector.hpp"
|
||||
|
||||
namespace feature
|
||||
{
|
||||
|
@ -14,7 +18,7 @@ namespace feature
|
|||
vector<uint32_t> m_currFeatureTypes;
|
||||
|
||||
public:
|
||||
typedef unordered_map<vector<uint32_t>, size_t> value_type;
|
||||
typedef map<vector<uint32_t>, size_t> value_type;
|
||||
value_type m_stats;
|
||||
size_t m_namesCount;
|
||||
size_t m_totalCount;
|
||||
|
@ -68,53 +72,88 @@ namespace feature
|
|||
cout << "Features with names: " << doClass.m_namesCount << endl;
|
||||
}
|
||||
|
||||
class NamesCollector
|
||||
///////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef map<int8_t, map<string, size_t> > TokensContainerT;
|
||||
class PrefixesCollector
|
||||
{
|
||||
typedef unordered_map<string, size_t> NamesContainerT;
|
||||
|
||||
class LangsFunctor
|
||||
{
|
||||
public:
|
||||
vector<string> m_names;
|
||||
bool operator()(signed char, string const & name)
|
||||
{
|
||||
m_names.push_back(name);
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
public:
|
||||
NamesContainerT m_stats;
|
||||
TokensContainerT m_stats;
|
||||
|
||||
bool operator()(int8_t langCode, string const & name)
|
||||
{
|
||||
CHECK(!name.empty(), ("Feature name is empty"));
|
||||
|
||||
vector<string> tokens;
|
||||
strings::SimpleTokenizer tok(name, " ");
|
||||
while (tok)
|
||||
{
|
||||
tokens.push_back(*tok);
|
||||
++tok;
|
||||
}
|
||||
|
||||
if (tokens.empty())
|
||||
return true;
|
||||
// ignore token if it's first letter is an uppercase letter
|
||||
strings::UniString const s1 = strings::MakeUniString(tokens[0]);
|
||||
strings::UniString const s2 = strings::MakeLowerCase(s1);
|
||||
if (s1[0] != s2[0])
|
||||
return true;
|
||||
|
||||
for (size_t i = 1; i < tokens.size(); ++i)
|
||||
{
|
||||
string s;
|
||||
for (size_t numTokens = 0; numTokens < i; ++numTokens)
|
||||
{
|
||||
s += tokens[numTokens];
|
||||
s += " ";
|
||||
}
|
||||
pair<TokensContainerT::mapped_type::iterator, bool> found = m_stats[langCode].insert(make_pair(s, 1));
|
||||
if (!found.second)
|
||||
found.first->second++;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void operator()(FeatureType & f, uint32_t)
|
||||
{
|
||||
LangsFunctor doLangs;
|
||||
f.ForEachNameRef(doLangs);
|
||||
for (size_t i = 0; i < doLangs.m_names.size(); ++i)
|
||||
{
|
||||
strings::SimpleTokenizer tok(doLangs.m_names[i], " ");
|
||||
while (tok)
|
||||
{
|
||||
pair<NamesContainerT::iterator, bool> found = m_stats.insert(make_pair(*tok, 1));
|
||||
if (!found.second)
|
||||
found.first->second++;
|
||||
++tok;
|
||||
}
|
||||
}
|
||||
f.ForEachNameRef(*this);
|
||||
}
|
||||
};
|
||||
|
||||
typedef pair<string, size_t> NameElemT;
|
||||
void DumpNames(string const & fPath)
|
||||
static size_t const MIN_OCCURRENCE = 3;
|
||||
|
||||
void Print(int8_t langCode, TokensContainerT::mapped_type const & container)
|
||||
{
|
||||
NamesCollector doClass;
|
||||
feature::ForEachFromDat(fPath, doClass);
|
||||
|
||||
typedef pair<string, size_t> NameElemT;
|
||||
typedef vector<NameElemT> VecToSortT;
|
||||
VecToSortT vecToSort(doClass.m_stats.begin(), doClass.m_stats.end());
|
||||
sort(vecToSort.begin(), vecToSort.end(), &SortFunc<NameElemT>);
|
||||
VecToSortT v(container.begin(), container.end());
|
||||
sort(v.begin(), v.end(), &SortFunc<NameElemT>);
|
||||
|
||||
// do not display prefixes with low occurrences
|
||||
if (v[0].second > MIN_OCCURRENCE)
|
||||
{
|
||||
cout << "Language code: " << StringUtf8Multilang::GetLangByCode(langCode) << endl;
|
||||
|
||||
for (VecToSortT::iterator it = v.begin(); it != v.end(); ++it)
|
||||
{
|
||||
if (it->second <= MIN_OCCURRENCE)
|
||||
break;
|
||||
cout << it->second << " " << it->first << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects name-prefix statistics from the features stored in the file at
/// fPath and prints them language by language.
/// @param fPath path of the feature data file passed to feature::ForEachFromDat.
void DumpPrefixes(string const & fPath)
{
  PrefixesCollector doClass;
  feature::ForEachFromDat(fPath, doClass);

  // m_stats is keyed by language code; Print handles one language at a time.
  for (TokensContainerT::iterator it = doClass.m_stats.begin();
       it != doClass.m_stats.end(); ++it)
  {
    Print(it->first, it->second);
  }
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,5 +5,5 @@
|
|||
namespace feature
|
||||
{
|
||||
void DumpTypes(string const & fPath);
|
||||
void DumpNames(string const & fPath);
|
||||
void DumpPrefixes(string const & fPath);
|
||||
}
|
||||
|
|
|
@ -51,6 +51,7 @@ DEFINE_string(generate_borders, "",
|
|||
"Create binary country .borders file for osm xml file given in 'output' parameter,"
|
||||
"specify tag name and optional value: ISO3166-1 or admin_level=4");
|
||||
DEFINE_bool(dump_types, false, "If defined, prints all types combinations and their total count");
|
||||
DEFINE_bool(dump_prefixes, false, "If defined, prints statistics on feature name prefixes");
|
||||
DEFINE_bool(unpack_mwm, false, "Unpack each section of mwm into a separate file with name filePath.sectionName.");
|
||||
|
||||
string AddSlashIfNeeded(string const & str)
|
||||
|
@ -106,7 +107,7 @@ int main(int argc, char ** argv)
|
|||
// load classificator only if necessary
|
||||
if (FLAGS_generate_features || FLAGS_generate_geometry ||
|
||||
FLAGS_generate_index || FLAGS_generate_search_index ||
|
||||
FLAGS_calc_statistics || FLAGS_dump_types)
|
||||
FLAGS_calc_statistics || FLAGS_dump_types || FLAGS_dump_prefixes)
|
||||
{
|
||||
classificator::Read(pl.GetReader("drawing_rules.bin"),
|
||||
pl.GetReader("classificator.txt"),
|
||||
|
@ -218,10 +219,10 @@ int main(int argc, char ** argv)
|
|||
}
|
||||
|
||||
if (FLAGS_dump_types)
|
||||
{
|
||||
//feature::DumpNames(path + FLAGS_output + ".mwm");
|
||||
feature::DumpTypes(path + FLAGS_output + ".mwm");
|
||||
}
|
||||
|
||||
if (FLAGS_dump_prefixes)
|
||||
feature::DumpPrefixes(path + FLAGS_output + ".mwm");
|
||||
|
||||
if (FLAGS_unpack_mwm)
|
||||
{
|
||||
|
|
Loading…
Add table
Reference in a new issue