diff --git a/coding/multilang_utf8_string.cpp b/coding/multilang_utf8_string.cpp
index 54c539549d..06a7b7ea99 100644
--- a/coding/multilang_utf8_string.cpp
+++ b/coding/multilang_utf8_string.cpp
@@ -2,26 +2,39 @@
 
 #include "../defines.hpp"
 
+// Language names indexed by language code: the position of a name in this
+// table is the value GetLangIndex() returns and GetLangByCode() accepts.
+static char const * gLangs[] = { "default",
+  "en", "ja", "fr", "ko_rm", "ar", "de", "ru", "sv", "zh", "fi",
+  "ko", "ka", "he", "be", "nl", "ga", "ja_rm", "el", "it", "es",
+  "th", "zh_pinyin", "ca", "cy", "hu", "hsb", "sr", "fa", "eu", "pl",
+  "br", "uk", "sl", "ro", "sq", "am", "fy", "gd", "cs", "sk",
+  "af", "hr", "hy", "tr", "kn", "pt", "lt", "lb", "bg", "eo",
+  "kk", "la", "et", "vi", "mn", "mk", "lv", "fur", "gsw", "ja_kana",
+  "is", "hi", "ku" };
+
 int8_t StringUtf8Multilang::GetLangIndex(string const & lang)
 {
-  static char const * arr[] = { "default",
-  "en", "ja", "fr", "ko_rm", "ar", "de", "ru", "sv", "zh", "fi",
-  "ko", "ka", "he", "be", "nl", "ga", "ja_rm", "el", "it", "es",
-  "th", "zh_pinyin", "ca", "cy", "hu", "hsb", "sr", "fa", "eu", "pl",
-  "br", "uk", "sl", "ro", "sq", "am", "fy", "gd", "cs", "sk",
-  "af", "hr", "hy", "tr", "kn", "pt", "lt", "lb", "bg", "eo",
-  "kk", "la", "et", "vi", "mn", "mk", "lv", "fur", "gsw", "ja_kana",
-  "is", "hi", "ku" };
+  STATIC_ASSERT(ARRAY_SIZE(gLangs) == MAX_SUPPORTED_LANGUAGES);
 
-  STATIC_ASSERT(ARRAY_SIZE(arr) == MAX_SUPPORTED_LANGUAGES);
-
-  for (size_t i = 0; i < ARRAY_SIZE(arr); ++i)
-    if (lang == arr[i])
+  for (size_t i = 0; i < ARRAY_SIZE(gLangs); ++i)
+    if (lang == gLangs[i])
       return static_cast(i);
 
   return -1;
 }
 
+// Returns the language name stored in gLangs for langCode, or an empty
+// string when langCode does not index the table.
+char const * StringUtf8Multilang::GetLangByCode(int8_t langCode)
+{
+  // Negative codes are rejected first, so the cast below cannot wrap around;
+  // comparing as size_t avoids mixing signed and unsigned operands.
+  if (langCode < 0 || static_cast<size_t>(langCode) >= ARRAY_SIZE(gLangs))
+    return "";
+  return gLangs[langCode];
+}
+
 size_t StringUtf8Multilang::GetNextIndex(size_t i) const
 {
   ++i;
diff --git a/coding/multilang_utf8_string.hpp b/coding/multilang_utf8_string.hpp
index ccde7c6a45..27ce471fba 100644
--- a/coding/multilang_utf8_string.hpp
+++ b/coding/multilang_utf8_string.hpp
@@ -36,6 +36,8 @@ class
StringUtf8Multilang
 
 public:
   static int8_t GetLangIndex(string const & lang);
+  /// @return language name for langCode, or an empty string if langCode is invalid
+  static char const * GetLangByCode(int8_t langCode);
 
   inline bool operator== (StringUtf8Multilang const & rhs) const
   {
diff --git a/generator/dumper.cpp b/generator/dumper.cpp
index d1f7527cdb..b6dc76efdc 100644
--- a/generator/dumper.cpp
+++ b/generator/dumper.cpp
@@ -1,11 +1,15 @@
 #include "dumper.hpp"
 
-#include "../indexer/feature_processor.hpp"
-#include "../indexer/classificator.hpp"
+#include "../coding/multilang_utf8_string.hpp"
 
-#include "../std/vector.hpp"
-#include "../std/unordered_map.hpp"
+#include "../indexer/classificator.hpp"
+#include "../indexer/feature_processor.hpp"
+
+#include "../std/algorithm.hpp"
+#include "../std/bind.hpp"
 #include "../std/iostream.hpp"
+#include "../std/map.hpp"
+#include "../std/vector.hpp"
 
 namespace feature
 {
@@ -14,7 +18,7 @@ namespace feature
     vector m_currFeatureTypes;
 
   public:
-    typedef unordered_map, size_t> value_type;
+    typedef map, size_t> value_type;
     value_type m_stats;
     size_t m_namesCount;
     size_t m_totalCount;
@@ -68,53 +72,108 @@ namespace feature
     cout << "Features with names: " << doClass.m_namesCount << endl;
   }
 
-  class NamesCollector
+  ///////////////////////////////////////////////////////////////////
+
+  /// Per-language statistics: language code -> (name prefix -> occurrences).
+  typedef map<int8_t, map<string, size_t> > TokensContainerT;
+
+  /// Counts, per language, how often each leading run of space-separated
+  /// tokens occurs in feature names.
+  class PrefixesCollector
   {
-    typedef unordered_map NamesContainerT;
-
-    class LangsFunctor
-    {
-    public:
-      vector m_names;
-      bool operator()(signed char, string const & name)
-      {
-        m_names.push_back(name);
-        return true;
-      }
-    };
-
   public:
-    NamesContainerT m_stats;
+    TokensContainerT m_stats;
+
+    /// Per-name callback passed to FeatureType::ForEachNameRef.
+    /// @param langCode language code of the name
+    /// @param name     feature name, must not be empty
+    /// @return always true
+    bool operator()(int8_t langCode, string const & name)
+    {
+      CHECK(!name.empty(), ("Feature name is empty"));
+
+      vector<string> tokens;
+      strings::SimpleTokenizer tok(name, " ");
+      while (tok)
+      {
+        tokens.push_back(*tok);
+        ++tok;
+      }
+
+      // A single token has no proper prefix, so there is nothing to count.
+      if (tokens.size() < 2)
+        return true;
+
+      // Skip the whole name if its first letter is an uppercase letter.
+      strings::UniString const s1 = strings::MakeUniString(tokens[0]);
+      strings::UniString const s2 = strings::MakeLowerCase(s1);
+      if (s1[0] != s2[0])
+        return true;
+
+      // Count every proper prefix (first token, first two tokens, ...). The
+      // prefix is extended in place instead of being rebuilt for each length,
+      // and each token keeps a trailing space, e.g. "a b c" -> "a ", "a b ".
+      TokensContainerT::mapped_type & stats = m_stats[langCode];
+      string prefix;
+      for (size_t i = 0; i + 1 < tokens.size(); ++i)
+      {
+        prefix += tokens[i];
+        prefix += " ";
+        ++stats[prefix];
+      }
+      return true;
+    }
+
     void operator()(FeatureType & f, uint32_t)
     {
-      LangsFunctor doLangs;
-      f.ForEachNameRef(doLangs);
-      for (size_t i = 0; i < doLangs.m_names.size(); ++i)
-      {
-        strings::SimpleTokenizer tok(doLangs.m_names[i], " ");
-        while (tok)
-        {
-          pair found = m_stats.insert(make_pair(*tok, 1));
-          if (!found.second)
-            found.first->second++;
-          ++tok;
-        }
-      }
+      // Feed every name of the feature to the per-name overload above.
+      f.ForEachNameRef(*this);
     }
   };
 
-  typedef pair NameElemT;
-  void DumpNames(string const & fPath)
+  /// Prefixes are printed only if they occur more than this many times.
+  static size_t const MIN_OCCURRENCE = 3;
+
+  /// Prints the prefixes of one language that occur more than MIN_OCCURRENCE
+  /// times, preceded by a "Language code: ..." header line. Prints nothing
+  /// when no prefix of this language qualifies.
+  void Print(int8_t langCode, TokensContainerT::mapped_type const & container)
   {
-    NamesCollector doClass;
-    feature::ForEachFromDat(fPath, doClass);
-
+    typedef pair<string, size_t> NameElemT;
     typedef vector VecToSortT;
-    VecToSortT vecToSort(doClass.m_stats.begin(), doClass.m_stats.end());
-    sort(vecToSort.begin(), vecToSort.end(), &SortFunc);
+    VecToSortT v(container.begin(), container.end());
+    // NOTE(review): SortFunc is defined outside this hunk; the early exit and
+    // the break below assume it orders elements by descending occurrence
+    // count - confirm.
+    sort(v.begin(), v.end(), &SortFunc);
+
+    // Nothing to print for an empty container or when even the first (most
+    // frequent) prefix does not exceed the threshold.
+    if (v.empty() || v[0].second <= MIN_OCCURRENCE)
+      return;
+
+    cout << "Language code: " << StringUtf8Multilang::GetLangByCode(langCode) << endl;
+
+    for (VecToSortT::iterator it = v.begin(); it != v.end(); ++it)
+    {
+      if (it->second <= MIN_OCCURRENCE)
+        break;
+      cout << it->second << " " << it->first << endl;
+    }
+  }
+
+  /// Runs PrefixesCollector over ForEachFromDat(fPath, ...) and prints the
+  /// collected statistics to cout, one language after another.
+  void DumpPrefixes(string const & fPath)
+  {
+    PrefixesCollector doClass;
+    feature::ForEachFromDat(fPath, doClass);
+    for (TokensContainerT::iterator it = doClass.m_stats.begin();
+         it != doClass.m_stats.end(); ++it)
+    {
+      Print(it->first, it->second);
+    }
 
-    for (VecToSortT::iterator it = vecToSort.begin(); it != vecToSort.end(); ++it)
-      cout << it->second << " " << it->first << endl;
   }
 
 }
diff --git a/generator/dumper.hpp b/generator/dumper.hpp
index 665694e48d..846e3d8e3b 100644
--- a/generator/dumper.hpp
+++ b/generator/dumper.hpp
@@ -5,5 +5,5 @@
 namespace feature
 {
   void DumpTypes(string const & fPath);
-  void DumpNames(string const & fPath);
+  void DumpPrefixes(string const & fPath);
 }
diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp
index c6b185e603..02c08774b3 100644
--- a/generator/generator_tool/generator_tool.cpp
+++ b/generator/generator_tool/generator_tool.cpp
@@ -51,6 +51,7 @@ DEFINE_string(generate_borders, "",
             "Create binary country .borders file for osm xml file given in 'output' parameter,"
             "specify tag name and optional value: ISO3166-1 or admin_level=4");
 DEFINE_bool(dump_types, false, "If defined, prints all types combinations and their total count");
+DEFINE_bool(dump_prefixes, false, "If defined, prints statistics on feature name prefixes");
 DEFINE_bool(unpack_mwm, false, "Unpack each section of mwm into a separate file with name filePath.sectionName.");
 
 string AddSlashIfNeeded(string const & str)
@@ -106,7 +107,7 @@ int main(int argc, char ** argv)
   // load classificator only if necessary
   if (FLAGS_generate_features || FLAGS_generate_geometry ||
       FLAGS_generate_index || FLAGS_generate_search_index ||
-      FLAGS_calc_statistics || FLAGS_dump_types)
+      FLAGS_calc_statistics || FLAGS_dump_types || FLAGS_dump_prefixes)
   {
     classificator::Read(pl.GetReader("drawing_rules.bin"),
                         pl.GetReader("classificator.txt"),
@@ -218,10 +219,10 @@ int main(int argc, char ** argv)
     }
 
     if (FLAGS_dump_types)
-    {
-      //feature::DumpNames(path + FLAGS_output + ".mwm");
       feature::DumpTypes(path + FLAGS_output + ".mwm");
-    }
+
+    if (FLAGS_dump_prefixes)
+      feature::DumpPrefixes(path + FLAGS_output + ".mwm");
 
     if (FLAGS_unpack_mwm)
     {