diff --git a/generator/generator_tool/generator_tool.cpp b/generator/generator_tool/generator_tool.cpp
index de9e42611d..34b3ea9120 100644
--- a/generator/generator_tool/generator_tool.cpp
+++ b/generator/generator_tool/generator_tool.cpp
@@ -180,8 +180,9 @@ DEFINE_string(uk_postcodes_dataset, "", "Path to dataset with UK postcodes.");
 DEFINE_string(us_postcodes_dataset, "", "Path to dataset with US postcodes.");
 
 // Printing stuff.
-DEFINE_bool(calc_statistics, false, "Calculate feature statistics for specified mwm bucket files.");
-DEFINE_bool(type_statistics, false, "Calculate statistics by type for specified mwm bucket files.");
+DEFINE_bool(calc_stats, false, "Print file and feature stats.");
+DEFINE_bool(calc_geom_stats, false, "Print outer geometry stats.");
+DEFINE_bool(calc_type_stats, false, "Print feature stats by type.");
 DEFINE_bool(dump_types, false, "Prints all types combinations and their total count.");
 DEFINE_bool(dump_prefixes, false, "Prints statistics on feature's' name prefixes.");
 DEFINE_bool(dump_search_tokens, false, "Print statistics on search tokens.");
@@ -598,23 +599,30 @@ MAIN_WITH_ERROR_HANDLING([](int argc, char ** argv)
 
   string const dataFile = base::JoinPath(path, FLAGS_output + DATA_FILE_EXTENSION);
 
-  if (FLAGS_calc_statistics || FLAGS_type_statistics)
+  if (FLAGS_calc_stats || FLAGS_calc_geom_stats || FLAGS_calc_type_stats)
   {
+    LOG(LINFO, ("Calculating statistics for", dataFile));
     auto file = OfstreamWithExceptions(genInfo.GetIntermediateFileName(FLAGS_output, STATS_EXTENSION));
     stats::MapInfo info;
     stats::CalcStatistics(dataFile, info);
 
-    if (FLAGS_calc_statistics)
+    if (FLAGS_calc_stats)
     {
-      LOG(LINFO, ("Calculating statistics for", dataFile));
+      LOG(LINFO, ("Writing general statistics"));
       stats::FileContainerStatistics(file, dataFile);
       stats::PrintStatistics(file, info);
     }
-    else
+    if (FLAGS_calc_geom_stats)
     {
-      LOG(LINFO, ("Calculating type statistics for", dataFile));
+      LOG(LINFO, ("Writing geometry statistics"));
+      stats::PrintOuterGeometryStatistics(file, info);
+    }
+    if (FLAGS_calc_type_stats)
+    {
+      LOG(LINFO, ("Writing types statistics"));
       stats::PrintTypeStatistics(file, info);
     }
+    LOG(LINFO, ("Stats written to file", FLAGS_output + STATS_EXTENSION));
   }
 
   if (FLAGS_dump_types)
diff --git a/generator/statistics.cpp b/generator/statistics.cpp
index fd2f5c1b69..67f130a5a3 100644
--- a/generator/statistics.cpp
+++ b/generator/statistics.cpp
@@ -17,6 +17,10 @@ using namespace std;
 
 namespace stats
 {
+  // A geometry level "almost duplicates" the next (more detailed) one when the latter
+  // has less than kAlmostDupElemsFactor times as many elements.
+  static const double kAlmostDupElemsFactor = 1.5;
+
   void FileContainerStatistics(std::ostream & os, string const & fPath)
   {
     try
@@ -25,13 +29,15 @@ namespace stats
       FilesContainerR cont(fPath);
       cont.ForEachTag([&] (FilesContainerR::Tag const & tag)
       {
-        os << std::setw(18) << tag << " : " << cont.GetReader(tag).Size() << endl;
+        os << std::setw(18) << tag << " : "
+           << std::setw(10) << cont.GetReader(tag).Size() << endl;
       });
     }
     catch (Reader::Exception const & ex)
     {
       LOG(LWARNING, ("Error reading file:", fPath, ex.Msg()));
     }
+    os << endl;
   }
 
   // 0.001 deg² ≈ 12.392 km² * cos(lat)
@@ -63,14 +69,45 @@ namespace stats
       m_info.m_inner[2].Add(innerStats.m_size);
 
       // Get size stats and load the best geometry.
-      FeatureType::GeomStat const geom = f.GetOuterGeometrySize();
-      FeatureType::GeomStat const trg = f.GetOuterTrianglesSize();
+      FeatureType::GeomStat const geom = f.GetOuterGeometryStats();
+      FeatureType::GeomStat const trg = f.GetOuterTrianglesStats();
 
-      m_info.m_byPointsCount[CountType(geom.m_count)].Add(innerStats.m_points + geom.m_size);
-      m_info.m_byTrgCount[CountType(trg.m_count)].Add(innerStats.m_strips + trg.m_size);
+      uint32_t geomSize = 0, trgSize = 0;
+      int const n = feature::DataHeader::kMaxScalesCount;
+      for (int ind = 0; ind < n; ++ind)
+      {
+        m_info.m_byLineGeom[ind].Add(geom.m_sizes[ind], geom.m_elements[ind]);
+        geomSize += geom.m_sizes[ind];
+        m_info.m_byAreaGeom[ind].Add(trg.m_sizes[ind], trg.m_elements[ind]);
+        trgSize += trg.m_sizes[ind];
+
+        if (ind > 0)
+        {
+          // If a feature has a more simplified version of current geometry.
+          if (geom.m_elements[ind - 1] > 0)
+            m_info.m_byLineGeomCompared[ind].Add(geom.m_sizes[ind], geom.m_elements[ind]);
+          if (trg.m_elements[ind - 1] > 0)
+            m_info.m_byAreaGeomCompared[ind].Add(trg.m_sizes[ind], trg.m_elements[ind]);
+        }
+
+        if (ind < n - 1)
+        {
+          // If feature's current geometry almost duplicates a more detailed one
+          // (has <kAlmostDupElemsFactor less elements).
+          if (geom.m_elements[ind] > 0 && geom.m_elements[ind] * kAlmostDupElemsFactor > geom.m_elements[ind + 1])
+            m_info.m_byLineGeomDup[ind].Add(geom.m_sizes[ind], geom.m_elements[ind]);
+          if (trg.m_elements[ind] > 0 && trg.m_elements[ind] * kAlmostDupElemsFactor > trg.m_elements[ind + 1])
+            m_info.m_byAreaGeomDup[ind].Add(trg.m_sizes[ind], trg.m_elements[ind]);
+        }
+      }
+
+      // NOTE(review): the getters store the best geometry's element count at index
+      // (scales count - 1); reading slot n - 1 here assumes the file has all n scales.
+      m_info.m_byPointsCount[CountType(geom.m_elements[n - 1])].Add(innerStats.m_points + geomSize);
+      m_info.m_byTrgCount[CountType(trg.m_elements[n - 1])].Add(innerStats.m_strips + trgSize);
 
       // Header size (incl. inner geometry) + outer geometry size.
-      uint32_t const allSize = innerStats.m_size + geom.m_size + trg.m_size;
+      uint32_t const allSize = innerStats.m_size + geomSize + trgSize;
       double len = 0.0;
       double area = 0.0;
 
@@ -121,7 +158,7 @@ namespace stats
   {
     os << std::setw(prefixWidth) << prefix
        << ": size = " << std::setw(9) << info.m_size
-       << "; features = " << std::setw(8) << info.m_count;
+       << "; features = " << std::setw(7) << info.m_count;
 
     if (measurements)
     {
@@ -197,9 +234,9 @@ namespace stats
 
   void PrintStatistics(std::ostream & os, MapInfo & info)
   {
-    PrintInfo(os, "\nFeature headers", info.m_inner[2]);
-    PrintInfo(os, " incl. inner points", info.m_inner[0]);
-    PrintInfo(os, " incl. inner triangles (strips)", info.m_inner[1]);
+    PrintInfo(os, "Feature headers", info.m_inner[2], 30);
+    PrintInfo(os, "incl. inner points", info.m_inner[0], 30);
+    PrintInfo(os, "incl. inner triangles (strips)", info.m_inner[1], 30);
 
     PrintTop(os, "Top SIZE by Geometry Type", info.m_byGeomType, 5, true);
     PrintTop(os, "Top SIZE by Classificator Type\n"
@@ -208,15 +245,96 @@ namespace stats
     PrintTop(os, "Top SIZE by Points Count", info.m_byPointsCount);
     PrintTop(os, "Top SIZE by Triangles Count", info.m_byTrgCount);
     PrintTop(os, "Top SIZE by Area", info.m_byAreaSize, 5, true);
+    os << endl;
   }
 
   void PrintTypeStatistics(std::ostream & os, MapInfo & info)
   {
-    os << "NOTE: a single feature can contain several types and thus its size can be included in several type lines."
-       << endl << endl;
+    os << "Feature stats by Classificator Type" << endl
+       << "(a single feature can contain several types and thus its size can be included in several type lines)"
+       << endl;
     for (auto it = info.m_byClassifType.begin(); it != info.m_byClassifType.end(); ++it)
     {
-      PrintInfo(os, GetKey(it->first).c_str(), it->second, 30, true, true);
+      PrintInfo(os, GetKey(it->first), it->second, 30, true, true);
+    }
+    os << endl;
+  }
+
+  // Returns num / denom, or 0 when denom is 0 (e.g. a scale level without any geometry).
+  static double SafeRatio(uint64_t num, uint64_t denom)
+  {
+    return denom == 0 ? 0.0 : static_cast<double>(num) / static_cast<double>(denom);
+  }
+
+  // Prints per-scale stats for one kind of outer geometry: totals for every level,
+  // a comparison with the previous (more simplified) level and the "almost duplicates".
+  void PrintGeometryInfo(std::ostream & os, char const * prefix, GeomStats const & geomStats,
+                         GeomStats const & comparedStats, GeomStats const & dupStats)
+  {
+    int const n = feature::DataHeader::kMaxScalesCount;
+    for (int ind = 0; ind < n; ++ind)
+    {
+      GeomInfo const & info = geomStats[ind];
+      if (ind > 0)
+      {
+        GeomInfo const & compInfo = comparedStats[ind];
+        os << prefix << ind << "w/" << ind - 1
+           << ": size = " << std::setw(9) << compInfo.m_size
+           << ": elements = " << std::setw(9) << compInfo.m_elements
+           << "; feats w/" << prefix << ind - 1
+           << " = " << std::setw(7) << compInfo.m_count
+           << "; elems/feats = " << std::setw(5)
+           << SafeRatio(compInfo.m_elements, compInfo.m_count)
+           << "; size factor = " << std::setw(4)
+           << SafeRatio(compInfo.m_size, geomStats[ind - 1].m_size)
+           << "x; elems factor = " << std::setw(4)
+           << SafeRatio(compInfo.m_elements, geomStats[ind - 1].m_elements)
+           << "x" << endl;
+      }
+      os << " " << prefix << ind
+         << ": size = " << std::setw(9) << info.m_size
+         << ": elements = " << std::setw(9) << info.m_elements
+         << "; features = " << std::setw(7) << info.m_count
+         << "; elems/feats = " << std::setw(5)
+         << SafeRatio(info.m_elements, info.m_count)
+         << "; bytes/elems = " << std::setw(4)
+         << SafeRatio(info.m_size, info.m_elements)
+         << endl;
+    }
+
+    os << "Geometry almost duplicating (<" << kAlmostDupElemsFactor
+       << "x less elements) a more detailed one" << endl;
+    for (int ind = 0; ind < n - 1; ++ind)
+    {
+      GeomInfo const & dupInfo = dupStats[ind];
+      // Integer percentage; guard against a level with no geometry at all (division by zero).
+      uint64_t const totalSize = geomStats[ind].m_size;
+      uint64_t const dupPercent = totalSize == 0 ? 0 : 100 * dupInfo.m_size / totalSize;
+      os << prefix << ind << "~=" << ind + 1
+         << ": size = " << std::setw(9) << dupInfo.m_size
+         << ": elements = " << std::setw(9) << dupInfo.m_elements
+         << "; features = " << std::setw(7) << dupInfo.m_count
+         << "; elems/feats = " << std::setw(5)
+         << SafeRatio(dupInfo.m_elements, dupInfo.m_count)
+         << "; dups size % = " << std::setw(2)
+         << dupPercent << "%"
+         << endl;
     }
   }
+
+  void PrintOuterGeometryStatistics(std::ostream & os, MapInfo & info)
+  {
+    // Fixed-point formatting is needed only for the ratios printed here,
+    // so remember the stream state and restore it for whoever writes to |os| next.
+    std::ios_base::fmtflags const savedFlags = os.flags();
+    std::streamsize const savedPrecision = os.precision();
+    os << "Outer LINE geometry" << fixed << setprecision(1) << endl;
+    PrintGeometryInfo(os, "geom", info.m_byLineGeom, info.m_byLineGeomCompared, info.m_byLineGeomDup);
+
+    os << endl << "Outer AREA geometry" << endl;
+    PrintGeometryInfo(os, "trg", info.m_byAreaGeom, info.m_byAreaGeomCompared, info.m_byAreaGeomDup);
+    os << endl;
+    os.flags(savedFlags);
+    os.precision(savedPrecision);
+  }
 }
diff --git a/generator/statistics.hpp b/generator/statistics.hpp
index 0486cf2eb8..b16f5152f1 100644
--- a/generator/statistics.hpp
+++ b/generator/statistics.hpp
@@ -33,6 +33,27 @@ namespace stats
     double m_area;
   };
 
+  // Accumulated outer geometry stats: features count, total bytes and total elements.
+  struct GeomInfo
+  {
+    GeomInfo() : m_count(0), m_size(0), m_elements(0) {}
+
+    // Calls with zero bytes are ignored, so such features are not counted.
+    void Add(uint64_t szBytes, uint32_t elements)
+    {
+      if (szBytes > 0)
+      {
+        ++m_count;
+        m_size += szBytes;
+        m_elements += elements;
+      }
+    }
+
+    uint64_t m_count, m_size, m_elements;
+  };
+
+  using GeomStats = GeomInfo[feature::DataHeader::kMaxScalesCount];
+
   template <class T, int Tag>
   struct IntegralType
   {
@@ -52,6 +73,10 @@ namespace stats
     std::map<CountType, GeneralInfo> m_byPointsCount, m_byTrgCount;
     std::map<AreaType, GeneralInfo> m_byAreaSize;
 
+    GeomStats m_byLineGeom, m_byAreaGeom,
+              m_byLineGeomCompared, m_byAreaGeomCompared,
+              m_byLineGeomDup, m_byAreaGeomDup;
+
     GeneralInfo m_inner[3];
   };
 
@@ -60,4 +85,5 @@ namespace stats
   void CalcStatistics(std::string const & fPath, MapInfo & info);
   void PrintStatistics(std::ostream & os, MapInfo & info);
   void PrintTypeStatistics(std::ostream & os, MapInfo & info);
+  void PrintOuterGeometryStatistics(std::ostream & os, MapInfo & info);
 }
diff --git a/indexer/data_header.hpp b/indexer/data_header.hpp
index dcfea3bc6b..d8606fc62e 100644
--- a/indexer/data_header.hpp
+++ b/indexer/data_header.hpp
@@ -30,8 +30,8 @@ namespace feature
       Country
     };
 
-    /// Max possible scales. @see arrays in feature_impl.hpp
-    static const size_t kMaxScalesCount = 4;
+    /// Max possible geometry scales. @see arrays in feature_impl.hpp
+    static constexpr size_t kMaxScalesCount = 4;
 
     DataHeader() = default;
     explicit DataHeader(std::string const & fileName);
diff --git a/indexer/feature.cpp b/indexer/feature.cpp
index 69cf4bf333..02a8b8c189 100644
--- a/indexer/feature.cpp
+++ b/indexer/feature.cpp
@@ -476,11 +476,13 @@ void FeatureType::ParseGeometry(int scale)
   }
 }
 
-FeatureType::GeomStat FeatureType::GetOuterGeometrySize()
+FeatureType::GeomStat FeatureType::GetOuterGeometryStats()
 {
-  uint32_t sz = 0;
-
+  ASSERT(!m_parsed.m_points, ());
   CHECK(m_loadInfo, ());
+  size_t const n = m_loadInfo->GetScalesCount();
+  ASSERT_LESS_OR_EQUAL(n, feature::DataHeader::kMaxScalesCount, ());
+  FeatureType::GeomStat res;
 
   auto const headerGeomType = static_cast<HeaderGeomType>(Header(m_data) & HEADER_MASK_GEOMTYPE);
   if (headerGeomType == HeaderGeomType::Line)
@@ -488,11 +490,12 @@ FeatureType::GeomStat FeatureType::GetOuterGeometrySize()
     size_t const count = m_points.size();
     if (count < 2)
     {
+      // Outer geometry present.
       ASSERT_EQUAL(count, 1, ());
 
       FeatureType::Points points;
-      int const n = m_loadInfo->GetScalesCount();
-      for (int ind = 0; ind < n; ++ind)
+
+      for (size_t ind = 0; ind < n; ++ind)
       {
         if (m_offsets.m_pts[ind] != kInvalidOffset)
         {
@@ -506,7 +509,8 @@ FeatureType::GeomStat FeatureType::GetOuterGeometrySize()
           cp.SetBasePoint(points[0]);
           serial::LoadOuterPath(src, cp, points);
 
-          sz += static_cast<uint32_t>(src.Pos() - m_offsets.m_pts[ind]);
+          res.m_sizes[ind] = static_cast<uint32_t>(src.Pos() - m_offsets.m_pts[ind]);
+          res.m_elements[ind] = static_cast<uint32_t>(points.size());
         }
       }
       // Retain best geometry.
@@ -516,7 +520,9 @@ FeatureType::GeomStat FeatureType::GetOuterGeometrySize()
   }
   m_parsed.m_points = true;
 
-  return GeomStat(sz, m_points.size());
+  // Points count can come from the inner geometry.
+  res.m_elements[n - 1] = static_cast<uint32_t>(m_points.size());
+  return res;
 }
 
 void FeatureType::ParseTriangles(int scale)
@@ -546,19 +552,20 @@ void FeatureType::ParseTriangles(int scale)
   }
 }
 
-FeatureType::GeomStat FeatureType::GetOuterTrianglesSize()
+FeatureType::GeomStat FeatureType::GetOuterTrianglesStats()
 {
-  uint32_t sz = 0;
-
+  ASSERT(!m_parsed.m_triangles, ());
   CHECK(m_loadInfo, ());
+  size_t const n = m_loadInfo->GetScalesCount();
+  ASSERT_LESS_OR_EQUAL(n, feature::DataHeader::kMaxScalesCount, ());
+  FeatureType::GeomStat res;
 
   auto const headerGeomType = static_cast<HeaderGeomType>(Header(m_data) & HEADER_MASK_GEOMTYPE);
   if (headerGeomType == HeaderGeomType::Area)
   {
     if (m_triangles.empty())
     {
-      int const n = m_loadInfo->GetScalesCount();
-      for (int ind = 0; ind < n; ++ind)
+      for (size_t ind = 0; ind < n; ++ind)
       {
         if (m_offsets.m_trg[ind] != kInvalidOffset)
         {
@@ -568,7 +575,8 @@ FeatureType::GeomStat FeatureType::GetOuterTrianglesSize()
           src.Skip(m_offsets.m_trg[ind]);
           serial::LoadOuterTriangles(src, m_loadInfo->GetGeometryCodingParams(ind), m_triangles);
 
-          sz += static_cast<uint32_t>(src.Pos() - m_offsets.m_trg[ind]);
+          res.m_sizes[ind] = static_cast<uint32_t>(src.Pos() - m_offsets.m_trg[ind]);
+          res.m_elements[ind] = static_cast<uint32_t>(m_triangles.size() / 3);
         }
       }
       // The best geometry is retained in m_triangles.
@@ -577,7 +585,9 @@ FeatureType::GeomStat FeatureType::GetOuterTrianglesSize()
   }
   m_parsed.m_triangles = true;
 
-  return GeomStat(sz, m_triangles.size() / 3);
+  // Triangles count can come from the inner geometry.
+  res.m_elements[n - 1] = static_cast<uint32_t>(m_triangles.size() / 3);
+  return res;
 }
 
 void FeatureType::ParseMetadata()
diff --git a/indexer/feature.hpp b/indexer/feature.hpp
index ddc0a1ea3a..cf080b998e 100644
--- a/indexer/feature.hpp
+++ b/indexer/feature.hpp
@@ -170,17 +170,15 @@ public:
 
   InnerGeomStat GetInnerStats() const { return m_innerStats; }
 
+  using GeomArr = uint32_t[feature::DataHeader::kMaxScalesCount];
   struct GeomStat
   {
-    uint32_t m_size = 0, m_count = 0;
-
-    GeomStat(uint32_t sz, size_t count) : m_size(sz), m_count(static_cast<uint32_t>(count)) {}
+    GeomArr m_sizes = {}, m_elements = {};
   };
 
-  // Returns total outer geometry/triangles size for all geo levels and
-  // number of points/triangles in the best one. Loads the best geometry.
-  GeomStat GetOuterGeometrySize();
-  GeomStat GetOuterTrianglesSize();
+  // Returns outer points/triangles stats for all geo levels and loads the best geometry.
+  GeomStat GetOuterGeometryStats();
+  GeomStat GetOuterTrianglesStats();
   //@}
 
 private: