diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 248251a6e9..bd232f4387 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -58,7 +58,7 @@ UNIT_TEST(Geocoder_Smoke) Geocoder geocoder(regionsJsonFile.GetFullPath()); osm::Id const florenciaId(0xc00000000059d6b5); - osm::Id const cubaId(c00000000004b279); + osm::Id const cubaId(0xc00000000004b279); TestGeocoder(geocoder, "florencia", {{florenciaId, 1.0}}); TestGeocoder(geocoder, "cuba florencia", {{florenciaId, 1.0}, {cubaId, 0.5}}); diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 259eed4983..c6eb0c3851 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -14,30 +14,34 @@ using namespace std; namespace geocoder { // Hierarchy::Entry -------------------------------------------------------------------------------- -bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr) +bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr, ParsingStats & stats) { try { my::Json root(jsonStr.c_str()); - DeserializeFromJSONImpl(root.get(), jsonStr); + DeserializeFromJSONImpl(root.get(), jsonStr, stats); return true; } catch (my::Json::Exception const & e) { - LOG(LWARNING, ("Can't parse entry:", e.Msg(), jsonStr)); + LOG(LDEBUG, ("Can't parse entry:", e.Msg(), jsonStr)); } return false; } // todo(@m) Factor out to geojson.hpp? Add geojson to myjansson? -void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const & jsonStr) +void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const & jsonStr, + ParsingStats & stats) { if (!json_is_object(root)) + { + ++stats.m_badJsons; MYTHROW(my::Json::Exception, ("Not a json object.")); + } json_t * const properties = my::GetJSONObligatoryField(root, "properties"); - json_t * const address = my::GetJSONObligatoryField(properties, "address"); + bool hasDuplicateAddress = false; for (size_t i = 0; i < static_cast(Type::Count); ++i) { @@ -49,51 +53,44 @@ void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const continue; if (!m_address[i].empty()) - LOG(LWARNING, ("Duplicate address field", type, "when parsing", jsonStr)); + { + LOG(LDEBUG, ("Duplicate address field", type, "when parsing", jsonStr)); + hasDuplicateAddress = true; + } search::NormalizeAndTokenizeString(levelValue, m_address[i]); if (!m_address[i].empty()) m_type = static_cast(i); } - SetName(properties, jsonStr); + m_nameTokens.clear(); + FromJSONObjectOptionalField(properties, "name", m_name); + search::NormalizeAndTokenizeString(m_name, m_nameTokens); + + if (m_name.empty()) + ++stats.m_emptyNames; + + if (hasDuplicateAddress) + ++stats.m_duplicateAddresses; if (m_type == Type::Count) { - LOG(LWARNING, ("No address in an hierarchy entry:", jsonStr)); + LOG(LDEBUG, ("No address in an hierarchy entry:", jsonStr)); + ++stats.m_emptyAddresses; } - else + else if (m_nameTokens != m_address[static_cast(m_type)]) { - if (m_nameTokens != m_address[static_cast(m_type)]) - { - LOG(LWARNING, ("Hierarchy entry name is not the most detailed field in its address. Name:", - m_name, "Name tokens:", m_nameTokens, "Address:", m_address)); - } + ++stats.m_mismatchedNames; + LOG(LDEBUG, ("Hierarchy entry name is not the most detailed field in its address:", jsonStr)); } } -void Hierarchy::Entry::SetName(json_t * const properties, string const & jsonStr) -{ - m_nameTokens.clear(); - FromJSONObjectOptionalField(properties, "name", m_name); - if (!m_name.empty()) - { - search::NormalizeAndTokenizeString(m_name, m_nameTokens); - return; - } - - LOG(LWARNING, ("Hierarchy entry has no name. Trying to set name from address.", jsonStr)); - if (m_type != Type::Count) - m_nameTokens = m_address[static_cast(m_type)]; - if (m_name.empty()) - LOG(LWARNING, ("Hierarchy entry has no name and no address:", jsonStr)); -} - // Hierarchy --------------------------------------------------------------------------------------- Hierarchy::Hierarchy(string const & pathToJsonHierarchy) { ifstream ifs(pathToJsonHierarchy); string line; + ParsingStats stats; while (getline(ifs, line)) { @@ -105,6 +102,7 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) if (i == string::npos || !strings::to_any(line.substr(0, i), encodedId)) { LOG(LWARNING, ("Cannot read osm id. Line:", line)); + ++stats.m_badOsmIds; continue; } line = line.substr(i + 1); @@ -113,13 +111,27 @@ Hierarchy::Hierarchy(string const & pathToJsonHierarchy) // todo(@m) We should really write uints as uints. entry.m_osmId = osm::Id(static_cast(encodedId)); - if (!entry.DeserializeFromJSON(line)) + if (!entry.DeserializeFromJSON(line, stats)) continue; - m_entries[entry.m_nameTokens].emplace_back(entry); + // The entry is indexed only by its address. + // todo(@m) Index it by name too. + if (entry.m_type != Type::Count) + { + size_t const t = static_cast(entry.m_type); + m_entries[entry.m_address[t]].emplace_back(entry); + } } - LOG(LINFO, ("Finished reading the hierarchy")); + LOG(LINFO, ("Finished reading the hierarchy. Stats:")); + LOG(LINFO, ("Corrupted json lines:", stats.m_badJsons)); + LOG(LINFO, ("Unreadable osm::Ids:", stats.m_badOsmIds)); + LOG(LINFO, ("Entries with duplicate address parts:", stats.m_duplicateAddresses)); + LOG(LINFO, ("Entries without address:", stats.m_emptyAddresses)); + LOG(LINFO, ("Entries without names:", stats.m_emptyNames)); + LOG(LINFO, + ("Entries whose names do not match their most specific addresses:", stats.m_mismatchedNames)); + LOG(LINFO, ("(End of stats.)")); } vector const * const Hierarchy::GetEntries( diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index e567a166ba..f9a9358ee2 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -19,18 +20,39 @@ namespace geocoder class Hierarchy { public: + struct ParsingStats + { + // Number of corrupted json lines. + uint64_t m_badJsons = 0; + + // Number of entries with unreadable osm::Ids. + uint64_t m_badOsmIds = 0; + + // Number of entries with duplicate subfields in the address field. + uint64_t m_duplicateAddresses = 0; + + // Number of entries whose address field either does + // not exist or consists of empty lines. + uint64_t m_emptyAddresses = 0; + + // Number of entries without the name field or with an empty one. + uint64_t m_emptyNames = 0; + + // Number of entries whose names do not match the most + // specific parts of their addresses. + // This is expected from POIs but not from regions or streets. + uint64_t m_mismatchedNames = 0; + }; + // A single entry in the hierarchy directed acyclic graph. // Currently, this is more or less the "properties"-"address" // part of the geojson entry. struct Entry { - bool DeserializeFromJSON(std::string const & jsonStr); + bool DeserializeFromJSON(std::string const & jsonStr, ParsingStats & stats); - void DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr); - - // Tries to set |m_name| and |m_nameTokens| from - // the "name" and "address" fields in the json description. - void SetName(json_t * const properties, std::string const & jsonStr); + void DeserializeFromJSONImpl(json_t * const root, std::string const & jsonStr, + ParsingStats & stats); osm::Id m_osmId = osm::Id(osm::Id::kInvalid);