From aefeeee2343320aead7b91c008c28b97d6fb8124 Mon Sep 17 00:00:00 2001 From: Anatoly Serdtcev Date: Mon, 8 Jul 2019 13:29:22 +0300 Subject: [PATCH] [geocoder] Moving to new jsonl version: hex id, locales --- 3party/jansson/myjansson.cpp | 10 ++ 3party/jansson/myjansson.hpp | 13 ++- generator/region_meta.cpp | 2 +- geocoder/geocoder_tests/geocoder_tests.cpp | 120 +++++++++++---------- geocoder/hierarchy.cpp | 14 +-- geocoder/hierarchy_reader.cpp | 12 ++- geocoder/hierarchy_reader.hpp | 1 + 7 files changed, 97 insertions(+), 75 deletions(-) diff --git a/3party/jansson/myjansson.cpp b/3party/jansson/myjansson.cpp index bfa98b7e29..0b70b89ce7 100644 --- a/3party/jansson/myjansson.cpp +++ b/3party/jansson/myjansson.cpp @@ -27,6 +27,11 @@ json_t * GetJSONObligatoryField(json_t * root, char const * field) return const_cast(GetJSONObligatoryField(const_cast(root), field)); } +json_t const * GetJSONObligatoryField(json_t const * root, std::string const & field) +{ + return GetJSONObligatoryField(root, field.c_str()); +} + json_t const * GetJSONObligatoryField(json_t const * root, char const * field) { auto * value = base::GetJSONOptionalField(root, field); @@ -45,6 +50,11 @@ json_t * GetJSONOptionalField(json_t * root, char const * field) return const_cast(GetJSONOptionalField(const_cast(root), field)); } +json_t const * GetJSONOptionalField(json_t const * root, std::string const & field) +{ + return GetJSONOptionalField(root, field.c_str()); +} + json_t const * GetJSONOptionalField(json_t const * root, char const * field) { if (!json_is_object(root)) diff --git a/3party/jansson/myjansson.hpp b/3party/jansson/myjansson.hpp index 931fd34430..abbf8d5825 100644 --- a/3party/jansson/myjansson.hpp +++ b/3party/jansson/myjansson.hpp @@ -71,9 +71,11 @@ JSONPtr LoadFromString(std::string const & str); std::string DumpToString(JSONPtr const & json, size_t flags = 0); json_t * GetJSONObligatoryField(json_t * root, std::string const & field); +json_t const * GetJSONObligatoryField(json_t const * root, std::string const & field); json_t * GetJSONObligatoryField(json_t * root, char const * field); json_t const * GetJSONObligatoryField(json_t const * root, char const * field); json_t * GetJSONOptionalField(json_t * root, std::string const & field); +json_t const * GetJSONOptionalField(json_t const * root, std::string const & field); json_t * GetJSONOptionalField(json_t * root, char const * field); json_t const * GetJSONOptionalField(json_t const * root, char const * field); @@ -103,6 +105,7 @@ T FromJSON(json_t const * root) } inline void FromJSON(json_t * root, json_t *& value) { value = root; } +inline void FromJSON(json_t const * root, json_t const *& value) { value = root; } void FromJSON(json_t const * root, double & result); void FromJSON(json_t const * root, bool & result); @@ -158,7 +161,7 @@ boost::optional FromJSONObjectOptional(json_t const * root, char const * fiel } template -void FromJSONObjectOptionalField(json_t * root, std::string const & field, T & result) +void FromJSONObjectOptionalField(json_t const * root, std::string const & field, T & result) { auto * json = base::GetJSONOptionalField(root, field); if (!json) @@ -287,7 +290,13 @@ void ToJSONObject(json_t & root, std::string const & field, std::vector const template void FromJSONObjectOptionalField(json_t * root, std::string const & field, std::vector & result) { - json_t * arr = base::GetJSONOptionalField(root, field); + FromJSONObjectOptionalField(const_cast(root), field, result); +} + +template +void FromJSONObjectOptionalField(json_t const * root, std::string const & field, std::vector & result) +{ + json_t const * arr = base::GetJSONOptionalField(root, field); if (!arr) { result.clear(); diff --git a/generator/region_meta.cpp b/generator/region_meta.cpp index 60d0de780e..3dd85f327a 100644 --- a/generator/region_meta.cpp +++ b/generator/region_meta.cpp @@ -33,7 +33,7 @@ bool ReadRegionDataImpl(std::string const & countryName, RegionData & data) reader->ReadAsString(buffer); base::Json root(buffer.data()); - json_t * jsonData = nullptr; + json_t const * jsonData = nullptr; FromJSONObjectOptionalField(root.get(), countryName, jsonData); if (!jsonData) return false; diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index af89f5a79b..85f1d4efc8 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -12,6 +12,7 @@ #include "base/stl_helpers.hpp" #include +#include #include #include @@ -24,9 +25,9 @@ using Id = base::GeoObjectId; double const kCertaintyEps = 1e-6; string const kRegionsData = R"#( --4611686018427080071 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"name": "Cuba", "rank": 2, "address": {"country": "Cuba"}}} --4611686018425533273 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"name": "Ciego de Ávila", "rank": 4, "address": {"region": "Ciego de Ávila", "country": "Cuba"}}} --4611686018421500235 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"name": "Florencia", "rank": 6, "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}} +C00000000004B279 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-80.1142033187951, 21.55511095]}, "properties": {"locales": {"default": {"name": "Cuba", "address": {"country": "Cuba"}}}, "rank": 2}} +C0000000001C4CA7 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.7260117405499, 21.74300205]}, "properties": {"locales": {"default": {"name": "Ciego de Ávila", "address": {"region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 4}} +C00000000059D6B5 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"locales": {"default": {"name": "Florencia", "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}}, "rank": 6}} )#"; geocoder::Tokens Split(string const & s) @@ -88,27 +89,27 @@ UNIT_TEST(Geocoder_Hierarchy) UNIT_TEST(Geocoder_OnlyBuildings) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Some Locality"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Some Locality"}}}}} -21 {"properties": {"address": {"street": "Good", "locality": "Some Locality"}}} -22 {"properties": {"address": {"building": "5", "street": "Good", "locality": "Some Locality"}}} +21 {"properties": {"locales": {"default": {"address": {"street": "Good", "locality": "Some Locality"}}}}} +22 {"properties": {"locales": {"default": {"address": {"building": "5", "street": "Good", "locality": "Some Locality"}}}}} -31 {"properties": {"address": {"street": "Bad", "locality": "Some Locality"}}} -32 {"properties": {"address": {"building": "10", "street": "Bad", "locality": "Some Locality"}}} +31 {"properties": {"locales": {"default": {"address": {"street": "Bad", "locality": "Some Locality"}}}}} +32 {"properties": {"locales": {"default": {"address": {"building": "10", "street": "Bad", "locality": "Some Locality"}}}}} -40 {"properties": {"address": {"street": "MaybeNumbered", "locality": "Some Locality"}}} -41 {"properties": {"address": {"street": "MaybeNumbered-3", "locality": "Some Locality"}}} -42 {"properties": {"address": {"building": "3", "street": "MaybeNumbered", "locality": "Some Locality"}}} +40 {"properties": {"locales": {"default": {"address": {"street": "MaybeNumbered", "locality": "Some Locality"}}}}} +41 {"properties": {"locales": {"default": {"address": {"street": "MaybeNumbered-3", "locality": "Some Locality"}}}}} +42 {"properties": {"locales": {"default": {"address": {"building": "3", "street": "MaybeNumbered", "locality": "Some Locality"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - base::GeoObjectId const localityId(10); - base::GeoObjectId const goodStreetId(21); - base::GeoObjectId const badStreetId(31); - base::GeoObjectId const building5(22); - base::GeoObjectId const building10(32); + base::GeoObjectId const localityId(0x10); + base::GeoObjectId const goodStreetId(0x21); + base::GeoObjectId const badStreetId(0x31); + base::GeoObjectId const building5(0x22); + base::GeoObjectId const building10(0x32); TestGeocoder(geocoder, "some locality", {{localityId, 1.0}}); TestGeocoder(geocoder, "some locality good", {{goodStreetId, 1.0}, {localityId, 0.857143}}); @@ -124,8 +125,8 @@ UNIT_TEST(Geocoder_OnlyBuildings) // Sometimes we may still emit a non-building. // In this case it happens because all query tokens are used. - base::GeoObjectId const numberedStreet(41); - base::GeoObjectId const houseOnANonNumberedStreet(42); + base::GeoObjectId const numberedStreet(0x41); + base::GeoObjectId const houseOnANonNumberedStreet(0x42); TestGeocoder(geocoder, "some locality maybenumbered 3", {{numberedStreet, 1.0}, {houseOnANonNumberedStreet, 0.8875}}); } @@ -133,20 +134,20 @@ UNIT_TEST(Geocoder_OnlyBuildings) UNIT_TEST(Geocoder_MismatchedLocality) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Moscow"}}} -11 {"properties": {"address": {"locality": "Paris"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Moscow"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Paris"}}}}} -21 {"properties": {"address": {"street": "Krymskaya", "locality": "Moscow"}}} -22 {"properties": {"address": {"building": "2", "street": "Krymskaya", "locality": "Moscow"}}} +21 {"properties": {"locales": {"default": {"address": {"street": "Krymskaya", "locality": "Moscow"}}}}} +22 {"properties": {"locales": {"default": {"address": {"building": "2", "street": "Krymskaya", "locality": "Moscow"}}}}} -31 {"properties": {"address": {"street": "Krymskaya", "locality": "Paris"}}} -32 {"properties": {"address": {"building": "3", "street": "Krymskaya", "locality": "Paris"}}} +31 {"properties": {"locales": {"default": {"address": {"street": "Krymskaya", "locality": "Paris"}}}}} +32 {"properties": {"locales": {"default": {"address": {"building": "3", "street": "Krymskaya", "locality": "Paris"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - base::GeoObjectId const building2(22); + base::GeoObjectId const building2(0x22); TestGeocoder(geocoder, "Moscow Krymskaya 2", {{building2, 1.0}}); @@ -158,119 +159,119 @@ UNIT_TEST(Geocoder_MismatchedLocality) UNIT_TEST(Geocoder_StreetWithNumberInCity) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}}} -20 {"properties": {"address": {"locality": "Краснокамск"}}} -28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}} +20 {"properties": {"locales": {"default": {"address": {"locality": "Краснокамск"}}}}} +28 {"properties": {"locales": {"default": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "Москва, улица 1905 года", {{Id{11}, 1.0}}); + TestGeocoder(geocoder, "Москва, улица 1905 года", {{Id{0x11}, 1.0}}); } UNIT_TEST(Geocoder_StreetWithNumberInClassifiedCity) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "город Москва, улица 1905 года", {{Id{11}, 1.0}}); + TestGeocoder(geocoder, "город Москва, улица 1905 года", {{Id{0x11}, 1.0}}); } UNIT_TEST(Geocoder_StreetWithNumberInAnyCity) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}}} -20 {"properties": {"address": {"locality": "Краснокамск"}}} -28 {"properties": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}} +20 {"properties": {"locales": {"default": {"address": {"locality": "Краснокамск"}}}}} +28 {"properties": {"locales": {"default": {"address": {"locality": "Краснокамск", "street": "улица 1905 года"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "улица 1905 года", {{Id{11}, 1.0}, {Id{28}, 1.0}}); + TestGeocoder(geocoder, "улица 1905 года", {{Id{0x11}, 1.0}, {Id{0x28}, 1.0}}); } UNIT_TEST(Geocoder_StreetWithNumberAndWithoutStreetSynonym) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -11 {"properties": {"address": {"locality": "Москва", "street": "улица 1905 года"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +11 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 1905 года"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "Москва, 1905 года", {{Id{11}, 1.0}}); + TestGeocoder(geocoder, "Москва, 1905 года", {{Id{0x11}, 1.0}}); } UNIT_TEST(Geocoder_UntypedStreetWithNumberAndStreetSynonym) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -13 {"properties": {"address": {"locality": "Москва", "street": "8 Марта"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +13 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "8 Марта"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{13}, 1.0}}); + TestGeocoder(geocoder, "Москва, улица 8 Марта", {{Id{0x13}, 1.0}}); } UNIT_TEST(Geocoder_StreetWithTwoNumbers) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -12 {"properties": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +12 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "4-я улица 8 Марта"}}}}} -13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}} +13 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{12}, 1.0}}); + TestGeocoder(geocoder, "Москва, 4-я улица 8 Марта", {{Id{0x12}, 1.0}}); } UNIT_TEST(Geocoder_BuildingOnStreetWithNumber) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Москва"}}} -13 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}} -15 {"properties": {"address": {"locality": "Москва", "street": "улица 8 Марта", "building": "4"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Москва"}}}}} +13 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 8 Марта"}}}}} +15 {"properties": {"locales": {"default": {"address": {"locality": "Москва", "street": "улица 8 Марта", "building": "4"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - TestGeocoder(geocoder, "Москва, улица 8 Марта, 4", {{Id{15}, 1.0}}); + TestGeocoder(geocoder, "Москва, улица 8 Марта, 4", {{Id{0x15}, 1.0}}); } //-------------------------------------------------------------------------------------------------- UNIT_TEST(Geocoder_LocalityBuilding) { string const kData = R"#( -10 {"properties": {"address": {"locality": "Zelenograd"}}} +10 {"properties": {"locales": {"default": {"address": {"locality": "Zelenograd"}}}}} -22 {"properties": {"address": {"building": "2", "locality": "Zelenograd"}}} +22 {"properties": {"locales": {"default": {"address": {"building": "2", "locality": "Zelenograd"}}}}} -31 {"properties": {"address": {"street": "Krymskaya", "locality": "Zelenograd"}}} -32 {"properties": {"address": {"building": "2", "street": "Krymskaya", "locality": "Zelenograd"}}} +31 {"properties": {"locales": {"default": {"address": {"street": "Krymskaya", "locality": "Zelenograd"}}}}} +32 {"properties": {"locales": {"default": {"address": {"building": "2", "street": "Krymskaya", "locality": "Zelenograd"}}}}} )#"; ScopedFile const regionsJsonFile("regions.jsonl", kData); Geocoder geocoder(regionsJsonFile.GetFullPath()); - base::GeoObjectId const building2(22); + base::GeoObjectId const building2(0x22); TestGeocoder(geocoder, "Zelenograd 2", {{building2, 1.0}}); } @@ -290,11 +291,12 @@ UNIT_TEST(Geocoder_BigFileConcurrentRead) stringstream s; for (int i = 0; i < kEntryCount; ++i) { - s << i << " " + s << setw(16) << setfill('0') << hex << uppercase << i << " " << "{" << R"("type": "Feature",)" << R"("geometry": {"type": "Point", "coordinates": [0, 0]},)" - << R"("properties": {"name": ")" << i << R"(", "rank": 2, "address": {"country": ")" << i << R"("}})" + << R"("properties": {"locales": {"default": {)" + << R"("name": ")" << i << R"(", "address": {"country": ")" << i << R"("}}}, "rank": 2})" << "}\n"; } diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index abccd07598..a6e99f5511 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -40,19 +40,15 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const MYTHROW(base::Json::Exception, ("Not a json object.")); } - json_t * properties = nullptr; - FromJSONObject(root, "properties", properties); - json_t * address = nullptr; - FromJSONObject(properties, "address", address); - + auto const defaultLocale = base::GetJSONObligatoryFieldByPath(root, "properties", "locales", + "default"); + auto const address = base::GetJSONObligatoryField(defaultLocale, "address"); bool hasDuplicateAddress = false; - for (size_t i = 0; i < static_cast(Type::Count); ++i) { Type const type = static_cast(i); string const & levelKey = ToString(type); - json_t * levelJson = nullptr; - FromJSONObjectOptionalField(address, levelKey, levelJson); + auto const levelJson = base::GetJSONOptionalField(address, levelKey); if (!levelJson) continue; @@ -90,7 +86,7 @@ bool Hierarchy::Entry::DeserializeFromJSONImpl(json_t * const root, string const } m_nameTokens.clear(); - FromJSONObjectOptionalField(properties, "name", m_name); + FromJSONObjectOptionalField(defaultLocale, "name", m_name); search::NormalizeAndTokenizeAsUtf8(m_name, m_nameTokens); if (m_name.empty()) diff --git a/geocoder/hierarchy_reader.cpp b/geocoder/hierarchy_reader.cpp index dbde6138bb..874e9182d9 100644 --- a/geocoder/hierarchy_reader.cpp +++ b/geocoder/hierarchy_reader.cpp @@ -195,8 +195,8 @@ void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, si continue; auto const p = line.find(' '); - int64_t encodedId; - if (p == string::npos || !strings::to_any(line.substr(0, p), encodedId)) + uint64_t encodedId; + if (p == string::npos || !DeserializeId(line.substr(0, p), encodedId)) { LOG(LWARNING, ("Cannot read osm id. Line:", line)); ++stats.m_badOsmIds; @@ -205,8 +205,7 @@ void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, si auto json = line.substr(p + 1); Entry entry; - // TODO: (@m) We should really write uints as uints. - auto const osmId = base::GeoObjectId(static_cast(encodedId)); + auto const osmId = base::GeoObjectId(encodedId); entry.m_osmId = osmId; if (!entry.DeserializeFromJSON(json, stats)) @@ -224,4 +223,9 @@ void HierarchyReader::DeserializeEntryMap(vector const & linesBuffer, si entries.emplace(osmId, move(entry)); } } + +bool HierarchyReader::DeserializeId(std::string const & str, uint64_t & id) +{ + return strings::to_uint64(str, id, 16 /* base */); +} } // namespace geocoder diff --git a/geocoder/hierarchy_reader.hpp b/geocoder/hierarchy_reader.hpp index 19701659f0..c91b42bfba 100644 --- a/geocoder/hierarchy_reader.hpp +++ b/geocoder/hierarchy_reader.hpp @@ -34,6 +34,7 @@ private: void DeserializeEntryMap(std::vector const & linesBuffer, std::size_t const bufferSize, std::multimap & entries, ParsingStats & stats); + bool DeserializeId(std::string const & str, uint64_t & id); std::vector MergeEntries(std::vector> & entryParts); void CheckDuplicateOsmIds(std::vector const & entries, ParsingStats & stats);