From 1df068457991f32bf62a05d31adc74eabe276f0c Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Wed, 18 Jul 2018 23:40:19 +0300 Subject: [PATCH] [geocoder] Reading the hierarchy from json. --- 3party/jansson/myjansson.hpp | 5 +- 3party/jansson/src/jansson.h | 2 +- geocoder/geocoder.cpp | 10 +- geocoder/geocoder.hpp | 6 +- geocoder/geocoder_tests/CMakeLists.txt | 3 + geocoder/geocoder_tests/geocoder_tests.cpp | 36 +++++- geocoder/hierarchy.cpp | 126 ++++++++++++++++++++- geocoder/hierarchy.hpp | 60 ++++++++++ 8 files changed, 241 insertions(+), 7 deletions(-) diff --git a/3party/jansson/myjansson.hpp b/3party/jansson/myjansson.hpp index cfeb2b35e7..24f7b9a8d4 100644 --- a/3party/jansson/myjansson.hpp +++ b/3party/jansson/myjansson.hpp @@ -31,8 +31,6 @@ inline JSONPtr NewJSONNull() { return JSONPtr(json_null()); } class Json { - JsonHandle m_handle; - public: DECLARE_EXCEPTION(Exception, RootException); @@ -51,6 +49,9 @@ public: json_t * get() const { return m_handle.get(); } json_t * get_deep_copy() const { return json_deep_copy(get()); } + +private: + JsonHandle m_handle; }; json_t * GetJSONObligatoryField(json_t * root, std::string const & field); diff --git a/3party/jansson/src/jansson.h b/3party/jansson/src/jansson.h index 7ad6f0e778..c731028eb8 100644 --- a/3party/jansson/src/jansson.h +++ b/3party/jansson/src/jansson.h @@ -12,7 +12,7 @@ #include /* for size_t */ #include -#include +#include "jansson_config.h" #ifdef __cplusplus extern "C" { diff --git a/geocoder/geocoder.cpp b/geocoder/geocoder.cpp index 4fdb8483ec..02ee0265b5 100644 --- a/geocoder/geocoder.cpp +++ b/geocoder/geocoder.cpp @@ -1,14 +1,20 @@ #include "geocoder/geocoder.hpp" +#include "indexer/search_string_utils.hpp" + +#include "base/assert.hpp" #include "base/osm_id.hpp" +#include +#include + using namespace std; namespace geocoder { Geocoder::Geocoder(string pathToJsonHierarchy) : m_hierarchy(pathToJsonHierarchy) {} -void Geocoder::ProcessQuery(string const & query, vector & results) const 
+void Geocoder::ProcessQuery(string const & query, vector & results) { // Only here for demonstration purposes and will be removed shortly. results.clear(); @@ -23,4 +29,6 @@ void Geocoder::ProcessQuery(string const & query, vector & results) cons results.emplace_back(osm::Id(0x40000000F26943B9ULL), 0.1 /* certainty */); } } + +Hierarchy const & Geocoder::GetHierarchy() const { return m_hierarchy; } } // namespace geocoder diff --git a/geocoder/geocoder.hpp b/geocoder/geocoder.hpp index eb1614900f..3614d67fc8 100644 --- a/geocoder/geocoder.hpp +++ b/geocoder/geocoder.hpp @@ -3,6 +3,8 @@ #include "geocoder/hierarchy.hpp" #include "geocoder/result.hpp" +#include "base/string_utils.hpp" + #include #include @@ -30,7 +32,9 @@ class Geocoder public: explicit Geocoder(std::string pathToJsonHierarchy); - void ProcessQuery(std::string const & query, std::vector & results) const; + void ProcessQuery(std::string const & query, std::vector & results); + + Hierarchy const & GetHierarchy() const; private: Hierarchy m_hierarchy; diff --git a/geocoder/geocoder_tests/CMakeLists.txt b/geocoder/geocoder_tests/CMakeLists.txt index 7091fd3de6..cf9478c276 100644 --- a/geocoder/geocoder_tests/CMakeLists.txt +++ b/geocoder/geocoder_tests/CMakeLists.txt @@ -9,11 +9,14 @@ omim_add_test(${PROJECT_NAME} ${SRC}) omim_link_libraries( ${PROJECT_NAME} + platform_tests_support geocoder + indexer platform coding base stats_client + jansson ${LIBZ} ) diff --git a/geocoder/geocoder_tests/geocoder_tests.cpp b/geocoder/geocoder_tests/geocoder_tests.cpp index 934b18ae38..e2913e0983 100644 --- a/geocoder/geocoder_tests/geocoder_tests.cpp +++ b/geocoder/geocoder_tests/geocoder_tests.cpp @@ -2,21 +2,37 @@ #include "geocoder/geocoder.hpp" +#include "indexer/search_string_utils.hpp" + +#include "platform/platform_tests_support/scoped_file.hpp" + #include "base/math.hpp" #include #include +using namespace platform::tests_support; using namespace std; namespace { double const kCertaintyEps = 1e-6; + +string 
const kRegionsData = R"#( +-4611686018421500235 {"type": "Feature", "geometry": {"type": "Point", "coordinates": [-78.9263054493181, 22.08185765]}, "properties": {"name": "Florencia", "rank": 6, "address": {"subregion": "Florencia", "region": "Ciego de Ávila", "country": "Cuba"}}} +)#"; + +geocoder::Tokens Split(string const & s) +{ + geocoder::Tokens result; + search::NormalizeAndTokenizeString(s, result); + return result; +} } // namespace namespace geocoder { -void TestGeocoder(Geocoder const & geocoder, string const & query, vector const & expected) +void TestGeocoder(Geocoder & geocoder, string const & query, vector const & expected) { vector actual; geocoder.ProcessQuery(query, actual); @@ -37,4 +53,22 @@ UNIT_TEST(Geocoder_Smoke) TestGeocoder(geocoder, "b", {{osm::Id(0x8000000014527125ULL), 0.8}, {osm::Id(0x40000000F26943B9ULL), 0.1}}); } + +UNIT_TEST(Geocoder_Hierarchy) +{ + ScopedFile const regionsJsonFile("regions.jsonl", kRegionsData); + Geocoder geocoder(regionsJsonFile.GetFullPath()); + + vector> entries; + geocoder.GetHierarchy().GetEntries({strings::MakeUniString("florencia")}, entries); + + TEST_EQUAL(entries.size(), 1, ()); + TEST(entries[0] != nullptr, ()); + TEST_EQUAL(entries[0]->m_address[static_cast(Hierarchy::EntryType::Country)], + Split("cuba"), ()); + TEST_EQUAL(entries[0]->m_address[static_cast(Hierarchy::EntryType::Region)], + Split("ciego de avila"), ()); + TEST_EQUAL(entries[0]->m_address[static_cast(Hierarchy::EntryType::Subregion)], + Split("florencia"), ()); +} } // namespace geocoder diff --git a/geocoder/hierarchy.cpp b/geocoder/hierarchy.cpp index 8138cfa090..63b812eb5b 100644 --- a/geocoder/hierarchy.cpp +++ b/geocoder/hierarchy.cpp @@ -1,13 +1,137 @@ #include "geocoder/hierarchy.hpp" +#include "indexer/search_string_utils.hpp" + +#include "base/assert.hpp" +#include "base/exception.hpp" +#include "base/logging.hpp" #include "base/macros.hpp" +#include +#include + using namespace std; +namespace +{ +using EntryType = 
geocoder::Hierarchy::EntryType; + +map const kKnownLevels = { + {"country", EntryType::Country}, + {"region", EntryType::Region}, + {"subregion", EntryType::Subregion}, + {"locality", EntryType::Locality}, + {"sublocality", EntryType::Sublocality}, + {"suburb", EntryType::Suburb}, + {"building", EntryType::Building}, +}; +} // namespace + namespace geocoder { +// Hierarchy::Entry -------------------------------------------------------------------------------- +bool Hierarchy::Entry::DeserializeFromJSON(string const & jsonStr) +{ + try + { + my::Json root(jsonStr.c_str()); + DeserializeFromJSONImpl(root.get()); + return true; + } + catch (my::Json::Exception const & e) + { + LOG(LWARNING, ("Can't parse entry:", e.Msg(), jsonStr)); + } + return false; +} + +// todo(@m) Factor out to geojson.hpp? Add geojson to myjansson? +void Hierarchy::Entry::DeserializeFromJSONImpl(json_t * root) +{ + if (!json_is_object(root)) + MYTHROW(my::Json::Exception, ("Not a json object.")); + + json_t * const properties = my::GetJSONObligatoryField(root, "properties"); + + FromJSONObject(properties, "name", m_name); + m_nameTokens.clear(); + search::NormalizeAndTokenizeString(m_name, m_nameTokens); + + json_t * const address = my::GetJSONObligatoryField(properties, "address"); + + for (auto const & e : kKnownLevels) + { + string const & levelKey = e.first; + string levelValue; + FromJSONObjectOptionalField(address, levelKey, levelValue); + if (levelValue.empty()) + continue; + + EntryType const type = e.second; + CHECK(m_address[static_cast(type)].empty(), ()); + search::NormalizeAndTokenizeString(levelValue, m_address[static_cast(type)]); + } + + for (size_t i = 0; i < static_cast(Hierarchy::EntryType::Count); ++i) + { + if (!m_address[i].empty()) + m_type = static_cast(i); + } +} + +// Hierarchy --------------------------------------------------------------------------------------- Hierarchy::Hierarchy(string const & pathToJsonHierarchy) { - UNUSED_VALUE(pathToJsonHierarchy); + fstream 
fs(pathToJsonHierarchy); + string line; + + while (getline(fs, line)) + { + if (line.empty()) + continue; + + auto i = line.find(' '); + CHECK(i != string::npos, ()); + int64_t encodedId; + CHECK(strings::to_any(line.substr(0, i), encodedId), ()); + line = line.substr(i + 1); + + Entry entry; + // todo(@m) We should really write uints as uints. + entry.m_osmId = osm::Id(static_cast(encodedId)); + + CHECK(entry.DeserializeFromJSON(line), (line)); + m_entries[entry.m_nameTokens].emplace_back(entry); + } +} + +void Hierarchy::GetEntries(vector const & tokens, + vector> & entries) const +{ + entries.clear(); + + auto it = m_entries.find(tokens); + if (it == m_entries.end()) + return; + + for (auto const & entry : it->second) + entries.emplace_back(make_shared(entry)); +} + +// Functions --------------------------------------------------------------------------------------- +string DebugPrint(Hierarchy::EntryType const & type) +{ + switch (type) + { + case Hierarchy::EntryType::Country: return "country"; break; + case Hierarchy::EntryType::Region: return "region"; break; + case Hierarchy::EntryType::Subregion: return "subregion"; break; + case Hierarchy::EntryType::Locality: return "locality"; break; + case Hierarchy::EntryType::Sublocality: return "sublocality"; break; + case Hierarchy::EntryType::Suburb: return "suburb"; break; + case Hierarchy::EntryType::Building: return "building"; break; + case Hierarchy::EntryType::Count: return "count"; break; + } + CHECK_SWITCH(); } } // namespace geocoder diff --git a/geocoder/hierarchy.hpp b/geocoder/hierarchy.hpp index 836998bead..0454f8e5b4 100644 --- a/geocoder/hierarchy.hpp +++ b/geocoder/hierarchy.hpp @@ -1,12 +1,72 @@ #pragma once +#include "base/osm_id.hpp" +#include "base/string_utils.hpp" + +#include +#include +#include #include +#include +#include + +#include "3party/jansson/myjansson.hpp" namespace geocoder { +using Tokens = std::vector; + class Hierarchy { public: + enum class EntryType + { + // It is important 
that the types are ordered from + // the more general to the more specific. + Country, + Region, + Subregion, + Locality, + Sublocality, + Suburb, + Building, + + Count + }; + + // A single entry in the hierarchy directed acyclic graph. + // Currently, this is more or less the "properties"-"address" + // part of the geojson entry. + struct Entry + { + bool DeserializeFromJSON(std::string const & jsonStr); + + void DeserializeFromJSONImpl(json_t * root); + + osm::Id m_osmId = osm::Id(osm::Id::kInvalid); + std::string m_name; + std::vector m_nameTokens; + + EntryType m_type = EntryType::Count; + + // The address fields of this entry, one per EntryType. + std::array(EntryType::Count) + 1> m_address; + }; + explicit Hierarchy(std::string const & pathToJsonHierarchy); + + // Fills |entries| with pointers to entries whose names exactly match |tokens| (the order + // matters). + // + // todo This method (and the whole class, in fact) is in the + // prototype stage and may be too slow. Proper indexing should + // be implemented to perform this type of query. + void GetEntries(std::vector const & tokens, + std::vector> & entries) const; + +private: + std::map> m_entries; }; + +std::string DebugPrint(Hierarchy::EntryType const & type); } // namespace geocoder