From d0aa2631ddd05fe04b0dd208d85fdac065f4c5de Mon Sep 17 00:00:00 2001 From: tatiana-yan Date: Tue, 24 Sep 2019 11:27:53 +0300 Subject: [PATCH] [search] Optimize postcodes index. --- generator/search_index_builder.cpp | 45 ++++++------- geometry/mercator.cpp | 16 +++++ geometry/mercator.hpp | 3 + search/postcode_points.cpp | 26 +++++++- search/postcode_points.hpp | 5 +- search/search_index_values.hpp | 64 +++++++++++++++++++ .../postcode_points_tests.cpp | 58 +++++++++++------ 7 files changed, 171 insertions(+), 46 deletions(-) diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index 570458bb3e..c6758c3611 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -247,12 +247,20 @@ void GetUKPostcodes(string const & filename, storage::CountryId const & countryI storage::CountryInfoGetter & infoGetter, vector & valueMapping, vector> & keyValuePairs) { - // ,,,,,,<2+6 NGR>,, - size_t constexpr kOutwardIndex = 0; - size_t constexpr kInwardIndex = 1; - size_t constexpr kLatIndex = 4; - size_t constexpr kLonIndex = 5; - size_t constexpr kDatasetCount = 9; + // 0 Postcode + // 1 Positional_quality_indicator + // 2 Eastings + // 3 Northings + // 4 Country_code + // 5 NHS_regional_HA_code + // 6 NHS_HA_code + // 7 Admin_county_code + // 8 Admin_district_code + // 9 Admin_ward_code + size_t constexpr kPostcodeIndex = 0; + size_t constexpr kEastingIndex = 2; + size_t constexpr kNorthingIndex = 3; + size_t constexpr kDatasetCount = 10; ifstream data; data.exceptions(fstream::failbit | fstream::badbit); @@ -272,29 +280,22 @@ void GetUKPostcodes(string const & filename, storage::CountryId const & countryI // Some lines have comma in "source". It leads to fields number greater than kDatasetCount. CHECK_GREATER_OR_EQUAL(fields.size(), kDatasetCount, (line)); - // Skip outward-only postcodes, build outward from inwards. - if (fields[kInwardIndex].empty()) - continue; + uint64_t lonMeters; + CHECK(strings::to_uint64(fields[kEastingIndex], lonMeters), ()); - double lon; - CHECK(strings::to_double(fields[kLonIndex], lon), ()); - auto const x = MercatorBounds::LonToX(lon); + uint64_t latMeters; + CHECK(strings::to_uint64(fields[kNorthingIndex], latMeters), ()); - double lat; - CHECK(strings::to_double(fields[kLatIndex], lat), ()); - auto const y = MercatorBounds::LatToY(lat); + auto const p = MercatorBounds::UKCoordsToXY(lonMeters, latMeters); vector countries; - infoGetter.GetRegionsCountryId(m2::PointD(x, y), countries); + infoGetter.GetRegionsCountryId(p, countries); if (find(countries.begin(), countries.end(), countryId) == countries.end()) continue; CHECK_EQUAL(valueMapping.size(), index, ()); - valueMapping.emplace_back(x, y); - keyValuePairs.emplace_back( - search::NormalizeAndSimplifyString(fields[kOutwardIndex] + " " + fields[kInwardIndex]), - Value(index)); - keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(fields[kOutwardIndex]), + valueMapping.push_back(p); + keyValuePairs.emplace_back(search::NormalizeAndSimplifyString(fields[kPostcodeIndex]), Value(index)); ++index; } @@ -711,7 +712,7 @@ bool BuildPostcodesImpl(FilesContainerR & container, storage::CountryId const & { FileWriter tmpWriter(tmpName); SingleValueSerializer serializer; - trie::Build, SingleValueSerializer>( + trie::Build>( tmpWriter, serializer, ukPostcodesKeyValuePairs); } diff --git a/geometry/mercator.cpp b/geometry/mercator.cpp index f1a9e46258..5900aa98e9 100644 --- a/geometry/mercator.cpp +++ b/geometry/mercator.cpp @@ -56,3 +56,19 @@ double MercatorBounds::AreaOnEarth(m2::RectD const & rect) return MercatorBounds::AreaOnEarth(rect.LeftTop(), rect.LeftBottom(), rect.RightBottom()) + MercatorBounds::AreaOnEarth(rect.LeftTop(), rect.RightTop(), rect.RightBottom()); } + +m2::PointD MercatorBounds::UKCoordsToXY(double eastingM, double northingM) +{ + // The map projection used on Ordnance Survey Great Britain maps is known as the National Grid. + // It's UTM-like coordinate system. + // The Transverse Mercator eastings and northings axes are given a ‘false origin’ just south west + // of the Scilly Isles to ensure that all coordinates in Britain are positive. The false origin is + // 400 km west and 100 km north of the ‘true origin’ on the central meridian at 49°N 2°W (OSGB36) + // and approx. 49°N 2°0′5″ W (WGS 84). For further details see: + // https://www.ordnancesurvey.co.uk/documents/resources/guide-coordinate-systems-great-britain.pdf + // https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid + auto static kNationalGridOriginX = MercatorBounds::LonToX(-7.5571597); + auto static kNationalGridOriginY = MercatorBounds::LatToY(49.7668072); + + return GetSmPoint({kNationalGridOriginX, kNationalGridOriginY}, eastingM, northingM); +} diff --git a/geometry/mercator.hpp b/geometry/mercator.hpp index 0229d561f9..acf4f7e40a 100644 --- a/geometry/mercator.hpp +++ b/geometry/mercator.hpp @@ -120,4 +120,7 @@ struct MercatorBounds static double AreaOnEarth(m2::PointD const & p1, m2::PointD const & p2, m2::PointD const & p3); /// Calculates area on Earth in m². static double AreaOnEarth(m2::RectD const & mercatorRect); + + // Converts UK easting and northing measured from UK National Grid origin to mercator. + static m2::PointD UKCoordsToXY(double eastingM, double northingM); }; diff --git a/search/postcode_points.cpp b/search/postcode_points.cpp index 4fb6ec621c..9ae470370b 100644 --- a/search/postcode_points.cpp +++ b/search/postcode_points.cpp @@ -36,7 +36,7 @@ PostcodePoints::PostcodePoints(MwmValue const & value) m_header.Read(*reader.GetPtr()); m_trieSubReader = reader.GetPtr()->CreateSubReader(m_header.m_trieOffset, m_header.m_trieSize); - m_root = trie::ReadTrie, ValueList>( + m_root = trie::ReadTrie, SingleUint64Value>( SubReaderWrapper(m_trieSubReader.get()), SingleValueSerializer()); CHECK(m_root, ()); @@ -47,7 +47,8 @@ PostcodePoints::PostcodePoints(MwmValue const & value) CHECK(m_points, ()); } -void PostcodePoints::Get(strings::UniString const & postcode, vector & points) const +void PostcodePoints::Get(strings::UniString const & postcode, bool recursive, + vector & points) const { if (!m_root || !m_points || !m_trieSubReader || !m_pointsSubReader || postcode.empty()) return; @@ -75,8 +76,29 @@ void PostcodePoints::Get(strings::UniString const & postcode, vector indexes.push_back(base::asserted_cast(v.m_featureId)); }); + if (recursive) + { + trie::ForEachRef( + *trieIt, + [&indexes](auto const & /* s */, auto const & v) { + indexes.push_back(base::asserted_cast(v.m_featureId)); + }, + strings::UniString{}); + } + points.resize(indexes.size()); for (size_t i = 0; i < indexes.size(); ++i) CHECK(m_points->Get(indexes[i], points[i]), ()); } + +void PostcodePoints::Get(strings::UniString const & postcode, vector & points) const +{ + points.clear(); + Get(postcode, false /* recursive */, points); + if (!points.empty()) + return; + + auto static const space = strings::MakeUniString(" "); + Get(postcode + space, true /* recursive */, points); +} } // namespace search diff --git a/search/postcode_points.hpp b/search/postcode_points.hpp index 5b2dfea93e..dec1d7e017 100644 --- a/search/postcode_points.hpp +++ b/search/postcode_points.hpp @@ -57,9 +57,12 @@ public: void Get(strings::UniString const & postcode, std::vector & points) const; private: + void Get(strings::UniString const & postcode, bool recursive, + std::vector & points) const; + Header m_header; std::unique_ptr m_points; - std::unique_ptr>> m_root; + std::unique_ptr> m_root; std::unique_ptr m_trieSubReader; std::unique_ptr m_pointsSubReader; }; diff --git a/search/search_index_values.hpp b/search/search_index_values.hpp index a088e4b4a2..353c81a4e5 100644 --- a/search/search_index_values.hpp +++ b/search/search_index_values.hpp @@ -172,3 +172,67 @@ public: private: std::unique_ptr m_cbv; }; + +class SingleUint64Value +{ +public: + using Value = Uint64IndexValue; + + SingleUint64Value() = default; + + SingleUint64Value(SingleUint64Value const & o) + { + m_empty = o.m_empty; + m_val = o.m_val; + } + + void Init(std::vector const & values) + { + CHECK_LESS_OR_EQUAL(values.size(), 1, ()); + m_empty = values.empty(); + if (!m_empty) + m_val = values[0].m_featureId; + } + + size_t Size() const { return m_empty ? 0 : 1; } + + bool IsEmpty() const { return m_empty; } + + template + void Serialize(Sink & sink, SingleValueSerializer const & /* serializer */) const + { + if (m_empty) + return; + WriteVarUint(sink, m_val); + } + + template + void Deserialize(Source & src, uint64_t valueCount, + SingleValueSerializer const & /* serializer */) + { + CHECK_LESS_OR_EQUAL(valueCount, 1, ()); + m_empty = valueCount == 0; + if (!m_empty) + m_val = ReadVarUint(src); + } + + template + void Deserialize(Source & src, SingleValueSerializer const & /* serializer */) + { + m_empty = src.Size() == 0; + if (!m_empty) + m_val = ReadVarUint(src); + } + + template + void ForEach(ToDo && toDo) const + { + if (IsEmpty()) + return; + toDo(Value(m_val)); + } + +private: + uint64_t m_val; + bool m_empty = false; +}; diff --git a/search/search_integration_tests/postcode_points_tests.cpp b/search/search_integration_tests/postcode_points_tests.cpp index 5b640fb694..c89f8d174e 100644 --- a/search/search_integration_tests/postcode_points_tests.cpp +++ b/search/search_integration_tests/postcode_points_tests.cpp @@ -46,14 +46,16 @@ UNIT_CLASS_TEST(PostcodePointsTest, Smoke) auto const postcodesRelativePath = base::JoinPath(writableDir, testFile); // ,,,,,,<2+6 NGR>,, - ScopedFile const osmScopedFile(testFile, - "aa11, 0, dummy, dummy, 0.0, 0.0, dummy, dummy, dummy\n" - "aa11, 1, dummy, dummy, 0.1, 0.1, dummy, dummy, dummy\n" - "aa11, 2, dummy, dummy, 0.2, 0.2, dummy, dummy, dummy\n"); + ScopedFile const osmScopedFile( + testFile, + "aa11 0, dummy, 1000, 1000, dummy, dummy, dummy, dummy, dummy, dummy\n" + "aa11 1, dummy, 2000, 2000, dummy, dummy, dummy, dummy, dummy, dummy\n" + "aa11 2, dummy, 3000, 3000, dummy, dummy, dummy, dummy, dummy, dummy\n"); auto infoGetter = std::make_shared(); infoGetter->AddCountry( - storage::CountryDef(countryName, m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)))); + storage::CountryDef(countryName, m2::RectD(MercatorBounds::UKCoordsToXY(999, 999), + MercatorBounds::UKCoordsToXY(30001, 30001)))); auto const id = BuildCountry(countryName, [&](TestMwmBuilder & builder) { builder.SetPostcodesData(postcodesRelativePath, infoGetter); @@ -69,28 +71,40 @@ UNIT_CLASS_TEST(PostcodePointsTest, Smoke) vector points; p.Get(NormalizeAndSimplifyString("aa11 0"), points); TEST_EQUAL(points.size(), 1, ()); - TEST(base::AlmostEqualAbs(points[0], m2::PointD(0.0, 0.0), kMwmPointAccuracy), ()); + TEST(base::AlmostEqualAbs(points[0], MercatorBounds::UKCoordsToXY(1000, 1000), + kMwmPointAccuracy), + ()); } { vector points; p.Get(NormalizeAndSimplifyString("aa11 1"), points); TEST_EQUAL(points.size(), 1, ()); - TEST(base::AlmostEqualAbs(points[0], m2::PointD(0.1, 0.1), kMwmPointAccuracy), ()); + TEST(base::AlmostEqualAbs(points[0], MercatorBounds::UKCoordsToXY(2000, 2000), + kMwmPointAccuracy), + ()); } { vector points; p.Get(NormalizeAndSimplifyString("aa11 2"), points); TEST_EQUAL(points.size(), 1, ()); - TEST(base::AlmostEqualAbs(points[0], m2::PointD(0.2, 0.2), kMwmPointAccuracy), ()); + TEST(base::AlmostEqualAbs(points[0], MercatorBounds::UKCoordsToXY(3000, 3000), + kMwmPointAccuracy), + ()); } { vector points; p.Get(NormalizeAndSimplifyString("aa11"), points); TEST_EQUAL(points.size(), 3, ()); sort(points.begin(), points.end()); - TEST(base::AlmostEqualAbs(points[0], m2::PointD(0.0, 0.0), kMwmPointAccuracy), ()); - TEST(base::AlmostEqualAbs(points[1], m2::PointD(0.1, 0.1), kMwmPointAccuracy), ()); - TEST(base::AlmostEqualAbs(points[2], m2::PointD(0.2, 0.2), kMwmPointAccuracy), ()); + TEST(base::AlmostEqualAbs(points[0], MercatorBounds::UKCoordsToXY(1000, 1000), + kMwmPointAccuracy), + ()); + TEST(base::AlmostEqualAbs(points[1], MercatorBounds::UKCoordsToXY(2000, 2000), + kMwmPointAccuracy), + ()); + TEST(base::AlmostEqualAbs(points[2], MercatorBounds::UKCoordsToXY(3000, 3000), + kMwmPointAccuracy), + ()); } } @@ -104,13 +118,15 @@ UNIT_CLASS_TEST(PostcodePointsTest, SearchPostcode) auto const postcodesRelativePath = base::JoinPath(writableDir, testFile); // ,,,,,,<2+6 NGR>,, - ScopedFile const osmScopedFile(testFile, - "BA6, 7JP, dummy, dummy, 0.4, 0.4, dummy, dummy, dummy\n" - "BA6, 8JP, dummy, dummy, 0.6, 0.6, dummy, dummy, dummy\n"); + ScopedFile const osmScopedFile( + testFile, + "BA6 7JP, dummy, 4000, 4000, dummy, dummy, dummy, dummy, dummy, dummy\n" + "BA6 8JP, dummy, 6000, 6000, dummy, dummy, dummy, dummy, dummy, dummy\n"); auto infoGetter = std::make_shared(); infoGetter->AddCountry( - storage::CountryDef(countryName, m2::RectD(m2::PointD(0.0, 0.0), m2::PointD(1.0, 1.0)))); + storage::CountryDef(countryName, m2::RectD(MercatorBounds::UKCoordsToXY(3000, 3000), + MercatorBounds::UKCoordsToXY(7000, 7000)))); auto const id = BuildCountry(countryName, [&](TestMwmBuilder & builder) { builder.SetPostcodesData(postcodesRelativePath, infoGetter); @@ -129,12 +145,12 @@ UNIT_CLASS_TEST(PostcodePointsTest, SearchPostcode) TEST(base::AlmostEqualAbs(expected, actual, kMwmPointAccuracy), ()); }; - test("BA6 7JP", MercatorBounds::FromLatLon(0.4, 0.4)); - test("BA6 7JP ", MercatorBounds::FromLatLon(0.4, 0.4)); - test("BA6 8JP", MercatorBounds::FromLatLon(0.6, 0.6)); - test("BA6 8JP ", MercatorBounds::FromLatLon(0.6, 0.6)); + test("BA6 7JP", MercatorBounds::UKCoordsToXY(4000, 4000)); + test("BA6 7JP ", MercatorBounds::UKCoordsToXY(4000, 4000)); + test("BA6 8JP", MercatorBounds::UKCoordsToXY(6000, 6000)); + test("BA6 8JP ", MercatorBounds::UKCoordsToXY(6000, 6000)); // Search should return center of all inward codes for outward query. - test("BA6", MercatorBounds::FromLatLon(0.5, 0.5)); - test("BA6 ", MercatorBounds::FromLatLon(0.5, 0.5)); + test("BA6", MercatorBounds::UKCoordsToXY(5000, 5000)); + test("BA6 ", MercatorBounds::UKCoordsToXY(5000, 5000)); } } // namespace