From b70f13668d4303c419bc66fbe872acf13d43226d Mon Sep 17 00:00:00 2001 From: vng Date: Thu, 20 Feb 2014 15:32:45 +0300 Subject: [PATCH] [search] Better house number parsing (take into account American style). --- indexer/feature.hpp | 2 + search/house_detector.cpp | 176 +++++++++++++------ search/house_detector.hpp | 57 +++--- search/search_tests/house_detector_tests.cpp | 90 +++++++++- 4 files changed, 230 insertions(+), 95 deletions(-) diff --git a/indexer/feature.hpp b/indexer/feature.hpp index 4c65698268..ba8859a69c 100644 --- a/indexer/feature.hpp +++ b/indexer/feature.hpp @@ -230,6 +230,8 @@ public: void GetPreferredNames(string & defaultName, string & intName) const; /// Get one most suitable name for user. void GetReadableName(string & name) const; + + static int8_t const DEFAULT_LANG = StringUtf8Multilang::DEFAULT_CODE; bool GetName(int8_t lang, string & name) const; //@} diff --git a/search/house_detector.cpp b/search/house_detector.cpp index 8d8aae5915..340895289b 100644 --- a/search/house_detector.cpp +++ b/search/house_detector.cpp @@ -98,18 +98,26 @@ public: /// @todo Move prefixes, suffixes into separate file (autogenerated). /// "Набережная" улица встречается в городах -string affics1[] = +string affics[] = { + // Russian "аллея", "бульвар", "набережная", "переулок", "площадь", "проезд", "проспект", "шоссе", - "тупик", "улица", "тракт" -}; + "тупик", "улица", "тракт", -string affics2[] = -{ "ал", "бул", "наб", "пер", "пл", "пр", "просп", "ш", - "туп", "ул", "тр" + "туп", "ул", "тр", + + // English + "street", "avenue", "square", "road", "drive", + "st", "av", "sq", "rd", + + // German + "strasse", "weg", "platz", + + // Lithuanian + "g", "pr", "pl", "kel", }; void GetStreetName(strings::SimpleTokenizer iter, string & streetName) @@ -120,9 +128,9 @@ void GetStreetName(strings::SimpleTokenizer iter, string & streetName) ++iter; bool flag = true; - for (size_t i = 0; i < ARRAY_SIZE(affics2); ++i) + for (size_t i = 0; i < ARRAY_SIZE(affics); ++i) { - if (s == affics2[i] || s == affics1[i]) + if (s == affics[i]) { flag = false; break; @@ -134,18 +142,9 @@ void GetStreetName(strings::SimpleTokenizer iter, string & streetName) } } -int GetIntHouse(string const & s) -{ - char const * start = s.c_str(); - char * stop; - long const x = strtol(start, &stop, 10); - return (stop == start ? -1 : x); -} - double const STREET_CONNECTION_LENGTH_M = 100.0; char const * STREET_TOKENS_SEPARATOR = "\t -,."; -char const * HN_TOKENS_SEPARATOR = ",-; "; int const HN_NEARBY_DISTANCE = 4; double const STREET_CONNECTION_MAX_ANGLE = math::pi / 2.0; size_t const HN_COUNT_FOR_ODD_TEST = 16; @@ -175,49 +174,85 @@ void GetStreetNameAsKey(string const & name, string & res) GetStreetName(iter, res); } -void House::InitHouseNumber() +ParsedNumber::ParsedNumber(string const & number, bool american) + : m_fullN(number) { - strings::SimpleTokenizer it(m_number, HN_TOKENS_SEPARATOR); - while (it) + strings::MakeLowerCase(m_fullN); + + char * curr; + m_startN = strtol(number.c_str(), &curr, 10); + m_endN = -1; + ASSERT_GREATER_OR_EQUAL(m_startN, 0, (number)); + + bool hasMinus = false; + bool hasComma = false; + while (curr && *curr != 0) { - int const number = GetIntHouse(*it); - if (number != -1) + switch (*curr) { - if (m_startN == -1) - m_startN = number; - else + case ' ': case '\t': ++curr; break; + case ',': case ';': ++curr; hasComma = true; break; + case '-': ++curr; hasMinus = true; break; + default: { - // always assign to get house number boundaries [176, 182] - m_endN = number; + if (hasComma || hasMinus) + { + char const * start = curr; + long const x = strtol(start, &curr, 10); + if (curr != start) + { + m_endN = x; + ASSERT_GREATER_OR_EQUAL(m_endN, 0, (number)); + break; + } + } + + curr = 0; + break; } } - - ++it; } - ASSERT_GREATER_OR_EQUAL(m_startN, 0, (m_number)); - - if (m_endN != -1 && m_startN > m_endN) - swap(m_startN, m_endN); + if (m_endN != -1) + { + if (hasMinus && american) + { + m_startN = m_startN * 100 + m_endN; + m_endN = -1; + } + else + { + if (abs(m_endN - m_startN) >= 2*HN_NEARBY_DISTANCE) + m_endN = -1; + else + { + if (m_startN > m_endN) + swap(m_startN, m_endN); + } + } + } } -House::ParsedNumber::ParsedNumber(string const & number) - : m_fullN(&number), m_intN(GetIntHouse(number)) +bool ParsedNumber::IsIntersect(ParsedNumber const & number, int offset) const { + int const n = number.GetIntNumber(); + if (((m_endN == -1) && abs(GetIntNumber() - n) > offset) || + ((m_endN != -1) && (m_startN - offset > n || m_endN + offset < n))) + { + return false; + } + return true; } int House::GetMatch(ParsedNumber const & number) const { - if (((m_endN == -1) && m_startN != number.m_intN) || - ((m_endN != -1) && (m_startN > number.m_intN || m_endN < number.m_intN))) - { + if (!m_number.IsIntersect(number)) return -1; - } - if (*number.m_fullN == m_number) + if (m_number.GetNumber() == number.GetNumber()) return 0; - if ((number.m_intN % 2 == 0) == (m_startN % 2 == 0)) + if (m_number.IsOdd() == number.IsOdd()) return 1; return 2; @@ -225,13 +260,7 @@ int House::GetMatch(ParsedNumber const & number) const bool House::GetNearbyMatch(ParsedNumber const & number) const { - if (((m_endN == -1) && abs(m_startN - number.m_intN) > HN_NEARBY_DISTANCE) || - ((m_endN != -1) && (m_startN - HN_NEARBY_DISTANCE > number.m_intN || m_endN + HN_NEARBY_DISTANCE < number.m_intN))) - { - return false; - } - - return true; + return m_number.IsIntersect(number, HN_NEARBY_DISTANCE); } FeatureLoader::FeatureLoader(Index const * pIndex) @@ -451,10 +480,11 @@ int HouseDetector::LoadStreets(vector const & ids) m_loader.Load(ids[i], f); if (f.GetFeatureType() == feature::GEOM_LINE) { - /// @todo Assume that default name always exist as primary compare key. + // Use default name as a primary compare key for merging. string name; - if (!f.GetName(0, name) || name.empty()) + if (!f.GetName(FeatureType::DEFAULT_LANG, name)) continue; + ASSERT(!name.empty(), ()); ++count; @@ -764,6 +794,8 @@ void HouseDetector::ReadHouses(Street * st, double offsetMeters) void HouseDetector::ReadAllHouses(double offsetMeters) { + m_houseOffsetM = offsetMeters; + for (StreetMapT::iterator it = m_id2st.begin(); it != m_id2st.end(); ++it) ReadHouses(it->second, offsetMeters); @@ -790,6 +822,19 @@ void HouseDetector::ClearCaches() m_streets.clear(); } +string DebugPrint(HouseProjection const & p) +{ + return p.m_house->GetNumber(); +} + +template void LogSequence(vector const & v) +{ +#ifdef DEBUG + for (size_t i = 0; i < v.size(); ++i) + LOG(LDEBUG, (*v[i])); +#endif +} + namespace { @@ -803,7 +848,7 @@ struct ScoredHouse class ResultAccumulator { - House::ParsedNumber m_parsedNumber; + ParsedNumber m_number; bool m_isOdd, m_sign; ScoredHouse m_results[4]; @@ -815,16 +860,16 @@ class ResultAccumulator public: ResultAccumulator(string const & houseNumber) - : m_parsedNumber(houseNumber) + : m_number(houseNumber) { } - string const & GetFullNumber() const { return *m_parsedNumber.m_fullN; } + string const & GetFullNumber() const { return m_number.GetNumber(); } bool SetStreet(MergedStreet const & st) { Reset(); - m_isOdd = m_parsedNumber.IsOdd(); + m_isOdd = m_number.IsOdd(); return st.GetHousePivot(m_isOdd, m_sign) != 0; } @@ -849,10 +894,10 @@ public: void MatchCandidate(HouseProjection const & p, bool checkNearby) { - int ind = p.m_house->GetMatch(m_parsedNumber); + int ind = p.m_house->GetMatch(m_number); if (ind == -1) { - if (checkNearby && p.m_house->GetNearbyMatch(m_parsedNumber)) + if (checkNearby && p.m_house->GetNearbyMatch(m_number)) ind = 3; else return; @@ -1101,6 +1146,18 @@ House const * GetBestHouseWithNumber(MergedStreet const & st, ResultAccumulator return ProccessHouses(v, acc.GetFullNumber()).house; } +struct CompareHouseNumber +{ + inline bool Less(HouseProjection const * h1, HouseProjection const * h2) const + { + return (h1->m_house->GetIntNumber() <= h2->m_house->GetIntNumber()); + } + inline bool Greater(HouseProjection const * h1, HouseProjection const * h2) const + { + return (h1->m_house->GetIntNumber() >= h2->m_house->GetIntNumber()); + } +}; + void LongestSubsequence(vector const & v, vector & res) { @@ -1122,6 +1179,9 @@ void GetLSHouse(MergedStreet const & st, double offsetMeters, ResultAccumulator vector res; LongestSubsequence(v, res); + //LOG(LDEBUG, ("=== Offset", offsetMeters, "===")); + //LogSequence(res); + for (size_t i = 0; i < res.size(); ++i) acc.MatchCandidate(*(res[i]), true); @@ -1145,8 +1205,8 @@ void HouseDetector::GetHouseForName(string const & houseNumber, vector void ForEachInRect(m2::RectD const & rect, ToDo toDo); }; +struct ParsedNumber +{ + string m_fullN; + int m_startN, m_endN; + +public: + /// @todo Pass correct "American" notation flag. + ParsedNumber(string const & number, bool american = false); + + inline string const & GetNumber() const { return m_fullN; } + inline bool IsOdd() const { return (m_startN % 2 == 1); } + inline int GetIntNumber() const { return m_startN; } + + bool IsIntersect(ParsedNumber const & number, int offset = 0) const; +}; + class House { - string m_number; + ParsedNumber m_number; m2::PointD m_point; - // Start and End house numbers for this object. - // Osm objects can store many numbers for one area feature. - int m_startN, m_endN; - friend struct CompareHouseNumber; - - void InitHouseNumber(); - public: House(string const & number, m2::PointD const & point) - : m_number(number), m_point(point), m_startN(-1), m_endN(-1) + : m_number(number), m_point(point) { - InitHouseNumber(); } - inline string const & GetNumber() const { return m_number; } - inline int GetIntNumber() const { return m_startN; } + inline string const & GetNumber() const { return m_number.GetNumber(); } + inline int GetIntNumber() const { return m_number.GetIntNumber(); } inline m2::PointD const & GetPosition() const { return m_point; } - struct ParsedNumber - { - string const * m_fullN; - int m_intN; - - ParsedNumber(string const & number); - - bool IsOdd() const { return (m_intN % 2 == 1); } - }; - /// @return \n /// -1 - no match; /// 0 - full match; @@ -103,18 +101,6 @@ struct HouseProjection }; }; -struct CompareHouseNumber -{ - bool Less(HouseProjection const * h1, HouseProjection const * h2) const - { - return (h1->m_house->m_startN <= h2->m_house->m_startN); - } - bool Greater(HouseProjection const * h1, HouseProjection const * h2) const - { - return (h1->m_house->m_startN >= h2->m_house->m_startN); - } -}; - // many features combines to street class Street { @@ -213,6 +199,7 @@ class HouseDetector double m_metres2Mercator; int m_streetNum; + double m_houseOffsetM; typedef pair StreetPtr; StreetPtr FindConnection(Street const * st, bool beg) const; @@ -233,7 +220,7 @@ public: /// @return number of different joined streets. int MergeStreets(); - static int const DEFAULT_OFFSET_M = 500; + static int const DEFAULT_OFFSET_M = 200; void ReadAllHouses(double offsetMeters = DEFAULT_OFFSET_M); void GetHouseForName(string const & houseNumber, vector & res); diff --git a/search/search_tests/house_detector_tests.cpp b/search/search_tests/house_detector_tests.cpp index 2a67d1767f..8e54675ff9 100644 --- a/search/search_tests/house_detector_tests.cpp +++ b/search/search_tests/house_detector_tests.cpp @@ -95,6 +95,90 @@ public: } }; +UNIT_TEST(HS_ParseNumber) +{ + typedef search::ParsedNumber NumberT; + + { + NumberT n("135"); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 135, ()); + + NumberT n1("133"); + NumberT n2("137"); + TEST(n.IsIntersect(n1, 2), ()); + TEST(!n.IsIntersect(n1, 1), ()); + TEST(n.IsIntersect(n2, 2), ()); + TEST(!n.IsIntersect(n2, 1), ()); + } + + { + NumberT n("135 1к/2"); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 135, ()); + + TEST(!n.IsIntersect(NumberT("134")), ()); + TEST(!n.IsIntersect(NumberT("136")), ()); + } + + { + NumberT n("135A"); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 135, ()); + + TEST(!n.IsIntersect(NumberT("134")), ()); + TEST(!n.IsIntersect(NumberT("136")), ()); + } + + { + NumberT n("135-к1", false); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 135, ()); + + TEST(!n.IsIntersect(NumberT("134")), ()); + TEST(!n.IsIntersect(NumberT("136")), ()); + } + + { + NumberT n("135-12", false); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 135, ()); + + TEST(!n.IsIntersect(NumberT("134")), ()); + TEST(!n.IsIntersect(NumberT("136")), ()); + } + + + { + NumberT n("135-24", true); + TEST(!n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 13524, ()); + } + + { + NumberT n("135;133;131"); + TEST(n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 131, ()); + + for (int i = 131; i <= 135; ++i) + TEST(n.IsIntersect(NumberT(strings::to_string(i))), ()); + TEST(!n.IsIntersect(NumberT("130")), ()); + TEST(!n.IsIntersect(NumberT("136")), ()); + } + + { + NumberT n("6-10", false); + TEST(!n.IsOdd(), ()); + TEST_EQUAL(n.GetIntNumber(), 6, ()); + + for (int i = 6; i <= 10; ++i) + TEST(n.IsIntersect(NumberT(strings::to_string(i))), ()); + + TEST(!n.IsIntersect(NumberT("5")), ()); + TEST(!n.IsIntersect(NumberT("11")), ()); + } +} + UNIT_TEST(HS_StreetsMerge) { classificator::Load(); @@ -279,7 +363,9 @@ void swap(Address & a1, Address & a2) UNIT_TEST(HS_MWMSearch) { - string const path = GetPlatform().WritableDir() + "addresses-Minsk.txt"; + string const country = "Minsk"; //"Belarus"; //"USA_New York"; + + string const path = GetPlatform().WritableDir() + "addresses-" + country + ".txt"; ifstream file(path.c_str()); if (!file.good()) { @@ -289,7 +375,7 @@ UNIT_TEST(HS_MWMSearch) Index index; m2::RectD rect; - if (!index.Add("Minsk.mwm", rect)) + if (!index.Add(country + ".mwm", rect)) { LOG(LWARNING, ("MWM file not found")); return;