diff --git a/search/search_tests/house_numbers_matcher_test.cpp b/search/search_tests/house_numbers_matcher_test.cpp index 524ed06c0e..eef84bdb66 100644 --- a/search/search_tests/house_numbers_matcher_test.cpp +++ b/search/search_tests/house_numbers_matcher_test.cpp @@ -25,7 +25,7 @@ bool HouseNumbersMatch(string const & houseNumber, string const & query) strings::MakeUniString(query)); } -void CheckTokenizer(string const & utf8s, vector const & expected) +bool CheckTokenizer(string const & utf8s, vector const & expected) { UniString utf32s = MakeUniString(utf8s); vector tokens; @@ -34,10 +34,10 @@ void CheckTokenizer(string const & utf8s, vector const & expected) vector actual; for (auto const & token : tokens) actual.push_back(ToUtf8(token.m_token)); - TEST_EQUAL(actual, expected, ()); + return actual == expected; } -void CheckNormalizer(string const & utf8s, string const & expected) +bool CheckNormalizer(string const & utf8s, string const & expected) { vector tokens; NormalizeHouseNumber(utf8s, tokens); @@ -49,27 +49,27 @@ void CheckNormalizer(string const & utf8s, string const & expected) if (i + 1 != tokens.size()) actual.push_back(' '); } - TEST_EQUAL(actual, expected, ()); + return actual == expected; } } // namespace UNIT_TEST(HouseNumberTokenizer_Smoke) { - CheckTokenizer("123Б", {"123", "Б"}); - CheckTokenizer("123/Б", {"123", "Б"}); - CheckTokenizer("123/34 корп. 4 стр1", {"123", "34", "корп", "4", "стр", "1"}); + TEST(CheckTokenizer("123Б", {"123", "Б"}), ()); + TEST(CheckTokenizer("123/Б", {"123", "Б"}), ()); + TEST(CheckTokenizer("123/34 корп. 4 стр1", {"123", "34", "корп", "4", "стр", "1"}), ()); } UNIT_TEST(HouseNumberNormalizer_Smoke) { - CheckNormalizer("123Б", "123б"); - CheckNormalizer("123/4 Литер А", "123 4 а"); - CheckNormalizer("123а корп. 2б", "123а 2б"); - CheckNormalizer("123к4", "123 4"); - CheckNormalizer("123к Корпус 2", "123к 2"); - CheckNormalizer("9 литер А корпус 2", "9 а 2"); - CheckNormalizer("39с79", "39 79"); - CheckNormalizer("9 литер аб1", "9 аб1"); + TEST(CheckNormalizer("123Б", "123б"), ()); + TEST(CheckNormalizer("123/4 Литер А", "123 4 а"), ()); + TEST(CheckNormalizer("123а корп. 2б", "123а 2б"), ()); + TEST(CheckNormalizer("123к4", "123 4"), ()); + TEST(CheckNormalizer("123к Корпус 2", "123к 2"), ()); + TEST(CheckNormalizer("9 литер А корпус 2", "9 2 а"), ()); + TEST(CheckNormalizer("39с79", "39 79"), ()); + TEST(CheckNormalizer("9 литер аб1", "9 аб1"), ()); } UNIT_TEST(HouseNumbersMatcher_Smoke) @@ -83,6 +83,12 @@ UNIT_TEST(HouseNumbersMatcher_Smoke) TEST(HouseNumbersMatch("127а корпус 2", "127а кор. 2"), ()); TEST(HouseNumbersMatch("1234abcdef", "1234 abcdef"), ()); TEST(HouseNumbersMatch("10/42 корпус 2", "10"), ()); + TEST(HouseNumbersMatch("10 к2 с2", "10 корпус 2"), ()); + TEST(HouseNumbersMatch("10 к2 с2", "10 корпус 2 с 2"), ()); + TEST(HouseNumbersMatch("10 корпус 2 строение 2", "10 к2 с2"), ()); + TEST(HouseNumbersMatch("10 корпус 2 строение 2", "10к2с2"), ()); + TEST(HouseNumbersMatch("10к2а", "10 2а"), ()); + TEST(HouseNumbersMatch("10 к2с", "10 2с"), ()); TEST(!HouseNumbersMatch("39", "39 с 79"), ()); TEST(!HouseNumbersMatch("127а корпус 2", "127"), ()); @@ -90,3 +96,11 @@ UNIT_TEST(HouseNumbersMatcher_Smoke) TEST(!HouseNumbersMatch("10/42 корпус 2", "42"), ()); TEST(!HouseNumbersMatch("--...--.-", "--.....-"), ()); } + +UNIT_TEST(HouseNumbersMatcher_TwoStages) +{ + strings::UniString number = strings::MakeUniString("10 к2 с2"); + vector tokens; + NormalizeHouseNumber(number, tokens); + TEST(HouseNumbersMatch(number, tokens), (number, tokens)); +} diff --git a/search/v2/house_numbers_matcher.cpp b/search/v2/house_numbers_matcher.cpp index 377520f1b1..2fd37271f1 100644 --- a/search/v2/house_numbers_matcher.cpp +++ b/search/v2/house_numbers_matcher.cpp @@ -2,6 +2,8 @@ #include "std/algorithm.hpp" #include "std/iterator.hpp" +#include "std/limits.hpp" +#include "std/sstream.hpp" #include "base/logging.hpp" @@ -13,6 +15,8 @@ namespace v2 { namespace { +size_t constexpr kInvalidNum = numeric_limits::max(); + HouseNumberTokenizer::CharClass GetCharClass(UniChar c) { static UniString const kSeps = MakeUniString("\"\\/(),. \t№#-"); @@ -38,22 +42,20 @@ bool IsNumberOrShortWord(HouseNumberTokenizer::Token const & t) return IsNumber(t) || IsShortWord(t); } -// Returns number of tokens starting at position |i|, -// where the first token is some way of writing of "корпус", or -// "building", second token is a number or a letter, and (possibly) -// third token which can be a letter when second token is a -// number. -size_t GetNumTokensForBuildingPart(vector const & ts, size_t i) +size_t GetNumTokensForBuildingPart(vector const & ts, size_t i, + vector & memory); + +size_t GetNumTokensForBuildingPartImpl(vector const & ts, size_t i, + vector & memory) { + ASSERT_LESS(i, ts.size(), ()); + // TODO (@y, @m, @vng): move these constans out. static UniString kSynonyms[] = {MakeUniString("building"), MakeUniString("unit"), MakeUniString("block"), MakeUniString("корпус"), MakeUniString("литер"), MakeUniString("строение"), MakeUniString("блок")}; - if (i >= ts.size()) - return 0; - auto const & token = ts[i]; if (token.m_klass != HouseNumberTokenizer::CharClass::Other) return 0; @@ -78,14 +80,40 @@ size_t GetNumTokensForBuildingPart(vector const & t size_t j = i + 2; // Consume one more number of short word, if possible. - if (j < ts.size() && IsNumberOrShortWord(ts[j]) && ts[j].m_klass != ts[j - 1].m_klass) + if (j < ts.size() && IsNumberOrShortWord(ts[j]) && ts[j].m_klass != ts[j - 1].m_klass && + GetNumTokensForBuildingPart(ts, j, memory) == 0) + { ++j; + } return j - i; } +// Returns number of tokens starting at position |i|, where the first +// token is some way of writing of "корпус", or "building", second +// token is a number or a letter, and (possibly) third token which can +// be a letter when second token is a number. |memory| is used here to +// store results of previous calls and prevents degradation to +// non-linear time. +// +// TODO (@y, @m): the parser is quite complex now. Consider to just +// throw out all prefixes of "building" or "литер" and sort rest +// tokens. Number of false positives will be higher but the parser +// will be more robust, simple and faster. +size_t GetNumTokensForBuildingPart(vector const & ts, size_t i, + vector & memory) +{ + if (i >= ts.size()) + return 0; + if (memory[i] == kInvalidNum) + memory[i] = GetNumTokensForBuildingPartImpl(ts, i, memory); + return memory[i]; +} + void MergeTokens(vector const & ts, vector & rs) { + vector memory(ts.size(), kInvalidNum); + size_t i = 0; while (i < ts.size()) { @@ -96,7 +124,7 @@ void MergeTokens(vector const & ts, vector const & ts, vector tokens; HouseNumberTokenizer::Tokenize(MakeLowerCase(s), tokens); MergeTokens(tokens, ts); + + if (!ts.empty()) + sort(ts.begin() + 1, ts.end()); } bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query) @@ -168,9 +199,6 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin vector queryTokens; NormalizeHouseNumber(query, queryTokens); - if (!queryTokens.empty()) - sort(queryTokens.begin() + 1, queryTokens.end()); - return HouseNumbersMatch(houseNumber, queryTokens); } @@ -191,8 +219,6 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, vector const & queryTokens); + +string DebugPrint(HouseNumberTokenizer::CharClass charClass); + +string DebugPrint(HouseNumberTokenizer::Token const & token); } // namespace v2 } // namespace search