forked from organicmaps/organicmaps
[search] Implemented house numbers matching.
This commit is contained in:
parent
4f735b9d3c
commit
91c13d4fec
5 changed files with 318 additions and 0 deletions
|
@ -46,6 +46,7 @@ HEADERS += \
|
|||
v2/features_layer_matcher.hpp \
|
||||
v2/features_layer_path_finder.hpp \
|
||||
v2/geocoder.hpp \
|
||||
v2/house_numbers_matcher.hpp \
|
||||
v2/search_model.hpp \
|
||||
v2/search_query_v2.hpp \
|
||||
v2/street_vicinity_loader.hpp \
|
||||
|
@ -78,6 +79,7 @@ SOURCES += \
|
|||
v2/features_layer_matcher.cpp \
|
||||
v2/features_layer_path_finder.cpp \
|
||||
v2/geocoder.cpp \
|
||||
v2/house_numbers_matcher.cpp \
|
||||
v2/search_model.cpp \
|
||||
v2/search_query_v2.cpp \
|
||||
v2/street_vicinity_loader.cpp \
|
||||
|
|
72
search/search_tests/house_numbers_matcher_test.cpp
Normal file
72
search/search_tests/house_numbers_matcher_test.cpp
Normal file
|
@ -0,0 +1,72 @@
|
|||
#include "testing/testing.hpp"
|
||||
|
||||
#include "search/v2/house_numbers_matcher.hpp"
|
||||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
using namespace strings;
|
||||
using namespace search::v2;
|
||||
|
||||
// Test helper: tokenizes |utf8s| with HouseNumberTokenizer and checks
// that the UTF-8 text of the produced tokens, in order, equals
// |expected|. Token classes are not checked here.
void CheckTokenizer(string const & utf8s, vector<string> const & expected)
{
  vector<HouseNumberTokenizer::Token> parsed;
  HouseNumberTokenizer::Tokenize(MakeUniString(utf8s), parsed);

  // Convert every token back to UTF-8 so that failures are readable.
  vector<string> actual;
  for (size_t i = 0; i < parsed.size(); ++i)
    actual.push_back(ToUtf8(parsed[i].m_token));

  TEST_EQUAL(actual, expected, ());
}
|
||||
|
||||
// Test helper: normalizes |utf8s| with NormalizeHouseNumber, joins the
// resulting tokens with single spaces and checks that the joined
// string equals |expected|.
void CheckNormalizer(string const & utf8s, string const & expected)
{
  vector<string> parts;
  NormalizeHouseNumber(utf8s, parts);

  string joined;
  bool first = true;
  for (auto const & part : parts)
  {
    // A space goes between tokens only, never before the first one.
    if (!first)
      joined.push_back(' ');
    joined.append(part);
    first = false;
  }

  TEST_EQUAL(joined, expected, ());
}
|
||||
|
||||
// Smoke test for HouseNumberTokenizer: the input must be split into
// maximal runs of digits, separators and other symbols. Separator
// runs (e.g. "/", " ", ". ") are kept as tokens at this stage.
UNIT_TEST(HouseNumberTokenizer_Smoke)
|
||||
{
|
||||
CheckTokenizer("123Б", {"123", "Б"});
|
||||
CheckTokenizer("123/Б", {"123", "/", "Б"});
|
||||
CheckTokenizer("123/34 корп. 4 стр1",
|
||||
{"123", "/", "34", " ", "корп", ". ", "4", " ", "стр", "1"});
|
||||
}
|
||||
|
||||
// Smoke test for NormalizeHouseNumber. The expectations below show
// that the input is lower-cased, separators are dropped, a short
// word right after a number is glued to it ("123Б" -> "123б"), and a
// building designator (full word or its prefix such as "к", "с",
// "корп") together with the value that follows it is folded into a
// single token prefixed with "b.".
UNIT_TEST(HouseNumberNormalizer_Smoke)
|
||||
{
|
||||
CheckNormalizer("123Б", "123б");
|
||||
CheckNormalizer("123/4 Литер А", "123 4 b.а");
|
||||
CheckNormalizer("123а корп. 2б", "123а b.2б");
|
||||
CheckNormalizer("123к4", "123 b.4");
|
||||
CheckNormalizer("123к Корпус 2", "123к b.2");
|
||||
CheckNormalizer("9 литер А корпус 2", "9 b.а b.2");
|
||||
CheckNormalizer("39с79", "39 b.79");
|
||||
CheckNormalizer("9 литер аб1", "9 b.аб1");
|
||||
}
|
||||
|
||||
// Smoke test for HouseNumbersMatch(houseNumber, query). A query is
// expected to match when its first normalized token equals the first
// normalized token of the house number and its remaining tokens are
// all present in the house number, regardless of how the building
// designator is spelled.
UNIT_TEST(HouseNumbersMatcher_Smoke)
|
||||
{
|
||||
TEST(HouseNumbersMatch("39с79", "39"), ());
|
||||
TEST(HouseNumbersMatch("39с79", "39 Строение 79"), ());
|
||||
TEST(HouseNumbersMatch("39с79", "39 к. 79"), ());
|
||||
TEST(HouseNumbersMatch("127а корпус 2", "127а"), ());
|
||||
TEST(HouseNumbersMatch("127а корпус 2", "127а кор. 2"), ());
|
||||
TEST(HouseNumbersMatch("1234abcdef", "1234 abcdef"), ());
|
||||
TEST(HouseNumbersMatch("10/42 корпус 2", "10"), ());
|
||||
|
||||
// Negative cases: the first tokens differ.
TEST(!HouseNumbersMatch("127а корпус 2", "127"), ());
|
||||
TEST(!HouseNumbersMatch("6 корпус 2", "7"), ());
|
||||
TEST(!HouseNumbersMatch("10/42 корпус 2", "42"), ());
|
||||
}
|
|
@ -19,6 +19,7 @@ SOURCES += \
|
|||
algos_tests.cpp \
|
||||
categories_test.cpp \
|
||||
house_detector_tests.cpp \
|
||||
house_numbers_matcher_test.cpp \
|
||||
interval_set_test.cpp \
|
||||
keyword_lang_matcher_test.cpp \
|
||||
keyword_matcher_test.cpp \
|
||||
|
|
199
search/v2/house_numbers_matcher.cpp
Normal file
199
search/v2/house_numbers_matcher.cpp
Normal file
|
@ -0,0 +1,199 @@
|
|||
#include "search/v2/house_numbers_matcher.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/iterator.hpp"
|
||||
|
||||
using namespace strings;
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace v2
|
||||
{
|
||||
namespace
|
||||
{
|
||||
// Classifies a single character: ASCII decimal digits are Digit, the
// characters listed in |kSeparators| are Separator, and everything
// else is Other.
HouseNumberTokenizer::CharClass GetCharClass(UniChar c)
{
  using CharClass = HouseNumberTokenizer::CharClass;

  static UniString const kSeparators = MakeUniString("\"\\/(),. \t№#");

  bool const isDigit = '0' <= c && c <= '9';
  if (isDigit)
    return CharClass::Digit;

  bool const isSeparator = find(kSeparators.begin(), kSeparators.end(), c) != kSeparators.end();
  return isSeparator ? CharClass::Separator : CharClass::Other;
}
|
||||
|
||||
// Returns true when |t| is a token of class Other that is at most
// three characters long.
bool IsShortWord(HouseNumberTokenizer::Token const & t)
{
  if (t.m_klass != HouseNumberTokenizer::CharClass::Other)
    return false;
  return t.m_token.size() <= 3;
}
|
||||
|
||||
// Returns true when |t| is a run of digits.
bool IsNumber(HouseNumberTokenizer::Token const & t)
{
  bool const isDigits = (HouseNumberTokenizer::CharClass::Digit == t.m_klass);
  return isDigits;
}
|
||||
|
||||
// Returns true when |t| is either a run of digits or a short word
// (see IsNumber and IsShortWord).
bool IsNumberOrShortWord(HouseNumberTokenizer::Token const & t)
{
  if (IsNumber(t))
    return true;
  return IsShortWord(t);
}
|
||||
|
||||
// Returns number of tokens starting at position |i|,
|
||||
// where the first token is some way of writing of "корпус", or
|
||||
// "building", second token is a number or a letter, and (possibly)
|
||||
// third token which can be a letter when second token is a
|
||||
// number.
|
||||
size_t IsBuilding(vector<HouseNumberTokenizer::Token> const & ts, size_t i)
|
||||
{
|
||||
static UniString kSynonyms[] = {MakeUniString("building"), MakeUniString("unit"),
|
||||
MakeUniString("block"), MakeUniString("корпус"),
|
||||
MakeUniString("литер"), MakeUniString("строение"),
|
||||
MakeUniString("блок")};
|
||||
|
||||
if (i >= ts.size())
|
||||
return 0;
|
||||
|
||||
auto const & token = ts[i];
|
||||
if (token.m_klass != HouseNumberTokenizer::CharClass::Other)
|
||||
return 0;
|
||||
|
||||
bool prefix = false;
|
||||
for (UniString const & synonym : kSynonyms)
|
||||
{
|
||||
if (StartsWith(synonym, token.m_token))
|
||||
{
|
||||
prefix = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!prefix)
|
||||
return 0;
|
||||
|
||||
// No sense in single "корпус" или "литер".
|
||||
if (i + 1 >= ts.size() || !IsNumberOrShortWord(ts[i + 1]))
|
||||
return 0;
|
||||
|
||||
// Consume next token, either number or short word.
|
||||
size_t j = i + 2;
|
||||
|
||||
// Consume one more number of short word, if possible.
|
||||
if (j < ts.size() && IsNumberOrShortWord(ts[j]) && ts[j].m_klass != ts[j - 1].m_klass)
|
||||
++j;
|
||||
|
||||
return j - i;
|
||||
}
|
||||
|
||||
// Merges the separator-free token sequence |ts| into normalized
// tokens appended to |rs|:
//  * a number followed by a short word that does not start a building
//    group is glued with that word ("123" + "б" -> "123б");
//  * a building group (see IsBuilding) becomes a single token made of
//    the "b." prefix and the group's values;
//  * any other word is copied as is.
// |ts| must not contain Separator tokens.
void MergeTokens(vector<HouseNumberTokenizer::Token> const & ts, vector<UniString> & rs)
{
  using CharClass = HouseNumberTokenizer::CharClass;

  size_t i = 0;
  while (i < ts.size())
  {
    switch (ts[i].m_klass)
    {
    case CharClass::Digit:
    {
      UniString merged = ts[i].m_token;
      size_t const next = i + 1;

      // Process cases like "123 б" or "9PQ".
      bool const hasSuffix =
          next < ts.size() && IsShortWord(ts[next]) && IsBuilding(ts, next) == 0;
      if (hasSuffix)
        merged.append(ts[next].m_token.begin(), ts[next].m_token.end());

      rs.push_back(move(merged));
      i = hasSuffix ? next + 1 : next;
      break;
    }
    case CharClass::Separator:
    {
      ASSERT(false, ("Seps can't be merged."));
      ++i;
      break;
    }
    case CharClass::Other:
    {
      size_t const numTokens = IsBuilding(ts, i);
      if (numTokens == 0)
      {
        // An ordinary word, keep it unchanged.
        rs.push_back(ts[i].m_token);
        ++i;
        break;
      }

      // Skip the designator itself and concatenate its values after
      // the common "b." prefix.
      UniString merged = MakeUniString("b.");
      size_t const end = i + numTokens;
      for (++i; i < end; ++i)
        merged.append(ts[i].m_token.begin(), ts[i].m_token.end());
      rs.push_back(move(merged));
      break;
    }
    }
  }
}
|
||||
} // namespace
|
||||
|
||||
// static
|
||||
void HouseNumberTokenizer::Tokenize(UniString const & s, vector<Token> & ts)
|
||||
{
|
||||
size_t i = 0;
|
||||
while (i < s.size())
|
||||
{
|
||||
CharClass klass = GetCharClass(s[i]);
|
||||
UniString token;
|
||||
while (i < s.size() && GetCharClass(s[i]) == klass)
|
||||
{
|
||||
token.push_back(s[i]);
|
||||
++i;
|
||||
}
|
||||
ts.emplace_back(move(token), klass);
|
||||
}
|
||||
}
|
||||
|
||||
// Normalizes the house number |s|: lower-cases it, splits it into
// tokens, drops separator tokens, merges the remaining tokens (see
// MergeTokens) and appends the result to |ts| as UTF-8 strings.
void NormalizeHouseNumber(string const & s, vector<string> & ts)
{
  using CharClass = HouseNumberTokenizer::CharClass;

  vector<HouseNumberTokenizer::Token> rawTokens;
  HouseNumberTokenizer::Tokenize(MakeLowerCase(MakeUniString(s)), rawTokens);

  // Separators carry no information for matching, and MergeTokens
  // does not expect them.
  auto const firstSeparator =
      remove_if(rawTokens.begin(), rawTokens.end(), [](HouseNumberTokenizer::Token const & t)
                {
                  return t.m_klass == CharClass::Separator;
                });
  rawTokens.erase(firstSeparator, rawTokens.end());

  vector<UniString> merged;
  MergeTokens(rawTokens, merged);

  for (UniString const & token : merged)
    ts.push_back(ToUtf8(token));
}
|
||||
|
||||
// Returns true when |query| matches |houseNumber|.
//
// Identical strings always match. Otherwise both strings are
// normalized (see NormalizeHouseNumber); the first tokens, which are
// expected to be the house numbers proper, must be equal, and the
// remaining query tokens are looked up, in any order, among the
// remaining tokens of the house number.
//
// Returns false when either string normalizes to no tokens at all,
// e.g. when it is empty or consists of separators only.
bool HouseNumbersMatch(string const & houseNumber, string const & query)
{
  if (houseNumber == query)
    return true;

  vector<string> houseNumberTokens;
  NormalizeHouseNumber(houseNumber, houseNumberTokens);

  vector<string> queryTokens;
  NormalizeHouseNumber(query, queryTokens);

  // The token lists must be checked here, not the raw strings: a
  // non-empty query made of separators only (e.g. ",") produces no
  // tokens, and front() / begin() + 1 below would be undefined
  // behaviour on an empty vector.
  if (houseNumberTokens.empty() || queryTokens.empty())
    return false;

  // Check first tokens (hopefully, the house numbers).
  if (houseNumberTokens.front() != queryTokens.front())
    return false;

  // Order of the remaining tokens is not significant, so sort both
  // tails and walk them in parallel.
  sort(houseNumberTokens.begin() + 1, houseNumberTokens.end());
  sort(queryTokens.begin() + 1, queryTokens.end());

  size_t i = 1, j = 1;
  while (i != houseNumberTokens.size() && j != queryTokens.size())
  {
    // Skip house number tokens that are not mentioned in the query.
    while (i != houseNumberTokens.size() && houseNumberTokens[i] < queryTokens[j])
      ++i;
    if (i == houseNumberTokens.size() || houseNumberTokens[i] != queryTokens[j])
      return false;
    ++i;
    ++j;
  }

  // NOTE(review): when the house number tokens are exhausted exactly
  // at the loop condition, the remaining query tokens are not checked
  // and the strings are reported as matching. This is kept as is;
  // confirm whether |j == queryTokens.size()| should be required.
  return true;
}
|
||||
} // namespace v2
|
||||
} // namespace search
|
44
search/v2/house_numbers_matcher.hpp
Normal file
44
search/v2/house_numbers_matcher.hpp
Normal file
|
@ -0,0 +1,44 @@
|
|||
#pragma once
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/string.hpp"
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace search
|
||||
{
|
||||
namespace v2
|
||||
{
|
||||
// This class splits a string representing a house number to a groups
|
||||
// of symbols from the same class (separators, digits or other
|
||||
// symbols, hope, letters).
|
||||
class HouseNumberTokenizer
|
||||
{
|
||||
public:
|
||||
enum class CharClass
|
||||
{
|
||||
Separator,
|
||||
Digit,
|
||||
Other,
|
||||
};
|
||||
|
||||
struct Token
|
||||
{
|
||||
Token() : m_klass(CharClass::Separator) {}
|
||||
Token(strings::UniString const & token, CharClass klass) : m_token(token), m_klass(klass) {}
|
||||
Token(strings::UniString && token, CharClass klass) : m_token(move(token)), m_klass(klass) {}
|
||||
|
||||
strings::UniString m_token;
|
||||
CharClass m_klass;
|
||||
};
|
||||
|
||||
static void Tokenize(strings::UniString const & s, vector<Token> & ts);
|
||||
};
|
||||
|
||||
// Splits house number by tokens, removes blanks and separators.
|
||||
void NormalizeHouseNumber(string const & s, vector<string> & ts);
|
||||
|
||||
// Returns true when |query| matches |houseNumber|.
|
||||
bool HouseNumbersMatch(string const & houseNumber, string const & query);
|
||||
} // namespace v2
|
||||
} // namespace search
|
Loading…
Add table
Reference in a new issue