[search] Implemented house numbers matching.

This commit is contained in:
Yuri Gorshenin 2015-12-04 18:33:19 +03:00 committed by Sergey Yershov
parent 4f735b9d3c
commit 91c13d4fec
5 changed files with 318 additions and 0 deletions

View file

@ -46,6 +46,7 @@ HEADERS += \
v2/features_layer_matcher.hpp \
v2/features_layer_path_finder.hpp \
v2/geocoder.hpp \
v2/house_numbers_matcher.hpp \
v2/search_model.hpp \
v2/search_query_v2.hpp \
v2/street_vicinity_loader.hpp \
@ -78,6 +79,7 @@ SOURCES += \
v2/features_layer_matcher.cpp \
v2/features_layer_path_finder.cpp \
v2/geocoder.cpp \
v2/house_numbers_matcher.cpp \
v2/search_model.cpp \
v2/search_query_v2.cpp \
v2/street_vicinity_loader.cpp \

View file

@ -0,0 +1,72 @@
#include "testing/testing.hpp"
#include "search/v2/house_numbers_matcher.hpp"
#include "std/vector.hpp"
#include "base/string_utils.hpp"
using namespace strings;
using namespace search::v2;
// Tokenizes |utf8s| with HouseNumberTokenizer and checks that the
// texts of the produced tokens, converted back to UTF-8, are equal to
// |expected|. Token classes are not checked.
void CheckTokenizer(string const & utf8s, vector<string> const & expected)
{
  vector<HouseNumberTokenizer::Token> tokens;
  UniString const input = MakeUniString(utf8s);
  HouseNumberTokenizer::Tokenize(input, tokens);

  vector<string> actual;
  for (size_t i = 0; i < tokens.size(); ++i)
    actual.push_back(ToUtf8(tokens[i].m_token));

  TEST_EQUAL(actual, expected, ());
}
// Normalizes |utf8s| with NormalizeHouseNumber() and checks that the
// resulting tokens, joined by single spaces, are equal to |expected|.
void CheckNormalizer(string const & utf8s, string const & expected)
{
  vector<string> tokens;
  NormalizeHouseNumber(utf8s, tokens);

  string actual;
  bool first = true;
  for (auto const & token : tokens)
  {
    // A space goes between tokens only, never before the first one.
    if (!first)
      actual.push_back(' ');
    actual.append(token);
    first = false;
  }

  TEST_EQUAL(actual, expected, ());
}
// Checks that the tokenizer splits a house number into runs of
// digits, separators and other symbols. Separator runs are expected
// to be kept as tokens of their own, and the case of letters is
// expected to be left untouched.
UNIT_TEST(HouseNumberTokenizer_Smoke)
{
// Digits directly followed by a letter give two tokens.
CheckTokenizer("123Б", {"123", "Б"});
// A separator between them is reported as a token too.
CheckTokenizer("123/Б", {"123", "/", "Б"});
// Adjacent separators (". ") are grouped into a single token, and
// "стр1" is split on the letter/digit border.
CheckTokenizer("123/34 корп. 4 стр1",
{"123", "/", "34", " ", "корп", ". ", "4", " ", "стр", "1"});
}
// Checks normalization of house numbers. Expected strings are the
// normalized tokens joined by single spaces (see CheckNormalizer);
// tokens describing a building part are expected to carry the "b."
// prefix.
UNIT_TEST(HouseNumberNormalizer_Smoke)
{
// Letters are lowercased, a short word is glued to the preceding number.
CheckNormalizer("123Б", "123б");
// Separators are dropped; "Литер А" turns into a building token.
CheckNormalizer("123/4 Литер А", "123 4 b.а");
// An abbreviated building word ("корп.") is recognized, and the
// number with the letter after it form a single building token.
CheckNormalizer("123а корп. 2б", "123а b.2б");
// A single letter followed by a number is read as a building word.
CheckNormalizer("123к4", "123 b.4");
// Here "к" is followed by a long word, so it stays with the number.
CheckNormalizer("123к Корпус 2", "123к b.2");
// Several building parts in a row give several building tokens.
CheckNormalizer("9 литер А корпус 2", "9 b.а b.2");
CheckNormalizer("39с79", "39 b.79");
// A short word and a number after the building word are both consumed.
CheckNormalizer("9 литер аб1", "9 b.аб1");
}
// Checks HouseNumbersMatch(houseNumber, query) on typical inputs.
UNIT_TEST(HouseNumbersMatcher_Smoke)
{
// The query may omit the building part of the house number...
TEST(HouseNumbersMatch("39с79", "39"), ());
// ...or spell it in a different way.
TEST(HouseNumbersMatch("39с79", "39 Строение 79"), ());
TEST(HouseNumbersMatch("39с79", "39 к. 79"), ());
TEST(HouseNumbersMatch("127а корпус 2", "127а"), ());
TEST(HouseNumbersMatch("127а корпус 2", "127а кор. 2"), ());
// Blanks between a number and a long word do not matter.
TEST(HouseNumbersMatch("1234abcdef", "1234 abcdef"), ());
TEST(HouseNumbersMatch("10/42 корпус 2", "10"), ());
// The first tokens must be equal: "127" is not "127а", "7" is not "6",
// and "42" is not the first token of "10/42".
TEST(!HouseNumbersMatch("127а корпус 2", "127"), ());
TEST(!HouseNumbersMatch("6 корпус 2", "7"), ());
TEST(!HouseNumbersMatch("10/42 корпус 2", "42"), ());
}

View file

@ -19,6 +19,7 @@ SOURCES += \
algos_tests.cpp \
categories_test.cpp \
house_detector_tests.cpp \
house_numbers_matcher_test.cpp \
interval_set_test.cpp \
keyword_lang_matcher_test.cpp \
keyword_matcher_test.cpp \

View file

@ -0,0 +1,199 @@
#include "search/v2/house_numbers_matcher.hpp"
#include "std/algorithm.hpp"
#include "std/iterator.hpp"
using namespace strings;
namespace search
{
namespace v2
{
namespace
{
// Classifies |c|: ASCII decimal digits are Digit, symbols from the
// list below (quote, slashes, brackets, comma, dot, blank, tab and
// number signs) are Separator, everything else is Other.
HouseNumberTokenizer::CharClass GetCharClass(UniChar c)
{
  using CharClass = HouseNumberTokenizer::CharClass;

  static UniString const kSeps = MakeUniString("\"\\/(),. \t№#");

  bool const isDigit = '0' <= c && c <= '9';
  if (isDigit)
    return CharClass::Digit;

  for (UniChar const sep : kSeps)
  {
    if (sep == c)
      return CharClass::Separator;
  }
  return CharClass::Other;
}
// A short word is a token of class Other that is at most three
// symbols long.
bool IsShortWord(HouseNumberTokenizer::Token const & t)
{
  if (t.m_klass != HouseNumberTokenizer::CharClass::Other)
    return false;
  return t.m_token.size() <= 3;
}
// A number is a token made of digits.
bool IsNumber(HouseNumberTokenizer::Token const & t)
{
  auto const klass = t.m_klass;
  return HouseNumberTokenizer::CharClass::Digit == klass;
}
// Returns true when |t| is a number (see IsNumber) or a short word
// (see IsShortWord).
bool IsNumberOrShortWord(HouseNumberTokenizer::Token const & t)
{
  if (IsNumber(t))
    return true;
  return IsShortWord(t);
}
// Checks whether the tokens starting at position |i| describe a
// building part, and returns the number of tokens in the group (2 or
// 3), or 0 when there is no such group at |i|.
//
// The group consists of:
// * a token of class Other for which StartsWith(synonym, token) holds
//   for one of the synonyms below, i.e. some (possibly abbreviated)
//   way of writing "корпус", "building" and so on;
// * a number or a short word;
// * optionally, one more number or short word, taken only when its
//   class differs from the class of the previous token.
size_t IsBuilding(vector<HouseNumberTokenizer::Token> const & ts, size_t i)
{
  static UniString kSynonyms[] = {MakeUniString("building"), MakeUniString("unit"),
                                  MakeUniString("block"), MakeUniString("корпус"),
                                  MakeUniString("литер"), MakeUniString("строение"),
                                  MakeUniString("блок")};

  auto const isSynonymPrefix = [](UniString const & word)
  {
    for (UniString const & synonym : kSynonyms)
    {
      if (StartsWith(synonym, word))
        return true;
    }
    return false;
  };

  if (i >= ts.size())
    return 0;

  auto const & head = ts[i];
  if (head.m_klass != HouseNumberTokenizer::CharClass::Other)
    return 0;
  if (!isSynonymPrefix(head.m_token))
    return 0;

  // A lone "корпус" or "литер" with nothing suitable after it makes
  // no sense.
  size_t const second = i + 1;
  if (second >= ts.size() || !IsNumberOrShortWord(ts[second]))
    return 0;

  // The building word and the number or short word after it.
  size_t count = 2;

  // One more number or short word, if possible.
  size_t const third = i + 2;
  if (third < ts.size() && IsNumberOrShortWord(ts[third]) &&
      ts[third].m_klass != ts[second].m_klass)
  {
    ++count;
  }
  return count;
}
// Merges tokens from |ts| into larger units and appends them to |rs|:
// * a number followed by a short word that does not start a building
//   group (see IsBuilding) is glued with that word;
// * a building group is replaced by "b." followed by the texts of all
//   its tokens except the building word itself;
// * any other token of class Other is copied as is.
// |ts| must not contain separators.
void MergeTokens(vector<HouseNumberTokenizer::Token> const & ts, vector<UniString> & rs)
{
  using CharClass = HouseNumberTokenizer::CharClass;

  size_t const count = ts.size();
  for (size_t pos = 0; pos < count;)
  {
    auto const & current = ts[pos];

    if (current.m_klass == CharClass::Separator)
    {
      ASSERT(false, ("Seps can't be merged."));
      ++pos;
      continue;
    }

    if (current.m_klass == CharClass::Digit)
    {
      UniString merged = current.m_token;
      ++pos;
      // Cases like "123 б" or "9PQ".
      if (pos < count && IsShortWord(ts[pos]) && IsBuilding(ts, pos) == 0)
      {
        merged.append(ts[pos].m_token.begin(), ts[pos].m_token.end());
        ++pos;
      }
      rs.push_back(move(merged));
      continue;
    }

    // CharClass::Other: either a building group or a standalone word.
    size_t const numTokens = IsBuilding(ts, pos);
    if (numTokens == 0)
    {
      rs.push_back(current.m_token);
      ++pos;
      continue;
    }

    UniString merged = MakeUniString("b.");
    // The building word itself (offset 0) is replaced by the prefix.
    for (size_t offset = 1; offset < numTokens; ++offset)
    {
      auto const & part = ts[pos + offset].m_token;
      merged.append(part.begin(), part.end());
    }
    rs.push_back(move(merged));
    pos += numTokens;
  }
}
} // namespace
// Splits |s| into maximal runs of symbols of the same class (see
// GetCharClass) and appends one token per run to |ts|.
// static
void HouseNumberTokenizer::Tokenize(UniString const & s, vector<Token> & ts)
{
  for (size_t pos = 0; pos < s.size();)
  {
    CharClass const klass = GetCharClass(s[pos]);

    // The first symbol always belongs to its own run, then the run is
    // extended while the class stays the same.
    UniString run;
    do
    {
      run.push_back(s[pos]);
      ++pos;
    } while (pos < s.size() && GetCharClass(s[pos]) == klass);

    ts.emplace_back(move(run), klass);
  }
}
// Lowercases |s|, tokenizes it, throws away separator tokens, merges
// the remaining tokens (see MergeTokens) and appends the results,
// converted to UTF-8, to |ts|.
void NormalizeHouseNumber(string const & s, vector<string> & ts)
{
  UniString const lowered = MakeLowerCase(MakeUniString(s));

  vector<HouseNumberTokenizer::Token> allTokens;
  HouseNumberTokenizer::Tokenize(lowered, allTokens);

  // Keep everything except separators, preserving the order.
  vector<HouseNumberTokenizer::Token> meaningful;
  for (auto const & token : allTokens)
  {
    if (token.m_klass != HouseNumberTokenizer::CharClass::Separator)
      meaningful.push_back(token);
  }

  vector<UniString> merged;
  MergeTokens(meaningful, merged);

  for (UniString const & token : merged)
    ts.push_back(ToUtf8(token));
}
// Returns true when |query| matches to |houseNumber|.
//
// Equal strings always match. Otherwise both strings are normalized
// with NormalizeHouseNumber(); their first tokens (hopefully, the
// house numbers proper) must be equal, and the other tokens of
// |query| are looked up, order ignored, among the other tokens of
// |houseNumber|. Returns false when either string normalizes to no
// tokens at all.
bool HouseNumbersMatch(string const & houseNumber, string const & query)
{
  if (houseNumber == query)
    return true;

  vector<string> houseNumberTokens;
  NormalizeHouseNumber(houseNumber, houseNumberTokens);

  vector<string> queryTokens;
  NormalizeHouseNumber(query, queryTokens);

  // The token lists, not the raw strings, must be checked here: a
  // non-empty string made of separators only (e.g. "/") normalizes to
  // zero tokens, and front() and begin() + 1 below are undefined for
  // an empty vector.
  if (houseNumberTokens.empty() || queryTokens.empty())
    return false;

  // Check first tokens (hope, house numbers).
  if (houseNumberTokens.front() != queryTokens.front())
    return false;

  // The order of the other tokens is not important, so sort both tails
  // and walk them in parallel.
  sort(houseNumberTokens.begin() + 1, houseNumberTokens.end());
  sort(queryTokens.begin() + 1, queryTokens.end());

  size_t i = 1, j = 1;
  while (i != houseNumberTokens.size() && j != queryTokens.size())
  {
    // Skip house number tokens that are absent from the query.
    while (i != houseNumberTokens.size() && houseNumberTokens[i] < queryTokens[j])
      ++i;
    if (i == houseNumberTokens.size() || houseNumberTokens[i] != queryTokens[j])
      return false;
    ++i;
    ++j;
  }

  // NOTE(review): query tokens that remain after all house number
  // tokens have been consumed are not checked, so such a query is still
  // accepted - confirm this is intended.
  return true;
}
} // namespace v2
} // namespace search

View file

@ -0,0 +1,44 @@
#pragma once
#include "base/string_utils.hpp"
#include "std/string.hpp"
#include "std/vector.hpp"
namespace search
{
namespace v2
{
// Splits a string representing a house number into groups of
// consecutive symbols of the same class: separators, digits, or other
// symbols (hopefully, letters).
class HouseNumberTokenizer
{
public:
  // Class of a symbol and of a token made of such symbols.
  enum class CharClass
  {
    Separator,
    Digit,
    Other,
  };

  // A group of symbols together with their common class. A
  // default-constructed token is an empty separator.
  struct Token
  {
    Token() = default;
    Token(strings::UniString const & token, CharClass klass) : m_token(token), m_klass(klass) {}
    Token(strings::UniString && token, CharClass klass) : m_token(move(token)), m_klass(klass) {}

    strings::UniString m_token;
    CharClass m_klass = CharClass::Separator;
  };

  // Appends tokens of |s| to |ts|.
  static void Tokenize(strings::UniString const & s, vector<Token> & ts);
};
// Splits house number |s| by tokens, removes blanks and separators.
// The string is lowercased first; the remaining tokens are merged (a
// number with a short word following it, a building word with the
// number and/or letter following it into a single token prefixed
// with "b.") and appended, UTF-8 encoded, to |ts|. |ts| is not
// cleared beforehand.
void NormalizeHouseNumber(string const & s, vector<string> & ts);
// Returns true when |query| matches to |houseNumber|: either the
// strings are equal, or, after normalization with
// NormalizeHouseNumber(), their first tokens are equal and the other
// tokens of |query| are found, in any order, among the other tokens
// of |houseNumber|.
// NOTE(review): query tokens that remain after all house number
// tokens have been consumed are currently not checked - confirm this
// is intended.
bool HouseNumbersMatch(string const & houseNumber, string const & query);
} // namespace v2
} // namespace search