[search] Fixed greedy street matching.

This commit is contained in:
Yuri Gorshenin 2016-01-29 15:34:04 +03:00 committed by Sergey Yershov
parent 86f604992b
commit ca90764d85
13 changed files with 116 additions and 49 deletions

View file

@ -100,6 +100,12 @@ public:
return *this;
}
// Appends all elements of |v| to this vector. |v| may have a different
// inline capacity |M|; the work is delegated to the iterator-range
// overload of append().
template <size_t M>
void append(buffer_vector<value_type, M> const & v)
{
  auto const first = v.begin();
  auto const last = v.end();
  append(first, last);
}
template <typename IterT>
void append(IterT beg, IterT end)
{

View file

@ -67,4 +67,8 @@ bool IsHouseNumber(string const & s)
return (!s.empty() && IsDigit(s[0]));
}
// Returns true when |s| is non-empty and its first character passes
// IsDigit().
bool IsHouseNumber(strings::UniString const & s)
{
  if (s.empty())
    return false;
  return IsDigit(s[0]);
}
}

View file

@ -29,8 +29,10 @@ namespace feature
return str;
}
bool IsDigit(int c);
bool IsNumber(strings::UniString const & s);
// Returns true when |s| is non-empty and its first character passes
// IsDigit(). Both overloads perform the same check.
bool IsHouseNumber(string const & s);
bool IsHouseNumber(strings::UniString const & s);
bool IsHouseNumberDeepCheck(strings::UniString const & s);
}

View file

@ -1,7 +1,7 @@
#include "search_string_utils.hpp"
#include "std/set.hpp"
#include "std/transform_iterator.hpp"
#include "std/unordered_set.hpp"
#include "base/macros.hpp"

View file

@ -9,6 +9,22 @@
using namespace strings;
using namespace search::v2;
namespace
{
// UTF-8 convenience wrapper for tests: converts |s| to a UniString,
// runs search::v2::NormalizeHouseNumber() on it and appends every
// resulting token, converted back to UTF-8, to |ts|.
void NormalizeHouseNumber(string const & s, vector<string> & ts)
{
  vector<strings::UniString> normalized;
  search::v2::NormalizeHouseNumber(strings::MakeUniString(s), normalized);

  for (size_t i = 0; i < normalized.size(); ++i)
    ts.push_back(strings::ToUtf8(normalized[i]));
}
// UTF-8 convenience wrapper for tests: converts both arguments to
// UniString and forwards them to search::v2::HouseNumbersMatch().
bool HouseNumbersMatch(string const & houseNumber, string const & query)
{
  strings::UniString const uniHouseNumber = strings::MakeUniString(houseNumber);
  strings::UniString const uniQuery = strings::MakeUniString(query);
  return search::v2::HouseNumbersMatch(uniHouseNumber, uniQuery);
}
void CheckTokenizer(string const & utf8s, vector<string> const & expected)
{
UniString utf32s = MakeUniString(utf8s);
@ -35,6 +51,7 @@ void CheckNormalizer(string const & utf8s, string const & expected)
}
TEST_EQUAL(actual, expected, ());
}
} // namespace
UNIT_TEST(HouseNumberTokenizer_Smoke)
{

View file

@ -25,7 +25,7 @@ string DebugPrint(FeaturesLayer const & layer)
ostringstream os;
os << "FeaturesLayer [ size of m_sortedFeatures: "
<< (layer.m_sortedFeatures ? layer.m_sortedFeatures->size() : 0)
<< ", m_subQuery: " << layer.m_subQuery << ", m_startToken: " << layer.m_startToken
<< ", m_subQuery: " << DebugPrint(layer.m_subQuery) << ", m_startToken: " << layer.m_startToken
<< ", m_endToken: " << layer.m_endToken << ", m_type: " << DebugPrint(layer.m_type) << " ]";
return os.str();
}

View file

@ -2,6 +2,8 @@
#include "search/v2/search_model.hpp"
#include "base/string_utils.hpp"
#include "std/vector.hpp"
namespace search
@ -20,7 +22,7 @@ struct FeaturesLayer
// Non-owning ptr to a sorted vector of features.
vector<uint32_t> const * m_sortedFeatures;
string m_subQuery;
strings::UniString m_subQuery;
size_t m_startToken;
size_t m_endToken;

View file

@ -24,6 +24,7 @@
#include "base/logging.hpp"
#include "base/macros.hpp"
#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
#include "std/bind.hpp"
@ -151,7 +152,7 @@ private:
// |buildings| doesn't contain buildings matching by house number,
// so following code reads buildings in POIs vicinities and checks
// house numbers.
vector<string> queryTokens;
vector<strings::UniString> queryTokens;
NormalizeHouseNumber(parent.m_subQuery, queryTokens);
if (queryTokens.empty())
return;
@ -168,7 +169,7 @@ private:
{
if (building.m_id.m_mwmId != m_context->m_id || building.m_distanceMeters > kBuildingRadiusMeters)
continue;
if (HouseNumbersMatch(building.m_name, queryTokens))
if (HouseNumbersMatch(strings::MakeUniString(building.m_name), queryTokens))
fn(pois[i], building.m_id.m_index);
}
}
@ -240,7 +241,7 @@ private:
return;
}
vector<string> queryTokens;
vector<strings::UniString> queryTokens;
NormalizeHouseNumber(child.m_subQuery, queryTokens);
auto const & checker = ftypes::IsBuildingChecker::Instance();
@ -272,7 +273,7 @@ private:
if (!child.m_hasDelayedFeatures)
return false;
string const houseNumber = feature.GetHouseNumber();
strings::UniString const houseNumber(strings::MakeUniString(feature.GetHouseNumber()));
if (!feature::IsHouseNumber(houseNumber))
return false;
return HouseNumbersMatch(houseNumber, queryTokens);

View file

@ -45,9 +45,9 @@ uint64_t CalcBottomUpPassCost(vector<FeaturesLayer const *> const & layers)
return CalcPassCost(layers.begin(), layers.end());
}
// Returns true when NormalizeHouseNumber() yields at least one token
// for |query| and the first of these tokens passes
// feature::IsHouseNumber().
bool LooksLikeHouseNumber(string const & query)
bool LooksLikeHouseNumber(strings::UniString const & query)
{
vector<string> tokens;
vector<strings::UniString> tokens;
NormalizeHouseNumber(query, tokens);
return !tokens.empty() && feature::IsHouseNumber(tokens.front());
}

View file

@ -56,6 +56,10 @@ size_t constexpr kMaxNumCities = 5;
size_t constexpr kMaxNumStates = 5;
size_t constexpr kMaxNumVillages = 5;
size_t constexpr kMaxNumCountries = 5;
// This constant limits the number of localities that will be
// extracted from the World map. Villages are not counted here as
// they're not included in the World map.
size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCountries;
// List of countries we're supporting search by state. Elements of the
@ -63,6 +67,8 @@ size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCoun
string const kCountriesWithStates[] = {"US_", "Canada_"};
double constexpr kComparePoints = MercatorBounds::GetCellID2PointAbsEpsilon();
strings::UniString const kUniSpace(strings::MakeUniString(" "));
template <typename T>
struct Id
{
@ -149,19 +155,19 @@ private:
};
void JoinQueryTokens(SearchQueryParams const & params, size_t curToken, size_t endToken,
string const & sep, string & res)
strings::UniString const & sep, strings::UniString & res)
{
ASSERT_LESS_OR_EQUAL(curToken, endToken, ());
for (size_t i = curToken; i < endToken; ++i)
{
if (i < params.m_tokens.size())
{
res.append(strings::ToUtf8(params.m_tokens[i].front()));
res.append(params.m_tokens[i].front());
}
else
{
ASSERT_EQUAL(i, params.m_tokens.size(), ());
res.append(strings::ToUtf8(params.m_prefixTokens.front()));
res.append(params.m_prefixTokens.front());
}
if (i + 1 != endToken)
@ -375,7 +381,11 @@ void Geocoder::SetParams(Params const & params)
for (size_t i = 0; i < m_numTokens; ++i)
{
auto & synonyms = m_params.GetTokens(i);
if (!synonyms.empty() && IsStreetSynonym(synonyms.front()))
ASSERT(!synonyms.empty(), ());
auto const & token = synonyms.front();
if (IsStreetSynonym(token))
{
auto b = synonyms.begin();
auto e = synonyms.end();
@ -971,24 +981,35 @@ void Geocoder::LimitedSearch(coding::CompressedBitVector const * filter, size_t
void Geocoder::GreedilyMatchStreets()
{
ASSERT(m_layers.empty(), ());
m_layers.emplace_back();
MY_SCOPE_GUARD(cleanupGuard, bind(&vector<FeaturesLayer>::pop_back, &m_layers));
for (size_t startToken = 0; startToken < m_numTokens; ++startToken)
{
if (m_usedTokens[startToken])
continue;
unique_ptr<coding::CompressedBitVector> buffer;
// Here we try to match as many tokens as possible while
// intersection is a non-empty bit vector of streets. All tokens
// that are synonyms to streets are ignored. Moreover, each time
// a token that looks like a beginning of a house number is met,
// we try to use current intersection of tokens as a street layer
// and try to match buildings or pois.
unique_ptr<coding::CompressedBitVector> allFeatures;
size_t curToken = startToken;
for (; curToken < m_numTokens && !m_usedTokens[curToken]; ++curToken)
{
if (IsStreetSynonym(m_params.GetTokens(curToken).front()))
auto const & token = m_params.GetTokens(curToken).front();
if (IsStreetSynonym(token))
continue;
if (feature::IsHouseNumber(token) &&
!coding::CompressedBitVector::IsEmpty(allFeatures))
{
if (m_filter.NeedToFilter(*allFeatures))
allFeatures = m_filter.Filter(*allFeatures);
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
}
unique_ptr<coding::CompressedBitVector> buffer;
if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures))
buffer = coding::CompressedBitVector::Intersect(*m_streets, *m_addressFeatures[curToken]);
else
@ -1002,27 +1023,41 @@ void Geocoder::GreedilyMatchStreets()
if (coding::CompressedBitVector::IsEmpty(allFeatures))
continue;
if (m_filter.NeedToFilter(*allFeatures))
allFeatures = m_filter.Filter(*allFeatures);
auto & layer = m_layers.back();
layer.Clear();
layer.m_type = SearchModel::SEARCH_TYPE_STREET;
layer.m_startToken = startToken;
layer.m_endToken = curToken;
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */,
layer.m_subQuery);
vector<uint32_t> sortedFeatures;
coding::CompressedBitVectorEnumerator::ForEach(*allFeatures,
MakeBackInsertFunctor(sortedFeatures));
layer.m_sortedFeatures = &sortedFeatures;
ScopedMarkTokens mark(m_usedTokens, startToken, curToken);
MatchPOIsAndBuildings(0 /* curToken */);
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
}
}
void Geocoder::CreateStreetsLayerAndMatchLowerLayers(
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features)
{
ASSERT(m_layers.empty(), ());
if (coding::CompressedBitVector::IsEmpty(features))
return;
m_layers.emplace_back();
MY_SCOPE_GUARD(cleanupGuard, bind(&vector<FeaturesLayer>::pop_back, &m_layers));
auto & layer = m_layers.back();
layer.Clear();
layer.m_type = SearchModel::SEARCH_TYPE_STREET;
layer.m_startToken = startToken;
layer.m_endToken = endToken;
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */,
layer.m_subQuery);
vector<uint32_t> sortedFeatures;
sortedFeatures.reserve(features->PopCount());
coding::CompressedBitVectorEnumerator::ForEach(*features, MakeBackInsertFunctor(sortedFeatures));
layer.m_sortedFeatures = &sortedFeatures;
ScopedMarkTokens mark(m_usedTokens, startToken, endToken);
MatchPOIsAndBuildings(0 /* curToken */);
}
void Geocoder::MatchPOIsAndBuildings(size_t curToken)
{
// Skip used tokens.
@ -1063,7 +1098,7 @@ void Geocoder::MatchPOIsAndBuildings(size_t curToken)
layer.Clear();
layer.m_startToken = curToken;
layer.m_endToken = curToken + n;
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */,
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */,
layer.m_subQuery);
}

View file

@ -186,6 +186,9 @@ private:
// then performs geocoding in street vicinities.
void GreedilyMatchStreets();
// Creates a street layer from |features| and the query tokens between
// |startToken| and |endToken|, then calls MatchPOIsAndBuildings() to
// match lower layers. Expects |m_layers| to be empty and does nothing
// when |features| is empty.
void CreateStreetsLayerAndMatchLowerLayers(
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features);
// Tries to find all paths in a search tree, where each edge is
// marked with some substring of the query tokens. These paths are
// called "layer sequence" and current path is stored in |m_layers|.

View file

@ -153,23 +153,19 @@ void HouseNumberTokenizer::Tokenize(UniString const & s, vector<Token> & ts)
}
}
// Lower-cases |s|, splits it with HouseNumberTokenizer and passes the
// tokens through MergeTokens(); the merged tokens are reported via
// |ts|.
void NormalizeHouseNumber(string const & s, vector<string> & ts)
void NormalizeHouseNumber(strings::UniString const & s, vector<strings::UniString> & ts)
{
vector<HouseNumberTokenizer::Token> tokens;
HouseNumberTokenizer::Tokenize(MakeLowerCase(MakeUniString(s)), tokens);
vector<UniString> mergedTokens;
MergeTokens(tokens, mergedTokens);
transform(mergedTokens.begin(), mergedTokens.end(), back_inserter(ts), &ToUtf8);
HouseNumberTokenizer::Tokenize(MakeLowerCase(s), tokens);
MergeTokens(tokens, ts);
}
bool HouseNumbersMatch(string const & houseNumber, string const & query)
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query)
{
if (houseNumber == query)
return true;
vector<string> queryTokens;
vector<strings::UniString> queryTokens;
NormalizeHouseNumber(query, queryTokens);
if (!queryTokens.empty())
@ -178,14 +174,14 @@ bool HouseNumbersMatch(string const & houseNumber, string const & query)
return HouseNumbersMatch(houseNumber, queryTokens);
}
bool HouseNumbersMatch(string const & houseNumber, vector<string> const & queryTokens)
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<strings::UniString> const & queryTokens)
{
if (houseNumber.empty() || queryTokens.empty())
return false;
if (queryTokens[0][0] != houseNumber[0])
return false;
vector<string> houseNumberTokens;
vector<strings::UniString> houseNumberTokens;
NormalizeHouseNumber(houseNumber, houseNumberTokens);
if (houseNumberTokens.empty())

View file

@ -38,12 +38,13 @@ public:
};
// Splits house number into tokens, removes blanks and separators.
void NormalizeHouseNumber(string const & s, vector<string> & ts);
void NormalizeHouseNumber(strings::UniString const & s, vector<strings::UniString> & ts);
// Returns true when |query| matches |houseNumber|.
bool HouseNumbersMatch(string const & houseNumber, string const & query);
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query);
// Returns true when |queryTokens| match |houseNumber|.
bool HouseNumbersMatch(string const & houseNumber, vector<string> const & queryTokens);
bool HouseNumbersMatch(strings::UniString const & houseNumber,
vector<strings::UniString> const & queryTokens);
} // namespace v2
} // namespace search