forked from organicmaps/organicmaps-tmp
[search] Fixed greedy street matching.
This commit is contained in:
parent
86f604992b
commit
ca90764d85
13 changed files with 116 additions and 49 deletions
|
@ -100,6 +100,12 @@ public:
|
|||
return *this;
|
||||
}
|
||||
|
||||
template <size_t M>
|
||||
void append(buffer_vector<value_type, M> const & v)
|
||||
{
|
||||
append(v.begin(), v.end());
|
||||
}
|
||||
|
||||
template <typename IterT>
|
||||
void append(IterT beg, IterT end)
|
||||
{
|
||||
|
|
|
@ -67,4 +67,8 @@ bool IsHouseNumber(string const & s)
|
|||
return (!s.empty() && IsDigit(s[0]));
|
||||
}
|
||||
|
||||
// Cheap house-number test for an already decoded string: returns true when
// |s| is non-empty and its first character is a digit (per IsDigit).
// Mirrors the std::string overload defined just above.
bool IsHouseNumber(strings::UniString const & s)
{
  if (s.empty())
    return false;

  return IsDigit(s[0]);
}
|
||||
}
|
||||
|
|
|
@ -29,8 +29,10 @@ namespace feature
|
|||
return str;
|
||||
}
|
||||
|
||||
bool IsDigit(int c);
|
||||
bool IsNumber(strings::UniString const & s);
|
||||
|
||||
bool IsHouseNumber(string const & s);
|
||||
bool IsHouseNumber(strings::UniString const & s);
|
||||
bool IsHouseNumberDeepCheck(strings::UniString const & s);
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#include "search_string_utils.hpp"
|
||||
|
||||
#include "std/set.hpp"
|
||||
#include "std/transform_iterator.hpp"
|
||||
#include "std/unordered_set.hpp"
|
||||
|
||||
#include "base/macros.hpp"
|
||||
|
||||
|
|
|
@ -9,6 +9,22 @@
|
|||
using namespace strings;
|
||||
using namespace search::v2;
|
||||
|
||||
namespace
|
||||
{
|
||||
// Test-only UTF-8 adapter around search::v2::NormalizeHouseNumber():
// converts |s| to UniString, normalizes it, and appends every resulting
// token to |ts| re-encoded as UTF-8. |ts| is not cleared beforehand.
void NormalizeHouseNumber(string const & s, vector<string> & ts)
{
  strings::UniString const uniInput = strings::MakeUniString(s);

  vector<strings::UniString> uniTokens;
  search::v2::NormalizeHouseNumber(uniInput, uniTokens);

  for (auto const & uniToken : uniTokens)
    ts.emplace_back(strings::ToUtf8(uniToken));
}
|
||||
|
||||
// Test-only UTF-8 adapter around search::v2::HouseNumbersMatch():
// converts both arguments to UniString and forwards the call.
bool HouseNumbersMatch(string const & houseNumber, string const & query)
{
  strings::UniString const uniHouseNumber = strings::MakeUniString(houseNumber);
  strings::UniString const uniQuery = strings::MakeUniString(query);
  return search::v2::HouseNumbersMatch(uniHouseNumber, uniQuery);
}
|
||||
|
||||
void CheckTokenizer(string const & utf8s, vector<string> const & expected)
|
||||
{
|
||||
UniString utf32s = MakeUniString(utf8s);
|
||||
|
@ -35,6 +51,7 @@ void CheckNormalizer(string const & utf8s, string const & expected)
|
|||
}
|
||||
TEST_EQUAL(actual, expected, ());
|
||||
}
|
||||
} // namespace
|
||||
|
||||
UNIT_TEST(HouseNumberTokenizer_Smoke)
|
||||
{
|
||||
|
|
|
@ -25,7 +25,7 @@ string DebugPrint(FeaturesLayer const & layer)
|
|||
ostringstream os;
|
||||
os << "FeaturesLayer [ size of m_sortedFeatures: "
|
||||
<< (layer.m_sortedFeatures ? layer.m_sortedFeatures->size() : 0)
|
||||
<< ", m_subQuery: " << layer.m_subQuery << ", m_startToken: " << layer.m_startToken
|
||||
<< ", m_subQuery: " << DebugPrint(layer.m_subQuery) << ", m_startToken: " << layer.m_startToken
|
||||
<< ", m_endToken: " << layer.m_endToken << ", m_type: " << DebugPrint(layer.m_type) << " ]";
|
||||
return os.str();
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
#include "search/v2/search_model.hpp"
|
||||
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/vector.hpp"
|
||||
|
||||
namespace search
|
||||
|
@ -20,7 +22,7 @@ struct FeaturesLayer
|
|||
// Non-owning ptr to a sorted vector of features.
|
||||
vector<uint32_t> const * m_sortedFeatures;
|
||||
|
||||
string m_subQuery;
|
||||
strings::UniString m_subQuery;
|
||||
|
||||
size_t m_startToken;
|
||||
size_t m_endToken;
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "base/logging.hpp"
|
||||
#include "base/macros.hpp"
|
||||
#include "base/stl_helpers.hpp"
|
||||
#include "base/string_utils.hpp"
|
||||
|
||||
#include "std/algorithm.hpp"
|
||||
#include "std/bind.hpp"
|
||||
|
@ -151,7 +152,7 @@ private:
|
|||
// |buildings| doesn't contain buildings matching by house number,
|
||||
// so following code reads buildings in POIs vicinities and checks
|
||||
// house numbers.
|
||||
vector<string> queryTokens;
|
||||
vector<strings::UniString> queryTokens;
|
||||
NormalizeHouseNumber(parent.m_subQuery, queryTokens);
|
||||
if (queryTokens.empty())
|
||||
return;
|
||||
|
@ -168,7 +169,7 @@ private:
|
|||
{
|
||||
if (building.m_id.m_mwmId != m_context->m_id || building.m_distanceMeters > kBuildingRadiusMeters)
|
||||
continue;
|
||||
if (HouseNumbersMatch(building.m_name, queryTokens))
|
||||
if (HouseNumbersMatch(strings::MakeUniString(building.m_name), queryTokens))
|
||||
fn(pois[i], building.m_id.m_index);
|
||||
}
|
||||
}
|
||||
|
@ -240,7 +241,7 @@ private:
|
|||
return;
|
||||
}
|
||||
|
||||
vector<string> queryTokens;
|
||||
vector<strings::UniString> queryTokens;
|
||||
NormalizeHouseNumber(child.m_subQuery, queryTokens);
|
||||
|
||||
auto const & checker = ftypes::IsBuildingChecker::Instance();
|
||||
|
@ -272,7 +273,7 @@ private:
|
|||
if (!child.m_hasDelayedFeatures)
|
||||
return false;
|
||||
|
||||
string const houseNumber = feature.GetHouseNumber();
|
||||
strings::UniString const houseNumber(strings::MakeUniString(feature.GetHouseNumber()));
|
||||
if (!feature::IsHouseNumber(houseNumber))
|
||||
return false;
|
||||
return HouseNumbersMatch(houseNumber, queryTokens);
|
||||
|
|
|
@ -45,9 +45,9 @@ uint64_t CalcBottomUpPassCost(vector<FeaturesLayer const *> const & layers)
|
|||
return CalcPassCost(layers.begin(), layers.end());
|
||||
}
|
||||
|
||||
bool LooksLikeHouseNumber(string const & query)
|
||||
bool LooksLikeHouseNumber(strings::UniString const & query)
|
||||
{
|
||||
vector<string> tokens;
|
||||
vector<strings::UniString> tokens;
|
||||
NormalizeHouseNumber(query, tokens);
|
||||
return !tokens.empty() && feature::IsHouseNumber(tokens.front());
|
||||
}
|
||||
|
|
|
@ -56,6 +56,10 @@ size_t constexpr kMaxNumCities = 5;
|
|||
size_t constexpr kMaxNumStates = 5;
|
||||
size_t constexpr kMaxNumVillages = 5;
|
||||
size_t constexpr kMaxNumCountries = 5;
|
||||
|
||||
// This constant limits number of localities that will be extracted
|
||||
// from World map. Villages are not counted here as they're not
|
||||
// included into World map.
|
||||
size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCountries;
|
||||
|
||||
// List of countries we're supporting search by state. Elements of the
|
||||
|
@ -63,6 +67,8 @@ size_t constexpr kMaxNumLocalities = kMaxNumCities + kMaxNumStates + kMaxNumCoun
|
|||
string const kCountriesWithStates[] = {"US_", "Canada_"};
|
||||
double constexpr kComparePoints = MercatorBounds::GetCellID2PointAbsEpsilon();
|
||||
|
||||
strings::UniString const kUniSpace(strings::MakeUniString(" "));
|
||||
|
||||
template <typename T>
|
||||
struct Id
|
||||
{
|
||||
|
@ -149,19 +155,19 @@ private:
|
|||
};
|
||||
|
||||
void JoinQueryTokens(SearchQueryParams const & params, size_t curToken, size_t endToken,
|
||||
string const & sep, string & res)
|
||||
strings::UniString const & sep, strings::UniString & res)
|
||||
{
|
||||
ASSERT_LESS_OR_EQUAL(curToken, endToken, ());
|
||||
for (size_t i = curToken; i < endToken; ++i)
|
||||
{
|
||||
if (i < params.m_tokens.size())
|
||||
{
|
||||
res.append(strings::ToUtf8(params.m_tokens[i].front()));
|
||||
res.append(params.m_tokens[i].front());
|
||||
}
|
||||
else
|
||||
{
|
||||
ASSERT_EQUAL(i, params.m_tokens.size(), ());
|
||||
res.append(strings::ToUtf8(params.m_prefixTokens.front()));
|
||||
res.append(params.m_prefixTokens.front());
|
||||
}
|
||||
|
||||
if (i + 1 != endToken)
|
||||
|
@ -375,7 +381,11 @@ void Geocoder::SetParams(Params const & params)
|
|||
for (size_t i = 0; i < m_numTokens; ++i)
|
||||
{
|
||||
auto & synonyms = m_params.GetTokens(i);
|
||||
if (!synonyms.empty() && IsStreetSynonym(synonyms.front()))
|
||||
ASSERT(!synonyms.empty(), ());
|
||||
|
||||
auto const & token = synonyms.front();
|
||||
|
||||
if (IsStreetSynonym(token))
|
||||
{
|
||||
auto b = synonyms.begin();
|
||||
auto e = synonyms.end();
|
||||
|
@ -971,24 +981,35 @@ void Geocoder::LimitedSearch(coding::CompressedBitVector const * filter, size_t
|
|||
|
||||
void Geocoder::GreedilyMatchStreets()
|
||||
{
|
||||
ASSERT(m_layers.empty(), ());
|
||||
m_layers.emplace_back();
|
||||
|
||||
MY_SCOPE_GUARD(cleanupGuard, bind(&vector<FeaturesLayer>::pop_back, &m_layers));
|
||||
for (size_t startToken = 0; startToken < m_numTokens; ++startToken)
|
||||
{
|
||||
if (m_usedTokens[startToken])
|
||||
continue;
|
||||
|
||||
unique_ptr<coding::CompressedBitVector> buffer;
|
||||
// Here we try to match as many tokens as possible while
|
||||
// intersection is a non-empty bit vector of streets. All tokens
|
||||
// that are synonyms to streets are ignored. Moreover, each time
|
||||
// a token that looks like a beginning of a house number is met,
|
||||
// we try to use current intersection of tokens as a street layer
|
||||
// and try to match buildings or pois.
|
||||
unique_ptr<coding::CompressedBitVector> allFeatures;
|
||||
|
||||
size_t curToken = startToken;
|
||||
for (; curToken < m_numTokens && !m_usedTokens[curToken]; ++curToken)
|
||||
{
|
||||
if (IsStreetSynonym(m_params.GetTokens(curToken).front()))
|
||||
auto const & token = m_params.GetTokens(curToken).front();
|
||||
if (IsStreetSynonym(token))
|
||||
continue;
|
||||
|
||||
if (feature::IsHouseNumber(token) &&
|
||||
!coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
{
|
||||
if (m_filter.NeedToFilter(*allFeatures))
|
||||
allFeatures = m_filter.Filter(*allFeatures);
|
||||
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
|
||||
}
|
||||
|
||||
unique_ptr<coding::CompressedBitVector> buffer;
|
||||
if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
buffer = coding::CompressedBitVector::Intersect(*m_streets, *m_addressFeatures[curToken]);
|
||||
else
|
||||
|
@ -1002,27 +1023,41 @@ void Geocoder::GreedilyMatchStreets()
|
|||
|
||||
if (coding::CompressedBitVector::IsEmpty(allFeatures))
|
||||
continue;
|
||||
|
||||
if (m_filter.NeedToFilter(*allFeatures))
|
||||
allFeatures = m_filter.Filter(*allFeatures);
|
||||
|
||||
auto & layer = m_layers.back();
|
||||
layer.Clear();
|
||||
layer.m_type = SearchModel::SEARCH_TYPE_STREET;
|
||||
layer.m_startToken = startToken;
|
||||
layer.m_endToken = curToken;
|
||||
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */,
|
||||
layer.m_subQuery);
|
||||
vector<uint32_t> sortedFeatures;
|
||||
coding::CompressedBitVectorEnumerator::ForEach(*allFeatures,
|
||||
MakeBackInsertFunctor(sortedFeatures));
|
||||
layer.m_sortedFeatures = &sortedFeatures;
|
||||
|
||||
ScopedMarkTokens mark(m_usedTokens, startToken, curToken);
|
||||
MatchPOIsAndBuildings(0 /* curToken */);
|
||||
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
|
||||
}
|
||||
}
|
||||
|
||||
void Geocoder::CreateStreetsLayerAndMatchLowerLayers(
|
||||
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features)
|
||||
{
|
||||
ASSERT(m_layers.empty(), ());
|
||||
|
||||
if (coding::CompressedBitVector::IsEmpty(features))
|
||||
return;
|
||||
|
||||
m_layers.emplace_back();
|
||||
MY_SCOPE_GUARD(cleanupGuard, bind(&vector<FeaturesLayer>::pop_back, &m_layers));
|
||||
|
||||
auto & layer = m_layers.back();
|
||||
layer.Clear();
|
||||
layer.m_type = SearchModel::SEARCH_TYPE_STREET;
|
||||
layer.m_startToken = startToken;
|
||||
layer.m_endToken = endToken;
|
||||
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */,
|
||||
layer.m_subQuery);
|
||||
|
||||
vector<uint32_t> sortedFeatures;
|
||||
sortedFeatures.reserve(features->PopCount());
|
||||
coding::CompressedBitVectorEnumerator::ForEach(*features, MakeBackInsertFunctor(sortedFeatures));
|
||||
layer.m_sortedFeatures = &sortedFeatures;
|
||||
|
||||
ScopedMarkTokens mark(m_usedTokens, startToken, endToken);
|
||||
MatchPOIsAndBuildings(0 /* curToken */);
|
||||
}
|
||||
|
||||
void Geocoder::MatchPOIsAndBuildings(size_t curToken)
|
||||
{
|
||||
// Skip used tokens.
|
||||
|
@ -1063,7 +1098,7 @@ void Geocoder::MatchPOIsAndBuildings(size_t curToken)
|
|||
layer.Clear();
|
||||
layer.m_startToken = curToken;
|
||||
layer.m_endToken = curToken + n;
|
||||
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, " " /* sep */,
|
||||
JoinQueryTokens(m_params, layer.m_startToken, layer.m_endToken, kUniSpace /* sep */,
|
||||
layer.m_subQuery);
|
||||
}
|
||||
|
||||
|
|
|
@ -186,6 +186,9 @@ private:
|
|||
// then performs geocoding in street vicinities.
|
||||
void GreedilyMatchStreets();
|
||||
|
||||
void CreateStreetsLayerAndMatchLowerLayers(
|
||||
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features);
|
||||
|
||||
// Tries to find all paths in a search tree, where each edge is
|
||||
// marked with some substring of the query tokens. These paths are
|
||||
// called "layer sequence" and current path is stored in |m_layers|.
|
||||
|
|
|
@ -153,23 +153,19 @@ void HouseNumberTokenizer::Tokenize(UniString const & s, vector<Token> & ts)
|
|||
}
|
||||
}
|
||||
|
||||
void NormalizeHouseNumber(string const & s, vector<string> & ts)
|
||||
void NormalizeHouseNumber(strings::UniString const & s, vector<strings::UniString> & ts)
|
||||
{
|
||||
vector<HouseNumberTokenizer::Token> tokens;
|
||||
HouseNumberTokenizer::Tokenize(MakeLowerCase(MakeUniString(s)), tokens);
|
||||
|
||||
vector<UniString> mergedTokens;
|
||||
MergeTokens(tokens, mergedTokens);
|
||||
|
||||
transform(mergedTokens.begin(), mergedTokens.end(), back_inserter(ts), &ToUtf8);
|
||||
HouseNumberTokenizer::Tokenize(MakeLowerCase(s), tokens);
|
||||
MergeTokens(tokens, ts);
|
||||
}
|
||||
|
||||
bool HouseNumbersMatch(string const & houseNumber, string const & query)
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query)
|
||||
{
|
||||
if (houseNumber == query)
|
||||
return true;
|
||||
|
||||
vector<string> queryTokens;
|
||||
vector<strings::UniString> queryTokens;
|
||||
NormalizeHouseNumber(query, queryTokens);
|
||||
|
||||
if (!queryTokens.empty())
|
||||
|
@ -178,14 +174,14 @@ bool HouseNumbersMatch(string const & houseNumber, string const & query)
|
|||
return HouseNumbersMatch(houseNumber, queryTokens);
|
||||
}
|
||||
|
||||
bool HouseNumbersMatch(string const & houseNumber, vector<string> const & queryTokens)
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<strings::UniString> const & queryTokens)
|
||||
{
|
||||
if (houseNumber.empty() || queryTokens.empty())
|
||||
return false;
|
||||
if (queryTokens[0][0] != houseNumber[0])
|
||||
return false;
|
||||
|
||||
vector<string> houseNumberTokens;
|
||||
vector<strings::UniString> houseNumberTokens;
|
||||
NormalizeHouseNumber(houseNumber, houseNumberTokens);
|
||||
|
||||
if (houseNumberTokens.empty())
|
||||
|
|
|
@ -38,12 +38,13 @@ public:
|
|||
};
|
||||
|
||||
// Splits house number by tokens, removes blanks and separators.
|
||||
void NormalizeHouseNumber(string const & s, vector<string> & ts);
|
||||
void NormalizeHouseNumber(strings::UniString const & s, vector<strings::UniString> & ts);
|
||||
|
||||
// Returns true when |query| matches to |houseNumber|.
|
||||
bool HouseNumbersMatch(string const & houseNumber, string const & query);
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query);
|
||||
|
||||
// Returns true when |queryTokens| match to |houseNumber|.
|
||||
bool HouseNumbersMatch(string const & houseNumber, vector<string> const & queryTokens);
|
||||
bool HouseNumbersMatch(strings::UniString const & houseNumber,
|
||||
vector<strings::UniString> const & queryTokens);
|
||||
} // namespace v2
|
||||
} // namespace search
|
||||
|
|
Loading…
Add table
Reference in a new issue