[search] Fixed streets matching.

This commit is contained in:
Yuri Gorshenin 2016-06-16 17:25:36 +03:00
parent 01db5017dc
commit 683250079a
7 changed files with 212 additions and 60 deletions

View file

@ -176,33 +176,19 @@ public:
tokens.resize(maxTokensCount);
}
// Streets are a special case: we do not add the token "street" and its
// synonyms when the feature's name contains it because in
// the search phase this part of the query will be matched against the
// "street" in the categories branch of the search index.
// However, we still add it when there are two or more street tokens
// ("avenue st", "улица набережная").
size_t const tokensCount = tokens.size();
size_t numStreetTokens = 0;
vector<bool> isStreet(tokensCount);
for (size_t i = 0; i < tokensCount; ++i)
if (m_hasStreetType)
{
if (search::IsStreetSynonym(tokens[i]))
{
isStreet[i] = true;
++numStreetTokens;
}
search::StreetTokensFilter filter([&](strings::UniString const & token, size_t /* tag */)
{
AddToken(lang, token);
});
for (auto const & token : tokens)
filter.Put(token, false /* isPrefix */, 0 /* tag */);
}
for (size_t i = 0; i < tokensCount; ++i)
else
{
if (numStreetTokens == 1 && isStreet[i] && m_hasStreetType)
{
//LOG(LDEBUG, ("Skipping token:", tokens[i], "in", name));
continue;
}
AddToken(lang, tokens[i]);
for (auto const & token : tokens)
AddToken(lang, token);
}
return true;

View file

@ -9,6 +9,28 @@ using namespace strings;
namespace
{
class Utf8StreetTokensFilter
{
public:
Utf8StreetTokensFilter(vector<pair<string, size_t>> & cont)
: m_cont(cont)
, m_filter([&](UniString const & token, size_t tag)
{
m_cont.emplace_back(ToUtf8(token), tag);
})
{
}
inline void Put(string const & token, bool isPrefix, size_t tag)
{
m_filter.Put(MakeUniString(token), isPrefix, tag);
}
private:
vector<pair<string, size_t>> & m_cont;
StreetTokensFilter m_filter;
};
bool TestStreetPrefixMatch(char const * s)
{
return IsStreetSynonymPrefix(MakeUniString(s));
@ -74,3 +96,51 @@ UNIT_TEST(StreetPrefixMatch)
TEST(TestStreetPrefixMatch("проезд"), ());
TEST(!TestStreetPrefixMatch("проездд"), ());
}
UNIT_TEST(StreetTokensFilter)
{
using TList = vector<pair<string, size_t>>;
{
TList expected = {};
TList actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("ули", true /* isPrefix */, 0 /* tag */);
TEST_EQUAL(expected, actual, ());
}
{
TList expected = {};
TList actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("улица", false /* isPrefix */, 0 /* tag */);
TEST_EQUAL(expected, actual, ());
}
{
TList expected = {{"генерала", 1}, {"антонова", 2}};
TList actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("ул", false /* isPrefix */, 0 /* tag */);
filter.Put("генерала", false /* isPrefix */, 1 /* tag */);
filter.Put("антонова", false /* isPrefix */, 2 /* tag */);
TEST_EQUAL(expected, actual, ());
}
{
TList expected = {{"улица", 100}, {"набережная", 50}};
TList actual;
Utf8StreetTokensFilter filter(actual);
filter.Put("улица", false /* isPrefix */, 100 /* tag */);
filter.Put("набережная", true /* isPrefix */, 50 /* tag */);
TEST_EQUAL(expected, actual, ());
}
}

View file

@ -223,4 +223,27 @@ bool ContainsNormalized(string const & str, string const & substr)
UniString const usubstr = NormalizeAndSimplifyString(substr);
return std::search(ustr.begin(), ustr.end(), usubstr.begin(), usubstr.end()) != ustr.end();
}
// StreetTokensFilter ------------------------------------------------------------------------------
void StreetTokensFilter::Put(strings::UniString const & token, bool isPrefix, size_t tag)
{
if ((isPrefix && IsStreetSynonymPrefix(token)) || (!isPrefix && IsStreetSynonym(token)))
{
++m_numSynonyms;
if (m_numSynonyms == 1)
{
m_delayedToken = token;
m_delayedTag = tag;
}
else
{
EmitToken(m_delayedToken, m_delayedTag);
EmitToken(token, tag);
}
}
else
{
EmitToken(token, tag);
}
}
} // namespace search

View file

@ -54,4 +54,37 @@ bool IsStreetSynonymPrefix(strings::UniString const & s);
/// Normalizes both str and substr, and then returns true if substr is found in str.
/// Used in native platform code for search in localized strings (cuisines, categories, strings etc.).
bool ContainsNormalized(string const & str, string const & substr);
// This class can be used as a filter for street tokens. As there can
// be street synonyms in the street name, single street synonym is
// skipped, but multiple synonyms are left as is.
class StreetTokensFilter
{
public:
using TCallback = function<void(strings::UniString const & token, size_t tag)>;
template <typename TC>
StreetTokensFilter(TC && callback)
: m_callback(forward<TC>(callback))
{
}
// Puts token to the filter. Filter checks following cases:
// * if |token| is the first street synonym met so far, it's delayed
// * if |token| is a street synonym, but not the first, callback is called
// for the |token| and for the previously delayed token
// * if |token| is not a street synonym, callback is called for the |token|
void Put(strings::UniString const & token, bool isPrefix, size_t tag);
private:
using TCell = pair<strings::UniString, size_t>;
inline void EmitToken(strings::UniString const & token, size_t tag) { m_callback(token, tag); }
strings::UniString m_delayedToken;
size_t m_delayedTag = 0;
size_t m_numSynonyms = 0;
TCallback m_callback;
};
} // namespace search

View file

@ -466,6 +466,8 @@ void Geocoder::SetParams(Params const & params)
}
}
LOG(LDEBUG, ("Tokens = ", m_params.m_tokens));
LOG(LDEBUG, ("Prefix tokens = ", m_params.m_prefixTokens));
LOG(LDEBUG, ("Languages =", m_params.m_langs));
}
@ -1075,61 +1077,82 @@ void Geocoder::GreedilyMatchStreets()
continue;
// Here we try to match as many tokens as possible while
// intersection is a non-empty bit vector of streets. All tokens
// that are synonyms to streets are ignored. Moreover, each time
// a token that looks like a beginning of a house number is met,
// we try to use current intersection of tokens as a street layer
// and try to match buildings or pois.
unique_ptr<coding::CompressedBitVector> allFeatures;
// intersection is a non-empty bit vector of streets. Single
// tokens that are synonyms to streets are ignored. Moreover,
// each time a token that looks like a beginning of a house number
// is met, we try to use current intersection of tokens as a
// street layer and try to match BUILDINGs or POIs.
CBVPtr allFeatures(m_streets, false /* isOwner */);
size_t curToken = startToken;
// This variable is used for prevention of duplicate calls to
// CreateStreetsLayerAndMatchLowerLayers() with the same
// arguments.
size_t lastStopToken = curToken;
size_t lastToken = startToken;
for (; curToken < m_numTokens && !m_usedTokens[curToken]; ++curToken)
bool emptyIntersection = true;
bool incomplete = false;
auto createStreetsLayerAndMatchLowerLayers = [&]()
{
if (!allFeatures.IsEmpty() && !emptyIntersection && !incomplete && lastToken != curToken)
{
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, *allFeatures);
lastToken = curToken;
}
};
StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag)
{
auto buffer = coding::CompressedBitVector::Intersect(
*allFeatures, *m_addressFeatures[tag]);
if (tag < curToken)
{
allFeatures.Set(move(buffer));
emptyIntersection = false;
incomplete = true;
return;
}
ASSERT_EQUAL(tag, curToken, ());
// |allFeatures| will become empty
// after the intersection. Therefore
// we need to create streets layer
// right now.
if (coding::CompressedBitVector::IsEmpty(buffer))
createStreetsLayerAndMatchLowerLayers();
allFeatures.Set(move(buffer));
emptyIntersection = false;
incomplete = false;
});
for (; curToken < m_numTokens && !m_usedTokens[curToken] && !allFeatures.IsEmpty(); ++curToken)
{
auto const & token = m_params.GetTokens(curToken).front();
if (IsStreetSynonymPrefix(token))
continue;
bool const isPrefix = curToken >= m_params.m_tokens.size();
if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
{
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
lastStopToken = curToken;
}
createStreetsLayerAndMatchLowerLayers();
unique_ptr<coding::CompressedBitVector> buffer;
if (startToken == curToken || coding::CompressedBitVector::IsEmpty(allFeatures))
buffer = coding::CompressedBitVector::Intersect(*m_streets, *m_addressFeatures[curToken]);
else
buffer = coding::CompressedBitVector::Intersect(*allFeatures, *m_addressFeatures[curToken]);
if (coding::CompressedBitVector::IsEmpty(buffer))
break;
allFeatures.swap(buffer);
filter.Put(token, isPrefix, curToken);
}
if (curToken != lastStopToken)
CreateStreetsLayerAndMatchLowerLayers(startToken, curToken, allFeatures);
createStreetsLayerAndMatchLowerLayers();
}
}
void Geocoder::CreateStreetsLayerAndMatchLowerLayers(
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features)
size_t startToken, size_t endToken, coding::CompressedBitVector const & features)
{
ASSERT(m_layers.empty(), ());
if (coding::CompressedBitVector::IsEmpty(features))
if (coding::CompressedBitVector::IsEmpty(&features))
return;
CBVPtr filtered(features.get(), false /* isOwner */);
if (m_filter->NeedToFilter(*features))
filtered.Set(m_filter->Filter(*features).release(), true /* isOwner */);
CBVPtr filtered(&features, false /* isOwner */);
if (m_filter->NeedToFilter(features))
filtered.Set(m_filter->Filter(features).release(), true /* isOwner */);
m_layers.emplace_back();
MY_SCOPE_GUARD(cleanupGuard, bind(&vector<FeaturesLayer>::pop_back, &m_layers));
@ -1138,7 +1161,7 @@ void Geocoder::CreateStreetsLayerAndMatchLowerLayers(
InitLayer(SearchModel::SEARCH_TYPE_STREET, startToken, endToken, layer);
vector<uint32_t> sortedFeatures;
sortedFeatures.reserve(features->PopCount());
sortedFeatures.reserve(features.PopCount());
filtered.ForEach(MakeBackInsertFunctor(sortedFeatures));
layer.m_sortedFeatures = &sortedFeatures;

View file

@ -245,8 +245,8 @@ private:
// then performs geocoding in street vicinities.
void GreedilyMatchStreets();
void CreateStreetsLayerAndMatchLowerLayers(
size_t startToken, size_t endToken, unique_ptr<coding::CompressedBitVector> const & features);
void CreateStreetsLayerAndMatchLowerLayers(size_t startToken, size_t endToken,
coding::CompressedBitVector const & features);
// Tries to find all paths in a search tree, where each edge is
// marked with some substring of the query tokens. These paths are

View file

@ -59,6 +59,8 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
TestCity losAlamosCity(m2::PointD(10, 10), "Los Alamos", "en", 100 /* rank */);
TestCity mskCity(m2::PointD(0, 0), "Moscow", "en", 100 /* rank */);
TestCity torontoCity(m2::PointD(-10, -10), "Toronto", "en", 100 /* rank */);
TestVillage longPondVillage(m2::PointD(15, 15), "Long Pond Village", "en", 10 /* rank */);
TestStreet feynmanStreet(
vector<m2::PointD>{m2::PointD(9.999, 9.999), m2::PointD(10, 10), m2::PointD(10.001, 10.001)},
@ -94,16 +96,23 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
TestPOI lantern1(m2::PointD(10.0005, 10.0005), "lantern 1", "en");
TestPOI lantern2(m2::PointD(10.0006, 10.0005), "lantern 2", "en");
TestStreet stradaDrive(vector<m2::PointD>{m2::PointD(-10.001, -10.001), m2::PointD(-10, -10),
m2::PointD(-9.999, -9.999)},
"Strada drive", "en");
TestBuilding terranceHouse(m2::PointD(-10, -10), "", "155", stradaDrive, "en");
BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(wonderlandCountry);
builder.Add(losAlamosCity);
builder.Add(mskCity);
builder.Add(torontoCity);
});
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder)
{
builder.Add(losAlamosCity);
builder.Add(mskCity);
builder.Add(torontoCity);
builder.Add(longPondVillage);
builder.Add(feynmanStreet);
@ -125,6 +134,9 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
builder.Add(quantumCafe);
builder.Add(lantern1);
builder.Add(lantern2);
builder.Add(stradaDrive);
builder.Add(terranceHouse);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
@ -190,6 +202,11 @@ UNIT_CLASS_TEST(ProcessorTest, Smoke)
TRules rules = {ExactMatch(wonderlandId, bornHouse)};
TEST(ResultsMatch("long pond 1st april street 8", rules), ());
}
{
TRules rules = {ExactMatch(wonderlandId, terranceHouse)};
TEST(ResultsMatch("Toronto strada drive 155", rules), ());
}
}
UNIT_CLASS_TEST(ProcessorTest, SearchInWorld)