[search] Minor refactoring.

tatiana-yan 2019-07-04 17:17:11 +03:00 committed by mpimenov
parent c37d7fc084
commit 954050088d


@@ -67,6 +67,112 @@ bool EqualsByHashAndMisprints(StreetsMatcher::Prediction const & lhs,
{
  return lhs.m_withMisprints == rhs.m_withMisprints && lhs.m_hash == rhs.m_hash;
}

void FindStreets(BaseContext const & ctx, FeaturesFilter const & filter, QueryParams const & params,
                 size_t startToken, bool withMisprints,
                 vector<StreetsMatcher::Prediction> & predictions)
{
  // Here we try to match as many tokens as possible while
  // intersection is a non-empty bit vector of streets. Single
  // tokens that are synonyms to streets are ignored. Moreover,
  // each time a token that looks like a beginning of a house number
  // is met, we try to use current intersection of tokens as a
  // street layer and try to match BUILDINGs or POIs.
  CBV streets(ctx.m_streets);

  CBV all;
  all.SetFull();

  size_t curToken = startToken;

  // This variable is used for prevention of duplicate calls to
  // CreateStreetsLayerAndMatchLowerLayers() with the same
  // arguments.
  size_t lastToken = startToken;

  // When true, no bit vectors were intersected with |streets| at all.
  bool emptyIntersection = true;

  // When true, |streets| is in the incomplete state and can't be
  // used for creation of street layers.
  bool incomplete = false;

  auto emit = [&]() {
    if (streets.IsEmpty() || emptyIntersection || incomplete || lastToken == curToken)
      return;

    CBV fs(streets);
    CBV fa(all);

    ASSERT(!fs.IsFull(), ());
    ASSERT(!fa.IsFull(), ());

    if (filter.NeedToFilter(fs))
      fs = filter.Filter(fs);

    if (fs.IsEmpty())
      return;

    if (filter.NeedToFilter(fa))
      fa = filter.Filter(fa).Union(fs);

    predictions.emplace_back();
    auto & prediction = predictions.back();

    prediction.m_tokenRange = TokenRange(startToken, curToken);

    ASSERT_NOT_EQUAL(fs.PopCount(), 0, ());
    ASSERT_LESS_OR_EQUAL(fs.PopCount(), fa.PopCount(), ());

    prediction.m_prob = static_cast<double>(fs.PopCount()) / static_cast<double>(fa.PopCount());

    prediction.m_features = move(fs);
    prediction.m_hash = prediction.m_features.Hash();
    prediction.m_withMisprints = withMisprints;
  };

  StreetTokensFilter streetsFilter(
      [&](strings::UniString const & /* token */, size_t tag) {
        auto buffer = streets.Intersect(ctx.m_features[tag].m_features);
        if (tag < curToken)
        {
          // This is the case for delayed
          // street synonym. Therefore,
          // |streets| is temporarily in the
          // incomplete state.
          streets = buffer;
          all = all.Intersect(ctx.m_features[tag].m_features);
          emptyIntersection = false;
          incomplete = true;
          return;
        }

        ASSERT_EQUAL(tag, curToken, ());

        // |streets| will become empty after
        // the intersection. Therefore we need
        // to create streets layer right now.
        if (buffer.IsEmpty())
          emit();

        streets = buffer;
        all = all.Intersect(ctx.m_features[tag].m_features);
        emptyIntersection = false;
        incomplete = false;
      },
      withMisprints);

  for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty(); ++curToken)
  {
    auto const & token = params.GetToken(curToken).GetOriginal();
    bool const isPrefix = params.IsPrefixToken(curToken);

    if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
      emit();

    streetsFilter.Put(token, isPrefix, curToken);
  }

  emit();
}
} // namespace
// static
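
To make the intent of FindStreets above concrete: for a hypothetical query like "oak street 25", "oak" narrows |streets| to the streets matching that token, the bare synonym "street" is delayed by StreetTokensFilter, and "25" looks like a house number, so emit() records a prediction for the tokens consumed so far. The prediction's m_prob is simply the share of street features among all features matching those tokens. Below is a minimal sketch of that arithmetic; the query, the counts and the PredictionProb helper are illustrative only and are not part of the commit.

#include <cstdint>
#include <iostream>

// Same arithmetic as prediction.m_prob in emit() above; the arguments are
// hypothetical stand-ins for fs.PopCount() and fa.PopCount().
double PredictionProb(std::uint64_t streetFeatures, std::uint64_t allFeatures)
{
  return static_cast<double>(streetFeatures) / static_cast<double>(allFeatures);
}

int main()
{
  // Say 12 street features and 48 features of any type match the consumed
  // tokens after filtering: the prediction gets probability 0.25.
  std::cout << PredictionProb(12, 48) << "\n";
  return 0;
}

A value close to 1 means the consumed tokens match almost nothing but streets.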
@@ -87,9 +193,10 @@ void StreetsMatcher::Go(BaseContext const & ctx, FeaturesFilter const & filter,
  // Leave the most probable and longest prediction for predictions with the same m_hash (features)
  // and m_withMisprints.
  // We will still distinguish parses with the same m_hash (features) but different range and m_withMisprints.
  // For example, for "Paramount dive" we will have two parses:
  // STREET UNUSED (can be matched to poi later)
  // We will still distinguish parses with the same m_hash (features) but different range and
  // m_withMisprints. For example, for "Paramount dive" we will have two parses:
  //
  // STREET UNUSED (can be matched to poi later)
  // Paramount dive
  //
  // STREET STREET ("drive" with misprints)
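
The deduplication described in the comment above can be pictured with the following sketch. It uses a simplified stand-in for Prediction and plain std:: algorithms rather than the helpers in streets_matcher.cpp, so take it as an illustration of the rule (equal m_hash and equal m_withMisprints collapse to the most probable, longest prediction), not as the code of StreetsMatcher::Go.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-in for StreetsMatcher::Prediction (illustration only).
struct Prediction
{
  std::uint64_t m_hash = 0;     // hash of the matched street features
  bool m_withMisprints = false;
  double m_prob = 0.0;
  std::size_t m_numTokens = 0;  // stand-in for the token range length
};

// Keep only the most probable and, on ties, longest prediction among
// predictions sharing the same (m_hash, m_withMisprints) pair.
void LeaveBestPredictions(std::vector<Prediction> & ps)
{
  std::sort(ps.begin(), ps.end(), [](Prediction const & lhs, Prediction const & rhs) {
    if (lhs.m_hash != rhs.m_hash)
      return lhs.m_hash < rhs.m_hash;
    if (lhs.m_withMisprints != rhs.m_withMisprints)
      return lhs.m_withMisprints < rhs.m_withMisprints;
    if (lhs.m_prob != rhs.m_prob)
      return lhs.m_prob > rhs.m_prob;          // most probable first
    return lhs.m_numTokens > rhs.m_numTokens;  // longest first
  });

  // Same equality as EqualsByHashAndMisprints above: only the hash and
  // m_withMisprints take part, so the best candidate of each group survives.
  ps.erase(std::unique(ps.begin(), ps.end(),
                       [](Prediction const & lhs, Prediction const & rhs) {
                         return lhs.m_withMisprints == rhs.m_withMisprints &&
                                lhs.m_hash == rhs.m_hash;
                       }),
           ps.end());
}

With this equality the two "Paramount dive" parses from the comment are both kept: they differ in m_withMisprints (and in token range) even if they match the same set of features.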
@@ -115,114 +222,8 @@ void StreetsMatcher::FindStreets(BaseContext const & ctx, FeaturesFilter const &
    if (ctx.IsTokenUsed(startToken))
      continue;

    auto emit = [&](CBV const & streets, CBV const & all, size_t curToken, size_t lastToken,
                    bool emptyIntersection, bool incomplete, bool withMisprints) {
      if (!streets.IsEmpty() && !emptyIntersection && !incomplete && lastToken != curToken)
      {
        CBV fs(streets);
        CBV fa(all);

        ASSERT(!fs.IsFull(), ());
        ASSERT(!fa.IsFull(), ());

        if (filter.NeedToFilter(fs))
          fs = filter.Filter(fs);

        if (fs.IsEmpty())
          return;

        if (filter.NeedToFilter(fa))
          fa = filter.Filter(fa).Union(fs);

        predictions.emplace_back();
        auto & prediction = predictions.back();

        prediction.m_tokenRange = TokenRange(startToken, curToken);

        ASSERT_NOT_EQUAL(fs.PopCount(), 0, ());
        ASSERT_LESS_OR_EQUAL(fs.PopCount(), fa.PopCount(), ());

        prediction.m_prob = static_cast<double>(fs.PopCount()) / static_cast<double>(fa.PopCount());

        prediction.m_features = move(fs);
        prediction.m_hash = prediction.m_features.Hash();
        prediction.m_withMisprints = withMisprints;
      }
    };

    auto findStreets = [&](bool withMisprints) {
      // Here we try to match as many tokens as possible while
      // intersection is a non-empty bit vector of streets. Single
      // tokens that are synonyms to streets are ignored. Moreover,
      // each time a token that looks like a beginning of a house number
      // is met, we try to use current intersection of tokens as a
      // street layer and try to match BUILDINGs or POIs.
      CBV streets(ctx.m_streets);

      CBV all;
      all.SetFull();

      size_t curToken = startToken;

      // This variable is used for prevention of duplicate calls to
      // CreateStreetsLayerAndMatchLowerLayers() with the same
      // arguments.
      size_t lastToken = startToken;

      // When true, no bit vectors were intersected with |streets| at all.
      bool emptyIntersection = true;

      // When true, |streets| is in the incomplete state and can't be
      // used for creation of street layers.
      bool incomplete = false;

      StreetTokensFilter filter([&](strings::UniString const & /* token */, size_t tag)
                                {
                                  auto buffer = streets.Intersect(ctx.m_features[tag].m_features);
                                  if (tag < curToken)
                                  {
                                    // This is the case for delayed
                                    // street synonym. Therefore,
                                    // |streets| is temporarily in the
                                    // incomplete state.
                                    streets = buffer;
                                    all = all.Intersect(ctx.m_features[tag].m_features);
                                    emptyIntersection = false;
                                    incomplete = true;
                                    return;
                                  }

                                  ASSERT_EQUAL(tag, curToken, ());

                                  // |streets| will become empty after
                                  // the intersection. Therefore we need
                                  // to create streets layer right now.
                                  if (buffer.IsEmpty())
                                    emit(streets, all, curToken, lastToken, emptyIntersection,
                                         incomplete, withMisprints);

                                  streets = buffer;
                                  all = all.Intersect(ctx.m_features[tag].m_features);
                                  emptyIntersection = false;
                                  incomplete = false;
                                },
                                withMisprints);

      for (; curToken < ctx.m_numTokens && !ctx.IsTokenUsed(curToken) && !streets.IsEmpty();
           ++curToken)
      {
        auto const & token = params.GetToken(curToken).GetOriginal();
        bool const isPrefix = params.IsPrefixToken(curToken);

        if (house_numbers::LooksLikeHouseNumber(token, isPrefix))
          emit(streets, all, curToken, lastToken, emptyIntersection, incomplete, withMisprints);

        filter.Put(token, isPrefix, curToken);
      }

      emit(streets, all, curToken, lastToken, emptyIntersection, incomplete, withMisprints);
    };

    findStreets(false /* withMisprints */);
    findStreets(true /* withMisprints */);
    ::search::FindStreets(ctx, filter, params, startToken, false /* withMisprints */, predictions);
    ::search::FindStreets(ctx, filter, params, startToken, true /* withMisprints */, predictions);
  }
}
} // namespace search
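
Why does the street-token callback in both versions sometimes fire with tag < curToken? StreetTokensFilter holds back a token that is only a street synonym ("street", "avenue", ...) and forwards it later, with its original tag, once another token shows that the synonym belongs to a real street name; until the current token's own intersection follows, |streets| stays in the incomplete state. The stand-in below is an assumption made for illustration: it drops the isPrefix argument and simplifies the real filter's rules, but it shows the callback contract FindStreets relies on.

#include <cstddef>
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Toy stand-in for StreetTokensFilter (illustration only): street synonyms are
// delayed and forwarded with their original tag once a non-synonym token
// arrives, so the callback can observe tag < curToken.
class ToyStreetTokensFilter
{
public:
  using Callback = std::function<void(std::string const & token, std::size_t tag)>;

  explicit ToyStreetTokensFilter(Callback callback) : m_callback(std::move(callback)) {}

  void Put(std::string const & token, std::size_t tag)
  {
    if (token == "street" || token == "avenue")
    {
      // A bare synonym may be noise ("street" alone); delay it.
      m_delayed.emplace_back(token, tag);
      return;
    }

    // A regular token arrived: flush delayed synonyms first (smaller tags),
    // then report the current token itself.
    for (auto const & delayed : m_delayed)
      m_callback(delayed.first, delayed.second);
    m_delayed.clear();

    m_callback(token, tag);
  }

private:
  Callback m_callback;
  std::vector<std::pair<std::string, std::size_t>> m_delayed;
};

int main()
{
  ToyStreetTokensFilter filter([](std::string const & token, std::size_t tag) {
    std::cout << "callback: " << token << " tag=" << tag << "\n";
  });

  // Hypothetical query "oak street market".
  filter.Put("oak", 0);     // callback: oak tag=0
  filter.Put("street", 1);  // delayed
  filter.Put("market", 2);  // callback: street tag=1, then market tag=2
  return 0;
}

This ordering matches the tag < curToken branch in FindStreets: the delayed synonym arrives first and is folded into |streets| with incomplete = true, and the intersection for the current token follows right after.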