Merge pull request #2961 from ygorshenin/add-postcodes-to-search-index

[search] Postcodes are added to the search index.
This commit is contained in:
mpimenov 2016-04-22 20:15:55 +03:00
commit b9cc722dd7
25 changed files with 806 additions and 216 deletions

View file

@ -220,6 +220,10 @@ bool IsASCIIString(string const & str)
return true;
}
// Returns true iff |c| is one of the ASCII decimal digits '0'..'9'.
bool IsASCIIDigit(UniChar c)
{
  return '0' <= c && c <= '9';
}
// Returns true iff |c| is an ASCII Latin letter, lowercase or uppercase.
bool IsASCIILatin(UniChar c)
{
  bool const isLower = 'a' <= c && c <= 'z';
  bool const isUpper = 'A' <= c && c <= 'Z';
  return isLower || isUpper;
}
bool StartsWith(UniString const & s, UniString const & p)
{
if (p.size() > s.size())

View file

@ -29,6 +29,19 @@ public:
template <class IterT> UniString(IterT b, IterT e) : BaseT(b, e) {}
bool IsEqualAscii(char const * s) const;
// Appends the contents of |rhs| to this string and returns a reference
// to this (modified) string.
UniString & operator+=(UniString const & rhs)
{
  this->append(rhs);
  return *this;
}
// Returns a new string made of this string followed by |rhs|.
// Neither operand is modified.
UniString operator+(UniString const & rhs) const
{
  UniString concatenated(*this);
  concatenated.append(rhs);
  return concatenated;
}
};
/// Performs full case folding for string to make it search-compatible according
@ -67,6 +80,8 @@ bool EqualNoCase(string const & s1, string const & s2);
UniString MakeUniString(string const & utf8s);
string ToUtf8(UniString const & s);
bool IsASCIIString(string const & str);
bool IsASCIIDigit(UniChar c);
bool IsASCIILatin(UniChar c);
inline string DebugPrint(UniString const & s)
{

View file

@ -136,11 +136,11 @@ struct FeatureNameInserter
{
}
void AddToken(signed char lang, strings::UniString const & s) const
void AddToken(uint8_t lang, strings::UniString const & s) const
{
strings::UniString key;
key.reserve(s.size() + 1);
key.push_back(static_cast<uint8_t>(lang));
key.push_back(lang);
key.append(s.begin(), s.end());
m_keyValuePairs.emplace_back(key, m_val);
@ -261,9 +261,11 @@ public:
void operator() (FeatureType const & f, uint32_t index) const
{
using namespace search;
feature::TypesHolder types(f);
static search::TypesSkipper skipIndex;
static TypesSkipper skipIndex;
skipIndex.SkipTypes(types);
if (types.Empty())
@ -278,6 +280,18 @@ public:
skipIndex.IsCountryOrState(types) ? m_synonyms : nullptr, m_keyValuePairs, hasStreetType);
m_valueBuilder.MakeValue(f, types, index, inserter.m_val);
string const postcode = f.GetMetadata().Get(feature::Metadata::FMD_POSTCODE);
if (!postcode.empty())
{
// See OSM TagInfo or Wiki about modern postcodes format. The average number of tokens is less
// than two.
buffer_vector<strings::UniString, 2> tokens;
SplitUniString(NormalizeAndSimplifyString(postcode), MakeBackInsertFunctor(tokens),
Delimiters());
for (auto const & token : tokens)
inserter.AddToken(kPostcodesLang, PostcodeToString(token));
}
// Skip types for features without names.
if (!f.ForEachName(inserter))
skipIndex.SkipEmptyNameTypes(types);
@ -291,7 +305,7 @@ public:
// add names of categories of the feature
for (uint32_t t : categoryTypes)
inserter.AddToken(search::kCategoriesLang, search::FeatureTypeToString(c.GetIndexForType(t)));
inserter.AddToken(kCategoriesLang, FeatureTypeToString(c.GetIndexForType(t)));
}
};

View file

@ -90,6 +90,12 @@ UniString FeatureTypeToString(uint32_t type)
return UniString(s.begin(), s.end());
}
// Builds the search-index token for |postcode|: the postcode text
// preceded by the "!postcode:" marker.
UniString PostcodeToString(strings::UniString const & postcode)
{
  // The marker is converted from UTF-8 only once, on first use.
  static UniString const kPrefix = MakeUniString("!postcode:");

  UniString token(kPrefix);
  token += postcode;
  return token;
}
namespace
{
char const * kStreetTokensSeparator = "\t -,.";

View file

@ -19,6 +19,8 @@ void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims
strings::UniString FeatureTypeToString(uint32_t type);
strings::UniString PostcodeToString(strings::UniString const & postcode);
template <class ContainerT, class DelimsT>
bool TokenizeStringAndCheckIfLastTokenIsPrefix(strings::UniString const & s,
ContainerT & tokens,

View file

@ -3,6 +3,7 @@
#include "search/search_index_values.hpp"
#include "search/search_query.hpp"
#include "search/search_query_params.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/trie.hpp"
@ -103,6 +104,25 @@ bool CheckMatchString(strings::UniChar const * rootPrefix, size_t rootPrefixSize
return false;
}
// Scans the edges going out of |trieRoot| for the first one whose label
// starts with |lang|. On success stores the position of that edge in
// |langIx| and returns true. Otherwise returns false and leaves |langIx|
// untouched.
template <typename TValue>
bool FindLangIndex(trie::Iterator<ValueList<TValue>> const & trieRoot, uint8_t lang, uint32_t & langIx)
{
  ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits<uint32_t>::max(), ());

  uint32_t const numEdges = static_cast<uint32_t>(trieRoot.m_edge.size());
  uint32_t ix = 0;
  while (ix != numEdges)
  {
    auto const & edge = trieRoot.m_edge[ix].m_label;
    ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());

    // The first character of the label is compared against |lang|.
    if (edge[0] == lang)
    {
      langIx = ix;
      return true;
    }
    ++ix;
  }
  return false;
}
} // namespace
template <typename TValue, typename TF>
@ -222,7 +242,7 @@ public:
toDo(value);
}
};
} // namespace search::impl
} // impl
template <typename TValue>
struct TrieRootPrefix
@ -345,27 +365,23 @@ template <typename TValue, typename THolder>
bool MatchCategoriesInTrie(SearchQueryParams const & params,
trie::Iterator<ValueList<TValue>> const & trieRoot, THolder && holder)
{
ASSERT_LESS(trieRoot.m_edge.size(), numeric_limits<uint32_t>::max(), ());
uint32_t const numLangs = static_cast<uint32_t>(trieRoot.m_edge.size());
for (uint32_t langIx = 0; langIx < numLangs; ++langIx)
{
auto const & edge = trieRoot.m_edge[langIx].m_label;
ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());
if (edge[0] == search::kCategoriesLang)
{
auto const catRoot = trieRoot.GoToEdge(langIx);
MatchTokensInTrie(params.m_tokens, TrieRootPrefix<TValue>(*catRoot, edge), holder);
uint32_t langIx = 0;
if (!impl::FindLangIndex(trieRoot, search::kCategoriesLang, langIx))
return false;
// Last token's prefix is used as a complete token here, to
// limit the number of features in the last bucket of a
// holder. Probably, this is a false optimization.
holder.Resize(params.m_tokens.size() + 1);
holder.SwitchTo(params.m_tokens.size());
MatchTokenInTrie(params.m_prefixTokens, TrieRootPrefix<TValue>(*catRoot, edge), holder);
return true;
}
}
return false;
auto const & edge = trieRoot.m_edge[langIx].m_label;
ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());
auto const catRoot = trieRoot.GoToEdge(langIx);
MatchTokensInTrie(params.m_tokens, TrieRootPrefix<TValue>(*catRoot, edge), holder);
// Last token's prefix is used as a complete token here, to limit
// the number of features in the last bucket of a holder. Probably,
// this is a false optimization.
holder.Resize(params.m_tokens.size() + 1);
holder.SwitchTo(params.m_tokens.size());
MatchTokenInTrie(params.m_prefixTokens, TrieRootPrefix<TValue>(*catRoot, edge), holder);
return true;
}
// Calls toDo with trie root prefix and language code on each language
@ -427,4 +443,29 @@ void MatchFeaturesInTrie(SearchQueryParams const & params,
intersecter.ForEachResult(forward<ToDo>(toDo));
}
// Matches every token of |slice| against the subtree of |trieRoot|
// found for search::kPostcodesLang and calls |toDo| for each result
// reported by the intersecter. |filter| is handed to the intersecter.
// Does nothing when |trieRoot| has no edge for search::kPostcodesLang.
template <typename TValue, typename TFilter, typename ToDo>
void MatchPostcodesInTrie(v2::TokenSlice const & slice,
                          trie::Iterator<ValueList<TValue>> const & trieRoot,
                          TFilter const & filter, ToDo && toDo)
{
  uint32_t postcodesIx = 0;
  bool const hasPostcodes = impl::FindLangIndex(trieRoot, search::kPostcodesLang, postcodesIx);
  if (!hasPostcodes)
    return;

  auto const & label = trieRoot.m_edge[postcodesIx].m_label;
  auto const subtree = trieRoot.GoToEdge(postcodesIx);

  impl::OffsetIntersecter<TFilter, TValue> intersecter(filter);
  for (size_t tokenIx = 0; tokenIx != slice.Size(); ++tokenIx)
  {
    TrieRootPrefix<TValue> const root(*subtree, label);

    // A prefix token is matched as a prefix, any other token as a whole.
    if (!slice.IsPrefix(tokenIx))
      MatchTokenInTrie(slice.Get(tokenIx), root, intersecter);
    else
      MatchTokenPrefixInTrie(slice.Get(tokenIx), root, intersecter);

    intersecter.NextStep();
  }

  intersecter.ForEachResult(forward<ToDo>(toDo));
}
} // namespace search

View file

@ -7,6 +7,7 @@
#include "search_trie.hpp"
#include "v2/mwm_context.hpp"
#include "v2/token_slice.hpp"
#include "indexer/feature.hpp"
#include "indexer/feature_algo.hpp"
@ -29,8 +30,65 @@ using osm::Editor;
namespace search
{
namespace v2
{
namespace
{
class FeaturesCollector
{
public:
FeaturesCollector(my::Cancellable const & cancellable, vector<uint64_t> & features)
: m_cancellable(cancellable), m_features(features), m_counter(0)
{
}
template <typename TValue>
void operator()(TValue const & value)
{
if ((++m_counter & 0xFF) == 0)
BailIfCancelled(m_cancellable);
m_features.push_back(value.m_featureId);
}
inline void operator()(uint32_t feature) { m_features.push_back(feature); }
inline void operator()(uint64_t feature) { m_features.push_back(feature); }
private:
my::Cancellable const & m_cancellable;
vector<uint64_t> & m_features;
uint32_t m_counter;
};
class EditedFeaturesHolder
{
public:
EditedFeaturesHolder(MwmSet::MwmId const & id)
{
Editor & editor = Editor::Instance();
m_deleted = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Deleted);
m_modified = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Modified);
m_created = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Created);
}
bool ModifiedOrDeleted(uint32_t featureIndex) const
{
return binary_search(m_deleted.begin(), m_deleted.end(), featureIndex) ||
binary_search(m_modified.begin(), m_modified.end(), featureIndex);
}
template <typename TFn>
void ForEachModifiedOrCreated(TFn & fn)
{
for_each(m_modified.begin(), m_modified.end(), fn);
for_each(m_created.begin(), m_created.end(), fn);
}
private:
vector<uint32_t> m_deleted;
vector<uint32_t> m_modified;
vector<uint32_t> m_created;
};
unique_ptr<coding::CompressedBitVector> SortFeaturesAndBuildCBV(vector<uint64_t> && features)
{
@ -89,6 +147,13 @@ bool MatchFeatureByName(FeatureType const & ft, SearchQueryParams const & params
return matched;
}
// Placeholder for matching the postcode stored in the metadata of |ft|
// against the tokens of |slice|. The matching itself is not implemented
// yet: the function always returns false, i.e. no feature is ever
// reported as matching.
bool MatchFeatureByPostcode(FeatureType const & ft, v2::TokenSlice const & slice)
{
// Read from the feature metadata but not used yet.
string const postcode = ft.GetMetadata().Get(feature::Metadata::FMD_POSTCODE);
// TODO(@y): implement this.
return false;
}
// Retrieves from the search index corresponding to |value| all
// features matching to |params|.
template <typename TValue>
@ -96,16 +161,7 @@ unique_ptr<coding::CompressedBitVector> RetrieveAddressFeaturesImpl(
MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
SearchQueryParams const & params)
{
// Exclude from search all deleted/modified features and match all edited/created features separately.
Editor & editor = Editor::Instance();
auto const deleted = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Deleted);
auto const modified = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Modified);
auto const filter = [&](uint32_t featureIndex) -> bool
{
return (!binary_search(deleted.begin(), deleted.end(), featureIndex) &&
!binary_search(modified.begin(), modified.end(), featureIndex));
};
EditedFeaturesHolder holder(id);
serial::CodingParams codingParams(trie::GetCodingParams(value.GetHeader().GetDefCodingParams()));
ModelReaderPtr searchReader = value.m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
@ -116,17 +172,16 @@ unique_ptr<coding::CompressedBitVector> RetrieveAddressFeaturesImpl(
// TODO (@y, @m): This code may be optimized in the case where
// bit vectors are sorted in the search index.
vector<uint64_t> features;
uint32_t counter = 0;
auto const collector = [&](TValue const & value)
{
if ((++counter & 0xFF) == 0)
BailIfCancelled(cancellable);
features.push_back(value.m_featureId);
};
FeaturesCollector collector(cancellable, features);
MatchFeaturesInTrie(params, *trieRoot, filter, collector);
MatchFeaturesInTrie(params, *trieRoot, [&holder](uint32_t featureIndex)
{
return !holder.ModifiedOrDeleted(featureIndex);
},
collector);
// Match all edited/created features separately.
Editor & editor = Editor::Instance();
auto const matcher = [&](uint32_t featureIndex)
{
FeatureType ft;
@ -136,9 +191,47 @@ unique_ptr<coding::CompressedBitVector> RetrieveAddressFeaturesImpl(
features.push_back(featureIndex);
};
for_each(modified.begin(), modified.end(), matcher);
auto const created = editor.GetFeaturesByStatus(id, Editor::FeatureStatus::Created);
for_each(created.begin(), created.end(), matcher);
holder.ForEachModifiedOrCreated(matcher);
return SortFeaturesAndBuildCBV(move(features));
}
// Collects all features matching the postcode tokens of |slice| and
// returns them as a compressed bit vector.
//
// Features are gathered in two passes: first from the search trie read
// from |value|, skipping features the editor reports as modified or
// deleted; then from the editor's modified/created features, each of
// which is checked with MatchFeatureByPostcode().
template <typename TValue>
unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeaturesImpl(
    MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
    TokenSlice const & slice)
{
  EditedFeaturesHolder holder(id);

  serial::CodingParams codingParams(trie::GetCodingParams(value.GetHeader().GetDefCodingParams()));
  ModelReaderPtr searchReader = value.m_cont.GetReader(SEARCH_INDEX_FILE_TAG);

  auto const trieRoot = trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<TValue>>(
      SubReaderWrapper<Reader>(searchReader.GetPtr()), SingleValueSerializer<TValue>(codingParams));

  // TODO (@y, @m): This code may be optimized in the case where
  // bit vectors are sorted in the search index.
  vector<uint64_t> features;
  FeaturesCollector collector(cancellable, features);

  // First pass: the search index, minus everything touched by the editor.
  auto const notModifiedOrDeleted = [&holder](uint32_t featureIndex)
  {
    return !holder.ModifiedOrDeleted(featureIndex);
  };
  MatchPostcodesInTrie(slice, *trieRoot, notModifiedOrDeleted, collector);

  // Second pass: match all edited/created features separately.
  Editor & editor = Editor::Instance();
  auto const matcher = [&editor, &id, &slice, &features](uint32_t featureIndex)
  {
    FeatureType ft;
    VERIFY(editor.GetEditedFeature(id, featureIndex, ft), ());
    // TODO(AlexZ): Should we match by some feature's metafields too?
    if (!MatchFeatureByPostcode(ft, slice))
      return;
    features.push_back(featureIndex);
  };
  holder.ForEachModifiedOrCreated(matcher);

  return SortFeaturesAndBuildCBV(move(features));
}
@ -149,42 +242,74 @@ unique_ptr<coding::CompressedBitVector> RetrieveGeometryFeaturesImpl(
v2::MwmContext const & context, my::Cancellable const & cancellable,
covering::IntervalsT const & coverage, int scale)
{
uint32_t counter = 0;
vector<uint64_t> features;
context.ForEachIndex(coverage, scale, [&](uint64_t featureId)
{
if ((++counter & 0xFF) == 0)
BailIfCancelled(cancellable);
features.push_back(featureId);
});
FeaturesCollector collector(cancellable, features);
context.ForEachIndex(coverage, scale, collector);
return SortFeaturesAndBuildCBV(move(features));
}
// Wraps RetrieveAddressFeaturesImpl<TValue>() into a class template so
// that it can be passed as a template template argument (see Selector).
template <typename TValue>
struct RetrieveAddressFeaturesAdaptor
{
  // Forwards all arguments unchanged to RetrieveAddressFeaturesImpl<TValue>().
  template <typename... TParams>
  unique_ptr<coding::CompressedBitVector> operator()(TParams &&... params)
  {
    return RetrieveAddressFeaturesImpl<TValue>(forward<TParams>(params)...);
  }
};
// Wraps RetrievePostcodeFeaturesImpl<TValue>() into a class template so
// that it can be passed as a template template argument (see Selector).
template <typename TValue>
struct RetrievePostcodeFeaturesAdaptor
{
  // Forwards all arguments unchanged to RetrievePostcodeFeaturesImpl<TValue>().
  template <typename... TParams>
  unique_ptr<coding::CompressedBitVector> operator()(TParams &&... params)
  {
    return RetrievePostcodeFeaturesImpl<TValue>(forward<TParams>(params)...);
  }
};
// Dispatches a retrieval call to T<TValue>, where TValue is chosen from
// the search index format reported for the version of |value|:
//   FeaturesWithRankAndCenter -> T<FeatureWithRankAndCenter>
//   CompressedBitVector       -> T<FeatureIndexValue>
// For any other format an empty pointer is returned.
template <template <typename> class T>
struct Selector
{
  template <typename... TArgs>
  unique_ptr<coding::CompressedBitVector> operator()(MwmSet::MwmId const & id, MwmValue & value,
                                                     TArgs &&... args)
  {
    using TFormat = version::MwmTraits::SearchIndexFormat;

    version::MwmTraits mwmTraits(value.GetMwmVersion().GetFormat());
    auto const format = mwmTraits.GetSearchIndexFormat();

    if (format == TFormat::FeaturesWithRankAndCenter)
    {
      T<FeatureWithRankAndCenter> retriever;
      return retriever(id, value, forward<TArgs>(args)...);
    }

    if (format == TFormat::CompressedBitVector)
    {
      T<FeatureIndexValue> retriever;
      return retriever(id, value, forward<TArgs>(args)...);
    }

    // Unknown search index format: nothing can be retrieved.
    return unique_ptr<coding::CompressedBitVector>();
  }
};
} // namespace
namespace v2
{
unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
SearchQueryParams const & params)
{
version::MwmTraits mwmTraits(value.GetMwmVersion().GetFormat());
Selector<RetrieveAddressFeaturesAdaptor> selector;
return selector(id, value, cancellable, params);
}
if (mwmTraits.GetSearchIndexFormat() ==
version::MwmTraits::SearchIndexFormat::FeaturesWithRankAndCenter)
{
using TValue = FeatureWithRankAndCenter;
return RetrieveAddressFeaturesImpl<TValue>(id, value, cancellable, params);
}
else if (mwmTraits.GetSearchIndexFormat() ==
version::MwmTraits::SearchIndexFormat::CompressedBitVector)
{
using TValue = FeatureIndexValue;
return RetrieveAddressFeaturesImpl<TValue>(id, value, cancellable, params);
}
return unique_ptr<coding::CompressedBitVector>();
// Entry point for postcode retrieval: lets Selector pick the
// RetrievePostcodeFeaturesImpl instantiation matching the search index
// format of |value| and returns its result.
unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeatures(
    MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
    TokenSlice const & slice)
{
  return Selector<RetrievePostcodeFeaturesAdaptor>()(id, value, cancellable, slice);
}
unique_ptr<coding::CompressedBitVector> RetrieveGeometryFeatures(

View file

@ -21,16 +21,24 @@ namespace search
namespace v2
{
class MwmContext;
class TokenSlice;
// Retrieves from the search index corresponding to |value| all
// features matching to |params|.
unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
SearchQueryParams const & params);
unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(MwmSet::MwmId const & id,
MwmValue & value,
my::Cancellable const & cancellable,
SearchQueryParams const & params);
// Retrieves from the search index corresponding to |value| all
// features whose postcodes match |slice|.
unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeatures(
MwmSet::MwmId const & id, MwmValue & value, my::Cancellable const & cancellable,
TokenSlice const & slice);
// Retrieves from the geometry index corresponding to |value| all features belonging to |rect|.
unique_ptr<coding::CompressedBitVector> RetrieveGeometryFeatures(
MwmContext const & context, my::Cancellable const & cancellable,
m2::RectD const & rect, int scale);
MwmContext const & context, my::Cancellable const & cancellable, m2::RectD const & rect,
int scale);
} // namespace v2
} // namespace search

View file

@ -54,6 +54,7 @@ HEADERS += \
v2/locality_scorer.hpp \
v2/mwm_context.hpp \
v2/nested_rects_cache.hpp \
v2/postcodes_matcher.hpp \
v2/pre_ranking_info.hpp \
v2/rank_table_cache.hpp \
v2/ranking_info.hpp \
@ -62,6 +63,7 @@ HEADERS += \
v2/search_query_v2.hpp \
v2/stats_cache.hpp \
v2/street_vicinity_loader.hpp \
v2/token_slice.hpp \
SOURCES += \
approximate_string_match.cpp \
@ -99,6 +101,7 @@ SOURCES += \
v2/locality_scorer.cpp \
v2/mwm_context.cpp \
v2/nested_rects_cache.cpp \
v2/postcodes_matcher.cpp \
v2/pre_ranking_info.cpp \
v2/rank_table_cache.cpp \
v2/ranking_info.cpp \
@ -106,3 +109,4 @@ SOURCES += \
v2/search_model.cpp \
v2/search_query_v2.cpp \
v2/street_vicinity_loader.cpp \
v2/token_slice.cpp \

View file

@ -33,7 +33,14 @@ void SearchTest::RegisterCountry(string const & name, m2::RectD const & rect)
bool SearchTest::ResultsMatch(string const & query,
vector<shared_ptr<tests_support::MatchingRule>> const & rules)
{
tests_support::TestSearchRequest request(m_engine, query, "en", Mode::Everywhere, m_viewport);
return ResultsMatch(query, "en" /* locale */, rules);
}
// Runs |query| with the given |locale| in Mode::Everywhere over the
// current viewport, waits for the request to finish and returns whether
// the results match |rules|.
bool SearchTest::ResultsMatch(string const & query,
                              string const & locale,
                              vector<shared_ptr<tests_support::MatchingRule>> const & rules)
{
  tests_support::TestSearchRequest searchRequest(m_engine, query, locale, Mode::Everywhere,
                                                 m_viewport);
  searchRequest.Wait();

  auto const & found = searchRequest.Results();
  return MatchResults(m_engine, rules, found);
}

View file

@ -72,11 +72,26 @@ public:
return id;
}
// Builds an mwm named "testWorld" of the world type, forwarding |fn|
// to BuildMwm(), and returns the id BuildMwm() reports.
template <typename TBuildFn>
MwmSet::MwmId BuildWorld(TBuildFn && fn)
{
  auto const type = feature::DataHeader::world;
  return BuildMwm("testWorld", type, forward<TBuildFn>(fn));
}
// Builds an mwm called |name| of the country type, forwarding |fn|
// to BuildMwm(), and returns the id BuildMwm() reports.
template <typename TBuildFn>
MwmSet::MwmId BuildCountry(string const & name, TBuildFn && fn)
{
  auto const type = feature::DataHeader::country;
  return BuildMwm(name, type, forward<TBuildFn>(fn));
}
inline void SetViewport(m2::RectD const & viewport) { m_viewport = viewport; }
bool ResultsMatch(string const & query,
vector<shared_ptr<tests_support::MatchingRule>> const & rules);
bool ResultsMatch(string const & query, string const & locale,
vector<shared_ptr<tests_support::MatchingRule>> const & rules);
bool ResultsMatch(string const & query, Mode mode,
vector<shared_ptr<tests_support::MatchingRule>> const & rules);

View file

@ -1,10 +1,15 @@
#include "testing/testing.hpp"
#include "search/retrieval.hpp"
#include "search/search_integration_tests/helpers.hpp"
#include "search/search_tests_support/test_feature.hpp"
#include "search/search_tests_support/test_mwm_builder.hpp"
#include "search/search_tests_support/test_results_matching.hpp"
#include "search/search_tests_support/test_search_request.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/feature.hpp"
#include "indexer/index.hpp"
#include "geometry/point2d.hpp"
#include "geometry/rect2d.hpp"
@ -77,39 +82,38 @@ UNIT_CLASS_TEST(SearchQueryV2Test, Smoke)
TestPOI lantern1(m2::PointD(10.0005, 10.0005), "lantern 1", "en");
TestPOI lantern2(m2::PointD(10.0006, 10.0005), "lantern 2", "en");
BuildMwm("testWorld", feature::DataHeader::world, [&](TestMwmBuilder & builder)
{
builder.Add(wonderlandCountry);
builder.Add(losAlamosCity);
builder.Add(mskCity);
});
auto wonderlandId =
BuildMwm(countryName, feature::DataHeader::country, [&](TestMwmBuilder & builder)
{
builder.Add(losAlamosCity);
builder.Add(mskCity);
builder.Add(longPondVillage);
BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(wonderlandCountry);
builder.Add(losAlamosCity);
builder.Add(mskCity);
});
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder)
{
builder.Add(losAlamosCity);
builder.Add(mskCity);
builder.Add(longPondVillage);
builder.Add(feynmanStreet);
builder.Add(bohrStreet1);
builder.Add(bohrStreet2);
builder.Add(bohrStreet3);
builder.Add(firstAprilStreet);
builder.Add(feynmanStreet);
builder.Add(bohrStreet1);
builder.Add(bohrStreet2);
builder.Add(bohrStreet3);
builder.Add(firstAprilStreet);
builder.Add(feynmanHouse);
builder.Add(bohrHouse);
builder.Add(hilbertHouse);
builder.Add(descartesHouse);
builder.Add(bornHouse);
builder.Add(feynmanHouse);
builder.Add(bohrHouse);
builder.Add(hilbertHouse);
builder.Add(descartesHouse);
builder.Add(bornHouse);
builder.Add(busStop);
builder.Add(tramStop);
builder.Add(quantumTeleport1);
builder.Add(quantumTeleport2);
builder.Add(quantumCafe);
builder.Add(lantern1);
builder.Add(lantern2);
});
builder.Add(busStop);
builder.Add(tramStop);
builder.Add(quantumTeleport1);
builder.Add(quantumTeleport2);
builder.Add(quantumCafe);
builder.Add(lantern1);
builder.Add(lantern2);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
{
@ -182,11 +186,11 @@ UNIT_CLASS_TEST(SearchQueryV2Test, SearchInWorld)
TestCountry wonderland(m2::PointD(0, 0), countryName, "en");
TestCity losAlamos(m2::PointD(0, 0), "Los Alamos", "en", 100 /* rank */);
auto testWorldId = BuildMwm("testWorld", feature::DataHeader::world, [&](TestMwmBuilder & builder)
{
builder.Add(wonderland);
builder.Add(losAlamos);
});
auto testWorldId = BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(wonderland);
builder.Add(losAlamos);
});
RegisterCountry(countryName, m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(-0.5, -0.5)));
@ -213,16 +217,15 @@ UNIT_CLASS_TEST(SearchQueryV2Test, SearchByName)
"Hyde Park", "en");
TestPOI cafe(m2::PointD(1.0, 1.0), "London Cafe", "en");
auto worldId = BuildMwm("testWorld", feature::DataHeader::world, [&](TestMwmBuilder & builder)
{
builder.Add(london);
});
auto wonderlandId =
BuildMwm(countryName, feature::DataHeader::country, [&](TestMwmBuilder & builder)
{
builder.Add(hydePark);
builder.Add(cafe);
});
auto worldId = BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(london);
});
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder)
{
builder.Add(hydePark);
builder.Add(cafe);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(-0.9, -0.9)));
{
@ -248,11 +251,11 @@ UNIT_CLASS_TEST(SearchQueryV2Test, DisableSuggests)
TestCity london1(m2::PointD(1, 1), "London", "en", 100 /* rank */);
TestCity london2(m2::PointD(-1, -1), "London", "en", 100 /* rank */);
auto worldId = BuildMwm("testWorld", feature::DataHeader::world, [&](TestMwmBuilder & builder)
{
builder.Add(london1);
builder.Add(london2);
});
auto worldId = BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(london1);
builder.Add(london2);
});
SetViewport(m2::RectD(m2::PointD(0.5, 0.5), m2::PointD(1.5, 1.5)));
{
@ -299,21 +302,20 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
TestPOI cafe2(m2::PointD(-0.99, -0.99), "", "en");
cafe2.SetTypes({{"amenity", "cafe"}});
auto worldId = BuildMwm("testWorld", feature::DataHeader::world, [&](TestMwmBuilder & builder)
{
builder.Add(sanFrancisco);
builder.Add(lermontovo);
});
auto wonderlandId = BuildMwm(countryName, feature::DataHeader::country, [&](TestMwmBuilder & builder)
{
builder.Add(cafe1);
builder.Add(cafe2);
builder.Add(goldenGateBridge);
builder.Add(goldenGateStreet);
builder.Add(lermontov);
builder.Add(waterfall);
});
auto worldId = BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(sanFrancisco);
builder.Add(lermontovo);
});
auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder)
{
builder.Add(cafe1);
builder.Add(cafe2);
builder.Add(goldenGateBridge);
builder.Add(goldenGateStreet);
builder.Add(lermontov);
builder.Add(waterfall);
});
SetViewport(m2::RectD(m2::PointD(-0.5, -0.5), m2::PointD(0.5, 0.5)));
{
@ -359,5 +361,65 @@ UNIT_CLASS_TEST(SearchQueryV2Test, TestRankingInfo)
TEST(ResultsMatch("waterfall", rules), ());
}
}
// Checks that a building's postcode ends up in the search index of its
// country mwm and that an ordinary address query still finds the building.
UNIT_CLASS_TEST(SearchQueryV2Test, TestPostcodes)
{
string const countryName = "Russia";
// Fixture: a city, a street running through it and one building on that
// street which carries the postcode "141701".
TestCity city(m2::PointD(0, 0), "Долгопрудный", "ru", 100 /* rank */);
TestStreet street(
vector<m2::PointD>{m2::PointD(-0.5, 0.0), m2::PointD(0, 0), m2::PointD(0.5, 0.0)},
"Первомайская", "ru");
TestBuilding building(m2::PointD(0.0, 0.00001), "", "28 а", street, "ru");
building.SetPostcode("141701");
// The city goes to the world mwm, the street and the building to the
// country mwm.
BuildWorld([&](TestMwmBuilder & builder)
{
builder.Add(city);
});
auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & builder)
{
builder.Add(street);
builder.Add(building);
});
// Tests that postcode is added to the search index.
{
auto handle = m_engine.GetMwmHandleById(countryId);
TEST(handle.IsAlive(), ());
my::Cancellable cancellable;
// A single-token query holding the postcode in its indexed form.
SearchQueryParams params;
params.m_tokens.emplace_back();
params.m_tokens.back().push_back(PostcodeToString(strings::MakeUniString("141701")));
auto * value = handle.GetValue<MwmValue>();
auto features = v2::RetrievePostcodeFeatures(countryId, *value, cancellable,
TokenSlice(params, 0, params.m_tokens.size()));
// Exactly one feature is expected to carry this postcode.
TEST_EQUAL(1, features->PopCount(), ());
// Find the position of the only set bit: it is the feature index.
uint64_t index = 0;
while (!features->GetBit(index))
++index;
// Load that feature and make sure it is the building.
Index::FeaturesLoaderGuard loader(m_engine, countryId);
FeatureType ft;
loader.GetFeatureByIndex(index, ft);
auto rule = ExactMatch(countryId, building);
TEST(rule->Matches(ft), ());
}
// An address query without the postcode must find the building.
{
TRules rules{ExactMatch(countryId, building)};
TEST(ResultsMatch("Долгопрудный первомайская 28а", "ru" /* locale */, rules), ());
}
// TODO (@y): uncomment this test and add more tests once postcode
// search is implemented.
//
// {
// TRules rules{ExactMatch(countryId, building)};
// TEST(ResultsMatch("Долгопрудный первомайская 28а, 141701", "ru" /* locale */, rules), ());
// }
}
} // namespace
} // namespace search

View file

@ -12,6 +12,7 @@
#include "search/v2/pre_ranking_info.hpp"
#include "search/v2/ranking_info.hpp"
#include "search/v2/ranking_utils.hpp"
#include "search/v2/token_slice.hpp"
#include "storage/country_info_getter.hpp"
#include "storage/index.hpp"
@ -464,10 +465,8 @@ void Query::SetQuery(string const & query)
search::Delimiters delims;
SplitUniString(NormalizeAndSimplifyString(query), MakeBackInsertFunctor(m_tokens), delims);
bool checkPrefix = true;
// Assign prefix with last parsed token.
if (checkPrefix && !m_tokens.empty() && !delims(strings::LastUniChar(query)))
if (!m_tokens.empty() && !delims(strings::LastUniChar(query)))
{
m_prefix.swap(m_tokens.back());
m_tokens.pop_back();
@ -623,7 +622,7 @@ class PreResult2Maker
info.m_nameScore = v2::NAME_SCORE_ZERO;
v2::TokensSliceNoCategories slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
v2::TokenSliceNoCategories slice(m_params, preInfo.m_startToken, preInfo.m_endToken);
for (auto const & lang : m_params.m_langs)
{

View file

@ -0,0 +1,72 @@
#include "../../testing/testing.hpp"
#include "search/search_query_params.hpp"
#include "search/v2/postcodes_matcher.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/string.hpp"
#include "std/vector.hpp"
using namespace strings;
namespace search
{
namespace v2
{
namespace
{
// Test helper: tokenizes |s|, packs the tokens into SearchQueryParams and
// asks the TokenSlice overload of LooksLikePostcode() about the slice
// covering all of them. When |checkPrefix| is set and the tokenizer
// reports the last token as a prefix, that token is stored as the prefix
// of the params instead of as a full token.
bool LooksLikePostcode(string const & s, bool checkPrefix)
{
  vector<UniString> tokens;
  bool const lastTokenIsPrefix =
      TokenizeStringAndCheckIfLastTokenIsPrefix(s, tokens, search::Delimiters());

  // Counted before the prefix (if any) is moved out of |tokens|.
  size_t const numTokens = tokens.size();

  SearchQueryParams params;
  bool const splitPrefix = checkPrefix && lastTokenIsPrefix;
  if (splitPrefix)
  {
    params.m_prefixTokens.push_back(tokens.back());
    tokens.pop_back();
  }

  // Every remaining token becomes a full token with a single synonym.
  for (size_t i = 0; i != tokens.size(); ++i)
  {
    params.m_tokens.emplace_back();
    params.m_tokens.back().push_back(tokens[i]);
  }

  return LooksLikePostcode(TokenSlice(params, 0, numTokens));
}
// Smoke test for the postcode matcher: strings shaped like postcodes
// must be accepted, ordinary address fragments must be rejected.
UNIT_TEST(PostcodesMatcher_Smoke)
{
  // Postcode-shaped inputs.
  TEST(LooksLikePostcode("141701", false /* checkPrefix */), ());
  TEST(LooksLikePostcode("141", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("BA6 8JP", true /* checkPrefix */), ());
  // Hyphen-separated variant; this line used to repeat the space-separated
  // case above verbatim and so checked nothing new.
  TEST(LooksLikePostcode("BA6-8JP", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("BA22 9HR", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("BA22", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("DE56 4FW", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("NY 1000", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("AZ 85203", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("AZ", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("803 0271", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("803-0271", true /* checkPrefix */), ());
  TEST(LooksLikePostcode("〒803-0271", true /* checkPrefix */), ());

  // Inputs that must not be taken for postcodes.
  TEST(!LooksLikePostcode("1 мая", true /* checkPrefix */), ());
  TEST(!LooksLikePostcode("1 мая улица", true /* checkPrefix */), ());
  TEST(!LooksLikePostcode("москва", true /* checkPrefix */), ());
  TEST(!LooksLikePostcode("39 с 79", true /* checkPrefix */), ());
}
} // namespace
} // namespace v2
} // namespace search

View file

@ -2,6 +2,7 @@
#include "search/search_query_params.hpp"
#include "search/v2/ranking_utils.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
@ -32,7 +33,7 @@ NameScore GetScore(string const & name, string const & query, size_t startToken,
params.m_prefixTokens.swap(params.m_tokens.back());
params.m_tokens.pop_back();
}
return GetNameScore(name, TokensSlice(params, startToken, endToken));
return GetNameScore(name, TokenSlice(params, startToken, endToken));
}
UNIT_TEST(NameTest_Smoke)

View file

@ -27,6 +27,7 @@ SOURCES += \
latlon_match_test.cpp \
locality_finder_test.cpp \
locality_scorer_test.cpp \
postcodes_matcher_tests.cpp \
query_saver_tests.cpp \
ranking_tests.cpp \
string_intersection_test.cpp \

View file

@ -19,6 +19,7 @@ public:
bool Matches(FeatureType const & feature) const;
inline void SetPostcode(string const & postcode) { m_postcode = postcode; }
inline uint64_t GetId() const { return m_id; }
inline string const & GetName() const { return m_name; }
virtual void Serialize(FeatureBuilder1 & fb) const;

View file

@ -6,6 +6,7 @@
namespace search
{
static const uint8_t kCategoriesLang = 128;
static const uint8_t kPostcodesLang = 129;
static const uint8_t kPointCodingBits = 20;
} // namespace search

View file

@ -979,6 +979,8 @@ void Geocoder::LimitedSearch(FeaturesFilter const & filter)
m_filter = &filter;
MY_SCOPE_GUARD(resetFilter, [&]() { m_filter = nullptr; });
// TODO (@y): implement postcodes matching here.
// The order is rather important. Match streets first, then all other stuff.
GreedilyMatchStreets();
MatchPOIsAndBuildings(0 /* curToken */);

View file

@ -1,5 +1,7 @@
#include "search/v2/locality_scorer.hpp"
#include "search/v2/token_slice.hpp"
#include "std/algorithm.hpp"
namespace search
@ -99,8 +101,8 @@ void LocalityScorer::SortByName(vector<ExLocality> & ls) const
auto score = NAME_SCORE_ZERO;
for (auto const & name : names)
{
score = max(score, GetNameScore(name, v2::TokensSlice(m_params, l.m_locality.m_startToken,
l.m_locality.m_endToken)));
score = max(score, GetNameScore(name, v2::TokenSlice(m_params, l.m_locality.m_startToken,
l.m_locality.m_endToken)));
}
l.m_nameScore = score;
}

View file

@ -0,0 +1,168 @@
#include "search/v2/postcodes_matcher.hpp"
#include "search/v2/token_slice.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/logging.hpp"
#include "base/macros.hpp"
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
#include "std/transform_iterator.hpp"
#include "std/unique_ptr.hpp"
#include "std/utility.hpp"
#include "std/vector.hpp"
using namespace strings;
namespace search
{
namespace v2
{
namespace
{
// Top patterns for postcodes. See
// search/search_quality/clusterize_postcodes.lisp for details on how
// these patterns were constructed.
//
// Pattern alphabet: 'n' stands for an ASCII digit and 'a' for an
// ASCII Latin letter -- this is the mapping SimplifyChar applies to
// query characters before they are compared with a pattern.  Any
// other character stands for itself.  Each pattern is split into
// tokens with search::Delimiters when it is added to the matcher.
char const * const g_patterns[] = {
"aa nnnn", "aa nnnnn", "aaa nnnn", "aan", "aan naa", "aana naa", "aann",
"aann naa", "aannaa", "aannnaa", "aannnn", "an naa", "ana naa", "ana nan",
"ananan", "ann aann", "ann naa", "annnnaaa", "nn nnn", "nnn", "nnn nn",
"nnn nnn", "nnn nnnn", "nnnn", "nnnn aa", "nnnn nnn", "nnnnaa", "nnnnn",
"nnnnn nnn", "nnnnn nnnn", "nnnnn nnnnn", "nnnnnn", "nnnnnnn", "nnnnnnnn", "〒nnn nnnn"};
// Collapses a character to its pattern class: ASCII digits become 'n',
// ASCII Latin letters become 'a', every other character is returned
// unchanged.
UniChar SimplifyChar(UniChar const & c)
{
  UniChar simplified = c;
  if (IsASCIIDigit(c))
    simplified = 'n';
  else if (IsASCIILatin(c))
    simplified = 'a';
  return simplified;
}
// A node of the postcode patterns trie.  Outgoing edges are stored in
// a small vector of (character, child) pairs and are looked up by a
// linear scan.
struct Node
{
  Node() : m_isLeaf(false) {}

  // Follows the edge labelled |c|.  Returns nullptr when this node has
  // no such edge.
  Node const * Move(UniChar c) const
  {
    for (auto it = m_moves.begin(); it != m_moves.end(); ++it)
    {
      if (it->first == c)
        return it->second.get();
    }
    return nullptr;
  }

  // Follows the edges labelled by the characters in [begin, end) one
  // by one.  Returns nullptr as soon as some edge is missing.
  template <typename TIt>
  Node const * Move(TIt begin, TIt end) const
  {
    Node const * node = this;
    while (begin != end && node)
    {
      node = node->Move(*begin);
      ++begin;
    }
    return node;
  }

  // Returns the child reachable by the edge labelled |c|, creating
  // both the edge and the child when they do not exist yet.
  Node & MakeMove(UniChar c)
  {
    for (auto it = m_moves.begin(); it != m_moves.end(); ++it)
    {
      if (it->first == c)
        return *it->second;
    }
    m_moves.emplace_back(c, make_unique<Node>());
    return *m_moves.back().second;
  }

  // Walks (and creates where necessary) the path labelled by the
  // characters in [begin, end) and returns the node at its end.
  template <typename TIt>
  Node & MakeMove(TIt begin, TIt end)
  {
    Node * node = this;
    while (begin != end)
    {
      node = &node->MakeMove(*begin);
      ++begin;
    }
    return *node;
  }

  buffer_vector<pair<UniChar, unique_ptr<Node>>, 2> m_moves;
  // True when a complete pattern ends at this node.
  bool m_isLeaf;

  DISALLOW_COPY(Node);
};
// This class puts all strings from g_patterns to a trie with a low
// branching factor and matches queries against these patterns.
class PostcodesMatcher
{
public:
PostcodesMatcher() : m_root(), m_maxNumTokensInPostcode(0)
{
search::Delimiters delimiters;
for (auto const * pattern : g_patterns)
AddString(MakeUniString(pattern), delimiters);
}
// Checks that given tokens match to at least one of postcodes
// patterns.
//
// Complexity: O(total length of tokens in |slice|).
bool HasString(TokenSlice const & slice) const
{
Node const * cur = &m_root;
for (size_t i = 0; i < slice.Size() && cur; ++i)
{
auto const & s = slice.Get(i).front();
cur = cur->Move(make_transform_iterator(s.begin(), &SimplifyChar),
make_transform_iterator(s.end(), &SimplifyChar));
if (cur && i + 1 < slice.Size())
cur = cur->Move(' ');
}
if (!cur)
return false;
if (slice.Size() > 0 && slice.IsPrefix(slice.Size() - 1))
return true;
return cur->m_isLeaf;
}
inline size_t GetMaxNumTokensInPostcode() const { return m_maxNumTokensInPostcode; }
private:
void AddString(UniString const & s, search::Delimiters & delimiters)
{
vector<UniString> tokens;
SplitUniString(s, MakeBackInsertFunctor(tokens), delimiters);
m_maxNumTokensInPostcode = max(m_maxNumTokensInPostcode, tokens.size());
Node * cur = &m_root;
for (size_t i = 0; i < tokens.size(); ++i)
{
cur = &cur->MakeMove(tokens[i].begin(), tokens[i].end());
if (i + 1 != tokens.size())
cur = &cur->MakeMove(' ');
}
cur->m_isLeaf = true;
}
Node m_root;
size_t m_maxNumTokensInPostcode;
DISALLOW_COPY(PostcodesMatcher);
};
// Returns the process-wide matcher instance.  The instance is a
// function-local static, so it is built on the first call.
PostcodesMatcher const & GetPostcodesMatcher()
{
  static PostcodesMatcher matcher;
  return matcher;
}
} // namespace
// Returns true when the tokens in |slice| match one of the postcode
// patterns known to the shared matcher.
bool LooksLikePostcode(TokenSlice const & slice)
{
  PostcodesMatcher const & matcher = GetPostcodesMatcher();
  return matcher.HasString(slice);
}
size_t GetMaxNumTokensInPostcode() { return GetPostcodesMatcher().GetMaxNumTokensInPostcode(); }
} // namespace v2
} // namespace search

View file

@ -0,0 +1,15 @@
#pragma once
#include "std/cstdint.hpp"
namespace search
{
namespace v2
{
class TokenSlice;
// Returns true when the tokens in |slice| match one of the postcode
// patterns known to the matcher.
bool LooksLikePostcode(TokenSlice const & slice);
// Returns the largest number of tokens a known postcode pattern
// consists of.
size_t GetMaxNumTokensInPostcode();
} // namespace v2
} // namespace search

View file

@ -7,7 +7,6 @@
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
#include "base/assert.hpp"
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
@ -42,74 +41,6 @@ enum NameScore
NAME_SCORE_COUNT
};
// A read-only view over the token range [startToken, endToken) of a
// SearchQueryParams instance.  Only a reference to the params is
// kept, so the slice must not outlive them.
class TokensSlice
{
public:
  TokensSlice(SearchQueryParams const & params, size_t startToken, size_t endToken)
    : m_params(params)
    , m_offset(startToken)
    , m_size(endToken - startToken)
  {
    ASSERT_LESS_OR_EQUAL(startToken, endToken, ());
  }

  // Number of tokens covered by the slice.
  size_t Size() const { return m_size; }

  bool Empty() const { return m_size == 0; }

  // Returns what the params report for the |i|-th token of the slice.
  SearchQueryParams::TSynonymsVector const & Get(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    return m_params.GetTokens(m_offset + i);
  }

  // True when the |i|-th token of the slice has, in the params, an
  // index equal to the number of entries in m_tokens.
  bool IsPrefix(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    size_t const index = m_offset + i;
    return index == m_params.m_tokens.size();
  }

private:
  SearchQueryParams const & m_params;
  size_t const m_offset;
  size_t const m_size;
};
// A read-only view over those tokens of [startToken, endToken) that
// are not flagged in m_isCategorySynonym.  Only a reference to the
// params is kept, so the slice must not outlive them.
class TokensSliceNoCategories
{
public:
  TokensSliceNoCategories(SearchQueryParams const & params, size_t startToken, size_t endToken)
    : m_params(params)
  {
    ASSERT_LESS_OR_EQUAL(startToken, endToken, ());

    m_indexes.reserve(endToken - startToken);
    for (size_t i = startToken; i < endToken; ++i)
    {
      if (m_params.m_isCategorySynonym[i])
        continue;
      m_indexes.push_back(i);
    }
  }

  // Number of tokens that survived the filtering.
  size_t Size() const { return m_indexes.size(); }

  bool Empty() const { return Size() == 0; }

  // Returns what the params report for the |i|-th kept token.
  SearchQueryParams::TSynonymsVector const & Get(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    return m_params.GetTokens(m_indexes[i]);
  }

  // True when the |i|-th kept token has, in the params, an index equal
  // to the number of entries in m_tokens.
  bool IsPrefix(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    size_t const index = m_indexes[i];
    return index == m_params.m_tokens.size();
  }

private:
  SearchQueryParams const & m_params;
  vector<size_t> m_indexes;
};
template <typename TSlice>
NameScore GetNameScore(string const & name, TSlice const & slice)
{

27
search/v2/token_slice.cpp Normal file
View file

@ -0,0 +1,27 @@
#include "search/v2/token_slice.hpp"
namespace search
{
namespace v2
{
// Builds a view over tokens [startToken, endToken) of |params|.
// |startToken| must not exceed |endToken|.
TokenSlice::TokenSlice(SearchQueryParams const & params, size_t startToken, size_t endToken)
  : m_params(params)
  , m_offset(startToken)
  , m_size(endToken - startToken)
{
  ASSERT_LESS_OR_EQUAL(startToken, endToken, ());
}
// Collects the indexes of tokens in [startToken, endToken) that are
// not flagged in |params.m_isCategorySynonym|.  |startToken| must not
// exceed |endToken|.
TokenSliceNoCategories::TokenSliceNoCategories(SearchQueryParams const & params, size_t startToken,
                                               size_t endToken)
  : m_params(params)
{
  ASSERT_LESS_OR_EQUAL(startToken, endToken, ());

  m_indexes.reserve(endToken - startToken);
  for (size_t i = startToken; i < endToken; ++i)
  {
    if (m_params.m_isCategorySynonym[i])
      continue;
    m_indexes.push_back(i);
  }
}
} // namespace v2
} // namespace search

67
search/v2/token_slice.hpp Normal file
View file

@ -0,0 +1,67 @@
#pragma once
#include "search/search_query_params.hpp"
#include "base/assert.hpp"
#include "std/cstdint.hpp"
#include "std/vector.hpp"
namespace search
{
namespace v2
{
// A read-only view over the token range [startToken, endToken) of a
// SearchQueryParams instance.  Only a reference to the params is
// kept, so the slice must not outlive them.
class TokenSlice
{
public:
  TokenSlice(SearchQueryParams const & params, size_t startToken, size_t endToken);

  // Number of tokens covered by the slice.
  size_t Size() const { return m_size; }

  bool Empty() const { return m_size == 0; }

  // Returns what the params report for the |i|-th token of the slice.
  SearchQueryParams::TSynonymsVector const & Get(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    return m_params.GetTokens(m_offset + i);
  }

  // True when the |i|-th token of the slice has, in the params, an
  // index equal to the number of entries in m_tokens.
  bool IsPrefix(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    size_t const index = m_offset + i;
    return index == m_params.m_tokens.size();
  }

private:
  SearchQueryParams const & m_params;
  size_t const m_offset;
  size_t const m_size;
};
// A read-only view over a subset of tokens of a SearchQueryParams
// instance; the subset is stored as a list of token indexes filled in
// by the constructor.  Only a reference to the params is kept, so the
// slice must not outlive them.
class TokenSliceNoCategories
{
public:
  TokenSliceNoCategories(SearchQueryParams const & params, size_t startToken, size_t endToken);

  // Number of tokens kept in the slice.
  size_t Size() const { return m_indexes.size(); }

  bool Empty() const { return Size() == 0; }

  // Returns what the params report for the |i|-th kept token.
  SearchQueryParams::TSynonymsVector const & Get(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    return m_params.GetTokens(m_indexes[i]);
  }

  // True when the |i|-th kept token has, in the params, an index equal
  // to the number of entries in m_tokens.
  bool IsPrefix(size_t i) const
  {
    ASSERT_LESS(i, Size(), ());
    size_t const index = m_indexes[i];
    return index == m_params.m_tokens.size();
  }

private:
  SearchQueryParams const & m_params;
  vector<size_t> m_indexes;
};
} // namespace v2
} // namespace search