[search] Add PreprocessBeforeTokenization to process abbreviations which can be split during tokenization

This commit is contained in:
tatiana-yan 2019-02-14 15:47:45 +03:00 committed by mpimenov
parent 0e658ee411
commit 6f1e6de8f7
4 changed files with 76 additions and 1 deletions

View file

@ -128,6 +128,32 @@ UniString NormalizeAndSimplifyString(string const & s)
*/
}
// Replaces whole-token abbreviations in |query| with their full forms
// (e.g. "пр-т" -> "проспект") so the hyphenated short form is not split
// apart by the tokenizer later.
void PreprocessBeforeTokenization(strings::UniString & query)
{
  search::Delimiters const delims;
  // The table is immutable: build it once (thread-safe since C++11) instead
  // of re-allocating six UniStrings on every query.
  static vector<pair<strings::UniString, strings::UniString>> const kReplacements = {
      {MakeUniString("пр-т"), MakeUniString("проспект")},
      {MakeUniString("пр-д"), MakeUniString("проезд")},
      {MakeUniString("наб-я"), MakeUniString("набережная")}};

  for (auto const & replacement : kReplacements)
  {
    auto start = query.begin();
    while ((start = std::search(start, query.end(), replacement.first.begin(),
                                replacement.first.end())) != query.end())
    {
      auto end = start + replacement.first.size();
      // Replace only whole tokens: the match must be bounded by the string
      // ends or by delimiters, so e.g. a match inside a longer word is kept.
      if ((start == query.begin() || delims(*(start - 1))) && (end == query.end() || delims(*end)))
      {
        // Replace() may reallocate the underlying storage; save the offset
        // and rebuild the iterator from the (possibly new) begin().
        auto const dist = distance(query.begin(), start);
        query.Replace(start, end, replacement.second.begin(), replacement.second.end());
        start = query.begin() + dist;
      }
      // Advance past the current position to continue the search.
      start += 1;
    }
  }
}
UniString FeatureTypeToString(uint32_t type)
{
string const s = "!type:" + to_string(type);

View file

@ -16,6 +16,10 @@ namespace search
// It does some magic text transformation which greatly helps us to improve our search.
strings::UniString NormalizeAndSimplifyString(std::string const & s);
// Replaces abbreviations which can be split during tokenization with their full forms.
// E.g. "пр-т" -> "проспект".
void PreprocessBeforeTokenization(strings::UniString & query);
template <class Delims, typename Fn>
void SplitUniString(strings::UniString const & uniS, Fn && f, Delims const & delims)
{

View file

@ -209,7 +209,9 @@ void Processor::SetQuery(string const & query)
vector<strings::UniString> tokens;
{
search::DelimitersWithExceptions delims(vector<strings::UniChar>{'#'});
SplitUniString(NormalizeAndSimplifyString(query), base::MakeBackInsertFunctor(tokens), delims);
auto normalizedQuery = NormalizeAndSimplifyString(query);
PreprocessBeforeTokenization(normalizedQuery);
SplitUniString(normalizedQuery, base::MakeBackInsertFunctor(tokens), delims);
}
search::Delimiters delims;

View file

@ -1765,5 +1765,48 @@ UNIT_CLASS_TEST(ProcessorTest, SynonymsTest)
TEST(ResultsMatch("собор св петра ", rules), ());
}
}
// Checks that hyphenated street-type abbreviations match features whose names
// contain the corresponding full form, in any token position.
UNIT_CLASS_TEST(ProcessorTest, PreprocessBeforeTokenizationTest)
{
  string const countryName = "Wonderland";

  // One street per abbreviation handled by PreprocessBeforeTokenization().
  TestStreet prospekt(
      vector<m2::PointD>{m2::PointD(0.5, -0.5), m2::PointD(0.0, 0.0), m2::PointD(-0.5, 0.5)},
      "Октябрьский проспект", "ru");
  TestStreet proezd(
      vector<m2::PointD>{m2::PointD(-0.5, -0.5), m2::PointD(0.0, 0.0), m2::PointD(0.5, 0.5)},
      "Жуков проезд", "ru");
  TestStreet naberezhnaya(
      vector<m2::PointD>{m2::PointD(0.0, -0.5), m2::PointD(0.0, 0.0), m2::PointD(0.0, 0.5)},
      "Москворецкая набережная", "ru");

  auto wonderlandId = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
    builder.Add(prospekt);
    builder.Add(proezd);
    builder.Add(naberezhnaya);
  });

  SetViewport(m2::RectD(-1, -1, 1, 1));

  // Full form, abbreviation before the name, abbreviation after the name.
  {
    Rules rules = {ExactMatch(wonderlandId, prospekt)};
    TEST(ResultsMatch("Октябрьский проспект", rules), ());
    TEST(ResultsMatch("пр-т Октябрьский", rules), ());
    TEST(ResultsMatch("Октябрьский пр-т", rules), ());
  }
  {
    Rules rules = {ExactMatch(wonderlandId, proezd)};
    TEST(ResultsMatch("Жуков проезд", rules), ());
    TEST(ResultsMatch("пр-д Жуков", rules), ());
    TEST(ResultsMatch("Жуков пр-д", rules), ());
  }
  {
    Rules rules = {ExactMatch(wonderlandId, naberezhnaya)};
    TEST(ResultsMatch("Москворецкая набережная", rules), ());
    TEST(ResultsMatch("наб-я Москворецкая", rules), ());
    TEST(ResultsMatch("Москворецкая наб-я", rules), ());
  }
}
} // namespace
} // namespace search