forked from organicmaps/organicmaps
[search] Add PreprocessBeforeTokenization to process abbreviations which can be split during tokenization
This commit is contained in:
parent
0e658ee411
commit
6f1e6de8f7
4 changed files with 76 additions and 1 deletions
|
@ -128,6 +128,32 @@ UniString NormalizeAndSimplifyString(string const & s)
|
|||
*/
|
||||
}
|
||||
|
||||
// Expands, in place, abbreviations that tokenization would otherwise break apart
// (e.g. "пр-т" -> "проспект").
//
// An abbreviation is expanded only when it stands alone: the character before it must be
// the beginning of |query| or be accepted by search::Delimiters, and the character after it
// must be the end of |query| or be accepted by search::Delimiters. Occurrences embedded in
// a longer word are left untouched.
void PreprocessBeforeTokenization(strings::UniString & query)
{
  // The table is constant, so it is built once instead of on every call.
  static std::vector<std::pair<strings::UniString, strings::UniString>> const kReplacements = {
      {MakeUniString("пр-т"), MakeUniString("проспект")},
      {MakeUniString("пр-д"), MakeUniString("проезд")},
      {MakeUniString("наб-я"), MakeUniString("набережная")}};

  search::Delimiters const delims;

  for (auto const & replacement : kReplacements)
  {
    auto const & abbreviation = replacement.first;
    auto const & expansion = replacement.second;

    auto start = query.begin();
    while ((start = std::search(start, query.end(), abbreviation.begin(), abbreviation.end())) !=
           query.end())
    {
      auto const end = start + abbreviation.size();
      bool const startsToken = start == query.begin() || delims(*(start - 1));
      bool const endsToken = end == query.end() || delims(*end);
      if (!startsToken || !endsToken)
      {
        // Part of a longer word: keep looking from the next character.
        ++start;
        continue;
      }

      // Keep the position as an offset: the old iterator is not reused after Replace()
      // has changed the buffer.
      auto const offset = std::distance(query.begin(), start);
      query.Replace(start, end, expansion.begin(), expansion.end());

      // Resume right after the inserted text; there is no point in rescanning it.
      start = query.begin() + offset + expansion.size();
    }
  }
}
|
||||
|
||||
UniString FeatureTypeToString(uint32_t type)
|
||||
{
|
||||
string const s = "!type:" + to_string(type);
|
||||
|
|
|
@ -16,6 +16,10 @@ namespace search
|
|||
// It does some magic text transformation which greatly helps us to improve our search.
|
||||
strings::UniString NormalizeAndSimplifyString(std::string const & s);
|
||||
|
||||
// Replaces abbreviations that would be split during tokenization with their full forms.
|
||||
// E.g. "пр-т" -> "проспект".
|
||||
void PreprocessBeforeTokenization(strings::UniString & query);
|
||||
|
||||
template <class Delims, typename Fn>
|
||||
void SplitUniString(strings::UniString const & uniS, Fn && f, Delims const & delims)
|
||||
{
|
||||
|
|
|
@ -209,7 +209,9 @@ void Processor::SetQuery(string const & query)
|
|||
vector<strings::UniString> tokens;
|
||||
{
|
||||
search::DelimitersWithExceptions delims(vector<strings::UniChar>{'#'});
|
||||
SplitUniString(NormalizeAndSimplifyString(query), base::MakeBackInsertFunctor(tokens), delims);
|
||||
auto normalizedQuery = NormalizeAndSimplifyString(query);
|
||||
PreprocessBeforeTokenization(normalizedQuery);
|
||||
SplitUniString(normalizedQuery, base::MakeBackInsertFunctor(tokens), delims);
|
||||
}
|
||||
|
||||
search::Delimiters delims;
|
||||
|
|
|
@ -1765,5 +1765,48 @@ UNIT_CLASS_TEST(ProcessorTest, SynonymsTest)
|
|||
TEST(ResultsMatch("собор св петра ", rules), ());
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies that queries written with hyphenated abbreviations ("пр-т", "пр-д", "наб-я")
// find the same street as queries written with the full word, regardless of whether the
// abbreviation comes before or after the street name.
UNIT_CLASS_TEST(ProcessorTest, PreprocessBeforeTokenizationTest)
{
  string const countryName = "Wonderland";

  // Three streets crossing at the origin, one per supported abbreviation.
  TestStreet avenue(
      vector<m2::PointD>{m2::PointD(0.5, -0.5), m2::PointD(0.0, 0.0), m2::PointD(-0.5, 0.5)},
      "Октябрьский проспект", "ru");

  TestStreet passage(
      vector<m2::PointD>{m2::PointD(-0.5, -0.5), m2::PointD(0.0, 0.0), m2::PointD(0.5, 0.5)},
      "Жуков проезд", "ru");

  TestStreet embankment(
      vector<m2::PointD>{m2::PointD(0.0, -0.5), m2::PointD(0.0, 0.0), m2::PointD(0.0, 0.5)},
      "Москворецкая набережная", "ru");

  auto countryId = BuildCountry(countryName, [&](TestMwmBuilder & mwmBuilder) {
    mwmBuilder.Add(avenue);
    mwmBuilder.Add(passage);
    mwmBuilder.Add(embankment);
  });

  SetViewport(m2::RectD(-1, -1, 1, 1));

  // "пр-т" -> "проспект"
  {
    Rules expected{ExactMatch(countryId, avenue)};
    TEST(ResultsMatch("Октябрьский проспект", expected), ());
    TEST(ResultsMatch("пр-т Октябрьский", expected), ());
    TEST(ResultsMatch("Октябрьский пр-т", expected), ());
  }

  // "пр-д" -> "проезд"
  {
    Rules expected{ExactMatch(countryId, passage)};
    TEST(ResultsMatch("Жуков проезд", expected), ());
    TEST(ResultsMatch("пр-д Жуков", expected), ());
    TEST(ResultsMatch("Жуков пр-д", expected), ());
  }

  // "наб-я" -> "набережная"
  {
    Rules expected{ExactMatch(countryId, embankment)};
    TEST(ResultsMatch("Москворецкая набережная", expected), ());
    TEST(ResultsMatch("наб-я Москворецкая", expected), ());
    TEST(ResultsMatch("Москворецкая наб-я", expected), ());
  }
}
|
||||
} // namespace
|
||||
} // namespace search
|
||||
|
|
Loading…
Add table
Reference in a new issue