[search] Fixed emoji.

The "information source" emoji U+2139 was converted to the letter "i" after
normalization and simplification. As a result, the tourism-information
category was treated as a synonym of every token that started with this letter.

This was the only case where a normalized and simplified emoji
resulted in a pure ASCII string.
This commit is contained in:
Maxim Pimenov 2016-03-25 12:16:48 +03:00
parent cd2d0868b2
commit e4035c7d85
2 changed files with 9 additions and 2 deletions

View file

@ -1565,7 +1565,7 @@ fi:3Näköalatasanne|nähtävyydet
sw:Genge|utalii
tourism-information
en:4Tourist Information|4information|sights|U+2139|U+1F481
en:4Tourist Information|4information|sights|U+1F481
ru:4Туринформация|информация|достопримечательность
uk:4Турінформація|інформація|пам’ятка|пам’ятні місця
de:4Tourist-Information|Information|Sehenswürdigkeit

View file

@ -154,14 +154,21 @@ void CategoriesHolder::LoadFromStream(istream & s)
using namespace strings;
if (StartsWith(name.m_name, "U+"))
{
auto const code = name.m_name;
int c;
if (!to_int(name.m_name.c_str() + 2, c, 16))
{
LOG(LWARNING, ("Bad emoji code:", name.m_name));
LOG(LWARNING, ("Bad emoji code:", code));
continue;
}
name.m_name = ToUtf8(UniString(1, static_cast<UniChar>(c)));
if (IsASCIIString(ToUtf8(search::NormalizeAndSimplifyString(name.m_name))))
{
LOG(LWARNING, ("Bad emoji code:", code));
continue;
}
}
cat.m_synonyms.push_back(name);