From e4035c7d85f41698fd8f732bdce4f16d33477bb3 Mon Sep 17 00:00:00 2001 From: Maxim Pimenov Date: Fri, 25 Mar 2016 12:16:48 +0300 Subject: [PATCH] [search] Fixed emoji. The "information box" emoji U+2139 was converted to the letter "i" after all simplifications. As a result, every token that started with this letter had the tourism-information category as its synonym. This was the only case where a normalized and simplified emoji resulted in a pure ASCII string. --- data/categories.txt | 2 +- indexer/categories_holder.cpp | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/data/categories.txt b/data/categories.txt index dd33adb99b..bfb056d88e 100644 --- a/data/categories.txt +++ b/data/categories.txt @@ -1565,7 +1565,7 @@ fi:3Näköalatasanne|nähtävyydet sw:Genge|utalii tourism-information -en:4Tourist Information|4information|sights|U+2139|U+1F481 +en:4Tourist Information|4information|sights|U+1F481 ru:4Туринформация|информация|достопримечательность uk:4Турінформація|інформація|пам’ятка|пам’ятні місця de:4Tourist-Information|Information|Sehenswürdigkeit diff --git a/indexer/categories_holder.cpp b/indexer/categories_holder.cpp index e6678c55ca..bbef05a57b 100644 --- a/indexer/categories_holder.cpp +++ b/indexer/categories_holder.cpp @@ -154,14 +154,21 @@ void CategoriesHolder::LoadFromStream(istream & s) using namespace strings; if (StartsWith(name.m_name, "U+")) { + auto const code = name.m_name; int c; if (!to_int(name.m_name.c_str() + 2, c, 16)) { - LOG(LWARNING, ("Bad emoji code:", name.m_name)); + LOG(LWARNING, ("Bad emoji code:", code)); continue; } name.m_name = ToUtf8(UniString(1, static_cast(c))); + + if (IsASCIIString(ToUtf8(search::NormalizeAndSimplifyString(name.m_name)))) + { + LOG(LWARNING, ("Bad emoji code:", code)); + continue; + } } cat.m_synonyms.push_back(name);