From 671dcce02a7a73079c66023b6713f5660aa5379f Mon Sep 17 00:00:00 2001 From: vng Date: Wed, 1 Jul 2015 16:50:49 +0300 Subject: [PATCH] =?UTF-8?q?[search]=20Treat=20=E2=80=9C=C4=90,=C4=91?= =?UTF-8?q?=E2=80=9D=20as=20=E2=80=9Cd=E2=80=9D=20letter=20in=20normalizat?= =?UTF-8?q?ion.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- indexer/indexer_tests/search_string_utils_test.cpp | 12 +++++++++--- indexer/search_string_utils.hpp | 4 ++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/indexer/indexer_tests/search_string_utils_test.cpp b/indexer/indexer_tests/search_string_utils_test.cpp index 8278d87ab8..26ed3b52e7 100644 --- a/indexer/indexer_tests/search_string_utils_test.cpp +++ b/indexer/indexer_tests/search_string_utils_test.cpp @@ -18,14 +18,20 @@ UNIT_TEST(NormalizeAndSimplifyStringWithOurTambourines) "Iiİı", "iiii", // Famous turkish "I" letter bug. "ЙЁйёШКИЙй", "йейешкийй", // Better handling of Russian й letter. "ØøÆæŒœ", "ooaeaeoeoe", - "バス", "ハス" + "バス", "ハス", + "âàáạăốợồôểềệếỉđưựứửýĂÂĐÊÔƠƯ", + "aaaaaooooeeeeiduuuuyaadeoou", // Vietnamese + "ăâț", "aat" // Romanian }; */ string const arr[] = {"ÜbërÅłłęšß", "uberallesss", // Basic test case. "Iiİı", "iiii", // Famous turkish "I" letter bug. "ЙЁйёШКИЙй", "иеиешкиии", // Better handling of Russian й letter. - "ØøÆæŒœ", "ooaeaeoeoe", - "バス", "ハス" + "ØøÆæŒœ", "ooaeaeoeoe", // Dansk + "バス", "ハス", + "âàáạăốợồôểềệếỉđưựứửýĂÂĐÊÔƠƯ", + "aaaaaooooeeeeiduuuuyaadeoou", // Vietnamese + "ăâț", "aat" // Romanian }; for (size_t i = 0; i < ARRAY_SIZE(arr); i += 2) diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index d27e43991a..86c9b10092 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -17,6 +17,10 @@ inline strings::UniString NormalizeAndSimplifyString(string const & s) UniChar & c = uniString[i]; switch (c) { + // Replace "d with stroke" to simple d letter. Used in Vietnamese. 
+ // (A Unicode-compliant implementation leaves it unchanged.) + case 0x0110: + case 0x0111: c = 'd'; break; // Replace small turkish dotless 'ı' with dotted 'i'. // Our own invented hack to avoid well-known Turkish I-letter bug. case 0x0131: c = 'i'; break;