diff --git a/base/base.pro b/base/base.pro index 8a3b772c15..db8b80e8c2 100644 --- a/base/base.pro +++ b/base/base.pro @@ -20,7 +20,6 @@ SOURCES += \ memory_mapped_file.cpp \ path_utils.cpp \ condition.cpp \ - utf8_string.cpp \ HEADERS += \ SRC_FIRST.hpp \ @@ -63,4 +62,3 @@ HEADERS += \ buffer_vector.hpp \ path_utils.hpp \ array_adapters.hpp \ - utf8_string.hpp \ diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro index 6d98e5aee5..8c66fff5b7 100644 --- a/base/base_tests/base_tests.pro +++ b/base/base_tests/base_tests.pro @@ -29,6 +29,5 @@ SOURCES += \ matrix_test.cpp \ commands_queue_test.cpp \ buffer_vector_test.cpp \ - utf8_string_test.cpp \ HEADERS += diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index 9e582ada71..4e402db222 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -113,3 +113,11 @@ UNIT_TEST(SimpleTokenizer) } } + +UNIT_TEST(LastUniChar) +{ + TEST_EQUAL(strings::LastUniChar(""), 0, ()); + TEST_EQUAL(strings::LastUniChar("Hello"), 0x6f, ()); + TEST_EQUAL(strings::LastUniChar(" \xD0\x90"), 0x0410, ()); + +} diff --git a/base/base_tests/utf8_string_test.cpp b/base/base_tests/utf8_string_test.cpp deleted file mode 100644 index 0a55cacdc2..0000000000 --- a/base/base_tests/utf8_string_test.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include "../../testing/testing.hpp" - -#include "../utf8_string.hpp" - -using namespace utf8_string; - -bool IsDelimeter(uint32_t symbol) -{ - switch (symbol) - { - case ' ': - case '-': - case '/': - case ',': - case '.': - case 0x0336: - return true; - } - return false; -} - -UNIT_TEST(Utf8_Split) -{ - vector result; - TEST(!Split("", result, &IsDelimeter), ()); - TEST_EQUAL(result.size(), 0, ()); - - TEST(!Split(" - ,. 
", result, &IsDelimeter), ()); - TEST_EQUAL(result.size(), 0, ()); - - TEST(Split("London - is the capital of babai-city.", result, &IsDelimeter), ()); - TEST_EQUAL(result.size(), 7, ()); - TEST_EQUAL(result[0], "London", ()); - TEST_EQUAL(result[6], "city", ()); - - // Доллар подорожал на 500 рублей ̶копеек - char const * s = - "- \xD0\x94\xD0\xBE\xD0\xBB\xD0\xBB\xD0\xB0\xD1\x80\x20\xD0\xBF\xD0\xBE\xD0\xB4\xD0" - "\xBE\xD1\x80\xD0\xBE\xD0\xB6\xD0\xB0\xD0\xBB\x20\xD0\xBD\xD0\xB0\x20\x35\x30\x30" - "\x20\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9\x20\xCC\xB6\xD0\xBA\xD0\xBE" - "\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA -"; - TEST(Split(s, result, &IsDelimeter), ()); - TEST_EQUAL(result.size(), 6, ()); - TEST_EQUAL(result[3], "500", ()); - TEST_EQUAL(result[4], "\xD1\x80\xD1\x83\xD0\xB1\xD0\xBB\xD0\xB5\xD0\xB9", ()); - TEST_EQUAL(result[5], "\xD0\xBA\xD0\xBE\xD0\xBF\xD0\xB5\xD0\xB5\xD0\xBA", ()); -} - -UNIT_TEST(Utf8_Split_MultipleDelimeters) -{ - vector result; - TEST(Split("A B C .,D", result, &IsDelimeter), ()); - char const * expected [] = {"A", "B", "C", "D"}; - TEST_EQUAL(result, vector(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); -} - -UNIT_TEST(Utf8_IsSearchDelimiter) -{ - TEST(utf8_string::IsSearchDelimiter(static_cast('~')), ()); - TEST(utf8_string::IsSearchDelimiter(static_cast('`')), ()); -} diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 6b676cc395..46e0d17d23 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -25,6 +25,15 @@ bool SimpleDelimiter::operator()(UniChar c) const return false; } +UniChar LastUniChar(string const & s) +{ + if (s.empty()) + return 0; // empty input: nothing to decode, report 0 + utf8::unchecked::iterator iter(s.end()); // starts one past the last byte; unchecked variant, presumably relies on s being valid UTF-8 (TODO confirm) + --iter; // step back over the whole last UTF-8 sequence, not a single byte + return *iter; // decode that sequence into one code point +} + bool to_int(char const * s, int & i) { char * stop; diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 9ce3c7ff25..badc6f1468 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -93,6 +93,9 @@ void Tokenize(string const & str, char const * delims, FunctorT f) 
} } +/// @return Unicode code point of the last UTF-8 encoded symbol in s, or 0 if s is empty +UniChar LastUniChar(string const & s); + template bool IsInArray(T (&arr) [N], TT const & t) { for (size_t i = 0; i < N; ++i) diff --git a/base/utf8_string.cpp b/base/utf8_string.cpp deleted file mode 100644 index b4afc1f424..0000000000 --- a/base/utf8_string.cpp +++ /dev/null @@ -1,66 +0,0 @@ -#include "utf8_string.hpp" - -#include "../std/iterator.hpp" - -#include "../3party/utfcpp/source/utf8/unchecked.h" - -namespace utf8_string -{ - bool Split(string const & str, vector & out, IsDelimiterFuncT f) - { - out.clear(); - string::const_iterator curr = str.begin(); - string::const_iterator end = str.end(); - string word; - back_insert_iterator inserter = back_inserter(word); - while (curr != end) - { - uint32_t symbol = ::utf8::unchecked::next(curr); - if (f(symbol)) - { - if (!word.empty()) - { - out.push_back(word); - word.clear(); - inserter = back_inserter(word); - } - } - else - { - inserter = utf8::unchecked::append(symbol, inserter); - } - } - if (!word.empty()) - out.push_back(word); - return !out.empty(); - } - - bool IsSearchDelimiter(uint32_t symbol) - { - // latin table optimization - if (symbol >= ' ' && symbol < '0') - return true; - - switch (symbol) - { - case ':': - case ';': - case '<': - case '=': - case '>': - case '[': - case ']': - case '\\': - case '^': - case '_': - case '`': - case '{': - case '}': - case '|': - case '~': - case 0x0336: - return true; - } - return false; - } -} diff --git a/base/utf8_string.hpp b/base/utf8_string.hpp deleted file mode 100644 index d3ca3e6caf..0000000000 --- a/base/utf8_string.hpp +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "../std/string.hpp" -#include "../std/vector.hpp" -#include "../std/stdint.hpp" - -namespace utf8_string -{ - typedef bool (*IsDelimiterFuncT)(uint32_t); - /// delimeters optimal for search - bool IsSearchDelimiter(uint32_t symbol); - bool Split(string const & str, vector & out, IsDelimiterFuncT f = 
&IsSearchDelimiter); -} diff --git a/search/delimiters.cpp b/search/delimiters.cpp new file mode 100644 index 0000000000..2be9f0f397 --- /dev/null +++ b/search/delimiters.cpp @@ -0,0 +1,35 @@ +#include "delimiters.hpp" + +namespace search +{ + +bool Delimiters::operator()(strings::UniChar c) const +{ + // @TODO implement full unicode range delimiters table + // latin table optimization + if (c >= ' ' && c < '0') + return true; + switch (c) + { + case ':': + case ';': + case '<': + case '=': + case '>': + case '[': + case ']': + case '\\': + case '^': + case '_': + case '`': + case '{': + case '}': + case '|': + case '~': + case 0x0336: + return true; + } + return false; +} + +} diff --git a/search/delimiters.hpp b/search/delimiters.hpp new file mode 100644 index 0000000000..848c670aff --- /dev/null +++ b/search/delimiters.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "../base/string_utils.hpp" + +namespace search +{ + +class Delimiters +{ +public: + bool operator()(strings::UniChar c) const; +}; + +} diff --git a/search/query.cpp b/search/query.cpp index d2668c133c..abdb558d1c 100644 --- a/search/query.cpp +++ b/search/query.cpp @@ -1,19 +1,25 @@ #include "query.hpp" -#include "../base/utf8_string.hpp" +#include "delimiters.hpp" + +#include "../base/string_utils.hpp" namespace search1 { Query::Query(string const & query) { - utf8_string::Split(query, m_Keywords, &utf8_string::IsSearchDelimiter); - if (!query.empty() && !utf8_string::IsSearchDelimiter(query[query.size() - 1])) + search::Delimiters delims; + strings::TokenizeIterator iter(query, delims); + while (iter) { - m_Prefix.swap(m_Keywords.back()); - m_Keywords.pop_back(); + // The last token becomes the prefix when the query does not end with a delimiter. + if (iter.IsLast() && !delims(strings::LastUniChar(query))) + m_prefix = *iter; + else + m_keywords.push_back(*iter); + // Advance to the next token; without this the loop never terminates. + ++iter; } } - - } diff --git a/search/query.hpp b/search/query.hpp index cdbf85bc94..d950b87e69 100644 --- a/search/query.hpp +++ b/search/query.hpp @@ -12,8 +12,8 @@ class Query public: explicit Query(string const & query); 
private: - vector m_Keywords; - string m_Prefix; + vector m_keywords; + string m_prefix; }; } // namespace search1 diff --git a/search/search.pro b/search/search.pro index 857e529f4a..086453d34f 100644 --- a/search/search.pro +++ b/search/search.pro @@ -10,11 +10,13 @@ DEPENDENCIES = indexer geometry coding base include($$ROOT_DIR/common.pri) HEADERS += \ - query.hpp \ - search_processor.hpp \ - string_match.hpp \ + query.hpp \ + search_processor.hpp \ + string_match.hpp \ + delimiters.hpp \ SOURCES += \ - query.cpp \ - search_processor.cpp \ - string_match.cpp \ + query.cpp \ + search_processor.cpp \ + string_match.cpp \ + delimiters.cpp \ diff --git a/search/search_processor.cpp b/search/search_processor.cpp index 088ce4c360..a0900a2f8e 100644 --- a/search/search_processor.cpp +++ b/search/search_processor.cpp @@ -3,7 +3,6 @@ #include "../indexer/feature.hpp" #include "../indexer/classificator.hpp" -#include "../base/utf8_string.hpp" #include "../base/logging.hpp" #include "../std/bind.hpp" @@ -31,13 +30,13 @@ namespace search Query::Query(string const & line) { - utf8_string::Split(line, m_tokens); + // TODO: tokenization disabled because utf8_string was removed, so m_tokens is not filled here; was: utf8_string::Split(line, m_tokens); } bool Query::operator()(char lang, string const & utf8s) { vector words; - utf8_string::Split(utf8s, words); + // TODO: tokenization disabled because utf8_string was removed, so words is not filled here; was: utf8_string::Split(utf8s, words); int score = -1; for (size_t i = 0; i < m_tokens.size(); ++i) {