From 4a2fb4f338afab77649b63455f7003829b907efa Mon Sep 17 00:00:00 2001
From: Yury Melnichek
Date: Sun, 22 May 2011 16:11:38 +0200
Subject: [PATCH] Add fuzzy string matching.

---
 search/search.pro                         |  2 +
 search/search_tests/search_tests.pro      |  1 +
 search/search_tests/string_match_test.cpp | 57 ++++++++++++++
 search/string_match.cpp                   | 46 ++++++++++++
 search/string_match.hpp                   | 89 +++++++++++++++++++++++
 5 files changed, 195 insertions(+)
 create mode 100644 search/search_tests/string_match_test.cpp
 create mode 100644 search/string_match.cpp
 create mode 100644 search/string_match.hpp

diff --git a/search/search.pro b/search/search.pro
index b74ed1811e..857e529f4a 100644
--- a/search/search.pro
+++ b/search/search.pro
@@ -12,7 +12,9 @@ include($$ROOT_DIR/common.pri)
 HEADERS += \
   query.hpp \
   search_processor.hpp \
+  string_match.hpp \
 
 SOURCES += \
   query.cpp \
   search_processor.cpp \
+  string_match.cpp \
diff --git a/search/search_tests/search_tests.pro b/search/search_tests/search_tests.pro
index 587bb16b59..cbb09573f0 100644
--- a/search/search_tests/search_tests.pro
+++ b/search/search_tests/search_tests.pro
@@ -19,3 +19,4 @@ win32 {
 
 SOURCES += \
   ../../testing/testingmain.cpp \
+  string_match_test.cpp \
diff --git a/search/search_tests/string_match_test.cpp b/search/search_tests/string_match_test.cpp
new file mode 100644
index 0000000000..cdc36c55a3
--- /dev/null
+++ b/search/search_tests/string_match_test.cpp
@@ -0,0 +1,57 @@
+#include "../../testing/testing.hpp"
+#include "../string_match.hpp"
+
+namespace
+{
+
+// Toy cost model for the tests: skipping, substituting or swapping characters
+// costs 1; 1:2, 2:1 and 2:2 replacements cost 2, except 'X' <-> "><" is free.
+class TestMatchCost
+{
+public:
+  uint32_t Cost10(char) const { return 1; }
+  uint32_t Cost01(char) const { return 1; }
+  uint32_t Cost11(char, char) const { return 1; }
+  uint32_t Cost12(char a, char const * pB) const
+  {
+    // The one free rewrite: a single 'X' against the pair "><".
+    if (a == 'X' && pB[0] == '>' && pB[1] == '<')
+      return 0;
+    return 2;
+  }
+  uint32_t Cost21(char const * pA, char b) const { return Cost12(b, pA); }
+  uint32_t Cost22(char const *, char const *) const { return 2; }
+  uint32_t SwapCost(char, char) const { return 1; }
+};
+
+// Runs StringMatchCost on two C strings using the toy cost model above.
+uint32_t MatchCost(char const * a, char const * b, uint32_t maxCost = 1000)
+{
+  return ::search::StringMatchCost(a, strlen(a), b, strlen(b), TestMatchCost(), maxCost);
+}
+
+}
+
+UNIT_TEST(StringMatchCost)
+{
+  TEST_EQUAL(MatchCost("", ""), 0, ());
+  TEST_EQUAL(MatchCost("a", "b"), 1, ());
+  TEST_EQUAL(MatchCost("a", ""), 1, ());
+  TEST_EQUAL(MatchCost("", "b"), 1, ());
+  TEST_EQUAL(MatchCost("ab", "cd"), 2, ());
+  TEST_EQUAL(MatchCost("ab", "ba"), 1, ());
+  TEST_EQUAL(MatchCost("abcd", "efgh"), 4, ());
+  TEST_EQUAL(MatchCost("Hello!", "Hello!"), 0, ());
+  TEST_EQUAL(MatchCost("Hello!", "Helo!"), 1, ());
+  TEST_EQUAL(MatchCost("X", "X"), 0, ());
+  TEST_EQUAL(MatchCost("X", "><"), 0, ());
+  TEST_EQUAL(MatchCost("XXX", "><><><"), 0, ());
+  TEST_EQUAL(MatchCost("XXX", "><X><"), 0, ());
+  TEST_EQUAL(MatchCost("TeXt", "Te><t"), 0, ());
+  TEST_EQUAL(MatchCost("TeXt", "Te><"), 1, ());
+  TEST_EQUAL(MatchCost("TeXt", "TetX"), 1, ());
+  TEST_EQUAL(MatchCost("TeXt", "Tet><"), 2, ());
+  TEST_EQUAL(MatchCost("", "ALongString"), 11, ());
+  TEST_EQUAL(MatchCost("x", "ALongString"), 11, ());
+  TEST_EQUAL(MatchCost("g", "ALongString"), 10, ());
+}
diff --git a/search/string_match.cpp b/search/string_match.cpp
new file mode 100644
index 0000000000..bf3e00b13d
--- /dev/null
+++ b/search/string_match.cpp
@@ -0,0 +1,46 @@
+#include "string_match.hpp"
+
+// TODO: Build an error model.
+//       Take adjacent keyboard keys into account:
+//       1. A neighbouring key pressed instead of the intended one.
+//       2. A neighbouring key pressed before or after the intended one.
+
+namespace search
+{
+
+uint32_t DefaultMatchCost::Cost10(UniChar) const
+{
+  return 128;
+}
+
+uint32_t DefaultMatchCost::Cost01(UniChar) const
+{
+  return 128;
+}
+
+uint32_t DefaultMatchCost::Cost11(UniChar, UniChar) const
+{
+  return 128;
+}
+
+uint32_t DefaultMatchCost::Cost12(UniChar, UniChar const *) const
+{
+  return 256;
+}
+
+uint32_t DefaultMatchCost::Cost21(UniChar const *, UniChar) const
+{
+  return 256;
+}
+
+uint32_t DefaultMatchCost::Cost22(UniChar const *, UniChar const *) const
+{
+  return 256;
+}
+
+uint32_t DefaultMatchCost::SwapCost(UniChar, UniChar) const
+{
+  return 128;
+}
+
+} // namespace search
diff --git a/search/string_match.hpp b/search/string_match.hpp
new file mode 100644
index 0000000000..fa482d2dbf
--- /dev/null
+++ b/search/string_match.hpp
@@ -0,0 +1,107 @@
+#pragma once
+
+#include "../base/base.hpp"
+#include "../base/buffer_vector.hpp"
+#include "../std/queue.hpp"
+
+namespace search
+{
+
+typedef uint32_t UniChar;
+
+namespace impl
+{
+
+// Search state: the first m_A characters of A have been matched against the
+// first m_B characters of B at a total cost of m_Cost.
+struct MatchCostData
+{
+  uint32_t m_A, m_B;
+  uint32_t m_Cost;
+
+  MatchCostData() : m_A(0), m_B(0), m_Cost(0) {}
+  MatchCostData(uint32_t a, uint32_t b, uint32_t cost) : m_A(a), m_B(b), m_Cost(cost) {}
+
+  // Inverted on purpose: priority_queue pops its largest element, so the
+  // cheapest state has to compare as the largest one.
+  bool operator < (MatchCostData const & o) const
+  {
+    return m_Cost > o.m_Cost;
+  }
+};
+
+// Queues the state (a, b, cost) unless it exceeds the budget. The bound is
+// inclusive: StringMatchCost reports failure as maxCost + 1, so a match that
+// costs exactly maxCost must still be reachable.
+template <typename PriorityQueueT>
+void PushMatchCost(PriorityQueueT & q, uint32_t maxCost, uint32_t a, uint32_t b, uint32_t cost)
+{
+  if (cost <= maxCost)
+    q.push(MatchCostData(a, b, cost));
+}
+
+} // namespace search::impl
+
+// Default cost model. CostXY prices rewriting X characters of the first string
+// into Y characters of the second one; SwapCost prices exchanging two adjacent
+// characters.
+class DefaultMatchCost
+{
+public:
+  uint32_t Cost10(UniChar a) const;
+  uint32_t Cost01(UniChar b) const;
+  uint32_t Cost11(UniChar a, UniChar b) const;
+  uint32_t Cost12(UniChar a, UniChar const * pB) const;
+  uint32_t Cost21(UniChar const * pA, UniChar b) const;
+  uint32_t Cost22(UniChar const * pA, UniChar const * pB) const;
+  uint32_t SwapCost(UniChar a1, UniChar a2) const;
+};
+
+// Returns the cheapest cost of turning sA[0, sizeA) into sB[0, sizeB) under
+// costF, or maxCost + 1 when no alignment costs at most maxCost. States are
+// expanded cheapest-first and the first one that consumes both strings is
+// returned. Note that maxCost + 1 wraps to 0 when maxCost is the largest
+// uint32_t value.
+template <typename CharT, typename CostF>
+uint32_t StringMatchCost(CharT const * sA, uint32_t sizeA,
+                         CharT const * sB, uint32_t sizeB,
+                         CostF const & costF, uint32_t maxCost)
+{
+  // NOTE(review): the inline capacity of 256 is a reconstruction; confirm.
+  priority_queue<impl::MatchCostData, buffer_vector<impl::MatchCostData, 256> > q;
+  q.push(impl::MatchCostData(0, 0, 0));
+  while (!q.empty())
+  {
+    uint32_t a = q.top().m_A;
+    uint32_t b = q.top().m_B;
+    uint32_t const c = q.top().m_Cost;
+    q.pop();
+    // Equal characters are consumed without adding any cost.
+    while (a < sizeA && b < sizeB && sA[a] == sB[b])
+      ++a, ++b;
+
+    if (a == sizeA && b == sizeB)
+      return c;
+
+    if (a < sizeA)
+      impl::PushMatchCost(q, maxCost, a + 1, b, c + costF.Cost10(sA[a]));
+    if (b < sizeB)
+      impl::PushMatchCost(q, maxCost, a, b + 1, c + costF.Cost01(sB[b]));
+    if (a < sizeA && b < sizeB)
+      impl::PushMatchCost(q, maxCost, a + 1, b + 1, c + costF.Cost11(sA[a], sB[b]));
+    if (a + 1 < sizeA && b < sizeB)
+      impl::PushMatchCost(q, maxCost, a + 2, b + 1, c + costF.Cost21(&sA[a], sB[b]));
+    if (a < sizeA && b + 1 < sizeB)
+      impl::PushMatchCost(q, maxCost, a + 1, b + 2, c + costF.Cost12(sA[a], &sB[b]));
+    if (a + 1 < sizeA && b + 1 < sizeB)
+    {
+      impl::PushMatchCost(q, maxCost, a + 2, b + 2, c + costF.Cost22(&sA[a], &sB[b]));
+      // Adjacent transposition: "xy" in A against "yx" in B.
+      if (sA[a] == sB[b + 1] && sA[a + 1] == sB[b])
+        impl::PushMatchCost(q, maxCost, a + 2, b + 2, c + costF.SwapCost(sA[a], sA[a + 1]));
+    }
+  }
+  return maxCost + 1;
+}
+
+} // namespace search