diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index cf8e776be2..01e9756206 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -72,7 +72,7 @@ UNIT_TEST(LevenshteinDFA_Smoke) TEST(Rejects(dfa, "abcde"), ()); TEST(Accepts(dfa, "ac"), ()); - TEST(Intermediate(dfa, "acb"), ()); // transpositions are not supported yet + TEST(Accepts(dfa, "acb"), ()); TEST(Accepts(dfa, "acbc"), ()); TEST(Rejects(dfa, "acbd"), ()); @@ -107,7 +107,7 @@ UNIT_TEST(LevenshteinDFA_Prefix) LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */); TEST(Accepts(dfa, "москва"), ()); TEST(Accepts(dfa, "иосква"), ()); - TEST(Rejects(dfa, "моксва"), ()); + TEST(Accepts(dfa, "моксва"), ()); } } } // namespace diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp index 54c4bb07e0..0586e83390 100644 --- a/base/levenshtein_dfa.cpp +++ b/base/levenshtein_dfa.cpp @@ -13,13 +13,12 @@ namespace strings { namespace { +inline size_t AbsDiff(size_t a, size_t b) { return a > b ? a - b : b - a; } + class TransitionTable { public: - TransitionTable(UniString const & s, uint8_t maxErrors) - : m_s(s), m_size(s.size()), m_maxErrors(maxErrors) - { - } + TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {} void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c, LevenshteinDFA::State & t) @@ -36,38 +35,50 @@ private: { auto & ps = t.m_positions; - if (p.m_offset < m_size && m_s[p.m_offset] == c) + if (p.IsTransposed()) { - ps.emplace_back(p.m_offset + 1, p.m_numErrors); + if (p.m_offset + 2 <= m_size && m_s[p.m_offset] == c) + ps.emplace_back(p.m_offset + 2, p.m_errorsLeft, false /* transposed */); return; } - if (p.m_numErrors == m_maxErrors) + ASSERT(p.IsStandard(), ()); + + if (p.m_offset < m_size && m_s[p.m_offset] == c) + { + ps.emplace_back(p.m_offset + 1, p.m_errorsLeft, false /* transposed */); + return; + } + + if (p.m_errorsLeft == 0) return; if (p.m_offset < prefixCharsToKeep) return; - ps.emplace_back(p.m_offset, p.m_numErrors + 1); + ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */); if (p.m_offset == m_size) return; - ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); + ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */); size_t i; if (FindRelevant(p, c, i)) { ASSERT_GREATER(i, 0, (i)); ASSERT_LESS_OR_EQUAL(p.m_offset + i + 1, m_size, ()); - ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i); + ps.emplace_back(p.m_offset + i + 1, p.m_errorsLeft - i, false /* transposed */); + + if (i == 1) + ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, true /* transposed */); } } bool FindRelevant(LevenshteinDFA::Position const & p, UniChar c, size_t & i) const { size_t const limit = - min(m_size - p.m_offset, static_cast(m_maxErrors - p.m_numErrors) + 1); + min(m_size - p.m_offset, static_cast(p.m_errorsLeft) + 1); for (i = 0; i < limit; ++i) { @@ -79,7 +90,6 @@ private: UniString const & m_s; size_t const m_size; - uint8_t const m_maxErrors; }; } // namespace @@ -89,49 +99,46 @@ size_t const LevenshteinDFA::kStartingState = 0; size_t const LevenshteinDFA::kRejectingState = 1; // LevenshteinDFA::Position ------------------------------------------------------------------------ -LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors) - : m_offset(offset), m_numErrors(numErrors) +LevenshteinDFA::Position::Position(size_t offset, uint8_t errorsLeft, bool transposed) + : m_offset(offset), m_errorsLeft(errorsLeft), m_transposed(transposed) { } bool LevenshteinDFA::Position::SubsumedBy(Position const & rhs) const { - if (m_numErrors <= rhs.m_numErrors) + if (m_errorsLeft >= rhs.m_errorsLeft) return false; - size_t const u = m_offset < rhs.m_offset ? rhs.m_offset - m_offset : m_offset - rhs.m_offset; - size_t const v = m_numErrors - rhs.m_numErrors; + if (IsStandard() && rhs.IsStandard()) + return AbsDiff(m_offset, rhs.m_offset) <= rhs.m_errorsLeft - m_errorsLeft; - return u <= v; + if (IsStandard() && rhs.IsTransposed()) + return m_offset == rhs.m_offset && m_errorsLeft == 0; + + if (IsTransposed() && rhs.IsStandard()) + return AbsDiff(m_offset + 1, rhs.m_offset) <= rhs.m_errorsLeft - m_errorsLeft; + + ASSERT(IsTransposed(), ()); + ASSERT(rhs.IsTransposed(), ()); + return m_offset == rhs.m_offset; } bool LevenshteinDFA::Position::operator<(Position const & rhs) const { if (m_offset != rhs.m_offset) return m_offset < rhs.m_offset; - return m_numErrors < rhs.m_numErrors; + if (m_errorsLeft != rhs.m_errorsLeft) + return m_errorsLeft < rhs.m_errorsLeft; + return m_transposed < rhs.m_transposed; } bool LevenshteinDFA::Position::operator==(Position const & rhs) const { - return m_offset == rhs.m_offset && m_numErrors == rhs.m_numErrors; + return m_offset == rhs.m_offset && m_errorsLeft == rhs.m_errorsLeft && + m_transposed == rhs.m_transposed; } // LevenshteinDFA::State --------------------------------------------------------------------------- -// static -LevenshteinDFA::State LevenshteinDFA::State::MakeStart() -{ - State state; - state.m_positions.emplace_back(0 /* offset */, 0 /* numErrors */); - return state; -} - -// static -LevenshteinDFA::State LevenshteinDFA::State::MakeRejecting() -{ - return State(); -} - void LevenshteinDFA::State::Normalize() { size_t j = m_positions.size(); @@ -183,10 +190,10 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, ui m_accepting.push_back(false); }; - pushState(State::MakeStart(), kStartingState); - pushState(State::MakeRejecting(), kRejectingState); + pushState(MakeStart(), kStartingState); + pushState(MakeRejecting(), kRejectingState); - TransitionTable table(s, maxErrors); + TransitionTable table(s); while (!states.empty()) { @@ -239,9 +246,22 @@ LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors) { } +LevenshteinDFA::State LevenshteinDFA::MakeStart() +{ + State state; + state.m_positions.emplace_back(0 /* offset */, m_maxErrors /* errorsLeft */, + false /* transposed */); + return state; +} + +LevenshteinDFA::State LevenshteinDFA::MakeRejecting() +{ + return State(); +} + bool LevenshteinDFA::IsValid(Position const & p) const { - return p.m_offset <= m_size && p.m_numErrors <= m_maxErrors; + return p.m_offset <= m_size && p.m_errorsLeft <= m_maxErrors; } bool LevenshteinDFA::IsValid(State const & s) const @@ -256,7 +276,7 @@ bool LevenshteinDFA::IsValid(State const & s) const bool LevenshteinDFA::IsAccepting(Position const & p) const { - return m_size - p.m_offset <= m_maxErrors - p.m_numErrors; + return p.IsStandard() && m_size - p.m_offset <= p.m_errorsLeft; } bool LevenshteinDFA::IsAccepting(State const & s) const @@ -287,7 +307,8 @@ size_t LevenshteinDFA::Move(size_t s, UniChar c) const string DebugPrint(LevenshteinDFA::Position const & p) { ostringstream os; - os << "Position [" << p.m_offset << ", " << static_cast(p.m_numErrors) << "]"; + os << "Position [" << p.m_offset << ", " << static_cast(p.m_errorsLeft) << ", " + << p.m_transposed << "]"; return os.str(); } diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index 0e3236784b..3ba7bbb22c 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -10,19 +10,19 @@ namespace strings // This class represents a DFA recognizing a language consisting of // all words that are close enough to a given string, in terms of // Levenshtein distance. Levenshtein distance treats deletions, -// insertions and replacements as errors, transpositions are not -// handled now. The code is based on the work "Fast String Correction -// with Levenshtein-Automata" by Klaus U. Schulz and Stoyan Mihov. -// For a fixed number of allowed errors and fixed alphabet the -// construction time and size of automata is O(length of the pattern), -// but the size grows exponentially with the number of errors, so be -// reasonable and don't use this class when the number of errors is -// too high. +// insertions and replacements as errors. Transpositions are handled +// in a quite special way - its assumed that all transformations are +// applied in parallel, i.e. the Levenshtein distance between "ab" and +// "bca" is three, not two. The code is based on the work "Fast +// String Correction with Levenshtein-Automata" by Klaus U. Schulz and +// Stoyan Mihov. For a fixed number of allowed errors and fixed +// alphabet the construction time and size of automata is +// O(length of the pattern), but the size grows exponentially with the +// number of errors, so be reasonable and don't use this class when +// the number of errors is too high. // // *NOTE* The class *IS* thread-safe. // -// TODO (@y): add support for transpositions. -// // TODO (@y): consider to implement a factory of automata, that will // be able to construct them quickly for a fixed number of errors. class LevenshteinDFA @@ -34,22 +34,28 @@ public: struct Position { Position() = default; - Position(size_t offset, uint8_t numErrors); + Position(size_t offset, uint8_t errorsLeft, bool transposed); + // SubsumedBy is a relation on two positions, which allows to + // efficiently remove unnecessary positions in a state. When the + // function returns true, it means that |rhs| is more powerful + // than the current position and it is safe to remove the current + // position from the state, if the state contains |rhs|. bool SubsumedBy(Position const & rhs) const; bool operator<(Position const & rhs) const; bool operator==(Position const & rhs) const; + inline bool IsStandard() const { return !m_transposed; } + inline bool IsTransposed() const { return m_transposed; } + size_t m_offset = 0; - uint8_t m_numErrors = 0; + uint8_t m_errorsLeft = 0; + bool m_transposed = false; }; struct State { - static State MakeStart(); - static State MakeRejecting(); - void Normalize(); inline void Clear() { m_positions.clear(); } @@ -96,7 +102,8 @@ public: private: friend class Iterator; - size_t TransformChar(UniChar c) const; + State MakeStart(); + State MakeRejecting(); bool IsValid(Position const & p) const; bool IsValid(State const & s) const; diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index 610e313d2e..e511ddad71 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -784,6 +784,8 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) // Too many errors, can't do anything. TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ()); + + TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ()); } } } // namespace