diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp index c0741c7aa8..6c78c4f2a4 100644 --- a/base/levenshtein_dfa.cpp +++ b/base/levenshtein_dfa.cpp @@ -1,7 +1,6 @@ #include "base/levenshtein_dfa.hpp" #include "base/assert.hpp" -#include "base/logging.hpp" #include "base/stl_helpers.hpp" #include "std/algorithm.hpp" @@ -35,50 +34,28 @@ private: { auto & ps = t.m_positions; - if (p.m_numErrors < m_maxErrors) - { - if (p.m_offset + 1 < m_size) - { - size_t i; - if (FindRelevant(p, c, i)) - { - if (i == 0) - { - ps.emplace_back(p.m_offset + 1, p.m_numErrors); - } - else - { - ps.emplace_back(p.m_offset, p.m_numErrors + 1); - ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); - ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i); - } - } - else - { - ps.emplace_back(p.m_offset, p.m_numErrors + 1); - ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); - } - } - else if (p.m_offset + 1 == m_size) - { - if (m_s[p.m_offset] == c) - { - ps.emplace_back(p.m_offset + 1, p.m_numErrors); - } - else - { - ps.emplace_back(p.m_offset, p.m_numErrors + 1); - ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); - } - } - else - { - ps.emplace_back(p.m_offset, p.m_numErrors + 1); - } - } - else if (p.m_offset < m_size && m_s[p.m_offset] == c) + if (p.m_offset < m_size && m_s[p.m_offset] == c) { ps.emplace_back(p.m_offset + 1, p.m_numErrors); + return; + } + + if (p.m_numErrors == m_maxErrors) + return; + + ps.emplace_back(p.m_offset, p.m_numErrors + 1); + + if (p.m_offset == m_size) + return; + + ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); + + size_t i; + if (FindRelevant(p, c, i)) + { + ASSERT_GREATER(i, 0, (i)); + ASSERT_LESS_OR_EQUAL(p.m_offset + i + 1, m_size, ()); + ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i); } } @@ -101,23 +78,26 @@ private: }; } // namespace +// LevenshteinDFA ---------------------------------------------------------------------------------- +// static +size_t const LevenshteinDFA::kStartingState = 0; +size_t const LevenshteinDFA::kRejectingState = 1; + // LevenshteinDFA::Position ------------------------------------------------------------------------ LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors) : m_offset(offset), m_numErrors(numErrors) { } -bool LevenshteinDFA::Position::SumsumedBy(Position const & rhs) const +bool LevenshteinDFA::Position::SubsumedBy(Position const & rhs) const { if (m_numErrors <= rhs.m_numErrors) return false; - size_t const delta = m_numErrors - rhs.m_numErrors; - if (m_offset + delta < delta) - return true; - if (rhs.m_offset + delta < delta) - return true; - return m_offset + delta >= rhs.m_offset || m_offset <= rhs.m_offset + delta; + size_t const u = m_offset < rhs.m_offset ? rhs.m_offset - m_offset : m_offset - rhs.m_offset; + size_t const v = m_numErrors - rhs.m_numErrors; + + return u <= v; } bool LevenshteinDFA::Position::operator<(Position const & rhs) const @@ -141,6 +121,12 @@ LevenshteinDFA::State LevenshteinDFA::State::MakeStart() return state; } +// static +LevenshteinDFA::State LevenshteinDFA::State::MakeRejecting() +{ + return State(); +} + void LevenshteinDFA::State::Normalize() { size_t j = m_positions.size(); @@ -149,7 +135,7 @@ void LevenshteinDFA::State::Normalize() auto const & cur = m_positions[i]; auto it = find_if(m_positions.begin(), m_positions.begin() + j, - [&](Position const & rhs) { return cur.SumsumedBy(rhs); }); + [&](Position const & rhs) { return cur.SubsumedBy(rhs); }); if (it != m_positions.begin() + j) { ASSERT_GREATER(j, 0, ()); @@ -163,8 +149,9 @@ void LevenshteinDFA::State::Normalize() } // LevenshteinDFA ---------------------------------------------------------------------------------- +// static LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) - : m_size(s.size()), m_maxErrors(maxErrors), m_rejecting(0) + : m_size(s.size()), m_maxErrors(maxErrors) { m_alphabet.assign(s.begin(), s.end()); my::SortUnique(m_alphabet); @@ -177,35 +164,36 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) } m_alphabet.push_back(missed); - TransitionTable table(s, maxErrors); - queue> states; - states.emplace(State::MakeStart(), 0); - + queue states; map visited; - visited[states.front().first] = states.front().second; - m_transitions.emplace_back(m_alphabet.size()); + auto pushState = [&states, &visited, this](State const & state, size_t id) + { + ASSERT_EQUAL(id, m_transitions.size(), ()); + ASSERT_EQUAL(visited.count(state), 0, (state, id)); + + states.emplace(state); + visited[state] = id; + m_transitions.emplace_back(m_alphabet.size()); + }; + + pushState(State::MakeStart(), kStartingState); + pushState(State::MakeRejecting(), kRejectingState); + + TransitionTable table(s, maxErrors); while (!states.empty()) { - auto const p = states.front(); + auto const curr = states.front(); states.pop(); - - auto const & curr = p.first; - auto const id = p.second; - ASSERT(IsValid(curr), (curr)); + ASSERT_GREATER(visited.count(curr), 0, (curr)); + auto const id = visited[curr]; + ASSERT_LESS(id, m_transitions.size(), ()); + if (IsAccepting(curr)) m_accepting.insert(id); - if (IsRejecting(curr)) - { - ASSERT(m_rejecting == 0 || m_rejecting == id, - ("Rejecting state must be set once:", m_rejecting, id)); - m_rejecting = id; - } - - ASSERT_LESS(id, m_transitions.size(), ()); for (size_t i = 0; i < m_alphabet.size(); ++i) { @@ -213,15 +201,12 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) table.Move(curr, m_alphabet[i], next); size_t nid; + auto const it = visited.find(next); if (it == visited.end()) { nid = visited.size(); - visited[next] = nid; - m_transitions.emplace_back(m_alphabet.size()); - states.emplace(next, nid); - - ASSERT_EQUAL(visited.size(), m_transitions.size(), ()); + pushState(next, nid); } else { @@ -231,8 +216,6 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) m_transitions[id][i] = nid; } } - - ASSERT(m_rejecting != 0, ("Rejecting state is not set")); } LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors) diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index 2632a73135..cef354c458 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -7,22 +7,36 @@ namespace strings { -// This class represends a DFA recognizing a language consisting of +// This class represents a DFA recognizing a language consisting of // all words that are close enough to a given string, in terms of -// Levenshtein distance. The code is based on the work "Fast String -// Correction with Levenshtein-Automata" by Klaus U. Schulz and Stoyan -// Mihov. +// Levenshtein distance. Levenshtein distance treats deletions, +// insertions and replacements as errors, transpositions are not +// handled now. The code is based on the work "Fast String Correction +// with Levenshtein-Automata" by Klaus U. Schulz and Stoyan Mihov. +// For a fixed number of allowed errors and fixed alphabet the +// construction time and size of automata is O(length of the pattern), +// but the size grows exponentially with the number of errors, so be +// reasonable and don't use this class when the number of errors is +// too high. // // *NOTE* The class *IS* thread-safe. +// +// TODO (@y): add support for transpositions. +// +// TODO (@y): consider to implement a factory of automata, that will +// be able to construct them quickly for a fixed number of errors. class LevenshteinDFA { public: + static size_t const kStartingState; + static size_t const kRejectingState; + struct Position { Position() = default; Position(size_t offset, uint8_t numErrors); - bool SumsumedBy(Position const & rhs) const; + bool SubsumedBy(Position const & rhs) const; bool operator<(Position const & rhs) const; bool operator==(Position const & rhs) const; @@ -34,6 +48,7 @@ public: struct State { static State MakeStart(); + static State MakeRejecting(); void Normalize(); inline void Clear() { m_positions.clear(); } @@ -78,9 +93,9 @@ public: private: friend class LevenshteinDFA; - Iterator(LevenshteinDFA const & dfa) : m_s(0), m_dfa(dfa) {} + Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {} - size_t m_s = 0; + size_t m_s; LevenshteinDFA const & m_dfa; }; @@ -105,7 +120,7 @@ private: inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; } inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); } - inline bool IsRejecting(size_t s) const { return s == m_rejecting; } + inline bool IsRejecting(size_t s) const { return s == kRejectingState; } size_t Move(size_t s, UniChar c) const; @@ -116,7 +131,6 @@ private: vector> m_transitions; unordered_set m_accepting; - size_t m_rejecting; }; string DebugPrint(LevenshteinDFA::Position const & p);