From 59fdb701e0b6afd4e44be1ee8d319a93224c1688 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 3 Nov 2016 15:26:10 +0300 Subject: [PATCH] [base] Levenshtein DFA. --- base/base.pro | 2 + base/base_tests/base_tests.pro | 1 + base/base_tests/levenshtein_dfa_test.cpp | 90 +++++++ base/levenshtein_dfa.cpp | 307 +++++++++++++++++++++++ base/levenshtein_dfa.hpp | 124 +++++++++ 5 files changed, 524 insertions(+) create mode 100644 base/base_tests/levenshtein_dfa_test.cpp create mode 100644 base/levenshtein_dfa.cpp create mode 100644 base/levenshtein_dfa.hpp diff --git a/base/base.pro b/base/base.pro index 803d1b544a..9f3fe820b7 100644 --- a/base/base.pro +++ b/base/base.pro @@ -14,6 +14,7 @@ SOURCES += \ exception.cpp \ gmtime.cpp \ internal/message.cpp \ + levenshtein_dfa.cpp \ logging.cpp \ lower_case.cpp \ normalize_unicode.cpp \ @@ -48,6 +49,7 @@ HEADERS += \ exception.hpp \ gmtime.hpp \ internal/messagex.hpp \ + levenshtein_dfa.hpp \ limited_priority_queue.hpp \ logging.hpp \ macros.hpp \ diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro index 7fa3e5a31b..1a4f2bb9ec 100644 --- a/base/base_tests/base_tests.pro +++ b/base/base_tests/base_tests.pro @@ -22,6 +22,7 @@ SOURCES += \ condition_test.cpp \ const_helper.cpp \ containers_test.cpp \ + levenshtein_dfa_test.cpp \ logging_test.cpp \ newtype_test.cpp \ math_test.cpp \ diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp new file mode 100644 index 0000000000..c671b1aa1e --- /dev/null +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -0,0 +1,90 @@ +#include "testing/testing.hpp" + +#include "base/levenshtein_dfa.hpp" + +#include "std/string.hpp" + +using namespace strings; + +namespace +{ +enum class Status +{ + Accepts, + Rejects, + Intermediate +}; + +Status GetStatus(LevenshteinDFA const & dfa, string const & s) +{ + auto it = dfa.Begin(); + it.Move(s); + if (it.Accepts()) + return Status::Accepts; + if (it.Rejects()) + return Status::Rejects; + return Status::Intermediate; +} + +bool Accepts(LevenshteinDFA const & dfa, string const & s) +{ + return GetStatus(dfa, s) == Status::Accepts; +} + +bool Rejects(LevenshteinDFA const & dfa, string const & s) +{ + return GetStatus(dfa, s) == Status::Rejects; +} + +bool Intermediate(LevenshteinDFA const & dfa, string const & s) +{ + return GetStatus(dfa, s) == Status::Intermediate; +} + +UNIT_TEST(LevenshteinDFA_Smoke) +{ + { + LevenshteinDFA dfa("", 0 /* maxErrors */); + + auto it = dfa.Begin(); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + it.Move('a'); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + + it.Move('b'); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } + + { + LevenshteinDFA dfa("abc", 1 /* maxErrors */); + + TEST(Accepts(dfa, "ab"), ()); + TEST(Accepts(dfa, "abd"), ()); + TEST(Accepts(dfa, "abcd"), ()); + TEST(Accepts(dfa, "bbc"), ()); + + TEST(Rejects(dfa, "cba"), ()); + TEST(Rejects(dfa, "abcde"), ()); + + TEST(Accepts(dfa, "ac"), ()); + TEST(Intermediate(dfa, "acb"), ()); // transpositions are not supported yet + TEST(Accepts(dfa, "acbc"), ()); + TEST(Rejects(dfa, "acbd"), ()); + + TEST(Intermediate(dfa, "a"), ()); + } + + { + LevenshteinDFA dfa("ленинградский", 2 /* maxErrors */); + + TEST(Accepts(dfa, "ленинградский"), ()); + TEST(Accepts(dfa, "ленингадский"), ()); + TEST(Accepts(dfa, "ленигнрадский"), ()); + TEST(Rejects(dfa, "ленинский"), ()); + } +} +} // namespace diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp new file mode 100644 index 0000000000..c0741c7aa8 --- /dev/null +++ b/base/levenshtein_dfa.cpp @@ -0,0 +1,307 @@ +#include "base/levenshtein_dfa.hpp" + +#include "base/assert.hpp" +#include "base/logging.hpp" +#include "base/stl_helpers.hpp" + +#include "std/algorithm.hpp" +#include "std/queue.hpp" +#include "std/set.hpp" +#include "std/sstream.hpp" +#include "std/vector.hpp" + +namespace strings +{ +namespace +{ +class TransitionTable +{ +public: + TransitionTable(UniString const & s, uint8_t maxErrors) + : m_s(s), m_size(s.size()), m_maxErrors(maxErrors) + { + } + + void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t) + { + t.Clear(); + for (auto const & p : s.m_positions) + GetMoves(p, c, t); + t.Normalize(); + } + +private: + void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t) + { + auto & ps = t.m_positions; + + if (p.m_numErrors < m_maxErrors) + { + if (p.m_offset + 1 < m_size) + { + size_t i; + if (FindRelevant(p, c, i)) + { + if (i == 0) + { + ps.emplace_back(p.m_offset + 1, p.m_numErrors); + } + else + { + ps.emplace_back(p.m_offset, p.m_numErrors + 1); + ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); + ps.emplace_back(p.m_offset + i + 1, p.m_numErrors + i); + } + } + else + { + ps.emplace_back(p.m_offset, p.m_numErrors + 1); + ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); + } + } + else if (p.m_offset + 1 == m_size) + { + if (m_s[p.m_offset] == c) + { + ps.emplace_back(p.m_offset + 1, p.m_numErrors); + } + else + { + ps.emplace_back(p.m_offset, p.m_numErrors + 1); + ps.emplace_back(p.m_offset + 1, p.m_numErrors + 1); + } + } + else + { + ps.emplace_back(p.m_offset, p.m_numErrors + 1); + } + } + else if (p.m_offset < m_size && m_s[p.m_offset] == c) + { + ps.emplace_back(p.m_offset + 1, p.m_numErrors); + } + } + + bool FindRelevant(LevenshteinDFA::Position const & p, UniChar c, size_t & i) const + { + size_t const limit = + min(m_size - p.m_offset, static_cast(m_maxErrors - p.m_numErrors) + 1); + + for (i = 0; i < limit; ++i) + { + if (m_s[p.m_offset + i] == c) + return true; + } + return false; + } + + UniString const & m_s; + size_t const m_size; + uint8_t const m_maxErrors; +}; +} // namespace + +// LevenshteinDFA::Position ------------------------------------------------------------------------ +LevenshteinDFA::Position::Position(size_t offset, uint8_t numErrors) + : m_offset(offset), m_numErrors(numErrors) +{ +} + +bool LevenshteinDFA::Position::SumsumedBy(Position const & rhs) const +{ + if (m_numErrors <= rhs.m_numErrors) + return false; + + size_t const delta = m_numErrors - rhs.m_numErrors; + if (m_offset + delta < delta) + return true; + if (rhs.m_offset + delta < delta) + return true; + return m_offset + delta >= rhs.m_offset || m_offset <= rhs.m_offset + delta; +} + +bool LevenshteinDFA::Position::operator<(Position const & rhs) const +{ + if (m_offset != rhs.m_offset) + return m_offset < rhs.m_offset; + return m_numErrors < rhs.m_numErrors; +} + +bool LevenshteinDFA::Position::operator==(Position const & rhs) const +{ + return m_offset == rhs.m_offset && m_numErrors == rhs.m_numErrors; +} + +// LevenshteinDFA::State --------------------------------------------------------------------------- +// static +LevenshteinDFA::State LevenshteinDFA::State::MakeStart() +{ + State state; + state.m_positions.emplace_back(0 /* offset */, 0 /* numErrors */); + return state; +} + +void LevenshteinDFA::State::Normalize() +{ + size_t j = m_positions.size(); + for (size_t i = 0; i < j; ++i) + { + auto const & cur = m_positions[i]; + + auto it = find_if(m_positions.begin(), m_positions.begin() + j, + [&](Position const & rhs) { return cur.SumsumedBy(rhs); }); + if (it != m_positions.begin() + j) + { + ASSERT_GREATER(j, 0, ()); + --j; + swap(m_positions[i], m_positions[j]); + } + } + + m_positions.erase(m_positions.begin() + j, m_positions.end()); + my::SortUnique(m_positions); +} + +// LevenshteinDFA ---------------------------------------------------------------------------------- +LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) + : m_size(s.size()), m_maxErrors(maxErrors), m_rejecting(0) +{ + m_alphabet.assign(s.begin(), s.end()); + my::SortUnique(m_alphabet); + + UniChar missed = 0; + for (size_t i = 0; i < m_alphabet.size() && missed >= m_alphabet[i]; ++i) + { + if (missed == m_alphabet[i]) + ++missed; + } + m_alphabet.push_back(missed); + + TransitionTable table(s, maxErrors); + queue> states; + states.emplace(State::MakeStart(), 0); + + map visited; + visited[states.front().first] = states.front().second; + + m_transitions.emplace_back(m_alphabet.size()); + + while (!states.empty()) + { + auto const p = states.front(); + states.pop(); + + auto const & curr = p.first; + auto const id = p.second; + + ASSERT(IsValid(curr), (curr)); + + if (IsAccepting(curr)) + m_accepting.insert(id); + if (IsRejecting(curr)) + { + ASSERT(m_rejecting == 0 || m_rejecting == id, + ("Rejecting state must be set once:", m_rejecting, id)); + m_rejecting = id; + } + + ASSERT_LESS(id, m_transitions.size(), ()); + + for (size_t i = 0; i < m_alphabet.size(); ++i) + { + State next; + table.Move(curr, m_alphabet[i], next); + + size_t nid; + auto const it = visited.find(next); + if (it == visited.end()) + { + nid = visited.size(); + visited[next] = nid; + m_transitions.emplace_back(m_alphabet.size()); + states.emplace(next, nid); + + ASSERT_EQUAL(visited.size(), m_transitions.size(), ()); + } + else + { + nid = it->second; + } + + m_transitions[id][i] = nid; + } + } + + ASSERT(m_rejecting != 0, ("Rejecting state is not set")); +} + +LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors) + : LevenshteinDFA(MakeUniString(s), maxErrors) +{ +} + +bool LevenshteinDFA::IsValid(Position const & p) const +{ + return p.m_offset <= m_size && p.m_numErrors <= m_maxErrors; +} + +bool LevenshteinDFA::IsValid(State const & s) const +{ + for (auto const & p : s.m_positions) + { + if (!IsValid(p)) + return false; + } + return true; +} + +bool LevenshteinDFA::IsAccepting(Position const & p) const +{ + return m_size - p.m_offset <= m_maxErrors - p.m_numErrors; +} + +bool LevenshteinDFA::IsAccepting(State const & s) const +{ + for (auto const & p : s.m_positions) + { + if (IsAccepting(p)) + return true; + } + return false; +} + +size_t LevenshteinDFA::Move(size_t s, UniChar c) const +{ + ASSERT_GREATER(m_alphabet.size(), 0, ()); + ASSERT(is_sorted(m_alphabet.begin(), m_alphabet.end() - 1), ()); + + size_t i; + auto const it = lower_bound(m_alphabet.begin(), m_alphabet.end() - 1, c); + if (it == m_alphabet.end() - 1 || *it != c) + i = m_alphabet.size() - 1; + else + i = distance(m_alphabet.begin(), it); + + return m_transitions[s][i]; +} + +string DebugPrint(LevenshteinDFA::Position const & p) +{ + ostringstream os; + os << "Position [" << p.m_offset << ", " << static_cast(p.m_numErrors) << "]"; + return os.str(); +} + +string DebugPrint(LevenshteinDFA::State const & s) +{ + ostringstream os; + os << "State ["; + for (size_t i = 0; i < s.m_positions.size(); ++i) + { + os << DebugPrint(s.m_positions[i]); + if (i + 1 != s.m_positions.size()) + os << ", "; + } + return os.str(); +} +} // namespace strings diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp new file mode 100644 index 0000000000..2632a73135 --- /dev/null +++ b/base/levenshtein_dfa.hpp @@ -0,0 +1,124 @@ +#pragma once + +#include "base/string_utils.hpp" + +#include "std/cstdint.hpp" +#include "std/unordered_set.hpp" + +namespace strings +{ +// This class represends a DFA recognizing a language consisting of +// all words that are close enough to a given string, in terms of +// Levenshtein distance. The code is based on the work "Fast String +// Correction with Levenshtein-Automata" by Klaus U. Schulz and Stoyan +// Mihov. +// +// *NOTE* The class *IS* thread-safe. +class LevenshteinDFA +{ +public: + struct Position + { + Position() = default; + Position(size_t offset, uint8_t numErrors); + + bool SumsumedBy(Position const & rhs) const; + + bool operator<(Position const & rhs) const; + bool operator==(Position const & rhs) const; + + size_t m_offset = 0; + uint8_t m_numErrors = 0; + }; + + struct State + { + static State MakeStart(); + + void Normalize(); + inline void Clear() { m_positions.clear(); } + + inline bool operator<(State const & rhs) const { return m_positions < rhs.m_positions; } + + vector m_positions; + }; + + // An iterator to the current state in the DFA. + // + // *NOTE* The class *IS NOT* thread safe. Moreover, it should not be + // used after destruction of the corresponding DFA. + class Iterator + { + public: + Iterator & Move(UniChar c) + { + m_s = m_dfa.Move(m_s, c); + return *this; + } + + inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); } + + inline Iterator & Move(string const & s) + { + UniString us = MakeUniString(s); + return Move(us); + } + + template + Iterator & Move(It begin, It end) + { + for (; begin != end; ++begin) + Move(*begin); + return *this; + } + + bool Accepts() const { return m_dfa.IsAccepting(m_s); } + bool Rejects() const { return m_dfa.IsRejecting(m_s); } + + private: + friend class LevenshteinDFA; + + Iterator(LevenshteinDFA const & dfa) : m_s(0), m_dfa(dfa) {} + + size_t m_s = 0; + LevenshteinDFA const & m_dfa; + }; + + LevenshteinDFA(UniString const & s, uint8_t maxErrors); + LevenshteinDFA(string const & s, uint8_t maxErrors); + + inline Iterator Begin() const { return Iterator(*this); } + + size_t GetNumStates() const { return m_transitions.size(); } + size_t GetAlphabetSize() const { return m_alphabet.size(); } + +private: + friend class Iterator; + + size_t TransformChar(UniChar c) const; + + bool IsValid(Position const & p) const; + bool IsValid(State const & s) const; + + bool IsAccepting(Position const & p) const; + bool IsAccepting(State const & s) const; + inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; } + + inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); } + inline bool IsRejecting(size_t s) const { return s == m_rejecting; } + + size_t Move(size_t s, UniChar c) const; + + size_t const m_size; + uint8_t const m_maxErrors; + + vector m_alphabet; + + vector> m_transitions; + unordered_set m_accepting; + size_t m_rejecting; +}; + +string DebugPrint(LevenshteinDFA::Position const & p); +string DebugPrint(LevenshteinDFA::State const & s); +} // namespace strings