From 429de988363ae5effde9cb36ae88b0e0a516fe3b Mon Sep 17 00:00:00 2001 From: Martchus Date: Sat, 5 May 2018 23:26:53 +0200 Subject: [PATCH] =?UTF-8?q?Add=20algorithm=20for=20computing=20Damerau?= =?UTF-8?q?=E2=80=93Levenshtein=20distance?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 3 + misc/levenshtein.cpp | 133 +++++++++++++++++++++++++++++++++++++++++++ misc/levenshtein.h | 25 ++++++++ tests/misctests.cpp | 125 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 286 insertions(+) create mode 100644 misc/levenshtein.cpp create mode 100644 misc/levenshtein.h create mode 100644 tests/misctests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 43ed092..7c82f39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ set(HEADER_FILES misc/multiarray.h misc/random.h misc/traits.h + misc/levenshtein.h tests/testutils.h tests/cppunit.h tests/outputcheck.h @@ -59,6 +60,7 @@ set(SRC_FILES io/misc.cpp math/math.cpp misc/random.cpp + misc/levenshtein.cpp tests/testutils.cpp ) @@ -73,6 +75,7 @@ set(TEST_SRC_FILES tests/argumentparsertests.cpp tests/traitstests.cpp tests/mathtests.cpp + tests/misctests.cpp ) set(CMAKE_MODULE_FILES diff --git a/misc/levenshtein.cpp b/misc/levenshtein.cpp new file mode 100644 index 0000000..fe05f99 --- /dev/null +++ b/misc/levenshtein.cpp @@ -0,0 +1,133 @@ +#include "./levenshtein.h" +#include "./multiarray.h" + +#include "../math/math.h" + +#include +#include +#include + +using namespace std; + +/*! + * \namespace MiscUtilities + * \brief The MiscUtilities namespace contains various utilities such as computing Damerau–Levenshtein distance + * and *N*-dimensional arrays. + */ +namespace MiscUtilities { + +/// \cond + +/// \brief The DistanceArray is a 2D array which is allocated either on the stack or the heap. +using DistanceArray = MultiArray; + +/// \brief Initializes to distance array. +/// +/// For size1=5 and size2=4 it would look like this: +/// ``` +/// 9 9 9 9 9 9 +/// 9 0 1 2 3 4 +/// 9 1 0 0 0 0 +/// 9 2 0 0 0 0 +/// 9 3 0 0 0 0 +/// 9 4 0 0 0 0 +/// 9 5 0 0 0 0 +/// ``` +void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2) +{ + const auto maxDistance(size1 + size2); + distanceArray.at(0, 0) = maxDistance; + for (size_t i = 0; i <= size1; ++i) { + distanceArray.at(i + 1, 1) = i; + distanceArray.at(i + 1, 0) = maxDistance; + } + for (size_t i = 0; i <= size2; ++i) { + distanceArray.at(1, i + 1) = i; + distanceArray.at(0, i + 1) = maxDistance; + } +} + +/// \brief Performs the actual Damerau–Levenshtein algorithm. +/// \sa computeDamerauLevenshteinDistance() for more details on the algorithm. +size_t performDamerauLevenshteinAlgorithm( + DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2) +{ + size_t dist1[std::numeric_limits::max() + 1] = { 0 }; + for (size_t index1 = 1; index1 <= size1; ++index1) { + size_t dist2 = 0; + for (size_t index2 = 1; index2 <= size2; ++index2) { + const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1); + const size_t transposition1(dist1[static_cast(str2[index2 - 1])]); + const size_t transposition2(dist2); + if (!substitution) { + dist2 = index2; + } + // clang-format off + distanceArray.at(index1 + 1, index2 + 1) = MathUtilities::min( + distanceArray.at(index1, index2) + substitution, // substitution + distanceArray.at(index1 + 1, index2) + 1, // insertion + distanceArray.at(index1, index2 + 1) + 1, // deletion + distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition + ); + // clang-format on + } + dist1[static_cast(str1[index1 - 1])] = index1; + } + return distanceArray.at(size1 + 1, size2 + 1); +} + +/// \brief Allocates the distance array on the heap and performs the Damerau–Levenshtein algorithm. +template +size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap( + DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2) +{ + std::vector buffer(distanceArray.totalSize()); + distanceArray.buffer() = buffer.data(); + initDistanceArray(distanceArray, size1, size2); + return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2); +} + +/// \brief Allocates the distance array on the stack and performs the Damerau–Levenshtein algorithm. +/// \remarks The totalSize() of \a distanceArray mustn't exceed 128 byte. +template +size_t performDamerauLevenshteinAlgorithmAllocatingOnStack( + DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2) +{ + size_t buffer[128]; + distanceArray.buffer() = buffer; + initDistanceArray(distanceArray, size1, size2); + return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2); +} + +/// \endcond + +/*! + * \brief Computes Damerau–Levenshtein distance with adjacent transpositions. + * + * \returns Returns the number of editing steps required to turn \a str1 into \a str2. + * The following operations are considered as editing steps: + * - substitution: replace one character with another character + * - insertion: insert one character at any position + * - deletion: delete one character at any position + * - transposition: swap any pair of adjacent characters + * + * \remarks + * - Computing Optimal string alignment distance is a *different* thing. + * - The algorithm operates on byte-level. So characters requiring more than one byte in + * the used character encoding (eg. UTF-8 encoded German umlauts) are counted as multiple + * characters (eg. substitution of those umlauts with non-umlauts requires 2 editing steps). + * - The memory consumption of this algorithm is considerably. The required memory increases + * with the product of \a size1 and \a size2. Pass only short words to this function! + */ +std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2) +{ + // allocate distance array + auto distanceArray(makeNoneOwningMultiArray(size1 + 2, size2 + 2)); + if (distanceArray.totalSize() <= 128) { + return performDamerauLevenshteinAlgorithmAllocatingOnStack(distanceArray, str1, size1, str2, size2); + } else { + return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distanceArray, str1, size1, str2, size2); + } +} + +} // namespace MiscUtilities diff --git a/misc/levenshtein.h b/misc/levenshtein.h new file mode 100644 index 0000000..5761c55 --- /dev/null +++ b/misc/levenshtein.h @@ -0,0 +1,25 @@ +#ifndef CPP_UTILITIES_LEVENSHTEIN_H +#define CPP_UTILITIES_LEVENSHTEIN_H + +#include "../global.h" + +#include +#include + +namespace MiscUtilities { + +CPP_UTILITIES_EXPORT std::size_t computeDamerauLevenshteinDistance(const char *str1, std::size_t size1, const char *str2, std::size_t size2); + +inline std::size_t computeDamerauLevenshteinDistance(const std::string &str1, const std::string &str2) +{ + return computeDamerauLevenshteinDistance(str1.data(), str1.size(), str2.data(), str2.size()); +} + +inline std::size_t computeDamerauLevenshteinDistance(const char *str1, const char *str2) +{ + return computeDamerauLevenshteinDistance(str1, std::strlen(str1), str2, std::strlen(str2)); +} + +} // namespace MiscUtilities + +#endif // CPP_UTILITIES_LEVENSHTEIN_H diff --git a/tests/misctests.cpp b/tests/misctests.cpp new file mode 100644 index 0000000..398c675 --- /dev/null +++ b/tests/misctests.cpp @@ -0,0 +1,125 @@ +#include "../misc/levenshtein.h" +#include "../misc/multiarray.h" +#include "../tests/testutils.h" + +#include +#include + +using namespace std; +using namespace MiscUtilities; +using namespace TestUtilities::Literals; + +using namespace CPPUNIT_NS; + +/*! + * \brief The MiscTests class tests functions and classes from the misc directory. + */ +class MiscTests : public TestFixture { + CPPUNIT_TEST_SUITE(MiscTests); + CPPUNIT_TEST(testMultiArray); + CPPUNIT_TEST(testLevenshtein); + CPPUNIT_TEST_SUITE_END(); + +public: + void setUp() + { + } + void tearDown() + { + } + + void testMultiArray(); + void testLevenshtein(); +}; + +CPPUNIT_TEST_SUITE_REGISTRATION(MiscTests); + +void MiscTests::testMultiArray() +{ + static_assert(decltype(makeMultiArray(3))::dimensionCount() == 1, "dimension count 1D"); + static_assert(decltype(makeMultiArray(3, 2))::dimensionCount() == 2, "dimension count 2D"); + static_assert(decltype(makeMultiArray(3, 2, 3))::dimensionCount() == 3, "dimension count 3D"); + + auto array1d(makeMultiArray(3)); + CPPUNIT_ASSERT_EQUAL(3_st, array1d.dimensionSize<0>()); + CPPUNIT_ASSERT_EQUAL(3_st, array1d.totalSize()); + array1d.at(0) = 'a'; + array1d.at(1) = 'b'; + array1d.at(2) = 'c'; + CPPUNIT_ASSERT_EQUAL("abc"s, string(array1d.data(), 3)); + + auto array2d(makeMultiArray(3, 2)); + CPPUNIT_ASSERT_EQUAL(3_st, array2d.dimensionSize<0>()); + CPPUNIT_ASSERT_EQUAL(2_st, array2d.dimensionSize<1>()); + CPPUNIT_ASSERT_EQUAL(6_st, array2d.totalSize()); + const char *const data(array2d.data()); + array2d.at(0, 0) = 'a'; + array2d.at(0, 1) = 'b'; + array2d.at(1, 0) = 'c'; + array2d.at(1, 1) = 'd'; + array2d.at(2, 0) = 'e'; + array2d.at(2, 1) = 'f'; + CPPUNIT_ASSERT_EQUAL("abcdef"s, string(data, 6)); + + auto array3d(makeMultiArray(3, 2, 3)); + CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<0>()); + CPPUNIT_ASSERT_EQUAL(2_st, array3d.dimensionSize<1>()); + CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<2>()); + CPPUNIT_ASSERT_EQUAL(18_st, array3d.totalSize()); + array3d.at(0, 0, 0) = 'a'; + array3d.at(0, 0, 1) = 'b'; + array3d.at(0, 0, 2) = 'c'; + array3d.at(0, 1, 0) = 'd'; + array3d.at(0, 1, 1) = 'e'; + array3d.at(0, 1, 2) = 'f'; + array3d.at(1, 0, 0) = 'g'; + array3d.at(1, 0, 1) = 'h'; + array3d.at(1, 0, 2) = 'i'; + array3d.at(1, 1, 0) = 'j'; + array3d.at(1, 1, 1) = 'k'; + array3d.at(1, 1, 2) = 'l'; + array3d.at(2, 0, 0) = 'm'; + array3d.at(2, 0, 1) = 'n'; + array3d.at(2, 0, 2) = 'o'; + array3d.at(2, 1, 0) = 'p'; + array3d.at(2, 1, 1) = 'q'; + array3d.at(2, 1, 2) = 'r'; + CPPUNIT_ASSERT_EQUAL("abcdefghijklmnopqr"s, string(array3d.data(), 18)); + + auto stackMultiArray(makeFixedSizeMultiArray(3, 3)); + CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<0>()); + CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<1>()); + CPPUNIT_ASSERT_EQUAL(9_st, stackMultiArray.totalSize()); + stackMultiArray.at(0, 0) = 'a'; + stackMultiArray.at(0, 1) = 'b'; + stackMultiArray.at(0, 2) = 'c'; + stackMultiArray.at(1, 0) = 'd'; + stackMultiArray.at(1, 1) = 'e'; + stackMultiArray.at(1, 2) = 'f'; + stackMultiArray.at(2, 0) = 'g'; + stackMultiArray.at(2, 1) = 'h'; + stackMultiArray.at(2, 2) = 'i'; + CPPUNIT_ASSERT_EQUAL("abcdefghi"s, string(stackMultiArray.data(), 9)); +} + +void MiscTests::testLevenshtein() +{ + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc")); + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("abc", "ab")); + CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xzaby", "xbay")); + CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("", "")); + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "ba")); + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xaby", "xbay")); + CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("abc", "abc")); + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc")); + CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("ca", "abc")); + CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("", "abcd")); + CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("abcd", "")); + CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "d")); + CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("abcd", "bc")); + CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "a")); + CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("adb", "abc")); + CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xxaxx", "xxäxx")); + CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xxöxx", "xxäxx")); + CPPUNIT_ASSERT_EQUAL(11_st, computeDamerauLevenshteinDistance("this is a long text", "this is too long for stack")); +}