Add algorithm for computing Damerau–Levenshtein distance

This commit is contained in:
Martchus 2018-05-05 23:26:53 +02:00
parent 034b8a75d1
commit 429de98836
4 changed files with 286 additions and 0 deletions

View File

@ -33,6 +33,7 @@ set(HEADER_FILES
misc/multiarray.h
misc/random.h
misc/traits.h
misc/levenshtein.h
tests/testutils.h
tests/cppunit.h
tests/outputcheck.h
@ -59,6 +60,7 @@ set(SRC_FILES
io/misc.cpp
math/math.cpp
misc/random.cpp
misc/levenshtein.cpp
tests/testutils.cpp
)
@ -73,6 +75,7 @@ set(TEST_SRC_FILES
tests/argumentparsertests.cpp
tests/traitstests.cpp
tests/mathtests.cpp
tests/misctests.cpp
)
set(CMAKE_MODULE_FILES

133
misc/levenshtein.cpp Normal file
View File

@ -0,0 +1,133 @@
#include "./levenshtein.h"
#include "./multiarray.h"
#include "../math/math.h"
#include <iostream>
#include <limits>
#include <memory>
using namespace std;
/*!
* \namespace MiscUtilities
* \brief The MiscUtilities namespace contains various utilities such as computing the Damerau-Levenshtein distance
* and *N*-dimensional arrays.
*/
namespace MiscUtilities {
/// \cond
/// \brief The DistanceArray is a 2D array of size_t which is allocated either on the stack or the heap.
/// \remarks The storage is supplied from outside by assigning to buffer(); within this file that is
///          either a local array or the data of a std::vector.
using DistanceArray = MultiArray<size_t, NoneOwningMultiArray, size_t, size_t>;
/// \brief Initializes the borders of the distance array.
///
/// The cell (0, 0) and all further cells of the first row and the first column are set to
/// size1 + size2 which serves as upper bound for the distance. The second row and the second
/// column are filled with ascending values starting at zero. All other cells are left untouched.
///
/// For size1=5 and size2=4 it would look like this (untouched cells are shown as 0):
/// ```
/// 9 9 9 9 9 9
/// 9 0 1 2 3 4
/// 9 1 0 0 0 0
/// 9 2 0 0 0 0
/// 9 3 0 0 0 0
/// 9 4 0 0 0 0
/// 9 5 0 0 0 0
/// ```
void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2)
{
    const size_t upperBound = size1 + size2;
    distanceArray.at(0, 0) = upperBound;

    // walk along the first dimension: ascending values in column 1, upper bound in column 0
    for (size_t value = 0; value <= size1; ++value) {
        const size_t row = value + 1;
        distanceArray.at(row, 1) = value;
        distanceArray.at(row, 0) = upperBound;
    }

    // walk along the second dimension: ascending values in row 1, upper bound in row 0
    for (size_t value = 0; value <= size2; ++value) {
        const size_t column = value + 1;
        distanceArray.at(1, column) = value;
        distanceArray.at(0, column) = upperBound;
    }
}
/// \brief Performs the actual Damerau-Levenshtein algorithm.
/// \param distanceArray Specifies the distance array to operate on. Its borders must have been
///        set up via initDistanceArray() for \a size1 and \a size2 before calling this function.
/// \param str1 Specifies the first string (read byte-wise, \a size1 bytes).
/// \param size1 Specifies the number of bytes to read from \a str1.
/// \param str2 Specifies the second string (read byte-wise, \a size2 bytes).
/// \param size2 Specifies the number of bytes to read from \a str2.
/// \returns Returns the value of the cell (size1 + 1, size2 + 1) of \a distanceArray after all
///          cells have been computed.
/// \remarks Bytes are always converted to unsigned char before being used as table index. Using
///          a signed conversion would yield a negative index for bytes >= 0x80 on platforms
///          where char is signed.
/// \sa computeDamerauLevenshteinDistance() for more details on the algorithm.
size_t performDamerauLevenshteinAlgorithm(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // dist1[byte]: 1-based index of the last row (position in str1) processed so far which holds that byte; 0 if none
    size_t dist1[std::numeric_limits<unsigned char>::max() + 1] = { 0 };
    for (size_t index1 = 1; index1 <= size1; ++index1) {
        // dist2: 1-based index of the last column of the current row where both bytes matched; 0 if none
        size_t dist2 = 0;
        for (size_t index2 = 1; index2 <= size2; ++index2) {
            const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1);
            const size_t transposition1(dist1[static_cast<unsigned char>(str2[index2 - 1])]);
            // the value of dist2 from *before* a possible match in this column is required
            const size_t transposition2(dist2);
            if (!substitution) {
                dist2 = index2;
            }
            // clang-format off
            distanceArray.at(index1 + 1, index2 + 1) = MathUtilities::min(
                distanceArray.at(index1, index2) + substitution, // substitution
                distanceArray.at(index1 + 1, index2) + 1, // insertion
                distanceArray.at(index1, index2 + 1) + 1, // deletion
                distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition
            );
            // clang-format on
        }
        // index via unsigned char (like the lookup above) to stay within the bounds of dist1
        dist1[static_cast<unsigned char>(str1[index1 - 1])] = index1;
    }
    return distanceArray.at(size1 + 1, size2 + 1);
}
/// \brief Allocates the distance array on the heap and performs the Damerau-Levenshtein algorithm.
/// \remarks The storage is a std::vector local to this function. The buffer pointer stored in
///          \a distanceArray must therefore not be used after this function has returned.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // one element per cell of the distance array
    std::vector<size_t> heapStorage(distanceArray.totalSize());
    distanceArray.buffer() = heapStorage.data();

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
/// \brief Allocates the distance array on the stack and performs the Damerau-Levenshtein algorithm.
/// \remarks
/// - The totalSize() of \a distanceArray mustn't exceed 128 elements (the capacity of the local buffer).
/// - The storage is local to this function. The buffer pointer stored in \a distanceArray
///   must therefore not be used after this function has returned.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnStack(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // intentionally left uninitialized; the cells are written before they are read
    size_t stackStorage[128];
    distanceArray.buffer() = stackStorage;

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
/// \endcond
/*!
 * \brief Computes Damerau-Levenshtein distance with adjacent transpositions.
 *
 * \returns Returns the number of editing steps required to turn \a str1 into \a str2.
 * The following operations are considered as editing steps:
 * - substitution: replace one character with another character
 * - insertion: insert one character at any position
 * - deletion: delete one character at any position
 * - transposition: swap any pair of adjacent characters
 *
 * \remarks
 * - Computing Optimal string alignment distance is a *different* thing.
 * - The algorithm operates on byte-level. So characters requiring more than one byte in
 *   the used character encoding (eg. UTF-8 encoded German umlauts) are counted as multiple
 *   characters (eg. substitution of those umlauts with non-umlauts requires 2 editing steps).
 * - The memory consumption of this algorithm is considerable. The required memory increases
 *   with the product of \a size1 and \a size2. Pass only short words to this function!
 */
std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // the distance array needs two additional rows and columns for its borders
    auto distances(makeNoneOwningMultiArray<std::size_t>(size1 + 2, size2 + 2));

    // use the fixed-size stack buffer if the array fits into it and the heap otherwise
    const bool fitsOnStack = distances.totalSize() <= 128;
    if (!fitsOnStack) {
        return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distances, str1, size1, str2, size2);
    }
    return performDamerauLevenshteinAlgorithmAllocatingOnStack(distances, str1, size1, str2, size2);
}
} // namespace MiscUtilities

25
misc/levenshtein.h Normal file
View File

@ -0,0 +1,25 @@
#ifndef CPP_UTILITIES_LEVENSHTEIN_H
#define CPP_UTILITIES_LEVENSHTEIN_H
#include "../global.h"
#include <cstring>
#include <string>
namespace MiscUtilities {
CPP_UTILITIES_EXPORT std::size_t computeDamerauLevenshteinDistance(const char *str1, std::size_t size1, const char *str2, std::size_t size2);
/// \brief Computes the Damerau-Levenshtein distance between two std::string objects.
/// \remarks Forwards data and size of both strings to the overload taking pointers and sizes.
inline std::size_t computeDamerauLevenshteinDistance(const std::string &str1, const std::string &str2)
{
    const char *const first = str1.data();
    const char *const second = str2.data();
    return computeDamerauLevenshteinDistance(first, str1.size(), second, str2.size());
}
inline std::size_t computeDamerauLevenshteinDistance(const char *str1, const char *str2)
{
return computeDamerauLevenshteinDistance(str1, std::strlen(str1), str2, std::strlen(str2));
}
} // namespace MiscUtilities
#endif // CPP_UTILITIES_LEVENSHTEIN_H

125
tests/misctests.cpp Normal file
View File

@ -0,0 +1,125 @@
#include "../misc/levenshtein.h"
#include "../misc/multiarray.h"
#include "../tests/testutils.h"
#include <cppunit/TestFixture.h>
#include <cppunit/extensions/HelperMacros.h>
using namespace std;
using namespace MiscUtilities;
using namespace TestUtilities::Literals;
using namespace CPPUNIT_NS;
/*!
* \brief The MiscTests class tests functions and classes from the misc directory.
*
* The suite consists of the test cases testMultiArray() and testLevenshtein().
*/
class MiscTests : public TestFixture {
// declare the suite and the test methods belonging to it
CPPUNIT_TEST_SUITE(MiscTests);
CPPUNIT_TEST(testMultiArray);
CPPUNIT_TEST(testLevenshtein);
CPPUNIT_TEST_SUITE_END();
public:
// nothing to prepare
void setUp()
{
}
// nothing to clean up
void tearDown()
{
}
// test cases (defined below the class)
void testMultiArray();
void testLevenshtein();
};
CPPUNIT_TEST_SUITE_REGISTRATION(MiscTests);
/*!
* \brief Tests the multi-dimensional arrays created via makeMultiArray() and makeFixedSizeMultiArray().
*
* Checks the dimension count, the size of each dimension, the total size and that elements
* written via at() end up in the underlying data in the expected order (the last index varies
* fastest).
*/
void MiscTests::testMultiArray()
{
// the number of dimensions is known at compile-time
static_assert(decltype(makeMultiArray<char>(3))::dimensionCount() == 1, "dimension count 1D");
static_assert(decltype(makeMultiArray<char>(3, 2))::dimensionCount() == 2, "dimension count 2D");
static_assert(decltype(makeMultiArray<char>(3, 2, 3))::dimensionCount() == 3, "dimension count 3D");
// 1D array with 3 elements
auto array1d(makeMultiArray<char>(3));
CPPUNIT_ASSERT_EQUAL(3_st, array1d.dimensionSize<0>());
CPPUNIT_ASSERT_EQUAL(3_st, array1d.totalSize());
array1d.at(0) = 'a';
array1d.at(1) = 'b';
array1d.at(2) = 'c';
CPPUNIT_ASSERT_EQUAL("abc"s, string(array1d.data(), 3));
// 2D array with 3 x 2 elements
auto array2d(makeMultiArray<char>(3, 2));
CPPUNIT_ASSERT_EQUAL(3_st, array2d.dimensionSize<0>());
CPPUNIT_ASSERT_EQUAL(2_st, array2d.dimensionSize<1>());
CPPUNIT_ASSERT_EQUAL(6_st, array2d.totalSize());
// obtain the data pointer before writing; the writes below must be visible through it
const char *const data(array2d.data());
array2d.at(0, 0) = 'a';
array2d.at(0, 1) = 'b';
array2d.at(1, 0) = 'c';
array2d.at(1, 1) = 'd';
array2d.at(2, 0) = 'e';
array2d.at(2, 1) = 'f';
CPPUNIT_ASSERT_EQUAL("abcdef"s, string(data, 6));
// 3D array with 3 x 2 x 3 elements
auto array3d(makeMultiArray<char>(3, 2, 3));
CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<0>());
CPPUNIT_ASSERT_EQUAL(2_st, array3d.dimensionSize<1>());
CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<2>());
CPPUNIT_ASSERT_EQUAL(18_st, array3d.totalSize());
array3d.at(0, 0, 0) = 'a';
array3d.at(0, 0, 1) = 'b';
array3d.at(0, 0, 2) = 'c';
array3d.at(0, 1, 0) = 'd';
array3d.at(0, 1, 1) = 'e';
array3d.at(0, 1, 2) = 'f';
array3d.at(1, 0, 0) = 'g';
array3d.at(1, 0, 1) = 'h';
array3d.at(1, 0, 2) = 'i';
array3d.at(1, 1, 0) = 'j';
array3d.at(1, 1, 1) = 'k';
array3d.at(1, 1, 2) = 'l';
array3d.at(2, 0, 0) = 'm';
array3d.at(2, 0, 1) = 'n';
array3d.at(2, 0, 2) = 'o';
array3d.at(2, 1, 0) = 'p';
array3d.at(2, 1, 1) = 'q';
array3d.at(2, 1, 2) = 'r';
CPPUNIT_ASSERT_EQUAL("abcdefghijklmnopqr"s, string(array3d.data(), 18));
// fixed-size variant with 3 x 3 elements (presumably stack-allocated as the variable name suggests)
auto stackMultiArray(makeFixedSizeMultiArray<char, 9>(3, 3));
CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<0>());
CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<1>());
CPPUNIT_ASSERT_EQUAL(9_st, stackMultiArray.totalSize());
stackMultiArray.at(0, 0) = 'a';
stackMultiArray.at(0, 1) = 'b';
stackMultiArray.at(0, 2) = 'c';
stackMultiArray.at(1, 0) = 'd';
stackMultiArray.at(1, 1) = 'e';
stackMultiArray.at(1, 2) = 'f';
stackMultiArray.at(2, 0) = 'g';
stackMultiArray.at(2, 1) = 'h';
stackMultiArray.at(2, 2) = 'i';
CPPUNIT_ASSERT_EQUAL("abcdefghi"s, string(stackMultiArray.data(), 9));
}
/*!
* \brief Tests computeDamerauLevenshteinDistance() using the overload for null-terminated strings.
*
* Covers insertion, deletion, substitution and transposition, empty strings, strings
* containing non-ASCII characters and an input pair which is too big for the stack buffer.
*/
void MiscTests::testLevenshtein()
{
// single insertion / deletion
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc"));
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("abc", "ab"));
// deletion combined with transposition
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xzaby", "xbay"));
// both strings empty
CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("", ""));
// single transposition
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "ba"));
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xaby", "xbay"));
// identical strings
CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("abc", "abc"));
// NOTE: same inputs as the first assertion of this function
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc"));
// transposition followed by an insertion between the swapped characters; the optimal string
// alignment distance would be 3 here
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("ca", "abc"));
// one of the strings is empty
CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("", "abcd"));
CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("abcd", ""));
// multiple deletions
CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "d"));
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("abcd", "bc"));
CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "a"));
// deletion and insertion
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("adb", "abc"));
// non-ASCII characters: the expected values assume this file is UTF-8 encoded so each umlaut
// occupies two bytes (the two umlauts presumably differ only in their second byte)
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xxaxx", "xxäxx"));
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xxöxx", "xxäxx"));
// (19 + 2) * (26 + 2) cells exceed the 128 element limit of the stack buffer so this is
// expected to take the heap code path
CPPUNIT_ASSERT_EQUAL(11_st, computeDamerauLevenshteinDistance("this is a long text", "this is too long for stack"));
}