Add algorithm for computing Damerau–Levenshtein distance
This commit is contained in:
parent
034b8a75d1
commit
429de98836
|
@ -33,6 +33,7 @@ set(HEADER_FILES
|
|||
misc/multiarray.h
|
||||
misc/random.h
|
||||
misc/traits.h
|
||||
misc/levenshtein.h
|
||||
tests/testutils.h
|
||||
tests/cppunit.h
|
||||
tests/outputcheck.h
|
||||
|
@ -59,6 +60,7 @@ set(SRC_FILES
|
|||
io/misc.cpp
|
||||
math/math.cpp
|
||||
misc/random.cpp
|
||||
misc/levenshtein.cpp
|
||||
tests/testutils.cpp
|
||||
)
|
||||
|
||||
|
@ -73,6 +75,7 @@ set(TEST_SRC_FILES
|
|||
tests/argumentparsertests.cpp
|
||||
tests/traitstests.cpp
|
||||
tests/mathtests.cpp
|
||||
tests/misctests.cpp
|
||||
)
|
||||
|
||||
set(CMAKE_MODULE_FILES
|
||||
|
|
|
@ -0,0 +1,133 @@
|
|||
#include "./levenshtein.h"
|
||||
#include "./multiarray.h"
|
||||
|
||||
#include "../math/math.h"
|
||||
|
||||
#include <iostream>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
|
||||
using namespace std;
|
||||
|
||||
/*!
|
||||
* \namespace MiscUtilities
|
||||
* \brief The MiscUtilities namespace contains various utilities such as computing Damerau–Levenshtein distance
|
||||
* and *N*-dimensional arrays.
|
||||
*/
|
||||
namespace MiscUtilities {
|
||||
|
||||
/// \cond
|
||||
|
||||
/// \brief The DistanceArray is a 2D array which is allocated either on the stack or the heap.
|
||||
using DistanceArray = MultiArray<size_t, NoneOwningMultiArray, size_t, size_t>;
|
||||
|
||||
/// \brief Initializes to distance array.
|
||||
///
|
||||
/// For size1=5 and size2=4 it would look like this:
|
||||
/// ```
|
||||
/// 9 9 9 9 9 9
|
||||
/// 9 0 1 2 3 4
|
||||
/// 9 1 0 0 0 0
|
||||
/// 9 2 0 0 0 0
|
||||
/// 9 3 0 0 0 0
|
||||
/// 9 4 0 0 0 0
|
||||
/// 9 5 0 0 0 0
|
||||
/// ```
|
||||
void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2)
|
||||
{
|
||||
const auto maxDistance(size1 + size2);
|
||||
distanceArray.at(0, 0) = maxDistance;
|
||||
for (size_t i = 0; i <= size1; ++i) {
|
||||
distanceArray.at(i + 1, 1) = i;
|
||||
distanceArray.at(i + 1, 0) = maxDistance;
|
||||
}
|
||||
for (size_t i = 0; i <= size2; ++i) {
|
||||
distanceArray.at(1, i + 1) = i;
|
||||
distanceArray.at(0, i + 1) = maxDistance;
|
||||
}
|
||||
}
|
||||
|
||||
/// \brief Performs the actual Damerau–Levenshtein algorithm.
|
||||
/// \sa computeDamerauLevenshteinDistance() for more details on the algorithm.
|
||||
size_t performDamerauLevenshteinAlgorithm(
|
||||
DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
|
||||
{
|
||||
size_t dist1[std::numeric_limits<unsigned char>::max() + 1] = { 0 };
|
||||
for (size_t index1 = 1; index1 <= size1; ++index1) {
|
||||
size_t dist2 = 0;
|
||||
for (size_t index2 = 1; index2 <= size2; ++index2) {
|
||||
const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1);
|
||||
const size_t transposition1(dist1[static_cast<unsigned char>(str2[index2 - 1])]);
|
||||
const size_t transposition2(dist2);
|
||||
if (!substitution) {
|
||||
dist2 = index2;
|
||||
}
|
||||
// clang-format off
|
||||
distanceArray.at(index1 + 1, index2 + 1) = MathUtilities::min(
|
||||
distanceArray.at(index1, index2) + substitution, // substitution
|
||||
distanceArray.at(index1 + 1, index2) + 1, // insertion
|
||||
distanceArray.at(index1, index2 + 1) + 1, // deletion
|
||||
distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition
|
||||
);
|
||||
// clang-format on
|
||||
}
|
||||
dist1[static_cast<int>(str1[index1 - 1])] = index1;
|
||||
}
|
||||
return distanceArray.at(size1 + 1, size2 + 1);
|
||||
}
|
||||
|
||||
/// \brief Backs the distance array with a std::vector and performs the Damerau–Levenshtein algorithm.
/// \param distanceArray Specifies the (so far unbacked) distance array; its buffer is pointed to
///        storage which only lives until this function returns.
/// \param str1 Specifies the first byte sequence.
/// \param size1 Specifies the number of bytes in \a str1.
/// \param str2 Specifies the second byte sequence.
/// \param size2 Specifies the number of bytes in \a str2.
/// \returns Returns the result of performDamerauLevenshteinAlgorithm().
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // one element per cell of the distance array
    std::vector<size_t> heapStorage(distanceArray.totalSize());
    distanceArray.buffer() = heapStorage.data();

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
|
||||
|
||||
/// \brief Backs the distance array with a local fixed-size buffer and performs the Damerau–Levenshtein algorithm.
/// \param distanceArray Specifies the (so far unbacked) distance array; its buffer is pointed to
///        storage which only lives until this function returns.
/// \param str1 Specifies the first byte sequence.
/// \param size1 Specifies the number of bytes in \a str1.
/// \param str2 Specifies the second byte sequence.
/// \param size2 Specifies the number of bytes in \a str2.
/// \returns Returns the result of performDamerauLevenshteinAlgorithm().
/// \remarks The totalSize() of \a distanceArray mustn't exceed 128 *elements* (not bytes) because
///          the local buffer holds exactly 128 values of type size_t. This is not checked here.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnStack(
    DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
    // intentionally left uninitialized
    size_t localStorage[128];
    distanceArray.buffer() = localStorage;

    initDistanceArray(distanceArray, size1, size2);
    const size_t distance = performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
    return distance;
}
|
||||
|
||||
/// \endcond
|
||||
|
||||
/*!
|
||||
* \brief Computes Damerau–Levenshtein distance with adjacent transpositions.
|
||||
*
|
||||
* \returns Returns the number of editing steps required to turn \a str1 into \a str2.
|
||||
* The following operations are considered as editing steps:
|
||||
* - substitution: replace one character with another character
|
||||
* - insertion: insert one character at any position
|
||||
* - deletion: delete one character at any position
|
||||
* - transposition: swap any pair of adjacent characters
|
||||
*
|
||||
* \remarks
|
||||
* - Computing Optimal string alignment distance is a *different* thing.
|
||||
* - The algorithm operates on byte-level. So characters requiring more than one byte in
|
||||
* the used character encoding (eg. UTF-8 encoded German umlauts) are counted as multiple
|
||||
* characters (eg. substitution of those umlauts with non-umlauts requires 2 editing steps).
|
||||
* - The memory consumption of this algorithm is considerably. The required memory increases
|
||||
* with the product of \a size1 and \a size2. Pass only short words to this function!
|
||||
*/
|
||||
std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
|
||||
{
|
||||
// allocate distance array
|
||||
auto distanceArray(makeNoneOwningMultiArray<std::size_t>(size1 + 2, size2 + 2));
|
||||
if (distanceArray.totalSize() <= 128) {
|
||||
return performDamerauLevenshteinAlgorithmAllocatingOnStack(distanceArray, str1, size1, str2, size2);
|
||||
} else {
|
||||
return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distanceArray, str1, size1, str2, size2);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace MiscUtilities
|
|
@ -0,0 +1,25 @@
|
|||
#ifndef CPP_UTILITIES_LEVENSHTEIN_H
|
||||
#define CPP_UTILITIES_LEVENSHTEIN_H
|
||||
|
||||
#include "../global.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
|
||||
namespace MiscUtilities {
|
||||
|
||||
CPP_UTILITIES_EXPORT std::size_t computeDamerauLevenshteinDistance(const char *str1, std::size_t size1, const char *str2, std::size_t size2);
|
||||
|
||||
inline std::size_t computeDamerauLevenshteinDistance(const std::string &str1, const std::string &str2)
|
||||
{
|
||||
return computeDamerauLevenshteinDistance(str1.data(), str1.size(), str2.data(), str2.size());
|
||||
}
|
||||
|
||||
inline std::size_t computeDamerauLevenshteinDistance(const char *str1, const char *str2)
|
||||
{
|
||||
return computeDamerauLevenshteinDistance(str1, std::strlen(str1), str2, std::strlen(str2));
|
||||
}
|
||||
|
||||
} // namespace MiscUtilities
|
||||
|
||||
#endif // CPP_UTILITIES_LEVENSHTEIN_H
|
|
@ -0,0 +1,125 @@
|
|||
#include "../misc/levenshtein.h"
|
||||
#include "../misc/multiarray.h"
|
||||
#include "../tests/testutils.h"
|
||||
|
||||
#include <cppunit/TestFixture.h>
|
||||
#include <cppunit/extensions/HelperMacros.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace MiscUtilities;
|
||||
using namespace TestUtilities::Literals;
|
||||
|
||||
using namespace CPPUNIT_NS;
|
||||
|
||||
/*!
|
||||
* \brief The MiscTests class tests functions and classes from the misc directory.
|
||||
*/
|
||||
class MiscTests : public TestFixture {
|
||||
CPPUNIT_TEST_SUITE(MiscTests);
|
||||
CPPUNIT_TEST(testMultiArray);
|
||||
CPPUNIT_TEST(testLevenshtein);
|
||||
CPPUNIT_TEST_SUITE_END();
|
||||
|
||||
public:
|
||||
void setUp()
|
||||
{
|
||||
}
|
||||
void tearDown()
|
||||
{
|
||||
}
|
||||
|
||||
void testMultiArray();
|
||||
void testLevenshtein();
|
||||
};
|
||||
|
||||
CPPUNIT_TEST_SUITE_REGISTRATION(MiscTests);
|
||||
|
||||
void MiscTests::testMultiArray()
|
||||
{
|
||||
static_assert(decltype(makeMultiArray<char>(3))::dimensionCount() == 1, "dimension count 1D");
|
||||
static_assert(decltype(makeMultiArray<char>(3, 2))::dimensionCount() == 2, "dimension count 2D");
|
||||
static_assert(decltype(makeMultiArray<char>(3, 2, 3))::dimensionCount() == 3, "dimension count 3D");
|
||||
|
||||
auto array1d(makeMultiArray<char>(3));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, array1d.dimensionSize<0>());
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, array1d.totalSize());
|
||||
array1d.at(0) = 'a';
|
||||
array1d.at(1) = 'b';
|
||||
array1d.at(2) = 'c';
|
||||
CPPUNIT_ASSERT_EQUAL("abc"s, string(array1d.data(), 3));
|
||||
|
||||
auto array2d(makeMultiArray<char>(3, 2));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, array2d.dimensionSize<0>());
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, array2d.dimensionSize<1>());
|
||||
CPPUNIT_ASSERT_EQUAL(6_st, array2d.totalSize());
|
||||
const char *const data(array2d.data());
|
||||
array2d.at(0, 0) = 'a';
|
||||
array2d.at(0, 1) = 'b';
|
||||
array2d.at(1, 0) = 'c';
|
||||
array2d.at(1, 1) = 'd';
|
||||
array2d.at(2, 0) = 'e';
|
||||
array2d.at(2, 1) = 'f';
|
||||
CPPUNIT_ASSERT_EQUAL("abcdef"s, string(data, 6));
|
||||
|
||||
auto array3d(makeMultiArray<char>(3, 2, 3));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<0>());
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, array3d.dimensionSize<1>());
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, array3d.dimensionSize<2>());
|
||||
CPPUNIT_ASSERT_EQUAL(18_st, array3d.totalSize());
|
||||
array3d.at(0, 0, 0) = 'a';
|
||||
array3d.at(0, 0, 1) = 'b';
|
||||
array3d.at(0, 0, 2) = 'c';
|
||||
array3d.at(0, 1, 0) = 'd';
|
||||
array3d.at(0, 1, 1) = 'e';
|
||||
array3d.at(0, 1, 2) = 'f';
|
||||
array3d.at(1, 0, 0) = 'g';
|
||||
array3d.at(1, 0, 1) = 'h';
|
||||
array3d.at(1, 0, 2) = 'i';
|
||||
array3d.at(1, 1, 0) = 'j';
|
||||
array3d.at(1, 1, 1) = 'k';
|
||||
array3d.at(1, 1, 2) = 'l';
|
||||
array3d.at(2, 0, 0) = 'm';
|
||||
array3d.at(2, 0, 1) = 'n';
|
||||
array3d.at(2, 0, 2) = 'o';
|
||||
array3d.at(2, 1, 0) = 'p';
|
||||
array3d.at(2, 1, 1) = 'q';
|
||||
array3d.at(2, 1, 2) = 'r';
|
||||
CPPUNIT_ASSERT_EQUAL("abcdefghijklmnopqr"s, string(array3d.data(), 18));
|
||||
|
||||
auto stackMultiArray(makeFixedSizeMultiArray<char, 9>(3, 3));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<0>());
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, stackMultiArray.dimensionSize<1>());
|
||||
CPPUNIT_ASSERT_EQUAL(9_st, stackMultiArray.totalSize());
|
||||
stackMultiArray.at(0, 0) = 'a';
|
||||
stackMultiArray.at(0, 1) = 'b';
|
||||
stackMultiArray.at(0, 2) = 'c';
|
||||
stackMultiArray.at(1, 0) = 'd';
|
||||
stackMultiArray.at(1, 1) = 'e';
|
||||
stackMultiArray.at(1, 2) = 'f';
|
||||
stackMultiArray.at(2, 0) = 'g';
|
||||
stackMultiArray.at(2, 1) = 'h';
|
||||
stackMultiArray.at(2, 2) = 'i';
|
||||
CPPUNIT_ASSERT_EQUAL("abcdefghi"s, string(stackMultiArray.data(), 9));
|
||||
}
|
||||
|
||||
void MiscTests::testLevenshtein()
|
||||
{
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc"));
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("abc", "ab"));
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xzaby", "xbay"));
|
||||
CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("", ""));
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "ba"));
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xaby", "xbay"));
|
||||
CPPUNIT_ASSERT_EQUAL(0_st, computeDamerauLevenshteinDistance("abc", "abc"));
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("ab", "abc"));
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("ca", "abc"));
|
||||
CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("", "abcd"));
|
||||
CPPUNIT_ASSERT_EQUAL(4_st, computeDamerauLevenshteinDistance("abcd", ""));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "d"));
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("abcd", "bc"));
|
||||
CPPUNIT_ASSERT_EQUAL(3_st, computeDamerauLevenshteinDistance("abcd", "a"));
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("adb", "abc"));
|
||||
CPPUNIT_ASSERT_EQUAL(2_st, computeDamerauLevenshteinDistance("xxaxx", "xxäxx"));
|
||||
CPPUNIT_ASSERT_EQUAL(1_st, computeDamerauLevenshteinDistance("xxöxx", "xxäxx"));
|
||||
CPPUNIT_ASSERT_EQUAL(11_st, computeDamerauLevenshteinDistance("this is a long text", "this is too long for stack"));
|
||||
}
|
Loading…
Reference in New Issue