cpp-utilities/misc/levenshtein.cpp

137 lines
5.6 KiB
C++
Raw Permalink Normal View History

#include "./levenshtein.h"
#include "./math.h"
#include "./multiarray.h"
#include <iostream>
#include <limits>
#include <memory>
using namespace std;
namespace CppUtilities {
/// \cond
/// \brief The DistanceArray is a 2D array which is allocated either on the stack or the heap.
using DistanceArray = MultiArray<size_t, NoneOwningMultiArray, size_t, size_t>;
/// \brief Initializes the distance array.
///
/// For size1=5 and size2=4 it would look like this:
/// ```
/// 9 9 9 9 9 9
/// 9 0 1 2 3 4
/// 9 1 0 0 0 0
/// 9 2 0 0 0 0
/// 9 3 0 0 0 0
/// 9 4 0 0 0 0
/// 9 5 0 0 0 0
/// ```
void initDistanceArray(DistanceArray &distanceArray, const size_t size1, const size_t size2)
{
const auto maxDistance(size1 + size2);
// ignore warning about null pointer dereference for now (which is *likely* not correct)
#ifdef __GNUC__
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnull-dereference"
#endif
distanceArray.at(0, 0) = maxDistance;
#ifdef __GNUC__
#pragma GCC diagnostic pop
#endif
for (size_t i = 0; i <= size1; ++i) {
distanceArray.at(i + 1, 1) = i;
distanceArray.at(i + 1, 0) = maxDistance;
}
for (size_t i = 0; i <= size2; ++i) {
distanceArray.at(1, i + 1) = i;
distanceArray.at(0, i + 1) = maxDistance;
}
}
/// \brief Performs the actual DamerauLevenshtein algorithm.
/// \sa computeDamerauLevenshteinDistance() for more details on the algorithm.
size_t performDamerauLevenshteinAlgorithm(
DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
size_t dist1[std::numeric_limits<unsigned char>::max() + 1] = { 0 };
for (size_t index1 = 1; index1 <= size1; ++index1) {
size_t dist2 = 0;
for (size_t index2 = 1; index2 <= size2; ++index2) {
const size_t substitution((str1[index1 - 1] == str2[index2 - 1]) ? 0 : 1);
const size_t transposition1(dist1[static_cast<unsigned char>(str2[index2 - 1])]);
const size_t transposition2(dist2);
if (!substitution) {
dist2 = index2;
}
// clang-format off
distanceArray.at(index1 + 1, index2 + 1) = CppUtilities::min(
distanceArray.at(index1, index2) + substitution, // substitution
distanceArray.at(index1 + 1, index2) + 1, // insertion
distanceArray.at(index1, index2 + 1) + 1, // deletion
distanceArray.at(transposition1, transposition2) + (index1 - transposition1 - 1) + 1 + (index2 - transposition2 - 1) // transposition
);
// clang-format on
}
dist1[static_cast<int>(str1[index1 - 1])] = index1;
}
return distanceArray.at(size1 + 1, size2 + 1);
}
/// \brief Allocates the distance array on the heap and performs the DamerauLevenshtein algorithm.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnHeap(
DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
std::vector<size_t> buffer(distanceArray.totalSize());
distanceArray.buffer() = buffer.data();
initDistanceArray(distanceArray, size1, size2);
return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
}
/// \brief Allocates the distance array on the stack and performs the DamerauLevenshtein algorithm.
/// \remarks The totalSize() of \a distanceArray mustn't exceed 128 byte.
template <typename DistanceArray>
size_t performDamerauLevenshteinAlgorithmAllocatingOnStack(
DistanceArray &distanceArray, const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
size_t buffer[128] = { 0 };
distanceArray.buffer() = buffer;
initDistanceArray(distanceArray, size1, size2);
return performDamerauLevenshteinAlgorithm(distanceArray, str1, size1, str2, size2);
}
/// \endcond
/*!
* \brief Computes DamerauLevenshtein distance with adjacent transpositions.
*
* \returns Returns the number of editing steps required to turn \a str1 into \a str2.
* The following operations are considered as editing steps:
* - substitution: replace one character with another character
* - insertion: insert one character at any position
* - deletion: delete one character at any position
* - transposition: swap any pair of adjacent characters
*
* \remarks
* - Computing "Optimal string alignment distance" is a *different* thing.
* - The algorithm operates on byte-level. So characters requiring more than one byte in
* the used character encoding (e.g. UTF-8 encoded German umlauts) are counted as multiple
* characters (e.g. substitution of those umlauts with non-umlauts requires 2 editing steps).
* - The memory consumption of this algorithm is considerably. The required memory increases
* with the product of \a size1 and \a size2. Pass only short words to this function!
*/
std::size_t computeDamerauLevenshteinDistance(const char *const str1, const size_t size1, const char *const str2, const size_t size2)
{
// allocate distance array
auto distanceArray(makeNoneOwningMultiArray<std::size_t>(size1 + 2, size2 + 2));
if (distanceArray.totalSize() <= 128) {
return performDamerauLevenshteinAlgorithmAllocatingOnStack(distanceArray, str1, size1, str2, size2);
} else {
return performDamerauLevenshteinAlgorithmAllocatingOnHeap(distanceArray, str1, size1, str2, size2);
}
}
} // namespace CppUtilities