cpp-utilities/conversion/stringconversion.cpp

435 lines
16 KiB
C++
Raw Normal View History

2015-09-06 20:19:09 +02:00
#include "./stringconversion.h"
2015-04-22 18:36:40 +02:00
#ifndef CPP_UTILITIES_NO_THREAD_LOCAL
#include "../feature_detection/features.h"
#else
#define CPP_UTILITIES_THREAD_LOCAL
#endif
#include <cmath>
2019-03-11 17:47:17 +01:00
#include <cstdlib>
2017-05-01 03:13:11 +02:00
#include <iomanip>
#include <limits>
2017-02-05 21:00:52 +01:00
#include <memory>
2015-04-22 18:36:40 +02:00
#include <sstream>
#include <errno.h>
2017-05-01 03:13:11 +02:00
#include <iconv.h>
2015-04-22 18:36:40 +02:00
#ifdef PLATFORM_WINDOWS
#include <windows.h>
#endif
2015-04-22 18:36:40 +02:00
using namespace std;
namespace CppUtilities {
2015-04-22 18:36:40 +02:00
/// \cond
2017-05-01 03:13:11 +02:00
struct Keep {
size_t operator()(size_t value)
{
return value;
}
};
struct Double {
size_t operator()(size_t value)
{
return value + value;
}
};
struct Half {
size_t operator()(size_t value)
{
return value / 2;
}
};
struct Factor {
2017-05-01 03:13:11 +02:00
Factor(float factor)
: factor(factor){};
size_t operator()(size_t value)
{
2021-03-19 23:09:01 +01:00
return static_cast<size_t>(static_cast<float>(value) * factor);
2017-05-01 03:13:11 +02:00
}
float factor;
};
2017-05-01 03:13:11 +02:00
template <class OutputSizeHint> class ConversionDescriptor {
public:
2017-05-01 03:13:11 +02:00
ConversionDescriptor(const char *fromCharset, const char *toCharset)
: m_ptr(iconv_open(toCharset, fromCharset))
, m_outputSizeHint(OutputSizeHint())
{
2017-05-01 03:13:11 +02:00
if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
throw ConversionException("Unable to allocate descriptor for character set conversion.");
}
}
2017-05-01 03:13:11 +02:00
ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint)
: m_ptr(iconv_open(toCharset, fromCharset))
, m_outputSizeHint(outputSizeHint)
{
2017-05-01 03:13:11 +02:00
if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
throw ConversionException("Unable to allocate descriptor for character set conversion.");
}
}
~ConversionDescriptor()
{
iconv_close(m_ptr);
}
public:
StringData convertString(const char *inputBuffer, size_t inputBufferSize)
{
// setup input and output buffer
size_t inputBytesLeft = inputBufferSize;
size_t outputSize = m_outputSizeHint(inputBufferSize);
size_t outputBytesLeft = outputSize;
char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
size_t bytesWritten;
char *currentOutputOffset = outputBuffer;
2017-05-01 03:13:11 +02:00
for (;; currentOutputOffset = outputBuffer + bytesWritten) {
bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
2017-05-01 03:13:11 +02:00
if (bytesWritten == static_cast<size_t>(-1)) {
if (errno == EINVAL) {
// ignore incomplete multibyte sequence in the input
bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
break;
2017-05-01 03:13:11 +02:00
} else if (errno == E2BIG) {
// output buffer has no more room for next converted character
bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
} else /*if(errno == EILSEQ)*/ {
// invalid multibyte sequence in the input
free(outputBuffer);
throw ConversionException("Invalid multibyte sequence in the input.");
}
} else {
// conversion completed without (further) errors
break;
}
}
2016-11-18 16:41:46 +01:00
return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
}
private:
iconv_t m_ptr;
OutputSizeHint m_outputSizeHint;
};
/// \endcond
/*!
* \brief Converts the specified string from one character set to another.
* \remarks
* - The term "size" referes here always to the actual number of bytes and not to the number of characters
* (eg. the size of the UTF-8 string "ö" is 2 and not 1).
* - The expected size of the output buffer can be specified via \a outputBufferSizeFactor. This hint helps
* to reduce buffer reallocations during the conversion (eg. for the conversion from Latin-1 to UTF-16
* the factor would be 2, for the conversion from UTF-16 to Latin-1 the factor would be 0.5).
*/
2017-05-01 03:13:11 +02:00
StringData convertString(
const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
{
return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified UTF-8 string to UTF-16 (little-endian).
*/
StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified UTF-16 (little-endian) string to UTF-8.
*/
StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified UTF-8 string to UTF-16 (big-endian).
*/
StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified UTF-16 (big-endian) string to UTF-8.
*/
StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified Latin-1 string to UTF-8.
*/
StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified UTF-8 string to Latin-1.
*/
StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
{
CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("UTF-8", "ISO-8859-1");
return descriptor.convertString(inputBuffer, inputBufferSize);
}
#ifdef PLATFORM_WINDOWS
/*!
* \brief Converts the specified multi-byte string to a wide string using the WinAPI.
* \remarks
* - Only available under Windows.
* - If \a inputBufferSize is -1, \a inputBuffer is considered null-terminated.
*/
WideStringData convertMultiByteToWide(std::error_code &ec, const char *inputBuffer, int inputBufferSize)
{
// calculate required size
WideStringData widePath;
widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, nullptr, 0);
if (widePath.second <= 0) {
ec = std::error_code(GetLastError(), std::system_category());
return widePath;
}
// do the actual conversion
widePath.first = make_unique<wchar_t[]>(static_cast<size_t>(widePath.second));
widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, widePath.first.get(), widePath.second);
if (widePath.second <= 0) {
ec = std::error_code(GetLastError(), std::system_category());
widePath.first.reset();
}
return widePath;
}
/*!
* \brief Converts the specified multi-byte string to a wide string using the WinAPI.
* \remarks Only available under Windows.
*/
WideStringData convertMultiByteToWide(std::error_code &ec, const std::string &inputBuffer)
{
2018-10-06 16:18:53 +02:00
return convertMultiByteToWide(
ec, inputBuffer.data(), inputBuffer.size() < (numeric_limits<int>::max() - 1) ? static_cast<int>(inputBuffer.size() + 1) : -1);
}
/*!
* \brief Converts the specified multi-byte string to a wide string using the WinAPI.
* \remarks
* - Only available under Windows.
* - If \a inputBufferSize is -1, \a inputBuffer is considered null-terminated.
*/
WideStringData convertMultiByteToWide(const char *inputBuffer, int inputBufferSize)
{
std::error_code ec;
return convertMultiByteToWide(ec, inputBuffer, inputBufferSize);
}
/*!
* \brief Converts the specified multi-byte string to a wide string using the WinAPI.
* \remarks Only available under Windows.
*/
WideStringData convertMultiByteToWide(const std::string &inputBuffer)
{
std::error_code ec;
return convertMultiByteToWide(ec, inputBuffer);
}
#endif
2015-04-22 18:36:40 +02:00
/*!
* \brief Truncates all characters after the first occurrence of the
2016-01-18 23:41:30 +01:00
* specified \a terminationChar and the termination character as well.
2015-04-22 18:36:40 +02:00
*/
void truncateString(string &str, char terminationChar)
{
string::size_type firstNullByte = str.find(terminationChar);
2017-05-01 03:13:11 +02:00
if (firstNullByte != string::npos) {
2015-04-22 18:36:40 +02:00
str.resize(firstNullByte);
}
}
/*!
* \brief Converts the specified data size in byte to its equivalent std::string representation.
*
2015-07-18 01:11:12 +02:00
* The unit with appropriate binary prefix will be appended.
2015-04-22 18:36:40 +02:00
*/
2019-03-13 19:00:37 +01:00
string dataSizeToString(std::uint64_t sizeInByte, bool includeByte)
2015-04-22 18:36:40 +02:00
{
stringstream res(stringstream::in | stringstream::out);
res.setf(ios::fixed, ios::floatfield);
res << setprecision(2);
if (sizeInByte < 1024LL) {
res << sizeInByte << " bytes";
} else if (sizeInByte < 1048576LL) {
2015-07-18 01:11:12 +02:00
res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
2015-04-22 18:36:40 +02:00
} else if (sizeInByte < 1073741824LL) {
2015-07-18 01:11:12 +02:00
res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
2015-04-22 18:36:40 +02:00
} else if (sizeInByte < 1099511627776LL) {
2015-07-18 01:11:12 +02:00
res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
2015-04-22 18:36:40 +02:00
} else {
2015-07-18 01:11:12 +02:00
res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
2015-04-22 18:36:40 +02:00
}
2017-05-01 03:13:11 +02:00
if (includeByte && sizeInByte > 1024LL) {
res << ' ' << '(' << sizeInByte << " byte)";
}
2015-04-22 18:36:40 +02:00
return res.str();
}
/*!
* \brief Converts the specified bitrate in kbit/s to its equivalent std::string representation.
*
2015-07-18 01:11:12 +02:00
* The unit with appropriate binary prefix will be appended.
2015-04-22 18:36:40 +02:00
*
2015-07-18 01:11:12 +02:00
* \param bitrateInKbitsPerSecond Specifies the bitrate in kbit/s.
* \param useIecBinaryPrefixes Indicates whether IEC binary prefixes should be used (eg. KiB/s).
2015-04-22 18:36:40 +02:00
*
* \sa <a href="http://en.wikipedia.org/wiki/Binary_prefix">Binary prefix - Wikipedia</a>
*/
string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
stringstream res(stringstream::in | stringstream::out);
res << setprecision(3);
if (std::isnan(bitrateInKbitsPerSecond)) {
res << "indeterminable";
} else if (useIecBinaryPrefixes) {
2015-04-22 18:36:40 +02:00
if (bitrateInKbitsPerSecond < 8.0) {
res << (bitrateInKbitsPerSecond * 125.0) << " byte/s";
} else if (bitrateInKbitsPerSecond < 8000.0) {
res << (bitrateInKbitsPerSecond * 0.125) << " KiB/s";
} else if (bitrateInKbitsPerSecond < 8000000.0) {
res << (bitrateInKbitsPerSecond * 0.000125) << " MiB/s";
} else {
res << (bitrateInKbitsPerSecond * 0.000000125) << " GiB/s";
}
} else {
if (bitrateInKbitsPerSecond < 1.0) {
res << (bitrateInKbitsPerSecond * 1000.0) << " bit/s";
} else if (bitrateInKbitsPerSecond < 1000.0) {
res << (bitrateInKbitsPerSecond) << " kbit/s";
} else if (bitrateInKbitsPerSecond < 1000000.0) {
res << (bitrateInKbitsPerSecond * 0.001) << " Mbit/s";
} else {
res << (bitrateInKbitsPerSecond * 0.000001) << " Gbit/s";
}
}
return res.str();
}
//! \cond
2015-08-16 23:44:38 +02:00
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
2015-07-18 01:11:12 +02:00
const char base64Pad = '=';
//! \endcond
2015-04-22 18:36:40 +02:00
/*!
2015-08-16 23:44:38 +02:00
* \brief Encodes the specified \a data to Base64.
2018-01-31 21:00:13 +01:00
* \sa [RFC 4648](http://www.ietf.org/rfc/rfc4648.txt)
2015-04-22 18:36:40 +02:00
*/
2019-03-13 19:00:37 +01:00
string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
2015-04-22 18:36:40 +02:00
{
2021-03-19 23:09:01 +01:00
auto encoded = std::string();
auto mod = static_cast<std::uint8_t>(dataSize % 3);
auto temp = std::uint32_t();
2015-08-16 23:44:38 +02:00
encoded.reserve(((dataSize / 3) + (mod > 0)) * 4);
2019-03-13 19:00:37 +01:00
for (const std::uint8_t *end = --data + dataSize - mod; data != end;) {
2021-03-19 23:09:01 +01:00
temp = static_cast<std::uint32_t>(*++data << 16);
temp |= static_cast<std::uint32_t>(*++data << 8);
2015-08-16 23:44:38 +02:00
temp |= *++data;
encoded.push_back(base64Chars[(temp & 0x00FC0000) >> 18]);
encoded.push_back(base64Chars[(temp & 0x0003F000) >> 12]);
2017-05-01 03:13:11 +02:00
encoded.push_back(base64Chars[(temp & 0x00000FC0) >> 6]);
encoded.push_back(base64Chars[(temp & 0x0000003F)]);
2015-04-22 18:36:40 +02:00
}
2017-05-01 03:13:11 +02:00
switch (mod) {
2015-04-22 18:36:40 +02:00
case 1:
2021-03-19 23:09:01 +01:00
temp = static_cast<std::uint32_t>(*++data << 16);
2015-08-16 23:44:38 +02:00
encoded.push_back(base64Chars[(temp & 0x00FC0000) >> 18]);
encoded.push_back(base64Chars[(temp & 0x0003F000) >> 12]);
encoded.push_back(base64Pad);
encoded.push_back(base64Pad);
2015-04-22 18:36:40 +02:00
break;
case 2:
2021-03-19 23:09:01 +01:00
temp = static_cast<std::uint32_t>(*++data << 16);
temp |= static_cast<std::uint32_t>(*++data << 8);
2015-08-16 23:44:38 +02:00
encoded.push_back(base64Chars[(temp & 0x00FC0000) >> 18]);
encoded.push_back(base64Chars[(temp & 0x0003F000) >> 12]);
2017-05-01 03:13:11 +02:00
encoded.push_back(base64Chars[(temp & 0x00000FC0) >> 6]);
2015-08-16 23:44:38 +02:00
encoded.push_back(base64Pad);
2015-04-22 18:36:40 +02:00
break;
}
return encoded;
}
/*!
2015-08-16 23:44:38 +02:00
* \brief Decodes the specified Base64 encoded string.
* \throw Throws a ConversionException if the specified string is no valid Base64.
2018-01-31 21:00:13 +01:00
* \sa [RFC 4648](http://www.ietf.org/rfc/rfc4648.txt)
2015-04-22 18:36:40 +02:00
*/
2019-03-13 19:00:37 +01:00
pair<unique_ptr<std::uint8_t[]>, std::uint32_t> decodeBase64(const char *encodedStr, const std::uint32_t strSize)
2015-04-22 18:36:40 +02:00
{
2017-05-01 03:13:11 +02:00
if (strSize % 4) {
2015-08-16 23:44:38 +02:00
throw ConversionException("invalid size of base64");
2015-04-22 18:36:40 +02:00
}
2019-03-13 19:00:37 +01:00
std::uint32_t decodedSize = (strSize / 4) * 3;
2015-08-16 23:44:38 +02:00
const char *const end = encodedStr + strSize;
2017-05-01 03:13:11 +02:00
if (strSize) {
if (*(end - 1) == base64Pad) {
2015-08-16 23:44:38 +02:00
--decodedSize;
}
2017-05-01 03:13:11 +02:00
if (*(end - 2) == base64Pad) {
2015-08-16 23:44:38 +02:00
--decodedSize;
}
2015-04-22 18:36:40 +02:00
}
2019-03-13 19:00:37 +01:00
auto buffer = make_unique<std::uint8_t[]>(decodedSize);
2015-08-16 23:44:38 +02:00
auto *iter = buffer.get() - 1;
2017-05-01 03:13:11 +02:00
while (encodedStr < end) {
2019-03-13 19:00:37 +01:00
std::int32_t temp = 0;
for (std::uint8_t quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
2015-04-22 18:36:40 +02:00
temp <<= 6;
2017-05-01 03:13:11 +02:00
if (*encodedStr >= 'A' && *encodedStr <= 'Z') {
2015-08-16 23:44:38 +02:00
temp |= *encodedStr - 'A';
2017-05-01 03:13:11 +02:00
} else if (*encodedStr >= 'a' && *encodedStr <= 'z') {
2015-08-16 23:44:38 +02:00
temp |= *encodedStr - 'a' + 26;
2017-05-01 03:13:11 +02:00
} else if (*encodedStr >= '0' && *encodedStr <= '9') {
2015-08-16 23:44:38 +02:00
temp |= *encodedStr - '0' + 2 * 26;
2017-05-01 03:13:11 +02:00
} else if (*encodedStr == '+') {
2015-08-16 23:44:38 +02:00
temp |= 2 * 26 + 10;
2017-05-01 03:13:11 +02:00
} else if (*encodedStr == '/') {
2015-08-16 23:44:38 +02:00
temp |= 2 * 26 + 10 + 1;
2017-05-01 03:13:11 +02:00
} else if (*encodedStr == base64Pad) {
switch (end - encodedStr) {
2015-04-22 18:36:40 +02:00
case 1:
2021-03-19 23:09:01 +01:00
*++iter = static_cast<std::uint8_t>((temp >> 16) & 0xFF);
*++iter = static_cast<std::uint8_t>((temp >> 8) & 0xFF);
2015-08-16 23:44:38 +02:00
return make_pair(move(buffer), decodedSize);
2015-04-22 18:36:40 +02:00
case 2:
2021-03-19 23:09:01 +01:00
*++iter = static_cast<std::uint8_t>((temp >> 10) & 0xFF);
2015-08-16 23:44:38 +02:00
return make_pair(move(buffer), decodedSize);
2015-04-22 18:36:40 +02:00
default:
2015-08-16 23:44:38 +02:00
throw ConversionException("invalid padding in base64");
2015-04-22 18:36:40 +02:00
}
} else {
2015-08-16 23:44:38 +02:00
throw ConversionException("invalid character in base64");
2015-04-22 18:36:40 +02:00
}
}
2021-03-19 23:09:01 +01:00
*++iter = static_cast<std::uint8_t>((temp >> 16) & 0xFF);
*++iter = static_cast<std::uint8_t>((temp >> 8) & 0xFF);
*++iter = static_cast<std::uint8_t>(temp & 0xFF);
2015-04-22 18:36:40 +02:00
}
2015-08-16 23:44:38 +02:00
return make_pair(move(buffer), decodedSize);
2015-04-22 18:36:40 +02:00
}
} // namespace CppUtilities