Add functions for character set conversions

This commit is contained in:
Martchus 2016-07-27 18:24:37 +02:00
parent 323ad615cb
commit 0c40a510e5
5 changed files with 304 additions and 13 deletions

View File

@ -7,6 +7,7 @@ The library utilizes:
* dealing with dates and times
* conversion of primitive data types to byte-buffers and vice versa (little-endian and big-endian)
* common string conversions/operations, eg.
- character set conversion via iconv
- split, join, find and replace
- conversion from number to string and vice versa
- encoding/decoding base-64
@ -20,14 +21,19 @@ The library utilizes:
## Build instructions
### Requirements
#### Build-only dependencies
* C++ compiler supporting C++11 (I've tested GNU g++, Clang and mingw-w64 yet.)
* CMake (I've only tested 3.5.1 so far.)
* C++ compiler supporting C++11, tested with
- GNU g++
- mingw-w64
- Clang
* CMake, tested 3.5.1 and 3.6.0
* cppunit for unit tests (optional)
* Doxygen for API documentation (optional)
* Graphviz for diagrams in the API documentation (optional)
#### Runtime dependencies
* The c++utilities library itself only needs the C/C++ standard library.
* The c++utilities library itself only needs
- the C/C++ standard library
- libiconv (might be part of glibc or provided as extra library)
* For dependencies of my other projects check the README.md of these projects.
### How to build

View File

@ -4,12 +4,165 @@
#include <sstream>
#include <iomanip>
#include <cstdlib>
#include <iconv.h>
#include <errno.h>
using namespace std;
namespace ConversionUtilities
{
/// \cond
// Output size hint: the output is expected to need as many bytes as the input.
struct Keep {
    size_t operator()(size_t value)
    {
        return value;
    }
};
// Output size hint: the output is expected to need twice as many bytes as the input.
struct Double {
    size_t operator()(size_t value)
    {
        return 2 * value;
    }
};
// Output size hint: the output is expected to need half as many bytes as the input (rounded down).
struct Half {
    size_t operator()(size_t value)
    {
        return value >> 1;
    }
};
// Output size hint: the output is expected to need "input size * factor" bytes.
struct Factor {
    // Intentionally not explicit: callers pass a plain floating point factor where a Factor is expected.
    Factor(float factor) : factor(factor) {}
    // Returns value scaled by the factor, clamped to the range of size_t.
    size_t operator()(size_t value)
    {
        const float scaled = static_cast<float>(value) * factor;
        // converting a negative or NaN float to an unsigned integer is undefined behavior
        if(!(scaled > 0.0f)) {
            return 0;
        }
        // the same applies to values which are too big to be represented
        const size_t maxSize = static_cast<size_t>(-1);
        if(scaled >= static_cast<float>(maxSize)) {
            return maxSize;
        }
        return static_cast<size_t>(scaled);
    }
    float factor;
};
/*!
 * \brief The ConversionDescriptor class owns an iconv conversion descriptor and uses it to convert strings.
 * \tparam OutputSizeHint Functor which receives a number of input bytes and returns the number of output bytes
 *         expected for them. It is only used to size the output buffer; a bad estimate costs reallocations
 *         but does not affect the result.
 * \remarks NOTE(review): an instance carries conversion state and is not synchronized here, so concurrent
 *          use of the same instance presumably requires external locking - confirm against callers.
 */
template<class OutputSizeHint>
class ConversionDescriptor
{
public:
    /*!
     * \brief Opens a descriptor for the conversion from \a fromCharset to \a toCharset using a
     *        default-constructed size hint.
     * \throws Throws ConversionException if the descriptor can not be allocated.
     */
    ConversionDescriptor(const char *fromCharset, const char *toCharset) :
        ConversionDescriptor(fromCharset, toCharset, OutputSizeHint())
    {}

    /*!
     * \brief Opens a descriptor for the conversion from \a fromCharset to \a toCharset using the
     *        specified \a outputSizeHint.
     * \throws Throws ConversionException if the descriptor can not be allocated.
     */
    ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint) :
        m_ptr(iconv_open(toCharset, fromCharset)),
        m_outputSizeHint(outputSizeHint)
    {
        if(m_ptr == reinterpret_cast<iconv_t>(-1)) {
            throw ConversionException("Unable to allocate descriptor for character set conversion.");
        }
    }

    // the descriptor is closed exactly once by the destructor, hence copying must not be possible
    ConversionDescriptor(const ConversionDescriptor &) = delete;
    ConversionDescriptor &operator=(const ConversionDescriptor &) = delete;

    ~ConversionDescriptor()
    {
        iconv_close(m_ptr);
    }

public:
    /*!
     * \brief Converts \a inputBufferSize bytes starting at \a inputBuffer.
     * \returns Returns the converted data (allocated with malloc()/realloc()) and its size in bytes.
     * \throws Throws ConversionException if the input contains an invalid multibyte sequence or the
     *         output buffer can not be allocated.
     * \remarks An incomplete multibyte sequence at the end of the input is ignored.
     */
    StringData convertString(const char *inputBuffer, size_t inputBufferSize)
    {
        // a previous conversion might have been aborted; start from the initial shift state
        iconv(m_ptr, nullptr, nullptr, nullptr, nullptr);

        // setup input and output buffer
        size_t inputBytesLeft = inputBufferSize;
        size_t outputSize = m_outputSizeHint(inputBufferSize);
        unique_ptr<char[], StringDataDeleter> output(static_cast<char *>(malloc(outputSize)));
        if(!output && outputSize) {
            // malloc(0) is allowed to return a null pointer, so only non-empty requests count as failure
            throw ConversionException("Unable to allocate output buffer for character set conversion.");
        }

        // lower bound for buffer growth so the loop makes progress even if the size hint returns zero
        const size_t minGrowth = 8;
        size_t bytesWritten = 0;
        // after the input has been consumed, iconv is called once more without input to let it append
        // the sequence which returns the output to its initial shift state (no-op for stateless encodings)
        bool flushing = false;
        for(;;) {
            char *outputPos = output.get() + bytesWritten;
            size_t outputBytesLeft = outputSize - bytesWritten;
            const size_t result = flushing
                ? iconv(m_ptr, nullptr, nullptr, &outputPos, &outputBytesLeft)
                : iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &outputPos, &outputBytesLeft);
            const int conversionError = errno;
            bytesWritten = static_cast<size_t>(outputPos - output.get());

            if(result != static_cast<size_t>(-1)) {
                if(flushing) {
                    // conversion completed without (further) errors
                    break;
                }
                flushing = true;
            } else if(conversionError == E2BIG) {
                // output buffer has no more room for next converted character
                size_t growth = m_outputSizeHint(inputBytesLeft);
                if(growth < minGrowth) {
                    growth = minGrowth;
                }
                char *const grown = static_cast<char *>(realloc(output.get(), outputSize + growth));
                if(!grown) {
                    // the old buffer is still owned by output and freed during unwinding
                    throw ConversionException("Unable to allocate output buffer for character set conversion.");
                }
                // realloc has taken over the old buffer, so it must not be freed again
                static_cast<void>(output.release());
                output.reset(grown);
                outputSize += growth;
            } else if(conversionError == EINVAL && !flushing) {
                // ignore incomplete multibyte sequence in the input
                flushing = true;
            } else /*if(conversionError == EILSEQ)*/ {
                // invalid multibyte sequence in the input
                throw ConversionException("Invalid multibyte sequence in the input.");
            }
        }
        return StringData(std::move(output), bytesWritten);
    }

private:
    iconv_t m_ptr;
    OutputSizeHint m_outputSizeHint;
};
/// \endcond
/*!
 * \brief Converts the specified string from one character set to another.
 * \remarks
 * - The term "size" always refers to the actual number of bytes and not to the number of characters
 *   (eg. the size of the UTF-8 string "ö" is 2 and not 1).
 * - The expected size of the output buffer relative to the input can be specified via
 *   \a outputBufferSizeFactor. A fitting hint reduces buffer reallocations during the conversion
 *   (eg. 2 for the conversion from Latin-1 to UTF-16, 0.5 for the conversion from UTF-16 to Latin-1).
 * \throws Throws ConversionException if the conversion descriptor can not be allocated or the input
 *         contains an invalid multibyte sequence.
 */
StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
{
    // the descriptor only lives for the duration of this call
    ConversionDescriptor<Factor> oneShotDescriptor(fromCharset, toCharset, outputBufferSizeFactor);
    return oneShotDescriptor.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified UTF-8 string to UTF-16 (little-endian).
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Double> utf8ToUtf16LE("UTF-8", "UTF-16LE");
    return utf8ToUtf16LE.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified UTF-16 (little-endian) string to UTF-8.
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Half> utf16LEToUtf8("UTF-16LE", "UTF-8");
    return utf16LEToUtf8.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified UTF-8 string to UTF-16 (big-endian).
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Double> utf8ToUtf16BE("UTF-8", "UTF-16BE");
    return utf8ToUtf16BE.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified UTF-16 (big-endian) string to UTF-8.
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Half> utf16BEToUtf8("UTF-16BE", "UTF-8");
    return utf16BEToUtf8.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified Latin-1 string to UTF-8.
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Keep> latin1ToUtf8("ISO-8859-1", "UTF-8");
    return latin1ToUtf8.convertString(inputBuffer, inputBufferSize);
}
/*!
 * \brief Converts the specified UTF-8 string to Latin-1.
 * \param inputBuffer Specifies the bytes to be converted.
 * \param inputBufferSize Specifies the number of bytes in \a inputBuffer.
 * \returns Returns the converted data and its size in bytes.
 */
StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
{
    // created on the first call and shared by all subsequent calls
    static ConversionDescriptor<Factor> utf8ToLatin1("UTF-8", "ISO-8859-1", 1.1);
    return utf8ToLatin1.convertString(inputBuffer, inputBufferSize);
}
/*!
* \brief Truncates all characters after the first occurrence of the
* specified \a terminationChar and the termination character as well.
@ -87,13 +240,15 @@ string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes
return res.str();
}
//! \cond
// The 64 characters of the Base64 alphabet: upper case letters, lower case letters, digits, '+' and '/'.
// NOTE(review): presumably indexed by 6-bit value when encoding - confirm against encodeBase64().
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// The Base64 padding character.
const char base64Pad = '=';
//! \endcond
/*!
* \brief Encodes the specified \a data to Base64.
*/
LIB_EXPORT string encodeBase64(const byte *data, uint32 dataSize)
string encodeBase64(const byte *data, uint32 dataSize)
{
string encoded;
byte mod = dataSize % 3;
@ -132,7 +287,7 @@ LIB_EXPORT string encodeBase64(const byte *data, uint32 dataSize)
* \brief Decodes the specified Base64 encoded string.
* \throw Throws a ConversionException if the specified string is no valid Base64.
*/
LIB_EXPORT pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
{
if(strSize % 4) {
throw ConversionException("invalid size of base64");
@ -187,4 +342,3 @@ LIB_EXPORT pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr,
}
}

View File

@ -12,11 +12,38 @@
#include <vector>
#include <memory>
#include <bits/codecvt.h>
//#include <bits/codecvt.h>
namespace ConversionUtilities
{
/*!
 * \brief The StringDataDeleter struct releases the buffer held by a StringData instance.
 */
struct LIB_EXPORT StringDataDeleter {
    /*!
     * \brief Releases \a stringData using std::free().
     * \remarks std::free() is used because the buffer is obtained from std::malloc()/std::realloc().
     */
    void operator()(char *stringData)
    {
        std::free(stringData);
    }
};
/*!
 * \brief Type used to return the result of a string encoding conversion.
 *
 * The first element owns the converted data, the second element is a std::size_t stored alongside it.
 */
using StringData = std::pair<std::unique_ptr<char[], StringDataDeleter>, std::size_t>;
// Character set conversions. Sizes are passed as std::size_t; the result is returned as StringData.
// The first function takes the character set names, the others are fixed to the encodings in their names.
LIB_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor = 1.0f);
LIB_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize);
LIB_EXPORT void truncateString(std::string &str, char terminationChar = '\0');
/*!
@ -147,7 +174,7 @@ template <typename StringType> LIB_EXPORT void findAndReplace(StringType &str, c
}
/*!
* \brief Converts the given \a number to its equivalent std::string representation using the specified \a base.
* \brief Converts the given \a number to its equivalent string representation using the specified \a base.
* \tparam NumberType The data type of the given number.
* \tparam StringType The string type (should be an instantiation of the basic_string class template).
* \sa stringToNumber()
@ -160,7 +187,7 @@ template <typename NumberType, typename StringType = std::string> LIB_EXPORT Str
}
/*!
* \brief Converts the given \a string to a numeric value using the specified \a base.
* \brief Converts the given \a string to a number assuming \a string uses the specified \a base.
* \tparam NumberType The data type used to store the converted value.
* \tparam StringType The string type (should be an instantiation of the basic_string class template).
* \throws A ConversionException will be thrown if the provided string is not a valid number.
@ -179,7 +206,7 @@ template <typename NumberType, typename StringType> LIB_EXPORT NumberType string
}
/*!
* \brief Converts the given \a string to a numeric value using the specified \a base.
* \brief Converts the given null-terminated \a string to a numeric value using the specified \a base.
* \tparam NumberType The data type used to store the converted value.
* \tparam StringType The string type (should be an instantiation of the basic_string class template).
* \throws A ConversionException will be thrown if the provided string is not a valid number.
@ -200,9 +227,9 @@ template <typename NumberType, typename CharType> LIB_EXPORT NumberType stringTo
/*!
* \brief Interprets the given \a integer at the specified position as std::string using the specified byte order.
*
* Example: Interpretation of ID3v2 frame IDs (stored as 32-bit integer) as string
* Example: interpretation of ID3v2 frame IDs (stored as 32-bit integer) as string
* - 0x54495432/1414091826 will be interpreted as "TIT2".
* - 0x00545432/5526578 will be interpreted as "TT2" using start offset 1 to "exclude" the first byte.
* - 0x00545432/5526578 will be interpreted as "TT2" using start offset 1 to omit the first byte.
*
* \tparam T The data type of the integer to be interpreted.
*/

View File

@ -1,5 +1,6 @@
#include "../conversion/binaryconversion.h"
#include "../conversion/stringconversion.h"
#include "../tests/testutils.h"
#include <cppunit/extensions/HelperMacros.h>
#include <cppunit/TestFixture.h>
@ -11,6 +12,7 @@
using namespace std;
using namespace ConversionUtilities;
using namespace TestUtilities;
using namespace CPPUNIT_NS;
@ -23,6 +25,7 @@ class ConversionTests : public TestFixture
CPPUNIT_TEST(testEndianness);
CPPUNIT_TEST(testBinaryConversions);
CPPUNIT_TEST(testSwapOrderFunctions);
CPPUNIT_TEST(testStringEncodingConversions);
CPPUNIT_TEST(testStringConversions);
CPPUNIT_TEST_SUITE_END();
@ -35,6 +38,7 @@ public:
void testEndianness();
void testBinaryConversions();
void testSwapOrderFunctions();
void testStringEncodingConversions();
void testStringConversions();
private:
@ -157,7 +161,70 @@ void ConversionTests::testSwapOrderFunctions()
}
/*!
* \brief Tests most important string conversions.
* \brief Internally used for string encoding tests to check results.
*/
void assertEqual(const char *message, const byte *expectedValues, size_t expectedSize, const pair<unique_ptr<char[], StringDataDeleter>, size_t> &actualValues)
{
// check whether number of elements matches
CPPUNIT_ASSERT_EQUAL_MESSAGE(message, expectedSize, actualValues.second);
// check whether contents match
auto *end = expectedValues + expectedSize;
auto *i = reinterpret_cast<byte *>(actualValues.first.get());
for(; expectedValues != end; ++expectedValues, ++i) {
CPPUNIT_ASSERT_EQUAL_MESSAGE(message, asHexNumber(*expectedValues), asHexNumber(*i));
}
}
// Both macros paste "LE"/"BE" and "String" onto the given name to pick one of the 16-bit test arrays.
// Those arrays hold 16-bit values, so the byte sequence they have in memory depends on the byte order
// of the host; hence the selection is swapped on big-endian hosts.
// NOTE(review): if neither condition holds, the macros stay undefined (there is no #else/#error).
#if CONVERSION_UTILITIES_IS_BYTE_ORDER_LITTLE_ENDIAN == true
// little-endian host: the "LE" array is used for little-endian checks and the "BE" array for big-endian checks
# define LE_STR_FOR_ENDIANNESS(name) name ## LE ## String
# define BE_STR_FOR_ENDIANNESS(name) name ## BE ## String
#elif CONVERSION_UTILITIES_IS_BYTE_ORDER_BIG_ENDIAN == true
// big-endian host: the selection is the other way around
# define LE_STR_FOR_ENDIANNESS(name) name ## BE ## String
# define BE_STR_FOR_ENDIANNESS(name) name ## LE ## String
#endif
/*!
* \def LE_STR_FOR_ENDIANNESS
* \brief Selects right string for little-endian checks.
*/
/*!
* \def BE_STR_FOR_ENDIANNESS
* \brief Selects right string for big-endian checks.
*/
/*!
 * \brief Tests string encoding conversions.
 *
 * Converts the strings "ABCD" and "ABÖCD" between Latin-1, UTF-8, UTF-16LE and UTF-16BE and
 * compares the results byte by byte with the expected encodings.
 */
void ConversionTests::testStringEncodingConversions()
{
    // define test string "ABCD" for the different encodings
    const byte simpleString[] = {'A', 'B', 'C', 'D'};
    const uint16 simpleUtf16LEString[] = {0x0041, 0x0042, 0x0043, 0x0044};
    const uint16 simpleUtf16BEString[] = {0x4100, 0x4200, 0x4300, 0x4400};
    // define test string "ABÖCD" for the different encodings
    const byte latin1String[] = {'A', 'B', 0xD6, 'C', 'D'};
    const byte utf8String[] = {'A', 'B', 0xC3, 0x96, 'C', 'D'};
    const uint16 utf16LEString[] = {0x0041, 0x0042, 0x00D6, 0x0043, 0x0044};
    const uint16 utf16BEString[] = {0x4100, 0x4200, 0xD600, 0x4300, 0x4400};

    // pick the arrays which hold little-endian/big-endian data in memory on this host;
    // binding references keeps the array types so sizeof yields the size in bytes
    const auto &simpleLE = LE_STR_FOR_ENDIANNESS(simpleUtf16);
    const auto &simpleBE = BE_STR_FOR_ENDIANNESS(simpleUtf16);
    const auto &umlautLE = LE_STR_FOR_ENDIANNESS(utf16);
    const auto &umlautBE = BE_STR_FOR_ENDIANNESS(utf16);

    // test conversion to UTF-8
    assertEqual("Latin-1 to UTF-8 (simple)", simpleString, sizeof(simpleString), convertLatin1ToUtf8(reinterpret_cast<const char *>(simpleString), sizeof(simpleString)));
    assertEqual("Latin-1 to UTF-8", utf8String, sizeof(utf8String), convertLatin1ToUtf8(reinterpret_cast<const char *>(latin1String), sizeof(latin1String)));
    assertEqual("UTF-16LE to UTF-8 (simple)", simpleString, sizeof(simpleString), convertUtf16LEToUtf8(reinterpret_cast<const char *>(simpleLE), sizeof(simpleLE)));
    assertEqual("UTF-16LE to UTF-8", utf8String, sizeof(utf8String), convertUtf16LEToUtf8(reinterpret_cast<const char *>(umlautLE), sizeof(umlautLE)));
    assertEqual("UTF-16BE to UTF-8 (simple)", simpleString, sizeof(simpleString), convertUtf16BEToUtf8(reinterpret_cast<const char *>(simpleBE), sizeof(simpleBE)));
    assertEqual("UTF-16BE to UTF-8", utf8String, sizeof(utf8String), convertUtf16BEToUtf8(reinterpret_cast<const char *>(umlautBE), sizeof(umlautBE)));

    // test conversion from UTF-8
    assertEqual("UTF-8 to Latin-1 (simple)", simpleString, sizeof(simpleString), convertUtf8ToLatin1(reinterpret_cast<const char *>(simpleString), sizeof(simpleString)));
    assertEqual("UTF-8 to Latin-1", latin1String, sizeof(latin1String), convertUtf8ToLatin1(reinterpret_cast<const char *>(utf8String), sizeof(utf8String)));
    assertEqual("UTF-8 to UTF-16LE (simple)", reinterpret_cast<const byte *>(simpleLE), sizeof(simpleLE), convertUtf8ToUtf16LE(reinterpret_cast<const char *>(simpleString), sizeof(simpleString)));
    assertEqual("UTF-8 to UTF-16LE", reinterpret_cast<const byte *>(umlautLE), sizeof(umlautLE), convertUtf8ToUtf16LE(reinterpret_cast<const char *>(utf8String), sizeof(utf8String)));
    assertEqual("UTF-8 to UTF-16BE (simple)", reinterpret_cast<const byte *>(simpleBE), sizeof(simpleBE), convertUtf8ToUtf16BE(reinterpret_cast<const char *>(simpleString), sizeof(simpleString)));
    assertEqual("UTF-8 to UTF-16BE", reinterpret_cast<const byte *>(umlautBE), sizeof(umlautBE), convertUtf8ToUtf16BE(reinterpret_cast<const char *>(utf8String), sizeof(utf8String)));
}
/*!
* \brief Tests miscellaneous string conversions.
*/
void ConversionTests::testStringConversions()
{

View File

@ -4,6 +4,7 @@
#include "../application/argumentparser.h"
#include <string>
#include <ostream>
namespace TestUtilities {
@ -91,6 +92,42 @@ inline LIB_EXPORT std::string workingCopyPath(const std::string &name)
}
#endif
/*!
 * \brief The AsHexNumber class allows printing values asserted with cppunit (or similar test framework) using the
 *        hex system in the error case.
 * \remarks The wrapped value is copied so an instance stays valid even if it has been created from a temporary.
 */
template <typename T> class AsHexNumber
{
public:
    /// \brief Constructs a new instance; use asHexNumber() for convenience instead.
    AsHexNumber(const T &value) : value(value) {}
    /// \brief The wrapped value.
    const T value;
};

/*!
 * \brief Provides operator == required by CPPUNIT_ASSERT_EQUAL.
 * \returns Returns whether the wrapped values of \a lhs and \a rhs are equal.
 */
template <typename T> bool operator==(const AsHexNumber<T> &lhs, const AsHexNumber<T> &rhs)
{
    return lhs.value == rhs.value;
}

/*!
 * \brief Provides the actual formatting of the output for AsHexNumber class.
 *
 * Writes "0x" followed by the wrapped value as hex number. The format flags \a out had before
 * the call are restored afterwards.
 */
template <typename T> std::ostream &operator<< (std::ostream &out, const AsHexNumber<T> &value)
{
    const std::ios_base::fmtflags previousFlags = out.flags();
    out << std::hex << '0' << 'x';
    // types wider than unsigned would be truncated by a conversion to unsigned
    if(sizeof(T) > sizeof(unsigned)) {
        out << static_cast<unsigned long long>(value.value);
    } else {
        out << static_cast<unsigned>(value.value);
    }
    out.flags(previousFlags);
    return out;
}

/*!
 * \brief Wraps the value to be printed using the hex system.
 */
template <typename T> AsHexNumber<T> asHexNumber(const T &value)
{
    return AsHexNumber<T>(value);
}
}
#endif // TESTUTILS_H