diff --git a/README.md b/README.md index 51047f1..03d6890 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ The library utilizes: * dealing with dates and times * conversion of primitive data types to byte-buffers and vice versa (little-endian and big-endian) * common string conversions/operations, e.g. + - character set conversion via iconv - split, join, find and replace - conversion from number to string and vice versa - encoding/decoding base-64 @@ -20,14 +21,19 @@ The library utilizes: ## Build instructions ### Requirements #### Build-only dependencies -* C++ compiler supporting C++11 (I've tested GNU g++, Clang and mingw-w64 yet.) -* CMake (I've only tested 3.5.1 so far.) +* C++ compiler supporting C++11, tested with + - GNU g++ + - mingw-w64 + - Clang +* CMake, tested with 3.5.1 and 3.6.0 * cppunit for unit tests (optional) * Doxygen for API documentation (optional) * Graphviz for diagrams in the API documentation (optional) #### Runtime dependencies -* The c++utilities library itself only needs the C/C++ standard library. +* The c++utilities library itself only needs + - the C/C++ standard library + - libiconv (might be part of glibc or provided as extra library) * For dependencies of my other projects check the README.md of these projects. 
### How to build diff --git a/conversion/stringconversion.cpp b/conversion/stringconversion.cpp index 572e04f..587d3c1 100644 --- a/conversion/stringconversion.cpp +++ b/conversion/stringconversion.cpp @@ -4,12 +4,165 @@ #include #include +#include + +#include +#include using namespace std; namespace ConversionUtilities { +/// \cond + +struct Keep { size_t operator()(size_t value) { return value; } }; +struct Double { size_t operator()(size_t value) { return value + value; } }; +struct Half { size_t operator()(size_t value) { return value / 2; } }; +struct Factor { + Factor(float factor) : factor(factor) {}; + size_t operator()(size_t value) { return value * factor; } + float factor; +}; + +template +class ConversionDescriptor +{ +public: + ConversionDescriptor(const char *fromCharset, const char *toCharset) : + m_ptr(iconv_open(toCharset, fromCharset)), + m_outputSizeHint(OutputSizeHint()) + { + if(m_ptr == reinterpret_cast(-1)) { + throw ConversionException("Unable to allocate descriptor for character set conversion."); + } + } + + ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint) : + m_ptr(iconv_open(toCharset, fromCharset)), + m_outputSizeHint(outputSizeHint) + { + if(m_ptr == reinterpret_cast(-1)) { + throw ConversionException("Unable to allocate descriptor for character set conversion."); + } + } + + ~ConversionDescriptor() + { + iconv_close(m_ptr); + } + +public: + StringData convertString(const char *inputBuffer, size_t inputBufferSize) + { + // setup input and output buffer + size_t inputBytesLeft = inputBufferSize; + size_t outputSize = m_outputSizeHint(inputBufferSize); + size_t outputBytesLeft = outputSize; + char *outputBuffer = reinterpret_cast(malloc(outputSize)); + size_t bytesWritten; + + char *currentOutputOffset = outputBuffer; + for(; ; currentOutputOffset = outputBuffer + bytesWritten) { + bytesWritten = iconv(m_ptr, const_cast(&inputBuffer), &inputBytesLeft, ¤tOutputOffset, &outputBytesLeft); + 
if(bytesWritten == static_cast(-1)) { + if(errno == EINVAL) { + // ignore incomplete multibyte sequence in the input + bytesWritten = currentOutputOffset - outputBuffer; + break; + } else if(errno == E2BIG) { + // output buffer has no more room for next converted character + bytesWritten = currentOutputOffset - outputBuffer; + outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten; + outputBuffer = reinterpret_cast(realloc(outputBuffer, outputSize)); + } else /*if(errno == EILSEQ)*/ { + // invalid multibyte sequence in the input + free(outputBuffer); + throw ConversionException("Invalid multibyte sequence in the input."); + } + } else { + // conversion completed without (further) errors + break; + } + } + return make_pair(unique_ptr(outputBuffer), currentOutputOffset - outputBuffer); + } + +private: + iconv_t m_ptr; + OutputSizeHint m_outputSizeHint; +}; + +/// \endcond + +/*! + * \brief Converts the specified string from one character set to another. + * \remarks + * - The term "size" refers here always to the actual number of bytes and not to the number of characters + * (e.g. the size of the UTF-8 string "ö" is 2 and not 1). + * - The expected size of the output buffer can be specified via \a outputBufferSizeFactor. This hint helps + * to reduce buffer reallocations during the conversion (e.g. for the conversion from Latin-1 to UTF-16 + * the factor would be 2, for the conversion from UTF-16 to Latin-1 the factor would be 0.5). + */ +StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor) +{ + return ConversionDescriptor(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified UTF-8 string to UTF-16 (little-endian). 
+ */ +StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("UTF-8", "UTF-16LE"); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified UTF-16 (little-endian) string to UTF-8. + */ +StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("UTF-16LE", "UTF-8"); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified UTF-8 string to UTF-16 (big-endian). + */ +StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("UTF-8", "UTF-16BE"); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified UTF-16 (big-endian) string to UTF-8. + */ +StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("UTF-16BE", "UTF-8"); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified Latin-1 string to UTF-8. + */ +StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("ISO-8859-1", "UTF-8"); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + +/*! + * \brief Converts the specified UTF-8 string to Latin-1. + */ +StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize) +{ + static ConversionDescriptor descriptor("UTF-8", "ISO-8859-1", 1.1); + return descriptor.convertString(inputBuffer, inputBufferSize); +} + /*! * \brief Truncates all characters after the first occurrence of the * specified \a terminationChar and the termination character as well. 
@@ -87,13 +240,15 @@ string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes return res.str(); } +//! \cond const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; const char base64Pad = '='; +//! \endcond /*! * \brief Encodes the specified \a data to Base64. */ -LIB_EXPORT string encodeBase64(const byte *data, uint32 dataSize) +string encodeBase64(const byte *data, uint32 dataSize) { string encoded; byte mod = dataSize % 3; @@ -132,7 +287,7 @@ LIB_EXPORT string encodeBase64(const byte *data, uint32 dataSize) * \brief Decodes the specified Base64 encoded string. * \throw Throws a ConversionException if the specified string is no valid Base64. */ -LIB_EXPORT pair, uint32> decodeBase64(const char *encodedStr, const uint32 strSize) +pair, uint32> decodeBase64(const char *encodedStr, const uint32 strSize) { if(strSize % 4) { throw ConversionException("invalid size of base64"); @@ -187,4 +342,3 @@ LIB_EXPORT pair, uint32> decodeBase64(const char *encodedStr, } } - diff --git a/conversion/stringconversion.h b/conversion/stringconversion.h index 2667a15..5c45842 100644 --- a/conversion/stringconversion.h +++ b/conversion/stringconversion.h @@ -12,11 +12,38 @@ #include #include -#include +//#include namespace ConversionUtilities { +/*! + * \brief The StringDataDeleter struct deletes the data of a StringData instance. + */ +struct LIB_EXPORT StringDataDeleter { + /*! + * \brief Deletes the specified \a stringData with std::free(), because the memory has been + * allocated using std::malloc()/std::realloc(). + */ + void operator()(char *stringData) + { + std::free(stringData); + } +}; + +/*! + * \brief Type used to return string encoding conversion result. 
+ */ +typedef std::pair, std::size_t> StringData; + +LIB_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor = 1.0f); +LIB_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize); +LIB_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize); +LIB_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize); +LIB_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize); +LIB_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize); +LIB_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize); + LIB_EXPORT void truncateString(std::string &str, char terminationChar = '\0'); /*! @@ -147,7 +174,7 @@ template LIB_EXPORT void findAndReplace(StringType &str, c } /*! - * \brief Converts the given \a number to its equivalent std::string representation using the specified \a base. + * \brief Converts the given \a number to its equivalent string representation using the specified \a base. * \tparam NumberType The data type of the given number. * \tparam StringType The string type (should be an instantiation of the basic_string class template). * \sa stringToNumber() @@ -160,7 +187,7 @@ template LIB_EXPORT Str } /*! - * \brief Converts the given \a string to a numeric value using the specified \a base. + * \brief Converts the given \a string to a number assuming \a string uses the specified \a base. * \tparam NumberType The data type used to store the converted value. * \tparam StringType The string type (should be an instantiation of the basic_string class template). * \throws A ConversionException will be thrown if the provided string is not a valid number. @@ -179,7 +206,7 @@ template LIB_EXPORT NumberType string } /*! 
- * \brief Converts the given \a string to a numeric value using the specified \a base. + * \brief Converts the given null-terminated \a string to a numeric value using the specified \a base. * \tparam NumberType The data type used to store the converted value. * \tparam StringType The string type (should be an instantiation of the basic_string class template). * \throws A ConversionException will be thrown if the provided string is not a valid number. @@ -200,9 +227,9 @@ template LIB_EXPORT NumberType stringTo /*! * \brief Interprets the given \a integer at the specified position as std::string using the specified byte order. * - * Example: Interpretation of ID3v2 frame IDs (stored as 32-bit integer) as string + * Example: interpretation of ID3v2 frame IDs (stored as 32-bit integer) as string * - 0x54495432/1414091826 will be interpreted as "TIT2". - * - 0x00545432/5526578 will be interpreted as "TT2" using start offset 1 to "exclude" the first byte. + * - 0x00545432/5526578 will be interpreted as "TT2" using start offset 1 to omit the first byte. * * \tparam T The data type of the integer to be interpreted. 
*/ diff --git a/tests/conversiontests.cpp b/tests/conversiontests.cpp index 3cfe98c..77a6e31 100644 --- a/tests/conversiontests.cpp +++ b/tests/conversiontests.cpp @@ -1,5 +1,6 @@ #include "../conversion/binaryconversion.h" #include "../conversion/stringconversion.h" +#include "../tests/testutils.h" #include #include @@ -11,6 +12,7 @@ using namespace std; using namespace ConversionUtilities; +using namespace TestUtilities; using namespace CPPUNIT_NS; @@ -23,6 +25,7 @@ class ConversionTests : public TestFixture CPPUNIT_TEST(testEndianness); CPPUNIT_TEST(testBinaryConversions); CPPUNIT_TEST(testSwapOrderFunctions); + CPPUNIT_TEST(testStringEncodingConversions); CPPUNIT_TEST(testStringConversions); CPPUNIT_TEST_SUITE_END(); @@ -35,6 +38,7 @@ public: void testEndianness(); void testBinaryConversions(); void testSwapOrderFunctions(); + void testStringEncodingConversions(); void testStringConversions(); private: @@ -157,7 +161,70 @@ void ConversionTests::testSwapOrderFunctions() } /*! - * \brief Tests most important string conversions. + * \brief Internally used for string encoding tests to check results. 
+ */ +void assertEqual(const char *message, const byte *expectedValues, size_t expectedSize, const pair, size_t> &actualValues) +{ + // check whether number of elements matches + CPPUNIT_ASSERT_EQUAL_MESSAGE(message, expectedSize, actualValues.second); + // check whether contents match + auto *end = expectedValues + expectedSize; + auto *i = reinterpret_cast(actualValues.first.get()); + for(; expectedValues != end; ++expectedValues, ++i) { + CPPUNIT_ASSERT_EQUAL_MESSAGE(message, asHexNumber(*expectedValues), asHexNumber(*i)); + } +} + +#if CONVERSION_UTILITIES_IS_BYTE_ORDER_LITTLE_ENDIAN == true +# define LE_STR_FOR_ENDIANNESS(name) name ## LE ## String +# define BE_STR_FOR_ENDIANNESS(name) name ## BE ## String +#elif CONVERSION_UTILITIES_IS_BYTE_ORDER_BIG_ENDIAN == true +# define LE_STR_FOR_ENDIANNESS(name) name ## BE ## String +# define BE_STR_FOR_ENDIANNESS(name) name ## LE ## String +#endif + +/*! + * \def LE_STR_FOR_ENDIANNESS + * \brief Selects right string for little-endian checks. + */ + +/*! + * \def BE_STR_FOR_ENDIANNESS + * \brief Selects right string for big-endian checks. + */ + +/*! + * \brief Tests string encoding conversions. 
+ */ +void ConversionTests::testStringEncodingConversions() +{ + // define test string "ABCD" for the different encodings + const byte simpleString[] = {'A', 'B', 'C', 'D'}; + const uint16 simpleUtf16LEString[] = {0x0041, 0x0042, 0x0043, 0x0044}; + const uint16 simpleUtf16BEString[] = {0x4100, 0x4200, 0x4300, 0x4400}; + // define test string "ABÖCD" for the different encodings + const byte latin1String[] = {'A', 'B', 0xD6, 'C', 'D'}; + const byte utf8String[] = {'A', 'B', 0xC3, 0x96, 'C', 'D'}; + const uint16 utf16LEString[] = {0x0041, 0x0042, 0x00D6, 0x0043, 0x0044}; + const uint16 utf16BEString[] = {0x4100, 0x4200, 0xD600, 0x4300, 0x4400}; + // test conversion to UTF-8 + assertEqual("Latin-1 to UTF-8 (simple)", simpleString, 4, convertLatin1ToUtf8(reinterpret_cast(simpleString), 4)); + assertEqual("Latin-1 to UTF-8", utf8String, 6, convertLatin1ToUtf8(reinterpret_cast(latin1String), 5)); + assertEqual("UTF-16LE to UTF-8 (simple)", simpleString, 4, convertUtf16LEToUtf8(reinterpret_cast(LE_STR_FOR_ENDIANNESS(simpleUtf16)), 8)); + assertEqual("UTF-16LE to UTF-8", utf8String, 6, convertUtf16LEToUtf8(reinterpret_cast(LE_STR_FOR_ENDIANNESS(utf16)), 10)); + assertEqual("UTF-16BE to UTF-8 (simple)", simpleString, 4, convertUtf16BEToUtf8(reinterpret_cast(BE_STR_FOR_ENDIANNESS(simpleUtf16)), 8)); + assertEqual("UTF-16BE to UTF-8", utf8String, 6, convertUtf16BEToUtf8(reinterpret_cast(BE_STR_FOR_ENDIANNESS(utf16)), 10)); + // test conversion from UTF-8 + assertEqual("UTF-8 to Latin-1 (simple)", simpleString, 4, convertUtf8ToLatin1(reinterpret_cast(simpleString), 4)); + assertEqual("UTF-8 to Latin-1", latin1String, 5, convertUtf8ToLatin1(reinterpret_cast(utf8String), 6)); + assertEqual("UTF-8 to UFT-16LE (simple)", reinterpret_cast(LE_STR_FOR_ENDIANNESS(simpleUtf16)), 8, convertUtf8ToUtf16LE(reinterpret_cast(simpleString), 4)); + assertEqual("UTF-8 to UFT-16LE", reinterpret_cast(LE_STR_FOR_ENDIANNESS(utf16)), 10, convertUtf8ToUtf16LE(reinterpret_cast(utf8String), 6)); + 
assertEqual("UTF-8 to UFT-16BE (simple)", reinterpret_cast(BE_STR_FOR_ENDIANNESS(simpleUtf16)), 8, convertUtf8ToUtf16BE(reinterpret_cast(simpleString), 4)); + assertEqual("UTF-8 to UFT-16BE", reinterpret_cast(BE_STR_FOR_ENDIANNESS(utf16)), 10, convertUtf8ToUtf16BE(reinterpret_cast(utf8String), 6)); +} + +/*! + * \brief Tests miscellaneous string conversions. */ void ConversionTests::testStringConversions() { diff --git a/tests/testutils.h b/tests/testutils.h index fa0da66..3e0dd9b 100644 --- a/tests/testutils.h +++ b/tests/testutils.h @@ -4,6 +4,7 @@ #include "../application/argumentparser.h" #include +#include namespace TestUtilities { @@ -91,6 +92,42 @@ inline LIB_EXPORT std::string workingCopyPath(const std::string &name) } #endif +/*! + * \brief The AsHexNumber class allows printing values asserted with cppunit (or similar test framework) using the + * hex system in the error case. + */ +template class AsHexNumber +{ +public: + /// \brief Constructs a new instance; use asHexNumber() for convenience instead. + AsHexNumber(const T &value) : value(value) {} + const T &value; +}; + +/*! + * \brief Provides operator == required by CPPUNIT_ASSERT_EQUAL. + */ +template bool operator==(const AsHexNumber &lhs, const AsHexNumber &rhs) +{ + return lhs.value == rhs.value; +} + +/*! + * \brief Provides the actual formatting of the output for AsHexNumber class. + */ +template std::ostream &operator<< (std::ostream &out, const AsHexNumber &value) +{ + return out << std::hex << '0' << 'x' << unsigned(value.value) << std::dec; +} + +/*! + * \brief Wraps the value to be printed using the hex system. + */ +template AsHexNumber asHexNumber(const T &value) +{ + return AsHexNumber(value); +} + } #endif // TESTUTILS_H