C++ Utilities  5.0.1
Useful C++ classes and routines such as argument parser, IO and conversion utilities
stringconversion.cpp
Go to the documentation of this file.
1 #include "./stringconversion.h"
2 
3 #ifndef CPP_UTILITIES_NO_THREAD_LOCAL
4 #include "resources/features.h"
5 #else
6 #define CPP_UTILITIES_THREAD_LOCAL
7 #endif
8 
9 #include <cmath>
10 #include <cstdlib>
11 #include <iomanip>
12 #include <limits>
13 #include <memory>
14 #include <sstream>
15 
16 #include <errno.h>
17 #include <iconv.h>
18 
19 #ifdef PLATFORM_WINDOWS
20 #include <windows.h>
21 #endif
22 
23 using namespace std;
24 
25 namespace CppUtilities {
26 
28 
/*!
 * \brief Output size hint which returns the input size unchanged.
 */
struct Keep {
    std::size_t operator()(std::size_t inputSize)
    {
        return inputSize;
    }
};
/*!
 * \brief Output size hint which returns twice the input size.
 */
struct Double {
    std::size_t operator()(std::size_t inputSize)
    {
        return inputSize * 2;
    }
};
/*!
 * \brief Output size hint which returns half of the input size (rounded down).
 */
struct Half {
    std::size_t operator()(std::size_t inputSize)
    {
        return inputSize >> 1;
    }
};
/*!
 * \brief Output size hint which scales the input size by a floating point factor
 *        (the product is truncated towards zero).
 * \remarks The constructor is intentionally implicit so a plain float can be passed
 *          where a Factor is expected.
 */
struct Factor {
    Factor(float factor)
        : factor(factor)
    {
    }
    std::size_t operator()(std::size_t inputSize)
    {
        return static_cast<std::size_t>(factor * inputSize);
    }
    float factor; //!< the multiplier applied to the input size
};
56 
57 template <class OutputSizeHint> class ConversionDescriptor {
58 public:
59  ConversionDescriptor(const char *fromCharset, const char *toCharset)
60  : m_ptr(iconv_open(toCharset, fromCharset))
61  , m_outputSizeHint(OutputSizeHint())
62  {
63  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
64  throw ConversionException("Unable to allocate descriptor for character set conversion.");
65  }
66  }
67 
68  ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint)
69  : m_ptr(iconv_open(toCharset, fromCharset))
70  , m_outputSizeHint(outputSizeHint)
71  {
72  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
73  throw ConversionException("Unable to allocate descriptor for character set conversion.");
74  }
75  }
76 
77  ~ConversionDescriptor()
78  {
79  iconv_close(m_ptr);
80  }
81 
82 public:
83  StringData convertString(const char *inputBuffer, size_t inputBufferSize)
84  {
85  // setup input and output buffer
86  size_t inputBytesLeft = inputBufferSize;
87  size_t outputSize = m_outputSizeHint(inputBufferSize);
88  size_t outputBytesLeft = outputSize;
89  char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
90  size_t bytesWritten;
91 
92  char *currentOutputOffset = outputBuffer;
93  for (;; currentOutputOffset = outputBuffer + bytesWritten) {
94  bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
95  if (bytesWritten == static_cast<size_t>(-1)) {
96  if (errno == EINVAL) {
97  // ignore incomplete multibyte sequence in the input
98  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
99  break;
100  } else if (errno == E2BIG) {
101  // output buffer has no more room for next converted character
102  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
103  outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
104  outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
105  } else /*if(errno == EILSEQ)*/ {
106  // invalid multibyte sequence in the input
107  free(outputBuffer);
108  throw ConversionException("Invalid multibyte sequence in the input.");
109  }
110  } else {
111  // conversion completed without (further) errors
112  break;
113  }
114  }
115  return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
116  }
117 
118 private:
119  iconv_t m_ptr;
120  OutputSizeHint m_outputSizeHint;
121 };
122 
124 
135  const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
136 {
137  return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
138 }
139 
143 StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
144 {
145  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
146  return descriptor.convertString(inputBuffer, inputBufferSize);
147 }
148 
152 StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
153 {
154  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
155  return descriptor.convertString(inputBuffer, inputBufferSize);
156 }
157 
161 StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
162 {
163  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
164  return descriptor.convertString(inputBuffer, inputBufferSize);
165 }
166 
170 StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
171 {
172  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
173  return descriptor.convertString(inputBuffer, inputBufferSize);
174 }
175 
179 StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
180 {
181  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
182  return descriptor.convertString(inputBuffer, inputBufferSize);
183 }
184 
188 StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
189 {
190  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("UTF-8", "ISO-8859-1");
191  return descriptor.convertString(inputBuffer, inputBufferSize);
192 }
193 
194 #ifdef PLATFORM_WINDOWS
195 
201 WideStringData convertMultiByteToWide(const char *inputBuffer, int inputBufferSize)
202 {
203  // calculate required size
204  WideStringData widePath;
205  widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, nullptr, 0);
206  if (widePath.second <= 0) {
207  return widePath;
208  }
209  // do the actual conversion
210  widePath.first = make_unique<wchar_t[]>(static_cast<size_t>(widePath.second));
211  widePath.second = MultiByteToWideChar(CP_UTF8, 0, inputBuffer, inputBufferSize, widePath.first.get(), widePath.second);
212  if (widePath.second <= 0) {
213  widePath.first.reset();
214  }
215  return widePath;
216 }
217 
222 WideStringData convertMultiByteToWide(const std::string &inputBuffer)
223 {
224  return convertMultiByteToWide(
225  inputBuffer.data(), inputBuffer.size() < (numeric_limits<int>::max() - 1) ? static_cast<int>(inputBuffer.size() + 1) : -1);
226 }
227 #endif
228 
/*!
 * \brief Truncates all characters after the first occurrence of the specified
 *        \a terminationChar and the termination character as well.
 * \remarks The string is left unchanged if it does not contain \a terminationChar.
 */
void truncateString(std::string &str, char terminationChar)
{
    const std::string::size_type terminatorPos = str.find(terminationChar);
    if (terminatorPos == std::string::npos) {
        return;
    }
    str.erase(terminatorPos);
}
240 
/*!
 * \brief Converts the specified data size in byte to its equivalent std::string representation.
 * \param sizeInByte Specifies the data size in byte.
 * \param includeByte Specifies whether the exact number of bytes is appended in parentheses
 *        when the size is shown in a bigger unit (KiB, MiB, GiB or TiB).
 * \returns Returns the size using the biggest fitting IEC unit with two decimal places; sizes
 *          below 1024 byte are shown as a plain number of bytes.
 */
std::string dataSizeToString(std::uint64_t sizeInByte, bool includeByte)
{
    constexpr std::uint64_t kib = 1024;
    constexpr std::uint64_t mib = kib * 1024;
    constexpr std::uint64_t gib = mib * 1024;
    constexpr std::uint64_t tib = gib * 1024;

    std::stringstream res(std::stringstream::in | std::stringstream::out);
    res.setf(std::ios::fixed, std::ios::floatfield);
    res << std::setprecision(2);
    if (sizeInByte < kib) {
        res << sizeInByte << " bytes";
    } else if (sizeInByte < mib) {
        res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
    } else if (sizeInByte < gib) {
        res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
    } else if (sizeInByte < tib) {
        res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
    } else {
        res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
    }
    // append the exact size whenever a unit other than bytes was used (this includes exactly 1024 byte)
    if (includeByte && sizeInByte >= kib) {
        res << ' ' << '(' << sizeInByte << " byte)";
    }
    return res.str();
}
267 
/*!
 * \brief Converts the specified bitrate in kbit/s to its equivalent std::string representation.
 * \param bitrateInKbitsPerSecond Specifies the bitrate in kbit/s.
 * \param useIecBinaryPrefixes Specifies whether the result is expressed in byte-based units
 *        (byte/s, KiB/s, MiB/s, GiB/s) instead of bit-based units (bit/s, kbit/s, Mbit/s, Gbit/s).
 * \returns Returns the scaled value with a precision of three significant digits followed by
 *          the unit, or "indeterminable" if the bitrate is NaN.
 */
std::string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
    std::stringstream out(std::stringstream::in | std::stringstream::out);
    out << std::setprecision(3);

    if (std::isnan(bitrateInKbitsPerSecond)) {
        out << "indeterminable";
        return out.str();
    }

    // pick the scaled value and the matching unit first, then print both at once
    double scaledBitrate;
    const char *unit;
    if (useIecBinaryPrefixes) {
        if (bitrateInKbitsPerSecond < 8.0) {
            scaledBitrate = bitrateInKbitsPerSecond * 125.0;
            unit = " byte/s";
        } else if (bitrateInKbitsPerSecond < 8000.0) {
            scaledBitrate = bitrateInKbitsPerSecond * 0.125;
            unit = " KiB/s";
        } else if (bitrateInKbitsPerSecond < 8000000.0) {
            scaledBitrate = bitrateInKbitsPerSecond * 0.000125;
            unit = " MiB/s";
        } else {
            scaledBitrate = bitrateInKbitsPerSecond * 0.000000125;
            unit = " GiB/s";
        }
    } else {
        if (bitrateInKbitsPerSecond < 1.0) {
            scaledBitrate = bitrateInKbitsPerSecond * 1000.0;
            unit = " bit/s";
        } else if (bitrateInKbitsPerSecond < 1000.0) {
            scaledBitrate = bitrateInKbitsPerSecond;
            unit = " kbit/s";
        } else if (bitrateInKbitsPerSecond < 1000000.0) {
            scaledBitrate = bitrateInKbitsPerSecond * 0.001;
            unit = " Mbit/s";
        } else {
            scaledBitrate = bitrateInKbitsPerSecond * 0.000001;
            unit = " Gbit/s";
        }
    }
    out << scaledBitrate << unit;
    return out.str();
}
307 
// the Base64 alphabet (index = 6-bit value) and the padding character
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const char base64Pad = '=';

/*!
 * \brief Encodes the specified data to Base64.
 * \param data Specifies the bytes to be encoded; only dereferenced if \a dataSize is not zero.
 * \param dataSize Specifies the number of bytes to be encoded.
 * \returns Returns the Base64 representation, padded with '=' to a multiple of four characters.
 */
std::string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
{
    std::string encoded;
    const std::uint32_t remainder = dataSize % 3;
    // compute the size as std::size_t so the multiplication is not done in 32-bit arithmetic
    encoded.reserve((static_cast<std::size_t>(dataSize / 3) + (remainder ? 1u : 0u)) * 4u);

    // encode all complete 3-byte groups as 4 characters each
    const std::uint8_t *const completeGroupsEnd = data + (dataSize - remainder);
    for (; data != completeGroupsEnd; data += 3) {
        const std::uint32_t group
            = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8) | static_cast<std::uint32_t>(data[2]);
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 6) & 0x3F]);
        encoded.push_back(base64Chars[group & 0x3F]);
    }

    // encode the remaining 1 or 2 bytes and fill up with padding
    switch (remainder) {
    case 1: {
        const std::uint32_t group = static_cast<std::uint32_t>(data[0]) << 16;
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Pad);
        encoded.push_back(base64Pad);
        break;
    }
    case 2: {
        const std::uint32_t group = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8);
        encoded.push_back(base64Chars[(group >> 18) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 12) & 0x3F]);
        encoded.push_back(base64Chars[(group >> 6) & 0x3F]);
        encoded.push_back(base64Pad);
        break;
    }
    default:;
    }
    return encoded;
}
351 
357 pair<unique_ptr<std::uint8_t[]>, std::uint32_t> decodeBase64(const char *encodedStr, const std::uint32_t strSize)
358 {
359  if (strSize % 4) {
360  throw ConversionException("invalid size of base64");
361  }
362  std::uint32_t decodedSize = (strSize / 4) * 3;
363  const char *const end = encodedStr + strSize;
364  if (strSize) {
365  if (*(end - 1) == base64Pad) {
366  --decodedSize;
367  }
368  if (*(end - 2) == base64Pad) {
369  --decodedSize;
370  }
371  }
372  auto buffer = make_unique<std::uint8_t[]>(decodedSize);
373  auto *iter = buffer.get() - 1;
374  while (encodedStr < end) {
375  std::int32_t temp = 0;
376  for (std::uint8_t quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
377  temp <<= 6;
378  if (*encodedStr >= 'A' && *encodedStr <= 'Z') {
379  temp |= *encodedStr - 'A';
380  } else if (*encodedStr >= 'a' && *encodedStr <= 'z') {
381  temp |= *encodedStr - 'a' + 26;
382  } else if (*encodedStr >= '0' && *encodedStr <= '9') {
383  temp |= *encodedStr - '0' + 2 * 26;
384  } else if (*encodedStr == '+') {
385  temp |= 2 * 26 + 10;
386  } else if (*encodedStr == '/') {
387  temp |= 2 * 26 + 10 + 1;
388  } else if (*encodedStr == base64Pad) {
389  switch (end - encodedStr) {
390  case 1:
391  *++iter = (temp >> 16) & 0xFF;
392  *++iter = (temp >> 8) & 0xFF;
393  return make_pair(move(buffer), decodedSize);
394  case 2:
395  *++iter = (temp >> 10) & 0xFF;
396  return make_pair(move(buffer), decodedSize);
397  default:
398  throw ConversionException("invalid padding in base64");
399  }
400  } else {
401  throw ConversionException("invalid character in base64");
402  }
403  }
404  *++iter = (temp >> 16) & 0xFF;
405  *++iter = (temp >> 8) & 0xFF;
406  *++iter = (temp)&0xFF;
407  }
408  return make_pair(move(buffer), decodedSize);
409 }
410 } // namespace CppUtilities
stringconversion.h
CppUtilities::convertLatin1ToUtf8
CPP_UTILITIES_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified Latin-1 string to UTF-8.
Definition: stringconversion.cpp:179
CppUtilities::convertUtf16BEToUtf8
CPP_UTILITIES_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (big-endian) string to UTF-8.
Definition: stringconversion.cpp:170
CppUtilities::truncateString
CPP_UTILITIES_EXPORT void truncateString(std::string &str, char terminationChar='\0')
Truncates all characters after the first occurrence of the specified terminationChar and the termination character as well.
Definition: stringconversion.cpp:233
CppUtilities::convertString
CPP_UTILITIES_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor=1.0f)
Converts the specified string from one character set to another.
Definition: stringconversion.cpp:134
CppUtilities::convertUtf8ToUtf16BE
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (big-endian).
Definition: stringconversion.cpp:161
CppUtilities::bitrateToString
CPP_UTILITIES_EXPORT std::string bitrateToString(double speedInKbitsPerSecond, bool useByteInsteadOfBits=false)
Converts the specified bitrate in kbit/s to its equivalent std::string representation.
Definition: stringconversion.cpp:278
CppUtilities::max
constexpr T max(T first, T second)
Returns the greatest of the given items.
Definition: math.h:100
CppUtilities::decodeBase64
CPP_UTILITIES_EXPORT std::pair< std::unique_ptr< std::uint8_t[]>, std::uint32_t > decodeBase64(const char *encodedStr, const std::uint32_t strSize)
Decodes the specified Base64 encoded string.
Definition: stringconversion.cpp:357
CppUtilities
Contains all utilities provided by the c++utilities library.
Definition: argumentparser.h:17
CppUtilities::EmptyPartsTreat::Keep
CppUtilities::convertUtf16LEToUtf8
CPP_UTILITIES_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (little-endian) string to UTF-8.
Definition: stringconversion.cpp:152
CppUtilities::dataSizeToString
CPP_UTILITIES_EXPORT std::string dataSizeToString(std::uint64_t sizeInByte, bool includeByte=false)
Converts the specified data size in byte to its equivalent std::string representation.
Definition: stringconversion.cpp:246
CppUtilities::ConversionException
The ConversionException class is thrown by the various conversion functions of this library when a conversion error occurs.
Definition: conversionexception.h:11
CppUtilities::convertUtf8ToLatin1
CPP_UTILITIES_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to Latin-1.
Definition: stringconversion.cpp:188
CppUtilities::StringData
std::pair< std::unique_ptr< char[], StringDataDeleter >, std::size_t > StringData
Type used to return string encoding conversion result.
Definition: stringconversion.h:38
CppUtilities::convertUtf8ToUtf16LE
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (little-endian).
Definition: stringconversion.cpp:143
CppUtilities::encodeBase64
CPP_UTILITIES_EXPORT std::string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
Encodes the specified data to Base64.
Definition: stringconversion.cpp:317