C++ Utilities  4.14.2
Useful C++ classes and routines such as argument parser, IO and conversion utilities
stringconversion.cpp
Go to the documentation of this file.
1 #include "./stringconversion.h"
2 
3 #ifndef CPP_UTILITIES_NO_THREAD_LOCAL
4 #include "resources/features.h"
5 #else
6 #define CPP_UTILITIES_THREAD_LOCAL
7 #endif
8 
9 #include <cstdlib>
10 #include <iomanip>
11 #include <memory>
12 #include <sstream>
13 
14 #include <errno.h>
15 #include <iconv.h>
16 
17 using namespace std;
18 
26 namespace ConversionUtilities {
27 
29 
/*!
 * \brief Output size hint suggesting an output buffer of exactly the input size.
 */
struct Keep {
    /*!
     * \brief Returns \a value unchanged.
     * \remarks Const-qualified so the hint can also be invoked through a const object.
     */
    std::size_t operator()(std::size_t value) const
    {
        return value;
    }
};
/*!
 * \brief Output size hint suggesting an output buffer of twice the input size.
 */
struct Double {
    /*!
     * \brief Returns twice \a value.
     * \remarks Const-qualified so the hint can also be invoked through a const object.
     */
    std::size_t operator()(std::size_t value) const
    {
        return value + value;
    }
};
/*!
 * \brief Output size hint suggesting an output buffer of half the input size (rounded down).
 */
struct Half {
    /*!
     * \brief Returns \a value divided by two using integer division.
     * \remarks Const-qualified so the hint can also be invoked through a const object.
     */
    std::size_t operator()(std::size_t value) const
    {
        return value / 2;
    }
};
/*!
 * \brief Output size hint scaling the input size by a caller-provided factor.
 */
struct Factor {
    /*!
     * \brief Stores the scaling \a factor.
     * \remarks Deliberately not explicit: a plain float is passed where a Factor is expected.
     */
    Factor(float factor)
        : factor(factor)
    {
    }

    /*!
     * \brief Returns \a value multiplied by the factor, truncated towards zero.
     * \remarks Returns 0 when the product is not positive (negative factor, zero or NaN) because converting
     *          such a floating point value to an unsigned integer would be undefined.
     */
    std::size_t operator()(std::size_t value) const
    {
        const float scaled = static_cast<float>(value) * factor;
        return scaled > 0.0f ? static_cast<std::size_t>(scaled) : 0;
    }

    float factor;
};
57 
58 template <class OutputSizeHint> class ConversionDescriptor {
59 public:
60  ConversionDescriptor(const char *fromCharset, const char *toCharset)
61  : m_ptr(iconv_open(toCharset, fromCharset))
62  , m_outputSizeHint(OutputSizeHint())
63  {
64  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
65  throw ConversionException("Unable to allocate descriptor for character set conversion.");
66  }
67  }
68 
69  ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint)
70  : m_ptr(iconv_open(toCharset, fromCharset))
71  , m_outputSizeHint(outputSizeHint)
72  {
73  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
74  throw ConversionException("Unable to allocate descriptor for character set conversion.");
75  }
76  }
77 
78  ~ConversionDescriptor()
79  {
80  iconv_close(m_ptr);
81  }
82 
83 public:
84  StringData convertString(const char *inputBuffer, size_t inputBufferSize)
85  {
86  // setup input and output buffer
87  size_t inputBytesLeft = inputBufferSize;
88  size_t outputSize = m_outputSizeHint(inputBufferSize);
89  size_t outputBytesLeft = outputSize;
90  char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
91  size_t bytesWritten;
92 
93  char *currentOutputOffset = outputBuffer;
94  for (;; currentOutputOffset = outputBuffer + bytesWritten) {
95  bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
96  if (bytesWritten == static_cast<size_t>(-1)) {
97  if (errno == EINVAL) {
98  // ignore incomplete multibyte sequence in the input
99  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
100  break;
101  } else if (errno == E2BIG) {
102  // output buffer has no more room for next converted character
103  bytesWritten = static_cast<size_t>(currentOutputOffset - outputBuffer);
104  outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
105  outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
106  } else /*if(errno == EILSEQ)*/ {
107  // invalid multibyte sequence in the input
108  free(outputBuffer);
109  throw ConversionException("Invalid multibyte sequence in the input.");
110  }
111  } else {
112  // conversion completed without (further) errors
113  break;
114  }
115  }
116  return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
117  }
118 
119 private:
120  iconv_t m_ptr;
121  OutputSizeHint m_outputSizeHint;
122 };
123 
125 
136  const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
137 {
138  return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
139 }
140 
144 StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
145 {
146  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
147  return descriptor.convertString(inputBuffer, inputBufferSize);
148 }
149 
153 StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
154 {
155  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
156  return descriptor.convertString(inputBuffer, inputBufferSize);
157 }
158 
162 StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
163 {
164  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
165  return descriptor.convertString(inputBuffer, inputBufferSize);
166 }
167 
171 StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
172 {
173  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
174  return descriptor.convertString(inputBuffer, inputBufferSize);
175 }
176 
180 StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
181 {
182  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
183  return descriptor.convertString(inputBuffer, inputBufferSize);
184 }
185 
189 StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
190 {
191  CPP_UTILITIES_THREAD_LOCAL ConversionDescriptor<Keep> descriptor("UTF-8", "ISO-8859-1");
192  return descriptor.convertString(inputBuffer, inputBufferSize);
193 }
194 
/*!
 * \brief Cuts \a str off at the first occurrence of \a terminationChar.
 * \remarks The termination character itself is removed as well. The string is left untouched
 *          if it does not contain \a terminationChar.
 */
void truncateString(std::string &str, char terminationChar)
{
    const std::string::size_type terminatorPos = str.find(terminationChar);
    if (terminatorPos == std::string::npos) {
        return;
    }
    str.erase(terminatorPos);
}
206 
/*!
 * \brief Converts the specified data size in byte to its equivalent std::string representation.
 * \param sizeInByte Data size in bytes.
 * \param includeByte Whether the exact number of bytes should be appended in parentheses when a
 *        unit other than bytes is used.
 * \returns The size as plain number of bytes when below 1024; otherwise the size in KiB, MiB, GiB or
 *          TiB with two decimal places.
 */
std::string dataSizeToString(std::uint64_t sizeInByte, bool includeByte)
{
    constexpr std::uint64_t kibibyte = 1024;
    constexpr std::uint64_t mebibyte = 1048576;
    constexpr std::uint64_t gibibyte = 1073741824;
    constexpr std::uint64_t tebibyte = 1099511627776;

    std::stringstream res(std::stringstream::in | std::stringstream::out);
    res.setf(std::ios::fixed, std::ios::floatfield);
    res << std::setprecision(2);
    if (sizeInByte < kibibyte) {
        res << sizeInByte << " bytes";
    } else if (sizeInByte < mebibyte) {
        res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
    } else if (sizeInByte < gibibyte) {
        res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
    } else if (sizeInByte < tebibyte) {
        res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
    } else {
        res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
    }
    // append the exact size whenever a scaled unit has been used; this includes exactly 1024 bytes
    // which is printed as "1.00 KiB"
    if (includeByte && sizeInByte >= kibibyte) {
        res << ' ' << '(' << sizeInByte << " byte)";
    }
    return res.str();
}
233 
/*!
 * \brief Converts the specified bitrate in kbit/s to its equivalent std::string representation.
 * \param bitrateInKbitsPerSecond Bitrate in kbit/s.
 * \param useIecBinaryPrefixes Whether the result is expressed in byte/s, KiB/s, MiB/s or GiB/s instead
 *        of bit/s, kbit/s, Mbit/s or Gbit/s.
 * \returns The scaled value printed with a precision of three significant digits followed by the unit.
 */
std::string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
    // a unit is used for bitrates below its upper bound; the last entry of each table is the fallback
    struct Unit {
        double upperBound;
        double factor;
        const char *suffix;
    };
    static const Unit byteUnits[4] = {
        { 8.0, 125.0, " byte/s" },
        { 8000.0, 0.125, " KiB/s" },
        { 8000000.0, 0.000125, " MiB/s" },
        { 0.0, 0.000000125, " GiB/s" },
    };
    static const Unit bitUnits[4] = {
        { 1.0, 1000.0, " bit/s" },
        { 1000.0, 1.0, " kbit/s" },
        { 1000000.0, 0.001, " Mbit/s" },
        { 0.0, 0.000001, " Gbit/s" },
    };

    const Unit *const units = useIecBinaryPrefixes ? byteUnits : bitUnits;
    const Unit *selected = units + 3;
    for (const Unit *candidate = units; candidate != units + 3; ++candidate) {
        if (bitrateInKbitsPerSecond < candidate->upperBound) {
            selected = candidate;
            break;
        }
    }

    std::stringstream res(std::stringstream::in | std::stringstream::out);
    res << std::setprecision(3);
    res << (bitrateInKbitsPerSecond * selected->factor) << selected->suffix;
    return res.str();
}
271 
// alphabet and padding character used for Base64 encoding and decoding
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const char base64Pad = '=';

/*!
 * \brief Encodes the specified data to Base64.
 * \param data Start of the data to encode; only dereferenced if \a dataSize is not zero.
 * \param dataSize Number of bytes to encode.
 * \returns The encoded string; padded with '=' so its length is always a multiple of four.
 */
std::string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
{
    std::string encoded;
    const std::uint32_t remainder = dataSize % 3;
    // compute the size as std::size_t so the multiplication cannot wrap around in 32 bits
    encoded.reserve((static_cast<std::size_t>(dataSize / 3) + (remainder ? 1u : 0u)) * 4u);

    // encode all complete groups of three bytes (never forming a pointer outside of the input)
    const std::uint8_t *const fullGroupsEnd = data + (dataSize - remainder);
    for (; data != fullGroupsEnd; data += 3) {
        const std::uint32_t group
            = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8) | static_cast<std::uint32_t>(data[2]);
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(group & 0x00000FC0) >> 6]);
        encoded.push_back(base64Chars[(group & 0x0000003F)]);
    }

    // encode the remaining one or two bytes and pad the last group
    switch (remainder) {
    case 1: {
        const std::uint32_t group = static_cast<std::uint32_t>(data[0]) << 16;
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Pad);
        encoded.push_back(base64Pad);
        break;
    }
    case 2: {
        const std::uint32_t group = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8);
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(group & 0x00000FC0) >> 6]);
        encoded.push_back(base64Pad);
        break;
    }
    default:
        break;
    }
    return encoded;
}
315 
321 pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
322 {
323  if (strSize % 4) {
324  throw ConversionException("invalid size of base64");
325  }
326  uint32 decodedSize = (strSize / 4) * 3;
327  const char *const end = encodedStr + strSize;
328  if (strSize) {
329  if (*(end - 1) == base64Pad) {
330  --decodedSize;
331  }
332  if (*(end - 2) == base64Pad) {
333  --decodedSize;
334  }
335  }
336  auto buffer = make_unique<byte[]>(decodedSize);
337  auto *iter = buffer.get() - 1;
338  while (encodedStr < end) {
339  int32 temp = 0;
340  for (byte quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
341  temp <<= 6;
342  if (*encodedStr >= 'A' && *encodedStr <= 'Z') {
343  temp |= *encodedStr - 'A';
344  } else if (*encodedStr >= 'a' && *encodedStr <= 'z') {
345  temp |= *encodedStr - 'a' + 26;
346  } else if (*encodedStr >= '0' && *encodedStr <= '9') {
347  temp |= *encodedStr - '0' + 2 * 26;
348  } else if (*encodedStr == '+') {
349  temp |= 2 * 26 + 10;
350  } else if (*encodedStr == '/') {
351  temp |= 2 * 26 + 10 + 1;
352  } else if (*encodedStr == base64Pad) {
353  switch (end - encodedStr) {
354  case 1:
355  *++iter = (temp >> 16) & 0xFF;
356  *++iter = (temp >> 8) & 0xFF;
357  return make_pair(move(buffer), decodedSize);
358  case 2:
359  *++iter = (temp >> 10) & 0xFF;
360  return make_pair(move(buffer), decodedSize);
361  default:
362  throw ConversionException("invalid padding in base64");
363  }
364  } else {
365  throw ConversionException("invalid character in base64");
366  }
367  }
368  *++iter = (temp >> 16) & 0xFF;
369  *++iter = (temp >> 8) & 0xFF;
370  *++iter = (temp)&0xFF;
371  }
372  return make_pair(move(buffer), decodedSize);
373 }
374 } // namespace ConversionUtilities
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (little-endian).
CPP_UTILITIES_EXPORT std::string encodeBase64(const byte *data, uint32 dataSize)
Encodes the specified data to Base64.
The ConversionException class is thrown by the various conversion functions of this library when a conversion error occurs.
CPP_UTILITIES_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (big-endian) string to UTF-8.
STL namespace.
CPP_UTILITIES_EXPORT void truncateString(std::string &str, char terminationChar='\0')
Truncates all characters after the first occurrence of the specified terminationChar and the termination character as well.
std::uint64_t uint64
unsigned 64-bit integer
Definition: types.h:49
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (big-endian).
CPP_UTILITIES_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to Latin-1.
Contains several functions providing conversions between different data types.
std::uint32_t uint32
unsigned 32-bit integer
Definition: types.h:44
std::pair< std::unique_ptr< char[], StringDataDeleter >, std::size_t > StringData
Type used to return string encoding conversion result.
CPP_UTILITIES_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (little-endian) string to UTF-8.
std::int32_t int32
signed 32-bit integer
Definition: types.h:24
CPP_UTILITIES_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor=1.0f)
Converts the specified string from one character set to another.
std::uint8_t byte
unsigned byte
Definition: types.h:14
CPP_UTILITIES_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified Latin-1 string to UTF-8.
CPP_UTILITIES_EXPORT std::string dataSizeToString(uint64 sizeInByte, bool includeByte=false)
Converts the specified data size in byte to its equivalent std::string representation.
CPP_UTILITIES_EXPORT std::pair< std::unique_ptr< byte[]>, uint32 > decodeBase64(const char *encodedStr, const uint32 strSize)
Decodes the specified Base64 encoded string.
CPP_UTILITIES_EXPORT std::string bitrateToString(double speedInKbitsPerSecond, bool useByteInsteadOfBits=false)
Converts the specified bitrate in kbit/s to its equivalent std::string representation.