C++ Utilities  4.9.1
Common C++ classes and routines used by my applications such as argument parser, IO and conversion utilities
stringconversion.cpp
Go to the documentation of this file.
#include "./stringconversion.h"

#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <memory>
#include <new>
#include <sstream>

#include <errno.h>
#include <iconv.h>
10 
11 using namespace std;
12 
20 namespace ConversionUtilities {
21 
23 
/*!
 * \brief Output size hint which returns the input size unchanged.
 *
 * Used as OutputSizeHint of ConversionDescriptor when the converted string is
 * expected to need about as many bytes as the input.
 */
struct Keep {
    /// Returns \a value as-is; const so the hint can be invoked through a const object.
    std::size_t operator()(std::size_t value) const
    {
        return value;
    }
};
/*!
 * \brief Output size hint which returns twice the input size.
 *
 * Used as OutputSizeHint of ConversionDescriptor when the converted string is
 * expected to need about two output bytes per input byte.
 */
struct Double {
    /// Returns \a value doubled; const so the hint can be invoked through a const object.
    std::size_t operator()(std::size_t value) const
    {
        return value + value;
    }
};
/*!
 * \brief Output size hint which returns half of the input size (rounded down).
 *
 * Used as OutputSizeHint of ConversionDescriptor when the converted string is
 * expected to need about one output byte per two input bytes.
 */
struct Half {
    /// Returns \a value divided by two; const so the hint can be invoked through a const object.
    std::size_t operator()(std::size_t value) const
    {
        return value / 2;
    }
};
/*!
 * \brief Output size hint which scales the input size by a caller-supplied factor.
 *
 * The result of the multiplication is truncated towards zero.
 */
struct Factor {
    /// Stores \a factor. Intentionally not explicit: callers in this file pass a plain
    /// floating point value where a Factor is expected.
    Factor(float factor)
        : factor(factor)
    {
    }

    /// Returns \a value multiplied by the stored factor; the conversions are spelled out
    /// so the float round trip is visible.
    std::size_t operator()(std::size_t value) const
    {
        return static_cast<std::size_t>(static_cast<float>(value) * factor);
    }

    float factor;
};
51 
52 template <class OutputSizeHint> class ConversionDescriptor {
53 public:
54  ConversionDescriptor(const char *fromCharset, const char *toCharset)
55  : m_ptr(iconv_open(toCharset, fromCharset))
56  , m_outputSizeHint(OutputSizeHint())
57  {
58  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
59  throw ConversionException("Unable to allocate descriptor for character set conversion.");
60  }
61  }
62 
63  ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint)
64  : m_ptr(iconv_open(toCharset, fromCharset))
65  , m_outputSizeHint(outputSizeHint)
66  {
67  if (m_ptr == reinterpret_cast<iconv_t>(-1)) {
68  throw ConversionException("Unable to allocate descriptor for character set conversion.");
69  }
70  }
71 
72  ~ConversionDescriptor()
73  {
74  iconv_close(m_ptr);
75  }
76 
77 public:
78  StringData convertString(const char *inputBuffer, size_t inputBufferSize)
79  {
80  // setup input and output buffer
81  size_t inputBytesLeft = inputBufferSize;
82  size_t outputSize = m_outputSizeHint(inputBufferSize);
83  size_t outputBytesLeft = outputSize;
84  char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
85  size_t bytesWritten;
86 
87  char *currentOutputOffset = outputBuffer;
88  for (;; currentOutputOffset = outputBuffer + bytesWritten) {
89  bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
90  if (bytesWritten == static_cast<size_t>(-1)) {
91  if (errno == EINVAL) {
92  // ignore incomplete multibyte sequence in the input
93  bytesWritten = currentOutputOffset - outputBuffer;
94  break;
95  } else if (errno == E2BIG) {
96  // output buffer has no more room for next converted character
97  bytesWritten = currentOutputOffset - outputBuffer;
98  outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
99  outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
100  } else /*if(errno == EILSEQ)*/ {
101  // invalid multibyte sequence in the input
102  free(outputBuffer);
103  throw ConversionException("Invalid multibyte sequence in the input.");
104  }
105  } else {
106  // conversion completed without (further) errors
107  break;
108  }
109  }
110  return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
111  }
112 
113 private:
114  iconv_t m_ptr;
115  OutputSizeHint m_outputSizeHint;
116 };
117 
119 
130  const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
131 {
132  return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
133 }
134 
138 StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
139 {
140  static ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
141  return descriptor.convertString(inputBuffer, inputBufferSize);
142 }
143 
147 StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
148 {
149  static ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
150  return descriptor.convertString(inputBuffer, inputBufferSize);
151 }
152 
156 StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
157 {
158  static ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
159  return descriptor.convertString(inputBuffer, inputBufferSize);
160 }
161 
165 StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
166 {
167  static ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
168  return descriptor.convertString(inputBuffer, inputBufferSize);
169 }
170 
174 StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
175 {
176  static ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
177  return descriptor.convertString(inputBuffer, inputBufferSize);
178 }
179 
183 StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
184 {
185  static ConversionDescriptor<Factor> descriptor("UTF-8", "ISO-8859-1", 1.1);
186  return descriptor.convertString(inputBuffer, inputBufferSize);
187 }
188 
/*!
 * \brief Cuts \a str at the first occurrence of \a terminationChar.
 *
 * The termination character itself is removed as well. \a str is left untouched
 * if it does not contain \a terminationChar.
 */
void truncateString(std::string &str, char terminationChar)
{
    const auto cutPosition = str.find(terminationChar);
    if (cutPosition == std::string::npos) {
        return;
    }
    str.erase(cutPosition);
}
200 
/*!
 * \brief Converts the specified data size in byte to its equivalent std::string representation.
 *
 * Sizes below 1024 are printed as plain byte count; larger sizes are printed with two
 * decimals using the units KiB, MiB, GiB and TiB.
 * \param sizeInByte Specifies the size in byte.
 * \param includeByte Specifies whether the exact byte count should be appended in
 *        parentheses when the size is printed with one of the larger units.
 */
std::string dataSizeToString(std::uint64_t sizeInByte, bool includeByte)
{
    // unsigned thresholds avoid comparing the unsigned size against signed literals
    constexpr std::uint64_t kibibyte = 1024u;
    constexpr std::uint64_t mebibyte = kibibyte * 1024u;
    constexpr std::uint64_t gibibyte = mebibyte * 1024u;
    constexpr std::uint64_t tebibyte = gibibyte * 1024u;

    std::ostringstream res;
    res.setf(std::ios::fixed, std::ios::floatfield);
    res << std::setprecision(2);
    if (sizeInByte < kibibyte) {
        res << sizeInByte << " bytes";
    } else if (sizeInByte < mebibyte) {
        res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
    } else if (sizeInByte < gibibyte) {
        res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
    } else if (sizeInByte < tebibyte) {
        res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
    } else {
        res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
    }
    // ">=" so exactly 1024 bytes, which is already printed as "1.00 KiB", gets the
    // exact byte count as well
    if (includeByte && sizeInByte >= kibibyte) {
        res << ' ' << '(' << sizeInByte << " byte)";
    }
    return res.str();
}
227 
/*!
 * \brief Converts the specified bitrate in kbit/s to its equivalent std::string representation.
 *
 * The value is printed with a precision of three significant digits.
 * \param bitrateInKbitsPerSecond Specifies the bitrate in kbit/s.
 * \param useIecBinaryPrefixes Selects the byte based units (byte/s, KiB/s, MiB/s, GiB/s)
 *        instead of the bit based ones (bit/s, kbit/s, Mbit/s, Gbit/s).
 */
std::string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
    // one row per unit; the last row of each table is the fallback and its limit is never read
    struct Scale {
        double exclusiveLimit;
        double multiplier;
        const char *unit;
    };
    // NOTE(review): the byte based rows use decimal factors although the units are
    // labelled with binary prefixes - kept as-is to preserve the output
    static const Scale byteScales[] = {
        { 8.0, 125.0, " byte/s" },
        { 8000.0, 0.125, " KiB/s" },
        { 8000000.0, 0.000125, " MiB/s" },
        { 0.0, 0.000000125, " GiB/s" },
    };
    static const Scale bitScales[] = {
        { 1.0, 1000.0, " bit/s" },
        { 1000.0, 1.0, " kbit/s" },
        { 1000000.0, 0.001, " Mbit/s" },
        { 0.0, 0.000001, " Gbit/s" },
    };

    // advance to the first row whose limit exceeds the bitrate, stopping at the fallback row
    const Scale *scale = useIecBinaryPrefixes ? byteScales : bitScales;
    for (int remaining = 3; remaining > 0 && !(bitrateInKbitsPerSecond < scale->exclusiveLimit); --remaining) {
        ++scale;
    }

    std::ostringstream formatted;
    formatted << std::setprecision(3) << (bitrateInKbitsPerSecond * scale->multiplier) << scale->unit;
    return formatted.str();
}
265 
/// Base64 alphabet indexed by 6-bit value.
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
/// Character used to pad the last quantum of Base64 encoded data.
const char base64Pad = '=';

/*!
 * \brief Encodes the specified data to Base64.
 * \param data Points to the first of \a dataSize bytes to encode; not dereferenced
 *        if \a dataSize is zero.
 * \param dataSize Specifies the number of bytes to encode.
 * \returns The encoded string; its length is always a multiple of four.
 */
std::string encodeBase64(const std::uint8_t *data, std::uint32_t dataSize)
{
    std::string encoded;
    const std::uint32_t remainder = dataSize % 3;
    // computed in the string's size type so the multiplication can not wrap at 32 bit
    encoded.reserve((static_cast<std::string::size_type>(dataSize / 3) + (remainder ? 1u : 0u)) * 4u);

    // complete groups of three input bytes yield four characters each; the pointer
    // only ever moves forward within [data, data + dataSize]
    for (const std::uint8_t *const fullGroupsEnd = data + (dataSize - remainder); data != fullGroupsEnd; data += 3) {
        const std::uint32_t quantum
            = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8) | static_cast<std::uint32_t>(data[2]);
        encoded.push_back(base64Chars[(quantum & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(quantum & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(quantum & 0x00000FC0) >> 6]);
        encoded.push_back(base64Chars[(quantum & 0x0000003F)]);
    }

    // one or two leftover bytes yield two or three characters followed by padding
    switch (remainder) {
    case 1: {
        const std::uint32_t quantum = static_cast<std::uint32_t>(data[0]) << 16;
        encoded.push_back(base64Chars[(quantum & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(quantum & 0x0003F000) >> 12]);
        encoded.push_back(base64Pad);
        encoded.push_back(base64Pad);
        break;
    }
    case 2: {
        const std::uint32_t quantum = (static_cast<std::uint32_t>(data[0]) << 16) | (static_cast<std::uint32_t>(data[1]) << 8);
        encoded.push_back(base64Chars[(quantum & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(quantum & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(quantum & 0x00000FC0) >> 6]);
        encoded.push_back(base64Pad);
        break;
    }
    default:
        break;
    }
    return encoded;
}
308 
313 pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
314 {
315  if (strSize % 4) {
316  throw ConversionException("invalid size of base64");
317  }
318  uint32 decodedSize = (strSize / 4) * 3;
319  const char *const end = encodedStr + strSize;
320  if (strSize) {
321  if (*(end - 1) == base64Pad) {
322  --decodedSize;
323  }
324  if (*(end - 2) == base64Pad) {
325  --decodedSize;
326  }
327  }
328  auto buffer = make_unique<byte[]>(decodedSize);
329  auto *iter = buffer.get() - 1;
330  while (encodedStr < end) {
331  uint32 temp = 0;
332  for (byte quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
333  temp <<= 6;
334  if (*encodedStr >= 'A' && *encodedStr <= 'Z') {
335  temp |= *encodedStr - 'A';
336  } else if (*encodedStr >= 'a' && *encodedStr <= 'z') {
337  temp |= *encodedStr - 'a' + 26;
338  } else if (*encodedStr >= '0' && *encodedStr <= '9') {
339  temp |= *encodedStr - '0' + 2 * 26;
340  } else if (*encodedStr == '+') {
341  temp |= 2 * 26 + 10;
342  } else if (*encodedStr == '/') {
343  temp |= 2 * 26 + 10 + 1;
344  } else if (*encodedStr == base64Pad) {
345  switch (end - encodedStr) {
346  case 1:
347  *++iter = (temp >> 16) & 0xFF;
348  *++iter = (temp >> 8) & 0xFF;
349  return make_pair(move(buffer), decodedSize);
350  case 2:
351  *++iter = (temp >> 10) & 0xFF;
352  return make_pair(move(buffer), decodedSize);
353  default:
354  throw ConversionException("invalid padding in base64");
355  }
356  } else {
357  throw ConversionException("invalid character in base64");
358  }
359  }
360  *++iter = (temp >> 16) & 0xFF;
361  *++iter = (temp >> 8) & 0xFF;
362  *++iter = (temp)&0xFF;
363  }
364  return make_pair(move(buffer), decodedSize);
365 }
366 }
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (little-endian).
CPP_UTILITIES_EXPORT std::string encodeBase64(const byte *data, uint32 dataSize)
Encodes the specified data to Base64.
The ConversionException class is thrown by the various conversion functions of this library when a conversion error occurs.
CPP_UTILITIES_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (big-endian) string to UTF-8.
STL namespace.
CPP_UTILITIES_EXPORT void truncateString(std::string &str, char terminationChar='\0')
Truncates all characters after the first occurrence of the specified terminationChar and the termination character as well.
std::uint64_t uint64
unsigned 64-bit integer
Definition: types.h:49
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (big-endian).
CPP_UTILITIES_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to Latin-1.
Contains several functions providing conversions between different data types.
std::uint32_t uint32
unsigned 32-bit integer
Definition: types.h:44
std::pair< std::unique_ptr< char[], StringDataDeleter >, std::size_t > StringData
Type used to return string encoding conversion result.
CPP_UTILITIES_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (little-endian) string to UTF-8.
CPP_UTILITIES_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor=1.0f)
Converts the specified string from one character set to another.
std::uint8_t byte
unsigned byte
Definition: types.h:14
CPP_UTILITIES_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified Latin-1 string to UTF-8.
CPP_UTILITIES_EXPORT std::string dataSizeToString(uint64 sizeInByte, bool includeByte=false)
Converts the specified data size in byte to its equivalent std::string representation.
CPP_UTILITIES_EXPORT std::pair< std::unique_ptr< byte[]>, uint32 > decodeBase64(const char *encodedStr, const uint32 strSize)
Decodes the specified Base64 encoded string.
CPP_UTILITIES_EXPORT std::string bitrateToString(double speedInKbitsPerSecond, bool useByteInsteadOfBits=false)
Converts the specified bitrate in kbit/s to its equivalent std::string representation.