C++ Utilities  4.6.1
Common C++ classes and routines used by my applications such as argument parser, IO and conversion utilities
stringconversion.cpp
Go to the documentation of this file.
1 #include "./stringconversion.h"
2 
3 #include <memory>
4 #include <sstream>
5 #include <iomanip>
6 #include <cstdlib>
7 
8 #include <iconv.h>
9 #include <errno.h>
10 
11 using namespace std;
12 
20 namespace ConversionUtilities
21 {
22 
24 
/*!
 * \brief Output size hint: the output buffer gets the same size as the input.
 */
struct Keep {
    std::size_t operator()(std::size_t value)
    {
        return value;
    }
};

/*!
 * \brief Output size hint: the output buffer gets twice the size of the input.
 */
struct Double {
    std::size_t operator()(std::size_t value)
    {
        return value * 2;
    }
};

/*!
 * \brief Output size hint: the output buffer gets half the size of the input (rounded down).
 */
struct Half {
    std::size_t operator()(std::size_t value)
    {
        return value >> 1;
    }
};
/*!
 * \brief Output size hint: the output buffer size is the input size multiplied by a
 *        caller-supplied factor (the product is truncated towards zero).
 * \remarks The constructor is intentionally implicit; a plain float can be passed where
 *          a Factor is expected.
 */
struct Factor {
    Factor(float sizeFactor)
        : factor(sizeFactor)
    {
    }

    std::size_t operator()(std::size_t value)
    {
        // the multiplication is carried out in single precision, then truncated
        const float scaled = static_cast<float>(value) * factor;
        return static_cast<std::size_t>(scaled);
    }

    float factor;
};
33 
34 template<class OutputSizeHint>
35 class ConversionDescriptor
36 {
37 public:
38  ConversionDescriptor(const char *fromCharset, const char *toCharset) :
39  m_ptr(iconv_open(toCharset, fromCharset)),
40  m_outputSizeHint(OutputSizeHint())
41  {
42  if(m_ptr == reinterpret_cast<iconv_t>(-1)) {
43  throw ConversionException("Unable to allocate descriptor for character set conversion.");
44  }
45  }
46 
47  ConversionDescriptor(const char *fromCharset, const char *toCharset, OutputSizeHint outputSizeHint) :
48  m_ptr(iconv_open(toCharset, fromCharset)),
49  m_outputSizeHint(outputSizeHint)
50  {
51  if(m_ptr == reinterpret_cast<iconv_t>(-1)) {
52  throw ConversionException("Unable to allocate descriptor for character set conversion.");
53  }
54  }
55 
56  ~ConversionDescriptor()
57  {
58  iconv_close(m_ptr);
59  }
60 
61 public:
62  StringData convertString(const char *inputBuffer, size_t inputBufferSize)
63  {
64  // setup input and output buffer
65  size_t inputBytesLeft = inputBufferSize;
66  size_t outputSize = m_outputSizeHint(inputBufferSize);
67  size_t outputBytesLeft = outputSize;
68  char *outputBuffer = reinterpret_cast<char *>(malloc(outputSize));
69  size_t bytesWritten;
70 
71  char *currentOutputOffset = outputBuffer;
72  for(; ; currentOutputOffset = outputBuffer + bytesWritten) {
73  bytesWritten = iconv(m_ptr, const_cast<char **>(&inputBuffer), &inputBytesLeft, &currentOutputOffset, &outputBytesLeft);
74  if(bytesWritten == static_cast<size_t>(-1)) {
75  if(errno == EINVAL) {
76  // ignore incomplete multibyte sequence in the input
77  bytesWritten = currentOutputOffset - outputBuffer;
78  break;
79  } else if(errno == E2BIG) {
80  // output buffer has no more room for next converted character
81  bytesWritten = currentOutputOffset - outputBuffer;
82  outputBytesLeft = (outputSize += m_outputSizeHint(inputBytesLeft)) - bytesWritten;
83  outputBuffer = reinterpret_cast<char *>(realloc(outputBuffer, outputSize));
84  } else /*if(errno == EILSEQ)*/ {
85  // invalid multibyte sequence in the input
86  free(outputBuffer);
87  throw ConversionException("Invalid multibyte sequence in the input.");
88  }
89  } else {
90  // conversion completed without (further) errors
91  break;
92  }
93  }
94  return StringData(std::unique_ptr<char[], StringDataDeleter>(outputBuffer), currentOutputOffset - outputBuffer);
95  }
96 
97 private:
98  iconv_t m_ptr;
99  OutputSizeHint m_outputSizeHint;
100 };
101 
103 
113 StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor)
114 {
115  return ConversionDescriptor<Factor>(fromCharset, toCharset, outputBufferSizeFactor).convertString(inputBuffer, inputBufferSize);
116 }
117 
121 StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
122 {
123  static ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16LE");
124  return descriptor.convertString(inputBuffer, inputBufferSize);
125 }
126 
130 StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
131 {
132  static ConversionDescriptor<Half> descriptor("UTF-16LE", "UTF-8");
133  return descriptor.convertString(inputBuffer, inputBufferSize);
134 }
135 
139 StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
140 {
141  static ConversionDescriptor<Double> descriptor("UTF-8", "UTF-16BE");
142  return descriptor.convertString(inputBuffer, inputBufferSize);
143 }
144 
148 StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
149 {
150  static ConversionDescriptor<Half> descriptor("UTF-16BE", "UTF-8");
151  return descriptor.convertString(inputBuffer, inputBufferSize);
152 }
153 
157 StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
158 {
159  static ConversionDescriptor<Keep> descriptor("ISO-8859-1", "UTF-8");
160  return descriptor.convertString(inputBuffer, inputBufferSize);
161 }
162 
166 StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
167 {
168  static ConversionDescriptor<Factor> descriptor("UTF-8", "ISO-8859-1", 1.1);
169  return descriptor.convertString(inputBuffer, inputBufferSize);
170 }
171 
/*!
 * \brief Cuts \a str off at the first occurrence of \a terminationChar.
 *
 * The termination character itself and everything after it is removed. The string is
 * left untouched if it does not contain \a terminationChar.
 */
void truncateString(std::string &str, char terminationChar)
{
    const auto terminatorPos = str.find(terminationChar);
    if(terminatorPos == std::string::npos) {
        return;
    }
    str.erase(terminatorPos);
}
183 
/*!
 * \brief Converts the specified data size in byte to its equivalent string representation.
 * \param sizeInByte Size to be formatted.
 * \param includeByte Whether the exact number of bytes should be appended in parentheses
 *        when the size is printed using a larger unit (KiB, MiB, GiB or TiB).
 * \returns The size with two decimal places and the largest fitting unit up to TiB, e.g.
 *          "1.50 KiB" or "1.50 KiB (1536 byte)". Sizes below 1024 are printed as plain
 *          number of bytes, e.g. "512 bytes".
 */
std::string dataSizeToString(uint64 sizeInByte, bool includeByte)
{
    const uint64 kibibyte = 1024ull;
    const uint64 mebibyte = 1048576ull;
    const uint64 gibibyte = 1073741824ull;
    const uint64 tebibyte = 1099511627776ull;

    std::stringstream res(std::stringstream::in | std::stringstream::out);
    res.setf(std::ios::fixed, std::ios::floatfield);
    res << std::setprecision(2);
    if(sizeInByte < kibibyte) {
        res << sizeInByte << " bytes";
    } else if(sizeInByte < mebibyte) {
        res << (static_cast<double>(sizeInByte) / 1024.0) << " KiB";
    } else if(sizeInByte < gibibyte) {
        res << (static_cast<double>(sizeInByte) / 1048576.0) << " MiB";
    } else if(sizeInByte < tebibyte) {
        res << (static_cast<double>(sizeInByte) / 1073741824.0) << " GiB";
    } else {
        res << (static_cast<double>(sizeInByte) / 1099511627776.0) << " TiB";
    }
    // use the same boundary as the unit selection above so that exactly 1024 bytes
    // ("1.00 KiB") gets the exact byte count appended as well
    if(includeByte && sizeInByte >= kibibyte) {
        res << ' ' << '(' << sizeInByte << " byte)";
    }
    return res.str();
}
210 
/*!
 * \brief Converts the specified bitrate in kbit/s to its equivalent string representation.
 * \param bitrateInKbitsPerSecond Bitrate in kbit/s.
 * \param useIecBinaryPrefixes If true, the result is expressed in byte/s, KiB/s, MiB/s or
 *        GiB/s; otherwise in bit/s, kbit/s, Mbit/s or Gbit/s.
 * \returns The scaled value printed with a precision of three digits followed by the unit.
 */
std::string bitrateToString(double bitrateInKbitsPerSecond, bool useIecBinaryPrefixes)
{
    // a unit applies to bitrates below its upper bound; the last entry of each table
    // is the fallback for everything else (its upper bound is never evaluated)
    struct Scale {
        double upperBound;
        double factor;
        const char *unit;
    };
    static const Scale byteScales[] = {
        {8.0, 125.0, " byte/s"},
        {8000.0, 0.125, " KiB/s"},
        {8000000.0, 0.000125, " MiB/s"},
        {0.0, 0.000000125, " GiB/s"},
    };
    static const Scale bitScales[] = {
        {1.0, 1000.0, " bit/s"},
        {1000.0, 1.0, " kbit/s"},
        {1000000.0, 0.001, " Mbit/s"},
        {0.0, 0.000001, " Gbit/s"},
    };

    const Scale *const scales = useIecBinaryPrefixes ? byteScales : bitScales;
    std::size_t selected = 0;
    while(selected < 3 && !(bitrateInKbitsPerSecond < scales[selected].upperBound)) {
        ++selected;
    }

    std::stringstream res(std::stringstream::in | std::stringstream::out);
    res << std::setprecision(3);
    res << (bitrateInKbitsPerSecond * scales[selected].factor) << scales[selected].unit;
    return res.str();
}
248 
// Base64 alphabet; the index of a character is the 6-bit value it encodes
const char *const base64Chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// character used to pad the last quantum to four characters
const char base64Pad = '=';

/*!
 * \brief Encodes the specified data to Base64.
 * \param data Bytes to be encoded; may be a null pointer if \a dataSize is zero.
 * \param dataSize Number of bytes to be encoded.
 * \returns The encoded string; its length is always a multiple of four because the last
 *          quantum is padded with '='.
 */
std::string encodeBase64(const byte *data, uint32 dataSize)
{
    std::string encoded;
    const uint32 remainder = dataSize % 3;
    // compute the reservation in std::size_t so it cannot wrap for large inputs
    encoded.reserve((static_cast<std::size_t>(dataSize) / 3 + (remainder > 0 ? 1 : 0)) * 4);

    // encode all complete groups of three bytes as four characters each
    for(const byte *const fullGroupsEnd = data + (dataSize - remainder); data != fullGroupsEnd; data += 3) {
        const uint32 group = (static_cast<uint32>(data[0]) << 16) | (static_cast<uint32>(data[1]) << 8) | static_cast<uint32>(data[2]);
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(group & 0x00000FC0) >> 6]);
        encoded.push_back(base64Chars[(group & 0x0000003F)]);
    }

    // encode the remaining one or two bytes and pad the quantum
    switch(remainder) {
    case 1: {
        const uint32 group = static_cast<uint32>(data[0]) << 16;
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Pad);
        encoded.push_back(base64Pad);
        break;
    }
    case 2: {
        const uint32 group = (static_cast<uint32>(data[0]) << 16) | (static_cast<uint32>(data[1]) << 8);
        encoded.push_back(base64Chars[(group & 0x00FC0000) >> 18]);
        encoded.push_back(base64Chars[(group & 0x0003F000) >> 12]);
        encoded.push_back(base64Chars[(group & 0x00000FC0) >> 6]);
        encoded.push_back(base64Pad);
        break;
    }
    }
    return encoded;
}
291 
296 pair<unique_ptr<byte[]>, uint32> decodeBase64(const char *encodedStr, const uint32 strSize)
297 {
298  if(strSize % 4) {
299  throw ConversionException("invalid size of base64");
300  }
301  uint32 decodedSize = (strSize / 4) * 3;
302  const char *const end = encodedStr + strSize;
303  if(strSize) {
304  if(*(end - 1) == base64Pad) {
305  --decodedSize;
306  }
307  if(*(end - 2) == base64Pad) {
308  --decodedSize;
309  }
310  }
311  auto buffer = make_unique<byte[]>(decodedSize);
312  auto *iter = buffer.get() - 1;
313  while(encodedStr < end) {
314  uint32 temp = 0;
315  for(byte quantumPos = 0; quantumPos < 4; ++quantumPos, ++encodedStr) {
316  temp <<= 6;
317  if(*encodedStr >= 'A' && *encodedStr <= 'Z') {
318  temp |= *encodedStr - 'A';
319  } else if(*encodedStr >= 'a' && *encodedStr <= 'z') {
320  temp |= *encodedStr - 'a' + 26;
321  } else if(*encodedStr >= '0' && *encodedStr <= '9') {
322  temp |= *encodedStr - '0' + 2 * 26;
323  } else if(*encodedStr == '+') {
324  temp |= 2 * 26 + 10;
325  } else if(*encodedStr == '/') {
326  temp |= 2 * 26 + 10 + 1;
327  } else if(*encodedStr == base64Pad) {
328  switch(end - encodedStr) {
329  case 1:
330  *++iter = (temp >> 16) & 0xFF;
331  *++iter = (temp >> 8) & 0xFF;
332  return make_pair(move(buffer), decodedSize);
333  case 2:
334  *++iter = (temp >> 10) & 0xFF;
335  return make_pair(move(buffer), decodedSize);
336  default:
337  throw ConversionException("invalid padding in base64");
338  }
339  } else {
340  throw ConversionException("invalid character in base64");
341  }
342  }
343  *++iter = (temp >> 16) & 0xFF;
344  *++iter = (temp >> 8) & 0xFF;
345  *++iter = (temp ) & 0xFF;
346  }
347  return make_pair(move(buffer), decodedSize);
348 }
349 
350 }
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16LE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (little-endian).
CPP_UTILITIES_EXPORT std::string encodeBase64(const byte *data, uint32 dataSize)
Encodes the specified data to Base64.
The ConversionException class is thrown by the various conversion functions of this library when a conversion error occurs.
CPP_UTILITIES_EXPORT StringData convertUtf16BEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (big-endian) string to UTF-8.
STL namespace.
CPP_UTILITIES_EXPORT void truncateString(std::string &str, char terminationChar='\0')
Truncates all characters after the first occurrence of the specified terminationChar and the termination character itself.
std::uint64_t uint64
unsigned 64-bit integer
Definition: types.h:49
CPP_UTILITIES_EXPORT StringData convertUtf8ToUtf16BE(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to UTF-16 (big-endian).
CPP_UTILITIES_EXPORT StringData convertUtf8ToLatin1(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-8 string to Latin-1.
Contains several functions providing conversions between different data types.
std::uint32_t uint32
unsigned 32-bit integer
Definition: types.h:44
std::pair< std::unique_ptr< char[], StringDataDeleter >, std::size_t > StringData
Type used to return string encoding conversion result.
CPP_UTILITIES_EXPORT StringData convertUtf16LEToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified UTF-16 (little-endian) string to UTF-8.
CPP_UTILITIES_EXPORT StringData convertString(const char *fromCharset, const char *toCharset, const char *inputBuffer, std::size_t inputBufferSize, float outputBufferSizeFactor=1.0f)
Converts the specified string from one character set to another.
std::uint8_t byte
unsigned byte
Definition: types.h:14
CPP_UTILITIES_EXPORT StringData convertLatin1ToUtf8(const char *inputBuffer, std::size_t inputBufferSize)
Converts the specified Latin-1 string to UTF-8.
CPP_UTILITIES_EXPORT std::string dataSizeToString(uint64 sizeInByte, bool includeByte=false)
Converts the specified data size in byte to its equivalent std::string representation.
CPP_UTILITIES_EXPORT std::pair< std::unique_ptr< byte[]>, uint32 > decodeBase64(const char *encodedStr, const uint32 strSize)
Decodes the specified Base64 encoded string.
CPP_UTILITIES_EXPORT std::string bitrateToString(double speedInKbitsPerSecond, bool useByteInsteadOfBits=false)
Converts the specified bitrate in kbit/s to its equivalent std::string representation.