MythTV  master
stringutil.cpp
Go to the documentation of this file.
1 #include <array>
2 #include <QObject>
3 #include "stringutil.h"
4 
5 #if __has_include(<bit>) // C++20
6 #include <bit>
7 #endif
8 
9 #include <climits> // for CHAR_BIT
10 
11 #include "ternarycompare.h"
12 
13 #if defined(__cpp_lib_bitops) && __cpp_lib_bitops >= 201907L
14 using std::countl_one;
15 #else
16 // 8 bit LUT based count leading ones
/**
 * @brief Count the consecutive one bits of @p x, starting at the most
 *        significant bit of an 8-bit value.
 *
 * Fallback used when the C++20 std::countl_one is not available.  The result
 * is read from a 256-entry lookup table indexed by the byte value.
 *
 * @param x byte to inspect
 * @return number of leading one bits, 0 to 8
 */
static int countl_one(unsigned char x)
{
#if CHAR_BIT != 8
    // The table only covers 0..255.  Anything larger cannot be a UTF-8 code
    // unit; reporting 8 makes the caller reject it, which works for our
    // purposes even if it is not the true bit count.
    if (x > 0xFF)
        return 8;
#endif
    static constexpr std::array<uint8_t,256> leading_ones =
    {
        // 0x00 - 0x7F: 0xxxxxxx
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        // 0x80 - 0xBF: 10xxxxxx
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        // 0xC0 - 0xDF: 110xxxxx
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0 - 0xEF: 1110xxxx
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0 - 0xFF: 11110xxx up to 11111111
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8,
    };
    return leading_ones[x];
}
45 #endif // C++20 feature test macro
46 
47 bool StringUtil::isValidUTF8(const QByteArray& data)
48 {
49  const auto* p = (const unsigned char*)data.data();
50  const unsigned char* const end = p + data.size();
51  while (p < end)
52  {
53  int code_point_length = countl_one(*p);
54 
55  switch (code_point_length)
56  {
57  case 0: p++; continue; // ASCII
58  case 1: return false; // invalid, continuation byte
59  case 2:
60  case 3:
61  case 4: break;
62  default: return false;
63  /* the variable length code is limited to 4 bytes by RFC3629 §3 to match
64  the range of UTF-16, i.e. the maximum code point is U+10FFFF
65  */
66  }
67 
68  if (end < code_point_length + p)
69  {
70  return false; // truncated codepoint at end
71  }
72 
73  // verify each starting byte is followed by the correct number of continuation bytes
74  switch (code_point_length)
75  {
76  case 4:
77  if (countl_one(p[3]) != 1)
78  {
79  return false;
80  }
81  [[fallthrough]];
82  case 3:
83  if (countl_one(p[2]) != 1)
84  {
85  return false;
86  }
87  [[fallthrough]];
88  case 2:
89  if (countl_one(p[1]) != 1)
90  {
91  return false;
92  }
93  break;
94  default: break; // should never be reached
95  }
96 
97  // all continuation bytes are in the range 0x80 to 0xBF
98  switch (code_point_length)
99  {
100  case 2:
101  // overlong encoding of single byte character
102  if (*p == 0xC0 || *p == 0xC1)
103  {
104  return false;
105  }
106  break;
107  case 3:
108  // U+D800–U+DFFF are invalid; UTF-16 surrogate halves
109  // 0xED'A0'80 to 0xED'BF'BF
110  if (p[0] == 0xED && p[1] >= 0xA0)
111  {
112  return false;
113  }
114  // overlong encoding of 2 byte character
115  if (p[0] == 0xE0 && p[1] < 0xA0)
116  {
117  return false;
118  }
119  break;
120  case 4:
121  // code points > U+10FFFF are invalid
122  // U+10FFFF in UTF-8 is 0xF4'8F'BF'BF
123  // U+110000 in UTF-8 is 0xF4'90'80'80
124  if (*p > 0xF4 || (p[0] == 0xF4 && p[1] >= 0x90))
125  {
126  return false;
127  }
128  // overlong encoding of 3 byte character
129  if (p[0] == 0xF0 && p[1] < 0x90)
130  {
131  return false;
132  }
133  break;
134  default: break; // should never be reached
135  }
136 
137  p += code_point_length;
138  }
139 
140  return true;
141 }
142 
160 int StringUtil::naturalCompare(const QString &_a, const QString &_b, Qt::CaseSensitivity caseSensitivity)
161 {
162  QString a;
163  QString b;
164 
165  if (caseSensitivity == Qt::CaseSensitive)
166  {
167  a = _a;
168  b = _b;
169  }
170  else
171  {
172  a = _a.toLower();
173  b = _b.toLower();
174  }
175 
176  const QChar* currA = a.unicode(); // iterator over a
177  const QChar* currB = b.unicode(); // iterator over b
178 
179  if (currA == currB)
180  {
181  return 0;
182  }
183 
184  while (!currA->isNull() && !currB->isNull())
185  {
186  const QChar* begSeqA = currA; // beginning of a new character sequence of a
187  const QChar* begSeqB = currB;
188 
189  if (currA->unicode() == QChar::ObjectReplacementCharacter)
190  {
191  return 1;
192  }
193 
194  if (currB->unicode() == QChar::ObjectReplacementCharacter)
195  {
196  return -1;
197  }
198 
199  if (currA->unicode() == QChar::ReplacementCharacter)
200  {
201  return 1;
202  }
203 
204  if (currB->unicode() == QChar::ReplacementCharacter)
205  {
206  return -1;
207  }
208 
209  // find sequence of characters ending at the first non-character
210  while (!currA->isNull() && !currA->isDigit() && !currA->isPunct() &&
211  !currA->isSpace())
212  {
213  ++currA;
214  }
215 
216  while (!currB->isNull() && !currB->isDigit() && !currB->isPunct() &&
217  !currB->isSpace())
218  {
219  ++currB;
220  }
221 
222  // compare these sequences
223  const QString& subA(a.mid(begSeqA - a.unicode(), currA - begSeqA));
224  const QString& subB(b.mid(begSeqB - b.unicode(), currB - begSeqB));
225  int cmp = QString::localeAwareCompare(subA, subB);
226 
227  if (cmp != 0)
228  {
229  return cmp < 0 ? -1 : +1;
230  }
231 
232  if (currA->isNull() || currB->isNull())
233  {
234  break;
235  }
236 
237  // find sequence of characters ending at the first non-character
238  while ((currA->isPunct() || currA->isSpace()) &&
239  (currB->isPunct() || currB->isSpace()))
240  {
241  cmp = ternary_compare(*currA, *currB);
242  if (cmp != 0)
243  {
244  return cmp;
245  }
246  ++currA;
247  ++currB;
248  if (currA->isNull() || currB->isNull())
249  {
250  break;
251  }
252  }
253 
254  // now some digits follow...
255  if ((*currA == QLatin1Char('0')) || (*currB == QLatin1Char('0')))
256  {
257  // one digit-sequence starts with 0 -> assume we are in a fraction part
258  // do left aligned comparison (numbers are considered left aligned)
259  while (true)
260  {
261  if (!currA->isDigit() && !currB->isDigit())
262  {
263  break;
264  }
265  if (!currA->isDigit())
266  {
267  return +1;
268  }
269  if (!currB->isDigit())
270  {
271  return -1;
272  }
273  if (*currA < *currB)
274  {
275  return -1;
276  }
277  if (*currA > *currB)
278  {
279  return + 1;
280  }
281  ++currA;
282  ++currB;
283  }
284  }
285  else
286  {
287  // No digit-sequence starts with 0 -> assume we are looking at some integer
288  // do right aligned comparison.
289  //
290  // The longest run of digits wins. That aside, the greatest
291  // value wins, but we can't know that it will until we've scanned
292  // both numbers to know that they have the same magnitude.
293 
294  bool isFirstRun = true;
295  int weight = 0;
296 
297  while (true)
298  {
299  if (!currA->isDigit() && !currB->isDigit())
300  {
301  if (weight != 0)
302  {
303  return weight;
304  }
305  break;
306  }
307  if (!currA->isDigit())
308  {
309  if (isFirstRun)
310  {
311  return *currA < *currB ? -1 : +1;
312  }
313  return -1;
314  }
315  if (!currB->isDigit())
316  {
317  if (isFirstRun)
318  {
319  return *currA < *currB ? -1 : +1;
320  }
321  return +1;
322  }
323  if ((*currA < *currB) && (weight == 0))
324  {
325  weight = -1;
326  }
327  else if ((*currA > *currB) && (weight == 0))
328  {
329  weight = + 1;
330  }
331  ++currA;
332  ++currB;
333  isFirstRun = false;
334  }
335  }
336  }
337 
338  if (currA->isNull() && currB->isNull())
339  {
340  return 0;
341  }
342 
343  return currA->isNull() ? -1 : + 1;
344 }
345 
// Unit sizes used by StringUtil::formatKBytes().  That function receives its
// size in KB, so each constant is the number of KB in the named unit.
static constexpr int64_t kOneMegabyte { 1024 };
static constexpr int64_t kOneGigabyte { 1024 * kOneMegabyte };
static constexpr int64_t kOneTerabyte { 1024 * kOneGigabyte };
349 
357 QString StringUtil::formatKBytes(int64_t sizeKB, int precision)
358 {
359  if (sizeKB > kOneTerabyte)
360  {
361  double sizeTB = sizeKB/(1.0 * kOneTerabyte);
362  return QObject::tr("%1 TB").arg(sizeTB, 0, 'f', (sizeTB>10)?0:precision);
363  }
364  if (sizeKB > kOneGigabyte)
365  {
366  double sizeGB = sizeKB/(1.0 * kOneGigabyte);
367  return QObject::tr("%1 GB").arg(sizeGB, 0, 'f', (sizeGB>10)?0:precision);
368  }
369  if (sizeKB > kOneMegabyte)
370  {
371  double sizeMB = sizeKB/(1.0 * kOneMegabyte);
372  return QObject::tr("%1 MB").arg(sizeMB, 0, 'f', (sizeMB>10)?0:precision);
373  }
374  // Kilobytes
375  return QObject::tr("%1 KB").arg(sizeKB);
376 }
377 
378 QString StringUtil::formatBytes(int64_t sizeB, int precision)
379 {
380  if (sizeB > 1024)
381  return formatKBytes(sizeB / 1024, precision);
382  return QString("%1 B").arg(sizeB);
383 }
ternarycompare.h
countl_one
static int countl_one(unsigned char x)
Definition: stringutil.cpp:17
kOneGigabyte
static constexpr int64_t kOneGigabyte
Definition: stringutil.cpp:347
hardwareprofile.config.p
p
Definition: config.py:33
StringUtil::naturalCompare
MBASE_PUBLIC int naturalCompare(const QString &_a, const QString &_b, Qt::CaseSensitivity caseSensitivity=Qt::CaseSensitive)
This method chops the input a and b into pieces of digits and non-digits (a1.05 becomes a | 1 | . | 05) and compares these pieces of a and b to each other.
Definition: stringutil.cpp:160
stringutil.h
ternary_compare
int ternary_compare(const QDateTime &a, const QDateTime &b)
Balanced ternary (three-way) comparison. This is equivalent to C++20's operator <=>.
Definition: mythdate.h:76
StringUtil::formatKBytes
MBASE_PUBLIC QString formatKBytes(int64_t sizeKB, int prec=1)
Definition: stringutil.cpp:357
StringUtil::isValidUTF8
MBASE_PUBLIC bool isValidUTF8(const QByteArray &data)
Definition: stringutil.cpp:47
kOneMegabyte
static constexpr int64_t kOneMegabyte
Definition: stringutil.cpp:348
kOneTerabyte
static constexpr int64_t kOneTerabyte
Definition: stringutil.cpp:346
StringUtil::formatBytes
MBASE_PUBLIC QString formatBytes(int64_t sizeB, int prec=1)
Definition: stringutil.cpp:378