MythTV master
stringutil.cpp
Go to the documentation of this file.
1#include <array>
2#include <QObject>
3#include "stringutil.h"
4
5#if __has_include(<bit>) // C++20
6#include <bit>
7#endif
8
9#include <climits> // for CHAR_BIT
10
11#include "ternarycompare.h"
12
13#if defined(__cpp_lib_bitops) && __cpp_lib_bitops >= 201907L
14using std::countl_one;
15#else
16// 8 bit LUT based count leading ones
/**
 * @brief Count the consecutive 1 bits of an octet, starting at the most
 *        significant bit.
 *
 * Table-driven stand-in used when std::countl_one is not available.
 *
 * @param x  Value to inspect; only octet values (0..255) are in the table.
 * @return Number of leading 1 bits, from 0 (0x00..0x7F) up to 8 (0xFF).
 */
static int countl_one(unsigned char x)
{
#if CHAR_BIT != 8
    // The table has 256 entries, so 256 itself is already out of range;
    // every value above 0xFF has to be caught here.
    if (x > 255)
        return 8; // works for our purposes even if not correct
#endif
    static constexpr std::array<uint8_t,256> leading_ones =
    {
        // 0x00..0x7F: top bit clear
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        // 0x80..0xBF: 10xxxxxx
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        // 0xC0..0xDF: 110xxxxx
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        // 0xE0..0xEF: 1110xxxx
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        // 0xF0..0xFF: 11110xxx up to 11111111
        4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 7, 8,
    };
    return leading_ones[x];
}
45#endif // C++20 feature test macro
46
47bool StringUtil::isValidUTF8(const QByteArray& data)
48{
49 const auto* p = (const unsigned char*)data.data();
50 const unsigned char* const end = p + data.size();
51 while (p < end)
52 {
53 int code_point_length = countl_one(*p);
54
55 switch (code_point_length)
56 {
57 case 0: p++; continue; // ASCII
58 case 1: return false; // invalid, continuation byte
59 case 2:
60 case 3:
61 case 4: break;
62 default: return false;
63 /* the variable length code is limited to 4 bytes by RFC3629 §3 to match
64 the range of UTF-16, i.e. the maximum code point is U+10FFFF
65 */
66 }
67
68 if (end < code_point_length + p)
69 {
70 return false; // truncated codepoint at end
71 }
72
73 // verify each starting byte is followed by the correct number of continuation bytes
74 switch (code_point_length)
75 {
76 case 4:
77 if (countl_one(p[3]) != 1)
78 {
79 return false;
80 }
81 [[fallthrough]];
82 case 3:
83 if (countl_one(p[2]) != 1)
84 {
85 return false;
86 }
87 [[fallthrough]];
88 case 2:
89 if (countl_one(p[1]) != 1)
90 {
91 return false;
92 }
93 break;
94 default: break; // should never be reached
95 }
96
97 // all continuation bytes are in the range 0x80 to 0xBF
98 switch (code_point_length)
99 {
100 case 2:
101 // overlong encoding of single byte character
102 if (*p == 0xC0 || *p == 0xC1)
103 {
104 return false;
105 }
106 break;
107 case 3:
108 // U+D800–U+DFFF are invalid; UTF-16 surrogate halves
109 // 0xED'A0'80 to 0xED'BF'BF
110 if (p[0] == 0xED && p[1] >= 0xA0)
111 {
112 return false;
113 }
114 // overlong encoding of 2 byte character
115 if (p[0] == 0xE0 && p[1] < 0xA0)
116 {
117 return false;
118 }
119 break;
120 case 4:
121 // code points > U+10FFFF are invalid
122 // U+10FFFF in UTF-8 is 0xF4'8F'BF'BF
123 // U+110000 in UTF-8 is 0xF4'90'80'80
124 if (*p > 0xF4 || (p[0] == 0xF4 && p[1] >= 0x90))
125 {
126 return false;
127 }
128 // overlong encoding of 3 byte character
129 if (p[0] == 0xF0 && p[1] < 0x90)
130 {
131 return false;
132 }
133 break;
134 default: break; // should never be reached
135 }
136
137 p += code_point_length;
138 }
139
140 return true;
141}
142
160int StringUtil::naturalCompare(const QString &_a, const QString &_b, Qt::CaseSensitivity caseSensitivity)
161{
162 QString a;
163 QString b;
164
165 if (caseSensitivity == Qt::CaseSensitive)
166 {
167 a = _a;
168 b = _b;
169 }
170 else
171 {
172 a = _a.toLower();
173 b = _b.toLower();
174 }
175
176 const QChar* currA = a.unicode(); // iterator over a
177 const QChar* currB = b.unicode(); // iterator over b
178
179 if (currA == currB)
180 {
181 return 0;
182 }
183
184 while (!currA->isNull() && !currB->isNull())
185 {
186 const QChar* begSeqA = currA; // beginning of a new character sequence of a
187 const QChar* begSeqB = currB;
188
189 if (currA->unicode() == QChar::ObjectReplacementCharacter)
190 {
191 return 1;
192 }
193
194 if (currB->unicode() == QChar::ObjectReplacementCharacter)
195 {
196 return -1;
197 }
198
199 if (currA->unicode() == QChar::ReplacementCharacter)
200 {
201 return 1;
202 }
203
204 if (currB->unicode() == QChar::ReplacementCharacter)
205 {
206 return -1;
207 }
208
209 // find sequence of characters ending at the first non-character
210 while (!currA->isNull() && !currA->isDigit() && !currA->isPunct() &&
211 !currA->isSpace())
212 {
213 ++currA;
214 }
215
216 while (!currB->isNull() && !currB->isDigit() && !currB->isPunct() &&
217 !currB->isSpace())
218 {
219 ++currB;
220 }
221
222 // compare these sequences
223 const QString& subA(a.mid(begSeqA - a.unicode(), currA - begSeqA));
224 const QString& subB(b.mid(begSeqB - b.unicode(), currB - begSeqB));
225 int cmp = QString::localeAwareCompare(subA, subB);
226
227 if (cmp != 0)
228 {
229 return cmp < 0 ? -1 : +1;
230 }
231
232 if (currA->isNull() || currB->isNull())
233 {
234 break;
235 }
236
237 // find sequence of characters ending at the first non-character
238 while ((currA->isPunct() || currA->isSpace()) &&
239 (currB->isPunct() || currB->isSpace()))
240 {
241 cmp = ternary_compare(*currA, *currB);
242 if (cmp != 0)
243 {
244 return cmp;
245 }
246 ++currA;
247 ++currB;
248 if (currA->isNull() || currB->isNull())
249 {
250 break;
251 }
252 }
253
254 // now some digits follow...
255 if ((*currA == QLatin1Char('0')) || (*currB == QLatin1Char('0')))
256 {
257 // one digit-sequence starts with 0 -> assume we are in a fraction part
258 // do left aligned comparison (numbers are considered left aligned)
259 while (true)
260 {
261 if (!currA->isDigit() && !currB->isDigit())
262 {
263 break;
264 }
265 if (!currA->isDigit())
266 {
267 return +1;
268 }
269 if (!currB->isDigit())
270 {
271 return -1;
272 }
273 if (*currA < *currB)
274 {
275 return -1;
276 }
277 if (*currA > *currB)
278 {
279 return + 1;
280 }
281 ++currA;
282 ++currB;
283 }
284 }
285 else
286 {
287 // No digit-sequence starts with 0 -> assume we are looking at some integer
288 // do right aligned comparison.
289 //
290 // The longest run of digits wins. That aside, the greatest
291 // value wins, but we can't know that it will until we've scanned
292 // both numbers to know that they have the same magnitude.
293
294 bool isFirstRun = true;
295 int weight = 0;
296
297 while (true)
298 {
299 if (!currA->isDigit() && !currB->isDigit())
300 {
301 if (weight != 0)
302 {
303 return weight;
304 }
305 break;
306 }
307 if (!currA->isDigit())
308 {
309 if (isFirstRun)
310 {
311 return *currA < *currB ? -1 : +1;
312 }
313 return -1;
314 }
315 if (!currB->isDigit())
316 {
317 if (isFirstRun)
318 {
319 return *currA < *currB ? -1 : +1;
320 }
321 return +1;
322 }
323 if ((*currA < *currB) && (weight == 0))
324 {
325 weight = -1;
326 }
327 else if ((*currA > *currB) && (weight == 0))
328 {
329 weight = + 1;
330 }
331 ++currA;
332 ++currB;
333 isFirstRun = false;
334 }
335 }
336 }
337
338 if (currA->isNull() && currB->isNull())
339 {
340 return 0;
341 }
342
343 return currA->isNull() ? -1 : + 1;
344}
345
// Unit sizes expressed in kilobytes (they are compared against sizes given
// in KB by formatKBytes), i.e. the number of KB in one TB, GB and MB.
static constexpr int64_t kOneTerabyte { int64_t{1} << 30 };
static constexpr int64_t kOneGigabyte { int64_t{1} << 20 };
static constexpr int64_t kOneMegabyte { int64_t{1} << 10 };
349
357QString StringUtil::formatKBytes(int64_t sizeKB, int precision)
358{
359 if (sizeKB > kOneTerabyte)
360 {
361 double sizeTB = sizeKB/(1.0 * kOneTerabyte);
362 return QObject::tr("%1 TB").arg(sizeTB, 0, 'f', (sizeTB>10)?0:precision);
363 }
364 if (sizeKB > kOneGigabyte)
365 {
366 double sizeGB = sizeKB/(1.0 * kOneGigabyte);
367 return QObject::tr("%1 GB").arg(sizeGB, 0, 'f', (sizeGB>10)?0:precision);
368 }
369 if (sizeKB > kOneMegabyte)
370 {
371 double sizeMB = sizeKB/(1.0 * kOneMegabyte);
372 return QObject::tr("%1 MB").arg(sizeMB, 0, 'f', (sizeMB>10)?0:precision);
373 }
374 // Kilobytes
375 return QObject::tr("%1 KB").arg(sizeKB);
376}
377
378QString StringUtil::formatBytes(int64_t sizeB, int precision)
379{
380 if (sizeB > 1024)
381 return formatKBytes(sizeB / 1024, precision);
382 return QString("%1 B").arg(sizeB);
383}
int ternary_compare(const QDateTime &a, const QDateTime &b)
Balanced ternary (three-way) comparison. This is equivalent to C++20's operator <=>.
Definition: mythdate.h:76
MBASE_PUBLIC bool isValidUTF8(const QByteArray &data)
Definition: stringutil.cpp:47
MBASE_PUBLIC int naturalCompare(const QString &_a, const QString &_b, Qt::CaseSensitivity caseSensitivity=Qt::CaseSensitive)
This method chops the inputs a and b into pieces of digits and non-digits (a1.05 becomes a | 1 | . | 05) and compares the corresponding pieces of a and b with each other.
Definition: stringutil.cpp:160
MBASE_PUBLIC QString formatBytes(int64_t sizeB, int prec=1)
Definition: stringutil.cpp:378
MBASE_PUBLIC QString formatKBytes(int64_t sizeKB, int prec=1)
Definition: stringutil.cpp:357
static constexpr int64_t kOneTerabyte
Definition: stringutil.cpp:346
static int countl_one(unsigned char x)
Definition: stringutil.cpp:17
static constexpr int64_t kOneGigabyte
Definition: stringutil.cpp:347
static constexpr int64_t kOneMegabyte
Definition: stringutil.cpp:348