Ticket #10922: 0001-OpenGL-Optimize-writing-packed-images-to-pixel-buffe.patch
File 0001-OpenGL-Optimize-writing-packed-images-to-pixel-buffe.patch, 15.9 KB (added 13 years ago) |
---|
-
mythtv/libs/libmythtv/util-opengl.cpp
From 0dbd9fd29d12f5d49e2c25f6c60bcb2d7bc27b48 Mon Sep 17 00:00:00 2001
From: Lawrence Rust <lvr@softsystem.co.uk>
Date: Wed, 18 Jul 2012 10:26:01 +0200
Subject: [PATCH] OpenGL: Optimize writing packed images to pixel buffer objects (PBO)

pack_yv12interlaced() packs pixels and writes them to an OpenGL pixel buffer object. This buffer may reside on the graphics card so writes may be slower than to RAM. Combining writes to the buffer can improve performance by 200-300%.

This patch also provides optimized non-mmx code and also tightens const correctness.

Signed-off-by: Lawrence Rust <lvr@softsystem.co.uk>
---
 mythtv/libs/libmythtv/util-opengl.cpp | 246 ++++++++++++++++++---------------
 mythtv/libs/libmythtv/util-opengl.h | 4 +-
 2 files changed, 139 insertions(+), 111 deletions(-)

diff --git a/mythtv/libs/libmythtv/util-opengl.cpp b/mythtv/libs/libmythtv/util-opengl.cpp
index 5fca85a..1e5e547 100644
a b 1 1 // -*- Mode: c++ -*- 2 2 3 #include <string.h> 3 4 #include <stdint.h> 4 5 #include <QSize> 5 6 #include "compat.h" … … extern "C" { 10 11 #include "ffmpeg-mmx.h" 11 12 } 12 13 13 static mmx_t mmx_1s = {0xffffffffffffffffLL};14 static mmx_t const mmx_1s = {0xffffffffffffffffLL}; 14 15 15 static inline void mmx_pack_alpha1s_high( uint8_t *y1,uint8_t *y2)16 static inline void mmx_pack_alpha1s_high(const uint8_t *y1, const uint8_t *y2) 16 17 { 17 18 movq_m2r (mmx_1s, mm4); 18 19 punpckhbw_m2r (*y1, mm4); … … static inline void mmx_pack_alpha1s_high(uint8_t *y1, uint8_t *y2) 20 21 punpckhbw_m2r (*y2, mm7); 21 22 } 22 23 23 static inline void mmx_pack_alpha1s_low( uint8_t *y1,uint8_t *y2)24 static inline void mmx_pack_alpha1s_low(const uint8_t *y1, const uint8_t *y2) 24 25 { 25 26 movq_m2r (mmx_1s, mm4); 26 27 punpcklbw_m2r (*y1, mm4); … … static inline void mmx_pack_end(uint8_t *dest1, uint8_t *dest2) 70 71 movq_r2m (mm3, *(dest2 + 24)); 71 72 } 72 73 73 static inline void mmx_pack_easy(uint8_t *dest, uint8_t *y)74 static inline void mmx_pack_easy(uint8_t *dest, const uint8_t *y) 74 75 { 75 76 movq_m2r (mmx_1s, mm4); 76 77 punpcklbw_m2r (*y, mm4); … … static inline void mmx_pack_easy(uint8_t *dest, uint8_t *y) 99 100 movq_r2m (mm3, *(dest + 24)); 100 101 } 101 102 102 static mmx_t mmx_0s = {0x0000000000000000LL};103 static mmx_t round = {0x0002000200020002LL};103 static mmx_t const mmx_0s = {0x0000000000000000LL}; 104 static mmx_t const round = {0x0002000200020002LL}; 104 105 105 static inline void mmx_interp_start( uint8_t *left,uint8_t *right)106 static inline void mmx_interp_start(const uint8_t *left, const uint8_t *right) 106 107 { 107 108 movd_m2r (*left, mm5); 108 109 punpcklbw_m2r (mmx_0s, mm5); … … static inline void mmx_interp_endv(void) 133 134 paddb_r2r (mm4, mm3); 134 135 } 135 136 136 static inline void mmx_pack_chroma( uint8_t *u,uint8_t *v)137 static inline void mmx_pack_chroma(const uint8_t *u, const uint8_t *v) 137 138 { 138 139 movd_m2r (*u, 
mm2); 139 140 movd_m2r (*v, mm3); … … static inline void mmx_pack_chroma(uint8_t *u, uint8_t *v) 142 143 } 143 144 #endif // MMX 144 145 145 static inline void c_interp(u int8_t *dest, uint8_t *a, uint8_t *b,146 u int8_t *c, uint8_t *d)146 static inline void c_interp(unsigned dest[4], unsigned a, unsigned b, 147 unsigned c, unsigned d) 147 148 { 148 unsigned int tmp = (unsigned int) *a;149 unsigned int tmp = a; 149 150 tmp *= 3; 150 151 tmp += 2; 151 tmp += (unsigned int) *c;152 dest[0] = (uint8_t) (tmp >> 2);152 tmp += c; 153 dest[0] = tmp >> 2; 153 154 154 tmp = (unsigned int) *b;155 tmp = b; 155 156 tmp *= 3; 156 157 tmp += 2; 157 tmp += (unsigned int) *d;158 dest[1] = (uint8_t) (tmp >> 2);158 tmp += d; 159 dest[1] = tmp >> 2; 159 160 160 tmp = (unsigned int) *c;161 tmp = c; 161 162 tmp *= 3; 162 163 tmp += 2; 163 tmp += (unsigned int) *a;164 dest[2] = (uint8_t) (tmp >> 2);164 tmp += a; 165 dest[2] = tmp >> 2; 165 166 166 tmp = (unsigned int) *d;167 tmp = d; 167 168 tmp *= 3; 168 169 tmp += 2; 169 tmp += (unsigned int) *b; 170 dest[3] = (uint8_t) (tmp >> 2); 170 tmp += b; 171 dest[3] = tmp >> 2; 172 } 173 174 #ifdef __GNUC__ 175 #define MYTH_PACKED __attribute__((packed)) 176 #else 177 #define MYTH_PACKED 178 #endif 179 static inline unsigned c_pack2(uint8_t dest[], 180 unsigned v, unsigned u, unsigned y1, unsigned y2) 181 { 182 struct pack 183 { 184 uint8_t v1, a1, u1, y1; 185 uint8_t v2, a2, u2, y2; 186 } MYTH_PACKED tmp = {v, 0xff, u, y1, v, 0xff, u, y2}; 187 188 *reinterpret_cast< struct pack * >(dest) = tmp; 189 190 return sizeof tmp; 171 191 } 172 192 173 193 void pack_yv12progressive(const unsigned char *source, … … void pack_yv12progressive(const unsigned char *source, 271 291 } 272 292 273 293 void pack_yv12interlaced(const unsigned char *source, 274 constunsigned char *dest,275 const int *offsets,276 const int *pitches,294 unsigned char *dest, 295 const int offsets[3], 296 const int pitches[3], 277 297 const QSize &size) 278 298 { 279 int width = 
size.width();299 const int width = size.width(); 280 300 int height = size.height(); 281 301 282 302 if (height % 4 || width % 2) 283 303 return; 284 304 285 uint bgra_width = width << 2; 286 uint dwrap = (bgra_width << 2) - bgra_width; 287 uint chroma_width = width >> 1; 288 uint ywrap = (pitches[0] << 1) - width; 289 uint uwrap = (pitches[1] << 1) - chroma_width; 290 uint vwrap = (pitches[2] << 1) - chroma_width; 291 292 uint8_t *ypt_1 = (uint8_t *)source + offsets[0]; 293 uint8_t *ypt_2 = ypt_1 + pitches[0]; 294 uint8_t *ypt_3 = ypt_1 + (pitches[0] * (height - 2)); 295 uint8_t *ypt_4 = ypt_3 + pitches[0]; 296 297 uint8_t *u1 = (uint8_t *)source + offsets[1]; 298 uint8_t *v1 = (uint8_t *)source + offsets[2]; 299 uint8_t *u2 = u1 + pitches[1]; uint8_t *v2 = v1 + pitches[2]; 300 uint8_t *u3 = u1 + (pitches[1] * ((height - 4) >> 1)); 301 uint8_t *v3 = v1 + (pitches[2] * ((height - 4) >> 1)); 302 uint8_t *u4 = u3 + pitches[1]; uint8_t *v4 = v3 + pitches[2]; 303 304 uint8_t *dst_1 = (uint8_t *) dest; 305 uint8_t *dst_2 = dst_1 + bgra_width; 306 uint8_t *dst_3 = dst_1 + (bgra_width * (height - 2)); 307 uint8_t *dst_4 = dst_3 + bgra_width; 305 const uint bgra_width = width << 2; 306 const uint chroma_width = width >> 1; 307 uint ywrap = (pitches[0] << 1) - width; 308 const uint uwrap = (pitches[1] << 1) - chroma_width; 309 const uint vwrap = (pitches[2] << 1) - chroma_width; 310 311 const uint8_t *ypt_1 = static_cast< const uint8_t * >(source) + offsets[0]; 312 const uint8_t *ypt_2 = ypt_1 + pitches[0]; 313 const uint8_t *ypt_3 = ypt_1 + (pitches[0] * (height - 2)); 314 const uint8_t *ypt_4 = ypt_3 + pitches[0]; 315 316 const uint8_t *u1 = static_cast< const uint8_t * >(source) + offsets[1]; 317 const uint8_t *v1 = static_cast< const uint8_t * >(source) + offsets[2]; 318 const uint8_t *u2 = u1 + pitches[1]; 319 const uint8_t *v2 = v1 + pitches[2]; 320 const uint8_t *u3 = u1 + (pitches[1] * ((height - 4) >> 1)); 321 const uint8_t *v3 = v1 + (pitches[2] * ((height - 4) >> 
1)); 322 const uint8_t *u4 = u3 + pitches[1]; 323 const uint8_t *v4 = v3 + pitches[2]; 324 325 uint8_t *dst_1 = static_cast< uint8_t * >(dest); 326 uint8_t *dst_3 = dst_1 + (bgra_width * (height - 2)); 327 328 // Allocate a 4 line packed data buffer 329 // NB dest is probably graphics memory so access may be slow 330 const uint bufsize = bgra_width * 4; 331 uint8_t *buf = new uint8_t[bufsize]; 332 333 uint8_t *b1 = buf; 334 uint8_t *b2 = b1 + bgra_width; 335 const uint len = bgra_width * 2; // 2 line buffer size 336 uint8_t *b3 = buf + len; 337 uint8_t *b4 = b3 + bgra_width; 308 338 309 339 #ifdef MMX 310 340 … … void pack_yv12interlaced(const unsigned char *source, 314 344 for (int col = 0; col < width; col += 8) 315 345 { 316 346 mmx_pack_chroma(u1, v1); 317 mmx_pack_easy( dst_1, ypt_1);347 mmx_pack_easy(b1, ypt_1); 318 348 mmx_pack_chroma(u2, v2); 319 mmx_pack_easy( dst_2, ypt_2);349 mmx_pack_easy(b2, ypt_2); 320 350 mmx_pack_chroma(u3, v3); 321 mmx_pack_easy( dst_3, ypt_3);351 mmx_pack_easy(b3, ypt_3); 322 352 mmx_pack_chroma(u4, v4); 323 mmx_pack_easy( dst_4, ypt_4);353 mmx_pack_easy(b4, ypt_4); 324 354 325 dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;355 b1 += 32; b2 += 32; b3 += 32; b4 += 32; 326 356 ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8; 327 357 u1 += 4; v1 += 4; u2 += 4; v2 += 4; 328 358 u3 += 4; v3 += 4; u4 += 4; v4 += 4; 329 359 } 330 360 331 ypt_1 += ywrap; ypt_2 += ywrap; 332 dst_1 += bgra_width; dst_2 += bgra_width; 361 memcpy(dst_1, buf, len); 362 dst_1 += len; 363 memcpy(dst_3, buf + len, len); 333 364 365 ypt_1 += ywrap; ypt_2 += ywrap; 334 366 ypt_3 = ypt_2 + pitches[0]; 335 367 ypt_4 = ypt_3 + pitches[0]; 336 dst_3 = dst_2 + bgra_width;337 dst_4 = dst_3 + bgra_width;338 368 339 369 ywrap = (pitches[0] << 2) - width; 340 370 … … void pack_yv12interlaced(const unsigned char *source, 349 379 // pack main body 350 380 for (int row = 0 ; row < height; row += 4) 351 381 { 382 b1 = buf; 383 b2 = b1 + bgra_width; 384 b3 = b2 + bgra_width; 
385 b4 = b3 + bgra_width; 386 352 387 for (int col = 0; col < width; col += 8) 353 388 { 354 389 mmx_interp_start(u1, u3); mmx_interp_endu(); 355 390 mmx_interp_start(v1, v3); mmx_interp_endv(); 356 mmx_pack_easy( dst_1, ypt_1);391 mmx_pack_easy(b1, ypt_1); 357 392 358 393 mmx_interp_start(u2, u4); mmx_interp_endu(); 359 394 mmx_interp_start(v2, v4); mmx_interp_endv(); 360 mmx_pack_easy( dst_2, ypt_2);395 mmx_pack_easy(b2, ypt_2); 361 396 362 397 mmx_interp_start(u3, u1); mmx_interp_endu(); 363 398 mmx_interp_start(v3, v1); mmx_interp_endv(); 364 mmx_pack_easy( dst_3, ypt_3);399 mmx_pack_easy(b3, ypt_3); 365 400 366 401 mmx_interp_start(u4, u2); mmx_interp_endu(); 367 402 mmx_interp_start(v4, v2); mmx_interp_endv(); 368 mmx_pack_easy( dst_4, ypt_4);403 mmx_pack_easy(b4, ypt_4); 369 404 370 dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;405 b1 += 32; b2 += 32; b3 += 32; b4 += 32; 371 406 ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8; 372 407 u1 += 4; u2 += 4; u3 += 4; u4 += 4; 373 408 v1 += 4; v2 += 4; v3 += 4; v4 += 4; 374 409 } 375 410 411 // Copy the packed data to dest 412 memcpy(dst_1, buf, bufsize); 413 dst_1 += bufsize; 414 376 415 ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap; 377 dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;378 416 u1 += uwrap; v1 += vwrap; u2 += uwrap; v2 += vwrap; 379 417 u3 += uwrap; v3 += vwrap; u4 += uwrap;v4 += vwrap; 380 418 } 381 419 382 420 emms(); 383 421 422 delete[] buf; 384 423 return; 385 424 } 386 425 #endif //MMX … … void pack_yv12interlaced(const unsigned char *source, 388 427 // pack first 2 and last 2 rows 389 428 for (int col = 0; col < width; col += 2) 390 429 { 391 *(dst_1++) = *v1; *(dst_2++) = *v2; *(dst_3++) = *v3; *(dst_4++) = *v4; 392 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255; 393 *(dst_1++) = *u1; *(dst_2++) = *u2; *(dst_3++) = *u3; *(dst_4++) = *u4; 394 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++); 395 *(dst_3++) = *(ypt_3++); *(dst_4++) = 
*(ypt_4++); 396 397 *(dst_1++) = *(v1++); *(dst_2++) = *(v2++); 398 *(dst_3++) = *(v3++); *(dst_4++) = *(v4++); 399 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255; 400 *(dst_1++) = *(u1++); *(dst_2++) = *(u2++); 401 *(dst_3++) = *(u3++); *(dst_4++) = *(u4++); 402 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++); 403 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++); 430 b1 += c_pack2(b1, *(v1++), *(u1++), ypt_1[col], ypt_1[col + 1]); 431 b2 += c_pack2(b2, *(v2++), *(u2++), ypt_2[col], ypt_2[col + 1]); 432 b3 += c_pack2(b3, *(v3++), *(u3++), ypt_3[col], ypt_3[col + 1]); 433 b4 += c_pack2(b4, *(v4++), *(u4++), ypt_4[col], ypt_4[col + 1]); 404 434 } 405 435 406 ypt_1 += ywrap; ypt_2 += ywrap; 407 dst_1 += bgra_width; dst_2 += bgra_width; 436 // Copy the packed data to dest 437 memcpy(dst_1, buf, len); 438 dst_1 += len; 439 memcpy(dst_3, buf + len, len); 408 440 441 ywrap = pitches[0] << 2; 442 443 ypt_1 += ywrap; ypt_2 += ywrap; 409 444 ypt_3 = ypt_2 + pitches[0]; 410 445 ypt_4 = ypt_3 + pitches[0]; 411 dst_3 = dst_2 + bgra_width;412 dst_4 = dst_3 + bgra_width;413 446 414 ywrap = (pitches[0] << 2) - width; 415 416 u1 = (uint8_t *)source + offsets[1]; 417 v1 = (uint8_t *)source + offsets[2]; 447 u1 = static_cast< const uint8_t * >(source) + offsets[1]; 448 v1 = static_cast< const uint8_t * >(source) + offsets[2]; 418 449 u2 = u1 + pitches[1]; v2 = v1 + pitches[2]; 419 450 u3 = u2 + pitches[1]; v3 = v2 + pitches[2]; 420 451 u4 = u3 + pitches[1]; v4 = v3 + pitches[2]; 421 452 422 453 height -= 4; 423 454 424 uint8_t v[4], u[4];425 426 455 // pack main body 427 456 for (int row = 0; row < height; row += 4) 428 457 { 458 b1 = buf; 459 b2 = b1 + bgra_width; 460 b3 = b2 + bgra_width; 461 b4 = b3 + bgra_width; 462 429 463 for (int col = 0; col < width; col += 2) 430 464 { 431 c_interp(v, v1, v2, v3, v4); 432 c_interp(u, u1, u2, u3, u4); 433 434 *(dst_1++) = v[0]; *(dst_2++) = v[1]; 435 *(dst_3++) = v[2]; *(dst_4++) = v[3]; 436 *(dst_1++) = 255; 
*(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255; 437 *(dst_1++) = u[0]; *(dst_2++) = u[1]; 438 *(dst_3++) = u[2]; *(dst_4++) = u[3]; 439 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++); 440 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++); 441 442 *(dst_1++) = v[0]; *(dst_2++) = v[1]; 443 *(dst_3++) = v[2]; *(dst_4++) = v[3]; 444 *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255; 445 *(dst_1++) = u[0]; *(dst_2++) = u[1]; 446 *(dst_3++) = u[2]; *(dst_4++) = u[3]; 447 *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++); 448 *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++); 449 450 v1++; v2++; v3++; v4++; 451 u1++; u2++; u3++; u4++; 465 unsigned v[4], u[4]; 466 467 c_interp(v, *(v1++), *(v2++), *(v3++), *(v4++)); 468 c_interp(u, *(u1++), *(u2++), *(u3++), *(u4++)); 469 470 b1 += c_pack2(b1, v[0], u[0], ypt_1[col], ypt_1[col + 1]); 471 b2 += c_pack2(b2, v[1], u[1], ypt_2[col], ypt_2[col + 1]); 472 b3 += c_pack2(b3, v[2], u[2], ypt_3[col], ypt_3[col + 1]); 473 b4 += c_pack2(b4, v[3], u[3], ypt_4[col], ypt_4[col + 1]); 452 474 } 475 476 // Copy the packed data to dest 477 memcpy(dst_1, buf, bufsize); 478 dst_1 += bufsize; 479 453 480 ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap; 454 481 u1 += uwrap; u2 += uwrap; u3 += uwrap; u4 += uwrap; 455 482 v1 += vwrap; v2 += vwrap; v3 += vwrap; v4 += vwrap; 456 dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;457 483 } 484 485 delete[] buf; 458 486 } -
mythtv/libs/libmythtv/util-opengl.h
diff --git a/mythtv/libs/libmythtv/util-opengl.h b/mythtv/libs/libmythtv/util-opengl.h index cb38101..8310c46 100644
a b void pack_yv12progressive(const unsigned char *source, const unsigned char *dest 12 12 const int *offsets, const int *pitches, 13 13 const QSize &size); 14 14 15 void pack_yv12interlaced(const unsigned char *source, constunsigned char *dest,16 const int *offsets, const int *pitches,15 void pack_yv12interlaced(const unsigned char *source, unsigned char *dest, 16 const int offsets[3], const int pitches[3], 17 17 const QSize &size); 18 18 19 19 #endif // USING_OPENGL