Context Navigation

Back to Ticket #10922

Ticket #10922: 0001-OpenGL-Optimize-writing-packed-images-to-pixel-buffe.patch

File 0001-OpenGL-Optimize-writing-packed-images-to-pixel-buffe.patch, 15.9 KB (added by Lawrence Rust <lvr@…>, 13 years ago)

mythtv/libs/libmythtv/util-opengl.cpp

From 0dbd9fd29d12f5d49e2c25f6c60bcb2d7bc27b48 Mon Sep 17 00:00:00 2001
From: Lawrence Rust <lvr@softsystem.co.uk>
Date: Wed, 18 Jul 2012 10:26:01 +0200
Subject: [PATCH] OpenGL: Optimize writing packed images to pixel buffer objects (PBO)

pack_yv12interlaced() packs pixels and writes them to an OpenGL
pixel buffer object.  This buffer may reside on the graphics card
so writes may be slower than to RAM.  Combining writes to the buffer
can improve performance by 200-300%.

This patch also provides optimized non-MMX code and tightens
const correctness.

Signed-off-by: Lawrence Rust <lvr@softsystem.co.uk>
---
 mythtv/libs/libmythtv/util-opengl.cpp |  246 ++++++++++++++++++---------------
 mythtv/libs/libmythtv/util-opengl.h   |    4 +-
 2 files changed, 139 insertions(+), 111 deletions(-)

diff --git a/mythtv/libs/libmythtv/util-opengl.cpp b/mythtv/libs/libmythtv/util-opengl.cpp
index 5fca85a..1e5e547 100644

-                      a
 // -*- Mode: c++ -*-
+#include <string.h>
 #include <stdint.h>
 #include <QSize>
 #include "compat.h"
-…
+ extern "C" {
 #include "ffmpeg-mmx.h"
+}
 static mmx_t mmx_1s = {0xffffffffffffffffLL};
+static mmx_t const mmx_1s = {0xffffffffffffffffLL};
 static inline void mmx_pack_alpha1s_high(uint8_t *y1, uint8_t *y2)
+static inline void mmx_pack_alpha1s_high(const uint8_t *y1, const uint8_t *y2)
+{
     movq_m2r (mmx_1s, mm4);
     punpckhbw_m2r (*y1, mm4);
-…
+ static inline void mmx_pack_alpha1s_high(uint8_t *y1, uint8_t *y2)
     punpckhbw_m2r (*y2, mm7);
+}
 static inline void mmx_pack_alpha1s_low(uint8_t *y1, uint8_t *y2)
+static inline void mmx_pack_alpha1s_low(const uint8_t *y1, const uint8_t *y2)
+{
     movq_m2r (mmx_1s, mm4);
     punpcklbw_m2r (*y1, mm4);
-…
+ static inline void mmx_pack_end(uint8_t *dest1, uint8_t *dest2)
     movq_r2m (mm3, *(dest2 + 24));
+}
 static inline void mmx_pack_easy(uint8_t *dest, uint8_t *y)
+static inline void mmx_pack_easy(uint8_t *dest, const uint8_t *y)
+{
     movq_m2r (mmx_1s, mm4);
     punpcklbw_m2r (*y, mm4);
-…
+ static inline void mmx_pack_easy(uint8_t *dest, uint8_t *y)
     movq_r2m (mm3, *(dest + 24));
+}
 static mmx_t mmx_0s = {0x0000000000000000LL};
 static mmx_t round  = {0x0002000200020002LL};
+static mmx_t const mmx_0s = {0x0000000000000000LL};
+static mmx_t const round  = {0x0002000200020002LL};
 static inline void mmx_interp_start(uint8_t *left, uint8_t *right)
+static inline void mmx_interp_start(const uint8_t *left, const uint8_t *right)
+{
     movd_m2r  (*left, mm5);
     punpcklbw_m2r (mmx_0s, mm5);
-…
+ static inline void mmx_interp_endv(void)
     paddb_r2r (mm4, mm3);
+}
 static inline void mmx_pack_chroma(uint8_t *u, uint8_t *v)
+static inline void mmx_pack_chroma(const uint8_t *u, const uint8_t *v)
+{
     movd_m2r (*u,  mm2);
     movd_m2r (*v,  mm3);
-…
+ static inline void mmx_pack_chroma(uint8_t *u, uint8_t *v)
+}
 #endif // MMX
 static inline void c_interp(uint8_t *dest, uint8_t *a, uint8_t *b,
                             uint8_t *c, uint8_t *d)
+static inline void c_interp(unsigned dest[4], unsigned a, unsigned b,
+                            unsigned c, unsigned d)
+{
     unsigned int tmp = (unsigned int) *a;
+    unsigned int tmp = a;
     tmp *= 3;
     tmp += 2;
     tmp += (unsigned int) *c;
     dest[0] = (uint8_t) (tmp >> 2);
+    tmp += c;
+    dest[0] = tmp >> 2;
     tmp = (unsigned int) *b;
+    tmp = b;
     tmp *= 3;
     tmp += 2;
     tmp += (unsigned int) *d;
     dest[1] = (uint8_t) (tmp >> 2);
+    tmp += d;
+    dest[1] = tmp >> 2;
     tmp = (unsigned int) *c;
+    tmp = c;
     tmp *= 3;
     tmp += 2;
     tmp += (unsigned int) *a;
     dest[2] = (uint8_t) (tmp >> 2);
+    tmp += a;
+    dest[2] = tmp >> 2;
     tmp = (unsigned int) *d;
+    tmp = d;
     tmp *= 3;
     tmp += 2;
+    tmp += (unsigned int) *b;
+    dest[3] = (uint8_t) (tmp >> 2);
+    tmp += b;
+    dest[3] = tmp >> 2;
+}
+#ifdef __GNUC__ // GNU-compatible compilers: the packed attribute is available
+#define MYTH_PACKED __attribute__((packed)) // ask for a struct layout with no padding between members
+#else
+#define MYTH_PACKED // other compilers: expands to nothing, natural layout is used
+#endif
+static inline unsigned c_pack2(uint8_t dest[], // Writes two packed 4-byte pixels that share one v/u pair to dest.
+    unsigned v, unsigned u, unsigned y1, unsigned y2) // v, u: chroma shared by both pixels; y1, y2: one luma value per pixel
+{
+    struct pack
+    {
+        uint8_t v1, a1, u1, y1; // first pixel, in output byte order: v, alpha, u, luma
+        uint8_t v2, a2, u2, y2; // second pixel: same v and u, its own luma
+    } MYTH_PACKED tmp = {v, 0xff, u, y1, v, 0xff, u, y2}; // alpha is always 0xff. NOTE(review): unsigned -> uint8_t inside braces is a narrowing conversion under C++11 list-initialization; confirm the language standard used for the build
+    *reinterpret_cast< struct pack * >(dest) = tmp; // one struct assignment instead of eight separate byte stores; assumes dest has room for sizeof(struct pack) bytes
+    return sizeof tmp; // bytes written; callers add this to their output pointer
+}
 void pack_yv12progressive(const unsigned char *source,
-…
+ void pack_yv12progressive(const unsigned char *source,
+}
 void pack_yv12interlaced(const unsigned char *source,
                          const unsigned char *dest,
                          const int *offsets,
                          const int *pitches,
+                         unsigned char *dest,
+                         const int offsets[3],
+                         const int pitches[3],
                          const QSize &size)
+{
     int width = size.width();
+    const int width = size.width();
     int height = size.height();
     if (height % 4 || width % 2)
         return;
+    uint bgra_width  = width << 2;
+    uint dwrap  = (bgra_width << 2) - bgra_width;
+    uint chroma_width = width >> 1;
+    uint ywrap     = (pitches[0] << 1) - width;
+    uint uwrap     = (pitches[1] << 1) - chroma_width;
+    uint vwrap     = (pitches[2] << 1) - chroma_width;
+    uint8_t *ypt_1   = (uint8_t *)source + offsets[0];
+    uint8_t *ypt_2   = ypt_1 + pitches[0];
+    uint8_t *ypt_3   = ypt_1 + (pitches[0] * (height - 2));
+    uint8_t *ypt_4   = ypt_3 + pitches[0];
+    uint8_t *u1     = (uint8_t *)source + offsets[1];
+    uint8_t *v1     = (uint8_t *)source + offsets[2];
+    uint8_t *u2     = u1 + pitches[1]; uint8_t *v2     = v1 + pitches[2];
+    uint8_t *u3     = u1 + (pitches[1] * ((height - 4) >> 1));
+    uint8_t *v3     = v1 + (pitches[2] * ((height - 4) >> 1));
+    uint8_t *u4     = u3 + pitches[1]; uint8_t *v4     = v3 + pitches[2];
+    uint8_t *dst_1   = (uint8_t *) dest;
+    uint8_t *dst_2   = dst_1 + bgra_width;
+    uint8_t *dst_3   = dst_1 + (bgra_width * (height - 2));
+    uint8_t *dst_4   = dst_3 + bgra_width;
+    const uint bgra_width   = width << 2;
+    const uint chroma_width = width >> 1;
+          uint ywrap        = (pitches[0] << 1) - width;
+    const uint uwrap        = (pitches[1] << 1) - chroma_width;
+    const uint vwrap        = (pitches[2] << 1) - chroma_width;
+    const uint8_t *ypt_1    = static_cast< const uint8_t * >(source) + offsets[0];
+    const uint8_t *ypt_2    = ypt_1 + pitches[0];
+    const uint8_t *ypt_3    = ypt_1 + (pitches[0] * (height - 2));
+    const uint8_t *ypt_4    = ypt_3 + pitches[0];
+    const uint8_t *u1       = static_cast< const uint8_t * >(source) + offsets[1];
+    const uint8_t *v1       = static_cast< const uint8_t * >(source) + offsets[2];
+    const uint8_t *u2       = u1 + pitches[1];
+    const uint8_t *v2       = v1 + pitches[2];
+    const uint8_t *u3       = u1 + (pitches[1] * ((height - 4) >> 1));
+    const uint8_t *v3       = v1 + (pitches[2] * ((height - 4) >> 1));
+    const uint8_t *u4       = u3 + pitches[1];
+    const uint8_t *v4       = v3 + pitches[2];
+    uint8_t *dst_1          = static_cast< uint8_t * >(dest);
+    uint8_t *dst_3          = dst_1 + (bgra_width * (height - 2));
+    // Allocate a 4 line packed data buffer
+    // NB dest is probably graphics memory so access may be slow
+    const uint bufsize = bgra_width * 4;
+    uint8_t *buf = new uint8_t[bufsize];
+    uint8_t *b1 = buf;
+    uint8_t *b2 = b1 + bgra_width;
+    const uint len = bgra_width * 2; // 2 line buffer size
+    uint8_t *b3 = buf + len;
+    uint8_t *b4 = b3 + bgra_width;
 #ifdef MMX
-…
+ void pack_yv12interlaced(const unsigned char *source,
         for (int col = 0; col < width; col += 8)
+        {
             mmx_pack_chroma(u1, v1);
             mmx_pack_easy(dst_1, ypt_1);
+            mmx_pack_easy(b1, ypt_1);
             mmx_pack_chroma(u2, v2);
             mmx_pack_easy(dst_2, ypt_2);
+            mmx_pack_easy(b2, ypt_2);
             mmx_pack_chroma(u3, v3);
             mmx_pack_easy(dst_3, ypt_3);
+            mmx_pack_easy(b3, ypt_3);
             mmx_pack_chroma(u4, v4);
             mmx_pack_easy(dst_4, ypt_4);
+            mmx_pack_easy(b4, ypt_4);
             dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;
+            b1 += 32; b2 += 32; b3 += 32; b4 += 32;
             ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8;
             u1   += 4; v1   += 4; u2   += 4; v2   += 4;
             u3   += 4; v3   += 4; u4   += 4; v4   += 4;
+        }
+        ypt_1 += ywrap; ypt_2 += ywrap;
+        dst_1 += bgra_width; dst_2 += bgra_width;
+        memcpy(dst_1, buf, len);
+        dst_1 += len;
+        memcpy(dst_3, buf + len, len);
+        ypt_1 += ywrap; ypt_2 += ywrap;
         ypt_3 = ypt_2 + pitches[0];
         ypt_4 = ypt_3 + pitches[0];
-        dst_3 = dst_2 + bgra_width;
-        dst_4 = dst_3 + bgra_width;
         ywrap = (pitches[0] << 2) - width;
-…
+ void pack_yv12interlaced(const unsigned char *source,
         // pack main body
         for (int row = 0 ; row < height; row += 4)
+        {
+            b1 = buf;
+            b2 = b1 + bgra_width;
+            b3 = b2 + bgra_width;
+            b4 = b3 + bgra_width;
             for (int col = 0; col < width; col += 8)
+            {
                 mmx_interp_start(u1, u3); mmx_interp_endu();
                 mmx_interp_start(v1, v3); mmx_interp_endv();
                 mmx_pack_easy(dst_1, ypt_1);
+                mmx_pack_easy(b1, ypt_1);
                 mmx_interp_start(u2, u4); mmx_interp_endu();
                 mmx_interp_start(v2, v4); mmx_interp_endv();
                 mmx_pack_easy(dst_2, ypt_2);
+                mmx_pack_easy(b2, ypt_2);
                 mmx_interp_start(u3, u1); mmx_interp_endu();
                 mmx_interp_start(v3, v1); mmx_interp_endv();
                 mmx_pack_easy(dst_3, ypt_3);
+                mmx_pack_easy(b3, ypt_3);
                 mmx_interp_start(u4, u2); mmx_interp_endu();
                 mmx_interp_start(v4, v2); mmx_interp_endv();
                 mmx_pack_easy(dst_4, ypt_4);
+                mmx_pack_easy(b4, ypt_4);
                 dst_1 += 32; dst_2 += 32; dst_3 += 32; dst_4 += 32;
+                b1 += 32; b2 += 32; b3 += 32; b4 += 32;
                 ypt_1 += 8; ypt_2 += 8; ypt_3 += 8; ypt_4 += 8;
                 u1   += 4; u2   += 4; u3   += 4; u4   += 4;
                 v1   += 4; v2   += 4; v3   += 4; v4   += 4;
+            }
+            // Copy the packed data to dest
+            memcpy(dst_1, buf, bufsize);
+            dst_1 += bufsize;
             ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap;
-            dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;
             u1 += uwrap; v1 += vwrap; u2 += uwrap; v2 += vwrap;
             u3 += uwrap; v3 += vwrap; u4 += uwrap;v4 += vwrap;
+        }
         emms();
+        delete[] buf;
         return;
+    }
 #endif //MMX
-…
+ void pack_yv12interlaced(const unsigned char *source,
     // pack first 2 and last 2 rows
     for (int col = 0; col < width; col += 2)
+    {
+        *(dst_1++) = *v1; *(dst_2++) = *v2; *(dst_3++) = *v3; *(dst_4++) = *v4;
+        *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
+        *(dst_1++) = *u1; *(dst_2++) = *u2; *(dst_3++) = *u3; *(dst_4++) = *u4;
+        *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
+        *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
+        *(dst_1++) = *(v1++); *(dst_2++) = *(v2++);
+        *(dst_3++) = *(v3++); *(dst_4++) = *(v4++);
+        *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
+        *(dst_1++) = *(u1++); *(dst_2++) = *(u2++);
+        *(dst_3++) = *(u3++); *(dst_4++) = *(u4++);
+        *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
+        *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
+        b1 += c_pack2(b1, *(v1++), *(u1++), ypt_1[col], ypt_1[col + 1]);
+        b2 += c_pack2(b2, *(v2++), *(u2++), ypt_2[col], ypt_2[col + 1]);
+        b3 += c_pack2(b3, *(v3++), *(u3++), ypt_3[col], ypt_3[col + 1]);
+        b4 += c_pack2(b4, *(v4++), *(u4++), ypt_4[col], ypt_4[col + 1]);
+    }
+    ypt_1 += ywrap; ypt_2 += ywrap;
+    dst_1 += bgra_width; dst_2 += bgra_width;
+    // Copy the packed data to dest
+    memcpy(dst_1, buf, len);
+    dst_1 += len;
+    memcpy(dst_3, buf + len, len);
+    ywrap = pitches[0] << 2;
+    ypt_1 += ywrap; ypt_2 += ywrap;
     ypt_3 = ypt_2 + pitches[0];
     ypt_4 = ypt_3 + pitches[0];
-    dst_3 = dst_2 + bgra_width;
-    dst_4 = dst_3 + bgra_width;
+    ywrap = (pitches[0] << 2) - width;
+    u1 = (uint8_t *)source + offsets[1];
+    v1 = (uint8_t *)source + offsets[2];
+    u1 = static_cast< const uint8_t * >(source) + offsets[1];
+    v1 = static_cast< const uint8_t * >(source) + offsets[2];
     u2 = u1 + pitches[1]; v2 = v1 + pitches[2];
     u3 = u2 + pitches[1]; v3 = v2 + pitches[2];
     u4 = u3 + pitches[1]; v4 = v3 + pitches[2];
     height -= 4;
-    uint8_t v[4], u[4];
     // pack main body
     for (int row = 0; row < height; row += 4)
+    {
+        b1 = buf;
+        b2 = b1 + bgra_width;
+        b3 = b2 + bgra_width;
+        b4 = b3 + bgra_width;
         for (int col = 0; col < width; col += 2)
+        {
+            c_interp(v, v1, v2, v3, v4);
+            c_interp(u, u1, u2, u3, u4);
+            *(dst_1++) = v[0]; *(dst_2++) = v[1];
+            *(dst_3++) = v[2]; *(dst_4++) = v[3];
+            *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
+            *(dst_1++) = u[0]; *(dst_2++) = u[1];
+            *(dst_3++) = u[2]; *(dst_4++) = u[3];
+            *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
+            *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
+            *(dst_1++) = v[0]; *(dst_2++) = v[1];
+            *(dst_3++) = v[2]; *(dst_4++) = v[3];
+            *(dst_1++) = 255; *(dst_2++) = 255; *(dst_3++) = 255; *(dst_4++) = 255;
+            *(dst_1++) = u[0]; *(dst_2++) = u[1];
+            *(dst_3++) = u[2]; *(dst_4++) = u[3];
+            *(dst_1++) = *(ypt_1++); *(dst_2++) = *(ypt_2++);
+            *(dst_3++) = *(ypt_3++); *(dst_4++) = *(ypt_4++);
+            v1++; v2++; v3++; v4++;
+            u1++; u2++; u3++; u4++;
+            unsigned v[4], u[4];
+            c_interp(v, *(v1++), *(v2++), *(v3++), *(v4++));
+            c_interp(u, *(u1++), *(u2++), *(u3++), *(u4++));
+            b1 += c_pack2(b1, v[0], u[0], ypt_1[col], ypt_1[col + 1]);
+            b2 += c_pack2(b2, v[1], u[1], ypt_2[col], ypt_2[col + 1]);
+            b3 += c_pack2(b3, v[2], u[2], ypt_3[col], ypt_3[col + 1]);
+            b4 += c_pack2(b4, v[3], u[3], ypt_4[col], ypt_4[col + 1]);
+        }
+        // Copy the packed data to dest
+        memcpy(dst_1, buf, bufsize);
+        dst_1 += bufsize;
         ypt_1 += ywrap; ypt_2 += ywrap; ypt_3 += ywrap; ypt_4 += ywrap;
         u1 += uwrap; u2 += uwrap; u3 += uwrap; u4 += uwrap;
         v1 += vwrap; v2 += vwrap; v3 += vwrap; v4 += vwrap;
-        dst_1 += dwrap; dst_2 += dwrap; dst_3 += dwrap; dst_4 += dwrap;
+    }
+    delete[] buf;
+}

mythtv/libs/libmythtv/util-opengl.h

diff --git a/mythtv/libs/libmythtv/util-opengl.h b/mythtv/libs/libmythtv/util-opengl.h
index cb38101..8310c46 100644

  void pack_yv12progressive(const unsigned char *source, const unsigned char *dest
                           const int *offsets, const int *pitches,
                           const QSize &size);
 void pack_yv12interlaced(const unsigned char *source, const unsigned char *dest,
                          const int *offsets, const int *pitches,
+void pack_yv12interlaced(const unsigned char *source, unsigned char *dest,
+                         const int offsets[3], const int pitches[3],
                          const QSize &size);
 #endif // USING_OPENGL

Download in other formats:

Original Format