// MythTV: yuv2rgb.cpp
/*
 * yuv2rgb_mmx.c
 * Copyright (C) 2000-2001 Silicon Integrated System Corp.
 * All Rights Reserved.
 *
 * Author: Olie Lho <ollie@sis.com.tw>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include "mythconfig.h"

#if HAVE_MMX
extern "C" {
#include "ffmpeg-mmx.h"
}
#define CPU_MMXEXT 0
#define CPU_MMX 1
#endif

#if HAVE_ALTIVEC
extern "C" {
#include "libavutil/cpu.h"
}
int has_altivec(void);
#if HAVE_ALTIVEC_H
#include <altivec.h>
#else
#include <Accelerate/Accelerate.h>
#endif
#endif
#include "yuv2rgb.h"

#if HAVE_ALTIVEC
int has_altivec(void)
{
    int cpu_flags = av_get_cpu_flags();
    if (cpu_flags & AV_CPU_FLAG_ALTIVEC)
        return(1);

    return(0);
}
#endif

static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
                                  unsigned char *pu, unsigned char *pv,
                                  int h_size, int v_size, int rgb_stride,
                                  int y_stride, int uv_stride, int alphaones);

/* CPU_MMXEXT/CPU_MMX adaptation layer */

#define movntq(src,dest) \
do { \
    if (cpu == CPU_MMXEXT) \
        movntq_r2m (src, dest); \
    else \
        movq_r2m (src, dest); \
} while (0)

#if HAVE_MMX
static inline void mmx_yuv2rgb (const uint8_t * py, const uint8_t * pu, const uint8_t * pv)
{
    static mmx_t mmx_80w     = {0x0080008000800080ULL};
    static mmx_t mmx_U_green = {0xf37df37df37df37dULL};
    static mmx_t mmx_U_blue  = {0x4093409340934093ULL};
    static mmx_t mmx_V_red   = {0x3312331233123312ULL};
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcULL};
    static mmx_t mmx_10w     = {0x1010101010101010ULL};
    static mmx_t mmx_00ffw   = {0x00ff00ff00ff00ffULL};
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fULL};
97 
98  movd_m2r (*pu, mm0); // mm0 = 00 00 00 00 u3 u2 u1 u0
99  movd_m2r (*pv, mm1); // mm1 = 00 00 00 00 v3 v2 v1 v0
100  movq_m2r (*py, mm6); // mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
101  pxor_r2r (mm4, mm4); // mm4 = 0
102  /* XXX might do cache preload for image here */
103 
104  /*
105  * Do the multiply part of the conversion for even and odd pixels
106  * register usage:
107  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
108  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
109  * mm6 -> Y even, mm7 -> Y odd
110  */
111 
112  punpcklbw_r2r (mm4, mm0); // mm0 = u3 u2 u1 u0
113  punpcklbw_r2r (mm4, mm1); // mm1 = v3 v2 v1 v0
114  psubsw_m2r (mmx_80w, mm0); // u -= 128
115  psubsw_m2r (mmx_80w, mm1); // v -= 128
116  psllw_i2r (3, mm0); // promote precision
117  psllw_i2r (3, mm1); // promote precision
118  movq_r2r (mm0, mm2); // mm2 = u3 u2 u1 u0
119  movq_r2r (mm1, mm3); // mm3 = v3 v2 v1 v0
120  pmulhw_m2r (mmx_U_green, mm2); // mm2 = u * u_green
121  pmulhw_m2r (mmx_V_green, mm3); // mm3 = v * v_green
122  pmulhw_m2r (mmx_U_blue, mm0); // mm0 = chroma_b
123  pmulhw_m2r (mmx_V_red, mm1); // mm1 = chroma_r
124  paddsw_r2r (mm3, mm2); // mm2 = chroma_g
125 
126  psubusb_m2r (mmx_10w, mm6); // Y -= 16
127  movq_r2r (mm6, mm7); // mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0
128  pand_m2r (mmx_00ffw, mm6); // mm6 = Y6 Y4 Y2 Y0
129  psrlw_i2r (8, mm7); // mm7 = Y7 Y5 Y3 Y1
130  psllw_i2r (3, mm6); // promote precision
131  psllw_i2r (3, mm7); // promote precision
132  pmulhw_m2r (mmx_Y_coeff, mm6); // mm6 = luma_rgb even
133  pmulhw_m2r (mmx_Y_coeff, mm7); // mm7 = luma_rgb odd
134 
135  /*
136  * Do the addition part of the conversion for even and odd pixels
137  * register usage:
138  * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
139  * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
140  * mm6 -> Y even, mm7 -> Y odd
141  */
142 
143  movq_r2r (mm0, mm3); // mm3 = chroma_b
144  movq_r2r (mm1, mm4); // mm4 = chroma_r
145  movq_r2r (mm2, mm5); // mm5 = chroma_g
146  paddsw_r2r (mm6, mm0); // mm0 = B6 B4 B2 B0
147  paddsw_r2r (mm7, mm3); // mm3 = B7 B5 B3 B1
148  paddsw_r2r (mm6, mm1); // mm1 = R6 R4 R2 R0
149  paddsw_r2r (mm7, mm4); // mm4 = R7 R5 R3 R1
150  paddsw_r2r (mm6, mm2); // mm2 = G6 G4 G2 G0
151  paddsw_r2r (mm7, mm5); // mm5 = G7 G5 G3 G1
152  packuswb_r2r (mm0, mm0); // saturate to 0-255
153  packuswb_r2r (mm1, mm1); // saturate to 0-255
154  packuswb_r2r (mm2, mm2); // saturate to 0-255
155  packuswb_r2r (mm3, mm3); // saturate to 0-255
156  packuswb_r2r (mm4, mm4); // saturate to 0-255
157  packuswb_r2r (mm5, mm5); // saturate to 0-255
158  punpcklbw_r2r (mm3, mm0); // mm0 = B7 B6 B5 B4 B3 B2 B1 B0
159  punpcklbw_r2r (mm4, mm1); // mm1 = R7 R6 R5 R4 R3 R2 R1 R0
160  punpcklbw_r2r (mm5, mm2); // mm2 = G7 G6 G5 G4 G3 G2 G1 G0
161 }

static inline void mmx_unpack_16rgb (uint8_t * image, int cpu)
{
    static mmx_t mmx_bluemask  = {0xf8f8f8f8f8f8f8f8LL};
    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
    static mmx_t mmx_redmask   = {0xf8f8f8f8f8f8f8f8LL};
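
    /* The masks keep the top 5 bits of blue/red and the top 6 bits of
     * green; after the shifts below, each pixel packs into the usual
     * RGB565 layout, rrrrrggg gggbbbbb.
     */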

    /*
     * convert RGB plane to RGB 16 bits
     * mm0 -> B, mm1 -> R, mm2 -> G
     * mm4 -> GB, mm5 -> AR pixel 4-7
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pand_m2r (mmx_bluemask, mm0);  // mm0 = b7b6b5b4b3______
    pand_m2r (mmx_greenmask, mm2); // mm2 = g7g6g5g4g3g2____
    pand_m2r (mmx_redmask, mm1);   // mm1 = r7r6r5r4r3______
    psrlq_i2r (3, mm0);            // mm0 = ______b7b6b5b4b3
    pxor_r2r (mm4, mm4);           // mm4 = 0
    movq_r2r (mm0, mm5);           // mm5 = ______b7b6b5b4b3
    movq_r2r (mm2, mm7);           // mm7 = g7g6g5g4g3g2____

    punpcklbw_r2r (mm4, mm2);
    punpcklbw_r2r (mm1, mm0);
    psllq_i2r (3, mm2);
    por_r2r (mm2, mm0);
    movntq (mm0, *image);

    punpckhbw_r2r (mm4, mm7);
    punpckhbw_r2r (mm1, mm5);
    psllq_i2r (3, mm7);
    por_r2r (mm7, mm5);
    movntq (mm5, *(image+8));
}

static inline void mmx_unpack_32rgb (uint8_t * image, int cpu, int alphaones)
{
    /*
     * convert RGB plane to RGB packed format,
     * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
     * mm4 -> GB, mm5 -> AR pixel 4-7,
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */
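
    /* Note: the interleaves below emit bytes in B,G,R,A memory order,
     * i.e. each pixel reads as 0xAARRGGBB in a little-endian 32-bit
     * word, which is what the "argb32" naming refers to on x86.
     */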

    if (alphaones)
    {
        static mmx_t mmx_1s = {0xffffffffffffffffLL};
        movq_m2r (mmx_1s, mm3);
    }
    else
        pxor_r2r (mm3, mm3);

    movq_r2r (mm0, mm6);
    movq_r2r (mm1, mm7);
    movq_r2r (mm0, mm4);
    movq_r2r (mm1, mm5);
    punpcklbw_r2r (mm2, mm6);
    punpcklbw_r2r (mm3, mm7);
    punpcklwd_r2r (mm7, mm6);
    movntq (mm6, *image);
    movq_r2r (mm0, mm6);
    punpcklbw_r2r (mm2, mm6);
    punpckhwd_r2r (mm7, mm6);
    movntq (mm6, *(image+8));
    punpckhbw_r2r (mm2, mm4);
    punpckhbw_r2r (mm3, mm5);
    punpcklwd_r2r (mm5, mm4);
    movntq (mm4, *(image+16));
    movq_r2r (mm0, mm4);
    punpckhbw_r2r (mm2, mm4);
    punpckhwd_r2r (mm5, mm4);
    movntq (mm4, *(image+24));
}

static inline void yuv420_rgb16 (uint8_t * image,
                                 uint8_t * py, uint8_t * pu, uint8_t * pv,
                                 int width, int height,
                                 int rgb_stride, int y_stride, int uv_stride,
                                 int cpu, int alphaones)
{
    (void)alphaones;

    rgb_stride -= 2 * width;
    y_stride -= width;
    uv_stride -= width >> 1;
    width >>= 3;

    do {
        int i = width;
        do {
            mmx_yuv2rgb (py, pu, pv);
            mmx_unpack_16rgb (image, cpu);
            py += 8;
            pu += 4;
            pv += 4;
            image += 16;
        } while (--i);

        py += y_stride;
        image += rgb_stride;
        if (height & 1) {
            pu += uv_stride;
            pv += uv_stride;
        } else {
            pu -= 4 * width;
            pv -= 4 * width;
        }
    } while (--height);

    emms();
}

static inline void yuv420_argb32 (uint8_t * image, uint8_t * py,
                                  uint8_t * pu, uint8_t * pv,
                                  int width, int height,
                                  int rgb_stride, int y_stride, int uv_stride,
                                  int cpu, int alphaones)
{
    rgb_stride -= 4 * width;
    y_stride -= width;
    uv_stride -= width >> 1;
    width >>= 3;

    do {
        int i = width;
        do {
            mmx_yuv2rgb (py, pu, pv);
            mmx_unpack_32rgb (image, cpu, alphaones);
            py += 8;
            pu += 4;
            pv += 4;
            image += 32;
        } while (--i);

        py += y_stride;
        image += rgb_stride;
        if (height & 1) {
            pu += uv_stride;
            pv += uv_stride;
        } else {
            pu -= 4 * width;
            pv -= 4 * width;
        }
    } while (--height);

    emms();
}

static void mmxext_rgb16 (uint8_t * image,
                          uint8_t * py, uint8_t * pu, uint8_t * pv,
                          int width, int height,
                          int rgb_stride, int y_stride, int uv_stride,
                          int alphaones)
{
    yuv420_rgb16 (image, py, pu, pv, width, height,
                  rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
}

static void mmxext_argb32 (uint8_t * image,
                           uint8_t * py, uint8_t * pu, uint8_t * pv,
                           int width, int height,
                           int rgb_stride, int y_stride, int uv_stride,
                           int alphaones)
{
    yuv420_argb32 (image, py, pu, pv, width, height,
                   rgb_stride, y_stride, uv_stride, CPU_MMXEXT, alphaones);
}

static void mmx_rgb16 (uint8_t * image,
                       uint8_t * py, uint8_t * pu, uint8_t * pv,
                       int width, int height,
                       int rgb_stride, int y_stride, int uv_stride,
                       int alphaones)
{
    yuv420_rgb16 (image, py, pu, pv, width, height,
                  rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
}

static void mmx_argb32 (uint8_t * image,
                        uint8_t * py, uint8_t * pu, uint8_t * pv,
                        int width, int height,
                        int rgb_stride, int y_stride, int uv_stride,
                        int alphaones)
{
    yuv420_argb32 (image, py, pu, pv, width, height,
                   rgb_stride, y_stride, uv_stride, CPU_MMX, alphaones);
}
#endif

/** \brief This returns a YUV to RGBA converter, using MMXEXT if MMX was
 *         compiled in.
 */
yuv2rgb_fun yuv2rgb_init_mmxext (int bpp, int mode)
{
#if HAVE_MMX
    if ((bpp == 16) && (mode == MODE_RGB))
        return mmxext_rgb16;
    if ((bpp == 32) && (mode == MODE_RGB))
        return mmxext_argb32;
#endif

    (void)bpp;
    (void)mode;

    return nullptr; /* Fallback to C */
}

/** \brief This returns a YUV to RGBA converter, using MMX if MMX was
 *         compiled in.
 */
yuv2rgb_fun yuv2rgb_init_mmx (int bpp, int mode)
{
#if HAVE_MMX
    if ((bpp == 16) && (mode == MODE_RGB))
        return mmx_rgb16;
    if ((bpp == 32) && (mode == MODE_RGB))
        return mmx_argb32;
#endif
    if ((bpp == 32) && (mode == MODE_RGB))
        return yuv420_argb32_non_mmx;

    return nullptr;
}
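
#if 0 // Illustrative sketch only, not part of the original file: a
      // hypothetical caller picks the fastest converter at run time and
      // falls back through MMX to the plain C routine.
static void example_convert_frame(uint8_t *image, uint8_t *py, uint8_t *pu,
                                  uint8_t *pv, int width, int height)
{
    yuv2rgb_fun conv = yuv2rgb_init_mmxext(32, MODE_RGB);
    if (!conv)
        conv = yuv2rgb_init_mmx(32, MODE_RGB); // returns the C path if no MMX
    if (conv)
        conv(image, py, pu, pv, width, height,
             width * 4,   // rgb_stride: 4 bytes per ARGB pixel
             width,       // y_stride
             width / 2,   // uv_stride: chroma is horizontally subsampled
             1);          // alphaones: fill alpha bytes with 0xff
}
#endif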

#define SCALE_BITS 10

#define C_Y (76309 >> (16 - SCALE_BITS))
#define C_RV (117504 >> (16 - SCALE_BITS))
#define C_BU (138453 >> (16 - SCALE_BITS))
#define C_GU (13954 >> (16 - SCALE_BITS))
#define C_GV (34903 >> (16 - SCALE_BITS))

#if defined(ANDROID)
#undef UCHAR_MAX
#define UCHAR_MAX 0xff
#endif
#if defined(__FreeBSD__)
// HACK: this is actually only needed on AMD64 at the moment,
// but it doesn't hurt the other architectures.
#undef UCHAR_MAX
#define UCHAR_MAX (int)__UCHAR_MAX
#endif

#define RGBOUT(r, g, b, y1)\
{\
    y = ((y1) - 16) * C_Y;\
    (r) = std::min(UCHAR_MAX, std::max(0, (y + r_add) >> SCALE_BITS));\
    (g) = std::min(UCHAR_MAX, std::max(0, (y + g_add) >> SCALE_BITS));\
    (b) = std::min(UCHAR_MAX, std::max(0, (y + b_add) >> SCALE_BITS));\
}
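
/* Worked example (with SCALE_BITS == 10): for peak white, y1 = 235 and
 * cb == cr == 128, so r_add, g_add and b_add are just the rounding term
 * 1 << (SCALE_BITS - 1) = 512.  Then
 *     y = (235 - 16) * C_Y = 219 * 1192 = 261048
 *     (261048 + 512) >> 10 = 255
 * landing exactly on the clamp limit, as expected when expanding
 * limited-range (16..235) luma to full-range (0..255) RGB.
 */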

static void yuv420_argb32_non_mmx(unsigned char *image, unsigned char *py,
                                  unsigned char *pu, unsigned char *pv,
                                  int h_size, int v_size, int rgb_stride,
                                  int y_stride, int uv_stride, int alphaones)
{
    unsigned char *y1_ptr, *cb_ptr, *cr_ptr, *d, *d1, *d2;
    int w, y, cb, cr, r_add, g_add, b_add, width2;
    int dstwidth;

// byte indices
#if HAVE_BIGENDIAN
#define R_OI 1
#define G_OI 2
#define B_OI 3
#define A_OI 0
#else
#define R_OI 2
#define G_OI 1
#define B_OI 0
#define A_OI 3
#endif

    // squelch a warning
    (void) rgb_stride; (void) y_stride; (void) uv_stride;

    d = image;
    y1_ptr = py;
    cb_ptr = pu;
    cr_ptr = pv;
    dstwidth = h_size * 4;
    width2 = h_size / 2;

    for (; v_size > 0; v_size -= 2) {
        d1 = d;
        d2 = d + h_size * 4;
        unsigned char *y2_ptr = y1_ptr + h_size;
        for (w = width2; w > 0; w--) {
            cb = cb_ptr[0] - 128;
            cr = cr_ptr[0] - 128;
            r_add = C_RV * cr + (1 << (SCALE_BITS - 1));
            g_add = - C_GU * cb - C_GV * cr + (1 << (SCALE_BITS - 1));
            b_add = C_BU * cb + (1 << (SCALE_BITS - 1));

            /* output 4 pixels */
            RGBOUT(d1[R_OI], d1[G_OI], d1[B_OI], y1_ptr[0]);
            RGBOUT(d1[R_OI+4], d1[G_OI+4], d1[B_OI+4], y1_ptr[1]);
            RGBOUT(d2[R_OI], d2[G_OI], d2[B_OI], y2_ptr[0]);
            RGBOUT(d2[R_OI+4], d2[G_OI+4], d2[B_OI+4], y2_ptr[1]);

            if (alphaones)
                d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0xff;
            else
                d1[A_OI] = d1[A_OI+4] = d2[A_OI] = d2[A_OI+4] = 0;

            d1 += 8;
            d2 += 8;
            y1_ptr += 2;
            y2_ptr += 2;
            cb_ptr++;
            cr_ptr++;
        }
        d += 2 * dstwidth;
        y1_ptr += h_size;
    }
}

#define SCALEBITS 8
#define ONE_HALF (1 << (SCALEBITS - 1))
#define FIX(x) (lroundf((x) * (1L<<SCALEBITS)))
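
/* With SCALEBITS == 8, FIX() turns the BT.601 luma weights into small
 * integers: FIX(0.29900) = 77, FIX(0.58700) = 150, FIX(0.11400) = 29,
 * and 77 + 150 + 29 == 256 == 1 << SCALEBITS, so the weighted sums
 * below stay in range before the final shift.
 */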

/** \brief Convert packed RGB32 to planar YUV 4:2:0 plus a separate
 *         alpha plane.
 */
void rgb32_to_yuv420p(unsigned char *lum, unsigned char *cb, unsigned char *cr,
                      unsigned char *alpha, unsigned char *src,
                      int width, int height, int srcwidth)
{
    int wrap, wrap4, x, y;
    int r, g, b, r1, g1, b1;
    unsigned char *p;

// byte indices
#if HAVE_BIGENDIAN
#define R_II 3
#define G_II 2
#define B_II 1
#define A_II 0
#else
#define R_II 0
#define G_II 1
#define B_II 2
#define A_II 3
#endif

    wrap = (width + 1) & ~1;
    wrap4 = srcwidth * 4;
    p = src;
    for (y = 0; y + 1 < height; y += 2) {
        for (x = 0; x + 1 < width; x += 2) {
            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 = r;
            g1 = g;
            b1 = b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            r = p[R_II+4];
            g = p[G_II+4];
            b = p[B_II+4];
            r1 += r;
            g1 += g;
            b1 += b;
            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[1] = p[A_II+4];

            p += wrap4;
            lum += wrap;
            alpha += wrap;

            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 += r;
            g1 += g;
            b1 += b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            r = p[R_II+4];
            g = p[G_II+4];
            b = p[B_II+4];
            r1 += r;
            g1 += g;
            b1 += b;
            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[1] = p[A_II+4];

            cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
                      FIX(0.50000) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) +
                    128;
            cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
                      FIX(0.08131) * b1 + 4 * ONE_HALF - 1) >> (SCALEBITS + 2)) +
                    128;

            cb++;
            cr++;
            p += -wrap4 + 2 * 4;
            lum += -wrap + 2;
            alpha += -wrap + 2;
        }
        if (width & 1) {
            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 = r;
            g1 = g;
            b1 = b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            lum[1] = 16;
            alpha[1] = 0;

            p += wrap4;
            lum += wrap;
            alpha += wrap;

            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 += r;
            g1 += g;
            b1 += b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            lum[1] = 16;
            alpha[1] = 0;

            cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
                      FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
                    128;
            cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
                      FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
                    128;

            cb++;
            cr++;
            p += -wrap4 + 4;
            lum += -wrap + 2;
            alpha += -wrap + 2;
        }
        p += wrap4 * 2 - width * 4;
        lum += wrap;
        alpha += wrap;
    }
    if (height & 1) {
        for (x = 0; x + 1 < width; x += 2) {
            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 = r;
            g1 = g;
            b1 = b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            r = p[R_II+4];
            g = p[G_II+4];
            b = p[B_II+4];
            r1 += r;
            g1 += g;
            b1 += b;
            lum[1] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[1] = p[A_II+4];

            lum += wrap;
            alpha += wrap;

            lum[0] = 16;
            alpha[0] = 0;

            lum[1] = 16;
            alpha[1] = 0;

            cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
                      FIX(0.50000) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
                    128;
            cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
                      FIX(0.08131) * b1 + 2 * ONE_HALF - 1) >> (SCALEBITS + 1)) +
                    128;

            cb++;
            cr++;
            p += 2 * 4;
            lum += -wrap + 2;
            alpha += -wrap + 2;
        }
        if (width & 1) {
            r = p[R_II];
            g = p[G_II];
            b = p[B_II];
            r1 = r;
            g1 = g;
            b1 = b;
            lum[0] = (FIX(0.29900) * r + FIX(0.58700) * g +
                      FIX(0.11400) * b + ONE_HALF) >> SCALEBITS;
            alpha[0] = p[A_II];

            lum[1] = 16;
            alpha[1] = 0;

            lum += wrap;
            alpha += wrap;

            lum[0] = 16;
            alpha[0] = 0;

            lum[1] = 16;
            alpha[1] = 0;

            cr[0] = ((- FIX(0.16874) * r1 - FIX(0.33126) * g1 +
                      FIX(0.50000) * b1 + ONE_HALF - 1) >> SCALEBITS) +
                    128;
            cb[0] = ((FIX(0.50000) * r1 - FIX(0.41869) * g1 -
                      FIX(0.08131) * b1 + ONE_HALF - 1) >> SCALEBITS) +
                    128;

#if 0 // no point in updating after the last pixel
            cb++;
            cr++;
            p += 4;
            lum += -wrap + 2;
            alpha += -wrap + 2;
#endif
        }
    }
}

/* I420 to 2VUY colorspace conversion routines.
 *
 * In the early days of the OS X port of MythTV, Paul Jara noticed that
 * QuickTime spent a lot of time converting from YUV420 to YUV422.
 * He found some sample code on the Ars Technica forum by a
 * Frenchman called Titer which used Altivec to speed this up.
 * Jeremiah Morris took that code and added it into MythTV.
 *
 * All was well until the Intel Macs came along, which seemed to crash
 * when fed YUV420 from MythTV.
 *
 * Fortunately, Mino Taoyama has provided an MMX optimised version too.
 */
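
/* 2VUY packs each pair of horizontally adjacent pixels into four bytes,
 * [Cb, Y0, Cr, Y1]; converting from 4:2:0 therefore reuses each chroma
 * sample for two output rows, as the routines below do.
 */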

/** \brief Plain C I420 to 2VUY conversion function. */
static void non_vec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    const uint8_t *py1;
    const uint8_t *py2;
    const uint8_t *pu1;
    const uint8_t *pv1;
    int x, y;

    for (y = 0; y < (v_size>>1); y++)
    {
        uint8_t *pi1 = image + 2*y * vuy_stride;
        uint8_t *pi2 = image + 2*y * vuy_stride + vuy_stride;
        py1 = py + 2*y * y_stride;
        py2 = py + 2*y * y_stride + y_stride;
        pu1 = pu + y * u_stride;
        pv1 = pv + y * v_stride;

        for (x = 0; x < (h_size>>1); x++)
        {
            pi1[4*x+0] = pu1[1*x+0];
            pi2[4*x+0] = pu1[1*x+0];
            pi1[4*x+1] = py1[2*x+0];
            pi2[4*x+1] = py2[2*x+0];
            pi1[4*x+2] = pv1[1*x+0];
            pi2[4*x+2] = pv1[1*x+0];
            pi1[4*x+3] = py1[2*x+1];
            pi2[4*x+3] = py2[2*x+1];
        }
    }
}

#if HAVE_MMX

/** \brief MMX I420 to 2VUY conversion function. */
static void mmx_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    int x,y;

    if ((h_size % 16) || (v_size % 2))
    {
        non_vec_i420_2vuy(image, vuy_stride,
                          py, pu, pv, y_stride, u_stride, v_stride,
                          h_size, v_size);
        return;
    }

    emms();

    for (y = 0; y < (v_size>>1); y++)
    {
        uint8_t *pi1 = image + 2*y * vuy_stride;
        uint8_t *pi2 = image + 2*y * vuy_stride + vuy_stride;
        const uint8_t *py1 = py + 2*y * y_stride;
        const uint8_t *py2 = py + 2*y * y_stride + y_stride;
        const uint8_t *pu1 = pu + y * u_stride;
        const uint8_t *pv1 = pv + y * v_stride;

        for (x = 0; x < h_size / 16; x++)
        {
            movq_m2r (*py1, mm0);     // y data
            movq_m2r (*py2, mm1);     // y data
            movq_m2r (*pu1, mm2);     // u data
            movq_m2r (*pv1, mm3);     // v data

            movq_r2r (mm2, mm4);      // Copy U

            punpcklbw_r2r (mm3, mm2); // Combine low U & V   mm2 = uv low
            punpckhbw_r2r (mm3, mm4); // Combine high U & V  mm4 = uv high

            movq_r2r (mm2, mm5);      // Copy low UV  mm5 = uv low
            movq_r2r (mm2, mm6);      // Copy low UV  mm6 = uv low
            punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv low
            punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv low

            movntq_r2m (mm5, *(pi1));
            movntq_r2m (mm6, *(pi1+8));

            movq_r2r (mm2, mm5);      // Copy low UV  mm5 = uv low
            movq_r2r (mm2, mm6);      // Copy low UV  mm6 = uv low
            punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv low
            punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv low

            movntq_r2m (mm5, *(pi2));
            movntq_r2m (mm6, *(pi2+8));

            movq_m2r (*(py1+8), mm0); // y data
            movq_m2r (*(py2+8), mm1); // y data

            movq_r2r (mm4, mm5);      // Copy high UV  mm5 = uv high
            movq_r2r (mm4, mm6);      // Copy high UV  mm6 = uv high
            punpcklbw_r2r (mm0, mm5); // mm5 = y1 low uv high
            punpckhbw_r2r (mm0, mm6); // mm6 = y1 high uv high

            movntq_r2m (mm5, *(pi1+16));
            movntq_r2m (mm6, *(pi1+24));

            movq_r2r (mm4, mm5);      // Copy high UV  mm5 = uv high
            movq_r2r (mm4, mm6);      // Copy high UV  mm6 = uv high
            punpcklbw_r2r (mm1, mm5); // mm5 = y2 low uv high
            punpckhbw_r2r (mm1, mm6); // mm6 = y2 high uv high

            movntq_r2m (mm5, *(pi2+16));
            movntq_r2m (mm6, *(pi2+24));

            pi1 += 32;
            pi2 += 32;
            py1 += 16;
            py2 += 16;
            pu1 += 8;
            pv1 += 8;
        }
    }

    emms();
}

#endif // HAVE_MMX

#if HAVE_ALTIVEC

// Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)

#define VEC_NEXT_LINES() \
    pi1 = pi2; \
    pi2 += h_size * 2; \
    py1 = py2; \
    py2 += h_size;

#define VEC_LOAD_UV() \
    u_vec = vec_ld(0, pu); pu += 16; \
    v_vec = vec_ld(0, pv); pv += 16;

#define VEC_MERGE(a) \
    uv_vec = a(u_vec, v_vec); \
    y_vec = vec_ld(0, py1); py1 += 16; \
    vec_st(vec_mergeh(uv_vec, y_vec), 0, pi1); pi1 += 16; \
    vec_st(vec_mergel(uv_vec, y_vec), 0, pi1); pi1 += 16; \
    y_vec = vec_ld(0, py2); py2 += 16; \
    vec_st(vec_mergeh(uv_vec, y_vec), 0, pi2); pi2 += 16; \
    vec_st(vec_mergel(uv_vec, y_vec), 0, pi2); pi2 += 16;

/** \brief AltiVec I420 to 2VUY conversion function. */
static void altivec_i420_2vuy(
    uint8_t *image, int vuy_stride,
    const uint8_t *py, const uint8_t *pu, const uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    int h_size, int v_size)
{
    uint8_t *pi1, *pi2 = image;
    const uint8_t *py1;
    const uint8_t *py2 = py;

    int x, y;

    vector unsigned char u_vec;
    vector unsigned char v_vec;
    vector unsigned char uv_vec;
    vector unsigned char y_vec;

    int vuy_extra = vuy_stride - (h_size<<1);
    int y_extra = y_stride - (h_size);
    int u_extra = u_stride - (h_size>>1);
    int v_extra = v_stride - (h_size>>1);

    if (vuy_extra || y_extra || u_extra || v_extra)
    {
        // Fall back to C version
        non_vec_i420_2vuy(image, vuy_stride,
                          py, pu, pv,
                          y_stride, u_stride, v_stride,
                          h_size, v_size);
        return;
    }

    if (!((h_size % 32) || (v_size % 2)))
    {
        // Width is a multiple of 32, process 2 lines at a time
        for (y = v_size / 2; y--; )
        {
            VEC_NEXT_LINES();
            for (x = h_size / 32; x--; )
            {
                VEC_LOAD_UV();
                VEC_MERGE(vec_mergeh);
                VEC_MERGE(vec_mergel);
            }
        }
    }
    else if (!((h_size % 16) || (v_size % 4)))
    {
        // Width is a multiple of 16, process 4 lines at a time
        for (y = v_size / 4; y--; )
        {
            // Lines 1-2, pixels 0 to (width - 16)
            VEC_NEXT_LINES();
            for (x = h_size / 32; x--; )
            {
                VEC_LOAD_UV();
                VEC_MERGE(vec_mergeh);
                VEC_MERGE(vec_mergel);
            }

            // Lines 1-2, pixels (width - 16) to width
            VEC_LOAD_UV();
            VEC_MERGE(vec_mergeh);

            // Lines 3-4, pixels 0-16
            VEC_NEXT_LINES();
            VEC_MERGE(vec_mergel);

            // Lines 3-4, pixels 16 to width
            for (x = h_size / 32; x--; )
            {
                VEC_LOAD_UV();
                VEC_MERGE(vec_mergeh);
                VEC_MERGE(vec_mergel);
            }
        }
    }
    else
    {
        // Fall back to C version
        non_vec_i420_2vuy(image, vuy_stride,
                          py, pu, pv,
                          y_stride, u_stride, v_stride,
                          h_size, v_size);
    }
}

#endif // HAVE_ALTIVEC

/** \brief Returns I420 to 2VUY conversion function. */
conv_i420_2vuy_fun get_i420_2vuy_conv(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return altivec_i420_2vuy;
#endif
#if HAVE_MMX
    return mmx_i420_2vuy;
#else
    return non_vec_i420_2vuy; /* Fallback to C */
#endif
}

/** \brief Plain C 2VUY to I420 conversion routine. */
static void non_vec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    const uint8_t *pi1;
    const uint8_t *pi2;
    int x, y;

    for (y = 0; y < (v_size>>1); y++)
    {
        uint8_t *py1, *py2, *pu1, *pv1;

        pi1 = image + 2*y * vuy_stride;
        pi2 = image + 2*y * vuy_stride + vuy_stride;
        py1 = py + 2*y * y_stride;
        py2 = py + 2*y * y_stride + y_stride;
        pu1 = pu + y * u_stride;
        pv1 = pv + y * v_stride;

        for (x = 0; x < (h_size>>1); x++)
        {
            pu1[1*x+0] = (pi1[4*x+0] + pi2[4*x+0]) >> 1;
            py1[2*x+0] = pi1[4*x+1];
            py2[2*x+0] = pi2[4*x+1];
            pv1[1*x+0] = (pi1[4*x+2] + pi2[4*x+2]) >> 1;
            py1[2*x+1] = pi1[4*x+3];
            py2[2*x+1] = pi2[4*x+3];
        }
    }
}

#if HAVE_ALTIVEC

// Altivec code adapted from VLC's i420_yuv2.c (thanks to Titer and Paul Jara)

#define VEC_READ_LINE(ptr, y, uv) \
    pa_vec = vec_ld(0, ptr); ptr += 16; \
    pb_vec = vec_ld(0, ptr); ptr += 16; \
    vec_st(vec_pack((vector unsigned short)pa_vec, \
                    (vector unsigned short)pb_vec), \
           0, y); y += 16; \
    uv = vec_pack(vec_sr((vector unsigned short)pa_vec, eight_vec), \
                  vec_sr((vector unsigned short)pb_vec, eight_vec));

#define VEC_SPLIT(a) \
    VEC_READ_LINE(pi1, py1, uv1_vec); \
    VEC_READ_LINE(pi2, py2, uv2_vec); \
    a = vec_avg(uv1_vec, uv2_vec);

#define VEC_STORE_UV() \
    vec_st(vec_pack((vector unsigned short)uva_vec, \
                    (vector unsigned short)uvb_vec), \
           0, pv); pv += 16; \
    vec_st(vec_pack(vec_sr((vector unsigned short)uva_vec, eight_vec), \
                    vec_sr((vector unsigned short)uvb_vec, eight_vec)), \
           0, pu); pu += 16;

/** \brief AltiVec 2VUY to I420 conversion routine. */
static void altivec_2vuy_i420(
    uint8_t *py, uint8_t *pu, uint8_t *pv,
    int y_stride, int u_stride, int v_stride,
    const uint8_t *image, int vuy_stride,
    int h_size, int v_size)
{
    const uint8_t *pi1;
    const uint8_t *pi2 = image;
    uint8_t *py1, *py2 = py;

    int x, y;

    vector unsigned short eight_vec = vec_splat_u16(8);
    vector unsigned char pa_vec, pb_vec,
                         uv1_vec, uv2_vec,
                         uva_vec, uvb_vec;

    int vuy_extra = vuy_stride - (h_size<<1);
    int y_extra = y_stride - (h_size);
    int u_extra = u_stride - (h_size>>1);
    int v_extra = v_stride - (h_size>>1);

    if (vuy_extra || y_extra || u_extra || v_extra)
    {
        // Fall back to C version
        non_vec_2vuy_i420(py, pu, pv,
                          y_stride, u_stride, v_stride,
                          image, vuy_stride,
                          h_size, v_size);
        return;
    }

    if (!((h_size % 32) || (v_size % 2)))
    {
        // Width is a multiple of 32, process 2 lines at a time
        for (y = v_size / 2; y--; )
        {
            VEC_NEXT_LINES();
            for (x = h_size / 32; x--; )
            {
                VEC_SPLIT(uva_vec);
                VEC_SPLIT(uvb_vec);
                VEC_STORE_UV();
            }
        }
    }
    else if (!((h_size % 16) || (v_size % 4)))
    {
        // Width is a multiple of 16, process 4 lines at a time
        for (y = v_size / 4; y--; )
        {
            // Lines 1-2, pixels 0 to (width - 16)
            VEC_NEXT_LINES();
            for (x = h_size / 32; x--; )
            {
                VEC_SPLIT(uva_vec);
                VEC_SPLIT(uvb_vec);
                VEC_STORE_UV();
            }

            // Lines 1-2, pixels (width - 16) to width
            VEC_SPLIT(uva_vec);

            // Lines 3-4, pixels 0-16
            VEC_NEXT_LINES();
            VEC_SPLIT(uvb_vec);
            VEC_STORE_UV();

            // Lines 3-4, pixels 16 to width
            for (x = h_size / 32; x--; )
            {
                VEC_SPLIT(uva_vec);
                VEC_SPLIT(uvb_vec);
                VEC_STORE_UV();
            }
        }
    }
    else
    {
        // Fall back to C version
        non_vec_2vuy_i420(py, pu, pv,
                          y_stride, u_stride, v_stride,
                          image, vuy_stride,
                          h_size, v_size);
    }
}

#endif // HAVE_ALTIVEC

/** \brief Returns 2VUY to I420 conversion function. */
conv_2vuy_i420_fun get_2vuy_i420_conv(void)
{
#if HAVE_ALTIVEC
    if (has_altivec())
        return altivec_2vuy_i420;
#endif
    return non_vec_2vuy_i420; /* Fallback to C */
}