// MythTV (master) -- RTjpegN.cpp
// Implementation of the RTjpeg real-time JPEG-like codec.
1 /*
2  RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3 
4  With modifications by:
5  (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6  and
7  (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation; either version 2 of the License, or
12  (at your option) any later version.
13 
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 */
24 
25 #include <algorithm>
26 #include <array>
27 #include <cstdio>
28 #include <cstdlib>
29 #include <cstring>
30 #include <limits> // workaround QTBUG-90395
31 #include "RTjpegN.h"
32 
33 #include <QtGlobal>
34 #include <QtEndian>
35 
36 #ifdef MMX
// Constant operands used by the MMX DCT/quantisation kernels below.
// Declared here and filled in elsewhere (presumably by the codec's init
// path, outside this view -- TODO confirm) before the MMX routines run.
static mmx_t RTjpeg_ones;
static mmx_t RTjpeg_half;
static mmx_t RTjpeg_C4;
static mmx_t RTjpeg_C6;
static mmx_t RTjpeg_C2mC6;
static mmx_t RTjpeg_C2pC6;
static mmx_t RTjpeg_zero;
44 #endif
45 
46 //#define SHOWBLOCK 1 // NOLINT(cppcoreguidelines-macro-usage)
47 #define BETTERCOMPRESSION 1 // NOLINT(cppcoreguidelines-macro-usage)
48 
// Zig-zag scan order for an 8x8 block: RTjpeg_ZZ[i] is the raster-order
// index of the i-th coefficient when walking the block diagonally from
// the DC term (index 0) to the highest-frequency term (index 63).
static const std::array<const uint8_t,64> RTjpeg_ZZ {
0,
8, 1,
2, 9, 16,
24, 17, 10, 3,
4, 11, 18, 25, 32,
40, 33, 26, 19, 12, 5,
6, 13, 20, 27, 34, 41, 48,
56, 49, 42, 35, 28, 21, 14, 7,
15, 22, 29, 36, 43, 50, 57,
58, 51, 44, 37, 30, 23,
31, 38, 45, 52, 59,
60, 53, 46, 39,
47, 54, 61,
62, 55,
63 };
65 
// Per-coefficient scale factors for the AAN (Arai/Agui/Nakajima) DCT,
// stored as fixed point multiplied by 2^32 (the first entry, 4294967296,
// is exactly 1.0).  The quantisation tables are divided by these values
// during initialisation ("(m_lqt[i] << 32) / RTjpeg_aan_tab[i]" below),
// folding the DCT post-scaling into the quantiser.
static const std::array<const uint64_t,64> RTjpeg_aan_tab {
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
};
76 
// Base luminance quantisation table (the example luminance table from
// the JPEG standard, ITU-T T.81 Annex K), in raster order.
static const std::array<const uint8_t,64> RTjpeg_lum_quant_tbl {
 16, 11, 10, 16, 24, 40, 51, 61,
 12, 12, 14, 19, 26, 58, 60, 55,
 14, 13, 16, 24, 40, 57, 69, 56,
 14, 17, 22, 29, 51, 87, 80, 62,
 18, 22, 37, 56, 68, 109, 103, 77,
 24, 35, 55, 64, 81, 104, 113, 92,
 49, 64, 78, 87, 103, 121, 120, 101,
 72, 92, 95, 98, 112, 100, 103, 99
 };
87 
// Base chrominance quantisation table (the example chrominance table
// from the JPEG standard, ITU-T T.81 Annex K), in raster order.
static const std::array<const uint8_t,64> RTjpeg_chrom_quant_tbl {
 17, 18, 24, 47, 99, 99, 99, 99,
 18, 21, 26, 66, 99, 99, 99, 99,
 24, 26, 56, 99, 99, 99, 99, 99,
 47, 66, 99, 99, 99, 99, 99, 99,
 99, 99, 99, 99, 99, 99, 99, 99,
 99, 99, 99, 99, 99, 99, 99, 99,
 99, 99, 99, 99, 99, 99, 99, 99,
 99, 99, 99, 99, 99, 99, 99, 99
 };
98 
99 #ifdef BETTERCOMPRESSION
100 
101 /*--------------------------------------------------*/
102 /* better encoding, but needs a lot more cpu time */
103 /* seems to be more effective than old method +lzo */
104 /* with this encoding lzo isn't efficient anymore */
105 /* there is still more potential for better */
106 /* encoding but that would need even more cputime */
107 /* anyway your mileage may vary */
108 /* */
109 /* written by Martin BIELY and Roman HOCHLEITNER */
110 /*--------------------------------------------------*/
111 
112 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
113 /* Block to Stream (encoding) */
114 /* */
115 
116 int RTjpeg::b2s(const RTjpegData16 &data, int8_t *strm, uint8_t /*bt8*/)
117 {
118  int co=1;
119 
120  auto *ustrm = (uint8_t *)strm;
121 #ifdef SHOWBLOCK
122 
123  int ii;
124  for (ii=0; ii < 64; ii++) {
125  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
126  }
127  fprintf(stdout, "\n\n");
128 
129 #endif
130 
131 // *strm++ = 0x10;
132 // *strm = 0x00;
133 //
134 // return 2;
135 
136  // first byte allways written
137  ustrm[0]=
138  (uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
139 
140 
141  int ci=63;
142  while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
143 
144  unsigned char bitten = ((unsigned char)ci) << 2;
145 
146  if (ci==0) {
147  ustrm[1]= bitten;
148  co = 2;
149  return co;
150  }
151 
152  /* bitoff=0 because the high 6bit contain first non zero position */
153  unsigned char bitoff = 0;
154  co = 1;
155 
156  for(; ci>0; ci--) {
157 
158  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
159 
160  switch(ZZvalue) {
161  case 0:
162  break;
163  case 1:
164  bitten |= (0x01<<bitoff);
165  break;
166  case -1:
167  bitten |= (0x03<<bitoff);
168  break;
169  default:
170  bitten |= (0x02<<bitoff);
171  goto HERZWEH;
172  break;
173  }
174 
175  if ( bitoff == 0 ) {
176  ustrm[co]= bitten;
177  bitten = 0;
178  bitoff = 8;
179  co++;
180  } /* "fall through" */
181  bitoff-=2;
182 
183  }
184 
185  /* ci must be 0 */
186  if (bitoff != 6) {
187 
188  ustrm[co]= bitten;
189  co++;
190 
191  }
192  goto BAUCHWEH;
193 
194 HERZWEH:
195 /* ci cannot be 0 */
196 /* correct bitoff to nibble boundaries */
197 
198  switch(bitoff){
199  case 4:
200  case 6:
201  bitoff = 0;
202  break;
203  case 2:
204  case 0:
205  ustrm[co]= bitten;
206  bitoff = 4;
207  co++;
208  bitten = 0; // clear half nibble values in bitten
209  break;
210  default:
211  break;
212  }
213 
214  for(; ci>0; ci--) {
215 
216  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
217 
218  if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
219  bitten |= (0x08<<bitoff);
220  goto HIRNWEH;
221  }
222 
223  bitten |= (ZZvalue&0xf)<<bitoff;
224 
225  if ( bitoff == 0 ) {
226  ustrm[co]= bitten;
227  bitten = 0;
228  bitoff = 8;
229  co++;
230  } /* "fall thru" */
231  bitoff-=4;
232  }
233 
234  /* ci must be 0 */
235  if ( bitoff == 0 ) {
236  ustrm[co]= bitten;
237  co++;
238  }
239  goto BAUCHWEH;
240 
241 HIRNWEH:
242 
243  ustrm[co]= bitten;
244  co++;
245 
246 
247  /* bitting is over now we bite */
248  for(; ci>0; ci--) {
249 
250  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
251 
252  if (ZZvalue>0)
253  {
254  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
255  }
256  else
257  {
258  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
259  }
260 
261  }
262 
263 
264 BAUCHWEH:
265  /* we gotoo much now we are ill */
266 #ifdef SHOWBLOCK
267 {
268 int i;
269 fprintf(stdout, "\nco = '%d'\n", co);
270  for (i=0; i < co+2; i++) {
271  fprintf(stdout, "%d ", strm[i]);
272  }
273 fprintf(stdout, "\n\n");
274 }
275 #endif
276 
277  return co;
278 }
279 
280 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
281 /* Stream to Block (decoding) */
282 /* */
283 
/*
 * Stream to Block (decoding) -- the inverse of b2s() above.
 *
 * Rebuilds one 8x8 block of DCT coefficients from the byte stream,
 * dequantising each value by the matching entry of 'qtbla' as it is
 * stored.  Stream layout (see b2s()): DC byte; last-non-zero zig-zag
 * position in the high 6 bits of byte 1; then 2-bit codes with escapes
 * to 4-bit nibbles and finally to whole bytes.
 * Returns the number of stream bytes consumed.  'bt8' is unused here.
 */
int RTjpeg::s2b(RTjpegData16 &data, const int8_t *strm, uint8_t /*bt8*/, RTjpegData32 &qtbla)
{
    auto *qtbl = (uint32_t *)qtbla.data();
    int ci = 0;
    unsigned char bitoff = 0;

    /* first byte always read: the DC coefficient, dequantised */
    int i=RTjpeg_ZZ[0];
    data[i]=((uint8_t)strm[0])*qtbl[i];

    /* we start at the back (highest zig-zag index) and work downwards */

    // High 6 bits of byte 1 give the last non-zero position; every
    // coefficient beyond it in zig-zag order is zero.
    unsigned char bitten = ((unsigned char)strm[1]) >> 2;
    int co = 63;
    for(; co > bitten; co--) {

        data[RTjpeg_ZZ[co]] = 0;

    }

    if (co==0) {
        // All AC coefficients were zero; only two bytes were used.
        ci = 2;
        goto AUTOBAHN;
    }

    /* we have to read the last 2 bits of the second byte */
    ci=1;
    bitoff = 0;

    // Phase 1: 2-bit codes (00 = 0, 01 = +1, 11 = -1, 10 = escape).
    for(; co>0; co--) {

        bitten = ((unsigned char)strm[ci]) >> bitoff;
        bitten &= 0x03;

        i=RTjpeg_ZZ[co];

        switch( bitten ) {
        case 0x03:
            data[i]= -qtbl[i];
            break;
        case 0x02:
            goto FUSSWEG; // escape: continue with 4-bit nibbles
            break;
        case 0x01:
            data[i]= qtbl[i];
            break;
        case 0x00:
            data[i]= 0;
            break;
        default:
            break;
        }

        if ( bitoff == 0 ) {
            bitoff = 8;
            ci++;
        }
        bitoff -= 2;
    }
    /* co is 0 now */
    /* data is written properly */

    /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
    if (bitoff!=6) ci++;

    goto AUTOBAHN;


FUSSWEG:
/* correct bitoff to nibble */
    switch(bitoff){
    case 4:
    case 6:
        bitoff = 0;
        break;
    case 2:
    case 0:
        /* we have to read from the next byte */
        ci++;
        bitoff = 4;
        break;
    default:
        break;
    }

    // Phase 2: signed 4-bit nibbles (1000 = escape to whole bytes).
    for(; co>0; co--) {

        bitten = ((unsigned char)strm[ci]) >> bitoff;
        bitten &= 0x0f;

        i=RTjpeg_ZZ[co];

        if ( bitten == 0x08 ) {
            goto STRASSE;
        }

        /* the compiler cannot do sign extension for signed nibbles */
        if ( bitten & 0x08 ) {
            bitten |= 0xf0;
        }
        /* the unsigned char bitten now is a valid signed char */

        data[i]=((signed char)bitten)*qtbl[i];

        if ( bitoff == 0 ) {
            bitoff = 8;
            ci++;
        }
        bitoff -= 4;
    }
    /* co is 0 */

    /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
    if (bitoff!=4) ci++;

    goto AUTOBAHN;

STRASSE:
    ci++;

    // Phase 3: remaining coefficients stored as whole signed bytes.
    for(; co>0; co--) {
        i=RTjpeg_ZZ[co];
        data[i]=strm[ci++]*qtbl[i];
    }

    /* ci now is the count, because it points to next element => no incrementing */

AUTOBAHN:

#ifdef SHOWBLOCK
fprintf(stdout, "\nci = '%d'\n", ci);
    for (i=0; i < 64; i++) {
        fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
    }
fprintf(stdout, "\n\n");
#endif

    return ci;
}
423 
424 #else
425 
426 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
427 {
428  register int ci, co=1, tmp;
429  register int16_t ZZvalue;
430 
431 #ifdef SHOWBLOCK
432 
433  int ii;
434  for (ii=0; ii < 64; ii++) {
435  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
436  }
437  fprintf(stdout, "\n\n");
438 
439 #endif
440 
441  (uint8_t)strm[0]=(uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
442 
443  for(ci=1; ci<=bt8; ci++)
444  {
445  ZZvalue = data[RTjpeg_ZZ[ci]];
446 
447  if (ZZvalue>0)
448  {
449  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
450  }
451  else
452  {
453  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
454  }
455  }
456 
457  for(; ci<64; ci++)
458  {
459  ZZvalue = data[RTjpeg_ZZ[ci]];
460 
461  if (ZZvalue>0)
462  {
463  strm[co++]=(int8_t)(ZZvalue>63)?63:ZZvalue;
464  }
465  else if (ZZvalue<0)
466  {
467  strm[co++]=(int8_t)(ZZvalue<-64)?-64:ZZvalue;
468  }
469  else /* compress zeros */
470  {
471  tmp=ci;
472  do
473  {
474  ci++;
475  } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
476 
477  strm[co++]=(int8_t)(63+(ci-tmp));
478  ci--;
479  }
480  }
481  return (int)co;
482 }
483 
484 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
485 {
486  uint32_t *qtbl = (uint32_t *)qtbla;
487  int ci=1, co=1, tmp;
488  register int i;
489 
490  i=RTjpeg_ZZ[0];
491  data[i]=((uint8_t)strm[0])*qtbl[i];
492 
493  for(co=1; co<=bt8; co++)
494  {
495  i=RTjpeg_ZZ[co];
496  data[i]=strm[ci++]*qtbl[i];
497  }
498 
499  for(; co<64; co++)
500  {
501  if (strm[ci]>63)
502  {
503  tmp=co+strm[ci]-63;
504  for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
505  co--;
506  } else
507  {
508  i=RTjpeg_ZZ[co];
509  data[i]=strm[ci]*qtbl[i];
510  }
511  ci++;
512  }
513  return (int)ci;
514 }
515 #endif
516 
517 #ifdef MMX
519 {
520  using P16_32 = union { int16_t *m_int16; int32_t *m_int32; };
521  P16_32 qtbl;
522 
523  qtbl.m_int32 = m_lqt.data();
524  for (int i = 0; i < 64; i++)
525  qtbl.m_int16[i] = static_cast<int16_t>(m_lqt[i]);
526 
527  // cppcheck-suppress redundantAssignment
528  qtbl.m_int32 = m_cqt.data();
529  for (int i = 0; i < 64; i++)
530  qtbl.m_int16[i] = static_cast<int16_t>(m_cqt[i]);
531 }
532 
534 {
535  auto *ql=(mmx_t *)qtbl.data();
536  auto *bl=(mmx_t *)_block.data();
537 
538  movq_m2r(RTjpeg_ones, mm6);
539  movq_m2r(RTjpeg_half, mm7);
540 
541  for(int i=16; i; i--)
542  {
543  movq_m2r(*(ql++), mm0); /* quant vals (4) */
544  movq_m2r(*bl, mm2); /* block vals (4) */
545  movq_r2r(mm0, mm1);
546  movq_r2r(mm2, mm3);
547 
548  punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
549  punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
550 
551  punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
552  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
553 
554  pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
555  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
556 
557  psrad_i2r(16, mm0);
558  psrad_i2r(16, mm1);
559 
560  packssdw_r2r(mm1, mm0);
561 
562  movq_r2m(mm0, *(bl++));
563  }
564 }
565 #else
// Scalar (non-MMX) build: no quantisation-table preprocessing is needed,
// so this is intentionally a no-op.  (The MMX build's version narrows the
// 32-bit tables to 16 bits in place for the SIMD multiplies.)
void RTjpeg::QuantInit()
{
}
569 
570 void RTjpeg::Quant(RTjpegData16 &_block, RTjpegData32 &qtbl)
571 {
572  int i;
573 
574  for(i=0; i<64; i++)
575  _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
576 }
577 #endif
578 
579 /*
580  * Perform the forward DCT on one block of samples.
581  */
582 #ifndef MMX
/* Fixed-point DCT constants, scaled by 2^8: FIX(x) = round(x * 256). */
static constexpr int32_t FIX_0_382683433 { 98 }; /* FIX(0.382683433) */
static constexpr int32_t FIX_0_541196100 { 139 }; /* FIX(0.541196100) */
static constexpr int32_t FIX_0_707106781 { 181 }; /* FIX(0.707106781) */
static constexpr int32_t FIX_1_306562965 { 334 }; /* FIX(1.306562965) */

/* Note: the stray ';' after each function body was removed (-Wextra-semi). */

/// Round and remove a 2^8 fixed-point scale.
static constexpr int16_t DESCALE10(int32_t x) { return static_cast<int16_t>((x+128) >> 8); }
/// Round and remove a 2^16 fixed-point scale.
static constexpr int16_t DESCALE20(int32_t x) { return static_cast<int16_t>((x+32768) >> 16); }
/// Multiply by a FIX() constant; the product carries the 2^8 scale.
static constexpr int32_t D_MULTIPLY(int32_t var, int32_t constant) { return var * constant; }
591 #endif
592 
594 {
595  for (int i = 0; i < 64; i++)
596  {
597  m_lqt[i] = (((uint64_t)m_lqt[i] << 32) / RTjpeg_aan_tab[i]);
598  m_cqt[i] = (((uint64_t)m_cqt[i] << 32) / RTjpeg_aan_tab[i]);
599  }
600 }
601 
602 void RTjpeg::DctY(uint8_t *idata, int rskip)
603 {
604 #ifndef MMX
605  uint8_t *idataptr = idata;
606  int32_t *wsptr = m_ws.data();
607 
608  for (int ctr = 7; ctr >= 0; ctr--) {
609  int32_t tmp0 = idataptr[0] + idataptr[7];
610  int32_t tmp7 = idataptr[0] - idataptr[7];
611  int32_t tmp1 = idataptr[1] + idataptr[6];
612  int32_t tmp6 = idataptr[1] - idataptr[6];
613  int32_t tmp2 = idataptr[2] + idataptr[5];
614  int32_t tmp5 = idataptr[2] - idataptr[5];
615  int32_t tmp3 = idataptr[3] + idataptr[4];
616  int32_t tmp4 = idataptr[3] - idataptr[4];
617 
618  int32_t tmp10 = (tmp0 + tmp3); /* phase 2 */
619  int32_t tmp13 = tmp0 - tmp3;
620  int32_t tmp11 = (tmp1 + tmp2);
621  int32_t tmp12 = tmp1 - tmp2;
622 
623  wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
624  wsptr[4] = (tmp10 - tmp11)<<8;
625 
626  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
627  wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
628  wsptr[6] = (tmp13<<8) - z1;
629 
630  tmp10 = tmp4 + tmp5; /* phase 2 */
631  tmp11 = tmp5 + tmp6;
632  tmp12 = tmp6 + tmp7;
633 
634  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
635  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
636  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
637  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
638 
639  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
640  int32_t z13 = (tmp7<<8) - z3;
641 
642  wsptr[5] = z13 + z2; /* phase 6 */
643  wsptr[3] = z13 - z2;
644  wsptr[1] = z11 + z4;
645  wsptr[7] = z11 - z4;
646 
647  idataptr += rskip<<3; /* advance pointer to next row */
648  wsptr += 8;
649  }
650 
651  wsptr = m_ws.data();
652  int16_t *odataptr = m_block.data();
653  for (int ctr = 7; ctr >= 0; ctr--) {
654  int32_t tmp0 = wsptr[0] + wsptr[56];
655  int32_t tmp7 = wsptr[0] - wsptr[56];
656  int32_t tmp1 = wsptr[8] + wsptr[48];
657  int32_t tmp6 = wsptr[8] - wsptr[48];
658  int32_t tmp2 = wsptr[16] + wsptr[40];
659  int32_t tmp5 = wsptr[16] - wsptr[40];
660  int32_t tmp3 = wsptr[24] + wsptr[32];
661  int32_t tmp4 = wsptr[24] - wsptr[32];
662 
663  int32_t tmp10 = tmp0 + tmp3; /* phase 2 */
664  int32_t tmp13 = tmp0 - tmp3;
665  int32_t tmp11 = tmp1 + tmp2;
666  int32_t tmp12 = tmp1 - tmp2;
667 
668  odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
669  odataptr[32] = DESCALE10(tmp10 - tmp11);
670 
671  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
672  odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
673  odataptr[48] = DESCALE20((tmp13<<8) - z1);
674 
675  tmp10 = tmp4 + tmp5; /* phase 2 */
676  tmp11 = tmp5 + tmp6;
677  tmp12 = tmp6 + tmp7;
678 
679  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
680  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
681  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
682  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
683 
684  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
685  int32_t z13 = (tmp7<<8) - z3;
686 
687  odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
688  odataptr[24] = DESCALE20(z13 - z2);
689  odataptr[8] = DESCALE20(z11 + z4);
690  odataptr[56] = DESCALE20(z11 - z4);
691 
692  odataptr++; /* advance pointer to next column */
693  wsptr++;
694 
695  }
696 #else
697  volatile mmx_t tmp6 {};
698  volatile mmx_t tmp7 {};
699  auto *dataptr = (mmx_t *)m_block.data();
700  auto *idata2 = (mmx_t *)idata;
701 
702 
703  // first copy the input 8 bit to the destination 16 bits
704 
705  movq_m2r(RTjpeg_zero, mm2);
706 
707  movq_m2r(*idata2, mm0);
708  movq_r2r(mm0, mm1);
709 
710  punpcklbw_r2r(mm2, mm0);
711  movq_r2m(mm0, *(dataptr));
712 
713  punpckhbw_r2r(mm2, mm1);
714  movq_r2m(mm1, *(dataptr+1));
715 
716  idata2 += rskip;
717 
718  movq_m2r(*idata2, mm0);
719  movq_r2r(mm0, mm1);
720 
721  punpcklbw_r2r(mm2, mm0);
722  movq_r2m(mm0, *(dataptr+2));
723 
724  punpckhbw_r2r(mm2, mm1);
725  movq_r2m(mm1, *(dataptr+3));
726 
727  idata2 += rskip;
728 
729  movq_m2r(*idata2, mm0);
730  movq_r2r(mm0, mm1);
731 
732  punpcklbw_r2r(mm2, mm0);
733  movq_r2m(mm0, *(dataptr+4));
734 
735  punpckhbw_r2r(mm2, mm1);
736  movq_r2m(mm1, *(dataptr+5));
737 
738  idata2 += rskip;
739 
740  movq_m2r(*idata2, mm0);
741  movq_r2r(mm0, mm1);
742 
743  punpcklbw_r2r(mm2, mm0);
744  movq_r2m(mm0, *(dataptr+6));
745 
746  punpckhbw_r2r(mm2, mm1);
747  movq_r2m(mm1, *(dataptr+7));
748 
749  idata2 += rskip;
750 
751  movq_m2r(*idata2, mm0);
752  movq_r2r(mm0, mm1);
753 
754  punpcklbw_r2r(mm2, mm0);
755  movq_r2m(mm0, *(dataptr+8));
756 
757  punpckhbw_r2r(mm2, mm1);
758  movq_r2m(mm1, *(dataptr+9));
759 
760  idata2 += rskip;
761 
762  movq_m2r(*idata2, mm0);
763  movq_r2r(mm0, mm1);
764 
765  punpcklbw_r2r(mm2, mm0);
766  movq_r2m(mm0, *(dataptr+10));
767 
768  punpckhbw_r2r(mm2, mm1);
769  movq_r2m(mm1, *(dataptr+11));
770 
771  idata2 += rskip;
772 
773  movq_m2r(*idata2, mm0);
774  movq_r2r(mm0, mm1);
775 
776  punpcklbw_r2r(mm2, mm0);
777  movq_r2m(mm0, *(dataptr+12));
778 
779  punpckhbw_r2r(mm2, mm1);
780  movq_r2m(mm1, *(dataptr+13));
781 
782  idata2 += rskip;
783 
784  movq_m2r(*idata2, mm0);
785  movq_r2r(mm0, mm1);
786 
787  punpcklbw_r2r(mm2, mm0);
788  movq_r2m(mm0, *(dataptr+14));
789 
790  punpckhbw_r2r(mm2, mm1);
791  movq_r2m(mm1, *(dataptr+15));
792 
793 /* Start Transpose to do calculations on rows */
794 
795  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
796 
797  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
798  movq_r2r(mm7, mm5);
799 
800  punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
801  movq_r2r(mm6, mm2);
802 
803  punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
804  movq_r2r(mm7, mm1);
805 
806  movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
807  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
808 
809  movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
810  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
811 
812  movq_r2m(mm7,*(dataptr+9)); // write result 1
813  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
814 
815  movq_r2m(mm1,*(dataptr+11)); // write result 2
816  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
817 
818  movq_r2r(mm5, mm1);
819  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
820 
821  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
822  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
823 
824  movq_r2m(mm5,*(dataptr+13)); // write result 3
825 
826  // last 4x4 done
827 
828  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
829 
830  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
831  movq_r2r(mm0, mm6);
832 
833  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
834  movq_r2r(mm2, mm7);
835 
836  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
837  movq_r2r(mm0, mm4);
838 
839  //
840  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
841  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
842 
843  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
844  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
845 
846  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
847  movq_r2r(mm1, mm2); // copy first line
848 
849  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
850  movq_r2r(mm6, mm5); // copy first intermediate result
851 
852  movq_r2m(mm0, *(dataptr+8)); // write result 1
853  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
854 
855  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
856  movq_r2r(mm3, mm0); // copy third line
857 
858  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
859 
860  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
861  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
862 
863  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
864  movq_r2r(mm1, mm4);
865 
866  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
867  punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
868 
869  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
870  movq_r2r(mm2, mm6);
871 
872  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
873  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
874 
875  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
876  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
877 
878  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
879  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
880 
881  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
882 
883  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
884 
885  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
886 
887 
888 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
889 
890  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
891  movq_r2r(mm0, mm2);
892 
893  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
894  movq_r2r(mm7, mm4);
895 
896  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
897  movq_r2r(mm0, mm1);
898 
899  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
900  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
901 
902  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
903  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
904 
905  movq_r2r(mm0, mm7); // write result 1
906  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
907 
908  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
909  movq_r2r(mm1, mm6); // write result 2
910 
911  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
912  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
913 
914  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
915  movq_r2r(mm2, mm3); // copy first intermediate result
916 
917  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
918  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
919 
920  movq_r2m(mm7, tmp7);
921  movq_r2r(mm2, mm5); // write result 3
922 
923  movq_r2m(mm6, tmp6);
924  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
925 
926  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
927  movq_r2r(mm3, mm4); // write result 4
928 
929 /************************************************************************************************
930  End of Transpose
931 ************************************************************************************************/
932 
933 
934  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
935  movq_r2r(mm0, mm7);
936 
937  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
938  movq_r2r(mm1, mm6);
939 
940  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
941  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
942 
943  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
944  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
945 
946  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
947  paddw_r2r(mm7, mm6); // tmp12 + tmp13
948 
949  /* stage 3 */
950 
951  movq_m2r(tmp6, mm2);
952  movq_r2r(mm0, mm3);
953 
954  psllw_i2r(2, mm6); // m8 * 2^2
955  paddw_r2r(mm1, mm0);
956 
957  pmulhw_m2r(RTjpeg_C4, mm6); // z1
958  psubw_r2r(mm1, mm3);
959 
960  movq_r2m(mm0, *dataptr);
961  movq_r2r(mm7, mm0);
962 
963  /* Odd part */
964  movq_r2m(mm3, *(dataptr+8));
965  paddw_r2r(mm5, mm4); // tmp10
966 
967  movq_m2r(tmp7, mm3);
968  paddw_r2r(mm6, mm0); // tmp32
969 
970  paddw_r2r(mm2, mm5); // tmp11
971  psubw_r2r(mm6, mm7); // tmp33
972 
973  movq_r2m(mm0, *(dataptr+4));
974  paddw_r2r(mm3, mm2); // tmp12
975 
976  /* stage 4 */
977 
978  movq_r2m(mm7, *(dataptr+12));
979  movq_r2r(mm4, mm1); // copy of tmp10
980 
981  psubw_r2r(mm2, mm1); // tmp10 - tmp12
982  psllw_i2r(2, mm4); // m8 * 2^2
983 
984  movq_m2r(RTjpeg_C2mC6, mm0);
985  psllw_i2r(2, mm1);
986 
987  pmulhw_m2r(RTjpeg_C6, mm1); // z5
988  psllw_i2r(2, mm2);
989 
990  pmulhw_r2r(mm0, mm4); // z5
991 
992  /* stage 5 */
993 
994  pmulhw_m2r(RTjpeg_C2pC6, mm2);
995  psllw_i2r(2, mm5);
996 
997  pmulhw_m2r(RTjpeg_C4, mm5); // z3
998  movq_r2r(mm3, mm0); // copy tmp7
999 
1000  movq_m2r(*(dataptr+1), mm7);
1001  paddw_r2r(mm1, mm4); // z2
1002 
1003  paddw_r2r(mm1, mm2); // z4
1004 
1005  paddw_r2r(mm5, mm0); // z11
1006  psubw_r2r(mm5, mm3); // z13
1007 
1008  /* stage 6 */
1009 
1010  movq_r2r(mm3, mm5); // copy z13
1011  psubw_r2r(mm4, mm3); // y3=z13 - z2
1012 
1013  paddw_r2r(mm4, mm5); // y5=z13 + z2
1014  movq_r2r(mm0, mm6); // copy z11
1015 
1016  movq_r2m(mm3, *(dataptr+6)); //save y3
1017  psubw_r2r(mm2, mm0); // y7=z11 - z4
1018 
1019  movq_r2m(mm5, *(dataptr+10)); //save y5
1020  paddw_r2r(mm2, mm6); // y1=z11 + z4
1021 
1022  movq_r2m(mm0, *(dataptr+14)); //save y7
1023 
1024  /************************************************
1025  * End of 1st 4 rows
1026  ************************************************/
1027 
1028  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1029  movq_r2r(mm7, mm0); // copy x0
1030 
1031  movq_r2m(mm6, *(dataptr+2)); //save y1
1032 
1033  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1034  movq_r2r(mm1, mm6); // copy x1
1035 
1036  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1037 
1038  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1039  movq_r2r(mm2, mm5); // copy x2
1040 
1041  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1042  movq_r2r(mm3, mm4); // copy x3
1043 
1044  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1045 
1046  movq_r2m(mm7, tmp7); // save tmp07
1047  movq_r2r(mm0, mm7); // copy tmp00
1048 
1049  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1050 
1051  /* stage 2, Even Part */
1052 
1053  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1054 
1055  movq_r2m(mm6, tmp6); // save tmp07
1056  movq_r2r(mm1, mm6); // copy tmp01
1057 
1058  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1059  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1060 
1061  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1062 
1063  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1064  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1065 
1066  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1067 
1068  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1069  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1070 
1071  /* stage 3, Even and stage 4 & 5 even */
1072 
1073  movq_m2r(tmp6, mm2); // load tmp6
1074  movq_r2r(mm0, mm3); // copy tmp10
1075 
1076  psllw_i2r(2, mm6); // shift z1
1077  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1078 
1079  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1080  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1081 
1082  movq_r2m(mm0, *(dataptr+1)); //save y0
1083  movq_r2r(mm7, mm0); // copy tmp13
1084 
1085  /* odd part */
1086 
1087  movq_r2m(mm3, *(dataptr+9)); //save y4
1088  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1089 
1090  movq_m2r(tmp7, mm3); // load tmp7
1091  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1092 
1093  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1094  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1095 
1096  movq_r2m(mm0, *(dataptr+5)); //save y2
1097  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1098 
1099  /* stage 4 */
1100 
1101  movq_r2m(mm7, *(dataptr+13)); //save y6
1102  movq_r2r(mm4, mm1); // copy tmp10
1103 
1104  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1105  psllw_i2r(2, mm4); // shift tmp10
1106 
1107  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1108  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1109 
1110  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1111  psllw_i2r(2, mm5); // prepare for multiply
1112 
1113  pmulhw_r2r(mm0, mm4); // multiply by converted real
1114 
1115  /* stage 5 */
1116 
1117  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1118  psllw_i2r(2, mm2); // prepare for multiply
1119 
1120  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1121  movq_r2r(mm3, mm0); // copy tmp7
1122 
1123  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1124  paddw_r2r(mm1, mm4); // z2
1125 
1126  paddw_r2r(mm5, mm0); // z11
1127  psubw_r2r(mm5, mm3); // z13
1128 
1129  /* stage 6 */
1130 
1131  movq_r2r(mm3, mm5); // copy z13
1132  paddw_r2r(mm1, mm2); // z4
1133 
1134  movq_r2r(mm0, mm6); // copy z11
1135  psubw_r2r(mm4, mm5); // y3
1136 
1137  paddw_r2r(mm2, mm6); // y1
1138  paddw_r2r(mm4, mm3); // y5
1139 
1140  movq_r2m(mm5, *(dataptr+7)); //save y3
1141 
1142  movq_r2m(mm6, *(dataptr+3)); //save y1
1143  psubw_r2r(mm2, mm0); // y7
1144 
1145 /************************************************************************************************
1146  Start of Transpose
1147 ************************************************************************************************/
1148 
1149  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1150  movq_r2r(mm7, mm5); // copy first line
1151 
1152  punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1153  movq_r2r(mm6, mm2); // copy third line
1154 
1155  punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1156  movq_r2r(mm7, mm1); // copy first intermediate result
1157 
1158  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1159 
1160  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1161 
1162  movq_r2m(mm7, *(dataptr+9)); // write result 1
1163  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1164 
1165  movq_r2m(mm1, *(dataptr+11)); // write result 2
1166  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1167 
1168  movq_r2r(mm5, mm1); // copy first intermediate result
1169  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1170 
1171  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1172  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1173 
1174  movq_r2m(mm5, *(dataptr+13)); // write result 3
1175 
1176  /****** last 4x4 done */
1177 
1178  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1179 
1180  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1181  movq_r2r(mm0, mm6); // copy first line
1182 
1183  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1184  movq_r2r(mm2, mm7); // copy third line
1185 
1186  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1187  movq_r2r(mm0, mm4); // copy first intermediate result
1188 
1189 
1190 
1191  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1192  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1193 
1194  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1195  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1196 
1197  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1198  movq_r2r(mm1, mm2); // copy first line
1199 
1200  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1201  movq_r2r(mm6, mm5); // copy first intermediate result
1202 
1203  movq_r2m(mm0, *(dataptr+8)); // write result 1
1204  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1205 
1206  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1207  movq_r2r(mm3, mm0); // copy third line
1208 
1209  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1210 
1211  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1212  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1213 
1214  punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1215  movq_r2r(mm1, mm4); // copy second intermediate result
1216 
1217  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1218  punpckldq_r2r(mm3, mm1); //
1219 
1220  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1221  movq_r2r(mm2, mm6); // copy second intermediate result
1222 
1223  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1224  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1225 
1226  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1227  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1228 
1229  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1230  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1231 
1232  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1233 
1234  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1235 
1236  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1237 
1238 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1239 
1240  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1241  movq_r2r(mm0, mm2); // copy first line
1242 
1243  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1244  movq_r2r(mm7, mm4); // copy third line
1245 
1246  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1247  movq_r2r(mm0, mm1); // copy first intermediate result
1248 
1249  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1250  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1251 
1252  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1253  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1254 
1255  movq_r2r(mm0, mm7); // write result 1
1256  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1257 
1258  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1259  movq_r2r(mm1, mm6); // write result 2
1260 
1261  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1262  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1263 
1264  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1265  movq_r2r(mm2, mm3); // copy first intermediate result
1266 
1267  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1268  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1269 
1270  movq_r2m(mm7, tmp7); // save tmp07
1271  movq_r2r(mm2, mm5); // write result 3
1272 
1273  movq_r2m(mm6, tmp6); // save tmp06
1274 
1275  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1276 
1277  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1278  movq_r2r(mm3, mm4); // write result 4
1279 
1280 /************************************************************************************************
1281  End of Transpose 2
1282 ************************************************************************************************/
1283 
1284  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1285  movq_r2r(mm0, mm7);
1286 
1287  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1288  movq_r2r(mm1, mm6);
1289 
1290  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1291  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1292 
1293  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1294  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1295 
1296  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1297  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1298 
1299  /* stage 3 */
1300 
1301  movq_m2r(tmp6, mm2);
1302  movq_r2r(mm0, mm3);
1303 
1304  psllw_i2r(2, mm6); // m8 * 2^2
1305  paddw_r2r(mm1, mm0);
1306 
1307  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1308  psubw_r2r(mm1, mm3);
1309 
1310  movq_r2m(mm0, *dataptr);
1311  movq_r2r(mm7, mm0);
1312 
1313  /* Odd part */
1314  movq_r2m(mm3, *(dataptr+8));
1315  paddw_r2r(mm5, mm4); // tmp10
1316 
1317  movq_m2r(tmp7, mm3);
1318  paddw_r2r(mm6, mm0); // tmp32
1319 
1320  paddw_r2r(mm2, mm5); // tmp11
1321  psubw_r2r(mm6, mm7); // tmp33
1322 
1323  movq_r2m(mm0, *(dataptr+4));
1324  paddw_r2r(mm3, mm2); // tmp12
1325 
1326  /* stage 4 */
1327  movq_r2m(mm7, *(dataptr+12));
1328  movq_r2r(mm4, mm1); // copy of tmp10
1329 
1330  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1331  psllw_i2r(2, mm4); // m8 * 2^2
1332 
1333  movq_m2r(RTjpeg_C2mC6, mm0);
1334  psllw_i2r(2, mm1);
1335 
1336  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1337  psllw_i2r(2, mm2);
1338 
1339  pmulhw_r2r(mm0, mm4); // z5
1340 
1341  /* stage 5 */
1342 
1343  pmulhw_m2r(RTjpeg_C2pC6, mm2);
1344  psllw_i2r(2, mm5);
1345 
1346  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1347  movq_r2r(mm3, mm0); // copy tmp7
1348 
1349  movq_m2r(*(dataptr+1), mm7);
1350  paddw_r2r(mm1, mm4); // z2
1351 
1352  paddw_r2r(mm1, mm2); // z4
1353 
1354  paddw_r2r(mm5, mm0); // z11
1355  psubw_r2r(mm5, mm3); // z13
1356 
1357  /* stage 6 */
1358 
1359  movq_r2r(mm3, mm5); // copy z13
1360  psubw_r2r(mm4, mm3); // y3=z13 - z2
1361 
1362  paddw_r2r(mm4, mm5); // y5=z13 + z2
1363  movq_r2r(mm0, mm6); // copy z11
1364 
1365  movq_r2m(mm3, *(dataptr+6)); //save y3
1366  psubw_r2r(mm2, mm0); // y7=z11 - z4
1367 
1368  movq_r2m(mm5, *(dataptr+10)); //save y5
1369  paddw_r2r(mm2, mm6); // y1=z11 + z4
1370 
1371  movq_r2m(mm0, *(dataptr+14)); //save y7
1372 
1373  /************************************************
1374  * End of 1st 4 rows
1375  ************************************************/
1376 
1377  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1378  movq_r2r(mm7, mm0); // copy x0
1379 
1380  movq_r2m(mm6, *(dataptr+2)); //save y1
1381 
1382  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1383  movq_r2r(mm1, mm6); // copy x1
1384 
1385  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1386 
1387  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1388  movq_r2r(mm2, mm5); // copy x2
1389 
1390  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1391  movq_r2r(mm3, mm4); // copy x3
1392 
1393  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1394 
1395  movq_r2m(mm7, tmp7); // save tmp07
1396  movq_r2r(mm0, mm7); // copy tmp00
1397 
1398  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1399 
1400  /* stage 2, Even Part */
1401 
1402  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1403 
1404  movq_r2m(mm6, tmp6); // save tmp07
1405  movq_r2r(mm1, mm6); // copy tmp01
1406 
1407  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1408  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1409 
1410  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1411 
1412  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1413  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1414 
1415  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1416 
1417  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1418  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1419 
1420  /* stage 3, Even and stage 4 & 5 even */
1421 
1422  movq_m2r(tmp6, mm2); // load tmp6
1423  movq_r2r(mm0, mm3); // copy tmp10
1424 
1425  psllw_i2r(2, mm6); // shift z1
1426  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1427 
1428  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1429  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1430 
1431  movq_r2m(mm0, *(dataptr+1)); //save y0
1432  movq_r2r(mm7, mm0); // copy tmp13
1433 
1434  /* odd part */
1435 
1436  movq_r2m(mm3, *(dataptr+9)); //save y4
1437  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1438 
1439  movq_m2r(tmp7, mm3); // load tmp7
1440  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1441 
1442  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1443  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1444 
1445  movq_r2m(mm0, *(dataptr+5)); //save y2
1446  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1447 
1448  /* stage 4 */
1449 
1450  movq_r2m(mm7, *(dataptr+13)); //save y6
1451  movq_r2r(mm4, mm1); // copy tmp10
1452 
1453  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1454  psllw_i2r(2, mm4); // shift tmp10
1455 
1456  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1457  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1458 
1459  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1460  psllw_i2r(2, mm5); // prepare for multiply
1461 
1462  pmulhw_r2r(mm0, mm4); // multiply by converted real
1463 
1464  /* stage 5 */
1465 
1466  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1467  psllw_i2r(2, mm2); // prepare for multiply
1468 
1469  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1470  movq_r2r(mm3, mm0); // copy tmp7
1471 
1472  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1473  paddw_r2r(mm1, mm4); // z2
1474 
1475  paddw_r2r(mm5, mm0); // z11
1476  psubw_r2r(mm5, mm3); // z13
1477 
1478  /* stage 6 */
1479 
1480  movq_r2r(mm3, mm5); // copy z13
1481  paddw_r2r(mm1, mm2); // z4
1482 
1483  movq_r2r(mm0, mm6); // copy z11
1484  psubw_r2r(mm4, mm5); // y3
1485 
1486  paddw_r2r(mm2, mm6); // y1
1487  paddw_r2r(mm4, mm3); // y5
1488 
1489  movq_r2m(mm5, *(dataptr+7)); //save y3
1490  psubw_r2r(mm2, mm0); // y7=z11 - z4
1491 
1492  movq_r2m(mm3, *(dataptr+11)); //save y5
1493 
1494  movq_r2m(mm6, *(dataptr+3)); //save y1
1495 
1496  movq_r2m(mm0, *(dataptr+15)); //save y7
1497 
1498 
1499 #endif
1500 }
1501 
1502 #ifndef MMX
/* AAN IDCT multipliers in 8.8 fixed point: FIX(v) == round(v * 256).
   MULTIPLY() below undoes the scaling with a rounded >> 8.
   e.g. 1.082392200 * 256 ~= 277. */
static constexpr int32_t FIX_1_082392200 { 277 }; /* FIX(1.082392200) */
static constexpr int32_t FIX_1_414213562 { 362 }; /* FIX(1.414213562) */
static constexpr int32_t FIX_1_847759065 { 473 }; /* FIX(1.847759065) */
static constexpr int32_t FIX_2_613125930 { 669 }; /* FIX(2.613125930) */
1507 
1508 static constexpr int16_t DESCALE(int32_t x) { return static_cast<int16_t>((x+4) >> 3); };
1509 
1510 /* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */
1511 
1512 static inline int16_t RL(int32_t x) { return std::clamp(x, 16, 235); };
// Fixed-point multiply: scale `var` by an 8.8 fixed-point `constant`
// (a FIX_* value), adding 128 (half of 2^8) so the final >> 8 rounds
// to nearest rather than flooring.
static constexpr int32_t MULTIPLY(int32_t var, int32_t constant)
 { return ((var * constant) + 128) >> 8; }
1515 #endif
1516 
1518 {
1519  for(int i = 0; i < 64; i++)
1520  {
1521  m_liqt[i] = ((uint64_t)m_liqt[i] * RTjpeg_aan_tab[i]) >> 32;
1522  m_ciqt[i] = ((uint64_t)m_ciqt[i] * RTjpeg_aan_tab[i]) >> 32;
1523  }
1524 }
1525 
1526 void RTjpeg::Idct(uint8_t *odata, RTjpegData16 &data, int rskip)
1527 {
1528 #ifdef MMX
1529 
1530 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1531 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1532 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1533 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1534 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
1535 
1536  auto *wsptr = (mmx_t *)m_ws.data();
1537  auto *dataptr = (mmx_t *)odata;
1538  auto *idata = (mmx_t *)data.data();
1539 
1540  rskip = rskip>>3;
1541 /*
1542  * Perform inverse DCT on one block of coefficients.
1543  */
1544 
1545  /* Odd part */
1546 
1547  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1548 
1549  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1550 
1551  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1552 
1553  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1554 
1555  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1556 
1557  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1558 
1559  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1560 
1561  psllw_i2r(2, mm2); // shift z10
1562  movq_r2r(mm2, mm0); // copy z10
1563 
1564  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1565  movq_r2r(mm3, mm5); // copy tmp4
1566 
1567  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1568  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1569 
1570  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1571  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1572 
1573  psubw_r2r(mm1, mm6); // z11-z13
1574  psllw_i2r(2, mm5); // shift z12
1575 
1576  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1577  movq_r2r(mm5, mm7); // copy z12
1578 
1579  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1580  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1581 
1582  //ok
1583 
1584  /* Even part */
1585  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1586  psllw_i2r(2, mm6);
1587 
1588  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1589 
1590  paddw_r2r(mm5, mm0); // tmp10
1591 
1592  paddw_r2r(mm7, mm2); // tmp12
1593 
1594  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1595  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1596 
1597  movq_r2r(mm1, mm5); // copy tmp1
1598  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1599 
1600  psubw_r2r(mm4, mm5); // tmp1-tmp3
1601  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1602 
1603  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1604  psllw_i2r(2, mm5); // shift tmp1-tmp3
1605 
1606  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1607 
1608  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1609  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1610 
1611  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1612 
1613  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1614 
1615  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1616  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1617 
1618  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1619  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1620 
1621  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1622  movq_r2r(mm1, mm5); // copy tmp11
1623 
1624  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1625  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1626 
1627  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1628 
1629  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1630  movq_r2r(mm7, mm0); // copy tmp0
1631 
1632  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1633  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1634 
1635  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1636 
1637  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1638  movq_r2r(mm1, mm3); // copy tmp1
1639 
1640  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1641  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1642 
1643  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1644 
1645  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1646  movq_r2r(mm4, mm1); // copy tmp3
1647 
1648  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1649 
1650  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1651 
1652  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1653 
1654  movq_r2m(mm4, *(wsptr+8));
1655  movq_r2r(mm5, mm7); // copy tmp2
1656 
1657  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1658 
1659  movq_r2m(mm1, *(wsptr+6));
1660  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1661 
1662  movq_r2m(mm5, *(wsptr+4));
1663 
1664  movq_r2m(mm7, *(wsptr+10));
1665 
1666  //ok
1667 
1668 
1669 /*****************************************************************/
1670 
1671  idata++;
1672  wsptr++;
1673 
1674 /*****************************************************************/
1675 
1676  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1677 
1678  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1679 
1680  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1681  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1682 
1683  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1684  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1685 
1686  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1687 
1688  psllw_i2r(2, mm2); // shift z10
1689  movq_r2r(mm2, mm0); // copy z10
1690 
1691  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1692  movq_r2r(mm3, mm5); // copy tmp4
1693 
1694  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1695  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1696 
1697  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1698  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1699 
1700  psubw_r2r(mm1, mm6); // z11-z13
1701  psllw_i2r(2, mm5); // shift z12
1702 
1703  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1704  movq_r2r(mm5, mm7); // copy z12
1705 
1706  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1707  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1708 
1709  //ok
1710 
1711  /* Even part */
1712  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1713  psllw_i2r(2, mm6);
1714 
1715  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1716 
1717  paddw_r2r(mm5, mm0); // tmp10
1718 
1719  paddw_r2r(mm7, mm2); // tmp12
1720 
1721  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1722  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1723 
1724  movq_r2r(mm1, mm5); // copy tmp1
1725  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1726 
1727  psubw_r2r(mm4, mm5); // tmp1-tmp3
1728  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1729 
1730  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1731  psllw_i2r(2, mm5); // shift tmp1-tmp3
1732 
1733  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1734  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1735 
1736  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1737 
1738  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1739 
1740  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1741 
1742  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1743  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1744 
1745  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1746  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1747 
1748  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1749  movq_r2r(mm1, mm5); // copy tmp11
1750 
1751  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1752  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1753 
1754  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1755 
1756  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1757  movq_r2r(mm7, mm0); // copy tmp0
1758 
1759  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1760  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1761 
1762  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1763 
1764  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1765  movq_r2r(mm1, mm3); // copy tmp1
1766 
1767  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1768  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1769 
1770  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1771 
1772  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1773  movq_r2r(mm4, mm1); // copy tmp3
1774 
1775  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1776 
1777  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1778 
1779  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1780 
1781  movq_r2m(mm4, *(wsptr+8));
1782  movq_r2r(mm5, mm7); // copy tmp2
1783 
1784  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1785 
1786  movq_r2m(mm1, *(wsptr+6));
1787  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1788 
1789  movq_r2m(mm5, *(wsptr+4));
1790 
1791  movq_r2m(mm7, *(wsptr+10));
1792 
1793 /*****************************************************************/
1794 
1795  /* Pass 2: process rows from work array, store into output array. */
1796  /* Note that we must descale the results by a factor of 8 == 2**3, */
1797  /* and also undo the PASS1_BITS scaling. */
1798 
1799 /*****************************************************************/
1800  /* Even part */
1801 
1802  wsptr--;
1803 
1804 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1805 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1806 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1807 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1808  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1809 
1810  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1811  movq_r2r(mm0, mm2);
1812 
1813  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1814  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1815 
1816  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1817  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1818 
1819  movq_r2r(mm0, mm6);
1820  movq_r2r(mm3, mm5);
1821 
1822  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1823  movq_r2r(mm2, mm1);
1824 
1825  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1826  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1827 
1828  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1829  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1830 
1831  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1832  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1833 
1834  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1835  movq_r2r(mm3, mm4);
1836 
1837  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1838  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1839 
1840  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1841  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1842 
1843 
1844  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1845  movq_r2r(mm6, mm2);
1846 
1847  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1848  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1849 
1850  movq_r2r(mm3, mm5);
1851  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1852 
1853  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1854  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1855 
1856  movq_r2r(mm4, mm7);
1857  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1858 
1859  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1860 
1861  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1862 
1863  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1864  movq_r2r(mm1, mm6);
1865 
1866  //ok
1867 
1868 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1869 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1870 
1871 
1872  movq_r2r(mm0, mm2);
1873  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1874 
1875  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1876  psllw_i2r(2, mm6);
1877 
1878  pmulhw_m2r(s_fix141, mm6);
1879  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1880 
1881  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1882  movq_r2r(mm0, mm7);
1883 
1884 // tmp0 = tmp10 + tmp13;
1885 // tmp3 = tmp10 - tmp13;
1886  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1887  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1888 
1889 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1890  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1891 // tmp1 = tmp11 + tmp12;
1892 // tmp2 = tmp11 - tmp12;
1893  movq_r2r(mm1, mm5);
1894 
1895  //OK
1896 
1897  /* Odd part */
1898 
1899 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1900 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1901 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1902 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1903  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1904  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1905 
1906  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1907  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1908 
1909  movq_r2r(mm3, mm6);
1910  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1911 
1912  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1913  movq_r2r(mm3, mm2);
1914 
1915 //Save tmp0 and tmp1 in wsptr
1916  movq_r2m(mm0, *(wsptr)); // save tmp0
1917  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1918 
1919 
1920 //Continue with z10 --- z13
1921  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1922  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1923 
1924  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1925  movq_r2r(mm6, mm4);
1926 
1927  movq_r2m(mm1, *(wsptr+1)); // save tmp1
1928  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1929 
1930  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1931  movq_r2r(mm6, mm1);
1932 
1933 //Save tmp2 and tmp3 in wsptr
1934  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1935  movq_r2r(mm2, mm4);
1936 
1937 //Continue with z10 --- z13
1938  movq_r2m(mm5, *(wsptr+2)); // save tmp2
1939  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1940 
1941  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1942  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1943 
1944  movq_r2r(mm3, mm0);
1945  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1946 
1947  movq_r2m(mm7, *(wsptr+3)); // save tmp3
1948  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1949 
1950  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1951  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1952 
1953  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1954  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1955 
1956  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
1957  movq_r2r(mm6, mm4);
1958 
1959  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
1960  movq_r2r(mm1, mm5);
1961 
1962  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
1963  movq_r2r(mm6, mm2);
1964 
1965  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
1966  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
1967 
1968  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
1969  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
1970 
1971  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
1972  movq_r2r(mm1, mm7);
1973 
1974  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
1975  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
1976 
1977  movq_r2r(mm6, mm5);
1978  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
1979 
1980  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
1981  movq_r2r(mm2, mm4);
1982 
1983  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
1984 
1985  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
1986 
1987  punpckhdq_r2r(mm6, mm4);
1988 
1989  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1990  movq_r2r(mm0, mm5);
1991 
1992  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1993 
1994  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1995  movq_r2r(mm3, mm4);
1996 
1997  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1998  movq_r2r(mm5, mm1);
1999 
2000  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2001 // tmp7 = z11 + z13; /* phase 5 */
2002 // tmp8 = z11 - z13; /* phase 5 */
2003  psubw_r2r(mm4, mm1); // tmp8
2004 
2005  paddw_r2r(mm4, mm5); // tmp7
2006 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2007  psllw_i2r(2, mm1);
2008 
2009  psllw_i2r(2, mm0);
2010 
2011  pmulhw_m2r(s_fix141, mm1); // tmp21
2012 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2013 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2014  psllw_i2r(2, mm3);
2015  movq_r2r(mm0, mm7);
2016 
2017  pmulhw_m2r(s_fixN184, mm7);
2018  movq_r2r(mm3, mm6);
2019 
2020  movq_m2r(*(wsptr), mm2); // tmp0,final1
2021 
2022  pmulhw_m2r(s_fix108n184, mm6);
2023 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2024 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2025  movq_r2r(mm2, mm4); // final1
2026 
2027  pmulhw_m2r(s_fix184n261, mm0);
2028  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2029 
2030  pmulhw_m2r(s_fix184, mm3);
2031  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2032 
2033 // tmp6 = tmp22 - tmp7; /* phase 2 */
2034  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2035 
2036  paddw_r2r(mm6, mm7); // tmp20
2037  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2038 
2039  paddw_r2r(mm0, mm3); // tmp22
2040 
2041 // tmp5 = tmp21 - tmp6;
2042  psubw_r2r(mm5, mm3); // tmp6
2043 
2044 // tmp4 = tmp20 + tmp5;
2045  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2046  psubw_r2r(mm3, mm1); // tmp5
2047 
2048  movq_r2r(mm0, mm6); // final2
2049  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2050 
2051  /* Final output stage: scale down by a factor of 8 and range-limit */
2052 
2053 
2054 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2055 // & RANGE_MASK];
2056 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2057 // & RANGE_MASK]; final1
2058 
2059 
2060 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2061 // & RANGE_MASK];
2062 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2063 // & RANGE_MASK]; final2
2064  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2065  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2066 
2067  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2068 
2069  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2070 
2071  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2072  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2073 
2074 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2075 // & RANGE_MASK];
2076 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2077 // & RANGE_MASK]; final3
2078  paddw_r2r(mm1, mm7); // tmp4
2079  movq_r2r(mm5, mm3);
2080 
2081  paddw_r2r(mm1, mm5); // tmp2+tmp5
2082  psubw_r2r(mm1, mm3); // tmp2-tmp5
2083 
2084  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2085 
2086  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2087  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2088 
2089 
2090 
2091 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2092 // & RANGE_MASK];
2093 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2094 // & RANGE_MASK]; final4
2095  movq_r2r(mm4, mm6);
2096  paddw_r2r(mm7, mm4); // tmp3+tmp4
2097 
2098  psubw_r2r(mm7, mm6); // tmp3-tmp4
2099  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2100 
2101  // mov ecx, [dataptr]
2102 
2103  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2104 
2105  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2106 
2107  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2108  movq_r2r(mm2, mm4);
2109 
2110  movq_r2r(mm5, mm7);
2111  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2112 
2113  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2114  movq_r2r(mm2, mm1);
2115 
2116  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2117 
2118  // add dataptr, 4
2119 
2120  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2121 
2122  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2123 
2124  // add ecx, output_col
2125 
2126  movq_r2r(mm7, mm6);
2127  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2128 
2129  movq_r2r(mm2, mm0);
2130  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2131 
2132  // mov idata, [dataptr]
2133 
2134  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2135 
2136  // add dataptr, 4
2137 
2138  movq_r2r(mm1, mm3);
2139 
2140  // add idata, output_col
2141 
2142  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2143 
2144  movq_r2m(mm2, *(dataptr));
2145 
2146  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2147 
2148  dataptr += rskip;
2149  movq_r2m(mm0, *(dataptr));
2150 
2151  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2152  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2153 
2154  dataptr += rskip;
2155  movq_r2m(mm1, *(dataptr));
2156 
2157  dataptr += rskip;
2158  movq_r2m(mm3, *(dataptr));
2159 
2160 /*******************************************************************/
2161 
2162  wsptr += 8;
2163 
2164 /*******************************************************************/
2165 
2166 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2167 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2168 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2169 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2170  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2171 
2172  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2173  movq_r2r(mm0, mm2);
2174 
2175  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2176  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2177 
2178  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2179  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2180 
2181  movq_r2r(mm0, mm6);
2182  movq_r2r(mm3, mm5);
2183 
2184  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2185  movq_r2r(mm2, mm1);
2186 
2187  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2188  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2189 
2190  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2191  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2192 
2193  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2194  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2195 
2196  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2197  movq_r2r(mm3, mm4);
2198 
2199  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2200  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2201 
2202  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2203  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2204 
2205  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2206  movq_r2r(mm6, mm2);
2207 
2208  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2209  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2210 
2211  movq_r2r(mm3, mm5);
2212  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2213 
2214  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2215  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2216 
2217  movq_r2r(mm4, mm7);
2218  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2219 
2220  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2221 
2222  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2223 
2224  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2225  movq_r2r(mm1, mm6);
2226 
2227  //OK
2228 
2229 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2230 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2231 
2232  movq_r2r(mm0, mm2);
2233  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2234 
2235  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2236  psllw_i2r(2, mm6);
2237 
2238  pmulhw_m2r(s_fix141, mm6);
2239  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2240 
2241  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2242  movq_r2r(mm0, mm7);
2243 
2244 // tmp0 = tmp10 + tmp13;
2245 // tmp3 = tmp10 - tmp13;
2246  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2247  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2248 
2249 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2250  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2251 // tmp1 = tmp11 + tmp12;
2252 // tmp2 = tmp11 - tmp12;
2253  movq_r2r(mm1, mm5);
2254 
2255  //OK
2256 
2257 
2258  /* Odd part */
2259 
2260 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2261 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2262 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2263 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2264  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2265  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2266 
2267  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2268  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2269 
2270  movq_r2r(mm3, mm6);
2271  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2272 
2273  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2274  movq_r2r(mm3, mm2);
2275 
2276 //Save tmp0 and tmp1 in wsptr
2277  movq_r2m(mm0, *(wsptr)); // save tmp0
2278  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2279 
2280 
2281 //Continue with z10 --- z13
2282  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2283  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2284 
2285  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2286  movq_r2r(mm6, mm4);
2287 
2288  movq_r2m(mm1, *(wsptr+1)); // save tmp1
2289  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2290 
2291  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2292  movq_r2r(mm6, mm1);
2293 
2294 //Save tmp2 and tmp3 in wsptr
2295  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2296  movq_r2r(mm2, mm4);
2297 
2298 //Continue with z10 --- z13
2299  movq_r2m(mm5, *(wsptr+2)); // save tmp2
2300  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2301 
2302  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2303  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2304 
2305  movq_r2r(mm3, mm0);
2306  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2307 
2308  movq_r2m(mm7, *(wsptr+3)); // save tmp3
2309  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2310 
2311  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2312  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2313 
2314  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2315  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2316 
2317  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2318  movq_r2r(mm6, mm4);
2319 
2320  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2321  movq_r2r(mm1, mm5);
2322 
2323  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2324  movq_r2r(mm6, mm2);
2325 
2326  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2327  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2328 
2329  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2330  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2331 
2332  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2333  movq_r2r(mm1, mm7);
2334 
2335  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2336  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2337 
2338  movq_r2r(mm6, mm5);
2339  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2340 
2341  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2342  movq_r2r(mm2, mm4);
2343 
2344  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2345 
2346  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2347 
2348  punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2349 
2350  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2351  movq_r2r(mm0, mm5);
2352 
2353  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2354 
2355  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2356  movq_r2r(mm3, mm4);
2357 
2358  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2359  movq_r2r(mm5, mm1);
2360 
2361  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2362 // tmp7 = z11 + z13; /* phase 5 */
2363 // tmp8 = z11 - z13; /* phase 5 */
2364  psubw_r2r(mm4, mm1); // tmp8
2365 
2366  paddw_r2r(mm4, mm5); // tmp7
2367 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2368  psllw_i2r(2, mm1);
2369 
2370  psllw_i2r(2, mm0);
2371 
2372  pmulhw_m2r(s_fix141, mm1); // tmp21
2373 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2374 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2375  psllw_i2r(2, mm3);
2376  movq_r2r(mm0, mm7);
2377 
2378  pmulhw_m2r(s_fixN184, mm7);
2379  movq_r2r(mm3, mm6);
2380 
2381  movq_m2r(*(wsptr), mm2); // tmp0,final1
2382 
2383  pmulhw_m2r(s_fix108n184, mm6);
2384 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2385 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2386  movq_r2r(mm2, mm4); // final1
2387 
2388  pmulhw_m2r(s_fix184n261, mm0);
2389  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2390 
2391  pmulhw_m2r(s_fix184, mm3);
2392  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2393 
2394 // tmp6 = tmp22 - tmp7; /* phase 2 */
2395  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2396 
2397  paddw_r2r(mm6, mm7); // tmp20
2398  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2399 
2400  paddw_r2r(mm0, mm3); // tmp22
2401 
2402 // tmp5 = tmp21 - tmp6;
2403  psubw_r2r(mm5, mm3); // tmp6
2404 
2405 // tmp4 = tmp20 + tmp5;
2406  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2407  psubw_r2r(mm3, mm1); // tmp5
2408 
2409  movq_r2r(mm0, mm6); // final2
2410  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2411 
2412  /* Final output stage: scale down by a factor of 8 and range-limit */
2413 
2414 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2415 // & RANGE_MASK];
2416 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2417 // & RANGE_MASK]; final1
2418 
2419 
2420 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2421 // & RANGE_MASK];
2422 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2423 // & RANGE_MASK]; final2
2424  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2425  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2426 
2427  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2428 
2429  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2430 
2431  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2432  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2433 
2434 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2435 // & RANGE_MASK];
2436 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2437 // & RANGE_MASK]; final3
2438  paddw_r2r(mm1, mm7); // tmp4
2439  movq_r2r(mm5, mm3);
2440 
2441  paddw_r2r(mm1, mm5); // tmp2+tmp5
2442  psubw_r2r(mm1, mm3); // tmp2-tmp5
2443 
2444  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2445 
2446  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2447  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2448 
2449 
2450 
2451 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2452 // & RANGE_MASK];
2453 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2454 // & RANGE_MASK]; final4
2455  movq_r2r(mm4, mm6);
2456  paddw_r2r(mm7, mm4); // tmp3+tmp4
2457 
2458  psubw_r2r(mm7, mm6); // tmp3-tmp4
2459  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2460 
2461  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2462 
2463  /*
2464  movq_r2m(mm4, *dummy);
2465  fprintf(stderr, "3-4 %016llx\n", dummy);
2466  movq_r2m(mm4, *dummy);
2467  fprintf(stderr, "3+4 %016llx\n", dummy);
2468  */
2469 
2470 
2471  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2472 
2473  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2474  movq_r2r(mm2, mm4);
2475 
2476  movq_r2r(mm5, mm7);
2477  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2478 
2479  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2480  movq_r2r(mm2, mm1);
2481 
2482  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2483 
2484  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2485 
2486  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2487 
2488  movq_r2r(mm7, mm6);
2489  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2490 
2491  movq_r2r(mm2, mm0);
2492  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2493 
2494  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2495 
2496  movq_r2r(mm1, mm3);
2497 
2498  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2499 
2500  dataptr += rskip;
2501  movq_r2m(mm2, *(dataptr));
2502 
2503  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2504 
2505  dataptr += rskip;
2506  movq_r2m(mm0, *(dataptr));
2507 
2508  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2509 
2510  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2511 
2512  dataptr += rskip;
2513  movq_r2m(mm1, *(dataptr));
2514 
2515  dataptr += rskip;
2516  movq_r2m(mm3, *(dataptr));
2517 
2518 #else
2519  int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2520  int32_t tmp10, tmp11, tmp12, tmp13;
2521  int32_t z5, z10, z11, z12, z13;
2522  int16_t *inptr;
2523  int32_t *wsptr;
2524  uint8_t *outptr;
2525  int ctr;
2526  int32_t dcval;
2527 
2528  inptr = data.data();
2529  wsptr = m_ws.data();
2530  for (ctr = 8; ctr > 0; ctr--) {
2531 
2532  if ((inptr[8] | inptr[16] | inptr[24] |
2533  inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2534  dcval = inptr[0];
2535  wsptr[0] = dcval;
2536  wsptr[8] = dcval;
2537  wsptr[16] = dcval;
2538  wsptr[24] = dcval;
2539  wsptr[32] = dcval;
2540  wsptr[40] = dcval;
2541  wsptr[48] = dcval;
2542  wsptr[56] = dcval;
2543 
2544  inptr++;
2545  wsptr++;
2546  continue;
2547  }
2548 
2549  tmp0 = inptr[0];
2550  tmp1 = inptr[16];
2551  tmp2 = inptr[32];
2552  tmp3 = inptr[48];
2553 
2554  tmp10 = tmp0 + tmp2;
2555  tmp11 = tmp0 - tmp2;
2556 
2557  tmp13 = tmp1 + tmp3;
2558  tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2559 
2560  tmp0 = tmp10 + tmp13;
2561  tmp3 = tmp10 - tmp13;
2562  tmp1 = tmp11 + tmp12;
2563  tmp2 = tmp11 - tmp12;
2564 
2565  tmp4 = inptr[8];
2566  tmp5 = inptr[24];
2567  tmp6 = inptr[40];
2568  tmp7 = inptr[56];
2569 
2570  z13 = tmp6 + tmp5;
2571  z10 = tmp6 - tmp5;
2572  z11 = tmp4 + tmp7;
2573  z12 = tmp4 - tmp7;
2574 
2575  tmp7 = z11 + z13;
2576  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2577 
2578  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2579  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2580  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2581 
2582  tmp6 = tmp12 - tmp7;
2583  tmp5 = tmp11 - tmp6;
2584  tmp4 = tmp10 + tmp5;
2585 
2586  wsptr[0] = (int32_t) (tmp0 + tmp7);
2587  wsptr[56] = (int32_t) (tmp0 - tmp7);
2588  wsptr[8] = (int32_t) (tmp1 + tmp6);
2589  wsptr[48] = (int32_t) (tmp1 - tmp6);
2590  wsptr[16] = (int32_t) (tmp2 + tmp5);
2591  wsptr[40] = (int32_t) (tmp2 - tmp5);
2592  wsptr[32] = (int32_t) (tmp3 + tmp4);
2593  wsptr[24] = (int32_t) (tmp3 - tmp4);
2594 
2595  inptr++;
2596  wsptr++;
2597  }
2598 
2599  wsptr = m_ws.data();
2600  for (ctr = 0; ctr < 8; ctr++) {
2601  outptr = &(odata[ctr*rskip]);
2602 
2603  tmp10 = wsptr[0] + wsptr[4];
2604  tmp11 = wsptr[0] - wsptr[4];
2605 
2606  tmp13 = wsptr[2] + wsptr[6];
2607  tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2608 
2609  tmp0 = tmp10 + tmp13;
2610  tmp3 = tmp10 - tmp13;
2611  tmp1 = tmp11 + tmp12;
2612  tmp2 = tmp11 - tmp12;
2613 
2614  z13 = wsptr[5] + wsptr[3];
2615  z10 = wsptr[5] - wsptr[3];
2616  z11 = wsptr[1] + wsptr[7];
2617  z12 = wsptr[1] - wsptr[7];
2618 
2619  tmp7 = z11 + z13;
2620  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2621 
2622  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2623  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2624  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2625 
2626  tmp6 = tmp12 - tmp7;
2627  tmp5 = tmp11 - tmp6;
2628  tmp4 = tmp10 + tmp5;
2629 
2630  outptr[0] = RL(DESCALE(tmp0 + tmp7));
2631  outptr[7] = RL(DESCALE(tmp0 - tmp7));
2632  outptr[1] = RL(DESCALE(tmp1 + tmp6));
2633  outptr[6] = RL(DESCALE(tmp1 - tmp6));
2634  outptr[2] = RL(DESCALE(tmp2 + tmp5));
2635  outptr[5] = RL(DESCALE(tmp2 - tmp5));
2636  outptr[4] = RL(DESCALE(tmp3 + tmp4));
2637  outptr[3] = RL(DESCALE(tmp3 - tmp4));
2638 
2639  wsptr += 8;
2640  }
2641 #endif
2642 }
2643 
// Rebuild the quantisation tables for the current quality setting m_q.
// Produces the forward tables (m_lqt/m_cqt, used by Quant) and the inverse
// tables (m_liqt/m_ciqt, used by s2b), plus the zig-zag cut-off indices
// m_lB8/m_cB8 passed to b2s()/s2b().
inline void RTjpeg::CalcTbls(void)
{
    uint64_t qual = (uint64_t)m_q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */

    for(int i = 0; i < 64; i++)
    {
        // Scale the standard luma quantiser by the quality factor in fixed
        // point (table value << 16, result >> 3); clamp to >= 1 so the
        // divisions below can never divide by zero.
        m_lqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_lum_quant_tbl[i]<<16))>>3);
        if (m_lqt[i] == 0)
            m_lqt[i]=1;

        // Same scaling for the chroma quantiser table.
        m_cqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
        if (m_cqt[i] == 0)
            m_cqt[i]=1;

        // Inverse tables in 16.16 fixed point, then regenerate the forward
        // tables from the inverse so quantise/dequantise round-trip on the
        // same rounded values.
        m_liqt[i] = (1<<16) / (m_lqt[i]<<3);
        m_ciqt[i] = (1<<16) / (m_cqt[i]<<3);
        m_lqt[i] = ((1<<16) / m_liqt[i])>>3;
        m_cqt[i] = ((1<<16) / m_ciqt[i])>>3;
    }

    // Scan the inverse tables in zig-zag order for the first entry > 8; the
    // index just before it becomes the coefficient cut-off handed to
    // b2s()/s2b(). The pre-increment starts the scan at zig-zag index 1,
    // skipping the DC term.
    // NOTE(review): if every entry were <= 8 the scan would run past the
    // 64-entry table — presumably impossible for valid quality values, but
    // worth confirming.
    m_lB8 = 0;
    while (m_liqt[RTjpeg_ZZ[++m_lB8]] <= 8)
        ;
    m_lB8--;
    m_cB8 = 0;

    while (m_ciqt[RTjpeg_ZZ[++m_cB8]] <= 8)
        ;
    m_cB8--;
}
2674 
2675 int RTjpeg::SetQuality(int *quality)
2676 {
2677  if (*quality < 1)
2678  *quality = 1;
2679  if (*quality > 255)
2680  *quality = 255;
2681 
2682  m_q = *quality;
2683 
2684  CalcTbls();
2685  DctInit();
2686  IdctInit();
2687  QuantInit();
2688 
2689  return 0;
2690 }
2691 
// Select the pixel format used by Compress()/Decompress()
// (RTJ_YUV420, RTJ_YUV422 or RTJ_RGB8 — see the switch statements there).
// No validation is performed; always returns 0.
int RTjpeg::SetFormat(const int *fmt)
{
    m_f = *fmt;
    return 0;
}
2697 
2698 int RTjpeg::SetSize(const int *w, const int *h)
2699 {
2700  if ((*w < 0) || (*w > 65535))
2701  return -1;
2702  if ((*h < 0) || (*h > 65535))
2703  return -1;
2704 
2705  m_width = *w;
2706  m_height = *h;
2707  m_yWidth = m_width>>3;
2708  m_ySize = m_width * m_height;
2709  m_cWidth = m_width>>4;
2710  m_cSize = (m_width>>1) * m_height;
2711 
2712  if (m_keyRate > 0)
2713  {
2714  delete [] m_old;
2715  m_old = new (std::align_val_t(32)) int16_t[4*m_width*m_height];
2716  if (!m_old)
2717  {
2718  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2719  return -1;
2720  }
2721  memset(m_old, 0, (4_UZ * m_width * m_height));
2722  }
2723  return 0;
2724 }
2725 
2726 int RTjpeg::SetIntra(int *key, int *lm, int *cm)
2727 {
2728  if (*key < 0)
2729  *key = 0;
2730  if (*key > 255)
2731  *key = 255;
2732  m_keyRate = *key;
2733 
2734  if (*lm < 0)
2735  *lm = 0;
2736  if (*lm > 16)
2737  *lm = 16;
2738  if (*cm < 0)
2739  *cm = 0;
2740  if (*cm > 16)
2741  *cm = 16;
2742 
2743 #ifdef MMX
2744  m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2745  m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
2746 #else
2747  m_lMask = *lm;
2748  m_cMask = *cm;
2749 #endif
2750 
2751  delete [] m_old;
2752  m_old = new (std::align_val_t(32)) int16_t[4*m_width*m_height];
2753  if (!m_old)
2754  {
2755  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2756  return -1;
2757  }
2758  memset(m_old, 0, (4_UZ * m_width * m_height));
2759 
2760  return 0;
2761 }
2762 
2764 {
2765 #ifdef MMX
2766  RTjpeg_ones.q = 0x0001000100010001LL;
2767  RTjpeg_half.q = 0x7fff7fff7fff7fffLL;
2768  RTjpeg_C4.q = 0x2D412D412D412D41LL;
2769  RTjpeg_C6.q = 0x187E187E187E187ELL;
2770  RTjpeg_C2mC6.q= 0x22A322A322A322A3LL;
2771  RTjpeg_C2pC6.q= 0x539F539F539F539FLL;
2772  RTjpeg_zero.q = 0x0000000000000000LL;
2773 #endif
2774 }
2775 
2777 {
2778  delete [] m_old;
2779 }
2780 
2781 inline int RTjpeg::compressYUV420(int8_t *sp, uint8_t **planes)
2782 {
2783  uint8_t * bp = planes[0];
2784  uint8_t * bp1 = bp + (m_width<<3);
2785  uint8_t * bp2 = planes[1];
2786  uint8_t * bp3 = planes[2];
2787 
2788 #ifdef MMX
2789  emms();
2790 #endif
2791  int8_t * sb = sp;
2792 /* Y */
2793  for(int i = m_height >> 1; i; i -= 8)
2794  {
2795  for(int j = 0, k = 0; j < m_width; j += 16, k += 8)
2796  {
2797  DctY(bp+j, m_yWidth);
2798  Quant(m_block, m_lqt);
2799  sp += b2s(m_block, sp, m_lB8);
2800 
2801  DctY(bp+j+8, m_yWidth);
2802  Quant(m_block, m_lqt);
2803  sp += b2s(m_block, sp, m_lB8);
2804 
2805  DctY(bp1+j, m_yWidth);
2806  Quant(m_block, m_lqt);
2807  sp += b2s(m_block, sp, m_lB8);
2808 
2809  DctY(bp1+j+8, m_yWidth);
2810  Quant(m_block, m_lqt);
2811  sp += b2s(m_block, sp, m_lB8);
2812 
2813  DctY(bp2+k, m_cWidth);
2814  Quant(m_block, m_cqt);
2815  sp += b2s(m_block, sp, m_cB8);
2816 
2817  DctY(bp3+k, m_cWidth);
2818  Quant(m_block, m_cqt);
2819  sp += b2s(m_block, sp, m_cB8);
2820  }
2821  bp += m_width<<4;
2822  bp1 += m_width<<4;
2823  bp2 += m_width<<2;
2824  bp3 += m_width<<2;
2825  }
2826 #ifdef MMX
2827  emms();
2828 #endif
2829  return (sp - sb);
2830 }
2831 
2832 inline int RTjpeg::compressYUV422(int8_t *sp, uint8_t **planes)
2833 {
2834  uint8_t * bp = planes[0];
2835  uint8_t * bp2 = planes[1];
2836  uint8_t * bp3 = planes[2];
2837 
2838 #ifdef MMX
2839  emms();
2840 #endif
2841  int8_t * sb=sp;
2842 /* Y */
2843  for(int i=m_height; i; i-=8)
2844  {
2845  for(int j=0, k=0; j<m_width; j+=16, k+=8)
2846  {
2847  DctY(bp+j, m_yWidth);
2848  Quant(m_block, m_lqt);
2849  sp += b2s(m_block, sp, m_lB8);
2850 
2851  DctY(bp+j+8, m_yWidth);
2852  Quant(m_block, m_lqt);
2853  sp += b2s(m_block, sp, m_lB8);
2854 
2855  DctY(bp2+k, m_cWidth);
2856  Quant(m_block, m_cqt);
2857  sp+=b2s(m_block, sp, m_cB8);
2858 
2859  DctY(bp3+k, m_cWidth);
2860  Quant(m_block, m_cqt);
2861  sp+=b2s(m_block, sp, m_cB8);
2862 
2863  }
2864  bp += m_width << 3;
2865  bp2 += m_width << 2;
2866  bp3 += m_width << 2;
2867 
2868  }
2869 #ifdef MMX
2870  emms();
2871 #endif
2872  return (sp-sb);
2873 }
2874 
2875 inline int RTjpeg::compress8(int8_t *sp, uint8_t **planes)
2876 {
2877  int8_t * sb = nullptr;
2878  uint8_t * bp = planes[0];
2879 
2880 #ifdef MMX
2881  emms();
2882 #endif
2883 
2884  sb=sp;
2885 /* Y */
2886  for(int i=0; i<m_height; i+=8)
2887  {
2888  for(int j=0; j<m_width; j+=8)
2889  {
2890  DctY(bp+j, m_width);
2891  Quant(m_block, m_lqt);
2892  sp += b2s(m_block, sp, m_lB8);
2893  }
2894  bp += m_width;
2895  }
2896 
2897 #ifdef MMX
2898  emms();
2899 #endif
2900  return (sp-sb);
2901 }
2902 
2903 inline void RTjpeg::decompressYUV422(int8_t *sp, uint8_t **planes)
2904 {
2905  uint8_t * bp = planes[0];
2906  uint8_t * bp2 = planes[1];
2907  uint8_t * bp3 = planes[2];
2908 
2909 #ifdef MMX
2910  emms();
2911 #endif
2912 
2913 /* Y */
2914  for(int i=m_height; i; i-=8)
2915  {
2916  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2917  if (*sp==-1)sp++;
2918  else
2919  {
2920  sp += s2b(m_block, sp, m_lB8, m_liqt);
2921  Idct(bp+j, m_block, m_width);
2922  }
2923  if (*sp==-1)sp++;
2924  else
2925  {
2926  sp += s2b(m_block, sp, m_lB8, m_liqt);
2927  Idct(bp+j+8, m_block, m_width);
2928  }
2929  if (*sp==-1)sp++;
2930  else
2931  {
2932  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2933  Idct(bp2+k, m_block, m_width>>1);
2934  }
2935  if (*sp==-1)sp++;
2936  else
2937  {
2938  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2939  Idct(bp3+k, m_block, m_width>>1);
2940  }
2941  }
2942  bp += m_width<<3;
2943  bp2 += m_width<<2;
2944  bp3 += m_width<<2;
2945  }
2946 #ifdef MMX
2947  emms();
2948 #endif
2949 }
2950 
2951 inline void RTjpeg::decompressYUV420(int8_t *sp, uint8_t **planes)
2952 {
2953  uint8_t * bp = planes[0];
2954  uint8_t * bp1 = bp + (m_width<<3);
2955  uint8_t * bp2 = planes[1];
2956  uint8_t * bp3 = planes[2];
2957 
2958 #ifdef MMX
2959  emms();
2960 #endif
2961 
2962 /* Y */
2963  for(int i=m_height>>1; i; i-=8)
2964  {
2965  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2966  if (*sp==-1)sp++;
2967  else
2968  {
2969  sp += s2b(m_block, sp, m_lB8, m_liqt);
2970  Idct(bp+j, m_block, m_width);
2971  }
2972  if (*sp==-1)sp++;
2973  else
2974  {
2975  sp += s2b(m_block, sp, m_lB8, m_liqt);
2976  Idct(bp+j+8, m_block, m_width);
2977  }
2978  if (*sp==-1)sp++;
2979  else
2980  {
2981  sp += s2b(m_block, sp, m_lB8, m_liqt);
2982  Idct(bp1+j, m_block, m_width);
2983  }
2984  if (*sp==-1)sp++;
2985  else
2986  {
2987  sp += s2b(m_block, sp, m_lB8, m_liqt);
2988  Idct(bp1+j+8, m_block, m_width);
2989  }
2990  if (*sp==-1)sp++;
2991  else
2992  {
2993  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2994  Idct(bp2+k, m_block, m_width>>1);
2995  }
2996  if (*sp==-1)sp++;
2997  else
2998  {
2999  sp += s2b(m_block, sp, m_cB8, m_ciqt);
3000  Idct(bp3+k, m_block, m_width>>1);
3001  }
3002  }
3003  bp += m_width<<4;
3004  bp1 += m_width<<4;
3005  bp2 += m_width<<2;
3006  bp3 += m_width<<2;
3007  }
3008 #ifdef MMX
3009  emms();
3010 #endif
3011 }
3012 
3013 inline void RTjpeg::decompress8(int8_t *sp, uint8_t **planes)
3014 {
3015  uint8_t * bp = planes[0];
3016 
3017 #ifdef MMX
3018  emms();
3019 #endif
3020 
3021 /* Y */
3022  for(int i=0; i<m_height; i+=8)
3023  {
3024  for(int j=0; j<m_width; j+=8)
3025  {
3026  if (*sp==-1)sp++;
3027  else
3028  {
3029  sp += s2b(m_block, sp, m_lB8, m_liqt);
3030  Idct(bp+j, m_block, m_width);
3031  }
3032  }
3033  bp += m_width<<3;
3034  }
3035 }
3036 
3037 #ifdef MMX
3038 
// MMX block comparator for delta coding: compare a freshly quantised block
// against the stored reference block. Returns 1 ("unchanged") when every
// coefficient difference stays within the per-word threshold packed in
// *mask; otherwise copies the new block over the reference and returns 0.
int RTjpeg::bcomp(RTjpegData16 &rblock, int16_t *_old, mmx_t *mask)
{
    auto *mold=(mmx_t *)_old;
    auto *mblock=(mmx_t *)rblock.data();
    // volatile: forces the final movq_r2m store so the result is readable
    // from plain C code after the MMX sequence.
    volatile mmx_t result {};
    static mmx_t s_neg= { 0xffffffffffffffffULL };

    movq_m2r(*mask, mm7);        // mm7 = threshold replicated in all 4 words
    movq_m2r(s_neg, mm6);        // mm6 = all ones, used to complement diffs
    pxor_r2r(mm5, mm5);          // mm5 = accumulated "over threshold" flags

    // 8 iterations x 2 quadwords x 4 words = all 64 coefficients.
    for(int i=0; i<8; i++)
    {
        movq_m2r(*(mblock++), mm0);
        movq_m2r(*(mblock++), mm2);
        movq_m2r(*(mold++), mm1);
        movq_m2r(*(mold++), mm3);
        psubsw_r2r(mm1, mm0);    // mm0 = new - old (saturating), words 0-3
        psubsw_r2r(mm3, mm2);    // mm2 = new - old, words 4-7
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        pcmpgtw_r2r(mm7, mm0);   // flag words where diff > threshold
        pcmpgtw_r2r(mm7, mm2);
        pxor_r2r(mm6, mm1);      // one's complement: ~d = -d - 1, so the
        pxor_r2r(mm6, mm3);      //   next compare flags diff < -(threshold+1)
        pcmpgtw_r2r(mm7, mm1);
        pcmpgtw_r2r(mm7, mm3);
        por_r2r(mm0, mm5);       // fold all flags into the accumulator
        por_r2r(mm2, mm5);
        por_r2r(mm1, mm5);
        por_r2r(mm3, mm5);
    }
    movq_r2m(mm5, result);

    // Any set bit means some coefficient moved too far: refresh the stored
    // reference block and report "changed".
    if (result.q)
    {
        std::copy(rblock.cbegin(), rblock.cend(), _old);
        return 0;
    }
    return 1;
}
3080 
3081 #else
3082 int RTjpeg::bcomp(RTjpegData16 &rblock, int16_t *_old, uint16_t *mask)
3083 {
3084  for(int i=0; i<64; i++)
3085  if (abs(_old[i]-rblock[i])>*mask)
3086  {
3087  std::copy(rblock.cbegin(), rblock.cend(), _old);
3088  return 0;
3089  }
3090  return 1;
3091 }
3092 #endif
3093 
3094 inline int RTjpeg::mcompressYUV420(int8_t *sp, uint8_t **planes)
3095 {
3096  uint8_t * bp = planes[0];
3097  uint8_t * bp1 = bp + (m_width<<3);
3098  uint8_t * bp2 = planes[1];
3099  uint8_t * bp3 = planes[2];
3100  int8_t * sb = sp;
3101  int16_t * lblock = m_old;
3102 
3103 /* Y */
3104  for(int i = m_height>>1; i; i-=8)
3105  {
3106  for(int j=0, k=0; j < m_width; j+=16, k+=8)
3107  {
3108  DctY(bp+j, m_yWidth);
3109  Quant(m_block, m_lqt);
3110  if (bcomp(m_block, lblock, &m_lMask))
3111  {
3112  *((uint8_t *)sp++)=255;
3113  }
3114  else sp+=b2s(m_block, sp, m_lB8);
3115  lblock += 64;
3116 
3117  DctY(bp+j+8, m_yWidth);
3118  Quant(m_block, m_lqt);
3119  if (bcomp(m_block, lblock, &m_lMask))
3120  {
3121  *((uint8_t *)sp++)=255;
3122  }
3123  else sp += b2s(m_block, sp, m_lB8);
3124  lblock += 64;
3125 
3126  DctY(bp1+j, m_yWidth);
3127  Quant(m_block, m_lqt);
3128  if (bcomp(m_block, lblock, &m_lMask))
3129  {
3130  *((uint8_t *)sp++)=255;
3131  }
3132  else sp += b2s(m_block, sp, m_lB8);
3133  lblock += 64;
3134 
3135  DctY(bp1+j+8, m_yWidth);
3136  Quant(m_block, m_lqt);
3137  if (bcomp(m_block, lblock, &m_lMask))
3138  {
3139  *((uint8_t *)sp++)=255;
3140  }
3141  else sp += b2s(m_block, sp, m_lB8);
3142  lblock += 64;
3143 
3144  DctY(bp2+k, m_cWidth);
3145  Quant(m_block, m_cqt);
3146  if (bcomp(m_block, lblock, &m_cMask))
3147  {
3148  *((uint8_t *)sp++)=255;
3149  }
3150  else
3151  sp+=b2s(m_block, sp, m_cB8);
3152  lblock+=64;
3153 
3154  DctY(bp3+k, m_cWidth);
3155  Quant(m_block, m_cqt);
3156  if (bcomp(m_block, lblock, &m_cMask))
3157  {
3158  *((uint8_t *)sp++)=255;
3159  }
3160  else
3161  sp+=b2s(m_block, sp, m_cB8);
3162  lblock+=64;
3163  }
3164  bp += m_width<<4;
3165  bp1 += m_width<<4;
3166  bp2 += m_width<<2;
3167  bp3 += m_width<<2;
3168  }
3169 #ifdef MMX
3170  emms();
3171 #endif
3172  return (sp-sb);
3173 }
3174 
3175 
3176 inline int RTjpeg::mcompressYUV422(int8_t *sp, uint8_t **planes)
3177 {
3178  uint8_t * bp = planes[0];
3179  uint8_t * bp2 = planes[1];
3180  uint8_t * bp3 = planes[2];
3181  int8_t * sb=sp;
3182  int16_t *lblock = m_old;
3183 
3184  for(int i = m_height; i; i-=8)
3185  {
3186  for(int j=0, k=0; j<m_width; j+=16, k+=8)
3187  {
3188  DctY(bp+j, m_yWidth);
3189  Quant(m_block, m_lqt);
3190  if (bcomp(m_block, lblock, &m_lMask))
3191  {
3192  *((uint8_t *)sp++)=255;
3193  }
3194  else sp+=b2s(m_block, sp, m_lB8);
3195  lblock+=64;
3196 
3197  DctY(bp+j+8, m_yWidth);
3198  Quant(m_block, m_lqt);
3199  if (bcomp(m_block, lblock, &m_lMask))
3200  {
3201  *((uint8_t *)sp++)=255;
3202  }
3203  else sp+=b2s(m_block, sp, m_lB8);
3204  lblock+=64;
3205 
3206  DctY(bp2+k, m_cWidth);
3207  Quant(m_block, m_cqt);
3208  if (bcomp(m_block, lblock, &m_cMask))
3209  {
3210  *((uint8_t *)sp++)=255;
3211  }
3212  else sp+=b2s(m_block, sp, m_cB8);
3213  lblock+=64;
3214 
3215  DctY(bp3+k, m_cWidth);
3216  Quant(m_block, m_cqt);
3217  if (bcomp(m_block, lblock, &m_cMask))
3218  {
3219  *((uint8_t *)sp++)=255;
3220  }
3221  else sp+=b2s(m_block, sp, m_cB8);
3222  lblock+=64;
3223 
3224  }
3225  bp += m_width<<3;
3226  bp2 += m_width<<2;
3227  bp3 += m_width<<2;
3228  }
3229 #ifdef MMX
3230  emms();
3231 #endif
3232  return (sp-sb);
3233 }
3234 
3235 inline int RTjpeg::mcompress8(int8_t *sp, uint8_t **planes)
3236 {
3237  uint8_t * bp = planes[0];
3238  int8_t * sb = sp;
3239  int16_t *lblock = m_old;
3240 
3241  for(int i=0; i<m_height; i+=8)
3242  {
3243  for(int j=0; j<m_width; j+=8)
3244  {
3245  DctY(bp+j, m_width);
3246  Quant(m_block, m_lqt);
3247  if (bcomp(m_block, lblock, &m_lMask))
3248  {
3249  *((uint8_t *)sp++)=255;
3250  } else sp+=b2s(m_block, sp, m_lB8);
3251  lblock+=64;
3252  }
3253  bp+=m_width<<3;
3254  }
3255 #ifdef MMX
3256  emms();
3257 #endif
3258  return (sp-sb);
3259 }
3260 
3262 {
3263  m_keyCount = 0;
3264 }
3265 
3266 int RTjpeg::Compress(int8_t *sp, uint8_t **planes)
3267 {
3268  auto * fh = reinterpret_cast<RTjpeg_frameheader *>(sp);
3269  int ds = 0;
3270 
3271  if (m_keyRate == 0)
3272  {
3273  switch(m_f)
3274  {
3275  case RTJ_YUV420: ds = compressYUV420((int8_t*)&(fh->data), planes); break;
3276  case RTJ_YUV422: ds = compressYUV422((int8_t*)&(fh->data), planes); break;
3277  case RTJ_RGB8: ds = compress8((int8_t*)&(fh->data), planes); break;
3278  }
3279  fh->key = 0;
3280  } else {
3281  if (m_keyCount == 0)
3282  memset(m_old, 0, (4_UZ * m_width * m_height));
3283  switch(m_f)
3284  {
3285  case RTJ_YUV420: ds = mcompressYUV420((int8_t*)&(fh->data), planes); break;
3286  case RTJ_YUV422: ds = mcompressYUV422((int8_t*)&(fh->data), planes); break;
3287  case RTJ_RGB8: ds = mcompress8((int8_t*)&(fh->data), planes); break;
3288  }
3289  fh->key = m_keyCount;
3290  if (++m_keyCount > m_keyRate)
3291  m_keyCount = 0;
3292  }
3293  ds += RTJPEG_HEADER_SIZE;
3294  fh->framesize = qToLittleEndian<qint32>(ds);
3295  fh->headersize = RTJPEG_HEADER_SIZE;
3296  fh->version = RTJPEG_FILE_VERSION;
3297  fh->width = qToLittleEndian<qint16>(m_width);
3298  fh->height = qToLittleEndian<qint16>(m_height);
3299  fh->quality = m_q;
3300  return ds;
3301 }
3302 
3303 void RTjpeg::Decompress(int8_t *sp, uint8_t **planes)
3304 {
3305  auto * fh = reinterpret_cast<RTjpeg_frameheader *>(sp);
3306 
3307  if ((qFromLittleEndian<qint16>(fh->width) != m_width)||
3308  (qFromLittleEndian<qint16>(fh->height) != m_height))
3309  {
3310  int w = qFromLittleEndian<qint16>(fh->width);
3311  int h = qFromLittleEndian<qint16>(fh->height);
3312  SetSize(&w, &h);
3313  }
3314  if (fh->quality != m_q)
3315  {
3316  int q = fh->quality;
3317  SetQuality(&q);
3318  }
3319  switch(m_f)
3320  {
3321  case RTJ_YUV420: decompressYUV420((int8_t*)&(fh->data), planes); break;
3322  case RTJ_YUV422: decompressYUV422((int8_t*)&(fh->data), planes); break;
3323  case RTJ_RGB8: decompress8((int8_t*)&(fh->data), planes); break;
3324  }
3325 }
RTjpeg::mcompressYUV422
int mcompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3176
RTjpeg_chrom_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_chrom_quant_tbl
Definition: RTjpegN.cpp:88
RTjpeg::s2b
static int s2b(RTjpegData16 &data, const int8_t *strm, uint8_t bt8, RTjpegData32 &qtbla)
Definition: RTjpegN.cpp:284
RTjpeg::DctY
void DctY(uint8_t *idata, int rskip)
Definition: RTjpegN.cpp:602
RTjpeg_lum_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_lum_quant_tbl
Definition: RTjpegN.cpp:77
RTjpegData16
std::array< int16_t, 64 > RTjpegData16
Definition: RTjpegN.h:38
RTjpeg::m_lB8
int32_t m_lB8
Definition: RTjpegN.h:108
RTjpeg::m_yWidth
int32_t m_yWidth
Definition: RTjpegN.h:110
RTjpeg::compress8
int compress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2875
RTjpeg::m_lMask
mmx_t m_lMask
Definition: RTjpegN.h:122
RTjpeg::m_ySize
int32_t m_ySize
Definition: RTjpegN.h:112
RTjpeg::b2s
static int b2s(const RTjpegData16 &data, int8_t *strm, uint8_t bt8)
Definition: RTjpegN.cpp:116
RTjpeg_C4
static mmx_t RTjpeg_C4
Definition: RTjpegN.cpp:39
RTjpeg::SetNextKey
void SetNextKey(void)
Definition: RTjpegN.cpp:3261
RTjpeg::m_q
int m_q
Definition: RTjpegN.h:119
RTjpeg_frameheader
Definition: RTjpegN.h:131
RTJ_RGB8
@ RTJ_RGB8
Definition: RTjpegN.h:50
RTjpeg_C6
static mmx_t RTjpeg_C6
Definition: RTjpegN.cpp:40
RTjpeg::m_height
int m_height
Definition: RTjpegN.h:118
tmp
static guint32 * tmp
Definition: goom_core.cpp:26
RTjpeg_half
static mmx_t RTjpeg_half
Definition: RTjpegN.cpp:38
RTjpeg::m_cB8
int32_t m_cB8
Definition: RTjpegN.h:109
RTjpeg::DctInit
void DctInit(void)
Definition: RTjpegN.cpp:593
RTjpeg::CalcTbls
void CalcTbls(void)
Definition: RTjpegN.cpp:2644
RTjpeg::QuantInit
void QuantInit(void)
Definition: RTjpegN.cpp:518
RTJPEG_FILE_VERSION
static constexpr uint8_t RTJPEG_FILE_VERSION
Definition: RTjpegN.h:35
MythFile::copy
MBASE_PUBLIC long long copy(QFile &dst, QFile &src, uint block_size=0)
Copies src file to dst file.
Definition: mythmiscutil.cpp:263
RTjpeg::bcomp
static int bcomp(RTjpegData16 &rblock, int16_t *old, mmx_t *mask)
Definition: RTjpegN.cpp:3039
RTJ_YUV420
@ RTJ_YUV420
Definition: RTjpegN.h:48
RTjpeg_C2mC6
static mmx_t RTjpeg_C2mC6
Definition: RTjpegN.cpp:41
RTjpeg_ones
static mmx_t RTjpeg_ones
Definition: RTjpegN.cpp:37
RTjpeg::mcompress8
int mcompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3235
RTjpeg::mcompressYUV420
int mcompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3094
RTjpeg::compressYUV420
int compressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2781
RTjpeg_ZZ
static const std::array< const uint8_t, 64 > RTjpeg_ZZ
Definition: RTjpegN.cpp:49
RTjpeg::decompress8
void decompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3013
RTjpeg::m_cWidth
int32_t m_cWidth
Definition: RTjpegN.h:111
hardwareprofile.distros.mythtv_data.main.stdout
stdout
Definition: main.py:87
RTJ_YUV422
@ RTJ_YUV422
Definition: RTjpegN.h:49
RTjpeg::Idct
void Idct(uint8_t *odata, RTjpegData16 &data, int rskip)
Definition: RTjpegN.cpp:1526
RTjpeg::m_width
int m_width
Definition: RTjpegN.h:117
RTjpeg::~RTjpeg
~RTjpeg()
Definition: RTjpegN.cpp:2776
RTjpeg::m_ws
std::array< int32_t, 64_UZ *4 > m_ws
Definition: RTjpegN.h:103
RTjpeg::Quant
static void Quant(RTjpegData16 &block, RTjpegData32 &qtbl)
Definition: RTjpegN.cpp:533
RTjpeg::m_block
RTjpegData16 m_block
Definition: RTjpegN.h:102
RTjpeg::decompressYUV420
void decompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2951
RTjpeg_C2pC6
static mmx_t RTjpeg_C2pC6
Definition: RTjpegN.cpp:42
RTjpeg::SetIntra
int SetIntra(int *key, int *lm, int *cm)
Definition: RTjpegN.cpp:2726
bbciplayer.stderr
stderr
Definition: bbciplayer.py:199
RTjpeg::m_lqt
RTjpegData32 m_lqt
Definition: RTjpegN.h:104
RTjpeg::m_keyCount
int m_keyCount
Definition: RTjpegN.h:115
RTjpeg::SetFormat
int SetFormat(const int *fmt)
Definition: RTjpegN.cpp:2692
RTjpeg::m_ciqt
RTjpegData32 m_ciqt
Definition: RTjpegN.h:107
RTjpeg::m_old
int16_t * m_old
Definition: RTjpegN.h:114
RTjpeg::m_keyRate
int m_keyRate
Definition: RTjpegN.h:128
RTjpeg_zero
static mmx_t RTjpeg_zero
Definition: RTjpegN.cpp:43
RTjpeg::SetSize
int SetSize(const int *w, const int *h)
Definition: RTjpegN.cpp:2698
RTjpeg::IdctInit
void IdctInit(void)
Definition: RTjpegN.cpp:1517
RTjpeg::decompressYUV422
void decompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2903
RTjpeg::Decompress
void Decompress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3303
RTjpeg::compressYUV422
int compressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2832
uint16_t
unsigned short uint16_t
Definition: iso6937tables.h:3
RTjpeg::m_cSize
int32_t m_cSize
Definition: RTjpegN.h:113
RTjpeg::m_cMask
mmx_t m_cMask
Definition: RTjpegN.h:123
RTjpegN.h
RTjpeg::m_cqt
RTjpegData32 m_cqt
Definition: RTjpegN.h:105
RTjpeg_aan_tab
static const std::array< const uint64_t, 64 > RTjpeg_aan_tab
Definition: RTjpegN.cpp:66
RTjpeg::m_liqt
RTjpegData32 m_liqt
Definition: RTjpegN.h:106
RTjpeg::SetQuality
int SetQuality(int *quality)
Definition: RTjpegN.cpp:2675
RTjpeg::Compress
int Compress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3266
RTjpegData32
std::array< int32_t, 64 > RTjpegData32
Definition: RTjpegN.h:39
RTjpeg::m_f
int m_f
Definition: RTjpegN.h:120
RTJPEG_HEADER_SIZE
static constexpr uint8_t RTJPEG_HEADER_SIZE
Definition: RTjpegN.h:36
RTjpeg::RTjpeg
RTjpeg()
Definition: RTjpegN.cpp:2763