// MythTV (master) — RTjpegN.cpp
// Doxygen source listing of this file; see the MythTV repository for the authoritative copy.
1 /*
2  RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3 
4  With modifications by:
5  (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6  and
7  (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation; either version 2 of the License, or
12  (at your option) any later version.
13 
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 */
24 
25 #include <algorithm>
26 #include <array>
27 #include <cstdio>
28 #include <cstdlib>
29 #include <cstring>
30 #include <limits> // workaround QTBUG-90395
31 #include "RTjpegN.h"
32 
33 #include <QtGlobal>
34 #include <QtEndian>
35 
36 #ifdef MMX
37 static mmx_t RTjpeg_ones;
38 static mmx_t RTjpeg_half;
39 static mmx_t RTjpeg_C4;
40 static mmx_t RTjpeg_C6;
41 static mmx_t RTjpeg_C2mC6;
42 static mmx_t RTjpeg_C2pC6;
43 static mmx_t RTjpeg_zero;
44 #endif
45 
46 //#define SHOWBLOCK 1 // NOLINT(cppcoreguidelines-macro-usage)
47 #define BETTERCOMPRESSION 1 // NOLINT(cppcoreguidelines-macro-usage)
48 
/* Zig-zag scan order: RTjpeg_ZZ[n] is the row-major index within an 8x8
   block of the n-th coefficient in scan order (DC first, then diagonals). */
static const std::array<const uint8_t,64> RTjpeg_ZZ {
     0,
     8,  1,
     2,  9, 16,
    24, 17, 10,  3,
     4, 11, 18, 25, 32,
    40, 33, 26, 19, 12,  5,
     6, 13, 20, 27, 34, 41, 48,
    56, 49, 42, 35, 28, 21, 14,  7,
    15, 22, 29, 36, 43, 50, 57,
    58, 51, 44, 37, 30, 23,
    31, 38, 45, 52, 59,
    60, 53, 46, 39,
    47, 54, 61,
    62, 55,
    63 };

/* Per-coefficient scaling factors for the AAN (Arai-Agui-Nakajima) DCT,
   stored as 32.32 fixed point (4294967296 == 1.0).  They are divided into
   the quantisation tables at init time (see the `<< 32` divide below).
   NOTE(review): presumably the outer product of the AAN 1-D row factors —
   confirm against jfdctfst.c before relying on that interpretation. */
static const std::array<const uint64_t,64> RTjpeg_aan_tab {
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
    5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
    5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL,  931136000ULL,
    2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL,  641204288ULL,
    1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL,  931136000ULL,  641204288ULL,  326894240ULL,
};

/* Default luminance quantisation table (ITU-T T.81 / JPEG Annex K, table K.1). */
static const std::array<const uint8_t,64> RTjpeg_lum_quant_tbl {
    16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99
};

/* Default chrominance quantisation table (ITU-T T.81 / JPEG Annex K, table K.2). */
static const std::array<const uint8_t,64> RTjpeg_chrom_quant_tbl {
    17,  18,  24,  47,  99,  99,  99,  99,
    18,  21,  26,  66,  99,  99,  99,  99,
    24,  26,  56,  99,  99,  99,  99,  99,
    47,  66,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99,
    99,  99,  99,  99,  99,  99,  99,  99
};
98 
99 #ifdef BETTERCOMPRESSION
100 
101 /*--------------------------------------------------*/
102 /* better encoding, but needs a lot more cpu time */
103 /* seems to be more effective than old method +lzo */
104 /* with this encoding lzo isn't efficient anymore */
105 /* there is still more potential for better */
106 /* encoding but that would need even more cputime */
107 /* anyway your mileage may vary */
108 /* */
109 /* written by Martin BIELY and Roman HOCHLEITNER */
110 /*--------------------------------------------------*/
111 
112 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
113 /* Block to Stream (encoding) */
114 /* */
115 
116 int RTjpeg::b2s(const RTjpegData16 &data, int8_t *strm, uint8_t /*bt8*/)
117 {
118  int co=1;
119 
120  auto *ustrm = (uint8_t *)strm;
121 #ifdef SHOWBLOCK
122 
123  int ii;
124  for (ii=0; ii < 64; ii++) {
125  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
126  }
127  fprintf(stdout, "\n\n");
128 
129 #endif
130 
131 // *strm++ = 0x10;
132 // *strm = 0x00;
133 //
134 // return 2;
135 
136  // first byte allways written
137  int32_t value = data[RTjpeg_ZZ[0]];
138  ustrm[0]= static_cast<uint8_t>(std::clamp(value, 0, 254));
139 
140  int ci=63;
141  while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
142 
143  unsigned char bitten = ((unsigned char)ci) << 2;
144 
145  if (ci==0) {
146  ustrm[1]= bitten;
147  co = 2;
148  return co;
149  }
150 
151  /* bitoff=0 because the high 6bit contain first non zero position */
152  unsigned char bitoff = 0;
153  co = 1;
154 
155  for(; ci>0; ci--) {
156 
157  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
158 
159  switch(ZZvalue) {
160  case 0:
161  break;
162  case 1:
163  bitten |= (0x01<<bitoff);
164  break;
165  case -1:
166  bitten |= (0x03<<bitoff);
167  break;
168  default:
169  bitten |= (0x02<<bitoff);
170  goto HERZWEH;
171  break;
172  }
173 
174  if ( bitoff == 0 ) {
175  ustrm[co]= bitten;
176  bitten = 0;
177  bitoff = 8;
178  co++;
179  } /* "fall through" */
180  bitoff-=2;
181 
182  }
183 
184  /* ci must be 0 */
185  if (bitoff != 6) {
186 
187  ustrm[co]= bitten;
188  co++;
189 
190  }
191  goto BAUCHWEH;
192 
193 HERZWEH:
194 /* ci cannot be 0 */
195 /* correct bitoff to nibble boundaries */
196 
197  switch(bitoff){
198  case 4:
199  case 6:
200  bitoff = 0;
201  break;
202  case 2:
203  case 0:
204  ustrm[co]= bitten;
205  bitoff = 4;
206  co++;
207  bitten = 0; // clear half nibble values in bitten
208  break;
209  default:
210  break;
211  }
212 
213  for(; ci>0; ci--) {
214 
215  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
216 
217  if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
218  bitten |= (0x08<<bitoff);
219  goto HIRNWEH;
220  }
221 
222  bitten |= (ZZvalue&0xf)<<bitoff;
223 
224  if ( bitoff == 0 ) {
225  ustrm[co]= bitten;
226  bitten = 0;
227  bitoff = 8;
228  co++;
229  } /* "fall thru" */
230  bitoff-=4;
231  }
232 
233  /* ci must be 0 */
234  if ( bitoff == 0 ) {
235  ustrm[co]= bitten;
236  co++;
237  }
238  goto BAUCHWEH;
239 
240 HIRNWEH:
241 
242  ustrm[co]= bitten;
243  co++;
244 
245 
246  /* bitting is over now we bite */
247  for(; ci>0; ci--) {
248 
249  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
250 
251  if (ZZvalue>0)
252  {
253  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
254  }
255  else
256  {
257  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
258  }
259 
260  }
261 
262 
263 BAUCHWEH:
264  /* we gotoo much now we are ill */
265 #ifdef SHOWBLOCK
266 {
267 int i;
268 fprintf(stdout, "\nco = '%d'\n", co);
269  for (i=0; i < co+2; i++) {
270  fprintf(stdout, "%d ", strm[i]);
271  }
272 fprintf(stdout, "\n\n");
273 }
274 #endif
275 
276  return co;
277 }
278 
279 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
280 /* Stream to Block (decoding) */
281 /* */
282 
283 int RTjpeg::s2b(RTjpegData16 &data, const int8_t *strm, uint8_t /*bt8*/, RTjpegData32 &qtbla)
284 {
285  auto *qtbl = (uint32_t *)qtbla.data();
286  int ci = 0;
287  unsigned char bitoff = 0;
288 
289  /* first byte always read */
290  int i=RTjpeg_ZZ[0];
291  data[i]=((uint8_t)strm[0])*qtbl[i];
292 
293  /* we start at the behind */
294 
295  unsigned char bitten = ((unsigned char)strm[1]) >> 2;
296  int co = 63;
297  for(; co > bitten; co--) {
298 
299  data[RTjpeg_ZZ[co]] = 0;
300 
301  }
302 
303  if (co==0) {
304  ci = 2;
305  goto AUTOBAHN;
306  }
307 
308  /* we have to read the last 2 bits of the second byte */
309  ci=1;
310  bitoff = 0;
311 
312  for(; co>0; co--) {
313 
314  bitten = ((unsigned char)strm[ci]) >> bitoff;
315  bitten &= 0x03;
316 
317  i=RTjpeg_ZZ[co];
318 
319  switch( bitten ) {
320  case 0x03:
321  data[i]= -qtbl[i];
322  break;
323  case 0x02:
324  goto FUSSWEG;
325  break;
326  case 0x01:
327  data[i]= qtbl[i];
328  break;
329  case 0x00:
330  data[i]= 0;
331  break;
332  default:
333  break;
334  }
335 
336  if ( bitoff == 0 ) {
337  bitoff = 8;
338  ci++;
339  }
340  bitoff -= 2;
341  }
342  /* co is 0 now */
343  /* data is written properly */
344 
345  /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
346  if (bitoff!=6) ci++;
347 
348  goto AUTOBAHN;
349 
350 
351 FUSSWEG:
352 /* correct bitoff to nibble */
353  switch(bitoff){
354  case 4:
355  case 6:
356  bitoff = 0;
357  break;
358  case 2:
359  case 0:
360  /* we have to read from the next byte */
361  ci++;
362  bitoff = 4;
363  break;
364  default:
365  break;
366  }
367 
368  for(; co>0; co--) {
369 
370  bitten = ((unsigned char)strm[ci]) >> bitoff;
371  bitten &= 0x0f;
372 
373  i=RTjpeg_ZZ[co];
374 
375  if ( bitten == 0x08 ) {
376  goto STRASSE;
377  }
378 
379  /* the compiler cannot do sign extension for signed nibbles */
380  if ( bitten & 0x08 ) {
381  bitten |= 0xf0;
382  }
383  /* the unsigned char bitten now is a valid signed char */
384 
385  data[i]=((signed char)bitten)*qtbl[i];
386 
387  if ( bitoff == 0 ) {
388  bitoff = 8;
389  ci++;
390  }
391  bitoff -= 4;
392  }
393  /* co is 0 */
394 
395  /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
396  if (bitoff!=4) ci++;
397 
398  goto AUTOBAHN;
399 
400 STRASSE:
401  ci++;
402 
403  for(; co>0; co--) {
404  i=RTjpeg_ZZ[co];
405  data[i]=strm[ci++]*qtbl[i];
406  }
407 
408  /* ci now is the count, because it points to next element => no incrementing */
409 
410 AUTOBAHN:
411 
412 #ifdef SHOWBLOCK
413 fprintf(stdout, "\nci = '%d'\n", ci);
414  for (i=0; i < 64; i++) {
415  fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
416  }
417 fprintf(stdout, "\n\n");
418 #endif
419 
420  return ci;
421 }
422 
423 #else
424 
425 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
426 {
427  register int ci, co=1, tmp;
428  register int16_t ZZvalue;
429 
430 #ifdef SHOWBLOCK
431 
432  int ii;
433  for (ii=0; ii < 64; ii++) {
434  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
435  }
436  fprintf(stdout, "\n\n");
437 
438 #endif
439 
440  (uint8_t)strm[0]=(uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
441 
442  for(ci=1; ci<=bt8; ci++)
443  {
444  ZZvalue = data[RTjpeg_ZZ[ci]];
445 
446  if (ZZvalue>0)
447  {
448  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
449  }
450  else
451  {
452  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
453  }
454  }
455 
456  for(; ci<64; ci++)
457  {
458  ZZvalue = data[RTjpeg_ZZ[ci]];
459 
460  if (ZZvalue>0)
461  {
462  strm[co++]=(int8_t)(ZZvalue>63)?63:ZZvalue;
463  }
464  else if (ZZvalue<0)
465  {
466  strm[co++]=(int8_t)(ZZvalue<-64)?-64:ZZvalue;
467  }
468  else /* compress zeros */
469  {
470  tmp=ci;
471  do
472  {
473  ci++;
474  } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
475 
476  strm[co++]=(int8_t)(63+(ci-tmp));
477  ci--;
478  }
479  }
480  return (int)co;
481 }
482 
483 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
484 {
485  uint32_t *qtbl = (uint32_t *)qtbla;
486  int ci=1, co=1, tmp;
487  register int i;
488 
489  i=RTjpeg_ZZ[0];
490  data[i]=((uint8_t)strm[0])*qtbl[i];
491 
492  for(co=1; co<=bt8; co++)
493  {
494  i=RTjpeg_ZZ[co];
495  data[i]=strm[ci++]*qtbl[i];
496  }
497 
498  for(; co<64; co++)
499  {
500  if (strm[ci]>63)
501  {
502  tmp=co+strm[ci]-63;
503  for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
504  co--;
505  } else
506  {
507  i=RTjpeg_ZZ[co];
508  data[i]=strm[ci]*qtbl[i];
509  }
510  ci++;
511  }
512  return (int)ci;
513 }
514 #endif
515 
516 #ifdef MMX
518 {
519  using P16_32 = union { int16_t *m_int16; int32_t *m_int32; };
520  P16_32 qtbl;
521 
522  qtbl.m_int32 = m_lqt.data();
523  for (int i = 0; i < 64; i++)
524  qtbl.m_int16[i] = static_cast<int16_t>(m_lqt[i]);
525 
526  // cppcheck-suppress redundantAssignment
527  qtbl.m_int32 = m_cqt.data();
528  for (int i = 0; i < 64; i++)
529  qtbl.m_int16[i] = static_cast<int16_t>(m_cqt[i]);
530 }
531 
533 {
534  auto *ql=(mmx_t *)qtbl.data();
535  auto *bl=(mmx_t *)_block.data();
536 
537  movq_m2r(RTjpeg_ones, mm6);
538  movq_m2r(RTjpeg_half, mm7);
539 
540  for(int i=16; i; i--)
541  {
542  movq_m2r(*(ql++), mm0); /* quant vals (4) */
543  movq_m2r(*bl, mm2); /* block vals (4) */
544  movq_r2r(mm0, mm1);
545  movq_r2r(mm2, mm3);
546 
547  punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
548  punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
549 
550  punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
551  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
552 
553  pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
554  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
555 
556  psrad_i2r(16, mm0);
557  psrad_i2r(16, mm1);
558 
559  packssdw_r2r(mm1, mm0);
560 
561  movq_r2m(mm0, *(bl++));
562  }
563 }
564 #else
// Non-MMX build: Quant() below reads the 32-bit quantisation tables
// directly, so no per-instance initialisation is needed here.  (The MMX
// build repacks the tables to 16-bit in its version of this function.)
void RTjpeg::QuantInit()
{
}
568 
569 void RTjpeg::Quant(RTjpegData16 &_block, RTjpegData32 &qtbl)
570 {
571  int i;
572 
573  for(i=0; i<64; i++)
574  _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
575 }
576 #endif
577 
578 /*
579  * Perform the forward DCT on one block of samples.
580  */
581 #ifndef MMX
582 static constexpr int32_t FIX_0_382683433 { 98 }; /* FIX(0.382683433) */
583 static constexpr int32_t FIX_0_541196100 { 139 }; /* FIX(0.541196100) */
584 static constexpr int32_t FIX_0_707106781 { 181 }; /* FIX(0.707106781) */
585 static constexpr int32_t FIX_1_306562965 { 334 }; /* FIX(1.306562965) */
586 
587 static constexpr int16_t DESCALE10(int32_t x) { return static_cast<int16_t>((x+128) >> 8); };
588 static constexpr int16_t DESCALE20(int32_t x) { return static_cast<int16_t>((x+32768) >> 16); };
589 static constexpr int32_t D_MULTIPLY(int32_t var, int32_t constant) { return var * constant; };
590 #endif
591 
593 {
594  for (int i = 0; i < 64; i++)
595  {
596  m_lqt[i] = (((uint64_t)m_lqt[i] << 32) / RTjpeg_aan_tab[i]);
597  m_cqt[i] = (((uint64_t)m_cqt[i] << 32) / RTjpeg_aan_tab[i]);
598  }
599 }
600 
601 void RTjpeg::DctY(uint8_t *idata, int rskip)
602 {
603 #ifndef MMX
604  uint8_t *idataptr = idata;
605  int32_t *wsptr = m_ws.data();
606 
607  for (int ctr = 7; ctr >= 0; ctr--) {
608  int32_t tmp0 = idataptr[0] + idataptr[7];
609  int32_t tmp7 = idataptr[0] - idataptr[7];
610  int32_t tmp1 = idataptr[1] + idataptr[6];
611  int32_t tmp6 = idataptr[1] - idataptr[6];
612  int32_t tmp2 = idataptr[2] + idataptr[5];
613  int32_t tmp5 = idataptr[2] - idataptr[5];
614  int32_t tmp3 = idataptr[3] + idataptr[4];
615  int32_t tmp4 = idataptr[3] - idataptr[4];
616 
617  int32_t tmp10 = (tmp0 + tmp3); /* phase 2 */
618  int32_t tmp13 = tmp0 - tmp3;
619  int32_t tmp11 = (tmp1 + tmp2);
620  int32_t tmp12 = tmp1 - tmp2;
621 
622  wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
623  wsptr[4] = (tmp10 - tmp11)<<8;
624 
625  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
626  wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
627  wsptr[6] = (tmp13<<8) - z1;
628 
629  tmp10 = tmp4 + tmp5; /* phase 2 */
630  tmp11 = tmp5 + tmp6;
631  tmp12 = tmp6 + tmp7;
632 
633  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
634  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
635  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
636  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
637 
638  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
639  int32_t z13 = (tmp7<<8) - z3;
640 
641  wsptr[5] = z13 + z2; /* phase 6 */
642  wsptr[3] = z13 - z2;
643  wsptr[1] = z11 + z4;
644  wsptr[7] = z11 - z4;
645 
646  idataptr += rskip<<3; /* advance pointer to next row */
647  wsptr += 8;
648  }
649 
650  wsptr = m_ws.data();
651  int16_t *odataptr = m_block.data();
652  for (int ctr = 7; ctr >= 0; ctr--) {
653  int32_t tmp0 = wsptr[0] + wsptr[56];
654  int32_t tmp7 = wsptr[0] - wsptr[56];
655  int32_t tmp1 = wsptr[8] + wsptr[48];
656  int32_t tmp6 = wsptr[8] - wsptr[48];
657  int32_t tmp2 = wsptr[16] + wsptr[40];
658  int32_t tmp5 = wsptr[16] - wsptr[40];
659  int32_t tmp3 = wsptr[24] + wsptr[32];
660  int32_t tmp4 = wsptr[24] - wsptr[32];
661 
662  int32_t tmp10 = tmp0 + tmp3; /* phase 2 */
663  int32_t tmp13 = tmp0 - tmp3;
664  int32_t tmp11 = tmp1 + tmp2;
665  int32_t tmp12 = tmp1 - tmp2;
666 
667  odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
668  odataptr[32] = DESCALE10(tmp10 - tmp11);
669 
670  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
671  odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
672  odataptr[48] = DESCALE20((tmp13<<8) - z1);
673 
674  tmp10 = tmp4 + tmp5; /* phase 2 */
675  tmp11 = tmp5 + tmp6;
676  tmp12 = tmp6 + tmp7;
677 
678  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
679  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
680  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
681  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
682 
683  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
684  int32_t z13 = (tmp7<<8) - z3;
685 
686  odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
687  odataptr[24] = DESCALE20(z13 - z2);
688  odataptr[8] = DESCALE20(z11 + z4);
689  odataptr[56] = DESCALE20(z11 - z4);
690 
691  odataptr++; /* advance pointer to next column */
692  wsptr++;
693 
694  }
695 #else
696  volatile mmx_t tmp6 {};
697  volatile mmx_t tmp7 {};
698  auto *dataptr = (mmx_t *)m_block.data();
699  auto *idata2 = (mmx_t *)idata;
700 
701 
702  // first copy the input 8 bit to the destination 16 bits
703 
704  movq_m2r(RTjpeg_zero, mm2);
705 
706  movq_m2r(*idata2, mm0);
707  movq_r2r(mm0, mm1);
708 
709  punpcklbw_r2r(mm2, mm0);
710  movq_r2m(mm0, *(dataptr));
711 
712  punpckhbw_r2r(mm2, mm1);
713  movq_r2m(mm1, *(dataptr+1));
714 
715  idata2 += rskip;
716 
717  movq_m2r(*idata2, mm0);
718  movq_r2r(mm0, mm1);
719 
720  punpcklbw_r2r(mm2, mm0);
721  movq_r2m(mm0, *(dataptr+2));
722 
723  punpckhbw_r2r(mm2, mm1);
724  movq_r2m(mm1, *(dataptr+3));
725 
726  idata2 += rskip;
727 
728  movq_m2r(*idata2, mm0);
729  movq_r2r(mm0, mm1);
730 
731  punpcklbw_r2r(mm2, mm0);
732  movq_r2m(mm0, *(dataptr+4));
733 
734  punpckhbw_r2r(mm2, mm1);
735  movq_r2m(mm1, *(dataptr+5));
736 
737  idata2 += rskip;
738 
739  movq_m2r(*idata2, mm0);
740  movq_r2r(mm0, mm1);
741 
742  punpcklbw_r2r(mm2, mm0);
743  movq_r2m(mm0, *(dataptr+6));
744 
745  punpckhbw_r2r(mm2, mm1);
746  movq_r2m(mm1, *(dataptr+7));
747 
748  idata2 += rskip;
749 
750  movq_m2r(*idata2, mm0);
751  movq_r2r(mm0, mm1);
752 
753  punpcklbw_r2r(mm2, mm0);
754  movq_r2m(mm0, *(dataptr+8));
755 
756  punpckhbw_r2r(mm2, mm1);
757  movq_r2m(mm1, *(dataptr+9));
758 
759  idata2 += rskip;
760 
761  movq_m2r(*idata2, mm0);
762  movq_r2r(mm0, mm1);
763 
764  punpcklbw_r2r(mm2, mm0);
765  movq_r2m(mm0, *(dataptr+10));
766 
767  punpckhbw_r2r(mm2, mm1);
768  movq_r2m(mm1, *(dataptr+11));
769 
770  idata2 += rskip;
771 
772  movq_m2r(*idata2, mm0);
773  movq_r2r(mm0, mm1);
774 
775  punpcklbw_r2r(mm2, mm0);
776  movq_r2m(mm0, *(dataptr+12));
777 
778  punpckhbw_r2r(mm2, mm1);
779  movq_r2m(mm1, *(dataptr+13));
780 
781  idata2 += rskip;
782 
783  movq_m2r(*idata2, mm0);
784  movq_r2r(mm0, mm1);
785 
786  punpcklbw_r2r(mm2, mm0);
787  movq_r2m(mm0, *(dataptr+14));
788 
789  punpckhbw_r2r(mm2, mm1);
790  movq_r2m(mm1, *(dataptr+15));
791 
792 /* Start Transpose to do calculations on rows */
793 
794  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
795 
796  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
797  movq_r2r(mm7, mm5);
798 
799  punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
800  movq_r2r(mm6, mm2);
801 
802  punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
803  movq_r2r(mm7, mm1);
804 
805  movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
806  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
807 
808  movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
809  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
810 
811  movq_r2m(mm7,*(dataptr+9)); // write result 1
812  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
813 
814  movq_r2m(mm1,*(dataptr+11)); // write result 2
815  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
816 
817  movq_r2r(mm5, mm1);
818  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
819 
820  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
821  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
822 
823  movq_r2m(mm5,*(dataptr+13)); // write result 3
824 
825  // last 4x4 done
826 
827  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
828 
829  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
830  movq_r2r(mm0, mm6);
831 
832  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
833  movq_r2r(mm2, mm7);
834 
835  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
836  movq_r2r(mm0, mm4);
837 
838  //
839  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
840  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
841 
842  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
843  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
844 
845  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
846  movq_r2r(mm1, mm2); // copy first line
847 
848  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
849  movq_r2r(mm6, mm5); // copy first intermediate result
850 
851  movq_r2m(mm0, *(dataptr+8)); // write result 1
852  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
853 
854  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
855  movq_r2r(mm3, mm0); // copy third line
856 
857  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
858 
859  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
860  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
861 
862  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
863  movq_r2r(mm1, mm4);
864 
865  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
866  punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
867 
868  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
869  movq_r2r(mm2, mm6);
870 
871  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
872  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
873 
874  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
875  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
876 
877  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
878  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
879 
880  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
881 
882  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
883 
884  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
885 
886 
887 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
888 
889  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
890  movq_r2r(mm0, mm2);
891 
892  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
893  movq_r2r(mm7, mm4);
894 
895  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
896  movq_r2r(mm0, mm1);
897 
898  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
899  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
900 
901  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
902  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
903 
904  movq_r2r(mm0, mm7); // write result 1
905  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
906 
907  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
908  movq_r2r(mm1, mm6); // write result 2
909 
910  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
911  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
912 
913  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
914  movq_r2r(mm2, mm3); // copy first intermediate result
915 
916  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
917  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
918 
919  movq_r2m(mm7, tmp7);
920  movq_r2r(mm2, mm5); // write result 3
921 
922  movq_r2m(mm6, tmp6);
923  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
924 
925  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
926  movq_r2r(mm3, mm4); // write result 4
927 
928 /************************************************************************************************
929  End of Transpose
930 ************************************************************************************************/
931 
932 
933  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
934  movq_r2r(mm0, mm7);
935 
936  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
937  movq_r2r(mm1, mm6);
938 
939  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
940  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
941 
942  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
943  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
944 
945  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
946  paddw_r2r(mm7, mm6); // tmp12 + tmp13
947 
948  /* stage 3 */
949 
950  movq_m2r(tmp6, mm2);
951  movq_r2r(mm0, mm3);
952 
953  psllw_i2r(2, mm6); // m8 * 2^2
954  paddw_r2r(mm1, mm0);
955 
956  pmulhw_m2r(RTjpeg_C4, mm6); // z1
957  psubw_r2r(mm1, mm3);
958 
959  movq_r2m(mm0, *dataptr);
960  movq_r2r(mm7, mm0);
961 
962  /* Odd part */
963  movq_r2m(mm3, *(dataptr+8));
964  paddw_r2r(mm5, mm4); // tmp10
965 
966  movq_m2r(tmp7, mm3);
967  paddw_r2r(mm6, mm0); // tmp32
968 
969  paddw_r2r(mm2, mm5); // tmp11
970  psubw_r2r(mm6, mm7); // tmp33
971 
972  movq_r2m(mm0, *(dataptr+4));
973  paddw_r2r(mm3, mm2); // tmp12
974 
975  /* stage 4 */
976 
977  movq_r2m(mm7, *(dataptr+12));
978  movq_r2r(mm4, mm1); // copy of tmp10
979 
980  psubw_r2r(mm2, mm1); // tmp10 - tmp12
981  psllw_i2r(2, mm4); // m8 * 2^2
982 
983  movq_m2r(RTjpeg_C2mC6, mm0);
984  psllw_i2r(2, mm1);
985 
986  pmulhw_m2r(RTjpeg_C6, mm1); // z5
987  psllw_i2r(2, mm2);
988 
989  pmulhw_r2r(mm0, mm4); // z5
990 
991  /* stage 5 */
992 
993  pmulhw_m2r(RTjpeg_C2pC6, mm2);
994  psllw_i2r(2, mm5);
995 
996  pmulhw_m2r(RTjpeg_C4, mm5); // z3
997  movq_r2r(mm3, mm0); // copy tmp7
998 
999  movq_m2r(*(dataptr+1), mm7);
1000  paddw_r2r(mm1, mm4); // z2
1001 
1002  paddw_r2r(mm1, mm2); // z4
1003 
1004  paddw_r2r(mm5, mm0); // z11
1005  psubw_r2r(mm5, mm3); // z13
1006 
1007  /* stage 6 */
1008 
1009  movq_r2r(mm3, mm5); // copy z13
1010  psubw_r2r(mm4, mm3); // y3=z13 - z2
1011 
1012  paddw_r2r(mm4, mm5); // y5=z13 + z2
1013  movq_r2r(mm0, mm6); // copy z11
1014 
1015  movq_r2m(mm3, *(dataptr+6)); //save y3
1016  psubw_r2r(mm2, mm0); // y7=z11 - z4
1017 
1018  movq_r2m(mm5, *(dataptr+10)); //save y5
1019  paddw_r2r(mm2, mm6); // y1=z11 + z4
1020 
1021  movq_r2m(mm0, *(dataptr+14)); //save y7
1022 
1023  /************************************************
1024  * End of 1st 4 rows
1025  ************************************************/
1026 
1027  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1028  movq_r2r(mm7, mm0); // copy x0
1029 
1030  movq_r2m(mm6, *(dataptr+2)); //save y1
1031 
1032  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1033  movq_r2r(mm1, mm6); // copy x1
1034 
1035  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1036 
1037  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1038  movq_r2r(mm2, mm5); // copy x2
1039 
1040  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1041  movq_r2r(mm3, mm4); // copy x3
1042 
1043  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1044 
1045  movq_r2m(mm7, tmp7); // save tmp07
1046  movq_r2r(mm0, mm7); // copy tmp00
1047 
1048  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1049 
1050  /* stage 2, Even Part */
1051 
1052  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1053 
1054  movq_r2m(mm6, tmp6); // save tmp07
1055  movq_r2r(mm1, mm6); // copy tmp01
1056 
1057  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1058  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1059 
1060  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1061 
1062  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1063  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1064 
1065  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1066 
1067  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1068  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1069 
1070  /* stage 3, Even and stage 4 & 5 even */
1071 
1072  movq_m2r(tmp6, mm2); // load tmp6
1073  movq_r2r(mm0, mm3); // copy tmp10
1074 
1075  psllw_i2r(2, mm6); // shift z1
1076  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1077 
1078  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1079  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1080 
1081  movq_r2m(mm0, *(dataptr+1)); //save y0
1082  movq_r2r(mm7, mm0); // copy tmp13
1083 
1084  /* odd part */
1085 
1086  movq_r2m(mm3, *(dataptr+9)); //save y4
1087  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1088 
1089  movq_m2r(tmp7, mm3); // load tmp7
1090  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1091 
1092  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1093  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1094 
1095  movq_r2m(mm0, *(dataptr+5)); //save y2
1096  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1097 
1098  /* stage 4 */
1099 
1100  movq_r2m(mm7, *(dataptr+13)); //save y6
1101  movq_r2r(mm4, mm1); // copy tmp10
1102 
1103  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1104  psllw_i2r(2, mm4); // shift tmp10
1105 
1106  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1107  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1108 
1109  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1110  psllw_i2r(2, mm5); // prepare for multiply
1111 
1112  pmulhw_r2r(mm0, mm4); // multiply by converted real
1113 
1114  /* stage 5 */
1115 
1116  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1117  psllw_i2r(2, mm2); // prepare for multiply
1118 
1119  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1120  movq_r2r(mm3, mm0); // copy tmp7
1121 
1122  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1123  paddw_r2r(mm1, mm4); // z2
1124 
1125  paddw_r2r(mm5, mm0); // z11
1126  psubw_r2r(mm5, mm3); // z13
1127 
1128  /* stage 6 */
1129 
1130  movq_r2r(mm3, mm5); // copy z13
1131  paddw_r2r(mm1, mm2); // z4
1132 
1133  movq_r2r(mm0, mm6); // copy z11
1134  psubw_r2r(mm4, mm5); // y3
1135 
1136  paddw_r2r(mm2, mm6); // y1
1137  paddw_r2r(mm4, mm3); // y5
1138 
1139  movq_r2m(mm5, *(dataptr+7)); //save y3
1140 
1141  movq_r2m(mm6, *(dataptr+3)); //save y1
1142  psubw_r2r(mm2, mm0); // y7
1143 
1144 /************************************************************************************************
1145  Start of Transpose
1146 ************************************************************************************************/
1147 
1148  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1149  movq_r2r(mm7, mm5); // copy first line
1150 
1151  punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1152  movq_r2r(mm6, mm2); // copy third line
1153 
1154  punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1155  movq_r2r(mm7, mm1); // copy first intermediate result
1156 
1157  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1158 
1159  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1160 
1161  movq_r2m(mm7, *(dataptr+9)); // write result 1
1162  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1163 
1164  movq_r2m(mm1, *(dataptr+11)); // write result 2
1165  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1166 
1167  movq_r2r(mm5, mm1); // copy first intermediate result
1168  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1169 
1170  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1171  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1172 
1173  movq_r2m(mm5, *(dataptr+13)); // write result 3
1174 
1175  /****** last 4x4 done */
1176 
1177  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1178 
1179  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1180  movq_r2r(mm0, mm6); // copy first line
1181 
1182  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1183  movq_r2r(mm2, mm7); // copy third line
1184 
1185  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1186  movq_r2r(mm0, mm4); // copy first intermediate result
1187 
1188 
1189 
1190  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1191  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1192 
1193  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1194  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1195 
1196  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1197  movq_r2r(mm1, mm2); // copy first line
1198 
1199  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1200  movq_r2r(mm6, mm5); // copy first intermediate result
1201 
1202  movq_r2m(mm0, *(dataptr+8)); // write result 1
1203  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1204 
1205  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1206  movq_r2r(mm3, mm0); // copy third line
1207 
1208  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1209 
1210  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1211  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1212 
1213  punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1214  movq_r2r(mm1, mm4); // copy second intermediate result
1215 
1216  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1217  punpckldq_r2r(mm3, mm1); //
1218 
1219  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1220  movq_r2r(mm2, mm6); // copy second intermediate result
1221 
1222  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1223  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1224 
1225  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1226  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1227 
1228  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1229  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1230 
1231  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1232 
1233  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1234 
1235  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1236 
1237 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1238 
1239  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1240  movq_r2r(mm0, mm2); // copy first line
1241 
1242  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1243  movq_r2r(mm7, mm4); // copy third line
1244 
1245  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1246  movq_r2r(mm0, mm1); // copy first intermediate result
1247 
1248  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1249  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1250 
1251  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1252  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1253 
1254  movq_r2r(mm0, mm7); // write result 1
1255  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1256 
1257  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1258  movq_r2r(mm1, mm6); // write result 2
1259 
1260  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1261  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1262 
1263  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1264  movq_r2r(mm2, mm3); // copy first intermediate result
1265 
1266  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1267  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1268 
1269  movq_r2m(mm7, tmp7); // save tmp07
1270  movq_r2r(mm2, mm5); // write result 3
1271 
1272  movq_r2m(mm6, tmp6); // save tmp06
1273 
1274  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1275 
1276  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1277  movq_r2r(mm3, mm4); // write result 4
1278 
1279 /************************************************************************************************
1280  End of Transpose 2
1281 ************************************************************************************************/
1282 
1283  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1284  movq_r2r(mm0, mm7);
1285 
1286  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1287  movq_r2r(mm1, mm6);
1288 
1289  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1290  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1291 
1292  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1293  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1294 
1295  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1296  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1297 
1298  /* stage 3 */
1299 
1300  movq_m2r(tmp6, mm2);
1301  movq_r2r(mm0, mm3);
1302 
1303  psllw_i2r(2, mm6); // m8 * 2^2
1304  paddw_r2r(mm1, mm0);
1305 
1306  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1307  psubw_r2r(mm1, mm3);
1308 
1309  movq_r2m(mm0, *dataptr);
1310  movq_r2r(mm7, mm0);
1311 
1312  /* Odd part */
1313  movq_r2m(mm3, *(dataptr+8));
1314  paddw_r2r(mm5, mm4); // tmp10
1315 
1316  movq_m2r(tmp7, mm3);
1317  paddw_r2r(mm6, mm0); // tmp32
1318 
1319  paddw_r2r(mm2, mm5); // tmp11
1320  psubw_r2r(mm6, mm7); // tmp33
1321 
1322  movq_r2m(mm0, *(dataptr+4));
1323  paddw_r2r(mm3, mm2); // tmp12
1324 
1325  /* stage 4 */
1326  movq_r2m(mm7, *(dataptr+12));
1327  movq_r2r(mm4, mm1); // copy of tmp10
1328 
1329  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1330  psllw_i2r(2, mm4); // m8 * 2^2
1331 
1332  movq_m2r(RTjpeg_C2mC6, mm0);
1333  psllw_i2r(2, mm1);
1334 
1335  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1336  psllw_i2r(2, mm2);
1337 
1338  pmulhw_r2r(mm0, mm4); // z5
1339 
1340  /* stage 5 */
1341 
1342  pmulhw_m2r(RTjpeg_C2pC6, mm2);
1343  psllw_i2r(2, mm5);
1344 
1345  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1346  movq_r2r(mm3, mm0); // copy tmp7
1347 
1348  movq_m2r(*(dataptr+1), mm7);
1349  paddw_r2r(mm1, mm4); // z2
1350 
1351  paddw_r2r(mm1, mm2); // z4
1352 
1353  paddw_r2r(mm5, mm0); // z11
1354  psubw_r2r(mm5, mm3); // z13
1355 
1356  /* stage 6 */
1357 
1358  movq_r2r(mm3, mm5); // copy z13
1359  psubw_r2r(mm4, mm3); // y3=z13 - z2
1360 
1361  paddw_r2r(mm4, mm5); // y5=z13 + z2
1362  movq_r2r(mm0, mm6); // copy z11
1363 
1364  movq_r2m(mm3, *(dataptr+6)); //save y3
1365  psubw_r2r(mm2, mm0); // y7=z11 - z4
1366 
1367  movq_r2m(mm5, *(dataptr+10)); //save y5
1368  paddw_r2r(mm2, mm6); // y1=z11 + z4
1369 
1370  movq_r2m(mm0, *(dataptr+14)); //save y7
1371 
1372  /************************************************
1373  * End of 1st 4 rows
1374  ************************************************/
1375 
1376  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1377  movq_r2r(mm7, mm0); // copy x0
1378 
1379  movq_r2m(mm6, *(dataptr+2)); //save y1
1380 
1381  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1382  movq_r2r(mm1, mm6); // copy x1
1383 
1384  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1385 
1386  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1387  movq_r2r(mm2, mm5); // copy x2
1388 
1389  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1390  movq_r2r(mm3, mm4); // copy x3
1391 
1392  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1393 
1394  movq_r2m(mm7, tmp7); // save tmp07
1395  movq_r2r(mm0, mm7); // copy tmp00
1396 
1397  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1398 
1399  /* stage 2, Even Part */
1400 
1401  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1402 
1403  movq_r2m(mm6, tmp6); // save tmp07
1404  movq_r2r(mm1, mm6); // copy tmp01
1405 
1406  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1407  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1408 
1409  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1410 
1411  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1412  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1413 
1414  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1415 
1416  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1417  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1418 
1419  /* stage 3, Even and stage 4 & 5 even */
1420 
1421  movq_m2r(tmp6, mm2); // load tmp6
1422  movq_r2r(mm0, mm3); // copy tmp10
1423 
1424  psllw_i2r(2, mm6); // shift z1
1425  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1426 
1427  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1428  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1429 
1430  movq_r2m(mm0, *(dataptr+1)); //save y0
1431  movq_r2r(mm7, mm0); // copy tmp13
1432 
1433  /* odd part */
1434 
1435  movq_r2m(mm3, *(dataptr+9)); //save y4
1436  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1437 
1438  movq_m2r(tmp7, mm3); // load tmp7
1439  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1440 
1441  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1442  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1443 
1444  movq_r2m(mm0, *(dataptr+5)); //save y2
1445  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1446 
1447  /* stage 4 */
1448 
1449  movq_r2m(mm7, *(dataptr+13)); //save y6
1450  movq_r2r(mm4, mm1); // copy tmp10
1451 
1452  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1453  psllw_i2r(2, mm4); // shift tmp10
1454 
1455  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1456  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1457 
1458  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1459  psllw_i2r(2, mm5); // prepare for multiply
1460 
1461  pmulhw_r2r(mm0, mm4); // multiply by converted real
1462 
1463  /* stage 5 */
1464 
1465  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1466  psllw_i2r(2, mm2); // prepare for multiply
1467 
1468  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1469  movq_r2r(mm3, mm0); // copy tmp7
1470 
1471  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1472  paddw_r2r(mm1, mm4); // z2
1473 
1474  paddw_r2r(mm5, mm0); // z11
1475  psubw_r2r(mm5, mm3); // z13
1476 
1477  /* stage 6 */
1478 
1479  movq_r2r(mm3, mm5); // copy z13
1480  paddw_r2r(mm1, mm2); // z4
1481 
1482  movq_r2r(mm0, mm6); // copy z11
1483  psubw_r2r(mm4, mm5); // y3
1484 
1485  paddw_r2r(mm2, mm6); // y1
1486  paddw_r2r(mm4, mm3); // y5
1487 
1488  movq_r2m(mm5, *(dataptr+7)); //save y3
1489  psubw_r2r(mm2, mm0); // y7=z11 - z4
1490 
1491  movq_r2m(mm3, *(dataptr+11)); //save y5
1492 
1493  movq_r2m(mm6, *(dataptr+3)); //save y1
1494 
1495  movq_r2m(mm0, *(dataptr+15)); //save y7
1496 
1497 
1498 #endif
1499 }
1500 
#ifndef MMX
/* Fixed-point multiplier constants for the scalar AAN IDCT, scaled by 2^8
   (see MULTIPLY below). */
static constexpr int32_t FIX_1_082392200 { 277 }; /* FIX(1.082392200) */
static constexpr int32_t FIX_1_414213562 { 362 }; /* FIX(1.414213562) */
static constexpr int32_t FIX_1_847759065 { 473 }; /* FIX(1.847759065) */
static constexpr int32_t FIX_2_613125930 { 669 }; /* FIX(2.613125930) */

/// Descale an IDCT intermediate by a factor of 8 (2^3), rounding to
/// nearest by adding half the divisor before the shift.
static constexpr int16_t DESCALE(int32_t x) { return static_cast<int16_t>((x+4) >> 3); }

/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */

/// Range-limit a reconstructed sample to the nominal YUV range [16,235].
static constexpr int16_t RL(int32_t x) { return static_cast<int16_t>(std::clamp(x, 16, 235)); }

/// Fixed-point multiply: (var * constant) / 2^8, rounded to nearest
/// (the FIX_* constants above are pre-scaled by 2^8).
static constexpr int32_t MULTIPLY(int32_t var, int32_t constant)
    { return ((var * constant) + 128) >> 8; }
#endif
1515 
1517 {
1518  for(int i = 0; i < 64; i++)
1519  {
1520  m_liqt[i] = ((uint64_t)m_liqt[i] * RTjpeg_aan_tab[i]) >> 32;
1521  m_ciqt[i] = ((uint64_t)m_ciqt[i] * RTjpeg_aan_tab[i]) >> 32;
1522  }
1523 }
1524 
1525 void RTjpeg::Idct(uint8_t *odata, RTjpegData16 &data, int rskip)
1526 {
1527 #ifdef MMX
1528 
1529 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1530 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1531 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1532 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1533 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
1534 
1535  auto *wsptr = (mmx_t *)m_ws.data();
1536  auto *dataptr = (mmx_t *)odata;
1537  auto *idata = (mmx_t *)data.data();
1538 
1539  rskip = rskip>>3;
1540 /*
1541  * Perform inverse DCT on one block of coefficients.
1542  */
1543 
1544  /* Odd part */
1545 
1546  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1547 
1548  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1549 
1550  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1551 
1552  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1553 
1554  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1555 
1556  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1557 
1558  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1559 
1560  psllw_i2r(2, mm2); // shift z10
1561  movq_r2r(mm2, mm0); // copy z10
1562 
1563  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1564  movq_r2r(mm3, mm5); // copy tmp4
1565 
1566  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1567  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1568 
1569  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1570  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1571 
1572  psubw_r2r(mm1, mm6); // z11-z13
1573  psllw_i2r(2, mm5); // shift z12
1574 
1575  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1576  movq_r2r(mm5, mm7); // copy z12
1577 
1578  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1579  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1580 
1581  //ok
1582 
1583  /* Even part */
1584  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1585  psllw_i2r(2, mm6);
1586 
1587  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1588 
1589  paddw_r2r(mm5, mm0); // tmp10
1590 
1591  paddw_r2r(mm7, mm2); // tmp12
1592 
1593  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1594  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1595 
1596  movq_r2r(mm1, mm5); // copy tmp1
1597  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1598 
1599  psubw_r2r(mm4, mm5); // tmp1-tmp3
1600  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1601 
1602  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1603  psllw_i2r(2, mm5); // shift tmp1-tmp3
1604 
1605  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1606 
1607  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1608  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1609 
1610  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1611 
1612  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1613 
1614  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1615  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1616 
1617  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1618  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1619 
1620  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1621  movq_r2r(mm1, mm5); // copy tmp11
1622 
1623  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1624  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1625 
1626  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1627 
1628  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1629  movq_r2r(mm7, mm0); // copy tmp0
1630 
1631  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1632  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1633 
1634  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1635 
1636  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1637  movq_r2r(mm1, mm3); // copy tmp1
1638 
1639  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1640  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1641 
1642  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1643 
1644  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1645  movq_r2r(mm4, mm1); // copy tmp3
1646 
1647  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1648 
1649  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1650 
1651  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1652 
1653  movq_r2m(mm4, *(wsptr+8));
1654  movq_r2r(mm5, mm7); // copy tmp2
1655 
1656  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1657 
1658  movq_r2m(mm1, *(wsptr+6));
1659  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1660 
1661  movq_r2m(mm5, *(wsptr+4));
1662 
1663  movq_r2m(mm7, *(wsptr+10));
1664 
1665  //ok
1666 
1667 
1668 /*****************************************************************/
1669 
1670  idata++;
1671  wsptr++;
1672 
1673 /*****************************************************************/
1674 
1675  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1676 
1677  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1678 
1679  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1680  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1681 
1682  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1683  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1684 
1685  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1686 
1687  psllw_i2r(2, mm2); // shift z10
1688  movq_r2r(mm2, mm0); // copy z10
1689 
1690  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1691  movq_r2r(mm3, mm5); // copy tmp4
1692 
1693  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1694  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1695 
1696  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1697  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1698 
1699  psubw_r2r(mm1, mm6); // z11-z13
1700  psllw_i2r(2, mm5); // shift z12
1701 
1702  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1703  movq_r2r(mm5, mm7); // copy z12
1704 
1705  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1706  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1707 
1708  //ok
1709 
1710  /* Even part */
1711  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1712  psllw_i2r(2, mm6);
1713 
1714  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1715 
1716  paddw_r2r(mm5, mm0); // tmp10
1717 
1718  paddw_r2r(mm7, mm2); // tmp12
1719 
1720  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1721  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1722 
1723  movq_r2r(mm1, mm5); // copy tmp1
1724  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1725 
1726  psubw_r2r(mm4, mm5); // tmp1-tmp3
1727  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1728 
1729  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1730  psllw_i2r(2, mm5); // shift tmp1-tmp3
1731 
1732  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1733  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1734 
1735  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1736 
1737  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1738 
1739  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1740 
1741  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1742  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1743 
1744  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1745  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1746 
1747  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1748  movq_r2r(mm1, mm5); // copy tmp11
1749 
1750  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1751  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1752 
1753  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1754 
1755  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1756  movq_r2r(mm7, mm0); // copy tmp0
1757 
1758  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1759  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1760 
1761  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1762 
1763  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1764  movq_r2r(mm1, mm3); // copy tmp1
1765 
1766  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1767  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1768 
1769  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1770 
1771  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1772  movq_r2r(mm4, mm1); // copy tmp3
1773 
1774  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1775 
1776  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1777 
1778  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1779 
1780  movq_r2m(mm4, *(wsptr+8));
1781  movq_r2r(mm5, mm7); // copy tmp2
1782 
1783  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1784 
1785  movq_r2m(mm1, *(wsptr+6));
1786  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1787 
1788  movq_r2m(mm5, *(wsptr+4));
1789 
1790  movq_r2m(mm7, *(wsptr+10));
1791 
1792 /*****************************************************************/
1793 
1794  /* Pass 2: process rows from work array, store into output array. */
1795  /* Note that we must descale the results by a factor of 8 == 2**3, */
1796  /* and also undo the PASS1_BITS scaling. */
1797 
1798 /*****************************************************************/
1799  /* Even part */
1800 
1801  wsptr--;
1802 
1803 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1804 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1805 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1806 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1807  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1808 
1809  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1810  movq_r2r(mm0, mm2);
1811 
1812  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1813  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1814 
1815  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1816  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1817 
1818  movq_r2r(mm0, mm6);
1819  movq_r2r(mm3, mm5);
1820 
1821  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1822  movq_r2r(mm2, mm1);
1823 
1824  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1825  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1826 
1827  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1828  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1829 
1830  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1831  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1832 
1833  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1834  movq_r2r(mm3, mm4);
1835 
1836  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1837  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1838 
1839  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1840  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1841 
1842 
1843  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1844  movq_r2r(mm6, mm2);
1845 
1846  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1847  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1848 
1849  movq_r2r(mm3, mm5);
1850  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1851 
1852  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1853  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1854 
1855  movq_r2r(mm4, mm7);
1856  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1857 
1858  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1859 
1860  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1861 
1862  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1863  movq_r2r(mm1, mm6);
1864 
1865  //ok
1866 
1867 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1868 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1869 
1870 
1871  movq_r2r(mm0, mm2);
1872  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1873 
1874  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1875  psllw_i2r(2, mm6);
1876 
1877  pmulhw_m2r(s_fix141, mm6);
1878  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1879 
1880  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1881  movq_r2r(mm0, mm7);
1882 
1883 // tmp0 = tmp10 + tmp13;
1884 // tmp3 = tmp10 - tmp13;
1885  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1886  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1887 
1888 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1889  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1890 // tmp1 = tmp11 + tmp12;
1891 // tmp2 = tmp11 - tmp12;
1892  movq_r2r(mm1, mm5);
1893 
1894  //OK
1895 
1896  /* Odd part */
1897 
1898 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1899 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1900 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1901 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1902  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1903  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1904 
1905  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1906  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1907 
1908  movq_r2r(mm3, mm6);
1909  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1910 
1911  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1912  movq_r2r(mm3, mm2);
1913 
1914 //Save tmp0 and tmp1 in wsptr
1915  movq_r2m(mm0, *(wsptr)); // save tmp0
1916  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1917 
1918 
1919 //Continue with z10 --- z13
1920  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1921  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1922 
1923  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1924  movq_r2r(mm6, mm4);
1925 
1926  movq_r2m(mm1, *(wsptr+1)); // save tmp1
1927  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1928 
1929  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1930  movq_r2r(mm6, mm1);
1931 
1932 //Save tmp2 and tmp3 in wsptr
1933  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1934  movq_r2r(mm2, mm4);
1935 
1936 //Continue with z10 --- z13
1937  movq_r2m(mm5, *(wsptr+2)); // save tmp2
1938  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1939 
1940  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1941  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1942 
1943  movq_r2r(mm3, mm0);
1944  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1945 
1946  movq_r2m(mm7, *(wsptr+3)); // save tmp3
1947  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1948 
1949  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1950  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1951 
1952  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1953  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1954 
1955  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
1956  movq_r2r(mm6, mm4);
1957 
1958  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
1959  movq_r2r(mm1, mm5);
1960 
1961  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
1962  movq_r2r(mm6, mm2);
1963 
1964  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
1965  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
1966 
1967  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
1968  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
1969 
1970  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
1971  movq_r2r(mm1, mm7);
1972 
1973  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
1974  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
1975 
1976  movq_r2r(mm6, mm5);
1977  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
1978 
1979  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
1980  movq_r2r(mm2, mm4);
1981 
1982  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
1983 
1984  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
1985 
1986  punpckhdq_r2r(mm6, mm4);
1987 
1988  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1989  movq_r2r(mm0, mm5);
1990 
1991  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1992 
1993  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1994  movq_r2r(mm3, mm4);
1995 
1996  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1997  movq_r2r(mm5, mm1);
1998 
1999  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2000 // tmp7 = z11 + z13; /* phase 5 */
2001 // tmp8 = z11 - z13; /* phase 5 */
2002  psubw_r2r(mm4, mm1); // tmp8
2003 
2004  paddw_r2r(mm4, mm5); // tmp7
2005 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2006  psllw_i2r(2, mm1);
2007 
2008  psllw_i2r(2, mm0);
2009 
2010  pmulhw_m2r(s_fix141, mm1); // tmp21
2011 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2012 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2013  psllw_i2r(2, mm3);
2014  movq_r2r(mm0, mm7);
2015 
2016  pmulhw_m2r(s_fixN184, mm7);
2017  movq_r2r(mm3, mm6);
2018 
2019  movq_m2r(*(wsptr), mm2); // tmp0,final1
2020 
2021  pmulhw_m2r(s_fix108n184, mm6);
2022 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2023 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2024  movq_r2r(mm2, mm4); // final1
2025 
2026  pmulhw_m2r(s_fix184n261, mm0);
2027  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2028 
2029  pmulhw_m2r(s_fix184, mm3);
2030  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2031 
2032 // tmp6 = tmp22 - tmp7; /* phase 2 */
2033  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2034 
2035  paddw_r2r(mm6, mm7); // tmp20
2036  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2037 
2038  paddw_r2r(mm0, mm3); // tmp22
2039 
2040 // tmp5 = tmp21 - tmp6;
2041  psubw_r2r(mm5, mm3); // tmp6
2042 
2043 // tmp4 = tmp20 + tmp5;
2044  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2045  psubw_r2r(mm3, mm1); // tmp5
2046 
2047  movq_r2r(mm0, mm6); // final2
2048  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2049 
2050  /* Final output stage: scale down by a factor of 8 and range-limit */
2051 
2052 
2053 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2054 // & RANGE_MASK];
2055 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2056 // & RANGE_MASK]; final1
2057 
2058 
2059 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2060 // & RANGE_MASK];
2061 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2062 // & RANGE_MASK]; final2
2063  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2064  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2065 
2066  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2067 
2068  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2069 
2070  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2071  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2072 
2073 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2074 // & RANGE_MASK];
2075 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2076 // & RANGE_MASK]; final3
2077  paddw_r2r(mm1, mm7); // tmp4
2078  movq_r2r(mm5, mm3);
2079 
2080  paddw_r2r(mm1, mm5); // tmp2+tmp5
2081  psubw_r2r(mm1, mm3); // tmp2-tmp5
2082 
2083  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2084 
2085  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2086  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2087 
2088 
2089 
2090 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2091 // & RANGE_MASK];
2092 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2093 // & RANGE_MASK]; final4
2094  movq_r2r(mm4, mm6);
2095  paddw_r2r(mm7, mm4); // tmp3+tmp4
2096 
2097  psubw_r2r(mm7, mm6); // tmp3-tmp4
2098  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2099 
2100  // mov ecx, [dataptr]
2101 
2102  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2103 
2104  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2105 
2106  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2107  movq_r2r(mm2, mm4);
2108 
2109  movq_r2r(mm5, mm7);
2110  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2111 
2112  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2113  movq_r2r(mm2, mm1);
2114 
2115  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2116 
2117  // add dataptr, 4
2118 
2119  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2120 
2121  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2122 
2123  // add ecx, output_col
2124 
2125  movq_r2r(mm7, mm6);
2126  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2127 
2128  movq_r2r(mm2, mm0);
2129  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2130 
2131  // mov idata, [dataptr]
2132 
2133  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2134 
2135  // add dataptr, 4
2136 
2137  movq_r2r(mm1, mm3);
2138 
2139  // add idata, output_col
2140 
2141  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2142 
2143  movq_r2m(mm2, *(dataptr));
2144 
2145  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2146 
2147  dataptr += rskip;
2148  movq_r2m(mm0, *(dataptr));
2149 
2150  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2151  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2152 
2153  dataptr += rskip;
2154  movq_r2m(mm1, *(dataptr));
2155 
2156  dataptr += rskip;
2157  movq_r2m(mm3, *(dataptr));
2158 
2159 /*******************************************************************/
2160 
2161  wsptr += 8;
2162 
2163 /*******************************************************************/
2164 
2165 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2166 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2167 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2168 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2169  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2170 
2171  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2172  movq_r2r(mm0, mm2);
2173 
2174  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2175  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2176 
2177  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2178  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2179 
2180  movq_r2r(mm0, mm6);
2181  movq_r2r(mm3, mm5);
2182 
2183  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2184  movq_r2r(mm2, mm1);
2185 
2186  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2187  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2188 
2189  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2190  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2191 
2192  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2193  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2194 
2195  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2196  movq_r2r(mm3, mm4);
2197 
2198  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2199  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2200 
2201  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2202  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2203 
2204  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2205  movq_r2r(mm6, mm2);
2206 
2207  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2208  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2209 
2210  movq_r2r(mm3, mm5);
2211  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2212 
2213  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2214  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2215 
2216  movq_r2r(mm4, mm7);
2217  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2218 
2219  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2220 
2221  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2222 
2223  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2224  movq_r2r(mm1, mm6);
2225 
2226  //OK
2227 
2228 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2229 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2230 
2231  movq_r2r(mm0, mm2);
2232  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2233 
2234  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2235  psllw_i2r(2, mm6);
2236 
2237  pmulhw_m2r(s_fix141, mm6);
2238  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2239 
2240  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2241  movq_r2r(mm0, mm7);
2242 
2243 // tmp0 = tmp10 + tmp13;
2244 // tmp3 = tmp10 - tmp13;
2245  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2246  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2247 
2248 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2249  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2250 // tmp1 = tmp11 + tmp12;
2251 // tmp2 = tmp11 - tmp12;
2252  movq_r2r(mm1, mm5);
2253 
2254  //OK
2255 
2256 
2257  /* Odd part */
2258 
2259 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2260 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2261 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2262 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2263  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2264  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2265 
2266  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2267  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2268 
2269  movq_r2r(mm3, mm6);
2270  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2271 
2272  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2273  movq_r2r(mm3, mm2);
2274 
2275 //Save tmp0 and tmp1 in wsptr
2276  movq_r2m(mm0, *(wsptr)); // save tmp0
2277  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2278 
2279 
2280 //Continue with z10 --- z13
2281  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2282  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2283 
2284  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2285  movq_r2r(mm6, mm4);
2286 
2287  movq_r2m(mm1, *(wsptr+1)); // save tmp1
2288  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2289 
2290  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2291  movq_r2r(mm6, mm1);
2292 
2293 //Save tmp2 and tmp3 in wsptr
2294  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2295  movq_r2r(mm2, mm4);
2296 
2297 //Continue with z10 --- z13
2298  movq_r2m(mm5, *(wsptr+2)); // save tmp2
2299  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2300 
2301  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2302  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2303 
2304  movq_r2r(mm3, mm0);
2305  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2306 
2307  movq_r2m(mm7, *(wsptr+3)); // save tmp3
2308  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2309 
2310  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2311  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2312 
2313  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2314  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2315 
2316  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2317  movq_r2r(mm6, mm4);
2318 
2319  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2320  movq_r2r(mm1, mm5);
2321 
2322  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2323  movq_r2r(mm6, mm2);
2324 
2325  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2326  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2327 
2328  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2329  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2330 
2331  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2332  movq_r2r(mm1, mm7);
2333 
2334  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2335  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2336 
2337  movq_r2r(mm6, mm5);
2338  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2339 
2340  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2341  movq_r2r(mm2, mm4);
2342 
2343  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2344 
2345  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2346 
2347  punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2348 
2349  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2350  movq_r2r(mm0, mm5);
2351 
2352  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2353 
2354  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2355  movq_r2r(mm3, mm4);
2356 
2357  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2358  movq_r2r(mm5, mm1);
2359 
2360  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2361 // tmp7 = z11 + z13; /* phase 5 */
2362 // tmp8 = z11 - z13; /* phase 5 */
2363  psubw_r2r(mm4, mm1); // tmp8
2364 
2365  paddw_r2r(mm4, mm5); // tmp7
2366 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2367  psllw_i2r(2, mm1);
2368 
2369  psllw_i2r(2, mm0);
2370 
2371  pmulhw_m2r(s_fix141, mm1); // tmp21
2372 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2373 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2374  psllw_i2r(2, mm3);
2375  movq_r2r(mm0, mm7);
2376 
2377  pmulhw_m2r(s_fixN184, mm7);
2378  movq_r2r(mm3, mm6);
2379 
2380  movq_m2r(*(wsptr), mm2); // tmp0,final1
2381 
2382  pmulhw_m2r(s_fix108n184, mm6);
2383 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2384 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2385  movq_r2r(mm2, mm4); // final1
2386 
2387  pmulhw_m2r(s_fix184n261, mm0);
2388  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2389 
2390  pmulhw_m2r(s_fix184, mm3);
2391  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2392 
2393 // tmp6 = tmp22 - tmp7; /* phase 2 */
2394  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2395 
2396  paddw_r2r(mm6, mm7); // tmp20
2397  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2398 
2399  paddw_r2r(mm0, mm3); // tmp22
2400 
2401 // tmp5 = tmp21 - tmp6;
2402  psubw_r2r(mm5, mm3); // tmp6
2403 
2404 // tmp4 = tmp20 + tmp5;
2405  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2406  psubw_r2r(mm3, mm1); // tmp5
2407 
2408  movq_r2r(mm0, mm6); // final2
2409  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2410 
2411  /* Final output stage: scale down by a factor of 8 and range-limit */
2412 
2413 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2414 // & RANGE_MASK];
2415 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2416 // & RANGE_MASK]; final1
2417 
2418 
2419 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2420 // & RANGE_MASK];
2421 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2422 // & RANGE_MASK]; final2
2423  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2424  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2425 
2426  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2427 
2428  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2429 
2430  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2431  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2432 
2433 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2434 // & RANGE_MASK];
2435 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2436 // & RANGE_MASK]; final3
2437  paddw_r2r(mm1, mm7); // tmp4
2438  movq_r2r(mm5, mm3);
2439 
2440  paddw_r2r(mm1, mm5); // tmp2+tmp5
2441  psubw_r2r(mm1, mm3); // tmp2-tmp5
2442 
2443  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2444 
2445  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2446  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2447 
2448 
2449 
2450 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2451 // & RANGE_MASK];
2452 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2453 // & RANGE_MASK]; final4
2454  movq_r2r(mm4, mm6);
2455  paddw_r2r(mm7, mm4); // tmp3+tmp4
2456 
2457  psubw_r2r(mm7, mm6); // tmp3-tmp4
2458  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2459 
2460  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2461 
2462  /*
2463  movq_r2m(mm4, *dummy);
2464  fprintf(stderr, "3-4 %016llx\n", dummy);
2465  movq_r2m(mm4, *dummy);
2466  fprintf(stderr, "3+4 %016llx\n", dummy);
2467  */
2468 
2469 
2470  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2471 
2472  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2473  movq_r2r(mm2, mm4);
2474 
2475  movq_r2r(mm5, mm7);
2476  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2477 
2478  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2479  movq_r2r(mm2, mm1);
2480 
2481  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2482 
2483  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2484 
2485  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2486 
2487  movq_r2r(mm7, mm6);
2488  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2489 
2490  movq_r2r(mm2, mm0);
2491  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2492 
2493  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2494 
2495  movq_r2r(mm1, mm3);
2496 
2497  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2498 
2499  dataptr += rskip;
2500  movq_r2m(mm2, *(dataptr));
2501 
2502  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2503 
2504  dataptr += rskip;
2505  movq_r2m(mm0, *(dataptr));
2506 
2507  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2508 
2509  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2510 
2511  dataptr += rskip;
2512  movq_r2m(mm1, *(dataptr));
2513 
2514  dataptr += rskip;
2515  movq_r2m(mm3, *(dataptr));
2516 
2517 #else
2518  int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2519  int32_t tmp10, tmp11, tmp12, tmp13;
2520  int32_t z5, z10, z11, z12, z13;
2521  int16_t *inptr;
2522  int32_t *wsptr;
2523  uint8_t *outptr;
2524  int ctr;
2525  int32_t dcval;
2526 
2527  inptr = data.data();
2528  wsptr = m_ws.data();
2529  for (ctr = 8; ctr > 0; ctr--) {
2530 
2531  if ((inptr[8] | inptr[16] | inptr[24] |
2532  inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2533  dcval = inptr[0];
2534  wsptr[0] = dcval;
2535  wsptr[8] = dcval;
2536  wsptr[16] = dcval;
2537  wsptr[24] = dcval;
2538  wsptr[32] = dcval;
2539  wsptr[40] = dcval;
2540  wsptr[48] = dcval;
2541  wsptr[56] = dcval;
2542 
2543  inptr++;
2544  wsptr++;
2545  continue;
2546  }
2547 
2548  tmp0 = inptr[0];
2549  tmp1 = inptr[16];
2550  tmp2 = inptr[32];
2551  tmp3 = inptr[48];
2552 
2553  tmp10 = tmp0 + tmp2;
2554  tmp11 = tmp0 - tmp2;
2555 
2556  tmp13 = tmp1 + tmp3;
2557  tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2558 
2559  tmp0 = tmp10 + tmp13;
2560  tmp3 = tmp10 - tmp13;
2561  tmp1 = tmp11 + tmp12;
2562  tmp2 = tmp11 - tmp12;
2563 
2564  tmp4 = inptr[8];
2565  tmp5 = inptr[24];
2566  tmp6 = inptr[40];
2567  tmp7 = inptr[56];
2568 
2569  z13 = tmp6 + tmp5;
2570  z10 = tmp6 - tmp5;
2571  z11 = tmp4 + tmp7;
2572  z12 = tmp4 - tmp7;
2573 
2574  tmp7 = z11 + z13;
2575  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2576 
2577  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2578  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2579  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2580 
2581  tmp6 = tmp12 - tmp7;
2582  tmp5 = tmp11 - tmp6;
2583  tmp4 = tmp10 + tmp5;
2584 
2585  wsptr[0] = (int32_t) (tmp0 + tmp7);
2586  wsptr[56] = (int32_t) (tmp0 - tmp7);
2587  wsptr[8] = (int32_t) (tmp1 + tmp6);
2588  wsptr[48] = (int32_t) (tmp1 - tmp6);
2589  wsptr[16] = (int32_t) (tmp2 + tmp5);
2590  wsptr[40] = (int32_t) (tmp2 - tmp5);
2591  wsptr[32] = (int32_t) (tmp3 + tmp4);
2592  wsptr[24] = (int32_t) (tmp3 - tmp4);
2593 
2594  inptr++;
2595  wsptr++;
2596  }
2597 
2598  wsptr = m_ws.data();
2599  for (ctr = 0; ctr < 8; ctr++) {
2600  outptr = &(odata[ctr*rskip]);
2601 
2602  tmp10 = wsptr[0] + wsptr[4];
2603  tmp11 = wsptr[0] - wsptr[4];
2604 
2605  tmp13 = wsptr[2] + wsptr[6];
2606  tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2607 
2608  tmp0 = tmp10 + tmp13;
2609  tmp3 = tmp10 - tmp13;
2610  tmp1 = tmp11 + tmp12;
2611  tmp2 = tmp11 - tmp12;
2612 
2613  z13 = wsptr[5] + wsptr[3];
2614  z10 = wsptr[5] - wsptr[3];
2615  z11 = wsptr[1] + wsptr[7];
2616  z12 = wsptr[1] - wsptr[7];
2617 
2618  tmp7 = z11 + z13;
2619  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2620 
2621  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2622  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2623  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2624 
2625  tmp6 = tmp12 - tmp7;
2626  tmp5 = tmp11 - tmp6;
2627  tmp4 = tmp10 + tmp5;
2628 
2629  outptr[0] = RL(DESCALE(tmp0 + tmp7));
2630  outptr[7] = RL(DESCALE(tmp0 - tmp7));
2631  outptr[1] = RL(DESCALE(tmp1 + tmp6));
2632  outptr[6] = RL(DESCALE(tmp1 - tmp6));
2633  outptr[2] = RL(DESCALE(tmp2 + tmp5));
2634  outptr[5] = RL(DESCALE(tmp2 - tmp5));
2635  outptr[4] = RL(DESCALE(tmp3 + tmp4));
2636  outptr[3] = RL(DESCALE(tmp3 - tmp4));
2637 
2638  wsptr += 8;
2639  }
2640 #endif
2641 }
2642 
2643 inline void RTjpeg::CalcTbls(void)
2644 {
2645  uint64_t qual = (uint64_t)m_q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */
2646 
2647  for(int i = 0; i < 64; i++)
2648  {
2649  m_lqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2650  if (m_lqt[i] == 0)
2651  m_lqt[i]=1;
2652 
2653  m_cqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2654  if (m_cqt[i] == 0)
2655  m_cqt[i]=1;
2656 
2657  m_liqt[i] = (1<<16) / (m_lqt[i]<<3);
2658  m_ciqt[i] = (1<<16) / (m_cqt[i]<<3);
2659  m_lqt[i] = ((1<<16) / m_liqt[i])>>3;
2660  m_cqt[i] = ((1<<16) / m_ciqt[i])>>3;
2661  }
2662 
2663  m_lB8 = 0;
2664  while (m_liqt[RTjpeg_ZZ[++m_lB8]] <= 8)
2665  ;
2666  m_lB8--;
2667  m_cB8 = 0;
2668 
2669  while (m_ciqt[RTjpeg_ZZ[++m_cB8]] <= 8)
2670  ;
2671  m_cB8--;
2672 }
2673 
2674 int RTjpeg::SetQuality(int *quality)
2675 {
2676  *quality = std::clamp(*quality, 1, 255);
2677 
2678  m_q = *quality;
2679 
2680  CalcTbls();
2681  DctInit();
2682  IdctInit();
2683  QuantInit();
2684 
2685  return 0;
2686 }
2687 
/// @brief Select the pixel format used by Compress()/Decompress().
/// @param fmt Expected to be one of the RTJ_* enumerators (RTJ_YUV420,
///        RTJ_YUV422, RTJ_RGB8) - the value is stored unvalidated.
/// @return Always 0.
int RTjpeg::SetFormat(const int *fmt)
{
    m_f = *fmt;
    return 0;
}
2693 
2694 int RTjpeg::SetSize(const int *w, const int *h)
2695 {
2696  if ((*w < 0) || (*w > 65535))
2697  return -1;
2698  if ((*h < 0) || (*h > 65535))
2699  return -1;
2700 
2701  m_width = *w;
2702  m_height = *h;
2703  m_yWidth = m_width>>3;
2704  m_ySize = m_width * m_height;
2705  m_cWidth = m_width>>4;
2706  m_cSize = (m_width>>1) * m_height;
2707 
2708  if (m_keyRate > 0)
2709  {
2710  delete [] m_old;
2711  m_old = new (std::align_val_t(32)) int16_t[4*m_width*m_height];
2712  if (!m_old)
2713  {
2714  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2715  return -1;
2716  }
2717  memset(m_old, 0, (4_UZ * m_width * m_height));
2718  }
2719  return 0;
2720 }
2721 
2722 int RTjpeg::SetIntra(int *key, int *lm, int *cm)
2723 {
2724  *key = std::clamp(*key, 0, 255);
2725  m_keyRate = *key;
2726 
2727  *lm = std::clamp(*lm, 0, 16);
2728  *cm = std::clamp(*cm, 0, 16);
2729 
2730 #ifdef MMX
2731  m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2732  m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
2733 #else
2734  m_lMask = *lm;
2735  m_cMask = *cm;
2736 #endif
2737 
2738  delete [] m_old;
2739  m_old = new (std::align_val_t(32)) int16_t[4*m_width*m_height];
2740  if (!m_old)
2741  {
2742  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2743  return -1;
2744  }
2745  memset(m_old, 0, (4_UZ * m_width * m_height));
2746 
2747  return 0;
2748 }
2749 
2751 {
2752 #ifdef MMX
2753  RTjpeg_ones.q = 0x0001000100010001LL;
2754  RTjpeg_half.q = 0x7fff7fff7fff7fffLL;
2755  RTjpeg_C4.q = 0x2D412D412D412D41LL;
2756  RTjpeg_C6.q = 0x187E187E187E187ELL;
2757  RTjpeg_C2mC6.q= 0x22A322A322A322A3LL;
2758  RTjpeg_C2pC6.q= 0x539F539F539F539FLL;
2759  RTjpeg_zero.q = 0x0000000000000000LL;
2760 #endif
2761 }
2762 
2764 {
2765  delete [] m_old;
2766 }
2767 
2768 inline int RTjpeg::compressYUV420(int8_t *sp, uint8_t **planes)
2769 {
2770  uint8_t * bp = planes[0];
2771  uint8_t * bp1 = bp + (m_width<<3);
2772  uint8_t * bp2 = planes[1];
2773  uint8_t * bp3 = planes[2];
2774 
2775 #ifdef MMX
2776  emms();
2777 #endif
2778  int8_t * sb = sp;
2779 /* Y */
2780  for(int i = m_height >> 1; i; i -= 8)
2781  {
2782  for(int j = 0, k = 0; j < m_width; j += 16, k += 8)
2783  {
2784  DctY(bp+j, m_yWidth);
2785  Quant(m_block, m_lqt);
2786  sp += b2s(m_block, sp, m_lB8);
2787 
2788  DctY(bp+j+8, m_yWidth);
2789  Quant(m_block, m_lqt);
2790  sp += b2s(m_block, sp, m_lB8);
2791 
2792  DctY(bp1+j, m_yWidth);
2793  Quant(m_block, m_lqt);
2794  sp += b2s(m_block, sp, m_lB8);
2795 
2796  DctY(bp1+j+8, m_yWidth);
2797  Quant(m_block, m_lqt);
2798  sp += b2s(m_block, sp, m_lB8);
2799 
2800  DctY(bp2+k, m_cWidth);
2801  Quant(m_block, m_cqt);
2802  sp += b2s(m_block, sp, m_cB8);
2803 
2804  DctY(bp3+k, m_cWidth);
2805  Quant(m_block, m_cqt);
2806  sp += b2s(m_block, sp, m_cB8);
2807  }
2808  bp += m_width<<4;
2809  bp1 += m_width<<4;
2810  bp2 += m_width<<2;
2811  bp3 += m_width<<2;
2812  }
2813 #ifdef MMX
2814  emms();
2815 #endif
2816  return (sp - sb);
2817 }
2818 
2819 inline int RTjpeg::compressYUV422(int8_t *sp, uint8_t **planes)
2820 {
2821  uint8_t * bp = planes[0];
2822  uint8_t * bp2 = planes[1];
2823  uint8_t * bp3 = planes[2];
2824 
2825 #ifdef MMX
2826  emms();
2827 #endif
2828  int8_t * sb=sp;
2829 /* Y */
2830  for(int i=m_height; i; i-=8)
2831  {
2832  for(int j=0, k=0; j<m_width; j+=16, k+=8)
2833  {
2834  DctY(bp+j, m_yWidth);
2835  Quant(m_block, m_lqt);
2836  sp += b2s(m_block, sp, m_lB8);
2837 
2838  DctY(bp+j+8, m_yWidth);
2839  Quant(m_block, m_lqt);
2840  sp += b2s(m_block, sp, m_lB8);
2841 
2842  DctY(bp2+k, m_cWidth);
2843  Quant(m_block, m_cqt);
2844  sp+=b2s(m_block, sp, m_cB8);
2845 
2846  DctY(bp3+k, m_cWidth);
2847  Quant(m_block, m_cqt);
2848  sp+=b2s(m_block, sp, m_cB8);
2849 
2850  }
2851  bp += m_width << 3;
2852  bp2 += m_width << 2;
2853  bp3 += m_width << 2;
2854 
2855  }
2856 #ifdef MMX
2857  emms();
2858 #endif
2859  return (sp-sb);
2860 }
2861 
2862 inline int RTjpeg::compress8(int8_t *sp, uint8_t **planes)
2863 {
2864  int8_t * sb = nullptr;
2865  uint8_t * bp = planes[0];
2866 
2867 #ifdef MMX
2868  emms();
2869 #endif
2870 
2871  sb=sp;
2872 /* Y */
2873  for(int i=0; i<m_height; i+=8)
2874  {
2875  for(int j=0; j<m_width; j+=8)
2876  {
2877  DctY(bp+j, m_width);
2878  Quant(m_block, m_lqt);
2879  sp += b2s(m_block, sp, m_lB8);
2880  }
2881  bp += m_width;
2882  }
2883 
2884 #ifdef MMX
2885  emms();
2886 #endif
2887  return (sp-sb);
2888 }
2889 
2890 inline void RTjpeg::decompressYUV422(int8_t *sp, uint8_t **planes)
2891 {
2892  uint8_t * bp = planes[0];
2893  uint8_t * bp2 = planes[1];
2894  uint8_t * bp3 = planes[2];
2895 
2896 #ifdef MMX
2897  emms();
2898 #endif
2899 
2900 /* Y */
2901  for(int i=m_height; i; i-=8)
2902  {
2903  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2904  if (*sp==-1)sp++;
2905  else
2906  {
2907  sp += s2b(m_block, sp, m_lB8, m_liqt);
2908  Idct(bp+j, m_block, m_width);
2909  }
2910  if (*sp==-1)sp++;
2911  else
2912  {
2913  sp += s2b(m_block, sp, m_lB8, m_liqt);
2914  Idct(bp+j+8, m_block, m_width);
2915  }
2916  if (*sp==-1)sp++;
2917  else
2918  {
2919  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2920  Idct(bp2+k, m_block, m_width>>1);
2921  }
2922  if (*sp==-1)sp++;
2923  else
2924  {
2925  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2926  Idct(bp3+k, m_block, m_width>>1);
2927  }
2928  }
2929  bp += m_width<<3;
2930  bp2 += m_width<<2;
2931  bp3 += m_width<<2;
2932  }
2933 #ifdef MMX
2934  emms();
2935 #endif
2936 }
2937 
2938 inline void RTjpeg::decompressYUV420(int8_t *sp, uint8_t **planes)
2939 {
2940  uint8_t * bp = planes[0];
2941  uint8_t * bp1 = bp + (m_width<<3);
2942  uint8_t * bp2 = planes[1];
2943  uint8_t * bp3 = planes[2];
2944 
2945 #ifdef MMX
2946  emms();
2947 #endif
2948 
2949 /* Y */
2950  for(int i=m_height>>1; i; i-=8)
2951  {
2952  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2953  if (*sp==-1)sp++;
2954  else
2955  {
2956  sp += s2b(m_block, sp, m_lB8, m_liqt);
2957  Idct(bp+j, m_block, m_width);
2958  }
2959  if (*sp==-1)sp++;
2960  else
2961  {
2962  sp += s2b(m_block, sp, m_lB8, m_liqt);
2963  Idct(bp+j+8, m_block, m_width);
2964  }
2965  if (*sp==-1)sp++;
2966  else
2967  {
2968  sp += s2b(m_block, sp, m_lB8, m_liqt);
2969  Idct(bp1+j, m_block, m_width);
2970  }
2971  if (*sp==-1)sp++;
2972  else
2973  {
2974  sp += s2b(m_block, sp, m_lB8, m_liqt);
2975  Idct(bp1+j+8, m_block, m_width);
2976  }
2977  if (*sp==-1)sp++;
2978  else
2979  {
2980  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2981  Idct(bp2+k, m_block, m_width>>1);
2982  }
2983  if (*sp==-1)sp++;
2984  else
2985  {
2986  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2987  Idct(bp3+k, m_block, m_width>>1);
2988  }
2989  }
2990  bp += m_width<<4;
2991  bp1 += m_width<<4;
2992  bp2 += m_width<<2;
2993  bp3 += m_width<<2;
2994  }
2995 #ifdef MMX
2996  emms();
2997 #endif
2998 }
2999 
3000 inline void RTjpeg::decompress8(int8_t *sp, uint8_t **planes)
3001 {
3002  uint8_t * bp = planes[0];
3003 
3004 #ifdef MMX
3005  emms();
3006 #endif
3007 
3008 /* Y */
3009  for(int i=0; i<m_height; i+=8)
3010  {
3011  for(int j=0; j<m_width; j+=8)
3012  {
3013  if (*sp==-1)sp++;
3014  else
3015  {
3016  sp += s2b(m_block, sp, m_lB8, m_liqt);
3017  Idct(bp+j, m_block, m_width);
3018  }
3019  }
3020  bp += m_width<<3;
3021  }
3022 }
3023 
#ifdef MMX

/// @brief Decide whether a freshly quantised block differs enough from its
///        previous-frame counterpart to be re-sent (conditional
///        replenishment).
/// @param rblock New quantised 8x8 coefficient block.
/// @param _old   Previous frame's coefficients for this block (updated with
///               rblock when the block is considered "changed").
/// @param mask   Per-word threshold replicated into all four 16-bit lanes
///               (built in SetIntra()).
/// @return 1 if every coefficient delta is within +/- threshold ("skip,
///         unchanged"), 0 if the block changed (and _old was refreshed).
/// MMX path: compares 8 coefficients per loop iteration; statement order is
/// hand-scheduled for pairing - do not reorder.
int RTjpeg::bcomp(RTjpegData16 &rblock, int16_t *_old, mmx_t *mask)
{
    auto *mold=(mmx_t *)_old;
    auto *mblock=(mmx_t *)rblock.data();
    volatile mmx_t result {};
    static mmx_t s_neg= { 0xffffffffffffffffULL };  // all-ones: used to negate via pxor

    movq_m2r(*mask, mm7);          // mm7 = threshold in each word lane
    movq_m2r(s_neg, mm6);          // mm6 = ~0 (for one's-complement negate)
    pxor_r2r(mm5, mm5);            // mm5 accumulates "over threshold" flags

    for(int i=0; i<8; i++)
    {
        movq_m2r(*(mblock++), mm0);    // load 4 new coefficients
        movq_m2r(*(mblock++), mm2);    // ... and 4 more
        movq_m2r(*(mold++), mm1);      // matching old coefficients
        movq_m2r(*(mold++), mm3);
        psubsw_r2r(mm1, mm0);          // delta = new - old (saturating)
        psubsw_r2r(mm3, mm2);
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        pcmpgtw_r2r(mm7, mm0);         // delta > +threshold ?
        pcmpgtw_r2r(mm7, mm2);
        pxor_r2r(mm6, mm1);            // ~delta (approximates -delta)
        pxor_r2r(mm6, mm3);
        pcmpgtw_r2r(mm7, mm1);         // -delta > +threshold ?  (i.e. delta < -threshold)
        pcmpgtw_r2r(mm7, mm3);
        por_r2r(mm0, mm5);             // OR all comparison results together
        por_r2r(mm2, mm5);
        por_r2r(mm1, mm5);
        por_r2r(mm3, mm5);
    }
    movq_r2m(mm5, result);

    // Any non-zero lane means at least one coefficient moved beyond the
    // threshold: refresh the reference block and report "changed".
    if (result.q)
    {
        std::copy(rblock.cbegin(), rblock.cend(), _old);
        return 0;
    }
    return 1;
}

#else
/// Scalar fallback of the block comparator above: returns 1 when every
/// coefficient delta is within *mask, otherwise refreshes _old from rblock
/// and returns 0.
int RTjpeg::bcomp(RTjpegData16 &rblock, int16_t *_old, uint16_t *mask)
{
    for(int i=0; i<64; i++)
        if (abs(_old[i]-rblock[i])>*mask)
        {
            std::copy(rblock.cbegin(), rblock.cend(), _old);
            return 0;
        }
    return 1;
}
#endif
3080 
3081 inline int RTjpeg::mcompressYUV420(int8_t *sp, uint8_t **planes)
3082 {
3083  uint8_t * bp = planes[0];
3084  uint8_t * bp1 = bp + (m_width<<3);
3085  uint8_t * bp2 = planes[1];
3086  uint8_t * bp3 = planes[2];
3087  int8_t * sb = sp;
3088  int16_t * lblock = m_old;
3089 
3090 /* Y */
3091  for(int i = m_height>>1; i; i-=8)
3092  {
3093  for(int j=0, k=0; j < m_width; j+=16, k+=8)
3094  {
3095  DctY(bp+j, m_yWidth);
3096  Quant(m_block, m_lqt);
3097  if (bcomp(m_block, lblock, &m_lMask))
3098  {
3099  *((uint8_t *)sp++)=255;
3100  }
3101  else
3102  {
3103  sp+=b2s(m_block, sp, m_lB8);
3104  }
3105  lblock += 64;
3106 
3107  DctY(bp+j+8, m_yWidth);
3108  Quant(m_block, m_lqt);
3109  if (bcomp(m_block, lblock, &m_lMask))
3110  {
3111  *((uint8_t *)sp++)=255;
3112  }
3113  else
3114  {
3115  sp += b2s(m_block, sp, m_lB8);
3116  }
3117  lblock += 64;
3118 
3119  DctY(bp1+j, m_yWidth);
3120  Quant(m_block, m_lqt);
3121  if (bcomp(m_block, lblock, &m_lMask))
3122  {
3123  *((uint8_t *)sp++)=255;
3124  }
3125  else
3126  {
3127  sp += b2s(m_block, sp, m_lB8);
3128  }
3129  lblock += 64;
3130 
3131  DctY(bp1+j+8, m_yWidth);
3132  Quant(m_block, m_lqt);
3133  if (bcomp(m_block, lblock, &m_lMask))
3134  {
3135  *((uint8_t *)sp++)=255;
3136  }
3137  else
3138  {
3139  sp += b2s(m_block, sp, m_lB8);
3140  }
3141  lblock += 64;
3142 
3143  DctY(bp2+k, m_cWidth);
3144  Quant(m_block, m_cqt);
3145  if (bcomp(m_block, lblock, &m_cMask))
3146  {
3147  *((uint8_t *)sp++)=255;
3148  }
3149  else
3150  {
3151  sp+=b2s(m_block, sp, m_cB8);
3152  }
3153  lblock+=64;
3154 
3155  DctY(bp3+k, m_cWidth);
3156  Quant(m_block, m_cqt);
3157  if (bcomp(m_block, lblock, &m_cMask))
3158  {
3159  *((uint8_t *)sp++)=255;
3160  }
3161  else
3162  {
3163  sp+=b2s(m_block, sp, m_cB8);
3164  }
3165  lblock+=64;
3166  }
3167  bp += m_width<<4;
3168  bp1 += m_width<<4;
3169  bp2 += m_width<<2;
3170  bp3 += m_width<<2;
3171  }
3172 #ifdef MMX
3173  emms();
3174 #endif
3175  return (sp-sb);
3176 }
3177 
3178 
3179 inline int RTjpeg::mcompressYUV422(int8_t *sp, uint8_t **planes)
3180 {
3181  uint8_t * bp = planes[0];
3182  uint8_t * bp2 = planes[1];
3183  uint8_t * bp3 = planes[2];
3184  int8_t * sb=sp;
3185  int16_t *lblock = m_old;
3186 
3187  for(int i = m_height; i; i-=8)
3188  {
3189  for(int j=0, k=0; j<m_width; j+=16, k+=8)
3190  {
3191  DctY(bp+j, m_yWidth);
3192  Quant(m_block, m_lqt);
3193  if (bcomp(m_block, lblock, &m_lMask))
3194  {
3195  *((uint8_t *)sp++)=255;
3196  }
3197  else
3198  {
3199  sp+=b2s(m_block, sp, m_lB8);
3200  }
3201  lblock+=64;
3202 
3203  DctY(bp+j+8, m_yWidth);
3204  Quant(m_block, m_lqt);
3205  if (bcomp(m_block, lblock, &m_lMask))
3206  {
3207  *((uint8_t *)sp++)=255;
3208  }
3209  else
3210  {
3211  sp+=b2s(m_block, sp, m_lB8);
3212  }
3213  lblock+=64;
3214 
3215  DctY(bp2+k, m_cWidth);
3216  Quant(m_block, m_cqt);
3217  if (bcomp(m_block, lblock, &m_cMask))
3218  {
3219  *((uint8_t *)sp++)=255;
3220  }
3221  else
3222  {
3223  sp+=b2s(m_block, sp, m_cB8);
3224  }
3225  lblock+=64;
3226 
3227  DctY(bp3+k, m_cWidth);
3228  Quant(m_block, m_cqt);
3229  if (bcomp(m_block, lblock, &m_cMask))
3230  {
3231  *((uint8_t *)sp++)=255;
3232  }
3233  else
3234  {
3235  sp+=b2s(m_block, sp, m_cB8);
3236  }
3237  lblock+=64;
3238 
3239  }
3240  bp += m_width<<3;
3241  bp2 += m_width<<2;
3242  bp3 += m_width<<2;
3243  }
3244 #ifdef MMX
3245  emms();
3246 #endif
3247  return (sp-sb);
3248 }
3249 
3250 inline int RTjpeg::mcompress8(int8_t *sp, uint8_t **planes)
3251 {
3252  uint8_t * bp = planes[0];
3253  int8_t * sb = sp;
3254  int16_t *lblock = m_old;
3255 
3256  for(int i=0; i<m_height; i+=8)
3257  {
3258  for(int j=0; j<m_width; j+=8)
3259  {
3260  DctY(bp+j, m_width);
3261  Quant(m_block, m_lqt);
3262  if (bcomp(m_block, lblock, &m_lMask))
3263  {
3264  *((uint8_t *)sp++)=255;
3265  }
3266  else
3267  {
3268  sp+=b2s(m_block, sp, m_lB8);
3269  }
3270  lblock+=64;
3271  }
3272  bp+=m_width<<3;
3273  }
3274 #ifdef MMX
3275  emms();
3276 #endif
3277  return (sp-sb);
3278 }
3279 
3281 {
3282  m_keyCount = 0;
3283 }
3284 
3285 int RTjpeg::Compress(int8_t *sp, uint8_t **planes)
3286 {
3287  auto * fh = reinterpret_cast<RTjpeg_frameheader *>(sp);
3288  int ds = 0;
3289 
3290  if (m_keyRate == 0)
3291  {
3292  switch(m_f)
3293  {
3294  case RTJ_YUV420: ds = compressYUV420((int8_t*)&(fh->data), planes); break;
3295  case RTJ_YUV422: ds = compressYUV422((int8_t*)&(fh->data), planes); break;
3296  case RTJ_RGB8: ds = compress8((int8_t*)&(fh->data), planes); break;
3297  }
3298  fh->key = 0;
3299  } else {
3300  if (m_keyCount == 0)
3301  memset(m_old, 0, (4_UZ * m_width * m_height));
3302  switch(m_f)
3303  {
3304  case RTJ_YUV420: ds = mcompressYUV420((int8_t*)&(fh->data), planes); break;
3305  case RTJ_YUV422: ds = mcompressYUV422((int8_t*)&(fh->data), planes); break;
3306  case RTJ_RGB8: ds = mcompress8((int8_t*)&(fh->data), planes); break;
3307  }
3308  fh->key = m_keyCount;
3309  if (++m_keyCount > m_keyRate)
3310  m_keyCount = 0;
3311  }
3312  ds += RTJPEG_HEADER_SIZE;
3313  fh->framesize = qToLittleEndian<qint32>(ds);
3314  fh->headersize = RTJPEG_HEADER_SIZE;
3315  fh->version = RTJPEG_FILE_VERSION;
3316  fh->width = qToLittleEndian<qint16>(m_width);
3317  fh->height = qToLittleEndian<qint16>(m_height);
3318  fh->quality = m_q;
3319  return ds;
3320 }
3321 
3322 void RTjpeg::Decompress(int8_t *sp, uint8_t **planes)
3323 {
3324  auto * fh = reinterpret_cast<RTjpeg_frameheader *>(sp);
3325 
3326  if ((qFromLittleEndian<qint16>(fh->width) != m_width)||
3327  (qFromLittleEndian<qint16>(fh->height) != m_height))
3328  {
3329  int w = qFromLittleEndian<qint16>(fh->width);
3330  int h = qFromLittleEndian<qint16>(fh->height);
3331  SetSize(&w, &h);
3332  }
3333  if (fh->quality != m_q)
3334  {
3335  int q = fh->quality;
3336  SetQuality(&q);
3337  }
3338  switch(m_f)
3339  {
3340  case RTJ_YUV420: decompressYUV420((int8_t*)&(fh->data), planes); break;
3341  case RTJ_YUV422: decompressYUV422((int8_t*)&(fh->data), planes); break;
3342  case RTJ_RGB8: decompress8((int8_t*)&(fh->data), planes); break;
3343  }
3344 }
RTjpeg::mcompressYUV422
int mcompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3179
RTjpeg_chrom_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_chrom_quant_tbl
Definition: RTjpegN.cpp:88
RTjpeg::s2b
static int s2b(RTjpegData16 &data, const int8_t *strm, uint8_t bt8, RTjpegData32 &qtbla)
Definition: RTjpegN.cpp:283
RTjpeg::DctY
void DctY(uint8_t *idata, int rskip)
Definition: RTjpegN.cpp:601
RTjpeg_lum_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_lum_quant_tbl
Definition: RTjpegN.cpp:77
RTjpegData16
std::array< int16_t, 64 > RTjpegData16
Definition: RTjpegN.h:38
RTjpeg::m_lB8
int32_t m_lB8
Definition: RTjpegN.h:108
RTjpeg::m_yWidth
int32_t m_yWidth
Definition: RTjpegN.h:110
RTjpeg::compress8
int compress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2862
RTjpeg::m_lMask
mmx_t m_lMask
Definition: RTjpegN.h:122
RTjpeg::m_ySize
int32_t m_ySize
Definition: RTjpegN.h:112
RTjpeg::b2s
static int b2s(const RTjpegData16 &data, int8_t *strm, uint8_t bt8)
Definition: RTjpegN.cpp:116
RTjpeg_C4
static mmx_t RTjpeg_C4
Definition: RTjpegN.cpp:39
RTjpeg::SetNextKey
void SetNextKey(void)
Definition: RTjpegN.cpp:3280
RTjpeg::m_q
int m_q
Definition: RTjpegN.h:119
RTjpeg_frameheader
Definition: RTjpegN.h:131
RTJ_RGB8
@ RTJ_RGB8
Definition: RTjpegN.h:50
RTjpeg_C6
static mmx_t RTjpeg_C6
Definition: RTjpegN.cpp:40
RTjpeg::m_height
int m_height
Definition: RTjpegN.h:118
tmp
static guint32 * tmp
Definition: goom_core.cpp:26
RTjpeg_half
static mmx_t RTjpeg_half
Definition: RTjpegN.cpp:38
RTjpeg::m_cB8
int32_t m_cB8
Definition: RTjpegN.h:109
RTjpeg::DctInit
void DctInit(void)
Definition: RTjpegN.cpp:592
RTjpeg::CalcTbls
void CalcTbls(void)
Definition: RTjpegN.cpp:2643
RTjpeg::QuantInit
void QuantInit(void)
Definition: RTjpegN.cpp:517
RTJPEG_FILE_VERSION
static constexpr uint8_t RTJPEG_FILE_VERSION
Definition: RTjpegN.h:35
MythFile::copy
MBASE_PUBLIC long long copy(QFile &dst, QFile &src, uint block_size=0)
Copies src file to dst file.
Definition: mythmiscutil.cpp:263
RTjpeg::bcomp
static int bcomp(RTjpegData16 &rblock, int16_t *old, mmx_t *mask)
Definition: RTjpegN.cpp:3026
RTJ_YUV420
@ RTJ_YUV420
Definition: RTjpegN.h:48
RTjpeg_C2mC6
static mmx_t RTjpeg_C2mC6
Definition: RTjpegN.cpp:41
RTjpeg_ones
static mmx_t RTjpeg_ones
Definition: RTjpegN.cpp:37
RTjpeg::mcompress8
int mcompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3250
RTjpeg::mcompressYUV420
int mcompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3081
RTjpeg::compressYUV420
int compressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2768
RTjpeg_ZZ
static const std::array< const uint8_t, 64 > RTjpeg_ZZ
Definition: RTjpegN.cpp:49
RTjpeg::decompress8
void decompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3000
RTjpeg::m_cWidth
int32_t m_cWidth
Definition: RTjpegN.h:111
hardwareprofile.distros.mythtv_data.main.stdout
stdout
Definition: main.py:87
RTJ_YUV422
@ RTJ_YUV422
Definition: RTjpegN.h:49
clamp
static eu8 clamp(eu8 value, eu8 low, eu8 high)
Definition: pxsup2dast.c:204
RTjpeg::Idct
void Idct(uint8_t *odata, RTjpegData16 &data, int rskip)
Definition: RTjpegN.cpp:1525
RTjpeg::m_width
int m_width
Definition: RTjpegN.h:117
RTjpeg::~RTjpeg
~RTjpeg()
Definition: RTjpegN.cpp:2763
RTjpeg::m_ws
std::array< int32_t, 64_UZ *4 > m_ws
Definition: RTjpegN.h:103
RTjpeg::Quant
static void Quant(RTjpegData16 &block, RTjpegData32 &qtbl)
Definition: RTjpegN.cpp:532
RTjpeg::m_block
RTjpegData16 m_block
Definition: RTjpegN.h:102
RTjpeg::decompressYUV420
void decompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2938
RTjpeg_C2pC6
static mmx_t RTjpeg_C2pC6
Definition: RTjpegN.cpp:42
RTjpeg::SetIntra
int SetIntra(int *key, int *lm, int *cm)
Definition: RTjpegN.cpp:2722
bbciplayer.stderr
stderr
Definition: bbciplayer.py:199
RTjpeg::m_lqt
RTjpegData32 m_lqt
Definition: RTjpegN.h:104
RTjpeg::m_keyCount
int m_keyCount
Definition: RTjpegN.h:115
RTjpeg::SetFormat
int SetFormat(const int *fmt)
Definition: RTjpegN.cpp:2688
RTjpeg::m_ciqt
RTjpegData32 m_ciqt
Definition: RTjpegN.h:107
RTjpeg::m_old
int16_t * m_old
Definition: RTjpegN.h:114
RTjpeg::m_keyRate
int m_keyRate
Definition: RTjpegN.h:128
RTjpeg_zero
static mmx_t RTjpeg_zero
Definition: RTjpegN.cpp:43
RTjpeg::SetSize
int SetSize(const int *w, const int *h)
Definition: RTjpegN.cpp:2694
RTjpeg::IdctInit
void IdctInit(void)
Definition: RTjpegN.cpp:1516
RTjpeg::decompressYUV422
void decompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2890
RTjpeg::Decompress
void Decompress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3322
RTjpeg::compressYUV422
int compressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2819
uint16_t
unsigned short uint16_t
Definition: iso6937tables.h:3
RTjpeg::m_cSize
int32_t m_cSize
Definition: RTjpegN.h:113
RTjpeg::m_cMask
mmx_t m_cMask
Definition: RTjpegN.h:123
RTjpegN.h
RTjpeg::m_cqt
RTjpegData32 m_cqt
Definition: RTjpegN.h:105
RTjpeg_aan_tab
static const std::array< const uint64_t, 64 > RTjpeg_aan_tab
Definition: RTjpegN.cpp:66
RTjpeg::m_liqt
RTjpegData32 m_liqt
Definition: RTjpegN.h:106
RTjpeg::SetQuality
int SetQuality(int *quality)
Definition: RTjpegN.cpp:2674
RTjpeg::Compress
int Compress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3285
RTjpegData32
std::array< int32_t, 64 > RTjpegData32
Definition: RTjpegN.h:39
RTjpeg::m_f
int m_f
Definition: RTjpegN.h:120
RTJPEG_HEADER_SIZE
static constexpr uint8_t RTJPEG_HEADER_SIZE
Definition: RTjpegN.h:36
RTjpeg::RTjpeg
RTjpeg()
Definition: RTjpegN.cpp:2750