MythTV  master
RTjpegN.cpp
Go to the documentation of this file.
1 /*
2  RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3 
4  With modifications by:
5  (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6  and
7  (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation; either version 2 of the License, or
12  (at your option) any later version.
13 
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 */
24 
25 #include <array>
26 #include <cstdio>
27 #include <cstdlib>
28 #include <cstring>
29 #include "RTjpegN.h"
30 
#ifdef MMX
/* Packed constants loaded into MMX registers by the routines below.
 * They are only declared here; their values are assigned elsewhere
 * (not visible in this part of the file). */

/* Used by Quant(): interleaved with the quantiser words and with the
 * block words before the multiply-add. */
static mmx_t RTjpeg_ones;
static mmx_t RTjpeg_half;

/* Used by DctY(): zero for byte->word unpacking ... */
static mmx_t RTjpeg_zero;

/* ... and the multipliers fed to pmulhw. */
static mmx_t RTjpeg_C4;
static mmx_t RTjpeg_C6;
static mmx_t RTjpeg_C2mC6;
static mmx_t RTjpeg_C2pC6;
#endif
40 
41 //#define SHOWBLOCK 1
42 #define BETTERCOMPRESSION 1
43 
/* Scan-order table: entry k is the index inside the 8x8 block of the
 * k-th coefficient visited by the stream coder (b2s/s2b).  The walk runs
 * along the anti-diagonals of the block, alternating direction.
 * Shown here eight entries per row, in scan order. */
static const std::array<const uint8_t,64> RTjpeg_ZZ {
     0,  8,  1,  2,  9, 16, 24, 17,
    10,  3,  4, 11, 18, 25, 32, 40,
    33, 26, 19, 12,  5,  6, 13, 20,
    27, 34, 41, 48, 56, 49, 42, 35,
    28, 21, 14,  7, 15, 22, 29, 36,
    43, 50, 57, 58, 51, 44, 37, 30,
    23, 31, 38, 45, 52, 59, 60, 53,
    46, 39, 47, 54, 61, 62, 55, 63 };
60 
/* Per-coefficient scale factors, one per position of the 8x8 block in
 * row-major order.  The values are fixed point with 2^32 standing for
 * 1.0 (entry 0 is exactly 4294967296); the quantiser tables are divided
 * by these entries after being shifted left by 32 bits.  The table is
 * symmetric: entry (r,c) equals entry (c,r). */
static const std::array<const uint64_t,64> RTjpeg_aan_tab {
    /* row 0 */
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL,
    4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    /* row 1 */
    5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL,
    5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
    /* row 2 */
    5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL,
    5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
    /* row 3 */
    5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL,
    5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
    /* row 4 */
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL,
    4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    /* row 5 */
    3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL,
    3374581504ULL, 2651326208ULL, 1826357504ULL,  931136000ULL,
    /* row 6 */
    2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL,
    2324432128ULL, 1826357504ULL, 1258030336ULL,  641204288ULL,
    /* row 7 */
    1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL,
    1184891264ULL,  931136000ULL,  641204288ULL,  326894240ULL,
};
71 
/* Base quantiser step sizes for the luminance plane, one per position
 * of the 8x8 block, row by row.
 * NOTE(review): the values look like the example luminance table from
 * the JPEG specification (Annex K) - confirm before relying on that. */
static const std::array<const uint8_t,64> RTjpeg_lum_quant_tbl {
    16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99
};
82 
/* Base quantiser step sizes for the chrominance planes, one per position
 * of the 8x8 block, row by row.  Everything outside the upper-left
 * corner uses the same coarse step of 99.
 * NOTE(review): the values look like the example chrominance table from
 * the JPEG specification (Annex K) - confirm before relying on that. */
static const std::array<const uint8_t,64> RTjpeg_chrom_quant_tbl {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};
93 
94 #ifdef BETTERCOMPRESSION
95 
96 /*--------------------------------------------------*/
97 /* better encoding, but needs a lot more cpu time */
98 /* seems to be more effective than old method +lzo */
99 /* with this encoding lzo isn't efficient anymore */
100 /* there is still more potential for better */
101 /* encoding but that would need even more cputime */
102 /* anyway your mileage may vary */
103 /* */
104 /* written by Martin BIELY and Roman HOCHLEITNER */
105 /*--------------------------------------------------*/
106 
107 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
108 /* Block to Stream (encoding) */
109 /* */
110 
111 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t /*bt8*/)
112 {
113  int co=1;
114 
115  auto *ustrm = (uint8_t *)strm;
116 #ifdef SHOWBLOCK
117 
118  int ii;
119  for (ii=0; ii < 64; ii++) {
120  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
121  }
122  fprintf(stdout, "\n\n");
123 
124 #endif
125 
126 // *strm++ = 0x10;
127 // *strm = 0x00;
128 //
129 // return 2;
130 
131  // first byte allways written
132  ustrm[0]=
133  (uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
134 
135 
136  int ci=63;
137  while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
138 
139  unsigned char bitten = ((unsigned char)ci) << 2;
140 
141  if (ci==0) {
142  ustrm[1]= bitten;
143  co = 2;
144  return co;
145  }
146 
147  /* bitoff=0 because the high 6bit contain first non zero position */
148  unsigned char bitoff = 0;
149  co = 1;
150 
151  for(; ci>0; ci--) {
152 
153  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
154 
155  switch(ZZvalue) {
156  case 0:
157  break;
158  case 1:
159  bitten |= (0x01<<bitoff);
160  break;
161  case -1:
162  bitten |= (0x03<<bitoff);
163  break;
164  default:
165  bitten |= (0x02<<bitoff);
166  goto HERZWEH;
167  break;
168  }
169 
170  if ( bitoff == 0 ) {
171  ustrm[co]= bitten;
172  bitten = 0;
173  bitoff = 8;
174  co++;
175  } /* "fall through" */
176  bitoff-=2;
177 
178  }
179 
180  /* ci must be 0 */
181  if (bitoff != 6) {
182 
183  ustrm[co]= bitten;
184  co++;
185 
186  }
187  goto BAUCHWEH;
188 
189 HERZWEH:
190 /* ci cannot be 0 */
191 /* correct bitoff to nibble boundaries */
192 
193  switch(bitoff){
194  case 4:
195  case 6:
196  bitoff = 0;
197  break;
198  case 2:
199  case 0:
200  ustrm[co]= bitten;
201  bitoff = 4;
202  co++;
203  bitten = 0; // clear half nibble values in bitten
204  break;
205  default:
206  break;
207  }
208 
209  for(; ci>0; ci--) {
210 
211  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
212 
213  if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
214  bitten |= (0x08<<bitoff);
215  goto HIRNWEH;
216  }
217 
218  bitten |= (ZZvalue&0xf)<<bitoff;
219 
220  if ( bitoff == 0 ) {
221  ustrm[co]= bitten;
222  bitten = 0;
223  bitoff = 8;
224  co++;
225  } /* "fall thru" */
226  bitoff-=4;
227  }
228 
229  /* ci must be 0 */
230  if ( bitoff == 0 ) {
231  ustrm[co]= bitten;
232  co++;
233  }
234  goto BAUCHWEH;
235 
236 HIRNWEH:
237 
238  ustrm[co]= bitten;
239  co++;
240 
241 
242  /* bitting is over now we bite */
243  for(; ci>0; ci--) {
244 
245  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
246 
247  if (ZZvalue>0)
248  {
249  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
250  }
251  else
252  {
253  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
254  }
255 
256  }
257 
258 
259 BAUCHWEH:
260  /* we gotoo much now we are ill */
261 #ifdef SHOWBLOCK
262 {
263 int i;
264 fprintf(stdout, "\nco = '%d'\n", co);
265  for (i=0; i < co+2; i++) {
266  fprintf(stdout, "%d ", strm[i]);
267  }
268 fprintf(stdout, "\n\n");
269 }
270 #endif
271 
272  return co;
273 }
274 
275 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
276 /* Stream to Block (decoding) */
277 /* */
278 
279 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t /*bt8*/, int32_t *qtbla)
280 {
281  auto *qtbl = (uint32_t *)qtbla;
282  int ci = 0;
283  unsigned char bitoff = 0;
284 
285  /* first byte always read */
286  int i=RTjpeg_ZZ[0];
287  data[i]=((uint8_t)strm[0])*qtbl[i];
288 
289  /* we start at the behind */
290 
291  unsigned char bitten = ((unsigned char)strm[1]) >> 2;
292  int co = 63;
293  for(; co > bitten; co--) {
294 
295  data[RTjpeg_ZZ[co]] = 0;
296 
297  }
298 
299  if (co==0) {
300  ci = 2;
301  goto AUTOBAHN;
302  }
303 
304  /* we have to read the last 2 bits of the second byte */
305  ci=1;
306  bitoff = 0;
307 
308  for(; co>0; co--) {
309 
310  bitten = ((unsigned char)strm[ci]) >> bitoff;
311  bitten &= 0x03;
312 
313  i=RTjpeg_ZZ[co];
314 
315  switch( bitten ) {
316  case 0x03:
317  data[i]= -qtbl[i];
318  break;
319  case 0x02:
320  goto FUSSWEG;
321  break;
322  case 0x01:
323  data[i]= qtbl[i];
324  break;
325  case 0x00:
326  data[i]= 0;
327  break;
328  default:
329  break;
330  }
331 
332  if ( bitoff == 0 ) {
333  bitoff = 8;
334  ci++;
335  }
336  bitoff -= 2;
337  }
338  /* co is 0 now */
339  /* data is written properly */
340 
341  /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
342  if (bitoff!=6) ci++;
343 
344  goto AUTOBAHN;
345 
346 
347 FUSSWEG:
348 /* correct bitoff to nibble */
349  switch(bitoff){
350  case 4:
351  case 6:
352  bitoff = 0;
353  break;
354  case 2:
355  case 0:
356  /* we have to read from the next byte */
357  ci++;
358  bitoff = 4;
359  break;
360  default:
361  break;
362  }
363 
364  for(; co>0; co--) {
365 
366  bitten = ((unsigned char)strm[ci]) >> bitoff;
367  bitten &= 0x0f;
368 
369  i=RTjpeg_ZZ[co];
370 
371  if ( bitten == 0x08 ) {
372  goto STRASSE;
373  }
374 
375  /* the compiler cannot do sign extension for signed nibbles */
376  if ( bitten & 0x08 ) {
377  bitten |= 0xf0;
378  }
379  /* the unsigned char bitten now is a valid signed char */
380 
381  data[i]=((signed char)bitten)*qtbl[i];
382 
383  if ( bitoff == 0 ) {
384  bitoff = 8;
385  ci++;
386  }
387  bitoff -= 4;
388  }
389  /* co is 0 */
390 
391  /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
392  if (bitoff!=4) ci++;
393 
394  goto AUTOBAHN;
395 
396 STRASSE:
397  ci++;
398 
399  for(; co>0; co--) {
400  i=RTjpeg_ZZ[co];
401  data[i]=strm[ci++]*qtbl[i];
402  }
403 
404  /* ci now is the count, because it points to next element => no incrementing */
405 
406 AUTOBAHN:
407 
408 #ifdef SHOWBLOCK
409 fprintf(stdout, "\nci = '%d'\n", ci);
410  for (i=0; i < 64; i++) {
411  fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
412  }
413 fprintf(stdout, "\n\n");
414 #endif
415 
416  return ci;
417 }
418 
419 #else
420 
421 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
422 {
423  register int ci, co=1, tmp;
424  register int16_t ZZvalue;
425 
426 #ifdef SHOWBLOCK
427 
428  int ii;
429  for (ii=0; ii < 64; ii++) {
430  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
431  }
432  fprintf(stdout, "\n\n");
433 
434 #endif
435 
436  (uint8_t)strm[0]=(uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
437 
438  for(ci=1; ci<=bt8; ci++)
439  {
440  ZZvalue = data[RTjpeg_ZZ[ci]];
441 
442  if (ZZvalue>0)
443  {
444  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
445  }
446  else
447  {
448  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
449  }
450  }
451 
452  for(; ci<64; ci++)
453  {
454  ZZvalue = data[RTjpeg_ZZ[ci]];
455 
456  if (ZZvalue>0)
457  {
458  strm[co++]=(int8_t)(ZZvalue>63)?63:ZZvalue;
459  }
460  else if (ZZvalue<0)
461  {
462  strm[co++]=(int8_t)(ZZvalue<-64)?-64:ZZvalue;
463  }
464  else /* compress zeros */
465  {
466  tmp=ci;
467  do
468  {
469  ci++;
470  } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
471 
472  strm[co++]=(int8_t)(63+(ci-tmp));
473  ci--;
474  }
475  }
476  return (int)co;
477 }
478 
479 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
480 {
481  uint32_t *qtbl = (uint32_t *)qtbla;
482  int ci=1, co=1, tmp;
483  register int i;
484 
485  i=RTjpeg_ZZ[0];
486  data[i]=((uint8_t)strm[0])*qtbl[i];
487 
488  for(co=1; co<=bt8; co++)
489  {
490  i=RTjpeg_ZZ[co];
491  data[i]=strm[ci++]*qtbl[i];
492  }
493 
494  for(; co<64; co++)
495  {
496  if (strm[ci]>63)
497  {
498  tmp=co+strm[ci]-63;
499  for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
500  co--;
501  } else
502  {
503  i=RTjpeg_ZZ[co];
504  data[i]=strm[ci]*qtbl[i];
505  }
506  ci++;
507  }
508  return (int)ci;
509 }
510 #endif
511 
512 #ifdef MMX
514 {
515  using P16_32 = union { int16_t *m_int16; int32_t *m_int32; };
516  P16_32 qtbl;
517 
518  qtbl.m_int32 = m_lqt;
519  for (int i = 0; i < 64; i++)
520  qtbl.m_int16[i] = static_cast<int16_t>(m_lqt[i]);
521 
522  // cppcheck-suppress unreadVariable
523  qtbl.m_int32 = m_cqt;
524  for (int i = 0; i < 64; i++)
525  qtbl.m_int16[i] = static_cast<int16_t>(m_cqt[i]);
526 }
527 
528 void RTjpeg::Quant(int16_t *_block, int32_t *qtbl)
529 {
530  auto *ql=(mmx_t *)qtbl;
531  auto *bl=(mmx_t *)_block;
532 
533  movq_m2r(RTjpeg_ones, mm6);
534  movq_m2r(RTjpeg_half, mm7);
535 
536  for(int i=16; i; i--)
537  {
538  movq_m2r(*(ql++), mm0); /* quant vals (4) */
539  movq_m2r(*bl, mm2); /* block vals (4) */
540  movq_r2r(mm0, mm1);
541  movq_r2r(mm2, mm3);
542 
543  punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
544  punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
545 
546  punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
547  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
548 
549  pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
550  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
551 
552  psrad_i2r(16, mm0);
553  psrad_i2r(16, mm1);
554 
555  packssdw_r2r(mm1, mm0);
556 
557  movq_r2m(mm0, *(bl++));
558  }
559 }
560 #else
561 void RTjpeg::QuantInit()
562 {
563 }
564 
565 void RTjpeg::Quant(int16_t *_block, int32_t *qtbl)
566 {
567  int i;
568 
569  for(i=0; i<64; i++)
570  _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
571 }
572 #endif
573 
574 /*
575  * Perform the forward DCT on one block of samples.
576  */
#ifndef MMX
/* Multipliers for the integer forward DCT, stored as the real value
 * times 256, rounded to the nearest integer. */
#define FIX_0_382683433 ((int32_t) 98) /* FIX(0.382683433) */
#define FIX_0_541196100 ((int32_t) 139) /* FIX(0.541196100) */
#define FIX_0_707106781 ((int32_t) 181) /* FIX(0.707106781) */
#define FIX_1_306562965 ((int32_t) 334) /* FIX(1.306562965) */

/* Round to nearest and drop 8 (DESCALE10) or 16 (DESCALE20) fraction
 * bits; the result is narrowed to int16_t. */
#define DESCALE10(x) (int16_t)( ((x)+128) >> 8)
#define DESCALE20(x) (int16_t)(((x)+32768) >> 16)

/* Product of a value and one of the FIX_ multipliers, as int32_t. */
#define D_MULTIPLY(var,factor) ((int32_t) ((var) * (factor)))
#endif
587 
589 {
590  for (int i = 0; i < 64; i++)
591  {
592  m_lqt[i] = (((uint64_t)m_lqt[i] << 32) / RTjpeg_aan_tab[i]);
593  m_cqt[i] = (((uint64_t)m_cqt[i] << 32) / RTjpeg_aan_tab[i]);
594  }
595 }
596 
597 void RTjpeg::DctY(uint8_t *idata, int rskip)
598 {
599 #ifndef MMX
600  uint8_t *idataptr = idata;
601  int32_t *wsptr = m_ws;
602 
603  for (int ctr = 7; ctr >= 0; ctr--) {
604  int32_t tmp0 = idataptr[0] + idataptr[7];
605  int32_t tmp7 = idataptr[0] - idataptr[7];
606  int32_t tmp1 = idataptr[1] + idataptr[6];
607  int32_t tmp6 = idataptr[1] - idataptr[6];
608  int32_t tmp2 = idataptr[2] + idataptr[5];
609  int32_t tmp5 = idataptr[2] - idataptr[5];
610  int32_t tmp3 = idataptr[3] + idataptr[4];
611  int32_t tmp4 = idataptr[3] - idataptr[4];
612 
613  int32_t tmp10 = (tmp0 + tmp3); /* phase 2 */
614  int32_t tmp13 = tmp0 - tmp3;
615  int32_t tmp11 = (tmp1 + tmp2);
616  int32_t tmp12 = tmp1 - tmp2;
617 
618  wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
619  wsptr[4] = (tmp10 - tmp11)<<8;
620 
621  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
622  wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
623  wsptr[6] = (tmp13<<8) - z1;
624 
625  tmp10 = tmp4 + tmp5; /* phase 2 */
626  tmp11 = tmp5 + tmp6;
627  tmp12 = tmp6 + tmp7;
628 
629  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
630  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
631  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
632  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
633 
634  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
635  int32_t z13 = (tmp7<<8) - z3;
636 
637  wsptr[5] = z13 + z2; /* phase 6 */
638  wsptr[3] = z13 - z2;
639  wsptr[1] = z11 + z4;
640  wsptr[7] = z11 - z4;
641 
642  idataptr += rskip<<3; /* advance pointer to next row */
643  wsptr += 8;
644  }
645 
646  wsptr = m_ws;
647  int16_t *odataptr = m_block;
648  for (int ctr = 7; ctr >= 0; ctr--) {
649  int32_t tmp0 = wsptr[0] + wsptr[56];
650  int32_t tmp7 = wsptr[0] - wsptr[56];
651  int32_t tmp1 = wsptr[8] + wsptr[48];
652  int32_t tmp6 = wsptr[8] - wsptr[48];
653  int32_t tmp2 = wsptr[16] + wsptr[40];
654  int32_t tmp5 = wsptr[16] - wsptr[40];
655  int32_t tmp3 = wsptr[24] + wsptr[32];
656  int32_t tmp4 = wsptr[24] - wsptr[32];
657 
658  int32_t tmp10 = tmp0 + tmp3; /* phase 2 */
659  int32_t tmp13 = tmp0 - tmp3;
660  int32_t tmp11 = tmp1 + tmp2;
661  int32_t tmp12 = tmp1 - tmp2;
662 
663  odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
664  odataptr[32] = DESCALE10(tmp10 - tmp11);
665 
666  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
667  odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
668  odataptr[48] = DESCALE20((tmp13<<8) - z1);
669 
670  tmp10 = tmp4 + tmp5; /* phase 2 */
671  tmp11 = tmp5 + tmp6;
672  tmp12 = tmp6 + tmp7;
673 
674  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
675  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
676  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
677  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
678 
679  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
680  int32_t z13 = (tmp7<<8) - z3;
681 
682  odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
683  odataptr[24] = DESCALE20(z13 - z2);
684  odataptr[8] = DESCALE20(z11 + z4);
685  odataptr[56] = DESCALE20(z11 - z4);
686 
687  odataptr++; /* advance pointer to next column */
688  wsptr++;
689 
690  }
691 #else
692  volatile mmx_t tmp6 {};
693  volatile mmx_t tmp7 {};
694  auto *dataptr = (mmx_t *)m_block;
695  auto *idata2 = (mmx_t *)idata;
696 
697 
698  // first copy the input 8 bit to the destination 16 bits
699 
700  movq_m2r(RTjpeg_zero, mm2);
701 
702  movq_m2r(*idata2, mm0);
703  movq_r2r(mm0, mm1);
704 
705  punpcklbw_r2r(mm2, mm0);
706  movq_r2m(mm0, *(dataptr));
707 
708  punpckhbw_r2r(mm2, mm1);
709  movq_r2m(mm1, *(dataptr+1));
710 
711  idata2 += rskip;
712 
713  movq_m2r(*idata2, mm0);
714  movq_r2r(mm0, mm1);
715 
716  punpcklbw_r2r(mm2, mm0);
717  movq_r2m(mm0, *(dataptr+2));
718 
719  punpckhbw_r2r(mm2, mm1);
720  movq_r2m(mm1, *(dataptr+3));
721 
722  idata2 += rskip;
723 
724  movq_m2r(*idata2, mm0);
725  movq_r2r(mm0, mm1);
726 
727  punpcklbw_r2r(mm2, mm0);
728  movq_r2m(mm0, *(dataptr+4));
729 
730  punpckhbw_r2r(mm2, mm1);
731  movq_r2m(mm1, *(dataptr+5));
732 
733  idata2 += rskip;
734 
735  movq_m2r(*idata2, mm0);
736  movq_r2r(mm0, mm1);
737 
738  punpcklbw_r2r(mm2, mm0);
739  movq_r2m(mm0, *(dataptr+6));
740 
741  punpckhbw_r2r(mm2, mm1);
742  movq_r2m(mm1, *(dataptr+7));
743 
744  idata2 += rskip;
745 
746  movq_m2r(*idata2, mm0);
747  movq_r2r(mm0, mm1);
748 
749  punpcklbw_r2r(mm2, mm0);
750  movq_r2m(mm0, *(dataptr+8));
751 
752  punpckhbw_r2r(mm2, mm1);
753  movq_r2m(mm1, *(dataptr+9));
754 
755  idata2 += rskip;
756 
757  movq_m2r(*idata2, mm0);
758  movq_r2r(mm0, mm1);
759 
760  punpcklbw_r2r(mm2, mm0);
761  movq_r2m(mm0, *(dataptr+10));
762 
763  punpckhbw_r2r(mm2, mm1);
764  movq_r2m(mm1, *(dataptr+11));
765 
766  idata2 += rskip;
767 
768  movq_m2r(*idata2, mm0);
769  movq_r2r(mm0, mm1);
770 
771  punpcklbw_r2r(mm2, mm0);
772  movq_r2m(mm0, *(dataptr+12));
773 
774  punpckhbw_r2r(mm2, mm1);
775  movq_r2m(mm1, *(dataptr+13));
776 
777  idata2 += rskip;
778 
779  movq_m2r(*idata2, mm0);
780  movq_r2r(mm0, mm1);
781 
782  punpcklbw_r2r(mm2, mm0);
783  movq_r2m(mm0, *(dataptr+14));
784 
785  punpckhbw_r2r(mm2, mm1);
786  movq_r2m(mm1, *(dataptr+15));
787 
788 /* Start Transpose to do calculations on rows */
789 
790  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
791 
792  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
793  movq_r2r(mm7, mm5);
794 
795  punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
796  movq_r2r(mm6, mm2);
797 
798  punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
799  movq_r2r(mm7, mm1);
800 
801  movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
802  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
803 
804  movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
805  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
806 
807  movq_r2m(mm7,*(dataptr+9)); // write result 1
808  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
809 
810  movq_r2m(mm1,*(dataptr+11)); // write result 2
811  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
812 
813  movq_r2r(mm5, mm1);
814  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
815 
816  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
817  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
818 
819  movq_r2m(mm5,*(dataptr+13)); // write result 3
820 
821  // last 4x4 done
822 
823  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
824 
825  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
826  movq_r2r(mm0, mm6);
827 
828  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
829  movq_r2r(mm2, mm7);
830 
831  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
832  movq_r2r(mm0, mm4);
833 
834  //
835  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
836  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
837 
838  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
839  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
840 
841  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
842  movq_r2r(mm1, mm2); // copy first line
843 
844  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
845  movq_r2r(mm6, mm5); // copy first intermediate result
846 
847  movq_r2m(mm0, *(dataptr+8)); // write result 1
848  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
849 
850  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
851  movq_r2r(mm3, mm0); // copy third line
852 
853  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
854 
855  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
856  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
857 
858  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
859  movq_r2r(mm1, mm4);
860 
861  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
862  punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
863 
864  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
865  movq_r2r(mm2, mm6);
866 
867  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
868  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
869 
870  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
871  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
872 
873  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
874  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
875 
876  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
877 
878  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
879 
880  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
881 
882 
883 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
884 
885  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
886  movq_r2r(mm0, mm2);
887 
888  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
889  movq_r2r(mm7, mm4);
890 
891  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
892  movq_r2r(mm0, mm1);
893 
894  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
895  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
896 
897  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
898  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
899 
900  movq_r2r(mm0, mm7); // write result 1
901  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
902 
903  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
904  movq_r2r(mm1, mm6); // write result 2
905 
906  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
907  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
908 
909  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
910  movq_r2r(mm2, mm3); // copy first intermediate result
911 
912  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
913  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
914 
915  movq_r2m(mm7, tmp7);
916  movq_r2r(mm2, mm5); // write result 3
917 
918  movq_r2m(mm6, tmp6);
919  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
920 
921  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
922  movq_r2r(mm3, mm4); // write result 4
923 
924 /************************************************************************************************
925  End of Transpose
926 ************************************************************************************************/
927 
928 
929  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
930  movq_r2r(mm0, mm7);
931 
932  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
933  movq_r2r(mm1, mm6);
934 
935  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
936  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
937 
938  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
939  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
940 
941  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
942  paddw_r2r(mm7, mm6); // tmp12 + tmp13
943 
944  /* stage 3 */
945 
946  movq_m2r(tmp6, mm2);
947  movq_r2r(mm0, mm3);
948 
949  psllw_i2r(2, mm6); // m8 * 2^2
950  paddw_r2r(mm1, mm0);
951 
952  pmulhw_m2r(RTjpeg_C4, mm6); // z1
953  psubw_r2r(mm1, mm3);
954 
955  movq_r2m(mm0, *dataptr);
956  movq_r2r(mm7, mm0);
957 
958  /* Odd part */
959  movq_r2m(mm3, *(dataptr+8));
960  paddw_r2r(mm5, mm4); // tmp10
961 
962  movq_m2r(tmp7, mm3);
963  paddw_r2r(mm6, mm0); // tmp32
964 
965  paddw_r2r(mm2, mm5); // tmp11
966  psubw_r2r(mm6, mm7); // tmp33
967 
968  movq_r2m(mm0, *(dataptr+4));
969  paddw_r2r(mm3, mm2); // tmp12
970 
971  /* stage 4 */
972 
973  movq_r2m(mm7, *(dataptr+12));
974  movq_r2r(mm4, mm1); // copy of tmp10
975 
976  psubw_r2r(mm2, mm1); // tmp10 - tmp12
977  psllw_i2r(2, mm4); // m8 * 2^2
978 
979  movq_m2r(RTjpeg_C2mC6, mm0);
980  psllw_i2r(2, mm1);
981 
982  pmulhw_m2r(RTjpeg_C6, mm1); // z5
983  psllw_i2r(2, mm2);
984 
985  pmulhw_r2r(mm0, mm4); // z5
986 
987  /* stage 5 */
988 
989  pmulhw_m2r(RTjpeg_C2pC6, mm2);
990  psllw_i2r(2, mm5);
991 
992  pmulhw_m2r(RTjpeg_C4, mm5); // z3
993  movq_r2r(mm3, mm0); // copy tmp7
994 
995  movq_m2r(*(dataptr+1), mm7);
996  paddw_r2r(mm1, mm4); // z2
997 
998  paddw_r2r(mm1, mm2); // z4
999 
1000  paddw_r2r(mm5, mm0); // z11
1001  psubw_r2r(mm5, mm3); // z13
1002 
1003  /* stage 6 */
1004 
1005  movq_r2r(mm3, mm5); // copy z13
1006  psubw_r2r(mm4, mm3); // y3=z13 - z2
1007 
1008  paddw_r2r(mm4, mm5); // y5=z13 + z2
1009  movq_r2r(mm0, mm6); // copy z11
1010 
1011  movq_r2m(mm3, *(dataptr+6)); //save y3
1012  psubw_r2r(mm2, mm0); // y7=z11 - z4
1013 
1014  movq_r2m(mm5, *(dataptr+10)); //save y5
1015  paddw_r2r(mm2, mm6); // y1=z11 + z4
1016 
1017  movq_r2m(mm0, *(dataptr+14)); //save y7
1018 
1019  /************************************************
1020  * End of 1st 4 rows
1021  ************************************************/
1022 
1023  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1024  movq_r2r(mm7, mm0); // copy x0
1025 
1026  movq_r2m(mm6, *(dataptr+2)); //save y1
1027 
1028  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1029  movq_r2r(mm1, mm6); // copy x1
1030 
1031  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1032 
1033  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1034  movq_r2r(mm2, mm5); // copy x2
1035 
1036  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1037  movq_r2r(mm3, mm4); // copy x3
1038 
1039  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1040 
1041  movq_r2m(mm7, tmp7); // save tmp07
1042  movq_r2r(mm0, mm7); // copy tmp00
1043 
1044  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1045 
1046  /* stage 2, Even Part */
1047 
1048  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1049 
1050  movq_r2m(mm6, tmp6); // save tmp07
1051  movq_r2r(mm1, mm6); // copy tmp01
1052 
1053  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1054  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1055 
1056  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1057 
1058  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1059  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1060 
1061  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1062 
1063  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1064  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1065 
1066  /* stage 3, Even and stage 4 & 5 even */
1067 
1068  movq_m2r(tmp6, mm2); // load tmp6
1069  movq_r2r(mm0, mm3); // copy tmp10
1070 
1071  psllw_i2r(2, mm6); // shift z1
1072  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1073 
1074  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1075  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1076 
1077  movq_r2m(mm0, *(dataptr+1)); //save y0
1078  movq_r2r(mm7, mm0); // copy tmp13
1079 
1080  /* odd part */
1081 
1082  movq_r2m(mm3, *(dataptr+9)); //save y4
1083  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1084 
1085  movq_m2r(tmp7, mm3); // load tmp7
1086  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1087 
1088  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1089  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1090 
1091  movq_r2m(mm0, *(dataptr+5)); //save y2
1092  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1093 
1094  /* stage 4 */
1095 
1096  movq_r2m(mm7, *(dataptr+13)); //save y6
1097  movq_r2r(mm4, mm1); // copy tmp10
1098 
1099  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1100  psllw_i2r(2, mm4); // shift tmp10
1101 
1102  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1103  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1104 
1105  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1106  psllw_i2r(2, mm5); // prepare for multiply
1107 
1108  pmulhw_r2r(mm0, mm4); // multiply by converted real
1109 
1110  /* stage 5 */
1111 
1112  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1113  psllw_i2r(2, mm2); // prepare for multiply
1114 
1115  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1116  movq_r2r(mm3, mm0); // copy tmp7
1117 
1118  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1119  paddw_r2r(mm1, mm4); // z2
1120 
1121  paddw_r2r(mm5, mm0); // z11
1122  psubw_r2r(mm5, mm3); // z13
1123 
1124  /* stage 6 */
1125 
1126  movq_r2r(mm3, mm5); // copy z13
1127  paddw_r2r(mm1, mm2); // z4
1128 
1129  movq_r2r(mm0, mm6); // copy z11
1130  psubw_r2r(mm4, mm5); // y3
1131 
1132  paddw_r2r(mm2, mm6); // y1
1133  paddw_r2r(mm4, mm3); // y5
1134 
1135  movq_r2m(mm5, *(dataptr+7)); //save y3
1136 
1137  movq_r2m(mm6, *(dataptr+3)); //save y1
1138  psubw_r2r(mm2, mm0); // y7
1139 
1140 /************************************************************************************************
1141  Start of Transpose
1142 ************************************************************************************************/
1143 
1144  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1145  movq_r2r(mm7, mm5); // copy first line
1146 
1147  punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1148  movq_r2r(mm6, mm2); // copy third line
1149 
1150  punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1151  movq_r2r(mm7, mm1); // copy first intermediate result
1152 
1153  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1154 
1155  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1156 
1157  movq_r2m(mm7, *(dataptr+9)); // write result 1
1158  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1159 
1160  movq_r2m(mm1, *(dataptr+11)); // write result 2
1161  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1162 
1163  movq_r2r(mm5, mm1); // copy first intermediate result
1164  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1165 
1166  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1167  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1168 
1169  movq_r2m(mm5, *(dataptr+13)); // write result 3
1170 
1171  /****** last 4x4 done */
1172 
1173  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1174 
1175  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1176  movq_r2r(mm0, mm6); // copy first line
1177 
1178  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1179  movq_r2r(mm2, mm7); // copy third line
1180 
1181  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1182  movq_r2r(mm0, mm4); // copy first intermediate result
1183 
1184 
1185 
1186  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1187  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1188 
1189  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1190  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1191 
1192  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1193  movq_r2r(mm1, mm2); // copy first line
1194 
1195  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1196  movq_r2r(mm6, mm5); // copy first intermediate result
1197 
1198  movq_r2m(mm0, *(dataptr+8)); // write result 1
1199  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1200 
1201  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1202  movq_r2r(mm3, mm0); // copy third line
1203 
1204  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1205 
1206  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1207  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1208 
1209  punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1210  movq_r2r(mm1, mm4); // copy second intermediate result
1211 
1212  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1213  punpckldq_r2r(mm3, mm1); //
1214 
1215  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1216  movq_r2r(mm2, mm6); // copy second intermediate result
1217 
1218  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1219  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1220 
1221  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1222  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1223 
1224  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1225  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1226 
1227  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1228 
1229  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1230 
1231  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1232 
1233 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1234 
1235  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1236  movq_r2r(mm0, mm2); // copy first line
1237 
1238  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1239  movq_r2r(mm7, mm4); // copy third line
1240 
1241  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1242  movq_r2r(mm0, mm1); // copy first intermediate result
1243 
1244  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1245  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1246 
1247  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1248  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1249 
1250  movq_r2r(mm0, mm7); // write result 1
1251  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1252 
1253  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1254  movq_r2r(mm1, mm6); // write result 2
1255 
1256  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1257  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1258 
1259  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1260  movq_r2r(mm2, mm3); // copy first intermediate result
1261 
1262  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1263  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1264 
1265  movq_r2m(mm7, tmp7); // save tmp07
1266  movq_r2r(mm2, mm5); // write result 3
1267 
1268  movq_r2m(mm6, tmp6); // save tmp06
1269 
1270  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1271 
1272  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1273  movq_r2r(mm3, mm4); // write result 4
1274 
1275 /************************************************************************************************
1276  End of Transpose 2
1277 ************************************************************************************************/
1278 
1279  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1280  movq_r2r(mm0, mm7);
1281 
1282  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1283  movq_r2r(mm1, mm6);
1284 
1285  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1286  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1287 
1288  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1289  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1290 
1291  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1292  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1293 
1294  /* stage 3 */
1295 
1296  movq_m2r(tmp6, mm2);
1297  movq_r2r(mm0, mm3);
1298 
1299  psllw_i2r(2, mm6); // m8 * 2^2
1300  paddw_r2r(mm1, mm0);
1301 
1302  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1303  psubw_r2r(mm1, mm3);
1304 
1305  movq_r2m(mm0, *dataptr);
1306  movq_r2r(mm7, mm0);
1307 
1308  /* Odd part */
1309  movq_r2m(mm3, *(dataptr+8));
1310  paddw_r2r(mm5, mm4); // tmp10
1311 
1312  movq_m2r(tmp7, mm3);
1313  paddw_r2r(mm6, mm0); // tmp32
1314 
1315  paddw_r2r(mm2, mm5); // tmp11
1316  psubw_r2r(mm6, mm7); // tmp33
1317 
1318  movq_r2m(mm0, *(dataptr+4));
1319  paddw_r2r(mm3, mm2); // tmp12
1320 
1321  /* stage 4 */
1322  movq_r2m(mm7, *(dataptr+12));
1323  movq_r2r(mm4, mm1); // copy of tmp10
1324 
1325  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1326  psllw_i2r(2, mm4); // m8 * 2^2
1327 
1328  movq_m2r(RTjpeg_C2mC6, mm0);
1329  psllw_i2r(2, mm1);
1330 
1331  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1332  psllw_i2r(2, mm2);
1333 
1334  pmulhw_r2r(mm0, mm4); // z5
1335 
1336  /* stage 5 */
1337 
1338  pmulhw_m2r(RTjpeg_C2pC6, mm2);
1339  psllw_i2r(2, mm5);
1340 
1341  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1342  movq_r2r(mm3, mm0); // copy tmp7
1343 
1344  movq_m2r(*(dataptr+1), mm7);
1345  paddw_r2r(mm1, mm4); // z2
1346 
1347  paddw_r2r(mm1, mm2); // z4
1348 
1349  paddw_r2r(mm5, mm0); // z11
1350  psubw_r2r(mm5, mm3); // z13
1351 
1352  /* stage 6 */
1353 
1354  movq_r2r(mm3, mm5); // copy z13
1355  psubw_r2r(mm4, mm3); // y3=z13 - z2
1356 
1357  paddw_r2r(mm4, mm5); // y5=z13 + z2
1358  movq_r2r(mm0, mm6); // copy z11
1359 
1360  movq_r2m(mm3, *(dataptr+6)); //save y3
1361  psubw_r2r(mm2, mm0); // y7=z11 - z4
1362 
1363  movq_r2m(mm5, *(dataptr+10)); //save y5
1364  paddw_r2r(mm2, mm6); // y1=z11 + z4
1365 
1366  movq_r2m(mm0, *(dataptr+14)); //save y7
1367 
1368  /************************************************
1369  * End of 1st 4 rows
1370  ************************************************/
1371 
1372  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1373  movq_r2r(mm7, mm0); // copy x0
1374 
1375  movq_r2m(mm6, *(dataptr+2)); //save y1
1376 
1377  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1378  movq_r2r(mm1, mm6); // copy x1
1379 
1380  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1381 
1382  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1383  movq_r2r(mm2, mm5); // copy x2
1384 
1385  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1386  movq_r2r(mm3, mm4); // copy x3
1387 
1388  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1389 
1390  movq_r2m(mm7, tmp7); // save tmp07
1391  movq_r2r(mm0, mm7); // copy tmp00
1392 
1393  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1394 
1395  /* stage 2, Even Part */
1396 
1397  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1398 
1399  movq_r2m(mm6, tmp6); // save tmp06
1400  movq_r2r(mm1, mm6); // copy tmp01
1401 
1402  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1403  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1404 
1405  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1406 
1407  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1408  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1409 
1410  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1411 
1412  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1413  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1414 
1415  /* stage 3, Even and stage 4 & 5 even */
1416 
1417  movq_m2r(tmp6, mm2); // load tmp6
1418  movq_r2r(mm0, mm3); // copy tmp10
1419 
1420  psllw_i2r(2, mm6); // shift z1
1421  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1422 
1423  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1424  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1425 
1426  movq_r2m(mm0, *(dataptr+1)); //save y0
1427  movq_r2r(mm7, mm0); // copy tmp13
1428 
1429  /* odd part */
1430 
1431  movq_r2m(mm3, *(dataptr+9)); //save y4
1432  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1433 
1434  movq_m2r(tmp7, mm3); // load tmp7
1435  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1436 
1437  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1438  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1439 
1440  movq_r2m(mm0, *(dataptr+5)); //save y2
1441  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1442 
1443  /* stage 4 */
1444 
1445  movq_r2m(mm7, *(dataptr+13)); //save y6
1446  movq_r2r(mm4, mm1); // copy tmp10
1447 
1448  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1449  psllw_i2r(2, mm4); // shift tmp10
1450 
1451  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1452  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1453 
1454  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1455  psllw_i2r(2, mm5); // prepare for multiply
1456 
1457  pmulhw_r2r(mm0, mm4); // multiply by converted real
1458 
1459  /* stage 5 */
1460 
1461  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1462  psllw_i2r(2, mm2); // prepare for multiply
1463 
1464  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1465  movq_r2r(mm3, mm0); // copy tmp7
1466 
1467  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1468  paddw_r2r(mm1, mm4); // z2
1469 
1470  paddw_r2r(mm5, mm0); // z11
1471  psubw_r2r(mm5, mm3); // z13
1472 
1473  /* stage 6 */
1474 
1475  movq_r2r(mm3, mm5); // copy z13
1476  paddw_r2r(mm1, mm2); // z4
1477 
1478  movq_r2r(mm0, mm6); // copy z11
1479  psubw_r2r(mm4, mm5); // y3
1480 
1481  paddw_r2r(mm2, mm6); // y1
1482  paddw_r2r(mm4, mm3); // y5
1483 
1484  movq_r2m(mm5, *(dataptr+7)); //save y3
1485  psubw_r2r(mm2, mm0); // y7=z11 - z4
1486 
1487  movq_r2m(mm3, *(dataptr+11)); //save y5
1488 
1489  movq_r2m(mm6, *(dataptr+3)); //save y1
1490 
1491  movq_r2m(mm0, *(dataptr+15)); //save y7
1492 
1493 
1494 #endif
1495 }
1496 
/*
 * Fixed-point multipliers with 8 fractional bits: each value is the real
 * constant named in the trailing comment times 256, rounded to nearest.
 * They pair with MULTIPLY() below, which shifts the product back by 8.
 */
#define FIX_1_082392200 ((int32_t) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((int32_t) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((int32_t) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((int32_t) 669) /* FIX(2.613125930) */

/* Divide by 8 with rounding (add half of 8, shift right by 3), as int16_t. */
#define DESCALE(x) ((int16_t)(((x) + 4) >> 3))

/* clip yuv to 16..235 (should be 16..240 for cr/cb but ... */

/*
 * The whole expansion is parenthesised so that RL() can be used as an operand
 * of a larger expression; without the outer parentheses `RL(a) + 1` would
 * bind the `+ 1` to the last branch of the conditional only.
 * Note: x is evaluated more than once, so it must not have side effects.
 */
#define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x)))

/*
 * Multiply by an 8-bit-fraction constant: the product is truncated to
 * int32_t, 128 (one half in that scale) is added for rounding, and the
 * result is shifted right by 8.
 */
#define MULTIPLY(var, coef) ((((int32_t) ((var) * (coef))) + 128) >> 8)
// NOTE(review): the signature line of this function is absent from this
// listing (the embedded numbering jumps from 1507 to 1510). Presumably this
// is the RTjpeg IDCT table initialiser -- confirm against the upstream file.
//
// Rescales all 64 entries of m_liqt and m_ciqt in place by the matching
// entry of RTjpeg_aan_tab.
1510 {
1511  for(int i = 0; i < 64; i++)
1512  {
// RTjpeg_aan_tab holds factors scaled by 2^32 (its first entry, 4294967296,
// is 1.0), so the product is formed in 64 bits and ">> 32" keeps the
// integer part of the scaled table entry.
1513  m_liqt[i] = ((uint64_t)m_liqt[i] * RTjpeg_aan_tab[i]) >> 32;
1514  m_ciqt[i] = ((uint64_t)m_ciqt[i] * RTjpeg_aan_tab[i]) >> 32;
1515  }
1516 }
1517 
1518 void RTjpeg::Idct(uint8_t *odata, int16_t *data, int rskip)
1519 {
1520 #ifdef MMX
1521 
1522 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1523 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1524 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1525 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1526 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
1527 
1528  auto *wsptr = (mmx_t *)m_ws;
1529  auto *dataptr = (mmx_t *)odata;
1530  auto *idata = (mmx_t *)data;
1531 
1532  rskip = rskip>>3;
1533 /*
1534  * Perform inverse DCT on one block of coefficients.
1535  */
1536 
1537  /* Odd part */
1538 
1539  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1540 
1541  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1542 
1543  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1544 
1545  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1546 
1547  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1548 
1549  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1550 
1551  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1552 
1553  psllw_i2r(2, mm2); // shift z10
1554  movq_r2r(mm2, mm0); // copy z10
1555 
1556  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930)), first term of tmp12
1557  movq_r2r(mm3, mm5); // copy tmp4
1558 
1559  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1560  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1561 
1562  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1563  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1564 
1565  psubw_r2r(mm1, mm6); // z11-z13
1566  psllw_i2r(2, mm5); // shift z12
1567 
1568  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1569  movq_r2r(mm5, mm7); // copy z12
1570 
1571  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1572  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1573 
1574  //ok
1575 
1576  /* Even part */
1577  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z12, FIX_1_847759065), second term of tmp12 /* 2*c2 */
1578  psllw_i2r(2, mm6);
1579 
1580  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1581 
1582  paddw_r2r(mm5, mm0); // tmp10
1583 
1584  paddw_r2r(mm7, mm2); // tmp12
1585 
1586  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1587  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1588 
1589  movq_r2r(mm1, mm5); // copy tmp1
1590  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1591 
1592  psubw_r2r(mm4, mm5); // tmp1-tmp3
1593  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1594 
1595  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1596  psllw_i2r(2, mm5); // shift tmp1-tmp3
1597 
1598  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1599 
1600  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1601  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1602 
1603  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1604 
1605  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1606 
1607  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1608  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1609 
1610  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1611  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1612 
1613  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1614  movq_r2r(mm1, mm5); // copy tmp11
1615 
1616  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1617  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1618 
1619  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1620 
1621  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1622  movq_r2r(mm7, mm0); // copy tmp0
1623 
1624  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1625  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1626 
1627  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1628 
1629  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1630  movq_r2r(mm1, mm3); // copy tmp1
1631 
1632  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1633  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1634 
1635  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1636 
1637  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1638  movq_r2r(mm4, mm1); // copy tmp3
1639 
1640  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1641 
1642  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1643 
1644  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1645 
1646  movq_r2m(mm4, *(wsptr+8));
1647  movq_r2r(mm5, mm7); // copy tmp2
1648 
1649  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1650 
1651  movq_r2m(mm1, *(wsptr+6));
1652  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1653 
1654  movq_r2m(mm5, *(wsptr+4));
1655 
1656  movq_r2m(mm7, *(wsptr+10));
1657 
1658  //ok
1659 
1660 
1661 /*****************************************************************/
1662 
1663  idata++;
1664  wsptr++;
1665 
1666 /*****************************************************************/
1667 
1668  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1669 
1670  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1671 
1672  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1673  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1674 
1675  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1676  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1677 
1678  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1679 
1680  psllw_i2r(2, mm2); // shift z10
1681  movq_r2r(mm2, mm0); // copy z10
1682 
1683  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY(z10, (FIX_1_847759065 - FIX_2_613125930)), first term of tmp12
1684  movq_r2r(mm3, mm5); // copy tmp4
1685 
1686  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1687  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1688 
1689  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1690  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1691 
1692  psubw_r2r(mm1, mm6); // z11-z13
1693  psllw_i2r(2, mm5); // shift z12
1694 
1695  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1696  movq_r2r(mm5, mm7); // copy z12
1697 
1698  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1699  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1700 
1701  //ok
1702 
1703  /* Even part */
1704  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z12, FIX_1_847759065), second term of tmp12 /* 2*c2 */
1705  psllw_i2r(2, mm6);
1706 
1707  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1708 
1709  paddw_r2r(mm5, mm0); // tmp10
1710 
1711  paddw_r2r(mm7, mm2); // tmp12
1712 
1713  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1714  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1715 
1716  movq_r2r(mm1, mm5); // copy tmp1
1717  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1718 
1719  psubw_r2r(mm4, mm5); // tmp1-tmp3
1720  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1721 
1722  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1723  psllw_i2r(2, mm5); // shift tmp1-tmp3
1724 
1725  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1726  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1727 
1728  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1729 
1730  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1731 
1732  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1733 
1734  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1735  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1736 
1737  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1738  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1739 
1740  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1741  movq_r2r(mm1, mm5); // copy tmp11
1742 
1743  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1744  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1745 
1746  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1747 
1748  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1749  movq_r2r(mm7, mm0); // copy tmp0
1750 
1751  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1752  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1753 
1754  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1755 
1756  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1757  movq_r2r(mm1, mm3); // copy tmp1
1758 
1759  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1760  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1761 
1762  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1763 
1764  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1765  movq_r2r(mm4, mm1); // copy tmp3
1766 
1767  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1768 
1769  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1770 
1771  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1772 
1773  movq_r2m(mm4, *(wsptr+8));
1774  movq_r2r(mm5, mm7); // copy tmp2
1775 
1776  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1777 
1778  movq_r2m(mm1, *(wsptr+6));
1779  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1780 
1781  movq_r2m(mm5, *(wsptr+4));
1782 
1783  movq_r2m(mm7, *(wsptr+10));
1784 
1785 /*****************************************************************/
1786 
1787  /* Pass 2: process rows from work array, store into output array. */
1788  /* Note that we must descale the results by a factor of 8 == 2**3, */
1789  /* and also undo the PASS1_BITS scaling. */
1790 
1791 /*****************************************************************/
1792  /* Even part */
1793 
1794  wsptr--;
1795 
1796 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1797 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1798 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1799 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1800  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1801 
1802  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1803  movq_r2r(mm0, mm2);
1804 
1805  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1806  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1807 
1808  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1809  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1810 
1811  movq_r2r(mm0, mm6);
1812  movq_r2r(mm3, mm5);
1813 
1814  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1815  movq_r2r(mm2, mm1);
1816 
1817  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1818  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1819 
1820  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1821  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1822 
1823  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1824  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1825 
1826  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1827  movq_r2r(mm3, mm4);
1828 
1829  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1830  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1831 
1832  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1833  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1834 
1835 
1836  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1837  movq_r2r(mm6, mm2);
1838 
1839  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1840  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1841 
1842  movq_r2r(mm3, mm5);
1843  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1844 
1845  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1846  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1847 
1848  movq_r2r(mm4, mm7);
1849  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1850 
1851  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1852 
1853  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1854 
1855  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1856  movq_r2r(mm1, mm6);
1857 
1858  //ok
1859 
1860 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1861 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1862 
1863 
1864  movq_r2r(mm0, mm2);
1865  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1866 
1867  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1868  psllw_i2r(2, mm6);
1869 
1870  pmulhw_m2r(s_fix141, mm6);
1871  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1872 
1873  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1874  movq_r2r(mm0, mm7);
1875 
1876 // tmp0 = tmp10 + tmp13;
1877 // tmp3 = tmp10 - tmp13;
1878  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1879  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1880 
1881 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1882  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1883 // tmp1 = tmp11 + tmp12;
1884 // tmp2 = tmp11 - tmp12;
1885  movq_r2r(mm1, mm5);
1886 
1887  //OK
1888 
1889  /* Odd part */
1890 
1891 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1892 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1893 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1894 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1895  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1896  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1897 
1898  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1899  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1900 
1901  movq_r2r(mm3, mm6);
1902  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1903 
1904  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1905  movq_r2r(mm3, mm2);
1906 
1907 //Save tmp0 and tmp1 in wsptr
1908  movq_r2m(mm0, *(wsptr)); // save tmp0
1909  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1910 
1911 
1912 //Continue with z10 --- z13
1913  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1914  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1915 
1916  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1917  movq_r2r(mm6, mm4);
1918 
1919  movq_r2m(mm1, *(wsptr+1)); // save tmp1
1920  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1921 
1922  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1923  movq_r2r(mm6, mm1);
1924 
1925 //Save tmp2 and tmp3 in wsptr
1926  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1927  movq_r2r(mm2, mm4);
1928 
1929 //Continue with z10 --- z13
1930  movq_r2m(mm5, *(wsptr+2)); // save tmp2
1931  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1932 
1933  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1934  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1935 
1936  movq_r2r(mm3, mm0);
1937  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1938 
1939  movq_r2m(mm7, *(wsptr+3)); // save tmp3
1940  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1941 
1942  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1943  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1944 
1945  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1946  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1947 
1948  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
1949  movq_r2r(mm6, mm4);
1950 
1951  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
1952  movq_r2r(mm1, mm5);
1953 
1954  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
1955  movq_r2r(mm6, mm2);
1956 
1957  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
1958  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
1959 
1960  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
1961  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
1962 
1963  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
1964  movq_r2r(mm1, mm7);
1965 
1966  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
1967  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
1968 
1969  movq_r2r(mm6, mm5);
1970  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
1971 
1972  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
1973  movq_r2r(mm2, mm4);
1974 
1975  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
1976 
1977  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
1978 
1979  punpckhdq_r2r(mm6, mm4);
1980 
1981  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1982  movq_r2r(mm0, mm5);
1983 
1984  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1985 
1986  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1987  movq_r2r(mm3, mm4);
1988 
1989  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1990  movq_r2r(mm5, mm1);
1991 
1992  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
1993 // tmp7 = z11 + z13; /* phase 5 */
1994 // tmp8 = z11 - z13; /* phase 5 */
1995  psubw_r2r(mm4, mm1); // tmp8
1996 
1997  paddw_r2r(mm4, mm5); // tmp7
1998 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
1999  psllw_i2r(2, mm1);
2000 
2001  psllw_i2r(2, mm0);
2002 
2003  pmulhw_m2r(s_fix141, mm1); // tmp21
2004 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2005 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2006  psllw_i2r(2, mm3);
2007  movq_r2r(mm0, mm7);
2008 
2009  pmulhw_m2r(s_fixN184, mm7);
2010  movq_r2r(mm3, mm6);
2011 
2012  movq_m2r(*(wsptr), mm2); // tmp0,final1
2013 
2014  pmulhw_m2r(s_fix108n184, mm6);
2015 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2016 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2017  movq_r2r(mm2, mm4); // final1
2018 
2019  pmulhw_m2r(s_fix184n261, mm0);
2020  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2021 
2022  pmulhw_m2r(s_fix184, mm3);
2023  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2024 
2025 // tmp6 = tmp22 - tmp7; /* phase 2 */
2026  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2027 
2028  paddw_r2r(mm6, mm7); // tmp20
2029  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2030 
2031  paddw_r2r(mm0, mm3); // tmp22
2032 
2033 // tmp5 = tmp21 - tmp6;
2034  psubw_r2r(mm5, mm3); // tmp6
2035 
2036 // tmp4 = tmp20 + tmp5;
2037  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2038  psubw_r2r(mm3, mm1); // tmp5
2039 
2040  movq_r2r(mm0, mm6); // final2
2041  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2042 
2043  /* Final output stage: scale down by a factor of 8 and range-limit */
2044 
2045 
2046 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2047 // & RANGE_MASK];
2048 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2049 // & RANGE_MASK]; final1
2050 
2051 
2052 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2053 // & RANGE_MASK];
2054 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2055 // & RANGE_MASK]; final2
2056  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2057  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2058 
2059  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2060 
2061  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2062 
2063  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2064  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2065 
2066 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2067 // & RANGE_MASK];
2068 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2069 // & RANGE_MASK]; final3
2070  paddw_r2r(mm1, mm7); // tmp4
2071  movq_r2r(mm5, mm3);
2072 
2073  paddw_r2r(mm1, mm5); // tmp2+tmp5
2074  psubw_r2r(mm1, mm3); // tmp2-tmp5
2075 
2076  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2077 
2078  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2079  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2080 
2081 
2082 
2083 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2084 // & RANGE_MASK];
2085 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2086 // & RANGE_MASK]; final4
2087  movq_r2r(mm4, mm6);
2088  paddw_r2r(mm7, mm4); // tmp3+tmp4
2089 
2090  psubw_r2r(mm7, mm6); // tmp3-tmp4
2091  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2092 
2093  // mov ecx, [dataptr]
2094 
2095  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2096 
2097  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2098 
2099  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2100  movq_r2r(mm2, mm4);
2101 
2102  movq_r2r(mm5, mm7);
2103  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2104 
2105  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2106  movq_r2r(mm2, mm1);
2107 
2108  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2109 
2110  // add dataptr, 4
2111 
2112  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2113 
2114  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2115 
2116  // add ecx, output_col
2117 
2118  movq_r2r(mm7, mm6);
2119  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2120 
2121  movq_r2r(mm2, mm0);
2122  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2123 
2124  // mov idata, [dataptr]
2125 
2126  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2127 
2128  // add dataptr, 4
2129 
2130  movq_r2r(mm1, mm3);
2131 
2132  // add idata, output_col
2133 
2134  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2135 
2136  movq_r2m(mm2, *(dataptr));
2137 
2138  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2139 
2140  dataptr += rskip;
2141  movq_r2m(mm0, *(dataptr));
2142 
2143  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2144  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2145 
2146  dataptr += rskip;
2147  movq_r2m(mm1, *(dataptr));
2148 
2149  dataptr += rskip;
2150  movq_r2m(mm3, *(dataptr));
2151 
2152 /*******************************************************************/
2153 
2154  wsptr += 8;
2155 
2156 /*******************************************************************/
2157 
2158 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2159 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2160 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2161 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2162  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2163 
2164  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2165  movq_r2r(mm0, mm2);
2166 
2167  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2168  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2169 
2170  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2171  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2172 
2173  movq_r2r(mm0, mm6);
2174  movq_r2r(mm3, mm5);
2175 
2176  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2177  movq_r2r(mm2, mm1);
2178 
2179  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2180  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2181 
2182  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2183  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2184 
2185  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2186  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2187 
2188  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2189  movq_r2r(mm3, mm4);
2190 
2191  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2192  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2193 
2194  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2195  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2196 
2197  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2198  movq_r2r(mm6, mm2);
2199 
2200  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2201  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2202 
2203  movq_r2r(mm3, mm5);
2204  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2205 
2206  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2207  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2208 
2209  movq_r2r(mm4, mm7);
2210  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2211 
2212  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2213 
2214  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2215 
2216  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2217  movq_r2r(mm1, mm6);
2218 
2219  //OK
2220 
2221 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2222 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2223 
2224  movq_r2r(mm0, mm2);
2225  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2226 
2227  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2228  psllw_i2r(2, mm6);
2229 
2230  pmulhw_m2r(s_fix141, mm6);
2231  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2232 
2233  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2234  movq_r2r(mm0, mm7);
2235 
2236 // tmp0 = tmp10 + tmp13;
2237 // tmp3 = tmp10 - tmp13;
2238  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2239  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2240 
2241 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2242  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2243 // tmp1 = tmp11 + tmp12;
2244 // tmp2 = tmp11 - tmp12;
2245  movq_r2r(mm1, mm5);
2246 
2247  //OK
2248 
2249 
2250  /* Odd part */
2251 
2252 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2253 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2254 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2255 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2256  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2257  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2258 
2259  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2260  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2261 
2262  movq_r2r(mm3, mm6);
2263  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2264 
2265  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2266  movq_r2r(mm3, mm2);
2267 
2268 //Save tmp0 and tmp1 in wsptr
2269  movq_r2m(mm0, *(wsptr)); // save tmp0
2270  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2271 
2272 
2273 //Continue with z10 --- z13
2274  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2275  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2276 
2277  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2278  movq_r2r(mm6, mm4);
2279 
2280  movq_r2m(mm1, *(wsptr+1)); // save tmp1
2281  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2282 
2283  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2284  movq_r2r(mm6, mm1);
2285 
2286 //Save tmp2 and tmp3 in wsptr
2287  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2288  movq_r2r(mm2, mm4);
2289 
2290 //Continue with z10 --- z13
2291  movq_r2m(mm5, *(wsptr+2)); // save tmp2
2292  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2293 
2294  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2295  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2296 
2297  movq_r2r(mm3, mm0);
2298  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2299 
2300  movq_r2m(mm7, *(wsptr+3)); // save tmp3
2301  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2302 
2303  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2304  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2305 
2306  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2307  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2308 
2309  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2310  movq_r2r(mm6, mm4);
2311 
2312  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2313  movq_r2r(mm1, mm5);
2314 
2315  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2316  movq_r2r(mm6, mm2);
2317 
2318  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2319  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2320 
2321  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2322  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2323 
2324  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2325  movq_r2r(mm1, mm7);
2326 
2327  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2328  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2329 
2330  movq_r2r(mm6, mm5);
2331  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2332 
2333  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2334  movq_r2r(mm2, mm4);
2335 
2336  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2337 
2338  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2339 
2340  punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2341 
2342  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2343  movq_r2r(mm0, mm5);
2344 
2345  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2346 
2347  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2348  movq_r2r(mm3, mm4);
2349 
2350  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2351  movq_r2r(mm5, mm1);
2352 
2353  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2354 // tmp7 = z11 + z13; /* phase 5 */
2355 // tmp8 = z11 - z13; /* phase 5 */
2356  psubw_r2r(mm4, mm1); // tmp8
2357 
2358  paddw_r2r(mm4, mm5); // tmp7
2359 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2360  psllw_i2r(2, mm1);
2361 
2362  psllw_i2r(2, mm0);
2363 
2364  pmulhw_m2r(s_fix141, mm1); // tmp21
2365 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2366 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2367  psllw_i2r(2, mm3);
2368  movq_r2r(mm0, mm7);
2369 
2370  pmulhw_m2r(s_fixN184, mm7);
2371  movq_r2r(mm3, mm6);
2372 
2373  movq_m2r(*(wsptr), mm2); // tmp0,final1
2374 
2375  pmulhw_m2r(s_fix108n184, mm6);
2376 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2377 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2378  movq_r2r(mm2, mm4); // final1
2379 
2380  pmulhw_m2r(s_fix184n261, mm0);
2381  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2382 
2383  pmulhw_m2r(s_fix184, mm3);
2384  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2385 
2386 // tmp6 = tmp22 - tmp7; /* phase 2 */
2387  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2388 
2389  paddw_r2r(mm6, mm7); // tmp20
2390  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2391 
2392  paddw_r2r(mm0, mm3); // tmp22
2393 
2394 // tmp5 = tmp21 - tmp6;
2395  psubw_r2r(mm5, mm3); // tmp6
2396 
2397 // tmp4 = tmp20 + tmp5;
2398  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2399  psubw_r2r(mm3, mm1); // tmp5
2400 
2401  movq_r2r(mm0, mm6); // final2
2402  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2403 
2404  /* Final output stage: scale down by a factor of 8 and range-limit */
2405 
2406 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2407 // & RANGE_MASK];
2408 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2409 // & RANGE_MASK]; final1
2410 
2411 
2412 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2413 // & RANGE_MASK];
2414 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2415 // & RANGE_MASK]; final2
2416  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2417  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2418 
2419  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2420 
2421  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2422 
2423  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2424  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2425 
2426 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2427 // & RANGE_MASK];
2428 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2429 // & RANGE_MASK]; final3
2430  paddw_r2r(mm1, mm7); // tmp4
2431  movq_r2r(mm5, mm3);
2432 
2433  paddw_r2r(mm1, mm5); // tmp2+tmp5
2434  psubw_r2r(mm1, mm3); // tmp2-tmp5
2435 
2436  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2437 
2438  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2439  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2440 
2441 
2442 
2443 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2444 // & RANGE_MASK];
2445 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2446 // & RANGE_MASK]; final4
2447  movq_r2r(mm4, mm6);
2448  paddw_r2r(mm7, mm4); // tmp3+tmp4
2449 
2450  psubw_r2r(mm7, mm6); // tmp3-tmp4
2451  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2452 
2453  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2454 
2455  /*
2456  movq_r2m(mm4, *dummy);
2457  fprintf(stderr, "3-4 %016llx\n", dummy);
2458  movq_r2m(mm4, *dummy);
2459  fprintf(stderr, "3+4 %016llx\n", dummy);
2460  */
2461 
2462 
2463  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2464 
2465  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2466  movq_r2r(mm2, mm4);
2467 
2468  movq_r2r(mm5, mm7);
2469  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2470 
2471  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2472  movq_r2r(mm2, mm1);
2473 
2474  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2475 
2476  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2477 
2478  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2479 
2480  movq_r2r(mm7, mm6);
2481  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2482 
2483  movq_r2r(mm2, mm0);
2484  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2485 
2486  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2487 
2488  movq_r2r(mm1, mm3);
2489 
2490  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2491 
2492  dataptr += rskip;
2493  movq_r2m(mm2, *(dataptr));
2494 
2495  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2496 
2497  dataptr += rskip;
2498  movq_r2m(mm0, *(dataptr));
2499 
2500  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2501 
2502  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2503 
2504  dataptr += rskip;
2505  movq_r2m(mm1, *(dataptr));
2506 
2507  dataptr += rskip;
2508  movq_r2m(mm3, *(dataptr));
2509 
2510 #else
2511  int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2512  int32_t tmp10, tmp11, tmp12, tmp13;
2513  int32_t z5, z10, z11, z12, z13;
2514  int16_t *inptr;
2515  int32_t *wsptr;
2516  uint8_t *outptr;
2517  int ctr;
2518  int32_t dcval;
2519 
2520  inptr = data;
2521  wsptr = m_ws;
2522  for (ctr = 8; ctr > 0; ctr--) {
2523 
2524  if ((inptr[8] | inptr[16] | inptr[24] |
2525  inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2526  dcval = inptr[0];
2527  wsptr[0] = dcval;
2528  wsptr[8] = dcval;
2529  wsptr[16] = dcval;
2530  wsptr[24] = dcval;
2531  wsptr[32] = dcval;
2532  wsptr[40] = dcval;
2533  wsptr[48] = dcval;
2534  wsptr[56] = dcval;
2535 
2536  inptr++;
2537  wsptr++;
2538  continue;
2539  }
2540 
2541  tmp0 = inptr[0];
2542  tmp1 = inptr[16];
2543  tmp2 = inptr[32];
2544  tmp3 = inptr[48];
2545 
2546  tmp10 = tmp0 + tmp2;
2547  tmp11 = tmp0 - tmp2;
2548 
2549  tmp13 = tmp1 + tmp3;
2550  tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2551 
2552  tmp0 = tmp10 + tmp13;
2553  tmp3 = tmp10 - tmp13;
2554  tmp1 = tmp11 + tmp12;
2555  tmp2 = tmp11 - tmp12;
2556 
2557  tmp4 = inptr[8];
2558  tmp5 = inptr[24];
2559  tmp6 = inptr[40];
2560  tmp7 = inptr[56];
2561 
2562  z13 = tmp6 + tmp5;
2563  z10 = tmp6 - tmp5;
2564  z11 = tmp4 + tmp7;
2565  z12 = tmp4 - tmp7;
2566 
2567  tmp7 = z11 + z13;
2568  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2569 
2570  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2571  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2572  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2573 
2574  tmp6 = tmp12 - tmp7;
2575  tmp5 = tmp11 - tmp6;
2576  tmp4 = tmp10 + tmp5;
2577 
2578  wsptr[0] = (int32_t) (tmp0 + tmp7);
2579  wsptr[56] = (int32_t) (tmp0 - tmp7);
2580  wsptr[8] = (int32_t) (tmp1 + tmp6);
2581  wsptr[48] = (int32_t) (tmp1 - tmp6);
2582  wsptr[16] = (int32_t) (tmp2 + tmp5);
2583  wsptr[40] = (int32_t) (tmp2 - tmp5);
2584  wsptr[32] = (int32_t) (tmp3 + tmp4);
2585  wsptr[24] = (int32_t) (tmp3 - tmp4);
2586 
2587  inptr++;
2588  wsptr++;
2589  }
2590 
2591  wsptr = m_ws;
2592  for (ctr = 0; ctr < 8; ctr++) {
2593  outptr = &(odata[ctr*rskip]);
2594 
2595  tmp10 = wsptr[0] + wsptr[4];
2596  tmp11 = wsptr[0] - wsptr[4];
2597 
2598  tmp13 = wsptr[2] + wsptr[6];
2599  tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2600 
2601  tmp0 = tmp10 + tmp13;
2602  tmp3 = tmp10 - tmp13;
2603  tmp1 = tmp11 + tmp12;
2604  tmp2 = tmp11 - tmp12;
2605 
2606  z13 = wsptr[5] + wsptr[3];
2607  z10 = wsptr[5] - wsptr[3];
2608  z11 = wsptr[1] + wsptr[7];
2609  z12 = wsptr[1] - wsptr[7];
2610 
2611  tmp7 = z11 + z13;
2612  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2613 
2614  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2615  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2616  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2617 
2618  tmp6 = tmp12 - tmp7;
2619  tmp5 = tmp11 - tmp6;
2620  tmp4 = tmp10 + tmp5;
2621 
2622  outptr[0] = RL(DESCALE(tmp0 + tmp7));
2623  outptr[7] = RL(DESCALE(tmp0 - tmp7));
2624  outptr[1] = RL(DESCALE(tmp1 + tmp6));
2625  outptr[6] = RL(DESCALE(tmp1 - tmp6));
2626  outptr[2] = RL(DESCALE(tmp2 + tmp5));
2627  outptr[5] = RL(DESCALE(tmp2 - tmp5));
2628  outptr[4] = RL(DESCALE(tmp3 + tmp4));
2629  outptr[3] = RL(DESCALE(tmp3 - tmp4));
2630 
2631  wsptr += 8;
2632  }
2633 #endif
2634 }
2635 
2636 inline void RTjpeg::CalcTbls(void)
2637 {
2638  uint64_t qual = (uint64_t)m_q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */
2639 
2640  for(int i = 0; i < 64; i++)
2641  {
2642  m_lqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2643  if (m_lqt[i] == 0)
2644  m_lqt[i]=1;
2645 
2646  m_cqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2647  if (m_cqt[i] == 0)
2648  m_cqt[i]=1;
2649 
2650  m_liqt[i] = (1<<16) / (m_lqt[i]<<3);
2651  m_ciqt[i] = (1<<16) / (m_cqt[i]<<3);
2652  m_lqt[i] = ((1<<16) / m_liqt[i])>>3;
2653  m_cqt[i] = ((1<<16) / m_ciqt[i])>>3;
2654  }
2655 
2656  m_lB8 = 0;
2657  while (m_liqt[RTjpeg_ZZ[++m_lB8]] <= 8)
2658  ;
2659  m_lB8--;
2660  m_cB8 = 0;
2661 
2662  while (m_ciqt[RTjpeg_ZZ[++m_cB8]] <= 8)
2663  ;
2664  m_cB8--;
2665 }
2666 
2667 int RTjpeg::SetQuality(int *quality)
2668 {
2669  if (*quality < 1)
2670  *quality = 1;
2671  if (*quality > 255)
2672  *quality = 255;
2673 
2674  m_q = *quality;
2675 
2676  CalcTbls();
2677  DctInit();
2678  IdctInit();
2679  QuantInit();
2680 
2681  return 0;
2682 }
2683 
2684 int RTjpeg::SetFormat(const int *fmt)
2685 {
2686  m_f = *fmt;
2687  return 0;
2688 }
2689 
2690 int RTjpeg::SetSize(const int *w, const int *h)
2691 {
2692  if ((*w < 0) || (*w > 65535))
2693  return -1;
2694  if ((*h < 0) || (*h > 65535))
2695  return -1;
2696 
2697  m_width = *w;
2698  m_height = *h;
2699  m_yWidth = m_width>>3;
2700  m_ySize = m_width * m_height;
2701  m_cWidth = m_width>>4;
2702  m_cSize = (m_width>>1) * m_height;
2703 
2704  if (m_keyRate > 0)
2705  {
2706  if (m_old)
2707  delete [] m_oldStart;
2708  m_oldStart = new int16_t[((4*m_width*m_height)+32)];
2709 
2710  auto tmp = (unsigned long)m_oldStart;
2711  tmp += 32;
2712  tmp = tmp>>5;
2713 
2714  m_old = (int16_t *)(tmp<<5);
2715  if (!m_old)
2716  {
2717  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2718  return -1;
2719  }
2720  memset(m_old, 0, ((4*m_width*m_height)));
2721  }
2722  return 0;
2723 }
2724 
2725 int RTjpeg::SetIntra(int *key, int *lm, int *cm)
2726 {
2727  if (*key < 0)
2728  *key = 0;
2729  if (*key > 255)
2730  *key = 255;
2731  m_keyRate = *key;
2732 
2733  if (*lm < 0)
2734  *lm = 0;
2735  if (*lm > 16)
2736  *lm = 16;
2737  if (*cm < 0)
2738  *cm = 0;
2739  if (*cm > 16)
2740  *cm = 16;
2741 
2742 #ifdef MMX
2743  m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2744  m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
2745 #else
2746  m_lMask = *lm;
2747  m_cMask = *cm;
2748 #endif
2749 
2750  if (m_old)
2751  delete [] m_oldStart;
2752  m_oldStart = new int16_t[((4*m_width*m_height)+32)];
2753  auto tmp = (unsigned long)m_oldStart;
2754  tmp += 32;
2755  tmp = tmp >> 5;
2756  m_old = (int16_t *)(tmp << 5);
2757  if (!m_old)
2758  {
2759  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2760  return -1;
2761  }
2762  memset(m_old, 0, ((4*m_width*m_height)));
2763 
2764  return 0;
2765 }
2766 
2768 {
2769 #ifdef MMX
2770  RTjpeg_ones.q = 0x0001000100010001LL;
2771  RTjpeg_half.q = 0x7fff7fff7fff7fffLL;
2772  RTjpeg_C4.q = 0x2D412D412D412D41LL;
2773  RTjpeg_C6.q = 0x187E187E187E187ELL;
2774  RTjpeg_C2mC6.q= 0x22A322A322A322A3LL;
2775  RTjpeg_C2pC6.q= 0x539F539F539F539FLL;
2776  RTjpeg_zero.q = 0x0000000000000000LL;
2777 #endif
2778 }
2779 
2781 {
2782  delete [] m_oldStart;
2783 }
2784 
2785 inline int RTjpeg::compressYUV420(int8_t *sp, uint8_t **planes)
2786 {
2787  uint8_t * bp = planes[0];
2788  uint8_t * bp1 = bp + (m_width<<3);
2789  uint8_t * bp2 = planes[1];
2790  uint8_t * bp3 = planes[2];
2791 
2792 #ifdef MMX
2793  emms();
2794 #endif
2795  int8_t * sb = sp;
2796 /* Y */
2797  for(int i = m_height >> 1; i; i -= 8)
2798  {
2799  for(int j = 0, k = 0; j < m_width; j += 16, k += 8)
2800  {
2801  DctY(bp+j, m_yWidth);
2802  Quant(m_block, m_lqt);
2803  sp += b2s(m_block, sp, m_lB8);
2804 
2805  DctY(bp+j+8, m_yWidth);
2806  Quant(m_block, m_lqt);
2807  sp += b2s(m_block, sp, m_lB8);
2808 
2809  DctY(bp1+j, m_yWidth);
2810  Quant(m_block, m_lqt);
2811  sp += b2s(m_block, sp, m_lB8);
2812 
2813  DctY(bp1+j+8, m_yWidth);
2814  Quant(m_block, m_lqt);
2815  sp += b2s(m_block, sp, m_lB8);
2816 
2817  DctY(bp2+k, m_cWidth);
2818  Quant(m_block, m_cqt);
2819  sp += b2s(m_block, sp, m_cB8);
2820 
2821  DctY(bp3+k, m_cWidth);
2822  Quant(m_block, m_cqt);
2823  sp += b2s(m_block, sp, m_cB8);
2824  }
2825  bp += m_width<<4;
2826  bp1 += m_width<<4;
2827  bp2 += m_width<<2;
2828  bp3 += m_width<<2;
2829  }
2830 #ifdef MMX
2831  emms();
2832 #endif
2833  return (sp - sb);
2834 }
2835 
2836 inline int RTjpeg::compressYUV422(int8_t *sp, uint8_t **planes)
2837 {
2838  uint8_t * bp = planes[0];
2839  uint8_t * bp2 = planes[1];
2840  uint8_t * bp3 = planes[2];
2841 
2842 #ifdef MMX
2843  emms();
2844 #endif
2845  int8_t * sb=sp;
2846 /* Y */
2847  for(int i=m_height; i; i-=8)
2848  {
2849  for(int j=0, k=0; j<m_width; j+=16, k+=8)
2850  {
2851  DctY(bp+j, m_yWidth);
2852  Quant(m_block, m_lqt);
2853  sp += b2s(m_block, sp, m_lB8);
2854 
2855  DctY(bp+j+8, m_yWidth);
2856  Quant(m_block, m_lqt);
2857  sp += b2s(m_block, sp, m_lB8);
2858 
2859  DctY(bp2+k, m_cWidth);
2860  Quant(m_block, m_cqt);
2861  sp+=b2s(m_block, sp, m_cB8);
2862 
2863  DctY(bp3+k, m_cWidth);
2864  Quant(m_block, m_cqt);
2865  sp+=b2s(m_block, sp, m_cB8);
2866 
2867  }
2868  bp += m_width << 3;
2869  bp2 += m_width << 2;
2870  bp3 += m_width << 2;
2871 
2872  }
2873 #ifdef MMX
2874  emms();
2875 #endif
2876  return (sp-sb);
2877 }
2878 
2879 inline int RTjpeg::compress8(int8_t *sp, uint8_t **planes)
2880 {
2881  int8_t * sb = nullptr;
2882  uint8_t * bp = planes[0];
2883 
2884 #ifdef MMX
2885  emms();
2886 #endif
2887 
2888  sb=sp;
2889 /* Y */
2890  for(int i=0; i<m_height; i+=8)
2891  {
2892  for(int j=0; j<m_width; j+=8)
2893  {
2894  DctY(bp+j, m_width);
2895  Quant(m_block, m_lqt);
2896  sp += b2s(m_block, sp, m_lB8);
2897  }
2898  bp += m_width;
2899  }
2900 
2901 #ifdef MMX
2902  emms();
2903 #endif
2904  return (sp-sb);
2905 }
2906 
2907 inline void RTjpeg::decompressYUV422(int8_t *sp, uint8_t **planes)
2908 {
2909  uint8_t * bp = planes[0];
2910  uint8_t * bp2 = planes[1];
2911  uint8_t * bp3 = planes[2];
2912 
2913 #ifdef MMX
2914  emms();
2915 #endif
2916 
2917 /* Y */
2918  for(int i=m_height; i; i-=8)
2919  {
2920  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2921  if (*sp==-1)sp++;
2922  else
2923  {
2924  sp += s2b(m_block, sp, m_lB8, m_liqt);
2925  Idct(bp+j, m_block, m_width);
2926  }
2927  if (*sp==-1)sp++;
2928  else
2929  {
2930  sp += s2b(m_block, sp, m_lB8, m_liqt);
2931  Idct(bp+j+8, m_block, m_width);
2932  }
2933  if (*sp==-1)sp++;
2934  else
2935  {
2936  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2937  Idct(bp2+k, m_block, m_width>>1);
2938  }
2939  if (*sp==-1)sp++;
2940  else
2941  {
2942  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2943  Idct(bp3+k, m_block, m_width>>1);
2944  }
2945  }
2946  bp += m_width<<3;
2947  bp2 += m_width<<2;
2948  bp3 += m_width<<2;
2949  }
2950 #ifdef MMX
2951  emms();
2952 #endif
2953 }
2954 
2955 inline void RTjpeg::decompressYUV420(int8_t *sp, uint8_t **planes)
2956 {
2957  uint8_t * bp = planes[0];
2958  uint8_t * bp1 = bp + (m_width<<3);
2959  uint8_t * bp2 = planes[1];
2960  uint8_t * bp3 = planes[2];
2961 
2962 #ifdef MMX
2963  emms();
2964 #endif
2965 
2966 /* Y */
2967  for(int i=m_height>>1; i; i-=8)
2968  {
2969  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2970  if (*sp==-1)sp++;
2971  else
2972  {
2973  sp += s2b(m_block, sp, m_lB8, m_liqt);
2974  Idct(bp+j, m_block, m_width);
2975  }
2976  if (*sp==-1)sp++;
2977  else
2978  {
2979  sp += s2b(m_block, sp, m_lB8, m_liqt);
2980  Idct(bp+j+8, m_block, m_width);
2981  }
2982  if (*sp==-1)sp++;
2983  else
2984  {
2985  sp += s2b(m_block, sp, m_lB8, m_liqt);
2986  Idct(bp1+j, m_block, m_width);
2987  }
2988  if (*sp==-1)sp++;
2989  else
2990  {
2991  sp += s2b(m_block, sp, m_lB8, m_liqt);
2992  Idct(bp1+j+8, m_block, m_width);
2993  }
2994  if (*sp==-1)sp++;
2995  else
2996  {
2997  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2998  Idct(bp2+k, m_block, m_width>>1);
2999  }
3000  if (*sp==-1)sp++;
3001  else
3002  {
3003  sp += s2b(m_block, sp, m_cB8, m_ciqt);
3004  Idct(bp3+k, m_block, m_width>>1);
3005  }
3006  }
3007  bp += m_width<<4;
3008  bp1 += m_width<<4;
3009  bp2 += m_width<<2;
3010  bp3 += m_width<<2;
3011  }
3012 #ifdef MMX
3013  emms();
3014 #endif
3015 }
3016 
3017 inline void RTjpeg::decompress8(int8_t *sp, uint8_t **planes)
3018 {
3019  uint8_t * bp = planes[0];
3020 
3021 #ifdef MMX
3022  emms();
3023 #endif
3024 
3025 /* Y */
3026  for(int i=0; i<m_height; i+=8)
3027  {
3028  for(int j=0; j<m_width; j+=8)
3029  {
3030  if (*sp==-1)sp++;
3031  else
3032  {
3033  sp += s2b(m_block, sp, m_lB8, m_liqt);
3034  Idct(bp+j, m_block, m_width);
3035  }
3036  }
3037  bp += m_width<<3;
3038  }
3039 }
3040 
3041 #ifdef MMX
3042 
3043 int RTjpeg::bcomp(int16_t *rblock, int16_t *_old, mmx_t *mask)
3044 {
3045  auto *mold=(mmx_t *)_old;
3046  auto *mblock=(mmx_t *)rblock;
3047  volatile mmx_t result {};
3048  static mmx_t s_neg= { 0xffffffffffffffffULL };
3049 
3050  movq_m2r(*mask, mm7);
3051  movq_m2r(s_neg, mm6);
3052  pxor_r2r(mm5, mm5);
3053 
3054  for(int i=0; i<8; i++)
3055  {
3056  movq_m2r(*(mblock++), mm0);
3057  movq_m2r(*(mblock++), mm2);
3058  movq_m2r(*(mold++), mm1);
3059  movq_m2r(*(mold++), mm3);
3060  psubsw_r2r(mm1, mm0);
3061  psubsw_r2r(mm3, mm2);
3062  movq_r2r(mm0, mm1);
3063  movq_r2r(mm2, mm3);
3064  pcmpgtw_r2r(mm7, mm0);
3065  pcmpgtw_r2r(mm7, mm2);
3066  pxor_r2r(mm6, mm1);
3067  pxor_r2r(mm6, mm3);
3068  pcmpgtw_r2r(mm7, mm1);
3069  pcmpgtw_r2r(mm7, mm3);
3070  por_r2r(mm0, mm5);
3071  por_r2r(mm2, mm5);
3072  por_r2r(mm1, mm5);
3073  por_r2r(mm3, mm5);
3074  }
3075  movq_r2m(mm5, result);
3076 
3077  if (result.q)
3078  {
3079  for(int i=0; i<16; i++)((uint64_t *)_old)[i]=((uint64_t *)rblock)[i];
3080  return 0;
3081  }
3082  return 1;
3083 }
3084 
3085 #else
3086 int RTjpeg::bcomp(int16_t *rblock, int16_t *_old, uint16_t *mask)
3087 {
3088  for(int i=0; i<64; i++)
3089  if (abs(_old[i]-rblock[i])>*mask)
3090  {
3091  for(i=0; i<16; i++)((uint64_t *)_old)[i]=((uint64_t *)rblock)[i];
3092  return 0;
3093  }
3094  return 1;
3095 }
3096 #endif
3097 
3098 inline int RTjpeg::mcompressYUV420(int8_t *sp, uint8_t **planes)
3099 {
3100  uint8_t * bp = planes[0];
3101  uint8_t * bp1 = bp + (m_width<<3);
3102  uint8_t * bp2 = planes[1];
3103  uint8_t * bp3 = planes[2];
3104  int8_t * sb = sp;
3105  int16_t * lblock = m_old;
3106 
3107 /* Y */
3108  for(int i = m_height>>1; i; i-=8)
3109  {
3110  for(int j=0, k=0; j < m_width; j+=16, k+=8)
3111  {
3112  DctY(bp+j, m_yWidth);
3113  Quant(m_block, m_lqt);
3114  if (bcomp(m_block, lblock, &m_lMask))
3115  {
3116  *((uint8_t *)sp++)=255;
3117  }
3118  else sp+=b2s(m_block, sp, m_lB8);
3119  lblock += 64;
3120 
3121  DctY(bp+j+8, m_yWidth);
3122  Quant(m_block, m_lqt);
3123  if (bcomp(m_block, lblock, &m_lMask))
3124  {
3125  *((uint8_t *)sp++)=255;
3126  }
3127  else sp += b2s(m_block, sp, m_lB8);
3128  lblock += 64;
3129 
3130  DctY(bp1+j, m_yWidth);
3131  Quant(m_block, m_lqt);
3132  if (bcomp(m_block, lblock, &m_lMask))
3133  {
3134  *((uint8_t *)sp++)=255;
3135  }
3136  else sp += b2s(m_block, sp, m_lB8);
3137  lblock += 64;
3138 
3139  DctY(bp1+j+8, m_yWidth);
3140  Quant(m_block, m_lqt);
3141  if (bcomp(m_block, lblock, &m_lMask))
3142  {
3143  *((uint8_t *)sp++)=255;
3144  }
3145  else sp += b2s(m_block, sp, m_lB8);
3146  lblock += 64;
3147 
3148  DctY(bp2+k, m_cWidth);
3149  Quant(m_block, m_cqt);
3150  if (bcomp(m_block, lblock, &m_cMask))
3151  {
3152  *((uint8_t *)sp++)=255;
3153  }
3154  else
3155  sp+=b2s(m_block, sp, m_cB8);
3156  lblock+=64;
3157 
3158  DctY(bp3+k, m_cWidth);
3159  Quant(m_block, m_cqt);
3160  if (bcomp(m_block, lblock, &m_cMask))
3161  {
3162  *((uint8_t *)sp++)=255;
3163  }
3164  else
3165  sp+=b2s(m_block, sp, m_cB8);
3166  lblock+=64;
3167  }
3168  bp += m_width<<4;
3169  bp1 += m_width<<4;
3170  bp2 += m_width<<2;
3171  bp3 += m_width<<2;
3172  }
3173 #ifdef MMX
3174  emms();
3175 #endif
3176  return (sp-sb);
3177 }
3178 
3179 
3180 inline int RTjpeg::mcompressYUV422(int8_t *sp, uint8_t **planes)
3181 {
3182  uint8_t * bp = planes[0];
3183  uint8_t * bp2 = planes[1];
3184  uint8_t * bp3 = planes[2];
3185  int8_t * sb=sp;
3186  int16_t *lblock = m_old;
3187 
3188  for(int i = m_height; i; i-=8)
3189  {
3190  for(int j=0, k=0; j<m_width; j+=16, k+=8)
3191  {
3192  DctY(bp+j, m_yWidth);
3193  Quant(m_block, m_lqt);
3194  if (bcomp(m_block, lblock, &m_lMask))
3195  {
3196  *((uint8_t *)sp++)=255;
3197  }
3198  else sp+=b2s(m_block, sp, m_lB8);
3199  lblock+=64;
3200 
3201  DctY(bp+j+8, m_yWidth);
3202  Quant(m_block, m_lqt);
3203  if (bcomp(m_block, lblock, &m_lMask))
3204  {
3205  *((uint8_t *)sp++)=255;
3206  }
3207  else sp+=b2s(m_block, sp, m_lB8);
3208  lblock+=64;
3209 
3210  DctY(bp2+k, m_cWidth);
3211  Quant(m_block, m_cqt);
3212  if (bcomp(m_block, lblock, &m_cMask))
3213  {
3214  *((uint8_t *)sp++)=255;
3215  }
3216  else sp+=b2s(m_block, sp, m_cB8);
3217  lblock+=64;
3218 
3219  DctY(bp3+k, m_cWidth);
3220  Quant(m_block, m_cqt);
3221  if (bcomp(m_block, lblock, &m_cMask))
3222  {
3223  *((uint8_t *)sp++)=255;
3224  }
3225  else sp+=b2s(m_block, sp, m_cB8);
3226  lblock+=64;
3227 
3228  }
3229  bp += m_width<<3;
3230  bp2 += m_width<<2;
3231  bp3 += m_width<<2;
3232  }
3233 #ifdef MMX
3234  emms();
3235 #endif
3236  return (sp-sb);
3237 }
3238 
3239 inline int RTjpeg::mcompress8(int8_t *sp, uint8_t **planes)
3240 {
3241  uint8_t * bp = planes[0];
3242  int8_t * sb = sp;
3243  int16_t *lblock = m_old;
3244 
3245  for(int i=0; i<m_height; i+=8)
3246  {
3247  for(int j=0; j<m_width; j+=8)
3248  {
3249  DctY(bp+j, m_width);
3250  Quant(m_block, m_lqt);
3251  if (bcomp(m_block, lblock, &m_lMask))
3252  {
3253  *((uint8_t *)sp++)=255;
3254  } else sp+=b2s(m_block, sp, m_lB8);
3255  lblock+=64;
3256  }
3257  bp+=m_width<<3;
3258  }
3259 #ifdef MMX
3260  emms();
3261 #endif
3262  return (sp-sb);
3263 }
3264 
3266 {
3267  m_keyCount = 0;
3268 }
3269 
3270 int RTjpeg::Compress(int8_t *sp, uint8_t **planes)
3271 {
3272  auto * fh = (RTjpeg_frameheader *)sp;
3273  int ds = 0;
3274 
3275  if (m_keyRate == 0)
3276  {
3277  switch(m_f)
3278  {
3279  case RTJ_YUV420: ds = compressYUV420((int8_t*)&(fh->data), planes); break;
3280  case RTJ_YUV422: ds = compressYUV422((int8_t*)&(fh->data), planes); break;
3281  case RTJ_RGB8: ds = compress8((int8_t*)&(fh->data), planes); break;
3282  }
3283  fh->key = 0;
3284  } else {
3285  if (m_keyCount == 0)
3286  memset(m_old, 0, ((4 * m_width * m_height)));
3287  switch(m_f)
3288  {
3289  case RTJ_YUV420: ds = mcompressYUV420((int8_t*)&(fh->data), planes); break;
3290  case RTJ_YUV422: ds = mcompressYUV422((int8_t*)&(fh->data), planes); break;
3291  case RTJ_RGB8: ds = mcompress8((int8_t*)&(fh->data), planes); break;
3292  }
3293  fh->key = m_keyCount;
3294  if (++m_keyCount > m_keyRate)
3295  m_keyCount = 0;
3296  }
3297  ds += RTJPEG_HEADER_SIZE;
3298  fh->framesize = RTJPEG_SWAP_WORD(ds);
3299  fh->headersize = RTJPEG_HEADER_SIZE;
3300  fh->version = RTJPEG_FILE_VERSION;
3301  fh->width = RTJPEG_SWAP_HALFWORD(m_width);
3302  fh->height = RTJPEG_SWAP_HALFWORD(m_height);
3303  fh->quality = m_q;
3304  return ds;
3305 }
3306 
3307 void RTjpeg::Decompress(int8_t *sp, uint8_t **planes)
3308 {
3309  auto * fh = (RTjpeg_frameheader *)sp;
3310 
3311  if ((RTJPEG_SWAP_HALFWORD(fh->width) != m_width)||
3312  (RTJPEG_SWAP_HALFWORD(fh->height) != m_height))
3313  {
3314  int w = RTJPEG_SWAP_HALFWORD(fh->width);
3315  int h = RTJPEG_SWAP_HALFWORD(fh->height);
3316  SetSize(&w, &h);
3317  }
3318  if (fh->quality != m_q)
3319  {
3320  int q = fh->quality;
3321  SetQuality(&q);
3322  }
3323  switch(m_f)
3324  {
3325  case RTJ_YUV420: decompressYUV420((int8_t*)&(fh->data), planes); break;
3326  case RTJ_YUV422: decompressYUV422((int8_t*)&(fh->data), planes); break;
3327  case RTJ_RGB8: decompress8((int8_t*)&(fh->data), planes); break;
3328  }
3329 }
RTjpeg::mcompressYUV422
int mcompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3180
RTjpeg_chrom_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_chrom_quant_tbl
Definition: RTjpegN.cpp:83
RTjpeg::DctY
void DctY(uint8_t *idata, int rskip)
Definition: RTjpegN.cpp:597
RTjpeg_lum_quant_tbl
static const std::array< const uint8_t, 64 > RTjpeg_lum_quant_tbl
Definition: RTjpegN.cpp:72
RTjpeg::Quant
static void Quant(int16_t *block, int32_t *qtbl)
Definition: RTjpegN.cpp:528
RTjpeg::m_lB8
int32_t m_lB8
Definition: RTjpegN.h:119
RTjpeg::m_block
int16_t m_block[64]
Definition: RTjpegN.h:113
RTjpeg::m_yWidth
int32_t m_yWidth
Definition: RTjpegN.h:121
RTjpeg::m_ciqt
int32_t m_ciqt[64]
Definition: RTjpegN.h:118
RTjpeg::compress8
int compress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2879
RTjpeg::m_lMask
mmx_t m_lMask
Definition: RTjpegN.h:134
RTjpeg::m_ySize
int32_t m_ySize
Definition: RTjpegN.h:123
RTjpeg_C4
static mmx_t RTjpeg_C4
Definition: RTjpegN.cpp:34
RTjpeg::m_oldStart
int16_t * m_oldStart
Definition: RTjpegN.h:126
RTjpeg::SetNextKey
void SetNextKey(void)
Definition: RTjpegN.cpp:3265
RTjpeg::m_lqt
int32_t m_lqt[64]
Definition: RTjpegN.h:115
RTjpeg::m_q
int m_q
Definition: RTjpegN.h:131
FIX_1_414213562
#define FIX_1_414213562
Definition: RTjpegN.cpp:1498
RTjpeg_frameheader
Definition: RTjpegN.h:143
RTJPEG_FILE_VERSION
#define RTJPEG_FILE_VERSION
Definition: RTjpegN.h:35
RTJ_YUV422
#define RTJ_YUV422
Definition: RTjpegN.h:61
RTjpeg::m_liqt
int32_t m_liqt[64]
Definition: RTjpegN.h:117
RTjpeg_C6
static mmx_t RTjpeg_C6
Definition: RTjpegN.cpp:35
RTJ_RGB8
#define RTJ_RGB8
Definition: RTjpegN.h:62
RTjpeg::m_height
int m_height
Definition: RTjpegN.h:130
tmp
static guint32 * tmp
Definition: goom_core.cpp:30
RTjpeg::s2b
static int s2b(int16_t *data, const int8_t *strm, uint8_t bt8, int32_t *qtbla)
Definition: RTjpegN.cpp:279
RTjpeg_half
static mmx_t RTjpeg_half
Definition: RTjpegN.cpp:33
RTjpeg::bcomp
static int bcomp(int16_t *rblock, int16_t *old, mmx_t *mask)
Definition: RTjpegN.cpp:3043
RTjpeg::m_cB8
int32_t m_cB8
Definition: RTjpegN.h:120
RTjpeg::DctInit
void DctInit(void)
Definition: RTjpegN.cpp:588
RTjpeg::CalcTbls
void CalcTbls(void)
Definition: RTjpegN.cpp:2636
RTjpeg::QuantInit
void QuantInit(void)
Definition: RTjpegN.cpp:513
RTjpeg_C2mC6
static mmx_t RTjpeg_C2mC6
Definition: RTjpegN.cpp:36
RTjpeg_ones
static mmx_t RTjpeg_ones
Definition: RTjpegN.cpp:32
RTjpeg::mcompress8
int mcompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3239
RTJ_YUV420
#define RTJ_YUV420
Definition: RTjpegN.h:60
RTjpeg::mcompressYUV420
int mcompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3098
RTjpeg::compressYUV420
int compressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2785
RTjpeg_ZZ
static const std::array< const uint8_t, 64 > RTjpeg_ZZ
Definition: RTjpegN.cpp:44
hardwareprofile.smolt.long
long
Definition: smolt.py:76
RTjpeg::decompress8
void decompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3017
RTjpeg::m_cWidth
int32_t m_cWidth
Definition: RTjpegN.h:122
RTjpeg::b2s
static int b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
Definition: RTjpegN.cpp:111
hardwareprofile.distros.mythtv_data.main.stdout
stdout
Definition: main.py:87
RTJPEG_SWAP_WORD
#define RTJPEG_SWAP_WORD(a)
Definition: RTjpegN.h:46
FIX_1_082392200
#define FIX_1_082392200
Definition: RTjpegN.cpp:1497
RTjpeg::m_width
int m_width
Definition: RTjpegN.h:129
RTjpeg::~RTjpeg
~RTjpeg()
Definition: RTjpegN.cpp:2780
FIX_2_613125930
#define FIX_2_613125930
Definition: RTjpegN.cpp:1500
RTjpeg::decompressYUV420
void decompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2955
RTjpeg_C2pC6
static mmx_t RTjpeg_C2pC6
Definition: RTjpegN.cpp:37
RTJPEG_HEADER_SIZE
#define RTJPEG_HEADER_SIZE
Definition: RTjpegN.h:36
RTjpeg::Idct
void Idct(uint8_t *odata, int16_t *data, int rskip)
Definition: RTjpegN.cpp:1518
RTjpeg::SetIntra
int SetIntra(int *key, int *lm, int *cm)
Definition: RTjpegN.cpp:2725
planes
static uint planes(VideoFrameType Type)
Definition: mythframe.h:570
RTjpeg::m_keyCount
int m_keyCount
Definition: RTjpegN.h:127
RTjpeg::SetFormat
int SetFormat(const int *fmt)
Definition: RTjpegN.cpp:2684
FIX_1_847759065
#define FIX_1_847759065
Definition: RTjpegN.cpp:1499
RTjpeg::m_old
int16_t * m_old
Definition: RTjpegN.h:125
RTjpeg::m_keyRate
int m_keyRate
Definition: RTjpegN.h:140
RTjpeg_zero
static mmx_t RTjpeg_zero
Definition: RTjpegN.cpp:38
RTjpeg::SetSize
int SetSize(const int *w, const int *h)
Definition: RTjpegN.cpp:2690
RTjpeg::IdctInit
void IdctInit(void)
Definition: RTjpegN.cpp:1509
ttvdb.stderr
stderr
Definition: ttvdb.py:1426
RTjpeg::decompressYUV422
void decompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2907
MULTIPLY
#define MULTIPLY(var, const)
Definition: RTjpegN.cpp:1507
RTjpeg::Decompress
void Decompress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3307
RTjpeg::compressYUV422
int compressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2836
uint16_t
unsigned short uint16_t
Definition: iso6937tables.h:1
RTjpeg::m_cSize
int32_t m_cSize
Definition: RTjpegN.h:124
RTjpeg::m_cMask
mmx_t m_cMask
Definition: RTjpegN.h:135
DESCALE
#define DESCALE(x)
Definition: RTjpegN.cpp:1502
RTjpegN.h
RTjpeg_aan_tab
static const std::array< const uint64_t, 64 > RTjpeg_aan_tab
Definition: RTjpegN.cpp:61
RTjpeg::SetQuality
int SetQuality(int *quality)
Definition: RTjpegN.cpp:2667
RL
#define RL(x)
Definition: RTjpegN.cpp:1506
RTjpeg::Compress
int Compress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3270
RTjpeg::m_ws
int32_t m_ws[64 *4]
Definition: RTjpegN.h:114
RTjpeg::m_cqt
int32_t m_cqt[64]
Definition: RTjpegN.h:116
RTjpeg::m_f
int m_f
Definition: RTjpegN.h:132
RTjpeg::RTjpeg
RTjpeg()
Definition: RTjpegN.cpp:2767
RTJPEG_SWAP_HALFWORD
#define RTJPEG_SWAP_HALFWORD(a)
Definition: RTjpegN.h:47