MythTV  master
RTjpegN.cpp
Go to the documentation of this file.
1 /*
2  RTjpeg (C) Justin Schoeman 1998 (justin@suntiger.ee.up.ac.za)
3 
4  With modifications by:
5  (c) 1998, 1999 by Joerg Walter <trouble@moes.pmnet.uni-oldenburg.de>
6  and
7  (c) 1999 by Wim Taymans <wim.taymans@tvd.be>
8 
9  This program is free software; you can redistribute it and/or modify
10  it under the terms of the GNU General Public License as published by
11  the Free Software Foundation; either version 2 of the License, or
12  (at your option) any later version.
13 
14  This program is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17  GNU General Public License for more details.
18 
19  You should have received a copy of the GNU General Public License
20  along with this program; if not, write to the Free Software
21  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 
23 */
24 
25 #include <cstdio>
26 #include <cstdlib>
27 #include <cstring>
28 #include "RTjpegN.h"
29 
#ifdef MMX
/*
 * 64-bit constant operands for the MMX code paths in this file.
 * They are only declared here; their values are assigned outside this
 * section.
 */

/* Quant(): interleaved with the quantizer words / the block words
   ahead of the pmaddwd multiply-accumulate. */
static mmx_t RTjpeg_ones;
static mmx_t RTjpeg_half;

/* DctY(): pmulhw multipliers. The names suggest the rotation constants
   c4, c6, c2-c6 and c2+c6 -- confirm against the initialisation code. */
static mmx_t RTjpeg_C4;
static mmx_t RTjpeg_C6;
static mmx_t RTjpeg_C2mC6;
static mmx_t RTjpeg_C2pC6;

/* DctY(): second operand of the byte -> word unpack of the input pixels. */
static mmx_t RTjpeg_zero;
#endif
39 
/* Debug switch: when defined, b2s()/s2b() print every coefficient block
   and the byte count to stdout. Disabled by default. */
//#define SHOWBLOCK 1

/* Selects the bit-packed b2s()/s2b() implementations below instead of
   the simpler byte-oriented fallback. */
#define BETTERCOMPRESSION 1
42 
/*
 * Zig-zag scan order.  RTjpeg_ZZ[k] is the position inside the 64-entry
 * coefficient block of the k-th value in stream order; b2s()/s2b() always
 * address the block as data[RTjpeg_ZZ[k]].
 * The initialiser is laid out one anti-diagonal of the 8x8 block per line.
 */
static const unsigned char RTjpeg_ZZ[64] = {
     0,                                 /* diagonal  0 */
     8,  1,                             /* diagonal  1 */
     2,  9, 16,                         /* diagonal  2 */
    24, 17, 10,  3,                     /* diagonal  3 */
     4, 11, 18, 25, 32,                 /* diagonal  4 */
    40, 33, 26, 19, 12,  5,             /* diagonal  5 */
     6, 13, 20, 27, 34, 41, 48,         /* diagonal  6 */
    56, 49, 42, 35, 28, 21, 14,  7,     /* diagonal  7 (main) */
    15, 22, 29, 36, 43, 50, 57,         /* diagonal  8 */
    58, 51, 44, 37, 30, 23,             /* diagonal  9 */
    31, 38, 45, 52, 59,                 /* diagonal 10 */
    60, 53, 46, 39,                     /* diagonal 11 */
    47, 54, 61,                         /* diagonal 12 */
    62, 55,                             /* diagonal 13 */
    63                                  /* diagonal 14 */
};
59 
/*
 * Per-coefficient scale factors, stored as 64-bit integers with 2^32
 * representing 1.0 (entry [0] is exactly 4294967296 = 2^32).
 * The init code further down divides (quantizer << 32) by these entries.
 * The table is symmetric: entry [r*8+c] equals entry [c*8+r].
 * NOTE(review): the name suggests the AAN fast-DCT post-scale factors --
 * confirm before relying on that.
 */
static const uint64_t RTjpeg_aan_tab[64] = {
    /* row 0 */
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL,
    4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    /* row 1 */
    5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL,
    5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
    /* row 2 */
    5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL,
    5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
    /* row 3 */
    5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL,
    5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
    /* row 4 */
    4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL,
    4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
    /* row 5 */
    3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL,
    3374581504ULL, 2651326208ULL, 1826357504ULL,  931136000ULL,
    /* row 6 */
    2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL,
    2324432128ULL, 1826357504ULL, 1258030336ULL,  641204288ULL,
    /* row 7 */
    1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL,
    1184891264ULL,  931136000ULL,  641204288ULL,  326894240ULL,
};
70 
/*
 * Base quantizer steps for the luminance plane, one entry per position
 * of the 8x8 block, listed row by row.
 * NOTE(review): the values look like the example luminance table from
 * the JPEG specification (Annex K) -- confirm.
 */
static const unsigned char RTjpeg_lum_quant_tbl[64] = {
    16,  11,  10,  16,  24,  40,  51,  61,
    12,  12,  14,  19,  26,  58,  60,  55,
    14,  13,  16,  24,  40,  57,  69,  56,
    14,  17,  22,  29,  51,  87,  80,  62,
    18,  22,  37,  56,  68, 109, 103,  77,
    24,  35,  55,  64,  81, 104, 113,  92,
    49,  64,  78,  87, 103, 121, 120, 101,
    72,  92,  95,  98, 112, 100, 103,  99
};
81 
/*
 * Base quantizer steps for the chrominance planes, one entry per
 * position of the 8x8 block, listed row by row.  Everything outside the
 * upper-left corner uses the same coarse step of 99.
 * NOTE(review): the values look like the example chrominance table from
 * the JPEG specification (Annex K) -- confirm.
 */
static const unsigned char RTjpeg_chrom_quant_tbl[64] = {
    17, 18, 24, 47, 99, 99, 99, 99,
    18, 21, 26, 66, 99, 99, 99, 99,
    24, 26, 56, 99, 99, 99, 99, 99,
    47, 66, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99,
    99, 99, 99, 99, 99, 99, 99, 99
};
92 
93 #ifdef BETTERCOMPRESSION
94 
95 /*--------------------------------------------------*/
96 /* better encoding, but needs a lot more cpu time */
97 /* seems to be more effective than old method +lzo */
98 /* with this encoding lzo isn't efficient anymore */
99 /* there is still more potential for better */
100 /* encoding but that would need even more cputime */
101 /* anyway your mileage may vary */
102 /* */
103 /* written by Martin BIELY and Roman HOCHLEITNER */
104 /*--------------------------------------------------*/
105 
106 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
107 /* Block to Stream (encoding) */
108 /* */
109 
110 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t /*bt8*/)
111 {
112  int co=1;
113 
114  auto *ustrm = (uint8_t *)strm;
115 #ifdef SHOWBLOCK
116 
117  int ii;
118  for (ii=0; ii < 64; ii++) {
119  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
120  }
121  fprintf(stdout, "\n\n");
122 
123 #endif
124 
125 // *strm++ = 0x10;
126 // *strm = 0x00;
127 //
128 // return 2;
129 
130  // first byte allways written
131  ustrm[0]=
132  (uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
133 
134 
135  int ci=63;
136  while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
137 
138  unsigned char bitten = ((unsigned char)ci) << 2;
139 
140  if (ci==0) {
141  ustrm[1]= bitten;
142  co = 2;
143  return co;
144  }
145 
146  /* bitoff=0 because the high 6bit contain first non zero position */
147  unsigned char bitoff = 0;
148  co = 1;
149 
150  for(; ci>0; ci--) {
151 
152  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
153 
154  switch(ZZvalue) {
155  case 0:
156  break;
157  case 1:
158  bitten |= (0x01<<bitoff);
159  break;
160  case -1:
161  bitten |= (0x03<<bitoff);
162  break;
163  default:
164  bitten |= (0x02<<bitoff);
165  goto HERZWEH;
166  break;
167  }
168 
169  if ( bitoff == 0 ) {
170  ustrm[co]= bitten;
171  bitten = 0;
172  bitoff = 8;
173  co++;
174  } /* "fall through" */
175  bitoff-=2;
176 
177  }
178 
179  /* ci must be 0 */
180  if (bitoff != 6) {
181 
182  ustrm[co]= bitten;
183  co++;
184 
185  }
186  goto BAUCHWEH;
187 
188 HERZWEH:
189 /* ci cannot be 0 */
190 /* correct bitoff to nibble boundaries */
191 
192  switch(bitoff){
193  case 4:
194  case 6:
195  bitoff = 0;
196  break;
197  case 2:
198  case 0:
199  ustrm[co]= bitten;
200  bitoff = 4;
201  co++;
202  bitten = 0; // clear half nibble values in bitten
203  break;
204  default:
205  break;
206  }
207 
208  for(; ci>0; ci--) {
209 
210  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
211 
212  if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
213  bitten |= (0x08<<bitoff);
214  goto HIRNWEH;
215  }
216 
217  bitten |= (ZZvalue&0xf)<<bitoff;
218 
219  if ( bitoff == 0 ) {
220  ustrm[co]= bitten;
221  bitten = 0;
222  bitoff = 8;
223  co++;
224  } /* "fall thru" */
225  bitoff-=4;
226  }
227 
228  /* ci must be 0 */
229  if ( bitoff == 0 ) {
230  ustrm[co]= bitten;
231  co++;
232  }
233  goto BAUCHWEH;
234 
235 HIRNWEH:
236 
237  ustrm[co]= bitten;
238  co++;
239 
240 
241  /* bitting is over now we bite */
242  for(; ci>0; ci--) {
243 
244  int16_t ZZvalue = data[RTjpeg_ZZ[ci]];
245 
246  if (ZZvalue>0)
247  {
248  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
249  }
250  else
251  {
252  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
253  }
254 
255  }
256 
257 
258 BAUCHWEH:
259  /* we gotoo much now we are ill */
260 #ifdef SHOWBLOCK
261 {
262 int i;
263 fprintf(stdout, "\nco = '%d'\n", co);
264  for (i=0; i < co+2; i++) {
265  fprintf(stdout, "%d ", strm[i]);
266  }
267 fprintf(stdout, "\n\n");
268 }
269 #endif
270 
271  return co;
272 }
273 
274 /* +++++++++++++++++++++++++++++++++++++++++++++++++++*/
275 /* Stream to Block (decoding) */
276 /* */
277 
278 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t /*bt8*/, int32_t *qtbla)
279 {
280  auto *qtbl = (uint32_t *)qtbla;
281  int ci = 0;
282  unsigned char bitoff = 0;
283 
284  /* first byte always read */
285  int i=RTjpeg_ZZ[0];
286  data[i]=((uint8_t)strm[0])*qtbl[i];
287 
288  /* we start at the behind */
289 
290  unsigned char bitten = ((unsigned char)strm[1]) >> 2;
291  int co = 63;
292  for(; co > bitten; co--) {
293 
294  data[RTjpeg_ZZ[co]] = 0;
295 
296  }
297 
298  if (co==0) {
299  ci = 2;
300  goto AUTOBAHN;
301  }
302 
303  /* we have to read the last 2 bits of the second byte */
304  ci=1;
305  bitoff = 0;
306 
307  for(; co>0; co--) {
308 
309  bitten = ((unsigned char)strm[ci]) >> bitoff;
310  bitten &= 0x03;
311 
312  i=RTjpeg_ZZ[co];
313 
314  switch( bitten ) {
315  case 0x03:
316  data[i]= -qtbl[i];
317  break;
318  case 0x02:
319  goto FUSSWEG;
320  break;
321  case 0x01:
322  data[i]= qtbl[i];
323  break;
324  case 0x00:
325  data[i]= 0;
326  break;
327  default:
328  break;
329  }
330 
331  if ( bitoff == 0 ) {
332  bitoff = 8;
333  ci++;
334  }
335  bitoff -= 2;
336  }
337  /* co is 0 now */
338  /* data is written properly */
339 
340  /* if bitoff!=6 then ci is the index, but should be the byte count, so we increment by 1 */
341  if (bitoff!=6) ci++;
342 
343  goto AUTOBAHN;
344 
345 
346 FUSSWEG:
347 /* correct bitoff to nibble */
348  switch(bitoff){
349  case 4:
350  case 6:
351  bitoff = 0;
352  break;
353  case 2:
354  case 0:
355  /* we have to read from the next byte */
356  ci++;
357  bitoff = 4;
358  break;
359  default:
360  break;
361  }
362 
363  for(; co>0; co--) {
364 
365  bitten = ((unsigned char)strm[ci]) >> bitoff;
366  bitten &= 0x0f;
367 
368  i=RTjpeg_ZZ[co];
369 
370  if ( bitten == 0x08 ) {
371  goto STRASSE;
372  }
373 
374  /* the compiler cannot do sign extension for signed nibbles */
375  if ( bitten & 0x08 ) {
376  bitten |= 0xf0;
377  }
378  /* the unsigned char bitten now is a valid signed char */
379 
380  data[i]=((signed char)bitten)*qtbl[i];
381 
382  if ( bitoff == 0 ) {
383  bitoff = 8;
384  ci++;
385  }
386  bitoff -= 4;
387  }
388  /* co is 0 */
389 
390  /* if bitoff!=4 then ci is the index, but should be the byte count, so we increment by 1 */
391  if (bitoff!=4) ci++;
392 
393  goto AUTOBAHN;
394 
395 STRASSE:
396  ci++;
397 
398  for(; co>0; co--) {
399  i=RTjpeg_ZZ[co];
400  data[i]=strm[ci++]*qtbl[i];
401  }
402 
403  /* ci now is the count, because it points to next element => no incrementing */
404 
405 AUTOBAHN:
406 
407 #ifdef SHOWBLOCK
408 fprintf(stdout, "\nci = '%d'\n", ci);
409  for (i=0; i < 64; i++) {
410  fprintf(stdout, "%d ", data[RTjpeg_ZZ[i]]);
411  }
412 fprintf(stdout, "\n\n");
413 #endif
414 
415  return ci;
416 }
417 
418 #else
419 
420 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
421 {
422  register int ci, co=1, tmp;
423  register int16_t ZZvalue;
424 
425 #ifdef SHOWBLOCK
426 
427  int ii;
428  for (ii=0; ii < 64; ii++) {
429  fprintf(stdout, "%d ", data[RTjpeg_ZZ[ii]]);
430  }
431  fprintf(stdout, "\n\n");
432 
433 #endif
434 
435  (uint8_t)strm[0]=(uint8_t)(data[RTjpeg_ZZ[0]]>254) ? 254:((data[RTjpeg_ZZ[0]]<0)?0:data[RTjpeg_ZZ[0]]);
436 
437  for(ci=1; ci<=bt8; ci++)
438  {
439  ZZvalue = data[RTjpeg_ZZ[ci]];
440 
441  if (ZZvalue>0)
442  {
443  strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
444  }
445  else
446  {
447  strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
448  }
449  }
450 
451  for(; ci<64; ci++)
452  {
453  ZZvalue = data[RTjpeg_ZZ[ci]];
454 
455  if (ZZvalue>0)
456  {
457  strm[co++]=(int8_t)(ZZvalue>63)?63:ZZvalue;
458  }
459  else if (ZZvalue<0)
460  {
461  strm[co++]=(int8_t)(ZZvalue<-64)?-64:ZZvalue;
462  }
463  else /* compress zeros */
464  {
465  tmp=ci;
466  do
467  {
468  ci++;
469  } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
470 
471  strm[co++]=(int8_t)(63+(ci-tmp));
472  ci--;
473  }
474  }
475  return (int)co;
476 }
477 
478 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
479 {
480  uint32_t *qtbl = (uint32_t *)qtbla;
481  int ci=1, co=1, tmp;
482  register int i;
483 
484  i=RTjpeg_ZZ[0];
485  data[i]=((uint8_t)strm[0])*qtbl[i];
486 
487  for(co=1; co<=bt8; co++)
488  {
489  i=RTjpeg_ZZ[co];
490  data[i]=strm[ci++]*qtbl[i];
491  }
492 
493  for(; co<64; co++)
494  {
495  if (strm[ci]>63)
496  {
497  tmp=co+strm[ci]-63;
498  for(; co<tmp; co++)data[RTjpeg_ZZ[co]]=0;
499  co--;
500  } else
501  {
502  i=RTjpeg_ZZ[co];
503  data[i]=strm[ci]*qtbl[i];
504  }
505  ci++;
506  }
507  return (int)ci;
508 }
509 #endif
510 
511 #ifdef MMX
513 {
514  using P16_32 = union { int16_t *m_int16; int32_t *m_int32; };
515  P16_32 qtbl;
516 
517  qtbl.m_int32 = m_lqt;
518  for (int i = 0; i < 64; i++)
519  qtbl.m_int16[i] = static_cast<int16_t>(m_lqt[i]);
520 
521  // cppcheck-suppress unreadVariable
522  qtbl.m_int32 = m_cqt;
523  for (int i = 0; i < 64; i++)
524  qtbl.m_int16[i] = static_cast<int16_t>(m_cqt[i]);
525 }
526 
527 void RTjpeg::Quant(int16_t *_block, int32_t *qtbl)
528 {
529  auto *ql=(mmx_t *)qtbl;
530  auto *bl=(mmx_t *)_block;
531 
532  movq_m2r(RTjpeg_ones, mm6);
533  movq_m2r(RTjpeg_half, mm7);
534 
535  for(int i=16; i; i--)
536  {
537  movq_m2r(*(ql++), mm0); /* quant vals (4) */
538  movq_m2r(*bl, mm2); /* block vals (4) */
539  movq_r2r(mm0, mm1);
540  movq_r2r(mm2, mm3);
541 
542  punpcklwd_r2r(mm6, mm0); /* 1 qb 1 qa */
543  punpckhwd_r2r(mm6, mm1); /* 1 qd 1 qc */
544 
545  punpcklwd_r2r(mm7, mm2); /* 32767 bb 32767 ba */
546  punpckhwd_r2r(mm7, mm3); /* 32767 bd 32767 bc */
547 
548  pmaddwd_r2r(mm2, mm0); /* 32767+bb*qb 32767+ba*qa */
549  pmaddwd_r2r(mm3, mm1); /* 32767+bd*qd 32767+bc*qc */
550 
551  psrad_i2r(16, mm0);
552  psrad_i2r(16, mm1);
553 
554  packssdw_r2r(mm1, mm0);
555 
556  movq_r2m(mm0, *(bl++));
557  }
558 }
559 #else
560 void RTjpeg::QuantInit()
561 {
562 }
563 
564 void RTjpeg::Quant(int16_t *_block, int32_t *qtbl)
565 {
566  int i;
567 
568  for(i=0; i<64; i++)
569  _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
570 }
571 #endif
572 
573 /*
574  * Perform the forward DCT on one block of samples.
575  */
576 #ifndef MMX
/*
 * Fixed-point helpers for the scalar forward DCT.
 *
 * FIX_* are the DCT rotation constants scaled by 2^8 and rounded to the
 * nearest integer (e.g. 0.707106781 * 256 = 181.02 -> 181).
 */
#define FIX_0_382683433 ((int32_t)  98) /* FIX(0.382683433) */
#define FIX_0_541196100 ((int32_t) 139) /* FIX(0.541196100) */
#define FIX_0_707106781 ((int32_t) 181) /* FIX(0.707106781) */
#define FIX_1_306562965 ((int32_t) 334) /* FIX(1.306562965) */

/* Round-to-nearest removal of 8 resp. 16 fraction bits; the result is
   narrowed to int16_t.  The whole expansion is parenthesized so the
   macros can be used inside any larger expression. */
#define DESCALE10(x) ((int16_t)(((x) + 128) >> 8))
#define DESCALE20(x) ((int16_t)(((x) + 32768) >> 16))

/* Product of a value and a FIX_* constant as int32_t.  The second
   parameter used to be called "const", which is a keyword. */
#define D_MULTIPLY(var, factor) ((int32_t)((var) * (factor)))
585 #endif
586 
588 {
589  for (int i = 0; i < 64; i++)
590  {
591  m_lqt[i] = (((uint64_t)m_lqt[i] << 32) / RTjpeg_aan_tab[i]);
592  m_cqt[i] = (((uint64_t)m_cqt[i] << 32) / RTjpeg_aan_tab[i]);
593  }
594 }
595 
596 void RTjpeg::DctY(uint8_t *idata, int rskip)
597 {
598 #ifndef MMX
599  uint8_t *idataptr = idata;
600  int32_t *wsptr = m_ws;
601 
602  for (int ctr = 7; ctr >= 0; ctr--) {
603  int32_t tmp0 = idataptr[0] + idataptr[7];
604  int32_t tmp7 = idataptr[0] - idataptr[7];
605  int32_t tmp1 = idataptr[1] + idataptr[6];
606  int32_t tmp6 = idataptr[1] - idataptr[6];
607  int32_t tmp2 = idataptr[2] + idataptr[5];
608  int32_t tmp5 = idataptr[2] - idataptr[5];
609  int32_t tmp3 = idataptr[3] + idataptr[4];
610  int32_t tmp4 = idataptr[3] - idataptr[4];
611 
612  int32_t tmp10 = (tmp0 + tmp3); /* phase 2 */
613  int32_t tmp13 = tmp0 - tmp3;
614  int32_t tmp11 = (tmp1 + tmp2);
615  int32_t tmp12 = tmp1 - tmp2;
616 
617  wsptr[0] = (tmp10 + tmp11)<<8; /* phase 3 */
618  wsptr[4] = (tmp10 - tmp11)<<8;
619 
620  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
621  wsptr[2] = (tmp13<<8) + z1; /* phase 5 */
622  wsptr[6] = (tmp13<<8) - z1;
623 
624  tmp10 = tmp4 + tmp5; /* phase 2 */
625  tmp11 = tmp5 + tmp6;
626  tmp12 = tmp6 + tmp7;
627 
628  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
629  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
630  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
631  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
632 
633  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
634  int32_t z13 = (tmp7<<8) - z3;
635 
636  wsptr[5] = z13 + z2; /* phase 6 */
637  wsptr[3] = z13 - z2;
638  wsptr[1] = z11 + z4;
639  wsptr[7] = z11 - z4;
640 
641  idataptr += rskip<<3; /* advance pointer to next row */
642  wsptr += 8;
643  }
644 
645  wsptr = m_ws;
646  int16_t *odataptr = m_block;
647  for (int ctr = 7; ctr >= 0; ctr--) {
648  int32_t tmp0 = wsptr[0] + wsptr[56];
649  int32_t tmp7 = wsptr[0] - wsptr[56];
650  int32_t tmp1 = wsptr[8] + wsptr[48];
651  int32_t tmp6 = wsptr[8] - wsptr[48];
652  int32_t tmp2 = wsptr[16] + wsptr[40];
653  int32_t tmp5 = wsptr[16] - wsptr[40];
654  int32_t tmp3 = wsptr[24] + wsptr[32];
655  int32_t tmp4 = wsptr[24] - wsptr[32];
656 
657  int32_t tmp10 = tmp0 + tmp3; /* phase 2 */
658  int32_t tmp13 = tmp0 - tmp3;
659  int32_t tmp11 = tmp1 + tmp2;
660  int32_t tmp12 = tmp1 - tmp2;
661 
662  odataptr[0] = DESCALE10(tmp10 + tmp11); /* phase 3 */
663  odataptr[32] = DESCALE10(tmp10 - tmp11);
664 
665  int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781); /* c4 */
666  odataptr[16] = DESCALE20((tmp13<<8) + z1); /* phase 5 */
667  odataptr[48] = DESCALE20((tmp13<<8) - z1);
668 
669  tmp10 = tmp4 + tmp5; /* phase 2 */
670  tmp11 = tmp5 + tmp6;
671  tmp12 = tmp6 + tmp7;
672 
673  int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433); /* c6 */
674  int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5; /* c2-c6 */
675  int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5; /* c2+c6 */
676  int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781); /* c4 */
677 
678  int32_t z11 = (tmp7<<8) + z3; /* phase 5 */
679  int32_t z13 = (tmp7<<8) - z3;
680 
681  odataptr[40] = DESCALE20(z13 + z2); /* phase 6 */
682  odataptr[24] = DESCALE20(z13 - z2);
683  odataptr[8] = DESCALE20(z11 + z4);
684  odataptr[56] = DESCALE20(z11 - z4);
685 
686  odataptr++; /* advance pointer to next column */
687  wsptr++;
688 
689  }
690 #else
691  volatile mmx_t tmp6;
692  volatile mmx_t tmp7;
693  auto *dataptr = (mmx_t *)m_block;
694  auto *idata2 = (mmx_t *)idata;
695 
696 
697  // first copy the input 8 bit to the destination 16 bits
698 
699  movq_m2r(RTjpeg_zero, mm2);
700 
701  movq_m2r(*idata2, mm0);
702  movq_r2r(mm0, mm1);
703 
704  punpcklbw_r2r(mm2, mm0);
705  movq_r2m(mm0, *(dataptr));
706 
707  punpckhbw_r2r(mm2, mm1);
708  movq_r2m(mm1, *(dataptr+1));
709 
710  idata2 += rskip;
711 
712  movq_m2r(*idata2, mm0);
713  movq_r2r(mm0, mm1);
714 
715  punpcklbw_r2r(mm2, mm0);
716  movq_r2m(mm0, *(dataptr+2));
717 
718  punpckhbw_r2r(mm2, mm1);
719  movq_r2m(mm1, *(dataptr+3));
720 
721  idata2 += rskip;
722 
723  movq_m2r(*idata2, mm0);
724  movq_r2r(mm0, mm1);
725 
726  punpcklbw_r2r(mm2, mm0);
727  movq_r2m(mm0, *(dataptr+4));
728 
729  punpckhbw_r2r(mm2, mm1);
730  movq_r2m(mm1, *(dataptr+5));
731 
732  idata2 += rskip;
733 
734  movq_m2r(*idata2, mm0);
735  movq_r2r(mm0, mm1);
736 
737  punpcklbw_r2r(mm2, mm0);
738  movq_r2m(mm0, *(dataptr+6));
739 
740  punpckhbw_r2r(mm2, mm1);
741  movq_r2m(mm1, *(dataptr+7));
742 
743  idata2 += rskip;
744 
745  movq_m2r(*idata2, mm0);
746  movq_r2r(mm0, mm1);
747 
748  punpcklbw_r2r(mm2, mm0);
749  movq_r2m(mm0, *(dataptr+8));
750 
751  punpckhbw_r2r(mm2, mm1);
752  movq_r2m(mm1, *(dataptr+9));
753 
754  idata2 += rskip;
755 
756  movq_m2r(*idata2, mm0);
757  movq_r2r(mm0, mm1);
758 
759  punpcklbw_r2r(mm2, mm0);
760  movq_r2m(mm0, *(dataptr+10));
761 
762  punpckhbw_r2r(mm2, mm1);
763  movq_r2m(mm1, *(dataptr+11));
764 
765  idata2 += rskip;
766 
767  movq_m2r(*idata2, mm0);
768  movq_r2r(mm0, mm1);
769 
770  punpcklbw_r2r(mm2, mm0);
771  movq_r2m(mm0, *(dataptr+12));
772 
773  punpckhbw_r2r(mm2, mm1);
774  movq_r2m(mm1, *(dataptr+13));
775 
776  idata2 += rskip;
777 
778  movq_m2r(*idata2, mm0);
779  movq_r2r(mm0, mm1);
780 
781  punpcklbw_r2r(mm2, mm0);
782  movq_r2m(mm0, *(dataptr+14));
783 
784  punpckhbw_r2r(mm2, mm1);
785  movq_r2m(mm1, *(dataptr+15));
786 
787 /* Start Transpose to do calculations on rows */
788 
789  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into m5
790 
791  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
792  movq_r2r(mm7, mm5);
793 
794  punpcklwd_m2r(*(dataptr+11), mm7); // m11:m01|m10:m00 - interleave first and second lines
795  movq_r2r(mm6, mm2);
796 
797  punpcklwd_m2r(*(dataptr+15), mm6); // m31:m21|m30:m20 - interleave third and fourth lines
798  movq_r2r(mm7, mm1);
799 
800  movq_m2r(*(dataptr+11), mm3); // m13:m13|m11:m10 - second line
801  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
802 
803  movq_m2r(*(dataptr+15), mm0); // m13:m13|m11:m10 - fourth line
804  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
805 
806  movq_r2m(mm7,*(dataptr+9)); // write result 1
807  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
808 
809  movq_r2m(mm1,*(dataptr+11)); // write result 2
810  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
811 
812  movq_r2r(mm5, mm1);
813  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
814 
815  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
816  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
817 
818  movq_r2m(mm5,*(dataptr+13)); // write result 3
819 
820  // last 4x4 done
821 
822  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
823 
824  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
825  movq_r2r(mm0, mm6);
826 
827  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
828  movq_r2r(mm2, mm7);
829 
830  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
831  movq_r2r(mm0, mm4);
832 
833  //
834  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
835  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
836 
837  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
838  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
839 
840  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
841  movq_r2r(mm1, mm2); // copy first line
842 
843  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
844  movq_r2r(mm6, mm5); // copy first intermediate result
845 
846  movq_r2m(mm0, *(dataptr+8)); // write result 1
847  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
848 
849  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
850  movq_r2r(mm3, mm0); // copy third line
851 
852  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
853 
854  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
855  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
856 
857  punpcklwd_m2r(*(dataptr+14), mm3); // n31:n21|n30:n20 - interleave third and fourth lines
858  movq_r2r(mm1, mm4);
859 
860  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
861  punpckldq_r2r(mm3, mm1); // n30:n20|n10:n00 - produce first result
862 
863  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
864  movq_r2r(mm2, mm6);
865 
866  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
867  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
868 
869  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
870  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
871 
872  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
873  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
874 
875  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
876 
877  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
878 
879  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
880 
881 
882 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
883 
884  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
885  movq_r2r(mm0, mm2);
886 
887  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
888  movq_r2r(mm7, mm4);
889 
890  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
891  movq_r2r(mm0, mm1);
892 
893  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
894  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
895 
896  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
897  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
898 
899  movq_r2r(mm0, mm7); // write result 1
900  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
901 
902  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
903  movq_r2r(mm1, mm6); // write result 2
904 
905  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
906  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
907 
908  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
909  movq_r2r(mm2, mm3); // copy first intermediate result
910 
911  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
912  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
913 
914  movq_r2m(mm7, tmp7);
915  movq_r2r(mm2, mm5); // write result 3
916 
917  movq_r2m(mm6, tmp6);
918  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
919 
920  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+5 /* Stage 1 */
921  movq_r2r(mm3, mm4); // write result 4
922 
923 /************************************************************************************************
924  End of Transpose
925 ************************************************************************************************/
926 
927 
928  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
929  movq_r2r(mm0, mm7);
930 
931  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
932  movq_r2r(mm1, mm6);
933 
934  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
935  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
936 
937  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
938  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
939 
940  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
941  paddw_r2r(mm7, mm6); // tmp12 + tmp13
942 
943  /* stage 3 */
944 
945  movq_m2r(tmp6, mm2);
946  movq_r2r(mm0, mm3);
947 
948  psllw_i2r(2, mm6); // m8 * 2^2
949  paddw_r2r(mm1, mm0);
950 
951  pmulhw_m2r(RTjpeg_C4, mm6); // z1
952  psubw_r2r(mm1, mm3);
953 
954  movq_r2m(mm0, *dataptr);
955  movq_r2r(mm7, mm0);
956 
957  /* Odd part */
958  movq_r2m(mm3, *(dataptr+8));
959  paddw_r2r(mm5, mm4); // tmp10
960 
961  movq_m2r(tmp7, mm3);
962  paddw_r2r(mm6, mm0); // tmp32
963 
964  paddw_r2r(mm2, mm5); // tmp11
965  psubw_r2r(mm6, mm7); // tmp33
966 
967  movq_r2m(mm0, *(dataptr+4));
968  paddw_r2r(mm3, mm2); // tmp12
969 
970  /* stage 4 */
971 
972  movq_r2m(mm7, *(dataptr+12));
973  movq_r2r(mm4, mm1); // copy of tmp10
974 
975  psubw_r2r(mm2, mm1); // tmp10 - tmp12
976  psllw_i2r(2, mm4); // m8 * 2^2
977 
978  movq_m2r(RTjpeg_C2mC6, mm0);
979  psllw_i2r(2, mm1);
980 
981  pmulhw_m2r(RTjpeg_C6, mm1); // z5
982  psllw_i2r(2, mm2);
983 
984  pmulhw_r2r(mm0, mm4); // z5
985 
986  /* stage 5 */
987 
988  pmulhw_m2r(RTjpeg_C2pC6, mm2);
989  psllw_i2r(2, mm5);
990 
991  pmulhw_m2r(RTjpeg_C4, mm5); // z3
992  movq_r2r(mm3, mm0); // copy tmp7
993 
994  movq_m2r(*(dataptr+1), mm7);
995  paddw_r2r(mm1, mm4); // z2
996 
997  paddw_r2r(mm1, mm2); // z4
998 
999  paddw_r2r(mm5, mm0); // z11
1000  psubw_r2r(mm5, mm3); // z13
1001 
1002  /* stage 6 */
1003 
1004  movq_r2r(mm3, mm5); // copy z13
1005  psubw_r2r(mm4, mm3); // y3=z13 - z2
1006 
1007  paddw_r2r(mm4, mm5); // y5=z13 + z2
1008  movq_r2r(mm0, mm6); // copy z11
1009 
1010  movq_r2m(mm3, *(dataptr+6)); //save y3
1011  psubw_r2r(mm2, mm0); // y7=z11 - z4
1012 
1013  movq_r2m(mm5, *(dataptr+10)); //save y5
1014  paddw_r2r(mm2, mm6); // y1=z11 + z4
1015 
1016  movq_r2m(mm0, *(dataptr+14)); //save y7
1017 
1018  /************************************************
1019  * End of 1st 4 rows
1020  ************************************************/
1021 
1022  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1023  movq_r2r(mm7, mm0); // copy x0
1024 
1025  movq_r2m(mm6, *(dataptr+2)); //save y1
1026 
1027  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1028  movq_r2r(mm1, mm6); // copy x1
1029 
1030  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1031 
1032  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1033  movq_r2r(mm2, mm5); // copy x2
1034 
1035  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1036  movq_r2r(mm3, mm4); // copy x3
1037 
1038  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1039 
1040  movq_r2m(mm7, tmp7); // save tmp07
1041  movq_r2r(mm0, mm7); // copy tmp00
1042 
1043  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1044 
1045  /* stage 2, Even Part */
1046 
1047  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1048 
1049  movq_r2m(mm6, tmp6); // save tmp07
1050  movq_r2r(mm1, mm6); // copy tmp01
1051 
1052  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1053  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1054 
1055  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1056 
1057  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1058  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1059 
1060  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1061 
1062  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1063  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1064 
1065  /* stage 3, Even and stage 4 & 5 even */
1066 
1067  movq_m2r(tmp6, mm2); // load tmp6
1068  movq_r2r(mm0, mm3); // copy tmp10
1069 
1070  psllw_i2r(2, mm6); // shift z1
1071  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1072 
1073  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1074  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1075 
1076  movq_r2m(mm0, *(dataptr+1)); //save y0
1077  movq_r2r(mm7, mm0); // copy tmp13
1078 
1079  /* odd part */
1080 
1081  movq_r2m(mm3, *(dataptr+9)); //save y4
1082  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1083 
1084  movq_m2r(tmp7, mm3); // load tmp7
1085  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1086 
1087  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1088  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1089 
1090  movq_r2m(mm0, *(dataptr+5)); //save y2
1091  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1092 
1093  /* stage 4 */
1094 
1095  movq_r2m(mm7, *(dataptr+13)); //save y6
1096  movq_r2r(mm4, mm1); // copy tmp10
1097 
1098  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1099  psllw_i2r(2, mm4); // shift tmp10
1100 
1101  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1102  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1103 
1104  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1105  psllw_i2r(2, mm5); // prepare for multiply
1106 
1107  pmulhw_r2r(mm0, mm4); // multiply by converted real
1108 
1109  /* stage 5 */
1110 
1111  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1112  psllw_i2r(2, mm2); // prepare for multiply
1113 
1114  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1115  movq_r2r(mm3, mm0); // copy tmp7
1116 
1117  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1118  paddw_r2r(mm1, mm4); // z2
1119 
1120  paddw_r2r(mm5, mm0); // z11
1121  psubw_r2r(mm5, mm3); // z13
1122 
1123  /* stage 6 */
1124 
1125  movq_r2r(mm3, mm5); // copy z13
1126  paddw_r2r(mm1, mm2); // z4
1127 
1128  movq_r2r(mm0, mm6); // copy z11
1129  psubw_r2r(mm4, mm5); // y3
1130 
1131  paddw_r2r(mm2, mm6); // y1
1132  paddw_r2r(mm4, mm3); // y5
1133 
1134  movq_r2m(mm5, *(dataptr+7)); //save y3
1135 
1136  movq_r2m(mm6, *(dataptr+3)); //save y1
1137  psubw_r2r(mm2, mm0); // y7
1138 
1139 /************************************************************************************************
1140  Start of Transpose
1141 ************************************************************************************************/
1142 
1143  movq_m2r(*(dataptr+13), mm6); // m23:m22|m21:m20 - third line (line 6)and copy into m2
1144  movq_r2r(mm7, mm5); // copy first line
1145 
1146  punpcklwd_r2r(mm3, mm7); // m11:m01|m10:m00 - interleave first and second lines
1147  movq_r2r(mm6, mm2); // copy third line
1148 
1149  punpcklwd_r2r(mm0, mm6); // m31:m21|m30:m20 - interleave third and fourth lines
1150  movq_r2r(mm7, mm1); // copy first intermediate result
1151 
1152  punpckldq_r2r(mm6, mm7); // m30:m20|m10:m00 - interleave to produce result 1
1153 
1154  punpckhdq_r2r(mm6, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1155 
1156  movq_r2m(mm7, *(dataptr+9)); // write result 1
1157  punpckhwd_r2r(mm3, mm5); // m13:m03|m12:m02 - interleave first and second lines
1158 
1159  movq_r2m(mm1, *(dataptr+11)); // write result 2
1160  punpckhwd_r2r(mm0, mm2); // m33:m23|m32:m22 - interleave third and fourth lines
1161 
1162  movq_r2r(mm5, mm1); // copy first intermediate result
1163  punpckldq_r2r(mm2, mm5); // m32:m22|m12:m02 - interleave to produce result 3
1164 
1165  movq_m2r(*(dataptr+1), mm0); // m03:m02|m01:m00 - first line, 4x4
1166  punpckhdq_r2r(mm2, mm1); // m33:m23|m13:m03 - interleave to produce result 4
1167 
1168  movq_r2m(mm5, *(dataptr+13)); // write result 3
1169 
1170  /****** last 4x4 done */
1171 
1172  movq_r2m(mm1, *(dataptr+15)); // write result 4, last 4x4
1173 
1174  movq_m2r(*(dataptr+5), mm2); // m23:m22|m21:m20 - third line
1175  movq_r2r(mm0, mm6); // copy first line
1176 
1177  punpcklwd_m2r(*(dataptr+3), mm0); // m11:m01|m10:m00 - interleave first and second lines
1178  movq_r2r(mm2, mm7); // copy third line
1179 
1180  punpcklwd_m2r(*(dataptr+7), mm2); // m31:m21|m30:m20 - interleave third and fourth lines
1181  movq_r2r(mm0, mm4); // copy first intermediate result
1182 
1183 
1184 
1185  movq_m2r(*(dataptr+8), mm1); // n03:n02|n01:n00 - first line
1186  punpckldq_r2r(mm2, mm0); // m30:m20|m10:m00 - interleave to produce first result
1187 
1188  movq_m2r(*(dataptr+12), mm3); // n23:n22|n21:n20 - third line
1189  punpckhdq_r2r(mm2, mm4); // m31:m21|m11:m01 - interleave to produce second result
1190 
1191  punpckhwd_m2r(*(dataptr+3), mm6); // m13:m03|m12:m02 - interleave first and second lines
1192  movq_r2r(mm1, mm2); // copy first line
1193 
1194  punpckhwd_m2r(*(dataptr+7), mm7); // m33:m23|m32:m22 - interleave third and fourth lines
1195  movq_r2r(mm6, mm5); // copy first intermediate result
1196 
1197  movq_r2m(mm0, *(dataptr+8)); // write result 1
1198  punpckhdq_r2r(mm7, mm5); // m33:m23|m13:m03 - produce third result
1199 
1200  punpcklwd_m2r(*(dataptr+10), mm1); // n11:n01|n10:n00 - interleave first and second lines
1201  movq_r2r(mm3, mm0); // copy third line
1202 
1203  punpckhwd_m2r(*(dataptr+10), mm2); // n13:n03|n12:n02 - interleave first and second lines
1204 
1205  movq_r2m(mm4, *(dataptr+10)); // write result 2 out
1206  punpckldq_r2r(mm7, mm6); // m32:m22|m12:m02 - produce fourth result
1207 
1208  punpcklwd_m2r(*(dataptr+14), mm3); // n33:n23|n32:n22 - interleave third and fourth lines
1209  movq_r2r(mm1, mm4); // copy second intermediate result
1210 
1211  movq_r2m(mm6, *(dataptr+12)); // write result 3 out
1212  punpckldq_r2r(mm3, mm1); //
1213 
1214  punpckhwd_m2r(*(dataptr+14), mm0); // n33:n23|n32:n22 - interleave third and fourth lines
1215  movq_r2r(mm2, mm6); // copy second intermediate result
1216 
1217  movq_r2m(mm5, *(dataptr+14)); // write result 4 out
1218  punpckhdq_r2r(mm3, mm4); // n31:n21|n11:n01- produce second result
1219 
1220  movq_r2m(mm1, *(dataptr+1)); // write result 5 out - (first result for other 4 x 4 block)
1221  punpckldq_r2r(mm0, mm2); // n32:n22|n12:n02- produce third result
1222 
1223  movq_r2m(mm4, *(dataptr+3)); // write result 6 out
1224  punpckhdq_r2r(mm0, mm6); // n33:n23|n13:n03 - produce fourth result
1225 
1226  movq_r2m(mm2, *(dataptr+5)); // write result 7 out
1227 
1228  movq_m2r(*dataptr, mm0); // m03:m02|m01:m00 - first line, first 4x4
1229 
1230  movq_r2m(mm6, *(dataptr+7)); // write result 8 out
1231 
1232 // Do first 4x4 quadrant, which is used in the beginning of the DCT:
1233 
1234  movq_m2r(*(dataptr+4), mm7); // m23:m22|m21:m20 - third line
1235  movq_r2r(mm0, mm2); // copy first line
1236 
1237  punpcklwd_m2r(*(dataptr+2), mm0); // m11:m01|m10:m00 - interleave first and second lines
1238  movq_r2r(mm7, mm4); // copy third line
1239 
1240  punpcklwd_m2r(*(dataptr+6), mm7); // m31:m21|m30:m20 - interleave third and fourth lines
1241  movq_r2r(mm0, mm1); // copy first intermediate result
1242 
1243  movq_m2r(*(dataptr+2), mm6); // m13:m12|m11:m10 - second line
1244  punpckldq_r2r(mm7, mm0); // m30:m20|m10:m00 - interleave to produce result 1
1245 
1246  movq_m2r(*(dataptr+6), mm5); // m33:m32|m31:m30 - fourth line
1247  punpckhdq_r2r(mm7, mm1); // m31:m21|m11:m01 - interleave to produce result 2
1248 
1249  movq_r2r(mm0, mm7); // write result 1
1250  punpckhwd_r2r(mm6, mm2); // m13:m03|m12:m02 - interleave first and second lines
1251 
1252  psubw_m2r(*(dataptr+14), mm7); // tmp07=x0-x7 /* Stage 1 */
1253  movq_r2r(mm1, mm6); // write result 2
1254 
1255  paddw_m2r(*(dataptr+14), mm0); // tmp00=x0+x7 /* Stage 1 */
1256  punpckhwd_r2r(mm5, mm4); // m33:m23|m32:m22 - interleave third and fourth lines
1257 
1258  paddw_m2r(*(dataptr+12), mm1); // tmp01=x1+x6 /* Stage 1 */
1259  movq_r2r(mm2, mm3); // copy first intermediate result
1260 
1261  psubw_m2r(*(dataptr+12), mm6); // tmp06=x1-x6 /* Stage 1 */
1262  punpckldq_r2r(mm4, mm2); // m32:m22|m12:m02 - interleave to produce result 3
1263 
1264  movq_r2m(mm7, tmp7); // save tmp07
1265  movq_r2r(mm2, mm5); // write result 3
1266 
1267  movq_r2m(mm6, tmp6); // save tmp06
1268 
1269  punpckhdq_r2r(mm4, mm3); // m33:m23|m13:m03 - interleave to produce result 4
1270 
1271  paddw_m2r(*(dataptr+10), mm2); // tmp02=x2+x5 /* stage 1 */
1272  movq_r2r(mm3, mm4); // write result 4
1273 
1274 /************************************************************************************************
1275  End of Transpose 2
1276 ************************************************************************************************/
1277 
1278  paddw_m2r(*(dataptr+8), mm3); // tmp03=x3+x4 /* stage 1*/
1279  movq_r2r(mm0, mm7);
1280 
1281  psubw_m2r(*(dataptr+8), mm4); // tmp04=x3-x4 /* stage 1*/
1282  movq_r2r(mm1, mm6);
1283 
1284  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03 /* even 2 */
1285  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03 /* even 2 */
1286 
1287  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02 /* even 2 */
1288  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02 /* even 2 */
1289 
1290  psubw_m2r(*(dataptr+10), mm5); // tmp05=x2-x5 /* stage 1*/
1291  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1292 
1293  /* stage 3 */
1294 
1295  movq_m2r(tmp6, mm2);
1296  movq_r2r(mm0, mm3);
1297 
1298  psllw_i2r(2, mm6); // m8 * 2^2
1299  paddw_r2r(mm1, mm0);
1300 
1301  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1302  psubw_r2r(mm1, mm3);
1303 
1304  movq_r2m(mm0, *dataptr);
1305  movq_r2r(mm7, mm0);
1306 
1307  /* Odd part */
1308  movq_r2m(mm3, *(dataptr+8));
1309  paddw_r2r(mm5, mm4); // tmp10
1310 
1311  movq_m2r(tmp7, mm3);
1312  paddw_r2r(mm6, mm0); // tmp32
1313 
1314  paddw_r2r(mm2, mm5); // tmp11
1315  psubw_r2r(mm6, mm7); // tmp33
1316 
1317  movq_r2m(mm0, *(dataptr+4));
1318  paddw_r2r(mm3, mm2); // tmp12
1319 
1320  /* stage 4 */
1321  movq_r2m(mm7, *(dataptr+12));
1322  movq_r2r(mm4, mm1); // copy of tmp10
1323 
1324  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1325  psllw_i2r(2, mm4); // m8 * 2^2
1326 
1327  movq_m2r(RTjpeg_C2mC6, mm0);
1328  psllw_i2r(2, mm1);
1329 
1330  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1331  psllw_i2r(2, mm2);
1332 
1333  pmulhw_r2r(mm0, mm4); // z5
1334 
1335  /* stage 5 */
1336 
1337  pmulhw_m2r(RTjpeg_C2pC6, mm2);
1338  psllw_i2r(2, mm5);
1339 
1340  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1341  movq_r2r(mm3, mm0); // copy tmp7
1342 
1343  movq_m2r(*(dataptr+1), mm7);
1344  paddw_r2r(mm1, mm4); // z2
1345 
1346  paddw_r2r(mm1, mm2); // z4
1347 
1348  paddw_r2r(mm5, mm0); // z11
1349  psubw_r2r(mm5, mm3); // z13
1350 
1351  /* stage 6 */
1352 
1353  movq_r2r(mm3, mm5); // copy z13
1354  psubw_r2r(mm4, mm3); // y3=z13 - z2
1355 
1356  paddw_r2r(mm4, mm5); // y5=z13 + z2
1357  movq_r2r(mm0, mm6); // copy z11
1358 
1359  movq_r2m(mm3, *(dataptr+6)); //save y3
1360  psubw_r2r(mm2, mm0); // y7=z11 - z4
1361 
1362  movq_r2m(mm5, *(dataptr+10)); //save y5
1363  paddw_r2r(mm2, mm6); // y1=z11 + z4
1364 
1365  movq_r2m(mm0, *(dataptr+14)); //save y7
1366 
1367  /************************************************
1368  * End of 1st 4 rows
1369  ************************************************/
1370 
1371  movq_m2r(*(dataptr+3), mm1); // load x1 /* stage 1 */
1372  movq_r2r(mm7, mm0); // copy x0
1373 
1374  movq_r2m(mm6, *(dataptr+2)); //save y1
1375 
1376  movq_m2r(*(dataptr+5), mm2); // load x2 /* stage 1 */
1377  movq_r2r(mm1, mm6); // copy x1
1378 
1379  paddw_m2r(*(dataptr+15), mm0); // tmp00 = x0 + x7
1380 
1381  movq_m2r(*(dataptr+7), mm3); // load x3 /* stage 1 */
1382  movq_r2r(mm2, mm5); // copy x2
1383 
1384  psubw_m2r(*(dataptr+15), mm7); // tmp07 = x0 - x7
1385  movq_r2r(mm3, mm4); // copy x3
1386 
1387  paddw_m2r(*(dataptr+13), mm1); // tmp01 = x1 + x6
1388 
1389  movq_r2m(mm7, tmp7); // save tmp07
1390  movq_r2r(mm0, mm7); // copy tmp00
1391 
1392  psubw_m2r(*(dataptr+13), mm6); // tmp06 = x1 - x6
1393 
1394  /* stage 2, Even Part */
1395 
1396  paddw_m2r(*(dataptr+9), mm3); // tmp03 = x3 + x4
1397 
1398  movq_r2m(mm6, tmp6); // save tmp06
1399  movq_r2r(mm1, mm6); // copy tmp01
1400 
1401  paddw_m2r(*(dataptr+11), mm2); // tmp02 = x2 + x5
1402  paddw_r2r(mm3, mm0); // tmp10 = tmp00 + tmp03
1403 
1404  psubw_r2r(mm3, mm7); // tmp13 = tmp00 - tmp03
1405 
1406  psubw_m2r(*(dataptr+9), mm4); // tmp04 = x3 - x4
1407  psubw_r2r(mm2, mm6); // tmp12 = tmp01 - tmp02
1408 
1409  paddw_r2r(mm2, mm1); // tmp11 = tmp01 + tmp02
1410 
1411  psubw_m2r(*(dataptr+11), mm5); // tmp05 = x2 - x5
1412  paddw_r2r(mm7, mm6); // tmp12 + tmp13
1413 
1414  /* stage 3, Even and stage 4 & 5 even */
1415 
1416  movq_m2r(tmp6, mm2); // load tmp6
1417  movq_r2r(mm0, mm3); // copy tmp10
1418 
1419  psllw_i2r(2, mm6); // shift z1
1420  paddw_r2r(mm1, mm0); // y0=tmp10 + tmp11
1421 
1422  pmulhw_m2r(RTjpeg_C4, mm6); // z1
1423  psubw_r2r(mm1, mm3); // y4=tmp10 - tmp11
1424 
1425  movq_r2m(mm0, *(dataptr+1)); //save y0
1426  movq_r2r(mm7, mm0); // copy tmp13
1427 
1428  /* odd part */
1429 
1430  movq_r2m(mm3, *(dataptr+9)); //save y4
1431  paddw_r2r(mm5, mm4); // tmp10 = tmp4 + tmp5
1432 
1433  movq_m2r(tmp7, mm3); // load tmp7
1434  paddw_r2r(mm6, mm0); // tmp32 = tmp13 + z1
1435 
1436  paddw_r2r(mm2, mm5); // tmp11 = tmp5 + tmp6
1437  psubw_r2r(mm6, mm7); // tmp33 = tmp13 - z1
1438 
1439  movq_r2m(mm0, *(dataptr+5)); //save y2
1440  paddw_r2r(mm3, mm2); // tmp12 = tmp6 + tmp7
1441 
1442  /* stage 4 */
1443 
1444  movq_r2m(mm7, *(dataptr+13)); //save y6
1445  movq_r2r(mm4, mm1); // copy tmp10
1446 
1447  psubw_r2r(mm2, mm1); // tmp10 - tmp12
1448  psllw_i2r(2, mm4); // shift tmp10
1449 
1450  movq_m2r(RTjpeg_C2mC6, mm0); // load C2mC6
1451  psllw_i2r(2, mm1); // shift (tmp10-tmp12)
1452 
1453  pmulhw_m2r(RTjpeg_C6, mm1); // z5
1454  psllw_i2r(2, mm5); // prepare for multiply
1455 
1456  pmulhw_r2r(mm0, mm4); // multiply by converted real
1457 
1458  /* stage 5 */
1459 
1460  pmulhw_m2r(RTjpeg_C4, mm5); // z3
1461  psllw_i2r(2, mm2); // prepare for multiply
1462 
1463  pmulhw_m2r(RTjpeg_C2pC6, mm2); // multiply
1464  movq_r2r(mm3, mm0); // copy tmp7
1465 
1466  movq_m2r(*(dataptr+9), mm7); // m03:m02|m01:m00 - first line (line 4)and copy into mm7
1467  paddw_r2r(mm1, mm4); // z2
1468 
1469  paddw_r2r(mm5, mm0); // z11
1470  psubw_r2r(mm5, mm3); // z13
1471 
1472  /* stage 6 */
1473 
1474  movq_r2r(mm3, mm5); // copy z13
1475  paddw_r2r(mm1, mm2); // z4
1476 
1477  movq_r2r(mm0, mm6); // copy z11
1478  psubw_r2r(mm4, mm5); // y3
1479 
1480  paddw_r2r(mm2, mm6); // y1
1481  paddw_r2r(mm4, mm3); // y5
1482 
1483  movq_r2m(mm5, *(dataptr+7)); //save y3
1484  psubw_r2r(mm2, mm0); // y7=z11 - z4
1485 
1486  movq_r2m(mm3, *(dataptr+11)); //save y5
1487 
1488  movq_r2m(mm6, *(dataptr+3)); //save y1
1489 
1490  movq_r2m(mm0, *(dataptr+15)); //save y7
1491 
1492 
1493 #endif
1494 }
1495 
/*
 * Fixed-point multiplier constants with 8 fractional bits:
 * FIX(x) == x * 256, rounded to the nearest integer.  They are meant to be
 * used with MULTIPLY() below, which removes the 8 fractional bits again.
 */
#define FIX_1_082392200 ((int32_t) 277) /* FIX(1.082392200) */
#define FIX_1_414213562 ((int32_t) 362) /* FIX(1.414213562) */
#define FIX_1_847759065 ((int32_t) 473) /* FIX(1.847759065) */
#define FIX_2_613125930 ((int32_t) 669) /* FIX(2.613125930) */

/*
 * Divide x by 8 with rounding (add half of 8, then shift right by 3) and
 * narrow the result to int16_t.  Fully parenthesized so the macro can be
 * embedded in a larger expression.
 */
#define DESCALE(x) ((int16_t)(((x) + 4) >> 3))

/*
 * Clip a YUV sample to 16..235 (should be 16..240 for Cr/Cb, but ...).
 *
 * The whole conditional is wrapped in parentheses: without them an
 * expression such as `2 * RL(v)` or `RL(v) + 1` would bind to the condition
 * or to the last branch of the ?: instead of to the clipped value.
 * NOTE: x is evaluated more than once; do not pass an argument with side
 * effects.
 */
#define RL(x) (((x) > 235) ? 235 : (((x) < 16) ? 16 : (x)))

/*
 * Multiply var by a fixed-point constant c that carries 8 fractional bits,
 * rounding to nearest (+128) before the 8 fractional bits are shifted out.
 * The second parameter is named `c` rather than the keyword `const`.
 */
#define MULTIPLY(var, c) (((int32_t) ((var) * (c)) + 128) >> 8)
1507 
// NOTE(review): the signature line that belongs to this body (listing line
// 1508) is missing from this listing. The body touches m_liqt / m_ciqt, so it
// is presumably a member function of RTjpeg -- confirm the exact name and
// signature against the original source.
//
// Rescales all 64 entries of m_liqt and m_ciqt in place. Each entry is
// multiplied by the RTjpeg_aan_tab factor at the same index and the product is
// shifted right by 32 bits, so the table value acts as a fixed-point factor
// with 32 fractional bits (the first table entry, 4294967296, is 2^32, i.e. a
// factor of exactly 1).
1509 {
1510  for(int i = 0; i < 64; i++)
1511  {
// The entry is widened to uint64_t first so the multiplication by the 64-bit
// table value is carried out in 64-bit arithmetic before the >> 32.
1512  m_liqt[i] = ((uint64_t)m_liqt[i] * RTjpeg_aan_tab[i]) >> 32;
1513  m_ciqt[i] = ((uint64_t)m_ciqt[i] * RTjpeg_aan_tab[i]) >> 32;
1514  }
1515 }
1516 
1517 void RTjpeg::Idct(uint8_t *odata, int16_t *data, int rskip)
1518 {
1519 #ifdef MMX
1520 
1521 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1522 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1523 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1524 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1525 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
1526 
1527  auto *wsptr = (mmx_t *)m_ws;
1528  auto *dataptr = (mmx_t *)odata;
1529  auto *idata = (mmx_t *)data;
1530 
1531  rskip = rskip>>3;
1532 /*
1533  * Perform inverse DCT on one block of coefficients.
1534  */
1535 
1536  /* Odd part */
1537 
1538  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1539 
1540  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1541 
1542  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1543 
1544  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1545 
1546  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1547 
1548  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1549 
1550  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1551 
1552  psllw_i2r(2, mm2); // shift z10
1553  movq_r2r(mm2, mm0); // copy z10
1554 
1555  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1556  movq_r2r(mm3, mm5); // copy tmp4
1557 
1558  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1559  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1560 
1561  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1562  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1563 
1564  psubw_r2r(mm1, mm6); // z11-z13
1565  psllw_i2r(2, mm5); // shift z12
1566 
1567  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1568  movq_r2r(mm5, mm7); // copy z12
1569 
1570  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1571  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1572 
1573  //ok
1574 
1575  /* Even part */
1576  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1577  psllw_i2r(2, mm6);
1578 
1579  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1580 
1581  paddw_r2r(mm5, mm0); // tmp10
1582 
1583  paddw_r2r(mm7, mm2); // tmp12
1584 
1585  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1586  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1587 
1588  movq_r2r(mm1, mm5); // copy tmp1
1589  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1590 
1591  psubw_r2r(mm4, mm5); // tmp1-tmp3
1592  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1593 
1594  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1595  psllw_i2r(2, mm5); // shift tmp1-tmp3
1596 
1597  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1598 
1599  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1600  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1601 
1602  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1603 
1604  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1605 
1606  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1607  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1608 
1609  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1610  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1611 
1612  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1613  movq_r2r(mm1, mm5); // copy tmp11
1614 
1615  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1616  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1617 
1618  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1619 
1620  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1621  movq_r2r(mm7, mm0); // copy tmp0
1622 
1623  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1624  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1625 
1626  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1627 
1628  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1629  movq_r2r(mm1, mm3); // copy tmp1
1630 
1631  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1632  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1633 
1634  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1635 
1636  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1637  movq_r2r(mm4, mm1); // copy tmp3
1638 
1639  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1640 
1641  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1642 
1643  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1644 
1645  movq_r2m(mm4, *(wsptr+8));
1646  movq_r2r(mm5, mm7); // copy tmp2
1647 
1648  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1649 
1650  movq_r2m(mm1, *(wsptr+6));
1651  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1652 
1653  movq_r2m(mm5, *(wsptr+4));
1654 
1655  movq_r2m(mm7, *(wsptr+10));
1656 
1657  //ok
1658 
1659 
1660 /*****************************************************************/
1661 
1662  idata++;
1663  wsptr++;
1664 
1665 /*****************************************************************/
1666 
1667  movq_m2r(*(idata+10), mm1); // load idata[DCTSIZE*5]
1668 
1669  movq_m2r(*(idata+6), mm0); // load idata[DCTSIZE*3]
1670 
1671  movq_m2r(*(idata+2), mm3); // load idata[DCTSIZE*1]
1672  movq_r2r(mm1, mm2); // copy tmp6 /* phase 6 */
1673 
1674  movq_m2r(*(idata+14), mm4); // load idata[DCTSIZE*7]
1675  paddw_r2r(mm0, mm1); // z13 = tmp6 + tmp5;
1676 
1677  psubw_r2r(mm0, mm2); // z10 = tmp6 - tmp5
1678 
1679  psllw_i2r(2, mm2); // shift z10
1680  movq_r2r(mm2, mm0); // copy z10
1681 
1682  pmulhw_m2r(s_fix184n261, mm2); // MULTIPLY( z12, FIX_1_847759065); /* 2*c2 */
1683  movq_r2r(mm3, mm5); // copy tmp4
1684 
1685  pmulhw_m2r(s_fixN184, mm0); // MULTIPLY(z10, -FIX_1_847759065); /* 2*c2 */
1686  paddw_r2r(mm4, mm3); // z11 = tmp4 + tmp7;
1687 
1688  movq_r2r(mm3, mm6); // copy z11 /* phase 5 */
1689  psubw_r2r(mm4, mm5); // z12 = tmp4 - tmp7;
1690 
1691  psubw_r2r(mm1, mm6); // z11-z13
1692  psllw_i2r(2, mm5); // shift z12
1693 
1694  movq_m2r(*(idata+12), mm4); // load idata[DCTSIZE*6], even part
1695  movq_r2r(mm5, mm7); // copy z12
1696 
1697  pmulhw_m2r(s_fix108n184, mm5); // MULT(z12, (FIX_1_08-FIX_1_84)) //- z5; /* 2*(c2-c6) */ even part
1698  paddw_r2r(mm1, mm3); // tmp7 = z11 + z13;
1699 
1700  //ok
1701 
1702  /* Even part */
1703  pmulhw_m2r(s_fix184, mm7); // MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) //+ z5; /* -2*(c2+c6) */
1704  psllw_i2r(2, mm6);
1705 
1706  movq_m2r(*(idata+4), mm1); // load idata[DCTSIZE*2]
1707 
1708  paddw_r2r(mm5, mm0); // tmp10
1709 
1710  paddw_r2r(mm7, mm2); // tmp12
1711 
1712  pmulhw_m2r(s_fix141, mm6); // tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562); /* 2*c4 */
1713  psubw_r2r(mm3, mm2); // tmp6 = tmp12 - tmp7
1714 
1715  movq_r2r(mm1, mm5); // copy tmp1
1716  paddw_r2r(mm4, mm1); // tmp13= tmp1 + tmp3; /* phases 5-3 */
1717 
1718  psubw_r2r(mm4, mm5); // tmp1-tmp3
1719  psubw_r2r(mm2, mm6); // tmp5 = tmp11 - tmp6;
1720 
1721  movq_r2m(mm1, *(wsptr)); // save tmp13 in workspace
1722  psllw_i2r(2, mm5); // shift tmp1-tmp3
1723 
1724  movq_m2r(*(idata), mm7); // load idata[DCTSIZE*0]
1725  paddw_r2r(mm6, mm0); // tmp4 = tmp10 + tmp5;
1726 
1727  pmulhw_m2r(s_fix141, mm5); // MULTIPLY(tmp1 - tmp3, FIX_1_414213562)
1728 
1729  movq_m2r(*(idata+8), mm4); // load idata[DCTSIZE*4]
1730 
1731  psubw_r2r(mm1, mm5); // tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13; /* 2*c4 */
1732 
1733  movq_r2m(mm0, *(wsptr+4)); // save tmp4 in workspace
1734  movq_r2r(mm7, mm1); // copy tmp0 /* phase 3 */
1735 
1736  movq_r2m(mm5, *(wsptr+2)); // save tmp12 in workspace
1737  psubw_r2r(mm4, mm1); // tmp11 = tmp0 - tmp2;
1738 
1739  paddw_r2r(mm4, mm7); // tmp10 = tmp0 + tmp2;
1740  movq_r2r(mm1, mm5); // copy tmp11
1741 
1742  paddw_m2r(*(wsptr+2), mm1); // tmp1 = tmp11 + tmp12;
1743  movq_r2r(mm7, mm4); // copy tmp10 /* phase 2 */
1744 
1745  paddw_m2r(*(wsptr), mm7); // tmp0 = tmp10 + tmp13;
1746 
1747  psubw_m2r(*(wsptr), mm4); // tmp3 = tmp10 - tmp13;
1748  movq_r2r(mm7, mm0); // copy tmp0
1749 
1750  psubw_m2r(*(wsptr+2), mm5); // tmp2 = tmp11 - tmp12;
1751  paddw_r2r(mm3, mm7); // wsptr[DCTSIZE*0] = (int) (tmp0 + tmp7);
1752 
1753  psubw_r2r(mm3, mm0); // wsptr[DCTSIZE*7] = (int) (tmp0 - tmp7);
1754 
1755  movq_r2m(mm7, *(wsptr)); // wsptr[DCTSIZE*0]
1756  movq_r2r(mm1, mm3); // copy tmp1
1757 
1758  movq_r2m(mm0, *(wsptr+14)); // wsptr[DCTSIZE*7]
1759  paddw_r2r(mm2, mm1); // wsptr[DCTSIZE*1] = (int) (tmp1 + tmp6);
1760 
1761  psubw_r2r(mm2, mm3); // wsptr[DCTSIZE*6] = (int) (tmp1 - tmp6);
1762 
1763  movq_r2m(mm1, *(wsptr+2)); // wsptr[DCTSIZE*1]
1764  movq_r2r(mm4, mm1); // copy tmp3
1765 
1766  movq_r2m(mm3, *(wsptr+12)); // wsptr[DCTSIZE*6]
1767 
1768  paddw_m2r(*(wsptr+4), mm4); // wsptr[DCTSIZE*4] = (int) (tmp3 + tmp4);
1769 
1770  psubw_m2r(*(wsptr+4), mm1); // wsptr[DCTSIZE*3] = (int) (tmp3 - tmp4);
1771 
1772  movq_r2m(mm4, *(wsptr+8));
1773  movq_r2r(mm5, mm7); // copy tmp2
1774 
1775  paddw_r2r(mm6, mm5); // wsptr[DCTSIZE*2] = (int) (tmp2 + tmp5)
1776 
1777  movq_r2m(mm1, *(wsptr+6));
1778  psubw_r2r(mm6, mm7); // wsptr[DCTSIZE*5] = (int) (tmp2 - tmp5);
1779 
1780  movq_r2m(mm5, *(wsptr+4));
1781 
1782  movq_r2m(mm7, *(wsptr+10));
1783 
1784 /*****************************************************************/
1785 
1786  /* Pass 2: process rows from work array, store into output array. */
1787  /* Note that we must descale the results by a factor of 8 == 2**3, */
1788  /* and also undo the PASS1_BITS scaling. */
1789 
1790 /*****************************************************************/
1791  /* Even part */
1792 
1793  wsptr--;
1794 
1795 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
1796 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
1797 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
1798 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
1799  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
1800 
1801  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
1802  movq_r2r(mm0, mm2);
1803 
1804  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
1805  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
1806 
1807  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
1808  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
1809 
1810  movq_r2r(mm0, mm6);
1811  movq_r2r(mm3, mm5);
1812 
1813  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
1814  movq_r2r(mm2, mm1);
1815 
1816  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
1817  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
1818 
1819  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
1820  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
1821 
1822  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
1823  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1824 
1825  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
1826  movq_r2r(mm3, mm4);
1827 
1828  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
1829  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
1830 
1831  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
1832  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1833 
1834 
1835  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
1836  movq_r2r(mm6, mm2);
1837 
1838  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
1839  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
1840 
1841  movq_r2r(mm3, mm5);
1842  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
1843 
1844  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
1845  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
1846 
1847  movq_r2r(mm4, mm7);
1848  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
1849 
1850  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
1851 
1852  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
1853 
1854  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
1855  movq_r2r(mm1, mm6);
1856 
1857  //ok
1858 
1859 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
1860 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
1861 
1862 
1863  movq_r2r(mm0, mm2);
1864  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
1865 
1866  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
1867  psllw_i2r(2, mm6);
1868 
1869  pmulhw_m2r(s_fix141, mm6);
1870  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
1871 
1872  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
1873  movq_r2r(mm0, mm7);
1874 
1875 // tmp0 = tmp10 + tmp13;
1876 // tmp3 = tmp10 - tmp13;
1877  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
1878  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
1879 
1880 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
1881  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
1882 // tmp1 = tmp11 + tmp12;
1883 // tmp2 = tmp11 - tmp12;
1884  movq_r2r(mm1, mm5);
1885 
1886  //OK
1887 
1888  /* Odd part */
1889 
1890 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
1891 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
1892 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
1893 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
1894  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
1895  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
1896 
1897  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
1898  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
1899 
1900  movq_r2r(mm3, mm6);
1901  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
1902 
1903  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
1904  movq_r2r(mm3, mm2);
1905 
1906 //Save tmp0 and tmp1 in wsptr
1907  movq_r2m(mm0, *(wsptr)); // save tmp0
1908  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
1909 
1910 
1911 //Continue with z10 --- z13
1912  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
1913  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
1914 
1915  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
1916  movq_r2r(mm6, mm4);
1917 
1918  movq_r2m(mm1, *(wsptr+1)); // save tmp1
1919  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
1920 
1921  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
1922  movq_r2r(mm6, mm1);
1923 
1924 //Save tmp2 and tmp3 in wsptr
1925  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
1926  movq_r2r(mm2, mm4);
1927 
1928 //Continue with z10 --- z13
1929  movq_r2m(mm5, *(wsptr+2)); // save tmp2
1930  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
1931 
1932  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
1933  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
1934 
1935  movq_r2r(mm3, mm0);
1936  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
1937 
1938  movq_r2m(mm7, *(wsptr+3)); // save tmp3
1939  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
1940 
1941  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
1942  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
1943 
1944  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
1945  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
1946 
1947  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
1948  movq_r2r(mm6, mm4);
1949 
1950  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
1951  movq_r2r(mm1, mm5);
1952 
1953  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
1954  movq_r2r(mm6, mm2);
1955 
1956  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
1957  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
1958 
1959  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
1960  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
1961 
1962  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
1963  movq_r2r(mm1, mm7);
1964 
1965  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
1966  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
1967 
1968  movq_r2r(mm6, mm5);
1969  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
1970 
1971  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
1972  movq_r2r(mm2, mm4);
1973 
1974  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
1975 
1976  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
1977 
1978  punpckhdq_r2r(mm6, mm4);
1979 
1980  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
1981  movq_r2r(mm0, mm5);
1982 
1983  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
1984 
1985  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
1986  movq_r2r(mm3, mm4);
1987 
1988  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
1989  movq_r2r(mm5, mm1);
1990 
1991  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
1992 // tmp7 = z11 + z13; /* phase 5 */
1993 // tmp8 = z11 - z13; /* phase 5 */
1994  psubw_r2r(mm4, mm1); // tmp8
1995 
1996  paddw_r2r(mm4, mm5); // tmp7
1997 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
1998  psllw_i2r(2, mm1);
1999 
2000  psllw_i2r(2, mm0);
2001 
2002  pmulhw_m2r(s_fix141, mm1); // tmp21
2003 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2004 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2005  psllw_i2r(2, mm3);
2006  movq_r2r(mm0, mm7);
2007 
2008  pmulhw_m2r(s_fixN184, mm7);
2009  movq_r2r(mm3, mm6);
2010 
2011  movq_m2r(*(wsptr), mm2); // tmp0,final1
2012 
2013  pmulhw_m2r(s_fix108n184, mm6);
2014 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2015 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2016  movq_r2r(mm2, mm4); // final1
2017 
2018  pmulhw_m2r(s_fix184n261, mm0);
2019  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2020 
2021  pmulhw_m2r(s_fix184, mm3);
2022  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2023 
2024 // tmp6 = tmp22 - tmp7; /* phase 2 */
2025  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2026 
2027  paddw_r2r(mm6, mm7); // tmp20
2028  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2029 
2030  paddw_r2r(mm0, mm3); // tmp22
2031 
2032 // tmp5 = tmp21 - tmp6;
2033  psubw_r2r(mm5, mm3); // tmp6
2034 
2035 // tmp4 = tmp20 + tmp5;
2036  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2037  psubw_r2r(mm3, mm1); // tmp5
2038 
2039  movq_r2r(mm0, mm6); // final2
2040  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2041 
2042  /* Final output stage: scale down by a factor of 8 and range-limit */
2043 
2044 
2045 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2046 // & RANGE_MASK];
2047 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2048 // & RANGE_MASK]; final1
2049 
2050 
2051 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2052 // & RANGE_MASK];
2053 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2054 // & RANGE_MASK]; final2
2055  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2056  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2057 
2058  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2059 
2060  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2061 
2062  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2063  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2064 
2065 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2066 // & RANGE_MASK];
2067 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2068 // & RANGE_MASK]; final3
2069  paddw_r2r(mm1, mm7); // tmp4
2070  movq_r2r(mm5, mm3);
2071 
2072  paddw_r2r(mm1, mm5); // tmp2+tmp5
2073  psubw_r2r(mm1, mm3); // tmp2-tmp5
2074 
2075  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2076 
2077  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2078  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2079 
2080 
2081 
2082 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2083 // & RANGE_MASK];
2084 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2085 // & RANGE_MASK]; final4
2086  movq_r2r(mm4, mm6);
2087  paddw_r2r(mm7, mm4); // tmp3+tmp4
2088 
2089  psubw_r2r(mm7, mm6); // tmp3-tmp4
2090  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2091 
2092  // mov ecx, [dataptr]
2093 
2094  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2095 
2096  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2097 
2098  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2099  movq_r2r(mm2, mm4);
2100 
2101  movq_r2r(mm5, mm7);
2102  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2103 
2104  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2105  movq_r2r(mm2, mm1);
2106 
2107  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2108 
2109  // add dataptr, 4
2110 
2111  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2112 
2113  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2114 
2115  // add ecx, output_col
2116 
2117  movq_r2r(mm7, mm6);
2118  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2119 
2120  movq_r2r(mm2, mm0);
2121  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2122 
2123  // mov idata, [dataptr]
2124 
2125  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2126 
2127  // add dataptr, 4
2128 
2129  movq_r2r(mm1, mm3);
2130 
2131  // add idata, output_col
2132 
2133  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2134 
2135  movq_r2m(mm2, *(dataptr));
2136 
2137  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2138 
2139  dataptr += rskip;
2140  movq_r2m(mm0, *(dataptr));
2141 
2142  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2143  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2144 
2145  dataptr += rskip;
2146  movq_r2m(mm1, *(dataptr));
2147 
2148  dataptr += rskip;
2149  movq_r2m(mm3, *(dataptr));
2150 
2151 /*******************************************************************/
2152 
2153  wsptr += 8;
2154 
2155 /*******************************************************************/
2156 
2157 // tmp10 = ((DCTELEM) wsptr[0] + (DCTELEM) wsptr[4]);
2158 // tmp13 = ((DCTELEM) wsptr[2] + (DCTELEM) wsptr[6]);
2159 // tmp11 = ((DCTELEM) wsptr[0] - (DCTELEM) wsptr[4]);
2160 // tmp14 = ((DCTELEM) wsptr[2] - (DCTELEM) wsptr[6]);
2161  movq_m2r(*(wsptr), mm0); // wsptr[0,0],[0,1],[0,2],[0,3]
2162 
2163  movq_m2r(*(wsptr+1), mm1); // wsptr[0,4],[0,5],[0,6],[0,7]
2164  movq_r2r(mm0, mm2);
2165 
2166  movq_m2r(*(wsptr+2), mm3); // wsptr[1,0],[1,1],[1,2],[1,3]
2167  paddw_r2r(mm1, mm0); // wsptr[0,tmp10],[xxx],[0,tmp13],[xxx]
2168 
2169  movq_m2r(*(wsptr+3), mm4); // wsptr[1,4],[1,5],[1,6],[1,7]
2170  psubw_r2r(mm1, mm2); // wsptr[0,tmp11],[xxx],[0,tmp14],[xxx]
2171 
2172  movq_r2r(mm0, mm6);
2173  movq_r2r(mm3, mm5);
2174 
2175  paddw_r2r(mm4, mm3); // wsptr[1,tmp10],[xxx],[1,tmp13],[xxx]
2176  movq_r2r(mm2, mm1);
2177 
2178  psubw_r2r(mm4, mm5); // wsptr[1,tmp11],[xxx],[1,tmp14],[xxx]
2179  punpcklwd_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[xxx],[xxx]
2180 
2181  movq_m2r(*(wsptr+7), mm7); // wsptr[3,4],[3,5],[3,6],[3,7]
2182  punpckhwd_r2r(mm3, mm6); // wsptr[0,tmp13],[1,tmp13],[xxx],[xxx]
2183 
2184  movq_m2r(*(wsptr+4), mm3); // wsptr[2,0],[2,1],[2,2],[2,3]
2185  punpckldq_r2r(mm6, mm0); // wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2186 
2187  punpcklwd_r2r(mm5, mm1); // wsptr[0,tmp11],[1,tmp11],[xxx],[xxx]
2188  movq_r2r(mm3, mm4);
2189 
2190  movq_m2r(*(wsptr+6), mm6); // wsptr[3,0],[3,1],[3,2],[3,3]
2191  punpckhwd_r2r(mm5, mm2); // wsptr[0,tmp14],[1,tmp14],[xxx],[xxx]
2192 
2193  movq_m2r(*(wsptr+5), mm5); // wsptr[2,4],[2,5],[2,6],[2,7]
2194  punpckldq_r2r(mm2, mm1); // wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2195 
2196  paddw_r2r(mm5, mm3); // wsptr[2,tmp10],[xxx],[2,tmp13],[xxx]
2197  movq_r2r(mm6, mm2);
2198 
2199  psubw_r2r(mm5, mm4); // wsptr[2,tmp11],[xxx],[2,tmp14],[xxx]
2200  paddw_r2r(mm7, mm6); // wsptr[3,tmp10],[xxx],[3,tmp13],[xxx]
2201 
2202  movq_r2r(mm3, mm5);
2203  punpcklwd_r2r(mm6, mm3); // wsptr[2,tmp10],[3,tmp10],[xxx],[xxx]
2204 
2205  psubw_r2r(mm7, mm2); // wsptr[3,tmp11],[xxx],[3,tmp14],[xxx]
2206  punpckhwd_r2r(mm6, mm5); // wsptr[2,tmp13],[3,tmp13],[xxx],[xxx]
2207 
2208  movq_r2r(mm4, mm7);
2209  punpckldq_r2r(mm5, mm3); // wsptr[2,tmp10],[3,tmp10],[2,tmp13],[3,tmp13]
2210 
2211  punpcklwd_r2r(mm2, mm4); // wsptr[2,tmp11],[3,tmp11],[xxx],[xxx]
2212 
2213  punpckhwd_r2r(mm2, mm7); // wsptr[2,tmp14],[3,tmp14],[xxx],[xxx]
2214 
2215  punpckldq_r2r(mm7, mm4); // wsptr[2,tmp11],[3,tmp11],[2,tmp14],[3,tmp14]
2216  movq_r2r(mm1, mm6);
2217 
2218  //OK
2219 
2220 // mm0 = ;wsptr[0,tmp10],[1,tmp10],[0,tmp13],[1,tmp13]
2221 // mm1 = ;wsptr[0,tmp11],[1,tmp11],[0,tmp14],[1,tmp14]
2222 
2223  movq_r2r(mm0, mm2);
2224  punpckhdq_r2r(mm4, mm6); // wsptr[0,tmp14],[1,tmp14],[2,tmp14],[3,tmp14]
2225 
2226  punpckldq_r2r(mm4, mm1); // wsptr[0,tmp11],[1,tmp11],[2,tmp11],[3,tmp11]
2227  psllw_i2r(2, mm6);
2228 
2229  pmulhw_m2r(s_fix141, mm6);
2230  punpckldq_r2r(mm3, mm0); // wsptr[0,tmp10],[1,tmp10],[2,tmp10],[3,tmp10]
2231 
2232  punpckhdq_r2r(mm3, mm2); // wsptr[0,tmp13],[1,tmp13],[2,tmp13],[3,tmp13]
2233  movq_r2r(mm0, mm7);
2234 
2235 // tmp0 = tmp10 + tmp13;
2236 // tmp3 = tmp10 - tmp13;
2237  paddw_r2r(mm2, mm0); // [0,tmp0],[1,tmp0],[2,tmp0],[3,tmp0]
2238  psubw_r2r(mm2, mm7); // [0,tmp3],[1,tmp3],[2,tmp3],[3,tmp3]
2239 
2240 // tmp12 = MULTIPLY(tmp14, FIX_1_414213562) - tmp13;
2241  psubw_r2r(mm2, mm6); // wsptr[0,tmp12],[1,tmp12],[2,tmp12],[3,tmp12]
2242 // tmp1 = tmp11 + tmp12;
2243 // tmp2 = tmp11 - tmp12;
2244  movq_r2r(mm1, mm5);
2245 
2246  //OK
2247 
2248 
2249  /* Odd part */
2250 
2251 // z13 = (DCTELEM) wsptr[5] + (DCTELEM) wsptr[3];
2252 // z10 = (DCTELEM) wsptr[5] - (DCTELEM) wsptr[3];
2253 // z11 = (DCTELEM) wsptr[1] + (DCTELEM) wsptr[7];
2254 // z12 = (DCTELEM) wsptr[1] - (DCTELEM) wsptr[7];
2255  movq_m2r(*(wsptr), mm3); // wsptr[0,0],[0,1],[0,2],[0,3]
2256  paddw_r2r(mm6, mm1); // [0,tmp1],[1,tmp1],[2,tmp1],[3,tmp1]
2257 
2258  movq_m2r(*(wsptr+1), mm4); // wsptr[0,4],[0,5],[0,6],[0,7]
2259  psubw_r2r(mm6, mm5); // [0,tmp2],[1,tmp2],[2,tmp2],[3,tmp2]
2260 
2261  movq_r2r(mm3, mm6);
2262  punpckldq_r2r(mm4, mm3); // wsptr[0,0],[0,1],[0,4],[0,5]
2263 
2264  punpckhdq_r2r(mm6, mm4); // wsptr[0,6],[0,7],[0,2],[0,3]
2265  movq_r2r(mm3, mm2);
2266 
2267 //Save tmp0 and tmp1 in wsptr
2268  movq_r2m(mm0, *(wsptr)); // save tmp0
2269  paddw_r2r(mm4, mm2); // wsptr[xxx],[0,z11],[xxx],[0,z13]
2270 
2271 
2272 //Continue with z10 --- z13
2273  movq_m2r(*(wsptr+2), mm6); // wsptr[1,0],[1,1],[1,2],[1,3]
2274  psubw_r2r(mm4, mm3); // wsptr[xxx],[0,z12],[xxx],[0,z10]
2275 
2276  movq_m2r(*(wsptr+3), mm0); // wsptr[1,4],[1,5],[1,6],[1,7]
2277  movq_r2r(mm6, mm4);
2278 
2279  movq_r2m(mm1, *(wsptr+1)); // save tmp1
2280  punpckldq_r2r(mm0, mm6); // wsptr[1,0],[1,1],[1,4],[1,5]
2281 
2282  punpckhdq_r2r(mm4, mm0); // wsptr[1,6],[1,7],[1,2],[1,3]
2283  movq_r2r(mm6, mm1);
2284 
2285 //Save tmp2 and tmp3 in wsptr
2286  paddw_r2r(mm0, mm6); // wsptr[xxx],[1,z11],[xxx],[1,z13]
2287  movq_r2r(mm2, mm4);
2288 
2289 //Continue with z10 --- z13
2290  movq_r2m(mm5, *(wsptr+2)); // save tmp2
2291  punpcklwd_r2r(mm6, mm2); // wsptr[xxx],[xxx],[0,z11],[1,z11]
2292 
2293  psubw_r2r(mm0, mm1); // wsptr[xxx],[1,z12],[xxx],[1,z10]
2294  punpckhwd_r2r(mm6, mm4); // wsptr[xxx],[xxx],[0,z13],[1,z13]
2295 
2296  movq_r2r(mm3, mm0);
2297  punpcklwd_r2r(mm1, mm3); // wsptr[xxx],[xxx],[0,z12],[1,z12]
2298 
2299  movq_r2m(mm7, *(wsptr+3)); // save tmp3
2300  punpckhwd_r2r(mm1, mm0); // wsptr[xxx],[xxx],[0,z10],[1,z10]
2301 
2302  movq_m2r(*(wsptr+4), mm6); // wsptr[2,0],[2,1],[2,2],[2,3]
2303  punpckhdq_r2r(mm2, mm0); // wsptr[0,z10],[1,z10],[0,z11],[1,z11]
2304 
2305  movq_m2r(*(wsptr+5), mm7); // wsptr[2,4],[2,5],[2,6],[2,7]
2306  punpckhdq_r2r(mm4, mm3); // wsptr[0,z12],[1,z12],[0,z13],[1,z13]
2307 
2308  movq_m2r(*(wsptr+6), mm1); // wsptr[3,0],[3,1],[3,2],[3,3]
2309  movq_r2r(mm6, mm4);
2310 
2311  punpckldq_r2r(mm7, mm6); // wsptr[2,0],[2,1],[2,4],[2,5]
2312  movq_r2r(mm1, mm5);
2313 
2314  punpckhdq_r2r(mm4, mm7); // wsptr[2,6],[2,7],[2,2],[2,3]
2315  movq_r2r(mm6, mm2);
2316 
2317  movq_m2r(*(wsptr+7), mm4); // wsptr[3,4],[3,5],[3,6],[3,7]
2318  paddw_r2r(mm7, mm6); // wsptr[xxx],[2,z11],[xxx],[2,z13]
2319 
2320  psubw_r2r(mm7, mm2); // wsptr[xxx],[2,z12],[xxx],[2,z10]
2321  punpckldq_r2r(mm4, mm1); // wsptr[3,0],[3,1],[3,4],[3,5]
2322 
2323  punpckhdq_r2r(mm5, mm4); // wsptr[3,6],[3,7],[3,2],[3,3]
2324  movq_r2r(mm1, mm7);
2325 
2326  paddw_r2r(mm4, mm1); // wsptr[xxx],[3,z11],[xxx],[3,z13]
2327  psubw_r2r(mm4, mm7); // wsptr[xxx],[3,z12],[xxx],[3,z10]
2328 
2329  movq_r2r(mm6, mm5);
2330  punpcklwd_r2r(mm1, mm6); // wsptr[xxx],[xxx],[2,z11],[3,z11]
2331 
2332  punpckhwd_r2r(mm1, mm5); // wsptr[xxx],[xxx],[2,z13],[3,z13]
2333  movq_r2r(mm2, mm4);
2334 
2335  punpcklwd_r2r(mm7, mm2); // wsptr[xxx],[xxx],[2,z12],[3,z12]
2336 
2337  punpckhwd_r2r(mm7, mm4); // wsptr[xxx],[xxx],[2,z10],[3,z10]
2338 
2339  punpckhdq_r2r(mm6, mm4); // wsptr[2,z10],[3,z10],[2,z11],[3,z11]
2340 
2341  punpckhdq_r2r(mm5, mm2); // wsptr[2,z12],[3,z12],[2,z13],[3,z13]
2342  movq_r2r(mm0, mm5);
2343 
2344  punpckldq_r2r(mm4, mm0); // wsptr[0,z10],[1,z10],[2,z10],[3,z10]
2345 
2346  punpckhdq_r2r(mm4, mm5); // wsptr[0,z11],[1,z11],[2,z11],[3,z11]
2347  movq_r2r(mm3, mm4);
2348 
2349  punpckhdq_r2r(mm2, mm4); // wsptr[0,z13],[1,z13],[2,z13],[3,z13]
2350  movq_r2r(mm5, mm1);
2351 
2352  punpckldq_r2r(mm2, mm3); // wsptr[0,z12],[1,z12],[2,z12],[3,z12]
2353 // tmp7 = z11 + z13; /* phase 5 */
2354 // tmp8 = z11 - z13; /* phase 5 */
2355  psubw_r2r(mm4, mm1); // tmp8
2356 
2357  paddw_r2r(mm4, mm5); // tmp7
2358 // tmp21 = MULTIPLY(tmp8, FIX_1_414213562); /* 2*c4 */
2359  psllw_i2r(2, mm1);
2360 
2361  psllw_i2r(2, mm0);
2362 
2363  pmulhw_m2r(s_fix141, mm1); // tmp21
2364 // tmp20 = MULTIPLY(z12, (FIX_1_082392200- FIX_1_847759065)) /* 2*(c2-c6) */
2365 // + MULTIPLY(z10, - FIX_1_847759065); /* 2*c2 */
2366  psllw_i2r(2, mm3);
2367  movq_r2r(mm0, mm7);
2368 
2369  pmulhw_m2r(s_fixN184, mm7);
2370  movq_r2r(mm3, mm6);
2371 
2372  movq_m2r(*(wsptr), mm2); // tmp0,final1
2373 
2374  pmulhw_m2r(s_fix108n184, mm6);
2375 // tmp22 = MULTIPLY(z10,(FIX_1_847759065 - FIX_2_613125930)) /* -2*(c2+c6) */
2376 // + MULTIPLY(z12, FIX_1_847759065); /* 2*c2 */
2377  movq_r2r(mm2, mm4); // final1
2378 
2379  pmulhw_m2r(s_fix184n261, mm0);
2380  paddw_r2r(mm5, mm2); // tmp0+tmp7,final1
2381 
2382  pmulhw_m2r(s_fix184, mm3);
2383  psubw_r2r(mm5, mm4); // tmp0-tmp7,final1
2384 
2385 // tmp6 = tmp22 - tmp7; /* phase 2 */
2386  psraw_i2r(3, mm2); // outptr[0,0],[1,0],[2,0],[3,0],final1
2387 
2388  paddw_r2r(mm6, mm7); // tmp20
2389  psraw_i2r(3, mm4); // outptr[0,7],[1,7],[2,7],[3,7],final1
2390 
2391  paddw_r2r(mm0, mm3); // tmp22
2392 
2393 // tmp5 = tmp21 - tmp6;
2394  psubw_r2r(mm5, mm3); // tmp6
2395 
2396 // tmp4 = tmp20 + tmp5;
2397  movq_m2r(*(wsptr+1), mm0); // tmp1,final2
2398  psubw_r2r(mm3, mm1); // tmp5
2399 
2400  movq_r2r(mm0, mm6); // final2
2401  paddw_r2r(mm3, mm0); // tmp1+tmp6,final2
2402 
2403  /* Final output stage: scale down by a factor of 8 and range-limit */
2404 
2405 // outptr[0] = range_limit[IDESCALE(tmp0 + tmp7, PASS1_BITS+3)
2406 // & RANGE_MASK];
2407 // outptr[7] = range_limit[IDESCALE(tmp0 - tmp7, PASS1_BITS+3)
2408 // & RANGE_MASK]; final1
2409 
2410 
2411 // outptr[1] = range_limit[IDESCALE(tmp1 + tmp6, PASS1_BITS+3)
2412 // & RANGE_MASK];
2413 // outptr[6] = range_limit[IDESCALE(tmp1 - tmp6, PASS1_BITS+3)
2414 // & RANGE_MASK]; final2
2415  psubw_r2r(mm3, mm6); // tmp1-tmp6,final2
2416  psraw_i2r(3, mm0); // outptr[0,1],[1,1],[2,1],[3,1]
2417 
2418  psraw_i2r(3, mm6); // outptr[0,6],[1,6],[2,6],[3,6]
2419 
2420  packuswb_r2r(mm4, mm0); // out[0,1],[1,1],[2,1],[3,1],[0,7],[1,7],[2,7],[3,7]
2421 
2422  movq_m2r(*(wsptr+2), mm5); // tmp2,final3
2423  packuswb_r2r(mm6, mm2); // out[0,0],[1,0],[2,0],[3,0],[0,6],[1,6],[2,6],[3,6]
2424 
2425 // outptr[2] = range_limit[IDESCALE(tmp2 + tmp5, PASS1_BITS+3)
2426 // & RANGE_MASK];
2427 // outptr[5] = range_limit[IDESCALE(tmp2 - tmp5, PASS1_BITS+3)
2428 // & RANGE_MASK]; final3
2429  paddw_r2r(mm1, mm7); // tmp4
2430  movq_r2r(mm5, mm3);
2431 
2432  paddw_r2r(mm1, mm5); // tmp2+tmp5
2433  psubw_r2r(mm1, mm3); // tmp2-tmp5
2434 
2435  psraw_i2r(3, mm5); // outptr[0,2],[1,2],[2,2],[3,2]
2436 
2437  movq_m2r(*(wsptr+3), mm4); // tmp3,final4
2438  psraw_i2r(3, mm3); // outptr[0,5],[1,5],[2,5],[3,5]
2439 
2440 
2441 
2442 // outptr[4] = range_limit[IDESCALE(tmp3 + tmp4, PASS1_BITS+3)
2443 // & RANGE_MASK];
2444 // outptr[3] = range_limit[IDESCALE(tmp3 - tmp4, PASS1_BITS+3)
2445 // & RANGE_MASK]; final4
2446  movq_r2r(mm4, mm6);
2447  paddw_r2r(mm7, mm4); // tmp3+tmp4
2448 
2449  psubw_r2r(mm7, mm6); // tmp3-tmp4
2450  psraw_i2r(3, mm4); // outptr[0,4],[1,4],[2,4],[3,4]
2451 
2452  psraw_i2r(3, mm6); // outptr[0,3],[1,3],[2,3],[3,3]
2453 
2454  /*
2455  movq_r2m(mm4, *dummy);
2456  fprintf(stderr, "3-4 %016llx\n", dummy);
2457  movq_r2m(mm4, *dummy);
2458  fprintf(stderr, "3+4 %016llx\n", dummy);
2459  */
2460 
2461 
2462  packuswb_r2r(mm4, mm5); // out[0,2],[1,2],[2,2],[3,2],[0,4],[1,4],[2,4],[3,4]
2463 
2464  packuswb_r2r(mm3, mm6); // out[0,3],[1,3],[2,3],[3,3],[0,5],[1,5],[2,5],[3,5]
2465  movq_r2r(mm2, mm4);
2466 
2467  movq_r2r(mm5, mm7);
2468  punpcklbw_r2r(mm0, mm2); // out[0,0],[0,1],[1,0],[1,1],[2,0],[2,1],[3,0],[3,1]
2469 
2470  punpckhbw_r2r(mm0, mm4); // out[0,6],[0,7],[1,6],[1,7],[2,6],[2,7],[3,6],[3,7]
2471  movq_r2r(mm2, mm1);
2472 
2473  punpcklbw_r2r(mm6, mm5); // out[0,2],[0,3],[1,2],[1,3],[2,2],[2,3],[3,2],[3,3]
2474 
2475  punpckhbw_r2r(mm6, mm7); // out[0,4],[0,5],[1,4],[1,5],[2,4],[2,5],[3,4],[3,5]
2476 
2477  punpcklwd_r2r(mm5, mm2); // out[0,0],[0,1],[0,2],[0,3],[1,0],[1,1],[1,2],[1,3]
2478 
2479  movq_r2r(mm7, mm6);
2480  punpckhwd_r2r(mm5, mm1); // out[2,0],[2,1],[2,2],[2,3],[3,0],[3,1],[3,2],[3,3]
2481 
2482  movq_r2r(mm2, mm0);
2483  punpcklwd_r2r(mm4, mm6); // out[0,4],[0,5],[0,6],[0,7],[1,4],[1,5],[1,6],[1,7]
2484 
2485  punpckldq_r2r(mm6, mm2); // out[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7]
2486 
2487  movq_r2r(mm1, mm3);
2488 
2489  punpckhwd_r2r(mm4, mm7); // out[2,4],[2,5],[2,6],[2,7],[3,4],[3,5],[3,6],[3,7]
2490 
2491  dataptr += rskip;
2492  movq_r2m(mm2, *(dataptr));
2493 
2494  punpckhdq_r2r(mm6, mm0); // out[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7]
2495 
2496  dataptr += rskip;
2497  movq_r2m(mm0, *(dataptr));
2498 
2499  punpckldq_r2r(mm7, mm1); // out[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,6],[2,7]
2500 
2501  punpckhdq_r2r(mm7, mm3); // out[3,0],[3,1],[3,2],[3,3],[3,4],[3,5],[3,6],[3,7]
2502 
2503  dataptr += rskip;
2504  movq_r2m(mm1, *(dataptr));
2505 
2506  dataptr += rskip;
2507  movq_r2m(mm3, *(dataptr));
2508 
2509 #else
2510  int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2511  int32_t tmp10, tmp11, tmp12, tmp13;
2512  int32_t z5, z10, z11, z12, z13;
2513  int16_t *inptr;
2514  int32_t *wsptr;
2515  uint8_t *outptr;
2516  int ctr;
2517  int32_t dcval;
2518 
2519  inptr = data;
2520  wsptr = m_ws;
2521  for (ctr = 8; ctr > 0; ctr--) {
2522 
2523  if ((inptr[8] | inptr[16] | inptr[24] |
2524  inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2525  dcval = inptr[0];
2526  wsptr[0] = dcval;
2527  wsptr[8] = dcval;
2528  wsptr[16] = dcval;
2529  wsptr[24] = dcval;
2530  wsptr[32] = dcval;
2531  wsptr[40] = dcval;
2532  wsptr[48] = dcval;
2533  wsptr[56] = dcval;
2534 
2535  inptr++;
2536  wsptr++;
2537  continue;
2538  }
2539 
2540  tmp0 = inptr[0];
2541  tmp1 = inptr[16];
2542  tmp2 = inptr[32];
2543  tmp3 = inptr[48];
2544 
2545  tmp10 = tmp0 + tmp2;
2546  tmp11 = tmp0 - tmp2;
2547 
2548  tmp13 = tmp1 + tmp3;
2549  tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2550 
2551  tmp0 = tmp10 + tmp13;
2552  tmp3 = tmp10 - tmp13;
2553  tmp1 = tmp11 + tmp12;
2554  tmp2 = tmp11 - tmp12;
2555 
2556  tmp4 = inptr[8];
2557  tmp5 = inptr[24];
2558  tmp6 = inptr[40];
2559  tmp7 = inptr[56];
2560 
2561  z13 = tmp6 + tmp5;
2562  z10 = tmp6 - tmp5;
2563  z11 = tmp4 + tmp7;
2564  z12 = tmp4 - tmp7;
2565 
2566  tmp7 = z11 + z13;
2567  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2568 
2569  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2570  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2571  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2572 
2573  tmp6 = tmp12 - tmp7;
2574  tmp5 = tmp11 - tmp6;
2575  tmp4 = tmp10 + tmp5;
2576 
2577  wsptr[0] = (int32_t) (tmp0 + tmp7);
2578  wsptr[56] = (int32_t) (tmp0 - tmp7);
2579  wsptr[8] = (int32_t) (tmp1 + tmp6);
2580  wsptr[48] = (int32_t) (tmp1 - tmp6);
2581  wsptr[16] = (int32_t) (tmp2 + tmp5);
2582  wsptr[40] = (int32_t) (tmp2 - tmp5);
2583  wsptr[32] = (int32_t) (tmp3 + tmp4);
2584  wsptr[24] = (int32_t) (tmp3 - tmp4);
2585 
2586  inptr++;
2587  wsptr++;
2588  }
2589 
2590  wsptr = m_ws;
2591  for (ctr = 0; ctr < 8; ctr++) {
2592  outptr = &(odata[ctr*rskip]);
2593 
2594  tmp10 = wsptr[0] + wsptr[4];
2595  tmp11 = wsptr[0] - wsptr[4];
2596 
2597  tmp13 = wsptr[2] + wsptr[6];
2598  tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2599 
2600  tmp0 = tmp10 + tmp13;
2601  tmp3 = tmp10 - tmp13;
2602  tmp1 = tmp11 + tmp12;
2603  tmp2 = tmp11 - tmp12;
2604 
2605  z13 = wsptr[5] + wsptr[3];
2606  z10 = wsptr[5] - wsptr[3];
2607  z11 = wsptr[1] + wsptr[7];
2608  z12 = wsptr[1] - wsptr[7];
2609 
2610  tmp7 = z11 + z13;
2611  tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2612 
2613  z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2614  tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2615  tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2616 
2617  tmp6 = tmp12 - tmp7;
2618  tmp5 = tmp11 - tmp6;
2619  tmp4 = tmp10 + tmp5;
2620 
2621  outptr[0] = RL(DESCALE(tmp0 + tmp7));
2622  outptr[7] = RL(DESCALE(tmp0 - tmp7));
2623  outptr[1] = RL(DESCALE(tmp1 + tmp6));
2624  outptr[6] = RL(DESCALE(tmp1 - tmp6));
2625  outptr[2] = RL(DESCALE(tmp2 + tmp5));
2626  outptr[5] = RL(DESCALE(tmp2 - tmp5));
2627  outptr[4] = RL(DESCALE(tmp3 + tmp4));
2628  outptr[3] = RL(DESCALE(tmp3 - tmp4));
2629 
2630  wsptr += 8;
2631  }
2632 #endif
2633 }
2634 
2635 inline void RTjpeg::CalcTbls(void)
2636 {
2637  uint64_t qual = (uint64_t)m_q << (32 - 7); /* 32 bit FP, 255=2, 0=0 */
2638 
2639  for(int i = 0; i < 64; i++)
2640  {
2641  m_lqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_lum_quant_tbl[i]<<16))>>3);
2642  if (m_lqt[i] == 0)
2643  m_lqt[i]=1;
2644 
2645  m_cqt[i] = (int32_t)((qual/((uint64_t)RTjpeg_chrom_quant_tbl[i]<<16))>>3);
2646  if (m_cqt[i] == 0)
2647  m_cqt[i]=1;
2648 
2649  m_liqt[i] = (1<<16) / (m_lqt[i]<<3);
2650  m_ciqt[i] = (1<<16) / (m_cqt[i]<<3);
2651  m_lqt[i] = ((1<<16) / m_liqt[i])>>3;
2652  m_cqt[i] = ((1<<16) / m_ciqt[i])>>3;
2653  }
2654 
2655  m_lB8 = 0;
2656  while (m_liqt[RTjpeg_ZZ[++m_lB8]] <= 8)
2657  ;
2658  m_lB8--;
2659  m_cB8 = 0;
2660 
2661  while (m_ciqt[RTjpeg_ZZ[++m_cB8]] <= 8)
2662  ;
2663  m_cB8--;
2664 }
2665 
2666 int RTjpeg::SetQuality(int *quality)
2667 {
2668  if (*quality < 1)
2669  *quality = 1;
2670  if (*quality > 255)
2671  *quality = 255;
2672 
2673  m_q = *quality;
2674 
2675  CalcTbls();
2676  DctInit();
2677  IdctInit();
2678  QuantInit();
2679 
2680  return 0;
2681 }
2682 
2683 int RTjpeg::SetFormat(const int *fmt)
2684 {
2685  m_f = *fmt;
2686  return 0;
2687 }
2688 
2689 int RTjpeg::SetSize(const int *w, const int *h)
2690 {
2691  if ((*w < 0) || (*w > 65535))
2692  return -1;
2693  if ((*h < 0) || (*h > 65535))
2694  return -1;
2695 
2696  m_width = *w;
2697  m_height = *h;
2698  m_yWidth = m_width>>3;
2699  m_ySize = m_width * m_height;
2700  m_cWidth = m_width>>4;
2701  m_cSize = (m_width>>1) * m_height;
2702 
2703  if (m_keyRate > 0)
2704  {
2705  if (m_old)
2706  delete [] m_oldStart;
2707  m_oldStart = new int16_t[((4*m_width*m_height)+32)];
2708 
2709  auto tmp = (unsigned long)m_oldStart;
2710  tmp += 32;
2711  tmp = tmp>>5;
2712 
2713  m_old = (int16_t *)(tmp<<5);
2714  if (!m_old)
2715  {
2716  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2717  return -1;
2718  }
2719  memset(m_old, 0, ((4*m_width*m_height)));
2720  }
2721  return 0;
2722 }
2723 
2724 int RTjpeg::SetIntra(int *key, int *lm, int *cm)
2725 {
2726  if (*key < 0)
2727  *key = 0;
2728  if (*key > 255)
2729  *key = 255;
2730  m_keyRate = *key;
2731 
2732  if (*lm < 0)
2733  *lm = 0;
2734  if (*lm > 16)
2735  *lm = 16;
2736  if (*cm < 0)
2737  *cm = 0;
2738  if (*cm > 16)
2739  *cm = 16;
2740 
2741 #ifdef MMX
2742  m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2743  m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
2744 #else
2745  m_lMask = *lm;
2746  m_cMask = *cm;
2747 #endif
2748 
2749  if (m_old)
2750  delete [] m_oldStart;
2751  m_oldStart = new int16_t[((4*m_width*m_height)+32)];
2752  auto tmp = (unsigned long)m_oldStart;
2753  tmp += 32;
2754  tmp = tmp >> 5;
2755  m_old = (int16_t *)(tmp << 5);
2756  if (!m_old)
2757  {
2758  fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2759  return -1;
2760  }
2761  memset(m_old, 0, ((4*m_width*m_height)));
2762 
2763  return 0;
2764 }
2765 
2767 {
2768 #ifdef MMX
2769  RTjpeg_ones.q = 0x0001000100010001LL;
2770  RTjpeg_half.q = 0x7fff7fff7fff7fffLL;
2771  RTjpeg_C4.q = 0x2D412D412D412D41LL;
2772  RTjpeg_C6.q = 0x187E187E187E187ELL;
2773  RTjpeg_C2mC6.q= 0x22A322A322A322A3LL;
2774  RTjpeg_C2pC6.q= 0x539F539F539F539FLL;
2775  RTjpeg_zero.q = 0x0000000000000000LL;
2776 #endif
2777 }
2778 
2780 {
2781  delete [] m_oldStart;
2782 }
2783 
2784 inline int RTjpeg::compressYUV420(int8_t *sp, uint8_t **planes)
2785 {
2786  uint8_t * bp = planes[0];
2787  uint8_t * bp1 = bp + (m_width<<3);
2788  uint8_t * bp2 = planes[1];
2789  uint8_t * bp3 = planes[2];
2790 
2791 #ifdef MMX
2792  emms();
2793 #endif
2794  int8_t * sb = sp;
2795 /* Y */
2796  for(int i = m_height >> 1; i; i -= 8)
2797  {
2798  for(int j = 0, k = 0; j < m_width; j += 16, k += 8)
2799  {
2800  DctY(bp+j, m_yWidth);
2801  Quant(m_block, m_lqt);
2802  sp += b2s(m_block, sp, m_lB8);
2803 
2804  DctY(bp+j+8, m_yWidth);
2805  Quant(m_block, m_lqt);
2806  sp += b2s(m_block, sp, m_lB8);
2807 
2808  DctY(bp1+j, m_yWidth);
2809  Quant(m_block, m_lqt);
2810  sp += b2s(m_block, sp, m_lB8);
2811 
2812  DctY(bp1+j+8, m_yWidth);
2813  Quant(m_block, m_lqt);
2814  sp += b2s(m_block, sp, m_lB8);
2815 
2816  DctY(bp2+k, m_cWidth);
2817  Quant(m_block, m_cqt);
2818  sp += b2s(m_block, sp, m_cB8);
2819 
2820  DctY(bp3+k, m_cWidth);
2821  Quant(m_block, m_cqt);
2822  sp += b2s(m_block, sp, m_cB8);
2823  }
2824  bp += m_width<<4;
2825  bp1 += m_width<<4;
2826  bp2 += m_width<<2;
2827  bp3 += m_width<<2;
2828  }
2829 #ifdef MMX
2830  emms();
2831 #endif
2832  return (sp - sb);
2833 }
2834 
2835 inline int RTjpeg::compressYUV422(int8_t *sp, uint8_t **planes)
2836 {
2837  uint8_t * bp = planes[0];
2838  uint8_t * bp2 = planes[1];
2839  uint8_t * bp3 = planes[2];
2840 
2841 #ifdef MMX
2842  emms();
2843 #endif
2844  int8_t * sb=sp;
2845 /* Y */
2846  for(int i=m_height; i; i-=8)
2847  {
2848  for(int j=0, k=0; j<m_width; j+=16, k+=8)
2849  {
2850  DctY(bp+j, m_yWidth);
2851  Quant(m_block, m_lqt);
2852  sp += b2s(m_block, sp, m_lB8);
2853 
2854  DctY(bp+j+8, m_yWidth);
2855  Quant(m_block, m_lqt);
2856  sp += b2s(m_block, sp, m_lB8);
2857 
2858  DctY(bp2+k, m_cWidth);
2859  Quant(m_block, m_cqt);
2860  sp+=b2s(m_block, sp, m_cB8);
2861 
2862  DctY(bp3+k, m_cWidth);
2863  Quant(m_block, m_cqt);
2864  sp+=b2s(m_block, sp, m_cB8);
2865 
2866  }
2867  bp += m_width << 3;
2868  bp2 += m_width << 2;
2869  bp3 += m_width << 2;
2870 
2871  }
2872 #ifdef MMX
2873  emms();
2874 #endif
2875  return (sp-sb);
2876 }
2877 
2878 inline int RTjpeg::compress8(int8_t *sp, uint8_t **planes)
2879 {
2880  int8_t * sb = nullptr;
2881  uint8_t * bp = planes[0];
2882 
2883 #ifdef MMX
2884  emms();
2885 #endif
2886 
2887  sb=sp;
2888 /* Y */
2889  for(int i=0; i<m_height; i+=8)
2890  {
2891  for(int j=0; j<m_width; j+=8)
2892  {
2893  DctY(bp+j, m_width);
2894  Quant(m_block, m_lqt);
2895  sp += b2s(m_block, sp, m_lB8);
2896  }
2897  bp += m_width;
2898  }
2899 
2900 #ifdef MMX
2901  emms();
2902 #endif
2903  return (sp-sb);
2904 }
2905 
2906 inline void RTjpeg::decompressYUV422(int8_t *sp, uint8_t **planes)
2907 {
2908  uint8_t * bp = planes[0];
2909  uint8_t * bp2 = planes[1];
2910  uint8_t * bp3 = planes[2];
2911 
2912 #ifdef MMX
2913  emms();
2914 #endif
2915 
2916 /* Y */
2917  for(int i=m_height; i; i-=8)
2918  {
2919  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2920  if (*sp==-1)sp++;
2921  else
2922  {
2923  sp += s2b(m_block, sp, m_lB8, m_liqt);
2924  Idct(bp+j, m_block, m_width);
2925  }
2926  if (*sp==-1)sp++;
2927  else
2928  {
2929  sp += s2b(m_block, sp, m_lB8, m_liqt);
2930  Idct(bp+j+8, m_block, m_width);
2931  }
2932  if (*sp==-1)sp++;
2933  else
2934  {
2935  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2936  Idct(bp2+k, m_block, m_width>>1);
2937  }
2938  if (*sp==-1)sp++;
2939  else
2940  {
2941  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2942  Idct(bp3+k, m_block, m_width>>1);
2943  }
2944  }
2945  bp += m_width<<3;
2946  bp2 += m_width<<2;
2947  bp3 += m_width<<2;
2948  }
2949 #ifdef MMX
2950  emms();
2951 #endif
2952 }
2953 
2954 inline void RTjpeg::decompressYUV420(int8_t *sp, uint8_t **planes)
2955 {
2956  uint8_t * bp = planes[0];
2957  uint8_t * bp1 = bp + (m_width<<3);
2958  uint8_t * bp2 = planes[1];
2959  uint8_t * bp3 = planes[2];
2960 
2961 #ifdef MMX
2962  emms();
2963 #endif
2964 
2965 /* Y */
2966  for(int i=m_height>>1; i; i-=8)
2967  {
2968  for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2969  if (*sp==-1)sp++;
2970  else
2971  {
2972  sp += s2b(m_block, sp, m_lB8, m_liqt);
2973  Idct(bp+j, m_block, m_width);
2974  }
2975  if (*sp==-1)sp++;
2976  else
2977  {
2978  sp += s2b(m_block, sp, m_lB8, m_liqt);
2979  Idct(bp+j+8, m_block, m_width);
2980  }
2981  if (*sp==-1)sp++;
2982  else
2983  {
2984  sp += s2b(m_block, sp, m_lB8, m_liqt);
2985  Idct(bp1+j, m_block, m_width);
2986  }
2987  if (*sp==-1)sp++;
2988  else
2989  {
2990  sp += s2b(m_block, sp, m_lB8, m_liqt);
2991  Idct(bp1+j+8, m_block, m_width);
2992  }
2993  if (*sp==-1)sp++;
2994  else
2995  {
2996  sp += s2b(m_block, sp, m_cB8, m_ciqt);
2997  Idct(bp2+k, m_block, m_width>>1);
2998  }
2999  if (*sp==-1)sp++;
3000  else
3001  {
3002  sp += s2b(m_block, sp, m_cB8, m_ciqt);
3003  Idct(bp3+k, m_block, m_width>>1);
3004  }
3005  }
3006  bp += m_width<<4;
3007  bp1 += m_width<<4;
3008  bp2 += m_width<<2;
3009  bp3 += m_width<<2;
3010  }
3011 #ifdef MMX
3012  emms();
3013 #endif
3014 }
3015 
3016 inline void RTjpeg::decompress8(int8_t *sp, uint8_t **planes)
3017 {
3018  uint8_t * bp = planes[0];
3019 
3020 #ifdef MMX
3021  emms();
3022 #endif
3023 
3024 /* Y */
3025  for(int i=0; i<m_height; i+=8)
3026  {
3027  for(int j=0; j<m_width; j+=8)
3028  {
3029  if (*sp==-1)sp++;
3030  else
3031  {
3032  sp += s2b(m_block, sp, m_lB8, m_liqt);
3033  Idct(bp+j, m_block, m_width);
3034  }
3035  }
3036  bp += m_width<<3;
3037  }
3038 }
3039 
3040 #ifdef MMX
3041 
3042 int RTjpeg::bcomp(int16_t *rblock, int16_t *_old, mmx_t *mask)
3043 {
3044  auto *mold=(mmx_t *)_old;
3045  auto *mblock=(mmx_t *)rblock;
3046  volatile mmx_t result;
3047  static mmx_t s_neg= { 0xffffffffffffffffULL };
3048 
3049  movq_m2r(*mask, mm7);
3050  movq_m2r(s_neg, mm6);
3051  pxor_r2r(mm5, mm5);
3052 
3053  for(int i=0; i<8; i++)
3054  {
3055  movq_m2r(*(mblock++), mm0);
3056  movq_m2r(*(mblock++), mm2);
3057  movq_m2r(*(mold++), mm1);
3058  movq_m2r(*(mold++), mm3);
3059  psubsw_r2r(mm1, mm0);
3060  psubsw_r2r(mm3, mm2);
3061  movq_r2r(mm0, mm1);
3062  movq_r2r(mm2, mm3);
3063  pcmpgtw_r2r(mm7, mm0);
3064  pcmpgtw_r2r(mm7, mm2);
3065  pxor_r2r(mm6, mm1);
3066  pxor_r2r(mm6, mm3);
3067  pcmpgtw_r2r(mm7, mm1);
3068  pcmpgtw_r2r(mm7, mm3);
3069  por_r2r(mm0, mm5);
3070  por_r2r(mm2, mm5);
3071  por_r2r(mm1, mm5);
3072  por_r2r(mm3, mm5);
3073  }
3074  movq_r2m(mm5, result);
3075 
3076  if (result.q)
3077  {
3078  for(int i=0; i<16; i++)((uint64_t *)_old)[i]=((uint64_t *)rblock)[i];
3079  return 0;
3080  }
3081  return 1;
3082 }
3083 
3084 #else
3085 int RTjpeg::bcomp(int16_t *rblock, int16_t *_old, uint16_t *mask)
3086 {
3087  for(int i=0; i<64; i++)
3088  if (abs(_old[i]-rblock[i])>*mask)
3089  {
3090  for(i=0; i<16; i++)((uint64_t *)_old)[i]=((uint64_t *)rblock)[i];
3091  return 0;
3092  }
3093  return 1;
3094 }
3095 #endif
3096 
3097 inline int RTjpeg::mcompressYUV420(int8_t *sp, uint8_t **planes)
3098 {
3099  uint8_t * bp = planes[0];
3100  uint8_t * bp1 = bp + (m_width<<3);
3101  uint8_t * bp2 = planes[1];
3102  uint8_t * bp3 = planes[2];
3103  int8_t * sb = sp;
3104  int16_t * lblock = m_old;
3105 
3106 /* Y */
3107  for(int i = m_height>>1; i; i-=8)
3108  {
3109  for(int j=0, k=0; j < m_width; j+=16, k+=8)
3110  {
3111  DctY(bp+j, m_yWidth);
3112  Quant(m_block, m_lqt);
3113  if (bcomp(m_block, lblock, &m_lMask))
3114  {
3115  *((uint8_t *)sp++)=255;
3116  }
3117  else sp+=b2s(m_block, sp, m_lB8);
3118  lblock += 64;
3119 
3120  DctY(bp+j+8, m_yWidth);
3121  Quant(m_block, m_lqt);
3122  if (bcomp(m_block, lblock, &m_lMask))
3123  {
3124  *((uint8_t *)sp++)=255;
3125  }
3126  else sp += b2s(m_block, sp, m_lB8);
3127  lblock += 64;
3128 
3129  DctY(bp1+j, m_yWidth);
3130  Quant(m_block, m_lqt);
3131  if (bcomp(m_block, lblock, &m_lMask))
3132  {
3133  *((uint8_t *)sp++)=255;
3134  }
3135  else sp += b2s(m_block, sp, m_lB8);
3136  lblock += 64;
3137 
3138  DctY(bp1+j+8, m_yWidth);
3139  Quant(m_block, m_lqt);
3140  if (bcomp(m_block, lblock, &m_lMask))
3141  {
3142  *((uint8_t *)sp++)=255;
3143  }
3144  else sp += b2s(m_block, sp, m_lB8);
3145  lblock += 64;
3146 
3147  DctY(bp2+k, m_cWidth);
3148  Quant(m_block, m_cqt);
3149  if (bcomp(m_block, lblock, &m_cMask))
3150  {
3151  *((uint8_t *)sp++)=255;
3152  }
3153  else
3154  sp+=b2s(m_block, sp, m_cB8);
3155  lblock+=64;
3156 
3157  DctY(bp3+k, m_cWidth);
3158  Quant(m_block, m_cqt);
3159  if (bcomp(m_block, lblock, &m_cMask))
3160  {
3161  *((uint8_t *)sp++)=255;
3162  }
3163  else
3164  sp+=b2s(m_block, sp, m_cB8);
3165  lblock+=64;
3166  }
3167  bp += m_width<<4;
3168  bp1 += m_width<<4;
3169  bp2 += m_width<<2;
3170  bp3 += m_width<<2;
3171  }
3172 #ifdef MMX
3173  emms();
3174 #endif
3175  return (sp-sb);
3176 }
3177 
3178 
3179 inline int RTjpeg::mcompressYUV422(int8_t *sp, uint8_t **planes)
3180 {
3181  uint8_t * bp = planes[0];
3182  uint8_t * bp2 = planes[1];
3183  uint8_t * bp3 = planes[2];
3184  int8_t * sb=sp;
3185  int16_t *lblock = m_old;
3186 
3187  for(int i = m_height; i; i-=8)
3188  {
3189  for(int j=0, k=0; j<m_width; j+=16, k+=8)
3190  {
3191  DctY(bp+j, m_yWidth);
3192  Quant(m_block, m_lqt);
3193  if (bcomp(m_block, lblock, &m_lMask))
3194  {
3195  *((uint8_t *)sp++)=255;
3196  }
3197  else sp+=b2s(m_block, sp, m_lB8);
3198  lblock+=64;
3199 
3200  DctY(bp+j+8, m_yWidth);
3201  Quant(m_block, m_lqt);
3202  if (bcomp(m_block, lblock, &m_lMask))
3203  {
3204  *((uint8_t *)sp++)=255;
3205  }
3206  else sp+=b2s(m_block, sp, m_lB8);
3207  lblock+=64;
3208 
3209  DctY(bp2+k, m_cWidth);
3210  Quant(m_block, m_cqt);
3211  if (bcomp(m_block, lblock, &m_cMask))
3212  {
3213  *((uint8_t *)sp++)=255;
3214  }
3215  else sp+=b2s(m_block, sp, m_cB8);
3216  lblock+=64;
3217 
3218  DctY(bp3+k, m_cWidth);
3219  Quant(m_block, m_cqt);
3220  if (bcomp(m_block, lblock, &m_cMask))
3221  {
3222  *((uint8_t *)sp++)=255;
3223  }
3224  else sp+=b2s(m_block, sp, m_cB8);
3225  lblock+=64;
3226 
3227  }
3228  bp += m_width<<3;
3229  bp2 += m_width<<2;
3230  bp3 += m_width<<2;
3231  }
3232 #ifdef MMX
3233  emms();
3234 #endif
3235  return (sp-sb);
3236 }
3237 
3238 inline int RTjpeg::mcompress8(int8_t *sp, uint8_t **planes)
3239 {
3240  uint8_t * bp = planes[0];
3241  int8_t * sb = sp;
3242  int16_t *lblock = m_old;
3243 
3244  for(int i=0; i<m_height; i+=8)
3245  {
3246  for(int j=0; j<m_width; j+=8)
3247  {
3248  DctY(bp+j, m_width);
3249  Quant(m_block, m_lqt);
3250  if (bcomp(m_block, lblock, &m_lMask))
3251  {
3252  *((uint8_t *)sp++)=255;
3253  } else sp+=b2s(m_block, sp, m_lB8);
3254  lblock+=64;
3255  }
3256  bp+=m_width<<3;
3257  }
3258 #ifdef MMX
3259  emms();
3260 #endif
3261  return (sp-sb);
3262 }
3263 
3265 {
3266  m_keyCount = 0;
3267 }
3268 
3269 int RTjpeg::Compress(int8_t *sp, uint8_t **planes)
3270 {
3271  auto * fh = (RTjpeg_frameheader *)sp;
3272  int ds = 0;
3273 
3274  if (m_keyRate == 0)
3275  {
3276  switch(m_f)
3277  {
3278  case RTJ_YUV420: ds = compressYUV420((int8_t*)&(fh->data), planes); break;
3279  case RTJ_YUV422: ds = compressYUV422((int8_t*)&(fh->data), planes); break;
3280  case RTJ_RGB8: ds = compress8((int8_t*)&(fh->data), planes); break;
3281  }
3282  fh->key = 0;
3283  } else {
3284  if (m_keyCount == 0)
3285  memset(m_old, 0, ((4 * m_width * m_height)));
3286  switch(m_f)
3287  {
3288  case RTJ_YUV420: ds = mcompressYUV420((int8_t*)&(fh->data), planes); break;
3289  case RTJ_YUV422: ds = mcompressYUV422((int8_t*)&(fh->data), planes); break;
3290  case RTJ_RGB8: ds = mcompress8((int8_t*)&(fh->data), planes); break;
3291  }
3292  fh->key = m_keyCount;
3293  if (++m_keyCount > m_keyRate)
3294  m_keyCount = 0;
3295  }
3296  ds += RTJPEG_HEADER_SIZE;
3297  fh->framesize = RTJPEG_SWAP_WORD(ds);
3298  fh->headersize = RTJPEG_HEADER_SIZE;
3299  fh->version = RTJPEG_FILE_VERSION;
3300  fh->width = RTJPEG_SWAP_HALFWORD(m_width);
3301  fh->height = RTJPEG_SWAP_HALFWORD(m_height);
3302  fh->quality = m_q;
3303  return ds;
3304 }
3305 
3306 void RTjpeg::Decompress(int8_t *sp, uint8_t **planes)
3307 {
3308  auto * fh = (RTjpeg_frameheader *)sp;
3309 
3310  if ((RTJPEG_SWAP_HALFWORD(fh->width) != m_width)||
3311  (RTJPEG_SWAP_HALFWORD(fh->height) != m_height))
3312  {
3313  int w = RTJPEG_SWAP_HALFWORD(fh->width);
3314  int h = RTJPEG_SWAP_HALFWORD(fh->height);
3315  SetSize(&w, &h);
3316  }
3317  if (fh->quality != m_q)
3318  {
3319  int q = fh->quality;
3320  SetQuality(&q);
3321  }
3322  switch(m_f)
3323  {
3324  case RTJ_YUV420: decompressYUV420((int8_t*)&(fh->data), planes); break;
3325  case RTJ_YUV422: decompressYUV422((int8_t*)&(fh->data), planes); break;
3326  case RTJ_RGB8: decompress8((int8_t*)&(fh->data), planes); break;
3327  }
3328 }
void decompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3016
static mmx_t RTjpeg_half
Definition: RTjpegN.cpp:32
#define RL(x)
Definition: RTjpegN.cpp:1505
int mcompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3097
#define RTJPEG_SWAP_HALFWORD(a)
Definition: RTjpegN.h:47
int32_t m_cWidth
Definition: RTjpegN.h:122
stderr
Definition: ttvdb.py:1426
void SetNextKey(void)
Definition: RTjpegN.cpp:3264
static mmx_t RTjpeg_C6
Definition: RTjpegN.cpp:34
int compressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2784
void decompressYUV420(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2954
static const unsigned char RTjpeg_ZZ[64]
Definition: RTjpegN.cpp:43
int SetIntra(int *key, int *lm, int *cm)
Definition: RTjpegN.cpp:2724
~RTjpeg()
Definition: RTjpegN.cpp:2779
static mmx_t RTjpeg_C4
Definition: RTjpegN.cpp:33
int16_t * m_old
Definition: RTjpegN.h:125
int compressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2835
int SetSize(const int *w, const int *h)
Definition: RTjpegN.cpp:2689
int32_t m_cSize
Definition: RTjpegN.h:124
static uint planes(VideoFrameType Type)
Definition: mythframe.h:567
int32_t m_lB8
Definition: RTjpegN.h:119
#define FIX_1_414213562
Definition: RTjpegN.cpp:1497
static guint32 * tmp
Definition: goom_core.c:35
static void Quant(int16_t *block, int32_t *qtbl)
Definition: RTjpegN.cpp:527
mmx_t m_lMask
Definition: RTjpegN.h:134
static const unsigned char RTjpeg_lum_quant_tbl[64]
Definition: RTjpegN.cpp:71
#define FIX_2_613125930
Definition: RTjpegN.cpp:1499
#define RTJ_YUV422
Definition: RTjpegN.h:61
int compress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2878
#define RTJ_RGB8
Definition: RTjpegN.h:62
int m_f
Definition: RTjpegN.h:132
#define RTJ_YUV420
Definition: RTjpegN.h:60
int16_t * m_oldStart
Definition: RTjpegN.h:126
#define RTJPEG_FILE_VERSION
Definition: RTjpegN.h:35
int m_width
Definition: RTjpegN.h:129
int Compress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3269
static mmx_t RTjpeg_C2mC6
Definition: RTjpegN.cpp:35
static mmx_t RTjpeg_ones
Definition: RTjpegN.cpp:31
int SetFormat(const int *fmt)
Definition: RTjpegN.cpp:2683
void Idct(uint8_t *odata, int16_t *data, int rskip)
Definition: RTjpegN.cpp:1517
int32_t m_cB8
Definition: RTjpegN.h:120
RTjpeg()
Definition: RTjpegN.cpp:2766
#define RTJPEG_SWAP_WORD(a)
Definition: RTjpegN.h:46
#define FIX_1_082392200
Definition: RTjpegN.cpp:1496
static const unsigned char RTjpeg_chrom_quant_tbl[64]
Definition: RTjpegN.cpp:82
int mcompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3179
static const uint64_t RTjpeg_aan_tab[64]
Definition: RTjpegN.cpp:60
void decompressYUV422(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:2906
#define RTJPEG_HEADER_SIZE
Definition: RTjpegN.h:36
mmx_t m_cMask
Definition: RTjpegN.h:135
int m_q
Definition: RTjpegN.h:131
int32_t m_ySize
Definition: RTjpegN.h:123
static mmx_t RTjpeg_zero
Definition: RTjpegN.cpp:37
static mmx_t RTjpeg_C2pC6
Definition: RTjpegN.cpp:36
unsigned short uint16_t
Definition: iso6937tables.h:1
int32_t m_yWidth
Definition: RTjpegN.h:121
int SetQuality(int *quality)
Definition: RTjpegN.cpp:2666
static int b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
Definition: RTjpegN.cpp:110
#define DESCALE(x)
Definition: RTjpegN.cpp:1501
int m_height
Definition: RTjpegN.h:130
void DctInit(void)
Definition: RTjpegN.cpp:587
static int bcomp(int16_t *rblock, int16_t *old, mmx_t *mask)
Definition: RTjpegN.cpp:3042
#define FIX_1_847759065
Definition: RTjpegN.cpp:1498
void DctY(uint8_t *idata, int rskip)
Definition: RTjpegN.cpp:596
int m_keyRate
Definition: RTjpegN.h:140
static int s2b(int16_t *data, const int8_t *strm, uint8_t bt8, int32_t *qtbla)
Definition: RTjpegN.cpp:278
void CalcTbls(void)
Definition: RTjpegN.cpp:2635
void IdctInit(void)
Definition: RTjpegN.cpp:1508
int m_keyCount
Definition: RTjpegN.h:127
void QuantInit(void)
Definition: RTjpegN.cpp:512
#define MULTIPLY(var, const)
Definition: RTjpegN.cpp:1506
void Decompress(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3306
int mcompress8(int8_t *sp, uint8_t **planes)
Definition: RTjpegN.cpp:3238