47 #define BETTERCOMPRESSION 1 // NOLINT(cppcoreguidelines-macro-usage)
49 static const std::array<const uint8_t,64>
RTjpeg_ZZ {
55 40, 33, 26, 19, 12, 5,
56 6, 13, 20, 27, 34, 41, 48,
57 56, 49, 42, 35, 28, 21, 14, 7,
58 15, 22, 29, 36, 43, 50, 57,
59 58, 51, 44, 37, 30, 23,
67 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
68 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
69 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
70 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
71 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
72 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
73 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
74 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
78 16, 11, 10, 16, 24, 40, 51, 61,
79 12, 12, 14, 19, 26, 58, 60, 55,
80 14, 13, 16, 24, 40, 57, 69, 56,
81 14, 17, 22, 29, 51, 87, 80, 62,
82 18, 22, 37, 56, 68, 109, 103, 77,
83 24, 35, 55, 64, 81, 104, 113, 92,
84 49, 64, 78, 87, 103, 121, 120, 101,
85 72, 92, 95, 98, 112, 100, 103, 99
89 17, 18, 24, 47, 99, 99, 99, 99,
90 18, 21, 26, 66, 99, 99, 99, 99,
91 24, 26, 56, 99, 99, 99, 99, 99,
92 47, 66, 99, 99, 99, 99, 99, 99,
93 99, 99, 99, 99, 99, 99, 99, 99,
94 99, 99, 99, 99, 99, 99, 99, 99,
95 99, 99, 99, 99, 99, 99, 99, 99,
96 99, 99, 99, 99, 99, 99, 99, 99
99 #ifdef BETTERCOMPRESSION
120 auto *ustrm = (uint8_t *)strm;
124 for (ii=0; ii < 64; ii++) {
142 while (data[
RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
144 unsigned char bitten = ((
unsigned char)ci) << 2;
153 unsigned char bitoff = 0;
164 bitten |= (0x01<<bitoff);
167 bitten |= (0x03<<bitoff);
170 bitten |= (0x02<<bitoff);
218 if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
219 bitten |= (0x08<<bitoff);
223 bitten |= (ZZvalue&0xf)<<bitoff;
254 strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
258 strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
269 fprintf(
stdout,
"\nco = '%d'\n", co);
270 for (i=0; i < co+2; i++) {
271 fprintf(
stdout,
"%d ", strm[i]);
286 auto *qtbl = (uint32_t *)qtbla.data();
288 unsigned char bitoff = 0;
292 data[i]=((uint8_t)strm[0])*qtbl[i];
296 unsigned char bitten = ((
unsigned char)strm[1]) >> 2;
298 for(; co > bitten; co--) {
315 bitten = ((
unsigned char)strm[ci]) >> bitoff;
371 bitten = ((
unsigned char)strm[ci]) >> bitoff;
376 if ( bitten == 0x08 ) {
381 if ( bitten & 0x08 ) {
386 data[i]=((
signed char)bitten)*qtbl[i];
406 data[i]=strm[ci++]*qtbl[i];
414 fprintf(
stdout,
"\nci = '%d'\n", ci);
415 for (i=0; i < 64; i++) {
426 int RTjpeg::b2s(
const int16_t *data, int8_t *strm, uint8_t bt8)
428 register int ci, co=1,
tmp;
429 register int16_t ZZvalue;
434 for (ii=0; ii < 64; ii++) {
443 for(ci=1; ci<=bt8; ci++)
449 strm[co++]=(int8_t)(ZZvalue>127)?127:ZZvalue;
453 strm[co++]=(int8_t)(ZZvalue<-128)?-128:ZZvalue;
463 strm[co++]=(int8_t)(ZZvalue>63)?63:ZZvalue;
467 strm[co++]=(int8_t)(ZZvalue<-64)?-64:ZZvalue;
475 }
while((ci<64)&&(data[
RTjpeg_ZZ[ci]]==0));
477 strm[co++]=(int8_t)(63+(ci-
tmp));
484 int RTjpeg::s2b(int16_t *data,
const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
486 uint32_t *qtbl = (uint32_t *)qtbla;
491 data[i]=((uint8_t)strm[0])*qtbl[i];
493 for(co=1; co<=bt8; co++)
496 data[i]=strm[ci++]*qtbl[i];
509 data[i]=strm[ci]*qtbl[i];
// Pointer-pun helper: lets the same quantization buffer be addressed
// either as 16-bit or as 32-bit elements (see the qtbl.m_int32 /
// qtbl.m_int16 usage below, which writes one member and reads the other).
// NOTE(review): reading the union member that was not most recently
// written is technically undefined behavior in C++ — it works on the
// compilers this code targets, but worth confirming.
using P16_32 =
union { int16_t *m_int16; int32_t *m_int32; };
523 qtbl.m_int32 =
m_lqt.data();
524 for (
int i = 0; i < 64; i++)
525 qtbl.m_int16[i] =
static_cast<int16_t
>(
m_lqt[i]);
528 qtbl.m_int32 =
m_cqt.data();
529 for (
int i = 0; i < 64; i++)
530 qtbl.m_int16[i] =
static_cast<int16_t
>(
m_cqt[i]);
535 auto *ql=(mmx_t *)qtbl.data();
536 auto *bl=(mmx_t *)_block.data();
541 for(
int i=16; i; i--)
543 movq_m2r(*(ql++), mm0);
548 punpcklwd_r2r(mm6, mm0);
549 punpckhwd_r2r(mm6, mm1);
551 punpcklwd_r2r(mm7, mm2);
552 punpckhwd_r2r(mm7, mm3);
554 pmaddwd_r2r(mm2, mm0);
555 pmaddwd_r2r(mm3, mm1);
560 packssdw_r2r(mm1, mm0);
562 movq_r2m(mm0, *(bl++));
575 _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
// Fixed-point DCT multipliers: each is the named real constant scaled
// by 2^8 and rounded (e.g. 0.382683433 * 256 ~= 98).  Products made
// with these carry 8 extra fraction bits, which DESCALE10()/DESCALE20()
// below remove.
static constexpr int32_t FIX_0_382683433 { 98 };
static constexpr int32_t FIX_0_541196100 { 139 };
static constexpr int32_t FIX_0_707106781 { 181 };
static constexpr int32_t FIX_1_306562965 { 334 };
/// Remove 8 fraction bits with round-to-nearest (adds half an ULP at
/// the 2^8 scale before shifting) and narrow the result to 16 bits.
static constexpr int16_t DESCALE10(int32_t x)
{
    const int32_t rounded = x + 128;
    return static_cast<int16_t>(rounded >> 8);
};
/// Remove 16 fraction bits with round-to-nearest (adds half an ULP at
/// the 2^16 scale before shifting) and narrow the result to 16 bits.
static constexpr int16_t DESCALE20(int32_t x)
{
    const int32_t rounded = x + 32768;
    return static_cast<int16_t>(rounded >> 16);
};
// Raw fixed-point multiply: the 2^8 scaling carried by `constant`
// stays in the product; callers descale later via DESCALE10/DESCALE20.
static constexpr int32_t D_MULTIPLY(int32_t var, int32_t constant) {
return var * constant; };
595 for (
int i = 0; i < 64; i++)
605 uint8_t *idataptr = idata;
606 int32_t *wsptr =
m_ws.data();
608 for (
int ctr = 7; ctr >= 0; ctr--) {
609 int32_t tmp0 = idataptr[0] + idataptr[7];
610 int32_t tmp7 = idataptr[0] - idataptr[7];
611 int32_t tmp1 = idataptr[1] + idataptr[6];
612 int32_t tmp6 = idataptr[1] - idataptr[6];
613 int32_t tmp2 = idataptr[2] + idataptr[5];
614 int32_t tmp5 = idataptr[2] - idataptr[5];
615 int32_t tmp3 = idataptr[3] + idataptr[4];
616 int32_t tmp4 = idataptr[3] - idataptr[4];
618 int32_t tmp10 = (tmp0 + tmp3);
619 int32_t tmp13 = tmp0 - tmp3;
620 int32_t tmp11 = (tmp1 + tmp2);
621 int32_t tmp12 = tmp1 - tmp2;
623 wsptr[0] = (tmp10 + tmp11)<<8;
624 wsptr[4] = (tmp10 - tmp11)<<8;
626 int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
627 wsptr[2] = (tmp13<<8) + z1;
628 wsptr[6] = (tmp13<<8) - z1;
634 int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
635 int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5;
636 int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5;
637 int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781);
639 int32_t z11 = (tmp7<<8) + z3;
640 int32_t z13 = (tmp7<<8) - z3;
647 idataptr += rskip<<3;
652 int16_t *odataptr =
m_block.data();
653 for (
int ctr = 7; ctr >= 0; ctr--) {
654 int32_t tmp0 = wsptr[0] + wsptr[56];
655 int32_t tmp7 = wsptr[0] - wsptr[56];
656 int32_t tmp1 = wsptr[8] + wsptr[48];
657 int32_t tmp6 = wsptr[8] - wsptr[48];
658 int32_t tmp2 = wsptr[16] + wsptr[40];
659 int32_t tmp5 = wsptr[16] - wsptr[40];
660 int32_t tmp3 = wsptr[24] + wsptr[32];
661 int32_t tmp4 = wsptr[24] - wsptr[32];
663 int32_t tmp10 = tmp0 + tmp3;
664 int32_t tmp13 = tmp0 - tmp3;
665 int32_t tmp11 = tmp1 + tmp2;
666 int32_t tmp12 = tmp1 - tmp2;
668 odataptr[0] = DESCALE10(tmp10 + tmp11);
669 odataptr[32] = DESCALE10(tmp10 - tmp11);
671 int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
672 odataptr[16] = DESCALE20((tmp13<<8) + z1);
673 odataptr[48] = DESCALE20((tmp13<<8) - z1);
679 int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
680 int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5;
681 int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5;
682 int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781);
684 int32_t z11 = (tmp7<<8) + z3;
685 int32_t z13 = (tmp7<<8) - z3;
687 odataptr[40] = DESCALE20(z13 + z2);
688 odataptr[24] = DESCALE20(z13 - z2);
689 odataptr[8] = DESCALE20(z11 + z4);
690 odataptr[56] = DESCALE20(z11 - z4);
697 volatile mmx_t tmp6 {};
698 volatile mmx_t tmp7 {};
699 auto *dataptr = (mmx_t *)
m_block.data();
700 auto *idata2 = (mmx_t *)idata;
707 movq_m2r(*idata2, mm0);
710 punpcklbw_r2r(mm2, mm0);
711 movq_r2m(mm0, *(dataptr));
713 punpckhbw_r2r(mm2, mm1);
714 movq_r2m(mm1, *(dataptr+1));
718 movq_m2r(*idata2, mm0);
721 punpcklbw_r2r(mm2, mm0);
722 movq_r2m(mm0, *(dataptr+2));
724 punpckhbw_r2r(mm2, mm1);
725 movq_r2m(mm1, *(dataptr+3));
729 movq_m2r(*idata2, mm0);
732 punpcklbw_r2r(mm2, mm0);
733 movq_r2m(mm0, *(dataptr+4));
735 punpckhbw_r2r(mm2, mm1);
736 movq_r2m(mm1, *(dataptr+5));
740 movq_m2r(*idata2, mm0);
743 punpcklbw_r2r(mm2, mm0);
744 movq_r2m(mm0, *(dataptr+6));
746 punpckhbw_r2r(mm2, mm1);
747 movq_r2m(mm1, *(dataptr+7));
751 movq_m2r(*idata2, mm0);
754 punpcklbw_r2r(mm2, mm0);
755 movq_r2m(mm0, *(dataptr+8));
757 punpckhbw_r2r(mm2, mm1);
758 movq_r2m(mm1, *(dataptr+9));
762 movq_m2r(*idata2, mm0);
765 punpcklbw_r2r(mm2, mm0);
766 movq_r2m(mm0, *(dataptr+10));
768 punpckhbw_r2r(mm2, mm1);
769 movq_r2m(mm1, *(dataptr+11));
773 movq_m2r(*idata2, mm0);
776 punpcklbw_r2r(mm2, mm0);
777 movq_r2m(mm0, *(dataptr+12));
779 punpckhbw_r2r(mm2, mm1);
780 movq_r2m(mm1, *(dataptr+13));
784 movq_m2r(*idata2, mm0);
787 punpcklbw_r2r(mm2, mm0);
788 movq_r2m(mm0, *(dataptr+14));
790 punpckhbw_r2r(mm2, mm1);
791 movq_r2m(mm1, *(dataptr+15));
795 movq_m2r(*(dataptr+9), mm7);
797 movq_m2r(*(dataptr+13), mm6);
800 punpcklwd_m2r(*(dataptr+11), mm7);
803 punpcklwd_m2r(*(dataptr+15), mm6);
806 movq_m2r(*(dataptr+11), mm3);
807 punpckldq_r2r(mm6, mm7);
809 movq_m2r(*(dataptr+15), mm0);
810 punpckhdq_r2r(mm6, mm1);
812 movq_r2m(mm7,*(dataptr+9));
813 punpckhwd_r2r(mm3, mm5);
815 movq_r2m(mm1,*(dataptr+11));
816 punpckhwd_r2r(mm0, mm2);
819 punpckldq_r2r(mm2, mm5);
821 movq_m2r(*(dataptr+1), mm0);
822 punpckhdq_r2r(mm2, mm1);
824 movq_r2m(mm5,*(dataptr+13));
828 movq_r2m(mm1, *(dataptr+15));
830 movq_m2r(*(dataptr+5), mm2);
833 punpcklwd_m2r(*(dataptr+3), mm0);
836 punpcklwd_m2r(*(dataptr+7), mm2);
840 movq_m2r(*(dataptr+8), mm1);
841 punpckldq_r2r(mm2, mm0);
843 movq_m2r(*(dataptr+12), mm3);
844 punpckhdq_r2r(mm2, mm4);
846 punpckhwd_m2r(*(dataptr+3), mm6);
849 punpckhwd_m2r(*(dataptr+7), mm7);
852 movq_r2m(mm0, *(dataptr+8));
853 punpckhdq_r2r(mm7, mm5);
855 punpcklwd_m2r(*(dataptr+10), mm1);
858 punpckhwd_m2r(*(dataptr+10), mm2);
860 movq_r2m(mm4, *(dataptr+10));
861 punpckldq_r2r(mm7, mm6);
863 punpcklwd_m2r(*(dataptr+14), mm3);
866 movq_r2m(mm6, *(dataptr+12));
867 punpckldq_r2r(mm3, mm1);
869 punpckhwd_m2r(*(dataptr+14), mm0);
872 movq_r2m(mm5, *(dataptr+14));
873 punpckhdq_r2r(mm3, mm4);
875 movq_r2m(mm1, *(dataptr+1));
876 punpckldq_r2r(mm0, mm2);
878 movq_r2m(mm4, *(dataptr+3));
879 punpckhdq_r2r(mm0, mm6);
881 movq_r2m(mm2, *(dataptr+5));
883 movq_m2r(*dataptr, mm0);
885 movq_r2m(mm6, *(dataptr+7));
890 movq_m2r(*(dataptr+4), mm7);
893 punpcklwd_m2r(*(dataptr+2), mm0);
896 punpcklwd_m2r(*(dataptr+6), mm7);
899 movq_m2r(*(dataptr+2), mm6);
900 punpckldq_r2r(mm7, mm0);
902 movq_m2r(*(dataptr+6), mm5);
903 punpckhdq_r2r(mm7, mm1);
906 punpckhwd_r2r(mm6, mm2);
908 psubw_m2r(*(dataptr+14), mm7);
911 paddw_m2r(*(dataptr+14), mm0);
912 punpckhwd_r2r(mm5, mm4);
914 paddw_m2r(*(dataptr+12), mm1);
917 psubw_m2r(*(dataptr+12), mm6);
918 punpckldq_r2r(mm4, mm2);
924 punpckhdq_r2r(mm4, mm3);
926 paddw_m2r(*(dataptr+10), mm2);
934 paddw_m2r(*(dataptr+8), mm3);
937 psubw_m2r(*(dataptr+8), mm4);
946 psubw_m2r(*(dataptr+10), mm5);
960 movq_r2m(mm0, *dataptr);
964 movq_r2m(mm3, *(dataptr+8));
973 movq_r2m(mm0, *(dataptr+4));
978 movq_r2m(mm7, *(dataptr+12));
990 pmulhw_r2r(mm0, mm4);
1000 movq_m2r(*(dataptr+1), mm7);
1001 paddw_r2r(mm1, mm4);
1003 paddw_r2r(mm1, mm2);
1005 paddw_r2r(mm5, mm0);
1006 psubw_r2r(mm5, mm3);
1011 psubw_r2r(mm4, mm3);
1013 paddw_r2r(mm4, mm5);
1016 movq_r2m(mm3, *(dataptr+6));
1017 psubw_r2r(mm2, mm0);
1019 movq_r2m(mm5, *(dataptr+10));
1020 paddw_r2r(mm2, mm6);
1022 movq_r2m(mm0, *(dataptr+14));
1028 movq_m2r(*(dataptr+3), mm1);
1031 movq_r2m(mm6, *(dataptr+2));
1033 movq_m2r(*(dataptr+5), mm2);
1036 paddw_m2r(*(dataptr+15), mm0);
1038 movq_m2r(*(dataptr+7), mm3);
1041 psubw_m2r(*(dataptr+15), mm7);
1044 paddw_m2r(*(dataptr+13), mm1);
1046 movq_r2m(mm7, tmp7);
1049 psubw_m2r(*(dataptr+13), mm6);
1053 paddw_m2r(*(dataptr+9), mm3);
1055 movq_r2m(mm6, tmp6);
1058 paddw_m2r(*(dataptr+11), mm2);
1059 paddw_r2r(mm3, mm0);
1061 psubw_r2r(mm3, mm7);
1063 psubw_m2r(*(dataptr+9), mm4);
1064 psubw_r2r(mm2, mm6);
1066 paddw_r2r(mm2, mm1);
1068 psubw_m2r(*(dataptr+11), mm5);
1069 paddw_r2r(mm7, mm6);
1073 movq_m2r(tmp6, mm2);
1077 paddw_r2r(mm1, mm0);
1080 psubw_r2r(mm1, mm3);
1082 movq_r2m(mm0, *(dataptr+1));
1087 movq_r2m(mm3, *(dataptr+9));
1088 paddw_r2r(mm5, mm4);
1090 movq_m2r(tmp7, mm3);
1091 paddw_r2r(mm6, mm0);
1093 paddw_r2r(mm2, mm5);
1094 psubw_r2r(mm6, mm7);
1096 movq_r2m(mm0, *(dataptr+5));
1097 paddw_r2r(mm3, mm2);
1101 movq_r2m(mm7, *(dataptr+13));
1104 psubw_r2r(mm2, mm1);
1113 pmulhw_r2r(mm0, mm4);
1123 movq_m2r(*(dataptr+9), mm7);
1124 paddw_r2r(mm1, mm4);
1126 paddw_r2r(mm5, mm0);
1127 psubw_r2r(mm5, mm3);
1132 paddw_r2r(mm1, mm2);
1135 psubw_r2r(mm4, mm5);
1137 paddw_r2r(mm2, mm6);
1138 paddw_r2r(mm4, mm3);
1140 movq_r2m(mm5, *(dataptr+7));
1142 movq_r2m(mm6, *(dataptr+3));
1143 psubw_r2r(mm2, mm0);
1149 movq_m2r(*(dataptr+13), mm6);
1152 punpcklwd_r2r(mm3, mm7);
1155 punpcklwd_r2r(mm0, mm6);
1158 punpckldq_r2r(mm6, mm7);
1160 punpckhdq_r2r(mm6, mm1);
1162 movq_r2m(mm7, *(dataptr+9));
1163 punpckhwd_r2r(mm3, mm5);
1165 movq_r2m(mm1, *(dataptr+11));
1166 punpckhwd_r2r(mm0, mm2);
1169 punpckldq_r2r(mm2, mm5);
1171 movq_m2r(*(dataptr+1), mm0);
1172 punpckhdq_r2r(mm2, mm1);
1174 movq_r2m(mm5, *(dataptr+13));
1178 movq_r2m(mm1, *(dataptr+15));
1180 movq_m2r(*(dataptr+5), mm2);
1183 punpcklwd_m2r(*(dataptr+3), mm0);
1186 punpcklwd_m2r(*(dataptr+7), mm2);
1191 movq_m2r(*(dataptr+8), mm1);
1192 punpckldq_r2r(mm2, mm0);
1194 movq_m2r(*(dataptr+12), mm3);
1195 punpckhdq_r2r(mm2, mm4);
1197 punpckhwd_m2r(*(dataptr+3), mm6);
1200 punpckhwd_m2r(*(dataptr+7), mm7);
1203 movq_r2m(mm0, *(dataptr+8));
1204 punpckhdq_r2r(mm7, mm5);
1206 punpcklwd_m2r(*(dataptr+10), mm1);
1209 punpckhwd_m2r(*(dataptr+10), mm2);
1211 movq_r2m(mm4, *(dataptr+10));
1212 punpckldq_r2r(mm7, mm6);
1214 punpcklwd_m2r(*(dataptr+14), mm3);
1217 movq_r2m(mm6, *(dataptr+12));
1218 punpckldq_r2r(mm3, mm1);
1220 punpckhwd_m2r(*(dataptr+14), mm0);
1223 movq_r2m(mm5, *(dataptr+14));
1224 punpckhdq_r2r(mm3, mm4);
1226 movq_r2m(mm1, *(dataptr+1));
1227 punpckldq_r2r(mm0, mm2);
1229 movq_r2m(mm4, *(dataptr+3));
1230 punpckhdq_r2r(mm0, mm6);
1232 movq_r2m(mm2, *(dataptr+5));
1234 movq_m2r(*dataptr, mm0);
1236 movq_r2m(mm6, *(dataptr+7));
1240 movq_m2r(*(dataptr+4), mm7);
1243 punpcklwd_m2r(*(dataptr+2), mm0);
1246 punpcklwd_m2r(*(dataptr+6), mm7);
1249 movq_m2r(*(dataptr+2), mm6);
1250 punpckldq_r2r(mm7, mm0);
1252 movq_m2r(*(dataptr+6), mm5);
1253 punpckhdq_r2r(mm7, mm1);
1256 punpckhwd_r2r(mm6, mm2);
1258 psubw_m2r(*(dataptr+14), mm7);
1261 paddw_m2r(*(dataptr+14), mm0);
1262 punpckhwd_r2r(mm5, mm4);
1264 paddw_m2r(*(dataptr+12), mm1);
1267 psubw_m2r(*(dataptr+12), mm6);
1268 punpckldq_r2r(mm4, mm2);
1270 movq_r2m(mm7, tmp7);
1273 movq_r2m(mm6, tmp6);
1275 punpckhdq_r2r(mm4, mm3);
1277 paddw_m2r(*(dataptr+10), mm2);
1284 paddw_m2r(*(dataptr+8), mm3);
1287 psubw_m2r(*(dataptr+8), mm4);
1290 paddw_r2r(mm3, mm0);
1291 psubw_r2r(mm3, mm7);
1293 psubw_r2r(mm2, mm6);
1294 paddw_r2r(mm2, mm1);
1296 psubw_m2r(*(dataptr+10), mm5);
1297 paddw_r2r(mm7, mm6);
1301 movq_m2r(tmp6, mm2);
1305 paddw_r2r(mm1, mm0);
1308 psubw_r2r(mm1, mm3);
1310 movq_r2m(mm0, *dataptr);
1314 movq_r2m(mm3, *(dataptr+8));
1315 paddw_r2r(mm5, mm4);
1317 movq_m2r(tmp7, mm3);
1318 paddw_r2r(mm6, mm0);
1320 paddw_r2r(mm2, mm5);
1321 psubw_r2r(mm6, mm7);
1323 movq_r2m(mm0, *(dataptr+4));
1324 paddw_r2r(mm3, mm2);
1327 movq_r2m(mm7, *(dataptr+12));
1330 psubw_r2r(mm2, mm1);
1339 pmulhw_r2r(mm0, mm4);
1349 movq_m2r(*(dataptr+1), mm7);
1350 paddw_r2r(mm1, mm4);
1352 paddw_r2r(mm1, mm2);
1354 paddw_r2r(mm5, mm0);
1355 psubw_r2r(mm5, mm3);
1360 psubw_r2r(mm4, mm3);
1362 paddw_r2r(mm4, mm5);
1365 movq_r2m(mm3, *(dataptr+6));
1366 psubw_r2r(mm2, mm0);
1368 movq_r2m(mm5, *(dataptr+10));
1369 paddw_r2r(mm2, mm6);
1371 movq_r2m(mm0, *(dataptr+14));
1377 movq_m2r(*(dataptr+3), mm1);
1380 movq_r2m(mm6, *(dataptr+2));
1382 movq_m2r(*(dataptr+5), mm2);
1385 paddw_m2r(*(dataptr+15), mm0);
1387 movq_m2r(*(dataptr+7), mm3);
1390 psubw_m2r(*(dataptr+15), mm7);
1393 paddw_m2r(*(dataptr+13), mm1);
1395 movq_r2m(mm7, tmp7);
1398 psubw_m2r(*(dataptr+13), mm6);
1402 paddw_m2r(*(dataptr+9), mm3);
1404 movq_r2m(mm6, tmp6);
1407 paddw_m2r(*(dataptr+11), mm2);
1408 paddw_r2r(mm3, mm0);
1410 psubw_r2r(mm3, mm7);
1412 psubw_m2r(*(dataptr+9), mm4);
1413 psubw_r2r(mm2, mm6);
1415 paddw_r2r(mm2, mm1);
1417 psubw_m2r(*(dataptr+11), mm5);
1418 paddw_r2r(mm7, mm6);
1422 movq_m2r(tmp6, mm2);
1426 paddw_r2r(mm1, mm0);
1429 psubw_r2r(mm1, mm3);
1431 movq_r2m(mm0, *(dataptr+1));
1436 movq_r2m(mm3, *(dataptr+9));
1437 paddw_r2r(mm5, mm4);
1439 movq_m2r(tmp7, mm3);
1440 paddw_r2r(mm6, mm0);
1442 paddw_r2r(mm2, mm5);
1443 psubw_r2r(mm6, mm7);
1445 movq_r2m(mm0, *(dataptr+5));
1446 paddw_r2r(mm3, mm2);
1450 movq_r2m(mm7, *(dataptr+13));
1453 psubw_r2r(mm2, mm1);
1462 pmulhw_r2r(mm0, mm4);
1472 movq_m2r(*(dataptr+9), mm7);
1473 paddw_r2r(mm1, mm4);
1475 paddw_r2r(mm5, mm0);
1476 psubw_r2r(mm5, mm3);
1481 paddw_r2r(mm1, mm2);
1484 psubw_r2r(mm4, mm5);
1486 paddw_r2r(mm2, mm6);
1487 paddw_r2r(mm4, mm3);
1489 movq_r2m(mm5, *(dataptr+7));
1490 psubw_r2r(mm2, mm0);
1492 movq_r2m(mm3, *(dataptr+11));
1494 movq_r2m(mm6, *(dataptr+3));
1496 movq_r2m(mm0, *(dataptr+15));
// Fixed-point IDCT multipliers: the named real constant scaled by 2^8
// and rounded (e.g. 1.414213562 * 256 ~= 362).  MULTIPLY() below
// applies them and removes the extra 8 fraction bits again.
static constexpr int32_t FIX_1_082392200 { 277 };
static constexpr int32_t FIX_1_414213562 { 362 };
static constexpr int32_t FIX_1_847759065 { 473 };
static constexpr int32_t FIX_2_613125930 { 669 };
/// Final IDCT descale: drop 3 scaling bits with round-to-nearest
/// (bias of 4 = half of 2^3) and narrow to 16 bits.
static constexpr int16_t DESCALE(int32_t x)
{
    const int32_t rounded = x + 4;
    return static_cast<int16_t>(rounded >> 3);
};
/// Range-limit a reconstructed sample to [16, 235] (presumably the
/// nominal video luma range — confirm against the output consumer).
static inline int16_t RL(int32_t x)
{
    const int32_t floored = std::max(16, x);
    return static_cast<int16_t>(std::min(235, floored));
};
/// Fixed-point multiply with immediate descale: multiply by a 2^8-scaled
/// constant, then round (bias 128 = half of 2^8) and drop the 8 fraction
/// bits, returning a value at the caller's original scale.
static constexpr int32_t MULTIPLY(int32_t var, int32_t constant)
{
    const int32_t product = var * constant;
    return (product + 128) >> 8;
};
1519 for(
int i = 0; i < 64; i++)
1530 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1531 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1532 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1533 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1534 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
1536 auto *wsptr = (mmx_t *)
m_ws.data();
1537 auto *dataptr = (mmx_t *)odata;
1538 auto *idata = (mmx_t *)data.data();
1547 movq_m2r(*(idata+10), mm1);
1549 movq_m2r(*(idata+6), mm0);
1551 movq_m2r(*(idata+2), mm3);
1555 movq_m2r(*(idata+14), mm4);
1557 paddw_r2r(mm0, mm1);
1559 psubw_r2r(mm0, mm2);
1564 pmulhw_m2r(s_fix184n261, mm2);
1567 pmulhw_m2r(s_fixN184, mm0);
1568 paddw_r2r(mm4, mm3);
1571 psubw_r2r(mm4, mm5);
1573 psubw_r2r(mm1, mm6);
1576 movq_m2r(*(idata+12), mm4);
1579 pmulhw_m2r(s_fix108n184, mm5);
1580 paddw_r2r(mm1, mm3);
1585 pmulhw_m2r(s_fix184, mm7);
1588 movq_m2r(*(idata+4), mm1);
1590 paddw_r2r(mm5, mm0);
1592 paddw_r2r(mm7, mm2);
1594 pmulhw_m2r(s_fix141, mm6);
1595 psubw_r2r(mm3, mm2);
1598 paddw_r2r(mm4, mm1);
1600 psubw_r2r(mm4, mm5);
1601 psubw_r2r(mm2, mm6);
1603 movq_r2m(mm1, *(wsptr));
1606 movq_m2r(*(idata), mm7);
1608 pmulhw_m2r(s_fix141, mm5);
1609 paddw_r2r(mm6, mm0);
1611 movq_m2r(*(idata+8), mm4);
1613 psubw_r2r(mm1, mm5);
1615 movq_r2m(mm0, *(wsptr+4));
1618 movq_r2m(mm5, *(wsptr+2));
1619 psubw_r2r(mm4, mm1);
1621 paddw_r2r(mm4, mm7);
1624 paddw_m2r(*(wsptr+2), mm1);
1627 paddw_m2r(*(wsptr), mm7);
1629 psubw_m2r(*(wsptr), mm4);
1632 psubw_m2r(*(wsptr+2), mm5);
1633 paddw_r2r(mm3, mm7);
1635 psubw_r2r(mm3, mm0);
1637 movq_r2m(mm7, *(wsptr));
1640 movq_r2m(mm0, *(wsptr+14));
1641 paddw_r2r(mm2, mm1);
1643 psubw_r2r(mm2, mm3);
1645 movq_r2m(mm1, *(wsptr+2));
1648 movq_r2m(mm3, *(wsptr+12));
1650 paddw_m2r(*(wsptr+4), mm4);
1652 psubw_m2r(*(wsptr+4), mm1);
1654 movq_r2m(mm4, *(wsptr+8));
1657 paddw_r2r(mm6, mm5);
1659 movq_r2m(mm1, *(wsptr+6));
1660 psubw_r2r(mm6, mm7);
1662 movq_r2m(mm5, *(wsptr+4));
1664 movq_r2m(mm7, *(wsptr+10));
1676 movq_m2r(*(idata+10), mm1);
1678 movq_m2r(*(idata+6), mm0);
1680 movq_m2r(*(idata+2), mm3);
1683 movq_m2r(*(idata+14), mm4);
1684 paddw_r2r(mm0, mm1);
1686 psubw_r2r(mm0, mm2);
1691 pmulhw_m2r(s_fix184n261, mm2);
1694 pmulhw_m2r(s_fixN184, mm0);
1695 paddw_r2r(mm4, mm3);
1698 psubw_r2r(mm4, mm5);
1700 psubw_r2r(mm1, mm6);
1703 movq_m2r(*(idata+12), mm4);
1706 pmulhw_m2r(s_fix108n184, mm5);
1707 paddw_r2r(mm1, mm3);
1712 pmulhw_m2r(s_fix184, mm7);
1715 movq_m2r(*(idata+4), mm1);
1717 paddw_r2r(mm5, mm0);
1719 paddw_r2r(mm7, mm2);
1721 pmulhw_m2r(s_fix141, mm6);
1722 psubw_r2r(mm3, mm2);
1725 paddw_r2r(mm4, mm1);
1727 psubw_r2r(mm4, mm5);
1728 psubw_r2r(mm2, mm6);
1730 movq_r2m(mm1, *(wsptr));
1733 movq_m2r(*(idata), mm7);
1734 paddw_r2r(mm6, mm0);
1736 pmulhw_m2r(s_fix141, mm5);
1738 movq_m2r(*(idata+8), mm4);
1740 psubw_r2r(mm1, mm5);
1742 movq_r2m(mm0, *(wsptr+4));
1745 movq_r2m(mm5, *(wsptr+2));
1746 psubw_r2r(mm4, mm1);
1748 paddw_r2r(mm4, mm7);
1751 paddw_m2r(*(wsptr+2), mm1);
1754 paddw_m2r(*(wsptr), mm7);
1756 psubw_m2r(*(wsptr), mm4);
1759 psubw_m2r(*(wsptr+2), mm5);
1760 paddw_r2r(mm3, mm7);
1762 psubw_r2r(mm3, mm0);
1764 movq_r2m(mm7, *(wsptr));
1767 movq_r2m(mm0, *(wsptr+14));
1768 paddw_r2r(mm2, mm1);
1770 psubw_r2r(mm2, mm3);
1772 movq_r2m(mm1, *(wsptr+2));
1775 movq_r2m(mm3, *(wsptr+12));
1777 paddw_m2r(*(wsptr+4), mm4);
1779 psubw_m2r(*(wsptr+4), mm1);
1781 movq_r2m(mm4, *(wsptr+8));
1784 paddw_r2r(mm6, mm5);
1786 movq_r2m(mm1, *(wsptr+6));
1787 psubw_r2r(mm6, mm7);
1789 movq_r2m(mm5, *(wsptr+4));
1791 movq_r2m(mm7, *(wsptr+10));
1808 movq_m2r(*(wsptr), mm0);
1810 movq_m2r(*(wsptr+1), mm1);
1813 movq_m2r(*(wsptr+2), mm3);
1814 paddw_r2r(mm1, mm0);
1816 movq_m2r(*(wsptr+3), mm4);
1817 psubw_r2r(mm1, mm2);
1822 paddw_r2r(mm4, mm3);
1825 psubw_r2r(mm4, mm5);
1826 punpcklwd_r2r(mm3, mm0);
1828 movq_m2r(*(wsptr+7), mm7);
1829 punpckhwd_r2r(mm3, mm6);
1831 movq_m2r(*(wsptr+4), mm3);
1832 punpckldq_r2r(mm6, mm0);
1834 punpcklwd_r2r(mm5, mm1);
1837 movq_m2r(*(wsptr+6), mm6);
1838 punpckhwd_r2r(mm5, mm2);
1840 movq_m2r(*(wsptr+5), mm5);
1841 punpckldq_r2r(mm2, mm1);
1844 paddw_r2r(mm5, mm3);
1847 psubw_r2r(mm5, mm4);
1848 paddw_r2r(mm7, mm6);
1851 punpcklwd_r2r(mm6, mm3);
1853 psubw_r2r(mm7, mm2);
1854 punpckhwd_r2r(mm6, mm5);
1857 punpckldq_r2r(mm5, mm3);
1859 punpcklwd_r2r(mm2, mm4);
1861 punpckhwd_r2r(mm2, mm7);
1863 punpckldq_r2r(mm7, mm4);
1873 punpckhdq_r2r(mm4, mm6);
1875 punpckldq_r2r(mm4, mm1);
1878 pmulhw_m2r(s_fix141, mm6);
1879 punpckldq_r2r(mm3, mm0);
1881 punpckhdq_r2r(mm3, mm2);
1886 paddw_r2r(mm2, mm0);
1887 psubw_r2r(mm2, mm7);
1890 psubw_r2r(mm2, mm6);
1903 movq_m2r(*(wsptr), mm3);
1904 paddw_r2r(mm6, mm1);
1906 movq_m2r(*(wsptr+1), mm4);
1907 psubw_r2r(mm6, mm5);
1910 punpckldq_r2r(mm4, mm3);
1912 punpckhdq_r2r(mm6, mm4);
1916 movq_r2m(mm0, *(wsptr));
1917 paddw_r2r(mm4, mm2);
1921 movq_m2r(*(wsptr+2), mm6);
1922 psubw_r2r(mm4, mm3);
1924 movq_m2r(*(wsptr+3), mm0);
1927 movq_r2m(mm1, *(wsptr+1));
1928 punpckldq_r2r(mm0, mm6);
1930 punpckhdq_r2r(mm4, mm0);
1934 paddw_r2r(mm0, mm6);
1938 movq_r2m(mm5, *(wsptr+2));
1939 punpcklwd_r2r(mm6, mm2);
1941 psubw_r2r(mm0, mm1);
1942 punpckhwd_r2r(mm6, mm4);
1945 punpcklwd_r2r(mm1, mm3);
1947 movq_r2m(mm7, *(wsptr+3));
1948 punpckhwd_r2r(mm1, mm0);
1950 movq_m2r(*(wsptr+4), mm6);
1951 punpckhdq_r2r(mm2, mm0);
1953 movq_m2r(*(wsptr+5), mm7);
1954 punpckhdq_r2r(mm4, mm3);
1956 movq_m2r(*(wsptr+6), mm1);
1959 punpckldq_r2r(mm7, mm6);
1962 punpckhdq_r2r(mm4, mm7);
1965 movq_m2r(*(wsptr+7), mm4);
1966 paddw_r2r(mm7, mm6);
1968 psubw_r2r(mm7, mm2);
1969 punpckldq_r2r(mm4, mm1);
1971 punpckhdq_r2r(mm5, mm4);
1974 paddw_r2r(mm4, mm1);
1975 psubw_r2r(mm4, mm7);
1978 punpcklwd_r2r(mm1, mm6);
1980 punpckhwd_r2r(mm1, mm5);
1983 punpcklwd_r2r(mm7, mm2);
1985 punpckhwd_r2r(mm7, mm4);
1987 punpckhdq_r2r(mm6, mm4);
1989 punpckhdq_r2r(mm5, mm2);
1992 punpckldq_r2r(mm4, mm0);
1994 punpckhdq_r2r(mm4, mm5);
1997 punpckhdq_r2r(mm2, mm4);
2000 punpckldq_r2r(mm2, mm3);
2003 psubw_r2r(mm4, mm1);
2005 paddw_r2r(mm4, mm5);
2011 pmulhw_m2r(s_fix141, mm1);
2017 pmulhw_m2r(s_fixN184, mm7);
2020 movq_m2r(*(wsptr), mm2);
2022 pmulhw_m2r(s_fix108n184, mm6);
2027 pmulhw_m2r(s_fix184n261, mm0);
2028 paddw_r2r(mm5, mm2);
2030 pmulhw_m2r(s_fix184, mm3);
2031 psubw_r2r(mm5, mm4);
2036 paddw_r2r(mm6, mm7);
2039 paddw_r2r(mm0, mm3);
2042 psubw_r2r(mm5, mm3);
2045 movq_m2r(*(wsptr+1), mm0);
2046 psubw_r2r(mm3, mm1);
2049 paddw_r2r(mm3, mm0);
2064 psubw_r2r(mm3, mm6);
2069 packuswb_r2r(mm4, mm0);
2071 movq_m2r(*(wsptr+2), mm5);
2072 packuswb_r2r(mm6, mm2);
2078 paddw_r2r(mm1, mm7);
2081 paddw_r2r(mm1, mm5);
2082 psubw_r2r(mm1, mm3);
2086 movq_m2r(*(wsptr+3), mm4);
2096 paddw_r2r(mm7, mm4);
2098 psubw_r2r(mm7, mm6);
2105 packuswb_r2r(mm4, mm5);
2107 packuswb_r2r(mm3, mm6);
2111 punpcklbw_r2r(mm0, mm2);
2113 punpckhbw_r2r(mm0, mm4);
2116 punpcklbw_r2r(mm6, mm5);
2120 punpckhbw_r2r(mm6, mm7);
2122 punpcklwd_r2r(mm5, mm2);
2127 punpckhwd_r2r(mm5, mm1);
2130 punpcklwd_r2r(mm4, mm6);
2134 punpckldq_r2r(mm6, mm2);
2142 punpckhwd_r2r(mm4, mm7);
2144 movq_r2m(mm2, *(dataptr));
2146 punpckhdq_r2r(mm6, mm0);
2149 movq_r2m(mm0, *(dataptr));
2151 punpckldq_r2r(mm7, mm1);
2152 punpckhdq_r2r(mm7, mm3);
2155 movq_r2m(mm1, *(dataptr));
2158 movq_r2m(mm3, *(dataptr));
2170 movq_m2r(*(wsptr), mm0);
2172 movq_m2r(*(wsptr+1), mm1);
2175 movq_m2r(*(wsptr+2), mm3);
2176 paddw_r2r(mm1, mm0);
2178 movq_m2r(*(wsptr+3), mm4);
2179 psubw_r2r(mm1, mm2);
2184 paddw_r2r(mm4, mm3);
2187 psubw_r2r(mm4, mm5);
2188 punpcklwd_r2r(mm3, mm0);
2190 movq_m2r(*(wsptr+7), mm7);
2191 punpckhwd_r2r(mm3, mm6);
2193 movq_m2r(*(wsptr+4), mm3);
2194 punpckldq_r2r(mm6, mm0);
2196 punpcklwd_r2r(mm5, mm1);
2199 movq_m2r(*(wsptr+6), mm6);
2200 punpckhwd_r2r(mm5, mm2);
2202 movq_m2r(*(wsptr+5), mm5);
2203 punpckldq_r2r(mm2, mm1);
2205 paddw_r2r(mm5, mm3);
2208 psubw_r2r(mm5, mm4);
2209 paddw_r2r(mm7, mm6);
2212 punpcklwd_r2r(mm6, mm3);
2214 psubw_r2r(mm7, mm2);
2215 punpckhwd_r2r(mm6, mm5);
2218 punpckldq_r2r(mm5, mm3);
2220 punpcklwd_r2r(mm2, mm4);
2222 punpckhwd_r2r(mm2, mm7);
2224 punpckldq_r2r(mm7, mm4);
2233 punpckhdq_r2r(mm4, mm6);
2235 punpckldq_r2r(mm4, mm1);
2238 pmulhw_m2r(s_fix141, mm6);
2239 punpckldq_r2r(mm3, mm0);
2241 punpckhdq_r2r(mm3, mm2);
2246 paddw_r2r(mm2, mm0);
2247 psubw_r2r(mm2, mm7);
2250 psubw_r2r(mm2, mm6);
2264 movq_m2r(*(wsptr), mm3);
2265 paddw_r2r(mm6, mm1);
2267 movq_m2r(*(wsptr+1), mm4);
2268 psubw_r2r(mm6, mm5);
2271 punpckldq_r2r(mm4, mm3);
2273 punpckhdq_r2r(mm6, mm4);
2277 movq_r2m(mm0, *(wsptr));
2278 paddw_r2r(mm4, mm2);
2282 movq_m2r(*(wsptr+2), mm6);
2283 psubw_r2r(mm4, mm3);
2285 movq_m2r(*(wsptr+3), mm0);
2288 movq_r2m(mm1, *(wsptr+1));
2289 punpckldq_r2r(mm0, mm6);
2291 punpckhdq_r2r(mm4, mm0);
2295 paddw_r2r(mm0, mm6);
2299 movq_r2m(mm5, *(wsptr+2));
2300 punpcklwd_r2r(mm6, mm2);
2302 psubw_r2r(mm0, mm1);
2303 punpckhwd_r2r(mm6, mm4);
2306 punpcklwd_r2r(mm1, mm3);
2308 movq_r2m(mm7, *(wsptr+3));
2309 punpckhwd_r2r(mm1, mm0);
2311 movq_m2r(*(wsptr+4), mm6);
2312 punpckhdq_r2r(mm2, mm0);
2314 movq_m2r(*(wsptr+5), mm7);
2315 punpckhdq_r2r(mm4, mm3);
2317 movq_m2r(*(wsptr+6), mm1);
2320 punpckldq_r2r(mm7, mm6);
2323 punpckhdq_r2r(mm4, mm7);
2326 movq_m2r(*(wsptr+7), mm4);
2327 paddw_r2r(mm7, mm6);
2329 psubw_r2r(mm7, mm2);
2330 punpckldq_r2r(mm4, mm1);
2332 punpckhdq_r2r(mm5, mm4);
2335 paddw_r2r(mm4, mm1);
2336 psubw_r2r(mm4, mm7);
2339 punpcklwd_r2r(mm1, mm6);
2341 punpckhwd_r2r(mm1, mm5);
2344 punpcklwd_r2r(mm7, mm2);
2346 punpckhwd_r2r(mm7, mm4);
2348 punpckhdq_r2r(mm6, mm4);
2350 punpckhdq_r2r(mm5, mm2);
2353 punpckldq_r2r(mm4, mm0);
2355 punpckhdq_r2r(mm4, mm5);
2358 punpckhdq_r2r(mm2, mm4);
2361 punpckldq_r2r(mm2, mm3);
2364 psubw_r2r(mm4, mm1);
2366 paddw_r2r(mm4, mm5);
2372 pmulhw_m2r(s_fix141, mm1);
2378 pmulhw_m2r(s_fixN184, mm7);
2381 movq_m2r(*(wsptr), mm2);
2383 pmulhw_m2r(s_fix108n184, mm6);
2388 pmulhw_m2r(s_fix184n261, mm0);
2389 paddw_r2r(mm5, mm2);
2391 pmulhw_m2r(s_fix184, mm3);
2392 psubw_r2r(mm5, mm4);
2397 paddw_r2r(mm6, mm7);
2400 paddw_r2r(mm0, mm3);
2403 psubw_r2r(mm5, mm3);
2406 movq_m2r(*(wsptr+1), mm0);
2407 psubw_r2r(mm3, mm1);
2410 paddw_r2r(mm3, mm0);
2424 psubw_r2r(mm3, mm6);
2429 packuswb_r2r(mm4, mm0);
2431 movq_m2r(*(wsptr+2), mm5);
2432 packuswb_r2r(mm6, mm2);
2438 paddw_r2r(mm1, mm7);
2441 paddw_r2r(mm1, mm5);
2442 psubw_r2r(mm1, mm3);
2446 movq_m2r(*(wsptr+3), mm4);
2456 paddw_r2r(mm7, mm4);
2458 psubw_r2r(mm7, mm6);
2471 packuswb_r2r(mm4, mm5);
2473 packuswb_r2r(mm3, mm6);
2477 punpcklbw_r2r(mm0, mm2);
2479 punpckhbw_r2r(mm0, mm4);
2482 punpcklbw_r2r(mm6, mm5);
2484 punpckhbw_r2r(mm6, mm7);
2486 punpcklwd_r2r(mm5, mm2);
2489 punpckhwd_r2r(mm5, mm1);
2492 punpcklwd_r2r(mm4, mm6);
2494 punpckldq_r2r(mm6, mm2);
2498 punpckhwd_r2r(mm4, mm7);
2501 movq_r2m(mm2, *(dataptr));
2503 punpckhdq_r2r(mm6, mm0);
2506 movq_r2m(mm0, *(dataptr));
2508 punpckldq_r2r(mm7, mm1);
2510 punpckhdq_r2r(mm7, mm3);
2513 movq_r2m(mm1, *(dataptr));
2516 movq_r2m(mm3, *(dataptr));
2519 int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2520 int32_t tmp10, tmp11, tmp12, tmp13;
2521 int32_t z5, z10, z11, z12, z13;
2528 inptr = data.data();
2529 wsptr =
m_ws.data();
2530 for (ctr = 8; ctr > 0; ctr--) {
2532 if ((inptr[8] | inptr[16] | inptr[24] |
2533 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2554 tmp10 = tmp0 + tmp2;
2555 tmp11 = tmp0 - tmp2;
2557 tmp13 = tmp1 + tmp3;
2558 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2560 tmp0 = tmp10 + tmp13;
2561 tmp3 = tmp10 - tmp13;
2562 tmp1 = tmp11 + tmp12;
2563 tmp2 = tmp11 - tmp12;
2576 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2578 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2579 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2580 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2582 tmp6 = tmp12 - tmp7;
2583 tmp5 = tmp11 - tmp6;
2584 tmp4 = tmp10 + tmp5;
2586 wsptr[0] = (int32_t) (tmp0 + tmp7);
2587 wsptr[56] = (int32_t) (tmp0 - tmp7);
2588 wsptr[8] = (int32_t) (tmp1 + tmp6);
2589 wsptr[48] = (int32_t) (tmp1 - tmp6);
2590 wsptr[16] = (int32_t) (tmp2 + tmp5);
2591 wsptr[40] = (int32_t) (tmp2 - tmp5);
2592 wsptr[32] = (int32_t) (tmp3 + tmp4);
2593 wsptr[24] = (int32_t) (tmp3 - tmp4);
2599 wsptr =
m_ws.data();
2600 for (ctr = 0; ctr < 8; ctr++) {
2601 outptr = &(odata[ctr*rskip]);
2603 tmp10 = wsptr[0] + wsptr[4];
2604 tmp11 = wsptr[0] - wsptr[4];
2606 tmp13 = wsptr[2] + wsptr[6];
2607 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2609 tmp0 = tmp10 + tmp13;
2610 tmp3 = tmp10 - tmp13;
2611 tmp1 = tmp11 + tmp12;
2612 tmp2 = tmp11 - tmp12;
2614 z13 = wsptr[5] + wsptr[3];
2615 z10 = wsptr[5] - wsptr[3];
2616 z11 = wsptr[1] + wsptr[7];
2617 z12 = wsptr[1] - wsptr[7];
2620 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2622 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2623 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2624 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2626 tmp6 = tmp12 - tmp7;
2627 tmp5 = tmp11 - tmp6;
2628 tmp4 = tmp10 + tmp5;
2630 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2631 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2632 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2633 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2634 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2635 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2636 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2637 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2646 uint64_t qual = (uint64_t)
m_q << (32 - 7);
2648 for(
int i = 0; i < 64; i++)
2700 if ((*w < 0) || (*w > 65535))
2702 if ((*h < 0) || (*h > 65535))
2725 fprintf(
stderr,
"RTjpeg: Could not allocate memory\n");
2751 m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2752 m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
2767 fprintf(
stderr,
"RTjpeg: Could not allocate memory\n");
2795 uint8_t * bp = planes[0];
2796 uint8_t * bp1 = bp + (
m_width<<3);
2797 uint8_t * bp2 = planes[1];
2798 uint8_t * bp3 = planes[2];
2805 for(
int i =
m_height >> 1; i; i -= 8)
2807 for(
int j = 0, k = 0; j <
m_width; j += 16, k += 8)
2846 uint8_t * bp = planes[0];
2847 uint8_t * bp2 = planes[1];
2848 uint8_t * bp3 = planes[2];
2857 for(
int j=0, k=0; j<
m_width; j+=16, k+=8)
2889 int8_t * sb =
nullptr;
2890 uint8_t * bp = planes[0];
2917 uint8_t * bp = planes[0];
2918 uint8_t * bp2 = planes[1];
2919 uint8_t * bp3 = planes[2];
2928 for(
int k=0, j=0; j<
m_width; j+=16, k+=8) {
2965 uint8_t * bp = planes[0];
2966 uint8_t * bp1 = bp + (
m_width<<3);
2967 uint8_t * bp2 = planes[1];
2968 uint8_t * bp3 = planes[2];
2977 for(
int k=0, j=0; j<
m_width; j+=16, k+=8) {
3027 uint8_t * bp = planes[0];
3053 auto *mold=(mmx_t *)_old;
3054 auto *mblock=(mmx_t *)rblock.data();
3055 volatile mmx_t result {};
3056 static mmx_t s_neg= { 0xffffffffffffffffULL };
3058 movq_m2r(*mask, mm7);
3059 movq_m2r(s_neg, mm6);
3062 for(
int i=0; i<8; i++)
3064 movq_m2r(*(mblock++), mm0);
3065 movq_m2r(*(mblock++), mm2);
3066 movq_m2r(*(mold++), mm1);
3067 movq_m2r(*(mold++), mm3);
3068 psubsw_r2r(mm1, mm0);
3069 psubsw_r2r(mm3, mm2);
3072 pcmpgtw_r2r(mm7, mm0);
3073 pcmpgtw_r2r(mm7, mm2);
3076 pcmpgtw_r2r(mm7, mm1);
3077 pcmpgtw_r2r(mm7, mm3);
3083 movq_r2m(mm5, result);
3087 std::copy(rblock.cbegin(), rblock.cend(), _old);
3096 for(
int i=0; i<64; i++)
3097 if (abs(_old[i]-rblock[i])>*mask)
3099 std::copy(rblock.cbegin(), rblock.cend(), _old);
3108 uint8_t * bp = planes[0];
3109 uint8_t * bp1 = bp + (
m_width<<3);
3110 uint8_t * bp2 = planes[1];
3111 uint8_t * bp3 = planes[2];
3113 int16_t * lblock =
m_old;
3118 for(
int j=0, k=0; j <
m_width; j+=16, k+=8)
3124 *((uint8_t *)sp++)=255;
3133 *((uint8_t *)sp++)=255;
3142 *((uint8_t *)sp++)=255;
3151 *((uint8_t *)sp++)=255;
3160 *((uint8_t *)sp++)=255;
3170 *((uint8_t *)sp++)=255;
3190 uint8_t * bp = planes[0];
3191 uint8_t * bp2 = planes[1];
3192 uint8_t * bp3 = planes[2];
3194 int16_t *lblock =
m_old;
3198 for(
int j=0, k=0; j<
m_width; j+=16, k+=8)
3204 *((uint8_t *)sp++)=255;
3213 *((uint8_t *)sp++)=255;
3222 *((uint8_t *)sp++)=255;
3231 *((uint8_t *)sp++)=255;
3249 uint8_t * bp = planes[0];
3251 int16_t *lblock =
m_old;
3261 *((uint8_t *)sp++)=255;
3306 fh->framesize = qToLittleEndian<qint32>(ds);
3309 fh->width = qToLittleEndian<qint16>(
m_width);
3310 fh->height = qToLittleEndian<qint16>(
m_height);
3319 if ((qFromLittleEndian<qint16>(fh->width) !=
m_width)||
3320 (qFromLittleEndian<qint16>(fh->height) !=
m_height))
3322 int w = qFromLittleEndian<qint16>(fh->width);
3323 int h = qFromLittleEndian<qint16>(fh->height);
3326 if (fh->quality !=
m_q)
3328 int q = fh->quality;