47 #define BETTERCOMPRESSION 1 // NOLINT(cppcoreguidelines-macro-usage)
49 static const std::array<const uint8_t,64> RTjpeg_ZZ {
55 40, 33, 26, 19, 12, 5,
56 6, 13, 20, 27, 34, 41, 48,
57 56, 49, 42, 35, 28, 21, 14, 7,
58 15, 22, 29, 36, 43, 50, 57,
59 58, 51, 44, 37, 30, 23,
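// RTjpeg_ZZ appears to be the standard JPEG zig-zag scan order for an 8x8
// block: indexing data[RTjpeg_ZZ[i]] walks the coefficients from the DC term
// outwards along the anti-diagonals.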
67 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
68 5957222912ULL, 8263040512ULL, 7783580160ULL, 7005009920ULL, 5957222912ULL, 4680582144ULL, 3224107520ULL, 1643641088ULL,
69 5611718144ULL, 7783580160ULL, 7331904512ULL, 6598688768ULL, 5611718144ULL, 4408998912ULL, 3036936960ULL, 1548224000ULL,
70 5050464768ULL, 7005009920ULL, 6598688768ULL, 5938608128ULL, 5050464768ULL, 3968072960ULL, 2733115392ULL, 1393296000ULL,
71 4294967296ULL, 5957222912ULL, 5611718144ULL, 5050464768ULL, 4294967296ULL, 3374581504ULL, 2324432128ULL, 1184891264ULL,
72 3374581504ULL, 4680582144ULL, 4408998912ULL, 3968072960ULL, 3374581504ULL, 2651326208ULL, 1826357504ULL, 931136000ULL,
73 2324432128ULL, 3224107520ULL, 3036936960ULL, 2733115392ULL, 2324432128ULL, 1826357504ULL, 1258030336ULL, 641204288ULL,
74 1184891264ULL, 1643641088ULL, 1548224000ULL, 1393296000ULL, 1184891264ULL, 931136000ULL, 641204288ULL, 326894240ULL,
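// These 64 values look like the AAN DCT scale factors in 32.32 fixed point:
// entry (u,v) is roughly 2^32 * s(u) * s(v), with s(0) = 1 and
// s(k) = sqrt(2)*cos(k*pi/16); the first entry is exactly 2^32 and the second
// is about 1.387 * 2^32.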
78 16, 11, 10, 16, 24, 40, 51, 61,
79 12, 12, 14, 19, 26, 58, 60, 55,
80 14, 13, 16, 24, 40, 57, 69, 56,
81 14, 17, 22, 29, 51, 87, 80, 62,
82 18, 22, 37, 56, 68, 109, 103, 77,
83 24, 35, 55, 64, 81, 104, 113, 92,
84 49, 64, 78, 87, 103, 121, 120, 101,
85 72, 92, 95, 98, 112, 100, 103, 99
89 17, 18, 24, 47, 99, 99, 99, 99,
90 18, 21, 26, 66, 99, 99, 99, 99,
91 24, 26, 56, 99, 99, 99, 99, 99,
92 47, 66, 99, 99, 99, 99, 99, 99,
93 99, 99, 99, 99, 99, 99, 99, 99,
94 99, 99, 99, 99, 99, 99, 99, 99,
95 99, 99, 99, 99, 99, 99, 99, 99,
96 99, 99, 99, 99, 99, 99, 99, 99
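// The two 8x8 tables above match the example luminance and chrominance
// quantisation tables from Annex K of the JPEG specification (ITU-T T.81);
// they are presumably rescaled by the quality setting elsewhere in this file.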
99 #ifdef BETTERCOMPRESSION
120 auto *ustrm = (uint8_t *)strm;
124 for (ii=0; ii < 64; ii++) {
138 ustrm[0] = static_cast<uint8_t>(std::clamp(value, 0, 254));
141 while (data[RTjpeg_ZZ[ci]]==0 && ci>0) ci--;
143 unsigned char bitten = ((unsigned char)ci) << 2;
152 unsigned char bitoff = 0;
163 bitten |= (0x01<<bitoff);
166 bitten |= (0x03<<bitoff);
169 bitten |= (0x02<<bitoff);
217 if ( (ZZvalue > 7) || (ZZvalue < -7) ) {
218 bitten |= (0x08<<bitoff);
222 bitten |= (ZZvalue&0xf)<<bitoff;
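// Packing scheme suggested by the tests above: coefficients with |value| <= 7
// are stored as signed 4-bit nibbles (two per byte, selected via bitoff), and
// the 0x08 bit marks an escape to a full byte for larger values; the decode
// path below checks the same 0x08 flag.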
253 strm[co++] = static_cast<int8_t>((ZZvalue > 127) ? 127 : ZZvalue);
257 strm[co++] = static_cast<int8_t>((ZZvalue < -128) ? -128 : ZZvalue);
268 fprintf(stdout, "\nco = '%d'\n", co);
269 for (i=0; i < co+2; i++) {
270 fprintf(stdout, "%d ", strm[i]);
285 auto *qtbl = (uint32_t *)qtbla.data();
287 unsigned char bitoff = 0;
291 data[i]=((uint8_t)strm[0])*qtbl[i];
295 unsigned char bitten = ((unsigned char)strm[1]) >> 2;
297 for(; co > bitten; co--) {
314 bitten = ((unsigned char)strm[ci]) >> bitoff;
370 bitten = ((unsigned char)strm[ci]) >> bitoff;
375 if ( bitten == 0x08 ) {
380 if ( bitten & 0x08 ) {
385 data[i]=((signed char)bitten)*qtbl[i];
405 data[i]=strm[ci++]*qtbl[i];
413 fprintf(stdout, "\nci = '%d'\n", ci);
414 for (i=0; i < 64; i++) {
425 int RTjpeg::b2s(const int16_t *data, int8_t *strm, uint8_t bt8)
427 int ci, co=1, tmp;
428 int16_t ZZvalue;
433 for (ii=0; ii < 64; ii++) {
442 for(ci=1; ci<=bt8; ci++)
448 strm[co++] = static_cast<int8_t>((ZZvalue > 127) ? 127 : ZZvalue);
452 strm[co++] = static_cast<int8_t>((ZZvalue < -128) ? -128 : ZZvalue);
462 strm[co++] = static_cast<int8_t>((ZZvalue > 63) ? 63 : ZZvalue);
466 strm[co++] = static_cast<int8_t>((ZZvalue < -64) ? -64 : ZZvalue);
474 } while((ci<64)&&(data[RTjpeg_ZZ[ci]]==0));
476 strm[co++]=(int8_t)(63+(ci-tmp));
483 int RTjpeg::s2b(int16_t *data, const int8_t *strm, uint8_t bt8, uint32_t *qtbla)
485 uint32_t *qtbl = qtbla;
490 data[i]=((uint8_t)strm[0])*qtbl[i];
492 for(co=1; co<=bt8; co++)
495 data[i]=strm[ci++]*qtbl[i];
508 data[i]=strm[ci]*qtbl[i];
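// Taken together, b2s() appears to zig-zag scan a quantised 8x8 block and
// run-length pack it into the signed byte stream (returning the number of
// bytes written), while s2b() walks the stream back out and multiplies each
// coefficient by the dequantisation table. A rough in-class usage sketch, with
// illustrative buffer names only:
//     int len = b2s(block, strm, bt8);   // encode one block
//     s2b(block, strm, bt8, qtbl);       // decode and dequantise it again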
519 using P16_32 = union { int16_t *m_int16; int32_t *m_int32; };
522 qtbl.m_int32 = m_lqt.data();
523 for (int i = 0; i < 64; i++)
524     qtbl.m_int16[i] = static_cast<int16_t>(m_lqt[i]);
527 qtbl.m_int32 = m_cqt.data();
528 for (int i = 0; i < 64; i++)
529     qtbl.m_int16[i] = static_cast<int16_t>(m_cqt[i]);
534 auto *ql=(mmx_t *)qtbl.data();
535 auto *bl=(mmx_t *)_block.data();
540 for(int i=16; i; i--)
542 movq_m2r(*(ql++), mm0);
547 punpcklwd_r2r(mm6, mm0);
548 punpckhwd_r2r(mm6, mm1);
550 punpcklwd_r2r(mm7, mm2);
551 punpckhwd_r2r(mm7, mm3);
553 pmaddwd_r2r(mm2, mm0);
554 pmaddwd_r2r(mm3, mm1);
559 packssdw_r2r(mm1, mm0);
561 movq_r2m(mm0, *(bl++));
574 _block[i]=(int16_t)((_block[i]*qtbl[i]+32767)>>16);
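// The scalar fallback above spells out what the MMX loop is computing: each
// coefficient is multiplied by its quantisation-table entry and the 32-bit
// product is reduced to 16 bits with rounding (the +32767 bias before >>16).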
582 static constexpr int32_t FIX_0_382683433 { 98 };
583 static constexpr int32_t FIX_0_541196100 { 139 };
584 static constexpr int32_t FIX_0_707106781 { 181 };
585 static constexpr int32_t FIX_1_306562965 { 334 };
587 static constexpr int16_t DESCALE10(int32_t x) { return static_cast<int16_t>((x+128) >> 8); };
588 static constexpr int16_t DESCALE20(int32_t x) { return static_cast<int16_t>((x+32768) >> 16); };
589 static constexpr int32_t D_MULTIPLY(int32_t var, int32_t constant) { return var * constant; };
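// The FIX_* constants are the usual AAN DCT multipliers scaled by 2^8
// (e.g. 181 ~= 0.70710678 * 256), so D_MULTIPLY() leaves 8 extra fraction
// bits; DESCALE10() rounds 8 of them away, and DESCALE20() rounds away 16 in
// the column pass, whose inputs already carry 8 fraction bits from the <<8 in
// the row pass.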
594 for (int i = 0; i < 64; i++)
604 uint8_t *idataptr = idata;
605 int32_t *wsptr = m_ws.data();
607 for (int ctr = 7; ctr >= 0; ctr--) {
608 int32_t tmp0 = idataptr[0] + idataptr[7];
609 int32_t tmp7 = idataptr[0] - idataptr[7];
610 int32_t tmp1 = idataptr[1] + idataptr[6];
611 int32_t tmp6 = idataptr[1] - idataptr[6];
612 int32_t tmp2 = idataptr[2] + idataptr[5];
613 int32_t tmp5 = idataptr[2] - idataptr[5];
614 int32_t tmp3 = idataptr[3] + idataptr[4];
615 int32_t tmp4 = idataptr[3] - idataptr[4];
617 int32_t tmp10 = (tmp0 + tmp3);
618 int32_t tmp13 = tmp0 - tmp3;
619 int32_t tmp11 = (tmp1 + tmp2);
620 int32_t tmp12 = tmp1 - tmp2;
622 wsptr[0] = (tmp10 + tmp11)<<8;
623 wsptr[4] = (tmp10 - tmp11)<<8;
625 int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
626 wsptr[2] = (tmp13<<8) + z1;
627 wsptr[6] = (tmp13<<8) - z1;
633 int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
634 int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5;
635 int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5;
636 int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781);
638 int32_t z11 = (tmp7<<8) + z3;
639 int32_t z13 = (tmp7<<8) - z3;
646 idataptr += rskip<<3;
651 int16_t *odataptr = m_block.data();
652 for (int ctr = 7; ctr >= 0; ctr--) {
653 int32_t tmp0 = wsptr[0] + wsptr[56];
654 int32_t tmp7 = wsptr[0] - wsptr[56];
655 int32_t tmp1 = wsptr[8] + wsptr[48];
656 int32_t tmp6 = wsptr[8] - wsptr[48];
657 int32_t tmp2 = wsptr[16] + wsptr[40];
658 int32_t tmp5 = wsptr[16] - wsptr[40];
659 int32_t tmp3 = wsptr[24] + wsptr[32];
660 int32_t tmp4 = wsptr[24] - wsptr[32];
662 int32_t tmp10 = tmp0 + tmp3;
663 int32_t tmp13 = tmp0 - tmp3;
664 int32_t tmp11 = tmp1 + tmp2;
665 int32_t tmp12 = tmp1 - tmp2;
667 odataptr[0] = DESCALE10(tmp10 + tmp11);
668 odataptr[32] = DESCALE10(tmp10 - tmp11);
670 int32_t z1 = D_MULTIPLY(tmp12 + tmp13, FIX_0_707106781);
671 odataptr[16] = DESCALE20((tmp13<<8) + z1);
672 odataptr[48] = DESCALE20((tmp13<<8) - z1);
678 int32_t z5 = D_MULTIPLY(tmp10 - tmp12, FIX_0_382683433);
679 int32_t z2 = D_MULTIPLY(tmp10, FIX_0_541196100) + z5;
680 int32_t z4 = D_MULTIPLY(tmp12, FIX_1_306562965) + z5;
681 int32_t z3 = D_MULTIPLY(tmp11, FIX_0_707106781);
683 int32_t z11 = (tmp7<<8) + z3;
684 int32_t z13 = (tmp7<<8) - z3;
686 odataptr[40] = DESCALE20(z13 + z2);
687 odataptr[24] = DESCALE20(z13 - z2);
688 odataptr[8] = DESCALE20(z11 + z4);
689 odataptr[56] = DESCALE20(z11 - z4);
696 volatile mmx_t tmp6 {};
697 volatile mmx_t tmp7 {};
698 auto *dataptr = (mmx_t *)m_block.data();
699 auto *idata2 = (mmx_t *)idata;
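// The unrolled sequence below appears to load the 8x8 block of input bytes and
// widen it to 16-bit words: punpcklbw/punpckhbw interleave each row with a
// register assumed to hold zero (mm2), and the results land in the sixteen
// quadwords at dataptr[0..15].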
706 movq_m2r(*idata2, mm0);
709 punpcklbw_r2r(mm2, mm0);
710 movq_r2m(mm0, *(dataptr));
712 punpckhbw_r2r(mm2, mm1);
713 movq_r2m(mm1, *(dataptr+1));
717 movq_m2r(*idata2, mm0);
720 punpcklbw_r2r(mm2, mm0);
721 movq_r2m(mm0, *(dataptr+2));
723 punpckhbw_r2r(mm2, mm1);
724 movq_r2m(mm1, *(dataptr+3));
728 movq_m2r(*idata2, mm0);
731 punpcklbw_r2r(mm2, mm0);
732 movq_r2m(mm0, *(dataptr+4));
734 punpckhbw_r2r(mm2, mm1);
735 movq_r2m(mm1, *(dataptr+5));
739 movq_m2r(*idata2, mm0);
742 punpcklbw_r2r(mm2, mm0);
743 movq_r2m(mm0, *(dataptr+6));
745 punpckhbw_r2r(mm2, mm1);
746 movq_r2m(mm1, *(dataptr+7));
750 movq_m2r(*idata2, mm0);
753 punpcklbw_r2r(mm2, mm0);
754 movq_r2m(mm0, *(dataptr+8));
756 punpckhbw_r2r(mm2, mm1);
757 movq_r2m(mm1, *(dataptr+9));
761 movq_m2r(*idata2, mm0);
764 punpcklbw_r2r(mm2, mm0);
765 movq_r2m(mm0, *(dataptr+10));
767 punpckhbw_r2r(mm2, mm1);
768 movq_r2m(mm1, *(dataptr+11));
772 movq_m2r(*idata2, mm0);
775 punpcklbw_r2r(mm2, mm0);
776 movq_r2m(mm0, *(dataptr+12));
778 punpckhbw_r2r(mm2, mm1);
779 movq_r2m(mm1, *(dataptr+13));
783 movq_m2r(*idata2, mm0);
786 punpcklbw_r2r(mm2, mm0);
787 movq_r2m(mm0, *(dataptr+14));
789 punpckhbw_r2r(mm2, mm1);
790 movq_r2m(mm1, *(dataptr+15));
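// From here the punpck*wd/punpck*dq shuffles appear to transpose the 8x8 array
// of 16-bit words in place so the row-oriented butterfly code can be reused
// for the column pass of the DCT.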
794 movq_m2r(*(dataptr+9), mm7);
796 movq_m2r(*(dataptr+13), mm6);
799 punpcklwd_m2r(*(dataptr+11), mm7);
802 punpcklwd_m2r(*(dataptr+15), mm6);
805 movq_m2r(*(dataptr+11), mm3);
806 punpckldq_r2r(mm6, mm7);
808 movq_m2r(*(dataptr+15), mm0);
809 punpckhdq_r2r(mm6, mm1);
811 movq_r2m(mm7,*(dataptr+9));
812 punpckhwd_r2r(mm3, mm5);
814 movq_r2m(mm1,*(dataptr+11));
815 punpckhwd_r2r(mm0, mm2);
818 punpckldq_r2r(mm2, mm5);
820 movq_m2r(*(dataptr+1), mm0);
821 punpckhdq_r2r(mm2, mm1);
823 movq_r2m(mm5,*(dataptr+13));
827 movq_r2m(mm1, *(dataptr+15));
829 movq_m2r(*(dataptr+5), mm2);
832 punpcklwd_m2r(*(dataptr+3), mm0);
835 punpcklwd_m2r(*(dataptr+7), mm2);
839 movq_m2r(*(dataptr+8), mm1);
840 punpckldq_r2r(mm2, mm0);
842 movq_m2r(*(dataptr+12), mm3);
843 punpckhdq_r2r(mm2, mm4);
845 punpckhwd_m2r(*(dataptr+3), mm6);
848 punpckhwd_m2r(*(dataptr+7), mm7);
851 movq_r2m(mm0, *(dataptr+8));
852 punpckhdq_r2r(mm7, mm5);
854 punpcklwd_m2r(*(dataptr+10), mm1);
857 punpckhwd_m2r(*(dataptr+10), mm2);
859 movq_r2m(mm4, *(dataptr+10));
860 punpckldq_r2r(mm7, mm6);
862 punpcklwd_m2r(*(dataptr+14), mm3);
865 movq_r2m(mm6, *(dataptr+12));
866 punpckldq_r2r(mm3, mm1);
868 punpckhwd_m2r(*(dataptr+14), mm0);
871 movq_r2m(mm5, *(dataptr+14));
872 punpckhdq_r2r(mm3, mm4);
874 movq_r2m(mm1, *(dataptr+1));
875 punpckldq_r2r(mm0, mm2);
877 movq_r2m(mm4, *(dataptr+3));
878 punpckhdq_r2r(mm0, mm6);
880 movq_r2m(mm2, *(dataptr+5));
882 movq_m2r(*dataptr, mm0);
884 movq_r2m(mm6, *(dataptr+7));
889 movq_m2r(*(dataptr+4), mm7);
892 punpcklwd_m2r(*(dataptr+2), mm0);
895 punpcklwd_m2r(*(dataptr+6), mm7);
898 movq_m2r(*(dataptr+2), mm6);
899 punpckldq_r2r(mm7, mm0);
901 movq_m2r(*(dataptr+6), mm5);
902 punpckhdq_r2r(mm7, mm1);
905 punpckhwd_r2r(mm6, mm2);
907 psubw_m2r(*(dataptr+14), mm7);
910 paddw_m2r(*(dataptr+14), mm0);
911 punpckhwd_r2r(mm5, mm4);
913 paddw_m2r(*(dataptr+12), mm1);
916 psubw_m2r(*(dataptr+12), mm6);
917 punpckldq_r2r(mm4, mm2);
923 punpckhdq_r2r(mm4, mm3);
925 paddw_m2r(*(dataptr+10), mm2);
933 paddw_m2r(*(dataptr+8), mm3);
936 psubw_m2r(*(dataptr+8), mm4);
945 psubw_m2r(*(dataptr+10), mm5);
959 movq_r2m(mm0, *dataptr);
963 movq_r2m(mm3, *(dataptr+8));
972 movq_r2m(mm0, *(dataptr+4));
977 movq_r2m(mm7, *(dataptr+12));
989 pmulhw_r2r(mm0, mm4);
999 movq_m2r(*(dataptr+1), mm7);
1000 paddw_r2r(mm1, mm4);
1002 paddw_r2r(mm1, mm2);
1004 paddw_r2r(mm5, mm0);
1005 psubw_r2r(mm5, mm3);
1010 psubw_r2r(mm4, mm3);
1012 paddw_r2r(mm4, mm5);
1015 movq_r2m(mm3, *(dataptr+6));
1016 psubw_r2r(mm2, mm0);
1018 movq_r2m(mm5, *(dataptr+10));
1019 paddw_r2r(mm2, mm6);
1021 movq_r2m(mm0, *(dataptr+14));
1027 movq_m2r(*(dataptr+3), mm1);
1030 movq_r2m(mm6, *(dataptr+2));
1032 movq_m2r(*(dataptr+5), mm2);
1035 paddw_m2r(*(dataptr+15), mm0);
1037 movq_m2r(*(dataptr+7), mm3);
1040 psubw_m2r(*(dataptr+15), mm7);
1043 paddw_m2r(*(dataptr+13), mm1);
1045 movq_r2m(mm7, tmp7);
1048 psubw_m2r(*(dataptr+13), mm6);
1052 paddw_m2r(*(dataptr+9), mm3);
1054 movq_r2m(mm6, tmp6);
1057 paddw_m2r(*(dataptr+11), mm2);
1058 paddw_r2r(mm3, mm0);
1060 psubw_r2r(mm3, mm7);
1062 psubw_m2r(*(dataptr+9), mm4);
1063 psubw_r2r(mm2, mm6);
1065 paddw_r2r(mm2, mm1);
1067 psubw_m2r(*(dataptr+11), mm5);
1068 paddw_r2r(mm7, mm6);
1072 movq_m2r(tmp6, mm2);
1076 paddw_r2r(mm1, mm0);
1079 psubw_r2r(mm1, mm3);
1081 movq_r2m(mm0, *(dataptr+1));
1086 movq_r2m(mm3, *(dataptr+9));
1087 paddw_r2r(mm5, mm4);
1089 movq_m2r(tmp7, mm3);
1090 paddw_r2r(mm6, mm0);
1092 paddw_r2r(mm2, mm5);
1093 psubw_r2r(mm6, mm7);
1095 movq_r2m(mm0, *(dataptr+5));
1096 paddw_r2r(mm3, mm2);
1100 movq_r2m(mm7, *(dataptr+13));
1103 psubw_r2r(mm2, mm1);
1112 pmulhw_r2r(mm0, mm4);
1122 movq_m2r(*(dataptr+9), mm7);
1123 paddw_r2r(mm1, mm4);
1125 paddw_r2r(mm5, mm0);
1126 psubw_r2r(mm5, mm3);
1131 paddw_r2r(mm1, mm2);
1134 psubw_r2r(mm4, mm5);
1136 paddw_r2r(mm2, mm6);
1137 paddw_r2r(mm4, mm3);
1139 movq_r2m(mm5, *(dataptr+7));
1141 movq_r2m(mm6, *(dataptr+3));
1142 psubw_r2r(mm2, mm0);
1148 movq_m2r(*(dataptr+13), mm6);
1151 punpcklwd_r2r(mm3, mm7);
1154 punpcklwd_r2r(mm0, mm6);
1157 punpckldq_r2r(mm6, mm7);
1159 punpckhdq_r2r(mm6, mm1);
1161 movq_r2m(mm7, *(dataptr+9));
1162 punpckhwd_r2r(mm3, mm5);
1164 movq_r2m(mm1, *(dataptr+11));
1165 punpckhwd_r2r(mm0, mm2);
1168 punpckldq_r2r(mm2, mm5);
1170 movq_m2r(*(dataptr+1), mm0);
1171 punpckhdq_r2r(mm2, mm1);
1173 movq_r2m(mm5, *(dataptr+13));
1177 movq_r2m(mm1, *(dataptr+15));
1179 movq_m2r(*(dataptr+5), mm2);
1182 punpcklwd_m2r(*(dataptr+3), mm0);
1185 punpcklwd_m2r(*(dataptr+7), mm2);
1190 movq_m2r(*(dataptr+8), mm1);
1191 punpckldq_r2r(mm2, mm0);
1193 movq_m2r(*(dataptr+12), mm3);
1194 punpckhdq_r2r(mm2, mm4);
1196 punpckhwd_m2r(*(dataptr+3), mm6);
1199 punpckhwd_m2r(*(dataptr+7), mm7);
1202 movq_r2m(mm0, *(dataptr+8));
1203 punpckhdq_r2r(mm7, mm5);
1205 punpcklwd_m2r(*(dataptr+10), mm1);
1208 punpckhwd_m2r(*(dataptr+10), mm2);
1210 movq_r2m(mm4, *(dataptr+10));
1211 punpckldq_r2r(mm7, mm6);
1213 punpcklwd_m2r(*(dataptr+14), mm3);
1216 movq_r2m(mm6, *(dataptr+12));
1217 punpckldq_r2r(mm3, mm1);
1219 punpckhwd_m2r(*(dataptr+14), mm0);
1222 movq_r2m(mm5, *(dataptr+14));
1223 punpckhdq_r2r(mm3, mm4);
1225 movq_r2m(mm1, *(dataptr+1));
1226 punpckldq_r2r(mm0, mm2);
1228 movq_r2m(mm4, *(dataptr+3));
1229 punpckhdq_r2r(mm0, mm6);
1231 movq_r2m(mm2, *(dataptr+5));
1233 movq_m2r(*dataptr, mm0);
1235 movq_r2m(mm6, *(dataptr+7));
1239 movq_m2r(*(dataptr+4), mm7);
1242 punpcklwd_m2r(*(dataptr+2), mm0);
1245 punpcklwd_m2r(*(dataptr+6), mm7);
1248 movq_m2r(*(dataptr+2), mm6);
1249 punpckldq_r2r(mm7, mm0);
1251 movq_m2r(*(dataptr+6), mm5);
1252 punpckhdq_r2r(mm7, mm1);
1255 punpckhwd_r2r(mm6, mm2);
1257 psubw_m2r(*(dataptr+14), mm7);
1260 paddw_m2r(*(dataptr+14), mm0);
1261 punpckhwd_r2r(mm5, mm4);
1263 paddw_m2r(*(dataptr+12), mm1);
1266 psubw_m2r(*(dataptr+12), mm6);
1267 punpckldq_r2r(mm4, mm2);
1269 movq_r2m(mm7, tmp7);
1272 movq_r2m(mm6, tmp6);
1274 punpckhdq_r2r(mm4, mm3);
1276 paddw_m2r(*(dataptr+10), mm2);
1283 paddw_m2r(*(dataptr+8), mm3);
1286 psubw_m2r(*(dataptr+8), mm4);
1289 paddw_r2r(mm3, mm0);
1290 psubw_r2r(mm3, mm7);
1292 psubw_r2r(mm2, mm6);
1293 paddw_r2r(mm2, mm1);
1295 psubw_m2r(*(dataptr+10), mm5);
1296 paddw_r2r(mm7, mm6);
1300 movq_m2r(tmp6, mm2);
1304 paddw_r2r(mm1, mm0);
1307 psubw_r2r(mm1, mm3);
1309 movq_r2m(mm0, *dataptr);
1313 movq_r2m(mm3, *(dataptr+8));
1314 paddw_r2r(mm5, mm4);
1316 movq_m2r(tmp7, mm3);
1317 paddw_r2r(mm6, mm0);
1319 paddw_r2r(mm2, mm5);
1320 psubw_r2r(mm6, mm7);
1322 movq_r2m(mm0, *(dataptr+4));
1323 paddw_r2r(mm3, mm2);
1326 movq_r2m(mm7, *(dataptr+12));
1329 psubw_r2r(mm2, mm1);
1338 pmulhw_r2r(mm0, mm4);
1348 movq_m2r(*(dataptr+1), mm7);
1349 paddw_r2r(mm1, mm4);
1351 paddw_r2r(mm1, mm2);
1353 paddw_r2r(mm5, mm0);
1354 psubw_r2r(mm5, mm3);
1359 psubw_r2r(mm4, mm3);
1361 paddw_r2r(mm4, mm5);
1364 movq_r2m(mm3, *(dataptr+6));
1365 psubw_r2r(mm2, mm0);
1367 movq_r2m(mm5, *(dataptr+10));
1368 paddw_r2r(mm2, mm6);
1370 movq_r2m(mm0, *(dataptr+14));
1376 movq_m2r(*(dataptr+3), mm1);
1379 movq_r2m(mm6, *(dataptr+2));
1381 movq_m2r(*(dataptr+5), mm2);
1384 paddw_m2r(*(dataptr+15), mm0);
1386 movq_m2r(*(dataptr+7), mm3);
1389 psubw_m2r(*(dataptr+15), mm7);
1392 paddw_m2r(*(dataptr+13), mm1);
1394 movq_r2m(mm7, tmp7);
1397 psubw_m2r(*(dataptr+13), mm6);
1401 paddw_m2r(*(dataptr+9), mm3);
1403 movq_r2m(mm6, tmp6);
1406 paddw_m2r(*(dataptr+11), mm2);
1407 paddw_r2r(mm3, mm0);
1409 psubw_r2r(mm3, mm7);
1411 psubw_m2r(*(dataptr+9), mm4);
1412 psubw_r2r(mm2, mm6);
1414 paddw_r2r(mm2, mm1);
1416 psubw_m2r(*(dataptr+11), mm5);
1417 paddw_r2r(mm7, mm6);
1421 movq_m2r(tmp6, mm2);
1425 paddw_r2r(mm1, mm0);
1428 psubw_r2r(mm1, mm3);
1430 movq_r2m(mm0, *(dataptr+1));
1435 movq_r2m(mm3, *(dataptr+9));
1436 paddw_r2r(mm5, mm4);
1438 movq_m2r(tmp7, mm3);
1439 paddw_r2r(mm6, mm0);
1441 paddw_r2r(mm2, mm5);
1442 psubw_r2r(mm6, mm7);
1444 movq_r2m(mm0, *(dataptr+5));
1445 paddw_r2r(mm3, mm2);
1449 movq_r2m(mm7, *(dataptr+13));
1452 psubw_r2r(mm2, mm1);
1461 pmulhw_r2r(mm0, mm4);
1471 movq_m2r(*(dataptr+9), mm7);
1472 paddw_r2r(mm1, mm4);
1474 paddw_r2r(mm5, mm0);
1475 psubw_r2r(mm5, mm3);
1480 paddw_r2r(mm1, mm2);
1483 psubw_r2r(mm4, mm5);
1485 paddw_r2r(mm2, mm6);
1486 paddw_r2r(mm4, mm3);
1488 movq_r2m(mm5, *(dataptr+7));
1489 psubw_r2r(mm2, mm0);
1491 movq_r2m(mm3, *(dataptr+11));
1493 movq_r2m(mm6, *(dataptr+3));
1495 movq_r2m(mm0, *(dataptr+15));
1502 static constexpr int32_t FIX_1_082392200 { 277 };
1503 static constexpr int32_t FIX_1_414213562 { 362 };
1504 static constexpr int32_t FIX_1_847759065 { 473 };
1505 static constexpr int32_t FIX_2_613125930 { 669 };
1507 static constexpr int16_t DESCALE(int32_t x) { return static_cast<int16_t>((x+4) >> 3); };
1511 static inline int16_t RL(int32_t x) { return std::clamp(x, 16, 235); };
1512 static constexpr int32_t MULTIPLY(int32_t var, int32_t constant)
1513 { return ((var * constant) + 128) >> 8; };
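// Inverse-DCT fixed point mirrors the forward path: the FIX_* constants are
// AAN multipliers scaled by 2^8 (e.g. 362 ~= 1.41421356 * 256), MULTIPLY()
// performs the rounded Q8 product, DESCALE() drops the last three fraction
// bits with rounding, and RL() clamps reconstructed samples to the nominal
// video range 16..235.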
1518 for(int i = 0; i < 64; i++)
1529 static mmx_t s_fix141; s_fix141.q = 0x5a825a825a825a82LL;
1530 static mmx_t s_fix184n261; s_fix184n261.q = 0xcf04cf04cf04cf04LL;
1531 static mmx_t s_fix184; s_fix184.q = 0x7641764176417641LL;
1532 static mmx_t s_fixN184; s_fixN184.q = 0x896f896f896f896fLL;
1533 static mmx_t s_fix108n184; s_fix108n184.q = 0xcf04cf04cf04cf04LL;
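// These packed constants feed pmulhw, which keeps only the high 16 bits of
// each 16x16 product, so they are Q14/Q15-style values rather than the Q8
// constants used by the scalar path; 0x5a82 is 23170 ~= sqrt(2) * 2^14, and
// the others appear to encode the remaining AAN multipliers and differences.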
1535 auto *wsptr = (mmx_t *)m_ws.data();
1536 auto *dataptr = (mmx_t *)odata;
1537 auto *idata = (mmx_t *)data.data();
1546 movq_m2r(*(idata+10), mm1);
1548 movq_m2r(*(idata+6), mm0);
1550 movq_m2r(*(idata+2), mm3);
1554 movq_m2r(*(idata+14), mm4);
1556 paddw_r2r(mm0, mm1);
1558 psubw_r2r(mm0, mm2);
1563 pmulhw_m2r(s_fix184n261, mm2);
1566 pmulhw_m2r(s_fixN184, mm0);
1567 paddw_r2r(mm4, mm3);
1570 psubw_r2r(mm4, mm5);
1572 psubw_r2r(mm1, mm6);
1575 movq_m2r(*(idata+12), mm4);
1578 pmulhw_m2r(s_fix108n184, mm5);
1579 paddw_r2r(mm1, mm3);
1584 pmulhw_m2r(s_fix184, mm7);
1587 movq_m2r(*(idata+4), mm1);
1589 paddw_r2r(mm5, mm0);
1591 paddw_r2r(mm7, mm2);
1593 pmulhw_m2r(s_fix141, mm6);
1594 psubw_r2r(mm3, mm2);
1597 paddw_r2r(mm4, mm1);
1599 psubw_r2r(mm4, mm5);
1600 psubw_r2r(mm2, mm6);
1602 movq_r2m(mm1, *(wsptr));
1605 movq_m2r(*(idata), mm7);
1607 pmulhw_m2r(s_fix141, mm5);
1608 paddw_r2r(mm6, mm0);
1610 movq_m2r(*(idata+8), mm4);
1612 psubw_r2r(mm1, mm5);
1614 movq_r2m(mm0, *(wsptr+4));
1617 movq_r2m(mm5, *(wsptr+2));
1618 psubw_r2r(mm4, mm1);
1620 paddw_r2r(mm4, mm7);
1623 paddw_m2r(*(wsptr+2), mm1);
1626 paddw_m2r(*(wsptr), mm7);
1628 psubw_m2r(*(wsptr), mm4);
1631 psubw_m2r(*(wsptr+2), mm5);
1632 paddw_r2r(mm3, mm7);
1634 psubw_r2r(mm3, mm0);
1636 movq_r2m(mm7, *(wsptr));
1639 movq_r2m(mm0, *(wsptr+14));
1640 paddw_r2r(mm2, mm1);
1642 psubw_r2r(mm2, mm3);
1644 movq_r2m(mm1, *(wsptr+2));
1647 movq_r2m(mm3, *(wsptr+12));
1649 paddw_m2r(*(wsptr+4), mm4);
1651 psubw_m2r(*(wsptr+4), mm1);
1653 movq_r2m(mm4, *(wsptr+8));
1656 paddw_r2r(mm6, mm5);
1658 movq_r2m(mm1, *(wsptr+6));
1659 psubw_r2r(mm6, mm7);
1661 movq_r2m(mm5, *(wsptr+4));
1663 movq_r2m(mm7, *(wsptr+10));
1675 movq_m2r(*(idata+10), mm1);
1677 movq_m2r(*(idata+6), mm0);
1679 movq_m2r(*(idata+2), mm3);
1682 movq_m2r(*(idata+14), mm4);
1683 paddw_r2r(mm0, mm1);
1685 psubw_r2r(mm0, mm2);
1690 pmulhw_m2r(s_fix184n261, mm2);
1693 pmulhw_m2r(s_fixN184, mm0);
1694 paddw_r2r(mm4, mm3);
1697 psubw_r2r(mm4, mm5);
1699 psubw_r2r(mm1, mm6);
1702 movq_m2r(*(idata+12), mm4);
1705 pmulhw_m2r(s_fix108n184, mm5);
1706 paddw_r2r(mm1, mm3);
1711 pmulhw_m2r(s_fix184, mm7);
1714 movq_m2r(*(idata+4), mm1);
1716 paddw_r2r(mm5, mm0);
1718 paddw_r2r(mm7, mm2);
1720 pmulhw_m2r(s_fix141, mm6);
1721 psubw_r2r(mm3, mm2);
1724 paddw_r2r(mm4, mm1);
1726 psubw_r2r(mm4, mm5);
1727 psubw_r2r(mm2, mm6);
1729 movq_r2m(mm1, *(wsptr));
1732 movq_m2r(*(idata), mm7);
1733 paddw_r2r(mm6, mm0);
1735 pmulhw_m2r(s_fix141, mm5);
1737 movq_m2r(*(idata+8), mm4);
1739 psubw_r2r(mm1, mm5);
1741 movq_r2m(mm0, *(wsptr+4));
1744 movq_r2m(mm5, *(wsptr+2));
1745 psubw_r2r(mm4, mm1);
1747 paddw_r2r(mm4, mm7);
1750 paddw_m2r(*(wsptr+2), mm1);
1753 paddw_m2r(*(wsptr), mm7);
1755 psubw_m2r(*(wsptr), mm4);
1758 psubw_m2r(*(wsptr+2), mm5);
1759 paddw_r2r(mm3, mm7);
1761 psubw_r2r(mm3, mm0);
1763 movq_r2m(mm7, *(wsptr));
1766 movq_r2m(mm0, *(wsptr+14));
1767 paddw_r2r(mm2, mm1);
1769 psubw_r2r(mm2, mm3);
1771 movq_r2m(mm1, *(wsptr+2));
1774 movq_r2m(mm3, *(wsptr+12));
1776 paddw_m2r(*(wsptr+4), mm4);
1778 psubw_m2r(*(wsptr+4), mm1);
1780 movq_r2m(mm4, *(wsptr+8));
1783 paddw_r2r(mm6, mm5);
1785 movq_r2m(mm1, *(wsptr+6));
1786 psubw_r2r(mm6, mm7);
1788 movq_r2m(mm5, *(wsptr+4));
1790 movq_r2m(mm7, *(wsptr+10));
1807 movq_m2r(*(wsptr), mm0);
1809 movq_m2r(*(wsptr+1), mm1);
1812 movq_m2r(*(wsptr+2), mm3);
1813 paddw_r2r(mm1, mm0);
1815 movq_m2r(*(wsptr+3), mm4);
1816 psubw_r2r(mm1, mm2);
1821 paddw_r2r(mm4, mm3);
1824 psubw_r2r(mm4, mm5);
1825 punpcklwd_r2r(mm3, mm0);
1827 movq_m2r(*(wsptr+7), mm7);
1828 punpckhwd_r2r(mm3, mm6);
1830 movq_m2r(*(wsptr+4), mm3);
1831 punpckldq_r2r(mm6, mm0);
1833 punpcklwd_r2r(mm5, mm1);
1836 movq_m2r(*(wsptr+6), mm6);
1837 punpckhwd_r2r(mm5, mm2);
1839 movq_m2r(*(wsptr+5), mm5);
1840 punpckldq_r2r(mm2, mm1);
1843 paddw_r2r(mm5, mm3);
1846 psubw_r2r(mm5, mm4);
1847 paddw_r2r(mm7, mm6);
1850 punpcklwd_r2r(mm6, mm3);
1852 psubw_r2r(mm7, mm2);
1853 punpckhwd_r2r(mm6, mm5);
1856 punpckldq_r2r(mm5, mm3);
1858 punpcklwd_r2r(mm2, mm4);
1860 punpckhwd_r2r(mm2, mm7);
1862 punpckldq_r2r(mm7, mm4);
1872 punpckhdq_r2r(mm4, mm6);
1874 punpckldq_r2r(mm4, mm1);
1877 pmulhw_m2r(s_fix141, mm6);
1878 punpckldq_r2r(mm3, mm0);
1880 punpckhdq_r2r(mm3, mm2);
1885 paddw_r2r(mm2, mm0);
1886 psubw_r2r(mm2, mm7);
1889 psubw_r2r(mm2, mm6);
1902 movq_m2r(*(wsptr), mm3);
1903 paddw_r2r(mm6, mm1);
1905 movq_m2r(*(wsptr+1), mm4);
1906 psubw_r2r(mm6, mm5);
1909 punpckldq_r2r(mm4, mm3);
1911 punpckhdq_r2r(mm6, mm4);
1915 movq_r2m(mm0, *(wsptr));
1916 paddw_r2r(mm4, mm2);
1920 movq_m2r(*(wsptr+2), mm6);
1921 psubw_r2r(mm4, mm3);
1923 movq_m2r(*(wsptr+3), mm0);
1926 movq_r2m(mm1, *(wsptr+1));
1927 punpckldq_r2r(mm0, mm6);
1929 punpckhdq_r2r(mm4, mm0);
1933 paddw_r2r(mm0, mm6);
1937 movq_r2m(mm5, *(wsptr+2));
1938 punpcklwd_r2r(mm6, mm2);
1940 psubw_r2r(mm0, mm1);
1941 punpckhwd_r2r(mm6, mm4);
1944 punpcklwd_r2r(mm1, mm3);
1946 movq_r2m(mm7, *(wsptr+3));
1947 punpckhwd_r2r(mm1, mm0);
1949 movq_m2r(*(wsptr+4), mm6);
1950 punpckhdq_r2r(mm2, mm0);
1952 movq_m2r(*(wsptr+5), mm7);
1953 punpckhdq_r2r(mm4, mm3);
1955 movq_m2r(*(wsptr+6), mm1);
1958 punpckldq_r2r(mm7, mm6);
1961 punpckhdq_r2r(mm4, mm7);
1964 movq_m2r(*(wsptr+7), mm4);
1965 paddw_r2r(mm7, mm6);
1967 psubw_r2r(mm7, mm2);
1968 punpckldq_r2r(mm4, mm1);
1970 punpckhdq_r2r(mm5, mm4);
1973 paddw_r2r(mm4, mm1);
1974 psubw_r2r(mm4, mm7);
1977 punpcklwd_r2r(mm1, mm6);
1979 punpckhwd_r2r(mm1, mm5);
1982 punpcklwd_r2r(mm7, mm2);
1984 punpckhwd_r2r(mm7, mm4);
1986 punpckhdq_r2r(mm6, mm4);
1988 punpckhdq_r2r(mm5, mm2);
1991 punpckldq_r2r(mm4, mm0);
1993 punpckhdq_r2r(mm4, mm5);
1996 punpckhdq_r2r(mm2, mm4);
1999 punpckldq_r2r(mm2, mm3);
2002 psubw_r2r(mm4, mm1);
2004 paddw_r2r(mm4, mm5);
2010 pmulhw_m2r(s_fix141, mm1);
2016 pmulhw_m2r(s_fixN184, mm7);
2019 movq_m2r(*(wsptr), mm2);
2021 pmulhw_m2r(s_fix108n184, mm6);
2026 pmulhw_m2r(s_fix184n261, mm0);
2027 paddw_r2r(mm5, mm2);
2029 pmulhw_m2r(s_fix184, mm3);
2030 psubw_r2r(mm5, mm4);
2035 paddw_r2r(mm6, mm7);
2038 paddw_r2r(mm0, mm3);
2041 psubw_r2r(mm5, mm3);
2044 movq_m2r(*(wsptr+1), mm0);
2045 psubw_r2r(mm3, mm1);
2048 paddw_r2r(mm3, mm0);
2063 psubw_r2r(mm3, mm6);
2068 packuswb_r2r(mm4, mm0);
2070 movq_m2r(*(wsptr+2), mm5);
2071 packuswb_r2r(mm6, mm2);
2077 paddw_r2r(mm1, mm7);
2080 paddw_r2r(mm1, mm5);
2081 psubw_r2r(mm1, mm3);
2085 movq_m2r(*(wsptr+3), mm4);
2095 paddw_r2r(mm7, mm4);
2097 psubw_r2r(mm7, mm6);
2104 packuswb_r2r(mm4, mm5);
2106 packuswb_r2r(mm3, mm6);
2110 punpcklbw_r2r(mm0, mm2);
2112 punpckhbw_r2r(mm0, mm4);
2115 punpcklbw_r2r(mm6, mm5);
2119 punpckhbw_r2r(mm6, mm7);
2121 punpcklwd_r2r(mm5, mm2);
2126 punpckhwd_r2r(mm5, mm1);
2129 punpcklwd_r2r(mm4, mm6);
2133 punpckldq_r2r(mm6, mm2);
2141 punpckhwd_r2r(mm4, mm7);
2143 movq_r2m(mm2, *(dataptr));
2145 punpckhdq_r2r(mm6, mm0);
2148 movq_r2m(mm0, *(dataptr));
2150 punpckldq_r2r(mm7, mm1);
2151 punpckhdq_r2r(mm7, mm3);
2154 movq_r2m(mm1, *(dataptr));
2157 movq_r2m(mm3, *(dataptr));
2169 movq_m2r(*(wsptr), mm0);
2171 movq_m2r(*(wsptr+1), mm1);
2174 movq_m2r(*(wsptr+2), mm3);
2175 paddw_r2r(mm1, mm0);
2177 movq_m2r(*(wsptr+3), mm4);
2178 psubw_r2r(mm1, mm2);
2183 paddw_r2r(mm4, mm3);
2186 psubw_r2r(mm4, mm5);
2187 punpcklwd_r2r(mm3, mm0);
2189 movq_m2r(*(wsptr+7), mm7);
2190 punpckhwd_r2r(mm3, mm6);
2192 movq_m2r(*(wsptr+4), mm3);
2193 punpckldq_r2r(mm6, mm0);
2195 punpcklwd_r2r(mm5, mm1);
2198 movq_m2r(*(wsptr+6), mm6);
2199 punpckhwd_r2r(mm5, mm2);
2201 movq_m2r(*(wsptr+5), mm5);
2202 punpckldq_r2r(mm2, mm1);
2204 paddw_r2r(mm5, mm3);
2207 psubw_r2r(mm5, mm4);
2208 paddw_r2r(mm7, mm6);
2211 punpcklwd_r2r(mm6, mm3);
2213 psubw_r2r(mm7, mm2);
2214 punpckhwd_r2r(mm6, mm5);
2217 punpckldq_r2r(mm5, mm3);
2219 punpcklwd_r2r(mm2, mm4);
2221 punpckhwd_r2r(mm2, mm7);
2223 punpckldq_r2r(mm7, mm4);
2232 punpckhdq_r2r(mm4, mm6);
2234 punpckldq_r2r(mm4, mm1);
2237 pmulhw_m2r(s_fix141, mm6);
2238 punpckldq_r2r(mm3, mm0);
2240 punpckhdq_r2r(mm3, mm2);
2245 paddw_r2r(mm2, mm0);
2246 psubw_r2r(mm2, mm7);
2249 psubw_r2r(mm2, mm6);
2263 movq_m2r(*(wsptr), mm3);
2264 paddw_r2r(mm6, mm1);
2266 movq_m2r(*(wsptr+1), mm4);
2267 psubw_r2r(mm6, mm5);
2270 punpckldq_r2r(mm4, mm3);
2272 punpckhdq_r2r(mm6, mm4);
2276 movq_r2m(mm0, *(wsptr));
2277 paddw_r2r(mm4, mm2);
2281 movq_m2r(*(wsptr+2), mm6);
2282 psubw_r2r(mm4, mm3);
2284 movq_m2r(*(wsptr+3), mm0);
2287 movq_r2m(mm1, *(wsptr+1));
2288 punpckldq_r2r(mm0, mm6);
2290 punpckhdq_r2r(mm4, mm0);
2294 paddw_r2r(mm0, mm6);
2298 movq_r2m(mm5, *(wsptr+2));
2299 punpcklwd_r2r(mm6, mm2);
2301 psubw_r2r(mm0, mm1);
2302 punpckhwd_r2r(mm6, mm4);
2305 punpcklwd_r2r(mm1, mm3);
2307 movq_r2m(mm7, *(wsptr+3));
2308 punpckhwd_r2r(mm1, mm0);
2310 movq_m2r(*(wsptr+4), mm6);
2311 punpckhdq_r2r(mm2, mm0);
2313 movq_m2r(*(wsptr+5), mm7);
2314 punpckhdq_r2r(mm4, mm3);
2316 movq_m2r(*(wsptr+6), mm1);
2319 punpckldq_r2r(mm7, mm6);
2322 punpckhdq_r2r(mm4, mm7);
2325 movq_m2r(*(wsptr+7), mm4);
2326 paddw_r2r(mm7, mm6);
2328 psubw_r2r(mm7, mm2);
2329 punpckldq_r2r(mm4, mm1);
2331 punpckhdq_r2r(mm5, mm4);
2334 paddw_r2r(mm4, mm1);
2335 psubw_r2r(mm4, mm7);
2338 punpcklwd_r2r(mm1, mm6);
2340 punpckhwd_r2r(mm1, mm5);
2343 punpcklwd_r2r(mm7, mm2);
2345 punpckhwd_r2r(mm7, mm4);
2347 punpckhdq_r2r(mm6, mm4);
2349 punpckhdq_r2r(mm5, mm2);
2352 punpckldq_r2r(mm4, mm0);
2354 punpckhdq_r2r(mm4, mm5);
2357 punpckhdq_r2r(mm2, mm4);
2360 punpckldq_r2r(mm2, mm3);
2363 psubw_r2r(mm4, mm1);
2365 paddw_r2r(mm4, mm5);
2371 pmulhw_m2r(s_fix141, mm1);
2377 pmulhw_m2r(s_fixN184, mm7);
2380 movq_m2r(*(wsptr), mm2);
2382 pmulhw_m2r(s_fix108n184, mm6);
2387 pmulhw_m2r(s_fix184n261, mm0);
2388 paddw_r2r(mm5, mm2);
2390 pmulhw_m2r(s_fix184, mm3);
2391 psubw_r2r(mm5, mm4);
2396 paddw_r2r(mm6, mm7);
2399 paddw_r2r(mm0, mm3);
2402 psubw_r2r(mm5, mm3);
2405 movq_m2r(*(wsptr+1), mm0);
2406 psubw_r2r(mm3, mm1);
2409 paddw_r2r(mm3, mm0);
2423 psubw_r2r(mm3, mm6);
2428 packuswb_r2r(mm4, mm0);
2430 movq_m2r(*(wsptr+2), mm5);
2431 packuswb_r2r(mm6, mm2);
2437 paddw_r2r(mm1, mm7);
2440 paddw_r2r(mm1, mm5);
2441 psubw_r2r(mm1, mm3);
2445 movq_m2r(*(wsptr+3), mm4);
2455 paddw_r2r(mm7, mm4);
2457 psubw_r2r(mm7, mm6);
2470 packuswb_r2r(mm4, mm5);
2472 packuswb_r2r(mm3, mm6);
2476 punpcklbw_r2r(mm0, mm2);
2478 punpckhbw_r2r(mm0, mm4);
2481 punpcklbw_r2r(mm6, mm5);
2483 punpckhbw_r2r(mm6, mm7);
2485 punpcklwd_r2r(mm5, mm2);
2488 punpckhwd_r2r(mm5, mm1);
2491 punpcklwd_r2r(mm4, mm6);
2493 punpckldq_r2r(mm6, mm2);
2497 punpckhwd_r2r(mm4, mm7);
2500 movq_r2m(mm2, *(dataptr));
2502 punpckhdq_r2r(mm6, mm0);
2505 movq_r2m(mm0, *(dataptr));
2507 punpckldq_r2r(mm7, mm1);
2509 punpckhdq_r2r(mm7, mm3);
2512 movq_r2m(mm1, *(dataptr));
2515 movq_r2m(mm3, *(dataptr));
2518 int32_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2519 int32_t tmp10, tmp11, tmp12, tmp13;
2520 int32_t z5, z10, z11, z12, z13;
2527 inptr = data.data();
2528 wsptr = m_ws.data();
2529 for (ctr = 8; ctr > 0; ctr--) {
2531 if ((inptr[8] | inptr[16] | inptr[24] |
2532 inptr[32] | inptr[40] | inptr[48] | inptr[56]) == 0) {
2553 tmp10 = tmp0 + tmp2;
2554 tmp11 = tmp0 - tmp2;
2556 tmp13 = tmp1 + tmp3;
2557 tmp12 = MULTIPLY(tmp1 - tmp3, FIX_1_414213562) - tmp13;
2559 tmp0 = tmp10 + tmp13;
2560 tmp3 = tmp10 - tmp13;
2561 tmp1 = tmp11 + tmp12;
2562 tmp2 = tmp11 - tmp12;
2575 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2577 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2578 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2579 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2581 tmp6 = tmp12 - tmp7;
2582 tmp5 = tmp11 - tmp6;
2583 tmp4 = tmp10 + tmp5;
2585 wsptr[0] = (int32_t) (tmp0 + tmp7);
2586 wsptr[56] = (int32_t) (tmp0 - tmp7);
2587 wsptr[8] = (int32_t) (tmp1 + tmp6);
2588 wsptr[48] = (int32_t) (tmp1 - tmp6);
2589 wsptr[16] = (int32_t) (tmp2 + tmp5);
2590 wsptr[40] = (int32_t) (tmp2 - tmp5);
2591 wsptr[32] = (int32_t) (tmp3 + tmp4);
2592 wsptr[24] = (int32_t) (tmp3 - tmp4);
2598 wsptr = m_ws.data();
2599 for (ctr = 0; ctr < 8; ctr++) {
2600 outptr = &(odata[ctr*rskip]);
2602 tmp10 = wsptr[0] + wsptr[4];
2603 tmp11 = wsptr[0] - wsptr[4];
2605 tmp13 = wsptr[2] + wsptr[6];
2606 tmp12 = MULTIPLY(wsptr[2] - wsptr[6], FIX_1_414213562) - tmp13;
2608 tmp0 = tmp10 + tmp13;
2609 tmp3 = tmp10 - tmp13;
2610 tmp1 = tmp11 + tmp12;
2611 tmp2 = tmp11 - tmp12;
2613 z13 = wsptr[5] + wsptr[3];
2614 z10 = wsptr[5] - wsptr[3];
2615 z11 = wsptr[1] + wsptr[7];
2616 z12 = wsptr[1] - wsptr[7];
2619 tmp11 = MULTIPLY(z11 - z13, FIX_1_414213562);
2621 z5 = MULTIPLY(z10 + z12, FIX_1_847759065);
2622 tmp10 = MULTIPLY(z12, FIX_1_082392200) - z5;
2623 tmp12 = MULTIPLY(z10, - FIX_2_613125930) + z5;
2625 tmp6 = tmp12 - tmp7;
2626 tmp5 = tmp11 - tmp6;
2627 tmp4 = tmp10 + tmp5;
2629 outptr[0] = RL(DESCALE(tmp0 + tmp7));
2630 outptr[7] = RL(DESCALE(tmp0 - tmp7));
2631 outptr[1] = RL(DESCALE(tmp1 + tmp6));
2632 outptr[6] = RL(DESCALE(tmp1 - tmp6));
2633 outptr[2] = RL(DESCALE(tmp2 + tmp5));
2634 outptr[5] = RL(DESCALE(tmp2 - tmp5));
2635 outptr[4] = RL(DESCALE(tmp3 + tmp4));
2636 outptr[3] = RL(DESCALE(tmp3 - tmp4));
2645 uint64_t qual = (uint64_t)m_q << (32 - 7);
2647 for(int i = 0; i < 64; i++)
2696 if ((*w < 0) || (*w > 65535))
2698 if ((*h < 0) || (*h > 65535))
2714 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2731 m_lMask.uq = (((uint64_t)(*lm)<<48)|((uint64_t)(*lm)<<32)|((uint64_t)(*lm)<<16)|(uint64_t)(*lm));
2732 m_cMask.uq = (((uint64_t)(*cm)<<48)|((uint64_t)(*cm)<<32)|((uint64_t)(*cm)<<16)|(uint64_t)(*cm));
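// The 16-bit update threshold is replicated into all four words of an mmx_t
// here, presumably so the block-comparison loop further down can test four
// 16-bit differences per pcmpgtw against a single mask register.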
2742 fprintf(stderr, "RTjpeg: Could not allocate memory\n");
2770 uint8_t * bp = planes[0];
2771 uint8_t * bp1 = bp + (m_width<<3);
2772 uint8_t * bp2 = planes[1];
2773 uint8_t * bp3 = planes[2];
2780 for(int i = m_height >> 1; i; i -= 8)
2782 for(int j = 0, k = 0; j < m_width; j += 16, k += 8)
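// Loop geometry, as far as it can be read from the strides above: the frame is
// walked in 16x16 macroblocks, bp/bp1 point at the upper and lower 8-row luma
// strips (bp1 = bp + 8*m_width), and bp2/bp3 advance through the
// half-resolution chroma planes at k = j/2, consistent with 4:2:0 input.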
2821 uint8_t * bp = planes[0];
2822 uint8_t * bp2 = planes[1];
2823 uint8_t * bp3 = planes[2];
2832 for(int j=0, k=0; j<m_width; j+=16, k+=8)
2864 int8_t * sb = nullptr;
2865 uint8_t * bp = planes[0];
2892 uint8_t * bp = planes[0];
2893 uint8_t * bp2 = planes[1];
2894 uint8_t * bp3 = planes[2];
2903 for(int k=0, j=0; j<m_width; j+=16, k+=8) {
2940 uint8_t * bp = planes[0];
2941 uint8_t * bp1 = bp + (m_width<<3);
2942 uint8_t * bp2 = planes[1];
2943 uint8_t * bp3 = planes[2];
2952 for(int k=0, j=0; j<m_width; j+=16, k+=8) {
3002 uint8_t * bp = planes[0];
3028 auto *mold=(mmx_t *)_old;
3029 auto *mblock=(mmx_t *)rblock.data();
3030 volatile mmx_t result {};
3031 static mmx_t s_neg= { 0xffffffffffffffffULL };
3033 movq_m2r(*mask, mm7);
3034 movq_m2r(s_neg, mm6);
3037 for(int i=0; i<8; i++)
3039 movq_m2r(*(mblock++), mm0);
3040 movq_m2r(*(mblock++), mm2);
3041 movq_m2r(*(mold++), mm1);
3042 movq_m2r(*(mold++), mm3);
3043 psubsw_r2r(mm1, mm0);
3044 psubsw_r2r(mm3, mm2);
3047 pcmpgtw_r2r(mm7, mm0);
3048 pcmpgtw_r2r(mm7, mm2);
3051 pcmpgtw_r2r(mm7, mm1);
3052 pcmpgtw_r2r(mm7, mm3);
3058 movq_r2m(mm5, result);
3062 std::copy(rblock.cbegin(), rblock.cend(), _old);
3071 for(int i=0; i<64; i++)
3072 if (abs(_old[i]-rblock[i])>*mask)
3074 std::copy(rblock.cbegin(), rblock.cend(), _old);
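// Both the MMX and scalar variants appear to implement the same conditional
// update: if every coefficient of the new block is within *mask of the
// previously stored block the block can be skipped, otherwise the reference
// copy in _old is refreshed via std::copy before it is re-encoded.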
3083 uint8_t * bp = planes[0];
3084 uint8_t * bp1 = bp + (m_width<<3);
3085 uint8_t * bp2 = planes[1];
3086 uint8_t * bp3 = planes[2];
3088 int16_t * lblock = m_old;
3093 for(int j=0, k=0; j < m_width; j+=16, k+=8)
3099 *((uint8_t *)sp++)=255;
3111 *((uint8_t *)sp++)=255;
3123 *((uint8_t *)sp++)=255;
3135 *((uint8_t *)sp++)=255;
3147 *((uint8_t *)sp++)=255;
3159 *((uint8_t *)sp++)=255;
3181 uint8_t * bp = planes[0];
3182 uint8_t * bp2 = planes[1];
3183 uint8_t * bp3 = planes[2];
3185 int16_t *lblock = m_old;
3189 for(int j=0, k=0; j<m_width; j+=16, k+=8)
3195 *((uint8_t *)sp++)=255;
3207 *((uint8_t *)sp++)=255;
3219 *((uint8_t *)sp++)=255;
3231 *((uint8_t *)sp++)=255;
3252 uint8_t * bp = planes[0];
3254 int16_t *lblock = m_old;
3264 *((uint8_t *)sp++)=255;
3313 fh->framesize = qToLittleEndian<qint32>(ds);
3316 fh->width = qToLittleEndian<qint16>(m_width);
3317 fh->height = qToLittleEndian<qint16>(m_height);
3326 if ((qFromLittleEndian<qint16>(fh->width) != m_width)||
3327     (qFromLittleEndian<qint16>(fh->height) != m_height))
3329 int w = qFromLittleEndian<qint16>(fh->width);
3330 int h = qFromLittleEndian<qint16>(fh->height);
3333 if (fh->quality != m_q)
3335 int q = fh->quality;