MythTV master
zoom_filter_xmmx.cpp
Go to the documentation of this file.
2
3#if defined(MMX) && !(defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64))
4/* Define this to get exactly the same result as the C function
5 * (a tiny bit slower).
6 */
7#define STRICT_COMPAT
8//#define HAVE_ATHLON
9
10#define BUFFPOINTNB 16
11#define BUFFPOINTMASK 0xffff
12#define BUFFINCR 0xff
13
14#define sqrtperte 16
15// to compute a % sqrtperte, use a & PERTEMASK
16#define PERTEMASK 0xf
17// to compute a / sqrtperte, use a >> PERTEDEC
18#define PERTEDEC 4
19
20
21//#define MMX_TRACE
22#include "mmx.h"
23extern "C" {
24#include "libavutil/cpu.h"
25}
26
/* Masks libavutil's CPU flags with AV_CPU_FLAG_SSE and shifts the result right
 * by 3, so the return value is non-zero only when that flag is reported.
 * NOTE(review): presumably the shift moves the flag bit down to bit 0 (giving
 * 0 or 1) - confirm against the value of AV_CPU_FLAG_SSE in libavutil/cpu.h. */
28 return (av_get_cpu_flags() & AV_CPU_FLAG_SSE) >> 3;
29}
30
32 unsigned int *expix1, unsigned int *expix2,
33 const int *lbruS, const int *lbruD, int buffratio,
35{
 /*
  * MMX implementation of the zoom filter. For every index in
  * [0, prevX * prevY) it:
  *   1. blends the position pairs stored in lbruS and lbruD by buffratio,
  *   2. clips the blended position against prevXY,
  *   3. fetches four packed 8-bit coefficients from precalCoef using the low
  *      4 bits of each coordinate,
  *   4. reads a 2x2 pixel neighbourhood from expix1 and writes the
  *      coefficient-weighted sum (shifted right by 8) to expix2[loop].
  * mm6 and mm7 are set up once before the loop and reused by every iteration.
  */
36 int bufsize = prevX * prevY; /* buffer size: number of loop iterations / pixels written to expix2 */
37 volatile int loop; /* loop counter */
38
39 mmx_t *brutS = (mmx_t*)lbruS; /* source transformation buffer, viewed as 64-bit mmx_t elements */
40 mmx_t *brutD = (mmx_t*)lbruD; /* destination transformation buffer, viewed as 64-bit mmx_t elements */
41
42 volatile mmx_t prevXY;
43 volatile mmx_t ratiox;
44 //volatile mmx_t interpix;
45
47
 /* Clip limits: (prevX-1) and (prevY-1) shifted left by PERTEDEC bits,
  * compared against the blended position inside the loop. */
48 prevXY.ud[0] = (prevX-1)<<PERTEDEC;
49 prevXY.ud[1] = (prevY-1)<<PERTEDEC;
50
 /* buffratio duplicated into both dwords, loaded into mm6 and shifted left by 16. */
51 ratiox.d[0] = buffratio;
52 ratiox.d[1] = buffratio;
53 movq_m2r (ratiox, mm6);
54 pslld_i2r (16,mm6);
55
56 pxor_r2r (mm7,mm7); /* clear mm7 (used below as the zero operand when unpacking bytes to words) */
57
58 loop=0;
59
60 /*
61 * NOTE : mm6 and mm7 are not modified inside the loop.
62 */
63 while (loop < bufsize)
64 {
65 /*
66 * pre : mm6 = [buffratio<<16|buffratio<<16]
67 * post : mm0 = S + ((D-S)*buffratio)>>16 format [X|Y]
68 * modified = mm0,mm1,mm2
69 */
70
71 __asm__ __volatile__ (
72 "movq %0,%%mm0\n"
73 "movq %1,%%mm1\n"
74 : :"m"(brutS[loop]),"m"(brutD[loop])
75 ); /* mm0 = S (brutS[loop]), mm1 = D (brutD[loop]) */
76
77 psubd_r2r (mm0,mm1); /* mm1 = D - S */
78 movq_r2r (mm1, mm2); /* mm2 = D - S */
79
80 pslld_i2r (16,mm1);
81 mmx_r2r (pmulhuw, mm6, mm1); /* mm1 = ?? (left unexplained by the original author) */
82 pmullw_r2r (mm6, mm2);
83
84 paddd_r2r (mm2, mm1); /* mm1 = (D - S) * buffratio >> 16 */
85 pslld_i2r (16,mm0);
86
87 paddd_r2r (mm1, mm0); /* mm0 = S + mm1 */
88 psrld_i2r (16, mm0);
89
90 /*
91 * pre : mm0 : position vector on screen
92 * prevXY : coordinate of the lower-right point on screen
93 * post : clipped mm0
94 * modified : mm0,mm1,mm2
95 */
96 movq_m2r (prevXY,mm1);
97 pcmpgtd_r2r (mm0, mm1); /* comparison mask, per component (X shown):
98 all ones if prevXY > px
99 all zeros if prevXY <= px
100 (same for y)
 NOTE(review): the original comment said the mask is in mm0; the pand
 below (mm0 &= mm1) suggests it actually lands in mm1 - confirm against
 the operand order defined in mmx.h. */
101#ifdef STRICT_COMPAT
 /* Replicate the high half of the mask into mm2 and the low half into mm1,
  * then AND both into mm0: if either component is out of range, both
  * components of the position are zeroed. */
102 movq_r2r (mm1,mm2);
103 punpckhdq_r2r (mm2,mm2);
104 punpckldq_r2r (mm1,mm1);
105 pand_r2r (mm2, mm0);
106#endif
107 pand_r2r (mm1, mm0); /* zero out the part that is out of bounds */
108
109 /*
110 * pre : mm0 : clipped position on screen
111 *
112 * post : mm3 & mm4 : coefs for this position
113 * mm1 : X vector [0|X]
114 *
115 * modif : eax,ecx
 *
 * The code below computes ecx = (low dword of mm0 & 15) * 64 + %0 and
 * eax = (high dword of mm0 & 15), then loads 32 bits from ecx + eax*4
 * into mm3.
 * NOTE(review): the low dword is the one compared against prevX above,
 * so the high dword copied into mm1 appears to be Y, not X as the
 * original note says - confirm.
 * NOTE(review): with the "m" constraint %0 is a memory operand, so addl
 * adds the value stored at precalCoef rather than its address - confirm
 * that this yields the table base as intended.
116 */
117 __asm__ __volatile__ (
118 "movd %%mm0,%%ecx\n"
119 "movq %%mm0,%%mm1\n"
120
121 "andl $15,%%ecx\n"
122 "psrlq $32,%%mm1\n"
123
124 "shll $6,%%ecx\n"
125 "movd %%mm1,%%eax\n"
126
127 "addl %0,%%ecx\n"
128 "andl $15,%%eax\n"
129
130 "movd (%%ecx,%%eax,4),%%mm3\n"
131 /* ::"X"(precalCoef):"eax","ecx"); */
132 ::"m"(precalCoef):"eax","ecx");
133
134
135 /*
136 * coefficient extraction...
137 *
138 * pre : coefficients packed in mm3
139 *
140 * post : coefficients expanded into mm3 (c1 & c2)
141 * and mm4 (c3 & c4)
142 *
143 * modif : mm5
144 */
145// interleaved with the section that follows (see the '^' markers)
146// movq_r2r (mm3, mm5); /* ??-??-??-??-c4-c3-c2-c1 */
147// punpcklbw_r2r (mm5, mm3); /* c4-c4-c3-c3-c2-c2-c1-c1 */
148// movq_r2r (mm3, mm4); /* c4-c4-c3-c3-c2-c2-c1-c1 */
149// movq_r2r (mm3, mm5); /* c4-c4-c3-c3-c2-c2-c1-c1 */
150
151// punpcklbw_r2r (mm5, mm3); /* c2-c2-c2-c2-c1-c1-c1-c1 */
152// punpckhbw_r2r (mm5, mm4); /* c4-c4-c4-c4-c3-c3-c3-c3 */
153
154 /*
155 * pre : mm0 : Y pos [*|Y]
156 * mm1 : X pos [*|X]
157 *
158 * post : mm0 : expix1[position]
159 * mm2 : expix1[position+width]
160 *
161 * modif : eax,ecx
 *
 * After dropping the PERTEDEC fractional bits, the asm computes
 * eax = (low dword of mm1) * prevX + (low dword of mm0) and uses it as an
 * index (scaled by 4) from the expix1 base held in ecx; each movq reads
 * 8 bytes, i.e. two adjacent 32-bit pixels.
 * The lines marked with '^' are the coefficient extraction described
 * above, interleaved with the address computation.
 * NOTE(review): mull writes edx:eax, but edx is not in the clobber list -
 * confirm this is safe.
162 */
163 psrld_i2r (PERTEDEC,mm0);
164 psrld_i2r (PERTEDEC,mm1);
165 __asm__ __volatile__ (
166 "movd %%mm1,%%eax\n"
167 /*^*/ "movq %%mm3,%%mm5\n" /*^*/
168
169 "mull %1\n"
170 "movd %%mm0,%%ecx\n"
171 /*^*/ "punpcklbw %%mm5, %%mm3\n" /*^*/
172
173 "addl %%ecx,%%eax\n"
174 /*^*/ "movq %%mm3,%%mm4\n" /*^*/
175 /*^*/ "movq %%mm3,%%mm5\n" /*^*/
176
177 "movl %0,%%ecx\n"
178 /*^*/ "punpcklbw %%mm5,%%mm3\n" /*^*/
179
180 "movq (%%ecx,%%eax,4),%%mm0\n"
181 /*^*/ "punpckhbw %%mm5,%%mm4\n" /*^*/
182
183 "addl %1,%%eax\n"
184 "movq (%%ecx,%%eax,4),%%mm2\n"
185
186 : : "X"(expix1), "X"(prevX):"eax","ecx"
187 );
188
189 /*
190 * pre : mm0 : expix1[position]
191 * mm2 : expix1[position+width]
192 * mm3 & mm4 : coefs
193 */
194
195 /* copy the first two pixels into mm0 and mm1 */
196 movq_r2r (mm0, mm1); /* b1-v1-r1-a1-b2-v2-r2-a2 */
197
198 /* unpack the first pixel (bytes widened to words using the zero in mm7) */
199 punpcklbw_r2r (mm7, mm0); /* 00-b2-00-v2-00-r2-00-a2 */
200
201 /* extract the coefficients... */
202
203 movq_r2r (mm3, mm5); /* c2-c2-c2-c2-c1-c1-c1-c1 */
204
205 /*^in parallel^*/ /* unpack the 2nd pixel */
206 /*^*/ punpckhbw_r2r (mm7, mm1); /* 00-b1-00-v1-00-r1-00-a1 */
207
208 punpcklbw_r2r (mm7, mm5); /* 00-c1-00-c1-00-c1-00-c1 */
209 punpckhbw_r2r (mm7, mm3); /* 00-c2-00-c2-00-c2-00-c2 */
210
211 /* multiply the pixels by the coefficients */
212 pmullw_r2r (mm5, mm0); /* c1*b2-c1*v2-c1*r2-c1*a2 */
213 pmullw_r2r (mm3, mm1); /* c2*b1-c2*v1-c2*r1-c2*a1 */
214 paddw_r2r (mm1, mm0);
215
216 /* ...extract the last 2 coefficients */
217 movq_r2r (mm4, mm5); /* c4-c4-c4-c4-c3-c3-c3-c3 */
218 punpcklbw_r2r (mm7, mm4); /* 00-c3-00-c3-00-c3-00-c3 */
219 punpckhbw_r2r (mm7, mm5); /* 00-c4-00-c4-00-c4-00-c4 */
220
221 /* fetch the last 2 pixels (the pair loaded from position+width) */
222 movq_r2r (mm2, mm1);
223
224 /* unpack the pixels */
225 punpcklbw_r2r (mm7, mm1);
226 punpckhbw_r2r (mm7, mm2);
227
228 /* multiply by the coefficients */
229 pmullw_r2r (mm4, mm1);
230 pmullw_r2r (mm5, mm2);
231
232 /* add the resulting values to the running total */
233 paddw_r2r (mm1, mm0);
234 paddw_r2r (mm2, mm0);
235
236 /* divide by 256 = 16+16+16+16, then repack the final pixel */
237 psrlw_i2r (8, mm0);
238 packuswb_r2r (mm7, mm0);
239
240 movd_r2m (mm0,expix2[loop]);
241
242 ++loop;
243 }
 /* Leave MMX state before returning: femms when HAVE_ATHLON is defined, emms otherwise. */
244#ifdef HAVE_ATHLON
245 __asm__ __volatile__ ("femms\n");
246#else
247 emms();
248#endif
249}
250#else
/* Fallback branch of the MMX / non-x86-64 preprocessor guard: always returns 0.
 * NOTE(review): presumably this is the body of zoom_filter_xmmx_supported(),
 * i.e. the filter is reported as unavailable - the header line is not visible here. */
252 return 0;
253}
/* No-op stub of zoom_filter_xmmx, compiled when the MMX implementation is
 * excluded by the preprocessor guard (MMX not defined, or an x86-64 target).
 * The body is empty: every argument is ignored (hence [[maybe_unused]]) and
 * neither pixel buffer is touched. */
254void zoom_filter_xmmx ([[maybe_unused]] int prevX,
255 [[maybe_unused]] int prevY,
256 [[maybe_unused]] unsigned int *expix1,
257 [[maybe_unused]] unsigned int *expix2,
258 [[maybe_unused]] const int *brutS,
259 [[maybe_unused]] const int *brutD,
260 [[maybe_unused]] int buffratio,
261 [[maybe_unused]] GoomCoefficients& precalCoef)
262{
263}
264#endif
signed int * brutD
Definition: filters.cpp:101
guint32 * expix2
Definition: filters.cpp:106
guint32 * expix1
Definition: filters.cpp:105
unsigned int prevY
Definition: filters.cpp:111
unsigned int prevX
Definition: filters.cpp:111
GoomCoefficients precalCoef
Optimisation by Jeko: precomputed table of the 4 coefficients resulting from the 2 positions
Definition: filters.cpp:146
int buffratio
Change by Jeko: fixed point: buffratio is in 16:16 format (so 0 <= buffratio <= 2^16)
Definition: filters.cpp:127
signed int * brutS
Definition: filters.cpp:100
#define PERTEDEC
void zoom_filter_xmmx(int prevX, int prevY, unsigned int *expix1, unsigned int *expix2, const int *lbruS, const int *lbruD, int buffratio, GoomCoefficients &precalCoef)
int zoom_filter_xmmx_supported()
std::array< std::array< int, 16 >, 16 > GoomCoefficients
Definition: zoom_filters.h:3