MythTV  master
zoom_filter_xmmx.cpp
Go to the documentation of this file.
2 
3 #if defined(MMX) && !(defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || defined(_M_X64))
4 /* define this to get exactly the same result as the C function
5  * (a tiny bit slower)
6  */
7 #define STRICT_COMPAT
8 //#define HAVE_ATHLON
9 
10 #define BUFFPOINTNB 16
11 #define BUFFPOINTMASK 0xffff
12 #define BUFFINCR 0xff
13 
14 #define sqrtperte 16
15 // use: a % sqrtperte <=> a & PERTEMASK
16 #define PERTEMASK 0xf
17 // use: a / sqrtperte <=> a >> PERTEDEC
18 #define PERTEDEC 4
19 
20 
21 //#define MMX_TRACE
22 #include "mmx.h"
23 extern "C" {
24 #include "libavutil/cpu.h"
25 }
26 
28  return (av_get_cpu_flags() & AV_CPU_FLAG_SSE) >> 3;
29 }
30 
/*
 * Zoom filter implemented with MMX inline assembly. This variant is compiled
 * only when MMX is defined and the target is not x86-64 (see the #if at the
 * top of this listing).
 *
 * For every index `loop` in [0, prevX * prevY) the body:
 *   1. loads one 64-bit entry from each of brutS and brutD (lbruS / lbruD
 *      reinterpreted as mmx_t) and interpolates between them with buffratio
 *      (documented below as S + ((D-S)*buffratio)>>16),
 *   2. clips the resulting position against
 *      prevXY = ((prevX-1)<<PERTEDEC, (prevY-1)<<PERTEDEC),
 *   3. loads a packed coefficient word selected by the low 4 bits of each
 *      coordinate, with precalCoef as the table operand,
 *   4. reads two 64-bit pixel pairs from expix1 (at the computed offset and
 *      at that offset + prevX), weights them with the coefficients, applies
 *      psrlw by 8 and packuswb, and stores one 32-bit result in expix2[loop].
 * The function ends with emms() (or the femms instruction when HAVE_ATHLON
 * is defined).
 *
 * NOTE(review): this listing skips source lines 34 and 46. The body uses
 * precalCoef, so the last parameter is presumably declared on the missing
 * line 34 -- confirm against the real file.
 */
31 void zoom_filter_xmmx (int prevX, int prevY,
32  unsigned int *expix1, unsigned int *expix2,
33  const int *lbruS, const int *lbruD, int buffratio,
35 {
36  int bufsize = prevX * prevY; /* buffer size */
37  volatile int loop; /* loop counter */
38 
39  mmx_t *brutS = (mmx_t*)lbruS; /* source transformation buffer */
40  mmx_t *brutD = (mmx_t*)lbruD; /* destination transformation buffer */
41 
42  volatile mmx_t prevXY;
43  volatile mmx_t ratiox;
44  //volatile mmx_t interpix;
45 
47 
48  prevXY.ud[0] = (prevX-1)<<PERTEDEC;
49  prevXY.ud[1] = (prevY-1)<<PERTEDEC;
50 
51  ratiox.d[0] = buffratio;
52  ratiox.d[1] = buffratio;
53  movq_m2r (ratiox, mm6);
54  pslld_i2r (16,mm6);
55 
56  pxor_r2r (mm7,mm7); /* zero out mm7 */
57 
58  loop=0;
59 
60  /*
61  * NOTE: mm6 and mm7 are not modified inside the loop.
62  */
63  while (loop < bufsize)
64  {
65  /*
66  * pre : mm6 = [buffratio<<16|buffratio<<16]
67  * post : mm0 = S + ((D-S)*buffratio)>>16 format [X|Y]
68  * modified = mm0,mm1,mm2
69  */
70 
71  __asm__ __volatile__ (
72  "movq %0,%%mm0\n"
73  "movq %1,%%mm1\n"
74  : :"m"(brutS[loop]),"m"(brutD[loop])
75  ); /* mm0 = S */
76 
77  psubd_r2r (mm0,mm1); /* mm1 = D - S */
78  movq_r2r (mm1, mm2); /* mm2 = D - S */
79 
80  pslld_i2r (16,mm1);
81  mmx_r2r (pmulhuw, mm6, mm1); /* mm1 = ?? */
82  pmullw_r2r (mm6, mm2);
83 
84  paddd_r2r (mm2, mm1); /* mm1 = (D - S) * buffratio >> 16 */
85  pslld_i2r (16,mm0);
86 
87  paddd_r2r (mm1, mm0); /* mm0 = S + mm1 */
88  psrld_i2r (16, mm0);
89 
90  /*
91  * pre : mm0 : position vector on screen
92  * prevXY : coordinate of the lower-right point on screen
93  * post : clipped mm0
94  * modified : mm0,mm1,mm2
95  */
96  movq_m2r (prevXY,mm1);
97  pcmpgtd_r2r (mm0, mm1); /* the X lane of the mask holds (NOTE(review): the original comment named mm0, but mm1 is what gets ANDed into mm0 below -- confirm):
98  1111 if prevXY > px
99  0000 if prevXY <= px
100  (same for y) */
101 #ifdef STRICT_COMPAT
102  movq_r2r (mm1,mm2);
103  punpckhdq_r2r (mm2,mm2);
104  punpckldq_r2r (mm1,mm1);
105  pand_r2r (mm2, mm0);
106 #endif
107  pand_r2r (mm1, mm0); /* zero out the part that overflows */
108 
109  /*
110  * pre : mm0 : clipped position on screen
111  *
112  * post : mm3 & mm4 : coefs for this position
113  * mm1 : X vector [0|X]
114  *
115  * modif : eax,ecx
116  */
117  __asm__ __volatile__ (
118  "movd %%mm0,%%ecx\n"
119  "movq %%mm0,%%mm1\n"
120 
121  "andl $15,%%ecx\n"
122  "psrlq $32,%%mm1\n"
123 
124  "shll $6,%%ecx\n"
125  "movd %%mm1,%%eax\n"
126 
127  "addl %0,%%ecx\n"
128  "andl $15,%%eax\n"
129 
130  "movd (%%ecx,%%eax,4),%%mm3\n"
131  /* ::"X"(precalCoef):"eax","ecx"); */
132  ::"m"(precalCoef):"eax","ecx");
133 
134 
135  /*
136  * coefficient extraction...
137  *
138  * pre : coef in mm3
139  *
140  * post : coefs extracted into mm3 (c1 & c2)
141  * and mm4 (c3 & c4)
142  *
143  * modif : mm5
144  */
145 // interleaved with the following section (see the '^' markers)
146 // movq_r2r (mm3, mm5); /* ??-??-??-??-c4-c3-c2-c1 */
147 // punpcklbw_r2r (mm5, mm3); /* c4-c4-c3-c3-c2-c2-c1-c1 */
148 // movq_r2r (mm3, mm4); /* c4-c4-c3-c3-c2-c2-c1-c1 */
149 // movq_r2r (mm3, mm5); /* c4-c4-c3-c3-c2-c2-c1-c1 */
150 
151 // punpcklbw_r2r (mm5, mm3); /* c2-c2-c2-c2-c1-c1-c1-c1 */
152 // punpckhbw_r2r (mm5, mm4); /* c4-c4-c4-c4-c3-c3-c3-c3 */
153 
154  /*
155  * pre : mm0 : Y pos [*|Y]
156  * mm1 : X pos [*|X]
157  *
158  * post : mm0 : expix1[position]
159  * mm2 : expix1[position+width]
160  *
161  * modif : eax,ecx
162  */
163  psrld_i2r (PERTEDEC,mm0);
164  psrld_i2r (PERTEDEC,mm1);
165  __asm__ __volatile__ (
166  "movd %%mm1,%%eax\n"
167  /*^*/ "movq %%mm3,%%mm5\n" /*^*/
168 
169  "mull %1\n"
170  "movd %%mm0,%%ecx\n"
171  /*^*/ "punpcklbw %%mm5, %%mm3\n" /*^*/
172 
173  "addl %%ecx,%%eax\n"
174  /*^*/ "movq %%mm3,%%mm4\n" /*^*/
175  /*^*/ "movq %%mm3,%%mm5\n" /*^*/
176 
177  "movl %0,%%ecx\n"
178  /*^*/ "punpcklbw %%mm5,%%mm3\n" /*^*/
179 
180  "movq (%%ecx,%%eax,4),%%mm0\n"
181  /*^*/ "punpckhbw %%mm5,%%mm4\n" /*^*/
182 
183  "addl %1,%%eax\n"
184  "movq (%%ecx,%%eax,4),%%mm2\n"
185 
186  : : "X"(expix1), "X"(prevX):"eax","ecx"
187  );
188 
189  /*
190  * pre : mm0 : expix1[position]
191  * mm2 : expix1[position+width]
192  * mm3 & mm4 : coefs
193  */
194 
195  /* copy the first two pixels into mm0 and mm1 */
196  movq_r2r (mm0, mm1); /* b1-v1-r1-a1-b2-v2-r2-a2 */
197 
198  /* unpack the first pixel */
199  punpcklbw_r2r (mm7, mm0); /* 00-b2-00-v2-00-r2-00-a2 */
200 
201  /* coefficient extraction... */
202 
203  movq_r2r (mm3, mm5); /* c2-c2-c2-c2-c1-c1-c1-c1 */
204 
205  /*^in parallel^*/ /* unpack the 2nd pixel */
206  /*^*/ punpckhbw_r2r (mm7, mm1); /* 00-b1-00-v1-00-r1-00-a1 */
207 
208  punpcklbw_r2r (mm7, mm5); /* 00-c1-00-c1-00-c1-00-c1 */
209  punpckhbw_r2r (mm7, mm3); /* 00-c2-00-c2-00-c2-00-c2 */
210 
211  /* multiply the pixels by the coefficients */
212  pmullw_r2r (mm5, mm0); /* c1*b2-c1*v2-c1*r2-c1*a2 */
213  pmullw_r2r (mm3, mm1); /* c2*b1-c2*v1-c2*r1-c2*a1 */
214  paddw_r2r (mm1, mm0);
215 
216  /* ...extraction of the last 2 coefficients */
217  movq_r2r (mm4, mm5); /* c4-c4-c4-c4-c3-c3-c3-c3 */
218  punpcklbw_r2r (mm7, mm4); /* 00-c3-00-c3-00-c3-00-c3 */
219  punpckhbw_r2r (mm7, mm5); /* 00-c4-00-c4-00-c4-00-c4 */
220 
221  /* fetch the last 2 pixels */
222  movq_r2r (mm2, mm1);
223 
224  /* unpack the pixels */
225  punpcklbw_r2r (mm7, mm1);
226  punpckhbw_r2r (mm7, mm2);
227 
228  /* multiply by the coefficients */
229  pmullw_r2r (mm4, mm1);
230  pmullw_r2r (mm5, mm2);
231 
232  /* add the resulting values to the final value */
233  paddw_r2r (mm1, mm0);
234  paddw_r2r (mm2, mm0);
235 
236  /* divide by 256 = 16+16+16+16, then repack the final pixel */
237  psrlw_i2r (8, mm0);
238  packuswb_r2r (mm7, mm0);
239 
240  movd_r2m (mm0,expix2[loop]);
241 
242  ++loop;
243  }
244 #ifdef HAVE_ATHLON
245  __asm__ __volatile__ ("femms\n");
246 #else
247  emms();
248 #endif
249 }
250 #else
252  return 0;
253 }
/* Fallback stub used when the #if condition at the top of this file is false
 * (MMX not defined, or an x86-64 target): every parameter is marked
 * [[maybe_unused]] and the body is empty, so calling it does nothing. */
254 void zoom_filter_xmmx ([[maybe_unused]] int prevX,
255  [[maybe_unused]] int prevY,
256  [[maybe_unused]] unsigned int *expix1,
257  [[maybe_unused]] unsigned int *expix2,
258  [[maybe_unused]] const int *brutS,
259  [[maybe_unused]] const int *brutD,
260  [[maybe_unused]] int buffratio,
261  [[maybe_unused]] GoomCoefficients& precalCoef)
262 {
263 }
264 #endif
precalCoef
GoomCoefficients precalCoef
Optimization by Jeko: precomputes the 4 coefficients resulting from the 2 positions
Definition: filters.cpp:146
prevX
unsigned int prevX
Definition: filters.cpp:111
expix1
guint32 * expix1
Definition: filters.cpp:105
mmx.h
PERTEDEC
#define PERTEDEC
Definition: zoom_filter_xmmx.cpp:18
prevY
unsigned int prevY
Definition: filters.cpp:111
GoomCoefficients
std::array< std::array< int, 16 >, 16 > GoomCoefficients
Definition: zoom_filters.h:3
buffratio
int buffratio
Modification by Jeko: fixed point: buffratio = (16:16) (so 0 <= buffratio <= 2^16)
Definition: filters.cpp:127
zoom_filter_xmmx_supported
int zoom_filter_xmmx_supported()
Definition: zoom_filter_xmmx.cpp:27
zoom_filters.h
brutD
signed int * brutD
Definition: filters.cpp:101
brutS
signed int * brutS
Definition: filters.cpp:100
zoom_filter_xmmx
void zoom_filter_xmmx(int prevX, int prevY, unsigned int *expix1, unsigned int *expix2, const int *lbruS, const int *lbruD, int buffratio, GoomCoefficients &precalCoef)
Definition: zoom_filter_xmmx.cpp:31
expix2
guint32 * expix2
Definition: filters.cpp:106