MythTV  0.28pre
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Properties Friends Groups Pages
sse_gcc.cpp
Go to the documentation of this file.
1 // SSE2 versions of the expensive routines for float samples
2 
3 #include "STTypes.h"
4 #include "TDStretch.h"
5 #include "FIRFilter.h"
6 #include "inttypes.h"
7 
8 using namespace soundtouch;
9 
10 double TDStretchSSE3::calcCrossCorrMulti(const float *mPos, const float *cPos) const
11 {
12  double corr = 0;
13  int count = overlapLength * channels;
14  int loops = count >> 4;
15  int i = loops << 4;
16  const float *mp = mPos;
17  const float *cp = cPos;
18 
19  __asm__ volatile (
20  "xorpd %%xmm7, %%xmm7 \n\t"
21  "1: \n\t"
22  "movups (%1), %%xmm0 \n\t"
23  "movups 16(%1), %%xmm1 \n\t"
24  "mulps (%2), %%xmm0 \n\t"
25  "movups 32(%1), %%xmm2 \n\t"
26  "addps %%xmm0, %%xmm7 \n\t"
27  "mulps 16(%2), %%xmm1 \n\t"
28  "movups 48(%1), %%xmm3 \n\t"
29  "mulps 32(%2), %%xmm2 \n\t"
30  "addps %%xmm1, %%xmm7 \n\t"
31  "mulps 48(%2), %%xmm3 \n\t"
32  "addps %%xmm2, %%xmm7 \n\t"
33  "add $64, %1 \n\t"
34  "add $64, %2 \n\t"
35  "addps %%xmm3, %%xmm7 \n\t"
36  "sub $1, %%ecx \n\t"
37  "jnz 1b \n\t"
38  "haddps %%xmm7, %%xmm7 \n\t"
39  "cvtps2pd %%xmm7, %%xmm7 \n\t"
40  "haddpd %%xmm7, %%xmm7 \n\t"
41  "movsd %%xmm7, %0 \n\t"
42  :"=m"(corr),"+r"(mp), "+r"(cp)
43  :"c"(loops)
44  );
45 
46  for (; i < count; i++)
47  corr += *mp++ * *cp++;
48 
49  return corr;
50 }
51 
52 double TDStretchSSE2::calcCrossCorrMulti(const float *mPos, const float *cPos) const
53 {
54  double corr = 0;
55  int count = overlapLength * channels;
56  int loops = count >> 4;
57  int i = loops << 4;
58  const float *mp = mPos;
59  const float *cp = cPos;
60 
61  __asm__ volatile (
62  "xorpd %%xmm7, %%xmm7 \n\t"
63  "1: \n\t"
64  "movups (%1), %%xmm0 \n\t"
65  "movups 16(%1), %%xmm1 \n\t"
66  "mulps (%2), %%xmm0 \n\t"
67  "movups 32(%1), %%xmm2 \n\t"
68  "addps %%xmm0, %%xmm7 \n\t"
69  "mulps 16(%2), %%xmm1 \n\t"
70  "movups 48(%1), %%xmm3 \n\t"
71  "mulps 32(%2), %%xmm2 \n\t"
72  "addps %%xmm1, %%xmm7 \n\t"
73  "mulps 48(%2), %%xmm3 \n\t"
74  "addps %%xmm2, %%xmm7 \n\t"
75  "add $64, %1 \n\t"
76  "add $64, %2 \n\t"
77  "addps %%xmm3, %%xmm7 \n\t"
78  "sub $1, %%ecx \n\t"
79  "jnz 1b \n\t"
80  "movaps %%xmm7, %%xmm6 \n\t"
81  "shufps $0x4e, %%xmm7, %%xmm6 \n\t"
82  "addps %%xmm6, %%xmm7 \n\t"
83  "cvtps2pd %%xmm7, %%xmm7 \n\t"
84  "movapd %%xmm7, %%xmm6 \n\t"
85  "shufpd $0x01, %%xmm7, %%xmm6 \n\t"
86  "addpd %%xmm6, %%xmm7 \n\t"
87  "movsd %%xmm7, %0 \n\t"
88  :"=m"(corr),"+r"(mp), "+r"(cp)
89  :"c"(loops)
90  );
91 
92  for (; i < count; i++)
93  corr += *mp++ * *cp++;
94 
95  return corr;
96 }
97 
98 double TDStretchSSE3::calcCrossCorrStereo(const float *mPos, const float *cPos) const
99 {
100  double corr = 0;
101  int count = overlapLength <<1;
102  int loops = count >> 4;
103  int i = loops << 4;
104  const float *mp = mPos;
105  const float *cp = cPos;
106 
107  __asm__ volatile (
108  "xorpd %%xmm7, %%xmm7 \n\t"
109  "1: \n\t"
110  "movups (%1), %%xmm0 \n\t"
111  "movups 16(%1), %%xmm1 \n\t"
112  "mulps (%2), %%xmm0 \n\t"
113  "movups 32(%1), %%xmm2 \n\t"
114  "addps %%xmm0, %%xmm7 \n\t"
115  "mulps 16(%2), %%xmm1 \n\t"
116  "movups 48(%1), %%xmm3 \n\t"
117  "mulps 32(%2), %%xmm2 \n\t"
118  "addps %%xmm1, %%xmm7 \n\t"
119  "mulps 48(%2), %%xmm3 \n\t"
120  "addps %%xmm2, %%xmm7 \n\t"
121  "add $64, %1 \n\t"
122  "add $64, %2 \n\t"
123  "addps %%xmm3, %%xmm7 \n\t"
124  "sub $1, %%ecx \n\t"
125  "jnz 1b \n\t"
126  "haddps %%xmm7, %%xmm7 \n\t"
127  "cvtps2pd %%xmm7, %%xmm7 \n\t"
128  "haddpd %%xmm7, %%xmm7 \n\t"
129  "movsd %%xmm7, %0 \n\t"
130  :"=m"(corr),"+r"(mp), "+r"(cp)
131  :"c"(loops)
132  );
133 
134  for (; i < count; i += 2)
135  corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
136 
137  return corr;
138 }
139 
140 double TDStretchSSE2::calcCrossCorrStereo(const float *mPos, const float *cPos) const
141 {
142  double corr = 0;
143  int count = overlapLength <<1;
144  int loops = count >> 4;
145  int i = loops << 4;
146  const float *mp = mPos;
147  const float *cp = cPos;
148 
149  __asm__ volatile (
150  "xorpd %%xmm7, %%xmm7 \n\t"
151  "1: \n\t"
152  "movups (%1), %%xmm0 \n\t"
153  "movups 16(%1), %%xmm1 \n\t"
154  "mulps (%2), %%xmm0 \n\t"
155  "movups 32(%1), %%xmm2 \n\t"
156  "addps %%xmm0, %%xmm7 \n\t"
157  "mulps 16(%2), %%xmm1 \n\t"
158  "movups 48(%1), %%xmm3 \n\t"
159  "mulps 32(%2), %%xmm2 \n\t"
160  "addps %%xmm1, %%xmm7 \n\t"
161  "mulps 48(%2), %%xmm3 \n\t"
162  "addps %%xmm2, %%xmm7 \n\t"
163  "add $64, %1 \n\t"
164  "add $64, %2 \n\t"
165  "addps %%xmm3, %%xmm7 \n\t"
166  "sub $1, %%ecx \n\t"
167  "jnz 1b \n\t"
168  "movaps %%xmm7, %%xmm6 \n\t"
169  "shufps $0x4e, %%xmm7, %%xmm6 \n\t"
170  "addps %%xmm6, %%xmm7 \n\t"
171  "cvtps2pd %%xmm7, %%xmm7 \n\t"
172  "movapd %%xmm7, %%xmm6 \n\t"
173  "shufpd $0x01, %%xmm7, %%xmm6 \n\t"
174  "addpd %%xmm6, %%xmm7 \n\t"
175  "movsd %%xmm7, %0 \n\t"
176  :"=m"(corr),"+r"(mp), "+r"(cp)
177  :"c"(loops)
178  );
179 
180  for (; i < count; i += 2)
181  corr += (mp[i] * cp[i] + mp[i + 1] * cp[i + 1]);
182 
183  return corr;
184 }
185 
186 void TDStretchSSE2::overlapMulti(float *output, const float *input) const
187 {
188 
189  float *o = output;
190  const float *i = input;
191  const float *m = pMidBuffer;
192 
193  if (channels > 4)
194  __asm__ volatile (
195  "cvtsi2ss %%ecx, %%xmm7 \n\t"
196  "shl $2, %4 \n\t"
197  "punpckldq %%xmm7, %%xmm7 \n\t"
198  "xorpd %%xmm6, %%xmm6 \n\t"
199  "punpckldq %%xmm7, %%xmm7 \n\t"
200  "rcpps %%xmm7, %%xmm1 \n\t"
201  "mulps %%xmm1, %%xmm7 \n\t"
202  "1: \n\t"
203  "movups (%1), %%xmm2 \n\t"
204  "movups 16(%1), %%xmm4 \n\t"
205  "mulps %%xmm6, %%xmm2 \n\t"
206  "movups (%2), %%xmm3 \n\t"
207  "movups 16(%2), %%xmm5 \n\t"
208  "mulps %%xmm7, %%xmm3 \n\t"
209  "add %4, %1 \n\t"
210  "mulps %%xmm6, %%xmm4 \n\t"
211  "addps %%xmm2, %%xmm3 \n\t"
212  "mulps %%xmm7, %%xmm5 \n\t"
213  "movups %%xmm3, (%3) \n\t"
214  "addps %%xmm4, %%xmm5 \n\t"
215  "add %4, %2 \n\t"
216  "movups %%xmm5, 16(%3) \n\t"
217  "addps %%xmm1, %%xmm6 \n\t"
218  "add %4, %3 \n\t"
219  "subps %%xmm1, %%xmm7 \n\t"
220  "sub $1, %%ecx \n\t"
221  "jnz 1b \n\t"
222  :
223  :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
224  );
225  else
226  __asm__ volatile (
227  "cvtsi2ss %%ecx, %%xmm7 \n\t"
228  "shl $2, %4 \n\t"
229  "shr %%ecx \n\t"
230  "punpckldq %%xmm7, %%xmm7 \n\t"
231  "xorpd %%xmm6, %%xmm6 \n\t"
232  "punpckldq %%xmm7, %%xmm7 \n\t"
233  "rcpps %%xmm7, %%xmm1 \n\t"
234  "mulps %%xmm1, %%xmm7 \n\t"
235  "1: \n\t"
236  "movups (%1), %%xmm2 \n\t"
237  "movups 16(%1), %%xmm4 \n\t"
238  "mulps %%xmm6, %%xmm2 \n\t"
239  "movups (%2), %%xmm3 \n\t"
240  "movups 16(%2), %%xmm5 \n\t"
241  "mulps %%xmm7, %%xmm3 \n\t"
242  "addps %%xmm1, %%xmm6 \n\t"
243  "add %4, %1 \n\t"
244  "addps %%xmm2, %%xmm3 \n\t"
245  "add %4, %2 \n\t"
246  "subps %%xmm1, %%xmm7 \n\t"
247  "movups %%xmm3, (%3) \n\t"
248  "add %4, %3 \n\t"
249  "mulps %%xmm6, %%xmm4 \n\t"
250  "add %4, %1 \n\t"
251  "mulps %%xmm7, %%xmm5 \n\t"
252  "addps %%xmm1, %%xmm6 \n\t"
253  "add %4, %2 \n\t"
254  "addps %%xmm4, %%xmm5 \n\t"
255  "subps %%xmm1, %%xmm7 \n\t"
256  "movups %%xmm5, (%3) \n\t"
257  "add %4, %3 \n\t"
258  "sub $1, %%ecx \n\t"
259  "jnz 1b \n\t"
260  :
261  :"c"(overlapLength),"r"(i),"r"(m),"r"(o),"r"((long)channels)
262  );
263 }
264 
265 void TDStretchSSE2::overlapStereo(float *output, const float *input) const
266 {
267  float *o = output;
268  const float *i = input;
269  const float *m = pMidBuffer;
270 
271  __asm__ volatile (
272  "cvtsi2ss %%ecx, %%xmm7 \n\t"
273  "shr %%ecx \n\t"
274  "xorpd %%xmm6, %%xmm6 \n\t"
275  "punpckldq %%xmm7, %%xmm7 \n\t"
276  "rcpps %%xmm7, %%xmm1 \n\t"
277  "mulps %%xmm1, %%xmm7 \n\t"
278  "1: \n\t"
279  "movups (%1), %%xmm2 \n\t"
280  "movups 8(%1), %%xmm4 \n\t"
281  "mulps %%xmm6, %%xmm2 \n\t"
282  "movups (%2), %%xmm3 \n\t"
283  "movups 8(%2), %%xmm5 \n\t"
284  "mulps %%xmm7, %%xmm3 \n\t"
285  "addps %%xmm1, %%xmm6 \n\t"
286  "addps %%xmm2, %%xmm3 \n\t"
287  "subps %%xmm1, %%xmm7 \n\t"
288  "movlps %%xmm3, (%3) \n\t"
289  "add $8, %3 \n\t"
290  "mulps %%xmm6, %%xmm4 \n\t"
291  "add $16, %1 \n\t"
292  "mulps %%xmm7, %%xmm5 \n\t"
293  "addps %%xmm1, %%xmm6 \n\t"
294  "add $16, %2 \n\t"
295  "addps %%xmm4, %%xmm5 \n\t"
296  "subps %%xmm1, %%xmm7 \n\t"
297  "movlps %%xmm5, (%3) \n\t"
298  "add $8, %3 \n\t"
299  "sub $1, %%ecx \n\t"
300  "jnz 1b \n\t"
301  :
302  :"c"(overlapLength),"r"(i),"r"(m),"r"(o)
303  );
304 }
305 
307 {
308  filterCoeffsAlign = NULL;
309  filterCoeffsUnalign = NULL;
310 }
311 
313 {
314  delete[] filterCoeffsUnalign;
315  filterCoeffsAlign = NULL;
316  filterCoeffsUnalign = NULL;
317 }
318 
319 
320 void FIRFilterSSE2::setCoefficients(const float *coeffs, uint newLen, uint uRDF)
321 {
322  uint i;
323  FIRFilter::setCoefficients(coeffs, newLen, uRDF);
324 
325  // Ensure that filter coeffs array is aligned to 16-byte boundary
326  delete[] filterCoeffsUnalign;
327  filterCoeffsUnalign = new float[2 * newLen + 16];
328  filterCoeffsAlign = (float *)(((ulong)filterCoeffsUnalign + 15) & -16);
329 
330  float fdiv = (float)resultDivider;
331 
332  for (i = 0; i < newLen; i++)
333  {
334  filterCoeffsAlign[2 * i + 0] =
335  filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fdiv;
336  }
337 }
338 
339 uint FIRFilterSSE2::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
340 {
341  uint count = (numSamples - length) & -2;
342 
343  for (int i = 0; i < count; i += 2)
344  {
345  __asm__ volatile(
346  "xorpd %%xmm6, %%xmm6 \n\t"
347  "xorpd %%xmm7, %%xmm7 \n\t"
348  "1: \n\t"
349  "movups (%1), %%xmm1 \n\t"
350  "movups 8(%1), %%xmm2 \n\t"
351  "mulps (%2), %%xmm1 \n\t"
352  "movups 16(%1), %%xmm3 \n\t"
353  "mulps (%2), %%xmm2 \n\t"
354  "addps %%xmm1, %%xmm6 \n\t"
355  "movups 24(%1), %%xmm4 \n\t"
356  "addps %%xmm2, %%xmm7 \n\t"
357  "mulps 16(%2), %%xmm3 \n\t"
358  "movups 32(%1), %%xmm1 \n\t"
359  "mulps 16(%2), %%xmm4 \n\t"
360  "addps %%xmm3, %%xmm6 \n\t"
361  "movups 40(%1), %%xmm2 \n\t"
362  "addps %%xmm4, %%xmm7 \n\t"
363  "mulps 32(%2), %%xmm1 \n\t"
364  "movups 48(%1), %%xmm3 \n\t"
365  "mulps 32(%2), %%xmm2 \n\t"
366  "addps %%xmm1, %%xmm6 \n\t"
367  "movups 56(%1), %%xmm4 \n\t"
368  "addps %%xmm2, %%xmm7 \n\t"
369  "mulps 48(%2), %%xmm3 \n\t"
370  "add $64, %1 \n\t"
371  "mulps 48(%2), %%xmm4 \n\t"
372  "addps %%xmm3, %%xmm6 \n\t"
373  "add $64, %2 \n\t"
374  "addps %%xmm4, %%xmm7 \n\t"
375  "sub $1, %%ecx \n\t"
376  "jnz 1b \n\t"
377  "movhlps %%xmm6, %%xmm0 \n\t"
378  "movlhps %%xmm7, %%xmm0 \n\t"
379  "shufps $0xe4, %%xmm7, %%xmm6 \n\t"
380  "addps %%xmm0, %%xmm6 \n\t"
381  "movups %%xmm6, (%0) \n\t"
382  :
383  :"r"(dest),"r"(src),"r"(filterCoeffsAlign),"c"(length>>3)
384  );
385  src += 4;
386  dest += 4;
387  }
388 
389  return count;
390 }
virtual void overlapStereo(float *output, const float *input) const
Definition: sse_gcc.cpp:265
double calcCrossCorrStereo(const float *mixingPos, const float *compare) const
Definition: sse_gcc.cpp:98
unsigned int uint
Definition: compat.h:135
SAMPLETYPE * pMidBuffer
Definition: TDStretch.h:104
double calcCrossCorrStereo(const float *mixingPos, const float *compare) const
Definition: sse_gcc.cpp:140
double calcCrossCorrMulti(const float *mixingPos, const float *compare) const
Definition: sse_gcc.cpp:10
dest
Definition: minilzo.cpp:2074
virtual uint evaluateFilterStereo(float *dest, const float *src, uint numSamples) const
Definition: sse_gcc.cpp:339
virtual void setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
Definition: sse_gcc.cpp:320
FIFOSamplePipe * output
Internal pipe where processed samples are put.
uint length
Definition: FIRFilter.h:51
typedef long(ZCALLBACK *tell_file_func) OF((voidpf opaque
soundtouch::SAMPLETYPE resultDivider
Definition: FIRFilter.h:59
double calcCrossCorrMulti(const float *mixingPos, const float *compare) const
Definition: sse_gcc.cpp:52
float * filterCoeffsAlign
Definition: FIRFilter.h:107
virtual void setCoefficients(const soundtouch::SAMPLETYPE *coeffs, uint newLength, uint uResultDivFactor)
Definition: FIRFilter.cpp:171
float * filterCoeffsUnalign
Definition: FIRFilter.h:106
unsigned long ulong
Definition: STTypes.h:43
virtual void overlapMulti(float *output, const float *input) const
Definition: sse_gcc.cpp:186
src
Definition: minilzo.cpp:2074