MythTV 0.28pre
mmx_gcc.cpp
//
// Last changed  : $Date$
// File revision : $Revision$
//
// $Id$
//
// License :
//
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
//

#include "STTypes.h"
using namespace soundtouch;

#ifdef ALLOW_MMX
#include <stdexcept>
#include <string>
#include <climits>

// define USE_GCC_INTRINSICS to use gcc 3.x intrinsics instead of our mmx.h
//#define USE_GCC_INTRINSICS

#ifdef USE_GCC_INTRINSICS
# include <mmintrin.h>
# define SI(A,B...) A
# define GI(X...) X
#else
# include "x86/mmx.h"
# define _mm_empty() __asm__ __volatile__ ("emms")
# define __m64 mmx_t
# define SI(A,B...) B
# define GI(X...)
#endif
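
// The SI() and GI() macros select between the two code paths at compile time:
// with USE_GCC_INTRINSICS defined, SI(A,B) expands to the intrinsic statement A
// and GI() declares the __m64 temporaries, while without it SI(A,B) expands to
// the mmx.h macro B, which operates directly on the hardware registers
// mm0..mm7, and GI() expands to nothing. For example,
//
//     SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
//
// becomes "m0 = _mm_setzero_si64();" in the intrinsic build and
// "pxor_r2r(mm0, mm0);" (an inline "pxor %mm0, %mm0") in the mmx.h build.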

#include "cpu_detect.h"
#include "TDStretch.h"

// MMX routines available only with integer sample type

//////////////////////////////////////////////////////////////////////////////
//
// Implementation of MMX optimized functions of class 'TDStretch'
//
//////////////////////////////////////////////////////////////////////////////

// these are declared in 'TDStretch.cpp'
extern int scanOffsets[4][24];

// Calculates cross correlation of two buffers
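//
// Plain-C sketch of the computation, for reference (the MMX code groups the
// terms differently before the right shift, so the low bits may differ):
//
//     long corr = 0;
//     for (int i = 0; i < 2 * (int)overlapLength; i += 4)
//         corr += (pV1[i]     * pV2[i]     + pV1[i + 1] * pV2[i + 1] +
//                  pV1[i + 2] * pV2[i + 2] + pV1[i + 3] * pV2[i + 3])
//                 >> overlapDividerBits;
//     return corr;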
long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
{
    uint tmp;
    uint counter = (overlapLength>>3)-1;   // counter = overlapLength / 8 - 1; the last round is unrolled below
    __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
    GI(__m64 m0, m1, m2, m3, m4, m5);      // temporaries
    uint shift = overlapDividerBits;

    // prepare for the first round by preloading
    SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));                 // load m1 = pv1[0]
    SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));                 // load m2 = pv1[1]
    SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));        // clear m0
    SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));  // shift in 64bit reg
    do {
        // Calculate cross-correlation between the mixing position and the
        // compared buffer. Process 4 parallel batches of 2 * stereo samples
        // each during one round to improve CPU-level parallelization.
        SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));  // multiply-add m1 = m1 * pv2[0]
        SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));                      // load mm3 = pv1[2]
        SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));  // multiply-add m2 = m2 * pv2[1]
        SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));                      // load mm4 = pv1[3]
        SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3)); // multiply-add m3 = m3 * pv2[2]
        SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));           // add m2 += m1
        SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4)); // multiply-add m4 = m4 * pv2[3]
        SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));                      // mm1 = pv1[0] for next round
        SI(m2 = _mm_sra_pi32(m2, m5), psrad_r2r(mm5, mm2));           // m2 >>= shift (mm5)
        pv1 += 4;                                                     // increment first pointer
        SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));           // m3 += m4
        SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));           // m0 += m2
        SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));                       // mm2 = pv1[1] for next round
        SI(m3 = _mm_sra_pi32(m3, m5), psrad_r2r(mm5, mm3));           // m3 >>= shift (mm5)
        pv2 += 4;                                                     // increment second pointer
        SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));           // add m0 += m3
    } while ((--counter)!=0);

    // Finalize the last partial loop:
    SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
    SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
    SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
    SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
    SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
    SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
    SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
    SI(m2 = _mm_sra_pi32(m2, m5), psrad_r2r(mm5, mm2));
    SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
    SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
    SI(m3 = _mm_sra_pi32(m3, m5), psrad_r2r(mm5, mm3));
    SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));

    // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
    // and finally return the result
    SI(m1 = m0, movq_r2r(mm0, mm1));
    SI(m1 = _mm_srli_si64(m1, 32), psrlq_i2r(32, mm1));
    SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
    SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
    return tmp;
}

#ifdef USE_MULTI_MMX
// Calculates cross correlation of two buffers
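//
// Same computation as calcCrossCorrStereo() above, generalized to any channel
// count: the data (overlapLength * channels shorts) is processed in 16-short
// blocks, and the mm_mask table below zeroes out the products beyond the end
// of the buffer in the final, possibly partial, block.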
long TDStretchMMX::calcCrossCorrMulti(const short *pV1, const short *pV2) const
{
    //static const unsigned long long int mm_half __attribute__ ((aligned(8))) = 0xffffffffULL;
    static const __m64 mm_mask[4][8] __attribute__ ((aligned(8))) = {
        {
            // (remainder & 3) == 0: no partial quadword needs masking
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0,
            0,
            0,
            0
        },
        {
            // (remainder & 3) == 1: mask off the top sample of the last quadword
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0x0000ffffffffffffULL,
            0,
            0,
            0,
            0
        },
        {
            // (remainder & 3) == 2: mask off the top two samples
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0x00000000ffffffffULL,
            0,
            0,
            0,
            0
        },
        {
            // (remainder & 3) == 3: mask off the top three samples
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0xffffffffffffffffULL,
            0x000000000000ffffULL,
            0,
            0,
            0,
            0
        }
    };
    uint tmp;
    uint adjustedOverlapLength = overlapLength*channels;
    uint counter = ((adjustedOverlapLength+15)>>4)-1;  // counter = ceil(adjustedOverlapLength / 16) - 1; the last round is unrolled below
    uint remainder = (16-adjustedOverlapLength)&0xf;   // number of samples needed to pad the last 16-sample block

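    // Indexing mm_mask[remainder & 3][remainder >> 2] steps (remainder >> 2)
    // quadwords into the selected row, so that ANDing ph[0..3] with the last
    // four multiply-add results discards exactly 'remainder' trailing samples.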
    __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2];
    __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
    GI(__m64 m0, m1, m2, m3, m4, m5, m6);  // temporaries
    uint shift = overlapDividerBits;

    // prepare for the first round by preloading
    SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));                 // load m1 = pv1[0]
    SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));                 // load m2 = pv1[1]
    SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));        // clear m0
    SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));  // shift in 64bit reg

    do {
        // Calculate cross-correlation between the mixing position and the
        // compared buffer. Process 4 parallel batches of 4 samples each
        // during one round to improve CPU-level parallelization.
        SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));  // multiply-add m1 = m1 * pv2[0]
        SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));                      // load mm3 = pv1[2]
        SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));  // multiply-add m2 = m2 * pv2[1]
        SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));                      // load mm4 = pv1[3]
        SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3)); // multiply-add m3 = m3 * pv2[2]
        SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));           // add m2 += m1
        SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4)); // multiply-add m4 = m4 * pv2[3]
        SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));                      // mm1 = pv1[0] for next round
        SI(m2 = _mm_sra_pi32(m2, m5), psrad_r2r(mm5, mm2));           // m2 >>= shift (mm5)
        pv1 += 4;                                                     // increment first pointer
        SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));           // m3 += m4
        SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));           // m0 += m2
        SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));                       // mm2 = pv1[1] for next round
        SI(m3 = _mm_sra_pi32(m3, m5), psrad_r2r(mm5, mm3));           // m3 >>= shift (mm5)
        pv2 += 4;                                                     // increment second pointer
        SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));           // add m0 += m3
    } while ((--counter)!=0);

    SI(m6 = ph[0], movq_a2r(0, ph, mm6));
    // Finalize the last partial loop:
    SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
    SI(m1 = _mm_and_si64(m1, m6), pand_r2r(mm6, mm1));
    SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
    SI(m6 = ph[1], movq_a2r(8, ph, mm6));
    SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
    SI(m2 = _mm_and_si64(m2, m6), pand_r2r(mm6, mm2));
    SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
    SI(m6 = ph[2], movq_a2r(16, ph, mm6));
    SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
    SI(m3 = _mm_and_si64(m3, m6), pand_r2r(mm6, mm3));
    SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
    SI(m6 = ph[3], movq_a2r(24, ph, mm6));
    SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
    SI(m4 = _mm_and_si64(m4, m6), pand_r2r(mm6, mm4));
    SI(m2 = _mm_sra_pi32(m2, m5), psrad_r2r(mm5, mm2));
    SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
    SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
    SI(m3 = _mm_sra_pi32(m3, m5), psrad_r2r(mm5, mm3));
    SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));

    // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
    // and finally return the result
    SI(m1 = m0, movq_r2r(mm0, mm1));
    SI(m1 = _mm_srli_si64(m1, 32), psrlq_i2r(32, mm1));
    SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
    SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
    return tmp;
}
#endif

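// The MMX registers alias the x87 floating-point register stack, so EMMS
// (_mm_empty()) must be issued after running MMX code and before any
// floating-point arithmetic is attempted.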
// clear cross correlation routine state if necessary
void TDStretchMMX::clearCrossCorrState()
{
    _mm_empty();
}

// MMX-optimized version of the function overlapStereo
void TDStretchMMX::overlapStereo(short *output, const short *input) const
{
    _mm_empty();
    uint shift = overlapDividerBits;
    uint counter = overlapLength>>2;      // counter = overlapLength / 4
    __m64 *inPtr = (__m64*) input;        // load address of inputBuffer
    __m64 *midPtr = (__m64*) pMidBuffer;  // load address of midBuffer
    __m64 *outPtr = ((__m64*) output)-2;  // load address of outputBuffer
    GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);  // temporaries

    // load mixing value adder to mm5
    uint tmp0 = 0x0002fffe;               // tmp0 = 0x0002 fffe
    SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));        // mm5 = 0x0000 0000 0002 fffe
    SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));  // mm5 = 0x0002 fffe 0002 fffe
    // load sliding mixing value counter to mm6
    SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
    SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6)); // mm6 = 0x0000 OVL_ 0000 OVL_
    // load sliding mixing value counter to mm7
    uint tmp1 = (overlapLength-1)|0x00010000;  // tmp1 = 0x0001 (overlapLength-1)
    SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));        // mm7 = 0x0000 0000 0001 (OVL-1)
    SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7)); // mm7 = 0x0001 (OVL-1) 0001 (OVL-1)

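    // Per sample frame n, the loop below computes the linear cross-fade
    //
    //     output[n] = (pMidBuffer[n] * (overlapLength - n) + input[n] * n)
    //                 >> overlapDividerBits;
    //
    // on both channels at once: mm6 and mm7 hold the packed (mid, input)
    // weight pairs for even and odd frames, and mm5 advances them by
    // (-2, +2) as the loop walks across the overlap.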
    do {
        // Process two parallel batches of 2+2 stereo samples during each round
        // to improve CPU-level parallelization.
        //
        // Load [midPtr] into m0 and m1
        // Load [inPtr] into m3
        // unpack words of m0, m1 and m3 into m0 and m1
        // multiply-add m0*m6 and m1*m7, store results into m0 and m1
        // divide m0 and m1 by 2^overlapDividerBits (right-shift)
        // pack the result into m0 and store into [outPtr]
        //
        // Load [midPtr+8] into m2 and m3
        // Load [inPtr+8] into m4
        // unpack words of m2, m3 and m4 into m2 and m3
        // multiply-add m2*m6 and m3*m7, store results into m2 and m3
        // divide m2 and m3 by 2^overlapDividerBits (right-shift)
        // pack the result into m2 and store into [outPtr+8]
        SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));   // mm0 = m1l m1r m0l m0r
        outPtr += 2;
        SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));     // mm3 = i1l i1r i0l i0r
        SI(m1 = m0, movq_r2r(mm0, mm1));                // mm1 = m1l m1r m0l m0r
        SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));   // mm2 = m3l m3r m2l m2r
        SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));  // mm0 = i0l m0l i0r m0r
        midPtr += 2;
        SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));     // mm4 = i3l i3r i2l i2r
        SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));  // mm1 = i1l m1l i1r m1r
        inPtr+=2;
        SI(m3 = m2, movq_r2r(mm2, mm3));                // mm3 = m3l m3r m2l m2r
        SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));  // mm2 = i2l m2l i2r m2r
        // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
        SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
        SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));  // mm3 = i3l m3l i3r m3r
        SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));      // mm4 = shift
        // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
        SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
        SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
        SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
        SI(m0 = _mm_sra_pi32(m0, m4), psrad_r2r(mm4, mm0));  // mm0 >>= shift
        // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
        SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
        SI(m1 = _mm_sra_pi32(m1, m4), psrad_r2r(mm4, mm1));  // mm1 >>= shift
        // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
        SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
        SI(m2 = _mm_sra_pi32(m2, m4), psrad_r2r(mm4, mm2));  // mm2 >>= shift
        SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));  // mm0 = mm1h mm1l mm0h mm0l
        SI(m3 = _mm_sra_pi32(m3, m4), psrad_r2r(mm4, mm3));  // mm3 >>= shift
        SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
        SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));  // mm2 = mm3h mm3l mm2h mm2l
        SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
        SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
        SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
    } while ((--counter)!=0);
    _mm_empty();
}

#if 0
// MMX-optimized version of the function overlapMulti
void TDStretchMMX::overlapMulti(short *output, const short *input) const
{
    _mm_empty();
    uint shift = overlapDividerBits;
    uint counter = overlapLength>>2;      // counter = overlapLength / 4
    __m64 *inPtr = (__m64*) input;        // load address of inputBuffer
    __m64 *midPtr = (__m64*) pMidBuffer;  // load address of midBuffer
    __m64 *outPtr = ((__m64*) output)-2;  // load address of outputBuffer
    GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);  // temporaries

    // load mixing value adder to mm5
    uint tmp0 = 0x0002fffe;               // tmp0 = 0x0002 fffe
    SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));        // mm5 = 0x0000 0000 0002 fffe
    SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));  // mm5 = 0x0002 fffe 0002 fffe
    // load sliding mixing value counter to mm6
    SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
    SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6)); // mm6 = 0x0000 OVL_ 0000 OVL_
    // load sliding mixing value counter to mm7
    uint tmp1 = (overlapLength-1)|0x00010000;  // tmp1 = 0x0001 (overlapLength-1)
    SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));        // mm7 = 0x0000 0000 0001 (OVL-1)
    SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7)); // mm7 = 0x0001 (OVL-1) 0001 (OVL-1)

    do {
        // Process two parallel batches of 2+2 stereo samples during each round
        // to improve CPU-level parallelization.
        //
        // Load [midPtr] into m0 and m1
        // Load [inPtr] into m3
        // unpack words of m0, m1 and m3 into m0 and m1
        // multiply-add m0*m6 and m1*m7, store results into m0 and m1
        // divide m0 and m1 by 2^overlapDividerBits (right-shift)
        // pack the result into m0 and store into [outPtr]
        //
        // Load [midPtr+8] into m2 and m3
        // Load [inPtr+8] into m4
        // unpack words of m2, m3 and m4 into m2 and m3
        // multiply-add m2*m6 and m3*m7, store results into m2 and m3
        // divide m2 and m3 by 2^overlapDividerBits (right-shift)
        // pack the result into m2 and store into [outPtr+8]
        SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));   // mm0 = m1l m1r m0l m0r
        outPtr += 2;
        SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));     // mm3 = i1l i1r i0l i0r
        SI(m1 = m0, movq_r2r(mm0, mm1));                // mm1 = m1l m1r m0l m0r
        SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));   // mm2 = m3l m3r m2l m2r
        SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));  // mm0 = i0l m0l i0r m0r
        midPtr += 2;
        SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));     // mm4 = i3l i3r i2l i2r
        SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));  // mm1 = i1l m1l i1r m1r
        inPtr+=2;
        SI(m3 = m2, movq_r2r(mm2, mm3));                // mm3 = m3l m3r m2l m2r
        SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));  // mm2 = i2l m2l i2r m2r
        // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
        SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
        SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));  // mm3 = i3l m3l i3r m3r
        SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));      // mm4 = shift
        // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
        SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
        SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
        SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
        SI(m0 = _mm_sra_pi32(m0, m4), psrad_r2r(mm4, mm0));  // mm0 >>= shift
        // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
        SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
        SI(m1 = _mm_sra_pi32(m1, m4), psrad_r2r(mm4, mm1));  // mm1 >>= shift
        // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
        SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
        SI(m2 = _mm_sra_pi32(m2, m4), psrad_r2r(mm4, mm2));  // mm2 >>= shift
        SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));  // mm0 = mm1h mm1l mm0h mm0l
        SI(m3 = _mm_sra_pi32(m3, m4), psrad_r2r(mm4, mm3));  // mm3 >>= shift
        SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
        SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));  // mm2 = mm3h mm3l mm2h mm2l
        SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
        SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
        SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
    } while ((--counter)!=0);
    _mm_empty();
}
#endif

//////////////////////////////////////////////////////////////////////////////
//
// implementation of MMX optimized functions of class 'FIRFilter'
//
//////////////////////////////////////////////////////////////////////////////

#include "FIRFilter.h"

FIRFilterMMX::FIRFilterMMX()
{
    filterCoeffsUnalign = NULL;
}


FIRFilterMMX::~FIRFilterMMX()
{
    delete[] filterCoeffsUnalign;
}

// (overloaded) Calculates filter coefficients for MMX routine
void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
{
    uint i;
    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);

    // Ensure that filter coeffs array is aligned to 16-byte boundary
    delete[] filterCoeffsUnalign;
    filterCoeffsUnalign = new short[2 * newLength + 8];
    filterCoeffsAlign = (short *)(((ulong)filterCoeffsUnalign + 15) & -16);
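    // (the 8 extra shorts = 16 extra bytes guarantee that rounding the
    // pointer up with "+ 15 & -16" stays within the allocation)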

    // rearrange the filter coefficients for mmx routines
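    // Each group of four taps becomes two quadwords, (c[i], c[i+2], c[i],
    // c[i+2]) then (c[i+1], c[i+3], c[i+1], c[i+3]), so that pmaddwd can
    // pair them directly with the de-interleaved stereo samples in
    // evaluateFilterStereo() below.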
    for (i = 0; i < length; i += 4)
    {
        filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
        filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
        filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
        filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];

        filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
        filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
        filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
        filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
    }
}


// mmx-optimized version of the filter routine for stereo sound
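//
// Plain-C sketch of the filter this routine implements, for reference (not
// bit-exact: the MMX version accumulates the taps in a different order).
// Here j runs over interleaved left/right shorts and coeffs[] stands for the
// original, unrearranged coefficients passed to setCoefficients():
//
//     for (uint j = 0; j < 2 * ((numSamples - length) & ~1u); j++)
//     {
//         long sum = 0;
//         for (uint i = 0; i < length; i++)
//             sum += src[j + 2 * i] * coeffs[i];  // taps stay on one channel
//         dest[j] = (short)(sum >> resultDivFactor);
//     }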
uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
{
    _mm_empty();
    __m64 *inPtr = (__m64*)src;
    __m64 *outPtr = ((__m64*)dest) - 1;
    uint counter = (numSamples - length) >> 1;
    GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);

    do {
        __m64 *filterInPtr = inPtr;                    // Load pointer to samples
        __m64 *filterPtr = (__m64*)filterCoeffsAlign;  // Load pointer to filter coefficients
        uint filterCounter = lengthDiv8;               // Load filter length/8 to filterCounter

        SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));         // zero sums
        SI(m1 = filterInPtr[0], movq_a2r(0, filterInPtr, mm1));  // mm1 = l1 r1 l0 r0
        SI(m7 = _mm_setzero_si64(), pxor_r2r(mm7, mm7));         // zero sums

        do {
            SI(m2 = filterInPtr[1], movq_a2r(8, filterInPtr, mm2));      // mm2 = l3 r3 l2 r2
            SI(m4 = m1, movq_r2r(mm1, mm4));                             // mm4 = l1 r1 l0 r0
            SI(m3 = filterInPtr[2], movq_a2r(16, filterInPtr, mm3));     // mm3 = l5 r5 l4 r4
            SI(m1 = _mm_unpackhi_pi16(m1, m2), punpckhwd_r2r(mm2, mm1)); // mm1 = l3 l1 r3 r1
            SI(m6 = m2, movq_r2r(mm2, mm6));                             // mm6 = l3 r3 l2 r2
            SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4)); // mm4 = l2 l0 r2 r0
            SI(m2 = filterPtr[0], movq_a2r(0, filterPtr, mm2));          // mm2 = f2 f0 f2 f0
            SI(m5 = m1, movq_r2r(mm1, mm5));                             // mm5 = l3 l1 r3 r1
            SI(m6 = _mm_unpacklo_pi16(m6, m3), punpcklwd_r2r(mm3, mm6)); // mm6 = l4 l2 r4 r2
            SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));       // mm4 = l2*f2+l0*f0 r2*f2+r0*f0
            SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));       // mm5 = l3*f2+l1*f0 r3*f2+r1*f0
            SI(m2 = filterPtr[1], movq_a2r(8, filterPtr, mm2));          // mm2 = f3 f1 f3 f1
            SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));          // mm0 += s02*f02
            SI(m4 = m3, movq_r2r(mm3, mm4));                             // mm4 = l5 r5 l4 r4
            SI(m1 = _mm_madd_pi16(m1, m2), pmaddwd_r2r(mm2, mm1));       // mm1 = l3*f3+l1*f1 r3*f3+r1*f1
            SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));          // mm7 += s13*f02
            SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));       // mm6 = l4*f3+l2*f1 r4*f3+r2*f1
            SI(m2 = filterInPtr[3], movq_a2r(24, filterInPtr, mm2));     // mm2 = l7 r7 l6 r6
            SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));          // mm0 += s31*f31
            SI(m1 = filterInPtr[4], movq_a2r(32, filterInPtr, mm1));     // mm1 = l9 r9 l8 r8 (first quad of next round)
            SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));          // mm7 += s42*f31
            SI(m3 = _mm_unpackhi_pi16(m3, m2), punpckhwd_r2r(mm2, mm3)); // mm3 = l7 l5 r7 r5
            SI(m6 = m2, movq_r2r(mm2, mm6));                             // mm6 = l7 r7 l6 r6
            SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4)); // mm4 = l6 l4 r6 r4
            SI(m2 = filterPtr[2], movq_a2r(16, filterPtr, mm2));         // mm2 = f6 f4 f6 f4
            SI(m5 = m3, movq_r2r(mm3, mm5));                             // mm5 = l7 l5 r7 r5
            SI(m6 = _mm_unpacklo_pi16(m6, m1), punpcklwd_r2r(mm1, mm6)); // mm6 = l8 l6 r8 r6
            filterPtr += 4;
            SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));       // mm4 = l6*f6+l4*f4 r6*f6+r4*f4
            filterInPtr += 4;
            SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));       // mm5 = l7*f6+l5*f4 r7*f6+r5*f4
            SI(m2 = filterPtr[-1], movq_a2r(-8, filterPtr, mm2));        // mm2 = f7 f5 f7 f5
            SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));          // mm0 += s46*f46
            SI(m3 = _mm_madd_pi16(m3, m2), pmaddwd_r2r(mm2, mm3));       // mm3 = l7*f7+l5*f5 r7*f7+r5*f5
            SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));          // mm7 += s57*f46
            SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));       // mm6 = l8*f7+l6*f5 r8*f7+r6*f5
            SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));          // mm0 += s75*f75
            SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));          // mm7 += s86*f75
        } while ((--filterCounter)!=0);

        SI(m4 = _mm_cvtsi32_si64(resultDivFactor), movd_v2r(resultDivFactor, mm4)); // mm4 = shift
        // Divide mm0 by 2^resultDivFactor (right-shift)
        SI(m0 = _mm_sra_pi32(m0, m4), psrad_r2r(mm4, mm0));
        outPtr++;
        // Divide mm7 by 2^resultDivFactor (right-shift)
        SI(m7 = _mm_sra_pi32(m7, m4), psrad_r2r(mm4, mm7));
        inPtr++;
        // pack and store to [outPtr]
        SI(m0 = _mm_packs_pi32(m0, m7), packssdw_r2r(mm7, mm0));
        SI(*outPtr = m0, movq_r2a(mm0, 0, outPtr));
    } while ((--counter)!=0);

    _mm_empty();
    return (numSamples & 0xfffffffe) - length;
}

#endif // ALLOW_MMX