47 using namespace soundtouch;
57 #ifdef USE_GCC_INTRINSICS
58 # include <mmintrin.h>
63 # define _mm_empty() __asm__ __volatile__ ("emms")
// Fragment of an MMX multiply-accumulate kernel over the 16-bit lanes of
// pV1 and pV2. The enclosing function header, the `do {` opener and some
// intermediate lines are not visible in this excerpt.
// NOTE(review): SI(a, b) appears to select the intrinsic statement `a` when
// USE_GCC_INTRINSICS is defined and the inline-asm macro `b` otherwise, and
// GI(x) appears to emit `x` only in the intrinsics build -- the macro
// definitions are not visible here, confirm.
88 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
89 GI(__m64 m0, m1, m2, m3, m4, m5);
// Preload the first two quadwords of pv1, clear the accumulator (m0) and
// put the right-shift count in m5.
93 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
94 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
95 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
96 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
// Loop body: four quadwords (16 samples) per pass. Each pmaddwd yields two
// 32-bit pair sums; pairs of quadwords are added, shifted right by m5 and
// accumulated into m0. Loads for the next pass are interleaved.
102 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
103 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
104 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
105 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
106 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
107 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
108 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
// Preload m1 for the next pass.
109 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
110 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
112 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
113 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
// NOTE(review): pv1[1] here presumably follows a pointer advance on a line
// that is not visible in this excerpt -- confirm.
114 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
115 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
117 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
118 }
while ((--counter)!=0);
// Final group of four quadwords: same arithmetic, without preloading.
121 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
122 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
123 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
124 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
125 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
126 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
127 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
128 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
129 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
130 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
131 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
132 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
// Horizontal add: fold the upper 32-bit partial sum of m0 onto the lower
// one and move the result to tmp. The shift has to be a 64-bit logical
// shift (psrlq) to match _mm_srli_si64; psrld shifts each 32-bit lane
// separately and a count of 32 clears both lanes, which discarded the upper
// partial sum in the asm build.
// NOTE(review): relies on psrlq_i2r being provided by the same macro header
// that provides psrld_i2r -- confirm.
136 SI(m1 = m0, movq_r2r(mm0, mm1));
137 SI(m1 = _mm_srli_si64(m1, 32), psrlq_i2r(32, mm1));
138 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
139 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
// AND-mask table, declared as 4 rows of 8 quadwords and 8-byte aligned.
// Only the first four entries of each row are visible in this excerpt.
// It is indexed as mm_mask[remainder&3][remainder>>2] and then read as
// ph[0]..ph[3], i.e. as a window of four consecutive entries of one row.
148 static const __m64 mm_mask[4][8]
__attribute__ ((aligned(8))) = {
// Row 0: all four visible entries keep every 16-bit lane.
151 0xffffffffffffffffULL,
152 0xffffffffffffffffULL,
153 0xffffffffffffffffULL,
154 0xffffffffffffffffULL,
// Row 1: fourth entry clears the topmost 16-bit lane.
161 0xffffffffffffffffULL,
162 0xffffffffffffffffULL,
163 0xffffffffffffffffULL,
164 0x0000ffffffffffffULL,
// Row 2: fourth entry clears the two topmost 16-bit lanes.
171 0xffffffffffffffffULL,
172 0xffffffffffffffffULL,
173 0xffffffffffffffffULL,
174 0x00000000ffffffffULL,
// Row 3: fourth entry clears the three topmost 16-bit lanes.
181 0xffffffffffffffffULL,
182 0xffffffffffffffffULL,
183 0xffffffffffffffffULL,
184 0x000000000000ffffULL,
// Fragment of an MMX multiply-accumulate kernel over the 16-bit lanes of
// pV1 and pV2 whose last group of 16 samples is trimmed with mm_mask. The
// enclosing function header, the `do {` opener and some intermediate lines
// are not visible in this excerpt.
// NOTE(review): SI(a, b) appears to select the intrinsic statement `a` when
// USE_GCC_INTRINSICS is defined and the inline-asm macro `b` otherwise, and
// GI(x) appears to emit `x` only in the intrinsics build -- the macro
// definitions are not visible here, confirm.
//
// counter   = number of 16-sample groups (rounded up) minus one.
// remainder = number of samples by which adjustedOverlapLength falls short
//             of a multiple of 16 (0..15).
// NOTE(review): counter is 0 when adjustedOverlapLength <= 16; unless a
// guard exists on a line not visible here, `--counter` in the do/while
// below would wrap around -- confirm.
193 uint counter = ((adjustedOverlapLength+15)>>4)-1;
194 uint remainder = (16-adjustedOverlapLength)&0xf;
// ph points at a window of four masks: remainder>>2 selects how far the
// window is shifted into the row, remainder&3 selects the row.
196 __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2];
197 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
198 GI(__m64 m0, m1, m2, m3, m4, m5, m6);
// Preload the first two quadwords of pv1, clear the accumulator (m0) and
// put the right-shift count in m5.
202 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
203 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
204 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
205 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
// Loop body: four quadwords (16 samples) per pass. Each pmaddwd yields two
// 32-bit pair sums; pairs of quadwords are added, shifted right by m5 and
// accumulated into m0. Loads for the next pass are interleaved.
211 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
212 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
213 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
214 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
215 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
216 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
217 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
// Preload m1 for the next pass.
218 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
219 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
221 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
222 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
// NOTE(review): pv1[1] here presumably follows a pointer advance on a line
// that is not visible in this excerpt -- confirm.
223 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
224 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
226 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
227 }
while ((--counter)!=0);
// Final group of four quadwords, trimmed by the masks ph[0]..ph[3].
// The masks have 16-bit (per-sample) granularity, so each one is ANDed onto
// the pv1 samples BEFORE the multiply. ANDing the 32-bit pmaddwd sums with
// a mask such as 0x0000ffffffffffff would clear half of a sum instead of
// removing one sample. For masks that are whole-dword clean the result is
// the same as masking after the multiply.
229 SI(m6 = ph[0], movq_a2r(0, ph, mm6));
231 SI(m1 = _mm_and_si64(m1, m6), pand_r2r(mm6, mm1));
232 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
233 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
234 SI(m6 = ph[1], movq_a2r(8, ph, mm6));
235 SI(m2 = _mm_and_si64(m2, m6), pand_r2r(mm6, mm2));
236 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
237 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
238 SI(m6 = ph[2], movq_a2r(16, ph, mm6));
239 SI(m3 = _mm_and_si64(m3, m6), pand_r2r(mm6, mm3));
240 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
241 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
242 SI(m6 = ph[3], movq_a2r(24, ph, mm6));
243 SI(m4 = _mm_and_si64(m4, m6), pand_r2r(mm6, mm4));
244 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
245 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
246 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
247 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
248 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
249 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
// Horizontal add: fold the upper 32-bit partial sum of m0 onto the lower
// one and move the result to tmp. The shift has to be a 64-bit logical
// shift (psrlq) to match _mm_srli_si64; psrld shifts each 32-bit lane
// separately and a count of 32 clears both lanes, which discarded the upper
// partial sum in the asm build.
// NOTE(review): relies on psrlq_i2r being provided by the same macro header
// that provides psrld_i2r -- confirm.
253 SI(m1 = m0, movq_r2r(mm0, mm1));
254 SI(m1 = _mm_srli_si64(m1, 32), psrlq_i2r(32, mm1));
255 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
256 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
// Fragment of an MMX kernel that combines 16-bit samples read through
// midPtr and inPtr using per-lane weights and stores the packed result
// through outPtr. The function header, the `do {` opener, the declarations
// of midPtr/tmp1/counter and the initial load of m6 are not visible here.
// NOTE(review): SI(a, b) appears to select the intrinsic statement `a` when
// USE_GCC_INTRINSICS is defined and the inline-asm macro `b` otherwise, and
// GI(x) appears to emit `x` only in the intrinsics build -- confirm.
272 __m64 *inPtr = (__m64*) input;
// outPtr starts two quadwords before `output`.
// NOTE(review): presumably it is advanced by two inside the loop before the
// stores below -- the increment is not visible in this excerpt, confirm.
274 __m64 *outPtr = ((__m64*) output)-2;
275 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
// m5 = weight step, replicated into both 32-bit halves. As 16-bit lanes it
// is 0xfffe (-2) in the even lanes and 0x0002 (+2) in the odd lanes.
278 uint tmp0 = 0x0002fffe;
279 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
280 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
// m6 and m7 hold the weight pairs, each replicated into both 32-bit halves.
283 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
286 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
287 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
// Loop body: two quadwords from each source per pass.
306 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
308 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
309 SI(m1 = m0, movq_r2r(mm0, mm1));
310 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
// Interleave lanes so each 32-bit half holds one midPtr sample (even lane)
// next to the matching inPtr sample (odd lane).
311 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
313 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
314 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
316 SI(m3 = m2, movq_r2r(mm2, mm3));
317 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
// pmaddwd: mid*weight_even + in*weight_odd for each interleaved pair.
319 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
320 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
// m4 is free again; reuse it for the right-shift count.
321 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
323 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
// Step both weight registers: even lanes -2, odd lanes +2.
324 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
325 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
326 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
328 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
329 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
331 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
332 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
// Pack the shifted 32-bit results back to 16 bits with signed saturation.
333 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
334 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
// Second weight step of this pass.
335 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
336 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
337 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
// Store the two result quadwords.
338 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
339 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
340 }
while ((--counter)!=0);
// Fragment of an MMX kernel that combines 16-bit samples read through
// midPtr and inPtr using per-lane weights and stores the packed result
// through outPtr. The function header, the `do {` opener, the declarations
// of midPtr/tmp1/counter and the initial load of m6 are not visible here.
// NOTE(review): SI(a, b) appears to select the intrinsic statement `a` when
// USE_GCC_INTRINSICS is defined and the inline-asm macro `b` otherwise, and
// GI(x) appears to emit `x` only in the intrinsics build -- confirm.
351 __m64 *inPtr = (__m64*) input;
// outPtr starts two quadwords before `output`.
// NOTE(review): presumably it is advanced by two inside the loop before the
// stores below -- the increment is not visible in this excerpt, confirm.
353 __m64 *outPtr = ((__m64*) output)-2;
354 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
// m5 = weight step, replicated into both 32-bit halves. As 16-bit lanes it
// is 0xfffe (-2) in the even lanes and 0x0002 (+2) in the odd lanes.
357 uint tmp0 = 0x0002fffe;
358 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
359 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
// m6 and m7 hold the weight pairs, each replicated into both 32-bit halves.
362 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
365 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
366 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
// Loop body: two quadwords from each source per pass.
385 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
387 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
388 SI(m1 = m0, movq_r2r(mm0, mm1));
389 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
// Interleave lanes so each 32-bit half holds one midPtr sample (even lane)
// next to the matching inPtr sample (odd lane).
390 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
392 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
393 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
395 SI(m3 = m2, movq_r2r(mm2, mm3));
396 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
// pmaddwd: mid*weight_even + in*weight_odd for each interleaved pair.
398 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
399 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
// m4 is free again; reuse it for the right-shift count.
400 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
402 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
// Step both weight registers: even lanes -2, odd lanes +2.
403 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
404 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
405 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
407 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
408 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
410 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
411 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
// Pack the shifted 32-bit results back to 16 bits with signed saturation.
412 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
413 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
// Second weight step of this pass.
414 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
415 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
416 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
// Store the two result quadwords.
417 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
418 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
419 }
while ((--counter)!=0);
// Walks i from 0 up to (not including) length in steps of 4; the loop body
// is not visible in this excerpt.
454 for (i = 0;i <
length; i += 4)
// Fragment of an MMX filter kernel: for each output quadword it accumulates
// pmaddwd products of interleaved input samples (filterInPtr) with
// coefficients (filterPtr) into two accumulators, m0 and m7, then shifts,
// packs and stores them. The function header, both `do {` openers, the
// declarations of filterPtr/filterCounter/counter, any pointer increments
// and the load of the final shift count are not visible in this excerpt.
// NOTE(review): SI(a, b) appears to select the intrinsic statement `a` when
// USE_GCC_INTRINSICS is defined and the inline-asm macro `b` otherwise, and
// GI(x) appears to emit `x` only in the intrinsics build -- confirm.
474 __m64 *inPtr = (__m64*)src;
// outPtr starts one quadword before dest.
// NOTE(review): presumably it is advanced inside the outer loop before the
// store below -- the increment is not visible in this excerpt, confirm.
475 __m64 *outPtr = ((__m64*)dest) - 1;
477 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
480 __m64 *filterInPtr = inPtr;
// Clear both accumulators and preload the first input quadword.
484 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
485 SI(m1 = filterInPtr[0], movq_a2r(0, filterInPtr, mm1));
486 SI(m7 = _mm_setzero_si64(), pxor_r2r(mm7, mm7));
// Inner loop, first half: interleave input quadwords 0..2 and multiply with
// filterPtr[0] and filterPtr[1].
489 SI(m2 = filterInPtr[1], movq_a2r(8, filterInPtr, mm2));
490 SI(m4 = m1, movq_r2r(mm1, mm4));
491 SI(m3 = filterInPtr[2], movq_a2r(16, filterInPtr, mm3));
492 SI(m1 = _mm_unpackhi_pi16(m1, m2), punpckhwd_r2r(mm2, mm1));
493 SI(m6 = m2, movq_r2r(mm2, mm6));
494 SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
495 SI(m2 = filterPtr[0], movq_a2r(0, filterPtr, mm2));
496 SI(m5 = m1, movq_r2r(mm1, mm5));
497 SI(m6 = _mm_unpacklo_pi16(m6, m3), punpcklwd_r2r(mm3, mm6));
498 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
499 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
500 SI(m2 = filterPtr[1], movq_a2r(8, filterPtr, mm2));
501 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
502 SI(m4 = m3, movq_r2r(mm3, mm4));
503 SI(m1 = _mm_madd_pi16(m1, m2), pmaddwd_r2r(mm2, mm1));
504 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
505 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
// Second half: same pattern on input quadwords 2..4. m1 is reloaded here
// and is also the first operand of the next pass.
506 SI(m2 = filterInPtr[3], movq_a2r(24, filterInPtr, mm2));
507 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
508 SI(m1 = filterInPtr[4], movq_a2r(32, filterInPtr, mm1));
509 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
510 SI(m3 = _mm_unpackhi_pi16(m3, m2), punpckhwd_r2r(mm2, mm3));
511 SI(m6 = m2, movq_r2r(mm2, mm6));
// Both arms must unpack the LOW lanes here, mirroring the first half
// (m4 and m6 above); the intrinsic arm previously used unpackhi.
512 SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
// Third coefficient quadword: both arms must read it through filterPtr;
// the asm arm previously read input data through filterInPtr.
513 SI(m2 = filterPtr[2], movq_a2r(16, filterPtr, mm2));
514 SI(m5 = m3, movq_r2r(mm3, mm5));
515 SI(m6 = _mm_unpacklo_pi16(m6, m1), punpcklwd_r2r(mm1, mm6));
517 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
519 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
// NOTE(review): index -1 presumably follows an advance of filterPtr on a
// line that is not visible in this excerpt -- confirm.
520 SI(m2 = filterPtr[-1], movq_a2r(-8, filterPtr, mm2));
521 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
522 SI(m3 = _mm_madd_pi16(m3, m2), pmaddwd_r2r(mm2, mm3));
523 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
524 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
525 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
526 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
527 }
while ((--filterCounter)!=0);
// Scale both accumulators by an arithmetic right shift of m4, pack the four
// 32-bit sums to 16 bits with signed saturation and store one quadword.
// NOTE(review): m4 is presumably loaded with the shift count on a line that
// is not visible in this excerpt -- confirm.
531 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
534 SI(m7 = _mm_srai_pi32(m7, m4), psrad_r2r(mm4, mm7));
537 SI(m0 = _mm_packs_pi32(m0, m7), packssdw_r2r(mm7, mm0));
538 SI(*outPtr = m0, movq_r2a(mm0, 0, outPtr));
539 }
while ((--counter)!=0);
// Returns numSamples rounded down to an even count, minus length.
542 return (numSamples & 0xfffffffe) -
length;