Ticket #1104: mythtv_multitimestretch.patch
File mythtv_multitimestretch.patch, 24.2 KB (added by , 18 years ago) |
---|
-
libs/libmythsoundtouch/TDStretch.cpp
96 96 97 97 pMidBuffer = NULL; 98 98 pRefMidBufferUnaligned = NULL; 99 midBufferLength = 0; 99 100 overlapLength = 0; 100 101 101 102 setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS); … … 108 109 109 110 TDStretch::~TDStretch() 110 111 { 111 delete[] pMidBuffer; 112 delete[] pRefMidBufferUnaligned; 112 if (midBufferLength) 113 { 114 delete[] pMidBuffer; 115 delete[] pRefMidBufferUnaligned; 116 midBufferLength = 0; 117 } 113 118 } 114 119 115 120 … … 196 201 197 202 void TDStretch::clearMidBuffer() 198 203 { 199 if (bMidBufferDirty )204 if (bMidBufferDirty && midBufferLength) 200 205 { 201 memset(pMidBuffer, 0, 2* sizeof(SAMPLETYPE) * overlapLength);206 memset(pMidBuffer, 0, channels * sizeof(SAMPLETYPE) * overlapLength); 202 207 bMidBufferDirty = FALSE; 203 208 } 204 209 } … … 239 244 // Seeks for the optimal overlap-mixing position. 240 245 uint TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos) 241 246 { 247 #ifdef MULTICHANNEL 248 if (channels > 2) 249 { 250 // multi-channel sound 251 if (bQuickseek) 252 { 253 return seekBestOverlapPositionMultiQuick(refPos); 254 } 255 else 256 { 257 return seekBestOverlapPositionMulti(refPos); 258 } 259 } 260 else 261 #endif 242 262 if (channels == 2) 243 263 { 244 264 // stereo sound … … 272 292 // of 'ovlPos'. 273 293 inline void TDStretch::overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const 274 294 { 295 #ifdef MULTICHANNEL 296 if (channels > 2) 297 { 298 overlapMulti(output, input + channels * ovlPos); 299 } 300 else 301 #endif 275 302 if (channels == 2) 276 303 { 277 304 // stereo sound … … 285 312 286 313 287 314 315 #ifdef MULTICHANNEL 288 316 // Seeks for the optimal overlap-mixing position. 
The 'stereo' version of the 289 317 // routine 290 318 // 291 319 // The best position is determined as the position where the two overlapped 292 320 // sample sequences are 'most alike', in terms of the highest cross-correlation 293 321 // value over the overlapping period 322 uint TDStretch::seekBestOverlapPositionMulti(const SAMPLETYPE *refPos) 323 { 324 uint bestOffs; 325 LONG_SAMPLETYPE bestCorr, corr; 326 uint i; 327 328 // Slopes the amplitudes of the 'midBuffer' samples 329 precalcCorrReference(); 330 331 bestCorr = INT_MIN; 332 bestOffs = 0; 333 334 // Scans for the best correlation value by testing each possible position 335 // over the permitted range. 336 for (i = 0; i < seekLength; i ++) 337 { 338 // Calculates correlation value for the mixing position corresponding 339 // to 'i' 340 corr = calcCrossCorrMulti(refPos + channels * i, pRefMidBuffer); 341 342 // Checks for the highest correlation value 343 if (corr > bestCorr) 344 { 345 bestCorr = corr; 346 bestOffs = i; 347 } 348 } 349 // clear cross correlation routine state if necessary (is so e.g. in MMX routines). 350 clearCrossCorrState(); 351 352 return bestOffs; 353 } 354 355 356 // Seeks for the optimal overlap-mixing position. The multi-channel quick-seek version of the 357 // routine 358 // 359 // The best position is determined as the position where the two overlapped 360 // sample sequences are 'most alike', in terms of the highest cross-correlation 361 // value over the overlapping period 362 uint TDStretch::seekBestOverlapPositionMultiQuick(const SAMPLETYPE *refPos) 363 { 364 uint j; 365 uint bestOffs; 366 LONG_SAMPLETYPE bestCorr, corr; 367 uint scanCount, corrOffset, tempOffset; 368 369 // Slopes the amplitude of the 'midBuffer' samples 370 precalcCorrReference(); 371 372 bestCorr = INT_MIN; 373 bestOffs = 0; 374 corrOffset = 0; 375 tempOffset = 0; 376 377 // Scans for the best correlation value using four-pass hierarchical search. 
378 // 379 // The look-up table 'scanOffsets' has hierarchical position adjusting steps. 380 // In the first pass the routine searches for the highest correlation with 381 // relatively coarse steps, then rescans the neighbourhood of the highest 382 // correlation with better resolution and so on. 383 for (scanCount = 0;scanCount < 4; scanCount ++) 384 { 385 j = 0; 386 while (scanOffsets[scanCount][j]) 387 { 388 tempOffset = corrOffset + scanOffsets[scanCount][j]; 389 if (tempOffset >= seekLength) break; 390 391 // Calculates correlation value for the mixing position corresponding 392 // to 'tempOffset' 393 corr = calcCrossCorrMulti(refPos + channels * tempOffset, pRefMidBuffer); 394 395 // Checks for the highest correlation value 396 if (corr > bestCorr) 397 { 398 bestCorr = corr; 399 bestOffs = tempOffset; 400 } 401 j ++; 402 } 403 corrOffset = bestOffs; 404 } 405 // clear cross correlation routine state if necessary (is so e.g. in MMX routines). 406 clearCrossCorrState(); 407 408 return bestOffs; 409 } 410 #endif 411 412 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the 413 // routine 414 // 415 // The best position is determined as the position where the two overlapped 416 // sample sequences are 'most alike', in terms of the highest cross-correlation 417 // value over the overlapping period 294 418 uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 295 419 { 296 420 uint bestOffs; … … 512 636 void TDStretch::setChannels(uint numChannels) 513 637 { 514 638 if (channels == numChannels) return; 639 #ifdef MULTICHANNEL 640 assert(numChannels >= 1 && numChannels <= MULTICHANNEL); 641 #else 515 642 assert(numChannels == 1 || numChannels == 2); 643 #endif 516 644 517 645 channels = numChannels; 518 646 inputBuffer.setChannels(channels); … … 635 763 /// Set new overlap length parameter & reallocate RefMidBuffer if necessary. 
636 764 void TDStretch::acceptNewOverlapLength(uint newOverlapLength) 637 765 { 638 uint prevOvl;639 640 prevOvl = overlapLength;641 766 overlapLength = newOverlapLength; 642 767 643 if (overlapLength > prevOvl)768 if (overlapLength*channels > midBufferLength) 644 769 { 645 delete[] pMidBuffer; 646 delete[] pRefMidBufferUnaligned; 770 if (midBufferLength) 771 { 772 delete[] pMidBuffer; 773 delete[] pRefMidBufferUnaligned; 774 midBufferLength = 0; 775 } 647 776 648 pMidBuffer = new SAMPLETYPE[overlapLength * 2]; 777 midBufferLength = overlapLength * channels; 778 pMidBuffer = new SAMPLETYPE[midBufferLength]; 649 779 bMidBufferDirty = TRUE; 650 780 clearMidBuffer(); 651 781 652 pRefMidBufferUnaligned = new SAMPLETYPE[ 2 * overlapLength + 16 / sizeof(SAMPLETYPE)];782 pRefMidBufferUnaligned = new SAMPLETYPE[midBufferLength + 16 / sizeof(SAMPLETYPE)]; 653 783 // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency 654 784 pRefMidBuffer = (SAMPLETYPE *)((((ulong)pRefMidBufferUnaligned) + 15) & -16); 655 785 } … … 718 848 719 849 #ifdef INTEGER_SAMPLES 720 850 851 #ifdef MULTICHANNEL 721 852 // Slopes the amplitude of the 'midBuffer' samples so that cross correlation 722 853 // is faster to calculate 854 void TDStretch::precalcCorrReference() 855 { 856 int i,j; 857 int temp, temp2; 858 short *src = pMidBuffer; 859 short *dest = pRefMidBuffer; 860 861 for (i=0 ; i < (int)overlapLength ;i ++) 862 { 863 temp = i * (overlapLength - i); 864 865 for(j=0;j<channels;j++) 866 { 867 temp2 = (*src++ * temp) / slopingDivider; 868 *dest++ = (short)(temp2); 869 } 870 } 871 } 872 #endif 873 874 // Slopes the amplitude of the 'midBuffer' samples so that cross correlation 875 // is faster to calculate 723 876 void TDStretch::precalcCorrReferenceStereo() 724 877 { 725 878 int i, cnt2; … … 772 925 } 773 926 } 774 927 928 #ifdef MULTICHANNEL 929 // Overlaps samples in 'midBuffer' with the samples in 'input'. The multi-channel 930 // version of the routine. 
931 void TDStretch::overlapMulti(short *output, const short *input) const 932 { 933 int i,j; 934 short temp; 935 //uint cnt2; 936 const short *ip = input; 937 short *op = output; 938 const short *md = pMidBuffer; 775 939 940 for (i = 0; i < (int)overlapLength ; i ++) 941 { 942 temp = (short)(overlapLength - i); 943 for(j=0;j<channels;j++) 944 *op++ = (*ip++ * i + *md++ * temp ) / overlapLength; 945 } 946 } 947 #endif 948 949 776 950 /// Calculates overlap period length in samples. 777 951 /// Integer version rounds overlap length to closest power of 2 778 952 /// for a divide scaling operation. … … 824 998 return corr; 825 999 } 826 1000 1001 #ifdef MULTICHANNEL 1002 long TDStretch::calcCrossCorrMulti(const short *mixingPos, const short *compare) const 1003 { 1004 long corr; 1005 uint i; 1006 1007 corr = 0; 1008 for (i = channels; i < channels * overlapLength; i++) 1009 { 1010 corr += (mixingPos[i] * compare[i]) >> overlapDividerBits; 1011 } 1012 1013 return corr; 1014 } 1015 #endif 1016 827 1017 #endif // INTEGER_SAMPLES 828 1018 829 1019 ////////////////////////////////////////////////////////////////////////////// -
libs/libmythsoundtouch/TDStretch.h
48 48 #include "RateTransposer.h" 49 49 #include "FIFOSamplePipe.h" 50 50 51 #ifdef MULTICHANNEL 52 #define USE_MULTI_MMX 53 #endif 54 51 55 namespace soundtouch 52 56 { 53 57 … … 100 104 SAMPLETYPE *pMidBuffer; 101 105 SAMPLETYPE *pRefMidBuffer; 102 106 SAMPLETYPE *pRefMidBufferUnaligned; 107 uint midBufferLength; 103 108 uint overlapLength; 104 109 uint overlapDividerBits; 105 110 uint slopingDivider; … … 123 128 virtual void clearCrossCorrState(); 124 129 void calculateOverlapLength(uint overlapMs); 125 130 131 #ifdef MULTICHANNEL 132 virtual LONG_SAMPLETYPE calcCrossCorrMulti(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; 133 #endif 126 134 virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; 127 135 virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const; 128 136 137 #ifdef MULTICHANNEL 138 virtual uint seekBestOverlapPositionMulti(const SAMPLETYPE *refPos); 139 virtual uint seekBestOverlapPositionMultiQuick(const SAMPLETYPE *refPos); 140 #endif 129 141 virtual uint seekBestOverlapPositionStereo(const SAMPLETYPE *refPos); 130 142 virtual uint seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos); 131 143 virtual uint seekBestOverlapPositionMono(const SAMPLETYPE *refPos); 132 144 virtual uint seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos); 133 145 uint seekBestOverlapPosition(const SAMPLETYPE *refPos); 134 146 147 #ifdef MULTICHANNEL 148 virtual void overlapMulti(SAMPLETYPE *output, const SAMPLETYPE *input) const; 149 #endif 135 150 virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const; 136 151 virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const; 137 152 138 153 void clearMidBuffer(); 139 154 void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const; 140 155 156 #ifdef MULTICHANNEL 157 void precalcCorrReference(); 158 #endif 141 159 void 
precalcCorrReferenceMono(); 142 160 void precalcCorrReferenceStereo(); 143 161 … … 225 243 class TDStretchMMX : public TDStretch 226 244 { 227 245 protected: 246 #ifdef USE_MULTI_MMX 247 #ifdef MULTICHANNEL 248 long calcCrossCorrMulti(const short *mixingPos, const short *compare) const; 249 #endif 250 #endif 228 251 long calcCrossCorrStereo(const short *mixingPos, const short *compare) const; 229 252 virtual void overlapStereo(short *output, const short *input) const; 230 253 virtual void clearCrossCorrState(); … … 237 260 class TDStretch3DNow : public TDStretch 238 261 { 239 262 protected: 263 #ifdef MULTICHANNEL 264 //double calcCrossCorrMulti(const float *mixingPos, const float *compare) const; 265 #endif 240 266 double calcCrossCorrStereo(const float *mixingPos, const float *compare) const; 241 267 }; 242 268 #endif /// ALLOW_3DNOW … … 247 273 class TDStretchSSE : public TDStretch 248 274 { 249 275 protected: 276 #ifdef MULTICHANNEL 277 //double calcCrossCorrMulti(const float *mixingPos, const float *compare) const; 278 #endif 250 279 double calcCrossCorrStereo(const float *mixingPos, const float *compare) const; 251 280 }; 252 281 -
libs/libmythsoundtouch/RateTransposer.cpp
330 330 { 331 331 if (uChannels == numchannels) return; 332 332 333 #ifdef MULTICHANNEL 334 assert(numchannels >= 1 && numchannels <= MULTICHANNEL); 335 #else 333 336 assert(numchannels == 1 || numchannels == 2); 337 #endif 334 338 uChannels = numchannels; 335 339 336 340 storeBuffer.setChannels(uChannels); -
libs/libmythsoundtouch/mmx_gcc.cpp
141 141 return tmp; 142 142 } 143 143 144 #ifdef USE_MULTI_MMX 145 // Calculates cross correlation of two buffers 146 long TDStretchMMX::calcCrossCorrMulti(const short *pV1, const short *pV2) const 147 { 148 //static const unsigned long long int mm_half __attribute__ ((aligned(8))) = 0xffffffffULL; 149 static const __m64 mm_mask[4][8] __attribute__ ((aligned(8))) = { 150 { 151 // even bit 152 0xffffffffffffffffULL, 153 0xffffffffffffffffULL, 154 0xffffffffffffffffULL, 155 0xffffffffffffffffULL, 156 0, 157 0, 158 0, 159 0 160 }, 161 { 162 0xffffffffffffffffULL, 163 0xffffffffffffffffULL, 164 0xffffffffffffffffULL, 165 0x0000ffffffffffffULL, 166 0, 167 0, 168 0, 169 0 170 }, 171 { 172 0xffffffffffffffffULL, 173 0xffffffffffffffffULL, 174 0xffffffffffffffffULL, 175 0x00000000ffffffffULL, 176 0, 177 0, 178 0, 179 0 180 }, 181 { 182 0xffffffffffffffffULL, 183 0xffffffffffffffffULL, 184 0xffffffffffffffffULL, 185 0x000000000000ffffULL, 186 0, 187 0, 188 0, 189 0 190 } 191 }; 192 uint tmp; 193 uint adjustedOverlapLength = overlapLength*channels; 194 uint counter = ((adjustedOverlapLength+15)>>4)-1; // load counter to counter = overlapLength / 8 - 1 195 uint remainder = (16-adjustedOverlapLength)&0xf; // since there are 1/3 sample per 1/2 quadword 196 197 __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2]; 198 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2; 199 GI(__m64 m0, m1, m2, m3, m4, m5, m6); // temporaries 200 uint shift = overlapDividerBits; 201 202 // prepare to the first round by loading 203 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1)); // load m1 = pv1[0] 204 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2)); // load m2 = pv1[1] 205 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0)); // clear m0 206 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5)); // shift in 64bit reg 207 208 do { 209 // Calculate cross-correlation between the tempOffset and tmpbid_buffer. 
210 // Process 4 parallel batches of 2 * stereo samples each during one 211 // round to improve CPU-level parallellization. 212 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1)); // multiply-add m1 = m1 * pv2[0] 213 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3)); // load mm3 = pv1[2] 214 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2)); // multiply-add m2 = m2 * pv2[1] 215 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4)); // load mm4 = pv1[3] 216 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));// multiply-add m3 = m3 * pv2[2] 217 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2)); // add m2 += m1 218 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));// multiply-add m4 = m4 * pv2[3] 219 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1)); // mm1 = pv1[0] for next round 220 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2)); // m2 >>= shift (mm5) 221 pv1 += 4; // increment first pointer 222 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3)); // m3 += m4 223 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0)); // m0 += m2 224 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2)); // mm2 = pv1[1] for next round 225 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3)); // m3 >>= shift (mm5) 226 pv2 += 4; // increment second pointer 227 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0)); // add m0 += m3 228 } while ((--counter)!=0); 229 230 SI(m6 = ph[0], movq_a2r(0, ph, mm6)); 231 // Finalize the last partial loop: 232 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1)); 233 SI(m1 = _mm_and_si64(m1, m6), pand_r2r(mm6, mm1)); 234 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3)); 235 SI(m6 = ph[1], movq_a2r(8, ph, mm6)); 236 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2)); 237 SI(m2 = _mm_and_si64(m2, m6), pand_r2r(mm6, mm2)); 238 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4)); 239 SI(m6 = ph[2], movq_a2r(16, ph, mm6)); 240 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3)); 241 SI(m3 = _mm_and_si64(m3, m6), pand_r2r(mm6, mm3)); 242 SI(m2 = 
_mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2)); 243 SI(m6 = ph[3], movq_a2r(24, ph, mm6)); 244 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4)); 245 SI(m4 = _mm_and_si64(m4, m6), pand_r2r(mm6, mm4)); 246 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2)); 247 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3)); 248 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0)); 249 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3)); 250 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0)); 251 252 // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1 253 // and finally return the result 254 SI(m1 = m0, movq_r2r(mm0, mm1)); 255 SI(m1 = _mm_srli_si64(m1, 32), psrld_i2r(32, mm1)); 256 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0)); 257 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp)); 258 return tmp; 259 } 260 #endif 261 144 262 void TDStretchMMX::clearCrossCorrState() 145 263 { 146 264 _mm_empty(); … … 224 342 _mm_empty(); 225 343 } 226 344 345 #if 0 346 // MMX-optimized version of the function overlapMulti 347 void TDStretchMMX::overlapMulti(short *output, const short *input) const 348 { 349 _mm_empty(); 350 uint shift = overlapDividerBits; 351 uint counter = overlapLength>>2; // counter = overlapLength / 4 352 __m64 *inPtr = (__m64*) input; // load address of inputBuffer 353 __m64 *midPtr = (__m64*) pMidBuffer; // load address of midBuffer 354 __m64 *outPtr = ((__m64*) output)-2; // load address of outputBuffer 355 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7); // temporaries 356 357 // load mixing value adder to mm5 358 uint tmp0 = 0x0002fffe; // tmp0 = 0x0002 fffe 359 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5)); // mm5 = 0x0000 0000 0002 fffe 360 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5)); // mm5 = 0x0002 fffe 0002 fffe 361 // load sliding mixing value counter to mm6 362 SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6)); 363 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6)); // mm6 = 0x0000 OVL_ 0000 
OVL_ 364 // load sliding mixing value counter to mm7 365 uint tmp1 = (overlapLength-1)|0x00010000; // tmp1 = 0x0001 overlapLength-1 366 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7)); // mm7 = 0x0000 0000 0001 01ff 367 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7)); // mm7 = 0x0001 01ff 0001 01ff 368 369 do { 370 // Process two parallel batches of 2+2 stereo samples during each round 371 // to improve CPU-level parallellization. 372 // 373 // Load [midPtr] into m0 and m1 374 // Load [inPtr] into m3 375 // unpack words of m0, m1 and m3 into m0 and m1 376 // multiply-add m0*m6 and m1*m7, store results into m0 and m1 377 // divide m0 and m1 by 512 (=right-shift by overlapDividerBits) 378 // pack the result into m0 and store into [edx] 379 // 380 // Load [midPtr+8] into m2 and m3 381 // Load [inPtr+8] into m4 382 // unpack words of m2, m3 and m4 into m2 and m3 383 // multiply-add m2*m6 and m3*m7, store results into m2 and m3 384 // divide m2 and m3 by 512 (=right-shift by overlapDividerBits) 385 // pack the result into m2 and store into [edx+8] 386 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));// mm0 = m1l m1r m0l m0r 387 outPtr += 2; 388 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3)); // mm3 = i1l i1r i0l i0r 389 SI(m1 = m0, movq_r2r(mm0, mm1)); // mm1 = m1l m1r m0l m0r 390 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));// mm2 = m3l m3r m2l m2r 391 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0)); // mm0 = i0l m0l i0r m0r 392 midPtr += 2; 393 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4)); // mm4 = i3l i3r i2l i2r 394 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1)); // mm1 = i1l m1l i1r m1r 395 inPtr+=2; 396 SI(m3 = m2, movq_r2r(mm2, mm3)); // mm3 = m3l m3r m2l m2r 397 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2)); // mm2 = i2l m2l i2r m2r 398 // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60 399 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0)); 400 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3)); // mm3 = i3l m3l i3r 
m3r 401 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4)); // mm4 = shift 402 // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70 403 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1)); 404 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6)); 405 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7)); 406 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0)); // mm0 >>= shift 407 // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60 408 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2)); 409 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1)); // mm1 >>= shift 410 // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70 411 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3)); 412 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2)); // mm2 >>= shift 413 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0)); // mm0 = mm1h mm1l mm0h mm0l 414 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3)); // mm3 >>= shift 415 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6)); 416 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2)); // mm2 = mm2h mm2l mm3h mm3l 417 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7)); 418 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr)); 419 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr)); 420 } while ((--counter)!=0); 421 _mm_empty(); 422 } 423 #endif 424 227 425 ////////////////////////////////////////////////////////////////////////////// 228 426 // 229 427 // implementation of MMX optimized functions of class 'FIRFilter' -
libs/libmythsoundtouch/STTypes.h
61 61 #define INTEGER_SAMPLES //< 16bit integer samples 62 62 //#define FLOAT_SAMPLES //< 32bit float samples 63 63 64 #define MULTICHANNEL 6 64 65 65 66 /// Define this to allow CPU-specific assembler optimizations. Notice that 66 67 /// having this enabled on non-x86 platforms doesn't matter; the compiler can -
libs/libmythsoundtouch/SoundTouch.cpp
140 140 // Sets the number of channels, 1 = mono, 2 = stereo 141 141 void SoundTouch::setChannels(uint numChannels) 142 142 { 143 #ifdef MULTICHANNEL 144 if (numChannels < 1 || numChannels > MULTICHANNEL) 145 #else 143 146 if (numChannels != 1 && numChannels != 2) 147 #endif 144 148 { 145 149 throw std::runtime_error("Illegal number of channels"); 146 150 }