Ticket #1104: mythtv_multitimestretch.patch

File mythtv_multitimestretch.patch, 24.2 KB (added by Mark Spieth, 18 years ago)
  • libs/libmythsoundtouch/TDStretch.cpp

     
    9696
    9797    pMidBuffer = NULL;
    9898    pRefMidBufferUnaligned = NULL;
     99    midBufferLength = 0;
    99100    overlapLength = 0;
    100101
    101102    setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     
    108109
    109110TDStretch::~TDStretch()
    110111{
    111     delete[] pMidBuffer;
    112     delete[] pRefMidBufferUnaligned;
     112    if (midBufferLength)
     113    {
     114        delete[] pMidBuffer;
     115        delete[] pRefMidBufferUnaligned;
     116        midBufferLength = 0;
     117    }
    113118}
    114119
    115120
     
    196201
    197202void TDStretch::clearMidBuffer()
    198203{
    199     if (bMidBufferDirty)
     204    if (bMidBufferDirty && midBufferLength)
    200205    {
    201         memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
     206        memset(pMidBuffer, 0, channels * sizeof(SAMPLETYPE) * overlapLength);
    202207        bMidBufferDirty = FALSE;
    203208    }
    204209}
     
    239244// Seeks for the optimal overlap-mixing position.
    240245uint TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
    241246{
     247#ifdef MULTICHANNEL
     248    if (channels > 2)
     249    {
     250        // multichannel sound (more than two channels)
     251        if (bQuickseek)
     252        {
     253            return seekBestOverlapPositionMultiQuick(refPos);
     254        }
     255        else
     256        {
     257            return seekBestOverlapPositionMulti(refPos);
     258        }
     259    }
     260    else
     261#endif
    242262    if (channels == 2)
    243263    {
    244264        // stereo sound
     
    272292// of 'ovlPos'.
    273293inline void TDStretch::overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const
    274294{
     295#ifdef MULTICHANNEL
     296    if (channels > 2)
     297    {
     298        overlapMulti(output, input + channels * ovlPos);
     299    }
     300    else
     301#endif
    275302    if (channels == 2)
    276303    {
    277304        // stereo sound
     
    285312
    286313
    287314
     315#ifdef MULTICHANNEL
    288316// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
    289317// routine
    290318//
    291319// The best position is determined as the position where the two overlapped
    292320// sample sequences are 'most alike', in terms of the highest cross-correlation
    293321// value over the overlapping period
     322uint TDStretch::seekBestOverlapPositionMulti(const SAMPLETYPE *refPos)
     323{
     324    uint bestOffs;
     325    LONG_SAMPLETYPE bestCorr, corr;
     326    uint i;
     327
     328    // Slopes the amplitudes of the 'midBuffer' samples
     329    precalcCorrReference();
     330
     331    bestCorr = INT_MIN;
     332    bestOffs = 0;
     333
     334    // Scans for the best correlation value by testing each possible position
     335    // over the permitted range.
     336    for (i = 0; i < seekLength; i ++)
     337    {
     338        // Calculates correlation value for the mixing position corresponding
     339        // to 'i'
     340        corr = calcCrossCorrMulti(refPos + channels * i, pRefMidBuffer);
     341
     342        // Checks for the highest correlation value
     343        if (corr > bestCorr)
     344        {
     345            bestCorr = corr;
     346            bestOffs = i;
     347        }
     348    }
     349    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     350    clearCrossCorrState();
     351
     352    return bestOffs;
     353}
     354
     355
     356// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
     357// routine
     358//
     359// The best position is determined as the position where the two overlapped
     360// sample sequences are 'most alike', in terms of the highest cross-correlation
     361// value over the overlapping period
     362uint TDStretch::seekBestOverlapPositionMultiQuick(const SAMPLETYPE *refPos)
     363{
     364    uint j;
     365    uint bestOffs;
     366    LONG_SAMPLETYPE bestCorr, corr;
     367    uint scanCount, corrOffset, tempOffset;
     368
     369    // Slopes the amplitude of the 'midBuffer' samples
     370    precalcCorrReference();
     371
     372    bestCorr = INT_MIN;
     373    bestOffs = 0;
     374    corrOffset = 0;
     375    tempOffset = 0;
     376
     377    // Scans for the best correlation value using four-pass hierarchical search.
     378    //
     379    // The look-up table 'scans' has hierarchical position adjusting steps.
     380    // In first pass the routine searhes for the highest correlation with
     381    // relatively coarse steps, then rescans the neighbourhood of the highest
     382    // correlation with better resolution and so on.
     383    for (scanCount = 0;scanCount < 4; scanCount ++)
     384    {
     385        j = 0;
     386        while (scanOffsets[scanCount][j])
     387        {
     388            tempOffset = corrOffset + scanOffsets[scanCount][j];
     389            if (tempOffset >= seekLength) break;
     390
     391            // Calculates correlation value for the mixing position corresponding
     392            // to 'tempOffset'
     393            corr = calcCrossCorrMulti(refPos + channels * tempOffset, pRefMidBuffer);
     394
     395            // Checks for the highest correlation value
     396            if (corr > bestCorr)
     397            {
     398                bestCorr = corr;
     399                bestOffs = tempOffset;
     400            }
     401            j ++;
     402        }
     403        corrOffset = bestOffs;
     404    }
     405    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
     406    clearCrossCorrState();
     407
     408    return bestOffs;
     409}
     410#endif
     411
     412// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
     413// routine
     414//
     415// The best position is determined as the position where the two overlapped
     416// sample sequences are 'most alike', in terms of the highest cross-correlation
     417// value over the overlapping period
    294418uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
    295419{
    296420    uint bestOffs;
     
    512636void TDStretch::setChannels(uint numChannels)
    513637{
    514638    if (channels == numChannels) return;
     639#ifdef MULTICHANNEL
     640    assert(numChannels >= 1 && numChannels <= MULTICHANNEL);
     641#else
    515642    assert(numChannels == 1 || numChannels == 2);
     643#endif
    516644
    517645    channels = numChannels;
    518646    inputBuffer.setChannels(channels);
     
    635763/// Set new overlap length parameter & reallocate RefMidBuffer if necessary.
    636764void TDStretch::acceptNewOverlapLength(uint newOverlapLength)
    637765{
    638     uint prevOvl;
    639 
    640     prevOvl = overlapLength;
    641766    overlapLength = newOverlapLength;
    642767
    643     if (overlapLength > prevOvl)
     768    if (overlapLength*channels > midBufferLength)
    644769    {
    645         delete[] pMidBuffer;
    646         delete[] pRefMidBufferUnaligned;
     770        if (midBufferLength)
     771        {
     772            delete[] pMidBuffer;
     773            delete[] pRefMidBufferUnaligned;
     774            midBufferLength = 0;
     775        }
    647776
    648         pMidBuffer = new SAMPLETYPE[overlapLength * 2];
     777        midBufferLength = overlapLength * channels;
     778        pMidBuffer = new SAMPLETYPE[midBufferLength];
    649779        bMidBufferDirty = TRUE;
    650780        clearMidBuffer();
    651781
    652         pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
     782        pRefMidBufferUnaligned = new SAMPLETYPE[midBufferLength + 16 / sizeof(SAMPLETYPE)];
    653783        // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency
    654784        pRefMidBuffer = (SAMPLETYPE *)((((ulong)pRefMidBufferUnaligned) + 15) & -16);
    655785    }
     
    718848
    719849#ifdef INTEGER_SAMPLES
    720850
     851#ifdef MULTICHANNEL
    721852// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
    722853// is faster to calculate
     854void TDStretch::precalcCorrReference()
     855{
     856    int i,j;
     857    int temp, temp2;
     858    short *src = pMidBuffer;
     859    short *dest = pRefMidBuffer;
     860
     861    for (i=0 ; i < (int)overlapLength ;i ++)
     862    {
     863        temp = i * (overlapLength - i);
     864
     865        for(j=0;j<channels;j++)
     866        {
     867            temp2 = (*src++ * temp) / slopingDivider;
     868            *dest++ = (short)(temp2);
     869        }
     870    }
     871}
     872#endif
     873
     874// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
     875// is faster to calculate
    723876void TDStretch::precalcCorrReferenceStereo()
    724877{
    725878    int i, cnt2;
     
    772925    }
    773926}
    774927
     928#ifdef MULTICHANNEL
     929// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo'
     930// version of the routine.
     931void TDStretch::overlapMulti(short *output, const short *input) const
     932{
     933    int i,j;
     934    short temp;
     935    //uint cnt2;
     936    const short *ip = input;
     937    short *op = output;
     938    const short *md = pMidBuffer;
    775939
     940    for (i = 0; i < (int)overlapLength ; i ++)
     941    {
     942        temp = (short)(overlapLength - i);
     943        for(j=0;j<channels;j++)
     944            *op++ = (*ip++ * i + *md++ * temp )  / overlapLength;
     945    }
     946}
     947#endif
     948
     949
    776950/// Calculates overlap period length in samples.
    777951/// Integer version rounds overlap length to closest power of 2
    778952/// for a divide scaling operation.
     
    824998    return corr;
    825999}
    8261000
     1001#ifdef MULTICHANNEL
     1002long TDStretch::calcCrossCorrMulti(const short *mixingPos, const short *compare) const
     1003{
     1004    long corr;
     1005    uint i;
     1006
     1007    corr = 0;
     1008    for (i = channels; i < channels * overlapLength; i++)
     1009    {
     1010        corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
     1011    }
     1012
     1013    return corr;
     1014}
     1015#endif
     1016
    8271017#endif // INTEGER_SAMPLES
    8281018
    8291019//////////////////////////////////////////////////////////////////////////////
  • libs/libmythsoundtouch/TDStretch.h

     
    4848#include "RateTransposer.h"
    4949#include "FIFOSamplePipe.h"
    5050
     51#ifdef MULTICHANNEL
     52#define USE_MULTI_MMX
     53#endif
     54
    5155namespace soundtouch
    5256{
    5357
     
    100104    SAMPLETYPE *pMidBuffer;
    101105    SAMPLETYPE *pRefMidBuffer;
    102106    SAMPLETYPE *pRefMidBufferUnaligned;
     107    uint midBufferLength;
    103108    uint overlapLength;
    104109    uint overlapDividerBits;
    105110    uint slopingDivider;
     
    123128    virtual void clearCrossCorrState();
    124129    void calculateOverlapLength(uint overlapMs);
    125130
     131#ifdef MULTICHANNEL
     132    virtual LONG_SAMPLETYPE calcCrossCorrMulti(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
     133#endif
    126134    virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
    127135    virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
    128136
     137#ifdef MULTICHANNEL
     138    virtual uint seekBestOverlapPositionMulti(const SAMPLETYPE *refPos);
     139    virtual uint seekBestOverlapPositionMultiQuick(const SAMPLETYPE *refPos);
     140#endif
    129141    virtual uint seekBestOverlapPositionStereo(const SAMPLETYPE *refPos);
    130142    virtual uint seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos);
    131143    virtual uint seekBestOverlapPositionMono(const SAMPLETYPE *refPos);
    132144    virtual uint seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos);
    133145    uint seekBestOverlapPosition(const SAMPLETYPE *refPos);
    134146
     147#ifdef MULTICHANNEL
     148    virtual void overlapMulti(SAMPLETYPE *output, const SAMPLETYPE *input) const;
     149#endif
    135150    virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
    136151    virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const;
    137152
    138153    void clearMidBuffer();
    139154    void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
    140155
     156#ifdef MULTICHANNEL
     157    void precalcCorrReference();
     158#endif
    141159    void precalcCorrReferenceMono();
    142160    void precalcCorrReferenceStereo();
    143161
     
    225243    class TDStretchMMX : public TDStretch
    226244    {
    227245    protected:
     246#ifdef USE_MULTI_MMX
     247#ifdef MULTICHANNEL
     248        long calcCrossCorrMulti(const short *mixingPos, const short *compare) const;
     249#endif
     250#endif
    228251        long calcCrossCorrStereo(const short *mixingPos, const short *compare) const;
    229252        virtual void overlapStereo(short *output, const short *input) const;
    230253        virtual void clearCrossCorrState();
     
    237260    class TDStretch3DNow : public TDStretch
    238261    {
    239262    protected:
     263#ifdef MULTICHANNEL
     264        //double calcCrossCorrMulti(const float *mixingPos, const float *compare) const;
     265#endif
    240266        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
    241267    };
    242268#endif /// ALLOW_3DNOW
     
    247273    class TDStretchSSE : public TDStretch
    248274    {
    249275    protected:
     276#ifdef MULTICHANNEL
     277        //double calcCrossCorrMulti(const float *mixingPos, const float *compare) const;
     278#endif
    250279        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
    251280    };
    252281
  • libs/libmythsoundtouch/RateTransposer.cpp

     
    330330{
    331331    if (uChannels == numchannels) return;
    332332
     333#ifdef MULTICHANNEL
     334    assert(numchannels >= 1 && numchannels <= MULTICHANNEL);
     335#else
    333336    assert(numchannels == 1 || numchannels == 2);
     337#endif
    334338    uChannels = numchannels;
    335339
    336340    storeBuffer.setChannels(uChannels);
  • libs/libmythsoundtouch/mmx_gcc.cpp

     
    141141    return tmp;
    142142}
    143143
     144#ifdef USE_MULTI_MMX
     145// Calculates cross correlation of two buffers
        //
        // MMX version for interleaved multichannel data. The two buffers are
        // read as quadwords (four 16-bit samples each) and combined with
        // multiply-add; the pair sums are right-shifted by 'overlapDividerBits'
        // and accumulated in two 32-bit lanes, which are added together at the
        // end to form the return value.
        //
        // overlapLength * channels samples are covered. The loop handles 16
        // samples per round; one final round is always executed after the loop
        // and its results are ANDed with entries of 'mm_mask'.
        //
        // SI(...) and GI(...) are macros defined outside this function.
        //
        // NOTE(review): 'counter' is ceil(n / 16) - 1. If n <= 16 it is 0 and
        // the do/while decrements it past zero before testing - confirm that
        // callers guarantee more than 16 samples.
        // NOTE(review): the masks are applied to the results of the multiply-add
        // (32-bit pair sums), not to the 16-bit input samples. Verify that the
        // 0x0000ffffffffffff / 0x000000000000ffff patterns do what is intended.
        // NOTE(review): the final round loads four full quadwords from both
        // buffers before masking, so both buffers must be readable up to the
        // next multiple of 16 samples.
        // NOTE(review): 'tmp' is a uint and the return type is long; where long
        // is wider than 32 bits a negative 32-bit sum would come back as a
        // large positive value - confirm.
     146long TDStretchMMX::calcCrossCorrMulti(const short *pV1, const short *pV2) const
     147{
     148    //static const unsigned long long int mm_half __attribute__ ((aligned(8))) = 0xffffffffULL;
            // Masks for the final round. The row is selected by (remainder & 3)
            // and the starting column by (remainder >> 2), see 'ph' below; the
            // four consecutive entries ph[0..3] are used for the four quadwords.
     149    static const __m64 mm_mask[4][8] __attribute__ ((aligned(8))) = {
     150        {
     151            // even bit
     152            0xffffffffffffffffULL,
     153            0xffffffffffffffffULL,
     154            0xffffffffffffffffULL,
     155            0xffffffffffffffffULL,
     156            0,
     157            0,
     158            0,
     159            0
     160        },
     161        {
     162            0xffffffffffffffffULL,
     163            0xffffffffffffffffULL,
     164            0xffffffffffffffffULL,
     165            0x0000ffffffffffffULL,
     166            0,
     167            0,
     168            0,
     169            0
     170        },
     171        {
     172            0xffffffffffffffffULL,
     173            0xffffffffffffffffULL,
     174            0xffffffffffffffffULL,
     175            0x00000000ffffffffULL,
     176            0,
     177            0,
     178            0,
     179            0
     180        },
     181        {
     182            0xffffffffffffffffULL,
     183            0xffffffffffffffffULL,
     184            0xffffffffffffffffULL,
     185            0x000000000000ffffULL,
     186            0,
     187            0,
     188            0,
     189            0
     190        }
     191    };
     192    uint tmp;
     193    uint adjustedOverlapLength = overlapLength*channels;
     194    uint counter = ((adjustedOverlapLength+15)>>4)-1;    // loop rounds of 16 samples: ceil(n / 16) - 1
     195    uint remainder = (16-adjustedOverlapLength)&0xf;     // samples missing from the last 16-sample round (0..15)
     196
     197    __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2];
     198    __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
     199    GI(__m64 m0, m1, m2, m3, m4, m5, m6); // temporaries
     200    uint shift = overlapDividerBits;
     201
     202    // prepare to the first round by loading
     203    SI(m1 = pv1[0],             movq_a2r(0, pv1, mm1)); // load m1 = pv1[0]
     204    SI(m2 = pv1[1],             movq_a2r(8, pv1, mm2)); // load m2 = pv1[1]
     205    SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));    // clear m0
     206    SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));   // shift in 64bit reg
     207
     208    do {
     209        // Calculate cross-correlation between the two buffers.
     210        // Process four quadwords (16 samples) during one round; the first
     211        // two quadwords of the next round are loaded ahead of time.
     212        SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1)); // multiply-add m1 = m1 * pv2[0]
     213        SI(m3 = pv1[2],                   movq_a2r(16, pv1, mm3));   // load mm3 = pv1[2]
     214        SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2)); // multiply-add m2 = m2 * pv2[1]
     215        SI(m4 = pv1[3],                   movq_a2r(24, pv1, mm4));   // load mm4 = pv1[3]
     216        SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));// multiply-add m3 = m3 * pv2[2]
     217        SI(m2 = _mm_add_pi32(m2, m1),     paddd_r2r(mm1, mm2));      // add m2 += m1
     218        SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));// multiply-add m4 = m4 * pv2[3]
     219        SI(m1 = pv1[4],                   movq_a2r(32, pv1, mm1));   // mm1 = pv1[0] for next round
     220        SI(m2 = _mm_srai_pi32(m2, m5),    psrad_r2r(mm5, mm2));      // m2 >>= shift (mm5)
     221        pv1 += 4;                                                    // increment first pointer
     222        SI(m3 = _mm_add_pi32(m3, m4),     paddd_r2r(mm4, mm3));      // m3 += m4
     223        SI(m0 = _mm_add_pi32(m0, m2),     paddd_r2r(mm2, mm0));      // m0 += m2
     224        SI(m2 = pv1[1],                   movq_a2r(8, pv1, mm2));    // mm2 = pv1[1] for next round
     225        SI(m3 = _mm_srai_pi32(m3, m5),    psrad_r2r(mm5, mm3));    // m3 >>= shift (mm5)
     226        pv2 += 4;                                                    // increment second pointer
     227        SI(m0 = _mm_add_pi32(m0, m3),     paddd_r2r(mm3, mm0));      // add m0 += m3
     228    } while ((--counter)!=0);
     229
     230    SI(m6 = ph[0], movq_a2r(0, ph, mm6));
     231    // Finalize the last partial loop: same as one loop round, but each
            // multiply-add result is ANDed with its mask entry ph[0..3] first.
     232    SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
     233    SI(m1 = _mm_and_si64(m1, m6),      pand_r2r(mm6, mm1));
     234    SI(m3 = pv1[2],                    movq_a2r(16, pv1, mm3));
     235    SI(m6 = ph[1], movq_a2r(8, ph, mm6));
     236    SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
     237    SI(m2 = _mm_and_si64(m2, m6),      pand_r2r(mm6, mm2));
     238    SI(m4 = pv1[3],                    movq_a2r(24, pv1, mm4));
     239    SI(m6 = ph[2], movq_a2r(16, ph, mm6));
     240    SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
     241    SI(m3 = _mm_and_si64(m3, m6),      pand_r2r(mm6, mm3));
     242    SI(m2 = _mm_add_pi32(m2, m1),      paddd_r2r(mm1, mm2));
     243    SI(m6 = ph[3], movq_a2r(24, ph, mm6));
     244    SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
     245    SI(m4 = _mm_and_si64(m4, m6),      pand_r2r(mm6, mm4));
     246    SI(m2 = _mm_srai_pi32(m2, m5),     psrad_r2r(mm5, mm2));
     247    SI(m3 = _mm_add_pi32(m3, m4),      paddd_r2r(mm4, mm3));
     248    SI(m0 = _mm_add_pi32(m0, m2),      paddd_r2r(mm2, mm0));
     249    SI(m3 = _mm_srai_pi32(m3, m5),     psrad_r2r(mm5, mm3));
     250    SI(m0 = _mm_add_pi32(m0, m3),      paddd_r2r(mm3, mm0));
     251
     252    // copy hi-dword of mm0 to lo-dword of mm1, then sum mm0+mm1
     253    // and finally return the result
     254    SI(m1 = m0,                        movq_r2r(mm0, mm1));
     255    SI(m1 = _mm_srli_si64(m1, 32),     psrld_i2r(32, mm1));
     256    SI(m0 = _mm_add_pi32(m0, m1),      paddd_r2r(mm1, mm0));
     257    SI(tmp = _mm_cvtsi64_si32(m0),     movd_r2m(mm0, tmp));
     258    return tmp;
     259}
     260#endif
     261
    144262void TDStretchMMX::clearCrossCorrState()
    145263{
    146264    _mm_empty();
     
    224342    _mm_empty();
    225343}
    226344
     345#if 0
     346// MMX-optimized version of the function overlapMulti
        //
        // This definition sits inside an '#if 0' region, so it is not compiled.
        //
        // Cross-fades 'input' with 'pMidBuffer' into 'output': samples from
        // both buffers are interleaved word by word, multiplied-and-added with
        // a pair of sliding mixing weights held in m6/m7, right-shifted by
        // 'overlapDividerBits' and packed back to 16 bits. Each loop round
        // consumes two quadwords from each buffer and writes two quadwords.
        //
        // NOTE(review): the round count (overlapLength >> 2) and the l/r lane
        // comments are those of a two-channel layout; nothing here scales by
        // 'channels'. Presumably a copy of the stereo routine that still has to
        // be adapted - confirm before enabling.
        // NOTE(review): no matching overlapMulti declaration is visible in the
        // TDStretchMMX part of the header shown with this patch - verify.
     347void TDStretchMMX::overlapMulti(short *output, const short *input) const
     348{
     349    _mm_empty();
     350    uint shift = overlapDividerBits;
     351    uint counter = overlapLength>>2;                 // counter = overlapLength / 4
     352    __m64 *inPtr = (__m64*) input;                   // load address of inputBuffer
     353    __m64 *midPtr = (__m64*) pMidBuffer;             // load address of midBuffer
     354    __m64 *outPtr = ((__m64*) output)-2;             // load address of outputBuffer (pre-decremented; the loop adds 2 before storing)
     355    GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);        // temporaries
     356
     357    // load mixing value adder to mm5
     358    uint tmp0 = 0x0002fffe;                                      // tmp0 = 0x0002 fffe
     359    SI(m5 = _mm_cvtsi32_si64(tmp0),    movd_v2r(tmp0, mm5));     // mm5 = 0x0000 0000 0002 fffe
     360    SI(m5 = _mm_unpacklo_pi32(m5,m5),  punpckldq_r2r(mm5, mm5)); // mm5 = 0x0002 fffe 0002 fffe
     361    // load sliding mixing value counter to mm6
     362    SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
     363    SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6)); // mm6 = 0x0000 OVL_ 0000 OVL_
     364    // load sliding mixing value counter to mm7
     365    uint tmp1 = (overlapLength-1)|0x00010000;                    // tmp1 = 0x0001 overlapLength-1
     366    SI(m7 = _mm_cvtsi32_si64(tmp1),    movd_v2r(tmp1, mm7));     // mm7 = 0x0000 0000 0001 01ff
     367    SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7)); // mm7 = 0x0001 01ff 0001 01ff
     368
     369    do {
     370        // Process two parallel batches of 2+2 stereo samples during each round
     371        // to improve CPU-level parallelization.
     372        //
     373        // Load [midPtr] into m0 and m1
     374        // Load [inPtr] into m3
     375        // unpack words of m0, m1 and m3 into m0 and m1
     376        // multiply-add m0*m6 and m1*m7, store results into m0 and m1
     377        // divide m0 and m1 by 512 (=right-shift by overlapDividerBits)
     378        // pack the result into m0 and store into [edx]
     379        //
     380        // Load [midPtr+8] into m2 and m3
     381        // Load [inPtr+8] into m4
     382        // unpack words of m2, m3 and m4 into m2 and m3
     383        // multiply-add m2*m6 and m3*m7, store results into m2 and m3
     384        // divide m2 and m3 by 512 (=right-shift by overlapDividerBits)
     385        // pack the result into m2 and store into [edx+8]
     386        SI(m0 = midPtr[0],                movq_a2r(0, midPtr, mm0));// mm0 = m1l m1r m0l m0r
     387        outPtr += 2;
     388        SI(m3 = inPtr[0],                 movq_a2r(0, inPtr, mm3)); // mm3 = i1l i1r i0l i0r
     389        SI(m1 = m0,                       movq_r2r(mm0, mm1));      // mm1 = m1l m1r m0l m0r
     390        SI(m2 = midPtr[1],                movq_a2r(8, midPtr, mm2));// mm2 = m3l m3r m2l m2r
     391        SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0)); // mm0 = i0l m0l i0r m0r
     392        midPtr += 2;
     393        SI(m4 = inPtr[1],                 movq_a2r(8, inPtr, mm4)); // mm4 = i3l i3r i2l i2r
     394        SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1)); // mm1 = i1l m1l i1r m1r
     395        inPtr+=2;
     396        SI(m3 = m2,                       movq_r2r(mm2, mm3));      // mm3 = m3l m3r m2l m2r
     397        SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2)); // mm2 = i2l m2l i2r m2r
     398        // mm0 = i0l*m63+m0l*m62 i0r*m61+m0r*m60
     399        SI(m0 = _mm_madd_pi16(m0, m6),    pmaddwd_r2r(mm6, mm0));
     400        SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3)); // mm3 = i3l m3l i3r m3r
     401        SI(m4 = _mm_cvtsi32_si64(shift),  movd_v2r(shift, mm4));    // mm4 = shift
     402        // mm1 = i1l*m73+m1l*m72 i1r*m71+m1r*m70
     403        SI(m1 = _mm_madd_pi16(m1, m7),    pmaddwd_r2r(mm7, mm1));
     404        SI(m6 = _mm_add_pi16(m6, m5),     paddw_r2r(mm5, mm6));
     405        SI(m7 = _mm_add_pi16(m7, m5),     paddw_r2r(mm5, mm7));
     406        SI(m0 = _mm_srai_pi32(m0, m4),    psrad_r2r(mm4, mm0));    // mm0 >>= shift
     407        // mm2 = i2l*m63+m2l*m62 i2r*m61+m2r*m60
     408        SI(m2 = _mm_madd_pi16(m2, m6),    pmaddwd_r2r(mm6, mm2));
     409        SI(m1 = _mm_srai_pi32(m1, m4),    psrad_r2r(mm4, mm1));    // mm1 >>= shift
     410        // mm3 = i3l*m73+m3l*m72 i3r*m71+m3r*m70
     411        SI(m3 = _mm_madd_pi16(m3, m7),    pmaddwd_r2r(mm7, mm3));
     412        SI(m2 = _mm_srai_pi32(m2, m4),    psrad_r2r(mm4, mm2));    // mm2 >>= shift
     413        SI(m0 = _mm_packs_pi32(m0, m1),   packssdw_r2r(mm1, mm0)); // mm0 = mm1h mm1l mm0h mm0l
     414        SI(m3 = _mm_srai_pi32(m3, m4),    psrad_r2r(mm4, mm3));    // mm3 >>= shift
     415        SI(m6 = _mm_add_pi16(m6, m5),     paddw_r2r(mm5, mm6));
     416        SI(m2 = _mm_packs_pi32(m2, m3),   packssdw_r2r(mm3, mm2)); // mm2 = mm2h mm2l mm3h mm3l
     417        SI(m7 = _mm_add_pi16(m7, m5),     paddw_r2r(mm5, mm7));
     418        SI(outPtr[0] = m0,                movq_r2a(mm0, 0, outPtr));
     419        SI(outPtr[1] = m2,                movq_r2a(mm2, 8, outPtr));
     420    } while ((--counter)!=0);
     421    _mm_empty();
     422}
     423#endif
     424
    227425//////////////////////////////////////////////////////////////////////////////
    228426//
    229427// implementation of MMX optimized functions of class 'FIRFilter'
  • libs/libmythsoundtouch/STTypes.h

     
    6161    #define INTEGER_SAMPLES       //< 16bit integer samples
    6262    //#define FLOAT_SAMPLES       //< 32bit float samples
    6363
     64    #define MULTICHANNEL 6
    6465
    6566    /// Define this to allow CPU-specific assembler optimizations. Notice that
    6667    /// having this enabled on non-x86 platforms doesn't matter; the compiler can
  • libs/libmythsoundtouch/SoundTouch.cpp

     
    140140// Sets the number of channels, 1 = mono, 2 = stereo
    141141void SoundTouch::setChannels(uint numChannels)
    142142{
     143#ifdef MULTICHANNEL
     144    if (numChannels < 1 || numChannels > MULTICHANNEL)
     145#else
    143146    if (numChannels != 1 && numChannels != 2)
     147#endif
    144148    {
    145149        throw std::runtime_error("Illegal number of channels");
    146150    }