Ticket #6275: st-sse.patch

File st-sse.patch, 4.3 KB (added by foobum@…, 11 years ago)
  • mythtv/libs/libmythsoundtouch/sse_gcc.cpp

    diff --git a/mythtv/libs/libmythsoundtouch/sse_gcc.cpp b/mythtv/libs/libmythsoundtouch/sse_gcc.cpp
    index c26d9a3..5ef4fd6 100644
    a b long TDStretchSSE2::calcCrossCorrMulti(const short *mPos, const short *cPos) const
    1717    int count = (overlapLength * channels) - channels;
    1818    long loops = count >> 5;
    1919    long remainder = count - (loops<<5);
     20    const short *mp = mPos;
     21    const short *cp = cPos;
    2022
    21     mPos += channels;
    22     cPos += channels;
     23    mp += channels;
     24    cp += channels;
    2325
    24     asm(
     26    asm volatile (
    2527        "xorps      %%xmm5, %%xmm5      \n\t"
    2628        "movd       %4, %%xmm7          \n\t"
    2729        "1:                             \n\t"
    long TDStretchSSE2::calcCrossCorrMulti(const short *mPos, const short *cPos) const
    5052        "sub        $1, %%ecx           \n\t"
    5153        "jnz        1b                  \n\t"
    5254        "movdqa     %%xmm5, %0          \n\t"
    53         :"=m"(out[0])
    54         :"r"(mPos), "r"(cPos), "c"(loops), "m"(overlapDividerBits)
     55        :"=m"(out[0]),"+r"(mPos), "+r"(cPos)
     56        :"c"(loops), "m"(overlapDividerBits)
    5557    );
    5658
    5759    corr = out[0] + out[1] + out[2] + out[3];
    5860
    59     mPos += loops<<5;
    60     cPos += loops<<5;
    61 
    6261    for (i = 0; i < remainder; i++)
    6362        corr += (mPos[i] * cPos[i]) >> overlapDividerBits;
    6463
    long TDStretchSSE2::calcCrossCorrStereo(const short *mPos, const short *cPos) const
    7271    int count = (overlapLength<<1) - 2;
    7372    long loops = count >> 5;
    7473    long remainder = count - (loops<<5);
     74    const short *mp = mPos;
     75    const short *cp = cPos;
    7576
    76     mPos += 2;
    77     cPos += 2;
     77    mp += 2;
     78    cp += 2;
    7879
    79     asm(
     80    asm volatile (
    8081        "xorps      %%xmm5, %%xmm5      \n\t"
    8182        "movd       %4, %%xmm7          \n\t"
    8283        "1:                             \n\t"
    long TDStretchSSE2::calcCrossCorrStereo(const short *mPos, const short *cPos) const
    105106        "sub        $1, %%ecx           \n\t"
    106107        "jnz        1b                  \n\t"
    107108        "movdqa     %%xmm5, %0          \n\t"
    108         :"=m"(out[0])
    109         :"r"(mPos), "r"(cPos), "c"(loops), "m"(overlapDividerBits)
     109        :"=m"(out[0]),"+r"(mp),"+r"(cp)
     110        :"c"(loops), "m"(overlapDividerBits)
    110111    );
    111112
    112113    corr = out[0] + out[1] + out[2] + out[3];
    113114
    114     mPos += loops<<5;
    115     cPos += loops<<5;
    116 
    117115    for (i = 0; i < remainder; i += 2)
    118116        corr += (mPos[i] * cPos[i] +
    119117                 mPos[i+1] * cPos[i+1]) >> overlapDividerBits;
    long TDStretchSSE2::calcCrossCorrStereo(const short *mPos, const short *cPos) const
    121119    return corr;
    122120}
    123121
    124 __attribute__((noinline))
    125122void TDStretchSSE2::overlapMulti(short *output, const short *input) const
    126123{
    127     asm(
     124
     125    short *o = output;
     126    const short *i = input;
     127    const short *m = pMidBuffer;
     128    long ch = (long)channels;
     129
     130    asm volatile (
    128131        "movd       %%ecx, %%xmm0       \n\t"
    129132        "shl        %6                  \n\t"
    130133        "punpckldq  %%xmm0, %%xmm0      \n\t"
    void TDStretchSSE2::overlapMulti(short *output, const short *input) const 
    160163        "add        %6, %5              \n\t"
    161164        "sub        $1, %%ecx           \n\t"
    162165        "jnz        1b                  \n\t"
    163         ::"c"(overlapLength),"m"(sadd),"m"(ones),"r"(input),"r"(pMidBuffer),
    164           "r"(output),"r"((long)channels)
     166        ::"c"(overlapLength),"m"(sadd),"m"(ones),"r"(i),"r"(m),"r"(o),"r"(ch)
     167        :"memory"
    165168    );
    166169}
    167170
    168 __attribute__((noinline))
    169171void TDStretchSSE2::overlapStereo(short *output, const short *input) const
    170172{
    171     asm(
     173    short *o = output;
     174    const short *i = input;
     175    const short *m = pMidBuffer;
     176
     177    asm volatile (
    172178        "movd       %%ecx, %%mm0        \n\t"
    173179        "pxor       %%mm7, %%mm7        \n\t"
    174180        "punpckldq  %%mm0, %%mm0        \n\t"
    void TDStretchSSE2::overlapStereo(short *output, const short *input) const 
    206212        "sub        $1, %%ecx           \n\t"
    207213        "jnz        1b                  \n\t"
    208214        "emms                           \n\t"
    209         ::"c"(overlapLength),"m"(sadd),"m"(ones),"r"(input),"r"(pMidBuffer),
    210           "r"(output)
     215        ::"c"(overlapLength),"m"(sadd),"m"(ones),"r"(i),"r"(m),"r"(o)
     216        :"memory"
    211217    );
    212218}
    213219#endif // ALLOW_SSE2