Added USE_SIMD compatibility for Visual Studio

This commit is contained in:
daniou 2019-08-26 14:03:46 +02:00
parent 1efe72041e
commit b31b2018b8
2 changed files with 86 additions and 28 deletions

View File

@ -13,6 +13,13 @@
#include "kiss_fft.h"
#include <limits.h>
/* Building with the Microsoft compiler and USE_SIMD requested: flag it as
   MSVC_SIMD and pull in the SSE intrinsics header.  The macros guarded by
   MSVC_SIMD spell vector arithmetic as _mm_*_ps calls. */
#if defined(_MSC_VER) && defined(USE_SIMD)
#  define MSVC_SIMD
#  include <xmmintrin.h>
#endif
#define MAXFACTORS 32
/* e.g. an fft of length 128 has 4 factors
as far as kissfft is concerned
@ -78,20 +85,66 @@ struct kiss_fft_state{
#else /* not FIXED_POINT*/
/* Floating-point build: no fixed-point rescaling is required, so C_FIXDIV
   expands to nothing.
   S_MUL is intentionally NOT defined at this point: the MSVC_SIMD branch and
   the plain scalar branch that follow each provide their own S_MUL, and a
   scalar definition here would be an incompatible redefinition once the
   intrinsic-based one is reached. */
#define C_FIXDIV(c,div) /* NOOP */
#ifdef MSVC_SIMD
/* Element helpers for the MSVC SIMD build.  Operands are passed straight to
   the SSE packed-float intrinsics, so every operation is a function call
   rather than a C operator. */
#define S_SUB(a, b)      _mm_sub_ps((a), (b))
#define S_ADD(a, b)      _mm_add_ps((a), (b))
/* (a + b) is formed first, then added to c. */
#define S_ADD3(a, b, c)  _mm_add_ps((c), _mm_add_ps((a), (b)))
/* Sign inversion expressed as 0 - a. */
#define S_INV_SIGN(a)    _mm_sub_ps(_mm_setzero_ps(), (a))
#define S_MUL(a, b)      _mm_mul_ps((a), (b))
/* Complex product m = a * b using SSE intrinsics on the .r / .i members:
       m.r = a.r*b.r - a.i*b.i
       m.i = a.r*b.i + a.i*b.r
   (m).r is written before (m).i is computed, so m must not be the same
   object as a or b. */
#define C_MUL(m,a,b) \
    do { \
        (m).r = _mm_sub_ps(_mm_mul_ps((a).r, (b).r), \
                           _mm_mul_ps((a).i, (b).i)); \
        (m).i = _mm_add_ps(_mm_mul_ps((a).r, (b).i), \
                           _mm_mul_ps((a).i, (b).r)); \
    } while (0)
/* Scale both components of complex c in place by s, using the SSE
   packed-float multiply.  s is evaluated twice. */
#define C_MULBYSCALAR( c, s ) \
    do { \
        (c).r = _mm_mul_ps((c).r, (s)); \
        (c).i = _mm_mul_ps((c).i, (s)); \
    } while (0)
#else
/* Element helpers for builds that do not use the MSVC intrinsic path: each
   one maps directly onto the corresponding C arithmetic operator. */
#define S_SUB(a, b) ((a)-(b))
#define S_ADD(a, b) ((a)+(b))
#define S_ADD3(a, b, c) ((a)+(b)+(c))
/* The argument is parenthesised so that S_INV_SIGN(x - y) negates the whole
   expression instead of expanding to (-x - y). */
#define S_INV_SIGN(a) (-(a))
#define S_MUL(a,b) ( (a)*(b) )
/* Complex product m = a * b with plain C arithmetic:
       m.r = a.r*b.r - a.i*b.i
       m.i = a.r*b.i + a.i*b.r
   (m).r is written before (m).i is computed, so m must not be the same
   object as a or b. */
#define C_MUL(m,a,b) \
    do { \
        (m).r = (a).r*(b).r - (a).i*(b).i; \
        (m).i = (a).r*(b).i + (a).i*(b).r; \
    } while (0)
# define C_FIXDIV(c,div) /* NOOP */
/* Scale both components of complex c in place by s (s is evaluated twice).
   There must be exactly one "#define" header line here: a second header
   ending in a backslash would splice the real definition into its own
   replacement list and make the macro ill-formed. */
#define C_MULBYSCALAR( c, s ) \
    do{ (c).r *= (s);\
        (c).i *= (s); }while(0)
#endif
#endif
/* Fallback: unless a CHECK_OVERFLOW_OP was already defined, the overflow
   check expands to nothing. */
#if !defined(CHECK_OVERFLOW_OP)
#define CHECK_OVERFLOW_OP(a,op,b) /* noop */
#endif
#ifdef MSVC_SIMD
/* res = a + b, component-wise, via the SSE packed-float add.  The
   CHECK_OVERFLOW_OP hooks are invoked on the operands first. */
#define C_ADD( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,+,(b).r)\
        CHECK_OVERFLOW_OP((a).i,+,(b).i)\
        (res).r = _mm_add_ps((a).r, (b).r); \
        (res).i = _mm_add_ps((a).i, (b).i); \
    } while (0)
/* res = a - b, component-wise, via the SSE packed-float subtract.  The
   CHECK_OVERFLOW_OP hooks are invoked on the operands first. */
#define C_SUB( res, a,b)\
    do { \
        CHECK_OVERFLOW_OP((a).r,-,(b).r)\
        CHECK_OVERFLOW_OP((a).i,-,(b).i)\
        (res).r = _mm_sub_ps((a).r, (b).r); \
        (res).i = _mm_sub_ps((a).i, (b).i); \
    } while (0)
/* res += a, component-wise, via the SSE packed-float add.  The
   CHECK_OVERFLOW_OP hooks are invoked on the operands first. */
#define C_ADDTO( res , a)\
    do { \
        CHECK_OVERFLOW_OP((res).r,+,(a).r)\
        CHECK_OVERFLOW_OP((res).i,+,(a).i)\
        (res).r = _mm_add_ps((res).r, (a).r); \
        (res).i = _mm_add_ps((res).i, (a).i); \
    } while (0)
/* res -= a, component-wise, via the SSE packed-float subtract.  The
   CHECK_OVERFLOW_OP hooks are invoked on the operands first.
   Only the macro's own parameters (res, a) are referenced; the subtraction
   reads (res) as the minuend and (a) as the subtrahend for both the real
   and the imaginary component. */
#define C_SUBFROM( res , a)\
    do { \
        CHECK_OVERFLOW_OP((res).r,-,(a).r)\
        CHECK_OVERFLOW_OP((res).i,-,(a).i)\
        (res).r = _mm_sub_ps((res).r, (a).r); \
        (res).i = _mm_sub_ps((res).i, (a).i); \
    } while (0)
#else
#define C_ADD( res, a,b)\
do { \
CHECK_OVERFLOW_OP((a).r,+,(b).r)\
@ -117,6 +170,7 @@ struct kiss_fft_state{
CHECK_OVERFLOW_OP((res).i,-,(a).i)\
(res).r -= (a).r; (res).i -= (a).i; \
}while(0)
#endif
#ifdef FIXED_POINT
@ -126,7 +180,11 @@ struct kiss_fft_state{
#elif defined(USE_SIMD)
/* SIMD build: cos/sin of the phase is computed once and replicated into a
   packed-float vector with _mm_set1_ps. */
#define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
#define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
/* Halving: the MSVC path multiplies through the intrinsic, the other path
   applies the * operator to the vector directly. */
#if defined(MSVC_SIMD)
#define HALF_OF(x) _mm_mul_ps((x), _mm_set1_ps(.5f))
#else
#define HALF_OF(x) ((x)*_mm_set1_ps(.5))
#endif
#else
# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)

View File

@ -69,15 +69,15 @@ static void kf_bfly4(
C_ADDTO( *Fout , scratch[3] );
if(st->inverse) {
Fout[m].r = scratch[5].r - scratch[4].i;
Fout[m].i = scratch[5].i + scratch[4].r;
Fout[m3].r = scratch[5].r + scratch[4].i;
Fout[m3].i = scratch[5].i - scratch[4].r;
Fout[m].r = S_SUB(scratch[5].r, scratch[4].i);
Fout[m].i = S_ADD(scratch[5].i, scratch[4].r);
Fout[m3].r = S_ADD(scratch[5].r, scratch[4].i);
Fout[m3].i = S_SUB(scratch[5].i, scratch[4].r);
}else{
Fout[m].r = scratch[5].r + scratch[4].i;
Fout[m].i = scratch[5].i - scratch[4].r;
Fout[m3].r = scratch[5].r - scratch[4].i;
Fout[m3].i = scratch[5].i + scratch[4].r;
Fout[m].r = S_ADD(scratch[5].r, scratch[4].i);
Fout[m].i = S_SUB(scratch[5].i, scratch[4].r);
Fout[m3].r = S_SUB(scratch[5].r, scratch[4].i);
Fout[m3].i = S_ADD(scratch[5].i, scratch[4].r);
}
++Fout;
}while(--k);
@ -110,18 +110,18 @@ static void kf_bfly3(
tw1 += fstride;
tw2 += fstride*2;
Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
Fout[m].r = S_SUB(Fout->r, HALF_OF(scratch[3].r));
Fout[m].i = S_SUB(Fout->i, HALF_OF(scratch[3].i));
C_MULBYSCALAR( scratch[0] , epi3.i );
C_ADDTO(*Fout,scratch[3]);
Fout[m2].r = Fout[m].r + scratch[0].i;
Fout[m2].i = Fout[m].i - scratch[0].r;
Fout[m2].r = S_ADD(Fout[m].r, scratch[0].i);
Fout[m2].i = S_SUB(Fout[m].i, scratch[0].r);
Fout[m].r -= scratch[0].i;
Fout[m].i += scratch[0].r;
Fout[m].r = S_SUB(Fout[m].r, scratch[0].i);
Fout[m].i = S_ADD(Fout[m].i, scratch[0].r);
++Fout;
}while(--k);
@ -164,22 +164,22 @@ static void kf_bfly5(
C_ADD( scratch[8],scratch[2],scratch[3]);
C_SUB( scratch[9],scratch[2],scratch[3]);
Fout0->r += scratch[7].r + scratch[8].r;
Fout0->i += scratch[7].i + scratch[8].i;
Fout0->r = S_ADD3(Fout0->r, scratch[7].r, scratch[8].r);
Fout0->i = S_ADD3(Fout0->i, scratch[7].i, scratch[8].i);
scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
scratch[5].r = S_ADD3(scratch[0].r, S_MUL(scratch[7].r,ya.r), S_MUL(scratch[8].r,yb.r));
scratch[5].i = S_ADD3(scratch[0].i, S_MUL(scratch[7].i,ya.r), S_MUL(scratch[8].i,yb.r));
scratch[6].r = S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
scratch[6].r = S_ADD(S_MUL(scratch[10].i,ya.i), S_MUL(scratch[9].i,yb.i));
scratch[6].i = S_SUB(S_INV_SIGN(S_MUL(scratch[10].r,ya.i)), S_MUL(scratch[9].r,yb.i));
C_SUB(*Fout1,scratch[5],scratch[6]);
C_ADD(*Fout4,scratch[5],scratch[6]);
scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
scratch[11].r = S_ADD3(scratch[0].r, S_MUL(scratch[7].r,yb.r), S_MUL(scratch[8].r,ya.r));
scratch[11].i = S_ADD3(scratch[0].i, S_MUL(scratch[7].i,yb.r), S_MUL(scratch[8].i,ya.r));
scratch[12].r = S_ADD(S_INV_SIGN(S_MUL(scratch[10].i,yb.i)), S_MUL(scratch[9].i,ya.i));
scratch[12].i = S_SUB(S_MUL(scratch[10].r,yb.i), S_MUL(scratch[9].r,ya.i));
C_ADD(*Fout2,scratch[11],scratch[12]);
C_SUB(*Fout3,scratch[11],scratch[12]);
@ -217,7 +217,7 @@ static void kf_bfly_generic(
int twidx=0;
Fout[ k ] = scratch[0];
for (q=1;q<p;++q ) {
twidx += fstride * k;
twidx += (int)(fstride * k);
if (twidx>=Norig) twidx-=Norig;
C_MUL(t,scratch[q] , twiddles[twidx] );
C_ADDTO( Fout[ k ] ,t);