diff --git a/CHANGELOG b/CHANGELOG index 95c86b6..a2fc0c2 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,12 @@ +1.2.3 (June 25, 2005) The "you want to use WHAT as a sample" release. + Added ability to use 32 bit fixed point samples -- requires a 64 bit intermediate result, a la 'long long' + + Added ability to do 4 FFTs in parallel by using SSE SIMD instructions. This is accomplished by + using the __m128 (vector of 4 floats) as kiss_fft_scalar. Define USE_SIMD to use this. + + I know, I know ... this is drifting a bit from the "kiss" principle, but the speed advantages + make it worth it for some. Also recent gcc makes it SOO easy to use vectors of 4 floats as a POD type. + 1.2.2 (May 6, 2005) The Matthew release Replaced fixed point division with multiply&shift. Thanks to Jean-Marc Valin for discussions regarding. Considerable speedup. diff --git a/Makefile b/Makefile index 80be924..eb16f1a 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,10 @@ TARBALL=kiss_fft_v$(KFVER).tar.gz ZIPFILE=kiss_fft_v$(KFVER).zip testall: + # The simd and long types may or may not work on your machine export DATATYPE=simd && cd test && make test + export DATATYPE=long && cd test && make test + export DATATYPE=short && cd test && make test export DATATYPE=float && cd test && make test export DATATYPE=double && cd test && make test diff --git a/_kiss_fft_guts.h b/_kiss_fft_guts.h index a7df0ce..c0cb337 100644 --- a/_kiss_fft_guts.h +++ b/_kiss_fft_guts.h @@ -125,24 +125,26 @@ struct kiss_fft_state{ }while(0) - - -static -void kf_cexp(kiss_fft_cpx * x,double phase) /* returns e ** (j*phase) */ -{ #ifdef FIXED_POINT - x->r = (kiss_fft_scalar) (SAMP_MAX * cos (phase)); - x->i = (kiss_fft_scalar) (SAMP_MAX * sin (phase)); +# define KISS_FFT_COS(phase) (kiss_fft_scalar) (SAMP_MAX * cos (phase)) +# define KISS_FFT_SIN(phase) (kiss_fft_scalar) (SAMP_MAX * sin (phase)) +# define HALF_OF(x) ((x)>>1) #elif defined(USE_SIMD) - float r = cos (phase); - float i = sin (phase); - x->r = _mm_load1_ps(&r); - x->i = _mm_load1_ps(&i); +# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) ) +# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) +# define HALF_OF(x) ((x)*_mm_set1_ps(.5)) #else - x->r = (kiss_fft_scalar) cos (phase); - x->i = (kiss_fft_scalar) sin (phase); +# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase) +# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase) +# define HALF_OF(x) ((x)*.5) #endif -} + +#define kf_cexp(x,phase) \ + do{ \ + (x)->r = KISS_FFT_COS(phase);\ + (x)->i = KISS_FFT_SIN(phase);\ + }while(0) + /* a debugging function */ #define pcpx(c)\ diff --git a/kiss_fft.c b/kiss_fft.c index 98f00dd..0c26f66 100644 --- a/kiss_fft.c +++ b/kiss_fft.c @@ -129,13 +129,8 @@ static void kf_bfly3( tw1 += fstride; tw2 += fstride*2; -#ifdef USE_SIMD - Fout[m].r = Fout->r - scratch[3].r * _mm_set1_ps(.5); - Fout[m].i = Fout->i - scratch[3].i * _mm_set1_ps(.5); -#else - Fout[m].r = Fout->r - scratch[3].r/2; - Fout[m].i = Fout->i - scratch[3].i/2; -#endif + Fout[m].r = Fout->r - HALF_OF(scratch[3].r); + Fout[m].i = Fout->i - HALF_OF(scratch[3].i); C_MULBYSCALAR( scratch[0] , epi3.i ); @@ -331,11 +326,7 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/ if ( lenmem==NULL ) { -#ifdef USE_SIMD - st = ( kiss_fft_cfg)memalign( sizeof(kiss_fft_cpx), memneeded ); -#else - st = ( kiss_fft_cfg)malloc( memneeded ); -#endif + st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded ); }else{ if (*lenmem >= memneeded) st = (kiss_fft_cfg)mem; diff --git a/kiss_fft.h b/kiss_fft.h index 7ef7bb2..a0e232f 100644 --- a/kiss_fft.h +++ b/kiss_fft.h @@ -27,6 +27,9 @@ extern "C" { #ifdef USE_SIMD # include # define kiss_fft_scalar __m128 +#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes) +#else +#define KISS_FFT_MALLOC malloc #endif diff --git a/test/Makefile b/test/Makefile index 950274e..9fe7c3b 100644 --- a/test/Makefile +++ b/test/Makefile @@ -67,9 +67,6 @@ $(TESTKFC): $(SRCFILES) $(TESTREAL): test_real.c $(SRCFILES) $(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+ -bm_simd: benchkiss.c ../kiss_fft.c pstats.c ../tools/kfc.c - $(CC) -o $@ $(CFLAGS) -DUSE_SIMD -msse -m3dnow -lm $+ - $(BENCHKISS): benchkiss.c $(SRCFILES) $(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+ diff --git a/test/test_real.c b/test/test_real.c index 27cd498..454e3ef 100644 --- a/test/test_real.c +++ b/test/test_real.c @@ -15,9 +15,19 @@ static double snr_compare( kiss_fft_cpx * vec1,kiss_fft_cpx * vec2, int n) { int k; - double sigpow,noisepow,err,snr,scale=0; - sigpow = noisepow = .00000000000000000001; + double sigpow=1e-10,noisepow=1e-10,err,snr,scale=0; +#ifdef USE_SIMD + float *fv1 = (float*)vec1; + float *fv2 = (float*)vec2; + for (k=0;k<8*n;++k) { + sigpow += *fv1 * *fv1; + err = *fv1 - *fv2; + noisepow += err*err; + ++fv1; + ++fv2; + } +#else for (k=0;k>1)*cos(f1*i) - + (maxrange>>1)*cos(f2*i) ); + + (maxrange>>1)*cos(f2*i) ); #else tbuf[i] = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i); @@ -42,8 +42,13 @@ double two_tone_test( int nfft, int bin1,int bin2) kiss_fftr(cfg, tbuf, kout); for (i=0;i < (nfft/2+1);++i) { +#ifdef USE_SIMD double tmpr = (double)*(float*)&kout[i].r / (double)maxrange; double tmpi = (double)*(float*)&kout[i].i / (double)maxrange; +#else + double tmpr = (double)kout[i].r / (double)maxrange; + double tmpi = (double)kout[i].i / (double)maxrange; +#endif double mag2 = tmpr*tmpr + tmpi*tmpi; if (i!=0 && i!= nfft/2) mag2 *= 2; /* all bins except DC and Nyquist have symmetric counterparts implied*/ diff --git a/tools/kfc.c b/tools/kfc.c index 7f1327b..424e119 100644 --- a/tools/kfc.c +++ b/tools/kfc.c @@ -42,7 +42,7 @@ static kiss_fft_cfg find_cached_fft(int nfft,int inverse) if (cur== NULL) { /* no cached node found, need to create a new one*/ kiss_fft_alloc(nfft,inverse,0,&len); - cur = (kfc_cfg)malloc(sizeof(struct cached_fft) + len ); + cur = (kfc_cfg)KISS_FFT_MALLOC((sizeof(struct cached_fft) + len )); if (cur == NULL) return NULL; cur->cfg = (kiss_fft_cfg)(cur+1); diff --git a/tools/kiss_fftr.c b/tools/kiss_fftr.c index 552fd4b..c3f9dcf 100644 --- a/tools/kiss_fftr.c +++ b/tools/kiss_fftr.c @@ -40,11 +40,7 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2); if (lenmem == NULL) { -#ifdef USE_SIMD - st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded); -#else - st = (kiss_fftr_cfg) malloc (memneeded); -#endif + st = (kiss_fftr_cfg) KISS_FFT_MALLOC (memneeded); } else { if (*lenmem >= memneeded) st = (kiss_fftr_cfg) mem; @@ -109,18 +105,10 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr C_MUL( tw , f2k , st->super_twiddles[k]); C_ADD( freqdata[k] , f1k ,tw); -#ifdef USE_SIMD - freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5); - freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5); - freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5); - freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5); - -#else - freqdata[k].r = (f1k.r + tw.r) / 2; - freqdata[k].i = (f1k.i + tw.i) / 2; - freqdata[N-k].r = (f1k.r - tw.r)/2; - freqdata[N-k].i = - (f1k.i - tw.i)/2; -#endif + freqdata[k].r = HALF_OF(f1k.r + tw.r); + freqdata[k].i = HALF_OF(f1k.i + tw.i); + freqdata[N-k].r = HALF_OF(f1k.r - tw.r); + freqdata[N-k].i = HALF_OF(tw.i - f1k.i); } CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);