From 7f6cbeab2e9ceec757d0c5577f1eecb5a791786c Mon Sep 17 00:00:00 2001 From: Mark Borgerding Date: Fri, 24 Jun 2005 03:03:31 +0000 Subject: [PATCH] added simd capability --- Makefile | 1 + _kiss_fft_guts.h | 5 +++++ kiss_fft.c | 9 +++++++++ kiss_fft.h | 7 +++++++ test/Makefile | 11 ++++++++++- test/benchkiss.c | 23 +++++++++++++++++++++++ test/twotonetest.c | 15 +++++++++------ tools/Makefile | 7 ++++++- tools/kiss_fastfir.c | 13 +++++++++++++ tools/kiss_fftr.c | 30 +++++++++++++++++++++++++++++- 10 files changed, 112 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 9e48dc4..80be924 100644 --- a/Makefile +++ b/Makefile @@ -5,6 +5,7 @@ TARBALL=kiss_fft_v$(KFVER).tar.gz ZIPFILE=kiss_fft_v$(KFVER).zip testall: + export DATATYPE=simd && cd test && make test export DATATYPE=short && cd test && make test export DATATYPE=float && cd test && make test export DATATYPE=double && cd test && make test diff --git a/_kiss_fft_guts.h b/_kiss_fft_guts.h index 1587cb2..a7df0ce 100644 --- a/_kiss_fft_guts.h +++ b/_kiss_fft_guts.h @@ -133,6 +133,11 @@ void kf_cexp(kiss_fft_cpx * x,double phase) /* returns e ** (j*phase) */ #ifdef FIXED_POINT x->r = (kiss_fft_scalar) (SAMP_MAX * cos (phase)); x->i = (kiss_fft_scalar) (SAMP_MAX * sin (phase)); +#elif defined(USE_SIMD) + float r = cos (phase); + float i = sin (phase); + x->r = _mm_load1_ps(&r); + x->i = _mm_load1_ps(&i); #else x->r = (kiss_fft_scalar) cos (phase); x->i = (kiss_fft_scalar) sin (phase); diff --git a/kiss_fft.c b/kiss_fft.c index cd295b3..98f00dd 100644 --- a/kiss_fft.c +++ b/kiss_fft.c @@ -129,8 +129,13 @@ static void kf_bfly3( tw1 += fstride; tw2 += fstride*2; +#ifdef USE_SIMD + Fout[m].r = Fout->r - scratch[3].r * _mm_set1_ps(.5); + Fout[m].i = Fout->i - scratch[3].i * _mm_set1_ps(.5); +#else Fout[m].r = Fout->r - scratch[3].r/2; Fout[m].i = Fout->i - scratch[3].i/2; +#endif C_MULBYSCALAR( scratch[0] , epi3.i ); @@ -326,7 +331,11 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/ if ( lenmem==NULL ) { +#ifdef USE_SIMD + st = ( kiss_fft_cfg)memalign( sizeof(kiss_fft_cpx), memneeded ); +#else st = ( kiss_fft_cfg)malloc( memneeded ); +#endif }else{ if (*lenmem >= memneeded) st = (kiss_fft_cfg)mem; diff --git a/kiss_fft.h b/kiss_fft.h index a208bc2..7ef7bb2 100644 --- a/kiss_fft.h +++ b/kiss_fft.h @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -23,6 +24,12 @@ extern "C" { in the tools/ directory. */ +#ifdef USE_SIMD +# include +# define kiss_fft_scalar __m128 +#endif + + #ifdef FIXED_POINT # if (FIXED_POINT == 32) # define kiss_fft_scalar long diff --git a/test/Makefile b/test/Makefile index ec42ca8..950274e 100644 --- a/test/Makefile +++ b/test/Makefile @@ -23,6 +23,7 @@ TESTKFC=tkfc_$(DATATYPE) FASTFILTREAL=ffr_$(DATATYPE) SELFTESTSRC=twotonetest.c + TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE) ifeq "$(DATATYPE)" "short" @@ -31,7 +32,12 @@ endif ifeq "$(DATATYPE)" "long" TYPEFLAGS=-DFIXED_POINT=32 -endif +endif + +ifeq "$(DATATYPE)" "simd" + TYPEFLAGS=-DUSE_SIMD=1 -msse +endif + ifeq "$(DATATYPE)" "float" # fftw needs to be built with --enable-float to build this lib @@ -61,6 +67,9 @@ $(TESTKFC): $(SRCFILES) $(TESTREAL): test_real.c $(SRCFILES) $(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+ +bm_simd: benchkiss.c ../kiss_fft.c pstats.c ../tools/kfc.c + $(CC) -o $@ $(CFLAGS) -DUSE_SIMD -msse -m3dnow -lm $+ + $(BENCHKISS): benchkiss.c $(SRCFILES) $(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+ diff --git a/test/benchkiss.c b/test/benchkiss.c index 3440966..b9edb94 100644 --- a/test/benchkiss.c +++ b/test/benchkiss.c @@ -6,6 +6,8 @@ #include "pstats.h" +#define CHK fprintf(stderr,"line %d\n" , __LINE__ ) + int main(int argc,char ** argv) { int nfft=1024; @@ -31,20 +33,41 @@ int main(int argc,char ** argv) break; } } + CHK; +#ifdef USE_SIMD + buf=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft); + bufout=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft); + + numffts /= 4; + fprintf(stderr,"since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n",numffts); +#else buf=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft); bufout=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft); +#endif + + fprintf(stderr,"buf at %p, bufout at %p\n",buf,bufout); + CHK; for (i=0;i -#define pcpx(c)\ - fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) ) static double two_tone_test( int nfft, int bin1,int bin2) @@ -27,20 +25,25 @@ double two_tone_test( int nfft, int bin1,int bin2) #endif cfg = kiss_fftr_alloc(nfft , 0, NULL, NULL); - tbuf = malloc(nfft * sizeof(kiss_fft_scalar)); - kout = malloc(nfft * sizeof(kiss_fft_cpx)); + tbuf = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_scalar)); + kout = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_cpx)); /* generate a signal with two tones*/ for (i = 0; i < nfft; i++) { +#ifdef USE_SIMD + tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i) + + (maxrange>>1)*cos(f2*i) ); +#else tbuf[i] = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i); +#endif } kiss_fftr(cfg, tbuf, kout); for (i=0;i < (nfft/2+1);++i) { - double tmpr = (double)kout[i].r / (double)maxrange; - double tmpi = (double)kout[i].i / (double)maxrange; + double tmpr = (double)*(float*)&kout[i].r / (double)maxrange; + double tmpi = (double)*(float*)&kout[i].i / (double)maxrange; double mag2 = tmpr*tmpr + tmpi*tmpi; if (i!=0 && i!= nfft/2) mag2 *= 2; /* all bins except DC and Nyquist have symmetric counterparts implied*/ diff --git a/tools/Makefile b/tools/Makefile index dbf4ab6..fa3fe46 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -13,6 +13,10 @@ else TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE) endif +ifeq "$(DATATYPE)" "simd" + TYPEFLAGS=-DUSE_SIMD=1 -msse +endif + ifeq "$(DATATYPE)" "float" FFTUTIL=fft FASTFILT=fastconv @@ -27,7 +31,8 @@ else DUMPHDR=dumphdr_$(DATATYPE) endif -all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL) $(PSDPNG) +all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL) +# $(PSDPNG) # $(DUMPHDR) #CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS) diff --git a/tools/kiss_fastfir.c b/tools/kiss_fastfir.c index ad1a62d..1c96216 100644 --- a/tools/kiss_fastfir.c +++ b/tools/kiss_fastfir.c @@ -159,8 +159,13 @@ kiss_fastfir_cfg kiss_fastfir_alloc( scale = 1.0 / st->nfft; for ( i=0; i < st->n_freq_bins; ++i ) { +#ifdef USE_SIMD + st->fir_freq_resp[i].r *= _mm_set1_ps(scale); + st->fir_freq_resp[i].i *= _mm_set1_ps(scale); +#else st->fir_freq_resp[i].r *= scale; st->fir_freq_resp[i].i *= scale; +#endif } return st; } @@ -286,14 +291,22 @@ void direct_file_filter( for (k = 0; k < nread; ++k) { tmph = imp_resp+nlag; #ifdef REAL_FASTFIR +# ifdef USE_SIMD + outval = _mm_set1_ps(0); +#else outval = 0; +#endif for (tap = oldestlag; tap < nlag; ++tap) outval += circbuf[tap] * *tmph--; for (tap = 0; tap < oldestlag; ++tap) outval += circbuf[tap] * *tmph--; outval += buf[k] * *tmph; #else +# ifdef USE_SIMD + outval.r = outval.i = _mm_set1_ps(0); +#else outval.r = outval.i = 0; +#endif for (tap = oldestlag; tap < nlag; ++tap){ C_MUL(tmp,circbuf[tap],*tmph); --tmph; diff --git a/tools/kiss_fftr.c b/tools/kiss_fftr.c index 859ac90..552fd4b 100644 --- a/tools/kiss_fftr.c +++ b/tools/kiss_fftr.c @@ -19,6 +19,9 @@ struct kiss_fftr_state{ kiss_fft_cfg substate; kiss_fft_cpx * tmpbuf; kiss_fft_cpx * super_twiddles; +#ifdef USE_SIMD + long pad; +#endif }; kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem) @@ -37,7 +40,11 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2); if (lenmem == NULL) { +#ifdef USE_SIMD + st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded); +#else st = (kiss_fftr_cfg) malloc (memneeded); +#endif } else { if (*lenmem >= memneeded) st = (kiss_fftr_cfg) mem; @@ -83,7 +90,11 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i); freqdata[0].r = tdc.r + tdc.i; +#ifdef USE_SIMD + freqdata[0].i = _mm_set1_ps(0); +#else freqdata[0].i = 0; +#endif for (k=1;k <= N/2 ; ++k ) { @@ -98,15 +109,28 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr C_MUL( tw , f2k , st->super_twiddles[k]); C_ADD( freqdata[k] , f1k ,tw); +#ifdef USE_SIMD + freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5); + freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5); + freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5); + freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5); + +#else freqdata[k].r = (f1k.r + tw.r) / 2; freqdata[k].i = (f1k.i + tw.i) / 2; - freqdata[N-k].r = (f1k.r - tw.r)/2; freqdata[N-k].i = - (f1k.i - tw.i)/2; +#endif + } CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i); freqdata[N].r = tdc.r - tdc.i; + +#ifdef USE_SIMD + freqdata[N].i = _mm_set1_ps(0); +#else freqdata[N].i = 0; +#endif } void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata) @@ -137,7 +161,11 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t C_MUL (fok, tmpbuf, st->super_twiddles[k]); C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[N - k], fek, fok); +#ifdef USE_SIMD + st->tmpbuf[N - k].i *= _mm_set1_ps(-1.0); +#else st->tmpbuf[N - k].i *= -1; +#endif } kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata); }