added simd capability

This commit is contained in:
Mark Borgerding 2005-06-24 03:03:31 +00:00
parent 24be1c5850
commit 7f6cbeab2e
10 changed files with 112 additions and 9 deletions

View File

@ -5,6 +5,7 @@ TARBALL=kiss_fft_v$(KFVER).tar.gz
ZIPFILE=kiss_fft_v$(KFVER).zip
testall:
export DATATYPE=simd && cd test && make test
export DATATYPE=short && cd test && make test
export DATATYPE=float && cd test && make test
export DATATYPE=double && cd test && make test

View File

@ -133,6 +133,11 @@ void kf_cexp(kiss_fft_cpx * x,double phase) /* returns e ** (j*phase) */
#ifdef FIXED_POINT
x->r = (kiss_fft_scalar) (SAMP_MAX * cos (phase));
x->i = (kiss_fft_scalar) (SAMP_MAX * sin (phase));
#elif defined(USE_SIMD)
float r = cos (phase);
float i = sin (phase);
x->r = _mm_load1_ps(&r);
x->i = _mm_load1_ps(&i);
#else
x->r = (kiss_fft_scalar) cos (phase);
x->i = (kiss_fft_scalar) sin (phase);

View File

@ -129,8 +129,13 @@ static void kf_bfly3(
tw1 += fstride;
tw2 += fstride*2;
#ifdef USE_SIMD
Fout[m].r = Fout->r - scratch[3].r * _mm_set1_ps(.5);
Fout[m].i = Fout->i - scratch[3].i * _mm_set1_ps(.5);
#else
Fout[m].r = Fout->r - scratch[3].r/2;
Fout[m].i = Fout->i - scratch[3].i/2;
#endif
C_MULBYSCALAR( scratch[0] , epi3.i );
@ -326,7 +331,11 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
+ sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
if ( lenmem==NULL ) {
#ifdef USE_SIMD
st = ( kiss_fft_cfg)memalign( sizeof(kiss_fft_cpx), memneeded );
#else
st = ( kiss_fft_cfg)malloc( memneeded );
#endif
}else{
if (*lenmem >= memneeded)
st = (kiss_fft_cfg)mem;

View File

@ -5,6 +5,7 @@
#include <stdio.h>
#include <math.h>
#include <memory.h>
#include <malloc.h>
#ifdef __cplusplus
extern "C" {
@ -23,6 +24,12 @@ extern "C" {
in the tools/ directory.
*/
#ifdef USE_SIMD
# include <xmmintrin.h>
# define kiss_fft_scalar __m128
#endif
#ifdef FIXED_POINT
# if (FIXED_POINT == 32)
# define kiss_fft_scalar long

View File

@ -23,6 +23,7 @@ TESTKFC=tkfc_$(DATATYPE)
FASTFILTREAL=ffr_$(DATATYPE)
SELFTESTSRC=twotonetest.c
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
ifeq "$(DATATYPE)" "short"
@ -31,7 +32,12 @@ endif
ifeq "$(DATATYPE)" "long"
TYPEFLAGS=-DFIXED_POINT=32
endif
endif
ifeq "$(DATATYPE)" "simd"
TYPEFLAGS=-DUSE_SIMD=1 -msse
endif
ifeq "$(DATATYPE)" "float"
# fftw needs to be built with --enable-float to build this lib
@ -61,6 +67,9 @@ $(TESTKFC): $(SRCFILES)
$(TESTREAL): test_real.c $(SRCFILES)
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
bm_simd: benchkiss.c ../kiss_fft.c pstats.c ../tools/kfc.c
$(CC) -o $@ $(CFLAGS) -DUSE_SIMD -msse -m3dnow -lm $+
$(BENCHKISS): benchkiss.c $(SRCFILES)
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+

View File

@ -6,6 +6,8 @@
#include "pstats.h"
#define CHK fprintf(stderr,"line %d\n" , __LINE__ )
int main(int argc,char ** argv)
{
int nfft=1024;
@ -31,20 +33,41 @@ int main(int argc,char ** argv)
break;
}
}
CHK;
#ifdef USE_SIMD
buf=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft);
bufout=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft);
numffts /= 4;
fprintf(stderr,"since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n",numffts);
#else
buf=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft);
bufout=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft);
#endif
fprintf(stderr,"buf at %p, bufout at %p\n",buf,bufout);
CHK;
for (i=0;i<nfft;++i ) {
#ifdef USE_SIMD
buf[i].r = _mm_set_ps1((float)( rand() - RAND_MAX/2));
buf[i].i = _mm_set_ps1((float)( rand() - RAND_MAX/2));
#else
buf[i].r = rand() - RAND_MAX/2;
buf[i].i = rand() - RAND_MAX/2;
#endif
}
CHK;
pstats_init();
CHK;
st = kiss_fft_alloc( nfft ,isinverse ,0,0);
CHK;
for (i=0;i<numffts;++i)
kiss_fft( st ,buf,bufout );
CHK;
free(st);

View File

@ -5,8 +5,6 @@
#include "kiss_fftr.h"
#include <limits.h>
#define pcpx(c)\
fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
static
double two_tone_test( int nfft, int bin1,int bin2)
@ -27,20 +25,25 @@ double two_tone_test( int nfft, int bin1,int bin2)
#endif
cfg = kiss_fftr_alloc(nfft , 0, NULL, NULL);
tbuf = malloc(nfft * sizeof(kiss_fft_scalar));
kout = malloc(nfft * sizeof(kiss_fft_cpx));
tbuf = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_scalar));
kout = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_cpx));
/* generate a signal with two tones*/
for (i = 0; i < nfft; i++) {
#ifdef USE_SIMD
tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i)
+ (maxrange>>1)*cos(f2*i) );
#else
tbuf[i] = (maxrange>>1)*cos(f1*i)
+ (maxrange>>1)*cos(f2*i);
#endif
}
kiss_fftr(cfg, tbuf, kout);
for (i=0;i < (nfft/2+1);++i) {
double tmpr = (double)kout[i].r / (double)maxrange;
double tmpi = (double)kout[i].i / (double)maxrange;
double tmpr = (double)*(float*)&kout[i].r / (double)maxrange;
double tmpi = (double)*(float*)&kout[i].i / (double)maxrange;
double mag2 = tmpr*tmpr + tmpi*tmpi;
if (i!=0 && i!= nfft/2)
mag2 *= 2; /* all bins except DC and Nyquist have symmetric counterparts implied*/

View File

@ -13,6 +13,10 @@ else
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
endif
ifeq "$(DATATYPE)" "simd"
TYPEFLAGS=-DUSE_SIMD=1 -msse
endif
ifeq "$(DATATYPE)" "float"
FFTUTIL=fft
FASTFILT=fastconv
@ -27,7 +31,8 @@ else
DUMPHDR=dumphdr_$(DATATYPE)
endif
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL) $(PSDPNG)
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
# $(PSDPNG)
# $(DUMPHDR)
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)

View File

@ -159,8 +159,13 @@ kiss_fastfir_cfg kiss_fastfir_alloc(
scale = 1.0 / st->nfft;
for ( i=0; i < st->n_freq_bins; ++i ) {
#ifdef USE_SIMD
st->fir_freq_resp[i].r *= _mm_set1_ps(scale);
st->fir_freq_resp[i].i *= _mm_set1_ps(scale);
#else
st->fir_freq_resp[i].r *= scale;
st->fir_freq_resp[i].i *= scale;
#endif
}
return st;
}
@ -286,14 +291,22 @@ void direct_file_filter(
for (k = 0; k < nread; ++k) {
tmph = imp_resp+nlag;
#ifdef REAL_FASTFIR
# ifdef USE_SIMD
outval = _mm_set1_ps(0);
#else
outval = 0;
#endif
for (tap = oldestlag; tap < nlag; ++tap)
outval += circbuf[tap] * *tmph--;
for (tap = 0; tap < oldestlag; ++tap)
outval += circbuf[tap] * *tmph--;
outval += buf[k] * *tmph;
#else
# ifdef USE_SIMD
outval.r = outval.i = _mm_set1_ps(0);
#else
outval.r = outval.i = 0;
#endif
for (tap = oldestlag; tap < nlag; ++tap){
C_MUL(tmp,circbuf[tap],*tmph);
--tmph;

View File

@ -19,6 +19,9 @@ struct kiss_fftr_state{
kiss_fft_cfg substate;
kiss_fft_cpx * tmpbuf;
kiss_fft_cpx * super_twiddles;
#ifdef USE_SIMD
long pad;
#endif
};
kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
@ -37,7 +40,11 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme
memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
if (lenmem == NULL) {
#ifdef USE_SIMD
st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded);
#else
st = (kiss_fftr_cfg) malloc (memneeded);
#endif
} else {
if (*lenmem >= memneeded)
st = (kiss_fftr_cfg) mem;
@ -83,7 +90,11 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
freqdata[0].r = tdc.r + tdc.i;
#ifdef USE_SIMD
freqdata[0].i = _mm_set1_ps(0);
#else
freqdata[0].i = 0;
#endif
for (k=1;k <= N/2 ; ++k ) {
@ -98,15 +109,28 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
C_MUL( tw , f2k , st->super_twiddles[k]);
C_ADD( freqdata[k] , f1k ,tw);
#ifdef USE_SIMD
freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5);
freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5);
freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5);
freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5);
#else
freqdata[k].r = (f1k.r + tw.r) / 2;
freqdata[k].i = (f1k.i + tw.i) / 2;
freqdata[N-k].r = (f1k.r - tw.r)/2;
freqdata[N-k].i = - (f1k.i - tw.i)/2;
#endif
}
CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
freqdata[N].r = tdc.r - tdc.i;
#ifdef USE_SIMD
freqdata[N].i = _mm_set1_ps(0);
#else
freqdata[N].i = 0;
#endif
}
void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
@ -137,7 +161,11 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t
C_MUL (fok, tmpbuf, st->super_twiddles[k]);
C_ADD (st->tmpbuf[k], fek, fok);
C_SUB (st->tmpbuf[N - k], fek, fok);
#ifdef USE_SIMD
st->tmpbuf[N - k].i *= _mm_set1_ps(-1.0);
#else
st->tmpbuf[N - k].i *= -1;
#endif
}
kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
}