mirror of
https://github.com/mborgerding/kissfft.git
synced 2025-05-27 21:20:27 -04:00
added simd capability
This commit is contained in:
parent
24be1c5850
commit
7f6cbeab2e
1
Makefile
1
Makefile
@ -5,6 +5,7 @@ TARBALL=kiss_fft_v$(KFVER).tar.gz
|
||||
ZIPFILE=kiss_fft_v$(KFVER).zip
|
||||
|
||||
testall:
|
||||
export DATATYPE=simd && cd test && make test
|
||||
export DATATYPE=short && cd test && make test
|
||||
export DATATYPE=float && cd test && make test
|
||||
export DATATYPE=double && cd test && make test
|
||||
|
@ -133,6 +133,11 @@ void kf_cexp(kiss_fft_cpx * x,double phase) /* returns e ** (j*phase) */
|
||||
#ifdef FIXED_POINT
|
||||
x->r = (kiss_fft_scalar) (SAMP_MAX * cos (phase));
|
||||
x->i = (kiss_fft_scalar) (SAMP_MAX * sin (phase));
|
||||
#elif defined(USE_SIMD)
|
||||
float r = cos (phase);
|
||||
float i = sin (phase);
|
||||
x->r = _mm_load1_ps(&r);
|
||||
x->i = _mm_load1_ps(&i);
|
||||
#else
|
||||
x->r = (kiss_fft_scalar) cos (phase);
|
||||
x->i = (kiss_fft_scalar) sin (phase);
|
||||
|
@ -129,8 +129,13 @@ static void kf_bfly3(
|
||||
tw1 += fstride;
|
||||
tw2 += fstride*2;
|
||||
|
||||
#ifdef USE_SIMD
|
||||
Fout[m].r = Fout->r - scratch[3].r * _mm_set1_ps(.5);
|
||||
Fout[m].i = Fout->i - scratch[3].i * _mm_set1_ps(.5);
|
||||
#else
|
||||
Fout[m].r = Fout->r - scratch[3].r/2;
|
||||
Fout[m].i = Fout->i - scratch[3].i/2;
|
||||
#endif
|
||||
|
||||
C_MULBYSCALAR( scratch[0] , epi3.i );
|
||||
|
||||
@ -326,7 +331,11 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
|
||||
+ sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
|
||||
|
||||
if ( lenmem==NULL ) {
|
||||
#ifdef USE_SIMD
|
||||
st = ( kiss_fft_cfg)memalign( sizeof(kiss_fft_cpx), memneeded );
|
||||
#else
|
||||
st = ( kiss_fft_cfg)malloc( memneeded );
|
||||
#endif
|
||||
}else{
|
||||
if (*lenmem >= memneeded)
|
||||
st = (kiss_fft_cfg)mem;
|
||||
|
@ -5,6 +5,7 @@
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <memory.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -23,6 +24,12 @@ extern "C" {
|
||||
in the tools/ directory.
|
||||
*/
|
||||
|
||||
#ifdef USE_SIMD
|
||||
# include <xmmintrin.h>
|
||||
# define kiss_fft_scalar __m128
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
# if (FIXED_POINT == 32)
|
||||
# define kiss_fft_scalar long
|
||||
|
@ -23,6 +23,7 @@ TESTKFC=tkfc_$(DATATYPE)
|
||||
FASTFILTREAL=ffr_$(DATATYPE)
|
||||
SELFTESTSRC=twotonetest.c
|
||||
|
||||
|
||||
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
|
||||
|
||||
ifeq "$(DATATYPE)" "short"
|
||||
@ -31,7 +32,12 @@ endif
|
||||
|
||||
ifeq "$(DATATYPE)" "long"
|
||||
TYPEFLAGS=-DFIXED_POINT=32
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq "$(DATATYPE)" "simd"
|
||||
TYPEFLAGS=-DUSE_SIMD=1 -msse
|
||||
endif
|
||||
|
||||
|
||||
ifeq "$(DATATYPE)" "float"
|
||||
# fftw needs to be built with --enable-float to build this lib
|
||||
@ -61,6 +67,9 @@ $(TESTKFC): $(SRCFILES)
|
||||
$(TESTREAL): test_real.c $(SRCFILES)
|
||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||
|
||||
bm_simd: benchkiss.c ../kiss_fft.c pstats.c ../tools/kfc.c
|
||||
$(CC) -o $@ $(CFLAGS) -DUSE_SIMD -msse -m3dnow -lm $+
|
||||
|
||||
$(BENCHKISS): benchkiss.c $(SRCFILES)
|
||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||
|
||||
|
@ -6,6 +6,8 @@
|
||||
|
||||
#include "pstats.h"
|
||||
|
||||
#define CHK fprintf(stderr,"line %d\n" , __LINE__ )
|
||||
|
||||
int main(int argc,char ** argv)
|
||||
{
|
||||
int nfft=1024;
|
||||
@ -31,20 +33,41 @@ int main(int argc,char ** argv)
|
||||
break;
|
||||
}
|
||||
}
|
||||
CHK;
|
||||
#ifdef USE_SIMD
|
||||
buf=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft);
|
||||
bufout=(kiss_fft_cpx*)memalign(sizeof(kiss_fft_cpx),sizeof(kiss_fft_cpx) * nfft);
|
||||
|
||||
numffts /= 4;
|
||||
fprintf(stderr,"since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n",numffts);
|
||||
#else
|
||||
buf=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft);
|
||||
bufout=(kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx) * nfft);
|
||||
#endif
|
||||
|
||||
fprintf(stderr,"buf at %p, bufout at %p\n",buf,bufout);
|
||||
CHK;
|
||||
|
||||
for (i=0;i<nfft;++i ) {
|
||||
#ifdef USE_SIMD
|
||||
buf[i].r = _mm_set_ps1((float)( rand() - RAND_MAX/2));
|
||||
buf[i].i = _mm_set_ps1((float)( rand() - RAND_MAX/2));
|
||||
#else
|
||||
buf[i].r = rand() - RAND_MAX/2;
|
||||
buf[i].i = rand() - RAND_MAX/2;
|
||||
#endif
|
||||
}
|
||||
CHK;
|
||||
|
||||
pstats_init();
|
||||
CHK;
|
||||
|
||||
st = kiss_fft_alloc( nfft ,isinverse ,0,0);
|
||||
CHK;
|
||||
|
||||
for (i=0;i<numffts;++i)
|
||||
kiss_fft( st ,buf,bufout );
|
||||
CHK;
|
||||
|
||||
free(st);
|
||||
|
||||
|
@ -5,8 +5,6 @@
|
||||
#include "kiss_fftr.h"
|
||||
#include <limits.h>
|
||||
|
||||
#define pcpx(c)\
|
||||
fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
|
||||
|
||||
static
|
||||
double two_tone_test( int nfft, int bin1,int bin2)
|
||||
@ -27,20 +25,25 @@ double two_tone_test( int nfft, int bin1,int bin2)
|
||||
#endif
|
||||
|
||||
cfg = kiss_fftr_alloc(nfft , 0, NULL, NULL);
|
||||
tbuf = malloc(nfft * sizeof(kiss_fft_scalar));
|
||||
kout = malloc(nfft * sizeof(kiss_fft_cpx));
|
||||
tbuf = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_scalar));
|
||||
kout = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_cpx));
|
||||
|
||||
/* generate a signal with two tones*/
|
||||
for (i = 0; i < nfft; i++) {
|
||||
#ifdef USE_SIMD
|
||||
tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i)
|
||||
+ (maxrange>>1)*cos(f2*i) );
|
||||
#else
|
||||
tbuf[i] = (maxrange>>1)*cos(f1*i)
|
||||
+ (maxrange>>1)*cos(f2*i);
|
||||
#endif
|
||||
}
|
||||
|
||||
kiss_fftr(cfg, tbuf, kout);
|
||||
|
||||
for (i=0;i < (nfft/2+1);++i) {
|
||||
double tmpr = (double)kout[i].r / (double)maxrange;
|
||||
double tmpi = (double)kout[i].i / (double)maxrange;
|
||||
double tmpr = (double)*(float*)&kout[i].r / (double)maxrange;
|
||||
double tmpi = (double)*(float*)&kout[i].i / (double)maxrange;
|
||||
double mag2 = tmpr*tmpr + tmpi*tmpi;
|
||||
if (i!=0 && i!= nfft/2)
|
||||
mag2 *= 2; /* all bins except DC and Nyquist have symmetric counterparts implied*/
|
||||
|
@ -13,6 +13,10 @@ else
|
||||
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
|
||||
endif
|
||||
|
||||
ifeq "$(DATATYPE)" "simd"
|
||||
TYPEFLAGS=-DUSE_SIMD=1 -msse
|
||||
endif
|
||||
|
||||
ifeq "$(DATATYPE)" "float"
|
||||
FFTUTIL=fft
|
||||
FASTFILT=fastconv
|
||||
@ -27,7 +31,8 @@ else
|
||||
DUMPHDR=dumphdr_$(DATATYPE)
|
||||
endif
|
||||
|
||||
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL) $(PSDPNG)
|
||||
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
|
||||
# $(PSDPNG)
|
||||
# $(DUMPHDR)
|
||||
|
||||
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
|
||||
|
@ -159,8 +159,13 @@ kiss_fastfir_cfg kiss_fastfir_alloc(
|
||||
scale = 1.0 / st->nfft;
|
||||
|
||||
for ( i=0; i < st->n_freq_bins; ++i ) {
|
||||
#ifdef USE_SIMD
|
||||
st->fir_freq_resp[i].r *= _mm_set1_ps(scale);
|
||||
st->fir_freq_resp[i].i *= _mm_set1_ps(scale);
|
||||
#else
|
||||
st->fir_freq_resp[i].r *= scale;
|
||||
st->fir_freq_resp[i].i *= scale;
|
||||
#endif
|
||||
}
|
||||
return st;
|
||||
}
|
||||
@ -286,14 +291,22 @@ void direct_file_filter(
|
||||
for (k = 0; k < nread; ++k) {
|
||||
tmph = imp_resp+nlag;
|
||||
#ifdef REAL_FASTFIR
|
||||
# ifdef USE_SIMD
|
||||
outval = _mm_set1_ps(0);
|
||||
#else
|
||||
outval = 0;
|
||||
#endif
|
||||
for (tap = oldestlag; tap < nlag; ++tap)
|
||||
outval += circbuf[tap] * *tmph--;
|
||||
for (tap = 0; tap < oldestlag; ++tap)
|
||||
outval += circbuf[tap] * *tmph--;
|
||||
outval += buf[k] * *tmph;
|
||||
#else
|
||||
# ifdef USE_SIMD
|
||||
outval.r = outval.i = _mm_set1_ps(0);
|
||||
#else
|
||||
outval.r = outval.i = 0;
|
||||
#endif
|
||||
for (tap = oldestlag; tap < nlag; ++tap){
|
||||
C_MUL(tmp,circbuf[tap],*tmph);
|
||||
--tmph;
|
||||
|
@ -19,6 +19,9 @@ struct kiss_fftr_state{
|
||||
kiss_fft_cfg substate;
|
||||
kiss_fft_cpx * tmpbuf;
|
||||
kiss_fft_cpx * super_twiddles;
|
||||
#ifdef USE_SIMD
|
||||
long pad;
|
||||
#endif
|
||||
};
|
||||
|
||||
kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
|
||||
@ -37,7 +40,11 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme
|
||||
memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
|
||||
|
||||
if (lenmem == NULL) {
|
||||
#ifdef USE_SIMD
|
||||
st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded);
|
||||
#else
|
||||
st = (kiss_fftr_cfg) malloc (memneeded);
|
||||
#endif
|
||||
} else {
|
||||
if (*lenmem >= memneeded)
|
||||
st = (kiss_fftr_cfg) mem;
|
||||
@ -83,7 +90,11 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
|
||||
|
||||
CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
|
||||
freqdata[0].r = tdc.r + tdc.i;
|
||||
#ifdef USE_SIMD
|
||||
freqdata[0].i = _mm_set1_ps(0);
|
||||
#else
|
||||
freqdata[0].i = 0;
|
||||
#endif
|
||||
|
||||
for (k=1;k <= N/2 ; ++k ) {
|
||||
|
||||
@ -98,15 +109,28 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
|
||||
C_MUL( tw , f2k , st->super_twiddles[k]);
|
||||
|
||||
C_ADD( freqdata[k] , f1k ,tw);
|
||||
#ifdef USE_SIMD
|
||||
freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5);
|
||||
freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5);
|
||||
freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5);
|
||||
freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5);
|
||||
|
||||
#else
|
||||
freqdata[k].r = (f1k.r + tw.r) / 2;
|
||||
freqdata[k].i = (f1k.i + tw.i) / 2;
|
||||
|
||||
freqdata[N-k].r = (f1k.r - tw.r)/2;
|
||||
freqdata[N-k].i = - (f1k.i - tw.i)/2;
|
||||
#endif
|
||||
|
||||
}
|
||||
CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
|
||||
freqdata[N].r = tdc.r - tdc.i;
|
||||
|
||||
#ifdef USE_SIMD
|
||||
freqdata[N].i = _mm_set1_ps(0);
|
||||
#else
|
||||
freqdata[N].i = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
|
||||
@ -137,7 +161,11 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t
|
||||
C_MUL (fok, tmpbuf, st->super_twiddles[k]);
|
||||
C_ADD (st->tmpbuf[k], fek, fok);
|
||||
C_SUB (st->tmpbuf[N - k], fek, fok);
|
||||
#ifdef USE_SIMD
|
||||
st->tmpbuf[N - k].i *= _mm_set1_ps(-1.0);
|
||||
#else
|
||||
st->tmpbuf[N - k].i *= -1;
|
||||
#endif
|
||||
}
|
||||
kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user