mirror of
https://github.com/mborgerding/kissfft.git
synced 2025-06-04 01:28:23 -04:00
polishing SIMD changes
This commit is contained in:
parent
95fd244f93
commit
4be23bffa4
@ -1,3 +1,12 @@
|
||||
1.2.3 (June 25, 2005) The "you want to use WHAT as a sample" release.
|
||||
Added ability to use 32 bit fixed point samples -- requires a 64 bit intermediate result, a la 'long long'
|
||||
|
||||
Added ability to do 4 FFTs in parallel by using SSE SIMD instructions. This is accomplished by
|
||||
using the __m128 (vector of 4 floats) as kiss_fft_scalar. Define USE_SIMD to use this.
|
||||
|
||||
I know, I know ... this is drifting a bit from the "kiss" principle, but the speed advantages
|
||||
make it worth it for some. Also recent gcc makes it SOO easy to use vectors of 4 floats as a POD type.
|
||||
|
||||
1.2.2 (May 6, 2005) The Matthew release
|
||||
Replaced fixed point division with multiply&shift. Thanks to Jean-Marc Valin for
|
||||
discussions regarding. Considerable speedup.
|
||||
|
3
Makefile
3
Makefile
@ -5,7 +5,10 @@ TARBALL=kiss_fft_v$(KFVER).tar.gz
|
||||
ZIPFILE=kiss_fft_v$(KFVER).zip
|
||||
|
||||
testall:
|
||||
# The simd and long types may or may not work on your machine
|
||||
export DATATYPE=simd && cd test && make test
|
||||
export DATATYPE=long && cd test && make test
|
||||
|
||||
export DATATYPE=short && cd test && make test
|
||||
export DATATYPE=float && cd test && make test
|
||||
export DATATYPE=double && cd test && make test
|
||||
|
@ -125,24 +125,26 @@ struct kiss_fft_state{
|
||||
}while(0)
|
||||
|
||||
|
||||
|
||||
|
||||
static
|
||||
void kf_cexp(kiss_fft_cpx * x,double phase) /* returns e ** (j*phase) */
|
||||
{
|
||||
#ifdef FIXED_POINT
|
||||
x->r = (kiss_fft_scalar) (SAMP_MAX * cos (phase));
|
||||
x->i = (kiss_fft_scalar) (SAMP_MAX * sin (phase));
|
||||
# define KISS_FFT_COS(phase) (kiss_fft_scalar) (SAMP_MAX * cos (phase))
|
||||
# define KISS_FFT_SIN(phase) (kiss_fft_scalar) (SAMP_MAX * sin (phase))
|
||||
# define HALF_OF(x) ((x)>>1)
|
||||
#elif defined(USE_SIMD)
|
||||
float r = cos (phase);
|
||||
float i = sin (phase);
|
||||
x->r = _mm_load1_ps(&r);
|
||||
x->i = _mm_load1_ps(&i);
|
||||
# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
|
||||
# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
|
||||
# define HALF_OF(x) ((x)*_mm_set1_ps(.5))
|
||||
#else
|
||||
x->r = (kiss_fft_scalar) cos (phase);
|
||||
x->i = (kiss_fft_scalar) sin (phase);
|
||||
# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
|
||||
# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)
|
||||
# define HALF_OF(x) ((x)*.5)
|
||||
#endif
|
||||
}
|
||||
|
||||
#define kf_cexp(x,phase) \
|
||||
do{ \
|
||||
(x)->r = KISS_FFT_COS(phase);\
|
||||
(x)->i = KISS_FFT_SIN(phase);\
|
||||
}while(0)
|
||||
|
||||
|
||||
/* a debugging function */
|
||||
#define pcpx(c)\
|
||||
|
15
kiss_fft.c
15
kiss_fft.c
@ -129,13 +129,8 @@ static void kf_bfly3(
|
||||
tw1 += fstride;
|
||||
tw2 += fstride*2;
|
||||
|
||||
#ifdef USE_SIMD
|
||||
Fout[m].r = Fout->r - scratch[3].r * _mm_set1_ps(.5);
|
||||
Fout[m].i = Fout->i - scratch[3].i * _mm_set1_ps(.5);
|
||||
#else
|
||||
Fout[m].r = Fout->r - scratch[3].r/2;
|
||||
Fout[m].i = Fout->i - scratch[3].i/2;
|
||||
#endif
|
||||
Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
|
||||
Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
|
||||
|
||||
C_MULBYSCALAR( scratch[0] , epi3.i );
|
||||
|
||||
@ -331,11 +326,7 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
|
||||
+ sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
|
||||
|
||||
if ( lenmem==NULL ) {
|
||||
#ifdef USE_SIMD
|
||||
st = ( kiss_fft_cfg)memalign( sizeof(kiss_fft_cpx), memneeded );
|
||||
#else
|
||||
st = ( kiss_fft_cfg)malloc( memneeded );
|
||||
#endif
|
||||
st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
|
||||
}else{
|
||||
if (*lenmem >= memneeded)
|
||||
st = (kiss_fft_cfg)mem;
|
||||
|
@ -27,6 +27,9 @@ extern "C" {
|
||||
#ifdef USE_SIMD
|
||||
# include <xmmintrin.h>
|
||||
# define kiss_fft_scalar __m128
|
||||
#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
|
||||
#else
|
||||
#define KISS_FFT_MALLOC malloc
|
||||
#endif
|
||||
|
||||
|
||||
|
@ -67,9 +67,6 @@ $(TESTKFC): $(SRCFILES)
|
||||
$(TESTREAL): test_real.c $(SRCFILES)
|
||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||
|
||||
bm_simd: benchkiss.c ../kiss_fft.c pstats.c ../tools/kfc.c
|
||||
$(CC) -o $@ $(CFLAGS) -DUSE_SIMD -msse -m3dnow -lm $+
|
||||
|
||||
$(BENCHKISS): benchkiss.c $(SRCFILES)
|
||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||
|
||||
|
@ -15,9 +15,19 @@ static
|
||||
double snr_compare( kiss_fft_cpx * vec1,kiss_fft_cpx * vec2, int n)
|
||||
{
|
||||
int k;
|
||||
double sigpow,noisepow,err,snr,scale=0;
|
||||
sigpow = noisepow = .00000000000000000001;
|
||||
double sigpow=1e-10,noisepow=1e-10,err,snr,scale=0;
|
||||
|
||||
#ifdef USE_SIMD
|
||||
float *fv1 = (float*)vec1;
|
||||
float *fv2 = (float*)vec2;
|
||||
for (k=0;k<8*n;++k) {
|
||||
sigpow += *fv1 * *fv1;
|
||||
err = *fv1 - *fv2;
|
||||
noisepow += err*err;
|
||||
++fv1;
|
||||
++fv2;
|
||||
}
|
||||
#else
|
||||
for (k=0;k<n;++k) {
|
||||
sigpow += (double)vec1[k].r * (double)vec1[k].r +
|
||||
(double)vec1[k].i * (double)vec1[k].i;
|
||||
@ -28,11 +38,8 @@ double snr_compare( kiss_fft_cpx * vec1,kiss_fft_cpx * vec2, int n)
|
||||
|
||||
if (vec1[k].r)
|
||||
scale +=(double) vec2[k].r / (double)vec1[k].r;
|
||||
/*
|
||||
fprintf(stderr,"vec1=");pcpx(vec1+k);
|
||||
fprintf(stderr,"vec2=");pcpx(vec2+k);
|
||||
*/
|
||||
}
|
||||
#endif
|
||||
snr = 10*log10( sigpow / noisepow );
|
||||
scale /= n;
|
||||
if (snr<10) {
|
||||
@ -41,15 +48,10 @@ double snr_compare( kiss_fft_cpx * vec1,kiss_fft_cpx * vec2, int n)
|
||||
}
|
||||
return snr;
|
||||
}
|
||||
#define RANDOM
|
||||
#ifndef RANDOM
|
||||
#define NFFT 8
|
||||
#else
|
||||
#define NFFT 8*3*5
|
||||
#endif
|
||||
|
||||
#ifndef NUMFFTS
|
||||
#define NUMFFTS 1000
|
||||
#define NUMFFTS 10000
|
||||
#endif
|
||||
|
||||
|
||||
@ -58,21 +60,24 @@ int main(void)
|
||||
double ts,tfft,trfft;
|
||||
int i;
|
||||
kiss_fft_cpx cin[NFFT];
|
||||
kiss_fft_scalar rin[NFFT] = {0.309655,0.815653,0.768570,0.591841,0.404767,0.637617,0.007803,0.012665};
|
||||
kiss_fft_cpx cout[NFFT];
|
||||
kiss_fft_cpx sout[NFFT];
|
||||
|
||||
kiss_fft_cfg kiss_fft_state;
|
||||
kiss_fftr_cfg kiss_fftr_state;
|
||||
|
||||
kiss_fft_scalar rin[NFFT];
|
||||
|
||||
srand(time(0));
|
||||
|
||||
for (i=0;i<NFFT;++i) {
|
||||
#ifdef RANDOM
|
||||
#ifdef USE_SIMD
|
||||
rin[i] = _mm_set1_ps(rand()-RAND_MAX/2);
|
||||
cin[i].i = _mm_set1_ps(0);
|
||||
#else
|
||||
rin[i] = (kiss_fft_scalar)(rand()-RAND_MAX/2);
|
||||
cin[i].i = 0;
|
||||
#endif
|
||||
cin[i].r = rin[i];
|
||||
cin[i].i = 0;
|
||||
}
|
||||
|
||||
kiss_fft_state = kiss_fft_alloc(NFFT,0,0,0);
|
||||
@ -81,7 +86,6 @@ int main(void)
|
||||
kiss_fftr(kiss_fftr_state,rin,sout);
|
||||
printf( "nfft=%d, inverse=%d, snr=%g\n",
|
||||
NFFT,0, snr_compare(cout,sout,(NFFT/2)+1) );
|
||||
#ifdef RANDOM
|
||||
ts = cputime();
|
||||
for (i=0;i<NUMFFTS;++i) {
|
||||
kiss_fft(kiss_fft_state,cin,cout);
|
||||
@ -96,7 +100,7 @@ int main(void)
|
||||
trfft = cputime() - ts;
|
||||
|
||||
printf("%d complex ffts took %gs, real took %gs\n",NUMFFTS,tfft,trfft);
|
||||
#endif
|
||||
|
||||
free(kiss_fft_state);
|
||||
free(kiss_fftr_state);
|
||||
|
||||
@ -108,7 +112,11 @@ int main(void)
|
||||
|
||||
for (i=0;i<NFFT;++i) {
|
||||
sout[i].r = rin[i];
|
||||
#ifdef USE_SIMD
|
||||
sout[i].i ^= sout[i].i;
|
||||
#else
|
||||
sout[i].i = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
printf( "nfft=%d, inverse=%d, snr=%g\n",
|
||||
|
@ -25,14 +25,14 @@ double two_tone_test( int nfft, int bin1,int bin2)
|
||||
#endif
|
||||
|
||||
cfg = kiss_fftr_alloc(nfft , 0, NULL, NULL);
|
||||
tbuf = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_scalar));
|
||||
kout = memalign(sizeof(kiss_fft_scalar),nfft * sizeof(kiss_fft_cpx));
|
||||
tbuf = KISS_FFT_MALLOC(nfft * sizeof(kiss_fft_scalar));
|
||||
kout = KISS_FFT_MALLOC(nfft * sizeof(kiss_fft_cpx));
|
||||
|
||||
/* generate a signal with two tones*/
|
||||
for (i = 0; i < nfft; i++) {
|
||||
#ifdef USE_SIMD
|
||||
tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i)
|
||||
+ (maxrange>>1)*cos(f2*i) );
|
||||
+ (maxrange>>1)*cos(f2*i) );
|
||||
#else
|
||||
tbuf[i] = (maxrange>>1)*cos(f1*i)
|
||||
+ (maxrange>>1)*cos(f2*i);
|
||||
@ -42,8 +42,13 @@ double two_tone_test( int nfft, int bin1,int bin2)
|
||||
kiss_fftr(cfg, tbuf, kout);
|
||||
|
||||
for (i=0;i < (nfft/2+1);++i) {
|
||||
#ifdef USE_SIMD
|
||||
double tmpr = (double)*(float*)&kout[i].r / (double)maxrange;
|
||||
double tmpi = (double)*(float*)&kout[i].i / (double)maxrange;
|
||||
#else
|
||||
double tmpr = (double)kout[i].r / (double)maxrange;
|
||||
double tmpi = (double)kout[i].i / (double)maxrange;
|
||||
#endif
|
||||
double mag2 = tmpr*tmpr + tmpi*tmpi;
|
||||
if (i!=0 && i!= nfft/2)
|
||||
mag2 *= 2; /* all bins except DC and Nyquist have symmetric counterparts implied*/
|
||||
|
@ -42,7 +42,7 @@ static kiss_fft_cfg find_cached_fft(int nfft,int inverse)
|
||||
if (cur== NULL) {
|
||||
/* no cached node found, need to create a new one*/
|
||||
kiss_fft_alloc(nfft,inverse,0,&len);
|
||||
cur = (kfc_cfg)malloc(sizeof(struct cached_fft) + len );
|
||||
cur = (kfc_cfg)KISS_FFT_MALLOC((sizeof(struct cached_fft) + len ));
|
||||
if (cur == NULL)
|
||||
return NULL;
|
||||
cur->cfg = (kiss_fft_cfg)(cur+1);
|
||||
|
@ -40,11 +40,7 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme
|
||||
memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
|
||||
|
||||
if (lenmem == NULL) {
|
||||
#ifdef USE_SIMD
|
||||
st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded);
|
||||
#else
|
||||
st = (kiss_fftr_cfg) malloc (memneeded);
|
||||
#endif
|
||||
st = (kiss_fftr_cfg) KISS_FFT_MALLOC (memneeded);
|
||||
} else {
|
||||
if (*lenmem >= memneeded)
|
||||
st = (kiss_fftr_cfg) mem;
|
||||
@ -109,18 +105,10 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
|
||||
C_MUL( tw , f2k , st->super_twiddles[k]);
|
||||
|
||||
C_ADD( freqdata[k] , f1k ,tw);
|
||||
#ifdef USE_SIMD
|
||||
freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5);
|
||||
freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5);
|
||||
freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5);
|
||||
freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5);
|
||||
|
||||
#else
|
||||
freqdata[k].r = (f1k.r + tw.r) / 2;
|
||||
freqdata[k].i = (f1k.i + tw.i) / 2;
|
||||
freqdata[N-k].r = (f1k.r - tw.r)/2;
|
||||
freqdata[N-k].i = - (f1k.i - tw.i)/2;
|
||||
#endif
|
||||
freqdata[k].r = HALF_OF(f1k.r + tw.r);
|
||||
freqdata[k].i = HALF_OF(f1k.i + tw.i);
|
||||
freqdata[N-k].r = HALF_OF(f1k.r - tw.r);
|
||||
freqdata[N-k].i = HALF_OF(tw.i - f1k.i);
|
||||
|
||||
}
|
||||
CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
|
||||
|
Loading…
x
Reference in New Issue
Block a user