Added SIMD capability

This commit is contained in:
Mark Borgerding
2005-06-24 03:03:31 +00:00
parent 24be1c5850
commit 7f6cbeab2e
10 changed files with 112 additions and 9 deletions

View File

@ -13,6 +13,10 @@ else
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
endif
ifeq "$(DATATYPE)" "simd"
TYPEFLAGS=-DUSE_SIMD=1 -msse
endif
ifeq "$(DATATYPE)" "float"
FFTUTIL=fft
FASTFILT=fastconv
@ -27,7 +31,8 @@ else
DUMPHDR=dumphdr_$(DATATYPE)
endif
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL) $(PSDPNG)
all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
# $(PSDPNG)
# $(DUMPHDR)
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)

View File

@ -159,8 +159,13 @@ kiss_fastfir_cfg kiss_fastfir_alloc(
scale = 1.0 / st->nfft;
for ( i=0; i < st->n_freq_bins; ++i ) {
#ifdef USE_SIMD
st->fir_freq_resp[i].r *= _mm_set1_ps(scale);
st->fir_freq_resp[i].i *= _mm_set1_ps(scale);
#else
st->fir_freq_resp[i].r *= scale;
st->fir_freq_resp[i].i *= scale;
#endif
}
return st;
}
@ -286,14 +291,22 @@ void direct_file_filter(
for (k = 0; k < nread; ++k) {
tmph = imp_resp+nlag;
#ifdef REAL_FASTFIR
# ifdef USE_SIMD
outval = _mm_set1_ps(0);
#else
outval = 0;
#endif
for (tap = oldestlag; tap < nlag; ++tap)
outval += circbuf[tap] * *tmph--;
for (tap = 0; tap < oldestlag; ++tap)
outval += circbuf[tap] * *tmph--;
outval += buf[k] * *tmph;
#else
# ifdef USE_SIMD
outval.r = outval.i = _mm_set1_ps(0);
#else
outval.r = outval.i = 0;
#endif
for (tap = oldestlag; tap < nlag; ++tap){
C_MUL(tmp,circbuf[tap],*tmph);
--tmph;

View File

@ -19,6 +19,9 @@ struct kiss_fftr_state{
kiss_fft_cfg substate;
kiss_fft_cpx * tmpbuf;
kiss_fft_cpx * super_twiddles;
#ifdef USE_SIMD
long pad;
#endif
};
kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
@ -37,7 +40,11 @@ kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenme
memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 2);
if (lenmem == NULL) {
#ifdef USE_SIMD
st = (kiss_fftr_cfg) memalign (sizeof(kiss_fft_cpx),memneeded);
#else
st = (kiss_fftr_cfg) malloc (memneeded);
#endif
} else {
if (*lenmem >= memneeded)
st = (kiss_fftr_cfg) mem;
@ -83,7 +90,11 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
freqdata[0].r = tdc.r + tdc.i;
#ifdef USE_SIMD
freqdata[0].i = _mm_set1_ps(0);
#else
freqdata[0].i = 0;
#endif
for (k=1;k <= N/2 ; ++k ) {
@ -98,15 +109,28 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
C_MUL( tw , f2k , st->super_twiddles[k]);
C_ADD( freqdata[k] , f1k ,tw);
#ifdef USE_SIMD
freqdata[k].r = (f1k.r + tw.r) * _mm_set1_ps(.5);
freqdata[k].i = (f1k.i + tw.i) * _mm_set1_ps(.5);
freqdata[N-k].r = (f1k.r - tw.r) * _mm_set1_ps(.5);
freqdata[N-k].i = - (f1k.i - tw.i) * _mm_set1_ps(.5);
#else
freqdata[k].r = (f1k.r + tw.r) / 2;
freqdata[k].i = (f1k.i + tw.i) / 2;
freqdata[N-k].r = (f1k.r - tw.r)/2;
freqdata[N-k].i = - (f1k.i - tw.i)/2;
#endif
}
CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
freqdata[N].r = tdc.r - tdc.i;
#ifdef USE_SIMD
freqdata[N].i = _mm_set1_ps(0);
#else
freqdata[N].i = 0;
#endif
}
void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
@ -137,7 +161,11 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t
C_MUL (fok, tmpbuf, st->super_twiddles[k]);
C_ADD (st->tmpbuf[k], fek, fok);
C_SUB (st->tmpbuf[N - k], fek, fok);
#ifdef USE_SIMD
st->tmpbuf[N - k].i *= _mm_set1_ps(-1.0);
#else
st->tmpbuf[N - k].i *= -1;
#endif
}
kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
}