Took laptop timings:

./bm_kiss_float -x 10000 -n 2048

CPU time went from 1.70 s
to 1.57 s.
This commit is contained in:
Mark Borgerding 2004-02-16 13:36:24 +00:00
parent 7e3d2a69b0
commit d1e99f297b

View File

@ -51,48 +51,47 @@ static void kf_bfly4(
kiss_fft_cpx * Fout, kiss_fft_cpx * Fout,
const int fstride, const int fstride,
const kiss_fft_state * st, const kiss_fft_state * st,
int m const size_t m
) )
{ {
kiss_fft_cpx *Fout1,*Fout2,*Fout3;
kiss_fft_cpx *tw1,*tw2,*tw3; kiss_fft_cpx *tw1,*tw2,*tw3;
kiss_fft_cpx scratch[6]; kiss_fft_cpx scratch[6];
size_t k=m;
const size_t m2=2*m;
const size_t m3=3*m;
Fout1 = Fout + m;
Fout2 = Fout + 2*m;
Fout3 = Fout + 3*m;
tw3 = tw2 = tw1 = st->twiddles; tw3 = tw2 = tw1 = st->twiddles;
do { do {
C_FIXDIV(*Fout,4); C_FIXDIV(*Fout1,4); C_FIXDIV(*Fout2,4); C_FIXDIV(*Fout3,4); C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
C_MUL(scratch[0],*Fout1 , *tw1 ); C_MUL(scratch[0],Fout[m] , *tw1 );
C_MUL(scratch[1],*Fout2 , *tw2 ); C_MUL(scratch[1],Fout[m2] , *tw2 );
C_MUL(scratch[2],*Fout3 , *tw3 ); C_MUL(scratch[2],Fout[m3] , *tw3 );
C_SUB( scratch[5] , *Fout, scratch[1] ); C_SUB( scratch[5] , *Fout, scratch[1] );
C_ADDTO(*Fout, scratch[1]); C_ADDTO(*Fout, scratch[1]);
C_ADD( scratch[3] , scratch[0] , scratch[2] ); C_ADD( scratch[3] , scratch[0] , scratch[2] );
C_SUB( scratch[4] , scratch[0] , scratch[2] ); C_SUB( scratch[4] , scratch[0] , scratch[2] );
C_SUB( *Fout2, *Fout, scratch[3] ); C_SUB( Fout[m2], *Fout, scratch[3] );
tw1 += fstride; tw1 += fstride;
tw2 += fstride*2; tw2 += fstride*2;
tw3 += fstride*3; tw3 += fstride*3;
C_ADDTO( *Fout , scratch[3] ); C_ADDTO( *Fout , scratch[3] );
if(st->inverse) { if(st->inverse) {
Fout1->r = scratch[5].r - scratch[4].i; Fout[m].r = scratch[5].r - scratch[4].i;
Fout1->i = scratch[5].i + scratch[4].r; Fout[m].i = scratch[5].i + scratch[4].r;
Fout3->r = scratch[5].r + scratch[4].i; Fout[m3].r = scratch[5].r + scratch[4].i;
Fout3->i = scratch[5].i - scratch[4].r; Fout[m3].i = scratch[5].i - scratch[4].r;
}else{ }else{
Fout1->r = scratch[5].r + scratch[4].i; Fout[m].r = scratch[5].r + scratch[4].i;
Fout1->i = scratch[5].i - scratch[4].r; Fout[m].i = scratch[5].i - scratch[4].r;
Fout3->r = scratch[5].r - scratch[4].i; Fout[m3].r = scratch[5].r - scratch[4].i;
Fout3->i = scratch[5].i + scratch[4].r; Fout[m3].i = scratch[5].i + scratch[4].r;
} }
++Fout; ++Fout1; ++Fout2; ++Fout3; ++Fout;
}while(--m); }while(--k);
} }
static void kf_bfly3( static void kf_bfly3(