diff --git a/CHANGELOG b/CHANGELOG index 517de23..caf6226 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,8 @@ +1.2.8 + Changed memory.h to string.h -- apparently more standard + + Added openmp extensions. This can have fairly linear speedups for larger FFT sizes. + 1.2.7 Shrank the real-fft memory footprint. Thanks to Galen Seitz. diff --git a/Makefile b/Makefile index e7f2131..b272b40 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,18 @@ -KFVER=1_2_7 +KFVER=1_2_8 DISTDIR=kiss_fft_v$(KFVER) TARBALL=kiss_fft_v$(KFVER).tar.gz ZIPFILE=kiss_fft_v$(KFVER).zip +doc: + @echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'" + @echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not" + testall: # The simd and int32_t types may or may not work on your machine - #export DATATYPE=simd && cd test && make test - #export DATATYPE=int32_t && cd test && make test - #export DATATYPE=int16_t && cd test && make test + export DATATYPE=simd && cd test && make test + export DATATYPE=int32_t && cd test && make test + export DATATYPE=int16_t && cd test && make test export DATATYPE=float && cd test && make test export DATATYPE=double && cd test && make test diff --git a/README b/README index a5941ec..e1ce587 100644 --- a/README +++ b/README @@ -99,7 +99,7 @@ UNDER THE HOOD: Optimized butterflies are used for factors 2,3,4, and 5. - The real optimization code only works for even length ffts. It does two half-length + The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length FFTs in parallel (packed into real&imag), and then combines them via twiddling. The fast convolution filtering uses the overlap-scrap method, slightly diff --git a/TIPS b/TIPS index c02f0c4..dc439dc 100644 --- a/TIPS +++ b/TIPS @@ -1,4 +1,8 @@ Speed: + * If you want to use multiple cores, then compile with -openmp or -fopenmp (see your compiler docs). 
+ Realize that larger FFTs will reap more benefit than smaller FFTs. Running multithreaded generally uses more total CPU time, but + less wall time. + * experiment with compiler flags Special thanks to Oscar Lesta. He suggested some compiler flags for gcc that make a big difference. They shave 10-15% off @@ -12,7 +16,7 @@ Speed: * If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine, then you might want to experiment with the USE_SIMD code. - + Reducing code size: * remove some of the butterflies. There are currently butterflies optimized for radices diff --git a/kiss_fft.c b/kiss_fft.c index c6b2807..fb048e7 100644 --- a/kiss_fft.c +++ b/kiss_fft.c @@ -262,6 +262,29 @@ void kf_work( const int m=*factors++; /* stage's fft length/p */ const kiss_fft_cpx * Fout_end = Fout + p*m; +#ifdef _OPENMP + // use openmp extensions at the + // top-level (not recursive) + if (fstride==1) { + int k; + + // execute the p different work units in different threads +# pragma omp parallel for + for (k=0;k