openmp directives

This commit is contained in:
Mark Borgerding 2008-08-22 21:43:25 +00:00
parent 262fe2297b
commit 3df04c8671
7 changed files with 50 additions and 11 deletions

View File

@ -1,3 +1,8 @@
1.2.8
Changed memory.h to string.h -- apparently more standard
Added OpenMP extensions. These can give fairly linear speedups for larger FFT sizes.
1.2.7
Shrank the real-fft memory footprint. Thanks to Galen Seitz.

View File

@ -1,14 +1,18 @@
KFVER=1_2_7
KFVER=1_2_8
DISTDIR=kiss_fft_v$(KFVER)
TARBALL=kiss_fft_v$(KFVER).tar.gz
ZIPFILE=kiss_fft_v$(KFVER).zip
doc:
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
testall:
# The simd and int32_t types may or may not work on your machine
#export DATATYPE=simd && cd test && make test
#export DATATYPE=int32_t && cd test && make test
#export DATATYPE=int16_t && cd test && make test
export DATATYPE=simd && cd test && make test
export DATATYPE=int32_t && cd test && make test
export DATATYPE=int16_t && cd test && make test
export DATATYPE=float && cd test && make test
export DATATYPE=double && cd test && make test

2
README
View File

@ -99,7 +99,7 @@ UNDER THE HOOD:
Optimized butterflies are used for factors 2,3,4, and 5.
The real optimization code only works for even length ffts. It does two half-length
The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length
FFTs in parallel (packed into real&imag), and then combines them via twiddling.
The fast convolution filtering uses the overlap-scrap method, slightly

6
TIPS
View File

@ -1,4 +1,8 @@
Speed:
* If you want to use multiple cores, then compile with -openmp or -fopenmp (see your compiler docs).
Note that larger FFTs will benefit more than smaller FFTs. Using multiple cores generally costs more
total CPU time, but less wall-clock time.
* experiment with compiler flags
Special thanks to Oscar Lesta. He suggested some compiler flags
for gcc that make a big difference. They shave 10-15% off
@ -12,7 +16,7 @@ Speed:
* If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
then you might want to experiment with the USE_SIMD code.
Reducing code size:
* remove some of the butterflies. There are currently butterflies optimized for radices

View File

@ -262,6 +262,29 @@ void kf_work(
const int m=*factors++; /* stage's fft length/p */
const kiss_fft_cpx * Fout_end = Fout + p*m;
#ifdef _OPENMP
    /* Use the OpenMP extensions only at the top level of the recursion
     * (fstride==1); the recursive calls below are made with fstride*p.
     *
     * The m!=1 test keeps single-stage transforms on the serial path that
     * follows this block: when m==1 that path copies the input samples
     * instead of recursing, because `factors` has already been advanced past
     * this stage's (p,m) pair and there is no further stage to descend into. */
    if (fstride==1 && m!=1) {
        int k;

        /* Execute the p different work units in different threads; each call
         * is handed its own output offset (Fout + k*m). */
#       pragma omp parallel for
        for (k=0;k<p;++k)
            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
        /* all threads have joined by this point */

        /* Combine the p sub-transforms with the butterfly for this radix. */
        switch (p) {
            case 2: kf_bfly2(Fout,fstride,st,m); break;
            case 3: kf_bfly3(Fout,fstride,st,m); break;
            case 4: kf_bfly4(Fout,fstride,st,m); break;
            case 5: kf_bfly5(Fout,fstride,st,m); break;
            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
        }
        return;
    }
#endif
if (m==1) {
do{
*Fout = *f;

View File

@ -3,6 +3,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
-Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
-Wwrite-strings
# for x86 pentium+ machines , these flags work well
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# If the above flags do not work, try the following
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
ifeq "$(NFFT)" ""
NFFT=1800
@ -55,10 +61,6 @@ all: tools $(BENCHKISS) $(SELFTEST) $(BENCHFFTW) $(TESTREAL) $(TESTKFC)
# Build the helper programs in ../tools before anything that depends on them.
# $(MAKE) is used instead of a literal `make` so that command-line flags
# (-j, -n, variable overrides) are passed down to the sub-make.
.PHONY: tools
tools:
	cd ../tools && $(MAKE) all
# for x86 pentium+ machines , these flags work well
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# If the above flags do not work, try the following
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# Compile and link the self-test program from its sources in one step.
# -lm is listed after the sources ($+): linkers resolve libraries left to
# right, so a library named before the objects that need it may be dropped
# (e.g. with --as-needed), leaving libm symbols unresolved.
$(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) $+ -lm

View File

@ -35,7 +35,8 @@ all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
# If the above flags do not work, try the following
CFLAGS=-Wall -O3 $(WARNINGS)
CFLAGS=-Wall -O3 $(WARNINGS)
# tip: try -openmp or -fopenmp to use multiple cores
# Build the real-input fast-FIR filter utility (REAL_FASTFIR and
# FAST_FILT_UTIL are defined for this build).
# -lm is placed last, after the sources ($+): linkers resolve libraries left
# to right, so naming libm before the code that uses it can leave its symbols
# unresolved (e.g. with --as-needed).
$(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR $+ -DFAST_FILT_UTIL -lm