openmp directives

This commit is contained in:
Mark Borgerding 2008-08-22 21:43:25 +00:00
parent 262fe2297b
commit 3df04c8671
7 changed files with 50 additions and 11 deletions

View File

@ -1,3 +1,8 @@
1.2.8
Changed memory.h to string.h -- apparently more standard
Added OpenMP extensions. These can give nearly linear speedups for larger FFT sizes.
1.2.7 1.2.7
Shrank the real-fft memory footprint. Thanks to Galen Seitz. Shrank the real-fft memory footprint. Thanks to Galen Seitz.

View File

@ -1,14 +1,18 @@
KFVER=1_2_7 KFVER=1_2_8
DISTDIR=kiss_fft_v$(KFVER) DISTDIR=kiss_fft_v$(KFVER)
TARBALL=kiss_fft_v$(KFVER).tar.gz TARBALL=kiss_fft_v$(KFVER).tar.gz
ZIPFILE=kiss_fft_v$(KFVER).zip ZIPFILE=kiss_fft_v$(KFVER).zip
# doc: informational target only -- prints a pointer to the README and to the
# heavier 'testall' target; it builds nothing.
# NOTE(review): this appears to be the first rule in the file, which would make
# it the default goal for a bare 'make' -- confirm against the full Makefile.
doc:
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
testall: testall:
# The simd and int32_t types may or may not work on your machine # The simd and int32_t types may or may not work on your machine
#export DATATYPE=simd && cd test && make test export DATATYPE=simd && cd test && make test
#export DATATYPE=int32_t && cd test && make test export DATATYPE=int32_t && cd test && make test
#export DATATYPE=int16_t && cd test && make test export DATATYPE=int16_t && cd test && make test
export DATATYPE=float && cd test && make test export DATATYPE=float && cd test && make test
export DATATYPE=double && cd test && make test export DATATYPE=double && cd test && make test

2
README
View File

@ -99,7 +99,7 @@ UNDER THE HOOD:
Optimized butterflies are used for factors 2,3,4, and 5. Optimized butterflies are used for factors 2,3,4, and 5.
The real optimization code only works for even length ffts. It does two half-length The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length
FFTs in parallel (packed into real&imag), and then combines them via twiddling. FFTs in parallel (packed into real&imag), and then combines them via twiddling.
The fast convolution filtering uses the overlap-scrap method, slightly The fast convolution filtering uses the overlap-scrap method, slightly

6
TIPS
View File

@ -1,4 +1,8 @@
Speed: Speed:
* If you want to use multiple cores, then compile with OpenMP enabled, e.g. -openmp or -fopenmp (see your compiler docs).
Larger FFTs will reap more benefit than smaller FFTs. Expect this to use more total CPU time, but
less wall-clock time.
* experiment with compiler flags * experiment with compiler flags
Special thanks to Oscar Lesta. He suggested some compiler flags Special thanks to Oscar Lesta. He suggested some compiler flags
for gcc that make a big difference. They shave 10-15% off for gcc that make a big difference. They shave 10-15% off
@ -12,7 +16,7 @@ Speed:
* If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine, * If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
then you might want to experiment with the USE_SIMD code. then you might want to experiment with the USE_SIMD code.
Reducing code size: Reducing code size:
* remove some of the butterflies. There are currently butterflies optimized for radices * remove some of the butterflies. There are currently butterflies optimized for radices

View File

@ -262,6 +262,29 @@ void kf_work(
const int m=*factors++; /* stage's fft length/p */ const int m=*factors++; /* stage's fft length/p */
const kiss_fft_cpx * Fout_end = Fout + p*m; const kiss_fft_cpx * Fout_end = Fout + p*m;
#ifdef _OPENMP
    /* Use the OpenMP extensions only at the top level of the recursion
     * (fstride==1); the nested kf_work calls made from here pass fstride*p
     * and therefore never re-enter this branch.
     *
     * The m!=1 guard keeps the last stage out of this path: the serial code
     * following this block treats m==1 as a leaf (plain copy, no recursion),
     * whereas recursing here would hand kf_work a `factors` pointer that has
     * already been advanced past the (p,m) pair for this stage, presumably
     * the final one when m==1.  With the guard, that case simply falls
     * through to the serial code. */
    if (fstride==1 && m!=1) {
        int k;

        /* execute the p different work units in different threads */
#       pragma omp parallel for
        for (k=0;k<p;++k)
            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
        /* all threads have joined by this point */

        /* apply the butterfly matching radix p to the p sub-results */
        switch (p) {
            case 2: kf_bfly2(Fout,fstride,st,m); break;
            case 3: kf_bfly3(Fout,fstride,st,m); break;
            case 4: kf_bfly4(Fout,fstride,st,m); break;
            case 5: kf_bfly5(Fout,fstride,st,m); break;
            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
        }
        return;
    }
#endif
if (m==1) { if (m==1) {
do{ do{
*Fout = *f; *Fout = *f;

View File

@ -3,6 +3,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
-Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \ -Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
-Wwrite-strings -Wwrite-strings
# for x86 pentium+ machines , these flags work well
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# If the above flags do not work, try the following
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# TIP: try adding -openmp or -fopenmp (depending on your compiler) to enable the OpenMP directives and use multiple cores
ifeq "$(NFFT)" "" ifeq "$(NFFT)" ""
NFFT=1800 NFFT=1800
@ -55,10 +61,6 @@ all: tools $(BENCHKISS) $(SELFTEST) $(BENCHFFTW) $(TESTREAL) $(TESTKFC)
tools: tools:
cd ../tools && make all cd ../tools && make all
# for x86 pentium+ machines , these flags work well
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
# If the above flags do not work, try the following
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
$(SELFTEST): $(SELFTESTSRC) $(SRCFILES) $(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+ $(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+

View File

@ -35,7 +35,8 @@ all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS) #CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
# If the above flags do not work, try the following # If the above flags do not work, try the following
CFLAGS=-Wall -O3 $(WARNINGS) CFLAGS=-Wall -O3 $(WARNINGS)
# TIP: try adding -openmp or -fopenmp (depending on your compiler) to use multiple cores
$(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c $(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL $(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL