mirror of
https://github.com/mborgerding/kissfft.git
synced 2025-05-27 21:20:27 -04:00
openmp directives
This commit is contained in:
parent
262fe2297b
commit
3df04c8671
@ -1,3 +1,8 @@
|
||||
1.2.8
|
||||
Changed memory.h to string.h -- apparently more standard
|
||||
|
||||
Added OpenMP extensions. These can give roughly linear speedups (relative to the number of cores used) for larger FFT sizes.
|
||||
|
||||
1.2.7
|
||||
Shrank the real-fft memory footprint. Thanks to Galen Seitz.
|
||||
|
||||
|
12
Makefile
12
Makefile
@ -1,14 +1,18 @@
|
||||
KFVER=1_2_7
|
||||
KFVER=1_2_8
|
||||
|
||||
DISTDIR=kiss_fft_v$(KFVER)
|
||||
TARBALL=kiss_fft_v$(KFVER).tar.gz
|
||||
ZIPFILE=kiss_fft_v$(KFVER).zip
|
||||
|
||||
doc:
|
||||
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
|
||||
@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
|
||||
|
||||
testall:
|
||||
# The simd and int32_t types may or may not work on your machine
|
||||
#export DATATYPE=simd && cd test && make test
|
||||
#export DATATYPE=int32_t && cd test && make test
|
||||
#export DATATYPE=int16_t && cd test && make test
|
||||
export DATATYPE=simd && cd test && make test
|
||||
export DATATYPE=int32_t && cd test && make test
|
||||
export DATATYPE=int16_t && cd test && make test
|
||||
export DATATYPE=float && cd test && make test
|
||||
export DATATYPE=double && cd test && make test
|
||||
|
||||
|
2
README
2
README
@ -99,7 +99,7 @@ UNDER THE HOOD:
|
||||
|
||||
Optimized butterflies are used for factors 2,3,4, and 5.
|
||||
|
||||
The real optimization code only works for even length ffts. It does two half-length
|
||||
The real (i.e. not complex) optimization code only works for even-length FFTs. It does two half-length
|
||||
FFTs in parallel (packed into real&imag), and then combines them via twiddling.
|
||||
|
||||
The fast convolution filtering uses the overlap-scrap method, slightly
|
||||
|
6
TIPS
6
TIPS
@ -1,4 +1,8 @@
|
||||
Speed:
|
||||
* If you want to use multiple cores, then compile with -openmp or -fopenmp (see your compiler docs).
|
||||
Note that larger FFTs will benefit more than smaller FFTs. Using multiple cores generally consumes more total CPU time, but
|
||||
less wall time.
|
||||
|
||||
* experiment with compiler flags
|
||||
Special thanks to Oscar Lesta. He suggested some compiler flags
|
||||
for gcc that make a big difference. They shave 10-15% off
|
||||
@ -12,7 +16,7 @@ Speed:
|
||||
|
||||
* If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
|
||||
then you might want to experiment with the USE_SIMD code.
|
||||
|
||||
|
||||
|
||||
Reducing code size:
|
||||
* remove some of the butterflies. There are currently butterflies optimized for radices
|
||||
|
23
kiss_fft.c
23
kiss_fft.c
@ -262,6 +262,29 @@ void kf_work(
|
||||
const int m=*factors++; /* stage's fft length/p */
|
||||
const kiss_fft_cpx * Fout_end = Fout + p*m;
|
||||
|
||||
#ifdef _OPENMP
|
||||
// use openmp extensions at the
|
||||
// top-level (not recursive)
|
||||
if (fstride==1) {
|
||||
int k;
|
||||
|
||||
// execute the p different work units in different threads
|
||||
# pragma omp parallel for
|
||||
for (k=0;k<p;++k)
|
||||
kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
|
||||
// all threads have joined by this point
|
||||
|
||||
switch (p) {
|
||||
case 2: kf_bfly2(Fout,fstride,st,m); break;
|
||||
case 3: kf_bfly3(Fout,fstride,st,m); break;
|
||||
case 4: kf_bfly4(Fout,fstride,st,m); break;
|
||||
case 5: kf_bfly5(Fout,fstride,st,m); break;
|
||||
default: kf_bfly_generic(Fout,fstride,st,m,p); break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (m==1) {
|
||||
do{
|
||||
*Fout = *f;
|
||||
|
@ -3,6 +3,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
|
||||
-Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
|
||||
-Wwrite-strings
|
||||
|
||||
# For x86 Pentium+ machines, these flags work well
|
||||
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||
# If the above flags do not work, try the following
|
||||
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||
# TIP: try adding -openmp or -fopenmp to CFLAGS to enable OpenMP directives and make use of multiple cores
|
||||
|
||||
|
||||
ifeq "$(NFFT)" ""
|
||||
NFFT=1800
|
||||
@ -55,10 +61,6 @@ all: tools $(BENCHKISS) $(SELFTEST) $(BENCHFFTW) $(TESTREAL) $(TESTKFC)
|
||||
tools:
|
||||
cd ../tools && make all
|
||||
|
||||
# For x86 Pentium+ machines, these flags work well
|
||||
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||
# If the above flags do not work, try the following
|
||||
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||
|
||||
$(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
|
||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||
|
@ -35,7 +35,8 @@ all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
|
||||
|
||||
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
|
||||
# If the above flags do not work, try the following
|
||||
CFLAGS=-Wall -O3 $(WARNINGS)
|
||||
CFLAGS=-Wall -O3 $(WARNINGS)
|
||||
# tip: try -openmp or -fopenmp to use multiple cores
|
||||
|
||||
$(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
|
||||
$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL
|
||||
|
Loading…
Reference in New Issue
Block a user