mirror of
https://github.com/mborgerding/kissfft.git
synced 2025-05-27 21:20:27 -04:00
openmp directives
This commit is contained in:
parent
262fe2297b
commit
3df04c8671
@ -1,3 +1,8 @@
|
|||||||
|
1.2.8
|
||||||
|
Changed memory.h to string.h -- apparently more standard
|
||||||
|
|
||||||
|
Added OpenMP extensions. These can give fairly linear speedups for larger FFT sizes.
|
||||||
|
|
||||||
1.2.7
|
1.2.7
|
||||||
Shrank the real-fft memory footprint. Thanks to Galen Seitz.
|
Shrank the real-fft memory footprint. Thanks to Galen Seitz.
|
||||||
|
|
||||||
|
12
Makefile
12
Makefile
@ -1,14 +1,18 @@
|
|||||||
KFVER=1_2_7
|
KFVER=1_2_8
|
||||||
|
|
||||||
DISTDIR=kiss_fft_v$(KFVER)
|
DISTDIR=kiss_fft_v$(KFVER)
|
||||||
TARBALL=kiss_fft_v$(KFVER).tar.gz
|
TARBALL=kiss_fft_v$(KFVER).tar.gz
|
||||||
ZIPFILE=kiss_fft_v$(KFVER).zip
|
ZIPFILE=kiss_fft_v$(KFVER).zip
|
||||||
|
|
||||||
|
doc:
|
||||||
|
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
|
||||||
|
@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
|
||||||
|
|
||||||
testall:
|
testall:
|
||||||
# The simd and int32_t types may or may not work on your machine
|
# The simd and int32_t types may or may not work on your machine
|
||||||
#export DATATYPE=simd && cd test && make test
|
export DATATYPE=simd && cd test && make test
|
||||||
#export DATATYPE=int32_t && cd test && make test
|
export DATATYPE=int32_t && cd test && make test
|
||||||
#export DATATYPE=int16_t && cd test && make test
|
export DATATYPE=int16_t && cd test && make test
|
||||||
export DATATYPE=float && cd test && make test
|
export DATATYPE=float && cd test && make test
|
||||||
export DATATYPE=double && cd test && make test
|
export DATATYPE=double && cd test && make test
|
||||||
|
|
||||||
|
2
README
2
README
@ -99,7 +99,7 @@ UNDER THE HOOD:
|
|||||||
|
|
||||||
Optimized butterflies are used for factors 2,3,4, and 5.
|
Optimized butterflies are used for factors 2,3,4, and 5.
|
||||||
|
|
||||||
The real optimization code only works for even length ffts. It does two half-length
|
The real (i.e. not complex) optimization code only works for even-length FFTs. It does two half-length
|
||||||
FFTs in parallel (packed into real&imag), and then combines them via twiddling.
|
FFTs in parallel (packed into real&imag), and then combines them via twiddling.
|
||||||
|
|
||||||
The fast convolution filtering uses the overlap-scrap method, slightly
|
The fast convolution filtering uses the overlap-scrap method, slightly
|
||||||
|
6
TIPS
6
TIPS
@ -1,4 +1,8 @@
|
|||||||
Speed:
|
Speed:
|
||||||
|
* If you want to use multiple cores, then compile with -openmp or -fopenmp (see your compiler docs).
|
||||||
|
Note that larger FFTs will benefit more than smaller FFTs. Using multiple cores generally costs more total CPU time, but
|
||||||
|
less wall time.
|
||||||
|
|
||||||
* experiment with compiler flags
|
* experiment with compiler flags
|
||||||
Special thanks to Oscar Lesta. He suggested some compiler flags
|
Special thanks to Oscar Lesta. He suggested some compiler flags
|
||||||
for gcc that make a big difference. They shave 10-15% off
|
for gcc that make a big difference. They shave 10-15% off
|
||||||
@ -12,7 +16,7 @@ Speed:
|
|||||||
|
|
||||||
* If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
|
* If you can rearrange your code to do 4 FFTs in parallel and you are on a recent Intel or AMD machine,
|
||||||
then you might want to experiment with the USE_SIMD code.
|
then you might want to experiment with the USE_SIMD code.
|
||||||
|
|
||||||
|
|
||||||
Reducing code size:
|
Reducing code size:
|
||||||
* remove some of the butterflies. There are currently butterflies optimized for radices
|
* remove some of the butterflies. There are currently butterflies optimized for radices
|
||||||
|
23
kiss_fft.c
23
kiss_fft.c
@ -262,6 +262,29 @@ void kf_work(
|
|||||||
const int m=*factors++; /* stage's fft length/p */
|
const int m=*factors++; /* stage's fft length/p */
|
||||||
const kiss_fft_cpx * Fout_end = Fout + p*m;
|
const kiss_fft_cpx * Fout_end = Fout + p*m;
|
||||||
|
|
||||||
|
#ifdef _OPENMP
|
||||||
|
// use openmp extensions at the
|
||||||
|
// top-level (not recursive)
|
||||||
|
if (fstride==1) {
|
||||||
|
int k;
|
||||||
|
|
||||||
|
// execute the p different work units in different threads
|
||||||
|
# pragma omp parallel for
|
||||||
|
for (k=0;k<p;++k)
|
||||||
|
kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
|
||||||
|
// all threads have joined by this point
|
||||||
|
|
||||||
|
switch (p) {
|
||||||
|
case 2: kf_bfly2(Fout,fstride,st,m); break;
|
||||||
|
case 3: kf_bfly3(Fout,fstride,st,m); break;
|
||||||
|
case 4: kf_bfly4(Fout,fstride,st,m); break;
|
||||||
|
case 5: kf_bfly5(Fout,fstride,st,m); break;
|
||||||
|
default: kf_bfly_generic(Fout,fstride,st,m,p); break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if (m==1) {
|
if (m==1) {
|
||||||
do{
|
do{
|
||||||
*Fout = *f;
|
*Fout = *f;
|
||||||
|
@ -3,6 +3,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
|
|||||||
-Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
|
-Wcast-align -Wcast-qual -Wnested-externs -Wshadow -Wbad-function-cast \
|
||||||
-Wwrite-strings
|
-Wwrite-strings
|
||||||
|
|
||||||
|
# for x86 pentium+ machines , these flags work well
|
||||||
|
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||||
|
# If the above flags do not work, try the following
|
||||||
|
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
||||||
|
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
|
||||||
|
|
||||||
|
|
||||||
ifeq "$(NFFT)" ""
|
ifeq "$(NFFT)" ""
|
||||||
NFFT=1800
|
NFFT=1800
|
||||||
@ -55,10 +61,6 @@ all: tools $(BENCHKISS) $(SELFTEST) $(BENCHFFTW) $(TESTREAL) $(TESTKFC)
|
|||||||
tools:
|
tools:
|
||||||
cd ../tools && make all
|
cd ../tools && make all
|
||||||
|
|
||||||
# for x86 pentium+ machines , these flags work well
|
|
||||||
#CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
|
||||||
# If the above flags do not work, try the following
|
|
||||||
CFLAGS=-O3 -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
|
||||||
|
|
||||||
$(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
|
$(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
|
||||||
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
|
||||||
|
@ -35,7 +35,8 @@ all: $(FFTUTIL) $(FASTFILT) $(FASTFILTREAL)
|
|||||||
|
|
||||||
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
|
#CFLAGS=-Wall -O3 -pedantic -march=pentiumpro -ffast-math -fomit-frame-pointer $(WARNINGS)
|
||||||
# If the above flags do not work, try the following
|
# If the above flags do not work, try the following
|
||||||
CFLAGS=-Wall -O3 $(WARNINGS)
|
CFLAGS=-Wall -O3 $(WARNINGS)
|
||||||
|
# tip: try -openmp or -fopenmp to use multiple cores
|
||||||
|
|
||||||
$(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
|
$(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
|
||||||
$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL
|
$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL
|
||||||
|
Loading…
Reference in New Issue
Block a user