made threadsafe

This commit is contained in:
Mark Borgerding 2010-05-27 22:54:01 -04:00
parent 583019e074
commit 57925fd126
9 changed files with 70 additions and 68 deletions

8
.hgignore Normal file
View File

@ -0,0 +1,8 @@
syntax:glob
test/bm_*
test/st_*
test/tkfc_*
test/tr_*
tools/fastconv_*
tools/fastconvr_*
tools/fft_*

View File

@ -1,8 +1,4 @@
KFVER=1_2_9 KFVER=129
DISTDIR=kiss_fft_v$(KFVER)
TARBALL=kiss_fft_v$(KFVER).tar.gz
ZIPFILE=kiss_fft_v$(KFVER).zip
doc: doc:
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'" @echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
@ -10,36 +6,25 @@ doc:
testall: testall:
# The simd and int32_t types may or may not work on your machine # The simd and int32_t types may or may not work on your machine
export DATATYPE=simd && cd test && make test make -C test DATATYPE=simd CFLAGADD="$(CFLAGADD)" test
export DATATYPE=int32_t && cd test && make test make -C test DATATYPE=int32_t CFLAGADD="$(CFLAGADD)" test
export DATATYPE=int16_t && cd test && make test make -C test DATATYPE=int16_t CFLAGADD="$(CFLAGADD)" test
export DATATYPE=float && cd test && make test make -C test DATATYPE=float CFLAGADD="$(CFLAGADD)" test
export DATATYPE=double && cd test && make test make -C test DATATYPE=double CFLAGADD="$(CFLAGADD)" test
tarball: clean tarball: clean
tar --exclude CVS --exclude .cvsignore --exclude $(TARBALL) -cvzf $(TARBALL) . hg archive -r v$(KFVER) -t tgz kiss_fft$(KFVER).tar.gz
hg archive -r v$(KFVER) -t zip kiss_fft$(KFVER).zip
clean: clean:
cd test && make clean cd test && make clean
cd tools && make clean cd tools && make clean
rm -f kiss_fft*.tar.gz *~ *.pyc kiss_fft*.zip rm -f kiss_fft*.tar.gz *~ *.pyc kiss_fft*.zip
rm -rf $(DISTDIR)
dist: tarball
mkdir $(DISTDIR)
cd $(DISTDIR) && tar -zxf ../$(TARBALL)
rm $(TARBALL)
tar -czf $(TARBALL) $(DISTDIR)
zip -r $(ZIPFILE) $(DISTDIR)
rm -rf $(DISTDIR)
upload: dist
ncftpput upload.sourceforge.net incoming $(ZIPFILE) $(TARBALL)
asm: kiss_fft.s asm: kiss_fft.s
kiss_fft.s: kiss_fft.c kiss_fft.h _kiss_fft_guts.h kiss_fft.s: kiss_fft.c kiss_fft.h _kiss_fft_guts.h
[ -e kiss_fft.s ] && mv kiss_fft.s kiss_fft.s~ || true [ -e kiss_fft.s ] && mv kiss_fft.s kiss_fft.s~ || true
gcc -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm gcc -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -unroll-loops -dA -fverbose-asm
gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT
[ -e kiss_fft.s~ ] && diff kiss_fft.s~ kiss_fft.s || true [ -e kiss_fft.s~ ] && diff kiss_fft.s~ kiss_fft.s || true

22
README
View File

@ -36,8 +36,8 @@ Code definitions for 1d complex FFTs are in kiss_fft.c.
You can do other cool stuff with the extras you'll find in tools/ You can do other cool stuff with the extras you'll find in tools/
* multi-dimensional FFTs * multi-dimensional FFTs
* real-optimized FFTs * real-optimized FFTs (returns the positive half-spectrum: (nfft/2+1) complex frequency bins)
* fast convolution FIR filtering * fast convolution FIR filtering (not available for fixed point)
* spectrum image creation * spectrum image creation
The core fft and most tools/ code can be compiled to use float, double The core fft and most tools/ code can be compiled to use float, double
@ -59,7 +59,7 @@ During this process, I learned:
1. FFT_BRANDX has more than 100K lines of code. The core of kiss_fft is about 500 lines (cpx 1-d). 1. FFT_BRANDX has more than 100K lines of code. The core of kiss_fft is about 500 lines (cpx 1-d).
2. It took me an embarrassingly long time to get FFT_BRANDX working. 2. It took me an embarrassingly long time to get FFT_BRANDX working.
3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB. 3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB (without optimizing for size).
4. FFT_BRANDX is roughly twice as fast as KISS FFT in default mode. 4. FFT_BRANDX is roughly twice as fast as KISS FFT in default mode.
It is wonderful that free, highly optimized libraries like FFT_BRANDX exist. It is wonderful that free, highly optimized libraries like FFT_BRANDX exist.
@ -78,6 +78,11 @@ FREQUENTLY ASKED QUESTIONS:
2) mixed build environment -- all code must be compiled with same preprocessor 2) mixed build environment -- all code must be compiled with same preprocessor
definitions for FIXED_POINT and kiss_fft_scalar definitions for FIXED_POINT and kiss_fft_scalar
Q: Will you write/debug my code for me?
A: Probably not unless you pay me. I am happy to answer pointed and topical questions, but
I may refer you to a book, a forum, or some other resource.
PERFORMANCE: PERFORMANCE:
(on Athlon XP 2100+, with gcc 2.96, float data type) (on Athlon XP 2100+, with gcc 2.96, float data type)
@ -92,7 +97,10 @@ DO NOT:
UNDER THE HOOD: UNDER THE HOOD:
Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. If you give it an input buffer
and output buffer that are the same, a temporary buffer will be created to hold the data.
No static data is used. The core routines of kiss_fft are thread-safe (but not all of the tools directory).
No scaling is done for the floating point version (for speed). No scaling is done for the floating point version (for speed).
Scaling is done both ways for the fixed-point version (for overflow prevention). Scaling is done both ways for the fixed-point version (for overflow prevention).
@ -100,7 +108,8 @@ UNDER THE HOOD:
Optimized butterflies are used for factors 2,3,4, and 5. Optimized butterflies are used for factors 2,3,4, and 5.
The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length
FFTs in parallel (packed into real&imag), and then combines them via twiddling. FFTs in parallel (packed into real&imag), and then combines them via twiddling. The result is
nfft/2+1 complex frequency bins from DC to Nyquist. If you don't know what this means, search the web.
The fast convolution filtering uses the overlap-scrap method, slightly The fast convolution filtering uses the overlap-scrap method, slightly
modified to put the scrap at the tail. modified to put the scrap at the tail.
@ -111,6 +120,9 @@ LICENSE:
Note this license is compatible with GPL at one end of the spectrum and closed, commercial software at Note this license is compatible with GPL at one end of the spectrum and closed, commercial software at
the other end. See http://www.fsf.org/licensing/licenses the other end. See http://www.fsf.org/licensing/licenses
A commercial license is available which removes the requirement for attribution. Contact me for details.
TODO: TODO:
*) Add real optimization for odd length FFTs *) Add real optimization for odd length FFTs
*) Document/revisit the input/output fft scaling *) Document/revisit the input/output fft scaling

View File

@ -148,3 +148,17 @@ struct kiss_fft_state{
/* a debugging function */ /* a debugging function */
#define pcpx(c)\ #define pcpx(c)\
fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) ) fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
#ifdef KISS_FFT_USE_ALLOCA
// Define KISS_FFT_USE_ALLOCA to allow use of alloca instead of malloc for temporary buffers.
// Temporary buffers are used in two cases:
// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform.
#include <alloca.h>
#define KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes)
// With alloca there is nothing to release explicitly, so this expands to nothing.
#define KISS_FFT_TMP_FREE(ptr)
#else
// Default: temporary buffers go through the regular KISS_FFT_MALLOC / KISS_FFT_FREE macros.
#define KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes)
#define KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr)
#endif

View File

@ -1,5 +1,5 @@
/* /*
Copyright (c) 2003-2004, Mark Borgerding Copyright (c) 2003-2010, Mark Borgerding
All rights reserved. All rights reserved.
@ -14,27 +14,10 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#include "_kiss_fft_guts.h" #include "_kiss_fft_guts.h"
/* The guts header contains all the multiplication and addition macros that are defined for /* The guts header contains all the multiplication and addition macros that are defined for
fixed or floating point complex numbers. It also declares the kf_ internal functions. fixed or floating point complex numbers. It also declares the kf_ internal functions.
*/ */
static kiss_fft_cpx *scratchbuf=NULL;
static size_t nscratchbuf=0;
static kiss_fft_cpx *tmpbuf=NULL;
static size_t ntmpbuf=0;
#define CHECKBUF(buf,nbuf,n) \
do { \
if ( nbuf < (size_t)(n) ) {\
free(buf); \
buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
nbuf = (size_t)(n); \
} \
}while(0)
static void kf_bfly2( static void kf_bfly2(
kiss_fft_cpx * Fout, kiss_fft_cpx * Fout,
const size_t fstride, const size_t fstride,
@ -225,7 +208,7 @@ static void kf_bfly_generic(
kiss_fft_cpx t; kiss_fft_cpx t;
int Norig = st->nfft; int Norig = st->nfft;
kiss_fft_cpx * scratch = (kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx)*p); kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
for ( u=0; u<m; ++u ) { for ( u=0; u<m; ++u ) {
k=u; k=u;
@ -248,7 +231,7 @@ static void kf_bfly_generic(
k += m; k += m;
} }
} }
free(scratch); KISS_FFT_TMP_FREE(scratch);
} }
static static
@ -385,14 +368,15 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
} }
void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride) void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
{ {
if (fin == fout) { if (fin == fout) {
CHECKBUF(tmpbuf,ntmpbuf,st->nfft); //NOTE: this is not really an in-place FFT algorithm.
//It just performs an out-of-place FFT into a temp buffer
kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
kf_work(tmpbuf,fin,1,in_stride, st->factors,st); kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft); memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
KISS_FFT_TMP_FREE(tmpbuf);
}else{ }else{
kf_work( fout, fin, 1,in_stride, st->factors,st ); kf_work( fout, fin, 1,in_stride, st->factors,st );
} }
@ -404,17 +388,9 @@ void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
} }
/* not really necessary to call, but if someone is doing in-place ffts, they may want to free the
buffers from CHECKBUF
*/
void kiss_fft_cleanup(void) void kiss_fft_cleanup(void)
{ {
free(scratchbuf); // nothing needed any more
scratchbuf = NULL;
nscratchbuf=0;
free(tmpbuf);
tmpbuf=NULL;
ntmpbuf=0;
} }
int kiss_fft_next_fast_size(int n) int kiss_fft_next_fast_size(int n)

View File

@ -27,11 +27,11 @@ extern "C" {
#ifdef USE_SIMD #ifdef USE_SIMD
# include <xmmintrin.h> # include <xmmintrin.h>
# define kiss_fft_scalar __m128 # define kiss_fft_scalar __m128
//#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
#define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16) #define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16)
#define KISS_FFT_FREE _mm_free
#else #else
#define KISS_FFT_MALLOC malloc #define KISS_FFT_MALLOC malloc
#define KISS_FFT_FREE free
#endif #endif

View File

@ -5,10 +5,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
CFLAGS=-O3 -I.. -I../tools $(WARNINGS) CFLAGS=-O3 -I.. -I../tools $(WARNINGS)
CFLAGS+=-ffast-math -fomit-frame-pointer CFLAGS+=-ffast-math -fomit-frame-pointer
CFLAGS+=-march=prescott #CFLAGS+=-funroll-loops
#CFLAGS+= -mtune=native #CFLAGS+=-march=prescott
CFLAGS+= -mtune=native
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores # TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
CFLAGS+=-fopenmp #CFLAGS+=-fopenmp
CFLAGS+= $(CFLAGADD)
ifeq "$(NFFT)" "" ifeq "$(NFFT)" ""

View File

@ -22,6 +22,11 @@ ifeq "$(TYPEFLAGS)" ""
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE) TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
endif endif
# When the make variable KISS_FFT_USE_ALLOCA is set to any non-empty value,
# pass -DKISS_FFT_USE_ALLOCA=1 to the compiler.
ifneq ("$(KISS_FFT_USE_ALLOCA)","")
CFLAGS+= -DKISS_FFT_USE_ALLOCA=1
endif
# Append any extra compiler flags supplied by the caller through CFLAGADD.
CFLAGS+= $(CFLAGADD)
FFTUTIL=fft_$(DATATYPE) FFTUTIL=fft_$(DATATYPE)
FASTFILT=fastconv_$(DATATYPE) FASTFILT=fastconv_$(DATATYPE)

View File

@ -20,7 +20,7 @@ struct kiss_fftr_state{
kiss_fft_cpx * tmpbuf; kiss_fft_cpx * tmpbuf;
kiss_fft_cpx * super_twiddles; kiss_fft_cpx * super_twiddles;
#ifdef USE_SIMD #ifdef USE_SIMD
long pad; void * pad;
#endif #endif
}; };