made threadsafe

This commit is contained in:
Mark Borgerding 2010-05-27 22:54:01 -04:00
parent 583019e074
commit 57925fd126
9 changed files with 70 additions and 68 deletions

8
.hgignore Normal file
View File

@ -0,0 +1,8 @@
syntax:glob
test/bm_*
test/st_*
test/tkfc_*
test/tr_*
tools/fastconv_*
tools/fastconvr_*
tools/fft_*

View File

@ -1,8 +1,4 @@
KFVER=1_2_9
DISTDIR=kiss_fft_v$(KFVER)
TARBALL=kiss_fft_v$(KFVER).tar.gz
ZIPFILE=kiss_fft_v$(KFVER).zip
KFVER=129
doc:
@echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'"
@ -10,36 +6,25 @@ doc:
testall:
# The simd and int32_t types may or may not work on your machine
export DATATYPE=simd && cd test && make test
export DATATYPE=int32_t && cd test && make test
export DATATYPE=int16_t && cd test && make test
export DATATYPE=float && cd test && make test
export DATATYPE=double && cd test && make test
make -C test DATATYPE=simd CFLAGADD="$(CFLAGADD)" test
make -C test DATATYPE=int32_t CFLAGADD="$(CFLAGADD)" test
make -C test DATATYPE=int16_t CFLAGADD="$(CFLAGADD)" test
make -C test DATATYPE=float CFLAGADD="$(CFLAGADD)" test
make -C test DATATYPE=double CFLAGADD="$(CFLAGADD)" test
tarball: clean
tar --exclude CVS --exclude .cvsignore --exclude $(TARBALL) -cvzf $(TARBALL) .
hg archive -r v$(KFVER) -t tgz kiss_fft$(KFVER).tar.gz
hg archive -r v$(KFVER) -t zip kiss_fft$(KFVER).zip
clean:
cd test && make clean
cd tools && make clean
rm -f kiss_fft*.tar.gz *~ *.pyc kiss_fft*.zip
rm -rf $(DISTDIR)
dist: tarball
mkdir $(DISTDIR)
cd $(DISTDIR) && tar -zxf ../$(TARBALL)
rm $(TARBALL)
tar -czf $(TARBALL) $(DISTDIR)
zip -r $(ZIPFILE) $(DISTDIR)
rm -rf $(DISTDIR)
upload: dist
ncftpput upload.sourceforge.net incoming $(ZIPFILE) $(TARBALL)
asm: kiss_fft.s
kiss_fft.s: kiss_fft.c kiss_fft.h _kiss_fft_guts.h
[ -e kiss_fft.s ] && mv kiss_fft.s kiss_fft.s~ || true
gcc -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm
gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT
gcc -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -unroll-loops -dA -fverbose-asm
gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT
[ -e kiss_fft.s~ ] && diff kiss_fft.s~ kiss_fft.s || true

22
README
View File

@ -36,8 +36,8 @@ Code definitions for 1d complex FFTs are in kiss_fft.c.
You can do other cool stuff with the extras you'll find in tools/
* multi-dimensional FFTs
* real-optimized FFTs
* fast convolution FIR filtering
* real-optimized FFTs (returns the positive half-spectrum: (nfft/2+1) complex frequency bins)
* fast convolution FIR filtering (not available for fixed point)
* spectrum image creation
The core fft and most tools/ code can be compiled to use float, double
@ -59,7 +59,7 @@ During this process, I learned:
1. FFT_BRANDX has more than 100K lines of code. The core of kiss_fft is about 500 lines (cpx 1-d).
2. It took me an embarrassingly long time to get FFT_BRANDX working.
3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB.
3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB (without optimizing for size).
4. FFT_BRANDX is roughly twice as fast as KISS FFT in default mode.
It is wonderful that free, highly optimized libraries like FFT_BRANDX exist.
@ -78,6 +78,11 @@ FREQUENTLY ASKED QUESTIONS:
2) mixed build environment -- all code must be compiled with same preprocessor
definitions for FIXED_POINT and kiss_fft_scalar
Q: Will you write/debug my code for me?
A: Probably not unless you pay me. I am happy to answer pointed and topical questions, but
I may refer you to a book, a forum, or some other resource.
PERFORMANCE:
(on Athlon XP 2100+, with gcc 2.96, float data type)
@ -92,7 +97,10 @@ DO NOT:
UNDER THE HOOD:
Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT.
Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. If you give it an input buffer
and output buffer that are the same, a temporary buffer will be created to hold the data.
No static data is used. The core routines of kiss_fft are thread-safe (but not all of the tools directory).
No scaling is done for the floating point version (for speed).
Scaling is done both ways for the fixed-point version (for overflow prevention).
@ -100,7 +108,8 @@ UNDER THE HOOD:
Optimized butterflies are used for factors 2,3,4, and 5.
The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length
FFTs in parallel (packed into real&imag), and then combines them via twiddling.
FFTs in parallel (packed into real&imag), and then combines them via twiddling. The result is
nfft/2+1 complex frequency bins from DC to Nyquist. If you don't know what this means, search the web.
The fast convolution filtering uses the overlap-scrap method, slightly
modified to put the scrap at the tail.
@ -111,6 +120,9 @@ LICENSE:
Note this license is compatible with GPL at one end of the spectrum and closed, commercial software at
the other end. See http://www.fsf.org/licensing/licenses
A commercial license is available which removes the requirement for attribution. Contact me for details.
TODO:
*) Add real optimization for odd length FFTs
*) Document/revisit the input/output fft scaling

View File

@ -148,3 +148,17 @@ struct kiss_fft_state{
/* a debugging function */
#define pcpx(c)\
fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
#ifdef KISS_FFT_USE_ALLOCA
// Define KISS_FFT_USE_ALLOCA to allow use of alloca instead of malloc for temporary buffers.
// Temporary buffers are used in two cases:
// 1. FFT sizes that have "bad" factors, i.e. not 2, 3 and 5
// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform.
#include <alloca.h>
#define KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes)
// Intentionally expands to nothing: no explicit free is issued for alloca-obtained buffers.
#define KISS_FFT_TMP_FREE(ptr)
#else
// Default: temporary buffers go through the regular KISS_FFT_MALLOC / KISS_FFT_FREE pair.
#define KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes)
#define KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr)
#endif

View File

@ -1,5 +1,5 @@
/*
Copyright (c) 2003-2004, Mark Borgerding
Copyright (c) 2003-2010, Mark Borgerding
All rights reserved.
@ -14,27 +14,10 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
#include "_kiss_fft_guts.h"
/* The guts header contains all the multiplication and addition macros that are defined for
fixed or floating point complex numbers. It also declares the kf_ internal functions.
*/
static kiss_fft_cpx *scratchbuf=NULL;
static size_t nscratchbuf=0;
static kiss_fft_cpx *tmpbuf=NULL;
static size_t ntmpbuf=0;
#define CHECKBUF(buf,nbuf,n) \
do { \
if ( nbuf < (size_t)(n) ) {\
free(buf); \
buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \
nbuf = (size_t)(n); \
} \
}while(0)
static void kf_bfly2(
kiss_fft_cpx * Fout,
const size_t fstride,
@ -225,7 +208,7 @@ static void kf_bfly_generic(
kiss_fft_cpx t;
int Norig = st->nfft;
kiss_fft_cpx * scratch = (kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx)*p);
kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
for ( u=0; u<m; ++u ) {
k=u;
@ -248,7 +231,7 @@ static void kf_bfly_generic(
k += m;
}
}
free(scratch);
KISS_FFT_TMP_FREE(scratch);
}
static
@ -385,14 +368,15 @@ kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem
}
void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
{
if (fin == fout) {
CHECKBUF(tmpbuf,ntmpbuf,st->nfft);
//NOTE: this is not really an in-place FFT algorithm.
//It just performs an out-of-place FFT into a temp buffer
kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
KISS_FFT_TMP_FREE(tmpbuf);
}else{
kf_work( fout, fin, 1,in_stride, st->factors,st );
}
@ -404,17 +388,9 @@ void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
}
/* not really necessary to call, but if someone is doing in-place ffts, they may want to free the
buffers from CHECKBUF
*/
void kiss_fft_cleanup(void)
{
free(scratchbuf);
scratchbuf = NULL;
nscratchbuf=0;
free(tmpbuf);
tmpbuf=NULL;
ntmpbuf=0;
// nothing needed any more
}
int kiss_fft_next_fast_size(int n)

View File

@ -27,11 +27,11 @@ extern "C" {
#ifdef USE_SIMD
# include <xmmintrin.h>
# define kiss_fft_scalar __m128
//#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes)
#define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16)
#define KISS_FFT_FREE _mm_free
#else
#define KISS_FFT_MALLOC malloc
#define KISS_FFT_FREE free
#endif

View File

@ -5,10 +5,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
CFLAGS=-O3 -I.. -I../tools $(WARNINGS)
CFLAGS+=-ffast-math -fomit-frame-pointer
CFLAGS+=-march=prescott
#CFLAGS+= -mtune=native
#CFLAGS+=-funroll-loops
#CFLAGS+=-march=prescott
CFLAGS+= -mtune=native
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
CFLAGS+=-fopenmp
#CFLAGS+=-fopenmp
CFLAGS+= $(CFLAGADD)
ifeq "$(NFFT)" ""

View File

@ -22,6 +22,11 @@ ifeq "$(TYPEFLAGS)" ""
TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE)
endif
# Opt-in switch: when KISS_FFT_USE_ALLOCA is set to a non-empty value, pass
# -DKISS_FFT_USE_ALLOCA=1 to the compiler.
ifneq ("$(KISS_FFT_USE_ALLOCA)","")
CFLAGS+= -DKISS_FFT_USE_ALLOCA=1
endif
# Append any extra compiler flags supplied through CFLAGADD.
CFLAGS+= $(CFLAGADD)
FFTUTIL=fft_$(DATATYPE)
FASTFILT=fastconv_$(DATATYPE)

View File

@ -20,7 +20,7 @@ struct kiss_fftr_state{
kiss_fft_cpx * tmpbuf;
kiss_fft_cpx * super_twiddles;
#ifdef USE_SIMD
long pad;
void * pad;
#endif
};