diff --git a/.hgignore b/.hgignore new file mode 100644 index 0000000..d41343b --- /dev/null +++ b/.hgignore @@ -0,0 +1,8 @@ +syntax:glob +test/bm_* +test/st_* +test/tkfc_* +test/tr_* +tools/fastconv_* +tools/fastconvr_* +tools/fft_* diff --git a/Makefile b/Makefile index 48f536e..e501c5c 100644 --- a/Makefile +++ b/Makefile @@ -1,8 +1,4 @@ -KFVER=1_2_9 - -DISTDIR=kiss_fft_v$(KFVER) -TARBALL=kiss_fft_v$(KFVER).tar.gz -ZIPFILE=kiss_fft_v$(KFVER).zip +KFVER=129 doc: @echo "Start by reading the README file. If you want to build and test lots of stuff, do a 'make testall'" @@ -10,36 +6,25 @@ doc: testall: # The simd and int32_t types may or may not work on your machine - export DATATYPE=simd && cd test && make test - export DATATYPE=int32_t && cd test && make test - export DATATYPE=int16_t && cd test && make test - export DATATYPE=float && cd test && make test - export DATATYPE=double && cd test && make test + make -C test DATATYPE=simd CFLAGADD="$(CFLAGADD)" test + make -C test DATATYPE=int32_t CFLAGADD="$(CFLAGADD)" test + make -C test DATATYPE=int16_t CFLAGADD="$(CFLAGADD)" test + make -C test DATATYPE=float CFLAGADD="$(CFLAGADD)" test + make -C test DATATYPE=double CFLAGADD="$(CFLAGADD)" test tarball: clean - tar --exclude CVS --exclude .cvsignore --exclude $(TARBALL) -cvzf $(TARBALL) . 
+ hg archive -r v$(KFVER) -t tgz kiss_fft$(KFVER).tar.gz + hg archive -r v$(KFVER) -t zip kiss_fft$(KFVER).zip clean: cd test && make clean cd tools && make clean rm -f kiss_fft*.tar.gz *~ *.pyc kiss_fft*.zip - rm -rf $(DISTDIR) - -dist: tarball - mkdir $(DISTDIR) - cd $(DISTDIR) && tar -zxf ../$(TARBALL) - rm $(TARBALL) - tar -czf $(TARBALL) $(DISTDIR) - zip -r $(ZIPFILE) $(DISTDIR) - rm -rf $(DISTDIR) - -upload: dist - ncftpput upload.sourceforge.net incoming $(ZIPFILE) $(TARBALL) asm: kiss_fft.s kiss_fft.s: kiss_fft.c kiss_fft.h _kiss_fft_guts.h [ -e kiss_fft.s ] && mv kiss_fft.s kiss_fft.s~ || true - gcc -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm - gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT + gcc -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -funroll-loops -dA -fverbose-asm + gcc -o kiss_fft_short.s -S kiss_fft.c -O3 -mtune=native -ffast-math -fomit-frame-pointer -dA -fverbose-asm -DFIXED_POINT [ -e kiss_fft.s~ ] && diff kiss_fft.s~ kiss_fft.s || true diff --git a/README b/README index e1ce587..39a06eb 100644 --- a/README +++ b/README @@ -36,8 +36,8 @@ Code definitions for 1d complex FFTs are in kiss_fft.c. You can do other cool stuff with the extras you'll find in tools/ * multi-dimensional FFTs - * real-optimized FFTs - * fast convolution FIR filtering + * real-optimized FFTs (returns the positive half-spectrum: (nfft/2+1) complex frequency bins) + * fast convolution FIR filtering (not available for fixed point) * spectrum image creation The core fft and most tools/ code can be compiled to use float, double @@ -59,7 +59,7 @@ During this process, I learned: 1. FFT_BRANDX has more than 100K lines of code. The core of kiss_fft is about 500 lines (cpx 1-d). 2. It took me an embarrassingly long time to get FFT_BRANDX working. - 3. A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB. + 3. 
A simple program using FFT_BRANDX is 522KB. A similar program using kiss_fft is 18KB (without optimizing for size). 4. FFT_BRANDX is roughly twice as fast as KISS FFT in default mode. It is wonderful that free, highly optimized libraries like FFT_BRANDX exist. @@ -78,6 +78,11 @@ FREQUENTLY ASKED QUESTIONS: 2) mixed build environment -- all code must be compiled with same preprocessor definitions for FIXED_POINT and kiss_fft_scalar + Q: Will you write/debug my code for me? + A: Probably not unless you pay me. I am happy to answer pointed and topical questions, but + I may refer you to a book, a forum, or some other resource. + + PERFORMANCE: (on Athlon XP 2100+, with gcc 2.96, float data type) @@ -92,7 +97,10 @@ DO NOT: UNDER THE HOOD: - Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. + Kiss FFT uses a time decimation, mixed-radix, out-of-place FFT. If you give it an input buffer + and output buffer that are the same, a temporary buffer will be created to hold the data. + + No static data is used. The core routines of kiss_fft are thread-safe (but not all of the tools directory). No scaling is done for the floating point version (for speed). Scaling is done both ways for the fixed-point version (for overflow prevention). @@ -100,7 +108,8 @@ UNDER THE HOOD: Optimized butterflies are used for factors 2,3,4, and 5. The real (i.e. not complex) optimization code only works for even length ffts. It does two half-length - FFTs in parallel (packed into real&imag), and then combines them via twiddling. + FFTs in parallel (packed into real&imag), and then combines them via twiddling. The result is + nfft/2+1 complex frequency bins from DC to Nyquist. If you don't know what this means, search the web. The fast convolution filtering uses the overlap-scrap method, slightly modified to put the scrap at the tail. @@ -111,6 +120,9 @@ LICENSE: Note this license is compatible with GPL at one end of the spectrum and closed, commercial software at the other end. 
See http://www.fsf.org/licensing/licenses + A commercial license is available which removes the requirement for attribution. Contact me for details. + + TODO: *) Add real optimization for odd length FFTs *) Document/revisit the input/output fft scaling diff --git a/_kiss_fft_guts.h b/_kiss_fft_guts.h index 1c1d4d7..957a748 100644 --- a/_kiss_fft_guts.h +++ b/_kiss_fft_guts.h @@ -148,3 +148,17 @@ struct kiss_fft_state{ /* a debugging function */ #define pcpx(c)\ fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) ) + + +#ifdef KISS_FFT_USE_ALLOCA +// define this to allow use of alloca instead of malloc for temporary buffers +// Temporary buffers are used in two cases: +// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5 +// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform. +#include <alloca.h> +#define KISS_FFT_TMP_ALLOC(nbytes) alloca(nbytes) +#define KISS_FFT_TMP_FREE(ptr) +#else +#define KISS_FFT_TMP_ALLOC(nbytes) KISS_FFT_MALLOC(nbytes) +#define KISS_FFT_TMP_FREE(ptr) KISS_FFT_FREE(ptr) +#endif diff --git a/kiss_fft.c b/kiss_fft.c index e0540a6..465d6c9 100644 --- a/kiss_fft.c +++ b/kiss_fft.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2003-2004, Mark Borgerding +Copyright (c) 2003-2010, Mark Borgerding All rights reserved. @@ -14,27 +14,10 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND #include "_kiss_fft_guts.h" - - /* The guts header contains all the multiplication and addition macros that are defined for fixed or floating point complex numbers. It also delares the kf_ internal functions. 
*/ -static kiss_fft_cpx *scratchbuf=NULL; -static size_t nscratchbuf=0; -static kiss_fft_cpx *tmpbuf=NULL; -static size_t ntmpbuf=0; - -#define CHECKBUF(buf,nbuf,n) \ - do { \ - if ( nbuf < (size_t)(n) ) {\ - free(buf); \ - buf = (kiss_fft_cpx*)KISS_FFT_MALLOC(sizeof(kiss_fft_cpx)*(n)); \ - nbuf = (size_t)(n); \ - } \ - }while(0) - - static void kf_bfly2( kiss_fft_cpx * Fout, const size_t fstride, @@ -225,7 +208,7 @@ static void kf_bfly_generic( kiss_fft_cpx t; int Norig = st->nfft; - kiss_fft_cpx * scratch = (kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx)*p); + kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p); for ( u=0; unfft); + //NOTE: this is not really an in-place FFT algorithm. + //It just performs an out-of-place FFT into a temp buffer + kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft); kf_work(tmpbuf,fin,1,in_stride, st->factors,st); memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft); + KISS_FFT_TMP_FREE(tmpbuf); }else{ kf_work( fout, fin, 1,in_stride, st->factors,st ); } @@ -404,17 +388,9 @@ void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout) } -/* not really necessary to call, but if someone is doing in-place ffts, they may want to free the - buffers from CHECKBUF - */ void kiss_fft_cleanup(void) { - free(scratchbuf); - scratchbuf = NULL; - nscratchbuf=0; - free(tmpbuf); - tmpbuf=NULL; - ntmpbuf=0; + // nothing needed any more } int kiss_fft_next_fast_size(int n) diff --git a/kiss_fft.h b/kiss_fft.h index 5ed835d..20621d8 100644 --- a/kiss_fft.h +++ b/kiss_fft.h @@ -27,11 +27,11 @@ extern "C" { #ifdef USE_SIMD # include <xmmintrin.h> # define kiss_fft_scalar __m128 - -//#define KISS_FFT_MALLOC(nbytes) memalign(16,nbytes) #define KISS_FFT_MALLOC(nbytes) _mm_malloc(nbytes,16) +#define KISS_FFT_FREE _mm_free #else #define KISS_FFT_MALLOC malloc +#define KISS_FFT_FREE free #endif diff --git a/test/Makefile b/test/Makefile index 5456247..6483207 100644 --- a/test/Makefile +++ 
b/test/Makefile @@ -5,10 +5,12 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \ CFLAGS=-O3 -I.. -I../tools $(WARNINGS) CFLAGS+=-ffast-math -fomit-frame-pointer -CFLAGS+=-march=prescott -#CFLAGS+= -mtune=native +#CFLAGS+=-funroll-loops +#CFLAGS+=-march=prescott +CFLAGS+= -mtune=native # TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores -CFLAGS+=-fopenmp +#CFLAGS+=-fopenmp +CFLAGS+= $(CFLAGADD) ifeq "$(NFFT)" "" diff --git a/tools/Makefile b/tools/Makefile index c7b3950..bf52220 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -22,6 +22,11 @@ ifeq "$(TYPEFLAGS)" "" TYPEFLAGS=-Dkiss_fft_scalar=$(DATATYPE) endif +ifneq ("$(KISS_FFT_USE_ALLOCA)","") + CFLAGS+= -DKISS_FFT_USE_ALLOCA=1 +endif +CFLAGS+= $(CFLAGADD) + FFTUTIL=fft_$(DATATYPE) FASTFILT=fastconv_$(DATATYPE) diff --git a/tools/kiss_fftr.c b/tools/kiss_fftr.c index bfd989f..f389770 100644 --- a/tools/kiss_fftr.c +++ b/tools/kiss_fftr.c @@ -20,7 +20,7 @@ struct kiss_fftr_state{ kiss_fft_cpx * tmpbuf; kiss_fft_cpx * super_twiddles; #ifdef USE_SIMD - long pad; + void * pad; #endif };