From e2baa5e270959ffe8eb0163d4e4a2ca2a340020a Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sat, 26 Mar 2011 21:25:10 -0400
Subject: [PATCH 1/5] fixed alignment issue with SIMD

---
 tools/kfc.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tools/kfc.c b/tools/kfc.c
index 424e119..d94d124 100644
--- a/tools/kfc.c
+++ b/tools/kfc.c
@@ -42,10 +42,18 @@ static kiss_fft_cfg find_cached_fft(int nfft,int inverse)
     if (cur== NULL) {
         /* no cached node found, need to create a new one*/
         kiss_fft_alloc(nfft,inverse,0,&len);
+#ifdef USE_SIMD
+        int padding = (16-sizeof(struct cached_fft)) & 15;
+        /* pad so that cfg, placed right after struct cached_fft, sits a multiple of 16 bytes past cur (assumes the allocation itself is 16-byte aligned -- TODO confirm) */
+        len += padding;
+#endif
         cur = (kfc_cfg)KISS_FFT_MALLOC((sizeof(struct cached_fft) + len ));
         if (cur == NULL)
             return NULL;
         cur->cfg = (kiss_fft_cfg)(cur+1);
+#ifdef USE_SIMD
+        cur->cfg = (kiss_fft_cfg) ((char*)(cur+1)+padding);
+#endif
         kiss_fft_alloc(nfft,inverse,cur->cfg,&len);
         cur->nfft=nfft;
         cur->inverse=inverse;

From dc6bfad0ab680eb7436adb20dc34649646f54090 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sun, 15 Jul 2012 22:35:28 -0400
Subject: [PATCH 2/5] moved -lm to the end of the link lines; previous gcc
 versions might've been silently adding it there.

---
 test/Makefile  | 12 ++++++------
 tools/Makefile | 10 +++++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/test/Makefile b/test/Makefile
index 6483207..ac839ad 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -66,20 +66,20 @@ tools:
 
 
 $(SELFTEST): $(SELFTESTSRC) $(SRCFILES)
-	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
+	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) $+ -lm 
 
 $(TESTKFC): $(SRCFILES) 
-	$(CC) -o $@ $(CFLAGS)  -DKFC_TEST $(TYPEFLAGS) -lm $+
+	$(CC) -o $@ $(CFLAGS)  -DKFC_TEST $(TYPEFLAGS) $+ -lm
 	
 $(TESTREAL): test_real.c $(SRCFILES)
-	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm $+
+	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) $+ -lm
 
 $(BENCHKISS): benchkiss.c $(SRCFILES)
-	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS) -lm  $+
+	$(CC) -o $@ $(CFLAGS) $(TYPEFLAGS)  $+ -lm
 
 $(BENCHFFTW): benchfftw.c pstats.c
 	@echo "======attempting to build FFTW benchmark"
-	@$(CC) -o $@ $(CFLAGS) -DDATATYPE$(DATATYPE) $+ -lm $(FFTWLIB) $(FFTWLIBDIR) || echo "FFTW not available for comparison"
+	@$(CC) -o $@ $(CFLAGS) -DDATATYPE$(DATATYPE) $+ $(FFTWLIB) $(FFTWLIBDIR) -lm || echo "FFTW not available for comparison"
 
 test: all
 	@./$(TESTKFC)
@@ -101,7 +101,7 @@ selftest_short.c:
 
 CXXFLAGS=-O3 -ffast-math -fomit-frame-pointer  -I.. -I../tools -W -Wall
 testcpp: testcpp.cc ../kissfft.hh
-	$(CXX) -o $@ $(CXXFLAGS) -lm testcpp.cc
+	$(CXX) -o $@ $(CXXFLAGS) testcpp.cc -lm
 
 
 clean:
diff --git a/tools/Makefile b/tools/Makefile
index bf52220..ae7646b 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -44,19 +44,19 @@ CFLAGS=-Wall -O3 $(WARNINGS)
 # tip: try -openmp or -fopenmp to use multiple cores
 
 $(FASTFILTREAL): ../kiss_fft.c kiss_fastfir.c kiss_fftr.c
-	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -lm $+ -DFAST_FILT_UTIL
+	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DREAL_FASTFIR -DFAST_FILT_UTIL $+ -lm
 
 $(FASTFILT): ../kiss_fft.c kiss_fastfir.c
-	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -lm $+ -DFAST_FILT_UTIL
+	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -DFAST_FILT_UTIL $+ -lm
 
 $(FFTUTIL): ../kiss_fft.c fftutil.c kiss_fftnd.c kiss_fftr.c kiss_fftndr.c
-	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -lm $+
+	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) $+ -lm
 
 $(PSDPNG): ../kiss_fft.c psdpng.c kiss_fftr.c
-	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -lm -lpng $+
+	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) $+ -lpng -lm
 
 $(DUMPHDR): ../kiss_fft.c dumphdr.c
-	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) -lm $+
+	$(CC) -o $@ $(CFLAGS) -I.. $(TYPEFLAGS) $+ -lm
 
 clean:
 	rm -f *~ fft fft_* fastconv fastconv_* fastconvr fastconvr_* psdpng psdpng_*

From 0c1d22a9747dccfe1082ececb667cf92f555111b Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Sun, 15 Jul 2012 22:36:18 -0400
Subject: [PATCH 3/5] minor documentation tweaks

---
 Makefile    | 4 +++-
 README.simd | 6 ++++--
 TIPS        | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index e501c5c..dd31333 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,9 @@ KFVER=129
 
 doc:
 	@echo "Start by reading the README file.  If you want to build and test lots of stuff, do a 'make testall'"
-	@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not"
+	@echo "but be aware that 'make testall' has dependencies that the basic kissfft software does not."
+	@echo "It is generally unneeded to run these tests yourself, unless you plan on changing the inner workings"
+	@echo "of kissfft and would like to make use of its regression tests."
 
 testall:
 	# The simd and int32_t types may or may not work on your machine 
diff --git a/README.simd b/README.simd
index 915541d..b0fdac5 100644
--- a/README.simd
+++ b/README.simd
@@ -1,4 +1,5 @@
-If you are reading this, it means you think you may be interested in using the SIMD extensions within kissfft.
+If you are reading this, it means you think you may be interested in using the SIMD extensions in kissfft 
+to do 4 *separate* FFTs at once.
 
 Beware! Beyond here there be dragons!
 
@@ -29,7 +30,8 @@ Search on "SIMD alignment" for more info.
 
 
 Robin at Divide Concept was kind enough to share his code for formatting to/from the SIMD kissfft.  
-I have not run it -- use it at your own risk.
+I have not run it -- use it at your own risk.  It appears to do 4xN and Nx4 transpositions 
+(out of place).
 
 void SSETools::pack128(float* target, float* source, unsigned long size128)
 {
diff --git a/TIPS b/TIPS
index cf7ac2a..6a9579d 100644
--- a/TIPS
+++ b/TIPS
@@ -21,7 +21,7 @@ Speed:
 Reducing code size:
     * remove some of the butterflies. There are currently butterflies optimized for radices
         2,3,4,5.  It is worth mentioning that you can still use FFT sizes that contain 
-        these factors, they just won't be quite as fast.  You can decide for yourself 
+        other factors; they just won't be quite as fast.  You can decide for yourself 
         whether to keep radix 2 or 4.  If you do some work in this area, let me 
         know what you find.
 

From 8fedba4d91ece59375e7f8cc8746564e0f0c3d26 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 17 Jul 2012 23:30:31 -0400
Subject: [PATCH 4/5] fixed warnings about ignored return value and wrong
 format code in printf

---
 tools/kiss_fastfir.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tools/kiss_fastfir.c b/tools/kiss_fastfir.c
index 1c96216..4560aa3 100644
--- a/tools/kiss_fastfir.c
+++ b/tools/kiss_fastfir.c
@@ -362,7 +362,7 @@ void do_file_filter(
     n_samps_buf = 8*4096/sizeof(kffsamp_t); 
     n_samps_buf = nfft + 4*(nfft-n_imp_resp+1);
 
-    if (verbose) fprintf(stderr,"bufsize=%d\n",sizeof(kffsamp_t)*n_samps_buf );
+    if (verbose) fprintf(stderr,"bufsize=%lu\n",(unsigned long)(sizeof(kffsamp_t)*n_samps_buf) );
      
 
     /*allocate space and initialize pointers */
@@ -449,10 +449,12 @@ int main(int argc,char**argv)
     }
     fseek(filtfile,0,SEEK_END);
     nh = ftell(filtfile) / sizeof(kffsamp_t);
-    if (verbose) fprintf(stderr,"%d samples in FIR filter\n",nh);
+    if (verbose) fprintf(stderr,"%d samples in FIR filter\n",(int)nh);
     h = (kffsamp_t*)malloc(sizeof(kffsamp_t)*nh);
     fseek(filtfile,0,SEEK_SET);
-    fread(h,sizeof(kffsamp_t),nh,filtfile);
+    if (fread(h,sizeof(kffsamp_t),nh,filtfile) != nh)
+        fprintf(stderr,"short read on filter file\n");
+
     fclose(filtfile);
  
     if (use_direct)

From 4faaa8307503fa769ed749e398711910c551ff97 Mon Sep 17 00:00:00 2001
From: Mark Borgerding <mark@borgerding.net>
Date: Tue, 17 Jul 2012 23:31:00 -0400
Subject: [PATCH 5/5] converted testkiss.py from Numeric to numpy

---
 Makefile         |  1 +
 test/testkiss.py | 36 ++++++++++++++++++------------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index dd31333..d71b925 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@ testall:
 	make -C test DATATYPE=int16_t CFLAGADD="$(CFLAGADD)" test
 	make -C test DATATYPE=float CFLAGADD="$(CFLAGADD)" test
 	make -C test DATATYPE=double CFLAGADD="$(CFLAGADD)" test
+	@echo "all tests passed"
 
 tarball: clean
 	hg archive -r v$(KFVER) -t tgz kiss_fft$(KFVER).tar.gz 
diff --git a/test/testkiss.py b/test/testkiss.py
index a5f7005..5749c7c 100755
--- a/test/testkiss.py
+++ b/test/testkiss.py
@@ -7,8 +7,7 @@ import random
 import struct
 import popen2
 import getopt
-import Numeric
-import FFT
+import numpy
 
 pi=math.pi
 e=math.e
@@ -26,7 +25,7 @@ elif datatype=='int16_t':
     fmt='h'
     minsnr=10
 elif datatype=='int32_t':
-    fmt='l'
+    fmt='i'
 elif datatype=='simd':
     fmt='4f'
     sys.stderr.write('testkiss.py does not yet test simd')
@@ -39,21 +38,21 @@ else:
  
 
 def dopack(x,cpx=1):
-    x = Numeric.reshape( x, ( Numeric.size(x),) )
+    x = numpy.reshape( x, ( numpy.size(x),) )
     
     if cpx:
         s = ''.join( [ struct.pack(fmt*2,c.real,c.imag) for c in x ] )
     else:
-        s = ''.join( [ struct.pack(fmt,c) for c in x ] )
+        s = ''.join( [ struct.pack(fmt,c.real) for c in x ] )
     return s
 
 def dounpack(x,cpx):
     uf = fmt * ( len(x) / struct.calcsize(fmt) )
     s = struct.unpack(uf,x)
     if cpx:
-        return Numeric.array(s[::2]) + Numeric.array( s[1::2] )*j
+        return numpy.array(s[::2]) + numpy.array( s[1::2] )*j
     else:
-        return Numeric.array(s )
+        return numpy.array(s )
 
 def make_random(dims=[1]):
     res = []
@@ -67,11 +66,11 @@ def make_random(dims=[1]):
                 res.append( complex(r,i) )
         else:
             res.append( make_random( dims[1:] ) )
-    return Numeric.array(res)
+    return numpy.array(res)
 
 def flatten(x):
-    ntotal = Numeric.product(Numeric.shape(x))
-    return Numeric.reshape(x,(ntotal,))
+    ntotal = numpy.size(x)
+    return numpy.reshape(x,(ntotal,))
 
 def randmat( ndims ):
     dims=[]
@@ -85,11 +84,11 @@ def randmat( ndims ):
 def test_fft(ndims):
     x=randmat( ndims )
 
-    print 'dimensions=%s' % str( Numeric.shape(x) ),
+
     if doreal:
-        xver = FFT.real_fftnd(x)
+        xver = numpy.fft.rfftn(x)
     else:
-        xver = FFT.fftnd(x)
+        xver = numpy.fft.fftn(x)
     
     open('/tmp/fftexp.dat','w').write(dopack( flatten(xver) , True ) )
 
@@ -97,8 +96,8 @@ def test_fft(ndims):
     err = xver - x2
     errf = flatten(err)
     xverf = flatten(xver)
-    errpow = Numeric.vdot(errf,errf)+1e-10
-    sigpow = Numeric.vdot(xverf,xverf)+1e-10
+    errpow = numpy.vdot(errf,errf)+1e-10
+    sigpow = numpy.vdot(xverf,xverf)+1e-10
     snr = 10*math.log10(abs(sigpow/errpow) )
     print 'SNR (compared to NumPy) : %.1fdB' % float(snr)
 
@@ -109,9 +108,9 @@ def test_fft(ndims):
         sys.exit(1)
  
 def dofft(x):
-    dims=list( Numeric.shape(x) )
+    dims=list( numpy.shape(x) )
     x = flatten(x)
-    iscomp = (type(x[0]) == complex)
+    iscomp = numpy.iscomplexobj(x)
 
     scale=1
     if datatype=='int16_t':
@@ -126,6 +125,7 @@ def dofft(x):
     if doreal:
         cmd += ' -R '
 
+    print cmd
     p = popen2.Popen3(cmd )
 
     open('/tmp/fftin.dat','w').write(dopack( x , iscomp ) )
@@ -141,7 +141,7 @@ def dofft(x):
     res = scale * res
 
     p.wait()
-    return Numeric.reshape(res,dims)
+    return numpy.reshape(res,dims)
 
 def main():
     opts,args = getopt.getopt(sys.argv[1:],'r')