mirror of
https://github.com/mborgerding/kissfft.git
synced 2025-06-04 01:28:23 -04:00
*** empty log message ***
This commit is contained in:
parent
68a8b95a47
commit
51e95088b3
76
README.simd
Normal file
76
README.simd
Normal file
@ -0,0 +1,76 @@
|
|||||||
|
If you are reading this, it means you think you may be interested in using the SIMD extensions within kissfft.
|
||||||
|
|
||||||
|
Beware! Beyond here there be dragons!
|
||||||
|
|
||||||
|
This API is not easy to use, is not well documented, and breaks the KISS principle.
|
||||||
|
|
||||||
|
|
||||||
|
Still reading? Okay, you may get rewarded for your patience with a considerable speedup
|
||||||
|
(2-3x) on Intel x86 machines with SSE if you are willing to jump through some hoops.
|
||||||
|
|
||||||
|
The basic idea is to use the packed 4-float __m128 data type in place of kissfft's scalar element type.
|
||||||
|
This means that the format is pretty convoluted. It performs 4 FFTs per fft call on signals A,B,C,D.
|
||||||
|
|
||||||
|
For complex data, the data is interlaced as follows:
|
||||||
|
rA0,rB0,rC0,rD0, iA0,iB0,iC0,iD0, rA1,rB1,rC1,rD1, iA1,iB1,iC1,iD1 ...
|
||||||
|
where "rA0" is the real part of the zeroth sample for signal A
|
||||||
|
|
||||||
|
Real-only data is laid out:
|
||||||
|
rA0,rB0,rC0,rD0, rA1,rB1,rC1,rD1, ...
|
||||||
|
|
||||||
|
Compile with gcc flags such as:
|
||||||
|
-O3 -mpreferred-stack-boundary=4 -DUSE_SIMD=1 -msse
|
||||||
|
|
||||||
|
Be aware of SIMD alignment. This is the most likely cause of segfaults.
|
||||||
|
The code within kissfft uses scratch variables on the stack.
|
||||||
|
With SIMD, these must have addresses on 16 byte boundaries.
|
||||||
|
Search on "SIMD alignment" for more info.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Robin at Divide Concept was kind enough to share his code for formatting to/from the SIMD kissfft.
|
||||||
|
I have not run it -- use it at your own risk.
|
||||||
|
|
||||||
|
// Packs four back-to-back float signals into an array of __m128 values.
//
// target  : destination, written as size128 consecutive __m128 values.
//           It is stored through an __m128 pointer, so it presumably has to
//           sit on a 16-byte boundary -- confirm against the callers.
// source  : four consecutive runs of size128 floats each (4*size128 floats
//           are read in total).
// size128 : number of __m128 values to produce, i.e. samples per signal.
//
// Output value i is built from element i of each of the four runs.
void SSETools::pack128(float* target, float* source, unsigned long size128)
{
    __m128* const packed = reinterpret_cast<__m128*>(target);

    // Start of each of the four runs inside source.
    float* const run0 = source;
    float* const run1 = run0 + size128;
    float* const run2 = run1 + size128;
    float* const run3 = run2 + size128;

    for (unsigned long i = 0; i < size128; ++i)
    {
        // _mm_set_ps lists its arguments from the highest lane down, so the
        // first run is passed last and ends up in the lowest lane.
        packed[i] = _mm_set_ps(run3[i], run2[i], run1[i], run0[i]);
    }
}
|
||||||
|
|
||||||
|
// Splits interleaved groups of four floats back into four separate runs.
//
// target  : destination, written as four consecutive runs of size128 floats
//           each (4*size128 floats are written in total).
// source  : size128 groups of four floats; element k of group i is copied to
//           position i of run k.
// size128 : number of four-float groups to read, i.e. samples per signal.
void SSETools::unpack128(float* target, float* source, unsigned long size128)
{
    // Start of each of the four output runs inside target.
    float* const run0 = target;
    float* const run1 = run0 + size128;
    float* const run2 = run1 + size128;
    float* const run3 = run2 + size128;

    for (unsigned long i = 0; i < size128; ++i)
    {
        // Group i occupies floats [4*i, 4*i + 3] of source.
        float* const group = source + i * 4;

        run0[i] = group[0];
        run1[i] = group[1];
        run2[i] = group[2];
        run3[i] = group[3];
    }
}
|
@ -4,10 +4,13 @@ WARNINGS=-W -Wall -Wstrict-prototypes -Wmissing-prototypes -Waggregate-return \
|
|||||||
-Wwrite-strings
|
-Wwrite-strings
|
||||||
|
|
||||||
# for x86 pentium+ machines , these flags work well
|
# for x86 pentium+ machines , these flags work well
|
||||||
CFLAGS=-O3 -march=pentiumpro -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
#
|
||||||
# If the above flags do not work, try the following
|
CFLAGS=-O3 -I.. -I../tools $(WARNINGS)
|
||||||
#CFLAGS=-O3 -mtune=native -ffast-math -fomit-frame-pointer -I.. -I../tools $(WARNINGS)
|
CFLAGS+=-ffast-math -fomit-frame-pointer
|
||||||
|
CFLAGS+=-march=prescott
|
||||||
|
#CFLAGS+= -mtune=native
|
||||||
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
|
# TIP: try adding -openmp or -fopenmp to enable OPENMP directives and use of multiple cores
|
||||||
|
#CFLAGS+= -fopenmp
|
||||||
|
|
||||||
|
|
||||||
ifeq "$(NFFT)" ""
|
ifeq "$(NFFT)" ""
|
||||||
|
Loading…
x
Reference in New Issue
Block a user