Merge pull request #111 from HecaiYuan/master

loongarch: add loongarch simd opt
This commit is contained in:
mborgerding
2026-05-11 20:03:41 -04:00
committed by GitHub
12 changed files with 192 additions and 7 deletions

View File

@@ -165,7 +165,13 @@ else()
if(KISSFFT_DATATYPE MATCHES "^simd$")
list(APPEND KISSFFT_COMPILE_DEFINITIONS USE_SIMD)
if (NOT MSVC)
if(HAVE_LASX)
list(APPEND KISSFFT_COMPILE_DEFINITIONS HAVE_LASX)
target_compile_options(kissfft PRIVATE -mlasx)
elseif(HAVE_LSX)
list(APPEND KISSFFT_COMPILE_DEFINITIONS HAVE_LSX)
target_compile_options(kissfft PRIVATE -mlsx)
elseif(NOT MSVC)
target_compile_options(kissfft PRIVATE -msse)
else()
target_compile_options(kissfft PRIVATE "/arch:SSE")
@@ -260,6 +266,13 @@ function(add_kissfft_executable NAME)
target_link_libraries(${NAME} PRIVATE m)
endif()
if(HAVE_LASX)
target_compile_options(${NAME} PRIVATE -mlasx)
endif()
if(HAVE_LSX)
target_compile_options(${NAME} PRIVATE -mlsx)
endif()
if (NOT KISSFFT_OPENMP)
set_target_properties(${NAME} PROPERTIES
OUTPUT_NAME "${NAME}-${KISSFFT_DATATYPE}")
@@ -327,7 +340,11 @@ if (KISSFFT_PKGCONFIG)
join_paths(PKGCONFIG_KISSFFT_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}")
if(KISSFFT_DATATYPE MATCHES "^simd$")
list(APPEND KISSFFT_COMPILE_DEFINITIONS USE_SIMD)
if (NOT MSVC)
if(HAVE_LASX)
set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -mlasx")
elseif(HAVE_LSX)
set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -mlsx")
elseif(NOT MSVC)
set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -msse")
else()
set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} /ARCH:SSE")

View File

@@ -72,6 +72,9 @@ ifneq ($(MAKECMDGOALS),clean)
ifeq ($(_UNAME_ARCH),x86_64)
CANDIDATE_LIBDIR_NAME = lib64
endif
ifeq ($(_UNAME_ARCH),loongarch64)
CANDIDATE_LIBDIR_NAME = lib64
endif
endif
endif
@@ -113,6 +116,8 @@ else
endif
export KISSFFTLIB_SHORTNAME
export HAVE_LSX
export HAVE_LASX
#
# Compile-time definitions by datatype
@@ -127,7 +132,13 @@ ifeq "$(KISSFFT_DATATYPE)" "int32_t"
else ifeq "$(KISSFFT_DATATYPE)" "int16_t"
TYPEFLAGS += -DFIXED_POINT=16
else ifeq "$(KISSFFT_DATATYPE)" "simd"
ifeq "$(HAVE_LASX)" "lasx"
TYPEFLAGS += -DUSE_SIMD=1 -DHAVE_LASX=1 -mlasx
else ifeq "$(HAVE_LSX)" "lsx"
TYPEFLAGS += -DUSE_SIMD=1 -DHAVE_LSX=1 -mlsx
else
TYPEFLAGS += -DUSE_SIMD=1 -msse
endif
else ifeq "$(KISSFFT_DATATYPE)" "float"
TYPEFLAGS += -Dkiss_fft_scalar=$(KISSFFT_DATATYPE)
else ifeq "$(KISSFFT_DATATYPE)" "double"
@@ -273,21 +284,39 @@ testall:
$(MAKE) KISSFFT_DATATYPE=int16_t testsingle
# The simd and int32_t types may or may not work on your machine
$(MAKE) KISSFFT_DATATYPE=int32_t testsingle
$(MAKE) KISSFFT_DATATYPE=simd testsingle
@if [ "$(HAVE_LASX)" = "lasx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \
elif [ "$(HAVE_LSX)" = "lsx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx testsingle; \
else \
$(MAKE) KISSFFT_DATATYPE=simd testsingle; \
fi
# Static libraries
$(MAKE) KISSFFT_DATATYPE=double KISSFFT_STATIC=1 testsingle
$(MAKE) KISSFFT_DATATYPE=float KISSFFT_STATIC=1 testsingle
$(MAKE) KISSFFT_DATATYPE=int16_t KISSFFT_STATIC=1 testsingle
# The simd and int32_t types may or may not work on your machine
$(MAKE) KISSFFT_DATATYPE=int32_t KISSFFT_STATIC=1 testsingle
$(MAKE) KISSFFT_DATATYPE=simd KISSFFT_STATIC=1 testsingle
@if [ "$(HAVE_LASX)" = "lasx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \
elif [ "$(HAVE_LSX)" = "lsx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx KISSFFT_STATIC=1 testsingle; \
else \
$(MAKE) KISSFFT_DATATYPE=simd KISSFFT_STATIC=1 testsingle; \
fi
# OpenMP libraries
$(MAKE) KISSFFT_DATATYPE=double KISSFFT_OPENMP=1 testsingle
$(MAKE) KISSFFT_DATATYPE=float KISSFFT_OPENMP=1 testsingle
$(MAKE) KISSFFT_DATATYPE=int16_t KISSFFT_OPENMP=1 testsingle
# The simd and int32_t types may or may not work on your machine
$(MAKE) KISSFFT_DATATYPE=int32_t KISSFFT_OPENMP=1 testsingle
$(MAKE) KISSFFT_DATATYPE=simd KISSFFT_OPENMP=1 testsingle
@if [ "$(HAVE_LASX)" = "lasx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \
elif [ "$(HAVE_LSX)" = "lsx" ]; then \
$(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx KISSFFT_OPENMP=1 testsingle; \
else \
$(MAKE) KISSFFT_DATATYPE=simd KISSFFT_OPENMP=1 testsingle; \
fi
$(warning All tests passed!)
#

View File

@@ -129,9 +129,31 @@ struct kiss_fft_state{
# define KISS_FFT_SIN(phase) floor(.5+SAMP_MAX * sin (phase))
# define HALF_OF(x) ((x)>>1)
#elif defined(USE_SIMD)
#if defined(HAVE_LASX)
#define KISS_FFT_COS(phase) ({ \
float __cos_val = cosf(phase); \
(__m256)(__lasx_xvldrepl_w(&__cos_val, 0)); \
})
#define KISS_FFT_SIN(phase) ({ \
float __sin_val = sinf(phase); \
(__m256)(__lasx_xvldrepl_w(&__sin_val, 0)); \
})
#define HALF_OF(x) ((x) * (__m256)(__lasx_xvreplgr2vr_w(0x3F000000))) // 0.5f
#elif defined(HAVE_LSX)
#define KISS_FFT_COS(phase) ({ \
float __cos_val = cosf(phase); \
(__m128)(__lsx_vldrepl_w(&__cos_val, 0)); \
})
#define KISS_FFT_SIN(phase) ({ \
float __sin_val = sinf(phase); \
(__m128)(__lsx_vldrepl_w(&__sin_val, 0)); \
})
#define HALF_OF(x) ((x) * (__m128)(__lsx_vreplgr2vr_w(0x3F000000))) // 0.5f
#else
# define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) )
# define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) )
# define HALF_OF(x) ((x)*_mm_set1_ps(.5))
#endif
#else
# define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase)
# define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase)

View File

@@ -48,6 +48,29 @@ extern "C" {
/* User may override KISS_FFT_MALLOC and/or KISS_FFT_FREE. */
#ifdef USE_SIMD
#ifdef HAVE_LASX
# include <lasxintrin.h>
# define kiss_fft_scalar __m256
# ifndef KISS_FFT_MALLOC
# define KISS_FFT_MALLOC(nbytes) aligned_alloc(32, KISS_FFT_ALIGN_SIZE_UP(nbytes))
# define KISS_FFT_ALIGN_CHECK(ptr)
# define KISS_FFT_ALIGN_SIZE_UP(size) ((size + 31UL) & ~0x1FUL)
# endif
# ifndef KISS_FFT_FREE
# define KISS_FFT_FREE free
# endif
#elif defined(HAVE_LSX)
# include <lsxintrin.h>
# define kiss_fft_scalar __m128
# ifndef KISS_FFT_MALLOC
# define KISS_FFT_MALLOC(nbytes) aligned_alloc(16, KISS_FFT_ALIGN_SIZE_UP(nbytes))
# define KISS_FFT_ALIGN_CHECK(ptr)
# define KISS_FFT_ALIGN_SIZE_UP(size) ((size + 15UL) & ~0xFUL)
# endif
# ifndef KISS_FFT_FREE
# define KISS_FFT_FREE free
# endif
#else
# include <xmmintrin.h>
# define kiss_fft_scalar __m128
# ifndef KISS_FFT_MALLOC
@@ -58,6 +81,7 @@ extern "C" {
# ifndef KISS_FFT_FREE
# define KISS_FFT_FREE _mm_free
# endif
#endif
#else
# define KISS_FFT_ALIGN_CHECK(ptr)
# define KISS_FFT_ALIGN_SIZE_UP(size) (size)

View File

@@ -93,7 +93,15 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr
freqdata[0].r = tdc.r + tdc.i;
freqdata[ncfft].r = tdc.r - tdc.i;
#ifdef USE_SIMD
#ifdef HAVE_LASX
freqdata[0].i = (__m256)(__lasx_xvreplgr2vr_w(0));
freqdata[ncfft].i = freqdata[0].i;
#elif defined(HAVE_LSX)
freqdata[0].i = (__m128)(__lsx_vreplgr2vr_w(0));
freqdata[ncfft].i = freqdata[0].i;
#else
freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
#endif
#else
freqdata[ncfft].i = freqdata[0].i = 0;
#endif
@@ -146,7 +154,15 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t
C_ADD (st->tmpbuf[k], fek, fok);
C_SUB (st->tmpbuf[ncfft - k], fek, fok);
#ifdef USE_SIMD
#ifdef HAVE_LASX
__m256 neg_one = (__m256)__lasx_xvreplgr2vr_w(0xBF800000); // -1.0f
st->tmpbuf[ncfft - k].i = __lasx_xvfmul_s(st->tmpbuf[ncfft - k].i, neg_one);
#elif defined(HAVE_LSX)
__m128 neg_one = (__m128)__lsx_vreplgr2vr_w(0xBF800000); // -1.0f
st->tmpbuf[ncfft - k].i = __lsx_vfmul_s(st->tmpbuf[ncfft - k].i, neg_one);
#else
st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
#endif
#else
st->tmpbuf[ncfft - k].i *= -1;
#endif

View File

@@ -2,6 +2,14 @@ function(add_kissfft_test_executable NAME)
add_kissfft_executable(${NAME} ${ARGN})
target_include_directories(${NAME} PRIVATE ..)
if(HAVE_LASX)
target_compile_options(${NAME} PRIVATE -mlasx)
endif()
if(HAVE_LSX)
target_compile_options(${NAME} PRIVATE -mlsx)
endif()
add_test(NAME ${NAME} COMMAND ${NAME})
set_tests_properties(${NAME} PROPERTIES TIMEOUT 3600)
endfunction()
@@ -46,7 +54,11 @@ add_kissfft_test_executable(testcpp testcpp.cc)
if(KISSFFT_DATATYPE MATCHES "^simd$")
add_kissfft_test_executable(tsimd test_simd.c)
target_compile_definitions(tsimd PRIVATE USE_SIMD)
if (NOT MSVC)
if(HAVE_LASX)
target_compile_options(kissfft PRIVATE -mlasx)
elseif(HAVE_LSX)
target_compile_options(kissfft PRIVATE -mlsx)
elseif(NOT MSVC)
target_compile_options(kissfft PRIVATE -msse)
else()
target_compile_options(kissfft PRIVATE "/arch:SSE")

View File

@@ -138,7 +138,13 @@ endif
$(TESTSIMD): test_simd.c
ifeq "$(KISSFFT_DATATYPE)" "simd"
ifeq "$(HAVE_LASX)" "lasx"
$(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -DHAVE_LASX=1 -mlasx $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm
else ifeq "$(HAVE_LSX)" "lsx"
$(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -DHAVE_LSX=1 -mlsx $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm
else
$(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -msse $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm
endif
else
$(error ERROR: This test makes sense only with KISSFFT_DATATYPE=simd)
endif

View File

@@ -70,8 +70,13 @@ int main(int argc,char ** argv)
nbytes *= nfft[k];
#ifdef USE_SIMD
#ifdef HAVE_LASX
numffts /= 8;
fprintf(stderr,"since SIMD implementation does 8 ffts at a time, numffts is being reduced to %d\n",numffts);
#else
numffts /= 4;
fprintf(stderr,"since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n",numffts);
#endif
#endif
buf=(kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes);

View File

@@ -22,7 +22,15 @@ static
kiss_fft_scalar rand_scalar(void)
{
#ifdef USE_SIMD
#ifdef HAVE_LASX
float tmp = rand()-RAND_MAX/2;
return (__m256)(__lasx_xvldrepl_w(&tmp, 0));
#elif defined(HAVE_LSX)
float tmp = rand()-RAND_MAX/2;
return (__m128)(__lsx_vldrepl_w(&tmp, 0));
#else
return _mm_set1_ps(rand()-RAND_MAX/2);
#endif
#else
kiss_fft_scalar s = (kiss_fft_scalar)(rand() -RAND_MAX/2);
return s/2;

View File

@@ -6,16 +6,30 @@ static void test1(void)
int n[2] = {256,256};
size_t nbytes = sizeof(kiss_fft_cpx)*n[0]*n[1];
#if defined(HAVE_LSX) || defined(HAVE_LASX)
kiss_fft_cpx * inbuf = NULL;
kiss_fft_cpx * outbuf = NULL;
if (posix_memalign((void**)&inbuf, 16, nbytes) ||
posix_memalign((void**)&outbuf, 16, nbytes))
return;
#else
kiss_fft_cpx * inbuf = _mm_malloc(nbytes,16);
kiss_fft_cpx * outbuf = _mm_malloc(nbytes,16);
#endif
memset(inbuf,0,nbytes);
memset(outbuf,0,nbytes);
kiss_fftnd_cfg cfg = kiss_fftnd_alloc(n,2,is_inverse,0,0);
kiss_fftnd(cfg,inbuf,outbuf);
kiss_fft_free(cfg);
#if defined(HAVE_LSX) || defined(HAVE_LASX)
free(inbuf);
free(outbuf);
#else
_mm_free(inbuf);
_mm_free(outbuf);
#endif
}
int main(void)

View File

@@ -38,8 +38,16 @@ double two_tone_test( int nfft, int bin1,int bin2)
/* generate a signal with two tones*/
for (i = 0; i < nfft; i++) {
#ifdef USE_SIMD
#ifdef HAVE_LASX
float tmp = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i);
tbuf[i] = (__m256)__lasx_xvldrepl_w(&tmp, 0);
#elif defined(HAVE_LSX)
float tmp = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i);
tbuf[i] = (__m128)__lsx_vldrepl_w(&tmp, 0);
#else
tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i)
+ (maxrange>>1)*cos(f2*i) );
#endif
#else
tbuf[i] = (maxrange>>1)*cos(f1*i)
+ (maxrange>>1)*cos(f2*i);

View File

@@ -154,8 +154,18 @@ kiss_fastfir_cfg kiss_fastfir_alloc(
for ( i=0; i < st->n_freq_bins; ++i ) {
#ifdef USE_SIMD
#ifdef HAVE_LASX
__m256 tmp = (__m256)__lasx_xvldrepl_w(&scale, 0);
st->fir_freq_resp[i].r = __lasx_xvfmul_s(tmp, st->fir_freq_resp[i].r);
st->fir_freq_resp[i].i = __lasx_xvfmul_s(tmp, st->fir_freq_resp[i].i);
#elif defined(HAVE_LSX)
__m128 tmp = (__m128)__lsx_vldrepl_w(&scale, 0);
st->fir_freq_resp[i].r = __lsx_vfmul_s(tmp, st->fir_freq_resp[i].r);
st->fir_freq_resp[i].i = __lsx_vfmul_s(tmp, st->fir_freq_resp[i].i);
#else
st->fir_freq_resp[i].r *= _mm_set1_ps(scale);
st->fir_freq_resp[i].i *= _mm_set1_ps(scale);
#endif
#else
st->fir_freq_resp[i].r *= scale;
st->fir_freq_resp[i].i *= scale;
@@ -286,7 +296,13 @@ void direct_file_filter(
tmph = imp_resp+nlag;
#ifdef REAL_FASTFIR
# ifdef USE_SIMD
#ifdef HAVE_LASX
outval = (__m256)(__lasx_xvreplgr2vr_w(0));
#elif defined(HAVE_LSX)
outval = (__m128)(__lsx_vreplgr2vr_w(0));
#else
outval = _mm_set1_ps(0);
#endif
#else
outval = 0;
#endif
@@ -297,7 +313,15 @@ void direct_file_filter(
outval += buf[k] * *tmph;
#else
# ifdef USE_SIMD
#ifdef HAVE_LASX
outval.i = (__m256)(__lasx_xvreplgr2vr_w(0));
outval.r = outval.i;
#elif defined(HAVE_LSX)
outval.i = (__m128)(__lsx_vreplgr2vr_w(0));
outval.r = outval.i;
#else
outval.r = outval.i = _mm_set1_ps(0);
#endif
#else
outval.r = outval.i = 0;
#endif