From eeb2e98ff601b063b2c52bf321965bddd5ac9ce8 Mon Sep 17 00:00:00 2001 From: yuanhecai Date: Wed, 27 Aug 2025 11:17:54 +0800 Subject: [PATCH] loongarch: add lasx simd opt --- CMakeLists.txt | 33 +++++++++++++-------------------- Makefile | 20 ++++++++++++++------ _kiss_fft_guts.h | 16 ++++++++++++++-- kiss_fft.h | 13 ++++++++++++- kiss_fftr.c | 10 ++++++++-- test/CMakeLists.txt | 10 ++++++++-- test/Makefile | 4 +++- test/benchkiss.c | 5 +++++ test/test_real.c | 5 ++++- test/test_simd.c | 4 ++-- test/twotonetest.c | 6 ++++-- tools/kiss_fastfir.c | 15 ++++++++++++--- 12 files changed, 99 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f120a3..fbe3e3d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -109,21 +109,6 @@ if(CMAKE_SYSTEM_NAME MATCHES "^(Linux|kFreeBSD|GNU)$" AND NOT CMAKE_CROSSCOMPILI include(GNUInstallDirs) endif() -include(CheckCXXSourceCompiles) - -if (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch32|loongarch64") - set (CMAKE_REQUIRED_FLAGS -mlsx) - check_cxx_source_compiles( - "int main() { - #if !defined(__loongarch_sx) - static_assert(false, \"__loongarch_sx is not defined\"); - #endif - return 0; - }" - COMPILER_SUPPORT_LSX_FLAG - ) -endif() - # # Declare PKGINCLUDEDIR for kissfft include path # @@ -182,7 +167,10 @@ else() if(KISSFFT_DATATYPE MATCHES "^simd$") list(APPEND KISSFFT_COMPILE_DEFINITIONS USE_SIMD) - if(COMPILER_SUPPORT_LSX_FLAG) + if(HAVE_LASX) + list(APPEND KISSFFT_COMPILE_DEFINITIONS HAVE_LASX) + target_compile_options(kissfft PRIVATE -mlasx) + elseif(HAVE_LSX) list(APPEND KISSFFT_COMPILE_DEFINITIONS HAVE_LSX) target_compile_options(kissfft PRIVATE -mlsx) elseif(NOT MSVC) @@ -280,8 +268,11 @@ function(add_kissfft_executable NAME) target_link_libraries(${NAME} PRIVATE m) endif() - if(COMPILER_SUPPORT_LSX_FLAG) - target_compile_options(${NAME} PRIVATE -mlsx) + if(HAVE_LASX) + target_compile_options(${NAME} PRIVATE -mlasx) + endif() + if(HAVE_LSX) + target_compile_options(${NAME} PRIVATE -mlsx) endif() if (NOT KISSFFT_OPENMP) @@ -351,8 +342,10 @@ if (KISSFFT_PKGCONFIG) join_paths(PKGCONFIG_KISSFFT_INCLUDEDIR "\${prefix}" "${CMAKE_INSTALL_INCLUDEDIR}") if(KISSFFT_DATATYPE MATCHES "^simd$") list(APPEND KISSFFT_COMPILE_DEFINITIONS USE_SIMD) - if(COMPILER_SUPPORT_LSX_FLAG) - set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -mlsx") + if(HAVE_LASX) + set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -mlasx") + elseif(HAVE_LSX) + set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -mlsx") elseif(NOT MSVC) set(PKG_KISSFFT_DEFS "${PKG_KISSFFT_DEFS} -msse") else() diff --git a/Makefile b/Makefile index 93f540f..566f5f3 100644 --- a/Makefile +++ b/Makefile @@ -74,8 +74,6 @@ ifneq ($(MAKECMDGOALS),clean) endif ifeq ($(_UNAME_ARCH),loongarch64) CANDIDATE_LIBDIR_NAME = lib64 - HAVE_LSX=lsx - export HAVE_LSX endif endif endif @@ -118,6 +116,8 @@ else endif export KISSFFTLIB_SHORTNAME +export HAVE_LSX +export HAVE_LASX # # Compile-time definitions by datatype @@ -132,7 +132,9 @@ ifeq "$(KISSFFT_DATATYPE)" "int32_t" else ifeq "$(KISSFFT_DATATYPE)" "int16_t" TYPEFLAGS += -DFIXED_POINT=16 else ifeq "$(KISSFFT_DATATYPE)" "simd" - ifeq "$(HAVE_LSX)" "lsx" + ifeq "$(HAVE_LASX)" "lasx" + TYPEFLAGS += -DUSE_SIMD=1 -DHAVE_LASX=1 -mlasx + else ifeq "$(HAVE_LSX)" "lsx" TYPEFLAGS += -DUSE_SIMD=1 -DHAVE_LSX=1 -mlsx else TYPEFLAGS += -DUSE_SIMD=1 -msse @@ -282,7 +284,9 @@ testall: $(MAKE) KISSFFT_DATATYPE=int16_t testsingle # The simd and int32_t types may or may not work on your machine $(MAKE) KISSFFT_DATATYPE=int32_t testsingle - @if [ "$(HAVE_LSX)" = "lsx" ]; then \ + @if [ "$(HAVE_LASX)" = "lasx" ]; then \ + $(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \ + elif [ "$(HAVE_LSX)" = "lsx" ]; then \ $(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx testsingle; \ else \ $(MAKE) KISSFFT_DATATYPE=simd testsingle; \ @@ -293,7 +297,9 @@ testall: $(MAKE) KISSFFT_DATATYPE=int16_t KISSFFT_STATIC=1 testsingle # The simd and int32_t types may or may not work on your machine $(MAKE) KISSFFT_DATATYPE=int32_t KISSFFT_STATIC=1 testsingle - @if [ "$(HAVE_LSX)" = "lsx" ]; then \ + @if [ "$(HAVE_LASX)" = "lasx" ]; then \ + $(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \ + elif [ "$(HAVE_LSX)" = "lsx" ]; then \ $(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx KISSFFT_STATIC=1 testsingle; \ else \ $(MAKE) KISSFFT_DATATYPE=simd KISSFFT_STATIC=1 testsingle; \ @@ -304,7 +310,9 @@ testall: $(MAKE) KISSFFT_DATATYPE=int16_t KISSFFT_OPENMP=1 testsingle # The simd and int32_t types may or may not work on your machine $(MAKE) KISSFFT_DATATYPE=int32_t KISSFFT_OPENMP=1 testsingle - @if [ "$(HAVE_LSX)" = "lsx" ]; then \ + @if [ "$(HAVE_LASX)" = "lasx" ]; then \ + $(MAKE) KISSFFT_DATATYPE=simd HAVE_LASX=lasx testsingle; \ + elif [ "$(HAVE_LSX)" = "lsx" ]; then \ $(MAKE) KISSFFT_DATATYPE=simd HAVE_LSX=lsx KISSFFT_OPENMP=1 testsingle; \ else \ $(MAKE) KISSFFT_DATATYPE=simd KISSFFT_OPENMP=1 testsingle; \ diff --git a/_kiss_fft_guts.h b/_kiss_fft_guts.h index 4f18089..f4b6832 100644 --- a/_kiss_fft_guts.h +++ b/_kiss_fft_guts.h @@ -128,7 +128,18 @@ struct kiss_fft_state{ # define KISS_FFT_COS(phase) floor(.5+SAMP_MAX * cos (phase)) # define KISS_FFT_SIN(phase) floor(.5+SAMP_MAX * sin (phase)) # define HALF_OF(x) ((x)>>1) -#elif defined(USE_SIMD) && defined(HAVE_LSX) +#elif defined(USE_SIMD) +#if defined(HAVE_LASX) +#define KISS_FFT_COS(phase) ({ \ + float __cos_val = cosf(phase); \ + (__m256)(__lasx_xvldrepl_w(&__cos_val, 0)); \ +}) +#define KISS_FFT_SIN(phase) ({ \ + float __sin_val = sinf(phase); \ + (__m256)(__lasx_xvldrepl_w(&__sin_val, 0)); \ +}) +#define HALF_OF(x) ((x) * (__m256)(__lasx_xvreplgr2vr_w(0x3F000000))) // 0.5f +#elif defined(HAVE_LSX) #define KISS_FFT_COS(phase) ({ \ float __cos_val = cosf(phase); \ (__m128)(__lsx_vldrepl_w(&__cos_val, 0)); \ @@ -138,10 +149,11 @@ struct kiss_fft_state{ (__m128)(__lsx_vldrepl_w(&__sin_val, 0)); \ }) #define HALF_OF(x) ((x) * (__m128)(__lsx_vreplgr2vr_w(0x3F000000))) // 0.5f -#elif defined(USE_SIMD) +#else # define KISS_FFT_COS(phase) _mm_set1_ps( cos(phase) ) # define KISS_FFT_SIN(phase) _mm_set1_ps( sin(phase) ) # define HALF_OF(x) ((x)*_mm_set1_ps(.5)) +#endif #else # define KISS_FFT_COS(phase) (kiss_fft_scalar) cos(phase) # define KISS_FFT_SIN(phase) (kiss_fft_scalar) sin(phase) diff --git a/kiss_fft.h b/kiss_fft.h index dc54844..51b6306 100644 --- a/kiss_fft.h +++ b/kiss_fft.h @@ -48,7 +48,18 @@ extern "C" { /* User may override KISS_FFT_MALLOC and/or KISS_FFT_FREE. */ #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX +# include +# define kiss_fft_scalar __m256 +# ifndef KISS_FFT_MALLOC +# define KISS_FFT_MALLOC(nbytes) aligned_alloc(32, KISS_FFT_ALIGN_SIZE_UP(nbytes)) +# define KISS_FFT_ALIGN_CHECK(ptr) +# define KISS_FFT_ALIGN_SIZE_UP(size) ((size + 31UL) & ~0x1FUL) +# endif +# ifndef KISS_FFT_FREE +# define KISS_FFT_FREE free +# endif +#elif defined(HAVE_LSX) # include # define kiss_fft_scalar __m128 # ifndef KISS_FFT_MALLOC diff --git a/kiss_fftr.c b/kiss_fftr.c index 6214a75..08b6d7b 100644 --- a/kiss_fftr.c +++ b/kiss_fftr.c @@ -93,7 +93,10 @@ void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *fr freqdata[0].r = tdc.r + tdc.i; freqdata[ncfft].r = tdc.r - tdc.i; #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX + freqdata[0].i = (__m256)(__lasx_xvreplgr2vr_w(0)); + freqdata[ncfft].i = freqdata[0].i; +#elif defined(HAVE_LSX) freqdata[0].i = (__m128)(__lsx_vreplgr2vr_w(0)); freqdata[ncfft].i = freqdata[0].i; #else @@ -151,7 +154,10 @@ void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *t C_ADD (st->tmpbuf[k], fek, fok); C_SUB (st->tmpbuf[ncfft - k], fek, fok); #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX + __m256 neg_one = (__m256)__lasx_xvreplgr2vr_w(0xBF800000); // -1.0f + st->tmpbuf[ncfft - k].i = __lasx_xvfmul_s(st->tmpbuf[ncfft - k].i, neg_one); +#elif defined(HAVE_LSX) __m128 neg_one = (__m128)__lsx_vreplgr2vr_w(0xBF800000); // -1.0f st->tmpbuf[ncfft - k].i = __lsx_vfmul_s(st->tmpbuf[ncfft - k].i, neg_one); #else diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 1e89f38..3de863c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,7 +2,11 @@ function(add_kissfft_test_executable NAME) add_kissfft_executable(${NAME} ${ARGN}) target_include_directories(${NAME} PRIVATE ..) - if(COMPILER_SUPPORT_LSX_FLAG) + if(HAVE_LASX) + target_compile_options(${NAME} PRIVATE -mlasx) + endif() + + if(HAVE_LSX) target_compile_options(${NAME} PRIVATE -mlsx) endif() @@ -50,7 +54,9 @@ add_kissfft_test_executable(testcpp testcpp.cc) if(KISSFFT_DATATYPE MATCHES "^simd$") add_kissfft_test_executable(tsimd test_simd.c) target_compile_definitions(tsimd PRIVATE USE_SIMD) - if(COMPILER_SUPPORT_LSX_FLAG) + if(HAVE_LASX) + target_compile_options(kissfft PRIVATE -mlasx) + elseif(HAVE_LSX) target_compile_options(kissfft PRIVATE -mlsx) elseif(NOT MSVC) target_compile_options(kissfft PRIVATE -msse) diff --git a/test/Makefile b/test/Makefile index 5c3449d..714c404 100644 --- a/test/Makefile +++ b/test/Makefile @@ -143,7 +143,9 @@ endif $(TESTSIMD): test_simd.c ifeq "$(KISSFFT_DATATYPE)" "simd" -ifeq "$(HAVE_LSX)" "lsx" +ifeq "$(HAVE_LASX)" "lasx" + $(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -DHAVE_LASX=1 -mlasx $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm +else ifeq "$(HAVE_LSX)" "lsx" $(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -DHAVE_LSX=1 -mlsx $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm else $(CC) -o $@ -g $(CFLAGS) -DUSE_SIMD=1 -msse $< -L.. -l$(KISSFFTLIB_SHORTNAME) -lm diff --git a/test/benchkiss.c b/test/benchkiss.c index 7dae65d..ac0d98c 100644 --- a/test/benchkiss.c +++ b/test/benchkiss.c @@ -70,8 +70,13 @@ int main(int argc,char ** argv) nbytes *= nfft[k]; #ifdef USE_SIMD +#ifdef HAVE_LASX + numffts /= 8; + fprintf(stderr,"since SIMD implementation does 8 ffts at a time, numffts is being reduced to %d\n",numffts); +#else numffts /= 4; fprintf(stderr,"since SIMD implementation does 4 ffts at a time, numffts is being reduced to %d\n",numffts); +#endif #endif buf=(kiss_fft_cpx*)KISS_FFT_MALLOC(nbytes); diff --git a/test/test_real.c b/test/test_real.c index 9412305..52e9be8 100644 --- a/test/test_real.c +++ b/test/test_real.c @@ -22,7 +22,10 @@ static kiss_fft_scalar rand_scalar(void) { #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX + float tmp = rand()-RAND_MAX/2; + return (__m256)(__lasx_xvldrepl_w(&tmp, 0)); +#elif defined(HAVE_LSX) float tmp = rand()-RAND_MAX/2; return (__m128)(__lsx_vldrepl_w(&tmp, 0)); #else diff --git a/test/test_simd.c b/test/test_simd.c index bb8ffd3..25a6097 100644 --- a/test/test_simd.c +++ b/test/test_simd.c @@ -6,7 +6,7 @@ static void test1(void) int n[2] = {256,256}; size_t nbytes = sizeof(kiss_fft_cpx)*n[0]*n[1]; -#ifdef HAVE_LSX +#if defined(HAVE_LSX) || defined(HAVE_LASX) kiss_fft_cpx * inbuf = NULL; kiss_fft_cpx * outbuf = NULL; if (posix_memalign((void**)&inbuf, 16, nbytes) || @@ -23,7 +23,7 @@ static void test1(void) kiss_fftnd(cfg,inbuf,outbuf); kiss_fft_free(cfg); -#ifdef HAVE_LSX +#if defined(HAVE_LSX) || defined(HAVE_LASX) free(inbuf); free(outbuf); #else diff --git a/test/twotonetest.c b/test/twotonetest.c index 4f896bb..e93fac3 100644 --- a/test/twotonetest.c +++ b/test/twotonetest.c @@ -38,10 +38,12 @@ double two_tone_test( int nfft, int bin1,int bin2) /* generate a signal with two tones*/ for (i = 0; i < nfft; i++) { #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX + float tmp = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i); + tbuf[i] = (__m256)__lasx_xvldrepl_w(&tmp, 0); +#elif defined(HAVE_LSX) float tmp = (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i); tbuf[i] = (__m128)__lsx_vldrepl_w(&tmp, 0); - #else tbuf[i] = _mm_set1_ps( (maxrange>>1)*cos(f1*i) + (maxrange>>1)*cos(f2*i) ); diff --git a/tools/kiss_fastfir.c b/tools/kiss_fastfir.c index 6967589..e066328 100644 --- a/tools/kiss_fastfir.c +++ b/tools/kiss_fastfir.c @@ -154,7 +154,11 @@ kiss_fastfir_cfg kiss_fastfir_alloc( for ( i=0; i < st->n_freq_bins; ++i ) { #ifdef USE_SIMD -#ifdef HAVE_LSX +#ifdef HAVE_LASX + __m256 tmp = (__m256)__lasx_xvldrepl_w(&scale, 0); + st->fir_freq_resp[i].r = __lasx_xvfmul_s(tmp, st->fir_freq_resp[i].r); + st->fir_freq_resp[i].i = __lasx_xvfmul_s(tmp, st->fir_freq_resp[i].i); +#elif defined(HAVE_LSX) __m128 tmp = (__m128)__lsx_vldrepl_w(&scale, 0); st->fir_freq_resp[i].r = __lsx_vfmul_s(tmp, st->fir_freq_resp[i].r); st->fir_freq_resp[i].i = __lsx_vfmul_s(tmp, st->fir_freq_resp[i].i); @@ -292,7 +296,9 @@ void direct_file_filter( tmph = imp_resp+nlag; #ifdef REAL_FASTFIR # ifdef USE_SIMD -# ifdef HAVE_LSX +#ifdef HAVE_LASX + outval = (__m256)(__lasx_xvreplgr2vr_w(0)); +#elif defined(HAVE_LSX) outval = (__m128)(__lsx_vreplgr2vr_w(0)); #else outval = _mm_set1_ps(0); @@ -307,7 +313,10 @@ void direct_file_filter( outval += buf[k] * *tmph; #else # ifdef USE_SIMD -# ifdef HAVE_LSX +#ifdef HAVE_LASX + outval.i = (__m256)(__lasx_xvreplgr2vr_w(0)); + outval.r = outval.i; +#elif defined(HAVE_LSX) outval.i = (__m128)(__lsx_vreplgr2vr_w(0)); outval.r = outval.i; #else