mirror of
https://github.com/YACReader/yacreader
synced 2026-04-12 15:49:53 -04:00
Some checks failed
Build / Initialization (push) Has been cancelled
Build / Code Format Validation (push) Has been cancelled
Build / Linux (Qt6) (push) Has been cancelled
Build / Linux (Qt6 + 7zip) (push) Has been cancelled
Build / macOS (Qt6 Universal) (push) Has been cancelled
Build / Windows x64 (Qt6) (push) Has been cancelled
Build / Windows ARM64 (Qt6) (push) Has been cancelled
Build / Docker amd64 Image (push) Has been cancelled
Build / Docker arm64 Image (push) Has been cancelled
Build / Publish Dev Builds (push) Has been cancelled
Build / Publish Release (push) Has been cancelled
Build / Publish YACReader10 Pre-release Builds (push) Has been cancelled
2384 lines
71 KiB
C++
2384 lines
71 KiB
C++
/**
|
|
* @file lancir.h
|
|
*
|
|
* @version 3.1
|
|
*
|
|
* @brief Self-contained header-only "LANCIR" image resizing algorithm.
|
|
*
|
|
* This is a self-contained inclusion file for the "LANCIR" image resizer,
|
|
* a part of the AVIR library. Features scalar, AVX, SSE2, NEON, and WASM
|
|
* SIMD128 optimizations as well as batched resizing technique which provides
|
|
* a better CPU cache performance.
|
|
*
|
|
* AVIR Copyright (c) 2015-2025 Aleksey Vaneev
|
|
*
|
|
* @mainpage
|
|
*
|
|
* @section intro_sec Introduction
|
|
*
|
|
* Description is available at https://github.com/avaneev/avir
|
|
*
|
|
* @section license License
|
|
*
|
|
* LICENSE:
|
|
*
|
|
* Copyright (c) 2015-2025 Aleksey Vaneev
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
* to deal in the Software without restriction, including without limitation
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in
|
|
* all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
|
* DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
#ifndef AVIR_CLANCIR_INCLUDED
|
|
#define AVIR_CLANCIR_INCLUDED
|
|
|
|
#include <cmath>
|
|
#include <cstring>
|
|
|
|
#if __cplusplus >= 201103L
|
|
|
|
#include <cstdint>
|
|
|
|
#else // __cplusplus >= 201103L
|
|
|
|
#include <stdint.h>
|
|
|
|
#endif // __cplusplus >= 201103L
|
|
|
|
/**
|
|
* @def LANCIR_ALIGN
|
|
* @brief Address alignment (granularity) used by resizing functions,
|
|
* in bytes.
|
|
*/
|
|
|
|
/**
|
|
* @def LANCIR_NULLPTR
|
|
* @brief Macro is defined, if `nullptr` workaround is in use, for pre-C++11
|
|
* compilers. Undefined at the end of file.
|
|
*/
|
|
|
|
#if defined(__AVX__)
|
|
|
|
#include <immintrin.h>
|
|
|
|
#define LANCIR_AVX
|
|
#define LANCIR_SSE2 // Some functions use SSE2; AVX has a higher priority.
|
|
#define LANCIR_ALIGN 32
|
|
|
|
#elif defined(__aarch64__) || defined(__arm64__) || \
|
|
defined(_M_ARM64) || defined(_M_ARM64EC)
|
|
|
|
#if defined(_MSC_VER)
|
|
#include <arm64_neon.h>
|
|
|
|
#if _MSC_VER < 1925
|
|
#define LANCIR_ARM32 // Do not use some newer NEON intrinsics.
|
|
#endif // _MSC_VER < 1925
|
|
#else // defined( _MSC_VER )
|
|
#include <arm_neon.h>
|
|
#endif // defined( _MSC_VER )
|
|
|
|
#define LANCIR_NEON
|
|
#define LANCIR_ALIGN 16
|
|
|
|
#elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM)
|
|
|
|
#include <arm_neon.h>
|
|
|
|
#define LANCIR_ARM32
|
|
#define LANCIR_NEON
|
|
#define LANCIR_ALIGN 16
|
|
|
|
#elif defined(__SSE2__) || defined(_M_AMD64) || \
|
|
(defined(_M_IX86_FP) && _M_IX86_FP == 2)
|
|
|
|
#if defined(_MSC_VER)
|
|
#include <intrin.h>
|
|
#else // defined( _MSC_VER )
|
|
#include <emmintrin.h>
|
|
#endif // defined( _MSC_VER )
|
|
|
|
#define LANCIR_SSE2
|
|
#define LANCIR_ALIGN 16
|
|
|
|
#elif defined(__wasm_simd128__)
|
|
|
|
#include <wasm_simd128.h>
|
|
|
|
#define LANCIR_WASM
|
|
#define LANCIR_ALIGN 16
|
|
|
|
#else // WASM
|
|
|
|
#define LANCIR_ALIGN 4
|
|
|
|
#endif // WASM
|
|
|
|
#if defined(LANCIR_SSE2)
|
|
|
|
#define lancvec_t __m128
|
|
#define lancvec_const_splat(v) _mm_set1_ps(v)
|
|
#define lancvec_load(m) _mm_load_ps(m)
|
|
#define lancvec_loadu(m) _mm_loadu_ps(m)
|
|
#define lancvec_store(m, v) _mm_store_ps(m, v)
|
|
#define lancvec_storeu(m, v) _mm_storeu_ps(m, v)
|
|
#define lancvec_add(v1, v2) _mm_add_ps(v1, v2)
|
|
#define lancvec_mul(v1, v2) _mm_mul_ps(v1, v2)
|
|
#define lancvec_min(v1, v2) _mm_min_ps(v1, v2)
|
|
#define lancvec_max(v1, v2) _mm_max_ps(v1, v2)
|
|
#define lancvec_madd(va, v1, v2) _mm_add_ps(va, _mm_mul_ps(v1, v2))
|
|
#define lancvec_addhl(vl, vh) _mm_add_ps(vl, _mm_movehl_ps(vh, vh))
|
|
#define lancvec_store32_addhl(m, v) \
|
|
_mm_store_ss(m, _mm_add_ss(v, _mm_shuffle_ps(v, v, 1)))
|
|
|
|
#define lancvec_store64_addhl(m, v) \
|
|
_mm_storel_pi((__m64 *)(m), lancvec_addhl(v, v))
|
|
|
|
#elif defined(LANCIR_NEON)
|
|
|
|
#define lancvec_t float32x4_t
|
|
#define lancvec_const_splat(v) vdupq_n_f32(v)
|
|
#define lancvec_load(m) vld1q_f32(m)
|
|
#define lancvec_store(m, v) vst1q_f32(m, v)
|
|
#define lancvec_add(v1, v2) vaddq_f32(v1, v2)
|
|
#define lancvec_mul(v1, v2) vmulq_f32(v1, v2)
|
|
#define lancvec_min(v1, v2) vminq_f32(v1, v2)
|
|
#define lancvec_max(v1, v2) vmaxq_f32(v1, v2)
|
|
#define lancvec_madd(va, v1, v2) vmlaq_f32(va, v1, v2)
|
|
|
|
#if defined(LANCIR_ARM32)
|
|
#define lancvec_store32_hadd(m, v) \
|
|
{ \
|
|
const float32x2_t v2 = vadd_f32(vget_high_f32(v), \
|
|
vget_low_f32(v)); \
|
|
*(m) = vget_lane_f32(v2, 0) + \
|
|
vget_lane_f32(v2, 1); \
|
|
} \
|
|
(void)0
|
|
#else // defined( LANCIR_ARM32 )
|
|
#define lancvec_store32_hadd(m, v) *(m) = vaddvq_f32(v)
|
|
#endif // defined( LANCIR_ARM32 )
|
|
|
|
#define lancvec_store64_addhl(m, v) \
|
|
vst1_f32(m, vadd_f32(vget_high_f32(v), vget_low_f32(v)));
|
|
|
|
#elif defined(LANCIR_WASM)
|
|
|
|
#define lancvec_t v128_t
|
|
#define lancvec_const_splat(v) wasm_f32x4_const_splat(v)
|
|
#define lancvec_load32_splat(m) wasm_v128_load32_splat(m)
|
|
#define lancvec_load(m) wasm_v128_load(m)
|
|
#define lancvec_store(m, v) wasm_v128_store(m, v)
|
|
#define lancvec_add(v1, v2) wasm_f32x4_add(v1, v2)
|
|
#define lancvec_mul(v1, v2) wasm_f32x4_mul(v1, v2)
|
|
#define lancvec_min(v1, v2) wasm_f32x4_min(v1, v2)
|
|
#define lancvec_max(v1, v2) wasm_f32x4_max(v1, v2)
|
|
#define lancvec_madd(va, v1, v2) wasm_f32x4_add(va, \
|
|
wasm_f32x4_mul(v1, v2))
|
|
|
|
#define lancvec_addhl(vl, vh) wasm_f32x4_add(vl, \
|
|
wasm_i32x4_shuffle(vh, vh, 6, 7, 2, 3))
|
|
|
|
#define lancvec_store32_addhl(m, v) \
|
|
*(m) = (wasm_f32x4_extract_lane(v, 0) + \
|
|
wasm_f32x4_extract_lane(v, 1))
|
|
|
|
#define lancvec_store64_addhl(m, v) \
|
|
wasm_v128_store64_lane(m, lancvec_addhl(v, v), 0)
|
|
|
|
#endif // defined( LANCIR_WASM )
|
|
|
|
#if LANCIR_ALIGN > 4
|
|
|
|
#if !defined(lancvec_load32_splat)
|
|
#define lancvec_load32_splat(m) lancvec_const_splat(*(m))
|
|
#endif // !defined( lancvec_load32_splat )
|
|
|
|
#if !defined(lancvec_loadu)
|
|
#define lancvec_loadu(m) lancvec_load(m)
|
|
#endif // !defined( lancvec_loadu )
|
|
|
|
#if !defined(lancvec_storeu)
|
|
#define lancvec_storeu(m, v) lancvec_store(m, v)
|
|
#endif // !defined( lancvec_storeu )
|
|
|
|
#if !defined(lancvec_store32_hadd)
|
|
#define lancvec_store32_hadd(m, v) \
|
|
{ \
|
|
const lancvec_t v2 = lancvec_addhl(v, v); \
|
|
lancvec_store32_addhl(m, v2); \
|
|
} \
|
|
(void)0
|
|
#endif // !defined( lancvec_store32_hadd )
|
|
|
|
#endif // LANCIR_ALIGN > 4
|
|
|
|
namespace avir {
|
|
|
|
using std ::ceil;
|
|
using std ::cos;
|
|
using std ::fabs;
|
|
using std ::floor;
|
|
using std ::memcpy;
|
|
using std ::memset;
|
|
using std ::sin;
|
|
using std ::size_t;
|
|
|
|
#if __cplusplus >= 201103L
|
|
|
|
using std ::intptr_t;
|
|
using std ::uintptr_t;
|
|
|
|
#else // __cplusplus >= 201103L
|
|
|
|
// Workaround for pre-C++11 compilers. `nullptr` is a keyword, and not a
|
|
// macro, but check if such workaround is already in place.
|
|
|
|
#if !defined(nullptr)
|
|
#define nullptr NULL
|
|
#define LANCIR_NULLPTR
|
|
#endif // !defined( nullptr )
|
|
|
|
#endif // __cplusplus >= 201103L
|
|
|
|
/**
 * @brief LANCIR resizing parameters class.
 *
 * An object of this class, which can be allocated on stack, can be used to
 * pass non-default parameters to the resizing algorithm. See the constructor
 * for the default values.
 */

class CLancIRParams
{
public:
    int SrcSSize; ///< Physical size of the source scanline, in elements (not
        ///< bytes). If this value is below 1, `SrcWidth * ElCount` will be
        ///< used.
    int NewSSize; ///< Physical size of the destination scanline, in elements
        ///< (not bytes). If this value is below 1, `NewWidth * ElCount` will
        ///< be used.
    double kx; ///< Resizing step - horizontal (one output pixel corresponds
        ///< to `k` input pixels). A downsizing factor if greater than 1.0;
        ///< upsizing factor if below or equal to 1.0. Multiply by -1 if you
        ///< would like to bypass `ox` and `oy` adjustment which is done by
        ///< default to produce a centered image. If this step value equals 0,
        ///< the step value will be chosen automatically.
    double ky; ///< Resizing step - vertical. Same as `kx`.
    double ox; ///< Start X pixel offset within the source image, can be
        ///< negative. A positive offset moves the image to the left.
    double oy; ///< Start Y pixel offset within the source image, can be
        ///< negative. A positive offset moves the image to the top.
    double la; ///< Lanczos window function's `a` parameter, greater or equal
        ///< to 2.0. Values below 2.0 are rejected by resizeImage().

    /**
     * @brief Default constructor, with optional arguments that correspond to
     * class variables.
     *
     * @param aSrcSSize Physical size of the source scanline.
     * @param aNewSSize Physical size of the destination scanline.
     * @param akx Resizing step - horizontal.
     * @param aky Resizing step - vertical.
     * @param aox Start X pixel offset.
     * @param aoy Start Y pixel offset.
     * @param ala Lanczos `a` parameter; defaults to 3.0 (the previous
     * fixed value), so existing callers are unaffected.
     */

    CLancIRParams(const int aSrcSSize = 0, const int aNewSSize = 0,
            const double akx = 0.0, const double aky = 0.0,
            const double aox = 0.0, const double aoy = 0.0,
            const double ala = 3.0)
        : SrcSSize(aSrcSSize), NewSSize(aNewSSize), kx(akx), ky(aky),
          ox(aox), oy(aoy), la(ala)
    {
    }
};
|
|
|
|
/**
|
|
* @brief LANCIR image resizer class.
|
|
*
|
|
* The object of this class can be used to resize 1-4 channel images to any
|
|
* required size. Resizing is performed by utilizing Lanczos filters, with
|
|
* 8-bit precision. This class offers a kind of "optimal" Lanczos resampling
|
|
* implementation.
|
|
*
|
|
* Object of this class can be allocated on stack.
|
|
*
|
|
* Note that object of this class does not free temporary buffers and
|
|
* variables after the resizeImage() function call (until object's
|
|
* destruction): these buffers are reused (or reallocated) on subsequent
|
|
* calls, thus making batch resizing of images faster. This means resizing is
|
|
* not thread-safe: a separate CLancIR object should be created for each
|
|
* thread.
|
|
*/
|
|
|
|
class CLancIR
|
|
{
|
|
private:
|
|
// Copying is intentionally unsupported: the object owns raw heap buffers
// (FltBuf0, spv0), and a memberwise copy would double-free them. Declared
// private with empty bodies as the pre-C++11 "non-copyable" idiom (the
// file targets pre-C++11 compilers, so `= delete` is not used).

CLancIR(const CLancIR &)
{
    // Unsupported.
}

CLancIR &operator=(const CLancIR &)
{
    // Unsupported.
    return (*this);
}
|
|
|
|
public:
|
|
// Default constructor: all owned buffers start empty; they are lazily
// (re)allocated by resizeImage() via reallocBuf(). Note that `FltBuf` (the
// aligned alias of `FltBuf0`) is left unset here - it is assigned on the
// first reallocBuf() call, which always occurs before `FltBuf` is read.
CLancIR()
    : FltBuf0(nullptr), FltBuf0Len(0), spv0(nullptr), spv0len(0), spv(nullptr)
{
}

// Destructor releases the two owned heap buffers. `delete[]` on `nullptr`
// is a no-op, so no null checks are needed.
~CLancIR()
{
    delete[] FltBuf0;

    delete[] spv0;
}
|
|
|
|
/**
 * @brief Function resizes an image.
 *
 * Performs input-to-output type conversion, if necessary.
 *
 * @param[in] SrcBuf Source image buffer.
 * @param SrcWidth Source image width, in pixels.
 * @param SrcHeight Source image height, in pixels.
 * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal
 * to `SrcBuf`.
 * @param NewWidth New image width, in pixels.
 * @param NewHeight New image height, in pixels.
 * @param ElCount The number of elements (channels) used to store each
 * source and destination pixel (1-4).
 * @param aParams Custom resizing parameters. Can be `nullptr`, for
 * default values.
 * @tparam Tin Input buffer's element type. Can be `uint8_t` (`0..255`
 * value range), `uint16_t` (`0..65535` value range), `float` (`0..1`
 * value range), `double` (`0..1` value range). `uint32_t` type is treated
 * as `uint16_t`. Signed integer types and larger integer types are not
 * supported.
 * @tparam Tout Output buffer's element type, treated like `Tin`. If `Tin`
 * and `Tout` types do not match, an output value scaling will be applied.
 * Floating-point output will not be clamped/clipped/saturated; integer
 * output is always rounded and clamped.
 * @return The number of available output scanlines. Equals to
 * `NewHeight`, or 0 on function parameters error.
 */

template<typename Tin, typename Tout>
int resizeImage(const Tin *const SrcBuf, const int SrcWidth,
    const int SrcHeight, Tout *const NewBuf, const int NewWidth,
    const int NewHeight, const int ElCount,
    const CLancIRParams *const aParams = nullptr)
{
    // Parameter sanity checks. Bitwise `|` instead of `||` is a deliberate
    // branchless combination of the independent conditions.

    if ((SrcWidth < 0) | (SrcHeight < 0) |
        (NewWidth <= 0) | (NewHeight <= 0) |
        (SrcBuf == nullptr) | (NewBuf == nullptr) |
        ((const void *)SrcBuf == (const void *)NewBuf)) {
        return (0);
    }

    static const CLancIRParams DefParams;

    const CLancIRParams &Params = (aParams != nullptr ? *aParams : DefParams);

    if (Params.la < 2.0) {
        return (0); // Lanczos `a` below 2.0 is unsupported.
    }

    const int OutSLen = NewWidth * ElCount; // Output scanline's data length.

    const size_t NewScanlineSize = (size_t)(Params.NewSSize < 1 ? OutSLen : Params.NewSSize);

    // Zero-size source: produce an all-zero output image of the requested
    // dimensions and report success.

    if ((SrcWidth == 0) | (SrcHeight == 0)) {
        Tout *op = NewBuf;
        int i;

        for (i = 0; i < NewHeight; i++) {
            memset(op, 0, (size_t)OutSLen * sizeof(Tout));
            op += NewScanlineSize;
        }

        return (NewHeight);
    }

    const size_t SrcScanlineSize = (size_t)(Params.SrcSSize < 1 ? SrcWidth * ElCount : Params.SrcSSize);

    // Derive effective resizing steps and offsets. A non-negative `kx`/`ky`
    // of 0 selects the automatic step; a positive step also gets the
    // centering offset adjustment. A negative step bypasses centering
    // (its magnitude is used as-is).

    double ox = Params.ox;
    double oy = Params.oy;
    double kx;
    double ky;

    if (Params.kx >= 0.0) {
        kx = (Params.kx == 0.0 ? (double)SrcWidth / NewWidth : Params.kx);

        ox += (kx - 1.0) * 0.5;
    } else {
        kx = -Params.kx;
    }

    if (Params.ky >= 0.0) {
        ky = (Params.ky == 0.0 ? (double)SrcHeight / NewHeight : Params.ky);

        oy += (ky - 1.0) * 0.5;
    } else {
        ky = -Params.ky;
    }

    // Rebuild the vertical filter bank if stepping changed; this
    // invalidates both scanline position caches.

    if (rfv.update(Params.la, ky, ElCount)) {
        rsv.reset();
        rsh.reset();
    }

    CResizeFilters *rfh; // Pointer to resizing filters for horizontal
    // resizing, may equal to `rfv` if the same stepping is in use.

    if (kx == ky) {
        rfh = &rfv;
    } else {
        rfh = &rfh0;

        if (rfh0.update(Params.la, kx, ElCount)) {
            rsh.reset();
        }
    }

    rsv.update(SrcHeight, NewHeight, oy, rfv, spv);
    rsh.update(SrcWidth, NewWidth, ox, *rfh);

    // Calculate vertical progressive resizing's batch size. Progressive
    // batching is used to try to keep addressing within the cache
    // capacity. This technique definitely works well for single-threaded
    // resizing on most CPUs, but may not provide an additional benefit
    // for multi-threaded resizing, or in a system-wide high-load
    // situations.

    const size_t FltWidthE = (size_t)((rsh.padl + SrcWidth +
        rsh.padr) *
        ElCount);

    const double CacheSize = 5500000.0; // Tuned for various CPUs.
    const double OpSize = (double)SrcScanlineSize * SrcHeight *
        sizeof(Tin) +
        (double)FltWidthE * NewHeight * sizeof(float);

    int BatchSize = (int)(NewHeight * CacheSize / (OpSize + 1.0));

    if (BatchSize < 8) {
        BatchSize = 8;
    }

    if (BatchSize > NewHeight) {
        BatchSize = NewHeight;
    }

    // Allocate/resize intermediate buffers.

    const int svs = (rsv.padl + SrcHeight + rsv.padr) * ElCount;
    float *const pspv0 = spv0;
    reallocBuf(spv0, spv, spv0len, (svs > OutSLen ? svs : OutSLen));
    reallocBuf(FltBuf0, FltBuf, FltBuf0Len,
        FltWidthE * (size_t)BatchSize);

    // If the scanline buffer moved, the positions cached in `rsv`
    // reference stale addresses and must be recomputed.

    if (spv0 != pspv0) {
        rsv.updateSPO(rfv, spv);
    }

    // Prepare output-related constants. `(T)0.25f != 0` detects a
    // floating-point element type at compile time (integer casts of 0.25
    // truncate to 0).

    static const bool IsInFloat = ((Tin)0.25f != 0);
    static const bool IsOutFloat = ((Tout)0.25f != 0);
    static const bool IsUnityMul = (IsInFloat && IsOutFloat) ||
        (IsInFloat == IsOutFloat && sizeof(Tin) == sizeof(Tout));

    const float Clamp = (sizeof(Tout) == 1 ? 255.0f : 65535.0f);
    const float OutMul = (IsOutFloat ? 1.0f : Clamp) /
        (IsInFloat ? 1.0f : (sizeof(Tin) == 1 ? 255.0f : 65535.0f));

    // Perform batched resizing.

    const CResizePos *rpv = rsv.pos;
    Tout *opn = NewBuf;
    int bl = NewHeight;

    while (bl > 0) {
        const int bc = (bl > BatchSize ? BatchSize : bl);

        int kl = rfv.KernelLen;
        const Tin *ip = SrcBuf;
        float *op = FltBuf + rsh.padl * ElCount;

        const int so = (int)rpv[0].so;
        float *const sp = spv + so * ElCount;

        int cc = (int)rpv[bc - 1].so - so + kl; // Pixel copy count.
        int rl = 0; // Leftmost pixel's replication count.
        int rr = 0; // Rightmost pixel's replication count.

        const int socc = so + cc;
        const int spe = rsv.padl + SrcHeight;

        // Calculate scanline copying and padding parameters, depending on
        // the batch's size and its vertical offset.

        if (so < rsv.padl) {
            if (socc <= rsv.padl) {
                // Batch lies entirely in the top padding region.
                rl = cc;
                cc = 0;
            } else {
                if (socc > spe) {
                    rr = socc - spe;
                    cc -= rr;
                }

                rl = rsv.padl - so;
                cc -= rl;
            }
        } else {
            if (so >= spe) {
                // Batch lies entirely below the source image.
                rr = cc;
                cc = 0;
                ip += (size_t)SrcHeight * SrcScanlineSize;
            } else {
                if (socc > spe) {
                    rr = socc - spe;
                    cc -= rr;
                }

                ip += (size_t)(so - rsv.padl) * SrcScanlineSize;
            }
        }

        // Batched vertical resizing. One dispatch branch per channel
        // count, so the per-column helpers stay tight.

        int i;

        if (ElCount == 1) {
            for (i = 0; i < SrcWidth; i++) {
                copyScanline1v(ip, SrcScanlineSize, sp, cc, rl, rr);
                resize1<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                ip += 1;
                op += 1;
            }
        } else if (ElCount == 2) {
            for (i = 0; i < SrcWidth; i++) {
                copyScanline2v(ip, SrcScanlineSize, sp, cc, rl, rr);
                resize2<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                ip += 2;
                op += 2;
            }
        } else if (ElCount == 3) {
            for (i = 0; i < SrcWidth; i++) {
                copyScanline3v(ip, SrcScanlineSize, sp, cc, rl, rr);
                resize3<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                ip += 3;
                op += 3;
            }
        } else // ElCount == 4
        {
            for (i = 0; i < SrcWidth; i++) {
                copyScanline4v(ip, SrcScanlineSize, sp, cc, rl, rr);
                resize4<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                ip += 4;
                op += 4;
            }
        }

        // Perform horizontal resizing batch, and produce final output.

        float *ipf = FltBuf;
        kl = rfh->KernelLen;

        if (ElCount == 1) {
            for (i = 0; i < bc; i++) {
                padScanline1h(ipf, rsh, SrcWidth);
                resize1<true>(ipf, spv, 1, rsh.pos, kl, NewWidth);
                outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                    OutSLen, Clamp, OutMul);

                ipf += FltWidthE;
                opn += NewScanlineSize;
            }
        } else if (ElCount == 2) {
            for (i = 0; i < bc; i++) {
                padScanline2h(ipf, rsh, SrcWidth);
                resize2<true>(ipf, spv, 2, rsh.pos, kl, NewWidth);
                outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                    OutSLen, Clamp, OutMul);

                ipf += FltWidthE;
                opn += NewScanlineSize;
            }
        } else if (ElCount == 3) {
            for (i = 0; i < bc; i++) {
                padScanline3h(ipf, rsh, SrcWidth);
                resize3<true>(ipf, spv, 3, rsh.pos, kl, NewWidth);
                outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                    OutSLen, Clamp, OutMul);

                ipf += FltWidthE;
                opn += NewScanlineSize;
            }
        } else // ElCount == 4
        {
            for (i = 0; i < bc; i++) {
                padScanline4h(ipf, rsh, SrcWidth);
                resize4<true>(ipf, spv, 4, rsh.pos, kl, NewWidth);
                outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                    OutSLen, Clamp, OutMul);

                ipf += FltWidthE;
                opn += NewScanlineSize;
            }
        }

        rpv += bc;
        bl -= bc;
    }

    return (NewHeight);
}
|
|
|
|
/**
|
|
* @brief Legacy image resizing function.
|
|
*
|
|
* Not recommended for new projects. See the prior resizeImage() function
|
|
* and CLancIRParams class for details.
|
|
*
|
|
* @param[in] SrcBuf Source image buffer.
|
|
* @param SrcWidth Source image width, in pixels.
|
|
* @param SrcHeight Source image height, in pixels.
|
|
* @param SrcSSize Physical size of the source scanline, in elements (not
|
|
* bytes).
|
|
* @param[out] NewBuf Buffer to accept the resized image. Cannot be equal
|
|
* to SrcBuf.
|
|
* @param NewWidth New image width, in pixels.
|
|
* @param NewHeight New image height, in pixels.
|
|
* @param NewSSize Physical size of the destination scanline, in elements
|
|
* (not bytes).
|
|
* @param ElCount The number of elements (channels) used to store each
|
|
* source and destination pixel (1-4).
|
|
* @param kx0 Resizing step - horizontal.
|
|
* @param ky0 Resizing step - vertical. Same as `kx0`.
|
|
* @param ox Start X pixel offset within the source image.
|
|
* @param oy Start Y pixel offset within the source image.
|
|
* @tparam Tin Input buffer's element type.
|
|
* @tparam Tout Output buffer's element type.
|
|
* @return The number of available output scanlines. Equals to
|
|
* `NewHeight`, or 0 on function parameters error.
|
|
*/
|
|
|
|
template<typename Tin, typename Tout>
|
|
int resizeImage(const Tin *const SrcBuf, const int SrcWidth,
|
|
const int SrcHeight, const int SrcSSize, Tout *const NewBuf,
|
|
const int NewWidth, const int NewHeight, const int NewSSize,
|
|
const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0,
|
|
double ox = 0.0, double oy = 0.0)
|
|
{
|
|
const CLancIRParams Params(SrcSSize, NewSSize, kx0, ky0, ox, oy);
|
|
|
|
return (resizeImage(SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,
|
|
NewHeight, ElCount, &Params));
|
|
}
|
|
|
|
protected:
|
|
float *FltBuf0; ///< Intermediate resizing buffer.
|
|
size_t FltBuf0Len; ///< Length of `FltBuf0`.
|
|
float *FltBuf; ///< Address-aligned `FltBuf0`.
|
|
float *spv0; ///< Scanline buffer for vertical resizing, also used at the
|
|
///< output stage.
|
|
int spv0len; ///< Length of `spv0`.
|
|
float *spv; ///< Address-aligned `spv0`.
|
|
|
|
/**
|
|
* @brief Typed buffer reallocation function, with address alignment.
|
|
*
|
|
* Function reallocates a typed buffer if its current length is
|
|
* smaller than the required length, applies `LANCIR_ALIGN` address
|
|
* alignment to the buffer pointer.
|
|
*
|
|
* @param buf0 Reference to the pointer of the previously allocated
|
|
* buffer.
|
|
* @param buf Reference to address-aligned `buf0` pointer.
|
|
* @param len The current length of the `buf0`.
|
|
* @param newlen A new required length.
|
|
* @tparam Tb Buffer element type.
|
|
* @tparam Tl Length variable type.
|
|
*/
|
|
|
|
template<typename Tb, typename Tl>
|
|
static void reallocBuf(Tb *&buf0, Tb *&buf, Tl &len, Tl newlen)
|
|
{
|
|
newlen += LANCIR_ALIGN;
|
|
|
|
if (newlen > len) {
|
|
if (buf0 != nullptr) {
|
|
delete[] buf0;
|
|
buf0 = nullptr;
|
|
len = 0;
|
|
}
|
|
|
|
buf0 = new Tb[newlen];
|
|
len = newlen;
|
|
buf = (Tb *)(((uintptr_t)buf0 + LANCIR_ALIGN - 1) &
|
|
~(uintptr_t)(LANCIR_ALIGN - 1));
|
|
}
|
|
}
|
|
|
|
/**
 * @brief Typed buffer reallocation function.
 *
 * Grows a typed buffer when its current length is insufficient; no
 * address alignment is applied. If the current capacity already covers
 * the request, the buffer is left untouched. Previous contents are NOT
 * preserved across a reallocation.
 *
 * @param buf Reference to the pointer of the previously allocated buffer;
 * address alignment will not be applied.
 * @param len The current length of the `buf0`.
 * @param newlen A new required length.
 * @tparam Tb Buffer element type.
 * @tparam Tl Length variable type.
 */

template<typename Tb, typename Tl>
static void reallocBuf(Tb *&buf, Tl &len, const Tl newlen)
{
    if (newlen <= len) {
        return; // Existing capacity suffices.
    }

    // `delete[]` of a null pointer is a no-op, so no guard is needed.
    // Reset state first so a throwing `new` leaves the object consistent.
    delete[] buf;
    buf = nullptr;
    len = 0;

    buf = new Tb[newlen];
    len = newlen;
}
|
|
|
|
class CResizeScanline;
|
|
|
|
/**
|
|
* @brief Class for fractional delay filter bank storage and calculation.
|
|
*/
|
|
|
|
class CResizeFilters
|
|
{
|
|
friend class CResizeScanline;
|
|
|
|
public:
|
|
int KernelLen; ///< Resampling filter kernel's length, taps. Available
|
|
///< after the update() function call. Always an even value,
|
|
///< should not be lesser than 4.
|
|
|
|
// Constructor: the filter-pointer table starts empty; `la = 0.0` marks the
// bank as not-yet-configured so the first update() call always rebuilds.
// The per-batch buffer arrays are zeroed so reallocBuf() sees them as
// unallocated.
CResizeFilters()
    : Filters(nullptr), FiltersLen(0), la(0.0)
{
    memset(Bufs0, 0, sizeof(Bufs0));
    memset(Bufs0Len, 0, sizeof(Bufs0Len));
}

// Destructor releases every filter batch buffer and the pointer table.
// `delete[]` on a never-allocated (null) entry is a no-op.
~CResizeFilters()
{
    int i;

    for (i = 0; i < BufCount; i++) {
        delete[] Bufs0[i];
    }

    delete[] Filters;
}
|
|
|
|
/**
 * @brief Function updates the filter bank.
 *
 * Recomputes the filter bank's frequency, length, and layout constants
 * for the given Lanczos parameter and resizing step. If nothing changed
 * since the previous call, the bank is left intact. On change, all
 * previously created fractional filters are discarded (their pointers are
 * cleared) and will be lazily rebuilt by getFilter().
 *
 * @param la0 Lanczos `a` parameter value (greater or equal to 2.0),
 * can be fractional.
 * @param k0 Resizing step.
 * @param ElCount0 Image's element count, may be used for SIMD filter
 * tap replication.
 * @return `true`, if an update occured and scanline resizing
 * positions should be updated unconditionally.
 */

bool update(const double la0, const double k0, const int ElCount0)
{
    // No-op when configuration is unchanged.

    if (la0 == la && k0 == k && ElCount0 == ElCount) {
        return (false);
    }

    // Downsizing (k0 > 1) lowers the filter's cutoff frequency by 1/k0;
    // upsizing keeps the full Nyquist band.

    const double NormFreq = (k0 <= 1.0 ? 1.0 : 1.0 / k0);
    Freq = 3.1415926535897932 * NormFreq;
    FreqA = Freq / la0; // Window function's (slower) oscillation rate.

    Len2 = la0 / NormFreq; // Half kernel length, fractional.
    fl2 = (int)ceil(Len2);
    KernelLen = fl2 + fl2; // Always even.

#if LANCIR_ALIGN > 4

    // SIMD build: taps are replicated per channel, and the kernel stride
    // is rounded up to a whole number of SIMD vectors.

    ElRepl = ElCount0;
    KernelLenA = KernelLen * ElRepl;

    const int elalign =
        (int)(LANCIR_ALIGN / sizeof(float)) - 1;

    KernelLenA = (KernelLenA + elalign) & ~elalign;

#else // LANCIR_ALIGN > 4

    ElRepl = 1;
    KernelLenA = KernelLen;

#endif // LANCIR_ALIGN > 4

    FracCount = 1000; // Enough for Lanczos implicit 8-bit precision.

    // Invalidate the bank before reallocating: `la = 0.0` cannot match any
    // valid caller value, so a partially updated state is never reused.

    la = 0.0;
    reallocBuf(Filters, FiltersLen, FracCount + 1);

    // Clear all cached filter pointers; filters are rebuilt on demand.

    memset(Filters, 0, (size_t)FiltersLen * sizeof(Filters[0]));

    setBuf(0); // Restart filling from the first filter batch buffer.

    la = la0;
    k = k0;
    ElCount = ElCount0;

    return (true);
}
|
|
|
|
/**
 * @brief Filter acquisition function.
 *
 * Function returns filter at the specified fractional offset. This
 * function can only be called after a prior update() function call.
 * Filters are created lazily: the first request for a given quantized
 * fraction computes and caches the filter; later requests return the
 * cached pointer.
 *
 * @param x Fractional offset, [0; 1].
 * @return Pointer to a previously-calculated or a new filter.
 */

const float *getFilter(const double x)
{
    // Quantize the fraction to one of FracCount+1 table slots (rounded).

    const int Frac = (int)(x * FracCount + 0.5);
    float *flt = Filters[Frac];

    if (flt != nullptr) {
        return (flt); // Already computed for this fraction.
    }

    // Claim the next slot in the current batch buffer.
    // NOTE(review): BufCount * BufLen (4 * 256 = 1024) must stay >=
    // FracCount + 1 (1001) for `CurBuf + 1` to remain in range.

    flt = Bufs[CurBuf] + CurBufFill * KernelLenA;
    Filters[Frac] = flt;
    CurBufFill++;

    if (CurBufFill == BufLen) {
        setBuf(CurBuf + 1); // Current batch is full; open the next one.
    }

    // Build the normalized kernel; fractional delay is mirrored
    // (1 - Frac/FracCount) relative to the requested offset.

    makeFilterNorm(flt, 1.0 - (double)Frac / FracCount);

    if (ElRepl > 1) {
        replicateFilter(flt, KernelLen, ElRepl); // SIMD tap replication.
    }

    return (flt);
}
|
|
|
|
protected:
|
|
double Freq; ///< Circular frequency of the filter.
|
|
double FreqA; ///< Circular frequency of the window function.
|
|
double Len2; ///< Half resampling filter's length, unrounded.
|
|
int fl2; ///< Half resampling filter's length, integer.
|
|
int FracCount; ///< The number of fractional positions for which
|
|
///< filters can be created.
|
|
int KernelLenA; ///< SIMD-aligned and replicated filter kernel's
|
|
///< length.
|
|
int ElRepl; ///< The number of repetitions of each filter tap.
|
|
static const int BufCount = 4; ///< The maximal number of buffers
|
|
///< (filter batches) that can be in use.
|
|
static const int BufLen = 256; ///< The number of fractional filters
|
|
///< a single buffer (filter batch) may contain. Both the `BufLen`
|
|
///< and `BufCount` should correspond to the `FracCount` used.
|
|
float *Bufs0[BufCount]; ///< Buffers that hold all filters,
|
|
///< original.
|
|
int Bufs0Len[BufCount]; ///< Allocated lengthes in `Bufs0`, in
|
|
///< `float` elements.
|
|
float *Bufs[BufCount]; ///< Address-aligned `Bufs0`.
|
|
int CurBuf; ///< Filter buffer currently being filled.
|
|
int CurBufFill; ///< The number of fractional positions filled in the
|
|
///< current filter buffer.
|
|
float **Filters; ///< Fractional delay filters for all positions.
|
|
///< A particular pointer equals `nullptr`, if a filter for such
|
|
///< position has not been created yet.
|
|
int FiltersLen; ///< Allocated length of Filters, in elements.
|
|
double la; ///< Current `la`.
|
|
double k; ///< Current `k`.
|
|
int ElCount; ///< Current `ElCount`.
|
|
|
|
/**
 * @brief Current buffer (filter batch) repositioning function.
 *
 * Function changes the buffer currently being filled, checks its size
 * and reallocates it, if necessary, then resets its fill counter.
 * `KernelLenA` must be up to date before the call, since it defines the
 * per-filter stride within the batch.
 *
 * @param bi A new current buffer index (must be below `BufCount`).
 */

void setBuf(const int bi)
{
    reallocBuf(Bufs0[bi], Bufs[bi], Bufs0Len[bi],
        BufLen * KernelLenA);

    CurBuf = bi;
    CurBufFill = 0;
}
|
|
|
|
/**
 * @brief Sine-wave signal generator class.
 *
 * Produces successive samples of sin(ph + n*si) without biasing, using
 * the two-term recurrence s[n] = 2*cos(si)*s[n-1] - s[n-2] instead of
 * calling `sin()` per sample. Initialization happens in the constructor
 * only.
 */

class CSineGen
{
public:
    /**
     * @brief Constructor initializes *this* sine-wave signal
     * generator.
     *
     * @param si Sine function increment, in radians.
     * @param ph Starting phase, in radians. Add `0.5*pi` for a
     * cosine function.
     */

    CSineGen(const double si, const double ph)
        : s1(sin(ph)), s2(sin(ph - si)), coef(2.0 * cos(si))
    {
    }

    /**
     * @brief Generates the next sine-wave sample, without biasing.
     */

    double generate()
    {
        const double out = s1;

        // Two-term oscillator step: next = coef * current - previous.
        s1 = coef * out - s2;
        s2 = out;

        return (out);
    }

private:
    double s1; ///< Current sine value.
    double s2; ///< Previous sine value.
    double coef; ///< Recurrence coefficient, `2*cos(si)`.
};
|
|
|
|
/**
 * @brief Filter calculation function.
 *
 * Function creates a filter for the specified fractional delay. The
 * update() function should be called prior to calling this function.
 * The created filter is normalized (DC gain=1). Taps are evaluated as
 * sinc(t) * window(t) via two oscillator-driven sine generators, avoiding
 * per-tap `sin()` calls.
 *
 * @param[out] op Output filter buffer; receives `KernelLen` taps.
 * @param FracDelay Fractional delay, 0 to 1, inclusive.
 */

void makeFilterNorm(float *op, const double FracDelay) const
{
    // `f` tracks the sinc numerator's sine; `fw` the window function's.
    // Both start at the leftmost tap position `FracDelay - fl2`.

    CSineGen f(Freq, Freq * (FracDelay - fl2));
    CSineGen fw(FreqA, FreqA * (FracDelay - fl2));

    float *op0 = op; // Kept for the final normalization pass.
    double s = 0.0; // Accumulated tap sum (DC gain before normalization).
    double ut;

    int t = -fl2;

    // Leftmost tap may fall outside the filter's true (fractional) length
    // when `fl2` was rounded up; emit a zero but still advance both
    // oscillators to keep the phase in sync.

    if (t + FracDelay < -Len2) {
        f.generate();
        fw.generate();
        *op = 0;
        op++;
        t++;
    }

    // Detect FracDelay at (or within epsilon of) an integer, where the
    // sinc expression hits 0/0 and needs its analytic limit instead.

    int IsZeroX = (fabs(FracDelay - 1.0) < 2.3e-13);
    int mt = 0 - IsZeroX; // Stop one tap earlier if the zero sits at t==-1.
    IsZeroX |= (fabs(FracDelay) < 2.3e-13);

    // Left half: regular sinc*window taps up to the special tap.

    while (t < mt) {
        ut = t + FracDelay;
        *op = (float)(f.generate() * fw.generate() / (ut * ut));
        s += *op;
        op++;
        t++;
    }

    if (IsZeroX) // t+FracDelay==0
    {
        // Analytic limit of the sinc*window product at its center.
        *op = (float)(Freq * FreqA);
        s += *op;
        f.generate(); // Keep oscillators advancing past the special tap.
        fw.generate();
    } else {
        ut = FracDelay; // t==0
        *op = (float)(f.generate() * fw.generate() / (ut * ut));
        s += *op;
    }

    // Right half: regular taps up to the second-to-last position.

    mt = fl2 - 2;

    while (t < mt) {
        op++;
        t++;
        ut = t + FracDelay;
        *op = (float)(f.generate() * fw.generate() / (ut * ut));
        s += *op;
    }

    // Rightmost tap: zero if it falls beyond the filter's true length.

    op++;
    ut = t + 1 + FracDelay;

    if (ut > Len2) {
        *op = 0;
    } else {
        *op = (float)(f.generate() * fw.generate() / (ut * ut));
        s += *op;
    }

    // Normalization pass: scale all taps so they sum to 1 (DC gain = 1).

    s = 1.0 / s;
    t = (int)(op - op0 + 1); // Total tap count actually written.

    while (t != 0) {
        *op0 = (float)(*op0 * s);
        op0++;
        t--;
    }
}
|
|
|
|
/**
 * @brief Filter tap replication function, for SIMD operations.
 *
 * Function replicates taps of the specified filter so that it can
 * be used with SIMD loading instructions. This function works
 * "in-place": the first `kl` elements are expanded so that tap `i`
 * occupies elements `[i*erp; i*erp+erp)`.
 *
 * @param[in,out] p Filter buffer pointer, should be sized to contain
 * `kl * erp` elements.
 * @param kl Filter kernel's length, in taps.
 * @param erp The number of repetitions to apply (2, 3, or 4).
 */
static void replicateFilter(float *const p, const int kl,
        const int erp)
{
    // Walk backwards so expanded groups never overwrite taps that
    // are still pending replication.
    const float *src = p + kl - 1;
    float *dst = p + (kl - 1) * erp;

    for (int rem = kl; rem > 0; rem--) {
        const float tap = *src;

        for (int j = 0; j < erp; j++) {
            dst[j] = tap;
        }

        src--;
        dst -= erp;
    }
}
|
|
};
|
|
|
|
/**
 * @brief Structure defines source scanline positions and filters for each
 * destination pixel.
 *
 * One entry exists per destination pixel; filled by
 * CResizeScanline::update().
 */

struct CResizePos {
    const float *flt; ///< Fractional delay filter.
    intptr_t spo; ///< Source scanline's pixel offset, in bytes, or
                  ///< a direct pointer to scanline buffer.
    intptr_t so; ///< Offset within the source scanline, in pixels.
};
|
|
|
|
/**
 * @brief Scanline resizing positions class.
 *
 * Class contains resizing positions, and prepares source scanline
 * positions for resize filtering. The public variables become available
 * after the update() function call.
 */

class CResizeScanline
{
public:
    int padl; ///< Left-padding (in pixels) required for source scanline.
    int padr; ///< Right-padding (in pixels) required for source scanline.
    CResizePos *pos; ///< Source scanline positions (offsets) and filters
                     ///< for each destination pixel position.

    CResizeScanline()
        : pos(nullptr), poslen(0), SrcLen(0)
    {
    }

    ~CResizeScanline()
    {
        delete[] pos;
    }

    /**
     * @brief Object's reset function.
     *
     * Function "resets" *this* object so that the next update() call
     * fully updates the position buffer. Reset is necessary if the
     * corresponding CResizeFilters object was updated.
     */

    void reset()
    {
        SrcLen = 0;
    }

    /**
     * @brief Scanline positions update function.
     *
     * Function updates resizing positions, updates `padl`, `padr`, and
     * `pos` buffer. If source/destination lengths and offset are
     * unchanged since the previous call, returns immediately (use
     * updateSPO() if only `sp` changed).
     *
     * @param SrcLen0 Source image scanline length, used to create a
     * scanline buffer without length pre-calculation.
     * @param DstLen0 Destination image scanline length.
     * @param o0 Initial source image offset.
     * @param rf Resizing filters object.
     * @param sp A pointer to scanline buffer, to use for absolute
     * scanline positioning, can be `nullptr`.
     */

    void update(const int SrcLen0, const int DstLen0, const double o0,
            CResizeFilters &rf, float *const sp = nullptr)
    {
        // NOTE(review): `DstLen` and `o` are not set by the
        // constructor; this comparison assumes SrcLen==0 initially
        // short-circuits before they are read — confirm SrcLen0 is
        // never 0 on the first call.
        if (SrcLen0 == SrcLen && DstLen0 == DstLen && o0 == o) {
            return;
        }

        const int fl2m1 = rf.fl2 - 1;
        padl = fl2m1 - (int)floor(o0);

        if (padl < 0) {
            padl = 0;
        }

        // Make sure `padr` and `pos` are in sync: calculate ending `pos`
        // offset in advance.

        const double k = rf.k;

        const int DstLen_m1 = DstLen0 - 1;
        const double oe = o0 + k * DstLen_m1;
        const int ie = (int)floor(oe);

        padr = ie + rf.fl2 + 1 - SrcLen0;

        if (padr < 0) {
            padr = 0;
        }

        // Invalidate cached state until the buffer is fully rebuilt.
        SrcLen = 0;
        reallocBuf(pos, poslen, DstLen0);

        const intptr_t ElCountF = rf.ElCount * (intptr_t)sizeof(float);
        const int so = padl - fl2m1;
        CResizePos *rp = pos;
        intptr_t rpso;
        int i;

        for (i = 0; i < DstLen_m1; i++) {
            const double ox = o0 + k * i;
            const int ix = (int)floor(ox);

            // Filter is selected by the fractional part of the
            // source position; `so + ix` is the pixel offset into the
            // padded scanline.
            rp->flt = rf.getFilter(ox - ix);
            rpso = so + ix;
            rp->spo = (intptr_t)sp + rpso * ElCountF;
            rp->so = rpso;
            rp++;
        }

        // Last destination pixel uses the pre-computed ending offset,
        // keeping it consistent with the `padr` calculation above.
        rp->flt = rf.getFilter(oe - ie);
        rpso = so + ie;
        rp->spo = (intptr_t)sp + rpso * ElCountF;
        rp->so = rpso;

        SrcLen = SrcLen0;
        DstLen = DstLen0;
        o = o0;
    }

    /**
     * @brief Scanline pixel offsets update function.
     *
     * Function updates `pos` buffer's `spo` (scanline pixel offset)
     * values. Use when only the scanline buffer pointer changed since
     * the last update() call.
     *
     * @param rf Resizing filters object.
     * @param sp A pointer to scanline buffer, to use for absolute
     * scanline positioning, can be `nullptr`.
     */

    void updateSPO(CResizeFilters &rf, float *const sp)
    {
        const intptr_t ElCountF = rf.ElCount * (intptr_t)sizeof(float);
        CResizePos *const rp = pos;
        int i;

        for (i = 0; i < DstLen; i++) {
            rp[i].spo = (intptr_t)sp + rp[i].so * ElCountF;
        }
    }

protected:
    int poslen; ///< Allocated `pos` buffer's length.
    int SrcLen; ///< Current `SrcLen`.
    int DstLen; ///< Current `DstLen`.
    double o; ///< Current `o`.
};
|
|
|
|
CResizeFilters rfv; ///< Resizing filters for vertical resizing.
CResizeFilters rfh0; ///< Resizing filters for horizontal resizing (may
                     ///< not be in use).
CResizeScanline rsv; ///< Vertical resize scanline positions.
CResizeScanline rsh; ///< Horizontal resize scanline positions.
|
|
|
|
/**
|
|
* @{
|
|
* @brief Scanline copying function, for vertical resizing.
|
|
*
|
|
* Function copies scanline (fully or partially) from the source buffer,
|
|
* in its native format, to the internal scanline buffer, in preparation
|
|
* for vertical resizing. Variants for 1-4-channel images.
|
|
*
|
|
* @param ip Source scanline buffer pointer.
|
|
* @param ipinc `ip` increment per pixel.
|
|
* @param op Output scanline pointer.
|
|
* @param cc Source pixel copy count.
|
|
* @param repl Leftmost pixel's replication count.
|
|
* @param repr Rightmost pixel's replication count.
|
|
* @tparam T Source buffer's element type.
|
|
*/
|
|
|
|
// 1-channel scanline copy: converts source samples to float, with
// edge-pixel replication on both sides (see the group doc above).
template<typename T>
static void copyScanline1v(const T *ip, const size_t ipinc, float *op,
        int cc, int repl, int repr)
{
    if (repl > 0) {
        const float lv = (float)ip[0];

        for (int i = 0; i < repl; i++) {
            *op++ = lv;
        }
    }

    for (int i = 0; i < cc; i++) {
        *op++ = (float)ip[0];
        ip += ipinc;
    }

    if (repr > 0) {
        const float rv = (float)(ip - ipinc)[0];

        for (int i = 0; i < repr; i++) {
            *op++ = rv;
        }
    }
}
|
|
|
|
// 2-channel scanline copy: converts source samples to float, with
// edge-pixel replication on both sides (see the group doc above).
template<typename T>
static void copyScanline2v(const T *ip, const size_t ipinc, float *op,
        int cc, int repl, int repr)
{
    if (repl > 0) {
        const float lv0 = (float)ip[0];
        const float lv1 = (float)ip[1];

        for (int i = 0; i < repl; i++) {
            op[0] = lv0;
            op[1] = lv1;
            op += 2;
        }
    }

    for (int i = 0; i < cc; i++) {
        op[0] = (float)ip[0];
        op[1] = (float)ip[1];
        ip += ipinc;
        op += 2;
    }

    if (repr > 0) {
        const T *const lp = ip - ipinc;
        const float rv0 = (float)lp[0];
        const float rv1 = (float)lp[1];

        for (int i = 0; i < repr; i++) {
            op[0] = rv0;
            op[1] = rv1;
            op += 2;
        }
    }
}
|
|
|
|
// 3-channel scanline copy: converts source samples to float, with
// edge-pixel replication on both sides (see the group doc above).
template<typename T>
static void copyScanline3v(const T *ip, const size_t ipinc, float *op,
        int cc, int repl, int repr)
{
    if (repl > 0) {
        const float lv0 = (float)ip[0];
        const float lv1 = (float)ip[1];
        const float lv2 = (float)ip[2];

        for (int i = 0; i < repl; i++) {
            op[0] = lv0;
            op[1] = lv1;
            op[2] = lv2;
            op += 3;
        }
    }

    for (int i = 0; i < cc; i++) {
        op[0] = (float)ip[0];
        op[1] = (float)ip[1];
        op[2] = (float)ip[2];
        ip += ipinc;
        op += 3;
    }

    if (repr > 0) {
        const T *const lp = ip - ipinc;
        const float rv0 = (float)lp[0];
        const float rv1 = (float)lp[1];
        const float rv2 = (float)lp[2];

        for (int i = 0; i < repr; i++) {
            op[0] = rv0;
            op[1] = rv1;
            op[2] = rv2;
            op += 3;
        }
    }
}
|
|
|
|
// 4-channel scanline copy: converts source samples to float, with
// edge-pixel replication on both sides (see the group doc above).
template<typename T>
static void copyScanline4v(const T *ip, const size_t ipinc, float *op,
        int cc, int repl, int repr)
{
    if (repl > 0) {
        const float lv0 = (float)ip[0];
        const float lv1 = (float)ip[1];
        const float lv2 = (float)ip[2];
        const float lv3 = (float)ip[3];

        for (int i = 0; i < repl; i++) {
            op[0] = lv0;
            op[1] = lv1;
            op[2] = lv2;
            op[3] = lv3;
            op += 4;
        }
    }

    for (int i = 0; i < cc; i++) {
        op[0] = (float)ip[0];
        op[1] = (float)ip[1];
        op[2] = (float)ip[2];
        op[3] = (float)ip[3];
        ip += ipinc;
        op += 4;
    }

    if (repr > 0) {
        const T *const lp = ip - ipinc;
        const float rv0 = (float)lp[0];
        const float rv1 = (float)lp[1];
        const float rv2 = (float)lp[2];
        const float rv3 = (float)lp[3];

        for (int i = 0; i < repr; i++) {
            op[0] = rv0;
            op[1] = rv1;
            op[2] = rv2;
            op[3] = rv3;
            op += 4;
        }
    }
}
|
|
|
|
/** @} */
|
|
|
|
/**
|
|
* @{
|
|
* @brief Scanline padding function, for horizontal resizing.
|
|
*
|
|
* Function pads the specified scanline buffer to the left and right by
|
|
* replicating its first and last available pixels, in preparation for
|
|
* horizontal resizing. Variants for 1-4-channel images.
|
|
*
|
|
* @param[in,out] op Scanline buffer to pad.
|
|
* @param rs Scanline resizing positions object.
|
|
* @param l Source scanline's length, in pixels.
|
|
*/
|
|
|
|
static void padScanline1h(float *op, CResizeScanline &rs, const int l)
|
|
{
|
|
const float *ip = op + rs.padl;
|
|
|
|
float v0 = ip[0];
|
|
int i;
|
|
|
|
for (i = 0; i < rs.padl; i++) {
|
|
op[i] = v0;
|
|
}
|
|
|
|
ip += l;
|
|
op += rs.padl + l;
|
|
|
|
v0 = ip[-1];
|
|
|
|
for (i = 0; i < rs.padr; i++) {
|
|
op[i] = v0;
|
|
}
|
|
}
|
|
|
|
static void padScanline2h(float *op, CResizeScanline &rs, const int l)
|
|
{
|
|
const float *ip = op + rs.padl * 2;
|
|
|
|
float v0 = ip[0];
|
|
float v1 = ip[1];
|
|
int i;
|
|
|
|
for (i = 0; i < rs.padl; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op += 2;
|
|
}
|
|
|
|
const int lc = l * 2;
|
|
ip += lc;
|
|
op += lc;
|
|
|
|
v0 = ip[-2];
|
|
v1 = ip[-1];
|
|
|
|
for (i = 0; i < rs.padr; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op += 2;
|
|
}
|
|
}
|
|
|
|
static void padScanline3h(float *op, CResizeScanline &rs, const int l)
|
|
{
|
|
const float *ip = op + rs.padl * 3;
|
|
|
|
float v0 = ip[0];
|
|
float v1 = ip[1];
|
|
float v2 = ip[2];
|
|
int i;
|
|
|
|
for (i = 0; i < rs.padl; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op[2] = v2;
|
|
op += 3;
|
|
}
|
|
|
|
const int lc = l * 3;
|
|
ip += lc;
|
|
op += lc;
|
|
|
|
v0 = ip[-3];
|
|
v1 = ip[-2];
|
|
v2 = ip[-1];
|
|
|
|
for (i = 0; i < rs.padr; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op[2] = v2;
|
|
op += 3;
|
|
}
|
|
}
|
|
|
|
static void padScanline4h(float *op, CResizeScanline &rs, const int l)
|
|
{
|
|
const float *ip = op + rs.padl * 4;
|
|
|
|
float v0 = ip[0];
|
|
float v1 = ip[1];
|
|
float v2 = ip[2];
|
|
float v3 = ip[3];
|
|
int i;
|
|
|
|
for (i = 0; i < rs.padl; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op[2] = v2;
|
|
op[3] = v3;
|
|
op += 4;
|
|
}
|
|
|
|
const int lc = l * 4;
|
|
ip += lc;
|
|
op += lc;
|
|
|
|
v0 = ip[-4];
|
|
v1 = ip[-3];
|
|
v2 = ip[-2];
|
|
v3 = ip[-1];
|
|
|
|
for (i = 0; i < rs.padr; i++) {
|
|
op[0] = v0;
|
|
op[1] = v1;
|
|
op[2] = v2;
|
|
op[3] = v3;
|
|
op += 4;
|
|
}
|
|
}
|
|
|
|
/** @} */
|
|
|
|
/**
 * @brief Rounds a value, and applies clamping.
 *
 * @param v Value to round and clamp.
 * @param Clamp High clamp level; low level is 0.
 * @return Rounded and clamped value.
 */

static inline int roundclamp(const float v, const float Clamp)
{
    float cv = v;

    if (cv > Clamp) {
        cv = Clamp;
    } else if (cv < 0.0f) {
        cv = 0.0f;
    }

    // Round-half-up via +0.5 and truncation (cv is non-negative here).
    return ((int)(cv + 0.5f));
}
|
|
|
|
/**
 * @brief Scanline output function.
 *
 * Function performs output of the scanline pixels to the destination
 * image buffer, with type conversion, scaling, clamping, if necessary.
 *
 * @param[in] ip Input (resized) scanline. Pointer must be aligned to
 * LANCIR_ALIGN bytes.
 * @param[out] op Output image buffer. Must be different to `ip`.
 * @param l Output scanline's length, in elements (not pixel count).
 * @param Clamp Clamp high level, used if `IsOutFloat` is `false`.
 * @param OutMul Output multiplier, for value range conversion, applied
 * before clamping.
 * @tparam IsOutFloat `true`, if floating-point output, and no clamping is
 * necessary.
 * @tparam IsUnityMul `true`, if multiplication is optional. However, even
 * if this parameter was specified as `true`, `OutMul` must be 1.
 * @tparam T Output buffer's element type. Acquired implicitly.
 */

template<bool IsOutFloat, bool IsUnityMul, typename T>
static void outputScanline(const float *ip, T *op, int l,
        const float Clamp, const float OutMul)
{
    if (IsOutFloat) {
        if (IsUnityMul) {
            if (sizeof(op[0]) == sizeof(ip[0])) {
                // Same element size (float output): plain copy.
                memcpy(op, ip, (size_t)l * sizeof(op[0]));
            } else {
                // Element-wise conversion (e.g. to double),
                // unrolled by 4.
                int l4 = l >> 2;
                l &= 3;

                while (l4 != 0) {
                    op[0] = (T)ip[0];
                    op[1] = (T)ip[1];
                    op[2] = (T)ip[2];
                    op[3] = (T)ip[3];
                    ip += 4;
                    op += 4;
                    l4--;
                }

                while (l != 0) {
                    *op = (T)*ip;
                    ip++;
                    op++;
                    l--;
                }
            }
        } else {
            int l4 = l >> 2;
            l &= 3;
            bool DoScalar = true;

            if (sizeof(op[0]) == sizeof(ip[0])) {
#if LANCIR_ALIGN > 4

                // SIMD multiply path, only valid for float output.
                DoScalar = false;
                const lancvec_t om = lancvec_load32_splat(&OutMul);

                while (l4 != 0) {
                    lancvec_storeu((float *)op,
                            lancvec_mul(lancvec_load(ip), om));

                    ip += 4;
                    op += 4;
                    l4--;
                }

#endif // LANCIR_ALIGN > 4
            }

            if (DoScalar) {
                while (l4 != 0) {
                    op[0] = (T)(ip[0] * OutMul);
                    op[1] = (T)(ip[1] * OutMul);
                    op[2] = (T)(ip[2] * OutMul);
                    op[3] = (T)(ip[3] * OutMul);
                    ip += 4;
                    op += 4;
                    l4--;
                }
            }

            while (l != 0) {
                *op = (T)(*ip * OutMul);
                ip++;
                op++;
                l--;
            }
        }
    } else {
        // Integer output: multiply (optional), clamp to [0; Clamp],
        // round, and narrow to the destination element width.
        int l4 = l >> 2;
        l &= 3;

#if LANCIR_ALIGN > 4

        const lancvec_t minv = lancvec_const_splat(0.0f);
        const lancvec_t maxv = lancvec_load32_splat(&Clamp);
        const lancvec_t om = lancvec_load32_splat(&OutMul);

#if defined(LANCIR_SSE2)
        // SSE2 rounds via cvtps with round-to-nearest mode; save and
        // restore the caller's rounding mode. NOTE(review): NEON/WASM
        // paths below round via +0.5 and truncation — half-way cases
        // may differ from SSE2's round-to-nearest-even.
        unsigned int prevrm = _MM_GET_ROUNDING_MODE();
        _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#else // defined( LANCIR_SSE2 )
        const lancvec_t v05 = lancvec_const_splat(0.5f);
#endif // defined( LANCIR_SSE2 )

        if (sizeof(op[0]) == 4) {
            // 32-bit integer output.
            while (l4 != 0) {
                const lancvec_t v = lancvec_load(ip);
                const lancvec_t cv = lancvec_max(lancvec_min(
                        (IsUnityMul ? v : lancvec_mul(v, om)),
                        maxv),
                        minv);

#if defined(LANCIR_SSE2)

                _mm_storeu_si128((__m128i *)op, _mm_cvtps_epi32(cv));

#elif defined(LANCIR_NEON)

                vst1q_u32((unsigned int *)op, vcvtq_u32_f32(vaddq_f32(cv, v05)));

#elif defined(LANCIR_WASM)

                wasm_v128_store(op, wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_add(cv, v05)));

#endif // defined( LANCIR_WASM )

                ip += 4;
                op += 4;
                l4--;
            }
        } else if (sizeof(op[0]) == 2) {
            // 16-bit integer output: narrow 32-bit lanes to 16 bits.
            while (l4 != 0) {
                const lancvec_t v = lancvec_load(ip);
                const lancvec_t cv = lancvec_max(lancvec_min(
                        (IsUnityMul ? v : lancvec_mul(v, om)),
                        maxv),
                        minv);

#if defined(LANCIR_SSE2)

                const __m128i v32 = _mm_cvtps_epi32(cv);
                const __m128i v16s = _mm_shufflehi_epi16(
                        _mm_shufflelo_epi16(v32, 0 | 2 << 2), 0 | 2 << 2);

                const __m128i v16 = _mm_shuffle_epi32(v16s, 0 | 2 << 2);

                // memcpy avoids an unaligned/aliasing-unsafe store.
                __m128i tmp;
                _mm_store_si128(&tmp, v16);
                memcpy(op, &tmp, 8);

#elif defined(LANCIR_NEON)

                const uint32x4_t v32 = vcvtq_u32_f32(
                        vaddq_f32(cv, v05));

                const uint16x4_t v16 = vmovn_u32(v32);

                vst1_u16((unsigned short *)op, v16);

#elif defined(LANCIR_WASM)

                const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                        wasm_f32x4_add(cv, v05));

                wasm_v128_store64_lane(op,
                        wasm_u16x8_narrow_i32x4(v32, v32), 0);

#endif // defined( LANCIR_WASM )

                ip += 4;
                op += 4;
                l4--;
            }
        } else {
            // 8-bit integer output: narrow to 16, then to 8 bits.
            while (l4 != 0) {
                const lancvec_t v = lancvec_load(ip);
                const lancvec_t cv = lancvec_max(lancvec_min(
                        (IsUnityMul ? v : lancvec_mul(v, om)),
                        maxv),
                        minv);

#if defined(LANCIR_SSE2)

                const __m128i v32 = _mm_cvtps_epi32(cv);
                const __m128i v16s = _mm_shufflehi_epi16(
                        _mm_shufflelo_epi16(v32, 0 | 2 << 2), 0 | 2 << 2);

                const __m128i v16 = _mm_shuffle_epi32(v16s, 0 | 2 << 2);
                const __m128i v8 = _mm_packus_epi16(v16, v16);

                // NOTE(review): `*(int *)op` assumes 4-byte-storable
                // destination and tolerable aliasing — confirm `op`
                // always has at least 4 writable bytes here.
                *(int *)op = _mm_cvtsi128_si32(v8);

#elif defined(LANCIR_NEON)

                const uint32x4_t v32 = vcvtq_u32_f32(
                        vaddq_f32(cv, v05));

                const uint16x4_t v16 = vmovn_u32(v32);
                const uint8x8_t v8 = vmovn_u16(vcombine_u16(v16, v16));

                *(unsigned int *)op = vget_lane_u32((uint32x2_t)v8, 0);

#elif defined(LANCIR_WASM)

                const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                        wasm_f32x4_add(cv, v05));

                const v128_t v16 = wasm_u16x8_narrow_i32x4(v32, v32);

                wasm_v128_store32_lane(op,
                        wasm_u8x16_narrow_i16x8(v16, v16), 0);

#endif // defined( LANCIR_WASM )

                ip += 4;
                op += 4;
                l4--;
            }
        }

#if defined(LANCIR_SSE2)
        _MM_SET_ROUNDING_MODE(prevrm);
#endif // defined( LANCIR_SSE2 )

#else // LANCIR_ALIGN > 4

        // Scalar fallback, unrolled by 4.
        if (IsUnityMul) {
            while (l4 != 0) {
                op[0] = (T)roundclamp(ip[0], Clamp);
                op[1] = (T)roundclamp(ip[1], Clamp);
                op[2] = (T)roundclamp(ip[2], Clamp);
                op[3] = (T)roundclamp(ip[3], Clamp);
                ip += 4;
                op += 4;
                l4--;
            }
        } else {
            while (l4 != 0) {
                op[0] = (T)roundclamp(ip[0] * OutMul, Clamp);
                op[1] = (T)roundclamp(ip[1] * OutMul, Clamp);
                op[2] = (T)roundclamp(ip[2] * OutMul, Clamp);
                op[3] = (T)roundclamp(ip[3] * OutMul, Clamp);
                ip += 4;
                op += 4;
                l4--;
            }
        }

#endif // LANCIR_ALIGN > 4

        // Scalar tail (0-3 remaining elements).
        if (IsUnityMul) {
            while (l != 0) {
                *op = (T)roundclamp(*ip, Clamp);
                ip++;
                op++;
                l--;
            }
        } else {
            while (l != 0) {
                *op = (T)roundclamp(*ip * OutMul, Clamp);
                ip++;
                op++;
                l--;
            }
        }
    }
}
|
|
|
|
/**
 * @def LANCIR_LF_PRE
 * @brief Scanline resize function prologue.
 *
 * Opens the per-destination-pixel loop of the resize*() functions:
 * obtains the filter pointer (`flt`) and the source pixel pointer
 * (`ip`) for the current CResizePos entry. When `UseSP` is `true`,
 * `spo` is a byte offset added to `sp`; otherwise `spo` already holds
 * an absolute scanline pointer.
 */

#define LANCIR_LF_PRE \
	const CResizePos *const rpe = rp + DstLen; \
	while (rp != rpe) { \
		const float *flt = rp->flt; \
		const float *ip; \
		if (UseSP) { \
			ip = (const float *)((intptr_t)sp + rp->spo); \
		} else { \
			ip = (const float *)rp->spo; \
		}

/**
 * @def LANCIR_LF_POST
 * @brief Scanline resize function epilogue.
 *
 * Closes the loop opened by LANCIR_LF_PRE: advances the output pointer
 * by `opinc` elements and moves to the next position entry.
 */

#define LANCIR_LF_POST \
	op += opinc; \
	rp++; \
	}
|
|
|
|
/**
|
|
* @{
|
|
* @brief Function performs scanline resizing. Variants for 1-4-channel
|
|
* images.
|
|
*
|
|
* @param[in] sp Source scanline buffer.
|
|
* @param[out] op Destination buffer.
|
|
* @param opinc `op` increment.
|
|
* @param rp Source scanline offsets and resizing filters.
|
|
* @param kl Filter kernel's length, in taps (always an even value).
|
|
* @param DstLen Destination length, in pixels.
|
|
* @tparam UseSP `true`, if `sp` pointer should be added to `spo`.
|
|
*/
|
|
|
|
// 1-channel resize kernel. `ci` is the number of 4-tap groups; `kl` is
// always even, so the remainder (`kl & 3`) is either 0 or 2 and the
// two top-level branches handle these cases separately.
template<bool UseSP>
static void resize1(const float *const sp, float *op, const size_t opinc,
        const CResizePos *rp, const int kl, const int DstLen)
{
    const int ci = kl >> 2;

    if ((kl & 3) == 0) {
        LANCIR_LF_PRE

        int c = ci;

#if LANCIR_ALIGN > 4

        // SIMD dot product over groups of 4 taps, then a horizontal
        // add to produce the single output sample.
        lancvec_t sum = lancvec_mul(
                lancvec_load(flt), lancvec_loadu(ip));

        while (--c != 0) {
            flt += 4;
            ip += 4;
            sum = lancvec_madd(sum, lancvec_load(flt),
                    lancvec_loadu(ip));
        }

        lancvec_store32_hadd(op, sum);

#else // LANCIR_ALIGN > 4

        // Scalar fallback with 4 independent accumulators.
        float sum0 = flt[0] * ip[0];
        float sum1 = flt[1] * ip[1];
        float sum2 = flt[2] * ip[2];
        float sum3 = flt[3] * ip[3];

        while (--c != 0) {
            flt += 4;
            ip += 4;
            sum0 += flt[0] * ip[0];
            sum1 += flt[1] * ip[1];
            sum2 += flt[2] * ip[2];
            sum3 += flt[3] * ip[3];
        }

        op[0] = (sum0 + sum1) + (sum2 + sum3);

#endif // LANCIR_ALIGN > 4

        LANCIR_LF_POST
    } else {
        // kl % 4 == 2: same as above, plus 2 trailing taps.
        LANCIR_LF_PRE

        int c = ci;

#if LANCIR_ALIGN > 4

        lancvec_t sum = lancvec_mul(lancvec_load(flt),
                lancvec_loadu(ip));

        while (--c != 0) {
            flt += 4;
            ip += 4;
            sum = lancvec_madd(sum, lancvec_load(flt),
                    lancvec_loadu(ip));
        }

#if defined(LANCIR_NEON)

        // Fold 4 lanes to 2, fuse in the 2 trailing taps, then
        // reduce to a scalar.
        float32x2_t sum2 = vadd_f32(vget_high_f32(sum),
                vget_low_f32(sum));

        sum2 = vmla_f32(sum2, vld1_f32(flt + 4),
                vld1_f32(ip + 4));

#if defined(LANCIR_ARM32)
        op[0] = vget_lane_f32(sum2, 0) +
                vget_lane_f32(sum2, 1);
#else // defined( LANCIR_ARM32 )
        op[0] = vaddv_f32(sum2);
#endif // defined( LANCIR_ARM32 )

#else // defined( LANCIR_NEON )

        // The trailing 2 taps land in the upper half of an unaligned
        // 4-wide load at offset +2; addhl folds them in.
        const lancvec_t sum2 = lancvec_mul(lancvec_loadu(flt + 2),
                lancvec_loadu(ip + 2));

        sum = lancvec_addhl(sum, sum);
        sum = lancvec_addhl(sum, sum2);

        lancvec_store32_addhl(op, sum);

#endif // defined( LANCIR_NEON )

#else // LANCIR_ALIGN > 4

        float sum0 = flt[0] * ip[0];
        float sum1 = flt[1] * ip[1];
        float sum2 = flt[2] * ip[2];
        float sum3 = flt[3] * ip[3];

        while (--c != 0) {
            flt += 4;
            ip += 4;
            sum0 += flt[0] * ip[0];
            sum1 += flt[1] * ip[1];
            sum2 += flt[2] * ip[2];
            sum3 += flt[3] * ip[3];
        }

        // Trailing 2 taps added directly.
        op[0] = (sum0 + sum1) + (sum2 + sum3) +
                flt[4] * ip[4] + flt[5] * ip[5];

#endif // LANCIR_ALIGN > 4

        LANCIR_LF_POST
    }
}
|
|
|
|
// 2-channel resize kernel. The filter is tap-replicated (2 entries per
// tap), so 4 filter floats cover 2 taps / 2 interleaved pixels. `cir`
// (0 or 2) is the remaining tap count after the 4-float groups.
template<bool UseSP>
static void resize2(const float *const sp, float *op, const size_t opinc,
        const CResizePos *rp, const int kl, const int DstLen)
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 2;
    const int cir = kl & 3;
#else // LANCIR_ALIGN > 4
    const int ci = kl >> 1;
#endif // LANCIR_ALIGN > 4

    LANCIR_LF_PRE

    int c = ci;

#if defined(LANCIR_AVX)

    // 8-wide accumulation; halves are combined at the end.
    __m256 sum = _mm256_mul_ps(_mm256_load_ps(flt),
            _mm256_loadu_ps(ip));

    while (--c != 0) {
        flt += 8;
        ip += 8;
        sum = _mm256_add_ps(sum, _mm256_mul_ps(_mm256_load_ps(flt), _mm256_loadu_ps(ip)));
    }

    __m128 res = _mm_add_ps(_mm256_extractf128_ps(sum, 0),
            _mm256_extractf128_ps(sum, 1));

    if (cir == 2) {
        // Two remaining taps (4 replicated filter floats).
        res = _mm_add_ps(res, _mm_mul_ps(_mm_load_ps(flt + 8), _mm_loadu_ps(ip + 8)));
    }

    // Fold upper/lower pairs and store the 2 output channels.
    _mm_storel_pi((__m64 *)op,
            _mm_add_ps(res, _mm_movehl_ps(res, res)));

#elif LANCIR_ALIGN > 4

    lancvec_t sumA = lancvec_mul(
            lancvec_load(flt), lancvec_loadu(ip));

    lancvec_t sumB = lancvec_mul(
            lancvec_load(flt + 4), lancvec_loadu(ip + 4));

    while (--c != 0) {
        flt += 8;
        ip += 8;
        sumA = lancvec_madd(sumA, lancvec_load(flt),
                lancvec_loadu(ip));

        sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                lancvec_loadu(ip + 4));
    }

    sumA = lancvec_add(sumA, sumB);

    if (cir == 2) {
        sumA = lancvec_madd(sumA, lancvec_load(flt + 8),
                lancvec_loadu(ip + 8));
    }

    lancvec_store64_addhl(op, sumA);

#else // LANCIR_ALIGN > 4

    // Scalar fallback: 2 taps per iteration, one accumulator per
    // (tap, channel) pair.
    const float xx = flt[0];
    const float xx2 = flt[1];
    float sum0 = xx * ip[0];
    float sum1 = xx * ip[1];
    float sum2 = xx2 * ip[2];
    float sum3 = xx2 * ip[3];

    while (--c != 0) {
        flt += 2;
        ip += 4;
        const float xx = flt[0];
        const float xx2 = flt[1];
        sum0 += xx * ip[0];
        sum1 += xx * ip[1];
        sum2 += xx2 * ip[2];
        sum3 += xx2 * ip[3];
    }

    op[0] = sum0 + sum2;
    op[1] = sum1 + sum3;

#endif // LANCIR_ALIGN > 4

    LANCIR_LF_POST
}
|
|
|
|
// 3-channel resize kernel. Replicated filter entries cover 12 floats
// per 4 taps; partial sums are staged through `res[12]` and recombined
// per channel with a 3-element stride.
template<bool UseSP>
static void resize3(const float *const sp, float *op, const size_t opinc,
        const CResizePos *rp, const int kl, const int DstLen)
{
#if LANCIR_ALIGN > 4

    const int ci = kl >> 2;
    const int cir = kl & 3;

    LANCIR_LF_PRE

    float res[12];
    int c = ci;

#if defined(LANCIR_AVX)

    __m128 sumA = _mm_mul_ps(_mm_load_ps(flt), _mm_loadu_ps(ip));
    __m256 sumB = _mm256_mul_ps(_mm256_loadu_ps(flt + 4),
            _mm256_loadu_ps(ip + 4));

    while (--c != 0) {
        flt += 12;
        ip += 12;
        sumA = _mm_add_ps(sumA, _mm_mul_ps(_mm_load_ps(flt), _mm_loadu_ps(ip)));

        sumB = _mm256_add_ps(sumB, _mm256_mul_ps(_mm256_loadu_ps(flt + 4), _mm256_loadu_ps(ip + 4)));
    }

    if (cir == 2) {
        // First 4 of the 6 remaining replicated entries.
        sumA = _mm_add_ps(sumA, _mm_mul_ps(_mm_load_ps(flt + 12), _mm_loadu_ps(ip + 12)));
    }

    _mm_storeu_ps(res, sumA);

    float o0 = res[0] + res[3];
    float o1 = res[1];
    float o2 = res[2];

    _mm256_storeu_ps(res + 4, sumB);

    o1 += res[4];
    o2 += res[5];

#else // defined( LANCIR_AVX )

    lancvec_t sumA = lancvec_mul(lancvec_load(flt),
            lancvec_loadu(ip));

    lancvec_t sumB = lancvec_mul(lancvec_load(flt + 4),
            lancvec_loadu(ip + 4));

    lancvec_t sumC = lancvec_mul(lancvec_load(flt + 8),
            lancvec_loadu(ip + 8));

    while (--c != 0) {
        flt += 12;
        ip += 12;
        sumA = lancvec_madd(sumA, lancvec_load(flt),
                lancvec_loadu(ip));

        sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                lancvec_loadu(ip + 4));

        sumC = lancvec_madd(sumC, lancvec_load(flt + 8),
                lancvec_loadu(ip + 8));
    }

    if (cir == 2) {
        sumA = lancvec_madd(sumA, lancvec_load(flt + 12),
                lancvec_loadu(ip + 12));
    }

    lancvec_storeu(res, sumA);
    lancvec_storeu(res + 4, sumB);

    float o0 = res[0] + res[3];
    float o1 = res[1] + res[4];
    float o2 = res[2] + res[5];

    lancvec_storeu(res + 8, sumC);

#endif // defined( LANCIR_AVX )

    // Gather the remaining lanes into the 3 channel outputs
    // (stride-3 reduction across the staged 12 partial sums).
    o0 += res[6] + res[9];
    o1 += res[7] + res[10];
    o2 += res[8] + res[11];

    if (cir == 2) {
        // Last 2 of the 6 remaining replicated entries, done in
        // scalar form.
        o1 += flt[16] * ip[16];
        o2 += flt[17] * ip[17];
    }

    op[0] = o0;
    op[1] = o1;
    op[2] = o2;

#else // LANCIR_ALIGN > 4

    const int ci = kl >> 1;

    LANCIR_LF_PRE

    int c = ci;

    // Scalar fallback: 2 taps per iteration, one accumulator per
    // (tap, channel) pair.
    const float xx = flt[0];
    float sum0 = xx * ip[0];
    float sum1 = xx * ip[1];
    float sum2 = xx * ip[2];
    const float xx2 = flt[1];
    float sum3 = xx2 * ip[3];
    float sum4 = xx2 * ip[4];
    float sum5 = xx2 * ip[5];

    while (--c != 0) {
        flt += 2;
        ip += 6;
        const float xx = flt[0];
        sum0 += xx * ip[0];
        sum1 += xx * ip[1];
        sum2 += xx * ip[2];
        const float xx2 = flt[1];
        sum3 += xx2 * ip[3];
        sum4 += xx2 * ip[4];
        sum5 += xx2 * ip[5];
    }

    op[0] = sum0 + sum3;
    op[1] = sum1 + sum4;
    op[2] = sum2 + sum5;

#endif // LANCIR_ALIGN > 4

    LANCIR_LF_POST
}
|
|
|
|
// 4-channel resize kernel. One replicated filter group of 4 floats per
// tap lines up exactly with one 4-channel pixel, so no remainder
// handling is needed (`kl` is even).
template<bool UseSP>
static void resize4(const float *const sp, float *op, const size_t opinc,
        const CResizePos *rp, const int kl, const int DstLen)
{
#if LANCIR_ALIGN > 4
    const int ci = kl >> 1;
#else // LANCIR_ALIGN > 4
    const int ci = kl;
#endif // LANCIR_ALIGN > 4

    LANCIR_LF_PRE

    int c = ci;

#if defined(LANCIR_AVX)

    // 8-wide accumulation (2 taps per iteration); halves combined at
    // the end yield the 4 channel outputs.
    __m256 sum = _mm256_mul_ps(_mm256_load_ps(flt),
            _mm256_loadu_ps(ip));

    while (--c != 0) {
        flt += 8;
        ip += 8;
        sum = _mm256_add_ps(sum, _mm256_mul_ps(_mm256_load_ps(flt), _mm256_loadu_ps(ip)));
    }

    _mm_store_ps(op, _mm_add_ps(_mm256_extractf128_ps(sum, 0), _mm256_extractf128_ps(sum, 1)));

#elif LANCIR_ALIGN > 4

    // Two 4-wide accumulators (2 taps per iteration). Note the
    // aligned loads on `ip`: 4-channel scanline pixels keep
    // LANCIR_ALIGN alignment.
    lancvec_t sumA = lancvec_mul(lancvec_load(flt),
            lancvec_load(ip));

    lancvec_t sumB = lancvec_mul(lancvec_load(flt + 4),
            lancvec_load(ip + 4));

    while (--c != 0) {
        flt += 8;
        ip += 8;
        sumA = lancvec_madd(sumA, lancvec_load(flt),
                lancvec_load(ip));

        sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                lancvec_load(ip + 4));
    }

    lancvec_store(op, lancvec_add(sumA, sumB));

#else // LANCIR_ALIGN > 4

    // Scalar fallback: one tap per iteration, one accumulator per
    // channel.
    const float xx = flt[0];
    float sum0 = xx * ip[0];
    float sum1 = xx * ip[1];
    float sum2 = xx * ip[2];
    float sum3 = xx * ip[3];

    while (--c != 0) {
        flt++;
        ip += 4;
        const float xx = flt[0];
        sum0 += xx * ip[0];
        sum1 += xx * ip[1];
        sum2 += xx * ip[2];
        sum3 += xx * ip[3];
    }

    op[0] = sum0;
    op[1] = sum1;
    op[2] = sum2;
    op[3] = sum3;

#endif // LANCIR_ALIGN > 4

    LANCIR_LF_POST
}
|
|
|
|
/** @} */
|
|
|
|
#undef LANCIR_LF_PRE
|
|
#undef LANCIR_LF_POST
|
|
};
|
|
|
|
#undef lancvec_t
|
|
#undef lancvec_const_splat
|
|
#undef lancvec_load32_splat
|
|
#undef lancvec_load
|
|
#undef lancvec_loadu
|
|
#undef lancvec_store
|
|
#undef lancvec_storeu
|
|
#undef lancvec_add
|
|
#undef lancvec_mul
|
|
#undef lancvec_min
|
|
#undef lancvec_max
|
|
#undef lancvec_madd
|
|
#undef lancvec_addhl
|
|
#undef lancvec_store32_addhl
|
|
#undef lancvec_store32_hadd
|
|
#undef lancvec_store64_addhl
|
|
|
|
#if defined(LANCIR_NULLPTR)
|
|
#undef nullptr
|
|
#undef LANCIR_NULLPTR
|
|
#endif // defined( LANCIR_NULLPTR )
|
|
|
|
} // namespace avir
|
|
|
|
#endif // AVIR_CLANCIR_INCLUDED
|