// yacreader/image_processing/lancir.h

/**
 * @file lancir.h
 *
 * @version 3.1
 *
 * @brief Self-contained header-only "LANCIR" image resizing algorithm.
 *
 * This is a self-contained inclusion file for the "LANCIR" image resizer,
 * a part of the AVIR library. Features scalar, AVX, SSE2, NEON, and WASM
 * SIMD128 optimizations as well as batched resizing technique which provides
 * a better CPU cache performance.
 *
 * AVIR Copyright (c) 2015-2025 Aleksey Vaneev
 *
 * @mainpage
 *
 * @section intro_sec Introduction
 *
 * Description is available at https://github.com/avaneev/avir
 *
 * @section license License
 *
 * LICENSE:
 *
 * Copyright (c) 2015-2025 Aleksey Vaneev
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#ifndef AVIR_CLANCIR_INCLUDED
#define AVIR_CLANCIR_INCLUDED

#include <cmath>
#include <cstring>

#if __cplusplus >= 201103L

#include <cstdint>

#else // __cplusplus >= 201103L

#include <stdint.h>

#endif // __cplusplus >= 201103L

/**
 * @def LANCIR_ALIGN
 * @brief Address alignment (granularity) used by resizing functions,
 * in bytes.
 */

/**
 * @def LANCIR_NULLPTR
 * @brief Macro is defined, if `nullptr` workaround is in use, for pre-C++11
 * compilers. Undefined at the end of file.
 */

// Select the SIMD back-end from compiler-provided macros. The first matching
// branch of the chain wins; every branch defines LANCIR_ALIGN, and the SIMD
// branches additionally include an intrinsics header and define one or more
// LANCIR_* feature macros that are tested further below.

#if defined(__AVX__)

#include <immintrin.h>

#define LANCIR_AVX
#define LANCIR_SSE2 // Some functions use SSE2; AVX has a higher priority.
#define LANCIR_ALIGN 32

#elif defined(__aarch64__) || defined(__arm64__) || \
        defined(_M_ARM64) || defined(_M_ARM64EC)

// 64-bit ARM targets: NEON. MSVC uses its own header name.

#if defined(_MSC_VER)
#include <arm64_neon.h>

#if _MSC_VER < 1925
#define LANCIR_ARM32 // Do not use some newer NEON intrinsics.
#endif // _MSC_VER < 1925
#else // defined( _MSC_VER )
#include <arm_neon.h>
#endif // defined( _MSC_VER )

#define LANCIR_NEON
#define LANCIR_ALIGN 16

#elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM)

// Other ARM targets with NEON: LANCIR_ARM32 is always defined here, which
// selects the NEON code paths that avoid the newer intrinsics.

#include <arm_neon.h>

#define LANCIR_ARM32
#define LANCIR_NEON
#define LANCIR_ALIGN 16

#elif defined(__SSE2__) || defined(_M_AMD64) || \
        (defined(_M_IX86_FP) && _M_IX86_FP == 2)

// x86/x86-64 with SSE2 but without AVX (AVX was handled by the first branch).

#if defined(_MSC_VER)
#include <intrin.h>
#else // defined( _MSC_VER )
#include <emmintrin.h>
#endif // defined( _MSC_VER )

#define LANCIR_SSE2
#define LANCIR_ALIGN 16

#elif defined(__wasm_simd128__)

// WebAssembly with the SIMD128 feature enabled.

#include <wasm_simd128.h>

#define LANCIR_WASM
#define LANCIR_ALIGN 16

#else // No SIMD back-end selected: scalar fallback.

// An alignment of 4 disables the `LANCIR_ALIGN > 4` sections of this file.

#define LANCIR_ALIGN 4

#endif // SIMD back-end selection

// Back-end neutral 4 x float vector layer. Each SIMD back-end maps the
// `lancvec_*` names onto its own intrinsics; names a back-end leaves undefined
// are given generic definitions in the `LANCIR_ALIGN > 4` section below.

#if defined(LANCIR_SSE2)

// SSE2 mapping (also active for AVX builds, which define LANCIR_SSE2 too).

#define lancvec_t __m128
#define lancvec_const_splat(v) _mm_set1_ps(v)
#define lancvec_load(m) _mm_load_ps(m)
#define lancvec_loadu(m) _mm_loadu_ps(m)
#define lancvec_store(m, v) _mm_store_ps(m, v)
#define lancvec_storeu(m, v) _mm_storeu_ps(m, v)
#define lancvec_add(v1, v2) _mm_add_ps(v1, v2)
#define lancvec_mul(v1, v2) _mm_mul_ps(v1, v2)
#define lancvec_min(v1, v2) _mm_min_ps(v1, v2)
#define lancvec_max(v1, v2) _mm_max_ps(v1, v2)
// Multiply-add expressed as a separate multiply followed by an add.
#define lancvec_madd(va, v1, v2) _mm_add_ps(va, _mm_mul_ps(v1, v2))
// Adds the two upper lanes of `vh` to the two lower lanes of `vl`.
#define lancvec_addhl(vl, vh) _mm_add_ps(vl, _mm_movehl_ps(vh, vh))
// Stores lane 0 + lane 1 of `v` as a single float at `m`.
#define lancvec_store32_addhl(m, v) \
    _mm_store_ss(m, _mm_add_ss(v, _mm_shuffle_ps(v, v, 1)))

// Stores the two lower lanes of `lancvec_addhl( v, v )` (two floats) at `m`.
#define lancvec_store64_addhl(m, v) \
    _mm_storel_pi((__m64 *)(m), lancvec_addhl(v, v))

#elif defined(LANCIR_NEON)

// NEON mapping. `lancvec_loadu` and `lancvec_storeu` are not defined here, so
// the generic definitions below alias them to `lancvec_load`/`lancvec_store`.

#define lancvec_t float32x4_t
#define lancvec_const_splat(v) vdupq_n_f32(v)
#define lancvec_load(m) vld1q_f32(m)
#define lancvec_store(m, v) vst1q_f32(m, v)
#define lancvec_add(v1, v2) vaddq_f32(v1, v2)
#define lancvec_mul(v1, v2) vmulq_f32(v1, v2)
#define lancvec_min(v1, v2) vminq_f32(v1, v2)
#define lancvec_max(v1, v2) vmaxq_f32(v1, v2)
#define lancvec_madd(va, v1, v2) vmlaq_f32(va, v1, v2)

// Horizontal sum of all four lanes of `v`, stored as a single float at `m`.
// The LANCIR_ARM32 variant adds the vector halves and then the two remaining
// lanes; the other variant uses the across-vector add `vaddvq_f32`.

#if defined(LANCIR_ARM32)
#define lancvec_store32_hadd(m, v)                        \
    {                                                     \
        const float32x2_t v2 = vadd_f32(vget_high_f32(v), \
                                        vget_low_f32(v)); \
        *(m) = vget_lane_f32(v2, 0) +                     \
                vget_lane_f32(v2, 1);                     \
    }                                                     \
    (void)0
#else // defined( LANCIR_ARM32 )
#define lancvec_store32_hadd(m, v) *(m) = vaddvq_f32(v)
#endif // defined( LANCIR_ARM32 )

// Stores the sum of the upper and lower halves of `v` (two floats) at `m`.
// The expansion deliberately has no trailing semicolon, like the SSE2 and
// WASM variants of this macro, so that `lancvec_store64_addhl( m, v );` is
// exactly one statement and remains usable in an unbraced `if`/`else`.
#define lancvec_store64_addhl(m, v) \
    vst1_f32(m, vadd_f32(vget_high_f32(v), vget_low_f32(v)))

#elif defined(LANCIR_WASM)

// WebAssembly SIMD128 mapping. This is the only back-end that provides its own
// `lancvec_load32_splat`.

#define lancvec_t v128_t
#define lancvec_const_splat(v) wasm_f32x4_const_splat(v)
#define lancvec_load32_splat(m) wasm_v128_load32_splat(m)
#define lancvec_load(m) wasm_v128_load(m)
#define lancvec_store(m, v) wasm_v128_store(m, v)
#define lancvec_add(v1, v2) wasm_f32x4_add(v1, v2)
#define lancvec_mul(v1, v2) wasm_f32x4_mul(v1, v2)
#define lancvec_min(v1, v2) wasm_f32x4_min(v1, v2)
#define lancvec_max(v1, v2) wasm_f32x4_max(v1, v2)
// Multiply-add expressed as a separate multiply followed by an add.
#define lancvec_madd(va, v1, v2) wasm_f32x4_add(va, \
                                                wasm_f32x4_mul(v1, v2))

// Adds the two upper lanes of `vh` to the two lower lanes of `vl`.
#define lancvec_addhl(vl, vh) wasm_f32x4_add(vl, \
                                             wasm_i32x4_shuffle(vh, vh, 6, 7, 2, 3))

// Stores lane 0 + lane 1 of `v` as a single float at `m`.
#define lancvec_store32_addhl(m, v)         \
    *(m) = (wasm_f32x4_extract_lane(v, 0) + \
            wasm_f32x4_extract_lane(v, 1))

// Stores the two lower lanes of `lancvec_addhl( v, v )` (two floats) at `m`.
#define lancvec_store64_addhl(m, v) \
    wasm_v128_store64_lane(m, lancvec_addhl(v, v), 0)

#endif // defined( LANCIR_WASM )

// Generic definitions for any SIMD back-end (LANCIR_ALIGN is 4 only in the
// scalar build). Each one is supplied only if the selected back-end did not
// define the name itself.

#if LANCIR_ALIGN > 4

// Splat of a single float read from memory.
#if !defined(lancvec_load32_splat)
#define lancvec_load32_splat(m) lancvec_const_splat(*(m))
#endif // !defined( lancvec_load32_splat )

// Without a dedicated unaligned load, the regular load is used.
#if !defined(lancvec_loadu)
#define lancvec_loadu(m) lancvec_load(m)
#endif // !defined( lancvec_loadu )

// Without a dedicated unaligned store, the regular store is used.
#if !defined(lancvec_storeu)
#define lancvec_storeu(m, v) lancvec_store(m, v)
#endif // !defined( lancvec_storeu )

// Horizontal sum of all four lanes of `v`, stored as a single float at `m`:
// the vector halves are added first, then the two remaining lanes. The
// trailing `(void)0` makes the expansion require a terminating semicolon.
#if !defined(lancvec_store32_hadd)
#define lancvec_store32_hadd(m, v)                \
    {                                             \
        const lancvec_t v2 = lancvec_addhl(v, v); \
        lancvec_store32_addhl(m, v2);             \
    }                                             \
    (void)0
#endif // !defined( lancvec_store32_hadd )

#endif // LANCIR_ALIGN > 4

namespace avir {

using std ::ceil;
using std ::cos;
using std ::fabs;
using std ::floor;
using std ::memcpy;
using std ::memset;
using std ::sin;
using std ::size_t;

#if __cplusplus >= 201103L

using std ::intptr_t;
using std ::uintptr_t;

#else // __cplusplus >= 201103L

// Workaround for pre-C++11 compilers. `nullptr` is a keyword, and not a
// macro, but check if such workaround is already in place.

#if !defined(nullptr)
#define nullptr NULL
#define LANCIR_NULLPTR
#endif // !defined( nullptr )

#endif // __cplusplus >= 201103L

/**
 * @brief LANCIR resizing parameters class.
 *
 * An object of this class, which can be allocated on stack, can be used to
 * pass non-default parameters to the resizing algorithm. See the constructor
 * for the default values.
 */

class CLancIRParams
{
public:
    /// Physical size of the source scanline, in elements (not bytes).
    /// `SrcWidth * ElCount` is used if this value is below 1.
    int SrcSSize;

    /// Physical size of the destination scanline, in elements (not bytes).
    /// `NewWidth * ElCount` is used if this value is below 1.
    int NewSSize;

    /// Horizontal resizing step: one output pixel corresponds to `kx` input
    /// pixels. Values above 1.0 downsize, values at or below 1.0 upsize.
    /// A value of 0 selects the step automatically. A negative value supplies
    /// the step as `-kx` and bypasses the `ox`/`oy` adjustment that is
    /// otherwise applied to produce a centered image.
    double kx;

    /// Vertical resizing step; interpreted the same way as `kx`.
    double ky;

    /// Start X pixel offset within the source image; can be negative.
    /// A positive offset moves the image to the left.
    double ox;

    /// Start Y pixel offset within the source image; can be negative.
    /// A positive offset moves the image to the top.
    double oy;

    /// Lanczos window function's `a` parameter; must be greater or equal
    /// to 2.0.
    double la;

    /**
     * @brief Default constructor; every argument is optional and is copied
     * to the class variable of the corresponding name. `la` is always set to
     * 3.0 and can be changed on the constructed object.
     *
     * @param aSrcSSize Physical size of the source scanline.
     * @param aNewSSize Physical size of the destination scanline.
     * @param akx Resizing step - horizontal.
     * @param aky Resizing step - vertical.
     * @param aox Start X pixel offset.
     * @param aoy Start Y pixel offset.
     */

    CLancIRParams(const int aSrcSSize = 0, const int aNewSSize = 0,
                  const double akx = 0.0, const double aky = 0.0,
                  const double aox = 0.0, const double aoy = 0.0)
        : SrcSSize(aSrcSSize)
        , NewSSize(aNewSSize)
        , kx(akx)
        , ky(aky)
        , ox(aox)
        , oy(aoy)
        , la(3.0)
    {
    }
};

/**
 * @brief LANCIR image resizer class.
 *
 * The object of this class can be used to resize 1-4 channel images to any
 * required size. Resizing is performed by utilizing Lanczos filters, with
 * 8-bit precision. This class offers a kind of "optimal" Lanczos resampling
 * implementation.
 *
 * Object of this class can be allocated on stack.
 *
 * Note that an object of this class does not free temporary buffers and
 * variables after the resizeImage() function call (until the object's
 * destruction): these buffers are reused (or reallocated) on subsequent
 * calls, thus making batch resizing of images faster. This means resizing is
 * not thread-safe: a separate CLancIR object should be created for each
 * thread.
 */

class CLancIR
{
private:
    // Copy constructor, declared in the private section so that code outside
    // of the class cannot copy-construct a resizer. The body does not
    // initialize any member.
    CLancIR(const CLancIR &)
    {
        // Unsupported.
    }

    // Copy assignment, declared in the private section so that code outside
    // of the class cannot assign one resizer to another. Nothing is copied;
    // the object is returned unchanged.
    CLancIR &operator=(const CLancIR &)
    {
        return *this; // Unsupported.
    }

public:
    /**
     * @brief Default constructor. Creates a resizer without any intermediate
     * buffers; they are allocated by the first resizeImage() call.
     *
     * Every buffer pointer, including the address-aligned `FltBuf` alias, is
     * set to `nullptr` so that no member is left indeterminate. Initializers
     * are listed in member declaration order.
     */

    CLancIR()
        : FltBuf0(nullptr)
        , FltBuf0Len(0)
        , FltBuf(nullptr)
        , spv0(nullptr)
        , spv0len(0)
        , spv(nullptr)
    {
    }

    /**
     * @brief Destructor. Releases both intermediate buffers.
     *
     * Only the originally allocated pointers (`spv0`, `FltBuf0`) are freed;
     * `spv` and `FltBuf` are address-aligned aliases into the same memory.
     */

    ~CLancIR()
    {
        delete[] spv0;
        delete[] FltBuf0;
    }

    /**
     * @brief Function resizes an image.
     *
     * Performs input-to-output type conversion, if necessary.
     *
     * @param[in] SrcBuf Source image buffer.
     * @param SrcWidth Source image width, in pixels.
     * @param SrcHeight Source image height, in pixels.
     * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal
     * to `SrcBuf`.
     * @param NewWidth New image width, in pixels.
     * @param NewHeight New image height, in pixels.
     * @param ElCount The number of elements (channels) used to store each
     * source and destination pixel (1-4).
     * @param aParams Custom resizing parameters. Can be `nullptr`, for
     * default values.
     * @tparam Tin Input buffer's element type. Can be `uint8_t` (`0..255`
     * value range), `uint16_t` (`0..65535` value range), `float` (`0..1`
     * value range), `double` (`0..1` value range). `uint32_t` type is treated
     * as `uint16_t`. Signed integer types and larger integer types are not
     * supported.
     * @tparam Tout Output buffer's element type, treated like `Tin`. If `Tin`
     * and `Tout` types do not match, an output value scaling will be applied.
     * Floating-point output will not be clamped/clipped/saturated; integer
     * output is always rounded and clamped.
     * @return The number of available output scanlines. Equal to
     * `NewHeight`, or 0 on function parameters error.
     */

    template<typename Tin, typename Tout>
    int resizeImage(const Tin *const SrcBuf, const int SrcWidth,
                    const int SrcHeight, Tout *const NewBuf, const int NewWidth,
                    const int NewHeight, const int ElCount,
                    const CLancIRParams *const aParams = nullptr)
    {
        // Reject invalid arguments. `ElCount` has to be checked as well: only
        // 1-4 elements per pixel have dedicated code paths below, and any
        // other value would otherwise be processed by the 4-element path with
        // mismatching strides, or produce zero/negative buffer sizes.

        if ((SrcWidth < 0) | (SrcHeight < 0) |
            (NewWidth <= 0) | (NewHeight <= 0) |
            (ElCount < 1) | (ElCount > 4) |
            (SrcBuf == nullptr) | (NewBuf == nullptr) |
            ((const void *)SrcBuf == (const void *)NewBuf)) {
            return (0);
        }

        static const CLancIRParams DefParams;
        const CLancIRParams &Params = (aParams != nullptr ? *aParams : DefParams);

        if (Params.la < 2.0) {
            return (0);
        }

        const int OutSLen = NewWidth * ElCount;
        const size_t NewScanlineSize = (size_t)(Params.NewSSize < 1 ? OutSLen : Params.NewSSize);

        // An empty source image produces an all-zero output image.

        if ((SrcWidth == 0) | (SrcHeight == 0)) {
            Tout *op = NewBuf;
            int i;

            for (i = 0; i < NewHeight; i++) {
                memset(op, 0, (size_t)OutSLen * sizeof(Tout));
                op += NewScanlineSize;
            }

            return (NewHeight);
        }

        const size_t SrcScanlineSize = (size_t)(Params.SrcSSize < 1 ? SrcWidth * ElCount : Params.SrcSSize);

        // Resolve resizing steps and offsets. A non-negative step enables the
        // centering adjustment of the offset (a zero step is additionally
        // replaced by the source/destination size ratio); a negative step is
        // used as `-k`, without the adjustment.

        double ox = Params.ox;
        double oy = Params.oy;
        double kx;
        double ky;

        if (Params.kx >= 0.0) {
            kx = (Params.kx == 0.0 ? (double)SrcWidth / NewWidth : Params.kx);

            ox += (kx - 1.0) * 0.5;
        } else {
            kx = -Params.kx;
        }

        if (Params.ky >= 0.0) {
            ky = (Params.ky == 0.0 ? (double)SrcHeight / NewHeight : Params.ky);

            oy += (ky - 1.0) * 0.5;
        } else {
            ky = -Params.ky;
        }

        // Update filter banks; scanline positions are reset whenever the
        // filter bank they refer to has been rebuilt.

        if (rfv.update(Params.la, ky, ElCount)) {
            rsv.reset();
            rsh.reset();
        }

        CResizeFilters *rfh; // Pointer to resizing filters for horizontal
                             // resizing, may be equal to `&rfv` if the same stepping is in use.

        if (kx == ky) {
            rfh = &rfv;
        } else {
            rfh = &rfh0;

            if (rfh0.update(Params.la, kx, ElCount)) {
                rsh.reset();
            }
        }

        rsv.update(SrcHeight, NewHeight, oy, rfv, spv);
        rsh.update(SrcWidth, NewWidth, ox, *rfh);

        // Calculate vertical progressive resizing's batch size. Progressive
        // batching is used to try to keep addressing within the cache
        // capacity. This technique definitely works well for single-threaded
        // resizing on most CPUs, but may not provide an additional benefit
        // for multi-threaded resizing, or in a system-wide high-load
        // situations.

        const size_t FltWidthE = (size_t)((rsh.padl + SrcWidth +
                                           rsh.padr) *
                                          ElCount);

        const double CacheSize = 5500000.0; // Tuned for various CPUs.
        const double OpSize = (double)SrcScanlineSize * SrcHeight *
                        sizeof(Tin) +
                (double)FltWidthE * NewHeight * sizeof(float);

        int BatchSize = (int)(NewHeight * CacheSize / (OpSize + 1.0));

        if (BatchSize < 8) {
            BatchSize = 8;
        }

        if (BatchSize > NewHeight) {
            BatchSize = NewHeight;
        }

        // Allocate/resize intermediate buffers. `spv` doubles as the output
        // scanline buffer, hence it has to hold at least `OutSLen` elements.

        const int svs = (rsv.padl + SrcHeight + rsv.padr) * ElCount;
        float *const pspv0 = spv0;
        reallocBuf(spv0, spv, spv0len, (svs > OutSLen ? svs : OutSLen));
        reallocBuf(FltBuf0, FltBuf, FltBuf0Len,
                   FltWidthE * (size_t)BatchSize);

        // The vertical positions were updated with the previous `spv`
        // address; refresh them if the buffer has moved.

        if (spv0 != pspv0) {
            rsv.updateSPO(rfv, spv);
        }

        // Prepare output-related constants.

        static const bool IsInFloat = ((Tin)0.25f != 0);
        static const bool IsOutFloat = ((Tout)0.25f != 0);
        static const bool IsUnityMul = (IsInFloat && IsOutFloat) ||
                (IsInFloat == IsOutFloat && sizeof(Tin) == sizeof(Tout));

        const float Clamp = (sizeof(Tout) == 1 ? 255.0f : 65535.0f);
        const float OutMul = (IsOutFloat ? 1.0f : Clamp) /
                (IsInFloat ? 1.0f : (sizeof(Tin) == 1 ? 255.0f : 65535.0f));

        // Perform batched resizing.

        const CResizePos *rpv = rsv.pos;
        Tout *opn = NewBuf;
        int bl = NewHeight;

        while (bl > 0) {
            const int bc = (bl > BatchSize ? BatchSize : bl);

            int kl = rfv.KernelLen;
            const Tin *ip = SrcBuf;
            float *op = FltBuf + rsh.padl * ElCount;

            const int so = (int)rpv[0].so;
            float *const sp = spv + so * ElCount;

            int cc = (int)rpv[bc - 1].so - so + kl; // Pixel copy count.
            int rl = 0; // Leftmost pixel's replication count.
            int rr = 0; // Rightmost pixel's replication count.

            const int socc = so + cc;
            const int spe = rsv.padl + SrcHeight;

            // Calculate scanline copying and padding parameters, depending on
            // the batch's size and its vertical offset.

            if (so < rsv.padl) {
                if (socc <= rsv.padl) {
                    rl = cc;
                    cc = 0;
                } else {
                    if (socc > spe) {
                        rr = socc - spe;
                        cc -= rr;
                    }

                    rl = rsv.padl - so;
                    cc -= rl;
                }
            } else {
                if (so >= spe) {
                    rr = cc;
                    cc = 0;
                    ip += (size_t)SrcHeight * SrcScanlineSize;
                } else {
                    if (socc > spe) {
                        rr = socc - spe;
                        cc -= rr;
                    }

                    ip += (size_t)(so - rsv.padl) * SrcScanlineSize;
                }
            }

            // Batched vertical resizing: one source column at a time is
            // copied to `sp` and resized into the intermediate buffer.

            int i;

            if (ElCount == 1) {
                for (i = 0; i < SrcWidth; i++) {
                    copyScanline1v(ip, SrcScanlineSize, sp, cc, rl, rr);
                    resize1<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                    ip += 1;
                    op += 1;
                }
            } else if (ElCount == 2) {
                for (i = 0; i < SrcWidth; i++) {
                    copyScanline2v(ip, SrcScanlineSize, sp, cc, rl, rr);
                    resize2<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                    ip += 2;
                    op += 2;
                }
            } else if (ElCount == 3) {
                for (i = 0; i < SrcWidth; i++) {
                    copyScanline3v(ip, SrcScanlineSize, sp, cc, rl, rr);
                    resize3<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                    ip += 3;
                    op += 3;
                }
            } else // ElCount == 4
            {
                for (i = 0; i < SrcWidth; i++) {
                    copyScanline4v(ip, SrcScanlineSize, sp, cc, rl, rr);
                    resize4<false>(nullptr, op, FltWidthE, rpv, kl, bc);
                    ip += 4;
                    op += 4;
                }
            }

            // Perform horizontal resizing batch, and produce final output.

            float *ipf = FltBuf;
            kl = rfh->KernelLen;

            if (ElCount == 1) {
                for (i = 0; i < bc; i++) {
                    padScanline1h(ipf, rsh, SrcWidth);
                    resize1<true>(ipf, spv, 1, rsh.pos, kl, NewWidth);
                    outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                                                           OutSLen, Clamp, OutMul);

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            } else if (ElCount == 2) {
                for (i = 0; i < bc; i++) {
                    padScanline2h(ipf, rsh, SrcWidth);
                    resize2<true>(ipf, spv, 2, rsh.pos, kl, NewWidth);
                    outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                                                           OutSLen, Clamp, OutMul);

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            } else if (ElCount == 3) {
                for (i = 0; i < bc; i++) {
                    padScanline3h(ipf, rsh, SrcWidth);
                    resize3<true>(ipf, spv, 3, rsh.pos, kl, NewWidth);
                    outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                                                           OutSLen, Clamp, OutMul);

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            } else // ElCount == 4
            {
                for (i = 0; i < bc; i++) {
                    padScanline4h(ipf, rsh, SrcWidth);
                    resize4<true>(ipf, spv, 4, rsh.pos, kl, NewWidth);
                    outputScanline<IsOutFloat, IsUnityMul>(spv, opn,
                                                           OutSLen, Clamp, OutMul);

                    ipf += FltWidthE;
                    opn += NewScanlineSize;
                }
            }

            // Advance to the next batch of output scanlines.

            rpv += bc;
            bl -= bc;
        }

        return (NewHeight);
    }

    /**
     * @brief Legacy image resizing function.
     *
     * Not recommended for new projects. See the prior resizeImage() function
     * and CLancIRParams class for details.
     *
     * @param[in] SrcBuf Source image buffer.
     * @param SrcWidth Source image width, in pixels.
     * @param SrcHeight Source image height, in pixels.
     * @param SrcSSize Physical size of the source scanline, in elements (not
     * bytes).
     * @param[out] NewBuf Buffer to accept the resized image. Cannot be equal
     * to SrcBuf.
     * @param NewWidth New image width, in pixels.
     * @param NewHeight New image height, in pixels.
     * @param NewSSize Physical size of the destination scanline, in elements
     * (not bytes).
     * @param ElCount The number of elements (channels) used to store each
     * source and destination pixel (1-4).
     * @param kx0 Resizing step - horizontal.
     * @param ky0 Resizing step - vertical. Same as `kx0`.
     * @param ox Start X pixel offset within the source image.
     * @param oy Start Y pixel offset within the source image.
     * @tparam Tin Input buffer's element type.
     * @tparam Tout Output buffer's element type.
     * @return The number of available output scanlines. Equal to
     * `NewHeight`, or 0 on function parameters error.
     */

    template<typename Tin, typename Tout>
    int resizeImage(const Tin *const SrcBuf, const int SrcWidth,
                    const int SrcHeight, const int SrcSSize, Tout *const NewBuf,
                    const int NewWidth, const int NewHeight, const int NewSSize,
                    const int ElCount, const double kx0 = 0.0, const double ky0 = 0.0,
                    double ox = 0.0, double oy = 0.0)
    {
        const CLancIRParams Params(SrcSSize, NewSSize, kx0, ky0, ox, oy);

        return (resizeImage(SrcBuf, SrcWidth, SrcHeight, NewBuf, NewWidth,
                            NewHeight, ElCount, &Params));
    }

protected:
    float *FltBuf0; ///< Intermediate resizing buffer.
    size_t FltBuf0Len; ///< Length of `FltBuf0`.
    float *FltBuf; ///< Address-aligned `FltBuf0`.
    float *spv0; ///< Scanline buffer for vertical resizing, also used at the
                 ///< output stage.
    int spv0len; ///< Length of `spv0`.
    float *spv; ///< Address-aligned `spv0`.

    /**
     * @brief Typed buffer reallocation function, with address alignment.
     *
     * Function reallocates a typed buffer if its current length is
     * smaller than the required length, applies `LANCIR_ALIGN` address
     * alignment to the buffer pointer.
     *
     * @param buf0 Reference to the pointer of the previously allocated
     * buffer.
     * @param buf Reference to address-aligned `buf0` pointer.
     * @param len The current length of the `buf0`.
     * @param newlen A new required length.
     * @tparam Tb Buffer element type.
     * @tparam Tl Length variable type.
     */

    template<typename Tb, typename Tl>
    static void reallocBuf(Tb *&buf0, Tb *&buf, Tl &len, Tl newlen)
    {
        // Reserve LANCIR_ALIGN extra elements so that rounding the start
        // address up stays within the allocation.
        newlen += LANCIR_ALIGN;

        if (newlen <= len) {
            return; // The current buffer is large enough, keep it as is.
        }

        // Release the old buffer and clear its bookkeeping before the new
        // allocation is attempted.
        if (buf0 != nullptr) {
            delete[] buf0;
            buf0 = nullptr;
            len = 0;
        }

        buf0 = new Tb[newlen];
        len = newlen;

        // Round the address up to the next multiple of LANCIR_ALIGN bytes.
        const uintptr_t AlignMask = (uintptr_t)(LANCIR_ALIGN - 1);
        const uintptr_t Addr = (uintptr_t)buf0 + AlignMask;

        buf = (Tb *)(Addr & ~AlignMask);
    }

    /**
     * @brief Typed buffer reallocation function.
     *
     * Function reallocates a typed buffer if its current length is smaller
     * than the required length.
     *
     * @param buf Reference to the pointer of the previously allocated buffer;
     * address alignment will not be applied.
     * @param len The current length of the `buf`.
     * @param newlen A new required length.
     * @tparam Tb Buffer element type.
     * @tparam Tl Length variable type.
     */

    template<typename Tb, typename Tl>
    static void reallocBuf(Tb *&buf, Tl &len, const Tl newlen)
    {
        if (newlen <= len) {
            return; // The current buffer is large enough, keep it as is.
        }

        // Release the old buffer and clear its bookkeeping before the new
        // allocation is attempted. The previous contents are not preserved.
        if (buf != nullptr) {
            delete[] buf;
            buf = nullptr;
            len = 0;
        }

        buf = new Tb[newlen];
        len = newlen;
    }

    class CResizeScanline;

    /**
     * @brief Class for fractional delay filter bank storage and calculation.
     */

    class CResizeFilters
    {
        friend class CResizeScanline;

    public:
        int KernelLen; ///< Resampling filter kernel's length, taps. Available
                       ///< after the update() function call. Always an even value,
                       ///< should not be lesser than 4.

        // Constructs an empty filter bank: there is no filter table, and `la`
        // is 0.0. The buffer pointers and buffer lengths are zero-filled,
        // which also keeps the `delete[]` calls of the destructor harmless
        // for buffers that were never allocated.
        CResizeFilters()
            : Filters(nullptr)
            , FiltersLen(0)
            , la(0.0)
        {
            memset(Bufs0Len, 0, sizeof(Bufs0Len));
            memset(Bufs0, 0, sizeof(Bufs0));
        }

        // Frees each of the `BufCount` filter buffers, and then the filter
        // table itself.
        ~CResizeFilters()
        {
            for (int b = 0; b < BufCount; b++) {
                delete[] Bufs0[b];
            }

            delete[] Filters;
        }

        /**
         * @brief Function updates the filter bank.
         *
         * @param la0 Lanczos `a` parameter value (greater or equal to 2.0),
         * can be fractional.
         * @param k0 Resizing step.
         * @param ElCount0 Image's element count, may be used for SIMD filter
         * tap replication.
         * @return `true`, if an update occurred and scanline resizing
         * positions should be updated unconditionally.
         */

        bool update(const double la0, const double k0, const int ElCount0)
        {
            // The bank is up to date if it was built for the same parameters.
            if (la0 == la && k0 == k && ElCount0 == ElCount) {
                return false;
            }

            // Normalized frequency: 1.0 for steps up to 1.0, `1 / k0` for
            // any other step value.
            double CutOff;

            if (k0 <= 1.0) {
                CutOff = 1.0;
            } else {
                CutOff = 1.0 / k0;
            }

            Freq = 3.1415926535897932 * CutOff;
            FreqA = Freq / la0;

            // Half-length of the kernel grows as the frequency gets lower;
            // the full length is twice the rounded-up half-length.
            Len2 = la0 / CutOff;
            fl2 = (int)ceil(Len2);
            KernelLen = 2 * fl2;

#if LANCIR_ALIGN > 4

            // SIMD build: every tap is repeated once per image element, and
            // the replicated length is rounded up to a whole number of
            // LANCIR_ALIGN-byte groups of floats.

            const int AlignMask =
                    (int)(LANCIR_ALIGN / sizeof(float)) - 1;

            ElRepl = ElCount0;
            KernelLenA = (KernelLen * ElRepl + AlignMask) & ~AlignMask;

#else // LANCIR_ALIGN > 4

            // Scalar build: taps are stored once, without padding.

            ElRepl = 1;
            KernelLenA = KernelLen;

#endif // LANCIR_ALIGN > 4

            FracCount = 1000; // Enough for Lanczos implicit 8-bit precision.

            // `la` is cleared before the table is reallocated, and is set
            // again only at the end - presumably so that a failure in between
            // leaves the bank marked as requiring an update.
            la = 0.0;
            reallocBuf(Filters, FiltersLen, FracCount + 1);

            // Mark all fractional positions as "no filter created yet".
            memset(Filters, 0, (size_t)FiltersLen * sizeof(Filters[0]));

            setBuf(0);

            // Remember the parameters the bank now corresponds to.
            la = la0;
            k = k0;
            ElCount = ElCount0;

            return true;
        }

        /**
         * @brief Filter acquisition function.
         *
         * Function returns filter at the specified fractional offset. This
         * function can only be called after a prior update() function call.
         *
         * @param x Fractional offset, [0; 1].
         * @return Pointer to a previously-calculated or a new filter.
         */

        const float *getFilter(const double x)
        {
            // Quantize the offset to the nearest of the `FracCount + 1`
            // tabulated positions.
            const int Frac = (int)(x * FracCount + 0.5);
            float *&Slot = Filters[Frac];

            if (Slot == nullptr) {
                // Not built yet: take the next free kernel-sized cell of the
                // batch that is currently being filled.
                float *const flt = Bufs[CurBuf] + CurBufFill * KernelLenA;

                Slot = flt;
                CurBufFill++;

                // The batch has just become full: switch to the next one, so
                // that the following new filter has a place to go.
                if (CurBufFill == BufLen) {
                    setBuf(CurBuf + 1);
                }

                makeFilterNorm(flt, 1.0 - (double)Frac / FracCount);

                if (ElRepl > 1) {
                    replicateFilter(flt, KernelLen, ElRepl);
                }
            }

            return (Slot);
        }

    protected:
        double Freq; ///< Circular frequency of the filter. Set by update()
                     ///< to `pi` times the normalized cutoff frequency.
        double FreqA; ///< Circular frequency of the window function. Set by
                      ///< update() to `Freq / la0`.
        double Len2; ///< Half resampling filter's length, unrounded.
        int fl2; ///< Half resampling filter's length, integer (`Len2`
                 ///< rounded up).
        int FracCount; ///< The number of fractional positions for which
                       ///< filters can be created.
        int KernelLenA; ///< SIMD-aligned and replicated filter kernel's
                        ///< length, in `float` elements. Equals `KernelLen`
                        ///< if SIMD alignment is not in use.
        int ElRepl; ///< The number of repetitions of each filter tap.
        static const int BufCount = 4; ///< The maximal number of buffers
                                       ///< (filter batches) that can be in use.
        static const int BufLen = 256; ///< The number of fractional filters
                                       ///< a single buffer (filter batch) may contain. Both the `BufLen`
                                       ///< and `BufCount` should correspond to the `FracCount` used:
                                       ///< `BufCount * BufLen` must not be lesser than `FracCount + 1`.
        float *Bufs0[BufCount]; ///< Buffers that hold all filters,
                                ///< original (as allocated; these pointers are the ones freed
                                ///< by the destructor).
        int Bufs0Len[BufCount]; ///< Allocated lengths in `Bufs0`, in
                                ///< `float` elements.
        float *Bufs[BufCount]; ///< Address-aligned `Bufs0`.
        int CurBuf; ///< Filter buffer currently being filled.
        int CurBufFill; ///< The number of fractional positions filled in the
                        ///< current filter buffer.
        float **Filters; ///< Fractional delay filters for all positions
                         ///< (`FracCount + 1` entries after update()).
                         ///< A particular pointer equals `nullptr`, if a filter for such
                         ///< position has not been created yet.
        int FiltersLen; ///< Allocated length of Filters, in elements.
        double la; ///< Current `la`. Equals 0 while the filter bank is not
                   ///< valid.
        double k; ///< Current `k`.
        int ElCount; ///< Current `ElCount`.

        /**
         * @brief Current buffer (filter batch) repositioning function.
         *
         * Function changes the buffer currently being filled, checks its size
         * and reallocates it, if necessary, then resets its fill counter.
         *
         * @param bi A new current buffer index.
         */

        void setBuf(const int bi)
        {
            // A batch has to provide room for `BufLen` kernels of the
            // aligned length.
            const int ReqLen = BufLen * KernelLenA;

            reallocBuf(Bufs0[bi], Bufs[bi], Bufs0Len[bi], ReqLen);

            // Filling restarts from the beginning of the selected batch.
            CurBufFill = 0;
            CurBuf = bi;
        }

        /**
         * @brief Sine-wave signal generator class.
         *
         * Class implements sine-wave signal generator without biasing, with
         * constructor-based initialization only. This generator uses an
         * oscillator instead of the `sin()` function.
         */

        class CSineGen
        {
        public:
            /**
             * @brief Sets up the two-term recurrence so that the first
             * generate() call returns `sin(ph)`.
             *
             * @param si Phase step between successive samples, in radians.
             * @param ph Phase of the first sample, in radians. Add `0.5*pi`
             * for a cosine function.
             */

            CSineGen(const double si, const double ph)
                : Cur(sin(ph)), Prev(sin(ph - si)), Coeff(2.0 * cos(si))
            {
            }

            /**
             * @brief Returns the current sample and advances the oscillator
             * by one step, without calling `sin()`.
             */

            double generate()
            {
                const double Out = Cur;
                const double Next = Coeff * Out - Prev;

                Prev = Out;
                Cur = Next;

                return (Out);
            }

        private:
            double Cur; ///< Sample the next generate() call will return.
            double Prev; ///< Sample one step before `Cur`.
            double Coeff; ///< Recurrence multiplier, `2*cos(si)`.
        };

        /**
         * @brief Filter calculation function.
         *
         * Function creates a filter for the specified fractional delay. The
         * update() function should be called prior to calling this function.
         * The created filter is normalized (DC gain=1).
         *
         * @param[out] op Output filter buffer.
         * @param FracDelay Fractional delay, 0 to 1, inclusive.
         */

        void makeFilterNorm(float *op, const double FracDelay) const
        {
            // Two oscillators replace per-tap `sin()` calls: `f` runs at the
            // filter's frequency, `fw` at the window's frequency. Both start
            // at the phase of the first tap position, `FracDelay - fl2`, and
            // every generate() call advances them by exactly one tap. For
            // that reason each tap below must consume one sample from both,
            // even when the sample itself is not used.
            CSineGen f(Freq, Freq * (FracDelay - fl2));
            CSineGen fw(FreqA, FreqA * (FracDelay - fl2));

            float *op0 = op; // Filter's start, kept for the normalization.
            double s = 0.0; // Running sum of all produced taps.
            double ut; // Tap position relative to the filter's center.

            int t = -fl2;

            // The first tap may lie beyond the unrounded half-length (`fl2`
            // is `Len2` rounded up): such tap is zeroed, the oscillators are
            // stepped past it.
            if (t + FracDelay < -Len2) {
                f.generate();
                fw.generate();
                *op = 0;
                op++;
                t++;
            }

            // A tap lands on position 0 when `FracDelay` is (almost) 1, at
            // t==-1, or (almost) 0, at t==0. `mt` is the index the loop below
            // stops at, so that this tap is handled separately: the general
            // formula would divide by zero there.
            int IsZeroX = (fabs(FracDelay - 1.0) < 2.3e-13);
            int mt = 0 - IsZeroX;
            IsZeroX |= (fabs(FracDelay) < 2.3e-13);

            // Taps to the left of the center:
            // sin(Freq*x) * sin(FreqA*x) / x^2.
            while (t < mt) {
                ut = t + FracDelay;
                *op = (float)(f.generate() * fw.generate() / (ut * ut));
                s += *op;
                op++;
                t++;
            }

            if (IsZeroX) // t+FracDelay==0
            {
                // `Freq * FreqA` is the value the general formula approaches
                // as the position approaches 0. The oscillators are still
                // advanced, to stay in step with the tap index.
                *op = (float)(Freq * FreqA);
                s += *op;
                f.generate();
                fw.generate();
            } else {
                ut = FracDelay; // t==0
                *op = (float)(f.generate() * fw.generate() / (ut * ut));
                s += *op;
            }

            // Taps to the right of the center, except the very last one.
            // Note that here `op` and `t` are advanced before the tap is
            // written, unlike in the loop above.
            mt = fl2 - 2;

            while (t < mt) {
                op++;
                t++;
                ut = t + FracDelay;
                *op = (float)(f.generate() * fw.generate() / (ut * ut));
                s += *op;
            }

            // The last tap is zeroed if it lies beyond the unrounded
            // half-length, mirroring the check done for the first tap.
            op++;
            ut = t + 1 + FracDelay;

            if (ut > Len2) {
                *op = 0;
            } else {
                *op = (float)(f.generate() * fw.generate() / (ut * ut));
                s += *op;
            }

            // Normalize: scale all written taps (`op - op0 + 1` of them) so
            // that they sum to 1.
            s = 1.0 / s;
            t = (int)(op - op0 + 1);

            while (t != 0) {
                *op0 = (float)(*op0 * s);
                op0++;
                t--;
            }
        }

        /**
         * @brief Filter tap replication function, for SIMD operations.
         *
         * Function replicates taps of the specified filter so that it can
         * be used with SIMD loading instructions. This function works
         * "in-place".
         *
         * @param[in,out] p Filter buffer pointer, should be sized to contain
         * `kl * erp` elements.
         * @param kl Filter kernel's length, in taps.
         * @param erp The number of repetitions to apply.
         */

        static void replicateFilter(float *const p, const int kl,
                                    const int erp)
        {
            // Only 2 and 3 repetitions are distinguished; any other count is
            // handled as 4.
            const int reps = (erp == 2 || erp == 3 ? erp : 4);

            // Expansion runs from the last tap towards the first one: the
            // group written for a tap never starts before the tap's own
            // source cell, so taps that are still unread stay intact.
            float *dst = p + (kl - 1) * erp;

            for (int tap = kl - 1; tap >= 0; tap--) {
                const float v = p[tap];

                for (int r = 0; r < reps; r++) {
                    dst[r] = v;
                }

                dst -= reps;
            }
        }
    };

    /**
     * @brief Structure defines source scanline positions and filters for each
     * destination pixel.
     */

    struct CResizePos {
        const float *flt; ///< Fractional delay filter to apply at this
                          ///< destination pixel.
        intptr_t spo; ///< Source scanline's pixel offset, in bytes, or
                      ///< a direct pointer to scanline buffer. Which one it is depends
                      ///< on whether a non-null scanline pointer was supplied when the
                      ///< value was computed: the pointer is added to the byte offset.
        intptr_t so; ///< Offset within the source scanline, in pixels.
                     ///< `spo` is derived from this value.
    };

    /**
     * @brief Scanline resizing positions class.
     *
     * Class contains resizing positions, and prepares source scanline
     * positions for resize filtering. The public variables become available
     * after the update() function call.
     */

    class CResizeScanline
    {
    public:
        int padl; ///< Number of padding pixels the source scanline needs
                  ///< on its left side.
        int padr; ///< Number of padding pixels the source scanline needs
                  ///< on its right side.
        CResizePos *pos; ///< One entry per destination pixel: where to read
                         ///< in the source scanline, and which filter to apply.

        CResizeScanline()
            : pos(nullptr), poslen(0), SrcLen(0)
        {
        }

        ~CResizeScanline()
        {
            delete[] pos;
        }

        /**
         * @brief Invalidates the cached positions.
         *
         * Zeroes the remembered source length, so that the next update()
         * call does not take its "nothing changed" shortcut for any non-zero
         * source length. Has to be used after the associated CResizeFilters
         * object was updated, since `pos` refers to its filters.
         */

        void reset()
        {
            SrcLen = 0;
        }

        /**
         * @brief Recalculates `padl`, `padr`, and the `pos` buffer.
         *
         * Returns immediately if called with the same `SrcLen0`, `DstLen0`,
         * and `o0` as on the previous completed call.
         *
         * @param SrcLen0 Source scanline's length, in pixels.
         * @param DstLen0 Destination scanline's length, in pixels.
         * @param o0 Source position that corresponds to the first destination
         * pixel.
         * @param rf Resizing filters object, supplies the step, the filter's
         * half-length, the element count, and the filters themselves.
         * @param sp Scanline buffer pointer to fold into `spo` values, for
         * absolute positioning; can be `nullptr`.
         */

        void update(const int SrcLen0, const int DstLen0, const double o0,
                    CResizeFilters &rf, float *const sp = nullptr)
        {
            const bool IsSame =
                    (SrcLen0 == SrcLen && DstLen0 == DstLen && o0 == o);

            if (IsSame) {
                return;
            }

            const int HalfLenM1 = rf.fl2 - 1;
            const int LeftPad = HalfLenM1 - (int)floor(o0);

            padl = (LeftPad < 0 ? 0 : LeftPad);

            // The last destination pixel's source position is evaluated once,
            // up front; both `padr` and the last `pos` entry are derived from
            // the very same values, which keeps them consistent.

            const double Step = rf.k;
            const int LastIdx = DstLen0 - 1;
            const double LastX = o0 + Step * LastIdx;
            const int LastI = (int)floor(LastX);
            const int RightPad = LastI + rf.fl2 + 1 - SrcLen0;

            padr = (RightPad < 0 ? 0 : RightPad);

            // The object stays marked as invalid until all entries are
            // written.
            SrcLen = 0;
            reallocBuf(pos, poslen, DstLen0);

            const intptr_t Base = (intptr_t)sp;
            const intptr_t ElBytes = rf.ElCount * (intptr_t)sizeof(float);
            const int Shift = padl - HalfLenM1;
            CResizePos *rp = pos;

            for (int i = 0; i < LastIdx; i++, rp++) {
                const double x = o0 + Step * i;
                const int xi = (int)floor(x);
                const intptr_t pix = Shift + xi;

                rp->flt = rf.getFilter(x - xi);
                rp->so = pix;
                rp->spo = Base + pix * ElBytes;
            }

            const intptr_t LastPix = Shift + LastI;

            rp->flt = rf.getFilter(LastX - LastI);
            rp->so = LastPix;
            rp->spo = Base + LastPix * ElBytes;

            o = o0;
            DstLen = DstLen0;
            SrcLen = SrcLen0;
        }

        /**
         * @brief Recalculates `spo` of every `pos` entry from its `so`.
         *
         * Useful when only the scanline buffer's address has changed, while
         * the positions themselves remain valid.
         *
         * @param rf Resizing filters object, supplies the element count.
         * @param sp Scanline buffer pointer to fold into `spo` values, for
         * absolute positioning; can be `nullptr`.
         */

        void updateSPO(CResizeFilters &rf, float *const sp)
        {
            const intptr_t Base = (intptr_t)sp;
            const intptr_t ElBytes = rf.ElCount * (intptr_t)sizeof(float);
            CResizePos *rp = pos;

            for (int left = DstLen; left > 0; left--, rp++) {
                rp->spo = Base + rp->so * ElBytes;
            }
        }

    protected:
        int poslen; ///< Number of entries allocated in `pos`.
        int SrcLen; ///< `SrcLen0` of the last completed update(), 0 if the
                    ///< positions are not valid.
        int DstLen; ///< `DstLen0` of the last completed update().
        double o; ///< `o0` of the last completed update().
    };

    CResizeFilters rfv; ///< Resizing filters for vertical resizing.
    CResizeFilters rfh0; ///< Resizing filters for horizontal resizing (may
                         ///< not be in use).
    CResizeScanline rsv; ///< Vertical resize scanline.
    CResizeScanline rsh; ///< Horizontal resize scanline.

    /**
     * @{
     * @brief Scanline copying function, for vertical resizing.
     *
     * Function copies scanline (fully or partially) from the source buffer,
     * in its native format, to the internal scanline buffer, in preparation
     * for vertical resizing. Variants for 1-4-channel images.
     *
     * @param ip Source scanline buffer pointer.
     * @param ipinc `ip` increment per pixel.
     * @param op Output scanline pointer.
     * @param cc Source pixel copy count.
     * @param repl Leftmost pixel's replication count.
     * @param repr Rightmost pixel's replication count.
     * @tparam T Source buffer's element type.
     */

    template<typename T>
    static void copyScanline1v(const T *ip, const size_t ipinc, float *op,
                               int cc, int repl, int repr)
    {
        // Left padding: the first source pixel, repeated.
        if (repl > 0) {
            const float first = (float)ip[0];

            for (; repl != 0; repl--) {
                *op++ = first;
            }
        }

        // The pixels themselves, converted to `float`.
        for (; cc != 0; cc--) {
            *op++ = (float)*ip;
            ip += ipinc;
        }

        // Right padding: `ip` is now one step past the last copied pixel, so
        // step back to fetch that pixel.
        if (repr > 0) {
            const float last = (float)*(ip - ipinc);

            for (; repr != 0; repr--) {
                *op++ = last;
            }
        }
    }

    template<typename T>
    static void copyScanline2v(const T *ip, const size_t ipinc, float *op,
                               int cc, int repl, int repr)
    {
        // Left padding: the first source pixel, repeated.
        if (repl > 0) {
            const float a = (float)ip[0];
            const float b = (float)ip[1];

            for (; repl != 0; repl--, op += 2) {
                op[0] = a;
                op[1] = b;
            }
        }

        // The pixels themselves, converted to `float`.
        for (; cc != 0; cc--, ip += ipinc, op += 2) {
            op[0] = (float)ip[0];
            op[1] = (float)ip[1];
        }

        // Right padding: `ip` is now one step past the last copied pixel, so
        // step back to fetch that pixel.
        if (repr > 0) {
            const T *const lastpx = ip - ipinc;
            const float a = (float)lastpx[0];
            const float b = (float)lastpx[1];

            for (; repr != 0; repr--, op += 2) {
                op[0] = a;
                op[1] = b;
            }
        }
    }

    template<typename T>
    static void copyScanline3v(const T *ip, const size_t ipinc, float *op,
                               int cc, int repl, int repr)
    {
        // Left padding: the first source pixel, repeated.
        if (repl > 0) {
            const float a = (float)ip[0];
            const float b = (float)ip[1];
            const float c = (float)ip[2];

            for (; repl != 0; repl--, op += 3) {
                op[0] = a;
                op[1] = b;
                op[2] = c;
            }
        }

        // The pixels themselves, converted to `float`.
        for (; cc != 0; cc--, ip += ipinc, op += 3) {
            op[0] = (float)ip[0];
            op[1] = (float)ip[1];
            op[2] = (float)ip[2];
        }

        // Right padding: `ip` is now one step past the last copied pixel, so
        // step back to fetch that pixel.
        if (repr > 0) {
            const T *const lastpx = ip - ipinc;
            const float a = (float)lastpx[0];
            const float b = (float)lastpx[1];
            const float c = (float)lastpx[2];

            for (; repr != 0; repr--, op += 3) {
                op[0] = a;
                op[1] = b;
                op[2] = c;
            }
        }
    }

    template<typename T>
    static void copyScanline4v(const T *ip, const size_t ipinc, float *op,
                               int cc, int repl, int repr)
    {
        // Left padding: the first source pixel, repeated.
        if (repl > 0) {
            const float a = (float)ip[0];
            const float b = (float)ip[1];
            const float c = (float)ip[2];
            const float d = (float)ip[3];

            for (; repl != 0; repl--, op += 4) {
                op[0] = a;
                op[1] = b;
                op[2] = c;
                op[3] = d;
            }
        }

        // The pixels themselves, converted to `float`.
        for (; cc != 0; cc--, ip += ipinc, op += 4) {
            op[0] = (float)ip[0];
            op[1] = (float)ip[1];
            op[2] = (float)ip[2];
            op[3] = (float)ip[3];
        }

        // Right padding: `ip` is now one step past the last copied pixel, so
        // step back to fetch that pixel.
        if (repr > 0) {
            const T *const lastpx = ip - ipinc;
            const float a = (float)lastpx[0];
            const float b = (float)lastpx[1];
            const float c = (float)lastpx[2];
            const float d = (float)lastpx[3];

            for (; repr != 0; repr--, op += 4) {
                op[0] = a;
                op[1] = b;
                op[2] = c;
                op[3] = d;
            }
        }
    }

    /** @} */

    /**
     * @{
     * @brief Scanline padding function, for horizontal resizing.
     *
     * Function pads the specified scanline buffer to the left and right by
     * replicating its first and last available pixels, in preparation for
     * horizontal resizing. Variants for 1-4-channel images.
     *
     * @param[in,out] op Scanline buffer to pad.
     * @param rs Scanline resizing positions object.
     * @param l Source scanline's length, in pixels.
     */

    static void padScanline1h(float *op, CResizeScanline &rs, const int l)
    {
        // Real pixels start right after the left padding area.
        float *const body = op + rs.padl;
        const float first = body[0];

        for (int n = rs.padl; n > 0; n--) {
            op[n - 1] = first;
        }

        // The right padding area follows the `l` real pixels; it is filled
        // with the last of them.
        float *const tail = body + l;
        const float last = tail[-1];

        for (int n = rs.padr; n > 0; n--) {
            tail[n - 1] = last;
        }
    }

    static void padScanline2h(float *op, CResizeScanline &rs, const int l)
    {
        // Real pixels start right after the left padding area.
        const float *rd = op + rs.padl * 2;
        const float a0 = rd[0];
        const float b0 = rd[1];

        for (int n = rs.padl; n > 0; n--, op += 2) {
            op[0] = a0;
            op[1] = b0;
        }

        // Skip over the real pixels; the right padding area is filled with
        // the last of them.
        const int bodylen = l * 2;
        rd += bodylen;
        op += bodylen;

        const float a1 = rd[-2];
        const float b1 = rd[-1];

        for (int n = rs.padr; n > 0; n--, op += 2) {
            op[0] = a1;
            op[1] = b1;
        }
    }

    static void padScanline3h(float *op, CResizeScanline &rs, const int l)
    {
        // Real pixels start right after the left padding area.
        const float *rd = op + rs.padl * 3;
        const float a0 = rd[0];
        const float b0 = rd[1];
        const float c0 = rd[2];

        for (int n = rs.padl; n > 0; n--, op += 3) {
            op[0] = a0;
            op[1] = b0;
            op[2] = c0;
        }

        // Skip over the real pixels; the right padding area is filled with
        // the last of them.
        const int bodylen = l * 3;
        rd += bodylen;
        op += bodylen;

        const float a1 = rd[-3];
        const float b1 = rd[-2];
        const float c1 = rd[-1];

        for (int n = rs.padr; n > 0; n--, op += 3) {
            op[0] = a1;
            op[1] = b1;
            op[2] = c1;
        }
    }

    static void padScanline4h(float *op, CResizeScanline &rs, const int l)
    {
        // Real pixels start right after the left padding area.
        const float *rd = op + rs.padl * 4;
        const float a0 = rd[0];
        const float b0 = rd[1];
        const float c0 = rd[2];
        const float d0 = rd[3];

        for (int n = rs.padl; n > 0; n--, op += 4) {
            op[0] = a0;
            op[1] = b0;
            op[2] = c0;
            op[3] = d0;
        }

        // Skip over the real pixels; the right padding area is filled with
        // the last of them.
        const int bodylen = l * 4;
        rd += bodylen;
        op += bodylen;

        const float a1 = rd[-4];
        const float b1 = rd[-3];
        const float c1 = rd[-2];
        const float d1 = rd[-1];

        for (int n = rs.padr; n > 0; n--, op += 4) {
            op[0] = a1;
            op[1] = b1;
            op[2] = c1;
            op[3] = d1;
        }
    }

    /** @} */

    /**
     * @brief Rounds a value, and applies clamping.
     *
     * @param v Value to round and clamp.
     * @param Clamp High clamp level; low level is 0.
     * @return Rounded and clamped value.
     */

    static inline int roundclamp(const float v, const float Clamp)
    {
        float c = v;

        if (v > Clamp) {
            c = Clamp;
        } else if (v < 0.0f) {
            c = 0.0f;
        }

        // The value is non-negative at this point, so adding 0.5 before the
        // truncating conversion rounds it to the nearest integer.
        return ((int)(c + 0.5f));
    }

    /**
     * @brief Scanline output function.
     *
     * Function performs output of the scanline pixels to the destination
     * image buffer, with type conversion, scaling, clamping, if necessary.
     *
     * @param[in] ip Input (resized) scanline. Pointer must be aligned to
     * LANCIR_ALIGN bytes.
     * @param[out] op Output image buffer. Must be different to `ip`.
     * @param l Output scanline's length, in elements (not pixel count).
     * @param Clamp Clamp high level, used if `IsOutFloat` is `false`.
     * @param OutMul Output multiplier, for value range conversion, applied
     * before clamping.
     * @tparam IsOutFloat `true`, if floating-point output, and no clamping is
     * necessary.
     * @tparam IsUnityMul `true`, if multiplication is optional. However, even
     * if this parameter was specified as `true`, `OutMul` must be 1.
     * @tparam T Output buffer's element type. Acquired implicitly.
     */

    template<bool IsOutFloat, bool IsUnityMul, typename T>
    static void outputScanline(const float *ip, T *op, int l,
                               const float Clamp, const float OutMul)
    {
        if (IsOutFloat) {
            // Floating-point output: no rounding, no clamping.
            if (IsUnityMul) {
                if (sizeof(op[0]) == sizeof(ip[0])) {
                    // Same element size as the source: a plain copy.
                    memcpy(op, ip, (size_t)l * sizeof(op[0]));
                } else {
                    // Element-wise conversion, four elements per iteration,
                    // then the remaining 0-3 elements.
                    int l4 = l >> 2;
                    l &= 3;

                    while (l4 != 0) {
                        op[0] = (T)ip[0];
                        op[1] = (T)ip[1];
                        op[2] = (T)ip[2];
                        op[3] = (T)ip[3];
                        ip += 4;
                        op += 4;
                        l4--;
                    }

                    while (l != 0) {
                        *op = (T)*ip;
                        ip++;
                        op++;
                        l--;
                    }
                }
            } else {
                int l4 = l >> 2;
                l &= 3;
                bool DoScalar = true;

                if (sizeof(op[0]) == sizeof(ip[0])) {
#if LANCIR_ALIGN > 4

                    // SIMD multiply; the store is unaligned, since `op` has
                    // no alignment guarantee.
                    DoScalar = false;
                    const lancvec_t om = lancvec_load32_splat(&OutMul);

                    while (l4 != 0) {
                        lancvec_storeu((float *)op,
                                       lancvec_mul(lancvec_load(ip), om));

                        ip += 4;
                        op += 4;
                        l4--;
                    }

#endif // LANCIR_ALIGN > 4
                }

                if (DoScalar) {
                    while (l4 != 0) {
                        op[0] = (T)(ip[0] * OutMul);
                        op[1] = (T)(ip[1] * OutMul);
                        op[2] = (T)(ip[2] * OutMul);
                        op[3] = (T)(ip[3] * OutMul);
                        ip += 4;
                        op += 4;
                        l4--;
                    }
                }

                // Remaining 0-3 elements.
                while (l != 0) {
                    *op = (T)(*ip * OutMul);
                    ip++;
                    op++;
                    l--;
                }
            }
        } else {
            // Integer output: optional scaling, clamping to [0; Clamp], and
            // rounding. Groups of four elements are processed first, the
            // remaining 0-3 elements are handled by the scalar code at the
            // end.
            int l4 = l >> 2;
            l &= 3;

#if LANCIR_ALIGN > 4

            const lancvec_t minv = lancvec_const_splat(0.0f);
            const lancvec_t maxv = lancvec_load32_splat(&Clamp);
            const lancvec_t om = lancvec_load32_splat(&OutMul);

#if defined(LANCIR_SSE2)
            // `_mm_cvtps_epi32` rounds according to the current rounding
            // mode: round-to-nearest is enforced for the duration of the
            // loops, and the previous mode is restored afterwards.
            unsigned int prevrm = _MM_GET_ROUNDING_MODE();
            _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#else // defined( LANCIR_SSE2 )
            // Other targets add 0.5 before conversion to an integer.
            const lancvec_t v05 = lancvec_const_splat(0.5f);
#endif // defined( LANCIR_SSE2 )

            if (sizeof(op[0]) == 4) {
                // 32-bit output elements.
                while (l4 != 0) {
                    const lancvec_t v = lancvec_load(ip);
                    const lancvec_t cv = lancvec_max(lancvec_min(
                                                             (IsUnityMul ? v : lancvec_mul(v, om)),
                                                             maxv),
                                                     minv);

#if defined(LANCIR_SSE2)

                    _mm_storeu_si128((__m128i *)op, _mm_cvtps_epi32(cv));

#elif defined(LANCIR_NEON)

                    vst1q_u32((unsigned int *)op, vcvtq_u32_f32(vaddq_f32(cv, v05)));

#elif defined(LANCIR_WASM)

                    wasm_v128_store(op, wasm_i32x4_trunc_sat_f32x4(wasm_f32x4_add(cv, v05)));

#endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            } else if (sizeof(op[0]) == 2) {
                // 16-bit output elements: four 32-bit results are narrowed
                // to 8 bytes.
                while (l4 != 0) {
                    const lancvec_t v = lancvec_load(ip);
                    const lancvec_t cv = lancvec_max(lancvec_min(
                                                             (IsUnityMul ? v : lancvec_mul(v, om)),
                                                             maxv),
                                                     minv);

#if defined(LANCIR_SSE2)

                    const __m128i v32 = _mm_cvtps_epi32(cv);
                    const __m128i v16s = _mm_shufflehi_epi16(
                            _mm_shufflelo_epi16(v32, 0 | 2 << 2), 0 | 2 << 2);

                    const __m128i v16 = _mm_shuffle_epi32(v16s, 0 | 2 << 2);

                    __m128i tmp;
                    _mm_store_si128(&tmp, v16);
                    memcpy(op, &tmp, 8);

#elif defined(LANCIR_NEON)

                    const uint32x4_t v32 = vcvtq_u32_f32(
                            vaddq_f32(cv, v05));

                    const uint16x4_t v16 = vmovn_u32(v32);

                    vst1_u16((unsigned short *)op, v16);

#elif defined(LANCIR_WASM)

                    const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                            wasm_f32x4_add(cv, v05));

                    wasm_v128_store64_lane(op,
                                           wasm_u16x8_narrow_i32x4(v32, v32), 0);

#endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            } else {
                // Remaining case (8-bit output elements): four 32-bit
                // results are narrowed to 4 bytes.
                while (l4 != 0) {
                    const lancvec_t v = lancvec_load(ip);
                    const lancvec_t cv = lancvec_max(lancvec_min(
                                                             (IsUnityMul ? v : lancvec_mul(v, om)),
                                                             maxv),
                                                     minv);

#if defined(LANCIR_SSE2)

                    const __m128i v32 = _mm_cvtps_epi32(cv);
                    const __m128i v16s = _mm_shufflehi_epi16(
                            _mm_shufflelo_epi16(v32, 0 | 2 << 2), 0 | 2 << 2);

                    const __m128i v16 = _mm_shuffle_epi32(v16s, 0 | 2 << 2);
                    const __m128i v8 = _mm_packus_epi16(v16, v16);

                    // `op` carries no 4-byte alignment guarantee: the four
                    // packed bytes are copied, instead of being stored
                    // through a casted `int` pointer.
                    const int px4 = _mm_cvtsi128_si32(v8);
                    memcpy(op, &px4, sizeof(px4));

#elif defined(LANCIR_NEON)

                    const uint32x4_t v32 = vcvtq_u32_f32(
                            vaddq_f32(cv, v05));

                    const uint16x4_t v16 = vmovn_u32(v32);
                    const uint8x8_t v8 = vmovn_u16(vcombine_u16(v16, v16));

                    // Same as in the SSE2 variant: a copy, instead of
                    // a store through a casted pointer.
                    const unsigned int px4 =
                            vget_lane_u32((uint32x2_t)v8, 0);

                    memcpy(op, &px4, sizeof(px4));

#elif defined(LANCIR_WASM)

                    const v128_t v32 = wasm_i32x4_trunc_sat_f32x4(
                            wasm_f32x4_add(cv, v05));

                    const v128_t v16 = wasm_u16x8_narrow_i32x4(v32, v32);

                    wasm_v128_store32_lane(op,
                                           wasm_u8x16_narrow_i16x8(v16, v16), 0);

#endif // defined( LANCIR_WASM )

                    ip += 4;
                    op += 4;
                    l4--;
                }
            }

#if defined(LANCIR_SSE2)
            _MM_SET_ROUNDING_MODE(prevrm);
#endif // defined( LANCIR_SSE2 )

#else // LANCIR_ALIGN > 4

            if (IsUnityMul) {
                while (l4 != 0) {
                    op[0] = (T)roundclamp(ip[0], Clamp);
                    op[1] = (T)roundclamp(ip[1], Clamp);
                    op[2] = (T)roundclamp(ip[2], Clamp);
                    op[3] = (T)roundclamp(ip[3], Clamp);
                    ip += 4;
                    op += 4;
                    l4--;
                }
            } else {
                while (l4 != 0) {
                    op[0] = (T)roundclamp(ip[0] * OutMul, Clamp);
                    op[1] = (T)roundclamp(ip[1] * OutMul, Clamp);
                    op[2] = (T)roundclamp(ip[2] * OutMul, Clamp);
                    op[3] = (T)roundclamp(ip[3] * OutMul, Clamp);
                    ip += 4;
                    op += 4;
                    l4--;
                }
            }

#endif // LANCIR_ALIGN > 4

            // Remaining 0-3 elements, scalar in all build variants.
            if (IsUnityMul) {
                while (l != 0) {
                    *op = (T)roundclamp(*ip, Clamp);
                    ip++;
                    op++;
                    l--;
                }
            } else {
                while (l != 0) {
                    *op = (T)roundclamp(*ip * OutMul, Clamp);
                    ip++;
                    op++;
                    l--;
                }
            }
        }
    }

    /**
     * @def LANCIR_LF_PRE
     * @brief Scanline resize function prologue.
     */

// Opens a `while` loop that walks `rp` up to `rp + DstLen` and, for each
// position, binds `flt` to the position's filter and `ip` to its source data:
// `sp + spo` if `UseSP` is set, otherwise `spo` taken as an absolute address.
// Expects `rp`, `DstLen`, `sp`, and `UseSP` to be in scope at the point of
// use. The loop's body is deliberately left open, it has to be closed with
// LANCIR_LF_POST.
#define LANCIR_LF_PRE                                     \
    const CResizePos *const rpe = rp + DstLen;            \
    while (rp != rpe) {                                   \
        const float *flt = rp->flt;                       \
        const float *ip;                                  \
        if (UseSP) {                                      \
            ip = (const float *)((intptr_t)sp + rp->spo); \
        } else {                                          \
            ip = (const float *)rp->spo;                  \
        }

    /**
     * @def LANCIR_LF_POST
     * @brief Scanline resize function epilogue.
     *
     * Advances the output pointer `op` by `opinc`, moves `rp` to the next
     * position, and closes the `while` block that LANCIR_LF_PRE opened.
     * Expects `op`, `opinc`, and `rp` to be in scope.
     */

#define LANCIR_LF_POST \
    op += opinc;       \
    rp++;              \
    }

    /**
     * @{
     * @brief Function performs scanline resizing. Variants for 1-4-channel
     * images.
     *
     * @param[in] sp Source scanline buffer.
     * @param[out] op Destination buffer.
     * @param opinc `op` increment.
     * @param rp Source scanline offsets and resizing filters.
     * @param kl Filter kernel's length, in taps (always an even value).
     * @param DstLen Destination length, in pixels.
     * @tparam UseSP `true`, if `sp` pointer should be added to `spo`.
     */

    // Single-channel variant. For every destination position the output
    // value is the sum of products of the position's filter taps (`flt`) and
    // the same number of consecutive source floats (`ip`). Two copies of the
    // per-position loop exist so that the "kernel length is a multiple of 4"
    // test is made once, outside the loop.
    template<bool UseSP>
    static void resize1(const float *const sp, float *op, const size_t opinc,
                        const CResizePos *rp, const int kl, const int DstLen)
    {
        // Number of complete 4-tap groups in the kernel.
        const int ci = kl >> 2;

        if ((kl & 3) == 0) {
            // Kernel length is a multiple of 4: no leftover taps.
            LANCIR_LF_PRE

            int c = ci;

#if LANCIR_ALIGN > 4

            // The first 4-tap group seeds the accumulator. Filter taps use
            // the aligned load, source samples the unaligned one.
            lancvec_t sum = lancvec_mul(
                    lancvec_load(flt), lancvec_loadu(ip));

            // Remaining ci - 1 groups. The pre-decrement test assumes
            // ci >= 1 (kl >= 4) - NOTE(review): confirm callers guarantee it.
            while (--c != 0) {
                flt += 4;
                ip += 4;
                sum = lancvec_madd(sum, lancvec_load(flt),
                                   lancvec_loadu(ip));
            }

            // Presumably writes the sum of all lanes as a single float; the
            // macro is defined outside this section - confirm.
            lancvec_store32_hadd(op, sum);

#else // LANCIR_ALIGN > 4

            // Scalar path: four independent partial sums, one per tap
            // position within a group.
            float sum0 = flt[0] * ip[0];
            float sum1 = flt[1] * ip[1];
            float sum2 = flt[2] * ip[2];
            float sum3 = flt[3] * ip[3];

            // Remaining ci - 1 groups (same ci >= 1 assumption as above).
            while (--c != 0) {
                flt += 4;
                ip += 4;
                sum0 += flt[0] * ip[0];
                sum1 += flt[1] * ip[1];
                sum2 += flt[2] * ip[2];
                sum3 += flt[3] * ip[3];
            }

            // Pairwise reduction of the four partial sums.
            op[0] = (sum0 + sum1) + (sum2 + sum3);

#endif // LANCIR_ALIGN > 4

            LANCIR_LF_POST
        } else {
            // Kernel length is not a multiple of 4. Since `kl` is documented
            // as always even, two taps remain after the 4-tap groups; they
            // sit at flt[4], flt[5] relative to the last processed group.
            LANCIR_LF_PRE

            int c = ci;

#if LANCIR_ALIGN > 4

            // First 4-tap group seeds the accumulator.
            lancvec_t sum = lancvec_mul(lancvec_load(flt),
                                        lancvec_loadu(ip));

            // Remaining ci - 1 groups; assumes ci >= 1 (kl >= 6 in this
            // branch) - NOTE(review): confirm callers guarantee it.
            while (--c != 0) {
                flt += 4;
                ip += 4;
                sum = lancvec_madd(sum, lancvec_load(flt),
                                   lancvec_loadu(ip));
            }

#if defined(LANCIR_NEON)

            // Fold the four accumulator lanes into two.
            float32x2_t sum2 = vadd_f32(vget_high_f32(sum),
                                        vget_low_f32(sum));

            // Multiply-accumulate the two leftover taps.
            sum2 = vmla_f32(sum2, vld1_f32(flt + 4),
                            vld1_f32(ip + 4));

            // Final two-lane horizontal add; the 32-bit ARM build extracts
            // the lanes explicitly instead of using vaddv_f32().
#if defined(LANCIR_ARM32)
            op[0] = vget_lane_f32(sum2, 0) +
                    vget_lane_f32(sum2, 1);
#else // defined( LANCIR_ARM32 )
            op[0] = vaddv_f32(sum2);
#endif // defined( LANCIR_ARM32 )

#else // defined( LANCIR_NEON )

            // Unaligned load at offset 2 covers elements 2..5, so the two
            // leftover tap products (elements 4, 5) occupy the upper half of
            // `sum2`; the lower half repeats products already accumulated.
            const lancvec_t sum2 = lancvec_mul(lancvec_loadu(flt + 2),
                                               lancvec_loadu(ip + 2));

            // Presumably lancvec_addhl(a, b) adds the high half of `b` onto
            // the low half of `a` (macro defined outside this section -
            // confirm): first fold `sum` onto itself, then add only the
            // upper (leftover-tap) half of `sum2`.
            sum = lancvec_addhl(sum, sum);
            sum = lancvec_addhl(sum, sum2);

            // Presumably reduces the remaining low pair to one float and
            // stores it - confirm against the macro definition.
            lancvec_store32_addhl(op, sum);

#endif // defined( LANCIR_NEON )

#else // LANCIR_ALIGN > 4

            // Scalar path: four independent partial sums for the 4-tap
            // groups.
            float sum0 = flt[0] * ip[0];
            float sum1 = flt[1] * ip[1];
            float sum2 = flt[2] * ip[2];
            float sum3 = flt[3] * ip[3];

            // Remaining ci - 1 groups (same ci >= 1 assumption as above).
            while (--c != 0) {
                flt += 4;
                ip += 4;
                sum0 += flt[0] * ip[0];
                sum1 += flt[1] * ip[1];
                sum2 += flt[2] * ip[2];
                sum3 += flt[3] * ip[3];
            }

            // Reduce the partial sums and add the two leftover taps.
            op[0] = (sum0 + sum1) + (sum2 + sum3) +
                    flt[4] * ip[4] + flt[5] * ip[5];

#endif // LANCIR_ALIGN > 4

            LANCIR_LF_POST
        }
    }

    template<bool UseSP>
    static void resize2(const float *const sp, float *op, const size_t opinc,
                        const CResizePos *rp, const int kl, const int DstLen)
    {
#if LANCIR_ALIGN > 4
        const int ci = kl >> 2;
        const int cir = kl & 3;
#else // LANCIR_ALIGN > 4
        const int ci = kl >> 1;
#endif // LANCIR_ALIGN > 4

        LANCIR_LF_PRE

        int c = ci;

#if defined(LANCIR_AVX)

        __m256 sum = _mm256_mul_ps(_mm256_load_ps(flt),
                                   _mm256_loadu_ps(ip));

        while (--c != 0) {
            flt += 8;
            ip += 8;
            sum = _mm256_add_ps(sum, _mm256_mul_ps(_mm256_load_ps(flt), _mm256_loadu_ps(ip)));
        }

        __m128 res = _mm_add_ps(_mm256_extractf128_ps(sum, 0),
                                _mm256_extractf128_ps(sum, 1));

        if (cir == 2) {
            res = _mm_add_ps(res, _mm_mul_ps(_mm_load_ps(flt + 8), _mm_loadu_ps(ip + 8)));
        }

        _mm_storel_pi((__m64 *)op,
                      _mm_add_ps(res, _mm_movehl_ps(res, res)));

#elif LANCIR_ALIGN > 4

        lancvec_t sumA = lancvec_mul(
                lancvec_load(flt), lancvec_loadu(ip));

        lancvec_t sumB = lancvec_mul(
                lancvec_load(flt + 4), lancvec_loadu(ip + 4));

        while (--c != 0) {
            flt += 8;
            ip += 8;
            sumA = lancvec_madd(sumA, lancvec_load(flt),
                                lancvec_loadu(ip));

            sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                                lancvec_loadu(ip + 4));
        }

        sumA = lancvec_add(sumA, sumB);

        if (cir == 2) {
            sumA = lancvec_madd(sumA, lancvec_load(flt + 8),
                                lancvec_loadu(ip + 8));
        }

        lancvec_store64_addhl(op, sumA);

#else // LANCIR_ALIGN > 4

        const float xx = flt[0];
        const float xx2 = flt[1];
        float sum0 = xx * ip[0];
        float sum1 = xx * ip[1];
        float sum2 = xx2 * ip[2];
        float sum3 = xx2 * ip[3];

        while (--c != 0) {
            flt += 2;
            ip += 4;
            const float xx = flt[0];
            const float xx2 = flt[1];
            sum0 += xx * ip[0];
            sum1 += xx * ip[1];
            sum2 += xx2 * ip[2];
            sum3 += xx2 * ip[3];
        }

        op[0] = sum0 + sum2;
        op[1] = sum1 + sum3;

#endif // LANCIR_ALIGN > 4

        LANCIR_LF_POST
    }

    // Three-channel variant: source samples are read as interleaved channel
    // triplets and three output floats are written per destination position.
    template<bool UseSP>
    static void resize3(const float *const sp, float *op, const size_t opinc,
                        const CResizePos *rp, const int kl, const int DstLen)
    {
#if LANCIR_ALIGN > 4

        // Vector builds consume 12 floats (4 taps x 3 channels) per step;
        // `flt` advances in step with `ip`, i.e. 3 filter floats per tap.
        // `cir` is the number of taps left over after the full steps.
        const int ci = kl >> 2;
        const int cir = kl & 3;

        LANCIR_LF_PRE

        // Scratch buffer the 12 accumulator lanes are spilled into; lane i
        // belongs to channel (i % 3), as the reductions below show.
        float res[12];
        int c = ci;

#if defined(LANCIR_AVX)

        // Lanes 0..3 live in a 128-bit accumulator, lanes 4..11 in a 256-bit
        // one. The 256-bit filter load at `flt + 4` is done unaligned.
        __m128 sumA = _mm_mul_ps(_mm_load_ps(flt), _mm_loadu_ps(ip));
        __m256 sumB = _mm256_mul_ps(_mm256_loadu_ps(flt + 4),
                                    _mm256_loadu_ps(ip + 4));

        // Remaining ci - 1 steps; the pre-decrement test assumes ci >= 1
        // (kl >= 4) - NOTE(review): confirm callers guarantee it.
        while (--c != 0) {
            flt += 12;
            ip += 12;
            sumA = _mm_add_ps(sumA, _mm_mul_ps(_mm_load_ps(flt), _mm_loadu_ps(ip)));

            sumB = _mm256_add_ps(sumB, _mm256_mul_ps(_mm256_loadu_ps(flt + 4), _mm256_loadu_ps(ip + 4)));
        }

        // Two leftover taps span floats 12..17 past the last step. The first
        // four (channels 0, 1, 2, 0) line up with the lanes of `sumA`; the
        // last two are added as scalars further below.
        if (cir == 2) {
            sumA = _mm_add_ps(sumA, _mm_mul_ps(_mm_load_ps(flt + 12), _mm_loadu_ps(ip + 12)));
        }

        _mm_storeu_ps(res, sumA);

        // Lanes 0 and 3 are channel 0; lanes 1 and 2 are channels 1 and 2.
        float o0 = res[0] + res[3];
        float o1 = res[1];
        float o2 = res[2];

        _mm256_storeu_ps(res + 4, sumB);

        // Lanes 4 and 5 (channels 1 and 2); lanes 6..11 are added by the
        // code shared with the non-AVX path.
        o1 += res[4];
        o2 += res[5];

#else // defined( LANCIR_AVX )

        // Three 4-float accumulators cover the 12 floats of each step.
        lancvec_t sumA = lancvec_mul(lancvec_load(flt),
                                     lancvec_loadu(ip));

        lancvec_t sumB = lancvec_mul(lancvec_load(flt + 4),
                                     lancvec_loadu(ip + 4));

        lancvec_t sumC = lancvec_mul(lancvec_load(flt + 8),
                                     lancvec_loadu(ip + 8));

        // Remaining ci - 1 steps; the pre-decrement test assumes ci >= 1
        // (kl >= 4) - NOTE(review): confirm callers guarantee it.
        while (--c != 0) {
            flt += 12;
            ip += 12;
            sumA = lancvec_madd(sumA, lancvec_load(flt),
                                lancvec_loadu(ip));

            sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                                lancvec_loadu(ip + 4));

            sumC = lancvec_madd(sumC, lancvec_load(flt + 8),
                                lancvec_loadu(ip + 8));
        }

        // Two leftover taps span floats 12..17 past the last step. The first
        // four (channels 0, 1, 2, 0) line up with the lanes of `sumA`; the
        // last two are added as scalars further below.
        if (cir == 2) {
            sumA = lancvec_madd(sumA, lancvec_load(flt + 12),
                                lancvec_loadu(ip + 12));
        }

        lancvec_storeu(res, sumA);
        lancvec_storeu(res + 4, sumB);

        // Per-channel reduction of lanes 0..5.
        float o0 = res[0] + res[3];
        float o1 = res[1] + res[4];
        float o2 = res[2] + res[5];

        lancvec_storeu(res + 8, sumC);

#endif // defined( LANCIR_AVX )

        // Per-channel reduction of lanes 6..11 (shared by both vector paths).
        o0 += res[6] + res[9];
        o1 += res[7] + res[10];
        o2 += res[8] + res[11];

        // Last two floats of the leftover taps: offsets 16 and 17 belong to
        // channels 1 and 2 respectively.
        if (cir == 2) {
            o1 += flt[16] * ip[16];
            o2 += flt[17] * ip[17];
        }

        op[0] = o0;
        op[1] = o1;
        op[2] = o2;

#else // LANCIR_ALIGN > 4

        // Scalar build: 2 taps (6 source floats) per step, one filter float
        // per tap.
        const int ci = kl >> 1;

        LANCIR_LF_PRE

        int c = ci;

        // sum0..sum2 accumulate the first tap of each step for channels
        // 0..2, sum3..sum5 the second tap.
        const float xx = flt[0];
        float sum0 = xx * ip[0];
        float sum1 = xx * ip[1];
        float sum2 = xx * ip[2];
        const float xx2 = flt[1];
        float sum3 = xx2 * ip[3];
        float sum4 = xx2 * ip[4];
        float sum5 = xx2 * ip[5];

        // Remaining ci - 1 steps; assumes ci >= 1 (kl >= 2) - NOTE(review):
        // confirm. The loop-local `xx`/`xx2` shadow the outer ones.
        while (--c != 0) {
            flt += 2;
            ip += 6;
            const float xx = flt[0];
            sum0 += xx * ip[0];
            sum1 += xx * ip[1];
            sum2 += xx * ip[2];
            const float xx2 = flt[1];
            sum3 += xx2 * ip[3];
            sum4 += xx2 * ip[4];
            sum5 += xx2 * ip[5];
        }

        // Combine the two per-tap partial sums of each channel.
        op[0] = sum0 + sum3;
        op[1] = sum1 + sum4;
        op[2] = sum2 + sum5;

#endif // LANCIR_ALIGN > 4

        LANCIR_LF_POST
    }

    template<bool UseSP>
    static void resize4(const float *const sp, float *op, const size_t opinc,
                        const CResizePos *rp, const int kl, const int DstLen)
    {
#if LANCIR_ALIGN > 4
        const int ci = kl >> 1;
#else // LANCIR_ALIGN > 4
        const int ci = kl;
#endif // LANCIR_ALIGN > 4

        LANCIR_LF_PRE

        int c = ci;

#if defined(LANCIR_AVX)

        __m256 sum = _mm256_mul_ps(_mm256_load_ps(flt),
                                   _mm256_loadu_ps(ip));

        while (--c != 0) {
            flt += 8;
            ip += 8;
            sum = _mm256_add_ps(sum, _mm256_mul_ps(_mm256_load_ps(flt), _mm256_loadu_ps(ip)));
        }

        _mm_store_ps(op, _mm_add_ps(_mm256_extractf128_ps(sum, 0), _mm256_extractf128_ps(sum, 1)));

#elif LANCIR_ALIGN > 4

        lancvec_t sumA = lancvec_mul(lancvec_load(flt),
                                     lancvec_load(ip));

        lancvec_t sumB = lancvec_mul(lancvec_load(flt + 4),
                                     lancvec_load(ip + 4));

        while (--c != 0) {
            flt += 8;
            ip += 8;
            sumA = lancvec_madd(sumA, lancvec_load(flt),
                                lancvec_load(ip));

            sumB = lancvec_madd(sumB, lancvec_load(flt + 4),
                                lancvec_load(ip + 4));
        }

        lancvec_store(op, lancvec_add(sumA, sumB));

#else // LANCIR_ALIGN > 4

        const float xx = flt[0];
        float sum0 = xx * ip[0];
        float sum1 = xx * ip[1];
        float sum2 = xx * ip[2];
        float sum3 = xx * ip[3];

        while (--c != 0) {
            flt++;
            ip += 4;
            const float xx = flt[0];
            sum0 += xx * ip[0];
            sum1 += xx * ip[1];
            sum2 += xx * ip[2];
            sum3 += xx * ip[3];
        }

        op[0] = sum0;
        op[1] = sum1;
        op[2] = sum2;
        op[3] = sum3;

#endif // LANCIR_ALIGN > 4

        LANCIR_LF_POST
    }

    /** @} */

// The scanline prologue/epilogue macros expand to an unbalanced loop and
// depend on local names, so they are undefined here, right after their last
// use.
#undef LANCIR_LF_PRE
#undef LANCIR_LF_POST
};

// Undefine the vector helper macros used by the code above so that they are
// not visible to code that includes this header.
#undef lancvec_t
#undef lancvec_const_splat
#undef lancvec_load32_splat
#undef lancvec_load
#undef lancvec_loadu
#undef lancvec_store
#undef lancvec_storeu
#undef lancvec_add
#undef lancvec_mul
#undef lancvec_min
#undef lancvec_max
#undef lancvec_madd
#undef lancvec_addhl
#undef lancvec_store32_addhl
#undef lancvec_store32_hadd
#undef lancvec_store64_addhl

// Undefine `nullptr` only when LANCIR_NULLPTR is set - presumably the marker
// that this header defined `nullptr` itself as a macro earlier on; confirm
// against the definition near the top of the file.
#if defined(LANCIR_NULLPTR)
#undef nullptr
#undef LANCIR_NULLPTR
#endif // defined( LANCIR_NULLPTR )

} // namespace avir

#endif // AVIR_CLANCIR_INCLUDED