// yacreader/image_processing/resize_image.cpp

#include "resize_image.h"

#include "qmath.h"
#include <QtConcurrent>
#include <QImage>
#include <QColor>

#include <algorithm>
#include <cmath>
#include <cstring>

// Forward declarations of the per-method scalers defined further down in this
// file, so that smartScalePixmap() and scalePixmap() can call them before their
// definitions appear.
QPixmap scalePixmapBicubic(const QPixmap &pixmap, int width, int height);
QPixmap scalePixmapLanczos(const QPixmap &pixmap, int width, int height);
QPixmap scalePixmapArea(const QPixmap &pixmap, int width, int height);
// `a` is the Lanczos kernel parameter forwarded to scaleImageLanczos(); it
// defaults to 3 when omitted.
QPixmap scalePixmapLanczosQt(const QPixmap &pixmap, int targetWidth, int targetHeight, int a = 3);

// Scales `pixmap` to `width` x `height` only when that is an upscale.
//
// A null pixmap, or one that already has the requested size, is returned
// unchanged. When the source is no larger than the target in both dimensions
// it is resampled with the Lanczos scaler. In every remaining case (the source
// exceeds the target in at least one dimension) the pixmap is returned as is.
QPixmap smartScalePixmap(const QPixmap &pixmap, int width, int height)
{
    if (pixmap.isNull()) {
        return pixmap;
    }

    const int srcWidth = pixmap.width();
    const int srcHeight = pixmap.height();

    const bool alreadyTargetSize = (srcWidth == width) && (srcHeight == height);
    const bool fitsInsideTarget = (srcWidth <= width) && (srcHeight <= height);

    // Resample only for a genuine upscale; anything else keeps the original.
    if (!alreadyTargetSize && fitsInsideTarget) {
        return scalePixmapLanczos(pixmap, width, height);
    }

    return pixmap;
}

// Scales `pixmap` to `width` x `height` using the requested `method`.
//
// A null pixmap, or one that already has the requested size, is returned
// unchanged. The Qt-based branches (and the Bicubic/Area helpers, which
// currently forward to Qt's smooth scaling) keep the aspect ratio, whereas the
// Lanczos branch produces an image of exactly `width` x `height`.
//
// An unrecognised `method` value falls back to Qt's smooth scaling instead of
// running off the end of the function, which would be undefined behaviour.
QPixmap scalePixmap(const QPixmap &pixmap, int width, int height, ScaleMethod method)
{
    // Nothing to resample; mirrors the guard in smartScalePixmap().
    if (pixmap.isNull()) {
        return pixmap;
    }

    const int w = pixmap.width();
    const int h = pixmap.height();
    if (w == width && h == height) {
        return pixmap;
    }

    switch (method) {
    case ScaleMethod::QtFast:
        return pixmap.scaled(width, height, Qt::KeepAspectRatio, Qt::FastTransformation);
    case ScaleMethod::QtSmooth:
        return pixmap.scaled(width, height, Qt::KeepAspectRatio, Qt::SmoothTransformation);
    case ScaleMethod::Bicubic:
        return scalePixmapBicubic(pixmap, width, height);
    case ScaleMethod::Lanczos:
        return scalePixmapLanczos(pixmap, width, height);
    case ScaleMethod::Area:
        return scalePixmapArea(pixmap, width, height);
    }

    // Reached only when `method` holds a value outside the handled enumerators.
    return pixmap.scaled(width, height, Qt::KeepAspectRatio, Qt::SmoothTransformation);
}

// Placeholder for a bicubic scaler: until one is implemented this forwards to
// Qt's smooth transformation, keeping the aspect ratio.
QPixmap scalePixmapBicubic(const QPixmap &pixmap, int width, int height)
{
    // TODO: implement
    const QSize requestedSize(width, height);
    return pixmap.scaled(requestedSize, Qt::KeepAspectRatio, Qt::SmoothTransformation);
}

// Lanczos scaling entry point; delegates to scalePixmapLanczosQt() and leaves
// the kernel parameter at that function's declared default.
QPixmap scalePixmapLanczos(const QPixmap &pixmap, int width, int height)
{
    QPixmap resampled = scalePixmapLanczosQt(pixmap, width, height);
    return resampled;
}

// Placeholder for an area-averaging scaler: until one is implemented this
// forwards to Qt's smooth transformation, keeping the aspect ratio.
QPixmap scalePixmapArea(const QPixmap &pixmap, int width, int height)
{
    // TODO: implement
    const QSize requestedSize(width, height);
    return pixmap.scaled(requestedSize, Qt::KeepAspectRatio, Qt::SmoothTransformation);
}

// Platform-specific SIMD includes and checks
#if defined(__AVX__) || defined(__AVX2__)
#include <immintrin.h> // For x86 SSE/AVX
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h> // For ARM NEON
#else
#warning "No SIMD instructions detected, falling back to scalar implementation."
#endif

// Define SIMD intrinsics for different platforms
#if defined(__AVX__) || defined(__AVX2__)

// Wraps every lane of x (an angle in radians) into the interval (-PI, PI],
// up to floating-point rounding.
__m256d normalize_angle(__m256d x)
{
    const __m256d full_turn = _mm256_set1_pd(2 * M_PI);
    const __m256d half_turn = _mm256_set1_pd(M_PI);

    // Whole number of turns at or below x / (2*PI).
    const __m256d turns = _mm256_floor_pd(_mm256_div_pd(x, full_turn));

    // Removing those turns leaves a value in [0, 2*PI).
    const __m256d wrapped = _mm256_sub_pd(x, _mm256_mul_pd(turns, full_turn));

    // Lanes that ended up above PI are moved down by one full turn; the
    // comparison mask selects 2*PI for those lanes and 0 for the others.
    const __m256d above_half_turn = _mm256_cmp_pd(wrapped, half_turn, _CMP_GT_OS);
    return _mm256_sub_pd(wrapped, _mm256_and_pd(above_half_turn, full_turn));
}

// Lane-wise polynomial approximation of sin(x) for a __m256d.
//
// The argument is first wrapped into (-PI, PI] by normalize_angle() and then
// folded into [-PI/2, PI/2] using sin(x) = sin(PI - x) = sin(-PI - x). The odd
// Taylor polynomial up to x^11 is evaluated on that reduced range, where its
// truncation error stays below 1e-7.
//
// Evaluating a degree-7 series directly on (-PI, PI] (as was done before) is
// far less accurate near the ends of the range: at x = PI it yields roughly
// -0.075 instead of 0, which leaks into the Lanczos weights at integer
// sample offsets.
__m256d sin_pd_approx(__m256d x)
{
    x = normalize_angle(x); // Normalize x to the range (-PI, PI]

    // Fold the outer quarters of the range back onto [-PI/2, PI/2].
    const __m256d half_pi = _mm256_set1_pd(M_PI / 2);
    const __m256d neg_half_pi = _mm256_set1_pd(-M_PI / 2);
    const __m256d above = _mm256_cmp_pd(x, half_pi, _CMP_GT_OQ);
    const __m256d below = _mm256_cmp_pd(x, neg_half_pi, _CMP_LT_OQ);
    // Both candidates are computed from the unfolded x; the two masks are
    // mutually exclusive, so the blends below never interfere.
    const __m256d folded_above = _mm256_sub_pd(_mm256_set1_pd(M_PI), x); // PI - x
    const __m256d folded_below = _mm256_sub_pd(_mm256_set1_pd(-M_PI), x); // -PI - x
    x = _mm256_blendv_pd(x, folded_above, above);
    x = _mm256_blendv_pd(x, folded_below, below);

    // Taylor coefficients -1/3!, 1/5!, -1/7!, 1/9!, -1/11!
    const __m256d c3 = _mm256_set1_pd(-0.16666666666666666);
    const __m256d c5 = _mm256_set1_pd(0.008333333333333333);
    const __m256d c7 = _mm256_set1_pd(-0.0001984126984126984);
    const __m256d c9 = _mm256_set1_pd(2.7557319223985893e-06);
    const __m256d c11 = _mm256_set1_pd(-2.505210838544172e-08);

    // Horner evaluation in x^2:
    // sin(x) ~= x + x^3 * (c3 + x^2 * (c5 + x^2 * (c7 + x^2 * (c9 + x^2 * c11))))
    const __m256d x2 = _mm256_mul_pd(x, x);
    __m256d poly = _mm256_add_pd(c9, _mm256_mul_pd(c11, x2));
    poly = _mm256_add_pd(c7, _mm256_mul_pd(poly, x2));
    poly = _mm256_add_pd(c5, _mm256_mul_pd(poly, x2));
    poly = _mm256_add_pd(c3, _mm256_mul_pd(poly, x2));

    const __m256d x3 = _mm256_mul_pd(x2, x);
    return _mm256_add_pd(x, _mm256_mul_pd(x3, poly));
}

// Lane-wise Lanczos kernel: a * sin(PI*x) * sin(PI*x / a) / (PI*x)^2, with the
// value forced to 1 in lanes where x is exactly 0 (where the quotient would be
// 0/0). No window cut-off is applied here; callers are expected to pass only
// offsets inside the kernel support.
inline __m256d lanczosKernelAVX(const __m256d &x, const __m256d &a_val)
{
    const __m256d scaled = _mm256_mul_pd(_mm256_set1_pd(M_PI), x);

    const __m256d sinc_sine = sin_pd_approx(scaled);
    const __m256d window_sine = sin_pd_approx(_mm256_div_pd(scaled, a_val));

    const __m256d top = _mm256_mul_pd(_mm256_mul_pd(a_val, sinc_sine), window_sine);
    const __m256d bottom = _mm256_mul_pd(scaled, scaled);
    const __m256d quotient = _mm256_div_pd(top, bottom);

    // Replace the 0/0 result at the kernel centre with its limit value 1.
    const __m256d at_centre = _mm256_cmp_pd(x, _mm256_setzero_pd(), _CMP_EQ_OQ);
    return _mm256_blendv_pd(quotient, _mm256_set1_pd(1.0), at_centre);
}

// Computes one output row (index `y`) of a Lanczos-resampled image (AVX build).
//
// Parameters:
//   y            - index of the target row to produce
//   targetWidth  - number of pixels in the returned row
//   targetHeight - height of the target image (used to map y into the source)
//   sourceImage  - image that is sampled through QImage::pixel()
//   a            - Lanczos kernel parameter; 2*a taps are visited per axis
// Returns the row as non-premultiplied QRgb values built with qRgba().
//
// Differences from the previous implementation:
//   * the alpha channel is read with qAlpha(); going through QColor(QRgb)
//     drops alpha and reported every pixel as fully opaque
//   * the four channels are packed into the four lanes of one accumulator
//     instead of keeping one vector of duplicated values per channel
//   * the horizontal weight is computed once per source column rather than
//     once per (column, row) pair
//   * results are rounded to the nearest integer instead of truncated, and
//     are clamped to [0, 255] even when the weight sum is not positive
QVector<QRgb> processRow(int y, int targetWidth, int targetHeight, const QImage &sourceImage, int a)
{
    QVector<QRgb> resultRow(targetWidth);
    const int sourceWidth = sourceImage.width();
    const int sourceHeight = sourceImage.height();
    const __m256d a_vec = _mm256_set1_pd(a);

    // Clamp to the 8-bit range, then round to nearest.
    const auto toChannel = [](double value) {
        return qRound(std::clamp(value, 0.0, 255.0));
    };

    // The vertical source coordinate only depends on the row.
    const double gy = (static_cast<double>(y) / targetHeight) * (sourceHeight - 1);
    const int centerY = static_cast<int>(gy);

    for (int x = 0; x < targetWidth; ++x) {
        const double gx = (static_cast<double>(x) / targetWidth) * (sourceWidth - 1);
        const int centerX = static_cast<int>(gx);

        // Lanes: 0 = red, 1 = green, 2 = blue, 3 = alpha.
        __m256d color_acc = _mm256_setzero_pd();
        double sumWeights = 0.0;

        for (int ix = centerX - a + 1; ix <= centerX + a; ++ix) {
            if (ix < 0 || ix >= sourceWidth) {
                continue;
            }
            const __m256d weight_x = lanczosKernelAVX(_mm256_set1_pd(gx - ix), a_vec);

            for (int iy = centerY - a + 1; iy <= centerY + a; ++iy) {
                if (iy < 0 || iy >= sourceHeight) {
                    continue;
                }
                const __m256d weight_y = lanczosKernelAVX(_mm256_set1_pd(gy - iy), a_vec);
                // Both factors are broadcasts, so every lane holds the same weight.
                const __m256d weight = _mm256_mul_pd(weight_x, weight_y);

                const QRgb pixel = sourceImage.pixel(ix, iy);
                // _mm256_set_pd takes its arguments from the highest lane down.
                const __m256d pixel_channels = _mm256_set_pd(qAlpha(pixel), qBlue(pixel), qGreen(pixel), qRed(pixel));

                color_acc = _mm256_add_pd(color_acc, _mm256_mul_pd(weight, pixel_channels));
                sumWeights += _mm256_cvtsd_f64(weight);
            }
        }

        double sums[4];
        _mm256_storeu_pd(sums, color_acc);

        // Normalise so that taps cut off at the image border do not darken the result.
        if (sumWeights > 0.0) {
            for (double &sum : sums) {
                sum /= sumWeights;
            }
        }

        resultRow[x] = qRgba(toChannel(sums[0]), toChannel(sums[1]), toChannel(sums[2]), toChannel(sums[3]));
    }

    return resultRow;
}

#elif defined(__ARM_NEON) || defined(__ARM_NEON__)

// Lane-wise Lanczos kernel: a * sin(PI*x) * sin(PI*x / a) / (PI*x)^2, with the
// value forced to 1 in lanes where x is exactly 0 (where the quotient would be
// 0/0). No window cut-off is applied here; callers are expected to pass only
// offsets inside the kernel support.
//
// The sines are evaluated per lane with std::sin. The unused scratch arrays
// and the store/load round trips of the previous version have been removed;
// the computed values are unchanged.
inline float64x2_t lanczosKernelNEON(const float64x2_t &x, int a)
{
    const float64x2_t a_val = vdupq_n_f64(a);

    // Arguments of the two sine factors.
    const float64x2_t pix = vmulq_f64(vdupq_n_f64(M_PI), x);
    const float64x2_t pix_div_a = vdivq_f64(pix, a_val);

    // sin(PI * x), lane by lane
    const double sin_pix_lanes[2] = {
        std::sin(vgetq_lane_f64(pix, 0)),
        std::sin(vgetq_lane_f64(pix, 1))
    };
    const float64x2_t sin_pix = vld1q_f64(sin_pix_lanes);

    // sin(PI * x / a), lane by lane
    const double sin_pix_a_lanes[2] = {
        std::sin(vgetq_lane_f64(pix_div_a, 0)),
        std::sin(vgetq_lane_f64(pix_div_a, 1))
    };
    const float64x2_t sin_pix_a = vld1q_f64(sin_pix_a_lanes);

    // Compute Lanczos kernel
    const float64x2_t numerator = vmulq_f64(vmulq_f64(a_val, sin_pix), sin_pix_a);
    const float64x2_t denominator = vmulq_f64(pix, pix);
    const float64x2_t quotient = vdivq_f64(numerator, denominator);

    // Replace the 0/0 result at the kernel centre with its limit value 1.
    const uint64x2_t at_centre = vceqq_f64(x, vdupq_n_f64(0.0));
    return vbslq_f64(at_centre, vdupq_n_f64(1.0), quotient);
}

// Computes one output row (index `y`) of a Lanczos-resampled image (NEON build).
//
// Parameters:
//   y            - index of the target row to produce
//   targetWidth  - number of pixels in the returned row
//   targetHeight - height of the target image (used to map y into the source)
//   sourceImage  - image that is sampled through QImage::pixel()
//   a            - Lanczos kernel parameter; 2*a taps are visited per axis
// Returns the row as non-premultiplied QRgb values built with qRgba().
//
// Differences from the previous implementation:
//   * the alpha channel is read with qAlpha(); going through QColor(QRgb)
//     drops alpha and reported every pixel as fully opaque
//   * the channels are packed pairwise (red/green and blue/alpha) into two
//     accumulators instead of keeping one vector of duplicated values per
//     channel
//   * the horizontal weight is computed once per source column rather than
//     once per (column, row) pair
//   * results are rounded to the nearest integer instead of truncated, and
//     are clamped to [0, 255] even when the weight sum is not positive
QVector<QRgb> processRow(int y, int targetWidth, int targetHeight, const QImage &sourceImage, int a)
{
    QVector<QRgb> resultRow(targetWidth);
    const int sourceWidth = sourceImage.width();
    const int sourceHeight = sourceImage.height();

    // Clamp to the 8-bit range, then round to nearest.
    const auto toChannel = [](double value) {
        return qRound(std::clamp(value, 0.0, 255.0));
    };

    // The vertical source coordinate only depends on the row.
    const double gy = (static_cast<double>(y) / targetHeight) * (sourceHeight - 1);
    const int centerY = static_cast<int>(gy);

    for (int x = 0; x < targetWidth; ++x) {
        const double gx = (static_cast<double>(x) / targetWidth) * (sourceWidth - 1);
        const int centerX = static_cast<int>(gx);

        float64x2_t red_green_acc = vdupq_n_f64(0.0); // lane 0 = red, lane 1 = green
        float64x2_t blue_alpha_acc = vdupq_n_f64(0.0); // lane 0 = blue, lane 1 = alpha
        double sumWeights = 0.0;

        for (int ix = centerX - a + 1; ix <= centerX + a; ++ix) {
            if (ix < 0 || ix >= sourceWidth) {
                continue;
            }
            const float64x2_t weight_x = lanczosKernelNEON(vdupq_n_f64(gx - ix), a);

            for (int iy = centerY - a + 1; iy <= centerY + a; ++iy) {
                if (iy < 0 || iy >= sourceHeight) {
                    continue;
                }
                const float64x2_t weight_y = lanczosKernelNEON(vdupq_n_f64(gy - iy), a);
                // Both factors are broadcasts, so both lanes hold the same weight.
                const float64x2_t weight = vmulq_f64(weight_x, weight_y);

                const QRgb pixel = sourceImage.pixel(ix, iy);
                const double red_green[2] = { static_cast<double>(qRed(pixel)), static_cast<double>(qGreen(pixel)) };
                const double blue_alpha[2] = { static_cast<double>(qBlue(pixel)), static_cast<double>(qAlpha(pixel)) };

                red_green_acc = vmlaq_f64(red_green_acc, weight, vld1q_f64(red_green));
                blue_alpha_acc = vmlaq_f64(blue_alpha_acc, weight, vld1q_f64(blue_alpha));
                sumWeights += vgetq_lane_f64(weight, 0);
            }
        }

        double red = vgetq_lane_f64(red_green_acc, 0);
        double green = vgetq_lane_f64(red_green_acc, 1);
        double blue = vgetq_lane_f64(blue_alpha_acc, 0);
        double alpha = vgetq_lane_f64(blue_alpha_acc, 1);

        // Normalise so that taps cut off at the image border do not darken the result.
        if (sumWeights > 0.0) {
            red /= sumWeights;
            green /= sumWeights;
            blue /= sumWeights;
            alpha /= sumWeights;
        }

        resultRow[x] = qRgba(toChannel(red), toChannel(green), toChannel(blue), toChannel(alpha));
    }

    return resultRow;
}

#else

// Scalar fallback for unsupported platforms
// Lanczos kernel of parameter `a` evaluated at offset `x`:
//   1                                           for x == 0
//   0                                           for |x| > a
//   a * sin(PI*x) * sin(PI*x / a) / (PI*x)^2    otherwise
double lanczosKernel(double x, int a)
{
    // Exact comparison is intentional: it guards the 0/0 case below.
    if (x == 0.0) {
        return 1.0;
    }

    const bool outsideWindow = (x < -a) || (x > a);
    if (outsideWindow) {
        return 0.0;
    }

    const double pix = M_PI * x;
    const double numerator = a * std::sin(pix) * std::sin(pix / a);
    return numerator / (pix * pix);
}

// Computes one output row (index `y`) of a Lanczos-resampled image (scalar build).
//
// Parameters:
//   y            - index of the target row to produce
//   targetWidth  - number of pixels in the returned row
//   targetHeight - height of the target image (used to map y into the source)
//   sourceImage  - image that is sampled through QImage::pixel()
//   a            - Lanczos kernel parameter; 2*a taps are visited per axis
// Returns the row as non-premultiplied QRgb values built with qRgba().
//
// Differences from the previous implementation:
//   * the alpha channel is read with qAlpha(); going through QColor(QRgb)
//     drops alpha and reported every pixel as fully opaque
//   * the horizontal weight is computed once per source column rather than
//     once per (column, row) pair
//   * results are rounded to the nearest integer instead of truncated, and
//     are clamped to [0, 255] even when the weight sum is not positive
QVector<QRgb> processRow(int y, int targetWidth, int targetHeight, const QImage &sourceImage, int a)
{
    QVector<QRgb> resultRow(targetWidth);
    const int sourceWidth = sourceImage.width();
    const int sourceHeight = sourceImage.height();

    // Clamp to the 8-bit range, then round to nearest.
    const auto toChannel = [](double value) {
        return qRound(std::clamp(value, 0.0, 255.0));
    };

    // The vertical source coordinate only depends on the row.
    const double gy = (static_cast<double>(y) / targetHeight) * (sourceHeight - 1);
    const int centerY = static_cast<int>(gy);

    for (int x = 0; x < targetWidth; ++x) {
        const double gx = (static_cast<double>(x) / targetWidth) * (sourceWidth - 1);
        const int centerX = static_cast<int>(gx);

        double red = 0.0;
        double green = 0.0;
        double blue = 0.0;
        double alpha = 0.0;
        double sumWeights = 0.0;

        for (int ix = centerX - a + 1; ix <= centerX + a; ++ix) {
            if (ix < 0 || ix >= sourceWidth) {
                continue;
            }
            const double weightX = lanczosKernel(gx - ix, a);

            for (int iy = centerY - a + 1; iy <= centerY + a; ++iy) {
                if (iy < 0 || iy >= sourceHeight) {
                    continue;
                }
                const double weight = weightX * lanczosKernel(gy - iy, a);
                const QRgb pixel = sourceImage.pixel(ix, iy);

                red += weight * qRed(pixel);
                green += weight * qGreen(pixel);
                blue += weight * qBlue(pixel);
                alpha += weight * qAlpha(pixel);
                sumWeights += weight;
            }
        }

        // Normalise so that taps cut off at the image border do not darken the result.
        if (sumWeights > 0.0) {
            red /= sumWeights;
            green /= sumWeights;
            blue /= sumWeights;
            alpha /= sumWeights;
        }

        resultRow[x] = qRgba(toChannel(red), toChannel(green), toChannel(blue), toChannel(alpha));
    }

    return resultRow;
}

#endif

// Resamples `sourceImage` to exactly `targetWidth` x `targetHeight` pixels with
// a Lanczos filter of parameter `a`, processing the target rows in parallel.
//
// Returns a Format_ARGB32 image, or a null QImage when the source is null, a
// target dimension is not positive, `a` is smaller than 1, or the target
// buffer could not be allocated.
QImage scaleImageLanczos(const QImage &sourceImage, int targetWidth, int targetHeight, int a = 3)
{
    // Reject input for which no meaningful output exists. A negative height
    // would also be an invalid size for the row index vector below.
    if (sourceImage.isNull() || targetWidth <= 0 || targetHeight <= 0 || a < 1) {
        return QImage();
    }

    QImage targetImage(targetWidth, targetHeight, QImage::Format_ARGB32);
    if (targetImage.isNull()) {
        // Allocation failed: bits() would be null and must not be written to.
        return targetImage;
    }

    // One entry per target row; blockingMap hands each entry to a worker.
    QVector<int> rows(targetHeight);
    for (int i = 0; i < targetHeight; ++i) {
        rows[i] = i;
    }

    uchar *imgBits = targetImage.bits();
    // Offsets are computed in std::size_t so large images cannot overflow int.
    const std::size_t bytesPerLine = static_cast<std::size_t>(targetImage.bytesPerLine());
    const std::size_t rowBytes = static_cast<std::size_t>(targetWidth) * sizeof(QRgb);

    // Every task writes to its own scan line, so no synchronisation is needed.
    // blockingMap returns only after all rows are done, which keeps the
    // captured reference and raw pointer valid for the tasks' lifetime.
    QtConcurrent::blockingMap(rows, [targetWidth, targetHeight, &sourceImage, a, imgBits, bytesPerLine, rowBytes](int y) {
        const QVector<QRgb> rowPixels = processRow(y, targetWidth, targetHeight, sourceImage, a);
        uchar *rowPtr = imgBits + static_cast<std::size_t>(y) * bytesPerLine;
        std::memcpy(rowPtr, rowPixels.constData(), rowBytes);
    });

    return targetImage;
}

// QPixmap wrapper around scaleImageLanczos(): converts the pixmap to a QImage,
// resamples it to `targetWidth` x `targetHeight` with kernel parameter `a`, and
// converts the result back to a QPixmap.
QPixmap scalePixmapLanczosQt(const QPixmap &pixmap, int targetWidth, int targetHeight, int a)
{
    return QPixmap::fromImage(scaleImageLanczos(pixmap.toImage(), targetWidth, targetHeight, a));
}