
revert back to better blurring behaviour

master · Pandora · 7 years ago
parent commit 2040285ce9

Changed files:
  1. blur.c (159)
  2. blur.h (14)
  3. blur_simd.c (276)
  4. configure.ac (4)

blur.c (159)

@@ -1,6 +1,6 @@
 /*
  * Copyright © 2008 Kristian Høgsberg
  * Copyright © 2009 Chris Wilson
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
@@ -23,12 +23,9 @@
 #include <math.h>
 #include "blur.h"
 
-#define ARRAY_LENGTH(a) (sizeof (a) / sizeof (a)[0])
-
-/* Performs a simple 2D Gaussian blur of radius @radius on surface @surface. */
+/* Performs a simple 2D Gaussian blur of standard deviation @sigma on surface @surface. */
 void
-blur_image_surface (cairo_surface_t *surface, int radius)
+blur_image_surface (cairo_surface_t *surface, int sigma)
 {
     cairo_surface_t *tmp;
     int width, height;
@@ -63,92 +60,86 @@ blur_image_surface (cairo_surface_t *surface, int radius)
         return;
 
     src = (uint32_t*)cairo_image_surface_get_data (surface);
     dst = (uint32_t*)cairo_image_surface_get_data (tmp);
 
-#ifdef __SSE3__
-    blur_impl_ssse3(src, dst, width, height, 4.5);
-#elif __SSE2__
-    blur_impl_sse2(src, dst, width, height, 4.5);
-#else
-    int src_stride = cairo_image_surface_get_stride (surface);
-    int dst_stride = cairo_image_surface_get_stride (tmp);
-    blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-#endif
+    // according to a paper by Peter Kovesi [1], a box filter of width w is equivalent to a Gaussian blur of the following sigma:
+    // σ_av = sqrt((w*w-1)/12)
+    // for our 7x7 filter we have σ_av = 2.0.
+    // applying the same Gaussian filter n times results in σ_n = sqrt(n*σ_av*σ_av) [2]
+    // after some trivial math, we arrive at n = ((σ_d)/(σ_av))^2
+    // since it's a box blur filter, n >= 3
+    //
+    // [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
+    // [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
+    int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV));
+    if (n < 3) n = 3;
+
+    for (int i = 0; i < n; i++)
+    {
+        // horizontal pass includes image transposition:
+        // instead of writing pixel src[x] to dst[x],
+        // we write it to a transposed location
+        // (to be exact: dst[height * current_column + current_row])
+#ifdef __SSE2__
+        blur_impl_horizontal_pass_sse2(src, dst, width, height);
+        blur_impl_horizontal_pass_sse2(dst, src, height, width);
+#else
+        blur_impl_horizontal_pass_generic(src, dst, width, height);
+        blur_impl_horizontal_pass_generic(dst, src, height, width);
+#endif
+    }
 
     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
     cairo_surface_mark_dirty (surface);
 }
 
-void blur_impl_naive(uint32_t* _src, uint32_t* _dst, int width, int height, int src_stride, int dst_stride, int radius)
-{
-    int x, y, z, w;
-    uint32_t *s, *d, a, p;
-    int i, j, k;
-    uint8_t kernel[17];
-    const int size = ARRAY_LENGTH (kernel);
-    const int half = size / 2;
-
-    uint8_t *src = (uint8_t*)_src;
-    uint8_t *dst = (uint8_t*)_dst;
-
-    a = 0;
-    for (i = 0; i < size; i++) {
-        double f = i - half;
-        a += kernel[i] = exp (- f * f / 30.0) * 80;
-    }
-
-    /* Horizontally blur from surface -> tmp */
-    for (i = 0; i < height; i++) {
-        s = (uint32_t *) (src + i * src_stride);
-        d = (uint32_t *) (dst + i * dst_stride);
-        for (j = 0; j < width; j++) {
-            if (radius < j && j < width - radius) {
-                d[j] = s[j];
-                continue;
-            }
-
-            x = y = z = w = 0;
-            for (k = 0; k < size; k++) {
-                if (j - half + k < 0 || j - half + k >= width)
-                    continue;
-
-                p = s[j - half + k];
-
-                x += ((p >> 24) & 0xff) * kernel[k];
-                y += ((p >> 16) & 0xff) * kernel[k];
-                z += ((p >> 8) & 0xff) * kernel[k];
-                w += ((p >> 0) & 0xff) * kernel[k];
-            }
-            d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
-        }
-    }
-
-    /* Then vertically blur from tmp -> surface */
-    for (i = 0; i < height; i++) {
-        s = (uint32_t *) (dst + i * dst_stride);
-        d = (uint32_t *) (src + i * src_stride);
-        for (j = 0; j < width; j++) {
-            if (radius <= i && i < height - radius) {
-                d[j] = s[j];
-                continue;
-            }
-
-            x = y = z = w = 0;
-            for (k = 0; k < size; k++) {
-                if (i - half + k < 0 || i - half + k >= height)
-                    continue;
-
-                s = (uint32_t *) (dst + (i - half + k) * dst_stride);
-                p = s[j];
-
-                x += ((p >> 24) & 0xff) * kernel[k];
-                y += ((p >> 16) & 0xff) * kernel[k];
-                z += ((p >> 8) & 0xff) * kernel[k];
-                w += ((p >> 0) & 0xff) * kernel[k];
-            }
-            d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
-        }
-    }
-}
+void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height) {
+    for (int row = 0; row < height; row++) {
+        for (int column = 0; column < width; column++, src++) {
+            uint32_t rgbaIn[KERNEL_SIZE];
+
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column > width - HALF_KERNEL;
+            int i = 0;
+            if (leftBorder) {
+                // for kernel size 7x7 and column == 0, we have:
+                // x x x P0 P1 P2 P3
+                // first loop mirrors P{0..3} to fill x's,
+                // second one loads P{0..3}
+                for (; i < HALF_KERNEL - column; i++)
+                    rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                for (; i < KERNEL_SIZE; i++)
+                    rgbaIn[i] = *(src - (HALF_KERNEL - i));
+            } else if (rightBorder) {
+                for (; i < width - column; i++)
+                    rgbaIn[i] = *(src + i);
+                for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                    rgbaIn[i] = *(src - k);
+            } else {
+                for (; i < KERNEL_SIZE; i++)
+                    rgbaIn[i] = *(src + i - HALF_KERNEL);
+            }
+
+            uint32_t acc[4] = {0};
+
+            for (i = 0; i < KERNEL_SIZE; i++) {
+                acc[0] += (rgbaIn[i] & 0xFF000000) >> 24;
+                acc[1] += (rgbaIn[i] & 0x00FF0000) >> 16;
+                acc[2] += (rgbaIn[i] & 0x0000FF00) >> 8;
+                acc[3] += (rgbaIn[i] & 0x000000FF) >> 0;
+            }
+
+            for (i = 0; i < 4; i++)
+                acc[i] *= 1.0/KERNEL_SIZE;
+
+            *(dst + height * column + row) = (acc[0] << 24) |
+                                             (acc[1] << 16) |
+                                             (acc[2] << 8)  |
+                                             (acc[3] << 0);
+        }
+    }
+}
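
Note: the pass-count comment in the new blur_image_surface boils down to a couple of lines of arithmetic. The sketch below restates it in runnable form; the helper name box_blur_passes and the example sigma of 4 are illustrative only and are not part of this commit.

#include <math.h>
#include <stdio.h>

/* Illustrative only: same arithmetic as the comment in blur_image_surface.
 * A box filter of width w approximates a Gaussian with
 * sigma_av = sqrt((w*w - 1) / 12); for w = 7 that is sqrt(48/12) = 2.
 * Repeating the pass n times gives sigma_n = sqrt(n) * sigma_av,
 * so reaching a target sigma_d takes n = (sigma_d / sigma_av)^2 passes,
 * clamped to at least 3 so the repeated box blur still looks Gaussian. */
static int box_blur_passes(double sigma_d, int kernel_width) {
    double sigma_av = sqrt((kernel_width * kernel_width - 1) / 12.0);
    int n = (int)lround((sigma_d * sigma_d) / (sigma_av * sigma_av));
    return n < 3 ? 3 : n;
}

int main(void) {
    /* sigma = 4 with the 7-wide kernel: (4/2)^2 = 4 passes */
    printf("%d\n", box_blur_passes(4.0, 7));
    return 0;
}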

blur.h (14)

@@ -4,12 +4,14 @@
 #include <stdint.h>
 #include <cairo.h>
 
-void blur_image_surface (cairo_surface_t *surface, int radius);
-void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
-void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
-void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height);
+#define KERNEL_SIZE 7
+#define SIGMA_AV 2
+#define HALF_KERNEL KERNEL_SIZE / 2
+
+void blur_image_surface(cairo_surface_t *surface, int sigma);
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height);
+void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height);
 
 #endif
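
Note: with the header reduced to this, callers only need blur_image_surface and the second argument is now a standard deviation rather than a pixel radius. A minimal usage sketch, assuming a Cairo ARGB32 image surface; this caller is illustrative and not code from the commit.

#include <cairo.h>
#include "blur.h"

/* Hypothetical caller: blur an 800x600 ARGB32 surface with sigma = 4. */
int main(void) {
    cairo_surface_t *surface =
        cairo_image_surface_create(CAIRO_FORMAT_ARGB32, 800, 600);

    /* ... draw the screenshot / background into surface ... */

    blur_image_surface(surface, 4);   /* sigma, no longer a pixel radius */

    /* ... paint the blurred surface onto the lock-screen window ... */

    cairo_surface_destroy(surface);
    return 0;
}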

blur_simd.c (276)

@@ -1,56 +1,19 @@
 /*
  * vim:ts=4:sw=4:expandtab
  *
  * © 2016 Sebastian Frysztak
  *
  * See LICENSE for licensing information
  *
  */
 
 #include "blur.h"
-#include <math.h>
 #include <xmmintrin.h>
-#include <tmmintrin.h>
-#include <stdio.h>
 
-#define ALIGN16 __attribute__((aligned(16)))
-#define KERNEL_SIZE 15
-#define HALF_KERNEL KERNEL_SIZE / 2
-
-// number of xmm registers needed to store
-// input pixels for given kernel size
+// number of xmm registers needed to store input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
-// scaling factor for kernel coefficients.
-// higher values cause desaturation.
-// used in SSSE3 implementation.
-#define SCALE_FACTOR 7
-
-void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // prepare kernel
-    float kernel[KERNEL_SIZE];
-    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
-
-    for (int i = 0; i < KERNEL_SIZE; i++) {
-        float x = HALF_KERNEL - i;
-        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
-        sum += kernel[i];
-    }
-
-    // normalize kernel
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] /= sum;
-
-    // horizontal pass includes image transposition:
-    // instead of writing pixel src[x] to dst[x],
-    // we write it to transposed location.
-    // (to be exact: dst[height * current_column + current_row])
-    blur_impl_horizontal_pass_sse2(src, dst, kernel, width, height);
-    blur_impl_horizontal_pass_sse2(dst, src, kernel, height, width);
-}
-
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) {
     uint32_t* o_src = src;
     for (int row = 0; row < height; row++) {
         for (int column = 0; column < width; column++, src++) {
@@ -59,227 +22,58 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column > width - HALF_KERNEL;
-            if (leftBorder || rightBorder) {
-                uint32_t _rgbaIn[KERNEL_SIZE + 1] ALIGN16;
-                int i = 0;
-                if (leftBorder) {
-                    // for kernel size 7x7 and column == 0, we have:
-                    // x x x P0 P1 P2 P3
-                    // first loop mirrors P{0..3} to fill x's,
-                    // second one loads P{0..3}
-                    for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-                } else {
-                    for (; i < width - column; i++)
-                        _rgbaIn[i] = *(src + i);
-                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                        _rgbaIn[i] = *(src - k);
-                }
-
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
-            } else {
-                for (int k = 0; k < REGISTERS_CNT; k++) {
-#if 0
-                    printf("%p -> %p (%ld) || %p->%p\n",
-                            o_src,
-                            o_src + (height * width),
-                            o_src + (height * width) - src,
-                            src + 4*k - HALF_KERNEL,
-                            ((__m128i*)src + 4*k - HALF_KERNEL) + 1
-                          );
-#endif
-                    // if this copy would go out of bounds, break
-                    if ((long long) (((__m128i*) src + 4*k - HALF_KERNEL) + 1)
-                            > (long long) (o_src + (height * width)))
-                        break;
-                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
-                }
-            }
-
-            // unpack each pixel, convert to float,
-            // multiply by corresponding kernel value
-            // and add to accumulator
-            __m128i tmp;
-            __m128i zero = _mm_setzero_si128();
-            __m128 rgba_ps;
-            __m128 acc = _mm_setzero_ps();
-            int counter = 0;
-
-            for (int i = 0; i < 3; i++)
-            {
-                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            }
-
-            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-            __m128i rgbaOut = _mm_cvtps_epi32(acc);
-            rgbaOut = _mm_packs_epi32(rgbaOut, zero);
-            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
-            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
-        }
-    }
-}
-
-void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // prepare kernel
-    float kernelf[KERNEL_SIZE];
-    int8_t kernel[KERNEL_SIZE + 1];
-    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
-
-    for (int i = 0; i < KERNEL_SIZE; i++) {
-        float x = HALF_KERNEL - i;
-        kernelf[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
-        sum += kernelf[i];
-    }
-
-    // normalize kernel
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernelf[i] /= sum;
-
-    // round to nearest integer and convert to int
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] = (int8_t)rintf(kernelf[i] * (1 << SCALE_FACTOR));
-    kernel[KERNEL_SIZE] = 0;
-
-    // horizontal pass includes image transposition:
-    // instead of writing pixel src[x] to dst[x],
-    // we write it to transposed location.
-    // (to be exact: dst[height * current_column + current_row])
-    blur_impl_horizontal_pass_ssse3(src, dst, kernel, width, height);
-    blur_impl_horizontal_pass_ssse3(dst, src, kernel, height, width);
-}
-
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height) {
-    uint32_t* o_src = src;
-    __m128i _kern = _mm_loadu_si128((__m128i*)kernel);
-    __m128i rgbaIn[REGISTERS_CNT];
-
-    for (int row = 0; row < height; row++) {
-        for (int column = 0; column < width; column++, src++) {
-            uint32_t _rgbaIn[KERNEL_SIZE + 1] ALIGN16;
-#if 0
-            for (int j = 0; j < KERNEL_SIZE; ++j) {
-                printf("_rgbaIn[%d]: %p->%p\n", j, &_rgbaIn[j], &_rgbaIn[j] + 1);
-            }
-#endif
-
-            // handle borders
-            int leftBorder = column < HALF_KERNEL;
-            int rightBorder = column > width - HALF_KERNEL;
-            if (leftBorder || rightBorder) {
-                int i = 0;
-                if (leftBorder) {
-                    // for kernel size 7x7 and column == 0, we have:
-                    // x x x P0 P1 P2 P3
-                    // first loop mirrors P{0..3} to fill x's,
-                    // second one loads P{0..3}
-                    for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-                } else {
-                    for (; i < width - column; i++)
-                        _rgbaIn[i] = *(src + i);
-                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                        _rgbaIn[i] = *(src - k);
-                }
-
-                for (int k = 0; k < REGISTERS_CNT; k++) {
-#if 0
-                    printf("K: %d; p: %p, p+4*k: %p, end of p: %p\n", k, _rgbaIn, _rgbaIn+4*k, ((__m128i*) (_rgbaIn +4*k)) + 1);
-#endif
-                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
-                }
-            } else {
-                for (int k = 0; k < REGISTERS_CNT; k++) {
-                    if ((long long) (((__m128i*) src + 4*k - HALF_KERNEL) + 1)
-                            > (long long) (o_src + (height * width)))
-                        break;
-#if 0
-                    printf("K: %d; p: %p -> %p\n", k, src+4*k - HALF_KERNEL, ((__m128i*) (src +4*k - HALF_KERNEL)) + 1);
-                    printf("%p->%p, %p->%p (%ld)\n", (__m128i*) src + 4*k - HALF_KERNEL, ((__m128i*) src + 4*k - HALF_KERNEL) + 1, o_src, o_src + (width * height), o_src + (width * height) - src);
-#endif
-                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
-                }
-            }
-
-            // basis of this implementation is _mm_maddubs_epi16 (aka pmaddubsw).
-            // 'rgba' holds 16 unsigned bytes, so 4 pixels.
-            // 'kern' holds 16 signed bytes kernel values multiplied by (1 << SCALE_FACTOR).
-            // before multiplication takes place, vectors need to be prepared:
-            // 'rgba' is shuffled from R1B1G1A1...R4B4G4A4 to R1R2R3R4...A1A2A3A4
-            // 'kern' is shuffled from w1w2w3w4...w13w14w15w16 to w1w2w3w4 repeated 4 times
-            // then we call _mm_maddubs_epi16 and we get:
-            // --------------------------------------------------------------------------------------
-            // | R1*w1 + R2*w2 | R3*w3 + R4*w4 | G1*w1 + G2*w2 | G3*w3 + G4*w4 | repeat for B and A |
-            // --------------------------------------------------------------------------------------
-            // each 'rectangle' is a 16-byte signed int.
-            // then we repeat the process for the rest of input pixels,
-            // call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
-            __m128i rgba, kern;
-            __m128i zero = _mm_setzero_si128();
-            __m128i acc = _mm_setzero_si128();
-
-            const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8, 12,
-                                                         1, 5, 9, 13,
-                                                         2, 6, 10, 14,
-                                                         3, 7, 11, 15);
-            const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
-                                                         0, 1, 2, 3,
-                                                         0, 1, 2, 3,
-                                                         0, 1, 2, 3);
-
-            rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_kern, kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
-
-            rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 4), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
-
-            rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 8), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
-
-            rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 12), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
-
-            acc = _mm_hadds_epi16(acc, zero);
-            acc = _mm_srai_epi16(acc, SCALE_FACTOR);
-
-            // Cairo sets alpha channel to 255
-            // (or -1, depending how you look at it)
-            // this quickly overflows accumulator,
-            // and alpha is calculated completely wrong.
-            // I assume most people don't use semi-transparent
-            // lock screen images, so no one will mind if we
-            // 'correct it' by setting alpha to 255.
-            *(dst + height * column + row) =
-                _mm_cvtsi128_si32(_mm_packus_epi16(acc, zero)) | 0xFF000000;
+            uint32_t _rgbaIn[KERNEL_SIZE + 1] __attribute__((aligned(16)));
+            int i = 0;
+            if (leftBorder) {
+                // for kernel size 7x7 and column == 0, we have:
+                // x x x P0 P1 P2 P3
+                // first loop mirrors P{0..3} to fill x's,
+                // second one loads P{0..3}
+                for (; i < HALF_KERNEL - column; i++)
+                    _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                for (; i < KERNEL_SIZE; i++)
+                    _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else if (rightBorder) {
+                for (; i < width - column; i++)
+                    _rgbaIn[i] = *(src + i);
+                for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                    _rgbaIn[i] = *(src - k);
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else {
+                for (int k = 0; k < REGISTERS_CNT; k++) {
+                    if ((long long) (((__m128i*) src + 4*k - HALF_KERNEL) + 1)
+                            > (long long) (o_src + (height * width)))
+                        break;
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
+                }
+            }
+
+            __m128i zero = _mm_setzero_si128();
+            __m128i acc = _mm_setzero_si128();
+
+            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));
+            acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));
+            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));
+            // kernel size equals to 7, but we can only load multiples of 4 pixels
+            // we have to set 8th pixel to zero
+            acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0),
+                                                      _mm_unpackhi_epi8(rgbaIn[1], zero)));
+            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),
+                                _mm_unpackhi_epi16(acc, zero));
+
+            // multiplication is significantly faster than division
+            acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
+                                             _mm_set1_ps(1.0/KERNEL_SIZE)));
+
+            *(dst + height * column + row) =
+                _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));
         }
     }
 }
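
Note: the SSE2 pass loads the seven kernel taps into two XMM registers (REGISTERS_CNT = (7 + 2) / 4 = 2) and uses the _mm_andnot_si128 mask to zero the unused eighth lane before averaging. The scalar restatement below shows what those intrinsics compute for a single output pixel; it is an illustrative sketch (the helper box_average_7 is hypothetical), and it deliberately matches the inner loop of blur_impl_horizontal_pass_generic, which is why the SSE2 and generic paths produce the same image.

#include <stdint.h>

#define KERNEL_SIZE 7

/* Scalar equivalent of the SSE2 accumulation for one output pixel:
 * px points at the first of 8 loaded pixels (the 8th is ignored, which is
 * what the _mm_andnot_si128 mask achieves); result is the per-channel mean. */
static uint32_t box_average_7(const uint32_t *px) {
    uint32_t acc[4] = {0, 0, 0, 0};
    for (int i = 0; i < KERNEL_SIZE; i++) {       /* 8th lane contributes 0 */
        acc[0] += (px[i] >> 24) & 0xFF;
        acc[1] += (px[i] >> 16) & 0xFF;
        acc[2] += (px[i] >>  8) & 0xFF;
        acc[3] += (px[i] >>  0) & 0xFF;
    }
    for (int c = 0; c < 4; c++)
        acc[c] /= KERNEL_SIZE;                    /* SIMD path multiplies by 1.0/7 */
    return (acc[0] << 24) | (acc[1] << 16) | (acc[2] << 8) | acc[3];
}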

configure.ac (4)

@@ -101,10 +101,8 @@ AC_PROG_LN_S
 AM_PROG_AR
 AX_FLAGS_WARN_ALL
-AX_APPEND_FLAG([-march=native], [AM_CFLAGS])
 AX_APPEND_FLAG([-O2], [AM_CFLAGS])
 AX_APPEND_FLAG([-funroll-loops], [AM_CFLAGS])
-AX_APPEND_FLAG([-std=gnu99], [AM_CFLAGS])
 AX_CHECK_COMPILE_FLAG([-Wunused-value], [AX_APPEND_FLAG([-Wunused-value], [AM_CFLAGS])])
 AC_SUBST(AM_CFLAGS)
