From f06dc6cbc4a6ddd6f3754781658629c18d50c0ff Mon Sep 17 00:00:00 2001 From: Sebastian Frysztak Date: Fri, 4 Nov 2016 22:19:29 +0100 Subject: [PATCH] Add AVX version. It relies on some SSE2 instructions, so performance gain is not that huge (about 1.4x). I experimented with 256-bit loads, but they turned out to be slower (at least on Sandy Bridge). --- Makefile | 1 + blur.h | 8 +++- blur_simd.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d979100..9ab3a91 100644 --- a/Makefile +++ b/Makefile @@ -15,6 +15,7 @@ CFLAGS += -std=c99 CFLAGS += -pipe CFLAGS += -Wall CFLAGS += -mssse3 +CFLAGS += -mavx CFLAGS += -O2 CPPFLAGS += -D_GNU_SOURCE CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi ) diff --git a/blur.h b/blur.h index 7ba45fe..607aa70 100644 --- a/blur.h +++ b/blur.h @@ -6,8 +6,14 @@ void blur_image_surface (cairo_surface_t *surface, int radius); void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius); + void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma); -void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height); +void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) + __attribute__ ((__target__ ("no-avx"))); + +void blur_impl_avx(uint32_t* src, uint32_t* dst, int width, int height, float sigma); +void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height); + void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma); void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height); diff --git a/blur_simd.c b/blur_simd.c index 8eac808..6dc2ec3 100644 --- a/blur_simd.c +++ b/blur_simd.c @@ -11,6 +11,7 @@ #include #include #include +#include #define ALIGN16 __attribute__((aligned(16))) #define KERNEL_SIZE 15 @@ -25,6 +26,12 @@ // used in SSSE3 implementation. #define SCALE_FACTOR 14 +// AVX intrinsics missing in GCC +#define _mm256_set_m128i(v0, v1) _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1) +#define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0)) +#define _mm256_set_m128(v0, v1) _mm256_insertf128_ps(_mm256_castps128_ps256(v1), (v0), 1) +#define _mm256_setr_m128(v0, v1) _mm256_set_m128((v1), (v0)) + void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) { // prepare kernel float kernel[KERNEL_SIZE]; @@ -124,6 +131,108 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, } } +void blur_impl_avx(uint32_t *src, uint32_t *dst, int width, int height, float sigma) { + // prepare kernel + float kernel[KERNEL_SIZE]; + float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0; + + for (int i = 0; i < KERNEL_SIZE; i++) { + float x = HALF_KERNEL - i; + kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma)); + sum += kernel[i]; + } + + // normalize kernel + for (int i = 0; i < KERNEL_SIZE; i++) + kernel[i] /= sum; + + // horizontal pass includes image transposition: + // instead of writing pixel src[x] to dst[x], + // we write it to transposed location. + // (to be exact: dst[height * current_column + current_row]) + blur_impl_horizontal_pass_avx(src, dst, kernel, width, height); + blur_impl_horizontal_pass_avx(dst, src, kernel, height, width); +} + +void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) { + __m256 kernels[HALF_KERNEL]; + for (int i = 0, k = 0; i < HALF_KERNEL; i++, k += 2) + kernels[i] = _mm256_setr_m128(_mm_set1_ps(kernel[k]), _mm_set1_ps(kernel[k+1])); + + for (int row = 0; row < height; row++) { + for (int column = 0; column < width; column++, src++) { + __m128i rgbaIn[REGISTERS_CNT]; + + // handle borders + int leftBorder = column < HALF_KERNEL; + int rightBorder = column > width - HALF_KERNEL; + if (leftBorder || rightBorder) { + uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16; + int i = 0; + if (leftBorder) { + // for kernel size 7x7 and column == 0, we have: + // x x x P0 P1 P2 P3 + // first loop mirrors P{0..3} to fill x's, + // second one loads P{0..3} + for (; i < HALF_KERNEL - column; i++) + _rgbaIn[i] = *(src + (HALF_KERNEL - i)); + for (; i < KERNEL_SIZE; i++) + _rgbaIn[i] = *(src - (HALF_KERNEL - i)); + } else { + for (; i < width - column; i++) + _rgbaIn[i] = *(src + i); + for (int k = 0; i < KERNEL_SIZE; i++, k++) + _rgbaIn[i] = *(src - k); + } + + for (int k = 0; k < REGISTERS_CNT; k++) + rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k)); + } else { + for (int k = 0; k < REGISTERS_CNT; k++) + rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL)); + } + + // unpack each pixel, convert to float, + // multiply by corresponding kernel value + // and add to accumulator + __m128i tmp; + __m128i zero = _mm_setzero_si128(); + __m128 rgba_ps_128; + __m256 rgba_ps; + __m256 acc = _mm256_setzero_ps(); + int counter = 0; + + for (int i = 0; i < 3; i++) + { + tmp = _mm_unpacklo_epi8(rgbaIn[i], zero); + rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero), + _mm_unpackhi_epi16(tmp, zero))); + acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++])); + + tmp = _mm_unpackhi_epi8(rgbaIn[i], zero); + rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero), + _mm_unpackhi_epi16(tmp, zero))); + acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++])); + } + + tmp = _mm_unpacklo_epi8(rgbaIn[3], zero); + rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero), + _mm_unpackhi_epi16(tmp, zero))); + acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter])); + + tmp = _mm_unpackhi_epi8(rgbaIn[3], zero); + rgba_ps_128 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero)); + rgba_ps_128 = _mm_mul_ps(rgba_ps_128, _mm_set1_ps(kernel[KERNEL_SIZE-1])); + rgba_ps_128 = _mm_add_ps(rgba_ps_128, _mm_add_ps(_mm256_extractf128_ps(acc, 0), + _mm256_extractf128_ps(acc, 1))); + + __m128i rgbaOut = _mm_packs_epi32(_mm_cvtps_epi32(rgba_ps_128), zero); + rgbaOut = _mm_packus_epi16(rgbaOut, zero); + *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut); + } + } +} + void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) { // prepare kernel float kernelf[KERNEL_SIZE];