Chris Guillott
8 years ago
9 changed files with 293 additions and 3 deletions
@ -0,0 +1,145 @@ |
|||
/*
|
|||
* Copyright © 2008 Kristian Høgsberg |
|||
* Copyright © 2009 Chris Wilson |
|||
* |
|||
* Permission to use, copy, modify, distribute, and sell this software and its |
|||
* documentation for any purpose is hereby granted without fee, provided that |
|||
* the above copyright notice appear in all copies and that both that copyright |
|||
* notice and this permission notice appear in supporting documentation, and |
|||
* that the name of the copyright holders not be used in advertising or |
|||
* publicity pertaining to distribution of the software without specific, |
|||
* written prior permission. The copyright holders make no representations |
|||
* about the suitability of this software for any purpose. It is provided "as |
|||
* is" without express or implied warranty. |
|||
* |
|||
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, |
|||
* INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO |
|||
* EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR |
|||
* CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, |
|||
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER |
|||
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE |
|||
* OF THIS SOFTWARE. |
|||
*/ |
|||
|
|||
#include <math.h> |
|||
#include "blur.h" |
|||
|
|||
/* Performs a simple 2D Gaussian blur of standard devation @sigma surface @surface. */ |
|||
void |
|||
blur_image_surface (cairo_surface_t *surface, int sigma) |
|||
{ |
|||
cairo_surface_t *tmp; |
|||
int width, height; |
|||
uint32_t *src, *dst; |
|||
|
|||
if (cairo_surface_status (surface)) |
|||
return; |
|||
|
|||
width = cairo_image_surface_get_width (surface); |
|||
height = cairo_image_surface_get_height (surface); |
|||
|
|||
switch (cairo_image_surface_get_format (surface)) { |
|||
case CAIRO_FORMAT_A1: |
|||
default: |
|||
/* Don't even think about it! */ |
|||
return; |
|||
|
|||
case CAIRO_FORMAT_A8: |
|||
/* Handle a8 surfaces by effectively unrolling the loops by a
|
|||
* factor of 4 - this is safe since we know that stride has to be a |
|||
* multiple of uint32_t. */ |
|||
width /= 4; |
|||
break; |
|||
|
|||
case CAIRO_FORMAT_RGB24: |
|||
case CAIRO_FORMAT_ARGB32: |
|||
break; |
|||
} |
|||
|
|||
tmp = cairo_image_surface_create (CAIRO_FORMAT_ARGB32, width, height); |
|||
if (cairo_surface_status (tmp)) |
|||
return; |
|||
|
|||
src = (uint32_t*)cairo_image_surface_get_data (surface); |
|||
dst = (uint32_t*)cairo_image_surface_get_data (tmp); |
|||
|
|||
// according to a paper by Peter Kovesi [1], box filter of width w, equals to Gaussian blur of following sigma:
|
|||
// σ_av = sqrt((w*w-1)/12)
|
|||
// for our 7x7 filter we have σ_av = 2.0.
|
|||
// applying the same Gaussian filter n times results in σ_n = sqrt(n*σ_av*σ_av) [2]
|
|||
// after some trivial math, we arrive at n = ((σ_d)/(σ_av))^2
|
|||
// since it's a box blur filter, n >= 3
|
|||
//
|
|||
// [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
|
|||
// [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
|
|||
|
|||
int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV)); |
|||
if (n < 3) n = 3; |
|||
|
|||
for (int i = 0; i < n; i++) |
|||
{ |
|||
// horizontal pass includes image transposition:
|
|||
// instead of writing pixel src[x] to dst[x],
|
|||
// we write it to transposed location.
|
|||
// (to be exact: dst[height * current_column + current_row])
|
|||
#ifdef __SSE2__ |
|||
blur_impl_horizontal_pass_sse2(src, dst, width, height); |
|||
blur_impl_horizontal_pass_sse2(dst, src, height, width); |
|||
#else |
|||
blur_impl_horizontal_pass_generic(src, dst, width, height); |
|||
blur_impl_horizontal_pass_generic(dst, src, height, width); |
|||
#endif |
|||
} |
|||
|
|||
cairo_surface_destroy (tmp); |
|||
cairo_surface_flush (surface); |
|||
cairo_surface_mark_dirty (surface); |
|||
} |
|||
|
|||
void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height) { |
|||
for (int row = 0; row < height; row++) { |
|||
for (int column = 0; column < width; column++, src++) { |
|||
uint32_t rgbaIn[KERNEL_SIZE]; |
|||
|
|||
// handle borders
|
|||
int leftBorder = column < HALF_KERNEL; |
|||
int rightBorder = column > width - HALF_KERNEL; |
|||
int i = 0; |
|||
if (leftBorder) { |
|||
// for kernel size 7x7 and column == 0, we have:
|
|||
// x x x P0 P1 P2 P3
|
|||
// first loop mirrors P{0..3} to fill x's,
|
|||
// second one loads P{0..3}
|
|||
for (; i < HALF_KERNEL - column; i++) |
|||
rgbaIn[i] = *(src + (HALF_KERNEL - i)); |
|||
for (; i < KERNEL_SIZE; i++) |
|||
rgbaIn[i] = *(src - (HALF_KERNEL - i)); |
|||
} else if (rightBorder) { |
|||
for (; i < width - column; i++) |
|||
rgbaIn[i] = *(src + i); |
|||
for (int k = 0; i < KERNEL_SIZE; i++, k++) |
|||
rgbaIn[i] = *(src - k); |
|||
} else { |
|||
for (; i < KERNEL_SIZE; i++) |
|||
rgbaIn[i] = *(src + i - HALF_KERNEL); |
|||
} |
|||
|
|||
uint32_t acc[4] = {0}; |
|||
|
|||
for (i = 0; i < KERNEL_SIZE; i++) { |
|||
acc[0] += (rgbaIn[i] & 0xFF000000) >> 24; |
|||
acc[1] += (rgbaIn[i] & 0x00FF0000) >> 16; |
|||
acc[2] += (rgbaIn[i] & 0x0000FF00) >> 8; |
|||
acc[3] += (rgbaIn[i] & 0x000000FF) >> 0; |
|||
} |
|||
|
|||
for(i = 0; i < 4; i++) |
|||
acc[i] *= 1.0/KERNEL_SIZE; |
|||
|
|||
*(dst + height * column + row) = (acc[0] << 24) | |
|||
(acc[1] << 16) | |
|||
(acc[2] << 8 ) | |
|||
(acc[3] << 0); |
|||
} |
|||
} |
|||
} |
@ -0,0 +1,16 @@ |
|||
#ifndef _BLUR_H |
|||
#define _BLUR_H |
|||
|
|||
#include <stdint.h> |
|||
#include <cairo.h> |
|||
|
|||
#define KERNEL_SIZE 7 |
|||
#define SIGMA_AV 2 |
|||
#define HALF_KERNEL KERNEL_SIZE / 2 |
|||
|
|||
void blur_image_surface(cairo_surface_t *surface, int sigma); |
|||
void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height); |
|||
void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height); |
|||
|
|||
#endif |
|||
|
@ -0,0 +1,73 @@ |
|||
/*
|
|||
* vim:ts=4:sw=4:expandtab |
|||
* |
|||
* © 2016 Sebastian Frysztak |
|||
* |
|||
* See LICENSE for licensing information |
|||
* |
|||
*/ |
|||
|
|||
#include "blur.h" |
|||
#include <xmmintrin.h> |
|||
|
|||
// number of xmm registers needed to store input pixels for given kernel size
|
|||
#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4 |
|||
|
|||
void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) { |
|||
for (int row = 0; row < height; row++) { |
|||
for (int column = 0; column < width; column++, src++) { |
|||
__m128i rgbaIn[REGISTERS_CNT]; |
|||
|
|||
// handle borders
|
|||
int leftBorder = column < HALF_KERNEL; |
|||
int rightBorder = column > width - HALF_KERNEL; |
|||
uint32_t _rgbaIn[KERNEL_SIZE] __attribute__((aligned(16))); |
|||
int i = 0; |
|||
if (leftBorder) { |
|||
// for kernel size 7x7 and column == 0, we have:
|
|||
// x x x P0 P1 P2 P3
|
|||
// first loop mirrors P{0..3} to fill x's,
|
|||
// second one loads P{0..3}
|
|||
for (; i < HALF_KERNEL - column; i++) |
|||
_rgbaIn[i] = *(src + (HALF_KERNEL - i)); |
|||
for (; i < KERNEL_SIZE; i++) |
|||
_rgbaIn[i] = *(src - (HALF_KERNEL - i)); |
|||
|
|||
for (int k = 0; k < REGISTERS_CNT; k++) |
|||
rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k)); |
|||
} else if (rightBorder) { |
|||
for (; i < width - column; i++) |
|||
_rgbaIn[i] = *(src + i); |
|||
for (int k = 0; i < KERNEL_SIZE; i++, k++) |
|||
_rgbaIn[i] = *(src - k); |
|||
|
|||
for (int k = 0; k < REGISTERS_CNT; k++) |
|||
rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k)); |
|||
} else { |
|||
for (int k = 0; k < REGISTERS_CNT; k++) |
|||
rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL)); |
|||
} |
|||
|
|||
__m128i zero = _mm_setzero_si128(); |
|||
__m128i acc = _mm_setzero_si128(); |
|||
|
|||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero)); |
|||
acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero)); |
|||
acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero)); |
|||
|
|||
// kernel size equals to 7, but we can only load multiples of 4 pixels
|
|||
// we have to set 8th pixel to zero
|
|||
acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0), |
|||
_mm_unpackhi_epi8(rgbaIn[1], zero))); |
|||
acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), |
|||
_mm_unpackhi_epi16(acc, zero)); |
|||
|
|||
// multiplication is significantly faster than division
|
|||
acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc), |
|||
_mm_set1_ps(1.0/KERNEL_SIZE))); |
|||
|
|||
*(dst + height * column + row) = |
|||
_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero)); |
|||
} |
|||
} |
|||
} |
Loading…
Reference in new issue