From 9f8496441c777849c31d2a3965dc5168de5fe68b Mon Sep 17 00:00:00 2001 From: Chris Guillott Date: Tue, 5 Dec 2017 22:07:38 -0500 Subject: [PATCH] blurring stuff should work perfectly fine now --- blur.c | 16 +++++++++------- blur_simd.c | 23 ++++++++++++++++++++--- configure.ac | 4 ++-- m4/ax_check_enable_debug.m4 | 10 +++++----- m4/ax_code_coverage.m4 | 4 ++-- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/blur.c b/blur.c index 94ff42a..b91ffe6 100644 --- a/blur.c +++ b/blur.c @@ -32,7 +32,6 @@ blur_image_surface (cairo_surface_t *surface, int radius) { cairo_surface_t *tmp; int width, height; -// int src_stride, dst_stride; uint32_t *src, *dst; if (cairo_surface_status (surface)) @@ -64,15 +63,18 @@ blur_image_surface (cairo_surface_t *surface, int radius) return; src = (uint32_t*)cairo_image_surface_get_data (surface); -// src_stride = cairo_image_surface_get_stride (surface); dst = (uint32_t*)cairo_image_surface_get_data (tmp); -// dst_stride = cairo_image_surface_get_stride (tmp); - - //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000); - //blur_impl_sse2(src, dst, width, height, 4.5); + +#ifdef __SSE4_1__ blur_impl_ssse3(src, dst, width, height, 4.5); - +#elif __SSE2__ + blur_impl_sse2(src, dst, width, height, 4.5); +#else + int src_stride = cairo_image_surface_get_stride (surface); + int dst_stride = cairo_image_surface_get_stride (tmp); + blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000); +#endif cairo_surface_destroy (tmp); cairo_surface_flush (surface); cairo_surface_mark_dirty (surface); diff --git a/blur_simd.c b/blur_simd.c index 52954ed..9ee4abd 100644 --- a/blur_simd.c +++ b/blur_simd.c @@ -51,6 +51,7 @@ void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float s } void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) { + uint32_t* o_src = src; for (int row = 0; row < height; row++) { for (int column = 0; column < width; column++, src++) { __m128i rgbaIn[REGISTERS_CNT]; @@ -59,7 +60,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int leftBorder = column < HALF_KERNEL; int rightBorder = column > width - HALF_KERNEL; if (leftBorder || rightBorder) { - uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16; + uint32_t _rgbaIn[KERNEL_SIZE + 1] ALIGN16; int i = 0; if (leftBorder) { // for kernel size 7x7 and column == 0, we have: @@ -80,8 +81,22 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, for (int k = 0; k < REGISTERS_CNT; k++) rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k)); } else { - for (int k = 0; k < REGISTERS_CNT; k++) + for (int k = 0; k < REGISTERS_CNT; k++) { +#if 0 + printf("%p -> %p (%ld) || %p->%p\n", + o_src, + o_src + (height * width), + o_src + (height * width) - src, + src + 4*k - HALF_KERNEL, + ((__m128i*)src + 4*k - HALF_KERNEL) + 1 + ); +#endif + // if this copy would go out of bounds, break + if ((long long) (((__m128i*) src + 4*k - HALF_KERNEL) + 1) + > (long long) (o_src + (height * width))) + break; rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL)); + } } // unpack each pixel, convert to float, @@ -198,7 +213,9 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne } } else { for (int k = 0; k < REGISTERS_CNT; k++) { - if ((long long)(((__m128i*) src + 4*k - HALF_KERNEL) + 1) > (long long)((o_src + (width * height)))) break; + if ((long long) (((__m128i*) src + 4*k - HALF_KERNEL) + 1) + > (long long) (o_src + (height * width))) + break; #if 0 printf("K: %d; p: %p -> %p\n", k, src+4*k - HALF_KERNEL, ((__m128i*) (src +4*k - HALF_KERNEL)) + 1); printf("%p->%p, %p->%p (%ld)\n", (__m128i*) src + 4*k - HALF_KERNEL, ((__m128i*) src + 4*k - HALF_KERNEL) + 1, o_src, o_src + (width * height), o_src + (width * height) - src); diff --git a/configure.ac b/configure.ac index 319a6bf..0ae5567 100644 --- a/configure.ac +++ b/configure.ac @@ -101,8 +101,8 @@ AC_PROG_LN_S AM_PROG_AR AX_FLAGS_WARN_ALL -AX_APPEND_FLAG([-msse4.1], [AM_CFLAGS]) -AX_APPEND_FLAG([-O2], [AM_CFLAGS]) +AX_APPEND_FLAG([-march=native], [AM_CFLAGS]) + AX_APPEND_FLAG([-O2], [AM_CFLAGS]) AX_APPEND_FLAG([-funroll-loops], [AM_CFLAGS]) AX_APPEND_FLAG([-std=gnu99], [AM_CFLAGS]) AX_CHECK_COMPILE_FLAG([-Wunused-value], [AX_APPEND_FLAG([-Wunused-value], [AM_CFLAGS])]) diff --git a/m4/ax_check_enable_debug.m4 b/m4/ax_check_enable_debug.m4 index 56c9fc1..f99d75f 100644 --- a/m4/ax_check_enable_debug.m4 +++ b/m4/ax_check_enable_debug.m4 @@ -76,11 +76,11 @@ AC_DEFUN([AX_CHECK_ENABLE_DEBUG],[ AS_CASE([$enable_debug], [yes],[ AC_MSG_RESULT(yes) - CFLAGS="${CFLAGS} -g -O2" - CXXFLAGS="${CXXFLAGS} -g -O2" - FFLAGS="${FFLAGS} -g -O2" - FCFLAGS="${FCFLAGS} -g -O2" - OBJCFLAGS="${OBJCFLAGS} -g -O2" + CFLAGS="${CFLAGS} -g -O0" + CXXFLAGS="${CXXFLAGS} -g -O0" + FFLAGS="${FFLAGS} -g -O0" + FCFLAGS="${FCFLAGS} -g -O0" + OBJCFLAGS="${OBJCFLAGS} -g -O0" ], [info],[ AC_MSG_RESULT(info) diff --git a/m4/ax_code_coverage.m4 b/m4/ax_code_coverage.m4 index eebf062..6c985eb 100644 --- a/m4/ax_code_coverage.m4 +++ b/m4/ax_code_coverage.m4 @@ -141,8 +141,8 @@ AC_DEFUN([AX_CODE_COVERAGE],[ dnl Build the code coverage flags CODE_COVERAGE_CPPFLAGS="-DNDEBUG" - CODE_COVERAGE_CFLAGS="-O2 -g -fprofile-arcs -ftest-coverage" - CODE_COVERAGE_CXXFLAGS="-O2 -g -fprofile-arcs -ftest-coverage" + CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" + CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" CODE_COVERAGE_LDFLAGS="-lgcov" AC_SUBST([CODE_COVERAGE_CPPFLAGS])