// i3lock-color/blur_simd.c

/*
 * vim:ts=4:sw=4:expandtab
 *
 * © 2016 Sebastian Frysztak
 *
 * See LICENSE for licensing information
 *
 */

#include "blur.h"
#include <xmmintrin.h>

// Number of xmm registers (four 32-bit pixels each) used to hold the input
// pixels of one kernel window: KERNEL_SIZE / 4 rounded to nearest, i.e. 2 for
// the 7-pixel kernel. The expansion is fully parenthesised so the macro keeps
// its value inside larger expressions such as 4 * REGISTERS_CNT.
#define REGISTERS_CNT ((KERNEL_SIZE + 4 / 2) / 4)

/*
 * Horizontal box-blur pass using SSE2; the result is written transposed.
 *
 * src holds width * height pixels, row after row, each pixel being four
 * 8-bit channels packed into one uint32_t. For every pixel a window of
 * KERNEL_SIZE pixels from the same row is summed per channel and the sums are
 * multiplied by 1/KERNEL_SIZE. The value computed for (row, column) is stored
 * at dst[height * column + row], so dst must have room for width * height
 * pixels.
 *
 * Windows that would extend past the left or right end of a row are assembled
 * pixel by pixel in a small staging buffer, re-using pixels from inside the
 * row in place of the missing ones. All other windows are loaded directly
 * from src, except when the 4 * REGISTERS_CNT pixel load would run past the
 * end of the buffer; then the window is staged as well.
 *
 * The accumulation is written for KERNEL_SIZE == 7: it consumes exactly two
 * registers and masks out the eighth loaded pixel.
 *
 * NOTE(review): the border fills reach up to HALF_KERNEL pixels to the right
 * of a left-border pixel and several pixels to the left of a right-border
 * pixel. That stays inside the row when width >= KERNEL_SIZE; confirm that
 * callers never pass narrower images.
 */
void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) {
    // nothing to blur; also keeps the end-of-buffer pointer below well defined
    if (width <= 0 || height <= 0)
        return;

    // Bind the size macros to locals: arithmetic on the locals cannot be
    // altered by operator precedence inside the macro expansions.
    const int half = HALF_KERNEL;
    const int regs = REGISTERS_CNT;

    // one past the last input pixel
    const uint32_t *const end = src + (long long)width * height;

    // Staging area for windows that cannot be loaded straight from src.
    // The fill loops only write the first KERNEL_SIZE elements, so clear the
    // padding element once instead of loading an indeterminate value.
    uint32_t staged[KERNEL_SIZE + 1] __attribute__((aligned(16)));
    staged[KERNEL_SIZE] = 0;

    const __m128i zero = _mm_setzero_si128();
    // all-ones in the upper 64 bits; used with andnot to drop the 8th pixel
    const __m128i drop_eighth = _mm_set_epi32(-1, -1, 0, 0);
    // multiplication is significantly faster than division
    const __m128 scale = _mm_set1_ps((float)(1.0 / KERNEL_SIZE));

    for (int row = 0; row < height; row++) {
        for (int column = 0; column < width; column++, src++) {
            __m128i rgbaIn[REGISTERS_CNT];
            const uint32_t *window = staged;
            int i = 0;

            if (column < half) {
                // for kernel size 7 and column == 0, we have:
                // x x x P0 P1 P2 P3
                // first loop mirrors P{1..3} to fill x's,
                // second one loads P{0..3}
                for (; i < half - column; i++)
                    staged[i] = src[half - i];
                for (; i < KERNEL_SIZE; i++)
                    staged[i] = src[i - half];
            } else if (column >= width - half) {
                // '>=': at column == width - half the centred window would
                // already include the pixel at column 'width', which belongs
                // to the next row (or lies past the buffer on the last row).
                // Take what is left of the row, then walk backwards from the
                // current pixel to fill the remaining slots.
                for (; i < width - column; i++)
                    staged[i] = src[i];
                for (int k = 0; i < KERNEL_SIZE; i++, k++)
                    staged[i] = src[-k];
            } else if (end - (src - half) >= 4 * regs) {
                // the whole 4 * regs pixel load lies inside the buffer
                window = src - half;
            } else {
                // The KERNEL_SIZE window pixels are inside the row, but the
                // padded vector load would read past the end of the buffer:
                // copy just the window.
                for (; i < KERNEL_SIZE; i++)
                    staged[i] = src[i - half];
            }

            for (int k = 0; k < regs; k++)
                rgbaIn[k] = _mm_loadu_si128((const __m128i *)(window + 4 * k));

            // widen the 8-bit channels to 16 bits and sum pixels 0..5
            __m128i acc = _mm_unpacklo_epi8(rgbaIn[0], zero);
            acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));
            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));

            // kernel size equals 7, but we can only load multiples of 4 pixels:
            // add the 7th pixel and zero out the 8th
            acc = _mm_add_epi16(acc, _mm_andnot_si128(drop_eighth,
                                                      _mm_unpackhi_epi8(rgbaIn[1], zero)));

            // fold the two 16-bit pixel halves into one 32-bit sum per channel
            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),
                                _mm_unpackhi_epi16(acc, zero));

            // scale the sums to averages in floating point, then back to integers
            acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc), scale));

            // pack the four 32-bit channel values back into one pixel and
            // store it transposed; index computed in 64 bits to avoid int overflow
            dst[(long long)height * column + row] =
                (uint32_t)_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));
        }
    }
}
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`/*`
			`* vim:ts=4:sw=4:expandtab`
			`*`
revert back to better blurring behaviour 7 years ago			`* © 2016 Sebastian Frysztak`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`*`
			`* See LICENSE for licensing information`
			`*`
			`*/`

			`#include "blur.h"`
			`#include <xmmintrin.h>`

revert back to better blurring behaviour 7 years ago			`// number of xmm registers needed to store input pixels for given kernel size`
Extend kernel size to 15x15. 8 years ago			`#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4`

revert back to better blurring behaviour 7 years ago			`void blur_impl_horizontal_pass_sse2(uint32_t src, uint32_t dst, int width, int height) {`
blurring stuff should work perfectly fine now 7 years ago			`uint32_t* o_src = src;`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`for (int row = 0; row < height; row++) {`
			`for (int column = 0; column < width; column++, src++) {`
Extend kernel size to 15x15. 8 years ago			`__m128i rgbaIn[REGISTERS_CNT];`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago
			`// handle borders`
			`int leftBorder = column < HALF_KERNEL;`
Improve border handling for larger kernels. 8 years ago			`int rightBorder = column > width - HALF_KERNEL;`
revert back to better blurring behaviour 7 years ago			`uint32_t _rgbaIn[KERNEL_SIZE + 1] __attribute__((aligned(16)));`
			`int i = 0;`
			`if (leftBorder) {`
			`// for kernel size 7x7 and column == 0, we have:`
			`// x x x P0 P1 P2 P3`
			`// first loop mirrors P{0..3} to fill x's,`
			`// second one loads P{0..3}`
			`for (; i < HALF_KERNEL - column; i++)`
			`_rgbaIn[i] = *(src + (HALF_KERNEL - i));`
			`for (; i < KERNEL_SIZE; i++)`
			`_rgbaIn[i] = *(src - (HALF_KERNEL - i));`
Slightly refactor border handling code. 8 years ago
			`for (int k = 0; k < REGISTERS_CNT; k++)`
			`rgbaIn[k] = _mm_load_si128((__m128i)(_rgbaIn + 4k));`
revert back to better blurring behaviour 7 years ago			`} else if (rightBorder) {`
			`for (; i < width - column; i++)`
			`_rgbaIn[i] = *(src + i);`
			`for (int k = 0; i < KERNEL_SIZE; i++, k++)`
			`_rgbaIn[i] = *(src - k);`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 8 years ago
revert back to better blurring behaviour 7 years ago			`for (int k = 0; k < REGISTERS_CNT; k++)`
Extend kernel size to 15x15. 8 years ago			`rgbaIn[k] = _mm_load_si128((__m128i)(_rgbaIn + 4k));`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`} else {`
first commit towards fixing this 7 years ago			`for (int k = 0; k < REGISTERS_CNT; k++) {`
revert back to better blurring behaviour 7 years ago			`if ((long long) (((__m128i) src + 4k - HALF_KERNEL) + 1)`
blurring stuff should work perfectly fine now 7 years ago			`> (long long) (o_src + (height * width)))`
			`break;`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 8 years ago			`rgbaIn[k] = _mm_loadu_si128((__m128i)(src + 4k - HALF_KERNEL));`
first commit towards fixing this 7 years ago			`}`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`}`

			`__m128i zero = _mm_setzero_si128();`
SSE2: switch from Gaussian to box blur 8 years ago			`__m128i acc = _mm_setzero_si128();`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago
revert back to better blurring behaviour 7 years ago			`acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));`
			`acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));`
			`acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 8 years ago
revert back to better blurring behaviour 7 years ago			`// kernel size equals to 7, but we can only load multiples of 4 pixels`
			`// we have to set 8th pixel to zero`
			`acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0),`
			`_mm_unpackhi_epi8(rgbaIn[1], zero)));`
			`acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero),`
			`_mm_unpackhi_epi16(acc, zero));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 8 years ago
revert back to better blurring behaviour 7 years ago			`// multiplication is significantly faster than division`
			`acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),`
			`_mm_set1_ps(1.0/KERNEL_SIZE)));`
Add SSSE3-based blur implementation. Calculations are done on integer, rather than floating point numbers, so this implementation is not as accurate (but when scale factor is reasonable enough, no artifacs are visible). It is, however, faster by a factor of ~3. 8 years ago
			`(dst + height column + row) =`
revert back to better blurring behaviour 7 years ago			`_mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));`
Add SSE2-optimized blur. About 4-6 times faster than naive implementation. 8 years ago			`}`
			`}`
			`}`
revert back to better blurring behaviour 7 years ago