From ade9a207881fca816fe2a630b18e0d1ff2655e3e Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Sat, 22 Oct 2016 14:32:35 +0200
Subject: [PATCH 1/6] Isolate blur implementation to a function.

This will allow easier switching between naive/SSE2/AVX implementations.
---
 blur.c | 39 +++++++++++++++++++++++----------------
 blur.h |  4 ++++
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/blur.c b/blur.c
index a71b984..b0349d5 100644
--- a/blur.c
+++ b/blur.c
@@ -22,9 +22,7 @@
  */

 #include <math.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <cairo.h>
+#include "blur.h"

 #define ARRAY_LENGTH(a) (sizeof (a) / sizeof (a)[0])

@@ -35,13 +33,7 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     cairo_surface_t *tmp;
     int width, height;
     int src_stride, dst_stride;
-    int x, y, z, w;
-    uint8_t *src, *dst;
-    uint32_t *s, *d, a, p;
-    int i, j, k;
-    uint8_t kernel[17];
-    const int size = ARRAY_LENGTH (kernel);
-    const int half = size / 2;
+    uint32_t *src, *dst;

     if (cairo_surface_status (surface))
         return;
@@ -71,12 +63,31 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     if (cairo_surface_status (tmp))
         return;

-    src = cairo_image_surface_get_data (surface);
+    src = (uint32_t*)cairo_image_surface_get_data (surface);
     src_stride = cairo_image_surface_get_stride (surface);
-    dst = cairo_image_surface_get_data (tmp);
+    dst = (uint32_t*)cairo_image_surface_get_data (tmp);
     dst_stride = cairo_image_surface_get_stride (tmp);

+    blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+
+    cairo_surface_destroy (tmp);
+    cairo_surface_flush (surface);
+    cairo_surface_mark_dirty (surface);
+}
+
+void blur_impl_naive(uint32_t* _src, uint32_t* _dst, int width, int height, int src_stride, int dst_stride, int radius)
+{
+    int x, y, z, w;
+    uint32_t *s, *d, a, p;
+    int i, j, k;
+    uint8_t kernel[17];
+    const int size = ARRAY_LENGTH (kernel);
+    const int half = size / 2;
+
+    uint8_t *src = (uint8_t*)_src;
+    uint8_t *dst = (uint8_t*)_dst;
+
     a = 0;
     for (i = 0; i < size; i++) {
         double f = i - half;
@@ -135,9 +146,5 @@
             d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
         }
     }
-
-    cairo_surface_destroy (tmp);
-    cairo_surface_flush (surface);
-    cairo_surface_mark_dirty (surface);
 }

diff --git a/blur.h b/blur.h
index c3d13fd..1c1eb7a 100644
--- a/blur.h
+++ b/blur.h
@@ -1,7 +1,11 @@
 #ifndef _BLUR_H
 #define _BLUR_H

+#include <stdint.h>
+#include <cairo.h>
+
 void blur_image_surface (cairo_surface_t *surface, int radius);
+void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);

 #endif

From fb5dbbe661c420509c1aa71dc3e8ff742d9d59fe Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Sat, 22 Oct 2016 15:30:27 +0200
Subject: [PATCH 2/6] Add SSE2-optimized blur.

About 4-6 times faster than the naive implementation.
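(For context before the SSE2 patch: blur_image_surface from patch 1 blurs a Cairo ARGB32 image surface in place. A minimal caller, using only standard Cairo API, is sketched below; the surface size, radius, and output filename are arbitrary, and the SIMD implementations introduced in the following patches ignore the radius argument in favor of a hardcoded sigma.)

#include <cairo.h>
#include "blur.h"

int main(void) {
    cairo_surface_t *surface =
        cairo_image_surface_create(CAIRO_FORMAT_ARGB32, 640, 480);

    /* ... draw something onto the surface ... */

    blur_image_surface(surface, 16);    // blurs the surface in place
    cairo_surface_write_to_png(surface, "blurred.png");
    cairo_surface_destroy(surface);
    return 0;
}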
---
 blur.c      |   3 +-
 blur.h      |   2 +
 blur_simd.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 blur_simd.c

diff --git a/blur.c b/blur.c
index b0349d5..8434e81 100644
--- a/blur.c
+++ b/blur.c
@@ -69,7 +69,8 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst = (uint32_t*)cairo_image_surface_get_data (tmp);
     dst_stride = cairo_image_surface_get_stride (tmp);

-    blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+    //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+    blur_impl_sse2(src, dst, width, height, 2.5);

     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur.h b/blur.h
index 1c1eb7a..2a5f45c 100644
--- a/blur.h
+++ b/blur.h
@@ -6,6 +6,8 @@

 void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
+void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);

 #endif
diff --git a/blur_simd.c b/blur_simd.c
new file mode 100644
index 0000000..2186cf5
--- /dev/null
+++ b/blur_simd.c
@@ -0,0 +1,116 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ *
+ * © 2016 Sebastian Frysztak
+ *
+ * See LICENSE for licensing information
+ *
+ */
+
+#include "blur.h"
+#include <math.h>
+#include <emmintrin.h>
+
+#define ALIGN16 __attribute__((aligned(16)))
+#define KERNEL_SIZE 7
+#define HALF_KERNEL KERNEL_SIZE / 2
+
+void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
+    // prepare kernel
+    float kernel[KERNEL_SIZE];
+    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
+
+    for (int i = 0; i < KERNEL_SIZE; i++) {
+        float x = HALF_KERNEL - i;
+        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
+        sum += kernel[i];
+    }
+
+    // normalize kernel
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernel[i] /= sum;
+
+    // horizontal pass includes image transposition:
+    // instead of writing pixel src[x] to dst[x],
+    // we write it to transposed location.
+    // (to be exact: dst[height * current_column + current_row])
+    blur_impl_horizontal_pass_sse2(src, dst, kernel, width, height);
+    blur_impl_horizontal_pass_sse2(dst, src, kernel, height, width);
+}
+
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
+    for (int row = 0; row < height; row++) {
+        // remember first and last pixel in a row
+        // (used to handle borders)
+        uint32_t firstPixel = *src;
+        uint32_t lastPixel = *(src + width - 1);
+
+        for (int column = 0; column < width; column++, src++) {
+            __m128i rgbaIn1, rgbaIn2;
+
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column + HALF_KERNEL >= width;
+            if (leftBorder || rightBorder) {
+                uint32_t rgbaIn[KERNEL_SIZE] ALIGN16;
+                int i = 0;
+                if (leftBorder) {
+                    // for kernel size 7x7 and column == 0, we have:
+                    // x x x P0 P1 P2 P3
+                    // first loop fills x's with P0, second one loads P{0..3}
+                    for (; i < HALF_KERNEL - column; i++)
+                        rgbaIn[i] = firstPixel;
+                    for (; i < KERNEL_SIZE; i++)
+                        rgbaIn[i] = *(src + i - HALF_KERNEL);
+                } else {
+                    for (; width < column; i++)
+                        rgbaIn[i] = *(src - i - HALF_KERNEL);
+                    for (; i < KERNEL_SIZE; i++)
+                        rgbaIn[i] = lastPixel;
+                }
+
+                rgbaIn1 = _mm_load_si128((__m128i *)(rgbaIn));
+                rgbaIn2 = _mm_load_si128((__m128i *)(rgbaIn + 4));
+            } else {
+                rgbaIn1 = _mm_loadu_si128((__m128i *)(src - 3));
+                rgbaIn2 = _mm_loadu_si128((__m128i *)(src + 1));
+            }
+
+            // unpack each pixel, convert to float,
+            // multiply by corresponding kernel value
+            // and add to accumulator
+            __m128i tmp;
+            __m128i zero = _mm_setzero_si128();
+            __m128 rgba_ps;
+            __m128 acc = _mm_setzero_ps();
+            int counter = 0;
+
+            tmp = _mm_unpacklo_epi8(rgbaIn1, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpackhi_epi8(rgbaIn1, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpacklo_epi8(rgbaIn2, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpackhi_epi8(rgbaIn2, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            __m128i rgbaOut = _mm_cvtps_epi32(acc);
+            rgbaOut = _mm_packs_epi32(rgbaOut, zero);
+            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
+            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
+        }
+    }
+}

From a48ddb61db973684d89efaddcca3d7803bfaf276 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Sat, 22 Oct 2016 15:31:53 +0200
Subject: [PATCH 3/6] Build with -O2.
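(A side note on the SSE2 pass above: the transposing store is easier to follow in scalar form. The sketch below is mine, not part of the patches — blur_pass_scalar is a hypothetical name, and it clamps to the edge pixel at the borders the way patch 2 does. Running the same pass twice, with width and height swapped, blurs both axes and restores the original orientation.)

#include <stdint.h>

// Scalar model of one transposing horizontal pass: blur row-wise,
// but store each result at dst[height * column + row].
static void blur_pass_scalar(const uint32_t *src, uint32_t *dst,
                             const float *kernel, int ksize,
                             int width, int height) {
    int half = ksize / 2;
    for (int row = 0; row < height; row++) {
        for (int column = 0; column < width; column++) {
            float acc[4] = {0, 0, 0, 0};
            for (int k = 0; k < ksize; k++) {
                int x = column + k - half;
                if (x < 0) x = 0;               // clamp left border
                if (x >= width) x = width - 1;  // clamp right border
                uint32_t p = src[row * width + x];
                for (int c = 0; c < 4; c++)     // accumulate each channel
                    acc[c] += ((p >> (8 * c)) & 0xFF) * kernel[k];
            }
            uint32_t out = 0;
            for (int c = 0; c < 4; c++)
                out |= (uint32_t)(acc[c] + 0.5f) << (8 * c);
            dst[height * column + row] = out;   // transposed store
        }
    }
}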
---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 020beaa..b0a3dcc 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ endif
 CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
+CFLAGS += -O2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
 CFLAGS += $(shell $(PKG_CONFIG) --cflags cairo xcb-dpms xcb-xinerama xcb-atom xcb-image xcb-xkb xkbcommon xkbcommon-x11)

From afe41c5754a2871e4aa5852343363083bc551782 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Fri, 28 Oct 2016 17:35:33 +0200
Subject: [PATCH 4/6] Extend kernel size to 15x15.

---
 blur.c      |  2 +-
 blur_simd.c | 55 ++++++++++++++++++++++++++++++-------------------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/blur.c b/blur.c
index 8434e81..d91f57d 100644
--- a/blur.c
+++ b/blur.c
@@ -70,7 +70,7 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst_stride = cairo_image_surface_get_stride (tmp);

     //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-    blur_impl_sse2(src, dst, width, height, 2.5);
+    blur_impl_sse2(src, dst, width, height, 4.5);

     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur_simd.c b/blur_simd.c
index 2186cf5..1628264 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -12,9 +12,13 @@
 #include <emmintrin.h>

 #define ALIGN16 __attribute__((aligned(16)))
-#define KERNEL_SIZE 7
+#define KERNEL_SIZE 15
 #define HALF_KERNEL KERNEL_SIZE / 2

+// number of xmm registers needed to store
+// input pixels for given kernel size
+#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
+
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -46,34 +50,34 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
         uint32_t lastPixel = *(src + width - 1);

         for (int column = 0; column < width; column++, src++) {
-            __m128i rgbaIn1, rgbaIn2;
+            __m128i rgbaIn[REGISTERS_CNT];

             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column + HALF_KERNEL >= width;
             if (leftBorder || rightBorder) {
-                uint32_t rgbaIn[KERNEL_SIZE] ALIGN16;
+                uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
                 int i = 0;
                 if (leftBorder) {
                     // for kernel size 7x7 and column == 0, we have:
                     // x x x P0 P1 P2 P3
                     // first loop fills x's with P0, second one loads P{0..3}
                     for (; i < HALF_KERNEL - column; i++)
-                        rgbaIn[i] = firstPixel;
+                        _rgbaIn[i] = firstPixel;
                     for (; i < KERNEL_SIZE; i++)
-                        rgbaIn[i] = *(src + i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src + i - HALF_KERNEL);
                 } else {
                     for (; width < column; i++)
-                        rgbaIn[i] = *(src - i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src - i - HALF_KERNEL);
                     for (; i < KERNEL_SIZE; i++)
-                        rgbaIn[i] = lastPixel;
+                        _rgbaIn[i] = lastPixel;
                 }

-                rgbaIn1 = _mm_load_si128((__m128i *)(rgbaIn));
-                rgbaIn2 = _mm_load_si128((__m128i *)(rgbaIn + 4));
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
             } else {
-                rgbaIn1 = _mm_loadu_si128((__m128i *)(src - 3));
-                rgbaIn2 = _mm_loadu_si128((__m128i *)(src + 1));
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
             }

             // unpack each pixel, convert to float,
@@ -85,25 +89,28 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
             __m128 acc = _mm_setzero_ps();
             int counter = 0;

-            tmp = _mm_unpacklo_epi8(rgbaIn1, zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-            tmp = _mm_unpackhi_epi8(rgbaIn1, zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            for (int i = 0; i < 3; i++)
+            {
+                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            }

-            tmp = _mm_unpacklo_epi8(rgbaIn2, zero);
+            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
             rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
             rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));

-            tmp = _mm_unpackhi_epi8(rgbaIn2, zero);
+            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
             rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));

From 3662b8e18714ed9e3e29c7d55c951557fc5ae104 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Fri, 28 Oct 2016 17:36:43 +0200
Subject: [PATCH 5/6] Improve border handling for larger kernels.
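(On the 15-tap kernel introduced in patch 4: the standalone sketch below mirrors the kernel setup in blur_impl_sse2 — same formulas, with a main() wrapper and printfs added by me — and is handy for checking the normalized weights and the REGISTERS_CNT arithmetic, (15 + 2) / 4 = 4 xmm registers.)

#define _GNU_SOURCE   // for M_PI, matching -D_GNU_SOURCE in the Makefile
#include <math.h>
#include <stdio.h>

#define KERNEL_SIZE 15
#define HALF_KERNEL KERNEL_SIZE / 2
// 15 input pixels round up to 4 xmm registers: (15 + 2) / 4 = 4
#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4

int main(void) {
    float sigma = 4.5f, kernel[KERNEL_SIZE], sum = 0;
    float coeff = 1.0f / sqrtf(2 * M_PI * sigma * sigma);

    for (int i = 0; i < KERNEL_SIZE; i++) {
        float x = HALF_KERNEL - i;
        kernel[i] = coeff * expf(-x * x / (2.0f * sigma * sigma));
        sum += kernel[i];
    }
    for (int i = 0; i < KERNEL_SIZE; i++)
        kernel[i] /= sum;                    // weights now sum to 1

    printf("xmm registers needed: %d\n", REGISTERS_CNT);
    for (int i = 0; i < KERNEL_SIZE; i++)
        printf("w[%2d] = %.4f\n", i, kernel[i]);
    return 0;
}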
---
 blur_simd.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blur_simd.c b/blur_simd.c
index 1628264..4cacb9a 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -44,33 +44,29 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
     for (int row = 0; row < height; row++) {
-        // remember first and last pixel in a row
-        // (used to handle borders)
-        uint32_t firstPixel = *src;
-        uint32_t lastPixel = *(src + width - 1);
-
         for (int column = 0; column < width; column++, src++) {
             __m128i rgbaIn[REGISTERS_CNT];

             // handle borders
             int leftBorder = column < HALF_KERNEL;
-            int rightBorder = column + HALF_KERNEL >= width;
+            int rightBorder = column > width - HALF_KERNEL;
             if (leftBorder || rightBorder) {
                 uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
                 int i = 0;
                 if (leftBorder) {
                     // for kernel size 7x7 and column == 0, we have:
                     // x x x P0 P1 P2 P3
-                    // first loop fills x's with P0, second one loads P{0..3}
+                    // first loop mirrors P{0..3} to fill x's,
+                    // second one loads P{0..3}
                     for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = firstPixel;
+                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
                     for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src + i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
                 } else {
-                    for (; width < column; i++)
-                        _rgbaIn[i] = *(src - i - HALF_KERNEL);
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = lastPixel;
+                    for (; i < width - column; i++)
+                        _rgbaIn[i] = *(src + i);
+                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                        _rgbaIn[i] = *(src - k);
                 }

                 for (int k = 0; k < REGISTERS_CNT; k++)

From 72aec8704714f5128e076236b077dd7fedcea9da Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak
Date: Sat, 29 Oct 2016 14:32:49 +0200
Subject: [PATCH 6/6] Add SSSE3-based blur implementation.

Calculations are done on integers rather than on floating-point numbers,
so this implementation is not as accurate (but with a reasonable scale
factor, no artifacts are visible). It is, however, faster by a factor
of ~3.
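(The fixed-point scheme the patch below builds on _mm_maddubs_epi16 can be modeled in scalar code. The sketch is mine, and the weight and pixel values are illustrative only: each normalized float weight w is quantized to round(w * 2^SCALE_FACTOR) as a signed byte, products are accumulated in 16 bits, and the sum is shifted right by SCALE_FACTOR.)

#include <math.h>
#include <stdint.h>
#include <stdio.h>

#define SCALE_FACTOR 7

int main(void) {
    float w = 0.0886f;                  // an illustrative normalized tap weight
    int8_t wq = (int8_t)rintf(w * (1 << SCALE_FACTOR));  // quantized weight: 11
    uint8_t pixel = 200;                // one color channel of one pixel

    // what pmaddubsw computes per (pixel, weight) pair, before the
    // horizontal adds: an unsigned byte times a signed byte, in 16 bits
    int16_t product = pixel * wq;

    printf("quantized weight: %d\n", wq);
    // shifting right by SCALE_FACTOR recovers roughly pixel * w (~17.7)
    printf("scaled back: %d\n", product >> SCALE_FACTOR);
    return 0;
}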
---
 Makefile    |   1 +
 blur.c      |   3 +-
 blur.h      |   2 +
 blur_simd.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b0a3dcc..d979100 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ endif
 CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
+CFLAGS += -mssse3
 CFLAGS += -O2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
diff --git a/blur.c b/blur.c
index d91f57d..a5b0bd3 100644
--- a/blur.c
+++ b/blur.c
@@ -70,7 +70,8 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst_stride = cairo_image_surface_get_stride (tmp);

     //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-    blur_impl_sse2(src, dst, width, height, 4.5);
+    //blur_impl_sse2(src, dst, width, height, 4.5);
+    blur_impl_ssse3(src, dst, width, height, 4.5);

     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur.h b/blur.h
index 2a5f45c..478e2f0 100644
--- a/blur.h
+++ b/blur.h
@@ -8,6 +8,8 @@ void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
+void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height);

 #endif
diff --git a/blur_simd.c b/blur_simd.c
index 4cacb9a..7dcd9a4 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -10,6 +10,7 @@
 #include "blur.h"
 #include <math.h>
 #include <emmintrin.h>
+#include <tmmintrin.h>

 #define ALIGN16 __attribute__((aligned(16)))
 #define KERNEL_SIZE 15
@@ -19,6 +20,11 @@
 // input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4

+// scaling factor for kernel coefficients.
+// higher values cause desaturation.
+// used in SSSE3 implementation.
+#define SCALE_FACTOR 7
+
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -117,3 +123,128 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
         }
     }
 }
+
+void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
+    // prepare kernel
+    float kernelf[KERNEL_SIZE];
+    int8_t kernel[KERNEL_SIZE + 1];
+    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
+
+    for (int i = 0; i < KERNEL_SIZE; i++) {
+        float x = HALF_KERNEL - i;
+        kernelf[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
+        sum += kernelf[i];
+    }
+
+    // normalize kernel
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernelf[i] /= sum;
+
+    // round to nearest integer and convert to int
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernel[i] = (int8_t)rintf(kernelf[i] * (1 << SCALE_FACTOR));
+    kernel[KERNEL_SIZE] = 0;
+
+    // horizontal pass includes image transposition:
+    // instead of writing pixel src[x] to dst[x],
+    // we write it to transposed location.
+    // (to be exact: dst[height * current_column + current_row])
+    blur_impl_horizontal_pass_ssse3(src, dst, kernel, width, height);
+    blur_impl_horizontal_pass_ssse3(dst, src, kernel, height, width);
+}
+
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height) {
+    __m128i _kern = _mm_loadu_si128((__m128i*)kernel);
+    __m128i rgbaIn[REGISTERS_CNT];
+
+    for (int row = 0; row < height; row++) {
+        for (int column = 0; column < width; column++, src++) {
+            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column > width - HALF_KERNEL;
+            if (leftBorder || rightBorder) {
+                int i = 0;
+                if (leftBorder) {
+                    // for kernel size 7x7 and column == 0, we have:
+                    // x x x P0 P1 P2 P3
+                    // first loop mirrors P{0..3} to fill x's,
+                    // second one loads P{0..3}
+                    for (; i < HALF_KERNEL - column; i++)
+                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                    for (; i < KERNEL_SIZE; i++)
+                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+                } else {
+                    for (; i < width - column; i++)
+                        _rgbaIn[i] = *(src + i);
+                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                        _rgbaIn[i] = *(src - k);
+                }
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else {
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
+            }
+
+            // basis of this implementation is _mm_maddubs_epi16 (aka pmaddubsw).
+            // 'rgba' holds 16 unsigned bytes, so 4 pixels.
+            // 'kern' holds 16 signed bytes: kernel values multiplied by (1 << SCALE_FACTOR).
+            // before multiplication takes place, vectors need to be prepared:
+            // 'rgba' is shuffled from R1G1B1A1...R4G4B4A4 to R1R2R3R4...A1A2A3A4
+            // 'kern' is shuffled from w1w2w3w4...w13w14w15w16 to w1w2w3w4 repeated 4 times
+            // then we call _mm_maddubs_epi16 and we get:
+            // --------------------------------------------------------------------------------------
+            // | R1*w1 + R2*w2 | R3*w3 + R4*w4 | G1*w1 + G2*w2 | G3*w3 + G4*w4 | repeat for B and A |
+            // --------------------------------------------------------------------------------------
+            // each 'rectangle' is a 16-bit signed int.
+            // then we repeat the process for the rest of input pixels,
+            // call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
+
+            __m128i rgba, kern;
+            __m128i zero = _mm_setzero_si128();
+            __m128i acc = _mm_setzero_si128();
+
+            const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8, 12,
+                                                         1, 5, 9, 13,
+                                                         2, 6, 10, 14,
+                                                         3, 7, 11, 15);
+
+            const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
+                                                         0, 1, 2, 3,
+                                                         0, 1, 2, 3,
+                                                         0, 1, 2, 3);
+
+            rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_kern, kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 4), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 8), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 12), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            acc = _mm_hadds_epi16(acc, zero);
+            acc = _mm_srai_epi16(acc, SCALE_FACTOR);
+
+            // Cairo sets alpha channel to 255
+            // (or -1, depending how you look at it)
+            // this quickly overflows accumulator,
+            // and alpha is calculated completely wrong.
+            // I assume most people don't use semi-transparent
+            // lock screen images, so no one will mind if we
+            // 'correct it' by setting alpha to 255.
+            *(dst + height * column + row) =
+                _mm_cvtsi128_si32(_mm_packus_epi16(acc, zero)) | 0xFF000000;
+        }
+    }
+}
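(The series leaves implementation selection as commented-out lines in blur_image_surface. A runtime dispatch along the lines below would pick the fastest available path; this sketch is mine, not part of the patches — blur_dispatch is a hypothetical name, and __builtin_cpu_supports is a GCC/Clang builtin.)

#include "blur.h"

// Hypothetical dispatcher: prefer SSSE3, fall back to SSE2, then naive.
// As in blur_image_surface, the blurred result ends up back in src
// after the two transposing passes inside each SIMD implementation.
void blur_dispatch(uint32_t *src, uint32_t *dst, int width, int height,
                   int src_stride, int dst_stride) {
    if (__builtin_cpu_supports("ssse3"))
        blur_impl_ssse3(src, dst, width, height, 4.5);
    else if (__builtin_cpu_supports("sse2"))
        blur_impl_sse2(src, dst, width, height, 4.5);
    else
        blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
}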