From c3a95b86d6d89bd00e526214a85855629d92521e Mon Sep 17 00:00:00 2001
From: Chris Guillott <chris@techfo.xyz>
Date: Fri, 14 Oct 2016 22:06:43 -0400
Subject: [PATCH 01/25] add initial blur support

thanks to https://github.com/shiver/i3lock
---
 blur.c   | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 blur.h   |   7 +++
 i3lock.c |  34 ++++++++++++-
 lock.sh  |   4 +-
 xcb.c    |  14 ++++++
 xcb.h    |   1 +
 6 files changed, 201 insertions(+), 2 deletions(-)
 create mode 100644 blur.c
 create mode 100644 blur.h

diff --git a/blur.c b/blur.c
new file mode 100644
index 0000000..a71b984
--- /dev/null
+++ b/blur.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright Â© 2008 Kristian HÃ¸gsberg
+ * Copyright Â© 2009 Chris Wilson
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <math.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <cairo.h>
+
+#define ARRAY_LENGTH(a) (sizeof (a) / sizeof (a)[0])
+
+/* Performs a simple 2D Gaussian blur of radius @radius on surface @surface. */
+void
+blur_image_surface (cairo_surface_t *surface, int radius)
+{
+    cairo_surface_t *tmp;
+    int width, height;
+    int src_stride, dst_stride;
+    int x, y, z, w;
+    uint8_t *src, *dst;
+    uint32_t *s, *d, a, p;
+    int i, j, k;
+    uint8_t kernel[17];
+    const int size = ARRAY_LENGTH (kernel);
+    const int half = size / 2;
+
+    if (cairo_surface_status (surface))
+    return;
+
+    width = cairo_image_surface_get_width (surface);
+    height = cairo_image_surface_get_height (surface);
+
+    switch (cairo_image_surface_get_format (surface)) {
+    case CAIRO_FORMAT_A1:
+    default:
+    /* Don't even think about it! */
+    return;
+
+    case CAIRO_FORMAT_A8:
+    /* Handle a8 surfaces by effectively unrolling the loops by a
+     * factor of 4 - this is safe since we know that stride has to be a
+     * multiple of uint32_t. */
+    width /= 4;
+    break;
+
+    case CAIRO_FORMAT_RGB24:
+    case CAIRO_FORMAT_ARGB32:
+    break;
+    }
+
+    tmp = cairo_image_surface_create (CAIRO_FORMAT_ARGB32, width, height);
+    if (cairo_surface_status (tmp))
+    return;
+
+    src = cairo_image_surface_get_data (surface);
+    src_stride = cairo_image_surface_get_stride (surface);
+
+    dst = cairo_image_surface_get_data (tmp);
+    dst_stride = cairo_image_surface_get_stride (tmp);
+
+    a = 0;
+    for (i = 0; i < size; i++) {
+    double f = i - half;
+    a += kernel[i] = exp (- f * f / 30.0) * 80;
+    }
+
+    /* Horizontally blur from surface -> tmp */
+    for (i = 0; i < height; i++) {
+    s = (uint32_t *) (src + i * src_stride);
+    d = (uint32_t *) (dst + i * dst_stride);
+    for (j = 0; j < width; j++) {
+        if (radius < j && j < width - radius) {
+        d[j] = s[j];
+        continue;
+        }
+
+        x = y = z = w = 0;
+        for (k = 0; k < size; k++) {
+        if (j - half + k < 0 || j - half + k >= width)
+            continue;
+
+        p = s[j - half + k];
+
+        x += ((p >> 24) & 0xff) * kernel[k];
+        y += ((p >> 16) & 0xff) * kernel[k];
+        z += ((p >>  8) & 0xff) * kernel[k];
+        w += ((p >>  0) & 0xff) * kernel[k];
+        }
+        d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
+    }
+    }
+
+    /* Then vertically blur from tmp -> surface */
+    for (i = 0; i < height; i++) {
+    s = (uint32_t *) (dst + i * dst_stride);
+    d = (uint32_t *) (src + i * src_stride);
+    for (j = 0; j < width; j++) {
+        if (radius <= i && i < height - radius) {
+        d[j] = s[j];
+        continue;
+        }
+
+        x = y = z = w = 0;
+        for (k = 0; k < size; k++) {
+        if (i - half + k < 0 || i - half + k >= height)
+            continue;
+
+        s = (uint32_t *) (dst + (i - half + k) * dst_stride);
+        p = s[j];
+
+        x += ((p >> 24) & 0xff) * kernel[k];
+        y += ((p >> 16) & 0xff) * kernel[k];
+        z += ((p >>  8) & 0xff) * kernel[k];
+        w += ((p >>  0) & 0xff) * kernel[k];
+        }
+        d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
+    }
+    }
+
+    cairo_surface_destroy (tmp);
+    cairo_surface_flush (surface);
+    cairo_surface_mark_dirty (surface);
+}
+
diff --git a/blur.h b/blur.h
new file mode 100644
index 0000000..c3d13fd
--- /dev/null
+++ b/blur.h
@@ -0,0 +1,7 @@
+#ifndef _BLUR_H
+#define _BLUR_H
+
+void blur_image_surface (cairo_surface_t *surface, int radius);
+
+#endif
+
diff --git a/i3lock.c b/i3lock.c
index ad06a78..c6a6a45 100644
--- a/i3lock.c
+++ b/i3lock.c
@@ -36,6 +36,7 @@
 #include "cursors.h"
 #include "unlock_indicator.h"
 #include "xinerama.h"
+#include "blur.h"
 
 #define TSTAMP_N_SECS(n) (n * 1.0)
 #define TSTAMP_N_MINS(n) (60 * TSTAMP_N_SECS(n))
@@ -75,6 +76,10 @@ bool show_clock = false;
 char time_format[32] = "%H:%M:%S\0";
 char date_format[32] = "%A, %m %Y\0";
 
+/* opts for blurring */
+bool blur = false;
+bool step_blur = false;
+int blur_radius = 5;
 
 uint32_t last_resolution[2];
 xcb_window_t win;
@@ -861,6 +866,8 @@ int main(int argc, char *argv[]) {
         {"timestr", required_argument, NULL, 0},
         {"datestr", required_argument, NULL, 0},
 
+        {"blur", no_argument, NULL, 'B'},
+
         {"ignore-empty-password", no_argument, NULL, 'e'},
         {"inactivity-timeout", required_argument, NULL, 'I'},
         {"show-failed-attempts", no_argument, NULL, 'f'},
@@ -871,7 +878,7 @@ int main(int argc, char *argv[]) {
     if ((username = pw->pw_name) == NULL)
         errx(EXIT_FAILURE, "pw->pw_name is NULL.\n");
 
-    char *optstring = "hvnbdc:p:ui:teI:frsS:k";
+    char *optstring = "hvnbdc:p:ui:teI:frsS:kB";
     while ((o = getopt_long(argc, argv, optstring, longopts, &optind)) != -1) {
         switch (o) {
             case 'v':
@@ -941,6 +948,9 @@ int main(int argc, char *argv[]) {
             case 'k':
                 show_clock = true;
                 break;
+            case 'B':
+                blur = true;
+                break;
             case 0:
                 if (strcmp(longopts[optind].name, "debug") == 0)
                     debug_mode = true;
@@ -1182,12 +1192,34 @@ int main(int argc, char *argv[]) {
         free(image_path);
     }
 
+    xcb_pixmap_t blur_pixmap;
+    if (blur) {
+        if(!img) {
+            xcb_visualtype_t *vistype = get_root_visual_type(screen);
+            blur_pixmap = capture_bg_pixmap(conn, screen, last_resolution);
+            cairo_surface_t *xcb_img = cairo_xcb_surface_create(conn, blur_pixmap, vistype, last_resolution[0], last_resolution[1]);
+
+            img = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, last_resolution[0], last_resolution[1]);
+            cairo_t *ctx = cairo_create(img);
+            cairo_set_source_surface(ctx, xcb_img, 0, 0);
+            cairo_paint(ctx);
+
+            cairo_destroy(ctx);
+            cairo_surface_destroy(xcb_img);
+        }
+        blur_image_surface(img, 10000);
+    }
+
     /* Pixmap on which the image is rendered to (if any) */
     xcb_pixmap_t bg_pixmap = draw_image(last_resolution);
 
     /* Open the fullscreen window, already with the correct pixmap in place */
     win = open_fullscreen_window(conn, screen, color, bg_pixmap);
     xcb_free_pixmap(conn, bg_pixmap);
+    if (blur_pixmap) {
+        xcb_free_pixmap(conn, blur_pixmap);
+    }
+
 
     cursor = create_cursor(conn, screen, win, curs_choice);
 
diff --git a/lock.sh b/lock.sh
index 27b6d9c..ffbabe1 100755
--- a/lock.sh
+++ b/lock.sh
@@ -35,4 +35,6 @@ V='#bb00bbbb'  # verifying
 --screen 0            \
 --clock               \
 --timestr="%H:%M:%S"  \
---datestr="%A, %m %Y" 
+--datestr="%A, %m %Y" \
+-B                    \
+
diff --git a/xcb.c b/xcb.c
index e0b7811..23d33fa 100644
--- a/xcb.c
+++ b/xcb.c
@@ -307,3 +307,17 @@ xcb_cursor_t create_cursor(xcb_connection_t *conn, xcb_screen_t *screen, xcb_win
 
     return cursor;
 }
+
+xcb_pixmap_t capture_bg_pixmap(xcb_connection_t *conn, xcb_screen_t *scr, u_int32_t * resolution) {
+    xcb_pixmap_t bg_pixmap = xcb_generate_id(conn);
+    xcb_create_pixmap(conn, scr->root_depth, bg_pixmap, scr->root, resolution[0], resolution[1]);
+    xcb_gcontext_t gc = xcb_generate_id(conn);
+    uint32_t values[] = { scr->black_pixel, 1};
+    xcb_create_gc(conn, gc, bg_pixmap, XCB_GC_FOREGROUND | XCB_GC_SUBWINDOW_MODE, values);
+    xcb_rectangle_t rect = { 0, 0, resolution[0], resolution[1] };
+    xcb_poly_fill_rectangle(conn, bg_pixmap, gc, 1, &rect);
+    xcb_copy_area(conn, scr->root, bg_pixmap, gc, 0, 0, 0, 0, resolution[0], resolution[1]);
+    xcb_flush(conn);
+    xcb_free_gc(conn, gc);
+    return bg_pixmap;
+}
diff --git a/xcb.h b/xcb.h
index 1e0cbb1..49eea41 100644
--- a/xcb.h
+++ b/xcb.h
@@ -13,5 +13,6 @@ xcb_window_t open_fullscreen_window(xcb_connection_t *conn, xcb_screen_t *scr, c
 void grab_pointer_and_keyboard(xcb_connection_t *conn, xcb_screen_t *screen, xcb_cursor_t cursor);
 void dpms_set_mode(xcb_connection_t *conn, xcb_dpms_dpms_mode_t mode);
 xcb_cursor_t create_cursor(xcb_connection_t *conn, xcb_screen_t *screen, xcb_window_t win, int choice);
+xcb_pixmap_t capture_bg_pixmap(xcb_connection_t *conn, xcb_screen_t *scr, u_int32_t* resolution);
 
 #endif

From ade9a207881fca816fe2a630b18e0d1ff2655e3e Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 22 Oct 2016 14:32:35 +0200
Subject: [PATCH 02/25] Isolate blur implementation to a function.

This will allow easier switching between naive/SSE2/AVX implementations.
---
 blur.c | 39 +++++++++++++++++++++++----------------
 blur.h |  4 ++++
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/blur.c b/blur.c
index a71b984..b0349d5 100644
--- a/blur.c
+++ b/blur.c
@@ -22,9 +22,7 @@
  */
 
 #include <math.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <cairo.h>
+#include "blur.h"
 
 #define ARRAY_LENGTH(a) (sizeof (a) / sizeof (a)[0])
 
@@ -35,13 +33,7 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     cairo_surface_t *tmp;
     int width, height;
     int src_stride, dst_stride;
-    int x, y, z, w;
-    uint8_t *src, *dst;
-    uint32_t *s, *d, a, p;
-    int i, j, k;
-    uint8_t kernel[17];
-    const int size = ARRAY_LENGTH (kernel);
-    const int half = size / 2;
+    uint32_t *src, *dst;
 
     if (cairo_surface_status (surface))
     return;
@@ -71,12 +63,31 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     if (cairo_surface_status (tmp))
     return;
 
-    src = cairo_image_surface_get_data (surface);
+    src = (uint32_t*)cairo_image_surface_get_data (surface);
     src_stride = cairo_image_surface_get_stride (surface);
 
-    dst = cairo_image_surface_get_data (tmp);
+    dst = (uint32_t*)cairo_image_surface_get_data (tmp);
     dst_stride = cairo_image_surface_get_stride (tmp);
 
+    blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+
+    cairo_surface_destroy (tmp);
+    cairo_surface_flush (surface);
+    cairo_surface_mark_dirty (surface);
+}
+
+void blur_impl_naive(uint32_t* _src, uint32_t* _dst, int width, int height, int src_stride, int dst_stride, int radius)
+{
+    int x, y, z, w;
+    uint32_t *s, *d, a, p;
+    int i, j, k;
+    uint8_t kernel[17];
+    const int size = ARRAY_LENGTH (kernel);
+    const int half = size / 2;
+
+    uint8_t *src = (uint8_t*)_src;
+    uint8_t *dst = (uint8_t*)_dst;
+
     a = 0;
     for (i = 0; i < size; i++) {
     double f = i - half;
@@ -135,9 +146,5 @@ blur_image_surface (cairo_surface_t *surface, int radius)
         d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
     }
     }
-
-    cairo_surface_destroy (tmp);
-    cairo_surface_flush (surface);
-    cairo_surface_mark_dirty (surface);
 }
 
diff --git a/blur.h b/blur.h
index c3d13fd..1c1eb7a 100644
--- a/blur.h
+++ b/blur.h
@@ -1,7 +1,11 @@
 #ifndef _BLUR_H
 #define _BLUR_H
 
+#include <stdint.h>
+#include <cairo.h>
+
 void blur_image_surface (cairo_surface_t *surface, int radius);
+void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 
 #endif
 

From fb5dbbe661c420509c1aa71dc3e8ff742d9d59fe Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 22 Oct 2016 15:30:27 +0200
Subject: [PATCH 03/25] Add SSE2-optimized blur.

About 4-6 times faster than naive implementation.
---
 blur.c      |   3 +-
 blur.h      |   2 +
 blur_simd.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 120 insertions(+), 1 deletion(-)
 create mode 100644 blur_simd.c

diff --git a/blur.c b/blur.c
index b0349d5..8434e81 100644
--- a/blur.c
+++ b/blur.c
@@ -69,7 +69,8 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst = (uint32_t*)cairo_image_surface_get_data (tmp);
     dst_stride = cairo_image_surface_get_stride (tmp);
 
-    blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+    //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
+    blur_impl_sse2(src, dst, width, height, 2.5);
 
     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur.h b/blur.h
index 1c1eb7a..2a5f45c 100644
--- a/blur.h
+++ b/blur.h
@@ -6,6 +6,8 @@
 
 void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
+void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
 
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
new file mode 100644
index 0000000..2186cf5
--- /dev/null
+++ b/blur_simd.c
@@ -0,0 +1,116 @@
+/*
+ * vim:ts=4:sw=4:expandtab
+ *
+ * © 2016 Sebastian Frysztak 
+ *
+ * See LICENSE for licensing information
+ *
+ */
+
+#include "blur.h"
+#include <math.h>
+#include <xmmintrin.h>
+
+#define ALIGN16 __attribute__((aligned(16)))
+#define KERNEL_SIZE 7
+#define HALF_KERNEL KERNEL_SIZE / 2
+
+void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
+    // prepare kernel
+    float kernel[KERNEL_SIZE];
+    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
+
+    for (int i = 0; i < KERNEL_SIZE; i++) {
+        float x = HALF_KERNEL - i;
+        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
+        sum += kernel[i];
+    }
+
+    // normalize kernel
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernel[i] /= sum;
+
+    // horizontal pass includes image transposition:
+    // instead of writing pixel src[x] to dst[x],
+    // we write it to transposed location.
+    // (to be exact: dst[height * current_column + current_row])
+    blur_impl_horizontal_pass_sse2(src, dst, kernel, width, height);
+    blur_impl_horizontal_pass_sse2(dst, src, kernel, height, width);
+}
+
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
+    for (int row = 0; row < height; row++) {
+        // remember first and last pixel in a row
+        // (used to handle borders)
+        uint32_t firstPixel = *src;
+        uint32_t lastPixel = *(src + width - 1);
+
+        for (int column = 0; column < width; column++, src++) {
+            __m128i rgbaIn1, rgbaIn2;
+
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column + HALF_KERNEL >= width;
+            if (leftBorder || rightBorder) {
+                uint32_t rgbaIn[KERNEL_SIZE] ALIGN16;
+                int i = 0;
+                if (leftBorder) {
+                    // for kernel size 7x7 and column == 0, we have:
+                    // x x x P0 P1 P2 P3
+                    // first loop fills x's with P0, second one loads P{0..3}
+                    for (; i < HALF_KERNEL - column; i++)
+                        rgbaIn[i] = firstPixel;
+                    for (; i < KERNEL_SIZE; i++)
+                        rgbaIn[i] = *(src + i - HALF_KERNEL);
+                } else {
+                    for (; width < column; i++)
+                        rgbaIn[i] = *(src - i - HALF_KERNEL);
+                    for (; i < KERNEL_SIZE; i++)
+                        rgbaIn[i] = lastPixel;
+                }
+
+                rgbaIn1 = _mm_load_si128((__m128i *)(rgbaIn));
+                rgbaIn2 = _mm_load_si128((__m128i *)(rgbaIn + 4));
+            } else {
+                rgbaIn1 = _mm_loadu_si128((__m128i *)(src - 3));
+                rgbaIn2 = _mm_loadu_si128((__m128i *)(src + 1));
+            }
+
+            // unpack each pixel, convert to float,
+            // multiply by corresponding kernel value
+            // and add to accumulator
+            __m128i tmp;
+            __m128i zero = _mm_setzero_si128();
+            __m128 rgba_ps;
+            __m128 acc = _mm_setzero_ps();
+            int counter = 0;
+
+            tmp = _mm_unpacklo_epi8(rgbaIn1, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpackhi_epi8(rgbaIn1, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpacklo_epi8(rgbaIn2, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            tmp = _mm_unpackhi_epi8(rgbaIn2, zero);
+            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+            __m128i rgbaOut = _mm_cvtps_epi32(acc);
+            rgbaOut = _mm_packs_epi32(rgbaOut, zero);
+            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
+            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
+        }
+    }
+}

From a48ddb61db973684d89efaddcca3d7803bfaf276 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 22 Oct 2016 15:31:53 +0200
Subject: [PATCH 04/25] Build with -O2.

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 020beaa..b0a3dcc 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ endif
 CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
+CFLAGS += -O2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
 CFLAGS += $(shell $(PKG_CONFIG) --cflags cairo xcb-dpms xcb-xinerama xcb-atom xcb-image xcb-xkb xkbcommon xkbcommon-x11)

From afe41c5754a2871e4aa5852343363083bc551782 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 28 Oct 2016 17:35:33 +0200
Subject: [PATCH 05/25] Extend kernel size to 15x15.

---
 blur.c      |  2 +-
 blur_simd.c | 55 ++++++++++++++++++++++++++++++-----------------------
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/blur.c b/blur.c
index 8434e81..d91f57d 100644
--- a/blur.c
+++ b/blur.c
@@ -70,7 +70,7 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst_stride = cairo_image_surface_get_stride (tmp);
 
     //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-    blur_impl_sse2(src, dst, width, height, 2.5);
+    blur_impl_sse2(src, dst, width, height, 4.5);
 
     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur_simd.c b/blur_simd.c
index 2186cf5..1628264 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -12,9 +12,13 @@
 #include <xmmintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
-#define KERNEL_SIZE 7
+#define KERNEL_SIZE 15 
 #define HALF_KERNEL KERNEL_SIZE / 2
 
+// number of xmm registers needed to store 
+// input pixels for given kernel size
+#define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
+
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -46,34 +50,34 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
         uint32_t lastPixel = *(src + width - 1);
 
         for (int column = 0; column < width; column++, src++) {
-            __m128i rgbaIn1, rgbaIn2;
+            __m128i rgbaIn[REGISTERS_CNT];
 
             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column + HALF_KERNEL >= width;
             if (leftBorder || rightBorder) {
-                uint32_t rgbaIn[KERNEL_SIZE] ALIGN16;
+                uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
                 int i = 0;
                 if (leftBorder) {
                     // for kernel size 7x7 and column == 0, we have:
                     // x x x P0 P1 P2 P3
                     // first loop fills x's with P0, second one loads P{0..3}
                     for (; i < HALF_KERNEL - column; i++)
-                        rgbaIn[i] = firstPixel;
+                        _rgbaIn[i] = firstPixel;
                     for (; i < KERNEL_SIZE; i++)
-                        rgbaIn[i] = *(src + i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src + i - HALF_KERNEL);
                 } else {
                     for (; width < column; i++)
-                        rgbaIn[i] = *(src - i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src - i - HALF_KERNEL);
                     for (; i < KERNEL_SIZE; i++)
-                        rgbaIn[i] = lastPixel;
+                        _rgbaIn[i] = lastPixel;
                 }
 
-                rgbaIn1 = _mm_load_si128((__m128i *)(rgbaIn));
-                rgbaIn2 = _mm_load_si128((__m128i *)(rgbaIn + 4));
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
             } else {
-                rgbaIn1 = _mm_loadu_si128((__m128i *)(src - 3));
-                rgbaIn2 = _mm_loadu_si128((__m128i *)(src + 1));
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
             }
 
             // unpack each pixel, convert to float,
@@ -85,25 +89,28 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
             __m128 acc = _mm_setzero_ps();
             int counter = 0;
 
-            tmp = _mm_unpacklo_epi8(rgbaIn1, zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-            tmp = _mm_unpackhi_epi8(rgbaIn1, zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            for (int i = 0; i < 3; i++)
+            {
+                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+
+                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
+                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            }
 
-            tmp = _mm_unpacklo_epi8(rgbaIn2, zero);
+            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
             rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
             rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
 
-            tmp = _mm_unpackhi_epi8(rgbaIn2, zero);
+            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
             rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
             acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
 

From 3662b8e18714ed9e3e29c7d55c951557fc5ae104 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 28 Oct 2016 17:36:43 +0200
Subject: [PATCH 06/25] Improve border handling for larger kernels.

---
 blur_simd.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/blur_simd.c b/blur_simd.c
index 1628264..4cacb9a 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -44,33 +44,29 @@ void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float s
 
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
     for (int row = 0; row < height; row++) {
-        // remember first and last pixel in a row
-        // (used to handle borders)
-        uint32_t firstPixel = *src;
-        uint32_t lastPixel = *(src + width - 1);
-
         for (int column = 0; column < width; column++, src++) {
             __m128i rgbaIn[REGISTERS_CNT];
 
             // handle borders
             int leftBorder = column < HALF_KERNEL;
-            int rightBorder = column + HALF_KERNEL >= width;
+            int rightBorder = column > width - HALF_KERNEL;
             if (leftBorder || rightBorder) {
                 uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
                 int i = 0;
                 if (leftBorder) {
                     // for kernel size 7x7 and column == 0, we have:
                     // x x x P0 P1 P2 P3
-                    // first loop fills x's with P0, second one loads P{0..3}
+                    // first loop mirrors P{0..3} to fill x's,
+                    // second one loads P{0..3}
                     for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = firstPixel;
+                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
                     for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src + i - HALF_KERNEL);
+                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
                 } else {
-                    for (; width < column; i++)
-                        _rgbaIn[i] = *(src - i - HALF_KERNEL);
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = lastPixel;
+                    for (; i < width - column; i++)
+                        _rgbaIn[i] = *(src + i);
+                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                        _rgbaIn[i] = *(src - k);
                 }
 
                 for (int k = 0; k < REGISTERS_CNT; k++)

From 72aec8704714f5128e076236b077dd7fedcea9da Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 29 Oct 2016 14:32:49 +0200
Subject: [PATCH 07/25] Add SSSE3-based blur implementation.

Calculations are done on integer, rather than floating point numbers,
so this implementation is not as accurate (but when scale factor is
reasonable enough, no artifacs are visible).
It is, however, faster by a factor of ~3.
---
 Makefile    |   1 +
 blur.c      |   3 +-
 blur.h      |   2 +
 blur_simd.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index b0a3dcc..d979100 100644
--- a/Makefile
+++ b/Makefile
@@ -14,6 +14,7 @@ endif
 CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
+CFLAGS += -mssse3
 CFLAGS += -O2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
diff --git a/blur.c b/blur.c
index d91f57d..a5b0bd3 100644
--- a/blur.c
+++ b/blur.c
@@ -70,7 +70,8 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     dst_stride = cairo_image_surface_get_stride (tmp);
 
     //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-    blur_impl_sse2(src, dst, width, height, 4.5);
+    //blur_impl_sse2(src, dst, width, height, 4.5);
+    blur_impl_ssse3(src, dst, width, height, 4.5);
 
     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
diff --git a/blur.h b/blur.h
index 2a5f45c..478e2f0 100644
--- a/blur.h
+++ b/blur.h
@@ -8,6 +8,8 @@ void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
+void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height);
 
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index 4cacb9a..7dcd9a4 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -10,6 +10,7 @@
 #include "blur.h"
 #include <math.h>
 #include <xmmintrin.h>
+#include <tmmintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
 #define KERNEL_SIZE 15 
@@ -19,6 +20,11 @@
 // input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
+// scaling factor for kernel coefficients.
+// higher values cause desaturation.
+// used in SSSE3 implementation.
+#define SCALE_FACTOR 7
+
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -117,3 +123,128 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
         }
     }
 }
+
+void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
+    // prepare kernel
+    float kernelf[KERNEL_SIZE];
+    int8_t kernel[KERNEL_SIZE + 1];
+    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
+
+    for (int i = 0; i < KERNEL_SIZE; i++) {
+        float x = HALF_KERNEL - i;
+        kernelf[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
+        sum += kernelf[i];
+    }
+
+    // normalize kernel
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernelf[i] /= sum;
+
+    // round to nearest integer and convert to int
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernel[i] = (int8_t)rintf(kernelf[i] * (1 << SCALE_FACTOR));
+    kernel[KERNEL_SIZE] = 0;
+
+    // horizontal pass includes image transposition:
+    // instead of writing pixel src[x] to dst[x],
+    // we write it to transposed location.
+    // (to be exact: dst[height * current_column + current_row])
+    blur_impl_horizontal_pass_ssse3(src, dst, kernel, width, height);
+    blur_impl_horizontal_pass_ssse3(dst, src, kernel, height, width);
+}
+
+
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height) {
+    __m128i _kern = _mm_loadu_si128((__m128i*)kernel);
+    __m128i rgbaIn[REGISTERS_CNT];
+
+    for (int row = 0; row < height; row++) {
+        for (int column = 0; column < width; column++, src++) {
+            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column > width - HALF_KERNEL;
+            if (leftBorder || rightBorder) {
+                int i = 0;
+                if (leftBorder) {
+                    // for kernel size 7x7 and column == 0, we have:
+                    // x x x P0 P1 P2 P3
+                    // first loop mirrors P{0..3} to fill x's,
+                    // second one loads P{0..3}
+                    for (; i < HALF_KERNEL - column; i++)
+                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                    for (; i < KERNEL_SIZE; i++)
+                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+                } else {
+                    for (; i < width - column; i++)
+                        _rgbaIn[i] = *(src + i);
+                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                        _rgbaIn[i] = *(src - k);
+                }
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else {
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
+            }
+
+            // basis of this implementation is _mm_maddubs_epi16 (aka pmaddubsw).
+            // 'rgba' holds 16 unsigned bytes, so 4 pixels.
+            // 'kern' holds 16 signed bytes kernel values multiplied by (1 << SCALE_FACTOR).
+            // before multiplication takes place, vectors need to be prepared:
+            // 'rgba' is shuffled from R1B1G1A1...R4B4G4A4 to R1R2R3R4...A1A2A3A4
+            // 'kern' is shuffled from w1w2w3w4...w13w14w15w16 to w1w2w3w4 repeated 4 times
+            // then we call _mm_maddubs_epi16 and we get:
+            // --------------------------------------------------------------------------------------
+            // | R1*w1 + R2*w2 | R3*w3 + R4*w4 | G1*w1 + G2*w2 | G3*w3 + G4*w4 | repeat for B and A |
+            // --------------------------------------------------------------------------------------
+            // each 'rectangle' is a 16-byte signed int.
+            // then we repeat the process for the rest of input pixels,
+            // call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
+
+            __m128i rgba, kern;
+            __m128i zero = _mm_setzero_si128();
+            __m128i acc = _mm_setzero_si128();
+
+            const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8,  12,
+                                                         1, 5, 9,  13,
+                                                         2, 6, 10, 14,
+                                                         3, 7, 11, 15);
+
+            const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
+                                                         0, 1, 2, 3,
+                                                         0, 1, 2, 3,
+                                                         0, 1, 2, 3);
+
+            rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_kern, kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 4), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 8), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 12), kern_shuf_mask);
+            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+
+            acc = _mm_hadds_epi16(acc, zero);
+            acc = _mm_srai_epi16(acc, SCALE_FACTOR);
+
+            // Cairo sets alpha channel to 255
+            // (or -1, depending how you look at it)
+            // this quickly overflows accumulator,
+            // and alpha is calculated completely wrong.
+            // I assume most people don't use semi-transparent
+            // lock screen images, so no one will mind if we
+            // 'correct it' by setting alpha to 255.
+            *(dst + height * column + row) =
+                _mm_cvtsi128_si32(_mm_packus_epi16(acc, zero)) | 0xFF000000;
+        }
+    }
+}

From 95c333cba510864ec91e97117272c87f429b097e Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Thu, 3 Nov 2016 20:16:06 +0100
Subject: [PATCH 08/25] SSSE3: use 16-bit weights.

Overall, I'm very happy with performance of this code, but not so much
with resulting image. It seems like integer approximations won't do.
I might remove this code altogether, so I didn't update comments.
---
 blur.h      |  2 +-
 blur_simd.c | 57 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/blur.h b/blur.h
index 478e2f0..7ba45fe 100644
--- a/blur.h
+++ b/blur.h
@@ -9,7 +9,7 @@ void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int sr
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
 void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height);
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height);
 
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index 7dcd9a4..8eac808 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -23,7 +23,7 @@
 // scaling factor for kernel coefficients.
 // higher values cause desaturation.
 // used in SSSE3 implementation.
-#define SCALE_FACTOR 7
+#define SCALE_FACTOR 14
 
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
@@ -127,7 +127,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
 void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernelf[KERNEL_SIZE];
-    int8_t kernel[KERNEL_SIZE + 1];
+    int16_t kernel[KERNEL_SIZE + 1];
     float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
 
     for (int i = 0; i < KERNEL_SIZE; i++) {
@@ -142,7 +142,7 @@ void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float
 
     // round to nearest integer and convert to int
     for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] = (int8_t)rintf(kernelf[i] * (1 << SCALE_FACTOR));
+        kernel[i] = (int16_t)lrintf(kernelf[i] * (1 << SCALE_FACTOR));
     kernel[KERNEL_SIZE] = 0;
 
     // horizontal pass includes image transposition:
@@ -154,8 +154,10 @@ void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float
 }
 
 
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kernel, int width, int height) {
-    __m128i _kern = _mm_loadu_si128((__m128i*)kernel);
+void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height) {
+    __m128i _kern[2];
+    _kern[0] = _mm_loadu_si128((__m128i*)kernel);
+    _kern[1] = _mm_loadu_si128((__m128i*)(kernel + 8));
     __m128i rgbaIn[REGISTERS_CNT];
 
     for (int row = 0; row < height; row++) {
@@ -203,9 +205,10 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
             // then we repeat the process for the rest of input pixels,
             // call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
 
-            __m128i rgba, kern;
+            __m128i rgba, rg, ba, kern;
             __m128i zero = _mm_setzero_si128();
-            __m128i acc = _mm_setzero_si128();
+            __m128i acc_rg = _mm_setzero_si128();
+            __m128i acc_ba = _mm_setzero_si128();
 
             const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8,  12,
                                                          1, 5, 9,  13,
@@ -213,28 +216,40 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
                                                          3, 7, 11, 15);
 
             const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
+                                                         4, 5, 6, 7,
                                                          0, 1, 2, 3,
-                                                         0, 1, 2, 3,
-                                                         0, 1, 2, 3);
+                                                         4, 5, 6, 7);
 
             rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_kern, kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+            rg = _mm_unpacklo_epi8(rgba, zero);
+            ba = _mm_unpackhi_epi8(rgba, zero);
+            kern = _mm_shuffle_epi8(_kern[0], kern_shuf_mask);
+            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
+            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
 
             rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 4), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
-
+            rg = _mm_unpacklo_epi8(rgba, zero);
+            ba = _mm_unpackhi_epi8(rgba, zero);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern[0], 8), kern_shuf_mask);
+            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
+            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
+    
             rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 8), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+            rg = _mm_unpacklo_epi8(rgba, zero);
+            ba = _mm_unpackhi_epi8(rgba, zero);
+            kern = _mm_shuffle_epi8(_kern[1], kern_shuf_mask);
+            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
+            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
 
             rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern, 12), kern_shuf_mask);
-            acc = _mm_adds_epi16(acc, _mm_maddubs_epi16(rgba, kern));
+            rg = _mm_unpacklo_epi8(rgba, zero);
+            ba = _mm_unpackhi_epi8(rgba, zero);
+            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern[1], 8), kern_shuf_mask);
+            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
+            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
 
-            acc = _mm_hadds_epi16(acc, zero);
-            acc = _mm_srai_epi16(acc, SCALE_FACTOR);
+            rgba = _mm_hadd_epi32(acc_rg, acc_ba);
+            rgba = _mm_srai_epi32(rgba, SCALE_FACTOR);
 
             // Cairo sets alpha channel to 255
             // (or -1, depending how you look at it)
@@ -244,7 +259,7 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
             // lock screen images, so no one will mind if we
             // 'correct it' by setting alpha to 255.
             *(dst + height * column + row) =
-                _mm_cvtsi128_si32(_mm_packus_epi16(acc, zero)) | 0xFF000000;
+                _mm_cvtsi128_si32(_mm_shuffle_epi8(rgba, rgba_shuf_mask));
         }
     }
 }

From f06dc6cbc4a6ddd6f3754781658629c18d50c0ff Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 4 Nov 2016 22:19:29 +0100
Subject: [PATCH 09/25] Add AVX version.

It relies on some SSE2 instructions, so performance gain is not that
huge (about 1.4x).
I experimented with 256-bit loads, but they turned out to be slower (at
least on Sandy Bridge).
---
 Makefile    |   1 +
 blur.h      |   8 +++-
 blur_simd.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index d979100..9ab3a91 100644
--- a/Makefile
+++ b/Makefile
@@ -15,6 +15,7 @@ CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
 CFLAGS += -mssse3
+CFLAGS += -mavx
 CFLAGS += -O2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
diff --git a/blur.h b/blur.h
index 7ba45fe..607aa70 100644
--- a/blur.h
+++ b/blur.h
@@ -6,8 +6,14 @@
 
 void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
+
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height)
+    __attribute__ ((__target__ ("no-avx")));
+
+void blur_impl_avx(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
+void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
+
 void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height);
 
diff --git a/blur_simd.c b/blur_simd.c
index 8eac808..6dc2ec3 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -11,6 +11,7 @@
 #include <math.h>
 #include <xmmintrin.h>
 #include <tmmintrin.h>
+#include <immintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
 #define KERNEL_SIZE 15 
@@ -25,6 +26,12 @@
 // used in SSSE3 implementation.
 #define SCALE_FACTOR 14
 
+// AVX intrinsics missing in GCC
+#define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
+#define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0))
+#define _mm256_set_m128(v0, v1)   _mm256_insertf128_ps(_mm256_castps128_ps256(v1), (v0), 1)
+#define _mm256_setr_m128(v0, v1)  _mm256_set_m128((v1), (v0))
+
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -124,6 +131,108 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
     }
 }
 
+void blur_impl_avx(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
+    // prepare kernel
+    float kernel[KERNEL_SIZE];
+    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
+
+    for (int i = 0; i < KERNEL_SIZE; i++) {
+        float x = HALF_KERNEL - i;
+        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
+        sum += kernel[i];
+    }
+
+    // normalize kernel
+    for (int i = 0; i < KERNEL_SIZE; i++)
+        kernel[i] /= sum;
+
+    // horizontal pass includes image transposition:
+    // instead of writing pixel src[x] to dst[x],
+    // we write it to transposed location.
+    // (to be exact: dst[height * current_column + current_row])
+    blur_impl_horizontal_pass_avx(src, dst, kernel, width, height);
+    blur_impl_horizontal_pass_avx(dst, src, kernel, height, width);
+}
+
+void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
+    __m256 kernels[HALF_KERNEL];
+    for (int i = 0, k = 0; i < HALF_KERNEL; i++, k += 2)
+        kernels[i] = _mm256_setr_m128(_mm_set1_ps(kernel[k]), _mm_set1_ps(kernel[k+1]));
+
+    for (int row = 0; row < height; row++) {
+        for (int column = 0; column < width; column++, src++) {
+            __m128i rgbaIn[REGISTERS_CNT];
+
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column > width - HALF_KERNEL;
+            if (leftBorder || rightBorder) {
+                uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+                int i = 0;
+                if (leftBorder) {
+                    // for kernel size 7x7 and column == 0, we have:
+                    // x x x P0 P1 P2 P3
+                    // first loop mirrors P{0..3} to fill x's,
+                    // second one loads P{0..3}
+                    for (; i < HALF_KERNEL - column; i++)
+                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                    for (; i < KERNEL_SIZE; i++)
+                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+                } else {
+                    for (; i < width - column; i++)
+                        _rgbaIn[i] = *(src + i);
+                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                        _rgbaIn[i] = *(src - k);
+                }
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else {
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
+            }
+
+            // unpack each pixel, convert to float,
+            // multiply by corresponding kernel value
+            // and add to accumulator
+            __m128i tmp;
+            __m128i zero = _mm_setzero_si128();
+            __m128 rgba_ps_128;
+            __m256 rgba_ps;
+            __m256 acc = _mm256_setzero_ps();
+            int counter = 0;
+
+            for (int i = 0; i < 3; i++)
+            {
+                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
+                                                               _mm_unpackhi_epi16(tmp, zero)));
+                acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++]));
+
+                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
+                rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
+                                                               _mm_unpackhi_epi16(tmp, zero)));
+                acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++]));
+            }
+
+            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
+            rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
+                                                           _mm_unpackhi_epi16(tmp, zero)));
+            acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter]));
+
+            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
+            rgba_ps_128 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
+            rgba_ps_128 = _mm_mul_ps(rgba_ps_128, _mm_set1_ps(kernel[KERNEL_SIZE-1]));
+            rgba_ps_128 = _mm_add_ps(rgba_ps_128, _mm_add_ps(_mm256_extractf128_ps(acc, 0),
+                                                             _mm256_extractf128_ps(acc, 1)));
+
+            __m128i rgbaOut = _mm_packs_epi32(_mm_cvtps_epi32(rgba_ps_128), zero);
+            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
+            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
+        }
+    }
+}
+
 void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernelf[KERNEL_SIZE];

From 252999f6403d53252c642042a97ea13fa0e53891 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 4 Nov 2016 22:41:17 +0100
Subject: [PATCH 10/25] Slightly refactor border handling code.

---
 blur_simd.c | 74 +++++++++++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 36 deletions(-)

diff --git a/blur_simd.c b/blur_simd.c
index 6dc2ec3..194f55a 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -63,24 +63,25 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column > width - HALF_KERNEL;
-            if (leftBorder || rightBorder) {
-                uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
-                int i = 0;
-                if (leftBorder) {
-                    // for kernel size 7x7 and column == 0, we have:
-                    // x x x P0 P1 P2 P3
-                    // first loop mirrors P{0..3} to fill x's,
-                    // second one loads P{0..3}
-                    for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-                } else {
-                    for (; i < width - column; i++)
-                        _rgbaIn[i] = *(src + i);
-                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                        _rgbaIn[i] = *(src - k);
-                }
+            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+            int i = 0;
+            if (leftBorder) {
+                // for kernel size 7x7 and column == 0, we have:
+                // x x x P0 P1 P2 P3
+                // first loop mirrors P{0..3} to fill x's,
+                // second one loads P{0..3}
+                for (; i < HALF_KERNEL - column; i++)
+                    _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                for (; i < KERNEL_SIZE; i++)
+                    _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else if (rightBorder) {
+                for (; i < width - column; i++)
+                    _rgbaIn[i] = *(src + i);
+                for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                    _rgbaIn[i] = *(src - k);
 
                 for (int k = 0; k < REGISTERS_CNT; k++)
                     rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
@@ -166,24 +167,25 @@ void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel,
             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column > width - HALF_KERNEL;
-            if (leftBorder || rightBorder) {
-                uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
-                int i = 0;
-                if (leftBorder) {
-                    // for kernel size 7x7 and column == 0, we have:
-                    // x x x P0 P1 P2 P3
-                    // first loop mirrors P{0..3} to fill x's,
-                    // second one loads P{0..3}
-                    for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-                } else {
-                    for (; i < width - column; i++)
-                        _rgbaIn[i] = *(src + i);
-                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                        _rgbaIn[i] = *(src - k);
-                }
+            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+            int i = 0;
+            if (leftBorder) {
+                // for kernel size 7x7 and column == 0, we have:
+                // x x x P0 P1 P2 P3
+                // first loop mirrors P{0..3} to fill x's,
+                // second one loads P{0..3}
+                for (; i < HALF_KERNEL - column; i++)
+                    _rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                for (; i < KERNEL_SIZE; i++)
+                    _rgbaIn[i] = *(src - (HALF_KERNEL - i));
+
+                for (int k = 0; k < REGISTERS_CNT; k++)
+                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
+            } else if (rightBorder) {
+                for (; i < width - column; i++)
+                    _rgbaIn[i] = *(src + i);
+                for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                    _rgbaIn[i] = *(src - k);
 
                 for (int k = 0; k < REGISTERS_CNT; k++)
                     rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));

From 6ae9934e2017bb00f78677fb37c4baf772abe536 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 5 Nov 2016 10:07:05 +0100
Subject: [PATCH 11/25] Pass SIMD-specific CFLAGS only to blur_simd.c

---
 Makefile | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 9ab3a91..694bd61 100644
--- a/Makefile
+++ b/Makefile
@@ -14,9 +14,9 @@ endif
 CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
-CFLAGS += -mssse3
-CFLAGS += -mavx
 CFLAGS += -O2
+SIMD_CFLAGS += -mavx
+SIMD_CFLAGS += -mssse3
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
 CFLAGS += $(shell $(PKG_CONFIG) --cflags cairo xcb-dpms xcb-xinerama xcb-atom xcb-image xcb-xkb xkbcommon xkbcommon-x11)
@@ -45,6 +45,7 @@ all: i3lock
 debug: CFLAGS += -g
 debug: i3lock
 
+blur_simd.o : CFLAGS += $(SIMD_CFLAGS)
 i3lock: ${FILES}
 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
 

From ea730e70e6b2523ce0c0e4a86ff7f22267e4e7bf Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 5 Nov 2016 10:07:42 +0100
Subject: [PATCH 12/25] Unroll loops.

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 694bd61..7354272 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,7 @@ CFLAGS += -Wall
 CFLAGS += -O2
 SIMD_CFLAGS += -mavx
 SIMD_CFLAGS += -mssse3
+SIMD_CFLAGS += -funroll-loops
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
 CFLAGS += $(shell $(PKG_CONFIG) --cflags cairo xcb-dpms xcb-xinerama xcb-atom xcb-image xcb-xkb xkbcommon xkbcommon-x11)

From 4b58824e5e9ada8aa65a56f56db0140838b88b15 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 5 Nov 2016 12:30:22 +0100
Subject: [PATCH 13/25] SSE2: don't use VEX prefix.

---
 Makefile | 1 +
 blur.h   | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Makefile b/Makefile
index 7354272..da23bf3 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ CFLAGS += -pipe
 CFLAGS += -Wall
 CFLAGS += -O2
 SIMD_CFLAGS += -mavx
+SIMD_CFLAGS += -mno-sse2avx
 SIMD_CFLAGS += -mssse3
 SIMD_CFLAGS += -funroll-loops
 CPPFLAGS += -D_GNU_SOURCE
diff --git a/blur.h b/blur.h
index 607aa70..7469871 100644
--- a/blur.h
+++ b/blur.h
@@ -7,9 +7,10 @@
 void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 
+__attribute__((__target__(("no-avx"))))
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height)
-    __attribute__ ((__target__ ("no-avx")));
+__attribute__((__target__(("no-avx"))))
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
 
 void blur_impl_avx(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);

From ab41586b3923e1cbab8a78ed21c4d9ec41981b26 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Sat, 5 Nov 2016 16:01:40 +0100
Subject: [PATCH 14/25] SSE2: switch from Gaussian to box blur

---
 blur_simd.c | 42 ++++++++++++++----------------------------
 1 file changed, 14 insertions(+), 28 deletions(-)

diff --git a/blur_simd.c b/blur_simd.c
index 194f55a..4bd1847 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -90,44 +90,30 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
                     rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
             }
 
-            // unpack each pixel, convert to float,
-            // multiply by corresponding kernel value
-            // and add to accumulator
             __m128i tmp;
             __m128i zero = _mm_setzero_si128();
-            __m128 rgba_ps;
-            __m128 acc = _mm_setzero_ps();
-            int counter = 0;
+            __m128i acc = _mm_setzero_si128();
 
             for (int i = 0; i < 3; i++)
             {
-                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-
-                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-                rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-                acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+                acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[i], zero));
+                acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[i], zero));
             }
 
-            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpackhi_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[3], zero));
 
             tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
-            rgba_ps = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            acc = _mm_add_ps(acc, _mm_mul_ps(rgba_ps, _mm_set1_ps(kernel[counter++])));
+            // set 16th pixel to zeroes
+            tmp = _mm_andnot_si128(_mm_set_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0,0,0,0), tmp);
+            acc = _mm_add_epi16(acc, tmp);
+            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), _mm_unpackhi_epi16(acc, zero));
 
-            __m128i rgbaOut = _mm_cvtps_epi32(acc);
-            rgbaOut = _mm_packs_epi32(rgbaOut, zero);
-            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
-            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
+            acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
+                                             _mm_set1_ps(1/((float)(KERNEL_SIZE)))));
+
+            acc = _mm_packs_epi32(acc, zero);
+            acc = _mm_packus_epi16(acc, zero);
+            *(dst + height * column + row) = _mm_cvtsi128_si32(acc);
         }
     }
 }

From b47631d785a798224dc42a9a66e310aa5716b977 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 11 Nov 2016 13:11:11 +0100
Subject: [PATCH 15/25] SSE2: resize filter to 7x7. clean up a little.

---
 blur_simd.c | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/blur_simd.c b/blur_simd.c
index 4bd1847..4bf4e38 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -14,7 +14,7 @@
 #include <immintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
-#define KERNEL_SIZE 15 
+#define KERNEL_SIZE 7 
 #define HALF_KERNEL KERNEL_SIZE / 2
 
 // number of xmm registers needed to store 
@@ -90,30 +90,25 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
                     rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
             }
 
-            __m128i tmp;
             __m128i zero = _mm_setzero_si128();
             __m128i acc = _mm_setzero_si128();
 
-            for (int i = 0; i < 3; i++)
-            {
-                acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[i], zero));
-                acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[i], zero));
-            }
-
-            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[3], zero));
+            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[0], zero));
+            acc = _mm_add_epi16(acc, _mm_unpackhi_epi8(rgbaIn[0], zero));
+            acc = _mm_add_epi16(acc, _mm_unpacklo_epi8(rgbaIn[1], zero));
 
-            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
-            // set 16th pixel to zeroes
-            tmp = _mm_andnot_si128(_mm_set_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0,0,0,0), tmp);
-            acc = _mm_add_epi16(acc, tmp);
-            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), _mm_unpackhi_epi16(acc, zero));
+            // kernel size equals to 7, but we can only load multiples of 4 pixels
+            // we have to set 8th pixel to zero
+            acc = _mm_add_epi16(acc, _mm_andnot_si128(_mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0), 
+                                                      _mm_unpackhi_epi8(rgbaIn[1], zero)));
+            acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), 
+                                _mm_unpackhi_epi16(acc, zero));
 
             acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
-                                             _mm_set1_ps(1/((float)(KERNEL_SIZE)))));
+                                             _mm_set1_ps(1/((float)KERNEL_SIZE))));
 
-            acc = _mm_packs_epi32(acc, zero);
-            acc = _mm_packus_epi16(acc, zero);
-            *(dst + height * column + row) = _mm_cvtsi128_si32(acc);
+            *(dst + height * column + row) = 
+                _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));
         }
     }
 }

From e5e636892636d4de128a30c310cd630fa2147b86 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 11 Nov 2016 16:46:53 +0100
Subject: [PATCH 16/25] Remove SSSE3 version.

---
 Makefile    |   1 -
 blur.h      |   3 --
 blur_simd.c | 146 ----------------------------------------------------
 3 files changed, 150 deletions(-)

diff --git a/Makefile b/Makefile
index da23bf3..4edcfe5 100644
--- a/Makefile
+++ b/Makefile
@@ -17,7 +17,6 @@ CFLAGS += -Wall
 CFLAGS += -O2
 SIMD_CFLAGS += -mavx
 SIMD_CFLAGS += -mno-sse2avx
-SIMD_CFLAGS += -mssse3
 SIMD_CFLAGS += -funroll-loops
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
diff --git a/blur.h b/blur.h
index 7469871..5c08411 100644
--- a/blur.h
+++ b/blur.h
@@ -15,8 +15,5 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
 void blur_impl_avx(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
 
-void blur_impl_ssse3(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height);
-
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index 4bf4e38..91325a3 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -10,7 +10,6 @@
 #include "blur.h"
 #include <math.h>
 #include <xmmintrin.h>
-#include <tmmintrin.h>
 #include <immintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
@@ -21,11 +20,6 @@
 // input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
-// scaling factor for kernel coefficients.
-// higher values cause desaturation.
-// used in SSSE3 implementation.
-#define SCALE_FACTOR 14
-
 // AVX intrinsics missing in GCC
 #define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
 #define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0))
@@ -215,143 +209,3 @@ void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel,
         }
     }
 }
-
-void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // prepare kernel
-    float kernelf[KERNEL_SIZE];
-    int16_t kernel[KERNEL_SIZE + 1];
-    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
-
-    for (int i = 0; i < KERNEL_SIZE; i++) {
-        float x = HALF_KERNEL - i;
-        kernelf[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
-        sum += kernelf[i];
-    }
-
-    // normalize kernel
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernelf[i] /= sum;
-
-    // round to nearest integer and convert to int
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] = (int16_t)lrintf(kernelf[i] * (1 << SCALE_FACTOR));
-    kernel[KERNEL_SIZE] = 0;
-
-    // horizontal pass includes image transposition:
-    // instead of writing pixel src[x] to dst[x],
-    // we write it to transposed location.
-    // (to be exact: dst[height * current_column + current_row])
-    blur_impl_horizontal_pass_ssse3(src, dst, kernel, width, height);
-    blur_impl_horizontal_pass_ssse3(dst, src, kernel, height, width);
-}
-
-
-void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int16_t *kernel, int width, int height) {
-    __m128i _kern[2];
-    _kern[0] = _mm_loadu_si128((__m128i*)kernel);
-    _kern[1] = _mm_loadu_si128((__m128i*)(kernel + 8));
-    __m128i rgbaIn[REGISTERS_CNT];
-
-    for (int row = 0; row < height; row++) {
-        for (int column = 0; column < width; column++, src++) {
-            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
-            // handle borders
-            int leftBorder = column < HALF_KERNEL;
-            int rightBorder = column > width - HALF_KERNEL;
-            if (leftBorder || rightBorder) {
-                int i = 0;
-                if (leftBorder) {
-                    // for kernel size 7x7 and column == 0, we have:
-                    // x x x P0 P1 P2 P3
-                    // first loop mirrors P{0..3} to fill x's,
-                    // second one loads P{0..3}
-                    for (; i < HALF_KERNEL - column; i++)
-                        _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                    for (; i < KERNEL_SIZE; i++)
-                        _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-                } else {
-                    for (; i < width - column; i++)
-                        _rgbaIn[i] = *(src + i);
-                    for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                        _rgbaIn[i] = *(src - k);
-                }
-
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
-            } else {
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
-            }
-
-            // basis of this implementation is _mm_maddubs_epi16 (aka pmaddubsw).
-            // 'rgba' holds 16 unsigned bytes, so 4 pixels.
-            // 'kern' holds 16 signed bytes kernel values multiplied by (1 << SCALE_FACTOR).
-            // before multiplication takes place, vectors need to be prepared:
-            // 'rgba' is shuffled from R1B1G1A1...R4B4G4A4 to R1R2R3R4...A1A2A3A4
-            // 'kern' is shuffled from w1w2w3w4...w13w14w15w16 to w1w2w3w4 repeated 4 times
-            // then we call _mm_maddubs_epi16 and we get:
-            // --------------------------------------------------------------------------------------
-            // | R1*w1 + R2*w2 | R3*w3 + R4*w4 | G1*w1 + G2*w2 | G3*w3 + G4*w4 | repeat for B and A |
-            // --------------------------------------------------------------------------------------
-            // each 'rectangle' is a 16-byte signed int.
-            // then we repeat the process for the rest of input pixels,
-            // call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
-
-            __m128i rgba, rg, ba, kern;
-            __m128i zero = _mm_setzero_si128();
-            __m128i acc_rg = _mm_setzero_si128();
-            __m128i acc_ba = _mm_setzero_si128();
-
-            const __m128i rgba_shuf_mask = _mm_setr_epi8(0, 4, 8,  12,
-                                                         1, 5, 9,  13,
-                                                         2, 6, 10, 14,
-                                                         3, 7, 11, 15);
-
-            const __m128i kern_shuf_mask = _mm_setr_epi8(0, 1, 2, 3,
-                                                         4, 5, 6, 7,
-                                                         0, 1, 2, 3,
-                                                         4, 5, 6, 7);
-
-            rgba = _mm_shuffle_epi8(rgbaIn[0], rgba_shuf_mask);
-            rg = _mm_unpacklo_epi8(rgba, zero);
-            ba = _mm_unpackhi_epi8(rgba, zero);
-            kern = _mm_shuffle_epi8(_kern[0], kern_shuf_mask);
-            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
-            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
-
-            rgba = _mm_shuffle_epi8(rgbaIn[1], rgba_shuf_mask);
-            rg = _mm_unpacklo_epi8(rgba, zero);
-            ba = _mm_unpackhi_epi8(rgba, zero);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern[0], 8), kern_shuf_mask);
-            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
-            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
-    
-            rgba = _mm_shuffle_epi8(rgbaIn[2], rgba_shuf_mask);
-            rg = _mm_unpacklo_epi8(rgba, zero);
-            ba = _mm_unpackhi_epi8(rgba, zero);
-            kern = _mm_shuffle_epi8(_kern[1], kern_shuf_mask);
-            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
-            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
-
-            rgba = _mm_shuffle_epi8(rgbaIn[3], rgba_shuf_mask);
-            rg = _mm_unpacklo_epi8(rgba, zero);
-            ba = _mm_unpackhi_epi8(rgba, zero);
-            kern = _mm_shuffle_epi8(_mm_srli_si128(_kern[1], 8), kern_shuf_mask);
-            acc_rg = _mm_add_epi32(acc_rg, _mm_madd_epi16(rg, kern));
-            acc_ba = _mm_add_epi32(acc_ba, _mm_madd_epi16(ba, kern));
-
-            rgba = _mm_hadd_epi32(acc_rg, acc_ba);
-            rgba = _mm_srai_epi32(rgba, SCALE_FACTOR);
-
-            // Cairo sets alpha channel to 255
-            // (or -1, depending how you look at it)
-            // this quickly overflows accumulator,
-            // and alpha is calculated completely wrong.
-            // I assume most people don't use semi-transparent
-            // lock screen images, so no one will mind if we
-            // 'correct it' by setting alpha to 255.
-            *(dst + height * column + row) =
-                _mm_cvtsi128_si32(_mm_shuffle_epi8(rgba, rgba_shuf_mask));
-        }
-    }
-}

From 020af692e6a0c2b28bf13cf8638402f46e806db1 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 11 Nov 2016 17:11:31 +0100
Subject: [PATCH 17/25] Remove AVX version.

---
 Makefile    |   2 -
 blur.h      |   5 ---
 blur_simd.c | 110 ----------------------------------------------------
 3 files changed, 117 deletions(-)

diff --git a/Makefile b/Makefile
index 4edcfe5..86f3a3c 100644
--- a/Makefile
+++ b/Makefile
@@ -15,8 +15,6 @@ CFLAGS += -std=c99
 CFLAGS += -pipe
 CFLAGS += -Wall
 CFLAGS += -O2
-SIMD_CFLAGS += -mavx
-SIMD_CFLAGS += -mno-sse2avx
 SIMD_CFLAGS += -funroll-loops
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
diff --git a/blur.h b/blur.h
index 5c08411..83e1b9b 100644
--- a/blur.h
+++ b/blur.h
@@ -7,13 +7,8 @@
 void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 
-__attribute__((__target__(("no-avx"))))
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-__attribute__((__target__(("no-avx"))))
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
 
-void blur_impl_avx(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
-
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index 91325a3..27afb5f 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -10,7 +10,6 @@
 #include "blur.h"
 #include <math.h>
 #include <xmmintrin.h>
-#include <immintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
 #define KERNEL_SIZE 7 
@@ -20,12 +19,6 @@
 // input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
-// AVX intrinsics missing in GCC
-#define _mm256_set_m128i(v0, v1)  _mm256_insertf128_si256(_mm256_castsi128_si256(v1), (v0), 1)
-#define _mm256_setr_m128i(v0, v1) _mm256_set_m128i((v1), (v0))
-#define _mm256_set_m128(v0, v1)   _mm256_insertf128_ps(_mm256_castps128_ps256(v1), (v0), 1)
-#define _mm256_setr_m128(v0, v1)  _mm256_set_m128((v1), (v0))
-
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
     // prepare kernel
     float kernel[KERNEL_SIZE];
@@ -106,106 +99,3 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
         }
     }
 }
-
-void blur_impl_avx(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // prepare kernel
-    float kernel[KERNEL_SIZE];
-    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
-
-    for (int i = 0; i < KERNEL_SIZE; i++) {
-        float x = HALF_KERNEL - i;
-        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
-        sum += kernel[i];
-    }
-
-    // normalize kernel
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] /= sum;
-
-    // horizontal pass includes image transposition:
-    // instead of writing pixel src[x] to dst[x],
-    // we write it to transposed location.
-    // (to be exact: dst[height * current_column + current_row])
-    blur_impl_horizontal_pass_avx(src, dst, kernel, width, height);
-    blur_impl_horizontal_pass_avx(dst, src, kernel, height, width);
-}
-
-void blur_impl_horizontal_pass_avx(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
-    __m256 kernels[HALF_KERNEL];
-    for (int i = 0, k = 0; i < HALF_KERNEL; i++, k += 2)
-        kernels[i] = _mm256_setr_m128(_mm_set1_ps(kernel[k]), _mm_set1_ps(kernel[k+1]));
-
-    for (int row = 0; row < height; row++) {
-        for (int column = 0; column < width; column++, src++) {
-            __m128i rgbaIn[REGISTERS_CNT];
-
-            // handle borders
-            int leftBorder = column < HALF_KERNEL;
-            int rightBorder = column > width - HALF_KERNEL;
-            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
-            int i = 0;
-            if (leftBorder) {
-                // for kernel size 7x7 and column == 0, we have:
-                // x x x P0 P1 P2 P3
-                // first loop mirrors P{0..3} to fill x's,
-                // second one loads P{0..3}
-                for (; i < HALF_KERNEL - column; i++)
-                    _rgbaIn[i] = *(src + (HALF_KERNEL - i));
-                for (; i < KERNEL_SIZE; i++)
-                    _rgbaIn[i] = *(src - (HALF_KERNEL - i));
-
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
-            } else if (rightBorder) {
-                for (; i < width - column; i++)
-                    _rgbaIn[i] = *(src + i);
-                for (int k = 0; i < KERNEL_SIZE; i++, k++)
-                    _rgbaIn[i] = *(src - k);
-
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_load_si128((__m128i*)(_rgbaIn + 4*k));
-            } else {
-                for (int k = 0; k < REGISTERS_CNT; k++)
-                    rgbaIn[k] = _mm_loadu_si128((__m128i*)(src + 4*k - HALF_KERNEL));
-            }
-
-            // unpack each pixel, convert to float,
-            // multiply by corresponding kernel value
-            // and add to accumulator
-            __m128i tmp;
-            __m128i zero = _mm_setzero_si128();
-            __m128 rgba_ps_128;
-            __m256 rgba_ps;
-            __m256 acc = _mm256_setzero_ps();
-            int counter = 0;
-
-            for (int i = 0; i < 3; i++)
-            {
-                tmp = _mm_unpacklo_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
-                                                               _mm_unpackhi_epi16(tmp, zero)));
-                acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++]));
-
-                tmp = _mm_unpackhi_epi8(rgbaIn[i], zero);
-                rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
-                                                               _mm_unpackhi_epi16(tmp, zero)));
-                acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter++]));
-            }
-
-            tmp = _mm_unpacklo_epi8(rgbaIn[3], zero);
-            rgba_ps = _mm256_cvtepi32_ps(_mm256_setr_m128i(_mm_unpacklo_epi16(tmp, zero),
-                                                           _mm_unpackhi_epi16(tmp, zero)));
-            acc = _mm256_add_ps(acc, _mm256_mul_ps(rgba_ps, kernels[counter]));
-
-            tmp = _mm_unpackhi_epi8(rgbaIn[3], zero);
-            rgba_ps_128 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(tmp, zero));
-            rgba_ps_128 = _mm_mul_ps(rgba_ps_128, _mm_set1_ps(kernel[KERNEL_SIZE-1]));
-            rgba_ps_128 = _mm_add_ps(rgba_ps_128, _mm_add_ps(_mm256_extractf128_ps(acc, 0),
-                                                             _mm256_extractf128_ps(acc, 1)));
-
-            __m128i rgbaOut = _mm_packs_epi32(_mm_cvtps_epi32(rgba_ps_128), zero);
-            rgbaOut = _mm_packus_epi16(rgbaOut, zero);
-            *(dst + height * column + row) = _mm_cvtsi128_si32(rgbaOut);
-        }
-    }
-}

From 6029c8e0b5b3431f1982391c84187c9b8bd2caca Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Fri, 11 Nov 2016 18:45:20 +0100
Subject: [PATCH 18/25] Clean up a bit.

---
 blur.h      |  2 +-
 blur_simd.c | 44 ++++++++++++++++++++++++--------------------
 2 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/blur.h b/blur.h
index 83e1b9b..5e959a9 100644
--- a/blur.h
+++ b/blur.h
@@ -8,7 +8,7 @@ void blur_image_surface (cairo_surface_t *surface, int radius);
 void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 
 void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height);
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height);
 
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index 27afb5f..b45ffdd 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -13,6 +13,7 @@
 
 #define ALIGN16 __attribute__((aligned(16)))
 #define KERNEL_SIZE 7 
+#define SIGMA_AV 2
 #define HALF_KERNEL KERNEL_SIZE / 2
 
 // number of xmm registers needed to store 
@@ -20,29 +21,31 @@
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
 void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // prepare kernel
-    float kernel[KERNEL_SIZE];
-    float coeff = 1.0 / sqrtf(2 * M_PI * sigma * sigma), sum = 0;
-
-    for (int i = 0; i < KERNEL_SIZE; i++) {
-        float x = HALF_KERNEL - i;
-        kernel[i] = coeff * expf(-x * x / (2.0 * sigma * sigma));
-        sum += kernel[i];
+    // according to a paper by Peter Kovesi [1], box filter of width w, equals to Gaussian blur of following sigma:
+    // σ_av = sqrt((w*w-1)/12)
+    // for our 7x7 filter we have σ_av = 2.0.
+    // applying the same Gaussian filter n times results in σ_n = sqrt(n*σ_av*σ_av) [2]
+    // after some trivial math, we arrive at n = ((σ_d)/(σ_av))^2
+    // since it's a box blur filter, n >= 3
+    //
+    // [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
+    // [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
+    
+    int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV));
+    if (n < 3) n = 3;
+
+    for (int i = 0; i < n; i++)
+    {
+        // horizontal pass includes image transposition:
+        // instead of writing pixel src[x] to dst[x],
+        // we write it to transposed location.
+        // (to be exact: dst[height * current_column + current_row])
+        blur_impl_horizontal_pass_sse2(src, dst, width, height);
+        blur_impl_horizontal_pass_sse2(dst, src, height, width);
     }
-
-    // normalize kernel
-    for (int i = 0; i < KERNEL_SIZE; i++)
-        kernel[i] /= sum;
-
-    // horizontal pass includes image transposition:
-    // instead of writing pixel src[x] to dst[x],
-    // we write it to transposed location.
-    // (to be exact: dst[height * current_column + current_row])
-    blur_impl_horizontal_pass_sse2(src, dst, kernel, width, height);
-    blur_impl_horizontal_pass_sse2(dst, src, kernel, height, width);
 }
 
-void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel, int width, int height) {
+void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) {
     for (int row = 0; row < height; row++) {
         for (int column = 0; column < width; column++, src++) {
             __m128i rgbaIn[REGISTERS_CNT];
@@ -91,6 +94,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
             acc = _mm_add_epi32(_mm_unpacklo_epi16(acc, zero), 
                                 _mm_unpackhi_epi16(acc, zero));
 
+            // multiplication is significantly faster than division
             acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
                                              _mm_set1_ps(1/((float)KERNEL_SIZE))));
 

From 3598cf19e80f32bc0c4dbdc2fb9b73c436689696 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 11:22:06 +0100
Subject: [PATCH 19/25] Implement generic box blur

---
 blur.c      | 153 +++++++++++++++++++++++++---------------------------
 blur.h      |   7 ++-
 blur_simd.c |  33 +-----------
 3 files changed, 80 insertions(+), 113 deletions(-)

diff --git a/blur.c b/blur.c
index a5b0bd3..070e5f2 100644
--- a/blur.c
+++ b/blur.c
@@ -24,15 +24,12 @@
 #include <math.h>
 #include "blur.h"
 
-#define ARRAY_LENGTH(a) (sizeof (a) / sizeof (a)[0])
-
 /* Performs a simple 2D Gaussian blur of radius @radius on surface @surface. */
 void
 blur_image_surface (cairo_surface_t *surface, int radius)
 {
     cairo_surface_t *tmp;
     int width, height;
-    int src_stride, dst_stride;
     uint32_t *src, *dst;
 
     if (cairo_surface_status (surface))
@@ -64,89 +61,87 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     return;
 
     src = (uint32_t*)cairo_image_surface_get_data (surface);
-    src_stride = cairo_image_surface_get_stride (surface);
-
     dst = (uint32_t*)cairo_image_surface_get_data (tmp);
-    dst_stride = cairo_image_surface_get_stride (tmp);
-
-    //blur_impl_naive(src, dst, width, height, src_stride, dst_stride, 10000);
-    //blur_impl_sse2(src, dst, width, height, 4.5);
-    blur_impl_ssse3(src, dst, width, height, 4.5);
 
+    // according to a paper by Peter Kovesi [1], box filter of width w, equals to Gaussian blur of following sigma:
+    // σ_av = sqrt((w*w-1)/12)
+    // for our 7x7 filter we have σ_av = 2.0.
+    // applying the same Gaussian filter n times results in σ_n = sqrt(n*σ_av*σ_av) [2]
+    // after some trivial math, we arrive at n = ((σ_d)/(σ_av))^2
+    // since it's a box blur filter, n >= 3
+    //
+    // [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
+    // [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
+    
+    float sigma = 5;
+    
+    int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV));
+    if (n < 3) n = 3;
+
+    for (int i = 0; i < n; i++)
+    {
+        // horizontal pass includes image transposition:
+        // instead of writing pixel src[x] to dst[x],
+        // we write it to transposed location.
+        // (to be exact: dst[height * current_column + current_row])
+#ifdef __x86_64__
+        blur_impl_horizontal_pass_sse2(src, dst, width, height);
+        blur_impl_horizontal_pass_sse2(dst, src, height, width);
+#else
+        blur_impl_horizontal_pass_generic(src, dst, width, height);
+        blur_impl_horizontal_pass_generic(dst, src, height, width);
+#endif
+    }
+ 
     cairo_surface_destroy (tmp);
     cairo_surface_flush (surface);
     cairo_surface_mark_dirty (surface);
 }
 
-void blur_impl_naive(uint32_t* _src, uint32_t* _dst, int width, int height, int src_stride, int dst_stride, int radius)
-{
-    int x, y, z, w;
-    uint32_t *s, *d, a, p;
-    int i, j, k;
-    uint8_t kernel[17];
-    const int size = ARRAY_LENGTH (kernel);
-    const int half = size / 2;
-
-    uint8_t *src = (uint8_t*)_src;
-    uint8_t *dst = (uint8_t*)_dst;
-
-    a = 0;
-    for (i = 0; i < size; i++) {
-    double f = i - half;
-    a += kernel[i] = exp (- f * f / 30.0) * 80;
-    }
-
-    /* Horizontally blur from surface -> tmp */
-    for (i = 0; i < height; i++) {
-    s = (uint32_t *) (src + i * src_stride);
-    d = (uint32_t *) (dst + i * dst_stride);
-    for (j = 0; j < width; j++) {
-        if (radius < j && j < width - radius) {
-        d[j] = s[j];
-        continue;
-        }
-
-        x = y = z = w = 0;
-        for (k = 0; k < size; k++) {
-        if (j - half + k < 0 || j - half + k >= width)
-            continue;
-
-        p = s[j - half + k];
-
-        x += ((p >> 24) & 0xff) * kernel[k];
-        y += ((p >> 16) & 0xff) * kernel[k];
-        z += ((p >>  8) & 0xff) * kernel[k];
-        w += ((p >>  0) & 0xff) * kernel[k];
+void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height) {
+    for (int row = 0; row < height; row++) {
+        for (int column = 0; column < width; column++, src++) {
+            uint32_t rgbaIn[KERNEL_SIZE];
+
+            // handle borders
+            int leftBorder = column < HALF_KERNEL;
+            int rightBorder = column > width - HALF_KERNEL;
+            int i = 0;
+            if (leftBorder) {
+                // for kernel size 7x7 and column == 0, we have:
+                // x x x P0 P1 P2 P3
+                // first loop mirrors P{0..3} to fill x's,
+                // second one loads P{0..3}
+                for (; i < HALF_KERNEL - column; i++)
+                    rgbaIn[i] = *(src + (HALF_KERNEL - i));
+                for (; i < KERNEL_SIZE; i++)
+                    rgbaIn[i] = *(src - (HALF_KERNEL - i));
+            } else if (rightBorder) {
+                for (; i < width - column; i++)
+                    rgbaIn[i] = *(src + i);
+                for (int k = 0; i < KERNEL_SIZE; i++, k++)
+                    rgbaIn[i] = *(src - k);
+            } else {
+                for (; i < KERNEL_SIZE; i++)
+                    rgbaIn[i] = *(src + i - HALF_KERNEL);
+            }
+
+            uint32_t acc[4] = {0};
+
+            for (i = 0; i < KERNEL_SIZE; i++) {
+                acc[0] += (rgbaIn[i] & 0xFF000000) >> 24;
+                acc[1] += (rgbaIn[i] & 0x00FF0000) >> 16;
+                acc[2] += (rgbaIn[i] & 0x0000FF00) >> 8;
+                acc[3] += (rgbaIn[i] & 0x000000FF) >> 0;
+            }
+
+            for(i = 0; i < 4; i++)
+                acc[i] *= 1.0/KERNEL_SIZE;
+
+            *(dst + height * column + row) = (acc[0] << 24) |
+                                             (acc[1] << 16) |
+                                             (acc[2] << 8 ) |
+                                             (acc[3] << 0);
         }
-        d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
-    }
-    }
-
-    /* Then vertically blur from tmp -> surface */
-    for (i = 0; i < height; i++) {
-    s = (uint32_t *) (dst + i * dst_stride);
-    d = (uint32_t *) (src + i * src_stride);
-    for (j = 0; j < width; j++) {
-        if (radius <= i && i < height - radius) {
-        d[j] = s[j];
-        continue;
-        }
-
-        x = y = z = w = 0;
-        for (k = 0; k < size; k++) {
-        if (i - half + k < 0 || i - half + k >= height)
-            continue;
-
-        s = (uint32_t *) (dst + (i - half + k) * dst_stride);
-        p = s[j];
-
-        x += ((p >> 24) & 0xff) * kernel[k];
-        y += ((p >> 16) & 0xff) * kernel[k];
-        z += ((p >>  8) & 0xff) * kernel[k];
-        w += ((p >>  0) & 0xff) * kernel[k];
-        }
-        d[j] = (x / a << 24) | (y / a << 16) | (z / a << 8) | w / a;
-    }
     }
 }
-
diff --git a/blur.h b/blur.h
index 5e959a9..c1fabc0 100644
--- a/blur.h
+++ b/blur.h
@@ -4,11 +4,14 @@
 #include <stdint.h>
 #include <cairo.h>
 
+#define KERNEL_SIZE 7 
+#define SIGMA_AV 2
+#define HALF_KERNEL KERNEL_SIZE / 2
+
 void blur_image_surface (cairo_surface_t *surface, int radius);
-void blur_impl_naive(uint32_t* src, uint32_t* dst, int width, int height, int src_stride, int dst_stride, int radius);
 
-void blur_impl_sse2(uint32_t* src, uint32_t* dst, int width, int height, float sigma);
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height);
+void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height);
 
 #endif
 
diff --git a/blur_simd.c b/blur_simd.c
index b45ffdd..6861ce8 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -8,43 +8,12 @@
  */
 
 #include "blur.h"
-#include <math.h>
 #include <xmmintrin.h>
 
 #define ALIGN16 __attribute__((aligned(16)))
-#define KERNEL_SIZE 7 
-#define SIGMA_AV 2
-#define HALF_KERNEL KERNEL_SIZE / 2
-
-// number of xmm registers needed to store 
-// input pixels for given kernel size
+// number of xmm registers needed to store input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
-void blur_impl_sse2(uint32_t *src, uint32_t *dst, int width, int height, float sigma) {
-    // according to a paper by Peter Kovesi [1], box filter of width w, equals to Gaussian blur of following sigma:
-    // σ_av = sqrt((w*w-1)/12)
-    // for our 7x7 filter we have σ_av = 2.0.
-    // applying the same Gaussian filter n times results in σ_n = sqrt(n*σ_av*σ_av) [2]
-    // after some trivial math, we arrive at n = ((σ_d)/(σ_av))^2
-    // since it's a box blur filter, n >= 3
-    //
-    // [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
-    // [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
-    
-    int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV));
-    if (n < 3) n = 3;
-
-    for (int i = 0; i < n; i++)
-    {
-        // horizontal pass includes image transposition:
-        // instead of writing pixel src[x] to dst[x],
-        // we write it to transposed location.
-        // (to be exact: dst[height * current_column + current_row])
-        blur_impl_horizontal_pass_sse2(src, dst, width, height);
-        blur_impl_horizontal_pass_sse2(dst, src, height, width);
-    }
-}
-
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height) {
     for (int row = 0; row < height; row++) {
         for (int column = 0; column < width; column++, src++) {

From 024dc2980e8acc474b5dbbea8f806b899b6fd11b Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 11:27:43 +0100
Subject: [PATCH 20/25] Minor style changes

---
 blur.c      | 4 ++--
 blur_simd.c | 5 ++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/blur.c b/blur.c
index 070e5f2..04c9429 100644
--- a/blur.c
+++ b/blur.c
@@ -1,6 +1,6 @@
 /*
- * Copyright Â© 2008 Kristian HÃ¸gsberg
- * Copyright Â© 2009 Chris Wilson
+ * Copyright © 2008 Kristian Høgsberg
+ * Copyright © 2009 Chris Wilson
  *
  * Permission to use, copy, modify, distribute, and sell this software and its
  * documentation for any purpose is hereby granted without fee, provided that
diff --git a/blur_simd.c b/blur_simd.c
index 6861ce8..b654e98 100644
--- a/blur_simd.c
+++ b/blur_simd.c
@@ -10,7 +10,6 @@
 #include "blur.h"
 #include <xmmintrin.h>
 
-#define ALIGN16 __attribute__((aligned(16)))
 // number of xmm registers needed to store input pixels for given kernel size
 #define REGISTERS_CNT (KERNEL_SIZE + 4/2) / 4
 
@@ -22,7 +21,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int
             // handle borders
             int leftBorder = column < HALF_KERNEL;
             int rightBorder = column > width - HALF_KERNEL;
-            uint32_t _rgbaIn[KERNEL_SIZE] ALIGN16;
+            uint32_t _rgbaIn[KERNEL_SIZE] __attribute__((aligned(16)));
             int i = 0;
             if (leftBorder) {
                 // for kernel size 7x7 and column == 0, we have:
@@ -65,7 +64,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int
 
             // multiplication is significantly faster than division
             acc = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(acc),
-                                             _mm_set1_ps(1/((float)KERNEL_SIZE))));
+                                             _mm_set1_ps(1.0/KERNEL_SIZE)));
 
             *(dst + height * column + row) = 
                 _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packs_epi32(acc, zero), zero));

From 0f989add083cd53c85b57d1e2431b2090d708133 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 12:01:08 +0100
Subject: [PATCH 21/25] Pass blur strength from command line arguments

---
 blur.c   |  6 ++----
 blur.h   |  3 +--
 i3lock.c | 12 +++++++-----
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/blur.c b/blur.c
index 04c9429..f78f8cb 100644
--- a/blur.c
+++ b/blur.c
@@ -24,9 +24,9 @@
 #include <math.h>
 #include "blur.h"
 
-/* Performs a simple 2D Gaussian blur of radius @radius on surface @surface. */
+/* Performs a simple 2D Gaussian blur of standard devation @sigma surface @surface. */
 void
-blur_image_surface (cairo_surface_t *surface, int radius)
+blur_image_surface (cairo_surface_t *surface, int sigma)
 {
     cairo_surface_t *tmp;
     int width, height;
@@ -73,8 +73,6 @@ blur_image_surface (cairo_surface_t *surface, int radius)
     // [1]: http://www.peterkovesi.com/papers/FastGaussianSmoothing.pdf
     // [2]: https://en.wikipedia.org/wiki/Gaussian_blur#Mathematics
     
-    float sigma = 5;
-    
     int n = lrintf((sigma*sigma)/(SIGMA_AV*SIGMA_AV));
     if (n < 3) n = 3;
 
diff --git a/blur.h b/blur.h
index c1fabc0..dfc1c3d 100644
--- a/blur.h
+++ b/blur.h
@@ -8,8 +8,7 @@
 #define SIGMA_AV 2
 #define HALF_KERNEL KERNEL_SIZE / 2
 
-void blur_image_surface (cairo_surface_t *surface, int radius);
-
+void blur_image_surface(cairo_surface_t *surface, int sigma);
 void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, int width, int height);
 void blur_impl_horizontal_pass_generic(uint32_t *src, uint32_t *dst, int width, int height);
 
diff --git a/i3lock.c b/i3lock.c
index c6a6a45..9b50cde 100644
--- a/i3lock.c
+++ b/i3lock.c
@@ -79,7 +79,7 @@ char date_format[32] = "%A, %m %Y\0";
 /* opts for blurring */
 bool blur = false;
 bool step_blur = false;
-int blur_radius = 5;
+int blur_sigma = 5;
 
 uint32_t last_resolution[2];
 xcb_window_t win;
@@ -866,7 +866,7 @@ int main(int argc, char *argv[]) {
         {"timestr", required_argument, NULL, 0},
         {"datestr", required_argument, NULL, 0},
 
-        {"blur", no_argument, NULL, 'B'},
+        {"blur", required_argument, NULL, 'B'},
 
         {"ignore-empty-password", no_argument, NULL, 'e'},
         {"inactivity-timeout", required_argument, NULL, 'I'},
@@ -878,7 +878,7 @@ int main(int argc, char *argv[]) {
     if ((username = pw->pw_name) == NULL)
         errx(EXIT_FAILURE, "pw->pw_name is NULL.\n");
 
-    char *optstring = "hvnbdc:p:ui:teI:frsS:kB";
+    char *optstring = "hvnbdc:p:ui:teI:frsS:kB:";
     while ((o = getopt_long(argc, argv, optstring, longopts, &optind)) != -1) {
         switch (o) {
             case 'v':
@@ -950,6 +950,7 @@ int main(int argc, char *argv[]) {
                 break;
             case 'B':
                 blur = true;
+                blur_sigma = atoi(optarg);
                 break;
             case 0:
                 if (strcmp(longopts[optind].name, "debug") == 0)
@@ -1084,7 +1085,8 @@ int main(int argc, char *argv[]) {
                 break;
             default:
                 errx(EXIT_FAILURE, "Syntax: i3lock-color [-v] [-n] [-b] [-d] [-c color] [-u] [-p win|default]"
-                                   " [-i image.png] [-t] [-e] [-I timeout] [-f] [-r|s] [-S screen_number] [-k] [--fuckton-of-color-args=rrggbbaa]");
+                                   " [-i image.png] [-t] [-e] [-I timeout] [-f] [-r|s] [-S screen_number] [-k]"
+                                   " [-B blur_strength] [--fuckton-of-color-args=rrggbbaa]");
         }
     }
 
@@ -1207,7 +1209,7 @@ int main(int argc, char *argv[]) {
             cairo_destroy(ctx);
             cairo_surface_destroy(xcb_img);
         }
-        blur_image_surface(img, 10000);
+        blur_image_surface(img, blur_sigma);
     }
 
     /* Pixmap on which the image is rendered to (if any) */

From 863f621ff3465dda8dc13c71cb120c045f8d0802 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 12:11:28 +0100
Subject: [PATCH 22/25] Update readme

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 07f0e13..cb5aab3 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ Many little improvements have been made to i3lock over time:
      - `--line-uses-inside`, `-s` -- the line between the inside and outer ring uses the inside color for its color
      - `-S, --screen` -- specifies which display to draw the unlock indicator on
      - `-k, --clock` -- enables the clock display.
+     - `-B, --blur` -- enables Gaussian blur
      - `--timestr="%H:%M:%S"` -- allows custom overriding of the time format string. Accepts `strftime` formatting. Default is `"%H:%M:%S"`.
      - `--datestr="%A, %m %Y"` -- allows custom overriding of the date format string. Accepts `strftime` formatting. Default is `"%A, %m %Y"`.
   - All the colors have an alpha channel now. Please keep in mind that this was not intended when the program was originally written, so making things transparent that weren't before can make it look strange.

From bef18f2b74a51d97c5b22911a8932241f191eba4 Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 12:17:06 +0100
Subject: [PATCH 23/25] Update lock.sh

---
 lock.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lock.sh b/lock.sh
index ffbabe1..5cb04c7 100755
--- a/lock.sh
+++ b/lock.sh
@@ -36,5 +36,5 @@ V='#bb00bbbb'  # verifying
 --clock               \
 --timestr="%H:%M:%S"  \
 --datestr="%A, %m %Y" \
--B                    \
+--blur 5              \
 

From efcee548b2c0803c6730bc9ab075c986e4c1b11b Mon Sep 17 00:00:00 2001
From: Sebastian Frysztak <sebfry@gmail.com>
Date: Wed, 15 Feb 2017 12:22:44 +0100
Subject: [PATCH 24/25] Properly detect SSE2 on 32-bit systems

---
 blur.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blur.c b/blur.c
index f78f8cb..1838df4 100644
--- a/blur.c
+++ b/blur.c
@@ -82,7 +82,7 @@ blur_image_surface (cairo_surface_t *surface, int sigma)
         // instead of writing pixel src[x] to dst[x],
         // we write it to transposed location.
         // (to be exact: dst[height * current_column + current_row])
-#ifdef __x86_64__
+#ifdef __SSE2__
         blur_impl_horizontal_pass_sse2(src, dst, width, height);
         blur_impl_horizontal_pass_sse2(dst, src, height, width);
 #else

From b07ec97602aae607b16538cec4fb86eb1a9e6e37 Mon Sep 17 00:00:00 2001
From: Chris Guillott <chris@techfo.xyz>
Date: Fri, 17 Feb 2017 22:41:10 -0500
Subject: [PATCH 25/25] Update Makefile

---
 Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Makefile b/Makefile
index 713c7e0..6436254 100644
--- a/Makefile
+++ b/Makefile
@@ -16,6 +16,7 @@ CFLAGS += -pipe
 CFLAGS += -Wall
 CFLAGS += -O2
 SIMD_CFLAGS += -funroll-loops
+SIMD_CFLAGS += -msse2
 CPPFLAGS += -D_GNU_SOURCE
 CPPFLAGS += -DXKBCOMPOSE=$(shell if test -e /usr/include/xkbcommon/xkbcommon-compose.h ; then echo 1 ; else echo 0 ; fi )
 CFLAGS += $(shell $(PKG_CONFIG) --cflags cairo xcb-dpms xcb-xinerama xcb-atom xcb-image xcb-xkb xkbcommon xkbcommon-x11)