@ -23,7 +23,7 @@
// scaling factor for kernel coefficients.
// scaling factor for kernel coefficients.
// higher values cause desaturation.
// higher values cause desaturation.
// used in SSSE3 implementation.
// used in SSSE3 implementation.
# define SCALE_FACTOR 7
# define SCALE_FACTOR 14
void blur_impl_sse2 ( uint32_t * src , uint32_t * dst , int width , int height , float sigma ) {
void blur_impl_sse2 ( uint32_t * src , uint32_t * dst , int width , int height , float sigma ) {
// prepare kernel
// prepare kernel
@ -127,7 +127,7 @@ void blur_impl_horizontal_pass_sse2(uint32_t *src, uint32_t *dst, float *kernel,
void blur_impl_ssse3 ( uint32_t * src , uint32_t * dst , int width , int height , float sigma ) {
void blur_impl_ssse3 ( uint32_t * src , uint32_t * dst , int width , int height , float sigma ) {
// prepare kernel
// prepare kernel
float kernelf [ KERNEL_SIZE ] ;
float kernelf [ KERNEL_SIZE ] ;
int8 _t kernel [ KERNEL_SIZE + 1 ] ;
int16 _t kernel [ KERNEL_SIZE + 1 ] ;
float coeff = 1.0 / sqrtf ( 2 * M_PI * sigma * sigma ) , sum = 0 ;
float coeff = 1.0 / sqrtf ( 2 * M_PI * sigma * sigma ) , sum = 0 ;
for ( int i = 0 ; i < KERNEL_SIZE ; i + + ) {
for ( int i = 0 ; i < KERNEL_SIZE ; i + + ) {
@ -142,7 +142,7 @@ void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float
// round to nearest integer and convert to int
// round to nearest integer and convert to int
for ( int i = 0 ; i < KERNEL_SIZE ; i + + )
for ( int i = 0 ; i < KERNEL_SIZE ; i + + )
kernel [ i ] = ( int8_t ) rintf ( kernelf [ i ] * ( 1 < < SCALE_FACTOR ) ) ;
kernel [ i ] = ( int16_t ) l rintf ( kernelf [ i ] * ( 1 < < SCALE_FACTOR ) ) ;
kernel [ KERNEL_SIZE ] = 0 ;
kernel [ KERNEL_SIZE ] = 0 ;
// horizontal pass includes image transposition:
// horizontal pass includes image transposition:
@ -154,8 +154,10 @@ void blur_impl_ssse3(uint32_t *src, uint32_t *dst, int width, int height, float
}
}
void blur_impl_horizontal_pass_ssse3 ( uint32_t * src , uint32_t * dst , int8_t * kernel , int width , int height ) {
void blur_impl_horizontal_pass_ssse3 ( uint32_t * src , uint32_t * dst , int16_t * kernel , int width , int height ) {
__m128i _kern = _mm_loadu_si128 ( ( __m128i * ) kernel ) ;
__m128i _kern [ 2 ] ;
_kern [ 0 ] = _mm_loadu_si128 ( ( __m128i * ) kernel ) ;
_kern [ 1 ] = _mm_loadu_si128 ( ( __m128i * ) ( kernel + 8 ) ) ;
__m128i rgbaIn [ REGISTERS_CNT ] ;
__m128i rgbaIn [ REGISTERS_CNT ] ;
for ( int row = 0 ; row < height ; row + + ) {
for ( int row = 0 ; row < height ; row + + ) {
@ -203,9 +205,10 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
// then we repeat the process for the rest of input pixels,
// then we repeat the process for the rest of input pixels,
// call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
// call _mm_hadds_epi16 to add adjacent ints and shift right to scale by SCALE_FACTOR.
__m128i rgba , kern ;
__m128i rgba , rg , ba , kern ;
__m128i zero = _mm_setzero_si128 ( ) ;
__m128i zero = _mm_setzero_si128 ( ) ;
__m128i acc = _mm_setzero_si128 ( ) ;
__m128i acc_rg = _mm_setzero_si128 ( ) ;
__m128i acc_ba = _mm_setzero_si128 ( ) ;
const __m128i rgba_shuf_mask = _mm_setr_epi8 ( 0 , 4 , 8 , 12 ,
const __m128i rgba_shuf_mask = _mm_setr_epi8 ( 0 , 4 , 8 , 12 ,
1 , 5 , 9 , 13 ,
1 , 5 , 9 , 13 ,
@ -213,28 +216,40 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
3 , 7 , 11 , 15 ) ;
3 , 7 , 11 , 15 ) ;
const __m128i kern_shuf_mask = _mm_setr_epi8 ( 0 , 1 , 2 , 3 ,
const __m128i kern_shuf_mask = _mm_setr_epi8 ( 0 , 1 , 2 , 3 ,
4 , 5 , 6 , 7 ,
0 , 1 , 2 , 3 ,
0 , 1 , 2 , 3 ,
0 , 1 , 2 , 3 ,
4 , 5 , 6 , 7 ) ;
0 , 1 , 2 , 3 ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 0 ] , rgba_shuf_mask ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 0 ] , rgba_shuf_mask ) ;
kern = _mm_shuffle_epi8 ( _kern , kern_shuf_mask ) ;
rg = _mm_unpacklo_epi8 ( rgba , zero ) ;
acc = _mm_adds_epi16 ( acc , _mm_maddubs_epi16 ( rgba , kern ) ) ;
ba = _mm_unpackhi_epi8 ( rgba , zero ) ;
kern = _mm_shuffle_epi8 ( _kern [ 0 ] , kern_shuf_mask ) ;
acc_rg = _mm_add_epi32 ( acc_rg , _mm_madd_epi16 ( rg , kern ) ) ;
acc_ba = _mm_add_epi32 ( acc_ba , _mm_madd_epi16 ( ba , kern ) ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 1 ] , rgba_shuf_mask ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 1 ] , rgba_shuf_mask ) ;
kern = _mm_shuffle_epi8 ( _mm_srli_si128 ( _kern , 4 ) , kern_shuf_mask ) ;
rg = _mm_unpacklo_epi8 ( rgba , zero ) ;
acc = _mm_adds_epi16 ( acc , _mm_maddubs_epi16 ( rgba , kern ) ) ;
ba = _mm_unpackhi_epi8 ( rgba , zero ) ;
kern = _mm_shuffle_epi8 ( _mm_srli_si128 ( _kern [ 0 ] , 8 ) , kern_shuf_mask ) ;
acc_rg = _mm_add_epi32 ( acc_rg , _mm_madd_epi16 ( rg , kern ) ) ;
acc_ba = _mm_add_epi32 ( acc_ba , _mm_madd_epi16 ( ba , kern ) ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 2 ] , rgba_shuf_mask ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 2 ] , rgba_shuf_mask ) ;
kern = _mm_shuffle_epi8 ( _mm_srli_si128 ( _kern , 8 ) , kern_shuf_mask ) ;
rg = _mm_unpacklo_epi8 ( rgba , zero ) ;
acc = _mm_adds_epi16 ( acc , _mm_maddubs_epi16 ( rgba , kern ) ) ;
ba = _mm_unpackhi_epi8 ( rgba , zero ) ;
kern = _mm_shuffle_epi8 ( _kern [ 1 ] , kern_shuf_mask ) ;
acc_rg = _mm_add_epi32 ( acc_rg , _mm_madd_epi16 ( rg , kern ) ) ;
acc_ba = _mm_add_epi32 ( acc_ba , _mm_madd_epi16 ( ba , kern ) ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 3 ] , rgba_shuf_mask ) ;
rgba = _mm_shuffle_epi8 ( rgbaIn [ 3 ] , rgba_shuf_mask ) ;
kern = _mm_shuffle_epi8 ( _mm_srli_si128 ( _kern , 12 ) , kern_shuf_mask ) ;
rg = _mm_unpacklo_epi8 ( rgba , zero ) ;
acc = _mm_adds_epi16 ( acc , _mm_maddubs_epi16 ( rgba , kern ) ) ;
ba = _mm_unpackhi_epi8 ( rgba , zero ) ;
kern = _mm_shuffle_epi8 ( _mm_srli_si128 ( _kern [ 1 ] , 8 ) , kern_shuf_mask ) ;
acc_rg = _mm_add_epi32 ( acc_rg , _mm_madd_epi16 ( rg , kern ) ) ;
acc_ba = _mm_add_epi32 ( acc_ba , _mm_madd_epi16 ( ba , kern ) ) ;
acc = _mm_hadds_epi16 ( acc , zero ) ;
rgb a = _mm_hadd_epi32 ( acc_rg , acc_ba ) ;
acc = _mm_srai_epi16 ( acc , SCALE_FACTOR ) ;
rgb a = _mm_srai_epi32 ( rgba , SCALE_FACTOR ) ;
// Cairo sets alpha channel to 255
// Cairo sets alpha channel to 255
// (or -1, depending how you look at it)
// (or -1, depending how you look at it)
@ -244,7 +259,7 @@ void blur_impl_horizontal_pass_ssse3(uint32_t *src, uint32_t *dst, int8_t *kerne
// lock screen images, so no one will mind if we
// lock screen images, so no one will mind if we
// 'correct it' by setting alpha to 255.
// 'correct it' by setting alpha to 255.
* ( dst + height * column + row ) =
* ( dst + height * column + row ) =
_mm_cvtsi128_si32 ( _mm_packus_epi16 ( acc , zero ) ) | 0xFF000000 ;
_mm_cvtsi128_si32 ( _mm_shuffle_epi8 ( rgba , rgba_shuf_mask ) ) ;
}
}
}
}
}
}