Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1243)

Side by Side Diff: src/opts/SkBlurImage_opts_SSE4.cpp

Issue 1123263003: Really use SSE4 (and SSSE3) in SkBlurImage_SSE4 (Closed) Base URL: https://skia.googlesource.com/skia.git@master
Patch Set: 0 Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2014 The Android Open Source Project 2 * Copyright 2014 The Android Open Source Project
3 * 3 *
4 * Use of this source code is governed by a BSD-style license that can be 4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file. 5 * found in the LICENSE file.
6 */ 6 */
7 7
8 #include "SkBitmap.h" 8 #include "SkBitmap.h"
9 #include "SkBlurImage_opts_SSE4.h" 9 #include "SkBlurImage_opts_SSE4.h"
10 #include "SkColorPriv.h" 10 #include "SkColorPriv.h"
11 #include "SkRect.h" 11 #include "SkRect.h"
12 12
13 /* With the exception of the compilers that don't support it, we always build the 13 /* With the exception of the compilers that don't support it, we always build the
14 * SSE4 functions and enable the caller to determine SSE4 support. However for 14 * SSE4 functions and enable the caller to determine SSE4 support. However for
15 * compilers that do not support SSE4x we provide a stub implementation. 15 * compilers that do not support SSE4x we provide a stub implementation.
16 */ 16 */
17 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41 17 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
18 18
19 #include <smmintrin.h> 19 #include <smmintrin.h>
20 20
21 namespace { 21 namespace {
22 enum BlurDirection { 22 enum BlurDirection {
23 kX, kY 23 kX, kY
24 }; 24 };
25 25
26 /* Helper function to spread the components of a 32-bit integer into the 26 /* Helper function to spread the components of a 32-bit integer into the
27 * lower 8 bits of each 32-bit element of an SSE register. 27 * lower 8 bits of each 32-bit element of an SSE register.
28 */ 28 */
29 inline __m128i expand(int a) { 29 inline __m128i expand(int a) {
30 const __m128i zero = _mm_setzero_si128(); 30 // ARGB -> 0000 0000 0000 ARGB
31 31 __m128i widened = _mm_cvtsi32_si128(a);
32 // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B 32 // SSE4.1 has xxxx xxxx xxxx ARGB -> 000A 000R 000G 000B as a one-stop-shop instruction.
33 __m128i result = _mm_cvtsi32_si128(a); 33 // It can even work from memory, so a smart compiler probably merges in the _mm_cvtsi32_si128().
34 34 return _mm_cvtepu8_epi32(widened);
35 // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B
36 result = _mm_unpacklo_epi8(result, zero);
37
38 // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B
39 return _mm_unpacklo_epi16(result, zero);
40 } 35 }
41 36
42 template<BlurDirection srcDirection, BlurDirection dstDirection> 37 template<BlurDirection srcDirection, BlurDirection dstDirection>
43 void SkBoxBlur_SSE4(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize, 38 void SkBoxBlur_SSE4(const SkPMColor* src, int srcStride, SkPMColor* dst, int kernelSize,
44 int leftOffset, int rightOffset, int width, int height) 39 int leftOffset, int rightOffset, int width, int height)
45 { 40 {
46 const int rightBorder = SkMin32(rightOffset + 1, width); 41 const int rightBorder = SkMin32(rightOffset + 1, width);
47 const int srcStrideX = srcDirection == kX ? 1 : srcStride; 42 const int srcStrideX = srcDirection == kX ? 1 : srcStride;
48 const int dstStrideX = dstDirection == kX ? 1 : height; 43 const int dstStrideX = dstDirection == kX ? 1 : height;
49 const int srcStrideY = srcDirection == kX ? srcStride : 1; 44 const int srcStrideY = srcDirection == kX ? srcStride : 1;
50 const int dstStrideY = dstDirection == kX ? width : 1; 45 const int dstStrideY = dstDirection == kX ? width : 1;
51 const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize); 46 const __m128i scale = _mm_set1_epi32((1 << 24) / kernelSize);
52 const __m128i half = _mm_set1_epi32(1 << 23); 47 const __m128i half = _mm_set1_epi32(1 << 23);
53 const __m128i zero = _mm_setzero_si128();
54 for (int y = 0; y < height; ++y) { 48 for (int y = 0; y < height; ++y) {
55 __m128i sum = zero; 49 __m128i sum = _mm_setzero_si128();
56 const SkPMColor* p = src; 50 const SkPMColor* p = src;
57 for (int i = 0; i < rightBorder; ++i) { 51 for (int i = 0; i < rightBorder; ++i) {
58 sum = _mm_add_epi32(sum, expand(*p)); 52 sum = _mm_add_epi32(sum, expand(*p));
59 p += srcStrideX; 53 p += srcStrideX;
60 } 54 }
61 55
62 const SkPMColor* sptr = src; 56 const SkPMColor* sptr = src;
63 SkColor* dptr = dst; 57 SkColor* dptr = dst;
64 for (int x = 0; x < width; ++x) { 58 for (int x = 0; x < width; ++x) {
59 // TODO(mtklein): We are working in 8.24 here. Drop to 8.8 when the kernel is narrow?
60
61 // Multiply each component by scale (i.e. divide by kernel size) and add half to round.
65 __m128i result = _mm_mullo_epi32(sum, scale); 62 __m128i result = _mm_mullo_epi32(sum, scale);
66
67 // sumA*scale+.5 sumB*scale+.5 sumG*scale+.5 sumB*scale+.5
68 result = _mm_add_epi32(result, half); 63 result = _mm_add_epi32(result, half);
69 64
70 // 0 0 0 A 0 0 0 R 0 0 0 G 0 0 0 B 65 // Now pack the top byte of each 32-bit lane back down into one 32-bit color.
71 result = _mm_srli_epi32(result, 24); 66 // Axxx Rxxx Gxxx Bxxx -> xxxx xxxx xxxx ARGB
67 const char _ = 0; // Don't care what ends up in these bytes. Happens to be byte 0.
68 result = _mm_shuffle_epi8(result, _mm_set_epi8(_,_,_,_, _,_,_,_, _,_,_,_, 15,11,7,3));
72 69
73 // 0 0 0 0 0 0 0 0 0 A 0 R 0 G 0 B 70 *dptr = _mm_cvtsi128_si32(result);
74 result = _mm_packs_epi32(result, zero);
75 71
76 // 0 0 0 0 0 0 0 0 0 0 0 0 A R G B 72 // TODO(mtklein): experiment with breaking this loop into 3 parts
77 result = _mm_packus_epi16(result, zero);
78 *dptr = _mm_cvtsi128_si32(result);
79 if (x >= leftOffset) { 73 if (x >= leftOffset) {
80 SkColor l = *(sptr - leftOffset * srcStrideX); 74 SkColor l = *(sptr - leftOffset * srcStrideX);
81 sum = _mm_sub_epi32(sum, expand(l)); 75 sum = _mm_sub_epi32(sum, expand(l));
82 } 76 }
83 if (x + rightOffset + 1 < width) { 77 if (x + rightOffset + 1 < width) {
84 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX); 78 SkColor r = *(sptr + (rightOffset + 1) * srcStrideX);
85 sum = _mm_add_epi32(sum, expand(r)); 79 sum = _mm_add_epi32(sum, expand(r));
86 } 80 }
87 sptr += srcStrideX; 81 sptr += srcStrideX;
88 if (srcDirection == kY) { 82 if (srcDirection == kY) {
83 // TODO(mtklein): experiment with moving this prefetch forward
89 _mm_prefetch(reinterpret_cast<const char*>(sptr + (rightOffset + 1) * srcStrideX), 84 _mm_prefetch(reinterpret_cast<const char*>(sptr + (rightOffset + 1) * srcStrideX),
90 _MM_HINT_T0); 85 _MM_HINT_T0);
91 } 86 }
92 dptr += dstStrideX; 87 dptr += dstStrideX;
93 } 88 }
94 src += srcStrideY; 89 src += srcStrideY;
95 dst += dstStrideY; 90 dst += dstStrideY;
96 } 91 }
97 } 92 }
98 93
(...skipping 15 matching lines...) Expand all
114 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX, 109 bool SkBoxBlurGetPlatformProcs_SSE4(SkBoxBlurProc* boxBlurX,
115 SkBoxBlurProc* boxBlurY, 110 SkBoxBlurProc* boxBlurY,
116 SkBoxBlurProc* boxBlurXY, 111 SkBoxBlurProc* boxBlurXY,
117 SkBoxBlurProc* boxBlurYX) { 112 SkBoxBlurProc* boxBlurYX) {
118 sk_throw(); 113 sk_throw();
119 return false; 114 return false;
120 } 115 }
121 116
122 117
123 #endif 118 #endif
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698