Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright 2016 Google Inc. | 2 * Copyright 2016 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #ifndef SkBitmapFilter_opts_DEFINED | 8 #ifndef SkBitmapFilter_opts_DEFINED |
| 9 #define SkBitmapFilter_opts_DEFINED | 9 #define SkBitmapFilter_opts_DEFINED |
| 10 | 10 |
| 11 #include "SkConvolver.h" | 11 #include "SkConvolver.h" |
| 12 | 12 |
| 13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
| 14 #include <immintrin.h> | |
|
mtklein_C
2016/12/05 18:42:46
I think we can just use immintrin.h for everything
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 15 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 14 #include <emmintrin.h> | 16 #include <emmintrin.h> |
| 15 #elif defined(SK_ARM_HAS_NEON) | 17 #elif defined(SK_ARM_HAS_NEON) |
| 16 #include <arm_neon.h> | 18 #include <arm_neon.h> |
| 17 #endif | 19 #endif |
| 18 | 20 |
| 19 namespace SK_OPTS_NS { | 21 namespace SK_OPTS_NS { |
| 20 | 22 |
| 21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | 23 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2 |
| 24 | |
| 25 static SK_ALWAYS_INLINE | |
| 26 void ComputeCoefficientRow(SkConvolutionFilter1D::ConvolutionFixed filterVal ue, const unsigned char* sourceDataRows, | |
|
mtklein_C
2016/12/05 18:42:45
Typically we name static functions with lower_case
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 27 __m256i& Y01, __m256i& Y23, __m256i& Y45, __m256i & Y67) { | |
| 28 __m256i coefs = _mm256_set1_epi16(filterValue); | |
| 29 __m256i pixels = _mm256_lddqu_si256(reinterpret_cast<const __m256i *>(so urceDataRows)); | |
|
mtklein_C
2016/12/05 18:42:45
I have always shied away from lddqu, instead just
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 30 __m256i zero = _mm256_setzero_si256(); | |
| 31 | |
| 32 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 33 __m256i Yt1 = _mm256_unpacklo_epi8(pixels, zero); | |
|
mtklein_C
2016/12/05 18:42:45
Does Yt1 mean ymm, temporary #1?
We usually find
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 34 // [32] cja3 cjb3 cjg3 cjr3 cja2 cjb2 cjg2 cjr2 | |
| 35 __m256i Yt2 = _mm256_mulhi_epi16(Yt1, coefs); | |
| 36 // [32] cja1 cjb1 cjg1 cjr1 cja0 cjb0 cjg0 cjr0 | |
| 37 Yt1 = _mm256_mullo_epi16(Yt1, coefs); | |
| 38 | |
| 39 // pixel 0 and 1 | |
| 40 __m256i Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2); | |
| 41 Y01 = _mm256_add_epi32(Y01, Yt3); | |
| 42 // pixel 2 and 3 | |
| 43 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2); | |
| 44 Y23 = _mm256_add_epi32(Y23, Yt3); | |
| 45 | |
| 46 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 | |
| 47 Yt1 = _mm256_unpackhi_epi8(pixels, zero); | |
| 48 // [32] cja7 cjb7 cjg7 cjr7 cja6 cjb6 cjg6 cjr6 | |
| 49 Yt2 = _mm256_mulhi_epi16(Yt1, coefs); | |
| 50 // [32] cja5 cjb5 cjg5 cjr5 cja4 cjb4 cjg4 cjr4 | |
| 51 Yt1 = _mm256_mullo_epi16(Yt1, coefs); | |
| 52 | |
| 53 // pixel 4 and 5 | |
| 54 Yt3 = _mm256_unpacklo_epi16(Yt1, Yt2); | |
| 55 Y45 = _mm256_add_epi32(Y45, Yt3); | |
| 56 // pixel 6 and 7 | |
| 57 Yt3 = _mm256_unpackhi_epi16(Yt1, Yt2); | |
| 58 Y67 = _mm256_add_epi32(Y67, Yt3); | |
| 59 } | |
| 60 | |
| 61 template<bool hasAlpha> | |
| 62 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
| 63 int filterLength, | |
| 64 unsigned char* const * sourceDataRows, | |
| 65 int pixelWidth, | |
| 66 unsigned char* outRow) { | |
| 67 | |
| 68 int outX, filterY; | |
| 69 int width = pixelWidth & ~7; | |
| 70 int length = filterLength & ~3; | |
| 71 | |
| 72 __m256i Yt0, Yt1, Yt2; | |
| 73 | |
| 74 // Output eight pixels per iteration (32 bytes). | |
| 75 for (outX = 0; outX < width; outX += 8) { | |
| 76 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
| 77 __m256i Y01 = _mm256_setzero_si256(); | |
| 78 __m256i Y23 = _mm256_setzero_si256(); | |
| 79 __m256i Y45 = _mm256_setzero_si256(); | |
| 80 __m256i Y67 = _mm256_setzero_si256(); | |
| 81 | |
| 82 // Convolve with 4 filter coefficient per iteration. | |
| 83 for (int filterY = 0; filterY < length; filterY += 4) { | |
| 84 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67); | |
|
mtklein_C
2016/12/05 18:42:46
This might line up neater with a couple "+ 0".
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 85 ComputeCoefficientRow(filterValues[filterY + 1], sourceDataRows[ filterY + 1] + outX * 4, Y01, Y23, Y45, Y67); | |
| 86 ComputeCoefficientRow(filterValues[filterY + 2], sourceDataRows[ filterY + 2] + outX * 4, Y01, Y23, Y45, Y67); | |
| 87 ComputeCoefficientRow(filterValues[filterY + 3], sourceDataRows[ filterY + 3] + outX * 4, Y01, Y23, Y45, Y67); | |
| 88 } | |
| 89 for (filterY = length; filterY < filterLength; filterY++) { | |
| 90 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67); | |
| 91 } | |
| 92 | |
| 93 // Shift right for fixed point implementation. | |
| 94 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits); | |
| 95 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits); | |
| 96 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits); | |
| 97 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits); | |
| 98 | |
| 99 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
| 100 // [16] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 101 Yt0 = _mm256_packs_epi32(Y01, Y23); | |
| 102 | |
| 103 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
| 104 // [16] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 | |
| 105 Yt1 = _mm256_packs_epi32(Y45, Y67); | |
| 106 | |
| 107 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
| 108 // [8] a7 b7 g7 r7 a6 b6 g6 r6 a5 b5 g5 r5 a4 b4 g4 r4 a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 109 Yt0 = _mm256_packus_epi16(Yt0, Yt1); | |
| 110 | |
| 111 if (hasAlpha) { | |
| 112 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 113 Yt1 = _mm256_srli_epi32(Yt0, 8); | |
| 114 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 115 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g. | |
| 116 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 117 Yt1 = _mm256_srli_epi32(Yt0, 16); | |
| 118 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 119 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b. | |
| 120 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 121 Yt2 = _mm256_slli_epi32(Yt2, 24); | |
| 122 | |
| 123 // Make sure the value of alpha channel is always larger than ma ximum | |
|
mtklein_C
2016/12/05 18:42:46
This comment might be better moved up just under i
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 124 // value of color channels. | |
| 125 Yt0 = _mm256_max_epu8(Yt2, Yt0); | |
| 126 } else { | |
| 127 __m256i mask = _mm256_set1_epi32(0xff000000); | |
| 128 Yt0 = _mm256_or_si256(Yt0, mask); | |
| 129 } | |
| 130 | |
| 131 // Store the convolution result (32 bytes) and advance the pixel poi nters. | |
| 132 _mm256_storeu_si256(reinterpret_cast<__m256i *>(outRow), Yt0); | |
| 133 outRow += 32; | |
| 134 } | |
| 135 | |
| 136 if (pixelWidth & 7) { | |
| 137 __m256i Y01 = _mm256_setzero_si256(); | |
| 138 __m256i Y23 = _mm256_setzero_si256(); | |
| 139 __m256i Y45 = _mm256_setzero_si256(); | |
| 140 __m256i Y67 = _mm256_setzero_si256(); | |
| 141 | |
| 142 for (int filterY = 0; filterY < filterLength; filterY++) { | |
| 143 ComputeCoefficientRow(filterValues[filterY], sourceDataRows[filt erY] + outX * 4, Y01, Y23, Y45, Y67); | |
| 144 } | |
| 145 | |
| 146 Y01 = _mm256_srai_epi32(Y01, SkConvolutionFilter1D::kShiftBits); | |
| 147 Y23 = _mm256_srai_epi32(Y23, SkConvolutionFilter1D::kShiftBits); | |
| 148 Y45 = _mm256_srai_epi32(Y45, SkConvolutionFilter1D::kShiftBits); | |
| 149 Y67 = _mm256_srai_epi32(Y67, SkConvolutionFilter1D::kShiftBits); | |
| 150 | |
| 151 Yt0 = _mm256_packs_epi32(Y01, Y23); | |
| 152 Yt1 = _mm256_packs_epi32(Y45, Y67); | |
| 153 Yt0 = _mm256_packus_epi16(Yt0, Yt1); | |
| 154 | |
| 155 if (hasAlpha) { | |
| 156 Yt1 = _mm256_srli_epi32(Yt0, 8); | |
| 157 Yt2 = _mm256_max_epu8(Yt1, Yt0); // Max of r and g. | |
|
mtklein_C
2016/12/05 18:42:45
This is the sort of code that using more variables
xiangze.zhang
2016/12/07 11:17:01
Done.
| |
| 158 Yt1 = _mm256_srli_epi32(Yt0, 16); | |
| 159 Yt2 = _mm256_max_epu8(Yt1, Yt2); // Max of r and g and b. | |
| 160 Yt2 = _mm256_slli_epi32(Yt2, 24); | |
| 161 Yt0 = _mm256_max_epu8(Yt2, Yt0); | |
| 162 } else { | |
| 163 __m256i mask = _mm256_set1_epi32(0xff000000); | |
| 164 Yt0 = _mm256_or_si256(Yt0, mask); | |
| 165 } | |
| 166 | |
| 167 for (int i = width; i < pixelWidth; i++) { | |
|
mtklein_C
2016/12/05 18:42:46
I'd have expected a call to _mm256_maskstore_epi32
xiangze.zhang
2016/12/07 11:17:01
I tried something like this:
__m256i mask = _m
| |
| 168 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(_mm256_cas tsi256_si128(Yt0)); | |
| 169 __m256i rotate = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); | |
| 170 Yt0 = _mm256_permutevar8x32_epi32(Yt0, rotate); | |
| 171 outRow += 4; | |
| 172 } | |
| 173 } | |
| 174 } | |
| 175 | |
| 176 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 22 | 177 |
| 23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, | 178 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, |
| 24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { | 179 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { |
| 25 int remainder[4] = {0}; | 180 int remainder[4] = {0}; |
| 26 for (int i = 0; i < r; i++) { | 181 for (int i = 0; i < r; i++) { |
| 27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; | 182 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; |
| 28 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; | 183 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; |
| 29 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; | 184 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; |
| 30 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; | 185 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; |
| 31 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; | 186 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; |
| (...skipping 899 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 931 pixelWidth, outRow); | 1086 pixelWidth, outRow); |
| 932 } else { | 1087 } else { |
| 933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows , | 1088 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows , |
| 934 pixelWidth, outRow); | 1089 pixelWidth, outRow); |
| 935 } | 1090 } |
| 936 } | 1091 } |
| 937 | 1092 |
| 938 } // namespace SK_OPTS_NS | 1093 } // namespace SK_OPTS_NS |
| 939 | 1094 |
| 940 #endif//SkBitmapFilter_opts_DEFINED | 1095 #endif//SkBitmapFilter_opts_DEFINED |
| OLD | NEW |