Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 /* | |
| 2 * Copyright 2016 Google Inc. | |
| 3 * | |
| 4 * Use of this source code is governed by a BSD-style license that can be | |
| 5 * found in the LICENSE file. | |
| 6 */ | |
| 7 | |
| 8 #ifndef SkBitmapFilter_opts_DEFINED | |
| 9 #define SkBitmapFilter_opts_DEFINED | |
| 10 | |
| 11 #include "SkConvolver.h" | |
| 12 | |
| 13 namespace SK_OPTS_NS { | |
| 14 | |
| 15 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 | |
| 16 | |
| 17 #include <emmintrin.h> | |
|
mtklein_C
2016/11/16 14:24:47
This will #include the SSE intrinsics into the SK_OPTS_NS namespace; it's probably best to move this #include to the top of the file, outside the namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
| 18 | |
| 19 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, | |
| 20 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i & accum, int r) { | |
| 21 int remainder[4] = {0}; | |
| 22 for (int i = 0; i < r; i++) { | |
| 23 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; | |
| 24 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; | |
| 25 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; | |
| 26 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; | |
| 27 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; | |
| 28 } | |
| 29 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem ainder[3]); | |
| 30 accum = _mm_add_epi32(accum, t); | |
| 31 } | |
| 32 | |
| 33 // Convolves horizontally along a single row. The row data is given in | |
| 34 // |srcData| and continues for the numValues() of the filter. | |
| 35 void convolve_horizontally(const unsigned char* srcData, | |
| 36 const SkConvolutionFilter1D& filter, | |
| 37 unsigned char* outRow, | |
| 38 bool /*hasAlpha*/) { | |
| 39 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
| 40 int numValues = filter.numValues(); | |
| 41 for (int outX = 0; outX < numValues; outX++) { | |
| 42 // Get the filter that determines the current output pixel. | |
| 43 int filterOffset, filterLength; | |
| 44 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
| 45 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
| 46 | |
| 47 // Compute the first pixel in this row that the filter affects. It w ill | |
| 48 // touch |filterLength| pixels (4 bytes each) after this. | |
| 49 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
| 50 | |
| 51 __m128i zero = _mm_setzero_si128(); | |
| 52 __m128i accum = _mm_setzero_si128(); | |
| 53 | |
| 54 // We will load and accumulate with four coefficients per iteration. | |
| 55 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { | |
| 56 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | |
| 57 __m128i coeff, coeff16; | |
| 58 // [16] xx xx xx xx c3 c2 c1 c0 | |
| 59 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
| 60 // [16] xx xx xx xx c1 c1 c0 c0 | |
| 61 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 62 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
| 63 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 64 | |
| 65 // Load four pixels => unpack the first two pixels to 16 bits => | |
| 66 // multiply with coefficients => accumulate the convolution resu lt. | |
| 67 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 68 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>( rowToFilter)); | |
| 69 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 70 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 71 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 72 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 73 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | |
| 74 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 75 accum = _mm_add_epi32(accum, t); | |
| 76 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | |
| 77 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 78 accum = _mm_add_epi32(accum, t); | |
| 79 | |
| 80 // Duplicate 3rd and 4th coefficients for all channels => | |
| 81 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe fficients | |
| 82 // => accumulate the convolution results. | |
| 83 // [16] xx xx xx xx c3 c3 c2 c2 | |
| 84 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 85 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
| 86 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 87 // [16] a3 g3 b3 r3 a2 g2 b2 r2 | |
| 88 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 89 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 90 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 91 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | |
| 92 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 93 accum = _mm_add_epi32(accum, t); | |
| 94 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | |
| 95 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 96 accum = _mm_add_epi32(accum, t); | |
| 97 | |
| 98 // Advance the pixel and coefficients pointers. | |
| 99 rowToFilter += 16; | |
| 100 filterValues += 4; | |
| 101 } | |
| 102 | |
| 103 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3 | |
| 104 // coefficients one at a time. | |
| 105 int r = filterLength & 3; | |
| 106 if (r) { | |
| 107 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
| 108 AccumRemainder(srcData + remainderOffset, filterValues, accum, r ); | |
| 109 } | |
| 110 | |
| 111 // Shift right for fixed point implementation. | |
| 112 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
| 113 | |
| 114 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
| 115 accum = _mm_packs_epi32(accum, zero); | |
| 116 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
| 117 accum = _mm_packus_epi16(accum, zero); | |
| 118 | |
| 119 // Store the pixel value of 32 bits. | |
| 120 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); | |
| 121 outRow += 4; | |
| 122 } | |
| 123 } | |
| 124 | |
| 125 // Convolves horizontally along four rows. The row data is given in | |
| 126 // |srcData| and continues for the numValues() of the filter. | |
| 127 // The algorithm is almost same as |convolve_horizontally|. Please | |
| 128 // refer to that function for detailed comments. | |
| 129 void convolve_4_rows_horizontally(const unsigned char* srcData[4], | |
| 130 const SkConvolutionFilter1D& filter, | |
| 131 unsigned char* outRow[4], | |
| 132 size_t outRowBytes) { | |
| 133 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) | |
| 134 | |
| 135 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
| 136 int numValues = filter.numValues(); | |
| 137 for (int outX = 0; outX < numValues; outX++) { | |
| 138 int filterOffset, filterLength; | |
| 139 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
| 140 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
| 141 | |
| 142 __m128i zero = _mm_setzero_si128(); | |
| 143 | |
| 144 // four pixels in a column per iteration. | |
| 145 __m128i accum0 = _mm_setzero_si128(); | |
| 146 __m128i accum1 = _mm_setzero_si128(); | |
| 147 __m128i accum2 = _mm_setzero_si128(); | |
| 148 __m128i accum3 = _mm_setzero_si128(); | |
| 149 | |
| 150 int start = filterOffset * 4; | |
| 151 // We will load and accumulate with four coefficients per iteration. | |
| 152 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { | |
| 153 __m128i coeff, coeff16lo, coeff16hi; | |
| 154 // [16] xx xx xx xx c3 c2 c1 c0 | |
| 155 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV alues)); | |
| 156 // [16] xx xx xx xx c1 c1 c0 c0 | |
| 157 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 158 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | |
| 159 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
| 160 // [16] xx xx xx xx c3 c3 c2 c2 | |
| 161 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 162 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | |
| 163 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
| 164 | |
| 165 __m128i src8, src16, mul_hi, mul_lo, t; | |
| 166 | |
| 167 #define ITERATION(src, accum) \ | |
| 168 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | |
| 169 src16 = _mm_unpacklo_epi8(src8, zero); \ | |
| 170 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | |
| 171 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | |
| 172 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
| 173 accum = _mm_add_epi32(accum, t); \ | |
| 174 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
| 175 accum = _mm_add_epi32(accum, t); \ | |
| 176 src16 = _mm_unpackhi_epi8(src8, zero); \ | |
| 177 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | |
| 178 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | |
| 179 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | |
| 180 accum = _mm_add_epi32(accum, t); \ | |
| 181 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | |
| 182 accum = _mm_add_epi32(accum, t) | |
| 183 | |
| 184 ITERATION(srcData[0] + start, accum0); | |
| 185 ITERATION(srcData[1] + start, accum1); | |
| 186 ITERATION(srcData[2] + start, accum2); | |
| 187 ITERATION(srcData[3] + start, accum3); | |
| 188 | |
| 189 start += 16; | |
| 190 filterValues += 4; | |
| 191 } | |
| 192 | |
| 193 int r = filterLength & 3; | |
| 194 if (r) { | |
| 195 int remainderOffset = (filterOffset + filterLength - r) * 4; | |
| 196 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum 0, r); | |
| 197 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum 1, r); | |
| 198 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum 2, r); | |
| 199 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum 3, r); | |
| 200 } | |
| 201 | |
| 202 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 203 accum0 = _mm_packs_epi32(accum0, zero); | |
| 204 accum0 = _mm_packus_epi16(accum0, zero); | |
| 205 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 206 accum1 = _mm_packs_epi32(accum1, zero); | |
| 207 accum1 = _mm_packus_epi16(accum1, zero); | |
| 208 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 209 accum2 = _mm_packs_epi32(accum2, zero); | |
| 210 accum2 = _mm_packus_epi16(accum2, zero); | |
| 211 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
| 212 accum3 = _mm_packs_epi32(accum3, zero); | |
| 213 accum3 = _mm_packus_epi16(accum3, zero); | |
| 214 | |
| 215 // We seem to be running off the edge here (chromium:491660). | |
| 216 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes ); | |
| 217 | |
| 218 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); | |
| 219 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); | |
| 220 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); | |
| 221 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); | |
| 222 | |
| 223 outRow[0] += 4; | |
| 224 outRow[1] += 4; | |
| 225 outRow[2] += 4; | |
| 226 outRow[3] += 4; | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 // Does vertical convolution to produce one output row. The filter values an d | |
| 231 // length are given in the first two parameters. These are applied to each | |
| 232 // of the rows pointed to in the |sourceDataRows| array, with each row | |
| 233 // being |pixelWidth| wide. | |
| 234 // | |
| 235 // The output must have room for |pixelWidth * 4| bytes. | |
| 236 template<bool hasAlpha> | |
| 237 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
| 238 int filterLength, | |
| 239 unsigned char* const* sourceDataRows, | |
| 240 int pixelWidth, | |
| 241 unsigned char* outRow) { | |
| 242 // Output four pixels per iteration (16 bytes). | |
| 243 int width = pixelWidth & ~3; | |
| 244 __m128i zero = _mm_setzero_si128(); | |
| 245 for (int outX = 0; outX < width; outX += 4) { | |
| 246 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
| 247 __m128i accum0 = _mm_setzero_si128(); | |
| 248 __m128i accum1 = _mm_setzero_si128(); | |
| 249 __m128i accum2 = _mm_setzero_si128(); | |
| 250 __m128i accum3 = _mm_setzero_si128(); | |
| 251 | |
| 252 // Convolve with one filter coefficient per iteration. | |
| 253 for (int filterY = 0; filterY < filterLength; filterY++) { | |
| 254 | |
| 255 // Duplicate the filter coefficient 8 times. | |
| 256 // [16] cj cj cj cj cj cj cj cj | |
| 257 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
| 258 | |
| 259 // Load four pixels (16 bytes) together. | |
| 260 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 261 const __m128i* src = reinterpret_cast<const __m128i*>( | |
| 262 &sourceDataRows[filterY][outX << 2]); | |
| 263 __m128i src8 = _mm_loadu_si128(src); | |
| 264 | |
| 265 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha nnels => | |
| 266 // multiply with current coefficient => accumulate the result. | |
| 267 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 268 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 269 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 270 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 271 // [32] a0 b0 g0 r0 | |
| 272 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 273 accum0 = _mm_add_epi32(accum0, t); | |
| 274 // [32] a1 b1 g1 r1 | |
| 275 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 276 accum1 = _mm_add_epi32(accum1, t); | |
| 277 | |
| 278 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha nnels => | |
| 279 // multiply with current coefficient => accumulate the result. | |
| 280 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 281 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 282 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 283 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 284 // [32] a2 b2 g2 r2 | |
| 285 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 286 accum2 = _mm_add_epi32(accum2, t); | |
| 287 // [32] a3 b3 g3 r3 | |
| 288 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 289 accum3 = _mm_add_epi32(accum3, t); | |
| 290 } | |
| 291 | |
| 292 // Shift right for fixed point implementation. | |
| 293 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 294 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 295 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 296 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
| 297 | |
| 298 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
| 299 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 300 accum0 = _mm_packs_epi32(accum0, accum1); | |
| 301 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 302 accum2 = _mm_packs_epi32(accum2, accum3); | |
| 303 | |
| 304 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
| 305 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 306 accum0 = _mm_packus_epi16(accum0, accum2); | |
| 307 | |
| 308 if (hasAlpha) { | |
| 309 // Compute the max(ri, gi, bi) for each pixel. | |
| 310 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 311 __m128i a = _mm_srli_epi32(accum0, 8); | |
| 312 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 313 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
| 314 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 315 a = _mm_srli_epi32(accum0, 16); | |
| 316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 317 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
| 318 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 319 b = _mm_slli_epi32(b, 24); | |
| 320 | |
| 321 // Make sure the value of alpha channel is always larger than ma ximum | |
| 322 // value of color channels. | |
| 323 accum0 = _mm_max_epu8(b, accum0); | |
| 324 } else { | |
| 325 // Set value of alpha channels to 0xFF. | |
| 326 __m128i mask = _mm_set1_epi32(0xff000000); | |
| 327 accum0 = _mm_or_si128(accum0, mask); | |
| 328 } | |
| 329 | |
| 330 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
| 331 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); | |
| 332 outRow += 16; | |
| 333 } | |
| 334 | |
| 335 // When the width of the output is not divisible by 4, We need to save o ne | |
| 336 // pixel (4 bytes) each time. And also the fourth pixel is always absent . | |
| 337 int r = pixelWidth & 3; | |
| 338 if (r) { | |
| 339 __m128i accum0 = _mm_setzero_si128(); | |
| 340 __m128i accum1 = _mm_setzero_si128(); | |
| 341 __m128i accum2 = _mm_setzero_si128(); | |
| 342 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
| 343 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); | |
| 344 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 345 const __m128i* src = reinterpret_cast<const __m128i*>( | |
| 346 &sourceDataRows[filterY][width << 2]); | |
| 347 __m128i src8 = _mm_loadu_si128(src); | |
| 348 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 349 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 350 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 351 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 352 // [32] a0 b0 g0 r0 | |
| 353 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 354 accum0 = _mm_add_epi32(accum0, t); | |
| 355 // [32] a1 b1 g1 r1 | |
| 356 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 357 accum1 = _mm_add_epi32(accum1, t); | |
| 358 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 359 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 360 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 361 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 362 // [32] a2 b2 g2 r2 | |
| 363 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 364 accum2 = _mm_add_epi32(accum2, t); | |
| 365 } | |
| 366 | |
| 367 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 368 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 369 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 370 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 371 accum0 = _mm_packs_epi32(accum0, accum1); | |
| 372 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 373 accum2 = _mm_packs_epi32(accum2, zero); | |
| 374 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 375 accum0 = _mm_packus_epi16(accum0, accum2); | |
| 376 if (hasAlpha) { | |
| 377 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 378 __m128i a = _mm_srli_epi32(accum0, 8); | |
| 379 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 380 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
| 381 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 382 a = _mm_srli_epi32(accum0, 16); | |
| 383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 384 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
| 385 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 386 b = _mm_slli_epi32(b, 24); | |
| 387 accum0 = _mm_max_epu8(b, accum0); | |
| 388 } else { | |
| 389 __m128i mask = _mm_set1_epi32(0xff000000); | |
| 390 accum0 = _mm_or_si128(accum0, mask); | |
| 391 } | |
| 392 | |
| 393 for (int i = 0; i < r; i++) { | |
| 394 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); | |
| 395 accum0 = _mm_srli_si128(accum0, 4); | |
| 396 outRow += 4; | |
| 397 } | |
| 398 } | |
| 399 } | |
| 400 | |
| 401 #elif defined(SK_ARM_HAS_NEON) | |
| 402 | |
| 403 #include <arm_neon.h> | |
|
mtklein_C
2016/11/16 14:24:47
Same deal with emmintrin.h. It's probably best to put these #includes at the top of the file, outside the SK_OPTS_NS namespace.
xiangze.zhang
2016/11/17 02:33:07
Done.
| |
| 404 | |
| 405 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, | |
| 406 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4 _t& accum, int r) { | |
| 407 int remainder[4] = {0}; | |
| 408 for (int i = 0; i < r; i++) { | |
| 409 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; | |
| 410 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; | |
| 411 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; | |
| 412 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; | |
| 413 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; | |
| 414 } | |
| 415 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; | |
| 416 accum += t; | |
| 417 } | |
| 418 | |
| 419 // Convolves horizontally along a single row. The row data is given in | |
| 420 // |srcData| and continues for the numValues() of the filter. | |
| 421 void convolve_horizontally(const unsigned char* srcData, | |
| 422 const SkConvolutionFilter1D& filter, | |
| 423 unsigned char* outRow, | |
| 424 bool /*hasAlpha*/) { | |
| 425 // Loop over each pixel on this row in the output image. | |
| 426 int numValues = filter.numValues(); | |
| 427 for (int outX = 0; outX < numValues; outX++) { | |
| 428 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | |
| 429 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | |
| 430 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | |
| 431 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | |
| 432 // Get the filter that determines the current output pixel. | |
| 433 int filterOffset, filterLength; | |
| 434 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
| 435 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
| 436 | |
| 437 // Compute the first pixel in this row that the filter affects. It w ill | |
| 438 // touch |filterLength| pixels (4 bytes each) after this. | |
| 439 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
| 440 | |
| 441 // Apply the filter to the row to get the destination pixel in |accu m|. | |
| 442 int32x4_t accum = vdupq_n_s32(0); | |
| 443 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { | |
| 444 // Load 4 coefficients | |
| 445 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | |
| 446 coeffs = vld1_s16(filterValues); | |
| 447 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask0)); | |
| 448 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask1)); | |
| 449 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask2)); | |
| 450 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask3)); | |
| 451 | |
| 452 // Load pixels and calc | |
| 453 uint8x16_t pixels = vld1q_u8(rowToFilter); | |
| 454 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pi xels))); | |
| 455 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(p ixels))); | |
| 456 | |
| 457 int16x4_t p0_src = vget_low_s16(p01_16); | |
| 458 int16x4_t p1_src = vget_high_s16(p01_16); | |
| 459 int16x4_t p2_src = vget_low_s16(p23_16); | |
| 460 int16x4_t p3_src = vget_high_s16(p23_16); | |
| 461 | |
| 462 int32x4_t p0 = vmull_s16(p0_src, coeff0); | |
| 463 int32x4_t p1 = vmull_s16(p1_src, coeff1); | |
| 464 int32x4_t p2 = vmull_s16(p2_src, coeff2); | |
| 465 int32x4_t p3 = vmull_s16(p3_src, coeff3); | |
| 466 | |
| 467 accum += p0; | |
| 468 accum += p1; | |
| 469 accum += p2; | |
| 470 accum += p3; | |
| 471 | |
| 472 // Advance the pointers | |
| 473 rowToFilter += 16; | |
| 474 filterValues += 4; | |
| 475 } | |
| 476 | |
| 477 int r = filterLength & 3; | |
| 478 if (r) { | |
| 479 int remainder_offset = (filterOffset + filterLength - r) * 4; | |
| 480 AccumRemainder(srcData + remainder_offset, filterValues, accum, r); | |
| 481 } | |
| 482 | |
| 483 // Bring this value back in range. All of the filter scaling factors | |
| 484 // are in fixed point with kShiftBits bits of fractional part. | |
| 485 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); | |
| 486 | |
| 487 // Pack and store the new pixel. | |
| 488 int16x4_t accum16 = vqmovn_s32(accum); | |
| 489 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); | |
| 490 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_ u8(accum8), 0); | |
| 491 outRow += 4; | |
| 492 } | |
| 493 } | |
| 494 | |
| 495 // Convolves horizontally along four rows. The row data is given in | |
| 496 // |srcData| and continues for the numValues() of the filter. | |
| 497 // The algorithm is almost same as |convolve_horizontally|. Please | |
| 498 // refer to that function for detailed comments. | |
| 499 void convolve_4_rows_horizontally(const unsigned char* srcData[4], | |
| 500 const SkConvolutionFilter1D& filter, | |
| 501 unsigned char* outRow[4], | |
| 502 size_t outRowBytes) { | |
| 503 // Output one pixel each iteration, calculating all channels (RGBA) toge ther. | |
| 504 int numValues = filter.numValues(); | |
| 505 for (int outX = 0; outX < numValues; outX++) { | |
| 506 | |
| 507 int filterOffset, filterLength; | |
| 508 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
| 509 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
| 510 | |
| 511 // four pixels in a column per iteration. | |
| 512 int32x4_t accum0 = vdupq_n_s32(0); | |
| 513 int32x4_t accum1 = vdupq_n_s32(0); | |
| 514 int32x4_t accum2 = vdupq_n_s32(0); | |
| 515 int32x4_t accum3 = vdupq_n_s32(0); | |
| 516 | |
| 517 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); | |
| 518 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); | |
| 519 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); | |
| 520 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); | |
| 521 | |
| 522 int start = filterOffset * 4; | |
| 523 | |
| 524 // We will load and accumulate with four coefficients per iteration. | |
| 525 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { | |
| 526 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; | |
| 527 | |
| 528 coeffs = vld1_s16(filterValues); | |
| 529 coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask0)); | |
| 530 coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask1)); | |
| 531 coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask2)); | |
| 532 coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs ), coeff_mask3)); | |
| 533 | |
| 534 uint8x16_t pixels; | |
| 535 int16x8_t p01_16, p23_16; | |
| 536 int32x4_t p0, p1, p2, p3; | |
| 537 | |
| 538 #define ITERATION(src, accum) \ | |
| 539 pixels = vld1q_u8(src); \ | |
| 540 p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); \ | |
| 541 p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \ | |
| 542 p0 = vmull_s16(vget_low_s16(p01_16), coeff0); \ | |
| 543 p1 = vmull_s16(vget_high_s16(p01_16), coeff1); \ | |
| 544 p2 = vmull_s16(vget_low_s16(p23_16), coeff2); \ | |
| 545 p3 = vmull_s16(vget_high_s16(p23_16), coeff3); \ | |
| 546 accum += p0; \ | |
| 547 accum += p1; \ | |
| 548 accum += p2; \ | |
| 549 accum += p3 | |
| 550 | |
| 551 ITERATION(srcData[0] + start, accum0); | |
| 552 ITERATION(srcData[1] + start, accum1); | |
| 553 ITERATION(srcData[2] + start, accum2); | |
| 554 ITERATION(srcData[3] + start, accum3); | |
| 555 | |
| 556 start += 16; | |
| 557 filterValues += 4; | |
| 558 } | |
| 559 | |
| 560 int r = filterLength & 3; | |
| 561 if (r) { | |
| 562 int remainder_offset = (filterOffset + filterLength - r) * 4; | |
| 563 AccumRemainder(srcData[0] + remainder_offset, filterValues, accu m0, r); | |
| 564 AccumRemainder(srcData[1] + remainder_offset, filterValues, accu m1, r); | |
| 565 AccumRemainder(srcData[2] + remainder_offset, filterValues, accu m2, r); | |
| 566 AccumRemainder(srcData[3] + remainder_offset, filterValues, accu m3, r); | |
| 567 } | |
| 568 | |
| 569 int16x4_t accum16; | |
| 570 uint8x8_t res0, res1, res2, res3; | |
| 571 | |
| 572 #define PACK_RESULT(accum, res) \ | |
| 573 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); \ | |
| 574 accum16 = vqmovn_s32(accum); \ | |
| 575 res = vqmovun_s16(vcombine_s16(accum16, accum16)); | |
| 576 | |
| 577 PACK_RESULT(accum0, res0); | |
| 578 PACK_RESULT(accum1, res1); | |
| 579 PACK_RESULT(accum2, res2); | |
| 580 PACK_RESULT(accum3, res3); | |
| 581 | |
| 582 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u 32_u8(res0), 0); | |
| 583 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u 32_u8(res1), 0); | |
| 584 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u 32_u8(res2), 0); | |
| 585 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u 32_u8(res3), 0); | |
| 586 outRow[0] += 4; | |
| 587 outRow[1] += 4; | |
| 588 outRow[2] += 4; | |
| 589 outRow[3] += 4; | |
| 590 } | |
| 591 } | |
| 592 | |
| 593 | |
| 594 // Does vertical convolution to produce one output row. The filter values an d | |
| 595 // length are given in the first two parameters. These are applied to each | |
| 596 // of the rows pointed to in the |sourceDataRows| array, with each row | |
| 597 // being |pixelWidth| wide. | |
| 598 // | |
| 599 // The output must have room for |pixelWidth * 4| bytes. | |
| 600 template<bool hasAlpha> | |
| 601 void convolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte rValues, | |
| 602 int filterLength, | |
| 603 unsigned char* const* sourceDataRows, | |
| 604 int pixelWidth, | |
| 605 unsigned char* outRow) { | |
| 606 int width = pixelWidth & ~3; | |
| 607 | |
| 608 // Output four pixels per iteration (16 bytes). | |
| 609 for (int outX = 0; outX < width; outX += 4) { | |
| 610 | |
| 611 // Accumulated result for each pixel. 32 bits per RGBA channel. | |
| 612 int32x4_t accum0 = vdupq_n_s32(0); | |
| 613 int32x4_t accum1 = vdupq_n_s32(0); | |
| 614 int32x4_t accum2 = vdupq_n_s32(0); | |
| 615 int32x4_t accum3 = vdupq_n_s32(0); | |
| 616 | |
| 617 // Convolve with one filter coefficient per iteration. | |
| 618 for (int filterY = 0; filterY < filterLength; filterY++) { | |
| 619 | |
| 620 // Duplicate the filter coefficient 4 times. | |
| 621 // [16] cj cj cj cj | |
| 622 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
| 623 | |
| 624 // Load four pixels (16 bytes) together. | |
| 625 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 626 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); | |
| 627 | |
| 628 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
| 629 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
| 630 int16x4_t src16_0 = vget_low_s16(src16_01); | |
| 631 int16x4_t src16_1 = vget_high_s16(src16_01); | |
| 632 int16x4_t src16_2 = vget_low_s16(src16_23); | |
| 633 int16x4_t src16_3 = vget_high_s16(src16_23); | |
| 634 | |
| 635 accum0 += vmull_s16(src16_0, coeff16); | |
| 636 accum1 += vmull_s16(src16_1, coeff16); | |
| 637 accum2 += vmull_s16(src16_2, coeff16); | |
| 638 accum3 += vmull_s16(src16_3, coeff16); | |
| 639 } | |
| 640 | |
| 641 // Shift right for fixed point implementation. | |
| 642 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 643 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 644 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 645 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); | |
| 646 | |
| 647 // Packing 32 bits |accum| to 16 bits per channel (signed saturation ). | |
| 648 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 649 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
| 650 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 651 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum3)); | |
| 652 | |
| 653 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio n). | |
| 654 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 655 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
| 656 | |
| 657 if (hasAlpha) { | |
| 658 // Compute the max(ri, gi, bi) for each pixel. | |
| 659 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 660 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
| 661 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 662 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
| 663 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 664 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
| 665 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 666 b = vmaxq_u8(a, b); // Max of r and g and b. | |
| 667 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 668 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
| 669 | |
| 670 // Make sure the value of alpha channel is always larger than ma ximum | |
| 671 // value of color channels. | |
| 672 accum8 = vmaxq_u8(b, accum8); | |
| 673 } else { | |
| 674 // Set value of alpha channels to 0xFF. | |
| 675 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
| 676 } | |
| 677 | |
| 678 // Store the convolution result (16 bytes) and advance the pixel poi nters. | |
| 679 vst1q_u8(outRow, accum8); | |
| 680 outRow += 16; | |
| 681 } | |
| 682 | |
| 683 // Process the leftovers when the width of the output is not divisible | |
| 684 // by 4, that is at most 3 pixels. | |
| 685 int r = pixelWidth & 3; | |
| 686 if (r) { | |
| 687 | |
| 688 int32x4_t accum0 = vdupq_n_s32(0); | |
| 689 int32x4_t accum1 = vdupq_n_s32(0); | |
| 690 int32x4_t accum2 = vdupq_n_s32(0); | |
| 691 | |
| 692 for (int filterY = 0; filterY < filterLength; ++filterY) { | |
| 693 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); | |
| 694 | |
| 695 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 696 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]) ; | |
| 697 | |
| 698 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8( src8))); | |
| 699 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8 (src8))); | |
| 700 int16x4_t src16_0 = vget_low_s16(src16_01); | |
| 701 int16x4_t src16_1 = vget_high_s16(src16_01); | |
| 702 int16x4_t src16_2 = vget_low_s16(src16_23); | |
| 703 | |
| 704 accum0 += vmull_s16(src16_0, coeff16); | |
| 705 accum1 += vmull_s16(src16_1, coeff16); | |
| 706 accum2 += vmull_s16(src16_2, coeff16); | |
| 707 } | |
| 708 | |
| 709 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 710 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 711 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 712 | |
| 713 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac cum1)); | |
| 714 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac cum2)); | |
| 715 | |
| 716 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16( accum16_1)); | |
| 717 | |
| 718 if (hasAlpha) { | |
| 719 // Compute the max(ri, gi, bi) for each pixel. | |
| 720 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 721 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3 2_u8(accum8), 8)); | |
| 722 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 723 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g | |
| 724 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 725 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8 ), 16)); | |
| 726 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 727 b = vmaxq_u8(a, b); // Max of r and g and b. | |
| 728 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 729 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24 )); | |
| 730 | |
| 731 // Make sure the value of alpha channel is always larger than ma ximum | |
| 732 // value of color channels. | |
| 733 accum8 = vmaxq_u8(b, accum8); | |
| 734 } else { | |
| 735 // Set value of alpha channels to 0xFF. | |
| 736 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu pq_n_u32(0xFF000000)); | |
| 737 } | |
| 738 | |
| 739 switch(r) { | |
| 740 case 1: | |
| 741 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret q_u32_u8(accum8), 0); | |
| 742 break; | |
| 743 case 2: | |
| 744 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
| 745 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
| 746 break; | |
| 747 case 3: | |
| 748 vst1_u32(reinterpret_cast<uint32_t*>(outRow), | |
| 749 vreinterpret_u32_u8(vget_low_u8(accum8))); | |
| 750 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpr etq_u32_u8(accum8), 2); | |
| 751 break; | |
| 752 } | |
| 753 } | |
| 754 } | |
| 755 | |
| 756 #else | |
| 757 | |
// Clamps |a| to the range [0, 255] and returns it as an 8-bit
// unsigned value.
inline unsigned char ClampTo8(int a) {
    // Common case: |a| is already a valid byte. The unsigned compare
    // rejects negative values in the same single test.
    if (static_cast<unsigned>(a) < 256) {
        return static_cast<unsigned char>(a);
    }
    return a < 0 ? 0 : 255;
}
| 769 | |
| 770 // Convolves horizontally along a single row. The row data is given in | |
| 771 // |srcData| and continues for the numValues() of the filter. | |
| 772 template<bool hasAlpha> | |
| 773 void ConvolveHorizontally(const unsigned char* srcData, | |
| 774 const SkConvolutionFilter1D& filter, | |
| 775 unsigned char* outRow) { | |
| 776 // Loop over each pixel on this row in the output image. | |
| 777 int numValues = filter.numValues(); | |
| 778 for (int outX = 0; outX < numValues; outX++) { | |
| 779 // Get the filter that determines the current output pixel. | |
| 780 int filterOffset, filterLength; | |
| 781 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = | |
| 782 filter.FilterForValue(outX, &filterOffset, &filterLength); | |
| 783 | |
| 784 // Compute the first pixel in this row that the filter affects. It w ill | |
| 785 // touch |filterLength| pixels (4 bytes each) after this. | |
| 786 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; | |
| 787 | |
| 788 // Apply the filter to the row to get the destination pixel in |accu m|. | |
| 789 int accum[4] = {0}; | |
| 790 for (int filterX = 0; filterX < filterLength; filterX++) { | |
| 791 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues [filterX]; | |
| 792 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; | |
| 793 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; | |
| 794 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; | |
| 795 if (hasAlpha) { | |
| 796 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; | |
| 797 } | |
| 798 } | |
| 799 | |
| 800 // Bring this value back in range. All of the filter scaling factors | |
| 801 // are in fixed point with kShiftBits bits of fractional part. | |
| 802 accum[0] >>= SkConvolutionFilter1D::kShiftBits; | |
| 803 accum[1] >>= SkConvolutionFilter1D::kShiftBits; | |
| 804 accum[2] >>= SkConvolutionFilter1D::kShiftBits; | |
| 805 if (hasAlpha) { | |
| 806 accum[3] >>= SkConvolutionFilter1D::kShiftBits; | |
| 807 } | |
| 808 | |
| 809 // Store the new pixel. | |
| 810 outRow[outX * 4 + 0] = ClampTo8(accum[0]); | |
| 811 outRow[outX * 4 + 1] = ClampTo8(accum[1]); | |
| 812 outRow[outX * 4 + 2] = ClampTo8(accum[2]); | |
| 813 if (hasAlpha) { | |
| 814 outRow[outX * 4 + 3] = ClampTo8(accum[3]); | |
| 815 } | |
| 816 } | |
| 817 } | |
| 818 | |
// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
template<bool hasAlpha>
void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
                        int filterLength,
                        unsigned char* const* sourceDataRows,
                        int pixelWidth,
                        unsigned char* outRow) {
    // We go through each column in the output and do a vertical convolution,
    // generating one output pixel each time.
    for (int outX = 0; outX < pixelWidth; outX++) {
        // Compute the number of bytes over in each row that the current column
        // we're convolving starts at. The pixel will cover the next 4 bytes.
        int byteOffset = outX * 4;

        // Apply the filter to one column of pixels, accumulating each
        // channel in fixed point.
        int accum[4] = {0};
        for (int filterY = 0; filterY < filterLength; filterY++) {
            SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
            accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
            accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
            accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
            if (hasAlpha) {
                accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
            }
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of precision.
        accum[0] >>= SkConvolutionFilter1D::kShiftBits;
        accum[1] >>= SkConvolutionFilter1D::kShiftBits;
        accum[2] >>= SkConvolutionFilter1D::kShiftBits;
        if (hasAlpha) {
            accum[3] >>= SkConvolutionFilter1D::kShiftBits;
        }

        // Store the new pixel. The color channels are written first so
        // the alpha clamp below can read them back from |outRow|.
        outRow[byteOffset + 0] = ClampTo8(accum[0]);
        outRow[byteOffset + 1] = ClampTo8(accum[1]);
        outRow[byteOffset + 2] = ClampTo8(accum[2]);
        if (hasAlpha) {
            unsigned char alpha = ClampTo8(accum[3]);

            // Make sure the alpha channel doesn't come out smaller than any of the
            // color channels. We use premultiplied alpha channels, so this should
            // never happen, but rounding errors will cause this from time to time.
            // These "impossible" colors will cause overflows (and hence random pixel
            // values) when the resulting bitmap is drawn to the screen.
            //
            // We only need to do this when generating the final output row (here).
            int maxColorChannel = SkTMax(outRow[byteOffset + 0],
                                         SkTMax(outRow[byteOffset + 1],
                                                outRow[byteOffset + 2]));
            if (alpha < maxColorChannel) {
                outRow[byteOffset + 3] = maxColorChannel;
            } else {
                outRow[byteOffset + 3] = alpha;
            }
        } else {
            // No alpha channel, the image is opaque.
            outRow[byteOffset + 3] = 0xff;
        }
    }
}
| 887 | |
// There's a bug somewhere here with GCC autovectorization (-ftree-vectorize). We originally
// thought this was 32 bit only, but subsequent tests show that some 64 bit gcc compiles
// suffer here too.
//
// Dropping to -O2 disables -ftree-vectorize.  GCC 4.6 needs noinline.  https://bug.skia.org/2575
#if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE)
    #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), noinline))
#else
    #define SK_MAYBE_DISABLE_VECTORIZATION
#endif

// Portable entry point for horizontal convolution: dispatches once on
// |has_alpha| so the per-pixel loops in ConvolveHorizontally<> carry no
// runtime branch. The attribute keeps GCC's tree-vectorizer away from
// this function (and anything inlined into it) — see the bug above.
SK_MAYBE_DISABLE_VECTORIZATION
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool has_alpha) {
    if (has_alpha) {
        ConvolveHorizontally<true>(srcData, filter, outRow);
    } else {
        ConvolveHorizontally<false>(srcData, filter, outRow);
    }
}
#undef SK_MAYBE_DISABLE_VECTORIZATION
| 911 | |
// Optional 4-rows-at-a-time horizontal convolver. Left null in this
// portable branch; NOTE(review): presumably a SIMD branch installs an
// implementation elsewhere, so callers must null-check before use —
// confirm against the calling code.
void (*convolve_4_rows_horizontally)(const unsigned char* src_data[4],
                                     const SkConvolutionFilter1D& filter,
                                     unsigned char* out_row[4],
                                     size_t out_row_bytes)
    = nullptr;
| 917 | |
| 918 | |
| 919 #endif | |
| 920 | |
| 921 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt er_values, | |
| 922 int filter_length, | |
| 923 unsigned char* const* source_data_rows, | |
| 924 int pixel_width, | |
| 925 unsigned char* out_row, | |
| 926 bool has_alpha) { | |
| 927 if (has_alpha) { | |
| 928 convolveVertically<true>(filter_values, filter_length, source_data_r ows, | |
| 929 pixel_width, out_row); | |
| 930 } else { | |
| 931 convolveVertically<false>(filter_values, filter_length, source_data_ rows, | |
| 932 pixel_width, out_row); | |
| 933 } | |
| 934 } | |
| 935 | |
| 936 } // namespace SK_OPTS_NS | |
| 937 | |
| 938 #endif//SkBitmapFilter_opts_DEFINED | |
| OLD | NEW |