OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright 2016 Google Inc. |
| 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. |
| 6 */ |
| 7 |
| 8 #ifndef SkBitmapFilter_opts_DEFINED |
| 9 #define SkBitmapFilter_opts_DEFINED |
| 10 |
| 11 #include "SkConvolver.h" |
| 12 |
| 13 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 14 #include <emmintrin.h> |
| 15 #elif defined(SK_ARM_HAS_NEON) |
| 16 #include <arm_neon.h> |
| 17 #endif |
| 18 |
| 19 namespace SK_OPTS_NS { |
| 20 |
| 21 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2 |
| 22 |
| 23 static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft, |
| 24 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i
& accum, int r) { |
| 25 int remainder[4] = {0}; |
| 26 for (int i = 0; i < r; i++) { |
| 27 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; |
| 28 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; |
| 29 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; |
| 30 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; |
| 31 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; |
| 32 } |
| 33 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], rem
ainder[3]); |
| 34 accum = _mm_add_epi32(accum, t); |
| 35 } |
| 36 |
| 37 // Convolves horizontally along a single row. The row data is given in |
| 38 // |srcData| and continues for the numValues() of the filter. |
| 39 void convolve_horizontally(const unsigned char* srcData, |
| 40 const SkConvolutionFilter1D& filter, |
| 41 unsigned char* outRow, |
| 42 bool /*hasAlpha*/) { |
| 43 // Output one pixel each iteration, calculating all channels (RGBA) toge
ther. |
| 44 int numValues = filter.numValues(); |
| 45 for (int outX = 0; outX < numValues; outX++) { |
| 46 // Get the filter that determines the current output pixel. |
| 47 int filterOffset, filterLength; |
| 48 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 49 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 50 |
| 51 // Compute the first pixel in this row that the filter affects. It w
ill |
| 52 // touch |filterLength| pixels (4 bytes each) after this. |
| 53 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; |
| 54 |
| 55 __m128i zero = _mm_setzero_si128(); |
| 56 __m128i accum = _mm_setzero_si128(); |
| 57 |
| 58 // We will load and accumulate with four coefficients per iteration. |
| 59 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { |
| 60 // Load 4 coefficients => duplicate 1st and 2nd of them for all
channels. |
| 61 __m128i coeff, coeff16; |
| 62 // [16] xx xx xx xx c3 c2 c1 c0 |
| 63 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV
alues)); |
| 64 // [16] xx xx xx xx c1 c1 c0 c0 |
| 65 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 66 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 67 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 68 |
| 69 // Load four pixels => unpack the first two pixels to 16 bits => |
| 70 // multiply with coefficients => accumulate the convolution resu
lt. |
| 71 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 72 __m128i src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(
rowToFilter)); |
| 73 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 74 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 75 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 76 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 77 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
| 78 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 79 accum = _mm_add_epi32(accum, t); |
| 80 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
| 81 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 82 accum = _mm_add_epi32(accum, t); |
| 83 |
| 84 // Duplicate 3rd and 4th coefficients for all channels => |
| 85 // unpack the 3rd and 4th pixels to 16 bits => multiply with coe
fficients |
| 86 // => accumulate the convolution results. |
| 87 // [16] xx xx xx xx c3 c3 c2 c2 |
| 88 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 89 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 90 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 91 // [16] a3 g3 b3 r3 a2 g2 b2 r2 |
| 92 src16 = _mm_unpackhi_epi8(src8, zero); |
| 93 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 94 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 95 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
| 96 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 97 accum = _mm_add_epi32(accum, t); |
| 98 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
| 99 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 100 accum = _mm_add_epi32(accum, t); |
| 101 |
| 102 // Advance the pixel and coefficients pointers. |
| 103 rowToFilter += 16; |
| 104 filterValues += 4; |
| 105 } |
| 106 |
| 107 // When |filterLength| is not divisible by 4, we accumulate the last
1 - 3 |
| 108 // coefficients one at a time. |
| 109 int r = filterLength & 3; |
| 110 if (r) { |
| 111 int remainderOffset = (filterOffset + filterLength - r) * 4; |
| 112 AccumRemainder(srcData + remainderOffset, filterValues, accum, r
); |
| 113 } |
| 114 |
| 115 // Shift right for fixed point implementation. |
| 116 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 117 |
| 118 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 119 accum = _mm_packs_epi32(accum, zero); |
| 120 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 121 accum = _mm_packus_epi16(accum, zero); |
| 122 |
| 123 // Store the pixel value of 32 bits. |
| 124 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); |
| 125 outRow += 4; |
| 126 } |
| 127 } |
| 128 |
| 129 // Convolves horizontally along four rows. The row data is given in |
| 130 // |srcData| and continues for the numValues() of the filter. |
| 131 // The algorithm is almost same as |convolve_horizontally|. Please |
| 132 // refer to that function for detailed comments. |
| 133 void convolve_4_rows_horizontally(const unsigned char* srcData[4], |
| 134 const SkConvolutionFilter1D& filter, |
| 135 unsigned char* outRow[4], |
| 136 size_t outRowBytes) { |
| 137 SkDEBUGCODE(const unsigned char* out_row_0_start = outRow[0];) |
| 138 |
| 139 // Output one pixel each iteration, calculating all channels (RGBA) toge
ther. |
| 140 int numValues = filter.numValues(); |
| 141 for (int outX = 0; outX < numValues; outX++) { |
| 142 int filterOffset, filterLength; |
| 143 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 144 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 145 |
| 146 __m128i zero = _mm_setzero_si128(); |
| 147 |
| 148 // four pixels in a column per iteration. |
| 149 __m128i accum0 = _mm_setzero_si128(); |
| 150 __m128i accum1 = _mm_setzero_si128(); |
| 151 __m128i accum2 = _mm_setzero_si128(); |
| 152 __m128i accum3 = _mm_setzero_si128(); |
| 153 |
| 154 int start = filterOffset * 4; |
| 155 // We will load and accumulate with four coefficients per iteration. |
| 156 for (int filterX = 0; filterX < (filterLength >> 2); filterX++) { |
| 157 __m128i coeff, coeff16lo, coeff16hi; |
| 158 // [16] xx xx xx xx c3 c2 c1 c0 |
| 159 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterV
alues)); |
| 160 // [16] xx xx xx xx c1 c1 c0 c0 |
| 161 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 162 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 163 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 164 // [16] xx xx xx xx c3 c3 c2 c2 |
| 165 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 166 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 167 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 168 |
| 169 __m128i src8, src16, mul_hi, mul_lo, t; |
| 170 |
| 171 #define ITERATION(src, accum)
\ |
| 172 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
\ |
| 173 src16 = _mm_unpacklo_epi8(src8, zero);
\ |
| 174 mul_hi = _mm_mulhi_epi16(src16, coeff16lo);
\ |
| 175 mul_lo = _mm_mullo_epi16(src16, coeff16lo);
\ |
| 176 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
\ |
| 177 accum = _mm_add_epi32(accum, t);
\ |
| 178 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
\ |
| 179 accum = _mm_add_epi32(accum, t);
\ |
| 180 src16 = _mm_unpackhi_epi8(src8, zero);
\ |
| 181 mul_hi = _mm_mulhi_epi16(src16, coeff16hi);
\ |
| 182 mul_lo = _mm_mullo_epi16(src16, coeff16hi);
\ |
| 183 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
\ |
| 184 accum = _mm_add_epi32(accum, t);
\ |
| 185 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
\ |
| 186 accum = _mm_add_epi32(accum, t) |
| 187 |
| 188 ITERATION(srcData[0] + start, accum0); |
| 189 ITERATION(srcData[1] + start, accum1); |
| 190 ITERATION(srcData[2] + start, accum2); |
| 191 ITERATION(srcData[3] + start, accum3); |
| 192 |
| 193 start += 16; |
| 194 filterValues += 4; |
| 195 } |
| 196 |
| 197 int r = filterLength & 3; |
| 198 if (r) { |
| 199 int remainderOffset = (filterOffset + filterLength - r) * 4; |
| 200 AccumRemainder(srcData[0] + remainderOffset, filterValues, accum
0, r); |
| 201 AccumRemainder(srcData[1] + remainderOffset, filterValues, accum
1, r); |
| 202 AccumRemainder(srcData[2] + remainderOffset, filterValues, accum
2, r); |
| 203 AccumRemainder(srcData[3] + remainderOffset, filterValues, accum
3, r); |
| 204 } |
| 205 |
| 206 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 207 accum0 = _mm_packs_epi32(accum0, zero); |
| 208 accum0 = _mm_packus_epi16(accum0, zero); |
| 209 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 210 accum1 = _mm_packs_epi32(accum1, zero); |
| 211 accum1 = _mm_packus_epi16(accum1, zero); |
| 212 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 213 accum2 = _mm_packs_epi32(accum2, zero); |
| 214 accum2 = _mm_packus_epi16(accum2, zero); |
| 215 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 216 accum3 = _mm_packs_epi32(accum3, zero); |
| 217 accum3 = _mm_packus_epi16(accum3, zero); |
| 218 |
| 219 // We seem to be running off the edge here (chromium:491660). |
| 220 SkASSERT(((size_t)outRow[0] - (size_t)out_row_0_start) < outRowBytes
); |
| 221 |
| 222 *(reinterpret_cast<int*>(outRow[0])) = _mm_cvtsi128_si32(accum0); |
| 223 *(reinterpret_cast<int*>(outRow[1])) = _mm_cvtsi128_si32(accum1); |
| 224 *(reinterpret_cast<int*>(outRow[2])) = _mm_cvtsi128_si32(accum2); |
| 225 *(reinterpret_cast<int*>(outRow[3])) = _mm_cvtsi128_si32(accum3); |
| 226 |
| 227 outRow[0] += 4; |
| 228 outRow[1] += 4; |
| 229 outRow[2] += 4; |
| 230 outRow[3] += 4; |
| 231 } |
| 232 } |
| 233 |
| 234 // Does vertical convolution to produce one output row. The filter values an
d |
| 235 // length are given in the first two parameters. These are applied to each |
| 236 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 237 // being |pixelWidth| wide. |
| 238 // |
| 239 // The output must have room for |pixelWidth * 4| bytes. |
| 240 template<bool hasAlpha> |
| 241 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 242 int filterLength, |
| 243 unsigned char* const* sourceDataRows, |
| 244 int pixelWidth, |
| 245 unsigned char* outRow) { |
| 246 // Output four pixels per iteration (16 bytes). |
| 247 int width = pixelWidth & ~3; |
| 248 __m128i zero = _mm_setzero_si128(); |
| 249 for (int outX = 0; outX < width; outX += 4) { |
| 250 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 251 __m128i accum0 = _mm_setzero_si128(); |
| 252 __m128i accum1 = _mm_setzero_si128(); |
| 253 __m128i accum2 = _mm_setzero_si128(); |
| 254 __m128i accum3 = _mm_setzero_si128(); |
| 255 |
| 256 // Convolve with one filter coefficient per iteration. |
| 257 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 258 |
| 259 // Duplicate the filter coefficient 8 times. |
| 260 // [16] cj cj cj cj cj cj cj cj |
| 261 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); |
| 262 |
| 263 // Load four pixels (16 bytes) together. |
| 264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 265 const __m128i* src = reinterpret_cast<const __m128i*>( |
| 266 &sourceDataRows[filterY][outX << 2]); |
| 267 __m128i src8 = _mm_loadu_si128(src); |
| 268 |
| 269 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each cha
nnels => |
| 270 // multiply with current coefficient => accumulate the result. |
| 271 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 272 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 273 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 274 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 275 // [32] a0 b0 g0 r0 |
| 276 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 277 accum0 = _mm_add_epi32(accum0, t); |
| 278 // [32] a1 b1 g1 r1 |
| 279 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 280 accum1 = _mm_add_epi32(accum1, t); |
| 281 |
| 282 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each cha
nnels => |
| 283 // multiply with current coefficient => accumulate the result. |
| 284 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 285 src16 = _mm_unpackhi_epi8(src8, zero); |
| 286 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 287 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 288 // [32] a2 b2 g2 r2 |
| 289 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 290 accum2 = _mm_add_epi32(accum2, t); |
| 291 // [32] a3 b3 g3 r3 |
| 292 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 293 accum3 = _mm_add_epi32(accum3, t); |
| 294 } |
| 295 |
| 296 // Shift right for fixed point implementation. |
| 297 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 298 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 299 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 300 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 301 |
| 302 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 303 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 304 accum0 = _mm_packs_epi32(accum0, accum1); |
| 305 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 306 accum2 = _mm_packs_epi32(accum2, accum3); |
| 307 |
| 308 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 309 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 310 accum0 = _mm_packus_epi16(accum0, accum2); |
| 311 |
| 312 if (hasAlpha) { |
| 313 // Compute the max(ri, gi, bi) for each pixel. |
| 314 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 315 __m128i a = _mm_srli_epi32(accum0, 8); |
| 316 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 317 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 318 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 319 a = _mm_srli_epi32(accum0, 16); |
| 320 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 321 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 322 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 323 b = _mm_slli_epi32(b, 24); |
| 324 |
| 325 // Make sure the value of alpha channel is always larger than ma
ximum |
| 326 // value of color channels. |
| 327 accum0 = _mm_max_epu8(b, accum0); |
| 328 } else { |
| 329 // Set value of alpha channels to 0xFF. |
| 330 __m128i mask = _mm_set1_epi32(0xff000000); |
| 331 accum0 = _mm_or_si128(accum0, mask); |
| 332 } |
| 333 |
| 334 // Store the convolution result (16 bytes) and advance the pixel poi
nters. |
| 335 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); |
| 336 outRow += 16; |
| 337 } |
| 338 |
| 339 // When the width of the output is not divisible by 4, We need to save o
ne |
| 340 // pixel (4 bytes) each time. And also the fourth pixel is always absent
. |
| 341 int r = pixelWidth & 3; |
| 342 if (r) { |
| 343 __m128i accum0 = _mm_setzero_si128(); |
| 344 __m128i accum1 = _mm_setzero_si128(); |
| 345 __m128i accum2 = _mm_setzero_si128(); |
| 346 for (int filterY = 0; filterY < filterLength; ++filterY) { |
| 347 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); |
| 348 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 349 const __m128i* src = reinterpret_cast<const __m128i*>( |
| 350 &sourceDataRows[filterY][width << 2]); |
| 351 __m128i src8 = _mm_loadu_si128(src); |
| 352 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 353 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 354 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 355 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 356 // [32] a0 b0 g0 r0 |
| 357 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 358 accum0 = _mm_add_epi32(accum0, t); |
| 359 // [32] a1 b1 g1 r1 |
| 360 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 361 accum1 = _mm_add_epi32(accum1, t); |
| 362 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 363 src16 = _mm_unpackhi_epi8(src8, zero); |
| 364 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 365 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 366 // [32] a2 b2 g2 r2 |
| 367 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 368 accum2 = _mm_add_epi32(accum2, t); |
| 369 } |
| 370 |
| 371 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 372 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 373 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 374 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 375 accum0 = _mm_packs_epi32(accum0, accum1); |
| 376 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 377 accum2 = _mm_packs_epi32(accum2, zero); |
| 378 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 379 accum0 = _mm_packus_epi16(accum0, accum2); |
| 380 if (hasAlpha) { |
| 381 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 382 __m128i a = _mm_srli_epi32(accum0, 8); |
| 383 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 384 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 385 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 386 a = _mm_srli_epi32(accum0, 16); |
| 387 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 388 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 389 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 390 b = _mm_slli_epi32(b, 24); |
| 391 accum0 = _mm_max_epu8(b, accum0); |
| 392 } else { |
| 393 __m128i mask = _mm_set1_epi32(0xff000000); |
| 394 accum0 = _mm_or_si128(accum0, mask); |
| 395 } |
| 396 |
| 397 for (int i = 0; i < r; i++) { |
| 398 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); |
| 399 accum0 = _mm_srli_si128(accum0, 4); |
| 400 outRow += 4; |
| 401 } |
| 402 } |
| 403 } |
| 404 |
| 405 #elif defined(SK_ARM_HAS_NEON) |
| 406 |
// NEON flavor of the scalar tail: folds the last 1-3 (|r|) taps into |accum|,
// one fixed-point coefficient times one RGBA pixel per tap.
static SK_ALWAYS_INLINE void AccumRemainder(const unsigned char* pixelsLeft,
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues, int32x4_t& accum, int r) {
    int sums[4] = {0};
    for (int tap = 0; tap < r; tap++) {
        SkConvolutionFilter1D::ConvolutionFixed k = filterValues[tap];
        sums[0] += k * pixelsLeft[tap * 4 + 0];
        sums[1] += k * pixelsLeft[tap * 4 + 1];
        sums[2] += k * pixelsLeft[tap * 4 + 2];
        sums[3] += k * pixelsLeft[tap * 4 + 3];
    }
    int32x4_t packed = {sums[0], sums[1], sums[2], sums[3]};
    accum += packed;
}
| 420 |
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
void convolve_horizontally(const unsigned char* srcData,
                           const SkConvolutionFilter1D& filter,
                           unsigned char* outRow,
                           bool /*hasAlpha*/) {
    // Loop over each pixel on this row in the output image.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {
        // vtbl masks that replicate coefficient i (2 bytes) across a 64-bit
        // lane, i.e. broadcast c0..c3 to all four 16-bit positions.
        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);
        // Fetch the filter window for this output pixel.
        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // First source pixel this filter touches; it reads |filterLength|
        // pixels (4 bytes each) from here on.
        const unsigned char* rowToFilter = &srcData[filterOffset * 4];

        // Apply the filter to the row to get the destination pixel in |accum|.
        int32x4_t accum = vdupq_n_s32(0);
        for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
            // Load 4 coefficients and broadcast each across a 16x4 vector.
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            // Load four pixels, widen to 16 bits, multiply-accumulate.
            uint8x16_t pixels = vld1q_u8(rowToFilter);
            int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
            int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

            int16x4_t p0_src = vget_low_s16(p01_16);
            int16x4_t p1_src = vget_high_s16(p01_16);
            int16x4_t p2_src = vget_low_s16(p23_16);
            int16x4_t p3_src = vget_high_s16(p23_16);

            int32x4_t p0 = vmull_s16(p0_src, coeff0);
            int32x4_t p1 = vmull_s16(p1_src, coeff1);
            int32x4_t p2 = vmull_s16(p2_src, coeff2);
            int32x4_t p3 = vmull_s16(p3_src, coeff3);

            accum += p0;
            accum += p1;
            accum += p2;
            accum += p3;

            // Advance the pointers.
            rowToFilter += 16;
            filterValues += 4;
        }

        // Scalar tail for the last 1-3 coefficients.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
        }

        // Bring this value back in range. All of the filter scaling factors
        // are in fixed point with kShiftBits bits of fractional part.
        accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

        // Saturate 32->16->8 bits and store one RGBA pixel.
        int16x4_t accum16 = vqmovn_s32(accum);
        uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret_u32_u8(accum8), 0);
        outRow += 4;
    }
}
| 496 |
// Convolves horizontally along four rows. The row data is given in
// |srcData| and continues for the numValues() of the filter.
// The algorithm is almost same as |convolve_horizontally|. Please
// refer to that function for detailed comments.
void convolve_4_rows_horizontally(const unsigned char* srcData[4],
                                  const SkConvolutionFilter1D& filter,
                                  unsigned char* outRow[4],
                                  size_t outRowBytes) {
    // One output pixel (all RGBA channels) per iteration, for all four rows.
    int numValues = filter.numValues();
    for (int outX = 0; outX < numValues; outX++) {

        int filterOffset, filterLength;
        const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
            filter.FilterForValue(outX, &filterOffset, &filterLength);

        // One 32-bit accumulator per row.
        int32x4_t accum0 = vdupq_n_s32(0);
        int32x4_t accum1 = vdupq_n_s32(0);
        int32x4_t accum2 = vdupq_n_s32(0);
        int32x4_t accum3 = vdupq_n_s32(0);

        // vtbl masks broadcasting coefficient i across a 16x4 lane.
        uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
        uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
        uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
        uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

        int start = filterOffset * 4;

        // Consume four coefficients per iteration.
        for (int filterX = 0; filterX < (filterLength >> 2); filterX++) {
            int16x4_t coeffs = vld1_s16(filterValues);
            int16x4_t coeff0 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0));
            int16x4_t coeff1 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1));
            int16x4_t coeff2 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2));
            int16x4_t coeff3 = vreinterpret_s16_u8(vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3));

            uint8x16_t pixels;
            int16x8_t p01_16, p23_16;
            int32x4_t p0, p1, p2, p3;

// Applies the four loaded coefficients to four pixels of one row (|src|),
// accumulating per-channel 32-bit sums into that row's |accum|.
#define ITERATION(src, accum)                                       \
    pixels = vld1q_u8(src);                                         \
    p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));  \
    p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); \
    p0 = vmull_s16(vget_low_s16(p01_16), coeff0);                   \
    p1 = vmull_s16(vget_high_s16(p01_16), coeff1);                  \
    p2 = vmull_s16(vget_low_s16(p23_16), coeff2);                   \
    p3 = vmull_s16(vget_high_s16(p23_16), coeff3);                  \
    accum += p0;                                                    \
    accum += p1;                                                    \
    accum += p2;                                                    \
    accum += p3

            ITERATION(srcData[0] + start, accum0);
            ITERATION(srcData[1] + start, accum1);
            ITERATION(srcData[2] + start, accum2);
            ITERATION(srcData[3] + start, accum3);

            start += 16;
            filterValues += 4;
        }

        // Scalar tail for the last 1-3 coefficients of each row.
        int r = filterLength & 3;
        if (r) {
            int remainder_offset = (filterOffset + filterLength - r) * 4;
            AccumRemainder(srcData[0] + remainder_offset, filterValues, accum0, r);
            AccumRemainder(srcData[1] + remainder_offset, filterValues, accum1, r);
            AccumRemainder(srcData[2] + remainder_offset, filterValues, accum2, r);
            AccumRemainder(srcData[3] + remainder_offset, filterValues, accum3, r);
        }

        int16x4_t accum16;
        uint8x8_t res0, res1, res2, res3;

// Fixed-point shift, then saturate 32->16->8 bits for one row's result.
#define PACK_RESULT(accum, res)                                        \
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);     \
    accum16 = vqmovn_s32(accum);                                       \
    res = vqmovun_s16(vcombine_s16(accum16, accum16));

        PACK_RESULT(accum0, res0);
        PACK_RESULT(accum1, res1);
        PACK_RESULT(accum2, res2);
        PACK_RESULT(accum3, res3);

        // Store one 32-bit RGBA pixel per row and advance.
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[0]), vreinterpret_u32_u8(res0), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[1]), vreinterpret_u32_u8(res1), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[2]), vreinterpret_u32_u8(res2), 0);
        vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow[3]), vreinterpret_u32_u8(res3), 0);
        outRow[0] += 4;
        outRow[1] += 4;
        outRow[2] += 4;
        outRow[3] += 4;
    }
}
| 594 |
| 595 |
| 596 // Does vertical convolution to produce one output row. The filter values an
d |
| 597 // length are given in the first two parameters. These are applied to each |
| 598 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 599 // being |pixelWidth| wide. |
| 600 // |
| 601 // The output must have room for |pixelWidth * 4| bytes. |
| 602 template<bool hasAlpha> |
| 603 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 604 int filterLength, |
| 605 unsigned char* const* sourceDataRows, |
| 606 int pixelWidth, |
| 607 unsigned char* outRow) { |
| 608 int width = pixelWidth & ~3; |
| 609 |
| 610 // Output four pixels per iteration (16 bytes). |
| 611 for (int outX = 0; outX < width; outX += 4) { |
| 612 |
| 613 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 614 int32x4_t accum0 = vdupq_n_s32(0); |
| 615 int32x4_t accum1 = vdupq_n_s32(0); |
| 616 int32x4_t accum2 = vdupq_n_s32(0); |
| 617 int32x4_t accum3 = vdupq_n_s32(0); |
| 618 |
| 619 // Convolve with one filter coefficient per iteration. |
| 620 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 621 |
| 622 // Duplicate the filter coefficient 4 times. |
| 623 // [16] cj cj cj cj |
| 624 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); |
| 625 |
| 626 // Load four pixels (16 bytes) together. |
| 627 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 628 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); |
| 629 |
| 630 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(
src8))); |
| 631 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8
(src8))); |
| 632 int16x4_t src16_0 = vget_low_s16(src16_01); |
| 633 int16x4_t src16_1 = vget_high_s16(src16_01); |
| 634 int16x4_t src16_2 = vget_low_s16(src16_23); |
| 635 int16x4_t src16_3 = vget_high_s16(src16_23); |
| 636 |
| 637 accum0 += vmull_s16(src16_0, coeff16); |
| 638 accum1 += vmull_s16(src16_1, coeff16); |
| 639 accum2 += vmull_s16(src16_2, coeff16); |
| 640 accum3 += vmull_s16(src16_3, coeff16); |
| 641 } |
| 642 |
| 643 // Shift right for fixed point implementation. |
| 644 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 645 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 646 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 647 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 648 |
| 649 // Packing 32 bits |accum| to 16 bits per channel (signed saturation
). |
| 650 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 651 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac
cum1)); |
| 652 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 653 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac
cum3)); |
| 654 |
| 655 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturatio
n). |
| 656 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 657 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(
accum16_1)); |
| 658 |
| 659 if (hasAlpha) { |
| 660 // Compute the max(ri, gi, bi) for each pixel. |
| 661 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 662 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3
2_u8(accum8), 8)); |
| 663 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 664 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g |
| 665 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 666 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16)); |
| 667 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 668 b = vmaxq_u8(a, b); // Max of r and g and b. |
| 669 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 670 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24
)); |
| 671 |
| 672 // Make sure the value of alpha channel is always larger than ma
ximum |
| 673 // value of color channels. |
| 674 accum8 = vmaxq_u8(b, accum8); |
| 675 } else { |
| 676 // Set value of alpha channels to 0xFF. |
| 677 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu
pq_n_u32(0xFF000000)); |
| 678 } |
| 679 |
| 680 // Store the convolution result (16 bytes) and advance the pixel poi
nters. |
| 681 vst1q_u8(outRow, accum8); |
| 682 outRow += 16; |
| 683 } |
| 684 |
| 685 // Process the leftovers when the width of the output is not divisible |
| 686 // by 4, that is at most 3 pixels. |
| 687 int r = pixelWidth & 3; |
| 688 if (r) { |
| 689 |
| 690 int32x4_t accum0 = vdupq_n_s32(0); |
| 691 int32x4_t accum1 = vdupq_n_s32(0); |
| 692 int32x4_t accum2 = vdupq_n_s32(0); |
| 693 |
| 694 for (int filterY = 0; filterY < filterLength; ++filterY) { |
| 695 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); |
| 696 |
| 697 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 698 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2])
; |
| 699 |
| 700 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(
src8))); |
| 701 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8
(src8))); |
| 702 int16x4_t src16_0 = vget_low_s16(src16_01); |
| 703 int16x4_t src16_1 = vget_high_s16(src16_01); |
| 704 int16x4_t src16_2 = vget_low_s16(src16_23); |
| 705 |
| 706 accum0 += vmull_s16(src16_0, coeff16); |
| 707 accum1 += vmull_s16(src16_1, coeff16); |
| 708 accum2 += vmull_s16(src16_2, coeff16); |
| 709 } |
| 710 |
| 711 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 712 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 713 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 714 |
| 715 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(ac
cum1)); |
| 716 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(ac
cum2)); |
| 717 |
| 718 uint8x16_t accum8 = vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(
accum16_1)); |
| 719 |
| 720 if (hasAlpha) { |
| 721 // Compute the max(ri, gi, bi) for each pixel. |
| 722 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 723 uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u3
2_u8(accum8), 8)); |
| 724 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 725 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g |
| 726 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 727 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16)); |
| 728 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 729 b = vmaxq_u8(a, b); // Max of r and g and b. |
| 730 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 731 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24
)); |
| 732 |
| 733 // Make sure the value of alpha channel is always larger than ma
ximum |
| 734 // value of color channels. |
| 735 accum8 = vmaxq_u8(b, accum8); |
| 736 } else { |
| 737 // Set value of alpha channels to 0xFF. |
| 738 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | vdu
pq_n_u32(0xFF000000)); |
| 739 } |
| 740 |
| 741 switch(r) { |
| 742 case 1: |
| 743 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), vreinterpret
q_u32_u8(accum8), 0); |
| 744 break; |
| 745 case 2: |
| 746 vst1_u32(reinterpret_cast<uint32_t*>(outRow), |
| 747 vreinterpret_u32_u8(vget_low_u8(accum8))); |
| 748 break; |
| 749 case 3: |
| 750 vst1_u32(reinterpret_cast<uint32_t*>(outRow), |
| 751 vreinterpret_u32_u8(vget_low_u8(accum8))); |
| 752 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow+8), vreinterpr
etq_u32_u8(accum8), 2); |
| 753 break; |
| 754 } |
| 755 } |
| 756 } |
| 757 |
| 758 #else |
| 759 |
// Clamps the argument into [0, 255] and returns it as an 8-bit unsigned
// value. Used to bring fixed-point convolution accumulators back into the
// representable range of a color channel.
inline unsigned char ClampTo8(int a) {
    // One unsigned comparison covers the common in-range case: any negative
    // value wraps to a huge unsigned number and fails the test.
    if (static_cast<unsigned>(a) < 256) {
        return a;
    }
    // Out of range: negative values pin to 0, everything else to 255.
    return a < 0 ? 0 : 255;
}
| 771 |
| 772 // Convolves horizontally along a single row. The row data is given in |
| 773 // |srcData| and continues for the numValues() of the filter. |
| 774 template<bool hasAlpha> |
| 775 void ConvolveHorizontally(const unsigned char* srcData, |
| 776 const SkConvolutionFilter1D& filter, |
| 777 unsigned char* outRow) { |
| 778 // Loop over each pixel on this row in the output image. |
| 779 int numValues = filter.numValues(); |
| 780 for (int outX = 0; outX < numValues; outX++) { |
| 781 // Get the filter that determines the current output pixel. |
| 782 int filterOffset, filterLength; |
| 783 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = |
| 784 filter.FilterForValue(outX, &filterOffset, &filterLength); |
| 785 |
| 786 // Compute the first pixel in this row that the filter affects. It w
ill |
| 787 // touch |filterLength| pixels (4 bytes each) after this. |
| 788 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; |
| 789 |
| 790 // Apply the filter to the row to get the destination pixel in |accu
m|. |
| 791 int accum[4] = {0}; |
| 792 for (int filterX = 0; filterX < filterLength; filterX++) { |
| 793 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues
[filterX]; |
| 794 accum[0] += curFilter * rowToFilter[filterX * 4 + 0]; |
| 795 accum[1] += curFilter * rowToFilter[filterX * 4 + 1]; |
| 796 accum[2] += curFilter * rowToFilter[filterX * 4 + 2]; |
| 797 if (hasAlpha) { |
| 798 accum[3] += curFilter * rowToFilter[filterX * 4 + 3]; |
| 799 } |
| 800 } |
| 801 |
| 802 // Bring this value back in range. All of the filter scaling factors |
| 803 // are in fixed point with kShiftBits bits of fractional part. |
| 804 accum[0] >>= SkConvolutionFilter1D::kShiftBits; |
| 805 accum[1] >>= SkConvolutionFilter1D::kShiftBits; |
| 806 accum[2] >>= SkConvolutionFilter1D::kShiftBits; |
| 807 if (hasAlpha) { |
| 808 accum[3] >>= SkConvolutionFilter1D::kShiftBits; |
| 809 } |
| 810 |
| 811 // Store the new pixel. |
| 812 outRow[outX * 4 + 0] = ClampTo8(accum[0]); |
| 813 outRow[outX * 4 + 1] = ClampTo8(accum[1]); |
| 814 outRow[outX * 4 + 2] = ClampTo8(accum[2]); |
| 815 if (hasAlpha) { |
| 816 outRow[outX * 4 + 3] = ClampTo8(accum[3]); |
| 817 } |
| 818 } |
| 819 } |
| 820 |
| 821 // Does vertical convolution to produce one output row. The filter values an
d |
| 822 // length are given in the first two parameters. These are applied to each |
| 823 // of the rows pointed to in the |sourceDataRows| array, with each row |
| 824 // being |pixelWidth| wide. |
| 825 // |
| 826 // The output must have room for |pixelWidth * 4| bytes. |
| 827 template<bool hasAlpha> |
| 828 void ConvolveVertically(const SkConvolutionFilter1D::ConvolutionFixed* filte
rValues, |
| 829 int filterLength, |
| 830 unsigned char* const* sourceDataRows, |
| 831 int pixelWidth, |
| 832 unsigned char* outRow) { |
| 833 // We go through each column in the output and do a vertical convolution
, |
| 834 // generating one output pixel each time. |
| 835 for (int outX = 0; outX < pixelWidth; outX++) { |
| 836 // Compute the number of bytes over in each row that the current col
umn |
| 837 // we're convolving starts at. The pixel will cover the next 4 bytes
. |
| 838 int byteOffset = outX * 4; |
| 839 |
| 840 // Apply the filter to one column of pixels. |
| 841 int accum[4] = {0}; |
| 842 for (int filterY = 0; filterY < filterLength; filterY++) { |
| 843 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues
[filterY]; |
| 844 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0]; |
| 845 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1]; |
| 846 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2]; |
| 847 if (hasAlpha) { |
| 848 accum[3] += curFilter * sourceDataRows[filterY][byteOffset +
3]; |
| 849 } |
| 850 } |
| 851 |
| 852 // Bring this value back in range. All of the filter scaling factors |
| 853 // are in fixed point with kShiftBits bits of precision. |
| 854 accum[0] >>= SkConvolutionFilter1D::kShiftBits; |
| 855 accum[1] >>= SkConvolutionFilter1D::kShiftBits; |
| 856 accum[2] >>= SkConvolutionFilter1D::kShiftBits; |
| 857 if (hasAlpha) { |
| 858 accum[3] >>= SkConvolutionFilter1D::kShiftBits; |
| 859 } |
| 860 |
| 861 // Store the new pixel. |
| 862 outRow[byteOffset + 0] = ClampTo8(accum[0]); |
| 863 outRow[byteOffset + 1] = ClampTo8(accum[1]); |
| 864 outRow[byteOffset + 2] = ClampTo8(accum[2]); |
| 865 if (hasAlpha) { |
| 866 unsigned char alpha = ClampTo8(accum[3]); |
| 867 |
| 868 // Make sure the alpha channel doesn't come out smaller than any
of the |
| 869 // color channels. We use premultipled alpha channels, so this s
hould |
| 870 // never happen, but rounding errors will cause this from time t
o time. |
| 871 // These "impossible" colors will cause overflows (and hence ran
dom pixel |
| 872 // values) when the resulting bitmap is drawn to the screen. |
| 873 // |
| 874 // We only need to do this when generating the final output row
(here). |
| 875 int maxColorChannel = SkTMax(outRow[byteOffset + 0], |
| 876 SkTMax(outRow[byteOffset + 1], |
| 877 outRow[byteOffset + 2])); |
| 878 if (alpha < maxColorChannel) { |
| 879 outRow[byteOffset + 3] = maxColorChannel; |
| 880 } else { |
| 881 outRow[byteOffset + 3] = alpha; |
| 882 } |
| 883 } else { |
| 884 // No alpha channel, the image is opaque. |
| 885 outRow[byteOffset + 3] = 0xff; |
| 886 } |
| 887 } |
| 888 } |
| 889 |
| 890 // There's a bug somewhere here with GCC autovectorization (-ftree-vectorize
). We originally |
| 891 // thought this was 32 bit only, but subsequent tests show that some 64 bit
gcc compiles |
| 892 // suffer here too. |
| 893 // |
| 894 // Dropping to -O2 disables -ftree-vectorize. GCC 4.6 needs noinline. http
s://bug.skia.org/2575 |
| 895 #if SK_HAS_ATTRIBUTE(optimize) && defined(SK_RELEASE) |
| 896 #define SK_MAYBE_DISABLE_VECTORIZATION __attribute__((optimize("O2"), no
inline)) |
| 897 #else |
| 898 #define SK_MAYBE_DISABLE_VECTORIZATION |
| 899 #endif |
| 900 |
| 901 SK_MAYBE_DISABLE_VECTORIZATION |
| 902 void convolve_horizontally(const unsigned char* srcData, |
| 903 const SkConvolutionFilter1D& filter, |
| 904 unsigned char* outRow, |
| 905 bool hasAlpha) { |
| 906 if (hasAlpha) { |
| 907 ConvolveHorizontally<true>(srcData, filter, outRow); |
| 908 } else { |
| 909 ConvolveHorizontally<false>(srcData, filter, outRow); |
| 910 } |
| 911 } |
| 912 #undef SK_MAYBE_DISABLE_VECTORIZATION |
| 913 |
// Hook for a SIMD routine that convolves four rows at once; this portable
// build provides no such routine, so the pointer is left null.
// NOTE(review): presumably callers test this for nullptr and fall back to
// convolve_horizontally one row at a time — confirm at the call sites.
void (*convolve_4_rows_horizontally)(const unsigned char* srcData[4],
                                     const SkConvolutionFilter1D& filter,
                                     unsigned char* outRow[4],
                                     size_t outRowBytes)
    = nullptr;
| 919 |
| 920 |
| 921 #endif |
| 922 |
| 923 void convolve_vertically(const SkConvolutionFilter1D::ConvolutionFixed* filt
erValues, |
| 924 int filterLength, |
| 925 unsigned char* const* sourceDataRows, |
| 926 int pixelWidth, |
| 927 unsigned char* outRow, |
| 928 bool hasAlpha) { |
| 929 if (hasAlpha) { |
| 930 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows, |
| 931 pixelWidth, outRow); |
| 932 } else { |
| 933 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows
, |
| 934 pixelWidth, outRow); |
| 935 } |
| 936 } |
| 937 |
| 938 } // namespace SK_OPTS_NS |
| 939 |
| 940 #endif//SkBitmapFilter_opts_DEFINED |
OLD | NEW |