| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" |
| 8 #include "SkBitmapProcState.h" | 11 #include "SkBitmapProcState.h" |
| 9 #include "SkBitmap.h" | |
| 10 #include "SkColor.h" | 12 #include "SkColor.h" |
| 11 #include "SkColorPriv.h" | 13 #include "SkColorPriv.h" |
| 14 #include "SkConvolver.h" |
| 15 #include "SkShader.h" |
| 12 #include "SkUnPreMultiply.h" | 16 #include "SkUnPreMultiply.h" |
| 13 #include "SkShader.h" | |
| 14 #include "SkConvolver.h" | |
| 15 | |
| 16 #include "SkBitmapFilter_opts_SSE2.h" | |
| 17 | |
| 18 #include <emmintrin.h> | |
| 19 | 17 |
| 20 #if 0 | 18 #if 0 |
| 21 static inline void print128i(__m128i value) { | 19 static inline void print128i(__m128i value) { |
| 22 int *v = (int*) &value; | 20 int *v = (int*) &value; |
| 23 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); | 21 printf("% .11d % .11d % .11d % .11d\n", v[0], v[1], v[2], v[3]); |
| 24 } | 22 } |
| 25 | 23 |
| 26 static inline void print128i_16(__m128i value) { | 24 static inline void print128i_16(__m128i value) { |
| 27 short *v = (short*) &value; | 25 short *v = (short*) &value; |
| 28 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); | 26 printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n", v[0], v[1], v[2]
, v[3], v[4], v[5], v[6], v[7]); |
| (...skipping 139 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 168 int r = SkClampMax(int(localResult[1]), a); | 166 int r = SkClampMax(int(localResult[1]), a); |
| 169 int g = SkClampMax(int(localResult[2]), a); | 167 int g = SkClampMax(int(localResult[2]), a); |
| 170 int b = SkClampMax(int(localResult[3]), a); | 168 int b = SkClampMax(int(localResult[3]), a); |
| 171 | 169 |
| 172 *colors++ = SkPackARGB32(a, r, g, b); | 170 *colors++ = SkPackARGB32(a, r, g, b); |
| 173 | 171 |
| 174 x++; | 172 x++; |
| 175 | 173 |
| 176 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), | 174 s.fInvProc(s.fInvMatrix, SkIntToScalar(x), |
| 177 SkIntToScalar(y), &srcPt); | 175 SkIntToScalar(y), &srcPt); |
| 178 | |
| 179 } | 176 } |
| 180 } | 177 } |
| 181 | 178 |
| 182 // Convolves horizontally along a single row. The row data is given in | 179 // Convolves horizontally along a single row. The row data is given in |
| 183 // |src_data| and continues for the num_values() of the filter. | 180 // |src_data| and continues for the num_values() of the filter. |
| 184 void convolveHorizontally_SSE2(const unsigned char* src_data, | 181 void convolveHorizontally_SSE2(const unsigned char* src_data, |
| 185 const SkConvolutionFilter1D& filter, | 182 const SkConvolutionFilter1D& filter, |
| 186 unsigned char* out_row, | 183 unsigned char* out_row, |
| 187 bool /*has_alpha*/) { | 184 bool /*has_alpha*/) { |
| 188 int num_values = filter.numValues(); | 185 int num_values = filter.numValues(); |
| 189 | 186 |
| 190 int filter_offset, filter_length; | 187 int filter_offset, filter_length; |
| 191 __m128i zero = _mm_setzero_si128(); | 188 __m128i zero = _mm_setzero_si128(); |
| 192 __m128i mask[4]; | 189 __m128i mask[4]; |
| 193 // |mask| will be used to decimate all extra filter coefficients that are | 190 // |mask| will be used to decimate all extra filter coefficients that are |
| 194 // loaded by SIMD when |filter_length| is not divisible by 4. | 191 // loaded by SIMD when |filter_length| is not divisible by 4. |
| 195 // mask[0] is not used in following algorithm. | 192 // mask[0] is not used in following algorithm. |
| 196 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 193 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| 197 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 194 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| 198 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 195 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| 199 | 196 |
| 200 // Output one pixel each iteration, calculating all channels (RGBA) together. | 197 // Output one pixel each iteration, calculating all channels (RGBA) together. |
| 201 for (int out_x = 0; out_x < num_values; out_x++) { | 198 for (int out_x = 0; out_x < num_values; out_x++) { |
| 202 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 199 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
| 203 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 200 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 204 | 201 |
| 205 __m128i accum = _mm_setzero_si128(); | 202 __m128i accum = _mm_setzero_si128(); |
| 206 | 203 |
| 207 // Compute the first pixel in this row that the filter affects. It will | 204 // Compute the first pixel in this row that the filter affects. It will |
| 208 // touch |filter_length| pixels (4 bytes each) after this. | 205 // touch |filter_length| pixels (4 bytes each) after this. |
| 209 const __m128i* row_to_filter = | 206 const __m128i* row_to_filter = |
| 210 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); | 207 reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); |
| 211 | 208 |
| 212 // We will load and accumulate with four coefficients per iteration. | 209 // We will load and accumulate with four coefficients per iteration. |
| 213 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { | 210 for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { |
| 214 | 211 |
| 215 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. | 212 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. |
| 216 __m128i coeff, coeff16; | 213 __m128i coeff, coeff16; |
| 217 // [16] xx xx xx xx c3 c2 c1 c0 | 214 // [16] xx xx xx xx c3 c2 c1 c0 |
| 218 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 215 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 219 // [16] xx xx xx xx c1 c1 c0 c0 | 216 // [16] xx xx xx xx c1 c1 c0 c0 |
| 220 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 217 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 221 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 218 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 222 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 219 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 223 | 220 |
| 224 // Load four pixels => unpack the first two pixels to 16 bits => | 221 // Load four pixels => unpack the first two pixels to 16 bits => |
| 225 // multiply with coefficients => accumulate the convolution result. | 222 // multiply with coefficients => accumulate the convolution result. |
| 226 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 223 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 227 __m128i src8 = _mm_loadu_si128(row_to_filter); | 224 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 228 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 225 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 229 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 226 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 230 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 227 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 231 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 228 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 232 // [32] a0*c0 b0*c0 g0*c0 r0*c0 | 229 // [32] a0*c0 b0*c0 g0*c0 r0*c0 |
| 233 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 230 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 234 accum = _mm_add_epi32(accum, t); | 231 accum = _mm_add_epi32(accum, t); |
| 235 // [32] a1*c1 b1*c1 g1*c1 r1*c1 | 232 // [32] a1*c1 b1*c1 g1*c1 r1*c1 |
| 236 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 233 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 237 accum = _mm_add_epi32(accum, t); | 234 accum = _mm_add_epi32(accum, t); |
| 238 | 235 |
| 239 // Duplicate 3rd and 4th coefficients for all channels => | 236 // Duplicate 3rd and 4th coefficients for all channels => |
| 240 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients | 237 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients |
| 241 // => accumulate the convolution results. | 238 // => accumulate the convolution results. |
| 242 // [16] xx xx xx xx c3 c3 c2 c2 | 239 // [16] xx xx xx xx c3 c3 c2 c2 |
| 243 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 240 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 244 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 241 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 245 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | 242 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 246 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 243 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 247 src16 = _mm_unpackhi_epi8(src8, zero); | 244 src16 = _mm_unpackhi_epi8(src8, zero); |
| 248 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 245 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 249 mul_lo = _mm_mullo_epi16(src16, coeff16); | 246 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 250 // [32] a2*c2 b2*c2 g2*c2 r2*c2 | 247 // [32] a2*c2 b2*c2 g2*c2 r2*c2 |
| 251 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 248 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 252 accum = _mm_add_epi32(accum, t); | 249 accum = _mm_add_epi32(accum, t); |
| 253 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | 250 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
| 254 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 251 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 255 accum = _mm_add_epi32(accum, t); | 252 accum = _mm_add_epi32(accum, t); |
| 256 | 253 |
| 257 // Advance the pixel and coefficients pointers. | 254 // Advance the pixel and coefficients pointers. |
| 258 row_to_filter += 1; | 255 row_to_filter += 1; |
| 259 filter_values += 4; | 256 filter_values += 4; |
| 257 } |
| 258 |
| 259 // When |filter_length| is not divisible by 4, we need to decimate some of |
| 260 // the filter coefficients that were loaded incorrectly to zero; other than |
| 261 // that the algorithm is the same as above, except that the 4th pixel will be |
| 262 // always absent. |
| 263 int r = filter_length&3; |
| 264 if (r) { |
| 265 // Note: filter_values must be padded to align_up(filter_offset, 8). |
| 266 __m128i coeff, coeff16; |
| 267 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 268 // Mask out extra filter taps. |
| 269 coeff = _mm_and_si128(coeff, mask[r]); |
| 270 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 271 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 272 |
| 273 // Note: line buffer must be padded to align_up(filter_offset, 16). |
| 274 // We resolve this by using the C version for the last horizontal line. |
| 275 __m128i src8 = _mm_loadu_si128(row_to_filter); |
| 276 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 277 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 278 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 279 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 280 accum = _mm_add_epi32(accum, t); |
| 281 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 282 accum = _mm_add_epi32(accum, t); |
| 283 |
| 284 src16 = _mm_unpackhi_epi8(src8, zero); |
| 285 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 286 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |
| 287 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 288 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 289 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 290 accum = _mm_add_epi32(accum, t); |
| 291 } |
| 292 |
| 293 // Shift right for fixed point implementation. |
| 294 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 295 |
| 296 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 297 accum = _mm_packs_epi32(accum, zero); |
| 298 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 299 accum = _mm_packus_epi16(accum, zero); |
| 300 |
| 301 // Store the pixel value of 32 bits. |
| 302 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); |
| 303 out_row += 4; |
| 260 } | 304 } |
| 261 | |
| 262 // When |filter_length| is not divisible by 4, we need to decimate some of | |
| 263 // the filter coefficients that were loaded incorrectly to zero; other than | |
| 264 // that the algorithm is the same as above, except that the 4th pixel will be | |
| 265 // always absent. | |
| 266 int r = filter_length&3; | |
| 267 if (r) { | |
| 268 // Note: filter_values must be padded to align_up(filter_offset, 8). | |
| 269 __m128i coeff, coeff16; | |
| 270 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
| 271 // Mask out extra filter taps. | |
| 272 coeff = _mm_and_si128(coeff, mask[r]); | |
| 273 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 274 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 275 | |
| 276 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
| 277 // We resolve this by using the C version for the last horizontal line. | |
| 278 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
| 279 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 280 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 281 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 282 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 283 accum = _mm_add_epi32(accum, t); | |
| 284 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 285 accum = _mm_add_epi32(accum, t); | |
| 286 | |
| 287 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 288 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 289 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 290 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 291 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 292 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 293 accum = _mm_add_epi32(accum, t); | |
| 294 } | |
| 295 | |
| 296 // Shift right for fixed point implementation. | |
| 297 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | |
| 298 | |
| 299 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | |
| 300 accum = _mm_packs_epi32(accum, zero); | |
| 301 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | |
| 302 accum = _mm_packus_epi16(accum, zero); | |
| 303 | |
| 304 // Store the pixel value of 32 bits. | |
| 305 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); | |
| 306 out_row += 4; | |
| 307 } | |
| 308 } | 305 } |
| 309 | 306 |
| 310 // Convolves horizontally along four rows. The row data is given in | 307 // Convolves horizontally along four rows. The row data is given in |
| 311 // |src_data| and continues for the num_values() of the filter. | 308 // |src_data| and continues for the num_values() of the filter. |
| 312 // The algorithm is almost the same as |convolveHorizontally_SSE2|. Please | 309 // The algorithm is almost the same as |convolveHorizontally_SSE2|. Please |
| 313 // refer to that function for detailed comments. | 310 // refer to that function for detailed comments. |
| 314 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 311 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| 315 const SkConvolutionFilter1D& filter, | 312 const SkConvolutionFilter1D& filter, |
| 316 unsigned char* out_row[4]) { | 313 unsigned char* out_row[4]) { |
| 317 int num_values = filter.numValues(); | 314 int num_values = filter.numValues(); |
| 318 | 315 |
| 319 int filter_offset, filter_length; | 316 int filter_offset, filter_length; |
| 320 __m128i zero = _mm_setzero_si128(); | 317 __m128i zero = _mm_setzero_si128(); |
| 321 __m128i mask[4]; | 318 __m128i mask[4]; |
| 322 // |mask| will be used to decimate all extra filter coefficients that are | 319 // |mask| will be used to decimate all extra filter coefficients that are |
| 323 // loaded by SIMD when |filter_length| is not divisible by 4. | 320 // loaded by SIMD when |filter_length| is not divisible by 4. |
| 324 // mask[0] is not used in following algorithm. | 321 // mask[0] is not used in following algorithm. |
| 325 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 322 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| 326 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 323 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| 327 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | 324 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |
| 328 | 325 |
| 329 // Output one pixel each iteration, calculating all channels (RGBA) together. | 326 // Output one pixel each iteration, calculating all channels (RGBA) together. |
| 330 for (int out_x = 0; out_x < num_values; out_x++) { | 327 for (int out_x = 0; out_x < num_values; out_x++) { |
| 331 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 328 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
| 332 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 329 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 333 | 330 |
| 334 // four pixels in a column per iteration. | 331 // four pixels in a column per iteration. |
| 335 __m128i accum0 = _mm_setzero_si128(); | 332 __m128i accum0 = _mm_setzero_si128(); |
| 336 __m128i accum1 = _mm_setzero_si128(); | 333 __m128i accum1 = _mm_setzero_si128(); |
| 337 __m128i accum2 = _mm_setzero_si128(); | 334 __m128i accum2 = _mm_setzero_si128(); |
| 338 __m128i accum3 = _mm_setzero_si128(); | 335 __m128i accum3 = _mm_setzero_si128(); |
| 339 int start = (filter_offset<<2); | 336 int start = (filter_offset<<2); |
| 340 // We will load and accumulate with four coefficients per iteration. | 337 // We will load and accumulate with four coefficients per iteration. |
| 341 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { | 338 for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { |
| 342 __m128i coeff, coeff16lo, coeff16hi; | 339 __m128i coeff, coeff16lo, coeff16hi; |
| 343 // [16] xx xx xx xx c3 c2 c1 c0 | 340 // [16] xx xx xx xx c3 c2 c1 c0 |
| 344 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 341 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 345 // [16] xx xx xx xx c1 c1 c0 c0 | 342 // [16] xx xx xx xx c1 c1 c0 c0 |
| 346 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | 343 coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |
| 347 // [16] c1 c1 c1 c1 c0 c0 c0 c0 | 344 // [16] c1 c1 c1 c1 c0 c0 c0 c0 |
| 348 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | 345 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 349 // [16] xx xx xx xx c3 c3 c2 c2 | 346 // [16] xx xx xx xx c3 c3 c2 c2 |
| 350 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | 347 coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |
| 351 // [16] c3 c3 c3 c3 c2 c2 c2 c2 | 348 // [16] c3 c3 c3 c3 c2 c2 c2 c2 |
| 352 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | 349 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 353 | 350 |
| 354 __m128i src8, src16, mul_hi, mul_lo, t; | 351 __m128i src8, src16, mul_hi, mul_lo, t; |
| 355 | 352 |
| 356 #define ITERATION(src, accum) \ | 353 #define ITERATION(src, accum) \ |
| 357 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ | 354 src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); \ |
| 358 src16 = _mm_unpacklo_epi8(src8, zero); \ | 355 src16 = _mm_unpacklo_epi8(src8, zero); \ |
| 359 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ | 356 mul_hi = _mm_mulhi_epi16(src16, coeff16lo); \ |
| 360 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ | 357 mul_lo = _mm_mullo_epi16(src16, coeff16lo); \ |
| 361 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 358 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
| 362 accum = _mm_add_epi32(accum, t); \ | 359 accum = _mm_add_epi32(accum, t); \ |
| 363 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 360 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
| 364 accum = _mm_add_epi32(accum, t); \ | 361 accum = _mm_add_epi32(accum, t); \ |
| 365 src16 = _mm_unpackhi_epi8(src8, zero); \ | 362 src16 = _mm_unpackhi_epi8(src8, zero); \ |
| 366 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ | 363 mul_hi = _mm_mulhi_epi16(src16, coeff16hi); \ |
| 367 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ | 364 mul_lo = _mm_mullo_epi16(src16, coeff16hi); \ |
| 368 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ | 365 t = _mm_unpacklo_epi16(mul_lo, mul_hi); \ |
| 369 accum = _mm_add_epi32(accum, t); \ | 366 accum = _mm_add_epi32(accum, t); \ |
| 370 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ | 367 t = _mm_unpackhi_epi16(mul_lo, mul_hi); \ |
| 371 accum = _mm_add_epi32(accum, t) | 368 accum = _mm_add_epi32(accum, t) |
| 372 | 369 |
| 373 ITERATION(src_data[0] + start, accum0); | 370 ITERATION(src_data[0] + start, accum0); |
| 374 ITERATION(src_data[1] + start, accum1); | 371 ITERATION(src_data[1] + start, accum1); |
| 375 ITERATION(src_data[2] + start, accum2); | 372 ITERATION(src_data[2] + start, accum2); |
| 376 ITERATION(src_data[3] + start, accum3); | 373 ITERATION(src_data[3] + start, accum3); |
| 377 | 374 |
| 378 start += 16; | 375 start += 16; |
| 379 filter_values += 4; | 376 filter_values += 4; |
| 377 } |
| 378 |
| 379 int r = filter_length & 3; |
| 380 if (r) { |
| 381 // Note: filter_values must be padded to align_up(filter_offset, 8); |
| 382 __m128i coeff; |
| 383 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
es)); |
| 384 // Mask out extra filter taps. |
| 385 coeff = _mm_and_si128(coeff, mask[r]); |
| 386 |
| 387 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0,
0)); |
| 388 /* c1 c1 c1 c1 c0 c0 c0 c0 */ |
| 389 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |
| 390 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2,
2)); |
| 391 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |
| 392 |
| 393 __m128i src8, src16, mul_hi, mul_lo, t; |
| 394 |
| 395 ITERATION(src_data[0] + start, accum0); |
| 396 ITERATION(src_data[1] + start, accum1); |
| 397 ITERATION(src_data[2] + start, accum2); |
| 398 ITERATION(src_data[3] + start, accum3); |
| 399 } |
| 400 |
| 401 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 402 accum0 = _mm_packs_epi32(accum0, zero); |
| 403 accum0 = _mm_packus_epi16(accum0, zero); |
| 404 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 405 accum1 = _mm_packs_epi32(accum1, zero); |
| 406 accum1 = _mm_packus_epi16(accum1, zero); |
| 407 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 408 accum2 = _mm_packs_epi32(accum2, zero); |
| 409 accum2 = _mm_packus_epi16(accum2, zero); |
| 410 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 411 accum3 = _mm_packs_epi32(accum3, zero); |
| 412 accum3 = _mm_packus_epi16(accum3, zero); |
| 413 |
| 414 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |
| 415 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |
| 416 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |
| 417 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |
| 418 |
| 419 out_row[0] += 4; |
| 420 out_row[1] += 4; |
| 421 out_row[2] += 4; |
| 422 out_row[3] += 4; |
| 380 } | 423 } |
| 381 | |
| 382 int r = filter_length & 3; | |
| 383 if (r) { | |
| 384 // Note: filter_values must be padded to align_up(filter_offset, 8); | |
| 385 __m128i coeff; | |
| 386 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
| 387 // Mask out extra filter taps. | |
| 388 coeff = _mm_and_si128(coeff, mask[r]); | |
| 389 | |
| 390 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 391 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
| 392 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
| 393 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 394 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
| 395 | |
| 396 __m128i src8, src16, mul_hi, mul_lo, t; | |
| 397 | |
| 398 ITERATION(src_data[0] + start, accum0); | |
| 399 ITERATION(src_data[1] + start, accum1); | |
| 400 ITERATION(src_data[2] + start, accum2); | |
| 401 ITERATION(src_data[3] + start, accum3); | |
| 402 } | |
| 403 | |
| 404 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 405 accum0 = _mm_packs_epi32(accum0, zero); | |
| 406 accum0 = _mm_packus_epi16(accum0, zero); | |
| 407 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 408 accum1 = _mm_packs_epi32(accum1, zero); | |
| 409 accum1 = _mm_packus_epi16(accum1, zero); | |
| 410 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 411 accum2 = _mm_packs_epi32(accum2, zero); | |
| 412 accum2 = _mm_packus_epi16(accum2, zero); | |
| 413 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | |
| 414 accum3 = _mm_packs_epi32(accum3, zero); | |
| 415 accum3 = _mm_packus_epi16(accum3, zero); | |
| 416 | |
| 417 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | |
| 418 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | |
| 419 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | |
| 420 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | |
| 421 | |
| 422 out_row[0] += 4; | |
| 423 out_row[1] += 4; | |
| 424 out_row[2] += 4; | |
| 425 out_row[3] += 4; | |
| 426 } | |
| 427 } | 424 } |
| 428 | 425 |
| 429 // Does vertical convolution to produce one output row. The filter values and | 426 // Does vertical convolution to produce one output row. The filter values and |
| 430 // length are given in the first two parameters. These are applied to each | 427 // length are given in the first two parameters. These are applied to each |
| 431 // of the rows pointed to in the |source_data_rows| array, with each row | 428 // of the rows pointed to in the |source_data_rows| array, with each row |
| 432 // being |pixel_width| wide. | 429 // being |pixel_width| wide. |
| 433 // | 430 // |
| 434 // The output must have room for |pixel_width * 4| bytes. | 431 // The output must have room for |pixel_width * 4| bytes. |
| 435 template<bool has_alpha> | 432 template<bool has_alpha> |
| 436 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, | 433 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
er_values, |
| 437 int filter_length, | 434 int filter_length, |
| 438 unsigned char* const* source_data_rows, | 435 unsigned char* const* source_data_rows, |
| 439 int pixel_width, | 436 int pixel_width, |
| 440 unsigned char* out_row) { | 437 unsigned char* out_row) { |
| 441 int width = pixel_width & ~3; | 438 int width = pixel_width & ~3; |
| 442 | 439 |
| 443 __m128i zero = _mm_setzero_si128(); | 440 __m128i zero = _mm_setzero_si128(); |
| 444 __m128i accum0, accum1, accum2, accum3, coeff16; | 441 __m128i accum0, accum1, accum2, accum3, coeff16; |
| 445 const __m128i* src; | 442 const __m128i* src; |
| 446 // Output four pixels per iteration (16 bytes). | 443 // Output four pixels per iteration (16 bytes). |
| 447 for (int out_x = 0; out_x < width; out_x += 4) { | 444 for (int out_x = 0; out_x < width; out_x += 4) { |
| 448 | 445 |
| 449 // Accumulated result for each pixel. 32 bits per RGBA channel. | 446 // Accumulated result for each pixel. 32 bits per RGBA channel. |
| 450 accum0 = _mm_setzero_si128(); | 447 accum0 = _mm_setzero_si128(); |
| 451 accum1 = _mm_setzero_si128(); | 448 accum1 = _mm_setzero_si128(); |
| 452 accum2 = _mm_setzero_si128(); | 449 accum2 = _mm_setzero_si128(); |
| 453 accum3 = _mm_setzero_si128(); | 450 accum3 = _mm_setzero_si128(); |
| 454 | 451 |
| 455 // Convolve with one filter coefficient per iteration. | 452 // Convolve with one filter coefficient per iteration. |
| 456 for (int filter_y = 0; filter_y < filter_length; filter_y++) { | 453 for (int filter_y = 0; filter_y < filter_length; filter_y++) { |
| 457 | 454 |
| 458 // Duplicate the filter coefficient 8 times. | 455 // Duplicate the filter coefficient 8 times. |
| 459 // [16] cj cj cj cj cj cj cj cj | 456 // [16] cj cj cj cj cj cj cj cj |
| 460 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | 457 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 461 | 458 |
| 462 // Load four pixels (16 bytes) together. | 459 // Load four pixels (16 bytes) together. |
| 463 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 460 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 464 src = reinterpret_cast<const __m128i*>( | 461 src = reinterpret_cast<const __m128i*>( |
| 465 &source_data_rows[filter_y][out_x << 2]); | 462 &source_data_rows[filter_y][out_x << 2]); |
| 466 __m128i src8 = _mm_loadu_si128(src); | 463 __m128i src8 = _mm_loadu_si128(src); |
| 467 | 464 |
| 468 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel => | 465 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel => |
| 469 // multiply with current coefficient => accumulate the result. | 466 // multiply with current coefficient => accumulate the result. |
| 470 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 467 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 471 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | 468 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 472 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | 469 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 473 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | 470 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 474 // [32] a0 b0 g0 r0 | 471 // [32] a0 b0 g0 r0 |
| 475 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 472 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 476 accum0 = _mm_add_epi32(accum0, t); | 473 accum0 = _mm_add_epi32(accum0, t); |
| 477 // [32] a1 b1 g1 r1 | 474 // [32] a1 b1 g1 r1 |
| 478 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 475 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 479 accum1 = _mm_add_epi32(accum1, t); | 476 accum1 = _mm_add_epi32(accum1, t); |
| 480 | 477 |
| 481 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel => | 478 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel => |
| 482 // multiply with current coefficient => accumulate the result. | 479 // multiply with current coefficient => accumulate the result. |
| 483 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 480 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 484 src16 = _mm_unpackhi_epi8(src8, zero); | 481 src16 = _mm_unpackhi_epi8(src8, zero); |
| 485 mul_hi = _mm_mulhi_epi16(src16, coeff16); | 482 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 486 mul_lo = _mm_mullo_epi16(src16, coeff16); | 483 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 487 // [32] a2 b2 g2 r2 | 484 // [32] a2 b2 g2 r2 |
| 488 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | 485 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 489 accum2 = _mm_add_epi32(accum2, t); | 486 accum2 = _mm_add_epi32(accum2, t); |
| 490 // [32] a3 b3 g3 r3 | 487 // [32] a3 b3 g3 r3 |
| 491 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 488 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 492 accum3 = _mm_add_epi32(accum3, t); | 489 accum3 = _mm_add_epi32(accum3, t); |
| 490 } |
| 491 |
| 492 // Shift right for fixed point implementation. |
| 493 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 494 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 495 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 496 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 497 |
| 498 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 499 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 500 accum0 = _mm_packs_epi32(accum0, accum1); |
| 501 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 502 accum2 = _mm_packs_epi32(accum2, accum3); |
| 503 |
| 504 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 505 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 506 accum0 = _mm_packus_epi16(accum0, accum2); |
| 507 |
| 508 if (has_alpha) { |
| 509 // Compute the max(ri, gi, bi) for each pixel. |
| 510 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 511 __m128i a = _mm_srli_epi32(accum0, 8); |
| 512 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 513 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 514 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 515 a = _mm_srli_epi32(accum0, 16); |
| 516 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 517 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 518 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 519 b = _mm_slli_epi32(b, 24); |
| 520 |
| 521 // Make sure the value of alpha channel is always larger than maximum |
| 522 // value of color channels. |
| 523 accum0 = _mm_max_epu8(b, accum0); |
| 524 } else { |
| 525 // Set value of alpha channels to 0xFF. |
| 526 __m128i mask = _mm_set1_epi32(0xff000000); |
| 527 accum0 = _mm_or_si128(accum0, mask); |
| 528 } |
| 529 |
| 530 // Store the convolution result (16 bytes) and advance the pixel pointers. |
| 531 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |
| 532 out_row += 16; |
| 493 } | 533 } |
| 494 | 534 |
| 495 // Shift right for fixed point implementation. | 535 // When the width of the output is not divisible by 4, We need to save one |
| 496 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 536 // pixel (4 bytes) each time. And also the fourth pixel is always absent. |
| 497 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 537 if (pixel_width & 3) { |
| 498 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 538 accum0 = _mm_setzero_si128(); |
| 499 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | 539 accum1 = _mm_setzero_si128(); |
| 540 accum2 = _mm_setzero_si128(); |
| 541 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |
| 542 coeff16 = _mm_set1_epi16(filter_values[filter_y]); |
| 543 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 544 src = reinterpret_cast<const __m128i*>( |
| 545 &source_data_rows[filter_y][width<<2]); |
| 546 __m128i src8 = _mm_loadu_si128(src); |
| 547 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 548 __m128i src16 = _mm_unpacklo_epi8(src8, zero); |
| 549 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 550 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 551 // [32] a0 b0 g0 r0 |
| 552 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 553 accum0 = _mm_add_epi32(accum0, t); |
| 554 // [32] a1 b1 g1 r1 |
| 555 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 556 accum1 = _mm_add_epi32(accum1, t); |
| 557 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 558 src16 = _mm_unpackhi_epi8(src8, zero); |
| 559 mul_hi = _mm_mulhi_epi16(src16, coeff16); |
| 560 mul_lo = _mm_mullo_epi16(src16, coeff16); |
| 561 // [32] a2 b2 g2 r2 |
| 562 t = _mm_unpacklo_epi16(mul_lo, mul_hi); |
| 563 accum2 = _mm_add_epi32(accum2, t); |
| 564 } |
| 500 | 565 |
| 501 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 566 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 502 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | 567 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 503 accum0 = _mm_packs_epi32(accum0, accum1); | 568 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 504 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | 569 // [16] a1 b1 g1 r1 a0 b0 g0 r0 |
| 505 accum2 = _mm_packs_epi32(accum2, accum3); | 570 accum0 = _mm_packs_epi32(accum0, accum1); |
| 571 // [16] a3 b3 g3 r3 a2 b2 g2 r2 |
| 572 accum2 = _mm_packs_epi32(accum2, zero); |
| 573 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |
| 574 accum0 = _mm_packus_epi16(accum0, accum2); |
| 575 if (has_alpha) { |
| 576 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |
| 577 __m128i a = _mm_srli_epi32(accum0, 8); |
| 578 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 579 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. |
| 580 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |
| 581 a = _mm_srli_epi32(accum0, 16); |
| 582 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |
| 583 b = _mm_max_epu8(a, b); // Max of r and g and b. |
| 584 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |
| 585 b = _mm_slli_epi32(b, 24); |
| 586 accum0 = _mm_max_epu8(b, accum0); |
| 587 } else { |
| 588 __m128i mask = _mm_set1_epi32(0xff000000); |
| 589 accum0 = _mm_or_si128(accum0, mask); |
| 590 } |
| 506 | 591 |
| 507 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 592 for (int out_x = width; out_x < pixel_width; out_x++) { |
| 508 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | 593 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |
| 509 accum0 = _mm_packus_epi16(accum0, accum2); | 594 accum0 = _mm_srli_si128(accum0, 4); |
| 510 | 595 out_row += 4; |
| 511 if (has_alpha) { | 596 } |
| 512 // Compute the max(ri, gi, bi) for each pixel. | |
| 513 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 514 __m128i a = _mm_srli_epi32(accum0, 8); | |
| 515 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 516 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
| 517 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 518 a = _mm_srli_epi32(accum0, 16); | |
| 519 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 520 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
| 521 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 522 b = _mm_slli_epi32(b, 24); | |
| 523 | |
| 524 // Make sure the value of alpha channel is always larger than maximum | |
| 525 // value of color channels. | |
| 526 accum0 = _mm_max_epu8(b, accum0); | |
| 527 } else { | |
| 528 // Set value of alpha channels to 0xFF. | |
| 529 __m128i mask = _mm_set1_epi32(0xff000000); | |
| 530 accum0 = _mm_or_si128(accum0, mask); | |
| 531 } | 597 } |
| 532 | |
| 533 // Store the convolution result (16 bytes) and advance the pixel pointers. | |
| 534 _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); | |
| 535 out_row += 16; | |
| 536 } | |
| 537 | |
| 538 // When the width of the output is not divisible by 4, We need to save one | |
| 539 // pixel (4 bytes) each time. And also the fourth pixel is always absent. | |
| 540 if (pixel_width & 3) { | |
| 541 accum0 = _mm_setzero_si128(); | |
| 542 accum1 = _mm_setzero_si128(); | |
| 543 accum2 = _mm_setzero_si128(); | |
| 544 for (int filter_y = 0; filter_y < filter_length; ++filter_y) { | |
| 545 coeff16 = _mm_set1_epi16(filter_values[filter_y]); | |
| 546 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 547 src = reinterpret_cast<const __m128i*>( | |
| 548 &source_data_rows[filter_y][width<<2]); | |
| 549 __m128i src8 = _mm_loadu_si128(src); | |
| 550 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 551 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 552 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 553 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 554 // [32] a0 b0 g0 r0 | |
| 555 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 556 accum0 = _mm_add_epi32(accum0, t); | |
| 557 // [32] a1 b1 g1 r1 | |
| 558 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 559 accum1 = _mm_add_epi32(accum1, t); | |
| 560 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 561 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 562 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 563 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 564 // [32] a2 b2 g2 r2 | |
| 565 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 566 accum2 = _mm_add_epi32(accum2, t); | |
| 567 } | |
| 568 | |
| 569 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | |
| 570 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | |
| 571 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | |
| 572 // [16] a1 b1 g1 r1 a0 b0 g0 r0 | |
| 573 accum0 = _mm_packs_epi32(accum0, accum1); | |
| 574 // [16] a3 b3 g3 r3 a2 b2 g2 r2 | |
| 575 accum2 = _mm_packs_epi32(accum2, zero); | |
| 576 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 | |
| 577 accum0 = _mm_packus_epi16(accum0, accum2); | |
| 578 if (has_alpha) { | |
| 579 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 | |
| 580 __m128i a = _mm_srli_epi32(accum0, 8); | |
| 581 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 582 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. | |
| 583 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 | |
| 584 a = _mm_srli_epi32(accum0, 16); | |
| 585 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 | |
| 586 b = _mm_max_epu8(a, b); // Max of r and g and b. | |
| 587 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 | |
| 588 b = _mm_slli_epi32(b, 24); | |
| 589 accum0 = _mm_max_epu8(b, accum0); | |
| 590 } else { | |
| 591 __m128i mask = _mm_set1_epi32(0xff000000); | |
| 592 accum0 = _mm_or_si128(accum0, mask); | |
| 593 } | |
| 594 | |
| 595 for (int out_x = width; out_x < pixel_width; out_x++) { | |
| 596 *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); | |
| 597 accum0 = _mm_srli_si128(accum0, 4); | |
| 598 out_row += 4; | |
| 599 } | |
| 600 } | |
| 601 } | 598 } |
| 602 | 599 |
| 603 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, | 600 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filter_values, |
| 604 int filter_length, | 601 int filter_length, |
| 605 unsigned char* const* source_data_rows, | 602 unsigned char* const* source_data_rows, |
| 606 int pixel_width, | 603 int pixel_width, |
| 607 unsigned char* out_row, | 604 unsigned char* out_row, |
| 608 bool has_alpha) { | 605 bool has_alpha) { |
| 609 if (has_alpha) { | 606 if (has_alpha) { |
| 610 convolveVertically_SSE2<true>(filter_values, | 607 convolveVertically_SSE2<true>(filter_values, |
| 611 filter_length, | 608 filter_length, |
| 612 source_data_rows, | 609 source_data_rows, |
| 613 pixel_width, | 610 pixel_width, |
| 614 out_row); | 611 out_row); |
| 615 } else { | 612 } else { |
| 616 convolveVertically_SSE2<false>(filter_values, | 613 convolveVertically_SSE2<false>(filter_values, |
| 617 filter_length, | 614 filter_length, |
| 618 source_data_rows, | 615 source_data_rows, |
| 619 pixel_width, | 616 pixel_width, |
| 620 out_row); | 617 out_row); |
| 621 } | 618 } |
| 622 } | 619 } |
| 623 | 620 |
| 624 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 621 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
| 625 // Padding |paddingCount| of more dummy coefficients after the coefficients | 622 // Padding |paddingCount| of more dummy coefficients after the coefficients |
| 626 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | 623 // of last filter to prevent SIMD instructions which load 8 or 16 bytes |
| 627 // together to access invalid memory areas. We are not trying to align the | 624 // together to access invalid memory areas. We are not trying to align the |
| 628 // coefficients right now due to the opaqueness of <vector> implementation. | 625 // coefficients right now due to the opaqueness of <vector> implementation. |
| 629 // This has to be done after all |AddFilter| calls. | 626 // This has to be done after all |AddFilter| calls. |
| 630 for (int i = 0; i < 8; ++i) { | 627 for (int i = 0; i < 8; ++i) { |
| 631 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); | 628 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); |
| 632 } | 629 } |
| 633 } | 630 } |
| OLD | NEW |