| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
| (...skipping 22 matching lines...) |
| 33 v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] | 33 v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] |
| 34 ); | 34 ); |
| 35 } | 35 } |
| 36 | 36 |
| 37 static inline void print128f(__m128 value) { | 37 static inline void print128f(__m128 value) { |
| 38 float *f = (float*) &value; | 38 float *f = (float*) &value; |
| 39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 39 printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); |
| 40 } | 40 } |
| 41 #endif | 41 #endif |
| 42 | 42 |
| 43 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, |
| 44 const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& accum, int r) { |
| 45 int remainder[4] = {0}; |
| 46 for (int i = 0; i < r; i++) { |
| 47 SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; |
| 48 remainder[0] += coeff * pixels_left[i * 4 + 0]; |
| 49 remainder[1] += coeff * pixels_left[i * 4 + 1]; |
| 50 remainder[2] += coeff * pixels_left[i * 4 + 2]; |
| 51 remainder[3] += coeff * pixels_left[i * 4 + 3]; |
| 52 } |
| 53 __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]); |
| 54 accum = _mm_add_epi32(accum, t); |
| 55 } |
| 56 |
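
The added accum_remainder helper does the tail of the convolution in plain scalar code: for the last r (< 4) taps it accumulates coefficient * pixel per RGBA channel into the 32-bit accumulator lanes. Below is a minimal standalone sketch of that per-channel fixed-point accumulation; the type and shift (ConvolutionFixed as int16_t, kShiftBits == 14) are assumptions standing in for Skia's SkConvolutionFilter1D definitions.

    // Standalone sketch of the scalar remainder accumulation (assumed types noted above).
    #include <cstdint>
    #include <cstdio>

    typedef int16_t ConvolutionFixed;   // assumption: stands in for SkConvolutionFilter1D::ConvolutionFixed
    static const int kShiftBits = 14;   // assumption: stands in for SkConvolutionFilter1D::kShiftBits

    // Accumulate r trailing (coefficient * pixel) products for each of the 4 channels.
    static void accum_remainder_scalar(const uint8_t* pixels, const ConvolutionFixed* coeffs,
                                       int32_t accum[4], int r) {
        for (int i = 0; i < r; i++) {
            for (int c = 0; c < 4; c++) {
                accum[c] += coeffs[i] * pixels[i * 4 + c];
            }
        }
    }

    int main() {
        const uint8_t pixels[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };                 // two RGBA pixels
        const ConvolutionFixed coeffs[2] = { 1 << kShiftBits, 1 << (kShiftBits - 1) };// 1.0 and 0.5
        int32_t accum[4] = { 0, 0, 0, 0 };
        accum_remainder_scalar(pixels, coeffs, accum, 2);
        for (int c = 0; c < 4; c++) {
            printf("%d ", accum[c] >> kShiftBits);   // prints: 35 50 65 80
        }
        printf("\n");
        return 0;
    }

The SSE2 function above produces the same per-channel sums by building the vector with _mm_setr_epi32 and adding it with _mm_add_epi32, so the remainder path no longer needs masked SIMD loads.
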
| 43 // Convolves horizontally along a single row. The row data is given in | 57 // Convolves horizontally along a single row. The row data is given in |
| 44 // |src_data| and continues for the num_values() of the filter. | 58 // |src_data| and continues for the num_values() of the filter. |
| 45 void convolveHorizontally_SSE2(const unsigned char* src_data, | 59 void convolveHorizontally_SSE2(const unsigned char* src_data, |
| 46 const SkConvolutionFilter1D& filter, | 60 const SkConvolutionFilter1D& filter, |
| 47 unsigned char* out_row, | 61 unsigned char* out_row, |
| 48 bool /*has_alpha*/) { | 62 bool /*has_alpha*/) { |
| 49 int num_values = filter.numValues(); | 63 int num_values = filter.numValues(); |
| 50 | 64 |
| 51 int filter_offset, filter_length; | 65 int filter_offset, filter_length; |
| 52 __m128i zero = _mm_setzero_si128(); | 66 __m128i zero = _mm_setzero_si128(); |
| 53 __m128i mask[4]; | |
| 54 // |mask| will be used to decimate all extra filter coefficients that are | |
| 55 // loaded by SIMD when |filter_length| is not divisible by 4. | |
| 56 // mask[0] is not used in following algorithm. | |
| 57 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
| 58 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
| 59 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
| 60 | 67 |
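
The mask[] setup deleted here implemented the old tail handling: a 64-bit load always pulls in 4 coefficients, so the 1-3 lanes past the end of the filter had to be zeroed before multiplying. A self-contained sketch of that masking trick, with hypothetical values:

    #include <emmintrin.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        // Suppose only r = 3 taps remain, but _mm_loadl_epi64 reads 4 int16 lanes.
        int16_t coeffs[4] = { 100, 200, 300, 999 };                 // 999 stands for whatever follows in memory
        __m128i mask3 = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);   // keep only the 3 low lanes
        __m128i c = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(coeffs));
        c = _mm_and_si128(c, mask3);                                // the bogus 4th lane becomes 0
        int16_t out[8];
        _mm_storeu_si128(reinterpret_cast<__m128i*>(out), c);
        printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);    // prints: 100 200 300 0
        return 0;
    }

Zeroing the lane keeps the math correct, but the load itself still touches memory past the last real coefficient, which is why this path needed the padding added by applySIMDPadding_SSE2 (removed at the end of this file).
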
| 61 // Output one pixel each iteration, calculating all channels (RGBA) together. | 68 // Output one pixel each iteration, calculating all channels (RGBA) together. |
| 62 for (int out_x = 0; out_x < num_values; out_x++) { | 69 for (int out_x = 0; out_x < num_values; out_x++) { |
| 63 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 70 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
| 64 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 71 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 65 | 72 |
| 66 __m128i accum = _mm_setzero_si128(); | 73 __m128i accum = _mm_setzero_si128(); |
| 67 | 74 |
| 68 // Compute the first pixel in this row that the filter affects. It will | 75 // Compute the first pixel in this row that the filter affects. It will |
| 69 // touch |filter_length| pixels (4 bytes each) after this. | 76 // touch |filter_length| pixels (4 bytes each) after this. |
| (...skipping 43 matching lines...) |
| 113 accum = _mm_add_epi32(accum, t); | 120 accum = _mm_add_epi32(accum, t); |
| 114 // [32] a3*c3 b3*c3 g3*c3 r3*c3 | 121 // [32] a3*c3 b3*c3 g3*c3 r3*c3 |
| 115 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 122 t = _mm_unpackhi_epi16(mul_lo, mul_hi); |
| 116 accum = _mm_add_epi32(accum, t); | 123 accum = _mm_add_epi32(accum, t); |
| 117 | 124 |
| 118 // Advance the pixel and coefficients pointers. | 125 // Advance the pixel and coefficients pointers. |
| 119 row_to_filter += 1; | 126 row_to_filter += 1; |
| 120 filter_values += 4; | 127 filter_values += 4; |
| 121 } | 128 } |
| 122 | 129 |
| 123 // When |filter_length| is not divisible by 4, we need to decimate some of | 130 // When |filter_length| is not divisible by 4, we accumulate the last 1 - 3 |
| 124 // the filter coefficient that was loaded incorrectly to zero; Other than | 131 // coefficients one at a time. |
| 125 // that the algorithm is same with above, exceot that the 4th pixel will be | 132 int r = filter_length & 3; |
| 126 // always absent. | |
| 127 int r = filter_length&3; | |
| 128 if (r) { | 133 if (r) { |
| 129 // Note: filter_values must be padded to align_up(filter_offset, 8). | 134 int remainder_offset = (filter_offset + filter_length - r) * 4; |
| 130 __m128i coeff, coeff16; | 135 accum_remainder(src_data + remainder_offset, filter_values, accum, r); |
| 131 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | |
| 132 // Mask out extra filter taps. | |
| 133 coeff = _mm_and_si128(coeff, mask[r]); | |
| 134 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 135 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 136 | |
| 137 // Note: line buffer must be padded to align_up(filter_offset, 16). | |
| 138 // We resolve this by use C-version for the last horizontal line. | |
| 139 __m128i src8 = _mm_loadu_si128(row_to_filter); | |
| 140 __m128i src16 = _mm_unpacklo_epi8(src8, zero); | |
| 141 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 142 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 143 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 144 accum = _mm_add_epi32(accum, t); | |
| 145 t = _mm_unpackhi_epi16(mul_lo, mul_hi); | |
| 146 accum = _mm_add_epi32(accum, t); | |
| 147 | |
| 148 src16 = _mm_unpackhi_epi8(src8, zero); | |
| 149 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 150 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); | |
| 151 mul_hi = _mm_mulhi_epi16(src16, coeff16); | |
| 152 mul_lo = _mm_mullo_epi16(src16, coeff16); | |
| 153 t = _mm_unpacklo_epi16(mul_lo, mul_hi); | |
| 154 accum = _mm_add_epi32(accum, t); | |
| 155 } | 136 } |
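
The new remainder_offset computation is worth spelling out: the vector loop consumes taps in groups of 4, so the r = filter_length & 3 leftover taps start at pixel filter_offset + filter_length - r, and each pixel is 4 bytes (RGBA). A tiny worked example with hypothetical values:

    #include <cstdio>

    int main() {
        int filter_offset = 10, filter_length = 7;   // hypothetical filter placement and size
        int r = filter_length & 3;                   // 3 taps left after one 4-tap SIMD pass
        int remainder_offset = (filter_offset + filter_length - r) * 4;
        printf("r=%d remainder_offset=%d\n", r, remainder_offset);   // prints: r=3 remainder_offset=56
        return 0;
    }
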
| 156 | 137 |
| 157 // Shift right for fixed point implementation. | 138 // Shift right for fixed point implementation. |
| 158 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | 139 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |
| 159 | 140 |
| 160 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 141 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |
| 161 accum = _mm_packs_epi32(accum, zero); | 142 accum = _mm_packs_epi32(accum, zero); |
| 162 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 143 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |
| 163 accum = _mm_packus_epi16(accum, zero); | 144 accum = _mm_packus_epi16(accum, zero); |
| 164 | 145 |
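
For readers not fluent in the pack intrinsics: the arithmetic shift drops the kShiftBits fixed-point fraction, _mm_packs_epi32 clamps each 32-bit sum to a signed 16-bit value, and _mm_packus_epi16 then clamps to an unsigned byte. A scalar sketch of the combined effect for one channel (kShiftBits == 14 assumed):

    #include <cstdint>
    #include <cstdio>

    // Scalar equivalent of srai + packs + packus for a single channel.
    static inline uint8_t fixed_accum_to_byte(int32_t accum) {
        int32_t v = accum >> 14;      // drop the fixed-point fraction (kShiftBits)
        if (v < 0)   v = 0;           // negative sums (ringing filters) clamp to 0
        if (v > 255) v = 255;         // overshoot clamps to 255
        return (uint8_t) v;
    }

    int main() {
        printf("%d %d %d\n", fixed_accum_to_byte(80 << 14),    // exactly 80.0 -> 80
                             fixed_accum_to_byte(1 << 13),     // 0.5 truncates to 0
                             fixed_accum_to_byte(300 << 14));  // 300.0 clamps to 255
        return 0;
    }
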
| (...skipping 10 matching lines...) |
| 175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 156 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| 176 const SkConvolutionFilter1D& filter, | 157 const SkConvolutionFilter1D& filter, |
| 177 unsigned char* out_row[4], | 158 unsigned char* out_row[4], |
| 178 size_t outRowBytes) { | 159 size_t outRowBytes) { |
| 179 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) | 160 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) |
| 180 | 161 |
| 181 int num_values = filter.numValues(); | 162 int num_values = filter.numValues(); |
| 182 | 163 |
| 183 int filter_offset, filter_length; | 164 int filter_offset, filter_length; |
| 184 __m128i zero = _mm_setzero_si128(); | 165 __m128i zero = _mm_setzero_si128(); |
| 185 __m128i mask[4]; | |
| 186 // |mask| will be used to decimate all extra filter coefficients that are | |
| 187 // loaded by SIMD when |filter_length| is not divisible by 4. | |
| 188 // mask[0] is not used in following algorithm. | |
| 189 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | |
| 190 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | |
| 191 mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); | |
| 192 | 166 |
| 193 // Output one pixel each iteration, calculating all channels (RGBA) together. | 167 // Output one pixel each iteration, calculating all channels (RGBA) together. |
| 194 for (int out_x = 0; out_x < num_values; out_x++) { | 168 for (int out_x = 0; out_x < num_values; out_x++) { |
| 195 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 169 const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |
| 196 filter.FilterForValue(out_x, &filter_offset, &filter_length); | 170 filter.FilterForValue(out_x, &filter_offset, &filter_length); |
| 197 | 171 |
| 198 // four pixels in a column per iteration. | 172 // four pixels in a column per iteration. |
| 199 __m128i accum0 = _mm_setzero_si128(); | 173 __m128i accum0 = _mm_setzero_si128(); |
| 200 __m128i accum1 = _mm_setzero_si128(); | 174 __m128i accum1 = _mm_setzero_si128(); |
| 201 __m128i accum2 = _mm_setzero_si128(); | 175 __m128i accum2 = _mm_setzero_si128(); |
| (...skipping 36 matching lines...) |
| 238 ITERATION(src_data[1] + start, accum1); | 212 ITERATION(src_data[1] + start, accum1); |
| 239 ITERATION(src_data[2] + start, accum2); | 213 ITERATION(src_data[2] + start, accum2); |
| 240 ITERATION(src_data[3] + start, accum3); | 214 ITERATION(src_data[3] + start, accum3); |
| 241 | 215 |
| 242 start += 16; | 216 start += 16; |
| 243 filter_values += 4; | 217 filter_values += 4; |
| 244 } | 218 } |
| 245 | 219 |
| 246 int r = filter_length & 3; | 220 int r = filter_length & 3; |
| 247 if (r) { | 221 if (r) { |
| 248 // Note: filter_values must be padded to align_up(filter_offset, 8); | 222 int remainder_offset = (filter_offset + filter_length - r) * 4; |
| 249 __m128i coeff; | 223 accum_remainder(src_data[0] + remainder_offset, filter_values, accum0, r); |
| 250 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values)); | 224 accum_remainder(src_data[1] + remainder_offset, filter_values, accum1, r); |
| 251 // Mask out extra filter taps. | 225 accum_remainder(src_data[2] + remainder_offset, filter_values, accum2, r); |
| 252 coeff = _mm_and_si128(coeff, mask[r]); | 226 accum_remainder(src_data[3] + remainder_offset, filter_values, accum3, r); |
| 253 | |
| 254 __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); | |
| 255 /* c1 c1 c1 c1 c0 c0 c0 c0 */ | |
| 256 coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); | |
| 257 __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); | |
| 258 coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); | |
| 259 | |
| 260 __m128i src8, src16, mul_hi, mul_lo, t; | |
| 261 | |
| 262 ITERATION(src_data[0] + start, accum0); | |
| 263 ITERATION(src_data[1] + start, accum1); | |
| 264 ITERATION(src_data[2] + start, accum2); | |
| 265 ITERATION(src_data[3] + start, accum3); | |
| 266 } | 227 } |
| 267 | 228 |
| 268 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 229 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |
| 269 accum0 = _mm_packs_epi32(accum0, zero); | 230 accum0 = _mm_packs_epi32(accum0, zero); |
| 270 accum0 = _mm_packus_epi16(accum0, zero); | 231 accum0 = _mm_packus_epi16(accum0, zero); |
| 271 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 232 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 272 accum1 = _mm_packs_epi32(accum1, zero); | 233 accum1 = _mm_packs_epi32(accum1, zero); |
| 273 accum1 = _mm_packus_epi16(accum1, zero); | 234 accum1 = _mm_packus_epi16(accum1, zero); |
| 274 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 235 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 275 accum2 = _mm_packs_epi32(accum2, zero); | 236 accum2 = _mm_packs_epi32(accum2, zero); |
| (...skipping 204 matching lines...) |
| 480 pixel_width, | 441 pixel_width, |
| 481 out_row); | 442 out_row); |
| 482 } else { | 443 } else { |
| 483 convolveVertically_SSE2<false>(filter_values, | 444 convolveVertically_SSE2<false>(filter_values, |
| 484 filter_length, | 445 filter_length, |
| 485 source_data_rows, | 446 source_data_rows, |
| 486 pixel_width, | 447 pixel_width, |
| 487 out_row); | 448 out_row); |
| 488 } | 449 } |
| 489 } | 450 } |
| 490 | |
| 491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | |
| 492 // Padding |paddingCount| of more dummy coefficients after the coefficients | |
| 493 // of last filter to prevent SIMD instructions which load 8 or 16 bytes | |
| 494 // together to access invalid memory areas. We are not trying to align the | |
| 495 // coefficients right now due to the opaqueness of <vector> implementation. | |
| 496 // This has to be done after all |AddFilter| calls. | |
| 497 for (int i = 0; i < 8; ++i) { | |
| 498 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); | |
| 499 } | |
| 500 } | |
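
The removed applySIMDPadding_SSE2 existed only to make the masked tail loads above safe: after all AddFilter calls, eight zero coefficients were appended so a 64- or 128-bit load starting at the last real taps stayed inside the array, with the zero taps contributing nothing. Since accum_remainder reads exactly r trailing coefficients, that padding step presumably becomes unnecessary, which is why the helper can go. A plain-vector illustration of the old requirement (not Skia API, just the idea):

    #include <cstdint>
    #include <vector>

    int main() {
        std::vector<int16_t> coeffs = { 100, 200, 300 };   // 3 real taps of the last filter
        // Old requirement: a 64-bit SIMD load at &coeffs[0] reads 4 lanes, one past the end.
        // Appending zeros keeps that read in-bounds and leaves the result unchanged,
        // because a zero coefficient adds nothing to the accumulator.
        for (int i = 0; i < 8; ++i) {
            coeffs.push_back(0);
        }
        return 0;
    }
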