| OLD | NEW | 
|---|---|
| 1 /* | 1 /* | 
| 2  * Copyright 2013 Google Inc. | 2  * Copyright 2013 Google Inc. | 
| 3  * | 3  * | 
| 4  * Use of this source code is governed by a BSD-style license that can be | 4  * Use of this source code is governed by a BSD-style license that can be | 
| 5  * found in the LICENSE file. | 5  * found in the LICENSE file. | 
| 6  */ | 6  */ | 
| 7 | 7 | 
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> | 
| 9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" | 
| 10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" | 
| (...skipping 22 matching lines...) Expand all  Loading... | 
| 33            v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] | 33            v[8], v[9], v[10], v[11], v[12], v[13], v[14], v[15] | 
| 34            ); | 34            ); | 
| 35 } | 35 } | 
| 36 | 36 | 
| 37 static inline void print128f(__m128 value) { | 37 static inline void print128f(__m128 value) { | 
| 38     float *f = (float*) &value; | 38     float *f = (float*) &value; | 
| 39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 39     printf("%3.4f %3.4f %3.4f %3.4f\n", f[0], f[1], f[2], f[3]); | 
| 40 } | 40 } | 
| 41 #endif | 41 #endif | 
| 42 | 42 | 
|  | 43 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, | 
|  | 44         const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& a
     ccum, int r) { | 
|  | 45     int remainder[4] = {0}; | 
|  | 46     for (int i = 0; i < r; i++) { | 
|  | 47         SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; | 
|  | 48         remainder[0] += coeff * pixels_left[i * 4 + 0]; | 
|  | 49         remainder[1] += coeff * pixels_left[i * 4 + 1]; | 
|  | 50         remainder[2] += coeff * pixels_left[i * 4 + 2]; | 
|  | 51         remainder[3] += coeff * pixels_left[i * 4 + 3]; | 
|  | 52     } | 
|  | 53     __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remaind
     er[3]); | 
|  | 54     accum = _mm_add_epi32(accum, t); | 
|  | 55 } | 
|  | 56 | 
| 43 // Convolves horizontally along a single row. The row data is given in | 57 // Convolves horizontally along a single row. The row data is given in | 
| 44 // |src_data| and continues for the num_values() of the filter. | 58 // |src_data| and continues for the num_values() of the filter. | 
| 45 void convolveHorizontally_SSE2(const unsigned char* src_data, | 59 void convolveHorizontally_SSE2(const unsigned char* src_data, | 
| 46                                const SkConvolutionFilter1D& filter, | 60                                const SkConvolutionFilter1D& filter, | 
| 47                                unsigned char* out_row, | 61                                unsigned char* out_row, | 
| 48                                bool /*has_alpha*/) { | 62                                bool /*has_alpha*/) { | 
| 49     int num_values = filter.numValues(); | 63     int num_values = filter.numValues(); | 
| 50 | 64 | 
| 51     int filter_offset, filter_length; | 65     int filter_offset, filter_length; | 
| 52     __m128i zero = _mm_setzero_si128(); | 66     __m128i zero = _mm_setzero_si128(); | 
| 53     __m128i mask[4]; |  | 
| 54     // |mask| will be used to decimate all extra filter coefficients that are |  | 
| 55     // loaded by SIMD when |filter_length| is not divisible by 4. |  | 
| 56     // mask[0] is not used in following algorithm. |  | 
| 57     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |  | 
| 58     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |  | 
| 59     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |  | 
| 60 | 67 | 
| 61     // Output one pixel each iteration, calculating all channels (RGBA) together
     . | 68     // Output one pixel each iteration, calculating all channels (RGBA) together
     . | 
| 62     for (int out_x = 0; out_x < num_values; out_x++) { | 69     for (int out_x = 0; out_x < num_values; out_x++) { | 
| 63         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 70         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 
| 64             filter.FilterForValue(out_x, &filter_offset, &filter_length); | 71             filter.FilterForValue(out_x, &filter_offset, &filter_length); | 
| 65 | 72 | 
| 66         __m128i accum = _mm_setzero_si128(); | 73         __m128i accum = _mm_setzero_si128(); | 
| 67 | 74 | 
| 68         // Compute the first pixel in this row that the filter affects. It will | 75         // Compute the first pixel in this row that the filter affects. It will | 
| 69         // touch |filter_length| pixels (4 bytes each) after this. | 76         // touch |filter_length| pixels (4 bytes each) after this. | 
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 113             accum = _mm_add_epi32(accum, t); | 120             accum = _mm_add_epi32(accum, t); | 
| 114             // [32]  a3*c3 b3*c3 g3*c3 r3*c3 | 121             // [32]  a3*c3 b3*c3 g3*c3 r3*c3 | 
| 115             t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 122             t = _mm_unpackhi_epi16(mul_lo, mul_hi); | 
| 116             accum = _mm_add_epi32(accum, t); | 123             accum = _mm_add_epi32(accum, t); | 
| 117 | 124 | 
| 118             // Advance the pixel and coefficients pointers. | 125             // Advance the pixel and coefficients pointers. | 
| 119             row_to_filter += 1; | 126             row_to_filter += 1; | 
| 120             filter_values += 4; | 127             filter_values += 4; | 
| 121         } | 128         } | 
| 122 | 129 | 
| 123         // When |filter_length| is not divisible by 4, we need to decimate some 
     of | 130         // When |filter_length| is not divisible by 4, we accumulate the last 1 
     - 3 | 
| 124         // the filter coefficient that was loaded incorrectly to zero; Other tha
     n | 131         // coefficients one at a time. | 
| 125         // that the algorithm is same with above, except that the 4th pixel will
      be | 132         int r = filter_length & 3; | 
| 126         // always absent. |  | 
| 127         int r = filter_length&3; |  | 
| 128         if (r) { | 133         if (r) { | 
| 129             // Note: filter_values must be padded to align_up(filter_offset, 8). | 134             int remainder_offset = (filter_offset + filter_length - r) * 4; | 
| 130             __m128i coeff, coeff16; | 135             accum_remainder(src_data + remainder_offset, filter_values, accum, r
     ); | 
| 131             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
     es)); |  | 
| 132             // Mask out extra filter taps. |  | 
| 133             coeff = _mm_and_si128(coeff, mask[r]); |  | 
| 134             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |  | 
| 135             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |  | 
| 136 |  | 
| 137             // Note: line buffer must be padded to align_up(filter_offset, 16). |  | 
| 138             // We resolve this by use C-version for the last horizontal line. |  | 
| 139             __m128i src8 = _mm_loadu_si128(row_to_filter); |  | 
| 140             __m128i src16 = _mm_unpacklo_epi8(src8, zero); |  | 
| 141             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
| 142             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
| 143             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
| 144             accum = _mm_add_epi32(accum, t); |  | 
| 145             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
| 146             accum = _mm_add_epi32(accum, t); |  | 
| 147 |  | 
| 148             src16 = _mm_unpackhi_epi8(src8, zero); |  | 
| 149             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |  | 
| 150             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |  | 
| 151             mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
| 152             mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
| 153             t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
| 154             accum = _mm_add_epi32(accum, t); |  | 
| 155         } | 136         } | 
| 156 | 137 | 
| 157         // Shift right for fixed point implementation. | 138         // Shift right for fixed point implementation. | 
| 158         accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | 139         accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); | 
| 159 | 140 | 
| 160         // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 141         // Packing 32 bits |accum| to 16 bits per channel (signed saturation). | 
| 161         accum = _mm_packs_epi32(accum, zero); | 142         accum = _mm_packs_epi32(accum, zero); | 
| 162         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 143         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). | 
| 163         accum = _mm_packus_epi16(accum, zero); | 144         accum = _mm_packus_epi16(accum, zero); | 
| 164 | 145 | 
| (...skipping 10 matching lines...) Expand all  Loading... | 
| 175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 156 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 
| 176                                     const SkConvolutionFilter1D& filter, | 157                                     const SkConvolutionFilter1D& filter, | 
| 177                                     unsigned char* out_row[4], | 158                                     unsigned char* out_row[4], | 
| 178                                     size_t outRowBytes) { | 159                                     size_t outRowBytes) { | 
| 179     SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) | 160     SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) | 
| 180 | 161 | 
| 181     int num_values = filter.numValues(); | 162     int num_values = filter.numValues(); | 
| 182 | 163 | 
| 183     int filter_offset, filter_length; | 164     int filter_offset, filter_length; | 
| 184     __m128i zero = _mm_setzero_si128(); | 165     __m128i zero = _mm_setzero_si128(); | 
| 185     __m128i mask[4]; |  | 
| 186     // |mask| will be used to decimate all extra filter coefficients that are |  | 
| 187     // loaded by SIMD when |filter_length| is not divisible by 4. |  | 
| 188     // mask[0] is not used in following algorithm. |  | 
| 189     mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |  | 
| 190     mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |  | 
| 191     mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1); |  | 
| 192 | 166 | 
| 193     // Output one pixel each iteration, calculating all channels (RGBA) together
     . | 167     // Output one pixel each iteration, calculating all channels (RGBA) together
     . | 
| 194     for (int out_x = 0; out_x < num_values; out_x++) { | 168     for (int out_x = 0; out_x < num_values; out_x++) { | 
| 195         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 169         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = | 
| 196             filter.FilterForValue(out_x, &filter_offset, &filter_length); | 170             filter.FilterForValue(out_x, &filter_offset, &filter_length); | 
| 197 | 171 | 
| 198         // four pixels in a column per iteration. | 172         // four pixels in a column per iteration. | 
| 199         __m128i accum0 = _mm_setzero_si128(); | 173         __m128i accum0 = _mm_setzero_si128(); | 
| 200         __m128i accum1 = _mm_setzero_si128(); | 174         __m128i accum1 = _mm_setzero_si128(); | 
| 201         __m128i accum2 = _mm_setzero_si128(); | 175         __m128i accum2 = _mm_setzero_si128(); | 
| (...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 238             ITERATION(src_data[1] + start, accum1); | 212             ITERATION(src_data[1] + start, accum1); | 
| 239             ITERATION(src_data[2] + start, accum2); | 213             ITERATION(src_data[2] + start, accum2); | 
| 240             ITERATION(src_data[3] + start, accum3); | 214             ITERATION(src_data[3] + start, accum3); | 
| 241 | 215 | 
| 242             start += 16; | 216             start += 16; | 
| 243             filter_values += 4; | 217             filter_values += 4; | 
| 244         } | 218         } | 
| 245 | 219 | 
| 246         int r = filter_length & 3; | 220         int r = filter_length & 3; | 
| 247         if (r) { | 221         if (r) { | 
| 248             // Note: filter_values must be padded to align_up(filter_offset, 8); | 222             int remainder_offset = (filter_offset + filter_length - r) * 4; | 
| 249             __m128i coeff; | 223             accum_remainder(src_data[0] + remainder_offset, filter_values, accum
     0, r); | 
| 250             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
     es)); | 224             accum_remainder(src_data[1] + remainder_offset, filter_values, accum
     1, r); | 
| 251             // Mask out extra filter taps. | 225             accum_remainder(src_data[2] + remainder_offset, filter_values, accum
     2, r); | 
| 252             coeff = _mm_and_si128(coeff, mask[r]); | 226             accum_remainder(src_data[3] + remainder_offset, filter_values, accum
     3, r); | 
| 253 |  | 
| 254             __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 
     0)); |  | 
| 255             /* c1 c1 c1 c1 c0 c0 c0 c0 */ |  | 
| 256             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |  | 
| 257             __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 
     2)); |  | 
| 258             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |  | 
| 259 |  | 
| 260             __m128i src8, src16, mul_hi, mul_lo, t; |  | 
| 261 |  | 
| 262             ITERATION(src_data[0] + start, accum0); |  | 
| 263             ITERATION(src_data[1] + start, accum1); |  | 
| 264             ITERATION(src_data[2] + start, accum2); |  | 
| 265             ITERATION(src_data[3] + start, accum3); |  | 
| 266         } | 227         } | 
| 267 | 228 | 
| 268         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 229         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); | 
| 269         accum0 = _mm_packs_epi32(accum0, zero); | 230         accum0 = _mm_packs_epi32(accum0, zero); | 
| 270         accum0 = _mm_packus_epi16(accum0, zero); | 231         accum0 = _mm_packus_epi16(accum0, zero); | 
| 271         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 232         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 
| 272         accum1 = _mm_packs_epi32(accum1, zero); | 233         accum1 = _mm_packs_epi32(accum1, zero); | 
| 273         accum1 = _mm_packus_epi16(accum1, zero); | 234         accum1 = _mm_packus_epi16(accum1, zero); | 
| 274         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 235         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 
| 275         accum2 = _mm_packs_epi32(accum2, zero); | 236         accum2 = _mm_packs_epi32(accum2, zero); | 
| (...skipping 204 matching lines...) Expand 10 before | Expand all | Expand 10 after  Loading... | 
| 480                                       pixel_width, | 441                                       pixel_width, | 
| 481                                       out_row); | 442                                       out_row); | 
| 482     } else { | 443     } else { | 
| 483         convolveVertically_SSE2<false>(filter_values, | 444         convolveVertically_SSE2<false>(filter_values, | 
| 484                                        filter_length, | 445                                        filter_length, | 
| 485                                        source_data_rows, | 446                                        source_data_rows, | 
| 486                                        pixel_width, | 447                                        pixel_width, | 
| 487                                        out_row); | 448                                        out_row); | 
| 488     } | 449     } | 
| 489 } | 450 } | 
| 490 |  | 
| 491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |  | 
| 492     // Padding |paddingCount| of more dummy coefficients after the coefficients |  | 
| 493     // of last filter to prevent SIMD instructions which load 8 or 16 bytes |  | 
| 494     // together to access invalid memory areas. We are not trying to align the |  | 
| 495     // coefficients right now due to the opaqueness of <vector> implementation. |  | 
| 496     // This has to be done after all |AddFilter| calls. |  | 
| 497     for (int i = 0; i < 8; ++i) { |  | 
| 498         filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFix
     ed>(0)); |  | 
| 499     } |  | 
| 500 } |  | 
| OLD | NEW | 
|---|---|