| OLD | NEW | 
 | (Empty) | 
|    1 /* |  | 
|    2  * Copyright 2013 Google Inc. |  | 
|    3  * |  | 
|    4  * Use of this source code is governed by a BSD-style license that can be |  | 
|    5  * found in the LICENSE file. |  | 
|    6  */ |  | 
|    7  |  | 
|    8 #include <emmintrin.h> |  | 
|    9 #include "SkBitmap.h" |  | 
|   10 #include "SkBitmapFilter_opts_SSE2.h" |  | 
|   11 #include "SkBitmapProcState.h" |  | 
|   12 #include "SkColor.h" |  | 
|   13 #include "SkColorPriv.h" |  | 
|   14 #include "SkConvolver.h" |  | 
|   15 #include "SkShader.h" |  | 
|   16 #include "SkUnPreMultiply.h" |  | 
|   17  |  | 
#if 0
// Debug-only helpers (compiled out by the surrounding #if 0). Each one prints
// the lanes of an SSE register with printf, in memory order (lane 0 first).

// Prints |value| as four signed 32-bit integers.
static inline void print128i(__m128i value) {
    int lanes[4];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), value);
    printf("% .11d % .11d % .11d % .11d\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}

// Prints |value| as eight signed 16-bit integers.
static inline void print128i_16(__m128i value) {
    short lanes[8];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(lanes), value);
    printf("% .5d % .5d % .5d % .5d % .5d % .5d % .5d % .5d\n",
           lanes[0], lanes[1], lanes[2], lanes[3],
           lanes[4], lanes[5], lanes[6], lanes[7]);
}

// Prints |value| as sixteen unsigned bytes.
static inline void print128i_8(__m128i value) {
    unsigned char bytes[16];
    _mm_storeu_si128(reinterpret_cast<__m128i*>(bytes), value);
    printf("%.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u %.3u\n",
           bytes[0], bytes[1], bytes[2], bytes[3],
           bytes[4], bytes[5], bytes[6], bytes[7],
           bytes[8], bytes[9], bytes[10], bytes[11],
           bytes[12], bytes[13], bytes[14], bytes[15]);
}

// Prints |value| as four single-precision floats.
static inline void print128f(__m128 value) {
    float lanes[4];
    _mm_storeu_ps(lanes, value);
    printf("%3.4f %3.4f %3.4f %3.4f\n", lanes[0], lanes[1], lanes[2], lanes[3]);
}
#endif
|   42  |  | 
|   43 static SK_ALWAYS_INLINE void accum_remainder(const unsigned char* pixels_left, |  | 
|   44         const SkConvolutionFilter1D::ConvolutionFixed* filter_values, __m128i& a
     ccum, int r) { |  | 
|   45     int remainder[4] = {0}; |  | 
|   46     for (int i = 0; i < r; i++) { |  | 
|   47         SkConvolutionFilter1D::ConvolutionFixed coeff = filter_values[i]; |  | 
|   48         remainder[0] += coeff * pixels_left[i * 4 + 0]; |  | 
|   49         remainder[1] += coeff * pixels_left[i * 4 + 1]; |  | 
|   50         remainder[2] += coeff * pixels_left[i * 4 + 2]; |  | 
|   51         remainder[3] += coeff * pixels_left[i * 4 + 3]; |  | 
|   52     } |  | 
|   53     __m128i t = _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remaind
     er[3]); |  | 
|   54     accum = _mm_add_epi32(accum, t); |  | 
|   55 } |  | 
|   56  |  | 
|   57 // Convolves horizontally along a single row. The row data is given in |  | 
|   58 // |src_data| and continues for the num_values() of the filter. |  | 
|   59 void convolveHorizontally_SSE2(const unsigned char* src_data, |  | 
|   60                                const SkConvolutionFilter1D& filter, |  | 
|   61                                unsigned char* out_row, |  | 
|   62                                bool /*has_alpha*/) { |  | 
|   63     int num_values = filter.numValues(); |  | 
|   64  |  | 
|   65     int filter_offset, filter_length; |  | 
|   66     __m128i zero = _mm_setzero_si128(); |  | 
|   67  |  | 
|   68     // Output one pixel each iteration, calculating all channels (RGBA) together
     . |  | 
|   69     for (int out_x = 0; out_x < num_values; out_x++) { |  | 
|   70         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |  | 
|   71             filter.FilterForValue(out_x, &filter_offset, &filter_length); |  | 
|   72  |  | 
|   73         __m128i accum = _mm_setzero_si128(); |  | 
|   74  |  | 
|   75         // Compute the first pixel in this row that the filter affects. It will |  | 
|   76         // touch |filter_length| pixels (4 bytes each) after this. |  | 
|   77         const __m128i* row_to_filter = |  | 
|   78             reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]); |  | 
|   79  |  | 
|   80         // We will load and accumulate with four coefficients per iteration. |  | 
|   81         for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) { |  | 
|   82  |  | 
|   83             // Load 4 coefficients => duplicate 1st and 2nd of them for all chan
     nels. |  | 
|   84             __m128i coeff, coeff16; |  | 
|   85             // [16] xx xx xx xx c3 c2 c1 c0 |  | 
|   86             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
     es)); |  | 
|   87             // [16] xx xx xx xx c1 c1 c0 c0 |  | 
|   88             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |  | 
|   89             // [16] c1 c1 c1 c1 c0 c0 c0 c0 |  | 
|   90             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |  | 
|   91  |  | 
|   92             // Load four pixels => unpack the first two pixels to 16 bits => |  | 
|   93             // multiply with coefficients => accumulate the convolution result. |  | 
|   94             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|   95             __m128i src8 = _mm_loadu_si128(row_to_filter); |  | 
|   96             // [16] a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|   97             __m128i src16 = _mm_unpacklo_epi8(src8, zero); |  | 
|   98             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|   99             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  100             // [32]  a0*c0 b0*c0 g0*c0 r0*c0 |  | 
|  101             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  102             accum = _mm_add_epi32(accum, t); |  | 
|  103             // [32]  a1*c1 b1*c1 g1*c1 r1*c1 |  | 
|  104             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
|  105             accum = _mm_add_epi32(accum, t); |  | 
|  106  |  | 
|  107             // Duplicate 3rd and 4th coefficients for all channels => |  | 
|  108             // unpack the 3rd and 4th pixels to 16 bits => multiply with coeffic
     ients |  | 
|  109             // => accumulate the convolution results. |  | 
|  110             // [16] xx xx xx xx c3 c3 c2 c2 |  | 
|  111             coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |  | 
|  112             // [16] c3 c3 c3 c3 c2 c2 c2 c2 |  | 
|  113             coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); |  | 
|  114             // [16] a3 g3 b3 r3 a2 g2 b2 r2 |  | 
|  115             src16 = _mm_unpackhi_epi8(src8, zero); |  | 
|  116             mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|  117             mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  118             // [32]  a2*c2 b2*c2 g2*c2 r2*c2 |  | 
|  119             t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  120             accum = _mm_add_epi32(accum, t); |  | 
|  121             // [32]  a3*c3 b3*c3 g3*c3 r3*c3 |  | 
|  122             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
|  123             accum = _mm_add_epi32(accum, t); |  | 
|  124  |  | 
|  125             // Advance the pixel and coefficients pointers. |  | 
|  126             row_to_filter += 1; |  | 
|  127             filter_values += 4; |  | 
|  128         } |  | 
|  129  |  | 
|  130         // When |filter_length| is not divisible by 4, we accumulate the last 1 
     - 3 |  | 
|  131         // coefficients one at a time. |  | 
|  132         int r = filter_length & 3; |  | 
|  133         if (r) { |  | 
|  134             int remainder_offset = (filter_offset + filter_length - r) * 4; |  | 
|  135             accum_remainder(src_data + remainder_offset, filter_values, accum, r
     ); |  | 
|  136         } |  | 
|  137  |  | 
|  138         // Shift right for fixed point implementation. |  | 
|  139         accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); |  | 
|  140  |  | 
|  141         // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |  | 
|  142         accum = _mm_packs_epi32(accum, zero); |  | 
|  143         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |  | 
|  144         accum = _mm_packus_epi16(accum, zero); |  | 
|  145  |  | 
|  146         // Store the pixel value of 32 bits. |  | 
|  147         *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum); |  | 
|  148         out_row += 4; |  | 
|  149     } |  | 
|  150 } |  | 
|  151  |  | 
|  152 // Convolves horizontally along four rows. The row data is given in |  | 
|  153 // |src_data| and continues for the num_values() of the filter. |  | 
|  154 // The algorithm is almost same as |ConvolveHorizontally_SSE2|. Please |  | 
|  155 // refer to that function for detailed comments. |  | 
|  156 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |  | 
|  157                                     const SkConvolutionFilter1D& filter, |  | 
|  158                                     unsigned char* out_row[4], |  | 
|  159                                     size_t outRowBytes) { |  | 
|  160     SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) |  | 
|  161  |  | 
|  162     int num_values = filter.numValues(); |  | 
|  163  |  | 
|  164     int filter_offset, filter_length; |  | 
|  165     __m128i zero = _mm_setzero_si128(); |  | 
|  166  |  | 
|  167     // Output one pixel each iteration, calculating all channels (RGBA) together
     . |  | 
|  168     for (int out_x = 0; out_x < num_values; out_x++) { |  | 
|  169         const SkConvolutionFilter1D::ConvolutionFixed* filter_values = |  | 
|  170             filter.FilterForValue(out_x, &filter_offset, &filter_length); |  | 
|  171  |  | 
|  172         // four pixels in a column per iteration. |  | 
|  173         __m128i accum0 = _mm_setzero_si128(); |  | 
|  174         __m128i accum1 = _mm_setzero_si128(); |  | 
|  175         __m128i accum2 = _mm_setzero_si128(); |  | 
|  176         __m128i accum3 = _mm_setzero_si128(); |  | 
|  177         int start = (filter_offset<<2); |  | 
|  178         // We will load and accumulate with four coefficients per iteration. |  | 
|  179         for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) { |  | 
|  180             __m128i coeff, coeff16lo, coeff16hi; |  | 
|  181             // [16] xx xx xx xx c3 c2 c1 c0 |  | 
|  182             coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_valu
     es)); |  | 
|  183             // [16] xx xx xx xx c1 c1 c0 c0 |  | 
|  184             coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); |  | 
|  185             // [16] c1 c1 c1 c1 c0 c0 c0 c0 |  | 
|  186             coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo); |  | 
|  187             // [16] xx xx xx xx c3 c3 c2 c2 |  | 
|  188             coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); |  | 
|  189             // [16] c3 c3 c3 c3 c2 c2 c2 c2 |  | 
|  190             coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi); |  | 
|  191  |  | 
|  192             __m128i src8, src16, mul_hi, mul_lo, t; |  | 
|  193  |  | 
|  194 #define ITERATION(src, accum)                                                \ |  | 
|  195             src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));   \ |  | 
|  196             src16 = _mm_unpacklo_epi8(src8, zero);                           \ |  | 
|  197             mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                      \ |  | 
|  198             mul_lo = _mm_mullo_epi16(src16, coeff16lo);                      \ |  | 
|  199             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \ |  | 
|  200             accum = _mm_add_epi32(accum, t);                                 \ |  | 
|  201             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \ |  | 
|  202             accum = _mm_add_epi32(accum, t);                                 \ |  | 
|  203             src16 = _mm_unpackhi_epi8(src8, zero);                           \ |  | 
|  204             mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                      \ |  | 
|  205             mul_lo = _mm_mullo_epi16(src16, coeff16hi);                      \ |  | 
|  206             t = _mm_unpacklo_epi16(mul_lo, mul_hi);                          \ |  | 
|  207             accum = _mm_add_epi32(accum, t);                                 \ |  | 
|  208             t = _mm_unpackhi_epi16(mul_lo, mul_hi);                          \ |  | 
|  209             accum = _mm_add_epi32(accum, t) |  | 
|  210  |  | 
|  211             ITERATION(src_data[0] + start, accum0); |  | 
|  212             ITERATION(src_data[1] + start, accum1); |  | 
|  213             ITERATION(src_data[2] + start, accum2); |  | 
|  214             ITERATION(src_data[3] + start, accum3); |  | 
|  215  |  | 
|  216             start += 16; |  | 
|  217             filter_values += 4; |  | 
|  218         } |  | 
|  219  |  | 
|  220         int r = filter_length & 3; |  | 
|  221         if (r) { |  | 
|  222             int remainder_offset = (filter_offset + filter_length - r) * 4; |  | 
|  223             accum_remainder(src_data[0] + remainder_offset, filter_values, accum
     0, r); |  | 
|  224             accum_remainder(src_data[1] + remainder_offset, filter_values, accum
     1, r); |  | 
|  225             accum_remainder(src_data[2] + remainder_offset, filter_values, accum
     2, r); |  | 
|  226             accum_remainder(src_data[3] + remainder_offset, filter_values, accum
     3, r); |  | 
|  227         } |  | 
|  228  |  | 
|  229         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |  | 
|  230         accum0 = _mm_packs_epi32(accum0, zero); |  | 
|  231         accum0 = _mm_packus_epi16(accum0, zero); |  | 
|  232         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |  | 
|  233         accum1 = _mm_packs_epi32(accum1, zero); |  | 
|  234         accum1 = _mm_packus_epi16(accum1, zero); |  | 
|  235         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |  | 
|  236         accum2 = _mm_packs_epi32(accum2, zero); |  | 
|  237         accum2 = _mm_packus_epi16(accum2, zero); |  | 
|  238         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |  | 
|  239         accum3 = _mm_packs_epi32(accum3, zero); |  | 
|  240         accum3 = _mm_packus_epi16(accum3, zero); |  | 
|  241  |  | 
|  242         // We seem to be running off the edge here (chromium:491660). |  | 
|  243         SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes); |  | 
|  244  |  | 
|  245         *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |  | 
|  246         *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |  | 
|  247         *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |  | 
|  248         *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |  | 
|  249  |  | 
|  250         out_row[0] += 4; |  | 
|  251         out_row[1] += 4; |  | 
|  252         out_row[2] += 4; |  | 
|  253         out_row[3] += 4; |  | 
|  254     } |  | 
|  255 } |  | 
|  256  |  | 
|  257 // Does vertical convolution to produce one output row. The filter values and |  | 
|  258 // length are given in the first two parameters. These are applied to each |  | 
|  259 // of the rows pointed to in the |source_data_rows| array, with each row |  | 
|  260 // being |pixel_width| wide. |  | 
|  261 // |  | 
|  262 // The output must have room for |pixel_width * 4| bytes. |  | 
|  263 template<bool has_alpha> |  | 
|  264 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
     er_values, |  | 
|  265                              int filter_length, |  | 
|  266                              unsigned char* const* source_data_rows, |  | 
|  267                              int pixel_width, |  | 
|  268                              unsigned char* out_row) { |  | 
|  269     int width = pixel_width & ~3; |  | 
|  270  |  | 
|  271     __m128i zero = _mm_setzero_si128(); |  | 
|  272     __m128i accum0, accum1, accum2, accum3, coeff16; |  | 
|  273     const __m128i* src; |  | 
|  274     // Output four pixels per iteration (16 bytes). |  | 
|  275     for (int out_x = 0; out_x < width; out_x += 4) { |  | 
|  276  |  | 
|  277         // Accumulated result for each pixel. 32 bits per RGBA channel. |  | 
|  278         accum0 = _mm_setzero_si128(); |  | 
|  279         accum1 = _mm_setzero_si128(); |  | 
|  280         accum2 = _mm_setzero_si128(); |  | 
|  281         accum3 = _mm_setzero_si128(); |  | 
|  282  |  | 
|  283         // Convolve with one filter coefficient per iteration. |  | 
|  284         for (int filter_y = 0; filter_y < filter_length; filter_y++) { |  | 
|  285  |  | 
|  286             // Duplicate the filter coefficient 8 times. |  | 
|  287             // [16] cj cj cj cj cj cj cj cj |  | 
|  288             coeff16 = _mm_set1_epi16(filter_values[filter_y]); |  | 
|  289  |  | 
|  290             // Load four pixels (16 bytes) together. |  | 
|  291             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  292             src = reinterpret_cast<const __m128i*>( |  | 
|  293                 &source_data_rows[filter_y][out_x << 2]); |  | 
|  294             __m128i src8 = _mm_loadu_si128(src); |  | 
|  295  |  | 
|  296             // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel
     s => |  | 
|  297             // multiply with current coefficient => accumulate the result. |  | 
|  298             // [16] a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  299             __m128i src16 = _mm_unpacklo_epi8(src8, zero); |  | 
|  300             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|  301             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  302             // [32] a0 b0 g0 r0 |  | 
|  303             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  304             accum0 = _mm_add_epi32(accum0, t); |  | 
|  305             // [32] a1 b1 g1 r1 |  | 
|  306             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
|  307             accum1 = _mm_add_epi32(accum1, t); |  | 
|  308  |  | 
|  309             // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel
     s => |  | 
|  310             // multiply with current coefficient => accumulate the result. |  | 
|  311             // [16] a3 b3 g3 r3 a2 b2 g2 r2 |  | 
|  312             src16 = _mm_unpackhi_epi8(src8, zero); |  | 
|  313             mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|  314             mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  315             // [32] a2 b2 g2 r2 |  | 
|  316             t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  317             accum2 = _mm_add_epi32(accum2, t); |  | 
|  318             // [32] a3 b3 g3 r3 |  | 
|  319             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
|  320             accum3 = _mm_add_epi32(accum3, t); |  | 
|  321         } |  | 
|  322  |  | 
|  323         // Shift right for fixed point implementation. |  | 
|  324         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |  | 
|  325         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |  | 
|  326         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |  | 
|  327         accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |  | 
|  328  |  | 
|  329         // Packing 32 bits |accum| to 16 bits per channel (signed saturation). |  | 
|  330         // [16] a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  331         accum0 = _mm_packs_epi32(accum0, accum1); |  | 
|  332         // [16] a3 b3 g3 r3 a2 b2 g2 r2 |  | 
|  333         accum2 = _mm_packs_epi32(accum2, accum3); |  | 
|  334  |  | 
|  335         // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). |  | 
|  336         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  337         accum0 = _mm_packus_epi16(accum0, accum2); |  | 
|  338  |  | 
|  339         if (has_alpha) { |  | 
|  340             // Compute the max(ri, gi, bi) for each pixel. |  | 
|  341             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |  | 
|  342             __m128i a = _mm_srli_epi32(accum0, 8); |  | 
|  343             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |  | 
|  344             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g. |  | 
|  345             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |  | 
|  346             a = _mm_srli_epi32(accum0, 16); |  | 
|  347             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |  | 
|  348             b = _mm_max_epu8(a, b);  // Max of r and g and b. |  | 
|  349             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |  | 
|  350             b = _mm_slli_epi32(b, 24); |  | 
|  351  |  | 
|  352             // Make sure the value of alpha channel is always larger than maximu
     m |  | 
|  353             // value of color channels. |  | 
|  354             accum0 = _mm_max_epu8(b, accum0); |  | 
|  355         } else { |  | 
|  356             // Set value of alpha channels to 0xFF. |  | 
|  357             __m128i mask = _mm_set1_epi32(0xff000000); |  | 
|  358             accum0 = _mm_or_si128(accum0, mask); |  | 
|  359         } |  | 
|  360  |  | 
|  361         // Store the convolution result (16 bytes) and advance the pixel pointer
     s. |  | 
|  362         _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0); |  | 
|  363         out_row += 16; |  | 
|  364     } |  | 
|  365  |  | 
|  366     // When the width of the output is not divisible by 4, We need to save one |  | 
|  367     // pixel (4 bytes) each time. And also the fourth pixel is always absent. |  | 
|  368     if (pixel_width & 3) { |  | 
|  369         accum0 = _mm_setzero_si128(); |  | 
|  370         accum1 = _mm_setzero_si128(); |  | 
|  371         accum2 = _mm_setzero_si128(); |  | 
|  372         for (int filter_y = 0; filter_y < filter_length; ++filter_y) { |  | 
|  373             coeff16 = _mm_set1_epi16(filter_values[filter_y]); |  | 
|  374             // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  375             src = reinterpret_cast<const __m128i*>( |  | 
|  376                 &source_data_rows[filter_y][width<<2]); |  | 
|  377             __m128i src8 = _mm_loadu_si128(src); |  | 
|  378             // [16] a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  379             __m128i src16 = _mm_unpacklo_epi8(src8, zero); |  | 
|  380             __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|  381             __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  382             // [32] a0 b0 g0 r0 |  | 
|  383             __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  384             accum0 = _mm_add_epi32(accum0, t); |  | 
|  385             // [32] a1 b1 g1 r1 |  | 
|  386             t = _mm_unpackhi_epi16(mul_lo, mul_hi); |  | 
|  387             accum1 = _mm_add_epi32(accum1, t); |  | 
|  388             // [16] a3 b3 g3 r3 a2 b2 g2 r2 |  | 
|  389             src16 = _mm_unpackhi_epi8(src8, zero); |  | 
|  390             mul_hi = _mm_mulhi_epi16(src16, coeff16); |  | 
|  391             mul_lo = _mm_mullo_epi16(src16, coeff16); |  | 
|  392             // [32] a2 b2 g2 r2 |  | 
|  393             t = _mm_unpacklo_epi16(mul_lo, mul_hi); |  | 
|  394             accum2 = _mm_add_epi32(accum2, t); |  | 
|  395         } |  | 
|  396  |  | 
|  397         accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); |  | 
|  398         accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |  | 
|  399         accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |  | 
|  400         // [16] a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  401         accum0 = _mm_packs_epi32(accum0, accum1); |  | 
|  402         // [16] a3 b3 g3 r3 a2 b2 g2 r2 |  | 
|  403         accum2 = _mm_packs_epi32(accum2, zero); |  | 
|  404         // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 |  | 
|  405         accum0 = _mm_packus_epi16(accum0, accum2); |  | 
|  406         if (has_alpha) { |  | 
|  407             // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 |  | 
|  408             __m128i a = _mm_srli_epi32(accum0, 8); |  | 
|  409             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |  | 
|  410             __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g. |  | 
|  411             // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 |  | 
|  412             a = _mm_srli_epi32(accum0, 16); |  | 
|  413             // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 |  | 
|  414             b = _mm_max_epu8(a, b);  // Max of r and g and b. |  | 
|  415             // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 |  | 
|  416             b = _mm_slli_epi32(b, 24); |  | 
|  417             accum0 = _mm_max_epu8(b, accum0); |  | 
|  418         } else { |  | 
|  419             __m128i mask = _mm_set1_epi32(0xff000000); |  | 
|  420             accum0 = _mm_or_si128(accum0, mask); |  | 
|  421         } |  | 
|  422  |  | 
|  423         for (int out_x = width; out_x < pixel_width; out_x++) { |  | 
|  424             *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0); |  | 
|  425             accum0 = _mm_srli_si128(accum0, 4); |  | 
|  426             out_row += 4; |  | 
|  427         } |  | 
|  428     } |  | 
|  429 } |  | 
|  430  |  | 
|  431 void convolveVertically_SSE2(const SkConvolutionFilter1D::ConvolutionFixed* filt
     er_values, |  | 
|  432                              int filter_length, |  | 
|  433                              unsigned char* const* source_data_rows, |  | 
|  434                              int pixel_width, |  | 
|  435                              unsigned char* out_row, |  | 
|  436                              bool has_alpha) { |  | 
|  437     if (has_alpha) { |  | 
|  438         convolveVertically_SSE2<true>(filter_values, |  | 
|  439                                       filter_length, |  | 
|  440                                       source_data_rows, |  | 
|  441                                       pixel_width, |  | 
|  442                                       out_row); |  | 
|  443     } else { |  | 
|  444         convolveVertically_SSE2<false>(filter_values, |  | 
|  445                                        filter_length, |  | 
|  446                                        source_data_rows, |  | 
|  447                                        pixel_width, |  | 
|  448                                        out_row); |  | 
|  449     } |  | 
|  450 } |  | 
| OLD | NEW |