| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 Google Inc. | 2 * Copyright 2013 Google Inc. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
| 5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
| 6 */ | 6 */ |
| 7 | 7 |
| 8 #include <emmintrin.h> | 8 #include <emmintrin.h> |
| 9 #include "SkBitmap.h" | 9 #include "SkBitmap.h" |
| 10 #include "SkBitmapFilter_opts_SSE2.h" | 10 #include "SkBitmapFilter_opts_SSE2.h" |
| (...skipping 156 matching lines...) |
| 167 out_row += 4; | 167 out_row += 4; |
| 168 } | 168 } |
| 169 } | 169 } |
| 170 | 170 |
| 171 // Convolves horizontally along four rows. The row data is given in | 171 // Convolves horizontally along four rows. The row data is given in |
| 172 // |src_data| and continues for the num_values() of the filter. | 172 // |src_data| and continues for the num_values() of the filter. |
| 173 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please | 173 // The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please |
| 174 // refer to that function for detailed comments. | 174 // refer to that function for detailed comments. |
| 175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], | 175 void convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4], |
| 176 const SkConvolutionFilter1D& filter, | 176 const SkConvolutionFilter1D& filter, |
| 177 unsigned char* out_row[4]) { | 177 unsigned char* out_row[4], |
| 178 size_t outRowBytes) { |
| 179 SkDEBUGCODE(const unsigned char* out_row_0_start = out_row[0];) |
| 180 |
| 178 int num_values = filter.numValues(); | 181 int num_values = filter.numValues(); |
| 179 | 182 |
| 180 int filter_offset, filter_length; | 183 int filter_offset, filter_length; |
| 181 __m128i zero = _mm_setzero_si128(); | 184 __m128i zero = _mm_setzero_si128(); |
| 182 __m128i mask[4]; | 185 __m128i mask[4]; |
| 183 // |mask| will be used to decimate all extra filter coefficients that are | 186 // |mask| will be used to decimate all extra filter coefficients that are |
| 184 // loaded by SIMD when |filter_length| is not divisible by 4. | 187 // loaded by SIMD when |filter_length| is not divisible by 4. |
| 185 // mask[0] is not used in the following algorithm. | 188 // mask[0] is not used in the following algorithm. |
| 186 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); | 189 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1); |
| 187 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); | 190 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1); |
| (...skipping 80 matching lines...) |
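Note on the mask setup above (mask[1]..mask[3]): it handles the tail of a filter whose length is not a multiple of 4. The SIMD coefficient loads pull in more taps than actually remain, and the mask zeroes the extra ones so they contribute nothing to the accumulators. A minimal sketch of that remainder handling, with illustrative (non-Skia) names:

    #include <emmintrin.h>
    #include <stdint.h>

    // Illustrative helper, not part of Skia: keep only the first |r| (1..3)
    // valid 16-bit taps of a full 16-byte coefficient load.
    static __m128i loadRemainderCoeffs(const int16_t* coeffs, int r,
                                       const __m128i mask[4]) {
        // Reads 8 int16 coefficients even though only r are valid; the zero
        // padding appended by applySIMDPadding_SSE2 keeps the read in bounds.
        __m128i c = _mm_loadu_si128(reinterpret_cast<const __m128i*>(coeffs));
        // Invalid taps become 0, so the following multiply-adds ignore them.
        return _mm_and_si128(c, mask[r]);
    }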
| 268 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); | 271 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); |
| 269 accum1 = _mm_packs_epi32(accum1, zero); | 272 accum1 = _mm_packs_epi32(accum1, zero); |
| 270 accum1 = _mm_packus_epi16(accum1, zero); | 273 accum1 = _mm_packus_epi16(accum1, zero); |
| 271 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); | 274 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); |
| 272 accum2 = _mm_packs_epi32(accum2, zero); | 275 accum2 = _mm_packs_epi32(accum2, zero); |
| 273 accum2 = _mm_packus_epi16(accum2, zero); | 276 accum2 = _mm_packus_epi16(accum2, zero); |
| 274 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); | 277 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); |
| 275 accum3 = _mm_packs_epi32(accum3, zero); | 278 accum3 = _mm_packs_epi32(accum3, zero); |
| 276 accum3 = _mm_packus_epi16(accum3, zero); | 279 accum3 = _mm_packus_epi16(accum3, zero); |
| 277 | 280 |
| 281 // We seem to be running off the edge here (chromium:491660). |
| 282 SkASSERT(((size_t)out_row[0] - (size_t)out_row_0_start) < outRowBytes); |
| 283 |
| 278 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); | 284 *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0); |
| 279 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); | 285 *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1); |
| 280 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); | 286 *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2); |
| 281 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); | 287 *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3); |
| 282 | 288 |
| 283 out_row[0] += 4; | 289 out_row[0] += 4; |
| 284 out_row[1] += 4; | 290 out_row[1] += 4; |
| 285 out_row[2] += 4; | 291 out_row[2] += 4; |
| 286 out_row[3] += 4; | 292 out_row[3] += 4; |
| 287 } | 293 } |
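Each pass through the loop above converts the four 32-bit fixed-point accumulators of one output pixel per row back to 8-bit channels: an arithmetic right shift by kShiftBits, a signed pack to 16 bits, and an unsigned pack to 8 bits (together clamping to 0..255), followed by a 4-byte store through out_row[n]. The new SkDEBUGCODE/SkASSERT pair checks that the store for row 0 has not walked more than outRowBytes past the start of that row, which is the overrun being chased in chromium:491660. A rough scalar equivalent for a single row, assuming kShiftBits as defined on SkConvolutionFilter1D; names are illustrative, not Skia code:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // Scalar sketch of the loop tail for one row.
    static void storeOnePixel(const int32_t accum[4],   // B, G, R, A accumulators
                              uint8_t*& out,            // current write position
                              const uint8_t* outStart,  // start of this row
                              size_t outRowBytes,
                              int shiftBits) {
        // Mirrors the new SkASSERT: the write offset must still be inside the row.
        assert(static_cast<size_t>(out - outStart) < outRowBytes);
        for (int c = 0; c < 4; ++c) {
            int32_t v = accum[c] >> shiftBits;   // _mm_srai_epi32: drop fixed-point bits
            if (v < 0)   v = 0;                  // _mm_packs_epi32 + _mm_packus_epi16
            if (v > 255) v = 255;                //   net effect: clamp to 0..255
            out[c] = static_cast<uint8_t>(v);
        }
        out += 4;                                // advance to the next output pixel
    }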
| (...skipping 197 matching lines...) |
| 485 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { | 491 void applySIMDPadding_SSE2(SkConvolutionFilter1D *filter) { |
| 486 // Pad |paddingCount| more dummy coefficients after the coefficients of the | 492 // Pad |paddingCount| more dummy coefficients after the coefficients of the |
| 487 // last filter to prevent SIMD instructions, which load 8 or 16 bytes at a | 493 // last filter to prevent SIMD instructions, which load 8 or 16 bytes at a |
| 488 // time, from accessing invalid memory. We are not trying to align the | 494 // time, from accessing invalid memory. We are not trying to align the |
| 489 // coefficients right now due to the opaqueness of the <vector> implementation. | 495 // coefficients right now due to the opaqueness of the <vector> implementation. |
| 490 // This has to be done after all |AddFilter| calls. | 496 // This has to be done after all |AddFilter| calls. |
| 491 for (int i = 0; i < 8; ++i) { | 497 for (int i = 0; i < 8; ++i) { |
| 492 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); | 498 filter->addFilterValue(static_cast<SkConvolutionFilter1D::ConvolutionFixed>(0)); |
| 493 } | 499 } |
| 494 } | 500 } |
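applySIMDPadding_SSE2 exists because the horizontal paths above fetch coefficients with unaligned 16-byte loads, so a load that starts at the last real tap of the last filter can read up to 7 coefficients past the end of the stored data. Appending 8 zero coefficients keeps those reads inside valid, zero-valued storage. A small sketch of the worst case, assuming ConvolutionFixed is Skia's 16-bit fixed-point coefficient type (names here are hypothetical):

    #include <emmintrin.h>
    #include <stdint.h>

    typedef int16_t ConvolutionFixed;   // assumption: 16-bit fixed point, as in Skia

    // Worst case the padding guards against: only one real tap remains, but the
    // vector load still pulls in 16 bytes (8 coefficients):
    //
    //   [last tap][pad0][pad1][pad2][pad3][pad4][pad5][pad6]
    //
    // Without the 8 trailing zeros added after the final filter, the 7 extra
    // values would be read from past the end of the coefficient storage.
    static __m128i loadTail(const ConvolutionFixed* lastTap) {
        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(lastTap));
    }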