skia/ext/convolver_SSE2.cc - Issue 13293004: enable SSE2 in skia/convolver for linux32

Side by Side Diff: skia/ext/convolver_SSE2.cc

Issue 13293004: enable SSE2 in skia/convolver for linux32 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: forgot to include convolver_SSE2.h in convolver_SSE2.cc Created 7 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include <algorithm>	5 #include <algorithm>

6	6

7 #include "skia/ext/convolver.h"	7 #include "skia/ext/convolver.h"

	8 #include "skia/ext/convolver_SSE2.h"

8 #include "third_party/skia/include/core/SkTypes.h"	9 #include "third_party/skia/include/core/SkTypes.h"

9	10

10 #if defined(SIMD_SSE2)

11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h	11 #include <emmintrin.h> // ARCH_CPU_X86_FAMILY was defined in build/config.h

12 #endif

13	12

14 namespace skia {	13 namespace skia {

15	14

16 namespace {

17

18 // Converts the argument to an 8-bit unsigned value by clamping to the range

19 // 0-255.

20 inline unsigned char ClampTo8(int a) {

21 if (static_cast<unsigned>(a) < 256)

22 return a; // Avoid the extra check in the common case.

23 if (a < 0)

24 return 0;

25 return 255;

26 }

27

28 // Stores a list of rows in a circular buffer. The usage is you write into it

29 // by calling AdvanceRow. It will keep track of which row in the buffer it

30 // should use next, and the total number of rows added.

31 class CircularRowBuffer {

32 public:

33 // The number of pixels in each row is given in \|dest_row_pixel_width\|.

34 // The maximum number of rows needed in the buffer is \|max_y_filter_size\|

35 // (we only need to store enough rows for the biggest filter).

36 //

37 // We use the \|first_input_row\| to compute the coordinates of all of the

38 // following rows returned by AdvanceRow().

39 CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,

40 int first_input_row)

41 : row_byte_width_(dest_row_pixel_width * 4),

42 num_rows_(max_y_filter_size),

43 next_row_(0),

44 next_row_coordinate_(first_input_row) {

45 buffer_.resize(row_byte_width_ * max_y_filter_size);

46 row_addresses_.resize(num_rows_);

47 }

48

49 // Moves to the next row in the buffer, returning a pointer to the beginning

50 // of it.

51 unsigned char* AdvanceRow() {

52 unsigned char* row = &buffer_[next_row_ * row_byte_width_];

53 next_row_coordinate_++;

54

55 // Set the pointer to the next row to use, wrapping around if necessary.

56 next_row_++;

57 if (next_row_ == num_rows_)

58 next_row_ = 0;

59 return row;

60 }

61

62 // Returns a pointer to an "unrolled" array of rows. These rows will start

63 // at the y coordinate placed into \|*first_row_index\| and will continue in

64 // order for the maximum number of rows in this circular buffer.

65 //

66 // The \|*first_row_index\| may be negative. This means the circular buffer

67 // starts before the top of the image (it hasn't been filled yet).

68 unsigned char* const* GetRowAddresses(int* first_row_index) {

69 // Example for a 4-element circular buffer holding coords 6-9.

70 // Row 0 Coord 8

71 // Row 1 Coord 9

72 // Row 2 Coord 6 <- next_row_ = 2, next_row_coordinate_ = 10.

73 // Row 3 Coord 7

74 //

75 // The "next" row is also the first (lowest) coordinate. This computation

76 // may yield a negative value, but that's OK, the math will work out

77 // since the user of this buffer will compute the offset relative

78 // to the first_row_index and the negative rows will never be used.

79 *first_row_index = next_row_coordinate_ - num_rows_;

80

81 int cur_row = next_row_;

82 for (int i = 0; i < num_rows_; i++) {

83 row_addresses_[i] = &buffer_[cur_row * row_byte_width_];

84

85 // Advance to the next row, wrapping if necessary.

86 cur_row++;

87 if (cur_row == num_rows_)

88 cur_row = 0;

89 }

90 return &row_addresses_[0];

91 }

92

93 private:

94 // The buffer storing the rows. They are packed, each one row_byte_width_.

95 std::vector<unsigned char> buffer_;

96

97 // Number of bytes per row in the \|buffer_\|.

98 int row_byte_width_;

99

100 // The number of rows available in the buffer.

101 int num_rows_;

102

103 // The next row index we should write into. This wraps around as the

104 // circular buffer is used.

105 int next_row_;

106

107 // The y coordinate of the \|next_row_\|. This is incremented each time a

108 // new row is appended and does not wrap.

109 int next_row_coordinate_;

110

111 // Buffer used by GetRowAddresses().

112 std::vector<unsigned char*> row_addresses_;

113 };

114

115 // Convolves horizontally along a single row. The row data is given in

116 // \|src_data\| and continues for the num_values() of the filter.

117 template<bool has_alpha>

118 void ConvolveHorizontally(const unsigned char* src_data,

119 const ConvolutionFilter1D& filter,

120 unsigned char* out_row) {

121 // Loop over each pixel on this row in the output image.

122 int num_values = filter.num_values();

123 for (int out_x = 0; out_x < num_values; out_x++) {

124 // Get the filter that determines the current output pixel.

125 int filter_offset, filter_length;

126 const ConvolutionFilter1D::Fixed* filter_values =

127 filter.FilterForValue(out_x, &filter_offset, &filter_length);

128

129 // Compute the first pixel in this row that the filter affects. It will

130 // touch \|filter_length\| pixels (4 bytes each) after this.

131 const unsigned char* row_to_filter = &src_data[filter_offset * 4];

132

133 // Apply the filter to the row to get the destination pixel in \|accum\|.

134 int accum[4] = {0};

135 for (int filter_x = 0; filter_x < filter_length; filter_x++) {

136 ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];

137 accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];

138 accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];

139 accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];

140 if (has_alpha)

141 accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];

142 }

143

144 // Bring this value back in range. All of the filter scaling factors

145 // are in fixed point with kShiftBits bits of fractional part.

146 accum[0] >>= ConvolutionFilter1D::kShiftBits;

147 accum[1] >>= ConvolutionFilter1D::kShiftBits;

148 accum[2] >>= ConvolutionFilter1D::kShiftBits;

149 if (has_alpha)

150 accum[3] >>= ConvolutionFilter1D::kShiftBits;

151

152 // Store the new pixel.

153 out_row[out_x * 4 + 0] = ClampTo8(accum[0]);

154 out_row[out_x * 4 + 1] = ClampTo8(accum[1]);

155 out_row[out_x * 4 + 2] = ClampTo8(accum[2]);

156 if (has_alpha)

157 out_row[out_x * 4 + 3] = ClampTo8(accum[3]);

158 }

159 }

160

161 // Does vertical convolution to produce one output row. The filter values and

162 // length are given in the first two parameters. These are applied to each

163 // of the rows pointed to in the \|source_data_rows\| array, with each row

164 // being \|pixel_width\| wide.

165 //

166 // The output must have room for \|pixel_width * 4\| bytes.

167 template<bool has_alpha>

168 void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,

169 int filter_length,

170 unsigned char* const* source_data_rows,

171 int pixel_width,

172 unsigned char* out_row) {

173 // We go through each column in the output and do a vertical convolution,

174 // generating one output pixel each time.

175 for (int out_x = 0; out_x < pixel_width; out_x++) {

176 // Compute the number of bytes over in each row that the current column

177 // we're convolving starts at. The pixel will cover the next 4 bytes.

178 int byte_offset = out_x * 4;

179

180 // Apply the filter to one column of pixels.

181 int accum[4] = {0};

182 for (int filter_y = 0; filter_y < filter_length; filter_y++) {

183 ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];

184 accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];

185 accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];

186 accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];

187 if (has_alpha)

188 accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];

189 }

190

191 // Bring this value back in range. All of the filter scaling factors

192 // are in fixed point with kShiftBits bits of precision.

193 accum[0] >>= ConvolutionFilter1D::kShiftBits;

194 accum[1] >>= ConvolutionFilter1D::kShiftBits;

195 accum[2] >>= ConvolutionFilter1D::kShiftBits;

196 if (has_alpha)

197 accum[3] >>= ConvolutionFilter1D::kShiftBits;

198

199 // Store the new pixel.

200 out_row[byte_offset + 0] = ClampTo8(accum[0]);

201 out_row[byte_offset + 1] = ClampTo8(accum[1]);

202 out_row[byte_offset + 2] = ClampTo8(accum[2]);

203 if (has_alpha) {

204 unsigned char alpha = ClampTo8(accum[3]);

205

206 // Make sure the alpha channel doesn't come out smaller than any of the

207 // color channels. We use premultiplied alpha channels, so this should

208 // never happen, but rounding errors will cause this from time to time.

209 // These "impossible" colors will cause overflows (and hence random pixel

210 // values) when the resulting bitmap is drawn to the screen.

211 //

212 // We only need to do this when generating the final output row (here).

213 int max_color_channel = std::max(out_row[byte_offset + 0],

214 std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));

215 if (alpha < max_color_channel)

216 out_row[byte_offset + 3] = max_color_channel;

217 else

218 out_row[byte_offset + 3] = alpha;

219 } else {

220 // No alpha channel, the image is opaque.

221 out_row[byte_offset + 3] = 0xff;

222 }

223 }

224 }

225

226

227 // Convolves horizontally along a single row. The row data is given in	15 // Convolves horizontally along a single row. The row data is given in

228 // \|src_data\| and continues for the num_values() of the filter.	16 // \|src_data\| and continues for the num_values() of the filter.

229 void ConvolveHorizontally_SSE2(const unsigned char* src_data,	17 void ConvolveHorizontally_SSE2(const unsigned char* src_data,

230 const ConvolutionFilter1D& filter,	18 const ConvolutionFilter1D& filter,

231 unsigned char* out_row) {	19 unsigned char* out_row) {

232 #if defined(SIMD_SSE2)

233 int num_values = filter.num_values();	20 int num_values = filter.num_values();

234	21

235 int filter_offset, filter_length;	22 int filter_offset, filter_length;

236 __m128i zero = _mm_setzero_si128();	23 __m128i zero = _mm_setzero_si128();

237 __m128i mask[4];	24 __m128i mask[4];

238 // \|mask\| will be used to decimate all extra filter coefficients that are	25 // \|mask\| will be used to decimate all extra filter coefficients that are

239 // loaded by SIMD when \|filter_length\| is not divisible by 4.	26 // loaded by SIMD when \|filter_length\| is not divisible by 4.

240 // mask[0] is not used in following algorithm.	27 // mask[0] is not used in following algorithm.

241 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);	28 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

242 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);	29 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

(...skipping 100 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
343	130

344 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).	131 // Packing 32 bits \|accum\| to 16 bits per channel (signed saturation).

345 accum = _mm_packs_epi32(accum, zero);	132 accum = _mm_packs_epi32(accum, zero);

346 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).	133 // Packing 16 bits \|accum\| to 8 bits per channel (unsigned saturation).

347 accum = _mm_packus_epi16(accum, zero);	134 accum = _mm_packus_epi16(accum, zero);

348	135

349 // Store the pixel value of 32 bits.	136 // Store the pixel value of 32 bits.

350 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);	137 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum);

351 out_row += 4;	138 out_row += 4;

352 }	139 }

353 #endif

354 }	140 }

355	141

356 // Convolves horizontally along four rows. The row data is given in	142 // Convolves horizontally along four rows. The row data is given in

357 // \|src_data\| and continues for the num_values() of the filter.	143 // \|src_data\| and continues for the num_values() of the filter.

358 // The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please	144 // The algorithm is almost same as \|ConvolveHorizontally_SSE2\|. Please

359 // refer to that function for detailed comments.	145 // refer to that function for detailed comments.

360 void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],	146 void Convolve4RowsHorizontally_SSE2(const unsigned char* src_data[4],

361 const ConvolutionFilter1D& filter,	147 const ConvolutionFilter1D& filter,

362 unsigned char* out_row[4]) {	148 unsigned char* out_row[4]) {

363 #if defined(SIMD_SSE2)

364 int num_values = filter.num_values();	149 int num_values = filter.num_values();

365	150

366 int filter_offset, filter_length;	151 int filter_offset, filter_length;

367 __m128i zero = _mm_setzero_si128();	152 __m128i zero = _mm_setzero_si128();

368 __m128i mask[4];	153 __m128i mask[4];

369 // \|mask\| will be used to decimate all extra filter coefficients that are	154 // \|mask\| will be used to decimate all extra filter coefficients that are

370 // loaded by SIMD when \|filter_length\| is not divisible by 4.	155 // loaded by SIMD when \|filter_length\| is not divisible by 4.

371 // mask[0] is not used in following algorithm.	156 // mask[0] is not used in following algorithm.

372 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);	157 mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);

373 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);	158 mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);

(...skipping 90 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
464 (reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);	249 (reinterpret_cast<int>(out_row[0])) = _mm_cvtsi128_si32(accum0);

465 (reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);	250 (reinterpret_cast<int>(out_row[1])) = _mm_cvtsi128_si32(accum1);

466 (reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);	251 (reinterpret_cast<int>(out_row[2])) = _mm_cvtsi128_si32(accum2);

467 (reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);	252 (reinterpret_cast<int>(out_row[3])) = _mm_cvtsi128_si32(accum3);

468	253

469 out_row[0] += 4;	254 out_row[0] += 4;

470 out_row[1] += 4;	255 out_row[1] += 4;

471 out_row[2] += 4;	256 out_row[2] += 4;

472 out_row[3] += 4;	257 out_row[3] += 4;

473 }	258 }

474 #endif

475 }	259 }

476	260

477 // Does vertical convolution to produce one output row. The filter values and	261 // Does vertical convolution to produce one output row. The filter values and

478 // length are given in the first two parameters. These are applied to each	262 // length are given in the first two parameters. These are applied to each

479 // of the rows pointed to in the \|source_data_rows\| array, with each row	263 // of the rows pointed to in the \|source_data_rows\| array, with each row

480 // being \|pixel_width\| wide.	264 // being \|pixel_width\| wide.

481 //	265 //

482 // The output must have room for \|pixel_width * 4\| bytes.	266 // The output must have room for \|pixel_width * 4\| bytes.

483 template<bool has_alpha>	267 template<bool has_alpha>

484 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,	268 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,

485 int filter_length,	269 int filter_length,

486 unsigned char* const* source_data_rows,	270 unsigned char* const* source_data_rows,

487 int pixel_width,	271 int pixel_width,

488 unsigned char* out_row) {	272 unsigned char* out_row) {

489 #if defined(SIMD_SSE2)

490 int width = pixel_width & ~3;	273 int width = pixel_width & ~3;

491	274

492 __m128i zero = _mm_setzero_si128();	275 __m128i zero = _mm_setzero_si128();

493 __m128i accum0, accum1, accum2, accum3, coeff16;	276 __m128i accum0, accum1, accum2, accum3, coeff16;

494 const __m128i* src;	277 const __m128i* src;

495 // Output four pixels per iteration (16 bytes).	278 // Output four pixels per iteration (16 bytes).

496 for (int out_x = 0; out_x < width; out_x += 4) {	279 for (int out_x = 0; out_x < width; out_x += 4) {

497	280

498 // Accumulated result for each pixel. 32 bits per RGBA channel.	281 // Accumulated result for each pixel. 32 bits per RGBA channel.

499 accum0 = _mm_setzero_si128();	282 accum0 = _mm_setzero_si128();

(...skipping 140 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
640 __m128i mask = _mm_set1_epi32(0xff000000);	423 __m128i mask = _mm_set1_epi32(0xff000000);

641 accum0 = _mm_or_si128(accum0, mask);	424 accum0 = _mm_or_si128(accum0, mask);

642 }	425 }

643	426

644 for (int out_x = width; out_x < pixel_width; out_x++) {	427 for (int out_x = width; out_x < pixel_width; out_x++) {

645 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);	428 (reinterpret_cast<int>(out_row)) = _mm_cvtsi128_si32(accum0);

646 accum0 = _mm_srli_si128(accum0, 4);	429 accum0 = _mm_srli_si128(accum0, 4);

647 out_row += 4;	430 out_row += 4;

648 }	431 }

649 }	432 }

650 #endif

651 }

652

653 } // namespace

654

655 // ConvolutionFilter1D ---------------------------------------------------------

656

657 ConvolutionFilter1D::ConvolutionFilter1D()

658 : max_filter_(0) {

659 }

660

661 ConvolutionFilter1D::~ConvolutionFilter1D() {

662 }

663

664 void ConvolutionFilter1D::AddFilter(int filter_offset,

665 const float* filter_values,

666 int filter_length) {

667 SkASSERT(filter_length > 0);

668

669 std::vector<Fixed> fixed_values;

670 fixed_values.reserve(filter_length);

671

672 for (int i = 0; i < filter_length; ++i)

673 fixed_values.push_back(FloatToFixed(filter_values[i]));

674

675 AddFilter(filter_offset, &fixed_values[0], filter_length);

676 }

677

678 void ConvolutionFilter1D::AddFilter(int filter_offset,

679 const Fixed* filter_values,

680 int filter_length) {

681 // It is common for leading/trailing filter values to be zeros. In such

682 // cases it is beneficial to only store the central factors.

683 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on

684 // a 1080p image this optimization gives a ~10% speed improvement.

685 int first_non_zero = 0;

686 while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)

687 first_non_zero++;

688

689 if (first_non_zero < filter_length) {

690 // Here we have at least one non-zero factor.

691 int last_non_zero = filter_length - 1;

692 while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)

693 last_non_zero--;

694

695 filter_offset += first_non_zero;

696 filter_length = last_non_zero + 1 - first_non_zero;

697 SkASSERT(filter_length > 0);

698

699 for (int i = first_non_zero; i <= last_non_zero; i++)

700 filter_values_.push_back(filter_values[i]);

701 } else {

702 // Here all the factors were zeroes.

703 filter_length = 0;

704 }

705

706 FilterInstance instance;

707

708 // We pushed filter_length elements onto filter_values_

709 instance.data_location = (static_cast<int>(filter_values_.size()) -

710 filter_length);

711 instance.offset = filter_offset;

712 instance.length = filter_length;

713 filters_.push_back(instance);

714

715 max_filter_ = std::max(max_filter_, filter_length);

716 }

717

718 void BGRAConvolve2D(const unsigned char* source_data,

719 int source_byte_row_stride,

720 bool source_has_alpha,

721 const ConvolutionFilter1D& filter_x,

722 const ConvolutionFilter1D& filter_y,

723 int output_byte_row_stride,

724 unsigned char* output,

725 bool use_sse2) {

726 #if !defined(SIMD_SSE2)

727 // Even if we have runtime support for SSE2 instructions, since the binary

728 // was not built with SSE2 support, we have to fall back to the C version.

729 use_sse2 = false;

730 #endif

731

732 int max_y_filter_size = filter_y.max_filter();

733

734 // The next row in the input that we will generate a horizontally

735 // convolved row for. If the filter doesn't start at the beginning of the

736 // image (this is the case when we are only resizing a subset), then we

737 // don't want to generate any output rows before that. Compute the starting

738 // row for convolution as the first pixel for the first vertical filter.

739 int filter_offset, filter_length;

740 const ConvolutionFilter1D::Fixed* filter_values =

741 filter_y.FilterForValue(0, &filter_offset, &filter_length);

742 int next_x_row = filter_offset;

743

744 // We loop over each row in the input doing a horizontal convolution. This

745 // will result in a horizontally convolved image. We write the results into

746 // a circular buffer of convolved rows and do vertical convolution as rows

747 // are available. This prevents us from having to store the entire

748 // intermediate image and helps cache coherency.

749 // We need four extra rows so that four horizontal convolutions can be done

750 // simultaneously. We also pad each row in the row buffer so that it is aligned

751 // up to 16 bytes.

752 // TODO(jiesun): We do not use aligned load from row buffer in vertical

753 // convolution pass yet. Somehow Windows does not like it.

754 int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;

755 int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);

756 CircularRowBuffer row_buffer(row_buffer_width,

757 row_buffer_height,

758 filter_offset);

759

760 // Loop over every possible output row, processing just enough horizontal

761 // convolutions to run each subsequent vertical convolution.

762 SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);

763 int num_output_rows = filter_y.num_values();

764

765 // We need to check which is the last line to convolve before we advance 4

766 // lines in one iteration.

767 int last_filter_offset, last_filter_length;

768

769 // SSE2 can access up to 3 extra pixels past the end of the

770 // buffer. At the bottom of the image, we have to be careful

771 // not to access data past the end of the buffer. Normally

772 // we fall back to the C++ implementation for the last row.

773 // If the last row is less than 3 pixels wide, we may have to fall

774 // back to the C++ version for more rows. Compute how many

775 // rows we need to avoid the SSE implementation for here.

776 filter_x.FilterForValue(filter_x.num_values() - 1, &last_filter_offset,

777 &last_filter_length);

778 int avoid_sse_rows = 1 + 3/(last_filter_offset + last_filter_length);

779

780 filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,

781 &last_filter_length);

782

783 for (int out_y = 0; out_y < num_output_rows; out_y++) {

784 filter_values = filter_y.FilterForValue(out_y,

785 &filter_offset, &filter_length);

786

787 // Generate output rows until we have enough to run the current filter.

788 if (use_sse2) {

789 while (next_x_row < filter_offset + filter_length) {

790 if (next_x_row + 3 < last_filter_offset + last_filter_length -

791 avoid_sse_rows) {

792 const unsigned char* src[4];

793 unsigned char* out_row[4];

794 for (int i = 0; i < 4; ++i) {

795 src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];

796 out_row[i] = row_buffer.AdvanceRow();

797 }

798 ConvolveHorizontally4_SSE2(src, filter_x, out_row);

799 next_x_row += 4;

800 } else {

801 // Check if we need to avoid SSE2 for this row.

802 if (next_x_row >= last_filter_offset + last_filter_length -

803 avoid_sse_rows) {

804 if (source_has_alpha) {

805 ConvolveHorizontally<true>(

806 &source_data[next_x_row * source_byte_row_stride],

807 filter_x, row_buffer.AdvanceRow());

808 } else {

809 ConvolveHorizontally<false>(

810 &source_data[next_x_row * source_byte_row_stride],

811 filter_x, row_buffer.AdvanceRow());

812 }

813 } else {

814 ConvolveHorizontally_SSE2(

815 &source_data[next_x_row * source_byte_row_stride],

816 filter_x, row_buffer.AdvanceRow());

817 }

818 next_x_row++;

819 }

820 }

821 } else {

822 while (next_x_row < filter_offset + filter_length) {

823 if (source_has_alpha) {

824 ConvolveHorizontally<true>(

825 &source_data[next_x_row * source_byte_row_stride],

826 filter_x, row_buffer.AdvanceRow());

827 } else {

828 ConvolveHorizontally<false>(

829 &source_data[next_x_row * source_byte_row_stride],

830 filter_x, row_buffer.AdvanceRow());

831 }

832 next_x_row++;

833 }

834 }

835

836 // Compute where in the output image this row of final data will go.

837 unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

838

839 // Get the list of rows that the circular buffer has, in order.

840 int first_row_in_circular_buffer;

841 unsigned char* const* rows_to_convolve =

842 row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

843

844 // Now compute the start of the subset of those rows that the filter

845 // needs.

846 unsigned char* const* first_row_for_filter =

847 &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

848

849 if (source_has_alpha) {

850 if (use_sse2) {

851 ConvolveVertically_SSE2<true>(filter_values, filter_length,

852 first_row_for_filter,

853 filter_x.num_values(), cur_output_row);

854 } else {

855 ConvolveVertically<true>(filter_values, filter_length,

856 first_row_for_filter,

857 filter_x.num_values(), cur_output_row);

858 }

859 } else {

860 if (use_sse2) {

861 ConvolveVertically_SSE2<false>(filter_values, filter_length,

862 first_row_for_filter,

863 filter_x.num_values(), cur_output_row);

864 } else {

865 ConvolveVertically<false>(filter_values, filter_length,

866 first_row_for_filter,

867 filter_x.num_values(), cur_output_row);

868 }

869 }

870 }

871 }	433 }

872	434

	435 void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
	Stephen White 2013/04/05 08:40:47 It might be a good idea to add a similar wrapper for the non-SSE2 flavour as well. It would clean up the call site a bit to hide the template parameters in the same way, as well as making it more symmetrical with the SSE2 path. hubbe 2013/04/06 20:45:46 Done.
	436 int filter_length,

	437 unsigned char* const* source_data_rows,

	438 int pixel_width,

	439 unsigned char* out_row,

	440 bool has_alpha) {

	441 if (has_alpha) {

	442 ConvolveVertically_SSE2<true>(filter_values,

	443 filter_length,

	444 source_data_rows,

	445 pixel_width,

	446 out_row);

	447 } else {

	448 ConvolveVertically_SSE2<false>(filter_values,

	449 filter_length,

	450 source_data_rows,

	451 pixel_width,

	452 out_row);

	453 }

	454 }

	455

873 } // namespace skia	456 } // namespace skia

OLD	NEW

« no previous file with comments | « skia/ext/convolver_SSE2.h ('k') | skia/ext/convolver_unittest.cc » ('j') | skia/ext/convolver_unittest.cc » ('J')