media/base/sinc_resampler.cc - Issue 18566009: Optimize loop condition for SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 18566009: Optimize loop condition for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Cleanup. Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_	5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_

6 // and r4_ will move after the first load):	6 // and r4_ will move after the first load):

7 //	7 //

8 // |----------------|-----------------------------------------|----------------|	8 // |----------------|-----------------------------------------|----------------|

9 //	9 //

10 // request_frames_	10 // request_frames_

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
131 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	131 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

132 #define CONVOLVE_FUNC Convolve_NEON	132 #define CONVOLVE_FUNC Convolve_NEON

133 void SincResampler::InitializeCPUSpecificFeatures() {}	133 void SincResampler::InitializeCPUSpecificFeatures() {}

134 #else	134 #else

135 // Unknown architecture.	135 // Unknown architecture.

136 #define CONVOLVE_FUNC Convolve_C	136 #define CONVOLVE_FUNC Convolve_C

137 void SincResampler::InitializeCPUSpecificFeatures() {}	137 void SincResampler::InitializeCPUSpecificFeatures() {}

138 #endif	138 #endif

139	139

140 SincResampler::SincResampler(double io_sample_rate_ratio,	140 SincResampler::SincResampler(double io_sample_rate_ratio,

141 size_t request_frames,	141 int request_frames,

142 const ReadCB& read_cb)	142 const ReadCB& read_cb)

143 : io_sample_rate_ratio_(io_sample_rate_ratio),	143 : io_sample_rate_ratio_(io_sample_rate_ratio),

144 read_cb_(read_cb),	144 read_cb_(read_cb),

145 request_frames_(request_frames),	145 request_frames_(request_frames),

146 input_buffer_size_(request_frames_ + kKernelSize),	146 input_buffer_size_(request_frames_ + kKernelSize),

147 // Create input buffers with a 16-byte alignment for SSE optimizations.	147 // Create input buffers with a 16-byte alignment for SSE optimizations.

148 kernel_storage_(static_cast<float*>(	148 kernel_storage_(static_cast<float*>(

149 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	149 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

150 kernel_pre_sinc_storage_(static_cast<float*>(	150 kernel_pre_sinc_storage_(static_cast<float*>(

151 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	151 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

152 kernel_window_storage_(static_cast<float*>(	152 kernel_window_storage_(static_cast<float*>(

153 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	153 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

154 input_buffer_(static_cast<float*>(	154 input_buffer_(static_cast<float*>(

155 base::AlignedAlloc(sizeof(float) * input_buffer_size_, 16))),	155 base::AlignedAlloc(sizeof(float) * input_buffer_size_, 16))),

156 r1_(input_buffer_.get()),	156 r1_(input_buffer_.get()),

157 r2_(input_buffer_.get() + kKernelSize / 2) {	157 r2_(input_buffer_.get() + kKernelSize / 2) {

	158 CHECK_GT(request_frames_, 0);

158 Flush();	159 Flush();

159 CHECK_GT(block_size_, static_cast<size_t>(kKernelSize))	160 CHECK_GT(block_size_, kKernelSize)

160 << "block_size must be greater than kKernelSize!";	161 << "block_size must be greater than kKernelSize!";

161	162

162 memset(kernel_storage_.get(), 0,	163 memset(kernel_storage_.get(), 0,

163 sizeof(*kernel_storage_.get()) * kKernelStorageSize);	164 sizeof(*kernel_storage_.get()) * kKernelStorageSize);

164 memset(kernel_pre_sinc_storage_.get(), 0,	165 memset(kernel_pre_sinc_storage_.get(), 0,

165 sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);	166 sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);

166 memset(kernel_window_storage_.get(), 0,	167 memset(kernel_window_storage_.get(), 0,

167 sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);	168 sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);

168	169

169 InitializeKernel();	170 InitializeKernel();

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
248 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;	249 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;

249 }	250 }

250 }	251 }

251 }	252 }

252 }	253 }

253	254

254 void SincResampler::Resample(int frames, float* destination) {	255 void SincResampler::Resample(int frames, float* destination) {

255 int remaining_frames = frames;	256 int remaining_frames = frames;

256	257

257 // Step (1) -- Prime the input buffer at the start of the input stream.	258 // Step (1) -- Prime the input buffer at the start of the input stream.

258 if (!buffer_primed_) {	259 if (!buffer_primed_ && remaining_frames) {

259 read_cb_.Run(request_frames_, r0_);	260 read_cb_.Run(request_frames_, r0_);

260 buffer_primed_ = true;	261 buffer_primed_ = true;

261 }	262 }

262	263

263 // Step (2) -- Resample!	264 // Step (2) -- Resample!

264 while (remaining_frames) {	265 while (remaining_frames) {

	266 // Avoid comparing a double to an int on most platforms. Provides a 3% to

	267 // 20% increase in some cases. http://llvm.org/bugs/show_bug.cgi?id=16578

	268 #if defined(USE_NEON) \|\| defined(__clang__)

265 while (virtual_source_idx_ < block_size_) {	269 while (virtual_source_idx_ < block_size_) {
	Ami GONE FROM CHROMIUM 2013/07/09 23:20:08
	what happens if you try: while ((const int source_idx = virtual_source_idx) < block_size_) on the various platforms?
	DaleCurtis 2013/07/10 21:06:31
	On 2013/07/09 23:20:08, Ami Fischman wrote:
	> what happens if you try:
	> while ((const int source_idx = virtual_source_idx) < block_size_)
	> on the various platforms?
	As discussed offline, this doesn't compile, and moving the decl outside of the while shows the same performance hit seen elsewhere.
	Ami GONE FROM CHROMIUM 2013/07/10 21:18:00
	On 2013/07/10 21:06:31, DaleCurtis wrote:
	> On 2013/07/09 23:20:08, Ami Fischman wrote:
	> > what happens if you try:
	> > while ((const int source_idx = virtual_source_idx) < block_size_)
	> > on the various platforms?
	>
	> As discussed offline, this doesn't compile
	FTR that's only b/c of the "const"; declaring the var in loop scope should still work.
	> and moving the decl outside of the
	> while shows the same performance hit seen elsewhere.
	:(
	270 const int source_idx = virtual_source_idx_;

	271 #else

	272 while (true) {

	273 const int source_idx = virtual_source_idx_;

	274 if (source_idx >= block_size_)

	275 break;

	276 #endif
	Ami GONE FROM CHROMIUM 2013/07/10 21:18:00
	Perverted sense of perversion leads me to wonder how this performs against for (int loop_iters = (block_size_ - virtual_source_idx_) / io_sample_rate_ratio_; loop_iters; --loop_iters)
	DaleCurtis 2013/07/10 23:01:34
	On 2013/07/10 21:18:00, Ami Fischman wrote:
	> Perverted sense of perversion leads me to wonder how this performs against
	>
	> for (int loop_iters = (block_size_ - virtual_source_idx_) /
	> io_sample_rate_ratio_; loop_iters; --loop_iters)
	>
	Works well on my N4 w/ gcc and OSX w/ clang. I've pinged the guy with the A9 to see how it does for him. Strangely if I try to optimize further by making it std::min(loop_iters, remaining_frames) I hit the 20% performance regression previously seen with clang...
	DaleCurtis 2013/07/11 21:04:34
	On 2013/07/10 23:01:34, DaleCurtis wrote:
	> On 2013/07/10 21:18:00, Ami Fischman wrote:
	> > Perverted sense of perversion leads me to wonder how this performs against
	> >
	> > for (int loop_iters = (block_size_ - virtual_source_idx_) /
	> > io_sample_rate_ratio_; loop_iters; --loop_iters)
	> >
	>
	> Works well on my N4 w/ gcc and OSX w/ clang. I've pinged the guy with the A9 to
	> see how it does for him. Strangely if I try to optimize further by making it
	> std::min(loop_iters, remaining_frames) I hit the 20% performance regression
	> previously seen with clang...
	Switched to for(), but needs a ceil() to be accurate. Still faster than the existing approach. Added some DCHECKs() to verify accuracy.
266 // |virtual_source_idx_| lies in between two kernel offsets so figure out	277 // |virtual_source_idx_| lies in between two kernel offsets so figure out

267 // what they are.	278 // what they are.

268 const int source_idx = virtual_source_idx_;

269 const double subsample_remainder = virtual_source_idx_ - source_idx;	279 const double subsample_remainder = virtual_source_idx_ - source_idx;

270	280

271 const double virtual_offset_idx =	281 const double virtual_offset_idx =

272 subsample_remainder * kKernelOffsetCount;	282 subsample_remainder * kKernelOffsetCount;

273 const int offset_idx = virtual_offset_idx;	283 const int offset_idx = virtual_offset_idx;

274	284

275 // We'll compute "convolutions" for the two kernels which straddle	285 // We'll compute "convolutions" for the two kernels which straddle

276 // |virtual_source_idx_|.	286 // |virtual_source_idx_|.

277 const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;	287 const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;

278 const float* k2 = k1 + kKernelSize;	288 const float* k2 = k1 + kKernelSize;

(...skipping 91 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
370 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),	380 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

371 m_sums2, vmovq_n_f32(kernel_interpolation_factor));	381 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

372	382

373 // Sum components together.	383 // Sum components together.

374 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));	384 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

375 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);	385 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

376 }	386 }

377 #endif	387 #endif

378	388

379 } // namespace media	389 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | media/media.gyp » ('j') | media/media.gyp » ('J')