media/base/sinc_resampler.cc - Issue 18566009: Optimize loop condition for SincResampler.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 18566009: Optimize loop condition for SincResampler. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Comments. const! Created 7 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_	5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_

6 // and r4_ will move after the first load):	6 // and r4_ will move after the first load):

7 //	7 //

8 // |----------------|-----------------------------------------|----------------|	8 // |----------------|-----------------------------------------|----------------|

9 //	9 //

10 // request_frames_	10 // request_frames_

(...skipping 120 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
131 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)	131 #elif defined(ARCH_CPU_ARM_FAMILY) && defined(USE_NEON)

132 #define CONVOLVE_FUNC Convolve_NEON	132 #define CONVOLVE_FUNC Convolve_NEON

133 void SincResampler::InitializeCPUSpecificFeatures() {}	133 void SincResampler::InitializeCPUSpecificFeatures() {}

134 #else	134 #else

135 // Unknown architecture.	135 // Unknown architecture.

136 #define CONVOLVE_FUNC Convolve_C	136 #define CONVOLVE_FUNC Convolve_C

137 void SincResampler::InitializeCPUSpecificFeatures() {}	137 void SincResampler::InitializeCPUSpecificFeatures() {}

138 #endif	138 #endif

139	139

140 SincResampler::SincResampler(double io_sample_rate_ratio,	140 SincResampler::SincResampler(double io_sample_rate_ratio,

141 size_t request_frames,	141 int request_frames,

142 const ReadCB& read_cb)	142 const ReadCB& read_cb)

143 : io_sample_rate_ratio_(io_sample_rate_ratio),	143 : io_sample_rate_ratio_(io_sample_rate_ratio),

144 read_cb_(read_cb),	144 read_cb_(read_cb),

145 request_frames_(request_frames),	145 request_frames_(request_frames),

146 input_buffer_size_(request_frames_ + kKernelSize),	146 input_buffer_size_(request_frames_ + kKernelSize),

147 // Create input buffers with a 16-byte alignment for SSE optimizations.	147 // Create input buffers with a 16-byte alignment for SSE optimizations.

148 kernel_storage_(static_cast<float*>(	148 kernel_storage_(static_cast<float*>(

149 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	149 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

150 kernel_pre_sinc_storage_(static_cast<float*>(	150 kernel_pre_sinc_storage_(static_cast<float*>(

151 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	151 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

152 kernel_window_storage_(static_cast<float*>(	152 kernel_window_storage_(static_cast<float*>(

153 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),	153 base::AlignedAlloc(sizeof(float) * kKernelStorageSize, 16))),

154 input_buffer_(static_cast<float*>(	154 input_buffer_(static_cast<float*>(

155 base::AlignedAlloc(sizeof(float) * input_buffer_size_, 16))),	155 base::AlignedAlloc(sizeof(float) * input_buffer_size_, 16))),

156 r1_(input_buffer_.get()),	156 r1_(input_buffer_.get()),

157 r2_(input_buffer_.get() + kKernelSize / 2) {	157 r2_(input_buffer_.get() + kKernelSize / 2) {

	158 CHECK_GT(request_frames_, 0);

158 Flush();	159 Flush();

159 CHECK_GT(block_size_, static_cast<size_t>(kKernelSize))	160 CHECK_GT(block_size_, kKernelSize)

160 << "block_size must be greater than kKernelSize!";	161 << "block_size must be greater than kKernelSize!";

161	162

162 memset(kernel_storage_.get(), 0,	163 memset(kernel_storage_.get(), 0,

163 sizeof(*kernel_storage_.get()) * kKernelStorageSize);	164 sizeof(*kernel_storage_.get()) * kKernelStorageSize);

164 memset(kernel_pre_sinc_storage_.get(), 0,	165 memset(kernel_pre_sinc_storage_.get(), 0,

165 sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);	166 sizeof(*kernel_pre_sinc_storage_.get()) * kKernelStorageSize);

166 memset(kernel_window_storage_.get(), 0,	167 memset(kernel_window_storage_.get(), 0,

167 sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);	168 sizeof(*kernel_window_storage_.get()) * kKernelStorageSize);

168	169

169 InitializeKernel();	170 InitializeKernel();

(...skipping 78 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
248 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;	249 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;

249 }	250 }

250 }	251 }

251 }	252 }

252 }	253 }

253	254

254 void SincResampler::Resample(int frames, float* destination) {	255 void SincResampler::Resample(int frames, float* destination) {

255 int remaining_frames = frames;	256 int remaining_frames = frames;

256	257

257 // Step (1) -- Prime the input buffer at the start of the input stream.	258 // Step (1) -- Prime the input buffer at the start of the input stream.

258 if (!buffer_primed_) {	259 if (!buffer_primed_ && remaining_frames) {

259 read_cb_.Run(request_frames_, r0_);	260 read_cb_.Run(request_frames_, r0_);

260 buffer_primed_ = true;	261 buffer_primed_ = true;

261 }	262 }

262	263

263 // Step (2) -- Resample!	264 // Step (2) -- Resample! const what we can outside of the loop for speed. It

	265 // actually has an impact on ARM performance. See inner loop comment below.

	266 const double current_io_ratio = io_sample_rate_ratio_;

	267 const float* const kernel_ptr = kernel_storage_.get();

264 while (remaining_frames) {	268 while (remaining_frames) {

265 while (virtual_source_idx_ < block_size_) {	269 // |i| may be negative if the last Resample() call ended on an iteration

	270 // that put |virtual_source_idx_| over the limit.

	271 //

	272 // Note: The loop construct here can severely impact performance on ARM

	273 // or when built with clang. See https://codereview.chromium.org/18566009/

	274 for (int i = ceil((block_size_ - virtual_source_idx_) / current_io_ratio);

	275 i > 0; --i) {

	276 DCHECK_LT(virtual_source_idx_, block_size_);

	277

266 // |virtual_source_idx_| lies in between two kernel offsets so figure out	278 // |virtual_source_idx_| lies in between two kernel offsets so figure out

267 // what they are.	279 // what they are.

268 const int source_idx = virtual_source_idx_;	280 const int source_idx = virtual_source_idx_;

269 const double subsample_remainder = virtual_source_idx_ - source_idx;	281 const double subsample_remainder = virtual_source_idx_ - source_idx;

270	282

271 const double virtual_offset_idx =	283 const double virtual_offset_idx =

272 subsample_remainder * kKernelOffsetCount;	284 subsample_remainder * kKernelOffsetCount;

273 const int offset_idx = virtual_offset_idx;	285 const int offset_idx = virtual_offset_idx;

274	286

275 // We'll compute "convolutions" for the two kernels which straddle	287 // We'll compute "convolutions" for the two kernels which straddle

276 // |virtual_source_idx_|.	288 // |virtual_source_idx_|.

277 const float* k1 = kernel_storage_.get() + offset_idx * kKernelSize;	289 const float* const k1 = kernel_ptr + offset_idx * kKernelSize;

278 const float* k2 = k1 + kKernelSize;	290 const float* const k2 = k1 + kKernelSize;

279	291

280 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be	292 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be

281 // true so long as kKernelSize is a multiple of 16.	293 // true so long as kKernelSize is a multiple of 16.

282 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);	294 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

283 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);	295 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

284	296

285 // Initialize input pointer based on quantized |virtual_source_idx_|.	297 // Initialize input pointer based on quantized |virtual_source_idx_|.

286 const float* input_ptr = r1_ + source_idx;	298 const float* const input_ptr = r1_ + source_idx;

287	299

288 // Figure out how much to weight each kernel's "convolution".	300 // Figure out how much to weight each kernel's "convolution".

289 const double kernel_interpolation_factor =	301 const double kernel_interpolation_factor =

290 virtual_offset_idx - offset_idx;	302 virtual_offset_idx - offset_idx;

291 *destination++ = CONVOLVE_FUNC(	303 *destination++ = CONVOLVE_FUNC(

292 input_ptr, k1, k2, kernel_interpolation_factor);	304 input_ptr, k1, k2, kernel_interpolation_factor);

293	305

294 // Advance the virtual index.	306 // Advance the virtual index.

295 virtual_source_idx_ += io_sample_rate_ratio_;	307 virtual_source_idx_ += current_io_ratio;

296	308

297 if (!--remaining_frames)	309 if (!--remaining_frames)

298 return;	310 return;

299 }	311 }

300	312

301 // Wrap back around to the start.	313 // Wrap back around to the start.

302 virtual_source_idx_ -= block_size_;	314 virtual_source_idx_ -= block_size_;

303	315

304 // Step (3) -- Copy r3_, r4_ to r1_, r2_.	316 // Step (3) -- Copy r3_, r4_ to r1_, r2_.

305 // This wraps the last input frames back to the start of the buffer.	317 // This wraps the last input frames back to the start of the buffer.

(...skipping 64 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
370 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),	382 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

371 m_sums2, vmovq_n_f32(kernel_interpolation_factor));	383 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

372	384

373 // Sum components together.	385 // Sum components together.

374 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));	386 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

375 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);	387 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

376 }	388 }

377 #endif	389 #endif

378	390

379 } // namespace media	391 } // namespace media

OLD	NEW

« no previous file with comments | « media/base/sinc_resampler.h ('k') | no next file » | no next file with comments »