media/base/sinc_resampler.cc - Issue 638123004: Type conversion fixes, media/ edition.

Side by Side Diff: media/base/sinc_resampler.cc

Issue 638123004: Type conversion fixes, media/ edition. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 //	4 //

5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_	5 // Initial input buffer layout, dividing into regions r0_ to r4_ (note: r0_, r3_

6 // and r4_ will move after the first load):	6 // and r4_ will move after the first load):

7 //	7 //

8 // |----------------|-----------------------------------------|----------------|	8 // |----------------|-----------------------------------------|----------------|

9 //	9 //

10 // request_frames_	10 // request_frames_

(...skipping 160 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
171	171

172 // Generates a set of windowed sinc() kernels.	172 // Generates a set of windowed sinc() kernels.

173 // We generate a range of sub-sample offsets from 0.0 to 1.0.	173 // We generate a range of sub-sample offsets from 0.0 to 1.0.

174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);	174 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);

175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {	175 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {

176 const float subsample_offset =	176 const float subsample_offset =

177 static_cast<float>(offset_idx) / kKernelOffsetCount;	177 static_cast<float>(offset_idx) / kKernelOffsetCount;

178	178

179 for (int i = 0; i < kKernelSize; ++i) {	179 for (int i = 0; i < kKernelSize; ++i) {

180 const int idx = i + offset_idx * kKernelSize;	180 const int idx = i + offset_idx * kKernelSize;

181 const float pre_sinc = M_PI * (i - kKernelSize / 2 - subsample_offset);	181 const float pre_sinc =

	182 static_cast<float>(M_PI * (i - kKernelSize / 2 - subsample_offset));
	Peter Kasting 2014/10/16 23:50:42 In general in this file, I tried to make the explicit casts exactly match the locations of the previous implicit casts, since it's not clear to me what operations are sensitive to possible precision loss. It may be possible to have fewer casts, better performance, or both by using floats more in intermediate calculations (e.g. by making all the constants at the top of this function floats and using a float version of M_PI), but that sort of a change is beyond my comprehension. Feel free to consider it.
	wolenetz 2014/10/17 19:23:01 (in reply to Peter Kasting's comment of 2014/10/16 23:50:42) I've filed https://crbug.com/424695 to track investigation of these potential improvements. I would prefer keeping that investigation separate from this CL that has intent to eliminate the implicit typecast MSVC warnings. Thanks for pointing out this possibility.
182 kernel_pre_sinc_storage_[idx] = pre_sinc;	183 kernel_pre_sinc_storage_[idx] = pre_sinc;

183	184

184 // Compute Blackman window, matching the offset of the sinc().	185 // Compute Blackman window, matching the offset of the sinc().

185 const float x = (i - subsample_offset) / kKernelSize;	186 const float x = (i - subsample_offset) / kKernelSize;

186 const float window =	187 const float window = static_cast<float>(kA0 - kA1 * cos(2.0 * M_PI * x) +

187 kA0 - kA1 * cos(2.0 * M_PI * x) + kA2 * cos(4.0 * M_PI * x);	188 kA2 * cos(4.0 * M_PI * x));

188 kernel_window_storage_[idx] = window;	189 kernel_window_storage_[idx] = window;

189	190

190 // Compute the sinc with offset, then window the sinc() function and store	191 // Compute the sinc with offset, then window the sinc() function and store

191 // at the correct offset.	192 // at the correct offset.

192 if (pre_sinc == 0) {	193 kernel_storage_[idx] = static_cast<float>(window *
	Peter Kasting 2014/10/16 23:50:43 This is functionally equivalent to the previous code, it just avoids duplicating the static_cast<> (which otherwise would cause Even More Line Wrapping).
193 kernel_storage_[idx] = sinc_scale_factor * window;	194 ((pre_sinc == 0) ?

194 } else {	195 sinc_scale_factor :

195 kernel_storage_[idx] =	196 (sin(sinc_scale_factor * pre_sinc) / pre_sinc)));

196 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;

197 }

198 }	197 }

199 }	198 }

200 }	199 }

201	200

202 void SincResampler::SetRatio(double io_sample_rate_ratio) {	201 void SincResampler::SetRatio(double io_sample_rate_ratio) {

203 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) <	202 if (fabs(io_sample_rate_ratio_ - io_sample_rate_ratio) <

204 std::numeric_limits<double>::epsilon()) {	203 std::numeric_limits<double>::epsilon()) {

205 return;	204 return;

206 }	205 }

207	206

208 io_sample_rate_ratio_ = io_sample_rate_ratio;	207 io_sample_rate_ratio_ = io_sample_rate_ratio;

209	208

210 // Optimize reinitialization by reusing values which are independent of	209 // Optimize reinitialization by reusing values which are independent of

211 // |sinc_scale_factor|. Provides a 3x speedup.	210 // |sinc_scale_factor|. Provides a 3x speedup.

212 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);	211 const double sinc_scale_factor = SincScaleFactor(io_sample_rate_ratio_);

213 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {	212 for (int offset_idx = 0; offset_idx <= kKernelOffsetCount; ++offset_idx) {

214 for (int i = 0; i < kKernelSize; ++i) {	213 for (int i = 0; i < kKernelSize; ++i) {

215 const int idx = i + offset_idx * kKernelSize;	214 const int idx = i + offset_idx * kKernelSize;

216 const float window = kernel_window_storage_[idx];	215 const float window = kernel_window_storage_[idx];

217 const float pre_sinc = kernel_pre_sinc_storage_[idx];	216 const float pre_sinc = kernel_pre_sinc_storage_[idx];

218	217

219 if (pre_sinc == 0) {	218 kernel_storage_[idx] = static_cast<float>(window *

220 kernel_storage_[idx] = sinc_scale_factor * window;	219 ((pre_sinc == 0) ?

221 } else {	220 sinc_scale_factor :

222 kernel_storage_[idx] =	221 (sin(sinc_scale_factor * pre_sinc) / pre_sinc)));

223 window * sin(sinc_scale_factor * pre_sinc) / pre_sinc;

224 }

225 }	222 }

226 }	223 }

227 }	224 }

228	225

229 void SincResampler::Resample(int frames, float* destination) {	226 void SincResampler::Resample(int frames, float* destination) {

230 int remaining_frames = frames;	227 int remaining_frames = frames;

231	228

232 // Step (1) -- Prime the input buffer at the start of the input stream.	229 // Step (1) -- Prime the input buffer at the start of the input stream.

233 if (!buffer_primed_ && remaining_frames) {	230 if (!buffer_primed_ && remaining_frames) {

234 read_cb_.Run(request_frames_, r0_);	231 read_cb_.Run(request_frames_, r0_);

235 buffer_primed_ = true;	232 buffer_primed_ = true;

236 }	233 }

237	234

238 // Step (2) -- Resample! const what we can outside of the loop for speed. It	235 // Step (2) -- Resample! const what we can outside of the loop for speed. It

239 // actually has an impact on ARM performance. See inner loop comment below.	236 // actually has an impact on ARM performance. See inner loop comment below.

240 const double current_io_ratio = io_sample_rate_ratio_;	237 const double current_io_ratio = io_sample_rate_ratio_;

241 const float* const kernel_ptr = kernel_storage_.get();	238 const float* const kernel_ptr = kernel_storage_.get();

242 while (remaining_frames) {	239 while (remaining_frames) {

243 // Note: The loop construct here can severely impact performance on ARM	240 // Note: The loop construct here can severely impact performance on ARM

244 // or when built with clang. See https://codereview.chromium.org/18566009/	241 // or when built with clang. See https://codereview.chromium.org/18566009/

245 int source_idx = virtual_source_idx_;	242 int source_idx = static_cast<int>(virtual_source_idx_);

246 while (source_idx < block_size_) {	243 while (source_idx < block_size_) {

247 // |virtual_source_idx_| lies in between two kernel offsets so figure out	244 // |virtual_source_idx_| lies in between two kernel offsets so figure out

248 // what they are.	245 // what they are.

249 const double subsample_remainder = virtual_source_idx_ - source_idx;	246 const double subsample_remainder = virtual_source_idx_ - source_idx;

250	247

251 const double virtual_offset_idx =	248 const double virtual_offset_idx =

252 subsample_remainder * kKernelOffsetCount;	249 subsample_remainder * kKernelOffsetCount;

253 const int offset_idx = virtual_offset_idx;	250 const int offset_idx = static_cast<int>(virtual_offset_idx);

254	251

255 // We'll compute "convolutions" for the two kernels which straddle	252 // We'll compute "convolutions" for the two kernels which straddle

256 // |virtual_source_idx_|.	253 // |virtual_source_idx_|.

257 const float* const k1 = kernel_ptr + offset_idx * kKernelSize;	254 const float* const k1 = kernel_ptr + offset_idx * kKernelSize;

258 const float* const k2 = k1 + kKernelSize;	255 const float* const k2 = k1 + kKernelSize;

259	256

260 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be	257 // Ensure |k1|, |k2| are 16-byte aligned for SIMD usage. Should always be

261 // true so long as kKernelSize is a multiple of 16.	258 // true so long as kKernelSize is a multiple of 16.

262 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);	259 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k1) & 0x0F);

263 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);	260 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(k2) & 0x0F);

264	261

265 // Initialize input pointer based on quantized |virtual_source_idx_|.	262 // Initialize input pointer based on quantized |virtual_source_idx_|.

266 const float* const input_ptr = r1_ + source_idx;	263 const float* const input_ptr = r1_ + source_idx;

267	264

268 // Figure out how much to weight each kernel's "convolution".	265 // Figure out how much to weight each kernel's "convolution".

269 const double kernel_interpolation_factor =	266 const double kernel_interpolation_factor =

270 virtual_offset_idx - offset_idx;	267 virtual_offset_idx - offset_idx;

271 *destination++ = CONVOLVE_FUNC(	268 *destination++ = CONVOLVE_FUNC(

272 input_ptr, k1, k2, kernel_interpolation_factor);	269 input_ptr, k1, k2, kernel_interpolation_factor);

273	270

274 // Advance the virtual index.	271 // Advance the virtual index.

275 virtual_source_idx_ += current_io_ratio;	272 virtual_source_idx_ += current_io_ratio;

276 source_idx = virtual_source_idx_;	273 source_idx = static_cast<int>(virtual_source_idx_);

277	274

278 if (!--remaining_frames)	275 if (!--remaining_frames)

279 return;	276 return;

280 }	277 }

281	278

282 // Wrap back around to the start.	279 // Wrap back around to the start.

283 DCHECK_GE(virtual_source_idx_, block_size_);	280 DCHECK_GE(virtual_source_idx_, block_size_);

284 virtual_source_idx_ -= block_size_;	281 virtual_source_idx_ -= block_size_;

285	282

286 // Step (3) -- Copy r3_, r4_ to r1_, r2_.	283 // Step (3) -- Copy r3_, r4_ to r1_, r2_.

287 // This wraps the last input frames back to the start of the buffer.	284 // This wraps the last input frames back to the start of the buffer.

288 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);	285 memcpy(r1_, r3_, sizeof(*input_buffer_.get()) * kKernelSize);

289	286

290 // Step (4) -- Reinitialize regions if necessary.	287 // Step (4) -- Reinitialize regions if necessary.

291 if (r0_ == r2_)	288 if (r0_ == r2_)

292 UpdateRegions(true);	289 UpdateRegions(true);

293	290

294 // Step (5) -- Refresh the buffer with more input.	291 // Step (5) -- Refresh the buffer with more input.

295 read_cb_.Run(request_frames_, r0_);	292 read_cb_.Run(request_frames_, r0_);

296 }	293 }

297 }	294 }

298	295

299 int SincResampler::ChunkSize() const {	296 int SincResampler::ChunkSize() const {

300 return block_size_ / io_sample_rate_ratio_;	297 return static_cast<int>(block_size_ / io_sample_rate_ratio_);

301 }	298 }

302	299

303 void SincResampler::Flush() {	300 void SincResampler::Flush() {

304 virtual_source_idx_ = 0;	301 virtual_source_idx_ = 0;

305 buffer_primed_ = false;	302 buffer_primed_ = false;

306 memset(input_buffer_.get(), 0,	303 memset(input_buffer_.get(), 0,

307 sizeof(*input_buffer_.get()) * input_buffer_size_);	304 sizeof(*input_buffer_.get()) * input_buffer_size_);

308 UpdateRegions(false);	305 UpdateRegions(false);

309 }	306 }

310	307

311 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,	308 float SincResampler::Convolve_C(const float* input_ptr, const float* k1,

312 const float* k2,	309 const float* k2,

313 double kernel_interpolation_factor) {	310 double kernel_interpolation_factor) {

314 float sum1 = 0;	311 float sum1 = 0;

315 float sum2 = 0;	312 float sum2 = 0;

316	313

317 // Generate a single output sample. Unrolling this loop hurt performance in	314 // Generate a single output sample. Unrolling this loop hurt performance in

318 // local testing.	315 // local testing.

319 int n = kKernelSize;	316 int n = kKernelSize;

320 while (n--) {	317 while (n--) {

321 sum1 += *input_ptr * *k1++;	318 sum1 += *input_ptr * *k1++;

322 sum2 += *input_ptr++ * *k2++;	319 sum2 += *input_ptr++ * *k2++;

323 }	320 }

324	321

325 // Linearly interpolate the two "convolutions".	322 // Linearly interpolate the two "convolutions".

326 return (1.0 - kernel_interpolation_factor) * sum1	323 return static_cast<float>((1.0 - kernel_interpolation_factor) * sum1 +
	wolenetz 2014/10/17 19:23:01 aside: nice wrapping style catch :)
327 + kernel_interpolation_factor * sum2;	324 kernel_interpolation_factor * sum2);

328 }	325 }

329	326

330 #if defined(ARCH_CPU_X86_FAMILY)	327 #if defined(ARCH_CPU_X86_FAMILY)

331 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,	328 float SincResampler::Convolve_SSE(const float* input_ptr, const float* k1,

332 const float* k2,	329 const float* k2,

333 double kernel_interpolation_factor) {	330 double kernel_interpolation_factor) {

334 __m128 m_input;	331 __m128 m_input;

335 __m128 m_sums1 = _mm_setzero_ps();	332 __m128 m_sums1 = _mm_setzero_ps();

336 __m128 m_sums2 = _mm_setzero_ps();	333 __m128 m_sums2 = _mm_setzero_ps();

337	334

338 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling	335 // Based on |input_ptr| alignment, we need to use loadu or load. Unrolling

339 // these loops hurt performance in local testing.	336 // these loops hurt performance in local testing.

340 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {	337 if (reinterpret_cast<uintptr_t>(input_ptr) & 0x0F) {

341 for (int i = 0; i < kKernelSize; i += 4) {	338 for (int i = 0; i < kKernelSize; i += 4) {

342 m_input = _mm_loadu_ps(input_ptr + i);	339 m_input = _mm_loadu_ps(input_ptr + i);

343 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));	340 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

344 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));	341 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

345 }	342 }

346 } else {	343 } else {

347 for (int i = 0; i < kKernelSize; i += 4) {	344 for (int i = 0; i < kKernelSize; i += 4) {

348 m_input = _mm_load_ps(input_ptr + i);	345 m_input = _mm_load_ps(input_ptr + i);

349 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));	346 m_sums1 = _mm_add_ps(m_sums1, _mm_mul_ps(m_input, _mm_load_ps(k1 + i)));

350 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));	347 m_sums2 = _mm_add_ps(m_sums2, _mm_mul_ps(m_input, _mm_load_ps(k2 + i)));

351 }	348 }

352 }	349 }

353	350

354 // Linearly interpolate the two "convolutions".	351 // Linearly interpolate the two "convolutions".

355 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(1.0 - kernel_interpolation_factor));	352 m_sums1 = _mm_mul_ps(m_sums1, _mm_set_ps1(

356 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(kernel_interpolation_factor));	353 static_cast<float>(1.0 - kernel_interpolation_factor)));
	wolenetz 2014/10/17 19:23:01 aside: From what I can tell, I don't think this loses any precision versus the previous code, though kernel_interpolation_factor is a double.
	Peter Kasting 2014/10/21 19:00:24 (in reply to wolenetz's comment of 2014/10/17 19:23:01) Right, it shouldn't, since _mm_set_ps1() takes a float. I got burned in some other repo by trying to do this instead: _mm_set_ps1(1.0f - static_cast<float>(kernel_interpolation_factor)); ...so I avoided that.
	wolenetz 2014/10/21 19:15:15 (in reply to Peter Kasting's comment of 2014/10/21 19:00:24) Acknowledged.
	354 m_sums2 = _mm_mul_ps(m_sums2, _mm_set_ps1(

	355 static_cast<float>(kernel_interpolation_factor)));

357 m_sums1 = _mm_add_ps(m_sums1, m_sums2);	356 m_sums1 = _mm_add_ps(m_sums1, m_sums2);

358	357

359 // Sum components together.	358 // Sum components together.

360 float result;	359 float result;

361 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);	360 m_sums2 = _mm_add_ps(_mm_movehl_ps(m_sums1, m_sums1), m_sums1);

362 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(	361 _mm_store_ss(&result, _mm_add_ss(m_sums2, _mm_shuffle_ps(

363 m_sums2, m_sums2, 1)));	362 m_sums2, m_sums2, 1)));

364	363

365 return result;	364 return result;

366 }	365 }

(...skipping 20 matching lines...) Expand all Loading...
387 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),	386 vmulq_f32(m_sums1, vmovq_n_f32(1.0 - kernel_interpolation_factor)),

388 m_sums2, vmovq_n_f32(kernel_interpolation_factor));	387 m_sums2, vmovq_n_f32(kernel_interpolation_factor));

389	388

390 // Sum components together.	389 // Sum components together.

391 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));	390 float32x2_t m_half = vadd_f32(vget_high_f32(m_sums1), vget_low_f32(m_sums1));

392 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);	391 return vget_lane_f32(vpadd_f32(m_half, m_half), 0);

393 }	392 }

394 #endif	393 #endif

395	394

396 } // namespace media	395 } // namespace media

OLD	NEW

« no previous file with comments | « media/audio/audio_parameters.cc ('k') | media/cast/cast_defines.h » ('j') | no next file with comments »