OLD | NEW |
---|---|
1 /* | 1 /* |
2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
(...skipping 161 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
172 | 172 |
173 if (destAligned) | 173 if (destAligned) |
174 SSE2_MULT_ADD(load, store) | 174 SSE2_MULT_ADD(load, store) |
175 else | 175 else |
176 SSE2_MULT_ADD(loadu, storeu) | 176 SSE2_MULT_ADD(loadu, storeu) |
177 | 177 |
178 n = tailFrames; | 178 n = tailFrames; |
179 } | 179 } |
180 #elif HAVE(ARM_NEON_INTRINSICS) | 180 #elif HAVE(ARM_NEON_INTRINSICS) |
181 if ((sourceStride == 1) && (destStride == 1)) { | 181 if ((sourceStride == 1) && (destStride == 1)) { |
182 int tailFrames = n % 4; | 182 unsigned tailFrames = n & 15; |
183 const float* endP = destP + n - tailFrames; | 183 float32x4_t scaleNum = vdupq_n_f32(*scale); |
184 | 184 |
185 float32x4_t k = vdupq_n_f32(*scale); | 185 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
186 while (destP < endP) { | 186 float32x4_t dest0 = vld1q_f32(destP); |
187 float32x4_t source = vld1q_f32(sourceP); | 187 float32x4_t source0 = vld1q_f32(sourceP); |
188 float32x4_t dest = vld1q_f32(destP); | |
189 | 188 |
190 dest = vmlaq_f32(dest, source, k); | 189 float32x4_t dest1 = vld1q_f32(destP + 4); |
191 vst1q_f32(destP, dest); | 190 float32x4_t source1 = vld1q_f32(sourceP + 4); |
192 | 191 |
193 sourceP += 4; | 192 float32x4_t dest2 = vld1q_f32(destP + 8); |
194 destP += 4; | 193 float32x4_t source2 = vld1q_f32(sourceP + 8); |
194 | |
195 float32x4_t dest3 = vld1q_f32(destP + 12); | |
196 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
197 | |
198 float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum); | |
199 float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum); | |
200 float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum); | |
201 float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum); | |
202 | |
203 vst1q_f32(destP, result0); | |
204 vst1q_f32(destP + 4, result1); | |
205 vst1q_f32(destP + 8, result2); | |
206 vst1q_f32(destP + 12, result3); | |
207 | |
208 sourceP += 16; | |
209 destP += 16; | |
195 } | 210 } |
196 n = tailFrames; | 211 n = tailFrames; |
197 } | 212 } |
198 #endif | 213 #endif |
199 while (n) { | 214 while (n) { |
200 *destP += *sourceP * *scale; | 215 *destP += *sourceP * *scale; |
201 sourceP += sourceStride; | 216 sourceP += sourceStride; |
202 destP += destStride; | 217 destP += destStride; |
203 n--; | 218 n--; |
204 } | 219 } |
(...skipping 46 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
251 // Non-SSE handling for remaining frames which is less than 4. | 266 // Non-SSE handling for remaining frames which is less than 4. |
252 n %= 4; | 267 n %= 4; |
253 while (n) { | 268 while (n) { |
254 *destP = k * *sourceP; | 269 *destP = k * *sourceP; |
255 sourceP++; | 270 sourceP++; |
256 destP++; | 271 destP++; |
257 n--; | 272 n--; |
258 } | 273 } |
259 } else { // If strides are not 1, rollback to normal algorithm. | 274 } else { // If strides are not 1, rollback to normal algorithm. |
260 #elif HAVE(ARM_NEON_INTRINSICS) | 275 #elif HAVE(ARM_NEON_INTRINSICS) |
261 if ((sourceStride == 1) && (destStride == 1)) { | 276 if (sourceStride == 1 && destStride == 1) { |
262 float k = *scale; | 277 unsigned tailFrames = framesToProcess & 15; |
263 int tailFrames = n % 4; | 278 float32x4_t scaleNum = vdupq_n_f32(*scale); |
Raymond Toy
2014/09/22 20:00:36
Is there any performance difference between making
KhNo
2014/09/23 02:07:32
No performance difference for scaleNum, It is just
| |
264 const float* endP = destP + n - tailFrames; | |
265 | 279 |
266 while (destP < endP) { | 280 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
267 float32x4_t source = vld1q_f32(sourceP); | 281 float32x4_t source0 = vld1q_f32(sourceP); |
268 vst1q_f32(destP, vmulq_n_f32(source, k)); | 282 float32x4_t source1 = vld1q_f32(sourceP + 4); |
283 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
284 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
269 | 285 |
270 sourceP += 4; | 286 float32x4_t result0 = vmulq_f32(source0, scaleNum); |
271 destP += 4; | 287 float32x4_t result1 = vmulq_f32(source1, scaleNum); |
288 float32x4_t result2 = vmulq_f32(source2, scaleNum); | |
289 float32x4_t result3 = vmulq_f32(source3, scaleNum); | |
290 | |
291 vst1q_f32(destP, result0); | |
292 vst1q_f32(destP + 4, result1); | |
293 vst1q_f32(destP + 8, result2); | |
294 vst1q_f32(destP + 12, result3); | |
295 | |
296 sourceP += 16; | |
297 destP += 16; | |
272 } | 298 } |
273 n = tailFrames; | 299 n = tailFrames; |
274 } | 300 } |
275 #endif | 301 #endif |
276 float k = *scale; | 302 float k = *scale; |
277 while (n--) { | 303 while (n--) { |
278 *destP = k * *sourceP; | 304 *destP = k * *sourceP; |
279 sourceP += sourceStride; | 305 sourceP += sourceStride; |
280 destP += destStride; | 306 destP += destStride; |
281 } | 307 } |
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
362 n %= 4; | 388 n %= 4; |
363 while (n) { | 389 while (n) { |
364 *destP = *source1P + *source2P; | 390 *destP = *source1P + *source2P; |
365 source1P++; | 391 source1P++; |
366 source2P++; | 392 source2P++; |
367 destP++; | 393 destP++; |
368 n--; | 394 n--; |
369 } | 395 } |
370 } else { // if strides are not 1, rollback to normal algorithm | 396 } else { // if strides are not 1, rollback to normal algorithm |
371 #elif HAVE(ARM_NEON_INTRINSICS) | 397 #elif HAVE(ARM_NEON_INTRINSICS) |
372 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 398 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
373 int tailFrames = n % 4; | 399 unsigned tailFrames = framesToProcess & 15; |
374 const float* endP = destP + n - tailFrames; | |
375 | 400 |
376 while (destP < endP) { | 401 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
377 float32x4_t source1 = vld1q_f32(source1P); | 402 float32x4_t source10 = vld1q_f32(source1P); |
378 float32x4_t source2 = vld1q_f32(source2P); | 403 float32x4_t source20 = vld1q_f32(source2P); |
379 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
380 | 404 |
381 source1P += 4; | 405 float32x4_t source11 = vld1q_f32(source1P + 4); |
382 source2P += 4; | 406 float32x4_t source21 = vld1q_f32(source2P + 4); |
383 destP += 4; | 407 |
408 float32x4_t source12 = vld1q_f32(source1P + 8); | |
409 float32x4_t source22 = vld1q_f32(source2P + 8); | |
410 | |
411 float32x4_t source13 = vld1q_f32(source1P + 12); | |
412 float32x4_t source23 = vld1q_f32(source2P + 12); | |
413 | |
414 float32x4_t result0 = vaddq_f32(source10, source20); | |
415 float32x4_t result1 = vaddq_f32(source11, source21); | |
416 float32x4_t result2 = vaddq_f32(source12, source22); | |
417 float32x4_t result3 = vaddq_f32(source13, source23); | |
418 | |
419 vst1q_f32(destP, result0); | |
420 vst1q_f32(destP + 4, result1); | |
421 vst1q_f32(destP + 8, result2); | |
422 vst1q_f32(destP + 12, result3); | |
423 | |
424 source1P += 16; | |
425 source2P += 16; | |
426 destP += 16; | |
384 } | 427 } |
385 n = tailFrames; | 428 n = tailFrames; |
386 } | 429 } |
387 #endif | 430 #endif |
388 while (n--) { | 431 while (n--) { |
389 *destP = *source1P + *source2P; | 432 *destP = *source1P + *source2P; |
390 source1P += sourceStride1; | 433 source1P += sourceStride1; |
391 source2P += sourceStride2; | 434 source2P += sourceStride2; |
392 destP += destStride; | 435 destP += destStride; |
393 } | 436 } |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
439 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 482 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
440 SSE2_MULT(load, storeu) | 483 SSE2_MULT(load, storeu) |
441 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 484 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
442 SSE2_MULT(loadu, store) | 485 SSE2_MULT(loadu, store) |
443 else // Neither aligned. | 486 else // Neither aligned. |
444 SSE2_MULT(loadu, storeu) | 487 SSE2_MULT(loadu, storeu) |
445 | 488 |
446 n = tailFrames; | 489 n = tailFrames; |
447 } | 490 } |
448 #elif HAVE(ARM_NEON_INTRINSICS) | 491 #elif HAVE(ARM_NEON_INTRINSICS) |
449 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 492 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
450 int tailFrames = n % 4; | 493 unsigned tailFrames = n & 15; |
451 const float* endP = destP + n - tailFrames; | |
452 | 494 |
453 while (destP < endP) { | 495 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
454 float32x4_t source1 = vld1q_f32(source1P); | 496 float32x4_t source10 = vld1q_f32(source1P); |
455 float32x4_t source2 = vld1q_f32(source2P); | 497 float32x4_t source20 = vld1q_f32(source2P); |
Raymond Toy
2014/09/22 20:00:35
It might be advantageous to load all of the elemen
KhNo
2014/09/23 02:07:32
Thanks for review, I think also it is better for c
| |
456 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
457 | 498 |
458 source1P += 4; | 499 float32x4_t source11 = vld1q_f32(source1P + 4); |
459 source2P += 4; | 500 float32x4_t source21 = vld1q_f32(source2P + 4); |
460 destP += 4; | 501 |
502 float32x4_t source12 = vld1q_f32(source1P + 8); | |
503 float32x4_t source22 = vld1q_f32(source2P + 8); | |
504 | |
505 float32x4_t source13 = vld1q_f32(source1P + 12); | |
506 float32x4_t source23 = vld1q_f32(source2P + 12); | |
507 | |
508 float32x4_t result0 = vmulq_f32(source10, source20); | |
509 float32x4_t result1 = vmulq_f32(source11, source21); | |
510 float32x4_t result2 = vmulq_f32(source12, source22); | |
511 float32x4_t result3 = vmulq_f32(source13, source23); | |
512 | |
513 vst1q_f32(destP, result0); | |
514 vst1q_f32(destP + 4, result1); | |
515 vst1q_f32(destP + 8, result2); | |
516 vst1q_f32(destP + 12, result3); | |
517 | |
518 source1P += 16; | |
519 source2P += 16; | |
520 destP += 16; | |
461 } | 521 } |
462 n = tailFrames; | 522 n = tailFrames; |
463 } | 523 } |
464 #endif | 524 #endif |
465 while (n) { | 525 while (n) { |
466 *destP = *source1P * *source2P; | 526 *destP = *source1P * *source2P; |
467 source1P += sourceStride1; | 527 source1P += sourceStride1; |
468 source2P += sourceStride2; | 528 source2P += sourceStride2; |
469 destP += destStride; | 529 destP += destStride; |
470 n--; | 530 n--; |
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
556 } | 616 } |
557 | 617 |
558 // Summarize the SSE results. | 618 // Summarize the SSE results. |
559 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 619 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
560 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 620 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
561 | 621 |
562 n = tailFrames; | 622 n = tailFrames; |
563 } | 623 } |
564 #elif HAVE(ARM_NEON_INTRINSICS) | 624 #elif HAVE(ARM_NEON_INTRINSICS) |
565 if (sourceStride == 1) { | 625 if (sourceStride == 1) { |
566 int tailFrames = n % 4; | 626 unsigned tailFrames = n & 15; |
567 const float* endP = sourceP + n - tailFrames; | |
568 | 627 |
569 float32x4_t fourSum = vdupq_n_f32(0); | 628 float32x4_t result0 = vdupq_n_f32(0); |
570 while (sourceP < endP) { | 629 float32x4_t result1 = result0; |
571 float32x4_t source = vld1q_f32(sourceP); | 630 float32x4_t result2 = result0; |
572 fourSum = vmlaq_f32(fourSum, source, source); | 631 float32x4_t result3 = result0; |
573 sourceP += 4; | 632 |
633 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { | |
634 float32x4_t source0 = vld1q_f32(sourceP); | |
635 float32x4_t source1 = vld1q_f32(sourceP + 4); | |
636 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
637 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
638 | |
639 result0 = vmlaq_f32(result0, source0, source0); | |
640 result1 = vmlaq_f32(result1, source1, source1); | |
641 result2 = vmlaq_f32(result2, source2, source2); | |
642 result3 = vmlaq_f32(result3, source3, source3); | |
643 | |
644 sourceP += 16; | |
574 } | 645 } |
575 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourS um)); | |
576 | 646 |
577 float groupSum[2]; | 647 result0 = vaddq_f32(result0, result1); |
578 vst1_f32(groupSum, twoSum); | 648 result0 = vaddq_f32(result0, result2); |
579 sum += groupSum[0] + groupSum[1]; | 649 result0 = vaddq_f32(result0, result3); |
650 | |
651 sum += vgetq_lane_f32(result0, 0); | |
652 sum += vgetq_lane_f32(result0, 1); | |
653 sum += vgetq_lane_f32(result0, 2); | |
654 sum += vgetq_lane_f32(result0, 3); | |
580 | 655 |
581 n = tailFrames; | 656 n = tailFrames; |
582 } | 657 } |
583 #endif | 658 #endif |
584 | 659 |
585 while (n--) { | 660 while (n--) { |
586 float sample = *sourceP; | 661 float sample = *sourceP; |
587 sum += sample * sample; | 662 sum += sample * sample; |
588 sourceP += sourceStride; | 663 sourceP += sourceStride; |
589 } | 664 } |
(...skipping 101 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
691 } | 766 } |
692 } | 767 } |
693 | 768 |
694 #endif // OS(MACOSX) | 769 #endif // OS(MACOSX) |
695 | 770 |
696 } // namespace VectorMath | 771 } // namespace VectorMath |
697 | 772 |
698 } // namespace WebCore | 773 } // namespace WebCore |
699 | 774 |
700 #endif // ENABLE(WEB_AUDIO) | 775 #endif // ENABLE(WEB_AUDIO) |
OLD | NEW |