Chromium Code Reviews
Unified Diff: Source/platform/audio/VectorMath.cpp

Issue 255573002: More optimized approach for NEON in VectorMath
Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 8 months ago
 /*
  * Copyright (C) 2010, Google Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
(...skipping 161 matching lines...)
 
     if (destAligned)
         SSE2_MULT_ADD(load, store)
     else
         SSE2_MULT_ADD(loadu, storeu)
 
     n = tailFrames;
 }
 #elif HAVE(ARM_NEON_INTRINSICS)
     if ((sourceStride == 1) && (destStride == 1)) {
-        int tailFrames = n % 4;
-        const float* endP = destP + n - tailFrames;
-
-        float32x4_t k = vdupq_n_f32(*scale);
-        while (destP < endP) {
-            float32x4_t source = vld1q_f32(sourceP);
-            float32x4_t dest = vld1q_f32(destP);
-
-            dest = vmlaq_f32(dest, source, k);
-            vst1q_f32(destP, dest);
-
-            sourceP += 4;
-            destP += 4;
+        unsigned tailFrames = n & 15;
+        float32x4_t scaleNum = vdupq_n_f32(*scale);
+
+        for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+            float32x4_t dest0 = vld1q_f32(destP);
+            float32x4_t source0 = vld1q_f32(sourceP);
+
+            float32x4_t dest1 = vld1q_f32(destP + 4);
+            float32x4_t source1 = vld1q_f32(sourceP + 4);
+
+            float32x4_t dest2 = vld1q_f32(destP + 8);
+            float32x4_t source2 = vld1q_f32(sourceP + 8);
+
+            float32x4_t dest3 = vld1q_f32(destP + 12);
+            float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+            float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum);
+            float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum);
+            float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum);
+            float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum);
+
+            vst1q_f32(destP, result0);
+            vst1q_f32(destP + 4, result1);
+            vst1q_f32(destP + 8, result2);
+            vst1q_f32(destP + 12, result3);
+
+            sourceP += 16;
+            destP += 16;
         }
         n = tailFrames;
     }
 #endif
     while (n) {
         *destP += *sourceP * *scale;
         sourceP += sourceStride;
         destP += destStride;
         n--;
     }
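The hunk above is the heart of the patch: instead of one load/multiply-accumulate/store per four frames, each iteration now handles 16 frames in four independent groups, so consecutive vmlaq_f32 operations do not stall on a single register chain. A minimal stand-alone sketch of the pattern, assuming unit strides; the function name and signature are illustrative, not the exact Blink declaration:

#include <arm_neon.h>

// Hypothetical free-standing version of the unrolled loop above:
// destP[i] += sourceP[i] * scale for framesToProcess contiguous floats.
void multiplyAddUnrolled(const float* sourceP, float* destP,
                         float scale, unsigned framesToProcess)
{
    unsigned tailFrames = framesToProcess & 15;  // 0..15 frames left for the scalar tail
    float32x4_t scaleNum = vdupq_n_f32(scale);   // broadcast scale into all four lanes once

    for (unsigned loopCount = framesToProcess >> 4; loopCount > 0; loopCount--) {
        // Four independent 4-float groups per iteration (16 frames total).
        float32x4_t d0 = vld1q_f32(destP);
        float32x4_t s0 = vld1q_f32(sourceP);
        float32x4_t d1 = vld1q_f32(destP + 4);
        float32x4_t s1 = vld1q_f32(sourceP + 4);
        float32x4_t d2 = vld1q_f32(destP + 8);
        float32x4_t s2 = vld1q_f32(sourceP + 8);
        float32x4_t d3 = vld1q_f32(destP + 12);
        float32x4_t s3 = vld1q_f32(sourceP + 12);

        // d += s * scale; the four multiply-accumulates are independent of
        // one another, so the core can schedule them back to back.
        vst1q_f32(destP, vmlaq_f32(d0, s0, scaleNum));
        vst1q_f32(destP + 4, vmlaq_f32(d1, s1, scaleNum));
        vst1q_f32(destP + 8, vmlaq_f32(d2, s2, scaleNum));
        vst1q_f32(destP + 12, vmlaq_f32(d3, s3, scaleNum));

        sourceP += 16;
        destP += 16;
    }

    // Scalar tail, mirroring the while (n) loop after #endif above.
    while (tailFrames--)
        *destP++ += *sourceP++ * scale;
}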
(...skipping 46 matching lines...)
     // Non-SSE handling for remaining frames which is less than 4.
     n %= 4;
     while (n) {
         *destP = k * *sourceP;
         sourceP++;
         destP++;
         n--;
     }
 } else { // If strides are not 1, rollback to normal algorithm.
 #elif HAVE(ARM_NEON_INTRINSICS)
-    if ((sourceStride == 1) && (destStride == 1)) {
-        float k = *scale;
-        int tailFrames = n % 4;
-        const float* endP = destP + n - tailFrames;
+    if (sourceStride == 1 && destStride == 1) {
+        unsigned tailFrames = framesToProcess & 15;
+        float32x4_t scaleNum = vdupq_n_f32(*scale);

Raymond Toy 2014/09/22 20:00:36  Is there any performance difference between making
KhNo 2014/09/23 02:07:32  No performance difference for scaleNum, It is just

 
-        while (destP < endP) {
-            float32x4_t source = vld1q_f32(sourceP);
-            vst1q_f32(destP, vmulq_n_f32(source, k));
-
-            sourceP += 4;
-            destP += 4;
+        for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
+            float32x4_t source0 = vld1q_f32(sourceP);
+            float32x4_t source1 = vld1q_f32(sourceP + 4);
+            float32x4_t source2 = vld1q_f32(sourceP + 8);
+            float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+            float32x4_t result0 = vmulq_f32(source0, scaleNum);
+            float32x4_t result1 = vmulq_f32(source1, scaleNum);
+            float32x4_t result2 = vmulq_f32(source2, scaleNum);
+            float32x4_t result3 = vmulq_f32(source3, scaleNum);
+
+            vst1q_f32(destP, result0);
+            vst1q_f32(destP + 4, result1);
+            vst1q_f32(destP + 8, result2);
+            vst1q_f32(destP + 12, result3);
+
+            sourceP += 16;
+            destP += 16;
         }
         n = tailFrames;
     }
 #endif
     float k = *scale;
     while (n--) {
         *destP = k * *sourceP;
         sourceP += sourceStride;
         destP += destStride;
     }
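On the scaleNum thread above: NEON has both a multiply-by-scalar form and a plain vector multiply, so the patch's choice is mostly about hoisting the broadcast out of the loop and naming the value. A sketch of the two equivalent spellings (helper names here are illustrative):

#include <arm_neon.h>

// Pre-patch style: multiply each lane by the scalar directly.
float32x4_t scaleDirect(float32x4_t v, float k)
{
    return vmulq_n_f32(v, k);
}

// Patch style: broadcast k once (hoistable out of the loop),
// then use an ordinary vector-by-vector multiply.
float32x4_t scaleViaBroadcast(float32x4_t v, float k)
{
    float32x4_t kv = vdupq_n_f32(k);
    return vmulq_f32(v, kv);
}

Both produce identical lane-wise products; as the reply in the thread says, the dedicated variable is a readability choice rather than a performance one.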
(...skipping 80 matching lines...)
     n %= 4;
     while (n) {
         *destP = *source1P + *source2P;
         source1P++;
         source2P++;
         destP++;
         n--;
     }
 } else { // if strides are not 1, rollback to normal algorithm
 #elif HAVE(ARM_NEON_INTRINSICS)
-    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
-        int tailFrames = n % 4;
-        const float* endP = destP + n - tailFrames;
+    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+        unsigned tailFrames = framesToProcess & 15;
 
-        while (destP < endP) {
-            float32x4_t source1 = vld1q_f32(source1P);
-            float32x4_t source2 = vld1q_f32(source2P);
-            vst1q_f32(destP, vaddq_f32(source1, source2));
-
-            source1P += 4;
-            source2P += 4;
-            destP += 4;
+        for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) {
+            float32x4_t source10 = vld1q_f32(source1P);
+            float32x4_t source20 = vld1q_f32(source2P);
+
+            float32x4_t source11 = vld1q_f32(source1P + 4);
+            float32x4_t source21 = vld1q_f32(source2P + 4);
+
+            float32x4_t source12 = vld1q_f32(source1P + 8);
+            float32x4_t source22 = vld1q_f32(source2P + 8);
+
+            float32x4_t source13 = vld1q_f32(source1P + 12);
+            float32x4_t source23 = vld1q_f32(source2P + 12);
+
+            float32x4_t result0 = vaddq_f32(source10, source20);
+            float32x4_t result1 = vaddq_f32(source11, source21);
+            float32x4_t result2 = vaddq_f32(source12, source22);
+            float32x4_t result3 = vaddq_f32(source13, source23);
+
+            vst1q_f32(destP, result0);
+            vst1q_f32(destP + 4, result1);
+            vst1q_f32(destP + 8, result2);
+            vst1q_f32(destP + 12, result3);
+
+            source1P += 16;
+            source2P += 16;
+            destP += 16;
         }
         n = tailFrames;
     }
 #endif
     while (n--) {
         *destP = *source1P + *source2P;
         source1P += sourceStride1;
         source2P += sourceStride2;
         destP += destStride;
     }
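A detail shared by every NEON hunk in this patch: the tail count changes from n % 4 to framesToProcess & 15, with the trip count derived as framesToProcess >> 4. Since 16 is a power of two, x & 15 equals x % 16 for unsigned x, so the two expressions partition the frames exactly. For example, with framesToProcess = 37 the unrolled loop runs 37 >> 4 = 2 times and covers 2 * 16 = 32 frames, and the scalar tail handles 37 & 15 = 5 frames; 32 + 5 = 37, so every frame is processed exactly once.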
(...skipping 45 matching lines...)
     else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
         SSE2_MULT(load, storeu)
     else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
         SSE2_MULT(loadu, store)
     else // Neither aligned.
         SSE2_MULT(loadu, storeu)
 
     n = tailFrames;
 }
 #elif HAVE(ARM_NEON_INTRINSICS)
-    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
-        int tailFrames = n % 4;
-        const float* endP = destP + n - tailFrames;
+    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
+        unsigned tailFrames = n & 15;
 
-        while (destP < endP) {
-            float32x4_t source1 = vld1q_f32(source1P);
-            float32x4_t source2 = vld1q_f32(source2P);
-            vst1q_f32(destP, vmulq_f32(source1, source2));
-
-            source1P += 4;
-            source2P += 4;
-            destP += 4;
+        for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+            float32x4_t source10 = vld1q_f32(source1P);
+            float32x4_t source20 = vld1q_f32(source2P);

Raymond Toy 2014/09/22 20:00:35  It might be advantageous to load all of the elemen
KhNo 2014/09/23 02:07:32  Thanks for review, I think also it is better for c

+
+            float32x4_t source11 = vld1q_f32(source1P + 4);
+            float32x4_t source21 = vld1q_f32(source2P + 4);
+
+            float32x4_t source12 = vld1q_f32(source1P + 8);
+            float32x4_t source22 = vld1q_f32(source2P + 8);
+
+            float32x4_t source13 = vld1q_f32(source1P + 12);
+            float32x4_t source23 = vld1q_f32(source2P + 12);
+
+            float32x4_t result0 = vmulq_f32(source10, source20);
+            float32x4_t result1 = vmulq_f32(source11, source21);
+            float32x4_t result2 = vmulq_f32(source12, source22);
+            float32x4_t result3 = vmulq_f32(source13, source23);
+
+            vst1q_f32(destP, result0);
+            vst1q_f32(destP + 4, result1);
+            vst1q_f32(destP + 8, result2);
+            vst1q_f32(destP + 12, result3);
+
+            source1P += 16;
+            source2P += 16;
+            destP += 16;
         }
         n = tailFrames;
     }
 #endif
     while (n) {
         *destP = *source1P * *source2P;
         source1P += sourceStride1;
         source2P += sourceStride2;
         destP += destStride;
         n--;
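The review thread in this hunk is about instruction scheduling: issuing all eight loads before any of the multiplies gives each vld1q_f32 time to complete before its result is consumed, rather than alternating load, multiply, and store through one register pair. On the in-order Cortex-A cores common when this patch was written, that manual software pipelining plausibly hides load latency; the trade-off is register pressure, since roughly a dozen of the sixteen 128-bit NEON registers available on ARMv7 are live at once in this loop.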
(...skipping 85 matching lines...)
     }
 
     // Summarize the SSE results.
     const float* groupSumP = reinterpret_cast<float*>(&mSum);
     sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
 
     n = tailFrames;
 }
 #elif HAVE(ARM_NEON_INTRINSICS)
     if (sourceStride == 1) {
-        int tailFrames = n % 4;
-        const float* endP = sourceP + n - tailFrames;
+        unsigned tailFrames = n & 15;
 
-        float32x4_t fourSum = vdupq_n_f32(0);
-        while (sourceP < endP) {
-            float32x4_t source = vld1q_f32(sourceP);
-            fourSum = vmlaq_f32(fourSum, source, source);
-            sourceP += 4;
+        float32x4_t result0 = vdupq_n_f32(0);
+        float32x4_t result1 = result0;
+        float32x4_t result2 = result0;
+        float32x4_t result3 = result0;
+
+        for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) {
+            float32x4_t source0 = vld1q_f32(sourceP);
+            float32x4_t source1 = vld1q_f32(sourceP + 4);
+            float32x4_t source2 = vld1q_f32(sourceP + 8);
+            float32x4_t source3 = vld1q_f32(sourceP + 12);
+
+            result0 = vmlaq_f32(result0, source0, source0);
+            result1 = vmlaq_f32(result1, source1, source1);
+            result2 = vmlaq_f32(result2, source2, source2);
+            result3 = vmlaq_f32(result3, source3, source3);
+
+            sourceP += 16;
         }
-        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
 
-        float groupSum[2];
-        vst1_f32(groupSum, twoSum);
-        sum += groupSum[0] + groupSum[1];
+        result0 = vaddq_f32(result0, result1);
+        result0 = vaddq_f32(result0, result2);
+        result0 = vaddq_f32(result0, result3);
+
+        sum += vgetq_lane_f32(result0, 0);
+        sum += vgetq_lane_f32(result0, 1);
+        sum += vgetq_lane_f32(result0, 2);
+        sum += vgetq_lane_f32(result0, 3);
 
         n = tailFrames;
     }
 #endif
 
     while (n--) {
         float sample = *sourceP;
         sum += sample * sample;
         sourceP += sourceStride;
     }
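The sum-of-squares hunk layers a second idea on top of the unrolling: four independent accumulators, combined only once after the loop, so successive vmlaq_f32 operations never serialize on the same destination register. A minimal sketch of that final reduction (function name illustrative), ending with the pairwise fold the pre-patch code used:

#include <arm_neon.h>

// Collapse four partial sum-of-squares accumulators into one float.
float horizontalSum(float32x4_t r0, float32x4_t r1,
                    float32x4_t r2, float32x4_t r3)
{
    // Combine the four accumulators, then fold 4 lanes -> 2 -> 1.
    float32x4_t sum4 = vaddq_f32(vaddq_f32(r0, r1), vaddq_f32(r2, r3));
    float32x2_t sum2 = vadd_f32(vget_low_f32(sum4), vget_high_f32(sum4));
    float32x2_t sum1 = vpadd_f32(sum2, sum2);  // add adjacent lane pairs
    return vget_lane_f32(sum1, 0);
}

Note that splitting one accumulator into four reorders the floating-point additions, so the result can differ from the scalar loop in the last bits; for an audio level computation that rounding difference is harmless.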
(...skipping 101 matching lines...)
     }
 }
 
 #endif // OS(MACOSX)
 
 } // namespace VectorMath
 
 } // namespace WebCore
 
 #endif // ENABLE(WEB_AUDIO)