Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 161 matching lines...) | |
| 172 | 172 |
| 173 if (destAligned) | 173 if (destAligned) |
| 174 SSE2_MULT_ADD(load, store) | 174 SSE2_MULT_ADD(load, store) |
| 175 else | 175 else |
| 176 SSE2_MULT_ADD(loadu, storeu) | 176 SSE2_MULT_ADD(loadu, storeu) |
| 177 | 177 |
| 178 n = tailFrames; | 178 n = tailFrames; |
| 179 } | 179 } |
| 180 #elif HAVE(ARM_NEON_INTRINSICS) | 180 #elif HAVE(ARM_NEON_INTRINSICS) |
| 181 if ((sourceStride == 1) && (destStride == 1)) { | 181 if ((sourceStride == 1) && (destStride == 1)) { |
| 182 int tailFrames = n % 4; | 182 unsigned tailFrames = n & 15; |
| 183 const float* endP = destP + n - tailFrames; | 183 float32x4_t scaleNum = vdupq_n_f32(*scale); |
| 184 | 184 |
| 185 float32x4_t k = vdupq_n_f32(*scale); | 185 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
| 186 while (destP < endP) { | 186 float32x4_t dest0 = vld1q_f32(destP); |
| 187 float32x4_t source = vld1q_f32(sourceP); | 187 float32x4_t source0 = vld1q_f32(sourceP); |
| 188 float32x4_t dest = vld1q_f32(destP); | |
| 189 | 188 |
| 190 dest = vmlaq_f32(dest, source, k); | 189 float32x4_t dest1 = vld1q_f32(destP + 4); |
| 191 vst1q_f32(destP, dest); | 190 float32x4_t source1 = vld1q_f32(sourceP + 4); |
| 192 | 191 |
| 193 sourceP += 4; | 192 float32x4_t dest2 = vld1q_f32(destP + 8); |
| 194 destP += 4; | 193 float32x4_t source2 = vld1q_f32(sourceP + 8); |
| 194 | |
| 195 float32x4_t dest3 = vld1q_f32(destP + 12); | |
| 196 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
| 197 | |
| 198 float32x4_t result0 = vmlaq_f32(dest0, source0, scaleNum); | |
| 199 float32x4_t result1 = vmlaq_f32(dest1, source1, scaleNum); | |
| 200 float32x4_t result2 = vmlaq_f32(dest2, source2, scaleNum); | |
| 201 float32x4_t result3 = vmlaq_f32(dest3, source3, scaleNum); | |
| 202 | |
| 203 vst1q_f32(destP, result0); | |
| 204 vst1q_f32(destP + 4, result1); | |
| 205 vst1q_f32(destP + 8, result2); | |
| 206 vst1q_f32(destP + 12, result3); | |
| 207 | |
| 208 sourceP += 16; | |
| 209 destP += 16; | |
| 195 } | 210 } |
| 196 n = tailFrames; | 211 n = tailFrames; |
| 197 } | 212 } |
| 198 #endif | 213 #endif |
| 199 while (n) { | 214 while (n) { |
| 200 *destP += *sourceP * *scale; | 215 *destP += *sourceP * *scale; |
| 201 sourceP += sourceStride; | 216 sourceP += sourceStride; |
| 202 destP += destStride; | 217 destP += destStride; |
| 203 n--; | 218 n--; |
| 204 } | 219 } |
| (...skipping 46 matching lines...) | |
| 251 // Non-SSE handling for the remaining frames, which are fewer than 4. | 266 // Non-SSE handling for the remaining frames, which are fewer than 4. |
| 252 n %= 4; | 267 n %= 4; |
| 253 while (n) { | 268 while (n) { |
| 254 *destP = k * *sourceP; | 269 *destP = k * *sourceP; |
| 255 sourceP++; | 270 sourceP++; |
| 256 destP++; | 271 destP++; |
| 257 n--; | 272 n--; |
| 258 } | 273 } |
| 259 } else { // If strides are not 1, rollback to normal algorithm. | 274 } else { // If strides are not 1, rollback to normal algorithm. |
| 260 #elif HAVE(ARM_NEON_INTRINSICS) | 275 #elif HAVE(ARM_NEON_INTRINSICS) |
| 261 if ((sourceStride == 1) && (destStride == 1)) { | 276 if (sourceStride == 1 && destStride == 1) { |
| 262 float k = *scale; | 277 unsigned tailFrames = framesToProcess & 15; |
| 263 int tailFrames = n % 4; | 278 float32x4_t scaleNum = vdupq_n_f32(*scale); |
|
> **Raymond Toy** (2014/09/22 20:00:36): Is there any performance difference between making …
>
> **KhNo** (2014/09/23 02:07:32): No performance difference for scaleNum; it is just …

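To ground the exchange above: a minimal standalone sketch (the helper names are hypothetical, not from the patch) contrasting the old loop's multiply-by-scalar intrinsic with the new loop's hoisted splat. Both forms compute the same results, which matches KhNo's point that the `vdupq_n_f32` form is a readability choice rather than a performance one.

```cpp
#include <arm_neon.h>

// Old idiom: multiply each group of four floats by the scalar directly.
// (Scalar tail handling is omitted here for brevity.)
static void scaleWithMulN(const float* sourceP, float* destP, float k, unsigned n)
{
    for (unsigned i = 0; i + 4 <= n; i += 4)
        vst1q_f32(destP + i, vmulq_n_f32(vld1q_f32(sourceP + i), k));
}

// New idiom: splat the scalar into a vector register once, outside the loop,
// then use the plain vector-by-vector multiply inside it.
static void scaleWithDup(const float* sourceP, float* destP, float k, unsigned n)
{
    float32x4_t scaleNum = vdupq_n_f32(k);
    for (unsigned i = 0; i + 4 <= n; i += 4)
        vst1q_f32(destP + i, vmulq_f32(vld1q_f32(sourceP + i), scaleNum));
}
```
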
| 264 const float* endP = destP + n - tailFrames; | |
| 265 | 279 |
| 266 while (destP < endP) { | 280 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
| 267 float32x4_t source = vld1q_f32(sourceP); | 281 float32x4_t source0 = vld1q_f32(sourceP); |
| 268 vst1q_f32(destP, vmulq_n_f32(source, k)); | 282 float32x4_t source1 = vld1q_f32(sourceP + 4); |
| 283 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
| 284 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
| 269 | 285 |
| 270 sourceP += 4; | 286 float32x4_t result0 = vmulq_f32(source0, scaleNum); |
| 271 destP += 4; | 287 float32x4_t result1 = vmulq_f32(source1, scaleNum); |
| 288 float32x4_t result2 = vmulq_f32(source2, scaleNum); | |
| 289 float32x4_t result3 = vmulq_f32(source3, scaleNum); | |
| 290 | |
| 291 vst1q_f32(destP, result0); | |
| 292 vst1q_f32(destP + 4, result1); | |
| 293 vst1q_f32(destP + 8, result2); | |
| 294 vst1q_f32(destP + 12, result3); | |
| 295 | |
| 296 sourceP += 16; | |
| 297 destP += 16; | |
| 272 } | 298 } |
| 273 n = tailFrames; | 299 n = tailFrames; |
| 274 } | 300 } |
| 275 #endif | 301 #endif |
| 276 float k = *scale; | 302 float k = *scale; |
| 277 while (n--) { | 303 while (n--) { |
| 278 *destP = k * *sourceP; | 304 *destP = k * *sourceP; |
| 279 sourceP += sourceStride; | 305 sourceP += sourceStride; |
| 280 destP += destStride; | 306 destP += destStride; |
| 281 } | 307 } |
| (...skipping 80 matching lines...) | |
| 362 n %= 4; | 388 n %= 4; |
| 363 while (n) { | 389 while (n) { |
| 364 *destP = *source1P + *source2P; | 390 *destP = *source1P + *source2P; |
| 365 source1P++; | 391 source1P++; |
| 366 source2P++; | 392 source2P++; |
| 367 destP++; | 393 destP++; |
| 368 n--; | 394 n--; |
| 369 } | 395 } |
| 370 } else { // if strides are not 1, rollback to normal algorithm | 396 } else { // if strides are not 1, rollback to normal algorithm |
| 371 #elif HAVE(ARM_NEON_INTRINSICS) | 397 #elif HAVE(ARM_NEON_INTRINSICS) |
| 372 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 398 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 373 int tailFrames = n % 4; | 399 unsigned tailFrames = framesToProcess & 15; |
| 374 const float* endP = destP + n - tailFrames; | |
| 375 | 400 |
| 376 while (destP < endP) { | 401 for (unsigned loopCount = (framesToProcess >> 4); loopCount > 0; loopCount--) { |
| 377 float32x4_t source1 = vld1q_f32(source1P); | 402 float32x4_t source10 = vld1q_f32(source1P); |
| 378 float32x4_t source2 = vld1q_f32(source2P); | 403 float32x4_t source20 = vld1q_f32(source2P); |
| 379 vst1q_f32(destP, vaddq_f32(source1, source2)); | |
| 380 | 404 |
| 381 source1P += 4; | 405 float32x4_t source11 = vld1q_f32(source1P + 4); |
| 382 source2P += 4; | 406 float32x4_t source21 = vld1q_f32(source2P + 4); |
| 383 destP += 4; | 407 |
| 408 float32x4_t source12 = vld1q_f32(source1P + 8); | |
| 409 float32x4_t source22 = vld1q_f32(source2P + 8); | |
| 410 | |
| 411 float32x4_t source13 = vld1q_f32(source1P + 12); | |
| 412 float32x4_t source23 = vld1q_f32(source2P + 12); | |
| 413 | |
| 414 float32x4_t result0 = vaddq_f32(source10, source20); | |
| 415 float32x4_t result1 = vaddq_f32(source11, source21); | |
| 416 float32x4_t result2 = vaddq_f32(source12, source22); | |
| 417 float32x4_t result3 = vaddq_f32(source13, source23); | |
| 418 | |
| 419 vst1q_f32(destP, result0); | |
| 420 vst1q_f32(destP + 4, result1); | |
| 421 vst1q_f32(destP + 8, result2); | |
| 422 vst1q_f32(destP + 12, result3); | |
| 423 | |
| 424 source1P += 16; | |
| 425 source2P += 16; | |
| 426 destP += 16; | |
| 384 } | 427 } |
| 385 n = tailFrames; | 428 n = tailFrames; |
| 386 } | 429 } |
| 387 #endif | 430 #endif |
| 388 while (n--) { | 431 while (n--) { |
| 389 *destP = *source1P + *source2P; | 432 *destP = *source1P + *source2P; |
| 390 source1P += sourceStride1; | 433 source1P += sourceStride1; |
| 391 source2P += sourceStride2; | 434 source2P += sourceStride2; |
| 392 destP += destStride; | 435 destP += destStride; |
| 393 } | 436 } |
| (...skipping 45 matching lines...) | |
| 439 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. | 482 else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. |
| 440 SSE2_MULT(load, storeu) | 483 SSE2_MULT(load, storeu) |
| 441 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. | 484 else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. |
| 442 SSE2_MULT(loadu, store) | 485 SSE2_MULT(loadu, store) |
| 443 else // Neither aligned. | 486 else // Neither aligned. |
| 444 SSE2_MULT(loadu, storeu) | 487 SSE2_MULT(loadu, storeu) |
| 445 | 488 |
| 446 n = tailFrames; | 489 n = tailFrames; |
| 447 } | 490 } |
| 448 #elif HAVE(ARM_NEON_INTRINSICS) | 491 #elif HAVE(ARM_NEON_INTRINSICS) |
| 449 if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { | 492 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { |
| 450 int tailFrames = n % 4; | 493 unsigned tailFrames = n & 15; |
| 451 const float* endP = destP + n - tailFrames; | |
| 452 | 494 |
| 453 while (destP < endP) { | 495 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { |
| 454 float32x4_t source1 = vld1q_f32(source1P); | 496 float32x4_t source10 = vld1q_f32(source1P); |
| 455 float32x4_t source2 = vld1q_f32(source2P); | 497 float32x4_t source20 = vld1q_f32(source2P); |
|
> **Raymond Toy** (2014/09/22 20:00:35): It might be advantageous to load all of the elements …
>
> **KhNo** (2014/09/23 02:07:32): Thanks for review, I think also it is better for c…

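For context on the suggestion above, here is a sketch (the function name is hypothetical) of the load-then-compute-then-store structure the new vmul loop adopts: issuing all eight loads per 16-frame iteration before any arithmetic gives the compiler and the pipeline independent work to schedule around load latency, instead of serializing load-multiply-store groups of four.

```cpp
#include <arm_neon.h>

static void multiplyUnrolled(const float* source1P, const float* source2P,
                             float* destP, unsigned framesToProcess)
{
    unsigned tailFrames = framesToProcess & 15;

    for (unsigned loopCount = framesToProcess >> 4; loopCount > 0; loopCount--) {
        // All eight loads are issued first, so none of them waits on a multiply.
        float32x4_t a0 = vld1q_f32(source1P);       float32x4_t b0 = vld1q_f32(source2P);
        float32x4_t a1 = vld1q_f32(source1P + 4);   float32x4_t b1 = vld1q_f32(source2P + 4);
        float32x4_t a2 = vld1q_f32(source1P + 8);   float32x4_t b2 = vld1q_f32(source2P + 8);
        float32x4_t a3 = vld1q_f32(source1P + 12);  float32x4_t b3 = vld1q_f32(source2P + 12);

        // Four independent multiplies, free to pipeline back-to-back.
        float32x4_t r0 = vmulq_f32(a0, b0);
        float32x4_t r1 = vmulq_f32(a1, b1);
        float32x4_t r2 = vmulq_f32(a2, b2);
        float32x4_t r3 = vmulq_f32(a3, b3);

        // Stores last, once every result is available.
        vst1q_f32(destP, r0);
        vst1q_f32(destP + 4, r1);
        vst1q_f32(destP + 8, r2);
        vst1q_f32(destP + 12, r3);

        source1P += 16;
        source2P += 16;
        destP += 16;
    }

    // Scalar tail for the remaining 0-15 frames.
    while (tailFrames--)
        *destP++ = *source1P++ * *source2P++;
}
```
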
| 456 vst1q_f32(destP, vmulq_f32(source1, source2)); | |
| 457 | 498 |
| 458 source1P += 4; | 499 float32x4_t source11 = vld1q_f32(source1P + 4); |
| 459 source2P += 4; | 500 float32x4_t source21 = vld1q_f32(source2P + 4); |
| 460 destP += 4; | 501 |
| 502 float32x4_t source12 = vld1q_f32(source1P + 8); | |
| 503 float32x4_t source22 = vld1q_f32(source2P + 8); | |
| 504 | |
| 505 float32x4_t source13 = vld1q_f32(source1P + 12); | |
| 506 float32x4_t source23 = vld1q_f32(source2P + 12); | |
| 507 | |
| 508 float32x4_t result0 = vmulq_f32(source10, source20); | |
| 509 float32x4_t result1 = vmulq_f32(source11, source21); | |
| 510 float32x4_t result2 = vmulq_f32(source12, source22); | |
| 511 float32x4_t result3 = vmulq_f32(source13, source23); | |
| 512 | |
| 513 vst1q_f32(destP, result0); | |
| 514 vst1q_f32(destP + 4, result1); | |
| 515 vst1q_f32(destP + 8, result2); | |
| 516 vst1q_f32(destP + 12, result3); | |
| 517 | |
| 518 source1P += 16; | |
| 519 source2P += 16; | |
| 520 destP += 16; | |
| 461 } | 521 } |
| 462 n = tailFrames; | 522 n = tailFrames; |
| 463 } | 523 } |
| 464 #endif | 524 #endif |
| 465 while (n) { | 525 while (n) { |
| 466 *destP = *source1P * *source2P; | 526 *destP = *source1P * *source2P; |
| 467 source1P += sourceStride1; | 527 source1P += sourceStride1; |
| 468 source2P += sourceStride2; | 528 source2P += sourceStride2; |
| 469 destP += destStride; | 529 destP += destStride; |
| 470 n--; | 530 n--; |
| (...skipping 85 matching lines...) | |
| 556 } | 616 } |
| 557 | 617 |
| 558 // Summarize the SSE results. | 618 // Summarize the SSE results. |
| 559 const float* groupSumP = reinterpret_cast<float*>(&mSum); | 619 const float* groupSumP = reinterpret_cast<float*>(&mSum); |
| 560 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; | 620 sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; |
| 561 | 621 |
| 562 n = tailFrames; | 622 n = tailFrames; |
| 563 } | 623 } |
| 564 #elif HAVE(ARM_NEON_INTRINSICS) | 624 #elif HAVE(ARM_NEON_INTRINSICS) |
| 565 if (sourceStride == 1) { | 625 if (sourceStride == 1) { |
| 566 int tailFrames = n % 4; | 626 unsigned tailFrames = n & 15; |
| 567 const float* endP = sourceP + n - tailFrames; | |
| 568 | 627 |
| 569 float32x4_t fourSum = vdupq_n_f32(0); | 628 float32x4_t result0 = vdupq_n_f32(0); |
| 570 while (sourceP < endP) { | 629 float32x4_t result1 = result0; |
| 571 float32x4_t source = vld1q_f32(sourceP); | 630 float32x4_t result2 = result0; |
| 572 fourSum = vmlaq_f32(fourSum, source, source); | 631 float32x4_t result3 = result0; |
| 573 sourceP += 4; | 632 |
| 633 for (unsigned loopCount = (n >> 4); loopCount > 0; loopCount--) { | |
| 634 float32x4_t source0 = vld1q_f32(sourceP); | |
| 635 float32x4_t source1 = vld1q_f32(sourceP + 4); | |
| 636 float32x4_t source2 = vld1q_f32(sourceP + 8); | |
| 637 float32x4_t source3 = vld1q_f32(sourceP + 12); | |
| 638 | |
| 639 result0 = vmlaq_f32(result0, source0, source0); | |
| 640 result1 = vmlaq_f32(result1, source1, source1); | |
| 641 result2 = vmlaq_f32(result2, source2, source2); | |
| 642 result3 = vmlaq_f32(result3, source3, source3); | |
| 643 | |
| 644 sourceP += 16; | |
| 574 } | 645 } |
| 575 float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); | |
| 576 | 646 |
| 577 float groupSum[2]; | 647 result0 = vaddq_f32(result0, result1); |
| 578 vst1_f32(groupSum, twoSum); | 648 result0 = vaddq_f32(result0, result2); |
| 579 sum += groupSum[0] + groupSum[1]; | 649 result0 = vaddq_f32(result0, result3); |
| 650 | |
| 651 sum += vgetq_lane_f32(result0, 0); | |
| 652 sum += vgetq_lane_f32(result0, 1); | |
| 653 sum += vgetq_lane_f32(result0, 2); | |
| 654 sum += vgetq_lane_f32(result0, 3); | |
| 580 | 655 |
| 581 n = tailFrames; | 656 n = tailFrames; |
| 582 } | 657 } |
| 583 #endif | 658 #endif |
| 584 | 659 |
| 585 while (n--) { | 660 while (n--) { |
| 586 float sample = *sourceP; | 661 float sample = *sourceP; |
| 587 sum += sample * sample; | 662 sum += sample * sample; |
| 588 sourceP += sourceStride; | 663 sourceP += sourceStride; |
| 589 } | 664 } |
| (...skipping 101 matching lines...) | |
| 691 } | 766 } |
| 692 } | 767 } |
| 693 | 768 |
| 694 #endif // OS(MACOSX) | 769 #endif // OS(MACOSX) |
| 695 | 770 |
| 696 } // namespace VectorMath | 771 } // namespace VectorMath |
| 697 | 772 |
| 698 } // namespace WebCore | 773 } // namespace WebCore |
| 699 | 774 |
| 700 #endif // ENABLE(WEB_AUDIO) | 775 #endif // ENABLE(WEB_AUDIO) |