Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010, Google Inc. All rights reserved. | 2 * Copyright (C) 2010, Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 22 matching lines...) Expand all Loading... | |
| 33 #endif | 33 #endif |
| 34 | 34 |
| 35 #if CPU(X86) || CPU(X86_64) | 35 #if CPU(X86) || CPU(X86_64) |
| 36 #include <emmintrin.h> | 36 #include <emmintrin.h> |
| 37 #endif | 37 #endif |
| 38 | 38 |
| 39 #if HAVE(ARM_NEON_INTRINSICS) | 39 #if HAVE(ARM_NEON_INTRINSICS) |
| 40 #include <arm_neon.h> | 40 #include <arm_neon.h> |
| 41 #endif | 41 #endif |
| 42 | 42 |
| 43 #if HAVE(MIPS_MSA_INTRINSICS) | |
| 44 #include "platform/cpu/mips/CommonMacrosMSA.h" | |
| 45 #endif | |
| 46 | |
| 43 #include <math.h> | 47 #include <math.h> |
| 44 #include <algorithm> | 48 #include <algorithm> |
| 45 | 49 |
| 46 namespace blink { | 50 namespace blink { |
| 47 | 51 |
| 48 namespace VectorMath { | 52 namespace VectorMath { |
| 49 | 53 |
| 50 #if OS(MACOSX) | 54 #if OS(MACOSX) |
| 51 // On the Mac we use the highly optimized versions in Accelerate.framework | 55 // On the Mac we use the highly optimized versions in Accelerate.framework |
| 52 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as | 56 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as |
| (...skipping 122 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 175 float32x4_t dest = vld1q_f32(destP); | 179 float32x4_t dest = vld1q_f32(destP); |
| 176 | 180 |
| 177 dest = vmlaq_f32(dest, source, k); | 181 dest = vmlaq_f32(dest, source, k); |
| 178 vst1q_f32(destP, dest); | 182 vst1q_f32(destP, dest); |
| 179 | 183 |
| 180 sourceP += 4; | 184 sourceP += 4; |
| 181 destP += 4; | 185 destP += 4; |
| 182 } | 186 } |
| 183 n = tailFrames; | 187 n = tailFrames; |
| 184 } | 188 } |
| 189 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 190 if ((sourceStride == 1) && (destStride == 1)) { | |
| 191 float* destPCopy = destP; | |
| 192 const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale)); | |
| 193 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 194 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 195 | |
| 196 for (; n >= 32; n -= 32) { | |
| 197 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); | |
|
Raymond Toy
2016/10/03 16:47:06
Are there alignment constraints for sourceP and de
Prashant.Patil
2016/10/04 11:47:27
There are no alignment constraints
| |
| 198 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6 , vDst7); | |
| 199 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale ); | |
| 200 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale ); | |
| 201 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4); | |
| 202 } | |
| 203 | |
| 204 if (n > 0) { | |
| 205 if (n >= 28) { | |
|
Raymond Toy
2016/10/03 16:47:06
Is there really much to be gained in having this c
Prashant.Patil
2016/10/04 11:47:27
OK. I shall remove all cases below 32.
| |
| 206 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 207 vSrc6 = LD_SP(sourceP); | |
| 208 sourceP += 4; | |
| 209 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5); | |
| 210 vDst6 = LD_SP(destPCopy); | |
| 211 destPCopy += 4; | |
| 212 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale); | |
| 213 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale); | |
| 214 vDst6 += vSrc6 * vScale; | |
| 215 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 216 ST_SP(vDst6, destP); | |
| 217 destP += 4; | |
| 218 n -= 28; | |
| 219 } else if (n >= 24) { | |
| 220 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 221 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5); | |
| 222 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale); | |
| 223 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale); | |
| 224 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 225 n -= 24; | |
| 226 } else if (n >= 16) { | |
| 227 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 228 LD_SP4(destPCopy, 4, vDst0, vDst1, vDst2, vDst3); | |
| 229 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale); | |
| 230 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 231 n -= 16; | |
| 232 } else if (n >= 8) { | |
| 233 LD_SP2(sourceP, 4, vSrc0, vSrc1); | |
| 234 LD_SP2(destPCopy, 4, vDst0, vDst1); | |
| 235 VSMA2(vSrc0, vSrc1, vDst0, vDst1, vScale); | |
| 236 ST_SP2(vDst0, vDst1, destP, 4); | |
| 237 n -= 8; | |
| 238 } | |
| 239 if (n >= 4) { | |
| 240 vSrc0 = LD_SP(sourceP); | |
| 241 vDst0 = LD_SP(destPCopy); | |
| 242 vDst0 += vSrc0 * vScale; | |
| 243 ST_SP(vDst0, destP); | |
| 244 sourceP += 4; | |
| 245 destP += 4; | |
| 246 n -= 4; | |
| 247 } | |
| 248 } | |
| 249 } | |
| 185 #endif | 250 #endif |
| 186 while (n) { | 251 while (n) { |
| 187 *destP += *sourceP * *scale; | 252 *destP += *sourceP * *scale; |
| 188 sourceP += sourceStride; | 253 sourceP += sourceStride; |
| 189 destP += destStride; | 254 destP += destStride; |
| 190 n--; | 255 n--; |
| 191 } | 256 } |
| 192 } | 257 } |
| 193 | 258 |
| 194 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess) | 259 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess) |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 252 | 317 |
| 253 while (destP < endP) { | 318 while (destP < endP) { |
| 254 float32x4_t source = vld1q_f32(sourceP); | 319 float32x4_t source = vld1q_f32(sourceP); |
| 255 vst1q_f32(destP, vmulq_n_f32(source, k)); | 320 vst1q_f32(destP, vmulq_n_f32(source, k)); |
| 256 | 321 |
| 257 sourceP += 4; | 322 sourceP += 4; |
| 258 destP += 4; | 323 destP += 4; |
| 259 } | 324 } |
| 260 n = tailFrames; | 325 n = tailFrames; |
| 261 } | 326 } |
| 327 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 328 if ((sourceStride == 1) && (destStride == 1)) { | |
| 329 const v4f32 vScale = (v4f32) __msa_fill_w(*((int32_t *) scale)); | |
| 330 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9; | |
| 331 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9; | |
| 332 | |
| 333 for (; n >= 40; n -= 40) { | |
|
Raymond Toy
2016/10/03 16:47:06
Is it really worth doing blocks of 40 instead of 3
Prashant.Patil
2016/10/04 11:47:27
OK. This was done considering 10 cycle vector load
Raymond Toy
2016/10/04 15:37:44
If there is significant gain in doing this, by all
| |
| 334 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 335 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9); | |
| 336 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScal e); | |
| 337 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScal e); | |
| 338 VSMUL2(vSrc8, vSrc9, vDst8, vDst9, vScale); | |
| 339 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 340 ST_SP4(vDst6, vDst7, vDst8, vDst9, destP, 4); | |
| 341 } | |
| 342 | |
| 343 if (n > 0) { | |
| 344 if (n >= 24) { | |
| 345 if (n >= 32) { | |
| 346 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 347 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale); | |
| 348 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst 7, vScale); | |
| 349 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst 7, destP, 4); | |
| 350 n -= 32; | |
| 351 } else if (n >= 28) { | |
| 352 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ; | |
| 353 vSrc6 = LD_SP(sourceP); | |
| 354 sourceP += 4; | |
| 355 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale); | |
| 356 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale); | |
| 357 vDst6 = vSrc6 * vScale; | |
| 358 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 359 ST_SP(vDst6, destP); | |
| 360 destP += 4; | |
| 361 n -= 28; | |
| 362 } else { | |
| 363 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ; | |
| 364 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale); | |
| 365 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale); | |
| 366 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 367 n -= 24; | |
| 368 } | |
| 369 } else { | |
| 370 if (n >= 16) { | |
| 371 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 372 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale); | |
| 373 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 374 n -= 16; | |
| 375 } else if (n >= 8) { | |
| 376 LD_SP2(sourceP, 4, vSrc0, vSrc1); | |
| 377 VSMUL2(vSrc0, vSrc1, vDst0, vDst1, vScale); | |
| 378 ST_SP2(vDst0, vDst1, destP, 4); | |
| 379 n -= 8; | |
| 380 } | |
| 381 } | |
| 382 if (n >= 4) { | |
| 383 vSrc0 = LD_SP(sourceP); | |
| 384 vDst0 = vSrc0 * vScale; | |
| 385 ST_SP(vDst0, destP); | |
| 386 sourceP += 4; | |
| 387 destP += 4; | |
| 388 n -= 4; | |
| 389 } | |
| 390 } | |
| 391 } | |
| 262 #endif | 392 #endif |
| 263 float k = *scale; | 393 float k = *scale; |
| 264 while (n--) { | 394 while (n--) { |
| 265 *destP = k * *sourceP; | 395 *destP = k * *sourceP; |
| 266 sourceP += sourceStride; | 396 sourceP += sourceStride; |
| 267 destP += destStride; | 397 destP += destStride; |
| 268 } | 398 } |
| 269 #if CPU(X86) || CPU(X86_64) | 399 #if CPU(X86) || CPU(X86_64) |
| 270 } | 400 } |
| 271 #endif | 401 #endif |
| (...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 364 float32x4_t source1 = vld1q_f32(source1P); | 494 float32x4_t source1 = vld1q_f32(source1P); |
| 365 float32x4_t source2 = vld1q_f32(source2P); | 495 float32x4_t source2 = vld1q_f32(source2P); |
| 366 vst1q_f32(destP, vaddq_f32(source1, source2)); | 496 vst1q_f32(destP, vaddq_f32(source1, source2)); |
| 367 | 497 |
| 368 source1P += 4; | 498 source1P += 4; |
| 369 source2P += 4; | 499 source2P += 4; |
| 370 destP += 4; | 500 destP += 4; |
| 371 } | 501 } |
| 372 n = tailFrames; | 502 n = tailFrames; |
| 373 } | 503 } |
| 504 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 505 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | |
| 506 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 507 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15; | |
| 508 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 509 | |
| 510 for (; n >= 32; n -= 32) { | |
| 511 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11); | |
| 512 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15); | |
|
Raymond Toy
2016/10/03 16:47:06
Can we pick better names for vSrc[n]? It's really
Prashant.Patil
2016/10/04 11:47:27
Done.
| |
| 513 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 514 ADD4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7); | |
| 515 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4); | |
| 516 } | |
| 517 | |
| 518 if (n > 0) { | |
| 519 if (n >= 20) { | |
|
Raymond Toy
2016/10/03 16:47:06
Is this really worth doing? In the typical use ca
Prashant.Patil
2016/10/04 11:47:27
Done.
| |
| 520 if (n >= 28) { | |
| 521 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 ); | |
| 522 vSrc10 = LD_SP(source1P); | |
| 523 source1P += 4; | |
| 524 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13); | |
| 525 vSrc14 = LD_SP(source2P); | |
| 526 source2P += 4; | |
| 527 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 528 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); | |
| 529 vDst6 = vSrc10 + vSrc14; | |
| 530 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 531 ST_SP(vDst6, destP); | |
| 532 destP += 4; | |
| 533 n -= 28; | |
| 534 } else if (n >= 24) { | |
| 535 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 ); | |
| 536 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13); | |
| 537 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 538 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); | |
| 539 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 540 ST_SP2(vDst4, vDst5, destP, 4); | |
| 541 n -= 24; | |
| 542 } else { | |
| 543 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 544 vSrc8 = LD_SP(source1P); | |
| 545 source1P += 4; | |
| 546 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 547 vSrc12 = LD_SP(source2P); | |
| 548 source2P += 4; | |
| 549 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 550 vDst4 = vSrc8 + vSrc12; | |
| 551 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 552 ST_SP(vDst4, destP); | |
| 553 destP += 4; | |
| 554 n -= 20; | |
| 555 } | |
| 556 } else if (n >= 4) { | |
| 557 if (n >= 16) { | |
| 558 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 559 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 560 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 561 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 562 n -= 16; | |
| 563 } else if (n >= 12) { | |
| 564 LD_SP2(source1P, 4, vSrc0, vSrc1); | |
| 565 vSrc2 = LD_SP(source1P); | |
| 566 source1P += 4; | |
| 567 LD_SP2(source2P, 4, vSrc4, vSrc5); | |
| 568 vSrc6 = LD_SP(source2P); | |
| 569 source2P += 4; | |
| 570 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); | |
| 571 vDst2 = vSrc2 + vSrc6; | |
| 572 ST_SP2(vDst0, vDst1, destP, 4); | |
| 573 ST_SP(vDst2, destP); | |
| 574 destP += 4; | |
| 575 n -= 12; | |
| 576 } else if (n >= 8) { | |
| 577 LD_SP2(source1P, 4, vSrc0, vSrc1); | |
| 578 LD_SP2(source2P, 4, vSrc4, vSrc5); | |
| 579 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); | |
| 580 ST_SP2(vDst0, vDst1, destP, 4); | |
| 581 n -= 8; | |
| 582 } else { | |
| 583 vSrc0 = LD_SP(source1P); | |
| 584 vSrc4 = LD_SP(source2P); | |
| 585 vDst0 = vSrc0 + vSrc4; | |
| 586 ST_SP(vDst0, destP); | |
| 587 source1P += 4; | |
| 588 source2P += 4; | |
| 589 destP += 4; | |
| 590 n -= 4; | |
| 591 } | |
| 592 } | |
| 593 } | |
| 594 } | |
| 374 #endif | 595 #endif |
| 375 while (n--) { | 596 while (n--) { |
| 376 *destP = *source1P + *source2P; | 597 *destP = *source1P + *source2P; |
| 377 source1P += sourceStride1; | 598 source1P += sourceStride1; |
| 378 source2P += sourceStride2; | 599 source2P += sourceStride2; |
| 379 destP += destStride; | 600 destP += destStride; |
| 380 } | 601 } |
| 381 #if CPU(X86) || CPU(X86_64) | 602 #if CPU(X86) || CPU(X86_64) |
| 382 } | 603 } |
| 383 #endif | 604 #endif |
| (...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 441 float32x4_t source1 = vld1q_f32(source1P); | 662 float32x4_t source1 = vld1q_f32(source1P); |
| 442 float32x4_t source2 = vld1q_f32(source2P); | 663 float32x4_t source2 = vld1q_f32(source2P); |
| 443 vst1q_f32(destP, vmulq_f32(source1, source2)); | 664 vst1q_f32(destP, vmulq_f32(source1, source2)); |
| 444 | 665 |
| 445 source1P += 4; | 666 source1P += 4; |
| 446 source2P += 4; | 667 source2P += 4; |
| 447 destP += 4; | 668 destP += 4; |
| 448 } | 669 } |
| 449 n = tailFrames; | 670 n = tailFrames; |
| 450 } | 671 } |
| 672 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 673 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { | |
| 674 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7; | |
| 675 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15; | |
| 676 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7; | |
| 677 | |
| 678 for (; n >= 32; n -= 32) { | |
| 679 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11); | |
|
Raymond Toy
2016/10/03 16:47:06
Same comment as in line 512.
Prashant.Patil
2016/10/04 11:47:27
Done.
| |
| 680 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15); | |
| 681 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 682 MUL4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7); | |
| 683 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4); | |
| 684 } | |
| 685 | |
| 686 if (n > 0) { | |
| 687 if (n >= 20) { | |
| 688 if (n >= 28) { | |
| 689 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 ); | |
| 690 vSrc10 = LD_SP(source1P); | |
| 691 source1P += 4; | |
| 692 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13); | |
| 693 vSrc14 = LD_SP(source2P); | |
| 694 source2P += 4; | |
| 695 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 696 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); | |
| 697 vDst6 = vSrc10 * vSrc14; | |
| 698 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 699 ST_SP(vDst6, destP); | |
| 700 destP += 4; | |
| 701 n -= 28; | |
| 702 } else if (n >= 24) { | |
| 703 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 ); | |
| 704 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13); | |
| 705 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 706 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5); | |
| 707 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 708 ST_SP2(vDst4, vDst5, destP, 4); | |
| 709 n -= 24; | |
| 710 } else { /* n >= 20 */ | |
| 711 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 712 vSrc8 = LD_SP(source1P); | |
| 713 source1P += 4; | |
| 714 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 715 vSrc12 = LD_SP(source2P); | |
| 716 source2P += 4; | |
| 717 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 718 vDst4 = vSrc8 * vSrc12; | |
| 719 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 720 ST_SP(vDst4, destP); | |
| 721 destP += 4; | |
| 722 n -= 20; | |
| 723 } | |
| 724 } else if (n >= 4) { | |
| 725 if (n >= 16) { | |
| 726 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 727 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 728 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3); | |
| 729 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 730 n -= 16; | |
| 731 } else if (n >= 12) { | |
| 732 LD_SP2(source1P, 4, vSrc0, vSrc1); | |
| 733 vSrc2 = LD_SP(source1P); | |
| 734 source1P += 4; | |
| 735 LD_SP2(source2P, 4, vSrc4, vSrc5); | |
| 736 vSrc6 = LD_SP(source2P); | |
| 737 source2P += 4; | |
| 738 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); | |
| 739 vDst2 = vSrc2 * vSrc6; | |
| 740 ST_SP2(vDst0, vDst1, destP, 4); | |
| 741 ST_SP(vDst2, destP); | |
| 742 destP += 4; | |
| 743 n -= 12; | |
| 744 } else if (n >= 8) { | |
| 745 LD_SP2(source1P, 4, vSrc0, vSrc1); | |
| 746 LD_SP2(source2P, 4, vSrc4, vSrc5); | |
| 747 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1); | |
| 748 ST_SP2(vDst0, vDst1, destP, 4); | |
| 749 n -= 8; | |
| 750 } else { // n >= 4 | |
| 751 vSrc0 = LD_SP(source1P); | |
| 752 vSrc4 = LD_SP(source2P); | |
| 753 vDst0 = vSrc0 * vSrc4; | |
| 754 ST_SP(vDst0, destP); | |
| 755 source1P += 4; | |
| 756 source2P += 4; | |
| 757 destP += 4; | |
| 758 n -= 4; | |
| 759 } | |
| 760 } | |
| 761 } | |
| 762 } | |
| 451 #endif | 763 #endif |
| 452 while (n) { | 764 while (n) { |
| 453 *destP = *source1P * *source2P; | 765 *destP = *source1P * *source2P; |
| 454 source1P += sourceStride1; | 766 source1P += sourceStride1; |
| 455 source2P += sourceStride2; | 767 source2P += sourceStride2; |
| 456 destP += destStride; | 768 destP += destStride; |
| 457 n--; | 769 n--; |
| 458 } | 770 } |
| 459 } | 771 } |
| 460 | 772 |
| (...skipping 169 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 630 sourceP += 4; | 942 sourceP += 4; |
| 631 } | 943 } |
| 632 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax)); | 944 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax)); |
| 633 | 945 |
| 634 float groupMax[2]; | 946 float groupMax[2]; |
| 635 vst1_f32(groupMax, twoMax); | 947 vst1_f32(groupMax, twoMax); |
| 636 max = std::max(groupMax[0], groupMax[1]); | 948 max = std::max(groupMax[0], groupMax[1]); |
| 637 | 949 |
| 638 n = tailFrames; | 950 n = tailFrames; |
| 639 } | 951 } |
| 952 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 953 if (sourceStride == 1) { | |
| 954 v4f32 vMax = {0, }; | |
| 955 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9; | |
| 956 const v16i8 vMask = (v16i8) __msa_fill_w(0x7FFFFFFF); | |
| 957 | |
| 958 for (; n >= 40; n -= 40) { | |
| 959 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
|
Raymond Toy
2016/10/03 16:47:06
Same comment as in line 333.
Prashant.Patil
2016/10/04 11:47:27
Done.
| |
| 960 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9); | |
| 961 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); | |
| 962 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax); | |
| 963 VMAXMGV2(vSrc8, vSrc9, vMask, vMax); | |
| 964 } | |
| 965 | |
| 966 if (n > 0) { | |
| 967 if (n >= 32) { | |
| 968 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7); | |
| 969 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); | |
| 970 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax); | |
| 971 n -= 32; | |
| 972 } else if (n >= 28) { | |
| 973 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 974 vSrc6 = LD_SP(sourceP); | |
| 975 sourceP += 4; | |
| 976 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); | |
| 977 VMAXMGV2(vSrc4, vSrc5, vMask, vMax); | |
| 978 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc6 & vMask)); | |
| 979 n -= 28; | |
| 980 } else if (n >= 24) { | |
| 981 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 982 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); | |
| 983 VMAXMGV2(vSrc4, vSrc5, vMask, vMax); | |
| 984 n -= 24; | |
| 985 } else if (n >= 16) { | |
| 986 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 987 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax); | |
| 988 n -= 16; | |
| 989 } else if (n >= 8) { | |
| 990 LD_SP2(sourceP, 4, vSrc0, vSrc1); | |
| 991 VMAXMGV2(vSrc0, vSrc1, vMask, vMax); | |
| 992 n -= 8; | |
| 993 } | |
| 994 | |
| 995 if (n >= 4) { | |
| 996 vSrc0 = LD_SP(sourceP); | |
| 997 sourceP += 4; | |
| 998 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc0 & vMask)); | |
| 999 n -= 4; | |
| 1000 } | |
| 1001 } | |
| 1002 | |
| 1003 max = std::max(max, vMax[0]); | |
| 1004 max = std::max(max, vMax[1]); | |
| 1005 max = std::max(max, vMax[2]); | |
| 1006 max = std::max(max, vMax[3]); | |
| 1007 } | |
| 640 #endif | 1008 #endif |
| 641 | 1009 |
| 642 while (n--) { | 1010 while (n--) { |
| 643 max = std::max(max, fabsf(*sourceP)); | 1011 max = std::max(max, fabsf(*sourceP)); |
| 644 sourceP += sourceStride; | 1012 sourceP += sourceStride; |
| 645 } | 1013 } |
| 646 | 1014 |
| 647 ASSERT(maxP); | 1015 ASSERT(maxP); |
| 648 *maxP = max; | 1016 *maxP = max; |
| 649 } | 1017 } |
| (...skipping 13 matching lines...) Expand all Loading... | |
| 663 float32x4_t low = vdupq_n_f32(lowThreshold); | 1031 float32x4_t low = vdupq_n_f32(lowThreshold); |
| 664 float32x4_t high = vdupq_n_f32(highThreshold); | 1032 float32x4_t high = vdupq_n_f32(highThreshold); |
| 665 while (destP < endP) { | 1033 while (destP < endP) { |
| 666 float32x4_t source = vld1q_f32(sourceP); | 1034 float32x4_t source = vld1q_f32(sourceP); |
| 667 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); | 1035 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); |
| 668 sourceP += 4; | 1036 sourceP += 4; |
| 669 destP += 4; | 1037 destP += 4; |
| 670 } | 1038 } |
| 671 n = tailFrames; | 1039 n = tailFrames; |
| 672 } | 1040 } |
| 1041 #elif HAVE(MIPS_MSA_INTRINSICS) | |
| 1042 if ((sourceStride == 1) && (destStride == 1)) { | |
| 1043 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9; | |
| 1044 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9; | |
| 1045 const v4f32 vLowThr = (v4f32) __msa_fill_w(*((int32_t *) lowThresholdP)) ; | |
| 1046 const v4f32 vHighThr = (v4f32) __msa_fill_w(*((int32_t *) highThresholdP )); | |
| 1047 | |
| 1048 for (; n >= 40; n -= 40) { | |
| 1049 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7); | |
| 1050 LD_SP2(sourceP, 4, vSrc8, vSrc9); | |
| 1051 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3); | |
| 1052 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7); | |
| 1053 VCLIP2(vSrc8, vSrc9, vLowThr, vHighThr, vDst8, vDst9); | |
| 1054 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4); | |
| 1055 ST_SP2(vDst8, vDst9, destP, 4); | |
| 1056 } | |
| 1057 | |
| 1058 if (n > 0) { | |
| 1059 if (n >= 32) { | |
| 1060 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7); | |
| 1061 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3); | |
| 1062 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDs t5, vDst6, vDst7); | |
| 1063 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, d estP, 4); | |
| 1064 n -= 32; | |
| 1065 } else if (n >= 28) { | |
| 1066 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 1067 vSrc6 = LD_SP(sourceP); | |
| 1068 sourceP += 4; | |
| 1069 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3); | |
| 1070 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5); | |
| 1071 vDst6 = __msa_fmax_w(__msa_fmin_w(vSrc6, vHighThr), vLowThr); | |
| 1072 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 1073 ST_SP(vDst6, destP); | |
| 1074 destP += 4; | |
| 1075 n -= 28; | |
| 1076 } else if (n >= 24) { | |
| 1077 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5); | |
| 1078 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3); | |
| 1079 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5); | |
| 1080 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4); | |
| 1081 n -= 24; | |
| 1082 } else if (n >= 16) { | |
| 1083 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3); | |
| 1084 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3); | |
| 1085 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4); | |
| 1086 n -= 16; | |
| 1087 } else if (n >= 8) { | |
| 1088 LD_SP2(sourceP, 4, vSrc0, vSrc1); | |
| 1089 VCLIP2(vSrc0, vSrc1, vLowThr, vHighThr, vDst0, vDst1); | |
| 1090 ST_SP2(vDst0, vDst1, destP, 4); | |
| 1091 n -= 8; | |
| 1092 } | |
| 1093 if (n >= 4) { | |
| 1094 vSrc0 = LD_SP(sourceP); | |
| 1095 sourceP += 4; | |
| 1096 vDst0 = __msa_fmax_w(__msa_fmin_w(vSrc0, vHighThr), vLowThr); | |
| 1097 ST_SP(vDst0, destP); | |
| 1098 destP += 4; | |
| 1099 n -= 4; | |
| 1100 } | |
| 1101 } | |
| 1102 } | |
| 673 #endif | 1103 #endif |
| 674 while (n--) { | 1104 while (n--) { |
| 675 *destP = clampTo(*sourceP, lowThreshold, highThreshold); | 1105 *destP = clampTo(*sourceP, lowThreshold, highThreshold); |
| 676 sourceP += sourceStride; | 1106 sourceP += sourceStride; |
| 677 destP += destStride; | 1107 destP += destStride; |
| 678 } | 1108 } |
| 679 } | 1109 } |
| 680 | 1110 |
| 681 #endif // OS(MACOSX) | 1111 #endif // OS(MACOSX) |
| 682 | 1112 |
| 683 } // namespace VectorMath | 1113 } // namespace VectorMath |
| 684 | 1114 |
| 685 } // namespace blink | 1115 } // namespace blink |
| 686 | 1116 |
| OLD | NEW |