third_party/WebKit/Source/platform/audio/VectorMath.cpp - Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions

Side by Side Diff: third_party/WebKit/Source/platform/audio/VectorMath.cpp

Issue 2340583003: Add MSA (MIPS SIMD Arch) optimized VectorMath functions (Closed)

Patch Set: Removing zvmul and vsvesq Created 4 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2010, Google Inc. All rights reserved.	2 * Copyright (C) 2010, Google Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions	5 * modification, are permitted provided that the following conditions

6 * are met:	6 * are met:

7 * 1. Redistributions of source code must retain the above copyright	7 * 1. Redistributions of source code must retain the above copyright

8 * notice, this list of conditions and the following disclaimer.	8 * notice, this list of conditions and the following disclaimer.

9 * 2. Redistributions in binary form must reproduce the above copyright	9 * 2. Redistributions in binary form must reproduce the above copyright

10 * notice, this list of conditions and the following disclaimer in the	10 * notice, this list of conditions and the following disclaimer in the

(...skipping 22 matching lines...) Expand all Loading...
33 #endif	33 #endif

34	34

35 #if CPU(X86) \|\| CPU(X86_64)	35 #if CPU(X86) \|\| CPU(X86_64)

36 #include <emmintrin.h>	36 #include <emmintrin.h>

37 #endif	37 #endif

38	38

39 #if HAVE(ARM_NEON_INTRINSICS)	39 #if HAVE(ARM_NEON_INTRINSICS)

40 #include <arm_neon.h>	40 #include <arm_neon.h>

41 #endif	41 #endif

42	42

	43 #if HAVE(MIPS_MSA_INTRINSICS)

	44 #include "platform/cpu/mips/CommonMacrosMSA.h"

	45 #endif

	46

43 #include <math.h>	47 #include <math.h>

44 #include <algorithm>	48 #include <algorithm>

45	49

46 namespace blink {	50 namespace blink {

47	51

48 namespace VectorMath {	52 namespace VectorMath {

49	53

50 #if OS(MACOSX)	54 #if OS(MACOSX)

51 // On the Mac we use the highly optimized versions in Accelerate.framework	55 // On the Mac we use the highly optimized versions in Accelerate.framework

52 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as	56 // In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecL ib/vDSP_translate.h> which defines macros of the same name as

(...skipping 122 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
175 float32x4_t dest = vld1q_f32(destP);	179 float32x4_t dest = vld1q_f32(destP);

176	180

177 dest = vmlaq_f32(dest, source, k);	181 dest = vmlaq_f32(dest, source, k);

178 vst1q_f32(destP, dest);	182 vst1q_f32(destP, dest);

179	183

180 sourceP += 4;	184 sourceP += 4;

181 destP += 4;	185 destP += 4;

182 }	186 }

183 n = tailFrames;	187 n = tailFrames;

184 }	188 }

	189 #elif HAVE(MIPS_MSA_INTRINSICS)

	190 if ((sourceStride == 1) && (destStride == 1)) {

	191 float* destPCopy = destP;

	192 const v4f32 vScale = (v4f32) __msa_fill_w(((int32_t ) scale));

	193 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	194 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	195

	196 for (; n >= 32; n -= 32) {

	197 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);
	Raymond Toy 2016/10/03 16:47:06 Are there alignment constraints for sourceP and de Are there alignment constraints for sourceP and destPCopy? In general AudioArrays are on 16-byte boundaries (I think), but I'm not sure this hold everywhere for all callers. Same issue applies to all of the routines below. Prashant.Patil 2016/10/04 11:47:27 There are no alignment constraints Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Are there alignment constraints for sourceP and destPCopy? In general > AudioArrays are on 16-byte boundaries (I think), but I'm not sure this hold > everywhere for all callers. Same issue applies to all of the routines below. There are no alignment constraints
	198 LD_SP8(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6 , vDst7);

	199 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScale );

	200 VSMA4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScale );

	201 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);

	202 }

	203

	204 if (n > 0) {

	205 if (n >= 28) {
	Raymond Toy 2016/10/03 16:47:06 Is there really much to be gained in having this c Is there really much to be gained in having this complicated unrolling for all of the possible remainders that aren't a multiple of 32 but are a multiple of 4? The primary use case is for blocks of 128, so this code is never executed. for other uses, I think they're always a power of two (greater than 128), so this code is never executed there either. Prashant.Patil 2016/10/04 11:47:27 OK. I shall remove all cases below 32. Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Is there really much to be gained in having this complicated unrolling for all > of the possible remainders that aren't a multiple of 32 but are a multiple of 4? > > The primary use case is for blocks of 128, so this code is never executed. for > other uses, I think they're always a power of two (greater than 128), so this > code is never executed there either. OK. I shall remove all cases below 32.
	206 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	207 vSrc6 = LD_SP(sourceP);

	208 sourceP += 4;

	209 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);

	210 vDst6 = LD_SP(destPCopy);

	211 destPCopy += 4;

	212 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);

	213 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);

	214 vDst6 += vSrc6 * vScale;

	215 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	216 ST_SP(vDst6, destP);

	217 destP += 4;

	218 n -= 28;

	219 } else if (n >= 24) {

	220 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	221 LD_SP6(destPCopy, 4, vDst0, vDst1, vDst2, vDst3, vDst4, vDst5);

	222 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);

	223 VSMA2(vSrc4, vSrc5, vDst4, vDst5, vScale);

	224 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	225 n -= 24;

	226 } else if (n >= 16) {

	227 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	228 LD_SP4(destPCopy, 4, vDst0, vDst1, vDst2, vDst3);

	229 VSMA4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vS cale);

	230 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	231 n -= 16;

	232 } else if (n >= 8) {

	233 LD_SP2(sourceP, 4, vSrc0, vSrc1);

	234 LD_SP2(destPCopy, 4, vDst0, vDst1);

	235 VSMA2(vSrc0, vSrc1, vDst0, vDst1, vScale);

	236 ST_SP2(vDst0, vDst1, destP, 4);

	237 n -= 8;

	238 }

	239 if (n >= 4) {

	240 vSrc0 = LD_SP(sourceP);

	241 vDst0 = LD_SP(destPCopy);

	242 vDst0 += vSrc0 * vScale;

	243 ST_SP(vDst0, destP);

	244 sourceP += 4;

	245 destP += 4;

	246 n -= 4;

	247 }

	248 }

	249 }

185 #endif	250 #endif

186 while (n) {	251 while (n) {

187 destP += sourceP * *scale;	252 destP += sourceP * *scale;

188 sourceP += sourceStride;	253 sourceP += sourceStride;

189 destP += destStride;	254 destP += destStride;

190 n--;	255 n--;

191 }	256 }

192 }	257 }

193	258

194 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess)	259 void vsmul(const float* sourceP, int sourceStride, const float* scale, float* de stP, int destStride, size_t framesToProcess)

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
252	317

253 while (destP < endP) {	318 while (destP < endP) {

254 float32x4_t source = vld1q_f32(sourceP);	319 float32x4_t source = vld1q_f32(sourceP);

255 vst1q_f32(destP, vmulq_n_f32(source, k));	320 vst1q_f32(destP, vmulq_n_f32(source, k));

256	321

257 sourceP += 4;	322 sourceP += 4;

258 destP += 4;	323 destP += 4;

259 }	324 }

260 n = tailFrames;	325 n = tailFrames;

261 }	326 }

	327 #elif HAVE(MIPS_MSA_INTRINSICS)

	328 if ((sourceStride == 1) && (destStride == 1)) {

	329 const v4f32 vScale = (v4f32) __msa_fill_w(((int32_t ) scale));

	330 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;

	331 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9;

	332

	333 for (; n >= 40; n -= 40) {
	Raymond Toy 2016/10/03 16:47:06 Is it really worth doing blocks of 40 instead of 3 Is it really worth doing blocks of 40 instead of 32? Since the typical use case is powers of two, this guarantees the messy code below is run. Using blocks of 32 keeps a nice symmetry with the other code that uses 32. Prashant.Patil 2016/10/04 11:47:27 OK. This was done considering 10 cycle vector load Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Is it really worth doing blocks of 40 instead of 32? Since the typical use case > is powers of two, this guarantees the messy code below is run. > > Using blocks of 32 keeps a nice symmetry with the other code that uses 32. OK. This was done considering 10 cycle vector load latency in one of the mips variants. However most of the variants have ~8 cycle latency now, hence will change this to 32. Raymond Toy 2016/10/04 15:37:44 If there is significant gain in doing this, by all Show quoted text On 2016/10/04 11:47:27, Prashant.Patil wrote: > On 2016/10/03 16:47:06, Raymond Toy wrote: > > Is it really worth doing blocks of 40 instead of 32? Since the typical use > case > > is powers of two, this guarantees the messy code below is run. > > > > Using blocks of 32 keeps a nice symmetry with the other code that uses 32. > > OK. This was done considering 10 cycle vector load latency in one of the mips > variants. However most of the variants have ~8 cycle latency now, hence will > change this to 32. If there is significant gain in doing this, by all means let's do it. It just makes thinking about the code much harder.
	334 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	335 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);

	336 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst3, vScal e);

	337 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst7, vScal e);

	338 VSMUL2(vSrc8, vSrc9, vDst8, vDst9, vScale);

	339 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	340 ST_SP4(vDst6, vDst7, vDst8, vDst9, destP, 4);

	341 }

	342

	343 if (n > 0) {

	344 if (n >= 24) {

	345 if (n >= 32) {

	346 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);

	347 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);

	348 VSMUL4(vSrc4, vSrc5, vSrc6, vSrc7, vDst4, vDst5, vDst6, vDst 7, vScale);

	349 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst 7, destP, 4);

	350 n -= 32;

	351 } else if (n >= 28) {

	352 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ;

	353 vSrc6 = LD_SP(sourceP);

	354 sourceP += 4;

	355 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);

	356 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);

	357 vDst6 = vSrc6 * vScale;

	358 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	359 ST_SP(vDst6, destP);

	360 destP += 4;

	361 n -= 28;

	362 } else {

	363 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5) ;

	364 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);

	365 VSMUL2(vSrc4, vSrc5, vDst4, vDst5, vScale);

	366 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	367 n -= 24;

	368 }

	369 } else {

	370 if (n >= 16) {

	371 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	372 VSMUL4(vSrc0, vSrc1, vSrc2, vSrc3, vDst0, vDst1, vDst2, vDst 3, vScale);

	373 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	374 n -= 16;

	375 } else if (n >= 8) {

	376 LD_SP2(sourceP, 4, vSrc0, vSrc1);

	377 VSMUL2(vSrc0, vSrc1, vDst0, vDst1, vScale);

	378 ST_SP2(vDst0, vDst1, destP, 4);

	379 n -= 8;

	380 }

	381 }

	382 if (n >= 4) {

	383 vSrc0 = LD_SP(sourceP);

	384 vDst0 = vSrc0 * vScale;

	385 ST_SP(vDst0, destP);

	386 sourceP += 4;

	387 destP += 4;

	388 n -= 4;

	389 }

	390 }

	391 }

262 #endif	392 #endif

263 float k = *scale;	393 float k = *scale;

264 while (n--) {	394 while (n--) {

265 destP = k *sourceP;	395 destP = k *sourceP;

266 sourceP += sourceStride;	396 sourceP += sourceStride;

267 destP += destStride;	397 destP += destStride;

268 }	398 }

269 #if CPU(X86) \|\| CPU(X86_64)	399 #if CPU(X86) \|\| CPU(X86_64)

270 }	400 }

271 #endif	401 #endif

(...skipping 92 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
364 float32x4_t source1 = vld1q_f32(source1P);	494 float32x4_t source1 = vld1q_f32(source1P);

365 float32x4_t source2 = vld1q_f32(source2P);	495 float32x4_t source2 = vld1q_f32(source2P);

366 vst1q_f32(destP, vaddq_f32(source1, source2));	496 vst1q_f32(destP, vaddq_f32(source1, source2));

367	497

368 source1P += 4;	498 source1P += 4;

369 source2P += 4;	499 source2P += 4;

370 destP += 4;	500 destP += 4;

371 }	501 }

372 n = tailFrames;	502 n = tailFrames;

373 }	503 }

	504 #elif HAVE(MIPS_MSA_INTRINSICS)

	505 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	506 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	507 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;

	508 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	509

	510 for (; n >= 32; n -= 32) {

	511 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11);

	512 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15);
	Raymond Toy 2016/10/03 16:47:06 Can we pick better names for vSrc[n]? It's really Can we pick better names for vSrc[n]? It's really odd to see source1P loaded into vSrc0-3 and vSrc8-11. Prashant.Patil 2016/10/04 11:47:27 Done. Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Can we pick better names for vSrc[n]? It's really odd to see source1P loaded > into vSrc0-3 and vSrc8-11. Done.
	513 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	514 ADD4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7);

	515 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);

	516 }

	517

	518 if (n > 0) {

	519 if (n >= 20) {
	Raymond Toy 2016/10/03 16:47:06 Is this really worth doing? In the typical use ca Is this really worth doing? In the typical use case n is always a multiple of 32. Prashant.Patil 2016/10/04 11:47:27 Done. Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Is this really worth doing? In the typical use case n is always a multiple of > 32. Done.
	520 if (n >= 28) {

	521 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );

	522 vSrc10 = LD_SP(source1P);

	523 source1P += 4;

	524 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);

	525 vSrc14 = LD_SP(source2P);

	526 source2P += 4;

	527 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	528 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);

	529 vDst6 = vSrc10 + vSrc14;

	530 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	531 ST_SP(vDst6, destP);

	532 destP += 4;

	533 n -= 28;

	534 } else if (n >= 24) {

	535 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );

	536 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);

	537 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	538 ADD2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);

	539 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	540 ST_SP2(vDst4, vDst5, destP, 4);

	541 n -= 24;

	542 } else {

	543 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	544 vSrc8 = LD_SP(source1P);

	545 source1P += 4;

	546 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);

	547 vSrc12 = LD_SP(source2P);

	548 source2P += 4;

	549 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	550 vDst4 = vSrc8 + vSrc12;

	551 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	552 ST_SP(vDst4, destP);

	553 destP += 4;

	554 n -= 20;

	555 }

	556 } else if (n >= 4) {

	557 if (n >= 16) {

	558 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	559 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);

	560 ADD4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	561 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	562 n -= 16;

	563 } else if (n >= 12) {

	564 LD_SP2(source1P, 4, vSrc0, vSrc1);

	565 vSrc2 = LD_SP(source1P);

	566 source1P += 4;

	567 LD_SP2(source2P, 4, vSrc4, vSrc5);

	568 vSrc6 = LD_SP(source2P);

	569 source2P += 4;

	570 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);

	571 vDst2 = vSrc2 + vSrc6;

	572 ST_SP2(vDst0, vDst1, destP, 4);

	573 ST_SP(vDst2, destP);

	574 destP += 4;

	575 n -= 12;

	576 } else if (n >= 8) {

	577 LD_SP2(source1P, 4, vSrc0, vSrc1);

	578 LD_SP2(source2P, 4, vSrc4, vSrc5);

	579 ADD2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);

	580 ST_SP2(vDst0, vDst1, destP, 4);

	581 n -= 8;

	582 } else {

	583 vSrc0 = LD_SP(source1P);

	584 vSrc4 = LD_SP(source2P);

	585 vDst0 = vSrc0 + vSrc4;

	586 ST_SP(vDst0, destP);

	587 source1P += 4;

	588 source2P += 4;

	589 destP += 4;

	590 n -= 4;

	591 }

	592 }

	593 }

	594 }

374 #endif	595 #endif

375 while (n--) {	596 while (n--) {

376 destP = source1P + *source2P;	597 destP = source1P + *source2P;

377 source1P += sourceStride1;	598 source1P += sourceStride1;

378 source2P += sourceStride2;	599 source2P += sourceStride2;

379 destP += destStride;	600 destP += destStride;

380 }	601 }

381 #if CPU(X86) \|\| CPU(X86_64)	602 #if CPU(X86) \|\| CPU(X86_64)

382 }	603 }

383 #endif	604 #endif

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
441 float32x4_t source1 = vld1q_f32(source1P);	662 float32x4_t source1 = vld1q_f32(source1P);

442 float32x4_t source2 = vld1q_f32(source2P);	663 float32x4_t source2 = vld1q_f32(source2P);

443 vst1q_f32(destP, vmulq_f32(source1, source2));	664 vst1q_f32(destP, vmulq_f32(source1, source2));

444	665

445 source1P += 4;	666 source1P += 4;

446 source2P += 4;	667 source2P += 4;

447 destP += 4;	668 destP += 4;

448 }	669 }

449 n = tailFrames;	670 n = tailFrames;

450 }	671 }

	672 #elif HAVE(MIPS_MSA_INTRINSICS)

	673 if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {

	674 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7;

	675 v4f32 vSrc8, vSrc9, vSrc10, vSrc11, vSrc12, vSrc13, vSrc14, vSrc15;

	676 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7;

	677

	678 for (; n >= 32; n -= 32) {

	679 LD_SP8(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9, vSrc10 , vSrc11);
	Raymond Toy 2016/10/03 16:47:06 Same comment as in line 512. Same comment as in line 512. Prashant.Patil 2016/10/04 11:47:27 Done. Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Same comment as in line 512. Done.
	680 LD_SP8(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc13, vSrc 14, vSrc15);

	681 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	682 MUL4(vSrc8, vSrc12, vSrc9, vSrc13, vSrc10, vSrc14, vSrc11, vSrc15, v Dst4, vDst5, vDst6, vDst7);

	683 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);

	684 }

	685

	686 if (n > 0) {

	687 if (n >= 20) {

	688 if (n >= 28) {

	689 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );

	690 vSrc10 = LD_SP(source1P);

	691 source1P += 4;

	692 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);

	693 vSrc14 = LD_SP(source2P);

	694 source2P += 4;

	695 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	696 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);

	697 vDst6 = vSrc10 * vSrc14;

	698 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	699 ST_SP(vDst6, destP);

	700 destP += 4;

	701 n -= 28;

	702 } else if (n >= 24) {

	703 LD_SP6(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc8, vSrc9 );

	704 LD_SP6(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7, vSrc12, vSrc 13);

	705 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	706 MUL2(vSrc8, vSrc12, vSrc9, vSrc13, vDst4, vDst5);

	707 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	708 ST_SP2(vDst4, vDst5, destP, 4);

	709 n -= 24;

	710 } else { /* n >= 20 */

	711 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	712 vSrc8 = LD_SP(source1P);

	713 source1P += 4;

	714 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);

	715 vSrc12 = LD_SP(source2P);

	716 source2P += 4;

	717 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	718 vDst4 = vSrc8 * vSrc12;

	719 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	720 ST_SP(vDst4, destP);

	721 destP += 4;

	722 n -= 20;

	723 }

	724 } else if (n >= 4) {

	725 if (n >= 16) {

	726 LD_SP4(source1P, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	727 LD_SP4(source2P, 4, vSrc4, vSrc5, vSrc6, vSrc7);

	728 MUL4(vSrc0, vSrc4, vSrc1, vSrc5, vSrc2, vSrc6, vSrc3, vSrc7, vDst0, vDst1, vDst2, vDst3);

	729 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	730 n -= 16;

	731 } else if (n >= 12) {

	732 LD_SP2(source1P, 4, vSrc0, vSrc1);

	733 vSrc2 = LD_SP(source1P);

	734 source1P += 4;

	735 LD_SP2(source2P, 4, vSrc4, vSrc5);

	736 vSrc6 = LD_SP(source2P);

	737 source2P += 4;

	738 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);

	739 vDst2 = vSrc2 * vSrc6;

	740 ST_SP2(vDst0, vDst1, destP, 4);

	741 ST_SP(vDst2, destP);

	742 destP += 4;

	743 n -= 12;

	744 } else if (n >= 8) {

	745 LD_SP2(source1P, 4, vSrc0, vSrc1);

	746 LD_SP2(source2P, 4, vSrc4, vSrc5);

	747 MUL2(vSrc0, vSrc4, vSrc1, vSrc5, vDst0, vDst1);

	748 ST_SP2(vDst0, vDst1, destP, 4);

	749 n -= 8;

	750 } else { // n >= 4

	751 vSrc0 = LD_SP(source1P);

	752 vSrc4 = LD_SP(source2P);

	753 vDst0 = vSrc0 * vSrc4;

	754 ST_SP(vDst0, destP);

	755 source1P += 4;

	756 source2P += 4;

	757 destP += 4;

	758 n -= 4;

	759 }

	760 }

	761 }

	762 }

451 #endif	763 #endif

452 while (n) {	764 while (n) {

453 destP = source1P * *source2P;	765 destP = source1P * *source2P;

454 source1P += sourceStride1;	766 source1P += sourceStride1;

455 source2P += sourceStride2;	767 source2P += sourceStride2;

456 destP += destStride;	768 destP += destStride;

457 n--;	769 n--;

458 }	770 }

459 }	771 }

460	772

(...skipping 169 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
630 sourceP += 4;	942 sourceP += 4;

631 }	943 }

632 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax));	944 float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourM ax));

633	945

634 float groupMax[2];	946 float groupMax[2];

635 vst1_f32(groupMax, twoMax);	947 vst1_f32(groupMax, twoMax);

636 max = std::max(groupMax[0], groupMax[1]);	948 max = std::max(groupMax[0], groupMax[1]);

637	949

638 n = tailFrames;	950 n = tailFrames;

639 }	951 }

	952 #elif HAVE(MIPS_MSA_INTRINSICS)

	953 if (sourceStride == 1) {

	954 v4f32 vMax = {0, };

	955 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;

	956 const v16i8 vMask = (v16i8) __msa_fill_w(0x7FFFFFFF);

	957

	958 for (; n >= 40; n -= 40) {

	959 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);
	Raymond Toy 2016/10/03 16:47:06 Same comment as in line 333. Same comment as in line 333. Prashant.Patil 2016/10/04 11:47:27 Done. Show quoted text On 2016/10/03 16:47:06, Raymond Toy wrote: > Same comment as in line 333. Done.
	960 LD_SP4(sourceP, 4, vSrc6, vSrc7, vSrc8, vSrc9);

	961 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);

	962 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);

	963 VMAXMGV2(vSrc8, vSrc9, vMask, vMax);

	964 }

	965

	966 if (n > 0) {

	967 if (n >= 32) {

	968 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7);

	969 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);

	970 VMAXMGV4(vSrc4, vSrc5, vSrc6, vSrc7, vMask, vMax);

	971 n -= 32;

	972 } else if (n >= 28) {

	973 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	974 vSrc6 = LD_SP(sourceP);

	975 sourceP += 4;

	976 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);

	977 VMAXMGV2(vSrc4, vSrc5, vMask, vMax);

	978 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc6 & vMask));

	979 n -= 28;

	980 } else if (n >= 24) {

	981 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	982 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);

	983 VMAXMGV2(vSrc4, vSrc5, vMask, vMax);

	984 n -= 24;

	985 } else if (n >= 16) {

	986 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	987 VMAXMGV4(vSrc0, vSrc1, vSrc2, vSrc3, vMask, vMax);

	988 n -= 16;

	989 } else if (n >= 8) {

	990 LD_SP2(sourceP, 4, vSrc0, vSrc1);

	991 VMAXMGV2(vSrc0, vSrc1, vMask, vMax);

	992 n -= 8;

	993 }

	994

	995 if (n >= 4) {

	996 vSrc0 = LD_SP(sourceP);

	997 sourceP += 4;

	998 vMax = __msa_fmax_w(vMax, (v4f32) ((v16i8) vSrc0 & vMask));

	999 n -= 4;

	1000 }

	1001 }

	1002

	1003 max = std::max(max, vMax[0]);

	1004 max = std::max(max, vMax[1]);

	1005 max = std::max(max, vMax[2]);

	1006 max = std::max(max, vMax[3]);

	1007 }

640 #endif	1008 #endif

641	1009

642 while (n--) {	1010 while (n--) {

643 max = std::max(max, fabsf(*sourceP));	1011 max = std::max(max, fabsf(*sourceP));

644 sourceP += sourceStride;	1012 sourceP += sourceStride;

645 }	1013 }

646	1014

647 ASSERT(maxP);	1015 ASSERT(maxP);

648 *maxP = max;	1016 *maxP = max;

649 }	1017 }

(...skipping 13 matching lines...) Expand all Loading...
663 float32x4_t low = vdupq_n_f32(lowThreshold);	1031 float32x4_t low = vdupq_n_f32(lowThreshold);

664 float32x4_t high = vdupq_n_f32(highThreshold);	1032 float32x4_t high = vdupq_n_f32(highThreshold);

665 while (destP < endP) {	1033 while (destP < endP) {

666 float32x4_t source = vld1q_f32(sourceP);	1034 float32x4_t source = vld1q_f32(sourceP);

667 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));	1035 vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));

668 sourceP += 4;	1036 sourceP += 4;

669 destP += 4;	1037 destP += 4;

670 }	1038 }

671 n = tailFrames;	1039 n = tailFrames;

672 }	1040 }

	1041 #elif HAVE(MIPS_MSA_INTRINSICS)

	1042 if ((sourceStride == 1) && (destStride == 1)) {

	1043 v4f32 vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7, vSrc8, vSr c9;

	1044 v4f32 vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, vDst8, vDs t9;

	1045 const v4f32 vLowThr = (v4f32) __msa_fill_w(((int32_t ) lowThresholdP)) ;

	1046 const v4f32 vHighThr = (v4f32) __msa_fill_w(((int32_t ) highThresholdP ));

	1047

	1048 for (; n >= 40; n -= 40) {

	1049 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSrc6, vSrc7);

	1050 LD_SP2(sourceP, 4, vSrc8, vSrc9);

	1051 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDst1, vDst2, vDst3);

	1052 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDst5, vDst6, vDst7);

	1053 VCLIP2(vSrc8, vSrc9, vLowThr, vHighThr, vDst8, vDst9);

	1054 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, destP , 4);

	1055 ST_SP2(vDst8, vDst9, destP, 4);

	1056 }

	1057

	1058 if (n > 0) {

	1059 if (n >= 32) {

	1060 LD_SP8(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5, vSr c6, vSrc7);

	1061 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);

	1062 VCLIP4(vSrc4, vSrc5, vSrc6, vSrc7, vLowThr, vHighThr, vDst4, vDs t5, vDst6, vDst7);

	1063 ST_SP8(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, vDst6, vDst7, d estP, 4);

	1064 n -= 32;

	1065 } else if (n >= 28) {

	1066 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	1067 vSrc6 = LD_SP(sourceP);

	1068 sourceP += 4;

	1069 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);

	1070 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);

	1071 vDst6 = __msa_fmax_w(__msa_fmin_w(vSrc6, vHighThr), vLowThr);

	1072 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	1073 ST_SP(vDst6, destP);

	1074 destP += 4;

	1075 n -= 28;

	1076 } else if (n >= 24) {

	1077 LD_SP6(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3, vSrc4, vSrc5);

	1078 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);

	1079 VCLIP2(vSrc4, vSrc5, vLowThr, vHighThr, vDst4, vDst5);

	1080 ST_SP6(vDst0, vDst1, vDst2, vDst3, vDst4, vDst5, destP, 4);

	1081 n -= 24;

	1082 } else if (n >= 16) {

	1083 LD_SP4(sourceP, 4, vSrc0, vSrc1, vSrc2, vSrc3);

	1084 VCLIP4(vSrc0, vSrc1, vSrc2, vSrc3, vLowThr, vHighThr, vDst0, vDs t1, vDst2, vDst3);

	1085 ST_SP4(vDst0, vDst1, vDst2, vDst3, destP, 4);

	1086 n -= 16;

	1087 } else if (n >= 8) {

	1088 LD_SP2(sourceP, 4, vSrc0, vSrc1);

	1089 VCLIP2(vSrc0, vSrc1, vLowThr, vHighThr, vDst0, vDst1);

	1090 ST_SP2(vDst0, vDst1, destP, 4);

	1091 n -= 8;

	1092 }

	1093 if (n >= 4) {

	1094 vSrc0 = LD_SP(sourceP);

	1095 sourceP += 4;

	1096 vDst0 = __msa_fmax_w(__msa_fmin_w(vSrc0, vHighThr), vLowThr);

	1097 ST_SP(vDst0, destP);

	1098 destP += 4;

	1099 n -= 4;

	1100 }

	1101 }

	1102 }

673 #endif	1103 #endif

674 while (n--) {	1104 while (n--) {

675 destP = clampTo(sourceP, lowThreshold, highThreshold);	1105 destP = clampTo(sourceP, lowThreshold, highThreshold);

676 sourceP += sourceStride;	1106 sourceP += sourceStride;

677 destP += destStride;	1107 destP += destStride;

678 }	1108 }

679 }	1109 }

680	1110

681 #endif // OS(MACOSX)	1111 #endif // OS(MACOSX)

682	1112

683 } // namespace VectorMath	1113 } // namespace VectorMath

684	1114

685 } // namespace blink	1115 } // namespace blink

686	1116

OLD	NEW