| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 3442 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 3453 vmovdqu [edx], ymm0 | 3453 vmovdqu [edx], ymm0 |
| 3454 lea edx, [edx + 32] | 3454 lea edx, [edx + 32] |
| 3455 sub ecx, 32 | 3455 sub ecx, 32 |
| 3456 jg convertloop | 3456 jg convertloop |
| 3457 vzeroupper | 3457 vzeroupper |
| 3458 ret | 3458 ret |
| 3459 } | 3459 } |
| 3460 } | 3460 } |
| 3461 #endif // HAS_MIRRORROW_AVX2 | 3461 #endif // HAS_MIRRORROW_AVX2 |
| 3462 | 3462 |
| 3463 #ifdef HAS_MIRRORROW_SSE2 | |
| 3464 __declspec(naked) | |
| 3465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
| 3466 __asm { | |
| 3467 mov eax, [esp + 4] // src | |
| 3468 mov edx, [esp + 8] // dst | |
| 3469 mov ecx, [esp + 12] // width | |
| 3470 | |
| 3471 convertloop: | |
| 3472 movdqu xmm0, [eax - 16 + ecx] | |
| 3473 movdqa xmm1, xmm0 // swap bytes | |
| 3474 psllw xmm0, 8 | |
| 3475 psrlw xmm1, 8 | |
| 3476 por xmm0, xmm1 | |
| 3477 pshuflw xmm0, xmm0, 0x1b // swap words | |
| 3478 pshufhw xmm0, xmm0, 0x1b | |
| 3479 pshufd xmm0, xmm0, 0x4e // swap qwords | |
| 3480 movdqu [edx], xmm0 | |
| 3481 lea edx, [edx + 16] | |
| 3482 sub ecx, 16 | |
| 3483 jg convertloop | |
| 3484 ret | |
| 3485 } | |
| 3486 } | |
| 3487 #endif // HAS_MIRRORROW_SSE2 | |
| 3488 | |
| 3489 #ifdef HAS_MIRRORROW_UV_SSSE3 | 3463 #ifdef HAS_MIRRORROW_UV_SSSE3 |
| 3490 // Shuffle table for reversing the bytes of UV channels. | 3464 // Shuffle table for reversing the bytes of UV channels. |
| 3491 static const uvec8 kShuffleMirrorUV = { | 3465 static const uvec8 kShuffleMirrorUV = { |
| 3492 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 3466 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
| 3493 }; | 3467 }; |
| 3494 | 3468 |
| 3495 __declspec(naked) | 3469 __declspec(naked) |
| 3496 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 3470 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
| 3497 int width) { | 3471 int width) { |
| 3498 __asm { | 3472 __asm { |
| (...skipping 876 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4375 lea edx, [edx + 8] | 4349 lea edx, [edx + 8] |
| 4376 sub ecx, 16 | 4350 sub ecx, 16 |
| 4377 jg convertloop | 4351 jg convertloop |
| 4378 | 4352 |
| 4379 pop edi | 4353 pop edi |
| 4380 ret | 4354 ret |
| 4381 } | 4355 } |
| 4382 } | 4356 } |
| 4383 #endif // HAS_YUY2TOYROW_SSE2 | 4357 #endif // HAS_YUY2TOYROW_SSE2 |
| 4384 | 4358 |
| 4385 #ifdef HAS_ARGBBLENDROW_SSE2 | |
| 4386 // Blend 8 pixels at a time. | |
| 4387 __declspec(naked) | |
| 4388 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
| 4389 uint8* dst_argb, int width) { | |
| 4390 __asm { | |
| 4391 push esi | |
| 4392 mov eax, [esp + 4 + 4] // src_argb0 | |
| 4393 mov esi, [esp + 4 + 8] // src_argb1 | |
| 4394 mov edx, [esp + 4 + 12] // dst_argb | |
| 4395 mov ecx, [esp + 4 + 16] // width | |
| 4396 pcmpeqb xmm7, xmm7 // generate constant 1 | |
| 4397 psrlw xmm7, 15 | |
| 4398 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | |
| 4399 psrlw xmm6, 8 | |
| 4400 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | |
| 4401 psllw xmm5, 8 | |
| 4402 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | |
| 4403 pslld xmm4, 24 | |
| 4404 sub ecx, 4 | |
| 4405 jl convertloop4b // less than 4 pixels? | |
| 4406 | |
| 4407 // 4 pixel loop. | |
| 4408 convertloop4: | |
| 4409 movdqu xmm3, [eax] // src argb | |
| 4410 lea eax, [eax + 16] | |
| 4411 movdqa xmm0, xmm3 // src argb | |
| 4412 pxor xmm3, xmm4 // ~alpha | |
| 4413 movdqu xmm2, [esi] // _r_b | |
| 4414 psrlw xmm3, 8 // alpha | |
| 4415 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
| 4416 pshuflw xmm3, xmm3, 0F5h | |
| 4417 pand xmm2, xmm6 // _r_b | |
| 4418 paddw xmm3, xmm7 // 256 - alpha | |
| 4419 pmullw xmm2, xmm3 // _r_b * alpha | |
| 4420 movdqu xmm1, [esi] // _a_g | |
| 4421 lea esi, [esi + 16] | |
| 4422 psrlw xmm1, 8 // _a_g | |
| 4423 por xmm0, xmm4 // set alpha to 255 | |
| 4424 pmullw xmm1, xmm3 // _a_g * alpha | |
| 4425 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 4426 paddusb xmm0, xmm2 // + src argb | |
| 4427 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 4428 paddusb xmm0, xmm1 // + src argb | |
| 4429 movdqu [edx], xmm0 | |
| 4430 lea edx, [edx + 16] | |
| 4431 sub ecx, 4 | |
| 4432 jge convertloop4 | |
| 4433 | |
| 4434 convertloop4b: | |
| 4435 add ecx, 4 - 1 | |
| 4436 jl convertloop1b | |
| 4437 | |
| 4438 // 1 pixel loop. | |
| 4439 convertloop1: | |
| 4440 movd xmm3, [eax] // src argb | |
| 4441 lea eax, [eax + 4] | |
| 4442 movdqa xmm0, xmm3 // src argb | |
| 4443 pxor xmm3, xmm4 // ~alpha | |
| 4444 movd xmm2, [esi] // _r_b | |
| 4445 psrlw xmm3, 8 // alpha | |
| 4446 pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
| 4447 pshuflw xmm3, xmm3, 0F5h | |
| 4448 pand xmm2, xmm6 // _r_b | |
| 4449 paddw xmm3, xmm7 // 256 - alpha | |
| 4450 pmullw xmm2, xmm3 // _r_b * alpha | |
| 4451 movd xmm1, [esi] // _a_g | |
| 4452 lea esi, [esi + 4] | |
| 4453 psrlw xmm1, 8 // _a_g | |
| 4454 por xmm0, xmm4 // set alpha to 255 | |
| 4455 pmullw xmm1, xmm3 // _a_g * alpha | |
| 4456 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
| 4457 paddusb xmm0, xmm2 // + src argb | |
| 4458 pand xmm1, xmm5 // a_g_ convert to 8 bits again | |
| 4459 paddusb xmm0, xmm1 // + src argb | |
| 4460 movd [edx], xmm0 | |
| 4461 lea edx, [edx + 4] | |
| 4462 sub ecx, 1 | |
| 4463 jge convertloop1 | |
| 4464 | |
| 4465 convertloop1b: | |
| 4466 pop esi | |
| 4467 ret | |
| 4468 } | |
| 4469 } | |
| 4470 #endif // HAS_ARGBBLENDROW_SSE2 | |
| 4471 | |
| 4472 #ifdef HAS_ARGBBLENDROW_SSSE3 | 4359 #ifdef HAS_ARGBBLENDROW_SSSE3 |
| 4473 // Shuffle table for isolating alpha. | 4360 // Shuffle table for isolating alpha. |
| 4474 static const uvec8 kShuffleAlpha = { | 4361 static const uvec8 kShuffleAlpha = { |
| 4475 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 4362 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
| 4476 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4363 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
| 4477 }; | 4364 }; |
| 4478 // Same as SSE2, but replaces: | 4365 |
| 4479 // psrlw xmm3, 8 // alpha | |
| 4480 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
| 4481 // pshuflw xmm3, xmm3, 0F5h | |
| 4482 // with.. | |
| 4483 // pshufb xmm3, kShuffleAlpha // alpha | |
| 4484 // Blend 8 pixels at a time. | 4366 // Blend 8 pixels at a time. |
| 4485 | |
| 4486 __declspec(naked) | 4367 __declspec(naked) |
| 4487 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4368 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
| 4488 uint8* dst_argb, int width) { | 4369 uint8* dst_argb, int width) { |
| 4489 __asm { | 4370 __asm { |
| 4490 push esi | 4371 push esi |
| 4491 mov eax, [esp + 4 + 4] // src_argb0 | 4372 mov eax, [esp + 4 + 4] // src_argb0 |
| 4492 mov esi, [esp + 4 + 8] // src_argb1 | 4373 mov esi, [esp + 4 + 8] // src_argb1 |
| 4493 mov edx, [esp + 4 + 12] // dst_argb | 4374 mov edx, [esp + 4 + 12] // dst_argb |
| 4494 mov ecx, [esp + 4 + 16] // width | 4375 mov ecx, [esp + 4 + 16] // width |
| 4495 pcmpeqb xmm7, xmm7 // generate constant 0x0001 | 4376 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
| (...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 4557 sub ecx, 1 | 4438 sub ecx, 1 |
| 4558 jge convertloop1 | 4439 jge convertloop1 |
| 4559 | 4440 |
| 4560 convertloop1b: | 4441 convertloop1b: |
| 4561 pop esi | 4442 pop esi |
| 4562 ret | 4443 ret |
| 4563 } | 4444 } |
| 4564 } | 4445 } |
| 4565 #endif // HAS_ARGBBLENDROW_SSSE3 | 4446 #endif // HAS_ARGBBLENDROW_SSSE3 |
| 4566 | 4447 |
| 4567 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
| 4568 // Attenuate 4 pixels at a time. | |
| 4569 __declspec(naked) | |
| 4570 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
| 4571 __asm { | |
| 4572 mov eax, [esp + 4] // src_argb0 | |
| 4573 mov edx, [esp + 8] // dst_argb | |
| 4574 mov ecx, [esp + 12] // width | |
| 4575 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | |
| 4576 pslld xmm4, 24 | |
| 4577 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff | |
| 4578 psrld xmm5, 8 | |
| 4579 | |
| 4580 convertloop: | |
| 4581 movdqu xmm0, [eax] // read 4 pixels | |
| 4582 punpcklbw xmm0, xmm0 // first 2 | |
| 4583 pshufhw xmm2, xmm0, 0FFh // 8 alpha words | |
| 4584 pshuflw xmm2, xmm2, 0FFh | |
| 4585 pmulhuw xmm0, xmm2 // rgb * a | |
| 4586 movdqu xmm1, [eax] // read 4 pixels | |
| 4587 punpckhbw xmm1, xmm1 // next 2 pixels | |
| 4588 pshufhw xmm2, xmm1, 0FFh // 8 alpha words | |
| 4589 pshuflw xmm2, xmm2, 0FFh | |
| 4590 pmulhuw xmm1, xmm2 // rgb * a | |
| 4591 movdqu xmm2, [eax] // alphas | |
| 4592 lea eax, [eax + 16] | |
| 4593 psrlw xmm0, 8 | |
| 4594 pand xmm2, xmm4 | |
| 4595 psrlw xmm1, 8 | |
| 4596 packuswb xmm0, xmm1 | |
| 4597 pand xmm0, xmm5 // keep original alphas | |
| 4598 por xmm0, xmm2 | |
| 4599 movdqu [edx], xmm0 | |
| 4600 lea edx, [edx + 16] | |
| 4601 sub ecx, 4 | |
| 4602 jg convertloop | |
| 4603 | |
| 4604 ret | |
| 4605 } | |
| 4606 } | |
| 4607 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
| 4608 | |
| 4609 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 4448 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
| 4610 // Shuffle table duplicating alpha. | 4449 // Shuffle table duplicating alpha. |
| 4611 static const uvec8 kShuffleAlpha0 = { | 4450 static const uvec8 kShuffleAlpha0 = { |
| 4612 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 4451 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
| 4613 }; | 4452 }; |
| 4614 static const uvec8 kShuffleAlpha1 = { | 4453 static const uvec8 kShuffleAlpha1 = { |
| 4615 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 4454 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
| 4616 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 4455 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
| 4617 }; | 4456 }; |
| 4618 __declspec(naked) | 4457 __declspec(naked) |
| (...skipping 2085 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6704 } | 6543 } |
| 6705 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6544 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6706 | 6545 |
| 6707 #endif // defined(_M_X64) | 6546 #endif // defined(_M_X64) |
| 6708 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6547 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6709 | 6548 |
| 6710 #ifdef __cplusplus | 6549 #ifdef __cplusplus |
| 6711 } // extern "C" | 6550 } // extern "C" |
| 6712 } // namespace libyuv | 6551 } // namespace libyuv |
| 6713 #endif | 6552 #endif |
| OLD | NEW |