OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 3442 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3453 vmovdqu [edx], ymm0 | 3453 vmovdqu [edx], ymm0 |
3454 lea edx, [edx + 32] | 3454 lea edx, [edx + 32] |
3455 sub ecx, 32 | 3455 sub ecx, 32 |
3456 jg convertloop | 3456 jg convertloop |
3457 vzeroupper | 3457 vzeroupper |
3458 ret | 3458 ret |
3459 } | 3459 } |
3460 } | 3460 } |
3461 #endif // HAS_MIRRORROW_AVX2 | 3461 #endif // HAS_MIRRORROW_AVX2 |
3462 | 3462 |
3463 #ifdef HAS_MIRRORROW_SSE2 | |
// MirrorRow_SSE2 (present in the OLD column only; removed in NEW).
// Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
// The function is naked, so the three arguments are read straight from the
// stack at esp + 4 / + 8 / + 12.
// Each iteration loads the 16 bytes that end at src + (remaining width),
// reverses them in-register and stores them at the current dst position.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 16 bytes, so a width that is not a positive multiple of 16 makes the
// last iteration load from below src and store past dst + width --
// presumably callers only pass suitable widths; confirm.
3464 __declspec(naked) | |
3465 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { | |
3466 __asm { | |
3467 mov eax, [esp + 4] // src | |
3468 mov edx, [esp + 8] // dst | |
3469 mov ecx, [esp + 12] // width (counts down by 16 per iteration) | |
3470 | |
3471 convertloop: | |
// Load the 16 source bytes that end at src + remaining width.
3472 movdqu xmm0, [eax - 16 + ecx] | |
3473 movdqa xmm1, xmm0 // swap bytes within each 16-bit word (psllw/psrlw/por below) | |
3474 psllw xmm0, 8 | |
3475 psrlw xmm1, 8 | |
3476 por xmm0, xmm1 | |
3477 pshuflw xmm0, xmm0, 0x1b // swap words: 0x1b reverses the four words of each 64-bit half | |
3478 pshufhw xmm0, xmm0, 0x1b | |
3479 pshufd xmm0, xmm0, 0x4e // swap qwords: 0x4e exchanges the two 64-bit halves | |
// Store the reversed 16 bytes, advance dst, and loop while width remains > 0.
3480 movdqu [edx], xmm0 | |
3481 lea edx, [edx + 16] | |
3482 sub ecx, 16 | |
3483 jg convertloop | |
3484 ret | |
3485 } | |
3486 } | |
3487 #endif // HAS_MIRRORROW_SSE2 | |
3488 | |
3489 #ifdef HAS_MIRRORROW_UV_SSSE3 | 3463 #ifdef HAS_MIRRORROW_UV_SSSE3 |
3490 // Shuffle table for reversing the bytes of UV channels. | 3464 // Shuffle table for reversing the bytes of UV channels. |
3491 static const uvec8 kShuffleMirrorUV = { | 3465 static const uvec8 kShuffleMirrorUV = { |
3492 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u | 3466 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u |
3493 }; | 3467 }; |
3494 | 3468 |
3495 __declspec(naked) | 3469 __declspec(naked) |
3496 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, | 3470 void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, |
3497 int width) { | 3471 int width) { |
3498 __asm { | 3472 __asm { |
(...skipping 876 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4375 lea edx, [edx + 8] | 4349 lea edx, [edx + 8] |
4376 sub ecx, 16 | 4350 sub ecx, 16 |
4377 jg convertloop | 4351 jg convertloop |
4378 | 4352 |
4379 pop edi | 4353 pop edi |
4380 ret | 4354 ret |
4381 } | 4355 } |
4382 } | 4356 } |
4383 #endif // HAS_YUY2TOYROW_SSE2 | 4357 #endif // HAS_YUY2TOYROW_SSE2 |
4384 | 4358 |
4385 #ifdef HAS_ARGBBLENDROW_SSE2 | |
4386 // Blend 4 pixels at a time, then any remaining pixels one at a time. | |
// ARGBBlendRow_SSE2 (present in the OLD column only; removed in NEW).
// For every 32-bit pixel, alpha is the top byte of the src_argb0 pixel and
// each of the other bytes is computed as
//   dst = src_argb0 + ((src_argb1 * (256 - alpha)) >> 8)   (saturating add)
// The top (alpha) byte of dst always comes out as 255.
// src_argb0 is added without being scaled -- presumably it is already
// premultiplied by its alpha; confirm against callers.
// A width below 4 skips the 4-pixel loop; a width <= 0 stores nothing.
// esi is pushed on entry and popped before ret, which is why the argument
// offsets carry an extra "+ 4".
4387 __declspec(naked) | |
4388 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, | |
4389 uint8* dst_argb, int width) { | |
4390 __asm { | |
4391 push esi | |
4392 mov eax, [esp + 4 + 4] // src_argb0 | |
4393 mov esi, [esp + 4 + 8] // src_argb1 | |
4394 mov edx, [esp + 4 + 12] // dst_argb | |
4395 mov ecx, [esp + 4 + 16] // width | |
4396 pcmpeqb xmm7, xmm7 // generate constant 0x0001 in every word | |
4397 psrlw xmm7, 15 | |
4398 pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff | |
4399 psrlw xmm6, 8 | |
4400 pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 | |
4401 psllw xmm5, 8 | |
4402 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | |
4403 pslld xmm4, 24 | |
4404 sub ecx, 4 | |
4405 jl convertloop4b // less than 4 pixels? | |
4406 | |
4407 // 4 pixel loop. | |
4408 convertloop4: | |
4409 movdqu xmm3, [eax] // src argb (4 pixels of src_argb0) | |
4410 lea eax, [eax + 16] | |
4411 movdqa xmm0, xmm3 // src argb, kept as the unscaled term of the sum | |
4412 pxor xmm3, xmm4 // invert the alpha byte: 255 - alpha | |
4413 movdqu xmm2, [esi] // src_argb1 pixels (masked to _r_b below) | |
4414 psrlw xmm3, 8 // move 255 - alpha into the low byte of its word | |
4415 pshufhw xmm3, xmm3, 0F5h // 0F5h copies words 1 and 3 over words 0 and 2: 8 (255 - alpha) words | |
4416 pshuflw xmm3, xmm3, 0F5h | |
4417 pand xmm2, xmm6 // _r_b | |
4418 paddw xmm3, xmm7 // 256 - alpha | |
4419 pmullw xmm2, xmm3 // _r_b * (256 - alpha) | |
4420 movdqu xmm1, [esi] // src_argb1 pixels again (shifted to _a_g below) | |
4421 lea esi, [esi + 16] | |
4422 psrlw xmm1, 8 // _a_g | |
4423 por xmm0, xmm4 // set alpha to 255 | |
4424 pmullw xmm1, xmm3 // _a_g * (256 - alpha) | |
4425 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
4426 paddusb xmm0, xmm2 // src argb + scaled _r_b, saturating | |
4427 pand xmm1, xmm5 // a_g_ keep the high byte of each product: 8 bits again, already in place | |
4428 paddusb xmm0, xmm1 // src argb + scaled a_g_, saturating | |
4429 movdqu [edx], xmm0 | |
4430 lea edx, [edx + 16] | |
4431 sub ecx, 4 | |
4432 jge convertloop4 | |
4433 | |
// ecx is (pixels left - 4) here; adding 3 leaves (pixels left - 1), which is
// negative exactly when no pixels remain.
4434 convertloop4b: | |
4435 add ecx, 4 - 1 | |
4436 jl convertloop1b | |
4437 | |
4438 // 1 pixel loop. Same arithmetic as the 4 pixel loop, using 4-byte movd loads and stores. | |
4439 convertloop1: | |
4440 movd xmm3, [eax] // src argb (1 pixel of src_argb0) | |
4441 lea eax, [eax + 4] | |
4442 movdqa xmm0, xmm3 // src argb, kept as the unscaled term of the sum | |
4443 pxor xmm3, xmm4 // invert the alpha byte: 255 - alpha | |
4444 movd xmm2, [esi] // src_argb1 pixel (masked to _r_b below) | |
4445 psrlw xmm3, 8 // move 255 - alpha into the low byte of its word | |
4446 pshufhw xmm3, xmm3, 0F5h // 0F5h copies words 1 and 3 over words 0 and 2: 8 (255 - alpha) words | |
4447 pshuflw xmm3, xmm3, 0F5h | |
4448 pand xmm2, xmm6 // _r_b | |
4449 paddw xmm3, xmm7 // 256 - alpha | |
4450 pmullw xmm2, xmm3 // _r_b * (256 - alpha) | |
4451 movd xmm1, [esi] // src_argb1 pixel again (shifted to _a_g below) | |
4452 lea esi, [esi + 4] | |
4453 psrlw xmm1, 8 // _a_g | |
4454 por xmm0, xmm4 // set alpha to 255 | |
4455 pmullw xmm1, xmm3 // _a_g * (256 - alpha) | |
4456 psrlw xmm2, 8 // _r_b convert to 8 bits again | |
4457 paddusb xmm0, xmm2 // src argb + scaled _r_b, saturating | |
4458 pand xmm1, xmm5 // a_g_ keep the high byte of each product: 8 bits again, already in place | |
4459 paddusb xmm0, xmm1 // src argb + scaled a_g_, saturating | |
4460 movd [edx], xmm0 | |
4461 lea edx, [edx + 4] | |
4462 sub ecx, 1 | |
4463 jge convertloop1 | |
4464 | |
4465 convertloop1b: | |
4466 pop esi | |
4467 ret | |
4468 } | |
4469 } | |
4470 #endif // HAS_ARGBBLENDROW_SSE2 | |
4471 | |
4472 #ifdef HAS_ARGBBLENDROW_SSSE3 | 4359 #ifdef HAS_ARGBBLENDROW_SSSE3 |
4473 // Shuffle table for isolating alpha. | 4360 // Shuffle table for isolating alpha. |
4474 static const uvec8 kShuffleAlpha = { | 4361 static const uvec8 kShuffleAlpha = { |
4475 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, | 4362 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, |
4476 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 | 4363 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 |
4477 }; | 4364 }; |
4478 // Same as SSE2, but replaces: | 4365 |
4479 // psrlw xmm3, 8 // alpha | |
4480 // pshufhw xmm3, xmm3, 0F5h // 8 alpha words | |
4481 // pshuflw xmm3, xmm3, 0F5h | |
4482 // with.. | |
4483 // pshufb xmm3, kShuffleAlpha // alpha | |
4484 // Blend 8 pixels at a time. | 4366 // Blend 8 pixels at a time. |
4485 | |
4486 __declspec(naked) | 4367 __declspec(naked) |
4487 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, | 4368 void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, |
4488 uint8* dst_argb, int width) { | 4369 uint8* dst_argb, int width) { |
4489 __asm { | 4370 __asm { |
4490 push esi | 4371 push esi |
4491 mov eax, [esp + 4 + 4] // src_argb0 | 4372 mov eax, [esp + 4 + 4] // src_argb0 |
4492 mov esi, [esp + 4 + 8] // src_argb1 | 4373 mov esi, [esp + 4 + 8] // src_argb1 |
4493 mov edx, [esp + 4 + 12] // dst_argb | 4374 mov edx, [esp + 4 + 12] // dst_argb |
4494 mov ecx, [esp + 4 + 16] // width | 4375 mov ecx, [esp + 4 + 16] // width |
4495 pcmpeqb xmm7, xmm7 // generate constant 0x0001 | 4376 pcmpeqb xmm7, xmm7 // generate constant 0x0001 |
(...skipping 61 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4557 sub ecx, 1 | 4438 sub ecx, 1 |
4558 jge convertloop1 | 4439 jge convertloop1 |
4559 | 4440 |
4560 convertloop1b: | 4441 convertloop1b: |
4561 pop esi | 4442 pop esi |
4562 ret | 4443 ret |
4563 } | 4444 } |
4564 } | 4445 } |
4565 #endif // HAS_ARGBBLENDROW_SSSE3 | 4446 #endif // HAS_ARGBBLENDROW_SSSE3 |
4566 | 4447 |
4567 #ifdef HAS_ARGBATTENUATEROW_SSE2 | |
4568 // Attenuate 4 pixels at a time. | |
// ARGBAttenuateRow_SSE2 (present in the OLD column only; removed in NEW).
// Scales the three low bytes of every 32-bit pixel by that pixel's top
// (alpha) byte and passes the alpha byte through unchanged.
// For a byte v and alpha a the code evaluates
//   ((v * 0x0101) * (a * 0x0101)) >> 24
// (punpck doubles each byte into a word, pmulhuw keeps the high 16 bits of
// the product, psrlw drops 8 more).
// NOTE(review): the loop body runs before width is tested and always handles
// 4 pixels (16 bytes), so a width that is not a positive multiple of 4 reads
// and writes past the requested pixels -- presumably callers only pass
// suitable widths; confirm.
4569 __declspec(naked) | |
4570 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { | |
4571 __asm { | |
4572 mov eax, [esp + 4] // src_argb | |
4573 mov edx, [esp + 8] // dst_argb | |
4574 mov ecx, [esp + 12] // width | |
4575 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 | |
4576 pslld xmm4, 24 | |
4577 pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff | |
4578 psrld xmm5, 8 | |
4579 | |
4580 convertloop: | |
4581 movdqu xmm0, [eax] // read 4 pixels | |
4582 punpcklbw xmm0, xmm0 // first 2 pixels, each byte doubled into a word (v * 0x0101) | |
4583 pshufhw xmm2, xmm0, 0FFh // 0FFh broadcasts each pixel's alpha word: 8 alpha words | |
4584 pshuflw xmm2, xmm2, 0FFh | |
4585 pmulhuw xmm0, xmm2 // rgb * a, keeping the high 16 bits of each product | |
4586 movdqu xmm1, [eax] // read the same 4 pixels again | |
4587 punpckhbw xmm1, xmm1 // next 2 pixels, each byte doubled into a word | |
4588 pshufhw xmm2, xmm1, 0FFh // 0FFh broadcasts each pixel's alpha word: 8 alpha words | |
4589 pshuflw xmm2, xmm2, 0FFh | |
4590 pmulhuw xmm1, xmm2 // rgb * a, keeping the high 16 bits of each product | |
4591 movdqu xmm2, [eax] // reload the pixels to recover the original alphas | |
4592 lea eax, [eax + 16] | |
4593 psrlw xmm0, 8 | |
4594 pand xmm2, xmm4 | |
4595 psrlw xmm1, 8 | |
4596 packuswb xmm0, xmm1 | |
4597 pand xmm0, xmm5 // clear the computed alpha byte; the por below puts the original alphas back | |
4598 por xmm0, xmm2 | |
4599 movdqu [edx], xmm0 | |
4600 lea edx, [edx + 16] | |
4601 sub ecx, 4 | |
4602 jg convertloop | |
4603 | |
4604 ret | |
4605 } | |
4606 } | |
4607 #endif // HAS_ARGBATTENUATEROW_SSE2 | |
4608 | |
4609 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 4448 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
4610 // Shuffle table duplicating alpha. | 4449 // Shuffle table duplicating alpha. |
4611 static const uvec8 kShuffleAlpha0 = { | 4450 static const uvec8 kShuffleAlpha0 = { |
4612 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, | 4451 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, |
4613 }; | 4452 }; |
4614 static const uvec8 kShuffleAlpha1 = { | 4453 static const uvec8 kShuffleAlpha1 = { |
4615 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, | 4454 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, |
4616 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, | 4455 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, |
4617 }; | 4456 }; |
4618 __declspec(naked) | 4457 __declspec(naked) |
(...skipping 2085 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6704 } | 6543 } |
6705 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6544 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6706 | 6545 |
6707 #endif // defined(_M_X64) | 6546 #endif // defined(_M_X64) |
6708 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6547 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6709 | 6548 |
6710 #ifdef __cplusplus | 6549 #ifdef __cplusplus |
6711 } // extern "C" | 6550 } // extern "C" |
6712 } // namespace libyuv | 6551 } // namespace libyuv |
6713 #endif | 6552 #endif |
OLD | NEW |