OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 3451 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3462 "+r"(width) // %3 | 3462 "+r"(width) // %3 |
3463 : "m"(kShuffleAlpha) // %4 | 3463 : "m"(kShuffleAlpha) // %4 |
3464 : "memory", "cc" | 3464 : "memory", "cc" |
3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 3465 , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3466 ); | 3466 ); |
3467 } | 3467 } |
3468 #endif // HAS_ARGBBLENDROW_SSSE3 | 3468 #endif // HAS_ARGBBLENDROW_SSSE3 |
3469 | 3469 |
3470 #ifdef HAS_BLENDPLANEROW_SSSE3 | 3470 #ifdef HAS_BLENDPLANEROW_SSSE3 |
3471 // Blend 8 pixels at a time. | 3471 // Blend 8 pixels at a time. |
3472 // =((G2*C2)+(H2*(D2))+32768+127)/256 | 3472 // unsigned version of math |
| 3473 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 3474 // signed version of math |
| 3475 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
3473 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, | 3476 void BlendPlaneRow_SSSE3(const uint8* src0, const uint8* src1, |
3474 const uint8* alpha, uint8* dst, int width) { | 3477 const uint8* alpha, uint8* dst, int width) { |
3475 asm volatile ( | 3478 asm volatile ( |
3476 "pcmpeqb %%xmm5,%%xmm5 \n" | 3479 "pcmpeqb %%xmm5,%%xmm5 \n" |
3477 "psllw $0x8,%%xmm5 \n" | 3480 "psllw $0x8,%%xmm5 \n" |
3478 "mov $0x80808080,%%eax \n" | 3481 "mov $0x80808080,%%eax \n" |
3479 "movd %%eax,%%xmm6 \n" | 3482 "movd %%eax,%%xmm6 \n" |
3480 "pshufd $0x0,%%xmm6,%%xmm6 \n" | 3483 "pshufd $0x0,%%xmm6,%%xmm6 \n" |
3481 "mov $0x807f807f,%%eax \n" | 3484 "mov $0x807f807f,%%eax \n" |
3482 "movd %%eax,%%xmm7 \n" | 3485 "movd %%eax,%%xmm7 \n" |
(...skipping 24 matching lines...) Expand all Loading... |
3507 "+r"(src1), // %1 | 3510 "+r"(src1), // %1 |
3508 "+r"(alpha), // %2 | 3511 "+r"(alpha), // %2 |
3509 "+r"(dst), // %3 | 3512 "+r"(dst), // %3 |
3510 "+r"(width) // %4 | 3513 "+r"(width) // %4 |
3511 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | 3514 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" |
3512 ); | 3515 ); |
3513 } | 3516 } |
3514 #endif // HAS_BLENDPLANEROW_SSSE3 | 3517 #endif // HAS_BLENDPLANEROW_SSSE3 |
3515 | 3518 |
3516 #ifdef HAS_BLENDPLANEROW_AVX2 | 3519 #ifdef HAS_BLENDPLANEROW_AVX2 |
3517 // Blend 16 pixels at a time. | 3520 // Blend 32 pixels at a time. |
3518 // =((G2*C2)+(H2*(D2))+32768+127)/256 | 3521 // unsigned version of math |
| 3522 // =((A2*C2)+(B2*(255-C2))+255)/256 |
| 3523 // signed version of math |
| 3524 // =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256 |
3519 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, | 3525 void BlendPlaneRow_AVX2(const uint8* src0, const uint8* src1, |
3520 const uint8* alpha, uint8* dst, int width) { | 3526 const uint8* alpha, uint8* dst, int width) { |
3521 asm volatile ( | 3527 asm volatile ( |
3522 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 3528 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
3523 "vpsllw $0x8,%%ymm5,%%ymm5 \n" | 3529 "vpsllw $0x8,%%ymm5,%%ymm5 \n" |
3524 "mov $0x80808080,%%eax \n" | 3530 "mov $0x80808080,%%eax \n" |
3525 "vmovd %%eax,%%xmm6 \n" | 3531 "vmovd %%eax,%%xmm6 \n" |
3526 "vbroadcastss %%xmm6,%%ymm6 \n" | 3532 "vbroadcastss %%xmm6,%%ymm6 \n" |
3527 "mov $0x807f807f,%%eax \n" | 3533 "mov $0x807f807f,%%eax \n" |
3528 "vmovd %%eax,%%xmm7 \n" | 3534 "vmovd %%eax,%%xmm7 \n" |
3529 "vbroadcastss %%xmm7,%%ymm7 \n" | 3535 "vbroadcastss %%xmm7,%%ymm7 \n" |
3530 "sub %2,%0 \n" | 3536 "sub %2,%0 \n" |
3531 "sub %2,%1 \n" | 3537 "sub %2,%1 \n" |
3532 "sub %2,%3 \n" | 3538 "sub %2,%3 \n" |
3533 | 3539 |
3534 // 16 pixel loop. | 3540 // 32 pixel loop. |
3535 LABELALIGN | 3541 LABELALIGN |
3536 "1: \n" | 3542 "1: \n" |
3537 "vmovdqu (%2),%%xmm0 \n" | 3543 "vmovdqu (%2),%%ymm0 \n" |
3538 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3544 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n" |
3539 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" | 3545 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" |
| 3546 "vpxor %%ymm5,%%ymm3,%%ymm3 \n" |
3540 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" | 3547 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" |
3541 "vmovdqu (%0,%2,1),%%xmm1 \n" | 3548 "vmovdqu (%0,%2,1),%%ymm1 \n" |
3542 "vmovdqu (%1,%2,1),%%xmm2 \n" | 3549 "vmovdqu (%1,%2,1),%%ymm2 \n" |
3543 "vpermq $0xd8,%%ymm1,%%ymm1 \n" | 3550 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n" |
3544 "vpermq $0xd8,%%ymm2,%%ymm2 \n" | |
3545 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" | 3551 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" |
 | 3552 "vpsubb %%ymm6,%%ymm4,%%ymm4 \n" |
3546 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" | 3553 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" |
| 3554 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n" |
3547 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" | 3555 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" |
| 3556 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" |
3548 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" | 3557 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" |
| 3558 "vpsrlw $0x8,%%ymm3,%%ymm3 \n" |
3549 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" | 3559 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" |
3550 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" | 3560 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n" |
3551 "vpermq $0xd8,%%ymm0,%%ymm0 \n" | 3561 "vmovdqu %%ymm0,(%3,%2,1) \n" |
3552 "vmovdqu %%xmm0,(%3,%2,1) \n" | 3562 "lea 0x20(%2),%2 \n" |
3553 "lea 0x10(%2),%2 \n" | 3563 "sub $0x20,%4 \n" |
3554 "sub $0x10,%4 \n" | |
3555 "jg 1b \n" | 3564 "jg 1b \n" |
3556 "vzeroupper \n" | 3565 "vzeroupper \n" |
3557 : "+r"(src0), // %0 | 3566 : "+r"(src0), // %0 |
3558 "+r"(src1), // %1 | 3567 "+r"(src1), // %1 |
3559 "+r"(alpha), // %2 | 3568 "+r"(alpha), // %2 |
3560 "+r"(dst), // %3 | 3569 "+r"(dst), // %3 |
3561 "+r"(width) // %4 | 3570 "+r"(width) // %4 |
3562 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" | 3571 :: "memory", "cc", "eax", |
| 3572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
3563 ); | 3573 ); |
3564 } | 3574 } |
3565 #endif // HAS_BLENDPLANEROW_AVX2 | 3575 #endif // HAS_BLENDPLANEROW_AVX2 |
3566 | 3576 |
3567 #ifdef HAS_ARGBATTENUATEROW_SSSE3 | 3577 #ifdef HAS_ARGBATTENUATEROW_SSSE3 |
3568 // Shuffle table duplicating alpha | 3578 // Shuffle table duplicating alpha |
3569 static uvec8 kShuffleAlpha0 = { | 3579 static uvec8 kShuffleAlpha0 = { |
3570 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u | 3580 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u |
3571 }; | 3581 }; |
3572 static uvec8 kShuffleAlpha1 = { | 3582 static uvec8 kShuffleAlpha1 = { |
(...skipping 2014 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5587 ); | 5597 ); |
5588 } | 5598 } |
5589 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5599 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5590 | 5600 |
5591 #endif // defined(__x86_64__) || defined(__i386__) | 5601 #endif // defined(__x86_64__) || defined(__i386__) |
5592 | 5602 |
5593 #ifdef __cplusplus | 5603 #ifdef __cplusplus |
5594 } // extern "C" | 5604 } // extern "C" |
5595 } // namespace libyuv | 5605 } // namespace libyuv |
5596 #endif | 5606 #endif |
OLD | NEW |