Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(148)

Side by Side Diff: source/row_gcc.cc

Issue 1505673003: Optimize yuv alpha blend AVX2 code to do 32 pixels at a time. (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: gcc port of AVX2 that does 32 pixels. Created 5 years ago.
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // VERSION 2 1 // VERSION 2
2 /* 2 /*
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
4 * 4 *
5 * Use of this source code is governed by a BSD-style license 5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source 6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found 7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may 8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree. 9 * be found in the AUTHORS file in the root of the source tree.
10 */ 10 */
(...skipping 3513 matching lines...) Expand 10 before | Expand all | Expand 10 after
3524 "mov $0x80808080,%%eax \n" 3524 "mov $0x80808080,%%eax \n"
3525 "vmovd %%eax,%%xmm6 \n" 3525 "vmovd %%eax,%%xmm6 \n"
3526 "vbroadcastss %%xmm6,%%ymm6 \n" 3526 "vbroadcastss %%xmm6,%%ymm6 \n"
3527 "mov $0x807f807f,%%eax \n" 3527 "mov $0x807f807f,%%eax \n"
3528 "vmovd %%eax,%%xmm7 \n" 3528 "vmovd %%eax,%%xmm7 \n"
3529 "vbroadcastss %%xmm7,%%ymm7 \n" 3529 "vbroadcastss %%xmm7,%%ymm7 \n"
3530 "sub %2,%0 \n" 3530 "sub %2,%0 \n"
3531 "sub %2,%1 \n" 3531 "sub %2,%1 \n"
3532 "sub %2,%3 \n" 3532 "sub %2,%3 \n"
3533 3533
3534 // 16 pixel loop. 3534 // 32 pixel loop.
3535 LABELALIGN 3535 LABELALIGN
3536 "1: \n" 3536 "1: \n"
3537 "vmovdqu (%2),%%xmm0 \n" 3537 "vmovdqu (%2),%%ymm0 \n"
3538 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3538 "vpunpckhbw %%ymm0,%%ymm0,%%ymm3 \n"
3539 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" 3539 "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n"
3540 "vpxor %%ymm5,%%ymm3,%%ymm3 \n"
3540 "vpxor %%ymm5,%%ymm0,%%ymm0 \n" 3541 "vpxor %%ymm5,%%ymm0,%%ymm0 \n"
3541 "vmovdqu (%0,%2,1),%%xmm1 \n" 3542 "vmovdqu (%0,%2,1),%%ymm1 \n"
3542 "vmovdqu (%1,%2,1),%%xmm2 \n" 3543 "vmovdqu (%1,%2,1),%%ymm2 \n"
3543 "vpermq $0xd8,%%ymm1,%%ymm1 \n" 3544 "vpunpckhbw %%ymm2,%%ymm1,%%ymm4 \n"
3544 "vpermq $0xd8,%%ymm2,%%ymm2 \n"
3545 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n" 3545 "vpunpcklbw %%ymm2,%%ymm1,%%ymm1 \n"
3546 "vpsubb %%ymm6,%%ymm1,%%ymm4 \n"
3546 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n" 3547 "vpsubb %%ymm6,%%ymm1,%%ymm1 \n"
3548 "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
3547 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n" 3549 "vpmaddubsw %%ymm1,%%ymm0,%%ymm0 \n"
3550 "vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
3548 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n" 3551 "vpaddw %%ymm7,%%ymm0,%%ymm0 \n"
3552 "vpsrlw $0x8,%%ymm3,%%ymm3 \n"
3549 "vpsrlw $0x8,%%ymm0,%%ymm0 \n" 3553 "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
3550 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" 3554 "vpackuswb %%ymm3,%%ymm0,%%ymm0 \n"
3551 "vpermq $0xd8,%%ymm0,%%ymm0 \n" 3555 "vmovdqu %%ymm0,(%3,%2,1) \n"
3552 "vmovdqu %%xmm0,(%3,%2,1) \n" 3556 "lea 0x20(%2),%2 \n"
3553 "lea 0x10(%2),%2 \n" 3557 "sub $0x20,%4 \n"
3554 "sub $0x10,%4 \n"
3555 "jg 1b \n" 3558 "jg 1b \n"
3556 "vzeroupper \n" 3559 "vzeroupper \n"
3557 : "+r"(src0), // %0 3560 : "+r"(src0), // %0
3558 "+r"(src1), // %1 3561 "+r"(src1), // %1
3559 "+r"(alpha), // %2 3562 "+r"(alpha), // %2
3560 "+r"(dst), // %3 3563 "+r"(dst), // %3
3561 "+r"(width) // %4 3564 "+r"(width) // %4
3562 :: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7" 3565 :: "memory", "cc", "eax",
3566 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3563 ); 3567 );
3564 } 3568 }
3565 #endif // HAS_BLENDPLANEROW_AVX2 3569 #endif // HAS_BLENDPLANEROW_AVX2
3566 3570
3567 #ifdef HAS_ARGBATTENUATEROW_SSSE3 3571 #ifdef HAS_ARGBATTENUATEROW_SSSE3
3568 // Shuffle table duplicating alpha 3572 // Shuffle table duplicating alpha
3569 static uvec8 kShuffleAlpha0 = { 3573 static uvec8 kShuffleAlpha0 = {
3570 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u 3574 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u
3571 }; 3575 };
3572 static uvec8 kShuffleAlpha1 = { 3576 static uvec8 kShuffleAlpha1 = {
(...skipping 2014 matching lines...) Expand 10 before | Expand all | Expand 10 after
5587 ); 5591 );
5588 } 5592 }
5589 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 5593 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5590 5594
5591 #endif // defined(__x86_64__) || defined(__i386__) 5595 #endif // defined(__x86_64__) || defined(__i386__)
5592 5596
5593 #ifdef __cplusplus 5597 #ifdef __cplusplus
5594 } // extern "C" 5598 } // extern "C"
5595 } // namespace libyuv 5599 } // namespace libyuv
5596 #endif 5600 #endif
OLDNEW
« source/cpu_id.cc ('K') | « source/planar_functions.cc ('k') | source/row_win.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698