Index: source/libvpx/third_party/libyuv/source/row_x86.asm |
=================================================================== |
--- source/libvpx/third_party/libyuv/source/row_x86.asm (revision 0) |
+++ source/libvpx/third_party/libyuv/source/row_x86.asm (revision 0) |
@@ -0,0 +1,164 @@
+; |
+; Copyright 2012 The LibYuv Project Authors. All rights reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+%ifdef __YASM_VERSION_ID__ |
+%if __YASM_VERSION_ID__ < 01020000h |
+%error AVX2 is supported only by yasm 1.2.0 or later. |
+%endif |
+%endif |
+%include "x86inc.asm" |
+ |
+SECTION .text |
+ |
+; cglobal's numeric arguments are: number of parameters, GPRs used, and
+; vector (mm/xmm/ymm) registers used.
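+; For example, "cglobal YUY2ToYRow, 3, 3, 3, src_yuy2, dst_y, pix" declares
+; 3 parameters, 3 GPRs and 3 vector registers, then names the parameters.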
+ |
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) |
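+;
+; YUY2 stores Y in the even bytes; UYVY stores Y in the odd bytes. The loop
+; masks with 0x00ff (YUY2) or shifts right by 8 (UYVY), then packs the words
+; back to bytes. Each iteration consumes 2 * mmsize source bytes and writes
+; mmsize Y bytes, so pix is processed in chunks of mmsize pixels.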
+ |
+%macro YUY2TOYROW 2-3 |
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix |
+%ifidn %1,YUY2 |
+ pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff |
+ psrlw m2, m2, 8 |
+%endif |
+ |
+ ALIGN 4 |
+.convertloop: |
+ mov%2 m0, [src_yuy2q] |
+ mov%2 m1, [src_yuy2q + mmsize] |
+ lea src_yuy2q, [src_yuy2q + mmsize * 2] |
+%ifidn %1,YUY2 |
+ pand m0, m0, m2 ; YUY2 even bytes are Y |
+ pand m1, m1, m2 |
+%else |
+ psrlw m0, m0, 8 ; UYVY odd bytes are Y |
+ psrlw m1, m1, 8 |
+%endif |
+ packuswb m0, m0, m1 |
+%if cpuflag(AVX2) |
+ vpermq m0, m0, 0xd8 ; pick qwords 0,2,1,3 to undo the in-lane pack
+%endif |
+ sub pixd, mmsize ; mov/lea below leave flags intact for jg
+ mov%2 [dst_yq], m0 |
+ lea dst_yq, [dst_yq + mmsize] |
+ jg .convertloop |
+ REP_RET ; x86inc: emits "rep ret" where needed for AMD branch predictors
+%endmacro |
+ |
+; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version. |
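+; Instantiate aligned (mova) and unaligned (movu) variants per instruction
+; set; INIT_MMX/INIT_XMM/INIT_YMM set mmsize and the register names.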
+INIT_MMX MMX |
+YUY2TOYROW YUY2,a, |
+YUY2TOYROW YUY2,u,_Unaligned |
+YUY2TOYROW UYVY,a, |
+YUY2TOYROW UYVY,u,_Unaligned |
+INIT_XMM SSE2 |
+YUY2TOYROW YUY2,a, |
+YUY2TOYROW YUY2,u,_Unaligned |
+YUY2TOYROW UYVY,a, |
+YUY2TOYROW UYVY,u,_Unaligned |
+INIT_YMM AVX2 |
+YUY2TOYROW YUY2,a, |
+YUY2TOYROW UYVY,a, |
+ |
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) |
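+;
+; Deinterleaves packed UV pairs: even bytes go to dst_u, odd bytes to dst_v.
+; dst_v is rewritten as an offset from dst_u ("sub dst_vq, dst_uq") so that
+; advancing dst_uq alone steps both output pointers.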
+ |
+%macro SplitUVRow 1-2 |
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix |
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff |
+ psrlw m4, m4, 8 |
+ sub dst_vq, dst_uq |
+ |
+ ALIGN 4 |
+.convertloop: |
+ mov%1 m0, [src_uvq] |
+ mov%1 m1, [src_uvq + mmsize] |
+ lea src_uvq, [src_uvq + mmsize * 2] |
+ psrlw m2, m0, 8 ; odd bytes |
+ psrlw m3, m1, 8 |
+ pand m0, m0, m4 ; even bytes |
+ pand m1, m1, m4 |
+ packuswb m0, m0, m1 |
+ packuswb m2, m2, m3 |
+%if cpuflag(AVX2) |
+ vpermq m0, m0, 0xd8 ; pick qwords 0,2,1,3 to undo the in-lane pack
+ vpermq m2, m2, 0xd8
+%endif |
+ mov%1 [dst_uq], m0 |
+ mov%1 [dst_uq + dst_vq], m2 |
+ lea dst_uq, [dst_uq + mmsize] |
+ sub pixd, mmsize |
+ jg .convertloop |
+ REP_RET |
+%endmacro |
+ |
+INIT_MMX MMX |
+SplitUVRow a, |
+SplitUVRow u,_Unaligned |
+INIT_XMM SSE2 |
+SplitUVRow a, |
+SplitUVRow u,_Unaligned |
+INIT_YMM AVX2 |
+SplitUVRow a, |
+ |
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, |
+; int width); |
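+;
+; Interleaves planar U and V rows into packed UV pairs via punpcklbw /
+; punpckhbw. src_v is addressed relative to src_u, so only src_uq advances.
+; On AVX2 the unpacks operate within each 128-bit lane, so vperm2i128
+; reassembles the lane halves into sequential order before the stores.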
+ |
+%macro MergeUVRow_ 1-2 |
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix |
+ sub src_vq, src_uq |
+ |
+ ALIGN 4 |
+.convertloop: |
+ mov%1 m0, [src_uq] |
+ mov%1 m1, [src_vq] |
+ lea src_uq, [src_uq + mmsize] |
+ punpcklbw m2, m0, m1 ; first 8 UV pairs
+ punpckhbw m0, m0, m1 ; next 8 UV pairs
+%if cpuflag(AVX2) |
+ vperm2i128 m1, m2, m0, 0x20 ; low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 ; high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1 |
+ mov%1 [dst_uvq + mmsize], m2 |
+%else |
+ mov%1 [dst_uvq], m2 |
+ mov%1 [dst_uvq + mmsize], m0 |
+%endif |
+ lea dst_uvq, [dst_uvq + mmsize * 2] |
+ sub pixd, mmsize |
+ jg .convertloop |
+ REP_RET |
+%endmacro |
+ |
+INIT_MMX MMX |
+MergeUVRow_ a, |
+MergeUVRow_ u,_Unaligned |
+INIT_XMM SSE2 |
+MergeUVRow_ a, |
+MergeUVRow_ u,_Unaligned |
+INIT_YMM AVX2 |
+MergeUVRow_ a, |
+ |