Index: source/libvpx/vp9/common/x86/vp9_idctllm_sse2.asm |
=================================================================== |
--- source/libvpx/vp9/common/x86/vp9_idctllm_sse2.asm (revision 0) |
+++ source/libvpx/vp9/common/x86/vp9_idctllm_sse2.asm (revision 0) |
@@ -0,0 +1,712 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+;void vp9_idct_dequant_0_2x_sse2 |
+; ( |
+; short *qcoeff - 0 |
+; short *dequant - 1 |
+; unsigned char *pre - 2 |
+; unsigned char *dst - 3 |
+; int dst_stride - 4 |
+; int blk_stride - 5 |
+; ) |
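+;
+; Used when each of the two adjacent 4x4 blocks has at most a dc
+; coefficient (compare the *_dc_0 variant below): the two quantized dc
+; values at [rax] and [rax+32] are multiplied by dequant[0], rounded and
+; shifted ((dc + 4) >> 3), broadcast, and added to the 8-pixel-wide
+; prediction rows that cover both blocks.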
+ |
+global sym(vp9_idct_dequant_0_2x_sse2) |
+sym(vp9_idct_dequant_0_2x_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 6 |
+ GET_GOT rbx |
+ ; end prolog |
+ |
+ mov rdx, arg(1) ; dequant |
+ mov rax, arg(0) ; qcoeff |
+ |
+ movd xmm4, [rax] |
+ movd xmm5, [rdx] |
+ |
+ pinsrw xmm4, [rax+32], 4 |
+ pinsrw xmm5, [rdx], 4 |
+ |
+ pmullw xmm4, xmm5 |
+ |
+    ; Zero out xmm5 for use in unpacking
+ pxor xmm5, xmm5 |
+ |
+ ; clear coeffs |
+ movd [rax], xmm5 |
+ movd [rax+32], xmm5 |
+;pshufb |
+ pshuflw xmm4, xmm4, 00000000b |
+ pshufhw xmm4, xmm4, 00000000b |
+ |
+ mov rax, arg(2) ; pre |
+ paddw xmm4, [GLOBAL(fours)] |
+ |
+ movsxd rcx, dword ptr arg(5) ; blk_stride |
+ psraw xmm4, 3 |
+ |
+ movq xmm0, [rax] |
+ movq xmm1, [rax+rcx] |
+ movq xmm2, [rax+2*rcx] |
+ lea rcx, [3*rcx] |
+ movq xmm3, [rax+rcx] |
+ |
+ punpcklbw xmm0, xmm5 |
+ punpcklbw xmm1, xmm5 |
+ punpcklbw xmm2, xmm5 |
+ punpcklbw xmm3, xmm5 |
+ |
+ mov rax, arg(3) ; dst |
+ movsxd rdx, dword ptr arg(4) ; dst_stride |
+ |
+ ; Add to predict buffer |
+ paddw xmm0, xmm4 |
+ paddw xmm1, xmm4 |
+ paddw xmm2, xmm4 |
+ paddw xmm3, xmm4 |
+ |
+ ; pack up before storing |
+ packuswb xmm0, xmm5 |
+ packuswb xmm1, xmm5 |
+ packuswb xmm2, xmm5 |
+ packuswb xmm3, xmm5 |
+ |
+ ; store blocks back out |
+ movq [rax], xmm0 |
+ movq [rax + rdx], xmm1 |
+ |
+ lea rax, [rax + 2*rdx] |
+ |
+ movq [rax], xmm2 |
+ movq [rax + rdx], xmm3 |
+ |
+ ; begin epilog |
+ RESTORE_GOT |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
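+;void vp9_idct_dequant_full_2x_sse2
+; (
+;           short *qcoeff - 0
+;           short *dequant - 1
+;           unsigned char *pre - 2
+;           unsigned char *dst - 3
+;           int dst_stride - 4
+;           int blk_stride - 5
+; )
+; (prototype reconstructed from the arg() references in the body; only
+;  the arguments actually used are listed)
+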
+global sym(vp9_idct_dequant_full_2x_sse2) |
+sym(vp9_idct_dequant_full_2x_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 7 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+    ; general case: the blocks may carry more than a dc coefficient,
+    ; so the full qcoeff buffer is dequantized and inverse transformed
+ mov rax, arg(0) ; qcoeff |
+ mov rsi, arg(2) ; pre |
+ mov rdi, arg(3) ; dst |
+ movsxd rcx, dword ptr arg(5) ; blk_stride |
+ |
+    ; Zero out xmm7 for use in unpacking
+ pxor xmm7, xmm7 |
+ |
+ mov rdx, arg(1) ; dequant |
+ |
+    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax] |
+ movdqa xmm2, [rax+16] |
+ movdqa xmm1, [rax+32] |
+ movdqa xmm3, [rax+48] |
+ |
+ ; Clear out coeffs |
+ movdqa [rax], xmm7 |
+ movdqa [rax+16], xmm7 |
+ movdqa [rax+32], xmm7 |
+ movdqa [rax+48], xmm7 |
+ |
+ ; dequantize qcoeff buffer |
+ pmullw xmm0, [rdx] |
+ pmullw xmm2, [rdx+16] |
+ pmullw xmm1, [rdx] |
+ pmullw xmm3, [rdx+16] |
+ |
+ ; repack so block 0 row x and block 1 row x are together |
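+    ; qcoeff holds the two 4x4 blocks back to back (block 0 at [rax],
+    ; block 1 at [rax+32]); the dword unpacks plus pshufd below leave
+    ; row N of both blocks side by side in one xmm register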
+ movdqa xmm4, xmm0 |
+ punpckldq xmm0, xmm1 |
+ punpckhdq xmm4, xmm1 |
+ |
+ pshufd xmm0, xmm0, 11011000b |
+ pshufd xmm1, xmm4, 11011000b |
+ |
+ movdqa xmm4, xmm2 |
+ punpckldq xmm2, xmm3 |
+ punpckhdq xmm4, xmm3 |
+ |
+ pshufd xmm2, xmm2, 11011000b |
+ pshufd xmm3, xmm4, 11011000b |
+ |
+ ; first pass |
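+    ; xmm0..xmm3 hold rows 0..3 of both blocks.  The two passes below are
+    ; the usual 4x4 inverse DCT butterfly: pmulhw keeps only the high 16
+    ; bits of each product, and the paddw that follows folds the input
+    ; back in, giving x * sqrt(2) * sin(pi/8) (x_s1sqr2 does not fit in a
+    ; signed word) and x * sqrt(2) * cos(pi/8) (x_c1sqr2less1 stores the
+    ; constant minus one).  The second pass adds 4 and shifts right by 3
+    ; before the prediction is added in.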
+ psubw xmm0, xmm2 ; b1 = 0-2 |
+ paddw xmm2, xmm2 ; |
+ |
+ movdqa xmm5, xmm1 |
+ paddw xmm2, xmm0 ; a1 = 0+2 |
+ |
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)] |
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) |
+ |
+ movdqa xmm7, xmm3 |
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] |
+ |
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) |
+ psubw xmm7, xmm5 ; c1 |
+ |
+ movdqa xmm5, xmm1 |
+ movdqa xmm4, xmm3 |
+ |
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] |
+ paddw xmm5, xmm1 |
+ |
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)] |
+ paddw xmm3, xmm4 |
+ |
+ paddw xmm3, xmm5 ; d1 |
+ movdqa xmm6, xmm2 ; a1 |
+ |
+ movdqa xmm4, xmm0 ; b1 |
+ paddw xmm2, xmm3 ;0 |
+ |
+ paddw xmm4, xmm7 ;1 |
+ psubw xmm0, xmm7 ;2 |
+ |
+ psubw xmm6, xmm3 ;3 |
+ |
+ ; transpose for the second pass |
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 |
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 |
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 |
+ |
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 |
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 |
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 |
+ |
+ |
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 |
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 |
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 |
+ |
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 |
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 |
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 |
+ |
+ |
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 |
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 |
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 |
+ |
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 |
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 |
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 |
+ |
+ pshufd xmm0, xmm2, 11011000b |
+ pshufd xmm2, xmm1, 11011000b |
+ |
+ pshufd xmm1, xmm5, 11011000b |
+ pshufd xmm3, xmm7, 11011000b |
+ |
+ ; second pass |
+ psubw xmm0, xmm2 ; b1 = 0-2 |
+ paddw xmm2, xmm2 |
+ |
+ movdqa xmm5, xmm1 |
+ paddw xmm2, xmm0 ; a1 = 0+2 |
+ |
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)] |
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) |
+ |
+ movdqa xmm7, xmm3 |
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] |
+ |
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) |
+ psubw xmm7, xmm5 ; c1 |
+ |
+ movdqa xmm5, xmm1 |
+ movdqa xmm4, xmm3 |
+ |
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] |
+ paddw xmm5, xmm1 |
+ |
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)] |
+ paddw xmm3, xmm4 |
+ |
+ paddw xmm3, xmm5 ; d1 |
+ paddw xmm0, [GLOBAL(fours)] |
+ |
+ paddw xmm2, [GLOBAL(fours)] |
+ movdqa xmm6, xmm2 ; a1 |
+ |
+ movdqa xmm4, xmm0 ; b1 |
+ paddw xmm2, xmm3 ;0 |
+ |
+ paddw xmm4, xmm7 ;1 |
+ psubw xmm0, xmm7 ;2 |
+ |
+ psubw xmm6, xmm3 ;3 |
+ psraw xmm2, 3 |
+ |
+ psraw xmm0, 3 |
+ psraw xmm4, 3 |
+ |
+ psraw xmm6, 3 |
+ |
+ ; transpose to save |
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 |
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 |
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 |
+ |
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 |
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 |
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 |
+ |
+ |
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 |
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 |
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 |
+ |
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 |
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 |
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 |
+ |
+ |
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 |
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 |
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 |
+ |
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 |
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 |
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 |
+ |
+ pshufd xmm0, xmm2, 11011000b |
+ pshufd xmm2, xmm1, 11011000b |
+ |
+ pshufd xmm1, xmm5, 11011000b |
+ pshufd xmm3, xmm7, 11011000b |
+ |
+ pxor xmm7, xmm7 |
+ |
+ ; Load up predict blocks |
+ movq xmm4, [rsi] |
+ movq xmm5, [rsi+rcx] |
+ |
+ punpcklbw xmm4, xmm7 |
+ punpcklbw xmm5, xmm7 |
+ |
+ paddw xmm0, xmm4 |
+ paddw xmm1, xmm5 |
+ |
+ movq xmm4, [rsi+2*rcx] |
+ lea rcx, [3*rcx] |
+ movq xmm5, [rsi+rcx] |
+ |
+ punpcklbw xmm4, xmm7 |
+ punpcklbw xmm5, xmm7 |
+ |
+ paddw xmm2, xmm4 |
+ paddw xmm3, xmm5 |
+ |
+.finish: |
+ |
+ ; pack up before storing |
+ packuswb xmm0, xmm7 |
+ packuswb xmm1, xmm7 |
+ packuswb xmm2, xmm7 |
+ packuswb xmm3, xmm7 |
+ |
+ ; Load destination stride before writing out, |
+ ; doesn't need to persist |
+ movsxd rdx, dword ptr arg(4) ; dst_stride |
+ |
+ ; store blocks back out |
+ movq [rdi], xmm0 |
+ movq [rdi + rdx], xmm1 |
+ |
+ lea rdi, [rdi + 2*rdx] |
+ |
+ movq [rdi], xmm2 |
+ movq [rdi + rdx], xmm3 |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+;void vp9_idct_dequant_dc_0_2x_sse2 |
+; ( |
+; short *qcoeff - 0 |
+; short *dequant - 1 |
+; unsigned char *pre - 2 |
+; unsigned char *dst - 3 |
+; int dst_stride - 4 |
+; short *dc - 5 |
+; ) |
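+;
+; Like vp9_idct_dequant_0_2x_sse2 above, but the dc values arrive already
+; dequantized through *dc (no dequant multiply is done here); each one is
+; rounded and shifted ((dc + 4) >> 3), broadcast, and added to the
+; prediction rows, which are read at fixed 16-byte offsets.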
+global sym(vp9_idct_dequant_dc_0_2x_sse2) |
+sym(vp9_idct_dequant_dc_0_2x_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+    ; special case when the 2 blocks have 0 or 1 coeffs each:
+    ; the dc is passed in separately via *dc, so there is no need
+    ; to read qcoeff
+ mov rax, arg(0) ; qcoeff |
+ mov rsi, arg(2) ; pre |
+ mov rdi, arg(3) ; dst |
+ mov rdx, arg(5) ; dc |
+ |
+    ; Zero out xmm5 for use in unpacking
+ pxor xmm5, xmm5 |
+ |
+    ; load the two dc words (2 x 16 bits = one doubleword)
+ movd xmm4, [rdx] |
+ |
+ ; Load up predict blocks |
+ movq xmm0, [rsi] |
+ movq xmm1, [rsi+16] |
+ movq xmm2, [rsi+32] |
+ movq xmm3, [rsi+48] |
+ |
+    ; Duplicate each dc across one half of the register:
+    ; low 4 words = dc 0, high 4 words = dc 1
+ punpcklwd xmm4, xmm4 |
+ punpckldq xmm4, xmm4 |
+ |
+    ; Round and downshift the dequantized dc: (dc + 4) >> 3
+ paddw xmm4, [GLOBAL(fours)] |
+ psraw xmm4, 3 |
+ |
+ ; Predict buffer needs to be expanded from bytes to words |
+ punpcklbw xmm0, xmm5 |
+ punpcklbw xmm1, xmm5 |
+ punpcklbw xmm2, xmm5 |
+ punpcklbw xmm3, xmm5 |
+ |
+ ; Add to predict buffer |
+ paddw xmm0, xmm4 |
+ paddw xmm1, xmm4 |
+ paddw xmm2, xmm4 |
+ paddw xmm3, xmm4 |
+ |
+ ; pack up before storing |
+ packuswb xmm0, xmm5 |
+ packuswb xmm1, xmm5 |
+ packuswb xmm2, xmm5 |
+ packuswb xmm3, xmm5 |
+ |
+ ; Load destination stride before writing out, |
+ ; doesn't need to persist |
+ movsxd rdx, dword ptr arg(4) ; dst_stride |
+ |
+ ; store blocks back out |
+ movq [rdi], xmm0 |
+ movq [rdi + rdx], xmm1 |
+ |
+ lea rdi, [rdi + 2*rdx] |
+ |
+ movq [rdi], xmm2 |
+ movq [rdi + rdx], xmm3 |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
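+;void vp9_idct_dequant_dc_full_2x_sse2
+; (
+;           short *qcoeff - 0
+;           short *dequant - 1
+;           unsigned char *pre - 2
+;           unsigned char *dst - 3
+;           int dst_stride - 4
+;           short *dc - 5
+; )
+; (prototype reconstructed from the arg() references in the body)
+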
+global sym(vp9_idct_dequant_dc_full_2x_sse2) |
+sym(vp9_idct_dequant_dc_full_2x_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 7 |
+ SAVE_XMM 7 |
+ GET_GOT rbx |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+    ; general case: the blocks may carry more than a dc coefficient,
+    ; so the full qcoeff buffer is dequantized and inverse transformed;
+    ; the dc values themselves are inserted from *dc below
+ mov rax, arg(0) ; qcoeff |
+ mov rsi, arg(2) ; pre |
+ mov rdi, arg(3) ; dst |
+ |
+    ; Zero out xmm7 for use in unpacking
+ pxor xmm7, xmm7 |
+ |
+ mov rdx, arg(1) ; dequant |
+ |
+    ; note the transpose of xmm1 and xmm2, necessary for the shuffle
+    ; to spit out sensible data
+ movdqa xmm0, [rax] |
+ movdqa xmm2, [rax+16] |
+ movdqa xmm1, [rax+32] |
+ movdqa xmm3, [rax+48] |
+ |
+ ; Clear out coeffs |
+ movdqa [rax], xmm7 |
+ movdqa [rax+16], xmm7 |
+ movdqa [rax+32], xmm7 |
+ movdqa [rax+48], xmm7 |
+ |
+ ; dequantize qcoeff buffer |
+ pmullw xmm0, [rdx] |
+ pmullw xmm2, [rdx+16] |
+ pmullw xmm1, [rdx] |
+ pmullw xmm3, [rdx+16] |
+ |
+ ; DC component |
+ mov rdx, arg(5) |
+ |
+ ; repack so block 0 row x and block 1 row x are together |
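+    ; (same two-block interleave as in vp9_idct_dequant_full_2x_sse2 above)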
+ movdqa xmm4, xmm0 |
+ punpckldq xmm0, xmm1 |
+ punpckhdq xmm4, xmm1 |
+ |
+ pshufd xmm0, xmm0, 11011000b |
+ pshufd xmm1, xmm4, 11011000b |
+ |
+ movdqa xmm4, xmm2 |
+ punpckldq xmm2, xmm3 |
+ punpckhdq xmm4, xmm3 |
+ |
+ pshufd xmm2, xmm2, 11011000b |
+ pshufd xmm3, xmm4, 11011000b |
+ |
+ ; insert DC component |
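+    ; the two words at *dc are the dc values for block 0 and block 1,
+    ; supplied by the caller in place of the dequantized qcoeff dc;
+    ; after the repack, words 0 and 4 of xmm0 are the two row-0 dc slots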
+ pinsrw xmm0, [rdx], 0 |
+ pinsrw xmm0, [rdx+2], 4 |
+ |
+ ; first pass |
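+    ; (identical two-pass butterfly to vp9_idct_dequant_full_2x_sse2 above)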
+ psubw xmm0, xmm2 ; b1 = 0-2 |
+ paddw xmm2, xmm2 ; |
+ |
+ movdqa xmm5, xmm1 |
+ paddw xmm2, xmm0 ; a1 = 0+2 |
+ |
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)] |
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) |
+ |
+ movdqa xmm7, xmm3 |
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] |
+ |
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) |
+ psubw xmm7, xmm5 ; c1 |
+ |
+ movdqa xmm5, xmm1 |
+ movdqa xmm4, xmm3 |
+ |
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] |
+ paddw xmm5, xmm1 |
+ |
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)] |
+ paddw xmm3, xmm4 |
+ |
+ paddw xmm3, xmm5 ; d1 |
+ movdqa xmm6, xmm2 ; a1 |
+ |
+ movdqa xmm4, xmm0 ; b1 |
+ paddw xmm2, xmm3 ;0 |
+ |
+ paddw xmm4, xmm7 ;1 |
+ psubw xmm0, xmm7 ;2 |
+ |
+ psubw xmm6, xmm3 ;3 |
+ |
+ ; transpose for the second pass |
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 |
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 |
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 |
+ |
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 |
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 |
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 |
+ |
+ |
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 |
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 |
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 |
+ |
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 |
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 |
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 |
+ |
+ |
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 |
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 |
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 |
+ |
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 |
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 |
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 |
+ |
+ pshufd xmm0, xmm2, 11011000b |
+ pshufd xmm2, xmm1, 11011000b |
+ |
+ pshufd xmm1, xmm5, 11011000b |
+ pshufd xmm3, xmm7, 11011000b |
+ |
+ ; second pass |
+ psubw xmm0, xmm2 ; b1 = 0-2 |
+ paddw xmm2, xmm2 |
+ |
+ movdqa xmm5, xmm1 |
+ paddw xmm2, xmm0 ; a1 = 0+2 |
+ |
+ pmulhw xmm5, [GLOBAL(x_s1sqr2)] |
+ paddw xmm5, xmm1 ; ip1 * sin(pi/8) * sqrt(2) |
+ |
+ movdqa xmm7, xmm3 |
+ pmulhw xmm7, [GLOBAL(x_c1sqr2less1)] |
+ |
+ paddw xmm7, xmm3 ; ip3 * cos(pi/8) * sqrt(2) |
+ psubw xmm7, xmm5 ; c1 |
+ |
+ movdqa xmm5, xmm1 |
+ movdqa xmm4, xmm3 |
+ |
+ pmulhw xmm5, [GLOBAL(x_c1sqr2less1)] |
+ paddw xmm5, xmm1 |
+ |
+ pmulhw xmm3, [GLOBAL(x_s1sqr2)] |
+ paddw xmm3, xmm4 |
+ |
+ paddw xmm3, xmm5 ; d1 |
+ paddw xmm0, [GLOBAL(fours)] |
+ |
+ paddw xmm2, [GLOBAL(fours)] |
+ movdqa xmm6, xmm2 ; a1 |
+ |
+ movdqa xmm4, xmm0 ; b1 |
+ paddw xmm2, xmm3 ;0 |
+ |
+ paddw xmm4, xmm7 ;1 |
+ psubw xmm0, xmm7 ;2 |
+ |
+ psubw xmm6, xmm3 ;3 |
+ psraw xmm2, 3 |
+ |
+ psraw xmm0, 3 |
+ psraw xmm4, 3 |
+ |
+ psraw xmm6, 3 |
+ |
+ ; transpose to save |
+ movdqa xmm7, xmm2 ; 103 102 101 100 003 002 001 000 |
+ punpcklwd xmm2, xmm0 ; 007 003 006 002 005 001 004 000 |
+ punpckhwd xmm7, xmm0 ; 107 103 106 102 105 101 104 100 |
+ |
+ movdqa xmm5, xmm4 ; 111 110 109 108 011 010 009 008 |
+ punpcklwd xmm4, xmm6 ; 015 011 014 010 013 009 012 008 |
+ punpckhwd xmm5, xmm6 ; 115 111 114 110 113 109 112 108 |
+ |
+ |
+ movdqa xmm1, xmm2 ; 007 003 006 002 005 001 004 000 |
+ punpckldq xmm2, xmm4 ; 013 009 005 001 012 008 004 000 |
+ punpckhdq xmm1, xmm4 ; 015 011 007 003 014 010 006 002 |
+ |
+ movdqa xmm6, xmm7 ; 107 103 106 102 105 101 104 100 |
+ punpckldq xmm7, xmm5 ; 113 109 105 101 112 108 104 100 |
+ punpckhdq xmm6, xmm5 ; 115 111 107 103 114 110 106 102 |
+ |
+ |
+ movdqa xmm5, xmm2 ; 013 009 005 001 012 008 004 000 |
+ punpckldq xmm2, xmm7 ; 112 108 012 008 104 100 004 000 |
+ punpckhdq xmm5, xmm7 ; 113 109 013 009 105 101 005 001 |
+ |
+ movdqa xmm7, xmm1 ; 015 011 007 003 014 010 006 002 |
+ punpckldq xmm1, xmm6 ; 114 110 014 010 106 102 006 002 |
+ punpckhdq xmm7, xmm6 ; 115 111 015 011 107 103 007 003 |
+ |
+ pshufd xmm0, xmm2, 11011000b |
+ pshufd xmm2, xmm1, 11011000b |
+ |
+ pshufd xmm1, xmm5, 11011000b |
+ pshufd xmm3, xmm7, 11011000b |
+ |
+ pxor xmm7, xmm7 |
+ |
+ ; Load up predict blocks |
+ movq xmm4, [rsi] |
+ movq xmm5, [rsi+16] |
+ |
+ punpcklbw xmm4, xmm7 |
+ punpcklbw xmm5, xmm7 |
+ |
+ paddw xmm0, xmm4 |
+ paddw xmm1, xmm5 |
+ |
+ movq xmm4, [rsi+32] |
+ movq xmm5, [rsi+48] |
+ |
+ punpcklbw xmm4, xmm7 |
+ punpcklbw xmm5, xmm7 |
+ |
+ paddw xmm2, xmm4 |
+ paddw xmm3, xmm5 |
+ |
+.finish: |
+ |
+ ; pack up before storing |
+ packuswb xmm0, xmm7 |
+ packuswb xmm1, xmm7 |
+ packuswb xmm2, xmm7 |
+ packuswb xmm3, xmm7 |
+ |
+ ; Load destination stride before writing out, |
+ ; doesn't need to persist |
+ movsxd rdx, dword ptr arg(4) ; dst_stride |
+ |
+ ; store blocks back out |
+ movq [rdi], xmm0 |
+ movq [rdi + rdx], xmm1 |
+ |
+ lea rdi, [rdi + 2*rdx] |
+ |
+ movq [rdi], xmm2 |
+ movq [rdi + rdx], xmm3 |
+ |
+ |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_GOT |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+SECTION_RODATA |
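+; idct constants:
+;   fours         - rounding term for the (x + 4) >> 3 downshift
+;   x_s1sqr2      - 0x8A8C = 35468 ~= 65536 * sqrt(2) * sin(pi/8); it does
+;                   not fit in a signed word, so pmulhw effectively uses
+;                   35468 - 65536 and the code adds the input back in
+;   x_c1sqr2less1 - 0x4E7B = 20091 ~= 65536 * (sqrt(2) * cos(pi/8) - 1);
+;                   the input is added back here too, which yields
+;                   sqrt(2) * cos(pi/8)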
+align 16 |
+fours: |
+ times 8 dw 0x0004 |
+align 16 |
+x_s1sqr2: |
+ times 8 dw 0x8A8C |
+align 16 |
+x_c1sqr2less1: |
+ times 8 dw 0x4E7B |