| Index: source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm (revision 278778)
|
| +++ source/libvpx/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm (working copy)
|
| @@ -15,41 +15,45 @@
|
| ; (unsigned char *frame1, | 0
|
| ; unsigned int stride, | 1
|
| ; unsigned char *frame2, | 2
|
| -; unsigned int block_size, | 3
|
| -; int strength, | 4
|
| -; int filter_weight, | 5
|
| -; unsigned int *accumulator, | 6
|
| -; unsigned short *count) | 7
|
| +; unsigned int block_width, | 3
|
| +; unsigned int block_height, | 4
|
| +; int strength, | 5
|
| +; int filter_weight, | 6
|
| +; unsigned int *accumulator, | 7
|
| +; unsigned short *count) | 8
|
| global sym(vp9_temporal_filter_apply_sse2) PRIVATE
|
| sym(vp9_temporal_filter_apply_sse2):
|
|
|
| push rbp
|
| mov rbp, rsp
|
| - SHADOW_ARGS_TO_STACK 8
|
| + SHADOW_ARGS_TO_STACK 9
|
| SAVE_XMM 7
|
| GET_GOT rbx
|
| push rsi
|
| push rdi
|
| ALIGN_STACK 16, rax
|
| - %define block_size 0
|
| - %define strength 16
|
| - %define filter_weight 32
|
| - %define rounding_bit 48
|
| - %define rbp_backup 64
|
| - %define stack_size 80
|
| + %define block_width 0
|
| + %define block_height 16
|
| + %define strength 32
|
| + %define filter_weight 48
|
| + %define rounding_bit 64
|
| + %define rbp_backup 80
|
| + %define stack_size 96
|
| sub rsp, stack_size
|
| mov [rsp + rbp_backup], rbp
|
| ; end prolog
|
|
|
| - mov rdx, arg(3)
|
| - mov [rsp + block_size], rdx
|
| - movd xmm6, arg(4)
|
| + mov edx, arg(3) ; block_width is an unsigned int: load 32 bits only (zero-extends into rdx on x86-64)
|
| + mov [rsp + block_width], rdx
|
| + mov edx, arg(4) ; block_height is an unsigned int: same 32-bit load
|
| + mov [rsp + block_height], rdx
|
| + movd xmm6, arg(5) ; strength (int) into the low dword of xmm6
|
| movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
|
|
|
| ; calculate the rounding bit outside the loop
|
| ; 0x8000 >> (16 - strength)
|
| mov rdx, 16
|
| - sub rdx, arg(4) ; 16 - strength
|
| + sub edx, arg(5) ; 16 - strength; strength is a 32-bit int, so read only 32 bits of its slot (result zero-extends into rdx)
|
| movq xmm4, rdx ; can't use rdx w/ shift
|
| movdqa xmm5, [GLOBAL(_const_top_bit)]
|
| psrlw xmm5, xmm4
|
| @@ -57,11 +61,11 @@
|
|
|
| mov rsi, arg(0) ; src/frame1
|
| mov rdx, arg(2) ; predictor frame
|
| - mov rdi, arg(6) ; accumulator
|
| - mov rax, arg(7) ; count
|
| + mov rdi, arg(7) ; accumulator
|
| + mov rax, arg(8) ; count
|
|
|
| ; dup the filter weight and store for later
|
| - movd xmm0, arg(5) ; filter_weight
|
| + movd xmm0, arg(6) ; filter_weight
|
| pshuflw xmm0, xmm0, 0
|
| punpcklwd xmm0, xmm0
|
| movdqa [rsp + filter_weight], xmm0
|
| @@ -69,10 +73,11 @@
|
| mov rbp, arg(1) ; stride
|
| pxor xmm7, xmm7 ; zero for extraction
|
|
|
| - lea rcx, [rdx + 16*16*1]
|
| - cmp dword ptr [rsp + block_size], 8
|
| + mov rcx, [rsp + block_width] ; rcx = block_width
|
| + imul rcx, [rsp + block_height] ; rcx = block_width * block_height
|
| + add rcx, rdx ; rcx = rdx (loaded from arg(2), the predictor frame) + width*height; compared against rdx at the loop tail to stop
|
| + cmp dword ptr [rsp + block_width], 8 ; width 8 falls through to load_8; any other width jumps to load_16
|
| jne .temporal_filter_apply_load_16
|
| - lea rcx, [rdx + 8*8*1]
|
|
|
| .temporal_filter_apply_load_8:
|
| movq xmm0, [rsi] ; first row
|
| @@ -178,7 +183,7 @@
|
| cmp rdx, rcx
|
| je .temporal_filter_apply_epilog
|
| pxor xmm7, xmm7 ; zero for extraction
|
| - cmp dword ptr [rsp + block_size], 16
|
| + cmp dword ptr [rsp + block_width], 16
|
| je .temporal_filter_apply_load_16
|
| jmp .temporal_filter_apply_load_8
|
|
|
|
|