| Index: source/libvpx/vp9/common/x86/vp9_subpixel_mmx.asm
|
| ===================================================================
|
| --- source/libvpx/vp9/common/x86/vp9_subpixel_mmx.asm (revision 0)
|
| +++ source/libvpx/vp9/common/x86/vp9_subpixel_mmx.asm (revision 0)
|
| @@ -0,0 +1,727 @@
|
| +;
|
| +; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
|
| +;
|
| +; Use of this source code is governed by a BSD-style license
|
| +; that can be found in the LICENSE file in the root of the source
|
| +; tree. An additional intellectual property rights grant can be found
|
| +; in the file PATENTS. All contributing project authors may
|
| +; be found in the AUTHORS file in the root of the source tree.
|
| +;
|
| +
|
| +
|
| +%include "vpx_ports/x86_abi_support.asm"
|
| +
|
| +
|
| +%define BLOCK_HEIGHT_WIDTH 4
|
| +%define vp9_filter_weight 128
|
| +%define VP9_FILTER_SHIFT 7
|
| +
|
| +
|
| +;void vp9_filter_block1d_h6_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; unsigned short *output_ptr,
|
| +; unsigned int src_pixels_per_line,
|
| +; unsigned int pixel_step,
|
| +; unsigned int output_height,
|
| +; unsigned int output_width,
|
| +; short * vp9_filter
|
| +;)
|
| +global sym(vp9_filter_block1d_h6_mmx)
|
| +sym(vp9_filter_block1d_h6_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 7
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + mov rdx, arg(6) ;vp9_filter
|
| +
|
| + movq mm1, [rdx + 16] ; do both the negative taps first!!!
|
| + movq mm2, [rdx + 32] ;
|
| + movq mm6, [rdx + 48] ;
|
| + movq mm7, [rdx + 64] ;
|
| +
|
| + mov rdi, arg(1) ;output_ptr
|
| + mov rsi, arg(0) ;src_ptr
|
| + movsxd rcx, dword ptr arg(4) ;output_height
|
| + movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
|
| + pxor mm0, mm0 ; mm0 = 00000000
|
| +
|
| +.nextrow:
|
| + movq mm3, [rsi-2] ; mm3 = p-2..p5
|
| + movq mm4, mm3 ; mm4 = p-2..p5
|
| + psrlq mm3, 8 ; mm3 = p-1..p5
|
| + punpcklbw mm3, mm0 ; mm3 = p-1..p2
|
| + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
|
| +
|
| + movq mm5, mm4 ; mm5 = p-2..p5
|
| + punpckhbw mm4, mm0 ; mm5 = p2..p5
|
| + pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
|
| + paddsw mm3, mm4 ; mm3 += mm5
|
| +
|
| + movq mm4, mm5 ; mm4 = p-2..p5;
|
| + psrlq mm5, 16 ; mm5 = p0..p5;
|
| + punpcklbw mm5, mm0 ; mm5 = p0..p3
|
| + pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
|
| + paddsw mm3, mm5 ; mm3 += mm5
|
| +
|
| + movq mm5, mm4 ; mm5 = p-2..p5
|
| + psrlq mm4, 24 ; mm4 = p1..p5
|
| + punpcklbw mm4, mm0 ; mm4 = p1..p4
|
| + pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
|
| + paddsw mm3, mm4 ; mm3 += mm5
|
| +
|
| + ; do outer positive taps
|
| + movd mm4, [rsi+3]
|
| + punpcklbw mm4, mm0 ; mm5 = p3..p6
|
| + pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
|
| + paddsw mm3, mm4 ; mm3 += mm5
|
| +
|
| + punpcklbw mm5, mm0 ; mm5 = p-2..p1
|
| + pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
|
| + paddsw mm3, mm5 ; mm3 += mm5
|
| +
|
| + paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
|
| + packuswb mm3, mm0 ; pack and unpack to saturate
|
| + punpcklbw mm3, mm0 ;
|
| +
|
| + movq [rdi], mm3 ; store the results in the destination
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
|
| + add rdi, rax;
|
| +%else
|
| + movsxd r8, dword ptr arg(2) ;src_pixels_per_line
|
| + add rdi, rax;
|
| +
|
| + add rsi, r8 ; next line
|
| +%endif
|
| +
|
| + dec rcx ; decrement count
|
| + jnz .nextrow ; next row
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;void vp9_filter_block1dc_v6_mmx
|
| +;(
|
| +; short *src_ptr,
|
| +; unsigned char *output_ptr,
|
| +; int output_pitch,
|
| +; unsigned int pixels_per_line,
|
| +; unsigned int pixel_step,
|
| +; unsigned int output_height,
|
| +; unsigned int output_width,
|
| +; short * vp9_filter
|
| +;)
|
| +global sym(vp9_filter_block1dc_v6_mmx)
|
| +sym(vp9_filter_block1dc_v6_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 8
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + movq mm5, [GLOBAL(rd)]
|
| + push rbx
|
| + mov rbx, arg(7) ;vp9_filter
|
| + movq mm1, [rbx + 16] ; do both the negative taps first!!!
|
| + movq mm2, [rbx + 32] ;
|
| + movq mm6, [rbx + 48] ;
|
| + movq mm7, [rbx + 64] ;
|
| +
|
| + movsxd rdx, dword ptr arg(3) ;pixels_per_line
|
| + mov rdi, arg(1) ;output_ptr
|
| + mov rsi, arg(0) ;src_ptr
|
| + sub rsi, rdx
|
| + sub rsi, rdx
|
| + movsxd rcx, DWORD PTR arg(5) ;output_height
|
| + movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
|
| + pxor mm0, mm0 ; mm0 = 00000000
|
| +
|
| +
|
| +.nextrow_cv:
|
| + movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
|
| + pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
|
| +
|
| +
|
| + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
|
| + pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
|
| + paddsw mm3, mm4 ; mm3 += mm4
|
| +
|
| + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
|
| + pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
|
| + paddsw mm3, mm4 ; mm3 += mm4
|
| +
|
| + movq mm4, [rsi] ; mm4 = p0..p3 = row -2
|
| + pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
|
| + paddsw mm3, mm4 ; mm3 += mm4
|
| +
|
| +
|
| + add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
|
| + movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
|
| + pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
|
| + paddsw mm3, mm4 ; mm3 += mm4
|
| +
|
| + movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
|
| + pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
|
| + paddsw mm3, mm4 ; mm3 += mm4
|
| +
|
| +
|
| + paddsw mm3, mm5 ; mm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
|
| + packuswb mm3, mm0 ; pack and saturate
|
| +
|
| + movd [rdi],mm3 ; store the results in the destination
|
| + ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
|
| + ; recon block should be in cache this shouldn't cost much. Its obviously
|
| + ; avoidable!!!.
|
| + lea rdi, [rdi+rax] ;
|
| + dec rcx ; decrement count
|
| + jnz .nextrow_cv ; next row
|
| +
|
| + pop rbx
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;void bilinear_predict8x8_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int src_pixels_per_line,
|
| +; int xoffset,
|
| +; int yoffset,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_pitch
|
| +;)
|
| +global sym(vp9_bilinear_predict8x8_mmx)
|
| +sym(vp9_bilinear_predict8x8_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + ;const short *HFilter = bilinear_filters_mmx[xoffset];
|
| + ;const short *VFilter = bilinear_filters_mmx[yoffset];
|
| +
|
| + movsxd rax, dword ptr arg(2) ;xoffset
|
| + mov rdi, arg(4) ;dst_ptr ;
|
| +
|
| + shl rax, 5 ; offset * 32
|
| + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
|
| +
|
| + add rax, rcx ; HFilter
|
| + mov rsi, arg(0) ;src_ptr ;
|
| +
|
| + movsxd rdx, dword ptr arg(5) ;dst_pitch
|
| + movq mm1, [rax] ;
|
| +
|
| + movq mm2, [rax+16] ;
|
| + movsxd rax, dword ptr arg(3) ;yoffset
|
| +
|
| + pxor mm0, mm0 ;
|
| +
|
| + shl rax, 5 ; offset*32
|
| + add rax, rcx ; VFilter
|
| +
|
| + lea rcx, [rdi+rdx*8] ;
|
| + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
|
| +
|
| +
|
| +
|
| + ; get the first horizontal line done ;
|
| + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + movq mm4, mm3 ; make a copy of current line
|
| +
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| + punpckhbw mm4, mm0 ;
|
| +
|
| + pmullw mm3, mm1 ;
|
| + pmullw mm4, mm1 ;
|
| +
|
| + movq mm5, [rsi+1] ;
|
| + movq mm6, mm5 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0 ;
|
| +
|
| + pmullw mm5, mm2 ;
|
| + pmullw mm6, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + movq mm7, mm3 ;
|
| + packuswb mm7, mm4 ;
|
| +
|
| + add rsi, rdx ; next line
|
| +.next_row_8x8:
|
| + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + movq mm4, mm3 ; make a copy of current line
|
| +
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| + punpckhbw mm4, mm0 ;
|
| +
|
| + pmullw mm3, mm1 ;
|
| + pmullw mm4, mm1 ;
|
| +
|
| + movq mm5, [rsi+1] ;
|
| + movq mm6, mm5 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0 ;
|
| +
|
| + pmullw mm5, mm2 ;
|
| + pmullw mm6, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| + movq mm5, mm7 ;
|
| + movq mm6, mm7 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0
|
| +
|
| + pmullw mm5, [rax] ;
|
| + pmullw mm6, [rax] ;
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + movq mm7, mm3 ;
|
| + packuswb mm7, mm4 ;
|
| +
|
| +
|
| + pmullw mm3, [rax+16] ;
|
| + pmullw mm4, [rax+16] ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + packuswb mm3, mm4
|
| +
|
| + movq [rdi], mm3 ; store the results in the destination
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, rdx ; next line
|
| + add rdi, dword ptr arg(5) ;dst_pitch ;
|
| +%else
|
| + movsxd r8, dword ptr arg(5) ;dst_pitch
|
| + add rsi, rdx ; next line
|
| + add rdi, r8 ;dst_pitch
|
| +%endif
|
| + cmp rdi, rcx ;
|
| + jne .next_row_8x8
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;void bilinear_predict8x4_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int src_pixels_per_line,
|
| +; int xoffset,
|
| +; int yoffset,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_pitch
|
| +;)
|
| +global sym(vp9_bilinear_predict8x4_mmx)
|
| +sym(vp9_bilinear_predict8x4_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + ;const short *HFilter = bilinear_filters_mmx[xoffset];
|
| + ;const short *VFilter = bilinear_filters_mmx[yoffset];
|
| +
|
| + movsxd rax, dword ptr arg(2) ;xoffset
|
| + mov rdi, arg(4) ;dst_ptr ;
|
| +
|
| + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
|
| + shl rax, 5
|
| +
|
| + mov rsi, arg(0) ;src_ptr ;
|
| + add rax, rcx
|
| +
|
| + movsxd rdx, dword ptr arg(5) ;dst_pitch
|
| + movq mm1, [rax] ;
|
| +
|
| + movq mm2, [rax+16] ;
|
| + movsxd rax, dword ptr arg(3) ;yoffset
|
| +
|
| + pxor mm0, mm0 ;
|
| + shl rax, 5
|
| +
|
| + add rax, rcx
|
| + lea rcx, [rdi+rdx*4] ;
|
| +
|
| + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
|
| +
|
| + ; get the first horizontal line done ;
|
| + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + movq mm4, mm3 ; make a copy of current line
|
| +
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| + punpckhbw mm4, mm0 ;
|
| +
|
| + pmullw mm3, mm1 ;
|
| + pmullw mm4, mm1 ;
|
| +
|
| + movq mm5, [rsi+1] ;
|
| + movq mm6, mm5 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0 ;
|
| +
|
| + pmullw mm5, mm2 ;
|
| + pmullw mm6, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + movq mm7, mm3 ;
|
| + packuswb mm7, mm4 ;
|
| +
|
| + add rsi, rdx ; next line
|
| +.next_row_8x4:
|
| + movq mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + movq mm4, mm3 ; make a copy of current line
|
| +
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| + punpckhbw mm4, mm0 ;
|
| +
|
| + pmullw mm3, mm1 ;
|
| + pmullw mm4, mm1 ;
|
| +
|
| + movq mm5, [rsi+1] ;
|
| + movq mm6, mm5 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0 ;
|
| +
|
| + pmullw mm5, mm2 ;
|
| + pmullw mm6, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| + movq mm5, mm7 ;
|
| + movq mm6, mm7 ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + punpckhbw mm6, mm0
|
| +
|
| + pmullw mm5, [rax] ;
|
| + pmullw mm6, [rax] ;
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + movq mm7, mm3 ;
|
| + packuswb mm7, mm4 ;
|
| +
|
| +
|
| + pmullw mm3, [rax+16] ;
|
| + pmullw mm4, [rax+16] ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm4, mm6 ;
|
| +
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + paddw mm4, [GLOBAL(rd)] ;
|
| + psraw mm4, VP9_FILTER_SHIFT ;
|
| +
|
| + packuswb mm3, mm4
|
| +
|
| + movq [rdi], mm3 ; store the results in the destination
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, rdx ; next line
|
| + add rdi, dword ptr arg(5) ;dst_pitch ;
|
| +%else
|
| + movsxd r8, dword ptr arg(5) ;dst_pitch
|
| + add rsi, rdx ; next line
|
| + add rdi, r8
|
| +%endif
|
| + cmp rdi, rcx ;
|
| + jne .next_row_8x4
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +;void bilinear_predict4x4_mmx
|
| +;(
|
| +; unsigned char *src_ptr,
|
| +; int src_pixels_per_line,
|
| +; int xoffset,
|
| +; int yoffset,
|
| +; unsigned char *dst_ptr,
|
| +; int dst_pitch
|
| +;)
|
| +global sym(vp9_bilinear_predict4x4_mmx)
|
| +sym(vp9_bilinear_predict4x4_mmx):
|
| + push rbp
|
| + mov rbp, rsp
|
| + SHADOW_ARGS_TO_STACK 6
|
| + GET_GOT rbx
|
| + push rsi
|
| + push rdi
|
| + ; end prolog
|
| +
|
| + ;const short *HFilter = bilinear_filters_mmx[xoffset];
|
| + ;const short *VFilter = bilinear_filters_mmx[yoffset];
|
| +
|
| + movsxd rax, dword ptr arg(2) ;xoffset
|
| + mov rdi, arg(4) ;dst_ptr ;
|
| +
|
| + lea rcx, [GLOBAL(sym(vp9_bilinear_filters_8x_mmx))]
|
| + shl rax, 5
|
| +
|
| + add rax, rcx ; HFilter
|
| + mov rsi, arg(0) ;src_ptr ;
|
| +
|
| + movsxd rdx, dword ptr arg(5) ;ldst_pitch
|
| + movq mm1, [rax] ;
|
| +
|
| + movq mm2, [rax+16] ;
|
| + movsxd rax, dword ptr arg(3) ;yoffset
|
| +
|
| + pxor mm0, mm0 ;
|
| + shl rax, 5
|
| +
|
| + add rax, rcx
|
| + lea rcx, [rdi+rdx*4] ;
|
| +
|
| + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line ;
|
| +
|
| + ; get the first horizontal line done ;
|
| + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| +
|
| + pmullw mm3, mm1 ;
|
| + movd mm5, [rsi+1] ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + pmullw mm5, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| +
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + movq mm7, mm3 ;
|
| + packuswb mm7, mm0 ;
|
| +
|
| + add rsi, rdx ; next line
|
| +.next_row_4x4:
|
| + movd mm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
|
| + punpcklbw mm3, mm0 ; xx 00 01 02 03 04 05 06
|
| +
|
| + pmullw mm3, mm1 ;
|
| + movd mm5, [rsi+1] ;
|
| +
|
| + punpcklbw mm5, mm0 ;
|
| + pmullw mm5, mm2 ;
|
| +
|
| + paddw mm3, mm5 ;
|
| +
|
| + movq mm5, mm7 ;
|
| + punpcklbw mm5, mm0 ;
|
| +
|
| + pmullw mm5, [rax] ;
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| +
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| + movq mm7, mm3 ;
|
| +
|
| + packuswb mm7, mm0 ;
|
| +
|
| + pmullw mm3, [rax+16] ;
|
| + paddw mm3, mm5 ;
|
| +
|
| +
|
| + paddw mm3, [GLOBAL(rd)] ; xmm3 += round value
|
| + psraw mm3, VP9_FILTER_SHIFT ; xmm3 /= 128
|
| +
|
| + packuswb mm3, mm0
|
| + movd [rdi], mm3 ; store the results in the destination
|
| +
|
| +%if ABI_IS_32BIT
|
| + add rsi, rdx ; next line
|
| + add rdi, dword ptr arg(5) ;dst_pitch ;
|
| +%else
|
| + movsxd r8, dword ptr arg(5) ;dst_pitch ;
|
| + add rsi, rdx ; next line
|
| + add rdi, r8
|
| +%endif
|
| +
|
| + cmp rdi, rcx ;
|
| + jne .next_row_4x4
|
| +
|
| + ; begin epilog
|
| + pop rdi
|
| + pop rsi
|
| + RESTORE_GOT
|
| + UNSHADOW_ARGS
|
| + pop rbp
|
| + ret
|
| +
|
| +
|
| +
|
| +SECTION_RODATA
|
| +align 16
|
| +rd:
|
| + times 4 dw 0x40
|
| +
|
| +align 16
|
| +global HIDDEN_DATA(sym(vp9_six_tap_mmx))
|
| +sym(vp9_six_tap_mmx):
|
| + times 8 dw 0
|
| + times 8 dw 0
|
| + times 8 dw 128
|
| + times 8 dw 0
|
| + times 8 dw 0
|
| + times 8 dw 0
|
| +
|
| + times 8 dw 0
|
| + times 8 dw -6
|
| + times 8 dw 123
|
| + times 8 dw 12
|
| + times 8 dw -1
|
| + times 8 dw 0
|
| +
|
| + times 8 dw 2
|
| + times 8 dw -11
|
| + times 8 dw 108
|
| + times 8 dw 36
|
| + times 8 dw -8
|
| + times 8 dw 1
|
| +
|
| + times 8 dw 0
|
| + times 8 dw -9
|
| + times 8 dw 93
|
| + times 8 dw 50
|
| + times 8 dw -6
|
| + times 8 dw 0
|
| +
|
| + times 8 dw 3
|
| + times 8 dw -16
|
| + times 8 dw 77
|
| + times 8 dw 77
|
| + times 8 dw -16
|
| + times 8 dw 3
|
| +
|
| + times 8 dw 0
|
| + times 8 dw -6
|
| + times 8 dw 50
|
| + times 8 dw 93
|
| + times 8 dw -9
|
| + times 8 dw 0
|
| +
|
| + times 8 dw 1
|
| + times 8 dw -8
|
| + times 8 dw 36
|
| + times 8 dw 108
|
| + times 8 dw -11
|
| + times 8 dw 2
|
| +
|
| + times 8 dw 0
|
| + times 8 dw -1
|
| + times 8 dw 12
|
| + times 8 dw 123
|
| + times 8 dw -6
|
| + times 8 dw 0
|
| +
|
| +
|
| +align 16
|
| +global HIDDEN_DATA(sym(vp9_bilinear_filters_8x_mmx))
|
| +sym(vp9_bilinear_filters_8x_mmx):
|
| + times 8 dw 128
|
| + times 8 dw 0
|
| +
|
| + times 8 dw 112
|
| + times 8 dw 16
|
| +
|
| + times 8 dw 96
|
| + times 8 dw 32
|
| +
|
| + times 8 dw 80
|
| + times 8 dw 48
|
| +
|
| + times 8 dw 64
|
| + times 8 dw 64
|
| +
|
| + times 8 dw 48
|
| + times 8 dw 80
|
| +
|
| + times 8 dw 32
|
| + times 8 dw 96
|
| +
|
| + times 8 dw 16
|
| + times 8 dw 112
|
|
|