| Index: source/libvpx/vp9/encoder/x86/vp9_quantize_mmx.asm
| ===================================================================
| --- source/libvpx/vp9/encoder/x86/vp9_quantize_mmx.asm (revision 0)
| +++ source/libvpx/vp9/encoder/x86/vp9_quantize_mmx.asm (revision 0)
| @@ -0,0 +1,312 @@
| +;
| +; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
| +;
| +; Use of this source code is governed by a BSD-style license
| +; that can be found in the LICENSE file in the root of the source
| +; tree. An additional intellectual property rights grant can be found
| +; in the file PATENTS. All contributing project authors may
| +; be found in the AUTHORS file in the root of the source tree.
| +;
| +
| +
| +%include "vpx_ports/x86_abi_support.asm"
| +
| +;int vp9_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
| +;                                 short *qcoeff_ptr, short *dequant_ptr,
| +;                                 short *scan_mask, short *round_ptr,
| +;                                 short *quant_ptr, short *dqcoeff_ptr);
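| +;
| +; For reference, a rough scalar equivalent of the per-coefficient math
| +; implemented below (names follow the prototype above):
| +;   x = abs(coeff_ptr[i]);
| +;   if (x < zbin_ptr[i]) x = 0;
| +;   else x = (unsigned)((x + round_ptr[i]) * quant_ptr[i]) >> 16;
| +;   qcoeff_ptr[i]  = sign(coeff_ptr[i]) * x;
| +;   dqcoeff_ptr[i] = qcoeff_ptr[i] * dequant_ptr[i];
| +; The return value is the end-of-block index derived via scan_mask.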
| +global sym(vp9_fast_quantize_b_impl_mmx)
| +sym(vp9_fast_quantize_b_impl_mmx):
| +    push rbp
| +    mov rbp, rsp
| +    SHADOW_ARGS_TO_STACK 8
| +    push rsi
| +    push rdi
| +    ; end prolog
| +
| +
| +    mov rsi, arg(0) ;coeff_ptr
| +    movq mm0, [rsi]
| +
| +    mov rax, arg(1) ;zbin_ptr
| +    movq mm1, [rax]
| +
| +    movq mm3, mm0
| +    psraw mm0, 15
| +
| +    pxor mm3, mm0
| +    psubw mm3, mm0 ; abs
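| +    ; branchless abs: with m = x >> 15 (all ones when x is negative),
| +    ; (x ^ m) - m yields |x| in each 16-bit lane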
| +
| +    movq mm2, mm3
| +    pcmpgtw mm1, mm2
| +
| +    pandn mm1, mm2
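| +    ; dead-zone: the pcmpgtw/pandn pair zeroes every lane whose
| +    ; magnitude falls below zbin and passes the rest unchanged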
| +    movq mm3, mm1
| +
| +    mov rdx, arg(6) ;quant_ptr
| +    movq mm1, [rdx]
| +
| +    mov rcx, arg(5) ;round_ptr
| +    movq mm2, [rcx]
| +
| +    paddw mm3, mm2
| +    pmulhuw mm3, mm1
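| +    ; quantize: pmulhuw keeps the high 16 bits of the unsigned product,
| +    ; i.e. x = ((x + round) * quant) >> 16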
| +
| +    pxor mm3, mm0
| +    psubw mm3, mm0 ; restore the sign
| +
| +    mov rdi, arg(2) ;qcoeff_ptr
| +    movq mm0, mm3
| +
| +    movq [rdi], mm3
| +
| +    mov rax, arg(3) ;dequant_ptr
| +    movq mm2, [rax]
| +
| +    pmullw mm3, mm2
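| +    ; dequantize: pmullw keeps the low 16 bits of qcoeff * dequant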
| +    mov rax, arg(7) ;dqcoeff_ptr
| +
| +    movq [rax], mm3
| +
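| +    ; the same abs/dead-zone/quantize/dequantize sequence repeats for
| +    ; the remaining three groups of four coefficients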
| +    ; next 8 bytes: coefficients 4-7
| +    movq mm4, [rsi+8]
| +
| +    mov rax, arg(1) ;zbin_ptr
| +    movq mm5, [rax+8]
| +
| +    movq mm7, mm4
| +    psraw mm4, 15
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; abs
| +
| +    movq mm6, mm7
| +    pcmpgtw mm5, mm6
| +
| +    pandn mm5, mm6
| +    movq mm7, mm5
| +
| +    movq mm5, [rdx+8]
| +    movq mm6, [rcx+8]
| +
| +    paddw mm7, mm6
| +    pmulhuw mm7, mm5
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; restore the sign
| +
| +    mov rdi, arg(2) ;qcoeff_ptr
| +
| +    movq mm1, mm7
| +    movq [rdi+8], mm7
| +
| +    mov rax, arg(3) ;dequant_ptr
| +    movq mm6, [rax+8]
| +
| +    pmullw mm7, mm6
| +    mov rax, arg(7) ;dqcoeff_ptr
| +
| +    movq [rax+8], mm7
| +
| +
| +    ; next 8 bytes: coefficients 8-11
| +    movq mm4, [rsi+16]
| +
| +    mov rax, arg(1) ;zbin_ptr
| +    movq mm5, [rax+16]
| +
| +    movq mm7, mm4
| +    psraw mm4, 15
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; abs
| +
| +    movq mm6, mm7
| +    pcmpgtw mm5, mm6
| +
| +    pandn mm5, mm6
| +    movq mm7, mm5
| +
| +    movq mm5, [rdx+16]
| +    movq mm6, [rcx+16]
| +
| +    paddw mm7, mm6
| +    pmulhuw mm7, mm5
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; restore the sign
| +
| +    mov rdi, arg(2) ;qcoeff_ptr
| +
| +    movq mm1, mm7
| +    movq [rdi+16], mm7
| +
| +    mov rax, arg(3) ;dequant_ptr
| +    movq mm6, [rax+16]
| +
| +    pmullw mm7, mm6
| +    mov rax, arg(7) ;dqcoeff_ptr
| +
| +    movq [rax+16], mm7
| +
| +
| +    ; next 8 bytes: coefficients 12-15
| +    movq mm4, [rsi+24]
| +
| +    mov rax, arg(1) ;zbin_ptr
| +    movq mm5, [rax+24]
| +
| +    movq mm7, mm4
| +    psraw mm4, 15
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; abs
| +
| +    movq mm6, mm7
| +    pcmpgtw mm5, mm6
| +
| +    pandn mm5, mm6
| +    movq mm7, mm5
| +
| +    movq mm5, [rdx+24]
| +    movq mm6, [rcx+24]
| +
| +    paddw mm7, mm6
| +    pmulhuw mm7, mm5
| +
| +    pxor mm7, mm4
| +    psubw mm7, mm4 ; restore the sign
| +
| +    mov rdi, arg(2) ;qcoeff_ptr
| +
| +    movq mm1, mm7
| +    movq [rdi+24], mm7
| +
| +    mov rax, arg(3) ;dequant_ptr
| +    movq mm6, [rax+24]
| +
| +    pmullw mm7, mm6
| +    mov rax, arg(7) ;dqcoeff_ptr
| +
| +    movq [rax+24], mm7
| +
| +
| +
| +    mov rdi, arg(4) ;scan_mask
| +    mov rsi, arg(2) ;qcoeff_ptr
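| +    ; eob: turn each nonzero qcoeff word into a 0/1 flag, pmaddwd the
| +    ; flags against scan_mask (assumed to hold one bit per coefficient,
| +    ; set at that coefficient's scan-order position), and accumulate;
| +    ; the highest bit set in the sum marks the last nonzero position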
| +
| +    pxor mm5, mm5
| +    pxor mm7, mm7
| +
| +    movq mm0, [rsi]
| +    movq mm1, [rsi+8]
| +
| +    movq mm2, [rdi]
| +    movq mm3, [rdi+8]
| +
| +    pcmpeqw mm0, mm7
| +    pcmpeqw mm1, mm7
| +
| +    pcmpeqw mm6, mm6
| +    pxor mm0, mm6
| +
| +    pxor mm1, mm6
| +    psrlw mm0, 15
| +
| +    psrlw mm1, 15
| +    pmaddwd mm0, mm2
| +
| +    pmaddwd mm1, mm3
| +    movq mm5, mm0
| +
| +    paddd mm5, mm1
| +
| +    movq mm0, [rsi+16]
| +    movq mm1, [rsi+24]
| +
| +    movq mm2, [rdi+16]
| +    movq mm3, [rdi+24]
| +
| +    pcmpeqw mm0, mm7
| +    pcmpeqw mm1, mm7
| +
| +    pcmpeqw mm6, mm6
| +    pxor mm0, mm6
| +
| +    pxor mm1, mm6
| +    psrlw mm0, 15
| +
| +    psrlw mm1, 15
| +    pmaddwd mm0, mm2
| +
| +    pmaddwd mm1, mm3
| +    paddd mm5, mm0
| +
| +    paddd mm5, mm1
| +    movq mm0, mm5
| +
| +    psrlq mm5, 32
| +    paddd mm0, mm5
| +
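| +    ; fold the two dword sums so the low word of mm0 carries one set
| +    ; bit for every nonzero scan position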
| +    ; eob adjustment begins here
| +    movq rcx, mm0
| +    and rcx, 0xffff
| +
| +    xor rdx, rdx
| +    sub rdx, rcx ; rdx = -rcx
| +
| +    bsr rax, rcx
| +    inc rax
| +
| +    sar rdx, 31
| +    and rax, rdx
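| +    ; rax = (rcx != 0) ? bsr(rcx) + 1 : 0, i.e. the eob count; the mask
| +    ; in rdx also hides bsr's undefined output when rcx is zero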
| +    ; The branchless sequence above replaces the old mixed assembly/C
| +    ; eob computation; the original logic is kept below for reference:
| +    ; movq rcx, mm0
| +    ; bsr rax, rcx
| +    ;
| +    ; mov eob, rax
| +    ; mov eee, rcx
| +    ;
| +    ; if (eee == 0)
| +    ; {
| +    ;     eob = -1;
| +    ; }
| +    ; else if (eee < 0)
| +    ; {
| +    ;     eob = 15;
| +    ; }
| +    ; d->eob = eob + 1;
| +
| +    ; begin epilog
| +    pop rdi
| +    pop rsi
| +    UNSHADOW_ARGS
| +    pop rbp
| +    ret