Index: source/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm |
=================================================================== |
--- source/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm (revision 0) |
+++ source/libvpx/vp8/encoder/x86/ssim_opt_x86_64.asm (revision 0) |
@@ -0,0 +1,216 @@ |
+; |
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
+; |
+; Use of this source code is governed by a BSD-style license |
+; that can be found in the LICENSE file in the root of the source |
+; tree. An additional intellectual property rights grant can be found |
+; in the file PATENTS. All contributing project authors may |
+; be found in the AUTHORS file in the root of the source tree. |
+; |
+ |
+%include "vpx_ports/x86_abi_support.asm" |
+ |
+; TABULATE_SSIM - folds the words in xmm3 (s) and xmm4 (r) into sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr; overwrites xmm1-xmm3 |
+%macro TABULATE_SSIM 0 |
+ movdqa xmm1, xmm3 ; copy of s, squared below |
+ movdqa xmm2, xmm4 ; copy of r, squared below |
+ paddusw xmm15, xmm3 ; sum_s (saturating word add) |
+ paddusw xmm14, xmm4 ; sum_r (saturating word add) |
+ pmaddwd xmm1, xmm1 ; s*s, adjacent word products added into dwords |
+ pmaddwd xmm2, xmm2 ; r*r, adjacent word products added into dwords |
+ pmaddwd xmm3, xmm4 ; s*r, adjacent word products added into dwords |
+ paddd xmm13, xmm1 ; sum_sq_s |
+ paddd xmm12, xmm2 ; sum_sq_r |
+ paddd xmm11, xmm3 ; sum_sxr |
+%endmacro |
+ |
+; Sum the four dwords of register %1 into its low qword; relies on xmm0 being zero and uses xmm2 as scratch |
+%macro SUM_ACROSS_Q 1 |
+ movdqa xmm2, %1 |
+ punpckhdq xmm2, xmm0 ; dwords 2,3 widened to qwords |
+ punpckldq %1, xmm0 ; dwords 0,1 widened to qwords |
+ paddq %1, xmm2 ; two qword partial sums |
+ movdqa xmm2, %1 |
+ punpckhqdq xmm2, xmm0 ; upper partial sum moved into the low qword |
+ punpcklqdq %1, xmm0 ; lower partial sum kept, upper qword zeroed |
+ paddq %1, xmm2 ; total now in the low qword |
+%endmacro |
+ |
+; Sum the eight words of register %1 into its low qword; relies on xmm0 being zero and uses xmm1 (plus SUM_ACROSS_Q's xmm2) as scratch |
+%macro SUM_ACROSS_W 1 |
+ movdqa xmm1, %1 |
+ punpckhwd xmm1, xmm0 ; words 4-7 widened to dwords |
+ punpcklwd %1, xmm0 ; words 0-3 widened to dwords |
+ paddd %1, xmm1 ; four dword partial sums |
+ SUM_ACROSS_Q %1 ; reduce the four dwords to a single total |
+%endmacro |
+;void vp8_ssim_parms_16x16_sse2( |
+; unsigned char *s, |
+; int sp, |
+; unsigned char *r, |
+; int rp, |
+; unsigned long *sum_s, |
+; unsigned long *sum_r, |
+; unsigned long *sum_sq_s, |
+; unsigned long *sum_sq_r, |
+; unsigned long *sum_sxr); |
+; |
+; Sums s, r, s*s, r*r and s*r over a 16x16 block of bytes. sp and rp are the |
+; byte strides of s and r; being C ints they are sign-extended before use. |
+; Each result is stored with movd, so only 32 bits are written per output. |
+; TODO: Use parm passing through structure, probably don't need the pxors |
+; ( calling app will initialize to 0 ); could easily fit everything in sse2 |
+; without too much hassle, and can probably do better estimates with psadbw |
+; or pavgb. For now this is just a first pass at all the parms needed for |
+; 16x16 ssim so we can play with dssim as distortion in mode selection code. |
+global sym(vp8_ssim_parms_16x16_sse2) PRIVATE |
+sym(vp8_ssim_parms_16x16_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 9 |
+ SAVE_XMM 15 |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;s |
+ movsxd rcx, dword arg(1) ;sp, int: upper 32 bits of the slot are not defined |
+ mov rdi, arg(2) ;r |
+ movsxd rax, dword arg(3) ;rp, int: upper 32 bits of the slot are not defined |
+ |
+ pxor xmm0, xmm0 ;zero, used to widen bytes/words/dwords |
+ pxor xmm15,xmm15 ;sum_s |
+ pxor xmm14,xmm14 ;sum_r |
+ pxor xmm13,xmm13 ;sum_sq_s |
+ pxor xmm12,xmm12 ;sum_sq_r |
+ pxor xmm11,xmm11 ;sum_sxr |
+ |
+ mov rdx, 16 ;row counter |
+.NextRow: |
+ ;grab source and reference pixels (16 bytes each, no alignment assumed) |
+ movdqu xmm5, [rsi] |
+ movdqu xmm6, [rdi] |
+ movdqa xmm3, xmm5 |
+ movdqa xmm4, xmm6 |
+ punpckhbw xmm3, xmm0 ; high_s |
+ punpckhbw xmm4, xmm0 ; high_r |
+ |
+ TABULATE_SSIM |
+ |
+ movdqa xmm3, xmm5 |
+ movdqa xmm4, xmm6 |
+ punpcklbw xmm3, xmm0 ; low_s |
+ punpcklbw xmm4, xmm0 ; low_r |
+ |
+ TABULATE_SSIM |
+ |
+ add rsi, rcx ; next s row |
+ add rdi, rax ; next r row |
+ |
+ dec rdx ; counter |
+ jnz .NextRow |
+ |
+ SUM_ACROSS_W xmm15 |
+ SUM_ACROSS_W xmm14 |
+ SUM_ACROSS_Q xmm13 |
+ SUM_ACROSS_Q xmm12 |
+ SUM_ACROSS_Q xmm11 |
+ |
+ mov rdi,arg(4) |
+ movd [rdi], xmm15 ; *sum_s, low 32 bits |
+ mov rdi,arg(5) |
+ movd [rdi], xmm14 ; *sum_r, low 32 bits |
+ mov rdi,arg(6) |
+ movd [rdi], xmm13 ; *sum_sq_s, low 32 bits |
+ mov rdi,arg(7) |
+ movd [rdi], xmm12 ; *sum_sq_r, low 32 bits |
+ mov rdi,arg(8) |
+ movd [rdi], xmm11 ; *sum_sxr, low 32 bits |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |
+ |
+;void vp8_ssim_parms_8x8_sse2( |
+; unsigned char *s, |
+; int sp, |
+; unsigned char *r, |
+; int rp, |
+; unsigned long *sum_s, |
+; unsigned long *sum_r, |
+; unsigned long *sum_sq_s, |
+; unsigned long *sum_sq_r, |
+; unsigned long *sum_sxr); |
+; |
+; Sums s, r, s*s, r*r and s*r over an 8x8 block of bytes. sp and rp are the |
+; byte strides of s and r; being C ints they are sign-extended before use. |
+; Each result is stored with movd, so only 32 bits are written per output. |
+; TODO: Use parm passing through structure, probably don't need the pxors |
+; ( calling app will initialize to 0 ); could easily fit everything in sse2 |
+; without too much hassle, and can probably do better estimates with psadbw |
+; or pavgb. For now this is just a first pass at all the parms needed for |
+; 8x8 ssim so we can play with dssim as distortion in mode selection code. |
+global sym(vp8_ssim_parms_8x8_sse2) PRIVATE |
+sym(vp8_ssim_parms_8x8_sse2): |
+ push rbp |
+ mov rbp, rsp |
+ SHADOW_ARGS_TO_STACK 9 |
+ SAVE_XMM 15 |
+ push rsi |
+ push rdi |
+ ; end prolog |
+ |
+ mov rsi, arg(0) ;s |
+ movsxd rcx, dword arg(1) ;sp, int: upper 32 bits of the slot are not defined |
+ mov rdi, arg(2) ;r |
+ movsxd rax, dword arg(3) ;rp, int: upper 32 bits of the slot are not defined |
+ |
+ pxor xmm0, xmm0 ;zero, used to widen bytes/words/dwords |
+ pxor xmm15,xmm15 ;sum_s |
+ pxor xmm14,xmm14 ;sum_r |
+ pxor xmm13,xmm13 ;sum_sq_s |
+ pxor xmm12,xmm12 ;sum_sq_r |
+ pxor xmm11,xmm11 ;sum_sxr |
+ |
+ mov rdx, 8 ;row counter |
+.NextRow: |
+ ;grab source and reference pixels (8 bytes each) |
+ movq xmm3, [rsi] |
+ movq xmm4, [rdi] |
+ punpcklbw xmm3, xmm0 ; low_s |
+ punpcklbw xmm4, xmm0 ; low_r |
+ |
+ TABULATE_SSIM |
+ |
+ add rsi, rcx ; next s row |
+ add rdi, rax ; next r row |
+ |
+ dec rdx ; counter |
+ jnz .NextRow |
+ |
+ SUM_ACROSS_W xmm15 |
+ SUM_ACROSS_W xmm14 |
+ SUM_ACROSS_Q xmm13 |
+ SUM_ACROSS_Q xmm12 |
+ SUM_ACROSS_Q xmm11 |
+ |
+ mov rdi,arg(4) |
+ movd [rdi], xmm15 ; *sum_s, low 32 bits |
+ mov rdi,arg(5) |
+ movd [rdi], xmm14 ; *sum_r, low 32 bits |
+ mov rdi,arg(6) |
+ movd [rdi], xmm13 ; *sum_sq_s, low 32 bits |
+ mov rdi,arg(7) |
+ movd [rdi], xmm12 ; *sum_sq_r, low 32 bits |
+ mov rdi,arg(8) |
+ movd [rdi], xmm11 ; *sum_sxr, low 32 bits |
+ ; begin epilog |
+ pop rdi |
+ pop rsi |
+ RESTORE_XMM |
+ UNSHADOW_ARGS |
+ pop rbp |
+ ret |