Index: source/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm
===================================================================
--- source/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm	(revision 0)
+++ source/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm	(revision 0)
@@ -0,0 +1,815 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%macro LF_ABS 2
+    ; %1 value not preserved
+    ; %2 value preserved
+    ; output in %1
+    movdqa      scratch1, %2            ; v2
+
+    psubusb     scratch1, %1            ; v2 - v1
+    psubusb     %1, %2                  ; v1 - v2
+    por         %1, scratch1            ; abs(v2 - v1)
+%endmacro
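+
+; LF_ABS relies on psubusb saturating at zero: scratch1 = max(0, v2 - v1)
+; and %1 = max(0, v1 - v2). At least one of the two is zero, so OR-ing
+; them yields abs(v1 - v2) in every byte lane with no sign handling.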
+
+%macro LF_FILTER_HEV_MASK 8-9
+    ; %1-%8: p3 p2 p1 p0 q0 q1 q2 q3 (all clobbered)
+    ; %9 (optional): pre-computed abs(p1 - p0) from the previous edge
+    ; outputs: %1 = filter mask, %5 = ~hev
+
+    LF_ABS      %1, %2                  ; abs(p3 - p2)
+    LF_ABS      %2, %3                  ; abs(p2 - p1)
+    pmaxub      %1, %2                  ; accumulate mask
+%if %0 == 8
+    movdqa      scratch2, %3            ; save p1
+    LF_ABS      scratch2, %4            ; abs(p1 - p0)
+%endif
+    LF_ABS      %4, %5                  ; abs(p0 - q0)
+    LF_ABS      %5, %6                  ; abs(q0 - q1)
+%if %0 == 8
+    pmaxub      %5, scratch2            ; accumulate hev
+%else
+    pmaxub      %5, %9
+%endif
+    pmaxub      %1, %5                  ; accumulate mask
+
+    LF_ABS      %3, %6                  ; abs(p1 - q1)
+    LF_ABS      %6, %7                  ; abs(q1 - q2)
+    pmaxub      %1, %6                  ; accumulate mask
+    LF_ABS      %7, %8                  ; abs(q2 - q3)
+    pmaxub      %1, %7                  ; accumulate mask
+
+    paddusb     %4, %4                  ; 2 * abs(p0 - q0)
+    pand        %3, [GLOBAL(tfe)]       ; drop the low bit of each byte so ...
+    psrlw       %3, 1                   ; ... psrlw gives per-byte abs(p1 - q1) / 2
+    paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
+
+    psubusb     %1, [limit]
+    psubusb     %4, [blimit]
+    por         %1, %4
+    pcmpeqb     %1, zero                ; mask
+
+    psubusb     %5, [thresh]
+    pcmpeqb     %5, zero                ; ~hev
+%endmacro
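+
+; Per byte lane (all arithmetic saturating):
+;   mask = 0xff  iff  max(abs diffs between neighbouring pixels) <= limit
+;                and  2 * abs(p0 - q0) + abs(p1 - q1) / 2 <= blimit
+;   ~hev = 0xff  iff  max(abs(p1 - p0), abs(q1 - q0)) <= thresh
+; Lanes where hev ("high edge variance") is set use the ps1 - qs1 term
+; and skip the outer-tap adjustment in LF_FILTER below.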
+
+%macro LF_FILTER 6
+    ; %1-%4: p1-q1
+    ; %5: mask
+    ; %6: ~hev (as produced by LF_FILTER_HEV_MASK)
+
+    movdqa      scratch2, %6            ; save ~hev
+
+    pxor        %1, [GLOBAL(t80)]       ; ps1
+    pxor        %4, [GLOBAL(t80)]       ; qs1
+    movdqa      scratch1, %1
+    psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
+    pandn       scratch2, scratch1      ; vp8_filter &= hev (pandn inverts ~hev)
+
+    pxor        %2, [GLOBAL(t80)]       ; ps0
+    pxor        %3, [GLOBAL(t80)]       ; qs0
+    movdqa      scratch1, %3
+    psubsb      scratch1, %2            ; qs0 - ps0
+    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+    paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
+    pand        %5, scratch2            ; vp8_filter &= mask
+
+    movdqa      scratch2, %5
+    paddsb      %5, [GLOBAL(t4)]        ; Filter1
+    paddsb      scratch2, [GLOBAL(t3)]  ; Filter2
+
+    ; Filter1 >> 3 (arithmetic shift of signed bytes, emulated)
+    movdqa      scratch1, zero
+    pcmpgtb     scratch1, %5            ; 0xff in lanes that are negative
+    psrlw       %5, 3
+    pand        scratch1, [GLOBAL(te0)] ; sign bits to re-insert
+    pand        %5, [GLOBAL(t1f)]       ; drop bits leaked across bytes
+    por         %5, scratch1
+
+    psubsb      %3, %5                  ; qs0 - Filter1
+    pxor        %3, [GLOBAL(t80)]
+
+    ; Filter2 >> 3 (same emulation)
+    movdqa      scratch1, zero
+    pcmpgtb     scratch1, scratch2
+    psrlw       scratch2, 3
+    pand        scratch1, [GLOBAL(te0)]
+    pand        scratch2, [GLOBAL(t1f)]
+    por         scratch2, scratch1
+
+    paddsb      %2, scratch2            ; ps0 + Filter2
+    pxor        %2, [GLOBAL(t80)]
+
+    ; outer tap adjustment: (Filter1 + 1) >> 1
+    paddsb      %5, [GLOBAL(t1)]
+    movdqa      scratch1, zero
+    pcmpgtb     scratch1, %5
+    psrlw       %5, 1
+    pand        scratch1, [GLOBAL(t80)]
+    pand        %5, [GLOBAL(t7f)]
+    por         %5, scratch1
+    pand        %5, %6                  ; vp8_filter &= ~hev
+
+    psubsb      %4, %5                  ; qs1 - vp8_filter
+    pxor        %4, [GLOBAL(t80)]
+
+    paddsb      %1, %5                  ; ps1 + vp8_filter
+    pxor        %1, [GLOBAL(t80)]
+%endmacro
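+
+; The sequence above mirrors the C reference vp8_filter(): with pixels
+; biased into signed range by the pxor with t80 (0x80),
+;   w       = clamp((clamp(ps1 - qs1) & hev) + 3 * (qs0 - ps0)) & mask
+;   Filter1 = clamp(w + 4) >> 3,   qs0 -= Filter1
+;   Filter2 = clamp(w + 3) >> 3,   ps0 += Filter2
+;   u       = (Filter1 + 1) >> 1,  qs1 -= u, ps1 += u  (only where ~hev)
+; SSE2 has no per-byte arithmetic shift, so each >> is emulated: psrlw
+; shifts words, t1f/t7f mask off the bits that leaked in from the
+; neighbouring byte, and te0/t80 re-insert the sign bits for lanes that
+; pcmpgtb found negative.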
+
+;void vp8_loop_filter_bh_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
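+;
+; Filters the three inner horizontal edges of the 16-row luma block
+; (rows 4, 8 and 12). Each LF_FILTER_HEV_MASK / LF_FILTER pair below
+; handles one edge; abs(p1 - p0) for the next edge falls out of the
+; current mask computation and is passed on as the ninth argument.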
+global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
+sym(vp8_loop_filter_bh_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push        rbp
+    mov         rbp, rsp
+    SAVE_XMM 11
+    push        r12
+    push        r13
+    mov         thresh, arg(4)
+%else
+    %define src      rdi ; src_ptr
+    %define stride   rsi ; src_pixel_step
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
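+
+    ; Win64 passes the first four arguments in rcx/rdx/r8/r9 and the fifth
+    ; on the stack (hence arg(4)); xmm6 and up plus r12/r13 are callee-saved
+    ; there, so they are saved above. Unix x86-64 passes all five arguments
+    ; in registers, and every register used here is caller-saved.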
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    ; row n of the block sits at src + n * stride; spp = src + stride plus
+    ; the stride3/5/7 registers make every row reachable in one address
+    %define i0  [src]
+    %define i1  [spp]
+    %define i2  [src + 2 * stride]
+    %define i3  [spp + 2 * stride]
+    %define i4  [src + 4 * stride]
+    %define i5  [spp + 4 * stride]
+    %define i6  [src + 2 * stride3]
+    %define i7  [spp + 2 * stride3]
+    %define i8  [src + 8 * stride]
+    %define i9  [spp + 8 * stride]
+    %define i10 [src + 2 * stride5]
+    %define i11 [spp + 2 * stride5]
+    %define i12 [src + 4 * stride3]
+    %define i13 [spp + 4 * stride3]
+    %define i14 [src + 2 * stride7]
+    %define i15 [spp + 2 * stride7]
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
+    pxor        zero, zero
+
+    ; load the first set into registers
+    movdqa      xmm0, i0                ; p3
+    movdqa      xmm1, i1                ; p2
+    movdqa      xmm2, i2                ; p1
+    movdqa      xmm3, i3                ; p0
+    movdqa      xmm4, i4                ; q0
+    movdqa      xmm8, i5                ; q1
+    movdqa      xmm9, i6                ; q2; ends up holding abs(q2 - q3),
+                                        ; the abs(p1 - p0) of the next edge
+    movdqa      xmm10, i7               ; q3
+LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
+
+    movdqa      xmm1, i2                ; p1
+    movdqa      xmm2, i3                ; p0
+    movdqa      xmm3, i4                ; q0
+    movdqa      xmm8, i5                ; q1
+LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
+    movdqa      i2, xmm1                ; write back the filtered pixels
+    movdqa      i3, xmm2
+
+; second set
+    movdqa      i4, xmm3                ; write back the filtered pixels
+    movdqa      i5, xmm8
+
+    movdqa      xmm0, i6                ; p1
+    movdqa      xmm1, i7                ; p0
+    movdqa      xmm2, i8                ; q0
+    movdqa      xmm4, i9                ; q1
+    movdqa      xmm10, i10              ; q2; ends up holding abs(p1 - p0)
+                                        ; of the next edge
+    movdqa      xmm11, i11              ; q3
+LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
+
+    movdqa      xmm0, i6
+    movdqa      xmm1, i7
+    movdqa      xmm4, i8
+    movdqa      xmm8, i9
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+    movdqa      i6, xmm0
+    movdqa      i7, xmm1
+
+; last set
+    movdqa      i8, xmm4                ; write back the filtered pixels
+    movdqa      i9, xmm8
+
+    movdqa      xmm0, i10               ; p1
+    movdqa      xmm1, i11               ; p0
+    movdqa      xmm2, i12               ; q0
+    movdqa      xmm3, i13               ; q1
+    movdqa      xmm9, i14               ; q2
+    movdqa      xmm11, i15              ; q3
+LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
+
+    movdqa      xmm0, i10
+    movdqa      xmm1, i11
+    movdqa      xmm3, i12
+    movdqa      xmm8, i13
+LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
+    movdqa      i10, xmm0
+    movdqa      i11, xmm1
+    movdqa      i12, xmm3
+    movdqa      i13, xmm8
+
+%if LIBVPX_YASM_WIN64
+    pop         r13
+    pop         r12
+    RESTORE_XMM
+    pop         rbp
+%endif
+
+    ret
+
+
+;void vp8_loop_filter_bv_y_sse2
+;(
+;    unsigned char *src_ptr,
+;    int            src_pixel_step,
+;    const char    *blimit,
+;    const char    *limit,
+;    const char    *thresh
+;)
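+;
+; Vertical edges are handled by transposing the 16x16 block onto the
+; stack, running the same horizontal-edge filter over the transposed
+; rows, and transposing the result back before writing it out.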
+
+global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
+sym(vp8_loop_filter_bv_y_sse2):
+
+%if LIBVPX_YASM_WIN64
+    %define src      rcx ; src_ptr
+    %define stride   rdx ; src_pixel_step
+    %define blimit   r8
+    %define limit    r9
+    %define thresh   r10
+
+    %define spp      rax
+    %define stride3  r11
+    %define stride5  r12
+    %define stride7  r13
+
+    push        rbp
+    mov         rbp, rsp
+    SAVE_XMM 15
+    push        r12
+    push        r13
+    mov         thresh, arg(4)
+%else
+    %define src      rdi
+    %define stride   rsi
+    %define blimit   rdx
+    %define limit    rcx
+    %define thresh   r8
+
+    %define spp      rax
+    %define stride3  r9
+    %define stride5  r10
+    %define stride7  r11
+%endif
+
+    %define scratch1 xmm5
+    %define scratch2 xmm6
+    %define zero     xmm7
+
+    ; s0-s15 address the 16 source rows in place; i0-i15 address the
+    ; aligned stack slots that hold the block while it is transposed
+    %define s0  [src]
+    %define s1  [spp]
+    %define s2  [src + 2 * stride]
+    %define s3  [spp + 2 * stride]
+    %define s4  [src + 4 * stride]
+    %define s5  [spp + 4 * stride]
+    %define s6  [src + 2 * stride3]
+    %define s7  [spp + 2 * stride3]
+    %define s8  [src + 8 * stride]
+    %define s9  [spp + 8 * stride]
+    %define s10 [src + 2 * stride5]
+    %define s11 [spp + 2 * stride5]
+    %define s12 [src + 4 * stride3]
+    %define s13 [spp + 4 * stride3]
+    %define s14 [src + 2 * stride7]
+    %define s15 [spp + 2 * stride7]
+
+    %define i0  [rsp]
+    %define i1  [rsp + 16]
+    %define i2  [rsp + 32]
+    %define i3  [rsp + 48]
+    %define i4  [rsp + 64]
+    %define i5  [rsp + 80]
+    %define i6  [rsp + 96]
+    %define i7  [rsp + 112]
+    %define i8  [rsp + 128]
+    %define i9  [rsp + 144]
+    %define i10 [rsp + 160]
+    %define i11 [rsp + 176]
+    %define i12 [rsp + 192]
+    %define i13 [rsp + 208]
+    %define i14 [rsp + 224]
+    %define i15 [rsp + 240]
+
+    ALIGN_STACK 16, rax
+
+    ; reserve stack space
+    %define temp_storage 0 ; size is 256 (16*16)
+    %define stack_size 256
+    sub         rsp, stack_size
+
+    ; prep work
+    lea         spp, [src + stride]
+    lea         stride3, [stride + 2 * stride]
+    lea         stride5, [stride3 + 2 * stride]
+    lea         stride7, [stride3 + 4 * stride]
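+
+    ; Transpose the 16x16 block in four unpack stages (bytes, words,
+    ; dwords, qwords), eight rows at a time; after the qword stage each
+    ; xmm register holds one column of the original block.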
+
+    ; 8-f
+    movdqa      xmm0, s8
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, s9                ; 80 90
+    punpckhbw   xmm1, s9                ; 88 98
+
+    movdqa      xmm2, s10
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, s11               ; a0 b0
+    punpckhbw   xmm3, s11               ; a8 b8
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+    punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+    punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+    ; using xmm[0124]
+    ; work on next 4 rows
+
+    movdqa      xmm3, s12
+    movdqa      xmm5, xmm3
+    punpcklbw   xmm3, s13               ; c0 d0
+    punpckhbw   xmm5, s13               ; c8 d8
+
+    movdqa      xmm6, s14
+    movdqa      xmm7, xmm6
+    punpcklbw   xmm6, s15               ; e0 f0
+    punpckhbw   xmm7, s15               ; e8 f8
+
+    movdqa      xmm8, xmm3
+    punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+    punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+    movdqa      xmm6, xmm5
+    punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+    punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+    ; pull the third and fourth sets together
+
+    movdqa      xmm7, xmm0
+    punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+    punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+    movdqa      xmm3, xmm4
+    punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+    punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+    movdqa      xmm8, xmm1
+    punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+    punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+    movdqa      xmm5, xmm2
+    punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+    punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+    ; save the calculations. we only have 15 registers ...
+    movdqa      i0, xmm0
+    movdqa      i1, xmm7
+    movdqa      i2, xmm4
+    movdqa      i3, xmm3
+    movdqa      i4, xmm1
+    movdqa      i5, xmm8
+    movdqa      i6, xmm2
+    movdqa      i7, xmm5
+
+    ; 0-7
+    movdqa      xmm0, s0
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, s1                ; 00 10
+    punpckhbw   xmm1, s1                ; 08 18
+
+    movdqa      xmm2, s2
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, s3                ; 20 30
+    punpckhbw   xmm3, s3                ; 28 38
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm2              ; 00 10 20 30
+    punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm1, xmm3              ; 08 18 28 38
+    punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+    ; using xmm[0124]
+    ; work on next 4 rows
+
+    movdqa      xmm3, s4
+    movdqa      xmm5, xmm3
+    punpcklbw   xmm3, s5                ; 40 50
+    punpckhbw   xmm5, s5                ; 48 58
+
+    movdqa      xmm6, s6
+    movdqa      xmm7, xmm6
+    punpcklbw   xmm6, s7                ; 60 70
+    punpckhbw   xmm7, s7                ; 68 78
+
+    movdqa      xmm8, xmm3
+    punpcklwd   xmm3, xmm6              ; 40 50 60 70
+    punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+    movdqa      xmm6, xmm5
+    punpcklwd   xmm5, xmm7              ; 48 58 68 78
+    punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+    ; pull the first two sets together
+
+    movdqa      xmm7, xmm0
+    punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+    punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+    movdqa      xmm3, xmm4
+    punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+    punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+    movdqa      xmm8, xmm1
+    punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+    punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+    movdqa      xmm5, xmm2
+    punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+    punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+    ; final combination
+
+    movdqa      xmm6, xmm0
+    punpcklqdq  xmm0, i0
+    punpckhqdq  xmm6, i0
+
+    movdqa      xmm9, xmm7
+    punpcklqdq  xmm7, i1
+    punpckhqdq  xmm9, i1
+
+    movdqa      xmm10, xmm4
+    punpcklqdq  xmm4, i2
+    punpckhqdq  xmm10, i2
+
+    movdqa      xmm11, xmm3
+    punpcklqdq  xmm3, i3
+    punpckhqdq  xmm11, i3
+
+    movdqa      xmm12, xmm1
+    punpcklqdq  xmm1, i4
+    punpckhqdq  xmm12, i4
+
+    movdqa      xmm13, xmm8
+    punpcklqdq  xmm8, i5
+    punpckhqdq  xmm13, i5
+
+    movdqa      xmm14, xmm2
+    punpcklqdq  xmm2, i6
+    punpckhqdq  xmm14, i6
+
+    movdqa      xmm15, xmm5
+    punpcklqdq  xmm5, i7
+    punpckhqdq  xmm15, i7
+
+    movdqa      i0, xmm0
+    movdqa      i1, xmm6
+    movdqa      i2, xmm7
+    movdqa      i3, xmm9
+    movdqa      i4, xmm4
+    movdqa      i5, xmm10
+    movdqa      i6, xmm3
+    movdqa      i7, xmm11
+    movdqa      i8, xmm1
+    movdqa      i9, xmm12
+    movdqa      i10, xmm8
+    movdqa      i11, xmm13
+    movdqa      i12, xmm2
+    movdqa      i13, xmm14
+    movdqa      i14, xmm5
+    movdqa      i15, xmm15
+
+; TRANSPOSED DATA AVAILABLE ON THE STACK
+
+    movdqa      xmm12, xmm6             ; copy p2/p1 (i1/i2) aside: xmm6 and
+    movdqa      xmm13, xmm7             ; xmm7 are about to become scratch2/zero
+
+    pxor        zero, zero
+
+LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
+
+    movdqa      xmm1, i2                ; p1
+    movdqa      xmm2, i3                ; p0
+    movdqa      xmm8, i4                ; q0
+    movdqa      xmm9, i5                ; q1
+LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
+    movdqa      i2, xmm1                ; write back the filtered pixels
+    movdqa      i3, xmm2
+
+; second set
+    movdqa      i4, xmm8                ; write back the filtered pixels
+    movdqa      i5, xmm9
+
+    movdqa      xmm0, i6                ; p1
+    movdqa      xmm1, i7                ; p0
+    movdqa      xmm2, i8                ; q0
+    movdqa      xmm4, i9                ; q1
+    movdqa      xmm10, i10              ; q2; ends up holding abs(p1 - p0)
+                                        ; of the next edge
+    movdqa      xmm11, i11              ; q3
+LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
+
+    movdqa      xmm0, i6
+    movdqa      xmm1, i7
+    movdqa      xmm3, i8
+    movdqa      xmm4, i9
+LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
+    movdqa      i6, xmm0
+    movdqa      i7, xmm1
+
+; last set
+    movdqa      i8, xmm3                ; write back the filtered pixels
+    movdqa      i9, xmm4
+
+    movdqa      xmm0, i10               ; p1
+    movdqa      xmm1, i11               ; p0
+    movdqa      xmm2, i12               ; q0
+    movdqa      xmm8, i13               ; q1
+    movdqa      xmm9, i14               ; q2
+    movdqa      xmm11, i15              ; q3
+LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
+
+    movdqa      xmm0, i10
+    movdqa      xmm1, i11
+    movdqa      xmm4, i12
+    movdqa      xmm8, i13
+LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
+    movdqa      i10, xmm0
+    movdqa      i11, xmm1
+    movdqa      i12, xmm4
+    movdqa      i13, xmm8
+
+
+; RESHUFFLE AND WRITE OUT
+    ; 8-f
+    movdqa      xmm0, i8
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, i9                ; 80 90
+    punpckhbw   xmm1, i9                ; 88 98
+
+    movdqa      xmm2, i10
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, i11               ; a0 b0
+    punpckhbw   xmm3, i11               ; a8 b8
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
+    punpckhwd   xmm4, xmm2              ; 84 94 a4 b4
+
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
+    punpckhwd   xmm2, xmm3              ; 8c 9c ac bc
+
+    ; using xmm[0124]
+    ; work on next 4 rows
+
+    movdqa      xmm3, i12
+    movdqa      xmm5, xmm3
+    punpcklbw   xmm3, i13               ; c0 d0
+    punpckhbw   xmm5, i13               ; c8 d8
+
+    movdqa      xmm6, i14
+    movdqa      xmm7, xmm6
+    punpcklbw   xmm6, i15               ; e0 f0
+    punpckhbw   xmm7, i15               ; e8 f8
+
+    movdqa      xmm8, xmm3
+    punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
+    punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4
+
+    movdqa      xmm6, xmm5
+    punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
+    punpckhwd   xmm6, xmm7              ; cc dc ec fc
+
+    ; pull the third and fourth sets together
+
+    movdqa      xmm7, xmm0
+    punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
+    punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2
+
+    movdqa      xmm3, xmm4
+    punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
+    punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6
+
+    movdqa      xmm8, xmm1
+    punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
+    punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa
+
+    movdqa      xmm5, xmm2
+    punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
+    punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe
+
+    ; save the calculations. we only have 15 registers ...
+    movdqa      i8, xmm0
+    movdqa      i9, xmm7
+    movdqa      i10, xmm4
+    movdqa      i11, xmm3
+    movdqa      i12, xmm1
+    movdqa      i13, xmm8
+    movdqa      i14, xmm2
+    movdqa      i15, xmm5
+
+    ; 0-7
+    movdqa      xmm0, i0
+    movdqa      xmm1, xmm0
+    punpcklbw   xmm0, i1                ; 00 10
+    punpckhbw   xmm1, i1                ; 08 18
+
+    movdqa      xmm2, i2
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, i3                ; 20 30
+    punpckhbw   xmm3, i3                ; 28 38
+
+    movdqa      xmm4, xmm0
+    punpcklwd   xmm0, xmm2              ; 00 10 20 30
+    punpckhwd   xmm4, xmm2              ; 04 14 24 34
+
+    movdqa      xmm2, xmm1
+    punpcklwd   xmm1, xmm3              ; 08 18 28 38
+    punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c
+
+    ; using xmm[0124]
+    ; work on next 4 rows
+
+    movdqa      xmm3, i4
+    movdqa      xmm5, xmm3
+    punpcklbw   xmm3, i5                ; 40 50
+    punpckhbw   xmm5, i5                ; 48 58
+
+    movdqa      xmm6, i6
+    movdqa      xmm7, xmm6
+    punpcklbw   xmm6, i7                ; 60 70
+    punpckhbw   xmm7, i7                ; 68 78
+
+    movdqa      xmm8, xmm3
+    punpcklwd   xmm3, xmm6              ; 40 50 60 70
+    punpckhwd   xmm8, xmm6              ; 44 54 64 74
+
+    movdqa      xmm6, xmm5
+    punpcklwd   xmm5, xmm7              ; 48 58 68 78
+    punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c
+
+    ; pull the first two sets together
+
+    movdqa      xmm7, xmm0
+    punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
+    punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72
+
+    movdqa      xmm3, xmm4
+    punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
+    punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76
+
+    movdqa      xmm8, xmm1
+    punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
+    punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a
+
+    movdqa      xmm5, xmm2
+    punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
+    punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e
+    ; final combination
+
+    movdqa      xmm6, xmm0
+    punpcklqdq  xmm0, i8
+    punpckhqdq  xmm6, i8
+
+    movdqa      xmm9, xmm7
+    punpcklqdq  xmm7, i9
+    punpckhqdq  xmm9, i9
+
+    movdqa      xmm10, xmm4
+    punpcklqdq  xmm4, i10
+    punpckhqdq  xmm10, i10
+
+    movdqa      xmm11, xmm3
+    punpcklqdq  xmm3, i11
+    punpckhqdq  xmm11, i11
+
+    movdqa      xmm12, xmm1
+    punpcklqdq  xmm1, i12
+    punpckhqdq  xmm12, i12
+
+    movdqa      xmm13, xmm8
+    punpcklqdq  xmm8, i13
+    punpckhqdq  xmm13, i13
+
+    movdqa      xmm14, xmm2
+    punpcklqdq  xmm2, i14
+    punpckhqdq  xmm14, i14
+
+    movdqa      xmm15, xmm5
+    punpcklqdq  xmm5, i15
+    punpckhqdq  xmm15, i15
+
+    movdqa      s0, xmm0
+    movdqa      s1, xmm6
+    movdqa      s2, xmm7
+    movdqa      s3, xmm9
+    movdqa      s4, xmm4
+    movdqa      s5, xmm10
+    movdqa      s6, xmm3
+    movdqa      s7, xmm11
+    movdqa      s8, xmm1
+    movdqa      s9, xmm12
+    movdqa      s10, xmm8
+    movdqa      s11, xmm13
+    movdqa      s12, xmm2
+    movdqa      s13, xmm14
+    movdqa      s14, xmm5
+    movdqa      s15, xmm15
+
+    ; free stack space
+    add         rsp, stack_size
+
+    ; un-ALIGN_STACK
+    pop         rsp
+
+%if LIBVPX_YASM_WIN64
+    pop         r13
+    pop         r12
+    RESTORE_XMM
+    pop         rbp
+%endif
+
+    ret
+
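+; Byte-broadcast constants: t80 toggles bytes between the unsigned and
+; signed representations, t1/t3/t4 are the filter rounding taps, and
+; tfe/t1f/t7f/te0 are the masks used to emulate per-byte shifts with
+; word-sized psrlw.
+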
+SECTION_RODATA
+align 16
+te0:
+    times 16 db 0xe0
+align 16
+t7f:
+    times 16 db 0x7f
+align 16
+tfe:
+    times 16 db 0xfe
+align 16
+t1f:
+    times 16 db 0x1f
+align 16
+t80:
+    times 16 db 0x80
+align 16
+t1:
+    times 16 db 0x01
+align 16
+t3:
+    times 16 db 0x03
+align 16
+t4:
+    times 16 db 0x04
|