Index: source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm
===================================================================
--- source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm (revision 278778)
+++ source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm (working copy)
@@ -1,815 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro LF_ABS 2
-    ; %1 value not preserved
-    ; %2 value preserved
-    ; output in %1
-    movdqa scratch1, %2 ; v2
-
-    psubusb scratch1, %1 ; v2 - v1
-    psubusb %1, %2 ; v1 - v2
-    por %1, scratch1 ; abs(v2 - v1)
-%endmacro
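
For reference, LF_ABS builds a byte-wise absolute difference out of two unsigned saturating subtractions: each psubusb clamps its negative direction to zero, so exactly one of the two one-sided differences survives and the por recovers |v1 - v2|. A minimal scalar sketch of the same trick (names are illustrative, not part of the source):

    #include <stdint.h>

    /* Scalar model of LF_ABS: unsigned saturating subtraction clamps at 0,
     * so one operand of the OR is always zero and the other is |v1 - v2|. */
    static uint8_t lf_abs(uint8_t v1, uint8_t v2) {
        uint8_t a = (uint8_t)(v2 > v1 ? v2 - v1 : 0); /* psubusb scratch1, %1 */
        uint8_t b = (uint8_t)(v1 > v2 ? v1 - v2 : 0); /* psubusb %1, %2 */
        return (uint8_t)(a | b);                      /* por */
    }
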
-
-%macro LF_FILTER_HEV_MASK 8-9
-
-    LF_ABS %1, %2 ; abs(p3 - p2)
-    LF_ABS %2, %3 ; abs(p2 - p1)
-    pmaxub %1, %2 ; accumulate mask
-%if %0 == 8
-    movdqa scratch2, %3 ; save p1
-    LF_ABS scratch2, %4 ; abs(p1 - p0)
-%endif
-    LF_ABS %4, %5 ; abs(p0 - q0)
-    LF_ABS %5, %6 ; abs(q0 - q1)
-%if %0 == 8
-    pmaxub %5, scratch2 ; accumulate hev
-%else
-    pmaxub %5, %9
-%endif
-    pmaxub %1, %5 ; accumulate mask
-
-    LF_ABS %3, %6 ; abs(p1 - q1)
-    LF_ABS %6, %7 ; abs(q1 - q2)
-    pmaxub %1, %6 ; accumulate mask
-    LF_ABS %7, %8 ; abs(q2 - q3)
-    pmaxub %1, %7 ; accumulate mask
-
-    paddusb %4, %4 ; 2 * abs(p0 - q0)
-    pand %3, [GLOBAL(tfe)]
-    psrlw %3, 1 ; abs(p1 - q1) / 2
-    paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2
-
-    psubusb %1, [limit]
-    psubusb %4, [blimit]
-    por %1, %4
-    pcmpeqb %1, zero ; mask
-
-    psubusb %5, [thresh]
-    pcmpeqb %5, zero ; ~hev
-%endmacro
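
In words: the macro takes p3..p0 above the edge and q0..q3 below it and produces two byte masks. mask is all-ones for a column when every neighbouring difference is within limit and abs(p0 - q0) * 2 + abs(p1 - q1) / 2 is within blimit; the %5 output is ~hev, the complement of the "high edge variance" flag that trips when abs(p1 - p0) or abs(q1 - q0) exceeds thresh. The pand with tfe before psrlw 1 clears each byte's low bit so the word-sized shift cannot leak a bit across byte lanes; that is how a per-byte halving is done without a byte shift instruction. A scalar sketch for one column, assuming the lf_abs helper above:

    /* Scalar model of LF_FILTER_HEV_MASK for one pixel column.
     * p[0] is p0 (adjacent to the edge) ... p[3] is p3; same for q. */
    static void filter_mask_hev(const uint8_t p[4], const uint8_t q[4],
                                uint8_t blimit, uint8_t limit, uint8_t thresh,
                                int *mask, int *hev) {
        uint8_t m = lf_abs(p[3], p[2]);
        if (lf_abs(p[2], p[1]) > m) m = lf_abs(p[2], p[1]); /* pmaxub */
        if (lf_abs(p[1], p[0]) > m) m = lf_abs(p[1], p[0]);
        if (lf_abs(q[0], q[1]) > m) m = lf_abs(q[0], q[1]);
        if (lf_abs(q[1], q[2]) > m) m = lf_abs(q[1], q[2]);
        if (lf_abs(q[2], q[3]) > m) m = lf_abs(q[2], q[3]);
        *mask = (m <= limit) &&
                (lf_abs(p[0], q[0]) * 2 + lf_abs(p[1], q[1]) / 2 <= blimit);
        *hev = (lf_abs(p[1], p[0]) > thresh) || (lf_abs(q[0], q[1]) > thresh);
    }
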
-
-%macro LF_FILTER 6
-    ; %1-%4: p1-q1
-    ; %5: mask
-    ; %6: hev
-
-    movdqa scratch2, %6 ; save hev
-
-    pxor %1, [GLOBAL(t80)] ; ps1
-    pxor %4, [GLOBAL(t80)] ; qs1
-    movdqa scratch1, %1
-    psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1)
-    pandn scratch2, scratch1 ; vp8_filter &= hev
-
-    pxor %2, [GLOBAL(t80)] ; ps0
-    pxor %3, [GLOBAL(t80)] ; qs0
-    movdqa scratch1, %3
-    psubsb scratch1, %2 ; qs0 - ps0
-    paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
-    paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
-    paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0)
-    pand %5, scratch2 ; &= mask
-
-    movdqa scratch2, %5
-    paddsb %5, [GLOBAL(t4)] ; Filter1
-    paddsb scratch2, [GLOBAL(t3)] ; Filter2
-
-    ; Filter1 >> 3
-    movdqa scratch1, zero
-    pcmpgtb scratch1, %5
-    psrlw %5, 3
-    pand scratch1, [GLOBAL(te0)]
-    pand %5, [GLOBAL(t1f)]
-    por %5, scratch1
-
-    psubsb %3, %5 ; qs0 - Filter1
-    pxor %3, [GLOBAL(t80)]
-
-    ; Filter2 >> 3
-    movdqa scratch1, zero
-    pcmpgtb scratch1, scratch2
-    psrlw scratch2, 3
-    pand scratch1, [GLOBAL(te0)]
-    pand scratch2, [GLOBAL(t1f)]
-    por scratch2, scratch1
-
-    paddsb %2, scratch2 ; ps0 + Filter2
-    pxor %2, [GLOBAL(t80)]
-
-    ; outer tap adjustments
-    paddsb %5, [GLOBAL(t1)]
-    movdqa scratch1, zero
-    pcmpgtb scratch1, %5
-    psrlw %5, 1
-    pand scratch1, [GLOBAL(t80)]
-    pand %5, [GLOBAL(t7f)]
-    por %5, scratch1
-    pand %5, %6 ; vp8_filter &= ~hev
-
-    psubsb %4, %5 ; qs1 - vp8_filter
-    pxor %4, [GLOBAL(t80)]
-
-    paddsb %1, %5 ; ps1 + vp8_filter
-    pxor %1, [GLOBAL(t80)]
-%endmacro
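
The filter works on sign-biased pixels: xor with t80 flips the top bit, mapping 0..255 onto -128..127 so that psubsb/paddsb implement the bitstream's signed_char_clamp arithmetic directly. SSE2 has no per-byte arithmetic shift, so "Filter1 >> 3" is emulated: psrlw shifts whole words, t1f masks off the bits that crossed in from the neighbouring byte, and te0 re-plants the sign bits captured by the earlier pcmpgtb (t7f and t80 play the same roles for the >> 1 of the outer taps). A scalar sketch of the whole update, assuming C's >> acts as an arithmetic shift on negative values:

    #include <stdint.h>

    static int8_t clamp8(int v) {
        return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
    }

    /* Scalar model of LF_FILTER for one column; ps1, ps0, qs0, qs1 are
     * the sign-biased values (pixel ^ 0x80 reinterpreted as signed). */
    static void lf_filter(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                          int mask, int hev) {
        int8_t f = clamp8(*ps1 - *qs1);     /* psubsb */
        if (!hev) f = 0;                    /* pandn: vp8_filter &= hev */
        int8_t d = clamp8(*qs0 - *ps0);     /* psubsb scratch1, %2 */
        f = clamp8(f + d);                  /* three paddsb steps */
        f = clamp8(f + d);
        f = clamp8(f + d);
        if (!mask) f = 0;                   /* pand: vp8_filter &= mask */

        int8_t f1 = (int8_t)(clamp8(f + 4) >> 3);  /* Filter1 */
        int8_t f2 = (int8_t)(clamp8(f + 3) >> 3);  /* Filter2 */
        *qs0 = clamp8(*qs0 - f1);
        *ps0 = clamp8(*ps0 + f2);

        f = (int8_t)(clamp8(f1 + 1) >> 1);  /* outer tap adjustment */
        if (hev) f = 0;                     /* vp8_filter &= ~hev */
        *qs1 = clamp8(*qs1 - f);
        *ps1 = clamp8(*ps1 + f);            /* asm xors 0x80 back after */
    }
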
-
-;void vp8_loop_filter_bh_y_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
-sym(vp8_loop_filter_bh_y_sse2):
-
-%if LIBVPX_YASM_WIN64
-    %define src rcx ; src_ptr
-    %define stride rdx ; src_pixel_step
-    %define blimit r8
-    %define limit r9
-    %define thresh r10
-
-    %define spp rax
-    %define stride3 r11
-    %define stride5 r12
-    %define stride7 r13
-
-    push rbp
-    mov rbp, rsp
-    SAVE_XMM 11
-    push r12
-    push r13
-    mov thresh, arg(4)
-%else
-    %define src rdi ; src_ptr
-    %define stride rsi ; src_pixel_step
-    %define blimit rdx
-    %define limit rcx
-    %define thresh r8
-
-    %define spp rax
-    %define stride3 r9
-    %define stride5 r10
-    %define stride7 r11
-%endif
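
The two %define blocks pin the same names to different registers because the calling conventions differ: Win64 passes the first four integer arguments in rcx, rdx, r8, r9 (thresh, the fifth, has to be fetched from the stack via arg(4), and r12/r13 plus xmm6 and up are callee-saved, hence the pushes and SAVE_XMM), while the System V AMD64 ABI on the %else path passes them in rdi, rsi, rdx, rcx, r8. The C signature being implemented, per the comment above:

    void vp8_loop_filter_bh_y_sse2(unsigned char *src_ptr, int src_pixel_step,
                                   const char *blimit, const char *limit,
                                   const char *thresh);
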
-
-    %define scratch1 xmm5
-    %define scratch2 xmm6
-    %define zero xmm7
-
-    %define i0 [src]
-    %define i1 [spp]
-    %define i2 [src + 2 * stride]
-    %define i3 [spp + 2 * stride]
-    %define i4 [src + 4 * stride]
-    %define i5 [spp + 4 * stride]
-    %define i6 [src + 2 * stride3]
-    %define i7 [spp + 2 * stride3]
-    %define i8 [src + 8 * stride]
-    %define i9 [spp + 8 * stride]
-    %define i10 [src + 2 * stride5]
-    %define i11 [spp + 2 * stride5]
-    %define i12 [src + 4 * stride3]
-    %define i13 [spp + 4 * stride3]
-    %define i14 [src + 2 * stride7]
-    %define i15 [spp + 2 * stride7]
-
-    ; prep work
-    lea spp, [src + stride]
-    lea stride3, [stride + 2 * stride]
-    lea stride5, [stride3 + 2 * stride]
-    lea stride7, [stride3 + 4 * stride]
-    pxor zero, zero
-
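
The i0..i15 defines cover all sixteen rows of the block with only a few precomputed multiples: spp is src + stride, and stride3/5/7 hold the odd stride multiples that x86 addressing (with its x2/x4/x8 scale factors) cannot reach in one step. A sketch of the equivalent pointer math (illustrative helper, not part of the source):

    /* Row n sits at src + n * stride; even n uses src as the base,
     * odd n uses spp = src + stride. For example, i6 = src + 2 * stride3
     * and i13 = spp + 4 * stride3 = src + 13 * stride. */
    static unsigned char *row_ptr(unsigned char *src, int stride, int n) {
        unsigned char *spp = src + stride;  /* lea spp, [src + stride] */
        return (n & 1) ? spp + (n - 1) * stride : src + n * stride;
    }
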
-    ; load the first set into registers
-    movdqa xmm0, i0
-    movdqa xmm1, i1
-    movdqa xmm2, i2
-    movdqa xmm3, i3
-    movdqa xmm4, i4
-    movdqa xmm8, i5
-    movdqa xmm9, i6 ; q2, will contain abs(p1-p0)
-    movdqa xmm10, i7
-    LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10
-
-    movdqa xmm1, i2
-    movdqa xmm2, i3
-    movdqa xmm3, i4
-    movdqa xmm8, i5
-    LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
-    movdqa i2, xmm1
-    movdqa i3, xmm2
-
-; second set
-    movdqa i4, xmm3
-    movdqa i5, xmm8
-
-    movdqa xmm0, i6
-    movdqa xmm1, i7
-    movdqa xmm2, i8
-    movdqa xmm4, i9
-    movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
-    movdqa xmm11, i11
-    LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9
-
-    movdqa xmm0, i6
-    movdqa xmm1, i7
-    movdqa xmm4, i8
-    movdqa xmm8, i9
-    LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
-    movdqa i6, xmm0
-    movdqa i7, xmm1
-
-; last set
-    movdqa i8, xmm4
-    movdqa i9, xmm8
-
-    movdqa xmm0, i10
-    movdqa xmm1, i11
-    movdqa xmm2, i12
-    movdqa xmm3, i13
-    movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
-    movdqa xmm11, i15
-    LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10
-
-    movdqa xmm0, i10
-    movdqa xmm1, i11
-    movdqa xmm3, i12
-    movdqa xmm8, i13
-    LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
-    movdqa i10, xmm0
-    movdqa i11, xmm1
-    movdqa i12, xmm3
-    movdqa i13, xmm8
-
-%if LIBVPX_YASM_WIN64
-    pop r13
-    pop r12
-    RESTORE_XMM
-    pop rbp
-%endif
-
-    ret
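
Taken together, the three passes filter the three internal horizontal edges of the 16-row luma block, between rows 3|4, 7|8 and 11|12: each LF_FILTER_HEV_MASK reads p3..q3 around one edge for all 16 columns at once, and each LF_FILTER writes back the four rows p1, p0, q0, q1 (i2..i5 for the first edge, i6..i9 for the second, i10..i13 for the third). An outline in scalar form, with filter_edge() as a hypothetical stand-in for the mask-plus-filter step on one column:

    /* Hypothetical per-column routine combining the two macros above. */
    void filter_edge(unsigned char *q0_ptr, int stride, unsigned char blimit,
                     unsigned char limit, unsigned char thresh);

    static void loop_filter_bh_outline(unsigned char *src, int stride,
                                       unsigned char blimit,
                                       unsigned char limit,
                                       unsigned char thresh) {
        static const int q0_rows[3] = { 4, 8, 12 }; /* one pass per edge */
        for (int e = 0; e < 3; ++e)
            for (int col = 0; col < 16; ++col)  /* asm does all 16 at once */
                filter_edge(src + q0_rows[e] * stride + col, stride,
                            blimit, limit, thresh);
    }
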
-
-
-;void vp8_loop_filter_bv_y_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-; const char *limit,
-; const char *thresh
-;)
-
-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
-sym(vp8_loop_filter_bv_y_sse2):
-
-%if LIBVPX_YASM_WIN64
-    %define src rcx ; src_ptr
-    %define stride rdx ; src_pixel_step
-    %define blimit r8
-    %define limit r9
-    %define thresh r10
-
-    %define spp rax
-    %define stride3 r11
-    %define stride5 r12
-    %define stride7 r13
-
-    push rbp
-    mov rbp, rsp
-    SAVE_XMM 15
-    push r12
-    push r13
-    mov thresh, arg(4)
-%else
-    %define src rdi
-    %define stride rsi
-    %define blimit rdx
-    %define limit rcx
-    %define thresh r8
-
-    %define spp rax
-    %define stride3 r9
-    %define stride5 r10
-    %define stride7 r11
-%endif
-
-    %define scratch1 xmm5
-    %define scratch2 xmm6
-    %define zero xmm7
-
-    %define s0 [src]
-    %define s1 [spp]
-    %define s2 [src + 2 * stride]
-    %define s3 [spp + 2 * stride]
-    %define s4 [src + 4 * stride]
-    %define s5 [spp + 4 * stride]
-    %define s6 [src + 2 * stride3]
-    %define s7 [spp + 2 * stride3]
-    %define s8 [src + 8 * stride]
-    %define s9 [spp + 8 * stride]
-    %define s10 [src + 2 * stride5]
-    %define s11 [spp + 2 * stride5]
-    %define s12 [src + 4 * stride3]
-    %define s13 [spp + 4 * stride3]
-    %define s14 [src + 2 * stride7]
-    %define s15 [spp + 2 * stride7]
-
-    %define i0 [rsp]
-    %define i1 [rsp + 16]
-    %define i2 [rsp + 32]
-    %define i3 [rsp + 48]
-    %define i4 [rsp + 64]
-    %define i5 [rsp + 80]
-    %define i6 [rsp + 96]
-    %define i7 [rsp + 112]
-    %define i8 [rsp + 128]
-    %define i9 [rsp + 144]
-    %define i10 [rsp + 160]
-    %define i11 [rsp + 176]
-    %define i12 [rsp + 192]
-    %define i13 [rsp + 208]
-    %define i14 [rsp + 224]
-    %define i15 [rsp + 240]
-
-    ALIGN_STACK 16, rax
-
-    ; reserve stack space
-    %define temp_storage 0 ; size is 256 (16*16)
-    %define stack_size 256
-    sub rsp, stack_size
-
-    ; prep work
-    lea spp, [src + stride]
-    lea stride3, [stride + 2 * stride]
-    lea stride5, [stride3 + 2 * stride]
-    lea stride7, [stride3 + 4 * stride]
-
-    ; 8-f
-    movdqa xmm0, s8
-    movdqa xmm1, xmm0
-    punpcklbw xmm0, s9 ; 80 90
-    punpckhbw xmm1, s9 ; 88 98
-
-    movdqa xmm2, s10
-    movdqa xmm3, xmm2
-    punpcklbw xmm2, s11 ; a0 b0
-    punpckhbw xmm3, s11 ; a8 b8
-
-    movdqa xmm4, xmm0
-    punpcklwd xmm0, xmm2 ; 80 90 a0 b0
-    punpckhwd xmm4, xmm2 ; 84 94 a4 b4
-
-    movdqa xmm2, xmm1
-    punpcklwd xmm1, xmm3 ; 88 98 a8 b8
-    punpckhwd xmm2, xmm3 ; 8c 9c ac bc
-
-    ; using xmm[0124]
-    ; work on next 4 rows
-
-    movdqa xmm3, s12
-    movdqa xmm5, xmm3
-    punpcklbw xmm3, s13 ; c0 d0
-    punpckhbw xmm5, s13 ; c8 d8
-
-    movdqa xmm6, s14
-    movdqa xmm7, xmm6
-    punpcklbw xmm6, s15 ; e0 f0
-    punpckhbw xmm7, s15 ; e8 f8
-
-    movdqa xmm8, xmm3
-    punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
-    punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
-
-    movdqa xmm6, xmm5
-    punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
-    punpckhwd xmm6, xmm7 ; cc dc ec fc
-
-    ; pull the third and fourth sets together
-
-    movdqa xmm7, xmm0
-    punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
-    punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
-
-    movdqa xmm3, xmm4
-    punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
-    punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
-
-    movdqa xmm8, xmm1
-    punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
-    punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
-
-    movdqa xmm5, xmm2
-    punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
-    punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
-
-    ; save the calculations. we only have 15 registers ...
-    movdqa i0, xmm0
-    movdqa i1, xmm7
-    movdqa i2, xmm4
-    movdqa i3, xmm3
-    movdqa i4, xmm1
-    movdqa i5, xmm8
-    movdqa i6, xmm2
-    movdqa i7, xmm5
-
-    ; 0-7
-    movdqa xmm0, s0
-    movdqa xmm1, xmm0
-    punpcklbw xmm0, s1 ; 00 10
-    punpckhbw xmm1, s1 ; 08 18
-
-    movdqa xmm2, s2
-    movdqa xmm3, xmm2
-    punpcklbw xmm2, s3 ; 20 30
-    punpckhbw xmm3, s3 ; 28 38
-
-    movdqa xmm4, xmm0
-    punpcklwd xmm0, xmm2 ; 00 10 20 30
-    punpckhwd xmm4, xmm2 ; 04 14 24 34
-
-    movdqa xmm2, xmm1
-    punpcklwd xmm1, xmm3 ; 08 18 28 38
-    punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
-
-    ; using xmm[0124]
-    ; work on next 4 rows
-
-    movdqa xmm3, s4
-    movdqa xmm5, xmm3
-    punpcklbw xmm3, s5 ; 40 50
-    punpckhbw xmm5, s5 ; 48 58
-
-    movdqa xmm6, s6
-    movdqa xmm7, xmm6
-    punpcklbw xmm6, s7 ; 60 70
-    punpckhbw xmm7, s7 ; 68 78
-
-    movdqa xmm8, xmm3
-    punpcklwd xmm3, xmm6 ; 40 50 60 70
-    punpckhwd xmm8, xmm6 ; 44 54 64 74
-
-    movdqa xmm6, xmm5
-    punpcklwd xmm5, xmm7 ; 48 58 68 78
-    punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
-
-    ; pull the first two sets together
-
-    movdqa xmm7, xmm0
-    punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
-    punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
-
-    movdqa xmm3, xmm4
-    punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
-    punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
-
-    movdqa xmm8, xmm1
-    punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
-    punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
-
-    movdqa xmm5, xmm2
-    punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
-    punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
-    ; final combination
-
-    movdqa xmm6, xmm0
-    punpcklqdq xmm0, i0
-    punpckhqdq xmm6, i0
-
-    movdqa xmm9, xmm7
-    punpcklqdq xmm7, i1
-    punpckhqdq xmm9, i1
-
-    movdqa xmm10, xmm4
-    punpcklqdq xmm4, i2
-    punpckhqdq xmm10, i2
-
-    movdqa xmm11, xmm3
-    punpcklqdq xmm3, i3
-    punpckhqdq xmm11, i3
-
-    movdqa xmm12, xmm1
-    punpcklqdq xmm1, i4
-    punpckhqdq xmm12, i4
-
-    movdqa xmm13, xmm8
-    punpcklqdq xmm8, i5
-    punpckhqdq xmm13, i5
-
-    movdqa xmm14, xmm2
-    punpcklqdq xmm2, i6
-    punpckhqdq xmm14, i6
-
-    movdqa xmm15, xmm5
-    punpcklqdq xmm5, i7
-    punpckhqdq xmm15, i7
-
-    movdqa i0, xmm0
-    movdqa i1, xmm6
-    movdqa i2, xmm7
-    movdqa i3, xmm9
-    movdqa i4, xmm4
-    movdqa i5, xmm10
-    movdqa i6, xmm3
-    movdqa i7, xmm11
-    movdqa i8, xmm1
-    movdqa i9, xmm12
-    movdqa i10, xmm8
-    movdqa i11, xmm13
-    movdqa i12, xmm2
-    movdqa i13, xmm14
-    movdqa i14, xmm5
-    movdqa i15, xmm15
-
-; TRANSPOSED DATA AVAILABLE ON THE STACK
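
The vertical-edge filter reuses the horizontal machinery by transposing the 16x16 block first: punpcklbw/punpckhbw interleave bytes of row pairs, punpcklwd/punpckhwd and punpckldq/punpckhdq widen the interleave, and punpcklqdq/punpckhqdq finish it, so after four stages output row n holds input column n. The intermediates are parked on the stack (i0..i15) because the shuffle keeps more values live than there are registers. A plain C reference for the net effect:

    /* What the punpck cascade computes: out[r][c] = in[c][r]. The asm
     * reaches this in log2(16) = 4 interleave stages rather than one
     * element at a time. */
    static void transpose_16x16(const unsigned char in[16][16],
                                unsigned char out[16][16]) {
        for (int r = 0; r < 16; ++r)
            for (int c = 0; c < 16; ++c)
                out[r][c] = in[c][r];
    }

After filtering, the "RESHUFFLE AND WRITE OUT" block below runs the same cascade again to transpose the data back before storing it to s0..s15.
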
-
-    movdqa xmm12, xmm6
-    movdqa xmm13, xmm7
-
-    pxor zero, zero
-
-    LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11
-
-    movdqa xmm1, i2
-    movdqa xmm2, i3
-    movdqa xmm8, i4
-    movdqa xmm9, i5
-    LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
-    movdqa i2, xmm1
-    movdqa i3, xmm2
-
-; second set
-    movdqa i4, xmm8
-    movdqa i5, xmm9
-
-    movdqa xmm0, i6
-    movdqa xmm1, i7
-    movdqa xmm2, i8
-    movdqa xmm4, i9
-    movdqa xmm10, i10 ; q2, will contain abs(p1-p0)
-    movdqa xmm11, i11
-    LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3
-
-    movdqa xmm0, i6
-    movdqa xmm1, i7
-    movdqa xmm3, i8
-    movdqa xmm4, i9
-    LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
-    movdqa i6, xmm0
-    movdqa i7, xmm1
-
-; last set
-    movdqa i8, xmm3
-    movdqa i9, xmm4
-
-    movdqa xmm0, i10
-    movdqa xmm1, i11
-    movdqa xmm2, i12
-    movdqa xmm8, i13
-    movdqa xmm9, i14 ; q2, will contain abs(p1-p0)
-    movdqa xmm11, i15
-    LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10
-
-    movdqa xmm0, i10
-    movdqa xmm1, i11
-    movdqa xmm4, i12
-    movdqa xmm8, i13
-    LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
-    movdqa i10, xmm0
-    movdqa i11, xmm1
-    movdqa i12, xmm4
-    movdqa i13, xmm8
-
-
-; RESHUFFLE AND WRITE OUT
-    ; 8-f
-    movdqa xmm0, i8
-    movdqa xmm1, xmm0
-    punpcklbw xmm0, i9 ; 80 90
-    punpckhbw xmm1, i9 ; 88 98
-
-    movdqa xmm2, i10
-    movdqa xmm3, xmm2
-    punpcklbw xmm2, i11 ; a0 b0
-    punpckhbw xmm3, i11 ; a8 b8
-
-    movdqa xmm4, xmm0
-    punpcklwd xmm0, xmm2 ; 80 90 a0 b0
-    punpckhwd xmm4, xmm2 ; 84 94 a4 b4
-
-    movdqa xmm2, xmm1
-    punpcklwd xmm1, xmm3 ; 88 98 a8 b8
-    punpckhwd xmm2, xmm3 ; 8c 9c ac bc
-
-    ; using xmm[0124]
-    ; work on next 4 rows
-
-    movdqa xmm3, i12
-    movdqa xmm5, xmm3
-    punpcklbw xmm3, i13 ; c0 d0
-    punpckhbw xmm5, i13 ; c8 d8
-
-    movdqa xmm6, i14
-    movdqa xmm7, xmm6
-    punpcklbw xmm6, i15 ; e0 f0
-    punpckhbw xmm7, i15 ; e8 f8
-
-    movdqa xmm8, xmm3
-    punpcklwd xmm3, xmm6 ; c0 d0 e0 f0
-    punpckhwd xmm8, xmm6 ; c4 d4 e4 f4
-
-    movdqa xmm6, xmm5
-    punpcklwd xmm5, xmm7 ; c8 d8 e8 f8
-    punpckhwd xmm6, xmm7 ; cc dc ec fc
-
-    ; pull the third and fourth sets together
-
-    movdqa xmm7, xmm0
-    punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0
-    punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2
-
-    movdqa xmm3, xmm4
-    punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4
-    punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6
-
-    movdqa xmm8, xmm1
-    punpckldq xmm1, xmm5 ; 88 98 a8 b8 c8 d8 e8 f8
-    punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa
-
-    movdqa xmm5, xmm2
-    punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc
-    punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe
-
-    ; save the calculations. we only have 15 registers ...
-    movdqa i8, xmm0
-    movdqa i9, xmm7
-    movdqa i10, xmm4
-    movdqa i11, xmm3
-    movdqa i12, xmm1
-    movdqa i13, xmm8
-    movdqa i14, xmm2
-    movdqa i15, xmm5
-
-    ; 0-7
-    movdqa xmm0, i0
-    movdqa xmm1, xmm0
-    punpcklbw xmm0, i1 ; 00 10
-    punpckhbw xmm1, i1 ; 08 18
-
-    movdqa xmm2, i2
-    movdqa xmm3, xmm2
-    punpcklbw xmm2, i3 ; 20 30
-    punpckhbw xmm3, i3 ; 28 38
-
-    movdqa xmm4, xmm0
-    punpcklwd xmm0, xmm2 ; 00 10 20 30
-    punpckhwd xmm4, xmm2 ; 04 14 24 34
-
-    movdqa xmm2, xmm1
-    punpcklwd xmm1, xmm3 ; 08 18 28 38
-    punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c
-
-    ; using xmm[0124]
-    ; work on next 4 rows
-
-    movdqa xmm3, i4
-    movdqa xmm5, xmm3
-    punpcklbw xmm3, i5 ; 40 50
-    punpckhbw xmm5, i5 ; 48 58
-
-    movdqa xmm6, i6
-    movdqa xmm7, xmm6
-    punpcklbw xmm6, i7 ; 60 70
-    punpckhbw xmm7, i7 ; 68 78
-
-    movdqa xmm8, xmm3
-    punpcklwd xmm3, xmm6 ; 40 50 60 70
-    punpckhwd xmm8, xmm6 ; 44 54 64 74
-
-    movdqa xmm6, xmm5
-    punpcklwd xmm5, xmm7 ; 48 58 68 78
-    punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c
-
-    ; pull the first two sets together
-
-    movdqa xmm7, xmm0
-    punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70
-    punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72
-
-    movdqa xmm3, xmm4
-    punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74
-    punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76
-
-    movdqa xmm8, xmm1
-    punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78
-    punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a
-
-    movdqa xmm5, xmm2
-    punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c
-    punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e
-    ; final combination
-
-    movdqa xmm6, xmm0
-    punpcklqdq xmm0, i8
-    punpckhqdq xmm6, i8
-
-    movdqa xmm9, xmm7
-    punpcklqdq xmm7, i9
-    punpckhqdq xmm9, i9
-
-    movdqa xmm10, xmm4
-    punpcklqdq xmm4, i10
-    punpckhqdq xmm10, i10
-
-    movdqa xmm11, xmm3
-    punpcklqdq xmm3, i11
-    punpckhqdq xmm11, i11
-
-    movdqa xmm12, xmm1
-    punpcklqdq xmm1, i12
-    punpckhqdq xmm12, i12
-
-    movdqa xmm13, xmm8
-    punpcklqdq xmm8, i13
-    punpckhqdq xmm13, i13
-
-    movdqa xmm14, xmm2
-    punpcklqdq xmm2, i14
-    punpckhqdq xmm14, i14
-
-    movdqa xmm15, xmm5
-    punpcklqdq xmm5, i15
-    punpckhqdq xmm15, i15
-
-    movdqa s0, xmm0
-    movdqa s1, xmm6
-    movdqa s2, xmm7
-    movdqa s3, xmm9
-    movdqa s4, xmm4
-    movdqa s5, xmm10
-    movdqa s6, xmm3
-    movdqa s7, xmm11
-    movdqa s8, xmm1
-    movdqa s9, xmm12
-    movdqa s10, xmm8
-    movdqa s11, xmm13
-    movdqa s12, xmm2
-    movdqa s13, xmm14
-    movdqa s14, xmm5
-    movdqa s15, xmm15
-
-    ; free stack space
-    add rsp, stack_size
-
-    ; un-ALIGN_STACK
-    pop rsp
-
-%if LIBVPX_YASM_WIN64
-    pop r13
-    pop r12
-    RESTORE_XMM
-    pop rbp
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-te0:
-    times 16 db 0xe0
-align 16
-t7f:
-    times 16 db 0x7f
-align 16
-tfe:
-    times 16 db 0xfe
-align 16
-t1f:
-    times 16 db 0x1f
-align 16
-t80:
-    times 16 db 0x80
-align 16
-t1:
-    times 16 db 0x01
-align 16
-t3:
-    times 16 db 0x03
-align 16
-t4:
-    times 16 db 0x04
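
For reference, the constant table maps onto the filter arithmetic as follows (a hedged C restatement; the asm broadcasts each byte across a 16-byte row):

    #include <stdint.h>

    static const uint8_t TE0 = 0xe0; /* sign bits re-planted after the emulated byte >> 3 */
    static const uint8_t T7F = 0x7f; /* low-bit mask for the emulated byte >> 1 */
    static const uint8_t TFE = 0xfe; /* clears low bits so psrlw 1 halves bytes independently */
    static const uint8_t T1F = 0x1f; /* low-bit mask for the emulated byte >> 3 */
    static const uint8_t T80 = 0x80; /* signed/unsigned bias; also the >> 1 sign fixup */
    static const uint8_t T1  = 0x01; /* rounding in the outer tap (Filter1 + 1) >> 1 */
    static const uint8_t T3  = 0x03; /* Filter2 = clamp(filter + 3) >> 3 */
    static const uint8_t T4  = 0x04; /* Filter1 = clamp(filter + 4) >> 3 */
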