Index: source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm |
=================================================================== |
--- source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm (revision 278778) |
+++ source/libvpx/vp8/common/x86/loopfilter_block_sse2.asm (working copy) |
@@ -1,815 +0,0 @@ |
-; |
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
-; |
-; Use of this source code is governed by a BSD-style license |
-; that can be found in the LICENSE file in the root of the source |
-; tree. An additional intellectual property rights grant can be found |
-; in the file PATENTS. All contributing project authors may |
-; be found in the AUTHORS file in the root of the source tree. |
-; |
- |
- |
-%include "vpx_ports/x86_abi_support.asm" |
- |
-%macro LF_ABS 2 |
- ; %1 value not preserved |
- ; %2 value preserved |
- ; output in %1 |
- movdqa scratch1, %2 ; v2 |
- |
- psubusb scratch1, %1 ; v2 - v1 |
- psubusb %1, %2 ; v1 - v2 |
- por %1, scratch1 ; abs(v2 - v1) |
-%endmacro |
- |
-%macro LF_FILTER_HEV_MASK 8-9 |
- |
- LF_ABS %1, %2 ; abs(p3 - p2) |
- LF_ABS %2, %3 ; abs(p2 - p1) |
- pmaxub %1, %2 ; accumulate mask |
-%if %0 == 8 |
- movdqa scratch2, %3 ; save p1 |
- LF_ABS scratch2, %4 ; abs(p1 - p0) |
-%endif |
- LF_ABS %4, %5 ; abs(p0 - q0) |
- LF_ABS %5, %6 ; abs(q0 - q1) |
-%if %0 == 8 |
- pmaxub %5, scratch2 ; accumulate hev |
-%else |
- pmaxub %5, %9 |
-%endif |
- pmaxub %1, %5 ; accumulate mask |
- |
- LF_ABS %3, %6 ; abs(p1 - q1) |
- LF_ABS %6, %7 ; abs(q1 - q2) |
- pmaxub %1, %6 ; accumulate mask |
- LF_ABS %7, %8 ; abs(q2 - q3) |
- pmaxub %1, %7 ; accumulate mask |
- |
- paddusb %4, %4 ; 2 * abs(p0 - q0) |
- pand %3, [GLOBAL(tfe)] |
- psrlw %3, 1 ; abs(p1 - q1) / 2 |
- paddusb %4, %3 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 |
- |
- psubusb %1, [limit] |
- psubusb %4, [blimit] |
- por %1, %4 |
- pcmpeqb %1, zero ; mask |
- |
- psubusb %5, [thresh] |
- pcmpeqb %5, zero ; ~hev |
-%endmacro |
- |
-%macro LF_FILTER 6 |
- ; %1-%4: p1-q1 |
- ; %5: mask |
- ; %6: hev |
- |
- movdqa scratch2, %6 ; save hev |
- |
- pxor %1, [GLOBAL(t80)] ; ps1 |
- pxor %4, [GLOBAL(t80)] ; qs1 |
- movdqa scratch1, %1 |
- psubsb scratch1, %4 ; signed_char_clamp(ps1 - qs1) |
- pandn scratch2, scratch1 ; vp8_filter &= hev |
- |
- pxor %2, [GLOBAL(t80)] ; ps0 |
- pxor %3, [GLOBAL(t80)] ; qs0 |
- movdqa scratch1, %3 |
- psubsb scratch1, %2 ; qs0 - ps0 |
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) |
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) |
- paddsb scratch2, scratch1 ; vp8_filter += (qs0 - ps0) |
- pand %5, scratch2 ; &= mask |
- |
- movdqa scratch2, %5 |
- paddsb %5, [GLOBAL(t4)] ; Filter1 |
- paddsb scratch2, [GLOBAL(t3)] ; Filter2 |
- |
- ; Filter1 >> 3 |
- movdqa scratch1, zero |
- pcmpgtb scratch1, %5 |
- psrlw %5, 3 |
- pand scratch1, [GLOBAL(te0)] |
- pand %5, [GLOBAL(t1f)] |
- por %5, scratch1 |
- |
- psubsb %3, %5 ; qs0 - Filter1 |
- pxor %3, [GLOBAL(t80)] |
- |
- ; Filter2 >> 3 |
- movdqa scratch1, zero |
- pcmpgtb scratch1, scratch2 |
- psrlw scratch2, 3 |
- pand scratch1, [GLOBAL(te0)] |
- pand scratch2, [GLOBAL(t1f)] |
- por scratch2, scratch1 |
- |
- paddsb %2, scratch2 ; ps0 + Filter2 |
- pxor %2, [GLOBAL(t80)] |
- |
- ; outer tap adjustments |
- paddsb %5, [GLOBAL(t1)] |
- movdqa scratch1, zero |
- pcmpgtb scratch1, %5 |
- psrlw %5, 1 |
- pand scratch1, [GLOBAL(t80)] |
- pand %5, [GLOBAL(t7f)] |
- por %5, scratch1 |
- pand %5, %6 ; vp8_filter &= ~hev |
- |
- psubsb %4, %5 ; qs1 - vp8_filter |
- pxor %4, [GLOBAL(t80)] |
- |
- paddsb %1, %5 ; ps1 + vp8_filter |
- pxor %1, [GLOBAL(t80)] |
-%endmacro |
- |
-;void vp8_loop_filter_bh_y_sse2 |
-;( |
-; unsigned char *src_ptr, |
-; int src_pixel_step, |
-; const char *blimit, |
-; const char *limit, |
-; const char *thresh |
-;) |
-global sym(vp8_loop_filter_bh_y_sse2) PRIVATE |
-sym(vp8_loop_filter_bh_y_sse2): |
- |
-%if LIBVPX_YASM_WIN64 |
- %define src rcx ; src_ptr |
- %define stride rdx ; src_pixel_step |
- %define blimit r8 |
- %define limit r9 |
- %define thresh r10 |
- |
- %define spp rax |
- %define stride3 r11 |
- %define stride5 r12 |
- %define stride7 r13 |
- |
- push rbp |
- mov rbp, rsp |
- SAVE_XMM 11 |
- push r12 |
- push r13 |
- mov thresh, arg(4) |
-%else |
- %define src rdi ; src_ptr |
- %define stride rsi ; src_pixel_step |
- %define blimit rdx |
- %define limit rcx |
- %define thresh r8 |
- |
- %define spp rax |
- %define stride3 r9 |
- %define stride5 r10 |
- %define stride7 r11 |
-%endif |
- |
- %define scratch1 xmm5 |
- %define scratch2 xmm6 |
- %define zero xmm7 |
- |
- %define i0 [src] |
- %define i1 [spp] |
- %define i2 [src + 2 * stride] |
- %define i3 [spp + 2 * stride] |
- %define i4 [src + 4 * stride] |
- %define i5 [spp + 4 * stride] |
- %define i6 [src + 2 * stride3] |
- %define i7 [spp + 2 * stride3] |
- %define i8 [src + 8 * stride] |
- %define i9 [spp + 8 * stride] |
- %define i10 [src + 2 * stride5] |
- %define i11 [spp + 2 * stride5] |
- %define i12 [src + 4 * stride3] |
- %define i13 [spp + 4 * stride3] |
- %define i14 [src + 2 * stride7] |
- %define i15 [spp + 2 * stride7] |
- |
- ; prep work |
- lea spp, [src + stride] |
- lea stride3, [stride + 2 * stride] |
- lea stride5, [stride3 + 2 * stride] |
- lea stride7, [stride3 + 4 * stride] |
- pxor zero, zero |
- |
- ; load the first set into registers |
- movdqa xmm0, i0 |
- movdqa xmm1, i1 |
- movdqa xmm2, i2 |
- movdqa xmm3, i3 |
- movdqa xmm4, i4 |
- movdqa xmm8, i5 |
- movdqa xmm9, i6 ; q2, will contain abs(p1-p0) |
- movdqa xmm10, i7 |
-LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10 |
- |
- movdqa xmm1, i2 |
- movdqa xmm2, i3 |
- movdqa xmm3, i4 |
- movdqa xmm8, i5 |
-LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4 |
- movdqa i2, xmm1 |
- movdqa i3, xmm2 |
- |
-; second set |
- movdqa i4, xmm3 |
- movdqa i5, xmm8 |
- |
- movdqa xmm0, i6 |
- movdqa xmm1, i7 |
- movdqa xmm2, i8 |
- movdqa xmm4, i9 |
- movdqa xmm10, i10 ; q2, will contain abs(p1-p0) |
- movdqa xmm11, i11 |
-LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9 |
- |
- movdqa xmm0, i6 |
- movdqa xmm1, i7 |
- movdqa xmm4, i8 |
- movdqa xmm8, i9 |
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 |
- movdqa i6, xmm0 |
- movdqa i7, xmm1 |
- |
-; last set |
- movdqa i8, xmm4 |
- movdqa i9, xmm8 |
- |
- movdqa xmm0, i10 |
- movdqa xmm1, i11 |
- movdqa xmm2, i12 |
- movdqa xmm3, i13 |
- movdqa xmm9, i14 ; q2, will contain abs(p1-p0) |
- movdqa xmm11, i15 |
-LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10 |
- |
- movdqa xmm0, i10 |
- movdqa xmm1, i11 |
- movdqa xmm3, i12 |
- movdqa xmm8, i13 |
-LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2 |
- movdqa i10, xmm0 |
- movdqa i11, xmm1 |
- movdqa i12, xmm3 |
- movdqa i13, xmm8 |
- |
-%if LIBVPX_YASM_WIN64 |
- pop r13 |
- pop r12 |
- RESTORE_XMM |
- pop rbp |
-%endif |
- |
- ret |
- |
- |
-;void vp8_loop_filter_bv_y_sse2 |
-;( |
-; unsigned char *src_ptr, |
-; int src_pixel_step, |
-; const char *blimit, |
-; const char *limit, |
-; const char *thresh |
-;) |
- |
-global sym(vp8_loop_filter_bv_y_sse2) PRIVATE |
-sym(vp8_loop_filter_bv_y_sse2): |
- |
-%if LIBVPX_YASM_WIN64 |
- %define src rcx ; src_ptr |
- %define stride rdx ; src_pixel_step |
- %define blimit r8 |
- %define limit r9 |
- %define thresh r10 |
- |
- %define spp rax |
- %define stride3 r11 |
- %define stride5 r12 |
- %define stride7 r13 |
- |
- push rbp |
- mov rbp, rsp |
- SAVE_XMM 15 |
- push r12 |
- push r13 |
- mov thresh, arg(4) |
-%else |
- %define src rdi |
- %define stride rsi |
- %define blimit rdx |
- %define limit rcx |
- %define thresh r8 |
- |
- %define spp rax |
- %define stride3 r9 |
- %define stride5 r10 |
- %define stride7 r11 |
-%endif |
- |
- %define scratch1 xmm5 |
- %define scratch2 xmm6 |
- %define zero xmm7 |
- |
- %define s0 [src] |
- %define s1 [spp] |
- %define s2 [src + 2 * stride] |
- %define s3 [spp + 2 * stride] |
- %define s4 [src + 4 * stride] |
- %define s5 [spp + 4 * stride] |
- %define s6 [src + 2 * stride3] |
- %define s7 [spp + 2 * stride3] |
- %define s8 [src + 8 * stride] |
- %define s9 [spp + 8 * stride] |
- %define s10 [src + 2 * stride5] |
- %define s11 [spp + 2 * stride5] |
- %define s12 [src + 4 * stride3] |
- %define s13 [spp + 4 * stride3] |
- %define s14 [src + 2 * stride7] |
- %define s15 [spp + 2 * stride7] |
- |
- %define i0 [rsp] |
- %define i1 [rsp + 16] |
- %define i2 [rsp + 32] |
- %define i3 [rsp + 48] |
- %define i4 [rsp + 64] |
- %define i5 [rsp + 80] |
- %define i6 [rsp + 96] |
- %define i7 [rsp + 112] |
- %define i8 [rsp + 128] |
- %define i9 [rsp + 144] |
- %define i10 [rsp + 160] |
- %define i11 [rsp + 176] |
- %define i12 [rsp + 192] |
- %define i13 [rsp + 208] |
- %define i14 [rsp + 224] |
- %define i15 [rsp + 240] |
- |
- ALIGN_STACK 16, rax |
- |
- ; reserve stack space |
- %define temp_storage 0 ; size is 256 (16*16) |
- %define stack_size 256 |
- sub rsp, stack_size |
- |
- ; prep work |
- lea spp, [src + stride] |
- lea stride3, [stride + 2 * stride] |
- lea stride5, [stride3 + 2 * stride] |
- lea stride7, [stride3 + 4 * stride] |
- |
- ; 8-f |
- movdqa xmm0, s8 |
- movdqa xmm1, xmm0 |
- punpcklbw xmm0, s9 ; 80 90 |
- punpckhbw xmm1, s9 ; 88 98 |
- |
- movdqa xmm2, s10 |
- movdqa xmm3, xmm2 |
- punpcklbw xmm2, s11 ; a0 b0 |
- punpckhbw xmm3, s11 ; a8 b8 |
- |
- movdqa xmm4, xmm0 |
- punpcklwd xmm0, xmm2 ; 80 90 a0 b0 |
- punpckhwd xmm4, xmm2 ; 84 94 a4 b4 |
- |
- movdqa xmm2, xmm1 |
- punpcklwd xmm1, xmm3 ; 88 98 a8 b8 |
- punpckhwd xmm2, xmm3 ; 8c 9c ac bc |
- |
- ; using xmm[0124] |
- ; work on next 4 rows |
- |
- movdqa xmm3, s12 |
- movdqa xmm5, xmm3 |
- punpcklbw xmm3, s13 ; c0 d0 |
- punpckhbw xmm5, s13 ; c8 d8 |
- |
- movdqa xmm6, s14 |
- movdqa xmm7, xmm6 |
- punpcklbw xmm6, s15 ; e0 f0 |
- punpckhbw xmm7, s15 ; e8 f8 |
- |
- movdqa xmm8, xmm3 |
- punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 |
- punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 |
- |
- movdqa xmm6, xmm5 |
- punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 |
- punpckhwd xmm6, xmm7 ; cc dc ec fc |
- |
- ; pull the third and fourth sets together |
- |
- movdqa xmm7, xmm0 |
- punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 |
- punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 |
- |
- movdqa xmm3, xmm4 |
- punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 |
- punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 |
- |
- movdqa xmm8, xmm1 |
- punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 |
- punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa |
- |
- movdqa xmm5, xmm2 |
- punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc |
- punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe |
- |
- ; save the calculations. we only have 15 registers ... |
- movdqa i0, xmm0 |
- movdqa i1, xmm7 |
- movdqa i2, xmm4 |
- movdqa i3, xmm3 |
- movdqa i4, xmm1 |
- movdqa i5, xmm8 |
- movdqa i6, xmm2 |
- movdqa i7, xmm5 |
- |
- ; 0-7 |
- movdqa xmm0, s0 |
- movdqa xmm1, xmm0 |
- punpcklbw xmm0, s1 ; 00 10 |
- punpckhbw xmm1, s1 ; 08 18 |
- |
- movdqa xmm2, s2 |
- movdqa xmm3, xmm2 |
- punpcklbw xmm2, s3 ; 20 30 |
- punpckhbw xmm3, s3 ; 28 38 |
- |
- movdqa xmm4, xmm0 |
- punpcklwd xmm0, xmm2 ; 00 10 20 30 |
- punpckhwd xmm4, xmm2 ; 04 14 24 34 |
- |
- movdqa xmm2, xmm1 |
- punpcklwd xmm1, xmm3 ; 08 18 28 38 |
- punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c |
- |
- ; using xmm[0124] |
- ; work on next 4 rows |
- |
- movdqa xmm3, s4 |
- movdqa xmm5, xmm3 |
- punpcklbw xmm3, s5 ; 40 50 |
- punpckhbw xmm5, s5 ; 48 58 |
- |
- movdqa xmm6, s6 |
- movdqa xmm7, xmm6 |
- punpcklbw xmm6, s7 ; 60 70 |
- punpckhbw xmm7, s7 ; 68 78 |
- |
- movdqa xmm8, xmm3 |
- punpcklwd xmm3, xmm6 ; 40 50 60 70 |
- punpckhwd xmm8, xmm6 ; 44 54 64 74 |
- |
- movdqa xmm6, xmm5 |
- punpcklwd xmm5, xmm7 ; 48 58 68 78 |
- punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c |
- |
- ; pull the first two sets together |
- |
- movdqa xmm7, xmm0 |
- punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 |
- punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 |
- |
- movdqa xmm3, xmm4 |
- punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 |
- punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 |
- |
- movdqa xmm8, xmm1 |
- punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 |
- punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a |
- |
- movdqa xmm5, xmm2 |
- punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c |
- punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e |
- ; final combination |
- |
- movdqa xmm6, xmm0 |
- punpcklqdq xmm0, i0 |
- punpckhqdq xmm6, i0 |
- |
- movdqa xmm9, xmm7 |
- punpcklqdq xmm7, i1 |
- punpckhqdq xmm9, i1 |
- |
- movdqa xmm10, xmm4 |
- punpcklqdq xmm4, i2 |
- punpckhqdq xmm10, i2 |
- |
- movdqa xmm11, xmm3 |
- punpcklqdq xmm3, i3 |
- punpckhqdq xmm11, i3 |
- |
- movdqa xmm12, xmm1 |
- punpcklqdq xmm1, i4 |
- punpckhqdq xmm12, i4 |
- |
- movdqa xmm13, xmm8 |
- punpcklqdq xmm8, i5 |
- punpckhqdq xmm13, i5 |
- |
- movdqa xmm14, xmm2 |
- punpcklqdq xmm2, i6 |
- punpckhqdq xmm14, i6 |
- |
- movdqa xmm15, xmm5 |
- punpcklqdq xmm5, i7 |
- punpckhqdq xmm15, i7 |
- |
- movdqa i0, xmm0 |
- movdqa i1, xmm6 |
- movdqa i2, xmm7 |
- movdqa i3, xmm9 |
- movdqa i4, xmm4 |
- movdqa i5, xmm10 |
- movdqa i6, xmm3 |
- movdqa i7, xmm11 |
- movdqa i8, xmm1 |
- movdqa i9, xmm12 |
- movdqa i10, xmm8 |
- movdqa i11, xmm13 |
- movdqa i12, xmm2 |
- movdqa i13, xmm14 |
- movdqa i14, xmm5 |
- movdqa i15, xmm15 |
- |
-; TRANSPOSED DATA AVAILABLE ON THE STACK |
- |
- movdqa xmm12, xmm6 |
- movdqa xmm13, xmm7 |
- |
- pxor zero, zero |
- |
-LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11 |
- |
- movdqa xmm1, i2 |
- movdqa xmm2, i3 |
- movdqa xmm8, i4 |
- movdqa xmm9, i5 |
-LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4 |
- movdqa i2, xmm1 |
- movdqa i3, xmm2 |
- |
-; second set |
- movdqa i4, xmm8 |
- movdqa i5, xmm9 |
- |
- movdqa xmm0, i6 |
- movdqa xmm1, i7 |
- movdqa xmm2, i8 |
- movdqa xmm4, i9 |
- movdqa xmm10, i10 ; q2, will contain abs(p1-p0) |
- movdqa xmm11, i11 |
-LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3 |
- |
- movdqa xmm0, i6 |
- movdqa xmm1, i7 |
- movdqa xmm3, i8 |
- movdqa xmm4, i9 |
-LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2 |
- movdqa i6, xmm0 |
- movdqa i7, xmm1 |
- |
-; last set |
- movdqa i8, xmm3 |
- movdqa i9, xmm4 |
- |
- movdqa xmm0, i10 |
- movdqa xmm1, i11 |
- movdqa xmm2, i12 |
- movdqa xmm8, i13 |
- movdqa xmm9, i14 ; q2, will contain abs(p1-p0) |
- movdqa xmm11, i15 |
-LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10 |
- |
- movdqa xmm0, i10 |
- movdqa xmm1, i11 |
- movdqa xmm4, i12 |
- movdqa xmm8, i13 |
-LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2 |
- movdqa i10, xmm0 |
- movdqa i11, xmm1 |
- movdqa i12, xmm4 |
- movdqa i13, xmm8 |
- |
- |
-; RESHUFFLE AND WRITE OUT |
- ; 8-f |
- movdqa xmm0, i8 |
- movdqa xmm1, xmm0 |
- punpcklbw xmm0, i9 ; 80 90 |
- punpckhbw xmm1, i9 ; 88 98 |
- |
- movdqa xmm2, i10 |
- movdqa xmm3, xmm2 |
- punpcklbw xmm2, i11 ; a0 b0 |
- punpckhbw xmm3, i11 ; a8 b8 |
- |
- movdqa xmm4, xmm0 |
- punpcklwd xmm0, xmm2 ; 80 90 a0 b0 |
- punpckhwd xmm4, xmm2 ; 84 94 a4 b4 |
- |
- movdqa xmm2, xmm1 |
- punpcklwd xmm1, xmm3 ; 88 98 a8 b8 |
- punpckhwd xmm2, xmm3 ; 8c 9c ac bc |
- |
- ; using xmm[0124] |
- ; work on next 4 rows |
- |
- movdqa xmm3, i12 |
- movdqa xmm5, xmm3 |
- punpcklbw xmm3, i13 ; c0 d0 |
- punpckhbw xmm5, i13 ; c8 d8 |
- |
- movdqa xmm6, i14 |
- movdqa xmm7, xmm6 |
- punpcklbw xmm6, i15 ; e0 f0 |
- punpckhbw xmm7, i15 ; e8 f8 |
- |
- movdqa xmm8, xmm3 |
- punpcklwd xmm3, xmm6 ; c0 d0 e0 f0 |
- punpckhwd xmm8, xmm6 ; c4 d4 e4 f4 |
- |
- movdqa xmm6, xmm5 |
- punpcklwd xmm5, xmm7 ; c8 d8 e8 f8 |
- punpckhwd xmm6, xmm7 ; cc dc ec fc |
- |
- ; pull the third and fourth sets together |
- |
- movdqa xmm7, xmm0 |
- punpckldq xmm0, xmm3 ; 80 90 a0 b0 c0 d0 e0 f0 |
- punpckhdq xmm7, xmm3 ; 82 92 a2 b2 c2 d2 e2 f2 |
- |
- movdqa xmm3, xmm4 |
- punpckldq xmm4, xmm8 ; 84 94 a4 b4 c4 d4 e4 f4 |
- punpckhdq xmm3, xmm8 ; 86 96 a6 b6 c6 d6 e6 f6 |
- |
- movdqa xmm8, xmm1 |
- punpckldq xmm1, xmm5 ; 88 88 a8 b8 c8 d8 e8 f8 |
- punpckhdq xmm8, xmm5 ; 8a 9a aa ba ca da ea fa |
- |
- movdqa xmm5, xmm2 |
- punpckldq xmm2, xmm6 ; 8c 9c ac bc cc dc ec fc |
- punpckhdq xmm5, xmm6 ; 8e 9e ae be ce de ee fe |
- |
- ; save the calculations. we only have 15 registers ... |
- movdqa i8, xmm0 |
- movdqa i9, xmm7 |
- movdqa i10, xmm4 |
- movdqa i11, xmm3 |
- movdqa i12, xmm1 |
- movdqa i13, xmm8 |
- movdqa i14, xmm2 |
- movdqa i15, xmm5 |
- |
- ; 0-7 |
- movdqa xmm0, i0 |
- movdqa xmm1, xmm0 |
- punpcklbw xmm0, i1 ; 00 10 |
- punpckhbw xmm1, i1 ; 08 18 |
- |
- movdqa xmm2, i2 |
- movdqa xmm3, xmm2 |
- punpcklbw xmm2, i3 ; 20 30 |
- punpckhbw xmm3, i3 ; 28 38 |
- |
- movdqa xmm4, xmm0 |
- punpcklwd xmm0, xmm2 ; 00 10 20 30 |
- punpckhwd xmm4, xmm2 ; 04 14 24 34 |
- |
- movdqa xmm2, xmm1 |
- punpcklwd xmm1, xmm3 ; 08 18 28 38 |
- punpckhwd xmm2, xmm3 ; 0c 1c 2c 3c |
- |
- ; using xmm[0124] |
- ; work on next 4 rows |
- |
- movdqa xmm3, i4 |
- movdqa xmm5, xmm3 |
- punpcklbw xmm3, i5 ; 40 50 |
- punpckhbw xmm5, i5 ; 48 58 |
- |
- movdqa xmm6, i6 |
- movdqa xmm7, xmm6 |
- punpcklbw xmm6, i7 ; 60 70 |
- punpckhbw xmm7, i7 ; 68 78 |
- |
- movdqa xmm8, xmm3 |
- punpcklwd xmm3, xmm6 ; 40 50 60 70 |
- punpckhwd xmm8, xmm6 ; 44 54 64 74 |
- |
- movdqa xmm6, xmm5 |
- punpcklwd xmm5, xmm7 ; 48 58 68 78 |
- punpckhwd xmm6, xmm7 ; 4c 5c 6c 7c |
- |
- ; pull the first two sets together |
- |
- movdqa xmm7, xmm0 |
- punpckldq xmm0, xmm3 ; 00 10 20 30 40 50 60 70 |
- punpckhdq xmm7, xmm3 ; 02 12 22 32 42 52 62 72 |
- |
- movdqa xmm3, xmm4 |
- punpckldq xmm4, xmm8 ; 04 14 24 34 44 54 64 74 |
- punpckhdq xmm3, xmm8 ; 06 16 26 36 46 56 66 76 |
- |
- movdqa xmm8, xmm1 |
- punpckldq xmm1, xmm5 ; 08 18 28 38 48 58 68 78 |
- punpckhdq xmm8, xmm5 ; 0a 1a 2a 3a 4a 5a 6a 7a |
- |
- movdqa xmm5, xmm2 |
- punpckldq xmm2, xmm6 ; 0c 1c 2c 3c 4c 5c 6c 7c |
- punpckhdq xmm5, xmm6 ; 0e 1e 2e 3e 4e 5e 6e 7e |
- ; final combination |
- |
- movdqa xmm6, xmm0 |
- punpcklqdq xmm0, i8 |
- punpckhqdq xmm6, i8 |
- |
- movdqa xmm9, xmm7 |
- punpcklqdq xmm7, i9 |
- punpckhqdq xmm9, i9 |
- |
- movdqa xmm10, xmm4 |
- punpcklqdq xmm4, i10 |
- punpckhqdq xmm10, i10 |
- |
- movdqa xmm11, xmm3 |
- punpcklqdq xmm3, i11 |
- punpckhqdq xmm11, i11 |
- |
- movdqa xmm12, xmm1 |
- punpcklqdq xmm1, i12 |
- punpckhqdq xmm12, i12 |
- |
- movdqa xmm13, xmm8 |
- punpcklqdq xmm8, i13 |
- punpckhqdq xmm13, i13 |
- |
- movdqa xmm14, xmm2 |
- punpcklqdq xmm2, i14 |
- punpckhqdq xmm14, i14 |
- |
- movdqa xmm15, xmm5 |
- punpcklqdq xmm5, i15 |
- punpckhqdq xmm15, i15 |
- |
- movdqa s0, xmm0 |
- movdqa s1, xmm6 |
- movdqa s2, xmm7 |
- movdqa s3, xmm9 |
- movdqa s4, xmm4 |
- movdqa s5, xmm10 |
- movdqa s6, xmm3 |
- movdqa s7, xmm11 |
- movdqa s8, xmm1 |
- movdqa s9, xmm12 |
- movdqa s10, xmm8 |
- movdqa s11, xmm13 |
- movdqa s12, xmm2 |
- movdqa s13, xmm14 |
- movdqa s14, xmm5 |
- movdqa s15, xmm15 |
- |
- ; free stack space |
- add rsp, stack_size |
- |
- ; un-ALIGN_STACK |
- pop rsp |
- |
-%if LIBVPX_YASM_WIN64 |
- pop r13 |
- pop r12 |
- RESTORE_XMM |
- pop rbp |
-%endif |
- |
- ret |
- |
-SECTION_RODATA |
-align 16 |
-te0: |
- times 16 db 0xe0 |
-align 16 |
-t7f: |
- times 16 db 0x7f |
-align 16 |
-tfe: |
- times 16 db 0xfe |
-align 16 |
-t1f: |
- times 16 db 0x1f |
-align 16 |
-t80: |
- times 16 db 0x80 |
-align 16 |
-t1: |
- times 16 db 0x01 |
-align 16 |
-t3: |
- times 16 db 0x03 |
-align 16 |
-t4: |
- times 16 db 0x04 |