OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2015 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 | 11 %include "third_party/x86inc/x86inc.asm" |
12 %include "vpx_ports/x86_abi_support.asm" | 12 |
13 | 13 SECTION_RODATA |
14 %macro VERTx4 1 | 14 pw_64: times 8 dw 64 |
15 mov rdx, arg(5) ;filter ptr | 15 |
16 mov rsi, arg(0) ;src_ptr | 16 ; %define USE_PMULHRSW |
17 mov rdi, arg(2) ;output_ptr | 17 ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss |
18 mov rcx, 0x0400040 | 18 ; when using this instruction. |
19 | 19 |
20 movdqa xmm4, [rdx] ;load filters | 20 SECTION .text |
21 movq xmm5, rcx | 21 %if ARCH_X86_64 |
22 packsswb xmm4, xmm4 | 22 %define LOCAL_VARS_SIZE 16*4 |
23 pshuflw xmm0, xmm4, 0b ;k0_k1 | 23 %else |
24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 24 %define LOCAL_VARS_SIZE 16*6 |
25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 25 %endif |
26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 26 |
27 | 27 %macro SETUP_LOCAL_VARS 0 |
28 punpcklqdq xmm0, xmm0 | 28 ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 + |
29 punpcklqdq xmm1, xmm1 | 29 ; pmaddubsw has a higher latency on some platforms, this might be eased by |
30 punpcklqdq xmm2, xmm2 | 30 ; interleaving the instructions. |
31 punpcklqdq xmm3, xmm3 | 31 %define k0k1 [rsp + 16*0] |
32 | 32 %define k2k3 [rsp + 16*1] |
33 movdqa k0k1, xmm0 | 33 %define k4k5 [rsp + 16*2] |
34 movdqa k2k3, xmm1 | 34 %define k6k7 [rsp + 16*3] |
35 pshufd xmm5, xmm5, 0 | 35 packsswb m4, m4 |
36 movdqa k4k5, xmm2 | 36 ; TODO(slavarnway): multiple pshufb instructions had a higher latency on |
37 movdqa k6k7, xmm3 | 37 ; some platforms. |
38 movdqa krd, xmm5 | 38 pshuflw m0, m4, 0b ;k0_k1 |
39 | 39 pshuflw m1, m4, 01010101b ;k2_k3 |
40 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line | 40 pshuflw m2, m4, 10101010b ;k4_k5 |
41 | 41 pshuflw m3, m4, 11111111b ;k6_k7 |
42 %if ABI_IS_32BIT=0 | 42 punpcklqdq m0, m0 |
43 movsxd r8, DWORD PTR arg(3) ;out_pitch | 43 punpcklqdq m1, m1 |
44 %endif | 44 punpcklqdq m2, m2 |
45 mov rax, rsi | 45 punpcklqdq m3, m3 |
46 movsxd rcx, DWORD PTR arg(4) ;output_height | 46 mova k0k1, m0 |
47 add rax, rdx | 47 mova k2k3, m1 |
48 | 48 mova k4k5, m2 |
49 lea rbx, [rdx + rdx*4] | 49 mova k6k7, m3 |
50 add rbx, rdx ;pitch * 6 | 50 %if ARCH_X86_64 |
51 | 51 %define krd m12 |
| 52 %define tmp m13 |
| 53 mova krd, [GLOBAL(pw_64)] |
| 54 %else |
| 55 %define tmp [rsp + 16*4] |
| 56 %define krd [rsp + 16*5] |
| 57 %if CONFIG_PIC=0 |
| 58 mova m6, [GLOBAL(pw_64)] |
| 59 %else |
| 60 ; build constants without accessing global memory |
| 61 pcmpeqb m6, m6 ;all ones |
| 62 psrlw m6, 15 |
| 63 psllw m6, 6 ;aka pw_64 |
| 64 %endif |
| 65 mova krd, m6 |
| 66 %endif |
| 67 %endm |
| 68 |
| 69 %macro HORIZx4_ROW 2 |
| 70 mova %2, %1 |
| 71 punpcklbw %1, %1 |
| 72 punpckhbw %2, %2 |
| 73 |
| 74 mova m3, %2 |
| 75 palignr %2, %1, 1 |
| 76 palignr m3, %1, 5 |
| 77 |
| 78 pmaddubsw %2, k0k1k4k5 |
| 79 pmaddubsw m3, k2k3k6k7 |
| 80 |
| 81 mova m4, %2 |
| 82 mova m5, m3 |
| 83 psrldq %2, 8 |
| 84 psrldq m3, 8 |
| 85 mova m6, m5 |
| 86 |
| 87 paddsw m4, m3 |
| 88 pmaxsw m5, %2 |
| 89 pminsw %2, m6 |
| 90 paddsw %2, m4 |
| 91 paddsw %2, m5 |
| 92 paddsw %2, krd |
| 93 psraw %2, 7 |
| 94 packuswb %2, %2 |
| 95 %endm |
| 96 |
| 97 ;------------------------------------------------------------------------------- |
| 98 %macro SUBPIX_HFILTER4 1 |
| 99 cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \ |
| 100 src, sstride, dst, dstride, height, filter |
| 101 mova m4, [filterq] |
| 102 packsswb m4, m4 |
| 103 %if ARCH_X86_64 |
| 104 %define k0k1k4k5 m8 |
| 105 %define k2k3k6k7 m9 |
| 106 %define krd m10 |
| 107 %define orig_height r7 |
| 108 mova krd, [GLOBAL(pw_64)] |
| 109 pshuflw k0k1k4k5, m4, 0b ;k0_k1 |
| 110 pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5 |
| 111 pshuflw k2k3k6k7, m4, 01010101b ;k2_k3 |
| 112 pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7 |
| 113 %else |
| 114 %define k0k1k4k5 [rsp + 16*0] |
| 115 %define k2k3k6k7 [rsp + 16*1] |
| 116 %define krd [rsp + 16*2] |
| 117 %define orig_height [rsp + 16*3] |
| 118 pshuflw m6, m4, 0b ;k0_k1 |
| 119 pshufhw m6, m6, 10101010b ;k0_k1_k4_k5 |
| 120 pshuflw m7, m4, 01010101b ;k2_k3 |
| 121 pshufhw m7, m7, 11111111b ;k2_k3_k6_k7 |
| 122 %if CONFIG_PIC=0 |
| 123 mova m1, [GLOBAL(pw_64)] |
| 124 %else |
| 125 ; build constants without accessing global memory |
| 126 pcmpeqb m1, m1 ;all ones |
| 127 psrlw m1, 15 |
| 128 psllw m1, 6 ;aka pw_64 |
| 129 %endif |
| 130 mova k0k1k4k5, m6 |
| 131 mova k2k3k6k7, m7 |
| 132 mova krd, m1 |
| 133 %endif |
| 134 mov orig_height, heightq |
| 135 shr heightq, 1 |
52 .loop: | 136 .loop: |
53 movd xmm0, [rsi] ;A | 137 ;Do two rows at once |
54 movd xmm1, [rsi + rdx] ;B | 138 movh m0, [srcq - 3] |
55 movd xmm2, [rsi + rdx * 2] ;C | 139 movh m1, [srcq + 5] |
56 movd xmm3, [rax + rdx * 2] ;D | 140 punpcklqdq m0, m1 |
57 movd xmm4, [rsi + rdx * 4] ;E | 141 mova m1, m0 |
58 movd xmm5, [rax + rdx * 4] ;F | 142 movh m2, [srcq + sstrideq - 3] |
59 | 143 movh m3, [srcq + sstrideq + 5] |
60 punpcklbw xmm0, xmm1 ;A B | 144 punpcklqdq m2, m3 |
61 punpcklbw xmm2, xmm3 ;C D | 145 mova m3, m2 |
62 punpcklbw xmm4, xmm5 ;E F | 146 punpcklbw m0, m0 |
63 | 147 punpckhbw m1, m1 |
64 movd xmm6, [rsi + rbx] ;G | 148 punpcklbw m2, m2 |
65 movd xmm7, [rax + rbx] ;H | 149 punpckhbw m3, m3 |
66 | 150 mova m4, m1 |
67 pmaddubsw xmm0, k0k1 | 151 palignr m4, m0, 1 |
68 pmaddubsw xmm2, k2k3 | 152 pmaddubsw m4, k0k1k4k5 |
69 punpcklbw xmm6, xmm7 ;G H | 153 palignr m1, m0, 5 |
70 pmaddubsw xmm4, k4k5 | 154 pmaddubsw m1, k2k3k6k7 |
71 pmaddubsw xmm6, k6k7 | 155 mova m7, m3 |
72 | 156 palignr m7, m2, 1 |
73 movdqa xmm1, xmm2 | 157 pmaddubsw m7, k0k1k4k5 |
74 paddsw xmm0, xmm6 | 158 palignr m3, m2, 5 |
75 pmaxsw xmm2, xmm4 | 159 pmaddubsw m3, k2k3k6k7 |
76 pminsw xmm4, xmm1 | 160 mova m0, m4 |
77 paddsw xmm0, xmm4 | 161 mova m5, m1 |
78 paddsw xmm0, xmm2 | 162 mova m2, m7 |
79 | 163 psrldq m4, 8 |
80 paddsw xmm0, krd | 164 psrldq m1, 8 |
81 psraw xmm0, 7 | 165 mova m6, m5 |
82 packuswb xmm0, xmm0 | 166 paddsw m0, m1 |
83 | 167 mova m1, m3 |
84 add rsi, rdx | 168 psrldq m7, 8 |
85 add rax, rdx | 169 psrldq m3, 8 |
86 %if %1 | 170 paddsw m2, m3 |
87 movd xmm1, [rdi] | 171 mova m3, m1 |
88 pavgb xmm0, xmm1 | 172 pmaxsw m5, m4 |
89 %endif | 173 pminsw m4, m6 |
90 movd [rdi], xmm0 | 174 paddsw m4, m0 |
91 | 175 paddsw m4, m5 |
92 %if ABI_IS_32BIT | 176 pmaxsw m1, m7 |
93 add rdi, DWORD PTR arg(3) ;out_pitch | 177 pminsw m7, m3 |
94 %else | 178 paddsw m7, m2 |
95 add rdi, r8 | 179 paddsw m7, m1 |
96 %endif | 180 |
97 dec rcx | 181 paddsw m4, krd |
98 jnz .loop | 182 psraw m4, 7 |
99 %endm | 183 packuswb m4, m4 |
100 | 184 paddsw m7, krd |
101 %macro VERTx8 1 | 185 psraw m7, 7 |
102 mov rdx, arg(5) ;filter ptr | 186 packuswb m7, m7 |
103 mov rsi, arg(0) ;src_ptr | 187 |
104 mov rdi, arg(2) ;output_ptr | 188 %ifidn %1, h8_avg |
105 mov rcx, 0x0400040 | 189 movd m0, [dstq] |
106 | 190 pavgb m4, m0 |
107 movdqa xmm4, [rdx] ;load filters | 191 movd m2, [dstq + dstrideq] |
108 movq xmm5, rcx | 192 pavgb m7, m2 |
109 packsswb xmm4, xmm4 | 193 %endif |
110 pshuflw xmm0, xmm4, 0b ;k0_k1 | 194 movd [dstq], m4 |
111 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 195 movd [dstq + dstrideq], m7 |
112 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 196 |
113 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 197 lea srcq, [srcq + sstrideq ] |
114 | 198 prefetcht0 [srcq + 4 * sstrideq - 3] |
115 punpcklqdq xmm0, xmm0 | 199 lea srcq, [srcq + sstrideq ] |
116 punpcklqdq xmm1, xmm1 | 200 lea dstq, [dstq + 2 * dstrideq ] |
117 punpcklqdq xmm2, xmm2 | 201 prefetcht0 [srcq + 2 * sstrideq - 3] |
118 punpcklqdq xmm3, xmm3 | 202 |
119 | 203 dec heightq |
120 movdqa k0k1, xmm0 | 204 jnz .loop |
121 movdqa k2k3, xmm1 | 205 |
122 pshufd xmm5, xmm5, 0 | 206 ; Do last row if output_height is odd |
123 movdqa k4k5, xmm2 | 207 mov heightq, orig_height |
124 movdqa k6k7, xmm3 | 208 and heightq, 1 |
125 movdqa krd, xmm5 | 209 je .done |
126 | 210 |
127 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line | 211 movh m0, [srcq - 3] ; load src |
128 | 212 movh m1, [srcq + 5] |
129 %if ABI_IS_32BIT=0 | 213 punpcklqdq m0, m1 |
130 movsxd r8, DWORD PTR arg(3) ;out_pitch | 214 |
131 %endif | 215 HORIZx4_ROW m0, m1 |
132 mov rax, rsi | 216 %ifidn %1, h8_avg |
133 movsxd rcx, DWORD PTR arg(4) ;output_height | 217 movd m0, [dstq] |
134 add rax, rdx | 218 pavgb m1, m0 |
135 | 219 %endif |
136 lea rbx, [rdx + rdx*4] | 220 movd [dstq], m1 |
137 add rbx, rdx ;pitch * 6 | 221 .done: |
| 222 RET |
| 223 %endm |
| 224 |
| 225 %macro HORIZx8_ROW 5 |
| 226 mova %2, %1 |
| 227 punpcklbw %1, %1 |
| 228 punpckhbw %2, %2 |
| 229 |
| 230 mova %3, %2 |
| 231 mova %4, %2 |
| 232 mova %5, %2 |
| 233 |
| 234 palignr %2, %1, 1 |
| 235 palignr %3, %1, 5 |
| 236 palignr %4, %1, 9 |
| 237 palignr %5, %1, 13 |
| 238 |
| 239 pmaddubsw %2, k0k1 |
| 240 pmaddubsw %3, k2k3 |
| 241 pmaddubsw %4, k4k5 |
| 242 pmaddubsw %5, k6k7 |
| 243 |
| 244 paddsw %2, %5 |
| 245 mova %1, %3 |
| 246 pminsw %3, %4 |
| 247 pmaxsw %1, %4 |
| 248 paddsw %2, %3 |
| 249 paddsw %1, %2 |
| 250 paddsw %1, krd |
| 251 psraw %1, 7 |
| 252 packuswb %1, %1 |
| 253 %endm |
| 254 |
| 255 ;------------------------------------------------------------------------------- |
| 256 %macro SUBPIX_HFILTER8 1 |
| 257 cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \ |
| 258 src, sstride, dst, dstride, height, filter |
| 259 mova m4, [filterq] |
| 260 SETUP_LOCAL_VARS |
| 261 %if ARCH_X86_64 |
| 262 %define orig_height r7 |
| 263 %else |
| 264 %define orig_height heightmp |
| 265 %endif |
| 266 mov orig_height, heightq |
| 267 shr heightq, 1 |
138 | 268 |
139 .loop: | 269 .loop: |
140 movq xmm0, [rsi] ;A | 270 movh m0, [srcq - 3] |
141 movq xmm1, [rsi + rdx] ;B | 271 movh m3, [srcq + 5] |
142 movq xmm2, [rsi + rdx * 2] ;C | 272 movh m4, [srcq + sstrideq - 3] |
143 movq xmm3, [rax + rdx * 2] ;D | 273 movh m7, [srcq + sstrideq + 5] |
144 movq xmm4, [rsi + rdx * 4] ;E | 274 punpcklqdq m0, m3 |
145 movq xmm5, [rax + rdx * 4] ;F | 275 mova m1, m0 |
146 | 276 punpcklbw m0, m0 |
147 punpcklbw xmm0, xmm1 ;A B | 277 punpckhbw m1, m1 |
148 punpcklbw xmm2, xmm3 ;C D | 278 mova m5, m1 |
149 punpcklbw xmm4, xmm5 ;E F | 279 palignr m5, m0, 13 |
150 | 280 pmaddubsw m5, k6k7 |
151 movq xmm6, [rsi + rbx] ;G | 281 mova m2, m1 |
152 movq xmm7, [rax + rbx] ;H | 282 mova m3, m1 |
153 | 283 palignr m1, m0, 1 |
154 pmaddubsw xmm0, k0k1 | 284 pmaddubsw m1, k0k1 |
155 pmaddubsw xmm2, k2k3 | 285 punpcklqdq m4, m7 |
156 punpcklbw xmm6, xmm7 ;G H | 286 mova m6, m4 |
157 pmaddubsw xmm4, k4k5 | 287 punpcklbw m4, m4 |
158 pmaddubsw xmm6, k6k7 | 288 palignr m2, m0, 5 |
159 | 289 punpckhbw m6, m6 |
160 paddsw xmm0, xmm6 | 290 palignr m3, m0, 9 |
161 movdqa xmm1, xmm2 | 291 mova m7, m6 |
162 pmaxsw xmm2, xmm4 | 292 pmaddubsw m2, k2k3 |
163 pminsw xmm4, xmm1 | 293 pmaddubsw m3, k4k5 |
164 paddsw xmm0, xmm4 | 294 |
165 paddsw xmm0, xmm2 | 295 palignr m7, m4, 13 |
166 | 296 paddsw m1, m5 |
167 paddsw xmm0, krd | 297 mova m5, m6 |
168 psraw xmm0, 7 | 298 mova m0, m2 |
169 packuswb xmm0, xmm0 | 299 palignr m5, m4, 5 |
170 | 300 pminsw m2, m3 |
171 add rsi, rdx | 301 pmaddubsw m7, k6k7 |
172 add rax, rdx | 302 pmaxsw m3, m0 |
173 %if %1 | 303 paddsw m1, m2 |
174 movq xmm1, [rdi] | 304 mova m0, m6 |
175 pavgb xmm0, xmm1 | 305 palignr m6, m4, 1 |
176 %endif | 306 pmaddubsw m5, k2k3 |
177 movq [rdi], xmm0 | 307 paddsw m1, m3 |
178 | 308 pmaddubsw m6, k0k1 |
179 %if ABI_IS_32BIT | 309 palignr m0, m4, 9 |
180 add rdi, DWORD PTR arg(3) ;out_pitch | 310 paddsw m1, krd |
181 %else | 311 pmaddubsw m0, k4k5 |
182 add rdi, r8 | 312 mova m4, m5 |
183 %endif | 313 psraw m1, 7 |
184 dec rcx | 314 pminsw m5, m0 |
185 jnz .loop | 315 paddsw m6, m7 |
186 %endm | 316 packuswb m1, m1 |
187 | 317 |
188 | 318 paddsw m6, m5 |
189 %macro VERTx16 1 | 319 pmaxsw m0, m4 |
190 mov rdx, arg(5) ;filter ptr | 320 paddsw m6, m0 |
191 mov rsi, arg(0) ;src_ptr | 321 paddsw m6, krd |
192 mov rdi, arg(2) ;output_ptr | 322 psraw m6, 7 |
193 mov rcx, 0x0400040 | 323 packuswb m6, m6 |
194 | 324 |
195 movdqa xmm4, [rdx] ;load filters | 325 %ifidn %1, h8_avg |
196 movq xmm5, rcx | 326 movh m0, [dstq] |
197 packsswb xmm4, xmm4 | 327 movh m2, [dstq + dstrideq] |
198 pshuflw xmm0, xmm4, 0b ;k0_k1 | 328 pavgb m1, m0 |
199 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 329 pavgb m6, m2 |
200 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 330 %endif |
201 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 331 movh [dstq], m1 |
202 | 332 movh [dstq + dstrideq], m6 |
203 punpcklqdq xmm0, xmm0 | 333 |
204 punpcklqdq xmm1, xmm1 | 334 lea srcq, [srcq + sstrideq ] |
205 punpcklqdq xmm2, xmm2 | 335 prefetcht0 [srcq + 4 * sstrideq - 3] |
206 punpcklqdq xmm3, xmm3 | 336 lea srcq, [srcq + sstrideq ] |
207 | 337 lea dstq, [dstq + 2 * dstrideq ] |
208 movdqa k0k1, xmm0 | 338 prefetcht0 [srcq + 2 * sstrideq - 3] |
209 movdqa k2k3, xmm1 | 339 dec heightq |
210 pshufd xmm5, xmm5, 0 | 340 jnz .loop |
211 movdqa k4k5, xmm2 | 341 |
212 movdqa k6k7, xmm3 | 342 ;Do last row if output_height is odd |
213 movdqa krd, xmm5 | 343 mov heightq, orig_height |
214 | 344 and heightq, 1 |
215 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line | 345 je .done |
216 | 346 |
217 %if ABI_IS_32BIT=0 | 347 movh m0, [srcq - 3] |
218 movsxd r8, DWORD PTR arg(3) ;out_pitch | 348 movh m3, [srcq + 5] |
219 %endif | 349 punpcklqdq m0, m3 |
220 mov rax, rsi | 350 |
221 movsxd rcx, DWORD PTR arg(4) ;output_height | 351 HORIZx8_ROW m0, m1, m2, m3, m4 |
222 add rax, rdx | 352 |
223 | 353 %ifidn %1, h8_avg |
224 lea rbx, [rdx + rdx*4] | 354 movh m1, [dstq] |
225 add rbx, rdx ;pitch * 6 | 355 pavgb m0, m1 |
226 | 356 %endif |
| 357 movh [dstq], m0 |
| 358 .done: |
| 359 RET |
| 360 %endm |
| 361 |
| 362 ;------------------------------------------------------------------------------- |
| 363 %macro SUBPIX_HFILTER16 1 |
| 364 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \ |
| 365 src, sstride, dst, dstride, height, filter |
| 366 mova m4, [filterq] |
| 367 SETUP_LOCAL_VARS |
227 .loop: | 368 .loop: |
228 movq xmm0, [rsi] ;A | 369 prefetcht0 [srcq + 2 * sstrideq -3] |
229 movq xmm1, [rsi + rdx] ;B | 370 |
230 movq xmm2, [rsi + rdx * 2] ;C | 371 movh m0, [srcq - 3] |
231 movq xmm3, [rax + rdx * 2] ;D | 372 movh m4, [srcq + 5] |
232 movq xmm4, [rsi + rdx * 4] ;E | 373 movh m6, [srcq + 13] |
233 movq xmm5, [rax + rdx * 4] ;F | 374 punpcklqdq m0, m4 |
234 | 375 mova m7, m0 |
235 punpcklbw xmm0, xmm1 ;A B | 376 punpckhbw m0, m0 |
236 punpcklbw xmm2, xmm3 ;C D | 377 mova m1, m0 |
237 punpcklbw xmm4, xmm5 ;E F | 378 punpcklqdq m4, m6 |
238 | 379 mova m3, m0 |
239 movq xmm6, [rsi + rbx] ;G | 380 punpcklbw m7, m7 |
240 movq xmm7, [rax + rbx] ;H | 381 |
241 | 382 palignr m3, m7, 13 |
242 pmaddubsw xmm0, k0k1 | 383 mova m2, m0 |
243 pmaddubsw xmm2, k2k3 | 384 pmaddubsw m3, k6k7 |
244 punpcklbw xmm6, xmm7 ;G H | 385 palignr m0, m7, 1 |
245 pmaddubsw xmm4, k4k5 | 386 pmaddubsw m0, k0k1 |
246 pmaddubsw xmm6, k6k7 | 387 palignr m1, m7, 5 |
247 | 388 pmaddubsw m1, k2k3 |
248 paddsw xmm0, xmm6 | 389 palignr m2, m7, 9 |
249 movdqa xmm1, xmm2 | 390 pmaddubsw m2, k4k5 |
250 pmaxsw xmm2, xmm4 | 391 paddsw m0, m3 |
251 pminsw xmm4, xmm1 | 392 mova m3, m4 |
252 paddsw xmm0, xmm4 | 393 punpckhbw m4, m4 |
253 paddsw xmm0, xmm2 | 394 mova m5, m4 |
254 | 395 punpcklbw m3, m3 |
255 paddsw xmm0, krd | 396 mova m7, m4 |
256 psraw xmm0, 7 | 397 palignr m5, m3, 5 |
257 packuswb xmm0, xmm0 | 398 mova m6, m4 |
258 %if %1 | 399 palignr m4, m3, 1 |
259 movq xmm1, [rdi] | 400 pmaddubsw m4, k0k1 |
260 pavgb xmm0, xmm1 | 401 pmaddubsw m5, k2k3 |
261 %endif | 402 palignr m6, m3, 9 |
262 movq [rdi], xmm0 | 403 pmaddubsw m6, k4k5 |
263 | 404 palignr m7, m3, 13 |
264 movq xmm0, [rsi + 8] ;A | 405 pmaddubsw m7, k6k7 |
265 movq xmm1, [rsi + rdx + 8] ;B | 406 |
266 movq xmm2, [rsi + rdx * 2 + 8] ;C | 407 mova m3, m1 |
267 movq xmm3, [rax + rdx * 2 + 8] ;D | 408 pmaxsw m1, m2 |
268 movq xmm4, [rsi + rdx * 4 + 8] ;E | 409 pminsw m2, m3 |
269 movq xmm5, [rax + rdx * 4 + 8] ;F | 410 paddsw m0, m2 |
270 | 411 paddsw m0, m1 |
271 punpcklbw xmm0, xmm1 ;A B | 412 paddsw m4, m7 |
272 punpcklbw xmm2, xmm3 ;C D | 413 mova m7, m5 |
273 punpcklbw xmm4, xmm5 ;E F | 414 pmaxsw m5, m6 |
274 | 415 pminsw m6, m7 |
275 movq xmm6, [rsi + rbx + 8] ;G | 416 paddsw m4, m6 |
276 movq xmm7, [rax + rbx + 8] ;H | 417 paddsw m4, m5 |
277 punpcklbw xmm6, xmm7 ;G H | 418 paddsw m0, krd |
278 | 419 paddsw m4, krd |
279 pmaddubsw xmm0, k0k1 | 420 psraw m0, 7 |
280 pmaddubsw xmm2, k2k3 | 421 psraw m4, 7 |
281 pmaddubsw xmm4, k4k5 | 422 packuswb m0, m4 |
282 pmaddubsw xmm6, k6k7 | 423 %ifidn %1, h8_avg |
283 | 424 mova m1, [dstq] |
284 paddsw xmm0, xmm6 | 425 pavgb m0, m1 |
285 movdqa xmm1, xmm2 | 426 %endif |
286 pmaxsw xmm2, xmm4 | 427 lea srcq, [srcq + sstrideq] |
287 pminsw xmm4, xmm1 | 428 mova [dstq], m0 |
288 paddsw xmm0, xmm4 | 429 lea dstq, [dstq + dstrideq] |
289 paddsw xmm0, xmm2 | 430 dec heightq |
290 | 431 jnz .loop |
291 paddsw xmm0, krd | 432 RET |
292 psraw xmm0, 7 | 433 %endm |
293 packuswb xmm0, xmm0 | 434 |
294 | 435 INIT_XMM ssse3 |
295 add rsi, rdx | 436 SUBPIX_HFILTER16 h8 |
296 add rax, rdx | 437 SUBPIX_HFILTER16 h8_avg |
297 %if %1 | 438 SUBPIX_HFILTER8 h8 |
298 movq xmm1, [rdi+8] | 439 SUBPIX_HFILTER8 h8_avg |
299 pavgb xmm0, xmm1 | 440 SUBPIX_HFILTER4 h8 |
300 %endif | 441 SUBPIX_HFILTER4 h8_avg |
301 | 442 |
302 movq [rdi+8], xmm0 | 443 ;------------------------------------------------------------------------------- |
303 | 444 %macro SUBPIX_VFILTER 2 |
304 %if ABI_IS_32BIT | 445 cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \ |
305 add rdi, DWORD PTR arg(3) ;out_pitch | 446 src, sstride, dst, dstride, height, filter |
306 %else | 447 mova m4, [filterq] |
307 add rdi, r8 | 448 SETUP_LOCAL_VARS |
308 %endif | 449 %if ARCH_X86_64 |
309 dec rcx | 450 %define src1q r7 |
310 jnz .loop | 451 %define sstride6q r8 |
311 %endm | 452 %define dst_stride dstrideq |
312 | 453 %else |
313 ;void vpx_filter_block1d8_v8_ssse3 | 454 %define src1q filterq |
314 ;( | 455 %define sstride6q dstrideq |
315 ; unsigned char *src_ptr, | 456 %define dst_stride dstridemp |
316 ; unsigned int src_pitch, | 457 %endif |
317 ; unsigned char *output_ptr, | 458 mov src1q, srcq |
318 ; unsigned int out_pitch, | 459 add src1q, sstrideq |
319 ; unsigned int output_height, | 460 lea sstride6q, [sstrideq + sstrideq * 4] |
320 ; short *filter | 461 add sstride6q, sstrideq ;pitch * 6 |
321 ;) | 462 |
322 global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE | 463 %ifidn %2, 8 |
323 sym(vpx_filter_block1d4_v8_ssse3): | 464 %define movx movh |
324 push rbp | 465 %else |
325 mov rbp, rsp | 466 %define movx movd |
326 SHADOW_ARGS_TO_STACK 6 | 467 %endif |
327 SAVE_XMM 7 | |
328 push rsi | |
329 push rdi | |
330 push rbx | |
331 ; end prolog | |
332 | |
333 ALIGN_STACK 16, rax | |
334 sub rsp, 16*5 | |
335 %define k0k1 [rsp + 16*0] | |
336 %define k2k3 [rsp + 16*1] | |
337 %define k4k5 [rsp + 16*2] | |
338 %define k6k7 [rsp + 16*3] | |
339 %define krd [rsp + 16*4] | |
340 | |
341 VERTx4 0 | |
342 | |
343 add rsp, 16*5 | |
344 pop rsp | |
345 pop rbx | |
346 ; begin epilog | |
347 pop rdi | |
348 pop rsi | |
349 RESTORE_XMM | |
350 UNSHADOW_ARGS | |
351 pop rbp | |
352 ret | |
353 | |
354 ;void vpx_filter_block1d8_v8_ssse3 | |
355 ;( | |
356 ; unsigned char *src_ptr, | |
357 ; unsigned int src_pitch, | |
358 ; unsigned char *output_ptr, | |
359 ; unsigned int out_pitch, | |
360 ; unsigned int output_height, | |
361 ; short *filter | |
362 ;) | |
363 global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE | |
364 sym(vpx_filter_block1d8_v8_ssse3): | |
365 push rbp | |
366 mov rbp, rsp | |
367 SHADOW_ARGS_TO_STACK 6 | |
368 SAVE_XMM 7 | |
369 push rsi | |
370 push rdi | |
371 push rbx | |
372 ; end prolog | |
373 | |
374 ALIGN_STACK 16, rax | |
375 sub rsp, 16*5 | |
376 %define k0k1 [rsp + 16*0] | |
377 %define k2k3 [rsp + 16*1] | |
378 %define k4k5 [rsp + 16*2] | |
379 %define k6k7 [rsp + 16*3] | |
380 %define krd [rsp + 16*4] | |
381 | |
382 VERTx8 0 | |
383 | |
384 add rsp, 16*5 | |
385 pop rsp | |
386 pop rbx | |
387 ; begin epilog | |
388 pop rdi | |
389 pop rsi | |
390 RESTORE_XMM | |
391 UNSHADOW_ARGS | |
392 pop rbp | |
393 ret | |
394 | |
395 ;void vpx_filter_block1d16_v8_ssse3 | |
396 ;( | |
397 ; unsigned char *src_ptr, | |
398 ; unsigned int src_pitch, | |
399 ; unsigned char *output_ptr, | |
400 ; unsigned int out_pitch, | |
401 ; unsigned int output_height, | |
402 ; short *filter | |
403 ;) | |
404 global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE | |
405 sym(vpx_filter_block1d16_v8_ssse3): | |
406 push rbp | |
407 mov rbp, rsp | |
408 SHADOW_ARGS_TO_STACK 6 | |
409 SAVE_XMM 7 | |
410 push rsi | |
411 push rdi | |
412 push rbx | |
413 ; end prolog | |
414 | |
415 ALIGN_STACK 16, rax | |
416 sub rsp, 16*5 | |
417 %define k0k1 [rsp + 16*0] | |
418 %define k2k3 [rsp + 16*1] | |
419 %define k4k5 [rsp + 16*2] | |
420 %define k6k7 [rsp + 16*3] | |
421 %define krd [rsp + 16*4] | |
422 | |
423 VERTx16 0 | |
424 | |
425 add rsp, 16*5 | |
426 pop rsp | |
427 pop rbx | |
428 ; begin epilog | |
429 pop rdi | |
430 pop rsi | |
431 RESTORE_XMM | |
432 UNSHADOW_ARGS | |
433 pop rbp | |
434 ret | |
435 | |
436 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
437 | |
438 | |
439 global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE | |
440 sym(vpx_filter_block1d4_v8_avg_ssse3): | |
441 push rbp | |
442 mov rbp, rsp | |
443 SHADOW_ARGS_TO_STACK 6 | |
444 SAVE_XMM 7 | |
445 push rsi | |
446 push rdi | |
447 push rbx | |
448 ; end prolog | |
449 | |
450 ALIGN_STACK 16, rax | |
451 sub rsp, 16*5 | |
452 %define k0k1 [rsp + 16*0] | |
453 %define k2k3 [rsp + 16*1] | |
454 %define k4k5 [rsp + 16*2] | |
455 %define k6k7 [rsp + 16*3] | |
456 %define krd [rsp + 16*4] | |
457 | |
458 VERTx4 1 | |
459 | |
460 add rsp, 16*5 | |
461 pop rsp | |
462 pop rbx | |
463 ; begin epilog | |
464 pop rdi | |
465 pop rsi | |
466 RESTORE_XMM | |
467 UNSHADOW_ARGS | |
468 pop rbp | |
469 ret | |
470 | |
471 global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE | |
472 sym(vpx_filter_block1d8_v8_avg_ssse3): | |
473 push rbp | |
474 mov rbp, rsp | |
475 SHADOW_ARGS_TO_STACK 6 | |
476 SAVE_XMM 7 | |
477 push rsi | |
478 push rdi | |
479 push rbx | |
480 ; end prolog | |
481 | |
482 ALIGN_STACK 16, rax | |
483 sub rsp, 16*5 | |
484 %define k0k1 [rsp + 16*0] | |
485 %define k2k3 [rsp + 16*1] | |
486 %define k4k5 [rsp + 16*2] | |
487 %define k6k7 [rsp + 16*3] | |
488 %define krd [rsp + 16*4] | |
489 | |
490 VERTx8 1 | |
491 | |
492 add rsp, 16*5 | |
493 pop rsp | |
494 pop rbx | |
495 ; begin epilog | |
496 pop rdi | |
497 pop rsi | |
498 RESTORE_XMM | |
499 UNSHADOW_ARGS | |
500 pop rbp | |
501 ret | |
502 | |
503 global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE | |
504 sym(vpx_filter_block1d16_v8_avg_ssse3): | |
505 push rbp | |
506 mov rbp, rsp | |
507 SHADOW_ARGS_TO_STACK 6 | |
508 SAVE_XMM 7 | |
509 push rsi | |
510 push rdi | |
511 push rbx | |
512 ; end prolog | |
513 | |
514 ALIGN_STACK 16, rax | |
515 sub rsp, 16*5 | |
516 %define k0k1 [rsp + 16*0] | |
517 %define k2k3 [rsp + 16*1] | |
518 %define k4k5 [rsp + 16*2] | |
519 %define k6k7 [rsp + 16*3] | |
520 %define krd [rsp + 16*4] | |
521 | |
522 VERTx16 1 | |
523 | |
524 add rsp, 16*5 | |
525 pop rsp | |
526 pop rbx | |
527 ; begin epilog | |
528 pop rdi | |
529 pop rsi | |
530 RESTORE_XMM | |
531 UNSHADOW_ARGS | |
532 pop rbp | |
533 ret | |
534 | |
535 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | |
536 %macro HORIZx4_ROW 2 | |
537 movdqa %2, %1 | |
538 pshufb %1, [GLOBAL(shuf_t0t1)] | |
539 pshufb %2, [GLOBAL(shuf_t2t3)] | |
540 pmaddubsw %1, k0k1k4k5 | |
541 pmaddubsw %2, k2k3k6k7 | |
542 | |
543 movdqa xmm4, %1 | |
544 movdqa xmm5, %2 | |
545 psrldq %1, 8 | |
546 psrldq %2, 8 | |
547 movdqa xmm6, xmm5 | |
548 | |
549 paddsw xmm4, %2 | |
550 pmaxsw xmm5, %1 | |
551 pminsw %1, xmm6 | |
552 paddsw %1, xmm4 | |
553 paddsw %1, xmm5 | |
554 | |
555 paddsw %1, krd | |
556 psraw %1, 7 | |
557 packuswb %1, %1 | |
558 %endm | |
559 | |
560 %macro HORIZx4 1 | |
561 mov rdx, arg(5) ;filter ptr | |
562 mov rsi, arg(0) ;src_ptr | |
563 mov rdi, arg(2) ;output_ptr | |
564 mov rcx, 0x0400040 | |
565 | |
566 movdqa xmm4, [rdx] ;load filters | |
567 movq xmm5, rcx | |
568 packsswb xmm4, xmm4 | |
569 pshuflw xmm6, xmm4, 0b ;k0_k1 | |
570 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5 | |
571 pshuflw xmm7, xmm4, 01010101b ;k2_k3 | |
572 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7 | |
573 pshufd xmm5, xmm5, 0 ;rounding | |
574 | |
575 movdqa k0k1k4k5, xmm6 | |
576 movdqa k2k3k6k7, xmm7 | |
577 movdqa krd, xmm5 | |
578 | |
579 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | |
580 movsxd rdx, dword ptr arg(3) ;output_pitch | |
581 movsxd rcx, dword ptr arg(4) ;output_height | |
582 shr rcx, 1 | |
583 .loop: | 468 .loop: |
584 ;Do two rows once | 469 movx m0, [srcq ] ;A |
585 movq xmm0, [rsi - 3] ;load src | 470 movx m1, [srcq + sstrideq ] ;B |
586 movq xmm1, [rsi + 5] | 471 punpcklbw m0, m1 ;A B |
587 movq xmm2, [rsi + rax - 3] | 472 movx m2, [srcq + sstrideq * 2 ] ;C |
588 movq xmm3, [rsi + rax + 5] | 473 pmaddubsw m0, k0k1 |
589 punpcklqdq xmm0, xmm1 | 474 mova m6, m2 |
590 punpcklqdq xmm2, xmm3 | 475 movx m3, [src1q + sstrideq * 2] ;D |
591 | 476 punpcklbw m2, m3 ;C D |
592 HORIZx4_ROW xmm0, xmm1 | 477 pmaddubsw m2, k2k3 |
593 HORIZx4_ROW xmm2, xmm3 | 478 movx m4, [srcq + sstrideq * 4 ] ;E |
594 %if %1 | 479 mova m7, m4 |
595 movd xmm1, [rdi] | 480 movx m5, [src1q + sstrideq * 4] ;F |
596 pavgb xmm0, xmm1 | 481 punpcklbw m4, m5 ;E F |
597 movd xmm3, [rdi + rdx] | 482 pmaddubsw m4, k4k5 |
598 pavgb xmm2, xmm3 | 483 punpcklbw m1, m6 ;A B next iter |
599 %endif | 484 movx m6, [srcq + sstride6q ] ;G |
600 movd [rdi], xmm0 | 485 punpcklbw m5, m6 ;E F next iter |
601 movd [rdi +rdx], xmm2 | 486 punpcklbw m3, m7 ;C D next iter |
602 | 487 pmaddubsw m5, k4k5 |
603 lea rsi, [rsi + rax] | 488 movx m7, [src1q + sstride6q ] ;H |
604 prefetcht0 [rsi + 4 * rax - 3] | 489 punpcklbw m6, m7 ;G H |
605 lea rsi, [rsi + rax] | 490 pmaddubsw m6, k6k7 |
606 lea rdi, [rdi + 2 * rdx] | 491 mova tmp, m2 |
607 prefetcht0 [rsi + 2 * rax - 3] | 492 pmaddubsw m3, k2k3 |
608 | 493 pmaddubsw m1, k0k1 |
609 dec rcx | 494 pmaxsw m2, m4 |
610 jnz .loop | 495 paddsw m0, m6 |
611 | 496 movx m6, [srcq + sstrideq * 8 ] ;H next iter |
612 ; Do last row if output_height is odd | 497 punpcklbw m7, m6 |
613 movsxd rcx, dword ptr arg(4) ;output_height | 498 pmaddubsw m7, k6k7 |
614 and rcx, 1 | 499 pminsw m4, tmp |
615 je .done | 500 paddsw m0, m4 |
616 | 501 mova m4, m3 |
617 movq xmm0, [rsi - 3] ; load src | 502 paddsw m0, m2 |
618 movq xmm1, [rsi + 5] | 503 pminsw m3, m5 |
619 punpcklqdq xmm0, xmm1 | 504 pmaxsw m5, m4 |
620 | 505 paddsw m0, krd |
621 HORIZx4_ROW xmm0, xmm1 | 506 psraw m0, 7 |
622 %if %1 | 507 paddsw m1, m7 |
623 movd xmm1, [rdi] | 508 packuswb m0, m0 |
624 pavgb xmm0, xmm1 | 509 |
625 %endif | 510 paddsw m1, m3 |
626 movd [rdi], xmm0 | 511 paddsw m1, m5 |
627 .done | 512 paddsw m1, krd |
628 %endm | 513 psraw m1, 7 |
629 | 514 lea srcq, [srcq + sstrideq * 2 ] |
630 %macro HORIZx8_ROW 4 | 515 lea src1q, [src1q + sstrideq * 2] |
631 movdqa %2, %1 | 516 packuswb m1, m1 |
632 movdqa %3, %1 | 517 |
633 movdqa %4, %1 | 518 %ifidn %1, v8_avg |
634 | 519 movx m2, [dstq] |
635 pshufb %1, [GLOBAL(shuf_t0t1)] | 520 pavgb m0, m2 |
636 pshufb %2, [GLOBAL(shuf_t2t3)] | 521 %endif |
637 pshufb %3, [GLOBAL(shuf_t4t5)] | 522 movx [dstq], m0 |
638 pshufb %4, [GLOBAL(shuf_t6t7)] | 523 add dstq, dst_stride |
639 | 524 %ifidn %1, v8_avg |
640 pmaddubsw %1, k0k1 | 525 movx m3, [dstq] |
641 pmaddubsw %2, k2k3 | 526 pavgb m1, m3 |
642 pmaddubsw %3, k4k5 | 527 %endif |
643 pmaddubsw %4, k6k7 | 528 movx [dstq], m1 |
644 | 529 add dstq, dst_stride |
645 paddsw %1, %4 | 530 sub heightq, 2 |
646 movdqa %4, %2 | 531 cmp heightq, 1 |
647 pmaxsw %2, %3 | 532 jg .loop |
648 pminsw %3, %4 | 533 |
649 paddsw %1, %3 | 534 cmp heightq, 0 |
650 paddsw %1, %2 | 535 je .done |
651 | 536 |
652 paddsw %1, krd | 537 movx m0, [srcq ] ;A |
653 psraw %1, 7 | 538 movx m1, [srcq + sstrideq ] ;B |
654 packuswb %1, %1 | 539 movx m6, [srcq + sstride6q ] ;G |
655 %endm | 540 punpcklbw m0, m1 ;A B |
656 | 541 movx m7, [src1q + sstride6q ] ;H |
657 %macro HORIZx8 1 | 542 pmaddubsw m0, k0k1 |
658 mov rdx, arg(5) ;filter ptr | 543 movx m2, [srcq + sstrideq * 2 ] ;C |
659 mov rsi, arg(0) ;src_ptr | 544 punpcklbw m6, m7 ;G H |
660 mov rdi, arg(2) ;output_ptr | 545 movx m3, [src1q + sstrideq * 2 ] ;D |
661 mov rcx, 0x0400040 | 546 pmaddubsw m6, k6k7 |
662 | 547 movx m4, [srcq + sstrideq * 4 ] ;E |
663 movdqa xmm4, [rdx] ;load filters | 548 punpcklbw m2, m3 ;C D |
664 movq xmm5, rcx | 549 movx m5, [src1q + sstrideq * 4] ;F |
665 packsswb xmm4, xmm4 | 550 punpcklbw m4, m5 ;E F |
666 pshuflw xmm0, xmm4, 0b ;k0_k1 | 551 pmaddubsw m2, k2k3 |
667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 552 pmaddubsw m4, k4k5 |
668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 553 paddsw m0, m6 |
669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 554 mova m1, m2 |
670 | 555 pmaxsw m2, m4 |
671 punpcklqdq xmm0, xmm0 | 556 pminsw m4, m1 |
672 punpcklqdq xmm1, xmm1 | 557 paddsw m0, m4 |
673 punpcklqdq xmm2, xmm2 | 558 paddsw m0, m2 |
674 punpcklqdq xmm3, xmm3 | 559 paddsw m0, krd |
675 | 560 psraw m0, 7 |
676 movdqa k0k1, xmm0 | 561 packuswb m0, m0 |
677 movdqa k2k3, xmm1 | 562 %ifidn %1, v8_avg |
678 pshufd xmm5, xmm5, 0 | 563 movx m1, [dstq] |
679 movdqa k4k5, xmm2 | 564 pavgb m0, m1 |
680 movdqa k6k7, xmm3 | 565 %endif |
681 movdqa krd, xmm5 | 566 movx [dstq], m0 |
682 | 567 .done: |
683 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | 568 RET |
684 movsxd rdx, dword ptr arg(3) ;output_pitch | 569 %endm |
685 movsxd rcx, dword ptr arg(4) ;output_height | 570 |
686 shr rcx, 1 | 571 ;------------------------------------------------------------------------------- |
| 572 %macro SUBPIX_VFILTER16 1 |
| 573 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \ |
| 574 src, sstride, dst, dstride, height, filter |
| 575 |
| 576 mova m4, [filterq] |
| 577 SETUP_LOCAL_VARS |
| 578 %if ARCH_X86_64 |
| 579 %define src1q r7 |
| 580 %define sstride6q r8 |
| 581 %define dst_stride dstrideq |
| 582 %else |
| 583 %define src1q filterq |
| 584 %define sstride6q dstrideq |
| 585 %define dst_stride dstridemp |
| 586 %endif |
| 587 mov src1q, srcq |
| 588 add src1q, sstrideq |
| 589 lea sstride6q, [sstrideq + sstrideq * 4] |
| 590 add sstride6q, sstrideq ;pitch * 6 |
687 | 591 |
688 .loop: | 592 .loop: |
689 movq xmm0, [rsi - 3] ;load src | 593 movh m0, [srcq ] ;A |
690 movq xmm3, [rsi + 5] | 594 movh m1, [srcq + sstrideq ] ;B |
691 movq xmm4, [rsi + rax - 3] | 595 movh m2, [srcq + sstrideq * 2 ] ;C |
692 movq xmm7, [rsi + rax + 5] | 596 movh m3, [src1q + sstrideq * 2] ;D |
693 punpcklqdq xmm0, xmm3 | 597 movh m4, [srcq + sstrideq * 4 ] ;E |
694 punpcklqdq xmm4, xmm7 | 598 movh m5, [src1q + sstrideq * 4] ;F |
695 | 599 |
696 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 | 600 punpcklbw m0, m1 ;A B |
697 HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 | 601 movh m6, [srcq + sstride6q] ;G |
698 %if %1 | 602 punpcklbw m2, m3 ;C D |
699 movq xmm1, [rdi] | 603 movh m7, [src1q + sstride6q] ;H |
700 movq xmm2, [rdi + rdx] | 604 punpcklbw m4, m5 ;E F |
701 pavgb xmm0, xmm1 | 605 pmaddubsw m0, k0k1 |
702 pavgb xmm4, xmm2 | 606 movh m3, [srcq + 8] ;A |
703 %endif | 607 pmaddubsw m2, k2k3 |
704 movq [rdi], xmm0 | 608 punpcklbw m6, m7 ;G H |
705 movq [rdi + rdx], xmm4 | 609 movh m5, [srcq + sstrideq + 8] ;B |
706 | 610 pmaddubsw m4, k4k5 |
707 lea rsi, [rsi + rax] | 611 punpcklbw m3, m5 ;A B |
708 prefetcht0 [rsi + 4 * rax - 3] | 612 movh m7, [srcq + sstrideq * 2 + 8] ;C |
709 lea rsi, [rsi + rax] | 613 pmaddubsw m6, k6k7 |
710 lea rdi, [rdi + 2 * rdx] | 614 mova m1, m2 |
711 prefetcht0 [rsi + 2 * rax - 3] | 615 movh m5, [src1q + sstrideq * 2 + 8] ;D |
712 dec rcx | 616 pmaxsw m2, m4 |
713 jnz .loop | 617 punpcklbw m7, m5 ;C D |
714 | 618 pminsw m4, m1 |
715 ;Do last row if output_height is odd | 619 paddsw m0, m6 |
716 movsxd rcx, dword ptr arg(4) ;output_height | 620 pmaddubsw m3, k0k1 |
717 and rcx, 1 | 621 movh m1, [srcq + sstrideq * 4 + 8] ;E |
718 je .done | 622 paddsw m0, m4 |
719 | 623 pmaddubsw m7, k2k3 |
720 movq xmm0, [rsi - 3] | 624 movh m6, [src1q + sstrideq * 4 + 8] ;F |
721 movq xmm3, [rsi + 5] | 625 punpcklbw m1, m6 ;E F |
722 punpcklqdq xmm0, xmm3 | 626 paddsw m0, m2 |
723 | 627 paddsw m0, krd |
724 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 | 628 movh m2, [srcq + sstride6q + 8] ;G |
725 %if %1 | 629 pmaddubsw m1, k4k5 |
726 movq xmm1, [rdi] | 630 movh m5, [src1q + sstride6q + 8] ;H |
727 pavgb xmm0, xmm1 | 631 psraw m0, 7 |
728 %endif | 632 punpcklbw m2, m5 ;G H |
729 movq [rdi], xmm0 | 633 packuswb m0, m0 |
730 .done | 634 pmaddubsw m2, k6k7 |
731 %endm | 635 %ifidn %1, v8_avg |
732 | 636 movh m4, [dstq] |
733 %macro HORIZx16 1 | 637 pavgb m0, m4 |
734 mov rdx, arg(5) ;filter ptr | 638 %endif |
735 mov rsi, arg(0) ;src_ptr | 639 movh [dstq], m0 |
736 mov rdi, arg(2) ;output_ptr | 640 mova m6, m7 |
737 mov rcx, 0x0400040 | 641 pmaxsw m7, m1 |
738 | 642 pminsw m1, m6 |
739 movdqa xmm4, [rdx] ;load filters | 643 paddsw m3, m2 |
740 movq xmm5, rcx | 644 paddsw m3, m1 |
741 packsswb xmm4, xmm4 | 645 paddsw m3, m7 |
742 pshuflw xmm0, xmm4, 0b ;k0_k1 | 646 paddsw m3, krd |
743 pshuflw xmm1, xmm4, 01010101b ;k2_k3 | 647 psraw m3, 7 |
744 pshuflw xmm2, xmm4, 10101010b ;k4_k5 | 648 packuswb m3, m3 |
745 pshuflw xmm3, xmm4, 11111111b ;k6_k7 | 649 |
746 | 650 add srcq, sstrideq |
747 punpcklqdq xmm0, xmm0 | 651 add src1q, sstrideq |
748 punpcklqdq xmm1, xmm1 | 652 %ifidn %1, v8_avg |
749 punpcklqdq xmm2, xmm2 | 653 movh m1, [dstq + 8] |
750 punpcklqdq xmm3, xmm3 | 654 pavgb m3, m1 |
751 | 655 %endif |
752 movdqa k0k1, xmm0 | 656 movh [dstq + 8], m3 |
753 movdqa k2k3, xmm1 | 657 add dstq, dst_stride |
754 pshufd xmm5, xmm5, 0 | 658 dec heightq |
755 movdqa k4k5, xmm2 | 659 jnz .loop |
756 movdqa k6k7, xmm3 | 660 RET |
757 movdqa krd, xmm5 | 661 %endm |
758 | 662 |
759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line | 663 INIT_XMM ssse3 |
760 movsxd rdx, dword ptr arg(3) ;output_pitch | 664 SUBPIX_VFILTER16 v8 |
761 movsxd rcx, dword ptr arg(4) ;output_height | 665 SUBPIX_VFILTER16 v8_avg |
762 | 666 SUBPIX_VFILTER v8, 8 |
763 .loop: | 667 SUBPIX_VFILTER v8_avg, 8 |
764 prefetcht0 [rsi + 2 * rax -3] | 668 SUBPIX_VFILTER v8, 4 |
765 | 669 SUBPIX_VFILTER v8_avg, 4 |
766 movq xmm0, [rsi - 3] ;load src data | |
767 movq xmm4, [rsi + 5] | |
768 movq xmm6, [rsi + 13] | |
769 punpcklqdq xmm0, xmm4 | |
770 punpcklqdq xmm4, xmm6 | |
771 | |
772 movdqa xmm7, xmm0 | |
773 | |
774 punpcklbw xmm7, xmm7 | |
775 punpckhbw xmm0, xmm0 | |
776 movdqa xmm1, xmm0 | |
777 movdqa xmm2, xmm0 | |
778 movdqa xmm3, xmm0 | |
779 | |
780 palignr xmm0, xmm7, 1 | |
781 palignr xmm1, xmm7, 5 | |
782 pmaddubsw xmm0, k0k1 | |
783 palignr xmm2, xmm7, 9 | |
784 pmaddubsw xmm1, k2k3 | |
785 palignr xmm3, xmm7, 13 | |
786 | |
787 pmaddubsw xmm2, k4k5 | |
788 pmaddubsw xmm3, k6k7 | |
789 paddsw xmm0, xmm3 | |
790 | |
791 movdqa xmm3, xmm4 | |
792 punpcklbw xmm3, xmm3 | |
793 punpckhbw xmm4, xmm4 | |
794 | |
795 movdqa xmm5, xmm4 | |
796 movdqa xmm6, xmm4 | |
797 movdqa xmm7, xmm4 | |
798 | |
799 palignr xmm4, xmm3, 1 | |
800 palignr xmm5, xmm3, 5 | |
801 palignr xmm6, xmm3, 9 | |
802 palignr xmm7, xmm3, 13 | |
803 | |
804 movdqa xmm3, xmm1 | |
805 pmaddubsw xmm4, k0k1 | |
806 pmaxsw xmm1, xmm2 | |
807 pmaddubsw xmm5, k2k3 | |
808 pminsw xmm2, xmm3 | |
809 pmaddubsw xmm6, k4k5 | |
810 paddsw xmm0, xmm2 | |
811 pmaddubsw xmm7, k6k7 | |
812 paddsw xmm0, xmm1 | |
813 | |
814 paddsw xmm4, xmm7 | |
815 movdqa xmm7, xmm5 | |
816 pmaxsw xmm5, xmm6 | |
817 pminsw xmm6, xmm7 | |
818 paddsw xmm4, xmm6 | |
819 paddsw xmm4, xmm5 | |
820 | |
821 paddsw xmm0, krd | |
822 paddsw xmm4, krd | |
823 psraw xmm0, 7 | |
824 psraw xmm4, 7 | |
825 packuswb xmm0, xmm0 | |
826 packuswb xmm4, xmm4 | |
827 punpcklqdq xmm0, xmm4 | |
828 %if %1 | |
829 movdqa xmm1, [rdi] | |
830 pavgb xmm0, xmm1 | |
831 %endif | |
832 | |
833 lea rsi, [rsi + rax] | |
834 movdqa [rdi], xmm0 | |
835 | |
836 lea rdi, [rdi + rdx] | |
837 dec rcx | |
838 jnz .loop | |
839 %endm | |
840 | |
841 ;void vpx_filter_block1d4_h8_ssse3 | |
842 ;( | |
843 ; unsigned char *src_ptr, | |
844 ; unsigned int src_pixels_per_line, | |
845 ; unsigned char *output_ptr, | |
846 ; unsigned int output_pitch, | |
847 ; unsigned int output_height, | |
848 ; short *filter | |
849 ;) | |
850 global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE | |
851 sym(vpx_filter_block1d4_h8_ssse3): | |
852 push rbp | |
853 mov rbp, rsp | |
854 SHADOW_ARGS_TO_STACK 6 | |
855 SAVE_XMM 7 | |
856 GET_GOT rbx | |
857 push rsi | |
858 push rdi | |
859 ; end prolog | |
860 | |
861 ALIGN_STACK 16, rax | |
862 sub rsp, 16 * 3 | |
863 %define k0k1k4k5 [rsp + 16 * 0] | |
864 %define k2k3k6k7 [rsp + 16 * 1] | |
865 %define krd [rsp + 16 * 2] | |
866 | |
867 HORIZx4 0 | |
868 | |
869 add rsp, 16 * 3 | |
870 pop rsp | |
871 ; begin epilog | |
872 pop rdi | |
873 pop rsi | |
874 RESTORE_GOT | |
875 RESTORE_XMM | |
876 UNSHADOW_ARGS | |
877 pop rbp | |
878 ret | |
879 | |
880 ;void vpx_filter_block1d8_h8_ssse3 | |
881 ;( | |
882 ; unsigned char *src_ptr, | |
883 ; unsigned int src_pixels_per_line, | |
884 ; unsigned char *output_ptr, | |
885 ; unsigned int output_pitch, | |
886 ; unsigned int output_height, | |
887 ; short *filter | |
888 ;) | |
889 global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE | |
890 sym(vpx_filter_block1d8_h8_ssse3): | |
891 push rbp | |
892 mov rbp, rsp | |
893 SHADOW_ARGS_TO_STACK 6 | |
894 SAVE_XMM 7 | |
895 GET_GOT rbx | |
896 push rsi | |
897 push rdi | |
898 ; end prolog | |
899 | |
900 ALIGN_STACK 16, rax | |
901 sub rsp, 16*5 | |
902 %define k0k1 [rsp + 16*0] | |
903 %define k2k3 [rsp + 16*1] | |
904 %define k4k5 [rsp + 16*2] | |
905 %define k6k7 [rsp + 16*3] | |
906 %define krd [rsp + 16*4] | |
907 | |
908 HORIZx8 0 | |
909 | |
910 add rsp, 16*5 | |
911 pop rsp | |
912 | |
913 ; begin epilog | |
914 pop rdi | |
915 pop rsi | |
916 RESTORE_GOT | |
917 RESTORE_XMM | |
918 UNSHADOW_ARGS | |
919 pop rbp | |
920 ret | |
921 | |
922 ;void vpx_filter_block1d16_h8_ssse3 | |
923 ;( | |
924 ; unsigned char *src_ptr, | |
925 ; unsigned int src_pixels_per_line, | |
926 ; unsigned char *output_ptr, | |
927 ; unsigned int output_pitch, | |
928 ; unsigned int output_height, | |
929 ; short *filter | |
930 ;) | |
931 global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE | |
932 sym(vpx_filter_block1d16_h8_ssse3): | |
933 push rbp | |
934 mov rbp, rsp | |
935 SHADOW_ARGS_TO_STACK 6 | |
936 SAVE_XMM 7 | |
937 GET_GOT rbx | |
938 push rsi | |
939 push rdi | |
940 ; end prolog | |
941 | |
942 ALIGN_STACK 16, rax | |
943 sub rsp, 16*5 | |
944 %define k0k1 [rsp + 16*0] | |
945 %define k2k3 [rsp + 16*1] | |
946 %define k4k5 [rsp + 16*2] | |
947 %define k6k7 [rsp + 16*3] | |
948 %define krd [rsp + 16*4] | |
949 | |
950 HORIZx16 0 | |
951 | |
952 add rsp, 16*5 | |
953 pop rsp | |
954 | |
955 ; begin epilog | |
956 pop rdi | |
957 pop rsi | |
958 RESTORE_GOT | |
959 RESTORE_XMM | |
960 UNSHADOW_ARGS | |
961 pop rbp | |
962 ret | |
963 | |
964 global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE | |
965 sym(vpx_filter_block1d4_h8_avg_ssse3): | |
966 push rbp | |
967 mov rbp, rsp | |
968 SHADOW_ARGS_TO_STACK 6 | |
969 SAVE_XMM 7 | |
970 GET_GOT rbx | |
971 push rsi | |
972 push rdi | |
973 ; end prolog | |
974 | |
975 ALIGN_STACK 16, rax | |
976 sub rsp, 16 * 3 | |
977 %define k0k1k4k5 [rsp + 16 * 0] | |
978 %define k2k3k6k7 [rsp + 16 * 1] | |
979 %define krd [rsp + 16 * 2] | |
980 | |
981 HORIZx4 1 | |
982 | |
983 add rsp, 16 * 3 | |
984 pop rsp | |
985 ; begin epilog | |
986 pop rdi | |
987 pop rsi | |
988 RESTORE_GOT | |
989 RESTORE_XMM | |
990 UNSHADOW_ARGS | |
991 pop rbp | |
992 ret | |
993 | |
994 global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE | |
995 sym(vpx_filter_block1d8_h8_avg_ssse3): | |
996 push rbp | |
997 mov rbp, rsp | |
998 SHADOW_ARGS_TO_STACK 6 | |
999 SAVE_XMM 7 | |
1000 GET_GOT rbx | |
1001 push rsi | |
1002 push rdi | |
1003 ; end prolog | |
1004 | |
1005 ALIGN_STACK 16, rax | |
1006 sub rsp, 16*5 | |
1007 %define k0k1 [rsp + 16*0] | |
1008 %define k2k3 [rsp + 16*1] | |
1009 %define k4k5 [rsp + 16*2] | |
1010 %define k6k7 [rsp + 16*3] | |
1011 %define krd [rsp + 16*4] | |
1012 | |
1013 HORIZx8 1 | |
1014 | |
1015 add rsp, 16*5 | |
1016 pop rsp | |
1017 | |
1018 ; begin epilog | |
1019 pop rdi | |
1020 pop rsi | |
1021 RESTORE_GOT | |
1022 RESTORE_XMM | |
1023 UNSHADOW_ARGS | |
1024 pop rbp | |
1025 ret | |
1026 | |
1027 global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE | |
1028 sym(vpx_filter_block1d16_h8_avg_ssse3): | |
1029 push rbp | |
1030 mov rbp, rsp | |
1031 SHADOW_ARGS_TO_STACK 6 | |
1032 SAVE_XMM 7 | |
1033 GET_GOT rbx | |
1034 push rsi | |
1035 push rdi | |
1036 ; end prolog | |
1037 | |
1038 ALIGN_STACK 16, rax | |
1039 sub rsp, 16*5 | |
1040 %define k0k1 [rsp + 16*0] | |
1041 %define k2k3 [rsp + 16*1] | |
1042 %define k4k5 [rsp + 16*2] | |
1043 %define k6k7 [rsp + 16*3] | |
1044 %define krd [rsp + 16*4] | |
1045 | |
1046 HORIZx16 1 | |
1047 | |
1048 add rsp, 16*5 | |
1049 pop rsp | |
1050 | |
1051 ; begin epilog | |
1052 pop rdi | |
1053 pop rsi | |
1054 RESTORE_GOT | |
1055 RESTORE_XMM | |
1056 UNSHADOW_ARGS | |
1057 pop rbp | |
1058 ret | |
1059 SECTION_RODATA | |
1060 align 16 | |
1061 shuf_t0t1: | |
1062 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 | |
1063 align 16 | |
1064 shuf_t2t3: | |
1065 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 | |
1066 align 16 | |
1067 shuf_t4t5: | |
1068 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 | |
1069 align 16 | |
1070 shuf_t6t7: | |
1071 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 | |
OLD | NEW |