Chromium Code Reviews

Side by Side Diff: source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm

Issue 1339513003: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 3 months ago
1 ; 1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 2 ; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 ; 3 ;
4 ; Use of this source code is governed by a BSD-style license 4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source 5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found 6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may 7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree. 8 ; be found in the AUTHORS file in the root of the source tree.
9 ; 9 ;
10 10
11 11 %include "third_party/x86inc/x86inc.asm"
12 %include "vpx_ports/x86_abi_support.asm" 12
13 13 SECTION_RODATA
14 %macro VERTx4 1 14 pw_64: times 8 dw 64
15 mov rdx, arg(5) ;filter ptr 15
16 mov rsi, arg(0) ;src_ptr 16 ; %define USE_PMULHRSW
17 mov rdi, arg(2) ;output_ptr 17 ; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
18 mov rcx, 0x0400040 18 ; when using this instruction.
19 19
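The NOTE above refers to the rounding step: instead of pmulhrsw, every filter sum in this file is rounded by adding the pw_64 constant (loaded into krd) and arithmetic-shifting right by 7 before packing. A minimal scalar sketch of that rounding, assuming the usual 7-bit subpel convention where the taps sum to 128; round_shift_clip is a hypothetical helper for illustration, not part of the patch:

    #include <stdint.h>

    /* Mirrors "paddsw m0, krd ; psraw m0, 7 ; packuswb m0, m0". */
    static uint8_t round_shift_clip(int32_t sum) {
      int32_t v = (sum + 64) >> 7;   /* pw_64 = 64 = 1 << 6, then >> 7 */
      if (v < 0) v = 0;              /* packuswb clamps to [0, 255] */
      if (v > 255) v = 255;
      return (uint8_t)v;
    }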
20 movdqa xmm4, [rdx] ;load filters 20 SECTION .text
21 movq xmm5, rcx 21 %if ARCH_X86_64
22 packsswb xmm4, xmm4 22 %define LOCAL_VARS_SIZE 16*4
23 pshuflw xmm0, xmm4, 0b ;k0_k1 23 %else
24 pshuflw xmm1, xmm4, 01010101b ;k2_k3 24 %define LOCAL_VARS_SIZE 16*6
25 pshuflw xmm2, xmm4, 10101010b ;k4_k5 25 %endif
26 pshuflw xmm3, xmm4, 11111111b ;k6_k7 26
27 27 %macro SETUP_LOCAL_VARS 0
28 punpcklqdq xmm0, xmm0 28 ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
29 punpcklqdq xmm1, xmm1 29 ; pmaddubsw has a higher latency on some platforms, this might be eased by
30 punpcklqdq xmm2, xmm2 30 ; interleaving the instructions.
31 punpcklqdq xmm3, xmm3 31 %define k0k1 [rsp + 16*0]
32 32 %define k2k3 [rsp + 16*1]
33 movdqa k0k1, xmm0 33 %define k4k5 [rsp + 16*2]
34 movdqa k2k3, xmm1 34 %define k6k7 [rsp + 16*3]
35 pshufd xmm5, xmm5, 0 35 packsswb m4, m4
36 movdqa k4k5, xmm2 36 ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
37 movdqa k6k7, xmm3 37 ; some platforms.
38 movdqa krd, xmm5 38 pshuflw m0, m4, 0b ;k0_k1
39 39 pshuflw m1, m4, 01010101b ;k2_k3
40 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line 40 pshuflw m2, m4, 10101010b ;k4_k5
41 41 pshuflw m3, m4, 11111111b ;k6_k7
42 %if ABI_IS_32BIT=0 42 punpcklqdq m0, m0
43 movsxd r8, DWORD PTR arg(3) ;out_pitch 43 punpcklqdq m1, m1
44 %endif 44 punpcklqdq m2, m2
45 mov rax, rsi 45 punpcklqdq m3, m3
46 movsxd rcx, DWORD PTR arg(4) ;output_height 46 mova k0k1, m0
47 add rax, rdx 47 mova k2k3, m1
48 48 mova k4k5, m2
49 lea rbx, [rdx + rdx*4] 49 mova k6k7, m3
50 add rbx, rdx ;pitch * 6 50 %if ARCH_X86_64
51 51 %define krd m12
52 %define tmp m13
53 mova krd, [GLOBAL(pw_64)]
54 %else
55 %define tmp [rsp + 16*4]
56 %define krd [rsp + 16*5]
57 %if CONFIG_PIC=0
58 mova m6, [GLOBAL(pw_64)]
59 %else
60 ; build constants without accessing global memory
61 pcmpeqb m6, m6 ;all ones
62 psrlw m6, 15
63 psllw m6, 6 ;aka pw_64
64 %endif
65 mova krd, m6
66 %endif
67 %endm
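SETUP_LOCAL_VARS packs the eight 16-bit taps to signed bytes, broadcasts each adjacent pair across a register for pmaddubsw, and builds the pw_64 rounding constant either from rodata or, under CONFIG_PIC, entirely in registers. A rough SSE2 intrinsics sketch of the same two steps, for illustration only (broadcast_pair and make_pw_64 are hypothetical names; packed_taps corresponds to the result of "packsswb m4, m4", i.e. _mm_packs_epi16(filter, filter)):

    #include <emmintrin.h>   /* SSE2 */

    /* "pshuflw m0, m4, imm ; punpcklqdq m0, m0": repeat one byte pair 8x. */
    static __m128i broadcast_pair(__m128i packed_taps, int pair /* 0..3 */) {
      __m128i p;
      switch (pair) {
        case 0:  p = _mm_shufflelo_epi16(packed_taps, 0x00); break;  /* k0_k1 */
        case 1:  p = _mm_shufflelo_epi16(packed_taps, 0x55); break;  /* k2_k3 */
        case 2:  p = _mm_shufflelo_epi16(packed_taps, 0xAA); break;  /* k4_k5 */
        default: p = _mm_shufflelo_epi16(packed_taps, 0xFF); break;  /* k6_k7 */
      }
      return _mm_unpacklo_epi64(p, p);
    }

    /* pcmpeqb/psrlw/psllw: put 64 in every 16-bit lane without a memory load. */
    static __m128i make_pw_64(void) {
      __m128i ones = _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
      return _mm_slli_epi16(_mm_srli_epi16(ones, 15), 6);
    }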
68
69 %macro HORIZx4_ROW 2
70 mova %2, %1
71 punpcklbw %1, %1
72 punpckhbw %2, %2
73
74 mova m3, %2
75 palignr %2, %1, 1
76 palignr m3, %1, 5
77
78 pmaddubsw %2, k0k1k4k5
79 pmaddubsw m3, k2k3k6k7
80
81 mova m4, %2
82 mova m5, m3
83 psrldq %2, 8
84 psrldq m3, 8
85 mova m6, m5
86
87 paddsw m4, m3
88 pmaxsw m5, %2
89 pminsw %2, m6
90 paddsw %2, m4
91 paddsw %2, m5
92 paddsw %2, krd
93 psraw %2, 7
94 packuswb %2, %2
95 %endm
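HORIZx4_ROW and the wider horizontal paths below all compute the same per-pixel value; only the amount of parallelism differs. A scalar reference of what one output pixel works out to, assuming the standard 8-tap convolution this file implements (filter_hor is a hypothetical helper):

    #include <stdint.h>

    /* src points at the output position; taps k[0..7] are applied to
     * src[-3] .. src[+4], matching the "movh m0, [srcq - 3]" loads. */
    static uint8_t filter_hor(const uint8_t *src, const int16_t *k) {
      int32_t sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += (int32_t)src[i - 3] * k[i];
      sum = (sum + 64) >> 7;                      /* krd rounding, psraw 7 */
      return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }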
96
97 ;-------------------------------------------------------------------------------
98 %macro SUBPIX_HFILTER4 1
99 cglobal filter_block1d4_%1, 6, 6+(ARCH_X86_64*2), 11, LOCAL_VARS_SIZE, \
100 src, sstride, dst, dstride, height, filter
101 mova m4, [filterq]
102 packsswb m4, m4
103 %if ARCH_X86_64
104 %define k0k1k4k5 m8
105 %define k2k3k6k7 m9
106 %define krd m10
107 %define orig_height r7
108 mova krd, [GLOBAL(pw_64)]
109 pshuflw k0k1k4k5, m4, 0b ;k0_k1
110 pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
111 pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
112 pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
113 %else
114 %define k0k1k4k5 [rsp + 16*0]
115 %define k2k3k6k7 [rsp + 16*1]
116 %define krd [rsp + 16*2]
117 %define orig_height [rsp + 16*3]
118 pshuflw m6, m4, 0b ;k0_k1
119 pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
120 pshuflw m7, m4, 01010101b ;k2_k3
121 pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
122 %if CONFIG_PIC=0
123 mova m1, [GLOBAL(pw_64)]
124 %else
125 ; build constants without accessing global memory
126 pcmpeqb m1, m1 ;all ones
127 psrlw m1, 15
128 psllw m1, 6 ;aka pw_64
129 %endif
130 mova k0k1k4k5, m6
131 mova k2k3k6k7, m7
132 mova krd, m1
133 %endif
134 mov orig_height, heightq
135 shr heightq, 1
52 .loop: 136 .loop:
53 movd xmm0, [rsi] ;A 137 ;Do two rows at once
54 movd xmm1, [rsi + rdx] ;B 138 movh m0, [srcq - 3]
55 movd xmm2, [rsi + rdx * 2] ;C 139 movh m1, [srcq + 5]
56 movd xmm3, [rax + rdx * 2] ;D 140 punpcklqdq m0, m1
57 movd xmm4, [rsi + rdx * 4] ;E 141 mova m1, m0
58 movd xmm5, [rax + rdx * 4] ;F 142 movh m2, [srcq + sstrideq - 3]
59 143 movh m3, [srcq + sstrideq + 5]
60 punpcklbw xmm0, xmm1 ;A B 144 punpcklqdq m2, m3
61 punpcklbw xmm2, xmm3 ;C D 145 mova m3, m2
62 punpcklbw xmm4, xmm5 ;E F 146 punpcklbw m0, m0
63 147 punpckhbw m1, m1
64 movd xmm6, [rsi + rbx] ;G 148 punpcklbw m2, m2
65 movd xmm7, [rax + rbx] ;H 149 punpckhbw m3, m3
66 150 mova m4, m1
67 pmaddubsw xmm0, k0k1 151 palignr m4, m0, 1
68 pmaddubsw xmm2, k2k3 152 pmaddubsw m4, k0k1k4k5
69 punpcklbw xmm6, xmm7 ;G H 153 palignr m1, m0, 5
70 pmaddubsw xmm4, k4k5 154 pmaddubsw m1, k2k3k6k7
71 pmaddubsw xmm6, k6k7 155 mova m7, m3
72 156 palignr m7, m2, 1
73 movdqa xmm1, xmm2 157 pmaddubsw m7, k0k1k4k5
74 paddsw xmm0, xmm6 158 palignr m3, m2, 5
75 pmaxsw xmm2, xmm4 159 pmaddubsw m3, k2k3k6k7
76 pminsw xmm4, xmm1 160 mova m0, m4
77 paddsw xmm0, xmm4 161 mova m5, m1
78 paddsw xmm0, xmm2 162 mova m2, m7
79 163 psrldq m4, 8
80 paddsw xmm0, krd 164 psrldq m1, 8
81 psraw xmm0, 7 165 mova m6, m5
82 packuswb xmm0, xmm0 166 paddsw m0, m1
83 167 mova m1, m3
84 add rsi, rdx 168 psrldq m7, 8
85 add rax, rdx 169 psrldq m3, 8
86 %if %1 170 paddsw m2, m3
87 movd xmm1, [rdi] 171 mova m3, m1
88 pavgb xmm0, xmm1 172 pmaxsw m5, m4
89 %endif 173 pminsw m4, m6
90 movd [rdi], xmm0 174 paddsw m4, m0
91 175 paddsw m4, m5
92 %if ABI_IS_32BIT 176 pmaxsw m1, m7
93 add rdi, DWORD PTR arg(3) ;out_pitch 177 pminsw m7, m3
94 %else 178 paddsw m7, m2
95 add rdi, r8 179 paddsw m7, m1
96 %endif 180
97 dec rcx 181 paddsw m4, krd
98 jnz .loop 182 psraw m4, 7
99 %endm 183 packuswb m4, m4
100 184 paddsw m7, krd
101 %macro VERTx8 1 185 psraw m7, 7
102 mov rdx, arg(5) ;filter ptr 186 packuswb m7, m7
103 mov rsi, arg(0) ;src_ptr 187
104 mov rdi, arg(2) ;output_ptr 188 %ifidn %1, h8_avg
105 mov rcx, 0x0400040 189 movd m0, [dstq]
106 190 pavgb m4, m0
107 movdqa xmm4, [rdx] ;load filters 191 movd m2, [dstq + dstrideq]
108 movq xmm5, rcx 192 pavgb m7, m2
109 packsswb xmm4, xmm4 193 %endif
110 pshuflw xmm0, xmm4, 0b ;k0_k1 194 movd [dstq], m4
111 pshuflw xmm1, xmm4, 01010101b ;k2_k3 195 movd [dstq + dstrideq], m7
112 pshuflw xmm2, xmm4, 10101010b ;k4_k5 196
113 pshuflw xmm3, xmm4, 11111111b ;k6_k7 197 lea srcq, [srcq + sstrideq ]
114 198 prefetcht0 [srcq + 4 * sstrideq - 3]
115 punpcklqdq xmm0, xmm0 199 lea srcq, [srcq + sstrideq ]
116 punpcklqdq xmm1, xmm1 200 lea dstq, [dstq + 2 * dstrideq ]
117 punpcklqdq xmm2, xmm2 201 prefetcht0 [srcq + 2 * sstrideq - 3]
118 punpcklqdq xmm3, xmm3 202
119 203 dec heightq
120 movdqa k0k1, xmm0 204 jnz .loop
121 movdqa k2k3, xmm1 205
122 pshufd xmm5, xmm5, 0 206 ; Do last row if output_height is odd
123 movdqa k4k5, xmm2 207 mov heightq, orig_height
124 movdqa k6k7, xmm3 208 and heightq, 1
125 movdqa krd, xmm5 209 je .done
126 210
127 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line 211 movh m0, [srcq - 3] ; load src
128 212 movh m1, [srcq + 5]
129 %if ABI_IS_32BIT=0 213 punpcklqdq m0, m1
130 movsxd r8, DWORD PTR arg(3) ;out_pitch 214
131 %endif 215 HORIZx4_ROW m0, m1
132 mov rax, rsi 216 %ifidn %1, h8_avg
133 movsxd rcx, DWORD PTR arg(4) ;output_height 217 movd m0, [dstq]
134 add rax, rdx 218 pavgb m1, m0
135 219 %endif
136 lea rbx, [rdx + rdx*4] 220 movd [dstq], m1
137 add rbx, rdx ;pitch * 6 221 .done
222 RET
223 %endm
224
225 %macro HORIZx8_ROW 5
226 mova %2, %1
227 punpcklbw %1, %1
228 punpckhbw %2, %2
229
230 mova %3, %2
231 mova %4, %2
232 mova %5, %2
233
234 palignr %2, %1, 1
235 palignr %3, %1, 5
236 palignr %4, %1, 9
237 palignr %5, %1, 13
238
239 pmaddubsw %2, k0k1
240 pmaddubsw %3, k2k3
241 pmaddubsw %4, k4k5
242 pmaddubsw %5, k6k7
243
244 paddsw %2, %5
245 mova %1, %3
246 pminsw %3, %4
247 pmaxsw %1, %4
248 paddsw %2, %3
249 paddsw %1, %2
250 paddsw %1, krd
251 psraw %1, 7
252 packuswb %1, %1
253 %endm
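HORIZx8_ROW combines the four pmaddubsw partial sums with paddsw, which saturates at +/-32767; the pmaxsw/pminsw pair fixes the order in which the two middle products are accumulated, presumably so the saturating adds behave the same regardless of the sign of the taps. A scalar sketch of that combination order, under that assumption (sat_add16 and combine are hypothetical helpers):

    #include <stdint.h>

    /* Saturating 16-bit add, as paddsw does per lane. */
    static int16_t sat_add16(int32_t a, int32_t b) {
      int32_t s = a + b;
      if (s > 32767) s = 32767;
      if (s < -32768) s = -32768;
      return (int16_t)s;
    }

    /* (A + D) + min(B, C) + max(B, C): equals A+B+C+D when nothing
     * saturates, but accumulates in a sign-independent order. */
    static int16_t combine(int16_t A, int16_t B, int16_t C, int16_t D) {
      int16_t lo = B < C ? B : C, hi = B < C ? C : B;
      return sat_add16(sat_add16(sat_add16(A, D), lo), hi);
    }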
254
255 ;-------------------------------------------------------------------------------
256 %macro SUBPIX_HFILTER8 1
257 cglobal filter_block1d8_%1, 6, 6+(ARCH_X86_64*1), 13, LOCAL_VARS_SIZE, \
258 src, sstride, dst, dstride, height, filter
259 mova m4, [filterq]
260 SETUP_LOCAL_VARS
261 %if ARCH_X86_64
262 %define orig_height r7
263 %else
264 %define orig_height heightmp
265 %endif
266 mov orig_height, heightq
267 shr heightq, 1
138 268
139 .loop: 269 .loop:
140 movq xmm0, [rsi] ;A 270 movh m0, [srcq - 3]
141 movq xmm1, [rsi + rdx] ;B 271 movh m3, [srcq + 5]
142 movq xmm2, [rsi + rdx * 2] ;C 272 movh m4, [srcq + sstrideq - 3]
143 movq xmm3, [rax + rdx * 2] ;D 273 movh m7, [srcq + sstrideq + 5]
144 movq xmm4, [rsi + rdx * 4] ;E 274 punpcklqdq m0, m3
145 movq xmm5, [rax + rdx * 4] ;F 275 mova m1, m0
146 276 punpcklbw m0, m0
147 punpcklbw xmm0, xmm1 ;A B 277 punpckhbw m1, m1
148 punpcklbw xmm2, xmm3 ;C D 278 mova m5, m1
149 punpcklbw xmm4, xmm5 ;E F 279 palignr m5, m0, 13
150 280 pmaddubsw m5, k6k7
151 movq xmm6, [rsi + rbx] ;G 281 mova m2, m1
152 movq xmm7, [rax + rbx] ;H 282 mova m3, m1
153 283 palignr m1, m0, 1
154 pmaddubsw xmm0, k0k1 284 pmaddubsw m1, k0k1
155 pmaddubsw xmm2, k2k3 285 punpcklqdq m4, m7
156 punpcklbw xmm6, xmm7 ;G H 286 mova m6, m4
157 pmaddubsw xmm4, k4k5 287 punpcklbw m4, m4
158 pmaddubsw xmm6, k6k7 288 palignr m2, m0, 5
159 289 punpckhbw m6, m6
160 paddsw xmm0, xmm6 290 palignr m3, m0, 9
161 movdqa xmm1, xmm2 291 mova m7, m6
162 pmaxsw xmm2, xmm4 292 pmaddubsw m2, k2k3
163 pminsw xmm4, xmm1 293 pmaddubsw m3, k4k5
164 paddsw xmm0, xmm4 294
165 paddsw xmm0, xmm2 295 palignr m7, m4, 13
166 296 paddsw m1, m5
167 paddsw xmm0, krd 297 mova m5, m6
168 psraw xmm0, 7 298 mova m0, m2
169 packuswb xmm0, xmm0 299 palignr m5, m4, 5
170 300 pminsw m2, m3
171 add rsi, rdx 301 pmaddubsw m7, k6k7
172 add rax, rdx 302 pmaxsw m3, m0
173 %if %1 303 paddsw m1, m2
174 movq xmm1, [rdi] 304 mova m0, m6
175 pavgb xmm0, xmm1 305 palignr m6, m4, 1
176 %endif 306 pmaddubsw m5, k2k3
177 movq [rdi], xmm0 307 paddsw m1, m3
178 308 pmaddubsw m6, k0k1
179 %if ABI_IS_32BIT 309 palignr m0, m4, 9
180 add rdi, DWORD PTR arg(3) ;out_pitch 310 paddsw m1, krd
181 %else 311 pmaddubsw m0, k4k5
182 add rdi, r8 312 mova m4, m5
183 %endif 313 psraw m1, 7
184 dec rcx 314 pminsw m5, m0
185 jnz .loop 315 paddsw m6, m7
186 %endm 316 packuswb m1, m1
187 317
188 318 paddsw m6, m5
189 %macro VERTx16 1 319 pmaxsw m0, m4
190 mov rdx, arg(5) ;filter ptr 320 paddsw m6, m0
191 mov rsi, arg(0) ;src_ptr 321 paddsw m6, krd
192 mov rdi, arg(2) ;output_ptr 322 psraw m6, 7
193 mov rcx, 0x0400040 323 packuswb m6, m6
194 324
195 movdqa xmm4, [rdx] ;load filters 325 %ifidn %1, h8_avg
196 movq xmm5, rcx 326 movh m0, [dstq]
197 packsswb xmm4, xmm4 327 movh m2, [dstq + dstrideq]
198 pshuflw xmm0, xmm4, 0b ;k0_k1 328 pavgb m1, m0
199 pshuflw xmm1, xmm4, 01010101b ;k2_k3 329 pavgb m6, m2
200 pshuflw xmm2, xmm4, 10101010b ;k4_k5 330 %endif
201 pshuflw xmm3, xmm4, 11111111b ;k6_k7 331 movh [dstq], m1
202 332 movh [dstq + dstrideq], m6
203 punpcklqdq xmm0, xmm0 333
204 punpcklqdq xmm1, xmm1 334 lea srcq, [srcq + sstrideq ]
205 punpcklqdq xmm2, xmm2 335 prefetcht0 [srcq + 4 * sstrideq - 3]
206 punpcklqdq xmm3, xmm3 336 lea srcq, [srcq + sstrideq ]
207 337 lea dstq, [dstq + 2 * dstrideq ]
208 movdqa k0k1, xmm0 338 prefetcht0 [srcq + 2 * sstrideq - 3]
209 movdqa k2k3, xmm1 339 dec heightq
210 pshufd xmm5, xmm5, 0 340 jnz .loop
211 movdqa k4k5, xmm2 341
212 movdqa k6k7, xmm3 342 ;Do last row if output_height is odd
213 movdqa krd, xmm5 343 mov heightq, orig_height
214 344 and heightq, 1
215 movsxd rdx, DWORD PTR arg(1) ;pixels_per_line 345 je .done
216 346
217 %if ABI_IS_32BIT=0 347 movh m0, [srcq - 3]
218 movsxd r8, DWORD PTR arg(3) ;out_pitch 348 movh m3, [srcq + 5]
219 %endif 349 punpcklqdq m0, m3
220 mov rax, rsi 350
221 movsxd rcx, DWORD PTR arg(4) ;output_height 351 HORIZx8_ROW m0, m1, m2, m3, m4
222 add rax, rdx 352
223 353 %ifidn %1, h8_avg
224 lea rbx, [rdx + rdx*4] 354 movh m1, [dstq]
225 add rbx, rdx ;pitch * 6 355 pavgb m0, m1
226 356 %endif
357 movh [dstq], m0
358 .done:
359 RET
360 %endm
361
362 ;-------------------------------------------------------------------------------
363 %macro SUBPIX_HFILTER16 1
364 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*0), 13, LOCAL_VARS_SIZE, \
365 src, sstride, dst, dstride, height, filter
366 mova m4, [filterq]
367 SETUP_LOCAL_VARS
227 .loop: 368 .loop:
228 movq xmm0, [rsi] ;A 369 prefetcht0 [srcq + 2 * sstrideq -3]
229 movq xmm1, [rsi + rdx] ;B 370
230 movq xmm2, [rsi + rdx * 2] ;C 371 movh m0, [srcq - 3]
231 movq xmm3, [rax + rdx * 2] ;D 372 movh m4, [srcq + 5]
232 movq xmm4, [rsi + rdx * 4] ;E 373 movh m6, [srcq + 13]
233 movq xmm5, [rax + rdx * 4] ;F 374 punpcklqdq m0, m4
234 375 mova m7, m0
235 punpcklbw xmm0, xmm1 ;A B 376 punpckhbw m0, m0
236 punpcklbw xmm2, xmm3 ;C D 377 mova m1, m0
237 punpcklbw xmm4, xmm5 ;E F 378 punpcklqdq m4, m6
238 379 mova m3, m0
239 movq xmm6, [rsi + rbx] ;G 380 punpcklbw m7, m7
240 movq xmm7, [rax + rbx] ;H 381
241 382 palignr m3, m7, 13
242 pmaddubsw xmm0, k0k1 383 mova m2, m0
243 pmaddubsw xmm2, k2k3 384 pmaddubsw m3, k6k7
244 punpcklbw xmm6, xmm7 ;G H 385 palignr m0, m7, 1
245 pmaddubsw xmm4, k4k5 386 pmaddubsw m0, k0k1
246 pmaddubsw xmm6, k6k7 387 palignr m1, m7, 5
247 388 pmaddubsw m1, k2k3
248 paddsw xmm0, xmm6 389 palignr m2, m7, 9
249 movdqa xmm1, xmm2 390 pmaddubsw m2, k4k5
250 pmaxsw xmm2, xmm4 391 paddsw m0, m3
251 pminsw xmm4, xmm1 392 mova m3, m4
252 paddsw xmm0, xmm4 393 punpckhbw m4, m4
253 paddsw xmm0, xmm2 394 mova m5, m4
254 395 punpcklbw m3, m3
255 paddsw xmm0, krd 396 mova m7, m4
256 psraw xmm0, 7 397 palignr m5, m3, 5
257 packuswb xmm0, xmm0 398 mova m6, m4
258 %if %1 399 palignr m4, m3, 1
259 movq xmm1, [rdi] 400 pmaddubsw m4, k0k1
260 pavgb xmm0, xmm1 401 pmaddubsw m5, k2k3
261 %endif 402 palignr m6, m3, 9
262 movq [rdi], xmm0 403 pmaddubsw m6, k4k5
263 404 palignr m7, m3, 13
264 movq xmm0, [rsi + 8] ;A 405 pmaddubsw m7, k6k7
265 movq xmm1, [rsi + rdx + 8] ;B 406
266 movq xmm2, [rsi + rdx * 2 + 8] ;C 407 mova m3, m1
267 movq xmm3, [rax + rdx * 2 + 8] ;D 408 pmaxsw m1, m2
268 movq xmm4, [rsi + rdx * 4 + 8] ;E 409 pminsw m2, m3
269 movq xmm5, [rax + rdx * 4 + 8] ;F 410 paddsw m0, m2
270 411 paddsw m0, m1
271 punpcklbw xmm0, xmm1 ;A B 412 paddsw m4, m7
272 punpcklbw xmm2, xmm3 ;C D 413 mova m7, m5
273 punpcklbw xmm4, xmm5 ;E F 414 pmaxsw m5, m6
274 415 pminsw m6, m7
275 movq xmm6, [rsi + rbx + 8] ;G 416 paddsw m4, m6
276 movq xmm7, [rax + rbx + 8] ;H 417 paddsw m4, m5
277 punpcklbw xmm6, xmm7 ;G H 418 paddsw m0, krd
278 419 paddsw m4, krd
279 pmaddubsw xmm0, k0k1 420 psraw m0, 7
280 pmaddubsw xmm2, k2k3 421 psraw m4, 7
281 pmaddubsw xmm4, k4k5 422 packuswb m0, m4
282 pmaddubsw xmm6, k6k7 423 %ifidn %1, h8_avg
283 424 mova m1, [dstq]
284 paddsw xmm0, xmm6 425 pavgb m0, m1
285 movdqa xmm1, xmm2 426 %endif
286 pmaxsw xmm2, xmm4 427 lea srcq, [srcq + sstrideq]
287 pminsw xmm4, xmm1 428 mova [dstq], m0
288 paddsw xmm0, xmm4 429 lea dstq, [dstq + dstrideq]
289 paddsw xmm0, xmm2 430 dec heightq
290 431 jnz .loop
291 paddsw xmm0, krd 432 RET
292 psraw xmm0, 7 433 %endm
293 packuswb xmm0, xmm0 434
294 435 INIT_XMM ssse3
295 add rsi, rdx 436 SUBPIX_HFILTER16 h8
296 add rax, rdx 437 SUBPIX_HFILTER16 h8_avg
297 %if %1 438 SUBPIX_HFILTER8 h8
298 movq xmm1, [rdi+8] 439 SUBPIX_HFILTER8 h8_avg
299 pavgb xmm0, xmm1 440 SUBPIX_HFILTER4 h8
300 %endif 441 SUBPIX_HFILTER4 h8_avg
301 442
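The *_avg instantiations above differ from the plain h8 ones only in the %ifidn blocks: the filtered result is averaged with what is already in dst (pavgb) before the store, as the averaging convolve variants require. A scalar equivalent of that final step (avg_round is a hypothetical name):

    #include <stdint.h>

    /* pavgb: unsigned average with rounding, (a + b + 1) >> 1 per byte. */
    static uint8_t avg_round(uint8_t filtered, uint8_t dst) {
      return (uint8_t)((filtered + dst + 1) >> 1);
    }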
302 movq [rdi+8], xmm0 443 ;-------------------------------------------------------------------------------
303 444 %macro SUBPIX_VFILTER 2
304 %if ABI_IS_32BIT 445 cglobal filter_block1d%2_%1, 6, 6+(ARCH_X86_64*3), 14, LOCAL_VARS_SIZE, \
305 add rdi, DWORD PTR arg(3) ;out_pitch 446 src, sstride, dst, dstride, height, filter
306 %else 447 mova m4, [filterq]
307 add rdi, r8 448 SETUP_LOCAL_VARS
308 %endif 449 %if ARCH_X86_64
309 dec rcx 450 %define src1q r7
310 jnz .loop 451 %define sstride6q r8
311 %endm 452 %define dst_stride dstrideq
312 453 %else
313 ;void vpx_filter_block1d8_v8_ssse3 454 %define src1q filterq
314 ;( 455 %define sstride6q dstrideq
315 ; unsigned char *src_ptr, 456 %define dst_stride dstridemp
316 ; unsigned int src_pitch, 457 %endif
317 ; unsigned char *output_ptr, 458 mov src1q, srcq
318 ; unsigned int out_pitch, 459 add src1q, sstrideq
319 ; unsigned int output_height, 460 lea sstride6q, [sstrideq + sstrideq * 4]
320 ; short *filter 461 add sstride6q, sstrideq ;pitch * 6
321 ;) 462
322 global sym(vpx_filter_block1d4_v8_ssse3) PRIVATE 463 %ifidn %2, 8
323 sym(vpx_filter_block1d4_v8_ssse3): 464 %define movx movh
324 push rbp 465 %else
325 mov rbp, rsp 466 %define movx movd
326 SHADOW_ARGS_TO_STACK 6 467 %endif
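The vertical macros keep two base pointers, srcq and src1q = srcq + sstrideq, plus sstride6q = 6 * sstrideq, since x86 addressing only scales an index by 1, 2, 4 or 8; between the two bases and the 6*pitch register, each of the eight input rows is one addressing mode away. The row offsets the .loop below reads, written out as a sketch (compute_row_offsets is a hypothetical illustration):

    #include <stddef.h>

    /* Rows 0..7 relative to srcq, as addressed in the vertical .loop:
     *   srcq, srcq+s, srcq+2s, src1q+2s, srcq+4s, src1q+4s,
     *   srcq+sstride6q, src1q+sstride6q
     * where src1q = srcq + s and sstride6q = 6*s. */
    static void compute_row_offsets(ptrdiff_t s, ptrdiff_t offsets[8]) {
      const ptrdiff_t src1 = s;       /* src1q - srcq */
      const ptrdiff_t s6   = 6 * s;   /* sstride6q */
      offsets[0] = 0;
      offsets[1] = s;
      offsets[2] = 2 * s;
      offsets[3] = src1 + 2 * s;      /* 3s */
      offsets[4] = 4 * s;
      offsets[5] = src1 + 4 * s;      /* 5s */
      offsets[6] = s6;                /* 6s */
      offsets[7] = src1 + s6;         /* 7s */
    }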
327 SAVE_XMM 7
328 push rsi
329 push rdi
330 push rbx
331 ; end prolog
332
333 ALIGN_STACK 16, rax
334 sub rsp, 16*5
335 %define k0k1 [rsp + 16*0]
336 %define k2k3 [rsp + 16*1]
337 %define k4k5 [rsp + 16*2]
338 %define k6k7 [rsp + 16*3]
339 %define krd [rsp + 16*4]
340
341 VERTx4 0
342
343 add rsp, 16*5
344 pop rsp
345 pop rbx
346 ; begin epilog
347 pop rdi
348 pop rsi
349 RESTORE_XMM
350 UNSHADOW_ARGS
351 pop rbp
352 ret
353
354 ;void vpx_filter_block1d8_v8_ssse3
355 ;(
356 ; unsigned char *src_ptr,
357 ; unsigned int src_pitch,
358 ; unsigned char *output_ptr,
359 ; unsigned int out_pitch,
360 ; unsigned int output_height,
361 ; short *filter
362 ;)
363 global sym(vpx_filter_block1d8_v8_ssse3) PRIVATE
364 sym(vpx_filter_block1d8_v8_ssse3):
365 push rbp
366 mov rbp, rsp
367 SHADOW_ARGS_TO_STACK 6
368 SAVE_XMM 7
369 push rsi
370 push rdi
371 push rbx
372 ; end prolog
373
374 ALIGN_STACK 16, rax
375 sub rsp, 16*5
376 %define k0k1 [rsp + 16*0]
377 %define k2k3 [rsp + 16*1]
378 %define k4k5 [rsp + 16*2]
379 %define k6k7 [rsp + 16*3]
380 %define krd [rsp + 16*4]
381
382 VERTx8 0
383
384 add rsp, 16*5
385 pop rsp
386 pop rbx
387 ; begin epilog
388 pop rdi
389 pop rsi
390 RESTORE_XMM
391 UNSHADOW_ARGS
392 pop rbp
393 ret
394
395 ;void vpx_filter_block1d16_v8_ssse3
396 ;(
397 ; unsigned char *src_ptr,
398 ; unsigned int src_pitch,
399 ; unsigned char *output_ptr,
400 ; unsigned int out_pitch,
401 ; unsigned int output_height,
402 ; short *filter
403 ;)
404 global sym(vpx_filter_block1d16_v8_ssse3) PRIVATE
405 sym(vpx_filter_block1d16_v8_ssse3):
406 push rbp
407 mov rbp, rsp
408 SHADOW_ARGS_TO_STACK 6
409 SAVE_XMM 7
410 push rsi
411 push rdi
412 push rbx
413 ; end prolog
414
415 ALIGN_STACK 16, rax
416 sub rsp, 16*5
417 %define k0k1 [rsp + 16*0]
418 %define k2k3 [rsp + 16*1]
419 %define k4k5 [rsp + 16*2]
420 %define k6k7 [rsp + 16*3]
421 %define krd [rsp + 16*4]
422
423 VERTx16 0
424
425 add rsp, 16*5
426 pop rsp
427 pop rbx
428 ; begin epilog
429 pop rdi
430 pop rsi
431 RESTORE_XMM
432 UNSHADOW_ARGS
433 pop rbp
434 ret
435
436 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
437
438
439 global sym(vpx_filter_block1d4_v8_avg_ssse3) PRIVATE
440 sym(vpx_filter_block1d4_v8_avg_ssse3):
441 push rbp
442 mov rbp, rsp
443 SHADOW_ARGS_TO_STACK 6
444 SAVE_XMM 7
445 push rsi
446 push rdi
447 push rbx
448 ; end prolog
449
450 ALIGN_STACK 16, rax
451 sub rsp, 16*5
452 %define k0k1 [rsp + 16*0]
453 %define k2k3 [rsp + 16*1]
454 %define k4k5 [rsp + 16*2]
455 %define k6k7 [rsp + 16*3]
456 %define krd [rsp + 16*4]
457
458 VERTx4 1
459
460 add rsp, 16*5
461 pop rsp
462 pop rbx
463 ; begin epilog
464 pop rdi
465 pop rsi
466 RESTORE_XMM
467 UNSHADOW_ARGS
468 pop rbp
469 ret
470
471 global sym(vpx_filter_block1d8_v8_avg_ssse3) PRIVATE
472 sym(vpx_filter_block1d8_v8_avg_ssse3):
473 push rbp
474 mov rbp, rsp
475 SHADOW_ARGS_TO_STACK 6
476 SAVE_XMM 7
477 push rsi
478 push rdi
479 push rbx
480 ; end prolog
481
482 ALIGN_STACK 16, rax
483 sub rsp, 16*5
484 %define k0k1 [rsp + 16*0]
485 %define k2k3 [rsp + 16*1]
486 %define k4k5 [rsp + 16*2]
487 %define k6k7 [rsp + 16*3]
488 %define krd [rsp + 16*4]
489
490 VERTx8 1
491
492 add rsp, 16*5
493 pop rsp
494 pop rbx
495 ; begin epilog
496 pop rdi
497 pop rsi
498 RESTORE_XMM
499 UNSHADOW_ARGS
500 pop rbp
501 ret
502
503 global sym(vpx_filter_block1d16_v8_avg_ssse3) PRIVATE
504 sym(vpx_filter_block1d16_v8_avg_ssse3):
505 push rbp
506 mov rbp, rsp
507 SHADOW_ARGS_TO_STACK 6
508 SAVE_XMM 7
509 push rsi
510 push rdi
511 push rbx
512 ; end prolog
513
514 ALIGN_STACK 16, rax
515 sub rsp, 16*5
516 %define k0k1 [rsp + 16*0]
517 %define k2k3 [rsp + 16*1]
518 %define k4k5 [rsp + 16*2]
519 %define k6k7 [rsp + 16*3]
520 %define krd [rsp + 16*4]
521
522 VERTx16 1
523
524 add rsp, 16*5
525 pop rsp
526 pop rbx
527 ; begin epilog
528 pop rdi
529 pop rsi
530 RESTORE_XMM
531 UNSHADOW_ARGS
532 pop rbp
533 ret
534
535 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
536 %macro HORIZx4_ROW 2
537 movdqa %2, %1
538 pshufb %1, [GLOBAL(shuf_t0t1)]
539 pshufb %2, [GLOBAL(shuf_t2t3)]
540 pmaddubsw %1, k0k1k4k5
541 pmaddubsw %2, k2k3k6k7
542
543 movdqa xmm4, %1
544 movdqa xmm5, %2
545 psrldq %1, 8
546 psrldq %2, 8
547 movdqa xmm6, xmm5
548
549 paddsw xmm4, %2
550 pmaxsw xmm5, %1
551 pminsw %1, xmm6
552 paddsw %1, xmm4
553 paddsw %1, xmm5
554
555 paddsw %1, krd
556 psraw %1, 7
557 packuswb %1, %1
558 %endm
559
560 %macro HORIZx4 1
561 mov rdx, arg(5) ;filter ptr
562 mov rsi, arg(0) ;src_ptr
563 mov rdi, arg(2) ;output_ptr
564 mov rcx, 0x0400040
565
566 movdqa xmm4, [rdx] ;load filters
567 movq xmm5, rcx
568 packsswb xmm4, xmm4
569 pshuflw xmm6, xmm4, 0b ;k0_k1
570 pshufhw xmm6, xmm6, 10101010b ;k0_k1_k4_k5
571 pshuflw xmm7, xmm4, 01010101b ;k2_k3
572 pshufhw xmm7, xmm7, 11111111b ;k2_k3_k6_k7
573 pshufd xmm5, xmm5, 0 ;rounding
574
575 movdqa k0k1k4k5, xmm6
576 movdqa k2k3k6k7, xmm7
577 movdqa krd, xmm5
578
579 movsxd rax, dword ptr arg(1) ;src_pixels_per_line
580 movsxd rdx, dword ptr arg(3) ;output_pitch
581 movsxd rcx, dword ptr arg(4) ;output_height
582 shr rcx, 1
583 .loop: 468 .loop:
584 ;Do two rows once 469 movx m0, [srcq ] ;A
585 movq xmm0, [rsi - 3] ;load src 470 movx m1, [srcq + sstrideq ] ;B
586 movq xmm1, [rsi + 5] 471 punpcklbw m0, m1 ;A B
587 movq xmm2, [rsi + rax - 3] 472 movx m2, [srcq + sstrideq * 2 ] ;C
588 movq xmm3, [rsi + rax + 5] 473 pmaddubsw m0, k0k1
589 punpcklqdq xmm0, xmm1 474 mova m6, m2
590 punpcklqdq xmm2, xmm3 475 movx m3, [src1q + sstrideq * 2] ;D
591 476 punpcklbw m2, m3 ;C D
592 HORIZx4_ROW xmm0, xmm1 477 pmaddubsw m2, k2k3
593 HORIZx4_ROW xmm2, xmm3 478 movx m4, [srcq + sstrideq * 4 ] ;E
594 %if %1 479 mova m7, m4
595 movd xmm1, [rdi] 480 movx m5, [src1q + sstrideq * 4] ;F
596 pavgb xmm0, xmm1 481 punpcklbw m4, m5 ;E F
597 movd xmm3, [rdi + rdx] 482 pmaddubsw m4, k4k5
598 pavgb xmm2, xmm3 483 punpcklbw m1, m6 ;A B next iter
599 %endif 484 movx m6, [srcq + sstride6q ] ;G
600 movd [rdi], xmm0 485 punpcklbw m5, m6 ;E F next iter
601 movd [rdi +rdx], xmm2 486 punpcklbw m3, m7 ;C D next iter
602 487 pmaddubsw m5, k4k5
603 lea rsi, [rsi + rax] 488 movx m7, [src1q + sstride6q ] ;H
604 prefetcht0 [rsi + 4 * rax - 3] 489 punpcklbw m6, m7 ;G H
605 lea rsi, [rsi + rax] 490 pmaddubsw m6, k6k7
606 lea rdi, [rdi + 2 * rdx] 491 mova tmp, m2
607 prefetcht0 [rsi + 2 * rax - 3] 492 pmaddubsw m3, k2k3
608 493 pmaddubsw m1, k0k1
609 dec rcx 494 pmaxsw m2, m4
610 jnz .loop 495 paddsw m0, m6
611 496 movx m6, [srcq + sstrideq * 8 ] ;H next iter
612 ; Do last row if output_height is odd 497 punpcklbw m7, m6
613 movsxd rcx, dword ptr arg(4) ;output_height 498 pmaddubsw m7, k6k7
614 and rcx, 1 499 pminsw m4, tmp
615 je .done 500 paddsw m0, m4
616 501 mova m4, m3
617 movq xmm0, [rsi - 3] ; load src 502 paddsw m0, m2
618 movq xmm1, [rsi + 5] 503 pminsw m3, m5
619 punpcklqdq xmm0, xmm1 504 pmaxsw m5, m4
620 505 paddsw m0, krd
621 HORIZx4_ROW xmm0, xmm1 506 psraw m0, 7
622 %if %1 507 paddsw m1, m7
623 movd xmm1, [rdi] 508 packuswb m0, m0
624 pavgb xmm0, xmm1 509
625 %endif 510 paddsw m1, m3
626 movd [rdi], xmm0 511 paddsw m1, m5
627 .done 512 paddsw m1, krd
628 %endm 513 psraw m1, 7
629 514 lea srcq, [srcq + sstrideq * 2 ]
630 %macro HORIZx8_ROW 4 515 lea src1q, [src1q + sstrideq * 2]
631 movdqa %2, %1 516 packuswb m1, m1
632 movdqa %3, %1 517
633 movdqa %4, %1 518 %ifidn %1, v8_avg
634 519 movx m2, [dstq]
635 pshufb %1, [GLOBAL(shuf_t0t1)] 520 pavgb m0, m2
636 pshufb %2, [GLOBAL(shuf_t2t3)] 521 %endif
637 pshufb %3, [GLOBAL(shuf_t4t5)] 522 movx [dstq], m0
638 pshufb %4, [GLOBAL(shuf_t6t7)] 523 add dstq, dst_stride
639 524 %ifidn %1, v8_avg
640 pmaddubsw %1, k0k1 525 movx m3, [dstq]
641 pmaddubsw %2, k2k3 526 pavgb m1, m3
642 pmaddubsw %3, k4k5 527 %endif
643 pmaddubsw %4, k6k7 528 movx [dstq], m1
644 529 add dstq, dst_stride
645 paddsw %1, %4 530 sub heightq, 2
646 movdqa %4, %2 531 cmp heightq, 1
647 pmaxsw %2, %3 532 jg .loop
648 pminsw %3, %4 533
649 paddsw %1, %3 534 cmp heightq, 0
650 paddsw %1, %2 535 je .done
651 536
652 paddsw %1, krd 537 movx m0, [srcq ] ;A
653 psraw %1, 7 538 movx m1, [srcq + sstrideq ] ;B
654 packuswb %1, %1 539 movx m6, [srcq + sstride6q ] ;G
655 %endm 540 punpcklbw m0, m1 ;A B
656 541 movx m7, [rax + sstride6q ] ;H
657 %macro HORIZx8 1 542 pmaddubsw m0, k0k1
658 mov rdx, arg(5) ;filter ptr 543 movx m2, [srcq + sstrideq * 2 ] ;C
659 mov rsi, arg(0) ;src_ptr 544 punpcklbw m6, m7 ;G H
660 mov rdi, arg(2) ;output_ptr 545 movx m3, [rax + sstrideq * 2 ] ;D
661 mov rcx, 0x0400040 546 pmaddubsw m6, k6k7
662 547 movx m4, [srcq + sstrideq * 4 ] ;E
663 movdqa xmm4, [rdx] ;load filters 548 punpcklbw m2, m3 ;C D
664 movq xmm5, rcx 549 movx m5, [src1q + sstrideq * 4] ;F
665 packsswb xmm4, xmm4 550 punpcklbw m4, m5 ;E F
666 pshuflw xmm0, xmm4, 0b ;k0_k1 551 pmaddubsw m2, k2k3
667 pshuflw xmm1, xmm4, 01010101b ;k2_k3 552 pmaddubsw m4, k4k5
668 pshuflw xmm2, xmm4, 10101010b ;k4_k5 553 paddsw m0, m6
669 pshuflw xmm3, xmm4, 11111111b ;k6_k7 554 mova m1, m2
670 555 pmaxsw m2, m4
671 punpcklqdq xmm0, xmm0 556 pminsw m4, m1
672 punpcklqdq xmm1, xmm1 557 paddsw m0, m4
673 punpcklqdq xmm2, xmm2 558 paddsw m0, m2
674 punpcklqdq xmm3, xmm3 559 paddsw m0, krd
675 560 psraw m0, 7
676 movdqa k0k1, xmm0 561 packuswb m0, m0
677 movdqa k2k3, xmm1 562 %ifidn %1, v8_avg
678 pshufd xmm5, xmm5, 0 563 movx m1, [dstq]
679 movdqa k4k5, xmm2 564 pavgb m0, m1
680 movdqa k6k7, xmm3 565 %endif
681 movdqa krd, xmm5 566 movx [dstq], m0
682 567 .done:
683 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 568 RET
684 movsxd rdx, dword ptr arg(3) ;output_pitch 569 %endm
685 movsxd rcx, dword ptr arg(4) ;output_height 570
686 shr rcx, 1 571 ;-------------------------------------------------------------------------------
572 %macro SUBPIX_VFILTER16 1
573 cglobal filter_block1d16_%1, 6, 6+(ARCH_X86_64*2), 13, LOCAL_VARS_SIZE, \
574 src, sstride, dst, dstride, height, filter
575
576 mova m4, [filterq]
577 SETUP_LOCAL_VARS
578 %if ARCH_X86_64
579 %define src1q r7
580 %define sstride6q r8
581 %define dst_stride dstrideq
582 %else
583 %define src1q filterq
584 %define sstride6q dstrideq
585 %define dst_stride dstridemp
586 %endif
587 mov src1q, srcq
588 add src1q, sstrideq
589 lea sstride6q, [sstrideq + sstrideq * 4]
590 add sstride6q, sstrideq ;pitch * 6
687 591
688 .loop: 592 .loop:
689 movq xmm0, [rsi - 3] ;load src 593 movh m0, [srcq ] ;A
690 movq xmm3, [rsi + 5] 594 movh m1, [srcq + sstrideq ] ;B
691 movq xmm4, [rsi + rax - 3] 595 movh m2, [srcq + sstrideq * 2 ] ;C
692 movq xmm7, [rsi + rax + 5] 596 movh m3, [src1q + sstrideq * 2] ;D
693 punpcklqdq xmm0, xmm3 597 movh m4, [srcq + sstrideq * 4 ] ;E
694 punpcklqdq xmm4, xmm7 598 movh m5, [src1q + sstrideq * 4] ;F
695 599
696 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 600 punpcklbw m0, m1 ;A B
697 HORIZx8_ROW xmm4, xmm5, xmm6, xmm7 601 movh m6, [srcq + sstride6q] ;G
698 %if %1 602 punpcklbw m2, m3 ;C D
699 movq xmm1, [rdi] 603 movh m7, [src1q + sstride6q] ;H
700 movq xmm2, [rdi + rdx] 604 punpcklbw m4, m5 ;E F
701 pavgb xmm0, xmm1 605 pmaddubsw m0, k0k1
702 pavgb xmm4, xmm2 606 movh m3, [srcq + 8] ;A
703 %endif 607 pmaddubsw m2, k2k3
704 movq [rdi], xmm0 608 punpcklbw m6, m7 ;G H
705 movq [rdi + rdx], xmm4 609 movh m5, [srcq + sstrideq + 8] ;B
706 610 pmaddubsw m4, k4k5
707 lea rsi, [rsi + rax] 611 punpcklbw m3, m5 ;A B
708 prefetcht0 [rsi + 4 * rax - 3] 612 movh m7, [srcq + sstrideq * 2 + 8] ;C
709 lea rsi, [rsi + rax] 613 pmaddubsw m6, k6k7
710 lea rdi, [rdi + 2 * rdx] 614 mova m1, m2
711 prefetcht0 [rsi + 2 * rax - 3] 615 movh m5, [src1q + sstrideq * 2 + 8] ;D
712 dec rcx 616 pmaxsw m2, m4
713 jnz .loop 617 punpcklbw m7, m5 ;C D
714 618 pminsw m4, m1
715 ;Do last row if output_height is odd 619 paddsw m0, m6
716 movsxd rcx, dword ptr arg(4) ;output_height 620 pmaddubsw m3, k0k1
717 and rcx, 1 621 movh m1, [srcq + sstrideq * 4 + 8] ;E
718 je .done 622 paddsw m0, m4
719 623 pmaddubsw m7, k2k3
720 movq xmm0, [rsi - 3] 624 movh m6, [src1q + sstrideq * 4 + 8] ;F
721 movq xmm3, [rsi + 5] 625 punpcklbw m1, m6 ;E F
722 punpcklqdq xmm0, xmm3 626 paddsw m0, m2
723 627 paddsw m0, krd
724 HORIZx8_ROW xmm0, xmm1, xmm2, xmm3 628 movh m2, [srcq + sstride6q + 8] ;G
725 %if %1 629 pmaddubsw m1, k4k5
726 movq xmm1, [rdi] 630 movh m5, [src1q + sstride6q + 8] ;H
727 pavgb xmm0, xmm1 631 psraw m0, 7
728 %endif 632 punpcklbw m2, m5 ;G H
729 movq [rdi], xmm0 633 packuswb m0, m0
730 .done 634 pmaddubsw m2, k6k7
731 %endm 635 %ifidn %1, v8_avg
732 636 movh m4, [dstq]
733 %macro HORIZx16 1 637 pavgb m0, m4
734 mov rdx, arg(5) ;filter ptr 638 %endif
735 mov rsi, arg(0) ;src_ptr 639 movh [dstq], m0
736 mov rdi, arg(2) ;output_ptr 640 mova m6, m7
737 mov rcx, 0x0400040 641 pmaxsw m7, m1
738 642 pminsw m1, m6
739 movdqa xmm4, [rdx] ;load filters 643 paddsw m3, m2
740 movq xmm5, rcx 644 paddsw m3, m1
741 packsswb xmm4, xmm4 645 paddsw m3, m7
742 pshuflw xmm0, xmm4, 0b ;k0_k1 646 paddsw m3, krd
743 pshuflw xmm1, xmm4, 01010101b ;k2_k3 647 psraw m3, 7
744 pshuflw xmm2, xmm4, 10101010b ;k4_k5 648 packuswb m3, m3
745 pshuflw xmm3, xmm4, 11111111b ;k6_k7 649
746 650 add srcq, sstrideq
747 punpcklqdq xmm0, xmm0 651 add src1q, sstrideq
748 punpcklqdq xmm1, xmm1 652 %ifidn %1, v8_avg
749 punpcklqdq xmm2, xmm2 653 movh m1, [dstq + 8]
750 punpcklqdq xmm3, xmm3 654 pavgb m3, m1
751 655 %endif
752 movdqa k0k1, xmm0 656 movh [dstq + 8], m3
753 movdqa k2k3, xmm1 657 add dstq, dst_stride
754 pshufd xmm5, xmm5, 0 658 dec heightq
755 movdqa k4k5, xmm2 659 jnz .loop
756 movdqa k6k7, xmm3 660 RET
757 movdqa krd, xmm5 661 %endm
758 662
759 movsxd rax, dword ptr arg(1) ;src_pixels_per_line 663 INIT_XMM ssse3
760 movsxd rdx, dword ptr arg(3) ;output_pitch 664 SUBPIX_VFILTER16 v8
761 movsxd rcx, dword ptr arg(4) ;output_height 665 SUBPIX_VFILTER16 v8_avg
762 666 SUBPIX_VFILTER v8, 8
763 .loop: 667 SUBPIX_VFILTER v8_avg, 8
764 prefetcht0 [rsi + 2 * rax -3] 668 SUBPIX_VFILTER v8, 4
765 669 SUBPIX_VFILTER v8_avg, 4
766 movq xmm0, [rsi - 3] ;load src data
767 movq xmm4, [rsi + 5]
768 movq xmm6, [rsi + 13]
769 punpcklqdq xmm0, xmm4
770 punpcklqdq xmm4, xmm6
771
772 movdqa xmm7, xmm0
773
774 punpcklbw xmm7, xmm7
775 punpckhbw xmm0, xmm0
776 movdqa xmm1, xmm0
777 movdqa xmm2, xmm0
778 movdqa xmm3, xmm0
779
780 palignr xmm0, xmm7, 1
781 palignr xmm1, xmm7, 5
782 pmaddubsw xmm0, k0k1
783 palignr xmm2, xmm7, 9
784 pmaddubsw xmm1, k2k3
785 palignr xmm3, xmm7, 13
786
787 pmaddubsw xmm2, k4k5
788 pmaddubsw xmm3, k6k7
789 paddsw xmm0, xmm3
790
791 movdqa xmm3, xmm4
792 punpcklbw xmm3, xmm3
793 punpckhbw xmm4, xmm4
794
795 movdqa xmm5, xmm4
796 movdqa xmm6, xmm4
797 movdqa xmm7, xmm4
798
799 palignr xmm4, xmm3, 1
800 palignr xmm5, xmm3, 5
801 palignr xmm6, xmm3, 9
802 palignr xmm7, xmm3, 13
803
804 movdqa xmm3, xmm1
805 pmaddubsw xmm4, k0k1
806 pmaxsw xmm1, xmm2
807 pmaddubsw xmm5, k2k3
808 pminsw xmm2, xmm3
809 pmaddubsw xmm6, k4k5
810 paddsw xmm0, xmm2
811 pmaddubsw xmm7, k6k7
812 paddsw xmm0, xmm1
813
814 paddsw xmm4, xmm7
815 movdqa xmm7, xmm5
816 pmaxsw xmm5, xmm6
817 pminsw xmm6, xmm7
818 paddsw xmm4, xmm6
819 paddsw xmm4, xmm5
820
821 paddsw xmm0, krd
822 paddsw xmm4, krd
823 psraw xmm0, 7
824 psraw xmm4, 7
825 packuswb xmm0, xmm0
826 packuswb xmm4, xmm4
827 punpcklqdq xmm0, xmm4
828 %if %1
829 movdqa xmm1, [rdi]
830 pavgb xmm0, xmm1
831 %endif
832
833 lea rsi, [rsi + rax]
834 movdqa [rdi], xmm0
835
836 lea rdi, [rdi + rdx]
837 dec rcx
838 jnz .loop
839 %endm
840
841 ;void vpx_filter_block1d4_h8_ssse3
842 ;(
843 ; unsigned char *src_ptr,
844 ; unsigned int src_pixels_per_line,
845 ; unsigned char *output_ptr,
846 ; unsigned int output_pitch,
847 ; unsigned int output_height,
848 ; short *filter
849 ;)
850 global sym(vpx_filter_block1d4_h8_ssse3) PRIVATE
851 sym(vpx_filter_block1d4_h8_ssse3):
852 push rbp
853 mov rbp, rsp
854 SHADOW_ARGS_TO_STACK 6
855 SAVE_XMM 7
856 GET_GOT rbx
857 push rsi
858 push rdi
859 ; end prolog
860
861 ALIGN_STACK 16, rax
862 sub rsp, 16 * 3
863 %define k0k1k4k5 [rsp + 16 * 0]
864 %define k2k3k6k7 [rsp + 16 * 1]
865 %define krd [rsp + 16 * 2]
866
867 HORIZx4 0
868
869 add rsp, 16 * 3
870 pop rsp
871 ; begin epilog
872 pop rdi
873 pop rsi
874 RESTORE_GOT
875 RESTORE_XMM
876 UNSHADOW_ARGS
877 pop rbp
878 ret
879
880 ;void vpx_filter_block1d8_h8_ssse3
881 ;(
882 ; unsigned char *src_ptr,
883 ; unsigned int src_pixels_per_line,
884 ; unsigned char *output_ptr,
885 ; unsigned int output_pitch,
886 ; unsigned int output_height,
887 ; short *filter
888 ;)
889 global sym(vpx_filter_block1d8_h8_ssse3) PRIVATE
890 sym(vpx_filter_block1d8_h8_ssse3):
891 push rbp
892 mov rbp, rsp
893 SHADOW_ARGS_TO_STACK 6
894 SAVE_XMM 7
895 GET_GOT rbx
896 push rsi
897 push rdi
898 ; end prolog
899
900 ALIGN_STACK 16, rax
901 sub rsp, 16*5
902 %define k0k1 [rsp + 16*0]
903 %define k2k3 [rsp + 16*1]
904 %define k4k5 [rsp + 16*2]
905 %define k6k7 [rsp + 16*3]
906 %define krd [rsp + 16*4]
907
908 HORIZx8 0
909
910 add rsp, 16*5
911 pop rsp
912
913 ; begin epilog
914 pop rdi
915 pop rsi
916 RESTORE_GOT
917 RESTORE_XMM
918 UNSHADOW_ARGS
919 pop rbp
920 ret
921
922 ;void vpx_filter_block1d16_h8_ssse3
923 ;(
924 ; unsigned char *src_ptr,
925 ; unsigned int src_pixels_per_line,
926 ; unsigned char *output_ptr,
927 ; unsigned int output_pitch,
928 ; unsigned int output_height,
929 ; short *filter
930 ;)
931 global sym(vpx_filter_block1d16_h8_ssse3) PRIVATE
932 sym(vpx_filter_block1d16_h8_ssse3):
933 push rbp
934 mov rbp, rsp
935 SHADOW_ARGS_TO_STACK 6
936 SAVE_XMM 7
937 GET_GOT rbx
938 push rsi
939 push rdi
940 ; end prolog
941
942 ALIGN_STACK 16, rax
943 sub rsp, 16*5
944 %define k0k1 [rsp + 16*0]
945 %define k2k3 [rsp + 16*1]
946 %define k4k5 [rsp + 16*2]
947 %define k6k7 [rsp + 16*3]
948 %define krd [rsp + 16*4]
949
950 HORIZx16 0
951
952 add rsp, 16*5
953 pop rsp
954
955 ; begin epilog
956 pop rdi
957 pop rsi
958 RESTORE_GOT
959 RESTORE_XMM
960 UNSHADOW_ARGS
961 pop rbp
962 ret
963
964 global sym(vpx_filter_block1d4_h8_avg_ssse3) PRIVATE
965 sym(vpx_filter_block1d4_h8_avg_ssse3):
966 push rbp
967 mov rbp, rsp
968 SHADOW_ARGS_TO_STACK 6
969 SAVE_XMM 7
970 GET_GOT rbx
971 push rsi
972 push rdi
973 ; end prolog
974
975 ALIGN_STACK 16, rax
976 sub rsp, 16 * 3
977 %define k0k1k4k5 [rsp + 16 * 0]
978 %define k2k3k6k7 [rsp + 16 * 1]
979 %define krd [rsp + 16 * 2]
980
981 HORIZx4 1
982
983 add rsp, 16 * 3
984 pop rsp
985 ; begin epilog
986 pop rdi
987 pop rsi
988 RESTORE_GOT
989 RESTORE_XMM
990 UNSHADOW_ARGS
991 pop rbp
992 ret
993
994 global sym(vpx_filter_block1d8_h8_avg_ssse3) PRIVATE
995 sym(vpx_filter_block1d8_h8_avg_ssse3):
996 push rbp
997 mov rbp, rsp
998 SHADOW_ARGS_TO_STACK 6
999 SAVE_XMM 7
1000 GET_GOT rbx
1001 push rsi
1002 push rdi
1003 ; end prolog
1004
1005 ALIGN_STACK 16, rax
1006 sub rsp, 16*5
1007 %define k0k1 [rsp + 16*0]
1008 %define k2k3 [rsp + 16*1]
1009 %define k4k5 [rsp + 16*2]
1010 %define k6k7 [rsp + 16*3]
1011 %define krd [rsp + 16*4]
1012
1013 HORIZx8 1
1014
1015 add rsp, 16*5
1016 pop rsp
1017
1018 ; begin epilog
1019 pop rdi
1020 pop rsi
1021 RESTORE_GOT
1022 RESTORE_XMM
1023 UNSHADOW_ARGS
1024 pop rbp
1025 ret
1026
1027 global sym(vpx_filter_block1d16_h8_avg_ssse3) PRIVATE
1028 sym(vpx_filter_block1d16_h8_avg_ssse3):
1029 push rbp
1030 mov rbp, rsp
1031 SHADOW_ARGS_TO_STACK 6
1032 SAVE_XMM 7
1033 GET_GOT rbx
1034 push rsi
1035 push rdi
1036 ; end prolog
1037
1038 ALIGN_STACK 16, rax
1039 sub rsp, 16*5
1040 %define k0k1 [rsp + 16*0]
1041 %define k2k3 [rsp + 16*1]
1042 %define k4k5 [rsp + 16*2]
1043 %define k6k7 [rsp + 16*3]
1044 %define krd [rsp + 16*4]
1045
1046 HORIZx16 1
1047
1048 add rsp, 16*5
1049 pop rsp
1050
1051 ; begin epilog
1052 pop rdi
1053 pop rsi
1054 RESTORE_GOT
1055 RESTORE_XMM
1056 UNSHADOW_ARGS
1057 pop rbp
1058 ret
1059 SECTION_RODATA
1060 align 16
1061 shuf_t0t1:
1062 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
1063 align 16
1064 shuf_t2t3:
1065 db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
1066 align 16
1067 shuf_t4t5:
1068 db 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
1069 align 16
1070 shuf_t6t7:
1071 db 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14