;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


; ABI-abstraction macros and defines used throughout this file
; (sym, arg, SHADOW_ARGS_TO_STACK, SAVE_XMM, ...).
%include "vpx_ports/x86_abi_support.asm"
13 | |
;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;   Accumulates SADs for two 16-byte source rows against the reference
;   rows taken at byte offsets +0, +1 and +2.  Reference data is read
;   with unaligned loads (lddqu); source data is read with movdqa.
;
;   %1 != 0 : first invocation - the row-0 SADs initialise xmm5/6/7.
;   %1 == 0 : the row-0 SADs are added to the running xmm5/6/7.
;
; In:    rsi = source row,    rax = source stride
;        rdi = reference row, rdx = reference stride
; Out:   xmm5 / xmm6 / xmm7 = running psadbw sums for offsets +0/+1/+2
;        rsi and rdi advanced by two rows
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
        ; Row 0: the source load is the same for both variants.
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        ; First pair of rows: SAD straight into the accumulators.
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; Later pairs: SAD into temporaries, then accumulate.
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; Row 1: always accumulated.
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        ; Advance both pointers by two rows (lea leaves the flags alone).
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
54 | |
;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;   Same job as PROCESS_16X2X3, but the reference rows are fetched with
;   two aligned 16-byte loads ([rdi] and [rdi+16]) and the three
;   candidate windows are extracted with palignr at byte shifts
;   offset, offset+1 and offset+2.
;
;   %1 != 0 : first invocation - the row-0 SADs initialise xmm5/6/7.
;   %1 == 0 : the row-0 SADs are added to the running xmm5/6/7.
;   %2      : byte shift of the first window inside the 32 loaded bytes
;             (the callers in this file pass 0..14).
;
; In:    rsi = source row,    rax = source stride
;        rdi = reference row (read with movdqa), rdx = reference stride
; Out:   xmm5 / xmm6 / xmm7 = running psadbw sums for the three windows
;        rsi and rdi advanced by two rows
; Clobb: xmm0-xmm4
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
        ; Row 0: source row and low reference half are common to both variants.
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
%if %1
        ; First pair of rows: build the windows directly in the accumulators.
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        ; Later pairs: build the windows in temporaries, then accumulate.
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; Row 1: always accumulated.
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        ; Advance both pointers by two rows (lea leaves the flags alone).
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
116 | |
;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, label_prefix
;   Emits the jump-table target <label_prefix>_aligned_by_<offset> and
;   the code that processes 16 rows (8 x PROCESS_16X2X3_OFFSET) for that
;   offset, then jumps to <label_prefix>_store_off.
;
;   %1 : offset subtracted from rdi and used as the palignr shift
;   %2 : label prefix of the enclosing function
;
; The functions in this file reach this label through a table indexed
; by (ref_ptr & 0xf), so subtracting %1 rewinds rdi by that amount
; before the movdqa loads in PROCESS_16X2X3_OFFSET.
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1 initialise xmm5/6/7
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-15 accumulate
%endrep

        jmp             %2_store_off

%endmacro
134 | |
;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, label_prefix
;   Emits the jump-table target <label_prefix>_aligned_by_<offset> and
;   the code that processes 8 rows (4 x PROCESS_16X2X3_OFFSET) for that
;   offset, then jumps to <label_prefix>_store_off.
;
;   %1 : offset subtracted from rdi and used as the palignr shift
;   %2 : label prefix of the enclosing function
;
; The functions in this file reach this label through a table indexed
; by (ref_ptr & 0xf), so subtracting %1 rewinds rdi by that amount
; before the movdqa loads in PROCESS_16X2X3_OFFSET.
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1             ; rows 0-1 initialise xmm5/6/7
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1             ; rows 2-7 accumulate
%endrep

        jmp             %2_store_off

%endmacro
148 | |
;-----------------------------------------------------------------------
; void vp8_sad16x16x3_ssse3(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride,
;     int  *results)
;
; Computes three 16x16 sums of absolute differences between the block
; at src_ptr and the reference blocks starting at ref_ptr, ref_ptr+1
; and ref_ptr+2, and stores them as 32-bit values in results[0..2].
;
; The low four bits of ref_ptr select one of sixteen code paths through
; a table of 32-bit offsets: paths 0..14 use aligned loads + palignr,
; path 15 uses unaligned loads (PROCESS_16X2X3).
;
; Source rows are read with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 0xf : index into the jump table below
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives inline in the code, so hop over it.
        jmp .vp8_sad16x16x3_ssse3_skiptable
.vp8_sad16x16x3_ssse3_jumptable:
        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_skiptable:

        ; call pushes the address of do_jump; pop retrieves it so every
        ; table entry can be resolved relative to the running code.
        call .vp8_sad16x16x3_ssse3_do_jump
.vp8_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute address of the selected path

        ; rax/rdx are free again: load the strides the row macros expect.
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Paths for offsets 0..14; each ends with a jump to store_off.
        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3

        ; Offset 15: unaligned loads; falls through into store_off.
.vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
%rep 7
        PROCESS_16X2X3 0
%endrep

.vp8_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds one partial sum per 64-bit half:
        ; add the halves together and store the low 32 bits.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
261 | |
;-----------------------------------------------------------------------
; void vp8_sad16x8x3_ssse3(
;     unsigned char *src_ptr,
;     int  src_stride,
;     unsigned char *ref_ptr,
;     int  ref_stride,
;     int  *results)
;
; Computes three 16x8 sums of absolute differences between the block
; at src_ptr and the reference blocks starting at ref_ptr, ref_ptr+1
; and ref_ptr+2, and stores them as 32-bit values in results[0..2].
;
; The low four bits of ref_ptr select one of sixteen code paths through
; a table of 32-bit offsets: paths 0..14 use aligned loads + palignr,
; path 15 uses unaligned loads (PROCESS_16X2X3).
;
; Source rows are read with movdqa.
;-----------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        ; rdx = ref_ptr & 0xf : index into the jump table below
        mov             rdx,        0xf
        and             rdx,        rdi

        ; The table lives inline in the code, so hop over it.
        jmp .vp8_sad16x8x3_ssse3_skiptable
.vp8_sad16x8x3_ssse3_jumptable:
        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_skiptable:

        ; call pushes the address of do_jump; pop retrieves it so every
        ; table entry can be resolved relative to the running code.
        call .vp8_sad16x8x3_ssse3_do_jump
.vp8_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute address of the selected path

        ; rax/rdx are free again: load the strides the row macros expect.
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        ; Paths for offsets 0..14; each ends with a jump to store_off.
        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3

        ; Offset 15: unaligned loads; falls through into store_off.
.vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
%rep 3
        PROCESS_16X2X3 0
%endrep

.vp8_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; Each accumulator holds one partial sum per 64-bit half:
        ; add the halves together and store the low 32 bits.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0            ; results[0]

        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0            ; results[1]

        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0            ; results[2]

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret