OLD | NEW |
| (Empty) |
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 %include "vpx_ports/x86_abi_support.asm" | |
12 | |
;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue macro. Binds the symbolic operand names
;     src_ptr, src_stride, ref_ptr, ref_stride, end_ptr, ret_var,
;     result_ptr / max_err / height
; to the registers or stack slots that hold them for the ABI selected at
; assembly time:
;
;   ABI_IS_32BIT       builds an rbp frame, saves rsi/rdi/rbx, and loads the
;                      first four arguments from the stack via arg(n).
;   LIBVPX_YASM_WIN64  arguments are already in rcx, rdx, r8, r9; the fifth
;                      is addressed on the stack.
;   otherwise          arguments are already in rdi, rsi, rdx, rcx, r8
;                      (System V argument order).
;
; result_ptr, max_err and height are three names for the same fifth argument;
; a routine picks whichever matches the meaning of its own fifth parameter.
;
; On every path both strides end up sign-extended from 32 to 64 bits, since
; they are used as 64-bit index registers in address expressions.
;
; Takes no macro parameters. Pair with STACK_FRAME_DESTROY_X3.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    ; SAVE_XMM and xmm_stack_space are not defined in this file; presumably
    ; SAVE_XMM spills xmm6-xmm7 below rsp and sets xmm_stack_space to the
    ; size of that area -- confirm in vpx_ports/x86_abi_support.asm.
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument: above the spill area, the return address (+8) and the
    ; four-slot register home area (+4*8).
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; The strides are C ints passed in 64-bit registers; only the low 32
    ; bits are defined. Widen them so a negative stride indexes correctly.
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    %define     height      r8

    ; The strides are C ints passed in 64-bit registers; only the low 32
    ; bits are defined. Widen them so a negative stride indexes correctly.
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro
61 | |
;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue macro matching STACK_FRAME_CREATE_X3. Redefines every symbolic
; operand name to empty, undoes the ABI-specific prologue work, and emits
; the function's ret.
;
;   ABI_IS_32BIT       pops rbx, rdi, rsi, rbp (reverse of the prologue's
;                      pushes).
;   LIBVPX_YASM_WIN64  invokes RESTORE_XMM (defined outside this file;
;                      presumably reloads what SAVE_XMM spilled -- confirm
;                      in vpx_ports/x86_abi_support.asm).
;   otherwise          nothing to restore.
;
; Takes no macro parameters.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; Blank the aliases so they no longer expand to this frame's operands.
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    RESTORE_XMM
%endif
    ret
%endmacro
85 | |
;-----------------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Handles two 16-byte rows. For each row the source bytes are compared with
; psadbw against the reference bytes at offsets +0, +1 and +2, and the
; results are accumulated into xmm5, xmm6 and xmm7 respectively.
;
;   %1  stage: 0 = first call  (first row initialises xmm5-xmm7),
;              1 = middle call (accumulates),
;              2 = final call  (accumulates, pointers are NOT advanced).
;   %2  register holding the source pointer    (advanced by 2*%4 if %1 is 0/1)
;   %3  register holding the reference pointer (advanced by 2*%5 if %1 is 0/1)
;   %4  source stride register
;   %5  reference stride register
;
; Source rows are read with movdqa, so [%2] and [%2+%4] must be 16-byte
; aligned; reference rows use lddqu and may be unaligned.
; Clobbers xmm0-xmm3 (only xmm0 for the first row when %1 == 0).
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3 5
        ; ---- first row ----
        movdqa      xmm0,       XMMWORD PTR [%2]
%if %1==0
        ; No running totals yet: compute straight into the accumulators.
        lddqu       xmm5,       XMMWORD PTR [%3]
        lddqu       xmm6,       XMMWORD PTR [%3+1]
        lddqu       xmm7,       XMMWORD PTR [%3+2]

        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%else
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%3+1]
        lddqu       xmm3,       XMMWORD PTR [%3+2]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endif

        ; ---- second row ----
        movdqa      xmm0,       XMMWORD PTR [%2+%4]
        lddqu       xmm1,       XMMWORD PTR [%3+%5]
        lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; More rows follow: step both pointers past the pair just loaded.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endmacro
128 | |
;-----------------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; Handles two 8-byte rows using the MMX registers. For each row the source
; bytes are compared with psadbw against the reference bytes at offsets
; +0, +1 and +2, and the results are accumulated into mm5, mm6 and mm7.
;
;   %1  stage: 0 = first call  (first row initialises mm5-mm7),
;              1 = middle call (accumulates),
;              2 = final call  (accumulates, pointers are NOT advanced).
;   %2  register holding the source pointer    (advanced by 2*%4 if %1 is 0/1)
;   %3  register holding the reference pointer (advanced by 2*%5 if %1 is 0/1)
;   %4  source stride register
;   %5  reference stride register
;
; Clobbers mm0-mm3 (only mm0 for the first row when %1 == 0).
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X3 5
        ; ---- first row ----
        movq        mm0,        QWORD PTR [%2]
%if %1==0
        ; No running totals yet: compute straight into the accumulators.
        movq        mm5,        QWORD PTR [%3]
        movq        mm6,        QWORD PTR [%3+1]
        movq        mm7,        QWORD PTR [%3+2]

        psadbw      mm5,        mm0
        psadbw      mm6,        mm0
        psadbw      mm7,        mm0
%else
        movq        mm1,        QWORD PTR [%3]
        movq        mm2,        QWORD PTR [%3+1]
        movq        mm3,        QWORD PTR [%3+2]

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%endif

        ; ---- second row ----
        movq        mm0,        QWORD PTR [%2+%4]
        movq        mm1,        QWORD PTR [%3+%5]
        movq        mm2,        QWORD PTR [%3+%5+1]
        movq        mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; More rows follow: step both pointers past the pair just loaded.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%endmacro
171 | |
;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute byte differences between a 16-wide, 16-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, storing the three
; totals to results[0], results[1] and results[2].
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows, two per invocation: one opening call, six accumulating
        ; calls, and one closing call that leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; The pointer and stride registers are dead from here on, so rcx can
        ; be reused for the output pointer on every ABI.
        mov         rcx,        result_ptr

        ; Each accumulator holds one partial sum per 64-bit half. Fold the
        ; high half onto the low half and store the low dword.

        ; results[0] <- offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8
        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1] <- offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8
        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2] <- offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8
        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
213 | |
;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute byte differences between a 16-wide, 8-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, storing the three
; totals to results[0], results[1] and results[2].
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows, two per invocation: one opening call, two accumulating
        ; calls, and one closing call that leaves the pointers alone.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; The pointer and stride registers are dead from here on, so rcx can
        ; be reused for the output pointer on every ABI.
        mov         rcx,        result_ptr

        ; Each accumulator holds one partial sum per 64-bit half. Fold the
        ; high half onto the low half and store the low dword.

        ; results[0] <- offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8
        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1] <- offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8
        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2] <- offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8
        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
251 | |
;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute byte differences between an 8-wide, 16-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, storing the three
; totals to results[0], results[1] and results[2].
; Uses the MMX registers and does not execute emms itself.
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 16 rows, two per invocation: one opening call, six accumulating
        ; calls, and one closing call that leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; The pointer and stride registers are dead from here on, so rcx can
        ; be reused for the output pointer on every ABI.
        mov         rcx,        result_ptr

        ; Pack the +0 total (low dword) and +1 total (high dword) into mm5
        ; so results[0] and results[1] go out in a single 8-byte store.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5
        movd        [rcx+8],    mm7             ; results[2] <- offset +2

    STACK_FRAME_DESTROY_X3
280 | |
;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute byte differences between an 8-wide, 8-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, storing the three
; totals to results[0], results[1] and results[2].
; Uses the MMX registers and does not execute emms itself.
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 rows, two per invocation: one opening call, two accumulating
        ; calls, and one closing call that leaves the pointers alone.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; The pointer and stride registers are dead from here on, so rcx can
        ; be reused for the output pointer on every ABI.
        mov         rcx,        result_ptr

        ; Pack the +0 total (low dword) and +1 total (high dword) into mm5
        ; so results[0] and results[1] go out in a single 8-byte store.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5
        movd        [rcx+8],    mm7             ; results[2] <- offset +2

    STACK_FRAME_DESTROY_X3
305 | |
;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums absolute byte differences between a 4-wide, 4-row source block and
; the reference block at ref_ptr, ref_ptr+1 and ref_ptr+2, storing the three
; totals to results[0], results[1] and results[2].
;
; Two 4-byte rows are interleaved into one 8-byte MMX register so that a
; single psadbw covers both rows; source and reference are interleaved the
; same way, so byte pairing is preserved.
; Uses the MMX registers and does not execute emms itself.
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---------------- rows 0 and 1 ----------------
        ; mm0 = source rows 0/1 interleaved
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw   mm0,        mm2

        ; mm1 = partial total for offset +0
        movd        mm1,        DWORD PTR [ref_ptr]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw   mm1,        mm3
        psadbw      mm1,        mm0

        ; mm4 = partial total for offset +1
        movd        mm4,        DWORD PTR [ref_ptr+1]
        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw   mm4,        mm2
        psadbw      mm4,        mm0

        ; mm5 = partial total for offset +2
        movd        mm5,        DWORD PTR [ref_ptr+2]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw   mm5,        mm3
        psadbw      mm5,        mm0

        ; Step both pointers down two rows.
        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---------------- rows 2 and 3 ----------------
        ; mm0 = source rows 2/3 interleaved
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw   mm0,        mm3

        ; mm1 = final total for offset +0
        movd        mm2,        DWORD PTR [ref_ptr]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw   mm2,        mm6
        psadbw      mm2,        mm0
        paddw       mm1,        mm2

        ; mm3 = final total for offset +1
        movd        mm3,        DWORD PTR [ref_ptr+1]
        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw   mm3,        mm2
        psadbw      mm3,        mm0
        paddw       mm3,        mm4

        ; mm7 = final total for offset +2
        movd        mm7,        DWORD PTR [ref_ptr+2]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw   mm7,        mm6
        psadbw      mm7,        mm0
        paddw       mm7,        mm5

        ; ---------------- store ----------------
        ; The pointer and stride registers are dead from here on, so rcx can
        ; be reused for the output pointer on every ABI.
        mov         rcx,        result_ptr

        ; Pack the +0 total (low dword) and +1 total (high dword) into mm1
        ; so results[0] and results[1] go out in a single 8-byte store.
        punpckldq   mm1,        mm3

        movq        [rcx],      mm1
        movd        [rcx+8],    mm7             ; results[2] <- offset +2

    STACK_FRAME_DESTROY_X3
OLD | NEW |