OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
11 %include "vpx_ports/x86_abi_support.asm" | 11 %include "vpx_ports/x86_abi_support.asm" |
12 | 12 |
; STACK_FRAME_CREATE_X3 (no macro parameters)
;
; Binds the symbolic operand names used by the x3 routines in this file
; (src_ptr, src_stride, ref_ptr, ref_stride, end_ptr, ret_var, result_ptr,
; height) to concrete registers or stack slots for the ABI being assembled,
; and emits whatever prologue that ABI needs.
;
; result_ptr and height (and max_err where it is still defined) all name the
; same fifth-argument location; presumably each routine uses only the alias
; that matches its own C signature -- confirm against the callers.
13 %macro STACK_FRAME_CREATE_X3 0 | 13 %macro STACK_FRAME_CREATE_X3 0 |
; --- 32-bit ABI: arguments are read through arg(n), so a frame is built,
; --- rsi/rdi/rbx are saved, and the first four arguments are loaded into
; --- the registers named below.
14 %if ABI_IS_32BIT | 14 %if ABI_IS_32BIT |
15 %define src_ptr rsi | 15 %define src_ptr rsi |
16 %define src_stride rax | 16 %define src_stride rax |
17 %define ref_ptr rdi | 17 %define ref_ptr rdi |
18 %define ref_stride rdx | 18 %define ref_stride rdx |
19 %define end_ptr rcx | 19 %define end_ptr rcx |
20 %define ret_var rbx | 20 %define ret_var rbx |
21 %define result_ptr arg(4) | 21 %define result_ptr arg(4) |
22 %define max_err arg(4) | |
23 %define height dword ptr arg(4) | 22 %define height dword ptr arg(4) |
; Prologue: establish rbp as the frame base, then save the three registers
; this branch goes on to use (rsi and rdi are loaded below; rbx is ret_var).
24 push rbp | 23 push rbp |
25 mov rbp, rsp | 24 mov rbp, rsp |
26 push rsi | 25 push rsi |
27 push rdi | 26 push rdi |
28 push rbx | 27 push rbx |
29 | 28 |
30 mov rsi, arg(0) ; src_ptr | 29 mov rsi, arg(0) ; src_ptr |
31 mov rdi, arg(2) ; ref_ptr | 30 mov rdi, arg(2) ; ref_ptr |
32 | 31 |
; The strides are read as 32-bit values and sign-extended into full-width
; registers so they can be used directly in address arithmetic.
33 movsxd rax, dword ptr arg(1) ; src_stride | 32 movsxd rax, dword ptr arg(1) ; src_stride |
34 movsxd rdx, dword ptr arg(3) ; ref_stride | 33 movsxd rdx, dword ptr arg(3) ; ref_stride |
35 %else | 34 %else |
; --- 64-bit Windows: the first four names map onto rcx, rdx, r8, r9 (the
; --- Microsoft x64 argument registers); r10/r11 are used as scratch names.
; --- SAVE_XMM is invoked first; its definition is not in this file.
36 %if LIBVPX_YASM_WIN64 | 35 %if LIBVPX_YASM_WIN64 |
37 SAVE_XMM 7, u | 36 SAVE_XMM 7, u |
38 %define src_ptr rcx | 37 %define src_ptr rcx |
39 %define src_stride rdx | 38 %define src_stride rdx |
40 %define ref_ptr r8 | 39 %define ref_ptr r8 |
41 %define ref_stride r9 | 40 %define ref_stride r9 |
42 %define end_ptr r10 | 41 %define end_ptr r10 |
43 %define ret_var r11 | 42 %define ret_var r11 |
; NOTE(review): the fifth argument is read from the stack. The offset
; presumably skips the XMM save area (xmm_stack_space), the return address
; (8) and the 32-byte shadow space (4*8) -- confirm against SAVE_XMM in
; x86_abi_support.asm.
44 %define result_ptr [rsp+xmm_stack_space+8+4*8] | 43 %define result_ptr [rsp+xmm_stack_space+8+4*8] |
45 %define max_err [rsp+xmm_stack_space+8+4*8] | |
46 %define height dword ptr [rsp+xmm_stack_space+8+4*8] | 44 %define height dword ptr [rsp+xmm_stack_space+8+4*8] |
47 %else | 45 %else |
; --- Other 64-bit targets: the names map onto rdi, rsi, rdx, rcx and r8,
; --- which is the System V AMD64 integer-argument order, so no loads or
; --- pushes are emitted. r9/r10 are used as scratch names.
48 %define src_ptr rdi | 46 %define src_ptr rdi |
49 %define src_stride rsi | 47 %define src_stride rsi |
50 %define ref_ptr rdx | 48 %define ref_ptr rdx |
51 %define ref_stride rcx | 49 %define ref_stride rcx |
52 %define end_ptr r9 | 50 %define end_ptr r9 |
53 %define ret_var r10 | 51 %define ret_var r10 |
54 %define result_ptr r8 | 52 %define result_ptr r8 |
55 %define max_err r8 | |
56 %define height r8 | 53 %define height r8 |
57 %endif | 54 %endif |
58 %endif | 55 %endif |
59 | 56 |
60 %endmacro | 57 %endmacro |
61 | 58 |
62 %macro STACK_FRAME_DESTROY_X3 0 | 59 %macro STACK_FRAME_DESTROY_X3 0 |
63 %define src_ptr | 60 %define src_ptr |
64 %define src_stride | 61 %define src_stride |
65 %define ref_ptr | 62 %define ref_ptr |
66 %define ref_stride | 63 %define ref_stride |
67 %define end_ptr | 64 %define end_ptr |
68 %define ret_var | 65 %define ret_var |
69 %define result_ptr | 66 %define result_ptr |
70 %define max_err | |
71 %define height | 67 %define height |
72 | 68 |
73 %if ABI_IS_32BIT | 69 %if ABI_IS_32BIT |
74 pop rbx | 70 pop rbx |
75 pop rdi | 71 pop rdi |
76 pop rsi | 72 pop rsi |
77 pop rbp | 73 pop rbp |
78 %else | 74 %else |
79 %if LIBVPX_YASM_WIN64 | 75 %if LIBVPX_YASM_WIN64 |
80 RESTORE_XMM | 76 RESTORE_XMM |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
162 | 158 |
163 psadbw mm1, mm0 | 159 psadbw mm1, mm0 |
164 psadbw mm2, mm0 | 160 psadbw mm2, mm0 |
165 psadbw mm3, mm0 | 161 psadbw mm3, mm0 |
166 | 162 |
167 paddw mm5, mm1 | 163 paddw mm5, mm1 |
168 paddw mm6, mm2 | 164 paddw mm6, mm2 |
169 paddw mm7, mm3 | 165 paddw mm7, mm3 |
170 %endmacro | 166 %endmacro |
171 | 167 |
172 ;void int vp9_sad16x16x3_sse3( | 168 ;void int vpx_sad16x16x3_sse3( |
173 ; unsigned char *src_ptr, | 169 ; unsigned char *src_ptr, |
174 ; int src_stride, | 170 ; int src_stride, |
175 ; unsigned char *ref_ptr, | 171 ; unsigned char *ref_ptr, |
176 ; int ref_stride, | 172 ; int ref_stride, |
177 ; int *results) | 173 ; int *results) |
178 global sym(vp9_sad16x16x3_sse3) PRIVATE | 174 global sym(vpx_sad16x16x3_sse3) PRIVATE |
179 sym(vp9_sad16x16x3_sse3): | 175 sym(vpx_sad16x16x3_sse3): |
180 | 176 |
181 STACK_FRAME_CREATE_X3 | 177 STACK_FRAME_CREATE_X3 |
182 | 178 |
183 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride | 179 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
184 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 180 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
185 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 181 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
186 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 182 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
187 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 183 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
188 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 184 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
189 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 185 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
(...skipping 14 matching lines...) Expand all Loading... |
204 movd [rcx+4], xmm0 | 200 movd [rcx+4], xmm0 |
205 ;- | 201 ;- |
206 movq xmm0, xmm7 | 202 movq xmm0, xmm7 |
207 psrldq xmm7, 8 | 203 psrldq xmm7, 8 |
208 | 204 |
209 paddw xmm0, xmm7 | 205 paddw xmm0, xmm7 |
210 movd [rcx+8], xmm0 | 206 movd [rcx+8], xmm0 |
211 | 207 |
212 STACK_FRAME_DESTROY_X3 | 208 STACK_FRAME_DESTROY_X3 |
213 | 209 |
214 ;void int vp9_sad16x8x3_sse3( | 210 ;void int vpx_sad16x8x3_sse3( |
215 ; unsigned char *src_ptr, | 211 ; unsigned char *src_ptr, |
216 ; int src_stride, | 212 ; int src_stride, |
217 ; unsigned char *ref_ptr, | 213 ; unsigned char *ref_ptr, |
218 ; int ref_stride, | 214 ; int ref_stride, |
219 ; int *results) | 215 ; int *results) |
220 global sym(vp9_sad16x8x3_sse3) PRIVATE | 216 global sym(vpx_sad16x8x3_sse3) PRIVATE |
221 sym(vp9_sad16x8x3_sse3): | 217 sym(vpx_sad16x8x3_sse3): |
222 | 218 |
223 STACK_FRAME_CREATE_X3 | 219 STACK_FRAME_CREATE_X3 |
224 | 220 |
225 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride | 221 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
226 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 222 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
227 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 223 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
228 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride | 224 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
229 | 225 |
230 mov rcx, result_ptr | 226 mov rcx, result_ptr |
231 | 227 |
(...skipping 10 matching lines...) Expand all Loading... |
242 movd [rcx+4], xmm0 | 238 movd [rcx+4], xmm0 |
243 ;- | 239 ;- |
244 movq xmm0, xmm7 | 240 movq xmm0, xmm7 |
245 psrldq xmm7, 8 | 241 psrldq xmm7, 8 |
246 | 242 |
247 paddw xmm0, xmm7 | 243 paddw xmm0, xmm7 |
248 movd [rcx+8], xmm0 | 244 movd [rcx+8], xmm0 |
249 | 245 |
250 STACK_FRAME_DESTROY_X3 | 246 STACK_FRAME_DESTROY_X3 |
251 | 247 |
; 8x16 variant of the x3 routines. Takes a source pointer/stride, a
; reference pointer/stride and a pointer to an int array, and stores three
; 32-bit values at results[0], results[1] and results[2].
; NOTE(review): by name these are presumably sums of absolute differences
; for three candidate reference positions; the PROCESS_8X2X3 body that
; produces them is not visible here.
252 ;void int vp9_sad8x16x3_sse3( | 248 ;void int vpx_sad8x16x3_sse3( |
253 ; unsigned char *src_ptr, | 249 ; unsigned char *src_ptr, |
254 ; int src_stride, | 250 ; int src_stride, |
255 ; unsigned char *ref_ptr, | 251 ; unsigned char *ref_ptr, |
256 ; int ref_stride, | 252 ; int ref_stride, |
257 ; int *results) | 253 ; int *results) |
258 global sym(vp9_sad8x16x3_sse3) PRIVATE | 254 global sym(vpx_sad8x16x3_sse3) PRIVATE |
259 sym(vp9_sad8x16x3_sse3): | 255 sym(vpx_sad8x16x3_sse3): |
260 | 256 |
; Bind src_ptr/src_stride/ref_ptr/ref_stride/result_ptr for the current ABI.
261 STACK_FRAME_CREATE_X3 | 257 STACK_FRAME_CREATE_X3 |
262 | 258 |
; Eight PROCESS_8X2X3 invocations. The first argument is 0 on the first
; call, 1 on the middle calls and 2 on the last.
; NOTE(review): presumably a first/middle/last stage selector, with each
; call consuming two rows (8 x 2 = 16 rows) -- confirm against the macro.
263 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride | 259 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
264 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 260 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
265 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 261 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
266 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 262 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
267 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 263 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
268 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 264 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
269 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 265 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
270 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride | 266 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
271 | 267 |
; rcx is reused as the output pointer. In every ABI branch of
; STACK_FRAME_CREATE_X3 it aliases a name (src_ptr, ref_stride or end_ptr)
; that is not referenced again after the PROCESS calls above.
272 mov rcx, result_ptr | 268 mov rcx, result_ptr |
273 | 269 |
; Pack the low dword of mm5 and the low dword of mm6 into mm5 so the first
; two results can be written with a single 8-byte store.
274 punpckldq mm5, mm6 | 270 punpckldq mm5, mm6 |
275 | 271 |
; results[0..1] <- mm5 (8 bytes); results[2] <- low dword of mm7.
276 movq [rcx], mm5 | 272 movq [rcx], mm5 |
277 movd [rcx+8], mm7 | 273 movd [rcx+8], mm7 |
278 | 274 |
; Clear the operand aliases and run the ABI-specific epilogue.
279 STACK_FRAME_DESTROY_X3 | 275 STACK_FRAME_DESTROY_X3 |
280 | 276 |
; 8x8 variant of the x3 routines. Takes a source pointer/stride, a
; reference pointer/stride and a pointer to an int array, and stores three
; 32-bit values at results[0], results[1] and results[2].
; NOTE(review): by name these are presumably sums of absolute differences
; for three candidate reference positions; the PROCESS_8X2X3 body that
; produces them is not visible here.
281 ;void int vp9_sad8x8x3_sse3( | 277 ;void int vpx_sad8x8x3_sse3( |
282 ; unsigned char *src_ptr, | 278 ; unsigned char *src_ptr, |
283 ; int src_stride, | 279 ; int src_stride, |
284 ; unsigned char *ref_ptr, | 280 ; unsigned char *ref_ptr, |
285 ; int ref_stride, | 281 ; int ref_stride, |
286 ; int *results) | 282 ; int *results) |
287 global sym(vp9_sad8x8x3_sse3) PRIVATE | 283 global sym(vpx_sad8x8x3_sse3) PRIVATE |
288 sym(vp9_sad8x8x3_sse3): | 284 sym(vpx_sad8x8x3_sse3): |
289 | 285 |
; Bind src_ptr/src_stride/ref_ptr/ref_stride/result_ptr for the current ABI.
290 STACK_FRAME_CREATE_X3 | 286 STACK_FRAME_CREATE_X3 |
291 | 287 |
; Four PROCESS_8X2X3 invocations. The first argument is 0 on the first
; call, 1 on the middle calls and 2 on the last.
; NOTE(review): presumably a first/middle/last stage selector, with each
; call consuming two rows (4 x 2 = 8 rows) -- confirm against the macro.
292 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride | 288 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride |
293 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 289 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
294 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride | 290 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride |
295 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride | 291 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride |
296 | 292 |
; rcx is reused as the output pointer. In every ABI branch of
; STACK_FRAME_CREATE_X3 it aliases a name (src_ptr, ref_stride or end_ptr)
; that is not referenced again after the PROCESS calls above.
297 mov rcx, result_ptr | 293 mov rcx, result_ptr |
298 | 294 |
; Pack the low dword of mm5 and the low dword of mm6 into mm5 so the first
; two results can be written with a single 8-byte store.
299 punpckldq mm5, mm6 | 295 punpckldq mm5, mm6 |
300 | 296 |
; results[0..1] <- mm5 (8 bytes); results[2] <- low dword of mm7.
301 movq [rcx], mm5 | 297 movq [rcx], mm5 |
302 movd [rcx+8], mm7 | 298 movd [rcx+8], mm7 |
303 | 299 |
; Clear the operand aliases and run the ABI-specific epilogue.
304 STACK_FRAME_DESTROY_X3 | 300 STACK_FRAME_DESTROY_X3 |
305 | 301 |
306 ;void int vp9_sad4x4x3_sse3( | 302 ;void int vpx_sad4x4x3_sse3( |
307 ; unsigned char *src_ptr, | 303 ; unsigned char *src_ptr, |
308 ; int src_stride, | 304 ; int src_stride, |
309 ; unsigned char *ref_ptr, | 305 ; unsigned char *ref_ptr, |
310 ; int ref_stride, | 306 ; int ref_stride, |
311 ; int *results) | 307 ; int *results) |
312 global sym(vp9_sad4x4x3_sse3) PRIVATE | 308 global sym(vpx_sad4x4x3_sse3) PRIVATE |
313 sym(vp9_sad4x4x3_sse3): | 309 sym(vpx_sad4x4x3_sse3): |
314 | 310 |
315 STACK_FRAME_CREATE_X3 | 311 STACK_FRAME_CREATE_X3 |
316 | 312 |
317 movd mm0, DWORD PTR [src_ptr] | 313 movd mm0, DWORD PTR [src_ptr] |
318 movd mm1, DWORD PTR [ref_ptr] | 314 movd mm1, DWORD PTR [ref_ptr] |
319 | 315 |
320 movd mm2, DWORD PTR [src_ptr+src_stride] | 316 movd mm2, DWORD PTR [src_ptr+src_stride] |
321 movd mm3, DWORD PTR [ref_ptr+ref_stride] | 317 movd mm3, DWORD PTR [ref_ptr+ref_stride] |
322 | 318 |
323 punpcklbw mm0, mm2 | 319 punpcklbw mm0, mm2 |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
369 paddw mm7, mm5 | 365 paddw mm7, mm5 |
370 | 366 |
371 mov rcx, result_ptr | 367 mov rcx, result_ptr |
372 | 368 |
373 punpckldq mm1, mm3 | 369 punpckldq mm1, mm3 |
374 | 370 |
375 movq [rcx], mm1 | 371 movq [rcx], mm1 |
376 movd [rcx+8], mm7 | 372 movd [rcx+8], mm7 |
377 | 373 |
378 STACK_FRAME_DESTROY_X3 | 374 STACK_FRAME_DESTROY_X3 |
OLD | NEW |