OLD | NEW |
| (Empty) |
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
10 | |
11 | |
12 %include "vpx_ports/x86_abi_support.asm" | |
13 | |
;-----------------------------------------------------------------------
; PROCESS_16X2X8 first
;
; Handles two 16-byte-wide rows. For each row it computes the sum of
; absolute differences between the source row and the reference row at
; eight consecutive horizontal offsets (ref+0 .. ref+7) and accumulates
; the eight results as 16-bit words in xmm1.
;
; Parameter:
;   first  nonzero: xmm1 is initialised from the first row
;          zero   : both rows are added to the totals already in xmm1
;
; In:    rsi = source row; read with movdqa, so rsi and rsi+rax must be
;              16-byte aligned
;        rax = source stride in bytes
;        rdi = reference row; 24 bytes are read per row with movq
;        rdx = reference stride in bytes
; Out:   xmm1 = eight word totals, word i belongs to reference offset i
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm4, xmm5
;
; mpsadbw imm8: bits 1:0 select the 4-byte group of the second operand,
; bit 2 selects the starting byte (0 or 4) in the first operand. 0x0
; covers source bytes 0-3; 0x5 covers source bytes 4-7 against reference
; bytes shifted by 4. Their sum is the SAD over 8 source bytes.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
        ; first row, totals start directly in xmm1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm1,       xmm3            ; xmm1 = ref bytes 0..15
        punpcklqdq      xmm3,       xmm2            ; xmm3 = ref bytes 8..23

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0      ; src bytes 0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; src bytes 4..7

        psrldq          xmm0,       8               ; src bytes 8..15 to low half

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0      ; src bytes 8..11
        mpsadbw         xmm4,       xmm0,  0x5      ; src bytes 12..15

        paddw           xmm1,       xmm2
        paddw           xmm1,       xmm3
        paddw           xmm1,       xmm4
%else
        ; first row, partial sums built in xmm5 then added to xmm1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        movq            xmm2,       MMWORD PTR [rdi+16]
        punpcklqdq      xmm5,       xmm3            ; xmm5 = ref bytes 0..15
        punpcklqdq      xmm3,       xmm2            ; xmm3 = ref bytes 8..23

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8

        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endif
        ; second row, always accumulated through xmm5
        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
        punpcklqdq      xmm5,       xmm3
        punpcklqdq      xmm3,       xmm2

        ; all loads for this pair are done; step both pointers two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5

        psrldq          xmm0,       8
        movdqa          xmm4,       xmm3
        mpsadbw         xmm3,       xmm0,  0x0
        mpsadbw         xmm4,       xmm0,  0x5

        paddw           xmm5,       xmm2
        paddw           xmm5,       xmm3
        paddw           xmm5,       xmm4

        paddw           xmm1,       xmm5
%endmacro
85 | |
;-----------------------------------------------------------------------
; PROCESS_8X2X8 first
;
; Handles two 8-byte-wide rows. For each row it computes the sum of
; absolute differences between the source row and the reference row at
; eight consecutive horizontal offsets (ref+0 .. ref+7) and accumulates
; the eight results as 16-bit words in xmm1.
;
; Parameter:
;   first  nonzero: xmm1 is initialised from the first row
;          zero   : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (8 bytes read with movq)
;        rax = source stride in bytes
;        rdi = reference row (16 bytes read per row with movq)
;        rdx = reference stride in bytes
; Out:   xmm1 = eight word totals, word i belongs to reference offset i
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm2, xmm3, xmm5
;
; mpsadbw imm8: bits 1:0 select the 4-byte group of the second operand,
; bit 2 selects the starting byte (0 or 4) in the first operand. 0x0
; covers source bytes 0-3; 0x5 covers source bytes 4-7 against reference
; bytes shifted by 4. Their sum is the SAD over all 8 source bytes.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
        ; first row, totals start directly in xmm1
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3            ; xmm1 = ref bytes 0..15

        movdqa          xmm2,       xmm1
        mpsadbw         xmm1,       xmm0,  0x0      ; src bytes 0..3
        mpsadbw         xmm2,       xmm0,  0x5      ; src bytes 4..7
        paddw           xmm1,       xmm2
%else
        ; first row, partial sums built in xmm5 then added to xmm1
        movq            xmm0,       MMWORD PTR [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3            ; xmm5 = ref bytes 0..15

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endif
        ; second row, always accumulated through xmm5
        movq            xmm0,       MMWORD PTR [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        ; all loads for this pair are done; step both pointers two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        movdqa          xmm2,       xmm5
        mpsadbw         xmm5,       xmm0,  0x0
        mpsadbw         xmm2,       xmm0,  0x5
        paddw           xmm5,       xmm2

        paddw           xmm1,       xmm5
%endmacro
125 | |
;-----------------------------------------------------------------------
; PROCESS_4X2X8 first
;
; Handles two 4-byte-wide rows. For each row it computes the sum of
; absolute differences between the source row and the reference row at
; eight consecutive horizontal offsets (ref+0 .. ref+7) and accumulates
; the eight results as 16-bit words in xmm1.
;
; Parameter:
;   first  nonzero: xmm1 is initialised from the first row
;          zero   : both rows are added to the totals already in xmm1
;
; In:    rsi = source row (4 bytes read with movd)
;        rax = source stride in bytes
;        rdi = reference row (16 bytes read per row with movq)
;        rdx = reference stride in bytes
; Out:   xmm1 = eight word totals, word i belongs to reference offset i
;        rsi, rdi advanced by two rows
; Clobb: xmm0, xmm3, xmm5
;
; A single mpsadbw with imm8 = 0x0 is enough here: it compares source
; bytes 0-3 against the reference at starting bytes 0..7.
;-----------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
        ; first row, totals start directly in xmm1
        movd            xmm0,       [rsi]
        movq            xmm1,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm1,       xmm3            ; xmm1 = ref bytes 0..15

        mpsadbw         xmm1,       xmm0,  0x0
%else
        ; first row, computed in xmm5 then added to xmm1
        movd            xmm0,       [rsi]
        movq            xmm5,       MMWORD PTR [rdi]
        movq            xmm3,       MMWORD PTR [rdi+8]
        punpcklqdq      xmm5,       xmm3            ; xmm5 = ref bytes 0..15

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endif
        ; second row, always accumulated through xmm5
        movd            xmm0,       [rsi + rax]
        movq            xmm5,       MMWORD PTR [rdi+ rdx]
        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
        punpcklqdq      xmm5,       xmm3

        ; all loads for this pair are done; step both pointers two rows
        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        mpsadbw         xmm5,       xmm0,  0x0

        paddw           xmm1,       xmm5
%endmacro
156 | |
;-----------------------------------------------------------------------
; WRITE_AS_INTS
;
; Zero-extends the eight 16-bit totals in xmm1 to eight 32-bit values
; and stores them, in order, to the buffer passed as the fifth argument
; (arg(4)).
;
; In:    xmm1 = eight word totals
; Out:   32 bytes written at arg(4); the stores use movdqa, so that
;        buffer must be 16-byte aligned
; Clobb: rdi, xmm0, xmm1, xmm2
;-----------------------------------------------------------------------
%macro WRITE_AS_INTS 0
    mov             rdi,        arg(4)           ;Results
    pxor            xmm0,       xmm0             ; zero source for the widening
    movdqa          xmm2,       xmm1
    punpcklwd       xmm1,       xmm0             ; words 0..3 -> dwords
    punpckhwd       xmm2,       xmm0             ; words 4..7 -> dwords

    movdqa          [rdi],      xmm1
    movdqa          [rdi + 16], xmm2
%endmacro
167 | |
;-----------------------------------------------------------------------
;void vp9_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array);
;
; SAD of a 16x16 source block against the reference block at eight
; horizontal positions ref_ptr+0 .. ref_ptr+7.
;
; Eight PROCESS_16X2X8 steps cover the 16 rows. The per-position totals
; are kept as 16-bit words; the largest possible total is
; 16 * 16 * 255 = 65280, which still fits.
;
; sad_array receives eight 32-bit values (32 bytes) via WRITE_AS_INTS,
; stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): the C prototype is not visible in this file - confirm
; that the declared element type of sad_array is 32 bits wide.
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned.
;
; Uses xmm0-xmm5, rax, rdx; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    ; strides are C ints: sign-extend to 64 bits for address arithmetic
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_16X2X8 1                         ; rows  0-1, initialises xmm1
    PROCESS_16X2X8 0                         ; rows  2-3
    PROCESS_16X2X8 0                         ; rows  4-5
    PROCESS_16X2X8 0                         ; rows  6-7
    PROCESS_16X2X8 0                         ; rows  8-9
    PROCESS_16X2X8 0                         ; rows 10-11
    PROCESS_16X2X8 0                         ; rows 12-13
    PROCESS_16X2X8 0                         ; rows 14-15

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
206 | |
207 | |
;-----------------------------------------------------------------------
;void vp9_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; SAD of a 16-wide, 8-row source block against the reference block at
; eight horizontal positions ref_ptr+0 .. ref_ptr+7.
;
; Four PROCESS_16X2X8 steps cover the 8 rows.
;
; sad_array receives eight 32-bit values (32 bytes) via WRITE_AS_INTS,
; stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): the C prototype is not visible in this file - confirm
; that the declared element type of sad_array is 32 bits wide.
;
; Source rows are read with movdqa, so src_ptr and src_stride must keep
; every row 16-byte aligned.
;
; Uses xmm0-xmm5, rax, rdx; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    ; strides are C ints: sign-extend to 64 bits for address arithmetic
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_16X2X8 1                         ; rows 0-1, initialises xmm1
    PROCESS_16X2X8 0                         ; rows 2-3
    PROCESS_16X2X8 0                         ; rows 4-5
    PROCESS_16X2X8 0                         ; rows 6-7

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
243 | |
244 | |
;-----------------------------------------------------------------------
;void vp9_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; SAD of an 8x8 source block against the reference block at eight
; horizontal positions ref_ptr+0 .. ref_ptr+7.
;
; Four PROCESS_8X2X8 steps cover the 8 rows.
;
; sad_array receives eight 32-bit values (32 bytes) via WRITE_AS_INTS,
; stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): the C prototype is not visible in this file - confirm
; that the declared element type of sad_array is 32 bits wide.
;
; Uses xmm0-xmm3, xmm5, rax, rdx; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    ; strides are C ints: sign-extend to 64 bits for address arithmetic
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_8X2X8 1                          ; rows 0-1, initialises xmm1
    PROCESS_8X2X8 0                          ; rows 2-3
    PROCESS_8X2X8 0                          ; rows 4-5
    PROCESS_8X2X8 0                          ; rows 6-7

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
280 | |
281 | |
;-----------------------------------------------------------------------
;void vp9_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; SAD of an 8-wide, 16-row source block against the reference block at
; eight horizontal positions ref_ptr+0 .. ref_ptr+7.
;
; Eight PROCESS_8X2X8 steps cover the 16 rows.
;
; sad_array receives eight 32-bit values (32 bytes) via WRITE_AS_INTS,
; stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): the C prototype is not visible in this file - confirm
; that the declared element type of sad_array is 32 bits wide.
;
; Uses xmm0-xmm3, xmm5, rax, rdx; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    ; strides are C ints: sign-extend to 64 bits for address arithmetic
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_8X2X8 1                          ; rows  0-1, initialises xmm1
    PROCESS_8X2X8 0                          ; rows  2-3
    PROCESS_8X2X8 0                          ; rows  4-5
    PROCESS_8X2X8 0                          ; rows  6-7
    PROCESS_8X2X8 0                          ; rows  8-9
    PROCESS_8X2X8 0                          ; rows 10-11
    PROCESS_8X2X8 0                          ; rows 12-13
    PROCESS_8X2X8 0                          ; rows 14-15

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
321 | |
322 | |
;-----------------------------------------------------------------------
;void vp9_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array
;);
;
; SAD of a 4x4 source block against the reference block at eight
; horizontal positions ref_ptr+0 .. ref_ptr+7.
;
; Two PROCESS_4X2X8 steps cover the 4 rows.
;
; sad_array receives eight 32-bit values (32 bytes) via WRITE_AS_INTS,
; stored with movdqa, so it must be 16-byte aligned.
; NOTE(review): the C prototype is not visible in this file - confirm
; that the declared element type of sad_array is 32 bits wide.
;
; Uses xmm0-xmm3, xmm5, rax, rdx; rsi and rdi are saved and restored.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)           ;src_ptr
    mov         rdi,        arg(2)           ;ref_ptr

    ; strides are C ints: sign-extend to 64 bits for address arithmetic
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    PROCESS_4X2X8 1                          ; rows 0-1, initialises xmm1
    PROCESS_4X2X8 0                          ; rows 2-3

    WRITE_AS_INTS

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
356 | |
357 | |
358 | |
359 | |
OLD | NEW |