OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 128 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
139 | 139 |
140 PROCESS_16X2X3_OFFSET 1, %1 | 140 PROCESS_16X2X3_OFFSET 1, %1 |
141 PROCESS_16X2X3_OFFSET 0, %1 | 141 PROCESS_16X2X3_OFFSET 0, %1 |
142 PROCESS_16X2X3_OFFSET 0, %1 | 142 PROCESS_16X2X3_OFFSET 0, %1 |
143 PROCESS_16X2X3_OFFSET 0, %1 | 143 PROCESS_16X2X3_OFFSET 0, %1 |
144 | 144 |
145 jmp %2_store_off | 145 jmp %2_store_off |
146 | 146 |
147 %endmacro | 147 %endmacro |
148 | 148 |
; ---------------------------------------------------------------------------
; sad16x16x3_ssse3 (vp8_ prefix in the OLD column, vpx_ prefix in the NEW
; column; in the lines shown here the two columns differ only by that prefix).
;
; Writes three 32-bit values to results[0], results[1] and results[2] (see the
; store_off section at the end of the routine).
; NOTE(review): presumably these are the sums of absolute differences of the
; 16x16 source block against the reference block at three neighbouring
; positions; the accumulation itself happens in the PROCESS_16X2X3* macros,
; which are defined outside this view. Confirm there.
;
; The routine takes the low four bits of ref_ptr and uses them to index a
; 16-entry table of code offsets, then jumps to the selected
; "aligned_by_N" code path.
;
; NOTE(review): the prototype comment below says "void int"; nothing visible
; in this routine sets rax as a return value (the last visible write to rax
; loads src_stride), so plain "void" is presumably what was meant.
; ---------------------------------------------------------------------------
149 ;void int vp8_sad16x16x3_ssse3( | 149 ;void int vpx_sad16x16x3_ssse3( |
150 ; unsigned char *src_ptr, | 150 ; unsigned char *src_ptr, |
151 ; int src_stride, | 151 ; int src_stride, |
152 ; unsigned char *ref_ptr, | 152 ; unsigned char *ref_ptr, |
153 ; int ref_stride, | 153 ; int ref_stride, |
154 ; int *results) | 154 ; int *results) |
155 global sym(vp8_sad16x16x3_ssse3) PRIVATE | 155 global sym(vpx_sad16x16x3_ssse3) PRIVATE |
156 sym(vp8_sad16x16x3_ssse3): | 156 sym(vpx_sad16x16x3_ssse3): |
; Prolog: set up an rbp frame so the arguments can be read through arg(n).
; SHADOW_ARGS_TO_STACK and SAVE_XMM are helper macros defined elsewhere;
; NOTE(review): presumably they spill register arguments to the stack and
; preserve xmm6/xmm7 where the target ABI requires it - confirm in the ABI
; support include.
157 push rbp | 157 push rbp |
158 mov rbp, rsp | 158 mov rbp, rsp |
159 SHADOW_ARGS_TO_STACK 5 | 159 SHADOW_ARGS_TO_STACK 5 |
160 SAVE_XMM 7 | 160 SAVE_XMM 7 |
; rsi, rdi and rcx are overwritten below, so they are saved here and restored
; in the epilog (in reverse order).
161 push rsi | 161 push rsi |
162 push rdi | 162 push rdi |
163 push rcx | 163 push rcx |
164 ; end prolog | 164 ; end prolog |
165 | 165 |
166 mov rsi, arg(0) ;src_ptr | 166 mov rsi, arg(0) ;src_ptr |
167 mov rdi, arg(2) ;ref_ptr | 167 mov rdi, arg(2) ;ref_ptr |
168 | 168 |
; rdx = ref_ptr & 0xf, i.e. how far ref_ptr is past a 16-byte boundary (0..15).
; This value is the index into the jump table below.
169 mov rdx, 0xf | 169 mov rdx, 0xf |
170 and rdx, rdi | 170 and rdx, rdi |
171 | 171 |
; The jump table is stored inline in the instruction stream, so execution has
; to branch around it. Each of the 16 dword entries is the distance from the
; do_jump label to the corresponding aligned_by_N label.
172 jmp .vp8_sad16x16x3_ssse3_skiptable | 172 jmp .vpx_sad16x16x3_ssse3_skiptable |
173 .vp8_sad16x16x3_ssse3_jumptable: | 173 .vpx_sad16x16x3_ssse3_jumptable: |
174 dd .vp8_sad16x16x3_ssse3_aligned_by_0 - .vp8_sad16x16x3_ssse3_do_jump | 174 dd .vpx_sad16x16x3_ssse3_aligned_by_0 - .vpx_sad16x16x3_ssse3_do_jump |
175 dd .vp8_sad16x16x3_ssse3_aligned_by_1 - .vp8_sad16x16x3_ssse3_do_jump | 175 dd .vpx_sad16x16x3_ssse3_aligned_by_1 - .vpx_sad16x16x3_ssse3_do_jump |
176 dd .vp8_sad16x16x3_ssse3_aligned_by_2 - .vp8_sad16x16x3_ssse3_do_jump | 176 dd .vpx_sad16x16x3_ssse3_aligned_by_2 - .vpx_sad16x16x3_ssse3_do_jump |
177 dd .vp8_sad16x16x3_ssse3_aligned_by_3 - .vp8_sad16x16x3_ssse3_do_jump | 177 dd .vpx_sad16x16x3_ssse3_aligned_by_3 - .vpx_sad16x16x3_ssse3_do_jump |
178 dd .vp8_sad16x16x3_ssse3_aligned_by_4 - .vp8_sad16x16x3_ssse3_do_jump | 178 dd .vpx_sad16x16x3_ssse3_aligned_by_4 - .vpx_sad16x16x3_ssse3_do_jump |
179 dd .vp8_sad16x16x3_ssse3_aligned_by_5 - .vp8_sad16x16x3_ssse3_do_jump | 179 dd .vpx_sad16x16x3_ssse3_aligned_by_5 - .vpx_sad16x16x3_ssse3_do_jump |
180 dd .vp8_sad16x16x3_ssse3_aligned_by_6 - .vp8_sad16x16x3_ssse3_do_jump | 180 dd .vpx_sad16x16x3_ssse3_aligned_by_6 - .vpx_sad16x16x3_ssse3_do_jump |
181 dd .vp8_sad16x16x3_ssse3_aligned_by_7 - .vp8_sad16x16x3_ssse3_do_jump | 181 dd .vpx_sad16x16x3_ssse3_aligned_by_7 - .vpx_sad16x16x3_ssse3_do_jump |
182 dd .vp8_sad16x16x3_ssse3_aligned_by_8 - .vp8_sad16x16x3_ssse3_do_jump | 182 dd .vpx_sad16x16x3_ssse3_aligned_by_8 - .vpx_sad16x16x3_ssse3_do_jump |
183 dd .vp8_sad16x16x3_ssse3_aligned_by_9 - .vp8_sad16x16x3_ssse3_do_jump | 183 dd .vpx_sad16x16x3_ssse3_aligned_by_9 - .vpx_sad16x16x3_ssse3_do_jump |
184 dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump | 184 dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump |
185 dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump | 185 dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump |
186 dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump | 186 dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump |
187 dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump | 187 dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump |
188 dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump | 188 dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump |
189 dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump | 189 dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump |
190 .vp8_sad16x16x3_ssse3_skiptable: | 190 .vpx_sad16x16x3_ssse3_skiptable: |
191 | 191 |
; Obtain the run-time address of do_jump without an absolute reference: the
; call pushes the address of the instruction that follows it (which is the
; do_jump label itself) and the pop immediately retrieves it into rcx.
192 call .vp8_sad16x16x3_ssse3_do_jump | 192 call .vpx_sad16x16x3_ssse3_do_jump |
193 .vp8_sad16x16x3_ssse3_do_jump: | 193 .vpx_sad16x16x3_ssse3_do_jump: |
194 pop rcx ; get the address of do_jump | 194 pop rcx ; get the address of do_jump |
; rax = (jumptable - do_jump) + run-time address of do_jump
;     = run-time address of the jump table.
195 mov rax, .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_
ssse3_do_jump | 195 mov rax, .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_
ssse3_do_jump |
196 add rax, rcx ; get the absolute address of vp8_sad16x16x3_
ssse3_jumptable | 196 add rax, rcx ; get the absolute address of vpx_sad16x16x3_
ssse3_jumptable |
197 | 197 |
; Load table entry rdx (sign-extended dword) and add it to the address of
; do_jump: rcx now holds the run-time address of aligned_by_<rdx>.
198 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from
the jumptable | 198 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from
the jumptable |
199 add rcx, rax | 199 add rcx, rax |
200 | 200 |
; rax and rdx are free again after the table lookup; reload them with the
; sign-extended strides before dispatching.
201 movsxd rax, dword ptr arg(1) ;src_stride | 201 movsxd rax, dword ptr arg(1) ;src_stride |
202 movsxd rdx, dword ptr arg(3) ;ref_stride | 202 movsxd rdx, dword ptr arg(3) ;ref_stride |
203 | 203 |
; Dispatch to the alignment-specific code path selected above.
204 jmp rcx | 204 jmp rcx |
205 | 205 |
; One macro expansion per alignment 0..14. Each expansion ends with a jump to
; the store_off label of this routine (second macro argument + "_store_off").
; NOTE(review): the aligned_by_0 .. aligned_by_14 labels referenced by the
; table are not written out literally in this routine; presumably each
; expansion defines its own label from the two macro arguments - the macro
; header is outside this view, confirm there.
206 PROCESS_16X16X3_OFFSET 0, .vp8_sad16x16x3_ssse3 | 206 PROCESS_16X16X3_OFFSET 0, .vpx_sad16x16x3_ssse3 |
207 PROCESS_16X16X3_OFFSET 1, .vp8_sad16x16x3_ssse3 | 207 PROCESS_16X16X3_OFFSET 1, .vpx_sad16x16x3_ssse3 |
208 PROCESS_16X16X3_OFFSET 2, .vp8_sad16x16x3_ssse3 | 208 PROCESS_16X16X3_OFFSET 2, .vpx_sad16x16x3_ssse3 |
209 PROCESS_16X16X3_OFFSET 3, .vp8_sad16x16x3_ssse3 | 209 PROCESS_16X16X3_OFFSET 3, .vpx_sad16x16x3_ssse3 |
210 PROCESS_16X16X3_OFFSET 4, .vp8_sad16x16x3_ssse3 | 210 PROCESS_16X16X3_OFFSET 4, .vpx_sad16x16x3_ssse3 |
211 PROCESS_16X16X3_OFFSET 5, .vp8_sad16x16x3_ssse3 | 211 PROCESS_16X16X3_OFFSET 5, .vpx_sad16x16x3_ssse3 |
212 PROCESS_16X16X3_OFFSET 6, .vp8_sad16x16x3_ssse3 | 212 PROCESS_16X16X3_OFFSET 6, .vpx_sad16x16x3_ssse3 |
213 PROCESS_16X16X3_OFFSET 7, .vp8_sad16x16x3_ssse3 | 213 PROCESS_16X16X3_OFFSET 7, .vpx_sad16x16x3_ssse3 |
214 PROCESS_16X16X3_OFFSET 8, .vp8_sad16x16x3_ssse3 | 214 PROCESS_16X16X3_OFFSET 8, .vpx_sad16x16x3_ssse3 |
215 PROCESS_16X16X3_OFFSET 9, .vp8_sad16x16x3_ssse3 | 215 PROCESS_16X16X3_OFFSET 9, .vpx_sad16x16x3_ssse3 |
216 PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3 | 216 PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3 |
217 PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3 | 217 PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3 |
218 PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3 | 218 PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3 |
219 PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3 | 219 PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3 |
220 PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3 | 220 PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3 |
221 | 221 |
; Alignment 15 is handled with the non-OFFSET macro variant, invoked 8 times
; (first invocation with argument 1, the rest with 0), and then falls straight
; through into store_off.
; NOTE(review): presumably each PROCESS_16X2X3 handles two rows (8 x 2 = 16
; rows) and the argument 1 marks the first pass that initialises the
; accumulators - the macro body is outside this view, confirm there.
222 .vp8_sad16x16x3_ssse3_aligned_by_15: | 222 .vpx_sad16x16x3_ssse3_aligned_by_15: |
223 PROCESS_16X2X3 1 | 223 PROCESS_16X2X3 1 |
224 PROCESS_16X2X3 0 | 224 PROCESS_16X2X3 0 |
225 PROCESS_16X2X3 0 | 225 PROCESS_16X2X3 0 |
226 PROCESS_16X2X3 0 | 226 PROCESS_16X2X3 0 |
227 PROCESS_16X2X3 0 | 227 PROCESS_16X2X3 0 |
228 PROCESS_16X2X3 0 | 228 PROCESS_16X2X3 0 |
229 PROCESS_16X2X3 0 | 229 PROCESS_16X2X3 0 |
230 PROCESS_16X2X3 0 | 230 PROCESS_16X2X3 0 |
231 | 231 |
; Common tail for all 16 paths: fold each of xmm5, xmm6 and xmm7 into a single
; value and store it to results[0], results[1] and results[2] respectively.
; For each register: movq copies the low 64 bits into xmm0 (clearing the upper
; 64 bits), psrldq moves the high 64 bits down, paddw adds the two halves as
; 16-bit lanes, and movd stores the low 32 bits.
; NOTE(review): paddw is a 16-bit add, so this relies on low half + high half
; staying below 65536 - confirm against the accumulation macros.
232 .vp8_sad16x16x3_ssse3_store_off: | 232 .vpx_sad16x16x3_ssse3_store_off: |
233 mov rdi, arg(4) ;Results | 233 mov rdi, arg(4) ;Results |
234 | 234 |
235 movq xmm0, xmm5 | 235 movq xmm0, xmm5 |
236 psrldq xmm5, 8 | 236 psrldq xmm5, 8 |
237 | 237 |
238 paddw xmm0, xmm5 | 238 paddw xmm0, xmm5 |
239 movd [rdi], xmm0 | 239 movd [rdi], xmm0 |
240 ;- | 240 ;- |
241 movq xmm0, xmm6 | 241 movq xmm0, xmm6 |
242 psrldq xmm6, 8 | 242 psrldq xmm6, 8 |
243 | 243 |
244 paddw xmm0, xmm6 | 244 paddw xmm0, xmm6 |
245 movd [rdi+4], xmm0 | 245 movd [rdi+4], xmm0 |
246 ;- | 246 ;- |
247 movq xmm0, xmm7 | 247 movq xmm0, xmm7 |
248 psrldq xmm7, 8 | 248 psrldq xmm7, 8 |
249 | 249 |
250 paddw xmm0, xmm7 | 250 paddw xmm0, xmm7 |
251 movd [rdi+8], xmm0 | 251 movd [rdi+8], xmm0 |
252 | 252 |
; Epilog: undo the prolog in exact reverse order.
253 ; begin epilog | 253 ; begin epilog |
254 pop rcx | 254 pop rcx |
255 pop rdi | 255 pop rdi |
256 pop rsi | 256 pop rsi |
257 RESTORE_XMM | 257 RESTORE_XMM |
258 UNSHADOW_ARGS | 258 UNSHADOW_ARGS |
259 pop rbp | 259 pop rbp |
260 ret | 260 ret |
261 | 261 |
; ---------------------------------------------------------------------------
; sad16x8x3_ssse3 (vp8_ prefix in the OLD column, vpx_ prefix in the NEW
; column; in the lines shown here the two columns differ only by that prefix).
;
; Writes three 32-bit values to results[0], results[1] and results[2] (see the
; store_off section at the end of the routine).
; NOTE(review): presumably these are the sums of absolute differences of the
; 16x8 source block against the reference block at three neighbouring
; positions; the accumulation itself happens in the PROCESS_16X2X3* and
; PROCESS_16X8X3_OFFSET macros, which are defined outside this view. Confirm
; there.
;
; The routine takes the low four bits of ref_ptr and uses them to index a
; 16-entry table of code offsets, then jumps to the selected
; "aligned_by_N" code path.
;
; NOTE(review): the prototype comment below says "void int"; nothing visible
; in this routine sets rax as a return value (the last visible write to rax
; loads src_stride), so plain "void" is presumably what was meant.
; ---------------------------------------------------------------------------
262 ;void int vp8_sad16x8x3_ssse3( | 262 ;void int vpx_sad16x8x3_ssse3( |
263 ; unsigned char *src_ptr, | 263 ; unsigned char *src_ptr, |
264 ; int src_stride, | 264 ; int src_stride, |
265 ; unsigned char *ref_ptr, | 265 ; unsigned char *ref_ptr, |
266 ; int ref_stride, | 266 ; int ref_stride, |
267 ; int *results) | 267 ; int *results) |
268 global sym(vp8_sad16x8x3_ssse3) PRIVATE | 268 global sym(vpx_sad16x8x3_ssse3) PRIVATE |
269 sym(vp8_sad16x8x3_ssse3): | 269 sym(vpx_sad16x8x3_ssse3): |
; Prolog: set up an rbp frame so the arguments can be read through arg(n).
; SHADOW_ARGS_TO_STACK and SAVE_XMM are helper macros defined elsewhere;
; NOTE(review): presumably they spill register arguments to the stack and
; preserve xmm6/xmm7 where the target ABI requires it - confirm in the ABI
; support include.
270 push rbp | 270 push rbp |
271 mov rbp, rsp | 271 mov rbp, rsp |
272 SHADOW_ARGS_TO_STACK 5 | 272 SHADOW_ARGS_TO_STACK 5 |
273 SAVE_XMM 7 | 273 SAVE_XMM 7 |
; rsi, rdi and rcx are overwritten below, so they are saved here and restored
; in the epilog (in reverse order).
274 push rsi | 274 push rsi |
275 push rdi | 275 push rdi |
276 push rcx | 276 push rcx |
277 ; end prolog | 277 ; end prolog |
278 | 278 |
279 mov rsi, arg(0) ;src_ptr | 279 mov rsi, arg(0) ;src_ptr |
280 mov rdi, arg(2) ;ref_ptr | 280 mov rdi, arg(2) ;ref_ptr |
281 | 281 |
; rdx = ref_ptr & 0xf, i.e. how far ref_ptr is past a 16-byte boundary (0..15).
; This value is the index into the jump table below.
282 mov rdx, 0xf | 282 mov rdx, 0xf |
283 and rdx, rdi | 283 and rdx, rdi |
284 | 284 |
; The jump table is stored inline in the instruction stream, so execution has
; to branch around it. Each of the 16 dword entries is the distance from the
; do_jump label to the corresponding aligned_by_N label.
285 jmp .vp8_sad16x8x3_ssse3_skiptable | 285 jmp .vpx_sad16x8x3_ssse3_skiptable |
286 .vp8_sad16x8x3_ssse3_jumptable: | 286 .vpx_sad16x8x3_ssse3_jumptable: |
287 dd .vp8_sad16x8x3_ssse3_aligned_by_0 - .vp8_sad16x8x3_ssse3_do_jump | 287 dd .vpx_sad16x8x3_ssse3_aligned_by_0 - .vpx_sad16x8x3_ssse3_do_jump |
288 dd .vp8_sad16x8x3_ssse3_aligned_by_1 - .vp8_sad16x8x3_ssse3_do_jump | 288 dd .vpx_sad16x8x3_ssse3_aligned_by_1 - .vpx_sad16x8x3_ssse3_do_jump |
289 dd .vp8_sad16x8x3_ssse3_aligned_by_2 - .vp8_sad16x8x3_ssse3_do_jump | 289 dd .vpx_sad16x8x3_ssse3_aligned_by_2 - .vpx_sad16x8x3_ssse3_do_jump |
290 dd .vp8_sad16x8x3_ssse3_aligned_by_3 - .vp8_sad16x8x3_ssse3_do_jump | 290 dd .vpx_sad16x8x3_ssse3_aligned_by_3 - .vpx_sad16x8x3_ssse3_do_jump |
291 dd .vp8_sad16x8x3_ssse3_aligned_by_4 - .vp8_sad16x8x3_ssse3_do_jump | 291 dd .vpx_sad16x8x3_ssse3_aligned_by_4 - .vpx_sad16x8x3_ssse3_do_jump |
292 dd .vp8_sad16x8x3_ssse3_aligned_by_5 - .vp8_sad16x8x3_ssse3_do_jump | 292 dd .vpx_sad16x8x3_ssse3_aligned_by_5 - .vpx_sad16x8x3_ssse3_do_jump |
293 dd .vp8_sad16x8x3_ssse3_aligned_by_6 - .vp8_sad16x8x3_ssse3_do_jump | 293 dd .vpx_sad16x8x3_ssse3_aligned_by_6 - .vpx_sad16x8x3_ssse3_do_jump |
294 dd .vp8_sad16x8x3_ssse3_aligned_by_7 - .vp8_sad16x8x3_ssse3_do_jump | 294 dd .vpx_sad16x8x3_ssse3_aligned_by_7 - .vpx_sad16x8x3_ssse3_do_jump |
295 dd .vp8_sad16x8x3_ssse3_aligned_by_8 - .vp8_sad16x8x3_ssse3_do_jump | 295 dd .vpx_sad16x8x3_ssse3_aligned_by_8 - .vpx_sad16x8x3_ssse3_do_jump |
296 dd .vp8_sad16x8x3_ssse3_aligned_by_9 - .vp8_sad16x8x3_ssse3_do_jump | 296 dd .vpx_sad16x8x3_ssse3_aligned_by_9 - .vpx_sad16x8x3_ssse3_do_jump |
297 dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump | 297 dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump |
298 dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump | 298 dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump |
299 dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump | 299 dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump |
300 dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump | 300 dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump |
301 dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump | 301 dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump |
302 dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump | 302 dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump |
303 .vp8_sad16x8x3_ssse3_skiptable: | 303 .vpx_sad16x8x3_ssse3_skiptable: |
304 | 304 |
; Obtain the run-time address of do_jump without an absolute reference: the
; call pushes the address of the instruction that follows it (which is the
; do_jump label itself) and the pop immediately retrieves it into rcx.
305 call .vp8_sad16x8x3_ssse3_do_jump | 305 call .vpx_sad16x8x3_ssse3_do_jump |
306 .vp8_sad16x8x3_ssse3_do_jump: | 306 .vpx_sad16x8x3_ssse3_do_jump: |
307 pop rcx ; get the address of do_jump | 307 pop rcx ; get the address of do_jump |
; rax = (jumptable - do_jump) + run-time address of do_jump
;     = run-time address of the jump table.
308 mov rax, .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ss
se3_do_jump | 308 mov rax, .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ss
se3_do_jump |
309 add rax, rcx ; get the absolute address of vp8_sad16x8x3_s
sse3_jumptable | 309 add rax, rcx ; get the absolute address of vpx_sad16x8x3_s
sse3_jumptable |
310 | 310 |
; Load table entry rdx (sign-extended dword) and add it to the address of
; do_jump: rcx now holds the run-time address of aligned_by_<rdx>.
311 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from
the jumptable | 311 movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from
the jumptable |
312 add rcx, rax | 312 add rcx, rax |
313 | 313 |
; rax and rdx are free again after the table lookup; reload them with the
; sign-extended strides before dispatching.
314 movsxd rax, dword ptr arg(1) ;src_stride | 314 movsxd rax, dword ptr arg(1) ;src_stride |
315 movsxd rdx, dword ptr arg(3) ;ref_stride | 315 movsxd rdx, dword ptr arg(3) ;ref_stride |
316 | 316 |
; Dispatch to the alignment-specific code path selected above.
317 jmp rcx | 317 jmp rcx |
318 | 318 |
; One macro expansion per alignment 0..14.
; NOTE(review): the aligned_by_0 .. aligned_by_14 labels referenced by the
; table are not written out literally in this routine; presumably each
; PROCESS_16X8X3_OFFSET expansion defines its own label from the two macro
; arguments and ends by jumping to this routine's store_off label - the macro
; is defined outside this view, confirm there.
319 PROCESS_16X8X3_OFFSET 0, .vp8_sad16x8x3_ssse3 | 319 PROCESS_16X8X3_OFFSET 0, .vpx_sad16x8x3_ssse3 |
320 PROCESS_16X8X3_OFFSET 1, .vp8_sad16x8x3_ssse3 | 320 PROCESS_16X8X3_OFFSET 1, .vpx_sad16x8x3_ssse3 |
321 PROCESS_16X8X3_OFFSET 2, .vp8_sad16x8x3_ssse3 | 321 PROCESS_16X8X3_OFFSET 2, .vpx_sad16x8x3_ssse3 |
322 PROCESS_16X8X3_OFFSET 3, .vp8_sad16x8x3_ssse3 | 322 PROCESS_16X8X3_OFFSET 3, .vpx_sad16x8x3_ssse3 |
323 PROCESS_16X8X3_OFFSET 4, .vp8_sad16x8x3_ssse3 | 323 PROCESS_16X8X3_OFFSET 4, .vpx_sad16x8x3_ssse3 |
324 PROCESS_16X8X3_OFFSET 5, .vp8_sad16x8x3_ssse3 | 324 PROCESS_16X8X3_OFFSET 5, .vpx_sad16x8x3_ssse3 |
325 PROCESS_16X8X3_OFFSET 6, .vp8_sad16x8x3_ssse3 | 325 PROCESS_16X8X3_OFFSET 6, .vpx_sad16x8x3_ssse3 |
326 PROCESS_16X8X3_OFFSET 7, .vp8_sad16x8x3_ssse3 | 326 PROCESS_16X8X3_OFFSET 7, .vpx_sad16x8x3_ssse3 |
327 PROCESS_16X8X3_OFFSET 8, .vp8_sad16x8x3_ssse3 | 327 PROCESS_16X8X3_OFFSET 8, .vpx_sad16x8x3_ssse3 |
328 PROCESS_16X8X3_OFFSET 9, .vp8_sad16x8x3_ssse3 | 328 PROCESS_16X8X3_OFFSET 9, .vpx_sad16x8x3_ssse3 |
329 PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3 | 329 PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3 |
330 PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3 | 330 PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3 |
331 PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3 | 331 PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3 |
332 PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3 | 332 PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3 |
333 PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3 | 333 PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3 |
334 | 334 |
; Alignment 15 is handled with the non-OFFSET macro variant, invoked 4 times
; (first invocation with argument 1, the rest with 0), and then falls straight
; through into store_off.
; NOTE(review): presumably each PROCESS_16X2X3 handles two rows (4 x 2 = 8
; rows) and the argument 1 marks the first pass that initialises the
; accumulators - the macro body is outside this view, confirm there.
335 .vp8_sad16x8x3_ssse3_aligned_by_15: | 335 .vpx_sad16x8x3_ssse3_aligned_by_15: |
336 | 336 |
337 PROCESS_16X2X3 1 | 337 PROCESS_16X2X3 1 |
338 PROCESS_16X2X3 0 | 338 PROCESS_16X2X3 0 |
339 PROCESS_16X2X3 0 | 339 PROCESS_16X2X3 0 |
340 PROCESS_16X2X3 0 | 340 PROCESS_16X2X3 0 |
341 | 341 |
; Common tail for all 16 paths: fold each of xmm5, xmm6 and xmm7 into a single
; value and store it to results[0], results[1] and results[2] respectively.
; For each register: movq copies the low 64 bits into xmm0 (clearing the upper
; 64 bits), psrldq moves the high 64 bits down, paddw adds the two halves as
; 16-bit lanes, and movd stores the low 32 bits.
; NOTE(review): paddw is a 16-bit add, so this relies on low half + high half
; staying below 65536 - confirm against the accumulation macros.
342 .vp8_sad16x8x3_ssse3_store_off: | 342 .vpx_sad16x8x3_ssse3_store_off: |
343 mov rdi, arg(4) ;Results | 343 mov rdi, arg(4) ;Results |
344 | 344 |
345 movq xmm0, xmm5 | 345 movq xmm0, xmm5 |
346 psrldq xmm5, 8 | 346 psrldq xmm5, 8 |
347 | 347 |
348 paddw xmm0, xmm5 | 348 paddw xmm0, xmm5 |
349 movd [rdi], xmm0 | 349 movd [rdi], xmm0 |
350 ;- | 350 ;- |
351 movq xmm0, xmm6 | 351 movq xmm0, xmm6 |
352 psrldq xmm6, 8 | 352 psrldq xmm6, 8 |
353 | 353 |
354 paddw xmm0, xmm6 | 354 paddw xmm0, xmm6 |
355 movd [rdi+4], xmm0 | 355 movd [rdi+4], xmm0 |
356 ;- | 356 ;- |
357 movq xmm0, xmm7 | 357 movq xmm0, xmm7 |
358 psrldq xmm7, 8 | 358 psrldq xmm7, 8 |
359 | 359 |
360 paddw xmm0, xmm7 | 360 paddw xmm0, xmm7 |
361 movd [rdi+8], xmm0 | 361 movd [rdi+8], xmm0 |
362 | 362 |
; Epilog: undo the prolog in exact reverse order.
363 ; begin epilog | 363 ; begin epilog |
364 pop rcx | 364 pop rcx |
365 pop rdi | 365 pop rdi |
366 pop rsi | 366 pop rsi |
367 RESTORE_XMM | 367 RESTORE_XMM |
368 UNSHADOW_ARGS | 368 UNSHADOW_ARGS |
369 pop rbp | 369 pop rbp |
370 ret | 370 ret |
OLD | NEW |