| OLD | NEW |
| (Empty) |
| 1 ; | |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. | |
| 3 ; | |
| 4 ; Use of this source code is governed by a BSD-style license | |
| 5 ; that can be found in the LICENSE file in the root of the source | |
| 6 ; tree. An additional intellectual property rights grant can be found | |
| 7 ; in the file PATENTS. All contributing project authors may | |
| 8 ; be found in the AUTHORS file in the root of the source tree. | |
| 9 ; | |
| 10 | |
| 11 | |
| 12 %include "vpx_ports/x86_abi_support.asm" | |
| 13 | |
; SAD_16X1X8 dst, src_row, ref_row
;
; Computes, for one row of 16 source pixels, the eight sums of absolute
; differences against the reference row at byte offsets 0..7 and leaves them
; as eight 16-bit words in dst.
;   %1 = xmm register receiving the eight word sums
;   %2 = address expression of the source row (loaded with movdqa, so the
;        row must be 16-byte aligned)
;   %3 = address expression of the reference row (24 bytes are read)
; Scratch: xmm0, xmm2, xmm3, xmm4.
%macro SAD_16X1X8 3
    movdqa          xmm0,       XMMWORD PTR [%2]        ; src bytes 0..15
    movq            %1,         MMWORD PTR [%3]         ; ref bytes 0..7
    movq            xmm3,       MMWORD PTR [%3+8]       ; ref bytes 8..15
    movq            xmm2,       MMWORD PTR [%3+16]      ; ref bytes 16..23
    punpcklqdq      %1,         xmm3                    ; %1   = ref bytes 0..15
    punpcklqdq      xmm3,       xmm2                    ; xmm3 = ref bytes 8..23

    ; mpsadbw consumes its destination, so each reference half is duplicated
    ; before the two 4-byte source groups are matched against it.
    movdqa          xmm2,       %1
    movdqa          xmm4,       xmm3

    mpsadbw         %1,         xmm0,       0x0         ; src 0..3   vs ref windows 0..7
    mpsadbw         xmm2,       xmm0,       0x5         ; src 4..7   vs ref windows 4..11

    psrldq          xmm0,       8                       ; src 8..15 -> low qword

    mpsadbw         xmm3,       xmm0,       0x0         ; src 8..11  vs ref windows 8..15
    mpsadbw         xmm4,       xmm0,       0x5         ; src 12..15 vs ref windows 12..19

    paddw           %1,         xmm2
    paddw           %1,         xmm3
    paddw           %1,         xmm4
%endmacro

; PROCESS_16X2X8 first
;
; Handles two consecutive 16-pixel rows and advances both row pointers.
;   %1 != 0 : first invocation - xmm1 is initialised from the first row
;   %1 == 0 : xmm1 already holds running totals and is added to
; In:    rsi = source row, rax = source stride,
;        rdi = reference row, rdx = reference stride
; Out:   xmm1 = eight running 16-bit SAD totals (reference offsets 0..7);
;        rsi and rdi advanced by two rows
; Scratch: xmm0, xmm2, xmm3, xmm4, xmm5
%macro PROCESS_16X2X8 1
%if %1
    SAD_16X1X8      xmm1, rsi, rdi                      ; row 0 seeds the totals
%else
    SAD_16X1X8      xmm5, rsi, rdi                      ; row 0
    paddw           xmm1,       xmm5
%endif

    SAD_16X1X8      xmm5, rsi + rax, rdi + rdx          ; row 1
    paddw           xmm1,       xmm5

    lea             rsi,        [rsi+rax*2]             ; step both pointers two rows
    lea             rdi,        [rdi+rdx*2]
%endmacro
| 85 | |
; SAD_8X1X8 dst, src_row, ref_row
;
; Computes, for one row of 8 source pixels, the eight sums of absolute
; differences against the reference row at byte offsets 0..7 and leaves them
; as eight 16-bit words in dst.
;   %1 = xmm register receiving the eight word sums
;   %2 = address expression of the source row (8 bytes are read)
;   %3 = address expression of the reference row (16 bytes are read)
; Scratch: xmm0, xmm2, xmm3.
%macro SAD_8X1X8 3
    movq            xmm0,       MMWORD PTR [%2]         ; src bytes 0..7
    movq            %1,         MMWORD PTR [%3]         ; ref bytes 0..7
    movq            xmm3,       MMWORD PTR [%3+8]       ; ref bytes 8..15
    punpcklqdq      %1,         xmm3                    ; %1 = ref bytes 0..15

    movdqa          xmm2,       %1                      ; second copy for src 4..7
    mpsadbw         %1,         xmm0,       0x0         ; src 0..3 vs ref windows 0..7
    mpsadbw         xmm2,       xmm0,       0x5         ; src 4..7 vs ref windows 4..11
    paddw           %1,         xmm2
%endmacro

; PROCESS_8X2X8 first
;
; Handles two consecutive 8-pixel rows and advances both row pointers.
;   %1 != 0 : first invocation - xmm1 is initialised from the first row
;   %1 == 0 : xmm1 already holds running totals and is added to
; In:    rsi = source row, rax = source stride,
;        rdi = reference row, rdx = reference stride
; Out:   xmm1 = eight running 16-bit SAD totals (reference offsets 0..7);
;        rsi and rdi advanced by two rows
; Scratch: xmm0, xmm2, xmm3, xmm5
%macro PROCESS_8X2X8 1
%if %1
    SAD_8X1X8       xmm1, rsi, rdi                      ; row 0 seeds the totals
%else
    SAD_8X1X8       xmm5, rsi, rdi                      ; row 0
    paddw           xmm1,       xmm5
%endif

    SAD_8X1X8       xmm5, rsi + rax, rdi + rdx          ; row 1
    paddw           xmm1,       xmm5

    lea             rsi,        [rsi+rax*2]             ; step both pointers two rows
    lea             rdi,        [rdi+rdx*2]
%endmacro
| 125 | |
; SAD_4X1X8 dst, src_row, ref_row
;
; Computes, for one row of 4 source pixels, the eight sums of absolute
; differences against the reference row at byte offsets 0..7 and leaves them
; as eight 16-bit words in dst.
;   %1 = xmm register receiving the eight word sums
;   %2 = address expression of the source row (4 bytes are read)
;   %3 = address expression of the reference row (16 bytes are read)
; Scratch: xmm0, xmm3.
%macro SAD_4X1X8 3
    movd            xmm0,       [%2]                    ; src bytes 0..3
    movq            %1,         MMWORD PTR [%3]         ; ref bytes 0..7
    movq            xmm3,       MMWORD PTR [%3+8]       ; ref bytes 8..15
    punpcklqdq      %1,         xmm3                    ; %1 = ref bytes 0..15

    mpsadbw         %1,         xmm0,       0x0         ; src 0..3 vs ref windows 0..7
%endmacro

; PROCESS_4X2X8 first
;
; Handles two consecutive 4-pixel rows and advances both row pointers.
;   %1 != 0 : first invocation - xmm1 is initialised from the first row
;   %1 == 0 : xmm1 already holds running totals and is added to
; In:    rsi = source row, rax = source stride,
;        rdi = reference row, rdx = reference stride
; Out:   xmm1 = eight running 16-bit SAD totals (reference offsets 0..7);
;        rsi and rdi advanced by two rows
; Scratch: xmm0, xmm3, xmm5
%macro PROCESS_4X2X8 1
%if %1
    SAD_4X1X8       xmm1, rsi, rdi                      ; row 0 seeds the totals
%else
    SAD_4X1X8       xmm5, rsi, rdi                      ; row 0
    paddw           xmm1,       xmm5
%endif

    SAD_4X1X8       xmm5, rsi + rax, rdi + rdx          ; row 1
    paddw           xmm1,       xmm5

    lea             rsi,        [rsi+rax*2]             ; step both pointers two rows
    lea             rdi,        [rdi+rdx*2]
%endmacro
| 156 | |
; WRITE_AS_INTS
;
; Zero-extends the eight 16-bit totals in xmm1 to 32 bits each and stores
; them as 32 contiguous bytes at the address passed as argument 4.
; The stores use movdqa, so that address must be 16-byte aligned.
; In:    xmm1 = eight 16-bit totals
; Out:   [arg(4)] .. [arg(4)+31] = the eight totals as 32-bit values
; Modifies: rdi, xmm0, xmm1, xmm2
%macro WRITE_AS_INTS 0
    pxor            xmm0,       xmm0                    ; zero source for the widening
    movdqa          xmm2,       xmm1                    ; keep totals 4..7 for the high unpack
    punpcklwd       xmm1,       xmm0                    ; totals 0..3 -> dwords
    punpckhwd       xmm2,       xmm0                    ; totals 4..7 -> dwords

    mov             rdi,        arg(4)                  ; results pointer
    movdqa          [rdi],      xmm1
    movdqa          [rdi + 16], xmm2
%endmacro
| 167 | |
;void vp9_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array);
;
; Sums absolute differences between a 16-wide, 16-row source block and the
; reference block at each of the eight byte offsets ref_ptr+0 .. ref_ptr+7,
; then writes the eight totals through sad_array.
; Source rows are loaded with movdqa, so every source row must be 16-byte
; aligned.
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes, aligned
; stores), which does not match the "unsigned short" above - confirm against
; the C declaration.
; Directly modifies rax, rdx and xmm0-xmm5; rsi and rdi are saved/restored.
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)                  ; rsi = src_ptr
    mov             rdi,        arg(2)                  ; rdi = ref_ptr

    movsxd          rax,        dword ptr arg(1)        ; rax = src_stride, sign-extended
    movsxd          rdx,        dword ptr arg(3)        ; rdx = ref_stride, sign-extended

    ; 8 invocations x 2 rows = 16 rows; only the first seeds xmm1.
    PROCESS_16X2X8 1
%rep 7
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
| 206 | |
| 207 | |
;void vp9_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sums absolute differences between a 16-wide, 8-row source block and the
; reference block at each of the eight byte offsets ref_ptr+0 .. ref_ptr+7,
; then writes the eight totals through sad_array.
; Source rows are loaded with movdqa, so every source row must be 16-byte
; aligned.
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes, aligned
; stores), which does not match the "unsigned short" above - confirm against
; the C declaration.
; Directly modifies rax, rdx and xmm0-xmm5; rsi and rdi are saved/restored.
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)                  ; rsi = src_ptr
    mov             rdi,        arg(2)                  ; rdi = ref_ptr

    movsxd          rax,        dword ptr arg(1)        ; rax = src_stride, sign-extended
    movsxd          rdx,        dword ptr arg(3)        ; rdx = ref_stride, sign-extended

    ; 4 invocations x 2 rows = 8 rows; only the first seeds xmm1.
    PROCESS_16X2X8 1
%rep 3
    PROCESS_16X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
| 243 | |
| 244 | |
;void vp9_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sums absolute differences between an 8-wide, 8-row source block and the
; reference block at each of the eight byte offsets ref_ptr+0 .. ref_ptr+7,
; then writes the eight totals through sad_array.
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes, aligned
; stores), which does not match the "unsigned short" above - confirm against
; the C declaration.
; Directly modifies rax, rdx, xmm0-xmm3 and xmm5; rsi and rdi are
; saved/restored.
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)                  ; rsi = src_ptr
    mov             rdi,        arg(2)                  ; rdi = ref_ptr

    movsxd          rax,        dword ptr arg(1)        ; rax = src_stride, sign-extended
    movsxd          rdx,        dword ptr arg(3)        ; rdx = ref_stride, sign-extended

    ; 4 invocations x 2 rows = 8 rows; only the first seeds xmm1.
    PROCESS_8X2X8 1
%rep 3
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
| 280 | |
| 281 | |
;void vp9_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sums absolute differences between an 8-wide, 16-row source block and the
; reference block at each of the eight byte offsets ref_ptr+0 .. ref_ptr+7,
; then writes the eight totals through sad_array.
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes, aligned
; stores), which does not match the "unsigned short" above - confirm against
; the C declaration.
; Directly modifies rax, rdx, xmm0-xmm3 and xmm5; rsi and rdi are
; saved/restored.
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)                  ; rsi = src_ptr
    mov             rdi,        arg(2)                  ; rdi = ref_ptr

    movsxd          rax,        dword ptr arg(1)        ; rax = src_stride, sign-extended
    movsxd          rdx,        dword ptr arg(3)        ; rdx = ref_stride, sign-extended

    ; 8 invocations x 2 rows = 16 rows; only the first seeds xmm1.
    PROCESS_8X2X8 1
%rep 7
    PROCESS_8X2X8 0
%endrep

    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
| 321 | |
| 322 | |
;void vp9_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned short *sad_array
;);
;
; Sums absolute differences between a 4-wide, 4-row source block and the
; reference block at each of the eight byte offsets ref_ptr+0 .. ref_ptr+7,
; then writes the eight totals through sad_array.
; NOTE(review): WRITE_AS_INTS stores eight 32-bit values (32 bytes, aligned
; stores), which does not match the "unsigned short" above - confirm against
; the C declaration.
; Directly modifies rax, rdx, xmm0-xmm3 and xmm5; rsi and rdi are
; saved/restored.
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)                  ; rsi = src_ptr
    mov             rdi,        arg(2)                  ; rdi = ref_ptr

    movsxd          rax,        dword ptr arg(1)        ; rax = src_stride, sign-extended
    movsxd          rdx,        dword ptr arg(3)        ; rdx = ref_stride, sign-extended

    ; 2 invocations x 2 rows = 4 rows; only the first seeds xmm1.
    PROCESS_4X2X8 1
    PROCESS_4X2X8 0

    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
| 356 | |
| 357 | |
| 358 | |
| 359 | |
| OLD | NEW |