;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
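; Returns the sum of squared differences between the 16 coefficients at
; coeff_ptr and the 16 dequantized coefficients at dcoef_ptr (one 4x4 block).
; Rough C sketch of the intended result, for reference only (not taken from
; this file):
;   int err = 0, i, d;
;   for (i = 0; i < 16; i++) {
;     d = coeff_ptr[i] - dcoef_ptr[i];
;     err += d * d;
;   }
;   return err;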
global sym(vp9_block_error_xmm)
sym(vp9_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prologue

    mov         rsi, arg(0) ;coeff_ptr
    mov         rdi, arg(1) ;dcoef_ptr

    ; load all 16 coefficients of the block from both buffers
    movdqa      xmm0, [rsi]
    movdqa      xmm1, [rdi]

    movdqa      xmm2, [rsi+16]
    movdqa      xmm3, [rdi+16]

    ; differences, then sums of squared pairs (pmaddwd) in the dword lanes
    psubw       xmm0, xmm1
    psubw       xmm2, xmm3

    pmaddwd     xmm0, xmm0
    pmaddwd     xmm2, xmm2

    paddd       xmm0, xmm2

    ; fold the four 32-bit partial sums into a single total
    pxor        xmm5, xmm5
    movdqa      xmm1, xmm0

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5

    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    psrldq      xmm0, 8
    paddd       xmm0, xmm1

    movq        rax, xmm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
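; MMX version of the same 16-coefficient sum of squared differences. The mm1
; mask below appears to be a leftover of a variant that could skip the DC
; coefficient; with dc fixed at zero (see the inline comment) the mask is all
; ones and no coefficient is excluded.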
global sym(vp9_block_error_mmx)
sym(vp9_block_error_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;coeff_ptr
    pxor        mm7, mm7

    mov         rdi, arg(1) ;dcoef_ptr
    movq        mm3, [rsi]

    movq        mm4, [rdi]
    movq        mm5, [rsi+8]

    movq        mm6, [rdi+8]
    pxor        mm1, mm1 ; from movd mm1, dc ; dc = 0

    movq        mm2, mm7
    psubw       mm5, mm6

    por         mm1, mm2
    pmaddwd     mm5, mm5

    pcmpeqw     mm1, mm7 ; mm1 is zero, so this yields an all-ones mask
    psubw       mm3, mm4

    pand        mm1, mm3
    pmaddwd     mm1, mm1

    paddd       mm1, mm5
    movq        mm3, [rsi+16]

    movq        mm4, [rdi+16]
    movq        mm5, [rsi+24]

    movq        mm6, [rdi+24]
    psubw       mm5, mm6

    pmaddwd     mm5, mm5
    psubw       mm3, mm4

    pmaddwd     mm3, mm3
    paddd       mm3, mm5

    paddd       mm1, mm3
    movq        mm0, mm1

    ; fold the two 32-bit partial sums into the low dword
    psrlq       mm1, 32
    paddd       mm0, mm1

    movq        rax, mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp9_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
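; Accumulates the squared coefficient differences for 16 contiguous 4x4
; blocks of a macroblock (16 blocks x 16 coefficients). A nonzero dc argument
; masks the first (DC) coefficient of each block out of the sum. Rough C
; sketch, assuming dc is 0 or 1, for reference only:
;   int err = 0, i, j, d;
;   for (i = 0; i < 16; i++)
;     for (j = dc; j < 16; j++) {
;       d = coeff_ptr[i * 16 + j] - dcoef_ptr[i * 16 + j];
;       err += d * d;
;     }
;   return err;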
global sym(vp9_mbblock_error_mmx_impl)
sym(vp9_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;coeff_ptr
    pxor        mm7, mm7

    mov         rdi, arg(1) ;dcoef_ptr
    pxor        mm2, mm2

    ; build the word mask: all ones when dc == 0, DC lane cleared when dc != 0
    movd        mm1, dword ptr arg(2) ;dc
    por         mm1, mm2

    pcmpeqw     mm1, mm7
    mov         rcx, 16

.mberror_loop_mmx:
    ; one 16-coefficient block (32 bytes) per iteration
    movq        mm3, [rsi]
    movq        mm4, [rdi]

    movq        mm5, [rsi+8]
    movq        mm6, [rdi+8]

    psubw       mm5, mm6
    pmaddwd     mm5, mm5

    psubw       mm3, mm4
    pand        mm3, mm1

    pmaddwd     mm3, mm3
    paddd       mm2, mm5

    paddd       mm2, mm3
    movq        mm3, [rsi+16]

    movq        mm4, [rdi+16]
    movq        mm5, [rsi+24]

    movq        mm6, [rdi+24]
    psubw       mm5, mm6

    pmaddwd     mm5, mm5
    psubw       mm3, mm4

    pmaddwd     mm3, mm3
    paddd       mm2, mm5

    paddd       mm2, mm3
    add         rsi, 32

    add         rdi, 32
    sub         rcx, 1

    jnz         .mberror_loop_mmx

    movq        mm0, mm2
    psrlq       mm2, 32

    paddd       mm0, mm2
    movq        rax, mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp9_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
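; SSE2 counterpart of vp9_mbblock_error_mmx_impl above: the same 16-block sum
; of squared differences, with xmm5 holding the dc-derived mask (all ones when
; dc == 0, DC lane cleared otherwise) and xmm4 accumulating the partial sums.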
global sym(vp9_mbblock_error_xmm_impl)
sym(vp9_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;coeff_ptr
    pxor        xmm6, xmm6

    mov         rdi, arg(1) ;dcoef_ptr
    pxor        xmm4, xmm4

    movd        xmm5, dword ptr arg(2) ;dc
    por         xmm5, xmm4

    pcmpeqw     xmm5, xmm6
    mov         rcx, 16

.mberror_loop:
    movdqa      xmm0, [rsi]
    movdqa      xmm1, [rdi]

    movdqa      xmm2, [rsi+16]
    movdqa      xmm3, [rdi+16]

    psubw       xmm2, xmm3
    pmaddwd     xmm2, xmm2

    psubw       xmm0, xmm1
    pand        xmm0, xmm5

    pmaddwd     xmm0, xmm0
    add         rsi, 32

    add         rdi, 32

    sub         rcx, 1
    paddd       xmm4, xmm2

    paddd       xmm4, xmm0
    jnz         .mberror_loop

    movdqa      xmm0, xmm4
    punpckldq   xmm0, xmm6

    punpckhdq   xmm4, xmm6
    paddd       xmm0, xmm4

    movdqa      xmm1, xmm0
    psrldq      xmm0, 8

    paddd       xmm0, xmm1
    movq        rax, xmm0

    pop rdi
    pop rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp9_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
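; Sum of squared differences over a macroblock's chroma (U and V)
; coefficients: 16 iterations x 8 shorts = 128 coefficients. Rough C sketch,
; for reference only:
;   int err = 0, i, d;
;   for (i = 0; i < 128; i++) {
;     d = s_ptr[i] - d_ptr[i];
;     err += d * d;
;   }
;   return err;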
global sym(vp9_mbuverror_mmx_impl)
sym(vp9_mbuverror_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;s_ptr
    mov         rdi, arg(1) ;d_ptr

    mov         rcx, 16
    pxor        mm7, mm7

.mbuverror_loop_mmx:
    movq        mm1, [rsi]
    movq        mm2, [rdi]

    psubw       mm1, mm2
    pmaddwd     mm1, mm1

    movq        mm3, [rsi+8]
    movq        mm4, [rdi+8]

    psubw       mm3, mm4
    pmaddwd     mm3, mm3

    paddd       mm7, mm1
    paddd       mm7, mm3

    add         rsi, 16
    add         rdi, 16

    dec         rcx
    jnz         .mbuverror_loop_mmx

    movq        mm0, mm7
    psrlq       mm7, 32

    paddd       mm0, mm7
    movq        rax, mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp9_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
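; SSE2 counterpart of vp9_mbuverror_mmx_impl above: processes 8 coefficients
; per iteration, accumulating squared differences in xmm3 and folding the four
; 32-bit partial sums into the return value at the end.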
global sym(vp9_mbuverror_xmm_impl)
sym(vp9_mbuverror_xmm_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0) ;s_ptr
    mov         rdi, arg(1) ;d_ptr

    mov         rcx, 16
    pxor        xmm3, xmm3

.mbuverror_loop:
    movdqa      xmm1, [rsi]
    movdqa      xmm2, [rdi]

    psubw       xmm1, xmm2
    pmaddwd     xmm1, xmm1

    paddd       xmm3, xmm1

    add         rsi, 16
    add         rdi, 16

    dec         rcx
    jnz         .mbuverror_loop

    pxor        xmm0, xmm0
    movdqa      xmm1, xmm3

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    paddd       xmm1, xmm2

    movdqa      xmm2, xmm1

    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    movq        rax, xmm1

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret