OLD | NEW |
(Empty) | |
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
;-----------------------------------------------------------------------------
; void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Writes a 4x4 block of 16-bit differences: diff = z - Predictor.
; Every row loads 4 bytes from z and 4 bytes from Predictor, widens both to
; words by interleaving with a zero register, subtracts, and stores 4 words
; (8 bytes) to diff.
;
; Addresses used for row r (r = 0..3):
;   source      : z         + r * src_stride        (bytes)
;   predictor   : Predictor + r * pitch             (bytes)
;   destination : diff      + r * pitch * 2 bytes   (i.e. r * pitch shorts)
;
; Register roles inside the body:
;   rsi = source pointer        rdx = src_stride (sign-extended)
;   rax = predictor pointer     rcx = pitch, later 3 * pitch
;   rdi = diff pointer          mm7 = zero, mm0 / mm1 = scratch
;
; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in this
; file; presumably they come from vpx_ports/x86_abi_support.asm.
; NOTE(review): no emms is issued before returning; presumably the caller is
; responsible for clearing MMX state -- confirm.
;-----------------------------------------------------------------------------

; One 4-pixel row: %3 (4 words) = widen(%1) - widen(%2).
;   %1 = source memory operand, %2 = predictor memory operand,
;   %3 = destination memory operand.  Expects mm7 == 0; clobbers mm0, mm1.
%macro SUBTRACT_B_ROW 3
        movd        mm0,    %1
        movd        mm1,    %2
        punpcklbw   mm0,    mm7             ; bytes -> words (high bytes = 0)
        punpcklbw   mm1,    mm7
        psubw       mm0,    mm1
        movq        %3,     mm0
%endmacro

global sym(vp9_subtract_b_mmx_impl)
sym(vp9_subtract_b_mmx_impl):
        push        rbp
        mov         rbp,    rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

        mov         rdi,    arg(2)              ; diff
        mov         rax,    arg(3)              ; Predictor
        mov         rsi,    arg(0)              ; z
        movsxd      rdx,    dword ptr arg(1)    ; src_stride
        movsxd      rcx,    dword ptr arg(4)    ; pitch
        pxor        mm7,    mm7                 ; zero for byte->word widening

        SUBTRACT_B_ROW [rsi],       [rax],       [rdi]          ; row 0
        SUBTRACT_B_ROW [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]    ; row 1
        SUBTRACT_B_ROW [rsi+rdx*2], [rax+rcx*2], [rdi+rcx*4]    ; row 2

        ; A scale of 3 cannot be encoded directly, so re-base for row 3:
        ; rsi moves down two source rows and rcx becomes 3 * pitch.
        lea         rsi,    [rsi+rdx*2]
        lea         rcx,    [rcx+rcx*2]

        SUBTRACT_B_ROW [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]    ; row 3

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
| 75 |
;-----------------------------------------------------------------------------
; void vp9_subtract_mby_mmx(short *diff, unsigned char *src,
;                           unsigned char *pred, int stride)
;
; Writes 16 rows of 16 word differences: diff = src - pred.
; Per row the routine reads 16 bytes from src and 16 bytes from pred, widens
; them to words by interleaving with a zero register, subtracts, and stores
; 16 words (32 bytes) to diff.
;
; Per-row pointer advance:
;   src  += stride   (bytes)
;   pred += 16       (bytes)
;   diff += 32       (bytes, i.e. 16 shorts)
;
; Register roles inside the loop:
;   rsi = src row       rax = pred row      rdi = diff row
;   rdx = stride        rcx = rows remaining (counts 16 down to 0)
;   mm0 = zero          mm1..mm4 = scratch
;
; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in this
; file; presumably they come from vpx_ports/x86_abi_support.asm.
; NOTE(review): no emms is issued before returning; presumably the caller is
; responsible for clearing MMX state -- confirm.
;-----------------------------------------------------------------------------

; Eight pixels: %3 = low four word differences, %4 = high four.
;   %1 = source memory operand (8 bytes), %2 = predictor memory operand,
;   %3 / %4 = destination memory operands.
; Expects mm0 == 0; clobbers mm1, mm2, mm3, mm4.
%macro SUBTRACT_MBY_8PIX 4
        movq        mm1,    %1
        movq        mm3,    %2

        movq        mm2,    mm1             ; keep copies for the high halves
        movq        mm4,    mm3

        punpcklbw   mm1,    mm0             ; low 4 bytes  -> words
        punpcklbw   mm3,    mm0

        punpckhbw   mm2,    mm0             ; high 4 bytes -> words
        punpckhbw   mm4,    mm0

        psubw       mm1,    mm3
        psubw       mm2,    mm4

        movq        %3,     mm1
        movq        %4,     mm2
%endmacro

global sym(vp9_subtract_mby_mmx)
sym(vp9_subtract_mby_mmx):
        push        rbp
        mov         rbp,    rsp
        SHADOW_ARGS_TO_STACK 4
        push        rsi
        push        rdi
        ; end prolog

        mov         rsi,    arg(1)              ; src
        mov         rdi,    arg(0)              ; diff

        mov         rax,    arg(2)              ; pred
        movsxd      rdx,    dword ptr arg(3)    ; stride

        mov         rcx,    16                  ; row counter
        pxor        mm0,    mm0                 ; zero for byte->word widening

.next_row:
        SUBTRACT_MBY_8PIX [rsi],   [rax],   [rdi],    [rdi+8]     ; pixels 0..7
        SUBTRACT_MBY_8PIX [rsi+8], [rax+8], [rdi+16], [rdi+24]    ; pixels 8..15

        add         rdi,    32                  ; next diff row (16 shorts)
        add         rax,    16                  ; next pred row

        lea         rsi,    [rsi+rdx]           ; next src row

        sub         rcx,    1
        jnz         .next_row

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
| 150 |
| 151 |
;-----------------------------------------------------------------------------
; void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, unsigned char *pred,
;                            int stride)
;
; Writes the word differences (source - prediction) for two 8x8 planes:
;
;   U plane: source usrc, prediction pred + 256, output diff + 256 (shorts)
;   V plane: source vsrc, prediction pred + 320, output diff + 320 (shorts)
;
; C-style view of the pointers used for each pass:
;   short         *udiff = diff + 256;     short         *vdiff = diff + 320;
;   unsigned char *upred = pred + 256;     unsigned char *vpred = pred + 320;
;
; Within a plane, row r (r = 0..7) uses:
;   source     : src   + r * stride   (bytes)
;   prediction : ppred + r * 8        (bytes, rows are contiguous)
;   output     : pdiff + r * 16 bytes (8 shorts, rows are contiguous)
;
; Register roles inside a plane pass:
;   rsi = source pointer    rax = prediction pointer    rdi = diff pointer
;   rdx = stride            mm7 = zero                  mm0/1/3/4 = scratch
;
; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in this
; file; presumably they come from vpx_ports/x86_abi_support.asm.
; NOTE(review): no emms is issued before returning; presumably the caller is
; responsible for clearing MMX state -- confirm.
;-----------------------------------------------------------------------------

; One 8-pixel row: %3 = low four word differences, %4 = high four.
;   %1 = source memory operand (8 bytes), %2 = prediction memory operand,
;   %3 / %4 = destination memory operands.
; Expects mm7 == 0; clobbers mm0, mm1, mm3, mm4.
%macro SUBTRACT_MBUV_ROW 4
        movq        mm0,    %1
        movq        mm1,    %2
        movq        mm3,    mm0             ; keep copies for the high halves
        movq        mm4,    mm1
        punpcklbw   mm0,    mm7             ; low 4 bytes  -> words
        punpcklbw   mm1,    mm7
        punpckhbw   mm3,    mm7             ; high 4 bytes -> words
        punpckhbw   mm4,    mm7
        psubw       mm0,    mm1
        psubw       mm3,    mm4
        movq        %3,     mm0
        movq        %4,     mm3
%endmacro

; Four consecutive rows starting at rsi / rax / rdi.
; On exit rsi has advanced by 2 * stride (rdx); rax and rdi are unchanged.
%macro SUBTRACT_MBUV_4ROWS 0
        SUBTRACT_MBUV_ROW [rsi],       [rax],    [rdi],    [rdi+8]
        SUBTRACT_MBUV_ROW [rsi+rdx],   [rax+8],  [rdi+16], [rdi+24]
        SUBTRACT_MBUV_ROW [rsi+rdx*2], [rax+16], [rdi+32], [rdi+40]
        ; A scale of 3 cannot be encoded, so step rsi down two rows and
        ; address the fourth row as rsi + stride.
        lea         rsi,    [rsi+rdx*2]
        SUBTRACT_MBUV_ROW [rsi+rdx],   [rax+24], [rdi+48], [rdi+56]
%endmacro

; One complete 8x8 plane.
;   %1 = index of the source-plane argument (1 = usrc, 2 = vsrc)
;   %2 = offset, in elements, of this plane inside both diff and pred
; Each pass reloads every pointer, the stride and the zero register from the
; arguments, so it does not depend on state left behind by a previous pass.
%macro SUBTRACT_MBUV_PLANE 2
        mov         rdi,    arg(0)              ; diff
        mov         rax,    arg(3)              ; pred
        mov         rsi,    arg(%1)             ; z = source plane
        add         rdi,    %2*2                ; diff = diff + %2 (shorts)
        add         rax,    %2                  ; Predictor = pred + %2
        movsxd      rdx,    dword ptr arg(4)    ; stride
        pxor        mm7,    mm7                 ; zero for byte->word widening

        SUBTRACT_MBUV_4ROWS                     ; rows 0..3

        add         rdi,    64                  ; 4 rows * 8 shorts * 2 bytes
        add         rax,    32                  ; 4 rows * 8 bytes
        lea         rsi,    [rsi+rdx*2]         ; rsi was at row 2 -> row 4

        SUBTRACT_MBUV_4ROWS                     ; rows 4..7
%endmacro

global sym(vp9_subtract_mbuv_mmx)
sym(vp9_subtract_mbuv_mmx):
        push        rbp
        mov         rbp,    rsp
        SHADOW_ARGS_TO_STACK 5
        push        rsi
        push        rdi
        ; end prolog

        ; U plane: z = usrc, diff = udiff, Predictor = upred
        SUBTRACT_MBUV_PLANE 1, 256

        ; V plane: z = vsrc, diff = vdiff, Predictor = vpred
        SUBTRACT_MBUV_PLANE 2, 320

        ; begin epilog
        pop         rdi
        pop         rsi
        UNSHADOW_ARGS
        pop         rbp
        ret
OLD | NEW |