OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
;void vp8_makemask_sse3(
;    unsigned char *y,
;    unsigned char *u,
;    unsigned char *v,
;    unsigned char *ym,
;    unsigned char *uvm,
;    int yp,
;    int uvp,
;    int ys,
;    int us,
;    int vs,
;    int yt,
;    int ut,
;    int vt)
;
; Builds a 16x16 byte mask at ym.  A mask byte is 0xFF when the y sample and
; the co-located u and v samples are all strictly closer to their centre
; values than the matching tolerance (|sample - centre| < tolerance, unsigned),
; and 0x00 otherwise.  Each loop pass handles two y rows and one u/v row of
; 8 samples; every u/v decision is duplicated horizontally and applied to
; both y rows.
;
; Arguments as actually read by the code:
;   arg(0) y pointer        arg(1) u pointer        arg(2) v pointer
;   arg(3) ym (output, written with aligned 16-byte stores, 256 bytes total)
;   arg(4) y row stride     arg(5) u/v row stride
;   arg(6) y centre         arg(7) u centre         arg(8) v centre
;   arg(9) y tolerance      arg(10) u tolerance     arg(11) v tolerance
;
; NOTE(review): the prototype above lists uvm as the fifth parameter, which
; would shift every later argument index by one relative to what the code
; reads, and uvm is never written here.  Confirm against the C declaration
; and the caller which of the two is correct.
;
; NOTE(review): pshufb is an SSSE3 instruction despite the _sse3 suffix.
;
; Clobbers: rax, rcx, rdx, r8, xmm0-xmm11, flags.  rbx, rsi and rdi are saved
; and restored.  xmm6-xmm11 are not preserved.
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 14
    push        rsi
    push        rdi
    push        rbx                     ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi, arg(0)             ; y
    mov         rdi, arg(1)             ; u
    mov         rcx, arg(2)             ; v
    mov         rax, arg(3)             ; ym
    movsxd      rbx, dword arg(4)       ; y row stride
    movsxd      rdx, dword arg(5)       ; u/v row stride

    pxor        xmm0, xmm0              ; all-zero pshufb control: broadcast byte 0

    ; xmm1 = 16 copies of the centre y value
    movd        xmm1, arg(6)
    pshufb      xmm1, xmm0

    ; xmm2 = 8 copies of the centre u value | 8 copies of the centre v value
    movd        xmm2, arg(7)
    pshufb      xmm2, xmm0
    movd        xmm3, arg(8)
    pshufb      xmm3, xmm0
    unpcklpd    xmm2, xmm3

    ; xmm3 = 16 copies of the y tolerance
    movd        xmm3, arg(9)
    pshufb      xmm3, xmm0

    ; xmm4 = 8 copies of the u tolerance | 8 copies of the v tolerance
    movd        xmm4, arg(10)
    pshufb      xmm4, xmm0
    movd        xmm5, arg(11)
    pshufb      xmm5, xmm0
    unpckhpd    xmm4, xmm5

    pxor        xmm5, xmm5              ; xmm5 = 0, reference for the compares below

    mov         r8, 8                   ; 8 passes x 2 y rows = 16 rows

.next_pair_of_rows:

    ; ---- y row 0: xmm0 = |y - centre| (unsigned) ----
    movdqu      xmm0, [rsi]
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1
    psubusb     xmm6, xmm7
    por         xmm0, xmm6

    ; Unsigned tolerance test.  tolerance -sat diff is non-zero exactly where
    ; tolerance > diff, so comparing with zero yields 0xFF where the test
    ; FAILS.  (A signed pcmpgtb would misjudge differences of 128 or more.)
    movdqa      xmm6, xmm3
    psubusb     xmm6, xmm0
    pcmpeqb     xmm6, xmm5              ; xmm6 = row 0 fail mask

    ; ---- y row 1 ----
    add         rsi, rbx
    movdqu      xmm0, [rsi]
    movdqa      xmm11, xmm1
    movdqa      xmm7, xmm0
    psubusb     xmm0, xmm1
    psubusb     xmm11, xmm7
    por         xmm0, xmm11

    movdqa      xmm11, xmm3
    psubusb     xmm11, xmm0
    pcmpeqb     xmm11, xmm5             ; xmm11 = row 1 fail mask

    ; ---- u/v row: only 8 samples of each plane are used, so load 8 ----
    movq        xmm7, [rdi]
    movq        xmm8, [rcx]
    unpcklpd    xmm7, xmm8              ; xmm7 = u[0..7] | v[0..7]

    ; xmm7 = |uv - centre| (unsigned)
    movdqa      xmm9, xmm2
    movdqa      xmm10, xmm7
    psubusb     xmm7, xmm2
    psubusb     xmm9, xmm10
    por         xmm7, xmm9

    ; xmm0 = u fail mask (low 8 bytes) | v fail mask (high 8 bytes)
    movdqa      xmm0, xmm4
    psubusb     xmm0, xmm7
    pcmpeqb     xmm0, xmm5

    ; duplicate each u and v decision so it covers two y columns
    movdqa      xmm8, xmm0
    punpckhbw   xmm0, xmm0              ; xmm0 = v fail mask, 16 wide
    punpcklbw   xmm8, xmm8              ; xmm8 = u fail mask, 16 wide

    ; row 0: pass only where none of y, u, v failed
    por         xmm6, xmm8
    por         xmm6, xmm0
    pcmpeqb     xmm6, xmm5              ; invert: 0x00 -> 0xFF, 0xFF -> 0x00
    movdqa      [rax], xmm6

    ; row 1
    por         xmm11, xmm8
    por         xmm11, xmm0
    pcmpeqb     xmm11, xmm5
    movdqa      [rax+16], xmm11

    ; advance to the next pair of y rows / next u and v row
    add         rsi, rbx
    add         rdi, rdx
    add         rcx, rdx
    add         rax, 32
    dec         r8
    jnz         .next_pair_of_rows

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 153 |
;GROW_HORIZ (result register, source register or 16-byte memory operand)
; result = source | (source shifted up one byte) | (source shifted down one byte)
; i.e. every set byte also sets its left and right neighbours in the row.
; A memory source is read with movdqa.  Clobbers xmm14 and xmm15.
%macro GROW_HORIZ 2
    movdqa      %1, %2
    movdqa      xmm15, %1
    movdqa      xmm14, %1
    psrldq      xmm15, 1
    pslldq      xmm14, 1
    por         %1, xmm15
    por         %1, xmm14
%endmacro

;GROW_VERT (result, center row, above row, below row)
; result = center | above | below
%macro GROW_VERT 4
    movdqa      %1, %2
    por         %1, %4
    por         %1, %3
%endmacro

;GROW_NEXTLINE (register receiving the new grown line, new source, destination)
; Horizontally grows the new source line into %1, then ORs the three lines
; currently held in xmm0, xmm1 and xmm2 and writes the result to %3.
; Clobbers xmm3, xmm14 and xmm15.
%macro GROW_NEXTLINE 3
    GROW_HORIZ  %1, %2
    movdqa      xmm3, xmm0              ; same as GROW_VERT xmm3, xmm0, xmm1, xmm2
    por         xmm3, xmm1
    por         xmm3, xmm2
    movdqa      %3, xmm3
%endmacro
| 179 |
| 180 |
;void vp8_growmaskmb_sse3(
;    unsigned char *om,
;    unsigned char *nm)
;
; Reads a 16x16 byte mask from om (16 rows of 16 bytes, 16 bytes apart) and
; writes a grown copy to nm: every output row is the OR of the horizontally
; grown source row and its horizontally grown neighbours above and below.
; The first and last rows only have one vertical neighbour.
; All loads and stores are movdqa.
;
; Clobbers: xmm0-xmm3, xmm14, xmm15.  rsi and rdi are saved and restored.
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; om: source mask
    mov         rdi, arg(1)             ; nm: grown mask

    ; prime the three-row window with source rows 0, 1 and 2
    GROW_HORIZ  xmm0, [rsi + 16*0]
    GROW_HORIZ  xmm1, [rsi + 16*1]
    GROW_HORIZ  xmm2, [rsi + 16*2]

    ; output row 1 = rows 0 | 1 | 2
    GROW_VERT   xmm3, xmm0, xmm1, xmm2
    movdqa      [rdi + 16*1], xmm3

    ; output row 0 = rows 0 | 1 (there is no row above)
    por         xmm0, xmm1
    movdqa      [rdi + 16*0], xmm0

    ; slide the window down: each step replaces the oldest row in
    ; xmm0/xmm1/xmm2 with the next source row and emits the middle row
    GROW_NEXTLINE xmm0, [rsi + 16*3],  [rdi + 16*2]
    GROW_NEXTLINE xmm1, [rsi + 16*4],  [rdi + 16*3]
    GROW_NEXTLINE xmm2, [rsi + 16*5],  [rdi + 16*4]
    GROW_NEXTLINE xmm0, [rsi + 16*6],  [rdi + 16*5]
    GROW_NEXTLINE xmm1, [rsi + 16*7],  [rdi + 16*6]
    GROW_NEXTLINE xmm2, [rsi + 16*8],  [rdi + 16*7]
    GROW_NEXTLINE xmm0, [rsi + 16*9],  [rdi + 16*8]
    GROW_NEXTLINE xmm1, [rsi + 16*10], [rdi + 16*9]
    GROW_NEXTLINE xmm2, [rsi + 16*11], [rdi + 16*10]
    GROW_NEXTLINE xmm0, [rsi + 16*12], [rdi + 16*11]
    GROW_NEXTLINE xmm1, [rsi + 16*13], [rdi + 16*12]
    GROW_NEXTLINE xmm2, [rsi + 16*14], [rdi + 16*13]
    GROW_NEXTLINE xmm0, [rsi + 16*15], [rdi + 16*14]

    ; output row 15 = rows 15 | 14 (there is no row below)
    por         xmm0, xmm2
    movdqa      [rdi + 16*15], xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 228 |
| 229 |
| 230 |
;unsigned int vp8_sad16x16_masked_wmt(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block after ANDing both the
; source and the reference rows with the mask, so bytes whose mask is 0x00
; contribute nothing.  The mask is read as 16 consecutive 16-byte rows.
; The total is returned in rax.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm4, flags.  rbx, rsi and rdi are saved and
; restored.
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                     ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi, arg(0)             ; src_ptr
    mov         rdi, arg(2)             ; ref_ptr
    mov         rbx, arg(4)             ; mask
    movsxd      rax, dword ptr arg(1)   ; src_stride
    movsxd      rdx, dword ptr arg(3)   ; ref_stride

    mov         rcx, 16                 ; rows remaining

    pxor        xmm3, xmm3              ; running SAD, one partial sum per qword

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]
    pand        xmm0, xmm2              ; zero the bytes the mask excludes
    pand        xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         rbx, 16

    dec         rcx
    jnz         .next_row

    ; fold the high-qword partial sum into the low one and return it
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 283 |
| 284 |
;unsigned int vp8_sad16x16_unmasked_wmt(
;    unsigned char *src_ptr,
;    int src_stride,
;    unsigned char *ref_ptr,
;    int ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block after ORing both the source
; and the reference rows with the mask.  Bytes whose mask is 0xFF become 0xFF
; on both sides and therefore contribute nothing; bytes whose mask is 0x00
; are compared unchanged.  The mask is read as 16 consecutive 16-byte rows.
; The total is returned in rax.
;
; Clobbers: rax, rcx, rdx, xmm0-xmm4, flags.  rbx, rsi and rdi are saved and
; restored.
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                     ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi, arg(0)             ; src_ptr
    mov         rdi, arg(2)             ; ref_ptr
    mov         rbx, arg(4)             ; mask
    movsxd      rax, dword ptr arg(1)   ; src_stride
    movsxd      rdx, dword ptr arg(3)   ; ref_stride

    mov         rcx, 16                 ; rows remaining

    pxor        xmm3, xmm3              ; running SAD, one partial sum per qword

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]
    por         xmm0, xmm2              ; force masked bytes equal on both sides
    por         xmm1, xmm2

    psadbw      xmm0, xmm1
    paddw       xmm3, xmm0

    add         rsi, rax
    add         rdi, rdx
    add         rbx, 16

    dec         rcx
    jnz         .next_row

    ; fold the high-qword partial sum into the low one and return it
    movdqa      xmm4, xmm3
    psrldq      xmm4, 8
    paddw       xmm3, xmm4
    movq        rax, xmm3

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 337 |
| 338 |
;unsigned int vp8_masked_predictor_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int src_stride,
;    unsigned char *dst_ptr,
;    int dst_stride,
;    unsigned char *mask)
;
; Blends two 16x16 predictors into dst_ptr: for every byte,
;     dst = (masked & mask) | (unmasked & ~mask)
; Both predictors are stepped by src_stride, the destination by dst_stride,
; and the mask is read as 16 consecutive 16-byte rows.
;
; NOTE(review): the prototype says unsigned int, but no result is computed;
; rax still holds src_stride on return.
;
; Clobbers: rax, rcx, rdx, r11, xmm0-xmm2, flags.  rbx, rsi and rdi are saved
; and restored.
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                     ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi, arg(0)             ; masked predictor
    mov         rdi, arg(1)             ; unmasked predictor
    mov         rbx, arg(5)             ; mask
    movsxd      rax, dword ptr arg(2)   ; src_stride
    mov         r11, arg(3)             ; dst_ptr
    movsxd      rdx, dword ptr arg(4)   ; dst_stride

    mov         rcx, 16                 ; rows remaining

.next_row:
    movdqu      xmm0, [rsi]
    movdqu      xmm1, [rdi]
    movdqu      xmm2, [rbx]

    pand        xmm0, xmm2              ; masked   &  mask
    pandn       xmm2, xmm1              ; unmasked & ~mask
    por         xmm0, xmm2
    movdqu      [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    add         rdi, rax                ; both predictors advance by src_stride
    add         rbx, 16

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 390 |
;unsigned int vp8_masked_predictor_uv_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int src_stride,
;    unsigned char *dst_ptr,
;    int dst_stride,
;    unsigned char *mask)
;
; Blends two 8x8 predictors into dst_ptr: for every byte,
;     dst = (masked & mask) | (unmasked & ~mask)
; Both predictors are stepped by src_stride, the destination by dst_stride,
; and the mask is read as 8 consecutive 8-byte rows.
;
; NOTE(review): the prototype says unsigned int, but no result is computed;
; rax still holds src_stride on return.
;
; Clobbers: rax, rcx, rdx, r11, xmm0-xmm2, flags.  rbx, rsi and rdi are saved
; and restored.
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                     ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi, arg(0)             ; masked predictor
    mov         rdi, arg(1)             ; unmasked predictor
    mov         rbx, arg(5)             ; mask
    movsxd      rax, dword ptr arg(2)   ; src_stride
    mov         r11, arg(3)             ; dst_ptr
    movsxd      rdx, dword ptr arg(4)   ; dst_stride

    mov         rcx, 8                  ; rows remaining

.next_row:
    movq        xmm0, [rsi]
    movq        xmm1, [rdi]
    movq        xmm2, [rbx]

    pand        xmm0, xmm2              ; masked   &  mask
    pandn       xmm2, xmm1              ; unmasked & ~mask
    por         xmm0, xmm2
    movq        [r11], xmm0

    add         r11, rdx
    add         rsi, rax
    add         rdi, rax
    add         rbx, 8

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 442 |
| 443 |
;unsigned int vp8_uv_from_y_mask(
;    unsigned char *ymask,
;    unsigned char *uvmask)
;
; Derives an 8x8 mask from a 16x16 mask by subsampling: from every second
; 16-byte row of ymask the even-indexed bytes (0, 2, ..., 14) are gathered
; with pshufb and stored as one 8-byte row of uvmask.
;
; NOTE(review): the prototype says unsigned int, but rax is never written.
;
; Clobbers: rcx, xmm0, xmm1, flags.  rsi and rdi are saved and restored.
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ; ymask  (source)
    mov         rdi, arg(1)             ; uvmask (destination)

    mov         rcx, 8                  ; output rows remaining

    ; Loop-invariant shuffle control, addressed through GLOBAL() so the
    ; reference is position independent.  shuf1b is 16-byte aligned.
    movdqa      xmm1, [GLOBAL(shuf1b)]

.next_row:
    movdqu      xmm0, [rsi]
    pshufb      xmm0, xmm1              ; pack even-indexed bytes into the low qword
    movq        [rdi], xmm0
    add         rdi, 8
    add         rsi, 32                 ; skip one source row

    dec         rcx
    jnz         .next_row

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
| 479 |
| 480 SECTION_RODATA |
| 481 align 16 |
| 482 shuf1b: |
| 483 db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0 |
| 484 |
OLD | NEW |