OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
; Compile-time constants.
; BLOCK_HEIGHT_WIDTH and VP9_FILTER_WEIGHT are not referenced by the filter
; routines reviewed here; they are kept because other code may rely on them.
%define BLOCK_HEIGHT_WIDTH  4
%define VP9_FILTER_WEIGHT   128     ; equals 1 << VP9_FILTER_SHIFT
%define VP9_FILTER_SHIFT    7       ; right shift applied after adding the rounding constant
| 17 |
| 18 |
;/*************************************************************************************
; Notes: filter_block1d_h6 applies a 6-tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. This routine assumes that output_height is
; an even number. This function handles 8 pixels in the horizontal direction, calculating
; ONE row each iteration to take advantage of the 128-bit operations.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8.
;
;*************************************************************************************/
;void vp9_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp9_filter_index
;)
;
; Horizontal filter that writes 8 output bytes per row for output_height rows.
; vp9_filter_index selects one 16-byte entry in the coefficient tables; the
; k1_k3 and k2_k4 entries are read at +128 and +256 bytes from the k0_k5 entry.
; If the first dword of the selected k0_k5 entry is zero, the 4-tap path is
; taken, which skips the k0_k5 multiply altogether.
;
; Register roles inside both row loops:
;   rsi = current source row        rax = source stride
;   rdi = current destination row   rdx = destination stride
;   rcx = rows remaining            xmm7 = rounding constant (rd)
;
; NOTE(review): output_height == 0 is not guarded; the loops are do-while shaped.
global sym(vp9_filter_block1d8_h6_ssse3)
sym(vp9_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    xor         rsi, rsi                        ;esi = 0, used by the zero test below
    shl         rdx, 4                          ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]              ;rounding constant

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry
    mov         rdi, arg(2)                     ;output_ptr

    cmp         esi, DWORD PTR [rax]            ;first dword of k0_k5 entry == 0 ?
    je          .vp9_filter_block1d8_h4_ssse3   ;yes: outer taps unused, take 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rax, DWORD PTR arg(1)           ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    movsxd      rdx, DWORD PTR arg(3)           ;output_pitch

    sub         rdi, rdx                        ;pre-bias: loop advances rdi before storing
    ;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm2                      ; -2 3  -1 4  0 5  1 6  2 7  3 8  4 9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ;outer-tap products (k0_k5)

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ;re-pair bytes for the k2_k4 multiply

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ;re-pair bytes for the k1_k3 multiply
    pmaddubsw   xmm1, xmm5

    lea         rdi, [rdi + rdx]                ;advance destination (lea keeps flags intact)
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]                ;advance source
    dec         rcx                             ;sets ZF for the jnz below; nothing in
                                                ;between modifies EFLAGS

    ; Saturating adds are order-sensitive; keep this exact sequence.
    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                      ;+ rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, VP9_FILTER_SHIFT

    packuswb    xmm0, xmm0                      ;clamp to unsigned bytes

    movq        MMWORD PTR [rdi], xmm0          ;store 8 pixels
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. Entered with rax -> selected k0_k5 entry, rdi = output_ptr and
; xmm7 = rounding constant. The label is local, matching the other routines in
; this file, so it neither leaks a file-scope symbol nor resets NASM's
; local-label scope in the middle of the function.
.vp9_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                     ;src_ptr

    movsxd      rax, DWORD PTR arg(1)           ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    movsxd      rdx, DWORD PTR arg(3)           ;output_pitch

    sub         rdi, rdx                        ;pre-bias: loop advances rdi before storing

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm1                      ; -2 3  -1 4  0 5  1 6  2 7  3 8  4 9  5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3                      ;shuf2bfrom1

    pshufb      xmm2, xmm4                      ;shuf3bfrom1
    pmaddubsw   xmm0, xmm5                      ;k2_k4 products

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6                      ;k1_k3 products

    lea         rsi, [rsi + rax]
    dec         rcx                             ;ZF consumed by the jnz below

    paddsw      xmm0, xmm7                      ;+ rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, VP9_FILTER_SHIFT

    packuswb    xmm0, xmm0

    movq        MMWORD PTR [rdi], xmm0          ;store 8 pixels

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp9_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp9_filter_index
;)
;
; Horizontal 6-tap filter that writes 16 output bytes per row. Each row is
; computed as two 8-pixel halves (source offsets -2/+3 and +6/+11) that are
; merged and written with a single 16-byte store. There is no 4-tap shortcut
; in this routine.
;
; Register roles inside the loop:
;   rsi = source row, rdi = destination row, rax = source stride,
;   rdx = destination stride, rcx = rows remaining,
;   xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3, xmm7 = scratch.
;
; The store uses movdqa, so every destination row must be 16-byte aligned.
global sym(vp9_filter_block1d16_h6_ssse3)
sym(vp9_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry

    mov         rdi, arg(2)                     ;output_ptr

    mov         rsi, arg(0)                     ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, DWORD PTR arg(1)           ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ;output_height
    movsxd      rdx, DWORD PTR arg(3)           ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    ; ---- left 8 pixels (interleaved with the start of the right half) ----
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5

    movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm3                      ; -2 3  -1 4  0 5  1 6  2 7  3 8  4 9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ;k0_k5 products

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]      ;right half, first source run

    pmaddubsw   xmm1, xmm5                      ;k2_k4 products
    movq        xmm7, MMWORD PTR [rsi + 11]     ;right half, second source run

    pmaddubsw   xmm2, xmm6                      ;k1_k3 products
    punpcklbw   xmm3, xmm7

    ; Saturating adds are order-sensitive; keep this exact sequence.
    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4                      ;right half: k0_k5 products
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]              ;+ rounding

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, VP9_FILTER_SHIFT
    pmaddubsw   xmm1, xmm5                      ;right half: k2_k4 products

    pmaddubsw   xmm2, xmm6                      ;right half: k1_k3 products
    packuswb    xmm0, xmm0                      ;left 8 results as bytes

    ; ---- finish right 8 pixels ----
    lea         rsi, [rsi + rax]                ;advance source
    paddsw      xmm3, xmm1

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]              ;+ rounding

    psraw       xmm3, VP9_FILTER_SHIFT

    packuswb    xmm3, xmm3                      ;right 8 results as bytes

    punpcklqdq  xmm0, xmm3                      ;left | right -> 16 bytes

    movdqa      XMMWORD PTR [rdi], xmm0         ;aligned 16-byte store

    lea         rdi, [rdi + rdx]                ;advance destination
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 277 |
;void vp9_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp9_filter_index
;)
;
; Horizontal filter that writes 4 output bytes per row. Each row reads 16
; unaligned bytes starting at src - 2 and uses pshufb tables (shuf1b, shuf2b,
; shuf3b) to build the byte pairs fed to pmaddubsw. If the first dword of the
; selected k0_k5 entry is zero, the 4-tap path is used instead.
;
; Register roles inside both row loops:
;   rsi = source row, rdi = destination row, rax = source stride,
;   rdx = destination stride, rcx = rows remaining, xmm7 = rounding constant.
;
; Both exits run RESTORE_XMM to pair with SAVE_XMM 7 in the prolog: the routine
; writes xmm6 and xmm7, which SAVE_XMM 7 is asked to preserve.
global sym(vp9_filter_block1d4_h6_ssse3)
sym(vp9_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    xor         rsi, rsi                        ;esi = 0, used by the zero test below
    shl         rdx, 4                          ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry
    movdqa      xmm7, [GLOBAL(rd)]              ;rounding constant

    cmp         esi, DWORD PTR [rax]            ;first dword of k0_k5 entry == 0 ?
    je          .vp9_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    movsxd      rax, DWORD PTR arg(1)           ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    movsxd      rdx, DWORD PTR arg(3)           ;output_pitch

    ;xmm3 free
.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]     ;16 unaligned source bytes

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]
    pmaddubsw   xmm0, xmm4                      ;k0_k5 products
    pshufb      xmm2, [GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5                      ;k2_k4 products

    pmaddubsw   xmm2, xmm6                      ;k1_k3 products

    lea         rsi, [rsi + rax]                ;advance source

    ; Saturating adds are order-sensitive; keep this exact sequence.
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                      ;+ rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, VP9_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0           ;store 4 pixels

    add         rdi, rdx                        ;advance destination
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM                                 ;pairs with SAVE_XMM 7 in the prolog
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: entered with rax -> selected k0_k5 entry and xmm7 = rounding.
.vp9_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                     ;src_ptr
    mov         rdi, arg(2)                     ;output_ptr
    movsxd      rax, DWORD PTR arg(1)           ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    movsxd      rdx, DWORD PTR arg(3)           ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]     ;16 unaligned source bytes

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                      ;shuf2b
    pshufb      xmm2, xmm3                      ;shuf3b
    pmaddubsw   xmm1, xmm5                      ;k2_k4 products

    pmaddubsw   xmm2, xmm6                      ;k1_k3 products

    lea         rsi, [rsi + rax]                ;advance source

    paddsw      xmm1, xmm7                      ;+ rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, VP9_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1           ;store 4 pixels

    add         rdi, rdx                        ;advance destination
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 404 |
| 405 |
| 406 |
;void vp9_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp9_filter_index
;)
;
; Vertical filter, 16 pixels wide. Rows A..F are the six consecutive source
; rows starting at src_ptr; each output row combines them as
; (A,F)*k0_k5 + (B,D)*k1_k3 + (C,E)*k2_k4, adds the rounding constant, shifts
; and packs to bytes. When the first dword of the selected k0_k5 entry is zero,
; the 4-tap path (rows B..E only) is used.
;
; Register roles inside both row loops:
;   rsi = row A, rax = row B (rsi + pitch), rdx = source pitch,
;   rdi = destination row, rcx = rows remaining,
;   r8  = destination pitch (64-bit ABI only; 32-bit re-reads arg(3)).
;
; The 6-tap path stores with two movq; the 4-tap path stores with one movdqa,
; which requires a 16-byte aligned destination row.
global sym(vp9_filter_block1d16_v6_ssse3)
sym(vp9_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry
    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry

    ; Arguments shared by both paths.
    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ;out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    cmp         DWORD PTR [rax], 0              ;outer taps all zero?
    je          .v4_path

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    lea         rax, [rsi + rdx]                ;rax = second source row

.v6_row:
    ; ---- left 8 columns ----
    movq        xmm1, MMWORD PTR [rsi]              ;A
    movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F
    punpcklbw   xmm1, xmm0                          ;A F

    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    punpcklbw   xmm2, xmm4                          ;B D

    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E
    punpcklbw   xmm3, xmm0                          ;C E

    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm6

    ; Saturating adds are order-sensitive: (BD + CE) + AF, then rounding.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2              ;store the results

    ; ---- right 8 columns ----
    movq        xmm1, MMWORD PTR [rsi + 8]              ;A
    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
    punpcklbw   xmm1, xmm0                              ;A F

    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    punpcklbw   xmm2, xmm4                              ;B D

    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E
    punpcklbw   xmm3, xmm0                              ;C E

    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm6

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

    add         rsi, rdx                        ;slide the six-row window down one row
    add         rax, rdx

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.v4_path:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    lea         rax, [rsi + rdx]                ;rax = second source row

.v4_row:
    ; ---- left 8 columns ----
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    punpcklbw   xmm2, xmm4                          ;B D

    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E
    punpcklbw   xmm3, xmm0                          ;C E

    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm6

    ; Left half order: (BD + rounding) + CE.
    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    ; ---- right 8 columns ----
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    punpcklbw   xmm5, xmm4                              ;B D

    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E
    punpcklbw   xmm1, xmm0                              ;C E

    pmaddubsw   xmm5, xmm7
    pmaddubsw   xmm1, xmm6

    movdqa      xmm4, [GLOBAL(rd)]

    ; Right half order: (BD + CE) + rounding.
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, VP9_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                      ;left | right -> 16 bytes

    movdqa      XMMWORD PTR [rdi], xmm2         ;aligned 16-byte store

    add         rsi, rdx                        ;slide the window down one row
    add         rax, rdx

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 594 |
;void vp9_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp9_filter_index
;)
;
; Vertical filter, 8 pixels wide. Rows A..F are the six consecutive source
; rows starting at src_ptr. If the first dword of the selected k0_k5 entry is
; zero, the 4-tap path (rows B..E) is used.
;
; Register roles inside both row loops:
;   rsi = row A, rax = row B (rsi + pitch), rdx = source pitch,
;   rdi = destination row, rcx = rows remaining,
;   r8  = destination pitch (64-bit ABI only; 32-bit re-reads arg(3)).
global sym(vp9_filter_block1d8_v6_ssse3)
sym(vp9_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry
    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry

    ; Arguments shared by both paths.
    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ;out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    cmp         DWORD PTR [rax], 0              ;outer taps all zero?
    je          .v4_path

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    lea         rax, [rsi + rdx]                ;rax = second source row

.v6_row:
    movq        xmm1, MMWORD PTR [rsi]              ;A
    movq        xmm0, MMWORD PTR [rax + rdx * 4]    ;F
    punpcklbw   xmm1, xmm0                          ;A F

    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    punpcklbw   xmm2, xmm4                          ;B D

    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E
    punpcklbw   xmm3, xmm0                          ;C E

    movdqa      xmm4, [GLOBAL(rd)]              ;xmm4 is free again: reuse for rounding

    pmaddubsw   xmm1, xmm5
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm6

    add         rsi, rdx                        ;slide the six-row window down one row
    add         rax, rdx

    ; Saturating adds are order-sensitive: (BD + CE) + AF, then rounding.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.v4_path:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]              ;rounding constant, loop-invariant

    lea         rax, [rsi + rdx]                ;rax = second source row

.v4_row:
    movq        xmm2, MMWORD PTR [rsi + rdx]        ;B
    movq        xmm4, MMWORD PTR [rax + rdx * 2]    ;D
    punpcklbw   xmm2, xmm4                          ;B D

    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ;C
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]    ;E
    punpcklbw   xmm3, xmm0                          ;C E

    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm3, xmm6

    add         rsi, rdx                        ;slide the window down one row
    add         rax, rdx

    ; Order: (BD + CE) + rounding.
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store 8 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp9_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp9_filter_index
;)
;
; Vertical filter, 4 pixels wide, implemented on the 64-bit MMX registers
; (mm0-mm7) using the MMX-register forms of the SSSE3 instructions. No XMM
; register is touched, so there is no SAVE_XMM / RESTORE_XMM pair here.
; If the first dword of the selected k0_k5 entry is zero, the 4-tap path is
; used.
;
; NOTE(review): the routine returns without executing emms; presumably callers
; reset the x87/MMX state themselves before using floating point - confirm.
;
; Register roles inside both row loops:
;   rsi = row A, rax = row B (rsi + pitch), rdx = source pitch,
;   rdi = destination row, rcx = rows remaining,
;   r8  = destination pitch (64-bit ABI only; 32-bit re-reads arg(3)).
global sym(vp9_filter_block1d4_v6_ssse3)
sym(vp9_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)           ;table index
    shl         rdx, 4                          ;16 bytes per table entry
    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx                        ;rax -> selected k0_k5 entry

    ; Arguments shared by both paths.
    mov         rsi, arg(0)                     ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)           ;pixels_per_line
    mov         rdi, arg(2)                     ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)            ;out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)           ;output_height

    cmp         DWORD PTR [rax], 0              ;outer taps all zero?
    je          .v4_path

    movq        mm5, MMWORD PTR [rax]           ;k0_k5 (low 8 bytes of the entry)
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3

    lea         rax, [rsi + rdx]                ;rax = second source row

.v6_row:
    movd        mm1, DWORD PTR [rsi]                ;A
    movd        mm0, DWORD PTR [rax + rdx * 4]      ;F
    punpcklbw   mm1, mm0                            ;A F

    movd        mm2, DWORD PTR [rsi + rdx]          ;B
    movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
    punpcklbw   mm2, mm4                            ;B D

    movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
    movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E
    punpcklbw   mm3, mm0                            ;C E

    movq        mm4, [GLOBAL(rd)]               ;mm4 is free again: reuse for rounding

    pmaddubsw   mm1, mm5
    pmaddubsw   mm2, mm7
    pmaddubsw   mm3, mm6

    add         rsi, rdx                        ;slide the six-row window down one row
    add         rax, rdx

    ; Saturating adds are order-sensitive: (BD + CE) + AF, then rounding.
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, VP9_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2            ;store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v6_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

.v4_path:
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]    ;rounding constant, loop-invariant

    lea         rax, [rsi + rdx]                ;rax = second source row

.v4_row:
    movd        mm2, DWORD PTR [rsi + rdx]          ;B
    movd        mm4, DWORD PTR [rax + rdx * 2]      ;D
    punpcklbw   mm2, mm4                            ;B D

    movd        mm3, DWORD PTR [rsi + rdx * 2]      ;C
    movd        mm0, DWORD PTR [rsi + rdx * 4]      ;E
    punpcklbw   mm3, mm0                            ;C E

    pmaddubsw   mm2, mm7
    pmaddubsw   mm3, mm6

    add         rsi, rdx                        ;slide the window down one row
    add         rax, rdx

    ; Order: (BD + CE) + rounding.
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP9_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2            ;store 4 pixels

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)           ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .v4_row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 873 |
;void vp9_bilinear_predict16x16_ssse3
;(
;    unsigned char *src_ptr,
;    int            src_pixels_per_line,
;    int            xoffset,
;    int            yoffset,
;    unsigned char *dst_ptr,
;    int            dst_pitch
;)
;
; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte entry
; of bilinear_filters_ssse3. Three code paths exist:
;   * xoffset != 0 and yoffset != 0 : horizontal pass, then vertical pass
;   * xoffset == 0                  : vertical pass only   (.vert_only)
;   * yoffset == 0                  : horizontal pass only (.horiz_only)
; Every path stops when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
;
; Destination rows are written with movdqa, so each one must be 16-byte
; aligned.
global sym(vp9_bilinear_predict16x16_ssse3)
sym(vp9_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(bilinear_filters_ssse3)]
    movsxd      rax, DWORD PTR arg(2)           ; xoffset

    test        rax, rax                        ; skip first_pass filter if xoffset=0
    jz          .vert_only

    shl         rax, 4                          ; 16 bytes per filter entry
    lea         rax, [rax + rcx]                ; HFilter

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; xmm1 = horizontal taps

    movsxd      rax, DWORD PTR arg(3)           ; yoffset

    test        rax, rax                        ; skip second_pass filter if yoffset=0
    jz          .horiz_only

    ; ------------------------------------------------------------------
    ; Two-pass path: xmm1 = horizontal taps, xmm2 = vertical taps,
    ; xmm7 = previous horizontally filtered row (packed to bytes).
    ; ------------------------------------------------------------------
    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rdx, DWORD PTR arg(1)           ; src_pixels_per_line

    movdqa      xmm2, [rax]                     ; xmm2 = vertical taps

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(5)            ; dst_pitch
%endif

    ; First source row: horizontal pass only, kept in xmm7.
    movq        xmm3, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08
    punpcklbw   xmm3, xmm5                      ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08

    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15
    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    punpcklbw   xmm4, xmm5                      ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16

    lea         rsi, [rsi + rdx]                ; next line

    pmaddubsw   xmm3, xmm1                      ; columns 0..7
    pmaddubsw   xmm4, xmm1                      ; columns 8..15

    paddw       xmm3, [GLOBAL(rd)]              ; xmm3 += round value
    psraw       xmm3, VP9_FILTER_SHIFT          ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
    psraw       xmm4, VP9_FILTER_SHIFT          ; xmm4 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                      ; 16 filtered bytes of row 0

.row_2d:
    ; Horizontal pass on the next source row -> xmm6 (16 bytes).
    movq        xmm6, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]                   ; 01 02 03 04 05 06 07 08
    punpcklbw   xmm6, xmm5

    movq        xmm4, [rsi+8]                   ; 08 09 10 11 12 13 14 15
    movq        xmm5, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    punpcklbw   xmm4, xmm5

    lea         rsi, [rsi + rdx]                ; next line

    pmaddubsw   xmm6, xmm1
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]              ; xmm6 += round value
    psraw       xmm6, VP9_FILTER_SHIFT          ; xmm6 /= 128

    paddw       xmm4, [GLOBAL(rd)]              ; xmm4 += round value
    psraw       xmm4, VP9_FILTER_SHIFT          ; xmm4 /= 128

    packuswb    xmm6, xmm4

    ; Vertical pass between the previous row (xmm7) and this row (xmm6).
    movdqa      xmm5, xmm7
    punpcklbw   xmm5, xmm6                      ; low 8 columns: prev/cur interleaved
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6                      ; high 8 columns
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]              ; xmm5 += round value
    psraw       xmm5, VP9_FILTER_SHIFT          ; xmm5 /= 128

    paddw       xmm7, [GLOBAL(rd)]              ; xmm7 += round value
    psraw       xmm7, VP9_FILTER_SHIFT          ; xmm7 /= 128

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                      ; current row becomes previous row

    movdqa      [rdi], xmm5                     ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)           ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .row_2d

    jmp         .done

    ; ------------------------------------------------------------------
    ; Vertical-only path (xoffset == 0). Entered with rcx = filter table.
    ; Produces two destination rows per iteration; xmm4/xmm2 carry the
    ; low/high 8 bytes of the top source row into the next iteration.
    ; ------------------------------------------------------------------
.vert_only:
    movsxd      rax, DWORD PTR arg(3)           ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]                ; VFilter

    mov         rdi, arg(4)                     ; dst_ptr
    mov         rsi, arg(0)                     ; src_ptr
    movsxd      rdx, DWORD PTR arg(5)           ; dst_pitch

    movdqa      xmm1, [rax]                     ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4, [rsi]                     ; load row 0, columns 0..7
    movq        xmm2, [rsi + 8]                 ; load row 0, columns 8..15

    lea         rsi, [rsi + rax]                ; next line
.row_vert:
    movq        xmm3, [rsi]                     ; load row + 1
    movq        xmm5, [rsi + 8]                 ; load row + 1
    movq        xmm7, [rsi + rax]               ; load row + 2
    movq        xmm6, [rsi + rax + 8]           ; load row + 2

    punpcklbw   xmm4, xmm3                      ; row 0 / row 1 interleaved
    punpcklbw   xmm2, xmm5
    pmaddubsw   xmm4, xmm1
    pmaddubsw   xmm2, xmm1

    punpcklbw   xmm3, xmm7                      ; row 1 / row 2 interleaved
    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm3, xmm1
    pmaddubsw   xmm5, xmm1

    paddw       xmm4, [GLOBAL(rd)]
    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm4, VP9_FILTER_SHIFT
    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm4, xmm2

    movdqa      [rdi], xmm4                     ; store row 0

    paddw       xmm3, [GLOBAL(rd)]
    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm3, VP9_FILTER_SHIFT
    psraw       xmm5, VP9_FILTER_SHIFT
    packuswb    xmm3, xmm5

    movdqa      [rdi + rdx], xmm3               ; store row 1

    movdqa      xmm4, xmm7                      ; row + 2 is the next iteration's row 0
    movdqa      xmm2, xmm6

    lea         rsi, [rsi + 2*rax]
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .row_vert

    jmp         .done

    ; ------------------------------------------------------------------
    ; Horizontal-only path (yoffset == 0). Entered with rsi = src_ptr,
    ; rdi = dst_ptr, rdx = dst_pitch, xmm1 = horizontal taps.
    ; Produces two destination rows per iteration.
    ; ------------------------------------------------------------------
.horiz_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]                ; rcx = dst_ptr + 16 * dst_pitch (end)
    movsxd      rax, DWORD PTR arg(1)           ; src_pixels_per_line

.row_horiz:
    ; first row of the pair
    movq        xmm2, [rsi]                     ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]                   ; 01 02 03 04 05 06 07 08
    punpcklbw   xmm2, xmm4
    pmaddubsw   xmm2, xmm1

    movq        xmm3, [rsi+8]                   ; 08 09 10 11 12 13 14 15
    movq        xmm4, [rsi+9]                   ; 09 10 11 12 13 14 15 16
    punpcklbw   xmm3, xmm4
    pmaddubsw   xmm3, xmm1

    lea         rsi, [rsi + rax]                ; next line

    ; second row of the pair (loaded before the first store, as originally)
    movq        xmm5, [rsi]
    movq        xmm7, [rsi+1]
    punpcklbw   xmm5, xmm7
    pmaddubsw   xmm5, xmm1

    movq        xmm6, [rsi+8]
    movq        xmm7, [rsi+9]
    punpcklbw   xmm6, xmm7
    pmaddubsw   xmm6, xmm1

    lea         rsi, [rsi + rax]                ; next line

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP9_FILTER_SHIFT
    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm3

    movdqa      [rdi], xmm2                     ; store the results in the destination
    lea         rdi, [rdi + rdx]                ; dst_pitch

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP9_FILTER_SHIFT
    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP9_FILTER_SHIFT
    packuswb    xmm5, xmm6

    movdqa      [rdi], xmm5                     ; store the results in the destination
    lea         rdi, [rdi + rdx]                ; dst_pitch

    cmp         rdi, rcx
    jne         .row_horiz

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 1136 |
;-----------------------------------------------------------------------------
; void vp9_bilinear_predict8x8_ssse3
; (
;     unsigned char *src_ptr,               arg(0)
;     int            src_pixels_per_line,   arg(1)
;     int            xoffset,               arg(2)  row index into bilinear_filters_ssse3
;     int            yoffset,               arg(3)  row index into bilinear_filters_ssse3
;     unsigned char *dst_ptr,               arg(4)
;     int            dst_pitch              arg(5)
; )
;
; Writes an 8x8 block of bilinearly filtered pixels to dst_ptr.
;
; Nine source rows are loaded with 16-byte unaligned loads (only the first
; 9 bytes of each row are consumed) and parked in a 144-byte, 16-byte aligned
; scratch area on the stack.  One of four paths then runs:
;
;   xoffset != 0, yoffset != 0 : horizontal pass followed by vertical pass
;   xoffset != 0, yoffset == 0 : horizontal pass only   (.b8x8_fp_only)
;   xoffset == 0, yoffset != 0 : vertical pass only     (.b8x8_sp_only)
;   xoffset == 0, yoffset == 0 : plain copy             (.b8x8_copy)
;
; The zero-offset cases must never multiply by filter row 0: that row is
; (128, 0) and pmaddubsw reads its coefficient operand as SIGNED bytes, so
; 128 would act as -128 and packuswb would clamp every output pixel to 0.
;
; rsp doubles as the read cursor into the scratch area.  Every path has to
; advance rsp by exactly 144 bytes before reaching .done8x8, because the
; `pop rsp` there expects rsp to be back where ALIGN_STACK left it.
; NOTE(review): this relies on ALIGN_STACK having pushed the pre-alignment
; rsp; the macro lives in x86_abi_support.asm - confirm there.
;-----------------------------------------------------------------------------
global sym(vp9_bilinear_predict8x8_ssse3)
sym(vp9_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        144                 ; scratch: 9 rows * 16 bytes

    lea         rcx,        [GLOBAL(bilinear_filters_ssse3)]

    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

    ; Read 9 rows of unaligned data and put them on the stack so that the
    ; filter passes below can use aligned loads.
    movdqu      xmm0,       [rsi]               ; row 0
    lea         rax,        [rdx + rdx*2]       ; rax = 3 * src stride
    movdqu      xmm1,       [rsi+rdx]           ; row 1
    movdqu      xmm2,       [rsi+rdx*2]         ; row 2
    add         rsi,        rax
    movdqu      xmm3,       [rsi]               ; row 3
    movdqu      xmm4,       [rsi+rdx]           ; row 4
    movdqu      xmm5,       [rsi+rdx*2]         ; row 5
    add         rsi,        rax
    movdqu      xmm6,       [rsi]               ; row 6
    movdqu      xmm7,       [rsi+rdx]           ; row 7

    movdqa      XMMWORD PTR [rsp],      xmm0    ; free xmm0 for the 9th row

    movdqu      xmm0,       [rsi+rdx*2]         ; row 8

    movdqa      XMMWORD PTR [rsp+16],   xmm1
    movdqa      XMMWORD PTR [rsp+32],   xmm2
    movdqa      XMMWORD PTR [rsp+48],   xmm3
    movdqa      XMMWORD PTR [rsp+64],   xmm4
    movdqa      XMMWORD PTR [rsp+80],   xmm5
    movdqa      XMMWORD PTR [rsp+96],   xmm6
    movdqa      XMMWORD PTR [rsp+112],  xmm7
    movdqa      XMMWORD PTR [rsp+128],  xmm0

    movsxd      rax,        dword ptr arg(2)    ; xoffset
    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          .b8x8_sp_only

    shl         rax,        4                   ; 16 bytes per filter row
    add         rax,        rcx                 ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; xmm0 = HFilter taps

    movsxd      rax,        dword ptr arg(3)    ; yoffset
    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          .b8x8_fp_only

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst end (8 rows down)

    movdqa      xmm1,       [rax]               ; xmm1 = VFilter taps

    ; get the first horizontal line done
    movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5,       xmm3

    psrldq      xmm5,       1                   ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
    lea         rsp,        [rsp + 16]          ; next line

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3,       xmm0                ; 8 words: p[i]*h0 + p[i+1]*h1

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP9_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm7                ; xmm7 = previous filtered row as bytes

.next_row:
    ; Horizontally filter the next scratch row into xmm6, then blend it
    ; vertically with the previous filtered row held in xmm7.
    movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp,        [rsp + 16]          ; next line

    movdqa      xmm5,       xmm6

    psrldq      xmm5,       1

    punpcklbw   xmm6,       xmm5
    pmaddubsw   xmm6,       xmm0

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP9_FILTER_SHIFT    ; xmm6 /= 128

    packuswb    xmm6,       xmm6                ; current filtered row as bytes

    punpcklbw   xmm7,       xmm6                ; interleave previous/current rows
    pmaddubsw   xmm7,       xmm1                ; prev*v0 + cur*v1

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP9_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm7,       xmm7

    movq        [rdi],      xmm7                ; store the results in the destination
    lea         rdi,        [rdi + rdx]

    movdqa      xmm7,       xmm6                ; current row becomes previous row

    cmp         rdi,        rcx
    jne         .next_row

    ; rsp has advanced 16 + 8*16 = 144 bytes here
    jmp         .done8x8

.b8x8_sp_only:
    mov         rdi,        arg(4)              ; dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movsxd      rax,        dword ptr arg(3)    ; yoffset
    test        rax,        rax                 ; both offsets zero: nothing to filter,
    je          .b8x8_copy                      ; and filter row 0 must not be used

    shl         rax,        4
    lea         rax,        [rax + rcx]         ; VFilter

    movdqa      xmm0,       [rax]               ; VFilter

    ; Each output row n is built from scratch rows n and n+1, interleaved
    ; byte-wise so that pmaddubsw yields row[n]*v0 + row[n+1]*v1.
    movq        xmm1,       XMMWORD PTR [rsp]
    movq        xmm2,       XMMWORD PTR [rsp+16]

    movq        xmm3,       XMMWORD PTR [rsp+32]
    punpcklbw   xmm1,       xmm2                ; rows 0/1

    movq        xmm4,       XMMWORD PTR [rsp+48]
    punpcklbw   xmm2,       xmm3                ; rows 1/2

    movq        xmm5,       XMMWORD PTR [rsp+64]
    punpcklbw   xmm3,       xmm4                ; rows 2/3

    movq        xmm6,       XMMWORD PTR [rsp+80]
    punpcklbw   xmm4,       xmm5                ; rows 3/4

    movq        xmm7,       XMMWORD PTR [rsp+96]
    punpcklbw   xmm5,       xmm6                ; rows 4/5

    pmaddubsw   xmm1,       xmm0
    pmaddubsw   xmm2,       xmm0

    pmaddubsw   xmm3,       xmm0
    pmaddubsw   xmm4,       xmm0

    pmaddubsw   xmm5,       xmm0
    punpcklbw   xmm6,       xmm7                ; rows 5/6

    pmaddubsw   xmm6,       xmm0
    paddw       xmm1,       [GLOBAL(rd)]

    paddw       xmm2,       [GLOBAL(rd)]
    psraw       xmm1,       VP9_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm2,       VP9_FILTER_SHIFT

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm3,       VP9_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm4,       VP9_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm5,       VP9_FILTER_SHIFT

    psraw       xmm6,       VP9_FILTER_SHIFT
    packuswb    xmm1,       xmm1

    packuswb    xmm2,       xmm2
    movq        [rdi],      xmm1                ; output row 0

    packuswb    xmm3,       xmm3
    movq        [rdi+rdx],  xmm2                ; output row 1

    packuswb    xmm4,       xmm4
    movq        xmm1,       XMMWORD PTR [rsp+112]   ; scratch row 7

    lea         rdi,        [rdi + 2*rdx]
    movq        xmm2,       XMMWORD PTR [rsp+128]   ; scratch row 8

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm3                ; output row 2

    packuswb    xmm6,       xmm6
    movq        [rdi+rdx],  xmm4                ; output row 3

    lea         rdi,        [rdi + 2*rdx]
    punpcklbw   xmm7,       xmm1                ; rows 6/7

    movq        [rdi],      xmm5                ; output row 4
    pmaddubsw   xmm7,       xmm0

    movq        [rdi+rdx],  xmm6                ; output row 5
    punpcklbw   xmm1,       xmm2                ; rows 7/8

    pmaddubsw   xmm1,       xmm0
    paddw       xmm7,       [GLOBAL(rd)]

    psraw       xmm7,       VP9_FILTER_SHIFT
    paddw       xmm1,       [GLOBAL(rd)]

    psraw       xmm1,       VP9_FILTER_SHIFT
    packuswb    xmm7,       xmm7

    packuswb    xmm1,       xmm1
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm7                ; output row 6

    movq        [rdi+rdx],  xmm1                ; output row 7
    lea         rsp,        [rsp + 144]         ; release the whole scratch area

    jmp         .done8x8

.b8x8_copy:
    ; xoffset == 0 and yoffset == 0: copy the first 8 bytes of scratch
    ; rows 0..7 straight to the destination (rdi/rdx were set up above).
    movq        xmm0,       XMMWORD PTR [rsp]
    movq        xmm1,       XMMWORD PTR [rsp+16]
    movq        xmm2,       XMMWORD PTR [rsp+32]
    movq        xmm3,       XMMWORD PTR [rsp+48]

    movq        [rdi],      xmm0                ; output row 0
    movq        [rdi+rdx],  xmm1                ; output row 1
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm2                ; output row 2
    movq        [rdi+rdx],  xmm3                ; output row 3
    lea         rdi,        [rdi + 2*rdx]

    movq        xmm0,       XMMWORD PTR [rsp+64]
    movq        xmm1,       XMMWORD PTR [rsp+80]
    movq        xmm2,       XMMWORD PTR [rsp+96]
    movq        xmm3,       XMMWORD PTR [rsp+112]

    movq        [rdi],      xmm0                ; output row 4
    movq        [rdi+rdx],  xmm1                ; output row 5
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm2                ; output row 6
    movq        [rdi+rdx],  xmm3                ; output row 7

    lea         rsp,        [rsp + 144]         ; release the whole scratch area

    jmp         .done8x8

.b8x8_fp_only:
    ; Horizontal pass only; xmm0 = HFilter, rdi = dst_ptr, rdx = dst_pitch.
    lea         rcx,        [rdi+rdx*8]         ; rcx = dst end (8 rows down)

.next_row_fp:
    ; Four rows per iteration, so the loop body runs twice.
    movdqa      xmm1,       XMMWORD PTR [rsp]
    movdqa      xmm3,       XMMWORD PTR [rsp+16]

    movdqa      xmm2,       xmm1
    movdqa      xmm5,       XMMWORD PTR [rsp+32]

    psrldq      xmm2,       1                   ; row shifted by one pixel
    movdqa      xmm7,       XMMWORD PTR [rsp+48]

    movdqa      xmm4,       xmm3
    psrldq      xmm4,       1

    movdqa      xmm6,       xmm5
    psrldq      xmm6,       1

    punpcklbw   xmm1,       xmm2                ; pair p[i] with p[i+1]
    pmaddubsw   xmm1,       xmm0                ; p[i]*h0 + p[i+1]*h1

    punpcklbw   xmm3,       xmm4
    pmaddubsw   xmm3,       xmm0

    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm0

    movdqa      xmm2,       xmm7
    psrldq      xmm2,       1

    punpcklbw   xmm7,       xmm2
    pmaddubsw   xmm7,       xmm0

    paddw       xmm1,       [GLOBAL(rd)]
    psraw       xmm1,       VP9_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm3,       VP9_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm5,       VP9_FILTER_SHIFT

    paddw       xmm7,       [GLOBAL(rd)]
    psraw       xmm7,       VP9_FILTER_SHIFT

    packuswb    xmm1,       xmm1
    packuswb    xmm3,       xmm3

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm1

    packuswb    xmm7,       xmm7
    movq        [rdi+rdx],  xmm3

    lea         rdi,        [rdi + 2*rdx]
    movq        [rdi],      xmm5

    lea         rsp,        [rsp + 4*16]        ; consumed four scratch rows
    movq        [rdi+rdx],  xmm7

    lea         rdi,        [rdi + 2*rdx]
    cmp         rdi,        rcx

    jne         .next_row_fp

    lea         rsp,        [rsp + 16]          ; skip the unused 9th row (total 144)

.done8x8:
    ; rsp is 144 bytes above the scratch base on every path, i.e. back at
    ; the slot left by ALIGN_STACK; popping it undoes the alignment.
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 1448 |
SECTION_RODATA

; Byte-index tables, 16 entries each.
; NOTE(review): presumably pshufb control masks for the 6-tap filters defined
; earlier in the file; their users are outside this part of the source.
align 16
shuf1b:
    db  0,  5,  1,  6,  2,  7,  3,  8,  4,  9,  5, 10,  6, 11,  7, 12
shuf2b:
    db  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11
shuf3b:
    db  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  7,  9,  8, 10

align 16
shuf2bfrom1:
    db  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11,  9, 13
align 16
shuf3bfrom1:
    db  2,  6,  4,  8,  6,  1,  8,  3,  1,  5,  3,  7,  5,  9,  7, 11

; Rounding term added before the arithmetic shift by VP9_FILTER_SHIFT:
; eight words of 64 (0x40).
align 16
rd:
    times 8 dw 64

; Tap-pair tables: one 16-byte row (eight copies of a coefficient pair) per
; filter index; callers scale the index by 16 to select a row.
align 16
k0_k5:
    times 16 db 0               ;placeholder
    times 16 db 0
    times 8 db 2, 1
    times 16 db 0
    times 8 db 3, 3
    times 16 db 0
    times 8 db 1, 2
    times 16 db 0
k1_k3:
    times 16 db 0               ;placeholder
    times 8 db -6,  12
    times 8 db -11, 36
    times 8 db -9,  50
    times 8 db -16, 77
    times 8 db -6,  93
    times 8 db -8,  108
    times 8 db -1,  123
k2_k4:
    times 8 db 128, 0           ;placeholder
    times 8 db 123, -1
    times 8 db 108, -8
    times 8 db 93,  -6
    times 8 db 77,  -16
    times 8 db 50,  -9
    times 8 db 36,  -11
    times 8 db 12,  -6

; Bilinear tap pairs: row i holds eight copies of (128 - 8*i, 8*i), selected
; with offset * 16.  pmaddubsw reads these bytes as signed, so the 128 in
; row 0 behaves as -128; row 0 is only meaningful as a "no filtering" marker
; and should not be multiplied with.
align 16
bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 120, 8
    times 8 db 112, 16
    times 8 db 104, 24
    times 8 db 96,  32
    times 8 db 88,  40
    times 8 db 80,  48
    times 8 db 72,  56
    times 8 db 64,  64
    times 8 db 56,  72
    times 8 db 48,  80
    times 8 db 40,  88
    times 8 db 32,  96
    times 8 db 24,  104
    times 8 db 16,  112
    times 8 db 8,   120
| 1515 |
OLD | NEW |