OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
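;Note: the products for tap3 and tap4 have to be applied and added after the
;other taps to avoid overflow. The sums below use saturating adds (paddsw),
;so the order in which the products are accumulated affects the result.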
| 16 |
;-----------------------------------------------------------------------------
; GET_FILTERS_4
; Reads the eight 16-bit taps addressed by arg(5) and spills them, two taps
; per 16-byte slot, into the caller-defined locations k0k1, k2k3, k5k4 and
; k6k7. In each slot the first-named tap fills the low four words and the
; second-named tap fills the high four words.
; Also fills krd with the word 0x0040 in every lane (the rounding term used
; before the 7-bit shift) and zero with all-zero bits.
; The tap load uses movdqa, so the pointer in arg(5) must be 16-byte aligned.
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;rdx = filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    pshuflw     xmm0, xmm7, 0x00            ;low four words = k0
    pshuflw     xmm1, xmm7, 0x55            ;low four words = k1
    punpcklqdq  xmm0, xmm1                  ;k0 x4 | k1 x4
    movdqa      k0k1, xmm0

    pshuflw     xmm2, xmm7, 0xAA            ;k2
    pshuflw     xmm3, xmm7, 0xFF            ;k3
    punpcklqdq  xmm2, xmm3                  ;k2 x4 | k3 x4
    movdqa      k2k3, xmm2

    psrldq      xmm7, 8                     ;bring k4..k7 into the low half

    pshuflw     xmm4, xmm7, 0x00            ;k4
    pshuflw     xmm5, xmm7, 0x55            ;k5
    punpcklqdq  xmm5, xmm4                  ;k5 x4 | k4 x4
    movdqa      k5k4, xmm5

    pshuflw     xmm6, xmm7, 0xAA            ;k6
    pshuflw     xmm7, xmm7, 0xFF            ;k7
    punpcklqdq  xmm6, xmm7                  ;k6 x4 | k7 x4
    movdqa      k6k7, xmm6

    mov         rcx, 0x00400040             ;two 16-bit lanes holding 64
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate to all eight words
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endmacro
| 49 |
;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg
; Expects the eight 4-pixel inputs for taps 0..7 in the low dword of
; xmm0..xmm7 and produces four filtered bytes at [rdi].
;   %1 = 0 : store the result
;   %1 = 1 : pavgb the result with the four bytes already at [rdi], then
;            store
; Uses the caller-defined k0k1/k2k3/k5k4/k6k7, krd and zero slots.
; The accumulation order (0+6, 1+7, 2, 5, 3, 4) must not change: paddsw
; saturates, so a different order can give different results.
; Clobbers: xmm0, xmm1, xmm2, xmm5, xmm6.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    ;inputs 0 and 1: pair up, widen to words, scale by k0 | k1
    punpckldq   xmm0, xmm1
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0k1

    ;inputs 6 and 7
    punpckldq   xmm6, xmm7
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6k7

    ;inputs 2 and 3
    punpckldq   xmm2, xmm3
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2k3

    ;inputs 5 and 4 (5 in the low half, 4 in the high half)
    punpckldq   xmm5, xmm4
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5k4

    paddsw      xmm0, xmm6                  ;low: t0+t6, high: t1+t7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1                  ;fold the high half onto the low
    paddsw      xmm0, xmm2                  ;+ t2
    paddsw      xmm0, xmm5                  ;+ t5
    psrldq      xmm2, 8                     ;t3 -> low half
    psrldq      xmm5, 8                     ;t4 -> low half
    paddsw      xmm0, xmm2                  ;+ t3
    paddsw      xmm0, xmm5                  ;+ t4

    paddsw      xmm0, krd                   ;add the rounding term
    psraw       xmm0, 7                     ;arithmetic shift by 7
    packuswb    xmm0, xmm0                  ;saturate to unsigned bytes

%if %1
    movd        xmm1, [rdi]                 ;current destination pixels
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endmacro
| 87 |
;-----------------------------------------------------------------------------
; GET_FILTERS
; Loads rsi = arg(0) (source) and rdi = arg(2) (destination), then reads the
; eight 16-bit taps addressed by arg(5) and stores each one replicated across
; all eight words of its own caller-defined slot k0..k7.
; Also fills krd with the word 0x0040 in every lane (the rounding term used
; before the 7-bit shift) and zero with all-zero bits.
; The tap load uses movdqa, so the pointer in arg(5) must be 16-byte aligned.
; Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rsi, arg(0)                 ;rsi = source
    mov         rdi, arg(2)                 ;rdi = destination
    mov         rdx, arg(5)                 ;rdx = filter taps
    movdqa      xmm7, [rdx]                 ;xmm7 = k0..k7

    ;taps 0..3 sit in the low half: splat within it, then double up
    pshuflw     xmm0, xmm7, 0x00
    punpcklwd   xmm0, xmm0
    movdqa      k0, xmm0

    pshuflw     xmm1, xmm7, 0x55
    punpcklwd   xmm1, xmm1
    movdqa      k1, xmm1

    pshuflw     xmm2, xmm7, 0xAA
    punpcklwd   xmm2, xmm2
    movdqa      k2, xmm2

    pshuflw     xmm3, xmm7, 0xFF
    punpcklwd   xmm3, xmm3
    movdqa      k3, xmm3

    ;taps 4..7 sit in the high half
    pshufhw     xmm4, xmm7, 0x00
    punpckhwd   xmm4, xmm4
    movdqa      k4, xmm4

    pshufhw     xmm5, xmm7, 0x55
    punpckhwd   xmm5, xmm5
    movdqa      k5, xmm5

    pshufhw     xmm6, xmm7, 0xAA
    punpckhwd   xmm6, xmm6
    movdqa      k6, xmm6

    pshufhw     xmm7, xmm7, 0xFF            ;last use of the packed taps
    punpckhwd   xmm7, xmm7
    movdqa      k7, xmm7

    mov         rcx, 0x00400040             ;two 16-bit lanes holding 64
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;replicate to all eight words
    movdqa      krd, xmm6

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endmacro
| 129 |
;-----------------------------------------------------------------------------
; LOAD_VERT_8 offset
; Loads eight bytes from each of eight consecutive source rows into
; xmm0..xmm7 (row n -> xmmn), starting %1 bytes into the row at rsi.
; Expects rax = row pitch and rdx = 3 * pitch, which is how every caller in
; this file sets them up.
; Side effect: rsi is advanced by one row (rsi += rax).
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    ;addressed from the incoming rsi
    movq        xmm0, [rsi + %1]            ;row 0
    movq        xmm1, [rsi + rax + %1]      ;row 1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;row 6

    lea         rsi, [rsi + rax]            ;rsi now points at row 1

    ;addressed from the advanced rsi
    movq        xmm2, [rsi + rax + %1]      ;row 2
    movq        xmm3, [rsi + rax * 2 + %1]  ;row 3
    movq        xmm4, [rsi + rdx + %1]      ;row 4
    movq        xmm5, [rsi + rax * 4 + %1]  ;row 5
    movq        xmm7, [rsi + rdx * 2 + %1]  ;row 7
%endmacro
| 141 |
;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg, offset
; Expects the eight 8-pixel inputs for taps 0..7 in the low qword of
; xmm0..xmm7 and produces eight filtered bytes at [rdi + %2].
;   %1 = 0 : store the result
;   %1 = 1 : pavgb the result with the eight bytes already at [rdi + %2],
;            then store
; Uses the caller-defined k0..k7, krd and zero slots.
; The accumulation order (0, 1, 6, 7, 2, 5, 3, 4) must not change: paddsw
; saturates, so a different order can give different results.
; Clobbers: xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    ;widen each input to words and scale it by its tap
    punpcklbw   xmm0, zero
    pmullw      xmm0, k0
    punpcklbw   xmm1, zero
    pmullw      xmm1, k1
    punpcklbw   xmm6, zero
    pmullw      xmm6, k6
    punpcklbw   xmm7, zero
    pmullw      xmm7, k7
    punpcklbw   xmm2, zero
    pmullw      xmm2, k2
    punpcklbw   xmm5, zero
    pmullw      xmm5, k5
    punpcklbw   xmm3, zero
    pmullw      xmm3, k3
    punpcklbw   xmm4, zero
    pmullw      xmm4, k4

    ;saturating accumulation, taps 3 and 4 last
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;add the rounding term
    psraw       xmm0, 7                     ;arithmetic shift by 7
    packuswb    xmm0, xmm0                  ;saturate to unsigned bytes
%if %1
    movq        xmm1, [rdi + %2]            ;current destination pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endmacro
| 178 |
;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 4 pixels wide. Each output row is built from the
; eight source rows starting at the current source row (row i uses tap i);
; the sum gets +64, is shifted right by 7 and saturated to bytes.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = src_pitch
;   rdi = destination row                      rbx = out_pitch
;   rcx = rows left                            rdx = 3 * src_pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi, [rsi + rax]            ;advance one source row
    movd        xmm7, [rsi + rdx * 2]       ;row 7
    movd        xmm2, [rsi + rax]           ;row 2
    movd        xmm3, [rsi + rax * 2]       ;row 3
    movd        xmm4, [rsi + rdx]           ;row 4
    movd        xmm5, [rsi + rax * 4]       ;row 5

    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 245 |
;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 8 pixels wide. Each output row is built from the
; eight source rows starting at the current source row (row i uses tap i);
; the sum gets +64, is shifted right by 7 and saturated to bytes.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = src_pitch
;   rdi = destination row                      rbx = out_pitch
;   rcx = rows left                            rdx = 3 * src_pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 304 |
;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Vertical 8-tap filter, 16 pixels wide, done as two 8-pixel halves per row.
; Each output row is built from the eight source rows starting at the
; current source row (row i uses tap i); the sum gets +64, is shifted right
; by 7 and saturated to bytes.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = src_pitch
;   rdi = destination row                      rbx = out_pitch
;   rcx = rows left                            rdx = 3 * src_pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;back to row 0 for the right half

    LOAD_VERT_8 8                           ;right half; advances rsi again
    APPLY_FILTER_8 0, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 367 |
; vp9_filter_block1d4_v8_avg_sse2
; Arguments as read below: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) row count,
; arg(5) pointer to the eight 16-bit taps.
; Vertical 8-tap filter, 4 pixels wide, averaging variant: the filtered
; bytes (+64, >> 7, saturated) are combined with the bytes already in the
; destination using pavgb before being stored.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = source pitch
;   rdi = destination row                      rbx = destination pitch
;   rcx = rows left                            rdx = 3 * source pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movd        xmm0, [rsi]                 ;row 0
    movd        xmm1, [rsi + rax]           ;row 1
    movd        xmm6, [rsi + rdx * 2]       ;row 6
    lea         rsi, [rsi + rax]            ;advance one source row
    movd        xmm7, [rsi + rdx * 2]       ;row 7
    movd        xmm2, [rsi + rax]           ;row 2
    movd        xmm3, [rsi + rax * 2]       ;row 3
    movd        xmm4, [rsi + rdx]           ;row 4
    movd        xmm5, [rsi + rax * 4]       ;row 5

    APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 425 |
; vp9_filter_block1d8_v8_avg_sse2
; Arguments as read below and by GET_FILTERS: arg(0) source pointer,
; arg(1) source pitch, arg(2) destination pointer, arg(3) destination
; pitch, arg(4) row count, arg(5) pointer to the eight 16-bit taps.
; Vertical 8-tap filter, 8 pixels wide, averaging variant: the filtered
; bytes (+64, >> 7, saturated) are combined with the bytes already in the
; destination using pavgb before being stored.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = source pitch
;   rdi = destination row                      rbx = destination pitch
;   rcx = rows left                            rdx = 3 * source pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 474 |
; vp9_filter_block1d16_v8_avg_sse2
; Arguments as read below and by GET_FILTERS: arg(0) source pointer,
; arg(1) source pitch, arg(2) destination pointer, arg(3) destination
; pitch, arg(4) row count, arg(5) pointer to the eight 16-bit taps.
; Vertical 8-tap filter, 16 pixels wide (two 8-pixel halves per row),
; averaging variant: the filtered bytes (+64, >> 7, saturated) are combined
; with the bytes already in the destination using pavgb before being stored.
; Registers inside the loop:
;   rsi = source row 0 of the current window   rax = source pitch
;   rdi = destination row                      rbx = destination pitch
;   rcx = rows left                            rdx = 3 * source pitch
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;back to row 0 for the right half

    LOAD_VERT_8 8                           ;right half; advances rsi again
    APPLY_FILTER_8 1, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 527 |
;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 4 pixels wide. Output pixel x of a row is built
; from source bytes x-3 .. x+4 of that row (tap i applies to byte x-3+i);
; the sum gets +64, is shifted right by 7 and saturated to bytes.
; Each row is fetched with one unaligned 16-byte load starting at
; src_ptr - 3.
; Registers inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movdqu      xmm0, [rsi - 3]             ;source bytes -3 .. +12

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 601 |
;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 8 pixels wide. Output pixel x of a row is built
; from source bytes x-3 .. x+4 of that row (tap i applies to byte x-3+i);
; the sum gets +64, is shifted right by 7 and saturated to bytes.
; Each row is fetched with one unaligned 16-byte load starting at
; src_ptr - 3.
; Registers inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movdqu      xmm0, [rsi - 3]             ;source bytes -3 .. +12

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 676 |
;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int   output_pitch,
;    unsigned int   output_height,
;    short         *filter
;)
; Horizontal 8-tap filter, 16 pixels wide, done as two 8-pixel halves per
; row. Output pixel x of a row is built from source bytes x-3 .. x+4 of that
; row (tap i applies to byte x-3+i); the sum gets +64, is shifted right by 7
; and saturated to bytes.
; Each half is fetched with one unaligned 16-byte load: src_ptr - 3 for the
; left half and src_ptr + 5 (= 8 - 3) for the right half.
; Registers inside the loop:
;   rsi = source row        rax = src_pixels_per_line
;   rdi = destination row   rdx = output_pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    ;---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 0

    ;---- right half: output pixels 8..15 ----
    movdqu      xmm0, [rsi + 5]

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 771 |
; vp9_filter_block1d4_h8_avg_sse2
; Arguments as read below: arg(0) source pointer, arg(1) source pitch,
; arg(2) destination pointer, arg(3) destination pitch, arg(4) row count,
; arg(5) pointer to the eight 16-bit taps.
; Horizontal 8-tap filter, 4 pixels wide, averaging variant. Output pixel x
; of a row is built from source bytes x-3 .. x+4 of that row; the filtered
; bytes (+64, >> 7, saturated) are combined with the bytes already in the
; destination using pavgb before being stored.
; Each row is fetched with one unaligned 16-byte load starting 3 bytes
; before the source pointer.
; Registers inside the loop:
;   rsi = source row        rax = source pitch
;   rdi = destination row   rdx = destination pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 6
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movdqu      xmm0, [rsi - 3]             ;source bytes -3 .. +12

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm5, 5
    psrldq      xmm4, 4

    APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 6
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 836 |
; vp9_filter_block1d8_h8_avg_sse2
; Arguments as read below and by GET_FILTERS: arg(0) source pointer,
; arg(1) source pitch, arg(2) destination pointer, arg(3) destination
; pitch, arg(4) row count, arg(5) pointer to the eight 16-bit taps.
; Horizontal 8-tap filter, 8 pixels wide, averaging variant. Output pixel x
; of a row is built from source bytes x-3 .. x+4 of that row; the filtered
; bytes (+64, >> 7, saturated) are combined with the bytes already in the
; destination using pavgb before being stored.
; Each row is fetched with one unaligned 16-byte load starting 3 bytes
; before the source pointer.
; Registers inside the loop:
;   rsi = source row        rax = source pitch
;   rdi = destination row   rdx = destination pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    movdqu      xmm0, [rsi - 3]             ;source bytes -3 .. +12

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 902 |
; vp9_filter_block1d16_h8_avg_sse2
; Arguments as read below and by GET_FILTERS: arg(0) source pointer,
; arg(1) source pitch, arg(2) destination pointer, arg(3) destination
; pitch, arg(4) row count, arg(5) pointer to the eight 16-bit taps.
; Horizontal 8-tap filter, 16 pixels wide (two 8-pixel halves per row),
; averaging variant. Output pixel x of a row is built from source bytes
; x-3 .. x+4 of that row; the filtered bytes (+64, >> 7, saturated) are
; combined with the bytes already in the destination using pavgb before
; being stored.
; Each half is fetched with one unaligned 16-byte load: 3 bytes before the
; source pointer for the left half and 5 bytes after it (= 8 - 3) for the
; right half.
; Registers inside the loop:
;   rsi = source row        rax = source pitch
;   rdi = destination row   rdx = destination pitch
;   rcx = rows left
; A row count that is zero (or negative once sign-extended) returns without
; reading or writing any pixels.
global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 10
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]

    GET_FILTERS                             ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    test        rcx, rcx                    ;the loop is bottom-tested, so
    jle         .done                       ;an empty block must skip it

.loop:
    ;---- left half: output pixels 0..7 ----
    movdqu      xmm0, [rsi - 3]

    ;xmm<i> = the row shifted down by i bytes, i.e. the input for tap i
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 0

    ;---- right half: output pixels 8..15 ----
    movdqu      xmm0, [rsi + 5]

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4

    APPLY_FILTER_8 1, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    add         rsp, 16 * 10
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |