;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow (the k3k4 product is the last one accumulated in the APPLY macros).

;-----------------------------------------------------------------------
; HIGH_GET_FILTERS_4
; Set-up for the 4-sample-wide filters.
;   * Loads eight 16-bit taps from the pointer in arg(5) with movdqa, so
;     that pointer must be 16-byte aligned.
;   * Stores the taps as interleaved word pairs in k0k6, k2k5, k3k4, k1k7.
;   * Stores 64 in every dword of krd (rounding term for the >> 7).
;   * Stores (1 << bps) - 1 in every word of max and 0 in min, where bps
;     is the DWORD read from arg(6).
; The invoking function must %define k0k6, k2k5, k3k4, k1k7, krd, max and
; min as 16-byte-aligned memory operands (all are written with movdqa).
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x00000040             ;rounding value

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm6                  ;k0 k6 k0 k6 ...
    punpcklwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpcklwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    punpcklwd   xmm1, xmm7                  ;k1 k7 k1 k7 ...

    movdqa      k0k6, xmm0
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3
    movdqa      k1k7, xmm1

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast low dword
    movdqa      krd, xmm6

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;1 in each word of the low dword
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)

%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters four 16-bit samples and writes 8 bytes to [rdi].
; In:   xmm0..xmm7 = samples for taps 0..7 in their low four words
;       rdi        = destination
;       k0k6, k1k7, k2k5, k3k4, krd, max, min as stored by
;       HIGH_GET_FILTERS_4
;       %1         = non-zero to average (pavgw) the result with the four
;                    words already at [rdi] before storing
; Result per sample: clamp((sum of tap products + krd) >> 7, min, max).
; Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm6                  ;two row in one register
    punpcklwd   xmm1, xmm7
    punpcklwd   xmm2, xmm5
    punpcklwd   xmm3, xmm4

    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
    pmaddwd     xmm1, k1k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4

    paddd       xmm0, xmm1                  ;sum
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ;k3k4 product is added last

    paddd       xmm0, krd                   ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------
; HIGH_GET_FILTERS
; Set-up for the 8- and 16-sample-wide filters.
;   * Loads rsi with arg(0) (source) and rdi with arg(2) (destination).
;   * Loads eight 16-bit taps from the pointer in arg(5) with movdqa, so
;     that pointer must be 16-byte aligned.
;   * Stores the taps as interleaved word pairs in k0k1, k6k7, k2k5, k3k4.
;   * Stores 64 in every dword of krd (rounding term for the >> 7).
;   * Stores (1 << bps) - 1 in every word of max and 0 in min, where bps
;     is the DWORD read from arg(6).
; The invoking function must %define k0k1, k6k7, k2k5, k3k4, krd, max and
; min as 16-byte-aligned memory operands (all are written with movdqa).
; Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding value

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high qword
    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high qword
    punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
    punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
    punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...

    movdqa      k0k1, xmm0                  ;store filter factors on stack
    movdqa      k6k7, xmm6
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast low dword
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001             ;1 in each word of the low dword
    movsxd      rcx, DWORD PTR arg(6)       ;bps
    movq        xmm0, rdx
    movq        xmm1, rcx                   ;shift count for psllw
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;1 << bps
    psubw       xmm0, xmm2                  ;(1 << bps) - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)
%endm

;-----------------------------------------------------------------------
; LOAD_VERT_8 offset
; Loads 16 bytes from each of eight consecutive source rows into
; xmm0..xmm7 (row n -> xmmn), starting %1 bytes into the row at rsi.
; In:   rsi = first row, rax = row pitch in bytes,
;       rdx = 3 * rax (the vertical functions in this file set it up so)
;       %1  = byte offset within each row
; Out:  rsi is advanced by one row (rsi += rax).
;-----------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;0
    movdqu      xmm1, [rsi + rax + %1]      ;1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;remaining rows are relative to row 1
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    movdqu      xmm2, [rsi + rax + %1]      ;2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    movdqu      xmm4, [rsi + rdx + %1]      ;4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, offset
; Filters eight 16-bit samples and writes 16 bytes to [rdi + %2].
; In:   xmm0..xmm7 = samples for taps 0..7, eight words each
;       rdi        = destination
;       k0k1, k6k7, k2k5, k3k4, krd, max, min as stored by
;       HIGH_GET_FILTERS; temp = 16-byte scratch slot
;       %1         = non-zero to average (pavgw) the result with the eight
;                    words already at [rdi + %2] before storing
;       %2         = byte offset added to rdi
; Result per sample: clamp((sum of tap products + krd) >> 7, min, max).
; Clobbers: xmm0-xmm7, temp.
;-----------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqu      temp, xmm4                  ;spill tap-4 samples
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps 0/1, low four samples
    punpckhwd   xmm4, xmm1                  ;taps 0/1, high four samples
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps 6/7, low
    punpckhwd   xmm1, xmm7                  ;taps 6/7, high
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps 2/5, low
    punpckhwd   xmm7, xmm5                  ;taps 2/5, high

    movdqu      xmm5, temp                  ;reload tap-4 samples
    movdqu      temp, xmm4                  ;spill taps 0/1 high
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps 3/4, low
    punpckhwd   xmm4, xmm5                  ;taps 3/4, high
    movdqu      xmm5, temp                  ;taps 0/1 high

    pmaddwd     xmm0, k0k1
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    paddd       xmm0, xmm6
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3                  ;k3k4 product is added last
    paddd       xmm5, xmm1
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi + %2], xmm0
%endm

;void vp9_high_filter_block1d4_v8_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 4 samples per row. Output row n is computed from
;source rows n .. n+7 counted from src_ptr.
global sym(vp9_high_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one row for the next pass
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d8_v8_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 8 samples per row. Output row n is computed from
;source rows n .. n+7 counted from src_ptr.
global sym(vp9_high_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d16_v8_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 16 samples per row, processed as two 8-sample
;halves. Output row n is computed from source rows n .. n+7 counted from
;src_ptr.
global sym(vp9_high_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the advance for the right half

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d4_v8_avg_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples (read and written)
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 4 samples per row. Each clamped result is averaged
;(pavgw) with the sample already in the destination before it is stored.
global sym(vp9_high_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one row for the next pass
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d8_v8_avg_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples (read and written)
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 8 samples per row. Each clamped result is averaged
;(pavgw) with the sample already in the destination before it is stored.
global sym(vp9_high_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0                           ;advances rsi by one row
    HIGH_APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d16_v8_avg_sse2
;(
;    src_ptr,        arg(0): source, 16-bit samples
;    src_pitch,      arg(1): read as DWORD, in samples (doubled to get bytes)
;    output_ptr,     arg(2): destination, 16-bit samples (read and written)
;    out_pitch,      arg(3): read as DWORD, in samples (doubled to get bytes)
;    output_height,  arg(4): read as DWORD, row count; must be non-zero
;                            (the loop decrements before it tests)
;    filter,         arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps             arg(6): read as DWORD; results are clamped to
;                            [0, (1 << bps) - 1]
;)
;Vertical 8-tap filter, 16 samples per row, processed as two 8-sample
;halves. Each clamped result is averaged (pavgw) with the sample already
;in the destination before it is stored.
global sym(vp9_high_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 * bytes per line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0                           ;left half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;undo the advance for the right half

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d4_h8_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 4 samples per row. Output sample x uses source
;samples x-3 .. x+4; each row reads 16 bytes at src - 6 and at src + 2.
global sym(vp9_high_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm4, [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift so that xmmN starts at the sample for tap N
    psrldq      xmm1, 2
    psrldq      xmm6, 4
    psrldq      xmm7, 6
    psrldq      xmm2, 4
    psrldq      xmm3, 6
    psrldq      xmm5, 2

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d8_h8_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 8 samples per row. Output sample x uses source
;samples x-3 .. x+4; each row reads bytes src - 6 .. src + 23.
global sym(vp9_high_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d16_h8_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 16 samples per row, processed as two 8-sample
;halves. Output sample x uses source samples x-3 .. x+4; each row reads
;bytes src - 6 .. src + 39.
global sym(vp9_high_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    ;second half: same taps, 16 bytes (8 samples) further along the row
    movdqu      xmm0, [rsi + 10]            ;load src
    movdqu      xmm1, [rsi + 12]
    movdqu      xmm2, [rsi + 14]
    movdqu      xmm3, [rsi + 16]
    movdqu      xmm4, [rsi + 18]
    movdqu      xmm5, [rsi + 20]
    movdqu      xmm6, [rsi + 22]
    movdqu      xmm7, [rsi + 24]

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d4_h8_avg_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples (read and written)
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 4 samples per row. Each clamped result is
;averaged (pavgw) with the sample already in the destination before it is
;stored.
global sym(vp9_high_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm4, [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift so that xmmN starts at the sample for tap N
    psrldq      xmm1, 2
    psrldq      xmm6, 4
    psrldq      xmm7, 6
    psrldq      xmm2, 4
    psrldq      xmm3, 6
    psrldq      xmm5, 2

    HIGH_APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d8_h8_avg_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples (read and written)
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 8 samples per row. Each clamped result is
;averaged (pavgw) with the sample already in the destination before it is
;stored.
global sym(vp9_high_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_high_filter_block1d16_h8_avg_sse2
;(
;    src_ptr,             arg(0): source, 16-bit samples
;    src_pixels_per_line, arg(1): read as DWORD, in samples (doubled to bytes)
;    output_ptr,          arg(2): destination, 16-bit samples (read and written)
;    output_pitch,        arg(3): read as DWORD, in samples (doubled to bytes)
;    output_height,       arg(4): read as DWORD, row count; must be non-zero
;                                 (the loop decrements before it tests)
;    filter,              arg(5): eight 16-bit taps, 16-byte aligned (movdqa)
;    bps                  arg(6): read as DWORD; results are clamped to
;                                 [0, (1 << bps) - 1]
;)
;Horizontal 8-tap filter, 16 samples per row, processed as two 8-sample
;halves. Each clamped result is averaged (pavgw) with the sample already
;in the destination before it is stored.
global sym(vp9_high_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_high_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS                        ;also loads rsi and rdi

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    ;second half: same taps, 16 bytes (8 samples) further along the row
    movdqu      xmm0, [rsi + 10]            ;load src
    movdqu      xmm1, [rsi + 12]
    movdqu      xmm2, [rsi + 14]
    movdqu      xmm3, [rsi + 16]
    movdqu      xmm4, [rsi + 18]
    movdqu      xmm5, [rsi + 20]
    movdqu      xmm6, [rsi + 22]
    movdqu      xmm7, [rsi + 24]

    HIGH_APPLY_FILTER_8 1, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp                         ;pairs with ALIGN_STACK above

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret