OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
| 14 %define BLOCK_HEIGHT_WIDTH 4 |
| 15 %define VP9_FILTER_WEIGHT 128 |
| 16 %define VP9_FILTER_SHIFT 7 |
| 17 |
| 18 |
;/************************************************************************************
; Notes: vp9_filter_block1d8_h6_sse2 applies a 6-tap filter horizontally to the
; input pixels and produces 8 output values per row. ONE row is computed per loop
; iteration so that a whole row is handled with 128-bit operations.
;
; Per row the routine reads bytes src_ptr[-2 .. 13], multiplies six byte-shifted
; copies by the 16-byte coefficient vectors at vp9_filter + 0/16/32/48/64/80,
; sums the products with saturating adds, adds the rounding vector `rd`, shifts
; right by VP9_FILTER_SHIFT, clamps to 0..255 and stores the 8 results as 16-bit
; words.
;
; Requirements visible in the code:
;   * the row is stored with movdqa, so output_ptr and output_width must keep
;     every destination row 16-byte aligned; output_width is added to the
;     pointer unscaled, i.e. it is used as a byte count;
;   * the loop counter is tested after the body, so output_height must be >= 1;
;   * pixel_step is not read.
; `rd` and the ABI helper macros are defined outside this section.
;*************************************************************************************/
;void vp9_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short          *vp9_filter
;)
global sym(vp9_filter_block1d8_h6_sse2)
sym(vp9_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                         ;vp9_filter
    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(1)                         ;output_ptr

    movsxd      rcx, dword ptr arg(4)               ;output_height (rows left)
    movsxd      rax, dword ptr arg(2)               ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)                ;output_width (destination step)
%endif
    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking

.h6_w8_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]          ; bytes src[-2 ..  5]
    movq        xmm1, MMWORD PTR [rsi + 6]          ; bytes src[ 6 .. 13]
    prefetcht2  [rsi + rax - 2]                     ; hint for the next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                          ; xmm1 = bytes src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: x[-2] * H[-2]
    punpcklbw   xmm3, xmm0                          ; words src[-2 .. 5]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: x[-1] * H[-1]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[-1 .. 6]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: x[0] * H[0]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[0 .. 7]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4: x[1] * H[1]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[1 .. 8]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5: x[2] * H[2]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[2 .. 9]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6: x[3] * H[3]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0                          ; words src[3 .. 10]
    pmullw      xmm1, XMMWORD PTR [rdx + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; clamp to 0..255
    punpcklbw   xmm4, xmm0                          ; back to 16-bit words

    movdqa      XMMWORD PTR [rdi], xmm4             ; aligned store of 8 words
    add         rsi, rax                            ; next source row

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)               ;[output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6_w8_rowloop                      ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 137 |
| 138 |
;void vp9_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short          *vp9_filter
;)
;/************************************************************************************
; Notes: vp9_filter_block1d16_h6_sse2 applies a 6-tap filter horizontally to the
; input pixels and produces 16 output values per row, as two groups of 8. ONE
; row is computed per loop iteration.
;
; Per row the routine reads bytes src_ptr[-2 .. 21]. Each group of 8 is built
; from six byte-shifted copies multiplied by the coefficient vectors at
; vp9_filter + 0/16/32/48/64/80, summed with saturating adds, rounded with `rd`,
; shifted right by VP9_FILTER_SHIFT, clamped to 0..255 and stored as 16-bit
; words at [output_ptr] and [output_ptr + 16].
;
; Requirements visible in the code:
;   * both stores use movdqa, so output_ptr and output_width must keep every
;     destination row 16-byte aligned; output_width is added unscaled (bytes);
;   * the loop counter is tested after the body, so output_height must be >= 1;
;   * pixel_step is not read.
; `rd` and the ABI helper macros are defined outside this section.
;*************************************************************************************/
global sym(vp9_filter_block1d16_h6_sse2)
sym(vp9_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                         ;vp9_filter
    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(1)                         ;output_ptr

    movsxd      rcx, dword ptr arg(4)               ;output_height (rows left)
    movsxd      rax, dword ptr arg(2)               ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)                ;output_width (destination step)
%endif

    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking

.h6_w16_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]          ; bytes src[-2 ..  5]
    movq        xmm1, MMWORD PTR [rsi + 6]          ; bytes src[ 6 .. 13]
    movq        xmm2, MMWORD PTR [rsi + 14]         ; bytes src[14 .. 21]
    prefetcht2  [rsi + rax - 2]                     ; hint for the next source row

    pslldq      xmm2, 8
    por         xmm2, xmm1                          ; xmm2 = bytes src[ 6 .. 21] (second half)

    pslldq      xmm1, 8
    por         xmm1, xmm3                          ; xmm1 = bytes src[-2 .. 13] (first half)

    ;---------------- outputs 0..7 ----------------
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: x[-2] * H[-2]
    punpcklbw   xmm3, xmm0                          ; words src[-2 .. 5]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: x[-1] * H[-1]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[-1 .. 6]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: x[0] * H[0]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[0 .. 7]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4: x[1] * H[1]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[1 .. 8]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5: x[2] * H[2]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[2 .. 9]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6: x[3] * H[3]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0                          ; words src[3 .. 10]
    pmullw      xmm1, XMMWORD PTR [rdx + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; clamp to 0..255
    punpcklbw   xmm4, xmm0                          ; back to 16-bit words

    movdqa      XMMWORD PTR [rdi], xmm4             ; outputs 0..7

    ;---------------- outputs 8..15 ----------------
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1
    punpcklbw   xmm3, xmm0                          ; words src[6 .. 13]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[7 .. 14]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[8 .. 15]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[9 .. 16]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[10 .. 17]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0                          ; words src[11 .. 18]
    pmullw      xmm2, XMMWORD PTR [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; clamp to 0..255
    punpcklbw   xmm4, xmm0                          ; back to 16-bit words

    movdqa      XMMWORD PTR [rdi + 16], xmm4        ; outputs 8..15

    add         rsi, rax                            ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)               ;[output_width]
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .h6_w16_rowloop                     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 314 |
| 315 |
;void vp9_filter_block1d8_v6_sse2
;(
;    short         *src_ptr,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   pixels_per_line,
;    unsigned int   pixel_step,
;    unsigned int   output_height,
;    unsigned int   output_width,
;    short         *vp9_filter
;)
;/************************************************************************************
; Notes: vp9_filter_block1d8_v6_sse2 applies a 6-tap filter vertically to rows of
; 16-bit samples and writes 8 bytes per output row.
;
; The source pointer is first moved back by two rows. For each output row the
; six consecutive source rows starting there are multiplied by the coefficient
; vectors at vp9_filter + 0/16/32/48/64/80, summed with saturating adds, rounded
; with `rd`, shifted right by VP9_FILTER_SHIFT and clamped to 0..255.
;
; Requirements visible in the code:
;   * pixels_per_line is added to the source pointer unscaled, i.e. it is the
;     distance between source rows in bytes;
;   * source rows are read with movdqa and must therefore be 16-byte aligned;
;   * the loop counter is tested after the body, so output_height must be >= 1;
;   * pixel_step and output_width are not read.
; `rd` and the ABI helper macros are defined outside this section.
;*************************************************************************************/
global sym(vp9_filter_block1d8_v6_sse2)
sym(vp9_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(7)                         ;vp9_filter
    movsxd      rdx, dword ptr arg(3)               ;pixels_per_line (source row step)

    mov         rdi, arg(1)                         ;output_ptr
    mov         rsi, arg(0)                         ;src_ptr

    sub         rsi, rdx                            ; start two rows above src_ptr
    sub         rsi, rdx

    movsxd      rcx, DWORD PTR arg(5)               ;[output_height] (rows left)
    pxor        xmm0, xmm0                          ; zero upper half for the final pack

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]      ; rounding vector, kept for the whole loop
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)                ;dst_pitch
%endif

.v6_w8_rowloop:
    movdqa      xmm1, XMMWORD PTR [rsi]             ; row 1
    pmullw      xmm1, [rax]

    movdqa      xmm2, XMMWORD PTR [rsi + rdx]       ; row 2
    pmullw      xmm2, [rax + 16]

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; row 3
    pmullw      xmm3, [rax + 32]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; row 5
    pmullw      xmm5, [rax + 64]

    add         rsi, rdx                            ; advance one row; rows 4 and 6 are
                                                    ; addressed relative to the new base
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2]   ; row 4
    pmullw      xmm4, [rax + 48]

    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4]   ; row 6
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7                          ; + rounding

    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm0                          ; pack and saturate to 0..255

    movq        QWORD PTR [rdi], xmm2               ; store 8 result bytes
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)               ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                                 ; decrement count
    jnz         .v6_w8_rowloop                      ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 409 |
| 410 |
;void vp9_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char  *output_ptr,
;    int             dst_pitch,
;    unsigned int    pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    const short    *vp9_filter
;)
;/************************************************************************************
; Notes: vp9_filter_block1d16_v6_sse2 applies a 6-tap filter vertically to rows of
; 16-bit samples and writes 16 bytes per output row.
;
; The source pointer is first moved back by two rows. For each output row the
; six consecutive source rows starting there (32 bytes of each) are multiplied
; by the coefficient vectors at vp9_filter + 0/16/32/48/64/80, summed with
; saturating adds, rounded with `rd`, shifted right by VP9_FILTER_SHIFT and
; clamped to 0..255.
;
; Requirements visible in the code:
;   * pixels_per_line is added to the source pointer unscaled, i.e. it is the
;     distance between source rows in bytes;
;   * source rows and the destination are accessed with movdqa and must
;     therefore be 16-byte aligned (dst_pitch must preserve that alignment);
;   * the loop counter is tested after the body, so output_height must be >= 1;
;   * pixel_step and output_width are not read.
; `rd` and the ABI helper macros are defined outside this section.
;*************************************************************************************/
global sym(vp9_filter_block1d16_v6_sse2)
sym(vp9_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(7)                         ;vp9_filter
    movsxd      rdx, dword ptr arg(3)               ;pixels_per_line (source row step)

    mov         rdi, arg(1)                         ;output_ptr
    mov         rsi, arg(0)                         ;src_ptr

    sub         rsi, rdx                            ; start two rows above src_ptr
    sub         rsi, rdx

    movsxd      rcx, DWORD PTR arg(5)               ;[output_height] (rows left)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(2)                ;dst_pitch
%endif

.v6_w16_rowloop:
    ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
    ; xmm1/xmm3/xmm5/xmm7 carry samples 0..7, xmm2/xmm4/xmm6/xmm0 samples 8..15.
    movdqa      xmm1, XMMWORD PTR [rsi + rdx]       ; line 2
    movdqa      xmm2, XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1, [rax + 16]
    pmullw      xmm2, [rax + 16]

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 4]   ; line 5
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3, [rax + 64]
    pmullw      xmm4, [rax + 64]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 2]   ; line 3
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5, [rax + 32]
    pmullw      xmm6, [rax + 32]

    movdqa      xmm7, XMMWORD PTR [rsi]             ; line 1
    movdqa      xmm0, XMMWORD PTR [rsi + 16]
    pmullw      xmm7, [rax]
    pmullw      xmm0, [rax]

    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6
    paddsw      xmm1, xmm7
    paddsw      xmm2, xmm0

    add         rsi, rdx                            ; advance one row; lines 4 and 6 are
                                                    ; addressed relative to the new base

    movdqa      xmm3, XMMWORD PTR [rsi + rdx * 2]   ; line 4
    movdqa      xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3, [rax + 48]
    pmullw      xmm4, [rax + 48]

    movdqa      xmm5, XMMWORD PTR [rsi + rdx * 4]   ; line 6
    movdqa      xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5, [rax + 80]
    pmullw      xmm6, [rax + 80]

    ; xmm7 is free again after the line-1 add, so it holds the rounding vector.
    ; xmm0 is not cleared here: nothing reads it before it is reloaded with
    ; line 1 on the next iteration, and it is not read after the loop.
    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]

    paddsw      xmm1, xmm3
    paddsw      xmm2, xmm4
    paddsw      xmm1, xmm5
    paddsw      xmm2, xmm6

    paddsw      xmm1, xmm7                          ; + rounding
    paddsw      xmm2, xmm7

    psraw       xmm1, VP9_FILTER_SHIFT
    psraw       xmm2, VP9_FILTER_SHIFT

    packuswb    xmm1, xmm2                          ; pack and saturate to 0..255
    movdqa      XMMWORD PTR [rdi], xmm1             ; store 16 result bytes
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(2)               ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                                 ; decrement count
    jnz         .v6_w16_rowloop                     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 524 |
| 525 |
;void vp9_filter_block1d8_h6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp9_filter
;)
; First-pass filter only when yoffset==0
;
; Applies a 6-tap horizontal filter and writes 8 result bytes per row. Per row
; the routine reads bytes src_ptr[-2 .. 13], multiplies six byte-shifted copies
; by the coefficient vectors at vp9_filter + 0/16/32/48/64/80, sums them with
; saturating adds, adds the rounding vector `rd`, shifts right by
; VP9_FILTER_SHIFT and clamps to 0..255. The loop counter is tested after the
; body, so output_height must be >= 1. `rd` and the ABI helper macros are
; defined outside this section.
global sym(vp9_filter_block1d8_h6_only_sse2)
sym(vp9_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(5)                         ;vp9_filter
    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(2)                         ;output_ptr

    movsxd      rcx, dword ptr arg(4)               ;output_height (rows left)
    movsxd      rax, dword ptr arg(1)               ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)                ;dst_pitch
%endif
    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking

.h6_only_w8_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]          ; bytes src[-2 ..  5]
    movq        xmm1, MMWORD PTR [rsi + 6]          ; bytes src[ 6 .. 13]
    prefetcht2  [rsi + rax - 2]                     ; hint for the next source row

    pslldq      xmm1, 8
    por         xmm1, xmm3                          ; xmm1 = bytes src[-2 .. 13]

    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: x[-2] * H[-2]
    punpcklbw   xmm3, xmm0                          ; words src[-2 .. 5]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: x[-1] * H[-1]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[-1 .. 6]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: x[0] * H[0]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[0 .. 7]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4: x[1] * H[1]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[1 .. 8]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5: x[2] * H[2]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[2 .. 9]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6: x[3] * H[3]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0                          ; words src[3 .. 10]
    pmullw      xmm1, XMMWORD PTR [rdx + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; clamp to 0..255

    movq        QWORD PTR [rdi], xmm4               ; store 8 result bytes
    add         rsi, rax                            ; next source row

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)               ;dst_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .h6_only_w8_rowloop                 ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 637 |
| 638 |
;void vp9_filter_block1d16_h6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp9_filter
;)
; First-pass filter only when yoffset==0
;
; Applies a 6-tap horizontal filter and writes 16 result bytes per row, as two
; groups of 8. Per row the routine reads bytes src_ptr[-2 .. 21]. Each group is
; built from six byte-shifted copies multiplied by the coefficient vectors at
; vp9_filter + 0/16/32/48/64/80, summed with saturating adds, rounded with `rd`,
; shifted right by VP9_FILTER_SHIFT and clamped to 0..255. The loop counter is
; tested after the body, so output_height must be >= 1. `rd` and the ABI helper
; macros are defined outside this section.
global sym(vp9_filter_block1d16_h6_only_sse2)
sym(vp9_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(5)                         ;vp9_filter
    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(2)                         ;output_ptr

    movsxd      rcx, dword ptr arg(4)               ;output_height (rows left)
    movsxd      rax, dword ptr arg(1)               ;src_pixels_per_line (source pitch)
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)                ;dst_pitch
%endif

    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking

.h6_only_w16_rowloop:
    movq        xmm3, MMWORD PTR [rsi - 2]          ; bytes src[-2 ..  5]
    movq        xmm1, MMWORD PTR [rsi + 6]          ; bytes src[ 6 .. 13]
    movq        xmm2, MMWORD PTR [rsi + 14]         ; bytes src[14 .. 21]
    prefetcht2  [rsi + rax - 2]                     ; hint for the next source row

    pslldq      xmm2, 8
    por         xmm2, xmm1                          ; xmm2 = bytes src[ 6 .. 21] (second half)

    pslldq      xmm1, 8
    por         xmm1, xmm3                          ; xmm1 = bytes src[-2 .. 13] (first half)

    ;---------------- outputs 0..7 ----------------
    movdqa      xmm4, xmm1
    movdqa      xmm5, xmm1
    movdqa      xmm6, xmm1
    movdqa      xmm7, xmm1

    ; Tap 1: x[-2] * H[-2]
    punpcklbw   xmm3, xmm0                          ; words src[-2 .. 5]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2: x[-1] * H[-1]
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[-1 .. 6]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3: x[0] * H[0]
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[0 .. 7]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4: x[1] * H[1]
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[1 .. 8]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5: x[2] * H[2]
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[2 .. 9]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6: x[3] * H[3]
    psrldq      xmm1, 5
    punpcklbw   xmm1, xmm0                          ; words src[3 .. 10]
    pmullw      xmm1, XMMWORD PTR [rdx + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm1
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; lower 8 bytes

    movq        QWORD PTR [rdi], xmm4               ; store outputs 0..7

    ;---------------- outputs 8..15 ----------------
    movdqa      xmm3, xmm2
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm2
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm2

    ; Tap 1
    punpcklbw   xmm3, xmm0                          ; words src[6 .. 13]
    pmullw      xmm3, XMMWORD PTR [rdx]

    ; Tap 2
    psrldq      xmm4, 1
    punpcklbw   xmm4, xmm0                          ; words src[7 .. 14]
    pmullw      xmm4, XMMWORD PTR [rdx + 16]

    ; Tap 3
    psrldq      xmm5, 2
    punpcklbw   xmm5, xmm0                          ; words src[8 .. 15]
    pmullw      xmm5, XMMWORD PTR [rdx + 32]

    ; Tap 4
    psrldq      xmm6, 3
    punpcklbw   xmm6, xmm0                          ; words src[9 .. 16]
    pmullw      xmm6, XMMWORD PTR [rdx + 48]

    ; Tap 5
    psrldq      xmm7, 4
    punpcklbw   xmm7, xmm0                          ; words src[10 .. 17]
    pmullw      xmm7, XMMWORD PTR [rdx + 64]

    ; Tap 6
    psrldq      xmm2, 5
    punpcklbw   xmm2, xmm0                          ; words src[11 .. 18]
    pmullw      xmm2, XMMWORD PTR [rdx + 80]

    paddsw      xmm4, xmm7
    paddsw      xmm4, xmm5
    paddsw      xmm4, xmm3
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm2
    paddsw      xmm4, [GLOBAL(rd)]                  ; + rounding

    psraw       xmm4, VP9_FILTER_SHIFT

    packuswb    xmm4, xmm0                          ; higher 8 bytes

    movq        QWORD PTR [rdi + 8], xmm4           ; store outputs 8..15

    add         rsi, rax                            ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)               ;dst_pitch
%else
    add         rdi, r8
%endif

    dec         rcx
    jnz         .h6_only_w16_rowloop                ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 802 |
| 803 |
;void vp9_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int            dst_pitch,
;    unsigned int   output_height,
;    const short   *vp9_filter
;)
; Second-pass filter only when xoffset==0
;
; Applies a 6-tap vertical filter directly to 8-bit source pixels and writes 8
; result bytes per row. For each output row, 8 bytes are read from each of six
; consecutive source rows beginning at the current source pointer (src_ptr is
; used as passed in; this routine does not move it back before the loop). The
; rows are widened to words, multiplied by the coefficient vectors at
; vp9_filter + 0/16/32/48/64/80, summed with saturating adds, rounded with `rd`,
; shifted right by VP9_FILTER_SHIFT and clamped to 0..255. The loop counter is
; tested after the body, so output_height must be >= 1. `rd` and the ABI helper
; macros are defined outside this section.
global sym(vp9_filter_block1d8_v6_only_sse2)
sym(vp9_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(2)                         ;output_ptr

    movsxd      rcx, dword ptr arg(4)               ;output_height (rows left)
    movsxd      rdx, dword ptr arg(1)               ;src_pixels_per_line (source pitch)

    mov         rax, arg(5)                         ;vp9_filter

    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking

    movdqa      xmm7, XMMWORD PTR [GLOBAL(rd)]      ; rounding vector, kept for the whole loop
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(3)                ;dst_pitch
%endif

.v6_only_w8_rowloop:
    movq        xmm1, MMWORD PTR [rsi]              ; row 1
    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]

    movq        xmm2, MMWORD PTR [rsi + rdx]        ; row 2
    punpcklbw   xmm2, xmm0
    pmullw      xmm2, [rax + 16]

    movq        xmm3, MMWORD PTR [rsi + rdx * 2]    ; row 3
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax + 32]

    movq        xmm5, MMWORD PTR [rsi + rdx * 4]    ; row 5
    punpcklbw   xmm5, xmm0
    pmullw      xmm5, [rax + 64]

    add         rsi, rdx                            ; advance one row; rows 4 and 6 are
                                                    ; addressed relative to the new base
    movq        xmm4, MMWORD PTR [rsi + rdx * 2]    ; row 4
    punpcklbw   xmm4, xmm0
    pmullw      xmm4, [rax + 48]

    movq        xmm6, MMWORD PTR [rsi + rdx * 4]    ; row 6
    punpcklbw   xmm6, xmm0
    pmullw      xmm6, [rax + 80]

    ; Saturating adds are order-sensitive; keep the sequence 2,5,3,1,4,6,round.
    paddsw      xmm2, xmm5
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    paddsw      xmm2, xmm6
    paddsw      xmm2, xmm7                          ; + rounding

    psraw       xmm2, VP9_FILTER_SHIFT
    packuswb    xmm2, xmm0                          ; pack and saturate to 0..255

    movq        QWORD PTR [rdi], xmm2               ; store 8 result bytes
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)               ;[dst_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx                                 ; decrement count
    jnz         .v6_only_w8_rowloop                 ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 896 |
| 897 |
;void vp9_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens 16 source bytes per row to 16-bit words (zero-extended) and stores
; them as two 128-bit vectors at [output_ptr] and [output_ptr + 16]. No
; filtering is applied. Both stores use movdqa, so every destination row must
; be 16-byte aligned; output_width is added to the destination pointer
; unscaled. The loop counter is tested after the body, so output_height must
; be >= 1.
global sym(vp9_unpack_block1d16_h6_sse2)
sym(vp9_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                         ;src_ptr
    mov         rdi, arg(1)                         ;output_ptr

    movsxd      rcx, dword ptr arg(3)               ;output_height (rows left)
    movsxd      rax, dword ptr arg(2)               ;src_pixels_per_line (source pitch)

    pxor        xmm0, xmm0                          ; zero, used for byte->word unpacking
%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(4)                ;output_width (destination step)
%endif

.unpack_w16_rowloop:
    movq        xmm1, MMWORD PTR [rsi]              ; bytes src[0 ..  7]
    movq        xmm3, MMWORD PTR [rsi + 8]          ; bytes src[8 .. 15]

    punpcklbw   xmm1, xmm0                          ; words src[0 ..  7]
    punpcklbw   xmm3, xmm0                          ; words src[8 .. 15]

    movdqa      XMMWORD PTR [rdi], xmm1
    movdqa      XMMWORD PTR [rdi + 16], xmm3

    add         rsi, rax                            ; next source row
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(4)               ;[output_width]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .unpack_w16_rowloop                 ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
| 953 |
| 954 |
| 955 ;void vp9_bilinear_predict16x16_sse2 |
| 956 ;( |
| 957 ; unsigned char *src_ptr, |
| 958 ; int src_pixels_per_line, |
| 959 ; int xoffset, |
| 960 ; int yoffset, |
| 961 ; unsigned char *dst_ptr, |
| 962 ; int dst_pitch |
| 963 ;) |
| 964 extern sym(vp9_bilinear_filters_mmx) |
| 965 global sym(vp9_bilinear_predict16x16_sse2) |
| 966 sym(vp9_bilinear_predict16x16_sse2): |
| 967 push rbp |
| 968 mov rbp, rsp |
| 969 SHADOW_ARGS_TO_STACK 6 |
| 970 SAVE_XMM 7 |
| 971 GET_GOT rbx |
| 972 push rsi |
| 973 push rdi |
| 974 ; end prolog |
| 975 |
| 976 ;const short *HFilter = bilinear_filters_mmx[xoffset] |
| 977 ;const short *VFilter = bilinear_filters_mmx[yoffset] |
| 978 |
| 979 lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))] |
| 980 movsxd rax, dword ptr arg(2) ;xoffset |
| 981 |
| 982 cmp rax, 0 ;skip first_pass filter if xoffset=0 |
| 983 je .b16x16_sp_only |
| 984 |
| 985 shl rax, 5 |
| 986 add rax, rcx ;HFilter |
| 987 |
| 988 mov rdi, arg(4) ;dst_ptr |
| 989 mov rsi, arg(0) ;src_ptr |
| 990 movsxd rdx, dword ptr arg(5) ;dst_pitch |
| 991 |
| 992 movdqa xmm1, [rax] |
| 993 movdqa xmm2, [rax+16] |
| 994 |
| 995 movsxd rax, dword ptr arg(3) ;yoffset |
| 996 |
| 997 cmp rax, 0 ;skip second_pass filter if yoffset=0 |
| 998 je .b16x16_fp_only |
| 999 |
| 1000 shl rax, 5 |
| 1001 add rax, rcx ;VFilter |
| 1002 |
| 1003 lea rcx, [rdi+rdx*8] |
| 1004 lea rcx, [rcx+rdx*8] |
| 1005 movsxd rdx, dword ptr arg(1) ;src_pixels_per_line |
| 1006 |
| 1007 pxor xmm0, xmm0 |
| 1008 |
| 1009 %if ABI_IS_32BIT=0 |
| 1010 movsxd r8, dword ptr arg(5) ;dst_pitch |
| 1011 %endif |
| 1012 ; get the first horizontal line done |
| 1013 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07
08 09 10 11 12 13 14 |
| 1014 movdqa xmm4, xmm3 ; make a copy of current li
ne |
| 1015 |
| 1016 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 |
| 1017 punpckhbw xmm4, xmm0 |
| 1018 |
| 1019 pmullw xmm3, xmm1 |
| 1020 pmullw xmm4, xmm1 |
| 1021 |
| 1022 movdqu xmm5, [rsi+1] |
| 1023 movdqa xmm6, xmm5 |
| 1024 |
| 1025 punpcklbw xmm5, xmm0 |
| 1026 punpckhbw xmm6, xmm0 |
| 1027 |
| 1028 pmullw xmm5, xmm2 |
| 1029 pmullw xmm6, xmm2 |
| 1030 |
| 1031 paddw xmm3, xmm5 |
| 1032 paddw xmm4, xmm6 |
| 1033 |
| 1034 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value |
| 1035 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 |
| 1036 |
| 1037 paddw xmm4, [GLOBAL(rd)] |
| 1038 psraw xmm4, VP9_FILTER_SHIFT |
| 1039 |
| 1040 movdqa xmm7, xmm3 |
| 1041 packuswb xmm7, xmm4 |
| 1042 |
| 1043 add rsi, rdx ; next line |
| 1044 .next_row: |
| 1045 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07
08 09 10 11 12 13 14 |
| 1046 movdqa xmm4, xmm3 ; make a copy of current li
ne |
| 1047 |
| 1048 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 |
| 1049 punpckhbw xmm4, xmm0 |
| 1050 |
| 1051 pmullw xmm3, xmm1 |
| 1052 pmullw xmm4, xmm1 |
| 1053 |
| 1054 movdqu xmm5, [rsi+1] |
| 1055 movdqa xmm6, xmm5 |
| 1056 |
| 1057 punpcklbw xmm5, xmm0 |
| 1058 punpckhbw xmm6, xmm0 |
| 1059 |
| 1060 pmullw xmm5, xmm2 |
| 1061 pmullw xmm6, xmm2 |
| 1062 |
| 1063 paddw xmm3, xmm5 |
| 1064 paddw xmm4, xmm6 |
| 1065 |
| 1066 movdqa xmm5, xmm7 |
| 1067 movdqa xmm6, xmm7 |
| 1068 |
| 1069 punpcklbw xmm5, xmm0 |
| 1070 punpckhbw xmm6, xmm0 |
| 1071 |
| 1072 pmullw xmm5, [rax] |
| 1073 pmullw xmm6, [rax] |
| 1074 |
| 1075 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value |
| 1076 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 |
| 1077 |
| 1078 paddw xmm4, [GLOBAL(rd)] |
| 1079 psraw xmm4, VP9_FILTER_SHIFT |
| 1080 |
| 1081 movdqa xmm7, xmm3 |
| 1082 packuswb xmm7, xmm4 |
| 1083 |
| 1084 pmullw xmm3, [rax+16] |
| 1085 pmullw xmm4, [rax+16] |
| 1086 |
| 1087 paddw xmm3, xmm5 |
| 1088 paddw xmm4, xmm6 |
| 1089 |
| 1090 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value |
| 1091 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 |
| 1092 |
| 1093 paddw xmm4, [GLOBAL(rd)] |
| 1094 psraw xmm4, VP9_FILTER_SHIFT |
| 1095 |
| 1096 packuswb xmm3, xmm4 |
| 1097 movdqa [rdi], xmm3 ; store the results in the
destination |
| 1098 |
| 1099 add rsi, rdx ; next line |
| 1100 %if ABI_IS_32BIT |
| 1101 add rdi, DWORD PTR arg(5) ;dst_pitch |
| 1102 %else |
| 1103 add rdi, r8 |
| 1104 %endif |
| 1105 |
| 1106 cmp rdi, rcx |
| 1107 jne .next_row |
| 1108 |
| 1109 jmp .done |
| 1110 |
| 1111 .b16x16_sp_only: |
| 1112 movsxd rax, dword ptr arg(3) ;yoffset |
| 1113 shl rax, 5 |
| 1114 add rax, rcx ;VFilter |
| 1115 |
| 1116 mov rdi, arg(4) ;dst_ptr |
| 1117 mov rsi, arg(0) ;src_ptr |
| 1118 movsxd rdx, dword ptr arg(5) ;dst_pitch |
| 1119 |
| 1120 movdqa xmm1, [rax] |
| 1121 movdqa xmm2, [rax+16] |
| 1122 |
| 1123 lea rcx, [rdi+rdx*8] |
| 1124 lea rcx, [rcx+rdx*8] |
| 1125 movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
| 1126 |
| 1127 pxor xmm0, xmm0 |
| 1128 |
| 1129 ; get the first horizontal line done |
| 1130 movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07
08 09 10 11 12 13 14 |
| 1131 |
| 1132 add rsi, rax ; next line |
| 1133 .next_row_spo: |
| 1134 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07
08 09 10 11 12 13 14 |
| 1135 |
| 1136 movdqa xmm5, xmm7 |
| 1137 movdqa xmm6, xmm7 |
| 1138 |
| 1139 movdqa xmm4, xmm3 ; make a copy of current li
ne |
| 1140 movdqa xmm7, xmm3 |
| 1141 |
| 1142 punpcklbw xmm5, xmm0 |
| 1143 punpckhbw xmm6, xmm0 |
| 1144 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 |
| 1145 punpckhbw xmm4, xmm0 |
| 1146 |
| 1147 pmullw xmm5, xmm1 |
| 1148 pmullw xmm6, xmm1 |
| 1149 pmullw xmm3, xmm2 |
| 1150 pmullw xmm4, xmm2 |
| 1151 |
| 1152 paddw xmm3, xmm5 |
| 1153 paddw xmm4, xmm6 |
| 1154 |
| 1155 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value |
| 1156 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 |
| 1157 |
| 1158 paddw xmm4, [GLOBAL(rd)] |
| 1159 psraw xmm4, VP9_FILTER_SHIFT |
| 1160 |
| 1161 packuswb xmm3, xmm4 |
| 1162 movdqa [rdi], xmm3 ; store the results in the
destination |
| 1163 |
| 1164 add rsi, rax ; next line |
| 1165 add rdi, rdx ;dst_pitch |
| 1166 cmp rdi, rcx |
| 1167 jne .next_row_spo |
| 1168 |
| 1169 jmp .done |
| 1170 |
| 1171 .b16x16_fp_only: |
| 1172 lea rcx, [rdi+rdx*8] |
| 1173 lea rcx, [rcx+rdx*8] |
| 1174 movsxd rax, dword ptr arg(1) ;src_pixels_per_line |
| 1175 pxor xmm0, xmm0 |
| 1176 |
| 1177 .next_row_fpo: |
| 1178 movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07
08 09 10 11 12 13 14 |
| 1179 movdqa xmm4, xmm3 ; make a copy of current li
ne |
| 1180 |
| 1181 punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 |
| 1182 punpckhbw xmm4, xmm0 |
| 1183 |
| 1184 pmullw xmm3, xmm1 |
| 1185 pmullw xmm4, xmm1 |
| 1186 |
| 1187 movdqu xmm5, [rsi+1] |
| 1188 movdqa xmm6, xmm5 |
| 1189 |
| 1190 punpcklbw xmm5, xmm0 |
| 1191 punpckhbw xmm6, xmm0 |
| 1192 |
| 1193 pmullw xmm5, xmm2 |
| 1194 pmullw xmm6, xmm2 |
| 1195 |
| 1196 paddw xmm3, xmm5 |
| 1197 paddw xmm4, xmm6 |
| 1198 |
| 1199 paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value |
| 1200 psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128 |
| 1201 |
| 1202 paddw xmm4, [GLOBAL(rd)] |
| 1203 psraw xmm4, VP9_FILTER_SHIFT |
| 1204 |
| 1205 packuswb xmm3, xmm4 |
| 1206 movdqa [rdi], xmm3 ; store the results in the
destination |
| 1207 |
| 1208 add rsi, rax ; next line |
| 1209 add rdi, rdx ; dst_pitch |
| 1210 cmp rdi, rcx |
| 1211 jne .next_row_fpo |
| 1212 |
| 1213 .done: |
| 1214 ; begin epilog |
| 1215 pop rdi |
| 1216 pop rsi |
| 1217 RESTORE_GOT |
| 1218 RESTORE_XMM |
| 1219 UNSHADOW_ARGS |
| 1220 pop rbp |
| 1221 ret |
| 1222 |
| 1223 |
;void vp9_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
;
; Two-pass, two-tap (bilinear) prediction of an 8x8 block.
;
;   pass 1 (horizontal): h[r][c] = (src[r][c]   * HFilter[0] +
;                                   src[r][c+1] * HFilter[1] + rd) >> VP9_FILTER_SHIFT
;   pass 2 (vertical):   dst[r][c] = sat_u8((h[r][c]   * VFilter[0] +
;                                            h[r+1][c] * VFilter[1] + rd) >> VP9_FILTER_SHIFT)
;
; HFilter/VFilter are the entries of vp9_bilinear_filters_mmx selected by
; xoffset/yoffset.  Each entry is addressed as 32 bytes: one 16-byte vector of
; multipliers for the first tap followed by one for the second tap.  The table
; is read with movdqa, so it must be 16-byte aligned.
;
; Memory touched:
;   reads  16 bytes from each of 9 consecutive source rows (only bytes 0..8 of
;          every row contribute to the result)
;   writes 8 bytes to each destination row, advancing by dst_pitch
;
; Register roles after setup:
;   rsi  = cursor into the on-stack copy of the source rows
;   rdi  = current destination row        rdx = dst_pitch
;   rcx  = dst_ptr + 8 * dst_pitch (loop end marker)
;   xmm0 = zero                           xmm1/xmm2 = HFilter taps
;   xmm5/xmm6 = VFilter taps              xmm7 = previous row after pass 1
;
; The stack pointer stays fixed while the rows are processed; the row buffer is
; released with an explicit add before the saved rsp is popped, so the frame
; teardown does not depend on how many times the loop ran.

%define BILINEAR8X8_ROW_BYTES   16                            ; one movdqa slot per source row
%define BILINEAR8X8_BUF_BYTES   (9 * BILINEAR8X8_ROW_BYTES)   ; 8 output rows need 9 input rows

extern sym(vp9_bilinear_filters_mmx)
global sym(vp9_bilinear_predict8x8_sse2)
sym(vp9_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The pre-alignment rsp is left on top of the stack by ALIGN_STACK; it is
    ; recovered with "pop rsp" once the row buffer has been released.
    ALIGN_STACK 16, rax
    sub         rsp,        BILINEAR8X8_BUF_BYTES           ; aligned scratch for 9 rows

    ;const short *HFilter = bilinear_filters_mmx[xoffset]
    ;const short *VFilter = bilinear_filters_mmx[yoffset]
    lea         rcx,        [GLOBAL(sym(vp9_bilinear_filters_mmx))]

    mov         rsi,        arg(0)                          ;src_ptr
    movsxd      rdx,        dword ptr arg(1)                ;src_pixels_per_line

    ; Copy the 9 (possibly unaligned) source rows into the aligned stack buffer
    ; so that the filter loop can use aligned loads.
    movdqu      xmm0,       [rsi]                           ; row 0
    lea         rax,        [rdx + rdx*2]                   ; rax = 3 * src stride
    movdqu      xmm1,       [rsi+rdx]                       ; row 1
    movdqu      xmm2,       [rsi+rdx*2]                     ; row 2
    add         rsi,        rax
    movdqu      xmm3,       [rsi]                           ; row 3
    movdqu      xmm4,       [rsi+rdx]                       ; row 4
    movdqu      xmm5,       [rsi+rdx*2]                     ; row 5
    add         rsi,        rax
    movdqu      xmm6,       [rsi]                           ; row 6
    movdqu      xmm7,       [rsi+rdx]                       ; row 7

    movdqa      XMMWORD PTR [rsp],                           xmm0

    movdqu      xmm0,       [rsi+rdx*2]                     ; row 8 (xmm0 is free again)

    movdqa      XMMWORD PTR [rsp + 1*BILINEAR8X8_ROW_BYTES], xmm1
    movdqa      XMMWORD PTR [rsp + 2*BILINEAR8X8_ROW_BYTES], xmm2
    movdqa      XMMWORD PTR [rsp + 3*BILINEAR8X8_ROW_BYTES], xmm3
    movdqa      XMMWORD PTR [rsp + 4*BILINEAR8X8_ROW_BYTES], xmm4
    movdqa      XMMWORD PTR [rsp + 5*BILINEAR8X8_ROW_BYTES], xmm5
    movdqa      XMMWORD PTR [rsp + 6*BILINEAR8X8_ROW_BYTES], xmm6
    movdqa      XMMWORD PTR [rsp + 7*BILINEAR8X8_ROW_BYTES], xmm7
    movdqa      XMMWORD PTR [rsp + 8*BILINEAR8X8_ROW_BYTES], xmm0

    movsxd      rax,        dword ptr arg(2)                ;xoffset
    shl         rax,        5                               ; 32 bytes per filter entry
    add         rax,        rcx                             ;HFilter

    mov         rdi,        arg(4)                          ;dst_ptr
    movsxd      rdx,        dword ptr arg(5)                ;dst_pitch

    movdqa      xmm1,       [rax]                           ; HFilter tap 0
    movdqa      xmm2,       [rax+16]                        ; HFilter tap 1

    movsxd      rax,        dword ptr arg(3)                ;yoffset
    shl         rax,        5                               ; 32 bytes per filter entry
    add         rax,        rcx                             ;VFilter

    lea         rcx,        [rdi+rdx*8]                     ; end marker: dst_ptr + 8 * dst_pitch

    movdqa      xmm5,       [rax]                           ; VFilter tap 0
    movdqa      xmm6,       [rax+16]                        ; VFilter tap 1

    pxor        xmm0,       xmm0                            ; zero, used to widen bytes to words

    ; Pass 1 on row 0; it has no row above it, so nothing is stored yet.
    movdqa      xmm3,       XMMWORD PTR [rsp]
    movdqa      xmm4,       xmm3                            ; make a copy of current line
    psrldq      xmm4,       1                               ; shift by one pixel

    punpcklbw   xmm3,       xmm0                            ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                            ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4

    paddw       xmm3,       [GLOBAL(rd)]                    ; xmm3 += round value
    psraw       xmm3,       VP9_FILTER_SHIFT                ; xmm3 /= 128

    movdqa      xmm7,       xmm3                            ; keep as "previous row"
    lea         rsi,        [rsp + BILINEAR8X8_ROW_BYTES]   ; cursor -> buffered row 1

.next_row8x8:
    ; Pass 1 on the current row.
    movdqa      xmm3,       XMMWORD PTR [rsi]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm4,       xmm3                            ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                            ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                            ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4
    pmullw      xmm7,       xmm5                            ; previous row * VFilter tap 0

    paddw       xmm3,       [GLOBAL(rd)]                    ; xmm3 += round value
    psraw       xmm3,       VP9_FILTER_SHIFT                ; xmm3 /= 128

    movdqa      xmm4,       xmm3                            ; save pass-1 result for the next iteration

    ; Pass 2: blend the previous and current pass-1 rows.
    pmullw      xmm3,       xmm6                            ; current row * VFilter tap 1
    paddw       xmm3,       xmm7

    movdqa      xmm7,       xmm4                            ; current row becomes "previous row"

    paddw       xmm3,       [GLOBAL(rd)]                    ; xmm3 += round value
    psraw       xmm3,       VP9_FILTER_SHIFT                ; xmm3 /= 128

    packuswb    xmm3,       xmm0                            ; saturate words to unsigned bytes
    movq        [rdi],      xmm3                            ; store 8 pixels in the destination

    add         rsi,        BILINEAR8X8_ROW_BYTES           ; next buffered source row
    add         rdi,        rdx                             ; next destination row

    cmp         rdi,        rcx
    jne         .next_row8x8

    add         rsp,        BILINEAR8X8_BUF_BYTES           ; release the row buffer
    pop         rsp                                         ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 1367 |
| 1368 |
SECTION_RODATA
align 16
; Rounding term added to every filter sum before the arithmetic shift right by
; VP9_FILTER_SHIFT: half of the filter weight, replicated in all 8 word lanes
; so it can be used directly as a paddw memory operand.
rd:
    times 8 dw (1 << (VP9_FILTER_SHIFT - 1))
OLD | NEW |