;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Every (tap0, tap1) pair in bilinear_filters_ssse3 sums to 128, so a filtered
; value is brought back to pixel range by adding the rounding constant
; (xmm_bi_rd) and shifting right by this amount.
%define xmm_filter_shift            7


;void vp9_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows of 16 pixels, bilinearly filters the reference
; block (horizontal taps chosen by xoffset, vertical taps by yoffset),
; subtracts the matching 16 source pixels and accumulates
;     *sum        = sum of (filtered_ref - src)
;     *sumsquared = sum of (filtered_ref - src)^2
;
; Four code paths are selected from the offsets:
;     xoffset != 0, yoffset != 0 : horizontal pass then vertical pass
;     xoffset == 0, yoffset != 0 : vertical pass only   (sp_only)
;     xoffset != 0, yoffset == 0 : horizontal pass only (fp_only)
;     xoffset == 0, yoffset == 0 : plain difference     (full_pixel)
;
; Memory read per call:
;     - paths with a horizontal pass load 16 bytes from [ref+1], i.e. 17 bytes
;       per reference row;
;     - paths with a vertical pass read Height+1 reference rows.
;
; Accumulators (live for the whole function):
;     xmm6 = 8 x int16 partial sums of differences
;     xmm7 = 4 x int32 partial sums of squared differences
; Each row adds two differences (magnitude <= 255) to every 16-bit lane of
; xmm6, so Height is assumed small enough that these lanes do not overflow.
;
; Arguments are accessed through the arg(n) macro from x86_abi_support.asm.
;
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
global sym(vp9_filter_block2d_bil_var_ssse3)
sym(vp9_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                 ; sum accumulator
        pxor            xmm7,           xmm7                 ; sum-of-squares accumulator

        ; The row loops below are do-while loops (sub rcx, 1 / jnz), so a
        ; Height of 0 would wrap the counter. Report zero sums instead.
        cmp             dword ptr arg(4), 0                  ;Height
        je              .filter_block2d_bil_variance

        lea             rcx,            [GLOBAL(bilinear_filters_ssse3)]
        movsxd          rax,            dword ptr arg(5)     ; xoffset

        test            rax,            rax                  ; skip first_pass filter if xoffset=0
        je              .filter_block2d_bil_var_ssse3_sp_only

        shl             rax,            4                    ; point to filter coeff with xoffset (16 bytes per entry)
        lea             rax,            [rax + rcx]          ; HFilter

        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; skip second_pass filter if yoffset=0
        je              .filter_block2d_bil_var_ssse3_fp_only

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height

        ; Horizontally filter the first reference row; it is the "previous
        ; row" for the first vertical filter step.
        movdqu          xmm0,           XMMWORD PTR [rsi]
        movdqu          xmm1,           XMMWORD PTR [rsi+1]
        movdqa          xmm2,           xmm0

        punpcklbw       xmm0,           xmm1                 ; interleave p[i], p[i+1] to match the tap pairs
        punpckhbw       xmm2,           xmm1
        pmaddubsw       xmm0,           [rax]
        pmaddubsw       xmm2,           [rax]

        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm0,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        packuswb        xmm0,           xmm2                 ; xmm0 = filtered previous row (16 bytes)

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
        lea             rsi,            [rsi + r8]
%endif

        pxor            xmm4,           xmm4                 ; zero for byte->word unpack (loop invariant)

.filter_block2d_bil_var_ssse3_loop:
        ; first pass: horizontal filter of the current reference row
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift
        packuswb        xmm1,           xmm3                 ; xmm1 = filtered current row

        ; second pass: vertical filter between previous and current row
        movdqa          xmm2,           xmm0
        movdqa          xmm0,           xmm1                 ; current row becomes previous row for next iteration
        movdqa          xmm3,           xmm2

        punpcklbw       xmm2,           xmm1
        punpckhbw       xmm3,           xmm1
        pmaddubsw       xmm2,           [rdx]
        pmaddubsw       xmm3,           [rdx]

        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm2,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        ; widen 16 source pixels to words
        movq            xmm1,           QWORD PTR [rdi]
        punpcklbw       xmm1,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        ; accumulate difference and squared difference
        psubw           xmm2,           xmm1
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm2
        paddw           xmm6,           xmm3
        pmaddwd         xmm2,           xmm2
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm2
        paddd           xmm7,           xmm3

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rsi,            [rsi + r8]
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_var_ssse3_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
        ; xoffset == 0: vertical filter only. rcx still holds the filter table base.
        movsxd          rdx,            dword ptr arg(6)     ; yoffset

        test            rdx,            rdx                  ; Both xoffset =0 and yoffset=0
        je              .filter_block2d_bil_var_ssse3_full_pixel

        shl             rdx,            4
        lea             rdx,            [rdx + rcx]          ; VFilter

        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line

        movdqu          xmm1,           XMMWORD PTR [rsi]    ; xmm1 = previous reference row

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

        lea             rsi,            [rsi + rax]

        pxor            xmm4,           xmm4                 ; zero for byte->word unpack (loop invariant)

.filter_block2d_bil_sp_only_loop:
        movdqu          xmm3,           XMMWORD PTR [rsi]    ; xmm3 = current reference row
        movdqa          xmm2,           xmm1
        movdqa          xmm0,           xmm3                 ; keep current row; it is the next previous row

        punpcklbw       xmm1,           xmm3
        punpckhbw       xmm2,           xmm3
        pmaddubsw       xmm1,           [rdx]
        pmaddubsw       xmm2,           [rdx]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm2,           xmm_filter_shift

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        movdqa          xmm1,           xmm0
        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line

%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_sp_only_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
        ; xoffset == 0 and yoffset == 0: no filtering, plain ref - src.
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
        pxor            xmm0,           xmm0                 ; zero for byte->word unpack

.filter_block2d_bil_full_pixel_loop:
        movq            xmm1,           QWORD PTR [rsi]
        punpcklbw       xmm1,           xmm0
        movq            xmm2,           QWORD PTR [rsi+8]
        punpcklbw       xmm2,           xmm0

        movq            xmm3,           QWORD PTR [rdi]
        punpcklbw       xmm3,           xmm0
        movq            xmm4,           QWORD PTR [rdi+8]
        punpcklbw       xmm4,           xmm0

        psubw           xmm1,           xmm3
        psubw           xmm2,           xmm4
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm2
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm2,           xmm2
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm2

        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
        sub             rcx,            1
        jnz             .filter_block2d_bil_full_pixel_loop

        jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
        ; yoffset == 0: horizontal filter only. rax already points at HFilter.
        mov             rsi,            arg(0)               ;ref_ptr
        mov             rdi,            arg(2)               ;src_ptr
        movsxd          rcx,            dword ptr arg(4)     ;Height
        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line

%if ABI_IS_32BIT=0
        movsxd          r9,             dword ptr arg(3)     ;src_pixels_per_line
%endif

        pxor            xmm4,           xmm4                 ; zero for byte->word unpack (loop invariant)

.filter_block2d_bil_fp_only_loop:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        movdqa          xmm3,           xmm1

        punpcklbw       xmm1,           xmm2
        punpckhbw       xmm3,           xmm2
        pmaddubsw       xmm1,           [rax]
        pmaddubsw       xmm3,           [rax]

        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
        psraw           xmm1,           xmm_filter_shift
        psraw           xmm3,           xmm_filter_shift

        movq            xmm2,           QWORD PTR [rdi]
        punpcklbw       xmm2,           xmm4
        movq            xmm5,           QWORD PTR [rdi+8]
        punpcklbw       xmm5,           xmm4

        psubw           xmm1,           xmm2
        psubw           xmm3,           xmm5
        paddw           xmm6,           xmm1
        paddw           xmm6,           xmm3
        pmaddwd         xmm1,           xmm1
        pmaddwd         xmm3,           xmm3
        paddd           xmm7,           xmm1
        paddd           xmm7,           xmm3

        lea             rsi,            [rsi + rdx]
%if ABI_IS_32BIT
        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
%else
        lea             rdi,            [rdi + r9]
%endif

        sub             rcx,            1
        jnz             .filter_block2d_bil_fp_only_loop

        ; falls through to the reduction below

.filter_block2d_bil_variance:
        ; Horizontal reduction of the accumulators:
        ;   xmm6 (8 x int16) -> one int32 in the low dword of xmm0
        ;   xmm7 (4 x int32) -> one int32 in the low dword of xmm6
        pxor        xmm0,           xmm0
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5

        ; place each word in the high half of a dword, then arithmetic
        ; shift right by 16 to sign-extend it to 32 bits
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1
        movdqa      xmm1,           xmm0

        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5
        punpckhdq   xmm7,           xmm5
        paddd       xmm6,           xmm7

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7
        paddd       xmm0,           xmm1

        mov         rsi,            arg(7) ;[Sum]
        mov         rdi,            arg(8) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding constant added before the shift by xmm_filter_shift:
; 64 in each of 8 word lanes.
align 16
xmm_bi_rd:
    times 8 dw 64
; Bilinear tap pairs for pmaddubsw, one 16-byte entry per offset (0..15).
; Entry k holds the byte pair (128 - 8*k, 8*k) repeated 8 times and is
; addressed as bilinear_filters_ssse3 + (offset << 4).
; Entry 0 (128, 0) is kept only so the indexing stays direct: the code takes
; separate unfiltered paths when an offset is 0.
align 16
bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 120, 8
    times 8 db 112, 16
    times 8 db 104, 24
    times 8 db  96, 32
    times 8 db  88, 40
    times 8 db  80, 48
    times 8 db  72, 56
    times 8 db  64, 64
    times 8 db  56, 72
    times 8 db  48, 80
    times 8 db  40, 88
    times 8 db  32, 96
    times 8 db  24, 104
    times 8 db  16, 112
    times 8 db   8, 120