OLD | NEW |
(Empty) | |
;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;
| 10 |
| 11 |
| 12 %include "vpx_ports/x86_abi_support.asm" |
| 13 |
;-----------------------------------------------------------------------
; PROCESS_16X2X3 first
;
; Handles two consecutive 16-byte rows.  Each source row is compared
; (psadbw) against three reference candidates starting at [rdi],
; [rdi+1] and [rdi+2].  Reference rows are fetched with lddqu, so this
; variant places no alignment requirement on rdi.
;
;   %1   non-zero: the first row's results initialise xmm5/xmm6/xmm7
;        zero:     the first row's results are added to xmm5/xmm6/xmm7
;
;   In:  rsi = source row; read with movdqa, so rsi and rsi+rax must
;              both be 16-byte aligned
;        rax = source stride
;        rdi = reference row
;        rdx = reference stride
;   Out: xmm5/xmm6/xmm7 = running word-wise sums for candidates +0/+1/+2
;        rsi += 2*rax, rdi += 2*rdx
;   Clobbers: xmm0-xmm3
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        ; first row of the block: the SADs become the accumulators
        movdqa      xmm0, XMMWORD PTR [rsi]

        lddqu       xmm5, XMMWORD PTR [rdi]
        psadbw      xmm5, xmm0

        lddqu       xmm6, XMMWORD PTR [rdi+1]
        psadbw      xmm6, xmm0

        lddqu       xmm7, XMMWORD PTR [rdi+2]
        psadbw      xmm7, xmm0
%else
        ; later rows: compute into scratch registers, then accumulate
        movdqa      xmm0, XMMWORD PTR [rsi]

        lddqu       xmm1, XMMWORD PTR [rdi]
        psadbw      xmm1, xmm0
        paddw       xmm5, xmm1

        lddqu       xmm2, XMMWORD PTR [rdi+1]
        psadbw      xmm2, xmm0
        paddw       xmm6, xmm2

        lddqu       xmm3, XMMWORD PTR [rdi+2]
        psadbw      xmm3, xmm0
        paddw       xmm7, xmm3
%endif
        ; second row of the pair is always accumulated
        movdqa      xmm0, XMMWORD PTR [rsi+rax]

        lddqu       xmm1, XMMWORD PTR [rdi+rdx]
        psadbw      xmm1, xmm0
        paddw       xmm5, xmm1

        lddqu       xmm2, XMMWORD PTR [rdi+rdx+1]
        psadbw      xmm2, xmm0
        paddw       xmm6, xmm2

        lddqu       xmm3, XMMWORD PTR [rdi+rdx+2]
        psadbw      xmm3, xmm0
        paddw       xmm7, xmm3

        ; step both pointers past the two rows just consumed
        lea         rsi, [rsi+rax*2]
        lea         rdi, [rdi+rdx*2]
%endmacro
| 54 |
;-----------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, offset
;
; Same job as a two-row, three-candidate SAD step, but the reference
; rows are read with two aligned 16-byte loads ([rdi] and [rdi+16]) and
; the three candidates are carved out of that 32-byte window with
; palignr at byte offsets %2, %2+1 and %2+2.
;
;   %1   non-zero: the first row's results initialise xmm5/xmm6/xmm7
;        zero:     the first row's results are added to xmm5/xmm6/xmm7
;   %2   byte offset of candidate +0 inside the aligned window
;
;   In:  rsi = source row; rsi and rsi+rax must be 16-byte aligned
;        rax = source stride
;        rdi = reference row; read with movdqa, so rdi and rdi+rdx
;              must be 16-byte aligned
;        rdx = reference stride
;   Out: xmm5/xmm6/xmm7 = running word-wise sums for candidates +0/+1/+2
;        rsi += 2*rax, rdi += 2*rdx
;   Clobbers: xmm0-xmm4
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        ; first row of the block: results land directly in xmm5-xmm7
        movdqa      xmm0, XMMWORD PTR [rsi]
        movdqa      xmm4, XMMWORD PTR [rdi]         ; low half of window
        movdqa      xmm7, XMMWORD PTR [rdi+16]      ; high half of window

        movdqa      xmm5, xmm7
        movdqa      xmm6, xmm7

        palignr     xmm5, xmm4, %2                  ; candidate +0
        palignr     xmm6, xmm4, (%2+1)              ; candidate +1
        palignr     xmm7, xmm4, (%2+2)              ; candidate +2

        psadbw      xmm5, xmm0
        psadbw      xmm6, xmm0
        psadbw      xmm7, xmm0
%else
        ; later rows: build candidates in xmm1-xmm3, then accumulate
        movdqa      xmm0, XMMWORD PTR [rsi]
        movdqa      xmm4, XMMWORD PTR [rdi]
        movdqa      xmm3, XMMWORD PTR [rdi+16]

        movdqa      xmm1, xmm3
        movdqa      xmm2, xmm3

        palignr     xmm1, xmm4, %2
        palignr     xmm2, xmm4, (%2+1)
        palignr     xmm3, xmm4, (%2+2)

        psadbw      xmm1, xmm0
        paddw       xmm5, xmm1

        psadbw      xmm2, xmm0
        paddw       xmm6, xmm2

        psadbw      xmm3, xmm0
        paddw       xmm7, xmm3
%endif
        ; second row of the pair is always accumulated
        movdqa      xmm0, XMMWORD PTR [rsi+rax]
        movdqa      xmm4, XMMWORD PTR [rdi+rdx]
        movdqa      xmm3, XMMWORD PTR [rdi+rdx+16]

        movdqa      xmm1, xmm3
        movdqa      xmm2, xmm3

        palignr     xmm1, xmm4, %2
        palignr     xmm2, xmm4, (%2+1)
        palignr     xmm3, xmm4, (%2+2)

        psadbw      xmm1, xmm0
        paddw       xmm5, xmm1

        psadbw      xmm2, xmm0
        paddw       xmm6, xmm2

        psadbw      xmm3, xmm0
        paddw       xmm7, xmm3

        ; step both pointers past the two rows just consumed
        lea         rsi, [rsi+rax*2]
        lea         rdi, [rdi+rdx*2]
%endmacro
| 116 |
;-----------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET offset, label_prefix
;
; Emits the 16-row code path "<label_prefix>_aligned_by_<offset>".
; The functions in this file jump here when (ref_ptr & 15) == offset,
; so subtracting the offset leaves rdi 16-byte aligned for the movdqa
; loads issued by PROCESS_16X2X3_OFFSET.
;
;   %1   misalignment of the reference pointer in bytes
;   %2   label prefix; "<%2>_store_off" must exist in the caller
;
; Runs eight two-row steps (16 rows) and then jumps to the caller's
; "<%2>_store_off" label with the sums in xmm5/xmm6/xmm7.
;-----------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub         rdi, %1

        ; rows 0-1 initialise the accumulators ...
        PROCESS_16X2X3_OFFSET 1, %1
        ; ... rows 2-15 add to them
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp         %2_store_off

%endmacro
| 134 |
;-----------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET offset, label_prefix
;
; Emits the 8-row code path "<label_prefix>_aligned_by_<offset>".
; The functions in this file jump here when (ref_ptr & 15) == offset,
; so subtracting the offset leaves rdi 16-byte aligned for the movdqa
; loads issued by PROCESS_16X2X3_OFFSET.
;
;   %1   misalignment of the reference pointer in bytes
;   %2   label prefix; "<%2>_store_off" must exist in the caller
;
; Runs four two-row steps (8 rows) and then jumps to the caller's
; "<%2>_store_off" label with the sums in xmm5/xmm6/xmm7.
;-----------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub         rdi, %1

        ; rows 0-1 initialise the accumulators ...
        PROCESS_16X2X3_OFFSET 1, %1
        ; ... rows 2-7 add to them
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp         %2_store_off

%endmacro
| 148 |
;-----------------------------------------------------------------------
;void vp9_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the sum of absolute differences between the 16x16 block at
; src_ptr and the three 16x16 blocks starting at ref_ptr, ref_ptr+1 and
; ref_ptr+2.  The three sums are written as 32-bit values to
; results[0], results[1] and results[2].  Nothing is returned in rax.
;
; Dispatch: (ref_ptr & 15) indexes a 16-entry table of 32-bit offsets,
; each relative to the do_jump label.  Alignments 0-14 use the
; aligned-load + palignr paths; alignment 15 uses the lddqu path, which
; falls through into store_off.
;
; Preconditions visible from the loads used:
;   - src_ptr and src_ptr+src_stride are read with movdqa and must be
;     16-byte aligned.
;   - For alignments 0-14 the reference rows are also read with movdqa
;     after rounding ref_ptr down once, so ref_stride must keep every
;     row at the same 16-byte alignment (i.e. be a multiple of 16).
;     NOTE(review): confirm all callers guarantee this.
;
; Register use after the prolog:
;   rsi = source row, rax = src_stride
;   rdi = reference row (results pointer from store_off on), rdx = ref_stride
;   rcx = dispatch target
;   xmm5/xmm6/xmm7 = accumulators for candidates +0/+1/+2
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x3_ssse3)
sym(vp9_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

    mov         rsi, arg(0) ;src_ptr
    mov         rdi, arg(2) ;ref_ptr

    ; rdx = ref_ptr & 15, the index into the jump table
    mov         rdx, 0xf
    and         rdx, rdi

    ; the table lives in the instruction stream, so hop over it
    jmp .vp9_sad16x16x3_ssse3_skiptable
.vp9_sad16x16x3_ssse3_jumptable:
    dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
    dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_skiptable:

    ; call/pop yields the run-time address of do_jump without needing
    ; an absolute relocation
    call .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_do_jump:
    pop         rcx                         ; get the address of do_jump
    mov         rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
    add         rax, rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable

    movsxd      rax, dword [rax + 4*rdx]    ; get the 32 bit offset from the jumptable
    add         rcx, rax

    ; rax/rdx are free again: load the strides the row macros expect
    movsxd      rax, dword ptr arg(1) ;src_stride
    movsxd      rdx, dword ptr arg(3) ;ref_stride

    jmp         rcx

    PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
    PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3

    ; alignment 15: candidate +2 would need bytes beyond the 32-byte
    ; aligned window, so this path uses unaligned loads instead
.vp9_sad16x16x3_ssse3_aligned_by_15:
    PROCESS_16X2X3 1
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0

.vp9_sad16x16x3_ssse3_store_off:
    mov         rdi, arg(4) ;Results

    ; psadbw leaves one partial sum per 64-bit half; fold the high half
    ; onto the low half and store the low 32 bits
    movq        xmm0, xmm5
    psrldq      xmm5, 8

    paddw       xmm0, xmm5
    movd        [rdi], xmm0
;-
    movq        xmm0, xmm6
    psrldq      xmm6, 8

    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0
;-
    movq        xmm0, xmm7
    psrldq      xmm7, 8

    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
| 261 |
;-----------------------------------------------------------------------
;void vp9_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes the sum of absolute differences between the 16-wide, 8-row
; block at src_ptr and the three same-sized blocks starting at ref_ptr,
; ref_ptr+1 and ref_ptr+2.  The three sums are written as 32-bit values
; to results[0], results[1] and results[2].  Nothing is returned in rax.
;
; Dispatch: (ref_ptr & 15) indexes a 16-entry table of 32-bit offsets,
; each relative to the do_jump label.  Alignments 0-14 use the
; aligned-load + palignr paths; alignment 15 uses the lddqu path, which
; falls through into store_off.
;
; Preconditions visible from the loads used:
;   - src_ptr and src_ptr+src_stride are read with movdqa and must be
;     16-byte aligned.
;   - For alignments 0-14 the reference rows are also read with movdqa
;     after rounding ref_ptr down once, so ref_stride must keep every
;     row at the same 16-byte alignment (i.e. be a multiple of 16).
;     NOTE(review): confirm all callers guarantee this.
;
; Register use after the prolog:
;   rsi = source row, rax = src_stride
;   rdi = reference row (results pointer from store_off on), rdx = ref_stride
;   rcx = dispatch target
;   xmm5/xmm6/xmm7 = accumulators for candidates +0/+1/+2
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x3_ssse3)
sym(vp9_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

    mov         rsi, arg(0) ;src_ptr
    mov         rdi, arg(2) ;ref_ptr

    ; rdx = ref_ptr & 15, the index into the jump table
    mov         rdx, 0xf
    and         rdx, rdi

    ; the table lives in the instruction stream, so hop over it
    jmp .vp9_sad16x8x3_ssse3_skiptable
.vp9_sad16x8x3_ssse3_jumptable:
    dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
    dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_skiptable:

    ; call/pop yields the run-time address of do_jump without needing
    ; an absolute relocation
    call .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_do_jump:
    pop         rcx                         ; get the address of do_jump
    mov         rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
    add         rax, rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable

    movsxd      rax, dword [rax + 4*rdx]    ; get the 32 bit offset from the jumptable
    add         rcx, rax

    ; rax/rdx are free again: load the strides the row macros expect
    movsxd      rax, dword ptr arg(1) ;src_stride
    movsxd      rdx, dword ptr arg(3) ;ref_stride

    jmp         rcx

    PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
    PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3

    ; alignment 15: candidate +2 would need bytes beyond the 32-byte
    ; aligned window, so this path uses unaligned loads instead
.vp9_sad16x8x3_ssse3_aligned_by_15:

    PROCESS_16X2X3 1
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0
    PROCESS_16X2X3 0

.vp9_sad16x8x3_ssse3_store_off:
    mov         rdi, arg(4) ;Results

    ; psadbw leaves one partial sum per 64-bit half; fold the high half
    ; onto the low half and store the low 32 bits
    movq        xmm0, xmm5
    psrldq      xmm5, 8

    paddw       xmm0, xmm5
    movd        [rdi], xmm0
;-
    movq        xmm0, xmm6
    psrldq      xmm6, 8

    paddw       xmm0, xmm6
    movd        [rdi+4], xmm0
;-
    movq        xmm0, xmm7
    psrldq      xmm7, 8

    paddw       xmm0, xmm7
    movd        [rdi+8], xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
OLD | NEW |