OLD | NEW |
(Empty) | |
| 1 ; |
| 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
| 3 ; |
| 4 ; Use of this source code is governed by a BSD-style license |
| 5 ; that can be found in the LICENSE file in the root of the source |
| 6 ; tree. An additional intellectual property rights grant can be found |
| 7 ; in the file PATENTS. All contributing project authors may |
| 8 ; be found in the AUTHORS file in the root of the source tree. |
| 9 ; |
| 10 |
| 11 %include "third_party/x86inc/x86inc.asm" |
| 12 |
SECTION_RODATA
; Rounding terms added before the final shift of the DC predictors.
; pw_4 / pw_8 are vectors of eight 16-bit words.
; pw_16 / pw_32 are vectors of four 32-bit dwords (they are consumed with
; paddd), even though their labels carry the pw_ prefix.
pw_4:  dw 4, 4, 4, 4, 4, 4, 4, 4
pw_8:  dw 8, 8, 8, 8, 8, 8, 8, 8
pw_16: dd 16, 16, 16, 16
pw_32: dd 32, 32, 32, 32
| 17 pw_32: times 4 dd 32 |
| 18 |
| 19 SECTION .text |
; high_dc_predictor_4x4(dst, stride, above, left)
; Fills a 4x4 block of 16-bit pixels with
;   (above[0..3] + left[0..3] + 4) >> 3
; Rows are written strideq*2 bytes apart, i.e. stride counts 16-bit pixels.
INIT_MMX sse
cglobal high_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; Both source rows must be loaded before DEFINE_ARGS renames the registers.
  movq                  m2, [leftq]          ; l0 l1 l2 l3
  movq                  m0, [aboveq]         ; a0 a1 a2 a3
  DEFINE_ARGS dst, stride, one
  mov                 oned, 0x0001
  movd                  m3, oned
  pxor                  m1, m1               ; zero, used as pack filler
  paddw                 m0, m2               ; a[i] + l[i]
  pshufw                m3, m3, 0x0          ; 1 in every word lane
  pmaddwd               m0, m3               ; two dword partial sums
  packssdw              m0, m1               ; partial sums back in words 0-1
  pmaddwd               m0, m3               ; dword 0 = sum of all 8 inputs
  paddw                 m0, [GLOBAL(pw_4)]   ; rounding term
  psraw                 m0, 3                ; divide by 8
  pshufw                m0, m0, 0x0          ; broadcast the DC value

  ; Rows 0-1, then advance two rows and write rows 2-3.
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0

  RESTORE_GOT
  RET
| 46 |
; high_dc_predictor_8x8(dst, stride, above, left)
; Fills an 8x8 block of 16-bit pixels with
;   (above[0..7] + left[0..7] + 8) >> 4
; above and left are read with aligned 16-byte loads.
INIT_XMM sse2
cglobal high_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; Load both source rows before DEFINE_ARGS reuses their registers.
  mova                  m2, [leftq]
  mova                  m0, [aboveq]
  DEFINE_ARGS dst, stride, stride3, one
  lea             stride3q, [strideq*3]
  mov                 oned, 0x00010001
  movd                  m3, oned
  pxor                  m1, m1               ; zero, used as pack filler
  paddw                 m0, m2               ; a[i] + l[i]
  pshufd                m3, m3, 0x0          ; 1 in every word lane
  ; Three multiply-add / pack rounds fold 8 words into a single sum.
  pmaddwd               m0, m3               ; 4 dword partial sums
  packssdw              m0, m1
  pmaddwd               m0, m3               ; 2 dword partial sums
  packssdw              m0, m1
  pmaddwd               m0, m3               ; dword 0 = total
  paddw                 m0, [GLOBAL(pw_8)]   ; rounding term
  psrlw                 m0, 4                ; divide by 16
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0               ; DC value in all 8 lanes

  ; Rows 0-3.
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  ; Rows 4-7.
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0

  RESTORE_GOT
  RET
| 81 |
; high_dc_predictor_16x16(dst, stride, above, left)
; Fills a 16x16 block of 16-bit pixels with
;   (above[0..15] + left[0..15] + 16) >> 5
INIT_XMM sse2
cglobal high_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; All four source vectors are loaded before DEFINE_ARGS renames registers.
  mova                  m0, [aboveq]
  mova                  m2, [leftq]
  mova                  m3, [aboveq+16]
  mova                  m4, [leftq+16]
  DEFINE_ARGS dst, stride, stride3, lines4
  mov              lines4d, 4                ; 4 iterations x 4 rows
  lea             stride3q, [strideq*3]
  pxor                  m1, m1               ; zero for widening unpacks

  ; Vertical adds: each word lane holds the sum of 4 inputs.
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  ; Horizontal reduction; widen to dwords after the first fold.
  movhlps               m2, m0
  paddw                 m0, m2
  punpcklwd             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2               ; dword 0 = total
  paddd                 m0, [GLOBAL(pw_16)]  ; rounding term (dword lanes)
  psrad                 m0, 5                ; divide by 32
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0               ; DC value in all 8 lanes

.loop:
  ; Four rows per pass, two 16-byte stores per row.
  mova   [dstq              ], m0
  mova   [dstq           +16], m0
  mova   [dstq+strideq*2    ], m0
  mova   [dstq+strideq*2 +16], m0
  mova   [dstq+strideq*4    ], m0
  mova   [dstq+strideq*4 +16], m0
  mova   [dstq+stride3q*2   ], m0
  mova   [dstq+stride3q*2+16], m0
  lea                 dstq, [dstq+strideq*8]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
| 124 |
| 125 %if ARCH_X86_64 |
; high_dc_predictor_32x32(dst, stride, above, left)
; Fills a 32x32 block of 16-bit pixels with
;   (above[0..31] + left[0..31] + 32) >> 6
; Uses m8, hence the surrounding ARCH_X86_64 guard.
INIT_XMM sse2
cglobal high_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset
  GET_GOT     goffsetq

  ; All eight source vectors are loaded before DEFINE_ARGS renames registers.
  mova                  m0, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  mova                  m5, [leftq]
  mova                  m6, [leftq+16]
  mova                  m7, [leftq+32]
  mova                  m8, [leftq+48]
  DEFINE_ARGS dst, stride, stride3, lines4
  mov              lines4d, 8                ; 8 iterations x 4 rows
  lea             stride3q, [strideq*3]
  pxor                  m1, m1               ; zero for widening unpacks

  ; Vertical adds: each word lane holds the sum of 8 inputs.
  paddw                 m0, m2
  paddw                 m0, m3
  paddw                 m0, m4
  paddw                 m0, m5
  paddw                 m0, m6
  paddw                 m0, m7
  paddw                 m0, m8
  ; Horizontal reduction; widen to dwords after the first fold.
  movhlps               m2, m0
  paddw                 m0, m2
  punpcklwd             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2
  punpckldq             m0, m1
  movhlps               m2, m0
  paddd                 m0, m2               ; dword 0 = total
  paddd                 m0, [GLOBAL(pw_32)]  ; rounding term (dword lanes)
  psrad                 m0, 6                ; divide by 64
  pshuflw               m0, m0, 0x0
  punpcklqdq            m0, m0               ; DC value in all 8 lanes

.loop:
  ; Four rows per pass, four 16-byte stores per row.
  mova [dstq               ], m0
  mova [dstq            +16], m0
  mova [dstq            +32], m0
  mova [dstq            +48], m0
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2  +16], m0
  mova [dstq+strideq*2  +32], m0
  mova [dstq+strideq*2  +48], m0
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4  +16], m0
  mova [dstq+strideq*4  +32], m0
  mova [dstq+strideq*4  +48], m0
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m0
  mova [dstq+stride3q*2 +32], m0
  mova [dstq+stride3q*2 +48], m0
  lea                 dstq, [dstq+strideq*8]
  dec              lines4d
  jnz .loop

  RESTORE_GOT
  REP_RET
| 184 %endif |
| 185 |
; high_v_predictor_4x4(dst, stride, above)
; Vertical predictor: copies the four 16-bit pixels at above[] into each of
; the four destination rows (rows are strideq*2 bytes apart).
INIT_MMX sse
cglobal high_v_predictor_4x4, 3, 3, 1, dst, stride, above
  movq                  m0, [aboveq]         ; the row to replicate
  ; Rows 0-1.
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  ; Rows 2-3.
  lea                 dstq, [dstq+strideq*4]
  movq    [dstq          ], m0
  movq    [dstq+strideq*2], m0
  RET
| 195 |
; high_v_predictor_8x8(dst, stride, above)
; Vertical predictor: copies the eight 16-bit pixels at above[] into each of
; the eight destination rows. above and dst rows use aligned 16-byte accesses.
INIT_XMM sse2
cglobal high_v_predictor_8x8, 3, 3, 1, dst, stride, above
  mova                  m0, [aboveq]         ; the row to replicate
  ; aboveq is dead from here on; its register becomes stride3.
  DEFINE_ARGS dst, stride, stride3
  lea             stride3q, [strideq*3]
  ; Rows 0-3.
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  ; Rows 4-7.
  lea                 dstq, [dstq+strideq*8]
  mova   [dstq           ], m0
  mova   [dstq+strideq*2 ], m0
  mova   [dstq+strideq*4 ], m0
  mova   [dstq+stride3q*2], m0
  RET
| 211 |
; high_v_predictor_16x16(dst, stride, above)
; Vertical predictor: copies the sixteen 16-bit pixels at above[] (32 bytes)
; into each of the sixteen destination rows.
INIT_XMM sse2
cglobal high_v_predictor_16x16, 3, 4, 2, dst, stride, above
  mova                  m0, [aboveq]         ; pixels 0-7
  mova                  m1, [aboveq+16]      ; pixels 8-15
  ; aboveq is dead from here on; its register becomes stride3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  mov             nlines4d, 4                ; 4 iterations x 4 rows
  lea             stride3q, [strideq*3]
.loop:
  ; Row 0.
  mova    [dstq              ], m0
  mova    [dstq           +16], m1
  ; Row 1.
  mova    [dstq+strideq*2    ], m0
  mova    [dstq+strideq*2 +16], m1
  ; Row 2.
  mova    [dstq+strideq*4    ], m0
  mova    [dstq+strideq*4 +16], m1
  ; Row 3.
  mova    [dstq+stride3q*2   ], m0
  mova    [dstq+stride3q*2+16], m1
  lea                 dstq, [dstq+strideq*8]
  dec             nlines4d
  jnz .loop
  REP_RET
| 232 |
; high_v_predictor_32x32(dst, stride, above)
; Vertical predictor: copies the thirty-two 16-bit pixels at above[]
; (64 bytes) into each of the thirty-two destination rows.
INIT_XMM sse2
cglobal high_v_predictor_32x32, 3, 4, 4, dst, stride, above
  mova                  m0, [aboveq]         ; pixels  0-7
  mova                  m1, [aboveq+16]      ; pixels  8-15
  mova                  m2, [aboveq+32]      ; pixels 16-23
  mova                  m3, [aboveq+48]      ; pixels 24-31
  ; aboveq is dead from here on; its register becomes stride3.
  DEFINE_ARGS dst, stride, stride3, nlines4
  mov             nlines4d, 8                ; 8 iterations x 4 rows
  lea             stride3q, [strideq*3]
.loop:
  ; Row 0.
  mova [dstq               ], m0
  mova [dstq            +16], m1
  mova [dstq            +32], m2
  mova [dstq            +48], m3
  ; Row 1.
  mova [dstq+strideq*2     ], m0
  mova [dstq+strideq*2  +16], m1
  mova [dstq+strideq*2  +32], m2
  mova [dstq+strideq*2  +48], m3
  ; Row 2.
  mova [dstq+strideq*4     ], m0
  mova [dstq+strideq*4  +16], m1
  mova [dstq+strideq*4  +32], m2
  mova [dstq+strideq*4  +48], m3
  ; Row 3.
  mova [dstq+stride3q*2    ], m0
  mova [dstq+stride3q*2 +16], m1
  mova [dstq+stride3q*2 +32], m2
  mova [dstq+stride3q*2 +48], m3
  lea                 dstq, [dstq+strideq*8]
  dec             nlines4d
  jnz .loop
  REP_RET
| 263 |
; high_tm_predictor_4x4(dst, stride, above, left, bps)
; TrueMotion predictor for a 4x4 block of 16-bit pixels:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration.
INIT_MMX sse
cglobal high_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2]       ; top-left pixel in word 0
  movq                  m0, [aboveq]
  pshufw                m1, m1, 0x0          ; broadcast top-left
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  movd                  m3, oned
  movd                  m4, bpsd
  pshufw                m3, m3, 0x0          ; 1 in every word lane
  DEFINE_ARGS dst, stride, line, left
  mov                lineq, -2               ; counts up to 0, two rows per step
  mova                  m2, m3
  psllw                 m3, m4               ; 1 << bps
  add                leftq, 8                ; point past left[3]; index with lineq
  psubw                 m3, m2               ; max possible value
  pxor                  m4, m4               ; min possible value
  psubw                 m0, m1               ; above[c] - top-left
.loop:
  ; Load exactly the two left pixels for this row pair with one 4-byte read.
  ; The previous pair of 8-byte loads touched memory beyond left[3] on the
  ; final iteration even though only word 0 of each load was used.
  movd                  m1, [leftq+lineq*4]
  pshufw                m2, m1, 0x55         ; broadcast second left pixel
  pshufw                m1, m1, 0x0          ; broadcast first left pixel
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  movq    [dstq          ], m1
  movq    [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
| 301 |
; high_tm_predictor_8x8(dst, stride, above, left, bps)
; TrueMotion predictor for an 8x8 block of 16-bit pixels:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration.
INIT_XMM sse2
cglobal high_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one
  movd                  m1, [aboveq-2]       ; top-left pixel in word 0
  mova                  m0, [aboveq]
  pshuflw               m1, m1, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m3, m3
  pxor                  m4, m4
  pinsrw                m3, oned, 0
  pinsrw                m4, bpsd, 0          ; shift count in the low qword
  pshuflw               m3, m3, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m3, m3               ; 1 in every word lane
  mov                lineq, -4               ; counts up to 0, two rows per step
  mova                  m2, m3
  punpcklqdq            m1, m1               ; top-left in every word lane
  psllw                 m3, m4               ; 1 << bps
  add                leftq, 16               ; point past left[7]; index with lineq
  psubw                 m3, m2               ; max possible value
  pxor                  m4, m4               ; min possible value
  psubw                 m0, m1               ; above[c] - top-left
.loop:
  ; One 4-byte load fetches both left pixels for this row pair. A second
  ; movd at +2 would read 2 bytes past left[7] on the final iteration.
  movd                  m1, [leftq+lineq*4]
  pshuflw               m2, m1, 0x55         ; second left pixel in words 0-3
  pshuflw               m1, m1, 0x0          ; first left pixel in words 0-3
  punpcklqdq            m1, m1
  punpcklqdq            m2, m2
  paddw                 m1, m0
  paddw                 m2, m0
  ;Clamp to the bit-depth
  pminsw                m1, m3
  pminsw                m2, m3
  pmaxsw                m1, m4
  pmaxsw                m2, m4
  ;Store the values
  mova    [dstq          ], m1
  mova    [dstq+strideq*2], m2
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
| 345 |
| 346 %if ARCH_X86_64 |
; high_tm_predictor_16x16(dst, stride, above, left, bps)
; TrueMotion predictor for a 16x16 block of 16-bit pixels:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration. Uses m8, hence the surrounding
; ARCH_X86_64 guard.
INIT_XMM sse2
cglobal high_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one
  movd                  m2, [aboveq-2]       ; top-left pixel in word 0
  mova                  m0, [aboveq]
  mova                  m1, [aboveq+16]
  pshuflw               m2, m2, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                  m7, m7
  pxor                  m8, m8
  pinsrw                m7, oned, 0
  pinsrw                m8, bpsd, 0          ; shift count in the low qword
  pshuflw               m7, m7, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq            m7, m7               ; 1 in every word lane
  mov                lineq, -8               ; counts up to 0, two rows per step
  mova                  m5, m7
  punpcklqdq            m2, m2               ; top-left in every word lane
  psllw                 m7, m8               ; 1 << bps
  add                leftq, 32               ; point past left[15]; index with lineq
  psubw                 m7, m5               ; max possible value
  pxor                  m8, m8               ; min possible value
  psubw                 m0, m2               ; above[0..7]  - top-left
  psubw                 m1, m2               ; above[8..15] - top-left
.loop:
  ; One 4-byte load fetches both left pixels for this row pair. A second
  ; movd at +2 would read 2 bytes past left[15] on the final iteration.
  movd                  m2, [leftq+lineq*4]
  pshuflw               m3, m2, 0x55         ; second left pixel in words 0-3
  pshuflw               m2, m2, 0x0          ; first left pixel in words 0-3
  punpcklqdq            m2, m2
  punpcklqdq            m3, m3
  paddw                 m4, m2, m0
  paddw                 m5, m3, m0
  paddw                 m2, m1
  paddw                 m3, m1
  ;Clamp to the bit-depth
  pminsw                m4, m7
  pminsw                m5, m7
  pminsw                m2, m7
  pminsw                m3, m7
  pmaxsw                m4, m8
  pmaxsw                m5, m8
  pmaxsw                m2, m8
  pmaxsw                m3, m8
  ;Store the values
  mova   [dstq             ], m4
  mova   [dstq+strideq*2   ], m5
  mova   [dstq          +16], m2
  mova   [dstq+strideq*2+16], m3
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
| 400 |
; high_tm_predictor_32x32(dst, stride, above, left, bps)
; TrueMotion predictor for a 32x32 block of 16-bit pixels:
;   dst[r][c] = clamp(left[r] + above[c] - above[-1], 0, (1 << bps) - 1)
; Two rows are produced per loop iteration. Uses m8-m11, hence the
; surrounding ARCH_X86_64 guard.
INIT_XMM sse2
cglobal high_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one
  movd                  m0, [aboveq-2]       ; top-left pixel in word 0
  mova                  m1, [aboveq]
  mova                  m2, [aboveq+16]
  mova                  m3, [aboveq+32]
  mova                  m4, [aboveq+48]
  pshuflw               m0, m0, 0x0
  ; Get the values to compute the maximum value at this bit depth
  mov                 oned, 1
  pxor                 m10, m10
  pxor                 m11, m11
  pinsrw               m10, oned, 0
  pinsrw               m11, bpsd, 0          ; shift count in the low qword
  pshuflw              m10, m10, 0x0
  DEFINE_ARGS dst, stride, line, left
  punpcklqdq           m10, m10              ; 1 in every word lane
  mov                lineq, -16              ; counts up to 0, two rows per step
  mova                  m5, m10
  punpcklqdq            m0, m0               ; top-left in every word lane
  psllw                m10, m11              ; 1 << bps
  add                leftq, 64               ; point past left[31]; index with lineq
  psubw                m10, m5               ; max possible value
  pxor                 m11, m11              ; min possible value
  psubw                 m1, m0               ; above[c] - top-left, 4 vectors
  psubw                 m2, m0
  psubw                 m3, m0
  psubw                 m4, m0
.loop:
  ; One 4-byte load fetches both left pixels for this row pair. A second
  ; movd at +2 would read 2 bytes past left[31] on the final iteration.
  movd                  m5, [leftq+lineq*4]
  pshuflw               m6, m5, 0x55         ; second left pixel in words 0-3
  pshuflw               m5, m5, 0x0          ; first left pixel in words 0-3
  punpcklqdq            m5, m5
  punpcklqdq            m6, m6
  ; First row of the pair.
  paddw                 m7, m5, m1
  paddw                 m8, m5, m2
  paddw                 m9, m5, m3
  paddw                 m5, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m5, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m5, m11
  ;Store these values
  mova   [dstq             ], m7
  mova   [dstq          +16], m8
  mova   [dstq          +32], m9
  mova   [dstq          +48], m5
  ; Second row of the pair.
  paddw                 m7, m6, m1
  paddw                 m8, m6, m2
  paddw                 m9, m6, m3
  paddw                 m6, m4
  ;Clamp these values to the bit-depth
  pminsw                m7, m10
  pminsw                m8, m10
  pminsw                m9, m10
  pminsw                m6, m10
  pmaxsw                m7, m11
  pmaxsw                m8, m11
  pmaxsw                m9, m11
  pmaxsw                m6, m11
  ;Store these values
  mova   [dstq+strideq*2   ], m7
  mova   [dstq+strideq*2+16], m8
  mova   [dstq+strideq*2+32], m9
  mova   [dstq+strideq*2+48], m6
  lea                 dstq, [dstq+strideq*4]
  inc                lineq
  jnz .loop
  REP_RET
| 476 %endif |
OLD | NEW |