OLD | NEW |
1 ; | 1 ; |
2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. | 2 ; Copyright (c) 2014 The WebM project authors. All Rights Reserved. |
3 ; | 3 ; |
4 ; Use of this source code is governed by a BSD-style license | 4 ; Use of this source code is governed by a BSD-style license |
5 ; that can be found in the LICENSE file in the root of the source | 5 ; that can be found in the LICENSE file in the root of the source |
6 ; tree. An additional intellectual property rights grant can be found | 6 ; tree. An additional intellectual property rights grant can be found |
7 ; in the file PATENTS. All contributing project authors may | 7 ; in the file PATENTS. All contributing project authors may |
8 ; be found in the AUTHORS file in the root of the source tree. | 8 ; be found in the AUTHORS file in the root of the source tree. |
9 ; | 9 ; |
10 | 10 |
(...skipping 181 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
192 %endif | 192 %endif |
193 %endif | 193 %endif |
194 | 194 |
195 ASSERT %1 <= 16 ; m6 overflows if w > 16 | 195 ASSERT %1 <= 16 ; m6 overflows if w > 16 |
196 pxor m6, m6 ; sum | 196 pxor m6, m6 ; sum |
197 pxor m7, m7 ; sse | 197 pxor m7, m7 ; sse |
198 | 198 |
199 %if %1 < 16 | 199 %if %1 < 16 |
200 sar h, 1 | 200 sar h, 1 |
201 %endif | 201 %endif |
| 202 %if %2 == 1 ; avg |
| 203 shl sec_str, 1 |
| 204 %endif |
202 | 205 |
203 ; FIXME(rbultje) replace by jumptable? | 206 ; FIXME(rbultje) replace by jumptable? |
204 test x_offsetd, x_offsetd | 207 test x_offsetd, x_offsetd |
205 jnz .x_nonzero | 208 jnz .x_nonzero |
206 ; x_offset == 0 | 209 ; x_offset == 0 |
207 test y_offsetd, y_offsetd | 210 test y_offsetd, y_offsetd |
208 jnz .x_zero_y_nonzero | 211 jnz .x_zero_y_nonzero |
209 | 212 |
210 ; x_offset == 0 && y_offset == 0 | 213 ; x_offset == 0 && y_offset == 0 |
211 .x_zero_y_zero_loop: | 214 .x_zero_y_zero_loop: |
212 %if %1 == 16 | 215 %if %1 == 16 |
213 movu m0, [srcq] | 216 movu m0, [srcq] |
214 movu m2, [srcq + 16] | 217 movu m2, [srcq + 16] |
215 mova m1, [dstq] | 218 mova m1, [dstq] |
216 mova m3, [dstq + 16] | 219 mova m3, [dstq + 16] |
217 %if %2 == 1 ; avg | 220 %if %2 == 1 ; avg |
218 pavgw m0, [secq] | 221 pavgw m0, [secq] |
219 pavgw m2, [secq+16] | 222 pavgw m2, [secq+16] |
220 %endif | 223 %endif |
221 SUM_SSE m0, m1, m2, m3, m6, m7 | 224 SUM_SSE m0, m1, m2, m3, m6, m7 |
222 | 225 |
223 lea srcq, [srcq + src_strideq*2] | 226 lea srcq, [srcq + src_strideq*2] |
224 lea dstq, [dstq + dst_strideq*2] | 227 lea dstq, [dstq + dst_strideq*2] |
225 %if %2 == 1 ; avg | 228 %if %2 == 1 ; avg |
226 lea secq, [secq + sec_str*2] | 229 add secq, sec_str |
227 %endif | 230 %endif |
228 %else ; %1 < 16 | 231 %else ; %1 < 16 |
229 movu m0, [srcq] | 232 movu m0, [srcq] |
230 movu m2, [srcq + src_strideq*2] | 233 movu m2, [srcq + src_strideq*2] |
231 mova m1, [dstq] | 234 mova m1, [dstq] |
232 mova m3, [dstq + dst_strideq*2] | 235 mova m3, [dstq + dst_strideq*2] |
233 %if %2 == 1 ; avg | 236 %if %2 == 1 ; avg |
234 pavgw m0, [secq] | 237 pavgw m0, [secq] |
235 pavgw m2, [secq + sec_str*2] | 238 add secq, sec_str |
| 239 pavgw m2, [secq] |
236 %endif | 240 %endif |
237 SUM_SSE m0, m1, m2, m3, m6, m7 | 241 SUM_SSE m0, m1, m2, m3, m6, m7 |
238 | 242 |
239 lea srcq, [srcq + src_strideq*4] | 243 lea srcq, [srcq + src_strideq*4] |
240 lea dstq, [dstq + dst_strideq*4] | 244 lea dstq, [dstq + dst_strideq*4] |
241 %if %2 == 1 ; avg | 245 %if %2 == 1 ; avg |
242 lea secq, [secq + sec_str*4] | 246 add secq, sec_str |
243 %endif | 247 %endif |
244 %endif | 248 %endif |
245 dec h | 249 dec h |
246 jg .x_zero_y_zero_loop | 250 jg .x_zero_y_zero_loop |
247 STORE_AND_RET | 251 STORE_AND_RET |
248 | 252 |
249 .x_zero_y_nonzero: | 253 .x_zero_y_nonzero: |
250 cmp y_offsetd, 8 | 254 cmp y_offsetd, 8 |
251 jne .x_zero_y_nonhalf | 255 jne .x_zero_y_nonhalf |
252 | 256 |
(...skipping 10 matching lines...) Expand all Loading... |
263 pavgw m1, m5 | 267 pavgw m1, m5 |
264 %if %2 == 1 ; avg | 268 %if %2 == 1 ; avg |
265 pavgw m0, [secq] | 269 pavgw m0, [secq] |
266 pavgw m1, [secq+16] | 270 pavgw m1, [secq+16] |
267 %endif | 271 %endif |
268 SUM_SSE m0, m2, m1, m3, m6, m7 | 272 SUM_SSE m0, m2, m1, m3, m6, m7 |
269 | 273 |
270 lea srcq, [srcq + src_strideq*2] | 274 lea srcq, [srcq + src_strideq*2] |
271 lea dstq, [dstq + dst_strideq*2] | 275 lea dstq, [dstq + dst_strideq*2] |
272 %if %2 == 1 ; avg | 276 %if %2 == 1 ; avg |
273 lea secq, [secq + sec_str*2] | 277 add secq, sec_str |
274 %endif | 278 %endif |
275 %else ; %1 < 16 | 279 %else ; %1 < 16 |
276 movu m0, [srcq] | 280 movu m0, [srcq] |
277 movu m1, [srcq+src_strideq*2] | 281 movu m1, [srcq+src_strideq*2] |
278 movu m5, [srcq+src_strideq*4] | 282 movu m5, [srcq+src_strideq*4] |
279 mova m2, [dstq] | 283 mova m2, [dstq] |
280 mova m3, [dstq+dst_strideq*2] | 284 mova m3, [dstq+dst_strideq*2] |
281 pavgw m0, m1 | 285 pavgw m0, m1 |
282 pavgw m1, m5 | 286 pavgw m1, m5 |
283 %if %2 == 1 ; avg | 287 %if %2 == 1 ; avg |
284 pavgw m0, [secq] | 288 pavgw m0, [secq] |
285 pavgw m1, [secq+sec_str*2] | 289 add secq, sec_str |
| 290 pavgw m1, [secq] |
286 %endif | 291 %endif |
287 SUM_SSE m0, m2, m1, m3, m6, m7 | 292 SUM_SSE m0, m2, m1, m3, m6, m7 |
288 | 293 |
289 lea srcq, [srcq + src_strideq*4] | 294 lea srcq, [srcq + src_strideq*4] |
290 lea dstq, [dstq + dst_strideq*4] | 295 lea dstq, [dstq + dst_strideq*4] |
291 %if %2 == 1 ; avg | 296 %if %2 == 1 ; avg |
292 lea secq, [secq + sec_str*4] | 297 add secq, sec_str |
293 %endif | 298 %endif |
294 %endif | 299 %endif |
295 dec h | 300 dec h |
296 jg .x_zero_y_half_loop | 301 jg .x_zero_y_half_loop |
297 STORE_AND_RET | 302 STORE_AND_RET |
298 | 303 |
299 .x_zero_y_nonhalf: | 304 .x_zero_y_nonhalf: |
300 ; x_offset == 0 && y_offset == bilin interpolation | 305 ; x_offset == 0 && y_offset == bilin interpolation |
301 %ifdef PIC | 306 %ifdef PIC |
302 lea bilin_filter, [bilin_filter_m] | 307 lea bilin_filter, [bilin_filter_m] |
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
351 psrlw m0, 4 | 356 psrlw m0, 4 |
352 %if %2 == 1 ; avg | 357 %if %2 == 1 ; avg |
353 pavgw m0, [secq] | 358 pavgw m0, [secq] |
354 pavgw m1, [secq+16] | 359 pavgw m1, [secq+16] |
355 %endif | 360 %endif |
356 SUM_SSE m0, m2, m1, m3, m6, m7 | 361 SUM_SSE m0, m2, m1, m3, m6, m7 |
357 | 362 |
358 lea srcq, [srcq + src_strideq*2] | 363 lea srcq, [srcq + src_strideq*2] |
359 lea dstq, [dstq + dst_strideq*2] | 364 lea dstq, [dstq + dst_strideq*2] |
360 %if %2 == 1 ; avg | 365 %if %2 == 1 ; avg |
361 lea secq, [secq + sec_str*2] | 366 add secq, sec_str |
362 %endif | 367 %endif |
363 %else ; %1 < 16 | 368 %else ; %1 < 16 |
364 movu m0, [srcq] | 369 movu m0, [srcq] |
365 movu m1, [srcq+src_strideq*2] | 370 movu m1, [srcq+src_strideq*2] |
366 movu m5, [srcq+src_strideq*4] | 371 movu m5, [srcq+src_strideq*4] |
367 mova m4, m1 | 372 mova m4, m1 |
368 mova m2, [dstq] | 373 mova m2, [dstq] |
369 mova m3, [dstq+dst_strideq*2] | 374 mova m3, [dstq+dst_strideq*2] |
370 pmullw m1, filter_y_a | 375 pmullw m1, filter_y_a |
371 pmullw m5, filter_y_b | 376 pmullw m5, filter_y_b |
372 paddw m1, filter_rnd | 377 paddw m1, filter_rnd |
373 pmullw m0, filter_y_a | 378 pmullw m0, filter_y_a |
374 pmullw m4, filter_y_b | 379 pmullw m4, filter_y_b |
375 paddw m0, filter_rnd | 380 paddw m0, filter_rnd |
376 paddw m1, m5 | 381 paddw m1, m5 |
377 paddw m0, m4 | 382 paddw m0, m4 |
378 psrlw m1, 4 | 383 psrlw m1, 4 |
379 psrlw m0, 4 | 384 psrlw m0, 4 |
380 %if %2 == 1 ; avg | 385 %if %2 == 1 ; avg |
381 pavgw m0, [secq] | 386 pavgw m0, [secq] |
382 pavgw m1, [secq+sec_str*2] | 387 add secq, sec_str |
| 388 pavgw m1, [secq] |
383 %endif | 389 %endif |
384 SUM_SSE m0, m2, m1, m3, m6, m7 | 390 SUM_SSE m0, m2, m1, m3, m6, m7 |
385 | 391 |
386 lea srcq, [srcq + src_strideq*4] | 392 lea srcq, [srcq + src_strideq*4] |
387 lea dstq, [dstq + dst_strideq*4] | 393 lea dstq, [dstq + dst_strideq*4] |
388 %if %2 == 1 ; avg | 394 %if %2 == 1 ; avg |
389 lea secq, [secq + sec_str*4] | 395 add secq, sec_str |
390 %endif | 396 %endif |
391 %endif | 397 %endif |
392 dec h | 398 dec h |
393 jg .x_zero_y_other_loop | 399 jg .x_zero_y_other_loop |
394 %undef filter_y_a | 400 %undef filter_y_a |
395 %undef filter_y_b | 401 %undef filter_y_b |
396 %undef filter_rnd | 402 %undef filter_rnd |
397 STORE_AND_RET | 403 STORE_AND_RET |
398 | 404 |
399 .x_nonzero: | 405 .x_nonzero: |
(...skipping 16 matching lines...) Expand all Loading... |
416 pavgw m1, m5 | 422 pavgw m1, m5 |
417 %if %2 == 1 ; avg | 423 %if %2 == 1 ; avg |
418 pavgw m0, [secq] | 424 pavgw m0, [secq] |
419 pavgw m1, [secq+16] | 425 pavgw m1, [secq+16] |
420 %endif | 426 %endif |
421 SUM_SSE m0, m2, m1, m3, m6, m7 | 427 SUM_SSE m0, m2, m1, m3, m6, m7 |
422 | 428 |
423 lea srcq, [srcq + src_strideq*2] | 429 lea srcq, [srcq + src_strideq*2] |
424 lea dstq, [dstq + dst_strideq*2] | 430 lea dstq, [dstq + dst_strideq*2] |
425 %if %2 == 1 ; avg | 431 %if %2 == 1 ; avg |
426 lea secq, [secq + sec_str*2] | 432 add secq, sec_str |
427 %endif | 433 %endif |
428 %else ; %1 < 16 | 434 %else ; %1 < 16 |
429 movu m0, [srcq] | 435 movu m0, [srcq] |
430 movu m1, [srcq + src_strideq*2] | 436 movu m1, [srcq + src_strideq*2] |
431 movu m4, [srcq + 2] | 437 movu m4, [srcq + 2] |
432 movu m5, [srcq + src_strideq*2 + 2] | 438 movu m5, [srcq + src_strideq*2 + 2] |
433 mova m2, [dstq] | 439 mova m2, [dstq] |
434 mova m3, [dstq + dst_strideq*2] | 440 mova m3, [dstq + dst_strideq*2] |
435 pavgw m0, m4 | 441 pavgw m0, m4 |
436 pavgw m1, m5 | 442 pavgw m1, m5 |
437 %if %2 == 1 ; avg | 443 %if %2 == 1 ; avg |
438 pavgw m0, [secq] | 444 pavgw m0, [secq] |
439 pavgw m1, [secq+sec_str*2] | 445 add secq, sec_str |
| 446 pavgw m1, [secq] |
440 %endif | 447 %endif |
441 SUM_SSE m0, m2, m1, m3, m6, m7 | 448 SUM_SSE m0, m2, m1, m3, m6, m7 |
442 | 449 |
443 lea srcq, [srcq + src_strideq*4] | 450 lea srcq, [srcq + src_strideq*4] |
444 lea dstq, [dstq + dst_strideq*4] | 451 lea dstq, [dstq + dst_strideq*4] |
445 %if %2 == 1 ; avg | 452 %if %2 == 1 ; avg |
446 lea secq, [secq + sec_str*4] | 453 add secq, sec_str |
447 %endif | 454 %endif |
448 %endif | 455 %endif |
449 dec h | 456 dec h |
450 jg .x_half_y_zero_loop | 457 jg .x_half_y_zero_loop |
451 STORE_AND_RET | 458 STORE_AND_RET |
452 | 459 |
453 .x_half_y_nonzero: | 460 .x_half_y_nonzero: |
454 cmp y_offsetd, 8 | 461 cmp y_offsetd, 8 |
455 jne .x_half_y_nonhalf | 462 jne .x_half_y_nonhalf |
456 | 463 |
(...skipping 21 matching lines...) Expand all Loading... |
478 pavgw m0, [secq] | 485 pavgw m0, [secq] |
479 pavgw m1, [secq+16] | 486 pavgw m1, [secq+16] |
480 %endif | 487 %endif |
481 SUM_SSE m0, m4, m1, m5, m6, m7 | 488 SUM_SSE m0, m4, m1, m5, m6, m7 |
482 mova m0, m2 | 489 mova m0, m2 |
483 mova m1, m3 | 490 mova m1, m3 |
484 | 491 |
485 lea srcq, [srcq + src_strideq*2] | 492 lea srcq, [srcq + src_strideq*2] |
486 lea dstq, [dstq + dst_strideq*2] | 493 lea dstq, [dstq + dst_strideq*2] |
487 %if %2 == 1 ; avg | 494 %if %2 == 1 ; avg |
488 lea secq, [secq + sec_str*2] | 495 add secq, sec_str |
489 %endif | 496 %endif |
490 %else ; %1 < 16 | 497 %else ; %1 < 16 |
491 movu m0, [srcq] | 498 movu m0, [srcq] |
492 movu m2, [srcq+2] | 499 movu m2, [srcq+2] |
493 lea srcq, [srcq + src_strideq*2] | 500 lea srcq, [srcq + src_strideq*2] |
494 pavgw m0, m2 | 501 pavgw m0, m2 |
495 .x_half_y_half_loop: | 502 .x_half_y_half_loop: |
496 movu m2, [srcq] | 503 movu m2, [srcq] |
497 movu m3, [srcq + src_strideq*2] | 504 movu m3, [srcq + src_strideq*2] |
498 movu m4, [srcq + 2] | 505 movu m4, [srcq + 2] |
499 movu m5, [srcq + src_strideq*2 + 2] | 506 movu m5, [srcq + src_strideq*2 + 2] |
500 pavgw m2, m4 | 507 pavgw m2, m4 |
501 pavgw m3, m5 | 508 pavgw m3, m5 |
502 pavgw m0, m2 | 509 pavgw m0, m2 |
503 pavgw m2, m3 | 510 pavgw m2, m3 |
504 mova m4, [dstq] | 511 mova m4, [dstq] |
505 mova m5, [dstq + dst_strideq*2] | 512 mova m5, [dstq + dst_strideq*2] |
506 %if %2 == 1 ; avg | 513 %if %2 == 1 ; avg |
507 pavgw m0, [secq] | 514 pavgw m0, [secq] |
508 pavgw m2, [secq+sec_str*2] | 515 add secq, sec_str |
| 516 pavgw m2, [secq] |
509 %endif | 517 %endif |
510 SUM_SSE m0, m4, m2, m5, m6, m7 | 518 SUM_SSE m0, m4, m2, m5, m6, m7 |
511 mova m0, m3 | 519 mova m0, m3 |
512 | 520 |
513 lea srcq, [srcq + src_strideq*4] | 521 lea srcq, [srcq + src_strideq*4] |
514 lea dstq, [dstq + dst_strideq*4] | 522 lea dstq, [dstq + dst_strideq*4] |
515 %if %2 == 1 ; avg | 523 %if %2 == 1 ; avg |
516 lea secq, [secq + sec_str*4] | 524 add secq, sec_str |
517 %endif | 525 %endif |
518 %endif | 526 %endif |
519 dec h | 527 dec h |
520 jg .x_half_y_half_loop | 528 jg .x_half_y_half_loop |
521 STORE_AND_RET | 529 STORE_AND_RET |
522 | 530 |
523 .x_half_y_nonhalf: | 531 .x_half_y_nonhalf: |
524 ; x_offset == 0.5 && y_offset == bilin interpolation | 532 ; x_offset == 0.5 && y_offset == bilin interpolation |
525 %ifdef PIC | 533 %ifdef PIC |
526 lea bilin_filter, [bilin_filter_m] | 534 lea bilin_filter, [bilin_filter_m] |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
583 pavgw m0, [secq] | 591 pavgw m0, [secq] |
584 pavgw m1, [secq+16] | 592 pavgw m1, [secq+16] |
585 %endif | 593 %endif |
586 SUM_SSE m0, m2, m1, m3, m6, m7 | 594 SUM_SSE m0, m2, m1, m3, m6, m7 |
587 mova m0, m4 | 595 mova m0, m4 |
588 mova m1, m5 | 596 mova m1, m5 |
589 | 597 |
590 lea srcq, [srcq + src_strideq*2] | 598 lea srcq, [srcq + src_strideq*2] |
591 lea dstq, [dstq + dst_strideq*2] | 599 lea dstq, [dstq + dst_strideq*2] |
592 %if %2 == 1 ; avg | 600 %if %2 == 1 ; avg |
593 lea secq, [secq + sec_str*2] | 601 add secq, sec_str |
594 %endif | 602 %endif |
595 %else ; %1 < 16 | 603 %else ; %1 < 16 |
596 movu m0, [srcq] | 604 movu m0, [srcq] |
597 movu m2, [srcq+2] | 605 movu m2, [srcq+2] |
598 lea srcq, [srcq + src_strideq*2] | 606 lea srcq, [srcq + src_strideq*2] |
599 pavgw m0, m2 | 607 pavgw m0, m2 |
600 .x_half_y_other_loop: | 608 .x_half_y_other_loop: |
601 movu m2, [srcq] | 609 movu m2, [srcq] |
602 movu m3, [srcq+src_strideq*2] | 610 movu m3, [srcq+src_strideq*2] |
603 movu m4, [srcq+2] | 611 movu m4, [srcq+2] |
604 movu m5, [srcq+src_strideq*2+2] | 612 movu m5, [srcq+src_strideq*2+2] |
605 pavgw m2, m4 | 613 pavgw m2, m4 |
606 pavgw m3, m5 | 614 pavgw m3, m5 |
607 mova m4, m2 | 615 mova m4, m2 |
608 mova m5, m3 | 616 mova m5, m3 |
609 pmullw m4, filter_y_a | 617 pmullw m4, filter_y_a |
610 pmullw m3, filter_y_b | 618 pmullw m3, filter_y_b |
611 paddw m4, filter_rnd | 619 paddw m4, filter_rnd |
612 paddw m4, m3 | 620 paddw m4, m3 |
613 pmullw m0, filter_y_a | 621 pmullw m0, filter_y_a |
614 pmullw m2, filter_y_b | 622 pmullw m2, filter_y_b |
615 paddw m0, filter_rnd | 623 paddw m0, filter_rnd |
616 psrlw m4, 4 | 624 psrlw m4, 4 |
617 paddw m0, m2 | 625 paddw m0, m2 |
618 mova m2, [dstq] | 626 mova m2, [dstq] |
619 psrlw m0, 4 | 627 psrlw m0, 4 |
620 mova m3, [dstq+dst_strideq*2] | 628 mova m3, [dstq+dst_strideq*2] |
621 %if %2 == 1 ; avg | 629 %if %2 == 1 ; avg |
622 pavgw m0, [secq] | 630 pavgw m0, [secq] |
623 pavgw m4, [secq+sec_str*2] | 631 add secq, sec_str |
| 632 pavgw m4, [secq] |
624 %endif | 633 %endif |
625 SUM_SSE m0, m2, m4, m3, m6, m7 | 634 SUM_SSE m0, m2, m4, m3, m6, m7 |
626 mova m0, m5 | 635 mova m0, m5 |
627 | 636 |
628 lea srcq, [srcq + src_strideq*4] | 637 lea srcq, [srcq + src_strideq*4] |
629 lea dstq, [dstq + dst_strideq*4] | 638 lea dstq, [dstq + dst_strideq*4] |
630 %if %2 == 1 ; avg | 639 %if %2 == 1 ; avg |
631 lea secq, [secq + sec_str*4] | 640 add secq, sec_str |
632 %endif | 641 %endif |
633 %endif | 642 %endif |
634 dec h | 643 dec h |
635 jg .x_half_y_other_loop | 644 jg .x_half_y_other_loop |
636 %undef filter_y_a | 645 %undef filter_y_a |
637 %undef filter_y_b | 646 %undef filter_y_b |
638 %undef filter_rnd | 647 %undef filter_rnd |
639 STORE_AND_RET | 648 STORE_AND_RET |
640 | 649 |
641 .x_nonhalf: | 650 .x_nonhalf: |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
691 psrlw m0, 4 | 700 psrlw m0, 4 |
692 %if %2 == 1 ; avg | 701 %if %2 == 1 ; avg |
693 pavgw m0, [secq] | 702 pavgw m0, [secq] |
694 pavgw m1, [secq+16] | 703 pavgw m1, [secq+16] |
695 %endif | 704 %endif |
696 SUM_SSE m0, m4, m1, m5, m6, m7 | 705 SUM_SSE m0, m4, m1, m5, m6, m7 |
697 | 706 |
698 lea srcq, [srcq+src_strideq*2] | 707 lea srcq, [srcq+src_strideq*2] |
699 lea dstq, [dstq+dst_strideq*2] | 708 lea dstq, [dstq+dst_strideq*2] |
700 %if %2 == 1 ; avg | 709 %if %2 == 1 ; avg |
701 lea secq, [secq + sec_str*2] | 710 add secq, sec_str |
702 %endif | 711 %endif |
703 %else ; %1 < 16 | 712 %else ; %1 < 16 |
704 movu m0, [srcq] | 713 movu m0, [srcq] |
705 movu m1, [srcq+src_strideq*2] | 714 movu m1, [srcq+src_strideq*2] |
706 movu m2, [srcq+2] | 715 movu m2, [srcq+2] |
707 movu m3, [srcq+src_strideq*2+2] | 716 movu m3, [srcq+src_strideq*2+2] |
708 mova m4, [dstq] | 717 mova m4, [dstq] |
709 mova m5, [dstq+dst_strideq*2] | 718 mova m5, [dstq+dst_strideq*2] |
710 pmullw m1, filter_x_a | 719 pmullw m1, filter_x_a |
711 pmullw m3, filter_x_b | 720 pmullw m3, filter_x_b |
712 paddw m1, filter_rnd | 721 paddw m1, filter_rnd |
713 pmullw m0, filter_x_a | 722 pmullw m0, filter_x_a |
714 pmullw m2, filter_x_b | 723 pmullw m2, filter_x_b |
715 paddw m0, filter_rnd | 724 paddw m0, filter_rnd |
716 paddw m1, m3 | 725 paddw m1, m3 |
717 paddw m0, m2 | 726 paddw m0, m2 |
718 psrlw m1, 4 | 727 psrlw m1, 4 |
719 psrlw m0, 4 | 728 psrlw m0, 4 |
720 %if %2 == 1 ; avg | 729 %if %2 == 1 ; avg |
721 pavgw m0, [secq] | 730 pavgw m0, [secq] |
722 pavgw m1, [secq+sec_str*2] | 731 add secq, sec_str |
| 732 pavgw m1, [secq] |
723 %endif | 733 %endif |
724 SUM_SSE m0, m4, m1, m5, m6, m7 | 734 SUM_SSE m0, m4, m1, m5, m6, m7 |
725 | 735 |
726 lea srcq, [srcq+src_strideq*4] | 736 lea srcq, [srcq+src_strideq*4] |
727 lea dstq, [dstq+dst_strideq*4] | 737 lea dstq, [dstq+dst_strideq*4] |
728 %if %2 == 1 ; avg | 738 %if %2 == 1 ; avg |
729 lea secq, [secq + sec_str*4] | 739 add secq, sec_str |
730 %endif | 740 %endif |
731 %endif | 741 %endif |
732 dec h | 742 dec h |
733 jg .x_other_y_zero_loop | 743 jg .x_other_y_zero_loop |
734 %undef filter_x_a | 744 %undef filter_x_a |
735 %undef filter_x_b | 745 %undef filter_x_b |
736 %undef filter_rnd | 746 %undef filter_rnd |
737 STORE_AND_RET | 747 STORE_AND_RET |
738 | 748 |
739 .x_nonhalf_y_nonzero: | 749 .x_nonhalf_y_nonzero: |
(...skipping 68 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
808 pavgw m0, [secq] | 818 pavgw m0, [secq] |
809 pavgw m1, [secq+16] | 819 pavgw m1, [secq+16] |
810 %endif | 820 %endif |
811 SUM_SSE m0, m4, m1, m5, m6, m7 | 821 SUM_SSE m0, m4, m1, m5, m6, m7 |
812 mova m0, m2 | 822 mova m0, m2 |
813 mova m1, m3 | 823 mova m1, m3 |
814 | 824 |
815 lea srcq, [srcq+src_strideq*2] | 825 lea srcq, [srcq+src_strideq*2] |
816 lea dstq, [dstq+dst_strideq*2] | 826 lea dstq, [dstq+dst_strideq*2] |
817 %if %2 == 1 ; avg | 827 %if %2 == 1 ; avg |
818 lea secq, [secq + sec_str*2] | 828 add secq, sec_str |
819 %endif | 829 %endif |
820 %else ; %1 < 16 | 830 %else ; %1 < 16 |
821 movu m0, [srcq] | 831 movu m0, [srcq] |
822 movu m2, [srcq+2] | 832 movu m2, [srcq+2] |
823 pmullw m0, filter_x_a | 833 pmullw m0, filter_x_a |
824 pmullw m2, filter_x_b | 834 pmullw m2, filter_x_b |
825 paddw m0, filter_rnd | 835 paddw m0, filter_rnd |
826 paddw m0, m2 | 836 paddw m0, m2 |
827 psrlw m0, 4 | 837 psrlw m0, 4 |
828 lea srcq, [srcq+src_strideq*2] | 838 lea srcq, [srcq+src_strideq*2] |
(...skipping 11 matching lines...) Expand all Loading... |
840 paddw m2, m4 | 850 paddw m2, m4 |
841 paddw m3, m5 | 851 paddw m3, m5 |
842 mova m4, [dstq] | 852 mova m4, [dstq] |
843 mova m5, [dstq+dst_strideq*2] | 853 mova m5, [dstq+dst_strideq*2] |
844 psrlw m2, 4 | 854 psrlw m2, 4 |
845 psrlw m3, 4 | 855 psrlw m3, 4 |
846 pavgw m0, m2 | 856 pavgw m0, m2 |
847 pavgw m2, m3 | 857 pavgw m2, m3 |
848 %if %2 == 1 ; avg | 858 %if %2 == 1 ; avg |
849 pavgw m0, [secq] | 859 pavgw m0, [secq] |
850 pavgw m2, [secq+sec_str*2] | 860 add secq, sec_str |
| 861 pavgw m2, [secq] |
851 %endif | 862 %endif |
852 SUM_SSE m0, m4, m2, m5, m6, m7 | 863 SUM_SSE m0, m4, m2, m5, m6, m7 |
853 mova m0, m3 | 864 mova m0, m3 |
854 | 865 |
855 lea srcq, [srcq+src_strideq*4] | 866 lea srcq, [srcq+src_strideq*4] |
856 lea dstq, [dstq+dst_strideq*4] | 867 lea dstq, [dstq+dst_strideq*4] |
857 %if %2 == 1 ; avg | 868 %if %2 == 1 ; avg |
858 lea secq, [secq + sec_str*4] | 869 add secq, sec_str |
859 %endif | 870 %endif |
860 %endif | 871 %endif |
861 dec h | 872 dec h |
862 jg .x_other_y_half_loop | 873 jg .x_other_y_half_loop |
863 %undef filter_x_a | 874 %undef filter_x_a |
864 %undef filter_x_b | 875 %undef filter_x_b |
865 %undef filter_rnd | 876 %undef filter_rnd |
866 STORE_AND_RET | 877 STORE_AND_RET |
867 | 878 |
868 .x_nonhalf_y_nonhalf: | 879 .x_nonhalf_y_nonhalf: |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
962 pavgw m0, [secq] | 973 pavgw m0, [secq] |
963 pavgw m1, [secq+16] | 974 pavgw m1, [secq+16] |
964 %endif | 975 %endif |
965 SUM_SSE m0, m2, m1, m3, m6, m7 | 976 SUM_SSE m0, m2, m1, m3, m6, m7 |
966 mova m0, m4 | 977 mova m0, m4 |
967 mova m1, m5 | 978 mova m1, m5 |
968 | 979 |
969 INC_SRC_BY_SRC_STRIDE | 980 INC_SRC_BY_SRC_STRIDE |
970 lea dstq, [dstq + dst_strideq * 2] | 981 lea dstq, [dstq + dst_strideq * 2] |
971 %if %2 == 1 ; avg | 982 %if %2 == 1 ; avg |
972 lea secq, [secq + sec_str*2] | 983 add secq, sec_str |
973 %endif | 984 %endif |
974 %else ; %1 < 16 | 985 %else ; %1 < 16 |
975 movu m0, [srcq] | 986 movu m0, [srcq] |
976 movu m2, [srcq+2] | 987 movu m2, [srcq+2] |
977 pmullw m0, filter_x_a | 988 pmullw m0, filter_x_a |
978 pmullw m2, filter_x_b | 989 pmullw m2, filter_x_b |
979 paddw m0, filter_rnd | 990 paddw m0, filter_rnd |
980 paddw m0, m2 | 991 paddw m0, m2 |
981 psrlw m0, 4 | 992 psrlw m0, 4 |
982 | 993 |
(...skipping 23 matching lines...) Expand all Loading... |
1006 pmullw m3, filter_y_b | 1017 pmullw m3, filter_y_b |
1007 paddw m0, m2 | 1018 paddw m0, m2 |
1008 paddw m4, filter_rnd | 1019 paddw m4, filter_rnd |
1009 mova m2, [dstq] | 1020 mova m2, [dstq] |
1010 paddw m4, m3 | 1021 paddw m4, m3 |
1011 psrlw m0, 4 | 1022 psrlw m0, 4 |
1012 psrlw m4, 4 | 1023 psrlw m4, 4 |
1013 mova m3, [dstq+dst_strideq*2] | 1024 mova m3, [dstq+dst_strideq*2] |
1014 %if %2 == 1 ; avg | 1025 %if %2 == 1 ; avg |
1015 pavgw m0, [secq] | 1026 pavgw m0, [secq] |
1016 pavgw m4, [secq+sec_str*2] | 1027 add secq, sec_str |
| 1028 pavgw m4, [secq] |
1017 %endif | 1029 %endif |
1018 SUM_SSE m0, m2, m4, m3, m6, m7 | 1030 SUM_SSE m0, m2, m4, m3, m6, m7 |
1019 mova m0, m5 | 1031 mova m0, m5 |
1020 | 1032 |
1021 INC_SRC_BY_SRC_2STRIDE | 1033 INC_SRC_BY_SRC_2STRIDE |
1022 lea dstq, [dstq + dst_strideq * 4] | 1034 lea dstq, [dstq + dst_strideq * 4] |
1023 %if %2 == 1 ; avg | 1035 %if %2 == 1 ; avg |
1024 lea secq, [secq + sec_str*4] | 1036 add secq, sec_str |
1025 %endif | 1037 %endif |
1026 %endif | 1038 %endif |
1027 dec h | 1039 dec h |
1028 jg .x_other_y_other_loop | 1040 jg .x_other_y_other_loop |
1029 %undef filter_x_a | 1041 %undef filter_x_a |
1030 %undef filter_x_b | 1042 %undef filter_x_b |
1031 %undef filter_y_a | 1043 %undef filter_y_a |
1032 %undef filter_y_b | 1044 %undef filter_y_b |
1033 %undef filter_rnd | 1045 %undef filter_rnd |
1034 STORE_AND_RET | 1046 STORE_AND_RET |
1035 %endmacro | 1047 %endmacro |
1036 | 1048 |
; Instantiate the SUBPEL_VARIANCE macro body defined above.
; The first macro argument (%1) is the block width: the macro asserts
; %1 <= 16 and takes a separate code path for %1 == 16 versus %1 < 16.
; No second argument is passed here, so the "%if %2 == 1 ; avg" sections
; are presumably left out (assumes %2 defaults to a non-1 value; the
; %macro header is not visible in this view -- TODO confirm).
1037 INIT_XMM sse2 | 1049 INIT_XMM sse2 |
1038 SUBPEL_VARIANCE 8 | 1050 SUBPEL_VARIANCE 8 |
1039 SUBPEL_VARIANCE 16 | 1051 SUBPEL_VARIANCE 16 |
1040 | 1052 |
; Same two widths again with %2 = 1, which enables the "avg" sections of
; the macro: results are additionally pavgw-averaged with data read
; through secq, and secq is advanced by sec_str on each loop iteration.
; NOTE(review): this second INIT_XMM sse2 repeats the one above and looks
; redundant unless SUBPEL_VARIANCE changes the instruction-set state --
; confirm against the INIT_XMM definition.
1041 INIT_XMM sse2 | 1053 INIT_XMM sse2 |
1042 SUBPEL_VARIANCE 8, 1 | 1054 SUBPEL_VARIANCE 8, 1 |
1043 SUBPEL_VARIANCE 16, 1 | 1055 SUBPEL_VARIANCE 16, 1 |
OLD | NEW |