OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 555 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
566 "jg 1b \n" | 566 "jg 1b \n" |
567 : "+r"(src_ptr), // %0 | 567 : "+r"(src_ptr), // %0 |
568 "+r"(dst_ptr), // %1 | 568 "+r"(dst_ptr), // %1 |
569 "+r"(dst_width) // %2 | 569 "+r"(dst_width) // %2 |
570 : "r"((intptr_t)(src_stride)) // %3 | 570 : "r"((intptr_t)(src_stride)) // %3 |
571 : "memory", "cc", NACL_R14 | 571 : "memory", "cc", NACL_R14 |
572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 572 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
573 ); | 573 ); |
574 } | 574 } |
575 | 575 |
| 576 // Reads 16xN bytes and produces 16 shorts at a time. |
576 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, | 577 void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, |
577 uint16* dst_ptr, int src_width, int src_height) { | 578 uint16* dst_ptr, int src_width, int src_height) { |
578 int tmp_height = 0; | 579 int tmp_height = 0; |
579 intptr_t tmp_src = 0; | 580 intptr_t tmp_src = 0; |
580 asm volatile ( | 581 asm volatile ( |
| 582 "mov %0,%3 \n" // row pointer |
| 583 "mov %5,%2 \n" // height |
| 584 "pxor %%xmm0,%%xmm0 \n" // clear accumulators |
| 585 "pxor %%xmm1,%%xmm1 \n" |
581 "pxor %%xmm4,%%xmm4 \n" | 586 "pxor %%xmm4,%%xmm4 \n" |
582 "sub $0x1,%5 \n" | |
583 | 587 |
584 LABELALIGN | 588 LABELALIGN |
585 "1: \n" | 589 "1: \n" |
586 "movdqu " MEMACCESS(0) ",%%xmm0 \n" | 590 "movdqu " MEMACCESS(3) ",%%xmm2 \n" |
587 "mov %0,%3 \n" | 591 "add %6,%3 \n" |
588 "add %6,%0 \n" | |
589 "movdqa %%xmm0,%%xmm1 \n" | |
590 "punpcklbw %%xmm4,%%xmm0 \n" | |
591 "punpckhbw %%xmm4,%%xmm1 \n" | |
592 "mov %5,%2 \n" | |
593 "test %2,%2 \n" | |
594 "je 3f \n" | |
595 | |
596 LABELALIGN | |
597 "2: \n" | |
598 "movdqu " MEMACCESS(0) ",%%xmm2 \n" | |
599 "add %6,%0 \n" | |
600 "movdqa %%xmm2,%%xmm3 \n" | 592 "movdqa %%xmm2,%%xmm3 \n" |
601 "punpcklbw %%xmm4,%%xmm2 \n" | 593 "punpcklbw %%xmm4,%%xmm2 \n" |
602 "punpckhbw %%xmm4,%%xmm3 \n" | 594 "punpckhbw %%xmm4,%%xmm3 \n" |
603 "paddusw %%xmm2,%%xmm0 \n" | 595 "paddusw %%xmm2,%%xmm0 \n" |
604 "paddusw %%xmm3,%%xmm1 \n" | 596 "paddusw %%xmm3,%%xmm1 \n" |
605 "sub $0x1,%2 \n" | 597 "sub $0x1,%2 \n" |
606 "jg 2b \n" | 598 "jg 1b \n" |
607 | 599 |
608 LABELALIGN | |
609 "3: \n" | |
610 "movdqu %%xmm0," MEMACCESS(1) " \n" | 600 "movdqu %%xmm0," MEMACCESS(1) " \n" |
611 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" | 601 "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" |
612 "lea " MEMLEA(0x10,3) ",%0 \n" | |
613 "lea " MEMLEA(0x20,1) ",%1 \n" | 602 "lea " MEMLEA(0x20,1) ",%1 \n" |
| 603 "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 |
| 604 "mov %0,%3 \n" // row pointer |
| 605 "mov %5,%2 \n" // height |
| 606 "pxor %%xmm0,%%xmm0 \n" // clear accumulators |
| 607 "pxor %%xmm1,%%xmm1 \n" |
614 "sub $0x10,%4 \n" | 608 "sub $0x10,%4 \n" |
615 "jg 1b \n" | 609 "jg 1b \n" |
616 : "+r"(src_ptr), // %0 | 610 : "+r"(src_ptr), // %0 |
617 "+r"(dst_ptr), // %1 | 611 "+r"(dst_ptr), // %1 |
618 "+r"(tmp_height), // %2 | 612 "+r"(tmp_height), // %2 |
619 "+r"(tmp_src), // %3 | 613 "+r"(tmp_src), // %3 |
620 "+r"(src_width), // %4 | 614 "+r"(src_width), // %4 |
621 "+rm"(src_height) // %5 | 615 "+rm"(src_height) // %5 |
622 : "rm"((intptr_t)(src_stride)) // %6 | 616 : "rm"((intptr_t)(src_stride)) // %6 |
623 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 617 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
792 "+r"(dst_width) // %2 | 786 "+r"(dst_width) // %2 |
793 : "r"((intptr_t)(src_stride)) // %3 | 787 : "r"((intptr_t)(src_stride)) // %3 |
794 : "memory", "cc", NACL_R14 | 788 : "memory", "cc", NACL_R14 |
795 "xmm0", "xmm1", "xmm2", "xmm3" | 789 "xmm0", "xmm1", "xmm2", "xmm3" |
796 ); | 790 ); |
797 } | 791 } |
798 | 792 |
799 // Reads 4 pixels at a time. | 793 // Reads 4 pixels at a time. |
800 // Alignment requirement: dst_argb 16 byte aligned. | 794 // Alignment requirement: dst_argb 16 byte aligned. |
801 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, | 795 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
802 int src_stepx, | 796 int src_stepx, uint8* dst_argb, int dst_width) { |
803 uint8* dst_argb, int dst_width) { | |
804 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); | 797 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
805 intptr_t src_stepx_x12 = 0; | 798 intptr_t src_stepx_x12 = 0; |
806 asm volatile ( | 799 asm volatile ( |
807 "lea " MEMLEA3(0x00,1,4) ",%1 \n" | 800 "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
808 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" | 801 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
809 LABELALIGN | 802 LABELALIGN |
810 "1: \n" | 803 "1: \n" |
811 "movd " MEMACCESS(0) ",%%xmm0 \n" | 804 "movd " MEMACCESS(0) ",%%xmm0 \n" |
812 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 | 805 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 |
813 "punpckldq %%xmm1,%%xmm0 \n" | 806 "punpckldq %%xmm1,%%xmm0 \n" |
(...skipping 273 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1087 ); | 1080 ); |
1088 return num; | 1081 return num; |
1089 } | 1082 } |
1090 | 1083 |
1091 #endif // defined(__x86_64__) || defined(__i386__) | 1084 #endif // defined(__x86_64__) || defined(__i386__) |
1092 | 1085 |
1093 #ifdef __cplusplus | 1086 #ifdef __cplusplus |
1094 } // extern "C" | 1087 } // extern "C" |
1095 } // namespace libyuv | 1088 } // namespace libyuv |
1096 #endif | 1089 #endif |
OLD | NEW |