OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 803 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
814 "vzeroupper \n" | 814 "vzeroupper \n" |
815 : "+r"(src_ptr), // %0 | 815 : "+r"(src_ptr), // %0 |
816 "+r"(dst_ptr), // %1 | 816 "+r"(dst_ptr), // %1 |
817 "+r"(src_width) // %2 | 817 "+r"(src_width) // %2 |
818 : | 818 : |
819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
820 ); | 820 ); |
821 } | 821 } |
822 #endif // HAS_SCALEADDROW_AVX2 | 822 #endif // HAS_SCALEADDROW_AVX2 |
823 | 823 |
| 824 // Constant for making pixels signed to avoid pmaddubsw |
| 825 // saturation. |
| 826 static uvec8 kFsub80 = |
| 827 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, |
| 828 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 }; |
| 829 |
| 830 // Constant for making pixels unsigned again after pmaddubsw (0x4000 = 0x80 bias * 128 total weight) and adding .5 (0x40) for rounding. |
| 831 static uvec16 kFadd40 = |
| 832 { 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040, 0x4040 }; |
| 833 |
824 // Bilinear column filtering. SSSE3 version. | 834 // Bilinear column filtering. SSSE3 version. |
825 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 835 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
826 int dst_width, int x, int dx) { | 836 int dst_width, int x, int dx) { |
827 intptr_t x0, x1, temp_pixel; | 837 intptr_t x0, x1, temp_pixel; |
828 asm volatile ( | 838 asm volatile ( |
829 "movd %6,%%xmm2 \n" | 839 "movd %6,%%xmm2 \n" |
830 "movd %7,%%xmm3 \n" | 840 "movd %7,%%xmm3 \n" |
831 "movl $0x04040000,%k2 \n" | 841 "movl $0x04040000,%k2 \n" |
832 "movd %k2,%%xmm5 \n" | 842 "movd %k2,%%xmm5 \n" |
833 "pcmpeqb %%xmm6,%%xmm6 \n" | 843 "pcmpeqb %%xmm6,%%xmm6 \n" |
834 "psrlw $0x9,%%xmm6 \n" | 844 "psrlw $0x9,%%xmm6 \n" // 0x007f007f |
| 845 "pcmpeqb %%xmm7,%%xmm7 \n" |
| 846 "psrlw $15,%%xmm7 \n" // 0x00010001 |
| 847 |
835 "pextrw $0x1,%%xmm2,%k3 \n" | 848 "pextrw $0x1,%%xmm2,%k3 \n" |
836 "subl $0x2,%5 \n" | 849 "subl $0x2,%5 \n" |
837 "jl 29f \n" | 850 "jl 29f \n" |
838 "movdqa %%xmm2,%%xmm0 \n" | 851 "movdqa %%xmm2,%%xmm0 \n" |
839 "paddd %%xmm3,%%xmm0 \n" | 852 "paddd %%xmm3,%%xmm0 \n" |
840 "punpckldq %%xmm0,%%xmm2 \n" | 853 "punpckldq %%xmm0,%%xmm2 \n" |
841 "punpckldq %%xmm3,%%xmm3 \n" | 854 "punpckldq %%xmm3,%%xmm3 \n" |
842 "paddd %%xmm3,%%xmm3 \n" | 855 "paddd %%xmm3,%%xmm3 \n" |
843 "pextrw $0x3,%%xmm2,%k4 \n" | 856 "pextrw $0x3,%%xmm2,%k4 \n" |
844 | 857 |
845 LABELALIGN | 858 LABELALIGN |
846 "2: \n" | 859 "2: \n" |
847 "movdqa %%xmm2,%%xmm1 \n" | 860 "movdqa %%xmm2,%%xmm1 \n" |
848 "paddd %%xmm3,%%xmm2 \n" | 861 "paddd %%xmm3,%%xmm2 \n" |
849 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 | 862 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 |
850 "movd %k2,%%xmm0 \n" | 863 "movd %k2,%%xmm0 \n" |
851 "psrlw $0x9,%%xmm1 \n" | 864 "psrlw $0x9,%%xmm1 \n" |
852 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 | 865 MEMOPARG(movzwl,0x00,1,4,1,k2) // movzwl (%1,%4,1),%k2 |
853 "movd %k2,%%xmm4 \n" | 866 "movd %k2,%%xmm4 \n" |
854 "pshufb %%xmm5,%%xmm1 \n" | 867 "pshufb %%xmm5,%%xmm1 \n" |
855 "punpcklwd %%xmm4,%%xmm0 \n" | 868 "punpcklwd %%xmm4,%%xmm0 \n" |
856 "pxor %%xmm6,%%xmm1 \n" | 869 "psubb %8,%%xmm0 \n" // make pixels signed. |
857 "pmaddubsw %%xmm1,%%xmm0 \n" | 870 "pxor %%xmm6,%%xmm1 \n" // 128 - f = (f ^ 127) + 1 |
| 871 "paddusb %%xmm7,%%xmm1 \n" |
| 872 "pmaddubsw %%xmm0,%%xmm1 \n" |
858 "pextrw $0x1,%%xmm2,%k3 \n" | 873 "pextrw $0x1,%%xmm2,%k3 \n" |
859 "pextrw $0x3,%%xmm2,%k4 \n" | 874 "pextrw $0x3,%%xmm2,%k4 \n" |
860 "psrlw $0x7,%%xmm0 \n" | 875 "paddw %9,%%xmm1 \n" // make pixels unsigned and add rounding. |
861 "packuswb %%xmm0,%%xmm0 \n" | 876 "psrlw $0x7,%%xmm1 \n" |
862 "movd %%xmm0,%k2 \n" | 877 "packuswb %%xmm1,%%xmm1 \n" |
| 878 "movd %%xmm1,%k2 \n" |
863 "mov %w2," MEMACCESS(0) " \n" | 879 "mov %w2," MEMACCESS(0) " \n" |
864 "lea " MEMLEA(0x2,0) ",%0 \n" | 880 "lea " MEMLEA(0x2,0) ",%0 \n" |
865 "sub $0x2,%5 \n" | 881 "sub $0x2,%5 \n" |
866 "jge 2b \n" | 882 "jge 2b \n" |
867 | 883 |
868 LABELALIGN | 884 LABELALIGN |
869 "29: \n" | 885 "29: \n" |
870 "addl $0x1,%5 \n" | 886 "addl $0x1,%5 \n" |
871 "jl 99f \n" | 887 "jl 99f \n" |
872 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 | 888 MEMOPARG(movzwl,0x00,1,3,1,k2) // movzwl (%1,%3,1),%k2 |
873 "movd %k2,%%xmm0 \n" | 889 "movd %k2,%%xmm0 \n" |
874 "psrlw $0x9,%%xmm2 \n" | 890 "psrlw $0x9,%%xmm2 \n" |
875 "pshufb %%xmm5,%%xmm2 \n" | 891 "pshufb %%xmm5,%%xmm2 \n" |
| 892 "psubb %8,%%xmm0 \n" // make pixels signed. |
876 "pxor %%xmm6,%%xmm2 \n" | 893 "pxor %%xmm6,%%xmm2 \n" |
877 "pmaddubsw %%xmm2,%%xmm0 \n" | 894 "paddusb %%xmm7,%%xmm2 \n" |
878 "psrlw $0x7,%%xmm0 \n" | 895 "pmaddubsw %%xmm0,%%xmm2 \n" |
879 "packuswb %%xmm0,%%xmm0 \n" | 896 "paddw %9,%%xmm2 \n" // make pixels unsigned and add rounding. |
880 "movd %%xmm0,%k2 \n" | 897 "psrlw $0x7,%%xmm2 \n" |
| 898 "packuswb %%xmm2,%%xmm2 \n" |
| 899 "movd %%xmm2,%k2 \n" |
881 "mov %b2," MEMACCESS(0) " \n" | 900 "mov %b2," MEMACCESS(0) " \n" |
882 "99: \n" | 901 "99: \n" |
883 : "+r"(dst_ptr), // %0 | 902 : "+r"(dst_ptr), // %0 |
884 "+r"(src_ptr), // %1 | 903 "+r"(src_ptr), // %1 |
885 "=&a"(temp_pixel), // %2 | 904 "=&a"(temp_pixel), // %2 |
886 "=&r"(x0), // %3 | 905 "=&r"(x0), // %3 |
887 "=&r"(x1), // %4 | 906 "=&r"(x1), // %4 |
888 "+rm"(dst_width) // %5 | 907 "+rm"(dst_width) // %5 |
889 : "rm"(x), // %6 | 908 : "rm"(x), // %6 |
890 "rm"(dx) // %7 | 909 "rm"(dx), // %7 |
| 910 #if defined(__x86_64__) |
| 911 "x"(kFsub80), // %8 |
| 912 "x"(kFadd40) // %9 |
| 913 #else |
| 914 "m"(kFsub80), // %8 |
| 915 "m"(kFadd40) // %9 |
| 916 #endif |
891 : "memory", "cc", NACL_R14 | 917 : "memory", "cc", NACL_R14 |
892 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 918 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
893 ); | 919 ); |
894 } | 920 } |
895 | 921 |
896 // Reads 4 pixels, duplicates them and writes 8 pixels. | 922 // Reads 4 pixels, duplicates them and writes 8 pixels. |
897 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 923 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
898 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 924 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
899 int dst_width, int x, int dx) { | 925 int dst_width, int x, int dx) { |
900 asm volatile ( | 926 asm volatile ( |
901 LABELALIGN | 927 LABELALIGN |
902 "1: \n" | 928 "1: \n" |
(...skipping 380 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1283 ); | 1309 ); |
1284 return num; | 1310 return num; |
1285 } | 1311 } |
1286 | 1312 |
1287 #endif // defined(__x86_64__) || defined(__i386__) | 1313 #endif // defined(__x86_64__) || defined(__i386__) |
1288 | 1314 |
1289 #ifdef __cplusplus | 1315 #ifdef __cplusplus |
1290 } // extern "C" | 1316 } // extern "C" |
1291 } // namespace libyuv | 1317 } // namespace libyuv |
1292 #endif | 1318 #endif |
OLD | NEW |