OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 298 matching lines...)
309 "jg 1b \n" | 309 "jg 1b \n" |
310 : "+r"(src_ptr), // %0 | 310 : "+r"(src_ptr), // %0 |
311 "+r"(dst_ptr), // %1 | 311 "+r"(dst_ptr), // %1 |
312 "+r"(dst_width) // %2 | 312 "+r"(dst_width) // %2 |
313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
314 ); | 314 ); |
315 } | 315 } |
316 | 316 |
317 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 317 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
318 uint8* dst_ptr, int dst_width) { | 318 uint8* dst_ptr, int dst_width) { |
319 intptr_t stridex3 = 0; | 319 intptr_t stridex3; |
320 asm volatile ( | 320 asm volatile ( |
321 "pcmpeqb %%xmm4,%%xmm4 \n" | 321 "pcmpeqb %%xmm4,%%xmm4 \n" |
322 "psrlw $0xf,%%xmm4 \n" | 322 "psrlw $0xf,%%xmm4 \n" |
323 "movdqa %%xmm4,%%xmm5 \n" | 323 "movdqa %%xmm4,%%xmm5 \n" |
324 "packuswb %%xmm4,%%xmm4 \n" | 324 "packuswb %%xmm4,%%xmm4 \n" |
325 "psllw $0x3,%%xmm5 \n" | 325 "psllw $0x3,%%xmm5 \n" |
326 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" | 326 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" |
327 | 327 |
328 LABELALIGN | 328 LABELALIGN |
329 "1: \n" | 329 "1: \n" |
(...skipping 24 matching lines...)
354 "paddw %%xmm5,%%xmm0 \n" | 354 "paddw %%xmm5,%%xmm0 \n" |
355 "psrlw $0x4,%%xmm0 \n" | 355 "psrlw $0x4,%%xmm0 \n" |
356 "packuswb %%xmm0,%%xmm0 \n" | 356 "packuswb %%xmm0,%%xmm0 \n" |
357 "movq %%xmm0," MEMACCESS(1) " \n" | 357 "movq %%xmm0," MEMACCESS(1) " \n" |
358 "lea " MEMLEA(0x8,1) ",%1 \n" | 358 "lea " MEMLEA(0x8,1) ",%1 \n" |
359 "sub $0x8,%2 \n" | 359 "sub $0x8,%2 \n" |
360 "jg 1b \n" | 360 "jg 1b \n" |
361 : "+r"(src_ptr), // %0 | 361 : "+r"(src_ptr), // %0 |
362 "+r"(dst_ptr), // %1 | 362 "+r"(dst_ptr), // %1 |
363 "+r"(dst_width), // %2 | 363 "+r"(dst_width), // %2 |
364 "+r"(stridex3) // %3 | 364 "=&r"(stridex3) // %3 |
365 : "r"((intptr_t)(src_stride)) // %4 | 365 : "r"((intptr_t)(src_stride)) // %4 |
366 : "memory", "cc", NACL_R14 | 366 : "memory", "cc", NACL_R14 |
367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
368 ); | 368 ); |
369 } | 369 } |
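
The change above drops the dead "= 0" initializer and converts the scratch operand from a read-write "+r" constraint to a write-only, early-clobber "=&r" output. With "+r" the asm is assumed to read the operand on entry, so the variable had to be zero-initialized; with "=&r" it is a pure output, and the '&' tells the register allocator not to place any input in the same register, because the asm writes the output (the setup lea into %3) before the loop has finished reading the inputs. A minimal sketch of why the '&' matters, assuming GCC/Clang extended asm on x86 (the function and variable names here are illustrative, not from libyuv):

    #include <stdint.h>

    static inline intptr_t add_demo(intptr_t a, intptr_t b) {
      intptr_t out;                /* no dead "= 0" initializer needed */
      asm("mov %1,%0 \n\t"         /* out is written here ...          */
          "add %2,%0 \n\t"         /* ... while %2 is still read, so   */
                                   /* out must not share b's register  */
          : "=&r"(out)             /* write-only, early-clobber output */
          : "r"(a), "r"(b));
      return out;                  /* out == a + b */
    }

Without the '&', the compiler could legally assign b's register to out, and the mov would clobber b before the add reads it.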
370 | 370 |
371 | 371 |
372 #ifdef HAS_SCALEROWDOWN4_AVX2 | 372 #ifdef HAS_SCALEROWDOWN4_AVX2 |
373 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 373 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
374 uint8* dst_ptr, int dst_width) { | 374 uint8* dst_ptr, int dst_width) { |
(...skipping 442 matching lines...)
817 "+r"(src_width) // %2 | 817 "+r"(src_width) // %2 |
818 : | 818 : |
819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
820 ); | 820 ); |
821 } | 821 } |
822 #endif // HAS_SCALEADDROW_AVX2 | 822 #endif // HAS_SCALEADDROW_AVX2 |
823 | 823 |
824 // Bilinear column filtering. SSSE3 version. | 824 // Bilinear column filtering. SSSE3 version. |
825 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 825 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
826 int dst_width, int x, int dx) { | 826 int dst_width, int x, int dx) { |
827 intptr_t x0 = 0, x1 = 0, temp_pixel = 0; | 827 intptr_t x0, x1, temp_pixel; |
828 asm volatile ( | 828 asm volatile ( |
829 "movd %6,%%xmm2 \n" | 829 "movd %6,%%xmm2 \n" |
830 "movd %7,%%xmm3 \n" | 830 "movd %7,%%xmm3 \n" |
831 "movl $0x04040000,%k2 \n" | 831 "movl $0x04040000,%k2 \n" |
832 "movd %k2,%%xmm5 \n" | 832 "movd %k2,%%xmm5 \n" |
833 "pcmpeqb %%xmm6,%%xmm6 \n" | 833 "pcmpeqb %%xmm6,%%xmm6 \n" |
834 "psrlw $0x9,%%xmm6 \n" | 834 "psrlw $0x9,%%xmm6 \n" |
835 "pextrw $0x1,%%xmm2,%k3 \n" | 835 "pextrw $0x1,%%xmm2,%k3 \n" |
836 "subl $0x2,%5 \n" | 836 "subl $0x2,%5 \n" |
837 "jl 29f \n" | 837 "jl 29f \n" |
(...skipping 35 matching lines...)
873 "movd %k2,%%xmm0 \n" | 873 "movd %k2,%%xmm0 \n" |
874 "psrlw $0x9,%%xmm2 \n" | 874 "psrlw $0x9,%%xmm2 \n" |
875 "pshufb %%xmm5,%%xmm2 \n" | 875 "pshufb %%xmm5,%%xmm2 \n" |
876 "pxor %%xmm6,%%xmm2 \n" | 876 "pxor %%xmm6,%%xmm2 \n" |
877 "pmaddubsw %%xmm2,%%xmm0 \n" | 877 "pmaddubsw %%xmm2,%%xmm0 \n" |
878 "psrlw $0x7,%%xmm0 \n" | 878 "psrlw $0x7,%%xmm0 \n" |
879 "packuswb %%xmm0,%%xmm0 \n" | 879 "packuswb %%xmm0,%%xmm0 \n" |
880 "movd %%xmm0,%k2 \n" | 880 "movd %%xmm0,%k2 \n" |
881 "mov %b2," MEMACCESS(0) " \n" | 881 "mov %b2," MEMACCESS(0) " \n" |
882 "99: \n" | 882 "99: \n" |
883 : "+r"(dst_ptr), // %0 | 883 : "+r"(dst_ptr), // %0 |
884 "+r"(src_ptr), // %1 | 884 "+r"(src_ptr), // %1 |
885 "+a"(temp_pixel), // %2 | 885 "=&a"(temp_pixel), // %2 |
886 "+r"(x0), // %3 | 886 "=&r"(x0), // %3 |
887 "+r"(x1), // %4 | 887 "=&r"(x1), // %4 |
888 "+rm"(dst_width) // %5 | 888 "+rm"(dst_width) // %5 |
889 : "rm"(x), // %6 | 889 : "rm"(x), // %6 |
890 "rm"(dx) // %7 | 890 "rm"(dx) // %7 |
891 : "memory", "cc", NACL_R14 | 891 : "memory", "cc", NACL_R14 |
892 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 892 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
893 ); | 893 ); |
894 } | 894 } |
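
Here the scratch operand additionally needs a specific register: the asm stores through %b2 (the byte subregister of operand 2, e.g. %al), and on 32-bit x86 only the a/b/c/d registers have byte forms, hence "=&a" rather than "=&r". A hedged sketch of combining early clobber with a specific-register constraint (names are illustrative, not from libyuv):

    #include <stdint.h>

    static inline void store_low_byte(uint8_t* dst, int value) {
      intptr_t t;
      asm("movl %2,%k0 \n\t"       /* write the 32-bit view of t first  */
          "mov %b0,(%1) \n\t"      /* store t's low byte while %1 is    */
                                   /* still live: '&' keeps them apart  */
          : "=&a"(t)               /* write-only, early-clobber, in eax */
          : "r"(dst), "r"(value)
          : "memory");             /* the asm writes to *dst            */
    }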
895 | 895 |
896 // Reads 4 pixels, duplicates them and writes 8 pixels. | 896 // Reads 4 pixels, duplicates them and writes 8 pixels. |
897 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 897 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
898 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 898 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
899 int dst_width, int x, int dx) { | 899 int dst_width, int x, int dx) { |
900 asm volatile ( | 900 asm volatile ( |
(...skipping 90 matching lines...)
991 : "memory", "cc", NACL_R14 | 991 : "memory", "cc", NACL_R14 |
992 "xmm0", "xmm1", "xmm2", "xmm3" | 992 "xmm0", "xmm1", "xmm2", "xmm3" |
993 ); | 993 ); |
994 } | 994 } |
995 | 995 |
996 // Reads 4 pixels at a time. | 996 // Reads 4 pixels at a time. |
997 // Alignment requirement: dst_argb 16 byte aligned. | 997 // Alignment requirement: dst_argb 16 byte aligned. |
998 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, | 998 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
999 int src_stepx, uint8* dst_argb, int dst_width) { | 999 int src_stepx, uint8* dst_argb, int dst_width) { |
1000 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); | 1000 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
1001 intptr_t src_stepx_x12 = 0; | 1001 intptr_t src_stepx_x12; |
1002 asm volatile ( | 1002 asm volatile ( |
1003 "lea " MEMLEA3(0x00,1,4) ",%1 \n" | 1003 "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
1004 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" | 1004 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
1005 LABELALIGN | 1005 LABELALIGN |
1006 "1: \n" | 1006 "1: \n" |
1007 "movd " MEMACCESS(0) ",%%xmm0 \n" | 1007 "movd " MEMACCESS(0) ",%%xmm0 \n" |
1008 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 | 1008 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 |
1009 "punpckldq %%xmm1,%%xmm0 \n" | 1009 "punpckldq %%xmm1,%%xmm0 \n" |
1010 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 | 1010 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 |
1011 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 | 1011 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 |
1012 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" | 1012 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
1013 "punpckldq %%xmm3,%%xmm2 \n" | 1013 "punpckldq %%xmm3,%%xmm2 \n" |
1014 "punpcklqdq %%xmm2,%%xmm0 \n" | 1014 "punpcklqdq %%xmm2,%%xmm0 \n" |
1015 "movdqu %%xmm0," MEMACCESS(2) " \n" | 1015 "movdqu %%xmm0," MEMACCESS(2) " \n" |
1016 "lea " MEMLEA(0x10,2) ",%2 \n" | 1016 "lea " MEMLEA(0x10,2) ",%2 \n" |
1017 "sub $0x4,%3 \n" | 1017 "sub $0x4,%3 \n" |
1018 "jg 1b \n" | 1018 "jg 1b \n" |
1019 : "+r"(src_argb), // %0 | 1019 : "+r"(src_argb), // %0 |
1020 "+r"(src_stepx_x4), // %1 | 1020 "+r"(src_stepx_x4), // %1 |
1021 "+r"(dst_argb), // %2 | 1021 "+r"(dst_argb), // %2 |
1022 "+r"(dst_width), // %3 | 1022 "+r"(dst_width), // %3 |
1023 "+r"(src_stepx_x12) // %4 | 1023 "=&r"(src_stepx_x12) // %4 |
1024 :: "memory", "cc", NACL_R14 | 1024 :: "memory", "cc", NACL_R14 |
1025 "xmm0", "xmm1", "xmm2", "xmm3" | 1025 "xmm0", "xmm1", "xmm2", "xmm3" |
1026 ); | 1026 ); |
1027 } | 1027 } |
1028 | 1028 |
1029 // Blends four 2x2 to 4x1. | 1029 // Blends four 2x2 to 4x1. |
1030 // Alignment requirement: dst_argb 16 byte aligned. | 1030 // Alignment requirement: dst_argb 16 byte aligned. |
1031 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, | 1031 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, |
1032 ptrdiff_t src_stride, int src_stepx, | 1032 ptrdiff_t src_stride, int src_stepx, |
1033 uint8* dst_argb, int dst_width) { | 1033 uint8* dst_argb, int dst_width) { |
1034 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); | 1034 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
1035 intptr_t src_stepx_x12 = 0; | 1035 intptr_t src_stepx_x12; |
1036 intptr_t row1 = (intptr_t)(src_stride); | 1036 intptr_t row1 = (intptr_t)(src_stride); |
1037 asm volatile ( | 1037 asm volatile ( |
1038 "lea " MEMLEA3(0x00,1,4) ",%1 \n" | 1038 "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
1039 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" | 1039 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
1040 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" | 1040 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" |
1041 | 1041 |
1042 LABELALIGN | 1042 LABELALIGN |
1043 "1: \n" | 1043 "1: \n" |
1044 "movq " MEMACCESS(0) ",%%xmm0 \n" | 1044 "movq " MEMACCESS(0) ",%%xmm0 \n" |
1045 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 | 1045 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 |
1046 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 | 1046 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 |
1047 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 | 1047 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 |
1048 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" | 1048 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
1049 "movq " MEMACCESS(5) ",%%xmm2 \n" | 1049 "movq " MEMACCESS(5) ",%%xmm2 \n" |
1050 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 | 1050 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 |
1051 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 | 1051 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 |
1052 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 | 1052 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 |
1053 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" | 1053 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" |
1054 "pavgb %%xmm2,%%xmm0 \n" | 1054 "pavgb %%xmm2,%%xmm0 \n" |
1055 "pavgb %%xmm3,%%xmm1 \n" | 1055 "pavgb %%xmm3,%%xmm1 \n" |
1056 "movdqa %%xmm0,%%xmm2 \n" | 1056 "movdqa %%xmm0,%%xmm2 \n" |
1057 "shufps $0x88,%%xmm1,%%xmm0 \n" | 1057 "shufps $0x88,%%xmm1,%%xmm0 \n" |
1058 "shufps $0xdd,%%xmm1,%%xmm2 \n" | 1058 "shufps $0xdd,%%xmm1,%%xmm2 \n" |
1059 "pavgb %%xmm2,%%xmm0 \n" | 1059 "pavgb %%xmm2,%%xmm0 \n" |
1060 "movdqu %%xmm0," MEMACCESS(2) " \n" | 1060 "movdqu %%xmm0," MEMACCESS(2) " \n" |
1061 "lea " MEMLEA(0x10,2) ",%2 \n" | 1061 "lea " MEMLEA(0x10,2) ",%2 \n" |
1062 "sub $0x4,%3 \n" | 1062 "sub $0x4,%3 \n" |
1063 "jg 1b \n" | 1063 "jg 1b \n" |
1064 : "+r"(src_argb), // %0 | 1064 : "+r"(src_argb), // %0 |
1065 "+r"(src_stepx_x4), // %1 | 1065 "+r"(src_stepx_x4), // %1 |
1066 "+r"(dst_argb), // %2 | 1066 "+r"(dst_argb), // %2 |
1067 "+rm"(dst_width), // %3 | 1067 "+rm"(dst_width), // %3 |
1068 "+r"(src_stepx_x12), // %4 | 1068 "=&r"(src_stepx_x12), // %4 |
1069 "+r"(row1) // %5 | 1069 "+r"(row1) // %5 |
1070 :: "memory", "cc", NACL_R14 | 1070 :: "memory", "cc", NACL_R14 |
1071 "xmm0", "xmm1", "xmm2", "xmm3" | 1071 "xmm0", "xmm1", "xmm2", "xmm3" |
1072 ); | 1072 ); |
1073 } | 1073 } |
1074 | 1074 |
1075 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, | 1075 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, |
1076 int dst_width, int x, int dx) { | 1076 int dst_width, int x, int dx) { |
1077 intptr_t x0 = 0, x1 = 0; | 1077 intptr_t x0, x1; |
1078 asm volatile ( | 1078 asm volatile ( |
1079 "movd %5,%%xmm2 \n" | 1079 "movd %5,%%xmm2 \n" |
1080 "movd %6,%%xmm3 \n" | 1080 "movd %6,%%xmm3 \n" |
1081 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 1081 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
1082 "pshufd $0x11,%%xmm3,%%xmm0 \n" | 1082 "pshufd $0x11,%%xmm3,%%xmm0 \n" |
1083 "paddd %%xmm0,%%xmm2 \n" | 1083 "paddd %%xmm0,%%xmm2 \n" |
1084 "paddd %%xmm3,%%xmm3 \n" | 1084 "paddd %%xmm3,%%xmm3 \n" |
1085 "pshufd $0x5,%%xmm3,%%xmm0 \n" | 1085 "pshufd $0x5,%%xmm3,%%xmm0 \n" |
1086 "paddd %%xmm0,%%xmm2 \n" | 1086 "paddd %%xmm0,%%xmm2 \n" |
1087 "paddd %%xmm3,%%xmm3 \n" | 1087 "paddd %%xmm3,%%xmm3 \n" |
(...skipping 32 matching lines...)
1120 "pextrw $0x5,%%xmm2,%k0 \n" | 1120 "pextrw $0x5,%%xmm2,%k0 \n" |
1121 "punpckldq %%xmm1,%%xmm0 \n" | 1121 "punpckldq %%xmm1,%%xmm0 \n" |
1122 "movq %%xmm0," MEMACCESS(2) " \n" | 1122 "movq %%xmm0," MEMACCESS(2) " \n" |
1123 "lea " MEMLEA(0x8,2) ",%2 \n" | 1123 "lea " MEMLEA(0x8,2) ",%2 \n" |
1124 "29: \n" | 1124 "29: \n" |
1125 "test $0x1,%4 \n" | 1125 "test $0x1,%4 \n" |
1126 "je 99f \n" | 1126 "je 99f \n" |
1127 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 | 1127 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 |
1128 "movd %%xmm0," MEMACCESS(2) " \n" | 1128 "movd %%xmm0," MEMACCESS(2) " \n" |
1129 "99: \n" | 1129 "99: \n" |
1130 : "+a"(x0), // %0 | 1130 : "=&a"(x0), // %0 |
1131 "+d"(x1), // %1 | 1131 "=&d"(x1), // %1 |
1132 "+r"(dst_argb), // %2 | 1132 "+r"(dst_argb), // %2 |
1133 "+r"(src_argb), // %3 | 1133 "+r"(src_argb), // %3 |
1134 "+r"(dst_width) // %4 | 1134 "+r"(dst_width) // %4 |
1135 : "rm"(x), // %5 | 1135 : "rm"(x), // %5 |
1136 "rm"(dx) // %6 | 1136 "rm"(dx) // %6 |
1137 : "memory", "cc", NACL_R14 | 1137 : "memory", "cc", NACL_R14 |
1138 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 1138 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
1139 ); | 1139 ); |
1140 } | 1140 } |
1141 | 1141 |
(...skipping 30 matching lines...)
1172 }; | 1172 }; |
1173 | 1173 |
1174 // Shuffle table for duplicating 2 fractions into 8 bytes each | 1174 // Shuffle table for duplicating 2 fractions into 8 bytes each |
1175 static uvec8 kShuffleFractions = { | 1175 static uvec8 kShuffleFractions = { |
1176 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, | 1176 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
1177 }; | 1177 }; |
1178 | 1178 |
1179 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version | 1179 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version |
1180 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, | 1180 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
1181 int dst_width, int x, int dx) { | 1181 int dst_width, int x, int dx) { |
1182 intptr_t x0 = 0, x1 = 0; | 1182 intptr_t x0, x1; |
1183 asm volatile ( | 1183 asm volatile ( |
1184 "movdqa %0,%%xmm4 \n" | 1184 "movdqa %0,%%xmm4 \n" |
1185 "movdqa %1,%%xmm5 \n" | 1185 "movdqa %1,%%xmm5 \n" |
1186 : | 1186 : |
1187 : "m"(kShuffleColARGB), // %0 | 1187 : "m"(kShuffleColARGB), // %0 |
1188 "m"(kShuffleFractions) // %1 | 1188 "m"(kShuffleFractions) // %1 |
1189 ); | 1189 ); |
1190 | 1190 |
1191 asm volatile ( | 1191 asm volatile ( |
1192 "movd %5,%%xmm2 \n" | 1192 "movd %5,%%xmm2 \n" |
(...skipping 42 matching lines...)
1235 "pmaddubsw %%xmm2,%%xmm0 \n" | 1235 "pmaddubsw %%xmm2,%%xmm0 \n" |
1236 "psrlw $0x7,%%xmm0 \n" | 1236 "psrlw $0x7,%%xmm0 \n" |
1237 "packuswb %%xmm0,%%xmm0 \n" | 1237 "packuswb %%xmm0,%%xmm0 \n" |
1238 "movd %%xmm0," MEMACCESS(0) " \n" | 1238 "movd %%xmm0," MEMACCESS(0) " \n" |
1239 | 1239 |
1240 LABELALIGN | 1240 LABELALIGN |
1241 "99: \n" | 1241 "99: \n" |
1242 : "+r"(dst_argb), // %0 | 1242 : "+r"(dst_argb), // %0 |
1243 "+r"(src_argb), // %1 | 1243 "+r"(src_argb), // %1 |
1244 "+rm"(dst_width), // %2 | 1244 "+rm"(dst_width), // %2 |
1245 "+r"(x0), // %3 | 1245 "=&r"(x0), // %3 |
1246 "+r"(x1) // %4 | 1246 "=&r"(x1) // %4 |
1247 : "rm"(x), // %5 | 1247 : "rm"(x), // %5 |
1248 "rm"(dx) // %6 | 1248 "rm"(dx) // %6 |
1249 : "memory", "cc", NACL_R14 | 1249 : "memory", "cc", NACL_R14 |
1250 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 1250 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
1251 ); | 1251 ); |
1252 } | 1252 } |
1253 | 1253 |
1254 // Divide num by div and return as 16.16 fixed point result. | 1254 // Divide num by div and return as 16.16 fixed point result. |
1255 int FixedDiv_X86(int num, int div) { | 1255 int FixedDiv_X86(int num, int div) { |
1256 asm volatile ( | 1256 asm volatile ( |
(...skipping 26 matching lines...)
1283 ); | 1283 ); |
1284 return num; | 1284 return num; |
1285 } | 1285 } |
1286 | 1286 |
1287 #endif // defined(__x86_64__) || defined(__i386__) | 1287 #endif // defined(__x86_64__) || defined(__i386__) |
1288 | 1288 |
1289 #ifdef __cplusplus | 1289 #ifdef __cplusplus |
1290 } // extern "C" | 1290 } // extern "C" |
1291 } // namespace libyuv | 1291 } // namespace libyuv |
1292 #endif | 1292 #endif |