| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2013 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 298 matching lines...) |
| 309 "jg 1b \n" | 309 "jg 1b \n" |
| 310 : "+r"(src_ptr), // %0 | 310 : "+r"(src_ptr), // %0 |
| 311 "+r"(dst_ptr), // %1 | 311 "+r"(dst_ptr), // %1 |
| 312 "+r"(dst_width) // %2 | 312 "+r"(dst_width) // %2 |
| 313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" | 313 :: "memory", "cc", "xmm0", "xmm1", "xmm5" |
| 314 ); | 314 ); |
| 315 } | 315 } |
| 316 | 316 |
| 317 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, | 317 void ScaleRowDown4Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, |
| 318 uint8* dst_ptr, int dst_width) { | 318 uint8* dst_ptr, int dst_width) { |
| 319 intptr_t stridex3 = 0; | 319 intptr_t stridex3; |
| 320 asm volatile ( | 320 asm volatile ( |
| 321 "pcmpeqb %%xmm4,%%xmm4 \n" | 321 "pcmpeqb %%xmm4,%%xmm4 \n" |
| 322 "psrlw $0xf,%%xmm4 \n" | 322 "psrlw $0xf,%%xmm4 \n" |
| 323 "movdqa %%xmm4,%%xmm5 \n" | 323 "movdqa %%xmm4,%%xmm5 \n" |
| 324 "packuswb %%xmm4,%%xmm4 \n" | 324 "packuswb %%xmm4,%%xmm4 \n" |
| 325 "psllw $0x3,%%xmm5 \n" | 325 "psllw $0x3,%%xmm5 \n" |
| 326 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" | 326 "lea " MEMLEA4(0x00,4,4,2) ",%3 \n" |
| 327 | 327 |
| 328 LABELALIGN | 328 LABELALIGN |
| 329 "1: \n" | 329 "1: \n" |
| (...skipping 24 matching lines...) |
| 354 "paddw %%xmm5,%%xmm0 \n" | 354 "paddw %%xmm5,%%xmm0 \n" |
| 355 "psrlw $0x4,%%xmm0 \n" | 355 "psrlw $0x4,%%xmm0 \n" |
| 356 "packuswb %%xmm0,%%xmm0 \n" | 356 "packuswb %%xmm0,%%xmm0 \n" |
| 357 "movq %%xmm0," MEMACCESS(1) " \n" | 357 "movq %%xmm0," MEMACCESS(1) " \n" |
| 358 "lea " MEMLEA(0x8,1) ",%1 \n" | 358 "lea " MEMLEA(0x8,1) ",%1 \n" |
| 359 "sub $0x8,%2 \n" | 359 "sub $0x8,%2 \n" |
| 360 "jg 1b \n" | 360 "jg 1b \n" |
| 361 : "+r"(src_ptr), // %0 | 361 : "+r"(src_ptr), // %0 |
| 362 "+r"(dst_ptr), // %1 | 362 "+r"(dst_ptr), // %1 |
| 363 "+r"(dst_width), // %2 | 363 "+r"(dst_width), // %2 |
| 364 "+r"(stridex3) // %3 | 364 "=&r"(stridex3) // %3 |
| 365 : "r"((intptr_t)(src_stride)) // %4 | 365 : "r"((intptr_t)(src_stride)) // %4 |
| 366 : "memory", "cc", NACL_R14 | 366 : "memory", "cc", NACL_R14 |
| 367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 367 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 368 ); | 368 ); |
| 369 } | 369 } |
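
Note on the constraint change in this hunk (and the matching hunks below): `stridex3` is a scratch value that the asm writes before it has finished reading its inputs — the `lea` at line 326 computes `src_stride * 3` into %3 before the loop body reads input %4. Making it an early-clobber, output-only operand (`"=&r"`) tells GCC it may not share a register with any input, and removes the need for the dummy `= 0` initialization on the C side. Scratch variables the asm both reads and writes (e.g. `row1` below, seeded from `src_stride`) correctly stay `"+r"`. A minimal sketch of the hazard the `&` prevents, illustrative and not taken from this file:

    /* Without the '&', GCC may allocate 'out' and 'b' to the same
     * register, so the mov would clobber 'b' before the add reads it. */
    #include <stdint.h>

    static intptr_t add2(intptr_t a, intptr_t b) {
      intptr_t out;             /* output-only: no dummy init needed */
      asm ("mov %1,%0 \n\t"     /* writes %0 early...               */
           "add %2,%0 \n\t"     /* ...before the last input is read */
           : "=&r"(out)         /* early-clobber output             */
           : "r"(a), "r"(b)
           : "cc");
      return out;
    }
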
| 370 | 370 |
| 371 | 371 |
| 372 #ifdef HAS_SCALEROWDOWN4_AVX2 | 372 #ifdef HAS_SCALEROWDOWN4_AVX2 |
| 373 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, | 373 void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, |
| 374 uint8* dst_ptr, int dst_width) { | 374 uint8* dst_ptr, int dst_width) { |
| (...skipping 442 matching lines...) |
| 817 "+r"(src_width) // %2 | 817 "+r"(src_width) // %2 |
| 818 : | 818 : |
| 819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" | 819 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" |
| 820 ); | 820 ); |
| 821 } | 821 } |
| 822 #endif // HAS_SCALEADDROW_AVX2 | 822 #endif // HAS_SCALEADDROW_AVX2 |
| 823 | 823 |
| 824 // Bilinear column filtering. SSSE3 version. | 824 // Bilinear column filtering. SSSE3 version. |
| 825 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, | 825 void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, |
| 826 int dst_width, int x, int dx) { | 826 int dst_width, int x, int dx) { |
| 827 intptr_t x0 = 0, x1 = 0, temp_pixel = 0; | 827 intptr_t x0, x1, temp_pixel; |
| 828 asm volatile ( | 828 asm volatile ( |
| 829 "movd %6,%%xmm2 \n" | 829 "movd %6,%%xmm2 \n" |
| 830 "movd %7,%%xmm3 \n" | 830 "movd %7,%%xmm3 \n" |
| 831 "movl $0x04040000,%k2 \n" | 831 "movl $0x04040000,%k2 \n" |
| 832 "movd %k2,%%xmm5 \n" | 832 "movd %k2,%%xmm5 \n" |
| 833 "pcmpeqb %%xmm6,%%xmm6 \n" | 833 "pcmpeqb %%xmm6,%%xmm6 \n" |
| 834 "psrlw $0x9,%%xmm6 \n" | 834 "psrlw $0x9,%%xmm6 \n" |
| 835 "pextrw $0x1,%%xmm2,%k3 \n" | 835 "pextrw $0x1,%%xmm2,%k3 \n" |
| 836 "subl $0x2,%5 \n" | 836 "subl $0x2,%5 \n" |
| 837 "jl 29f \n" | 837 "jl 29f \n" |
| (...skipping 35 matching lines...) |
| 873 "movd %k2,%%xmm0 \n" | 873 "movd %k2,%%xmm0 \n" |
| 874 "psrlw $0x9,%%xmm2 \n" | 874 "psrlw $0x9,%%xmm2 \n" |
| 875 "pshufb %%xmm5,%%xmm2 \n" | 875 "pshufb %%xmm5,%%xmm2 \n" |
| 876 "pxor %%xmm6,%%xmm2 \n" | 876 "pxor %%xmm6,%%xmm2 \n" |
| 877 "pmaddubsw %%xmm2,%%xmm0 \n" | 877 "pmaddubsw %%xmm2,%%xmm0 \n" |
| 878 "psrlw $0x7,%%xmm0 \n" | 878 "psrlw $0x7,%%xmm0 \n" |
| 879 "packuswb %%xmm0,%%xmm0 \n" | 879 "packuswb %%xmm0,%%xmm0 \n" |
| 880 "movd %%xmm0,%k2 \n" | 880 "movd %%xmm0,%k2 \n" |
| 881 "mov %b2," MEMACCESS(0) " \n" | 881 "mov %b2," MEMACCESS(0) " \n" |
| 882 "99: \n" | 882 "99: \n" |
| 883 : "+r"(dst_ptr), // %0 | 883 : "+r"(dst_ptr), // %0 |
| 884 "+r"(src_ptr), // %1 | 884 "+r"(src_ptr), // %1 |
| 885 "+a"(temp_pixel), // %2 | 885 "=&a"(temp_pixel), // %2 |
| 886 "+r"(x0), // %3 | 886 "=&r"(x0), // %3 |
| 887 "+r"(x1), // %4 | 887 "=&r"(x1), // %4 |
| 888 "+rm"(dst_width) // %5 | 888 "+rm"(dst_width) // %5 |
| 889 : "rm"(x), // %6 | 889 : "rm"(x), // %6 |
| 890 "rm"(dx) // %7 | 890 "rm"(dx) // %7 |
| 891 : "memory", "cc", NACL_R14 | 891 : "memory", "cc", NACL_R14 |
| 892 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 892 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 893 ); | 893 ); |
| 894 } | 894 } |
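
Note on `temp_pixel`: the operand keeps its `a` constraint (now `"=&a"`) because the store at line 881 uses the `%b2` operand modifier, which names the low-byte register of operand 2 (`%al` for `%eax`); on 32-bit x86 only the a/b/c/d registers have byte subregisters, so a plain `"=&r"` would not be safe here. A minimal sketch of the modifier, illustrative only:

    /* Pin 'pixel' to an a-register so %b1 expands to %al. */
    #include <stdint.h>

    static void store_low_byte(uint8_t* dst, uint32_t pixel) {
      asm volatile ("mov %b1,%0 \n"
                    : "=m"(*dst)    /* byte store target */
                    : "a"(pixel));  /* %b1 == %al        */
    }
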
| 895 | 895 |
| 896 // Reads 4 pixels, duplicates them and writes 8 pixels. | 896 // Reads 4 pixels, duplicates them and writes 8 pixels. |
| 897 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. | 897 // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. |
| 898 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, | 898 void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, |
| 899 int dst_width, int x, int dx) { | 899 int dst_width, int x, int dx) { |
| 900 asm volatile ( | 900 asm volatile ( |
| (...skipping 90 matching lines...) |
| 991 : "memory", "cc", NACL_R14 | 991 : "memory", "cc", NACL_R14 |
| 992 "xmm0", "xmm1", "xmm2", "xmm3" | 992 "xmm0", "xmm1", "xmm2", "xmm3" |
| 993 ); | 993 ); |
| 994 } | 994 } |
| 995 | 995 |
| 996 // Reads 4 pixels at a time. | 996 // Reads 4 pixels at a time. |
| 997 // Alignment requirement: dst_argb 16 byte aligned. | 997 // Alignment requirement: dst_argb 16 byte aligned. |
| 998 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, | 998 void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, |
| 999 int src_stepx, uint8* dst_argb, int dst_width) { | 999 int src_stepx, uint8* dst_argb, int dst_width) { |
| 1000 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); | 1000 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
| 1001 intptr_t src_stepx_x12 = 0; | 1001 intptr_t src_stepx_x12; |
| 1002 asm volatile ( | 1002 asm volatile ( |
| 1003 "lea " MEMLEA3(0x00,1,4) ",%1 \n" | 1003 "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
| 1004 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" | 1004 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
| 1005 LABELALIGN | 1005 LABELALIGN |
| 1006 "1: \n" | 1006 "1: \n" |
| 1007 "movd " MEMACCESS(0) ",%%xmm0 \n" | 1007 "movd " MEMACCESS(0) ",%%xmm0 \n" |
| 1008 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 | 1008 MEMOPREG(movd,0x00,0,1,1,xmm1) // movd (%0,%1,1),%%xmm1 |
| 1009 "punpckldq %%xmm1,%%xmm0 \n" | 1009 "punpckldq %%xmm1,%%xmm0 \n" |
| 1010 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 | 1010 MEMOPREG(movd,0x00,0,1,2,xmm2) // movd (%0,%1,2),%%xmm2 |
| 1011 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 | 1011 MEMOPREG(movd,0x00,0,4,1,xmm3) // movd (%0,%4,1),%%xmm3 |
| 1012 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" | 1012 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
| 1013 "punpckldq %%xmm3,%%xmm2 \n" | 1013 "punpckldq %%xmm3,%%xmm2 \n" |
| 1014 "punpcklqdq %%xmm2,%%xmm0 \n" | 1014 "punpcklqdq %%xmm2,%%xmm0 \n" |
| 1015 "movdqu %%xmm0," MEMACCESS(2) " \n" | 1015 "movdqu %%xmm0," MEMACCESS(2) " \n" |
| 1016 "lea " MEMLEA(0x10,2) ",%2 \n" | 1016 "lea " MEMLEA(0x10,2) ",%2 \n" |
| 1017 "sub $0x4,%3 \n" | 1017 "sub $0x4,%3 \n" |
| 1018 "jg 1b \n" | 1018 "jg 1b \n" |
| 1019 : "+r"(src_argb), // %0 | 1019 : "+r"(src_argb), // %0 |
| 1020 "+r"(src_stepx_x4), // %1 | 1020 "+r"(src_stepx_x4), // %1 |
| 1021 "+r"(dst_argb), // %2 | 1021 "+r"(dst_argb), // %2 |
| 1022 "+r"(dst_width), // %3 | 1022 "+r"(dst_width), // %3 |
| 1023 "+r"(src_stepx_x12) // %4 | 1023 "=&r"(src_stepx_x12) // %4 |
| 1024 :: "memory", "cc", NACL_R14 | 1024 :: "memory", "cc", NACL_R14 |
| 1025 "xmm0", "xmm1", "xmm2", "xmm3" | 1025 "xmm0", "xmm1", "xmm2", "xmm3" |
| 1026 ); | 1026 ); |
| 1027 } | 1027 } |
| 1028 | 1028 |
| 1029 // Blends four 2x2 to 4x1. | 1029 // Blends four 2x2 to 4x1. |
| 1030 // Alignment requirement: dst_argb 16 byte aligned. | 1030 // Alignment requirement: dst_argb 16 byte aligned. |
| 1031 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, | 1031 void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, |
| 1032 ptrdiff_t src_stride, int src_stepx, | 1032 ptrdiff_t src_stride, int src_stepx, |
| 1033 uint8* dst_argb, int dst_width) { | 1033 uint8* dst_argb, int dst_width) { |
| 1034 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); | 1034 intptr_t src_stepx_x4 = (intptr_t)(src_stepx); |
| 1035 intptr_t src_stepx_x12 = 0; | 1035 intptr_t src_stepx_x12; |
| 1036 intptr_t row1 = (intptr_t)(src_stride); | 1036 intptr_t row1 = (intptr_t)(src_stride); |
| 1037 asm volatile ( | 1037 asm volatile ( |
| 1038 "lea " MEMLEA3(0x00,1,4) ",%1 \n" | 1038 "lea " MEMLEA3(0x00,1,4) ",%1 \n" |
| 1039 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" | 1039 "lea " MEMLEA4(0x00,1,1,2) ",%4 \n" |
| 1040 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" | 1040 "lea " MEMLEA4(0x00,0,5,1) ",%5 \n" |
| 1041 | 1041 |
| 1042 LABELALIGN | 1042 LABELALIGN |
| 1043 "1: \n" | 1043 "1: \n" |
| 1044 "movq " MEMACCESS(0) ",%%xmm0 \n" | 1044 "movq " MEMACCESS(0) ",%%xmm0 \n" |
| 1045 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 | 1045 MEMOPREG(movhps,0x00,0,1,1,xmm0) // movhps (%0,%1,1),%%xmm0 |
| 1046 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 | 1046 MEMOPREG(movq,0x00,0,1,2,xmm1) // movq (%0,%1,2),%%xmm1 |
| 1047 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 | 1047 MEMOPREG(movhps,0x00,0,4,1,xmm1) // movhps (%0,%4,1),%%xmm1 |
| 1048 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" | 1048 "lea " MEMLEA4(0x00,0,1,4) ",%0 \n" |
| 1049 "movq " MEMACCESS(5) ",%%xmm2 \n" | 1049 "movq " MEMACCESS(5) ",%%xmm2 \n" |
| 1050 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 | 1050 MEMOPREG(movhps,0x00,5,1,1,xmm2) // movhps (%5,%1,1),%%xmm2 |
| 1051 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 | 1051 MEMOPREG(movq,0x00,5,1,2,xmm3) // movq (%5,%1,2),%%xmm3 |
| 1052 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 | 1052 MEMOPREG(movhps,0x00,5,4,1,xmm3) // movhps (%5,%4,1),%%xmm3 |
| 1053 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" | 1053 "lea " MEMLEA4(0x00,5,1,4) ",%5 \n" |
| 1054 "pavgb %%xmm2,%%xmm0 \n" | 1054 "pavgb %%xmm2,%%xmm0 \n" |
| 1055 "pavgb %%xmm3,%%xmm1 \n" | 1055 "pavgb %%xmm3,%%xmm1 \n" |
| 1056 "movdqa %%xmm0,%%xmm2 \n" | 1056 "movdqa %%xmm0,%%xmm2 \n" |
| 1057 "shufps $0x88,%%xmm1,%%xmm0 \n" | 1057 "shufps $0x88,%%xmm1,%%xmm0 \n" |
| 1058 "shufps $0xdd,%%xmm1,%%xmm2 \n" | 1058 "shufps $0xdd,%%xmm1,%%xmm2 \n" |
| 1059 "pavgb %%xmm2,%%xmm0 \n" | 1059 "pavgb %%xmm2,%%xmm0 \n" |
| 1060 "movdqu %%xmm0," MEMACCESS(2) " \n" | 1060 "movdqu %%xmm0," MEMACCESS(2) " \n" |
| 1061 "lea " MEMLEA(0x10,2) ",%2 \n" | 1061 "lea " MEMLEA(0x10,2) ",%2 \n" |
| 1062 "sub $0x4,%3 \n" | 1062 "sub $0x4,%3 \n" |
| 1063 "jg 1b \n" | 1063 "jg 1b \n" |
| 1064 : "+r"(src_argb), // %0 | 1064 : "+r"(src_argb), // %0 |
| 1065 "+r"(src_stepx_x4), // %1 | 1065 "+r"(src_stepx_x4), // %1 |
| 1066 "+r"(dst_argb), // %2 | 1066 "+r"(dst_argb), // %2 |
| 1067 "+rm"(dst_width), // %3 | 1067 "+rm"(dst_width), // %3 |
| 1068 "+r"(src_stepx_x12), // %4 | 1068 "=&r"(src_stepx_x12), // %4 |
| 1069 "+r"(row1) // %5 | 1069 "+r"(row1) // %5 |
| 1070 :: "memory", "cc", NACL_R14 | 1070 :: "memory", "cc", NACL_R14 |
| 1071 "xmm0", "xmm1", "xmm2", "xmm3" | 1071 "xmm0", "xmm1", "xmm2", "xmm3" |
| 1072 ); | 1072 ); |
| 1073 } | 1073 } |
| 1074 | 1074 |
| 1075 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, | 1075 void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, |
| 1076 int dst_width, int x, int dx) { | 1076 int dst_width, int x, int dx) { |
| 1077 intptr_t x0 = 0, x1 = 0; | 1077 intptr_t x0, x1; |
| 1078 asm volatile ( | 1078 asm volatile ( |
| 1079 "movd %5,%%xmm2 \n" | 1079 "movd %5,%%xmm2 \n" |
| 1080 "movd %6,%%xmm3 \n" | 1080 "movd %6,%%xmm3 \n" |
| 1081 "pshufd $0x0,%%xmm2,%%xmm2 \n" | 1081 "pshufd $0x0,%%xmm2,%%xmm2 \n" |
| 1082 "pshufd $0x11,%%xmm3,%%xmm0 \n" | 1082 "pshufd $0x11,%%xmm3,%%xmm0 \n" |
| 1083 "paddd %%xmm0,%%xmm2 \n" | 1083 "paddd %%xmm0,%%xmm2 \n" |
| 1084 "paddd %%xmm3,%%xmm3 \n" | 1084 "paddd %%xmm3,%%xmm3 \n" |
| 1085 "pshufd $0x5,%%xmm3,%%xmm0 \n" | 1085 "pshufd $0x5,%%xmm3,%%xmm0 \n" |
| 1086 "paddd %%xmm0,%%xmm2 \n" | 1086 "paddd %%xmm0,%%xmm2 \n" |
| 1087 "paddd %%xmm3,%%xmm3 \n" | 1087 "paddd %%xmm3,%%xmm3 \n" |
| (...skipping 32 matching lines...) |
| 1120 "pextrw $0x5,%%xmm2,%k0 \n" | 1120 "pextrw $0x5,%%xmm2,%k0 \n" |
| 1121 "punpckldq %%xmm1,%%xmm0 \n" | 1121 "punpckldq %%xmm1,%%xmm0 \n" |
| 1122 "movq %%xmm0," MEMACCESS(2) " \n" | 1122 "movq %%xmm0," MEMACCESS(2) " \n" |
| 1123 "lea " MEMLEA(0x8,2) ",%2 \n" | 1123 "lea " MEMLEA(0x8,2) ",%2 \n" |
| 1124 "29: \n" | 1124 "29: \n" |
| 1125 "test $0x1,%4 \n" | 1125 "test $0x1,%4 \n" |
| 1126 "je 99f \n" | 1126 "je 99f \n" |
| 1127 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 | 1127 MEMOPREG(movd,0x00,3,0,4,xmm0) // movd (%3,%0,4),%%xmm0 |
| 1128 "movd %%xmm0," MEMACCESS(2) " \n" | 1128 "movd %%xmm0," MEMACCESS(2) " \n" |
| 1129 "99: \n" | 1129 "99: \n" |
| 1130 : "+a"(x0), // %0 | 1130 : "=&a"(x0), // %0 |
| 1131 "+d"(x1), // %1 | 1131 "=&d"(x1), // %1 |
| 1132 "+r"(dst_argb), // %2 | 1132 "+r"(dst_argb), // %2 |
| 1133 "+r"(src_argb), // %3 | 1133 "+r"(src_argb), // %3 |
| 1134 "+r"(dst_width) // %4 | 1134 "+r"(dst_width) // %4 |
| 1135 : "rm"(x), // %5 | 1135 : "rm"(x), // %5 |
| 1136 "rm"(dx) // %6 | 1136 "rm"(dx) // %6 |
| 1137 : "memory", "cc", NACL_R14 | 1137 : "memory", "cc", NACL_R14 |
| 1138 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" | 1138 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" |
| 1139 ); | 1139 ); |
| 1140 } | 1140 } |
| 1141 | 1141 |
| (...skipping 30 matching lines...) |
| 1172 }; | 1172 }; |
| 1173 | 1173 |
| 1174 // Shuffle table for duplicating 2 fractions into 8 bytes each | 1174 // Shuffle table for duplicating 2 fractions into 8 bytes each |
| 1175 static uvec8 kShuffleFractions = { | 1175 static uvec8 kShuffleFractions = { |
| 1176 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, | 1176 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, |
| 1177 }; | 1177 }; |
| 1178 | 1178 |
| 1179 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version | 1179 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version |
| 1180 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, | 1180 void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, |
| 1181 int dst_width, int x, int dx) { | 1181 int dst_width, int x, int dx) { |
| 1182 intptr_t x0 = 0, x1 = 0; | 1182 intptr_t x0, x1; |
| 1183 asm volatile ( | 1183 asm volatile ( |
| 1184 "movdqa %0,%%xmm4 \n" | 1184 "movdqa %0,%%xmm4 \n" |
| 1185 "movdqa %1,%%xmm5 \n" | 1185 "movdqa %1,%%xmm5 \n" |
| 1186 : | 1186 : |
| 1187 : "m"(kShuffleColARGB), // %0 | 1187 : "m"(kShuffleColARGB), // %0 |
| 1188 "m"(kShuffleFractions) // %1 | 1188 "m"(kShuffleFractions) // %1 |
| 1189 ); | 1189 ); |
| 1190 | 1190 |
| 1191 asm volatile ( | 1191 asm volatile ( |
| 1192 "movd %5,%%xmm2 \n" | 1192 "movd %5,%%xmm2 \n" |
| (...skipping 42 matching lines...) |
| 1235 "pmaddubsw %%xmm2,%%xmm0 \n" | 1235 "pmaddubsw %%xmm2,%%xmm0 \n" |
| 1236 "psrlw $0x7,%%xmm0 \n" | 1236 "psrlw $0x7,%%xmm0 \n" |
| 1237 "packuswb %%xmm0,%%xmm0 \n" | 1237 "packuswb %%xmm0,%%xmm0 \n" |
| 1238 "movd %%xmm0," MEMACCESS(0) " \n" | 1238 "movd %%xmm0," MEMACCESS(0) " \n" |
| 1239 | 1239 |
| 1240 LABELALIGN | 1240 LABELALIGN |
| 1241 "99: \n" | 1241 "99: \n" |
| 1242 : "+r"(dst_argb), // %0 | 1242 : "+r"(dst_argb), // %0 |
| 1243 "+r"(src_argb), // %1 | 1243 "+r"(src_argb), // %1 |
| 1244 "+rm"(dst_width), // %2 | 1244 "+rm"(dst_width), // %2 |
| 1245 "+r"(x0), // %3 | 1245 "=&r"(x0), // %3 |
| 1246 "+r"(x1) // %4 | 1246 "=&r"(x1) // %4 |
| 1247 : "rm"(x), // %5 | 1247 : "rm"(x), // %5 |
| 1248 "rm"(dx) // %6 | 1248 "rm"(dx) // %6 |
| 1249 : "memory", "cc", NACL_R14 | 1249 : "memory", "cc", NACL_R14 |
| 1250 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" | 1250 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" |
| 1251 ); | 1251 ); |
| 1252 } | 1252 } |
| 1253 | 1253 |
| 1254 // Divide num by div and return as 16.16 fixed point result. | 1254 // Divide num by div and return as 16.16 fixed point result. |
| 1255 int FixedDiv_X86(int num, int div) { | 1255 int FixedDiv_X86(int num, int div) { |
| 1256 asm volatile ( | 1256 asm volatile ( |
| (...skipping 26 matching lines...) |
| 1283 ); | 1283 ); |
| 1284 return num; | 1284 return num; |
| 1285 } | 1285 } |
| 1286 | 1286 |
| 1287 #endif // defined(__x86_64__) || defined(__i386__) | 1287 #endif // defined(__x86_64__) || defined(__i386__) |
| 1288 | 1288 |
| 1289 #ifdef __cplusplus | 1289 #ifdef __cplusplus |
| 1290 } // extern "C" | 1290 } // extern "C" |
| 1291 } // namespace libyuv | 1291 } // namespace libyuv |
| 1292 #endif | 1292 #endif |