OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1005 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1016 "m"(kAddUV128), // %5 | 1016 "m"(kAddUV128), // %5 |
1017 "m"(kARGBToV), // %6 | 1017 "m"(kARGBToV), // %6 |
1018 "m"(kARGBToU), // %7 | 1018 "m"(kARGBToU), // %7 |
1019 "m"(kShufARGBToUV_AVX) // %8 | 1019 "m"(kShufARGBToUV_AVX) // %8 |
1020 : "memory", "cc", NACL_R14 | 1020 : "memory", "cc", NACL_R14 |
1021 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" | 1021 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
1022 ); | 1022 ); |
1023 } | 1023 } |
1024 #endif // HAS_ARGBTOUVROW_AVX2 | 1024 #endif // HAS_ARGBTOUVROW_AVX2 |
1025 | 1025 |
// NOTE(review): AVX2 "J" variant of ARGBToUVRow_AVX2 above — same structure,
// but using the J coefficient/bias constants (kARGBToUJ/kARGBToVJ/kAddUVJ128;
// presumably the full-range JPEG BT.601 matrix — confirm against the constant
// definitions, which are outside this view).
// Converts 32 ARGB pixels per loop iteration (width decremented by 0x20) into
// 16 U and 16 V bytes, 2x2-subsampling by averaging with the next row:
// the VMEMOPREG(vpavgb, ..., 0,4,1, ...) lines read from src_argb0 +
// src_stride_argb (operand %4). dst_v is addressed relative to dst_u via the
// "sub %1,%2" at setup, so only dst_u (%1) is advanced in the loop.
| 1026 #ifdef HAS_ARGBTOUVJROW_AVX2 |
| 1027 void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb, |
| 1028 uint8* dst_u, uint8* dst_v, int width) { |
| 1029 asm volatile ( |
| 1030 "vbroadcastf128 %5,%%ymm5 \n" |
| 1031 "vbroadcastf128 %6,%%ymm6 \n" |
| 1032 "vbroadcastf128 %7,%%ymm7 \n" |
| 1033 "sub %1,%2 \n" |
| 1034 LABELALIGN |
| 1035 "1: \n" |
// Load 128 bytes (32 ARGB pixels) and average each with the row below.
| 1036 "vmovdqu " MEMACCESS(0) ",%%ymm0 \n" |
| 1037 "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n" |
| 1038 "vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n" |
| 1039 "vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n" |
| 1040 VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0 |
| 1041 VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1) |
| 1042 VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2) |
| 1043 VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3) |
| 1044 "lea " MEMLEA(0x80,0) ",%0 \n" |
// Gather even/odd pixels (vshufps 0x88 / 0xdd) and vpavgb them to finish
// the horizontal half of the 2x2 subsample.
| 1045 "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n" |
| 1046 "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n" |
| 1047 "vpavgb %%ymm4,%%ymm0,%%ymm0 \n" |
| 1048 "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" |
| 1049 "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" |
| 1050 "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" |
| 1051 |
// Apply the U (ymm7) and V (ymm6) coefficient rows, horizontally sum the
// per-channel products, add the rounding/bias constant (ymm5 = kAddUVJ128),
// then scale the 16-bit results down to signed bytes.
| 1052 "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" |
| 1053 "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" |
| 1054 "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" |
| 1055 "vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n" |
| 1056 "vphaddw %%ymm3,%%ymm1,%%ymm1 \n" |
| 1057 "vphaddw %%ymm2,%%ymm0,%%ymm0 \n" |
| 1058 "vpaddw %%ymm5,%%ymm0,%%ymm0 \n" |
| 1059 "vpaddw %%ymm5,%%ymm1,%%ymm1 \n" |
| 1060 "vpsraw $0x8,%%ymm1,%%ymm1 \n" |
| 1061 "vpsraw $0x8,%%ymm0,%%ymm0 \n" |
| 1062 "vpacksswb %%ymm0,%%ymm1,%%ymm0 \n" |
| 1063 "vpermq $0xd8,%%ymm0,%%ymm0 \n" |
| 1064 "vpshufb %8,%%ymm0,%%ymm0 \n" |
| 1065 |
// Low 16 bytes go to dst_u (%1); high 16 go to dst_v, addressed as
// (%1,%2,1) because %2 holds dst_v - dst_u after the setup "sub".
| 1066 "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" |
| 1067 VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1) |
| 1068 "lea " MEMLEA(0x10,1) ",%1 \n" |
| 1069 "sub $0x20,%3 \n" |
| 1070 "jg 1b \n" |
| 1071 "vzeroupper \n" |
| 1072 : "+r"(src_argb0), // %0 |
| 1073 "+r"(dst_u), // %1 |
| 1074 "+r"(dst_v), // %2 |
| 1075 "+rm"(width) // %3 |
| 1076 : "r"((intptr_t)(src_stride_argb)), // %4 |
| 1077 "m"(kAddUVJ128), // %5 |
| 1078 "m"(kARGBToVJ), // %6 |
| 1079 "m"(kARGBToUJ), // %7 |
| 1080 "m"(kShufARGBToUV_AVX) // %8 |
| 1081 : "memory", "cc", NACL_R14 |
| 1082 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" |
| 1083 ); |
| 1084 } |
| 1085 #endif // HAS_ARGBTOUVJROW_AVX2 |
| 1086 |
1026 #ifdef HAS_ARGBTOUVJROW_SSSE3 | 1087 #ifdef HAS_ARGBTOUVJROW_SSSE3 |
1027 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, | 1088 void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, |
1028 uint8* dst_u, uint8* dst_v, int width) { | 1089 uint8* dst_u, uint8* dst_v, int width) { |
1029 asm volatile ( | 1090 asm volatile ( |
1030 "movdqa %5,%%xmm3 \n" | 1091 "movdqa %5,%%xmm3 \n" |
1031 "movdqa %6,%%xmm4 \n" | 1092 "movdqa %6,%%xmm4 \n" |
1032 "movdqa %7,%%xmm5 \n" | 1093 "movdqa %7,%%xmm5 \n" |
1033 "sub %1,%2 \n" | 1094 "sub %1,%2 \n" |
1034 LABELALIGN | 1095 LABELALIGN |
1035 "1: \n" | 1096 "1: \n" |
(...skipping 432 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1468 // Read 2 UV from 411, upsample to 8 UV. | 1529 // Read 2 UV from 411, upsample to 8 UV. |
1469 // reading 4 bytes is an msan violation. | 1530 // reading 4 bytes is an msan violation. |
1470 // "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" | 1531 // "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" |
1471 // MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) | 1532 // MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) |
1472 // pinsrw fails with drmemory | 1533 // pinsrw fails with drmemory |
1473 // __asm pinsrw xmm0, [esi], 0 /* U */ | 1534 // __asm pinsrw xmm0, [esi], 0 /* U */ |
1474 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ | 1535 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ |
// NOTE(review): reads 2 U and 2 V bytes with 16-bit movzwl loads (per the
// comments above, this sidesteps the 4-byte over-read flagged by msan and the
// pinsrw issue under drmemory), interleaves U/V with punpcklbw, then
// replicates each UV pair 4x (punpcklwd + punpckldq) to produce 8 UV pairs
// in xmm0. Also loads 8 Y bytes into xmm4 (duplicated via punpcklbw) and
// advances u_buf by 2 and y_buf by 8; v_buf is addressed relative to u_buf
// inside MEMOPARG and is presumably advanced by the caller's shared
// pointer-difference scheme — confirm at the use site (outside this view).
1475 #define READYUV411_TEMP \ | 1536 #define READYUV411_TEMP \ |
1476 "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ | 1537 "movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \ |
1477 "movd %[temp],%%xmm0 \n" \ | 1538 "movd %[temp],%%xmm0 \n" \ |
1478 MEMOPARG(movzwl,0x00,[u_buf],[v_buf],1,[temp]) " \n" \ | 1539 MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \ |
1479 "movd %[temp],%%xmm1 \n" \ | 1540 "movd %[temp],%%xmm1 \n" \ |
1480 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ | 1541 "lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \ |
1481 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1542 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1482 "punpcklwd %%xmm0,%%xmm0 \n" \ | 1543 "punpcklwd %%xmm0,%%xmm0 \n" \ |
1483 "punpckldq %%xmm0,%%xmm0 \n" \ | 1544 "punpckldq %%xmm0,%%xmm0 \n" \ |
1484 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1545 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1485 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1546 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1486 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1547 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1487 | 1548 |
1488 // Read 4 UV from NV12, upsample to 8 UV | 1549 // Read 4 UV from NV12, upsample to 8 UV |
(...skipping 536 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2025 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ | 2086 "vpaddsw %%ymm4,%%ymm1,%%ymm1 \n" \ |
2026 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ | 2087 "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" \ |
2027 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ | 2088 "vpsraw $0x6,%%ymm0,%%ymm0 \n" \ |
2028 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ | 2089 "vpsraw $0x6,%%ymm1,%%ymm1 \n" \ |
2029 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ | 2090 "vpsraw $0x6,%%ymm2,%%ymm2 \n" \ |
2030 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ | 2091 "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ |
2031 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ | 2092 "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n" \ |
2032 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" | 2093 "vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" |
2033 #define YUVTORGB_REGS_AVX2 \ | 2094 #define YUVTORGB_REGS_AVX2 \ |
2034 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", | 2095 "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", |
2035 #else// Convert 16 pixels: 16 UV and 16 Y. | 2096 #else // Convert 16 pixels: 16 UV and 16 Y. |
2036 #define YUVTORGB_SETUP_AVX2(yuvconstants) | 2097 #define YUVTORGB_SETUP_AVX2(yuvconstants) |
2037 #define YUVTORGB_AVX2(yuvconstants) \ | 2098 #define YUVTORGB_AVX2(yuvconstants) \ |
2038 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ | 2099 "vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \ |
2039 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ | 2100 "vpmaddubsw " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1 \n" \ |
2040 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ | 2101 "vpmaddubsw " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0 \n" \ |
2041 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ | 2102 "vmovdqu " MEMACCESS2(160, [yuvconstants]) ",%%ymm3 \n" \ |
2042 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ | 2103 "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ |
2043 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ | 2104 "vmovdqu " MEMACCESS2(128, [yuvconstants]) ",%%ymm3 \n" \ |
2044 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ | 2105 "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ |
2045 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ | 2106 "vmovdqu " MEMACCESS2(96, [yuvconstants]) ",%%ymm3 \n" \ |
(...skipping 3391 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5437 ); | 5498 ); |
5438 } | 5499 } |
5439 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5500 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5440 | 5501 |
5441 #endif // defined(__x86_64__) || defined(__i386__) | 5502 #endif // defined(__x86_64__) || defined(__i386__) |
5442 | 5503 |
5443 #ifdef __cplusplus | 5504 #ifdef __cplusplus |
5444 } // extern "C" | 5505 } // extern "C" |
5445 } // namespace libyuv | 5506 } // namespace libyuv |
5446 #endif | 5507 #endif |
OLD | NEW |