OLD | NEW |
1 // VERSION 2 | 1 // VERSION 2 |
2 /* | 2 /* |
3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 3 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
4 * | 4 * |
5 * Use of this source code is governed by a BSD-style license | 5 * Use of this source code is governed by a BSD-style license |
6 * that can be found in the LICENSE file in the root of the source | 6 * that can be found in the LICENSE file in the root of the source |
7 * tree. An additional intellectual property rights grant can be found | 7 * tree. An additional intellectual property rights grant can be found |
8 * in the file PATENTS. All contributing project authors may | 8 * in the file PATENTS. All contributing project authors may |
9 * be found in the AUTHORS file in the root of the source tree. | 9 * be found in the AUTHORS file in the root of the source tree. |
10 */ | 10 */ |
(...skipping 1413 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1424 "m"(kRGBAToV), // %5 | 1424 "m"(kRGBAToV), // %5 |
1425 "m"(kRGBAToU), // %6 | 1425 "m"(kRGBAToU), // %6 |
1426 "m"(kAddUV128) // %7 | 1426 "m"(kAddUV128) // %7 |
1427 : "memory", "cc", NACL_R14 | 1427 : "memory", "cc", NACL_R14 |
1428 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" | 1428 "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" |
1429 ); | 1429 ); |
1430 } | 1430 } |
1431 | 1431 |
1432 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) | 1432 #if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2) |
1433 | 1433 |
1434 // Read 8 UV from 411 | 1434 // Read 8 UV from 444 |
// READYUV444: sequence of asm instruction strings that fetches the inputs
// for one group of 8 pixels. Step by step:
//  - movq loads 8 bytes from [u_buf] into xmm0.
//  - MEMOPREG(movq, ...) loads 8 bytes into xmm1 using both u_buf and v_buf.
//    Presumably it addresses u_buf + v_buf, with v_buf holding the V-minus-U
//    offset (MEMOPREG is defined outside this view - TODO confirm).
//  - lea advances u_buf by 8 bytes.
//  - punpcklbw interleaves the low 8 bytes of xmm0 and xmm1, leaving 8
//    (xmm0 byte, xmm1 byte) pairs in xmm0.
//  - movq loads 8 bytes from [y_buf] into xmm4; unpacking xmm4 with itself
//    duplicates every Y byte into a 16-bit word.
//  - lea advances y_buf by 8 bytes.
// Registers written: xmm0, xmm1, xmm4.
1435 #define READYUV444 \ | 1435 #define READYUV444 \ |
1436 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ | 1436 "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
1437 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ | 1437 MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
1438 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ | 1438 "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ |
1439 "punpcklbw %%xmm1,%%xmm0 \n" \ | 1439 "punpcklbw %%xmm1,%%xmm0 \n" \ |
1440 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1440 "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1441 "punpcklbw %%xmm4,%%xmm4 \n" \ | 1441 "punpcklbw %%xmm4,%%xmm4 \n" \ |
1442 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" | 1442 "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" |
1443 | 1443 |
1444 // Read 4 UV from 422, upsample to 8 UV | 1444 // Read 4 UV from 422, upsample to 8 UV |
(...skipping 500 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1945 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1945 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1946 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1946 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1947 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1947 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1948 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1948 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1949 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1949 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1950 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ | 1950 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ |
1951 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ | 1951 "vmovdqu " MEMACCESS([a_buf]) ",%%xmm5 \n" \ |
1952 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ | 1952 "vpermq $0xd8,%%ymm5,%%ymm5 \n" \ |
1953 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" | 1953 "lea " MEMLEA(0x10, [a_buf]) ",%[a_buf] \n" |
1954 | 1954 |
| 1955 // Read 4 UV from 411, upsample to 16 UV. |
// READYUV411_AVX2: sequence of asm instruction strings that fetches the
// inputs for one group of 16 pixels. Step by step:
//  - vmovd loads 4 bytes from [u_buf] into xmm0.
//  - MEMOPREG(vmovd, ...) loads 4 bytes into xmm1 using both u_buf and v_buf.
//    Presumably it addresses u_buf + v_buf, with v_buf holding the V-minus-U
//    offset set up by the caller's `sub` (MEMOPREG is defined outside this
//    view - TODO confirm).
//  - lea advances u_buf by 4 bytes.
//  - vpunpcklbw interleaves the bytes of ymm0 and ymm1 into 4 two-byte pairs.
//  - vpunpcklwd of ymm0 with itself duplicates each 16-bit pair (4 -> 8).
//  - vpermq $0xd8 reorders the 64-bit quadwords as 0,2,1,3, so the second
//    quadword of duplicated pairs lands in the upper 128-bit lane; this is
//    needed because the unpack instructions work within each lane.
//  - vpunpckldq of ymm0 with itself duplicates each 32-bit element (8 -> 16),
//    so every original pair ends up repeated four times.
//  - vmovdqu loads 16 bytes from [y_buf] into xmm4, vpermq $0xd8 spreads the
//    two 8-byte halves across the lanes, and vpunpcklbw of ymm4 with itself
//    duplicates every Y byte into a 16-bit word.
//  - lea advances y_buf by 16 bytes.
// Registers written: ymm0, xmm1, ymm4.
| 1956 #define READYUV411_AVX2 \ |
| 1957 "vmovd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ |
| 1958 MEMOPREG(vmovd, 0x00, [u_buf], [v_buf], 1, xmm1) \ |
| 1959 "lea " MEMLEA(0x4, [u_buf]) ",%[u_buf] \n" \ |
| 1960 "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" \ |
| 1961 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1962 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
| 1963 "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" \ |
| 1964 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
| 1965 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
| 1966 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
| 1967 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
| 1968 |
1955 // Read 8 UV from NV12, upsample to 16 UV. | 1969 // Read 8 UV from NV12, upsample to 16 UV. |
// READNV12_AVX2: sequence of asm instruction strings that fetches the inputs
// for one group of 16 pixels from a single chroma pointer (uv_buf) plus y_buf.
// Step by step:
//  - vmovdqu loads 16 bytes from [uv_buf] into xmm0 (no separate V load is
//    performed, unlike the planar READYUV* macros).
//  - lea advances uv_buf by 16 bytes.
//  - vpermq $0xd8 reorders the 64-bit quadwords as 0,2,1,3, placing the
//    second 8 bytes in the upper 128-bit lane ahead of the in-lane unpack.
//  - vpunpcklwd of ymm0 with itself duplicates each 16-bit element (8 -> 16).
//  - vmovdqu loads 16 bytes from [y_buf] into xmm4, vpermq $0xd8 spreads the
//    two 8-byte halves across the lanes, and vpunpcklbw of ymm4 with itself
//    duplicates every Y byte into a 16-bit word.
//  - lea advances y_buf by 16 bytes.
// Registers written: ymm0, ymm4.
1956 #define READNV12_AVX2 \ | 1970 #define READNV12_AVX2 \ |
1957 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ | 1971 "vmovdqu " MEMACCESS([uv_buf]) ",%%xmm0 \n" \ |
1958 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ | 1972 "lea " MEMLEA(0x10, [uv_buf]) ",%[uv_buf] \n" \ |
1959 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ | 1973 "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ |
1960 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ | 1974 "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ |
1961 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ | 1975 "vmovdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \ |
1962 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ | 1976 "vpermq $0xd8,%%ymm4,%%ymm4 \n" \ |
1963 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ | 1977 "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ |
1964 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" | 1978 "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" |
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2060 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2074 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
// I444ToARGBRow_AVX2: converts one row to ARGB with inline AVX2 assembly,
// consuming 16 pixels per loop iteration.
//
// Operands:
//   y_buf, u_buf, v_buf - source pointers, passed as read-write register
//                         operands. Before the loop `sub` replaces v_buf with
//                         (v_buf - u_buf), i.e. an offset relative to u_buf.
//                         Pointer advancing is presumably done inside
//                         READYUV444_AVX2 (defined outside this view - TODO
//                         confirm).
//   dst_argb            - destination pointer, read-write register operand;
//                         presumably written and advanced by STOREARGB_AVX2
//                         (defined outside this view - TODO confirm).
//   yuvconstants        - input register operand handed to
//                         YUVTORGB_SETUP_AVX2 and YUVTORGB_AVX2.
//   width               - read-write operand, reduced by 16 (0x10) per
//                         iteration.
//
// Loop shape: the body runs, then `sub $0x10` / `jg 1b` repeats it while the
// remaining width is still greater than zero. The body therefore always runs
// at least once, and a width that is not a multiple of 16 still gets a full
// 16-pixel final iteration.
//
// vpcmpeqb of ymm5 with itself sets ymm5 to all ones before the loop;
// presumably this is the opaque alpha consumed by STOREARGB_AVX2 - confirm.
// vzeroupper is issued after the loop, before returning to the caller.
// Declared clobbers: memory, condition codes, NACL_R14, YUVTORGB_REGS_AVX2
// and xmm0-xmm5.
2061 void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, | 2075 void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf, |
2062 const uint8* u_buf, | 2076 const uint8* u_buf, |
2063 const uint8* v_buf, | 2077 const uint8* v_buf, |
2064 uint8* dst_argb, | 2078 uint8* dst_argb, |
2065 const struct YuvConstants* yuvconstants, | 2079 const struct YuvConstants* yuvconstants, |
2066 int width) { | 2080 int width) { |
2067 asm volatile ( | 2081 asm volatile ( |
2068 YUVTORGB_SETUP_AVX2(yuvconstants) | 2082 YUVTORGB_SETUP_AVX2(yuvconstants) |
2069 "sub %[u_buf],%[v_buf] \n" | 2083 "sub %[u_buf],%[v_buf] \n" |
2070 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2084 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2071 LABELALIGN | 2085 LABELALIGN |
2072 "1: \n" | 2086 "1: \n" |
2073 READYUV444_AVX2 | 2087 READYUV444_AVX2 |
2074 YUVTORGB_AVX2(yuvconstants) | 2088 YUVTORGB_AVX2(yuvconstants) |
2075 STOREARGB_AVX2 | 2089 STOREARGB_AVX2 |
2076 "sub $0x10,%[width] \n" | 2090 "sub $0x10,%[width] \n" |
2077 "jg 1b \n" | 2091 "jg 1b \n" |
2078 "vzeroupper \n" | 2092 "vzeroupper \n" |
2079 : [y_buf]"+r"(y_buf), // %[y_buf] | 2093 : [y_buf]"+r"(y_buf), // %[y_buf] |
2080 [u_buf]"+r"(u_buf), // %[u_buf] | 2094 [u_buf]"+r"(u_buf), // %[u_buf] |
2081 [v_buf]"+r"(v_buf), // %[v_buf] | 2095 [v_buf]"+r"(v_buf), // %[v_buf] |
2082 [dst_argb]"+r"(dst_argb), // %[dst_argb] | 2096 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
2083 [width]"+rm"(width) // %[width] | 2097 [width]"+rm"(width) // %[width] |
2084 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] | 2098 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
2085 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 | 2099 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
2086 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" | 2100 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
2087 ); | 2101 ); |
2088 } | 2102 } |
2089 #endif // HAS_I444TOARGBROW_AVX2 | 2103 #endif // HAS_I444TOARGBROW_AVX2 |
2090 | 2104 |
| 2105 #ifdef HAS_I411TOARGBROW_AVX2 |
| 2106 // 16 pixels |
| 2107 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
// I411ToARGBRow_AVX2: converts one row to ARGB with inline AVX2 assembly,
// consuming 16 pixels per loop iteration. Only compiled when
// HAS_I411TOARGBROW_AVX2 is defined.
//
// Operands:
//   y_buf, u_buf, v_buf - source pointers, passed as read-write register
//                         operands. Before the loop `sub` replaces v_buf with
//                         (v_buf - u_buf), i.e. an offset relative to u_buf.
//                         READYUV411_AVX2 then reads 4 bytes via u_buf, 4
//                         bytes via the u_buf/v_buf pair and 16 bytes via
//                         y_buf per iteration, advancing u_buf by 4 and y_buf
//                         by 16.
//   dst_argb            - destination pointer, read-write register operand;
//                         presumably written and advanced by STOREARGB_AVX2
//                         (defined outside this view - TODO confirm).
//   yuvconstants        - input register operand handed to
//                         YUVTORGB_SETUP_AVX2 and YUVTORGB_AVX2.
//   width               - read-write operand, reduced by 16 (0x10) per
//                         iteration.
//
// Loop shape: the body runs, then `sub $0x10` / `jg 1b` repeats it while the
// remaining width is still greater than zero. The body therefore always runs
// at least once, and a width that is not a multiple of 16 still gets a full
// 16-pixel final iteration.
//
// vpcmpeqb of ymm5 with itself sets ymm5 to all ones before the loop;
// presumably this is the opaque alpha consumed by STOREARGB_AVX2 - confirm.
// vzeroupper is issued after the loop, before returning to the caller.
// Declared clobbers: memory, condition codes, NACL_R14, YUVTORGB_REGS_AVX2
// and xmm0-xmm5.
| 2108 void OMITFP I411ToARGBRow_AVX2(const uint8* y_buf, |
| 2109 const uint8* u_buf, |
| 2110 const uint8* v_buf, |
| 2111 uint8* dst_argb, |
| 2112 const struct YuvConstants* yuvconstants, |
| 2113 int width) { |
| 2114 asm volatile ( |
| 2115 YUVTORGB_SETUP_AVX2(yuvconstants) |
| 2116 "sub %[u_buf],%[v_buf] \n" |
| 2117 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
| 2118 LABELALIGN |
| 2119 "1: \n" |
| 2120 READYUV411_AVX2 |
| 2121 YUVTORGB_AVX2(yuvconstants) |
| 2122 STOREARGB_AVX2 |
| 2123 "sub $0x10,%[width] \n" |
| 2124 "jg 1b \n" |
| 2125 "vzeroupper \n" |
| 2126 : [y_buf]"+r"(y_buf), // %[y_buf] |
| 2127 [u_buf]"+r"(u_buf), // %[u_buf] |
| 2128 [v_buf]"+r"(v_buf), // %[v_buf] |
| 2129 [dst_argb]"+r"(dst_argb), // %[dst_argb] |
| 2130 [width]"+rm"(width) // %[width] |
| 2131 : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] |
| 2132 : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2 |
| 2133 "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" |
| 2134 ); |
| 2135 } |
| 2136 #endif // HAS_I411TOARGBROW_AVX2 |
| 2137 |
2091 #if defined(HAS_I422TOARGBROW_AVX2) | 2138 #if defined(HAS_I422TOARGBROW_AVX2) |
2092 // 16 pixels | 2139 // 16 pixels |
2093 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2140 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2094 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, | 2141 void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, |
2095 const uint8* u_buf, | 2142 const uint8* u_buf, |
2096 const uint8* v_buf, | 2143 const uint8* v_buf, |
2097 uint8* dst_argb, | 2144 uint8* dst_argb, |
2098 const struct YuvConstants* yuvconstants, | 2145 const struct YuvConstants* yuvconstants, |
2099 int width) { | 2146 int width) { |
2100 asm volatile ( | 2147 asm volatile ( |
2101 YUVTORGB_SETUP_AVX2(yuvconstants) | 2148 YUVTORGB_SETUP_AVX2(yuvconstants) |
2102 "sub %[u_buf],%[v_buf] \n" | 2149 "sub %[u_buf],%[v_buf] \n" |
2103 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" | 2150 "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" |
2104 LABELALIGN | 2151 LABELALIGN |
2105 "1: \n" | 2152 "1: \n" |
2106 READYUV422_AVX2 | 2153 READYUV422_AVX2 |
2107 YUVTORGB_AVX2(yuvconstants) | 2154 YUVTORGB_AVX2(yuvconstants) |
2108 STOREARGB_AVX2 | 2155 STOREARGB_AVX2 |
2109 "sub $0x10,%[width] \n" | 2156 "sub $0x10,%[width] \n" |
2110 "jg 1b \n" | 2157 "jg 1b \n" |
2111 "vzeroupper \n" | 2158 "vzeroupper \n" |
2112 : [y_buf]"+r"(y_buf), // %[y_buf] | 2159 : [y_buf]"+r"(y_buf), // %[y_buf] |
2113 [u_buf]"+r"(u_buf), // %[u_buf] | 2160 [u_buf]"+r"(u_buf), // %[u_buf] |
(...skipping 3276 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5390 ); | 5437 ); |
5391 } | 5438 } |
5392 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 5439 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
5393 | 5440 |
5394 #endif // defined(__x86_64__) || defined(__i386__) | 5441 #endif // defined(__x86_64__) || defined(__i386__) |
5395 | 5442 |
5396 #ifdef __cplusplus | 5443 #ifdef __cplusplus |
5397 } // extern "C" | 5444 } // extern "C" |
5398 } // namespace libyuv | 5445 } // namespace libyuv |
5399 #endif | 5446 #endif |
OLD | NEW |