| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 76 xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ | 76 xmm2 = _mm_unpacklo_epi8(xmm2, xmm1); \ |
| 77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm5); \ | 77 xmm0 = _mm_unpacklo_epi8(xmm0, xmm5); \ |
| 78 xmm1 = _mm_loadu_si128(&xmm2); \ | 78 xmm1 = _mm_loadu_si128(&xmm2); \ |
| 79 xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \ | 79 xmm2 = _mm_unpacklo_epi16(xmm2, xmm0); \ |
| 80 xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \ | 80 xmm1 = _mm_unpackhi_epi16(xmm1, xmm0); \ |
| 81 _mm_storeu_si128((__m128i *)dst_argb, xmm2); \ | 81 _mm_storeu_si128((__m128i *)dst_argb, xmm2); \ |
| 82 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ | 82 _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1); \ |
| 83 dst_argb += 32; | 83 dst_argb += 32; |
| 84 | 84 |
| 85 | 85 |
| 86 #if defined(HAS_I422TOARGBMATRIXROW_SSSE3) | 86 #if defined(HAS_I422TOARGBROW_SSSE3) |
| 87 void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 87 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 88 const uint8* u_buf, | 88 const uint8* u_buf, |
| 89 const uint8* v_buf, | 89 const uint8* v_buf, |
| 90 uint8* dst_argb, | 90 uint8* dst_argb, |
| 91 struct YuvConstants* YuvConstants, | 91 struct YuvConstants* yuvconstants, |
| 92 int width) { | 92 int width) { |
| 93 __m128i xmm0, xmm1, xmm2, xmm3; | 93 __m128i xmm0, xmm1, xmm2, xmm3; |
| 94 const __m128i xmm5 = _mm_set1_epi8(-1); | 94 const __m128i xmm5 = _mm_set1_epi8(-1); // every byte 0xff; interleaved into the output by STOREARGB |
| 95 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 95 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; // byte distance from u_buf to v_buf |
| 96 while (width > 0) { | 96 while (width > 0) { |
| 97 READYUV422 | 97 READYUV422 |
| 98 YUVTORGB(YuvConstants) | 98 YUVTORGB(yuvconstants) // pass the renamed parameter, not the struct tag |
| 99 STOREARGB | 99 STOREARGB |
| 100 width -= 8; | 100 width -= 8; // each pass accounts for 8 of width |
| 101 } | 101 } |
| 102 } | 102 } |
| 103 #endif | 103 #endif |
| 104 | 104 |
| 105 #if defined(HAS_I422TOABGRMATRIXROW_SSSE3) | 105 #if defined(HAS_I422TOABGRROW_SSSE3) |
| 106 void I422ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 106 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 107 const uint8* u_buf, | 107 const uint8* u_buf, |
| 108 const uint8* v_buf, | 108 const uint8* v_buf, |
| 109 uint8* dst_argb, | 109 uint8* dst_argb, |
| 110 struct YuvConstants* YuvConstants, | 110 struct YuvConstants* yuvconstants, |
| 111 int width) { | 111 int width) { |
| 112 __m128i xmm0, xmm1, xmm2, xmm3; | 112 __m128i xmm0, xmm1, xmm2, xmm3; |
| 113 const __m128i xmm5 = _mm_set1_epi8(-1); | 113 const __m128i xmm5 = _mm_set1_epi8(-1); // every byte 0xff |
| 114 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 114 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; // byte distance from u_buf to v_buf |
| 115 while (width > 0) { | 115 while (width > 0) { |
| 116 READYUV422 | 116 READYUV422 |
| 117 YUVTORGB(YuvConstants) | 117 YUVTORGB(yuvconstants) // pass the renamed parameter, not the struct tag |
| 118 STOREABGR | 118 STOREABGR |
| 119 width -= 8; | 119 width -= 8; // each pass accounts for 8 of width |
| 120 } | 120 } |
| 121 } | 121 } |
| (...skipping 1834 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1956 __asm vpermq ymm1, ymm1, 0xd8 \ | 1956 __asm vpermq ymm1, ymm1, 0xd8 \ |
| 1957 __asm vpunpcklbw ymm2, ymm0, ymm5 /* BA */ \ | 1957 __asm vpunpcklbw ymm2, ymm0, ymm5 /* BA */ \ |
| 1958 __asm vpermq ymm2, ymm2, 0xd8 \ | 1958 __asm vpermq ymm2, ymm2, 0xd8 \ |
| 1959 __asm vpunpcklwd ymm0, ymm1, ymm2 /* RGBA first 8 pixels */ \ | 1959 __asm vpunpcklwd ymm0, ymm1, ymm2 /* RGBA first 8 pixels */ \ |
| 1960 __asm vpunpckhwd ymm1, ymm1, ymm2 /* RGBA next 8 pixels */ \ | 1960 __asm vpunpckhwd ymm1, ymm1, ymm2 /* RGBA next 8 pixels */ \ |
| 1961 __asm vmovdqu [edx], ymm0 \ | 1961 __asm vmovdqu [edx], ymm0 \ |
| 1962 __asm vmovdqu [edx + 32], ymm1 \ | 1962 __asm vmovdqu [edx + 32], ymm1 \ |
| 1963 __asm lea edx, [edx + 64] \ | 1963 __asm lea edx, [edx + 64] \ |
| 1964 } | 1964 } |
| 1965 | 1965 |
| 1966 #ifdef HAS_I422TOARGBMATRIXROW_AVX2 | 1966 #ifdef HAS_I422TOARGBROW_AVX2 |
| 1967 // 16 pixels | 1967 // 16 pixels |
| 1968 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 1968 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 1969 __declspec(naked) | 1969 __declspec(naked) |
| 1970 void I422ToARGBMatrixRow_AVX2(const uint8* y_buf, | 1970 void I422ToARGBRow_AVX2(const uint8* y_buf, |
| 1971 const uint8* u_buf, | 1971 const uint8* u_buf, |
| 1972 const uint8* v_buf, | 1972 const uint8* v_buf, |
| 1973 uint8* dst_argb, | 1973 uint8* dst_argb, |
| 1974 struct YuvConstants* YuvConstants, | 1974 struct YuvConstants* yuvconstants, |
| 1975 int width) { | 1975 int width) { |
| 1976 __asm { | 1976 __asm { |
| 1977 push esi | 1977 push esi |
| 1978 push edi | 1978 push edi |
| 1979 push ebp | 1979 push ebp |
| 1980 mov eax, [esp + 12 + 4] // Y | 1980 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 1981 mov esi, [esp + 12 + 8] // U | 1981 mov esi, [esp + 12 + 8] // U |
| 1982 mov edi, [esp + 12 + 12] // V | 1982 mov edi, [esp + 12 + 12] // V |
| 1983 mov edx, [esp + 12 + 16] // argb | 1983 mov edx, [esp + 12 + 16] // argb |
| 1984 mov ebp, [esp + 12 + 20] // YuvConstants | 1984 mov ebp, [esp + 12 + 20] // yuvconstants |
| 1985 mov ecx, [esp + 12 + 24] // width | 1985 mov ecx, [esp + 12 + 24] // width |
| 1986 sub edi, esi | 1986 sub edi, esi /* edi = V pointer - U pointer */ |
| 1987 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 1987 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 1988 | 1988 |
| 1989 convertloop: | 1989 convertloop: |
| 1990 READYUV422_AVX2 | 1990 READYUV422_AVX2 |
| 1991 YUVTORGB_AVX2(ebp) | 1991 YUVTORGB_AVX2(ebp) |
| 1992 STOREARGB_AVX2 | 1992 STOREARGB_AVX2 |
| 1993 | 1993 |
| 1994 sub ecx, 16 | 1994 sub ecx, 16 /* 16 pixels handled per pass */ |
| 1995 jg convertloop | 1995 jg convertloop /* repeat while remaining width > 0 */ |
| 1996 | 1996 |
| 1997 pop ebp | 1997 pop ebp |
| 1998 pop edi | 1998 pop edi |
| 1999 pop esi | 1999 pop esi |
| 2000 vzeroupper | 2000 vzeroupper |
| 2001 ret | 2001 ret |
| 2002 } | 2002 } |
| 2003 } | 2003 } |
| 2004 #endif // HAS_I422TOARGBMATRIXROW_AVX2 | 2004 #endif // HAS_I422TOARGBROW_AVX2 |
| 2005 | 2005 |
| 2006 #ifdef HAS_I444TOARGBMATRIXROW_AVX2 | 2006 #ifdef HAS_I444TOARGBROW_AVX2 |
| 2007 // 16 pixels | 2007 // 16 pixels |
| 2008 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2008 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
| 2009 __declspec(naked) | 2009 __declspec(naked) |
| 2010 void I444ToARGBMatrixRow_AVX2(const uint8* y_buf, | 2010 void I444ToARGBRow_AVX2(const uint8* y_buf, |
| 2011 const uint8* u_buf, | 2011 const uint8* u_buf, |
| 2012 const uint8* v_buf, | 2012 const uint8* v_buf, |
| 2013 uint8* dst_argb, | 2013 uint8* dst_argb, |
| 2014 struct YuvConstants* YuvConstants, | 2014 struct YuvConstants* yuvconstants, |
| 2015 int width) { | 2015 int width) { |
| 2016 __asm { | 2016 __asm { |
| 2017 push esi | 2017 push esi |
| 2018 push edi | 2018 push edi |
| 2019 push ebp | 2019 push ebp |
| 2020 mov eax, [esp + 12 + 4] // Y | 2020 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2021 mov esi, [esp + 12 + 8] // U | 2021 mov esi, [esp + 12 + 8] // U |
| 2022 mov edi, [esp + 12 + 12] // V | 2022 mov edi, [esp + 12 + 12] // V |
| 2023 mov edx, [esp + 12 + 16] // argb | 2023 mov edx, [esp + 12 + 16] // argb |
| 2024 mov ebp, [esp + 12 + 20] // YuvConstants | 2024 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2025 mov ecx, [esp + 12 + 24] // width | 2025 mov ecx, [esp + 12 + 24] // width |
| 2026 sub edi, esi | 2026 sub edi, esi /* edi = V pointer - U pointer */ |
| 2027 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2027 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2028 convertloop: | 2028 convertloop: |
| 2029 READYUV444_AVX2 | 2029 READYUV444_AVX2 |
| 2030 YUVTORGB_AVX2(ebp) | 2030 YUVTORGB_AVX2(ebp) |
| 2031 STOREARGB_AVX2 | 2031 STOREARGB_AVX2 |
| 2032 | 2032 |
| 2033 sub ecx, 16 | 2033 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2034 jg convertloop | 2034 jg convertloop /* repeat while remaining width > 0 */ |
| 2035 | 2035 |
| 2036 pop ebp | 2036 pop ebp |
| 2037 pop edi | 2037 pop edi |
| 2038 pop esi | 2038 pop esi |
| 2039 vzeroupper | 2039 vzeroupper |
| 2040 ret | 2040 ret |
| 2041 } | 2041 } |
| 2042 } | 2042 } |
| 2043 #endif // HAS_I444TOARGBMATRIXROW_AVX2 | 2043 #endif // HAS_I444TOARGBROW_AVX2 |
| 2044 | 2044 |
| 2045 #ifdef HAS_I444TOABGRMATRIXROW_AVX2 | 2045 #ifdef HAS_I444TOABGRROW_AVX2 |
| 2046 // 16 pixels | 2046 // 16 pixels |
| 2047 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). | 2047 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). |
| 2048 __declspec(naked) | 2048 __declspec(naked) |
| 2049 void I444ToABGRMatrixRow_AVX2(const uint8* y_buf, | 2049 void I444ToABGRRow_AVX2(const uint8* y_buf, |
| 2050 const uint8* u_buf, | 2050 const uint8* u_buf, |
| 2051 const uint8* v_buf, | 2051 const uint8* v_buf, |
| 2052 uint8* dst_abgr, | 2052 uint8* dst_abgr, |
| 2053 struct YuvConstants* YuvConstants, | 2053 struct YuvConstants* yuvconstants, |
| 2054 int width) { | 2054 int width) { |
| 2055 __asm { | 2055 __asm { |
| 2056 push esi | 2056 push esi |
| 2057 push edi | 2057 push edi |
| 2058 push ebp | 2058 push ebp |
| 2059 mov eax, [esp + 12 + 4] // Y | 2059 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2060 mov esi, [esp + 12 + 8] // U | 2060 mov esi, [esp + 12 + 8] // U |
| 2061 mov edi, [esp + 12 + 12] // V | 2061 mov edi, [esp + 12 + 12] // V |
| 2062 mov edx, [esp + 12 + 16] // abgr | 2062 mov edx, [esp + 12 + 16] // abgr |
| 2063 mov ebp, [esp + 12 + 20] // YuvConstants | 2063 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2064 mov ecx, [esp + 12 + 24] // width | 2064 mov ecx, [esp + 12 + 24] // width |
| 2065 sub edi, esi | 2065 sub edi, esi /* edi = V pointer - U pointer */ |
| 2066 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2066 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2067 convertloop: | 2067 convertloop: |
| 2068 READYUV444_AVX2 | 2068 READYUV444_AVX2 |
| 2069 YUVTORGB_AVX2(ebp) | 2069 YUVTORGB_AVX2(ebp) |
| 2070 STOREABGR_AVX2 | 2070 STOREABGR_AVX2 |
| 2071 | 2071 |
| 2072 sub ecx, 16 | 2072 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2073 jg convertloop | 2073 jg convertloop /* repeat while remaining width > 0 */ |
| 2074 | 2074 |
| 2075 pop ebp | 2075 pop ebp |
| 2076 pop edi | 2076 pop edi |
| 2077 pop esi | 2077 pop esi |
| 2078 vzeroupper | 2078 vzeroupper |
| 2079 ret | 2079 ret |
| 2080 } | 2080 } |
| 2081 } | 2081 } |
| 2082 #endif // HAS_I444TOABGRMATRIXROW_AVX2 | 2082 #endif // HAS_I444TOABGRROW_AVX2 |
| 2083 | 2083 |
| 2084 #ifdef HAS_I411TOARGBROW_AVX2 | 2084 #ifdef HAS_I411TOARGBROW_AVX2 |
| 2085 // 16 pixels | 2085 // 16 pixels |
| 2086 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2086 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2087 __declspec(naked) | 2087 __declspec(naked) |
| 2088 void I411ToARGBRow_AVX2(const uint8* y_buf, | 2088 void I411ToARGBRow_AVX2(const uint8* y_buf, |
| 2089 const uint8* u_buf, | 2089 const uint8* u_buf, |
| 2090 const uint8* v_buf, | 2090 const uint8* v_buf, |
| 2091 uint8* dst_argb, | 2091 uint8* dst_argb, |
| 2092 struct YuvConstants* yuvconstants, |
| 2092 int width) { | 2093 int width) { |
| 2093 __asm { | 2094 __asm { |
| 2094 push esi | 2095 push esi |
| 2095 push edi | 2096 push edi |
| 2096 mov eax, [esp + 8 + 4] // Y | 2097 push ebp |
| 2097 mov esi, [esp + 8 + 8] // U | 2098 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2098 mov edi, [esp + 8 + 12] // V | 2099 mov esi, [esp + 12 + 8] // U |
| 2099 mov edx, [esp + 8 + 16] // argb | 2100 mov edi, [esp + 12 + 12] // V |
| 2100 mov ecx, [esp + 8 + 20] // width | 2101 mov edx, [esp + 12 + 16] // argb |
| 2102 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2103 mov ecx, [esp + 12 + 24] // width |
| 2101 sub edi, esi | 2104 sub edi, esi /* edi = V pointer - U pointer */ |
| 2102 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2105 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2103 | 2106 |
| 2104 convertloop: | 2107 convertloop: |
| 2105 READYUV411_AVX2 | 2108 READYUV411_AVX2 |
| 2106 YUVTORGB_AVX2(kYuvConstants) | 2109 YUVTORGB_AVX2(ebp) |
| 2107 STOREARGB_AVX2 | 2110 STOREARGB_AVX2 |
| 2108 | 2111 |
| 2109 sub ecx, 16 | 2112 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2110 jg convertloop | 2113 jg convertloop /* repeat while remaining width > 0 */ |
| 2111 | 2114 |
| 2115 pop ebp |
| 2112 pop edi | 2116 pop edi |
| 2113 pop esi | 2117 pop esi |
| 2114 vzeroupper | 2118 vzeroupper |
| 2115 ret | 2119 ret |
| 2116 } | 2120 } |
| 2117 } | 2121 } |
| 2118 #endif // HAS_I411TOARGBROW_AVX2 | 2122 #endif // HAS_I411TOARGBROW_AVX2 |
| 2119 | 2123 |
| 2120 #ifdef HAS_NV12TOARGBROW_AVX2 | 2124 #ifdef HAS_NV12TOARGBROW_AVX2 |
| 2121 // 16 pixels. | 2125 // 16 pixels. |
| 2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2126 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2123 __declspec(naked) | 2127 __declspec(naked) |
| 2124 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2128 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2125 const uint8* uv_buf, | 2129 const uint8* uv_buf, |
| 2126 uint8* dst_argb, | 2130 uint8* dst_argb, |
| 2131 struct YuvConstants* yuvconstants, |
| 2127 int width) { | 2132 int width) { |
| 2128 __asm { | 2133 __asm { |
| 2129 push esi | 2134 push esi |
| 2130 mov eax, [esp + 4 + 4] // Y | 2135 push ebp |
| 2131 mov esi, [esp + 4 + 8] // UV | 2136 mov eax, [esp + 8 + 4] // Y (8 = bytes pushed above) |
| 2132 mov edx, [esp + 4 + 12] // argb | 2137 mov esi, [esp + 8 + 8] // UV |
| 2133 mov ecx, [esp + 4 + 16] // width | 2138 mov edx, [esp + 8 + 12] // argb |
| 2139 mov ebp, [esp + 8 + 16] // yuvconstants |
| 2140 mov ecx, [esp + 8 + 20] // width |
| 2134 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2141 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2135 | 2142 |
| 2136 convertloop: | 2143 convertloop: |
| 2137 READNV12_AVX2 | 2144 READNV12_AVX2 |
| 2138 YUVTORGB_AVX2(kYuvConstants) | 2145 YUVTORGB_AVX2(ebp) |
| 2139 STOREARGB_AVX2 | 2146 STOREARGB_AVX2 |
| 2140 | 2147 |
| 2141 sub ecx, 16 | 2148 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2142 jg convertloop | 2149 jg convertloop /* repeat while remaining width > 0 */ |
| 2143 | 2150 |
| 2151 pop ebp |
| 2144 pop esi | 2152 pop esi |
| 2145 vzeroupper | 2153 vzeroupper |
| 2146 ret | 2154 ret |
| 2147 } | 2155 } |
| 2148 } | 2156 } |
| 2149 #endif // HAS_NV12TOARGBROW_AVX2 | 2157 #endif // HAS_NV12TOARGBROW_AVX2 |
| 2150 | 2158 |
| 2151 #ifdef HAS_NV21TOARGBROW_AVX2 | |
| 2152 // 16 pixels. | |
| 2153 // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). | |
| 2154 __declspec(naked) | |
| 2155 void NV21ToARGBRow_AVX2(const uint8* y_buf, | |
| 2156 const uint8* uv_buf, | |
| 2157 uint8* dst_argb, | |
| 2158 int width) { | |
| 2159 __asm { | |
| 2160 push esi | |
| 2161 mov eax, [esp + 4 + 4] // Y | |
| 2162 mov esi, [esp + 4 + 8] // UV | |
| 2163 mov edx, [esp + 4 + 12] // argb | |
| 2164 mov ecx, [esp + 4 + 16] // width | |
| 2165 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | |
| 2166 | |
| 2167 convertloop: | |
| 2168 READNV12_AVX2 | |
| 2169 YUVTORGB_AVX2(kYvuConstants) | |
| 2170 STOREARGB_AVX2 | |
| 2171 | |
| 2172 sub ecx, 16 | |
| 2173 jg convertloop | |
| 2174 | |
| 2175 pop esi | |
| 2176 vzeroupper | |
| 2177 ret | |
| 2178 } | |
| 2179 } | |
| 2180 #endif // HAS_NV21TOARGBROW_AVX2 | |
| 2181 | |
| 2182 #ifdef HAS_I422TOBGRAROW_AVX2 | 2159 #ifdef HAS_I422TOBGRAROW_AVX2 |
| 2183 // 16 pixels | 2160 // 16 pixels |
| 2184 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2161 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 2185 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2162 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 2186 __declspec(naked) | 2163 __declspec(naked) |
| 2187 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2164 void I422ToBGRARow_AVX2(const uint8* y_buf, |
| 2188 const uint8* u_buf, | 2165 const uint8* u_buf, |
| 2189 const uint8* v_buf, | 2166 const uint8* v_buf, |
| 2190 uint8* dst_argb, | 2167 uint8* dst_argb, |
| 2168 struct YuvConstants* yuvconstants, |
| 2191 int width) { | 2169 int width) { |
| 2192 __asm { | 2170 __asm { |
| 2193 push esi | 2171 push esi |
| 2194 push edi | 2172 push edi |
| 2195 mov eax, [esp + 8 + 4] // Y | 2173 push ebp |
| 2196 mov esi, [esp + 8 + 8] // U | 2174 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2197 mov edi, [esp + 8 + 12] // V | 2175 mov esi, [esp + 12 + 8] // U |
| 2198 mov edx, [esp + 8 + 16] // argb | 2176 mov edi, [esp + 12 + 12] // V |
| 2199 mov ecx, [esp + 8 + 20] // width | 2177 mov edx, [esp + 12 + 16] // argb |
| 2178 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2179 mov ecx, [esp + 12 + 24] // width |
| 2200 sub edi, esi | 2180 sub edi, esi /* edi = V pointer - U pointer */ |
| 2201 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2181 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2202 | 2182 |
| 2203 convertloop: | 2183 convertloop: |
| 2204 READYUV422_AVX2 | 2184 READYUV422_AVX2 |
| 2205 YUVTORGB_AVX2(kYuvConstants) | 2185 YUVTORGB_AVX2(ebp) |
| 2206 STOREBGRA_AVX2 | 2186 STOREBGRA_AVX2 |
| 2207 | 2187 |
| 2208 sub ecx, 16 | 2188 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2209 jg convertloop | 2189 jg convertloop /* repeat while remaining width > 0 */ |
| 2210 | 2190 |
| 2191 pop ebp |
| 2211 pop edi | 2192 pop edi |
| 2212 pop esi | 2193 pop esi |
| 2213 vzeroupper | 2194 vzeroupper |
| 2214 ret | 2195 ret |
| 2215 } | 2196 } |
| 2216 } | 2197 } |
| 2217 #endif // HAS_I422TOBGRAROW_AVX2 | 2198 #endif // HAS_I422TOBGRAROW_AVX2 |
| 2218 | 2199 |
| 2219 #ifdef HAS_I422TORGBAROW_AVX2 | 2200 #ifdef HAS_I422TORGBAROW_AVX2 |
| 2220 // 16 pixels | 2201 // 16 pixels |
| 2221 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2202 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 2222 __declspec(naked) | 2203 __declspec(naked) |
| 2223 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2204 void I422ToRGBARow_AVX2(const uint8* y_buf, |
| 2224 const uint8* u_buf, | 2205 const uint8* u_buf, |
| 2225 const uint8* v_buf, | 2206 const uint8* v_buf, |
| 2226 uint8* dst_argb, | 2207 uint8* dst_argb, |
| 2208 struct YuvConstants* yuvconstants, |
| 2227 int width) { | 2209 int width) { |
| 2228 __asm { | 2210 __asm { |
| 2229 push esi | 2211 push esi |
| 2230 push edi | 2212 push edi |
| 2231 mov eax, [esp + 8 + 4] // Y | 2213 push ebp |
| 2232 mov esi, [esp + 8 + 8] // U | 2214 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2233 mov edi, [esp + 8 + 12] // V | 2215 mov esi, [esp + 12 + 8] // U |
| 2234 mov edx, [esp + 8 + 16] // argb | 2216 mov edi, [esp + 12 + 12] // V |
| 2235 mov ecx, [esp + 8 + 20] // width | 2217 mov edx, [esp + 12 + 16] // argb |
| 2218 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2219 mov ecx, [esp + 12 + 24] // width |
| 2236 sub edi, esi | 2220 sub edi, esi /* edi = V pointer - U pointer */ |
| 2237 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2221 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2238 | 2222 |
| 2239 convertloop: | 2223 convertloop: |
| 2240 READYUV422_AVX2 | 2224 READYUV422_AVX2 |
| 2241 YUVTORGB_AVX2(kYuvConstants) | 2225 YUVTORGB_AVX2(ebp) |
| 2242 STORERGBA_AVX2 | 2226 STORERGBA_AVX2 |
| 2243 | 2227 |
| 2244 sub ecx, 16 | 2228 sub ecx, 16 /* 16 pixels handled per pass */ |
| 2245 jg convertloop | 2229 jg convertloop /* repeat while remaining width > 0 */ |
| 2246 | 2230 |
| 2231 pop ebp |
| 2247 pop edi | 2232 pop edi |
| 2248 pop esi | 2233 pop esi |
| 2249 vzeroupper | 2234 vzeroupper |
| 2250 ret | 2235 ret |
| 2251 } | 2236 } |
| 2252 } | 2237 } |
| 2253 #endif // HAS_I422TORGBAROW_AVX2 | 2238 #endif // HAS_I422TORGBAROW_AVX2 |
| 2254 | 2239 |
| 2255 #ifdef HAS_I422TOABGRROW_AVX2 | 2240 #ifdef HAS_I422TOABGRROW_AVX2 |
| 2256 // 16 pixels | 2241 // 16 pixels |
| 2257 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2242 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| 2258 __declspec(naked) | 2243 __declspec(naked) |
| 2259 void I422ToABGRMatrixRow_AVX2(const uint8* y_buf, | 2244 void I422ToABGRRow_AVX2(const uint8* y_buf, |
| 2260 const uint8* u_buf, | 2245 const uint8* u_buf, |
| 2261 const uint8* v_buf, | 2246 const uint8* v_buf, |
| 2262 uint8* dst_argb, | 2247 uint8* dst_argb, |
| 2263 struct YuvConstants* YuvConstants, | 2248 struct YuvConstants* yuvconstants, |
| 2264 int width) { | 2249 int width) { |
| 2265 __asm { | 2250 __asm { |
| 2266 push esi | 2251 push esi |
| 2267 push edi | 2252 push edi |
| 2268 push ebp | 2253 push ebp |
| 2269 mov eax, [esp + 12 + 4] // Y | 2254 mov eax, [esp + 12 + 4] // Y |
| 2270 mov esi, [esp + 12 + 8] // U | 2255 mov esi, [esp + 12 + 8] // U |
| 2271 mov edi, [esp + 12 + 12] // V | 2256 mov edi, [esp + 12 + 12] // V |
| 2272 mov edx, [esp + 12 + 16] // argb | 2257 mov edx, [esp + 12 + 16] // argb |
| 2273 mov ebp, [esp + 12 + 20] // YuvConstants | 2258 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2274 mov ecx, [esp + 12 + 24] // width | 2259 mov ecx, [esp + 12 + 24] // width |
| (...skipping 199 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2474 __asm por xmm3, xmm2 /* BG */ \ | 2459 __asm por xmm3, xmm2 /* BG */ \ |
| 2475 __asm por xmm1, xmm3 /* BGR */ \ | 2460 __asm por xmm1, xmm3 /* BGR */ \ |
| 2476 __asm packssdw xmm0, xmm1 \ | 2461 __asm packssdw xmm0, xmm1 \ |
| 2477 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ | 2462 __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ |
| 2478 __asm lea edx, [edx + 16] \ | 2463 __asm lea edx, [edx + 16] \ |
| 2479 } | 2464 } |
| 2480 | 2465 |
| 2481 // 8 pixels. | 2466 // 8 pixels. |
| 2482 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). | 2467 // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2483 __declspec(naked) | 2468 __declspec(naked) |
| 2484 void I444ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 2469 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
| 2485 const uint8* u_buf, | 2470 const uint8* u_buf, |
| 2486 const uint8* v_buf, | 2471 const uint8* v_buf, |
| 2487 uint8* dst_argb, | 2472 uint8* dst_argb, |
| 2488 struct YuvConstants* YuvConstants, | 2473 struct YuvConstants* yuvconstants, |
| 2489 int width) { | 2474 int width) { |
| 2490 __asm { | 2475 __asm { |
| 2491 push esi | 2476 push esi |
| 2492 push edi | 2477 push edi |
| 2493 push ebp | 2478 push ebp |
| 2494 mov eax, [esp + 12 + 4] // Y | 2479 mov eax, [esp + 12 + 4] // Y |
| 2495 mov esi, [esp + 12 + 8] // U | 2480 mov esi, [esp + 12 + 8] // U |
| 2496 mov edi, [esp + 12 + 12] // V | 2481 mov edi, [esp + 12 + 12] // V |
| 2497 mov edx, [esp + 12 + 16] // argb | 2482 mov edx, [esp + 12 + 16] // argb |
| 2498 mov ebp, [esp + 12 + 20] // YuvConstants | 2483 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2499 mov ecx, [esp + 12 + 24] // width | 2484 mov ecx, [esp + 12 + 24] // width |
| (...skipping 11 matching lines...) Expand all Loading... |
| 2511 pop ebp | 2496 pop ebp |
| 2512 pop edi | 2497 pop edi |
| 2513 pop esi | 2498 pop esi |
| 2514 ret | 2499 ret |
| 2515 } | 2500 } |
| 2516 } | 2501 } |
| 2517 | 2502 |
| 2518 // 8 pixels. | 2503 // 8 pixels. |
| 2519 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). | 2504 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). |
| 2520 __declspec(naked) | 2505 __declspec(naked) |
| 2521 void I444ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 2506 void I444ToABGRRow_SSSE3(const uint8* y_buf, |
| 2522 const uint8* u_buf, | 2507 const uint8* u_buf, |
| 2523 const uint8* v_buf, | 2508 const uint8* v_buf, |
| 2524 uint8* dst_abgr, | 2509 uint8* dst_abgr, |
| 2525 struct YuvConstants* YuvConstants, | 2510 struct YuvConstants* yuvconstants, |
| 2526 int width) { | 2511 int width) { |
| 2527 __asm { | 2512 __asm { |
| 2528 push esi | 2513 push esi |
| 2529 push edi | 2514 push edi |
| 2530 push ebp | 2515 push ebp |
| 2531 mov eax, [esp + 12 + 4] // Y | 2516 mov eax, [esp + 12 + 4] // Y |
| 2532 mov esi, [esp + 12 + 8] // U | 2517 mov esi, [esp + 12 + 8] // U |
| 2533 mov edi, [esp + 12 + 12] // V | 2518 mov edi, [esp + 12 + 12] // V |
| 2534 mov edx, [esp + 12 + 16] // abgr | 2519 mov edx, [esp + 12 + 16] // abgr |
| 2535 mov ebp, [esp + 12 + 20] // YuvConstants | 2520 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2536 mov ecx, [esp + 12 + 24] // width | 2521 mov ecx, [esp + 12 + 24] // width |
| (...skipping 15 matching lines...) Expand all Loading... |
| 2552 } | 2537 } |
| 2553 } | 2538 } |
| 2554 | 2539 |
| 2555 // 8 pixels. | 2540 // 8 pixels. |
| 2556 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2541 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| 2557 __declspec(naked) | 2542 __declspec(naked) |
| 2558 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2543 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 2559 const uint8* u_buf, | 2544 const uint8* u_buf, |
| 2560 const uint8* v_buf, | 2545 const uint8* v_buf, |
| 2561 uint8* dst_rgb24, | 2546 uint8* dst_rgb24, |
| 2547 struct YuvConstants* yuvconstants, |
| 2562 int width) { | 2548 int width) { |
| 2563 __asm { | 2549 __asm { |
| 2564 push esi | 2550 push esi |
| 2565 push edi | 2551 push edi |
| 2566 mov eax, [esp + 8 + 4] // Y | 2552 push ebp |
| 2567 mov esi, [esp + 8 + 8] // U | 2553 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2568 mov edi, [esp + 8 + 12] // V | 2554 mov esi, [esp + 12 + 8] // U |
| 2569 mov edx, [esp + 8 + 16] // rgb24 | 2555 mov edi, [esp + 12 + 12] // V |
| 2570 mov ecx, [esp + 8 + 20] // width | 2556 mov edx, [esp + 12 + 16] // rgb24 |
| 2557 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2558 mov ecx, [esp + 12 + 24] // width |
| 2571 sub edi, esi | 2559 sub edi, esi /* edi = V pointer - U pointer */ |
| 2572 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 | 2560 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 |
| 2573 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 2561 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
| 2574 | 2562 |
| 2575 convertloop: | 2563 convertloop: |
| 2576 READYUV422 | 2564 READYUV422 |
| 2577 YUVTORGB(kYuvConstants) | 2565 YUVTORGB(ebp) |
| 2578 STORERGB24 | 2566 STORERGB24 |
| 2579 | 2567 |
| 2580 sub ecx, 8 | 2568 sub ecx, 8 /* 8 pixels handled per pass */ |
| 2581 jg convertloop | 2569 jg convertloop /* repeat while remaining width > 0 */ |
| 2582 | 2570 |
| 2571 pop ebp |
| 2583 pop edi | 2572 pop edi |
| 2584 pop esi | 2573 pop esi |
| 2585 ret | 2574 ret |
| 2586 } | 2575 } |
| 2587 } | 2576 } |
| 2588 | 2577 |
| 2589 // 8 pixels. | 2578 // 8 pixels. |
| 2590 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2579 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
| 2591 __declspec(naked) | 2580 __declspec(naked) |
| 2592 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2581 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
| 2593 const uint8* u_buf, | 2582 const uint8* u_buf, |
| 2594 const uint8* v_buf, | 2583 const uint8* v_buf, |
| 2595 uint8* dst_raw, | 2584 uint8* dst_raw, |
| 2585 struct YuvConstants* yuvconstants, |
| 2596 int width) { | 2586 int width) { |
| 2597 __asm { | 2587 __asm { |
| 2598 push esi | 2588 push esi |
| 2599 push edi | 2589 push edi |
| 2600 mov eax, [esp + 8 + 4] // Y | 2590 push ebp |
| 2601 mov esi, [esp + 8 + 8] // U | 2591 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2602 mov edi, [esp + 8 + 12] // V | 2592 mov esi, [esp + 12 + 8] // U |
| 2603 mov edx, [esp + 8 + 16] // raw | 2593 mov edi, [esp + 12 + 12] // V |
| 2604 mov ecx, [esp + 8 + 20] // width | 2594 mov edx, [esp + 12 + 16] // raw |
| 2595 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2596 mov ecx, [esp + 12 + 24] // width |
| 2605 sub edi, esi | 2597 sub edi, esi /* edi = V pointer - U pointer */ |
| 2606 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 | 2598 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 |
| 2607 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 2599 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
| 2608 | 2600 |
| 2609 convertloop: | 2601 convertloop: |
| 2610 READYUV422 | 2602 READYUV422 |
| 2611 YUVTORGB(kYuvConstants) | 2603 YUVTORGB(ebp) |
| 2612 STORERAW | 2604 STORERAW |
| 2613 | 2605 |
| 2614 sub ecx, 8 | 2606 sub ecx, 8 /* 8 pixels handled per pass */ |
| 2615 jg convertloop | 2607 jg convertloop /* repeat while remaining width > 0 */ |
| 2616 | 2608 |
| 2609 pop ebp |
| 2617 pop edi | 2610 pop edi |
| 2618 pop esi | 2611 pop esi |
| 2619 ret | 2612 ret |
| 2620 } | 2613 } |
| 2621 } | 2614 } |
| 2622 | 2615 |
| 2623 // 8 pixels | 2616 // 8 pixels |
| 2624 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2617 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| 2625 __declspec(naked) | 2618 __declspec(naked) |
| 2626 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2619 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
| 2627 const uint8* u_buf, | 2620 const uint8* u_buf, |
| 2628 const uint8* v_buf, | 2621 const uint8* v_buf, |
| 2629 uint8* rgb565_buf, | 2622 uint8* rgb565_buf, |
| 2623 struct YuvConstants* yuvconstants, |
| 2630 int width) { | 2624 int width) { |
| 2631 __asm { | 2625 __asm { |
| 2632 push esi | 2626 push esi |
| 2633 push edi | 2627 push edi |
| 2634 mov eax, [esp + 8 + 4] // Y | 2628 push ebp |
| 2635 mov esi, [esp + 8 + 8] // U | 2629 mov eax, [esp + 12 + 4] // Y (12 = bytes pushed above) |
| 2636 mov edi, [esp + 8 + 12] // V | 2630 mov esi, [esp + 12 + 8] // U |
| 2637 mov edx, [esp + 8 + 16] // rgb565 | 2631 mov edi, [esp + 12 + 12] // V |
| 2638 mov ecx, [esp + 8 + 20] // width | 2632 mov edx, [esp + 12 + 16] // rgb565 |
| 2633 mov ebp, [esp + 12 + 20] // yuvconstants |
| 2634 mov ecx, [esp + 12 + 24] // width |
| 2639 sub edi, esi | 2635 sub edi, esi /* edi = V pointer - U pointer */ |
| 2640 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f | 2636 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
| 2641 psrld xmm5, 27 | 2637 psrld xmm5, 27 |
| 2642 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 | 2638 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
| 2643 psrld xmm6, 26 | 2639 psrld xmm6, 26 |
| 2644 pslld xmm6, 5 | 2640 pslld xmm6, 5 |
| 2645 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 | 2641 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
| 2646 pslld xmm7, 11 | 2642 pslld xmm7, 11 |
| 2647 | 2643 |
| 2648 convertloop: | 2644 convertloop: |
| 2649 READYUV422 | 2645 READYUV422 |
| 2650 YUVTORGB(kYuvConstants) | 2646 YUVTORGB(ebp) |
| 2651 STORERGB565 | 2647 STORERGB565 |
| 2652 | 2648 |
| 2653 sub ecx, 8 | 2649 sub ecx, 8 /* 8 pixels handled per pass */ |
| 2654 jg convertloop | 2650 jg convertloop /* repeat while remaining width > 0 */ |
| 2655 | 2651 |
| 2652 pop ebp |
| 2656 pop edi | 2653 pop edi |
| 2657 pop esi | 2654 pop esi |
| 2658 ret | 2655 ret |
| 2659 } | 2656 } |
| 2660 } | 2657 } |
| 2661 | 2658 |
| 2662 // 8 pixels. | 2659 // 8 pixels. |
| 2663 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2660 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2664 __declspec(naked) | 2661 __declspec(naked) |
| 2665 void I422ToARGBMatrixRow_SSSE3(const uint8* y_buf, | 2662 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 2666 const uint8* u_buf, | 2663 const uint8* u_buf, |
| 2667 const uint8* v_buf, | 2664 const uint8* v_buf, |
| 2668 uint8* dst_argb, | 2665 uint8* dst_argb, |
| 2669 struct YuvConstants* YuvConstants, | 2666 struct YuvConstants* yuvconstants, |
| 2670 int width) { | 2667 int width) { |
| 2671 __asm { | 2668 __asm { |
| 2672 push esi | 2669 push esi |
| 2673 push edi | 2670 push edi |
| 2674 push ebp | 2671 push ebp |
| 2675 mov eax, [esp + 12 + 4] // Y | 2672 mov eax, [esp + 12 + 4] // Y |
| 2676 mov esi, [esp + 12 + 8] // U | 2673 mov esi, [esp + 12 + 8] // U |
| 2677 mov edi, [esp + 12 + 12] // V | 2674 mov edi, [esp + 12 + 12] // V |
| 2678 mov edx, [esp + 12 + 16] // argb | 2675 mov edx, [esp + 12 + 16] // argb |
| 2679 mov ebp, [esp + 12 + 20] // YuvConstants | 2676 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2680 mov ecx, [esp + 12 + 24] // width | 2677 mov ecx, [esp + 12 + 24] // width |
| (...skipping 16 matching lines...) Expand all Loading... |
| 2697 } | 2694 } |
| 2698 | 2695 |
| 2699 // 8 pixels. | 2696 // 8 pixels. |
| 2700 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2697 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2701 // Similar to I420 but duplicate UV once more. | 2698 // Similar to I420 but duplicate UV once more. |
| 2702 __declspec(naked) | 2699 __declspec(naked) |
| 2703 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2700 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 2704 const uint8* u_buf, | 2701 const uint8* u_buf, |
| 2705 const uint8* v_buf, | 2702 const uint8* v_buf, |
| 2706 uint8* dst_argb, | 2703 uint8* dst_argb, |
| 2704 struct YuvConstants* yuvconstants, |
| 2707 int width) { | 2705 int width) { |
| 2708 __asm { | 2706 __asm { |
| 2709 push ebx | |
| 2710 push esi | 2707 push esi |
| 2711 push edi | 2708 push edi |
| 2709 push ebp |
| 2712 mov eax, [esp + 12 + 4] // Y | 2710 mov eax, [esp + 12 + 4] // Y |
| 2713 mov esi, [esp + 12 + 8] // U | 2711 mov esi, [esp + 12 + 8] // U |
| 2714 mov edi, [esp + 12 + 12] // V | 2712 mov edi, [esp + 12 + 12] // V |
| 2715 mov edx, [esp + 12 + 16] // argb | 2713 mov edx, [esp + 12 + 16] // argb |
| 2716 mov ecx, [esp + 12 + 20] // width | 2714 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2715 mov ecx, [esp + 12 + 24] // width |
| 2717 sub edi, esi | 2716 sub edi, esi |
| 2718 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2717 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2719 | 2718 |
| 2720 convertloop: | 2719 convertloop: |
| 2721 READYUV411 // modifies EBX | 2720 READYUV411 |
| 2722 YUVTORGB(kYuvConstants) | 2721 YUVTORGB(ebp) |
| 2723 STOREARGB | 2722 STOREARGB |
| 2724 | 2723 |
| 2725 sub ecx, 8 | 2724 sub ecx, 8 |
| 2726 jg convertloop | 2725 jg convertloop |
| 2727 | 2726 |
| 2727 pop ebp |
| 2728 pop edi | 2728 pop edi |
| 2729 pop esi | 2729 pop esi |
| 2730 pop ebx | |
| 2731 ret | 2730 ret |
| 2732 } | 2731 } |
| 2733 } | 2732 } |
| 2734 | 2733 |
| 2735 // 8 pixels. | 2734 // 8 pixels. |
| 2736 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2735 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2737 __declspec(naked) | 2736 __declspec(naked) |
| 2738 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2737 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 2739 const uint8* uv_buf, | 2738 const uint8* uv_buf, |
| 2740 uint8* dst_argb, | 2739 uint8* dst_argb, |
| 2740 struct YuvConstants* yuvconstants, |
| 2741 int width) { | 2741 int width) { |
| 2742 __asm { | 2742 __asm { |
| 2743 push esi | 2743 push esi |
| 2744 mov eax, [esp + 4 + 4] // Y | 2744 push ebp |
| 2745 mov esi, [esp + 4 + 8] // UV | 2745 mov eax, [esp + 8 + 4] // Y |
| 2746 mov edx, [esp + 4 + 12] // argb | 2746 mov esi, [esp + 8 + 8] // UV |
| 2747 mov ecx, [esp + 4 + 16] // width | 2747 mov edx, [esp + 8 + 12] // argb |
| 2748 mov ebp, [esp + 8 + 16] // YuvConstants |
| 2749 mov ecx, [esp + 8 + 20] // width |
| 2748 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2750 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2749 | 2751 |
| 2750 convertloop: | 2752 convertloop: |
| 2751 READNV12 | 2753 READNV12 |
| 2752 YUVTORGB(kYuvConstants) | 2754 YUVTORGB(ebp) |
| 2753 STOREARGB | 2755 STOREARGB |
| 2754 | 2756 |
| 2755 sub ecx, 8 | 2757 sub ecx, 8 |
| 2756 jg convertloop | 2758 jg convertloop |
| 2757 | 2759 |
| 2760 pop ebp |
| 2758 pop esi | 2761 pop esi |
| 2759 ret | 2762 ret |
| 2760 } | 2763 } |
| 2761 } | |
| 2762 | |
| 2763 // 8 pixels. | |
| 2764 // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). | |
| 2765 __declspec(naked) | |
| 2766 void NV21ToARGBRow_SSSE3(const uint8* y_buf, | |
| 2767 const uint8* uv_buf, | |
| 2768 uint8* dst_argb, | |
| 2769 int width) { | |
| 2770 __asm { | |
| 2771 push esi | |
| 2772 mov eax, [esp + 4 + 4] // Y | |
| 2773 mov esi, [esp + 4 + 8] // UV | |
| 2774 mov edx, [esp + 4 + 12] // argb | |
| 2775 mov ecx, [esp + 4 + 16] // width | |
| 2776 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
| 2777 | |
| 2778 convertloop: | |
| 2779 READNV12 | |
| 2780 YUVTORGB(kYvuConstants) | |
| 2781 STOREARGB | |
| 2782 | |
| 2783 sub ecx, 8 | |
| 2784 jg convertloop | |
| 2785 | |
| 2786 pop esi | |
| 2787 ret | |
| 2788 } | |
| 2789 } | 2764 } |
| 2790 | 2765 |
| 2791 __declspec(naked) | 2766 __declspec(naked) |
| 2792 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2767 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
| 2793 const uint8* u_buf, | 2768 const uint8* u_buf, |
| 2794 const uint8* v_buf, | 2769 const uint8* v_buf, |
| 2795 uint8* dst_bgra, | 2770 uint8* dst_bgra, |
| 2771 struct YuvConstants* yuvconstants, |
| 2796 int width) { | 2772 int width) { |
| 2797 __asm { | 2773 __asm { |
| 2798 push esi | 2774 push esi |
| 2799 push edi | 2775 push edi |
| 2800 mov eax, [esp + 8 + 4] // Y | 2776 push ebp |
| 2801 mov esi, [esp + 8 + 8] // U | 2777 mov eax, [esp + 12 + 4] // Y |
| 2802 mov edi, [esp + 8 + 12] // V | 2778 mov esi, [esp + 12 + 8] // U |
| 2803 mov edx, [esp + 8 + 16] // bgra | 2779 mov edi, [esp + 12 + 12] // V |
| 2804 mov ecx, [esp + 8 + 20] // width | 2780 mov edx, [esp + 12 + 16] // bgra |
| 2781 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2782 mov ecx, [esp + 12 + 24] // width |
| 2805 sub edi, esi | 2783 sub edi, esi |
| 2806 | 2784 |
| 2807 convertloop: | 2785 convertloop: |
| 2808 READYUV422 | 2786 READYUV422 |
| 2809 YUVTORGB(kYuvConstants) | 2787 YUVTORGB(ebp) |
| 2810 STOREBGRA | 2788 STOREBGRA |
| 2811 | 2789 |
| 2812 sub ecx, 8 | 2790 sub ecx, 8 |
| 2813 jg convertloop | 2791 jg convertloop |
| 2814 | 2792 |
| 2793 pop ebp |
| 2815 pop edi | 2794 pop edi |
| 2816 pop esi | 2795 pop esi |
| 2817 ret | 2796 ret |
| 2818 } | 2797 } |
| 2819 } | 2798 } |
| 2820 | 2799 |
| 2821 __declspec(naked) | 2800 __declspec(naked) |
| 2822 void I422ToABGRMatrixRow_SSSE3(const uint8* y_buf, | 2801 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 2823 const uint8* u_buf, | 2802 const uint8* u_buf, |
| 2824 const uint8* v_buf, | 2803 const uint8* v_buf, |
| 2825 uint8* dst_abgr, | 2804 uint8* dst_abgr, |
| 2826 struct YuvConstants* YuvConstants, | 2805 struct YuvConstants* yuvconstants, |
| 2827 int width) { | 2806 int width) { |
| 2828 __asm { | 2807 __asm { |
| 2829 push esi | 2808 push esi |
| 2830 push edi | 2809 push edi |
| 2831 push ebp | 2810 push ebp |
| 2832 mov eax, [esp + 12 + 4] // Y | 2811 mov eax, [esp + 12 + 4] // Y |
| 2833 mov esi, [esp + 12 + 8] // U | 2812 mov esi, [esp + 12 + 8] // U |
| 2834 mov edi, [esp + 12 + 12] // V | 2813 mov edi, [esp + 12 + 12] // V |
| 2835 mov edx, [esp + 12 + 16] // abgr | 2814 mov edx, [esp + 12 + 16] // abgr |
| 2836 mov ebp, [esp + 12 + 20] // YuvConstants | 2815 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2837 mov ecx, [esp + 12 + 24] // width | 2816 mov ecx, [esp + 12 + 24] // width |
| (...skipping 13 matching lines...) Expand all Loading... |
| 2851 pop esi | 2830 pop esi |
| 2852 ret | 2831 ret |
| 2853 } | 2832 } |
| 2854 } | 2833 } |
| 2855 | 2834 |
| 2856 __declspec(naked) | 2835 __declspec(naked) |
| 2857 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 2836 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 2858 const uint8* u_buf, | 2837 const uint8* u_buf, |
| 2859 const uint8* v_buf, | 2838 const uint8* v_buf, |
| 2860 uint8* dst_rgba, | 2839 uint8* dst_rgba, |
| 2840 struct YuvConstants* yuvconstants, |
| 2861 int width) { | 2841 int width) { |
| 2862 __asm { | 2842 __asm { |
| 2863 push esi | 2843 push esi |
| 2864 push edi | 2844 push edi |
| 2865 mov eax, [esp + 8 + 4] // Y | 2845 push ebp |
| 2866 mov esi, [esp + 8 + 8] // U | 2846 mov eax, [esp + 12 + 4] // Y |
| 2867 mov edi, [esp + 8 + 12] // V | 2847 mov esi, [esp + 12 + 8] // U |
| 2868 mov edx, [esp + 8 + 16] // rgba | 2848 mov edi, [esp + 12 + 12] // V |
| 2869 mov ecx, [esp + 8 + 20] // width | 2849 mov edx, [esp + 12 + 16] // rgba |
| 2850 mov ebp, [esp + 12 + 20] // YuvConstants |
| 2851 mov ecx, [esp + 12 + 24] // width |
| 2870 sub edi, esi | 2852 sub edi, esi |
| 2871 | 2853 |
| 2872 convertloop: | 2854 convertloop: |
| 2873 READYUV422 | 2855 READYUV422 |
| 2874 YUVTORGB(kYuvConstants) | 2856 YUVTORGB(ebp) |
| 2875 STORERGBA | 2857 STORERGBA |
| 2876 | 2858 |
| 2877 sub ecx, 8 | 2859 sub ecx, 8 |
| 2878 jg convertloop | 2860 jg convertloop |
| 2879 | 2861 |
| 2862 pop ebp |
| 2880 pop edi | 2863 pop edi |
| 2881 pop esi | 2864 pop esi |
| 2882 ret | 2865 ret |
| 2883 } | 2866 } |
| 2884 } | 2867 } |
| 2885 | |
| 2886 #endif // HAS_I422TOARGBROW_SSSE3 | 2868 #endif // HAS_I422TOARGBROW_SSSE3 |
| 2887 | 2869 |
| 2888 #ifdef HAS_I400TOARGBROW_SSE2 | 2870 #ifdef HAS_I400TOARGBROW_SSE2 |
| 2889 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 2871 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
| 2890 __declspec(naked) | 2872 __declspec(naked) |
| 2891 void I400ToARGBRow_SSE2(const uint8* y_buf, | 2873 void I400ToARGBRow_SSE2(const uint8* y_buf, |
| 2892 uint8* rgb_buf, | 2874 uint8* rgb_buf, |
| 2893 int width) { | 2875 int width) { |
| 2894 __asm { | 2876 __asm { |
| 2895 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) | 2877 mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) |
| (...skipping 3382 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6278 } | 6260 } |
| 6279 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6261 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6280 | 6262 |
| 6281 #endif // defined(_M_X64) | 6263 #endif // defined(_M_X64) |
| 6282 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6264 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6283 | 6265 |
| 6284 #ifdef __cplusplus | 6266 #ifdef __cplusplus |
| 6285 } // extern "C" | 6267 } // extern "C" |
| 6286 } // namespace libyuv | 6268 } // namespace libyuv |
| 6287 #endif | 6269 #endif |
| OLD | NEW |