| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 22 matching lines...) Expand all Loading... |
| 33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ | 33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ |
| 34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ | 34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ |
| 35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
| 36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ | 36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ |
| 37 u_buf += 4; \ | 37 u_buf += 4; \ |
| 38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ | 38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ |
| 39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ | 39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ |
| 40 y_buf += 8; \ | 40 y_buf += 8; \ |
| 41 | 41 |
| 42 // Convert 8 pixels: 8 UV and 8 Y. | 42 // Convert 8 pixels: 8 UV and 8 Y. |
| 43 #define YUVTORGB(YuvConstants) \ | 43 #define YUVTORGB(yuvconstants) \ |
| 44 xmm1 = _mm_loadu_si128(&xmm0); \ | 44 xmm1 = _mm_loadu_si128(&xmm0); \ |
| 45 xmm2 = _mm_loadu_si128(&xmm0); \ | 45 xmm2 = _mm_loadu_si128(&xmm0); \ |
| 46 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); \ | 46 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)yuvconstants->kUVToB); \ |
| 47 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG); \ | 47 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)yuvconstants->kUVToG); \ |
| 48 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR); \ | 48 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)yuvconstants->kUVToR); \ |
| 49 xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ | 49 xmm0 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasB, xmm0); \ |
| 50 xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ | 50 xmm1 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasG, xmm1); \ |
| 51 xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ | 51 xmm2 = _mm_sub_epi16(*(__m128i*)yuvconstants->kUVBiasR, xmm2); \ |
| 52 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \ | 52 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)yuvconstants->kYToRgb); \ |
| 53 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ | 53 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ |
| 54 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ | 54 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ |
| 55 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ | 55 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ |
| 56 xmm0 = _mm_srai_epi16(xmm0, 6); \ | 56 xmm0 = _mm_srai_epi16(xmm0, 6); \ |
| 57 xmm1 = _mm_srai_epi16(xmm1, 6); \ | 57 xmm1 = _mm_srai_epi16(xmm1, 6); \ |
| 58 xmm2 = _mm_srai_epi16(xmm2, 6); \ | 58 xmm2 = _mm_srai_epi16(xmm2, 6); \ |
| 59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ | 59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ |
| 60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ | 60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ |
| 61 xmm2 = _mm_packus_epi16(xmm2, xmm2); | 61 xmm2 = _mm_packus_epi16(xmm2, xmm2); |
| 62 | 62 |
| (...skipping 1960 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2023 __declspec(naked) | 2023 __declspec(naked) |
| 2024 void I422ToARGBRow_AVX2(const uint8* y_buf, | 2024 void I422ToARGBRow_AVX2(const uint8* y_buf, |
| 2025 const uint8* u_buf, | 2025 const uint8* u_buf, |
| 2026 const uint8* v_buf, | 2026 const uint8* v_buf, |
| 2027 uint8* dst_argb, | 2027 uint8* dst_argb, |
| 2028 struct YuvConstants* yuvconstants, | 2028 struct YuvConstants* yuvconstants, |
| 2029 int width) { | 2029 int width) { |
| 2030 __asm { | 2030 __asm { |
| 2031 push esi | 2031 push esi |
| 2032 push edi | 2032 push edi |
| 2033 push ebp | 2033 push ebx |
| 2034 mov eax, [esp + 12 + 4] // Y | 2034 mov eax, [esp + 12 + 4] // Y |
| 2035 mov esi, [esp + 12 + 8] // U | 2035 mov esi, [esp + 12 + 8] // U |
| 2036 mov edi, [esp + 12 + 12] // V | 2036 mov edi, [esp + 12 + 12] // V |
| 2037 mov edx, [esp + 12 + 16] // argb | 2037 mov edx, [esp + 12 + 16] // argb |
| 2038 mov ebp, [esp + 12 + 20] // yuvconstants | 2038 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2039 mov ecx, [esp + 12 + 24] // width | 2039 mov ecx, [esp + 12 + 24] // width |
| 2040 sub edi, esi | 2040 sub edi, esi |
| 2041 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2041 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2042 | 2042 |
| 2043 convertloop: | 2043 convertloop: |
| 2044 READYUV422_AVX2 | 2044 READYUV422_AVX2 |
| 2045 YUVTORGB_AVX2(ebp) | 2045 YUVTORGB_AVX2(ebx) |
| 2046 STOREARGB_AVX2 | 2046 STOREARGB_AVX2 |
| 2047 | 2047 |
| 2048 sub ecx, 16 | 2048 sub ecx, 16 |
| 2049 jg convertloop | 2049 jg convertloop |
| 2050 | 2050 |
| 2051 pop ebp | 2051 pop ebx |
| 2052 pop edi | 2052 pop edi |
| 2053 pop esi | 2053 pop esi |
| 2054 vzeroupper | 2054 vzeroupper |
| 2055 ret | 2055 ret |
| 2056 } | 2056 } |
| 2057 } | 2057 } |
| 2058 #endif // HAS_I422TOARGBROW_AVX2 | 2058 #endif // HAS_I422TOARGBROW_AVX2 |
| 2059 | 2059 |
| 2060 #ifdef HAS_I444TOARGBROW_AVX2 | 2060 #ifdef HAS_I444TOARGBROW_AVX2 |
| 2061 // 16 pixels | 2061 // 16 pixels |
| 2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). | 2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). |
| 2063 __declspec(naked) | 2063 __declspec(naked) |
| 2064 void I444ToARGBRow_AVX2(const uint8* y_buf, | 2064 void I444ToARGBRow_AVX2(const uint8* y_buf, |
| 2065 const uint8* u_buf, | 2065 const uint8* u_buf, |
| 2066 const uint8* v_buf, | 2066 const uint8* v_buf, |
| 2067 uint8* dst_argb, | 2067 uint8* dst_argb, |
| 2068 struct YuvConstants* yuvconstants, | 2068 struct YuvConstants* yuvconstants, |
| 2069 int width) { | 2069 int width) { |
| 2070 __asm { | 2070 __asm { |
| 2071 push esi | 2071 push esi |
| 2072 push edi | 2072 push edi |
| 2073 push ebp | 2073 push ebx |
| 2074 mov eax, [esp + 12 + 4] // Y | 2074 mov eax, [esp + 12 + 4] // Y |
| 2075 mov esi, [esp + 12 + 8] // U | 2075 mov esi, [esp + 12 + 8] // U |
| 2076 mov edi, [esp + 12 + 12] // V | 2076 mov edi, [esp + 12 + 12] // V |
| 2077 mov edx, [esp + 12 + 16] // argb | 2077 mov edx, [esp + 12 + 16] // argb |
| 2078 mov ebp, [esp + 12 + 20] // yuvconstants | 2078 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2079 mov ecx, [esp + 12 + 24] // width | 2079 mov ecx, [esp + 12 + 24] // width |
| 2080 sub edi, esi | 2080 sub edi, esi |
| 2081 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2081 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2082 convertloop: | 2082 convertloop: |
| 2083 READYUV444_AVX2 | 2083 READYUV444_AVX2 |
| 2084 YUVTORGB_AVX2(ebp) | 2084 YUVTORGB_AVX2(ebx) |
| 2085 STOREARGB_AVX2 | 2085 STOREARGB_AVX2 |
| 2086 | 2086 |
| 2087 sub ecx, 16 | 2087 sub ecx, 16 |
| 2088 jg convertloop | 2088 jg convertloop |
| 2089 | 2089 |
| 2090 pop ebp | 2090 pop ebx |
| 2091 pop edi | 2091 pop edi |
| 2092 pop esi | 2092 pop esi |
| 2093 vzeroupper | 2093 vzeroupper |
| 2094 ret | 2094 ret |
| 2095 } | 2095 } |
| 2096 } | 2096 } |
| 2097 #endif // HAS_I444TOARGBROW_AVX2 | 2097 #endif // HAS_I444TOARGBROW_AVX2 |
| 2098 | 2098 |
| 2099 #ifdef HAS_I444TOABGRROW_AVX2 | 2099 #ifdef HAS_I444TOABGRROW_AVX2 |
| 2100 // 16 pixels | 2100 // 16 pixels |
| 2101 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). | 2101 // 16 UV values with 16 Y producing 16 ABGR (64 bytes). |
| 2102 __declspec(naked) | 2102 __declspec(naked) |
| 2103 void I444ToABGRRow_AVX2(const uint8* y_buf, | 2103 void I444ToABGRRow_AVX2(const uint8* y_buf, |
| 2104 const uint8* u_buf, | 2104 const uint8* u_buf, |
| 2105 const uint8* v_buf, | 2105 const uint8* v_buf, |
| 2106 uint8* dst_abgr, | 2106 uint8* dst_abgr, |
| 2107 struct YuvConstants* yuvconstants, | 2107 struct YuvConstants* yuvconstants, |
| 2108 int width) { | 2108 int width) { |
| 2109 __asm { | 2109 __asm { |
| 2110 push esi | 2110 push esi |
| 2111 push edi | 2111 push edi |
| 2112 push ebp | 2112 push ebx |
| 2113 mov eax, [esp + 12 + 4] // Y | 2113 mov eax, [esp + 12 + 4] // Y |
| 2114 mov esi, [esp + 12 + 8] // U | 2114 mov esi, [esp + 12 + 8] // U |
| 2115 mov edi, [esp + 12 + 12] // V | 2115 mov edi, [esp + 12 + 12] // V |
| 2116 mov edx, [esp + 12 + 16] // abgr | 2116 mov edx, [esp + 12 + 16] // abgr |
| 2117 mov ebp, [esp + 12 + 20] // yuvconstants | 2117 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2118 mov ecx, [esp + 12 + 24] // width | 2118 mov ecx, [esp + 12 + 24] // width |
| 2119 sub edi, esi | 2119 sub edi, esi |
| 2120 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2120 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2121 convertloop: | 2121 convertloop: |
| 2122 READYUV444_AVX2 | 2122 READYUV444_AVX2 |
| 2123 YUVTORGB_AVX2(ebp) | 2123 YUVTORGB_AVX2(ebx) |
| 2124 STOREABGR_AVX2 | 2124 STOREABGR_AVX2 |
| 2125 | 2125 |
| 2126 sub ecx, 16 | 2126 sub ecx, 16 |
| 2127 jg convertloop | 2127 jg convertloop |
| 2128 | 2128 |
| 2129 pop ebp | 2129 pop ebx |
| 2130 pop edi | 2130 pop edi |
| 2131 pop esi | 2131 pop esi |
| 2132 vzeroupper | 2132 vzeroupper |
| 2133 ret | 2133 ret |
| 2134 } | 2134 } |
| 2135 } | 2135 } |
| 2136 #endif // HAS_I444TOABGRROW_AVX2 | 2136 #endif // HAS_I444TOABGRROW_AVX2 |
| 2137 | 2137 |
| 2138 #ifdef HAS_I411TOARGBROW_AVX2 | 2138 #ifdef HAS_I411TOARGBROW_AVX2 |
| 2139 // 16 pixels | 2139 // 16 pixels |
| 2140 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2140 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2141 __declspec(naked) | 2141 __declspec(naked) |
| 2142 void I411ToARGBRow_AVX2(const uint8* y_buf, | 2142 void I411ToARGBRow_AVX2(const uint8* y_buf, |
| 2143 const uint8* u_buf, | 2143 const uint8* u_buf, |
| 2144 const uint8* v_buf, | 2144 const uint8* v_buf, |
| 2145 uint8* dst_argb, | 2145 uint8* dst_argb, |
| 2146 struct YuvConstants* yuvconstants, | 2146 struct YuvConstants* yuvconstants, |
| 2147 int width) { | 2147 int width) { |
| 2148 __asm { | 2148 __asm { |
| 2149 push esi | 2149 push esi |
| 2150 push edi | 2150 push edi |
| 2151 push ebp | 2151 push ebx |
| 2152 mov eax, [esp + 12 + 4] // Y | 2152 mov eax, [esp + 12 + 4] // Y |
| 2153 mov esi, [esp + 12 + 8] // U | 2153 mov esi, [esp + 12 + 8] // U |
| 2154 mov edi, [esp + 12 + 12] // V | 2154 mov edi, [esp + 12 + 12] // V |
| 2155 mov edx, [esp + 12 + 16] // argb | 2155 mov edx, [esp + 12 + 16] // argb |
| 2156 mov ebp, [esp + 12 + 20] // yuvconstants | 2156 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2157 mov ecx, [esp + 12 + 24] // width | 2157 mov ecx, [esp + 12 + 24] // width |
| 2158 sub edi, esi | 2158 sub edi, esi |
| 2159 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2159 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2160 | 2160 |
| 2161 convertloop: | 2161 convertloop: |
| 2162 READYUV411_AVX2 | 2162 READYUV411_AVX2 |
| 2163 YUVTORGB_AVX2(ebp) | 2163 YUVTORGB_AVX2(ebx) |
| 2164 STOREARGB_AVX2 | 2164 STOREARGB_AVX2 |
| 2165 | 2165 |
| 2166 sub ecx, 16 | 2166 sub ecx, 16 |
| 2167 jg convertloop | 2167 jg convertloop |
| 2168 | 2168 |
| 2169 pop ebp | 2169 pop ebx |
| 2170 pop edi | 2170 pop edi |
| 2171 pop esi | 2171 pop esi |
| 2172 vzeroupper | 2172 vzeroupper |
| 2173 ret | 2173 ret |
| 2174 } | 2174 } |
| 2175 } | 2175 } |
| 2176 #endif // HAS_I411TOARGBROW_AVX2 | 2176 #endif // HAS_I411TOARGBROW_AVX2 |
| 2177 | 2177 |
| 2178 #ifdef HAS_NV12TOARGBROW_AVX2 | 2178 #ifdef HAS_NV12TOARGBROW_AVX2 |
| 2179 // 16 pixels. | 2179 // 16 pixels. |
| 2180 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2180 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2181 __declspec(naked) | 2181 __declspec(naked) |
| 2182 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2182 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2183 const uint8* uv_buf, | 2183 const uint8* uv_buf, |
| 2184 uint8* dst_argb, | 2184 uint8* dst_argb, |
| 2185 struct YuvConstants* yuvconstants, | 2185 struct YuvConstants* yuvconstants, |
| 2186 int width) { | 2186 int width) { |
| 2187 __asm { | 2187 __asm { |
| 2188 push esi | 2188 push esi |
| 2189 push ebp | 2189 push ebx |
| 2190 mov eax, [esp + 8 + 4] // Y | 2190 mov eax, [esp + 8 + 4] // Y |
| 2191 mov esi, [esp + 8 + 8] // UV | 2191 mov esi, [esp + 8 + 8] // UV |
| 2192 mov edx, [esp + 8 + 12] // argb | 2192 mov edx, [esp + 8 + 12] // argb |
| 2193 mov ebp, [esp + 8 + 16] // yuvconstants | 2193 mov ebx, [esp + 8 + 16] // yuvconstants |
| 2194 mov ecx, [esp + 8 + 20] // width | 2194 mov ecx, [esp + 8 + 20] // width |
| 2195 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2195 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2196 | 2196 |
| 2197 convertloop: | 2197 convertloop: |
| 2198 READNV12_AVX2 | 2198 READNV12_AVX2 |
| 2199 YUVTORGB_AVX2(ebp) | 2199 YUVTORGB_AVX2(ebx) |
| 2200 STOREARGB_AVX2 | 2200 STOREARGB_AVX2 |
| 2201 | 2201 |
| 2202 sub ecx, 16 | 2202 sub ecx, 16 |
| 2203 jg convertloop | 2203 jg convertloop |
| 2204 | 2204 |
| 2205 pop ebp | 2205 pop ebx |
| 2206 pop esi | 2206 pop esi |
| 2207 vzeroupper | 2207 vzeroupper |
| 2208 ret | 2208 ret |
| 2209 } | 2209 } |
| 2210 } | 2210 } |
| 2211 #endif // HAS_NV12TOARGBROW_AVX2 | 2211 #endif // HAS_NV12TOARGBROW_AVX2 |
| 2212 | 2212 |
| 2213 // 16 pixels. | 2213 // 16 pixels. |
| 2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2214 // 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2215 __declspec(naked) | 2215 __declspec(naked) |
| 2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, | 2216 void YUY2ToARGBRow_AVX2(const uint8* src_yuy2, |
| 2217 uint8* dst_argb, | 2217 uint8* dst_argb, |
| 2218 struct YuvConstants* yuvconstants, | 2218 struct YuvConstants* yuvconstants, |
| 2219 int width) { | 2219 int width) { |
| 2220 __asm { | 2220 __asm { |
| 2221 push ebp | 2221 push ebx |
| 2222 mov eax, [esp + 4 + 4] // yuy2 | 2222 mov eax, [esp + 4 + 4] // yuy2 |
| 2223 mov edx, [esp + 4 + 8] // argb | 2223 mov edx, [esp + 4 + 8] // argb |
| 2224 mov ebp, [esp + 4 + 12] // yuvconstants | 2224 mov ebx, [esp + 4 + 12] // yuvconstants |
| 2225 mov ecx, [esp + 4 + 16] // width | 2225 mov ecx, [esp + 4 + 16] // width |
| 2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2226 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2227 | 2227 |
| 2228 convertloop: | 2228 convertloop: |
| 2229 READYUY2_AVX2 | 2229 READYUY2_AVX2 |
| 2230 YUVTORGB_AVX2(ebp) | 2230 YUVTORGB_AVX2(ebx) |
| 2231 STOREARGB_AVX2 | 2231 STOREARGB_AVX2 |
| 2232 | 2232 |
| 2233 sub ecx, 16 | 2233 sub ecx, 16 |
| 2234 jg convertloop | 2234 jg convertloop |
| 2235 | 2235 |
| 2236 pop ebp | 2236 pop ebx |
| 2237 vzeroupper | 2237 vzeroupper |
| 2238 ret | 2238 ret |
| 2239 } | 2239 } |
| 2240 } | 2240 } |
| 2241 | 2241 |
| 2242 // 16 pixels. | 2242 // 16 pixels. |
| 2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). | 2243 // 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes). |
| 2244 __declspec(naked) | 2244 __declspec(naked) |
| 2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy, | 2245 void UYVYToARGBRow_AVX2(const uint8* src_uyvy, |
| 2246 uint8* dst_argb, | 2246 uint8* dst_argb, |
| 2247 struct YuvConstants* yuvconstants, | 2247 struct YuvConstants* yuvconstants, |
| 2248 int width) { | 2248 int width) { |
| 2249 __asm { | 2249 __asm { |
| 2250 push ebp | 2250 push ebx |
| 2251 mov eax, [esp + 4 + 4] // uyvy | 2251 mov eax, [esp + 4 + 4] // uyvy |
| 2252 mov edx, [esp + 4 + 8] // argb | 2252 mov edx, [esp + 4 + 8] // argb |
| 2253 mov ebp, [esp + 4 + 12] // yuvconstants | 2253 mov ebx, [esp + 4 + 12] // yuvconstants |
| 2254 mov ecx, [esp + 4 + 16] // width | 2254 mov ecx, [esp + 4 + 16] // width |
| 2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2255 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2256 | 2256 |
| 2257 convertloop: | 2257 convertloop: |
| 2258 READUYVY_AVX2 | 2258 READUYVY_AVX2 |
| 2259 YUVTORGB_AVX2(ebp) | 2259 YUVTORGB_AVX2(ebx) |
| 2260 STOREARGB_AVX2 | 2260 STOREARGB_AVX2 |
| 2261 | 2261 |
| 2262 sub ecx, 16 | 2262 sub ecx, 16 |
| 2263 jg convertloop | 2263 jg convertloop |
| 2264 | 2264 |
| 2265 pop ebp | 2265 pop ebx |
| 2266 vzeroupper | 2266 vzeroupper |
| 2267 ret | 2267 ret |
| 2268 } | 2268 } |
| 2269 } | 2269 } |
| 2270 | 2270 |
| 2271 | 2271 |
| 2272 #ifdef HAS_I422TOBGRAROW_AVX2 | 2272 #ifdef HAS_I422TOBGRAROW_AVX2 |
| 2273 // 16 pixels | 2273 // 16 pixels |
| 2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). | 2274 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). |
| 2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. | 2275 // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. |
| 2276 __declspec(naked) | 2276 __declspec(naked) |
| 2277 void I422ToBGRARow_AVX2(const uint8* y_buf, | 2277 void I422ToBGRARow_AVX2(const uint8* y_buf, |
| 2278 const uint8* u_buf, | 2278 const uint8* u_buf, |
| 2279 const uint8* v_buf, | 2279 const uint8* v_buf, |
| 2280 uint8* dst_argb, | 2280 uint8* dst_argb, |
| 2281 struct YuvConstants* yuvconstants, | 2281 struct YuvConstants* yuvconstants, |
| 2282 int width) { | 2282 int width) { |
| 2283 __asm { | 2283 __asm { |
| 2284 push esi | 2284 push esi |
| 2285 push edi | 2285 push edi |
| 2286 push ebp | 2286 push ebx |
| 2287 mov eax, [esp + 12 + 4] // Y | 2287 mov eax, [esp + 12 + 4] // Y |
| 2288 mov esi, [esp + 12 + 8] // U | 2288 mov esi, [esp + 12 + 8] // U |
| 2289 mov edi, [esp + 12 + 12] // V | 2289 mov edi, [esp + 12 + 12] // V |
| 2290 mov edx, [esp + 12 + 16] // bgra | 2290 mov edx, [esp + 12 + 16] // bgra |
| 2291 mov ebp, [esp + 12 + 20] // yuvconstants | 2291 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2292 mov ecx, [esp + 12 + 24] // width | 2292 mov ecx, [esp + 12 + 24] // width |
| 2293 sub edi, esi | 2293 sub edi, esi |
| 2294 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2294 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2295 | 2295 |
| 2296 convertloop: | 2296 convertloop: |
| 2297 READYUV422_AVX2 | 2297 READYUV422_AVX2 |
| 2298 YUVTORGB_AVX2(ebp) | 2298 YUVTORGB_AVX2(ebx) |
| 2299 STOREBGRA_AVX2 | 2299 STOREBGRA_AVX2 |
| 2300 | 2300 |
| 2301 sub ecx, 16 | 2301 sub ecx, 16 |
| 2302 jg convertloop | 2302 jg convertloop |
| 2303 | 2303 |
| 2304 pop ebp | 2304 pop ebx |
| 2305 pop edi | 2305 pop edi |
| 2306 pop esi | 2306 pop esi |
| 2307 vzeroupper | 2307 vzeroupper |
| 2308 ret | 2308 ret |
| 2309 } | 2309 } |
| 2310 } | 2310 } |
| 2311 #endif // HAS_I422TOBGRAROW_AVX2 | 2311 #endif // HAS_I422TOBGRAROW_AVX2 |
| 2312 | 2312 |
| 2313 #ifdef HAS_I422TORGBAROW_AVX2 | 2313 #ifdef HAS_I422TORGBAROW_AVX2 |
| 2314 // 16 pixels | 2314 // 16 pixels |
| 2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). | 2315 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). |
| 2316 __declspec(naked) | 2316 __declspec(naked) |
| 2317 void I422ToRGBARow_AVX2(const uint8* y_buf, | 2317 void I422ToRGBARow_AVX2(const uint8* y_buf, |
| 2318 const uint8* u_buf, | 2318 const uint8* u_buf, |
| 2319 const uint8* v_buf, | 2319 const uint8* v_buf, |
| 2320 uint8* dst_argb, | 2320 uint8* dst_argb, |
| 2321 struct YuvConstants* yuvconstants, | 2321 struct YuvConstants* yuvconstants, |
| 2322 int width) { | 2322 int width) { |
| 2323 __asm { | 2323 __asm { |
| 2324 push esi | 2324 push esi |
| 2325 push edi | 2325 push edi |
| 2326 push ebp | 2326 push ebx |
| 2327 mov eax, [esp + 12 + 4] // Y | 2327 mov eax, [esp + 12 + 4] // Y |
| 2328 mov esi, [esp + 12 + 8] // U | 2328 mov esi, [esp + 12 + 8] // U |
| 2329 mov edi, [esp + 12 + 12] // V | 2329 mov edi, [esp + 12 + 12] // V |
| 2330 mov edx, [esp + 12 + 16] // rgba | 2330 mov edx, [esp + 12 + 16] // rgba |
| 2331 mov ebp, [esp + 12 + 20] // yuvconstants | 2331 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2332 mov ecx, [esp + 12 + 24] // width | 2332 mov ecx, [esp + 12 + 24] // width |
| 2333 sub edi, esi | 2333 sub edi, esi |
| 2334 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2334 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2335 | 2335 |
| 2336 convertloop: | 2336 convertloop: |
| 2337 READYUV422_AVX2 | 2337 READYUV422_AVX2 |
| 2338 YUVTORGB_AVX2(ebp) | 2338 YUVTORGB_AVX2(ebx) |
| 2339 STORERGBA_AVX2 | 2339 STORERGBA_AVX2 |
| 2340 | 2340 |
| 2341 sub ecx, 16 | 2341 sub ecx, 16 |
| 2342 jg convertloop | 2342 jg convertloop |
| 2343 | 2343 |
| 2344 pop ebp | 2344 pop ebx |
| 2345 pop edi | 2345 pop edi |
| 2346 pop esi | 2346 pop esi |
| 2347 vzeroupper | 2347 vzeroupper |
| 2348 ret | 2348 ret |
| 2349 } | 2349 } |
| 2350 } | 2350 } |
| 2351 #endif // HAS_I422TORGBAROW_AVX2 | 2351 #endif // HAS_I422TORGBAROW_AVX2 |
| 2352 | 2352 |
| 2353 #ifdef HAS_I422TOABGRROW_AVX2 | 2353 #ifdef HAS_I422TOABGRROW_AVX2 |
| 2354 // 16 pixels | 2354 // 16 pixels |
| 2355 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). | 2355 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). |
| 2356 __declspec(naked) | 2356 __declspec(naked) |
| 2357 void I422ToABGRRow_AVX2(const uint8* y_buf, | 2357 void I422ToABGRRow_AVX2(const uint8* y_buf, |
| 2358 const uint8* u_buf, | 2358 const uint8* u_buf, |
| 2359 const uint8* v_buf, | 2359 const uint8* v_buf, |
| 2360 uint8* dst_argb, | 2360 uint8* dst_argb, |
| 2361 struct YuvConstants* yuvconstants, | 2361 struct YuvConstants* yuvconstants, |
| 2362 int width) { | 2362 int width) { |
| 2363 __asm { | 2363 __asm { |
| 2364 push esi | 2364 push esi |
| 2365 push edi | 2365 push edi |
| 2366 push ebp | 2366 push ebx |
| 2367 mov eax, [esp + 12 + 4] // Y | 2367 mov eax, [esp + 12 + 4] // Y |
| 2368 mov esi, [esp + 12 + 8] // U | 2368 mov esi, [esp + 12 + 8] // U |
| 2369 mov edi, [esp + 12 + 12] // V | 2369 mov edi, [esp + 12 + 12] // V |
| 2370 mov edx, [esp + 12 + 16] // abgr | 2370 mov edx, [esp + 12 + 16] // abgr |
| 2371 mov ebp, [esp + 12 + 20] // yuvconstants | 2371 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2372 mov ecx, [esp + 12 + 24] // width | 2372 mov ecx, [esp + 12 + 24] // width |
| 2373 sub edi, esi | 2373 sub edi, esi |
| 2374 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2374 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
| 2375 | 2375 |
| 2376 convertloop: | 2376 convertloop: |
| 2377 READYUV422_AVX2 | 2377 READYUV422_AVX2 |
| 2378 YUVTORGB_AVX2(ebp) | 2378 YUVTORGB_AVX2(ebx) |
| 2379 STOREABGR_AVX2 | 2379 STOREABGR_AVX2 |
| 2380 | 2380 |
| 2381 sub ecx, 16 | 2381 sub ecx, 16 |
| 2382 jg convertloop | 2382 jg convertloop |
| 2383 | 2383 |
| 2384 pop ebp | 2384 pop ebx |
| 2385 pop edi | 2385 pop edi |
| 2386 pop esi | 2386 pop esi |
| 2387 vzeroupper | 2387 vzeroupper |
| 2388 ret | 2388 ret |
| 2389 } | 2389 } |
| 2390 } | 2390 } |
| 2391 #endif // HAS_I422TOABGRROW_AVX2 | 2391 #endif // HAS_I422TOABGRROW_AVX2 |
| 2392 | 2392 |
| 2393 #if defined(HAS_I422TOARGBROW_SSSE3) | 2393 #if defined(HAS_I422TOARGBROW_SSSE3) |
| 2394 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. | 2394 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
| (...skipping 213 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2608 __declspec(naked) | 2608 __declspec(naked) |
| 2609 void I444ToARGBRow_SSSE3(const uint8* y_buf, | 2609 void I444ToARGBRow_SSSE3(const uint8* y_buf, |
| 2610 const uint8* u_buf, | 2610 const uint8* u_buf, |
| 2611 const uint8* v_buf, | 2611 const uint8* v_buf, |
| 2612 uint8* dst_argb, | 2612 uint8* dst_argb, |
| 2613 struct YuvConstants* yuvconstants, | 2613 struct YuvConstants* yuvconstants, |
| 2614 int width) { | 2614 int width) { |
| 2615 __asm { | 2615 __asm { |
| 2616 push esi | 2616 push esi |
| 2617 push edi | 2617 push edi |
| 2618 push ebp | 2618 push ebx |
| 2619 mov eax, [esp + 12 + 4] // Y | 2619 mov eax, [esp + 12 + 4] // Y |
| 2620 mov esi, [esp + 12 + 8] // U | 2620 mov esi, [esp + 12 + 8] // U |
| 2621 mov edi, [esp + 12 + 12] // V | 2621 mov edi, [esp + 12 + 12] // V |
| 2622 mov edx, [esp + 12 + 16] // argb | 2622 mov edx, [esp + 12 + 16] // argb |
| 2623 mov ebp, [esp + 12 + 20] // yuvconstants | 2623 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2624 mov ecx, [esp + 12 + 24] // width | 2624 mov ecx, [esp + 12 + 24] // width |
| 2625 sub edi, esi | 2625 sub edi, esi |
| 2626 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2626 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2627 | 2627 |
| 2628 convertloop: | 2628 convertloop: |
| 2629 READYUV444 | 2629 READYUV444 |
| 2630 YUVTORGB(ebp) | 2630 YUVTORGB(ebx) |
| 2631 STOREARGB | 2631 STOREARGB |
| 2632 | 2632 |
| 2633 sub ecx, 8 | 2633 sub ecx, 8 |
| 2634 jg convertloop | 2634 jg convertloop |
| 2635 | 2635 |
| 2636 pop ebp | 2636 pop ebx |
| 2637 pop edi | 2637 pop edi |
| 2638 pop esi | 2638 pop esi |
| 2639 ret | 2639 ret |
| 2640 } | 2640 } |
| 2641 } | 2641 } |
| 2642 | 2642 |
| 2643 // 8 pixels. | 2643 // 8 pixels. |
| 2644 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). | 2644 // 8 UV values, mixed with 8 Y producing 8 ABGR (32 bytes). |
| 2645 __declspec(naked) | 2645 __declspec(naked) |
| 2646 void I444ToABGRRow_SSSE3(const uint8* y_buf, | 2646 void I444ToABGRRow_SSSE3(const uint8* y_buf, |
| 2647 const uint8* u_buf, | 2647 const uint8* u_buf, |
| 2648 const uint8* v_buf, | 2648 const uint8* v_buf, |
| 2649 uint8* dst_abgr, | 2649 uint8* dst_abgr, |
| 2650 struct YuvConstants* yuvconstants, | 2650 struct YuvConstants* yuvconstants, |
| 2651 int width) { | 2651 int width) { |
| 2652 __asm { | 2652 __asm { |
| 2653 push esi | 2653 push esi |
| 2654 push edi | 2654 push edi |
| 2655 push ebp | 2655 push ebx |
| 2656 mov eax, [esp + 12 + 4] // Y | 2656 mov eax, [esp + 12 + 4] // Y |
| 2657 mov esi, [esp + 12 + 8] // U | 2657 mov esi, [esp + 12 + 8] // U |
| 2658 mov edi, [esp + 12 + 12] // V | 2658 mov edi, [esp + 12 + 12] // V |
| 2659 mov edx, [esp + 12 + 16] // abgr | 2659 mov edx, [esp + 12 + 16] // abgr |
| 2660 mov ebp, [esp + 12 + 20] // yuvconstants | 2660 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2661 mov ecx, [esp + 12 + 24] // width | 2661 mov ecx, [esp + 12 + 24] // width |
| 2662 sub edi, esi | 2662 sub edi, esi |
| 2663 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2663 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2664 | 2664 |
| 2665 convertloop: | 2665 convertloop: |
| 2666 READYUV444 | 2666 READYUV444 |
| 2667 YUVTORGB(ebp) | 2667 YUVTORGB(ebx) |
| 2668 STOREABGR | 2668 STOREABGR |
| 2669 | 2669 |
| 2670 sub ecx, 8 | 2670 sub ecx, 8 |
| 2671 jg convertloop | 2671 jg convertloop |
| 2672 | 2672 |
| 2673 pop ebp | 2673 pop ebx |
| 2674 pop edi | 2674 pop edi |
| 2675 pop esi | 2675 pop esi |
| 2676 ret | 2676 ret |
| 2677 } | 2677 } |
| 2678 } | 2678 } |
| 2679 | 2679 |
| 2680 // 8 pixels. | 2680 // 8 pixels. |
| 2681 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). | 2681 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). |
| 2682 __declspec(naked) | 2682 __declspec(naked) |
| 2683 void I422ToRGB24Row_SSSE3(const uint8* y_buf, | 2683 void I422ToRGB24Row_SSSE3(const uint8* y_buf, |
| 2684 const uint8* u_buf, | 2684 const uint8* u_buf, |
| 2685 const uint8* v_buf, | 2685 const uint8* v_buf, |
| 2686 uint8* dst_rgb24, | 2686 uint8* dst_rgb24, |
| 2687 struct YuvConstants* yuvconstants, | 2687 struct YuvConstants* yuvconstants, |
| 2688 int width) { | 2688 int width) { |
| 2689 __asm { | 2689 __asm { |
| 2690 push esi | 2690 push esi |
| 2691 push edi | 2691 push edi |
| 2692 push ebp | 2692 push ebx |
| 2693 mov eax, [esp + 12 + 4] // Y | 2693 mov eax, [esp + 12 + 4] // Y |
| 2694 mov esi, [esp + 12 + 8] // U | 2694 mov esi, [esp + 12 + 8] // U |
| 2695 mov edi, [esp + 12 + 12] // V | 2695 mov edi, [esp + 12 + 12] // V |
| 2696 mov edx, [esp + 12 + 16] // rgb24 | 2696 mov edx, [esp + 12 + 16] // rgb24 |
| 2697 mov ebp, [esp + 12 + 20] // yuvconstants | 2697 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2698 mov ecx, [esp + 12 + 24] // width | 2698 mov ecx, [esp + 12 + 24] // width |
| 2699 sub edi, esi | 2699 sub edi, esi |
| 2700 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 | 2700 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 |
| 2701 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 2701 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
| 2702 | 2702 |
| 2703 convertloop: | 2703 convertloop: |
| 2704 READYUV422 | 2704 READYUV422 |
| 2705 YUVTORGB(ebp) | 2705 YUVTORGB(ebx) |
| 2706 STORERGB24 | 2706 STORERGB24 |
| 2707 | 2707 |
| 2708 sub ecx, 8 | 2708 sub ecx, 8 |
| 2709 jg convertloop | 2709 jg convertloop |
| 2710 | 2710 |
| 2711 pop ebp | 2711 pop ebx |
| 2712 pop edi | 2712 pop edi |
| 2713 pop esi | 2713 pop esi |
| 2714 ret | 2714 ret |
| 2715 } | 2715 } |
| 2716 } | 2716 } |
| 2717 | 2717 |
| 2718 // 8 pixels. | 2718 // 8 pixels. |
| 2719 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). | 2719 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). |
| 2720 __declspec(naked) | 2720 __declspec(naked) |
| 2721 void I422ToRAWRow_SSSE3(const uint8* y_buf, | 2721 void I422ToRAWRow_SSSE3(const uint8* y_buf, |
| 2722 const uint8* u_buf, | 2722 const uint8* u_buf, |
| 2723 const uint8* v_buf, | 2723 const uint8* v_buf, |
| 2724 uint8* dst_raw, | 2724 uint8* dst_raw, |
| 2725 struct YuvConstants* yuvconstants, | 2725 struct YuvConstants* yuvconstants, |
| 2726 int width) { | 2726 int width) { |
| 2727 __asm { | 2727 __asm { |
| 2728 push esi | 2728 push esi |
| 2729 push edi | 2729 push edi |
| 2730 push ebp | 2730 push ebx |
| 2731 mov eax, [esp + 12 + 4] // Y | 2731 mov eax, [esp + 12 + 4] // Y |
| 2732 mov esi, [esp + 12 + 8] // U | 2732 mov esi, [esp + 12 + 8] // U |
| 2733 mov edi, [esp + 12 + 12] // V | 2733 mov edi, [esp + 12 + 12] // V |
| 2734 mov edx, [esp + 12 + 16] // raw | 2734 mov edx, [esp + 12 + 16] // raw |
| 2735 mov ebp, [esp + 12 + 20] // yuvconstants | 2735 mov ebx, [esp + 12 + 20] // yuvconstants |
| 2736 mov ecx, [esp + 12 + 24] // width | 2736 mov ecx, [esp + 12 + 24] // width |
| 2737 sub edi, esi | 2737 sub edi, esi |
| 2738 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 | 2738 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 |
| 2739 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 2739 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
| 2740 | 2740 |
| 2741 convertloop: | 2741 convertloop: |
| 2742 READYUV422 | 2742 READYUV422 |
| 2743 YUVTORGB(ebp) | 2743 YUVTORGB(ebx) |
| 2744 STORERAW | 2744 STORERAW |
| 2745 | 2745 |
| 2746 sub ecx, 8 | 2746 sub ecx, 8 |
| 2747 jg convertloop | 2747 jg convertloop |
| 2748 | 2748 |
| 2749 pop ebp | 2749 pop ebx |
| 2750 pop edi | 2750 pop edi |
| 2751 pop esi | 2751 pop esi |
| 2752 ret | 2752 ret |
| 2753 } | 2753 } |
| 2754 } | 2754 } |
| 2755 | 2755 |
| 2756 // Converts 8 pixels per loop pass. | 2756 // Converts 8 pixels per loop pass. |
| 2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). | 2757 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). |
| 2758 __declspec(naked) | 2758 __declspec(naked) |
| 2759 void I422ToRGB565Row_SSSE3(const uint8* y_buf, | 2759 void I422ToRGB565Row_SSSE3(const uint8* y_buf, |
| 2760 const uint8* u_buf, | 2760 const uint8* u_buf, |
| 2761 const uint8* v_buf, | 2761 const uint8* v_buf, |
| 2762 uint8* rgb565_buf, | 2762 uint8* rgb565_buf, |
| 2763 struct YuvConstants* yuvconstants, | 2763 struct YuvConstants* yuvconstants, |
| 2764 int width) { | 2764 int width) { |
| 2765 __asm { | 2765 __asm { |
| 2766 push esi | 2766 push esi |
| 2767 push edi | 2767 push edi |
| 2768 push ebp | 2768 push ebx |
| 2769 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 2769 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 2770 mov esi, [esp + 12 + 8] // u_buf | 2770 mov esi, [esp + 12 + 8] // u_buf |
| 2771 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 2771 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 2772 mov edx, [esp + 12 + 16] // rgb565_buf | 2772 mov edx, [esp + 12 + 16] // rgb565_buf |
| 2773 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 2773 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 2774 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 2774 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 2775 sub edi, esi | 2775 sub edi, esi |
| 2776 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f | 2776 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
| 2777 psrld xmm5, 27 | 2777 psrld xmm5, 27 |
| 2778 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 | 2778 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
| 2779 psrld xmm6, 26 | 2779 psrld xmm6, 26 |
| 2780 pslld xmm6, 5 | 2780 pslld xmm6, 5 |
| 2781 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 | 2781 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
| 2782 pslld xmm7, 11 | 2782 pslld xmm7, 11 |
| 2783 | 2783 |
| 2784 convertloop: | 2784 convertloop: |
| 2785 READYUV422 | 2785 READYUV422 |
| 2786 YUVTORGB(ebp) | 2786 YUVTORGB(ebx) |
| 2787 STORERGB565 | 2787 STORERGB565 |
| 2788 | 2788 |
| 2789 sub ecx, 8 | 2789 sub ecx, 8 |
| 2790 jg convertloop | 2790 jg convertloop |
| 2791 | 2791 |
| 2792 pop ebp | 2792 pop ebx |
| 2793 pop edi | 2793 pop edi |
| 2794 pop esi | 2794 pop esi |
| 2795 ret | 2795 ret |
| 2796 } | 2796 } |
| 2797 } | 2797 } |
| 2798 | 2798 |
| 2799 // Converts 8 pixels per loop pass. | 2799 // Converts 8 pixels per loop pass. |
| 2800 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2800 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2801 __declspec(naked) | 2801 __declspec(naked) |
| 2802 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 2802 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
| 2803 const uint8* u_buf, | 2803 const uint8* u_buf, |
| 2804 const uint8* v_buf, | 2804 const uint8* v_buf, |
| 2805 uint8* dst_argb, | 2805 uint8* dst_argb, |
| 2806 struct YuvConstants* yuvconstants, | 2806 struct YuvConstants* yuvconstants, |
| 2807 int width) { | 2807 int width) { |
| 2808 __asm { | 2808 __asm { |
| 2809 push esi | 2809 push esi |
| 2810 push edi | 2810 push edi |
| 2811 push ebp | 2811 push ebx |
| 2812 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 2812 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 2813 mov esi, [esp + 12 + 8] // u_buf | 2813 mov esi, [esp + 12 + 8] // u_buf |
| 2814 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 2814 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 2815 mov edx, [esp + 12 + 16] // dst_argb | 2815 mov edx, [esp + 12 + 16] // dst_argb |
| 2816 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 2816 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 2817 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 2817 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 2818 sub edi, esi | 2818 sub edi, esi |
| 2819 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2819 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2820 | 2820 |
| 2821 convertloop: | 2821 convertloop: |
| 2822 READYUV422 | 2822 READYUV422 |
| 2823 YUVTORGB(ebp) | 2823 YUVTORGB(ebx) |
| 2824 STOREARGB | 2824 STOREARGB |
| 2825 | 2825 |
| 2826 sub ecx, 8 | 2826 sub ecx, 8 |
| 2827 jg convertloop | 2827 jg convertloop |
| 2828 | 2828 |
| 2829 pop ebp | 2829 pop ebx |
| 2830 pop edi | 2830 pop edi |
| 2831 pop esi | 2831 pop esi |
| 2832 ret | 2832 ret |
| 2833 } | 2833 } |
| 2834 } | 2834 } |
| 2835 | 2835 |
| 2836 // Converts 8 pixels per loop pass. | 2836 // Converts 8 pixels per loop pass. |
| 2837 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2837 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2838 // Similar to I420 but duplicate UV once more. | 2838 // Similar to I420 but duplicate UV once more. |
| 2839 __declspec(naked) | 2839 __declspec(naked) |
| 2840 void I411ToARGBRow_SSSE3(const uint8* y_buf, | 2840 void I411ToARGBRow_SSSE3(const uint8* y_buf, |
| 2841 const uint8* u_buf, | 2841 const uint8* u_buf, |
| 2842 const uint8* v_buf, | 2842 const uint8* v_buf, |
| 2843 uint8* dst_argb, | 2843 uint8* dst_argb, |
| 2844 struct YuvConstants* yuvconstants, | 2844 struct YuvConstants* yuvconstants, |
| 2845 int width) { | 2845 int width) { |
| 2846 __asm { | 2846 __asm { |
| 2847 push esi | 2847 push esi |
| 2848 push edi | 2848 push edi |
| 2849 push ebp | 2849 push ebx |
| 2850 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 2850 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 2851 mov esi, [esp + 12 + 8] // u_buf | 2851 mov esi, [esp + 12 + 8] // u_buf |
| 2852 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 2852 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 2853 mov edx, [esp + 12 + 16] // dst_argb | 2853 mov edx, [esp + 12 + 16] // dst_argb |
| 2854 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 2854 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 2855 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 2855 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 2856 sub edi, esi | 2856 sub edi, esi |
| 2857 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2857 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2858 | 2858 |
| 2859 convertloop: | 2859 convertloop: |
| 2860 READYUV411 | 2860 READYUV411 |
| 2861 YUVTORGB(ebp) | 2861 YUVTORGB(ebx) |
| 2862 STOREARGB | 2862 STOREARGB |
| 2863 | 2863 |
| 2864 sub ecx, 8 | 2864 sub ecx, 8 |
| 2865 jg convertloop | 2865 jg convertloop |
| 2866 | 2866 |
| 2867 pop ebp | 2867 pop ebx |
| 2868 pop edi | 2868 pop edi |
| 2869 pop esi | 2869 pop esi |
| 2870 ret | 2870 ret |
| 2871 } | 2871 } |
| 2872 } | 2872 } |
| 2873 | 2873 |
| 2874 // Converts 8 pixels per loop pass. | 2874 // Converts 8 pixels per loop pass. |
| 2875 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2875 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2876 __declspec(naked) | 2876 __declspec(naked) |
| 2877 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2877 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 2878 const uint8* uv_buf, | 2878 const uint8* uv_buf, |
| 2879 uint8* dst_argb, | 2879 uint8* dst_argb, |
| 2880 struct YuvConstants* yuvconstants, | 2880 struct YuvConstants* yuvconstants, |
| 2881 int width) { | 2881 int width) { |
| 2882 __asm { | 2882 __asm { |
| 2883 push esi | 2883 push esi |
| 2884 push ebp | 2884 push ebx |
| 2885 mov eax, [esp + 8 + 4] // y_buf; 8 = bytes taken by the two pushes above | 2885 mov eax, [esp + 8 + 4] // y_buf; 8 = bytes taken by the two pushes above |
| 2886 mov esi, [esp + 8 + 8] // uv_buf | 2886 mov esi, [esp + 8 + 8] // uv_buf |
| 2887 mov edx, [esp + 8 + 12] // dst_argb | 2887 mov edx, [esp + 8 + 12] // dst_argb |
| 2888 mov ebp, [esp + 8 + 16] // yuvconstants, handed to YUVTORGB | 2888 mov ebx, [esp + 8 + 16] // yuvconstants, handed to YUVTORGB |
| 2889 mov ecx, [esp + 8 + 20] // width in pixels; 8 consumed per pass, body runs at least once | 2889 mov ecx, [esp + 8 + 20] // width in pixels; 8 consumed per pass, body runs at least once |
| 2890 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2890 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2891 | 2891 |
| 2892 convertloop: | 2892 convertloop: |
| 2893 READNV12 | 2893 READNV12 |
| 2894 YUVTORGB(ebp) | 2894 YUVTORGB(ebx) |
| 2895 STOREARGB | 2895 STOREARGB |
| 2896 | 2896 |
| 2897 sub ecx, 8 | 2897 sub ecx, 8 |
| 2898 jg convertloop | 2898 jg convertloop |
| 2899 | 2899 |
| 2900 pop ebp | 2900 pop ebx |
| 2901 pop esi | 2901 pop esi |
| 2902 ret | 2902 ret |
| 2903 } | 2903 } |
| 2904 } | 2904 } |
| 2905 | 2905 |
| 2906 // Converts 8 pixels per loop pass. | 2906 // Converts 8 pixels per loop pass. |
| 2907 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). | 2907 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| 2908 __declspec(naked) | 2908 __declspec(naked) |
| 2909 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, | 2909 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, |
| 2910 uint8* dst_argb, | 2910 uint8* dst_argb, |
| 2911 struct YuvConstants* yuvconstants, | 2911 struct YuvConstants* yuvconstants, |
| 2912 int width) { | 2912 int width) { |
| 2913 __asm { | 2913 __asm { |
| 2914 push ebp | 2914 push ebx |
| 2915 mov eax, [esp + 4 + 4] // src_yuy2; first 4 = bytes taken by the one push above | 2915 mov eax, [esp + 4 + 4] // src_yuy2; first 4 = bytes taken by the one push above |
| 2916 mov edx, [esp + 4 + 8] // dst_argb | 2916 mov edx, [esp + 4 + 8] // dst_argb |
| 2917 mov ebp, [esp + 4 + 12] // yuvconstants, handed to YUVTORGB | 2917 mov ebx, [esp + 4 + 12] // yuvconstants, handed to YUVTORGB |
| 2918 mov ecx, [esp + 4 + 16] // width in pixels; 8 consumed per pass, body runs at least once | 2918 mov ecx, [esp + 4 + 16] // width in pixels; 8 consumed per pass, body runs at least once |
| 2919 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2919 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2920 | 2920 |
| 2921 convertloop: | 2921 convertloop: |
| 2922 READYUY2 | 2922 READYUY2 |
| 2923 YUVTORGB(ebp) | 2923 YUVTORGB(ebx) |
| 2924 STOREARGB | 2924 STOREARGB |
| 2925 | 2925 |
| 2926 sub ecx, 8 | 2926 sub ecx, 8 |
| 2927 jg convertloop | 2927 jg convertloop |
| 2928 | 2928 |
| 2929 pop ebp | 2929 pop ebx |
| 2930 ret | 2930 ret |
| 2931 } | 2931 } |
| 2932 } | 2932 } |
| 2933 | 2933 |
| 2934 // Converts 8 pixels per loop pass. | 2934 // Converts 8 pixels per loop pass. |
| 2935 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). | 2935 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| 2936 __declspec(naked) | 2936 __declspec(naked) |
| 2937 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, | 2937 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, |
| 2938 uint8* dst_argb, | 2938 uint8* dst_argb, |
| 2939 struct YuvConstants* yuvconstants, | 2939 struct YuvConstants* yuvconstants, |
| 2940 int width) { | 2940 int width) { |
| 2941 __asm { | 2941 __asm { |
| 2942 push ebp | 2942 push ebx |
| 2943 mov eax, [esp + 4 + 4] // src_uyvy; first 4 = bytes taken by the one push above | 2943 mov eax, [esp + 4 + 4] // src_uyvy; first 4 = bytes taken by the one push above |
| 2944 mov edx, [esp + 4 + 8] // dst_argb | 2944 mov edx, [esp + 4 + 8] // dst_argb |
| 2945 mov ebp, [esp + 4 + 12] // yuvconstants, handed to YUVTORGB | 2945 mov ebx, [esp + 4 + 12] // yuvconstants, handed to YUVTORGB |
| 2946 mov ecx, [esp + 4 + 16] // width in pixels; 8 consumed per pass, body runs at least once | 2946 mov ecx, [esp + 4 + 16] // width in pixels; 8 consumed per pass, body runs at least once |
| 2947 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2947 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2948 | 2948 |
| 2949 convertloop: | 2949 convertloop: |
| 2950 READUYVY | 2950 READUYVY |
| 2951 YUVTORGB(ebp) | 2951 YUVTORGB(ebx) |
| 2952 STOREARGB | 2952 STOREARGB |
| 2953 | 2953 |
| 2954 sub ecx, 8 | 2954 sub ecx, 8 |
| 2955 jg convertloop | 2955 jg convertloop |
| 2956 | 2956 |
| 2957 pop ebp | 2957 pop ebx |
| 2958 ret | 2958 ret |
| 2959 } | 2959 } |
| 2960 } | 2960 } |
| 2961 | 2961 |
| 2962 __declspec(naked) | 2962 __declspec(naked) |
| 2963 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2963 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
| 2964 const uint8* u_buf, | 2964 const uint8* u_buf, |
| 2965 const uint8* v_buf, | 2965 const uint8* v_buf, |
| 2966 uint8* dst_bgra, | 2966 uint8* dst_bgra, |
| 2967 struct YuvConstants* yuvconstants, | 2967 struct YuvConstants* yuvconstants, |
| 2968 int width) { | 2968 int width) { |
| 2969 __asm { | 2969 __asm { |
| 2970 push esi | 2970 push esi |
| 2971 push edi | 2971 push edi |
| 2972 push ebp | 2972 push ebx |
| 2973 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 2973 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 2974 mov esi, [esp + 12 + 8] // u_buf | 2974 mov esi, [esp + 12 + 8] // u_buf |
| 2975 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 2975 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 2976 mov edx, [esp + 12 + 16] // dst_bgra | 2976 mov edx, [esp + 12 + 16] // dst_bgra |
| 2977 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 2977 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 2978 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 2978 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 2979 sub edi, esi | 2979 sub edi, esi |
| 2980 | 2980 |
| 2981 convertloop: | 2981 convertloop: |
| 2982 READYUV422 | 2982 READYUV422 |
| 2983 YUVTORGB(ebp) | 2983 YUVTORGB(ebx) |
| 2984 STOREBGRA | 2984 STOREBGRA |
| 2985 | 2985 |
| 2986 sub ecx, 8 | 2986 sub ecx, 8 |
| 2987 jg convertloop | 2987 jg convertloop |
| 2988 | 2988 |
| 2989 pop ebp | 2989 pop ebx |
| 2990 pop edi | 2990 pop edi |
| 2991 pop esi | 2991 pop esi |
| 2992 ret | 2992 ret |
| 2993 } | 2993 } |
| 2994 } | 2994 } |
| 2995 | 2995 |
| 2996 __declspec(naked) | 2996 __declspec(naked) |
| 2997 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 2997 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
| 2998 const uint8* u_buf, | 2998 const uint8* u_buf, |
| 2999 const uint8* v_buf, | 2999 const uint8* v_buf, |
| 3000 uint8* dst_abgr, | 3000 uint8* dst_abgr, |
| 3001 struct YuvConstants* yuvconstants, | 3001 struct YuvConstants* yuvconstants, |
| 3002 int width) { | 3002 int width) { |
| 3003 __asm { | 3003 __asm { |
| 3004 push esi | 3004 push esi |
| 3005 push edi | 3005 push edi |
| 3006 push ebp | 3006 push ebx |
| 3007 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 3007 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 3008 mov esi, [esp + 12 + 8] // u_buf | 3008 mov esi, [esp + 12 + 8] // u_buf |
| 3009 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 3009 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 3010 mov edx, [esp + 12 + 16] // dst_abgr | 3010 mov edx, [esp + 12 + 16] // dst_abgr |
| 3011 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 3011 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 3012 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 3012 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 3013 sub edi, esi | 3013 sub edi, esi |
| 3014 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 3014 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 3015 | 3015 |
| 3016 convertloop: | 3016 convertloop: |
| 3017 READYUV422 | 3017 READYUV422 |
| 3018 YUVTORGB(ebp) | 3018 YUVTORGB(ebx) |
| 3019 STOREABGR | 3019 STOREABGR |
| 3020 | 3020 |
| 3021 sub ecx, 8 | 3021 sub ecx, 8 |
| 3022 jg convertloop | 3022 jg convertloop |
| 3023 | 3023 |
| 3024 pop ebp | 3024 pop ebx |
| 3025 pop edi | 3025 pop edi |
| 3026 pop esi | 3026 pop esi |
| 3027 ret | 3027 ret |
| 3028 } | 3028 } |
| 3029 } | 3029 } |
| 3030 | 3030 |
| 3031 __declspec(naked) | 3031 __declspec(naked) |
| 3032 void I422ToRGBARow_SSSE3(const uint8* y_buf, | 3032 void I422ToRGBARow_SSSE3(const uint8* y_buf, |
| 3033 const uint8* u_buf, | 3033 const uint8* u_buf, |
| 3034 const uint8* v_buf, | 3034 const uint8* v_buf, |
| 3035 uint8* dst_rgba, | 3035 uint8* dst_rgba, |
| 3036 struct YuvConstants* yuvconstants, | 3036 struct YuvConstants* yuvconstants, |
| 3037 int width) { | 3037 int width) { |
| 3038 __asm { | 3038 __asm { |
| 3039 push esi | 3039 push esi |
| 3040 push edi | 3040 push edi |
| 3041 push ebp | 3041 push ebx |
| 3042 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above | 3042 mov eax, [esp + 12 + 4] // y_buf; 12 = bytes taken by the three pushes above |
| 3043 mov esi, [esp + 12 + 8] // u_buf | 3043 mov esi, [esp + 12 + 8] // u_buf |
| 3044 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below | 3044 mov edi, [esp + 12 + 12] // v_buf; becomes (v_buf - u_buf) after the sub below |
| 3045 mov edx, [esp + 12 + 16] // dst_rgba | 3045 mov edx, [esp + 12 + 16] // dst_rgba |
| 3046 mov ebp, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB | 3046 mov ebx, [esp + 12 + 20] // yuvconstants, handed to YUVTORGB |
| 3047 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once | 3047 mov ecx, [esp + 12 + 24] // width in pixels; 8 consumed per pass, body runs at least once |
| 3048 sub edi, esi | 3048 sub edi, esi |
| 3049 | 3049 |
| 3050 convertloop: | 3050 convertloop: |
| 3051 READYUV422 | 3051 READYUV422 |
| 3052 YUVTORGB(ebp) | 3052 YUVTORGB(ebx) |
| 3053 STORERGBA | 3053 STORERGBA |
| 3054 | 3054 |
| 3055 sub ecx, 8 | 3055 sub ecx, 8 |
| 3056 jg convertloop | 3056 jg convertloop |
| 3057 | 3057 |
| 3058 pop ebp | 3058 pop ebx |
| 3059 pop edi | 3059 pop edi |
| 3060 pop esi | 3060 pop esi |
| 3061 ret | 3061 ret |
| 3062 } | 3062 } |
| 3063 } | 3063 } |
| 3064 #endif // HAS_I422TOARGBROW_SSSE3 | 3064 #endif // HAS_I422TOARGBROW_SSSE3 |
| 3065 | 3065 |
| 3066 #ifdef HAS_I400TOARGBROW_SSE2 | 3066 #ifdef HAS_I400TOARGBROW_SSE2 |
| 3067 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). | 3067 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). |
| 3068 __declspec(naked) | 3068 __declspec(naked) |
| (...skipping 3386 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6455 } | 6455 } |
| 6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6456 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6457 | 6457 |
| 6458 #endif // defined(_M_X64) | 6458 #endif // defined(_M_X64) |
| 6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6459 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| 6460 | 6460 |
| 6461 #ifdef __cplusplus | 6461 #ifdef __cplusplus |
| 6462 } // extern "C" | 6462 } // extern "C" |
| 6463 } // namespace libyuv | 6463 } // namespace libyuv |
| 6464 #endif | 6464 #endif |
| OLD | NEW |