| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
| 3 * | 3 * |
| 4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
| 5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
| 6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
| 7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
| 8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
| 9 */ | 9 */ |
| 10 | 10 |
| (...skipping 1951 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1962 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1962 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1963 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1963 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1964 __asm vpermq ymm4, ymm4, 0xd8 \ | 1964 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1965 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1965 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1966 __asm lea eax, [eax + 16] \ | 1966 __asm lea eax, [eax + 16] \ |
| 1967 __asm vmovdqu xmm5, [ebp] /* A */ \ | 1967 __asm vmovdqu xmm5, [ebp] /* A */ \ |
| 1968 __asm vpermq ymm5, ymm5, 0xd8 \ | 1968 __asm vpermq ymm5, ymm5, 0xd8 \ |
| 1969 __asm lea ebp, [ebp + 16] \ | 1969 __asm lea ebp, [ebp + 16] \ |
| 1970 } | 1970 } |
| 1971 | 1971 |
| 1972 // Read 4 UV from 411, upsample to 16 UV, and read 16 Y. Uses eax as the Y pointer, esi as the U pointer and edi as the V offset from esi; leaves UV in ymm0 and Y in ymm4, and overwrites xmm1. | |
| 1973 #define READYUV411_AVX2 __asm { \ | |
| 1974 __asm vmovd xmm0, dword ptr [esi] /* U: 4 bytes */ \ | |
| 1975 __asm vmovd xmm1, dword ptr [esi + edi] /* V: 4 bytes at esi + edi */ \ | |
| 1976 __asm lea esi, [esi + 4] /* advance U pointer by 4 bytes */ \ | |
| 1977 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV: interleave U and V bytes */ \ | |
| 1978 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample): duplicate each UV pair */ \ | |
| 1979 __asm vpermq ymm0, ymm0, 0xd8 /* reorder qwords to 0,2,1,3 ahead of the per-lane unpack */ \ | |
| 1980 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample): duplicate each dword again */ \ | |
| 1981 __asm vmovdqu xmm4, [eax] /* Y: 16 bytes */ \ | |
| 1982 __asm vpermq ymm4, ymm4, 0xd8 /* reorder qwords to 0,2,1,3 ahead of the per-lane unpack */ \ | |
| 1983 __asm vpunpcklbw ymm4, ymm4, ymm4 /* duplicate each Y byte into a 16-bit word */ \ | |
| 1984 __asm lea eax, [eax + 16] /* advance Y pointer by 16 bytes */ \ | |
| 1985 } | |
| 1986 | |
| 1987 // Read 8 UV from NV12, upsample to 16 UV. | 1972 // Read 8 UV from NV12, upsample to 16 UV. |
| 1988 #define READNV12_AVX2 __asm { \ | 1973 #define READNV12_AVX2 __asm { \ |
| 1989 __asm vmovdqu xmm0, [esi] /* UV */ \ | 1974 __asm vmovdqu xmm0, [esi] /* UV */ \ |
| 1990 __asm lea esi, [esi + 16] \ | 1975 __asm lea esi, [esi + 16] \ |
| 1991 __asm vpermq ymm0, ymm0, 0xd8 \ | 1976 __asm vpermq ymm0, ymm0, 0xd8 \ |
| 1992 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1977 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1993 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1978 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1994 __asm vpermq ymm4, ymm4, 0xd8 \ | 1979 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1995 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1980 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1996 __asm lea eax, [eax + 16] \ | 1981 __asm lea eax, [eax + 16] \ |
| (...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2191 | 2176 |
| 2192 pop ebx | 2177 pop ebx |
| 2193 pop edi | 2178 pop edi |
| 2194 pop esi | 2179 pop esi |
| 2195 vzeroupper | 2180 vzeroupper |
| 2196 ret | 2181 ret |
| 2197 } | 2182 } |
| 2198 } | 2183 } |
| 2199 #endif // HAS_I444TOARGBROW_AVX2 | 2184 #endif // HAS_I444TOARGBROW_AVX2 |
| 2200 | 2185 |
| 2201 #ifdef HAS_I411TOARGBROW_AVX2 | |
| 2202 // 16 pixels per loop iteration. | |
| 2203 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). Naked function: arguments are loaded straight from the stack after esi, edi and ebx are saved. | |
| 2204 __declspec(naked) | |
| 2205 void I411ToARGBRow_AVX2(const uint8* y_buf, | |
| 2206 const uint8* u_buf, | |
| 2207 const uint8* v_buf, | |
| 2208 uint8* dst_argb, | |
| 2209 const struct YuvConstants* yuvconstants, | |
| 2210 int width) { | |
| 2211 __asm { | |
| 2212 push esi | |
| 2213 push edi | |
| 2214 push ebx | |
| 2215 mov eax, [esp + 12 + 4] // Y (the 12 accounts for the three pushes above) | |
| 2216 mov esi, [esp + 12 + 8] // U | |
| 2217 mov edi, [esp + 12 + 12] // V | |
| 2218 mov edx, [esp + 12 + 16] // dst_argb | |
| 2219 mov ebx, [esp + 12 + 20] // yuvconstants | |
| 2220 mov ecx, [esp + 12 + 24] // width | |
| 2221 sub edi, esi /* edi = V - U, so READYUV411_AVX2 addresses V as [esi + edi] */ | |
| 2222 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | |
| 2223 | |
| 2224 convertloop: | |
| 2225 READYUV411_AVX2 /* loads UV into ymm0 and Y into ymm4, advances esi and eax */ | |
| 2226 YUVTORGB_AVX2(ebx) /* NOTE(review): macro defined outside this view */ | |
| 2227 STOREARGB_AVX2 /* NOTE(review): macro defined outside this view */ | |
| 2228 | |
| 2229 sub ecx, 16 /* 16 pixels handled per pass */ | |
| 2230 jg convertloop /* repeat while the remaining count is positive */ | |
| 2231 | |
| 2232 pop ebx | |
| 2233 pop edi | |
| 2234 pop esi | |
| 2235 vzeroupper /* zero the upper halves of the ymm registers before returning */ | |
| 2236 ret | |
| 2237 } | |
| 2238 } | |
| 2239 #endif // HAS_I411TOARGBROW_AVX2 | |
| 2240 | |
| 2241 #ifdef HAS_NV12TOARGBROW_AVX2 | 2186 #ifdef HAS_NV12TOARGBROW_AVX2 |
| 2242 // 16 pixels. | 2187 // 16 pixels. |
| 2243 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
| 2244 __declspec(naked) | 2189 __declspec(naked) |
| 2245 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2190 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
| 2246 const uint8* uv_buf, | 2191 const uint8* uv_buf, |
| 2247 uint8* dst_argb, | 2192 uint8* dst_argb, |
| 2248 const struct YuvConstants* yuvconstants, | 2193 const struct YuvConstants* yuvconstants, |
| 2249 int width) { | 2194 int width) { |
| 2250 __asm { | 2195 __asm { |
| (...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2444 __asm lea esi, [esi + 4] \ | 2389 __asm lea esi, [esi + 4] \ |
| 2445 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2390 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| 2446 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2391 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| 2447 __asm movq xmm4, qword ptr [eax] /* Y */ \ | 2392 __asm movq xmm4, qword ptr [eax] /* Y */ \ |
| 2448 __asm punpcklbw xmm4, xmm4 \ | 2393 __asm punpcklbw xmm4, xmm4 \ |
| 2449 __asm lea eax, [eax + 8] \ | 2394 __asm lea eax, [eax + 8] \ |
| 2450 __asm movq xmm5, qword ptr [ebp] /* A */ \ | 2395 __asm movq xmm5, qword ptr [ebp] /* A */ \ |
| 2451 __asm lea ebp, [ebp + 8] \ | 2396 __asm lea ebp, [ebp + 8] \ |
| 2452 } | 2397 } |
| 2453 | 2398 |
| 2454 // Read 2 UV from 411, upsample to 8 UV, and read 8 Y. Uses ebx as a scratch register, so callers must not keep live data in ebx. | |
| 2455 // Loads go through movzx + movd because drmemory fails with a memory fault if pinsrw is used (libyuv bug: 525). The pinsrw form would be: | |
| 2456 // __asm pinsrw xmm0, [esi], 0 /* U */ | |
| 2457 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ | |
| 2458 #define READYUV411_EBX __asm { \ | |
| 2459 __asm movzx ebx, word ptr [esi] /* U: 2 bytes, zero-extended */ \ | |
| 2460 __asm movd xmm0, ebx /* move U into the low dword of xmm0 */ \ | |
| 2461 __asm movzx ebx, word ptr [esi + edi] /* V: 2 bytes at esi + edi, zero-extended */ \ | |
| 2462 __asm movd xmm1, ebx /* move V into the low dword of xmm1 */ \ | |
| 2463 __asm lea esi, [esi + 2] /* advance U pointer by 2 bytes */ \ | |
| 2464 __asm punpcklbw xmm0, xmm1 /* UV: interleave U and V bytes */ \ | |
| 2465 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): duplicate each UV pair */ \ | |
| 2466 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample): duplicate each dword again */ \ | |
| 2467 __asm movq xmm4, qword ptr [eax] /* Y: 8 bytes */ \ | |
| 2468 __asm punpcklbw xmm4, xmm4 /* duplicate each Y byte into a 16-bit word */ \ | |
| 2469 __asm lea eax, [eax + 8] /* advance Y pointer by 8 bytes */ \ | |
| 2470 } | |
| 2471 | |
| 2472 // Read 4 UV from NV12, upsample to 8 UV, and read 8 Y. Uses esi as the UV pointer and eax as the Y pointer; leaves UV in xmm0 and Y in xmm4. | 2399 // Read 4 UV from NV12, upsample to 8 UV, and read 8 Y. Uses esi as the UV pointer and eax as the Y pointer; leaves UV in xmm0 and Y in xmm4. |
| 2473 #define READNV12 __asm { \ | 2400 #define READNV12 __asm { \ |
| 2474 __asm movq xmm0, qword ptr [esi] /* UV: 8 bytes */ \ | 2401 __asm movq xmm0, qword ptr [esi] /* UV: 8 bytes */ \ |
| 2475 __asm lea esi, [esi + 8] /* advance UV pointer by 8 bytes */ \ | 2402 __asm lea esi, [esi + 8] /* advance UV pointer by 8 bytes */ \ |
| 2476 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): duplicate each UV pair */ \ | 2403 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample): duplicate each UV pair */ \ |
| 2477 __asm movq xmm4, qword ptr [eax] /* Y: 8 bytes */ \ | 2404 __asm movq xmm4, qword ptr [eax] /* Y: 8 bytes */ \ |
| 2478 __asm punpcklbw xmm4, xmm4 /* duplicate each Y byte into a 16-bit word */ \ | 2405 __asm punpcklbw xmm4, xmm4 /* duplicate each Y byte into a 16-bit word */ \ |
| 2479 __asm lea eax, [eax + 8] /* advance Y pointer by 8 bytes */ \ | 2406 __asm lea eax, [eax + 8] /* advance Y pointer by 8 bytes */ \ |
| 2480 } | 2407 } |
| 2481 | 2408 |
| (...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 2813 | 2740 |
| 2814 pop ebp | 2741 pop ebp |
| 2815 pop ebx | 2742 pop ebx |
| 2816 pop edi | 2743 pop edi |
| 2817 pop esi | 2744 pop esi |
| 2818 ret | 2745 ret |
| 2819 } | 2746 } |
| 2820 } | 2747 } |
| 2821 | 2748 |
| 2822 // 8 pixels. | 2749 // 8 pixels. |
| 2823 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | |
| 2824 // Similar to I420 but duplicate UV once more. Naked function: arguments are loaded straight from the stack after esi, edi, ebx and ebp are saved. | |
| 2825 __declspec(naked) | |
| 2826 void I411ToARGBRow_SSSE3(const uint8* y_buf, | |
| 2827 const uint8* u_buf, | |
| 2828 const uint8* v_buf, | |
| 2829 uint8* dst_argb, | |
| 2830 const struct YuvConstants* yuvconstants, | |
| 2831 int width) { | |
| 2832 __asm { | |
| 2833 push esi | |
| 2834 push edi | |
| 2835 push ebx /* saved because READYUV411_EBX uses ebx as scratch */ | |
| 2836 push ebp /* saved because ebp holds yuvconstants below */ | |
| 2837 mov eax, [esp + 16 + 4] // Y (the 16 accounts for the four pushes above) | |
| 2838 mov esi, [esp + 16 + 8] // U | |
| 2839 mov edi, [esp + 16 + 12] // V | |
| 2840 mov edx, [esp + 16 + 16] // dst_argb | |
| 2841 mov ebp, [esp + 16 + 20] // yuvconstants | |
| 2842 mov ecx, [esp + 16 + 24] // width | |
| 2843 sub edi, esi /* edi = V - U, so READYUV411_EBX addresses V as [esi + edi] */ | |
| 2844 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
| 2845 | |
| 2846 convertloop: | |
| 2847 READYUV411_EBX /* loads UV into xmm0 and Y into xmm4, advances esi and eax */ | |
| 2848 YUVTORGB(ebp) /* NOTE(review): macro defined outside this view */ | |
| 2849 STOREARGB /* NOTE(review): macro defined outside this view */ | |
| 2850 | |
| 2851 sub ecx, 8 /* 8 pixels handled per pass */ | |
| 2852 jg convertloop /* repeat while the remaining count is positive */ | |
| 2853 | |
| 2854 pop ebp | |
| 2855 pop ebx | |
| 2856 pop edi | |
| 2857 pop esi | |
| 2858 ret | |
| 2859 } | |
| 2860 } | |
| 2861 | |
| 2862 // 8 pixels. | |
| 2863 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2750 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
| 2864 __declspec(naked) | 2751 __declspec(naked) |
| 2865 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2752 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
| 2866 const uint8* uv_buf, | 2753 const uint8* uv_buf, |
| 2867 uint8* dst_argb, | 2754 uint8* dst_argb, |
| 2868 const struct YuvConstants* yuvconstants, | 2755 const struct YuvConstants* yuvconstants, |
| 2869 int width) { | 2756 int width) { |
| 2870 __asm { | 2757 __asm { |
| 2871 push esi | 2758 push esi |
| 2872 push ebx | 2759 push ebx |
| (...skipping 3454 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 6327 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6214 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
| 6328 | 6215 |
| 6329 #endif // defined(_M_X64) | 6216 #endif // defined(_M_X64) |
| 6330 | 6217 |
| 6331 #ifdef __cplusplus | 6218 #ifdef __cplusplus |
| 6332 } // extern "C" | 6219 } // extern "C" |
| 6333 } // namespace libyuv | 6220 } // namespace libyuv |
| 6334 #endif | 6221 #endif |
| 6335 | 6222 |
| 6336 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
| OLD | NEW |