OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 1951 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1962 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1962 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1963 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1963 __asm vmovdqu xmm4, [eax] /* Y */ \ |
1964 __asm vpermq ymm4, ymm4, 0xd8 \ | 1964 __asm vpermq ymm4, ymm4, 0xd8 \ |
1965 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1965 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
1966 __asm lea eax, [eax + 16] \ | 1966 __asm lea eax, [eax + 16] \ |
1967 __asm vmovdqu xmm5, [ebp] /* A */ \ | 1967 __asm vmovdqu xmm5, [ebp] /* A */ \ |
1968 __asm vpermq ymm5, ymm5, 0xd8 \ | 1968 __asm vpermq ymm5, ymm5, 0xd8 \ |
1969 __asm lea ebp, [ebp + 16] \ | 1969 __asm lea ebp, [ebp + 16] \ |
1970 } | 1970 } |
1971 | 1971 |
1972 // Read 4 UV from 411, upsample to 16 UV. | |
1973 #define READYUV411_AVX2 __asm { \ | |
1974 __asm vmovd xmm0, dword ptr [esi] /* U */ \ | |
1975 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ | |
1976 __asm lea esi, [esi + 4] \ | |
1977 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | |
1978 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | |
1979 __asm vpermq ymm0, ymm0, 0xd8 \ | |
1980 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ | |
1981 __asm vmovdqu xmm4, [eax] /* Y */ \ | |
1982 __asm vpermq ymm4, ymm4, 0xd8 \ | |
1983 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | |
1984 __asm lea eax, [eax + 16] \ | |
1985 } | |
1986 | |
1987 // Read 8 UV from NV12, upsample to 16 UV. | 1972 // Read 8 UV from NV12, upsample to 16 UV. |
1988 #define READNV12_AVX2 __asm { \ | 1973 #define READNV12_AVX2 __asm { \ |
1989 __asm vmovdqu xmm0, [esi] /* UV */ \ | 1974 __asm vmovdqu xmm0, [esi] /* UV */ \ |
1990 __asm lea esi, [esi + 16] \ | 1975 __asm lea esi, [esi + 16] \ |
1991 __asm vpermq ymm0, ymm0, 0xd8 \ | 1976 __asm vpermq ymm0, ymm0, 0xd8 \ |
1992 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1977 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1993 __asm vmovdqu xmm4, [eax] /* Y */ \ | 1978 __asm vmovdqu xmm4, [eax] /* Y */ \ |
1994 __asm vpermq ymm4, ymm4, 0xd8 \ | 1979 __asm vpermq ymm4, ymm4, 0xd8 \ |
1995 __asm vpunpcklbw ymm4, ymm4, ymm4 \ | 1980 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
1996 __asm lea eax, [eax + 16] \ | 1981 __asm lea eax, [eax + 16] \ |
(...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2191 | 2176 |
2192 pop ebx | 2177 pop ebx |
2193 pop edi | 2178 pop edi |
2194 pop esi | 2179 pop esi |
2195 vzeroupper | 2180 vzeroupper |
2196 ret | 2181 ret |
2197 } | 2182 } |
2198 } | 2183 } |
2199 #endif // HAS_I444TOARGBROW_AVX2 | 2184 #endif // HAS_I444TOARGBROW_AVX2 |
2200 | 2185 |
2201 #ifdef HAS_I411TOARGBROW_AVX2 | |
2202 // 16 pixels | |
2203 // 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | |
2204 __declspec(naked) | |
2205 void I411ToARGBRow_AVX2(const uint8* y_buf, | |
2206 const uint8* u_buf, | |
2207 const uint8* v_buf, | |
2208 uint8* dst_argb, | |
2209 const struct YuvConstants* yuvconstants, | |
2210 int width) { | |
2211 __asm { | |
2212 push esi | |
2213 push edi | |
2214 push ebx | |
2215 mov eax, [esp + 12 + 4] // Y | |
2216 mov esi, [esp + 12 + 8] // U | |
2217 mov edi, [esp + 12 + 12] // V | |
2218 mov edx, [esp + 12 + 16] // abgr | |
2219 mov ebx, [esp + 12 + 20] // yuvconstants | |
2220 mov ecx, [esp + 12 + 24] // width | |
2221 sub edi, esi | |
2222 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | |
2223 | |
2224 convertloop: | |
2225 READYUV411_AVX2 | |
2226 YUVTORGB_AVX2(ebx) | |
2227 STOREARGB_AVX2 | |
2228 | |
2229 sub ecx, 16 | |
2230 jg convertloop | |
2231 | |
2232 pop ebx | |
2233 pop edi | |
2234 pop esi | |
2235 vzeroupper | |
2236 ret | |
2237 } | |
2238 } | |
2239 #endif // HAS_I411TOARGBROW_AVX2 | |
2240 | |
2241 #ifdef HAS_NV12TOARGBROW_AVX2 | 2186 #ifdef HAS_NV12TOARGBROW_AVX2 |
2242 // 16 pixels. | 2187 // 16 pixels. |
2243 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). | 2188 // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). |
2244 __declspec(naked) | 2189 __declspec(naked) |
2245 void NV12ToARGBRow_AVX2(const uint8* y_buf, | 2190 void NV12ToARGBRow_AVX2(const uint8* y_buf, |
2246 const uint8* uv_buf, | 2191 const uint8* uv_buf, |
2247 uint8* dst_argb, | 2192 uint8* dst_argb, |
2248 const struct YuvConstants* yuvconstants, | 2193 const struct YuvConstants* yuvconstants, |
2249 int width) { | 2194 int width) { |
2250 __asm { | 2195 __asm { |
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2444 __asm lea esi, [esi + 4] \ | 2389 __asm lea esi, [esi + 4] \ |
2445 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2390 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
2446 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2391 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
2447 __asm movq xmm4, qword ptr [eax] /* Y */ \ | 2392 __asm movq xmm4, qword ptr [eax] /* Y */ \ |
2448 __asm punpcklbw xmm4, xmm4 \ | 2393 __asm punpcklbw xmm4, xmm4 \ |
2449 __asm lea eax, [eax + 8] \ | 2394 __asm lea eax, [eax + 8] \ |
2450 __asm movq xmm5, qword ptr [ebp] /* A */ \ | 2395 __asm movq xmm5, qword ptr [ebp] /* A */ \ |
2451 __asm lea ebp, [ebp + 8] \ | 2396 __asm lea ebp, [ebp + 8] \ |
2452 } | 2397 } |
2453 | 2398 |
2454 // Read 2 UV from 411, upsample to 8 UV. | |
2455 // drmemory fails with memory fault if pinsrw used. libyuv bug: 525 | |
2456 // __asm pinsrw xmm0, [esi], 0 /* U */ | |
2457 // __asm pinsrw xmm1, [esi + edi], 0 /* V */ | |
2458 #define READYUV411_EBX __asm { \ | |
2459 __asm movzx ebx, word ptr [esi] /* U */ \ | |
2460 __asm movd xmm0, ebx \ | |
2461 __asm movzx ebx, word ptr [esi + edi] /* V */ \ | |
2462 __asm movd xmm1, ebx \ | |
2463 __asm lea esi, [esi + 2] \ | |
2464 __asm punpcklbw xmm0, xmm1 /* UV */ \ | |
2465 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | |
2466 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ | |
2467 __asm movq xmm4, qword ptr [eax] \ | |
2468 __asm punpcklbw xmm4, xmm4 \ | |
2469 __asm lea eax, [eax + 8] \ | |
2470 } | |
2471 | |
2472 // Read 4 UV from NV12, upsample to 8 UV. | 2399 // Read 4 UV from NV12, upsample to 8 UV. |
2473 #define READNV12 __asm { \ | 2400 #define READNV12 __asm { \ |
2474 __asm movq xmm0, qword ptr [esi] /* UV */ \ | 2401 __asm movq xmm0, qword ptr [esi] /* UV */ \ |
2475 __asm lea esi, [esi + 8] \ | 2402 __asm lea esi, [esi + 8] \ |
2476 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2403 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
2477 __asm movq xmm4, qword ptr [eax] \ | 2404 __asm movq xmm4, qword ptr [eax] \ |
2478 __asm punpcklbw xmm4, xmm4 \ | 2405 __asm punpcklbw xmm4, xmm4 \ |
2479 __asm lea eax, [eax + 8] \ | 2406 __asm lea eax, [eax + 8] \ |
2480 } | 2407 } |
2481 | 2408 |
(...skipping 331 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2813 | 2740 |
2814 pop ebp | 2741 pop ebp |
2815 pop ebx | 2742 pop ebx |
2816 pop edi | 2743 pop edi |
2817 pop esi | 2744 pop esi |
2818 ret | 2745 ret |
2819 } | 2746 } |
2820 } | 2747 } |
2821 | 2748 |
2822 // 8 pixels. | 2749 // 8 pixels. |
2823 // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | |
2824 // Similar to I420 but duplicate UV once more. | |
2825 __declspec(naked) | |
2826 void I411ToARGBRow_SSSE3(const uint8* y_buf, | |
2827 const uint8* u_buf, | |
2828 const uint8* v_buf, | |
2829 uint8* dst_argb, | |
2830 const struct YuvConstants* yuvconstants, | |
2831 int width) { | |
2832 __asm { | |
2833 push esi | |
2834 push edi | |
2835 push ebx | |
2836 push ebp | |
2837 mov eax, [esp + 16 + 4] // Y | |
2838 mov esi, [esp + 16 + 8] // U | |
2839 mov edi, [esp + 16 + 12] // V | |
2840 mov edx, [esp + 16 + 16] // abgr | |
2841 mov ebp, [esp + 16 + 20] // yuvconstants | |
2842 mov ecx, [esp + 16 + 24] // width | |
2843 sub edi, esi | |
2844 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | |
2845 | |
2846 convertloop: | |
2847 READYUV411_EBX | |
2848 YUVTORGB(ebp) | |
2849 STOREARGB | |
2850 | |
2851 sub ecx, 8 | |
2852 jg convertloop | |
2853 | |
2854 pop ebp | |
2855 pop ebx | |
2856 pop edi | |
2857 pop esi | |
2858 ret | |
2859 } | |
2860 } | |
2861 | |
2862 // 8 pixels. | |
2863 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). | 2750 // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). |
2864 __declspec(naked) | 2751 __declspec(naked) |
2865 void NV12ToARGBRow_SSSE3(const uint8* y_buf, | 2752 void NV12ToARGBRow_SSSE3(const uint8* y_buf, |
2866 const uint8* uv_buf, | 2753 const uint8* uv_buf, |
2867 uint8* dst_argb, | 2754 uint8* dst_argb, |
2868 const struct YuvConstants* yuvconstants, | 2755 const struct YuvConstants* yuvconstants, |
2869 int width) { | 2756 int width) { |
2870 __asm { | 2757 __asm { |
2871 push esi | 2758 push esi |
2872 push ebx | 2759 push ebx |
(...skipping 3454 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6327 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6214 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6328 | 6215 |
6329 #endif // defined(_M_X64) | 6216 #endif // defined(_M_X64) |
6330 | 6217 |
6331 #ifdef __cplusplus | 6218 #ifdef __cplusplus |
6332 } // extern "C" | 6219 } // extern "C" |
6333 } // namespace libyuv | 6220 } // namespace libyuv |
6334 #endif | 6221 #endif |
6335 | 6222 |
6336 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6223 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
OLD | NEW |