Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(708)

Side by Side Diff: source/row_win.cc

Issue 1372653003: avx2 I422AlphaToARGB (Closed) Base URL: https://chromium.googlesource.com/libyuv/libyuv@master
Patch Set: GCC AVX2 use storeABGR Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
(...skipping 1878 matching lines...) Expand 10 before | Expand all | Expand 10 after
1889 __asm lea esi, [esi + 8] \ 1889 __asm lea esi, [esi + 8] \
1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1890 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1891 __asm vpermq ymm0, ymm0, 0xd8 \ 1891 __asm vpermq ymm0, ymm0, 0xd8 \
1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1892 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1893 __asm vmovdqu xmm4, [eax] /* Y */ \ 1893 __asm vmovdqu xmm4, [eax] /* Y */ \
1894 __asm vpermq ymm4, ymm4, 0xd8 \ 1894 __asm vpermq ymm4, ymm4, 0xd8 \
1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \ 1895 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1896 __asm lea eax, [eax + 16] \ 1896 __asm lea eax, [eax + 16] \
1897 } 1897 }
1898 1898
1899 // Read 8 UV from 422, upsample to 16 UV. With 16 Alpha.
1900 #define READYUVA422_AVX2 __asm { \
1901 __asm vmovq xmm0, qword ptr [esi] /* U */ \
1902 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \
1903 __asm lea esi, [esi + 8] \
1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1905 __asm vpermq ymm0, ymm0, 0xd8 \
1906 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1907 __asm vmovdqu xmm4, [eax] /* Y */ \
1908 __asm vpermq ymm4, ymm4, 0xd8 \
1909 __asm vpunpcklbw ymm4, ymm4, ymm4 \
1910 __asm lea eax, [eax + 16] \
1911 __asm vmovdqu xmm5, [ebp] /* A */ \
1912 __asm vpermq ymm5, ymm5, 0xd8 \
1913 __asm lea ebp, [ebp + 16] \
1914 }
1915
1899 // Read 4 UV from 411, upsample to 16 UV. 1916 // Read 4 UV from 411, upsample to 16 UV.
1900 #define READYUV411_AVX2 __asm { \ 1917 #define READYUV411_AVX2 __asm { \
1901 __asm vmovd xmm0, dword ptr [esi] /* U */ \ 1918 __asm vmovd xmm0, dword ptr [esi] /* U */ \
1902 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ 1919 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \
1903 __asm lea esi, [esi + 4] \ 1920 __asm lea esi, [esi + 4] \
1904 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ 1921 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \
1905 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ 1922 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \
1906 __asm vpermq ymm0, ymm0, 0xd8 \ 1923 __asm vpermq ymm0, ymm0, 0xd8 \
1907 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ 1924 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \
1908 __asm vmovdqu xmm4, [eax] /* Y */ \ 1925 __asm vmovdqu xmm4, [eax] /* Y */ \
(...skipping 141 matching lines...) Expand 10 before | Expand all | Expand 10 after
2050 2067
2051 pop ebx 2068 pop ebx
2052 pop edi 2069 pop edi
2053 pop esi 2070 pop esi
2054 vzeroupper 2071 vzeroupper
2055 ret 2072 ret
2056 } 2073 }
2057 } 2074 }
2058 #endif // HAS_I422TOARGBROW_AVX2 2075 #endif // HAS_I422TOARGBROW_AVX2
2059 2076
2077 #ifdef HAS_I422ALPHATOARGBROW_AVX2
2078 // 16 pixels
2079 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
2080 __declspec(naked)
2081 void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2082 const uint8* u_buf,
2083 const uint8* v_buf,
2084 const uint8* a_buf,
2085 uint8* dst_argb,
2086 struct YuvConstants* yuvconstants,
2087 int width) {
2088 __asm {
2089 push esi
2090 push edi
2091 push ebx
2092 push ebp
2093 mov eax, [esp + 16 + 4] // Y
2094 mov esi, [esp + 16 + 8] // U
2095 mov edi, [esp + 16 + 12] // V
2096 mov ebp, [esp + 16 + 16] // A
2097 mov edx, [esp + 16 + 20] // argb
2098 mov ebx, [esp + 16 + 24] // yuvconstants
2099 mov ecx, [esp + 16 + 28] // width
2100 sub edi, esi
2101
2102 convertloop:
2103 READYUVA422_AVX2
2104 YUVTORGB_AVX2(ebx)
2105 STOREARGB_AVX2
2106
2107 sub ecx, 16
2108 jg convertloop
2109
2110 pop ebp
2111 pop ebx
2112 pop edi
2113 pop esi
2114 vzeroupper
2115 ret
2116 }
2117 }
2118 #endif // HAS_I422ALPHATOARGBROW_AVX2
2119
2120 #ifdef HAS_I422ALPHATOABGRROW_AVX2
2121 // 16 pixels
2122 // 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ABGR.
2123 __declspec(naked)
2124 void I422AlphaToABGRRow_AVX2(const uint8* y_buf,
2125 const uint8* u_buf,
2126 const uint8* v_buf,
2127 const uint8* a_buf,
2128 uint8* dst_abgr,
2129 struct YuvConstants* yuvconstants,
2130 int width) {
2131 __asm {
2132 push esi
2133 push edi
2134 push ebx
2135 push ebp
2136 mov eax, [esp + 16 + 4] // Y
2137 mov esi, [esp + 16 + 8] // U
2138 mov edi, [esp + 16 + 12] // V
2139 mov ebp, [esp + 16 + 16] // A
2140 mov edx, [esp + 16 + 20] // abgr
2141 mov ebx, [esp + 16 + 24] // yuvconstants
2142 mov ecx, [esp + 16 + 28] // width
2143 sub edi, esi
2144
2145 convertloop:
2146 READYUVA422_AVX2
2147 YUVTORGB_AVX2(ebx)
2148 STOREABGR_AVX2
2149
2150 sub ecx, 16
2151 jg convertloop
2152
2153 pop ebp
2154 pop ebx
2155 pop edi
2156 pop esi
2157 vzeroupper
2158 ret
2159 }
2160 }
2161 #endif // HAS_I422ALPHATOABGRROW_AVX2
2162
2060 #ifdef HAS_I444TOARGBROW_AVX2 2163 #ifdef HAS_I444TOARGBROW_AVX2
2061 // 16 pixels 2164 // 16 pixels
2062 // 16 UV values with 16 Y producing 16 ARGB (64 bytes). 2165 // 16 UV values with 16 Y producing 16 ARGB (64 bytes).
2063 __declspec(naked) 2166 __declspec(naked)
2064 void I444ToARGBRow_AVX2(const uint8* y_buf, 2167 void I444ToARGBRow_AVX2(const uint8* y_buf,
2065 const uint8* u_buf, 2168 const uint8* u_buf,
2066 const uint8* v_buf, 2169 const uint8* v_buf,
2067 uint8* dst_argb, 2170 uint8* dst_argb,
2068 struct YuvConstants* yuvconstants, 2171 struct YuvConstants* yuvconstants,
2069 int width) { 2172 int width) {
(...skipping 771 matching lines...) Expand 10 before | Expand all | Expand 10 after
2841 jg convertloop 2944 jg convertloop
2842 2945
2843 pop ebx 2946 pop ebx
2844 pop edi 2947 pop edi
2845 pop esi 2948 pop esi
2846 ret 2949 ret
2847 } 2950 }
2848 } 2951 }
2849 2952
2850 // 8 pixels. 2953 // 8 pixels.
2851 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB (32 bytes). 2954 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ARGB.
2852 __declspec(naked) 2955 __declspec(naked)
2853 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, 2956 void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
2854 const uint8* u_buf, 2957 const uint8* u_buf,
2855 const uint8* v_buf, 2958 const uint8* v_buf,
2856 const uint8* a_buf, 2959 const uint8* a_buf,
2857 uint8* dst_argb, 2960 uint8* dst_argb,
2858 struct YuvConstants* yuvconstants, 2961 struct YuvConstants* yuvconstants,
2859 int width) { 2962 int width) {
2860 __asm { 2963 __asm {
2861 push esi 2964 push esi
2862 push edi 2965 push edi
2863 push ebx 2966 push ebx
2864 push ebp 2967 push ebp
2865 mov eax, [esp + 16 + 4] // Y 2968 mov eax, [esp + 16 + 4] // Y
2866 mov esi, [esp + 16 + 8] // U 2969 mov esi, [esp + 16 + 8] // U
2867 mov edi, [esp + 16 + 12] // V 2970 mov edi, [esp + 16 + 12] // V
2868 mov ebp, [esp + 16 + 16] // A 2971 mov ebp, [esp + 16 + 16] // A
2869 mov edx, [esp + 16 + 20] // argb 2972 mov edx, [esp + 16 + 20] // argb
2870 mov ebx, [esp + 16 + 24] // yuvconstants 2973 mov ebx, [esp + 16 + 24] // yuvconstants
2871 mov ecx, [esp + 16 + 28] // width 2974 mov ecx, [esp + 16 + 28] // width
2872 sub edi, esi 2975 sub edi, esi
2873 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2874 2976
2875 convertloop: 2977 convertloop:
2876 READYUVA422 2978 READYUVA422
2877 YUVTORGB(ebx) 2979 YUVTORGB(ebx)
2878 STOREARGB 2980 STOREARGB
2879 2981
2880 sub ecx, 8 2982 sub ecx, 8
2881 jg convertloop 2983 jg convertloop
2882 2984
2883 pop ebp 2985 pop ebp
2884 pop ebx 2986 pop ebx
2885 pop edi 2987 pop edi
2886 pop esi 2988 pop esi
2887 ret 2989 ret
2888 } 2990 }
2889 } 2991 }
2890 2992
2891 // 8 pixels. 2993 // 8 pixels.
2892 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR (32 bytes). 2994 // 4 UV values upsampled to 8 UV, mixed with 8 Y and 8 A producing 8 ABGR.
2893 __declspec(naked) 2995 __declspec(naked)
2894 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf, 2996 void I422AlphaToABGRRow_SSSE3(const uint8* y_buf,
2895 const uint8* u_buf, 2997 const uint8* u_buf,
2896 const uint8* v_buf, 2998 const uint8* v_buf,
2897 const uint8* a_buf, 2999 const uint8* a_buf,
2898 uint8* dst_abgr, 3000 uint8* dst_abgr,
2899 struct YuvConstants* yuvconstants, 3001 struct YuvConstants* yuvconstants,
2900 int width) { 3002 int width) {
2901 __asm { 3003 __asm {
2902 push esi 3004 push esi
2903 push edi 3005 push edi
2904 push ebx 3006 push ebx
2905 push ebp 3007 push ebp
2906 mov eax, [esp + 16 + 4] // Y 3008 mov eax, [esp + 16 + 4] // Y
2907 mov esi, [esp + 16 + 8] // U 3009 mov esi, [esp + 16 + 8] // U
2908 mov edi, [esp + 16 + 12] // V 3010 mov edi, [esp + 16 + 12] // V
2909 mov ebp, [esp + 16 + 16] // A 3011 mov ebp, [esp + 16 + 16] // A
2910 mov edx, [esp + 16 + 20] // abgr 3012 mov edx, [esp + 16 + 20] // abgr
2911 mov ebx, [esp + 16 + 24] // yuvconstants 3013 mov ebx, [esp + 16 + 24] // yuvconstants
2912 mov ecx, [esp + 16 + 28] // width 3014 mov ecx, [esp + 16 + 28] // width
2913 sub edi, esi 3015 sub edi, esi
2914 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha
2915 3016
2916 convertloop: 3017 convertloop:
2917 READYUVA422 3018 READYUVA422
2918 YUVTORGB(ebx) 3019 YUVTORGB(ebx)
2919 STOREABGR 3020 STOREABGR
2920 3021
2921 sub ecx, 8 3022 sub ecx, 8
2922 jg convertloop 3023 jg convertloop
2923 3024
2924 pop ebp 3025 pop ebp
(...skipping 3626 matching lines...) Expand 10 before | Expand all | Expand 10 after
6551 } 6652 }
6552 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 6653 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6553 6654
6554 #endif // defined(_M_X64) 6655 #endif // defined(_M_X64)
6555 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) 6656 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64))
6556 6657
6557 #ifdef __cplusplus 6658 #ifdef __cplusplus
6558 } // extern "C" 6659 } // extern "C"
6559 } // namespace libyuv 6660 } // namespace libyuv
6560 #endif 6661 #endif
OLDNEW
« no previous file with comments | « source/row_gcc.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698