OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. | 2 * Copyright 2011 The LibYuv Project Authors. All rights reserved. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license | 4 * Use of this source code is governed by a BSD-style license |
5 * that can be found in the LICENSE file in the root of the source | 5 * that can be found in the LICENSE file in the root of the source |
6 * tree. An additional intellectual property rights grant can be found | 6 * tree. An additional intellectual property rights grant can be found |
7 * in the file PATENTS. All contributing project authors may | 7 * in the file PATENTS. All contributing project authors may |
8 * be found in the AUTHORS file in the root of the source tree. | 8 * be found in the AUTHORS file in the root of the source tree. |
9 */ | 9 */ |
10 | 10 |
(...skipping 16 matching lines...) |
27 | 27 |
28 // 64 bit | 28 // 64 bit |
29 #if defined(_M_X64) | 29 #if defined(_M_X64) |
30 | 30 |
31 // Read 4 UV from 422, upsample to 8 UV. | 31 // Read 4 UV from 422, upsample to 8 UV. |
32 #define READYUV422 \ | 32 #define READYUV422 \ |
33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ | 33 xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf); \ |
34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ | 34 xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset)); \ |
35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 35 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ | 36 xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ |
37 u_buf += 4; | 37 u_buf += 4; \ |
| 38 xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ |
| 39 xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ |
| 40 y_buf += 8; \ |
38 | 41 |
39 // Convert 8 pixels: 8 UV and 8 Y. | 42 // Convert 8 pixels: 8 UV and 8 Y. |
40 #define YUVTORGB(YuvConstants) \ | 43 #define YUVTORGB(YuvConstants) \ |
41 xmm1 = _mm_loadu_si128(&xmm0); \ | 44 xmm1 = _mm_loadu_si128(&xmm0); \ |
42 xmm2 = _mm_loadu_si128(&xmm0); \ | 45 xmm2 = _mm_loadu_si128(&xmm0); \ |
43 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); \ | 46 xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)YuvConstants->kUVToB); \ |
44 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG); \ | 47 xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)YuvConstants->kUVToG); \ |
45 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR); \ | 48 xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)YuvConstants->kUVToR); \ |
46 xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ | 49 xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ |
47 xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ | 50 xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ |
48 xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ | 51 xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ |
49 xmm3 = _mm_loadl_epi64((__m128i*)y_buf); \ | 52 xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \ |
50 y_buf += 8; \ | 53 xmm0 = _mm_adds_epi16(xmm0, xmm4); \ |
51 xmm3 = _mm_unpacklo_epi8(xmm3, xmm3); \ | 54 xmm1 = _mm_adds_epi16(xmm1, xmm4); \ |
52 xmm3 = _mm_mulhi_epu16(xmm3, *(__m128i*)YuvConstants->kYToRgb); \ | 55 xmm2 = _mm_adds_epi16(xmm2, xmm4); \ |
53 xmm0 = _mm_adds_epi16(xmm0, xmm3); \ | |
54 xmm1 = _mm_adds_epi16(xmm1, xmm3); \ | |
55 xmm2 = _mm_adds_epi16(xmm2, xmm3); \ | |
56 xmm0 = _mm_srai_epi16(xmm0, 6); \ | 56 xmm0 = _mm_srai_epi16(xmm0, 6); \ |
57 xmm1 = _mm_srai_epi16(xmm1, 6); \ | 57 xmm1 = _mm_srai_epi16(xmm1, 6); \ |
58 xmm2 = _mm_srai_epi16(xmm2, 6); \ | 58 xmm2 = _mm_srai_epi16(xmm2, 6); \ |
59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ | 59 xmm0 = _mm_packus_epi16(xmm0, xmm0); \ |
60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ | 60 xmm1 = _mm_packus_epi16(xmm1, xmm1); \ |
61 xmm2 = _mm_packus_epi16(xmm2, xmm2); | 61 xmm2 = _mm_packus_epi16(xmm2, xmm2); |
62 | 62 |
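 The READYUV*/YUVTORGB macros above implement a 6-bit fixed-point YUV-to-RGB conversion: the UV terms come from pmaddubsw against per-channel coefficient tables, the Y term from duplicating each Y byte (punpcklbw) and scaling it with pmulhuw against kYToRgb, and the sums are shifted right by 6 and saturated to bytes. As a reading aid only, here is a rough scalar sketch of that math; the coefficients are round BT.601-style approximations, not libyuv's exact YuvConstants tables, and the function names are illustrative.

 #include <stdint.h>

 static uint8_t clamp255(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

 /* One pixel of the same >>6 fixed-point math in scalar form (illustrative
  * coefficients: 1.164*64~=74, 2.018*64~=129, 0.391*64~=25, 0.813*64~=52,
  * 1.596*64~=102). Output byte order matches STOREARGB: B, G, R, A. */
 static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v, uint8_t argb[4]) {
   int y1 = (y - 16) * 74;                            /* Y term, like pmulhuw with kYToRgb */
   int uc = u - 128;
   int vc = v - 128;
   argb[0] = clamp255((y1 + 129 * uc) >> 6);          /* B */
   argb[1] = clamp255((y1 - 25 * uc - 52 * vc) >> 6); /* G */
   argb[2] = clamp255((y1 + 102 * vc) >> 6);          /* R */
   argb[3] = 0xff;                                    /* alpha, like xmm5 */
 }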
63 // Store 8 ARGB values. | 63 // Store 8 ARGB values. |
64 #define STOREARGB \ | 64 #define STOREARGB \ |
65 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ | 65 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ |
(...skipping 17 matching lines...) |
83 dst_argb += 32; | 83 dst_argb += 32; |
84 | 84 |
85 | 85 |
86 #if defined(HAS_I422TOARGBROW_SSSE3) | 86 #if defined(HAS_I422TOARGBROW_SSSE3) |
87 void I422ToARGBRow_SSSE3(const uint8* y_buf, | 87 void I422ToARGBRow_SSSE3(const uint8* y_buf, |
88 const uint8* u_buf, | 88 const uint8* u_buf, |
89 const uint8* v_buf, | 89 const uint8* v_buf, |
90 uint8* dst_argb, | 90 uint8* dst_argb, |
91 struct YuvConstants* yuvconstants, | 91 struct YuvConstants* yuvconstants, |
92 int width) { | 92 int width) { |
93 __m128i xmm0, xmm1, xmm2, xmm3; | 93 __m128i xmm0, xmm1, xmm2, xmm4; |
94 const __m128i xmm5 = _mm_set1_epi8(-1); | 94 const __m128i xmm5 = _mm_set1_epi8(-1); |
95 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 95 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
96 while (width > 0) { | 96 while (width > 0) { |
97 READYUV422 | 97 READYUV422 |
98 YUVTORGB(YuvConstants) | 98 YUVTORGB(yuvconstants) |
99 STOREARGB | 99 STOREARGB |
100 width -= 8; | 100 width -= 8; |
101 } | 101 } |
102 } | 102 } |
103 #endif | 103 #endif |
104 | 104 |
105 #if defined(HAS_I422TOABGRROW_SSSE3) | 105 #if defined(HAS_I422TOABGRROW_SSSE3) |
106 void I422ToABGRRow_SSSE3(const uint8* y_buf, | 106 void I422ToABGRRow_SSSE3(const uint8* y_buf, |
107 const uint8* u_buf, | 107 const uint8* u_buf, |
108 const uint8* v_buf, | 108 const uint8* v_buf, |
109 uint8* dst_argb, | 109 uint8* dst_argb, |
110 struct YuvConstants* yuvconstants, | 110 struct YuvConstants* yuvconstants, |
111 int width) { | 111 int width) { |
112 __m128i xmm0, xmm1, xmm2, xmm3; | 112 __m128i xmm0, xmm1, xmm2, xmm4; |
113 const __m128i xmm5 = _mm_set1_epi8(-1); | 113 const __m128i xmm5 = _mm_set1_epi8(-1); |
114 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; | 114 const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf; |
115 while (width > 0) { | 115 while (width > 0) { |
116 READYUV422 | 116 READYUV422 |
117 YUVTORGB(YuvConstants) | 117 YUVTORGB(yuvconstants) |
118 STOREABGR | 118 STOREABGR |
119 width -= 8; | 119 width -= 8; |
120 } | 120 } |
121 } | 121 } |
122 #endif | 122 #endif |
123 // 32 bit | 123 // 32 bit |
124 #else // defined(_M_X64) | 124 #else // defined(_M_X64) |
125 #ifdef HAS_ARGBTOYROW_SSSE3 | 125 #ifdef HAS_ARGBTOYROW_SSSE3 |
126 | 126 |
127 // Constants for ARGB. | 127 // Constants for ARGB. |
(...skipping 1717 matching lines...) |
1845 #endif // HAS_ARGBTOYROW_SSSE3 | 1845 #endif // HAS_ARGBTOYROW_SSSE3 |
1846 | 1846 |
1847 // Read 16 UV from 444 | 1847 // Read 16 UV from 444 |
1848 #define READYUV444_AVX2 __asm { \ | 1848 #define READYUV444_AVX2 __asm { \ |
1849 __asm vmovdqu xmm0, [esi] /* U */ \ | 1849 __asm vmovdqu xmm0, [esi] /* U */ \ |
1850 __asm vmovdqu xmm1, [esi + edi] /* V */ \ | 1850 __asm vmovdqu xmm1, [esi + edi] /* V */ \ |
1851 __asm lea esi, [esi + 16] \ | 1851 __asm lea esi, [esi + 16] \ |
1852 __asm vpermq ymm0, ymm0, 0xd8 \ | 1852 __asm vpermq ymm0, ymm0, 0xd8 \ |
1853 __asm vpermq ymm1, ymm1, 0xd8 \ | 1853 __asm vpermq ymm1, ymm1, 0xd8 \ |
1854 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1854 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
| 1855 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1856 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1857 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1858 __asm lea eax, [eax + 16] \ |
1855 } | 1859 } |
1856 | 1860 |
1857 // Read 8 UV from 422, upsample to 16 UV. | 1861 // Read 8 UV from 422, upsample to 16 UV. |
1858 #define READYUV422_AVX2 __asm { \ | 1862 #define READYUV422_AVX2 __asm { \ |
1859 __asm vmovq xmm0, qword ptr [esi] /* U */ \ | 1863 __asm vmovq xmm0, qword ptr [esi] /* U */ \ |
1860 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ | 1864 __asm vmovq xmm1, qword ptr [esi + edi] /* V */ \ |
1861 __asm lea esi, [esi + 8] \ | 1865 __asm lea esi, [esi + 8] \ |
1862 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1866 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
1863 __asm vpermq ymm0, ymm0, 0xd8 \ | 1867 __asm vpermq ymm0, ymm0, 0xd8 \ |
1864 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1868 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1869 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1870 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1871 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1872 __asm lea eax, [eax + 16] \ |
1865 } | 1873 } |
1866 | 1874 |
1867 // Read 4 UV from 411, upsample to 16 UV. | 1875 // Read 4 UV from 411, upsample to 16 UV. |
1868 #define READYUV411_AVX2 __asm { \ | 1876 #define READYUV411_AVX2 __asm { \ |
1869 __asm vmovd xmm0, dword ptr [esi] /* U */ \ | 1877 __asm vmovd xmm0, dword ptr [esi] /* U */ \ |
1870 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ | 1878 __asm vmovd xmm1, dword ptr [esi + edi] /* V */ \ |
1871 __asm lea esi, [esi + 4] \ | 1879 __asm lea esi, [esi + 4] \ |
1872 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ | 1880 __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ |
1873 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1881 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
1874 __asm vpermq ymm0, ymm0, 0xd8 \ | 1882 __asm vpermq ymm0, ymm0, 0xd8 \ |
1875 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ | 1883 __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ |
| 1884 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1885 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1886 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1887 __asm lea eax, [eax + 16] \ |
1876 } | 1888 } |
1877 | 1889 |
1878 // Read 8 UV from NV12, upsample to 16 UV. | 1890 // Read 8 UV from NV12, upsample to 16 UV. |
1879 #define READNV12_AVX2 __asm { \ | 1891 #define READNV12_AVX2 __asm { \ |
1880 __asm vmovdqu xmm0, [esi] /* UV */ \ | 1892 __asm vmovdqu xmm0, [esi] /* UV */ \ |
1881 __asm lea esi, [esi + 16] \ | 1893 __asm lea esi, [esi + 16] \ |
1882 __asm vpermq ymm0, ymm0, 0xd8 \ | 1894 __asm vpermq ymm0, ymm0, 0xd8 \ |
1883 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ | 1895 __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ |
| 1896 __asm vmovdqu xmm4, [eax] /* Y */ \ |
| 1897 __asm vpermq ymm4, ymm4, 0xd8 \ |
| 1898 __asm vpunpcklbw ymm4, ymm4, ymm4 \ |
| 1899 __asm lea eax, [eax + 16] \ |
1884 } | 1900 } |
1885 | 1901 |
1886 // Convert 16 pixels: 16 UV and 16 Y. | 1902 // Convert 16 pixels: 16 UV and 16 Y. |
1887 #define YUVTORGB_AVX2(YuvConstants) __asm { \ | 1903 #define YUVTORGB_AVX2(YuvConstants) __asm { \ |
1888 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ | 1904 __asm vpmaddubsw ymm2, ymm0, ymmword ptr [YuvConstants + KUVTOR] /* R UV */\ |
1889 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ | 1905 __asm vpmaddubsw ymm1, ymm0, ymmword ptr [YuvConstants + KUVTOG] /* G UV */\ |
1890 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ | 1906 __asm vpmaddubsw ymm0, ymm0, ymmword ptr [YuvConstants + KUVTOB] /* B UV */\ |
1891 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ | 1907 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASR] \ |
1892 __asm vpsubw ymm2, ymm3, ymm2 \ | 1908 __asm vpsubw ymm2, ymm3, ymm2 \ |
1893 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ | 1909 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASG] \ |
1894 __asm vpsubw ymm1, ymm3, ymm1 \ | 1910 __asm vpsubw ymm1, ymm3, ymm1 \ |
1895 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ | 1911 __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ |
1896 __asm vpsubw ymm0, ymm3, ymm0 \ | 1912 __asm vpsubw ymm0, ymm3, ymm0 \ |
1897 /* Step 2: Find Y contribution to 16 R,G,B values */ \ | 1913 /* Step 2: Find Y contribution to 16 R,G,B values */ \ |
1898 __asm vmovdqu xmm3, [eax] \ | 1914 __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ |
1899 __asm lea eax, [eax + 16] \ | 1915 __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ |
1900 __asm vpermq ymm3, ymm3, 0xd8 \ | 1916 __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ |
1901 __asm vpunpcklbw ymm3, ymm3, ymm3 \ | 1917 __asm vpaddsw ymm2, ymm2, ymm4 /* R += Y */ \ |
1902 __asm vpmulhuw ymm3, ymm3, ymmword ptr [YuvConstants + KYTORGB] \ | |
1903 __asm vpaddsw ymm0, ymm0, ymm3 /* B += Y */ \ | |
1904 __asm vpaddsw ymm1, ymm1, ymm3 /* G += Y */ \ | |
1905 __asm vpaddsw ymm2, ymm2, ymm3 /* R += Y */ \ | |
1906 __asm vpsraw ymm0, ymm0, 6 \ | 1918 __asm vpsraw ymm0, ymm0, 6 \ |
1907 __asm vpsraw ymm1, ymm1, 6 \ | 1919 __asm vpsraw ymm1, ymm1, 6 \ |
1908 __asm vpsraw ymm2, ymm2, 6 \ | 1920 __asm vpsraw ymm2, ymm2, 6 \ |
1909 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ | 1921 __asm vpackuswb ymm0, ymm0, ymm0 /* B */ \ |
1910 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ | 1922 __asm vpackuswb ymm1, ymm1, ymm1 /* G */ \ |
1911 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ | 1923 __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ |
1912 } | 1924 } |
1913 | 1925 |
1914 // Store 16 ARGB values. | 1926 // Store 16 ARGB values. |
1915 #define STOREARGB_AVX2 __asm { \ | 1927 #define STOREARGB_AVX2 __asm { \ |
(...skipping 58 matching lines...) |
1974 struct YuvConstants* yuvconstants, | 1986 struct YuvConstants* yuvconstants, |
1975 int width) { | 1987 int width) { |
1976 __asm { | 1988 __asm { |
1977 push esi | 1989 push esi |
1978 push edi | 1990 push edi |
1979 push ebp | 1991 push ebp |
1980 mov eax, [esp + 12 + 4] // Y | 1992 mov eax, [esp + 12 + 4] // Y |
1981 mov esi, [esp + 12 + 8] // U | 1993 mov esi, [esp + 12 + 8] // U |
1982 mov edi, [esp + 12 + 12] // V | 1994 mov edi, [esp + 12 + 12] // V |
1983 mov edx, [esp + 12 + 16] // argb | 1995 mov edx, [esp + 12 + 16] // argb |
1984 mov ebp, [esp + 12 + 20] // YuvConstants | 1996 mov ebp, [esp + 12 + 20] // yuvconstants |
1985 mov ecx, [esp + 12 + 24] // width | 1997 mov ecx, [esp + 12 + 24] // width |
1986 sub edi, esi | 1998 sub edi, esi |
1987 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 1999 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
1988 | 2000 |
1989 convertloop: | 2001 convertloop: |
1990 READYUV422_AVX2 | 2002 READYUV422_AVX2 |
1991 YUVTORGB_AVX2(ebp) | 2003 YUVTORGB_AVX2(ebp) |
1992 STOREARGB_AVX2 | 2004 STOREARGB_AVX2 |
1993 | 2005 |
1994 sub ecx, 16 | 2006 sub ecx, 16 |
(...skipping 19 matching lines...) |
2014 struct YuvConstants* yuvconstants, | 2026 struct YuvConstants* yuvconstants, |
2015 int width) { | 2027 int width) { |
2016 __asm { | 2028 __asm { |
2017 push esi | 2029 push esi |
2018 push edi | 2030 push edi |
2019 push ebp | 2031 push ebp |
2020 mov eax, [esp + 12 + 4] // Y | 2032 mov eax, [esp + 12 + 4] // Y |
2021 mov esi, [esp + 12 + 8] // U | 2033 mov esi, [esp + 12 + 8] // U |
2022 mov edi, [esp + 12 + 12] // V | 2034 mov edi, [esp + 12 + 12] // V |
2023 mov edx, [esp + 12 + 16] // argb | 2035 mov edx, [esp + 12 + 16] // argb |
2024 mov ebp, [esp + 12 + 20] // YuvConstants | 2036 mov ebp, [esp + 12 + 20] // yuvconstants |
2025 mov ecx, [esp + 12 + 24] // width | 2037 mov ecx, [esp + 12 + 24] // width |
2026 sub edi, esi | 2038 sub edi, esi |
2027 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2039 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2028 convertloop: | 2040 convertloop: |
2029 READYUV444_AVX2 | 2041 READYUV444_AVX2 |
2030 YUVTORGB_AVX2(ebp) | 2042 YUVTORGB_AVX2(ebp) |
2031 STOREARGB_AVX2 | 2043 STOREARGB_AVX2 |
2032 | 2044 |
2033 sub ecx, 16 | 2045 sub ecx, 16 |
2034 jg convertloop | 2046 jg convertloop |
(...skipping 18 matching lines...) |
2053 struct YuvConstants* yuvconstants, | 2065 struct YuvConstants* yuvconstants, |
2054 int width) { | 2066 int width) { |
2055 __asm { | 2067 __asm { |
2056 push esi | 2068 push esi |
2057 push edi | 2069 push edi |
2058 push ebp | 2070 push ebp |
2059 mov eax, [esp + 12 + 4] // Y | 2071 mov eax, [esp + 12 + 4] // Y |
2060 mov esi, [esp + 12 + 8] // U | 2072 mov esi, [esp + 12 + 8] // U |
2061 mov edi, [esp + 12 + 12] // V | 2073 mov edi, [esp + 12 + 12] // V |
2062 mov edx, [esp + 12 + 16] // abgr | 2074 mov edx, [esp + 12 + 16] // abgr |
2063 mov ebp, [esp + 12 + 20] // YuvConstants | 2075 mov ebp, [esp + 12 + 20] // yuvconstants |
2064 mov ecx, [esp + 12 + 24] // width | 2076 mov ecx, [esp + 12 + 24] // width |
2065 sub edi, esi | 2077 sub edi, esi |
2066 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2078 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2067 convertloop: | 2079 convertloop: |
2068 READYUV444_AVX2 | 2080 READYUV444_AVX2 |
2069 YUVTORGB_AVX2(ebp) | 2081 YUVTORGB_AVX2(ebp) |
2070 STOREABGR_AVX2 | 2082 STOREABGR_AVX2 |
2071 | 2083 |
2072 sub ecx, 16 | 2084 sub ecx, 16 |
2073 jg convertloop | 2085 jg convertloop |
(...skipping 18 matching lines...) |
2092 struct YuvConstants* yuvconstants, | 2104 struct YuvConstants* yuvconstants, |
2093 int width) { | 2105 int width) { |
2094 __asm { | 2106 __asm { |
2095 push esi | 2107 push esi |
2096 push edi | 2108 push edi |
2097 push ebp | 2109 push ebp |
2098 mov eax, [esp + 12 + 4] // Y | 2110 mov eax, [esp + 12 + 4] // Y |
2099 mov esi, [esp + 12 + 8] // U | 2111 mov esi, [esp + 12 + 8] // U |
2100 mov edi, [esp + 12 + 12] // V | 2112 mov edi, [esp + 12 + 12] // V |
2101 mov edx, [esp + 12 + 16] // abgr | 2113 mov edx, [esp + 12 + 16] // abgr |
2102 mov ebp, [esp + 12 + 20] // YuvConstants | 2114 mov ebp, [esp + 12 + 20] // yuvconstants |
2103 mov ecx, [esp + 12 + 24] // width | 2115 mov ecx, [esp + 12 + 24] // width |
2104 sub edi, esi | 2116 sub edi, esi |
2105 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2117 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2106 | 2118 |
2107 convertloop: | 2119 convertloop: |
2108 READYUV411_AVX2 | 2120 READYUV411_AVX2 |
2109 YUVTORGB_AVX2(ebp) | 2121 YUVTORGB_AVX2(ebp) |
2110 STOREARGB_AVX2 | 2122 STOREARGB_AVX2 |
2111 | 2123 |
2112 sub ecx, 16 | 2124 sub ecx, 16 |
(...skipping 16 matching lines...) |
2129 const uint8* uv_buf, | 2141 const uint8* uv_buf, |
2130 uint8* dst_argb, | 2142 uint8* dst_argb, |
2131 struct YuvConstants* yuvconstants, | 2143 struct YuvConstants* yuvconstants, |
2132 int width) { | 2144 int width) { |
2133 __asm { | 2145 __asm { |
2134 push esi | 2146 push esi |
2135 push ebp | 2147 push ebp |
2136 mov eax, [esp + 8 + 4] // Y | 2148 mov eax, [esp + 8 + 4] // Y |
2137 mov esi, [esp + 8 + 8] // UV | 2149 mov esi, [esp + 8 + 8] // UV |
2138 mov edx, [esp + 8 + 12] // argb | 2150 mov edx, [esp + 8 + 12] // argb |
2139 mov ebp, [esp + 8 + 16] // YuvConstants | 2151 mov ebp, [esp + 8 + 16] // yuvconstants |
2140 mov ecx, [esp + 8 + 20] // width | 2152 mov ecx, [esp + 8 + 20] // width |
2141 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2153 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2142 | 2154 |
2143 convertloop: | 2155 convertloop: |
2144 READNV12_AVX2 | 2156 READNV12_AVX2 |
2145 YUVTORGB_AVX2(ebp) | 2157 YUVTORGB_AVX2(ebp) |
2146 STOREARGB_AVX2 | 2158 STOREARGB_AVX2 |
2147 | 2159 |
2148 sub ecx, 16 | 2160 sub ecx, 16 |
2149 jg convertloop | 2161 jg convertloop |
(...skipping 18 matching lines...) |
2168 struct YuvConstants* yuvconstants, | 2180 struct YuvConstants* yuvconstants, |
2169 int width) { | 2181 int width) { |
2170 __asm { | 2182 __asm { |
2171 push esi | 2183 push esi |
2172 push edi | 2184 push edi |
2173 push ebp | 2185 push ebp |
2174 mov eax, [esp + 12 + 4] // Y | 2186 mov eax, [esp + 12 + 4] // Y |
2175 mov esi, [esp + 12 + 8] // U | 2187 mov esi, [esp + 12 + 8] // U |
2176 mov edi, [esp + 12 + 12] // V | 2188 mov edi, [esp + 12 + 12] // V |
2177 mov edx, [esp + 12 + 16] // abgr | 2189 mov edx, [esp + 12 + 16] // abgr |
2178 mov ebp, [esp + 12 + 20] // YuvConstants | 2190 mov ebp, [esp + 12 + 20] // yuvconstants |
2179 mov ecx, [esp + 12 + 24] // width | 2191 mov ecx, [esp + 12 + 24] // width |
2180 sub edi, esi | 2192 sub edi, esi |
2181 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2193 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2182 | 2194 |
2183 convertloop: | 2195 convertloop: |
2184 READYUV422_AVX2 | 2196 READYUV422_AVX2 |
2185 YUVTORGB_AVX2(ebp) | 2197 YUVTORGB_AVX2(ebp) |
2186 STOREBGRA_AVX2 | 2198 STOREBGRA_AVX2 |
2187 | 2199 |
2188 sub ecx, 16 | 2200 sub ecx, 16 |
(...skipping 19 matching lines...) |
2208 struct YuvConstants* yuvconstants, | 2220 struct YuvConstants* yuvconstants, |
2209 int width) { | 2221 int width) { |
2210 __asm { | 2222 __asm { |
2211 push esi | 2223 push esi |
2212 push edi | 2224 push edi |
2213 push ebp | 2225 push ebp |
2214 mov eax, [esp + 12 + 4] // Y | 2226 mov eax, [esp + 12 + 4] // Y |
2215 mov esi, [esp + 12 + 8] // U | 2227 mov esi, [esp + 12 + 8] // U |
2216 mov edi, [esp + 12 + 12] // V | 2228 mov edi, [esp + 12 + 12] // V |
2217 mov edx, [esp + 12 + 16] // abgr | 2229 mov edx, [esp + 12 + 16] // abgr |
2218 mov ebp, [esp + 12 + 20] // YuvConstants | 2230 mov ebp, [esp + 12 + 20] // yuvconstants |
2219 mov ecx, [esp + 12 + 24] // width | 2231 mov ecx, [esp + 12 + 24] // width |
2220 sub edi, esi | 2232 sub edi, esi |
2221 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2233 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2222 | 2234 |
2223 convertloop: | 2235 convertloop: |
2224 READYUV422_AVX2 | 2236 READYUV422_AVX2 |
2225 YUVTORGB_AVX2(ebp) | 2237 YUVTORGB_AVX2(ebp) |
2226 STORERGBA_AVX2 | 2238 STORERGBA_AVX2 |
2227 | 2239 |
2228 sub ecx, 16 | 2240 sub ecx, 16 |
(...skipping 19 matching lines...) |
2248 struct YuvConstants* yuvconstants, | 2260 struct YuvConstants* yuvconstants, |
2249 int width) { | 2261 int width) { |
2250 __asm { | 2262 __asm { |
2251 push esi | 2263 push esi |
2252 push edi | 2264 push edi |
2253 push ebp | 2265 push ebp |
2254 mov eax, [esp + 12 + 4] // Y | 2266 mov eax, [esp + 12 + 4] // Y |
2255 mov esi, [esp + 12 + 8] // U | 2267 mov esi, [esp + 12 + 8] // U |
2256 mov edi, [esp + 12 + 12] // V | 2268 mov edi, [esp + 12 + 12] // V |
2257 mov edx, [esp + 12 + 16] // argb | 2269 mov edx, [esp + 12 + 16] // argb |
2258 mov ebp, [esp + 12 + 20] // YuvConstants | 2270 mov ebp, [esp + 12 + 20] // yuvconstants |
2259 mov ecx, [esp + 12 + 24] // width | 2271 mov ecx, [esp + 12 + 24] // width |
2260 sub edi, esi | 2272 sub edi, esi |
2261 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha | 2273 vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha |
2262 | 2274 |
2263 convertloop: | 2275 convertloop: |
2264 READYUV422_AVX2 | 2276 READYUV422_AVX2 |
2265 YUVTORGB_AVX2(ebp) | 2277 YUVTORGB_AVX2(ebp) |
2266 STOREABGR_AVX2 | 2278 STOREABGR_AVX2 |
2267 | 2279 |
2268 sub ecx, 16 | 2280 sub ecx, 16 |
(...skipping 10 matching lines...) |
2279 | 2291 |
2280 #if defined(HAS_I422TOARGBROW_SSSE3) | 2292 #if defined(HAS_I422TOARGBROW_SSSE3) |
2281 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. | 2293 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. |
2282 | 2294 |
2283 // Read 8 UV from 444. | 2295 // Read 8 UV from 444. |
2284 #define READYUV444 __asm { \ | 2296 #define READYUV444 __asm { \ |
2285 __asm movq xmm0, qword ptr [esi] /* U */ \ | 2297 __asm movq xmm0, qword ptr [esi] /* U */ \ |
2286 __asm movq xmm1, qword ptr [esi + edi] /* V */ \ | 2298 __asm movq xmm1, qword ptr [esi + edi] /* V */ \ |
2287 __asm lea esi, [esi + 8] \ | 2299 __asm lea esi, [esi + 8] \ |
2288 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2300 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
| 2301 __asm movq xmm4, qword ptr [eax] \ |
| 2302 __asm punpcklbw xmm4, xmm4 \ |
| 2303 __asm lea eax, [eax + 8] \ |
2289 } | 2304 } |
2290 | 2305 |
2291 // Read 4 UV from 422, upsample to 8 UV. | 2306 // Read 4 UV from 422, upsample to 8 UV. |
2292 #define READYUV422 __asm { \ | 2307 #define READYUV422 __asm { \ |
2293 __asm movd xmm0, [esi] /* U */ \ | 2308 __asm movd xmm0, [esi] /* U */ \ |
2294 __asm movd xmm1, [esi + edi] /* V */ \ | 2309 __asm movd xmm1, [esi + edi] /* V */ \ |
2295 __asm lea esi, [esi + 4] \ | 2310 __asm lea esi, [esi + 4] \ |
2296 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2311 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
2297 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2312 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| 2313 __asm movq xmm4, qword ptr [eax] \ |
| 2314 __asm punpcklbw xmm4, xmm4 \ |
| 2315 __asm lea eax, [eax + 8] \ |
2298 } | 2316 } |
2299 | 2317 |
2300 // Read 2 UV from 411, upsample to 8 UV. | 2318 // Read 2 UV from 411, upsample to 8 UV. |
2301 #define READYUV411 __asm { \ | 2319 #define READYUV411 __asm { \ |
2302 __asm pinsrw xmm0, [esi], 0 /* U */ \ | 2320 __asm pinsrw xmm0, [esi], 0 /* U */ \ |
2303 __asm pinsrw xmm1, [esi + edi], 0 /* V */ \ | 2321 __asm pinsrw xmm1, [esi + edi], 0 /* V */ \ |
2304 __asm lea esi, [esi + 2] \ | 2322 __asm lea esi, [esi + 2] \ |
2305 __asm punpcklbw xmm0, xmm1 /* UV */ \ | 2323 __asm punpcklbw xmm0, xmm1 /* UV */ \ |
2306 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2324 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
2307 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ | 2325 __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ |
| 2326 __asm movq xmm4, qword ptr [eax] \ |
| 2327 __asm punpcklbw xmm4, xmm4 \ |
| 2328 __asm lea eax, [eax + 8] \ |
2308 } | 2329 } |
2309 | 2330 |
2310 // Read 4 UV from NV12, upsample to 8 UV. | 2331 // Read 4 UV from NV12, upsample to 8 UV. |
2311 #define READNV12 __asm { \ | 2332 #define READNV12 __asm { \ |
2312 __asm movq xmm0, qword ptr [esi] /* UV */ \ | 2333 __asm movq xmm0, qword ptr [esi] /* UV */ \ |
2313 __asm lea esi, [esi + 8] \ | 2334 __asm lea esi, [esi + 8] \ |
2314 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ | 2335 __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ |
| 2336 __asm movq xmm4, qword ptr [eax] \ |
| 2337 __asm punpcklbw xmm4, xmm4 \ |
| 2338 __asm lea eax, [eax + 8] \ |
| 2339 } |
| 2340 |
| 2341 // YUY2 shuf 8 Y to 16 Y. |
| 2342 static const vec8 kShuffleYUY2Y = { |
| 2343 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 |
| 2344 }; |
| 2345 |
| 2346 // YUY2 shuf 4 UV to 8 UV. |
| 2347 static const vec8 kShuffleYUY2UV = { |
| 2348 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 |
| 2349 }; |
| 2350 |
| 2351 // Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. |
| 2352 #define READYUY2 __asm { \ |
| 2353 __asm movdqu xmm4, [eax] /* YUY2 */ \ |
| 2354 __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ |
| 2355 __asm movdqu xmm0, [eax] /* UV */ \ |
| 2356 __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ |
| 2357 __asm lea eax, [eax + 16] \ |
| 2358 } |
| 2359 |
| 2360 // UYVY shuf 8 Y to 16 Y. |
| 2361 static const vec8 kShuffleUYVYY = { |
| 2362 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 |
| 2363 }; |
| 2364 |
| 2365 // UYVY shuf 4 UV to 8 UV. |
| 2366 static const vec8 kShuffleUYVYUV = { |
| 2367 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 |
| 2368 }; |
| 2369 |
| 2370 // Read 4 UYVY with 8 Y and update 4 UV to 8 UV. |
| 2371 #define READUYVY __asm { \ |
| 2372 __asm movdqu xmm4, [eax] /* UYVY */ \ |
| 2373 __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ |
| 2374 __asm movdqu xmm0, [eax] /* UV */ \ |
| 2375 __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ |
| 2376 __asm lea eax, [eax + 16] \ |
2315 } | 2377 } |
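 The kShuffleYUY2* / kShuffleUYVY* tables above let the new READYUY2 and READUYVY macros split packed 4:2:2 data into the same duplicated-Y and upsampled-UV registers the planar readers produce. A minimal scalar sketch of that byte shuffle, assuming the standard YUY2 layout Y0 U0 Y1 V0 Y2 U2 Y3 V2 ... (the function name is illustrative):

 #include <stdint.h>

 /* Scalar model of what READYUY2 does with pshufb: from 16 bytes of YUY2,
  * build the duplicated-Y vector that feeds pmulhuw and the UVUV vector that
  * feeds pmaddubsw. UYVY is the same idea with the Y and UV index tables
  * swapped to the odd/even byte positions. */
 static void ReadYuy2Sketch(const uint8_t yuy2[16],
                            uint8_t y_dup[16], uint8_t uv[16]) {
   static const uint8_t kY[16]  = {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
   static const uint8_t kUV[16] = {1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15};
   for (int i = 0; i < 16; ++i) {
     y_dup[i] = yuy2[kY[i]];   /* each Y repeated, matching punpcklbw xmm4, xmm4 */
     uv[i]    = yuy2[kUV[i]];  /* each UV pair repeated for two pixels (422 upsample) */
   }
 }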
2316 | 2378 |
2317 // Convert 8 pixels: 8 UV and 8 Y. | 2379 // Convert 8 pixels: 8 UV and 8 Y. |
2318 #define YUVTORGB(YuvConstants) __asm { \ | 2380 #define YUVTORGB(YuvConstants) __asm { \ |
2319 __asm movdqa xmm1, xmm0 \ | 2381 __asm movdqa xmm1, xmm0 \ |
2320 __asm movdqa xmm2, xmm0 \ | 2382 __asm movdqa xmm2, xmm0 \ |
2321 __asm movdqa xmm3, xmm0 \ | 2383 __asm movdqa xmm3, xmm0 \ |
2322 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ | 2384 __asm movdqa xmm0, xmmword ptr [YuvConstants + KUVBIASB] \ |
2323 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ | 2385 __asm pmaddubsw xmm1, xmmword ptr [YuvConstants + KUVTOB] \ |
2324 __asm psubw xmm0, xmm1 \ | 2386 __asm psubw xmm0, xmm1 \ |
2325 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ | 2387 __asm movdqa xmm1, xmmword ptr [YuvConstants + KUVBIASG] \ |
2326 __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ | 2388 __asm pmaddubsw xmm2, xmmword ptr [YuvConstants + KUVTOG] \ |
2327 __asm psubw xmm1, xmm2 \ | 2389 __asm psubw xmm1, xmm2 \ |
2328 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ | 2390 __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ |
2329 __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ | 2391 __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ |
2330 __asm psubw xmm2, xmm3 \ | 2392 __asm psubw xmm2, xmm3 \ |
2331 __asm movq xmm3, qword ptr [eax] \ | 2393 __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ |
2332 __asm lea eax, [eax + 8] \ | 2394 __asm paddsw xmm0, xmm4 /* B += Y */ \ |
2333 __asm punpcklbw xmm3, xmm3 \ | 2395 __asm paddsw xmm1, xmm4 /* G += Y */ \ |
2334 __asm pmulhuw xmm3, xmmword ptr [YuvConstants + KYTORGB] \ | 2396 __asm paddsw xmm2, xmm4 /* R += Y */ \ |
2335 __asm paddsw xmm0, xmm3 /* B += Y */ \ | |
2336 __asm paddsw xmm1, xmm3 /* G += Y */ \ | |
2337 __asm paddsw xmm2, xmm3 /* R += Y */ \ | |
2338 __asm psraw xmm0, 6 \ | 2397 __asm psraw xmm0, 6 \ |
2339 __asm psraw xmm1, 6 \ | 2398 __asm psraw xmm1, 6 \ |
2340 __asm psraw xmm2, 6 \ | 2399 __asm psraw xmm2, 6 \ |
2341 __asm packuswb xmm0, xmm0 /* B */ \ | 2400 __asm packuswb xmm0, xmm0 /* B */ \ |
2342 __asm packuswb xmm1, xmm1 /* G */ \ | 2401 __asm packuswb xmm1, xmm1 /* G */ \ |
2343 __asm packuswb xmm2, xmm2 /* R */ \ | 2402 __asm packuswb xmm2, xmm2 /* R */ \ |
2344 } | 2403 } |
2345 | 2404 |
2346 // Store 8 ARGB values. | 2405 // Store 8 ARGB values. |
2347 #define STOREARGB __asm { \ | 2406 #define STOREARGB __asm { \ |
(...skipping 125 matching lines...) |
2473 struct YuvConstants* yuvconstants, | 2532 struct YuvConstants* yuvconstants, |
2474 int width) { | 2533 int width) { |
2475 __asm { | 2534 __asm { |
2476 push esi | 2535 push esi |
2477 push edi | 2536 push edi |
2478 push ebp | 2537 push ebp |
2479 mov eax, [esp + 12 + 4] // Y | 2538 mov eax, [esp + 12 + 4] // Y |
2480 mov esi, [esp + 12 + 8] // U | 2539 mov esi, [esp + 12 + 8] // U |
2481 mov edi, [esp + 12 + 12] // V | 2540 mov edi, [esp + 12 + 12] // V |
2482 mov edx, [esp + 12 + 16] // argb | 2541 mov edx, [esp + 12 + 16] // argb |
2483 mov ebp, [esp + 12 + 20] // YuvConstants | 2542 mov ebp, [esp + 12 + 20] // yuvconstants |
2484 mov ecx, [esp + 12 + 24] // width | 2543 mov ecx, [esp + 12 + 24] // width |
2485 sub edi, esi | 2544 sub edi, esi |
2486 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2545 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2487 | 2546 |
2488 convertloop: | 2547 convertloop: |
2489 READYUV444 | 2548 READYUV444 |
2490 YUVTORGB(ebp) | 2549 YUVTORGB(ebp) |
2491 STOREARGB | 2550 STOREARGB |
2492 | 2551 |
2493 sub ecx, 8 | 2552 sub ecx, 8 |
(...skipping 16 matching lines...) |
2510 struct YuvConstants* yuvconstants, | 2569 struct YuvConstants* yuvconstants, |
2511 int width) { | 2570 int width) { |
2512 __asm { | 2571 __asm { |
2513 push esi | 2572 push esi |
2514 push edi | 2573 push edi |
2515 push ebp | 2574 push ebp |
2516 mov eax, [esp + 12 + 4] // Y | 2575 mov eax, [esp + 12 + 4] // Y |
2517 mov esi, [esp + 12 + 8] // U | 2576 mov esi, [esp + 12 + 8] // U |
2518 mov edi, [esp + 12 + 12] // V | 2577 mov edi, [esp + 12 + 12] // V |
2519 mov edx, [esp + 12 + 16] // abgr | 2578 mov edx, [esp + 12 + 16] // abgr |
2520 mov ebp, [esp + 12 + 20] // YuvConstants | 2579 mov ebp, [esp + 12 + 20] // yuvconstants |
2521 mov ecx, [esp + 12 + 24] // width | 2580 mov ecx, [esp + 12 + 24] // width |
2522 sub edi, esi | 2581 sub edi, esi |
2523 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2582 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2524 | 2583 |
2525 convertloop: | 2584 convertloop: |
2526 READYUV444 | 2585 READYUV444 |
2527 YUVTORGB(ebp) | 2586 YUVTORGB(ebp) |
2528 STOREABGR | 2587 STOREABGR |
2529 | 2588 |
2530 sub ecx, 8 | 2589 sub ecx, 8 |
(...skipping 16 matching lines...) |
2547 struct YuvConstants* yuvconstants, | 2606 struct YuvConstants* yuvconstants, |
2548 int width) { | 2607 int width) { |
2549 __asm { | 2608 __asm { |
2550 push esi | 2609 push esi |
2551 push edi | 2610 push edi |
2552 push ebp | 2611 push ebp |
2553 mov eax, [esp + 12 + 4] // Y | 2612 mov eax, [esp + 12 + 4] // Y |
2554 mov esi, [esp + 12 + 8] // U | 2613 mov esi, [esp + 12 + 8] // U |
2555 mov edi, [esp + 12 + 12] // V | 2614 mov edi, [esp + 12 + 12] // V |
2556 mov edx, [esp + 12 + 16] // argb | 2615 mov edx, [esp + 12 + 16] // argb |
2557 mov ebp, [esp + 12 + 20] // YuvConstants | 2616 mov ebp, [esp + 12 + 20] // yuvconstants |
2558 mov ecx, [esp + 12 + 24] // width | 2617 mov ecx, [esp + 12 + 24] // width |
2559 sub edi, esi | 2618 sub edi, esi |
2560 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 | 2619 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 |
2561 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 | 2620 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRGB24 |
2562 | 2621 |
2563 convertloop: | 2622 convertloop: |
2564 READYUV422 | 2623 READYUV422 |
2565 YUVTORGB(ebp) | 2624 YUVTORGB(ebp) |
2566 STORERGB24 | 2625 STORERGB24 |
2567 | 2626 |
(...skipping 17 matching lines...) |
2585 struct YuvConstants* yuvconstants, | 2644 struct YuvConstants* yuvconstants, |
2586 int width) { | 2645 int width) { |
2587 __asm { | 2646 __asm { |
2588 push esi | 2647 push esi |
2589 push edi | 2648 push edi |
2590 push ebp | 2649 push ebp |
2591 mov eax, [esp + 12 + 4] // Y | 2650 mov eax, [esp + 12 + 4] // Y |
2592 mov esi, [esp + 12 + 8] // U | 2651 mov esi, [esp + 12 + 8] // U |
2593 mov edi, [esp + 12 + 12] // V | 2652 mov edi, [esp + 12 + 12] // V |
2594 mov edx, [esp + 12 + 16] // argb | 2653 mov edx, [esp + 12 + 16] // argb |
2595 mov ebp, [esp + 12 + 20] // YuvConstants | 2654 mov ebp, [esp + 12 + 20] // yuvconstants |
2596 mov ecx, [esp + 12 + 24] // width | 2655 mov ecx, [esp + 12 + 24] // width |
2597 sub edi, esi | 2656 sub edi, esi |
2598 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 | 2657 movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 |
2599 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW | 2658 movdqa xmm6, xmmword ptr kShuffleMaskARGBToRAW |
2600 | 2659 |
2601 convertloop: | 2660 convertloop: |
2602 READYUV422 | 2661 READYUV422 |
2603 YUVTORGB(ebp) | 2662 YUVTORGB(ebp) |
2604 STORERAW | 2663 STORERAW |
2605 | 2664 |
(...skipping 17 matching lines...) |
2623 struct YuvConstants* yuvconstants, | 2682 struct YuvConstants* yuvconstants, |
2624 int width) { | 2683 int width) { |
2625 __asm { | 2684 __asm { |
2626 push esi | 2685 push esi |
2627 push edi | 2686 push edi |
2628 push ebp | 2687 push ebp |
2629 mov eax, [esp + 12 + 4] // Y | 2688 mov eax, [esp + 12 + 4] // Y |
2630 mov esi, [esp + 12 + 8] // U | 2689 mov esi, [esp + 12 + 8] // U |
2631 mov edi, [esp + 12 + 12] // V | 2690 mov edi, [esp + 12 + 12] // V |
2632 mov edx, [esp + 12 + 16] // argb | 2691 mov edx, [esp + 12 + 16] // argb |
2633 mov ebp, [esp + 12 + 20] // YuvConstants | 2692 mov ebp, [esp + 12 + 20] // yuvconstants |
2634 mov ecx, [esp + 12 + 24] // width | 2693 mov ecx, [esp + 12 + 24] // width |
2635 sub edi, esi | 2694 sub edi, esi |
2636 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f | 2695 pcmpeqb xmm5, xmm5 // generate mask 0x0000001f |
2637 psrld xmm5, 27 | 2696 psrld xmm5, 27 |
2638 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 | 2697 pcmpeqb xmm6, xmm6 // generate mask 0x000007e0 |
2639 psrld xmm6, 26 | 2698 psrld xmm6, 26 |
2640 pslld xmm6, 5 | 2699 pslld xmm6, 5 |
2641 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 | 2700 pcmpeqb xmm7, xmm7 // generate mask 0xfffff800 |
2642 pslld xmm7, 11 | 2701 pslld xmm7, 11 |
2643 | 2702 |
(...skipping 22 matching lines...) |
2666 struct YuvConstants* yuvconstants, | 2725 struct YuvConstants* yuvconstants, |
2667 int width) { | 2726 int width) { |
2668 __asm { | 2727 __asm { |
2669 push esi | 2728 push esi |
2670 push edi | 2729 push edi |
2671 push ebp | 2730 push ebp |
2672 mov eax, [esp + 12 + 4] // Y | 2731 mov eax, [esp + 12 + 4] // Y |
2673 mov esi, [esp + 12 + 8] // U | 2732 mov esi, [esp + 12 + 8] // U |
2674 mov edi, [esp + 12 + 12] // V | 2733 mov edi, [esp + 12 + 12] // V |
2675 mov edx, [esp + 12 + 16] // argb | 2734 mov edx, [esp + 12 + 16] // argb |
2676 mov ebp, [esp + 12 + 20] // YuvConstants | 2735 mov ebp, [esp + 12 + 20] // yuvconstants |
2677 mov ecx, [esp + 12 + 24] // width | 2736 mov ecx, [esp + 12 + 24] // width |
2678 sub edi, esi | 2737 sub edi, esi |
2679 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2738 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2680 | 2739 |
2681 convertloop: | 2740 convertloop: |
2682 READYUV422 | 2741 READYUV422 |
2683 YUVTORGB(ebp) | 2742 YUVTORGB(ebp) |
2684 STOREARGB | 2743 STOREARGB |
2685 | 2744 |
2686 sub ecx, 8 | 2745 sub ecx, 8 |
(...skipping 17 matching lines...) |
2704 struct YuvConstants* yuvconstants, | 2763 struct YuvConstants* yuvconstants, |
2705 int width) { | 2764 int width) { |
2706 __asm { | 2765 __asm { |
2707 push esi | 2766 push esi |
2708 push edi | 2767 push edi |
2709 push ebp | 2768 push ebp |
2710 mov eax, [esp + 12 + 4] // Y | 2769 mov eax, [esp + 12 + 4] // Y |
2711 mov esi, [esp + 12 + 8] // U | 2770 mov esi, [esp + 12 + 8] // U |
2712 mov edi, [esp + 12 + 12] // V | 2771 mov edi, [esp + 12 + 12] // V |
2713 mov edx, [esp + 12 + 16] // abgr | 2772 mov edx, [esp + 12 + 16] // abgr |
2714 mov ebp, [esp + 12 + 20] // YuvConstants | 2773 mov ebp, [esp + 12 + 20] // yuvconstants |
2715 mov ecx, [esp + 12 + 24] // width | 2774 mov ecx, [esp + 12 + 24] // width |
2716 sub edi, esi | 2775 sub edi, esi |
2717 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2776 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2718 | 2777 |
2719 convertloop: | 2778 convertloop: |
2720 READYUV411 | 2779 READYUV411 |
2721 YUVTORGB(ebp) | 2780 YUVTORGB(ebp) |
2722 STOREARGB | 2781 STOREARGB |
2723 | 2782 |
2724 sub ecx, 8 | 2783 sub ecx, 8 |
(...skipping 13 matching lines...) |
2738 const uint8* uv_buf, | 2797 const uint8* uv_buf, |
2739 uint8* dst_argb, | 2798 uint8* dst_argb, |
2740 struct YuvConstants* yuvconstants, | 2799 struct YuvConstants* yuvconstants, |
2741 int width) { | 2800 int width) { |
2742 __asm { | 2801 __asm { |
2743 push esi | 2802 push esi |
2744 push ebp | 2803 push ebp |
2745 mov eax, [esp + 8 + 4] // Y | 2804 mov eax, [esp + 8 + 4] // Y |
2746 mov esi, [esp + 8 + 8] // UV | 2805 mov esi, [esp + 8 + 8] // UV |
2747 mov edx, [esp + 8 + 12] // argb | 2806 mov edx, [esp + 8 + 12] // argb |
2748 mov ebp, [esp + 8 + 16] // YuvConstants | 2807 mov ebp, [esp + 8 + 16] // yuvconstants |
2749 mov ecx, [esp + 8 + 20] // width | 2808 mov ecx, [esp + 8 + 20] // width |
2750 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2809 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2751 | 2810 |
2752 convertloop: | 2811 convertloop: |
2753 READNV12 | 2812 READNV12 |
2754 YUVTORGB(ebp) | 2813 YUVTORGB(ebp) |
2755 STOREARGB | 2814 STOREARGB |
2756 | 2815 |
2757 sub ecx, 8 | 2816 sub ecx, 8 |
2758 jg convertloop | 2817 jg convertloop |
2759 | 2818 |
2760 pop ebp | 2819 pop ebp |
2761 pop esi | 2820 pop esi |
2762 ret | 2821 ret |
2763 } | 2822 } |
2764 } | 2823 } |
2765 | 2824 |
| 2825 // 8 pixels. |
| 2826 // 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| 2827 __declspec(naked) |
| 2828 void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, |
| 2829 uint8* dst_argb, |
| 2830 struct YuvConstants* yuvconstants, |
| 2831 int width) { |
| 2832 __asm { |
| 2833 push ebp |
| 2834 mov eax, [esp + 4 + 4] // yuy2 |
| 2835 mov edx, [esp + 4 + 8] // argb |
| 2836 mov ebp, [esp + 4 + 12] // yuvconstants |
| 2837 mov ecx, [esp + 4 + 16] // width |
| 2838 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2839 |
| 2840 convertloop: |
| 2841 READYUY2 |
| 2842 YUVTORGB(ebp) |
| 2843 STOREARGB |
| 2844 |
| 2845 sub ecx, 8 |
| 2846 jg convertloop |
| 2847 |
| 2848 pop ebp |
| 2849 ret |
| 2850 } |
| 2851 } |
| 2852 |
| 2853 // 8 pixels. |
| 2854 // 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). |
| 2855 __declspec(naked) |
| 2856 void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, |
| 2857 uint8* dst_argb, |
| 2858 struct YuvConstants* yuvconstants, |
| 2859 int width) { |
| 2860 __asm { |
| 2861 push ebp |
| 2862 mov eax, [esp + 4 + 4] // uyvy |
| 2863 mov edx, [esp + 4 + 8] // argb |
| 2864 mov ebp, [esp + 4 + 12] // yuvconstants |
| 2865 mov ecx, [esp + 4 + 16] // width |
| 2866 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
| 2867 |
| 2868 convertloop: |
| 2869 READUYVY |
| 2870 YUVTORGB(ebp) |
| 2871 STOREARGB |
| 2872 |
| 2873 sub ecx, 8 |
| 2874 jg convertloop |
| 2875 |
| 2876 pop ebp |
| 2877 ret |
| 2878 } |
| 2879 } |
| 2880 |
2766 __declspec(naked) | 2881 __declspec(naked) |
2767 void I422ToBGRARow_SSSE3(const uint8* y_buf, | 2882 void I422ToBGRARow_SSSE3(const uint8* y_buf, |
2768 const uint8* u_buf, | 2883 const uint8* u_buf, |
2769 const uint8* v_buf, | 2884 const uint8* v_buf, |
2770 uint8* dst_bgra, | 2885 uint8* dst_bgra, |
2771 struct YuvConstants* yuvconstants, | 2886 struct YuvConstants* yuvconstants, |
2772 int width) { | 2887 int width) { |
2773 __asm { | 2888 __asm { |
2774 push esi | 2889 push esi |
2775 push edi | 2890 push edi |
2776 push ebp | 2891 push ebp |
2777 mov eax, [esp + 12 + 4] // Y | 2892 mov eax, [esp + 12 + 4] // Y |
2778 mov esi, [esp + 12 + 8] // U | 2893 mov esi, [esp + 12 + 8] // U |
2779 mov edi, [esp + 12 + 12] // V | 2894 mov edi, [esp + 12 + 12] // V |
2780 mov edx, [esp + 12 + 16] // argb | 2895 mov edx, [esp + 12 + 16] // argb |
2781 mov ebp, [esp + 12 + 20] // YuvConstants | 2896 mov ebp, [esp + 12 + 20] // yuvconstants |
2782 mov ecx, [esp + 12 + 24] // width | 2897 mov ecx, [esp + 12 + 24] // width |
2783 sub edi, esi | 2898 sub edi, esi |
2784 | 2899 |
2785 convertloop: | 2900 convertloop: |
2786 READYUV422 | 2901 READYUV422 |
2787 YUVTORGB(ebp) | 2902 YUVTORGB(ebp) |
2788 STOREBGRA | 2903 STOREBGRA |
2789 | 2904 |
2790 sub ecx, 8 | 2905 sub ecx, 8 |
2791 jg convertloop | 2906 jg convertloop |
(...skipping 13 matching lines...) Expand all Loading... |
2805 struct YuvConstants* yuvconstants, | 2920 struct YuvConstants* yuvconstants, |
2806 int width) { | 2921 int width) { |
2807 __asm { | 2922 __asm { |
2808 push esi | 2923 push esi |
2809 push edi | 2924 push edi |
2810 push ebp | 2925 push ebp |
2811 mov eax, [esp + 12 + 4] // Y | 2926 mov eax, [esp + 12 + 4] // Y |
2812 mov esi, [esp + 12 + 8] // U | 2927 mov esi, [esp + 12 + 8] // U |
2813 mov edi, [esp + 12 + 12] // V | 2928 mov edi, [esp + 12 + 12] // V |
2814 mov edx, [esp + 12 + 16] // argb | 2929 mov edx, [esp + 12 + 16] // argb |
2815 mov ebp, [esp + 12 + 20] // YuvConstants | 2930 mov ebp, [esp + 12 + 20] // yuvconstants |
2816 mov ecx, [esp + 12 + 24] // width | 2931 mov ecx, [esp + 12 + 24] // width |
2817 sub edi, esi | 2932 sub edi, esi |
2818 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha | 2933 pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha |
2819 | 2934 |
2820 convertloop: | 2935 convertloop: |
2821 READYUV422 | 2936 READYUV422 |
2822 YUVTORGB(ebp) | 2937 YUVTORGB(ebp) |
2823 STOREABGR | 2938 STOREABGR |
2824 | 2939 |
2825 sub ecx, 8 | 2940 sub ecx, 8 |
(...skipping 14 matching lines...) |
2840 struct YuvConstants* yuvconstants, | 2955 struct YuvConstants* yuvconstants, |
2841 int width) { | 2956 int width) { |
2842 __asm { | 2957 __asm { |
2843 push esi | 2958 push esi |
2844 push edi | 2959 push edi |
2845 push ebp | 2960 push ebp |
2846 mov eax, [esp + 12 + 4] // Y | 2961 mov eax, [esp + 12 + 4] // Y |
2847 mov esi, [esp + 12 + 8] // U | 2962 mov esi, [esp + 12 + 8] // U |
2848 mov edi, [esp + 12 + 12] // V | 2963 mov edi, [esp + 12 + 12] // V |
2849 mov edx, [esp + 12 + 16] // argb | 2964 mov edx, [esp + 12 + 16] // argb |
2850 mov ebp, [esp + 12 + 20] // YuvConstants | 2965 mov ebp, [esp + 12 + 20] // yuvconstants |
2851 mov ecx, [esp + 12 + 24] // width | 2966 mov ecx, [esp + 12 + 24] // width |
2852 sub edi, esi | 2967 sub edi, esi |
2853 | 2968 |
2854 convertloop: | 2969 convertloop: |
2855 READYUV422 | 2970 READYUV422 |
2856 YUVTORGB(ebp) | 2971 YUVTORGB(ebp) |
2857 STORERGBA | 2972 STORERGBA |
2858 | 2973 |
2859 sub ecx, 8 | 2974 sub ecx, 8 |
2860 jg convertloop | 2975 jg convertloop |
(...skipping 644 matching lines...) |
3505 mov ecx, [esp + 12] // count | 3620 mov ecx, [esp + 12] // count |
3506 rep stosd | 3621 rep stosd |
3507 mov edi, edx | 3622 mov edi, edx |
3508 ret | 3623 ret |
3509 } | 3624 } |
3510 } | 3625 } |
3511 #endif // HAS_SETROW_X86 | 3626 #endif // HAS_SETROW_X86 |
3512 | 3627 |
3513 #ifdef HAS_YUY2TOYROW_AVX2 | 3628 #ifdef HAS_YUY2TOYROW_AVX2 |
3514 __declspec(naked) | 3629 __declspec(naked) |
3515 void YUY2ToYRow_AVX2(const uint8* src_yuy2, | 3630 void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { |
3516 uint8* dst_y, int pix) { | |
3517 __asm { | 3631 __asm { |
3518 mov eax, [esp + 4] // src_yuy2 | 3632 mov eax, [esp + 4] // src_yuy2 |
3519 mov edx, [esp + 8] // dst_y | 3633 mov edx, [esp + 8] // dst_y |
3520 mov ecx, [esp + 12] // pix | 3634 mov ecx, [esp + 12] // pix |
3521 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff | 3635 vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff00ff |
3522 vpsrlw ymm5, ymm5, 8 | 3636 vpsrlw ymm5, ymm5, 8 |
3523 | 3637 |
3524 convertloop: | 3638 convertloop: |
3525 vmovdqu ymm0, [eax] | 3639 vmovdqu ymm0, [eax] |
3526 vmovdqu ymm1, [eax + 32] | 3640 vmovdqu ymm1, [eax + 32] |
(...skipping 2733 matching lines...) |
6260 } | 6374 } |
6261 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 | 6375 #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 |
6262 | 6376 |
6263 #endif // defined(_M_X64) | 6377 #endif // defined(_M_X64) |
6264 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) | 6378 #endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) |
6265 | 6379 |
6266 #ifdef __cplusplus | 6380 #ifdef __cplusplus |
6267 } // extern "C" | 6381 } // extern "C" |
6268 } // namespace libyuv | 6382 } // namespace libyuv |
6269 #endif | 6383 #endif |