OLD | NEW |
1 // Copyright 2014 Google Inc. All Rights Reserved. | 1 // Copyright 2014 Google Inc. All Rights Reserved. |
2 // | 2 // |
3 // Use of this source code is governed by a BSD-style license | 3 // Use of this source code is governed by a BSD-style license |
4 // that can be found in the COPYING file in the root of the source | 4 // that can be found in the COPYING file in the root of the source |
5 // tree. An additional intellectual property rights grant can be found | 5 // tree. An additional intellectual property rights grant can be found |
6 // in the file PATENTS. All contributing project authors may | 6 // in the file PATENTS. All contributing project authors may |
7 // be found in the AUTHORS file in the root of the source tree. | 7 // be found in the AUTHORS file in the root of the source tree. |
8 // ----------------------------------------------------------------------------- | 8 // ----------------------------------------------------------------------------- |
9 // | 9 // |
10 // YUV->RGB conversion functions | 10 // YUV->RGB conversion functions |
11 // | 11 // |
12 // Author: Skal (pascal.massimino@gmail.com) | 12 // Author: Skal (pascal.massimino@gmail.com) |
13 | 13 |
14 #include "./yuv.h" | 14 #include "./yuv.h" |
15 | 15 |
16 #if defined(WEBP_USE_SSE2) | 16 #if defined(WEBP_USE_SSE2) |
17 | 17 |
18 #include <emmintrin.h> | 18 #include <emmintrin.h> |
19 #include <string.h> // for memcpy | |
20 | |
21 typedef union { // handy struct for converting SSE2 registers | |
22 int32_t i32[4]; | |
23 uint8_t u8[16]; | |
24 __m128i m; | |
25 } VP8kCstSSE2; | |
26 | |
27 #if defined(WEBP_YUV_USE_SSE2_TABLES) | |
28 | |
29 #include "./yuv_tables_sse2.h" | |
30 | |
31 void VP8YUVInitSSE2(void) {} | |
32 | |
33 #else | |
34 | |
35 static int done_sse2 = 0; | |
36 static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256]; | |
37 | |
38 void VP8YUVInitSSE2(void) { | |
39 if (!done_sse2) { | |
40 int i; | |
41 for (i = 0; i < 256; ++i) { | |
42 VP8kYtoRGBA[i].i32[0] = | |
43 VP8kYtoRGBA[i].i32[1] = | |
44 VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2; | |
45 VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2; | |
46 | |
47 VP8kUtoRGBA[i].i32[0] = 0; | |
48 VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128); | |
49 VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128); | |
50 VP8kUtoRGBA[i].i32[3] = 0; | |
51 | |
52 VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128); | |
53 VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128); | |
54 VP8kVtoRGBA[i].i32[2] = 0; | |
55 VP8kVtoRGBA[i].i32[3] = 0; | |
56 } | |
57 done_sse2 = 1; | |
58 | |
59 #if 0 // code used to generate 'yuv_tables_sse2.h' | |
60 printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n"); | |
61 for (i = 0; i < 256; ++i) { | |
62 printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n", | |
63 VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1], | |
64 VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]); | |
65 } | |
66 printf("};\n\n"); | |
67 printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n"); | |
68 for (i = 0; i < 256; ++i) { | |
69 printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n", | |
70 VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]); | |
71 } | |
72 printf("};\n\n"); | |
73 printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n"); | |
74 for (i = 0; i < 256; ++i) { | |
75 printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n", | |
76 VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]); | |
77 } | |
78 printf("};\n\n"); | |
79 #endif | |
80 } | |
81 } | |
82 | |
83 #endif // WEBP_YUV_USE_SSE2_TABLES | |
84 | |
85 //----------------------------------------------------------------------------- | |
86 | |
87 static WEBP_INLINE __m128i LoadUVPart(int u, int v) { | |
88 const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m); | |
89 const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m); | |
90 const __m128i uv_part = _mm_add_epi32(u_part, v_part); | |
91 return uv_part; | |
92 } | |
93 | |
94 static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) { | |
95 const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m); | |
96 const __m128i rgba1 = _mm_add_epi32(y_part, uv_part); | |
97 const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2); | |
98 return rgba2; | |
99 } | |
100 | |
101 static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) { | |
102 const __m128i uv_part = LoadUVPart(u, v); | |
103 return GetRGBA32bWithUV(y, uv_part); | |
104 } | |
105 | |
106 static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v, | |
107 uint8_t* const rgb) { | |
108 const __m128i tmp0 = GetRGBA32b(y, u, v); | |
109 const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0); | |
110 const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1); | |
111 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp | |
112 _mm_storel_epi64((__m128i*)rgb, tmp2); | |
113 } | |
114 | |
115 static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v, | |
116 uint8_t* const bgr) { | |
117 const __m128i tmp0 = GetRGBA32b(y, u, v); | |
118 const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2)); | |
119 const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1); | |
120 const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2); | |
121 // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp | |
122 _mm_storel_epi64((__m128i*)bgr, tmp3); | |
123 } | |
124 | 19 |
125 //----------------------------------------------------------------------------- | 20 //----------------------------------------------------------------------------- |
126 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. | 21 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler. |
127 | 22 |
128 #ifdef FANCY_UPSAMPLING | 23 // These constants are a 14b fixed-point version of the ITU-R BT.601 constants. |
| 24 // R = (19077 * y + 26149 * v - 14234) >> 6 |
| 25 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 |
| 26 // B = (19077 * y + 33050 * u - 17685) >> 6 |
| 27 static void ConvertYUV444ToRGB(const __m128i* const Y0, |
| 28 const __m128i* const U0, |
| 29 const __m128i* const V0, |
| 30 __m128i* const R, |
| 31 __m128i* const G, |
| 32 __m128i* const B) { |
| 33 const __m128i k19077 = _mm_set1_epi16(19077); |
| 34 const __m128i k26149 = _mm_set1_epi16(26149); |
| 35 const __m128i k14234 = _mm_set1_epi16(14234); |
| 36 const __m128i k33050 = _mm_set1_epi16(33050); |
| 37 const __m128i k17685 = _mm_set1_epi16(17685); |
| 38 const __m128i k6419 = _mm_set1_epi16(6419); |
| 39 const __m128i k13320 = _mm_set1_epi16(13320); |
| 40 const __m128i k8708 = _mm_set1_epi16(8708); |
| 41 |
| 42 const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077); |
| 43 |
| 44 const __m128i R0 = _mm_mulhi_epu16(*V0, k26149); |
| 45 const __m128i R1 = _mm_sub_epi16(Y1, k14234); |
| 46 const __m128i R2 = _mm_add_epi16(R1, R0); |
| 47 |
| 48 const __m128i G0 = _mm_mulhi_epu16(*U0, k6419); |
| 49 const __m128i G1 = _mm_mulhi_epu16(*V0, k13320); |
| 50 const __m128i G2 = _mm_add_epi16(Y1, k8708); |
| 51 const __m128i G3 = _mm_add_epi16(G0, G1); |
| 52 const __m128i G4 = _mm_sub_epi16(G2, G3); |
| 53 |
| 54 // be careful with the saturated *unsigned* arithmetic here! |
| 55 const __m128i B0 = _mm_mulhi_epu16(*U0, k33050); |
| 56 const __m128i B1 = _mm_adds_epu16(B0, Y1); |
| 57 const __m128i B2 = _mm_subs_epu16(B1, k17685); |
| 58 |
| 59 // use logical shift for B2, which can be larger than 32767 |
| 60 *R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815] |
| 61 *G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710] |
| 62 *B = _mm_srli_epi16(B2, 6); // range: [0, 34238] |
| 63 } |
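
For reference, here is the per-pixel arithmetic the vector code above performs, written out in scalar form (an illustrative sketch, not part of the patch). Loading a byte into the upper half of a 16b lane (see Load_HI_16() below) makes _mm_mulhi_epu16(x << 8, k) compute (x * k) >> 8:

```c
// Illustrative scalar equivalent of ConvertYUV444ToRGB() for one pixel.
// The final clamping to [0, 255] is done later by _mm_packus_epi16().
static void ScalarYUV444ToRGB(int y, int u, int v, int* r, int* g, int* b) {
  const int y1 = (19077 * y) >> 8;
  *r = (y1 + ((26149 * v) >> 8) - 14234) >> 6;
  *g = (y1 + 8708 - ((6419 * u) >> 8) - ((13320 * v) >> 8)) >> 6;
  *b = (y1 + ((33050 * u) >> 8) - 17685) >> 6;  // the SSE2 path saturates at 0 here
}
```
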
| 64 |
| 65 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically. |
| 66 static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) { |
| 67 const __m128i zero = _mm_setzero_si128(); |
| 68 return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); |
| 69 } |
| 70 |
| 71 // Load and replicate the U/V samples |
| 72 static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) { |
| 73 const __m128i zero = _mm_setzero_si128(); |
| 74 const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src); |
| 75 const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); |
| 76 return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples |
| 77 } |
| 78 |
| 79 // Convert 32 samples of YUV444 to R/G/B |
| 80 static void YUV444ToRGB(const uint8_t* const y, |
| 81 const uint8_t* const u, |
| 82 const uint8_t* const v, |
| 83 __m128i* const R, __m128i* const G, __m128i* const B) { |
| 84 const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v); |
| 85 ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); |
| 86 } |
| 87 |
| 88 // Convert 32 samples of YUV420 to R/G/B |
| 89 static void YUV420ToRGB(const uint8_t* const y, |
| 90 const uint8_t* const u, |
| 91 const uint8_t* const v, |
| 92 __m128i* const R, __m128i* const G, __m128i* const B) { |
| 93 const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v); |
| 94 ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B); |
| 95 } |
| 96 |
| 97 // Pack R/G/B/A results into 32b output. |
| 98 static WEBP_INLINE void PackAndStore4(const __m128i* const R, |
| 99 const __m128i* const G, |
| 100 const __m128i* const B, |
| 101 const __m128i* const A, |
| 102 uint8_t* const dst) { |
| 103 const __m128i rb = _mm_packus_epi16(*R, *B); |
| 104 const __m128i ga = _mm_packus_epi16(*G, *A); |
| 105 const __m128i rg = _mm_unpacklo_epi8(rb, ga); |
| 106 const __m128i ba = _mm_unpackhi_epi8(rb, ga); |
| 107 const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba); |
| 108 const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba); |
| 109 _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo); |
| 110 _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi); |
| 111 } |
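
As a reading aid (illustrative sketch, not from the patch), PackAndStore4() saturates each channel's eight 16-bit values to 8 bits and interleaves them as R,G,B,A byte quads. The helper names below are hypothetical:

```c
#include <stdint.h>

static uint8_t Clamp255(int v) {   // mimics the saturation of _mm_packus_epi16()
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

// Hypothetical scalar equivalent of PackAndStore4() for eight pixels.
static void ScalarPackRGBA(const int16_t R[8], const int16_t G[8],
                           const int16_t B[8], const int16_t A[8],
                           uint8_t dst[32]) {
  int i;
  for (i = 0; i < 8; ++i) {
    dst[4 * i + 0] = Clamp255(R[i]);
    dst[4 * i + 1] = Clamp255(G[i]);
    dst[4 * i + 2] = Clamp255(B[i]);
    dst[4 * i + 3] = Clamp255(A[i]);
  }
}
```
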
| 112 |
| 113 // Pack R/G/B/A results into 16b output. |
| 114 static WEBP_INLINE void PackAndStore4444(const __m128i* const R, |
| 115 const __m128i* const G, |
| 116 const __m128i* const B, |
| 117 const __m128i* const A, |
| 118 uint8_t* const dst) { |
| 119 #if !defined(WEBP_SWAP_16BIT_CSP) |
| 120 const __m128i rg0 = _mm_packus_epi16(*R, *G); |
| 121 const __m128i ba0 = _mm_packus_epi16(*B, *A); |
| 122 #else |
| 123 const __m128i rg0 = _mm_packus_epi16(*B, *A); |
| 124 const __m128i ba0 = _mm_packus_epi16(*R, *G); |
| 125 #endif |
| 126 const __m128i mask_0xf0 = _mm_set1_epi8(0xf0); |
| 127 const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... |
| 128 const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... |
| 129 const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); |
| 130 const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4); |
| 131 const __m128i rgba4444 = _mm_or_si128(rb2, ga2); |
| 132 _mm_storeu_si128((__m128i*)dst, rgba4444); |
| 133 } |
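
The nibble shuffling above boils down to two bytes per pixel. A hypothetical scalar sketch (assuming WEBP_SWAP_16BIT_CSP is not defined and r/g/b/a already clamped to 8 bits):

```c
#include <stdint.h>

// Illustrative scalar sketch of PackAndStore4444() for one pixel.
static void ScalarPack4444(uint8_t r, uint8_t g, uint8_t b, uint8_t a,
                           uint8_t out[2]) {
  out[0] = (uint8_t)((r & 0xf0) | (g >> 4));   // RRRRGGGG
  out[1] = (uint8_t)((b & 0xf0) | (a >> 4));   // BBBBAAAA
}
```
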
| 134 |
| 135 // Pack R/G/B results into 16b output. |
| 136 static WEBP_INLINE void PackAndStore565(const __m128i* const R, |
| 137 const __m128i* const G, |
| 138 const __m128i* const B, |
| 139 uint8_t* const dst) { |
| 140 const __m128i r0 = _mm_packus_epi16(*R, *R); |
| 141 const __m128i g0 = _mm_packus_epi16(*G, *G); |
| 142 const __m128i b0 = _mm_packus_epi16(*B, *B); |
| 143 const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8)); |
| 144 const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); |
| 145 const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5); |
| 146 const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); |
| 147 const __m128i rg = _mm_or_si128(r1, g1); |
| 148 const __m128i gb = _mm_or_si128(g2, b1); |
| 149 #if !defined(WEBP_SWAP_16BIT_CSP) |
| 150 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); |
| 151 #else |
| 152 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); |
| 153 #endif |
| 154 _mm_storeu_si128((__m128i*)dst, rgb565); |
| 155 } |
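
Likewise, a hypothetical scalar sketch of the 565 byte layout (assuming WEBP_SWAP_16BIT_CSP is not defined and r/g/b already clamped to 8 bits):

```c
#include <stdint.h>

// Illustrative scalar sketch of PackAndStore565() for one pixel.
static void ScalarPack565(uint8_t r, uint8_t g, uint8_t b, uint8_t out[2]) {
  out[0] = (uint8_t)((r & 0xf8) | (g >> 5));          // RRRRRGGG
  out[1] = (uint8_t)(((g << 3) & 0xe0) | (b >> 3));   // GGGBBBBB
}
```
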
| 156 |
| 157 // Function used several times in PlanarTo24b. |
| 158 // It samples the in buffer as follows: every other unsigned char is stored |
| 159 // at the beginning of the buffer, while the other half is stored at the end. |
| 160 static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/, |
| 161 __m128i* const out /*out[6]*/) { |
| 162 const __m128i v_mask = _mm_set1_epi16(0x00ff); |
| 163 |
| 164 // Take one every two upper 8b values. |
| 165 out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask), |
| 166 _mm_and_si128(in[1], v_mask)); |
| 167 out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask), |
| 168 _mm_and_si128(in[3], v_mask)); |
| 169 out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask), |
| 170 _mm_and_si128(in[5], v_mask)); |
| 171 // Take one every two lower 8b values. |
| 172 out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8)); |
| 173 out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8)); |
| 174 out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8)); |
| 175 } |
| 176 |
| 177 // Pack the planar buffers |
| 178 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... |
| 179 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... |
| 180 static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) { |
| 181 // The input is 6 registers of sixteen 8b values each, but for the sake of explanation, |
| 182 // let's take 6 registers of four 8b values. |
| 183 // To pack, we repeatedly take every other 8b integer and move it |
| 184 // around as follows: |
| 185 // Input: |
| 186 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 |
| 187 // Split the 6 registers into two sets of 3 registers: the first set holds the even |
| 188 // bytes, the second the odd ones: |
| 189 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 |
| 190 // Repeat the same permutations twice more: |
| 191 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 |
| 192 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 |
| 193 __m128i tmp[6]; |
| 194 PlanarTo24bHelper(in, tmp); |
| 195 PlanarTo24bHelper(tmp, in); |
| 196 PlanarTo24bHelper(in, tmp); |
| 197 // We need two more passes than in the example above since we have sixteen bytes. |
| 198 PlanarTo24bHelper(tmp, in); |
| 199 PlanarTo24bHelper(in, tmp); |
| 200 |
| 201 _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]); |
| 202 _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]); |
| 203 _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]); |
| 204 _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]); |
| 205 _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]); |
| 206 _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]); |
| 207 } |
| 208 #undef MK_UINT32 |
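
The net effect of PlanarTo24b() is easier to see in scalar form (illustrative reference only, not from the patch): three 32-byte planes go in, interleaved RGB triplets come out.

```c
#include <stdint.h>

// Hypothetical scalar reference for the result of PlanarTo24b():
// in[0..1] hold 32 R values, in[2..3] 32 G values, in[4..5] 32 B values.
static void ScalarPlanarTo24b(const uint8_t r[32], const uint8_t g[32],
                              const uint8_t b[32], uint8_t rgb[96]) {
  int i;
  for (i = 0; i < 32; ++i) {
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}
```
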
129 | 209 |
130 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 210 void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
131 uint8_t* dst) { | 211 uint8_t* dst) { |
132 int n; | 212 const __m128i kAlpha = _mm_set1_epi16(255); |
133 for (n = 0; n < 32; n += 4) { | 213 int n; |
134 const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); | 214 for (n = 0; n < 32; n += 8, dst += 32) { |
135 const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); | 215 __m128i R, G, B; |
136 const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]); | 216 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
137 const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]); | 217 PackAndStore4(&R, &G, &B, &kAlpha, dst); |
138 const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); | |
139 const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); | |
140 const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); | |
141 _mm_storeu_si128((__m128i*)dst, tmp2); | |
142 dst += 4 * 4; | |
143 } | 218 } |
144 } | 219 } |
145 | 220 |
146 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 221 void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
147 uint8_t* dst) { | 222 uint8_t* dst) { |
148 int n; | 223 const __m128i kAlpha = _mm_set1_epi16(255); |
149 for (n = 0; n < 32; n += 2) { | 224 int n; |
150 const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); | 225 for (n = 0; n < 32; n += 8, dst += 32) { |
151 const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); | 226 __m128i R, G, B; |
152 const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); | 227 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
153 const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); | 228 PackAndStore4(&B, &G, &R, &kAlpha, dst); |
154 const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); | 229 } |
155 const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); | 230 } |
156 _mm_storel_epi64((__m128i*)dst, tmp3); | 231 |
157 dst += 4 * 2; | 232 void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 233 uint8_t* dst) { |
| 234 const __m128i kAlpha = _mm_set1_epi16(255); |
| 235 int n; |
| 236 for (n = 0; n < 32; n += 8, dst += 32) { |
| 237 __m128i R, G, B; |
| 238 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
| 239 PackAndStore4(&kAlpha, &R, &G, &B, dst); |
| 240 } |
| 241 } |
| 242 |
| 243 void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 244 uint8_t* dst) { |
| 245 const __m128i kAlpha = _mm_set1_epi16(255); |
| 246 int n; |
| 247 for (n = 0; n < 32; n += 8, dst += 16) { |
| 248 __m128i R, G, B; |
| 249 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
| 250 PackAndStore4444(&R, &G, &B, &kAlpha, dst); |
| 251 } |
| 252 } |
| 253 |
| 254 void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 255 uint8_t* dst) { |
| 256 int n; |
| 257 for (n = 0; n < 32; n += 8, dst += 16) { |
| 258 __m128i R, G, B; |
| 259 YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B); |
| 260 PackAndStore565(&R, &G, &B, dst); |
158 } | 261 } |
159 } | 262 } |
160 | 263 |
161 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 264 void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
162 uint8_t* dst) { | 265 uint8_t* dst) { |
163 int n; | 266 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
164 uint8_t tmp0[2 * 3 + 5 + 15]; | 267 __m128i rgb[6]; |
165 uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align | 268 |
166 for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory | 269 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
167 YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3); | 270 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); |
168 } | 271 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); |
169 // Last two pixels are special: we write in a tmp buffer before sending | 272 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); |
170 // to dst. | 273 |
171 YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); | 274 // Cast to 8b and store as RRRRGGGGBBBB. |
172 YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); | 275 rgb[0] = _mm_packus_epi16(R0, R1); |
173 memcpy(dst + n * 3, tmp, 2 * 3); | 276 rgb[1] = _mm_packus_epi16(R2, R3); |
| 277 rgb[2] = _mm_packus_epi16(G0, G1); |
| 278 rgb[3] = _mm_packus_epi16(G2, G3); |
| 279 rgb[4] = _mm_packus_epi16(B0, B1); |
| 280 rgb[5] = _mm_packus_epi16(B2, B3); |
| 281 |
| 282 // Pack as RGBRGBRGBRGB. |
| 283 PlanarTo24b(rgb, dst); |
174 } | 284 } |
175 | 285 |
176 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, | 286 void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
177 uint8_t* dst) { | 287 uint8_t* dst) { |
178 int n; | 288 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
179 uint8_t tmp0[2 * 3 + 5 + 15]; | 289 __m128i bgr[6]; |
180 uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align | 290 |
181 for (n = 0; n < 30; ++n) { | 291 YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
182 YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3); | 292 YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1); |
183 } | 293 YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2); |
184 YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0); | 294 YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3); |
185 YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3); | 295 |
186 memcpy(dst + n * 3, tmp, 2 * 3); | 296 // Cast to 8b and store as BBBBGGGGRRRR. |
187 } | 297 bgr[0] = _mm_packus_epi16(B0, B1); |
188 | 298 bgr[1] = _mm_packus_epi16(B2, B3); |
189 #endif // FANCY_UPSAMPLING | 299 bgr[2] = _mm_packus_epi16(G0, G1); |
| 300 bgr[3] = _mm_packus_epi16(G2, G3); |
| 301 bgr[4] = _mm_packus_epi16(R0, R1); |
| 302 bgr[5] = _mm_packus_epi16(R2, R3); |
| 303 |
| 304 // Pack as BGRBGRBGRBGR. |
| 305 PlanarTo24b(bgr, dst); |
| 306 } |
190 | 307 |
191 //----------------------------------------------------------------------------- | 308 //----------------------------------------------------------------------------- |
192 // Arbitrary-length row conversion functions | 309 // Arbitrary-length row conversion functions |
193 | 310 |
194 static void YuvToRgbaRowSSE2(const uint8_t* y, | 311 static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
195 const uint8_t* u, const uint8_t* v, | 312 uint8_t* dst, int len) { |
196 uint8_t* dst, int len) { | 313 const __m128i kAlpha = _mm_set1_epi16(255); |
197 int n; | 314 int n; |
198 for (n = 0; n + 4 <= len; n += 4) { | 315 for (n = 0; n + 8 <= len; n += 8, dst += 32) { |
199 const __m128i uv_0 = LoadUVPart(u[0], v[0]); | 316 __m128i R, G, B; |
200 const __m128i uv_1 = LoadUVPart(u[1], v[1]); | 317 YUV420ToRGB(y, u, v, &R, &G, &B); |
201 const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); | 318 PackAndStore4(&R, &G, &B, &kAlpha, dst); |
202 const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); | 319 y += 8; |
203 const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1); | 320 u += 4; |
204 const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1); | 321 v += 4; |
205 const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2); | 322 } |
206 const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4); | 323 for (; n < len; ++n) { // Finish off |
207 const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2); | |
208 _mm_storeu_si128((__m128i*)dst, tmp2); | |
209 dst += 4 * 4; | |
210 y += 4; | |
211 u += 2; | |
212 v += 2; | |
213 } | |
214 // Finish off | |
215 while (n < len) { | |
216 VP8YuvToRgba(y[0], u[0], v[0], dst); | 324 VP8YuvToRgba(y[0], u[0], v[0], dst); |
217 dst += 4; | 325 dst += 4; |
218 ++y; | 326 y += 1; |
219 u += (n & 1); | 327 u += (n & 1); |
220 v += (n & 1); | 328 v += (n & 1); |
221 ++n; | 329 } |
222 } | 330 } |
223 } | 331 |
224 | 332 static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
225 static void YuvToBgraRowSSE2(const uint8_t* y, | 333 uint8_t* dst, int len) { |
226 const uint8_t* u, const uint8_t* v, | 334 const __m128i kAlpha = _mm_set1_epi16(255); |
227 uint8_t* dst, int len) { | 335 int n; |
228 int n; | 336 for (n = 0; n + 8 <= len; n += 8, dst += 32) { |
229 for (n = 0; n + 2 <= len; n += 2) { | 337 __m128i R, G, B; |
230 const __m128i uv_0 = LoadUVPart(u[0], v[0]); | 338 YUV420ToRGB(y, u, v, &R, &G, &B); |
231 const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); | 339 PackAndStore4(&B, &G, &R, &kAlpha, dst); |
232 const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); | 340 y += 8; |
233 const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); | 341 u += 4; |
234 const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); | 342 v += 4; |
235 const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); | 343 } |
236 const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); | 344 for (; n < len; ++n) { // Finish off |
237 _mm_storel_epi64((__m128i*)dst, tmp3); | |
238 dst += 4 * 2; | |
239 y += 2; | |
240 ++u; | |
241 ++v; | |
242 } | |
243 // Finish off | |
244 if (len & 1) { | |
245 VP8YuvToBgra(y[0], u[0], v[0], dst); | 345 VP8YuvToBgra(y[0], u[0], v[0], dst); |
246 } | 346 dst += 4; |
247 } | 347 y += 1; |
248 | |
249 static void YuvToArgbRowSSE2(const uint8_t* y, | |
250 const uint8_t* u, const uint8_t* v, | |
251 uint8_t* dst, int len) { | |
252 int n; | |
253 for (n = 0; n + 2 <= len; n += 2) { | |
254 const __m128i uv_0 = LoadUVPart(u[0], v[0]); | |
255 const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); | |
256 const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); | |
257 const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3)); | |
258 const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3)); | |
259 const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); | |
260 const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); | |
261 _mm_storel_epi64((__m128i*)dst, tmp3); | |
262 dst += 4 * 2; | |
263 y += 2; | |
264 ++u; | |
265 ++v; | |
266 } | |
267 // Finish off | |
268 if (len & 1) { | |
269 VP8YuvToArgb(y[0], u[0], v[0], dst); | |
270 } | |
271 } | |
272 | |
273 static void YuvToRgbRowSSE2(const uint8_t* y, | |
274 const uint8_t* u, const uint8_t* v, | |
275 uint8_t* dst, int len) { | |
276 int n; | |
277 for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory | |
278 YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes | |
279 dst += 3; | |
280 ++y; | |
281 u += (n & 1); | 348 u += (n & 1); |
282 v += (n & 1); | 349 v += (n & 1); |
283 } | 350 } |
284 VP8YuvToRgb(y[0], u[0], v[0], dst); | 351 } |
285 if (len > 1) { | 352 |
286 VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3); | 353 static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
287 } | 354 uint8_t* dst, int len) { |
288 } | 355 const __m128i kAlpha = _mm_set1_epi16(255); |
289 | 356 int n; |
290 static void YuvToBgrRowSSE2(const uint8_t* y, | 357 for (n = 0; n + 8 <= len; n += 8, dst += 32) { |
291 const uint8_t* u, const uint8_t* v, | 358 __m128i R, G, B; |
292 uint8_t* dst, int len) { | 359 YUV420ToRGB(y, u, v, &R, &G, &B); |
293 int n; | 360 PackAndStore4(&kAlpha, &R, &G, &B, dst); |
294 for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory | 361 y += 8; |
295 YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes | 362 u += 4; |
296 dst += 3; | 363 v += 4; |
297 ++y; | 364 } |
| 365 for (; n < len; ++n) { // Finish off |
| 366 VP8YuvToArgb(y[0], u[0], v[0], dst); |
| 367 dst += 4; |
| 368 y += 1; |
298 u += (n & 1); | 369 u += (n & 1); |
299 v += (n & 1); | 370 v += (n & 1); |
300 } | 371 } |
301 VP8YuvToBgr(y[0], u[0], v[0], dst + 0); | 372 } |
302 if (len > 1) { | 373 |
303 VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3); | 374 static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
304 } | 375 uint8_t* dst, int len) { |
305 } | 376 int n; |
306 | 377 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { |
307 #endif // WEBP_USE_SSE2 | 378 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 379 __m128i rgb[6]; |
| 380 |
| 381 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 382 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); |
| 383 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); |
| 384 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); |
| 385 |
| 386 // Cast to 8b and store as RRRRGGGGBBBB. |
| 387 rgb[0] = _mm_packus_epi16(R0, R1); |
| 388 rgb[1] = _mm_packus_epi16(R2, R3); |
| 389 rgb[2] = _mm_packus_epi16(G0, G1); |
| 390 rgb[3] = _mm_packus_epi16(G2, G3); |
| 391 rgb[4] = _mm_packus_epi16(B0, B1); |
| 392 rgb[5] = _mm_packus_epi16(B2, B3); |
| 393 |
| 394 // Pack as RGBRGBRGBRGB. |
| 395 PlanarTo24b(rgb, dst); |
| 396 |
| 397 y += 32; |
| 398 u += 16; |
| 399 v += 16; |
| 400 } |
| 401 for (; n < len; ++n) { // Finish off |
| 402 VP8YuvToRgb(y[0], u[0], v[0], dst); |
| 403 dst += 3; |
| 404 y += 1; |
| 405 u += (n & 1); |
| 406 v += (n & 1); |
| 407 } |
| 408 } |
| 409 |
| 410 static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v, |
| 411 uint8_t* dst, int len) { |
| 412 int n; |
| 413 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { |
| 414 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; |
| 415 __m128i bgr[6]; |
| 416 |
| 417 YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0); |
| 418 YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1); |
| 419 YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2); |
| 420 YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3); |
| 421 |
| 422 // Cast to 8b and store as BBBBGGGGRRRR. |
| 423 bgr[0] = _mm_packus_epi16(B0, B1); |
| 424 bgr[1] = _mm_packus_epi16(B2, B3); |
| 425 bgr[2] = _mm_packus_epi16(G0, G1); |
| 426 bgr[3] = _mm_packus_epi16(G2, G3); |
| 427 bgr[4] = _mm_packus_epi16(R0, R1); |
| 428 bgr[5] = _mm_packus_epi16(R2, R3); |
| 429 |
| 430 // Pack as BGRBGRBGRBGR. |
| 431 PlanarTo24b(bgr, dst); |
| 432 |
| 433 y += 32; |
| 434 u += 16; |
| 435 v += 16; |
| 436 } |
| 437 for (; n < len; ++n) { // Finish off |
| 438 VP8YuvToBgr(y[0], u[0], v[0], dst); |
| 439 dst += 3; |
| 440 y += 1; |
| 441 u += (n & 1); |
| 442 v += (n & 1); |
| 443 } |
| 444 } |
308 | 445 |
309 //------------------------------------------------------------------------------ | 446 //------------------------------------------------------------------------------ |
310 // Entry point | 447 // Entry point |
311 | 448 |
312 extern void WebPInitSamplersSSE2(void); | 449 extern void WebPInitSamplersSSE2(void); |
313 | 450 |
314 void WebPInitSamplersSSE2(void) { | 451 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { |
315 #if defined(WEBP_USE_SSE2) | 452 WebPSamplers[MODE_RGB] = YuvToRgbRow; |
316 WebPSamplers[MODE_RGB] = YuvToRgbRowSSE2; | 453 WebPSamplers[MODE_RGBA] = YuvToRgbaRow; |
317 WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2; | 454 WebPSamplers[MODE_BGR] = YuvToBgrRow; |
318 WebPSamplers[MODE_BGR] = YuvToBgrRowSSE2; | 455 WebPSamplers[MODE_BGRA] = YuvToBgraRow; |
319 WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2; | 456 WebPSamplers[MODE_ARGB] = YuvToArgbRow; |
320 WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2; | 457 } |
| 458 |
| 459 //------------------------------------------------------------------------------ |
| 460 // RGB24/32 -> YUV converters |
| 461 |
| 462 // Load eight 16b-words from *src. |
| 463 #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src)) |
| 464 // Store eight 16b-words into *dst |
| 465 #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V)) |
| 466 |
| 467 // Function that inserts a value from the second half of the in buffer in between |
| 468 // every two chars of the first half. |
| 469 static WEBP_INLINE void RGB24PackedToPlanarHelper( |
| 470 const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { |
| 471 out[0] = _mm_unpacklo_epi8(in[0], in[3]); |
| 472 out[1] = _mm_unpackhi_epi8(in[0], in[3]); |
| 473 out[2] = _mm_unpacklo_epi8(in[1], in[4]); |
| 474 out[3] = _mm_unpackhi_epi8(in[1], in[4]); |
| 475 out[4] = _mm_unpacklo_epi8(in[2], in[5]); |
| 476 out[5] = _mm_unpackhi_epi8(in[2], in[5]); |
| 477 } |
| 478 |
| 479 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: |
| 480 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... |
| 481 // Similar to PlanarTo24bHelper(), but in reverse order. |
| 482 static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb, |
| 483 __m128i* const out /*out[6]*/) { |
| 484 __m128i tmp[6]; |
| 485 tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); |
| 486 tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); |
| 487 tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32)); |
| 488 tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48)); |
| 489 tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); |
| 490 tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); |
| 491 |
| 492 RGB24PackedToPlanarHelper(tmp, out); |
| 493 RGB24PackedToPlanarHelper(out, tmp); |
| 494 RGB24PackedToPlanarHelper(tmp, out); |
| 495 RGB24PackedToPlanarHelper(out, tmp); |
| 496 RGB24PackedToPlanarHelper(tmp, out); |
| 497 } |
| 498 |
| 499 // Convert 8 packed ARGB to r[], g[], b[] |
| 500 static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb, |
| 501 __m128i* const r, |
| 502 __m128i* const g, |
| 503 __m128i* const b) { |
| 504 const __m128i zero = _mm_setzero_si128(); |
| 505 const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0 |
| 506 const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4 |
| 507 // column-wise transpose |
| 508 const __m128i A0 = _mm_unpacklo_epi8(in0, in1); |
| 509 const __m128i A1 = _mm_unpackhi_epi8(in0, in1); |
| 510 const __m128i B0 = _mm_unpacklo_epi8(A0, A1); |
| 511 const __m128i B1 = _mm_unpackhi_epi8(A0, A1); |
| 512 // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0 |
| 513 // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0 |
| 514 const __m128i C0 = _mm_unpacklo_epi8(B0, B1); |
| 515 const __m128i C1 = _mm_unpackhi_epi8(B0, B1); |
| 516 // store 16b |
| 517 *r = _mm_unpacklo_epi8(C1, zero); |
| 518 *g = _mm_unpackhi_epi8(C0, zero); |
| 519 *b = _mm_unpacklo_epi8(C0, zero); |
| 520 } |
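
The byte transpose above is compact but cryptic; in scalar terms (an illustrative sketch with hypothetical names) it simply splits eight packed 0xAARRGGBB pixels into per-channel 16-bit lanes:

```c
#include <stdint.h>

// Hypothetical scalar equivalent of RGB32PackedToPlanar(); alpha is dropped.
static void ScalarARGBToPlanar(const uint32_t argb[8],
                               uint16_t r[8], uint16_t g[8], uint16_t b[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    r[i] = (uint16_t)((argb[i] >> 16) & 0xff);
    g[i] = (uint16_t)((argb[i] >> 8) & 0xff);
    b[i] = (uint16_t)((argb[i] >> 0) & 0xff);
  }
}
```
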
| 521 |
| 522 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX |
| 523 // It's a macro and not a function because we need to pass immediate values to |
| 524 // intrinsics such as srai_epi32. |
| 525 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \ |
| 526 ROUNDER, DESCALE_FIX, OUT) do { \ |
| 527 const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \ |
| 528 const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \ |
| 529 const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \ |
| 530 const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \ |
| 531 const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \ |
| 532 const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \ |
| 533 const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \ |
| 534 const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \ |
| 535 const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \ |
| 536 const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \ |
| 537 (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \ |
| 538 } while (0) |
| 539 |
| 540 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A)) |
| 541 static WEBP_INLINE void ConvertRGBToY(const __m128i* const R, |
| 542 const __m128i* const G, |
| 543 const __m128i* const B, |
| 544 __m128i* const Y) { |
| 545 const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384); |
| 546 const __m128i kGB_y = MK_CST_16(16384, 6420); |
| 547 const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF); |
| 548 |
| 549 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G); |
| 550 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G); |
| 551 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B); |
| 552 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B); |
| 553 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y); |
| 554 } |
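
A detail worth calling out: the green coefficient 33059 does not fit in a signed 16-bit _mm_madd_epi16() operand, so it is split as (33059 - 16384) in kRG_y plus 16384 in kGB_y. In scalar form the computation should match the VP8RGBToY() fallback used for the left-over pixels (illustrative sketch; YUV_FIX and YUV_HALF come from ./yuv.h):

```c
// Illustrative scalar equivalent of ConvertRGBToY() for one pixel.
static int ScalarRGBToY(int r, int g, int b) {
  const int luma = 16839 * r + 33059 * g + 6420 * b;
  return (luma + (16 << YUV_FIX) + YUV_HALF) >> YUV_FIX;
}
```
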
| 555 |
| 556 static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R, |
| 557 const __m128i* const G, |
| 558 const __m128i* const B, |
| 559 __m128i* const U, __m128i* const V) { |
| 560 const __m128i kRG_u = MK_CST_16(-9719, -19081); |
| 561 const __m128i kGB_u = MK_CST_16(0, 28800); |
| 562 const __m128i kRG_v = MK_CST_16(28800, 0); |
| 563 const __m128i kGB_v = MK_CST_16(-24116, -4684); |
| 564 const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2); |
| 565 |
| 566 const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G); |
| 567 const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G); |
| 568 const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B); |
| 569 const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B); |
| 570 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u, |
| 571 kHALF_UV, YUV_FIX + 2, *U); |
| 572 TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v, |
| 573 kHALF_UV, YUV_FIX + 2, *V); |
| 574 } |
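
Similarly for chroma (illustrative sketch below): in the ARGB path the R/G/B inputs are doubled sums of two neighboring pixels (see HorizontalAddPack() further down), i.e. 4x-scaled values, which is why the rounder is shifted left by 2 and the descale uses YUV_FIX + 2.

```c
// Illustrative scalar equivalent of ConvertRGBToUV(); r4/g4/b4 are 4x-scaled.
static void ScalarRGBToUV(int r4, int g4, int b4, int* u, int* v) {
  const int half = ((128 << YUV_FIX) + YUV_HALF) << 2;
  *u = (-9719 * r4 - 19081 * g4 + 28800 * b4 + half) >> (YUV_FIX + 2);
  *v = (28800 * r4 - 24116 * g4 - 4684 * b4 + half) >> (YUV_FIX + 2);
}
```
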
| 575 |
| 576 #undef MK_CST_16 |
| 577 #undef TRANSFORM |
| 578 |
| 579 static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) { |
| 580 const int max_width = width & ~31; |
| 581 int i; |
| 582 for (i = 0; i < max_width; rgb += 3 * 16 * 2) { |
| 583 __m128i rgb_plane[6]; |
| 584 int j; |
| 585 |
| 586 RGB24PackedToPlanar(rgb, rgb_plane); |
| 587 |
| 588 for (j = 0; j < 2; ++j, i += 16) { |
| 589 const __m128i zero = _mm_setzero_si128(); |
| 590 __m128i r, g, b, Y0, Y1; |
| 591 |
| 592 // Convert to 16-bit Y. |
| 593 r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero); |
| 594 g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero); |
| 595 b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero); |
| 596 ConvertRGBToY(&r, &g, &b, &Y0); |
| 597 |
| 598 // Convert to 16-bit Y. |
| 599 r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero); |
| 600 g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero); |
| 601 b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero); |
| 602 ConvertRGBToY(&r, &g, &b, &Y1); |
| 603 |
| 604 // Cast to 8-bit and store. |
| 605 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); |
| 606 } |
| 607 } |
| 608 for (; i < width; ++i, rgb += 3) { // left-over |
| 609 y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF); |
| 610 } |
| 611 } |
| 612 |
| 613 static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) { |
| 614 const int max_width = width & ~31; |
| 615 int i; |
| 616 for (i = 0; i < max_width; bgr += 3 * 16 * 2) { |
| 617 __m128i bgr_plane[6]; |
| 618 int j; |
| 619 |
| 620 RGB24PackedToPlanar(bgr, bgr_plane); |
| 621 |
| 622 for (j = 0; j < 2; ++j, i += 16) { |
| 623 const __m128i zero = _mm_setzero_si128(); |
| 624 __m128i r, g, b, Y0, Y1; |
| 625 |
| 626 // Convert to 16-bit Y. |
| 627 b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero); |
| 628 g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero); |
| 629 r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero); |
| 630 ConvertRGBToY(&r, &g, &b, &Y0); |
| 631 |
| 632 // Convert to 16-bit Y. |
| 633 b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero); |
| 634 g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero); |
| 635 r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero); |
| 636 ConvertRGBToY(&r, &g, &b, &Y1); |
| 637 |
| 638 // Cast to 8-bit and store. |
| 639 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); |
| 640 } |
| 641 } |
| 642 for (; i < width; ++i, bgr += 3) { // left-over |
| 643 y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF); |
| 644 } |
| 645 } |
| 646 |
| 647 static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) { |
| 648 const int max_width = width & ~15; |
| 649 int i; |
| 650 for (i = 0; i < max_width; i += 16) { |
| 651 __m128i r, g, b, Y0, Y1; |
| 652 RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b); |
| 653 ConvertRGBToY(&r, &g, &b, &Y0); |
| 654 RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b); |
| 655 ConvertRGBToY(&r, &g, &b, &Y1); |
| 656 STORE_16(_mm_packus_epi16(Y0, Y1), y + i); |
| 657 } |
| 658 for (; i < width; ++i) { // left-over |
| 659 const uint32_t p = argb[i]; |
| 660 y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff, |
| 661 YUV_HALF); |
| 662 } |
| 663 } |
| 664 |
| 665 // Horizontal add (doubled) of two 16b values, result is 16b. |
| 666 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ... |
| 667 static void HorizontalAddPack(const __m128i* const A, const __m128i* const B, |
| 668 __m128i* const out) { |
| 669 const __m128i k2 = _mm_set1_epi16(2); |
| 670 const __m128i C = _mm_madd_epi16(*A, k2); |
| 671 const __m128i D = _mm_madd_epi16(*B, k2); |
| 672 *out = _mm_packs_epi32(C, D); |
| 673 } |
| 674 |
| 675 static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v, |
| 676 int src_width, int do_store) { |
| 677 const int max_width = src_width & ~31; |
| 678 int i; |
| 679 for (i = 0; i < max_width; i += 32, u += 16, v += 16) { |
| 680 __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1; |
| 681 RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0); |
| 682 RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1); |
| 683 HorizontalAddPack(&r0, &r1, &r0); |
| 684 HorizontalAddPack(&g0, &g1, &g0); |
| 685 HorizontalAddPack(&b0, &b1, &b0); |
| 686 ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0); |
| 687 |
| 688 RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0); |
| 689 RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1); |
| 690 HorizontalAddPack(&r0, &r1, &r0); |
| 691 HorizontalAddPack(&g0, &g1, &g0); |
| 692 HorizontalAddPack(&b0, &b1, &b0); |
| 693 ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1); |
| 694 |
| 695 U0 = _mm_packus_epi16(U0, U1); |
| 696 V0 = _mm_packus_epi16(V0, V1); |
| 697 if (!do_store) { |
| 698 const __m128i prev_u = LOAD_16(u); |
| 699 const __m128i prev_v = LOAD_16(v); |
| 700 U0 = _mm_avg_epu8(U0, prev_u); |
| 701 V0 = _mm_avg_epu8(V0, prev_v); |
| 702 } |
| 703 STORE_16(U0, u); |
| 704 STORE_16(V0, v); |
| 705 } |
| 706 if (i < src_width) { // left-over |
| 707 WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); |
| 708 } |
| 709 } |
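
About the do_store flag (my reading of the caller's contract, so treat it as an assumption): the chroma of the first row of a 2x2 block is stored as-is, and the second pass averages with what was stored, completing the 4:2:0 vertical subsampling. _mm_avg_epu8() rounds upward:

```c
#include <stdint.h>

// Hypothetical scalar view of the '!do_store' branch above.
static uint8_t AverageChroma(uint8_t stored_uv, uint8_t new_uv) {
  return (uint8_t)((stored_uv + new_uv + 1) >> 1);  // same rounding as _mm_avg_epu8
}
```
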
| 710 |
| 711 // Convert 16 packed ARGB 16b-values to r[], g[], b[] |
| 712 static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx, |
| 713 __m128i* const r, |
| 714 __m128i* const g, |
| 715 __m128i* const b) { |
| 716 const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x |
| 717 const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x |
| 718 const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... |
| 719 const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ... |
| 720 // column-wise transpose |
| 721 const __m128i A0 = _mm_unpacklo_epi16(in0, in1); |
| 722 const __m128i A1 = _mm_unpackhi_epi16(in0, in1); |
| 723 const __m128i A2 = _mm_unpacklo_epi16(in2, in3); |
| 724 const __m128i A3 = _mm_unpackhi_epi16(in2, in3); |
| 725 const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 .. |
| 726 const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x |
| 727 const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 .. |
| 728 const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x |
| 729 *r = _mm_unpacklo_epi64(B0, B2); |
| 730 *g = _mm_unpackhi_epi64(B0, B2); |
| 731 *b = _mm_unpacklo_epi64(B1, B3); |
| 732 } |
| 733 |
| 734 static void ConvertRGBA32ToUV(const uint16_t* rgb, |
| 735 uint8_t* u, uint8_t* v, int width) { |
| 736 const int max_width = width & ~15; |
| 737 const uint16_t* const last_rgb = rgb + 4 * max_width; |
| 738 while (rgb < last_rgb) { |
| 739 __m128i r, g, b, U0, V0, U1, V1; |
| 740 RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b); |
| 741 ConvertRGBToUV(&r, &g, &b, &U0, &V0); |
| 742 RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b); |
| 743 ConvertRGBToUV(&r, &g, &b, &U1, &V1); |
| 744 STORE_16(_mm_packus_epi16(U0, U1), u); |
| 745 STORE_16(_mm_packus_epi16(V0, V1), v); |
| 746 u += 16; |
| 747 v += 16; |
| 748 rgb += 2 * 32; |
| 749 } |
| 750 if (max_width < width) { // left-over |
| 751 WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width); |
| 752 } |
| 753 } |
| 754 |
| 755 //------------------------------------------------------------------------------ |
| 756 |
| 757 extern void WebPInitConvertARGBToYUVSSE2(void); |
| 758 |
| 759 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { |
| 760 WebPConvertARGBToY = ConvertARGBToY; |
| 761 WebPConvertARGBToUV = ConvertARGBToUV; |
| 762 |
| 763 WebPConvertRGB24ToY = ConvertRGB24ToY; |
| 764 WebPConvertBGR24ToY = ConvertBGR24ToY; |
| 765 |
| 766 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV; |
| 767 } |
| 768 |
| 769 #else // !WEBP_USE_SSE2 |
| 770 |
| 771 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2) |
| 772 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2) |
| 773 |
321 #endif // WEBP_USE_SSE2 | 774 #endif // WEBP_USE_SSE2 |
322 } | |