OLD | NEW |
| (Empty) |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include <stdint.h> | |
6 | |
7 #include "build/build_config.h" | |
8 #include "media/base/simd/convert_rgb_to_yuv.h" | |
9 | |
10 #if defined(COMPILER_MSVC) | |
11 #include <intrin.h> | |
12 #else | |
13 #include <mmintrin.h> | |
14 #include <emmintrin.h> | |
15 #endif | |
16 | |
// SIMD_ALIGNED(decl) declares `decl` with 16-byte alignment, which the
// aligned 128-bit loads (_mm_load_si128) used in this file require.
// MSVC spells the attribute before the declarator, other compilers after it.
#if !defined(COMPILER_MSVC)
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#else
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#endif
22 | |
23 namespace media { | |
24 | |
// Fixed-point helpers. Coefficients are stored scaled by 2^FIX_SHIFT so the
// conversion can be done with integer multiplies followed by a right shift.
#define FIX_SHIFT 12
#define FIX(x) ((x) * (1 << FIX_SHIFT))

// FIX(x) narrowed to the 16-bit lane type used by the coefficient table.
#define INT16_FIX(x) static_cast<int16_t>(FIX(x))
30 | |
31 // Android's pixel layout is RGBA, while other platforms | |
32 // are BGRA. | |
33 #if defined(OS_ANDROID) | |
34 SIMD_ALIGNED(const int16_t ConvertRGBAToYUV_kTable[8 * 3]) = { | |
35 INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0, | |
36 INT16_FIX(0.257), INT16_FIX(0.504), INT16_FIX(0.098), 0, | |
37 -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0, | |
38 -INT16_FIX(0.148), -INT16_FIX(0.291), INT16_FIX(0.439), 0, | |
39 INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0, | |
40 INT16_FIX(0.439), -INT16_FIX(0.368), -INT16_FIX(0.071), 0, | |
41 }; | |
42 #else | |
43 SIMD_ALIGNED(const int16_t ConvertRGBAToYUV_kTable[8 * 3]) = { | |
44 INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0, | |
45 INT16_FIX(0.098), INT16_FIX(0.504), INT16_FIX(0.257), 0, | |
46 INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0, | |
47 INT16_FIX(0.439), -INT16_FIX(0.291), -INT16_FIX(0.148), 0, | |
48 -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0, | |
49 -INT16_FIX(0.071), -INT16_FIX(0.368), INT16_FIX(0.439), 0, | |
50 }; | |
51 #endif | |
52 | |
53 #undef INT16_FIX | |
54 | |
55 // This is the final offset for the conversion from signed yuv values to | |
56 // unsigned values. It is arranged so that offset of 16 is applied to Y | |
57 // components and 128 is added to UV components for 2 pixels. | |
58 SIMD_ALIGNED(const int32_t kYOffset[4]) = {16, 16, 16, 16}; | |
59 | |
// Saturates |value| to the range [0, 255] and returns it as a byte.
static inline uint8_t Clamp(int value) {
  const int floored = value < 0 ? 0 : value;
  const int bounded = floored > 255 ? 255 : floored;
  return static_cast<uint8_t>(bounded);
}
67 | |
68 static inline uint8_t RGBToY(int r, int g, int b) { | |
69 int y = ConvertRGBAToYUV_kTable[0] * b + | |
70 ConvertRGBAToYUV_kTable[1] * g + | |
71 ConvertRGBAToYUV_kTable[2] * r; | |
72 y >>= FIX_SHIFT; | |
73 return Clamp(y + 16); | |
74 } | |
75 | |
76 static inline uint8_t RGBToU(int r, int g, int b, int shift) { | |
77 int u = ConvertRGBAToYUV_kTable[8] * b + | |
78 ConvertRGBAToYUV_kTable[9] * g + | |
79 ConvertRGBAToYUV_kTable[10] * r; | |
80 u >>= FIX_SHIFT + shift; | |
81 return Clamp(u + 128); | |
82 } | |
83 | |
84 static inline uint8_t RGBToV(int r, int g, int b, int shift) { | |
85 int v = ConvertRGBAToYUV_kTable[16] * b + | |
86 ConvertRGBAToYUV_kTable[17] * g + | |
87 ConvertRGBAToYUV_kTable[18] * r; | |
88 v >>= FIX_SHIFT + shift; | |
89 return Clamp(v + 128); | |
90 } | |
91 | |
// Converts one 4-byte pixel to Y.
//
// Reads three channel bytes from |rgb_buf| into the caller's locals b, g and r
// (in that order), skips the fourth byte, adds the channels to the caller's
// running totals sum_b, sum_g and sum_r, and writes RGBToY(r, g, b) to
// |y_buf|. Both pointer arguments are advanced (by 4 and 1 bytes), so they
// must be modifiable lvalues and are evaluated more than once.
//
// The enclosing scope must declare int r, g, b, sum_r, sum_g and sum_b.
//
// The body is wrapped in do { ... } while (0) so that an invocation followed
// by a semicolon is exactly one statement and is safe in an unbraced if/else.
#define CONVERT_Y(rgb_buf, y_buf)   \
  do {                              \
    b = *(rgb_buf)++;               \
    g = *(rgb_buf)++;               \
    r = *(rgb_buf)++;               \
    ++(rgb_buf);                    \
    sum_b += b;                     \
    sum_g += g;                     \
    sum_r += r;                     \
    *(y_buf)++ = RGBToY(r, g, b);   \
  } while (0)
101 | |
102 static inline void ConvertRGBToYUV_V2H2(const uint8_t* rgb_buf_1, | |
103 const uint8_t* rgb_buf_2, | |
104 uint8_t* y_buf_1, | |
105 uint8_t* y_buf_2, | |
106 uint8_t* u_buf, | |
107 uint8_t* v_buf) { | |
108 int sum_b = 0; | |
109 int sum_g = 0; | |
110 int sum_r = 0; | |
111 int r, g, b; | |
112 | |
113 | |
114 | |
115 CONVERT_Y(rgb_buf_1, y_buf_1); | |
116 CONVERT_Y(rgb_buf_1, y_buf_1); | |
117 CONVERT_Y(rgb_buf_2, y_buf_2); | |
118 CONVERT_Y(rgb_buf_2, y_buf_2); | |
119 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 2); | |
120 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 2); | |
121 } | |
122 | |
123 static inline void ConvertRGBToYUV_V2H1(const uint8_t* rgb_buf_1, | |
124 const uint8_t* rgb_buf_2, | |
125 uint8_t* y_buf_1, | |
126 uint8_t* y_buf_2, | |
127 uint8_t* u_buf, | |
128 uint8_t* v_buf) { | |
129 int sum_b = 0; | |
130 int sum_g = 0; | |
131 int sum_r = 0; | |
132 int r, g, b; | |
133 | |
134 CONVERT_Y(rgb_buf_1, y_buf_1); | |
135 CONVERT_Y(rgb_buf_2, y_buf_2); | |
136 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1); | |
137 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1); | |
138 } | |
139 | |
140 static inline void ConvertRGBToYUV_V1H2(const uint8_t* rgb_buf, | |
141 uint8_t* y_buf, | |
142 uint8_t* u_buf, | |
143 uint8_t* v_buf) { | |
144 int sum_b = 0; | |
145 int sum_g = 0; | |
146 int sum_r = 0; | |
147 int r, g, b; | |
148 | |
149 CONVERT_Y(rgb_buf, y_buf); | |
150 CONVERT_Y(rgb_buf, y_buf); | |
151 *u_buf++ = RGBToU(sum_r, sum_g, sum_b, 1); | |
152 *v_buf++ = RGBToV(sum_r, sum_g, sum_b, 1); | |
153 } | |
154 | |
155 static inline void ConvertRGBToYUV_V1H1(const uint8_t* rgb_buf, | |
156 uint8_t* y_buf, | |
157 uint8_t* u_buf, | |
158 uint8_t* v_buf) { | |
159 int sum_b = 0; | |
160 int sum_g = 0; | |
161 int sum_r = 0; | |
162 int r, g, b; | |
163 | |
164 CONVERT_Y(rgb_buf, y_buf); | |
165 *u_buf++ = RGBToU(r, g, b, 0); | |
166 *v_buf++ = RGBToV(r, g, b, 0); | |
167 } | |
168 | |
169 static void ConvertRGB32ToYUVRow_SSE2(const uint8_t* rgb_buf_1, | |
170 const uint8_t* rgb_buf_2, | |
171 uint8_t* y_buf_1, | |
172 uint8_t* y_buf_2, | |
173 uint8_t* u_buf, | |
174 uint8_t* v_buf, | |
175 int width) { | |
176 while (width >= 4) { | |
177 // Name for the Y pixels: | |
178 // Row 1: a b c d | |
179 // Row 2: e f g h | |
180 // | |
181 // First row 4 pixels. | |
182 __m128i rgb_row_1 = _mm_loadu_si128( | |
183 reinterpret_cast<const __m128i*>(rgb_buf_1)); | |
184 __m128i zero_1 = _mm_xor_si128(rgb_row_1, rgb_row_1); | |
185 | |
186 __m128i y_table = _mm_load_si128( | |
187 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable)); | |
188 | |
189 __m128i rgb_a_b = _mm_unpackhi_epi8(rgb_row_1, zero_1); | |
190 rgb_a_b = _mm_madd_epi16(rgb_a_b, y_table); | |
191 | |
192 __m128i rgb_c_d = _mm_unpacklo_epi8(rgb_row_1, zero_1); | |
193 rgb_c_d = _mm_madd_epi16(rgb_c_d, y_table); | |
194 | |
195 // Do a crazh shuffle so that we get: | |
196 // v------------ Multiply Add | |
197 // BG: a b c d | |
198 // A0: a b c d | |
199 __m128i bg_abcd = _mm_castps_si128( | |
200 _mm_shuffle_ps( | |
201 _mm_castsi128_ps(rgb_c_d), | |
202 _mm_castsi128_ps(rgb_a_b), | |
203 (3 << 6) | (1 << 4) | (3 << 2) | 1)); | |
204 __m128i r_abcd = _mm_castps_si128( | |
205 _mm_shuffle_ps( | |
206 _mm_castsi128_ps(rgb_c_d), | |
207 _mm_castsi128_ps(rgb_a_b), | |
208 (2 << 6) | (2 << 2))); | |
209 __m128i y_abcd = _mm_add_epi32(bg_abcd, r_abcd); | |
210 | |
211 // Down shift back to 8bits range. | |
212 __m128i y_offset = _mm_load_si128( | |
213 reinterpret_cast<const __m128i*>(kYOffset)); | |
214 y_abcd = _mm_srai_epi32(y_abcd, FIX_SHIFT); | |
215 y_abcd = _mm_add_epi32(y_abcd, y_offset); | |
216 y_abcd = _mm_packs_epi32(y_abcd, y_abcd); | |
217 y_abcd = _mm_packus_epi16(y_abcd, y_abcd); | |
218 *reinterpret_cast<uint32_t*>(y_buf_1) = _mm_cvtsi128_si32(y_abcd); | |
219 y_buf_1 += 4; | |
220 | |
221 // Second row 4 pixels. | |
222 __m128i rgb_row_2 = _mm_loadu_si128( | |
223 reinterpret_cast<const __m128i*>(rgb_buf_2)); | |
224 __m128i zero_2 = _mm_xor_si128(rgb_row_2, rgb_row_2); | |
225 __m128i rgb_e_f = _mm_unpackhi_epi8(rgb_row_2, zero_2); | |
226 __m128i rgb_g_h = _mm_unpacklo_epi8(rgb_row_2, zero_2); | |
227 | |
228 // Add two rows together. | |
229 __m128i rgb_ae_bf = | |
230 _mm_add_epi16(_mm_unpackhi_epi8(rgb_row_1, zero_2), rgb_e_f); | |
231 __m128i rgb_cg_dh = | |
232 _mm_add_epi16(_mm_unpacklo_epi8(rgb_row_1, zero_2), rgb_g_h); | |
233 | |
234 // Multiply add like the previous row. | |
235 rgb_e_f = _mm_madd_epi16(rgb_e_f, y_table); | |
236 rgb_g_h = _mm_madd_epi16(rgb_g_h, y_table); | |
237 | |
238 __m128i bg_efgh = _mm_castps_si128( | |
239 _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h), | |
240 _mm_castsi128_ps(rgb_e_f), | |
241 (3 << 6) | (1 << 4) | (3 << 2) | 1)); | |
242 __m128i r_efgh = _mm_castps_si128( | |
243 _mm_shuffle_ps(_mm_castsi128_ps(rgb_g_h), | |
244 _mm_castsi128_ps(rgb_e_f), | |
245 (2 << 6) | (2 << 2))); | |
246 __m128i y_efgh = _mm_add_epi32(bg_efgh, r_efgh); | |
247 y_efgh = _mm_srai_epi32(y_efgh, FIX_SHIFT); | |
248 y_efgh = _mm_add_epi32(y_efgh, y_offset); | |
249 y_efgh = _mm_packs_epi32(y_efgh, y_efgh); | |
250 y_efgh = _mm_packus_epi16(y_efgh, y_efgh); | |
251 *reinterpret_cast<uint32_t*>(y_buf_2) = _mm_cvtsi128_si32(y_efgh); | |
252 y_buf_2 += 4; | |
253 | |
254 __m128i rgb_ae_cg = _mm_castps_si128( | |
255 _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh), | |
256 _mm_castsi128_ps(rgb_ae_bf), | |
257 (3 << 6) | (2 << 4) | (3 << 2) | 2)); | |
258 __m128i rgb_bf_dh = _mm_castps_si128( | |
259 _mm_shuffle_ps(_mm_castsi128_ps(rgb_cg_dh), | |
260 _mm_castsi128_ps(rgb_ae_bf), | |
261 (1 << 6) | (1 << 2))); | |
262 | |
263 // This is a 2x2 subsampling for 2 pixels. | |
264 __m128i rgb_abef_cdgh = _mm_add_epi16(rgb_ae_cg, rgb_bf_dh); | |
265 | |
266 // Do a multiply add with U table. | |
267 __m128i u_a_b = _mm_madd_epi16( | |
268 rgb_abef_cdgh, | |
269 _mm_load_si128( | |
270 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 8))); | |
271 u_a_b = _mm_add_epi32(_mm_shuffle_epi32(u_a_b, ((3 << 2) | 1)), | |
272 _mm_shuffle_epi32(u_a_b, (2 << 2))); | |
273 // Right shift 14 because of 12 from fixed point and 2 from subsampling. | |
274 u_a_b = _mm_srai_epi32(u_a_b, FIX_SHIFT + 2); | |
275 __m128i uv_offset = _mm_slli_epi32(y_offset, 3); | |
276 u_a_b = _mm_add_epi32(u_a_b, uv_offset); | |
277 u_a_b = _mm_packs_epi32(u_a_b, u_a_b); | |
278 u_a_b = _mm_packus_epi16(u_a_b, u_a_b); | |
279 *reinterpret_cast<uint16_t*>(u_buf) = | |
280 static_cast<uint16_t>(_mm_extract_epi16(u_a_b, 0)); | |
281 u_buf += 2; | |
282 | |
283 __m128i v_a_b = _mm_madd_epi16( | |
284 rgb_abef_cdgh, | |
285 _mm_load_si128( | |
286 reinterpret_cast<const __m128i*>(ConvertRGBAToYUV_kTable + 16))); | |
287 v_a_b = _mm_add_epi32(_mm_shuffle_epi32(v_a_b, ((3 << 2) | 1)), | |
288 _mm_shuffle_epi32(v_a_b, (2 << 2))); | |
289 v_a_b = _mm_srai_epi32(v_a_b, FIX_SHIFT + 2); | |
290 v_a_b = _mm_add_epi32(v_a_b, uv_offset); | |
291 v_a_b = _mm_packs_epi32(v_a_b, v_a_b); | |
292 v_a_b = _mm_packus_epi16(v_a_b, v_a_b); | |
293 *reinterpret_cast<uint16_t*>(v_buf) = | |
294 static_cast<uint16_t>(_mm_extract_epi16(v_a_b, 0)); | |
295 v_buf += 2; | |
296 | |
297 rgb_buf_1 += 16; | |
298 rgb_buf_2 += 16; | |
299 | |
300 // Move forward by 4 pixels. | |
301 width -= 4; | |
302 } | |
303 | |
304 // Just use C code to convert the remaining pixels. | |
305 if (width >= 2) { | |
306 ConvertRGBToYUV_V2H2(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf); | |
307 rgb_buf_1 += 8; | |
308 rgb_buf_2 += 8; | |
309 y_buf_1 += 2; | |
310 y_buf_2 += 2; | |
311 ++u_buf; | |
312 ++v_buf; | |
313 width -= 2; | |
314 } | |
315 | |
316 if (width) | |
317 ConvertRGBToYUV_V2H1(rgb_buf_1, rgb_buf_2, y_buf_1, y_buf_2, u_buf, v_buf); | |
318 } | |
319 | |
320 extern void ConvertRGB32ToYUV_SSE2(const uint8_t* rgbframe, | |
321 uint8_t* yplane, | |
322 uint8_t* uplane, | |
323 uint8_t* vplane, | |
324 int width, | |
325 int height, | |
326 int rgbstride, | |
327 int ystride, | |
328 int uvstride) { | |
329 while (height >= 2) { | |
330 ConvertRGB32ToYUVRow_SSE2(rgbframe, | |
331 rgbframe + rgbstride, | |
332 yplane, | |
333 yplane + ystride, | |
334 uplane, | |
335 vplane, | |
336 width); | |
337 rgbframe += 2 * rgbstride; | |
338 yplane += 2 * ystride; | |
339 uplane += uvstride; | |
340 vplane += uvstride; | |
341 height -= 2; | |
342 } | |
343 | |
344 if (!height) | |
345 return; | |
346 | |
347 // Handle the last row. | |
348 while (width >= 2) { | |
349 ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane); | |
350 rgbframe += 8; | |
351 yplane += 2; | |
352 ++uplane; | |
353 ++vplane; | |
354 width -= 2; | |
355 } | |
356 | |
357 if (width) | |
358 ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane); | |
359 } | |
360 | |
361 void ConvertRGB32ToYUV_SSE2_Reference(const uint8_t* rgbframe, | |
362 uint8_t* yplane, | |
363 uint8_t* uplane, | |
364 uint8_t* vplane, | |
365 int width, | |
366 int height, | |
367 int rgbstride, | |
368 int ystride, | |
369 int uvstride) { | |
370 while (height >= 2) { | |
371 int i = 0; | |
372 | |
373 // Convert a 2x2 block. | |
374 while (i + 2 <= width) { | |
375 ConvertRGBToYUV_V2H2(rgbframe + i * 4, | |
376 rgbframe + rgbstride + i * 4, | |
377 yplane + i, | |
378 yplane + ystride + i, | |
379 uplane + i / 2, | |
380 vplane + i / 2); | |
381 i += 2; | |
382 } | |
383 | |
384 // Convert the last pixel of two rows. | |
385 if (i < width) { | |
386 ConvertRGBToYUV_V2H1(rgbframe + i * 4, | |
387 rgbframe + rgbstride + i * 4, | |
388 yplane + i, | |
389 yplane + ystride + i, | |
390 uplane + i / 2, | |
391 vplane + i / 2); | |
392 } | |
393 | |
394 rgbframe += 2 * rgbstride; | |
395 yplane += 2 * ystride; | |
396 uplane += uvstride; | |
397 vplane += uvstride; | |
398 height -= 2; | |
399 } | |
400 | |
401 if (!height) | |
402 return; | |
403 | |
404 // Handle the last row. | |
405 while (width >= 2) { | |
406 ConvertRGBToYUV_V1H2(rgbframe, yplane, uplane, vplane); | |
407 rgbframe += 8; | |
408 yplane += 2; | |
409 ++uplane; | |
410 ++vplane; | |
411 width -= 2; | |
412 } | |
413 | |
414 // Handle the last pixel in the last row. | |
415 if (width) | |
416 ConvertRGBToYUV_V1H1(rgbframe, yplane, uplane, vplane); | |
417 } | |
418 | |
419 } // namespace media | |
OLD | NEW |