Chromium Code Reviews

Unified Diff: media/base/yuv_convert.cc

Issue 1733004: Speed up vertical filtering using v = a+(b-a)*x formula... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 10 years, 8 months ago
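
The change replaces the two-multiply blend v = a*(256 - x) + b*x with the single-multiply form v = a + (b - a)*x in each FilterRows variant (SSE2, MMX, and the C fallback), dropping the second fraction constant. Below is a minimal scalar sketch of the two forms for comparison; it is illustrative only, not part of the patch, and the helper names are made up. It assumes the usual arithmetic right shift for negative intermediates, which is what the SIMD paths obtain by switching from _mm_srli_* to _mm_srai_*:

#include <stdint.h>
#include <stdio.h>

// Old form: two multiplies and two fraction constants per blend.
// frac is in [0, 256): source_y_fraction >> 8 in the patch.
static uint8_t BlendOld(int a, int b, int frac) {
  return static_cast<uint8_t>((a * (256 - frac) + b * frac) >> 8);
}

// New form: one multiply; (b - a) may be negative, so the shift must be arithmetic.
static uint8_t BlendNew(int a, int b, int frac) {
  return static_cast<uint8_t>(a + (((b - a) * frac) >> 8));
}

int main() {
  // Brute-force check over all 8-bit inputs and fractions. Since
  // a*(256 - x) + b*x == 256*a + (b - a)*x and 256*a has no low-order bits,
  // the two forms match exactly when >> on a negative value is an arithmetic
  // shift (true on mainstream compilers).
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b)
      for (int frac = 0; frac < 256; ++frac)
        if (BlendOld(a, b, frac) != BlendNew(a, b, frac))
          printf("mismatch: a=%d b=%d frac=%d\n", a, b, frac);
  return 0;
}

So, for the scalar path, the new form saves a multiply per sample without changing the output; in the SIMD paths it also frees the register that held y0_fraction.
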
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 // This webpage shows layout of YV12 and other YUV formats
 // http://www.fourcc.org/yuv.php
 // The actual conversion is best described here
 // http://en.wikipedia.org/wiki/YUV
 // An article on optimizing YUV conversion using tables instead of multiplies
 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
(...skipping 52 matching lines...)
                              rgb_row,
                              width);
   }
 
   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
   EMMS();
 }
 
 #if USE_SSE2
 // FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version blends 8 pixels at a time.
+// Blends 8 pixels at a time.
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
   __m128i zero = _mm_setzero_si128();
   __m128i y1_fraction = _mm_set1_epi16(
       static_cast<uint16>(source_y_fraction >> 8));
-  __m128i y0_fraction = _mm_set1_epi16(
-      static_cast<uint16>(256 - (source_y_fraction >> 8)));
 
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
       __m128i y0 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y0_ptr));
       __m128i y1 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y1_ptr));
       y0 = _mm_unpacklo_epi8(y0, zero);
       y1 = _mm_unpacklo_epi8(y1, zero);
-      y0 = _mm_mullo_epi16(y0, y0_fraction);
+      y1 = _mm_sub_epi16(y1, y0);
       y1 = _mm_mullo_epi16(y1, y1_fraction);
-      y0 = _mm_add_epi16(y0, y1);  // 8.8 fixed point result
-      y0 = _mm_srli_epi16(y0, 8);
-      y0 = _mm_packus_epi16(y0, y0);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y0);
+      y1 = _mm_srai_epi16(y1, 8);
+      y1 = _mm_add_epi16(y1, y0);
+      y1 = _mm_packus_epi16(y1, y1);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y1);
       y0_ptr += 8;
       y1_ptr += 8;
       ybuf += 8;
     } while (ybuf < end);
   }
 }
-
 #elif USE_MMX
-// MMX version blends 4 pixels at a time.
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
   __m64 zero = _mm_setzero_si64();
   __m64 y1_fraction = _mm_set1_pi16(
       static_cast<int16>(source_y_fraction >> 8));
-  __m64 y0_fraction = _mm_set1_pi16(
-      static_cast<int16>(256 - (source_y_fraction >> 8)));
 
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
-      __m64 y0 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y0_ptr));
-      __m64 y1 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y1_ptr));
-      y0 = _mm_unpacklo_pi8(y0, zero);
-      y1 = _mm_unpacklo_pi8(y1, zero);
-      y0 = _mm_mullo_pi16(y0, y0_fraction);
+      __m64 y2 = *reinterpret_cast<const __m64 *>(y0_ptr);
+      __m64 y3 = *reinterpret_cast<const __m64 *>(y1_ptr);
+      __m64 y0 = _mm_unpacklo_pi8(y2, zero);
+      __m64 y1 = _mm_unpacklo_pi8(y3, zero);
+      y2 = _mm_unpackhi_pi8(y2, zero);
+      y3 = _mm_unpackhi_pi8(y3, zero);
+      y1 = _mm_sub_pi16(y1, y0);
+      y3 = _mm_sub_pi16(y3, y2);
       y1 = _mm_mullo_pi16(y1, y1_fraction);
-      y0 = _mm_add_pi16(y0, y1);  // 8.8 fixed point result
-      y0 = _mm_srli_pi16(y0, 8);
-      y0 = _mm_packs_pu16(y0, y0);
-      *reinterpret_cast<int *>(ybuf) = _mm_cvtsi64_si32(y0);
-      y0_ptr += 4;
-      y1_ptr += 4;
-      ybuf += 4;
+      y3 = _mm_mullo_pi16(y3, y1_fraction);
+      y1 = _mm_srai_pi16(y1, 8);
+      y3 = _mm_srai_pi16(y3, 8);
+      y1 = _mm_add_pi16(y1, y0);
+      y3 = _mm_add_pi16(y3, y2);
+      y0 = _mm_packs_pu16(y1, y3);
+      *reinterpret_cast<__m64 *>(ybuf) = y0;
+      y0_ptr += 8;
+      y1_ptr += 8;
+      ybuf += 8;
     } while (ybuf < end);
   }
 }
 #else  // no MMX or SSE2
-// C version blends 4 pixels at a time.
+
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction >> 8;
-  int y0_fraction = 256 - (source_y_fraction >> 8);
+  int y1_fraction = (source_y_fraction >> 8);
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
-      ybuf[0] = (y0_ptr[0] * (y0_fraction) + y1_ptr[0] * (y1_fraction)) >> 8;
-      ybuf[1] = (y0_ptr[1] * (y0_fraction) + y1_ptr[1] * (y1_fraction)) >> 8;
-      ybuf[2] = (y0_ptr[2] * (y0_fraction) + y1_ptr[2] * (y1_fraction)) >> 8;
-      ybuf[3] = (y0_ptr[3] * (y0_fraction) + y1_ptr[3] * (y1_fraction)) >> 8;
-      y0_ptr += 4;
-      y1_ptr += 4;
-      ybuf += 4;
+      ybuf[0] = y0_ptr[0] + (((y1_ptr[0] - y0_ptr[0]) * y1_fraction) >> 8);
+      ybuf[1] = y0_ptr[1] + (((y1_ptr[1] - y0_ptr[1]) * y1_fraction) >> 8);
+      ybuf[2] = y0_ptr[2] + (((y1_ptr[2] - y0_ptr[2]) * y1_fraction) >> 8);
+      ybuf[3] = y0_ptr[3] + (((y1_ptr[3] - y0_ptr[3]) * y1_fraction) >> 8);
+      ybuf[4] = y0_ptr[4] + (((y1_ptr[4] - y0_ptr[4]) * y1_fraction) >> 8);
+      ybuf[5] = y0_ptr[5] + (((y1_ptr[5] - y0_ptr[5]) * y1_fraction) >> 8);
+      ybuf[6] = y0_ptr[6] + (((y1_ptr[6] - y0_ptr[6]) * y1_fraction) >> 8);
+      ybuf[7] = y0_ptr[7] + (((y1_ptr[7] - y0_ptr[7]) * y1_fraction) >> 8);
+      y0_ptr += 8;
+      y1_ptr += 8;
+      ybuf += 8;
     } while (ybuf < end);
   }
 }
 #endif
 
 // Scale a frame of YUV to 32 bit ARGB.
 void ScaleYUVToRGB32(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* rgb_buf,
(...skipping 97 matching lines...)
     const uint8* v1_ptr = v0_ptr + uv_pitch;
 
     int source_y_fraction = source_y_subpixel & kFractionMask;
     int source_uv_fraction = (source_y_subpixel >> y_shift) & kFractionMask;
 
     const uint8* y_ptr = y0_ptr;
     const uint8* u_ptr = u0_ptr;
     const uint8* v_ptr = v0_ptr;
     // Apply vertical filtering if necessary.
     // TODO(fbarchard): Remove memcpy when not necessary.
-    if (filter == media::FILTER_BILINEAR) {
+    if (filter & media::FILTER_BILINEAR_V) {
       if (yscale_fixed != kFractionMax &&
           source_y_fraction && ((source_y + 1) < source_height)) {
         FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
       } else {
         memcpy(ybuf, y0_ptr, source_width);
       }
       y_ptr = ybuf;
       ybuf[source_width] = ybuf[source_width-1];
       int uv_source_width = (source_width + 1) / 2;
       if (yscale_fixed != kFractionMax &&
           source_uv_fraction &&
           (((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
         FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
         FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
       } else {
         memcpy(ubuf, u0_ptr, uv_source_width);
         memcpy(vbuf, v0_ptr, uv_source_width);
       }
       u_ptr = ubuf;
       v_ptr = vbuf;
       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
     }
     if (source_dx == kFractionMax) {  // Not scaled
       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                dest_pixel, width);
     } else {
-      if (filter == FILTER_BILINEAR)
+      if (filter & FILTER_BILINEAR_H)
         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                  dest_pixel, width, source_dx);
       else {
         // Specialized scalers and rotation.
 #if USE_MMX && defined(_MSC_VER)
         if (width == (source_width * 2)) {
           DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                               dest_pixel, width);
         } else if ((source_dx & kFractionMask) == 0) {
           // Scaling by integer scale factor. ie half.
(...skipping 14 matching lines...)
                                    dest_pixel, width, source_dx);
 #endif
       }
     }
   }
   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
   EMMS();
 }
 
 }  // namespace media
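
The scaler checks also change from an equality test against FILTER_BILINEAR to bit tests against FILTER_BILINEAR_V and FILTER_BILINEAR_H, so vertical and horizontal filtering can be enabled independently (the accompanying enum change is in media/base/yuv_convert.h, which is not shown in this file's diff). The sketch below shows a flag layout consistent with those tests; the exact values are an assumption, not taken from this diff:

// Hypothetical layout implied by the `filter & FILTER_BILINEAR_V` and
// `filter & FILTER_BILINEAR_H` tests in ScaleYUVToRGB32; the real
// definitions live in media/base/yuv_convert.h.
enum ScaleFilter {
  FILTER_NONE = 0,        // No filtering (point sampling).
  FILTER_BILINEAR_H = 1,  // Bilinear horizontal filtering only.
  FILTER_BILINEAR_V = 2,  // Bilinear vertical filtering only.
  FILTER_BILINEAR = FILTER_BILINEAR_H | FILTER_BILINEAR_V  // Both passes.
};

With a layout like this, passing FILTER_BILINEAR keeps the old behaviour (both bits set), while either pass can now be requested on its own.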