Chromium Code Reviews

Unified Diff: media/base/yuv_convert.cc

Issue 1733004: Speed up vertical filtering using v = a+(b-a)*x formula... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 10 years, 8 months ago
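
The change replaces the two-multiply blend v = a*(256 - x) + b*x with the single-multiply form v = a + (b - a)*x in each FilterRows variant (SSE2, MMX, and the C fallback), dropping the second fraction constant. Below is a minimal scalar sketch of the two forms for comparison; it is illustrative only, not part of the patch, and the helper names are made up. It assumes the usual arithmetic right shift for negative intermediates, which is what the SIMD paths obtain by switching from _mm_srli_* to _mm_srai_*:

#include <stdint.h>
#include <stdio.h>

// Old form: two multiplies and two fraction constants per blend.
// frac is in [0, 256): source_y_fraction >> 8 in the patch.
static uint8_t BlendOld(int a, int b, int frac) {
  return static_cast<uint8_t>((a * (256 - frac) + b * frac) >> 8);
}

// New form: one multiply; (b - a) may be negative, so the shift must be arithmetic.
static uint8_t BlendNew(int a, int b, int frac) {
  return static_cast<uint8_t>(a + (((b - a) * frac) >> 8));
}

int main() {
  // Brute-force check over all 8-bit inputs and fractions. Since
  // a*(256 - x) + b*x == 256*a + (b - a)*x and 256*a has no low-order bits,
  // the two forms match exactly when >> on a negative value is an arithmetic
  // shift (true on mainstream compilers).
  for (int a = 0; a < 256; ++a)
    for (int b = 0; b < 256; ++b)
      for (int frac = 0; frac < 256; ++frac)
        if (BlendOld(a, b, frac) != BlendNew(a, b, frac))
          printf("mismatch: a=%d b=%d frac=%d\n", a, b, frac);
  return 0;
}

So, for the scalar path, the new form saves a multiply per sample without changing the output; in the SIMD paths it also frees the register that held y0_fraction.
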
 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 // This webpage shows layout of YV12 and other YUV formats
 // http://www.fourcc.org/yuv.php
 // The actual conversion is best described here
 // http://en.wikipedia.org/wiki/YUV
 // An article on optimizing YUV conversion using tables instead of multiplies
 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
(...skipping 52 matching lines...)
                              rgb_row,
                              width);
   }
 
   // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
   EMMS();
 }
 
 #if USE_SSE2
 // FilterRows combines two rows of the image using linear interpolation.
-// SSE2 version blends 8 pixels at a time.
+// Blends 8 pixels at a time.
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
   __m128i zero = _mm_setzero_si128();
   __m128i y1_fraction = _mm_set1_epi16(
       static_cast<uint16>(source_y_fraction >> 8));
-  __m128i y0_fraction = _mm_set1_epi16(
-      static_cast<uint16>(256 - (source_y_fraction >> 8)));
 
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
       __m128i y0 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y0_ptr));
       __m128i y1 = _mm_loadl_epi64(reinterpret_cast<__m128i const*>(y1_ptr));
       y0 = _mm_unpacklo_epi8(y0, zero);
       y1 = _mm_unpacklo_epi8(y1, zero);
-      y0 = _mm_mullo_epi16(y0, y0_fraction);
+      y1 = _mm_sub_epi16(y1, y0);
       y1 = _mm_mullo_epi16(y1, y1_fraction);
-      y0 = _mm_add_epi16(y0, y1);  // 8.8 fixed point result
-      y0 = _mm_srli_epi16(y0, 8);
-      y0 = _mm_packus_epi16(y0, y0);
-      _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y0);
+      y1 = _mm_srai_epi16(y1, 8);
+      y1 = _mm_add_epi16(y1, y0);
+      y1 = _mm_packus_epi16(y1, y1);
+      _mm_storel_epi64(reinterpret_cast<__m128i *>(ybuf), y1);
       y0_ptr += 8;
       y1_ptr += 8;
       ybuf += 8;
     } while (ybuf < end);
   }
 }
-
 #elif USE_MMX
-// MMX version blends 4 pixels at a time.
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
   __m64 zero = _mm_setzero_si64();
   __m64 y1_fraction = _mm_set1_pi16(
       static_cast<int16>(source_y_fraction >> 8));
-  __m64 y0_fraction = _mm_set1_pi16(
-      static_cast<int16>(256 - (source_y_fraction >> 8)));
 
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
-      __m64 y0 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y0_ptr));
-      __m64 y1 = _mm_cvtsi32_si64(*reinterpret_cast<const int *>(y1_ptr));
-      y0 = _mm_unpacklo_pi8(y0, zero);
-      y1 = _mm_unpacklo_pi8(y1, zero);
-      y0 = _mm_mullo_pi16(y0, y0_fraction);
+      __m64 y2 = *reinterpret_cast<const __m64 *>(y0_ptr);
+      __m64 y3 = *reinterpret_cast<const __m64 *>(y1_ptr);
+      __m64 y0 = _mm_unpacklo_pi8(y2, zero);
+      __m64 y1 = _mm_unpacklo_pi8(y3, zero);
+      y2 = _mm_unpackhi_pi8(y2, zero);
+      y3 = _mm_unpackhi_pi8(y3, zero);
+      y1 = _mm_sub_pi16(y1, y0);
+      y3 = _mm_sub_pi16(y3, y2);
       y1 = _mm_mullo_pi16(y1, y1_fraction);
-      y0 = _mm_add_pi16(y0, y1);  // 8.8 fixed point result
-      y0 = _mm_srli_pi16(y0, 8);
-      y0 = _mm_packs_pu16(y0, y0);
-      *reinterpret_cast<int *>(ybuf) = _mm_cvtsi64_si32(y0);
-      y0_ptr += 4;
-      y1_ptr += 4;
-      ybuf += 4;
+      y3 = _mm_mullo_pi16(y3, y1_fraction);
+      y1 = _mm_srai_pi16(y1, 8);
+      y3 = _mm_srai_pi16(y3, 8);
+      y1 = _mm_add_pi16(y1, y0);
+      y3 = _mm_add_pi16(y3, y2);
+      y0 = _mm_packs_pu16(y1, y3);
+      *reinterpret_cast<__m64 *>(ybuf) = y0;
+      y0_ptr += 8;
+      y1_ptr += 8;
+      ybuf += 8;
     } while (ybuf < end);
   }
 }
 #else  // no MMX or SSE2
-// C version blends 4 pixels at a time.
+
 static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
                        int source_width, int source_y_fraction) {
-  int y1_fraction = source_y_fraction >> 8;
-  int y0_fraction = 256 - (source_y_fraction >> 8);
+  int y1_fraction = (source_y_fraction >> 8);
   uint8* end = ybuf + source_width;
   if (ybuf < end) {
     do {
-      ybuf[0] = (y0_ptr[0] * (y0_fraction) + y1_ptr[0] * (y1_fraction)) >> 8;
-      ybuf[1] = (y0_ptr[1] * (y0_fraction) + y1_ptr[1] * (y1_fraction)) >> 8;
-      ybuf[2] = (y0_ptr[2] * (y0_fraction) + y1_ptr[2] * (y1_fraction)) >> 8;
-      ybuf[3] = (y0_ptr[3] * (y0_fraction) + y1_ptr[3] * (y1_fraction)) >> 8;
-      y0_ptr += 4;
-      y1_ptr += 4;
-      ybuf += 4;
+      ybuf[0] = y0_ptr[0] + (((y1_ptr[0] - y0_ptr[0]) * y1_fraction) >> 8);
+      ybuf[1] = y0_ptr[1] + (((y1_ptr[1] - y0_ptr[1]) * y1_fraction) >> 8);
+      ybuf[2] = y0_ptr[2] + (((y1_ptr[2] - y0_ptr[2]) * y1_fraction) >> 8);
+      ybuf[3] = y0_ptr[3] + (((y1_ptr[3] - y0_ptr[3]) * y1_fraction) >> 8);
+      ybuf[4] = y0_ptr[4] + (((y1_ptr[4] - y0_ptr[4]) * y1_fraction) >> 8);
+      ybuf[5] = y0_ptr[5] + (((y1_ptr[5] - y0_ptr[5]) * y1_fraction) >> 8);
+      ybuf[6] = y0_ptr[6] + (((y1_ptr[6] - y0_ptr[6]) * y1_fraction) >> 8);
+      ybuf[7] = y0_ptr[7] + (((y1_ptr[7] - y0_ptr[7]) * y1_fraction) >> 8);
+      y0_ptr += 8;
+      y1_ptr += 8;
+      ybuf += 8;
     } while (ybuf < end);
   }
 }
 #endif
 
 // Scale a frame of YUV to 32 bit ARGB.
 void ScaleYUVToRGB32(const uint8* y_buf,
                      const uint8* u_buf,
                      const uint8* v_buf,
                      uint8* rgb_buf,
(...skipping 97 matching lines...)
     const uint8* v1_ptr = v0_ptr + uv_pitch;
 
     int source_y_fraction = source_y_subpixel & kFractionMask;
     int source_uv_fraction = (source_y_subpixel >> y_shift) & kFractionMask;
 
     const uint8* y_ptr = y0_ptr;
     const uint8* u_ptr = u0_ptr;
     const uint8* v_ptr = v0_ptr;
     // Apply vertical filtering if necessary.
     // TODO(fbarchard): Remove memcpy when not necessary.
-    if (filter == media::FILTER_BILINEAR) {
+    if (filter & media::FILTER_BILINEAR_V) {
       if (yscale_fixed != kFractionMax &&
           source_y_fraction && ((source_y + 1) < source_height)) {
         FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
       } else {
         memcpy(ybuf, y0_ptr, source_width);
       }
       y_ptr = ybuf;
       ybuf[source_width] = ybuf[source_width-1];
       int uv_source_width = (source_width + 1) / 2;
       if (yscale_fixed != kFractionMax &&
           source_uv_fraction &&
           (((source_y >> y_shift) + 1) < (source_height >> y_shift))) {
         FilterRows(ubuf, u0_ptr, u1_ptr, uv_source_width, source_uv_fraction);
         FilterRows(vbuf, v0_ptr, v1_ptr, uv_source_width, source_uv_fraction);
       } else {
         memcpy(ubuf, u0_ptr, uv_source_width);
         memcpy(vbuf, v0_ptr, uv_source_width);
       }
       u_ptr = ubuf;
       v_ptr = vbuf;
       ubuf[uv_source_width] = ubuf[uv_source_width - 1];
       vbuf[uv_source_width] = vbuf[uv_source_width - 1];
     }
     if (source_dx == kFractionMax) {  // Not scaled
       FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                dest_pixel, width);
     } else {
-      if (filter == FILTER_BILINEAR)
+      if (filter & FILTER_BILINEAR_H)
         LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                                  dest_pixel, width, source_dx);
       else {
         // Specialized scalers and rotation.
 #if USE_MMX && defined(_MSC_VER)
         if (width == (source_width * 2)) {
           DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
                               dest_pixel, width);
         } else if ((source_dx & kFractionMask) == 0) {
           // Scaling by integer scale factor. ie half.
(...skipping 14 matching lines...)
                                    dest_pixel, width, source_dx);
 #endif
       }
     }
   }
   // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
   EMMS();
 }
 
 }  // namespace media
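
The scaler checks also change from an equality test against FILTER_BILINEAR to bit tests against FILTER_BILINEAR_V and FILTER_BILINEAR_H, so vertical and horizontal filtering can be enabled independently (the accompanying enum change is in media/base/yuv_convert.h, which is not shown in this file's diff). The sketch below shows a flag layout consistent with those tests; the exact values are an assumption, not taken from this diff:

// Hypothetical layout implied by the `filter & FILTER_BILINEAR_V` and
// `filter & FILTER_BILINEAR_H` tests in ScaleYUVToRGB32; the real
// definitions live in media/base/yuv_convert.h.
enum ScaleFilter {
  FILTER_NONE = 0,        // No filtering (point sampling).
  FILTER_BILINEAR_H = 1,  // Bilinear horizontal filtering only.
  FILTER_BILINEAR_V = 2,  // Bilinear vertical filtering only.
  FILTER_BILINEAR = FILTER_BILINEAR_H | FILTER_BILINEAR_V  // Both passes.
};

With a layout like this, passing FILTER_BILINEAR keeps the old behaviour (both bits set), while either pass can now be requested on its own.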