third_party/libwebp/dsp/upsampling_sse2.c - Issue 116213006: Update libwebp to 0.4.0

Side by Side Diff: third_party/libwebp/dsp/upsampling_sse2.c

Issue 116213006: Update libwebp to 0.4.0 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // Use of this source code is governed by a BSD-style license	3 // Use of this source code is governed by a BSD-style license

4 // that can be found in the COPYING file in the root of the source	4 // that can be found in the COPYING file in the root of the source

5 // tree. An additional intellectual property rights grant can be found	5 // tree. An additional intellectual property rights grant can be found

6 // in the file PATENTS. All contributing project authors may	6 // in the file PATENTS. All contributing project authors may

7 // be found in the AUTHORS file in the root of the source tree.	7 // be found in the AUTHORS file in the root of the source tree.

8 // -----------------------------------------------------------------------------	8 // -----------------------------------------------------------------------------

9 //	9 //

10 // SSE2 version of YUV to RGB upsampling functions.	10 // SSE2 version of YUV to RGB upsampling functions.

11 //	11 //

12 // Author: somnath@google.com (Somnath Banerjee)	12 // Author: somnath@google.com (Somnath Banerjee)

13	13

14 #include "./dsp.h"	14 #include "./dsp.h"

15	15

16 #if defined(__cplusplus) || defined(c_plusplus)

17 extern "C" {

18 #endif

19

20 #if defined(WEBP_USE_SSE2)	16 #if defined(WEBP_USE_SSE2)

21	17

22 #include <assert.h>	18 #include <assert.h>

23 #include <emmintrin.h>	19 #include <emmintrin.h>

24 #include <string.h>	20 #include <string.h>

25 #include "./yuv.h"	21 #include "./yuv.h"

26	22

27 #ifdef FANCY_UPSAMPLING	23 #ifdef FANCY_UPSAMPLING

28	24

29 // We compute (9a + 3b + 3*c + d + 8) / 16 as follows	25 // We compute (9a + 3b + 3*c + d + 8) / 16 as follows

(...skipping 14 matching lines...) Expand all Loading...
44 // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1	40 // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1

45 #define GET_M(ij, in, out) do { \	41 #define GET_M(ij, in, out) do { \

46 const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \	42 const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \

47 const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \	43 const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \

48 const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \	44 const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \

49 const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\	45 const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\

50 const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \	46 const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \

51 (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \	47 (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \

52 } while (0)	48 } while (0)

53	49

54 // pack and store two alterning pixel rows	50 // pack and store two alternating pixel rows

55 #define PACK_AND_STORE(a, b, da, db, out) do { \	51 #define PACK_AND_STORE(a, b, da, db, out) do { \

56 const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \	52 const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \

57 const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \	53 const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \

58 const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \	54 const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \

59 const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \	55 const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \

60 _mm_store_si128(((__m128i*)(out)) + 0, t_1); \	56 _mm_store_si128(((__m128i*)(out)) + 0, t_1); \

61 _mm_store_si128(((__m128i*)(out)) + 1, t_2); \	57 _mm_store_si128(((__m128i*)(out)) + 1, t_2); \

62 } while (0)	58 } while (0)

63	59

64 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.	60 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.

(...skipping 15 matching lines...) Expand all Loading...
80 const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \	76 const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \

81 const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \	77 const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \

82 const __m128i t4 = _mm_avg_epu8(s, t); \	78 const __m128i t4 = _mm_avg_epu8(s, t); \

83 const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \	79 const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \

84 __m128i diag1, diag2; \	80 __m128i diag1, diag2; \

85 \	81 \

86 GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \	82 GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \

87 GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \	83 GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \

88 \	84 \

89 /* pack the alternate pixels */ \	85 /* pack the alternate pixels */ \

90 PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \	86 PACK_AND_STORE(a, b, diag1, diag2, out + 0); /* store top */ \

91 PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \	87 PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32); /* store bottom */ \

92 }	88 }

93	89

94 // Turn the macro into a function for reducing code-size when non-critical	90 // Turn the macro into a function for reducing code-size when non-critical

95 static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],	91 static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],

96 uint8_t* const out) {	92 uint8_t* const out) {

97 UPSAMPLE_32PIXELS(r1, r2, out);	93 UPSAMPLE_32PIXELS(r1, r2, out);

98 }	94 }

99	95

100 #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \	96 #define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \

101 uint8_t r1[17], r2[17]; \	97 uint8_t r1[17], r2[17]; \

102 memcpy(r1, (tb), (num_pixels)); \	98 memcpy(r1, (tb), (num_pixels)); \

103 memcpy(r2, (bb), (num_pixels)); \	99 memcpy(r2, (bb), (num_pixels)); \

104 /* replicate last byte */ \	100 /* replicate last byte */ \

105 memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \	101 memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \

106 memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \	102 memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \

107 /* using the shared function instead of the macro saves ~3k code size */ \	103 /* using the shared function instead of the macro saves ~3k code size */ \

108 Upsample32Pixels(r1, r2, out); \	104 Upsample32Pixels(r1, r2, out); \

109 }	105 }

110	106

111 #define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv, \	107 #define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, \

112 top_dst, bottom_dst, cur_x, num_pixels) { \	108 top_dst, bottom_dst, cur_x, num_pixels) { \

113 int n; \	109 int n; \

114 if (top_y) { \	110 for (n = 0; n < (num_pixels); ++n) { \

	111 FUNC(top_y[(cur_x) + n], r_u[n], r_v[n], \

	112 top_dst + ((cur_x) + n) * XSTEP); \

	113 } \

	114 if (bottom_y != NULL) { \

115 for (n = 0; n < (num_pixels); ++n) { \	115 for (n = 0; n < (num_pixels); ++n) { \

116 FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \	116 FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n], \

117 top_dst + ((cur_x) + n) * XSTEP); \

118 } \

119 } \

120 if (bottom_y) { \

121 for (n = 0; n < (num_pixels); ++n) { \

122 FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \

123 bottom_dst + ((cur_x) + n) * XSTEP); \	117 bottom_dst + ((cur_x) + n) * XSTEP); \

124 } \	118 } \

125 } \	119 } \

126 }	120 }

127	121

	122 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \

	123 top_dst, bottom_dst, cur_x) do { \

	124 FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP); \

	125 if (bottom_y != NULL) { \

	126 FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64, \

	127 bottom_dst + (cur_x) * XSTEP); \

	128 } \

	129 } while (0)

	130

128 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \	131 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \

129 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \	132 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \

130 const uint8_t* top_u, const uint8_t* top_v, \	133 const uint8_t* top_u, const uint8_t* top_v, \

131 const uint8_t* cur_u, const uint8_t* cur_v, \	134 const uint8_t* cur_u, const uint8_t* cur_v, \

132 uint8_t* top_dst, uint8_t* bottom_dst, int len) { \	135 uint8_t* top_dst, uint8_t* bottom_dst, int len) { \

133 int block; \	136 int uv_pos, pos; \

134 /* 16 byte aligned array to cache reconstructed u and v */ \	137 /* 16byte-aligned array to cache reconstructed u and v */ \

135 uint8_t uv_buf[4 * 32 + 15]; \	138 uint8_t uv_buf[4 * 32 + 15]; \

136 uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \	139 uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \

137 const int uv_len = (len + 1) >> 1; \	140 uint8_t* const r_v = r_u + 32; \

138 /* 17 pixels must be read-able for each block */ \

139 const int num_blocks = (uv_len - 1) >> 4; \

140 const int leftover = uv_len - num_blocks * 16; \

141 const int last_pos = 1 + 32 * num_blocks; \

142 \	141 \

143 const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \	142 assert(top_y != NULL); \

144 const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \	143 { /* Treat the first pixel in regular way */ \

145 \	144 const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \

146 assert(len > 0); \	145 const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \

147 /* Treat the first pixel in regular way */ \	146 const int u0_t = (top_u[0] + u_diag) >> 1; \

148 if (top_y) { \	147 const int v0_t = (top_v[0] + v_diag) >> 1; \

149 const int u0 = (top_u[0] + u_diag) >> 1; \	148 FUNC(top_y[0], u0_t, v0_t, top_dst); \

150 const int v0 = (top_v[0] + v_diag) >> 1; \	149 if (bottom_y != NULL) { \

151 FUNC(top_y[0], u0, v0, top_dst); \	150 const int u0_b = (cur_u[0] + u_diag) >> 1; \

	151 const int v0_b = (cur_v[0] + v_diag) >> 1; \

	152 FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \

	153 } \

152 } \	154 } \

153 if (bottom_y) { \	155 /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \

154 const int u0 = (cur_u[0] + u_diag) >> 1; \	156 for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \

155 const int v0 = (cur_v[0] + v_diag) >> 1; \	157 UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \

156 FUNC(bottom_y[0], u0, v0, bottom_dst); \	158 UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \

	159 CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \

157 } \	160 } \

158 \	161 if (len > 1) { \

159 for (block = 0; block < num_blocks; ++block) { \	162 const int left_over = ((len + 1) >> 1) - (pos >> 1); \

160 UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \	163 assert(left_over > 0); \

161 UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \	164 UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \

162 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \	165 UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \

163 32 * block + 1, 32) \	166 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, \

164 top_u += 16; \	167 pos, len - pos); \

165 cur_u += 16; \

166 top_v += 16; \

167 cur_v += 16; \

168 } \	168 } \

169 \

170 UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \

171 UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \

172 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \

173 last_pos, len - last_pos); \

174 }	169 }

175	170

176 // SSE2 variants of the fancy upsampler.	171 // SSE2 variants of the fancy upsampler.

177 SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)	172 SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)

178 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)	173 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)

179 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)	174 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)

180 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)	175 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)

181	176

182 #undef GET_M	177 #undef GET_M

183 #undef PACK_AND_STORE	178 #undef PACK_AND_STORE

184 #undef UPSAMPLE_32PIXELS	179 #undef UPSAMPLE_32PIXELS

185 #undef UPSAMPLE_LAST_BLOCK	180 #undef UPSAMPLE_LAST_BLOCK

186 #undef CONVERT2RGB	181 #undef CONVERT2RGB

	182 #undef CONVERT2RGB_32

187 #undef SSE2_UPSAMPLE_FUNC	183 #undef SSE2_UPSAMPLE_FUNC

188	184

189 #endif // FANCY_UPSAMPLING	185 #endif // FANCY_UPSAMPLING

190	186

191 #endif // WEBP_USE_SSE2	187 #endif // WEBP_USE_SSE2

192	188

193 //------------------------------------------------------------------------------	189 //------------------------------------------------------------------------------

194	190

	191 #ifdef FANCY_UPSAMPLING

	192

195 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];	193 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

196	194

197 void WebPInitUpsamplersSSE2(void) {	195 void WebPInitUpsamplersSSE2(void) {

198 #if defined(WEBP_USE_SSE2)	196 #if defined(WEBP_USE_SSE2)

	197 VP8YUVInitSSE2();

199 WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;	198 WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;

200 WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;	199 WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;

201 WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;	200 WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;

202 WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;	201 WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;

203 #endif // WEBP_USE_SSE2	202 #endif // WEBP_USE_SSE2

204 }	203 }

205	204

206 void WebPInitPremultiplySSE2(void) {	205 void WebPInitPremultiplySSE2(void) {

207 #if defined(WEBP_USE_SSE2)	206 #if defined(WEBP_USE_SSE2)

208 WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;	207 WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;

209 WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;	208 WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;

210 #endif // WEBP_USE_SSE2	209 #endif // WEBP_USE_SSE2

211 }	210 }

212	211

213 #if defined(__cplusplus) || defined(c_plusplus)	212 #else

214 } // extern "C"

215 #endif

216	213

	214 // this empty function is to avoid an empty .o

	215 void WebPInitPremultiplySSE2(void) {}

217	216

	217 #endif // FANCY_UPSAMPLING

	218

OLD	NEW

« third_party/libwebp/README.chromium ('K') | « third_party/libwebp/dsp/upsampling_neon.c ('k') | third_party/libwebp/dsp/yuv.h » ('j') | no next file with comments »