third_party/libwebp/dsp/upsampling_sse2.c - Issue 12942006: libwebp: update snapshot to v0.3.0-rc6

Side by Side Diff: third_party/libwebp/dsp/upsampling_sse2.c

Issue 12942006: libwebp: update snapshot to v0.3.0-rc6 (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: local webkit layout expectations Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright 2011 Google Inc. All Rights Reserved.	1 // Copyright 2011 Google Inc. All Rights Reserved.

2 //	2 //

3 // This code is licensed under the same terms as WebM:	3 // This code is licensed under the same terms as WebM:

4 // Software License Agreement: http://www.webmproject.org/license/software/	4 // Software License Agreement: http://www.webmproject.org/license/software/

5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/	5 // Additional IP Rights Grant: http://www.webmproject.org/license/additional/

6 // -----------------------------------------------------------------------------	6 // -----------------------------------------------------------------------------

7 //	7 //

8 // SSE2 version of YUV to RGB upsampling functions.	8 // SSE2 version of YUV to RGB upsampling functions.

9 //	9 //

10 // Author: somnath@google.com (Somnath Banerjee)	10 // Author: somnath@google.com (Somnath Banerjee)

11	11

12 #include "./dsp.h"	12 #include "./dsp.h"

13	13

	14 #if defined(__cplusplus) || defined(c_plusplus)

	15 extern "C" {

	16 #endif

	17

14 #if defined(WEBP_USE_SSE2)	18 #if defined(WEBP_USE_SSE2)

15	19

16 #include <assert.h>	20 #include <assert.h>

17 #include <emmintrin.h>	21 #include <emmintrin.h>

18 #include <string.h>	22 #include <string.h>

19 #include "./yuv.h"	23 #include "./yuv.h"

20	24

21 #if defined(__cplusplus) || defined(c_plusplus)

22 extern "C" {

23 #endif

24

25 #ifdef FANCY_UPSAMPLING	25 #ifdef FANCY_UPSAMPLING

26	26

27 // We compute (9a + 3b + 3*c + d + 8) / 16 as follows	27 // We compute (9a + 3b + 3*c + d + 8) / 16 as follows

28 // u = (9a + 3b + 3*c + d + 8) / 16	28 // u = (9a + 3b + 3*c + d + 8) / 16

29 // = (a + (a + 3b + 3c + d) / 8 + 1) / 2	29 // = (a + (a + 3b + 3c + d) / 8 + 1) / 2

30 // = (a + m + 1) / 2	30 // = (a + m + 1) / 2

31 // where m = (a + 3b + 3c + d) / 8	31 // where m = (a + 3b + 3c + d) / 8

32 // = ((a + b + c + d) / 2 + b + c) / 4	32 // = ((a + b + c + d) / 2 + b + c) / 4

33 //	33 //

34 // Let's say k = (a + b + c + d) / 4.	34 // Let's say k = (a + b + c + d) / 4.

35 // We can compute k as	35 // We can compute k as

36 // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1	36 // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1

37 // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2	37 // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2

38 //	38 //

39 // Then m can be written as	39 // Then m can be written as

40 // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1	40 // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1

41	41

42 // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1	42 // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1

43 #define GET_M(ij, in, out) do { \	43 #define GET_M(ij, in, out) do { \

44 const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \	44 const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \

45 const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \	45 const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \

46 const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \	46 const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \

47 const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\	47 const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\

48 const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \	48 const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \

49 (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \	49 (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \

50 } while (0)	50 } while (0)

51	51

52 // pack and store two alternating pixel rows	52 // pack and store two alternating pixel rows

53 #define PACK_AND_STORE(a, b, da, db, out) do { \	53 #define PACK_AND_STORE(a, b, da, db, out) do { \

54 const __m128i ta = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \	54 const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \

55 const __m128i tb = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \	55 const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \

56 const __m128i t1 = _mm_unpacklo_epi8(ta, tb); \	56 const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \

57 const __m128i t2 = _mm_unpackhi_epi8(ta, tb); \	57 const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \

58 _mm_store_si128(((__m128i*)(out)) + 0, t1); \	58 _mm_store_si128(((__m128i*)(out)) + 0, t_1); \

59 _mm_store_si128(((__m128i*)(out)) + 1, t2); \	59 _mm_store_si128(((__m128i*)(out)) + 1, t_2); \

60 } while (0)	60 } while (0)

61	61

62 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.	62 // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.

63 #define UPSAMPLE_32PIXELS(r1, r2, out) { \	63 #define UPSAMPLE_32PIXELS(r1, r2, out) { \

64 const __m128i one = _mm_set1_epi8(1); \	64 const __m128i one = _mm_set1_epi8(1); \

65 const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \	65 const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \

66 const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \	66 const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \

67 const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \	67 const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \

68 const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \	68 const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \

69 \	69 \

(...skipping 51 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
121 bottom_dst + ((cur_x) + n) * XSTEP); \	121 bottom_dst + ((cur_x) + n) * XSTEP); \

122 } \	122 } \

123 } \	123 } \

124 }	124 }

125	125

126 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \	126 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \

127 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \	127 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \

128 const uint8_t* top_u, const uint8_t* top_v, \	128 const uint8_t* top_u, const uint8_t* top_v, \

129 const uint8_t* cur_u, const uint8_t* cur_v, \	129 const uint8_t* cur_u, const uint8_t* cur_v, \

130 uint8_t* top_dst, uint8_t* bottom_dst, int len) { \	130 uint8_t* top_dst, uint8_t* bottom_dst, int len) { \

131 int b; \	131 int block; \

132 /* 16 byte aligned array to cache reconstructed u and v */ \	132 /* 16 byte aligned array to cache reconstructed u and v */ \

133 uint8_t uv_buf[4 * 32 + 15]; \	133 uint8_t uv_buf[4 * 32 + 15]; \

134 uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \	134 uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \

135 const int uv_len = (len + 1) >> 1; \	135 const int uv_len = (len + 1) >> 1; \

136 /* 17 pixels must be readable for each block */ \	136 /* 17 pixels must be readable for each block */ \

137 const int num_blocks = (uv_len - 1) >> 4; \	137 const int num_blocks = (uv_len - 1) >> 4; \

138 const int leftover = uv_len - num_blocks * 16; \	138 const int leftover = uv_len - num_blocks * 16; \

139 const int last_pos = 1 + 32 * num_blocks; \	139 const int last_pos = 1 + 32 * num_blocks; \

140 \	140 \

141 const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \	141 const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \

142 const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \	142 const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \

143 \	143 \

144 assert(len > 0); \	144 assert(len > 0); \

145 /* Treat the first pixel in regular way */ \	145 /* Treat the first pixel in regular way */ \

146 if (top_y) { \	146 if (top_y) { \

147 const int u0 = (top_u[0] + u_diag) >> 1; \	147 const int u0 = (top_u[0] + u_diag) >> 1; \

148 const int v0 = (top_v[0] + v_diag) >> 1; \	148 const int v0 = (top_v[0] + v_diag) >> 1; \

149 FUNC(top_y[0], u0, v0, top_dst); \	149 FUNC(top_y[0], u0, v0, top_dst); \

150 } \	150 } \

151 if (bottom_y) { \	151 if (bottom_y) { \

152 const int u0 = (cur_u[0] + u_diag) >> 1; \	152 const int u0 = (cur_u[0] + u_diag) >> 1; \

153 const int v0 = (cur_v[0] + v_diag) >> 1; \	153 const int v0 = (cur_v[0] + v_diag) >> 1; \

154 FUNC(bottom_y[0], u0, v0, bottom_dst); \	154 FUNC(bottom_y[0], u0, v0, bottom_dst); \

155 } \	155 } \

156 \	156 \

157 for (b = 0; b < num_blocks; ++b) { \	157 for (block = 0; block < num_blocks; ++block) { \

158 UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \	158 UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \

159 UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \	159 UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \

160 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \	160 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \

161 32 * b + 1, 32) \	161 32 * block + 1, 32) \

162 top_u += 16; \	162 top_u += 16; \

163 cur_u += 16; \	163 cur_u += 16; \

164 top_v += 16; \	164 top_v += 16; \

165 cur_v += 16; \	165 cur_v += 16; \

166 } \	166 } \

167 \	167 \

168 UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \	168 UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \

169 UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \	169 UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \

170 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \	170 CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \

171 last_pos, len - last_pos); \	171 last_pos, len - last_pos); \

172 }	172 }

173	173

174 // SSE2 variants of the fancy upsampler.	174 // SSE2 variants of the fancy upsampler.

175 SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)	175 SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3)

176 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)	176 SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3)

177 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)	177 SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4)

178 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)	178 SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)

179	179

180 #undef GET_M	180 #undef GET_M

181 #undef PACK_AND_STORE	181 #undef PACK_AND_STORE

182 #undef UPSAMPLE_32PIXELS	182 #undef UPSAMPLE_32PIXELS

183 #undef UPSAMPLE_LAST_BLOCK	183 #undef UPSAMPLE_LAST_BLOCK

184 #undef CONVERT2RGB	184 #undef CONVERT2RGB

185 #undef SSE2_UPSAMPLE_FUNC	185 #undef SSE2_UPSAMPLE_FUNC

186	186

	187 #endif // FANCY_UPSAMPLING

	188

	189 #endif // WEBP_USE_SSE2

	190

187 //------------------------------------------------------------------------------	191 //------------------------------------------------------------------------------

188	192

189 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];	193 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];

190	194

191 void WebPInitUpsamplersSSE2(void) {	195 void WebPInitUpsamplersSSE2(void) {

	196 #if defined(WEBP_USE_SSE2)

192 WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;	197 WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2;

193 WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;	198 WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;

194 WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;	199 WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2;

195 WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;	200 WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2;

	201 #endif // WEBP_USE_SSE2

196 }	202 }

197	203

198 void WebPInitPremultiplySSE2(void) {	204 void WebPInitPremultiplySSE2(void) {

	205 #if defined(WEBP_USE_SSE2)

199 WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;	206 WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePairSSE2;

200 WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;	207 WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePairSSE2;

	208 #endif // WEBP_USE_SSE2

201 }	209 }

202	210

203 #endif // FANCY_UPSAMPLING

204

205 #if defined(__cplusplus) || defined(c_plusplus)	211 #if defined(__cplusplus) || defined(c_plusplus)

206 } // extern "C"	212 } // extern "C"

207 #endif	213 #endif

208	214

209 #endif // WEBP_USE_SSE2	215

OLD	NEW

« third_party/libwebp/dsp/dec_neon.c ('K') | « third_party/libwebp/dsp/upsampling_neon.c ('k') | third_party/libwebp/dsp/yuv.h » ('j') | no next file with comments »