src/opts/SkColor_opts_SSE2.h - Issue 724333003: Optimize SkAlphaMulQ_SSE2

Side by Side Diff: src/opts/SkColor_opts_SSE2.h

Issue 724333003: Optimize SkAlphaMulQ_SSE2 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: undef ASSERT_EQ at the end of file Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 The Android Open Source Project	2 * Copyright 2014 The Android Open Source Project

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkColor_opts_SSE2_DEFINED	8 #ifndef SkColor_opts_SSE2_DEFINED

9 #define SkColor_opts_SSE2_DEFINED	9 #define SkColor_opts_SSE2_DEFINED

10	10

11 #include <emmintrin.h>	11 #include <emmintrin.h>

12	12

	13 #define ASSERT_EQ(a,b) SkASSERT(0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8((a), (b))))

	14

13 // Because no _mm_mul_epi32() in SSE2, we emulate it here.	15 // Because no _mm_mul_epi32() in SSE2, we emulate it here.

14 // Multiplies 4 32-bit integers from a by 4 32-bit intergers from b.	16 // Multiplies 4 32-bit integers from a by 4 32-bit intergers from b.

15 // The 4 multiplication results should be represented within 32-bit	17 // The 4 multiplication results should be represented within 32-bit

16 // integers, otherwise they would be overflow.	18 // integers, otherwise they would be overflow.

17 static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {	19 static inline __m128i Multiply32_SSE2(const __m128i& a, const __m128i& b) {

18 // Calculate results of a0 * b0 and a2 * b2.	20 // Calculate results of a0 * b0 and a2 * b2.

19 __m128i r1 = _mm_mul_epu32(a, b);	21 __m128i r1 = _mm_mul_epu32(a, b);

20 // Calculate results of a1 * b1 and a3 * b3.	22 // Calculate results of a1 * b1 and a3 * b3.

21 __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));	23 __m128i r2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));

22 // Shuffle results to [63..0] and interleave the results.	24 // Shuffle results to [63..0] and interleave the results.

(...skipping 22 matching lines...) Expand all Loading...
45 __m128i mask = _mm_set1_epi32(0xFF00FF);	47 __m128i mask = _mm_set1_epi32(0xFF00FF);

46 __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);	48 __m128i s = _mm_or_si128(_mm_slli_epi32(scale, 16), scale);

47	49

48 // uint32_t rb = ((c & mask) * scale) >> 8	50 // uint32_t rb = ((c & mask) * scale) >> 8

49 __m128i rb = _mm_and_si128(mask, c);	51 __m128i rb = _mm_and_si128(mask, c);

50 rb = _mm_mullo_epi16(rb, s);	52 rb = _mm_mullo_epi16(rb, s);

51 rb = _mm_srli_epi16(rb, 8);	53 rb = _mm_srli_epi16(rb, 8);

52	54

53 // uint32_t ag = ((c >> 8) & mask) * scale	55 // uint32_t ag = ((c >> 8) & mask) * scale

54 __m128i ag = _mm_srli_epi16(c, 8);	56 __m128i ag = _mm_srli_epi16(c, 8);

55 ag = _mm_and_si128(ag, mask);	57 ASSERT_EQ(ag, _mm_and_si128(mask, ag)); // ag = _mm_srli_epi16(c, 8) did this for us.

56 ag = _mm_mullo_epi16(ag, s);	58 ag = _mm_mullo_epi16(ag, s);

57	59

58 // (rb & mask) | (ag & ~mask)	60 // (rb & mask) | (ag & ~mask)

59 rb = _mm_and_si128(mask, rb);	61 ASSERT_EQ(rb, _mm_and_si128(mask, rb)); // rb = _mm_srli_epi16(rb, 8) did this for us.

60 ag = _mm_andnot_si128(mask, ag);	62 ag = _mm_andnot_si128(mask, ag);

61 return _mm_or_si128(rb, ag);	63 return _mm_or_si128(rb, ag);

62 }	64 }

63	65

64 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {	66 static inline __m128i SkGetPackedA32_SSE2(const __m128i& src) {

65 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));	67 __m128i a = _mm_slli_epi32(src, (24 - SK_A32_SHIFT));

66 return _mm_srli_epi32(a, 24);	68 return _mm_srli_epi32(a, 24);

67 }	69 }

68	70

69 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {	71 static inline __m128i SkGetPackedR32_SSE2(const __m128i& src) {

(...skipping 106 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
176 SK_B32_SHIFT + (8 - SK_B16_BITS));	178 SK_B32_SHIFT + (8 - SK_B16_BITS));

177 b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));	179 b2 = _mm_and_si128(b2, _mm_set1_epi32(SK_B16_MASK));

178 __m128i b = _mm_packs_epi32(b1, b2);	180 __m128i b = _mm_packs_epi32(b1, b2);

179	181

180 // Store 8 16-bit colors in dst.	182 // Store 8 16-bit colors in dst.

181 __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);	183 __m128i d_pixel = SkPackRGB16_SSE2(r, g, b);

182	184

183 return d_pixel;	185 return d_pixel;

184 }	186 }

185	187

	188 #undef ASSERT_EQ

186 #endif // SkColor_opts_SSE2_DEFINED	189 #endif // SkColor_opts_SSE2_DEFINED

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »