OLD | NEW |
1 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { | 1 inline SkPMFloat& SkPMFloat::operator=(const SkPMFloat& that) { |
2 fColors = that.fColors; | 2 fColors = that.fColors; |
3 return *this; | 3 return *this; |
4 } | 4 } |
5 | 5 |
6 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit compo
nents in 32 bits | 6 // For SkPMFloat(SkPMColor), we widen our 8 bit components (fix8) to 8-bit compo
nents in 32 bits |
7 // (fix8_32), then convert those to floats. | 7 // (fix8_32), then convert those to floats. |
8 | 8 |
9 // get() does the opposite, working from floats to 8-bit-in-32-bits, then back t
o packed 8 bit. | 9 // get() does the opposite, working from floats to 8-bit-in-32-bits, then back t
o packed 8 bit. |
10 | 10 |
11 // clamped() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bi
t, with | 11 // clamped() is the same as _SSE2: floats to 8-in-32, to 8-in-16, to packed 8 bi
t, with |
12 // _mm_packus_epi16() both clamping and narrowing. | 12 // _mm_packus_epi16() both clamping and narrowing. |
13 | 13 |
14 inline SkPMFloat::SkPMFloat(SkPMColor c) { | 14 inline SkPMFloat::SkPMFloat(SkPMColor c) { |
15 SkPMColorAssert(c); | 15 SkPMColorAssert(c); |
16 const int _ = 255; // _ means to zero that byte. | 16 const int _ = 255; // _ means to zero that byte. |
17 __m128i fix8 = _mm_set_epi32(0,0,0,c), | 17 __m128i fix8 = _mm_set_epi32(0,0,0,c), |
18 fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,
_,1, _,_,_,0)); | 18 fix8_32 = _mm_shuffle_epi8(fix8, _mm_set_epi8(_,_,_,3, _,_,_,2, _,_,
_,1, _,_,_,0)); |
19 fColors = _mm_cvtepi32_ps(fix8_32); | 19 fColors = _mm_cvtepi32_ps(fix8_32); |
20 SkASSERT(this->isValid()); | 20 SkASSERT(this->isValid()); |
21 } | 21 } |
22 | 22 |
23 inline SkPMColor SkPMFloat::get() const { | 23 inline SkPMColor SkPMFloat::get() const { |
24 SkASSERT(this->isValid()); | 24 SkASSERT(this->isValid()); |
25 const int _ = 255; // _ means to zero that byte. | 25 const int _ = 255; // _ means to zero that byte. |
26 __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for u
s! | 26 // We don't use _mm_cvtps_epi32, because we want precise control over how 0.
5 rounds (up). |
| 27 __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)), |
27 fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _
,_,_,_, 12,8,4,0)); | 28 fix8 = _mm_shuffle_epi8(fix8_32, _mm_set_epi8(_,_,_,_, _,_,_,_, _
,_,_,_, 12,8,4,0)); |
28 SkPMColor c = _mm_cvtsi128_si32(fix8); | 29 SkPMColor c = _mm_cvtsi128_si32(fix8); |
29 SkPMColorAssert(c); | 30 SkPMColorAssert(c); |
30 return c; | 31 return c; |
31 } | 32 } |
32 | 33 |
33 inline SkPMColor SkPMFloat::clamped() const { | 34 inline SkPMColor SkPMFloat::clamped() const { |
34 __m128i fix8_32 = _mm_cvtps_epi32(fColors), // _mm_cvtps_epi32 rounds for u
s! | 35 // We don't use _mm_cvtps_epi32, because we want precise control over how 0.
5 rounds (up). |
| 36 __m128i fix8_32 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), fColors)), |
35 fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), | 37 fix8_16 = _mm_packus_epi16(fix8_32, fix8_32), |
36 fix8 = _mm_packus_epi16(fix8_16, fix8_16); | 38 fix8 = _mm_packus_epi16(fix8_16, fix8_16); |
37 SkPMColor c = _mm_cvtsi128_si32(fix8); | 39 SkPMColor c = _mm_cvtsi128_si32(fix8); |
38 SkPMColorAssert(c); | 40 SkPMColorAssert(c); |
39 return c; | 41 return c; |
40 } | 42 } |
41 | 43 |
42 inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors
[4]) { | 44 inline void SkPMFloat::From4PMColors(SkPMFloat floats[4], const SkPMColor colors
[4]) { |
43 // Haven't beaten this yet. | 45 // Haven't beaten this yet. |
44 for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } | 46 for (int i = 0; i < 4; i++) { floats[i] = FromPMColor(colors[i]); } |
45 } | 47 } |
46 | 48 |
47 inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
]) { | 49 inline void SkPMFloat::To4PMColors(SkPMColor colors[4], const SkPMFloat floats[4
]) { |
48 // Haven't beaten this yet. Still faster than ClampTo4PMColors too. | 50 // Haven't beaten this yet. Still faster than ClampTo4PMColors too. |
49 for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } | 51 for (int i = 0; i < 4; i++) { colors[i] = floats[i].get(); } |
50 } | 52 } |
51 | 53 |
52 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat flo
ats[4]) { | 54 inline void SkPMFloat::ClampTo4PMColors(SkPMColor colors[4], const SkPMFloat flo
ats[4]) { |
53 // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses
8. | 55 // Same as _SSE2.h's. We use 3 _mm_packus_epi16() where the naive loop uses
8. |
54 __m128i c0 = _mm_cvtps_epi32(floats[0].fColors), // _mm_cvtps_epi32 rounds
for us! | 56 // We don't use _mm_cvtps_epi32, because we want precise control over how 0.
5 rounds (up). |
55 c1 = _mm_cvtps_epi32(floats[1].fColors), | 57 __m128i c0 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[0].fColor
s)), |
56 c2 = _mm_cvtps_epi32(floats[2].fColors), | 58 c1 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[1].fColor
s)), |
57 c3 = _mm_cvtps_epi32(floats[3].fColors); | 59 c2 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[2].fColor
s)), |
| 60 c3 = _mm_cvttps_epi32(_mm_add_ps(_mm_set1_ps(0.5f), floats[3].fColor
s)); |
58 __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), | 61 __m128i c3210 = _mm_packus_epi16(_mm_packus_epi16(c0, c1), |
59 _mm_packus_epi16(c2, c3)); | 62 _mm_packus_epi16(c2, c3)); |
60 _mm_storeu_si128((__m128i*)colors, c3210); | 63 _mm_storeu_si128((__m128i*)colors, c3210); |
61 SkPMColorAssert(colors[0]); | 64 SkPMColorAssert(colors[0]); |
62 SkPMColorAssert(colors[1]); | 65 SkPMColorAssert(colors[1]); |
63 SkPMColorAssert(colors[2]); | 66 SkPMColorAssert(colors[2]); |
64 SkPMColorAssert(colors[3]); | 67 SkPMColorAssert(colors[3]); |
65 } | 68 } |
OLD | NEW |