src/core/SkHalf.h - Issue 2145663003: Expand _01 half<->float limitation to _finite. Simplify.

Side by Side Diff: src/core/SkHalf.h

Issue 2145663003: Expand _01 half<->float limitation to _finite. Simplify. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: typo Created 4 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 Google Inc.	2 * Copyright 2014 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkHalf_DEFINED	8 #ifndef SkHalf_DEFINED

9 #define SkHalf_DEFINED	9 #define SkHalf_DEFINED

10	10

11 #include "SkNx.h"	11 #include "SkNx.h"

12 #include "SkTypes.h"	12 #include "SkTypes.h"

13	13

14 // 16-bit floating point value	14 // 16-bit floating point value

15 // format is 1 bit sign, 5 bits exponent, 10 bits mantissa	15 // format is 1 bit sign, 5 bits exponent, 10 bits mantissa

16 // only used for storage	16 // only used for storage

17 typedef uint16_t SkHalf;	17 typedef uint16_t SkHalf;

18	18

19 #define SK_HalfMin 0x0400 // 2^-24 (minimum positive normal value)	19 #define SK_HalfMin 0x0400 // 2^-24 (minimum positive normal value)

20 #define SK_HalfMax 0x7bff // 65504	20 #define SK_HalfMax 0x7bff // 65504

21 #define SK_HalfEpsilon 0x1400 // 2^-10	21 #define SK_HalfEpsilon 0x1400 // 2^-10

22	22

23 // convert between half and single precision floating point	23 // convert between half and single precision floating point

24 float SkHalfToFloat(SkHalf h);	24 float SkHalfToFloat(SkHalf h);

25 SkHalf SkFloatToHalf(float f);	25 SkHalf SkFloatToHalf(float f);

26	26

27 // Convert between half and single precision floating point, but pull any dirty	27 // Convert between half and single precision floating point,

28 // trick we can to make it faster as long as it's correct enough for values in [ 0,1].	28 // assuming inputs and outputs are both finite.

29 static inline Sk4f SkHalfToFloat_01(uint64_t);	29 static inline Sk4f SkHalfToFloat_finite(uint64_t);

30 static inline uint64_t SkFloatToHalf_01(const Sk4f&);	30 static inline uint64_t SkFloatToHalf_finite(const Sk4f&);

31	31

32 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //	32 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //

33	33

34 // Like the serial versions in SkHalf.cpp, these are based on	34 // Like the serial versions in SkHalf.cpp, these are based on

35 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/	35 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/

36	36

37 // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.	37 // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.

38	38

39 static inline Sk4f SkHalfToFloat_01(uint64_t hs) {	39 static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {

40 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)	40 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)

41 float32x4_t fs;	41 float32x4_t fs;

42 asm ("fmov %d[fs], %[hs] \n" // vcreate_f16(hs)	42 asm ("fmov %d[fs], %[hs] \n" // vcreate_f16(hs)

43 "fcvtl %[fs].4s, %[fs].4h \n" // vcvt_f32_f16(...)	43 "fcvtl %[fs].4s, %[fs].4h \n" // vcvt_f32_f16(...)

44 : [fs] "=w" (fs) // =w: write-only NEON register	44 : [fs] "=w" (fs) // =w: write-only NEON register

45 : [hs] "r" (hs)); // r: read-only 64-bit general register	45 : [hs] "r" (hs)); // r: read-only 64-bit general register

46 return fs;	46 return fs;

	47 #else

	48 // Expand the halfs up to 32 bits each, and strip off the sign bit.

	49 Sk4i positive = SkNx_cast<int>(Sk4h::Load(&hs)),
	msarett 2016/07/13 22:07:05 nit: I found this code block confusing because "positive" is defined before the sign bit is stripped. This would be clearer: Sk4i positive = SkNx_cast<int>(Sk4h::Load(&hs)) & 0x00007FFF; But I guess since you use "sign" down below this doesn't save anything (and uses another constant).
	50 sign = positive & 0x00008000;

	51 positive ^= sign;

47	52

48 #elif !defined(SKNX_NO_SIMD) && defined(SK_ARM_HAS_NEON)	53 // For normal half floats, align the exponent/mantissa line and rebias the exponent.
	msarett 2016/07/13 22:07:05 This is the simplest part, and still, this code is really complicated! Can we try to make it clearer? Ex: static constexpr int kF32MantissaBits = 23; static constexpr int kF32Bias = 127; static constexpr int kF16MantissaBits = 10; static constexpr int kF16Bias = 15; // Align the f16 exponent/mantissa line with the f32 exponent/mantissa line x = x << (kF32MantissaBits - kF16MantissaBits); // Rebias the exponent x = x + ((kF32Bias - kF16Bias) << kF32MantissaBits)
49 // NEON makes this pretty easy:	54 Sk4i norm = (positive << 13) + (112<<23);

50 // - denormals are 10-bit * 2^-14 == 24-bit fixed point;

51 // - handle normals the same way as in SSE: align mantissa, then rebias ex ponent.

52 uint32x4_t h = vmovl_u16(vcreate_u16(hs)),

53 is_denorm = vcltq_u32(h, vdupq_n_u32(1<<10));

54 float32x4_t denorm = vcvtq_n_f32_u32(h, 24),

55 norm = vreinterpretq_f32_u32(vaddq_u32(vshlq_n_u32(h, 13),

56 vdupq_n_u32((127-15) << 23)));

57 return vbslq_f32(is_denorm, denorm, norm);

58	55

59 #elif !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	56 // For denorm half floats, mask in a value with the right exponent for 2^-14 ,
	msarett 2016/07/13 22:07:05 I think the comment that would have made things much clearer for me is about why it's ok to put the exponent bits of the float16 in the mantissa of the float32. msarett 2016/07/14 12:39:23 On 2016/07/13 22:07:05, msarett wrote: > I think the comment that would have made things much clearer for me is about why > it's ok to put the exponent bits of the float16 in the mantissa of the float32. Oh duh the exponent bits are all zero.
60 // If our input is a normal 16-bit float, things are pretty easy:	57 // then subtract it off as a float. This leaves just our original fraction.
	msarett 2016/07/14 12:59:50 // Desired exponent is 2^-14 because that is the exponent on denormal half floats (-bias + 1)
61 // - shift left by 13 to put the mantissa in the right place;	58 const Sk4i denorm_fixup = 126<<23;
	msarett 2016/07/14 12:59:50 // Because the bias is 127 this is an exponent of 2^-1. But the mantissa is also shifted right by 13, so we really have 2^-14.
62 // - the exponent is wrong, but it just needs to be rebiased;	59 Sk4i denorm = positive \| denorm_fixup;
	msarett 2016/07/13 22:07:05 nit: Confused by the variable name. It's not "denorm" here. It's just a bunch of bits. I guess this is the same style pattern as above. It's not my personal preference, but I don't feel too strongly.
63 // - re-bias the exponent from 15-bias to 127-bias by adding (127-15).	60 Sk4f denorm_f = Sk4f::Load(&denorm) - Sk4f::Load(&denorm_fixup);
	msarett 2016/07/14 12:59:50 // ((1 * 2^-1) + value) - (1 * 2^-1) = value
	61 denorm = Sk4i::Load(&denorm_f);

64	62

65 // If our input is denormalized, we're going to do the same steps, plus a fe w more fix ups:	63 Sk4i is_denorm = positive < (1<<10); // Exponent == 0?

66 // - the input is h = K*2^-14, for some 10-bit fixed point K in [0,1);	64 Sk4i merged = (sign << 16) \| is_denorm.thenElse(denorm, norm);

67 // - by shifting left 13 and adding (127-15) to the exponent, we construct ed the float value	65 return Sk4f::Load(&merged);

68 // 2^-15*(1+K);

69 // - we'd need to subtract 2^-15 and multiply by 2 to get back to K*2^-14, or equivallently

70 // multiply by 2 then subtract 2^-14.

71 //

72 // - We'll work that multiply by 2 into the rebias, by adding 1 more to th e exponent.

73 // - Conveniently, this leaves that rebias constant 2^-14, exactly what we want to subtract.

74

75 __m128i h = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i*)&hs), _mm_set zero_si128());

76 const __m128i is_denorm = _mm_cmplt_epi32(h, _mm_set1_epi32(1<<10));

77

78 __m128i rebias = _mm_set1_epi32((127-15) << 23);

79 rebias = _mm_add_epi32(rebias, _mm_and_si128(is_denorm, _mm_set1_epi32(1<<23 )));

80

81 __m128i f = _mm_add_epi32(_mm_slli_epi32(h, 13), rebias);

82 return _mm_sub_ps(_mm_castsi128_ps(f),

83 _mm_castsi128_ps(_mm_and_si128(is_denorm, rebias)));

84 #else

85 float fs[4];

86 for (int i = 0; i < 4; i++) {

87 fs[i] = SkHalfToFloat(hs >> (i*16));

88 }

89 return Sk4f::Load(fs);

90 #endif	66 #endif

91 }	67 }

92	68

93 static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {	69 static inline uint64_t SkFloatToHalf_finite(const Sk4f& fs) {

94 uint64_t r;	70 uint64_t r;

95 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)	71 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)

96 float32x4_t vec = fs.fVec;	72 float32x4_t vec = fs.fVec;

97 asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec)	73 asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec)

98 "fmov %[r], %d[vec] \n" // vst1_f16(&r, ...)	74 "fmov %[r], %d[vec] \n" // vst1_f16(&r, ...)

99 : [r] "=r" (r) // =r: write-only 64-bit general register	75 : [r] "=r" (r) // =r: write-only 64-bit general register

100 , [vec] "+w" (vec)); // +w: read-write NEON register	76 , [vec] "+w" (vec)); // +w: read-write NEON register

	77 #else

	78 // Strip the sign bit from each float.

	79 Sk4i positive = Sk4i::Load(&fs),

	80 sign = positive & 0x80000000;

	81 positive ^= sign;

101	82

102 // TODO: ARMv7 NEON float->half?	83 // Whether we'll produce normal or denorm half float results, either

	84 // way we just invert the logic from SkHalfToFloat_finite() above.

	85 Sk4i norm = (positive - (112<<23)) >> 13;
	msarett 2016/07/14 12:59:50 nit: Still think this is clearer with constants. // What happens when the exponent is less than 112? It'll be a denormal half-float so it doesn't matter anyway?
103	86

104 #elif !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	87 const Sk4i denorm_fixup = 126<<23;
	msarett 2016/07/13 22:07:05 Haven't looked here yet...
105 // Scale down from 127-bias to 15-bias, then cut off bottom 13 mantissa bits .	88 Sk4f denorm_f = Sk4f::Load(&positive) + Sk4f::Load(&denorm_fixup);
	msarett 2016/07/14 12:59:50 // (1 * 2^-1) + small float effectively shifts the small float down to the bottom ten bits of the mantissa.
106 // This doesn't round, so it can be 1 bit too small.	89 Sk4i denorm = Sk4i::Load(&denorm_f) ^ denorm_fixup;
	msarett 2016/07/14 12:59:50 Cool this saves us a mask. // Mask away the exponent bits.
107 const __m128 rebias = _mm_castsi128_ps(_mm_set1_epi32((127 - (127-15)) << 23 ));

108 __m128i h = _mm_srli_epi32(_mm_castps_si128(_mm_mul_ps(fs.fVec, rebias)), 13 );

109 _mm_storel_epi64((__m128i*)&r, _mm_packs_epi32(h,h));

110	90

111 #else	91 Sk4i will_be_denorm = positive < ((127-14) << 23);

112 SkHalf hs[4];	92 Sk4i merged = (sign >> 16) \| will_be_denorm.thenElse(denorm, norm);

113 for (int i = 0; i < 4; i++) {	93 SkNx_cast<uint16_t>(merged).store(&r);

114 hs[i] = SkFloatToHalf(fs[i]);

115 }

116 r = (uint64_t)hs[3] << 48

117 \| (uint64_t)hs[2] << 32

118 \| (uint64_t)hs[1] << 16

119 \| (uint64_t)hs[0] << 0;

120 #endif	94 #endif

121 return r;	95 return r;

122 }	96 }

123	97

124 #endif	98 #endif

OLD	NEW

« no previous file with comments | « src/core/SkBitmap.cpp ('k') | src/core/SkLinearBitmapPipeline_sample.h » ('j') | src/opts/SkNx_sse.h » ('J')