src/core/SkHalf.h - Issue 2184753002: Add Sk4h_load4 for loading F16.

Side by Side Diff: src/core/SkHalf.h

Issue 2184753002: Add Sk4h_load4 for loading F16. (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: typo Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« no previous file with comments | « no previous file | src/core/SkNx.h » ('j') | src/opts/SkNx_sse.h » ('J')

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 Google Inc.	2 * Copyright 2014 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkHalf_DEFINED	8 #ifndef SkHalf_DEFINED

9 #define SkHalf_DEFINED	9 #define SkHalf_DEFINED

10	10

(...skipping 19 matching lines...) Expand all Loading...
30 static inline Sk4f SkHalfToFloat_finite(uint64_t);	30 static inline Sk4f SkHalfToFloat_finite(uint64_t);

31 static inline Sk4h SkFloatToHalf_finite(const Sk4f&);	31 static inline Sk4h SkFloatToHalf_finite(const Sk4f&);

32	32

33 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //	33 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //

34	34

35 // Like the serial versions in SkHalf.cpp, these are based on	35 // Like the serial versions in SkHalf.cpp, these are based on

36 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/	36 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/

37	37

38 // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.	38 // GCC 4.9 lacks the intrinsics to use ARMv8 f16<->f32 instructions, so we use inline assembly.

39	39

40 static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {	40 static inline Sk4f SkHalfToFloat_finite(const Sk4h& hs) {

41 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)	41 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)

42 float32x4_t fs;	42 float32x4_t fs;

43 asm ("fmov %d[fs], %[hs] \n" // vcreate_f16(hs)	43 asm ("fcvtl %[fs].4s, %[hs].4h \n" // vcvt_f32_f16(...)

44 "fcvtl %[fs].4s, %[fs].4h \n" // vcvt_f32_f16(...)

45 : [fs] "=w" (fs) // =w: write-only NEON register	44 : [fs] "=w" (fs) // =w: write-only NEON register

46 : [hs] "r" (hs)); // r: read-only 64-bit general register	45 : [hs] "w" (hs.fVec)); // w: read-only NEON register

47 return fs;	46 return fs;

48 #else	47 #else

49 Sk4i bits = SkNx_cast<int>(Sk4h::Load(&hs)), // Expand to 32 bit.	48 Sk4i bits = SkNx_cast<int>(hs), // Expand to 32 bit.

50 sign = bits & 0x00008000, // Save the sign bit for later...	49 sign = bits & 0x00008000, // Save the sign bit for later...

51 positive = bits ^ sign, // ...but strip it off for now.	50 positive = bits ^ sign, // ...but strip it off for now.

52 is_denorm = positive < (1<<10); // Exponent == 0?	51 is_denorm = positive < (1<<10); // Exponent == 0?

53	52

54 // For normal half floats, extend the mantissa by 13 zero bits,	53 // For normal half floats, extend the mantissa by 13 zero bits,

55 // then adjust the exponent from 15 bias to 127 bias.	54 // then adjust the exponent from 15 bias to 127 bias.

56 Sk4i norm = (positive << 13) + ((127 - 15) << 23);	55 Sk4i norm = (positive << 13) + ((127 - 15) << 23);

57	56

58 // For denorm half floats, mask in the exponent-only float K that turns our	57 // For denorm half floats, mask in the exponent-only float K that turns our

59 // denorm value V*2^-14 into a normalized float K + V*2^-14. Then subtract off K.	58 // denorm value V*2^-14 into a normalized float K + V*2^-14. Then subtract off K.

60 const Sk4i K = ((127-15) + (23-10) + 1) << 23;	59 const Sk4i K = ((127-15) + (23-10) + 1) << 23;

61 Sk4i mask_K = positive | K;	60 Sk4i mask_K = positive | K;

62 Sk4f denorm = Sk4f::Load(&mask_K) - Sk4f::Load(&K);	61 Sk4f denorm = Sk4f::Load(&mask_K) - Sk4f::Load(&K);

63	62

64 Sk4i merged = (sign << 16) | is_denorm.thenElse(Sk4i::Load(&denorm), norm);	63 Sk4i merged = (sign << 16) | is_denorm.thenElse(Sk4i::Load(&denorm), norm);

65 return Sk4f::Load(&merged);	64 return Sk4f::Load(&merged);

66 #endif	65 #endif

67 }	66 }

68	67

	68 static inline Sk4f SkHalfToFloat_finite(uint64_t hs) {

	69 return SkHalfToFloat_finite(Sk4h::Load(&hs));

	70 }

	71

69 static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {	72 static inline Sk4h SkFloatToHalf_finite(const Sk4f& fs) {

70 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)	73 #if !defined(SKNX_NO_SIMD) && defined(SK_CPU_ARM64)

71 float32x4_t vec = fs.fVec;	74 float32x4_t vec = fs.fVec;

72 asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec)	75 asm ("fcvtn %[vec].4h, %[vec].4s \n" // vcvt_f16_f32(vec)

73 : [vec] "+w" (vec)); // +w: read-write NEON register	76 : [vec] "+w" (vec)); // +w: read-write NEON register

74 return vreinterpret_u16_f32(vget_low_f32(vec));	77 return vreinterpret_u16_f32(vget_low_f32(vec));

75 #else	78 #else

76 Sk4i bits = Sk4i::Load(&fs),	79 Sk4i bits = Sk4i::Load(&fs),

77 sign = bits & 0x80000000, // Save the sign bit for later...	80 sign = bits & 0x80000000, // Save the sign bit for later...

78 positive = bits ^ sign, // ...but strip it off for now.	81 positive = bits ^ sign, // ...but strip it off for now.

79 will_be_denorm = positive < ((127-15+1) << 23); // positive < smallest normal half?	82 will_be_denorm = positive < ((127-15+1) << 23); // positive < smallest normal half?

80	83

81 // For normal half floats, adjust the exponent from 127 bias to 15 bias,	84 // For normal half floats, adjust the exponent from 127 bias to 15 bias,

82 // then drop the bottom 13 mantissa bits.	85 // then drop the bottom 13 mantissa bits.

83 Sk4i norm = (positive - ((127 - 15) << 23)) >> 13;	86 Sk4i norm = (positive - ((127 - 15) << 23)) >> 13;

84	87

85 // This mechanically inverts the denorm half -> normal float conversion above.	88 // This mechanically inverts the denorm half -> normal float conversion above.

86 // Knowing that and reading its explanation will leave you feeling more confident	89 // Knowing that and reading its explanation will leave you feeling more confident

87 // than reading my best attempt at explaining this directly.	90 // than reading my best attempt at explaining this directly.

88 const Sk4i K = ((127-15) + (23-10) + 1) << 23;	91 const Sk4i K = ((127-15) + (23-10) + 1) << 23;

89 Sk4f plus_K = Sk4f::Load(&positive) + Sk4f::Load(&K);	92 Sk4f plus_K = Sk4f::Load(&positive) + Sk4f::Load(&K);

90 Sk4i denorm = Sk4i::Load(&plus_K) ^ K;	93 Sk4i denorm = Sk4i::Load(&plus_K) ^ K;

91	94

92 Sk4i merged = (sign >> 16) | will_be_denorm.thenElse(denorm, norm);	95 Sk4i merged = (sign >> 16) | will_be_denorm.thenElse(denorm, norm);

93 return SkNx_cast<uint16_t>(merged);	96 return SkNx_cast<uint16_t>(merged);

94 #endif	97 #endif

95 }	98 }

96	99

97 #endif	100 #endif

OLD	NEW