src/core/SkHalf.h - Issue 1688233002: new version of SkHalfToFloat_01

Side by Side Diff: src/core/SkHalf.h

Issue 1688233002: new version of SkHalfToFloat_01 (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: swap cast order, neon on the brain Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 /*

2 * Copyright 2014 Google Inc.	2 * Copyright 2014 Google Inc.

3 *	3 *

4 * Use of this source code is governed by a BSD-style license that can be	4 * Use of this source code is governed by a BSD-style license that can be

5 * found in the LICENSE file.	5 * found in the LICENSE file.

6 */	6 */

7	7

8 #ifndef SkHalf_DEFINED	8 #ifndef SkHalf_DEFINED

9 #define SkHalf_DEFINED	9 #define SkHalf_DEFINED

10	10

(...skipping 19 matching lines...) Expand all Loading...
30 static inline uint64_t SkFloatToHalf_01(const Sk4f&);	30 static inline uint64_t SkFloatToHalf_01(const Sk4f&);

31	31

32 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //	32 // ~~~~~~~~~~~ impl ~~~~~~~~~~~~~~ //

33	33

34 // Like the serial versions in SkHalf.cpp, these are based on	34 // Like the serial versions in SkHalf.cpp, these are based on

35 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/	35 // https://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/

36	36

37 // TODO: NEON versions	37 // TODO: NEON versions

38 static inline Sk4f SkHalfToFloat_01(uint64_t hs) {	38 static inline Sk4f SkHalfToFloat_01(uint64_t hs) {

39 #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	39 #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

40 // Load our 16-bit floats into the bottom 16 bits of each 32-bit lane, with zeroes on top.	40 // If our input is a normal 16-bit float, things are pretty easy:

	41 // - shift left by 13 to put the mantissa in the right place;

	42 // - the exponent is wrong, but it just needs to be rebiased;

	43 // - re-bias the exponent from 15-bias to 127-bias by adding (127-15).

	44

	45 // If our input is denormalized, we're going to do the same steps, plus a few more fix ups:

	46 // - the input is h = K*2^-14, for some 10-bit fixed point K in [0,1);

	47 // - by shifting left 13 and adding (127-15) to the exponent, we constructed the float value

	48 // 2^-15*(1+K);

	49 // - we'd need to subtract 2^-15 and multiply by 2 to get back to K*2^-14, or equivalently

	50 // multiply by 2 then subtract 2^-14.

	51 //

	52 // - We'll work that multiply by 2 into the rebias, by adding 1 more to the exponent.

	53 // - Conveniently, this leaves that rebias constant 2^-14, exactly what we want to subtract.

	54

41 __m128i h = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i*)&hs), _mm_setzero_si128());	55 __m128i h = _mm_unpacklo_epi16(_mm_loadl_epi64((const __m128i*)&hs), _mm_setzero_si128());

	56 const __m128i is_denorm = _mm_cmplt_epi32(h, _mm_set1_epi32(1<<10));

42	57

43 // Fork into two paths, depending on whether the 16-bit float is denormalized.	58 __m128i rebias = _mm_set1_epi32((127-15) << 23);

44 __m128 is_denorm = _mm_castsi128_ps(_mm_cmplt_epi32(h, _mm_set1_epi32(0x0400)));	59 rebias = _mm_add_epi32(rebias, _mm_and_si128(is_denorm, _mm_set1_epi32(1<<23)));

45	60

46 // TODO: figure out, explain	61 __m128i f = _mm_add_epi32(_mm_slli_epi32(h, 13), rebias);

47 const __m128 half = _mm_set1_ps(0.5f);	62 return _mm_sub_ps(_mm_castsi128_ps(f),

48 __m128 denorm = _mm_sub_ps(_mm_or_ps(_mm_castsi128_ps(h), half), half);	63 _mm_castsi128_ps(_mm_and_si128(is_denorm, rebias)));

49

50 // If we're normalized, just shift ourselves so the exponent/mantissa dividing line

51 // is correct, then re-bias the exponent from 15 to 127.

52 __m128 norm = _mm_castsi128_ps(_mm_add_epi32(_mm_slli_epi32(h, 13),

53 _mm_set1_epi32((127-15) << 23)));

54

55 return _mm_or_ps(_mm_and_ps (is_denorm, denorm),

56 _mm_andnot_ps(is_denorm, norm));

57 #else	64 #else

58 float fs[4];	65 float fs[4];

59 for (int i = 0; i < 4; i++) {	66 for (int i = 0; i < 4; i++) {

60 fs[i] = SkHalfToFloat(hs >> (i*16));	67 fs[i] = SkHalfToFloat(hs >> (i*16));

61 }	68 }

62 return Sk4f::Load(fs);	69 return Sk4f::Load(fs);

63 #endif	70 #endif

64 }	71 }

65	72

66 static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {	73 static inline uint64_t SkFloatToHalf_01(const Sk4f& fs) {

67 #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2	74 #if !defined(SKNX_NO_SIMD) && SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2

68 // Scale our floats down by a tiny power of 2 to pull up our mantissa bits,	75 // Scale our floats down by a tiny power of 2 to pull up our mantissa bits,

69 // then shift back down to 16-bit float layout. This doesn't round, so can be 1 bit small.	76 // then shift back down to 16-bit float layout. This doesn't round, so can be 1 bit small.

70 // TODO: understand better. Why this scale factor?	77 // TODO: understand better. Why this scale factor?

71 const __m128 scale = _mm_castsi128_ps(_mm_set1_epi32(15 << 23));	78 const __m128 rebias = _mm_castsi128_ps(_mm_set1_epi32((127 - (127 - 15)) << 23));

72 __m128i h = _mm_srli_epi32(_mm_castps_si128(_mm_mul_ps(fs.fVec, scale)), 13);	79 __m128i h = _mm_srli_epi32(_mm_castps_si128(_mm_mul_ps(fs.fVec, rebias)), 13);

73	80

74 uint64_t r;	81 uint64_t r;

75 _mm_storel_epi64((__m128i*)&r, _mm_packs_epi32(h,h));	82 _mm_storel_epi64((__m128i*)&r, _mm_packs_epi32(h,h));

76 return r;	83 return r;

77 #else	84 #else

78 SkHalf hs[4];	85 SkHalf hs[4];

79 for (int i = 0; i < 4; i++) {	86 for (int i = 0; i < 4; i++) {

80 hs[i] = SkFloatToHalf(fs[i]);	87 hs[i] = SkFloatToHalf(fs[i]);

81 }	88 }

82 return (uint64_t)hs[3] << 48	89 return (uint64_t)hs[3] << 48

83 | (uint64_t)hs[2] << 32	90 | (uint64_t)hs[2] << 32

84 | (uint64_t)hs[1] << 16	91 | (uint64_t)hs[1] << 16

85 | (uint64_t)hs[0] << 0;	92 | (uint64_t)hs[0] << 0;

86 #endif	93 #endif

87 }	94 }

88	95

89 #endif	96 #endif

OLD	NEW

« no previous file with comments | « no previous file | tests/Float16Test.cpp » ('j') | no next file with comments »