include/core/SkFloatingPoint.h - Issue 1264893002: Runtime CPU detection for rsqrt().

Side by Side Diff: include/core/SkFloatingPoint.h

Issue 1264893002: Runtime CPU detection for rsqrt(). (Closed) Base URL: https://skia.googlesource.com/skia.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1	1

2 /*	2 /*

3 * Copyright 2006 The Android Open Source Project	3 * Copyright 2006 The Android Open Source Project

4 *	4 *

5 * Use of this source code is governed by a BSD-style license that can be	5 * Use of this source code is governed by a BSD-style license that can be

6 * found in the LICENSE file.	6 * found in the LICENSE file.

7 */	7 */

8	8

9	9

10 #ifndef SkFloatingPoint_DEFINED	10 #ifndef SkFloatingPoint_DEFINED

(...skipping 109 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
120 #define sk_double_ceil2int(x) (int)ceil(x)	120 #define sk_double_ceil2int(x) (int)ceil(x)

121	121

122 extern const uint32_t gIEEENotANumber;	122 extern const uint32_t gIEEENotANumber;

123 extern const uint32_t gIEEEInfinity;	123 extern const uint32_t gIEEEInfinity;

124 extern const uint32_t gIEEENegativeInfinity;	124 extern const uint32_t gIEEENegativeInfinity;

125	125

126 #define SK_FloatNaN (SkTCast<const float>(&gIEEENotANumber))	126 #define SK_FloatNaN (SkTCast<const float>(&gIEEENotANumber))

127 #define SK_FloatInfinity (SkTCast<const float>(&gIEEEInfinity))	127 #define SK_FloatInfinity (SkTCast<const float>(&gIEEEInfinity))

128 #define SK_FloatNegativeInfinity (SkTCast<const float>(&gIEEENegativeInfinity))	128 #define SK_FloatNegativeInfinity (SkTCast<const float>(&gIEEENegativeInfinity))

129	129

	130 namespace SkOpts { extern float (*rsqrt)(float); }

	131

130 // Fast, approximate inverse square root.	132 // Fast, approximate inverse square root.

131 // Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.	133 // Compare to name-brand "1.0f / sk_float_sqrt(x)". Should be around 10x faster on SSE, 2x on NEON.

132 static inline float sk_float_rsqrt(const float x) {	134 static inline float sk_float_rsqrt(const float x) {

133 // We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got	135 // We want all this inlined, so we'll inline SIMD and just take the hit when we don't know we've got

134 // it at compile time. This is going to be too fast to productively hide behind a function pointer.	136 // it at compile time. This is going to be too fast to productively hide behind a function pointer.

135 //	137 //

136 // We do one step of Newton's method to refine the estimates in the NEON and null paths. No	138 // We do one step of Newton's method to refine the estimates in the NEON and null paths. No

137 // refinement is faster, but very inaccurate. Two steps is more accurate, but slower than 1/sqrt.	139 // refinement is faster, but very inaccurate. Two steps is more accurate, but slower than 1/sqrt.

138 //	140 //

139 // Optimized constants in the null path courtesy of http://rrrola.wz.cz/inv_sqrt.html	141 // Optimized constants in the null path courtesy of http://rrrola.wz.cz/inv_sqrt.html

140 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1	142 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1

141 return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));	143 return _mm_cvtss_f32(_mm_rsqrt_ss(_mm_set_ss(x)));

142 #elif defined(SK_ARM_HAS_NEON)	144 #elif defined(SK_ARM_HAS_NEON)

143 // Get initial estimate.	145 // Get initial estimate.

144 const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x.	146 const float32x2_t xx = vdup_n_f32(x); // Clever readers will note we're doing everything 2x.

145 float32x2_t estimate = vrsqrte_f32(xx);	147 float32x2_t estimate = vrsqrte_f32(xx);

146	148

147 // One step of Newton's method to refine.	149 // One step of Newton's method to refine.

148 const float32x2_t estimate_sq = vmul_f32(estimate, estimate);	150 const float32x2_t estimate_sq = vmul_f32(estimate, estimate);

149 estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));	151 estimate = vmul_f32(estimate, vrsqrts_f32(xx, estimate_sq));

150 return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places.	152 return vget_lane_f32(estimate, 0); // 1 will work fine too; the answer's in both places.

151 #else	153 #else

152 // Get initial estimate.	154 // Perhaps runtime-detected NEON, or a portable fallback.

153 int i = SkTCast<int>(&x);	155 return SkOpts::rsqrt(x);

154 i = 0x5F1FFFF9 - (i>>1);

155 float estimate = SkTCast<float>(&i);

156

157 // One step of Newton's method to refine.

158 const float estimate_sq = estimate*estimate;

159 estimate *= 0.703952253f*(2.38924456f-x*estimate_sq);

160 return estimate;

161 #endif	156 #endif

162 }	157 }

163	158

164 #endif	159 #endif

OLD	NEW

« no previous file with comments | « no previous file | src/core/SkOpts.h » ('j') | no next file with comments »