Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(54)

Side by Side Diff: src/opts/Sk4x_neon.h

Issue 1020963002: Specialize Sk2d for ARM64 (Closed) Base URL: https://skia.googlesource.com/skia@master
Patch Set: Avoid use of vset[q]_lane, instead initializing vectors directly. Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/opts/Sk2x_neon.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // It is important _not_ to put header guards here. 1 // It is important _not_ to put header guards here.
2 // This file will be intentionally included three times. 2 // This file will be intentionally included three times.
3 3
4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362 4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362
5 5
6 #if defined(SK4X_PREAMBLE) 6 #if defined(SK4X_PREAMBLE)
7 #include <arm_neon.h> 7 #include <arm_neon.h>
8 8
9 // Template metaprogramming to map scalar types to vector types. 9 // Template metaprogramming to map scalar types to vector types.
10 template <typename T> struct SkScalarToSIMD; 10 template <typename T> struct SkScalarToSIMD;
(...skipping 19 matching lines...) Expand all
30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; } 30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; }
31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) { 31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) {
32 fVec = other.fVec; 32 fVec = other.fVec;
33 return *this; 33 return *this;
34 } 34 }
35 35
36 // Sk4f Methods 36 // Sk4f Methods
37 #define M(...) template <> inline __VA_ARGS__ Sk4f:: 37 #define M(...) template <> inline __VA_ARGS__ Sk4f::
38 38
39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {} 39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {}
40 M() Sk4x(float a, float b, float c, float d) { 40 M() Sk4x(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }
41 // NEON lacks an intrinsic to make this easy. It is recommended to avoid
42 // this constructor unless it is absolutely necessary.
43
44 // I am choosing to use the set lane intrinsics. Particularly, in the case
45 // of floating point, it is likely that the values are already in the right
46 // register file, so this may be the best approach. However, I am not
47 // certain that this is the fastest approach and experimentation might be
48 // useful.
49 fVec = vsetq_lane_f32(a, fVec, 0);
50 fVec = vsetq_lane_f32(b, fVec, 1);
51 fVec = vsetq_lane_f32(c, fVec, 2);
52 fVec = vsetq_lane_f32(d, fVec, 3);
53 }
54 41
55 // As far as I can tell, it's not possible to provide an alignment hint to 42 // As far as I can tell, it's not possible to provide an alignment hint to
56 // NEON using intrinsics. However, I think it is possible at the assembly 43 // NEON using intrinsics. However, I think it is possible at the assembly
57 // level if we want to get into that. 44 // level if we want to get into that.
58 // TODO: Write our own aligned load and store. 45 // TODO: Write our own aligned load and store.
59 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); } 46 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); }
60 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); } 47 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); }
61 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); } 48 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); }
62 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); } 49 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); }
63 50
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
123 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec); 110 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec);
124 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec); 111 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec);
125 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; 112 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
126 } 113 }
127 114
128 // Sk4i Methods 115 // Sk4i Methods
129 #undef M 116 #undef M
130 #define M(...) template <> inline __VA_ARGS__ Sk4i:: 117 #define M(...) template <> inline __VA_ARGS__ Sk4i::
131 118
132 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {} 119 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {}
133 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) { 120 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) { fVec = (int32x4_t) { a, b, c, d }; }
134 // NEON lacks an intrinsic to make this easy. It is recommended to avoid
135 // this constructor unless it is absolutely necessary.
136
137 // There are a few different implementation strategies.
138
139 // uint64_t ab_i = ((uint32_t) a) | (((uint64_t) b) << 32);
140 // uint64_t cd_i = ((uint32_t) c) | (((uint64_t) d) << 32);
141 // int32x2_t ab = vcreate_s32(ab_i);
142 // int32x2_t cd = vcreate_s32(cd_i);
143 // fVec = vcombine_s32(ab, cd);
144 // This might not be a bad idea for the integer case. Either way I think,
145 // we will need to move values from general registers to NEON registers.
146
147 // I am choosing to use the set lane intrinsics. I am not certain that
148 // this is the fastest approach. It may be useful to try the above code
149 // for integers.
150 fVec = vsetq_lane_s32(a, fVec, 0);
151 fVec = vsetq_lane_s32(b, fVec, 1);
152 fVec = vsetq_lane_s32(c, fVec, 2);
153 fVec = vsetq_lane_s32(d, fVec, 3);
154 }
155 121
156 // As far as I can tell, it's not possible to provide an alignment hint to 122 // As far as I can tell, it's not possible to provide an alignment hint to
157 // NEON using intrinsics. However, I think it is possible at the assembly 123 // NEON using intrinsics. However, I think it is possible at the assembly
158 // level if we want to get into that. 124 // level if we want to get into that.
159 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); } 125 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); }
160 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); } 126 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); }
161 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); } 127 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); }
162 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); } 128 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); }
163 129
164 template <> 130 template <>
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after
217 } 183 }
218 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) { 184 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) {
219 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec); 185 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec);
220 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec); 186 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec);
221 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; 187 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0];
222 } 188 }
223 189
224 #undef M 190 #undef M
225 191
226 #endif 192 #endif
OLDNEW
« no previous file with comments | « src/opts/Sk2x_neon.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698