OLD | NEW |
1 // It is important _not_ to put header guards here. | 1 // It is important _not_ to put header guards here. |
2 // This file will be intentionally included three times. | 2 // This file will be intentionally included three times. |
3 | 3 |
4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362 | 4 #include "SkTypes.h" // Keep this before any #ifdef for skbug.com/3362 |
5 | 5 |
6 #if defined(SK4X_PREAMBLE) | 6 #if defined(SK4X_PREAMBLE) |
7 #include <arm_neon.h> | 7 #include <arm_neon.h> |
8 | 8 |
9 // Template metaprogramming to map scalar types to vector types. | 9 // Template metaprogramming to map scalar types to vector types. |
10 template <typename T> struct SkScalarToSIMD; | 10 template <typename T> struct SkScalarToSIMD; |
(...skipping 19 matching lines...) Expand all Loading... |
30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; } | 30 template <typename T> Sk4x<T>::Sk4x(const Sk4x& other) { *this = other; } |
31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) { | 31 template <typename T> Sk4x<T>& Sk4x<T>::operator=(const Sk4x<T>& other) { |
32 fVec = other.fVec; | 32 fVec = other.fVec; |
33 return *this; | 33 return *this; |
34 } | 34 } |
35 | 35 |
36 // Sk4f Methods | 36 // Sk4f Methods |
37 #define M(...) template <> inline __VA_ARGS__ Sk4f:: | 37 #define M(...) template <> inline __VA_ARGS__ Sk4f:: |
38 | 38 |
39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {} | 39 M() Sk4x(float v) : fVec(vdupq_n_f32(v)) {} |
40 M() Sk4x(float a, float b, float c, float d) { | 40 M() Sk4x(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d
}; } |
41 // NEON lacks an intrinsic to make this easy. It is recommended to avoid | |
42 // this constructor unless it is absolutely necessary. | |
43 | |
44 // I am choosing to use the set lane intrinsics. Particularly, in the case | |
45 // of floating point, it is likely that the values are already in the right | |
46 // register file, so this may be the best approach. However, I am not | |
47 // certain that this is the fastest approach and experimentation might be | |
48 // useful. | |
49 fVec = vsetq_lane_f32(a, fVec, 0); | |
50 fVec = vsetq_lane_f32(b, fVec, 1); | |
51 fVec = vsetq_lane_f32(c, fVec, 2); | |
52 fVec = vsetq_lane_f32(d, fVec, 3); | |
53 } | |
54 | 41 |
55 // As far as I can tell, it's not possible to provide an alignment hint to | 42 // As far as I can tell, it's not possible to provide an alignment hint to |
56 // NEON using intrinsics. However, I think it is possible at the assembly | 43 // NEON using intrinsics. However, I think it is possible at the assembly |
57 // level if we want to get into that. | 44 // level if we want to get into that. |
58 // TODO: Write our own aligned load and store. | 45 // TODO: Write our own aligned load and store. |
59 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); } | 46 M(Sk4f) Load (const float fs[4]) { return vld1q_f32(fs); } |
60 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); } | 47 M(Sk4f) LoadAligned(const float fs[4]) { return vld1q_f32(fs); } |
61 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); } | 48 M(void) store (float fs[4]) const { vst1q_f32(fs, fVec); } |
62 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); } | 49 M(void) storeAligned(float fs[4]) const { vst1q_f32 (fs, fVec); } |
63 | 50 |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
123 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec); | 110 float32x4x2_t xayb_zcwd = vzipq_f32(xyzw.fVec, abcd.fVec); |
124 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec); | 111 float32x4x2_t axby_czdw = vzipq_f32(abcd.fVec, xyzw.fVec); |
125 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; | 112 return vuzpq_f32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; |
126 } | 113 } |
127 | 114 |
128 // Sk4i Methods | 115 // Sk4i Methods |
129 #undef M | 116 #undef M |
130 #define M(...) template <> inline __VA_ARGS__ Sk4i:: | 117 #define M(...) template <> inline __VA_ARGS__ Sk4i:: |
131 | 118 |
132 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {} | 119 M() Sk4x(int32_t v) : fVec(vdupq_n_s32(v)) {} |
133 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) { | 120 M() Sk4x(int32_t a, int32_t b, int32_t c, int32_t d) { fVec = (int32x4_t) { a, b
, c, d }; } |
134 // NEON lacks an intrinsic to make this easy. It is recommended to avoid | |
135 // this constructor unless it is absolutely necessary. | |
136 | |
137 // There are a few different implementation strategies. | |
138 | |
139 // uint64_t ab_i = ((uint32_t) a) | (((uint64_t) b) << 32); | |
140 // uint64_t cd_i = ((uint32_t) c) | (((uint64_t) d) << 32); | |
141 // int32x2_t ab = vcreate_s32(ab_i); | |
142 // int32x2_t cd = vcreate_s32(cd_i); | |
143 // fVec = vcombine_s32(ab, cd); | |
144 // This might not be a bad idea for the integer case. Either way I think, | |
145 // we will need to move values from general registers to NEON registers. | |
146 | |
147 // I am choosing to use the set lane intrinsics. I am not certain that | |
148 // this is the fastest approach. It may be useful to try the above code | |
149 // for integers. | |
150 fVec = vsetq_lane_s32(a, fVec, 0); | |
151 fVec = vsetq_lane_s32(b, fVec, 1); | |
152 fVec = vsetq_lane_s32(c, fVec, 2); | |
153 fVec = vsetq_lane_s32(d, fVec, 3); | |
154 } | |
155 | 121 |
156 // As far as I can tell, it's not possible to provide an alignment hint to | 122 // As far as I can tell, it's not possible to provide an alignment hint to |
157 // NEON using intrinsics. However, I think it is possible at the assembly | 123 // NEON using intrinsics. However, I think it is possible at the assembly |
158 // level if we want to get into that. | 124 // level if we want to get into that. |
159 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); } | 125 M(Sk4i) Load (const int32_t is[4]) { return vld1q_s32(is); } |
160 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); } | 126 M(Sk4i) LoadAligned(const int32_t is[4]) { return vld1q_s32(is); } |
161 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); } | 127 M(void) store (int32_t is[4]) const { vst1q_s32(is, fVec); } |
162 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); } | 128 M(void) storeAligned(int32_t is[4]) const { vst1q_s32 (is, fVec); } |
163 | 129 |
164 template <> | 130 template <> |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
217 } | 183 } |
218 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) { | 184 M(Sk4i) ZWCD(const Sk4i& xyzw, const Sk4i& abcd) { |
219 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec); | 185 int32x4x2_t xayb_zcwd = vzipq_s32(xyzw.fVec, abcd.fVec); |
220 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec); | 186 int32x4x2_t axby_czdw = vzipq_s32(abcd.fVec, xyzw.fVec); |
221 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; | 187 return vuzpq_s32(xayb_zcwd.val[1], axby_czdw.val[1]).val[0]; |
222 } | 188 } |
223 | 189 |
224 #undef M | 190 #undef M |
225 | 191 |
226 #endif | 192 #endif |
OLD | NEW |