OLD | NEW |
1 /* | 1 /* |
2 * Copyright 2015 Google Inc. | 2 * Copyright 2015 Google Inc. |
3 * | 3 * |
4 * Use of this source code is governed by a BSD-style license that can be | 4 * Use of this source code is governed by a BSD-style license that can be |
5 * found in the LICENSE file. | 5 * found in the LICENSE file. |
6 */ | 6 */ |
7 | 7 |
8 #ifndef SkNx_neon_DEFINED | 8 #ifndef SkNx_neon_DEFINED |
9 #define SkNx_neon_DEFINED | 9 #define SkNx_neon_DEFINED |
10 | 10 |
(...skipping 15 matching lines...) Expand all Loading... |
26 | 26 |
// Emulates a variable-count shift using NEON's immediate-only shift
// intrinsics: dispatches the runtime 'bits' value to the matching
// constant-shift instruction.  Counts below 16 are handled by SHIFT16
// (defined above, outside this view); if no case matches, the vector is
// returned unshifted.
#define SHIFT32(op, v, bits) if (bits < 16) { SHIFT16(op, v, bits); } switch(bits) { \
    case 16: return op(v, 16); case 17: return op(v, 17); case 18: return op(v, 18); \
    case 19: return op(v, 19); case 20: return op(v, 20); case 21: return op(v, 21); \
    case 22: return op(v, 22); case 23: return op(v, 23); case 24: return op(v, 24); \
    case 25: return op(v, 25); case 26: return op(v, 26); case 27: return op(v, 27); \
    case 28: return op(v, 28); case 29: return op(v, 29); case 30: return op(v, 30); \
    case 31: return op(v, 31); } return fVec
34 | 34 |
// 2-lane float vector backed by a 64-bit NEON register.
template <>
class SkNx<2, float> {
public:
    // Wrap a raw NEON vector.
    SkNx(float32x2_t vec) : fVec(vec) {}

    SkNx() {}  // Leaves fVec uninitialized.
    SkNx(float val) : fVec(vdup_n_f32(val)) {}                        // Broadcast val to both lanes.
    static SkNx Load(const float vals[2]) { return vld1_f32(vals); }  // Load 2 floats from memory.
    SkNx(float a, float b) { fVec = (float32x2_t) { a, b }; }

    void store(float vals[2]) const { vst1_f32(vals, fVec); }  // Store both lanes to memory.

    // Approximate 1/x: hardware reciprocal estimate refined by one
    // Newton-Raphson step (vrecps supplies the correction factor).
    SkNx approxInvert() const {
        float32x2_t est0 = vrecpe_f32(fVec),
                    est1 = vmul_f32(vrecps_f32(est0, fVec), est0);
        return est1;
    }
    // 1/x with a second refinement step for extra precision.
    SkNx invert() const {
        float32x2_t est1 = this->approxInvert().fVec,
                    est2 = vmul_f32(vrecps_f32(est1, fVec), est1);
        return est2;
    }

    SkNx operator + (const SkNx& o) const { return vadd_f32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsub_f32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmul_f32(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const {
#if defined(SK_CPU_ARM64)
        return vdiv_f32(fVec, o.fVec);           // ARM64 has a real vector divide.
#else
        return vmul_f32(fVec, o.invert().fVec);  // Otherwise multiply by the refined reciprocal.
#endif
    }

    // Comparisons produce a per-lane mask (all-1s or all-0s bits),
    // reinterpreted as float so the result stays an SkNx.
    SkNx operator == (const SkNx& o) const { return vreinterpret_f32_u32(vceq_f32(fVec, o.fVec)); }
    SkNx operator  < (const SkNx& o) const { return vreinterpret_f32_u32(vclt_f32(fVec, o.fVec)); }
    SkNx operator  > (const SkNx& o) const { return vreinterpret_f32_u32(vcgt_f32(fVec, o.fVec)); }
    SkNx operator <= (const SkNx& o) const { return vreinterpret_f32_u32(vcle_f32(fVec, o.fVec)); }
    SkNx operator >= (const SkNx& o) const { return vreinterpret_f32_u32(vcge_f32(fVec, o.fVec)); }
    SkNx operator != (const SkNx& o) const {
        // No vcne intrinsic: bitwise NOT of the == mask.
        return vreinterpret_f32_u32(vmvn_u32(vceq_f32(fVec, o.fVec)));
    }

    static SkNx Min(const SkNx& l, const SkNx& r) { return vmin_f32(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return vmax_f32(l.fVec, r.fVec); }

    // Reciprocal square root: raw hardware estimate (rsqrt0), then one
    // (rsqrt1) and two (rsqrt2) Newton-Raphson refinement steps.
    SkNx rsqrt0() const { return vrsqrte_f32(fVec); }
    SkNx rsqrt1() const {
        float32x2_t est0 = this->rsqrt0().fVec;
        return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est0, est0)), est0);
    }
    SkNx rsqrt2() const {
        float32x2_t est1 = this->rsqrt1().fVec;
        return vmul_f32(vrsqrts_f32(fVec, vmul_f32(est1, est1)), est1);
    }

    SkNx sqrt() const {
#if defined(SK_CPU_ARM64)
        return vsqrt_f32(fVec);         // Exact hardware sqrt on ARM64.
#else
        return *this * this->rsqrt2();  // x * (1/sqrt(x)) approximates sqrt(x).
#endif
    }

    // Extract lane k; the index is masked into the valid range 0..1.
    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 2);
        return vget_lane_f32(fVec, k&1);
    }

    // Interpreting the lanes as comparison masks: true iff both / either
    // lane is non-zero.
    bool allTrue() const {
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) && vget_lane_u32(v,1);
    }
    bool anyTrue() const {
        auto v = vreinterpret_u32_f32(fVec);
        return vget_lane_u32(v,0) || vget_lane_u32(v,1);
    }

    float32x2_t fVec;
};
115 | 115 |
// 4-lane signed 32-bit integer vector backed by a 128-bit NEON register.
template <>
class SkNx<4, int> {
public:
    // Wrap a raw NEON vector.
    SkNx(const int32x4_t& vec) : fVec(vec) {}

    SkNx() {}  // Leaves fVec uninitialized.
    SkNx(int val) : fVec(vdupq_n_s32(val)) {}                        // Broadcast val to all 4 lanes.
    static SkNx Load(const int vals[4]) { return vld1q_s32(vals); }  // Load 4 ints from memory.
    SkNx(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }

    void store(int vals[4]) const { vst1q_s32(vals, fVec); }

    SkNx operator + (const SkNx& o) const { return vaddq_s32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_s32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_s32(fVec, o.fVec); }

    // NEON shift intrinsics take immediate counts only; SHIFT32 dispatches
    // the runtime 'bits' to the matching constant-shift instruction.
    SkNx operator << (int bits) const { SHIFT32(vshlq_n_s32, fVec, bits); }
    SkNx operator >> (int bits) const { SHIFT32(vshrq_n_s32, fVec, bits); }  // Arithmetic (signed) shift.

    // Extract lane k; the index is masked into the valid range 0..3.
    template <int k> int kth() const {
        SkASSERT(0 <= k && k < 4);
        return vgetq_lane_s32(fVec, k&3);
    }

    int32x4_t fVec;
};
142 | 142 |
// 4-lane float vector backed by a 128-bit NEON register.
template <>
class SkNx<4, float> {
public:
    // Wrap a raw NEON vector.
    SkNx(float32x4_t vec) : fVec(vec) {}

    SkNx() {}  // Leaves fVec uninitialized.
    SkNx(float val) : fVec(vdupq_n_f32(val)) {}                         // Broadcast val to all 4 lanes.
    static SkNx Load(const float vals[4]) { return vld1q_f32(vals); }   // Load 4 floats from memory.
    // Load 4 bytes and widen them to floats: u8 -> u16 -> u32 -> f32.
    static SkNx FromBytes(const uint8_t vals[4]) {
        uint8x8_t fix8 = (uint8x8_t)vld1_dup_u32((const uint32_t*)vals);
        uint16x8_t fix8_16 = vmovl_u8(fix8);
        uint32x4_t fix8_32 = vmovl_u16(vget_low_u16(fix8_16));
        return SkNx(vcvtq_f32_u32(fix8_32));
    }

    SkNx(float a, float b, float c, float d) { fVec = (float32x4_t) { a, b, c, d }; }

    void store(float vals[4]) const { vst1q_f32(vals, fVec); }
    // Narrow the 4 floats back to 4 bytes: f32 -> u32, then saturating
    // narrows u32 -> u16 -> u8, storing the low 4 bytes.
    void toBytes(uint8_t bytes[4]) const {
        uint32x4_t fix8_32 = vcvtq_u32_f32(fVec);
        uint16x4_t fix8_16 = vqmovn_u32(fix8_32);
        uint8x8_t fix8 = vqmovn_u16(vcombine_u16(fix8_16, vdup_n_u16(0)));
        vst1_lane_u32((uint32_t*)bytes, (uint32x2_t)fix8, 0);
    }

    // Approximate 1/x: hardware reciprocal estimate refined by one
    // Newton-Raphson step (vrecps supplies the correction factor).
    SkNx approxInvert() const {
        float32x4_t est0 = vrecpeq_f32(fVec),
                    est1 = vmulq_f32(vrecpsq_f32(est0, fVec), est0);
        return est1;
    }
    // 1/x with a second refinement step for extra precision.
    SkNx invert() const {
        float32x4_t est1 = this->approxInvert().fVec,
                    est2 = vmulq_f32(vrecpsq_f32(est1, fVec), est1);
        return est2;
    }

    SkNx operator + (const SkNx& o) const { return vaddq_f32(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_f32(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_f32(fVec, o.fVec); }
    SkNx operator / (const SkNx& o) const {
#if defined(SK_CPU_ARM64)
        return vdivq_f32(fVec, o.fVec);           // ARM64 has a real vector divide.
#else
        return vmulq_f32(fVec, o.invert().fVec);  // Otherwise multiply by the refined reciprocal.
#endif
    }

    // Comparisons produce a per-lane mask (all-1s or all-0s bits),
    // reinterpreted as float so the result stays an SkNx.
    SkNx operator==(const SkNx& o) const { return vreinterpretq_f32_u32(vceqq_f32(fVec, o.fVec)); }
    SkNx operator <(const SkNx& o) const { return vreinterpretq_f32_u32(vcltq_f32(fVec, o.fVec)); }
    SkNx operator >(const SkNx& o) const { return vreinterpretq_f32_u32(vcgtq_f32(fVec, o.fVec)); }
    SkNx operator<=(const SkNx& o) const { return vreinterpretq_f32_u32(vcleq_f32(fVec, o.fVec)); }
    SkNx operator>=(const SkNx& o) const { return vreinterpretq_f32_u32(vcgeq_f32(fVec, o.fVec)); }
    SkNx operator!=(const SkNx& o) const {
        // No vcne intrinsic: bitwise NOT of the == mask.
        return vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(fVec, o.fVec)));
    }

    static SkNx Min(const SkNx& l, const SkNx& r) { return vminq_f32(l.fVec, r.fVec); }
    static SkNx Max(const SkNx& l, const SkNx& r) { return vmaxq_f32(l.fVec, r.fVec); }

    // Reciprocal square root: raw hardware estimate (rsqrt0), then one
    // (rsqrt1) and two (rsqrt2) Newton-Raphson refinement steps.
    SkNx rsqrt0() const { return vrsqrteq_f32(fVec); }
    SkNx rsqrt1() const {
        float32x4_t est0 = this->rsqrt0().fVec;
        return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est0, est0)), est0);
    }
    SkNx rsqrt2() const {
        float32x4_t est1 = this->rsqrt1().fVec;
        return vmulq_f32(vrsqrtsq_f32(fVec, vmulq_f32(est1, est1)), est1);
    }

    SkNx sqrt() const {
#if defined(SK_CPU_ARM64)
        return vsqrtq_f32(fVec);        // Exact hardware sqrt on ARM64.
#else
        return *this * this->rsqrt2();  // x * (1/sqrt(x)) approximates sqrt(x).
#endif
    }

    // Extract lane k; the index is masked into the valid range 0..3.
    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 4);
        return vgetq_lane_f32(fVec, k&3);
    }

    // Interpreting the lanes as comparison masks: true iff all / any
    // lanes are non-zero.
    bool allTrue() const {
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) && vgetq_lane_u32(v,1)
            && vgetq_lane_u32(v,2) && vgetq_lane_u32(v,3);
    }
    bool anyTrue() const {
        auto v = vreinterpretq_u32_f32(fVec);
        return vgetq_lane_u32(v,0) || vgetq_lane_u32(v,1)
            || vgetq_lane_u32(v,2) || vgetq_lane_u32(v,3);
    }

    // Per-bit select using *this as the mask: result bits come from t
    // where the mask bit is 1, from e where it is 0.
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_f32(vreinterpretq_u32_f32(fVec), t.fVec, e.fVec);
    }

    float32x4_t fVec;
};
242 | 242 |
// 8-lane unsigned 16-bit integer vector backed by a 128-bit NEON register.
template <>
class SkNx<8, uint16_t> {
public:
    // Wrap a raw NEON vector.
    SkNx(const uint16x8_t& vec) : fVec(vec) {}

    SkNx() {}  // Leaves fVec uninitialized.
    SkNx(uint16_t val) : fVec(vdupq_n_u16(val)) {}                        // Broadcast val to all 8 lanes.
    static SkNx Load(const uint16_t vals[8]) { return vld1q_u16(vals); }  // Load 8 values from memory.

    SkNx(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
        fVec = (uint16x8_t) { a,b,c,d, e,f,g,h };
    }

    void store(uint16_t vals[8]) const { vst1q_u16(vals, fVec); }

    SkNx operator + (const SkNx& o) const { return vaddq_u16(fVec, o.fVec); }
    SkNx operator - (const SkNx& o) const { return vsubq_u16(fVec, o.fVec); }
    SkNx operator * (const SkNx& o) const { return vmulq_u16(fVec, o.fVec); }

    // NEON shift intrinsics take immediate counts only; SHIFT16 dispatches
    // the runtime 'bits' to the matching constant-shift instruction.
    SkNx operator << (int bits) const { SHIFT16(vshlq_n_u16, fVec, bits); }
    SkNx operator >> (int bits) const { SHIFT16(vshrq_n_u16, fVec, bits); }

    static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u16(a.fVec, b.fVec); }

    // Extract lane k; the index is masked into the valid range 0..7.
    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 8);
        return vgetq_lane_u16(fVec, k&7);
    }

    // Per-bit select using *this as the mask: result bits come from t
    // where the mask bit is 1, from e where it is 0.
    SkNx thenElse(const SkNx& t, const SkNx& e) const {
        return vbslq_u16(fVec, t.fVec, e.fVec);
    }

    uint16x8_t fVec;
};
279 | 279 |
280 template <> | 280 template <> |
281 class SkNi<16, uint8_t> { | 281 class SkNx<16, uint8_t> { |
282 public: | 282 public: |
283 SkNi(const uint8x16_t& vec) : fVec(vec) {} | 283 SkNx(const uint8x16_t& vec) : fVec(vec) {} |
284 | 284 |
285 SkNi() {} | 285 SkNx() {} |
286 SkNi(uint8_t val) : fVec(vdupq_n_u8(val)) {} | 286 SkNx(uint8_t val) : fVec(vdupq_n_u8(val)) {} |
287 static SkNi Load(const uint8_t vals[16]) { return vld1q_u8(vals); } | 287 static SkNx Load(const uint8_t vals[16]) { return vld1q_u8(vals); } |
288 | 288 |
289 SkNi(uint8_t a, uint8_t b, uint8_t c, uint8_t d, | 289 SkNx(uint8_t a, uint8_t b, uint8_t c, uint8_t d, |
290 uint8_t e, uint8_t f, uint8_t g, uint8_t h, | 290 uint8_t e, uint8_t f, uint8_t g, uint8_t h, |
291 uint8_t i, uint8_t j, uint8_t k, uint8_t l, | 291 uint8_t i, uint8_t j, uint8_t k, uint8_t l, |
292 uint8_t m, uint8_t n, uint8_t o, uint8_t p) { | 292 uint8_t m, uint8_t n, uint8_t o, uint8_t p) { |
293 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; | 293 fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p }; |
294 } | 294 } |
295 | 295 |
296 void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); } | 296 void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); } |
297 | 297 |
298 SkNi saturatedAdd(const SkNi& o) const { return vqaddq_u8(fVec, o.fVec); } | 298 SkNx saturatedAdd(const SkNx& o) const { return vqaddq_u8(fVec, o.fVec); } |
299 | 299 |
300 SkNi operator + (const SkNi& o) const { return vaddq_u8(fVec, o.fVec); } | 300 SkNx operator + (const SkNx& o) const { return vaddq_u8(fVec, o.fVec); } |
301 SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); } | 301 SkNx operator - (const SkNx& o) const { return vsubq_u8(fVec, o.fVec); } |
302 | 302 |
303 static SkNi Min(const SkNi& a, const SkNi& b) { return vminq_u8(a.fVec, b.fV
ec); } | 303 static SkNx Min(const SkNx& a, const SkNx& b) { return vminq_u8(a.fVec, b.fV
ec); } |
304 SkNi operator < (const SkNi& o) const { return vcltq_u8(fVec, o.fVec); } | 304 SkNx operator < (const SkNx& o) const { return vcltq_u8(fVec, o.fVec); } |
305 | 305 |
306 template <int k> uint8_t kth() const { | 306 template <int k> uint8_t kth() const { |
307 SkASSERT(0 <= k && k < 15); | 307 SkASSERT(0 <= k && k < 15); |
308 return vgetq_lane_u8(fVec, k&16); | 308 return vgetq_lane_u8(fVec, k&16); |
309 } | 309 } |
310 | 310 |
311 SkNi thenElse(const SkNi& t, const SkNi& e) const { | 311 SkNx thenElse(const SkNx& t, const SkNx& e) const { |
312 return vbslq_u8(fVec, t.fVec, e.fVec); | 312 return vbslq_u8(fVec, t.fVec, e.fVec); |
313 } | 313 } |
314 | 314 |
315 uint8x16_t fVec; | 315 uint8x16_t fVec; |
316 }; | 316 }; |
317 | 317 |
318 #undef SHIFT32 | 318 #undef SHIFT32 |
319 #undef SHIFT16 | 319 #undef SHIFT16 |
320 #undef SHIFT8 | 320 #undef SHIFT8 |
321 | 321 |
// Convert 4 floats to 4 signed 32-bit ints using the hardware conversion
// (vcvtq_s32_f32 rounds toward zero).
template<>
inline SkNx<4, int> SkNx_cast<int, float, 4>(const SkNx<4, float>& src) {
    return vcvtq_s32_f32(src.fVec);
}
| 326 |
322 } // namespace | 327 } // namespace |
323 | 328 |
324 #endif//SkNx_neon_DEFINED | 329 #endif//SkNx_neon_DEFINED |
OLD | NEW |