Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(302)

Side by Side Diff: source/libvpx/vp9/common/arm/neon/vp9_reconintra_neon.c

Issue 1162573005: libvpx: Pull from upstream (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/libvpx.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 * 3 *
4 * Use of this source code is governed by a BSD-style license 4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source 5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found 6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may 7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree. 8 * be found in the AUTHORS file in the root of the source tree.
9 */ 9 */
10 10
11 #include <stddef.h>
12 #include <arm_neon.h> 11 #include <arm_neon.h>
13 12
14 void vp9_v_predictor_4x4_neon( 13 #include "./vp9_rtcd.h"
15 uint8_t *dst, 14 #include "./vpx_config.h"
16 ptrdiff_t y_stride, 15 #include "vpx/vpx_integer.h"
17 const uint8_t *above, 16
18 const uint8_t *left) { 17 //------------------------------------------------------------------------------
18 // DC 8x8
19
20 // 'do_above' and 'do_left' facilitate branch removal when inlined.
21 static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
22 const uint8_t *above, const uint8_t *left,
23 int do_above, int do_left) {
24 uint16x8_t sum_top;
25 uint16x8_t sum_left;
26 uint8x8_t dc0;
27
28 if (do_above) {
29 const uint8x8_t A = vld1_u8(above); // top row
30 const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
31 const uint16x4_t p1 = vpadd_u16(p0, p0);
32 const uint16x4_t p2 = vpadd_u16(p1, p1);
33 sum_top = vcombine_u16(p2, p2);
34 }
35
36 if (do_left) {
37 const uint8x8_t L = vld1_u8(left); // left border
38 const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left
39 const uint16x4_t p1 = vpadd_u16(p0, p0);
40 const uint16x4_t p2 = vpadd_u16(p1, p1);
41 sum_left = vcombine_u16(p2, p2);
42 }
43
44 if (do_above && do_left) {
45 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
46 dc0 = vrshrn_n_u16(sum, 4);
47 } else if (do_above) {
48 dc0 = vrshrn_n_u16(sum_top, 3);
49 } else if (do_left) {
50 dc0 = vrshrn_n_u16(sum_left, 3);
51 } else {
52 dc0 = vdup_n_u8(0x80);
53 }
54
55 {
56 const uint8x8_t dc = vdup_lane_u8(dc0, 0);
19 int i; 57 int i;
20 uint32x2_t d0u32 = vdup_n_u32(0); 58 for (i = 0; i < 8; ++i) {
21 (void)left; 59 vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
22 60 }
23 d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); 61 }
24 for (i = 0; i < 4; i++, dst += y_stride) 62 }
25 vst1_lane_u32((uint32_t *)dst, d0u32, 0); 63
26 return; 64 void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
27 } 65 const uint8_t *above, const uint8_t *left) {
28 66 dc_8x8(dst, stride, above, left, 1, 1);
29 void vp9_v_predictor_8x8_neon( 67 }
30 uint8_t *dst, 68
31 ptrdiff_t y_stride, 69 void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
32 const uint8_t *above, 70 const uint8_t *above, const uint8_t *left) {
33 const uint8_t *left) { 71 (void)above;
72 dc_8x8(dst, stride, NULL, left, 0, 1);
73 }
74
75 void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
76 const uint8_t *above, const uint8_t *left) {
77 (void)left;
78 dc_8x8(dst, stride, above, NULL, 1, 0);
79 }
80
81 void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
82 const uint8_t *above, const uint8_t *left) {
83 (void)above;
84 (void)left;
85 dc_8x8(dst, stride, NULL, NULL, 0, 0);
86 }
87
88 //------------------------------------------------------------------------------
89 // DC 16x16
90
91 // 'do_above' and 'do_left' facilitate branch removal when inlined.
92 static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
93 const uint8_t *above, const uint8_t *left,
94 int do_above, int do_left) {
95 uint16x8_t sum_top;
96 uint16x8_t sum_left;
97 uint8x8_t dc0;
98
99 if (do_above) {
100 const uint8x16_t A = vld1q_u8(above); // top row
101 const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
102 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
103 const uint16x4_t p2 = vpadd_u16(p1, p1);
104 const uint16x4_t p3 = vpadd_u16(p2, p2);
105 sum_top = vcombine_u16(p3, p3);
106 }
107
108 if (do_left) {
109 const uint8x16_t L = vld1q_u8(left); // left row
110 const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left
111 const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
112 const uint16x4_t p2 = vpadd_u16(p1, p1);
113 const uint16x4_t p3 = vpadd_u16(p2, p2);
114 sum_left = vcombine_u16(p3, p3);
115 }
116
117 if (do_above && do_left) {
118 const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
119 dc0 = vrshrn_n_u16(sum, 5);
120 } else if (do_above) {
121 dc0 = vrshrn_n_u16(sum_top, 4);
122 } else if (do_left) {
123 dc0 = vrshrn_n_u16(sum_left, 4);
124 } else {
125 dc0 = vdup_n_u8(0x80);
126 }
127
128 {
129 const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
34 int i; 130 int i;
35 uint8x8_t d0u8 = vdup_n_u8(0); 131 for (i = 0; i < 16; ++i) {
36 (void)left; 132 vst1q_u8(dst + i * stride, dc);
37
38 d0u8 = vld1_u8(above);
39 for (i = 0; i < 8; i++, dst += y_stride)
40 vst1_u8(dst, d0u8);
41 return;
42 }
43
44 void vp9_v_predictor_16x16_neon(
45 uint8_t *dst,
46 ptrdiff_t y_stride,
47 const uint8_t *above,
48 const uint8_t *left) {
49 int i;
50 uint8x16_t q0u8 = vdupq_n_u8(0);
51 (void)left;
52
53 q0u8 = vld1q_u8(above);
54 for (i = 0; i < 16; i++, dst += y_stride)
55 vst1q_u8(dst, q0u8);
56 return;
57 }
58
59 void vp9_v_predictor_32x32_neon(
60 uint8_t *dst,
61 ptrdiff_t y_stride,
62 const uint8_t *above,
63 const uint8_t *left) {
64 int i;
65 uint8x16_t q0u8 = vdupq_n_u8(0);
66 uint8x16_t q1u8 = vdupq_n_u8(0);
67 (void)left;
68
69 q0u8 = vld1q_u8(above);
70 q1u8 = vld1q_u8(above + 16);
71 for (i = 0; i < 32; i++, dst += y_stride) {
72 vst1q_u8(dst, q0u8);
73 vst1q_u8(dst + 16, q1u8);
74 } 133 }
75 return; 134 }
76 } 135 }
77 136
78 void vp9_h_predictor_4x4_neon( 137 void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
79 uint8_t *dst, 138 const uint8_t *above, const uint8_t *left) {
80 ptrdiff_t y_stride, 139 dc_16x16(dst, stride, above, left, 1, 1);
81 const uint8_t *above, 140 }
82 const uint8_t *left) { 141
83 uint8x8_t d0u8 = vdup_n_u8(0); 142 void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
84 uint32x2_t d1u32 = vdup_n_u32(0); 143 const uint8_t *above,
85 (void)above; 144 const uint8_t *left) {
86 145 (void)above;
87 d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); 146 dc_16x16(dst, stride, NULL, left, 0, 1);
88 147 }
89 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); 148
90 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); 149 void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
91 dst += y_stride; 150 const uint8_t *above,
92 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); 151 const uint8_t *left) {
93 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); 152 (void)left;
94 dst += y_stride; 153 dc_16x16(dst, stride, above, NULL, 1, 0);
95 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); 154 }
96 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); 155
97 dst += y_stride; 156 void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
98 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); 157 const uint8_t *above,
99 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); 158 const uint8_t *left) {
100 return; 159 (void)above;
101 } 160 (void)left;
102 161 dc_16x16(dst, stride, NULL, NULL, 0, 0);
103 void vp9_h_predictor_8x8_neon( 162 }
104 uint8_t *dst, 163
105 ptrdiff_t y_stride, 164 #if !HAVE_NEON_ASM
106 const uint8_t *above, 165
107 const uint8_t *left) { 166 void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
108 uint8x8_t d0u8 = vdup_n_u8(0); 167 const uint8_t *above, const uint8_t *left) {
109 uint64x1_t d1u64 = vdup_n_u64(0); 168 int i;
110 (void)above; 169 uint32x2_t d0u32 = vdup_n_u32(0);
111 170 (void)left;
112 d1u64 = vld1_u64((const uint64_t *)left); 171
113 172 d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
114 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); 173 for (i = 0; i < 4; i++, dst += stride)
174 vst1_lane_u32((uint32_t *)dst, d0u32, 0);
175 }
176
177 void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
178 const uint8_t *above, const uint8_t *left) {
179 int i;
180 uint8x8_t d0u8 = vdup_n_u8(0);
181 (void)left;
182
183 d0u8 = vld1_u8(above);
184 for (i = 0; i < 8; i++, dst += stride)
115 vst1_u8(dst, d0u8); 185 vst1_u8(dst, d0u8);
116 dst += y_stride; 186 }
117 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); 187
118 vst1_u8(dst, d0u8); 188 void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
119 dst += y_stride; 189 const uint8_t *above, const uint8_t *left) {
120 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); 190 int i;
121 vst1_u8(dst, d0u8); 191 uint8x16_t q0u8 = vdupq_n_u8(0);
122 dst += y_stride; 192 (void)left;
123 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); 193
124 vst1_u8(dst, d0u8); 194 q0u8 = vld1q_u8(above);
125 dst += y_stride; 195 for (i = 0; i < 16; i++, dst += stride)
126 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); 196 vst1q_u8(dst, q0u8);
127 vst1_u8(dst, d0u8); 197 }
128 dst += y_stride; 198
129 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); 199 void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
130 vst1_u8(dst, d0u8); 200 const uint8_t *above, const uint8_t *left) {
131 dst += y_stride; 201 int i;
132 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); 202 uint8x16_t q0u8 = vdupq_n_u8(0);
133 vst1_u8(dst, d0u8); 203 uint8x16_t q1u8 = vdupq_n_u8(0);
134 dst += y_stride; 204 (void)left;
135 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); 205
136 vst1_u8(dst, d0u8); 206 q0u8 = vld1q_u8(above);
137 return; 207 q1u8 = vld1q_u8(above + 16);
138 } 208 for (i = 0; i < 32; i++, dst += stride) {
139 209 vst1q_u8(dst, q0u8);
140 void vp9_h_predictor_16x16_neon( 210 vst1q_u8(dst + 16, q1u8);
141 uint8_t *dst, 211 }
142 ptrdiff_t y_stride, 212 }
143 const uint8_t *above, 213
144 const uint8_t *left) { 214 void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
145 int j; 215 const uint8_t *above, const uint8_t *left) {
146 uint8x8_t d2u8 = vdup_n_u8(0); 216 uint8x8_t d0u8 = vdup_n_u8(0);
147 uint8x16_t q0u8 = vdupq_n_u8(0); 217 uint32x2_t d1u32 = vdup_n_u32(0);
148 uint8x16_t q1u8 = vdupq_n_u8(0); 218 (void)above;
149 (void)above; 219
150 220 d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
221
222 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
223 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
224 dst += stride;
225 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
226 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
227 dst += stride;
228 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
229 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
230 dst += stride;
231 d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
232 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
233 }
234
235 void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
236 const uint8_t *above, const uint8_t *left) {
237 uint8x8_t d0u8 = vdup_n_u8(0);
238 uint64x1_t d1u64 = vdup_n_u64(0);
239 (void)above;
240
241 d1u64 = vld1_u64((const uint64_t *)left);
242
243 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
244 vst1_u8(dst, d0u8);
245 dst += stride;
246 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
247 vst1_u8(dst, d0u8);
248 dst += stride;
249 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
250 vst1_u8(dst, d0u8);
251 dst += stride;
252 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
253 vst1_u8(dst, d0u8);
254 dst += stride;
255 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
256 vst1_u8(dst, d0u8);
257 dst += stride;
258 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
259 vst1_u8(dst, d0u8);
260 dst += stride;
261 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
262 vst1_u8(dst, d0u8);
263 dst += stride;
264 d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
265 vst1_u8(dst, d0u8);
266 }
267
268 void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
269 const uint8_t *above, const uint8_t *left) {
270 int j;
271 uint8x8_t d2u8 = vdup_n_u8(0);
272 uint8x16_t q0u8 = vdupq_n_u8(0);
273 uint8x16_t q1u8 = vdupq_n_u8(0);
274 (void)above;
275
276 q1u8 = vld1q_u8(left);
277 d2u8 = vget_low_u8(q1u8);
278 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
279 q0u8 = vdupq_lane_u8(d2u8, 0);
280 vst1q_u8(dst, q0u8);
281 dst += stride;
282 q0u8 = vdupq_lane_u8(d2u8, 1);
283 vst1q_u8(dst, q0u8);
284 dst += stride;
285 q0u8 = vdupq_lane_u8(d2u8, 2);
286 vst1q_u8(dst, q0u8);
287 dst += stride;
288 q0u8 = vdupq_lane_u8(d2u8, 3);
289 vst1q_u8(dst, q0u8);
290 dst += stride;
291 q0u8 = vdupq_lane_u8(d2u8, 4);
292 vst1q_u8(dst, q0u8);
293 dst += stride;
294 q0u8 = vdupq_lane_u8(d2u8, 5);
295 vst1q_u8(dst, q0u8);
296 dst += stride;
297 q0u8 = vdupq_lane_u8(d2u8, 6);
298 vst1q_u8(dst, q0u8);
299 dst += stride;
300 q0u8 = vdupq_lane_u8(d2u8, 7);
301 vst1q_u8(dst, q0u8);
302 dst += stride;
303 }
304 }
305
306 void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
307 const uint8_t *above, const uint8_t *left) {
308 int j, k;
309 uint8x8_t d2u8 = vdup_n_u8(0);
310 uint8x16_t q0u8 = vdupq_n_u8(0);
311 uint8x16_t q1u8 = vdupq_n_u8(0);
312 (void)above;
313
314 for (k = 0; k < 2; k++, left += 16) {
151 q1u8 = vld1q_u8(left); 315 q1u8 = vld1q_u8(left);
152 d2u8 = vget_low_u8(q1u8); 316 d2u8 = vget_low_u8(q1u8);
153 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { 317 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
154 q0u8 = vdupq_lane_u8(d2u8, 0); 318 q0u8 = vdupq_lane_u8(d2u8, 0);
155 vst1q_u8(dst, q0u8); 319 vst1q_u8(dst, q0u8);
156 dst += y_stride; 320 vst1q_u8(dst + 16, q0u8);
157 q0u8 = vdupq_lane_u8(d2u8, 1); 321 dst += stride;
158 vst1q_u8(dst, q0u8); 322 q0u8 = vdupq_lane_u8(d2u8, 1);
159 dst += y_stride; 323 vst1q_u8(dst, q0u8);
160 q0u8 = vdupq_lane_u8(d2u8, 2); 324 vst1q_u8(dst + 16, q0u8);
161 vst1q_u8(dst, q0u8); 325 dst += stride;
162 dst += y_stride; 326 q0u8 = vdupq_lane_u8(d2u8, 2);
163 q0u8 = vdupq_lane_u8(d2u8, 3); 327 vst1q_u8(dst, q0u8);
164 vst1q_u8(dst, q0u8); 328 vst1q_u8(dst + 16, q0u8);
165 dst += y_stride; 329 dst += stride;
166 q0u8 = vdupq_lane_u8(d2u8, 4); 330 q0u8 = vdupq_lane_u8(d2u8, 3);
167 vst1q_u8(dst, q0u8); 331 vst1q_u8(dst, q0u8);
168 dst += y_stride; 332 vst1q_u8(dst + 16, q0u8);
169 q0u8 = vdupq_lane_u8(d2u8, 5); 333 dst += stride;
170 vst1q_u8(dst, q0u8); 334 q0u8 = vdupq_lane_u8(d2u8, 4);
171 dst += y_stride; 335 vst1q_u8(dst, q0u8);
172 q0u8 = vdupq_lane_u8(d2u8, 6); 336 vst1q_u8(dst + 16, q0u8);
173 vst1q_u8(dst, q0u8); 337 dst += stride;
174 dst += y_stride; 338 q0u8 = vdupq_lane_u8(d2u8, 5);
175 q0u8 = vdupq_lane_u8(d2u8, 7); 339 vst1q_u8(dst, q0u8);
176 vst1q_u8(dst, q0u8); 340 vst1q_u8(dst + 16, q0u8);
177 dst += y_stride; 341 dst += stride;
342 q0u8 = vdupq_lane_u8(d2u8, 6);
343 vst1q_u8(dst, q0u8);
344 vst1q_u8(dst + 16, q0u8);
345 dst += stride;
346 q0u8 = vdupq_lane_u8(d2u8, 7);
347 vst1q_u8(dst, q0u8);
348 vst1q_u8(dst + 16, q0u8);
349 dst += stride;
178 } 350 }
179 return; 351 }
180 } 352 }
181 353
182 void vp9_h_predictor_32x32_neon( 354 void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
183 uint8_t *dst, 355 const uint8_t *above, const uint8_t *left) {
184 ptrdiff_t y_stride, 356 int i;
185 const uint8_t *above, 357 uint16x8_t q1u16, q3u16;
186 const uint8_t *left) { 358 int16x8_t q1s16;
187 int j, k; 359 uint8x8_t d0u8 = vdup_n_u8(0);
188 uint8x8_t d2u8 = vdup_n_u8(0); 360 uint32x2_t d2u32 = vdup_n_u32(0);
189 uint8x16_t q0u8 = vdupq_n_u8(0); 361
190 uint8x16_t q1u8 = vdupq_n_u8(0); 362 d0u8 = vld1_dup_u8(above - 1);
191 (void)above; 363 d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
192 364 q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
193 for (k = 0; k < 2; k++, left += 16) { 365 for (i = 0; i < 4; i++, dst += stride) {
194 q1u8 = vld1q_u8(left); 366 q1u16 = vdupq_n_u16((uint16_t)left[i]);
195 d2u8 = vget_low_u8(q1u8); 367 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
196 for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { 368 vreinterpretq_s16_u16(q3u16));
197 q0u8 = vdupq_lane_u8(d2u8, 0); 369 d0u8 = vqmovun_s16(q1s16);
198 vst1q_u8(dst, q0u8); 370 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
199 vst1q_u8(dst + 16, q0u8); 371 }
200 dst += y_stride; 372 }
201 q0u8 = vdupq_lane_u8(d2u8, 1); 373
202 vst1q_u8(dst, q0u8); 374 void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
203 vst1q_u8(dst + 16, q0u8); 375 const uint8_t *above, const uint8_t *left) {
204 dst += y_stride; 376 int j;
205 q0u8 = vdupq_lane_u8(d2u8, 2); 377 uint16x8_t q0u16, q3u16, q10u16;
206 vst1q_u8(dst, q0u8); 378 int16x8_t q0s16;
207 vst1q_u8(dst + 16, q0u8); 379 uint16x4_t d20u16;
208 dst += y_stride; 380 uint8x8_t d0u8, d2u8, d30u8;
209 q0u8 = vdupq_lane_u8(d2u8, 3); 381
210 vst1q_u8(dst, q0u8); 382 d0u8 = vld1_dup_u8(above - 1);
211 vst1q_u8(dst + 16, q0u8); 383 d30u8 = vld1_u8(left);
212 dst += y_stride; 384 d2u8 = vld1_u8(above);
213 q0u8 = vdupq_lane_u8(d2u8, 4); 385 q10u16 = vmovl_u8(d30u8);
214 vst1q_u8(dst, q0u8); 386 q3u16 = vsubl_u8(d2u8, d0u8);
215 vst1q_u8(dst + 16, q0u8); 387 d20u16 = vget_low_u16(q10u16);
216 dst += y_stride; 388 for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
217 q0u8 = vdupq_lane_u8(d2u8, 5); 389 q0u16 = vdupq_lane_u16(d20u16, 0);
218 vst1q_u8(dst, q0u8); 390 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
219 vst1q_u8(dst + 16, q0u8); 391 vreinterpretq_s16_u16(q0u16));
220 dst += y_stride; 392 d0u8 = vqmovun_s16(q0s16);
221 q0u8 = vdupq_lane_u8(d2u8, 6); 393 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
222 vst1q_u8(dst, q0u8); 394 dst += stride;
223 vst1q_u8(dst + 16, q0u8); 395 q0u16 = vdupq_lane_u16(d20u16, 1);
224 dst += y_stride; 396 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
225 q0u8 = vdupq_lane_u8(d2u8, 7); 397 vreinterpretq_s16_u16(q0u16));
226 vst1q_u8(dst, q0u8); 398 d0u8 = vqmovun_s16(q0s16);
227 vst1q_u8(dst + 16, q0u8); 399 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
228 dst += y_stride; 400 dst += stride;
229 } 401 q0u16 = vdupq_lane_u16(d20u16, 2);
230 } 402 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
231 return; 403 vreinterpretq_s16_u16(q0u16));
232 } 404 d0u8 = vqmovun_s16(q0s16);
233 405 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
234 void vp9_tm_predictor_4x4_neon( 406 dst += stride;
235 uint8_t *dst, 407 q0u16 = vdupq_lane_u16(d20u16, 3);
236 ptrdiff_t y_stride, 408 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
237 const uint8_t *above, 409 vreinterpretq_s16_u16(q0u16));
238 const uint8_t *left) { 410 d0u8 = vqmovun_s16(q0s16);
239 int i; 411 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
240 uint16x8_t q1u16, q3u16; 412 dst += stride;
241 int16x8_t q1s16; 413 }
242 uint8x8_t d0u8 = vdup_n_u8(0); 414 }
243 uint32x2_t d2u32 = vdup_n_u32(0); 415
244 416 void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
245 d0u8 = vdup_n_u8(above[-1]); 417 const uint8_t *above, const uint8_t *left) {
246 d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); 418 int j, k;
247 q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); 419 uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
248 for (i = 0; i < 4; i++, dst += y_stride) { 420 uint8x16_t q0u8, q1u8;
249 q1u16 = vdupq_n_u16((uint16_t)left[i]); 421 int16x8_t q0s16, q1s16, q8s16, q11s16;
250 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), 422 uint16x4_t d20u16;
251 vreinterpretq_s16_u16(q3u16)); 423 uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
252 d0u8 = vqmovun_s16(q1s16); 424
253 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); 425 q0u8 = vld1q_dup_u8(above - 1);
254 } 426 q1u8 = vld1q_u8(above);
255 return; 427 q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
256 } 428 q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
257 429 for (k = 0; k < 2; k++, left += 8) {
258 void vp9_tm_predictor_8x8_neon( 430 d18u8 = vld1_u8(left);
259 uint8_t *dst, 431 q10u16 = vmovl_u8(d18u8);
260 ptrdiff_t y_stride,
261 const uint8_t *above,
262 const uint8_t *left) {
263 int j;
264 uint16x8_t q0u16, q3u16, q10u16;
265 int16x8_t q0s16;
266 uint16x4_t d20u16;
267 uint8x8_t d0u8, d2u8, d30u8;
268
269 d0u8 = vdup_n_u8(above[-1]);
270 d30u8 = vld1_u8(left);
271 d2u8 = vld1_u8(above);
272 q10u16 = vmovl_u8(d30u8);
273 q3u16 = vsubl_u8(d2u8, d0u8);
274 d20u16 = vget_low_u16(q10u16); 432 d20u16 = vget_low_u16(q10u16);
275 for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { 433 for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
276 q0u16 = vdupq_lane_u16(d20u16, 0); 434 q0u16 = vdupq_lane_u16(d20u16, 0);
277 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), 435 q8u16 = vdupq_lane_u16(d20u16, 1);
278 vreinterpretq_s16_u16(q0u16)); 436 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
279 d0u8 = vqmovun_s16(q0s16); 437 vreinterpretq_s16_u16(q2u16));
280 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); 438 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
281 dst += y_stride; 439 vreinterpretq_s16_u16(q3u16));
282 q0u16 = vdupq_lane_u16(d20u16, 1); 440 q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
283 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), 441 vreinterpretq_s16_u16(q2u16));
284 vreinterpretq_s16_u16(q0u16)); 442 q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
285 d0u8 = vqmovun_s16(q0s16); 443 vreinterpretq_s16_u16(q3u16));
286 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); 444 d2u8 = vqmovun_s16(q1s16);
287 dst += y_stride; 445 d3u8 = vqmovun_s16(q0s16);
288 q0u16 = vdupq_lane_u16(d20u16, 2); 446 d22u8 = vqmovun_s16(q11s16);
289 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), 447 d23u8 = vqmovun_s16(q8s16);
290 vreinterpretq_s16_u16(q0u16)); 448 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
291 d0u8 = vqmovun_s16(q0s16); 449 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
292 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); 450 dst += stride;
293 dst += y_stride; 451 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
294 q0u16 = vdupq_lane_u16(d20u16, 3); 452 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
295 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), 453 dst += stride;
296 vreinterpretq_s16_u16(q0u16)); 454
297 d0u8 = vqmovun_s16(q0s16); 455 q0u16 = vdupq_lane_u16(d20u16, 2);
298 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); 456 q8u16 = vdupq_lane_u16(d20u16, 3);
299 dst += y_stride; 457 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
458 vreinterpretq_s16_u16(q2u16));
459 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
460 vreinterpretq_s16_u16(q3u16));
461 q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
462 vreinterpretq_s16_u16(q2u16));
463 q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
464 vreinterpretq_s16_u16(q3u16));
465 d2u8 = vqmovun_s16(q1s16);
466 d3u8 = vqmovun_s16(q0s16);
467 d22u8 = vqmovun_s16(q11s16);
468 d23u8 = vqmovun_s16(q8s16);
469 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
470 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
471 dst += stride;
472 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
473 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
474 dst += stride;
300 } 475 }
301 return; 476 }
302 } 477 }
303 478
304 void vp9_tm_predictor_16x16_neon( 479 void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
305 uint8_t *dst, 480 const uint8_t *above, const uint8_t *left) {
306 ptrdiff_t y_stride, 481 int j, k;
307 const uint8_t *above, 482 uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
308 const uint8_t *left) { 483 uint8x16_t q0u8, q1u8, q2u8;
309 int j, k; 484 int16x8_t q12s16, q13s16, q14s16, q15s16;
310 uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; 485 uint16x4_t d6u16;
311 uint8x16_t q0u8, q1u8; 486 uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
312 int16x8_t q0s16, q1s16, q8s16, q11s16; 487
313 uint16x4_t d20u16; 488 q0u8 = vld1q_dup_u8(above - 1);
314 uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; 489 q1u8 = vld1q_u8(above);
315 490 q2u8 = vld1q_u8(above + 16);
316 q0u8 = vdupq_n_u8(above[-1]); 491 q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
317 q1u8 = vld1q_u8(above); 492 q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
318 q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); 493 q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
319 q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); 494 q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
320 for (k = 0; k < 2; k++, left += 8) { 495 for (k = 0; k < 4; k++, left += 8) {
321 d18u8 = vld1_u8(left); 496 d26u8 = vld1_u8(left);
322 q10u16 = vmovl_u8(d18u8); 497 q3u16 = vmovl_u8(d26u8);
323 d20u16 = vget_low_u16(q10u16); 498 d6u16 = vget_low_u16(q3u16);
324 for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { 499 for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
325 q0u16 = vdupq_lane_u16(d20u16, 0); 500 q0u16 = vdupq_lane_u16(d6u16, 0);
326 q8u16 = vdupq_lane_u16(d20u16, 1); 501 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
327 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), 502 vreinterpretq_s16_u16(q8u16));
328 vreinterpretq_s16_u16(q2u16)); 503 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
329 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), 504 vreinterpretq_s16_u16(q9u16));
330 vreinterpretq_s16_u16(q3u16)); 505 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
331 q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), 506 vreinterpretq_s16_u16(q10u16));
332 vreinterpretq_s16_u16(q2u16)); 507 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
333 q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), 508 vreinterpretq_s16_u16(q11u16));
334 vreinterpretq_s16_u16(q3u16)); 509 d0u8 = vqmovun_s16(q12s16);
335 d2u8 = vqmovun_s16(q1s16); 510 d1u8 = vqmovun_s16(q13s16);
336 d3u8 = vqmovun_s16(q0s16); 511 d2u8 = vqmovun_s16(q14s16);
337 d22u8 = vqmovun_s16(q11s16); 512 d3u8 = vqmovun_s16(q15s16);
338 d23u8 = vqmovun_s16(q8s16); 513 q0u8 = vcombine_u8(d0u8, d1u8);
339 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); 514 q1u8 = vcombine_u8(d2u8, d3u8);
340 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); 515 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
341 dst += y_stride; 516 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
342 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); 517 dst += stride;
343 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); 518
344 dst += y_stride; 519 q0u16 = vdupq_lane_u16(d6u16, 1);
345 520 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
346 q0u16 = vdupq_lane_u16(d20u16, 2); 521 vreinterpretq_s16_u16(q8u16));
347 q8u16 = vdupq_lane_u16(d20u16, 3); 522 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
348 q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), 523 vreinterpretq_s16_u16(q9u16));
349 vreinterpretq_s16_u16(q2u16)); 524 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
350 q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), 525 vreinterpretq_s16_u16(q10u16));
351 vreinterpretq_s16_u16(q3u16)); 526 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
352 q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), 527 vreinterpretq_s16_u16(q11u16));
353 vreinterpretq_s16_u16(q2u16)); 528 d0u8 = vqmovun_s16(q12s16);
354 q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), 529 d1u8 = vqmovun_s16(q13s16);
355 vreinterpretq_s16_u16(q3u16)); 530 d2u8 = vqmovun_s16(q14s16);
356 d2u8 = vqmovun_s16(q1s16); 531 d3u8 = vqmovun_s16(q15s16);
357 d3u8 = vqmovun_s16(q0s16); 532 q0u8 = vcombine_u8(d0u8, d1u8);
358 d22u8 = vqmovun_s16(q11s16); 533 q1u8 = vcombine_u8(d2u8, d3u8);
359 d23u8 = vqmovun_s16(q8s16); 534 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
360 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); 535 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
361 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); 536 dst += stride;
362 dst += y_stride; 537
363 vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); 538 q0u16 = vdupq_lane_u16(d6u16, 2);
364 vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); 539 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
365 dst += y_stride; 540 vreinterpretq_s16_u16(q8u16));
366 } 541 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
542 vreinterpretq_s16_u16(q9u16));
543 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
544 vreinterpretq_s16_u16(q10u16));
545 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
546 vreinterpretq_s16_u16(q11u16));
547 d0u8 = vqmovun_s16(q12s16);
548 d1u8 = vqmovun_s16(q13s16);
549 d2u8 = vqmovun_s16(q14s16);
550 d3u8 = vqmovun_s16(q15s16);
551 q0u8 = vcombine_u8(d0u8, d1u8);
552 q1u8 = vcombine_u8(d2u8, d3u8);
553 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
554 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
555 dst += stride;
556
557 q0u16 = vdupq_lane_u16(d6u16, 3);
558 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
559 vreinterpretq_s16_u16(q8u16));
560 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
561 vreinterpretq_s16_u16(q9u16));
562 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
563 vreinterpretq_s16_u16(q10u16));
564 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
565 vreinterpretq_s16_u16(q11u16));
566 d0u8 = vqmovun_s16(q12s16);
567 d1u8 = vqmovun_s16(q13s16);
568 d2u8 = vqmovun_s16(q14s16);
569 d3u8 = vqmovun_s16(q15s16);
570 q0u8 = vcombine_u8(d0u8, d1u8);
571 q1u8 = vcombine_u8(d2u8, d3u8);
572 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
573 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
574 dst += stride;
367 } 575 }
368 return; 576 }
369 } 577 }
370 578 #endif // !HAVE_NEON_ASM
371 void vp9_tm_predictor_32x32_neon(
372 uint8_t *dst,
373 ptrdiff_t y_stride,
374 const uint8_t *above,
375 const uint8_t *left) {
376 int j, k;
377 uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
378 uint8x16_t q0u8, q1u8, q2u8;
379 int16x8_t q12s16, q13s16, q14s16, q15s16;
380 uint16x4_t d6u16;
381 uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
382
383 q0u8 = vdupq_n_u8(above[-1]);
384 q1u8 = vld1q_u8(above);
385 q2u8 = vld1q_u8(above + 16);
386 q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
387 q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
388 q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
389 q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
390 for (k = 0; k < 4; k++, left += 8) {
391 d26u8 = vld1_u8(left);
392 q3u16 = vmovl_u8(d26u8);
393 d6u16 = vget_low_u16(q3u16);
394 for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
395 q0u16 = vdupq_lane_u16(d6u16, 0);
396 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
397 vreinterpretq_s16_u16(q8u16));
398 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
399 vreinterpretq_s16_u16(q9u16));
400 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
401 vreinterpretq_s16_u16(q10u16));
402 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
403 vreinterpretq_s16_u16(q11u16));
404 d0u8 = vqmovun_s16(q12s16);
405 d1u8 = vqmovun_s16(q13s16);
406 d2u8 = vqmovun_s16(q14s16);
407 d3u8 = vqmovun_s16(q15s16);
408 q0u8 = vcombine_u8(d0u8, d1u8);
409 q1u8 = vcombine_u8(d2u8, d3u8);
410 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
411 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
412 dst += y_stride;
413
414 q0u16 = vdupq_lane_u16(d6u16, 1);
415 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
416 vreinterpretq_s16_u16(q8u16));
417 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
418 vreinterpretq_s16_u16(q9u16));
419 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
420 vreinterpretq_s16_u16(q10u16));
421 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
422 vreinterpretq_s16_u16(q11u16));
423 d0u8 = vqmovun_s16(q12s16);
424 d1u8 = vqmovun_s16(q13s16);
425 d2u8 = vqmovun_s16(q14s16);
426 d3u8 = vqmovun_s16(q15s16);
427 q0u8 = vcombine_u8(d0u8, d1u8);
428 q1u8 = vcombine_u8(d2u8, d3u8);
429 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
430 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
431 dst += y_stride;
432
433 q0u16 = vdupq_lane_u16(d6u16, 2);
434 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
435 vreinterpretq_s16_u16(q8u16));
436 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
437 vreinterpretq_s16_u16(q9u16));
438 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
439 vreinterpretq_s16_u16(q10u16));
440 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
441 vreinterpretq_s16_u16(q11u16));
442 d0u8 = vqmovun_s16(q12s16);
443 d1u8 = vqmovun_s16(q13s16);
444 d2u8 = vqmovun_s16(q14s16);
445 d3u8 = vqmovun_s16(q15s16);
446 q0u8 = vcombine_u8(d0u8, d1u8);
447 q1u8 = vcombine_u8(d2u8, d3u8);
448 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
449 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
450 dst += y_stride;
451
452 q0u16 = vdupq_lane_u16(d6u16, 3);
453 q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
454 vreinterpretq_s16_u16(q8u16));
455 q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
456 vreinterpretq_s16_u16(q9u16));
457 q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
458 vreinterpretq_s16_u16(q10u16));
459 q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
460 vreinterpretq_s16_u16(q11u16));
461 d0u8 = vqmovun_s16(q12s16);
462 d1u8 = vqmovun_s16(q13s16);
463 d2u8 = vqmovun_s16(q14s16);
464 d3u8 = vqmovun_s16(q15s16);
465 q0u8 = vcombine_u8(d0u8, d1u8);
466 q1u8 = vcombine_u8(d2u8, d3u8);
467 vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
468 vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
469 dst += y_stride;
470 }
471 }
472 return;
473 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698