/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <stddef.h>
#include <arm_neon.h>

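/* Averaging copy: each w x h block of src is combined with the pixels
 * already in dst using a per-byte rounding average and written back to dst.
 * The filter arguments are ignored; no filtering is performed on this path. */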
void vp9_convolve_avg_neon(
    const uint8_t *src,    // r0
    ptrdiff_t src_stride,  // r1
    uint8_t *dst,          // r2
    ptrdiff_t dst_stride,  // r3
    const int16_t *filter_x,
    int filter_x_stride,
    const int16_t *filter_y,
    int filter_y_stride,
    int w,
    int h) {
  uint8_t *d;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  /* Zero-initialize so the vld1_lane_u32() calls below never read an
   * indeterminate vector. */
  uint32x2_t d0u32 = vdup_n_u32(0), d2u32 = vdup_n_u32(0);
  uint8x16_t q0u8, q1u8, q2u8, q3u8, q8u8, q9u8, q10u8, q11u8;
  (void)filter_x; (void)filter_x_stride;
  (void)filter_y; (void)filter_y_stride;

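  /* Dispatch on block width. Each branch loads full rows from src and dst,
   * forms a per-byte rounding average with vrhadd(q)_u8, and stores the
   * result back to dst. */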
  d = dst;
  if (w > 32) {  // avg64
    for (; h > 0; h -= 1) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      q2u8 = vld1q_u8(src + 32);
      q3u8 = vld1q_u8(src + 48);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      q10u8 = vld1q_u8(d + 32);
      q11u8 = vld1q_u8(d + 48);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      vst1q_u8(dst + 32, q2u8);
      vst1q_u8(dst + 48, q3u8);
      dst += dst_stride;
    }
  } else if (w == 32) {  // avg32
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      q1u8 = vld1q_u8(src + 16);
      src += src_stride;
      q2u8 = vld1q_u8(src);
      q3u8 = vld1q_u8(src + 16);
      src += src_stride;
      q8u8 = vld1q_u8(d);
      q9u8 = vld1q_u8(d + 16);
      d += dst_stride;
      q10u8 = vld1q_u8(d);
      q11u8 = vld1q_u8(d + 16);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q8u8);
      q1u8 = vrhaddq_u8(q1u8, q9u8);
      q2u8 = vrhaddq_u8(q2u8, q10u8);
      q3u8 = vrhaddq_u8(q3u8, q11u8);

      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q1u8);
      dst += dst_stride;
      vst1q_u8(dst, q2u8);
      vst1q_u8(dst + 16, q3u8);
      dst += dst_stride;
    }
  } else if (w > 8) {  // avg16
    for (; h > 0; h -= 2) {
      q0u8 = vld1q_u8(src);
      src += src_stride;
      q1u8 = vld1q_u8(src);
      src += src_stride;
      q2u8 = vld1q_u8(d);
      d += dst_stride;
      q3u8 = vld1q_u8(d);
      d += dst_stride;

      q0u8 = vrhaddq_u8(q0u8, q2u8);
      q1u8 = vrhaddq_u8(q1u8, q3u8);

      vst1q_u8(dst, q0u8);
      dst += dst_stride;
      vst1q_u8(dst, q1u8);
      dst += dst_stride;
    }
  } else if (w == 8) {  // avg8
    for (; h > 0; h -= 2) {
      d0u8 = vld1_u8(src);
      src += src_stride;
      d1u8 = vld1_u8(src);
      src += src_stride;
      d2u8 = vld1_u8(d);
      d += dst_stride;
      d3u8 = vld1_u8(d);
      d += dst_stride;

      /* Pack two 8-pixel rows into each q register so a single vrhaddq
       * averages both rows at once. */
      q0u8 = vcombine_u8(d0u8, d1u8);
      q1u8 = vcombine_u8(d2u8, d3u8);
      q0u8 = vrhaddq_u8(q0u8, q1u8);

      vst1_u8(dst, vget_low_u8(q0u8));
      dst += dst_stride;
      vst1_u8(dst, vget_high_u8(q0u8));
      dst += dst_stride;
    }
  } else {  // avg4
    for (; h > 0; h -= 2) {
      /* Two 4-pixel rows are gathered into the two 32-bit lanes of a
       * d register, averaged with one vrhadd, and scattered back the
       * same way. */
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 0);
      src += src_stride;
      d0u32 = vld1_lane_u32((const uint32_t *)src, d0u32, 1);
      src += src_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 0);
      d += dst_stride;
      d2u32 = vld1_lane_u32((const uint32_t *)d, d2u32, 1);
      d += dst_stride;

      d0u8 = vrhadd_u8(vreinterpret_u8_u32(d0u32),
                       vreinterpret_u8_u32(d2u32));

      d0u32 = vreinterpret_u32_u8(d0u8);
      vst1_lane_u32((uint32_t *)dst, d0u32, 0);
      dst += dst_stride;
      vst1_lane_u32((uint32_t *)dst, d0u32, 1);
      dst += dst_stride;
    }
  }
  return;
}