Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(272)

Side by Side Diff: celt/arm/celt_neon_intr.c

Issue 882843002: Update to opus-HEAD-66611f1. (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/opus.git@master
Patch Set: Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /* Copyright (c) 2014-2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */
3 /**
4 @file celt_neon_intr.c
5 @brief ARM Neon Intrinsic optimizations for celt
6 */
7
8 /*
9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions
11 are met:
12
13 - Redistributions of source code must retain the above copyright
14 notice, this list of conditions and the following disclaimer.
15
16 - Redistributions in binary form must reproduce the above copyright
17 notice, this list of conditions and the following disclaimer in the
18 documentation and/or other materials provided with the distribution.
19
20 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
24 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32 #include <arm_neon.h>
33 #include "../pitch.h"
34
35 /*
36 * Function: xcorr_kernel_neon_float
37 * ---------------------------------
38 * Computes 4 correlation values and stores them in sum[4]
39 */
40 static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
41 float32_t sum[4], int len) {
42 float32x4_t YY[3];
43 float32x4_t YEXT[3];
44 float32x4_t XX[2];
45 float32x2_t XX_2;
46 float32x4_t SUMM;
47 const float32_t *xi = x;
48 const float32_t *yi = y;
49
50 celt_assert(len>0);
51
52 YY[0] = vld1q_f32(yi);
53 SUMM = vdupq_n_f32(0);
54
55 /* Consume 8 elements in x vector and 12 elements in y
56 * vector. However, the 12'th element never really gets
57 * touched in this loop. So, if len == 8, then we only
58 * must access y[0] to y[10]. y[11] must not be accessed
59 * hence make sure len > 8 and not len >= 8
60 */
61 while (len > 8) {
62 yi += 4;
63 YY[1] = vld1q_f32(yi);
64 yi += 4;
65 YY[2] = vld1q_f32(yi);
66
67 XX[0] = vld1q_f32(xi);
68 xi += 4;
69 XX[1] = vld1q_f32(xi);
70 xi += 4;
71
72 SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
73 YEXT[0] = vextq_f32(YY[0], YY[1], 1);
74 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
75 YEXT[1] = vextq_f32(YY[0], YY[1], 2);
76 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
77 YEXT[2] = vextq_f32(YY[0], YY[1], 3);
78 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
79
80 SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
81 YEXT[0] = vextq_f32(YY[1], YY[2], 1);
82 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
83 YEXT[1] = vextq_f32(YY[1], YY[2], 2);
84 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
85 YEXT[2] = vextq_f32(YY[1], YY[2], 3);
86 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);
87
88 YY[0] = YY[2];
89 len -= 8;
90 }
91
92 /* Consume 4 elements in x vector and 8 elements in y
93 * vector. However, the 8'th element in y never really gets
94 * touched in this loop. So, if len == 4, then we only
95 * must access y[0] to y[6]. y[7] must not be accessed
96 * hence make sure len>4 and not len>=4
97 */
98 if (len > 4) {
99 yi += 4;
100 YY[1] = vld1q_f32(yi);
101
102 XX[0] = vld1q_f32(xi);
103 xi += 4;
104
105 SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
106 YEXT[0] = vextq_f32(YY[0], YY[1], 1);
107 SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
108 YEXT[1] = vextq_f32(YY[0], YY[1], 2);
109 SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
110 YEXT[2] = vextq_f32(YY[0], YY[1], 3);
111 SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);
112
113 YY[0] = YY[1];
114 len -= 4;
115 }
116
117 while (--len > 0) {
118 XX_2 = vld1_dup_f32(xi++);
119 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
120 YY[0]= vld1q_f32(++yi);
121 }
122
123 XX_2 = vld1_dup_f32(xi);
124 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
125
126 vst1q_f32(sum, SUMM);
127 }
128
129 /*
130 * Function: xcorr_kernel_neon_float_process1
131 * ---------------------------------
132 * Computes single correlation values and stores in *sum
133 */
134 static void xcorr_kernel_neon_float_process1(const float32_t *x,
135 const float32_t *y, float32_t *sum, int len) {
136 float32x4_t XX[4];
137 float32x4_t YY[4];
138 float32x2_t XX_2;
139 float32x2_t YY_2;
140 float32x4_t SUMM;
141 float32x2_t SUMM_2[2];
142 const float32_t *xi = x;
143 const float32_t *yi = y;
144
145 SUMM = vdupq_n_f32(0);
146
147 /* Work on 16 values per iteration */
148 while (len >= 16) {
149 XX[0] = vld1q_f32(xi);
150 xi += 4;
151 XX[1] = vld1q_f32(xi);
152 xi += 4;
153 XX[2] = vld1q_f32(xi);
154 xi += 4;
155 XX[3] = vld1q_f32(xi);
156 xi += 4;
157
158 YY[0] = vld1q_f32(yi);
159 yi += 4;
160 YY[1] = vld1q_f32(yi);
161 yi += 4;
162 YY[2] = vld1q_f32(yi);
163 yi += 4;
164 YY[3] = vld1q_f32(yi);
165 yi += 4;
166
167 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
168 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
169 SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
170 SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
171 len -= 16;
172 }
173
174 /* Work on 8 values */
175 if (len >= 8) {
176 XX[0] = vld1q_f32(xi);
177 xi += 4;
178 XX[1] = vld1q_f32(xi);
179 xi += 4;
180
181 YY[0] = vld1q_f32(yi);
182 yi += 4;
183 YY[1] = vld1q_f32(yi);
184 yi += 4;
185
186 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
187 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
188 len -= 8;
189 }
190
191 /* Work on 4 values */
192 if (len >= 4) {
193 XX[0] = vld1q_f32(xi);
194 xi += 4;
195 YY[0] = vld1q_f32(yi);
196 yi += 4;
197 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
198 len -= 4;
199 }
200
201 /* Start accumulating results */
202 SUMM_2[0] = vget_low_f32(SUMM);
203 if (len >= 2) {
204 /* While at it, consume 2 more values if available */
205 XX_2 = vld1_f32(xi);
206 xi += 2;
207 YY_2 = vld1_f32(yi);
208 yi += 2;
209 SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
210 len -= 2;
211 }
212 SUMM_2[1] = vget_high_f32(SUMM);
213 SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
214 SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
215 /* Ok, now we have result accumulated in SUMM_2[0].0 */
216
217 if (len > 0) {
218 /* Case when you have one value left */
219 XX_2 = vld1_dup_f32(xi);
220 YY_2 = vld1_dup_f32(yi);
221 SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
222 }
223
224 vst1_lane_f32(sum, SUMM_2[0], 0);
225 }
226
227 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
228 opus_val32 *xcorr, int len, int max_pitch) {
229 int i;
230 celt_assert(max_pitch > 0);
231 celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
232
233 for (i = 0; i < (max_pitch-3); i += 4) {
234 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
235 (float32_t *)xcorr+i, len);
236 }
237
238 /* In case max_pitch isn't multiple of 4
239 * compute single correlation value per iteration
240 */
241 for (; i < max_pitch; i++) {
242 xcorr_kernel_neon_float_process1((const float32_t *)_x,
243 (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
244 }
245 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698