Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: third_party/opus/src/celt/arm/celt_neon_intr.c

Issue 2962373002: [Opus] Update to v1.2.1 (Closed)
Patch Set: Pre-increment instead of post-increment Created 3 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* Copyright (c) 2014-2015 Xiph.Org Foundation 1 /* Copyright (c) 2014-2015 Xiph.Org Foundation
2 Written by Viswanath Puttagunta */ 2 Written by Viswanath Puttagunta */
3 /** 3 /**
4 @file celt_neon_intr.c 4 @file celt_neon_intr.c
5 @brief ARM Neon Intrinsic optimizations for celt 5 @brief ARM Neon Intrinsic optimizations for celt
6 */ 6 */
7 7
8 /* 8 /*
9 Redistribution and use in source and binary forms, with or without 9 Redistribution and use in source and binary forms, with or without
10 modification, are permitted provided that the following conditions 10 modification, are permitted provided that the following conditions
(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after
184 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 184 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
185 YY[0]= vld1q_f32(++yi); 185 YY[0]= vld1q_f32(++yi);
186 } 186 }
187 187
188 XX_2 = vld1_dup_f32(xi); 188 XX_2 = vld1_dup_f32(xi);
189 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0); 189 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
190 190
191 vst1q_f32(sum, SUMM); 191 vst1q_f32(sum, SUMM);
192 } 192 }
193 193
194 /*
195 * Function: xcorr_kernel_neon_float_process1
196 * ---------------------------------
197 * Computes single correlation values and stores in *sum
198 */
199 static void xcorr_kernel_neon_float_process1(const float32_t *x,
200 const float32_t *y, float32_t *sum, int len) {
201 float32x4_t XX[4];
202 float32x4_t YY[4];
203 float32x2_t XX_2;
204 float32x2_t YY_2;
205 float32x4_t SUMM;
206 float32x2_t SUMM_2[2];
207 const float32_t *xi = x;
208 const float32_t *yi = y;
209
210 SUMM = vdupq_n_f32(0);
211
212 /* Work on 16 values per iteration */
213 while (len >= 16) {
214 XX[0] = vld1q_f32(xi);
215 xi += 4;
216 XX[1] = vld1q_f32(xi);
217 xi += 4;
218 XX[2] = vld1q_f32(xi);
219 xi += 4;
220 XX[3] = vld1q_f32(xi);
221 xi += 4;
222
223 YY[0] = vld1q_f32(yi);
224 yi += 4;
225 YY[1] = vld1q_f32(yi);
226 yi += 4;
227 YY[2] = vld1q_f32(yi);
228 yi += 4;
229 YY[3] = vld1q_f32(yi);
230 yi += 4;
231
232 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
233 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
234 SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
235 SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
236 len -= 16;
237 }
238
239 /* Work on 8 values */
240 if (len >= 8) {
241 XX[0] = vld1q_f32(xi);
242 xi += 4;
243 XX[1] = vld1q_f32(xi);
244 xi += 4;
245
246 YY[0] = vld1q_f32(yi);
247 yi += 4;
248 YY[1] = vld1q_f32(yi);
249 yi += 4;
250
251 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
252 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
253 len -= 8;
254 }
255
256 /* Work on 4 values */
257 if (len >= 4) {
258 XX[0] = vld1q_f32(xi);
259 xi += 4;
260 YY[0] = vld1q_f32(yi);
261 yi += 4;
262 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
263 len -= 4;
264 }
265
266 /* Start accumulating results */
267 SUMM_2[0] = vget_low_f32(SUMM);
268 if (len >= 2) {
269 /* While at it, consume 2 more values if available */
270 XX_2 = vld1_f32(xi);
271 xi += 2;
272 YY_2 = vld1_f32(yi);
273 yi += 2;
274 SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
275 len -= 2;
276 }
277 SUMM_2[1] = vget_high_f32(SUMM);
278 SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
279 SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
280 /* Ok, now we have result accumulated in SUMM_2[0].0 */
281
282 if (len > 0) {
283 /* Case when you have one value left */
284 XX_2 = vld1_dup_f32(xi);
285 YY_2 = vld1_dup_f32(yi);
286 SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
287 }
288
289 vst1_lane_f32(sum, SUMM_2[0], 0);
290 }
291
292 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y, 194 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
293 opus_val32 *xcorr, int len, int max_pitch) { 195 opus_val32 *xcorr, int len, int max_pitch, int arch) {
294 int i; 196 int i;
197 (void)arch;
295 celt_assert(max_pitch > 0); 198 celt_assert(max_pitch > 0);
296 celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0); 199 celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);
297 200
298 for (i = 0; i < (max_pitch-3); i += 4) { 201 for (i = 0; i < (max_pitch-3); i += 4) {
299 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i, 202 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
300 (float32_t *)xcorr+i, len); 203 (float32_t *)xcorr+i, len);
301 } 204 }
302 205
303 /* In case max_pitch isn't multiple of 4 206 /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
304 * compute single correlation value per iteration
305 */
306 for (; i < max_pitch; i++) { 207 for (; i < max_pitch; i++) {
307 xcorr_kernel_neon_float_process1((const float32_t *)_x, 208 xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
308 (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
309 } 209 }
310 } 210 }
311 #endif 211 #endif
OLDNEW
« no previous file with comments | « third_party/opus/src/celt/arm/celt_ne10_fft.c ('k') | third_party/opus/src/celt/arm/celt_pitch_xcorr_arm.s » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698