third_party/opus/src/celt/arm/celt_neon_intr.c - Issue 2962373002: [Opus] Update to v1.2.1

Side by Side Diff: third_party/opus/src/celt/arm/celt_neon_intr.c

Issue 2962373002: [Opus] Update to v1.2.1 (Closed)

Patch Set: Pre-increment instead of post-increment Created 3 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /* Copyright (c) 2014-2015 Xiph.Org Foundation	1 /* Copyright (c) 2014-2015 Xiph.Org Foundation

2 Written by Viswanath Puttagunta */	2 Written by Viswanath Puttagunta */

3 /**	3 /**

4 @file celt_neon_intr.c	4 @file celt_neon_intr.c

5 @brief ARM Neon Intrinsic optimizations for celt	5 @brief ARM Neon Intrinsic optimizations for celt

6 */	6 */

7	7

8 /*	8 /*

9 Redistribution and use in source and binary forms, with or without	9 Redistribution and use in source and binary forms, with or without

10 modification, are permitted provided that the following conditions	10 modification, are permitted provided that the following conditions

(...skipping 173 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
184 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);	184 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

185 YY[0]= vld1q_f32(++yi);	185 YY[0]= vld1q_f32(++yi);

186 }	186 }

187	187

188 XX_2 = vld1_dup_f32(xi);	188 XX_2 = vld1_dup_f32(xi);

189 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);	189 SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

190	190

191 vst1q_f32(sum, SUMM);	191 vst1q_f32(sum, SUMM);

192 }	192 }

193	193

194 /*

195 * Function: xcorr_kernel_neon_float_process1

196 * ---------------------------------

197 * Computes single correlation values and stores in *sum

198 */

199 static void xcorr_kernel_neon_float_process1(const float32_t *x,

200 const float32_t *y, float32_t *sum, int len) {

201 float32x4_t XX[4];

202 float32x4_t YY[4];

203 float32x2_t XX_2;

204 float32x2_t YY_2;

205 float32x4_t SUMM;

206 float32x2_t SUMM_2[2];

207 const float32_t *xi = x;

208 const float32_t *yi = y;

209

210 SUMM = vdupq_n_f32(0);

211

212 /* Work on 16 values per iteration */

213 while (len >= 16) {

214 XX[0] = vld1q_f32(xi);

215 xi += 4;

216 XX[1] = vld1q_f32(xi);

217 xi += 4;

218 XX[2] = vld1q_f32(xi);

219 xi += 4;

220 XX[3] = vld1q_f32(xi);

221 xi += 4;

222

223 YY[0] = vld1q_f32(yi);

224 yi += 4;

225 YY[1] = vld1q_f32(yi);

226 yi += 4;

227 YY[2] = vld1q_f32(yi);

228 yi += 4;

229 YY[3] = vld1q_f32(yi);

230 yi += 4;

231

232 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);

233 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);

234 SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);

235 SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);

236 len -= 16;

237 }

238

239 /* Work on 8 values */

240 if (len >= 8) {

241 XX[0] = vld1q_f32(xi);

242 xi += 4;

243 XX[1] = vld1q_f32(xi);

244 xi += 4;

245

246 YY[0] = vld1q_f32(yi);

247 yi += 4;

248 YY[1] = vld1q_f32(yi);

249 yi += 4;

250

251 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);

252 SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);

253 len -= 8;

254 }

255

256 /* Work on 4 values */

257 if (len >= 4) {

258 XX[0] = vld1q_f32(xi);

259 xi += 4;

260 YY[0] = vld1q_f32(yi);

261 yi += 4;

262 SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);

263 len -= 4;

264 }

265

266 /* Start accumulating results */

267 SUMM_2[0] = vget_low_f32(SUMM);

268 if (len >= 2) {

269 /* While at it, consume 2 more values if available */

270 XX_2 = vld1_f32(xi);

271 xi += 2;

272 YY_2 = vld1_f32(yi);

273 yi += 2;

274 SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);

275 len -= 2;

276 }

277 SUMM_2[1] = vget_high_f32(SUMM);

278 SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);

279 SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);

280 /* Ok, now we have result accumulated in SUMM_2[0].0 */

281

282 if (len > 0) {

283 /* Case when you have one value left */

284 XX_2 = vld1_dup_f32(xi);

285 YY_2 = vld1_dup_f32(yi);

286 SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);

287 }

288

289 vst1_lane_f32(sum, SUMM_2[0], 0);

290 }

291

292 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,	194 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,

293 opus_val32 *xcorr, int len, int max_pitch) {	195 opus_val32 *xcorr, int len, int max_pitch, int arch) {

294 int i;	196 int i;

	197 (void)arch;

295 celt_assert(max_pitch > 0);	198 celt_assert(max_pitch > 0);

296 celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);	199 celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

297	200

298 for (i = 0; i < (max_pitch-3); i += 4) {	201 for (i = 0; i < (max_pitch-3); i += 4) {

299 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,	202 xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,

300 (float32_t *)xcorr+i, len);	203 (float32_t *)xcorr+i, len);

301 }	204 }

302	205

303 /* In case max_pitch isn't multiple of 4	206 /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */

304 * compute single correlation value per iteration

305 */

306 for (; i < max_pitch; i++) {	207 for (; i < max_pitch; i++) {

307 xcorr_kernel_neon_float_process1((const float32_t *)_x,	208 xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);

308 (const float32_t *)_y+i, (float32_t *)xcorr+i, len);

309 }	209 }

310 }	210 }

311 #endif	211 #endif

OLD	NEW

« no previous file with comments | « third_party/opus/src/celt/arm/celt_ne10_fft.c ('k') | third_party/opus/src/celt/arm/celt_pitch_xcorr_arm.s » ('j') | no next file with comments »