Index: third_party/opus/src/celt/arm/celt_neon_intr.c
diff --git a/third_party/opus/src/celt/arm/celt_neon_intr.c b/third_party/opus/src/celt/arm/celt_neon_intr.c
index 47bbe3dc22ec93cfeb9398248bd144128c333975..cf443988ce5fe5323b8445cb656a991a98da514f 100644
--- a/third_party/opus/src/celt/arm/celt_neon_intr.c
+++ b/third_party/opus/src/celt/arm/celt_neon_intr.c
@@ -191,107 +191,10 @@ static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
    vst1q_f32(sum, SUMM);
 }

-/*
- * Function: xcorr_kernel_neon_float_process1
- * ---------------------------------
- * Computes single correlation values and stores in *sum
- */
-static void xcorr_kernel_neon_float_process1(const float32_t *x,
-      const float32_t *y, float32_t *sum, int len) {
-   float32x4_t XX[4];
-   float32x4_t YY[4];
-   float32x2_t XX_2;
-   float32x2_t YY_2;
-   float32x4_t SUMM;
-   float32x2_t SUMM_2[2];
-   const float32_t *xi = x;
-   const float32_t *yi = y;
-
-   SUMM = vdupq_n_f32(0);
-
-   /* Work on 16 values per iteration */
-   while (len >= 16) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-      XX[2] = vld1q_f32(xi);
-      xi += 4;
-      XX[3] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-      YY[2] = vld1q_f32(yi);
-      yi += 4;
-      YY[3] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
-      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
-      len -= 16;
-   }
-
-   /* Work on 8 values */
-   if (len >= 8) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      XX[1] = vld1q_f32(xi);
-      xi += 4;
-
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      YY[1] = vld1q_f32(yi);
-      yi += 4;
-
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
-      len -= 8;
-   }
-
-   /* Work on 4 values */
-   if (len >= 4) {
-      XX[0] = vld1q_f32(xi);
-      xi += 4;
-      YY[0] = vld1q_f32(yi);
-      yi += 4;
-      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
-      len -= 4;
-   }
-
-   /* Start accumulating results */
-   SUMM_2[0] = vget_low_f32(SUMM);
-   if (len >= 2) {
-      /* While at it, consume 2 more values if available */
-      XX_2 = vld1_f32(xi);
-      xi += 2;
-      YY_2 = vld1_f32(yi);
-      yi += 2;
-      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
-      len -= 2;
-   }
-   SUMM_2[1] = vget_high_f32(SUMM);
-   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
-   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
-   /* Ok, now we have result accumulated in SUMM_2[0].0 */
-
-   if (len > 0) {
-      /* Case when you have one value left */
-      XX_2 = vld1_dup_f32(xi);
-      YY_2 = vld1_dup_f32(yi);
-      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
-   }
-
-   vst1_lane_f32(sum, SUMM_2[0], 0);
-}
-
 void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
-      opus_val32 *xcorr, int len, int max_pitch) {
+      opus_val32 *xcorr, int len, int max_pitch, int arch) {
    int i;
+   (void)arch;
    celt_assert(max_pitch > 0);
    celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

@@ -300,12 +203,9 @@ void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
          (float32_t *)xcorr+i, len);
    }

-   /* In case max_pitch isn't multiple of 4
-    * compute single correlation value per iteration
-    */
+   /* In case max_pitch isn't a multiple of 4, do non-unrolled version. */
    for (; i < max_pitch; i++) {
-      xcorr_kernel_neon_float_process1((const float32_t *)_x,
-            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
+      xcorr[i] = celt_inner_prod_neon(_x, _y+i, len);
    }
 }
 #endif
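
Note on the change: the second hunk drops the hand-rolled single-lag kernel xcorr_kernel_neon_float_process1() and instead computes each leftover lag (when max_pitch is not a multiple of 4) with celt_inner_prod_neon(), the NEON inner-product helper provided elsewhere in the Opus NEON sources. The new int arch argument is accepted but unused, hence (void)arch;, presumably only so the NEON version keeps the same argument list as the generic celt_pitch_xcorr(). For orientation, the following is a minimal sketch of what such a NEON float inner product looks like, assuming a plain 4-wide multiply-accumulate loop with a scalar tail; the function name and structure are illustrative, not copied from the Opus sources.

#include <arm_neon.h>

/* Sketch only: a NEON float inner product in the spirit of
 * celt_inner_prod_neon(); the real Opus routine may differ. */
static float inner_prod_neon_sketch(const float *x, const float *y, int len)
{
   float32x4_t acc = vdupq_n_f32(0.f);
   float32x2_t acc2;
   float s;
   int i = 0;

   /* Main loop: four multiply-accumulates per iteration. */
   for (; i <= len - 4; i += 4)
      acc = vmlaq_f32(acc, vld1q_f32(x + i), vld1q_f32(y + i));

   /* Horizontal sum of the four partial sums. */
   acc2 = vadd_f32(vget_low_f32(acc), vget_high_f32(acc));
   acc2 = vpadd_f32(acc2, acc2);
   s = vget_lane_f32(acc2, 0);

   /* Scalar tail for the remaining 0-3 samples. */
   for (; i < len; i++)
      s += x[i] * y[i];
   return s;
}

For lag i, xcorr[i] would then be inner_prod_neon_sketch(_x, _y + i, len), the same quantity the removed kernel stored through its sum pointer.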