/* Copyright (c) 2014-2015 Xiph.Org Foundation
   Written by Viswanath Puttagunta */
/**
   @file celt_neon_intr.c
   @brief ARM Neon Intrinsic optimizations for celt
 */

/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <arm_neon.h>
#include "../pitch.h"

/*
 * Function: xcorr_kernel_neon_float
 * ---------------------------------
 * Computes 4 correlation values and stores them in sum[4]
 */
static void xcorr_kernel_neon_float(const float32_t *x, const float32_t *y,
                                    float32_t sum[4], int len) {
   float32x4_t YY[3];
   float32x4_t YEXT[3];
   float32x4_t XX[2];
   float32x2_t XX_2;
   float32x4_t SUMM;
   const float32_t *xi = x;
   const float32_t *yi = y;
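   /* YY[] holds consecutive 4-float windows of y, YEXT[] the windows shifted
    * by 1, 2 and 3 elements, XX[] the current x values and SUMM the four
    * running correlation sums.
    */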

   celt_assert(len>0);

   YY[0] = vld1q_f32(yi);
   SUMM = vdupq_n_f32(0);

   /* Each iteration consumes 8 elements of x and 12 elements of y. The 12th
    * y element, however, is never actually needed. When len == 8 only y[0]
    * to y[10] may be accessed; y[11] must not be read, hence the condition
    * is len > 8 and not len >= 8.
    */
   while (len > 8) {
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

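      /* vextq_f32 builds the y windows shifted by 1, 2 and 3 elements, so
       * each vmlaq_lane_f32 below multiplies one x value into all four
       * correlation lags at once.
       */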
      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      SUMM = vmlaq_lane_f32(SUMM, YY[1], vget_low_f32(XX[1]), 0);
      YEXT[0] = vextq_f32(YY[1], YY[2], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[1]), 1);
      YEXT[1] = vextq_f32(YY[1], YY[2], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[1]), 0);
      YEXT[2] = vextq_f32(YY[1], YY[2], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[1]), 1);

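      /* Carry the last loaded y window over as the starting window of the
       * next iteration.
       */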
      YY[0] = YY[2];
      len -= 8;
   }

   /* This block consumes 4 elements of x and 8 elements of y. The 8th y
    * element, however, is never actually needed. When len == 4 only y[0]
    * to y[6] may be accessed; y[7] must not be read, hence the condition
    * is len > 4 and not len >= 4.
    */
   if (len > 4) {
      yi += 4;
      YY[1] = vld1q_f32(yi);

      XX[0] = vld1q_f32(xi);
      xi += 4;

      SUMM = vmlaq_lane_f32(SUMM, YY[0], vget_low_f32(XX[0]), 0);
      YEXT[0] = vextq_f32(YY[0], YY[1], 1);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[0], vget_low_f32(XX[0]), 1);
      YEXT[1] = vextq_f32(YY[0], YY[1], 2);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[1], vget_high_f32(XX[0]), 0);
      YEXT[2] = vextq_f32(YY[0], YY[1], 3);
      SUMM = vmlaq_lane_f32(SUMM, YEXT[2], vget_high_f32(XX[0]), 1);

      YY[0] = YY[1];
      len -= 4;
   }

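   /* At this point 1 <= len <= 4: multiply the remaining x values one at a
    * time against a sliding 4-wide window of y.
    */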
   while (--len > 0) {
      XX_2 = vld1_dup_f32(xi++);
      SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);
      YY[0] = vld1q_f32(++yi);
   }

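   /* The last x value is handled outside the loop so that no y window beyond
    * the last one needed is ever loaded.
    */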
   XX_2 = vld1_dup_f32(xi);
   SUMM = vmlaq_lane_f32(SUMM, YY[0], XX_2, 0);

   vst1q_f32(sum, SUMM);
}

/*
 * Function: xcorr_kernel_neon_float_process1
 * ------------------------------------------
 * Computes a single correlation value and stores it in *sum
 */
static void xcorr_kernel_neon_float_process1(const float32_t *x,
      const float32_t *y, float32_t *sum, int len) {
   float32x4_t XX[4];
   float32x4_t YY[4];
   float32x2_t XX_2;
   float32x2_t YY_2;
   float32x4_t SUMM;
   float32x2_t SUMM_2[2];
   const float32_t *xi = x;
   const float32_t *yi = y;
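   /* This kernel is a plain dot product: accumulate four partial sums in
    * SUMM while data lasts, then reduce them to a single value at the end.
    */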

   SUMM = vdupq_n_f32(0);

   /* Work on 16 values per iteration */
   while (len >= 16) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;
      XX[2] = vld1q_f32(xi);
      xi += 4;
      XX[3] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;
      YY[2] = vld1q_f32(yi);
      yi += 4;
      YY[3] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      SUMM = vmlaq_f32(SUMM, YY[2], XX[2]);
      SUMM = vmlaq_f32(SUMM, YY[3], XX[3]);
      len -= 16;
   }

   /* Work on 8 values */
   if (len >= 8) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      XX[1] = vld1q_f32(xi);
      xi += 4;

      YY[0] = vld1q_f32(yi);
      yi += 4;
      YY[1] = vld1q_f32(yi);
      yi += 4;

      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      SUMM = vmlaq_f32(SUMM, YY[1], XX[1]);
      len -= 8;
   }

   /* Work on 4 values */
   if (len >= 4) {
      XX[0] = vld1q_f32(xi);
      xi += 4;
      YY[0] = vld1q_f32(yi);
      yi += 4;
      SUMM = vmlaq_f32(SUMM, YY[0], XX[0]);
      len -= 4;
   }

   /* Start accumulating the result */
   SUMM_2[0] = vget_low_f32(SUMM);
   if (len >= 2) {
      /* While at it, consume 2 more values if available */
      XX_2 = vld1_f32(xi);
      xi += 2;
      YY_2 = vld1_f32(yi);
      yi += 2;
      SUMM_2[0] = vmla_f32(SUMM_2[0], YY_2, XX_2);
      len -= 2;
   }
   SUMM_2[1] = vget_high_f32(SUMM);
   SUMM_2[0] = vadd_f32(SUMM_2[0], SUMM_2[1]);
   SUMM_2[0] = vpadd_f32(SUMM_2[0], SUMM_2[0]);
   /* The result is now accumulated in lane 0 of SUMM_2[0] */

   if (len > 0) {
      /* Case when you have one value left */
      XX_2 = vld1_dup_f32(xi);
      YY_2 = vld1_dup_f32(yi);
      SUMM_2[0] = vmla_f32(SUMM_2[0], XX_2, YY_2);
   }

   vst1_lane_f32(sum, SUMM_2[0], 0);
}

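/*
 * Function: celt_pitch_xcorr_float_neon
 * -------------------------------------
 * Computes the pitch cross-correlation xcorr[i] = sum of _x[j] * _y[i + j]
 * over j = 0..len-1, for each lag i = 0..max_pitch-1, four lags at a time
 * where possible.
 */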
void celt_pitch_xcorr_float_neon(const opus_val16 *_x, const opus_val16 *_y,
                                 opus_val32 *xcorr, int len, int max_pitch) {
   int i;
   celt_assert(max_pitch > 0);
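   /* The assert below checks that _x is at least 4-byte aligned before it is
    * reinterpreted as a float32_t pointer.
    */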
   celt_assert((((unsigned char *)_x-(unsigned char *)NULL)&3)==0);

   for (i = 0; i < (max_pitch-3); i += 4) {
      xcorr_kernel_neon_float((const float32_t *)_x, (const float32_t *)_y+i,
            (float32_t *)xcorr+i, len);
   }

   /* In case max_pitch isn't a multiple of 4, compute a single correlation
    * value per iteration for the remaining lags.
    */
   for (; i < max_pitch; i++) {
      xcorr_kernel_neon_float_process1((const float32_t *)_x,
            (const float32_t *)_y+i, (float32_t *)xcorr+i, len);
   }
}