| 1 /* libFLAC - Free Lossless Audio Codec library |
| 2 * Copyright (C) 2000-2009 Josh Coalson |
| 3 * Copyright (C) 2011-2014 Xiph.Org Foundation |
| 4 * |
| 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions |
| 7 * are met: |
| 8 * |
| 9 * - Redistributions of source code must retain the above copyright |
| 10 * notice, this list of conditions and the following disclaimer. |
| 11 * |
| 12 * - Redistributions in binary form must reproduce the above copyright |
| 13 * notice, this list of conditions and the following disclaimer in the |
| 14 * documentation and/or other materials provided with the distribution. |
| 15 * |
| 16 * - Neither the name of the Xiph.org Foundation nor the names of its |
| 17 * contributors may be used to endorse or promote products derived from |
| 18 * this software without specific prior written permission. |
| 19 * |
| 20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| 24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| 28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| 29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| 30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 31 */ |
| 32 |
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__AVX2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <immintrin.h> /* AVX2 */

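/*
 * AVX2 implementations of the FLAC residual computation.  For each sample the
 * routines below compute
 *     residual[i] = data[i] - ((qlp_coeff[0]*data[i-1] + ... + qlp_coeff[order-1]*data[i-order]) >> lp_quantization)
 * with the prediction fully unrolled for orders 1..12; higher orders fall back
 * to a scalar switch-based loop at the end of each function.
 */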
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

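	/*
	 * The coefficients are broadcast with their upper 16 bits masked off so that
	 * _mm256_madd_epi16() can be used: in every 32-bit lane it computes
	 * (coeff & 0xffff) * (low 16 bits of the sample) plus 0 times the high half,
	 * which equals coeff * sample as long as both the coefficients and the
	 * samples fit in a signed 16-bit range -- the assumption behind this "_16"
	 * variant.
	 */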
| 58 if(order <= 12) { |
| 59 if(order > 8) { |
| 60 if(order > 10) { |
| 61 if(order == 12) { |
| 62 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 63 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 64 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 65 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 66 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 67 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 68 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 69 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 70 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 71 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 72 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 73 q10 = _mm256_set1_epi32(0xffff & qlp_coe
ff[10]); |
| 74 q11 = _mm256_set1_epi32(0xffff & qlp_coe
ff[11]); |
| 75 |
| 76 for(i = 0; i < (int)data_len-7; i+=8) { |
| 77 __m256i summ, mull; |
| 78 summ = _mm256_madd_epi16(q11, _m
m256_loadu_si256((const __m256i*)(data+i-12))); |
| 79 mull = _mm256_madd_epi16(q10, _m
m256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mu
ll); |
| 80 mull = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mu
ll); |
| 81 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 82 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 83 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 84 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 85 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 86 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 87 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 88 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 89 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 90 summ = _mm256_sra_epi32(summ, cn
t); |
| 91 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 92 } |
| 93 } |
| 94 else { /* order == 11 */ |
| 95 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 96 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 97 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 98 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 99 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 100 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 101 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 102 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 103 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 104 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 105 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 106 q10 = _mm256_set1_epi32(0xffff & qlp_coe
ff[10]); |
| 107 |
| 108 for(i = 0; i < (int)data_len-7; i+=8) { |
| 109 __m256i summ, mull; |
| 110 summ = _mm256_madd_epi16(q10, _m
m256_loadu_si256((const __m256i*)(data+i-11))); |
| 111 mull = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mu
ll); |
| 112 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 113 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 114 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 115 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 116 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 117 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 118 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 119 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 120 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 121 summ = _mm256_sra_epi32(summ, cn
t); |
| 122 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 123 } |
| 124 } |
| 125 } |
| 126 else { |
| 127 if(order == 10) { |
| 128 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 129 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 130 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 131 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 132 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 133 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 134 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 135 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 136 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 137 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 138 q9 = _mm256_set1_epi32(0xffff & qlp_coe
ff[9 ]); |
| 139 |
| 140 for(i = 0; i < (int)data_len-7; i+=8) { |
| 141 __m256i summ, mull; |
| 142 summ = _mm256_madd_epi16(q9, _m
m256_loadu_si256((const __m256i*)(data+i-10))); |
| 143 mull = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 144 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 145 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 146 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 147 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 148 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 149 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 150 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 151 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 152 summ = _mm256_sra_epi32(summ, cn
t); |
| 153 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 154 } |
| 155 } |
| 156 else { /* order == 9 */ |
| 157 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 158 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 159 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 160 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 161 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 162 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 163 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 164 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 165 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 166 q8 = _mm256_set1_epi32(0xffff & qlp_coe
ff[8 ]); |
| 167 |
| 168 for(i = 0; i < (int)data_len-7; i+=8) { |
| 169 __m256i summ, mull; |
| 170 summ = _mm256_madd_epi16(q8, _m
m256_loadu_si256((const __m256i*)(data+i-9 ))); |
| 171 mull = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 172 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 173 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 174 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 175 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 176 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 177 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 178 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 179 summ = _mm256_sra_epi32(summ, cn
t); |
| 180 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 181 } |
| 182 } |
| 183 } |
| 184 } |
| 185 else if(order > 4) { |
| 186 if(order > 6) { |
| 187 if(order == 8) { |
| 188 __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
| 189 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 190 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 191 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 192 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 193 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 194 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 195 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 196 q7 = _mm256_set1_epi32(0xffff & qlp_coe
ff[7 ]); |
| 197 |
| 198 for(i = 0; i < (int)data_len-7; i+=8) { |
| 199 __m256i summ, mull; |
| 200 summ = _mm256_madd_epi16(q7, _m
m256_loadu_si256((const __m256i*)(data+i-8 ))); |
| 201 mull = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 202 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 203 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 204 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 205 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 206 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 207 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 208 summ = _mm256_sra_epi32(summ, cn
t); |
| 209 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 210 } |
| 211 } |
| 212 else { /* order == 7 */ |
| 213 __m256i q0, q1, q2, q3, q4, q5, q6; |
| 214 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 215 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 216 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 217 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 218 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 219 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 220 q6 = _mm256_set1_epi32(0xffff & qlp_coe
ff[6 ]); |
| 221 |
| 222 for(i = 0; i < (int)data_len-7; i+=8) { |
| 223 __m256i summ, mull; |
| 224 summ = _mm256_madd_epi16(q6, _m
m256_loadu_si256((const __m256i*)(data+i-7 ))); |
| 225 mull = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 226 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 227 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 228 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 229 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 230 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 231 summ = _mm256_sra_epi32(summ, cn
t); |
| 232 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 233 } |
| 234 } |
| 235 } |
| 236 else { |
| 237 if(order == 6) { |
| 238 __m256i q0, q1, q2, q3, q4, q5; |
| 239 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 240 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 241 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 242 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 243 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 244 q5 = _mm256_set1_epi32(0xffff & qlp_coe
ff[5 ]); |
| 245 |
| 246 for(i = 0; i < (int)data_len-7; i+=8) { |
| 247 __m256i summ, mull; |
| 248 summ = _mm256_madd_epi16(q5, _m
m256_loadu_si256((const __m256i*)(data+i-6 ))); |
| 249 mull = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 250 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 251 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 252 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 253 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 254 summ = _mm256_sra_epi32(summ, cn
t); |
| 255 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 256 } |
| 257 } |
| 258 else { /* order == 5 */ |
| 259 __m256i q0, q1, q2, q3, q4; |
| 260 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 261 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 262 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 263 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 264 q4 = _mm256_set1_epi32(0xffff & qlp_coe
ff[4 ]); |
| 265 |
| 266 for(i = 0; i < (int)data_len-7; i+=8) { |
| 267 __m256i summ, mull; |
| 268 summ = _mm256_madd_epi16(q4, _m
m256_loadu_si256((const __m256i*)(data+i-5 ))); |
| 269 mull = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 270 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 271 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 272 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 273 summ = _mm256_sra_epi32(summ, cn
t); |
| 274 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 275 } |
| 276 } |
| 277 } |
| 278 } |
| 279 else { |
| 280 if(order > 2) { |
| 281 if(order == 4) { |
| 282 __m256i q0, q1, q2, q3; |
| 283 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 284 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 285 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 286 q3 = _mm256_set1_epi32(0xffff & qlp_coe
ff[3 ]); |
| 287 |
| 288 for(i = 0; i < (int)data_len-7; i+=8) { |
| 289 __m256i summ, mull; |
| 290 summ = _mm256_madd_epi16(q3, _m
m256_loadu_si256((const __m256i*)(data+i-4 ))); |
| 291 mull = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 292 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 293 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 294 summ = _mm256_sra_epi32(summ, cn
t); |
| 295 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 296 } |
| 297 } |
| 298 else { /* order == 3 */ |
| 299 __m256i q0, q1, q2; |
| 300 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 301 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 302 q2 = _mm256_set1_epi32(0xffff & qlp_coe
ff[2 ]); |
| 303 |
| 304 for(i = 0; i < (int)data_len-7; i+=8) { |
| 305 __m256i summ, mull; |
| 306 summ = _mm256_madd_epi16(q2, _m
m256_loadu_si256((const __m256i*)(data+i-3 ))); |
| 307 mull = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 308 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 309 summ = _mm256_sra_epi32(summ, cn
t); |
| 310 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 311 } |
| 312 } |
| 313 } |
| 314 else { |
| 315 if(order == 2) { |
| 316 __m256i q0, q1; |
| 317 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 318 q1 = _mm256_set1_epi32(0xffff & qlp_coe
ff[1 ]); |
| 319 |
| 320 for(i = 0; i < (int)data_len-7; i+=8) { |
| 321 __m256i summ, mull; |
| 322 summ = _mm256_madd_epi16(q1, _m
m256_loadu_si256((const __m256i*)(data+i-2 ))); |
| 323 mull = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mu
ll); |
| 324 summ = _mm256_sra_epi32(summ, cn
t); |
| 325 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 326 } |
| 327 } |
| 328 else { /* order == 1 */ |
| 329 __m256i q0; |
| 330 q0 = _mm256_set1_epi32(0xffff & qlp_coe
ff[0 ]); |
| 331 |
| 332 for(i = 0; i < (int)data_len-7; i+=8) { |
| 333 __m256i summ; |
| 334 summ = _mm256_madd_epi16(q0, _m
m256_loadu_si256((const __m256i*)(data+i-1 ))); |
| 335 summ = _mm256_sra_epi32(summ, cn
t); |
| 336 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 337 } |
| 338 } |
| 339 } |
| 340 } |
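	/*
	 * The vectorized loop above handles data_len rounded down to a multiple of 8;
	 * the scalar loop below finishes the remaining samples, using a fall-through
	 * switch so that exactly `order` taps are accumulated per sample.
	 */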
| 341 for(; i < (int)data_len; i++) { |
| 342 sum = 0; |
| 343 switch(order) { |
| 344 case 12: sum += qlp_coeff[11] * data[i-12]; |
| 345 case 11: sum += qlp_coeff[10] * data[i-11]; |
| 346 case 10: sum += qlp_coeff[ 9] * data[i-10]; |
| 347 case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
| 348 case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
| 349 case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
| 350 case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
| 351 case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
| 352 case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
| 353 case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
| 354 case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
| 355 case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
| 356 } |
| 357 residual[i] = data[i] - (sum >> lp_quantization); |
| 358 } |
| 359 } |
| 360 else { /* order > 12 */ |
| 361 for(i = 0; i < (int)data_len; i++) { |
| 362 sum = 0; |
| 363 switch(order) { |
| 364 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 365 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 366 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 367 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 368 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 369 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 370 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 371 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 372 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 373 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 374 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 375 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 376 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 377 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 378 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 379 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 380 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 381 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 382 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 383 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 384 sum += qlp_coeff[11] * data[i-12]; |
| 385 sum += qlp_coeff[10] * data[i-11]; |
| 386 sum += qlp_coeff[ 9] * data[i-10]; |
| 387 sum += qlp_coeff[ 8] * data[i- 9]; |
| 388 sum += qlp_coeff[ 7] * data[i- 8]; |
| 389 sum += qlp_coeff[ 6] * data[i- 7]; |
| 390 sum += qlp_coeff[ 5] * data[i- 6]; |
| 391 sum += qlp_coeff[ 4] * data[i- 5]; |
| 392 sum += qlp_coeff[ 3] * data[i- 4]; |
| 393 sum += qlp_coeff[ 2] * data[i- 3]; |
| 394 sum += qlp_coeff[ 1] * data[i- 2]; |
| 395 sum += qlp_coeff[ 0] * data[i- 1]; |
| 396 } |
| 397 residual[i] = data[i] - (sum >> lp_quantization); |
| 398 } |
| 399 } |
| 400 _mm256_zeroupper(); |
| 401 } |
| 402 |
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

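	/*
	 * Full 32-bit variant: the coefficients are broadcast unmasked and
	 * _mm256_mullo_epi32() keeps only the low 32 bits of each 32x32 product,
	 * which matches the scalar fallback below that accumulates into a 32-bit sum.
	 */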
| 413 if(order <= 12) { |
| 414 if(order > 8) { |
| 415 if(order > 10) { |
| 416 if(order == 12) { |
| 417 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 418 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 419 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 420 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 421 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 422 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 423 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 424 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 425 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 426 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 427 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 428 q10 = _mm256_set1_epi32(qlp_coeff[10]); |
| 429 q11 = _mm256_set1_epi32(qlp_coeff[11]); |
| 430 |
| 431 for(i = 0; i < (int)data_len-7; i+=8) { |
| 432 __m256i summ, mull; |
| 433 summ = _mm256_mullo_epi32(q11, _
mm256_loadu_si256((const __m256i*)(data+i-12))); |
| 434 mull = _mm256_mullo_epi32(q10, _
mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, m
ull); |
| 435 mull = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, m
ull); |
| 436 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 437 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 438 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 439 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 440 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 441 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 442 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 443 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 444 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 445 summ = _mm256_sra_epi32(summ, cn
t); |
| 446 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 447 } |
| 448 } |
| 449 else { /* order == 11 */ |
| 450 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 451 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 452 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 453 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 454 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 455 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 456 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 457 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 458 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 459 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 460 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 461 q10 = _mm256_set1_epi32(qlp_coeff[10]); |
| 462 |
| 463 for(i = 0; i < (int)data_len-7; i+=8) { |
| 464 __m256i summ, mull; |
| 465 summ = _mm256_mullo_epi32(q10, _
mm256_loadu_si256((const __m256i*)(data+i-11))); |
| 466 mull = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, m
ull); |
| 467 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 468 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 469 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 470 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 471 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 472 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 473 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 474 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 475 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 476 summ = _mm256_sra_epi32(summ, cn
t); |
| 477 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 478 } |
| 479 } |
| 480 } |
| 481 else { |
| 482 if(order == 10) { |
| 483 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 484 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 485 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 486 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 487 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 488 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 489 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 490 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 491 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 492 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 493 q9 = _mm256_set1_epi32(qlp_coeff[9 ]); |
| 494 |
| 495 for(i = 0; i < (int)data_len-7; i+=8) { |
| 496 __m256i summ, mull; |
| 497 summ = _mm256_mullo_epi32(q9, _
mm256_loadu_si256((const __m256i*)(data+i-10))); |
| 498 mull = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, m
ull); |
| 499 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 500 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 501 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 502 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 503 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 504 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 505 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 506 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 507 summ = _mm256_sra_epi32(summ, cn
t); |
| 508 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 509 } |
| 510 } |
| 511 else { /* order == 9 */ |
| 512 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 513 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 514 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 515 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 516 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 517 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 518 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 519 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 520 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 521 q8 = _mm256_set1_epi32(qlp_coeff[8 ]); |
| 522 |
| 523 for(i = 0; i < (int)data_len-7; i+=8) { |
| 524 __m256i summ, mull; |
| 525 summ = _mm256_mullo_epi32(q8, _
mm256_loadu_si256((const __m256i*)(data+i-9))); |
| 526 mull = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, m
ull); |
| 527 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 528 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 529 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 530 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 531 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 532 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 533 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 534 summ = _mm256_sra_epi32(summ, cn
t); |
| 535 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 536 } |
| 537 } |
| 538 } |
| 539 } |
| 540 else if(order > 4) { |
| 541 if(order > 6) { |
| 542 if(order == 8) { |
| 543 __m256i q0, q1, q2, q3, q4, q5, q6, q7; |
| 544 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 545 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 546 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 547 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 548 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 549 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 550 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 551 q7 = _mm256_set1_epi32(qlp_coeff[7 ]); |
| 552 |
| 553 for(i = 0; i < (int)data_len-7; i+=8) { |
| 554 __m256i summ, mull; |
| 555 summ = _mm256_mullo_epi32(q7, _
mm256_loadu_si256((const __m256i*)(data+i-8))); |
| 556 mull = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, m
ull); |
| 557 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 558 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 559 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 560 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 561 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 562 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 563 summ = _mm256_sra_epi32(summ, cn
t); |
| 564 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 565 } |
| 566 } |
| 567 else { /* order == 7 */ |
| 568 __m256i q0, q1, q2, q3, q4, q5, q6; |
| 569 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 570 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 571 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 572 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 573 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 574 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 575 q6 = _mm256_set1_epi32(qlp_coeff[6 ]); |
| 576 |
| 577 for(i = 0; i < (int)data_len-7; i+=8) { |
| 578 __m256i summ, mull; |
| 579 summ = _mm256_mullo_epi32(q6, _
mm256_loadu_si256((const __m256i*)(data+i-7))); |
| 580 mull = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, m
ull); |
| 581 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 582 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 583 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 584 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 585 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 586 summ = _mm256_sra_epi32(summ, cn
t); |
| 587 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 588 } |
| 589 } |
| 590 } |
| 591 else { |
| 592 if(order == 6) { |
| 593 __m256i q0, q1, q2, q3, q4, q5; |
| 594 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 595 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 596 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 597 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 598 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 599 q5 = _mm256_set1_epi32(qlp_coeff[5 ]); |
| 600 |
| 601 for(i = 0; i < (int)data_len-7; i+=8) { |
| 602 __m256i summ, mull; |
| 603 summ = _mm256_mullo_epi32(q5, _
mm256_loadu_si256((const __m256i*)(data+i-6))); |
| 604 mull = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, m
ull); |
| 605 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 606 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 607 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 608 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 609 summ = _mm256_sra_epi32(summ, cn
t); |
| 610 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 611 } |
| 612 } |
| 613 else { /* order == 5 */ |
| 614 __m256i q0, q1, q2, q3, q4; |
| 615 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 616 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 617 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 618 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 619 q4 = _mm256_set1_epi32(qlp_coeff[4 ]); |
| 620 |
| 621 for(i = 0; i < (int)data_len-7; i+=8) { |
| 622 __m256i summ, mull; |
| 623 summ = _mm256_mullo_epi32(q4, _
mm256_loadu_si256((const __m256i*)(data+i-5))); |
| 624 mull = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, m
ull); |
| 625 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 626 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 627 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 628 summ = _mm256_sra_epi32(summ, cn
t); |
| 629 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 630 } |
| 631 } |
| 632 } |
| 633 } |
| 634 else { |
| 635 if(order > 2) { |
| 636 if(order == 4) { |
| 637 __m256i q0, q1, q2, q3; |
| 638 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 639 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 640 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 641 q3 = _mm256_set1_epi32(qlp_coeff[3 ]); |
| 642 |
| 643 for(i = 0; i < (int)data_len-7; i+=8) { |
| 644 __m256i summ, mull; |
| 645 summ = _mm256_mullo_epi32(q3, _
mm256_loadu_si256((const __m256i*)(data+i-4))); |
| 646 mull = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, m
ull); |
| 647 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 648 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 649 summ = _mm256_sra_epi32(summ, cn
t); |
| 650 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 651 } |
| 652 } |
| 653 else { /* order == 3 */ |
| 654 __m256i q0, q1, q2; |
| 655 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 656 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 657 q2 = _mm256_set1_epi32(qlp_coeff[2 ]); |
| 658 |
| 659 for(i = 0; i < (int)data_len-7; i+=8) { |
| 660 __m256i summ, mull; |
| 661 summ = _mm256_mullo_epi32(q2, _
mm256_loadu_si256((const __m256i*)(data+i-3))); |
| 662 mull = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, m
ull); |
| 663 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 664 summ = _mm256_sra_epi32(summ, cn
t); |
| 665 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 666 } |
| 667 } |
| 668 } |
| 669 else { |
| 670 if(order == 2) { |
| 671 __m256i q0, q1; |
| 672 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 673 q1 = _mm256_set1_epi32(qlp_coeff[1 ]); |
| 674 |
| 675 for(i = 0; i < (int)data_len-7; i+=8) { |
| 676 __m256i summ, mull; |
| 677 summ = _mm256_mullo_epi32(q1, _
mm256_loadu_si256((const __m256i*)(data+i-2))); |
| 678 mull = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, m
ull); |
| 679 summ = _mm256_sra_epi32(summ, cn
t); |
| 680 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 681 } |
| 682 } |
| 683 else { /* order == 1 */ |
| 684 __m256i q0; |
| 685 q0 = _mm256_set1_epi32(qlp_coeff[0 ]); |
| 686 |
| 687 for(i = 0; i < (int)data_len-7; i+=8) { |
| 688 __m256i summ; |
| 689 summ = _mm256_mullo_epi32(q0, _
mm256_loadu_si256((const __m256i*)(data+i-1))); |
| 690 summ = _mm256_sra_epi32(summ, cn
t); |
| 691 _mm256_storeu_si256((__m256i*)(r
esidual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ)
); |
| 692 } |
| 693 } |
| 694 } |
| 695 } |
| 696 for(; i < (int)data_len; i++) { |
| 697 sum = 0; |
| 698 switch(order) { |
| 699 case 12: sum += qlp_coeff[11] * data[i-12]; |
| 700 case 11: sum += qlp_coeff[10] * data[i-11]; |
| 701 case 10: sum += qlp_coeff[ 9] * data[i-10]; |
| 702 case 9: sum += qlp_coeff[ 8] * data[i- 9]; |
| 703 case 8: sum += qlp_coeff[ 7] * data[i- 8]; |
| 704 case 7: sum += qlp_coeff[ 6] * data[i- 7]; |
| 705 case 6: sum += qlp_coeff[ 5] * data[i- 6]; |
| 706 case 5: sum += qlp_coeff[ 4] * data[i- 5]; |
| 707 case 4: sum += qlp_coeff[ 3] * data[i- 4]; |
| 708 case 3: sum += qlp_coeff[ 2] * data[i- 3]; |
| 709 case 2: sum += qlp_coeff[ 1] * data[i- 2]; |
| 710 case 1: sum += qlp_coeff[ 0] * data[i- 1]; |
| 711 } |
| 712 residual[i] = data[i] - (sum >> lp_quantization); |
| 713 } |
| 714 } |
| 715 else { /* order > 12 */ |
| 716 for(i = 0; i < (int)data_len; i++) { |
| 717 sum = 0; |
| 718 switch(order) { |
| 719 case 32: sum += qlp_coeff[31] * data[i-32]; |
| 720 case 31: sum += qlp_coeff[30] * data[i-31]; |
| 721 case 30: sum += qlp_coeff[29] * data[i-30]; |
| 722 case 29: sum += qlp_coeff[28] * data[i-29]; |
| 723 case 28: sum += qlp_coeff[27] * data[i-28]; |
| 724 case 27: sum += qlp_coeff[26] * data[i-27]; |
| 725 case 26: sum += qlp_coeff[25] * data[i-26]; |
| 726 case 25: sum += qlp_coeff[24] * data[i-25]; |
| 727 case 24: sum += qlp_coeff[23] * data[i-24]; |
| 728 case 23: sum += qlp_coeff[22] * data[i-23]; |
| 729 case 22: sum += qlp_coeff[21] * data[i-22]; |
| 730 case 21: sum += qlp_coeff[20] * data[i-21]; |
| 731 case 20: sum += qlp_coeff[19] * data[i-20]; |
| 732 case 19: sum += qlp_coeff[18] * data[i-19]; |
| 733 case 18: sum += qlp_coeff[17] * data[i-18]; |
| 734 case 17: sum += qlp_coeff[16] * data[i-17]; |
| 735 case 16: sum += qlp_coeff[15] * data[i-16]; |
| 736 case 15: sum += qlp_coeff[14] * data[i-15]; |
| 737 case 14: sum += qlp_coeff[13] * data[i-14]; |
| 738 case 13: sum += qlp_coeff[12] * data[i-13]; |
| 739 sum += qlp_coeff[11] * data[i-12]; |
| 740 sum += qlp_coeff[10] * data[i-11]; |
| 741 sum += qlp_coeff[ 9] * data[i-10]; |
| 742 sum += qlp_coeff[ 8] * data[i- 9]; |
| 743 sum += qlp_coeff[ 7] * data[i- 8]; |
| 744 sum += qlp_coeff[ 6] * data[i- 7]; |
| 745 sum += qlp_coeff[ 5] * data[i- 6]; |
| 746 sum += qlp_coeff[ 4] * data[i- 5]; |
| 747 sum += qlp_coeff[ 3] * data[i- 4]; |
| 748 sum += qlp_coeff[ 2] * data[i- 3]; |
| 749 sum += qlp_coeff[ 1] * data[i- 2]; |
| 750 sum += qlp_coeff[ 0] * data[i- 1]; |
| 751 } |
| 752 residual[i] = data[i] - (sum >> lp_quantization); |
| 753 } |
| 754 } |
| 755 _mm256_zeroupper(); |
| 756 } |
| 757 |
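/*
 * Index vector for _mm256_permutevar8x32_epi32(): it gathers the low 32-bit
 * half of each of the four 64-bit sums into the lower 128-bit lane so the
 * result can be written back with a single 128-bit store.
 */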
static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
| 759 |
FLAC__SSE_TARGET("avx2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int64 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
	__m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */

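	/*
	 * Wide (64-bit accumulator) variant: coefficients and samples are widened
	 * with _mm256_cvtepu32_epi64() and multiplied with _mm256_mul_epi32(), which
	 * reads only the low 32 bits of each 64-bit lane (sign-extended), so the
	 * zero-extension is harmless.  AVX2 has no 64-bit arithmetic right shift,
	 * hence the logical _mm256_srl_epi64(); this is presumably safe here because
	 * only the low 32 bits of each shifted sum are kept and lp_quantization <= 32,
	 * so those bits are the same for a logical and an arithmetic shift.
	 */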
| 772 if(order <= 12) { |
| 773 if(order > 8) { |
| 774 if(order > 10) { |
| 775 if(order == 12) { |
| 776 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10, q11; |
| 777 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 778 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 779 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 780 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 781 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 782 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 783 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 784 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 785 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 786 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 787 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[10])); |
| 788 q11 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[11])); |
| 789 |
| 790 for(i = 0; i < (int)data_len-3; i+=4) { |
| 791 __m256i summ, mull; |
| 792 summ = _mm256_mul_epi32(q11, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12)))); |
| 793 mull = _mm256_mul_epi32(q10, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256
_add_epi64(summ, mull); |
| 794 mull = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256
_add_epi64(summ, mull); |
| 795 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 796 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 797 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 798 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 799 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 800 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 801 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 802 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 803 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 804 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 805 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 806 } |
| 807 } |
| 808 else { /* order == 11 */ |
| 809 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9, q10; |
| 810 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 811 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 812 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 813 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 814 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 815 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 816 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 817 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 818 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 819 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 820 q10 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[10])); |
| 821 |
| 822 for(i = 0; i < (int)data_len-3; i+=4) { |
| 823 __m256i summ, mull; |
| 824 summ = _mm256_mul_epi32(q10, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); |
| 825 mull = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256
_add_epi64(summ, mull); |
| 826 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 827 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 828 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 829 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 830 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 831 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 832 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 833 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 834 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 835 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 836 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 837 } |
| 838 } |
| 839 } |
| 840 else { |
| 841 if(order == 10) { |
| 842 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8, q9; |
| 843 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 844 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 845 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 846 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 847 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 848 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 849 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 850 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 851 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 852 q9 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[9 ])); |
| 853 |
| 854 for(i = 0; i < (int)data_len-3; i+=4) { |
| 855 __m256i summ, mull; |
| 856 summ = _mm256_mul_epi32(q9, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); |
| 857 mull = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256
_add_epi64(summ, mull); |
| 858 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 859 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 860 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 861 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 862 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 863 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 864 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 865 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 866 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 867 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 868 } |
| 869 } |
| 870 else { /* order == 9 */ |
| 871 __m256i q0, q1, q2, q3, q4, q5, q6, q7,
q8; |
| 872 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[0 ])); |
| 873 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[1 ])); |
| 874 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[2 ])); |
| 875 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[3 ])); |
| 876 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[4 ])); |
| 877 q5 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[5 ])); |
| 878 q6 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[6 ])); |
| 879 q7 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[7 ])); |
| 880 q8 = _mm256_cvtepu32_epi64(_mm_set1_epi
32(qlp_coeff[8 ])); |
| 881 |
| 882 for(i = 0; i < (int)data_len-3; i+=4) { |
| 883 __m256i summ, mull; |
| 884 summ = _mm256_mul_epi32(q8, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); |
| 885 mull = _mm256_mul_epi32(q7, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256
_add_epi64(summ, mull); |
| 886 mull = _mm256_mul_epi32(q6, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256
_add_epi64(summ, mull); |
| 887 mull = _mm256_mul_epi32(q5, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256
_add_epi64(summ, mull); |
| 888 mull = _mm256_mul_epi32(q4, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256
_add_epi64(summ, mull); |
| 889 mull = _mm256_mul_epi32(q3, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256
_add_epi64(summ, mull); |
| 890 mull = _mm256_mul_epi32(q2, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256
_add_epi64(summ, mull); |
| 891 mull = _mm256_mul_epi32(q1, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256
_add_epi64(summ, mull); |
| 892 mull = _mm256_mul_epi32(q0, _mm
256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256
_add_epi64(summ, mull); |
| 893 summ = _mm256_permutevar8x32_epi
32(_mm256_srl_epi64(summ, cnt), pack); |
| 894 _mm_storeu_si128((__m128i*)(resi
dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2
56_si128(summ))); |
| 895 } |
| 896 } |
| 897 } |
| 898 } |
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m256i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 7 */
					__m256i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 6) {
					__m256i q0, q1, q2, q3, q4, q5;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 5 */
					__m256i q0, q1, q2, q3, q4;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
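		/* Orders 1..4: the shortest predictors, still processed four residuals at a time. */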
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 ))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 ))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 ))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 ))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
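		/* Scalar mop-up for the trailing samples (at most three) that the 4-wide loops
		 * above could not reach.  The switch intentionally falls through without breaks:
		 * entering at `order`, each case adds one coefficient*history term, so the result
		 * is residual[i] = data[i] - ((sum over j < order of qlp_coeff[j]*data[i-1-j]) >> lp_quantization). */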
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
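	/* Orders 13..32 skip the SIMD paths and use plain 64-bit scalar arithmetic; the
	 * fall-through switch adds the terms for coefficients 31..12 as applicable, then
	 * the unlabelled statements add coefficients 11..0 for every order in this range. */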
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */