Chromium Code Reviews

Side by Side Diff: src/libFLAC/lpc_intrin_avx2.c

Issue 1961133002: Update FLAC to 1.3.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/flac.git@master
Patch Set: build config tweaks for Windows Created 4 years, 7 months ago
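For orientation before the diff: the routines in this file are AVX2 builds of FLAC's LPC residual computation, and each one evaluates the same recurrence as the switch-based scalar fallback loops further down. A minimal scalar sketch of that recurrence (hypothetical helper name, assuming data[] is preceded by at least `order` warm-up samples as at the real call sites; the non-wide variants accumulate in 32 bits, the wide variant in 64 bits):

#include <stdint.h>

/* Scalar reference model, illustrative only (not part of the patch). */
static void lpc_residual_scalar(const int32_t *data, unsigned data_len,
                                const int32_t qlp_coeff[], unsigned order,
                                int lp_quantization, int32_t residual[])
{
	int i, j;
	for(i = 0; i < (int)data_len; i++) {
		int64_t sum = 0;
		for(j = 0; j < (int)order; j++)
			sum += (int64_t)qlp_coeff[j] * data[i-1-j]; /* qlp_coeff[j] weights the sample j+1 steps back */
		residual[i] = data[i] - (int32_t)(sum >> lp_quantization);
	}
}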
1 /* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2014 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 # include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40 #include "private/lpc.h"
41 #ifdef FLAC__AVX2_SUPPORTED
42
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45
46 #include <immintrin.h> /* AVX2 */
47
48 FLAC__SSE_TARGET("avx2")
49 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
50 {
51 int i;
52 FLAC__int32 sum;
53 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
54
55 FLAC__ASSERT(order > 0);
56 FLAC__ASSERT(order <= 32);
57
58 if(order <= 12) {
59 if(order > 8) {
60 if(order > 10) {
61 if(order == 12) {
62 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
63 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
64 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
65 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
66 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
67 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
68 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
69 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
70 					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
71 					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
72 					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
73 					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
74 					q11 = _mm256_set1_epi32(0xffff & qlp_coeff[11]);
75
76 					for(i = 0; i < (int)data_len-7; i+=8) {
77 						__m256i summ, mull;
78 						summ = _mm256_madd_epi16(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
79 						mull = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
80 						mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
81 						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
82 						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
83 						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
84 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
85 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
86 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
87 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
88 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
89 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
90 						summ = _mm256_sra_epi32(summ, cnt);
91 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
92 }
93 }
94 else { /* order == 11 */
95 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
96 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
97 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
98 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
99 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
100 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
101 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
102 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
103 					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
104 					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
105 					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
106 					q10 = _mm256_set1_epi32(0xffff & qlp_coeff[10]);
107
108 					for(i = 0; i < (int)data_len-7; i+=8) {
109 						__m256i summ, mull;
110 						summ = _mm256_madd_epi16(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
111 						mull = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
112 						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
113 						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
114 						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
115 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
116 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
117 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
118 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
119 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
120 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
121 						summ = _mm256_sra_epi32(summ, cnt);
122 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
123 }
124 }
125 }
126 else {
127 if(order == 10) {
128 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
129 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
130 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
131 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
132 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
133 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
134 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
135 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
136 					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
137 					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
138 					q9 = _mm256_set1_epi32(0xffff & qlp_coeff[9 ]);
139
140 					for(i = 0; i < (int)data_len-7; i+=8) {
141 						__m256i summ, mull;
142 						summ = _mm256_madd_epi16(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
143 						mull = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 ))); summ = _mm256_add_epi32(summ, mull);
144 						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
145 						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
146 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
147 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
148 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
149 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
150 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
151 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
152 						summ = _mm256_sra_epi32(summ, cnt);
153 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
154 }
155 }
156 else { /* order == 9 */
157 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
158 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
159 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
160 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
161 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
162 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
163 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
164 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
165 					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
166 					q8 = _mm256_set1_epi32(0xffff & qlp_coeff[8 ]);
167
168 					for(i = 0; i < (int)data_len-7; i+=8) {
169 						__m256i summ, mull;
170 						summ = _mm256_madd_epi16(q8, _mm256_loadu_si256((const __m256i*)(data+i-9 )));
171 						mull = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 ))); summ = _mm256_add_epi32(summ, mull);
172 						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
173 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
174 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
175 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
176 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
177 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
178 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
179 						summ = _mm256_sra_epi32(summ, cnt);
180 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
181 }
182 }
183 }
184 }
185 else if(order > 4) {
186 if(order > 6) {
187 if(order == 8) {
188 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
189 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
190 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
191 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
192 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
193 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
194 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
195 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
196 					q7 = _mm256_set1_epi32(0xffff & qlp_coeff[7 ]);
197
198 					for(i = 0; i < (int)data_len-7; i+=8) {
199 						__m256i summ, mull;
200 						summ = _mm256_madd_epi16(q7, _mm256_loadu_si256((const __m256i*)(data+i-8 )));
201 						mull = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 ))); summ = _mm256_add_epi32(summ, mull);
202 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
203 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
204 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
205 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
206 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
207 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
208 						summ = _mm256_sra_epi32(summ, cnt);
209 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
210 }
211 }
212 else { /* order == 7 */
213 __m256i q0, q1, q2, q3, q4, q5, q6;
214 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
215 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
216 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
217 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
218 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
219 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
220 					q6 = _mm256_set1_epi32(0xffff & qlp_coeff[6 ]);
221
222 					for(i = 0; i < (int)data_len-7; i+=8) {
223 						__m256i summ, mull;
224 						summ = _mm256_madd_epi16(q6, _mm256_loadu_si256((const __m256i*)(data+i-7 )));
225 						mull = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 ))); summ = _mm256_add_epi32(summ, mull);
226 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
227 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
228 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
229 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
230 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
231 						summ = _mm256_sra_epi32(summ, cnt);
232 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
233 }
234 }
235 }
236 else {
237 if(order == 6) {
238 __m256i q0, q1, q2, q3, q4, q5;
239 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
240 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
241 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
242 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
243 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
244 					q5 = _mm256_set1_epi32(0xffff & qlp_coeff[5 ]);
245
246 					for(i = 0; i < (int)data_len-7; i+=8) {
247 						__m256i summ, mull;
248 						summ = _mm256_madd_epi16(q5, _mm256_loadu_si256((const __m256i*)(data+i-6 )));
249 						mull = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 ))); summ = _mm256_add_epi32(summ, mull);
250 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
251 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
252 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
253 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
254 						summ = _mm256_sra_epi32(summ, cnt);
255 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
256 }
257 }
258 else { /* order == 5 */
259 __m256i q0, q1, q2, q3, q4;
260 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
261 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
262 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
263 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
264 					q4 = _mm256_set1_epi32(0xffff & qlp_coeff[4 ]);
265
266 					for(i = 0; i < (int)data_len-7; i+=8) {
267 						__m256i summ, mull;
268 						summ = _mm256_madd_epi16(q4, _mm256_loadu_si256((const __m256i*)(data+i-5 )));
269 						mull = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 ))); summ = _mm256_add_epi32(summ, mull);
270 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
271 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
272 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
273 						summ = _mm256_sra_epi32(summ, cnt);
274 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
275 }
276 }
277 }
278 }
279 else {
280 if(order > 2) {
281 if(order == 4) {
282 __m256i q0, q1, q2, q3;
283 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
284 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
285 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
286 					q3 = _mm256_set1_epi32(0xffff & qlp_coeff[3 ]);
287
288 					for(i = 0; i < (int)data_len-7; i+=8) {
289 						__m256i summ, mull;
290 						summ = _mm256_madd_epi16(q3, _mm256_loadu_si256((const __m256i*)(data+i-4 )));
291 						mull = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 ))); summ = _mm256_add_epi32(summ, mull);
292 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
293 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
294 						summ = _mm256_sra_epi32(summ, cnt);
295 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
296 }
297 }
298 else { /* order == 3 */
299 __m256i q0, q1, q2;
300 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
301 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
302 					q2 = _mm256_set1_epi32(0xffff & qlp_coeff[2 ]);
303
304 					for(i = 0; i < (int)data_len-7; i+=8) {
305 						__m256i summ, mull;
306 						summ = _mm256_madd_epi16(q2, _mm256_loadu_si256((const __m256i*)(data+i-3 )));
307 						mull = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 ))); summ = _mm256_add_epi32(summ, mull);
308 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
309 						summ = _mm256_sra_epi32(summ, cnt);
310 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
311 }
312 }
313 }
314 else {
315 if(order == 2) {
316 __m256i q0, q1;
317 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
318 					q1 = _mm256_set1_epi32(0xffff & qlp_coeff[1 ]);
319
320 					for(i = 0; i < (int)data_len-7; i+=8) {
321 						__m256i summ, mull;
322 						summ = _mm256_madd_epi16(q1, _mm256_loadu_si256((const __m256i*)(data+i-2 )));
323 						mull = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 ))); summ = _mm256_add_epi32(summ, mull);
324 						summ = _mm256_sra_epi32(summ, cnt);
325 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
326 }
327 }
328 else { /* order == 1 */
329 __m256i q0;
330 					q0 = _mm256_set1_epi32(0xffff & qlp_coeff[0 ]);
331
332 					for(i = 0; i < (int)data_len-7; i+=8) {
333 						__m256i summ;
334 						summ = _mm256_madd_epi16(q0, _mm256_loadu_si256((const __m256i*)(data+i-1 )));
335 						summ = _mm256_sra_epi32(summ, cnt);
336 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
337 }
338 }
339 }
340 }
341 for(; i < (int)data_len; i++) {
342 sum = 0;
343 switch(order) {
344 case 12: sum += qlp_coeff[11] * data[i-12];
345 case 11: sum += qlp_coeff[10] * data[i-11];
346 case 10: sum += qlp_coeff[ 9] * data[i-10];
347 case 9: sum += qlp_coeff[ 8] * data[i- 9];
348 case 8: sum += qlp_coeff[ 7] * data[i- 8];
349 case 7: sum += qlp_coeff[ 6] * data[i- 7];
350 case 6: sum += qlp_coeff[ 5] * data[i- 6];
351 case 5: sum += qlp_coeff[ 4] * data[i- 5];
352 case 4: sum += qlp_coeff[ 3] * data[i- 4];
353 case 3: sum += qlp_coeff[ 2] * data[i- 3];
354 case 2: sum += qlp_coeff[ 1] * data[i- 2];
355 case 1: sum += qlp_coeff[ 0] * data[i- 1];
356 }
357 residual[i] = data[i] - (sum >> lp_quantization);
358 }
359 }
360 else { /* order > 12 */
361 for(i = 0; i < (int)data_len; i++) {
362 sum = 0;
363 switch(order) {
364 case 32: sum += qlp_coeff[31] * data[i-32];
365 case 31: sum += qlp_coeff[30] * data[i-31];
366 case 30: sum += qlp_coeff[29] * data[i-30];
367 case 29: sum += qlp_coeff[28] * data[i-29];
368 case 28: sum += qlp_coeff[27] * data[i-28];
369 case 27: sum += qlp_coeff[26] * data[i-27];
370 case 26: sum += qlp_coeff[25] * data[i-26];
371 case 25: sum += qlp_coeff[24] * data[i-25];
372 case 24: sum += qlp_coeff[23] * data[i-24];
373 case 23: sum += qlp_coeff[22] * data[i-23];
374 case 22: sum += qlp_coeff[21] * data[i-22];
375 case 21: sum += qlp_coeff[20] * data[i-21];
376 case 20: sum += qlp_coeff[19] * data[i-20];
377 case 19: sum += qlp_coeff[18] * data[i-19];
378 case 18: sum += qlp_coeff[17] * data[i-18];
379 case 17: sum += qlp_coeff[16] * data[i-17];
380 case 16: sum += qlp_coeff[15] * data[i-16];
381 case 15: sum += qlp_coeff[14] * data[i-15];
382 case 14: sum += qlp_coeff[13] * data[i-14];
383 case 13: sum += qlp_coeff[12] * data[i-13];
384 sum += qlp_coeff[11] * data[i-12];
385 sum += qlp_coeff[10] * data[i-11];
386 sum += qlp_coeff[ 9] * data[i-10];
387 sum += qlp_coeff[ 8] * data[i- 9];
388 sum += qlp_coeff[ 7] * data[i- 8];
389 sum += qlp_coeff[ 6] * data[i- 7];
390 sum += qlp_coeff[ 5] * data[i- 6];
391 sum += qlp_coeff[ 4] * data[i- 5];
392 sum += qlp_coeff[ 3] * data[i- 4];
393 sum += qlp_coeff[ 2] * data[i- 3];
394 sum += qlp_coeff[ 1] * data[i- 2];
395 sum += qlp_coeff[ 0] * data[i- 1];
396 }
397 residual[i] = data[i] - (sum >> lp_quantization);
398 }
399 }
400 _mm256_zeroupper();
401 }
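A note on the _16 variant above: it leans on _mm256_madd_epi16. Each 32-bit lane of a coefficient register holds 0xffff & qlp_coeff[k], so the upper 16-bit half of the lane is zero; madd multiplies the two signed 16-bit halves of each lane pairwise and adds the products, and with the upper half zeroed the lane result is exactly qlp_coeff[k] * sample whenever coefficient and sample both fit in 16 bits. A scalar model of one lane, illustrative only (not part of the patch):

#include <stdint.h>

/* Models one 32-bit lane of _mm256_madd_epi16(q_masked, sample) as used above. */
static int32_t madd_epi16_lane(int32_t q_masked, int32_t sample)
{
	int16_t qlo = (int16_t)(q_masked & 0xffff);
	int16_t qhi = (int16_t)((uint32_t)q_masked >> 16); /* zero after the 0xffff mask */
	int16_t xlo = (int16_t)(sample & 0xffff);
	int16_t xhi = (int16_t)((uint32_t)sample >> 16);
	return (int32_t)qlo * xlo + (int32_t)qhi * xhi;    /* reduces to qlp_coeff[k] * sample here */
}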
402
403 FLAC__SSE_TARGET("avx2")
404 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
405 {
406 int i;
407 FLAC__int32 sum;
408 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
409
410 FLAC__ASSERT(order > 0);
411 FLAC__ASSERT(order <= 32);
412
413 if(order <= 12) {
414 if(order > 8) {
415 if(order > 10) {
416 if(order == 12) {
417 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
418 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
419 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
420 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
421 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
422 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
423 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
424 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
425 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
426 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
427 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
428 q10 = _mm256_set1_epi32(qlp_coeff[10]);
429 q11 = _mm256_set1_epi32(qlp_coeff[11]);
430
431 for(i = 0; i < (int)data_len-7; i+=8) {
432 __m256i summ, mull;
433 						summ = _mm256_mullo_epi32(q11, _mm256_loadu_si256((const __m256i*)(data+i-12)));
434 						mull = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11))); summ = _mm256_add_epi32(summ, mull);
435 						mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
436 						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
437 						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
438 						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
439 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
440 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
441 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
442 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
443 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
444 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
445 						summ = _mm256_sra_epi32(summ, cnt);
446 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
447 }
448 }
449 else { /* order == 11 */
450 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
451 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
452 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
453 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
454 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
455 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
456 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
457 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
458 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
459 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
460 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
461 q10 = _mm256_set1_epi32(qlp_coeff[10]);
462
463 for(i = 0; i < (int)data_len-7; i+=8) {
464 __m256i summ, mull;
465 						summ = _mm256_mullo_epi32(q10, _mm256_loadu_si256((const __m256i*)(data+i-11)));
466 						mull = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10))); summ = _mm256_add_epi32(summ, mull);
467 						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
468 						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
469 						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
470 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
471 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
472 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
473 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
474 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
475 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
476 						summ = _mm256_sra_epi32(summ, cnt);
477 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
478 }
479 }
480 }
481 else {
482 if(order == 10) {
483 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
484 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
485 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
486 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
487 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
488 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
489 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
490 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
491 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
492 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
493 q9 = _mm256_set1_epi32(qlp_coeff[9 ]);
494
495 for(i = 0; i < (int)data_len-7; i+=8) {
496 __m256i summ, mull;
497 						summ = _mm256_mullo_epi32(q9, _mm256_loadu_si256((const __m256i*)(data+i-10)));
498 						mull = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9))); summ = _mm256_add_epi32(summ, mull);
499 						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
500 						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
501 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
502 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
503 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
504 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
505 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
506 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
507 						summ = _mm256_sra_epi32(summ, cnt);
508 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
509 }
510 }
511 else { /* order == 9 */
512 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
513 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
514 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
515 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
516 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
517 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
518 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
519 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
520 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
521 q8 = _mm256_set1_epi32(qlp_coeff[8 ]);
522
523 for(i = 0; i < (int)data_len-7; i+=8) {
524 __m256i summ, mull;
525 						summ = _mm256_mullo_epi32(q8, _mm256_loadu_si256((const __m256i*)(data+i-9)));
526 						mull = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8))); summ = _mm256_add_epi32(summ, mull);
527 						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
528 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
529 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
530 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
531 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
532 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
533 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
534 						summ = _mm256_sra_epi32(summ, cnt);
535 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
536 }
537 }
538 }
539 }
540 else if(order > 4) {
541 if(order > 6) {
542 if(order == 8) {
543 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
544 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
545 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
546 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
547 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
548 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
549 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
550 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
551 q7 = _mm256_set1_epi32(qlp_coeff[7 ]);
552
553 for(i = 0; i < (int)data_len-7; i+=8) {
554 __m256i summ, mull;
555 						summ = _mm256_mullo_epi32(q7, _mm256_loadu_si256((const __m256i*)(data+i-8)));
556 						mull = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7))); summ = _mm256_add_epi32(summ, mull);
557 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
558 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
559 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
560 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
561 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
562 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
563 						summ = _mm256_sra_epi32(summ, cnt);
564 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
565 }
566 }
567 else { /* order == 7 */
568 __m256i q0, q1, q2, q3, q4, q5, q6;
569 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
570 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
571 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
572 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
573 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
574 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
575 q6 = _mm256_set1_epi32(qlp_coeff[6 ]);
576
577 for(i = 0; i < (int)data_len-7; i+=8) {
578 __m256i summ, mull;
579 						summ = _mm256_mullo_epi32(q6, _mm256_loadu_si256((const __m256i*)(data+i-7)));
580 						mull = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6))); summ = _mm256_add_epi32(summ, mull);
581 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
582 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
583 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
584 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
585 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
586 						summ = _mm256_sra_epi32(summ, cnt);
587 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
588 }
589 }
590 }
591 else {
592 if(order == 6) {
593 __m256i q0, q1, q2, q3, q4, q5;
594 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
595 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
596 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
597 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
598 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
599 q5 = _mm256_set1_epi32(qlp_coeff[5 ]);
600
601 for(i = 0; i < (int)data_len-7; i+=8) {
602 __m256i summ, mull;
603 						summ = _mm256_mullo_epi32(q5, _mm256_loadu_si256((const __m256i*)(data+i-6)));
604 						mull = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5))); summ = _mm256_add_epi32(summ, mull);
605 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
606 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
607 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
608 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
609 						summ = _mm256_sra_epi32(summ, cnt);
610 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
611 }
612 }
613 else { /* order == 5 */
614 __m256i q0, q1, q2, q3, q4;
615 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
616 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
617 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
618 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
619 q4 = _mm256_set1_epi32(qlp_coeff[4 ]);
620
621 for(i = 0; i < (int)data_len-7; i+=8) {
622 __m256i summ, mull;
623 						summ = _mm256_mullo_epi32(q4, _mm256_loadu_si256((const __m256i*)(data+i-5)));
624 						mull = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4))); summ = _mm256_add_epi32(summ, mull);
625 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
626 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
627 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
628 						summ = _mm256_sra_epi32(summ, cnt);
629 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
630 }
631 }
632 }
633 }
634 else {
635 if(order > 2) {
636 if(order == 4) {
637 __m256i q0, q1, q2, q3;
638 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
639 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
640 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
641 q3 = _mm256_set1_epi32(qlp_coeff[3 ]);
642
643 for(i = 0; i < (int)data_len-7; i+=8) {
644 __m256i summ, mull;
645 						summ = _mm256_mullo_epi32(q3, _mm256_loadu_si256((const __m256i*)(data+i-4)));
646 						mull = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3))); summ = _mm256_add_epi32(summ, mull);
647 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
648 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
649 						summ = _mm256_sra_epi32(summ, cnt);
650 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
651 }
652 }
653 else { /* order == 3 */
654 __m256i q0, q1, q2;
655 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
656 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
657 q2 = _mm256_set1_epi32(qlp_coeff[2 ]);
658
659 for(i = 0; i < (int)data_len-7; i+=8) {
660 __m256i summ, mull;
661 						summ = _mm256_mullo_epi32(q2, _mm256_loadu_si256((const __m256i*)(data+i-3)));
662 						mull = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2))); summ = _mm256_add_epi32(summ, mull);
663 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
664 						summ = _mm256_sra_epi32(summ, cnt);
665 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
666 }
667 }
668 }
669 else {
670 if(order == 2) {
671 __m256i q0, q1;
672 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
673 q1 = _mm256_set1_epi32(qlp_coeff[1 ]);
674
675 for(i = 0; i < (int)data_len-7; i+=8) {
676 __m256i summ, mull;
677 						summ = _mm256_mullo_epi32(q1, _mm256_loadu_si256((const __m256i*)(data+i-2)));
678 						mull = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1))); summ = _mm256_add_epi32(summ, mull);
679 						summ = _mm256_sra_epi32(summ, cnt);
680 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
681 }
682 }
683 else { /* order == 1 */
684 __m256i q0;
685 q0 = _mm256_set1_epi32(qlp_coeff[0 ]);
686
687 for(i = 0; i < (int)data_len-7; i+=8) {
688 __m256i summ;
689 						summ = _mm256_mullo_epi32(q0, _mm256_loadu_si256((const __m256i*)(data+i-1)));
690 						summ = _mm256_sra_epi32(summ, cnt);
691 						_mm256_storeu_si256((__m256i*)(residual+i), _mm256_sub_epi32(_mm256_loadu_si256((const __m256i*)(data+i)), summ));
692 }
693 }
694 }
695 }
696 for(; i < (int)data_len; i++) {
697 sum = 0;
698 switch(order) {
699 case 12: sum += qlp_coeff[11] * data[i-12];
700 case 11: sum += qlp_coeff[10] * data[i-11];
701 case 10: sum += qlp_coeff[ 9] * data[i-10];
702 case 9: sum += qlp_coeff[ 8] * data[i- 9];
703 case 8: sum += qlp_coeff[ 7] * data[i- 8];
704 case 7: sum += qlp_coeff[ 6] * data[i- 7];
705 case 6: sum += qlp_coeff[ 5] * data[i- 6];
706 case 5: sum += qlp_coeff[ 4] * data[i- 5];
707 case 4: sum += qlp_coeff[ 3] * data[i- 4];
708 case 3: sum += qlp_coeff[ 2] * data[i- 3];
709 case 2: sum += qlp_coeff[ 1] * data[i- 2];
710 case 1: sum += qlp_coeff[ 0] * data[i- 1];
711 }
712 residual[i] = data[i] - (sum >> lp_quantization);
713 }
714 }
715 else { /* order > 12 */
716 for(i = 0; i < (int)data_len; i++) {
717 sum = 0;
718 switch(order) {
719 case 32: sum += qlp_coeff[31] * data[i-32];
720 case 31: sum += qlp_coeff[30] * data[i-31];
721 case 30: sum += qlp_coeff[29] * data[i-30];
722 case 29: sum += qlp_coeff[28] * data[i-29];
723 case 28: sum += qlp_coeff[27] * data[i-28];
724 case 27: sum += qlp_coeff[26] * data[i-27];
725 case 26: sum += qlp_coeff[25] * data[i-26];
726 case 25: sum += qlp_coeff[24] * data[i-25];
727 case 24: sum += qlp_coeff[23] * data[i-24];
728 case 23: sum += qlp_coeff[22] * data[i-23];
729 case 22: sum += qlp_coeff[21] * data[i-22];
730 case 21: sum += qlp_coeff[20] * data[i-21];
731 case 20: sum += qlp_coeff[19] * data[i-20];
732 case 19: sum += qlp_coeff[18] * data[i-19];
733 case 18: sum += qlp_coeff[17] * data[i-18];
734 case 17: sum += qlp_coeff[16] * data[i-17];
735 case 16: sum += qlp_coeff[15] * data[i-16];
736 case 15: sum += qlp_coeff[14] * data[i-15];
737 case 14: sum += qlp_coeff[13] * data[i-14];
738 case 13: sum += qlp_coeff[12] * data[i-13];
739 sum += qlp_coeff[11] * data[i-12];
740 sum += qlp_coeff[10] * data[i-11];
741 sum += qlp_coeff[ 9] * data[i-10];
742 sum += qlp_coeff[ 8] * data[i- 9];
743 sum += qlp_coeff[ 7] * data[i- 8];
744 sum += qlp_coeff[ 6] * data[i- 7];
745 sum += qlp_coeff[ 5] * data[i- 6];
746 sum += qlp_coeff[ 4] * data[i- 5];
747 sum += qlp_coeff[ 3] * data[i- 4];
748 sum += qlp_coeff[ 2] * data[i- 3];
749 sum += qlp_coeff[ 1] * data[i- 2];
750 sum += qlp_coeff[ 0] * data[i- 1];
751 }
752 residual[i] = data[i] - (sum >> lp_quantization);
753 }
754 }
755 _mm256_zeroupper();
756 }
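The wide variant below accumulates 64-bit products with _mm256_mul_epi32 and, as its assert comment notes, shifts with _mm256_srl_epi64 because AVX2 has no _mm256_sra_epi64. After the shift, the low 32 bits of the four 64-bit sums sit at dword positions 0, 2, 4 and 6, so _mm256_permutevar8x32_epi32 with pack_arr = { 0, 2, 4, 6, 1, 3, 5, 7 } gathers them into the low 128 bits and one _mm_storeu_si128 writes four residuals. A scalar model of that permute, illustrative only (not part of the patch):

#include <stdint.h>

/* Models _mm256_permutevar8x32_epi32(v, pack): out[j] = v[pack[j] & 7].
 * With pack_arr = {0,2,4,6,1,3,5,7}, out[0..3] receive the low dwords of the
 * four shifted 64-bit sums, ready for the 128-bit store. */
static void permutevar8x32_model(const uint32_t v[8], const int32_t pack[8], uint32_t out[8])
{
	int j;
	for(j = 0; j < 8; j++)
		out[j] = v[pack[j] & 7];
}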
757
758 static FLAC__int32 pack_arr[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
759
760 FLAC__SSE_TARGET("avx2")
761 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_avx2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
762 {
763 int i;
764 FLAC__int64 sum;
765 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
766 __m256i pack = _mm256_loadu_si256((const __m256i *)pack_arr);
767
768 FLAC__ASSERT(order > 0);
769 FLAC__ASSERT(order <= 32);
770 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm256_sra_epi64() so we have to use _mm256_srl_epi64() */
771
772 if(order <= 12) {
773 if(order > 8) {
774 if(order > 10) {
775 if(order == 12) {
776 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
777 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
778 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
779 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
780 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
781 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
782 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
783 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
784 					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
785 					q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
786 					q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
787 					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
788 					q11 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[11]));
789
790 					for(i = 0; i < (int)data_len-3; i+=4) {
791 						__m256i summ, mull;
792 						summ = _mm256_mul_epi32(q11, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-12))));
793 						mull = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11)))); summ = _mm256_add_epi64(summ, mull);
794 						mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
795 						mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
796 						mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
797 						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
798 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
799 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
800 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
801 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
802 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
803 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
804 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
805 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
806 }
807 }
808 else { /* order == 11 */
809 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
810 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
811 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
812 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
813 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
814 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
815 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
816 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
817 					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
818 					q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
819 					q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
820 					q10 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[10]));
821
822 					for(i = 0; i < (int)data_len-3; i+=4) {
823 						__m256i summ, mull;
824 						summ = _mm256_mul_epi32(q10, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-11))));
825 						mull = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10)))); summ = _mm256_add_epi64(summ, mull);
826 						mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
827 						mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
828 						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
829 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
830 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
831 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
832 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
833 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
834 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
835 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
836 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
837 }
838 }
839 }
840 else {
841 if(order == 10) {
842 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
843 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
844 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
845 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
846 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
847 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
848 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
849 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
850 					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
851 					q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
852 					q9 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[9 ]));
853
854 					for(i = 0; i < (int)data_len-3; i+=4) {
855 						__m256i summ, mull;
856 						summ = _mm256_mul_epi32(q9, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-10))));
857 						mull = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 )))); summ = _mm256_add_epi64(summ, mull);
858 						mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
859 						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
860 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
861 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
862 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
863 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
864 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
865 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
866 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
867 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
868 }
869 }
870 else { /* order == 9 */
871 __m256i q0, q1, q2, q3, q4, q5, q6, q7, q8;
872 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
873 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
874 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
875 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
876 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
877 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
878 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
879 					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
880 					q8 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[8 ]));
881
882 					for(i = 0; i < (int)data_len-3; i+=4) {
883 						__m256i summ, mull;
884 						summ = _mm256_mul_epi32(q8, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-9 ))));
885 						mull = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 )))); summ = _mm256_add_epi64(summ, mull);
886 						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
887 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
888 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
889 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
890 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
891 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
892 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
893 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
894 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
895 }
896 }
897 }
898 }
899 else if(order > 4) {
900 if(order > 6) {
901 if(order == 8) {
902 __m256i q0, q1, q2, q3, q4, q5, q6, q7;
903 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
904 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
905 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
906 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
907 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
908 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
909 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
910 					q7 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[7 ]));
911
912 					for(i = 0; i < (int)data_len-3; i+=4) {
913 						__m256i summ, mull;
914 						summ = _mm256_mul_epi32(q7, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-8 ))));
915 						mull = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 )))); summ = _mm256_add_epi64(summ, mull);
916 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
917 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
918 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
919 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
920 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
921 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
922 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
923 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
924 }
925 }
926 else { /* order == 7 */
927 __m256i q0, q1, q2, q3, q4, q5, q6;
928 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
929 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
930 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
931 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
932 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
933 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
934 					q6 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[6 ]));
935
936 					for(i = 0; i < (int)data_len-3; i+=4) {
937 						__m256i summ, mull;
938 						summ = _mm256_mul_epi32(q6, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-7 ))));
939 						mull = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 )))); summ = _mm256_add_epi64(summ, mull);
940 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
941 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
942 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
943 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
944 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
945 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
946 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
947 }
948 }
949 }
950 else {
951 if(order == 6) {
952 __m256i q0, q1, q2, q3, q4, q5;
953 					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0 ]));
954 					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1 ]));
955 					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2 ]));
956 					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3 ]));
957 					q4 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[4 ]));
958 					q5 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[5 ]));
959
960 					for(i = 0; i < (int)data_len-3; i+=4) {
961 						__m256i summ, mull;
962 						summ = _mm256_mul_epi32(q5, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-6 ))));
963 						mull = _mm256_mul_epi32(q4, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 )))); summ = _mm256_add_epi64(summ, mull);
964 						mull = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256_add_epi64(summ, mull);
965 						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256_add_epi64(summ, mull);
966 						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256_add_epi64(summ, mull);
967 						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256_add_epi64(summ, mull);
968 						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
969 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
970 }
971 }
972 else { /* order == 5 */
973 __m256i q0, q1, q2, q3, q4;
974 q0 = _mm256_cvtepu32_epi64(_mm_set1_epi 32(qlp_coeff[0 ]));
975 q1 = _mm256_cvtepu32_epi64(_mm_set1_epi 32(qlp_coeff[1 ]));
976 q2 = _mm256_cvtepu32_epi64(_mm_set1_epi 32(qlp_coeff[2 ]));
977 q3 = _mm256_cvtepu32_epi64(_mm_set1_epi 32(qlp_coeff[3 ]));
978 q4 = _mm256_cvtepu32_epi64(_mm_set1_epi 32(qlp_coeff[4 ]));
979
980 for(i = 0; i < (int)data_len-3; i+=4) {
981 __m256i summ, mull;
982 summ = _mm256_mul_epi32(q4, _mm 256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-5 ))));
983 mull = _mm256_mul_epi32(q3, _mm 256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4 )))); summ = _mm256 _add_epi64(summ, mull);
984 mull = _mm256_mul_epi32(q2, _mm 256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3 )))); summ = _mm256 _add_epi64(summ, mull);
985 mull = _mm256_mul_epi32(q1, _mm 256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2 )))); summ = _mm256 _add_epi64(summ, mull);
986 mull = _mm256_mul_epi32(q0, _mm 256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1 )))); summ = _mm256 _add_epi64(summ, mull);
987 summ = _mm256_permutevar8x32_epi 32(_mm256_srl_epi64(summ, cnt), pack);
988 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi2 56_si128(summ)));
989 }
990 }
991 }
992 }
		else {
			if(order > 2) {
				if(order == 4) {
					__m256i q0, q1, q2, q3;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2]));
					q3 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[3]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q3, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-4))));
						mull = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1)))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 3 */
					__m256i q0, q1, q2;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1]));
					q2 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[2]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q2, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-3))));
						mull = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2)))); summ = _mm256_add_epi64(summ, mull);
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1)))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
			else {
				if(order == 2) {
					__m256i q0, q1;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0]));
					q1 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[1]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ, mull;
						summ = _mm256_mul_epi32(q1, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-2))));
						mull = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1)))); summ = _mm256_add_epi64(summ, mull);
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
				else { /* order == 1 */
					__m256i q0;
					q0 = _mm256_cvtepu32_epi64(_mm_set1_epi32(qlp_coeff[0]));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m256i summ;
						summ = _mm256_mul_epi32(q0, _mm256_cvtepu32_epi64(_mm_loadu_si128((const __m128i*)(data+i-1))));
						summ = _mm256_permutevar8x32_epi32(_mm256_srl_epi64(summ, cnt), pack);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), _mm256_castsi256_si128(summ)));
					}
				}
			}
		}
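		/*
		 * The vector loops above stop at the largest multiple of four samples that
		 * fits in data_len; this scalar loop finishes the remaining 0..3 samples.
		 * The switch relies on intentional fall-through so that exactly `order`
		 * terms are accumulated per sample.
		 */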
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				case 11: sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				case 10: sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				case 9:  sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
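		/*
		 * No AVX2 fast path is provided for orders 13..32; those cases are handled
		 * entirely by the scalar fall-through switch below.
		 */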
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
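	/* Clear the upper halves of the YMM registers to avoid AVX/SSE transition
	   penalties in any non-VEX-encoded code executed after this function. */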
	_mm256_zeroupper();
}

#endif /* FLAC__AVX2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */