Chromium Code Reviews
Side by Side Diff: src/libFLAC/lpc_intrin_sse41.c

Issue 1961133002: Update FLAC to 1.3.1 (Closed) Base URL: https://chromium.googlesource.com/chromium/deps/flac.git@master
Patch Set: build config tweaks for Windows Created 4 years, 7 months ago
1 /* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2014 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 # include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40 #include "private/lpc.h"
41 #ifdef FLAC__SSE4_1_SUPPORTED
42
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45
46 #include <smmintrin.h> /* SSE4.1 */
47
48 #if defined FLAC__CPU_IA32 /* unused for x64 */
49
50 #define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt))
51 #define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization))
52
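/* Both result macros take the 64-bit prediction sum in the low quadword of
 * xmmN, shift it right by lp_quantization, keep the low 32 bits and subtract
 * them from data[i].  A logical shift (_mm_srl_epi64/_mm_srli_epi64) is safe
 * here even though the sum is signed: only bits lp_quantization through
 * lp_quantization+31 of the sum survive, and for shift counts <= 32 those
 * bits are identical under logical and arithmetic shifting -- which is what
 * the FLAC__ASSERT(lp_quantization <= 32) in each function guarantees. */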
53 FLAC__SSE_TARGET("sse4.1")
54 void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
55 {
56 int i;
57 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
58
59 FLAC__ASSERT(order > 0);
60 FLAC__ASSERT(order <= 32);
61 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
62
63 if(order <= 12) {
64 if(order > 8) { /* order == 9, 10, 11, 12 */
65 if(order > 10) { /* order == 11, 12 */
66 if(order == 12) {
67 					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
68 					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
69 					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
70 					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
71 					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
72 					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
73 					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]
74 
75 					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
76 					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
77 					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
78 					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
79 					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
80 					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
81 
82 					for(i = 0; i < (int)data_len; i++) {
83 						//sum = 0;
84 						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
85 						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
86 						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0  0  d[i-11]  d[i-12]
87 						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
88 						xmm7 = _mm_mul_epi32(xmm7, xmm5);
89 
90 						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
91 						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
92 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
93 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
94 						xmm6 = _mm_mul_epi32(xmm6, xmm4);
95 						xmm7 = _mm_add_epi64(xmm7, xmm6);
96 
97 						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
98 						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
99 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
100 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
101 						xmm6 = _mm_mul_epi32(xmm6, xmm3);
102 						xmm7 = _mm_add_epi64(xmm7, xmm6);
103 
104 						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
105 						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
106 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
107 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
108 						xmm6 = _mm_mul_epi32(xmm6, xmm2);
109 						xmm7 = _mm_add_epi64(xmm7, xmm6);
110 
111 						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
112 						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
113 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
114 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
115 						xmm6 = _mm_mul_epi32(xmm6, xmm1);
116 						xmm7 = _mm_add_epi64(xmm7, xmm6);
117 
118 						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
119 						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
120 						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
121 						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
122 						xmm6 = _mm_mul_epi32(xmm6, xmm0);
123 						xmm7 = _mm_add_epi64(xmm7, xmm6);
124 
125 						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
126 RESIDUAL64_RESULT1(xmm7);
127 }
128 }
129 else { /* order == 11 */
130 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm m5, xmm6, xmm7;
131 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
132 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
133 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
134 xmm3 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+6));
135 xmm4 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+8));
136 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
137
138 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
139 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
140 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
141 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF LE(3,1,2,0));
142 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF LE(3,1,2,0));
143
144 for(i = 0; i < (int)data_len; i++) {
145 //sum = 0;
146 //sum = qlp_coeff[10] * (FLAC__ int64)data[i-11];
147 xmm7 = _mm_cvtsi32_si128(data[i- 11]);
148 xmm7 = _mm_mul_epi32(xmm7, xmm5) ;
149
150 //sum += qlp_coeff[9] * (FLAC__i nt64)data[i-10];
151 //sum += qlp_coeff[8] * (FLAC__i nt64)data[i-9];
152 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-10));
153 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
154 xmm6 = _mm_mul_epi32(xmm6, xmm4) ;
155 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
156
157 //sum += qlp_coeff[7] * (FLAC__i nt64)data[i-8];
158 //sum += qlp_coeff[6] * (FLAC__i nt64)data[i-7];
159 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-8));
160 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
161 xmm6 = _mm_mul_epi32(xmm6, xmm3) ;
162 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
163
164 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
165 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
166 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
167 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
168 xmm6 = _mm_mul_epi32(xmm6, xmm2) ;
169 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
170
171 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
172 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
173 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
174 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
175 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
176 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
177
178 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
179 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
180 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
181 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
182 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
183 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
184
185 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
186 RESIDUAL64_RESULT1(xmm7);
187 }
188 }
189 }
190 else { /* order == 9, 10 */
191 if(order == 10) {
192 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm m6, xmm7;
193 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
194 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
195 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
196 xmm3 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+6));
197 xmm4 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+8));
198
199 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
200 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
201 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
202 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF LE(3,1,2,0));
203 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFF LE(3,1,2,0));
204
205 for(i = 0; i < (int)data_len; i++) {
206 //sum = 0;
207 //sum += qlp_coeff[9] * (FLAC__i nt64)data[i-10];
208 //sum += qlp_coeff[8] * (FLAC__i nt64)data[i-9];
209 xmm7 = _mm_loadl_epi64((const __ m128i*)(data+i-10));
210 xmm7 = _mm_shuffle_epi32(xmm7, _ MM_SHUFFLE(2,0,3,1));
211 xmm7 = _mm_mul_epi32(xmm7, xmm4) ;
212
213 //sum += qlp_coeff[7] * (FLAC__i nt64)data[i-8];
214 //sum += qlp_coeff[6] * (FLAC__i nt64)data[i-7];
215 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-8));
216 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
217 xmm6 = _mm_mul_epi32(xmm6, xmm3) ;
218 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
219
220 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
221 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
222 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
223 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
224 xmm6 = _mm_mul_epi32(xmm6, xmm2) ;
225 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
226
227 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
228 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
229 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
230 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
231 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
232 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
233
234 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
235 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
236 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
237 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
238 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
239 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
240
241 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
242 RESIDUAL64_RESULT(xmm7);
243 }
244 }
245 else { /* order == 9 */
246 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xm m6, xmm7;
247 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
248 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
249 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
250 xmm3 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+6));
251 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
252
253 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
254 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
255 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
256 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF LE(3,1,2,0));
257
258 for(i = 0; i < (int)data_len; i++) {
259 //sum = 0;
260 //sum = qlp_coeff[8] * (FLAC__i nt64)data[i-9];
261 xmm7 = _mm_cvtsi32_si128(data[i- 9]);
262 xmm7 = _mm_mul_epi32(xmm7, xmm4) ;
263
264 //sum += qlp_coeff[7] * (FLAC__i nt64)data[i-8];
265 //sum += qlp_coeff[6] * (FLAC__i nt64)data[i-7];
266 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-8));
267 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
268 xmm6 = _mm_mul_epi32(xmm6, xmm3) ;
269 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
270
271 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
272 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
273 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
274 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
275 xmm6 = _mm_mul_epi32(xmm6, xmm2) ;
276 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
277
278 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
279 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
280 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
281 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
282 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
283 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
284
285 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
286 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
287 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
288 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
289 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
290 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
291
292 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
293 RESIDUAL64_RESULT(xmm7);
294 }
295 }
296 }
297 }
298 else if(order > 4) { /* order == 5, 6, 7, 8 */
299 if(order > 6) { /* order == 7, 8 */
300 if(order == 8) {
301 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm m7;
302 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
303 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
304 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
305 xmm3 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+6));
306
307 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
308 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
309 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
310 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFF LE(3,1,2,0));
311
312 for(i = 0; i < (int)data_len; i++) {
313 //sum = 0;
314 //sum += qlp_coeff[7] * (FLAC__i nt64)data[i-8];
315 //sum += qlp_coeff[6] * (FLAC__i nt64)data[i-7];
316 xmm7 = _mm_loadl_epi64((const __ m128i*)(data+i-8));
317 xmm7 = _mm_shuffle_epi32(xmm7, _ MM_SHUFFLE(2,0,3,1));
318 xmm7 = _mm_mul_epi32(xmm7, xmm3) ;
319
320 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
321 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
322 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
323 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
324 xmm6 = _mm_mul_epi32(xmm6, xmm2) ;
325 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
326
327 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
328 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
329 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
330 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
331 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
332 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
333
334 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
335 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
336 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
337 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
338 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
339 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
340
341 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
342 RESIDUAL64_RESULT(xmm7);
343 }
344 }
345 else { /* order == 7 */
346 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xm m7;
347 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
348 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
349 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
350 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
351
352 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
353 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
354 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
355
356 for(i = 0; i < (int)data_len; i++) {
357 //sum = 0;
358 //sum = qlp_coeff[6] * (FLAC__i nt64)data[i-7];
359 xmm7 = _mm_cvtsi32_si128(data[i- 7]);
360 xmm7 = _mm_mul_epi32(xmm7, xmm3) ;
361
362 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
363 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
364 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
365 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
366 xmm6 = _mm_mul_epi32(xmm6, xmm2) ;
367 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
368
369 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
370 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
371 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
372 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
373 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
374 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
375
376 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
377 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
378 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
379 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
380 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
381 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
382
383 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
384 RESIDUAL64_RESULT(xmm7);
385 }
386 }
387 }
388 else { /* order == 5, 6 */
389 if(order == 6) {
390 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
391 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
392 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
393 xmm2 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+4));
394
395 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
396 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
397 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFF LE(3,1,2,0));
398
399 for(i = 0; i < (int)data_len; i++) {
400 //sum = 0;
401 //sum += qlp_coeff[5] * (FLAC__i nt64)data[i-6];
402 //sum += qlp_coeff[4] * (FLAC__i nt64)data[i-5];
403 xmm7 = _mm_loadl_epi64((const __ m128i*)(data+i-6));
404 xmm7 = _mm_shuffle_epi32(xmm7, _ MM_SHUFFLE(2,0,3,1));
405 xmm7 = _mm_mul_epi32(xmm7, xmm2) ;
406
407 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
408 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
409 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
410 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
411 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
412 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
413
414 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
415 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
416 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
417 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
418 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
419 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
420
421 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
422 RESIDUAL64_RESULT(xmm7);
423 }
424 }
425 else { /* order == 5 */
426 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
427 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
428 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
429 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
430
431 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
432 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
433
434 for(i = 0; i < (int)data_len; i++) {
435 //sum = 0;
436 //sum = qlp_coeff[4] * (FLAC__i nt64)data[i-5];
437 xmm7 = _mm_cvtsi32_si128(data[i- 5]);
438 xmm7 = _mm_mul_epi32(xmm7, xmm2) ;
439
440 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
441 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
442 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
443 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
444 xmm6 = _mm_mul_epi32(xmm6, xmm1) ;
445 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
446
447 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
448 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
449 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
450 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
451 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
452 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
453
454 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
455 RESIDUAL64_RESULT(xmm7);
456 }
457 }
458 }
459 }
460 else { /* order == 1, 2, 3, 4 */
461 if(order > 2) { /* order == 3, 4 */
462 if(order == 4) {
463 __m128i xmm0, xmm1, xmm6, xmm7;
464 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
465 xmm1 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+2));
466
467 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
468 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFF LE(3,1,2,0));
469
470 for(i = 0; i < (int)data_len; i++) {
471 //sum = 0;
472 //sum += qlp_coeff[3] * (FLAC__i nt64)data[i-4];
473 //sum += qlp_coeff[2] * (FLAC__i nt64)data[i-3];
474 xmm7 = _mm_loadl_epi64((const __ m128i*)(data+i-4));
475 xmm7 = _mm_shuffle_epi32(xmm7, _ MM_SHUFFLE(2,0,3,1));
476 xmm7 = _mm_mul_epi32(xmm7, xmm1) ;
477
478 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
479 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
480 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
481 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
482 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
483 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
484
485 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
486 RESIDUAL64_RESULT(xmm7);
487 }
488 }
489 else { /* order == 3 */
490 __m128i xmm0, xmm1, xmm6, xmm7;
491 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
492 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
493
494 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
495
496 for(i = 0; i < (int)data_len; i++) {
497 //sum = 0;
498 //sum = qlp_coeff[2] * (FLAC__i nt64)data[i-3];
499 xmm7 = _mm_cvtsi32_si128(data[i- 3]);
500 xmm7 = _mm_mul_epi32(xmm7, xmm1) ;
501
502 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
503 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
504 xmm6 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
505 xmm6 = _mm_shuffle_epi32(xmm6, _ MM_SHUFFLE(2,0,3,1));
506 xmm6 = _mm_mul_epi32(xmm6, xmm0) ;
507 xmm7 = _mm_add_epi64(xmm7, xmm6) ;
508
509 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
510 RESIDUAL64_RESULT(xmm7);
511 }
512 }
513 }
514 else { /* order == 1, 2 */
515 if(order == 2) {
516 __m128i xmm0, xmm7;
517 xmm0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff+0));
518 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFF LE(3,1,2,0));
519
520 for(i = 0; i < (int)data_len; i++) {
521 //sum = 0;
522 //sum += qlp_coeff[1] * (FLAC__i nt64)data[i-2];
523 //sum += qlp_coeff[0] * (FLAC__i nt64)data[i-1];
524 xmm7 = _mm_loadl_epi64((const __ m128i*)(data+i-2));
525 xmm7 = _mm_shuffle_epi32(xmm7, _ MM_SHUFFLE(2,0,3,1));
526 xmm7 = _mm_mul_epi32(xmm7, xmm0) ;
527
528 xmm7 = _mm_add_epi64(xmm7, _mm_s rli_si128(xmm7, 8));
529 RESIDUAL64_RESULT(xmm7);
530 }
531 }
532 else { /* order == 1 */
533 __m128i xmm0, xmm7;
534 xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);
535
536 for(i = 0; i < (int)data_len; i++) {
537 //sum = qlp_coeff[0] * (FLAC__in t64)data[i-1];
538 xmm7 = _mm_cvtsi32_si128(data[i- 1]);
539 xmm7 = _mm_mul_epi32(xmm7, xmm0) ;
540 RESIDUAL64_RESULT(xmm7);
541 }
542 }
543 }
544 }
545 }
546 else { /* order > 12 */
547 FLAC__int64 sum;
548 for(i = 0; i < (int)data_len; i++) {
549 sum = 0;
550 switch(order) {
551 				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
552 				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
553 				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
554 				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
555 				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
556 				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
557 				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
558 				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
559 				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
560 				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
561 				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
562 				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
563 				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
564 				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
565 				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
566 				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
567 				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
568 				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
569 				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
570 				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
571 				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
572 				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
573 				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
574 				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
575 				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
576 				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
577 				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
578 				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
579 				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
580 				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
581 				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
582 				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
583 			}
584 			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
585 }
586 }
587 }
588
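All of the branches above are hand-unrolled, two-taps-per-register forms of the same per-sample recurrence. As a reading aid, here is a minimal scalar sketch of that recurrence (illustrative helper name, same argument conventions as the function above; the accumulator must be 64-bit because the products can exceed 32 bits):

static void lpc_residual_wide_scalar_sketch(const FLAC__int32 *data, unsigned data_len,
	const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	unsigned i, j;
	for(i = 0; i < data_len; i++) {
		FLAC__int64 sum = 0;
		for(j = 0; j < order; j++)   /* predict data[i] from the previous `order' samples */
			sum += qlp_coeff[j] * (FLAC__int64)data[(int)i - 1 - (int)j];
		residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
	}
}

The order > 12 fallback inside the function computes exactly this with a switch; the SIMD paths only change how the multiply-accumulate is scheduled, not the result.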
589 FLAC__SSE_TARGET("sse4.1")
590 void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
591 {
592 int i;
593 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
594
595 if (!data_len)
596 return;
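/* The order <= 12 branches below reconstruct data[0] before entering their
 * main loops, so an empty input has to be rejected up front; the residual
 * kernels above only touch memory inside their loops and need no such guard. */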
597
598 FLAC__ASSERT(order > 0);
599 FLAC__ASSERT(order <= 32);
600 FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
601
602 if(order <= 12) {
603 if(order > 8) { /* order == 9, 10, 11, 12 */
604 if(order > 10) { /* order == 11, 12 */
605 __m128i qlp[6], dat[6];
606 __m128i summ, temp;
607 				qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
608 				qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
609 				qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
610 				qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
611 				qlp[4] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
612 				if (order == 12)
613 					qlp[5] = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11]  q[10]
614 				else
615 					qlp[5] = _mm_cvtsi32_si128(qlp_coeff[10]); // 0  0  0  q[10]
616 
617 				qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2,0,3,1)); // 0  q[0]  0  q[1]
618 				qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2,0,3,1)); // 0  q[2]  0  q[3]
619 				qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2,0,3,1)); // 0  q[4]  0  q[5]
620 				qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2,0,3,1)); // 0  q[6]  0  q[7]
621 				qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2,0,3,1)); // 0  q[8]  0  q[9]
622 				qlp[5] = _mm_shuffle_epi32(qlp[5], _MM_SHUFFLE(2,0,3,1)); // 0  q[10] 0  q[11]
623
624 				dat[5] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-12))); // ?  d[i-11]  ?  d[i-12]
625 				dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-10))); // ?  d[i-9]   ?  d[i-10]
626 				dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-8 ))); // ?  d[i-7]   ?  d[i-8]
627 				dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-6 ))); // ?  d[i-5]   ?  d[i-6]
628 				dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-4 ))); // ?  d[i-3]   ?  d[i-4]
629 				dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((const __m128i*)(data-2 ))); // ?  d[i-1]   ?  d[i-2]
630
631 summ = _mm_mul_epi32(dat[5], qlp[5]) ;
632 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[4], qlp[4]));
633 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
634 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
635 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
636 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
637
638 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
639 summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
640 temp = _mm_cvtsi32_si128(residual[0]); // 0 0 0 r[i]
641 temp = _mm_add_epi32(temp, summ); // ? ? ? d[i]
642 data[0] = _mm_cvtsi128_si32(temp);
643
644 for(i = 1; i < (int)data_len; i++) {
645 dat[5] = _mm_alignr_epi8(dat[4], dat[5], 8); // ? d[i-10] ? d[i-11]
646 dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8); // ? d[i-8] ? d[i-9]
647 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8); // ? d[i-6] ? d[i-7]
648 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8); // ? d[i-4] ? d[i-5]
649 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8); // ? d[i-2] ? d[i-3]
650 dat[0] = _mm_alignr_epi8(temp, dat[0], 8); // ? d[i ] ? d[i-1]
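/* Each _mm_alignr_epi8(hi, lo, 8) shifts the concatenation hi:lo right by
 * 8 bytes, so the dat[] registers form a sliding 12-sample history: every
 * iteration each register drops its oldest sample and pulls the next one in
 * from its newer neighbour, while temp (the sample just reconstructed)
 * enters at dat[0].  No past sample is ever reloaded from memory. */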
651
652 summ = _mm_mul_epi32 (dat[5], qlp[5]) ;
653 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[4], qlp[4]));
654 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[3], qlp[3]));
655 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[2], qlp[2]));
656 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[1], qlp[1]));
657 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[0], qlp[0]));
658
659 					summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8)); // ?_64 sum_64
660 summ = _mm_srl_epi64(summ, cnt); // ?_64 (sum >> lp_quantization)_64 == ?_32 ?_32 ?_32 (sum >> lp_quantization)_32
661 temp = _mm_cvtsi32_si128(residual[i]); // 0 0 0 r[i]
662 temp = _mm_add_epi32(temp, summ); // ? ? ? d[i]
663 data[i] = _mm_cvtsi128_si32(temp);
664 }
665 }
666 else { /* order == 9, 10 */
667 __m128i qlp[5], dat[5];
668 __m128i summ, temp;
669 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+0));
670 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+2));
671 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+4));
672 qlp[3] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+6));
673 if (order == 10)
674 qlp[4] = _mm_loadl_epi64((const __m128i* )(qlp_coeff+8));
675 else
676 qlp[4] = _mm_cvtsi32_si128(qlp_coeff[8]) ;
677
678 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2 ,0,3,1));
679 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2 ,0,3,1));
680 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2 ,0,3,1));
681 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2 ,0,3,1));
682 qlp[4] = _mm_shuffle_epi32(qlp[4], _MM_SHUFFLE(2 ,0,3,1));
683
684 dat[4] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-10)));
685 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-8 )));
686 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-6 )));
687 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-4 )));
688 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-2 )));
689
690 summ = _mm_mul_epi32(dat[4], qlp[4]) ;
691 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[3], qlp[3]));
692 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
693 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
694 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
695
696 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
697 summ = _mm_srl_epi64(summ, cnt);
698 temp = _mm_cvtsi32_si128(residual[0]);
699 temp = _mm_add_epi32(temp, summ);
700 data[0] = _mm_cvtsi128_si32(temp);
701
702 for(i = 1; i < (int)data_len; i++) {
703 dat[4] = _mm_alignr_epi8(dat[3], dat[4], 8);
704 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
705 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
706 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
707 dat[0] = _mm_alignr_epi8(temp, dat[0], 8);
708
709 summ = _mm_mul_epi32 (dat[4], qlp[4]) ;
710 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[3], qlp[3]));
711 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[2], qlp[2]));
712 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[1], qlp[1]));
713 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[0], qlp[0]));
714
715 summ = _mm_add_epi64(summ, _mm_srli_si12 8(summ, 8));
716 summ = _mm_srl_epi64(summ, cnt);
717 temp = _mm_cvtsi32_si128(residual[i]);
718 temp = _mm_add_epi32(temp, summ);
719 data[i] = _mm_cvtsi128_si32(temp);
720 }
721 }
722 }
723 else if(order > 4) { /* order == 5, 6, 7, 8 */
724 if(order > 6) { /* order == 7, 8 */
725 __m128i qlp[4], dat[4];
726 __m128i summ, temp;
727 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+0));
728 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+2));
729 qlp[2] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+4));
730 if (order == 8)
731 qlp[3] = _mm_loadl_epi64((const __m128i* )(qlp_coeff+6));
732 else
733 qlp[3] = _mm_cvtsi32_si128(qlp_coeff[6]) ;
734
735 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2 ,0,3,1));
736 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2 ,0,3,1));
737 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2 ,0,3,1));
738 qlp[3] = _mm_shuffle_epi32(qlp[3], _MM_SHUFFLE(2 ,0,3,1));
739
740 dat[3] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-8 )));
741 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-6 )));
742 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-4 )));
743 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-2 )));
744
745 summ = _mm_mul_epi32(dat[3], qlp[3]) ;
746 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[2], qlp[2]));
747 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
748 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
749
750 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
751 summ = _mm_srl_epi64(summ, cnt);
752 temp = _mm_cvtsi32_si128(residual[0]);
753 temp = _mm_add_epi32(temp, summ);
754 data[0] = _mm_cvtsi128_si32(temp);
755
756 for(i = 1; i < (int)data_len; i++) {
757 dat[3] = _mm_alignr_epi8(dat[2], dat[3], 8);
758 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
759 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
760 dat[0] = _mm_alignr_epi8(temp, dat[0], 8);
761
762 summ = _mm_mul_epi32 (dat[3], qlp[3]) ;
763 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[2], qlp[2]));
764 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[1], qlp[1]));
765 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[0], qlp[0]));
766
767 summ = _mm_add_epi64(summ, _mm_srli_si12 8(summ, 8));
768 summ = _mm_srl_epi64(summ, cnt);
769 temp = _mm_cvtsi32_si128(residual[i]);
770 temp = _mm_add_epi32(temp, summ);
771 data[i] = _mm_cvtsi128_si32(temp);
772 }
773 }
774 else { /* order == 5, 6 */
775 __m128i qlp[3], dat[3];
776 __m128i summ, temp;
777 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+0));
778 qlp[1] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+2));
779 if (order == 6)
780 qlp[2] = _mm_loadl_epi64((const __m128i* )(qlp_coeff+4));
781 else
782 qlp[2] = _mm_cvtsi32_si128(qlp_coeff[4]) ;
783
784 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2 ,0,3,1));
785 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2 ,0,3,1));
786 qlp[2] = _mm_shuffle_epi32(qlp[2], _MM_SHUFFLE(2 ,0,3,1));
787
788 dat[2] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-6 )));
789 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-4 )));
790 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-2 )));
791
792 summ = _mm_mul_epi32(dat[2], qlp[2]) ;
793 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[1], qlp[1]));
794 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
795
796 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
797 summ = _mm_srl_epi64(summ, cnt);
798 temp = _mm_cvtsi32_si128(residual[0]);
799 temp = _mm_add_epi32(temp, summ);
800 data[0] = _mm_cvtsi128_si32(temp);
801
802 for(i = 1; i < (int)data_len; i++) {
803 dat[2] = _mm_alignr_epi8(dat[1], dat[2], 8);
804 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
805 dat[0] = _mm_alignr_epi8(temp, dat[0], 8);
806
807 summ = _mm_mul_epi32 (dat[2], qlp[2]) ;
808 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[1], qlp[1]));
809 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[0], qlp[0]));
810
811 summ = _mm_add_epi64(summ, _mm_srli_si12 8(summ, 8));
812 summ = _mm_srl_epi64(summ, cnt);
813 temp = _mm_cvtsi32_si128(residual[i]);
814 temp = _mm_add_epi32(temp, summ);
815 data[i] = _mm_cvtsi128_si32(temp);
816 }
817 }
818 }
819 else { /* order == 1, 2, 3, 4 */
820 if(order > 2) { /* order == 3, 4 */
821 __m128i qlp[2], dat[2];
822 __m128i summ, temp;
823 qlp[0] = _mm_loadl_epi64((const __m128i*)(qlp_co eff+0));
824 if (order == 4)
825 qlp[1] = _mm_loadl_epi64((const __m128i* )(qlp_coeff+2));
826 else
827 qlp[1] = _mm_cvtsi32_si128(qlp_coeff[2]) ;
828
829 qlp[0] = _mm_shuffle_epi32(qlp[0], _MM_SHUFFLE(2 ,0,3,1));
830 qlp[1] = _mm_shuffle_epi32(qlp[1], _MM_SHUFFLE(2 ,0,3,1));
831
832 dat[1] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-4 )));
833 dat[0] = _mm_cvtepu32_epi64(_mm_loadl_epi64((con st __m128i*)(data-2 )));
834
835 summ = _mm_mul_epi32(dat[1], qlp[1]) ;
836 summ = _mm_add_epi64(summ, _mm_mul_epi32(dat[0], qlp[0]));
837
838 summ = _mm_add_epi64(summ, _mm_srli_si128(summ, 8));
839 summ = _mm_srl_epi64(summ, cnt);
840 temp = _mm_cvtsi32_si128(residual[0]);
841 temp = _mm_add_epi32(temp, summ);
842 data[0] = _mm_cvtsi128_si32(temp);
843
844 for(i = 1; i < (int)data_len; i++) {
845 dat[1] = _mm_alignr_epi8(dat[0], dat[1], 8);
846 dat[0] = _mm_alignr_epi8(temp, dat[0], 8);
847
848 summ = _mm_mul_epi32 (dat[1], qlp[1]) ;
849 summ = _mm_add_epi64(summ, _mm_mul_epi32 (dat[0], qlp[0]));
850
851 summ = _mm_add_epi64(summ, _mm_srli_si12 8(summ, 8));
852 summ = _mm_srl_epi64(summ, cnt);
853 temp = _mm_cvtsi32_si128(residual[i]);
854 temp = _mm_add_epi32(temp, summ);
855 data[i] = _mm_cvtsi128_si32(temp);
856 }
857 }
858 else { /* order == 1, 2 */
859 if(order == 2) {
860 __m128i qlp0, dat0;
861 __m128i summ, temp;
862 qlp0 = _mm_loadl_epi64((const __m128i*)( qlp_coeff));
863 qlp0 = _mm_shuffle_epi32(qlp0, _MM_SHUFF LE(2,0,3,1));
864
865 dat0 = _mm_cvtepu32_epi64(_mm_loadl_epi6 4((const __m128i*)(data-2 )));
866
867 summ = _mm_mul_epi32(dat0, qlp0) ;
868
869 summ = _mm_add_epi64(summ, _mm_srli_si12 8(summ, 8));
870 summ = _mm_srl_epi64(summ, cnt);
871 temp = _mm_cvtsi32_si128(residual[0]);
872 temp = _mm_add_epi32(temp, summ);
873 data[0] = _mm_cvtsi128_si32(temp);
874
875 for(i = 1; i < (int)data_len; i++) {
876 dat0 = _mm_alignr_epi8(temp, dat 0, 8);
877
878 summ = _mm_mul_epi32(dat0, qlp0) ;
879
880 summ = _mm_add_epi64(summ, _mm_s rli_si128(summ, 8));
881 summ = _mm_srl_epi64(summ, cnt);
882 temp = _mm_cvtsi32_si128(residua l[i]);
883 temp = _mm_add_epi32(temp, summ) ;
884 data[i] = _mm_cvtsi128_si32(temp );
885 }
886 }
887 else { /* order == 1 */
888 __m128i qlp0;
889 __m128i summ, temp;
890 qlp0 = _mm_cvtsi32_si128(qlp_coeff[0]);
891 temp = _mm_cvtsi32_si128(data[-1]);
892
893 summ = _mm_mul_epi32(temp, qlp0);
894 summ = _mm_srl_epi64(summ, cnt);
895 temp = _mm_cvtsi32_si128(residual[0]);
896 temp = _mm_add_epi32(temp, summ);
897 data[0] = _mm_cvtsi128_si32(temp);
898
899 for(i = 1; i < (int)data_len; i++) {
900 summ = _mm_mul_epi32(temp, qlp0) ;
901 summ = _mm_srl_epi64(summ, cnt);
902 temp = _mm_cvtsi32_si128(residua l[i]);
903 temp = _mm_add_epi32(temp, summ) ;
904 data[i] = _mm_cvtsi128_si32(temp );
905 }
906 }
907 }
908 }
909 }
910 else { /* order > 12 */
911 FLAC__int64 sum;
912 for(i = 0; i < (int)data_len; i++) {
913 sum = 0;
914 switch(order) {
915 				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
916 				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
917 				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
918 				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
919 				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
920 				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
921 				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
922 				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
923 				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
924 				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
925 				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
926 				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
927 				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
928 				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
929 				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
930 				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
931 				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
932 				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
933 				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
934 				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
935 				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
936 				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
937 				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
938 				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
939 				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
940 				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
941 				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
942 				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
943 				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
944 				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
945 				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
946 				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
947 			}
948 			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
949 }
950 }
951 }
952
953 #endif /* defined FLAC__CPU_IA32 */
954
955 FLAC__SSE_TARGET("sse4.1")
956 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
957 {
958 int i;
959 FLAC__int32 sum;
960 __m128i cnt = _mm_cvtsi32_si128(lp_quantization);
961
962 FLAC__ASSERT(order > 0);
963 FLAC__ASSERT(order <= 32);
964
965 if(order <= 12) {
966 if(order > 8) {
967 if(order > 10) {
968 if(order == 12) {
969 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
970 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
971 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
972 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
973 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
974 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
975 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
976 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
977 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
978 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
979 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
980 q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
981 q11 = _mm_cvtsi32_si128(qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
982
983 for(i = 0; i < (int)data_len-3; i+=4) {
984 __m128i summ, mull;
985 						summ = _mm_mullo_epi32(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
986 						mull = _mm_mullo_epi32(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
987 						mull = _mm_mullo_epi32(q9,  _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
988 						mull = _mm_mullo_epi32(q8,  _mm_loadu_si128((const __m128i*)(data+i-9)));  summ = _mm_add_epi32(summ, mull);
989 						mull = _mm_mullo_epi32(q7,  _mm_loadu_si128((const __m128i*)(data+i-8)));  summ = _mm_add_epi32(summ, mull);
990 						mull = _mm_mullo_epi32(q6,  _mm_loadu_si128((const __m128i*)(data+i-7)));  summ = _mm_add_epi32(summ, mull);
991 						mull = _mm_mullo_epi32(q5,  _mm_loadu_si128((const __m128i*)(data+i-6)));  summ = _mm_add_epi32(summ, mull);
992 						mull = _mm_mullo_epi32(q4,  _mm_loadu_si128((const __m128i*)(data+i-5)));  summ = _mm_add_epi32(summ, mull);
993 						mull = _mm_mullo_epi32(q3,  _mm_loadu_si128((const __m128i*)(data+i-4)));  summ = _mm_add_epi32(summ, mull);
994 						mull = _mm_mullo_epi32(q2,  _mm_loadu_si128((const __m128i*)(data+i-3)));  summ = _mm_add_epi32(summ, mull);
995 						mull = _mm_mullo_epi32(q1,  _mm_loadu_si128((const __m128i*)(data+i-2)));  summ = _mm_add_epi32(summ, mull);
996 						mull = _mm_mullo_epi32(q0,  _mm_loadu_si128((const __m128i*)(data+i-1)));  summ = _mm_add_epi32(summ, mull);
997 						summ = _mm_sra_epi32(summ, cnt);
998 						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
999 }
1000 }
1001 else { /* order == 11 */
1002 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
1003 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1004 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1005 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1006 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1007 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1008 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1009 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
1010 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
1011 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
1012 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
1013 q10 = _mm_cvtsi32_si128(qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
1014
1015 for(i = 0; i < (int)data_len-3; i+=4) {
1016 __m128i summ, mull;
1017 summ = _mm_mullo_epi32(q10, _mm_ loadu_si128((const __m128i*)(data+i-11)));
1018 mull = _mm_mullo_epi32(q9, _mm_l oadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
1019 mull = _mm_mullo_epi32(q8, _mm_l oadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
1020 mull = _mm_mullo_epi32(q7, _mm_l oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
1021 mull = _mm_mullo_epi32(q6, _mm_l oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
1022 mull = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
1023 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1024 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1025 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1026 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1027 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1028 summ = _mm_sra_epi32(summ, cnt);
1029 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1030 }
1031 }
1032 }
1033 else {
1034 if(order == 10) {
1035 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
1036 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1037 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1038 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1039 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1040 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1041 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1042 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
1043 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
1044 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
1045 q9 = _mm_cvtsi32_si128(qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
1046
1047 for(i = 0; i < (int)data_len-3; i+=4) {
1048 __m128i summ, mull;
1049 summ = _mm_mullo_epi32(q9, _mm_l oadu_si128((const __m128i*)(data+i-10)));
1050 mull = _mm_mullo_epi32(q8, _mm_l oadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
1051 mull = _mm_mullo_epi32(q7, _mm_l oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
1052 mull = _mm_mullo_epi32(q6, _mm_l oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
1053 mull = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
1054 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1055 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1056 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1057 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1058 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1059 summ = _mm_sra_epi32(summ, cnt);
1060 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1061 }
1062 }
1063 else { /* order == 9 */
1064 __m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
1065 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1066 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1067 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1068 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1069 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1070 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1071 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
1072 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
1073 q8 = _mm_cvtsi32_si128(qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
1074
1075 for(i = 0; i < (int)data_len-3; i+=4) {
1076 __m128i summ, mull;
1077 summ = _mm_mullo_epi32(q8, _mm_l oadu_si128((const __m128i*)(data+i-9)));
1078 mull = _mm_mullo_epi32(q7, _mm_l oadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
1079 mull = _mm_mullo_epi32(q6, _mm_l oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
1080 mull = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
1081 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1082 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1083 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1084 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1085 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1086 summ = _mm_sra_epi32(summ, cnt);
1087 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1088 }
1089 }
1090 }
1091 }
1092 else if(order > 4) {
1093 if(order > 6) {
1094 if(order == 8) {
1095 __m128i q0, q1, q2, q3, q4, q5, q6, q7;
1096 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1097 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1098 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1099 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1100 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1101 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1102 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
1103 q7 = _mm_cvtsi32_si128(qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
1104
1105 for(i = 0; i < (int)data_len-3; i+=4) {
1106 __m128i summ, mull;
1107 summ = _mm_mullo_epi32(q7, _mm_l oadu_si128((const __m128i*)(data+i-8)));
1108 mull = _mm_mullo_epi32(q6, _mm_l oadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
1109 mull = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
1110 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1111 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1112 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1113 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1114 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1115 summ = _mm_sra_epi32(summ, cnt);
1116 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1117 }
1118 }
1119 else { /* order == 7 */
1120 __m128i q0, q1, q2, q3, q4, q5, q6;
1121 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1122 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1123 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1124 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1125 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1126 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1127 q6 = _mm_cvtsi32_si128(qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
1128
1129 for(i = 0; i < (int)data_len-3; i+=4) {
1130 __m128i summ, mull;
1131 summ = _mm_mullo_epi32(q6, _mm_l oadu_si128((const __m128i*)(data+i-7)));
1132 mull = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
1133 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1134 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1135 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1136 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1137 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1138 summ = _mm_sra_epi32(summ, cnt);
1139 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1140 }
1141 }
1142 }
1143 else {
1144 if(order == 6) {
1145 __m128i q0, q1, q2, q3, q4, q5;
1146 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1147 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1148 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1149 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1150 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1151 q5 = _mm_cvtsi32_si128(qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
1152
1153 for(i = 0; i < (int)data_len-3; i+=4) {
1154 __m128i summ, mull;
1155 summ = _mm_mullo_epi32(q5, _mm_l oadu_si128((const __m128i*)(data+i-6)));
1156 mull = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
1157 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1158 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1159 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1160 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1161 summ = _mm_sra_epi32(summ, cnt);
1162 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1163 }
1164 }
1165 else { /* order == 5 */
1166 __m128i q0, q1, q2, q3, q4;
1167 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1168 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1169 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1170 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1171 q4 = _mm_cvtsi32_si128(qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
1172
1173 for(i = 0; i < (int)data_len-3; i+=4) {
1174 __m128i summ, mull;
1175 summ = _mm_mullo_epi32(q4, _mm_l oadu_si128((const __m128i*)(data+i-5)));
1176 mull = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
1177 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1178 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1179 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1180 summ = _mm_sra_epi32(summ, cnt);
1181 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1182 }
1183 }
1184 }
1185 }
1186 else {
1187 if(order > 2) {
1188 if(order == 4) {
1189 __m128i q0, q1, q2, q3;
1190 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1191 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1192 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1193 q3 = _mm_cvtsi32_si128(qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
1194
1195 for(i = 0; i < (int)data_len-3; i+=4) {
1196 __m128i summ, mull;
1197 summ = _mm_mullo_epi32(q3, _mm_l oadu_si128((const __m128i*)(data+i-4)));
1198 mull = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
1199 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1200 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1201 summ = _mm_sra_epi32(summ, cnt);
1202 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1203 }
1204 }
1205 else { /* order == 3 */
1206 __m128i q0, q1, q2;
1207 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1208 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1209 q2 = _mm_cvtsi32_si128(qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
1210
1211 for(i = 0; i < (int)data_len-3; i+=4) {
1212 __m128i summ, mull;
1213 summ = _mm_mullo_epi32(q2, _mm_l oadu_si128((const __m128i*)(data+i-3)));
1214 mull = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
1215 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1216 summ = _mm_sra_epi32(summ, cnt);
1217 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1218 }
1219 }
1220 }
1221 else {
1222 if(order == 2) {
1223 __m128i q0, q1;
1224 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1225 q1 = _mm_cvtsi32_si128(qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
1226
1227 for(i = 0; i < (int)data_len-3; i+=4) {
1228 __m128i summ, mull;
1229 summ = _mm_mullo_epi32(q1, _mm_l oadu_si128((const __m128i*)(data+i-2)));
1230 mull = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
1231 summ = _mm_sra_epi32(summ, cnt);
1232 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1233 }
1234 }
1235 else { /* order == 1 */
1236 __m128i q0;
1237 q0 = _mm_cvtsi32_si128(qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
1238
1239 for(i = 0; i < (int)data_len-3; i+=4) {
1240 __m128i summ;
1241 summ = _mm_mullo_epi32(q0, _mm_l oadu_si128((const __m128i*)(data+i-1)));
1242 summ = _mm_sra_epi32(summ, cnt);
1243 _mm_storeu_si128((__m128i*)(resi dual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
1244 }
1245 }
1246 }
1247 }
1248 for(; i < (int)data_len; i++) {
1249 sum = 0;
1250 switch(order) {
1251 case 12: sum += qlp_coeff[11] * data[i-12];
1252 case 11: sum += qlp_coeff[10] * data[i-11];
1253 case 10: sum += qlp_coeff[ 9] * data[i-10];
1254 case 9: sum += qlp_coeff[ 8] * data[i- 9];
1255 case 8: sum += qlp_coeff[ 7] * data[i- 8];
1256 case 7: sum += qlp_coeff[ 6] * data[i- 7];
1257 case 6: sum += qlp_coeff[ 5] * data[i- 6];
1258 case 5: sum += qlp_coeff[ 4] * data[i- 5];
1259 case 4: sum += qlp_coeff[ 3] * data[i- 4];
1260 case 3: sum += qlp_coeff[ 2] * data[i- 3];
1261 case 2: sum += qlp_coeff[ 1] * data[i- 2];
1262 case 1: sum += qlp_coeff[ 0] * data[i- 1];
1263 }
1264 residual[i] = data[i] - (sum >> lp_quantization);
1265 }
1266 }
1267 else { /* order > 12 */
1268 for(i = 0; i < (int)data_len; i++) {
1269 sum = 0;
1270 switch(order) {
1271 case 32: sum += qlp_coeff[31] * data[i-32];
1272 case 31: sum += qlp_coeff[30] * data[i-31];
1273 case 30: sum += qlp_coeff[29] * data[i-30];
1274 case 29: sum += qlp_coeff[28] * data[i-29];
1275 case 28: sum += qlp_coeff[27] * data[i-28];
1276 case 27: sum += qlp_coeff[26] * data[i-27];
1277 case 26: sum += qlp_coeff[25] * data[i-26];
1278 case 25: sum += qlp_coeff[24] * data[i-25];
1279 case 24: sum += qlp_coeff[23] * data[i-24];
1280 case 23: sum += qlp_coeff[22] * data[i-23];
1281 case 22: sum += qlp_coeff[21] * data[i-22];
1282 case 21: sum += qlp_coeff[20] * data[i-21];
1283 case 20: sum += qlp_coeff[19] * data[i-20];
1284 case 19: sum += qlp_coeff[18] * data[i-19];
1285 case 18: sum += qlp_coeff[17] * data[i-18];
1286 case 17: sum += qlp_coeff[16] * data[i-17];
1287 case 16: sum += qlp_coeff[15] * data[i-16];
1288 case 15: sum += qlp_coeff[14] * data[i-15];
1289 case 14: sum += qlp_coeff[13] * data[i-14];
1290 case 13: sum += qlp_coeff[12] * data[i-13];
1291 sum += qlp_coeff[11] * data[i-12];
1292 sum += qlp_coeff[10] * data[i-11];
1293 sum += qlp_coeff[ 9] * data[i-10];
1294 sum += qlp_coeff[ 8] * data[i- 9];
1295 sum += qlp_coeff[ 7] * data[i- 8];
1296 sum += qlp_coeff[ 6] * data[i- 7];
1297 sum += qlp_coeff[ 5] * data[i- 6];
1298 sum += qlp_coeff[ 4] * data[i- 5];
1299 sum += qlp_coeff[ 3] * data[i- 4];
1300 sum += qlp_coeff[ 2] * data[i- 3];
1301 sum += qlp_coeff[ 1] * data[i- 2];
1302 sum += qlp_coeff[ 0] * data[i- 1];
1303 }
1304 residual[i] = data[i] - (sum >> lp_quantization);
1305 }
1306 }
1307 }
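Unlike the two wide kernels, the function above is compiled for x64 as well as IA32 (it sits outside the #if defined FLAC__CPU_IA32 block). It produces four residuals per iteration using 32-bit multiplies (_mm_mullo_epi32) and an arithmetic shift (_mm_sra_epi32), for the case where the prediction sums fit in 32 bits; the _wide variants cover the cases where they may not. Per sample it reduces to the following scalar sketch (illustrative helper name, same conventions as above):

static FLAC__int32 lpc_residual_one_sample_sketch(const FLAC__int32 *data, int i,
	const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization)
{
	FLAC__int32 sum = 0;
	unsigned j;
	for(j = 0; j < order; j++)
		sum += qlp_coeff[j] * data[i - 1 - (int)j]; /* 32-bit products, like _mm_mullo_epi32 */
	return data[i] - (sum >> lp_quantization);      /* arithmetic shift, like _mm_sra_epi32 */
}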
1308
1309 #endif /* FLAC__SSE4_1_SUPPORTED */
1310 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
1311 #endif /* FLAC__NO_ASM */
1312 #endif /* FLAC__INTEGER_ONLY_LIBRARY */